diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,62940 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999499081649691, + "eval_steps": 500, + "global_step": 8982, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003339455668725998, + "grad_norm": 12.92171003192369, + "learning_rate": 1.1123470522803115e-08, + "loss": 0.2988, + "step": 1 + }, + { + "epoch": 0.0006678911337451996, + "grad_norm": 13.956338815299878, + "learning_rate": 2.224694104560623e-08, + "loss": 0.308, + "step": 2 + }, + { + "epoch": 0.0010018367006177993, + "grad_norm": 12.86023641194538, + "learning_rate": 3.337041156840935e-08, + "loss": 0.2821, + "step": 3 + }, + { + "epoch": 0.0013357822674903992, + "grad_norm": 13.913511855437857, + "learning_rate": 4.449388209121246e-08, + "loss": 0.2781, + "step": 4 + }, + { + "epoch": 0.0016697278343629988, + "grad_norm": 13.699199844473183, + "learning_rate": 5.561735261401558e-08, + "loss": 0.3071, + "step": 5 + }, + { + "epoch": 0.0020036734012355987, + "grad_norm": 14.10327817555655, + "learning_rate": 6.67408231368187e-08, + "loss": 0.3214, + "step": 6 + }, + { + "epoch": 0.0023376189681081983, + "grad_norm": 12.536899972256993, + "learning_rate": 7.78642936596218e-08, + "loss": 0.2617, + "step": 7 + }, + { + "epoch": 0.0026715645349807983, + "grad_norm": 12.888187017193689, + "learning_rate": 8.898776418242492e-08, + "loss": 0.2894, + "step": 8 + }, + { + "epoch": 0.003005510101853398, + "grad_norm": 12.454584884182633, + "learning_rate": 1.0011123470522804e-07, + "loss": 0.2908, + "step": 9 + }, + { + "epoch": 0.0033394556687259976, + "grad_norm": 12.00546599592856, + "learning_rate": 1.1123470522803116e-07, + "loss": 0.2737, + "step": 10 + }, + { + "epoch": 0.0036734012355985972, + "grad_norm": 12.881443540473851, + "learning_rate": 1.2235817575083427e-07, + "loss": 0.278, + "step": 11 + }, + { + "epoch": 0.004007346802471197, + "grad_norm": 13.509260575004946, + "learning_rate": 1.334816462736374e-07, + "loss": 0.3083, + "step": 12 + }, + { + "epoch": 0.004341292369343797, + "grad_norm": 12.995692303974334, + "learning_rate": 1.446051167964405e-07, + "loss": 0.3051, + "step": 13 + }, + { + "epoch": 0.0046752379362163966, + "grad_norm": 12.482273819850871, + "learning_rate": 1.557285873192436e-07, + "loss": 0.2637, + "step": 14 + }, + { + "epoch": 0.005009183503088996, + "grad_norm": 12.386935079831533, + "learning_rate": 1.6685205784204674e-07, + "loss": 0.2804, + "step": 15 + }, + { + "epoch": 0.005343129069961597, + "grad_norm": 12.819259388755173, + "learning_rate": 1.7797552836484985e-07, + "loss": 0.2941, + "step": 16 + }, + { + "epoch": 0.005677074636834196, + "grad_norm": 11.912184553214447, + "learning_rate": 1.8909899888765295e-07, + "loss": 0.2928, + "step": 17 + }, + { + "epoch": 0.006011020203706796, + "grad_norm": 12.941830493445417, + "learning_rate": 2.0022246941045608e-07, + "loss": 0.3081, + "step": 18 + }, + { + "epoch": 0.006344965770579396, + "grad_norm": 11.406373410492941, + "learning_rate": 2.113459399332592e-07, + "loss": 0.2819, + "step": 19 + }, + { + "epoch": 0.006678911337451995, + "grad_norm": 11.061731469121861, + "learning_rate": 2.2246941045606232e-07, + "loss": 0.228, + "step": 20 + }, + { + "epoch": 0.007012856904324595, + "grad_norm": 7.802127624417816, + "learning_rate": 2.3359288097886543e-07, + "loss": 0.2066, + "step": 21 + }, + { + "epoch": 0.0073468024711971945, + "grad_norm": 8.587928584552232, + "learning_rate": 2.4471635150166853e-07, + "loss": 0.2391, + "step": 22 + }, + { + "epoch": 0.007680748038069795, + "grad_norm": 8.077867973647216, + "learning_rate": 2.5583982202447166e-07, + "loss": 0.2279, + "step": 23 + }, + { + "epoch": 0.008014693604942395, + "grad_norm": 8.738395787944595, + "learning_rate": 2.669632925472748e-07, + "loss": 0.2266, + "step": 24 + }, + { + "epoch": 0.008348639171814994, + "grad_norm": 9.149113683182511, + "learning_rate": 2.780867630700779e-07, + "loss": 0.2537, + "step": 25 + }, + { + "epoch": 0.008682584738687594, + "grad_norm": 9.020287772281916, + "learning_rate": 2.89210233592881e-07, + "loss": 0.255, + "step": 26 + }, + { + "epoch": 0.009016530305560193, + "grad_norm": 7.417370767206343, + "learning_rate": 3.003337041156841e-07, + "loss": 0.2038, + "step": 27 + }, + { + "epoch": 0.009350475872432793, + "grad_norm": 5.316597345079005, + "learning_rate": 3.114571746384872e-07, + "loss": 0.1992, + "step": 28 + }, + { + "epoch": 0.009684421439305393, + "grad_norm": 4.27986688396643, + "learning_rate": 3.2258064516129035e-07, + "loss": 0.1671, + "step": 29 + }, + { + "epoch": 0.010018367006177992, + "grad_norm": 4.624509032720886, + "learning_rate": 3.337041156840935e-07, + "loss": 0.1496, + "step": 30 + }, + { + "epoch": 0.010352312573050592, + "grad_norm": 3.7335787807472025, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.1196, + "step": 31 + }, + { + "epoch": 0.010686258139923193, + "grad_norm": 3.732271378852131, + "learning_rate": 3.559510567296997e-07, + "loss": 0.1486, + "step": 32 + }, + { + "epoch": 0.011020203706795793, + "grad_norm": 3.8306272781057356, + "learning_rate": 3.670745272525028e-07, + "loss": 0.1327, + "step": 33 + }, + { + "epoch": 0.011354149273668393, + "grad_norm": 3.4893370876395227, + "learning_rate": 3.781979977753059e-07, + "loss": 0.1839, + "step": 34 + }, + { + "epoch": 0.011688094840540992, + "grad_norm": 3.8733495197080647, + "learning_rate": 3.8932146829810904e-07, + "loss": 0.1686, + "step": 35 + }, + { + "epoch": 0.012022040407413592, + "grad_norm": 2.7736882693641483, + "learning_rate": 4.0044493882091217e-07, + "loss": 0.1284, + "step": 36 + }, + { + "epoch": 0.012355985974286192, + "grad_norm": 3.1365209481113356, + "learning_rate": 4.115684093437153e-07, + "loss": 0.1736, + "step": 37 + }, + { + "epoch": 0.012689931541158791, + "grad_norm": 2.5256497657985255, + "learning_rate": 4.226918798665184e-07, + "loss": 0.138, + "step": 38 + }, + { + "epoch": 0.01302387710803139, + "grad_norm": 2.71639770957255, + "learning_rate": 4.338153503893215e-07, + "loss": 0.1431, + "step": 39 + }, + { + "epoch": 0.01335782267490399, + "grad_norm": 2.00008767091192, + "learning_rate": 4.4493882091212464e-07, + "loss": 0.1214, + "step": 40 + }, + { + "epoch": 0.01369176824177659, + "grad_norm": 2.2089314405521474, + "learning_rate": 4.560622914349278e-07, + "loss": 0.1226, + "step": 41 + }, + { + "epoch": 0.01402571380864919, + "grad_norm": 1.911269906505354, + "learning_rate": 4.6718576195773085e-07, + "loss": 0.1284, + "step": 42 + }, + { + "epoch": 0.01435965937552179, + "grad_norm": 1.8689118089163397, + "learning_rate": 4.783092324805339e-07, + "loss": 0.1367, + "step": 43 + }, + { + "epoch": 0.014693604942394389, + "grad_norm": 1.9664760230436695, + "learning_rate": 4.894327030033371e-07, + "loss": 0.1411, + "step": 44 + }, + { + "epoch": 0.01502755050926699, + "grad_norm": 1.7429607986196036, + "learning_rate": 5.005561735261402e-07, + "loss": 0.1191, + "step": 45 + }, + { + "epoch": 0.01536149607613959, + "grad_norm": 1.4591054195429314, + "learning_rate": 5.116796440489433e-07, + "loss": 0.0894, + "step": 46 + }, + { + "epoch": 0.01569544164301219, + "grad_norm": 1.5500253735232825, + "learning_rate": 5.228031145717465e-07, + "loss": 0.1123, + "step": 47 + }, + { + "epoch": 0.01602938720988479, + "grad_norm": 1.430119262578493, + "learning_rate": 5.339265850945496e-07, + "loss": 0.0826, + "step": 48 + }, + { + "epoch": 0.01636333277675739, + "grad_norm": 1.5162569575731026, + "learning_rate": 5.450500556173527e-07, + "loss": 0.1104, + "step": 49 + }, + { + "epoch": 0.01669727834362999, + "grad_norm": 1.8050273480823371, + "learning_rate": 5.561735261401558e-07, + "loss": 0.1437, + "step": 50 + }, + { + "epoch": 0.017031223910502588, + "grad_norm": 1.4138690044307227, + "learning_rate": 5.672969966629589e-07, + "loss": 0.1006, + "step": 51 + }, + { + "epoch": 0.017365169477375188, + "grad_norm": 2.003451442850109, + "learning_rate": 5.78420467185762e-07, + "loss": 0.1202, + "step": 52 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 1.9150778054568744, + "learning_rate": 5.89543937708565e-07, + "loss": 0.1368, + "step": 53 + }, + { + "epoch": 0.018033060611120387, + "grad_norm": 1.420482579614073, + "learning_rate": 6.006674082313682e-07, + "loss": 0.1135, + "step": 54 + }, + { + "epoch": 0.018367006177992987, + "grad_norm": 1.6619813862888555, + "learning_rate": 6.117908787541713e-07, + "loss": 0.1098, + "step": 55 + }, + { + "epoch": 0.018700951744865586, + "grad_norm": 1.8039097675422715, + "learning_rate": 6.229143492769744e-07, + "loss": 0.1182, + "step": 56 + }, + { + "epoch": 0.019034897311738186, + "grad_norm": 1.9251360126472772, + "learning_rate": 6.340378197997777e-07, + "loss": 0.1229, + "step": 57 + }, + { + "epoch": 0.019368842878610786, + "grad_norm": 1.646127653594256, + "learning_rate": 6.451612903225807e-07, + "loss": 0.0908, + "step": 58 + }, + { + "epoch": 0.019702788445483385, + "grad_norm": 1.8096753461486952, + "learning_rate": 6.562847608453838e-07, + "loss": 0.0902, + "step": 59 + }, + { + "epoch": 0.020036734012355985, + "grad_norm": 1.3257550156504097, + "learning_rate": 6.67408231368187e-07, + "loss": 0.0927, + "step": 60 + }, + { + "epoch": 0.020370679579228584, + "grad_norm": 1.4611566503899422, + "learning_rate": 6.785317018909901e-07, + "loss": 0.0949, + "step": 61 + }, + { + "epoch": 0.020704625146101184, + "grad_norm": 1.4170363748007948, + "learning_rate": 6.896551724137931e-07, + "loss": 0.0756, + "step": 62 + }, + { + "epoch": 0.021038570712973784, + "grad_norm": 1.5649016246749317, + "learning_rate": 7.007786429365964e-07, + "loss": 0.1167, + "step": 63 + }, + { + "epoch": 0.021372516279846387, + "grad_norm": 1.7223755363490967, + "learning_rate": 7.119021134593994e-07, + "loss": 0.0914, + "step": 64 + }, + { + "epoch": 0.021706461846718986, + "grad_norm": 2.264053959910125, + "learning_rate": 7.230255839822026e-07, + "loss": 0.1022, + "step": 65 + }, + { + "epoch": 0.022040407413591586, + "grad_norm": 1.2864993126473974, + "learning_rate": 7.341490545050057e-07, + "loss": 0.1053, + "step": 66 + }, + { + "epoch": 0.022374352980464186, + "grad_norm": 1.44023472678529, + "learning_rate": 7.452725250278087e-07, + "loss": 0.0957, + "step": 67 + }, + { + "epoch": 0.022708298547336785, + "grad_norm": 1.6488839104515984, + "learning_rate": 7.563959955506118e-07, + "loss": 0.1114, + "step": 68 + }, + { + "epoch": 0.023042244114209385, + "grad_norm": 1.4538025741923084, + "learning_rate": 7.675194660734149e-07, + "loss": 0.108, + "step": 69 + }, + { + "epoch": 0.023376189681081985, + "grad_norm": 1.5002627715774677, + "learning_rate": 7.786429365962181e-07, + "loss": 0.1141, + "step": 70 + }, + { + "epoch": 0.023710135247954584, + "grad_norm": 1.586495576761618, + "learning_rate": 7.897664071190211e-07, + "loss": 0.1098, + "step": 71 + }, + { + "epoch": 0.024044080814827184, + "grad_norm": 1.1824262761185285, + "learning_rate": 8.008898776418243e-07, + "loss": 0.0772, + "step": 72 + }, + { + "epoch": 0.024378026381699783, + "grad_norm": 1.618110947873907, + "learning_rate": 8.120133481646274e-07, + "loss": 0.1097, + "step": 73 + }, + { + "epoch": 0.024711971948572383, + "grad_norm": 1.1671719943955008, + "learning_rate": 8.231368186874306e-07, + "loss": 0.0852, + "step": 74 + }, + { + "epoch": 0.025045917515444983, + "grad_norm": 1.2830768944527116, + "learning_rate": 8.342602892102336e-07, + "loss": 0.1165, + "step": 75 + }, + { + "epoch": 0.025379863082317582, + "grad_norm": 1.4018712313176183, + "learning_rate": 8.453837597330368e-07, + "loss": 0.0874, + "step": 76 + }, + { + "epoch": 0.025713808649190182, + "grad_norm": 1.2432680393980957, + "learning_rate": 8.565072302558399e-07, + "loss": 0.1088, + "step": 77 + }, + { + "epoch": 0.02604775421606278, + "grad_norm": 1.3164039971574175, + "learning_rate": 8.67630700778643e-07, + "loss": 0.0894, + "step": 78 + }, + { + "epoch": 0.02638169978293538, + "grad_norm": 1.642213217588854, + "learning_rate": 8.78754171301446e-07, + "loss": 0.0894, + "step": 79 + }, + { + "epoch": 0.02671564534980798, + "grad_norm": 1.0839090380138228, + "learning_rate": 8.898776418242493e-07, + "loss": 0.0806, + "step": 80 + }, + { + "epoch": 0.02704959091668058, + "grad_norm": 1.8952932050331013, + "learning_rate": 9.010011123470523e-07, + "loss": 0.132, + "step": 81 + }, + { + "epoch": 0.02738353648355318, + "grad_norm": 1.465175894344485, + "learning_rate": 9.121245828698556e-07, + "loss": 0.0628, + "step": 82 + }, + { + "epoch": 0.02771748205042578, + "grad_norm": 1.3366732234313092, + "learning_rate": 9.232480533926586e-07, + "loss": 0.0863, + "step": 83 + }, + { + "epoch": 0.02805142761729838, + "grad_norm": 1.643937154471812, + "learning_rate": 9.343715239154617e-07, + "loss": 0.096, + "step": 84 + }, + { + "epoch": 0.02838537318417098, + "grad_norm": 1.2570354689622436, + "learning_rate": 9.454949944382647e-07, + "loss": 0.0948, + "step": 85 + }, + { + "epoch": 0.02871931875104358, + "grad_norm": 1.5112058267872772, + "learning_rate": 9.566184649610679e-07, + "loss": 0.1044, + "step": 86 + }, + { + "epoch": 0.02905326431791618, + "grad_norm": 1.0702555105247553, + "learning_rate": 9.67741935483871e-07, + "loss": 0.0762, + "step": 87 + }, + { + "epoch": 0.029387209884788778, + "grad_norm": 1.6750200548468086, + "learning_rate": 9.788654060066741e-07, + "loss": 0.1282, + "step": 88 + }, + { + "epoch": 0.029721155451661378, + "grad_norm": 2.2918645416277785, + "learning_rate": 9.899888765294773e-07, + "loss": 0.0915, + "step": 89 + }, + { + "epoch": 0.03005510101853398, + "grad_norm": 2.242498528106184, + "learning_rate": 1.0011123470522804e-06, + "loss": 0.1077, + "step": 90 + }, + { + "epoch": 0.03038904658540658, + "grad_norm": 1.6529779325988916, + "learning_rate": 1.0122358175750835e-06, + "loss": 0.0981, + "step": 91 + }, + { + "epoch": 0.03072299215227918, + "grad_norm": 1.6048070404143326, + "learning_rate": 1.0233592880978867e-06, + "loss": 0.0823, + "step": 92 + }, + { + "epoch": 0.03105693771915178, + "grad_norm": 1.6293330212334107, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.1174, + "step": 93 + }, + { + "epoch": 0.03139088328602438, + "grad_norm": 1.4328639917912847, + "learning_rate": 1.045606229143493e-06, + "loss": 0.1005, + "step": 94 + }, + { + "epoch": 0.03172482885289698, + "grad_norm": 1.2582858682128226, + "learning_rate": 1.056729699666296e-06, + "loss": 0.1019, + "step": 95 + }, + { + "epoch": 0.03205877441976958, + "grad_norm": 1.2746152228652043, + "learning_rate": 1.0678531701890992e-06, + "loss": 0.0946, + "step": 96 + }, + { + "epoch": 0.03239271998664218, + "grad_norm": 1.39449403536346, + "learning_rate": 1.0789766407119021e-06, + "loss": 0.0739, + "step": 97 + }, + { + "epoch": 0.03272666555351478, + "grad_norm": 1.3423087237113123, + "learning_rate": 1.0901001112347055e-06, + "loss": 0.0728, + "step": 98 + }, + { + "epoch": 0.03306061112038738, + "grad_norm": 1.6223819048746644, + "learning_rate": 1.1012235817575084e-06, + "loss": 0.1153, + "step": 99 + }, + { + "epoch": 0.03339455668725998, + "grad_norm": 1.3427962365052586, + "learning_rate": 1.1123470522803115e-06, + "loss": 0.0835, + "step": 100 + }, + { + "epoch": 0.03372850225413258, + "grad_norm": 1.149441339699243, + "learning_rate": 1.1234705228031146e-06, + "loss": 0.0849, + "step": 101 + }, + { + "epoch": 0.034062447821005176, + "grad_norm": 1.5347477678101182, + "learning_rate": 1.1345939933259178e-06, + "loss": 0.0949, + "step": 102 + }, + { + "epoch": 0.034396393387877776, + "grad_norm": 1.5538378922417313, + "learning_rate": 1.145717463848721e-06, + "loss": 0.1102, + "step": 103 + }, + { + "epoch": 0.034730338954750375, + "grad_norm": 1.6668690527736754, + "learning_rate": 1.156840934371524e-06, + "loss": 0.0914, + "step": 104 + }, + { + "epoch": 0.035064284521622975, + "grad_norm": 1.2103230671109944, + "learning_rate": 1.1679644048943272e-06, + "loss": 0.0626, + "step": 105 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 1.8764172669861614, + "learning_rate": 1.17908787541713e-06, + "loss": 0.0691, + "step": 106 + }, + { + "epoch": 0.035732175655368174, + "grad_norm": 1.722910145928535, + "learning_rate": 1.1902113459399334e-06, + "loss": 0.093, + "step": 107 + }, + { + "epoch": 0.036066121222240774, + "grad_norm": 1.0618234858765518, + "learning_rate": 1.2013348164627363e-06, + "loss": 0.0761, + "step": 108 + }, + { + "epoch": 0.036400066789113374, + "grad_norm": 1.3121783082566816, + "learning_rate": 1.2124582869855397e-06, + "loss": 0.089, + "step": 109 + }, + { + "epoch": 0.03673401235598597, + "grad_norm": 1.4499363672076466, + "learning_rate": 1.2235817575083426e-06, + "loss": 0.0804, + "step": 110 + }, + { + "epoch": 0.03706795792285857, + "grad_norm": 1.2615508501901613, + "learning_rate": 1.2347052280311457e-06, + "loss": 0.0845, + "step": 111 + }, + { + "epoch": 0.03740190348973117, + "grad_norm": 0.9940605122797614, + "learning_rate": 1.2458286985539489e-06, + "loss": 0.0668, + "step": 112 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 1.1405073614214674, + "learning_rate": 1.256952169076752e-06, + "loss": 0.0681, + "step": 113 + }, + { + "epoch": 0.03806979462347637, + "grad_norm": 1.4030861518429394, + "learning_rate": 1.2680756395995554e-06, + "loss": 0.0814, + "step": 114 + }, + { + "epoch": 0.03840374019034897, + "grad_norm": 1.3578007200676907, + "learning_rate": 1.2791991101223583e-06, + "loss": 0.0895, + "step": 115 + }, + { + "epoch": 0.03873768575722157, + "grad_norm": 1.0249529431802051, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.0739, + "step": 116 + }, + { + "epoch": 0.03907163132409417, + "grad_norm": 1.235415087018068, + "learning_rate": 1.3014460511679643e-06, + "loss": 0.0824, + "step": 117 + }, + { + "epoch": 0.03940557689096677, + "grad_norm": 1.3479513963722216, + "learning_rate": 1.3125695216907677e-06, + "loss": 0.0801, + "step": 118 + }, + { + "epoch": 0.03973952245783937, + "grad_norm": 1.2754322824182192, + "learning_rate": 1.3236929922135708e-06, + "loss": 0.1071, + "step": 119 + }, + { + "epoch": 0.04007346802471197, + "grad_norm": 1.1196937681855994, + "learning_rate": 1.334816462736374e-06, + "loss": 0.0711, + "step": 120 + }, + { + "epoch": 0.04040741359158457, + "grad_norm": 1.0251751276579624, + "learning_rate": 1.3459399332591769e-06, + "loss": 0.0657, + "step": 121 + }, + { + "epoch": 0.04074135915845717, + "grad_norm": 1.8597822373350466, + "learning_rate": 1.3570634037819802e-06, + "loss": 0.0991, + "step": 122 + }, + { + "epoch": 0.04107530472532977, + "grad_norm": 0.8913110658609731, + "learning_rate": 1.3681868743047833e-06, + "loss": 0.0582, + "step": 123 + }, + { + "epoch": 0.04140925029220237, + "grad_norm": 1.7385584510532524, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.114, + "step": 124 + }, + { + "epoch": 0.04174319585907497, + "grad_norm": 1.1376218320024045, + "learning_rate": 1.3904338153503894e-06, + "loss": 0.0692, + "step": 125 + }, + { + "epoch": 0.04207714142594757, + "grad_norm": 1.3062772064746195, + "learning_rate": 1.4015572858731927e-06, + "loss": 0.0954, + "step": 126 + }, + { + "epoch": 0.04241108699282017, + "grad_norm": 1.366103399563069, + "learning_rate": 1.4126807563959956e-06, + "loss": 0.0928, + "step": 127 + }, + { + "epoch": 0.042745032559692774, + "grad_norm": 1.2246926697415539, + "learning_rate": 1.4238042269187988e-06, + "loss": 0.0791, + "step": 128 + }, + { + "epoch": 0.04307897812656537, + "grad_norm": 1.6570991234879342, + "learning_rate": 1.434927697441602e-06, + "loss": 0.0858, + "step": 129 + }, + { + "epoch": 0.04341292369343797, + "grad_norm": 1.369880127314859, + "learning_rate": 1.4460511679644053e-06, + "loss": 0.0833, + "step": 130 + }, + { + "epoch": 0.04374686926031057, + "grad_norm": 1.206854655306054, + "learning_rate": 1.4571746384872082e-06, + "loss": 0.0765, + "step": 131 + }, + { + "epoch": 0.04408081482718317, + "grad_norm": 1.125499057097289, + "learning_rate": 1.4682981090100113e-06, + "loss": 0.0756, + "step": 132 + }, + { + "epoch": 0.04441476039405577, + "grad_norm": 1.4059140667007435, + "learning_rate": 1.4794215795328142e-06, + "loss": 0.0848, + "step": 133 + }, + { + "epoch": 0.04474870596092837, + "grad_norm": 1.06168619350604, + "learning_rate": 1.4905450500556174e-06, + "loss": 0.052, + "step": 134 + }, + { + "epoch": 0.04508265152780097, + "grad_norm": 1.6567637931657537, + "learning_rate": 1.5016685205784207e-06, + "loss": 0.0992, + "step": 135 + }, + { + "epoch": 0.04541659709467357, + "grad_norm": 1.920957032614187, + "learning_rate": 1.5127919911012236e-06, + "loss": 0.1022, + "step": 136 + }, + { + "epoch": 0.04575054266154617, + "grad_norm": 1.074227017597323, + "learning_rate": 1.5239154616240268e-06, + "loss": 0.0768, + "step": 137 + }, + { + "epoch": 0.04608448822841877, + "grad_norm": 1.016076882611769, + "learning_rate": 1.5350389321468299e-06, + "loss": 0.0695, + "step": 138 + }, + { + "epoch": 0.04641843379529137, + "grad_norm": 0.9899774375747383, + "learning_rate": 1.5461624026696332e-06, + "loss": 0.0661, + "step": 139 + }, + { + "epoch": 0.04675237936216397, + "grad_norm": 1.308714111392643, + "learning_rate": 1.5572858731924361e-06, + "loss": 0.0605, + "step": 140 + }, + { + "epoch": 0.04708632492903657, + "grad_norm": 1.02745462441021, + "learning_rate": 1.5684093437152393e-06, + "loss": 0.0775, + "step": 141 + }, + { + "epoch": 0.04742027049590917, + "grad_norm": 1.168735536247947, + "learning_rate": 1.5795328142380422e-06, + "loss": 0.0917, + "step": 142 + }, + { + "epoch": 0.04775421606278177, + "grad_norm": 1.196785596172984, + "learning_rate": 1.5906562847608455e-06, + "loss": 0.0797, + "step": 143 + }, + { + "epoch": 0.04808816162965437, + "grad_norm": 1.1451563981154869, + "learning_rate": 1.6017797552836487e-06, + "loss": 0.078, + "step": 144 + }, + { + "epoch": 0.04842210719652697, + "grad_norm": 1.0392222279053414, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.0793, + "step": 145 + }, + { + "epoch": 0.04875605276339957, + "grad_norm": 1.0934735024383997, + "learning_rate": 1.6240266963292547e-06, + "loss": 0.0716, + "step": 146 + }, + { + "epoch": 0.049089998330272167, + "grad_norm": 1.4519901055252726, + "learning_rate": 1.635150166852058e-06, + "loss": 0.0757, + "step": 147 + }, + { + "epoch": 0.049423943897144766, + "grad_norm": 2.2852241619827813, + "learning_rate": 1.6462736373748612e-06, + "loss": 0.0862, + "step": 148 + }, + { + "epoch": 0.049757889464017366, + "grad_norm": 1.6408340659482898, + "learning_rate": 1.6573971078976641e-06, + "loss": 0.0886, + "step": 149 + }, + { + "epoch": 0.050091835030889965, + "grad_norm": 2.0454946446505016, + "learning_rate": 1.6685205784204673e-06, + "loss": 0.0772, + "step": 150 + }, + { + "epoch": 0.050425780597762565, + "grad_norm": 1.2259922787210893, + "learning_rate": 1.6796440489432706e-06, + "loss": 0.0683, + "step": 151 + }, + { + "epoch": 0.050759726164635165, + "grad_norm": 1.1562777397964157, + "learning_rate": 1.6907675194660735e-06, + "loss": 0.0757, + "step": 152 + }, + { + "epoch": 0.051093671731507764, + "grad_norm": 1.136779453981954, + "learning_rate": 1.7018909899888767e-06, + "loss": 0.0913, + "step": 153 + }, + { + "epoch": 0.051427617298380364, + "grad_norm": 1.0030489951613217, + "learning_rate": 1.7130144605116798e-06, + "loss": 0.0682, + "step": 154 + }, + { + "epoch": 0.051761562865252964, + "grad_norm": 1.7385357922589744, + "learning_rate": 1.724137931034483e-06, + "loss": 0.074, + "step": 155 + }, + { + "epoch": 0.05209550843212556, + "grad_norm": 1.3740643745362002, + "learning_rate": 1.735261401557286e-06, + "loss": 0.0753, + "step": 156 + }, + { + "epoch": 0.05242945399899816, + "grad_norm": 1.1583165071055108, + "learning_rate": 1.7463848720800892e-06, + "loss": 0.0624, + "step": 157 + }, + { + "epoch": 0.05276339956587076, + "grad_norm": 1.1621428052160172, + "learning_rate": 1.757508342602892e-06, + "loss": 0.0728, + "step": 158 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 0.9935446809403798, + "learning_rate": 1.7686318131256954e-06, + "loss": 0.064, + "step": 159 + }, + { + "epoch": 0.05343129069961596, + "grad_norm": 1.0753646446851233, + "learning_rate": 1.7797552836484986e-06, + "loss": 0.0735, + "step": 160 + }, + { + "epoch": 0.05376523626648856, + "grad_norm": 1.1635933935840013, + "learning_rate": 1.7908787541713015e-06, + "loss": 0.0601, + "step": 161 + }, + { + "epoch": 0.05409918183336116, + "grad_norm": 0.976271537576658, + "learning_rate": 1.8020022246941046e-06, + "loss": 0.0642, + "step": 162 + }, + { + "epoch": 0.05443312740023376, + "grad_norm": 1.4114206211723894, + "learning_rate": 1.813125695216908e-06, + "loss": 0.0934, + "step": 163 + }, + { + "epoch": 0.05476707296710636, + "grad_norm": 1.237154441559006, + "learning_rate": 1.824249165739711e-06, + "loss": 0.086, + "step": 164 + }, + { + "epoch": 0.05510101853397896, + "grad_norm": 1.2253824367688997, + "learning_rate": 1.835372636262514e-06, + "loss": 0.0865, + "step": 165 + }, + { + "epoch": 0.05543496410085156, + "grad_norm": 0.8378818554174366, + "learning_rate": 1.8464961067853172e-06, + "loss": 0.0659, + "step": 166 + }, + { + "epoch": 0.05576890966772416, + "grad_norm": 0.9223176156027858, + "learning_rate": 1.85761957730812e-06, + "loss": 0.0741, + "step": 167 + }, + { + "epoch": 0.05610285523459676, + "grad_norm": 1.3912311372924906, + "learning_rate": 1.8687430478309234e-06, + "loss": 0.0971, + "step": 168 + }, + { + "epoch": 0.05643680080146936, + "grad_norm": 1.0181918901777176, + "learning_rate": 1.8798665183537266e-06, + "loss": 0.0748, + "step": 169 + }, + { + "epoch": 0.05677074636834196, + "grad_norm": 0.8910063887713932, + "learning_rate": 1.8909899888765295e-06, + "loss": 0.0625, + "step": 170 + }, + { + "epoch": 0.05710469193521456, + "grad_norm": 1.3057004674938029, + "learning_rate": 1.9021134593993326e-06, + "loss": 0.0941, + "step": 171 + }, + { + "epoch": 0.05743863750208716, + "grad_norm": 1.193105077297629, + "learning_rate": 1.9132369299221357e-06, + "loss": 0.0643, + "step": 172 + }, + { + "epoch": 0.05777258306895976, + "grad_norm": 1.3055004936633776, + "learning_rate": 1.924360400444939e-06, + "loss": 0.0895, + "step": 173 + }, + { + "epoch": 0.05810652863583236, + "grad_norm": 1.1564338453638052, + "learning_rate": 1.935483870967742e-06, + "loss": 0.0582, + "step": 174 + }, + { + "epoch": 0.058440474202704956, + "grad_norm": 1.5341843111867526, + "learning_rate": 1.946607341490545e-06, + "loss": 0.0936, + "step": 175 + }, + { + "epoch": 0.058774419769577556, + "grad_norm": 1.4090086836755578, + "learning_rate": 1.9577308120133483e-06, + "loss": 0.0906, + "step": 176 + }, + { + "epoch": 0.059108365336450155, + "grad_norm": 0.9351572301997615, + "learning_rate": 1.9688542825361514e-06, + "loss": 0.0809, + "step": 177 + }, + { + "epoch": 0.059442310903322755, + "grad_norm": 1.2238183304370427, + "learning_rate": 1.9799777530589545e-06, + "loss": 0.0839, + "step": 178 + }, + { + "epoch": 0.059776256470195355, + "grad_norm": 0.7778091378640176, + "learning_rate": 1.9911012235817577e-06, + "loss": 0.0467, + "step": 179 + }, + { + "epoch": 0.06011020203706796, + "grad_norm": 1.760420759812158, + "learning_rate": 2.002224694104561e-06, + "loss": 0.1007, + "step": 180 + }, + { + "epoch": 0.06044414760394056, + "grad_norm": 1.1970302826818442, + "learning_rate": 2.013348164627364e-06, + "loss": 0.0835, + "step": 181 + }, + { + "epoch": 0.06077809317081316, + "grad_norm": 1.2558562706280731, + "learning_rate": 2.024471635150167e-06, + "loss": 0.0936, + "step": 182 + }, + { + "epoch": 0.06111203873768576, + "grad_norm": 0.9804679330236008, + "learning_rate": 2.03559510567297e-06, + "loss": 0.0696, + "step": 183 + }, + { + "epoch": 0.06144598430455836, + "grad_norm": 1.1097267450527353, + "learning_rate": 2.0467185761957733e-06, + "loss": 0.0801, + "step": 184 + }, + { + "epoch": 0.06177992987143096, + "grad_norm": 1.438725958156707, + "learning_rate": 2.0578420467185764e-06, + "loss": 0.0766, + "step": 185 + }, + { + "epoch": 0.06211387543830356, + "grad_norm": 1.450645634817868, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.0605, + "step": 186 + }, + { + "epoch": 0.06244782100517616, + "grad_norm": 0.9637236859717696, + "learning_rate": 2.0800889877641823e-06, + "loss": 0.071, + "step": 187 + }, + { + "epoch": 0.06278176657204876, + "grad_norm": 1.4369154647412303, + "learning_rate": 2.091212458286986e-06, + "loss": 0.0729, + "step": 188 + }, + { + "epoch": 0.06311571213892135, + "grad_norm": 1.489682534370634, + "learning_rate": 2.102335928809789e-06, + "loss": 0.0743, + "step": 189 + }, + { + "epoch": 0.06344965770579396, + "grad_norm": 0.9133377888567851, + "learning_rate": 2.113459399332592e-06, + "loss": 0.0658, + "step": 190 + }, + { + "epoch": 0.06378360327266655, + "grad_norm": 1.3622393297307778, + "learning_rate": 2.124582869855395e-06, + "loss": 0.0772, + "step": 191 + }, + { + "epoch": 0.06411754883953916, + "grad_norm": 1.0931775160388473, + "learning_rate": 2.1357063403781984e-06, + "loss": 0.064, + "step": 192 + }, + { + "epoch": 0.06445149440641175, + "grad_norm": 0.9403564090361918, + "learning_rate": 2.1468298109010015e-06, + "loss": 0.0643, + "step": 193 + }, + { + "epoch": 0.06478543997328436, + "grad_norm": 0.8675001719292815, + "learning_rate": 2.1579532814238042e-06, + "loss": 0.0589, + "step": 194 + }, + { + "epoch": 0.06511938554015695, + "grad_norm": 1.1933566142668406, + "learning_rate": 2.1690767519466073e-06, + "loss": 0.0862, + "step": 195 + }, + { + "epoch": 0.06545333110702956, + "grad_norm": 1.2459886422557267, + "learning_rate": 2.180200222469411e-06, + "loss": 0.0663, + "step": 196 + }, + { + "epoch": 0.06578727667390215, + "grad_norm": 1.3992714860593076, + "learning_rate": 2.1913236929922136e-06, + "loss": 0.0872, + "step": 197 + }, + { + "epoch": 0.06612122224077475, + "grad_norm": 1.2606360762089206, + "learning_rate": 2.2024471635150167e-06, + "loss": 0.0713, + "step": 198 + }, + { + "epoch": 0.06645516780764735, + "grad_norm": 1.2973243700693666, + "learning_rate": 2.21357063403782e-06, + "loss": 0.1072, + "step": 199 + }, + { + "epoch": 0.06678911337451995, + "grad_norm": 1.4558397957910205, + "learning_rate": 2.224694104560623e-06, + "loss": 0.1002, + "step": 200 + }, + { + "epoch": 0.06712305894139255, + "grad_norm": 0.9920794318487967, + "learning_rate": 2.235817575083426e-06, + "loss": 0.0677, + "step": 201 + }, + { + "epoch": 0.06745700450826515, + "grad_norm": 1.087436627599926, + "learning_rate": 2.2469410456062293e-06, + "loss": 0.0872, + "step": 202 + }, + { + "epoch": 0.06779095007513775, + "grad_norm": 1.02095781221236, + "learning_rate": 2.2580645161290324e-06, + "loss": 0.0721, + "step": 203 + }, + { + "epoch": 0.06812489564201035, + "grad_norm": 0.9306631155090207, + "learning_rate": 2.2691879866518355e-06, + "loss": 0.0601, + "step": 204 + }, + { + "epoch": 0.06845884120888296, + "grad_norm": 1.3190666786505112, + "learning_rate": 2.2803114571746387e-06, + "loss": 0.091, + "step": 205 + }, + { + "epoch": 0.06879278677575555, + "grad_norm": 0.8639747841314965, + "learning_rate": 2.291434927697442e-06, + "loss": 0.076, + "step": 206 + }, + { + "epoch": 0.06912673234262816, + "grad_norm": 1.238776383531512, + "learning_rate": 2.302558398220245e-06, + "loss": 0.091, + "step": 207 + }, + { + "epoch": 0.06946067790950075, + "grad_norm": 1.0654244974201894, + "learning_rate": 2.313681868743048e-06, + "loss": 0.0812, + "step": 208 + }, + { + "epoch": 0.06979462347637336, + "grad_norm": 1.0719673964669938, + "learning_rate": 2.324805339265851e-06, + "loss": 0.0608, + "step": 209 + }, + { + "epoch": 0.07012856904324595, + "grad_norm": 1.007448591232436, + "learning_rate": 2.3359288097886543e-06, + "loss": 0.0573, + "step": 210 + }, + { + "epoch": 0.07046251461011856, + "grad_norm": 0.9909285215050861, + "learning_rate": 2.3470522803114575e-06, + "loss": 0.0737, + "step": 211 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 1.4902013241813645, + "learning_rate": 2.35817575083426e-06, + "loss": 0.0762, + "step": 212 + }, + { + "epoch": 0.07113040574386376, + "grad_norm": 1.0594307960564555, + "learning_rate": 2.3692992213570637e-06, + "loss": 0.0729, + "step": 213 + }, + { + "epoch": 0.07146435131073635, + "grad_norm": 1.6186928433649752, + "learning_rate": 2.380422691879867e-06, + "loss": 0.1039, + "step": 214 + }, + { + "epoch": 0.07179829687760896, + "grad_norm": 1.0664749163779126, + "learning_rate": 2.39154616240267e-06, + "loss": 0.0708, + "step": 215 + }, + { + "epoch": 0.07213224244448155, + "grad_norm": 0.8702436463796133, + "learning_rate": 2.4026696329254727e-06, + "loss": 0.0815, + "step": 216 + }, + { + "epoch": 0.07246618801135415, + "grad_norm": 0.9979296999970432, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.0753, + "step": 217 + }, + { + "epoch": 0.07280013357822675, + "grad_norm": 1.2439645375449007, + "learning_rate": 2.4249165739710794e-06, + "loss": 0.0891, + "step": 218 + }, + { + "epoch": 0.07313407914509935, + "grad_norm": 0.9537212933511459, + "learning_rate": 2.436040044493882e-06, + "loss": 0.0651, + "step": 219 + }, + { + "epoch": 0.07346802471197195, + "grad_norm": 1.0016290487955293, + "learning_rate": 2.4471635150166852e-06, + "loss": 0.0629, + "step": 220 + }, + { + "epoch": 0.07380197027884455, + "grad_norm": 1.0540971677217288, + "learning_rate": 2.4582869855394888e-06, + "loss": 0.0876, + "step": 221 + }, + { + "epoch": 0.07413591584571715, + "grad_norm": 1.2439977170608356, + "learning_rate": 2.4694104560622915e-06, + "loss": 0.0624, + "step": 222 + }, + { + "epoch": 0.07446986141258975, + "grad_norm": 1.2972720451499673, + "learning_rate": 2.4805339265850946e-06, + "loss": 0.0873, + "step": 223 + }, + { + "epoch": 0.07480380697946235, + "grad_norm": 1.4086762013586394, + "learning_rate": 2.4916573971078977e-06, + "loss": 0.0708, + "step": 224 + }, + { + "epoch": 0.07513775254633495, + "grad_norm": 1.0791618745968774, + "learning_rate": 2.502780867630701e-06, + "loss": 0.0705, + "step": 225 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 1.0990305310646942, + "learning_rate": 2.513904338153504e-06, + "loss": 0.0739, + "step": 226 + }, + { + "epoch": 0.07580564368008015, + "grad_norm": 1.013766222792433, + "learning_rate": 2.5250278086763076e-06, + "loss": 0.0687, + "step": 227 + }, + { + "epoch": 0.07613958924695274, + "grad_norm": 1.1326007702379792, + "learning_rate": 2.5361512791991107e-06, + "loss": 0.0843, + "step": 228 + }, + { + "epoch": 0.07647353481382535, + "grad_norm": 1.2189122363684406, + "learning_rate": 2.5472747497219134e-06, + "loss": 0.0777, + "step": 229 + }, + { + "epoch": 0.07680748038069794, + "grad_norm": 1.4445267765189558, + "learning_rate": 2.5583982202447165e-06, + "loss": 0.0654, + "step": 230 + }, + { + "epoch": 0.07714142594757055, + "grad_norm": 1.061842223719913, + "learning_rate": 2.5695216907675197e-06, + "loss": 0.0843, + "step": 231 + }, + { + "epoch": 0.07747537151444314, + "grad_norm": 0.8838057693927293, + "learning_rate": 2.580645161290323e-06, + "loss": 0.0471, + "step": 232 + }, + { + "epoch": 0.07780931708131575, + "grad_norm": 1.3696770724687637, + "learning_rate": 2.591768631813126e-06, + "loss": 0.0752, + "step": 233 + }, + { + "epoch": 0.07814326264818834, + "grad_norm": 1.2158788748088327, + "learning_rate": 2.6028921023359286e-06, + "loss": 0.0944, + "step": 234 + }, + { + "epoch": 0.07847720821506095, + "grad_norm": 1.295274490342438, + "learning_rate": 2.6140155728587318e-06, + "loss": 0.0857, + "step": 235 + }, + { + "epoch": 0.07881115378193354, + "grad_norm": 1.1539292236059824, + "learning_rate": 2.6251390433815353e-06, + "loss": 0.0508, + "step": 236 + }, + { + "epoch": 0.07914509934880615, + "grad_norm": 0.7116705685872293, + "learning_rate": 2.6362625139043385e-06, + "loss": 0.0517, + "step": 237 + }, + { + "epoch": 0.07947904491567874, + "grad_norm": 1.21665495177681, + "learning_rate": 2.6473859844271416e-06, + "loss": 0.0843, + "step": 238 + }, + { + "epoch": 0.07981299048255135, + "grad_norm": 0.8226166857909414, + "learning_rate": 2.6585094549499447e-06, + "loss": 0.0771, + "step": 239 + }, + { + "epoch": 0.08014693604942394, + "grad_norm": 1.140511279181508, + "learning_rate": 2.669632925472748e-06, + "loss": 0.0806, + "step": 240 + }, + { + "epoch": 0.08048088161629655, + "grad_norm": 2.6499827732546657, + "learning_rate": 2.6807563959955506e-06, + "loss": 0.072, + "step": 241 + }, + { + "epoch": 0.08081482718316914, + "grad_norm": 1.2457689203698872, + "learning_rate": 2.6918798665183537e-06, + "loss": 0.0827, + "step": 242 + }, + { + "epoch": 0.08114877275004174, + "grad_norm": 1.2331789620109057, + "learning_rate": 2.703003337041157e-06, + "loss": 0.0764, + "step": 243 + }, + { + "epoch": 0.08148271831691434, + "grad_norm": 2.403989830723792, + "learning_rate": 2.7141268075639604e-06, + "loss": 0.0933, + "step": 244 + }, + { + "epoch": 0.08181666388378694, + "grad_norm": 0.8317597079044875, + "learning_rate": 2.7252502780867635e-06, + "loss": 0.0544, + "step": 245 + }, + { + "epoch": 0.08215060945065954, + "grad_norm": 1.1552367973010966, + "learning_rate": 2.7363737486095667e-06, + "loss": 0.0958, + "step": 246 + }, + { + "epoch": 0.08248455501753214, + "grad_norm": 1.2187444616161907, + "learning_rate": 2.7474972191323694e-06, + "loss": 0.0896, + "step": 247 + }, + { + "epoch": 0.08281850058440474, + "grad_norm": 1.6494729328754958, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.0858, + "step": 248 + }, + { + "epoch": 0.08315244615127734, + "grad_norm": 1.2042569328691684, + "learning_rate": 2.7697441601779756e-06, + "loss": 0.0522, + "step": 249 + }, + { + "epoch": 0.08348639171814994, + "grad_norm": 1.0969576654880853, + "learning_rate": 2.7808676307007788e-06, + "loss": 0.0698, + "step": 250 + }, + { + "epoch": 0.08382033728502254, + "grad_norm": 1.4190087693901525, + "learning_rate": 2.791991101223582e-06, + "loss": 0.0858, + "step": 251 + }, + { + "epoch": 0.08415428285189513, + "grad_norm": 1.011867095931969, + "learning_rate": 2.8031145717463854e-06, + "loss": 0.0663, + "step": 252 + }, + { + "epoch": 0.08448822841876774, + "grad_norm": 1.1357462237499456, + "learning_rate": 2.8142380422691886e-06, + "loss": 0.0758, + "step": 253 + }, + { + "epoch": 0.08482217398564033, + "grad_norm": 1.0992432358389839, + "learning_rate": 2.8253615127919913e-06, + "loss": 0.0686, + "step": 254 + }, + { + "epoch": 0.08515611955251294, + "grad_norm": 1.2210379843748043, + "learning_rate": 2.8364849833147944e-06, + "loss": 0.0751, + "step": 255 + }, + { + "epoch": 0.08549006511938555, + "grad_norm": 1.333496324687641, + "learning_rate": 2.8476084538375975e-06, + "loss": 0.0856, + "step": 256 + }, + { + "epoch": 0.08582401068625814, + "grad_norm": 0.8982589393714245, + "learning_rate": 2.8587319243604007e-06, + "loss": 0.0685, + "step": 257 + }, + { + "epoch": 0.08615795625313075, + "grad_norm": 0.7664615334007062, + "learning_rate": 2.869855394883204e-06, + "loss": 0.0562, + "step": 258 + }, + { + "epoch": 0.08649190182000334, + "grad_norm": 0.8853220569466045, + "learning_rate": 2.8809788654060065e-06, + "loss": 0.0478, + "step": 259 + }, + { + "epoch": 0.08682584738687595, + "grad_norm": 1.3074993687517895, + "learning_rate": 2.8921023359288105e-06, + "loss": 0.0702, + "step": 260 + }, + { + "epoch": 0.08715979295374854, + "grad_norm": 1.4187470300484282, + "learning_rate": 2.903225806451613e-06, + "loss": 0.0798, + "step": 261 + }, + { + "epoch": 0.08749373852062114, + "grad_norm": 0.8946541168733985, + "learning_rate": 2.9143492769744163e-06, + "loss": 0.0546, + "step": 262 + }, + { + "epoch": 0.08782768408749374, + "grad_norm": 0.9019084816464253, + "learning_rate": 2.9254727474972195e-06, + "loss": 0.0637, + "step": 263 + }, + { + "epoch": 0.08816162965436634, + "grad_norm": 1.035031565835833, + "learning_rate": 2.9365962180200226e-06, + "loss": 0.071, + "step": 264 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 1.062399337907389, + "learning_rate": 2.9477196885428257e-06, + "loss": 0.0606, + "step": 265 + }, + { + "epoch": 0.08882952078811154, + "grad_norm": 1.3812400808776473, + "learning_rate": 2.9588431590656284e-06, + "loss": 0.099, + "step": 266 + }, + { + "epoch": 0.08916346635498414, + "grad_norm": 2.1106807968897905, + "learning_rate": 2.9699666295884316e-06, + "loss": 0.0697, + "step": 267 + }, + { + "epoch": 0.08949741192185674, + "grad_norm": 0.7470407396628205, + "learning_rate": 2.9810901001112347e-06, + "loss": 0.0525, + "step": 268 + }, + { + "epoch": 0.08983135748872934, + "grad_norm": 1.2416470600288456, + "learning_rate": 2.9922135706340383e-06, + "loss": 0.0878, + "step": 269 + }, + { + "epoch": 0.09016530305560194, + "grad_norm": 0.7739232097144934, + "learning_rate": 3.0033370411568414e-06, + "loss": 0.0486, + "step": 270 + }, + { + "epoch": 0.09049924862247453, + "grad_norm": 0.9689287092068435, + "learning_rate": 3.0144605116796445e-06, + "loss": 0.0615, + "step": 271 + }, + { + "epoch": 0.09083319418934714, + "grad_norm": 0.9706450420856501, + "learning_rate": 3.0255839822024472e-06, + "loss": 0.0663, + "step": 272 + }, + { + "epoch": 0.09116713975621973, + "grad_norm": 0.8113655680721432, + "learning_rate": 3.0367074527252504e-06, + "loss": 0.0584, + "step": 273 + }, + { + "epoch": 0.09150108532309234, + "grad_norm": 0.88652357445208, + "learning_rate": 3.0478309232480535e-06, + "loss": 0.0593, + "step": 274 + }, + { + "epoch": 0.09183503088996493, + "grad_norm": 0.9619013850625336, + "learning_rate": 3.0589543937708566e-06, + "loss": 0.0567, + "step": 275 + }, + { + "epoch": 0.09216897645683754, + "grad_norm": 1.0650640915491556, + "learning_rate": 3.0700778642936598e-06, + "loss": 0.0834, + "step": 276 + }, + { + "epoch": 0.09250292202371013, + "grad_norm": 1.0371199805327422, + "learning_rate": 3.0812013348164633e-06, + "loss": 0.0644, + "step": 277 + }, + { + "epoch": 0.09283686759058274, + "grad_norm": 1.5437364955186497, + "learning_rate": 3.0923248053392665e-06, + "loss": 0.0926, + "step": 278 + }, + { + "epoch": 0.09317081315745533, + "grad_norm": 1.350929344249909, + "learning_rate": 3.103448275862069e-06, + "loss": 0.0806, + "step": 279 + }, + { + "epoch": 0.09350475872432794, + "grad_norm": 0.984748131136075, + "learning_rate": 3.1145717463848723e-06, + "loss": 0.0644, + "step": 280 + }, + { + "epoch": 0.09383870429120053, + "grad_norm": 0.8197737946172219, + "learning_rate": 3.1256952169076754e-06, + "loss": 0.0551, + "step": 281 + }, + { + "epoch": 0.09417264985807314, + "grad_norm": 1.0028702309436293, + "learning_rate": 3.1368186874304786e-06, + "loss": 0.0855, + "step": 282 + }, + { + "epoch": 0.09450659542494573, + "grad_norm": 1.1354845043569812, + "learning_rate": 3.1479421579532817e-06, + "loss": 0.0656, + "step": 283 + }, + { + "epoch": 0.09484054099181834, + "grad_norm": 1.4093692815220873, + "learning_rate": 3.1590656284760844e-06, + "loss": 0.0926, + "step": 284 + }, + { + "epoch": 0.09517448655869093, + "grad_norm": 0.8150099194875217, + "learning_rate": 3.170189098998888e-06, + "loss": 0.0713, + "step": 285 + }, + { + "epoch": 0.09550843212556354, + "grad_norm": 0.9994394655029959, + "learning_rate": 3.181312569521691e-06, + "loss": 0.0471, + "step": 286 + }, + { + "epoch": 0.09584237769243613, + "grad_norm": 0.9166625944563191, + "learning_rate": 3.1924360400444942e-06, + "loss": 0.0714, + "step": 287 + }, + { + "epoch": 0.09617632325930874, + "grad_norm": 1.0476210871953306, + "learning_rate": 3.2035595105672973e-06, + "loss": 0.082, + "step": 288 + }, + { + "epoch": 0.09651026882618133, + "grad_norm": 0.8552898997492805, + "learning_rate": 3.2146829810901005e-06, + "loss": 0.0578, + "step": 289 + }, + { + "epoch": 0.09684421439305393, + "grad_norm": 0.9887021575952659, + "learning_rate": 3.225806451612903e-06, + "loss": 0.0721, + "step": 290 + }, + { + "epoch": 0.09717815995992653, + "grad_norm": 1.3001300892308882, + "learning_rate": 3.2369299221357063e-06, + "loss": 0.0797, + "step": 291 + }, + { + "epoch": 0.09751210552679913, + "grad_norm": 0.9616361386087116, + "learning_rate": 3.2480533926585095e-06, + "loss": 0.0751, + "step": 292 + }, + { + "epoch": 0.09784605109367173, + "grad_norm": 0.769818976357463, + "learning_rate": 3.259176863181313e-06, + "loss": 0.0476, + "step": 293 + }, + { + "epoch": 0.09817999666054433, + "grad_norm": 0.9039460600335342, + "learning_rate": 3.270300333704116e-06, + "loss": 0.0568, + "step": 294 + }, + { + "epoch": 0.09851394222741693, + "grad_norm": 0.7357670662426185, + "learning_rate": 3.2814238042269193e-06, + "loss": 0.061, + "step": 295 + }, + { + "epoch": 0.09884788779428953, + "grad_norm": 0.985486971157034, + "learning_rate": 3.2925472747497224e-06, + "loss": 0.0537, + "step": 296 + }, + { + "epoch": 0.09918183336116213, + "grad_norm": 1.0657623613030358, + "learning_rate": 3.303670745272525e-06, + "loss": 0.0727, + "step": 297 + }, + { + "epoch": 0.09951577892803473, + "grad_norm": 0.8178754986431233, + "learning_rate": 3.3147942157953282e-06, + "loss": 0.0575, + "step": 298 + }, + { + "epoch": 0.09984972449490732, + "grad_norm": 1.0267008299629332, + "learning_rate": 3.3259176863181314e-06, + "loss": 0.0837, + "step": 299 + }, + { + "epoch": 0.10018367006177993, + "grad_norm": 0.9577138509261269, + "learning_rate": 3.3370411568409345e-06, + "loss": 0.069, + "step": 300 + }, + { + "epoch": 0.10051761562865252, + "grad_norm": 0.7687077726851129, + "learning_rate": 3.3481646273637376e-06, + "loss": 0.0734, + "step": 301 + }, + { + "epoch": 0.10085156119552513, + "grad_norm": 0.9432917509341645, + "learning_rate": 3.359288097886541e-06, + "loss": 0.0901, + "step": 302 + }, + { + "epoch": 0.10118550676239772, + "grad_norm": 0.723295485828715, + "learning_rate": 3.3704115684093443e-06, + "loss": 0.0479, + "step": 303 + }, + { + "epoch": 0.10151945232927033, + "grad_norm": 0.8433208705630757, + "learning_rate": 3.381535038932147e-06, + "loss": 0.0631, + "step": 304 + }, + { + "epoch": 0.10185339789614292, + "grad_norm": 0.8451391738861146, + "learning_rate": 3.39265850945495e-06, + "loss": 0.0674, + "step": 305 + }, + { + "epoch": 0.10218734346301553, + "grad_norm": 0.9422394841862002, + "learning_rate": 3.4037819799777533e-06, + "loss": 0.0789, + "step": 306 + }, + { + "epoch": 0.10252128902988812, + "grad_norm": 0.6754944960236535, + "learning_rate": 3.4149054505005564e-06, + "loss": 0.0546, + "step": 307 + }, + { + "epoch": 0.10285523459676073, + "grad_norm": 0.854906938501467, + "learning_rate": 3.4260289210233596e-06, + "loss": 0.0691, + "step": 308 + }, + { + "epoch": 0.10318918016363333, + "grad_norm": 0.8412785322678099, + "learning_rate": 3.4371523915461623e-06, + "loss": 0.05, + "step": 309 + }, + { + "epoch": 0.10352312573050593, + "grad_norm": 0.975075371136509, + "learning_rate": 3.448275862068966e-06, + "loss": 0.0681, + "step": 310 + }, + { + "epoch": 0.10385707129737853, + "grad_norm": 1.204824033576981, + "learning_rate": 3.459399332591769e-06, + "loss": 0.0833, + "step": 311 + }, + { + "epoch": 0.10419101686425113, + "grad_norm": 1.6455562509076394, + "learning_rate": 3.470522803114572e-06, + "loss": 0.0872, + "step": 312 + }, + { + "epoch": 0.10452496243112373, + "grad_norm": 1.389913324675098, + "learning_rate": 3.4816462736373752e-06, + "loss": 0.0728, + "step": 313 + }, + { + "epoch": 0.10485890799799633, + "grad_norm": 1.0502187561605811, + "learning_rate": 3.4927697441601784e-06, + "loss": 0.067, + "step": 314 + }, + { + "epoch": 0.10519285356486893, + "grad_norm": 0.9059963677233313, + "learning_rate": 3.503893214682981e-06, + "loss": 0.064, + "step": 315 + }, + { + "epoch": 0.10552679913174152, + "grad_norm": 1.509854880843312, + "learning_rate": 3.515016685205784e-06, + "loss": 0.0755, + "step": 316 + }, + { + "epoch": 0.10586074469861413, + "grad_norm": 1.090887522966728, + "learning_rate": 3.5261401557285873e-06, + "loss": 0.0723, + "step": 317 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 0.8850762549294612, + "learning_rate": 3.537263626251391e-06, + "loss": 0.0788, + "step": 318 + }, + { + "epoch": 0.10652863583235933, + "grad_norm": 0.757063526049472, + "learning_rate": 3.548387096774194e-06, + "loss": 0.054, + "step": 319 + }, + { + "epoch": 0.10686258139923192, + "grad_norm": 0.8830245601771521, + "learning_rate": 3.559510567296997e-06, + "loss": 0.0651, + "step": 320 + }, + { + "epoch": 0.10719652696610453, + "grad_norm": 0.7908966462909623, + "learning_rate": 3.5706340378198003e-06, + "loss": 0.0682, + "step": 321 + }, + { + "epoch": 0.10753047253297712, + "grad_norm": 1.1397441595400608, + "learning_rate": 3.581757508342603e-06, + "loss": 0.0885, + "step": 322 + }, + { + "epoch": 0.10786441809984973, + "grad_norm": 0.8429977094893334, + "learning_rate": 3.592880978865406e-06, + "loss": 0.0614, + "step": 323 + }, + { + "epoch": 0.10819836366672232, + "grad_norm": 1.2403336317224285, + "learning_rate": 3.6040044493882093e-06, + "loss": 0.0795, + "step": 324 + }, + { + "epoch": 0.10853230923359493, + "grad_norm": 0.9696392027482027, + "learning_rate": 3.6151279199110124e-06, + "loss": 0.0728, + "step": 325 + }, + { + "epoch": 0.10886625480046752, + "grad_norm": 0.8848399109331946, + "learning_rate": 3.626251390433816e-06, + "loss": 0.0643, + "step": 326 + }, + { + "epoch": 0.10920020036734013, + "grad_norm": 0.8716169658223119, + "learning_rate": 3.637374860956619e-06, + "loss": 0.0509, + "step": 327 + }, + { + "epoch": 0.10953414593421272, + "grad_norm": 1.037505265979191, + "learning_rate": 3.648498331479422e-06, + "loss": 0.0753, + "step": 328 + }, + { + "epoch": 0.10986809150108533, + "grad_norm": 1.1819898795024548, + "learning_rate": 3.659621802002225e-06, + "loss": 0.0833, + "step": 329 + }, + { + "epoch": 0.11020203706795792, + "grad_norm": 1.1406256978667613, + "learning_rate": 3.670745272525028e-06, + "loss": 0.081, + "step": 330 + }, + { + "epoch": 0.11053598263483053, + "grad_norm": 1.0443853645878745, + "learning_rate": 3.681868743047831e-06, + "loss": 0.0614, + "step": 331 + }, + { + "epoch": 0.11086992820170312, + "grad_norm": 1.6306549726788961, + "learning_rate": 3.6929922135706343e-06, + "loss": 0.0434, + "step": 332 + }, + { + "epoch": 0.11120387376857573, + "grad_norm": 0.9531009380601928, + "learning_rate": 3.7041156840934374e-06, + "loss": 0.084, + "step": 333 + }, + { + "epoch": 0.11153781933544832, + "grad_norm": 1.0996412187005624, + "learning_rate": 3.71523915461624e-06, + "loss": 0.0864, + "step": 334 + }, + { + "epoch": 0.11187176490232092, + "grad_norm": 1.3393366382316005, + "learning_rate": 3.7263626251390437e-06, + "loss": 0.0723, + "step": 335 + }, + { + "epoch": 0.11220571046919352, + "grad_norm": 0.9568198725045595, + "learning_rate": 3.737486095661847e-06, + "loss": 0.0615, + "step": 336 + }, + { + "epoch": 0.11253965603606612, + "grad_norm": 0.9319636035637239, + "learning_rate": 3.74860956618465e-06, + "loss": 0.0644, + "step": 337 + }, + { + "epoch": 0.11287360160293872, + "grad_norm": 1.1870868966528694, + "learning_rate": 3.759733036707453e-06, + "loss": 0.0562, + "step": 338 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.9526342694332731, + "learning_rate": 3.7708565072302562e-06, + "loss": 0.0584, + "step": 339 + }, + { + "epoch": 0.11354149273668392, + "grad_norm": 1.2723184920889619, + "learning_rate": 3.781979977753059e-06, + "loss": 0.0736, + "step": 340 + }, + { + "epoch": 0.11387543830355652, + "grad_norm": 0.9263403362171003, + "learning_rate": 3.793103448275862e-06, + "loss": 0.0586, + "step": 341 + }, + { + "epoch": 0.11420938387042912, + "grad_norm": 0.8430467437161417, + "learning_rate": 3.804226918798665e-06, + "loss": 0.0556, + "step": 342 + }, + { + "epoch": 0.11454332943730172, + "grad_norm": 1.154771466253, + "learning_rate": 3.815350389321469e-06, + "loss": 0.0571, + "step": 343 + }, + { + "epoch": 0.11487727500417431, + "grad_norm": 1.1082196652446936, + "learning_rate": 3.8264738598442715e-06, + "loss": 0.0639, + "step": 344 + }, + { + "epoch": 0.11521122057104692, + "grad_norm": 0.8868903401007896, + "learning_rate": 3.837597330367075e-06, + "loss": 0.0533, + "step": 345 + }, + { + "epoch": 0.11554516613791951, + "grad_norm": 1.0803894175416395, + "learning_rate": 3.848720800889878e-06, + "loss": 0.0863, + "step": 346 + }, + { + "epoch": 0.11587911170479212, + "grad_norm": 1.0324462777222236, + "learning_rate": 3.859844271412681e-06, + "loss": 0.0578, + "step": 347 + }, + { + "epoch": 0.11621305727166471, + "grad_norm": 1.1025377393840483, + "learning_rate": 3.870967741935484e-06, + "loss": 0.0646, + "step": 348 + }, + { + "epoch": 0.11654700283853732, + "grad_norm": 0.7616860665860318, + "learning_rate": 3.8820912124582876e-06, + "loss": 0.0631, + "step": 349 + }, + { + "epoch": 0.11688094840540991, + "grad_norm": 1.3354653908723382, + "learning_rate": 3.89321468298109e-06, + "loss": 0.077, + "step": 350 + }, + { + "epoch": 0.11721489397228252, + "grad_norm": 0.8317935535836048, + "learning_rate": 3.904338153503894e-06, + "loss": 0.0741, + "step": 351 + }, + { + "epoch": 0.11754883953915511, + "grad_norm": 1.1109506420743003, + "learning_rate": 3.9154616240266965e-06, + "loss": 0.0763, + "step": 352 + }, + { + "epoch": 0.11788278510602772, + "grad_norm": 0.9494638434263117, + "learning_rate": 3.9265850945495e-06, + "loss": 0.0669, + "step": 353 + }, + { + "epoch": 0.11821673067290031, + "grad_norm": 0.7662930810479017, + "learning_rate": 3.937708565072303e-06, + "loss": 0.0597, + "step": 354 + }, + { + "epoch": 0.11855067623977292, + "grad_norm": 1.3373783677115336, + "learning_rate": 3.948832035595106e-06, + "loss": 0.0885, + "step": 355 + }, + { + "epoch": 0.11888462180664551, + "grad_norm": 1.0321161388076057, + "learning_rate": 3.959955506117909e-06, + "loss": 0.0572, + "step": 356 + }, + { + "epoch": 0.11921856737351812, + "grad_norm": 0.9316855711072523, + "learning_rate": 3.971078976640712e-06, + "loss": 0.0697, + "step": 357 + }, + { + "epoch": 0.11955251294039071, + "grad_norm": 1.1383940818027423, + "learning_rate": 3.982202447163515e-06, + "loss": 0.0543, + "step": 358 + }, + { + "epoch": 0.11988645850726332, + "grad_norm": 1.0241814060412053, + "learning_rate": 3.993325917686319e-06, + "loss": 0.068, + "step": 359 + }, + { + "epoch": 0.12022040407413592, + "grad_norm": 1.3927998360633838, + "learning_rate": 4.004449388209122e-06, + "loss": 0.0594, + "step": 360 + }, + { + "epoch": 0.12055434964100852, + "grad_norm": 1.465185164593061, + "learning_rate": 4.015572858731925e-06, + "loss": 0.0707, + "step": 361 + }, + { + "epoch": 0.12088829520788112, + "grad_norm": 1.0218384291719091, + "learning_rate": 4.026696329254728e-06, + "loss": 0.0862, + "step": 362 + }, + { + "epoch": 0.12122224077475371, + "grad_norm": 0.8335335272923647, + "learning_rate": 4.0378197997775306e-06, + "loss": 0.0666, + "step": 363 + }, + { + "epoch": 0.12155618634162632, + "grad_norm": 0.7971387678207199, + "learning_rate": 4.048943270300334e-06, + "loss": 0.0536, + "step": 364 + }, + { + "epoch": 0.12189013190849891, + "grad_norm": 0.9018760524980276, + "learning_rate": 4.060066740823137e-06, + "loss": 0.0652, + "step": 365 + }, + { + "epoch": 0.12222407747537152, + "grad_norm": 0.7100076734118838, + "learning_rate": 4.07119021134594e-06, + "loss": 0.0522, + "step": 366 + }, + { + "epoch": 0.12255802304224411, + "grad_norm": 1.1553456634700505, + "learning_rate": 4.082313681868743e-06, + "loss": 0.0665, + "step": 367 + }, + { + "epoch": 0.12289196860911672, + "grad_norm": 1.2667757129005257, + "learning_rate": 4.093437152391547e-06, + "loss": 0.0883, + "step": 368 + }, + { + "epoch": 0.12322591417598931, + "grad_norm": 0.6961100960879517, + "learning_rate": 4.104560622914349e-06, + "loss": 0.0633, + "step": 369 + }, + { + "epoch": 0.12355985974286192, + "grad_norm": 1.008433304396925, + "learning_rate": 4.115684093437153e-06, + "loss": 0.0758, + "step": 370 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 1.0695684974598691, + "learning_rate": 4.126807563959956e-06, + "loss": 0.0663, + "step": 371 + }, + { + "epoch": 0.12422775087660712, + "grad_norm": 0.806555344783645, + "learning_rate": 4.137931034482759e-06, + "loss": 0.0748, + "step": 372 + }, + { + "epoch": 0.12456169644347971, + "grad_norm": 1.107697753039673, + "learning_rate": 4.149054505005562e-06, + "loss": 0.067, + "step": 373 + }, + { + "epoch": 0.12489564201035232, + "grad_norm": 0.7010287093915, + "learning_rate": 4.160177975528365e-06, + "loss": 0.0691, + "step": 374 + }, + { + "epoch": 0.1252295875772249, + "grad_norm": 0.9084861258601861, + "learning_rate": 4.171301446051168e-06, + "loss": 0.0665, + "step": 375 + }, + { + "epoch": 0.12556353314409752, + "grad_norm": 1.0664391897856733, + "learning_rate": 4.182424916573972e-06, + "loss": 0.0897, + "step": 376 + }, + { + "epoch": 0.12589747871097012, + "grad_norm": 0.7424223077507432, + "learning_rate": 4.193548387096774e-06, + "loss": 0.064, + "step": 377 + }, + { + "epoch": 0.1262314242778427, + "grad_norm": 2.7620275717695226, + "learning_rate": 4.204671857619578e-06, + "loss": 0.088, + "step": 378 + }, + { + "epoch": 0.1265653698447153, + "grad_norm": 0.7941430874418766, + "learning_rate": 4.215795328142381e-06, + "loss": 0.0628, + "step": 379 + }, + { + "epoch": 0.12689931541158792, + "grad_norm": 0.9354132323804676, + "learning_rate": 4.226918798665184e-06, + "loss": 0.0679, + "step": 380 + }, + { + "epoch": 0.12723326097846052, + "grad_norm": 0.7920974181780777, + "learning_rate": 4.238042269187987e-06, + "loss": 0.0597, + "step": 381 + }, + { + "epoch": 0.1275672065453331, + "grad_norm": 0.867841544916767, + "learning_rate": 4.24916573971079e-06, + "loss": 0.0579, + "step": 382 + }, + { + "epoch": 0.1279011521122057, + "grad_norm": 1.4020195657180856, + "learning_rate": 4.260289210233593e-06, + "loss": 0.0819, + "step": 383 + }, + { + "epoch": 0.1282350976790783, + "grad_norm": 1.1938775225409488, + "learning_rate": 4.271412680756397e-06, + "loss": 0.0779, + "step": 384 + }, + { + "epoch": 0.12856904324595092, + "grad_norm": 0.8022906514664288, + "learning_rate": 4.2825361512791995e-06, + "loss": 0.0609, + "step": 385 + }, + { + "epoch": 0.1289029888128235, + "grad_norm": 0.6616577614601218, + "learning_rate": 4.293659621802003e-06, + "loss": 0.0606, + "step": 386 + }, + { + "epoch": 0.1292369343796961, + "grad_norm": 1.3449449266845075, + "learning_rate": 4.304783092324806e-06, + "loss": 0.0876, + "step": 387 + }, + { + "epoch": 0.1295708799465687, + "grad_norm": 1.0005342420994785, + "learning_rate": 4.3159065628476084e-06, + "loss": 0.0635, + "step": 388 + }, + { + "epoch": 0.12990482551344132, + "grad_norm": 0.7802565312984167, + "learning_rate": 4.327030033370412e-06, + "loss": 0.0545, + "step": 389 + }, + { + "epoch": 0.1302387710803139, + "grad_norm": 0.8621424847164936, + "learning_rate": 4.338153503893215e-06, + "loss": 0.0618, + "step": 390 + }, + { + "epoch": 0.1305727166471865, + "grad_norm": 0.7700216914249677, + "learning_rate": 4.349276974416018e-06, + "loss": 0.0682, + "step": 391 + }, + { + "epoch": 0.1309066622140591, + "grad_norm": 1.0332852000851969, + "learning_rate": 4.360400444938822e-06, + "loss": 0.0783, + "step": 392 + }, + { + "epoch": 0.13124060778093172, + "grad_norm": 0.7084663536325163, + "learning_rate": 4.3715239154616245e-06, + "loss": 0.0513, + "step": 393 + }, + { + "epoch": 0.1315745533478043, + "grad_norm": 0.7258019483196939, + "learning_rate": 4.382647385984427e-06, + "loss": 0.0618, + "step": 394 + }, + { + "epoch": 0.1319084989146769, + "grad_norm": 0.7384245252121673, + "learning_rate": 4.393770856507231e-06, + "loss": 0.0617, + "step": 395 + }, + { + "epoch": 0.1322424444815495, + "grad_norm": 0.879416102341382, + "learning_rate": 4.4048943270300335e-06, + "loss": 0.0893, + "step": 396 + }, + { + "epoch": 0.13257639004842212, + "grad_norm": 1.111269865167675, + "learning_rate": 4.416017797552837e-06, + "loss": 0.0815, + "step": 397 + }, + { + "epoch": 0.1329103356152947, + "grad_norm": 0.8541301967655361, + "learning_rate": 4.42714126807564e-06, + "loss": 0.0669, + "step": 398 + }, + { + "epoch": 0.1332442811821673, + "grad_norm": 1.0914177666489773, + "learning_rate": 4.4382647385984425e-06, + "loss": 0.0745, + "step": 399 + }, + { + "epoch": 0.1335782267490399, + "grad_norm": 0.8333840613959441, + "learning_rate": 4.449388209121246e-06, + "loss": 0.055, + "step": 400 + }, + { + "epoch": 0.13391217231591251, + "grad_norm": 0.5813378320576991, + "learning_rate": 4.4605116796440496e-06, + "loss": 0.0464, + "step": 401 + }, + { + "epoch": 0.1342461178827851, + "grad_norm": 1.0914380120324834, + "learning_rate": 4.471635150166852e-06, + "loss": 0.056, + "step": 402 + }, + { + "epoch": 0.1345800634496577, + "grad_norm": 0.8968599625799999, + "learning_rate": 4.482758620689656e-06, + "loss": 0.0689, + "step": 403 + }, + { + "epoch": 0.1349140090165303, + "grad_norm": 0.7137264684303274, + "learning_rate": 4.4938820912124585e-06, + "loss": 0.0723, + "step": 404 + }, + { + "epoch": 0.1352479545834029, + "grad_norm": 1.7192219912416853, + "learning_rate": 4.505005561735262e-06, + "loss": 0.0794, + "step": 405 + }, + { + "epoch": 0.1355819001502755, + "grad_norm": 0.6914730913580311, + "learning_rate": 4.516129032258065e-06, + "loss": 0.0507, + "step": 406 + }, + { + "epoch": 0.1359158457171481, + "grad_norm": 0.8852780808500477, + "learning_rate": 4.5272525027808675e-06, + "loss": 0.0589, + "step": 407 + }, + { + "epoch": 0.1362497912840207, + "grad_norm": 1.2348616096620406, + "learning_rate": 4.538375973303671e-06, + "loss": 0.0651, + "step": 408 + }, + { + "epoch": 0.1365837368508933, + "grad_norm": 0.7435214705909083, + "learning_rate": 4.549499443826475e-06, + "loss": 0.0481, + "step": 409 + }, + { + "epoch": 0.13691768241776592, + "grad_norm": 0.7747111792754291, + "learning_rate": 4.560622914349277e-06, + "loss": 0.0579, + "step": 410 + }, + { + "epoch": 0.1372516279846385, + "grad_norm": 1.2774674221987332, + "learning_rate": 4.571746384872081e-06, + "loss": 0.1106, + "step": 411 + }, + { + "epoch": 0.1375855735515111, + "grad_norm": 0.8184734262766915, + "learning_rate": 4.582869855394884e-06, + "loss": 0.0615, + "step": 412 + }, + { + "epoch": 0.1379195191183837, + "grad_norm": 0.8893517844574772, + "learning_rate": 4.593993325917686e-06, + "loss": 0.0694, + "step": 413 + }, + { + "epoch": 0.13825346468525632, + "grad_norm": 0.7942856390259999, + "learning_rate": 4.60511679644049e-06, + "loss": 0.0817, + "step": 414 + }, + { + "epoch": 0.1385874102521289, + "grad_norm": 0.6972510816940831, + "learning_rate": 4.6162402669632926e-06, + "loss": 0.0657, + "step": 415 + }, + { + "epoch": 0.1389213558190015, + "grad_norm": 0.6735407580491654, + "learning_rate": 4.627363737486096e-06, + "loss": 0.0422, + "step": 416 + }, + { + "epoch": 0.1392553013858741, + "grad_norm": 0.9068832429274885, + "learning_rate": 4.6384872080089e-06, + "loss": 0.0738, + "step": 417 + }, + { + "epoch": 0.13958924695274672, + "grad_norm": 0.5573937357195854, + "learning_rate": 4.649610678531702e-06, + "loss": 0.0486, + "step": 418 + }, + { + "epoch": 0.1399231925196193, + "grad_norm": 1.1306131771178998, + "learning_rate": 4.660734149054505e-06, + "loss": 0.0721, + "step": 419 + }, + { + "epoch": 0.1402571380864919, + "grad_norm": 0.9161627282442215, + "learning_rate": 4.671857619577309e-06, + "loss": 0.0636, + "step": 420 + }, + { + "epoch": 0.1405910836533645, + "grad_norm": 0.8662276360669395, + "learning_rate": 4.682981090100111e-06, + "loss": 0.057, + "step": 421 + }, + { + "epoch": 0.1409250292202371, + "grad_norm": 0.8059068213689825, + "learning_rate": 4.694104560622915e-06, + "loss": 0.0794, + "step": 422 + }, + { + "epoch": 0.1412589747871097, + "grad_norm": 0.7513356926995356, + "learning_rate": 4.705228031145718e-06, + "loss": 0.0685, + "step": 423 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 0.9234385601663734, + "learning_rate": 4.71635150166852e-06, + "loss": 0.0693, + "step": 424 + }, + { + "epoch": 0.1419268659208549, + "grad_norm": 0.7933568219531439, + "learning_rate": 4.727474972191325e-06, + "loss": 0.0727, + "step": 425 + }, + { + "epoch": 0.1422608114877275, + "grad_norm": 0.956125290520426, + "learning_rate": 4.7385984427141274e-06, + "loss": 0.089, + "step": 426 + }, + { + "epoch": 0.1425947570546001, + "grad_norm": 0.9789206648859708, + "learning_rate": 4.74972191323693e-06, + "loss": 0.0641, + "step": 427 + }, + { + "epoch": 0.1429287026214727, + "grad_norm": 0.644535093275283, + "learning_rate": 4.760845383759734e-06, + "loss": 0.0526, + "step": 428 + }, + { + "epoch": 0.1432626481883453, + "grad_norm": 0.8809124618007454, + "learning_rate": 4.771968854282536e-06, + "loss": 0.0594, + "step": 429 + }, + { + "epoch": 0.1435965937552179, + "grad_norm": 0.9101100326367371, + "learning_rate": 4.78309232480534e-06, + "loss": 0.0598, + "step": 430 + }, + { + "epoch": 0.1439305393220905, + "grad_norm": 0.8039963024048183, + "learning_rate": 4.794215795328143e-06, + "loss": 0.0764, + "step": 431 + }, + { + "epoch": 0.1442644848889631, + "grad_norm": 1.2707372886300605, + "learning_rate": 4.805339265850945e-06, + "loss": 0.0788, + "step": 432 + }, + { + "epoch": 0.1445984304558357, + "grad_norm": 0.8505934103256769, + "learning_rate": 4.816462736373749e-06, + "loss": 0.0595, + "step": 433 + }, + { + "epoch": 0.1449323760227083, + "grad_norm": 0.956829814390663, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.0749, + "step": 434 + }, + { + "epoch": 0.1452663215895809, + "grad_norm": 0.8762229891444342, + "learning_rate": 4.838709677419355e-06, + "loss": 0.0759, + "step": 435 + }, + { + "epoch": 0.1456002671564535, + "grad_norm": 0.8346509440556535, + "learning_rate": 4.849833147942159e-06, + "loss": 0.0558, + "step": 436 + }, + { + "epoch": 0.1459342127233261, + "grad_norm": 0.8166339845883285, + "learning_rate": 4.8609566184649615e-06, + "loss": 0.0648, + "step": 437 + }, + { + "epoch": 0.1462681582901987, + "grad_norm": 1.2323034153108503, + "learning_rate": 4.872080088987764e-06, + "loss": 0.0754, + "step": 438 + }, + { + "epoch": 0.1466021038570713, + "grad_norm": 0.9170291879229514, + "learning_rate": 4.883203559510568e-06, + "loss": 0.0802, + "step": 439 + }, + { + "epoch": 0.1469360494239439, + "grad_norm": 0.7221436604229694, + "learning_rate": 4.8943270300333704e-06, + "loss": 0.0481, + "step": 440 + }, + { + "epoch": 0.1472699949908165, + "grad_norm": 0.8990697598786687, + "learning_rate": 4.905450500556174e-06, + "loss": 0.0706, + "step": 441 + }, + { + "epoch": 0.1476039405576891, + "grad_norm": 0.7615310608245218, + "learning_rate": 4.9165739710789776e-06, + "loss": 0.0738, + "step": 442 + }, + { + "epoch": 0.14793788612456168, + "grad_norm": 0.7568416347024116, + "learning_rate": 4.92769744160178e-06, + "loss": 0.0723, + "step": 443 + }, + { + "epoch": 0.1482718316914343, + "grad_norm": 1.994169955997745, + "learning_rate": 4.938820912124583e-06, + "loss": 0.0915, + "step": 444 + }, + { + "epoch": 0.1486057772583069, + "grad_norm": 0.804709830172916, + "learning_rate": 4.9499443826473865e-06, + "loss": 0.0544, + "step": 445 + }, + { + "epoch": 0.1489397228251795, + "grad_norm": 0.7623771915839768, + "learning_rate": 4.961067853170189e-06, + "loss": 0.0533, + "step": 446 + }, + { + "epoch": 0.14927366839205208, + "grad_norm": 0.9978398241826115, + "learning_rate": 4.972191323692993e-06, + "loss": 0.0461, + "step": 447 + }, + { + "epoch": 0.1496076139589247, + "grad_norm": 1.0896383334289894, + "learning_rate": 4.9833147942157955e-06, + "loss": 0.0681, + "step": 448 + }, + { + "epoch": 0.1499415595257973, + "grad_norm": 0.9028419951187907, + "learning_rate": 4.994438264738598e-06, + "loss": 0.0606, + "step": 449 + }, + { + "epoch": 0.1502755050926699, + "grad_norm": 1.083921419144493, + "learning_rate": 5.005561735261402e-06, + "loss": 0.0702, + "step": 450 + }, + { + "epoch": 0.15060945065954248, + "grad_norm": 1.0713782553803741, + "learning_rate": 5.016685205784205e-06, + "loss": 0.0662, + "step": 451 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 1.1834646725533113, + "learning_rate": 5.027808676307008e-06, + "loss": 0.0735, + "step": 452 + }, + { + "epoch": 0.1512773417932877, + "grad_norm": 0.9703074891389526, + "learning_rate": 5.038932146829812e-06, + "loss": 0.0737, + "step": 453 + }, + { + "epoch": 0.1516112873601603, + "grad_norm": 0.755237585571562, + "learning_rate": 5.050055617352615e-06, + "loss": 0.0668, + "step": 454 + }, + { + "epoch": 0.15194523292703288, + "grad_norm": 0.9117012164808175, + "learning_rate": 5.061179087875418e-06, + "loss": 0.0586, + "step": 455 + }, + { + "epoch": 0.1522791784939055, + "grad_norm": 0.6181916310720402, + "learning_rate": 5.072302558398221e-06, + "loss": 0.0495, + "step": 456 + }, + { + "epoch": 0.1526131240607781, + "grad_norm": 0.8163763785306399, + "learning_rate": 5.083426028921023e-06, + "loss": 0.0634, + "step": 457 + }, + { + "epoch": 0.1529470696276507, + "grad_norm": 0.8796412602404203, + "learning_rate": 5.094549499443827e-06, + "loss": 0.0618, + "step": 458 + }, + { + "epoch": 0.15328101519452328, + "grad_norm": 0.7135447035944562, + "learning_rate": 5.1056729699666295e-06, + "loss": 0.0744, + "step": 459 + }, + { + "epoch": 0.15361496076139589, + "grad_norm": 0.7553785666045255, + "learning_rate": 5.116796440489433e-06, + "loss": 0.0686, + "step": 460 + }, + { + "epoch": 0.1539489063282685, + "grad_norm": 0.7545703857319316, + "learning_rate": 5.127919911012236e-06, + "loss": 0.061, + "step": 461 + }, + { + "epoch": 0.1542828518951411, + "grad_norm": 0.7582065006521851, + "learning_rate": 5.139043381535039e-06, + "loss": 0.0658, + "step": 462 + }, + { + "epoch": 0.1546167974620137, + "grad_norm": 0.7655579748134159, + "learning_rate": 5.150166852057843e-06, + "loss": 0.0447, + "step": 463 + }, + { + "epoch": 0.15495074302888628, + "grad_norm": 0.8955937293257055, + "learning_rate": 5.161290322580646e-06, + "loss": 0.0583, + "step": 464 + }, + { + "epoch": 0.1552846885957589, + "grad_norm": 0.7093979423603337, + "learning_rate": 5.172413793103449e-06, + "loss": 0.0517, + "step": 465 + }, + { + "epoch": 0.1556186341626315, + "grad_norm": 1.2047517628712154, + "learning_rate": 5.183537263626252e-06, + "loss": 0.0726, + "step": 466 + }, + { + "epoch": 0.1559525797295041, + "grad_norm": 0.6586898864529823, + "learning_rate": 5.1946607341490554e-06, + "loss": 0.0508, + "step": 467 + }, + { + "epoch": 0.15628652529637668, + "grad_norm": 0.7296233118104705, + "learning_rate": 5.205784204671857e-06, + "loss": 0.0547, + "step": 468 + }, + { + "epoch": 0.1566204708632493, + "grad_norm": 1.029385580406455, + "learning_rate": 5.216907675194661e-06, + "loss": 0.0617, + "step": 469 + }, + { + "epoch": 0.1569544164301219, + "grad_norm": 0.8828964156642398, + "learning_rate": 5.2280311457174636e-06, + "loss": 0.0738, + "step": 470 + }, + { + "epoch": 0.1572883619969945, + "grad_norm": 0.7357519449955818, + "learning_rate": 5.239154616240267e-06, + "loss": 0.0588, + "step": 471 + }, + { + "epoch": 0.15762230756386708, + "grad_norm": 0.9117473447896604, + "learning_rate": 5.250278086763071e-06, + "loss": 0.0548, + "step": 472 + }, + { + "epoch": 0.1579562531307397, + "grad_norm": 0.6906100135121893, + "learning_rate": 5.261401557285873e-06, + "loss": 0.0557, + "step": 473 + }, + { + "epoch": 0.1582901986976123, + "grad_norm": 1.1950263378701476, + "learning_rate": 5.272525027808677e-06, + "loss": 0.058, + "step": 474 + }, + { + "epoch": 0.1586241442644849, + "grad_norm": 0.6927807149939814, + "learning_rate": 5.28364849833148e-06, + "loss": 0.0557, + "step": 475 + }, + { + "epoch": 0.15895808983135748, + "grad_norm": 0.65365492551376, + "learning_rate": 5.294771968854283e-06, + "loss": 0.0582, + "step": 476 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 0.8799420679217809, + "learning_rate": 5.305895439377086e-06, + "loss": 0.0863, + "step": 477 + }, + { + "epoch": 0.1596259809651027, + "grad_norm": 0.7103549816274838, + "learning_rate": 5.3170189098998895e-06, + "loss": 0.0487, + "step": 478 + }, + { + "epoch": 0.1599599265319753, + "grad_norm": 0.7152667161275338, + "learning_rate": 5.328142380422693e-06, + "loss": 0.0441, + "step": 479 + }, + { + "epoch": 0.16029387209884788, + "grad_norm": 0.9695447542502575, + "learning_rate": 5.339265850945496e-06, + "loss": 0.0638, + "step": 480 + }, + { + "epoch": 0.16062781766572048, + "grad_norm": 0.7741189676524061, + "learning_rate": 5.350389321468299e-06, + "loss": 0.0509, + "step": 481 + }, + { + "epoch": 0.1609617632325931, + "grad_norm": 0.8810457057758688, + "learning_rate": 5.361512791991101e-06, + "loss": 0.0638, + "step": 482 + }, + { + "epoch": 0.1612957087994657, + "grad_norm": 0.6102246479100515, + "learning_rate": 5.372636262513905e-06, + "loss": 0.0581, + "step": 483 + }, + { + "epoch": 0.16162965436633828, + "grad_norm": 1.3278482352026502, + "learning_rate": 5.383759733036707e-06, + "loss": 0.0763, + "step": 484 + }, + { + "epoch": 0.16196359993321088, + "grad_norm": 0.7371166558096207, + "learning_rate": 5.394883203559511e-06, + "loss": 0.0647, + "step": 485 + }, + { + "epoch": 0.1622975455000835, + "grad_norm": 1.0736671434503915, + "learning_rate": 5.406006674082314e-06, + "loss": 0.0678, + "step": 486 + }, + { + "epoch": 0.1626314910669561, + "grad_norm": 0.6949676204696933, + "learning_rate": 5.417130144605117e-06, + "loss": 0.0523, + "step": 487 + }, + { + "epoch": 0.16296543663382868, + "grad_norm": 0.7936075395734432, + "learning_rate": 5.428253615127921e-06, + "loss": 0.0597, + "step": 488 + }, + { + "epoch": 0.16329938220070128, + "grad_norm": 0.6514863106711773, + "learning_rate": 5.4393770856507235e-06, + "loss": 0.0509, + "step": 489 + }, + { + "epoch": 0.1636333277675739, + "grad_norm": 0.8324752891246695, + "learning_rate": 5.450500556173527e-06, + "loss": 0.0716, + "step": 490 + }, + { + "epoch": 0.1639672733344465, + "grad_norm": 0.9053927241925424, + "learning_rate": 5.46162402669633e-06, + "loss": 0.0706, + "step": 491 + }, + { + "epoch": 0.16430121890131907, + "grad_norm": 0.8721247709102367, + "learning_rate": 5.472747497219133e-06, + "loss": 0.0699, + "step": 492 + }, + { + "epoch": 0.16463516446819168, + "grad_norm": 0.8430069088405966, + "learning_rate": 5.483870967741935e-06, + "loss": 0.0774, + "step": 493 + }, + { + "epoch": 0.1649691100350643, + "grad_norm": 0.5996875363467666, + "learning_rate": 5.494994438264739e-06, + "loss": 0.0473, + "step": 494 + }, + { + "epoch": 0.1653030556019369, + "grad_norm": 0.7525414129851855, + "learning_rate": 5.506117908787543e-06, + "loss": 0.0548, + "step": 495 + }, + { + "epoch": 0.16563700116880947, + "grad_norm": 0.7456419929890177, + "learning_rate": 5.517241379310345e-06, + "loss": 0.0521, + "step": 496 + }, + { + "epoch": 0.16597094673568208, + "grad_norm": 1.4821871731471925, + "learning_rate": 5.5283648498331485e-06, + "loss": 0.0633, + "step": 497 + }, + { + "epoch": 0.16630489230255469, + "grad_norm": 1.0035158211938564, + "learning_rate": 5.539488320355951e-06, + "loss": 0.0501, + "step": 498 + }, + { + "epoch": 0.1666388378694273, + "grad_norm": 1.0643854193297664, + "learning_rate": 5.550611790878755e-06, + "loss": 0.069, + "step": 499 + }, + { + "epoch": 0.16697278343629987, + "grad_norm": 0.8695276836951805, + "learning_rate": 5.5617352614015575e-06, + "loss": 0.0678, + "step": 500 + }, + { + "epoch": 0.16730672900317248, + "grad_norm": 0.8450565320978253, + "learning_rate": 5.572858731924361e-06, + "loss": 0.0571, + "step": 501 + }, + { + "epoch": 0.16764067457004508, + "grad_norm": 1.1669524985507995, + "learning_rate": 5.583982202447164e-06, + "loss": 0.0834, + "step": 502 + }, + { + "epoch": 0.1679746201369177, + "grad_norm": 0.970165804178769, + "learning_rate": 5.595105672969967e-06, + "loss": 0.0555, + "step": 503 + }, + { + "epoch": 0.16830856570379027, + "grad_norm": 0.7723836408243235, + "learning_rate": 5.606229143492771e-06, + "loss": 0.0464, + "step": 504 + }, + { + "epoch": 0.16864251127066288, + "grad_norm": 0.5381677090389018, + "learning_rate": 5.617352614015574e-06, + "loss": 0.0498, + "step": 505 + }, + { + "epoch": 0.16897645683753548, + "grad_norm": 1.0636586844048173, + "learning_rate": 5.628476084538377e-06, + "loss": 0.068, + "step": 506 + }, + { + "epoch": 0.1693104024044081, + "grad_norm": 0.8162554296195744, + "learning_rate": 5.639599555061179e-06, + "loss": 0.0645, + "step": 507 + }, + { + "epoch": 0.16964434797128067, + "grad_norm": 0.6481582277347443, + "learning_rate": 5.6507230255839826e-06, + "loss": 0.0615, + "step": 508 + }, + { + "epoch": 0.16997829353815327, + "grad_norm": 1.3923437281539484, + "learning_rate": 5.661846496106785e-06, + "loss": 0.0822, + "step": 509 + }, + { + "epoch": 0.17031223910502588, + "grad_norm": 0.977000370333707, + "learning_rate": 5.672969966629589e-06, + "loss": 0.0731, + "step": 510 + }, + { + "epoch": 0.1706461846718985, + "grad_norm": 0.8458998828404004, + "learning_rate": 5.6840934371523915e-06, + "loss": 0.0591, + "step": 511 + }, + { + "epoch": 0.1709801302387711, + "grad_norm": 0.7348551384773913, + "learning_rate": 5.695216907675195e-06, + "loss": 0.0588, + "step": 512 + }, + { + "epoch": 0.17131407580564367, + "grad_norm": 0.836801901679019, + "learning_rate": 5.706340378197999e-06, + "loss": 0.0621, + "step": 513 + }, + { + "epoch": 0.17164802137251628, + "grad_norm": 0.8189625571489982, + "learning_rate": 5.717463848720801e-06, + "loss": 0.0446, + "step": 514 + }, + { + "epoch": 0.1719819669393889, + "grad_norm": 1.432359875394896, + "learning_rate": 5.728587319243605e-06, + "loss": 0.0806, + "step": 515 + }, + { + "epoch": 0.1723159125062615, + "grad_norm": 0.7614900359440986, + "learning_rate": 5.739710789766408e-06, + "loss": 0.0551, + "step": 516 + }, + { + "epoch": 0.17264985807313407, + "grad_norm": 0.6671415448030087, + "learning_rate": 5.750834260289211e-06, + "loss": 0.0503, + "step": 517 + }, + { + "epoch": 0.17298380364000668, + "grad_norm": 0.8520371085785486, + "learning_rate": 5.761957730812013e-06, + "loss": 0.0585, + "step": 518 + }, + { + "epoch": 0.17331774920687928, + "grad_norm": 2.3571026809143714, + "learning_rate": 5.773081201334817e-06, + "loss": 0.0579, + "step": 519 + }, + { + "epoch": 0.1736516947737519, + "grad_norm": 0.9830936139267258, + "learning_rate": 5.784204671857621e-06, + "loss": 0.0679, + "step": 520 + }, + { + "epoch": 0.17398564034062447, + "grad_norm": 1.8476146611052202, + "learning_rate": 5.795328142380423e-06, + "loss": 0.0739, + "step": 521 + }, + { + "epoch": 0.17431958590749708, + "grad_norm": 1.0888171261377078, + "learning_rate": 5.806451612903226e-06, + "loss": 0.0607, + "step": 522 + }, + { + "epoch": 0.17465353147436968, + "grad_norm": 0.9907717227940405, + "learning_rate": 5.817575083426029e-06, + "loss": 0.057, + "step": 523 + }, + { + "epoch": 0.1749874770412423, + "grad_norm": 1.0443916244462574, + "learning_rate": 5.828698553948833e-06, + "loss": 0.0737, + "step": 524 + }, + { + "epoch": 0.17532142260811487, + "grad_norm": 0.7458714946625439, + "learning_rate": 5.839822024471635e-06, + "loss": 0.063, + "step": 525 + }, + { + "epoch": 0.17565536817498748, + "grad_norm": 0.7211494174786142, + "learning_rate": 5.850945494994439e-06, + "loss": 0.0523, + "step": 526 + }, + { + "epoch": 0.17598931374186008, + "grad_norm": 1.3712158931896836, + "learning_rate": 5.862068965517242e-06, + "loss": 0.0762, + "step": 527 + }, + { + "epoch": 0.1763232593087327, + "grad_norm": 0.7938349835402784, + "learning_rate": 5.873192436040045e-06, + "loss": 0.0589, + "step": 528 + }, + { + "epoch": 0.17665720487560527, + "grad_norm": 0.6992457731952962, + "learning_rate": 5.884315906562849e-06, + "loss": 0.0566, + "step": 529 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 0.9355890517584756, + "learning_rate": 5.8954393770856515e-06, + "loss": 0.0655, + "step": 530 + }, + { + "epoch": 0.17732509600935048, + "grad_norm": 0.9272921789800707, + "learning_rate": 5.906562847608455e-06, + "loss": 0.0793, + "step": 531 + }, + { + "epoch": 0.1776590415762231, + "grad_norm": 0.8859708515971795, + "learning_rate": 5.917686318131257e-06, + "loss": 0.0552, + "step": 532 + }, + { + "epoch": 0.17799298714309567, + "grad_norm": 0.8398942776604181, + "learning_rate": 5.9288097886540604e-06, + "loss": 0.0465, + "step": 533 + }, + { + "epoch": 0.17832693270996827, + "grad_norm": 0.7985514645311459, + "learning_rate": 5.939933259176863e-06, + "loss": 0.0675, + "step": 534 + }, + { + "epoch": 0.17866087827684088, + "grad_norm": 0.697071401921753, + "learning_rate": 5.951056729699667e-06, + "loss": 0.0429, + "step": 535 + }, + { + "epoch": 0.17899482384371349, + "grad_norm": 0.7041137612115344, + "learning_rate": 5.962180200222469e-06, + "loss": 0.0431, + "step": 536 + }, + { + "epoch": 0.17932876941058606, + "grad_norm": 0.6682339970873195, + "learning_rate": 5.973303670745273e-06, + "loss": 0.0592, + "step": 537 + }, + { + "epoch": 0.17966271497745867, + "grad_norm": 1.17716914430658, + "learning_rate": 5.9844271412680765e-06, + "loss": 0.0787, + "step": 538 + }, + { + "epoch": 0.17999666054433128, + "grad_norm": 2.1039130020292207, + "learning_rate": 5.995550611790879e-06, + "loss": 0.0586, + "step": 539 + }, + { + "epoch": 0.18033060611120388, + "grad_norm": 1.6854132829750088, + "learning_rate": 6.006674082313683e-06, + "loss": 0.088, + "step": 540 + }, + { + "epoch": 0.18066455167807646, + "grad_norm": 0.9436435794531032, + "learning_rate": 6.0177975528364855e-06, + "loss": 0.068, + "step": 541 + }, + { + "epoch": 0.18099849724494907, + "grad_norm": 1.4543330724458545, + "learning_rate": 6.028921023359289e-06, + "loss": 0.0814, + "step": 542 + }, + { + "epoch": 0.18133244281182168, + "grad_norm": 1.2058456355383562, + "learning_rate": 6.040044493882091e-06, + "loss": 0.0829, + "step": 543 + }, + { + "epoch": 0.18166638837869428, + "grad_norm": 0.8900032807415988, + "learning_rate": 6.0511679644048945e-06, + "loss": 0.0658, + "step": 544 + }, + { + "epoch": 0.18200033394556686, + "grad_norm": 1.025438934220678, + "learning_rate": 6.062291434927698e-06, + "loss": 0.0685, + "step": 545 + }, + { + "epoch": 0.18233427951243947, + "grad_norm": 1.488594859100904, + "learning_rate": 6.073414905450501e-06, + "loss": 0.0802, + "step": 546 + }, + { + "epoch": 0.18266822507931207, + "grad_norm": 1.077167826841067, + "learning_rate": 6.084538375973304e-06, + "loss": 0.0692, + "step": 547 + }, + { + "epoch": 0.18300217064618468, + "grad_norm": 0.8359382817432315, + "learning_rate": 6.095661846496107e-06, + "loss": 0.0617, + "step": 548 + }, + { + "epoch": 0.18333611621305726, + "grad_norm": 0.9359938494704068, + "learning_rate": 6.1067853170189106e-06, + "loss": 0.0598, + "step": 549 + }, + { + "epoch": 0.18367006177992987, + "grad_norm": 0.898094211129922, + "learning_rate": 6.117908787541713e-06, + "loss": 0.0715, + "step": 550 + }, + { + "epoch": 0.18400400734680247, + "grad_norm": 0.8085241518785767, + "learning_rate": 6.129032258064517e-06, + "loss": 0.0611, + "step": 551 + }, + { + "epoch": 0.18433795291367508, + "grad_norm": 0.835628986807132, + "learning_rate": 6.1401557285873195e-06, + "loss": 0.0545, + "step": 552 + }, + { + "epoch": 0.18467189848054766, + "grad_norm": 0.9870690066587878, + "learning_rate": 6.151279199110123e-06, + "loss": 0.0555, + "step": 553 + }, + { + "epoch": 0.18500584404742026, + "grad_norm": 0.7044338384587818, + "learning_rate": 6.162402669632927e-06, + "loss": 0.0563, + "step": 554 + }, + { + "epoch": 0.18533978961429287, + "grad_norm": 0.9421964678475573, + "learning_rate": 6.173526140155729e-06, + "loss": 0.0694, + "step": 555 + }, + { + "epoch": 0.18567373518116548, + "grad_norm": 0.7531432153729787, + "learning_rate": 6.184649610678533e-06, + "loss": 0.0398, + "step": 556 + }, + { + "epoch": 0.18600768074803806, + "grad_norm": 0.7178016597672993, + "learning_rate": 6.195773081201335e-06, + "loss": 0.0527, + "step": 557 + }, + { + "epoch": 0.18634162631491066, + "grad_norm": 0.6870955058429726, + "learning_rate": 6.206896551724138e-06, + "loss": 0.0515, + "step": 558 + }, + { + "epoch": 0.18667557188178327, + "grad_norm": 1.058584370525592, + "learning_rate": 6.218020022246941e-06, + "loss": 0.0716, + "step": 559 + }, + { + "epoch": 0.18700951744865588, + "grad_norm": 0.793793339054818, + "learning_rate": 6.229143492769745e-06, + "loss": 0.059, + "step": 560 + }, + { + "epoch": 0.18734346301552846, + "grad_norm": 0.7921846601688673, + "learning_rate": 6.240266963292548e-06, + "loss": 0.0571, + "step": 561 + }, + { + "epoch": 0.18767740858240106, + "grad_norm": 1.0822954840038115, + "learning_rate": 6.251390433815351e-06, + "loss": 0.0808, + "step": 562 + }, + { + "epoch": 0.18801135414927367, + "grad_norm": 1.1838974205578672, + "learning_rate": 6.262513904338154e-06, + "loss": 0.0802, + "step": 563 + }, + { + "epoch": 0.18834529971614627, + "grad_norm": 0.6664360867051361, + "learning_rate": 6.273637374860957e-06, + "loss": 0.0423, + "step": 564 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 1.12975769644805, + "learning_rate": 6.284760845383761e-06, + "loss": 0.059, + "step": 565 + }, + { + "epoch": 0.18901319084989146, + "grad_norm": 0.9023368076115867, + "learning_rate": 6.295884315906563e-06, + "loss": 0.0663, + "step": 566 + }, + { + "epoch": 0.18934713641676407, + "grad_norm": 0.7070819040726736, + "learning_rate": 6.307007786429367e-06, + "loss": 0.0599, + "step": 567 + }, + { + "epoch": 0.18968108198363667, + "grad_norm": 0.7167383442202009, + "learning_rate": 6.318131256952169e-06, + "loss": 0.0547, + "step": 568 + }, + { + "epoch": 0.19001502755050928, + "grad_norm": 0.8667596619045079, + "learning_rate": 6.329254727474972e-06, + "loss": 0.0577, + "step": 569 + }, + { + "epoch": 0.19034897311738186, + "grad_norm": 0.6903866651198595, + "learning_rate": 6.340378197997776e-06, + "loss": 0.0541, + "step": 570 + }, + { + "epoch": 0.19068291868425447, + "grad_norm": 0.8652401120314771, + "learning_rate": 6.351501668520579e-06, + "loss": 0.0675, + "step": 571 + }, + { + "epoch": 0.19101686425112707, + "grad_norm": 0.9567786258900463, + "learning_rate": 6.362625139043382e-06, + "loss": 0.0635, + "step": 572 + }, + { + "epoch": 0.19135080981799968, + "grad_norm": 1.4032940279907142, + "learning_rate": 6.373748609566185e-06, + "loss": 0.0721, + "step": 573 + }, + { + "epoch": 0.19168475538487226, + "grad_norm": 1.1264541824058907, + "learning_rate": 6.3848720800889884e-06, + "loss": 0.0686, + "step": 574 + }, + { + "epoch": 0.19201870095174486, + "grad_norm": 1.0736034358506426, + "learning_rate": 6.395995550611791e-06, + "loss": 0.0873, + "step": 575 + }, + { + "epoch": 0.19235264651861747, + "grad_norm": 0.8630872636820003, + "learning_rate": 6.407119021134595e-06, + "loss": 0.0582, + "step": 576 + }, + { + "epoch": 0.19268659208549008, + "grad_norm": 1.5039433166171976, + "learning_rate": 6.418242491657397e-06, + "loss": 0.0895, + "step": 577 + }, + { + "epoch": 0.19302053765236266, + "grad_norm": 0.658053148089993, + "learning_rate": 6.429365962180201e-06, + "loss": 0.0482, + "step": 578 + }, + { + "epoch": 0.19335448321923526, + "grad_norm": 0.6038070862576609, + "learning_rate": 6.4404894327030045e-06, + "loss": 0.0407, + "step": 579 + }, + { + "epoch": 0.19368842878610787, + "grad_norm": 0.8352398964922494, + "learning_rate": 6.451612903225806e-06, + "loss": 0.0594, + "step": 580 + }, + { + "epoch": 0.19402237435298048, + "grad_norm": 1.1267488159998478, + "learning_rate": 6.462736373748611e-06, + "loss": 0.074, + "step": 581 + }, + { + "epoch": 0.19435631991985305, + "grad_norm": 0.721628757355493, + "learning_rate": 6.473859844271413e-06, + "loss": 0.0678, + "step": 582 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 0.648380978198407, + "learning_rate": 6.484983314794216e-06, + "loss": 0.0756, + "step": 583 + }, + { + "epoch": 0.19502421105359827, + "grad_norm": 0.6572366191878652, + "learning_rate": 6.496106785317019e-06, + "loss": 0.0564, + "step": 584 + }, + { + "epoch": 0.19535815662047087, + "grad_norm": 0.6746106748324757, + "learning_rate": 6.5072302558398225e-06, + "loss": 0.0601, + "step": 585 + }, + { + "epoch": 0.19569210218734345, + "grad_norm": 0.9506138166905724, + "learning_rate": 6.518353726362626e-06, + "loss": 0.0729, + "step": 586 + }, + { + "epoch": 0.19602604775421606, + "grad_norm": 0.7944011705745151, + "learning_rate": 6.529477196885429e-06, + "loss": 0.0825, + "step": 587 + }, + { + "epoch": 0.19635999332108867, + "grad_norm": 0.7265582927727529, + "learning_rate": 6.540600667408232e-06, + "loss": 0.0751, + "step": 588 + }, + { + "epoch": 0.19669393888796127, + "grad_norm": 1.3335730160125474, + "learning_rate": 6.551724137931035e-06, + "loss": 0.0569, + "step": 589 + }, + { + "epoch": 0.19702788445483385, + "grad_norm": 0.7315873291244676, + "learning_rate": 6.5628476084538385e-06, + "loss": 0.0462, + "step": 590 + }, + { + "epoch": 0.19736183002170646, + "grad_norm": 0.7935447081743017, + "learning_rate": 6.573971078976641e-06, + "loss": 0.0528, + "step": 591 + }, + { + "epoch": 0.19769577558857906, + "grad_norm": 0.6859243667795377, + "learning_rate": 6.585094549499445e-06, + "loss": 0.0526, + "step": 592 + }, + { + "epoch": 0.19802972115545167, + "grad_norm": 0.7655199162301891, + "learning_rate": 6.596218020022247e-06, + "loss": 0.0584, + "step": 593 + }, + { + "epoch": 0.19836366672232425, + "grad_norm": 1.0167741029460753, + "learning_rate": 6.60734149054505e-06, + "loss": 0.0645, + "step": 594 + }, + { + "epoch": 0.19869761228919686, + "grad_norm": 0.969233187245511, + "learning_rate": 6.618464961067854e-06, + "loss": 0.0765, + "step": 595 + }, + { + "epoch": 0.19903155785606946, + "grad_norm": 0.8910946717229351, + "learning_rate": 6.6295884315906565e-06, + "loss": 0.0531, + "step": 596 + }, + { + "epoch": 0.19936550342294207, + "grad_norm": 0.7392381351339108, + "learning_rate": 6.64071190211346e-06, + "loss": 0.056, + "step": 597 + }, + { + "epoch": 0.19969944898981465, + "grad_norm": 0.6524425989201603, + "learning_rate": 6.651835372636263e-06, + "loss": 0.059, + "step": 598 + }, + { + "epoch": 0.20003339455668726, + "grad_norm": 0.8142443247115522, + "learning_rate": 6.662958843159066e-06, + "loss": 0.0686, + "step": 599 + }, + { + "epoch": 0.20036734012355986, + "grad_norm": 0.6323681488217677, + "learning_rate": 6.674082313681869e-06, + "loss": 0.0488, + "step": 600 + }, + { + "epoch": 0.20070128569043247, + "grad_norm": 0.8041917413196822, + "learning_rate": 6.6852057842046726e-06, + "loss": 0.0571, + "step": 601 + }, + { + "epoch": 0.20103523125730505, + "grad_norm": 0.5893400857741306, + "learning_rate": 6.696329254727475e-06, + "loss": 0.0445, + "step": 602 + }, + { + "epoch": 0.20136917682417765, + "grad_norm": 0.7214738998710117, + "learning_rate": 6.707452725250279e-06, + "loss": 0.0744, + "step": 603 + }, + { + "epoch": 0.20170312239105026, + "grad_norm": 1.2233939320369276, + "learning_rate": 6.718576195773082e-06, + "loss": 0.0653, + "step": 604 + }, + { + "epoch": 0.20203706795792287, + "grad_norm": 0.8197861276964413, + "learning_rate": 6.729699666295884e-06, + "loss": 0.0617, + "step": 605 + }, + { + "epoch": 0.20237101352479545, + "grad_norm": 0.9144895067312468, + "learning_rate": 6.740823136818689e-06, + "loss": 0.0467, + "step": 606 + }, + { + "epoch": 0.20270495909166805, + "grad_norm": 0.8306665062152114, + "learning_rate": 6.7519466073414905e-06, + "loss": 0.0522, + "step": 607 + }, + { + "epoch": 0.20303890465854066, + "grad_norm": 0.6614365082598056, + "learning_rate": 6.763070077864294e-06, + "loss": 0.0511, + "step": 608 + }, + { + "epoch": 0.20337285022541327, + "grad_norm": 0.713950052001751, + "learning_rate": 6.774193548387097e-06, + "loss": 0.0621, + "step": 609 + }, + { + "epoch": 0.20370679579228584, + "grad_norm": 0.8905334489243183, + "learning_rate": 6.7853170189099e-06, + "loss": 0.0589, + "step": 610 + }, + { + "epoch": 0.20404074135915845, + "grad_norm": 0.6543811592847132, + "learning_rate": 6.796440489432704e-06, + "loss": 0.0514, + "step": 611 + }, + { + "epoch": 0.20437468692603106, + "grad_norm": 0.9007985574710429, + "learning_rate": 6.807563959955507e-06, + "loss": 0.0497, + "step": 612 + }, + { + "epoch": 0.20470863249290366, + "grad_norm": 0.9073568133724813, + "learning_rate": 6.81868743047831e-06, + "loss": 0.0692, + "step": 613 + }, + { + "epoch": 0.20504257805977624, + "grad_norm": 0.6067895191197412, + "learning_rate": 6.829810901001113e-06, + "loss": 0.0494, + "step": 614 + }, + { + "epoch": 0.20537652362664885, + "grad_norm": 1.1233152951341259, + "learning_rate": 6.840934371523916e-06, + "loss": 0.0601, + "step": 615 + }, + { + "epoch": 0.20571046919352146, + "grad_norm": 0.7913630031379069, + "learning_rate": 6.852057842046719e-06, + "loss": 0.0443, + "step": 616 + }, + { + "epoch": 0.20604441476039406, + "grad_norm": 0.6710216282911835, + "learning_rate": 6.863181312569523e-06, + "loss": 0.0487, + "step": 617 + }, + { + "epoch": 0.20637836032726667, + "grad_norm": 0.5492071484783791, + "learning_rate": 6.8743047830923245e-06, + "loss": 0.0515, + "step": 618 + }, + { + "epoch": 0.20671230589413925, + "grad_norm": 0.725700461458707, + "learning_rate": 6.885428253615128e-06, + "loss": 0.0443, + "step": 619 + }, + { + "epoch": 0.20704625146101185, + "grad_norm": 0.730493977320189, + "learning_rate": 6.896551724137932e-06, + "loss": 0.0514, + "step": 620 + }, + { + "epoch": 0.20738019702788446, + "grad_norm": 0.629676391292487, + "learning_rate": 6.907675194660734e-06, + "loss": 0.0466, + "step": 621 + }, + { + "epoch": 0.20771414259475707, + "grad_norm": 0.9439772579694213, + "learning_rate": 6.918798665183538e-06, + "loss": 0.0593, + "step": 622 + }, + { + "epoch": 0.20804808816162965, + "grad_norm": 0.6727789860352383, + "learning_rate": 6.929922135706341e-06, + "loss": 0.0714, + "step": 623 + }, + { + "epoch": 0.20838203372850225, + "grad_norm": 0.7216726522421172, + "learning_rate": 6.941045606229144e-06, + "loss": 0.0605, + "step": 624 + }, + { + "epoch": 0.20871597929537486, + "grad_norm": 0.554147090395553, + "learning_rate": 6.952169076751947e-06, + "loss": 0.0414, + "step": 625 + }, + { + "epoch": 0.20904992486224747, + "grad_norm": 0.8311935206386598, + "learning_rate": 6.9632925472747504e-06, + "loss": 0.0578, + "step": 626 + }, + { + "epoch": 0.20938387042912004, + "grad_norm": 0.8000074593969697, + "learning_rate": 6.974416017797554e-06, + "loss": 0.0584, + "step": 627 + }, + { + "epoch": 0.20971781599599265, + "grad_norm": 0.9722412112123996, + "learning_rate": 6.985539488320357e-06, + "loss": 0.0609, + "step": 628 + }, + { + "epoch": 0.21005176156286526, + "grad_norm": 0.878780487308983, + "learning_rate": 6.99666295884316e-06, + "loss": 0.0672, + "step": 629 + }, + { + "epoch": 0.21038570712973786, + "grad_norm": 0.6398612017056129, + "learning_rate": 7.007786429365962e-06, + "loss": 0.0595, + "step": 630 + }, + { + "epoch": 0.21071965269661044, + "grad_norm": 1.3820823508166513, + "learning_rate": 7.0189098998887665e-06, + "loss": 0.0641, + "step": 631 + }, + { + "epoch": 0.21105359826348305, + "grad_norm": 0.8389581412239546, + "learning_rate": 7.030033370411568e-06, + "loss": 0.0534, + "step": 632 + }, + { + "epoch": 0.21138754383035566, + "grad_norm": 1.235867509288309, + "learning_rate": 7.041156840934372e-06, + "loss": 0.0649, + "step": 633 + }, + { + "epoch": 0.21172148939722826, + "grad_norm": 0.6822686682534393, + "learning_rate": 7.052280311457175e-06, + "loss": 0.0496, + "step": 634 + }, + { + "epoch": 0.21205543496410084, + "grad_norm": 0.677863381365653, + "learning_rate": 7.063403781979978e-06, + "loss": 0.0524, + "step": 635 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 0.5616965372733239, + "learning_rate": 7.074527252502782e-06, + "loss": 0.045, + "step": 636 + }, + { + "epoch": 0.21272332609784605, + "grad_norm": 0.9305795338913498, + "learning_rate": 7.0856507230255845e-06, + "loss": 0.0598, + "step": 637 + }, + { + "epoch": 0.21305727166471866, + "grad_norm": 1.148754024371969, + "learning_rate": 7.096774193548388e-06, + "loss": 0.0851, + "step": 638 + }, + { + "epoch": 0.21339121723159124, + "grad_norm": 0.6771383882990121, + "learning_rate": 7.107897664071191e-06, + "loss": 0.057, + "step": 639 + }, + { + "epoch": 0.21372516279846385, + "grad_norm": 0.7326053693987247, + "learning_rate": 7.119021134593994e-06, + "loss": 0.0378, + "step": 640 + }, + { + "epoch": 0.21405910836533645, + "grad_norm": 0.8642206474374593, + "learning_rate": 7.130144605116797e-06, + "loss": 0.0633, + "step": 641 + }, + { + "epoch": 0.21439305393220906, + "grad_norm": 0.689691985990761, + "learning_rate": 7.1412680756396006e-06, + "loss": 0.0507, + "step": 642 + }, + { + "epoch": 0.21472699949908164, + "grad_norm": 0.5505119046969866, + "learning_rate": 7.152391546162402e-06, + "loss": 0.0481, + "step": 643 + }, + { + "epoch": 0.21506094506595425, + "grad_norm": 0.7259987823104186, + "learning_rate": 7.163515016685206e-06, + "loss": 0.054, + "step": 644 + }, + { + "epoch": 0.21539489063282685, + "grad_norm": 0.9492663396246556, + "learning_rate": 7.1746384872080095e-06, + "loss": 0.0765, + "step": 645 + }, + { + "epoch": 0.21572883619969946, + "grad_norm": 1.1774451315327281, + "learning_rate": 7.185761957730812e-06, + "loss": 0.0643, + "step": 646 + }, + { + "epoch": 0.21606278176657204, + "grad_norm": 0.7103830226184474, + "learning_rate": 7.196885428253616e-06, + "loss": 0.0476, + "step": 647 + }, + { + "epoch": 0.21639672733344464, + "grad_norm": 0.7935594037529661, + "learning_rate": 7.2080088987764185e-06, + "loss": 0.0523, + "step": 648 + }, + { + "epoch": 0.21673067290031725, + "grad_norm": 0.6896064477390729, + "learning_rate": 7.219132369299222e-06, + "loss": 0.0655, + "step": 649 + }, + { + "epoch": 0.21706461846718986, + "grad_norm": 0.8771285074850425, + "learning_rate": 7.230255839822025e-06, + "loss": 0.0722, + "step": 650 + }, + { + "epoch": 0.21739856403406244, + "grad_norm": 0.6004195294857952, + "learning_rate": 7.241379310344828e-06, + "loss": 0.0442, + "step": 651 + }, + { + "epoch": 0.21773250960093504, + "grad_norm": 1.491334243239559, + "learning_rate": 7.252502780867632e-06, + "loss": 0.0693, + "step": 652 + }, + { + "epoch": 0.21806645516780765, + "grad_norm": 1.0264363212929928, + "learning_rate": 7.263626251390435e-06, + "loss": 0.0589, + "step": 653 + }, + { + "epoch": 0.21840040073468026, + "grad_norm": 0.6937073911174437, + "learning_rate": 7.274749721913238e-06, + "loss": 0.0485, + "step": 654 + }, + { + "epoch": 0.21873434630155283, + "grad_norm": 0.6813217875877775, + "learning_rate": 7.28587319243604e-06, + "loss": 0.0468, + "step": 655 + }, + { + "epoch": 0.21906829186842544, + "grad_norm": 0.8468751024073949, + "learning_rate": 7.296996662958844e-06, + "loss": 0.0473, + "step": 656 + }, + { + "epoch": 0.21940223743529805, + "grad_norm": 0.5826196253520607, + "learning_rate": 7.308120133481646e-06, + "loss": 0.0362, + "step": 657 + }, + { + "epoch": 0.21973618300217065, + "grad_norm": 0.8823320801298193, + "learning_rate": 7.31924360400445e-06, + "loss": 0.0696, + "step": 658 + }, + { + "epoch": 0.22007012856904323, + "grad_norm": 0.6752508644503113, + "learning_rate": 7.3303670745272525e-06, + "loss": 0.0597, + "step": 659 + }, + { + "epoch": 0.22040407413591584, + "grad_norm": 0.630584714988691, + "learning_rate": 7.341490545050056e-06, + "loss": 0.0462, + "step": 660 + }, + { + "epoch": 0.22073801970278845, + "grad_norm": 1.1380911578526518, + "learning_rate": 7.35261401557286e-06, + "loss": 0.0716, + "step": 661 + }, + { + "epoch": 0.22107196526966105, + "grad_norm": 0.8225765908938263, + "learning_rate": 7.363737486095662e-06, + "loss": 0.0871, + "step": 662 + }, + { + "epoch": 0.22140591083653363, + "grad_norm": 0.6508406370720058, + "learning_rate": 7.374860956618466e-06, + "loss": 0.0538, + "step": 663 + }, + { + "epoch": 0.22173985640340624, + "grad_norm": 0.6498773981616411, + "learning_rate": 7.385984427141269e-06, + "loss": 0.0421, + "step": 664 + }, + { + "epoch": 0.22207380197027884, + "grad_norm": 0.5632972140069444, + "learning_rate": 7.397107897664072e-06, + "loss": 0.0401, + "step": 665 + }, + { + "epoch": 0.22240774753715145, + "grad_norm": 0.6955201001818666, + "learning_rate": 7.408231368186875e-06, + "loss": 0.0515, + "step": 666 + }, + { + "epoch": 0.22274169310402406, + "grad_norm": 0.9245962605688041, + "learning_rate": 7.4193548387096784e-06, + "loss": 0.0658, + "step": 667 + }, + { + "epoch": 0.22307563867089664, + "grad_norm": 0.8566192824542013, + "learning_rate": 7.43047830923248e-06, + "loss": 0.0644, + "step": 668 + }, + { + "epoch": 0.22340958423776924, + "grad_norm": 0.7879617681877166, + "learning_rate": 7.441601779755284e-06, + "loss": 0.08, + "step": 669 + }, + { + "epoch": 0.22374352980464185, + "grad_norm": 0.5720327250980868, + "learning_rate": 7.452725250278087e-06, + "loss": 0.0602, + "step": 670 + }, + { + "epoch": 0.22407747537151446, + "grad_norm": 0.7243965181151018, + "learning_rate": 7.46384872080089e-06, + "loss": 0.0538, + "step": 671 + }, + { + "epoch": 0.22441142093838704, + "grad_norm": 0.6765854869756044, + "learning_rate": 7.474972191323694e-06, + "loss": 0.0438, + "step": 672 + }, + { + "epoch": 0.22474536650525964, + "grad_norm": 1.1551543065277514, + "learning_rate": 7.486095661846496e-06, + "loss": 0.0641, + "step": 673 + }, + { + "epoch": 0.22507931207213225, + "grad_norm": 1.2471246659573005, + "learning_rate": 7.4972191323693e-06, + "loss": 0.0954, + "step": 674 + }, + { + "epoch": 0.22541325763900485, + "grad_norm": 0.717581831189048, + "learning_rate": 7.508342602892103e-06, + "loss": 0.0717, + "step": 675 + }, + { + "epoch": 0.22574720320587743, + "grad_norm": 1.4383898751001543, + "learning_rate": 7.519466073414906e-06, + "loss": 0.0681, + "step": 676 + }, + { + "epoch": 0.22608114877275004, + "grad_norm": 0.6278799747313417, + "learning_rate": 7.53058954393771e-06, + "loss": 0.0368, + "step": 677 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 1.078681505592307, + "learning_rate": 7.5417130144605125e-06, + "loss": 0.055, + "step": 678 + }, + { + "epoch": 0.22674903990649525, + "grad_norm": 0.8391619044412681, + "learning_rate": 7.552836484983316e-06, + "loss": 0.057, + "step": 679 + }, + { + "epoch": 0.22708298547336783, + "grad_norm": 1.0326612676142242, + "learning_rate": 7.563959955506118e-06, + "loss": 0.0546, + "step": 680 + }, + { + "epoch": 0.22741693104024044, + "grad_norm": 0.6535947488360222, + "learning_rate": 7.575083426028922e-06, + "loss": 0.0574, + "step": 681 + }, + { + "epoch": 0.22775087660711305, + "grad_norm": 0.814336343141877, + "learning_rate": 7.586206896551724e-06, + "loss": 0.0697, + "step": 682 + }, + { + "epoch": 0.22808482217398565, + "grad_norm": 0.8931605432690218, + "learning_rate": 7.597330367074528e-06, + "loss": 0.0581, + "step": 683 + }, + { + "epoch": 0.22841876774085823, + "grad_norm": 0.698447467488072, + "learning_rate": 7.60845383759733e-06, + "loss": 0.0501, + "step": 684 + }, + { + "epoch": 0.22875271330773084, + "grad_norm": 0.8023761432510702, + "learning_rate": 7.619577308120134e-06, + "loss": 0.0847, + "step": 685 + }, + { + "epoch": 0.22908665887460344, + "grad_norm": 0.6095282065363956, + "learning_rate": 7.630700778642938e-06, + "loss": 0.062, + "step": 686 + }, + { + "epoch": 0.22942060444147605, + "grad_norm": 0.4774849614665295, + "learning_rate": 7.64182424916574e-06, + "loss": 0.0532, + "step": 687 + }, + { + "epoch": 0.22975455000834863, + "grad_norm": 0.5898418038186112, + "learning_rate": 7.652947719688543e-06, + "loss": 0.0516, + "step": 688 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 0.8783761532562184, + "learning_rate": 7.664071190211346e-06, + "loss": 0.0693, + "step": 689 + }, + { + "epoch": 0.23042244114209384, + "grad_norm": 0.6022197037001502, + "learning_rate": 7.67519466073415e-06, + "loss": 0.0441, + "step": 690 + }, + { + "epoch": 0.23075638670896645, + "grad_norm": 0.7320630483302178, + "learning_rate": 7.686318131256953e-06, + "loss": 0.0645, + "step": 691 + }, + { + "epoch": 0.23109033227583903, + "grad_norm": 0.7168522057955218, + "learning_rate": 7.697441601779755e-06, + "loss": 0.0585, + "step": 692 + }, + { + "epoch": 0.23142427784271163, + "grad_norm": 0.6468450826874373, + "learning_rate": 7.70856507230256e-06, + "loss": 0.0438, + "step": 693 + }, + { + "epoch": 0.23175822340958424, + "grad_norm": 0.6036146013357945, + "learning_rate": 7.719688542825363e-06, + "loss": 0.0351, + "step": 694 + }, + { + "epoch": 0.23209216897645685, + "grad_norm": 0.6184625300068487, + "learning_rate": 7.730812013348165e-06, + "loss": 0.054, + "step": 695 + }, + { + "epoch": 0.23242611454332943, + "grad_norm": 0.7161536721534412, + "learning_rate": 7.741935483870968e-06, + "loss": 0.0498, + "step": 696 + }, + { + "epoch": 0.23276006011020203, + "grad_norm": 0.92662755009442, + "learning_rate": 7.753058954393772e-06, + "loss": 0.0619, + "step": 697 + }, + { + "epoch": 0.23309400567707464, + "grad_norm": 0.9741063837144837, + "learning_rate": 7.764182424916575e-06, + "loss": 0.0615, + "step": 698 + }, + { + "epoch": 0.23342795124394725, + "grad_norm": 0.6571626264384028, + "learning_rate": 7.775305895439378e-06, + "loss": 0.0691, + "step": 699 + }, + { + "epoch": 0.23376189681081982, + "grad_norm": 0.6623436201391426, + "learning_rate": 7.78642936596218e-06, + "loss": 0.0631, + "step": 700 + }, + { + "epoch": 0.23409584237769243, + "grad_norm": 0.5671382014262107, + "learning_rate": 7.797552836484983e-06, + "loss": 0.0528, + "step": 701 + }, + { + "epoch": 0.23442978794456504, + "grad_norm": 0.5186931729414294, + "learning_rate": 7.808676307007788e-06, + "loss": 0.0473, + "step": 702 + }, + { + "epoch": 0.23476373351143764, + "grad_norm": 0.6306362158538241, + "learning_rate": 7.81979977753059e-06, + "loss": 0.0552, + "step": 703 + }, + { + "epoch": 0.23509767907831022, + "grad_norm": 0.5466189227341881, + "learning_rate": 7.830923248053393e-06, + "loss": 0.0568, + "step": 704 + }, + { + "epoch": 0.23543162464518283, + "grad_norm": 0.5613237493646677, + "learning_rate": 7.842046718576196e-06, + "loss": 0.0546, + "step": 705 + }, + { + "epoch": 0.23576557021205544, + "grad_norm": 0.5138027671363771, + "learning_rate": 7.853170189099e-06, + "loss": 0.0455, + "step": 706 + }, + { + "epoch": 0.23609951577892804, + "grad_norm": 1.0000382007451012, + "learning_rate": 7.864293659621803e-06, + "loss": 0.0815, + "step": 707 + }, + { + "epoch": 0.23643346134580062, + "grad_norm": 0.5309889504993703, + "learning_rate": 7.875417130144606e-06, + "loss": 0.0415, + "step": 708 + }, + { + "epoch": 0.23676740691267323, + "grad_norm": 0.601523770166247, + "learning_rate": 7.886540600667408e-06, + "loss": 0.0424, + "step": 709 + }, + { + "epoch": 0.23710135247954583, + "grad_norm": 0.5614576890827592, + "learning_rate": 7.897664071190213e-06, + "loss": 0.0511, + "step": 710 + }, + { + "epoch": 0.23743529804641844, + "grad_norm": 1.1787590770393492, + "learning_rate": 7.908787541713015e-06, + "loss": 0.0555, + "step": 711 + }, + { + "epoch": 0.23776924361329102, + "grad_norm": 0.6857456493243878, + "learning_rate": 7.919911012235818e-06, + "loss": 0.0489, + "step": 712 + }, + { + "epoch": 0.23810318918016363, + "grad_norm": 0.6570662490997047, + "learning_rate": 7.93103448275862e-06, + "loss": 0.0632, + "step": 713 + }, + { + "epoch": 0.23843713474703623, + "grad_norm": 0.6206550500253379, + "learning_rate": 7.942157953281424e-06, + "loss": 0.0581, + "step": 714 + }, + { + "epoch": 0.23877108031390884, + "grad_norm": 0.6819702882562815, + "learning_rate": 7.953281423804228e-06, + "loss": 0.0773, + "step": 715 + }, + { + "epoch": 0.23910502588078142, + "grad_norm": 0.6584842233833688, + "learning_rate": 7.96440489432703e-06, + "loss": 0.0606, + "step": 716 + }, + { + "epoch": 0.23943897144765403, + "grad_norm": 0.708678681452552, + "learning_rate": 7.975528364849833e-06, + "loss": 0.0527, + "step": 717 + }, + { + "epoch": 0.23977291701452663, + "grad_norm": 0.6559013503580498, + "learning_rate": 7.986651835372638e-06, + "loss": 0.0468, + "step": 718 + }, + { + "epoch": 0.24010686258139924, + "grad_norm": 0.6089295320922504, + "learning_rate": 7.99777530589544e-06, + "loss": 0.0469, + "step": 719 + }, + { + "epoch": 0.24044080814827185, + "grad_norm": 0.5421853109439192, + "learning_rate": 8.008898776418243e-06, + "loss": 0.0425, + "step": 720 + }, + { + "epoch": 0.24077475371514442, + "grad_norm": 0.6646654479239261, + "learning_rate": 8.020022246941046e-06, + "loss": 0.0526, + "step": 721 + }, + { + "epoch": 0.24110869928201703, + "grad_norm": 0.5909129859046834, + "learning_rate": 8.03114571746385e-06, + "loss": 0.0578, + "step": 722 + }, + { + "epoch": 0.24144264484888964, + "grad_norm": 0.5465817247840761, + "learning_rate": 8.042269187986651e-06, + "loss": 0.0381, + "step": 723 + }, + { + "epoch": 0.24177659041576224, + "grad_norm": 0.5663522143293497, + "learning_rate": 8.053392658509456e-06, + "loss": 0.0456, + "step": 724 + }, + { + "epoch": 0.24211053598263482, + "grad_norm": 0.7099755585096478, + "learning_rate": 8.064516129032258e-06, + "loss": 0.0553, + "step": 725 + }, + { + "epoch": 0.24244448154950743, + "grad_norm": 0.4703416320234455, + "learning_rate": 8.075639599555061e-06, + "loss": 0.0314, + "step": 726 + }, + { + "epoch": 0.24277842711638004, + "grad_norm": 0.748131315704341, + "learning_rate": 8.086763070077866e-06, + "loss": 0.0575, + "step": 727 + }, + { + "epoch": 0.24311237268325264, + "grad_norm": 0.7959707312517568, + "learning_rate": 8.097886540600668e-06, + "loss": 0.0522, + "step": 728 + }, + { + "epoch": 0.24344631825012522, + "grad_norm": 0.7143826412790298, + "learning_rate": 8.109010011123471e-06, + "loss": 0.0603, + "step": 729 + }, + { + "epoch": 0.24378026381699783, + "grad_norm": 0.7546112375024333, + "learning_rate": 8.120133481646274e-06, + "loss": 0.0447, + "step": 730 + }, + { + "epoch": 0.24411420938387043, + "grad_norm": 1.320002478448852, + "learning_rate": 8.131256952169078e-06, + "loss": 0.0613, + "step": 731 + }, + { + "epoch": 0.24444815495074304, + "grad_norm": 0.9123876302024165, + "learning_rate": 8.14238042269188e-06, + "loss": 0.0555, + "step": 732 + }, + { + "epoch": 0.24478210051761562, + "grad_norm": 1.0679687444316195, + "learning_rate": 8.153503893214683e-06, + "loss": 0.0661, + "step": 733 + }, + { + "epoch": 0.24511604608448823, + "grad_norm": 0.5913377615797212, + "learning_rate": 8.164627363737486e-06, + "loss": 0.0536, + "step": 734 + }, + { + "epoch": 0.24544999165136083, + "grad_norm": 0.6908337267132758, + "learning_rate": 8.17575083426029e-06, + "loss": 0.0545, + "step": 735 + }, + { + "epoch": 0.24578393721823344, + "grad_norm": 0.7525711702575639, + "learning_rate": 8.186874304783093e-06, + "loss": 0.0422, + "step": 736 + }, + { + "epoch": 0.24611788278510602, + "grad_norm": 0.6795534047118373, + "learning_rate": 8.197997775305896e-06, + "loss": 0.0435, + "step": 737 + }, + { + "epoch": 0.24645182835197862, + "grad_norm": 0.7419691167366567, + "learning_rate": 8.209121245828699e-06, + "loss": 0.0458, + "step": 738 + }, + { + "epoch": 0.24678577391885123, + "grad_norm": 0.9596461385373865, + "learning_rate": 8.220244716351501e-06, + "loss": 0.0529, + "step": 739 + }, + { + "epoch": 0.24711971948572384, + "grad_norm": 1.0129102887265393, + "learning_rate": 8.231368186874306e-06, + "loss": 0.0739, + "step": 740 + }, + { + "epoch": 0.24745366505259642, + "grad_norm": 1.227881171847166, + "learning_rate": 8.242491657397109e-06, + "loss": 0.0622, + "step": 741 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 0.6383626322611397, + "learning_rate": 8.253615127919911e-06, + "loss": 0.0607, + "step": 742 + }, + { + "epoch": 0.24812155618634163, + "grad_norm": 0.9642447588805736, + "learning_rate": 8.264738598442716e-06, + "loss": 0.0644, + "step": 743 + }, + { + "epoch": 0.24845550175321424, + "grad_norm": 0.8593161289423901, + "learning_rate": 8.275862068965518e-06, + "loss": 0.0537, + "step": 744 + }, + { + "epoch": 0.24878944732008682, + "grad_norm": 0.8432266000953739, + "learning_rate": 8.286985539488321e-06, + "loss": 0.0653, + "step": 745 + }, + { + "epoch": 0.24912339288695942, + "grad_norm": 0.6532807195982379, + "learning_rate": 8.298109010011124e-06, + "loss": 0.0575, + "step": 746 + }, + { + "epoch": 0.24945733845383203, + "grad_norm": 0.8080274258567304, + "learning_rate": 8.309232480533928e-06, + "loss": 0.0607, + "step": 747 + }, + { + "epoch": 0.24979128402070463, + "grad_norm": 0.7451776946200317, + "learning_rate": 8.32035595105673e-06, + "loss": 0.0688, + "step": 748 + }, + { + "epoch": 0.2501252295875772, + "grad_norm": 1.3723617173501166, + "learning_rate": 8.331479421579534e-06, + "loss": 0.0716, + "step": 749 + }, + { + "epoch": 0.2504591751544498, + "grad_norm": 0.8665671289199542, + "learning_rate": 8.342602892102336e-06, + "loss": 0.045, + "step": 750 + }, + { + "epoch": 0.2507931207213224, + "grad_norm": 1.0379503434218704, + "learning_rate": 8.353726362625139e-06, + "loss": 0.0583, + "step": 751 + }, + { + "epoch": 0.25112706628819503, + "grad_norm": 1.1391729732452065, + "learning_rate": 8.364849833147943e-06, + "loss": 0.0662, + "step": 752 + }, + { + "epoch": 0.25146101185506764, + "grad_norm": 0.8156416909847637, + "learning_rate": 8.375973303670746e-06, + "loss": 0.0554, + "step": 753 + }, + { + "epoch": 0.25179495742194025, + "grad_norm": 1.2010427348455242, + "learning_rate": 8.387096774193549e-06, + "loss": 0.0748, + "step": 754 + }, + { + "epoch": 0.2521289029888128, + "grad_norm": 0.9559734645728132, + "learning_rate": 8.398220244716352e-06, + "loss": 0.0617, + "step": 755 + }, + { + "epoch": 0.2524628485556854, + "grad_norm": 1.0341990720233247, + "learning_rate": 8.409343715239156e-06, + "loss": 0.0584, + "step": 756 + }, + { + "epoch": 0.252796794122558, + "grad_norm": 0.6904656633504147, + "learning_rate": 8.420467185761959e-06, + "loss": 0.0523, + "step": 757 + }, + { + "epoch": 0.2531307396894306, + "grad_norm": 0.6536837531277213, + "learning_rate": 8.431590656284761e-06, + "loss": 0.043, + "step": 758 + }, + { + "epoch": 0.2534646852563032, + "grad_norm": 1.0547443803828938, + "learning_rate": 8.442714126807566e-06, + "loss": 0.0428, + "step": 759 + }, + { + "epoch": 0.25379863082317583, + "grad_norm": 0.8767180736789091, + "learning_rate": 8.453837597330368e-06, + "loss": 0.0492, + "step": 760 + }, + { + "epoch": 0.25413257639004844, + "grad_norm": 0.7923571000035343, + "learning_rate": 8.464961067853171e-06, + "loss": 0.0644, + "step": 761 + }, + { + "epoch": 0.25446652195692104, + "grad_norm": 1.170351442324846, + "learning_rate": 8.476084538375974e-06, + "loss": 0.0725, + "step": 762 + }, + { + "epoch": 0.2548004675237936, + "grad_norm": 0.6763299749127198, + "learning_rate": 8.487208008898777e-06, + "loss": 0.0526, + "step": 763 + }, + { + "epoch": 0.2551344130906662, + "grad_norm": 0.6234066579891239, + "learning_rate": 8.49833147942158e-06, + "loss": 0.0565, + "step": 764 + }, + { + "epoch": 0.2554683586575388, + "grad_norm": 0.9791934529286336, + "learning_rate": 8.509454949944384e-06, + "loss": 0.0602, + "step": 765 + }, + { + "epoch": 0.2558023042244114, + "grad_norm": 0.9128970785663827, + "learning_rate": 8.520578420467186e-06, + "loss": 0.0603, + "step": 766 + }, + { + "epoch": 0.256136249791284, + "grad_norm": 0.7932800291438664, + "learning_rate": 8.531701890989989e-06, + "loss": 0.0515, + "step": 767 + }, + { + "epoch": 0.2564701953581566, + "grad_norm": 0.5921505126585687, + "learning_rate": 8.542825361512793e-06, + "loss": 0.0444, + "step": 768 + }, + { + "epoch": 0.25680414092502923, + "grad_norm": 0.6802827785195577, + "learning_rate": 8.553948832035596e-06, + "loss": 0.055, + "step": 769 + }, + { + "epoch": 0.25713808649190184, + "grad_norm": 0.6020518899742204, + "learning_rate": 8.565072302558399e-06, + "loss": 0.0499, + "step": 770 + }, + { + "epoch": 0.25747203205877445, + "grad_norm": 0.8934328993348921, + "learning_rate": 8.576195773081202e-06, + "loss": 0.0522, + "step": 771 + }, + { + "epoch": 0.257805977625647, + "grad_norm": 0.8499029920395502, + "learning_rate": 8.587319243604006e-06, + "loss": 0.0676, + "step": 772 + }, + { + "epoch": 0.2581399231925196, + "grad_norm": 0.6943521402387964, + "learning_rate": 8.598442714126807e-06, + "loss": 0.0407, + "step": 773 + }, + { + "epoch": 0.2584738687593922, + "grad_norm": 0.6458829421725016, + "learning_rate": 8.609566184649611e-06, + "loss": 0.044, + "step": 774 + }, + { + "epoch": 0.2588078143262648, + "grad_norm": 1.1875978947893053, + "learning_rate": 8.620689655172414e-06, + "loss": 0.0855, + "step": 775 + }, + { + "epoch": 0.2591417598931374, + "grad_norm": 0.744997675331295, + "learning_rate": 8.631813125695217e-06, + "loss": 0.0633, + "step": 776 + }, + { + "epoch": 0.25947570546001003, + "grad_norm": 0.7152958982591525, + "learning_rate": 8.642936596218021e-06, + "loss": 0.0547, + "step": 777 + }, + { + "epoch": 0.25980965102688264, + "grad_norm": 0.6618361809301214, + "learning_rate": 8.654060066740824e-06, + "loss": 0.053, + "step": 778 + }, + { + "epoch": 0.26014359659375524, + "grad_norm": 0.7450567717031901, + "learning_rate": 8.665183537263627e-06, + "loss": 0.0737, + "step": 779 + }, + { + "epoch": 0.2604775421606278, + "grad_norm": 0.7678230051735238, + "learning_rate": 8.67630700778643e-06, + "loss": 0.0713, + "step": 780 + }, + { + "epoch": 0.2608114877275004, + "grad_norm": 0.7930621080238116, + "learning_rate": 8.687430478309234e-06, + "loss": 0.0632, + "step": 781 + }, + { + "epoch": 0.261145433294373, + "grad_norm": 0.779443098676805, + "learning_rate": 8.698553948832036e-06, + "loss": 0.0708, + "step": 782 + }, + { + "epoch": 0.2614793788612456, + "grad_norm": 0.7710866735575862, + "learning_rate": 8.70967741935484e-06, + "loss": 0.0629, + "step": 783 + }, + { + "epoch": 0.2618133244281182, + "grad_norm": 0.5774069846802348, + "learning_rate": 8.720800889877644e-06, + "loss": 0.0502, + "step": 784 + }, + { + "epoch": 0.26214726999499083, + "grad_norm": 1.020267111033492, + "learning_rate": 8.731924360400446e-06, + "loss": 0.0881, + "step": 785 + }, + { + "epoch": 0.26248121556186343, + "grad_norm": 0.6543360371229927, + "learning_rate": 8.743047830923249e-06, + "loss": 0.0588, + "step": 786 + }, + { + "epoch": 0.26281516112873604, + "grad_norm": 1.2169325544853526, + "learning_rate": 8.754171301446052e-06, + "loss": 0.0568, + "step": 787 + }, + { + "epoch": 0.2631491066956086, + "grad_norm": 0.7084276003060112, + "learning_rate": 8.765294771968854e-06, + "loss": 0.0481, + "step": 788 + }, + { + "epoch": 0.2634830522624812, + "grad_norm": 0.885044828105131, + "learning_rate": 8.776418242491657e-06, + "loss": 0.0493, + "step": 789 + }, + { + "epoch": 0.2638169978293538, + "grad_norm": 0.7881657725641322, + "learning_rate": 8.787541713014462e-06, + "loss": 0.0676, + "step": 790 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.6772140173044136, + "learning_rate": 8.798665183537264e-06, + "loss": 0.0823, + "step": 791 + }, + { + "epoch": 0.264484888963099, + "grad_norm": 0.6563446607895745, + "learning_rate": 8.809788654060067e-06, + "loss": 0.0508, + "step": 792 + }, + { + "epoch": 0.2648188345299716, + "grad_norm": 0.7443612991045175, + "learning_rate": 8.820912124582871e-06, + "loss": 0.078, + "step": 793 + }, + { + "epoch": 0.26515278009684423, + "grad_norm": 0.8183742978778376, + "learning_rate": 8.832035595105674e-06, + "loss": 0.0539, + "step": 794 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 0.6131052919596894, + "learning_rate": 8.843159065628477e-06, + "loss": 0.0519, + "step": 795 + }, + { + "epoch": 0.2658206712305894, + "grad_norm": 0.6144509102084704, + "learning_rate": 8.85428253615128e-06, + "loss": 0.0546, + "step": 796 + }, + { + "epoch": 0.266154616797462, + "grad_norm": 0.49063270153285354, + "learning_rate": 8.865406006674084e-06, + "loss": 0.0326, + "step": 797 + }, + { + "epoch": 0.2664885623643346, + "grad_norm": 1.0582370293005623, + "learning_rate": 8.876529477196885e-06, + "loss": 0.0428, + "step": 798 + }, + { + "epoch": 0.2668225079312072, + "grad_norm": 0.9108709900656993, + "learning_rate": 8.88765294771969e-06, + "loss": 0.0628, + "step": 799 + }, + { + "epoch": 0.2671564534980798, + "grad_norm": 0.6789734849125307, + "learning_rate": 8.898776418242492e-06, + "loss": 0.0458, + "step": 800 + }, + { + "epoch": 0.2674903990649524, + "grad_norm": 0.728760335901269, + "learning_rate": 8.909899888765295e-06, + "loss": 0.065, + "step": 801 + }, + { + "epoch": 0.26782434463182503, + "grad_norm": 0.6239280771541308, + "learning_rate": 8.921023359288099e-06, + "loss": 0.0684, + "step": 802 + }, + { + "epoch": 0.26815829019869764, + "grad_norm": 0.7892196073901578, + "learning_rate": 8.932146829810902e-06, + "loss": 0.0546, + "step": 803 + }, + { + "epoch": 0.2684922357655702, + "grad_norm": 0.7022611679612035, + "learning_rate": 8.943270300333705e-06, + "loss": 0.0579, + "step": 804 + }, + { + "epoch": 0.2688261813324428, + "grad_norm": 0.533124691592435, + "learning_rate": 8.954393770856507e-06, + "loss": 0.046, + "step": 805 + }, + { + "epoch": 0.2691601268993154, + "grad_norm": 0.6878117959147262, + "learning_rate": 8.965517241379312e-06, + "loss": 0.0716, + "step": 806 + }, + { + "epoch": 0.269494072466188, + "grad_norm": 0.5774095121132162, + "learning_rate": 8.976640711902114e-06, + "loss": 0.0442, + "step": 807 + }, + { + "epoch": 0.2698280180330606, + "grad_norm": 0.5836306912559, + "learning_rate": 8.987764182424917e-06, + "loss": 0.0502, + "step": 808 + }, + { + "epoch": 0.2701619635999332, + "grad_norm": 0.6607292526260694, + "learning_rate": 8.998887652947721e-06, + "loss": 0.0601, + "step": 809 + }, + { + "epoch": 0.2704959091668058, + "grad_norm": 0.6557024370885054, + "learning_rate": 9.010011123470524e-06, + "loss": 0.0625, + "step": 810 + }, + { + "epoch": 0.27082985473367843, + "grad_norm": 0.7146971542868865, + "learning_rate": 9.021134593993327e-06, + "loss": 0.0534, + "step": 811 + }, + { + "epoch": 0.271163800300551, + "grad_norm": 0.6438970374293348, + "learning_rate": 9.03225806451613e-06, + "loss": 0.0638, + "step": 812 + }, + { + "epoch": 0.2714977458674236, + "grad_norm": 0.5829254200611607, + "learning_rate": 9.043381535038932e-06, + "loss": 0.0472, + "step": 813 + }, + { + "epoch": 0.2718316914342962, + "grad_norm": 0.654413773007462, + "learning_rate": 9.054505005561735e-06, + "loss": 0.0601, + "step": 814 + }, + { + "epoch": 0.2721656370011688, + "grad_norm": 0.5106614833580086, + "learning_rate": 9.06562847608454e-06, + "loss": 0.0465, + "step": 815 + }, + { + "epoch": 0.2724995825680414, + "grad_norm": 0.597727488309152, + "learning_rate": 9.076751946607342e-06, + "loss": 0.0498, + "step": 816 + }, + { + "epoch": 0.272833528134914, + "grad_norm": 0.7658624569422692, + "learning_rate": 9.087875417130145e-06, + "loss": 0.0818, + "step": 817 + }, + { + "epoch": 0.2731674737017866, + "grad_norm": 0.6226947658116192, + "learning_rate": 9.09899888765295e-06, + "loss": 0.049, + "step": 818 + }, + { + "epoch": 0.27350141926865923, + "grad_norm": 0.7874840061595467, + "learning_rate": 9.110122358175752e-06, + "loss": 0.0529, + "step": 819 + }, + { + "epoch": 0.27383536483553184, + "grad_norm": 0.7057495392935673, + "learning_rate": 9.121245828698555e-06, + "loss": 0.0655, + "step": 820 + }, + { + "epoch": 0.2741693104024044, + "grad_norm": 0.7556130731717619, + "learning_rate": 9.132369299221357e-06, + "loss": 0.0612, + "step": 821 + }, + { + "epoch": 0.274503255969277, + "grad_norm": 0.709870989496386, + "learning_rate": 9.143492769744162e-06, + "loss": 0.0536, + "step": 822 + }, + { + "epoch": 0.2748372015361496, + "grad_norm": 0.43499101929326484, + "learning_rate": 9.154616240266963e-06, + "loss": 0.0393, + "step": 823 + }, + { + "epoch": 0.2751711471030222, + "grad_norm": 0.6961537521945201, + "learning_rate": 9.165739710789767e-06, + "loss": 0.0557, + "step": 824 + }, + { + "epoch": 0.2755050926698948, + "grad_norm": 0.7553143434917614, + "learning_rate": 9.176863181312572e-06, + "loss": 0.0663, + "step": 825 + }, + { + "epoch": 0.2758390382367674, + "grad_norm": 0.5720851308592011, + "learning_rate": 9.187986651835373e-06, + "loss": 0.0523, + "step": 826 + }, + { + "epoch": 0.27617298380364, + "grad_norm": 0.5270648394655818, + "learning_rate": 9.199110122358177e-06, + "loss": 0.0455, + "step": 827 + }, + { + "epoch": 0.27650692937051263, + "grad_norm": 0.5470786945731327, + "learning_rate": 9.21023359288098e-06, + "loss": 0.0383, + "step": 828 + }, + { + "epoch": 0.2768408749373852, + "grad_norm": 0.5530166964220198, + "learning_rate": 9.221357063403782e-06, + "loss": 0.0539, + "step": 829 + }, + { + "epoch": 0.2771748205042578, + "grad_norm": 0.7434155435704276, + "learning_rate": 9.232480533926585e-06, + "loss": 0.0494, + "step": 830 + }, + { + "epoch": 0.2775087660711304, + "grad_norm": 0.5809199893108594, + "learning_rate": 9.24360400444939e-06, + "loss": 0.0527, + "step": 831 + }, + { + "epoch": 0.277842711638003, + "grad_norm": 0.7209529033280945, + "learning_rate": 9.254727474972192e-06, + "loss": 0.0537, + "step": 832 + }, + { + "epoch": 0.2781766572048756, + "grad_norm": 0.5102243838580376, + "learning_rate": 9.265850945494995e-06, + "loss": 0.0332, + "step": 833 + }, + { + "epoch": 0.2785106027717482, + "grad_norm": 0.6551621469992068, + "learning_rate": 9.2769744160178e-06, + "loss": 0.0496, + "step": 834 + }, + { + "epoch": 0.2788445483386208, + "grad_norm": 0.6155475477030943, + "learning_rate": 9.288097886540602e-06, + "loss": 0.0519, + "step": 835 + }, + { + "epoch": 0.27917849390549343, + "grad_norm": 0.749997680109745, + "learning_rate": 9.299221357063405e-06, + "loss": 0.0609, + "step": 836 + }, + { + "epoch": 0.279512439472366, + "grad_norm": 0.5426171336986118, + "learning_rate": 9.310344827586207e-06, + "loss": 0.058, + "step": 837 + }, + { + "epoch": 0.2798463850392386, + "grad_norm": 0.5920624994442251, + "learning_rate": 9.32146829810901e-06, + "loss": 0.0444, + "step": 838 + }, + { + "epoch": 0.2801803306061112, + "grad_norm": 0.538004372440245, + "learning_rate": 9.332591768631813e-06, + "loss": 0.0459, + "step": 839 + }, + { + "epoch": 0.2805142761729838, + "grad_norm": 0.5517405979478516, + "learning_rate": 9.343715239154617e-06, + "loss": 0.0482, + "step": 840 + }, + { + "epoch": 0.2808482217398564, + "grad_norm": 0.7691300801734562, + "learning_rate": 9.35483870967742e-06, + "loss": 0.0467, + "step": 841 + }, + { + "epoch": 0.281182167306729, + "grad_norm": 0.5454206931132759, + "learning_rate": 9.365962180200223e-06, + "loss": 0.0462, + "step": 842 + }, + { + "epoch": 0.2815161128736016, + "grad_norm": 0.6584147344642964, + "learning_rate": 9.377085650723027e-06, + "loss": 0.0532, + "step": 843 + }, + { + "epoch": 0.2818500584404742, + "grad_norm": 0.7540716528017773, + "learning_rate": 9.38820912124583e-06, + "loss": 0.0533, + "step": 844 + }, + { + "epoch": 0.2821840040073468, + "grad_norm": 0.6441723405492215, + "learning_rate": 9.399332591768633e-06, + "loss": 0.0572, + "step": 845 + }, + { + "epoch": 0.2825179495742194, + "grad_norm": 0.6245762842213952, + "learning_rate": 9.410456062291435e-06, + "loss": 0.0441, + "step": 846 + }, + { + "epoch": 0.282851895141092, + "grad_norm": 0.6946159964808465, + "learning_rate": 9.42157953281424e-06, + "loss": 0.0468, + "step": 847 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 1.06780412126692, + "learning_rate": 9.43270300333704e-06, + "loss": 0.0714, + "step": 848 + }, + { + "epoch": 0.2835197862748372, + "grad_norm": 0.5119991575843846, + "learning_rate": 9.443826473859845e-06, + "loss": 0.0441, + "step": 849 + }, + { + "epoch": 0.2838537318417098, + "grad_norm": 0.5935557569737968, + "learning_rate": 9.45494994438265e-06, + "loss": 0.046, + "step": 850 + }, + { + "epoch": 0.2841876774085824, + "grad_norm": 0.7699566957350499, + "learning_rate": 9.46607341490545e-06, + "loss": 0.0569, + "step": 851 + }, + { + "epoch": 0.284521622975455, + "grad_norm": 0.5323645354386496, + "learning_rate": 9.477196885428255e-06, + "loss": 0.0562, + "step": 852 + }, + { + "epoch": 0.2848555685423276, + "grad_norm": 0.6008477104164092, + "learning_rate": 9.488320355951058e-06, + "loss": 0.0502, + "step": 853 + }, + { + "epoch": 0.2851895141092002, + "grad_norm": 0.5769074354525283, + "learning_rate": 9.49944382647386e-06, + "loss": 0.0443, + "step": 854 + }, + { + "epoch": 0.2855234596760728, + "grad_norm": 0.6612659226019639, + "learning_rate": 9.510567296996663e-06, + "loss": 0.0401, + "step": 855 + }, + { + "epoch": 0.2858574052429454, + "grad_norm": 0.5255707752772606, + "learning_rate": 9.521690767519467e-06, + "loss": 0.0422, + "step": 856 + }, + { + "epoch": 0.286191350809818, + "grad_norm": 0.5689158347471137, + "learning_rate": 9.53281423804227e-06, + "loss": 0.0538, + "step": 857 + }, + { + "epoch": 0.2865252963766906, + "grad_norm": 0.7181350237218456, + "learning_rate": 9.543937708565073e-06, + "loss": 0.0509, + "step": 858 + }, + { + "epoch": 0.2868592419435632, + "grad_norm": 0.7983998789202897, + "learning_rate": 9.555061179087877e-06, + "loss": 0.0468, + "step": 859 + }, + { + "epoch": 0.2871931875104358, + "grad_norm": 0.6218813290653613, + "learning_rate": 9.56618464961068e-06, + "loss": 0.0496, + "step": 860 + }, + { + "epoch": 0.28752713307730837, + "grad_norm": 0.5965225546980025, + "learning_rate": 9.577308120133483e-06, + "loss": 0.0487, + "step": 861 + }, + { + "epoch": 0.287861078644181, + "grad_norm": 0.4985642306082885, + "learning_rate": 9.588431590656285e-06, + "loss": 0.0593, + "step": 862 + }, + { + "epoch": 0.2881950242110536, + "grad_norm": 1.2787183095841748, + "learning_rate": 9.599555061179088e-06, + "loss": 0.0612, + "step": 863 + }, + { + "epoch": 0.2885289697779262, + "grad_norm": 1.2982367757349111, + "learning_rate": 9.61067853170189e-06, + "loss": 0.0607, + "step": 864 + }, + { + "epoch": 0.2888629153447988, + "grad_norm": 0.678857658457712, + "learning_rate": 9.621802002224695e-06, + "loss": 0.0692, + "step": 865 + }, + { + "epoch": 0.2891968609116714, + "grad_norm": 0.6716018674150093, + "learning_rate": 9.632925472747498e-06, + "loss": 0.0457, + "step": 866 + }, + { + "epoch": 0.289530806478544, + "grad_norm": 0.3721700132825576, + "learning_rate": 9.6440489432703e-06, + "loss": 0.0319, + "step": 867 + }, + { + "epoch": 0.2898647520454166, + "grad_norm": 1.1152820078290502, + "learning_rate": 9.655172413793105e-06, + "loss": 0.0831, + "step": 868 + }, + { + "epoch": 0.29019869761228917, + "grad_norm": 0.7166835764536688, + "learning_rate": 9.666295884315908e-06, + "loss": 0.0592, + "step": 869 + }, + { + "epoch": 0.2905326431791618, + "grad_norm": 0.5634273378329758, + "learning_rate": 9.67741935483871e-06, + "loss": 0.049, + "step": 870 + }, + { + "epoch": 0.2908665887460344, + "grad_norm": 0.9374386237892269, + "learning_rate": 9.688542825361513e-06, + "loss": 0.0465, + "step": 871 + }, + { + "epoch": 0.291200534312907, + "grad_norm": 0.8652631794306441, + "learning_rate": 9.699666295884318e-06, + "loss": 0.0565, + "step": 872 + }, + { + "epoch": 0.2915344798797796, + "grad_norm": 0.5502052384336255, + "learning_rate": 9.710789766407119e-06, + "loss": 0.0436, + "step": 873 + }, + { + "epoch": 0.2918684254466522, + "grad_norm": 0.8517087684298448, + "learning_rate": 9.721913236929923e-06, + "loss": 0.0528, + "step": 874 + }, + { + "epoch": 0.2922023710135248, + "grad_norm": 0.6982728444124906, + "learning_rate": 9.733036707452727e-06, + "loss": 0.0568, + "step": 875 + }, + { + "epoch": 0.2925363165803974, + "grad_norm": 0.5480759138372429, + "learning_rate": 9.744160177975528e-06, + "loss": 0.0506, + "step": 876 + }, + { + "epoch": 0.29287026214727, + "grad_norm": 0.7227485062019516, + "learning_rate": 9.755283648498333e-06, + "loss": 0.0467, + "step": 877 + }, + { + "epoch": 0.2932042077141426, + "grad_norm": 1.2801312311944717, + "learning_rate": 9.766407119021135e-06, + "loss": 0.0587, + "step": 878 + }, + { + "epoch": 0.2935381532810152, + "grad_norm": 0.6922723872684086, + "learning_rate": 9.777530589543938e-06, + "loss": 0.0685, + "step": 879 + }, + { + "epoch": 0.2938720988478878, + "grad_norm": 0.5224930160013006, + "learning_rate": 9.788654060066741e-06, + "loss": 0.0517, + "step": 880 + }, + { + "epoch": 0.2942060444147604, + "grad_norm": 1.3256100303559288, + "learning_rate": 9.799777530589545e-06, + "loss": 0.0634, + "step": 881 + }, + { + "epoch": 0.294539989981633, + "grad_norm": 0.7626222770236885, + "learning_rate": 9.810901001112348e-06, + "loss": 0.0699, + "step": 882 + }, + { + "epoch": 0.2948739355485056, + "grad_norm": 0.6445500365350331, + "learning_rate": 9.82202447163515e-06, + "loss": 0.0567, + "step": 883 + }, + { + "epoch": 0.2952078811153782, + "grad_norm": 0.7133948826260625, + "learning_rate": 9.833147942157955e-06, + "loss": 0.0656, + "step": 884 + }, + { + "epoch": 0.2955418266822508, + "grad_norm": 0.7293006193055029, + "learning_rate": 9.844271412680758e-06, + "loss": 0.0682, + "step": 885 + }, + { + "epoch": 0.29587577224912337, + "grad_norm": 0.723469040579579, + "learning_rate": 9.85539488320356e-06, + "loss": 0.0557, + "step": 886 + }, + { + "epoch": 0.296209717815996, + "grad_norm": 0.9182207199919388, + "learning_rate": 9.866518353726363e-06, + "loss": 0.0757, + "step": 887 + }, + { + "epoch": 0.2965436633828686, + "grad_norm": 0.8696259837392362, + "learning_rate": 9.877641824249166e-06, + "loss": 0.0544, + "step": 888 + }, + { + "epoch": 0.2968776089497412, + "grad_norm": 0.5859047475602415, + "learning_rate": 9.888765294771969e-06, + "loss": 0.0426, + "step": 889 + }, + { + "epoch": 0.2972115545166138, + "grad_norm": 0.8098551825765264, + "learning_rate": 9.899888765294773e-06, + "loss": 0.0545, + "step": 890 + }, + { + "epoch": 0.2975455000834864, + "grad_norm": 0.7610901027395703, + "learning_rate": 9.911012235817576e-06, + "loss": 0.0508, + "step": 891 + }, + { + "epoch": 0.297879445650359, + "grad_norm": 0.7568651557259278, + "learning_rate": 9.922135706340378e-06, + "loss": 0.0673, + "step": 892 + }, + { + "epoch": 0.2982133912172316, + "grad_norm": 1.1494689009052674, + "learning_rate": 9.933259176863183e-06, + "loss": 0.0579, + "step": 893 + }, + { + "epoch": 0.29854733678410417, + "grad_norm": 0.5819263992119622, + "learning_rate": 9.944382647385986e-06, + "loss": 0.0589, + "step": 894 + }, + { + "epoch": 0.2988812823509768, + "grad_norm": 0.6959715412341217, + "learning_rate": 9.955506117908788e-06, + "loss": 0.0636, + "step": 895 + }, + { + "epoch": 0.2992152279178494, + "grad_norm": 0.6561594765200578, + "learning_rate": 9.966629588431591e-06, + "loss": 0.062, + "step": 896 + }, + { + "epoch": 0.299549173484722, + "grad_norm": 0.771047397124875, + "learning_rate": 9.977753058954395e-06, + "loss": 0.0512, + "step": 897 + }, + { + "epoch": 0.2998831190515946, + "grad_norm": 0.7812633820758519, + "learning_rate": 9.988876529477196e-06, + "loss": 0.0589, + "step": 898 + }, + { + "epoch": 0.3002170646184672, + "grad_norm": 0.6414790783874095, + "learning_rate": 1e-05, + "loss": 0.0529, + "step": 899 + }, + { + "epoch": 0.3005510101853398, + "grad_norm": 0.9018861790671471, + "learning_rate": 9.999999622345564e-06, + "loss": 0.0629, + "step": 900 + }, + { + "epoch": 0.3008849557522124, + "grad_norm": 1.1555100050578804, + "learning_rate": 9.999998489382312e-06, + "loss": 0.058, + "step": 901 + }, + { + "epoch": 0.30121890131908496, + "grad_norm": 0.5315076215327028, + "learning_rate": 9.999996601110414e-06, + "loss": 0.0444, + "step": 902 + }, + { + "epoch": 0.30155284688595757, + "grad_norm": 1.1152517950303242, + "learning_rate": 9.999993957530157e-06, + "loss": 0.0614, + "step": 903 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.6956334431936639, + "learning_rate": 9.999990558641939e-06, + "loss": 0.0578, + "step": 904 + }, + { + "epoch": 0.3022207380197028, + "grad_norm": 0.6092546044991591, + "learning_rate": 9.999986404446276e-06, + "loss": 0.0533, + "step": 905 + }, + { + "epoch": 0.3025546835865754, + "grad_norm": 0.9717116959813895, + "learning_rate": 9.999981494943791e-06, + "loss": 0.0716, + "step": 906 + }, + { + "epoch": 0.302888629153448, + "grad_norm": 0.9659324392329879, + "learning_rate": 9.99997583013523e-06, + "loss": 0.065, + "step": 907 + }, + { + "epoch": 0.3032225747203206, + "grad_norm": 0.40111083516509344, + "learning_rate": 9.999969410021447e-06, + "loss": 0.0425, + "step": 908 + }, + { + "epoch": 0.3035565202871932, + "grad_norm": 0.9630082239232272, + "learning_rate": 9.999962234603412e-06, + "loss": 0.0653, + "step": 909 + }, + { + "epoch": 0.30389046585406576, + "grad_norm": 0.6032894259625411, + "learning_rate": 9.99995430388221e-06, + "loss": 0.0411, + "step": 910 + }, + { + "epoch": 0.30422441142093837, + "grad_norm": 0.6243752152697278, + "learning_rate": 9.999945617859034e-06, + "loss": 0.0523, + "step": 911 + }, + { + "epoch": 0.304558356987811, + "grad_norm": 0.6661838900721574, + "learning_rate": 9.999936176535203e-06, + "loss": 0.0559, + "step": 912 + }, + { + "epoch": 0.3048923025546836, + "grad_norm": 0.7847123440405521, + "learning_rate": 9.99992597991214e-06, + "loss": 0.0547, + "step": 913 + }, + { + "epoch": 0.3052262481215562, + "grad_norm": 0.6034643612676034, + "learning_rate": 9.999915027991384e-06, + "loss": 0.0458, + "step": 914 + }, + { + "epoch": 0.3055601936884288, + "grad_norm": 0.6373222518679178, + "learning_rate": 9.999903320774593e-06, + "loss": 0.0436, + "step": 915 + }, + { + "epoch": 0.3058941392553014, + "grad_norm": 0.7836447446906802, + "learning_rate": 9.999890858263532e-06, + "loss": 0.051, + "step": 916 + }, + { + "epoch": 0.306228084822174, + "grad_norm": 0.6508583114723121, + "learning_rate": 9.999877640460085e-06, + "loss": 0.0473, + "step": 917 + }, + { + "epoch": 0.30656203038904656, + "grad_norm": 0.5706335931118961, + "learning_rate": 9.999863667366249e-06, + "loss": 0.0612, + "step": 918 + }, + { + "epoch": 0.30689597595591916, + "grad_norm": 0.6946210169872966, + "learning_rate": 9.999848938984135e-06, + "loss": 0.0472, + "step": 919 + }, + { + "epoch": 0.30722992152279177, + "grad_norm": 0.5162847660972449, + "learning_rate": 9.999833455315966e-06, + "loss": 0.0567, + "step": 920 + }, + { + "epoch": 0.3075638670896644, + "grad_norm": 0.6282724513719338, + "learning_rate": 9.999817216364085e-06, + "loss": 0.0586, + "step": 921 + }, + { + "epoch": 0.307897812656537, + "grad_norm": 0.5970451629512815, + "learning_rate": 9.99980022213094e-06, + "loss": 0.0541, + "step": 922 + }, + { + "epoch": 0.3082317582234096, + "grad_norm": 0.6049044469788369, + "learning_rate": 9.999782472619102e-06, + "loss": 0.0494, + "step": 923 + }, + { + "epoch": 0.3085657037902822, + "grad_norm": 0.6449474401906675, + "learning_rate": 9.99976396783125e-06, + "loss": 0.0616, + "step": 924 + }, + { + "epoch": 0.3088996493571548, + "grad_norm": 0.8027233078599899, + "learning_rate": 9.999744707770182e-06, + "loss": 0.0441, + "step": 925 + }, + { + "epoch": 0.3092335949240274, + "grad_norm": 0.8880518704433723, + "learning_rate": 9.999724692438805e-06, + "loss": 0.0791, + "step": 926 + }, + { + "epoch": 0.30956754049089996, + "grad_norm": 0.5138081603128517, + "learning_rate": 9.999703921840143e-06, + "loss": 0.0535, + "step": 927 + }, + { + "epoch": 0.30990148605777257, + "grad_norm": 0.6603602172738352, + "learning_rate": 9.999682395977334e-06, + "loss": 0.0551, + "step": 928 + }, + { + "epoch": 0.3102354316246452, + "grad_norm": 0.6890303648289062, + "learning_rate": 9.999660114853631e-06, + "loss": 0.0482, + "step": 929 + }, + { + "epoch": 0.3105693771915178, + "grad_norm": 0.43150917486352325, + "learning_rate": 9.999637078472398e-06, + "loss": 0.039, + "step": 930 + }, + { + "epoch": 0.3109033227583904, + "grad_norm": 0.45827832678573877, + "learning_rate": 9.999613286837115e-06, + "loss": 0.0352, + "step": 931 + }, + { + "epoch": 0.311237268325263, + "grad_norm": 0.6168319762636703, + "learning_rate": 9.999588739951376e-06, + "loss": 0.0588, + "step": 932 + }, + { + "epoch": 0.3115712138921356, + "grad_norm": 0.6007064538712662, + "learning_rate": 9.99956343781889e-06, + "loss": 0.067, + "step": 933 + }, + { + "epoch": 0.3119051594590082, + "grad_norm": 0.6869659934736515, + "learning_rate": 9.999537380443479e-06, + "loss": 0.0548, + "step": 934 + }, + { + "epoch": 0.31223910502588076, + "grad_norm": 0.6822343644574054, + "learning_rate": 9.999510567829079e-06, + "loss": 0.0456, + "step": 935 + }, + { + "epoch": 0.31257305059275337, + "grad_norm": 0.6001577713535029, + "learning_rate": 9.999482999979739e-06, + "loss": 0.0591, + "step": 936 + }, + { + "epoch": 0.31290699615962597, + "grad_norm": 0.8075508966129092, + "learning_rate": 9.999454676899628e-06, + "loss": 0.0573, + "step": 937 + }, + { + "epoch": 0.3132409417264986, + "grad_norm": 0.9578024765896318, + "learning_rate": 9.999425598593018e-06, + "loss": 0.0689, + "step": 938 + }, + { + "epoch": 0.3135748872933712, + "grad_norm": 0.6553702398957317, + "learning_rate": 9.999395765064308e-06, + "loss": 0.0749, + "step": 939 + }, + { + "epoch": 0.3139088328602438, + "grad_norm": 0.5845552152591339, + "learning_rate": 9.999365176318e-06, + "loss": 0.0464, + "step": 940 + }, + { + "epoch": 0.3142427784271164, + "grad_norm": 0.8267163038157918, + "learning_rate": 9.999333832358716e-06, + "loss": 0.062, + "step": 941 + }, + { + "epoch": 0.314576723993989, + "grad_norm": 0.6776387963285405, + "learning_rate": 9.999301733191193e-06, + "loss": 0.0404, + "step": 942 + }, + { + "epoch": 0.31491066956086156, + "grad_norm": 0.954616904867087, + "learning_rate": 9.999268878820278e-06, + "loss": 0.0676, + "step": 943 + }, + { + "epoch": 0.31524461512773416, + "grad_norm": 0.8261235198634368, + "learning_rate": 9.999235269250933e-06, + "loss": 0.0496, + "step": 944 + }, + { + "epoch": 0.31557856069460677, + "grad_norm": 0.4354925203700167, + "learning_rate": 9.999200904488238e-06, + "loss": 0.0453, + "step": 945 + }, + { + "epoch": 0.3159125062614794, + "grad_norm": 0.6393409713170454, + "learning_rate": 9.999165784537381e-06, + "loss": 0.0562, + "step": 946 + }, + { + "epoch": 0.316246451828352, + "grad_norm": 0.5051127870569156, + "learning_rate": 9.999129909403671e-06, + "loss": 0.0414, + "step": 947 + }, + { + "epoch": 0.3165803973952246, + "grad_norm": 0.6830945915913949, + "learning_rate": 9.999093279092524e-06, + "loss": 0.0657, + "step": 948 + }, + { + "epoch": 0.3169143429620972, + "grad_norm": 0.7493036064245449, + "learning_rate": 9.999055893609475e-06, + "loss": 0.0685, + "step": 949 + }, + { + "epoch": 0.3172482885289698, + "grad_norm": 0.6573495934445489, + "learning_rate": 9.999017752960172e-06, + "loss": 0.0541, + "step": 950 + }, + { + "epoch": 0.31758223409584235, + "grad_norm": 0.8220607798846838, + "learning_rate": 9.998978857150375e-06, + "loss": 0.0772, + "step": 951 + }, + { + "epoch": 0.31791617966271496, + "grad_norm": 0.6051049278520222, + "learning_rate": 9.99893920618596e-06, + "loss": 0.0548, + "step": 952 + }, + { + "epoch": 0.31825012522958757, + "grad_norm": 0.8396686530912078, + "learning_rate": 9.998898800072919e-06, + "loss": 0.0559, + "step": 953 + }, + { + "epoch": 0.3185840707964602, + "grad_norm": 0.5138888258715585, + "learning_rate": 9.998857638817354e-06, + "loss": 0.0408, + "step": 954 + }, + { + "epoch": 0.3189180163633328, + "grad_norm": 0.5930870657595226, + "learning_rate": 9.99881572242548e-06, + "loss": 0.0553, + "step": 955 + }, + { + "epoch": 0.3192519619302054, + "grad_norm": 0.5739338240819899, + "learning_rate": 9.998773050903637e-06, + "loss": 0.0451, + "step": 956 + }, + { + "epoch": 0.319585907497078, + "grad_norm": 0.6488602357523269, + "learning_rate": 9.998729624258262e-06, + "loss": 0.0547, + "step": 957 + }, + { + "epoch": 0.3199198530639506, + "grad_norm": 0.4766136004536333, + "learning_rate": 9.998685442495921e-06, + "loss": 0.0545, + "step": 958 + }, + { + "epoch": 0.32025379863082315, + "grad_norm": 0.6496173450622319, + "learning_rate": 9.998640505623284e-06, + "loss": 0.0527, + "step": 959 + }, + { + "epoch": 0.32058774419769576, + "grad_norm": 0.7607961932674538, + "learning_rate": 9.998594813647145e-06, + "loss": 0.0495, + "step": 960 + }, + { + "epoch": 0.32092168976456836, + "grad_norm": 0.551714885960616, + "learning_rate": 9.998548366574401e-06, + "loss": 0.0506, + "step": 961 + }, + { + "epoch": 0.32125563533144097, + "grad_norm": 0.44257355961637446, + "learning_rate": 9.99850116441207e-06, + "loss": 0.0409, + "step": 962 + }, + { + "epoch": 0.3215895808983136, + "grad_norm": 0.5235000652023187, + "learning_rate": 9.998453207167282e-06, + "loss": 0.0664, + "step": 963 + }, + { + "epoch": 0.3219235264651862, + "grad_norm": 0.597540827350683, + "learning_rate": 9.998404494847285e-06, + "loss": 0.0553, + "step": 964 + }, + { + "epoch": 0.3222574720320588, + "grad_norm": 0.49350745999906936, + "learning_rate": 9.998355027459432e-06, + "loss": 0.057, + "step": 965 + }, + { + "epoch": 0.3225914175989314, + "grad_norm": 0.5319476769802207, + "learning_rate": 9.998304805011199e-06, + "loss": 0.0516, + "step": 966 + }, + { + "epoch": 0.32292536316580395, + "grad_norm": 0.6810796573105793, + "learning_rate": 9.998253827510173e-06, + "loss": 0.0636, + "step": 967 + }, + { + "epoch": 0.32325930873267655, + "grad_norm": 0.8578957548648788, + "learning_rate": 9.998202094964053e-06, + "loss": 0.0741, + "step": 968 + }, + { + "epoch": 0.32359325429954916, + "grad_norm": 0.685083143230946, + "learning_rate": 9.998149607380654e-06, + "loss": 0.0414, + "step": 969 + }, + { + "epoch": 0.32392719986642177, + "grad_norm": 0.5894900776993433, + "learning_rate": 9.998096364767906e-06, + "loss": 0.0545, + "step": 970 + }, + { + "epoch": 0.3242611454332944, + "grad_norm": 0.768268478439697, + "learning_rate": 9.998042367133854e-06, + "loss": 0.0616, + "step": 971 + }, + { + "epoch": 0.324595091000167, + "grad_norm": 0.5999522250409731, + "learning_rate": 9.997987614486648e-06, + "loss": 0.0404, + "step": 972 + }, + { + "epoch": 0.3249290365670396, + "grad_norm": 0.5388683391884815, + "learning_rate": 9.997932106834567e-06, + "loss": 0.0387, + "step": 973 + }, + { + "epoch": 0.3252629821339122, + "grad_norm": 0.7358432151948985, + "learning_rate": 9.997875844185991e-06, + "loss": 0.0679, + "step": 974 + }, + { + "epoch": 0.3255969277007848, + "grad_norm": 0.7408974822801317, + "learning_rate": 9.99781882654942e-06, + "loss": 0.0512, + "step": 975 + }, + { + "epoch": 0.32593087326765735, + "grad_norm": 0.46737257421637174, + "learning_rate": 9.997761053933469e-06, + "loss": 0.0501, + "step": 976 + }, + { + "epoch": 0.32626481883452996, + "grad_norm": 0.49551151355877743, + "learning_rate": 9.997702526346864e-06, + "loss": 0.0511, + "step": 977 + }, + { + "epoch": 0.32659876440140256, + "grad_norm": 0.7232364196431961, + "learning_rate": 9.997643243798446e-06, + "loss": 0.06, + "step": 978 + }, + { + "epoch": 0.32693270996827517, + "grad_norm": 0.4266653292183982, + "learning_rate": 9.99758320629717e-06, + "loss": 0.0358, + "step": 979 + }, + { + "epoch": 0.3272666555351478, + "grad_norm": 0.49875757240014273, + "learning_rate": 9.997522413852108e-06, + "loss": 0.0539, + "step": 980 + }, + { + "epoch": 0.3276006011020204, + "grad_norm": 0.43873085951347823, + "learning_rate": 9.997460866472439e-06, + "loss": 0.0531, + "step": 981 + }, + { + "epoch": 0.327934546668893, + "grad_norm": 0.8212265247081236, + "learning_rate": 9.997398564167465e-06, + "loss": 0.0638, + "step": 982 + }, + { + "epoch": 0.3282684922357656, + "grad_norm": 0.5360694151874814, + "learning_rate": 9.997335506946596e-06, + "loss": 0.047, + "step": 983 + }, + { + "epoch": 0.32860243780263815, + "grad_norm": 0.7864436246170377, + "learning_rate": 9.997271694819354e-06, + "loss": 0.055, + "step": 984 + }, + { + "epoch": 0.32893638336951075, + "grad_norm": 0.6060787744687433, + "learning_rate": 9.997207127795383e-06, + "loss": 0.0418, + "step": 985 + }, + { + "epoch": 0.32927032893638336, + "grad_norm": 0.5567811944339895, + "learning_rate": 9.997141805884436e-06, + "loss": 0.0408, + "step": 986 + }, + { + "epoch": 0.32960427450325597, + "grad_norm": 0.6134680091972111, + "learning_rate": 9.997075729096379e-06, + "loss": 0.0545, + "step": 987 + }, + { + "epoch": 0.3299382200701286, + "grad_norm": 0.7391293100903685, + "learning_rate": 9.997008897441194e-06, + "loss": 0.0441, + "step": 988 + }, + { + "epoch": 0.3302721656370012, + "grad_norm": 0.4972724534693914, + "learning_rate": 9.996941310928978e-06, + "loss": 0.0533, + "step": 989 + }, + { + "epoch": 0.3306061112038738, + "grad_norm": 0.6167427767477403, + "learning_rate": 9.99687296956994e-06, + "loss": 0.049, + "step": 990 + }, + { + "epoch": 0.3309400567707464, + "grad_norm": 0.5005496813016653, + "learning_rate": 9.996803873374402e-06, + "loss": 0.0314, + "step": 991 + }, + { + "epoch": 0.33127400233761894, + "grad_norm": 0.6552605175128834, + "learning_rate": 9.996734022352805e-06, + "loss": 0.0582, + "step": 992 + }, + { + "epoch": 0.33160794790449155, + "grad_norm": 0.4989430154132851, + "learning_rate": 9.9966634165157e-06, + "loss": 0.046, + "step": 993 + }, + { + "epoch": 0.33194189347136416, + "grad_norm": 0.5191323578296947, + "learning_rate": 9.99659205587375e-06, + "loss": 0.0502, + "step": 994 + }, + { + "epoch": 0.33227583903823676, + "grad_norm": 0.8142715454027585, + "learning_rate": 9.996519940437737e-06, + "loss": 0.0715, + "step": 995 + }, + { + "epoch": 0.33260978460510937, + "grad_norm": 0.4443342477966491, + "learning_rate": 9.996447070218557e-06, + "loss": 0.0564, + "step": 996 + }, + { + "epoch": 0.332943730171982, + "grad_norm": 0.7482918825922684, + "learning_rate": 9.996373445227215e-06, + "loss": 0.0455, + "step": 997 + }, + { + "epoch": 0.3332776757388546, + "grad_norm": 0.6779946308046043, + "learning_rate": 9.996299065474832e-06, + "loss": 0.039, + "step": 998 + }, + { + "epoch": 0.3336116213057272, + "grad_norm": 0.7044409037727745, + "learning_rate": 9.996223930972649e-06, + "loss": 0.0564, + "step": 999 + }, + { + "epoch": 0.33394556687259974, + "grad_norm": 0.5201950603047609, + "learning_rate": 9.99614804173201e-06, + "loss": 0.0379, + "step": 1000 + }, + { + "epoch": 0.33427951243947235, + "grad_norm": 0.5919922786821062, + "learning_rate": 9.996071397764381e-06, + "loss": 0.0518, + "step": 1001 + }, + { + "epoch": 0.33461345800634495, + "grad_norm": 0.4468692713433013, + "learning_rate": 9.995993999081343e-06, + "loss": 0.0415, + "step": 1002 + }, + { + "epoch": 0.33494740357321756, + "grad_norm": 0.5228157333947675, + "learning_rate": 9.995915845694584e-06, + "loss": 0.0443, + "step": 1003 + }, + { + "epoch": 0.33528134914009017, + "grad_norm": 0.802915398850966, + "learning_rate": 9.995836937615913e-06, + "loss": 0.0471, + "step": 1004 + }, + { + "epoch": 0.3356152947069628, + "grad_norm": 0.4554512215067062, + "learning_rate": 9.995757274857246e-06, + "loss": 0.0472, + "step": 1005 + }, + { + "epoch": 0.3359492402738354, + "grad_norm": 0.5103336708517712, + "learning_rate": 9.995676857430621e-06, + "loss": 0.0544, + "step": 1006 + }, + { + "epoch": 0.336283185840708, + "grad_norm": 0.5132698819214155, + "learning_rate": 9.995595685348186e-06, + "loss": 0.0564, + "step": 1007 + }, + { + "epoch": 0.33661713140758054, + "grad_norm": 0.5511313978650463, + "learning_rate": 9.995513758622198e-06, + "loss": 0.061, + "step": 1008 + }, + { + "epoch": 0.33695107697445315, + "grad_norm": 0.5019800042411315, + "learning_rate": 9.995431077265038e-06, + "loss": 0.0457, + "step": 1009 + }, + { + "epoch": 0.33728502254132575, + "grad_norm": 0.5196839976674814, + "learning_rate": 9.995347641289194e-06, + "loss": 0.0453, + "step": 1010 + }, + { + "epoch": 0.33761896810819836, + "grad_norm": 0.6156570747013106, + "learning_rate": 9.995263450707273e-06, + "loss": 0.0609, + "step": 1011 + }, + { + "epoch": 0.33795291367507097, + "grad_norm": 0.5044945359037992, + "learning_rate": 9.995178505531989e-06, + "loss": 0.0403, + "step": 1012 + }, + { + "epoch": 0.33828685924194357, + "grad_norm": 0.6239801495147239, + "learning_rate": 9.995092805776175e-06, + "loss": 0.0599, + "step": 1013 + }, + { + "epoch": 0.3386208048088162, + "grad_norm": 0.855667110065577, + "learning_rate": 9.995006351452775e-06, + "loss": 0.0362, + "step": 1014 + }, + { + "epoch": 0.3389547503756888, + "grad_norm": 0.5684075673561093, + "learning_rate": 9.994919142574854e-06, + "loss": 0.0363, + "step": 1015 + }, + { + "epoch": 0.33928869594256134, + "grad_norm": 0.6532875524634826, + "learning_rate": 9.994831179155584e-06, + "loss": 0.0529, + "step": 1016 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.7820987397590781, + "learning_rate": 9.994742461208251e-06, + "loss": 0.0592, + "step": 1017 + }, + { + "epoch": 0.33995658707630655, + "grad_norm": 0.576914782518453, + "learning_rate": 9.994652988746258e-06, + "loss": 0.0433, + "step": 1018 + }, + { + "epoch": 0.34029053264317916, + "grad_norm": 0.9155248759228477, + "learning_rate": 9.994562761783122e-06, + "loss": 0.0585, + "step": 1019 + }, + { + "epoch": 0.34062447821005176, + "grad_norm": 0.49836935453655445, + "learning_rate": 9.99447178033247e-06, + "loss": 0.0466, + "step": 1020 + }, + { + "epoch": 0.34095842377692437, + "grad_norm": 0.6610249361759681, + "learning_rate": 9.99438004440805e-06, + "loss": 0.0486, + "step": 1021 + }, + { + "epoch": 0.341292369343797, + "grad_norm": 0.45444211329207995, + "learning_rate": 9.994287554023717e-06, + "loss": 0.0392, + "step": 1022 + }, + { + "epoch": 0.3416263149106696, + "grad_norm": 0.5625728324533656, + "learning_rate": 9.994194309193442e-06, + "loss": 0.0408, + "step": 1023 + }, + { + "epoch": 0.3419602604775422, + "grad_norm": 0.6195011121917613, + "learning_rate": 9.99410030993131e-06, + "loss": 0.0454, + "step": 1024 + }, + { + "epoch": 0.34229420604441474, + "grad_norm": 0.4493699382996825, + "learning_rate": 9.994005556251525e-06, + "loss": 0.043, + "step": 1025 + }, + { + "epoch": 0.34262815161128735, + "grad_norm": 0.5894717029644626, + "learning_rate": 9.993910048168399e-06, + "loss": 0.0406, + "step": 1026 + }, + { + "epoch": 0.34296209717815995, + "grad_norm": 0.6486042487221031, + "learning_rate": 9.993813785696355e-06, + "loss": 0.0433, + "step": 1027 + }, + { + "epoch": 0.34329604274503256, + "grad_norm": 0.5200330454917651, + "learning_rate": 9.993716768849942e-06, + "loss": 0.0394, + "step": 1028 + }, + { + "epoch": 0.34362998831190517, + "grad_norm": 0.7015607884906295, + "learning_rate": 9.99361899764381e-06, + "loss": 0.0558, + "step": 1029 + }, + { + "epoch": 0.3439639338787778, + "grad_norm": 0.8055226873973813, + "learning_rate": 9.993520472092732e-06, + "loss": 0.0544, + "step": 1030 + }, + { + "epoch": 0.3442978794456504, + "grad_norm": 0.41330444801652355, + "learning_rate": 9.99342119221159e-06, + "loss": 0.04, + "step": 1031 + }, + { + "epoch": 0.344631825012523, + "grad_norm": 0.48400055037656814, + "learning_rate": 9.993321158015379e-06, + "loss": 0.0349, + "step": 1032 + }, + { + "epoch": 0.34496577057939554, + "grad_norm": 0.7469177579288511, + "learning_rate": 9.993220369519215e-06, + "loss": 0.0596, + "step": 1033 + }, + { + "epoch": 0.34529971614626814, + "grad_norm": 0.5104501629279141, + "learning_rate": 9.99311882673832e-06, + "loss": 0.0323, + "step": 1034 + }, + { + "epoch": 0.34563366171314075, + "grad_norm": 0.6392111836152765, + "learning_rate": 9.993016529688033e-06, + "loss": 0.0515, + "step": 1035 + }, + { + "epoch": 0.34596760728001336, + "grad_norm": 0.6188335187658996, + "learning_rate": 9.99291347838381e-06, + "loss": 0.0436, + "step": 1036 + }, + { + "epoch": 0.34630155284688596, + "grad_norm": 0.5010576873349704, + "learning_rate": 9.992809672841218e-06, + "loss": 0.0466, + "step": 1037 + }, + { + "epoch": 0.34663549841375857, + "grad_norm": 0.6905362980082702, + "learning_rate": 9.992705113075933e-06, + "loss": 0.0484, + "step": 1038 + }, + { + "epoch": 0.3469694439806312, + "grad_norm": 0.4979053704058577, + "learning_rate": 9.992599799103754e-06, + "loss": 0.0408, + "step": 1039 + }, + { + "epoch": 0.3473033895475038, + "grad_norm": 0.7295972332427876, + "learning_rate": 9.99249373094059e-06, + "loss": 0.0626, + "step": 1040 + }, + { + "epoch": 0.34763733511437633, + "grad_norm": 0.8606004422561815, + "learning_rate": 9.992386908602466e-06, + "loss": 0.0692, + "step": 1041 + }, + { + "epoch": 0.34797128068124894, + "grad_norm": 0.4083370175535445, + "learning_rate": 9.992279332105512e-06, + "loss": 0.04, + "step": 1042 + }, + { + "epoch": 0.34830522624812155, + "grad_norm": 0.6516779145116438, + "learning_rate": 9.992171001465985e-06, + "loss": 0.0606, + "step": 1043 + }, + { + "epoch": 0.34863917181499415, + "grad_norm": 0.46181250675828023, + "learning_rate": 9.992061916700247e-06, + "loss": 0.0358, + "step": 1044 + }, + { + "epoch": 0.34897311738186676, + "grad_norm": 0.4205777188027031, + "learning_rate": 9.991952077824776e-06, + "loss": 0.0455, + "step": 1045 + }, + { + "epoch": 0.34930706294873937, + "grad_norm": 0.6479523601109954, + "learning_rate": 9.991841484856166e-06, + "loss": 0.0644, + "step": 1046 + }, + { + "epoch": 0.349641008515612, + "grad_norm": 0.4445216305048308, + "learning_rate": 9.991730137811122e-06, + "loss": 0.049, + "step": 1047 + }, + { + "epoch": 0.3499749540824846, + "grad_norm": 0.5093022665200985, + "learning_rate": 9.991618036706464e-06, + "loss": 0.0506, + "step": 1048 + }, + { + "epoch": 0.35030889964935713, + "grad_norm": 0.6502313742689316, + "learning_rate": 9.99150518155913e-06, + "loss": 0.0541, + "step": 1049 + }, + { + "epoch": 0.35064284521622974, + "grad_norm": 0.5924500908160205, + "learning_rate": 9.991391572386162e-06, + "loss": 0.0647, + "step": 1050 + }, + { + "epoch": 0.35097679078310234, + "grad_norm": 0.7866655871375102, + "learning_rate": 9.991277209204728e-06, + "loss": 0.061, + "step": 1051 + }, + { + "epoch": 0.35131073634997495, + "grad_norm": 0.7000607088577165, + "learning_rate": 9.991162092032101e-06, + "loss": 0.0764, + "step": 1052 + }, + { + "epoch": 0.35164468191684756, + "grad_norm": 0.5909027653068674, + "learning_rate": 9.99104622088567e-06, + "loss": 0.0566, + "step": 1053 + }, + { + "epoch": 0.35197862748372016, + "grad_norm": 0.6401638042696182, + "learning_rate": 9.990929595782938e-06, + "loss": 0.0387, + "step": 1054 + }, + { + "epoch": 0.35231257305059277, + "grad_norm": 0.5051120395345109, + "learning_rate": 9.990812216741529e-06, + "loss": 0.0415, + "step": 1055 + }, + { + "epoch": 0.3526465186174654, + "grad_norm": 0.5952307644884801, + "learning_rate": 9.990694083779166e-06, + "loss": 0.0572, + "step": 1056 + }, + { + "epoch": 0.3529804641843379, + "grad_norm": 0.6185415636964717, + "learning_rate": 9.990575196913699e-06, + "loss": 0.0438, + "step": 1057 + }, + { + "epoch": 0.35331440975121053, + "grad_norm": 0.6702811452763382, + "learning_rate": 9.990455556163086e-06, + "loss": 0.0621, + "step": 1058 + }, + { + "epoch": 0.35364835531808314, + "grad_norm": 0.8501224163188211, + "learning_rate": 9.990335161545401e-06, + "loss": 0.062, + "step": 1059 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.5000086771582775, + "learning_rate": 9.99021401307883e-06, + "loss": 0.056, + "step": 1060 + }, + { + "epoch": 0.35431624645182835, + "grad_norm": 0.6794738007002346, + "learning_rate": 9.990092110781675e-06, + "loss": 0.0613, + "step": 1061 + }, + { + "epoch": 0.35465019201870096, + "grad_norm": 0.7239179023620307, + "learning_rate": 9.98996945467235e-06, + "loss": 0.0515, + "step": 1062 + }, + { + "epoch": 0.35498413758557357, + "grad_norm": 0.4383656802090465, + "learning_rate": 9.989846044769384e-06, + "loss": 0.0332, + "step": 1063 + }, + { + "epoch": 0.3553180831524462, + "grad_norm": 0.6911848617553831, + "learning_rate": 9.98972188109142e-06, + "loss": 0.0509, + "step": 1064 + }, + { + "epoch": 0.3556520287193187, + "grad_norm": 0.8755822538317177, + "learning_rate": 9.989596963657213e-06, + "loss": 0.0655, + "step": 1065 + }, + { + "epoch": 0.35598597428619133, + "grad_norm": 1.2761713511915753, + "learning_rate": 9.989471292485636e-06, + "loss": 0.0824, + "step": 1066 + }, + { + "epoch": 0.35631991985306394, + "grad_norm": 0.7097954027148441, + "learning_rate": 9.989344867595668e-06, + "loss": 0.0664, + "step": 1067 + }, + { + "epoch": 0.35665386541993654, + "grad_norm": 0.662383198070942, + "learning_rate": 9.989217689006412e-06, + "loss": 0.0513, + "step": 1068 + }, + { + "epoch": 0.35698781098680915, + "grad_norm": 0.5819687943988014, + "learning_rate": 9.989089756737077e-06, + "loss": 0.0484, + "step": 1069 + }, + { + "epoch": 0.35732175655368176, + "grad_norm": 0.6486771942566604, + "learning_rate": 9.988961070806991e-06, + "loss": 0.055, + "step": 1070 + }, + { + "epoch": 0.35765570212055436, + "grad_norm": 0.5068948204875828, + "learning_rate": 9.988831631235591e-06, + "loss": 0.0504, + "step": 1071 + }, + { + "epoch": 0.35798964768742697, + "grad_norm": 0.475378088086222, + "learning_rate": 9.98870143804243e-06, + "loss": 0.0357, + "step": 1072 + }, + { + "epoch": 0.3583235932542995, + "grad_norm": 0.47100408678138567, + "learning_rate": 9.988570491247179e-06, + "loss": 0.0452, + "step": 1073 + }, + { + "epoch": 0.35865753882117213, + "grad_norm": 0.6000529837893382, + "learning_rate": 9.988438790869616e-06, + "loss": 0.0541, + "step": 1074 + }, + { + "epoch": 0.35899148438804473, + "grad_norm": 0.7406154549381468, + "learning_rate": 9.988306336929637e-06, + "loss": 0.0588, + "step": 1075 + }, + { + "epoch": 0.35932542995491734, + "grad_norm": 0.5825635763306092, + "learning_rate": 9.988173129447251e-06, + "loss": 0.0608, + "step": 1076 + }, + { + "epoch": 0.35965937552178995, + "grad_norm": 0.5387058170920624, + "learning_rate": 9.98803916844258e-06, + "loss": 0.0501, + "step": 1077 + }, + { + "epoch": 0.35999332108866255, + "grad_norm": 0.5341997705770352, + "learning_rate": 9.98790445393586e-06, + "loss": 0.0426, + "step": 1078 + }, + { + "epoch": 0.36032726665553516, + "grad_norm": 0.7479385849991379, + "learning_rate": 9.98776898594744e-06, + "loss": 0.0571, + "step": 1079 + }, + { + "epoch": 0.36066121222240777, + "grad_norm": 0.5834622266603493, + "learning_rate": 9.987632764497787e-06, + "loss": 0.0437, + "step": 1080 + }, + { + "epoch": 0.3609951577892804, + "grad_norm": 0.418269485846868, + "learning_rate": 9.987495789607478e-06, + "loss": 0.0478, + "step": 1081 + }, + { + "epoch": 0.3613291033561529, + "grad_norm": 0.5652661852831252, + "learning_rate": 9.987358061297203e-06, + "loss": 0.0522, + "step": 1082 + }, + { + "epoch": 0.36166304892302553, + "grad_norm": 0.5112807279357294, + "learning_rate": 9.987219579587768e-06, + "loss": 0.0421, + "step": 1083 + }, + { + "epoch": 0.36199699448989814, + "grad_norm": 0.5259032382547962, + "learning_rate": 9.987080344500094e-06, + "loss": 0.0503, + "step": 1084 + }, + { + "epoch": 0.36233094005677075, + "grad_norm": 0.622483979721095, + "learning_rate": 9.986940356055212e-06, + "loss": 0.0435, + "step": 1085 + }, + { + "epoch": 0.36266488562364335, + "grad_norm": 0.4294577205276959, + "learning_rate": 9.986799614274271e-06, + "loss": 0.0406, + "step": 1086 + }, + { + "epoch": 0.36299883119051596, + "grad_norm": 0.45824971878379894, + "learning_rate": 9.986658119178532e-06, + "loss": 0.0533, + "step": 1087 + }, + { + "epoch": 0.36333277675738856, + "grad_norm": 0.5766999228997975, + "learning_rate": 9.986515870789366e-06, + "loss": 0.0467, + "step": 1088 + }, + { + "epoch": 0.36366672232426117, + "grad_norm": 0.5595701577587491, + "learning_rate": 9.986372869128264e-06, + "loss": 0.0574, + "step": 1089 + }, + { + "epoch": 0.3640006678911337, + "grad_norm": 0.7333099233179504, + "learning_rate": 9.986229114216828e-06, + "loss": 0.0469, + "step": 1090 + }, + { + "epoch": 0.36433461345800633, + "grad_norm": 0.4619575799630175, + "learning_rate": 9.986084606076772e-06, + "loss": 0.0431, + "step": 1091 + }, + { + "epoch": 0.36466855902487894, + "grad_norm": 0.44056819753468957, + "learning_rate": 9.985939344729926e-06, + "loss": 0.0322, + "step": 1092 + }, + { + "epoch": 0.36500250459175154, + "grad_norm": 0.5098328152482641, + "learning_rate": 9.985793330198237e-06, + "loss": 0.0474, + "step": 1093 + }, + { + "epoch": 0.36533645015862415, + "grad_norm": 0.40154545526503355, + "learning_rate": 9.98564656250376e-06, + "loss": 0.0398, + "step": 1094 + }, + { + "epoch": 0.36567039572549676, + "grad_norm": 0.6710265566649831, + "learning_rate": 9.985499041668664e-06, + "loss": 0.0534, + "step": 1095 + }, + { + "epoch": 0.36600434129236936, + "grad_norm": 0.47184607186017474, + "learning_rate": 9.985350767715236e-06, + "loss": 0.0474, + "step": 1096 + }, + { + "epoch": 0.36633828685924197, + "grad_norm": 0.43565151040472383, + "learning_rate": 9.985201740665873e-06, + "loss": 0.0376, + "step": 1097 + }, + { + "epoch": 0.3666722324261145, + "grad_norm": 0.5620042551551873, + "learning_rate": 9.98505196054309e-06, + "loss": 0.0439, + "step": 1098 + }, + { + "epoch": 0.3670061779929871, + "grad_norm": 0.5732658369619484, + "learning_rate": 9.98490142736951e-06, + "loss": 0.0452, + "step": 1099 + }, + { + "epoch": 0.36734012355985973, + "grad_norm": 0.5005726529403091, + "learning_rate": 9.984750141167874e-06, + "loss": 0.0491, + "step": 1100 + }, + { + "epoch": 0.36767406912673234, + "grad_norm": 0.7443452256344757, + "learning_rate": 9.984598101961036e-06, + "loss": 0.0518, + "step": 1101 + }, + { + "epoch": 0.36800801469360495, + "grad_norm": 0.6843009797649618, + "learning_rate": 9.984445309771963e-06, + "loss": 0.0688, + "step": 1102 + }, + { + "epoch": 0.36834196026047755, + "grad_norm": 0.5428163389051452, + "learning_rate": 9.984291764623735e-06, + "loss": 0.0401, + "step": 1103 + }, + { + "epoch": 0.36867590582735016, + "grad_norm": 0.8129104692667752, + "learning_rate": 9.98413746653955e-06, + "loss": 0.0471, + "step": 1104 + }, + { + "epoch": 0.36900985139422277, + "grad_norm": 0.42308489801042903, + "learning_rate": 9.983982415542713e-06, + "loss": 0.035, + "step": 1105 + }, + { + "epoch": 0.3693437969610953, + "grad_norm": 0.575726981231527, + "learning_rate": 9.983826611656649e-06, + "loss": 0.0455, + "step": 1106 + }, + { + "epoch": 0.3696777425279679, + "grad_norm": 0.6214448886769854, + "learning_rate": 9.983670054904891e-06, + "loss": 0.0426, + "step": 1107 + }, + { + "epoch": 0.37001168809484053, + "grad_norm": 0.4840384379327109, + "learning_rate": 9.98351274531109e-06, + "loss": 0.0398, + "step": 1108 + }, + { + "epoch": 0.37034563366171314, + "grad_norm": 0.6726458580399798, + "learning_rate": 9.983354682899012e-06, + "loss": 0.0525, + "step": 1109 + }, + { + "epoch": 0.37067957922858574, + "grad_norm": 0.7898451002671479, + "learning_rate": 9.98319586769253e-06, + "loss": 0.0463, + "step": 1110 + }, + { + "epoch": 0.37101352479545835, + "grad_norm": 0.6329293017127283, + "learning_rate": 9.983036299715637e-06, + "loss": 0.0561, + "step": 1111 + }, + { + "epoch": 0.37134747036233096, + "grad_norm": 0.7822396614156675, + "learning_rate": 9.98287597899244e-06, + "loss": 0.0523, + "step": 1112 + }, + { + "epoch": 0.37168141592920356, + "grad_norm": 0.6611247395030577, + "learning_rate": 9.982714905547152e-06, + "loss": 0.0524, + "step": 1113 + }, + { + "epoch": 0.3720153614960761, + "grad_norm": 0.6292445023529448, + "learning_rate": 9.982553079404109e-06, + "loss": 0.0599, + "step": 1114 + }, + { + "epoch": 0.3723493070629487, + "grad_norm": 0.4281011826469759, + "learning_rate": 9.982390500587755e-06, + "loss": 0.0387, + "step": 1115 + }, + { + "epoch": 0.3726832526298213, + "grad_norm": 0.5993569991835048, + "learning_rate": 9.982227169122652e-06, + "loss": 0.0403, + "step": 1116 + }, + { + "epoch": 0.37301719819669393, + "grad_norm": 0.6425741270395904, + "learning_rate": 9.98206308503347e-06, + "loss": 0.0571, + "step": 1117 + }, + { + "epoch": 0.37335114376356654, + "grad_norm": 0.35852961411871975, + "learning_rate": 9.981898248344996e-06, + "loss": 0.0376, + "step": 1118 + }, + { + "epoch": 0.37368508933043915, + "grad_norm": 0.39932054074997514, + "learning_rate": 9.981732659082136e-06, + "loss": 0.0433, + "step": 1119 + }, + { + "epoch": 0.37401903489731175, + "grad_norm": 0.4132193378666293, + "learning_rate": 9.981566317269895e-06, + "loss": 0.0343, + "step": 1120 + }, + { + "epoch": 0.37435298046418436, + "grad_norm": 0.5548913168137909, + "learning_rate": 9.981399222933408e-06, + "loss": 0.0523, + "step": 1121 + }, + { + "epoch": 0.3746869260310569, + "grad_norm": 0.7247522744765278, + "learning_rate": 9.981231376097914e-06, + "loss": 0.0572, + "step": 1122 + }, + { + "epoch": 0.3750208715979295, + "grad_norm": 0.4952993307764918, + "learning_rate": 9.981062776788769e-06, + "loss": 0.0385, + "step": 1123 + }, + { + "epoch": 0.3753548171648021, + "grad_norm": 0.45534515652271046, + "learning_rate": 9.98089342503144e-06, + "loss": 0.0506, + "step": 1124 + }, + { + "epoch": 0.37568876273167473, + "grad_norm": 0.474082473254327, + "learning_rate": 9.980723320851512e-06, + "loss": 0.0494, + "step": 1125 + }, + { + "epoch": 0.37602270829854734, + "grad_norm": 0.5496501514083226, + "learning_rate": 9.98055246427468e-06, + "loss": 0.055, + "step": 1126 + }, + { + "epoch": 0.37635665386541994, + "grad_norm": 0.7029249775324979, + "learning_rate": 9.980380855326754e-06, + "loss": 0.0611, + "step": 1127 + }, + { + "epoch": 0.37669059943229255, + "grad_norm": 0.3571680349771877, + "learning_rate": 9.980208494033659e-06, + "loss": 0.0456, + "step": 1128 + }, + { + "epoch": 0.37702454499916516, + "grad_norm": 0.5787446290422406, + "learning_rate": 9.98003538042143e-06, + "loss": 0.0543, + "step": 1129 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.5193793235190981, + "learning_rate": 9.979861514516217e-06, + "loss": 0.0485, + "step": 1130 + }, + { + "epoch": 0.3776924361329103, + "grad_norm": 0.4450333193144988, + "learning_rate": 9.979686896344289e-06, + "loss": 0.047, + "step": 1131 + }, + { + "epoch": 0.3780263816997829, + "grad_norm": 0.5238410201117426, + "learning_rate": 9.97951152593202e-06, + "loss": 0.0596, + "step": 1132 + }, + { + "epoch": 0.3783603272666555, + "grad_norm": 0.7958266829905504, + "learning_rate": 9.979335403305904e-06, + "loss": 0.0491, + "step": 1133 + }, + { + "epoch": 0.37869427283352813, + "grad_norm": 0.5391712301146426, + "learning_rate": 9.979158528492546e-06, + "loss": 0.0496, + "step": 1134 + }, + { + "epoch": 0.37902821840040074, + "grad_norm": 0.7434018159065988, + "learning_rate": 9.978980901518663e-06, + "loss": 0.0519, + "step": 1135 + }, + { + "epoch": 0.37936216396727335, + "grad_norm": 0.5140398521432605, + "learning_rate": 9.978802522411091e-06, + "loss": 0.0477, + "step": 1136 + }, + { + "epoch": 0.37969610953414595, + "grad_norm": 0.525099189694247, + "learning_rate": 9.978623391196774e-06, + "loss": 0.0627, + "step": 1137 + }, + { + "epoch": 0.38003005510101856, + "grad_norm": 0.8405858175836848, + "learning_rate": 9.978443507902772e-06, + "loss": 0.0523, + "step": 1138 + }, + { + "epoch": 0.3803640006678911, + "grad_norm": 0.5362127551512809, + "learning_rate": 9.978262872556257e-06, + "loss": 0.0525, + "step": 1139 + }, + { + "epoch": 0.3806979462347637, + "grad_norm": 0.8901488866233538, + "learning_rate": 9.97808148518452e-06, + "loss": 0.0598, + "step": 1140 + }, + { + "epoch": 0.3810318918016363, + "grad_norm": 0.8614316646494008, + "learning_rate": 9.977899345814959e-06, + "loss": 0.0779, + "step": 1141 + }, + { + "epoch": 0.38136583736850893, + "grad_norm": 0.5113726601773714, + "learning_rate": 9.977716454475089e-06, + "loss": 0.0508, + "step": 1142 + }, + { + "epoch": 0.38169978293538154, + "grad_norm": 0.4148156818182061, + "learning_rate": 9.977532811192539e-06, + "loss": 0.0416, + "step": 1143 + }, + { + "epoch": 0.38203372850225414, + "grad_norm": 0.5817665146573645, + "learning_rate": 9.977348415995048e-06, + "loss": 0.0421, + "step": 1144 + }, + { + "epoch": 0.38236767406912675, + "grad_norm": 0.6024373699140408, + "learning_rate": 9.977163268910472e-06, + "loss": 0.0457, + "step": 1145 + }, + { + "epoch": 0.38270161963599936, + "grad_norm": 0.4992753612965335, + "learning_rate": 9.976977369966781e-06, + "loss": 0.0411, + "step": 1146 + }, + { + "epoch": 0.3830355652028719, + "grad_norm": 0.43207767333593483, + "learning_rate": 9.976790719192055e-06, + "loss": 0.042, + "step": 1147 + }, + { + "epoch": 0.3833695107697445, + "grad_norm": 0.6297237611298425, + "learning_rate": 9.976603316614492e-06, + "loss": 0.0579, + "step": 1148 + }, + { + "epoch": 0.3837034563366171, + "grad_norm": 0.4638601261210525, + "learning_rate": 9.976415162262401e-06, + "loss": 0.0303, + "step": 1149 + }, + { + "epoch": 0.38403740190348973, + "grad_norm": 0.5053291928207179, + "learning_rate": 9.976226256164204e-06, + "loss": 0.0531, + "step": 1150 + }, + { + "epoch": 0.38437134747036233, + "grad_norm": 0.49472959374363185, + "learning_rate": 9.976036598348437e-06, + "loss": 0.0415, + "step": 1151 + }, + { + "epoch": 0.38470529303723494, + "grad_norm": 0.43376405843719706, + "learning_rate": 9.975846188843754e-06, + "loss": 0.0386, + "step": 1152 + }, + { + "epoch": 0.38503923860410755, + "grad_norm": 0.49450793525193587, + "learning_rate": 9.975655027678913e-06, + "loss": 0.052, + "step": 1153 + }, + { + "epoch": 0.38537318417098015, + "grad_norm": 0.4413392442318113, + "learning_rate": 9.975463114882792e-06, + "loss": 0.0403, + "step": 1154 + }, + { + "epoch": 0.3857071297378527, + "grad_norm": 0.6403828404233969, + "learning_rate": 9.975270450484385e-06, + "loss": 0.0477, + "step": 1155 + }, + { + "epoch": 0.3860410753047253, + "grad_norm": 0.545724273094895, + "learning_rate": 9.975077034512795e-06, + "loss": 0.0438, + "step": 1156 + }, + { + "epoch": 0.3863750208715979, + "grad_norm": 0.4567797017718175, + "learning_rate": 9.97488286699724e-06, + "loss": 0.0503, + "step": 1157 + }, + { + "epoch": 0.3867089664384705, + "grad_norm": 0.4477030075936158, + "learning_rate": 9.974687947967047e-06, + "loss": 0.0407, + "step": 1158 + }, + { + "epoch": 0.38704291200534313, + "grad_norm": 0.5691343454538987, + "learning_rate": 9.974492277451668e-06, + "loss": 0.0468, + "step": 1159 + }, + { + "epoch": 0.38737685757221574, + "grad_norm": 0.6618096964997607, + "learning_rate": 9.974295855480658e-06, + "loss": 0.0562, + "step": 1160 + }, + { + "epoch": 0.38771080313908834, + "grad_norm": 0.4112412707585751, + "learning_rate": 9.974098682083687e-06, + "loss": 0.0313, + "step": 1161 + }, + { + "epoch": 0.38804474870596095, + "grad_norm": 0.3845254377778595, + "learning_rate": 9.973900757290541e-06, + "loss": 0.0445, + "step": 1162 + }, + { + "epoch": 0.3883786942728335, + "grad_norm": 0.49392505730444436, + "learning_rate": 9.97370208113112e-06, + "loss": 0.0509, + "step": 1163 + }, + { + "epoch": 0.3887126398397061, + "grad_norm": 0.588413841355745, + "learning_rate": 9.973502653635438e-06, + "loss": 0.0579, + "step": 1164 + }, + { + "epoch": 0.3890465854065787, + "grad_norm": 0.6271328850335199, + "learning_rate": 9.97330247483362e-06, + "loss": 0.0561, + "step": 1165 + }, + { + "epoch": 0.3893805309734513, + "grad_norm": 0.3921815676693701, + "learning_rate": 9.973101544755901e-06, + "loss": 0.0418, + "step": 1166 + }, + { + "epoch": 0.38971447654032393, + "grad_norm": 0.6668285107357198, + "learning_rate": 9.97289986343264e-06, + "loss": 0.0515, + "step": 1167 + }, + { + "epoch": 0.39004842210719654, + "grad_norm": 0.5044214747666783, + "learning_rate": 9.972697430894299e-06, + "loss": 0.048, + "step": 1168 + }, + { + "epoch": 0.39038236767406914, + "grad_norm": 0.5810068915729227, + "learning_rate": 9.97249424717146e-06, + "loss": 0.0437, + "step": 1169 + }, + { + "epoch": 0.39071631324094175, + "grad_norm": 0.4942384218878549, + "learning_rate": 9.972290312294816e-06, + "loss": 0.056, + "step": 1170 + }, + { + "epoch": 0.3910502588078143, + "grad_norm": 0.6300510182254271, + "learning_rate": 9.972085626295173e-06, + "loss": 0.052, + "step": 1171 + }, + { + "epoch": 0.3913842043746869, + "grad_norm": 0.5135514249497448, + "learning_rate": 9.971880189203452e-06, + "loss": 0.0556, + "step": 1172 + }, + { + "epoch": 0.3917181499415595, + "grad_norm": 0.5381850673904547, + "learning_rate": 9.971674001050687e-06, + "loss": 0.0545, + "step": 1173 + }, + { + "epoch": 0.3920520955084321, + "grad_norm": 0.7459483271483872, + "learning_rate": 9.971467061868022e-06, + "loss": 0.0546, + "step": 1174 + }, + { + "epoch": 0.3923860410753047, + "grad_norm": 0.5703554311385124, + "learning_rate": 9.971259371686724e-06, + "loss": 0.0378, + "step": 1175 + }, + { + "epoch": 0.39271998664217733, + "grad_norm": 0.611429485228926, + "learning_rate": 9.971050930538161e-06, + "loss": 0.0647, + "step": 1176 + }, + { + "epoch": 0.39305393220904994, + "grad_norm": 0.6200371316956779, + "learning_rate": 9.970841738453823e-06, + "loss": 0.0522, + "step": 1177 + }, + { + "epoch": 0.39338787777592255, + "grad_norm": 0.5539338167498827, + "learning_rate": 9.970631795465311e-06, + "loss": 0.0466, + "step": 1178 + }, + { + "epoch": 0.39372182334279515, + "grad_norm": 0.372460342051298, + "learning_rate": 9.970421101604339e-06, + "loss": 0.0396, + "step": 1179 + }, + { + "epoch": 0.3940557689096677, + "grad_norm": 0.5819236200727373, + "learning_rate": 9.970209656902734e-06, + "loss": 0.053, + "step": 1180 + }, + { + "epoch": 0.3943897144765403, + "grad_norm": 0.43112048749562476, + "learning_rate": 9.969997461392439e-06, + "loss": 0.0443, + "step": 1181 + }, + { + "epoch": 0.3947236600434129, + "grad_norm": 0.6862293235082171, + "learning_rate": 9.969784515105508e-06, + "loss": 0.0693, + "step": 1182 + }, + { + "epoch": 0.3950576056102855, + "grad_norm": 0.46118889763849785, + "learning_rate": 9.969570818074109e-06, + "loss": 0.0333, + "step": 1183 + }, + { + "epoch": 0.39539155117715813, + "grad_norm": 0.7480464718455543, + "learning_rate": 9.96935637033052e-06, + "loss": 0.0418, + "step": 1184 + }, + { + "epoch": 0.39572549674403074, + "grad_norm": 0.6006469165707279, + "learning_rate": 9.969141171907142e-06, + "loss": 0.0418, + "step": 1185 + }, + { + "epoch": 0.39605944231090334, + "grad_norm": 0.5126146905146513, + "learning_rate": 9.968925222836478e-06, + "loss": 0.047, + "step": 1186 + }, + { + "epoch": 0.39639338787777595, + "grad_norm": 0.39110614660927157, + "learning_rate": 9.968708523151154e-06, + "loss": 0.0468, + "step": 1187 + }, + { + "epoch": 0.3967273334446485, + "grad_norm": 0.5815177714468796, + "learning_rate": 9.968491072883902e-06, + "loss": 0.0576, + "step": 1188 + }, + { + "epoch": 0.3970612790115211, + "grad_norm": 0.6559422926494999, + "learning_rate": 9.968272872067571e-06, + "loss": 0.0634, + "step": 1189 + }, + { + "epoch": 0.3973952245783937, + "grad_norm": 0.6095510416572145, + "learning_rate": 9.968053920735124e-06, + "loss": 0.0592, + "step": 1190 + }, + { + "epoch": 0.3977291701452663, + "grad_norm": 0.5355585919017009, + "learning_rate": 9.967834218919634e-06, + "loss": 0.0459, + "step": 1191 + }, + { + "epoch": 0.3980631157121389, + "grad_norm": 0.5607840872774007, + "learning_rate": 9.967613766654293e-06, + "loss": 0.0447, + "step": 1192 + }, + { + "epoch": 0.39839706127901153, + "grad_norm": 0.48251025480811577, + "learning_rate": 9.967392563972399e-06, + "loss": 0.0415, + "step": 1193 + }, + { + "epoch": 0.39873100684588414, + "grad_norm": 0.5809032518017457, + "learning_rate": 9.96717061090737e-06, + "loss": 0.0536, + "step": 1194 + }, + { + "epoch": 0.39906495241275675, + "grad_norm": 0.5545765346935037, + "learning_rate": 9.966947907492734e-06, + "loss": 0.0505, + "step": 1195 + }, + { + "epoch": 0.3993988979796293, + "grad_norm": 0.5487534111585209, + "learning_rate": 9.966724453762131e-06, + "loss": 0.0651, + "step": 1196 + }, + { + "epoch": 0.3997328435465019, + "grad_norm": 0.7918118540428859, + "learning_rate": 9.96650024974932e-06, + "loss": 0.0901, + "step": 1197 + }, + { + "epoch": 0.4000667891133745, + "grad_norm": 0.4377465320214387, + "learning_rate": 9.966275295488165e-06, + "loss": 0.0478, + "step": 1198 + }, + { + "epoch": 0.4004007346802471, + "grad_norm": 0.4363946236918287, + "learning_rate": 9.966049591012651e-06, + "loss": 0.0412, + "step": 1199 + }, + { + "epoch": 0.4007346802471197, + "grad_norm": 0.5645750603178316, + "learning_rate": 9.965823136356877e-06, + "loss": 0.058, + "step": 1200 + }, + { + "epoch": 0.40106862581399233, + "grad_norm": 0.6490952716726128, + "learning_rate": 9.965595931555043e-06, + "loss": 0.052, + "step": 1201 + }, + { + "epoch": 0.40140257138086494, + "grad_norm": 0.4281760314358244, + "learning_rate": 9.965367976641478e-06, + "loss": 0.0394, + "step": 1202 + }, + { + "epoch": 0.40173651694773754, + "grad_norm": 0.8412813438780575, + "learning_rate": 9.965139271650614e-06, + "loss": 0.0763, + "step": 1203 + }, + { + "epoch": 0.4020704625146101, + "grad_norm": 0.6074908680773805, + "learning_rate": 9.964909816617002e-06, + "loss": 0.0564, + "step": 1204 + }, + { + "epoch": 0.4024044080814827, + "grad_norm": 0.4679413848655991, + "learning_rate": 9.964679611575298e-06, + "loss": 0.0369, + "step": 1205 + }, + { + "epoch": 0.4027383536483553, + "grad_norm": 0.5847037533969296, + "learning_rate": 9.964448656560286e-06, + "loss": 0.039, + "step": 1206 + }, + { + "epoch": 0.4030722992152279, + "grad_norm": 0.3845921861967511, + "learning_rate": 9.964216951606848e-06, + "loss": 0.0336, + "step": 1207 + }, + { + "epoch": 0.4034062447821005, + "grad_norm": 0.41271257424344426, + "learning_rate": 9.963984496749988e-06, + "loss": 0.0414, + "step": 1208 + }, + { + "epoch": 0.4037401903489731, + "grad_norm": 0.7027487676934003, + "learning_rate": 9.96375129202482e-06, + "loss": 0.0687, + "step": 1209 + }, + { + "epoch": 0.40407413591584573, + "grad_norm": 0.41417343851954624, + "learning_rate": 9.963517337466575e-06, + "loss": 0.0357, + "step": 1210 + }, + { + "epoch": 0.40440808148271834, + "grad_norm": 0.4817301873683958, + "learning_rate": 9.963282633110591e-06, + "loss": 0.0419, + "step": 1211 + }, + { + "epoch": 0.4047420270495909, + "grad_norm": 0.3188002448155716, + "learning_rate": 9.963047178992324e-06, + "loss": 0.0276, + "step": 1212 + }, + { + "epoch": 0.4050759726164635, + "grad_norm": 0.5874020573242373, + "learning_rate": 9.962810975147344e-06, + "loss": 0.0626, + "step": 1213 + }, + { + "epoch": 0.4054099181833361, + "grad_norm": 0.6407406965589435, + "learning_rate": 9.96257402161133e-06, + "loss": 0.0419, + "step": 1214 + }, + { + "epoch": 0.4057438637502087, + "grad_norm": 0.5759204262478326, + "learning_rate": 9.962336318420078e-06, + "loss": 0.0454, + "step": 1215 + }, + { + "epoch": 0.4060778093170813, + "grad_norm": 0.5243793824473311, + "learning_rate": 9.962097865609495e-06, + "loss": 0.0387, + "step": 1216 + }, + { + "epoch": 0.4064117548839539, + "grad_norm": 0.6572551064096243, + "learning_rate": 9.961858663215604e-06, + "loss": 0.0543, + "step": 1217 + }, + { + "epoch": 0.40674570045082653, + "grad_norm": 0.6558388201939296, + "learning_rate": 9.961618711274537e-06, + "loss": 0.0583, + "step": 1218 + }, + { + "epoch": 0.40707964601769914, + "grad_norm": 0.5551867364009926, + "learning_rate": 9.961378009822542e-06, + "loss": 0.0363, + "step": 1219 + }, + { + "epoch": 0.4074135915845717, + "grad_norm": 0.44824260044365105, + "learning_rate": 9.961136558895981e-06, + "loss": 0.0405, + "step": 1220 + }, + { + "epoch": 0.4077475371514443, + "grad_norm": 0.526321946092579, + "learning_rate": 9.960894358531328e-06, + "loss": 0.0413, + "step": 1221 + }, + { + "epoch": 0.4080814827183169, + "grad_norm": 0.6622908968547422, + "learning_rate": 9.960651408765168e-06, + "loss": 0.0417, + "step": 1222 + }, + { + "epoch": 0.4084154282851895, + "grad_norm": 0.5746512951521658, + "learning_rate": 9.960407709634203e-06, + "loss": 0.057, + "step": 1223 + }, + { + "epoch": 0.4087493738520621, + "grad_norm": 0.9897729337912883, + "learning_rate": 9.960163261175247e-06, + "loss": 0.0434, + "step": 1224 + }, + { + "epoch": 0.4090833194189347, + "grad_norm": 0.36426890117105776, + "learning_rate": 9.959918063425228e-06, + "loss": 0.0361, + "step": 1225 + }, + { + "epoch": 0.4094172649858073, + "grad_norm": 0.5631602662914157, + "learning_rate": 9.959672116421181e-06, + "loss": 0.0445, + "step": 1226 + }, + { + "epoch": 0.40975121055267993, + "grad_norm": 0.5870494363349898, + "learning_rate": 9.959425420200267e-06, + "loss": 0.045, + "step": 1227 + }, + { + "epoch": 0.4100851561195525, + "grad_norm": 0.6056365184139995, + "learning_rate": 9.959177974799742e-06, + "loss": 0.0617, + "step": 1228 + }, + { + "epoch": 0.4104191016864251, + "grad_norm": 0.47243494475642694, + "learning_rate": 9.958929780256996e-06, + "loss": 0.0515, + "step": 1229 + }, + { + "epoch": 0.4107530472532977, + "grad_norm": 0.4965355083160451, + "learning_rate": 9.958680836609516e-06, + "loss": 0.0419, + "step": 1230 + }, + { + "epoch": 0.4110869928201703, + "grad_norm": 0.6125293052777985, + "learning_rate": 9.95843114389491e-06, + "loss": 0.0533, + "step": 1231 + }, + { + "epoch": 0.4114209383870429, + "grad_norm": 0.48382750319054785, + "learning_rate": 9.958180702150895e-06, + "loss": 0.0361, + "step": 1232 + }, + { + "epoch": 0.4117548839539155, + "grad_norm": 0.6457316899991283, + "learning_rate": 9.957929511415304e-06, + "loss": 0.0574, + "step": 1233 + }, + { + "epoch": 0.4120888295207881, + "grad_norm": 0.6018987285814226, + "learning_rate": 9.957677571726084e-06, + "loss": 0.0533, + "step": 1234 + }, + { + "epoch": 0.41242277508766073, + "grad_norm": 0.5787195430357516, + "learning_rate": 9.95742488312129e-06, + "loss": 0.0706, + "step": 1235 + }, + { + "epoch": 0.41275672065453334, + "grad_norm": 0.5810859648790506, + "learning_rate": 9.957171445639096e-06, + "loss": 0.0495, + "step": 1236 + }, + { + "epoch": 0.4130906662214059, + "grad_norm": 0.5792339241008343, + "learning_rate": 9.956917259317788e-06, + "loss": 0.0414, + "step": 1237 + }, + { + "epoch": 0.4134246117882785, + "grad_norm": 0.5015069588431272, + "learning_rate": 9.95666232419576e-06, + "loss": 0.0476, + "step": 1238 + }, + { + "epoch": 0.4137585573551511, + "grad_norm": 0.5523189073691389, + "learning_rate": 9.956406640311527e-06, + "loss": 0.054, + "step": 1239 + }, + { + "epoch": 0.4140925029220237, + "grad_norm": 0.5407896672488476, + "learning_rate": 9.956150207703712e-06, + "loss": 0.0373, + "step": 1240 + }, + { + "epoch": 0.4144264484888963, + "grad_norm": 0.5743250272298346, + "learning_rate": 9.955893026411048e-06, + "loss": 0.0532, + "step": 1241 + }, + { + "epoch": 0.4147603940557689, + "grad_norm": 0.46544361103029347, + "learning_rate": 9.955635096472391e-06, + "loss": 0.0333, + "step": 1242 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.8758747467998904, + "learning_rate": 9.9553764179267e-06, + "loss": 0.0558, + "step": 1243 + }, + { + "epoch": 0.41542828518951413, + "grad_norm": 0.5038013010376116, + "learning_rate": 9.955116990813056e-06, + "loss": 0.0591, + "step": 1244 + }, + { + "epoch": 0.4157622307563867, + "grad_norm": 0.479430959952089, + "learning_rate": 9.954856815170644e-06, + "loss": 0.0454, + "step": 1245 + }, + { + "epoch": 0.4160961763232593, + "grad_norm": 0.6542054204376075, + "learning_rate": 9.95459589103877e-06, + "loss": 0.047, + "step": 1246 + }, + { + "epoch": 0.4164301218901319, + "grad_norm": 0.45841216149214287, + "learning_rate": 9.954334218456846e-06, + "loss": 0.0371, + "step": 1247 + }, + { + "epoch": 0.4167640674570045, + "grad_norm": 0.8704556228734338, + "learning_rate": 9.954071797464405e-06, + "loss": 0.0624, + "step": 1248 + }, + { + "epoch": 0.4170980130238771, + "grad_norm": 0.5585633517077369, + "learning_rate": 9.953808628101086e-06, + "loss": 0.0507, + "step": 1249 + }, + { + "epoch": 0.4174319585907497, + "grad_norm": 0.5522660658752164, + "learning_rate": 9.953544710406646e-06, + "loss": 0.0482, + "step": 1250 + }, + { + "epoch": 0.4177659041576223, + "grad_norm": 0.48948221308855294, + "learning_rate": 9.95328004442095e-06, + "loss": 0.0445, + "step": 1251 + }, + { + "epoch": 0.41809984972449493, + "grad_norm": 0.4797459764133718, + "learning_rate": 9.953014630183979e-06, + "loss": 0.0377, + "step": 1252 + }, + { + "epoch": 0.4184337952913675, + "grad_norm": 0.7395642030505643, + "learning_rate": 9.95274846773583e-06, + "loss": 0.0542, + "step": 1253 + }, + { + "epoch": 0.4187677408582401, + "grad_norm": 0.337693760872802, + "learning_rate": 9.952481557116708e-06, + "loss": 0.0294, + "step": 1254 + }, + { + "epoch": 0.4191016864251127, + "grad_norm": 0.5120454944498827, + "learning_rate": 9.952213898366932e-06, + "loss": 0.0487, + "step": 1255 + }, + { + "epoch": 0.4194356319919853, + "grad_norm": 0.4787237800304835, + "learning_rate": 9.951945491526938e-06, + "loss": 0.039, + "step": 1256 + }, + { + "epoch": 0.4197695775588579, + "grad_norm": 0.5560558795953889, + "learning_rate": 9.951676336637267e-06, + "loss": 0.0446, + "step": 1257 + }, + { + "epoch": 0.4201035231257305, + "grad_norm": 0.47763341704211254, + "learning_rate": 9.951406433738587e-06, + "loss": 0.0452, + "step": 1258 + }, + { + "epoch": 0.4204374686926031, + "grad_norm": 0.6009184004401019, + "learning_rate": 9.95113578287166e-06, + "loss": 0.0525, + "step": 1259 + }, + { + "epoch": 0.42077141425947573, + "grad_norm": 0.9478592385559077, + "learning_rate": 9.950864384077376e-06, + "loss": 0.0528, + "step": 1260 + }, + { + "epoch": 0.4211053598263483, + "grad_norm": 0.6769818272741464, + "learning_rate": 9.950592237396732e-06, + "loss": 0.0483, + "step": 1261 + }, + { + "epoch": 0.4214393053932209, + "grad_norm": 0.3330372524755465, + "learning_rate": 9.95031934287084e-06, + "loss": 0.0251, + "step": 1262 + }, + { + "epoch": 0.4217732509600935, + "grad_norm": 0.5699833015601791, + "learning_rate": 9.950045700540923e-06, + "loss": 0.0481, + "step": 1263 + }, + { + "epoch": 0.4221071965269661, + "grad_norm": 0.43939228148604276, + "learning_rate": 9.949771310448317e-06, + "loss": 0.0441, + "step": 1264 + }, + { + "epoch": 0.4224411420938387, + "grad_norm": 0.39998322656970237, + "learning_rate": 9.949496172634474e-06, + "loss": 0.0338, + "step": 1265 + }, + { + "epoch": 0.4227750876607113, + "grad_norm": 0.5307284843623364, + "learning_rate": 9.949220287140955e-06, + "loss": 0.0526, + "step": 1266 + }, + { + "epoch": 0.4231090332275839, + "grad_norm": 0.403318394857699, + "learning_rate": 9.948943654009438e-06, + "loss": 0.051, + "step": 1267 + }, + { + "epoch": 0.4234429787944565, + "grad_norm": 0.44163504205551896, + "learning_rate": 9.948666273281708e-06, + "loss": 0.0457, + "step": 1268 + }, + { + "epoch": 0.4237769243613291, + "grad_norm": 0.4168383676719667, + "learning_rate": 9.94838814499967e-06, + "loss": 0.0487, + "step": 1269 + }, + { + "epoch": 0.4241108699282017, + "grad_norm": 0.37701006647801505, + "learning_rate": 9.948109269205338e-06, + "loss": 0.0347, + "step": 1270 + }, + { + "epoch": 0.4244448154950743, + "grad_norm": 0.6020931247605068, + "learning_rate": 9.947829645940836e-06, + "loss": 0.0491, + "step": 1271 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 0.41915818325977333, + "learning_rate": 9.94754927524841e-06, + "loss": 0.0422, + "step": 1272 + }, + { + "epoch": 0.4251127066288195, + "grad_norm": 0.6051942529439326, + "learning_rate": 9.947268157170409e-06, + "loss": 0.0535, + "step": 1273 + }, + { + "epoch": 0.4254466521956921, + "grad_norm": 0.38521647220535943, + "learning_rate": 9.9469862917493e-06, + "loss": 0.0378, + "step": 1274 + }, + { + "epoch": 0.4257805977625647, + "grad_norm": 0.355228311505307, + "learning_rate": 9.946703679027664e-06, + "loss": 0.0356, + "step": 1275 + }, + { + "epoch": 0.4261145433294373, + "grad_norm": 0.5270972733848863, + "learning_rate": 9.946420319048192e-06, + "loss": 0.0689, + "step": 1276 + }, + { + "epoch": 0.4264484888963099, + "grad_norm": 0.5112082801340879, + "learning_rate": 9.946136211853689e-06, + "loss": 0.0505, + "step": 1277 + }, + { + "epoch": 0.4267824344631825, + "grad_norm": 0.44718504260614705, + "learning_rate": 9.94585135748707e-06, + "loss": 0.0493, + "step": 1278 + }, + { + "epoch": 0.4271163800300551, + "grad_norm": 0.5699231475397143, + "learning_rate": 9.94556575599137e-06, + "loss": 0.0357, + "step": 1279 + }, + { + "epoch": 0.4274503255969277, + "grad_norm": 0.5365753163832601, + "learning_rate": 9.94527940740973e-06, + "loss": 0.0448, + "step": 1280 + }, + { + "epoch": 0.4277842711638003, + "grad_norm": 0.4735308826285214, + "learning_rate": 9.944992311785406e-06, + "loss": 0.0546, + "step": 1281 + }, + { + "epoch": 0.4281182167306729, + "grad_norm": 0.47380049162337246, + "learning_rate": 9.94470446916177e-06, + "loss": 0.0452, + "step": 1282 + }, + { + "epoch": 0.4284521622975455, + "grad_norm": 0.4644022381468032, + "learning_rate": 9.9444158795823e-06, + "loss": 0.046, + "step": 1283 + }, + { + "epoch": 0.4287861078644181, + "grad_norm": 0.44717115014964487, + "learning_rate": 9.944126543090593e-06, + "loss": 0.0416, + "step": 1284 + }, + { + "epoch": 0.4291200534312907, + "grad_norm": 0.4414360842309514, + "learning_rate": 9.943836459730356e-06, + "loss": 0.0419, + "step": 1285 + }, + { + "epoch": 0.4294539989981633, + "grad_norm": 1.0178687587182773, + "learning_rate": 9.943545629545412e-06, + "loss": 0.0663, + "step": 1286 + }, + { + "epoch": 0.4297879445650359, + "grad_norm": 0.46936757985831473, + "learning_rate": 9.94325405257969e-06, + "loss": 0.0463, + "step": 1287 + }, + { + "epoch": 0.4301218901319085, + "grad_norm": 0.4487040089691065, + "learning_rate": 9.94296172887724e-06, + "loss": 0.0447, + "step": 1288 + }, + { + "epoch": 0.4304558356987811, + "grad_norm": 0.41537669627976054, + "learning_rate": 9.942668658482219e-06, + "loss": 0.0348, + "step": 1289 + }, + { + "epoch": 0.4307897812656537, + "grad_norm": 0.5726595986047104, + "learning_rate": 9.942374841438898e-06, + "loss": 0.0617, + "step": 1290 + }, + { + "epoch": 0.4311237268325263, + "grad_norm": 0.43582524228831776, + "learning_rate": 9.942080277791663e-06, + "loss": 0.0357, + "step": 1291 + }, + { + "epoch": 0.4314576723993989, + "grad_norm": 0.5963138767833485, + "learning_rate": 9.941784967585012e-06, + "loss": 0.0531, + "step": 1292 + }, + { + "epoch": 0.4317916179662715, + "grad_norm": 0.36943889395441276, + "learning_rate": 9.941488910863553e-06, + "loss": 0.0288, + "step": 1293 + }, + { + "epoch": 0.4321255635331441, + "grad_norm": 0.37474135577933215, + "learning_rate": 9.941192107672011e-06, + "loss": 0.0347, + "step": 1294 + }, + { + "epoch": 0.4324595091000167, + "grad_norm": 0.4891570752361696, + "learning_rate": 9.940894558055218e-06, + "loss": 0.0432, + "step": 1295 + }, + { + "epoch": 0.4327934546668893, + "grad_norm": 0.5033309022266377, + "learning_rate": 9.940596262058128e-06, + "loss": 0.0494, + "step": 1296 + }, + { + "epoch": 0.4331274002337619, + "grad_norm": 0.4535306288903673, + "learning_rate": 9.940297219725797e-06, + "loss": 0.0431, + "step": 1297 + }, + { + "epoch": 0.4334613458006345, + "grad_norm": 0.42543847729786305, + "learning_rate": 9.939997431103402e-06, + "loss": 0.0466, + "step": 1298 + }, + { + "epoch": 0.4337952913675071, + "grad_norm": 0.5496540648197035, + "learning_rate": 9.939696896236229e-06, + "loss": 0.0512, + "step": 1299 + }, + { + "epoch": 0.4341292369343797, + "grad_norm": 0.5483031819247153, + "learning_rate": 9.939395615169673e-06, + "loss": 0.0424, + "step": 1300 + }, + { + "epoch": 0.4344631825012523, + "grad_norm": 0.514101464539269, + "learning_rate": 9.939093587949254e-06, + "loss": 0.056, + "step": 1301 + }, + { + "epoch": 0.43479712806812487, + "grad_norm": 0.5153592179996069, + "learning_rate": 9.938790814620591e-06, + "loss": 0.0452, + "step": 1302 + }, + { + "epoch": 0.4351310736349975, + "grad_norm": 0.8560536548611567, + "learning_rate": 9.938487295229423e-06, + "loss": 0.0647, + "step": 1303 + }, + { + "epoch": 0.4354650192018701, + "grad_norm": 0.45392421020879836, + "learning_rate": 9.9381830298216e-06, + "loss": 0.0449, + "step": 1304 + }, + { + "epoch": 0.4357989647687427, + "grad_norm": 0.6660398362359009, + "learning_rate": 9.937878018443085e-06, + "loss": 0.053, + "step": 1305 + }, + { + "epoch": 0.4361329103356153, + "grad_norm": 1.093738726263721, + "learning_rate": 9.937572261139956e-06, + "loss": 0.0404, + "step": 1306 + }, + { + "epoch": 0.4364668559024879, + "grad_norm": 0.8851395672879241, + "learning_rate": 9.937265757958397e-06, + "loss": 0.0753, + "step": 1307 + }, + { + "epoch": 0.4368008014693605, + "grad_norm": 0.4741734263702632, + "learning_rate": 9.93695850894471e-06, + "loss": 0.0386, + "step": 1308 + }, + { + "epoch": 0.4371347470362331, + "grad_norm": 0.5789187133288732, + "learning_rate": 9.93665051414531e-06, + "loss": 0.0484, + "step": 1309 + }, + { + "epoch": 0.43746869260310567, + "grad_norm": 0.6102715857348852, + "learning_rate": 9.936341773606723e-06, + "loss": 0.0443, + "step": 1310 + }, + { + "epoch": 0.4378026381699783, + "grad_norm": 0.783967201141356, + "learning_rate": 9.936032287375587e-06, + "loss": 0.0554, + "step": 1311 + }, + { + "epoch": 0.4381365837368509, + "grad_norm": 0.48743951221359044, + "learning_rate": 9.935722055498655e-06, + "loss": 0.0369, + "step": 1312 + }, + { + "epoch": 0.4384705293037235, + "grad_norm": 0.6692855376112433, + "learning_rate": 9.935411078022791e-06, + "loss": 0.0537, + "step": 1313 + }, + { + "epoch": 0.4388044748705961, + "grad_norm": 0.7812088505589483, + "learning_rate": 9.93509935499497e-06, + "loss": 0.0537, + "step": 1314 + }, + { + "epoch": 0.4391384204374687, + "grad_norm": 0.975296219611782, + "learning_rate": 9.934786886462282e-06, + "loss": 0.0412, + "step": 1315 + }, + { + "epoch": 0.4394723660043413, + "grad_norm": 0.509882945002344, + "learning_rate": 9.934473672471931e-06, + "loss": 0.0426, + "step": 1316 + }, + { + "epoch": 0.4398063115712139, + "grad_norm": 0.4494078479356775, + "learning_rate": 9.934159713071229e-06, + "loss": 0.029, + "step": 1317 + }, + { + "epoch": 0.44014025713808647, + "grad_norm": 0.49041183105866554, + "learning_rate": 9.933845008307605e-06, + "loss": 0.0462, + "step": 1318 + }, + { + "epoch": 0.4404742027049591, + "grad_norm": 0.46408390070537386, + "learning_rate": 9.933529558228599e-06, + "loss": 0.0363, + "step": 1319 + }, + { + "epoch": 0.4408081482718317, + "grad_norm": 0.5570170216019052, + "learning_rate": 9.933213362881861e-06, + "loss": 0.052, + "step": 1320 + }, + { + "epoch": 0.4411420938387043, + "grad_norm": 0.4219448033529627, + "learning_rate": 9.932896422315159e-06, + "loss": 0.0382, + "step": 1321 + }, + { + "epoch": 0.4414760394055769, + "grad_norm": 0.6070665233479298, + "learning_rate": 9.93257873657637e-06, + "loss": 0.0449, + "step": 1322 + }, + { + "epoch": 0.4418099849724495, + "grad_norm": 0.5361328280997671, + "learning_rate": 9.932260305713481e-06, + "loss": 0.0529, + "step": 1323 + }, + { + "epoch": 0.4421439305393221, + "grad_norm": 0.5227207215911478, + "learning_rate": 9.9319411297746e-06, + "loss": 0.0443, + "step": 1324 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 0.6159240333179103, + "learning_rate": 9.931621208807939e-06, + "loss": 0.0454, + "step": 1325 + }, + { + "epoch": 0.44281182167306726, + "grad_norm": 0.6801978834254696, + "learning_rate": 9.931300542861826e-06, + "loss": 0.0604, + "step": 1326 + }, + { + "epoch": 0.44314576723993987, + "grad_norm": 0.6733576208335831, + "learning_rate": 9.930979131984702e-06, + "loss": 0.0428, + "step": 1327 + }, + { + "epoch": 0.4434797128068125, + "grad_norm": 0.5131450335966199, + "learning_rate": 9.93065697622512e-06, + "loss": 0.0412, + "step": 1328 + }, + { + "epoch": 0.4438136583736851, + "grad_norm": 0.45628227470387467, + "learning_rate": 9.930334075631745e-06, + "loss": 0.0448, + "step": 1329 + }, + { + "epoch": 0.4441476039405577, + "grad_norm": 0.5023094091205198, + "learning_rate": 9.930010430253356e-06, + "loss": 0.0354, + "step": 1330 + }, + { + "epoch": 0.4444815495074303, + "grad_norm": 0.557464084839111, + "learning_rate": 9.92968604013884e-06, + "loss": 0.0535, + "step": 1331 + }, + { + "epoch": 0.4448154950743029, + "grad_norm": 0.7409360498813402, + "learning_rate": 9.929360905337204e-06, + "loss": 0.0448, + "step": 1332 + }, + { + "epoch": 0.4451494406411755, + "grad_norm": 0.399591445429372, + "learning_rate": 9.929035025897561e-06, + "loss": 0.035, + "step": 1333 + }, + { + "epoch": 0.4454833862080481, + "grad_norm": 0.6806929089635603, + "learning_rate": 9.928708401869143e-06, + "loss": 0.0647, + "step": 1334 + }, + { + "epoch": 0.44581733177492067, + "grad_norm": 1.0748118239786535, + "learning_rate": 9.928381033301284e-06, + "loss": 0.0543, + "step": 1335 + }, + { + "epoch": 0.4461512773417933, + "grad_norm": 0.8691112605217693, + "learning_rate": 9.928052920243443e-06, + "loss": 0.0631, + "step": 1336 + }, + { + "epoch": 0.4464852229086659, + "grad_norm": 0.6264993005302014, + "learning_rate": 9.927724062745179e-06, + "loss": 0.0616, + "step": 1337 + }, + { + "epoch": 0.4468191684755385, + "grad_norm": 0.5416140116082875, + "learning_rate": 9.927394460856174e-06, + "loss": 0.0505, + "step": 1338 + }, + { + "epoch": 0.4471531140424111, + "grad_norm": 0.6290176108125836, + "learning_rate": 9.92706411462622e-06, + "loss": 0.0459, + "step": 1339 + }, + { + "epoch": 0.4474870596092837, + "grad_norm": 0.5369142702628142, + "learning_rate": 9.926733024105216e-06, + "loss": 0.0419, + "step": 1340 + }, + { + "epoch": 0.4478210051761563, + "grad_norm": 0.49018142983174273, + "learning_rate": 9.926401189343177e-06, + "loss": 0.0465, + "step": 1341 + }, + { + "epoch": 0.4481549507430289, + "grad_norm": 0.401600930205156, + "learning_rate": 9.926068610390231e-06, + "loss": 0.0372, + "step": 1342 + }, + { + "epoch": 0.44848889630990146, + "grad_norm": 0.4088672091096697, + "learning_rate": 9.925735287296621e-06, + "loss": 0.04, + "step": 1343 + }, + { + "epoch": 0.44882284187677407, + "grad_norm": 0.5276126911354879, + "learning_rate": 9.925401220112698e-06, + "loss": 0.0373, + "step": 1344 + }, + { + "epoch": 0.4491567874436467, + "grad_norm": 0.5425732637602215, + "learning_rate": 9.925066408888924e-06, + "loss": 0.0501, + "step": 1345 + }, + { + "epoch": 0.4494907330105193, + "grad_norm": 0.4226487446607824, + "learning_rate": 9.92473085367588e-06, + "loss": 0.048, + "step": 1346 + }, + { + "epoch": 0.4498246785773919, + "grad_norm": 0.39943297643787967, + "learning_rate": 9.924394554524252e-06, + "loss": 0.0417, + "step": 1347 + }, + { + "epoch": 0.4501586241442645, + "grad_norm": 0.43913272334149567, + "learning_rate": 9.924057511484844e-06, + "loss": 0.0381, + "step": 1348 + }, + { + "epoch": 0.4504925697111371, + "grad_norm": 0.5049662393105034, + "learning_rate": 9.92371972460857e-06, + "loss": 0.0484, + "step": 1349 + }, + { + "epoch": 0.4508265152780097, + "grad_norm": 1.1050941555590548, + "learning_rate": 9.923381193946457e-06, + "loss": 0.0595, + "step": 1350 + }, + { + "epoch": 0.45116046084488226, + "grad_norm": 0.6439100674202259, + "learning_rate": 9.923041919549644e-06, + "loss": 0.0457, + "step": 1351 + }, + { + "epoch": 0.45149440641175487, + "grad_norm": 0.5177257397789261, + "learning_rate": 9.92270190146938e-06, + "loss": 0.0413, + "step": 1352 + }, + { + "epoch": 0.4518283519786275, + "grad_norm": 0.6170011843335443, + "learning_rate": 9.922361139757033e-06, + "loss": 0.0446, + "step": 1353 + }, + { + "epoch": 0.4521622975455001, + "grad_norm": 0.8446550194519187, + "learning_rate": 9.922019634464077e-06, + "loss": 0.0466, + "step": 1354 + }, + { + "epoch": 0.4524962431123727, + "grad_norm": 0.5393970052349853, + "learning_rate": 9.9216773856421e-06, + "loss": 0.0406, + "step": 1355 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.36930192855971267, + "learning_rate": 9.921334393342803e-06, + "loss": 0.033, + "step": 1356 + }, + { + "epoch": 0.4531641342461179, + "grad_norm": 0.6997590413387955, + "learning_rate": 9.920990657617998e-06, + "loss": 0.0603, + "step": 1357 + }, + { + "epoch": 0.4534980798129905, + "grad_norm": 0.41790934970038635, + "learning_rate": 9.920646178519612e-06, + "loss": 0.0333, + "step": 1358 + }, + { + "epoch": 0.45383202537986306, + "grad_norm": 0.43059296536579117, + "learning_rate": 9.920300956099682e-06, + "loss": 0.0472, + "step": 1359 + }, + { + "epoch": 0.45416597094673566, + "grad_norm": 0.34400095690252225, + "learning_rate": 9.919954990410359e-06, + "loss": 0.0426, + "step": 1360 + }, + { + "epoch": 0.45449991651360827, + "grad_norm": 0.40050739871675406, + "learning_rate": 9.919608281503903e-06, + "loss": 0.0344, + "step": 1361 + }, + { + "epoch": 0.4548338620804809, + "grad_norm": 0.44401533509970764, + "learning_rate": 9.91926082943269e-06, + "loss": 0.0501, + "step": 1362 + }, + { + "epoch": 0.4551678076473535, + "grad_norm": 0.5826813299787759, + "learning_rate": 9.918912634249206e-06, + "loss": 0.0443, + "step": 1363 + }, + { + "epoch": 0.4555017532142261, + "grad_norm": 0.4915573623962292, + "learning_rate": 9.91856369600605e-06, + "loss": 0.0365, + "step": 1364 + }, + { + "epoch": 0.4558356987810987, + "grad_norm": 0.6009700282747781, + "learning_rate": 9.918214014755935e-06, + "loss": 0.0504, + "step": 1365 + }, + { + "epoch": 0.4561696443479713, + "grad_norm": 0.5468155746254137, + "learning_rate": 9.917863590551682e-06, + "loss": 0.0361, + "step": 1366 + }, + { + "epoch": 0.45650358991484385, + "grad_norm": 0.5516763083838638, + "learning_rate": 9.917512423446226e-06, + "loss": 0.0409, + "step": 1367 + }, + { + "epoch": 0.45683753548171646, + "grad_norm": 0.4891058902403625, + "learning_rate": 9.917160513492619e-06, + "loss": 0.0416, + "step": 1368 + }, + { + "epoch": 0.45717148104858907, + "grad_norm": 0.41191878216869804, + "learning_rate": 9.916807860744017e-06, + "loss": 0.0501, + "step": 1369 + }, + { + "epoch": 0.4575054266154617, + "grad_norm": 0.5190297522711743, + "learning_rate": 9.916454465253695e-06, + "loss": 0.049, + "step": 1370 + }, + { + "epoch": 0.4578393721823343, + "grad_norm": 0.36002436206656735, + "learning_rate": 9.916100327075038e-06, + "loss": 0.0469, + "step": 1371 + }, + { + "epoch": 0.4581733177492069, + "grad_norm": 0.45972030730800345, + "learning_rate": 9.91574544626154e-06, + "loss": 0.037, + "step": 1372 + }, + { + "epoch": 0.4585072633160795, + "grad_norm": 0.5769486944913123, + "learning_rate": 9.915389822866811e-06, + "loss": 0.0301, + "step": 1373 + }, + { + "epoch": 0.4588412088829521, + "grad_norm": 0.36635983513397724, + "learning_rate": 9.915033456944572e-06, + "loss": 0.0401, + "step": 1374 + }, + { + "epoch": 0.45917515444982465, + "grad_norm": 0.3934936851188898, + "learning_rate": 9.914676348548658e-06, + "loss": 0.0321, + "step": 1375 + }, + { + "epoch": 0.45950910001669726, + "grad_norm": 0.4123363366511699, + "learning_rate": 9.914318497733013e-06, + "loss": 0.0342, + "step": 1376 + }, + { + "epoch": 0.45984304558356986, + "grad_norm": 0.5779376661465719, + "learning_rate": 9.913959904551695e-06, + "loss": 0.0469, + "step": 1377 + }, + { + "epoch": 0.46017699115044247, + "grad_norm": 0.4974245654516736, + "learning_rate": 9.913600569058871e-06, + "loss": 0.0495, + "step": 1378 + }, + { + "epoch": 0.4605109367173151, + "grad_norm": 0.481123185282172, + "learning_rate": 9.913240491308828e-06, + "loss": 0.0335, + "step": 1379 + }, + { + "epoch": 0.4608448822841877, + "grad_norm": 0.47405440925593645, + "learning_rate": 9.912879671355956e-06, + "loss": 0.0376, + "step": 1380 + }, + { + "epoch": 0.4611788278510603, + "grad_norm": 0.3698815548610607, + "learning_rate": 9.912518109254763e-06, + "loss": 0.0249, + "step": 1381 + }, + { + "epoch": 0.4615127734179329, + "grad_norm": 0.5249294778729459, + "learning_rate": 9.912155805059866e-06, + "loss": 0.0445, + "step": 1382 + }, + { + "epoch": 0.4618467189848055, + "grad_norm": 0.6781920087763629, + "learning_rate": 9.911792758825996e-06, + "loss": 0.0489, + "step": 1383 + }, + { + "epoch": 0.46218066455167806, + "grad_norm": 0.5889363661388466, + "learning_rate": 9.911428970607995e-06, + "loss": 0.0505, + "step": 1384 + }, + { + "epoch": 0.46251461011855066, + "grad_norm": 0.7397372770295463, + "learning_rate": 9.911064440460818e-06, + "loss": 0.0443, + "step": 1385 + }, + { + "epoch": 0.46284855568542327, + "grad_norm": 0.48219568773534327, + "learning_rate": 9.91069916843953e-06, + "loss": 0.0377, + "step": 1386 + }, + { + "epoch": 0.4631825012522959, + "grad_norm": 0.5418126046127661, + "learning_rate": 9.910333154599314e-06, + "loss": 0.0462, + "step": 1387 + }, + { + "epoch": 0.4635164468191685, + "grad_norm": 0.45297834065714765, + "learning_rate": 9.909966398995456e-06, + "loss": 0.0313, + "step": 1388 + }, + { + "epoch": 0.4638503923860411, + "grad_norm": 0.6679683885195675, + "learning_rate": 9.909598901683361e-06, + "loss": 0.0543, + "step": 1389 + }, + { + "epoch": 0.4641843379529137, + "grad_norm": 0.5437056275819844, + "learning_rate": 9.909230662718543e-06, + "loss": 0.057, + "step": 1390 + }, + { + "epoch": 0.4645182835197863, + "grad_norm": 0.5834133195251893, + "learning_rate": 9.908861682156628e-06, + "loss": 0.0495, + "step": 1391 + }, + { + "epoch": 0.46485222908665885, + "grad_norm": 0.6343676710620773, + "learning_rate": 9.908491960053357e-06, + "loss": 0.0592, + "step": 1392 + }, + { + "epoch": 0.46518617465353146, + "grad_norm": 0.4219191146923237, + "learning_rate": 9.90812149646458e-06, + "loss": 0.0423, + "step": 1393 + }, + { + "epoch": 0.46552012022040407, + "grad_norm": 0.4053675842863338, + "learning_rate": 9.907750291446258e-06, + "loss": 0.0329, + "step": 1394 + }, + { + "epoch": 0.46585406578727667, + "grad_norm": 0.4003154328908937, + "learning_rate": 9.907378345054471e-06, + "loss": 0.0315, + "step": 1395 + }, + { + "epoch": 0.4661880113541493, + "grad_norm": 0.43633656747345956, + "learning_rate": 9.9070056573454e-06, + "loss": 0.0336, + "step": 1396 + }, + { + "epoch": 0.4665219569210219, + "grad_norm": 0.7441691070141674, + "learning_rate": 9.906632228375346e-06, + "loss": 0.057, + "step": 1397 + }, + { + "epoch": 0.4668559024878945, + "grad_norm": 0.44281489718854117, + "learning_rate": 9.906258058200722e-06, + "loss": 0.03, + "step": 1398 + }, + { + "epoch": 0.4671898480547671, + "grad_norm": 0.5879656363220077, + "learning_rate": 9.905883146878049e-06, + "loss": 0.0527, + "step": 1399 + }, + { + "epoch": 0.46752379362163965, + "grad_norm": 0.6613327027917685, + "learning_rate": 9.90550749446396e-06, + "loss": 0.0793, + "step": 1400 + }, + { + "epoch": 0.46785773918851226, + "grad_norm": 0.45880391936667725, + "learning_rate": 9.905131101015204e-06, + "loss": 0.0427, + "step": 1401 + }, + { + "epoch": 0.46819168475538486, + "grad_norm": 0.6543858751031911, + "learning_rate": 9.904753966588638e-06, + "loss": 0.0517, + "step": 1402 + }, + { + "epoch": 0.46852563032225747, + "grad_norm": 0.4481287511626373, + "learning_rate": 9.904376091241236e-06, + "loss": 0.0506, + "step": 1403 + }, + { + "epoch": 0.4688595758891301, + "grad_norm": 0.33458825171140666, + "learning_rate": 9.903997475030077e-06, + "loss": 0.0299, + "step": 1404 + }, + { + "epoch": 0.4691935214560027, + "grad_norm": 0.4958866313971678, + "learning_rate": 9.903618118012358e-06, + "loss": 0.0429, + "step": 1405 + }, + { + "epoch": 0.4695274670228753, + "grad_norm": 0.3971419975851642, + "learning_rate": 9.903238020245383e-06, + "loss": 0.0371, + "step": 1406 + }, + { + "epoch": 0.4698614125897479, + "grad_norm": 0.6540638399343981, + "learning_rate": 9.902857181786571e-06, + "loss": 0.0449, + "step": 1407 + }, + { + "epoch": 0.47019535815662045, + "grad_norm": 0.4322169481213042, + "learning_rate": 9.902475602693451e-06, + "loss": 0.032, + "step": 1408 + }, + { + "epoch": 0.47052930372349305, + "grad_norm": 0.3458875024135452, + "learning_rate": 9.90209328302367e-06, + "loss": 0.0323, + "step": 1409 + }, + { + "epoch": 0.47086324929036566, + "grad_norm": 0.422986124801938, + "learning_rate": 9.901710222834976e-06, + "loss": 0.0398, + "step": 1410 + }, + { + "epoch": 0.47119719485723827, + "grad_norm": 0.43510914715380106, + "learning_rate": 9.901326422185238e-06, + "loss": 0.0335, + "step": 1411 + }, + { + "epoch": 0.4715311404241109, + "grad_norm": 0.5283980317667754, + "learning_rate": 9.900941881132431e-06, + "loss": 0.0448, + "step": 1412 + }, + { + "epoch": 0.4718650859909835, + "grad_norm": 1.1088829014613777, + "learning_rate": 9.900556599734647e-06, + "loss": 0.0547, + "step": 1413 + }, + { + "epoch": 0.4721990315578561, + "grad_norm": 0.5843202080905576, + "learning_rate": 9.900170578050088e-06, + "loss": 0.0753, + "step": 1414 + }, + { + "epoch": 0.4725329771247287, + "grad_norm": 0.4905756109371659, + "learning_rate": 9.899783816137065e-06, + "loss": 0.0414, + "step": 1415 + }, + { + "epoch": 0.47286692269160124, + "grad_norm": 0.7559325911936297, + "learning_rate": 9.899396314054002e-06, + "loss": 0.0446, + "step": 1416 + }, + { + "epoch": 0.47320086825847385, + "grad_norm": 0.3388909843775115, + "learning_rate": 9.89900807185944e-06, + "loss": 0.028, + "step": 1417 + }, + { + "epoch": 0.47353481382534646, + "grad_norm": 0.46595584543428215, + "learning_rate": 9.89861908961202e-06, + "loss": 0.0498, + "step": 1418 + }, + { + "epoch": 0.47386875939221906, + "grad_norm": 0.9783336902808817, + "learning_rate": 9.89822936737051e-06, + "loss": 0.0614, + "step": 1419 + }, + { + "epoch": 0.47420270495909167, + "grad_norm": 0.5073491753338615, + "learning_rate": 9.897838905193781e-06, + "loss": 0.0345, + "step": 1420 + }, + { + "epoch": 0.4745366505259643, + "grad_norm": 0.7037993960648413, + "learning_rate": 9.897447703140813e-06, + "loss": 0.0512, + "step": 1421 + }, + { + "epoch": 0.4748705960928369, + "grad_norm": 0.5437711089780456, + "learning_rate": 9.897055761270705e-06, + "loss": 0.0386, + "step": 1422 + }, + { + "epoch": 0.4752045416597095, + "grad_norm": 0.5148402217137021, + "learning_rate": 9.896663079642663e-06, + "loss": 0.0529, + "step": 1423 + }, + { + "epoch": 0.47553848722658204, + "grad_norm": 0.579826595234891, + "learning_rate": 9.896269658316006e-06, + "loss": 0.0498, + "step": 1424 + }, + { + "epoch": 0.47587243279345465, + "grad_norm": 0.46060422264727224, + "learning_rate": 9.895875497350165e-06, + "loss": 0.0344, + "step": 1425 + }, + { + "epoch": 0.47620637836032725, + "grad_norm": 0.5779317629395826, + "learning_rate": 9.895480596804684e-06, + "loss": 0.0341, + "step": 1426 + }, + { + "epoch": 0.47654032392719986, + "grad_norm": 0.6251142827066727, + "learning_rate": 9.895084956739215e-06, + "loss": 0.044, + "step": 1427 + }, + { + "epoch": 0.47687426949407247, + "grad_norm": 0.5070877530779723, + "learning_rate": 9.894688577213527e-06, + "loss": 0.044, + "step": 1428 + }, + { + "epoch": 0.4772082150609451, + "grad_norm": 0.5413308077354688, + "learning_rate": 9.894291458287496e-06, + "loss": 0.0473, + "step": 1429 + }, + { + "epoch": 0.4775421606278177, + "grad_norm": 0.536917214023776, + "learning_rate": 9.893893600021112e-06, + "loss": 0.0447, + "step": 1430 + }, + { + "epoch": 0.4778761061946903, + "grad_norm": 0.5093039804216097, + "learning_rate": 9.893495002474475e-06, + "loss": 0.0417, + "step": 1431 + }, + { + "epoch": 0.47821005176156284, + "grad_norm": 0.5138668195571756, + "learning_rate": 9.893095665707801e-06, + "loss": 0.0427, + "step": 1432 + }, + { + "epoch": 0.47854399732843544, + "grad_norm": 0.4319009777536361, + "learning_rate": 9.89269558978141e-06, + "loss": 0.0356, + "step": 1433 + }, + { + "epoch": 0.47887794289530805, + "grad_norm": 0.9396179730801536, + "learning_rate": 9.892294774755741e-06, + "loss": 0.0696, + "step": 1434 + }, + { + "epoch": 0.47921188846218066, + "grad_norm": 0.8768865613461034, + "learning_rate": 9.891893220691343e-06, + "loss": 0.0496, + "step": 1435 + }, + { + "epoch": 0.47954583402905326, + "grad_norm": 0.5639812000975869, + "learning_rate": 9.891490927648872e-06, + "loss": 0.0428, + "step": 1436 + }, + { + "epoch": 0.47987977959592587, + "grad_norm": 0.706065626441353, + "learning_rate": 9.891087895689102e-06, + "loss": 0.0482, + "step": 1437 + }, + { + "epoch": 0.4802137251627985, + "grad_norm": 0.7787287102104395, + "learning_rate": 9.890684124872914e-06, + "loss": 0.0482, + "step": 1438 + }, + { + "epoch": 0.4805476707296711, + "grad_norm": 0.6289694427407534, + "learning_rate": 9.890279615261302e-06, + "loss": 0.0425, + "step": 1439 + }, + { + "epoch": 0.4808816162965437, + "grad_norm": 0.7495730884069466, + "learning_rate": 9.889874366915374e-06, + "loss": 0.053, + "step": 1440 + }, + { + "epoch": 0.48121556186341624, + "grad_norm": 0.6670551951706001, + "learning_rate": 9.889468379896347e-06, + "loss": 0.0537, + "step": 1441 + }, + { + "epoch": 0.48154950743028885, + "grad_norm": 0.4662113102746823, + "learning_rate": 9.88906165426555e-06, + "loss": 0.0533, + "step": 1442 + }, + { + "epoch": 0.48188345299716145, + "grad_norm": 0.3709485011214875, + "learning_rate": 9.888654190084422e-06, + "loss": 0.0342, + "step": 1443 + }, + { + "epoch": 0.48221739856403406, + "grad_norm": 0.6270254124222672, + "learning_rate": 9.888245987414517e-06, + "loss": 0.0481, + "step": 1444 + }, + { + "epoch": 0.48255134413090667, + "grad_norm": 0.5476949184551392, + "learning_rate": 9.8878370463175e-06, + "loss": 0.0523, + "step": 1445 + }, + { + "epoch": 0.4828852896977793, + "grad_norm": 0.40312540701883587, + "learning_rate": 9.887427366855142e-06, + "loss": 0.032, + "step": 1446 + }, + { + "epoch": 0.4832192352646519, + "grad_norm": 0.39758087626866595, + "learning_rate": 9.887016949089334e-06, + "loss": 0.0425, + "step": 1447 + }, + { + "epoch": 0.4835531808315245, + "grad_norm": 0.33656048927842874, + "learning_rate": 9.886605793082073e-06, + "loss": 0.0405, + "step": 1448 + }, + { + "epoch": 0.48388712639839704, + "grad_norm": 0.5003352475198972, + "learning_rate": 9.886193898895468e-06, + "loss": 0.0419, + "step": 1449 + }, + { + "epoch": 0.48422107196526964, + "grad_norm": 1.1045291501772663, + "learning_rate": 9.885781266591742e-06, + "loss": 0.0525, + "step": 1450 + }, + { + "epoch": 0.48455501753214225, + "grad_norm": 0.5375756063813563, + "learning_rate": 9.885367896233229e-06, + "loss": 0.0547, + "step": 1451 + }, + { + "epoch": 0.48488896309901486, + "grad_norm": 0.47135224008762544, + "learning_rate": 9.88495378788237e-06, + "loss": 0.0464, + "step": 1452 + }, + { + "epoch": 0.48522290866588746, + "grad_norm": 0.47155647032558995, + "learning_rate": 9.884538941601725e-06, + "loss": 0.0397, + "step": 1453 + }, + { + "epoch": 0.48555685423276007, + "grad_norm": 0.5062134825081213, + "learning_rate": 9.884123357453959e-06, + "loss": 0.045, + "step": 1454 + }, + { + "epoch": 0.4858907997996327, + "grad_norm": 0.4250903593753984, + "learning_rate": 9.883707035501849e-06, + "loss": 0.0382, + "step": 1455 + }, + { + "epoch": 0.4862247453665053, + "grad_norm": 0.5303503912675623, + "learning_rate": 9.883289975808288e-06, + "loss": 0.0474, + "step": 1456 + }, + { + "epoch": 0.48655869093337784, + "grad_norm": 0.5303299665051235, + "learning_rate": 9.882872178436277e-06, + "loss": 0.0421, + "step": 1457 + }, + { + "epoch": 0.48689263650025044, + "grad_norm": 0.7095470003827418, + "learning_rate": 9.882453643448933e-06, + "loss": 0.0622, + "step": 1458 + }, + { + "epoch": 0.48722658206712305, + "grad_norm": 0.6654969976908746, + "learning_rate": 9.882034370909474e-06, + "loss": 0.0604, + "step": 1459 + }, + { + "epoch": 0.48756052763399566, + "grad_norm": 0.46621656384269283, + "learning_rate": 9.88161436088124e-06, + "loss": 0.0438, + "step": 1460 + }, + { + "epoch": 0.48789447320086826, + "grad_norm": 0.4903801406467757, + "learning_rate": 9.881193613427676e-06, + "loss": 0.0431, + "step": 1461 + }, + { + "epoch": 0.48822841876774087, + "grad_norm": 0.4594116756992676, + "learning_rate": 9.880772128612345e-06, + "loss": 0.0386, + "step": 1462 + }, + { + "epoch": 0.4885623643346135, + "grad_norm": 0.5367656181476723, + "learning_rate": 9.880349906498914e-06, + "loss": 0.049, + "step": 1463 + }, + { + "epoch": 0.4888963099014861, + "grad_norm": 0.7229467490956691, + "learning_rate": 9.879926947151164e-06, + "loss": 0.0521, + "step": 1464 + }, + { + "epoch": 0.48923025546835863, + "grad_norm": 0.3537344850947976, + "learning_rate": 9.879503250632991e-06, + "loss": 0.0285, + "step": 1465 + }, + { + "epoch": 0.48956420103523124, + "grad_norm": 0.6767204938026421, + "learning_rate": 9.879078817008395e-06, + "loss": 0.0474, + "step": 1466 + }, + { + "epoch": 0.48989814660210385, + "grad_norm": 0.4013562222465895, + "learning_rate": 9.878653646341498e-06, + "loss": 0.0402, + "step": 1467 + }, + { + "epoch": 0.49023209216897645, + "grad_norm": 0.5299310670346635, + "learning_rate": 9.878227738696522e-06, + "loss": 0.0388, + "step": 1468 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.3553114238589389, + "learning_rate": 9.877801094137807e-06, + "loss": 0.0284, + "step": 1469 + }, + { + "epoch": 0.49089998330272167, + "grad_norm": 0.5492910503107569, + "learning_rate": 9.877373712729803e-06, + "loss": 0.0543, + "step": 1470 + }, + { + "epoch": 0.49123392886959427, + "grad_norm": 0.5645202038197307, + "learning_rate": 9.876945594537069e-06, + "loss": 0.0451, + "step": 1471 + }, + { + "epoch": 0.4915678744364669, + "grad_norm": 0.495824832203648, + "learning_rate": 9.876516739624279e-06, + "loss": 0.0433, + "step": 1472 + }, + { + "epoch": 0.49190182000333943, + "grad_norm": 0.7343312967983936, + "learning_rate": 9.876087148056217e-06, + "loss": 0.0451, + "step": 1473 + }, + { + "epoch": 0.49223576557021204, + "grad_norm": 0.5363624356990242, + "learning_rate": 9.875656819897776e-06, + "loss": 0.0425, + "step": 1474 + }, + { + "epoch": 0.49256971113708464, + "grad_norm": 0.509346221989081, + "learning_rate": 9.875225755213966e-06, + "loss": 0.0387, + "step": 1475 + }, + { + "epoch": 0.49290365670395725, + "grad_norm": 0.5782935647682491, + "learning_rate": 9.874793954069899e-06, + "loss": 0.05, + "step": 1476 + }, + { + "epoch": 0.49323760227082986, + "grad_norm": 0.36314841770955747, + "learning_rate": 9.874361416530808e-06, + "loss": 0.0354, + "step": 1477 + }, + { + "epoch": 0.49357154783770246, + "grad_norm": 0.4924488268107228, + "learning_rate": 9.873928142662031e-06, + "loss": 0.0472, + "step": 1478 + }, + { + "epoch": 0.49390549340457507, + "grad_norm": 0.4461045098551525, + "learning_rate": 9.873494132529018e-06, + "loss": 0.0256, + "step": 1479 + }, + { + "epoch": 0.4942394389714477, + "grad_norm": 0.5530793139483993, + "learning_rate": 9.873059386197335e-06, + "loss": 0.0598, + "step": 1480 + }, + { + "epoch": 0.4945733845383202, + "grad_norm": 0.5520821800980065, + "learning_rate": 9.872623903732652e-06, + "loss": 0.0459, + "step": 1481 + }, + { + "epoch": 0.49490733010519283, + "grad_norm": 0.3485202877204217, + "learning_rate": 9.872187685200756e-06, + "loss": 0.0312, + "step": 1482 + }, + { + "epoch": 0.49524127567206544, + "grad_norm": 0.6098012581793566, + "learning_rate": 9.87175073066754e-06, + "loss": 0.0541, + "step": 1483 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 0.4293178205870001, + "learning_rate": 9.871313040199015e-06, + "loss": 0.0478, + "step": 1484 + }, + { + "epoch": 0.49590916680581065, + "grad_norm": 0.3906292288259583, + "learning_rate": 9.870874613861297e-06, + "loss": 0.0382, + "step": 1485 + }, + { + "epoch": 0.49624311237268326, + "grad_norm": 0.4709217542922022, + "learning_rate": 9.870435451720614e-06, + "loss": 0.043, + "step": 1486 + }, + { + "epoch": 0.49657705793955587, + "grad_norm": 0.28737543979668984, + "learning_rate": 9.869995553843313e-06, + "loss": 0.0342, + "step": 1487 + }, + { + "epoch": 0.4969110035064285, + "grad_norm": 0.43962448048905367, + "learning_rate": 9.869554920295836e-06, + "loss": 0.0423, + "step": 1488 + }, + { + "epoch": 0.4972449490733011, + "grad_norm": 0.4467152430981485, + "learning_rate": 9.869113551144754e-06, + "loss": 0.0322, + "step": 1489 + }, + { + "epoch": 0.49757889464017363, + "grad_norm": 0.5938591642903027, + "learning_rate": 9.86867144645674e-06, + "loss": 0.0609, + "step": 1490 + }, + { + "epoch": 0.49791284020704624, + "grad_norm": 0.4091995526314113, + "learning_rate": 9.868228606298574e-06, + "loss": 0.0478, + "step": 1491 + }, + { + "epoch": 0.49824678577391884, + "grad_norm": 0.47491406195892255, + "learning_rate": 9.867785030737157e-06, + "loss": 0.0492, + "step": 1492 + }, + { + "epoch": 0.49858073134079145, + "grad_norm": 0.530286677828167, + "learning_rate": 9.867340719839494e-06, + "loss": 0.0529, + "step": 1493 + }, + { + "epoch": 0.49891467690766406, + "grad_norm": 0.9102296526082475, + "learning_rate": 9.866895673672704e-06, + "loss": 0.0534, + "step": 1494 + }, + { + "epoch": 0.49924862247453666, + "grad_norm": 0.4266869255406315, + "learning_rate": 9.866449892304017e-06, + "loss": 0.0412, + "step": 1495 + }, + { + "epoch": 0.49958256804140927, + "grad_norm": 0.4543185437825696, + "learning_rate": 9.866003375800773e-06, + "loss": 0.0477, + "step": 1496 + }, + { + "epoch": 0.4999165136082819, + "grad_norm": 0.36897957679861176, + "learning_rate": 9.865556124230425e-06, + "loss": 0.0423, + "step": 1497 + }, + { + "epoch": 0.5002504591751544, + "grad_norm": 0.4345258721100873, + "learning_rate": 9.865108137660533e-06, + "loss": 0.0377, + "step": 1498 + }, + { + "epoch": 0.500584404742027, + "grad_norm": 0.421072416575119, + "learning_rate": 9.864659416158773e-06, + "loss": 0.0397, + "step": 1499 + }, + { + "epoch": 0.5009183503088996, + "grad_norm": 0.40080400903218444, + "learning_rate": 9.864209959792927e-06, + "loss": 0.0428, + "step": 1500 + }, + { + "epoch": 0.5012522958757722, + "grad_norm": 0.7430605391213705, + "learning_rate": 9.863759768630893e-06, + "loss": 0.0556, + "step": 1501 + }, + { + "epoch": 0.5015862414426449, + "grad_norm": 0.4353906314679589, + "learning_rate": 9.863308842740678e-06, + "loss": 0.0467, + "step": 1502 + }, + { + "epoch": 0.5019201870095175, + "grad_norm": 0.42564048436155383, + "learning_rate": 9.862857182190398e-06, + "loss": 0.0445, + "step": 1503 + }, + { + "epoch": 0.5022541325763901, + "grad_norm": 0.37197214507407245, + "learning_rate": 9.862404787048283e-06, + "loss": 0.036, + "step": 1504 + }, + { + "epoch": 0.5025880781432627, + "grad_norm": 0.46236612219055323, + "learning_rate": 9.861951657382671e-06, + "loss": 0.0366, + "step": 1505 + }, + { + "epoch": 0.5029220237101353, + "grad_norm": 0.3598684883757102, + "learning_rate": 9.861497793262014e-06, + "loss": 0.0437, + "step": 1506 + }, + { + "epoch": 0.5032559692770079, + "grad_norm": 0.41849390499484806, + "learning_rate": 9.861043194754874e-06, + "loss": 0.047, + "step": 1507 + }, + { + "epoch": 0.5035899148438805, + "grad_norm": 0.36415995653183014, + "learning_rate": 9.860587861929922e-06, + "loss": 0.0295, + "step": 1508 + }, + { + "epoch": 0.5039238604107531, + "grad_norm": 0.3787109451952684, + "learning_rate": 9.86013179485594e-06, + "loss": 0.0431, + "step": 1509 + }, + { + "epoch": 0.5042578059776256, + "grad_norm": 0.32757238901458857, + "learning_rate": 9.859674993601826e-06, + "loss": 0.0354, + "step": 1510 + }, + { + "epoch": 0.5045917515444982, + "grad_norm": 0.58017880122697, + "learning_rate": 9.859217458236583e-06, + "loss": 0.0398, + "step": 1511 + }, + { + "epoch": 0.5049256971113708, + "grad_norm": 0.40980697132132676, + "learning_rate": 9.858759188829328e-06, + "loss": 0.0459, + "step": 1512 + }, + { + "epoch": 0.5052596426782434, + "grad_norm": 0.5442983985798375, + "learning_rate": 9.858300185449287e-06, + "loss": 0.0446, + "step": 1513 + }, + { + "epoch": 0.505593588245116, + "grad_norm": 0.4057797385047999, + "learning_rate": 9.857840448165798e-06, + "loss": 0.0463, + "step": 1514 + }, + { + "epoch": 0.5059275338119886, + "grad_norm": 0.3928975349261946, + "learning_rate": 9.857379977048311e-06, + "loss": 0.0323, + "step": 1515 + }, + { + "epoch": 0.5062614793788612, + "grad_norm": 0.3541880239496388, + "learning_rate": 9.856918772166385e-06, + "loss": 0.0352, + "step": 1516 + }, + { + "epoch": 0.5065954249457338, + "grad_norm": 0.5281051134338431, + "learning_rate": 9.856456833589688e-06, + "loss": 0.047, + "step": 1517 + }, + { + "epoch": 0.5069293705126064, + "grad_norm": 0.5800360079286986, + "learning_rate": 9.855994161388005e-06, + "loss": 0.0441, + "step": 1518 + }, + { + "epoch": 0.507263316079479, + "grad_norm": 0.4704690709667183, + "learning_rate": 9.855530755631226e-06, + "loss": 0.0584, + "step": 1519 + }, + { + "epoch": 0.5075972616463517, + "grad_norm": 0.46215825272653954, + "learning_rate": 9.855066616389356e-06, + "loss": 0.0314, + "step": 1520 + }, + { + "epoch": 0.5079312072132243, + "grad_norm": 0.50202463463797, + "learning_rate": 9.854601743732504e-06, + "loss": 0.0503, + "step": 1521 + }, + { + "epoch": 0.5082651527800969, + "grad_norm": 0.44982854674438427, + "learning_rate": 9.854136137730899e-06, + "loss": 0.0446, + "step": 1522 + }, + { + "epoch": 0.5085990983469695, + "grad_norm": 0.34760472847614, + "learning_rate": 9.853669798454875e-06, + "loss": 0.0297, + "step": 1523 + }, + { + "epoch": 0.5089330439138421, + "grad_norm": 0.5921353813069149, + "learning_rate": 9.853202725974878e-06, + "loss": 0.0563, + "step": 1524 + }, + { + "epoch": 0.5092669894807147, + "grad_norm": 0.5752415954719585, + "learning_rate": 9.852734920361465e-06, + "loss": 0.0521, + "step": 1525 + }, + { + "epoch": 0.5096009350475872, + "grad_norm": 0.7122673071217719, + "learning_rate": 9.8522663816853e-06, + "loss": 0.0624, + "step": 1526 + }, + { + "epoch": 0.5099348806144598, + "grad_norm": 0.3954636837544188, + "learning_rate": 9.851797110017167e-06, + "loss": 0.0264, + "step": 1527 + }, + { + "epoch": 0.5102688261813324, + "grad_norm": 0.3835123204727526, + "learning_rate": 9.851327105427952e-06, + "loss": 0.0279, + "step": 1528 + }, + { + "epoch": 0.510602771748205, + "grad_norm": 0.4386775905097899, + "learning_rate": 9.850856367988657e-06, + "loss": 0.0298, + "step": 1529 + }, + { + "epoch": 0.5109367173150776, + "grad_norm": 1.2477612411573338, + "learning_rate": 9.850384897770388e-06, + "loss": 0.0544, + "step": 1530 + }, + { + "epoch": 0.5112706628819502, + "grad_norm": 0.3895391857295608, + "learning_rate": 9.84991269484437e-06, + "loss": 0.0423, + "step": 1531 + }, + { + "epoch": 0.5116046084488228, + "grad_norm": 0.4555479557728127, + "learning_rate": 9.849439759281934e-06, + "loss": 0.0433, + "step": 1532 + }, + { + "epoch": 0.5119385540156954, + "grad_norm": 0.5499567731343226, + "learning_rate": 9.848966091154522e-06, + "loss": 0.0464, + "step": 1533 + }, + { + "epoch": 0.512272499582568, + "grad_norm": 0.4446478633482936, + "learning_rate": 9.848491690533686e-06, + "loss": 0.0429, + "step": 1534 + }, + { + "epoch": 0.5126064451494406, + "grad_norm": 0.45233848105286506, + "learning_rate": 9.848016557491092e-06, + "loss": 0.0484, + "step": 1535 + }, + { + "epoch": 0.5129403907163133, + "grad_norm": 0.8704994263668998, + "learning_rate": 9.847540692098513e-06, + "loss": 0.0533, + "step": 1536 + }, + { + "epoch": 0.5132743362831859, + "grad_norm": 0.45046021701537253, + "learning_rate": 9.847064094427835e-06, + "loss": 0.0407, + "step": 1537 + }, + { + "epoch": 0.5136082818500585, + "grad_norm": 0.5979846320083059, + "learning_rate": 9.846586764551054e-06, + "loss": 0.0453, + "step": 1538 + }, + { + "epoch": 0.5139422274169311, + "grad_norm": 0.8835141383688757, + "learning_rate": 9.846108702540274e-06, + "loss": 0.0864, + "step": 1539 + }, + { + "epoch": 0.5142761729838037, + "grad_norm": 0.578209835359361, + "learning_rate": 9.845629908467714e-06, + "loss": 0.038, + "step": 1540 + }, + { + "epoch": 0.5146101185506763, + "grad_norm": 0.6165144292679225, + "learning_rate": 9.8451503824057e-06, + "loss": 0.0402, + "step": 1541 + }, + { + "epoch": 0.5149440641175489, + "grad_norm": 0.6417056591975662, + "learning_rate": 9.844670124426672e-06, + "loss": 0.0401, + "step": 1542 + }, + { + "epoch": 0.5152780096844214, + "grad_norm": 0.5347630232075131, + "learning_rate": 9.844189134603178e-06, + "loss": 0.0447, + "step": 1543 + }, + { + "epoch": 0.515611955251294, + "grad_norm": 0.3234157913440331, + "learning_rate": 9.843707413007874e-06, + "loss": 0.0332, + "step": 1544 + }, + { + "epoch": 0.5159459008181666, + "grad_norm": 0.5786068038610814, + "learning_rate": 9.843224959713535e-06, + "loss": 0.0684, + "step": 1545 + }, + { + "epoch": 0.5162798463850392, + "grad_norm": 0.5250243847114785, + "learning_rate": 9.842741774793038e-06, + "loss": 0.0569, + "step": 1546 + }, + { + "epoch": 0.5166137919519118, + "grad_norm": 0.5759984993928302, + "learning_rate": 9.842257858319375e-06, + "loss": 0.0395, + "step": 1547 + }, + { + "epoch": 0.5169477375187844, + "grad_norm": 0.8308086977999896, + "learning_rate": 9.841773210365646e-06, + "loss": 0.0617, + "step": 1548 + }, + { + "epoch": 0.517281683085657, + "grad_norm": 0.4596758376186459, + "learning_rate": 9.841287831005064e-06, + "loss": 0.041, + "step": 1549 + }, + { + "epoch": 0.5176156286525296, + "grad_norm": 0.4076509829581428, + "learning_rate": 9.84080172031095e-06, + "loss": 0.0375, + "step": 1550 + }, + { + "epoch": 0.5179495742194022, + "grad_norm": 0.6507682472405012, + "learning_rate": 9.840314878356739e-06, + "loss": 0.0526, + "step": 1551 + }, + { + "epoch": 0.5182835197862748, + "grad_norm": 0.5564719871264918, + "learning_rate": 9.839827305215972e-06, + "loss": 0.0591, + "step": 1552 + }, + { + "epoch": 0.5186174653531475, + "grad_norm": 0.5070125812645897, + "learning_rate": 9.839339000962305e-06, + "loss": 0.044, + "step": 1553 + }, + { + "epoch": 0.5189514109200201, + "grad_norm": 0.5469926168367498, + "learning_rate": 9.838849965669499e-06, + "loss": 0.0462, + "step": 1554 + }, + { + "epoch": 0.5192853564868927, + "grad_norm": 0.4877482448023593, + "learning_rate": 9.83836019941143e-06, + "loss": 0.0426, + "step": 1555 + }, + { + "epoch": 0.5196193020537653, + "grad_norm": 0.46400833958387616, + "learning_rate": 9.837869702262082e-06, + "loss": 0.055, + "step": 1556 + }, + { + "epoch": 0.5199532476206379, + "grad_norm": 0.5469798799351714, + "learning_rate": 9.837378474295553e-06, + "loss": 0.0432, + "step": 1557 + }, + { + "epoch": 0.5202871931875105, + "grad_norm": 0.39835442324282394, + "learning_rate": 9.836886515586045e-06, + "loss": 0.0365, + "step": 1558 + }, + { + "epoch": 0.520621138754383, + "grad_norm": 0.4899421332569305, + "learning_rate": 9.83639382620788e-06, + "loss": 0.0292, + "step": 1559 + }, + { + "epoch": 0.5209550843212556, + "grad_norm": 0.31630185886358, + "learning_rate": 9.835900406235479e-06, + "loss": 0.0339, + "step": 1560 + }, + { + "epoch": 0.5212890298881282, + "grad_norm": 0.5410350548004083, + "learning_rate": 9.835406255743381e-06, + "loss": 0.0506, + "step": 1561 + }, + { + "epoch": 0.5216229754550008, + "grad_norm": 0.3826455467346051, + "learning_rate": 9.834911374806231e-06, + "loss": 0.0331, + "step": 1562 + }, + { + "epoch": 0.5219569210218734, + "grad_norm": 0.3690454201745996, + "learning_rate": 9.83441576349879e-06, + "loss": 0.036, + "step": 1563 + }, + { + "epoch": 0.522290866588746, + "grad_norm": 0.5198004451505397, + "learning_rate": 9.833919421895926e-06, + "loss": 0.046, + "step": 1564 + }, + { + "epoch": 0.5226248121556186, + "grad_norm": 0.40943166185929364, + "learning_rate": 9.833422350072615e-06, + "loss": 0.0358, + "step": 1565 + }, + { + "epoch": 0.5229587577224912, + "grad_norm": 0.6365032126717701, + "learning_rate": 9.832924548103945e-06, + "loss": 0.0478, + "step": 1566 + }, + { + "epoch": 0.5232927032893638, + "grad_norm": 0.3162022970947212, + "learning_rate": 9.832426016065117e-06, + "loss": 0.0346, + "step": 1567 + }, + { + "epoch": 0.5236266488562364, + "grad_norm": 0.43102986601306426, + "learning_rate": 9.83192675403144e-06, + "loss": 0.0491, + "step": 1568 + }, + { + "epoch": 0.523960594423109, + "grad_norm": 0.4866183135413825, + "learning_rate": 9.831426762078331e-06, + "loss": 0.0416, + "step": 1569 + }, + { + "epoch": 0.5242945399899817, + "grad_norm": 0.6415618280163502, + "learning_rate": 9.830926040281321e-06, + "loss": 0.0504, + "step": 1570 + }, + { + "epoch": 0.5246284855568543, + "grad_norm": 0.5820504422099528, + "learning_rate": 9.830424588716053e-06, + "loss": 0.0527, + "step": 1571 + }, + { + "epoch": 0.5249624311237269, + "grad_norm": 0.5336982251530853, + "learning_rate": 9.829922407458273e-06, + "loss": 0.0551, + "step": 1572 + }, + { + "epoch": 0.5252963766905995, + "grad_norm": 0.48056082471570233, + "learning_rate": 9.829419496583843e-06, + "loss": 0.0462, + "step": 1573 + }, + { + "epoch": 0.5256303222574721, + "grad_norm": 0.501265768809885, + "learning_rate": 9.828915856168734e-06, + "loss": 0.0486, + "step": 1574 + }, + { + "epoch": 0.5259642678243446, + "grad_norm": 0.6064150035667107, + "learning_rate": 9.828411486289026e-06, + "loss": 0.0389, + "step": 1575 + }, + { + "epoch": 0.5262982133912172, + "grad_norm": 0.5054548918351807, + "learning_rate": 9.82790638702091e-06, + "loss": 0.0449, + "step": 1576 + }, + { + "epoch": 0.5266321589580898, + "grad_norm": 0.478694127738411, + "learning_rate": 9.827400558440687e-06, + "loss": 0.0368, + "step": 1577 + }, + { + "epoch": 0.5269661045249624, + "grad_norm": 0.45181073485918044, + "learning_rate": 9.826894000624769e-06, + "loss": 0.0314, + "step": 1578 + }, + { + "epoch": 0.527300050091835, + "grad_norm": 0.7749356164087102, + "learning_rate": 9.826386713649678e-06, + "loss": 0.0636, + "step": 1579 + }, + { + "epoch": 0.5276339956587076, + "grad_norm": 0.5825159462865975, + "learning_rate": 9.825878697592046e-06, + "loss": 0.0467, + "step": 1580 + }, + { + "epoch": 0.5279679412255802, + "grad_norm": 0.5575072135049433, + "learning_rate": 9.825369952528611e-06, + "loss": 0.0473, + "step": 1581 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.41941816464691917, + "learning_rate": 9.824860478536231e-06, + "loss": 0.0307, + "step": 1582 + }, + { + "epoch": 0.5286358323593254, + "grad_norm": 0.5461230635241924, + "learning_rate": 9.824350275691864e-06, + "loss": 0.0514, + "step": 1583 + }, + { + "epoch": 0.528969777926198, + "grad_norm": 0.6438870299046571, + "learning_rate": 9.823839344072582e-06, + "loss": 0.0443, + "step": 1584 + }, + { + "epoch": 0.5293037234930706, + "grad_norm": 0.5777059381505149, + "learning_rate": 9.823327683755566e-06, + "loss": 0.0503, + "step": 1585 + }, + { + "epoch": 0.5296376690599433, + "grad_norm": 0.6353044866085541, + "learning_rate": 9.822815294818113e-06, + "loss": 0.0489, + "step": 1586 + }, + { + "epoch": 0.5299716146268159, + "grad_norm": 0.6029003429316526, + "learning_rate": 9.822302177337624e-06, + "loss": 0.0414, + "step": 1587 + }, + { + "epoch": 0.5303055601936885, + "grad_norm": 0.33971430969523186, + "learning_rate": 9.821788331391609e-06, + "loss": 0.0334, + "step": 1588 + }, + { + "epoch": 0.5306395057605611, + "grad_norm": 0.5268398704129876, + "learning_rate": 9.821273757057692e-06, + "loss": 0.0359, + "step": 1589 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 0.8283223934346511, + "learning_rate": 9.820758454413606e-06, + "loss": 0.04, + "step": 1590 + }, + { + "epoch": 0.5313073968943063, + "grad_norm": 0.5217381131428658, + "learning_rate": 9.820242423537192e-06, + "loss": 0.0408, + "step": 1591 + }, + { + "epoch": 0.5316413424611788, + "grad_norm": 0.5762948306371981, + "learning_rate": 9.819725664506404e-06, + "loss": 0.0335, + "step": 1592 + }, + { + "epoch": 0.5319752880280514, + "grad_norm": 0.7203062895879468, + "learning_rate": 9.819208177399303e-06, + "loss": 0.0432, + "step": 1593 + }, + { + "epoch": 0.532309233594924, + "grad_norm": 0.7160632979278783, + "learning_rate": 9.818689962294063e-06, + "loss": 0.0469, + "step": 1594 + }, + { + "epoch": 0.5326431791617966, + "grad_norm": 0.7917036861187641, + "learning_rate": 9.818171019268965e-06, + "loss": 0.0477, + "step": 1595 + }, + { + "epoch": 0.5329771247286692, + "grad_norm": 0.5237629728026038, + "learning_rate": 9.817651348402403e-06, + "loss": 0.0521, + "step": 1596 + }, + { + "epoch": 0.5333110702955418, + "grad_norm": 0.6723127715974083, + "learning_rate": 9.81713094977288e-06, + "loss": 0.039, + "step": 1597 + }, + { + "epoch": 0.5336450158624144, + "grad_norm": 0.5315645446991091, + "learning_rate": 9.816609823459007e-06, + "loss": 0.0369, + "step": 1598 + }, + { + "epoch": 0.533978961429287, + "grad_norm": 0.49683308900599976, + "learning_rate": 9.816087969539506e-06, + "loss": 0.0343, + "step": 1599 + }, + { + "epoch": 0.5343129069961596, + "grad_norm": 0.4968494208487672, + "learning_rate": 9.815565388093209e-06, + "loss": 0.0436, + "step": 1600 + }, + { + "epoch": 0.5346468525630322, + "grad_norm": 0.45501021484043996, + "learning_rate": 9.81504207919906e-06, + "loss": 0.04, + "step": 1601 + }, + { + "epoch": 0.5349807981299048, + "grad_norm": 0.5184804765890906, + "learning_rate": 9.814518042936107e-06, + "loss": 0.0502, + "step": 1602 + }, + { + "epoch": 0.5353147436967775, + "grad_norm": 0.5432491916613281, + "learning_rate": 9.813993279383518e-06, + "loss": 0.0451, + "step": 1603 + }, + { + "epoch": 0.5356486892636501, + "grad_norm": 0.3618440392593531, + "learning_rate": 9.813467788620559e-06, + "loss": 0.0335, + "step": 1604 + }, + { + "epoch": 0.5359826348305227, + "grad_norm": 0.40492935745007325, + "learning_rate": 9.812941570726615e-06, + "loss": 0.0322, + "step": 1605 + }, + { + "epoch": 0.5363165803973953, + "grad_norm": 0.6040359865504972, + "learning_rate": 9.812414625781175e-06, + "loss": 0.0476, + "step": 1606 + }, + { + "epoch": 0.5366505259642679, + "grad_norm": 0.49142896450495743, + "learning_rate": 9.811886953863841e-06, + "loss": 0.0343, + "step": 1607 + }, + { + "epoch": 0.5369844715311404, + "grad_norm": 0.47805940680703596, + "learning_rate": 9.811358555054326e-06, + "loss": 0.0326, + "step": 1608 + }, + { + "epoch": 0.537318417098013, + "grad_norm": 0.6548301712919927, + "learning_rate": 9.810829429432449e-06, + "loss": 0.034, + "step": 1609 + }, + { + "epoch": 0.5376523626648856, + "grad_norm": 0.5009292103530449, + "learning_rate": 9.81029957707814e-06, + "loss": 0.044, + "step": 1610 + }, + { + "epoch": 0.5379863082317582, + "grad_norm": 0.5624294342432482, + "learning_rate": 9.809768998071442e-06, + "loss": 0.0405, + "step": 1611 + }, + { + "epoch": 0.5383202537986308, + "grad_norm": 0.4170362571368871, + "learning_rate": 9.809237692492503e-06, + "loss": 0.0367, + "step": 1612 + }, + { + "epoch": 0.5386541993655034, + "grad_norm": 0.4596498288262667, + "learning_rate": 9.808705660421582e-06, + "loss": 0.0478, + "step": 1613 + }, + { + "epoch": 0.538988144932376, + "grad_norm": 0.7178474019180208, + "learning_rate": 9.808172901939053e-06, + "loss": 0.0459, + "step": 1614 + }, + { + "epoch": 0.5393220904992486, + "grad_norm": 0.3434626720597141, + "learning_rate": 9.807639417125392e-06, + "loss": 0.0312, + "step": 1615 + }, + { + "epoch": 0.5396560360661212, + "grad_norm": 0.5221178590381972, + "learning_rate": 9.807105206061186e-06, + "loss": 0.0348, + "step": 1616 + }, + { + "epoch": 0.5399899816329938, + "grad_norm": 0.4820129468059947, + "learning_rate": 9.80657026882714e-06, + "loss": 0.038, + "step": 1617 + }, + { + "epoch": 0.5403239271998664, + "grad_norm": 0.5177152300225278, + "learning_rate": 9.80603460550406e-06, + "loss": 0.0576, + "step": 1618 + }, + { + "epoch": 0.540657872766739, + "grad_norm": 0.48562119319527614, + "learning_rate": 9.805498216172861e-06, + "loss": 0.0389, + "step": 1619 + }, + { + "epoch": 0.5409918183336117, + "grad_norm": 0.4593153545602104, + "learning_rate": 9.804961100914575e-06, + "loss": 0.0381, + "step": 1620 + }, + { + "epoch": 0.5413257639004843, + "grad_norm": 0.3626106588247109, + "learning_rate": 9.804423259810338e-06, + "loss": 0.027, + "step": 1621 + }, + { + "epoch": 0.5416597094673569, + "grad_norm": 0.43240020249317507, + "learning_rate": 9.803884692941397e-06, + "loss": 0.0468, + "step": 1622 + }, + { + "epoch": 0.5419936550342295, + "grad_norm": 0.36564231841245876, + "learning_rate": 9.803345400389111e-06, + "loss": 0.0386, + "step": 1623 + }, + { + "epoch": 0.542327600601102, + "grad_norm": 0.4450810931513539, + "learning_rate": 9.802805382234941e-06, + "loss": 0.0436, + "step": 1624 + }, + { + "epoch": 0.5426615461679746, + "grad_norm": 0.4029971664558369, + "learning_rate": 9.80226463856047e-06, + "loss": 0.0408, + "step": 1625 + }, + { + "epoch": 0.5429954917348472, + "grad_norm": 0.6669350636767151, + "learning_rate": 9.801723169447378e-06, + "loss": 0.0484, + "step": 1626 + }, + { + "epoch": 0.5433294373017198, + "grad_norm": 0.45380330224505766, + "learning_rate": 9.801180974977466e-06, + "loss": 0.0361, + "step": 1627 + }, + { + "epoch": 0.5436633828685924, + "grad_norm": 0.40762758173819824, + "learning_rate": 9.800638055232635e-06, + "loss": 0.038, + "step": 1628 + }, + { + "epoch": 0.543997328435465, + "grad_norm": 0.5561520311338787, + "learning_rate": 9.800094410294897e-06, + "loss": 0.042, + "step": 1629 + }, + { + "epoch": 0.5443312740023376, + "grad_norm": 0.3911726844205256, + "learning_rate": 9.799550040246381e-06, + "loss": 0.0313, + "step": 1630 + }, + { + "epoch": 0.5446652195692102, + "grad_norm": 0.4119958716503093, + "learning_rate": 9.799004945169319e-06, + "loss": 0.0367, + "step": 1631 + }, + { + "epoch": 0.5449991651360828, + "grad_norm": 0.48263621914834215, + "learning_rate": 9.798459125146054e-06, + "loss": 0.0483, + "step": 1632 + }, + { + "epoch": 0.5453331107029554, + "grad_norm": 0.5034503809346389, + "learning_rate": 9.797912580259037e-06, + "loss": 0.0407, + "step": 1633 + }, + { + "epoch": 0.545667056269828, + "grad_norm": 0.7922156973587396, + "learning_rate": 9.797365310590832e-06, + "loss": 0.0591, + "step": 1634 + }, + { + "epoch": 0.5460010018367006, + "grad_norm": 0.46868734613073554, + "learning_rate": 9.796817316224107e-06, + "loss": 0.0359, + "step": 1635 + }, + { + "epoch": 0.5463349474035732, + "grad_norm": 0.41482875148647297, + "learning_rate": 9.79626859724165e-06, + "loss": 0.0394, + "step": 1636 + }, + { + "epoch": 0.5466688929704459, + "grad_norm": 0.5057102762417754, + "learning_rate": 9.795719153726345e-06, + "loss": 0.0325, + "step": 1637 + }, + { + "epoch": 0.5470028385373185, + "grad_norm": 0.49177583996918156, + "learning_rate": 9.795168985761192e-06, + "loss": 0.0454, + "step": 1638 + }, + { + "epoch": 0.5473367841041911, + "grad_norm": 0.38141980746291837, + "learning_rate": 9.794618093429305e-06, + "loss": 0.0427, + "step": 1639 + }, + { + "epoch": 0.5476707296710637, + "grad_norm": 0.4058774865359395, + "learning_rate": 9.794066476813901e-06, + "loss": 0.0394, + "step": 1640 + }, + { + "epoch": 0.5480046752379362, + "grad_norm": 0.4462450927151456, + "learning_rate": 9.793514135998306e-06, + "loss": 0.0301, + "step": 1641 + }, + { + "epoch": 0.5483386208048088, + "grad_norm": 0.3699683133102845, + "learning_rate": 9.792961071065958e-06, + "loss": 0.0467, + "step": 1642 + }, + { + "epoch": 0.5486725663716814, + "grad_norm": 0.4396087429356699, + "learning_rate": 9.792407282100407e-06, + "loss": 0.0277, + "step": 1643 + }, + { + "epoch": 0.549006511938554, + "grad_norm": 0.40474370695884687, + "learning_rate": 9.791852769185306e-06, + "loss": 0.0537, + "step": 1644 + }, + { + "epoch": 0.5493404575054266, + "grad_norm": 0.3409292888851801, + "learning_rate": 9.791297532404422e-06, + "loss": 0.0324, + "step": 1645 + }, + { + "epoch": 0.5496744030722992, + "grad_norm": 0.7477123909924196, + "learning_rate": 9.790741571841629e-06, + "loss": 0.0538, + "step": 1646 + }, + { + "epoch": 0.5500083486391718, + "grad_norm": 0.6511168542888314, + "learning_rate": 9.790184887580914e-06, + "loss": 0.0584, + "step": 1647 + }, + { + "epoch": 0.5503422942060444, + "grad_norm": 0.45904632423721786, + "learning_rate": 9.78962747970637e-06, + "loss": 0.0425, + "step": 1648 + }, + { + "epoch": 0.550676239772917, + "grad_norm": 0.44426679223407733, + "learning_rate": 9.789069348302197e-06, + "loss": 0.0379, + "step": 1649 + }, + { + "epoch": 0.5510101853397896, + "grad_norm": 0.45455317565005104, + "learning_rate": 9.78851049345271e-06, + "loss": 0.0512, + "step": 1650 + }, + { + "epoch": 0.5513441309066622, + "grad_norm": 0.4068642439545971, + "learning_rate": 9.78795091524233e-06, + "loss": 0.0378, + "step": 1651 + }, + { + "epoch": 0.5516780764735348, + "grad_norm": 0.3312250333711377, + "learning_rate": 9.78739061375559e-06, + "loss": 0.0294, + "step": 1652 + }, + { + "epoch": 0.5520120220404074, + "grad_norm": 0.7810027856995916, + "learning_rate": 9.786829589077125e-06, + "loss": 0.0573, + "step": 1653 + }, + { + "epoch": 0.55234596760728, + "grad_norm": 0.5475404100744792, + "learning_rate": 9.78626784129169e-06, + "loss": 0.0436, + "step": 1654 + }, + { + "epoch": 0.5526799131741527, + "grad_norm": 0.5330799826085645, + "learning_rate": 9.78570537048414e-06, + "loss": 0.0379, + "step": 1655 + }, + { + "epoch": 0.5530138587410253, + "grad_norm": 0.49487937190466735, + "learning_rate": 9.785142176739444e-06, + "loss": 0.0461, + "step": 1656 + }, + { + "epoch": 0.5533478043078978, + "grad_norm": 0.5147594641113324, + "learning_rate": 9.784578260142679e-06, + "loss": 0.0424, + "step": 1657 + }, + { + "epoch": 0.5536817498747704, + "grad_norm": 0.43346677742855155, + "learning_rate": 9.784013620779031e-06, + "loss": 0.0383, + "step": 1658 + }, + { + "epoch": 0.554015695441643, + "grad_norm": 0.4874022508489168, + "learning_rate": 9.783448258733795e-06, + "loss": 0.0508, + "step": 1659 + }, + { + "epoch": 0.5543496410085156, + "grad_norm": 0.9737800245753055, + "learning_rate": 9.782882174092377e-06, + "loss": 0.0457, + "step": 1660 + }, + { + "epoch": 0.5546835865753882, + "grad_norm": 0.44150611696364056, + "learning_rate": 9.78231536694029e-06, + "loss": 0.0448, + "step": 1661 + }, + { + "epoch": 0.5550175321422608, + "grad_norm": 0.4720638581175937, + "learning_rate": 9.781747837363158e-06, + "loss": 0.0426, + "step": 1662 + }, + { + "epoch": 0.5553514777091334, + "grad_norm": 0.39428004968531427, + "learning_rate": 9.781179585446711e-06, + "loss": 0.0461, + "step": 1663 + }, + { + "epoch": 0.555685423276006, + "grad_norm": 0.5364669161422959, + "learning_rate": 9.780610611276791e-06, + "loss": 0.0358, + "step": 1664 + }, + { + "epoch": 0.5560193688428786, + "grad_norm": 0.40805268475403095, + "learning_rate": 9.780040914939349e-06, + "loss": 0.0363, + "step": 1665 + }, + { + "epoch": 0.5563533144097512, + "grad_norm": 1.0136455115245742, + "learning_rate": 9.779470496520442e-06, + "loss": 0.1141, + "step": 1666 + }, + { + "epoch": 0.5566872599766238, + "grad_norm": 0.4639087743513537, + "learning_rate": 9.77889935610624e-06, + "loss": 0.0434, + "step": 1667 + }, + { + "epoch": 0.5570212055434964, + "grad_norm": 0.3441933451370595, + "learning_rate": 9.778327493783022e-06, + "loss": 0.0423, + "step": 1668 + }, + { + "epoch": 0.557355151110369, + "grad_norm": 0.41641457287248573, + "learning_rate": 9.777754909637173e-06, + "loss": 0.0279, + "step": 1669 + }, + { + "epoch": 0.5576890966772416, + "grad_norm": 0.40059036909704987, + "learning_rate": 9.777181603755188e-06, + "loss": 0.0347, + "step": 1670 + }, + { + "epoch": 0.5580230422441143, + "grad_norm": 0.564059108858758, + "learning_rate": 9.776607576223673e-06, + "loss": 0.0497, + "step": 1671 + }, + { + "epoch": 0.5583569878109869, + "grad_norm": 0.47482441046313406, + "learning_rate": 9.776032827129338e-06, + "loss": 0.0368, + "step": 1672 + }, + { + "epoch": 0.5586909333778594, + "grad_norm": 0.7269906153257919, + "learning_rate": 9.775457356559013e-06, + "loss": 0.0395, + "step": 1673 + }, + { + "epoch": 0.559024878944732, + "grad_norm": 0.760415482871149, + "learning_rate": 9.774881164599621e-06, + "loss": 0.0587, + "step": 1674 + }, + { + "epoch": 0.5593588245116046, + "grad_norm": 0.8227202729605906, + "learning_rate": 9.77430425133821e-06, + "loss": 0.0502, + "step": 1675 + }, + { + "epoch": 0.5596927700784772, + "grad_norm": 0.5023770215405837, + "learning_rate": 9.773726616861926e-06, + "loss": 0.0423, + "step": 1676 + }, + { + "epoch": 0.5600267156453498, + "grad_norm": 0.5027304555843701, + "learning_rate": 9.773148261258025e-06, + "loss": 0.053, + "step": 1677 + }, + { + "epoch": 0.5603606612122224, + "grad_norm": 0.4456588855350891, + "learning_rate": 9.772569184613879e-06, + "loss": 0.0308, + "step": 1678 + }, + { + "epoch": 0.560694606779095, + "grad_norm": 0.45427388831713583, + "learning_rate": 9.771989387016962e-06, + "loss": 0.0381, + "step": 1679 + }, + { + "epoch": 0.5610285523459676, + "grad_norm": 0.6142935080859407, + "learning_rate": 9.77140886855486e-06, + "loss": 0.0489, + "step": 1680 + }, + { + "epoch": 0.5613624979128402, + "grad_norm": 0.5833919680636962, + "learning_rate": 9.770827629315266e-06, + "loss": 0.0478, + "step": 1681 + }, + { + "epoch": 0.5616964434797128, + "grad_norm": 0.5926392272950822, + "learning_rate": 9.770245669385984e-06, + "loss": 0.0404, + "step": 1682 + }, + { + "epoch": 0.5620303890465854, + "grad_norm": 0.5270987927707339, + "learning_rate": 9.76966298885493e-06, + "loss": 0.0426, + "step": 1683 + }, + { + "epoch": 0.562364334613458, + "grad_norm": 0.586726309389911, + "learning_rate": 9.769079587810115e-06, + "loss": 0.0335, + "step": 1684 + }, + { + "epoch": 0.5626982801803306, + "grad_norm": 0.5887110017814984, + "learning_rate": 9.768495466339675e-06, + "loss": 0.0375, + "step": 1685 + }, + { + "epoch": 0.5630322257472032, + "grad_norm": 0.42871318705843875, + "learning_rate": 9.767910624531852e-06, + "loss": 0.0365, + "step": 1686 + }, + { + "epoch": 0.5633661713140758, + "grad_norm": 0.4295891737050711, + "learning_rate": 9.767325062474984e-06, + "loss": 0.0311, + "step": 1687 + }, + { + "epoch": 0.5637001168809485, + "grad_norm": 0.49488756256363087, + "learning_rate": 9.766738780257535e-06, + "loss": 0.0366, + "step": 1688 + }, + { + "epoch": 0.564034062447821, + "grad_norm": 0.397622714634608, + "learning_rate": 9.766151777968063e-06, + "loss": 0.0412, + "step": 1689 + }, + { + "epoch": 0.5643680080146936, + "grad_norm": 0.6398746761967488, + "learning_rate": 9.765564055695249e-06, + "loss": 0.0453, + "step": 1690 + }, + { + "epoch": 0.5647019535815662, + "grad_norm": 0.5777262669287194, + "learning_rate": 9.76497561352787e-06, + "loss": 0.0517, + "step": 1691 + }, + { + "epoch": 0.5650358991484388, + "grad_norm": 0.4354174250708227, + "learning_rate": 9.764386451554819e-06, + "loss": 0.0406, + "step": 1692 + }, + { + "epoch": 0.5653698447153114, + "grad_norm": 0.40765008160639593, + "learning_rate": 9.763796569865095e-06, + "loss": 0.0407, + "step": 1693 + }, + { + "epoch": 0.565703790282184, + "grad_norm": 0.34479313887853946, + "learning_rate": 9.763205968547808e-06, + "loss": 0.0405, + "step": 1694 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.44612130164603186, + "learning_rate": 9.762614647692175e-06, + "loss": 0.0353, + "step": 1695 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 0.4081773718045894, + "learning_rate": 9.762022607387522e-06, + "loss": 0.0403, + "step": 1696 + }, + { + "epoch": 0.5667056269828018, + "grad_norm": 0.3986242039861698, + "learning_rate": 9.761429847723281e-06, + "loss": 0.0453, + "step": 1697 + }, + { + "epoch": 0.5670395725496744, + "grad_norm": 0.4647791276949541, + "learning_rate": 9.760836368788999e-06, + "loss": 0.0457, + "step": 1698 + }, + { + "epoch": 0.567373518116547, + "grad_norm": 0.7058313139209909, + "learning_rate": 9.760242170674325e-06, + "loss": 0.0292, + "step": 1699 + }, + { + "epoch": 0.5677074636834196, + "grad_norm": 0.49994785567155847, + "learning_rate": 9.759647253469023e-06, + "loss": 0.0751, + "step": 1700 + }, + { + "epoch": 0.5680414092502922, + "grad_norm": 0.48979765791061497, + "learning_rate": 9.75905161726296e-06, + "loss": 0.0488, + "step": 1701 + }, + { + "epoch": 0.5683753548171648, + "grad_norm": 0.4754036512113146, + "learning_rate": 9.758455262146114e-06, + "loss": 0.0421, + "step": 1702 + }, + { + "epoch": 0.5687093003840374, + "grad_norm": 0.46348012900650337, + "learning_rate": 9.757858188208571e-06, + "loss": 0.0416, + "step": 1703 + }, + { + "epoch": 0.56904324595091, + "grad_norm": 0.4077132589607528, + "learning_rate": 9.757260395540527e-06, + "loss": 0.0331, + "step": 1704 + }, + { + "epoch": 0.5693771915177827, + "grad_norm": 0.48223135405172074, + "learning_rate": 9.756661884232286e-06, + "loss": 0.0506, + "step": 1705 + }, + { + "epoch": 0.5697111370846552, + "grad_norm": 0.40773646511134254, + "learning_rate": 9.756062654374259e-06, + "loss": 0.0398, + "step": 1706 + }, + { + "epoch": 0.5700450826515278, + "grad_norm": 0.4436906126403325, + "learning_rate": 9.755462706056966e-06, + "loss": 0.048, + "step": 1707 + }, + { + "epoch": 0.5703790282184004, + "grad_norm": 0.4733346088039868, + "learning_rate": 9.75486203937104e-06, + "loss": 0.0388, + "step": 1708 + }, + { + "epoch": 0.570712973785273, + "grad_norm": 0.48669551866401806, + "learning_rate": 9.754260654407214e-06, + "loss": 0.04, + "step": 1709 + }, + { + "epoch": 0.5710469193521456, + "grad_norm": 0.44310970928304405, + "learning_rate": 9.753658551256338e-06, + "loss": 0.0361, + "step": 1710 + }, + { + "epoch": 0.5713808649190182, + "grad_norm": 0.3834459594566251, + "learning_rate": 9.753055730009364e-06, + "loss": 0.0381, + "step": 1711 + }, + { + "epoch": 0.5717148104858908, + "grad_norm": 0.608494935824481, + "learning_rate": 9.752452190757358e-06, + "loss": 0.0475, + "step": 1712 + }, + { + "epoch": 0.5720487560527634, + "grad_norm": 0.3116160449574889, + "learning_rate": 9.751847933591489e-06, + "loss": 0.0298, + "step": 1713 + }, + { + "epoch": 0.572382701619636, + "grad_norm": 0.36538216205006346, + "learning_rate": 9.75124295860304e-06, + "loss": 0.0408, + "step": 1714 + }, + { + "epoch": 0.5727166471865086, + "grad_norm": 0.34464107849922015, + "learning_rate": 9.750637265883395e-06, + "loss": 0.0371, + "step": 1715 + }, + { + "epoch": 0.5730505927533812, + "grad_norm": 0.4398000685746299, + "learning_rate": 9.750030855524058e-06, + "loss": 0.0439, + "step": 1716 + }, + { + "epoch": 0.5733845383202538, + "grad_norm": 0.3898477877018435, + "learning_rate": 9.749423727616628e-06, + "loss": 0.0534, + "step": 1717 + }, + { + "epoch": 0.5737184838871264, + "grad_norm": 0.39525352162345495, + "learning_rate": 9.748815882252823e-06, + "loss": 0.0331, + "step": 1718 + }, + { + "epoch": 0.574052429453999, + "grad_norm": 0.40168206830824166, + "learning_rate": 9.748207319524462e-06, + "loss": 0.0518, + "step": 1719 + }, + { + "epoch": 0.5743863750208716, + "grad_norm": 0.4026888446467387, + "learning_rate": 9.747598039523476e-06, + "loss": 0.034, + "step": 1720 + }, + { + "epoch": 0.5747203205877442, + "grad_norm": 0.5043447327069664, + "learning_rate": 9.746988042341907e-06, + "loss": 0.0509, + "step": 1721 + }, + { + "epoch": 0.5750542661546167, + "grad_norm": 0.5878621389397128, + "learning_rate": 9.746377328071899e-06, + "loss": 0.033, + "step": 1722 + }, + { + "epoch": 0.5753882117214894, + "grad_norm": 0.4047824403195135, + "learning_rate": 9.74576589680571e-06, + "loss": 0.0354, + "step": 1723 + }, + { + "epoch": 0.575722157288362, + "grad_norm": 0.6433613853468115, + "learning_rate": 9.745153748635702e-06, + "loss": 0.0443, + "step": 1724 + }, + { + "epoch": 0.5760561028552346, + "grad_norm": 0.4954299759674441, + "learning_rate": 9.744540883654348e-06, + "loss": 0.0505, + "step": 1725 + }, + { + "epoch": 0.5763900484221072, + "grad_norm": 0.5261383027946491, + "learning_rate": 9.743927301954229e-06, + "loss": 0.0603, + "step": 1726 + }, + { + "epoch": 0.5767239939889798, + "grad_norm": 0.3101222368704965, + "learning_rate": 9.743313003628033e-06, + "loss": 0.0373, + "step": 1727 + }, + { + "epoch": 0.5770579395558524, + "grad_norm": 1.1817673381964775, + "learning_rate": 9.742697988768557e-06, + "loss": 0.0566, + "step": 1728 + }, + { + "epoch": 0.577391885122725, + "grad_norm": 0.4074827108070887, + "learning_rate": 9.742082257468705e-06, + "loss": 0.034, + "step": 1729 + }, + { + "epoch": 0.5777258306895976, + "grad_norm": 0.46863638424338366, + "learning_rate": 9.741465809821493e-06, + "loss": 0.0446, + "step": 1730 + }, + { + "epoch": 0.5780597762564702, + "grad_norm": 0.4836885565541822, + "learning_rate": 9.74084864592004e-06, + "loss": 0.0411, + "step": 1731 + }, + { + "epoch": 0.5783937218233428, + "grad_norm": 0.553986398907438, + "learning_rate": 9.74023076585758e-06, + "loss": 0.0366, + "step": 1732 + }, + { + "epoch": 0.5787276673902154, + "grad_norm": 0.44076096145232735, + "learning_rate": 9.739612169727446e-06, + "loss": 0.0271, + "step": 1733 + }, + { + "epoch": 0.579061612957088, + "grad_norm": 0.33565597315818607, + "learning_rate": 9.73899285762309e-06, + "loss": 0.0367, + "step": 1734 + }, + { + "epoch": 0.5793955585239606, + "grad_norm": 0.5922993089683395, + "learning_rate": 9.738372829638058e-06, + "loss": 0.045, + "step": 1735 + }, + { + "epoch": 0.5797295040908332, + "grad_norm": 0.6039800326539692, + "learning_rate": 9.73775208586602e-06, + "loss": 0.0505, + "step": 1736 + }, + { + "epoch": 0.5800634496577058, + "grad_norm": 0.4715635213171511, + "learning_rate": 9.737130626400745e-06, + "loss": 0.0461, + "step": 1737 + }, + { + "epoch": 0.5803973952245783, + "grad_norm": 0.9311351699330999, + "learning_rate": 9.736508451336111e-06, + "loss": 0.0603, + "step": 1738 + }, + { + "epoch": 0.580731340791451, + "grad_norm": 0.4618796011976892, + "learning_rate": 9.735885560766104e-06, + "loss": 0.0416, + "step": 1739 + }, + { + "epoch": 0.5810652863583236, + "grad_norm": 0.5566929361313651, + "learning_rate": 9.73526195478482e-06, + "loss": 0.0798, + "step": 1740 + }, + { + "epoch": 0.5813992319251962, + "grad_norm": 0.37517028857443224, + "learning_rate": 9.73463763348646e-06, + "loss": 0.0362, + "step": 1741 + }, + { + "epoch": 0.5817331774920688, + "grad_norm": 0.5412175742504487, + "learning_rate": 9.734012596965341e-06, + "loss": 0.0391, + "step": 1742 + }, + { + "epoch": 0.5820671230589414, + "grad_norm": 0.47452101080336717, + "learning_rate": 9.733386845315875e-06, + "loss": 0.0479, + "step": 1743 + }, + { + "epoch": 0.582401068625814, + "grad_norm": 0.9914074972686332, + "learning_rate": 9.732760378632592e-06, + "loss": 0.0661, + "step": 1744 + }, + { + "epoch": 0.5827350141926866, + "grad_norm": 0.45693844718749216, + "learning_rate": 9.73213319701013e-06, + "loss": 0.0359, + "step": 1745 + }, + { + "epoch": 0.5830689597595592, + "grad_norm": 0.4007173084318067, + "learning_rate": 9.731505300543228e-06, + "loss": 0.0363, + "step": 1746 + }, + { + "epoch": 0.5834029053264318, + "grad_norm": 0.5636375021117038, + "learning_rate": 9.730876689326739e-06, + "loss": 0.0442, + "step": 1747 + }, + { + "epoch": 0.5837368508933044, + "grad_norm": 0.5413144380237097, + "learning_rate": 9.730247363455621e-06, + "loss": 0.0447, + "step": 1748 + }, + { + "epoch": 0.584070796460177, + "grad_norm": 0.33551405977733567, + "learning_rate": 9.729617323024943e-06, + "loss": 0.0375, + "step": 1749 + }, + { + "epoch": 0.5844047420270496, + "grad_norm": 0.5237682630936968, + "learning_rate": 9.728986568129876e-06, + "loss": 0.0399, + "step": 1750 + }, + { + "epoch": 0.5847386875939222, + "grad_norm": 0.3544041932326313, + "learning_rate": 9.72835509886571e-06, + "loss": 0.0367, + "step": 1751 + }, + { + "epoch": 0.5850726331607948, + "grad_norm": 0.44789001502132775, + "learning_rate": 9.727722915327828e-06, + "loss": 0.036, + "step": 1752 + }, + { + "epoch": 0.5854065787276674, + "grad_norm": 0.4960974968783306, + "learning_rate": 9.727090017611736e-06, + "loss": 0.0311, + "step": 1753 + }, + { + "epoch": 0.58574052429454, + "grad_norm": 0.41086256928254816, + "learning_rate": 9.726456405813033e-06, + "loss": 0.0422, + "step": 1754 + }, + { + "epoch": 0.5860744698614125, + "grad_norm": 0.3315468335717578, + "learning_rate": 9.725822080027442e-06, + "loss": 0.0271, + "step": 1755 + }, + { + "epoch": 0.5864084154282851, + "grad_norm": 0.3730186120425334, + "learning_rate": 9.725187040350778e-06, + "loss": 0.0377, + "step": 1756 + }, + { + "epoch": 0.5867423609951578, + "grad_norm": 0.31288603321670977, + "learning_rate": 9.724551286878976e-06, + "loss": 0.0335, + "step": 1757 + }, + { + "epoch": 0.5870763065620304, + "grad_norm": 0.4264371401607715, + "learning_rate": 9.723914819708073e-06, + "loss": 0.0368, + "step": 1758 + }, + { + "epoch": 0.587410252128903, + "grad_norm": 0.3585948244649243, + "learning_rate": 9.723277638934212e-06, + "loss": 0.0339, + "step": 1759 + }, + { + "epoch": 0.5877441976957756, + "grad_norm": 0.362457406151004, + "learning_rate": 9.72263974465365e-06, + "loss": 0.0312, + "step": 1760 + }, + { + "epoch": 0.5880781432626482, + "grad_norm": 0.5232785559424709, + "learning_rate": 9.722001136962746e-06, + "loss": 0.0437, + "step": 1761 + }, + { + "epoch": 0.5884120888295208, + "grad_norm": 0.4422200875621492, + "learning_rate": 9.721361815957973e-06, + "loss": 0.0481, + "step": 1762 + }, + { + "epoch": 0.5887460343963934, + "grad_norm": 0.3447670320289213, + "learning_rate": 9.720721781735905e-06, + "loss": 0.0314, + "step": 1763 + }, + { + "epoch": 0.589079979963266, + "grad_norm": 0.41271915267881604, + "learning_rate": 9.720081034393226e-06, + "loss": 0.0441, + "step": 1764 + }, + { + "epoch": 0.5894139255301386, + "grad_norm": 0.36241051150896486, + "learning_rate": 9.71943957402673e-06, + "loss": 0.0382, + "step": 1765 + }, + { + "epoch": 0.5897478710970112, + "grad_norm": 0.3805774855878043, + "learning_rate": 9.718797400733314e-06, + "loss": 0.0337, + "step": 1766 + }, + { + "epoch": 0.5900818166638838, + "grad_norm": 0.37939735746645925, + "learning_rate": 9.718154514609992e-06, + "loss": 0.0419, + "step": 1767 + }, + { + "epoch": 0.5904157622307564, + "grad_norm": 0.5746897366424192, + "learning_rate": 9.717510915753876e-06, + "loss": 0.0404, + "step": 1768 + }, + { + "epoch": 0.590749707797629, + "grad_norm": 0.6070537951285289, + "learning_rate": 9.716866604262189e-06, + "loss": 0.0447, + "step": 1769 + }, + { + "epoch": 0.5910836533645016, + "grad_norm": 0.3279231579739388, + "learning_rate": 9.716221580232261e-06, + "loss": 0.0336, + "step": 1770 + }, + { + "epoch": 0.5914175989313741, + "grad_norm": 0.6739511464001847, + "learning_rate": 9.715575843761534e-06, + "loss": 0.0368, + "step": 1771 + }, + { + "epoch": 0.5917515444982467, + "grad_norm": 0.4083766841062251, + "learning_rate": 9.714929394947548e-06, + "loss": 0.0408, + "step": 1772 + }, + { + "epoch": 0.5920854900651193, + "grad_norm": 0.6761259201718556, + "learning_rate": 9.714282233887962e-06, + "loss": 0.0498, + "step": 1773 + }, + { + "epoch": 0.592419435631992, + "grad_norm": 0.5520271218117264, + "learning_rate": 9.713634360680537e-06, + "loss": 0.0446, + "step": 1774 + }, + { + "epoch": 0.5927533811988646, + "grad_norm": 0.48495273660881766, + "learning_rate": 9.712985775423141e-06, + "loss": 0.0475, + "step": 1775 + }, + { + "epoch": 0.5930873267657372, + "grad_norm": 0.3852406336180792, + "learning_rate": 9.712336478213747e-06, + "loss": 0.0311, + "step": 1776 + }, + { + "epoch": 0.5934212723326098, + "grad_norm": 0.3499732013840638, + "learning_rate": 9.711686469150444e-06, + "loss": 0.0438, + "step": 1777 + }, + { + "epoch": 0.5937552178994824, + "grad_norm": 0.5017946237801338, + "learning_rate": 9.711035748331421e-06, + "loss": 0.0492, + "step": 1778 + }, + { + "epoch": 0.594089163466355, + "grad_norm": 0.2771548188234721, + "learning_rate": 9.710384315854977e-06, + "loss": 0.022, + "step": 1779 + }, + { + "epoch": 0.5944231090332276, + "grad_norm": 0.5861557106448326, + "learning_rate": 9.70973217181952e-06, + "loss": 0.0515, + "step": 1780 + }, + { + "epoch": 0.5947570546001002, + "grad_norm": 0.8050176980062403, + "learning_rate": 9.709079316323564e-06, + "loss": 0.053, + "step": 1781 + }, + { + "epoch": 0.5950910001669728, + "grad_norm": 0.4056848136430168, + "learning_rate": 9.70842574946573e-06, + "loss": 0.0362, + "step": 1782 + }, + { + "epoch": 0.5954249457338454, + "grad_norm": 0.45483119774188496, + "learning_rate": 9.707771471344744e-06, + "loss": 0.0325, + "step": 1783 + }, + { + "epoch": 0.595758891300718, + "grad_norm": 0.3715318388374774, + "learning_rate": 9.707116482059447e-06, + "loss": 0.0411, + "step": 1784 + }, + { + "epoch": 0.5960928368675906, + "grad_norm": 0.41928845236130025, + "learning_rate": 9.70646078170878e-06, + "loss": 0.0356, + "step": 1785 + }, + { + "epoch": 0.5964267824344632, + "grad_norm": 0.44261717206772755, + "learning_rate": 9.705804370391794e-06, + "loss": 0.0342, + "step": 1786 + }, + { + "epoch": 0.5967607280013357, + "grad_norm": 0.515469603257908, + "learning_rate": 9.705147248207652e-06, + "loss": 0.0399, + "step": 1787 + }, + { + "epoch": 0.5970946735682083, + "grad_norm": 0.4767092190313439, + "learning_rate": 9.704489415255614e-06, + "loss": 0.0389, + "step": 1788 + }, + { + "epoch": 0.5974286191350809, + "grad_norm": 0.5800879449898292, + "learning_rate": 9.703830871635057e-06, + "loss": 0.0322, + "step": 1789 + }, + { + "epoch": 0.5977625647019535, + "grad_norm": 0.3808283732863363, + "learning_rate": 9.703171617445461e-06, + "loss": 0.0291, + "step": 1790 + }, + { + "epoch": 0.5980965102688262, + "grad_norm": 0.5152911556416488, + "learning_rate": 9.702511652786414e-06, + "loss": 0.0386, + "step": 1791 + }, + { + "epoch": 0.5984304558356988, + "grad_norm": 0.4657296814566914, + "learning_rate": 9.701850977757611e-06, + "loss": 0.0399, + "step": 1792 + }, + { + "epoch": 0.5987644014025714, + "grad_norm": 0.4495711775100688, + "learning_rate": 9.701189592458858e-06, + "loss": 0.0395, + "step": 1793 + }, + { + "epoch": 0.599098346969444, + "grad_norm": 0.8654683483972361, + "learning_rate": 9.70052749699006e-06, + "loss": 0.0431, + "step": 1794 + }, + { + "epoch": 0.5994322925363166, + "grad_norm": 0.4101268642173975, + "learning_rate": 9.699864691451236e-06, + "loss": 0.0328, + "step": 1795 + }, + { + "epoch": 0.5997662381031892, + "grad_norm": 0.3536285767977456, + "learning_rate": 9.699201175942514e-06, + "loss": 0.0311, + "step": 1796 + }, + { + "epoch": 0.6001001836700618, + "grad_norm": 0.3590359858839266, + "learning_rate": 9.698536950564121e-06, + "loss": 0.0326, + "step": 1797 + }, + { + "epoch": 0.6004341292369344, + "grad_norm": 0.5365861834913674, + "learning_rate": 9.6978720154164e-06, + "loss": 0.0385, + "step": 1798 + }, + { + "epoch": 0.600768074803807, + "grad_norm": 0.5436229825155504, + "learning_rate": 9.697206370599793e-06, + "loss": 0.0433, + "step": 1799 + }, + { + "epoch": 0.6011020203706796, + "grad_norm": 0.5012686078311579, + "learning_rate": 9.696540016214857e-06, + "loss": 0.0428, + "step": 1800 + }, + { + "epoch": 0.6014359659375522, + "grad_norm": 0.3583759614222113, + "learning_rate": 9.695872952362253e-06, + "loss": 0.038, + "step": 1801 + }, + { + "epoch": 0.6017699115044248, + "grad_norm": 0.4743476178265279, + "learning_rate": 9.695205179142746e-06, + "loss": 0.045, + "step": 1802 + }, + { + "epoch": 0.6021038570712974, + "grad_norm": 0.42979322267761627, + "learning_rate": 9.694536696657213e-06, + "loss": 0.039, + "step": 1803 + }, + { + "epoch": 0.6024378026381699, + "grad_norm": 0.4923396538362408, + "learning_rate": 9.693867505006634e-06, + "loss": 0.0465, + "step": 1804 + }, + { + "epoch": 0.6027717482050425, + "grad_norm": 0.33772456017783375, + "learning_rate": 9.693197604292101e-06, + "loss": 0.0353, + "step": 1805 + }, + { + "epoch": 0.6031056937719151, + "grad_norm": 0.40009823146013773, + "learning_rate": 9.69252699461481e-06, + "loss": 0.0395, + "step": 1806 + }, + { + "epoch": 0.6034396393387877, + "grad_norm": 0.399579996705222, + "learning_rate": 9.691855676076064e-06, + "loss": 0.0433, + "step": 1807 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.3547509009946232, + "learning_rate": 9.691183648777271e-06, + "loss": 0.0436, + "step": 1808 + }, + { + "epoch": 0.604107530472533, + "grad_norm": 0.38969256891694476, + "learning_rate": 9.690510912819952e-06, + "loss": 0.0411, + "step": 1809 + }, + { + "epoch": 0.6044414760394056, + "grad_norm": 0.4858599541616899, + "learning_rate": 9.689837468305732e-06, + "loss": 0.0398, + "step": 1810 + }, + { + "epoch": 0.6047754216062782, + "grad_norm": 0.5323739845380792, + "learning_rate": 9.689163315336339e-06, + "loss": 0.0534, + "step": 1811 + }, + { + "epoch": 0.6051093671731508, + "grad_norm": 0.4411506877737185, + "learning_rate": 9.688488454013616e-06, + "loss": 0.0462, + "step": 1812 + }, + { + "epoch": 0.6054433127400234, + "grad_norm": 0.4860508883152515, + "learning_rate": 9.687812884439506e-06, + "loss": 0.0409, + "step": 1813 + }, + { + "epoch": 0.605777258306896, + "grad_norm": 0.42259518973967364, + "learning_rate": 9.687136606716064e-06, + "loss": 0.0327, + "step": 1814 + }, + { + "epoch": 0.6061112038737686, + "grad_norm": 0.3614066633082068, + "learning_rate": 9.686459620945445e-06, + "loss": 0.0296, + "step": 1815 + }, + { + "epoch": 0.6064451494406412, + "grad_norm": 0.47366380002731584, + "learning_rate": 9.685781927229923e-06, + "loss": 0.044, + "step": 1816 + }, + { + "epoch": 0.6067790950075138, + "grad_norm": 0.5981411902514698, + "learning_rate": 9.685103525671864e-06, + "loss": 0.0753, + "step": 1817 + }, + { + "epoch": 0.6071130405743864, + "grad_norm": 0.3675903668150694, + "learning_rate": 9.684424416373754e-06, + "loss": 0.0463, + "step": 1818 + }, + { + "epoch": 0.607446986141259, + "grad_norm": 0.4676757309110554, + "learning_rate": 9.683744599438178e-06, + "loss": 0.048, + "step": 1819 + }, + { + "epoch": 0.6077809317081315, + "grad_norm": 0.3693956960582205, + "learning_rate": 9.683064074967832e-06, + "loss": 0.0451, + "step": 1820 + }, + { + "epoch": 0.6081148772750041, + "grad_norm": 0.3194528730340604, + "learning_rate": 9.682382843065516e-06, + "loss": 0.0321, + "step": 1821 + }, + { + "epoch": 0.6084488228418767, + "grad_norm": 0.44144989718851135, + "learning_rate": 9.681700903834137e-06, + "loss": 0.0443, + "step": 1822 + }, + { + "epoch": 0.6087827684087493, + "grad_norm": 0.5664392510781382, + "learning_rate": 9.681018257376713e-06, + "loss": 0.0507, + "step": 1823 + }, + { + "epoch": 0.609116713975622, + "grad_norm": 0.4714622252126422, + "learning_rate": 9.680334903796363e-06, + "loss": 0.0488, + "step": 1824 + }, + { + "epoch": 0.6094506595424946, + "grad_norm": 0.38670578461387006, + "learning_rate": 9.679650843196318e-06, + "loss": 0.0298, + "step": 1825 + }, + { + "epoch": 0.6097846051093672, + "grad_norm": 0.44357125166347744, + "learning_rate": 9.678966075679909e-06, + "loss": 0.0454, + "step": 1826 + }, + { + "epoch": 0.6101185506762398, + "grad_norm": 0.28844122123191956, + "learning_rate": 9.678280601350584e-06, + "loss": 0.0271, + "step": 1827 + }, + { + "epoch": 0.6104524962431124, + "grad_norm": 0.3736854608954971, + "learning_rate": 9.67759442031189e-06, + "loss": 0.0338, + "step": 1828 + }, + { + "epoch": 0.610786441809985, + "grad_norm": 0.3957082301752809, + "learning_rate": 9.676907532667478e-06, + "loss": 0.0358, + "step": 1829 + }, + { + "epoch": 0.6111203873768576, + "grad_norm": 0.4430521380416934, + "learning_rate": 9.676219938521116e-06, + "loss": 0.0412, + "step": 1830 + }, + { + "epoch": 0.6114543329437302, + "grad_norm": 0.4629812439050685, + "learning_rate": 9.675531637976673e-06, + "loss": 0.0264, + "step": 1831 + }, + { + "epoch": 0.6117882785106028, + "grad_norm": 0.35940162870134673, + "learning_rate": 9.674842631138121e-06, + "loss": 0.0293, + "step": 1832 + }, + { + "epoch": 0.6121222240774754, + "grad_norm": 0.4267551169880035, + "learning_rate": 9.674152918109547e-06, + "loss": 0.0393, + "step": 1833 + }, + { + "epoch": 0.612456169644348, + "grad_norm": 0.5339190057612913, + "learning_rate": 9.673462498995138e-06, + "loss": 0.0439, + "step": 1834 + }, + { + "epoch": 0.6127901152112206, + "grad_norm": 0.39801476705213834, + "learning_rate": 9.672771373899192e-06, + "loss": 0.0355, + "step": 1835 + }, + { + "epoch": 0.6131240607780931, + "grad_norm": 0.4429084200500223, + "learning_rate": 9.672079542926108e-06, + "loss": 0.0376, + "step": 1836 + }, + { + "epoch": 0.6134580063449657, + "grad_norm": 0.67222515154493, + "learning_rate": 9.671387006180398e-06, + "loss": 0.0499, + "step": 1837 + }, + { + "epoch": 0.6137919519118383, + "grad_norm": 0.47779150626601224, + "learning_rate": 9.670693763766674e-06, + "loss": 0.0401, + "step": 1838 + }, + { + "epoch": 0.6141258974787109, + "grad_norm": 0.37426042285762723, + "learning_rate": 9.669999815789664e-06, + "loss": 0.0328, + "step": 1839 + }, + { + "epoch": 0.6144598430455835, + "grad_norm": 0.41636243829079883, + "learning_rate": 9.669305162354194e-06, + "loss": 0.0354, + "step": 1840 + }, + { + "epoch": 0.6147937886124561, + "grad_norm": 0.6226963095081265, + "learning_rate": 9.6686098035652e-06, + "loss": 0.0488, + "step": 1841 + }, + { + "epoch": 0.6151277341793288, + "grad_norm": 0.429397152236842, + "learning_rate": 9.667913739527724e-06, + "loss": 0.0426, + "step": 1842 + }, + { + "epoch": 0.6154616797462014, + "grad_norm": 0.4372126677348734, + "learning_rate": 9.667216970346916e-06, + "loss": 0.0483, + "step": 1843 + }, + { + "epoch": 0.615795625313074, + "grad_norm": 0.3967715291605297, + "learning_rate": 9.666519496128027e-06, + "loss": 0.0385, + "step": 1844 + }, + { + "epoch": 0.6161295708799466, + "grad_norm": 0.426735892267052, + "learning_rate": 9.665821316976423e-06, + "loss": 0.0358, + "step": 1845 + }, + { + "epoch": 0.6164635164468192, + "grad_norm": 0.36159372896020636, + "learning_rate": 9.665122432997571e-06, + "loss": 0.032, + "step": 1846 + }, + { + "epoch": 0.6167974620136918, + "grad_norm": 0.2725741607332122, + "learning_rate": 9.664422844297045e-06, + "loss": 0.0309, + "step": 1847 + }, + { + "epoch": 0.6171314075805644, + "grad_norm": 0.45038910214507577, + "learning_rate": 9.663722550980528e-06, + "loss": 0.0401, + "step": 1848 + }, + { + "epoch": 0.617465353147437, + "grad_norm": 0.42086104921704576, + "learning_rate": 9.663021553153805e-06, + "loss": 0.0348, + "step": 1849 + }, + { + "epoch": 0.6177992987143096, + "grad_norm": 0.4668187156562129, + "learning_rate": 9.66231985092277e-06, + "loss": 0.034, + "step": 1850 + }, + { + "epoch": 0.6181332442811822, + "grad_norm": 0.3986616199994234, + "learning_rate": 9.661617444393427e-06, + "loss": 0.0487, + "step": 1851 + }, + { + "epoch": 0.6184671898480548, + "grad_norm": 0.42521752469741725, + "learning_rate": 9.660914333671878e-06, + "loss": 0.032, + "step": 1852 + }, + { + "epoch": 0.6188011354149273, + "grad_norm": 0.5122186042752632, + "learning_rate": 9.66021051886434e-06, + "loss": 0.0447, + "step": 1853 + }, + { + "epoch": 0.6191350809817999, + "grad_norm": 0.376105460409327, + "learning_rate": 9.65950600007713e-06, + "loss": 0.0347, + "step": 1854 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 0.3477680883817906, + "learning_rate": 9.658800777416676e-06, + "loss": 0.0304, + "step": 1855 + }, + { + "epoch": 0.6198029721155451, + "grad_norm": 0.31072974884173854, + "learning_rate": 9.658094850989508e-06, + "loss": 0.0279, + "step": 1856 + }, + { + "epoch": 0.6201369176824177, + "grad_norm": 0.3841256211026492, + "learning_rate": 9.657388220902265e-06, + "loss": 0.0346, + "step": 1857 + }, + { + "epoch": 0.6204708632492903, + "grad_norm": 0.3313444417218464, + "learning_rate": 9.656680887261693e-06, + "loss": 0.026, + "step": 1858 + }, + { + "epoch": 0.620804808816163, + "grad_norm": 0.43367576990253265, + "learning_rate": 9.655972850174642e-06, + "loss": 0.04, + "step": 1859 + }, + { + "epoch": 0.6211387543830356, + "grad_norm": 0.34493385105284713, + "learning_rate": 9.65526410974807e-06, + "loss": 0.0275, + "step": 1860 + }, + { + "epoch": 0.6214726999499082, + "grad_norm": 0.47405934018109014, + "learning_rate": 9.65455466608904e-06, + "loss": 0.0343, + "step": 1861 + }, + { + "epoch": 0.6218066455167808, + "grad_norm": 0.4681075468104997, + "learning_rate": 9.653844519304722e-06, + "loss": 0.0416, + "step": 1862 + }, + { + "epoch": 0.6221405910836534, + "grad_norm": 0.5265841271768287, + "learning_rate": 9.653133669502393e-06, + "loss": 0.0524, + "step": 1863 + }, + { + "epoch": 0.622474536650526, + "grad_norm": 0.30744485928642573, + "learning_rate": 9.652422116789432e-06, + "loss": 0.0271, + "step": 1864 + }, + { + "epoch": 0.6228084822173986, + "grad_norm": 0.6876129662779562, + "learning_rate": 9.651709861273334e-06, + "loss": 0.0364, + "step": 1865 + }, + { + "epoch": 0.6231424277842712, + "grad_norm": 0.7125513818465257, + "learning_rate": 9.650996903061685e-06, + "loss": 0.0388, + "step": 1866 + }, + { + "epoch": 0.6234763733511438, + "grad_norm": 0.39487180334252686, + "learning_rate": 9.650283242262192e-06, + "loss": 0.0421, + "step": 1867 + }, + { + "epoch": 0.6238103189180164, + "grad_norm": 0.3642856274716189, + "learning_rate": 9.64956887898266e-06, + "loss": 0.0474, + "step": 1868 + }, + { + "epoch": 0.6241442644848889, + "grad_norm": 2.8861305941138524, + "learning_rate": 9.648853813331e-06, + "loss": 0.0386, + "step": 1869 + }, + { + "epoch": 0.6244782100517615, + "grad_norm": 0.5704609822104695, + "learning_rate": 9.648138045415236e-06, + "loss": 0.0568, + "step": 1870 + }, + { + "epoch": 0.6248121556186341, + "grad_norm": 0.3623440610754578, + "learning_rate": 9.647421575343488e-06, + "loss": 0.0318, + "step": 1871 + }, + { + "epoch": 0.6251461011855067, + "grad_norm": 0.8475867166920036, + "learning_rate": 9.646704403223991e-06, + "loss": 0.0313, + "step": 1872 + }, + { + "epoch": 0.6254800467523793, + "grad_norm": 0.3528685542346121, + "learning_rate": 9.64598652916508e-06, + "loss": 0.0315, + "step": 1873 + }, + { + "epoch": 0.6258139923192519, + "grad_norm": 0.30780792596204365, + "learning_rate": 9.6452679532752e-06, + "loss": 0.0336, + "step": 1874 + }, + { + "epoch": 0.6261479378861246, + "grad_norm": 0.6832366896758314, + "learning_rate": 9.644548675662897e-06, + "loss": 0.0432, + "step": 1875 + }, + { + "epoch": 0.6264818834529972, + "grad_norm": 0.36973569271153145, + "learning_rate": 9.64382869643683e-06, + "loss": 0.0334, + "step": 1876 + }, + { + "epoch": 0.6268158290198698, + "grad_norm": 0.5904473422843772, + "learning_rate": 9.64310801570576e-06, + "loss": 0.0605, + "step": 1877 + }, + { + "epoch": 0.6271497745867424, + "grad_norm": 0.49367437779993034, + "learning_rate": 9.642386633578553e-06, + "loss": 0.049, + "step": 1878 + }, + { + "epoch": 0.627483720153615, + "grad_norm": 0.6309937635353293, + "learning_rate": 9.641664550164182e-06, + "loss": 0.0481, + "step": 1879 + }, + { + "epoch": 0.6278176657204876, + "grad_norm": 0.32741957513574144, + "learning_rate": 9.640941765571727e-06, + "loss": 0.0337, + "step": 1880 + }, + { + "epoch": 0.6281516112873602, + "grad_norm": 0.25633710985612546, + "learning_rate": 9.640218279910374e-06, + "loss": 0.0281, + "step": 1881 + }, + { + "epoch": 0.6284855568542328, + "grad_norm": 0.38195096646212046, + "learning_rate": 9.639494093289412e-06, + "loss": 0.0358, + "step": 1882 + }, + { + "epoch": 0.6288195024211054, + "grad_norm": 0.5386835031032556, + "learning_rate": 9.638769205818239e-06, + "loss": 0.0361, + "step": 1883 + }, + { + "epoch": 0.629153447987978, + "grad_norm": 0.7679738802288255, + "learning_rate": 9.638043617606358e-06, + "loss": 0.0617, + "step": 1884 + }, + { + "epoch": 0.6294873935548505, + "grad_norm": 0.5460854814891332, + "learning_rate": 9.637317328763378e-06, + "loss": 0.0433, + "step": 1885 + }, + { + "epoch": 0.6298213391217231, + "grad_norm": 0.365060548357797, + "learning_rate": 9.636590339399012e-06, + "loss": 0.0362, + "step": 1886 + }, + { + "epoch": 0.6301552846885957, + "grad_norm": 0.3705326290040247, + "learning_rate": 9.63586264962308e-06, + "loss": 0.0328, + "step": 1887 + }, + { + "epoch": 0.6304892302554683, + "grad_norm": 0.5471497013132491, + "learning_rate": 9.635134259545511e-06, + "loss": 0.0375, + "step": 1888 + }, + { + "epoch": 0.6308231758223409, + "grad_norm": 0.5263577057665104, + "learning_rate": 9.634405169276335e-06, + "loss": 0.0506, + "step": 1889 + }, + { + "epoch": 0.6311571213892135, + "grad_norm": 0.4272692073087324, + "learning_rate": 9.63367537892569e-06, + "loss": 0.0427, + "step": 1890 + }, + { + "epoch": 0.6314910669560861, + "grad_norm": 0.5705137389135463, + "learning_rate": 9.63294488860382e-06, + "loss": 0.0349, + "step": 1891 + }, + { + "epoch": 0.6318250125229588, + "grad_norm": 0.3558713877577924, + "learning_rate": 9.63221369842107e-06, + "loss": 0.0278, + "step": 1892 + }, + { + "epoch": 0.6321589580898314, + "grad_norm": 0.4721988518594218, + "learning_rate": 9.631481808487902e-06, + "loss": 0.0361, + "step": 1893 + }, + { + "epoch": 0.632492903656704, + "grad_norm": 0.4480326110973692, + "learning_rate": 9.63074921891487e-06, + "loss": 0.0427, + "step": 1894 + }, + { + "epoch": 0.6328268492235766, + "grad_norm": 0.37188152907558825, + "learning_rate": 9.630015929812646e-06, + "loss": 0.0252, + "step": 1895 + }, + { + "epoch": 0.6331607947904492, + "grad_norm": 0.5563749487990718, + "learning_rate": 9.629281941291998e-06, + "loss": 0.0485, + "step": 1896 + }, + { + "epoch": 0.6334947403573218, + "grad_norm": 0.36670401543741943, + "learning_rate": 9.628547253463804e-06, + "loss": 0.0301, + "step": 1897 + }, + { + "epoch": 0.6338286859241944, + "grad_norm": 0.37224420585800755, + "learning_rate": 9.627811866439048e-06, + "loss": 0.0401, + "step": 1898 + }, + { + "epoch": 0.634162631491067, + "grad_norm": 0.4893359595269946, + "learning_rate": 9.627075780328818e-06, + "loss": 0.0379, + "step": 1899 + }, + { + "epoch": 0.6344965770579396, + "grad_norm": 0.43686684602558706, + "learning_rate": 9.626338995244313e-06, + "loss": 0.0418, + "step": 1900 + }, + { + "epoch": 0.6348305226248122, + "grad_norm": 0.40625611969226033, + "learning_rate": 9.625601511296826e-06, + "loss": 0.0456, + "step": 1901 + }, + { + "epoch": 0.6351644681916847, + "grad_norm": 0.5709668921816438, + "learning_rate": 9.624863328597767e-06, + "loss": 0.0438, + "step": 1902 + }, + { + "epoch": 0.6354984137585573, + "grad_norm": 0.3871929125084485, + "learning_rate": 9.624124447258647e-06, + "loss": 0.0342, + "step": 1903 + }, + { + "epoch": 0.6358323593254299, + "grad_norm": 0.4971974547489401, + "learning_rate": 9.62338486739108e-06, + "loss": 0.0574, + "step": 1904 + }, + { + "epoch": 0.6361663048923025, + "grad_norm": 0.6026356798550235, + "learning_rate": 9.62264458910679e-06, + "loss": 0.0485, + "step": 1905 + }, + { + "epoch": 0.6365002504591751, + "grad_norm": 0.5130463017123024, + "learning_rate": 9.621903612517608e-06, + "loss": 0.0375, + "step": 1906 + }, + { + "epoch": 0.6368341960260477, + "grad_norm": 0.3757677910039498, + "learning_rate": 9.621161937735463e-06, + "loss": 0.0357, + "step": 1907 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 0.3495116271000302, + "learning_rate": 9.620419564872394e-06, + "loss": 0.0441, + "step": 1908 + }, + { + "epoch": 0.637502087159793, + "grad_norm": 0.570418547019151, + "learning_rate": 9.619676494040547e-06, + "loss": 0.0395, + "step": 1909 + }, + { + "epoch": 0.6378360327266656, + "grad_norm": 0.5738128070519113, + "learning_rate": 9.61893272535217e-06, + "loss": 0.0378, + "step": 1910 + }, + { + "epoch": 0.6381699782935382, + "grad_norm": 0.6166375661027198, + "learning_rate": 9.618188258919618e-06, + "loss": 0.0611, + "step": 1911 + }, + { + "epoch": 0.6385039238604108, + "grad_norm": 0.4201294810770252, + "learning_rate": 9.617443094855354e-06, + "loss": 0.0369, + "step": 1912 + }, + { + "epoch": 0.6388378694272834, + "grad_norm": 0.5173332455124691, + "learning_rate": 9.61669723327194e-06, + "loss": 0.0519, + "step": 1913 + }, + { + "epoch": 0.639171814994156, + "grad_norm": 0.5160328548914549, + "learning_rate": 9.615950674282049e-06, + "loss": 0.0426, + "step": 1914 + }, + { + "epoch": 0.6395057605610286, + "grad_norm": 0.40673952670075453, + "learning_rate": 9.61520341799846e-06, + "loss": 0.0304, + "step": 1915 + }, + { + "epoch": 0.6398397061279012, + "grad_norm": 0.5900313878047575, + "learning_rate": 9.614455464534049e-06, + "loss": 0.0586, + "step": 1916 + }, + { + "epoch": 0.6401736516947738, + "grad_norm": 0.38570360482895305, + "learning_rate": 9.613706814001809e-06, + "loss": 0.0343, + "step": 1917 + }, + { + "epoch": 0.6405075972616463, + "grad_norm": 0.5842377236150416, + "learning_rate": 9.612957466514829e-06, + "loss": 0.037, + "step": 1918 + }, + { + "epoch": 0.6408415428285189, + "grad_norm": 0.4148470046802926, + "learning_rate": 9.61220742218631e-06, + "loss": 0.0426, + "step": 1919 + }, + { + "epoch": 0.6411754883953915, + "grad_norm": 0.43247246033021025, + "learning_rate": 9.61145668112955e-06, + "loss": 0.0362, + "step": 1920 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.890996448157842, + "learning_rate": 9.610705243457962e-06, + "loss": 0.0537, + "step": 1921 + }, + { + "epoch": 0.6418433795291367, + "grad_norm": 0.44996598549319694, + "learning_rate": 9.609953109285057e-06, + "loss": 0.0463, + "step": 1922 + }, + { + "epoch": 0.6421773250960093, + "grad_norm": 0.3806053279338398, + "learning_rate": 9.609200278724456e-06, + "loss": 0.0335, + "step": 1923 + }, + { + "epoch": 0.6425112706628819, + "grad_norm": 0.40527587523840664, + "learning_rate": 9.60844675188988e-06, + "loss": 0.045, + "step": 1924 + }, + { + "epoch": 0.6428452162297545, + "grad_norm": 0.5400479824298494, + "learning_rate": 9.60769252889516e-06, + "loss": 0.0495, + "step": 1925 + }, + { + "epoch": 0.6431791617966272, + "grad_norm": 0.41163884457514005, + "learning_rate": 9.606937609854227e-06, + "loss": 0.0395, + "step": 1926 + }, + { + "epoch": 0.6435131073634998, + "grad_norm": 0.4872094368071328, + "learning_rate": 9.606181994881124e-06, + "loss": 0.0514, + "step": 1927 + }, + { + "epoch": 0.6438470529303724, + "grad_norm": 0.48280423993985105, + "learning_rate": 9.605425684089998e-06, + "loss": 0.074, + "step": 1928 + }, + { + "epoch": 0.644180998497245, + "grad_norm": 0.4409583152553842, + "learning_rate": 9.604668677595093e-06, + "loss": 0.0388, + "step": 1929 + }, + { + "epoch": 0.6445149440641176, + "grad_norm": 0.3967310876918796, + "learning_rate": 9.603910975510764e-06, + "loss": 0.0368, + "step": 1930 + }, + { + "epoch": 0.6448488896309902, + "grad_norm": 0.4668003203945541, + "learning_rate": 9.603152577951476e-06, + "loss": 0.0543, + "step": 1931 + }, + { + "epoch": 0.6451828351978628, + "grad_norm": 0.32286948560483264, + "learning_rate": 9.60239348503179e-06, + "loss": 0.0244, + "step": 1932 + }, + { + "epoch": 0.6455167807647354, + "grad_norm": 0.3690852719449052, + "learning_rate": 9.601633696866376e-06, + "loss": 0.0323, + "step": 1933 + }, + { + "epoch": 0.6458507263316079, + "grad_norm": 0.5874667298049112, + "learning_rate": 9.60087321357001e-06, + "loss": 0.0403, + "step": 1934 + }, + { + "epoch": 0.6461846718984805, + "grad_norm": 0.42739345880339813, + "learning_rate": 9.600112035257571e-06, + "loss": 0.0401, + "step": 1935 + }, + { + "epoch": 0.6465186174653531, + "grad_norm": 0.3030639504292046, + "learning_rate": 9.599350162044045e-06, + "loss": 0.0365, + "step": 1936 + }, + { + "epoch": 0.6468525630322257, + "grad_norm": 0.3580636364891596, + "learning_rate": 9.598587594044522e-06, + "loss": 0.0319, + "step": 1937 + }, + { + "epoch": 0.6471865085990983, + "grad_norm": 0.36452760261258627, + "learning_rate": 9.597824331374196e-06, + "loss": 0.0311, + "step": 1938 + }, + { + "epoch": 0.6475204541659709, + "grad_norm": 0.4047003448041299, + "learning_rate": 9.597060374148365e-06, + "loss": 0.0348, + "step": 1939 + }, + { + "epoch": 0.6478543997328435, + "grad_norm": 0.29086789236359495, + "learning_rate": 9.596295722482439e-06, + "loss": 0.0264, + "step": 1940 + }, + { + "epoch": 0.6481883452997161, + "grad_norm": 0.47002828184210704, + "learning_rate": 9.595530376491924e-06, + "loss": 0.0343, + "step": 1941 + }, + { + "epoch": 0.6485222908665887, + "grad_norm": 0.3602197839006709, + "learning_rate": 9.594764336292432e-06, + "loss": 0.0427, + "step": 1942 + }, + { + "epoch": 0.6488562364334614, + "grad_norm": 0.4153593770291862, + "learning_rate": 9.593997601999689e-06, + "loss": 0.0375, + "step": 1943 + }, + { + "epoch": 0.649190182000334, + "grad_norm": 0.42311990704308083, + "learning_rate": 9.593230173729514e-06, + "loss": 0.0389, + "step": 1944 + }, + { + "epoch": 0.6495241275672066, + "grad_norm": 0.3166040022386767, + "learning_rate": 9.592462051597838e-06, + "loss": 0.0342, + "step": 1945 + }, + { + "epoch": 0.6498580731340792, + "grad_norm": 0.35294407201689076, + "learning_rate": 9.591693235720695e-06, + "loss": 0.0341, + "step": 1946 + }, + { + "epoch": 0.6501920187009518, + "grad_norm": 0.37311930946901645, + "learning_rate": 9.590923726214224e-06, + "loss": 0.0374, + "step": 1947 + }, + { + "epoch": 0.6505259642678244, + "grad_norm": 0.4463196511307658, + "learning_rate": 9.590153523194665e-06, + "loss": 0.0439, + "step": 1948 + }, + { + "epoch": 0.650859909834697, + "grad_norm": 0.31649078827997107, + "learning_rate": 9.589382626778371e-06, + "loss": 0.0288, + "step": 1949 + }, + { + "epoch": 0.6511938554015696, + "grad_norm": 0.3298840446407738, + "learning_rate": 9.588611037081793e-06, + "loss": 0.0278, + "step": 1950 + }, + { + "epoch": 0.6515278009684421, + "grad_norm": 0.5228733689269723, + "learning_rate": 9.587838754221488e-06, + "loss": 0.0475, + "step": 1951 + }, + { + "epoch": 0.6518617465353147, + "grad_norm": 0.6327059040983106, + "learning_rate": 9.587065778314119e-06, + "loss": 0.0521, + "step": 1952 + }, + { + "epoch": 0.6521956921021873, + "grad_norm": 0.2451710269110937, + "learning_rate": 9.586292109476454e-06, + "loss": 0.0202, + "step": 1953 + }, + { + "epoch": 0.6525296376690599, + "grad_norm": 0.8916205012323529, + "learning_rate": 9.585517747825363e-06, + "loss": 0.0633, + "step": 1954 + }, + { + "epoch": 0.6528635832359325, + "grad_norm": 0.4709021087758125, + "learning_rate": 9.584742693477825e-06, + "loss": 0.0318, + "step": 1955 + }, + { + "epoch": 0.6531975288028051, + "grad_norm": 0.5136557067976495, + "learning_rate": 9.58396694655092e-06, + "loss": 0.0595, + "step": 1956 + }, + { + "epoch": 0.6535314743696777, + "grad_norm": 0.4756999308077858, + "learning_rate": 9.583190507161832e-06, + "loss": 0.0316, + "step": 1957 + }, + { + "epoch": 0.6538654199365503, + "grad_norm": 0.7455511037822554, + "learning_rate": 9.582413375427852e-06, + "loss": 0.0368, + "step": 1958 + }, + { + "epoch": 0.654199365503423, + "grad_norm": 0.4754787105088287, + "learning_rate": 9.581635551466376e-06, + "loss": 0.0462, + "step": 1959 + }, + { + "epoch": 0.6545333110702956, + "grad_norm": 0.38003687303801914, + "learning_rate": 9.580857035394904e-06, + "loss": 0.0315, + "step": 1960 + }, + { + "epoch": 0.6548672566371682, + "grad_norm": 0.4533530798330268, + "learning_rate": 9.580077827331038e-06, + "loss": 0.0401, + "step": 1961 + }, + { + "epoch": 0.6552012022040408, + "grad_norm": 0.42545050057012723, + "learning_rate": 9.579297927392488e-06, + "loss": 0.0354, + "step": 1962 + }, + { + "epoch": 0.6555351477709134, + "grad_norm": 0.49139603401747467, + "learning_rate": 9.578517335697065e-06, + "loss": 0.0453, + "step": 1963 + }, + { + "epoch": 0.655869093337786, + "grad_norm": 0.46637109731071646, + "learning_rate": 9.577736052362689e-06, + "loss": 0.0442, + "step": 1964 + }, + { + "epoch": 0.6562030389046586, + "grad_norm": 0.299441958264375, + "learning_rate": 9.576954077507381e-06, + "loss": 0.0278, + "step": 1965 + }, + { + "epoch": 0.6565369844715312, + "grad_norm": 0.5657838188124532, + "learning_rate": 9.576171411249269e-06, + "loss": 0.0441, + "step": 1966 + }, + { + "epoch": 0.6568709300384037, + "grad_norm": 0.40584154730467964, + "learning_rate": 9.575388053706582e-06, + "loss": 0.0381, + "step": 1967 + }, + { + "epoch": 0.6572048756052763, + "grad_norm": 0.4436611950009847, + "learning_rate": 9.574604004997654e-06, + "loss": 0.039, + "step": 1968 + }, + { + "epoch": 0.6575388211721489, + "grad_norm": 0.5730574727726172, + "learning_rate": 9.57381926524093e-06, + "loss": 0.0661, + "step": 1969 + }, + { + "epoch": 0.6578727667390215, + "grad_norm": 0.31332708853227675, + "learning_rate": 9.57303383455495e-06, + "loss": 0.0273, + "step": 1970 + }, + { + "epoch": 0.6582067123058941, + "grad_norm": 0.3980333285692315, + "learning_rate": 9.572247713058362e-06, + "loss": 0.0378, + "step": 1971 + }, + { + "epoch": 0.6585406578727667, + "grad_norm": 0.3250619162709416, + "learning_rate": 9.571460900869923e-06, + "loss": 0.0364, + "step": 1972 + }, + { + "epoch": 0.6588746034396393, + "grad_norm": 0.533505317222907, + "learning_rate": 9.570673398108485e-06, + "loss": 0.0459, + "step": 1973 + }, + { + "epoch": 0.6592085490065119, + "grad_norm": 0.44470417319471883, + "learning_rate": 9.569885204893015e-06, + "loss": 0.0591, + "step": 1974 + }, + { + "epoch": 0.6595424945733845, + "grad_norm": 0.3008623607862951, + "learning_rate": 9.569096321342574e-06, + "loss": 0.0302, + "step": 1975 + }, + { + "epoch": 0.6598764401402571, + "grad_norm": 0.431796799584234, + "learning_rate": 9.568306747576335e-06, + "loss": 0.0369, + "step": 1976 + }, + { + "epoch": 0.6602103857071298, + "grad_norm": 0.3580539264016544, + "learning_rate": 9.567516483713572e-06, + "loss": 0.0314, + "step": 1977 + }, + { + "epoch": 0.6605443312740024, + "grad_norm": 0.4010486344232972, + "learning_rate": 9.566725529873664e-06, + "loss": 0.039, + "step": 1978 + }, + { + "epoch": 0.660878276840875, + "grad_norm": 0.4370058000801178, + "learning_rate": 9.565933886176093e-06, + "loss": 0.0318, + "step": 1979 + }, + { + "epoch": 0.6612122224077476, + "grad_norm": 0.43869235801488754, + "learning_rate": 9.565141552740445e-06, + "loss": 0.0378, + "step": 1980 + }, + { + "epoch": 0.6615461679746202, + "grad_norm": 0.34181395068932374, + "learning_rate": 9.564348529686413e-06, + "loss": 0.0296, + "step": 1981 + }, + { + "epoch": 0.6618801135414928, + "grad_norm": 0.39098423304700447, + "learning_rate": 9.563554817133794e-06, + "loss": 0.039, + "step": 1982 + }, + { + "epoch": 0.6622140591083653, + "grad_norm": 0.32495353931107385, + "learning_rate": 9.562760415202483e-06, + "loss": 0.0292, + "step": 1983 + }, + { + "epoch": 0.6625480046752379, + "grad_norm": 0.2823617434602211, + "learning_rate": 9.56196532401249e-06, + "loss": 0.0262, + "step": 1984 + }, + { + "epoch": 0.6628819502421105, + "grad_norm": 0.5311234770914175, + "learning_rate": 9.561169543683917e-06, + "loss": 0.0389, + "step": 1985 + }, + { + "epoch": 0.6632158958089831, + "grad_norm": 0.3750951453671438, + "learning_rate": 9.560373074336977e-06, + "loss": 0.0425, + "step": 1986 + }, + { + "epoch": 0.6635498413758557, + "grad_norm": 0.44916711941040793, + "learning_rate": 9.55957591609199e-06, + "loss": 0.0381, + "step": 1987 + }, + { + "epoch": 0.6638837869427283, + "grad_norm": 0.44845371539395534, + "learning_rate": 9.558778069069373e-06, + "loss": 0.0416, + "step": 1988 + }, + { + "epoch": 0.6642177325096009, + "grad_norm": 0.4486434168528874, + "learning_rate": 9.55797953338965e-06, + "loss": 0.0313, + "step": 1989 + }, + { + "epoch": 0.6645516780764735, + "grad_norm": 0.5988333882434549, + "learning_rate": 9.55718030917345e-06, + "loss": 0.0424, + "step": 1990 + }, + { + "epoch": 0.6648856236433461, + "grad_norm": 0.4449319242672763, + "learning_rate": 9.556380396541507e-06, + "loss": 0.0374, + "step": 1991 + }, + { + "epoch": 0.6652195692102187, + "grad_norm": 0.657520332652472, + "learning_rate": 9.555579795614654e-06, + "loss": 0.0414, + "step": 1992 + }, + { + "epoch": 0.6655535147770913, + "grad_norm": 0.6832267111237972, + "learning_rate": 9.554778506513834e-06, + "loss": 0.0524, + "step": 1993 + }, + { + "epoch": 0.665887460343964, + "grad_norm": 0.5302737077362314, + "learning_rate": 9.553976529360087e-06, + "loss": 0.0663, + "step": 1994 + }, + { + "epoch": 0.6662214059108366, + "grad_norm": 0.37787301151681557, + "learning_rate": 9.553173864274567e-06, + "loss": 0.0293, + "step": 1995 + }, + { + "epoch": 0.6665553514777092, + "grad_norm": 0.7179998805532395, + "learning_rate": 9.552370511378522e-06, + "loss": 0.0401, + "step": 1996 + }, + { + "epoch": 0.6668892970445818, + "grad_norm": 0.6405562511755868, + "learning_rate": 9.551566470793308e-06, + "loss": 0.0435, + "step": 1997 + }, + { + "epoch": 0.6672232426114544, + "grad_norm": 0.3965480294280699, + "learning_rate": 9.550761742640387e-06, + "loss": 0.0374, + "step": 1998 + }, + { + "epoch": 0.667557188178327, + "grad_norm": 0.4954674875869502, + "learning_rate": 9.549956327041318e-06, + "loss": 0.0405, + "step": 1999 + }, + { + "epoch": 0.6678911337451995, + "grad_norm": 0.38398997797274265, + "learning_rate": 9.549150224117776e-06, + "loss": 0.0279, + "step": 2000 + }, + { + "epoch": 0.6682250793120721, + "grad_norm": 0.35076214100338154, + "learning_rate": 9.548343433991524e-06, + "loss": 0.0285, + "step": 2001 + }, + { + "epoch": 0.6685590248789447, + "grad_norm": 0.5925647380158942, + "learning_rate": 9.547535956784445e-06, + "loss": 0.0609, + "step": 2002 + }, + { + "epoch": 0.6688929704458173, + "grad_norm": 0.5143382525893382, + "learning_rate": 9.546727792618512e-06, + "loss": 0.0411, + "step": 2003 + }, + { + "epoch": 0.6692269160126899, + "grad_norm": 0.43516019025805047, + "learning_rate": 9.545918941615811e-06, + "loss": 0.0477, + "step": 2004 + }, + { + "epoch": 0.6695608615795625, + "grad_norm": 0.5467950491101975, + "learning_rate": 9.545109403898527e-06, + "loss": 0.0468, + "step": 2005 + }, + { + "epoch": 0.6698948071464351, + "grad_norm": 0.3388940167646659, + "learning_rate": 9.544299179588952e-06, + "loss": 0.0323, + "step": 2006 + }, + { + "epoch": 0.6702287527133077, + "grad_norm": 0.47902655682264245, + "learning_rate": 9.543488268809478e-06, + "loss": 0.0368, + "step": 2007 + }, + { + "epoch": 0.6705626982801803, + "grad_norm": 0.4270736708549859, + "learning_rate": 9.542676671682601e-06, + "loss": 0.0426, + "step": 2008 + }, + { + "epoch": 0.6708966438470529, + "grad_norm": 0.6044442787017646, + "learning_rate": 9.541864388330926e-06, + "loss": 0.0484, + "step": 2009 + }, + { + "epoch": 0.6712305894139255, + "grad_norm": 0.4259121663848109, + "learning_rate": 9.541051418877156e-06, + "loss": 0.0377, + "step": 2010 + }, + { + "epoch": 0.6715645349807982, + "grad_norm": 0.3568645051200129, + "learning_rate": 9.5402377634441e-06, + "loss": 0.0376, + "step": 2011 + }, + { + "epoch": 0.6718984805476708, + "grad_norm": 0.35494036371640764, + "learning_rate": 9.539423422154672e-06, + "loss": 0.0225, + "step": 2012 + }, + { + "epoch": 0.6722324261145434, + "grad_norm": 0.3154795081643971, + "learning_rate": 9.538608395131884e-06, + "loss": 0.0378, + "step": 2013 + }, + { + "epoch": 0.672566371681416, + "grad_norm": 0.39098881138831215, + "learning_rate": 9.537792682498859e-06, + "loss": 0.0373, + "step": 2014 + }, + { + "epoch": 0.6729003172482886, + "grad_norm": 0.4566591398312619, + "learning_rate": 9.536976284378818e-06, + "loss": 0.0484, + "step": 2015 + }, + { + "epoch": 0.6732342628151611, + "grad_norm": 0.34076311960195427, + "learning_rate": 9.536159200895088e-06, + "loss": 0.0429, + "step": 2016 + }, + { + "epoch": 0.6735682083820337, + "grad_norm": 0.42658190950359615, + "learning_rate": 9.535341432171098e-06, + "loss": 0.0375, + "step": 2017 + }, + { + "epoch": 0.6739021539489063, + "grad_norm": 0.4276502107061724, + "learning_rate": 9.534522978330384e-06, + "loss": 0.035, + "step": 2018 + }, + { + "epoch": 0.6742360995157789, + "grad_norm": 0.38321912147980164, + "learning_rate": 9.533703839496581e-06, + "loss": 0.0334, + "step": 2019 + }, + { + "epoch": 0.6745700450826515, + "grad_norm": 0.36958472601309506, + "learning_rate": 9.532884015793432e-06, + "loss": 0.0288, + "step": 2020 + }, + { + "epoch": 0.6749039906495241, + "grad_norm": 0.4357493497620933, + "learning_rate": 9.532063507344777e-06, + "loss": 0.0345, + "step": 2021 + }, + { + "epoch": 0.6752379362163967, + "grad_norm": 0.41201588517815246, + "learning_rate": 9.53124231427457e-06, + "loss": 0.0474, + "step": 2022 + }, + { + "epoch": 0.6755718817832693, + "grad_norm": 0.42070337228935134, + "learning_rate": 9.530420436706853e-06, + "loss": 0.0378, + "step": 2023 + }, + { + "epoch": 0.6759058273501419, + "grad_norm": 0.47467486330484465, + "learning_rate": 9.529597874765788e-06, + "loss": 0.0398, + "step": 2024 + }, + { + "epoch": 0.6762397729170145, + "grad_norm": 0.5304964057091106, + "learning_rate": 9.528774628575628e-06, + "loss": 0.0522, + "step": 2025 + }, + { + "epoch": 0.6765737184838871, + "grad_norm": 0.41027979340530846, + "learning_rate": 9.527950698260737e-06, + "loss": 0.04, + "step": 2026 + }, + { + "epoch": 0.6769076640507597, + "grad_norm": 0.798364895576848, + "learning_rate": 9.527126083945578e-06, + "loss": 0.0472, + "step": 2027 + }, + { + "epoch": 0.6772416096176324, + "grad_norm": 0.5383452982596113, + "learning_rate": 9.526300785754719e-06, + "loss": 0.0412, + "step": 2028 + }, + { + "epoch": 0.677575555184505, + "grad_norm": 0.6616398683443208, + "learning_rate": 9.525474803812831e-06, + "loss": 0.041, + "step": 2029 + }, + { + "epoch": 0.6779095007513776, + "grad_norm": 0.4049436603518837, + "learning_rate": 9.524648138244688e-06, + "loss": 0.043, + "step": 2030 + }, + { + "epoch": 0.6782434463182502, + "grad_norm": 0.4484259211351194, + "learning_rate": 9.523820789175167e-06, + "loss": 0.0305, + "step": 2031 + }, + { + "epoch": 0.6785773918851227, + "grad_norm": 0.3512661433794848, + "learning_rate": 9.52299275672925e-06, + "loss": 0.0329, + "step": 2032 + }, + { + "epoch": 0.6789113374519953, + "grad_norm": 0.6493754131574312, + "learning_rate": 9.52216404103202e-06, + "loss": 0.0336, + "step": 2033 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.7535163293104985, + "learning_rate": 9.521334642208666e-06, + "loss": 0.0413, + "step": 2034 + }, + { + "epoch": 0.6795792285857405, + "grad_norm": 0.49326517121601776, + "learning_rate": 9.520504560384476e-06, + "loss": 0.045, + "step": 2035 + }, + { + "epoch": 0.6799131741526131, + "grad_norm": 0.41120052266043067, + "learning_rate": 9.519673795684845e-06, + "loss": 0.0266, + "step": 2036 + }, + { + "epoch": 0.6802471197194857, + "grad_norm": 0.6140177963518832, + "learning_rate": 9.518842348235271e-06, + "loss": 0.0483, + "step": 2037 + }, + { + "epoch": 0.6805810652863583, + "grad_norm": 0.4725025346002652, + "learning_rate": 9.51801021816135e-06, + "loss": 0.0407, + "step": 2038 + }, + { + "epoch": 0.6809150108532309, + "grad_norm": 0.41261462649031916, + "learning_rate": 9.51717740558879e-06, + "loss": 0.0268, + "step": 2039 + }, + { + "epoch": 0.6812489564201035, + "grad_norm": 0.3707450546827718, + "learning_rate": 9.516343910643395e-06, + "loss": 0.0458, + "step": 2040 + }, + { + "epoch": 0.6815829019869761, + "grad_norm": 0.2771729301031353, + "learning_rate": 9.515509733451074e-06, + "loss": 0.0274, + "step": 2041 + }, + { + "epoch": 0.6819168475538487, + "grad_norm": 0.32940546739682114, + "learning_rate": 9.514674874137838e-06, + "loss": 0.0337, + "step": 2042 + }, + { + "epoch": 0.6822507931207213, + "grad_norm": 0.3734831314096347, + "learning_rate": 9.513839332829806e-06, + "loss": 0.0362, + "step": 2043 + }, + { + "epoch": 0.682584738687594, + "grad_norm": 0.4834078783984118, + "learning_rate": 9.513003109653192e-06, + "loss": 0.0374, + "step": 2044 + }, + { + "epoch": 0.6829186842544666, + "grad_norm": 0.3588811997999953, + "learning_rate": 9.512166204734322e-06, + "loss": 0.0307, + "step": 2045 + }, + { + "epoch": 0.6832526298213392, + "grad_norm": 0.325964726603329, + "learning_rate": 9.511328618199614e-06, + "loss": 0.0324, + "step": 2046 + }, + { + "epoch": 0.6835865753882118, + "grad_norm": 0.8229699925882626, + "learning_rate": 9.510490350175602e-06, + "loss": 0.0628, + "step": 2047 + }, + { + "epoch": 0.6839205209550844, + "grad_norm": 0.30887054089282684, + "learning_rate": 9.50965140078891e-06, + "loss": 0.0419, + "step": 2048 + }, + { + "epoch": 0.6842544665219569, + "grad_norm": 0.39605664376160044, + "learning_rate": 9.508811770166277e-06, + "loss": 0.0382, + "step": 2049 + }, + { + "epoch": 0.6845884120888295, + "grad_norm": 0.38009363458721646, + "learning_rate": 9.507971458434538e-06, + "loss": 0.0317, + "step": 2050 + }, + { + "epoch": 0.6849223576557021, + "grad_norm": 0.31482364804749247, + "learning_rate": 9.507130465720628e-06, + "loss": 0.0291, + "step": 2051 + }, + { + "epoch": 0.6852563032225747, + "grad_norm": 0.2793468062451066, + "learning_rate": 9.506288792151592e-06, + "loss": 0.0268, + "step": 2052 + }, + { + "epoch": 0.6855902487894473, + "grad_norm": 0.3838063300395601, + "learning_rate": 9.505446437854574e-06, + "loss": 0.0312, + "step": 2053 + }, + { + "epoch": 0.6859241943563199, + "grad_norm": 0.5550332249747015, + "learning_rate": 9.504603402956823e-06, + "loss": 0.0464, + "step": 2054 + }, + { + "epoch": 0.6862581399231925, + "grad_norm": 0.27606317902607436, + "learning_rate": 9.503759687585686e-06, + "loss": 0.0319, + "step": 2055 + }, + { + "epoch": 0.6865920854900651, + "grad_norm": 0.437480194017149, + "learning_rate": 9.50291529186862e-06, + "loss": 0.0351, + "step": 2056 + }, + { + "epoch": 0.6869260310569377, + "grad_norm": 0.3541870244307474, + "learning_rate": 9.502070215933177e-06, + "loss": 0.0328, + "step": 2057 + }, + { + "epoch": 0.6872599766238103, + "grad_norm": 0.4000587433984851, + "learning_rate": 9.501224459907019e-06, + "loss": 0.0334, + "step": 2058 + }, + { + "epoch": 0.6875939221906829, + "grad_norm": 0.32412375991716214, + "learning_rate": 9.500378023917906e-06, + "loss": 0.0288, + "step": 2059 + }, + { + "epoch": 0.6879278677575555, + "grad_norm": 0.39624546639503, + "learning_rate": 9.499530908093702e-06, + "loss": 0.0437, + "step": 2060 + }, + { + "epoch": 0.6882618133244282, + "grad_norm": 0.4105246708870515, + "learning_rate": 9.498683112562374e-06, + "loss": 0.036, + "step": 2061 + }, + { + "epoch": 0.6885957588913008, + "grad_norm": 0.3299231249871794, + "learning_rate": 9.497834637451992e-06, + "loss": 0.0322, + "step": 2062 + }, + { + "epoch": 0.6889297044581734, + "grad_norm": 0.366774085118686, + "learning_rate": 9.496985482890728e-06, + "loss": 0.0319, + "step": 2063 + }, + { + "epoch": 0.689263650025046, + "grad_norm": 0.4672075079573483, + "learning_rate": 9.496135649006857e-06, + "loss": 0.0345, + "step": 2064 + }, + { + "epoch": 0.6895975955919185, + "grad_norm": 0.3404779492995906, + "learning_rate": 9.495285135928755e-06, + "loss": 0.0333, + "step": 2065 + }, + { + "epoch": 0.6899315411587911, + "grad_norm": 0.30411140383949004, + "learning_rate": 9.494433943784901e-06, + "loss": 0.0336, + "step": 2066 + }, + { + "epoch": 0.6902654867256637, + "grad_norm": 0.3145670236628234, + "learning_rate": 9.493582072703883e-06, + "loss": 0.0348, + "step": 2067 + }, + { + "epoch": 0.6905994322925363, + "grad_norm": 0.3767704383684599, + "learning_rate": 9.49272952281438e-06, + "loss": 0.0368, + "step": 2068 + }, + { + "epoch": 0.6909333778594089, + "grad_norm": 0.36483330422600235, + "learning_rate": 9.491876294245184e-06, + "loss": 0.0315, + "step": 2069 + }, + { + "epoch": 0.6912673234262815, + "grad_norm": 0.6576254105862623, + "learning_rate": 9.491022387125183e-06, + "loss": 0.0356, + "step": 2070 + }, + { + "epoch": 0.6916012689931541, + "grad_norm": 0.3700579958974638, + "learning_rate": 9.490167801583373e-06, + "loss": 0.0324, + "step": 2071 + }, + { + "epoch": 0.6919352145600267, + "grad_norm": 0.36934678168549157, + "learning_rate": 9.489312537748843e-06, + "loss": 0.0376, + "step": 2072 + }, + { + "epoch": 0.6922691601268993, + "grad_norm": 0.37061885576754783, + "learning_rate": 9.488456595750795e-06, + "loss": 0.0364, + "step": 2073 + }, + { + "epoch": 0.6926031056937719, + "grad_norm": 0.5357535933097675, + "learning_rate": 9.487599975718529e-06, + "loss": 0.0457, + "step": 2074 + }, + { + "epoch": 0.6929370512606445, + "grad_norm": 0.331179966616601, + "learning_rate": 9.486742677781446e-06, + "loss": 0.0322, + "step": 2075 + }, + { + "epoch": 0.6932709968275171, + "grad_norm": 0.4479123448178622, + "learning_rate": 9.485884702069053e-06, + "loss": 0.0529, + "step": 2076 + }, + { + "epoch": 0.6936049423943897, + "grad_norm": 0.4043620854538527, + "learning_rate": 9.485026048710957e-06, + "loss": 0.035, + "step": 2077 + }, + { + "epoch": 0.6939388879612624, + "grad_norm": 0.3374418056381048, + "learning_rate": 9.484166717836865e-06, + "loss": 0.0343, + "step": 2078 + }, + { + "epoch": 0.694272833528135, + "grad_norm": 0.3844254319088217, + "learning_rate": 9.48330670957659e-06, + "loss": 0.0293, + "step": 2079 + }, + { + "epoch": 0.6946067790950076, + "grad_norm": 0.3985114019469326, + "learning_rate": 9.48244602406005e-06, + "loss": 0.0467, + "step": 2080 + }, + { + "epoch": 0.6949407246618801, + "grad_norm": 0.5016675984334057, + "learning_rate": 9.481584661417258e-06, + "loss": 0.0358, + "step": 2081 + }, + { + "epoch": 0.6952746702287527, + "grad_norm": 0.35840475345410083, + "learning_rate": 9.480722621778334e-06, + "loss": 0.036, + "step": 2082 + }, + { + "epoch": 0.6956086157956253, + "grad_norm": 0.28766872101826946, + "learning_rate": 9.479859905273498e-06, + "loss": 0.0273, + "step": 2083 + }, + { + "epoch": 0.6959425613624979, + "grad_norm": 0.6414607344065364, + "learning_rate": 9.478996512033074e-06, + "loss": 0.0593, + "step": 2084 + }, + { + "epoch": 0.6962765069293705, + "grad_norm": 0.4752069954799563, + "learning_rate": 9.478132442187491e-06, + "loss": 0.0337, + "step": 2085 + }, + { + "epoch": 0.6966104524962431, + "grad_norm": 0.39948088189108805, + "learning_rate": 9.477267695867275e-06, + "loss": 0.0372, + "step": 2086 + }, + { + "epoch": 0.6969443980631157, + "grad_norm": 0.4082284378453413, + "learning_rate": 9.476402273203052e-06, + "loss": 0.0389, + "step": 2087 + }, + { + "epoch": 0.6972783436299883, + "grad_norm": 0.36586269541710464, + "learning_rate": 9.47553617432556e-06, + "loss": 0.0415, + "step": 2088 + }, + { + "epoch": 0.6976122891968609, + "grad_norm": 0.45222951343907314, + "learning_rate": 9.47466939936563e-06, + "loss": 0.0407, + "step": 2089 + }, + { + "epoch": 0.6979462347637335, + "grad_norm": 0.3799030291064665, + "learning_rate": 9.473801948454199e-06, + "loss": 0.0316, + "step": 2090 + }, + { + "epoch": 0.6982801803306061, + "grad_norm": 0.3589556260093423, + "learning_rate": 9.472933821722307e-06, + "loss": 0.0404, + "step": 2091 + }, + { + "epoch": 0.6986141258974787, + "grad_norm": 0.3058433466630493, + "learning_rate": 9.472065019301095e-06, + "loss": 0.0276, + "step": 2092 + }, + { + "epoch": 0.6989480714643513, + "grad_norm": 0.4957747306104084, + "learning_rate": 9.471195541321805e-06, + "loss": 0.0365, + "step": 2093 + }, + { + "epoch": 0.699282017031224, + "grad_norm": 0.3660253521415888, + "learning_rate": 9.470325387915782e-06, + "loss": 0.0388, + "step": 2094 + }, + { + "epoch": 0.6996159625980966, + "grad_norm": 0.3969797950662937, + "learning_rate": 9.469454559214473e-06, + "loss": 0.0432, + "step": 2095 + }, + { + "epoch": 0.6999499081649692, + "grad_norm": 0.553351992055136, + "learning_rate": 9.468583055349425e-06, + "loss": 0.0394, + "step": 2096 + }, + { + "epoch": 0.7002838537318417, + "grad_norm": 0.3506807419904832, + "learning_rate": 9.467710876452292e-06, + "loss": 0.0418, + "step": 2097 + }, + { + "epoch": 0.7006177992987143, + "grad_norm": 0.5097933681573146, + "learning_rate": 9.466838022654826e-06, + "loss": 0.0371, + "step": 2098 + }, + { + "epoch": 0.7009517448655869, + "grad_norm": 0.4204143743306447, + "learning_rate": 9.465964494088879e-06, + "loss": 0.036, + "step": 2099 + }, + { + "epoch": 0.7012856904324595, + "grad_norm": 0.3803661985442615, + "learning_rate": 9.465090290886411e-06, + "loss": 0.0406, + "step": 2100 + }, + { + "epoch": 0.7016196359993321, + "grad_norm": 0.4085602290339402, + "learning_rate": 9.464215413179483e-06, + "loss": 0.0504, + "step": 2101 + }, + { + "epoch": 0.7019535815662047, + "grad_norm": 0.48582041271567966, + "learning_rate": 9.46333986110025e-06, + "loss": 0.0376, + "step": 2102 + }, + { + "epoch": 0.7022875271330773, + "grad_norm": 0.3169719380753962, + "learning_rate": 9.462463634780977e-06, + "loss": 0.0338, + "step": 2103 + }, + { + "epoch": 0.7026214726999499, + "grad_norm": 0.3126122735122749, + "learning_rate": 9.461586734354027e-06, + "loss": 0.024, + "step": 2104 + }, + { + "epoch": 0.7029554182668225, + "grad_norm": 0.66368486085725, + "learning_rate": 9.460709159951867e-06, + "loss": 0.0608, + "step": 2105 + }, + { + "epoch": 0.7032893638336951, + "grad_norm": 0.33121278172693575, + "learning_rate": 9.459830911707066e-06, + "loss": 0.0282, + "step": 2106 + }, + { + "epoch": 0.7036233094005677, + "grad_norm": 0.4406857934513667, + "learning_rate": 9.458951989752295e-06, + "loss": 0.0339, + "step": 2107 + }, + { + "epoch": 0.7039572549674403, + "grad_norm": 0.5133036530910118, + "learning_rate": 9.458072394220321e-06, + "loss": 0.043, + "step": 2108 + }, + { + "epoch": 0.7042912005343129, + "grad_norm": 0.30461348264628313, + "learning_rate": 9.457192125244021e-06, + "loss": 0.0245, + "step": 2109 + }, + { + "epoch": 0.7046251461011855, + "grad_norm": 0.34330988923124833, + "learning_rate": 9.456311182956368e-06, + "loss": 0.0308, + "step": 2110 + }, + { + "epoch": 0.7049590916680581, + "grad_norm": 0.32911770953151354, + "learning_rate": 9.45542956749044e-06, + "loss": 0.0328, + "step": 2111 + }, + { + "epoch": 0.7052930372349308, + "grad_norm": 0.6083984422933292, + "learning_rate": 9.454547278979415e-06, + "loss": 0.0406, + "step": 2112 + }, + { + "epoch": 0.7056269828018034, + "grad_norm": 0.3185936140057806, + "learning_rate": 9.453664317556572e-06, + "loss": 0.0274, + "step": 2113 + }, + { + "epoch": 0.7059609283686759, + "grad_norm": 0.48570418213064875, + "learning_rate": 9.452780683355295e-06, + "loss": 0.0373, + "step": 2114 + }, + { + "epoch": 0.7062948739355485, + "grad_norm": 0.3778477949862705, + "learning_rate": 9.451896376509065e-06, + "loss": 0.0336, + "step": 2115 + }, + { + "epoch": 0.7066288195024211, + "grad_norm": 0.3185035797864648, + "learning_rate": 9.451011397151469e-06, + "loss": 0.0282, + "step": 2116 + }, + { + "epoch": 0.7069627650692937, + "grad_norm": 0.32082110489545806, + "learning_rate": 9.450125745416191e-06, + "loss": 0.0303, + "step": 2117 + }, + { + "epoch": 0.7072967106361663, + "grad_norm": 0.5487871290248476, + "learning_rate": 9.44923942143702e-06, + "loss": 0.0467, + "step": 2118 + }, + { + "epoch": 0.7076306562030389, + "grad_norm": 0.7642656430859089, + "learning_rate": 9.448352425347848e-06, + "loss": 0.0599, + "step": 2119 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 0.41308813145148743, + "learning_rate": 9.447464757282665e-06, + "loss": 0.0468, + "step": 2120 + }, + { + "epoch": 0.7082985473367841, + "grad_norm": 0.5125691388816903, + "learning_rate": 9.44657641737556e-06, + "loss": 0.0381, + "step": 2121 + }, + { + "epoch": 0.7086324929036567, + "grad_norm": 0.30328822423975166, + "learning_rate": 9.445687405760735e-06, + "loss": 0.0367, + "step": 2122 + }, + { + "epoch": 0.7089664384705293, + "grad_norm": 0.3251459274141834, + "learning_rate": 9.444797722572479e-06, + "loss": 0.0361, + "step": 2123 + }, + { + "epoch": 0.7093003840374019, + "grad_norm": 0.929693515245696, + "learning_rate": 9.44390736794519e-06, + "loss": 0.0552, + "step": 2124 + }, + { + "epoch": 0.7096343296042745, + "grad_norm": 0.3665625751767317, + "learning_rate": 9.443016342013369e-06, + "loss": 0.0352, + "step": 2125 + }, + { + "epoch": 0.7099682751711471, + "grad_norm": 0.30557329648239634, + "learning_rate": 9.442124644911614e-06, + "loss": 0.0371, + "step": 2126 + }, + { + "epoch": 0.7103022207380197, + "grad_norm": 0.4266907565619148, + "learning_rate": 9.441232276774629e-06, + "loss": 0.0369, + "step": 2127 + }, + { + "epoch": 0.7106361663048923, + "grad_norm": 0.49036847738058204, + "learning_rate": 9.440339237737213e-06, + "loss": 0.0355, + "step": 2128 + }, + { + "epoch": 0.710970111871765, + "grad_norm": 0.5439634251546771, + "learning_rate": 9.439445527934272e-06, + "loss": 0.0478, + "step": 2129 + }, + { + "epoch": 0.7113040574386374, + "grad_norm": 0.7030520850755128, + "learning_rate": 9.438551147500812e-06, + "loss": 0.0472, + "step": 2130 + }, + { + "epoch": 0.7116380030055101, + "grad_norm": 0.4244879408655492, + "learning_rate": 9.437656096571938e-06, + "loss": 0.0321, + "step": 2131 + }, + { + "epoch": 0.7119719485723827, + "grad_norm": 0.5624451104866817, + "learning_rate": 9.436760375282858e-06, + "loss": 0.0414, + "step": 2132 + }, + { + "epoch": 0.7123058941392553, + "grad_norm": 0.5585892134208846, + "learning_rate": 9.435863983768884e-06, + "loss": 0.0424, + "step": 2133 + }, + { + "epoch": 0.7126398397061279, + "grad_norm": 0.4815680736159215, + "learning_rate": 9.434966922165424e-06, + "loss": 0.0458, + "step": 2134 + }, + { + "epoch": 0.7129737852730005, + "grad_norm": 0.6999091954272587, + "learning_rate": 9.43406919060799e-06, + "loss": 0.0393, + "step": 2135 + }, + { + "epoch": 0.7133077308398731, + "grad_norm": 0.5557423602281404, + "learning_rate": 9.433170789232196e-06, + "loss": 0.0287, + "step": 2136 + }, + { + "epoch": 0.7136416764067457, + "grad_norm": 0.6046902850475429, + "learning_rate": 9.432271718173756e-06, + "loss": 0.0417, + "step": 2137 + }, + { + "epoch": 0.7139756219736183, + "grad_norm": 0.51087554433223, + "learning_rate": 9.431371977568483e-06, + "loss": 0.0415, + "step": 2138 + }, + { + "epoch": 0.7143095675404909, + "grad_norm": 0.7207660363117191, + "learning_rate": 9.430471567552295e-06, + "loss": 0.0368, + "step": 2139 + }, + { + "epoch": 0.7146435131073635, + "grad_norm": 0.6787427646753198, + "learning_rate": 9.42957048826121e-06, + "loss": 0.0369, + "step": 2140 + }, + { + "epoch": 0.7149774586742361, + "grad_norm": 0.3798691880618357, + "learning_rate": 9.428668739831349e-06, + "loss": 0.0372, + "step": 2141 + }, + { + "epoch": 0.7153114042411087, + "grad_norm": 0.5857347216142876, + "learning_rate": 9.427766322398926e-06, + "loss": 0.0473, + "step": 2142 + }, + { + "epoch": 0.7156453498079813, + "grad_norm": 0.5065366747180108, + "learning_rate": 9.426863236100266e-06, + "loss": 0.0404, + "step": 2143 + }, + { + "epoch": 0.7159792953748539, + "grad_norm": 0.6105120015697073, + "learning_rate": 9.425959481071787e-06, + "loss": 0.0491, + "step": 2144 + }, + { + "epoch": 0.7163132409417265, + "grad_norm": 0.3026933591924967, + "learning_rate": 9.425055057450017e-06, + "loss": 0.0305, + "step": 2145 + }, + { + "epoch": 0.716647186508599, + "grad_norm": 0.3150470920767179, + "learning_rate": 9.424149965371576e-06, + "loss": 0.029, + "step": 2146 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.5381519528684029, + "learning_rate": 9.423244204973191e-06, + "loss": 0.0314, + "step": 2147 + }, + { + "epoch": 0.7173150776423443, + "grad_norm": 0.30701392419370926, + "learning_rate": 9.422337776391686e-06, + "loss": 0.0238, + "step": 2148 + }, + { + "epoch": 0.7176490232092169, + "grad_norm": 0.3641873297705958, + "learning_rate": 9.421430679763989e-06, + "loss": 0.0325, + "step": 2149 + }, + { + "epoch": 0.7179829687760895, + "grad_norm": 0.36154786456566407, + "learning_rate": 9.420522915227129e-06, + "loss": 0.0279, + "step": 2150 + }, + { + "epoch": 0.7183169143429621, + "grad_norm": 0.33684500270630197, + "learning_rate": 9.419614482918229e-06, + "loss": 0.039, + "step": 2151 + }, + { + "epoch": 0.7186508599098347, + "grad_norm": 0.38092296437484474, + "learning_rate": 9.418705382974524e-06, + "loss": 0.0285, + "step": 2152 + }, + { + "epoch": 0.7189848054767073, + "grad_norm": 0.3162897641730264, + "learning_rate": 9.417795615533343e-06, + "loss": 0.0355, + "step": 2153 + }, + { + "epoch": 0.7193187510435799, + "grad_norm": 0.5527311501906367, + "learning_rate": 9.416885180732115e-06, + "loss": 0.0504, + "step": 2154 + }, + { + "epoch": 0.7196526966104525, + "grad_norm": 0.31064578020814043, + "learning_rate": 9.415974078708375e-06, + "loss": 0.0297, + "step": 2155 + }, + { + "epoch": 0.7199866421773251, + "grad_norm": 0.2783485728603494, + "learning_rate": 9.415062309599751e-06, + "loss": 0.0242, + "step": 2156 + }, + { + "epoch": 0.7203205877441977, + "grad_norm": 0.45640344369628066, + "learning_rate": 9.414149873543983e-06, + "loss": 0.0422, + "step": 2157 + }, + { + "epoch": 0.7206545333110703, + "grad_norm": 0.43007533085532074, + "learning_rate": 9.4132367706789e-06, + "loss": 0.0353, + "step": 2158 + }, + { + "epoch": 0.7209884788779429, + "grad_norm": 0.28082360420233443, + "learning_rate": 9.412323001142438e-06, + "loss": 0.0282, + "step": 2159 + }, + { + "epoch": 0.7213224244448155, + "grad_norm": 0.3199494105748125, + "learning_rate": 9.411408565072635e-06, + "loss": 0.0455, + "step": 2160 + }, + { + "epoch": 0.7216563700116881, + "grad_norm": 0.3745445600000952, + "learning_rate": 9.410493462607623e-06, + "loss": 0.0315, + "step": 2161 + }, + { + "epoch": 0.7219903155785607, + "grad_norm": 0.46795502563432667, + "learning_rate": 9.409577693885642e-06, + "loss": 0.0359, + "step": 2162 + }, + { + "epoch": 0.7223242611454332, + "grad_norm": 0.6439904868634199, + "learning_rate": 9.408661259045032e-06, + "loss": 0.0371, + "step": 2163 + }, + { + "epoch": 0.7226582067123059, + "grad_norm": 0.36532717951072996, + "learning_rate": 9.407744158224227e-06, + "loss": 0.0335, + "step": 2164 + }, + { + "epoch": 0.7229921522791785, + "grad_norm": 0.4761693864609595, + "learning_rate": 9.406826391561767e-06, + "loss": 0.0436, + "step": 2165 + }, + { + "epoch": 0.7233260978460511, + "grad_norm": 0.542276120583976, + "learning_rate": 9.405907959196293e-06, + "loss": 0.0504, + "step": 2166 + }, + { + "epoch": 0.7236600434129237, + "grad_norm": 0.3101592116587309, + "learning_rate": 9.404988861266543e-06, + "loss": 0.0298, + "step": 2167 + }, + { + "epoch": 0.7239939889797963, + "grad_norm": 0.37567398821930725, + "learning_rate": 9.404069097911358e-06, + "loss": 0.0373, + "step": 2168 + }, + { + "epoch": 0.7243279345466689, + "grad_norm": 1.1232092577160215, + "learning_rate": 9.40314866926968e-06, + "loss": 0.0285, + "step": 2169 + }, + { + "epoch": 0.7246618801135415, + "grad_norm": 0.3456497379006263, + "learning_rate": 9.402227575480549e-06, + "loss": 0.0302, + "step": 2170 + }, + { + "epoch": 0.7249958256804141, + "grad_norm": 0.2956667125599662, + "learning_rate": 9.401305816683111e-06, + "loss": 0.0267, + "step": 2171 + }, + { + "epoch": 0.7253297712472867, + "grad_norm": 0.4008379808926853, + "learning_rate": 9.400383393016604e-06, + "loss": 0.0502, + "step": 2172 + }, + { + "epoch": 0.7256637168141593, + "grad_norm": 0.41478548582639985, + "learning_rate": 9.39946030462037e-06, + "loss": 0.0399, + "step": 2173 + }, + { + "epoch": 0.7259976623810319, + "grad_norm": 0.5694715010360888, + "learning_rate": 9.39853655163386e-06, + "loss": 0.0288, + "step": 2174 + }, + { + "epoch": 0.7263316079479045, + "grad_norm": 0.3686960281419378, + "learning_rate": 9.39761213419661e-06, + "loss": 0.032, + "step": 2175 + }, + { + "epoch": 0.7266655535147771, + "grad_norm": 0.3830247290605787, + "learning_rate": 9.396687052448267e-06, + "loss": 0.0476, + "step": 2176 + }, + { + "epoch": 0.7269994990816497, + "grad_norm": 0.35566223597964414, + "learning_rate": 9.395761306528576e-06, + "loss": 0.0342, + "step": 2177 + }, + { + "epoch": 0.7273334446485223, + "grad_norm": 0.393115357857828, + "learning_rate": 9.39483489657738e-06, + "loss": 0.0276, + "step": 2178 + }, + { + "epoch": 0.7276673902153948, + "grad_norm": 0.3353379272112651, + "learning_rate": 9.393907822734627e-06, + "loss": 0.0349, + "step": 2179 + }, + { + "epoch": 0.7280013357822674, + "grad_norm": 0.44949813030160335, + "learning_rate": 9.39298008514036e-06, + "loss": 0.045, + "step": 2180 + }, + { + "epoch": 0.72833528134914, + "grad_norm": 0.3584142367623185, + "learning_rate": 9.392051683934726e-06, + "loss": 0.0346, + "step": 2181 + }, + { + "epoch": 0.7286692269160127, + "grad_norm": 0.34676570759036107, + "learning_rate": 9.39112261925797e-06, + "loss": 0.0408, + "step": 2182 + }, + { + "epoch": 0.7290031724828853, + "grad_norm": 0.35127440088031114, + "learning_rate": 9.390192891250439e-06, + "loss": 0.0365, + "step": 2183 + }, + { + "epoch": 0.7293371180497579, + "grad_norm": 1.2571936885864603, + "learning_rate": 9.389262500052578e-06, + "loss": 0.0476, + "step": 2184 + }, + { + "epoch": 0.7296710636166305, + "grad_norm": 0.6176783849662094, + "learning_rate": 9.388331445804935e-06, + "loss": 0.0425, + "step": 2185 + }, + { + "epoch": 0.7300050091835031, + "grad_norm": 0.35129875622236834, + "learning_rate": 9.387399728648156e-06, + "loss": 0.0395, + "step": 2186 + }, + { + "epoch": 0.7303389547503757, + "grad_norm": 0.41741731330621706, + "learning_rate": 9.386467348722989e-06, + "loss": 0.0463, + "step": 2187 + }, + { + "epoch": 0.7306729003172483, + "grad_norm": 0.5003210965167986, + "learning_rate": 9.385534306170279e-06, + "loss": 0.0364, + "step": 2188 + }, + { + "epoch": 0.7310068458841209, + "grad_norm": 0.3415018618510439, + "learning_rate": 9.384600601130973e-06, + "loss": 0.0394, + "step": 2189 + }, + { + "epoch": 0.7313407914509935, + "grad_norm": 0.37855829690889353, + "learning_rate": 9.383666233746121e-06, + "loss": 0.0373, + "step": 2190 + }, + { + "epoch": 0.7316747370178661, + "grad_norm": 0.5621844322113564, + "learning_rate": 9.382731204156869e-06, + "loss": 0.0461, + "step": 2191 + }, + { + "epoch": 0.7320086825847387, + "grad_norm": 0.4227093732171971, + "learning_rate": 9.381795512504461e-06, + "loss": 0.0359, + "step": 2192 + }, + { + "epoch": 0.7323426281516113, + "grad_norm": 0.6463561503128954, + "learning_rate": 9.380859158930249e-06, + "loss": 0.0622, + "step": 2193 + }, + { + "epoch": 0.7326765737184839, + "grad_norm": 0.3230426282789849, + "learning_rate": 9.379922143575678e-06, + "loss": 0.0248, + "step": 2194 + }, + { + "epoch": 0.7330105192853564, + "grad_norm": 0.3773252718914423, + "learning_rate": 9.378984466582294e-06, + "loss": 0.0356, + "step": 2195 + }, + { + "epoch": 0.733344464852229, + "grad_norm": 0.4584867118916967, + "learning_rate": 9.378046128091748e-06, + "loss": 0.0386, + "step": 2196 + }, + { + "epoch": 0.7336784104191016, + "grad_norm": 0.39063708609130854, + "learning_rate": 9.377107128245782e-06, + "loss": 0.0268, + "step": 2197 + }, + { + "epoch": 0.7340123559859743, + "grad_norm": 0.4621952850142301, + "learning_rate": 9.376167467186246e-06, + "loss": 0.034, + "step": 2198 + }, + { + "epoch": 0.7343463015528469, + "grad_norm": 0.9020751036892598, + "learning_rate": 9.375227145055085e-06, + "loss": 0.032, + "step": 2199 + }, + { + "epoch": 0.7346802471197195, + "grad_norm": 0.581350479413104, + "learning_rate": 9.374286161994351e-06, + "loss": 0.0433, + "step": 2200 + }, + { + "epoch": 0.7350141926865921, + "grad_norm": 0.470124648155974, + "learning_rate": 9.373344518146184e-06, + "loss": 0.0504, + "step": 2201 + }, + { + "epoch": 0.7353481382534647, + "grad_norm": 0.4280483085829255, + "learning_rate": 9.372402213652833e-06, + "loss": 0.044, + "step": 2202 + }, + { + "epoch": 0.7356820838203373, + "grad_norm": 0.45019101568081704, + "learning_rate": 9.371459248656645e-06, + "loss": 0.0387, + "step": 2203 + }, + { + "epoch": 0.7360160293872099, + "grad_norm": 0.45417671331877707, + "learning_rate": 9.370515623300066e-06, + "loss": 0.0418, + "step": 2204 + }, + { + "epoch": 0.7363499749540825, + "grad_norm": 0.46822253382427065, + "learning_rate": 9.369571337725638e-06, + "loss": 0.0396, + "step": 2205 + }, + { + "epoch": 0.7366839205209551, + "grad_norm": 0.4384426918864997, + "learning_rate": 9.368626392076013e-06, + "loss": 0.0348, + "step": 2206 + }, + { + "epoch": 0.7370178660878277, + "grad_norm": 0.3886070779136618, + "learning_rate": 9.367680786493929e-06, + "loss": 0.0394, + "step": 2207 + }, + { + "epoch": 0.7373518116547003, + "grad_norm": 0.37301161619664175, + "learning_rate": 9.366734521122236e-06, + "loss": 0.0389, + "step": 2208 + }, + { + "epoch": 0.7376857572215729, + "grad_norm": 0.4353994364406186, + "learning_rate": 9.365787596103877e-06, + "loss": 0.0414, + "step": 2209 + }, + { + "epoch": 0.7380197027884455, + "grad_norm": 0.5940195018896346, + "learning_rate": 9.364840011581896e-06, + "loss": 0.042, + "step": 2210 + }, + { + "epoch": 0.7383536483553181, + "grad_norm": 0.3967047113384441, + "learning_rate": 9.363891767699437e-06, + "loss": 0.0358, + "step": 2211 + }, + { + "epoch": 0.7386875939221906, + "grad_norm": 0.40143940408633927, + "learning_rate": 9.362942864599746e-06, + "loss": 0.0318, + "step": 2212 + }, + { + "epoch": 0.7390215394890632, + "grad_norm": 0.49633045943653026, + "learning_rate": 9.36199330242616e-06, + "loss": 0.0474, + "step": 2213 + }, + { + "epoch": 0.7393554850559358, + "grad_norm": 0.3330337287688398, + "learning_rate": 9.361043081322125e-06, + "loss": 0.0316, + "step": 2214 + }, + { + "epoch": 0.7396894306228085, + "grad_norm": 0.4951336586004503, + "learning_rate": 9.360092201431186e-06, + "loss": 0.0392, + "step": 2215 + }, + { + "epoch": 0.7400233761896811, + "grad_norm": 0.5995767210304951, + "learning_rate": 9.359140662896978e-06, + "loss": 0.0504, + "step": 2216 + }, + { + "epoch": 0.7403573217565537, + "grad_norm": 0.33445634694416476, + "learning_rate": 9.358188465863247e-06, + "loss": 0.037, + "step": 2217 + }, + { + "epoch": 0.7406912673234263, + "grad_norm": 0.3243516604870869, + "learning_rate": 9.357235610473833e-06, + "loss": 0.0374, + "step": 2218 + }, + { + "epoch": 0.7410252128902989, + "grad_norm": 0.3249633073589906, + "learning_rate": 9.356282096872673e-06, + "loss": 0.0455, + "step": 2219 + }, + { + "epoch": 0.7413591584571715, + "grad_norm": 0.3272398278474091, + "learning_rate": 9.355327925203811e-06, + "loss": 0.0299, + "step": 2220 + }, + { + "epoch": 0.7416931040240441, + "grad_norm": 0.5104439431607845, + "learning_rate": 9.354373095611383e-06, + "loss": 0.0506, + "step": 2221 + }, + { + "epoch": 0.7420270495909167, + "grad_norm": 0.303402283180065, + "learning_rate": 9.353417608239627e-06, + "loss": 0.0355, + "step": 2222 + }, + { + "epoch": 0.7423609951577893, + "grad_norm": 0.48439694546886963, + "learning_rate": 9.352461463232882e-06, + "loss": 0.0417, + "step": 2223 + }, + { + "epoch": 0.7426949407246619, + "grad_norm": 0.5067073653283253, + "learning_rate": 9.351504660735583e-06, + "loss": 0.0466, + "step": 2224 + }, + { + "epoch": 0.7430288862915345, + "grad_norm": 0.26718770356295135, + "learning_rate": 9.350547200892271e-06, + "loss": 0.0295, + "step": 2225 + }, + { + "epoch": 0.7433628318584071, + "grad_norm": 0.37623817934192, + "learning_rate": 9.349589083847577e-06, + "loss": 0.0325, + "step": 2226 + }, + { + "epoch": 0.7436967774252797, + "grad_norm": 0.3326289340340217, + "learning_rate": 9.348630309746236e-06, + "loss": 0.0373, + "step": 2227 + }, + { + "epoch": 0.7440307229921522, + "grad_norm": 0.31750724437460565, + "learning_rate": 9.347670878733084e-06, + "loss": 0.0297, + "step": 2228 + }, + { + "epoch": 0.7443646685590248, + "grad_norm": 0.4042643363629417, + "learning_rate": 9.346710790953053e-06, + "loss": 0.0457, + "step": 2229 + }, + { + "epoch": 0.7446986141258974, + "grad_norm": 0.3791139189174066, + "learning_rate": 9.345750046551177e-06, + "loss": 0.0338, + "step": 2230 + }, + { + "epoch": 0.74503255969277, + "grad_norm": 0.3048806840867557, + "learning_rate": 9.344788645672585e-06, + "loss": 0.0295, + "step": 2231 + }, + { + "epoch": 0.7453665052596427, + "grad_norm": 0.3093142222126558, + "learning_rate": 9.343826588462513e-06, + "loss": 0.0307, + "step": 2232 + }, + { + "epoch": 0.7457004508265153, + "grad_norm": 0.423785351652411, + "learning_rate": 9.342863875066284e-06, + "loss": 0.0302, + "step": 2233 + }, + { + "epoch": 0.7460343963933879, + "grad_norm": 0.4104524654735883, + "learning_rate": 9.341900505629333e-06, + "loss": 0.0267, + "step": 2234 + }, + { + "epoch": 0.7463683419602605, + "grad_norm": 0.6179830673034177, + "learning_rate": 9.340936480297187e-06, + "loss": 0.0429, + "step": 2235 + }, + { + "epoch": 0.7467022875271331, + "grad_norm": 0.2712801420997214, + "learning_rate": 9.339971799215472e-06, + "loss": 0.0297, + "step": 2236 + }, + { + "epoch": 0.7470362330940057, + "grad_norm": 0.5720340115168108, + "learning_rate": 9.339006462529916e-06, + "loss": 0.0309, + "step": 2237 + }, + { + "epoch": 0.7473701786608783, + "grad_norm": 0.29473138517563946, + "learning_rate": 9.338040470386344e-06, + "loss": 0.0299, + "step": 2238 + }, + { + "epoch": 0.7477041242277509, + "grad_norm": 0.3660711868493045, + "learning_rate": 9.337073822930681e-06, + "loss": 0.0499, + "step": 2239 + }, + { + "epoch": 0.7480380697946235, + "grad_norm": 0.39026290597585506, + "learning_rate": 9.336106520308948e-06, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.7483720153614961, + "grad_norm": 0.34956067278873115, + "learning_rate": 9.335138562667267e-06, + "loss": 0.0462, + "step": 2241 + }, + { + "epoch": 0.7487059609283687, + "grad_norm": 0.328480681617309, + "learning_rate": 9.334169950151866e-06, + "loss": 0.0277, + "step": 2242 + }, + { + "epoch": 0.7490399064952413, + "grad_norm": 0.5570308483129467, + "learning_rate": 9.333200682909059e-06, + "loss": 0.042, + "step": 2243 + }, + { + "epoch": 0.7493738520621138, + "grad_norm": 0.5499427636240891, + "learning_rate": 9.332230761085265e-06, + "loss": 0.0396, + "step": 2244 + }, + { + "epoch": 0.7497077976289864, + "grad_norm": 0.3647005932018772, + "learning_rate": 9.331260184827006e-06, + "loss": 0.0375, + "step": 2245 + }, + { + "epoch": 0.750041743195859, + "grad_norm": 0.4355905627081893, + "learning_rate": 9.330288954280898e-06, + "loss": 0.0326, + "step": 2246 + }, + { + "epoch": 0.7503756887627316, + "grad_norm": 0.6850520487771359, + "learning_rate": 9.329317069593654e-06, + "loss": 0.0505, + "step": 2247 + }, + { + "epoch": 0.7507096343296042, + "grad_norm": 0.48892253010051345, + "learning_rate": 9.328344530912093e-06, + "loss": 0.0414, + "step": 2248 + }, + { + "epoch": 0.7510435798964769, + "grad_norm": 0.514361695486632, + "learning_rate": 9.327371338383124e-06, + "loss": 0.036, + "step": 2249 + }, + { + "epoch": 0.7513775254633495, + "grad_norm": 0.40093172239531716, + "learning_rate": 9.326397492153762e-06, + "loss": 0.0384, + "step": 2250 + }, + { + "epoch": 0.7517114710302221, + "grad_norm": 0.9703862802428921, + "learning_rate": 9.325422992371117e-06, + "loss": 0.0544, + "step": 2251 + }, + { + "epoch": 0.7520454165970947, + "grad_norm": 0.5118673491055077, + "learning_rate": 9.324447839182397e-06, + "loss": 0.0461, + "step": 2252 + }, + { + "epoch": 0.7523793621639673, + "grad_norm": 0.3639689648823959, + "learning_rate": 9.323472032734915e-06, + "loss": 0.032, + "step": 2253 + }, + { + "epoch": 0.7527133077308399, + "grad_norm": 0.6187138592600782, + "learning_rate": 9.322495573176073e-06, + "loss": 0.0458, + "step": 2254 + }, + { + "epoch": 0.7530472532977125, + "grad_norm": 0.27008895818226897, + "learning_rate": 9.321518460653381e-06, + "loss": 0.0268, + "step": 2255 + }, + { + "epoch": 0.7533811988645851, + "grad_norm": 0.37538592472089005, + "learning_rate": 9.32054069531444e-06, + "loss": 0.0279, + "step": 2256 + }, + { + "epoch": 0.7537151444314577, + "grad_norm": 0.5327976462756395, + "learning_rate": 9.319562277306955e-06, + "loss": 0.0404, + "step": 2257 + }, + { + "epoch": 0.7540490899983303, + "grad_norm": 0.33481660725693313, + "learning_rate": 9.318583206778726e-06, + "loss": 0.0327, + "step": 2258 + }, + { + "epoch": 0.7543830355652029, + "grad_norm": 0.43566182686512606, + "learning_rate": 9.317603483877654e-06, + "loss": 0.037, + "step": 2259 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.34502868792169167, + "learning_rate": 9.316623108751739e-06, + "loss": 0.0274, + "step": 2260 + }, + { + "epoch": 0.755050926698948, + "grad_norm": 0.4098480126148483, + "learning_rate": 9.315642081549074e-06, + "loss": 0.0346, + "step": 2261 + }, + { + "epoch": 0.7553848722658206, + "grad_norm": 0.3520297697972669, + "learning_rate": 9.31466040241786e-06, + "loss": 0.0339, + "step": 2262 + }, + { + "epoch": 0.7557188178326932, + "grad_norm": 0.3353997409151091, + "learning_rate": 9.313678071506388e-06, + "loss": 0.0325, + "step": 2263 + }, + { + "epoch": 0.7560527633995658, + "grad_norm": 0.28276655262328104, + "learning_rate": 9.31269508896305e-06, + "loss": 0.0395, + "step": 2264 + }, + { + "epoch": 0.7563867089664384, + "grad_norm": 0.4461004263857972, + "learning_rate": 9.31171145493634e-06, + "loss": 0.04, + "step": 2265 + }, + { + "epoch": 0.756720654533311, + "grad_norm": 0.4051370472975567, + "learning_rate": 9.310727169574847e-06, + "loss": 0.0371, + "step": 2266 + }, + { + "epoch": 0.7570546001001837, + "grad_norm": 0.3119139688090534, + "learning_rate": 9.309742233027258e-06, + "loss": 0.031, + "step": 2267 + }, + { + "epoch": 0.7573885456670563, + "grad_norm": 0.33750869813243045, + "learning_rate": 9.308756645442356e-06, + "loss": 0.0305, + "step": 2268 + }, + { + "epoch": 0.7577224912339289, + "grad_norm": 0.35256712007998253, + "learning_rate": 9.307770406969032e-06, + "loss": 0.0353, + "step": 2269 + }, + { + "epoch": 0.7580564368008015, + "grad_norm": 0.2884726594341784, + "learning_rate": 9.306783517756264e-06, + "loss": 0.03, + "step": 2270 + }, + { + "epoch": 0.7583903823676741, + "grad_norm": 0.42069566480415976, + "learning_rate": 9.305795977953134e-06, + "loss": 0.0364, + "step": 2271 + }, + { + "epoch": 0.7587243279345467, + "grad_norm": 0.4546454469635054, + "learning_rate": 9.304807787708825e-06, + "loss": 0.0374, + "step": 2272 + }, + { + "epoch": 0.7590582735014193, + "grad_norm": 0.39993702427824834, + "learning_rate": 9.303818947172611e-06, + "loss": 0.0308, + "step": 2273 + }, + { + "epoch": 0.7593922190682919, + "grad_norm": 0.4791795651688872, + "learning_rate": 9.302829456493868e-06, + "loss": 0.0431, + "step": 2274 + }, + { + "epoch": 0.7597261646351645, + "grad_norm": 0.33578853656263385, + "learning_rate": 9.301839315822072e-06, + "loss": 0.0339, + "step": 2275 + }, + { + "epoch": 0.7600601102020371, + "grad_norm": 0.3537306255409295, + "learning_rate": 9.300848525306797e-06, + "loss": 0.0365, + "step": 2276 + }, + { + "epoch": 0.7603940557689096, + "grad_norm": 0.335284276454725, + "learning_rate": 9.299857085097708e-06, + "loss": 0.0341, + "step": 2277 + }, + { + "epoch": 0.7607280013357822, + "grad_norm": 0.40738792079537517, + "learning_rate": 9.298864995344579e-06, + "loss": 0.0272, + "step": 2278 + }, + { + "epoch": 0.7610619469026548, + "grad_norm": 0.6229553924584766, + "learning_rate": 9.297872256197276e-06, + "loss": 0.0412, + "step": 2279 + }, + { + "epoch": 0.7613958924695274, + "grad_norm": 0.2538644625026248, + "learning_rate": 9.296878867805762e-06, + "loss": 0.022, + "step": 2280 + }, + { + "epoch": 0.7617298380364, + "grad_norm": 0.28342092080256903, + "learning_rate": 9.2958848303201e-06, + "loss": 0.0293, + "step": 2281 + }, + { + "epoch": 0.7620637836032726, + "grad_norm": 0.3192840612391781, + "learning_rate": 9.294890143890451e-06, + "loss": 0.0327, + "step": 2282 + }, + { + "epoch": 0.7623977291701453, + "grad_norm": 0.31682716384579585, + "learning_rate": 9.293894808667077e-06, + "loss": 0.038, + "step": 2283 + }, + { + "epoch": 0.7627316747370179, + "grad_norm": 0.37258062981176443, + "learning_rate": 9.292898824800333e-06, + "loss": 0.0375, + "step": 2284 + }, + { + "epoch": 0.7630656203038905, + "grad_norm": 0.39974063226260376, + "learning_rate": 9.291902192440673e-06, + "loss": 0.0429, + "step": 2285 + }, + { + "epoch": 0.7633995658707631, + "grad_norm": 0.3159295288529469, + "learning_rate": 9.290904911738653e-06, + "loss": 0.0335, + "step": 2286 + }, + { + "epoch": 0.7637335114376357, + "grad_norm": 0.36322486258442793, + "learning_rate": 9.289906982844923e-06, + "loss": 0.0381, + "step": 2287 + }, + { + "epoch": 0.7640674570045083, + "grad_norm": 0.6540773212903619, + "learning_rate": 9.288908405910228e-06, + "loss": 0.027, + "step": 2288 + }, + { + "epoch": 0.7644014025713809, + "grad_norm": 0.3262228264152689, + "learning_rate": 9.287909181085421e-06, + "loss": 0.0296, + "step": 2289 + }, + { + "epoch": 0.7647353481382535, + "grad_norm": 0.36663665870234724, + "learning_rate": 9.286909308521443e-06, + "loss": 0.0316, + "step": 2290 + }, + { + "epoch": 0.7650692937051261, + "grad_norm": 0.3391954940491059, + "learning_rate": 9.285908788369336e-06, + "loss": 0.0348, + "step": 2291 + }, + { + "epoch": 0.7654032392719987, + "grad_norm": 0.35131535631671784, + "learning_rate": 9.284907620780244e-06, + "loss": 0.0307, + "step": 2292 + }, + { + "epoch": 0.7657371848388712, + "grad_norm": 0.35743271985538666, + "learning_rate": 9.2839058059054e-06, + "loss": 0.0263, + "step": 2293 + }, + { + "epoch": 0.7660711304057438, + "grad_norm": 0.35251162520858464, + "learning_rate": 9.282903343896144e-06, + "loss": 0.0285, + "step": 2294 + }, + { + "epoch": 0.7664050759726164, + "grad_norm": 0.40568589387088894, + "learning_rate": 9.281900234903908e-06, + "loss": 0.0329, + "step": 2295 + }, + { + "epoch": 0.766739021539489, + "grad_norm": 0.40969132207513087, + "learning_rate": 9.280896479080224e-06, + "loss": 0.0417, + "step": 2296 + }, + { + "epoch": 0.7670729671063616, + "grad_norm": 0.33811726746443876, + "learning_rate": 9.27989207657672e-06, + "loss": 0.0273, + "step": 2297 + }, + { + "epoch": 0.7674069126732342, + "grad_norm": 0.36425845736357687, + "learning_rate": 9.278887027545125e-06, + "loss": 0.0395, + "step": 2298 + }, + { + "epoch": 0.7677408582401068, + "grad_norm": 0.4273780853380815, + "learning_rate": 9.277881332137261e-06, + "loss": 0.0412, + "step": 2299 + }, + { + "epoch": 0.7680748038069795, + "grad_norm": 0.39795117070638286, + "learning_rate": 9.276874990505053e-06, + "loss": 0.0417, + "step": 2300 + }, + { + "epoch": 0.7684087493738521, + "grad_norm": 0.5686838243198611, + "learning_rate": 9.27586800280052e-06, + "loss": 0.0401, + "step": 2301 + }, + { + "epoch": 0.7687426949407247, + "grad_norm": 0.3871820169781138, + "learning_rate": 9.274860369175775e-06, + "loss": 0.0417, + "step": 2302 + }, + { + "epoch": 0.7690766405075973, + "grad_norm": 0.5327552237845823, + "learning_rate": 9.27385208978304e-06, + "loss": 0.0562, + "step": 2303 + }, + { + "epoch": 0.7694105860744699, + "grad_norm": 0.326833888376255, + "learning_rate": 9.272843164774622e-06, + "loss": 0.0362, + "step": 2304 + }, + { + "epoch": 0.7697445316413425, + "grad_norm": 0.4505078375757577, + "learning_rate": 9.27183359430293e-06, + "loss": 0.0369, + "step": 2305 + }, + { + "epoch": 0.7700784772082151, + "grad_norm": 0.2827253170407005, + "learning_rate": 9.270823378520478e-06, + "loss": 0.0296, + "step": 2306 + }, + { + "epoch": 0.7704124227750877, + "grad_norm": 0.4820110768733872, + "learning_rate": 9.269812517579867e-06, + "loss": 0.0386, + "step": 2307 + }, + { + "epoch": 0.7707463683419603, + "grad_norm": 0.7226482197248502, + "learning_rate": 9.268801011633799e-06, + "loss": 0.0473, + "step": 2308 + }, + { + "epoch": 0.7710803139088329, + "grad_norm": 0.2851388528969421, + "learning_rate": 9.267788860835076e-06, + "loss": 0.0269, + "step": 2309 + }, + { + "epoch": 0.7714142594757054, + "grad_norm": 0.5878850312448372, + "learning_rate": 9.266776065336593e-06, + "loss": 0.0628, + "step": 2310 + }, + { + "epoch": 0.771748205042578, + "grad_norm": 0.34757519733300535, + "learning_rate": 9.265762625291346e-06, + "loss": 0.0337, + "step": 2311 + }, + { + "epoch": 0.7720821506094506, + "grad_norm": 0.3855968835433228, + "learning_rate": 9.264748540852427e-06, + "loss": 0.0383, + "step": 2312 + }, + { + "epoch": 0.7724160961763232, + "grad_norm": 0.332258418348324, + "learning_rate": 9.263733812173023e-06, + "loss": 0.035, + "step": 2313 + }, + { + "epoch": 0.7727500417431958, + "grad_norm": 0.6300936967816194, + "learning_rate": 9.262718439406425e-06, + "loss": 0.0473, + "step": 2314 + }, + { + "epoch": 0.7730839873100684, + "grad_norm": 0.5642941486462103, + "learning_rate": 9.261702422706014e-06, + "loss": 0.048, + "step": 2315 + }, + { + "epoch": 0.773417932876941, + "grad_norm": 0.3490947858434045, + "learning_rate": 9.260685762225273e-06, + "loss": 0.0349, + "step": 2316 + }, + { + "epoch": 0.7737518784438137, + "grad_norm": 0.34370853187301154, + "learning_rate": 9.25966845811778e-06, + "loss": 0.0375, + "step": 2317 + }, + { + "epoch": 0.7740858240106863, + "grad_norm": 0.2893861373537869, + "learning_rate": 9.258650510537208e-06, + "loss": 0.0218, + "step": 2318 + }, + { + "epoch": 0.7744197695775589, + "grad_norm": 0.42479897197665645, + "learning_rate": 9.257631919637333e-06, + "loss": 0.0342, + "step": 2319 + }, + { + "epoch": 0.7747537151444315, + "grad_norm": 0.3423376050436938, + "learning_rate": 9.256612685572027e-06, + "loss": 0.0361, + "step": 2320 + }, + { + "epoch": 0.7750876607113041, + "grad_norm": 1.0006130724014104, + "learning_rate": 9.255592808495254e-06, + "loss": 0.0559, + "step": 2321 + }, + { + "epoch": 0.7754216062781767, + "grad_norm": 0.3735277315857998, + "learning_rate": 9.254572288561077e-06, + "loss": 0.0361, + "step": 2322 + }, + { + "epoch": 0.7757555518450493, + "grad_norm": 0.38814019180363407, + "learning_rate": 9.253551125923662e-06, + "loss": 0.0466, + "step": 2323 + }, + { + "epoch": 0.7760894974119219, + "grad_norm": 0.564850405216005, + "learning_rate": 9.252529320737265e-06, + "loss": 0.0565, + "step": 2324 + }, + { + "epoch": 0.7764234429787945, + "grad_norm": 0.38527470396801416, + "learning_rate": 9.251506873156242e-06, + "loss": 0.0415, + "step": 2325 + }, + { + "epoch": 0.776757388545667, + "grad_norm": 0.4496699876510423, + "learning_rate": 9.250483783335046e-06, + "loss": 0.0356, + "step": 2326 + }, + { + "epoch": 0.7770913341125396, + "grad_norm": 0.44017310855164865, + "learning_rate": 9.249460051428226e-06, + "loss": 0.0318, + "step": 2327 + }, + { + "epoch": 0.7774252796794122, + "grad_norm": 0.3751258453069622, + "learning_rate": 9.24843567759043e-06, + "loss": 0.0293, + "step": 2328 + }, + { + "epoch": 0.7777592252462848, + "grad_norm": 0.4126464214266128, + "learning_rate": 9.247410661976402e-06, + "loss": 0.0398, + "step": 2329 + }, + { + "epoch": 0.7780931708131574, + "grad_norm": 0.4186674320930181, + "learning_rate": 9.246385004740981e-06, + "loss": 0.0591, + "step": 2330 + }, + { + "epoch": 0.77842711638003, + "grad_norm": 0.4846763983678124, + "learning_rate": 9.245358706039105e-06, + "loss": 0.0463, + "step": 2331 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 0.29009378694281474, + "learning_rate": 9.244331766025812e-06, + "loss": 0.0307, + "step": 2332 + }, + { + "epoch": 0.7790950075137753, + "grad_norm": 0.41617628696252, + "learning_rate": 9.243304184856226e-06, + "loss": 0.032, + "step": 2333 + }, + { + "epoch": 0.7794289530806479, + "grad_norm": 0.28182386606315113, + "learning_rate": 9.242275962685584e-06, + "loss": 0.0308, + "step": 2334 + }, + { + "epoch": 0.7797628986475205, + "grad_norm": 0.44821281607522734, + "learning_rate": 9.241247099669202e-06, + "loss": 0.0456, + "step": 2335 + }, + { + "epoch": 0.7800968442143931, + "grad_norm": 0.36446204885079103, + "learning_rate": 9.24021759596251e-06, + "loss": 0.033, + "step": 2336 + }, + { + "epoch": 0.7804307897812657, + "grad_norm": 0.4045213815459264, + "learning_rate": 9.239187451721021e-06, + "loss": 0.0399, + "step": 2337 + }, + { + "epoch": 0.7807647353481383, + "grad_norm": 0.4448765508265765, + "learning_rate": 9.238156667100354e-06, + "loss": 0.0487, + "step": 2338 + }, + { + "epoch": 0.7810986809150109, + "grad_norm": 0.38492584393252205, + "learning_rate": 9.237125242256219e-06, + "loss": 0.0482, + "step": 2339 + }, + { + "epoch": 0.7814326264818835, + "grad_norm": 0.3436442668647944, + "learning_rate": 9.236093177344427e-06, + "loss": 0.0349, + "step": 2340 + }, + { + "epoch": 0.7817665720487561, + "grad_norm": 0.3751537242198368, + "learning_rate": 9.23506047252088e-06, + "loss": 0.0346, + "step": 2341 + }, + { + "epoch": 0.7821005176156286, + "grad_norm": 0.424955879356634, + "learning_rate": 9.234027127941585e-06, + "loss": 0.034, + "step": 2342 + }, + { + "epoch": 0.7824344631825012, + "grad_norm": 0.4998866236749951, + "learning_rate": 9.232993143762637e-06, + "loss": 0.0372, + "step": 2343 + }, + { + "epoch": 0.7827684087493738, + "grad_norm": 0.25156417480387433, + "learning_rate": 9.231958520140232e-06, + "loss": 0.0228, + "step": 2344 + }, + { + "epoch": 0.7831023543162464, + "grad_norm": 0.6408870209428651, + "learning_rate": 9.230923257230663e-06, + "loss": 0.0407, + "step": 2345 + }, + { + "epoch": 0.783436299883119, + "grad_norm": 0.439321078845631, + "learning_rate": 9.22988735519032e-06, + "loss": 0.0283, + "step": 2346 + }, + { + "epoch": 0.7837702454499916, + "grad_norm": 0.7523973235580048, + "learning_rate": 9.228850814175684e-06, + "loss": 0.0274, + "step": 2347 + }, + { + "epoch": 0.7841041910168642, + "grad_norm": 0.3867047988392383, + "learning_rate": 9.22781363434334e-06, + "loss": 0.0307, + "step": 2348 + }, + { + "epoch": 0.7844381365837368, + "grad_norm": 0.4819176914879532, + "learning_rate": 9.226775815849969e-06, + "loss": 0.0392, + "step": 2349 + }, + { + "epoch": 0.7847720821506095, + "grad_norm": 0.365813211738815, + "learning_rate": 9.225737358852339e-06, + "loss": 0.039, + "step": 2350 + }, + { + "epoch": 0.7851060277174821, + "grad_norm": 0.3297844802233969, + "learning_rate": 9.224698263507326e-06, + "loss": 0.0385, + "step": 2351 + }, + { + "epoch": 0.7854399732843547, + "grad_norm": 0.4214744807341373, + "learning_rate": 9.223658529971896e-06, + "loss": 0.0444, + "step": 2352 + }, + { + "epoch": 0.7857739188512273, + "grad_norm": 0.445684392370362, + "learning_rate": 9.222618158403111e-06, + "loss": 0.0391, + "step": 2353 + }, + { + "epoch": 0.7861078644180999, + "grad_norm": 0.3742527468558874, + "learning_rate": 9.221577148958137e-06, + "loss": 0.0263, + "step": 2354 + }, + { + "epoch": 0.7864418099849725, + "grad_norm": 0.41245806746412306, + "learning_rate": 9.220535501794224e-06, + "loss": 0.0296, + "step": 2355 + }, + { + "epoch": 0.7867757555518451, + "grad_norm": 0.36738237117600886, + "learning_rate": 9.21949321706873e-06, + "loss": 0.0298, + "step": 2356 + }, + { + "epoch": 0.7871097011187177, + "grad_norm": 1.739612723513887, + "learning_rate": 9.218450294939103e-06, + "loss": 0.0424, + "step": 2357 + }, + { + "epoch": 0.7874436466855903, + "grad_norm": 0.502219044342745, + "learning_rate": 9.217406735562887e-06, + "loss": 0.0383, + "step": 2358 + }, + { + "epoch": 0.7877775922524628, + "grad_norm": 0.36902696622805026, + "learning_rate": 9.216362539097726e-06, + "loss": 0.0283, + "step": 2359 + }, + { + "epoch": 0.7881115378193354, + "grad_norm": 0.4595719327394596, + "learning_rate": 9.215317705701356e-06, + "loss": 0.0356, + "step": 2360 + }, + { + "epoch": 0.788445483386208, + "grad_norm": 0.40020534193794044, + "learning_rate": 9.214272235531615e-06, + "loss": 0.0223, + "step": 2361 + }, + { + "epoch": 0.7887794289530806, + "grad_norm": 0.4674393974345048, + "learning_rate": 9.213226128746431e-06, + "loss": 0.0423, + "step": 2362 + }, + { + "epoch": 0.7891133745199532, + "grad_norm": 0.37668519786186994, + "learning_rate": 9.21217938550383e-06, + "loss": 0.037, + "step": 2363 + }, + { + "epoch": 0.7894473200868258, + "grad_norm": 0.3140922341544152, + "learning_rate": 9.211132005961936e-06, + "loss": 0.0332, + "step": 2364 + }, + { + "epoch": 0.7897812656536984, + "grad_norm": 0.3887117251094952, + "learning_rate": 9.210083990278968e-06, + "loss": 0.0383, + "step": 2365 + }, + { + "epoch": 0.790115211220571, + "grad_norm": 0.2794176454357536, + "learning_rate": 9.209035338613242e-06, + "loss": 0.0253, + "step": 2366 + }, + { + "epoch": 0.7904491567874437, + "grad_norm": 0.4790062539480891, + "learning_rate": 9.207986051123167e-06, + "loss": 0.0435, + "step": 2367 + }, + { + "epoch": 0.7907831023543163, + "grad_norm": 0.3569405064812053, + "learning_rate": 9.206936127967254e-06, + "loss": 0.0348, + "step": 2368 + }, + { + "epoch": 0.7911170479211889, + "grad_norm": 0.46043945348276366, + "learning_rate": 9.205885569304103e-06, + "loss": 0.0518, + "step": 2369 + }, + { + "epoch": 0.7914509934880615, + "grad_norm": 0.48821940054543433, + "learning_rate": 9.204834375292413e-06, + "loss": 0.0454, + "step": 2370 + }, + { + "epoch": 0.7917849390549341, + "grad_norm": 0.39694079969159907, + "learning_rate": 9.20378254609098e-06, + "loss": 0.0377, + "step": 2371 + }, + { + "epoch": 0.7921188846218067, + "grad_norm": 0.3183483878023652, + "learning_rate": 9.202730081858697e-06, + "loss": 0.0288, + "step": 2372 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.37945832633015725, + "learning_rate": 9.201676982754549e-06, + "loss": 0.0349, + "step": 2373 + }, + { + "epoch": 0.7927867757555519, + "grad_norm": 0.3527971006203742, + "learning_rate": 9.200623248937619e-06, + "loss": 0.0336, + "step": 2374 + }, + { + "epoch": 0.7931207213224244, + "grad_norm": 0.5719888456394068, + "learning_rate": 9.199568880567085e-06, + "loss": 0.056, + "step": 2375 + }, + { + "epoch": 0.793454666889297, + "grad_norm": 0.2937981953922501, + "learning_rate": 9.198513877802226e-06, + "loss": 0.0311, + "step": 2376 + }, + { + "epoch": 0.7937886124561696, + "grad_norm": 0.418195405726491, + "learning_rate": 9.19745824080241e-06, + "loss": 0.044, + "step": 2377 + }, + { + "epoch": 0.7941225580230422, + "grad_norm": 0.31126928333848797, + "learning_rate": 9.196401969727101e-06, + "loss": 0.0342, + "step": 2378 + }, + { + "epoch": 0.7944565035899148, + "grad_norm": 0.3489075878698303, + "learning_rate": 9.195345064735865e-06, + "loss": 0.033, + "step": 2379 + }, + { + "epoch": 0.7947904491567874, + "grad_norm": 0.5326991082953855, + "learning_rate": 9.194287525988358e-06, + "loss": 0.0486, + "step": 2380 + }, + { + "epoch": 0.79512439472366, + "grad_norm": 0.30114911980458225, + "learning_rate": 9.193229353644336e-06, + "loss": 0.0273, + "step": 2381 + }, + { + "epoch": 0.7954583402905326, + "grad_norm": 0.30664946189607595, + "learning_rate": 9.192170547863644e-06, + "loss": 0.028, + "step": 2382 + }, + { + "epoch": 0.7957922858574052, + "grad_norm": 0.5154007863444322, + "learning_rate": 9.191111108806228e-06, + "loss": 0.06, + "step": 2383 + }, + { + "epoch": 0.7961262314242779, + "grad_norm": 0.3483317254335199, + "learning_rate": 9.190051036632133e-06, + "loss": 0.0267, + "step": 2384 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 0.3830564906103203, + "learning_rate": 9.188990331501493e-06, + "loss": 0.0317, + "step": 2385 + }, + { + "epoch": 0.7967941225580231, + "grad_norm": 0.3534612409362452, + "learning_rate": 9.187928993574537e-06, + "loss": 0.0399, + "step": 2386 + }, + { + "epoch": 0.7971280681248957, + "grad_norm": 0.5207150849458622, + "learning_rate": 9.186867023011598e-06, + "loss": 0.0435, + "step": 2387 + }, + { + "epoch": 0.7974620136917683, + "grad_norm": 0.5533648404301923, + "learning_rate": 9.185804419973096e-06, + "loss": 0.0298, + "step": 2388 + }, + { + "epoch": 0.7977959592586409, + "grad_norm": 0.39095097237270326, + "learning_rate": 9.18474118461955e-06, + "loss": 0.0371, + "step": 2389 + }, + { + "epoch": 0.7981299048255135, + "grad_norm": 0.3223845734875138, + "learning_rate": 9.183677317111572e-06, + "loss": 0.0256, + "step": 2390 + }, + { + "epoch": 0.798463850392386, + "grad_norm": 0.3270148569203314, + "learning_rate": 9.182612817609877e-06, + "loss": 0.0279, + "step": 2391 + }, + { + "epoch": 0.7987977959592586, + "grad_norm": 0.5074522008396516, + "learning_rate": 9.181547686275266e-06, + "loss": 0.0473, + "step": 2392 + }, + { + "epoch": 0.7991317415261312, + "grad_norm": 0.4095450740977763, + "learning_rate": 9.180481923268641e-06, + "loss": 0.0359, + "step": 2393 + }, + { + "epoch": 0.7994656870930038, + "grad_norm": 0.6349669291396607, + "learning_rate": 9.179415528750998e-06, + "loss": 0.0443, + "step": 2394 + }, + { + "epoch": 0.7997996326598764, + "grad_norm": 0.3217982545761102, + "learning_rate": 9.178348502883428e-06, + "loss": 0.0284, + "step": 2395 + }, + { + "epoch": 0.800133578226749, + "grad_norm": 0.4167659783631312, + "learning_rate": 9.17728084582712e-06, + "loss": 0.0476, + "step": 2396 + }, + { + "epoch": 0.8004675237936216, + "grad_norm": 0.3966339225977142, + "learning_rate": 9.176212557743352e-06, + "loss": 0.0354, + "step": 2397 + }, + { + "epoch": 0.8008014693604942, + "grad_norm": 0.5187414436076643, + "learning_rate": 9.175143638793504e-06, + "loss": 0.0425, + "step": 2398 + }, + { + "epoch": 0.8011354149273668, + "grad_norm": 0.4727472650003963, + "learning_rate": 9.174074089139048e-06, + "loss": 0.0334, + "step": 2399 + }, + { + "epoch": 0.8014693604942394, + "grad_norm": 0.46117154169807706, + "learning_rate": 9.173003908941555e-06, + "loss": 0.0405, + "step": 2400 + }, + { + "epoch": 0.801803306061112, + "grad_norm": 0.37078738977373793, + "learning_rate": 9.171933098362685e-06, + "loss": 0.0461, + "step": 2401 + }, + { + "epoch": 0.8021372516279847, + "grad_norm": 0.32751459125821236, + "learning_rate": 9.170861657564197e-06, + "loss": 0.0271, + "step": 2402 + }, + { + "epoch": 0.8024711971948573, + "grad_norm": 0.4234680642811903, + "learning_rate": 9.169789586707947e-06, + "loss": 0.0368, + "step": 2403 + }, + { + "epoch": 0.8028051427617299, + "grad_norm": 0.7077112915277073, + "learning_rate": 9.16871688595588e-06, + "loss": 0.0526, + "step": 2404 + }, + { + "epoch": 0.8031390883286025, + "grad_norm": 0.3150630326209093, + "learning_rate": 9.167643555470044e-06, + "loss": 0.032, + "step": 2405 + }, + { + "epoch": 0.8034730338954751, + "grad_norm": 0.48362754954045, + "learning_rate": 9.166569595412576e-06, + "loss": 0.0468, + "step": 2406 + }, + { + "epoch": 0.8038069794623477, + "grad_norm": 0.7833067374264567, + "learning_rate": 9.16549500594571e-06, + "loss": 0.0309, + "step": 2407 + }, + { + "epoch": 0.8041409250292202, + "grad_norm": 0.3873284202124137, + "learning_rate": 9.164419787231778e-06, + "loss": 0.0336, + "step": 2408 + }, + { + "epoch": 0.8044748705960928, + "grad_norm": 0.5462434144047629, + "learning_rate": 9.163343939433202e-06, + "loss": 0.0316, + "step": 2409 + }, + { + "epoch": 0.8048088161629654, + "grad_norm": 0.5206466027492244, + "learning_rate": 9.162267462712502e-06, + "loss": 0.04, + "step": 2410 + }, + { + "epoch": 0.805142761729838, + "grad_norm": 0.3640320791945312, + "learning_rate": 9.161190357232292e-06, + "loss": 0.0416, + "step": 2411 + }, + { + "epoch": 0.8054767072967106, + "grad_norm": 0.2826873572129083, + "learning_rate": 9.160112623155282e-06, + "loss": 0.0255, + "step": 2412 + }, + { + "epoch": 0.8058106528635832, + "grad_norm": 0.24947060858739167, + "learning_rate": 9.159034260644277e-06, + "loss": 0.0259, + "step": 2413 + }, + { + "epoch": 0.8061445984304558, + "grad_norm": 0.4415832391790346, + "learning_rate": 9.157955269862176e-06, + "loss": 0.0538, + "step": 2414 + }, + { + "epoch": 0.8064785439973284, + "grad_norm": 0.5403604731819692, + "learning_rate": 9.156875650971974e-06, + "loss": 0.0479, + "step": 2415 + }, + { + "epoch": 0.806812489564201, + "grad_norm": 0.37074362603031524, + "learning_rate": 9.155795404136757e-06, + "loss": 0.0313, + "step": 2416 + }, + { + "epoch": 0.8071464351310736, + "grad_norm": 0.28043313341854315, + "learning_rate": 9.154714529519715e-06, + "loss": 0.0257, + "step": 2417 + }, + { + "epoch": 0.8074803806979463, + "grad_norm": 0.3944787897956824, + "learning_rate": 9.15363302728412e-06, + "loss": 0.037, + "step": 2418 + }, + { + "epoch": 0.8078143262648189, + "grad_norm": 0.54183445956239, + "learning_rate": 9.15255089759335e-06, + "loss": 0.0376, + "step": 2419 + }, + { + "epoch": 0.8081482718316915, + "grad_norm": 0.3641107896799597, + "learning_rate": 9.151468140610872e-06, + "loss": 0.0345, + "step": 2420 + }, + { + "epoch": 0.8084822173985641, + "grad_norm": 0.34098392754429363, + "learning_rate": 9.150384756500249e-06, + "loss": 0.0296, + "step": 2421 + }, + { + "epoch": 0.8088161629654367, + "grad_norm": 0.4701897245201092, + "learning_rate": 9.14930074542514e-06, + "loss": 0.0375, + "step": 2422 + }, + { + "epoch": 0.8091501085323093, + "grad_norm": 0.8031930296798723, + "learning_rate": 9.148216107549297e-06, + "loss": 0.0449, + "step": 2423 + }, + { + "epoch": 0.8094840540991818, + "grad_norm": 0.36074531884402183, + "learning_rate": 9.147130843036567e-06, + "loss": 0.0343, + "step": 2424 + }, + { + "epoch": 0.8098179996660544, + "grad_norm": 0.3821526286807176, + "learning_rate": 9.146044952050891e-06, + "loss": 0.033, + "step": 2425 + }, + { + "epoch": 0.810151945232927, + "grad_norm": 0.41207661037352494, + "learning_rate": 9.144958434756308e-06, + "loss": 0.0406, + "step": 2426 + }, + { + "epoch": 0.8104858907997996, + "grad_norm": 0.4425118776433806, + "learning_rate": 9.14387129131695e-06, + "loss": 0.0339, + "step": 2427 + }, + { + "epoch": 0.8108198363666722, + "grad_norm": 0.4517910740924372, + "learning_rate": 9.142783521897038e-06, + "loss": 0.0349, + "step": 2428 + }, + { + "epoch": 0.8111537819335448, + "grad_norm": 0.42162971152368633, + "learning_rate": 9.141695126660896e-06, + "loss": 0.0321, + "step": 2429 + }, + { + "epoch": 0.8114877275004174, + "grad_norm": 0.411906356374825, + "learning_rate": 9.14060610577294e-06, + "loss": 0.0399, + "step": 2430 + }, + { + "epoch": 0.81182167306729, + "grad_norm": 0.28577457074283213, + "learning_rate": 9.139516459397675e-06, + "loss": 0.028, + "step": 2431 + }, + { + "epoch": 0.8121556186341626, + "grad_norm": 0.7449829445852623, + "learning_rate": 9.13842618769971e-06, + "loss": 0.0505, + "step": 2432 + }, + { + "epoch": 0.8124895642010352, + "grad_norm": 0.45647043906420526, + "learning_rate": 9.13733529084374e-06, + "loss": 0.0599, + "step": 2433 + }, + { + "epoch": 0.8128235097679078, + "grad_norm": 0.35935069149352, + "learning_rate": 9.13624376899456e-06, + "loss": 0.0375, + "step": 2434 + }, + { + "epoch": 0.8131574553347805, + "grad_norm": 0.26724471005808453, + "learning_rate": 9.135151622317054e-06, + "loss": 0.0214, + "step": 2435 + }, + { + "epoch": 0.8134914009016531, + "grad_norm": 0.4143829862143663, + "learning_rate": 9.134058850976205e-06, + "loss": 0.0394, + "step": 2436 + }, + { + "epoch": 0.8138253464685257, + "grad_norm": 0.3387135148533028, + "learning_rate": 9.132965455137092e-06, + "loss": 0.0232, + "step": 2437 + }, + { + "epoch": 0.8141592920353983, + "grad_norm": 0.387476338381007, + "learning_rate": 9.13187143496488e-06, + "loss": 0.0328, + "step": 2438 + }, + { + "epoch": 0.8144932376022709, + "grad_norm": 0.32699976371462375, + "learning_rate": 9.13077679062484e-06, + "loss": 0.0313, + "step": 2439 + }, + { + "epoch": 0.8148271831691434, + "grad_norm": 0.4811761527697918, + "learning_rate": 9.129681522282326e-06, + "loss": 0.052, + "step": 2440 + }, + { + "epoch": 0.815161128736016, + "grad_norm": 0.40425537229020103, + "learning_rate": 9.128585630102793e-06, + "loss": 0.0343, + "step": 2441 + }, + { + "epoch": 0.8154950743028886, + "grad_norm": 0.22313328264074472, + "learning_rate": 9.127489114251787e-06, + "loss": 0.0221, + "step": 2442 + }, + { + "epoch": 0.8158290198697612, + "grad_norm": 0.4236672242450553, + "learning_rate": 9.12639197489495e-06, + "loss": 0.0409, + "step": 2443 + }, + { + "epoch": 0.8161629654366338, + "grad_norm": 0.3226748396398696, + "learning_rate": 9.125294212198022e-06, + "loss": 0.0318, + "step": 2444 + }, + { + "epoch": 0.8164969110035064, + "grad_norm": 0.39649953078996875, + "learning_rate": 9.124195826326827e-06, + "loss": 0.0394, + "step": 2445 + }, + { + "epoch": 0.816830856570379, + "grad_norm": 0.44571838385313783, + "learning_rate": 9.12309681744729e-06, + "loss": 0.0359, + "step": 2446 + }, + { + "epoch": 0.8171648021372516, + "grad_norm": 0.4612803299732282, + "learning_rate": 9.121997185725433e-06, + "loss": 0.0503, + "step": 2447 + }, + { + "epoch": 0.8174987477041242, + "grad_norm": 0.28429125911881203, + "learning_rate": 9.120896931327366e-06, + "loss": 0.0328, + "step": 2448 + }, + { + "epoch": 0.8178326932709968, + "grad_norm": 0.3489991448508281, + "learning_rate": 9.119796054419295e-06, + "loss": 0.0397, + "step": 2449 + }, + { + "epoch": 0.8181666388378694, + "grad_norm": 0.46086943336663794, + "learning_rate": 9.118694555167521e-06, + "loss": 0.0435, + "step": 2450 + }, + { + "epoch": 0.818500584404742, + "grad_norm": 0.37026378315513875, + "learning_rate": 9.117592433738439e-06, + "loss": 0.0396, + "step": 2451 + }, + { + "epoch": 0.8188345299716147, + "grad_norm": 0.3769869415058017, + "learning_rate": 9.116489690298536e-06, + "loss": 0.0339, + "step": 2452 + }, + { + "epoch": 0.8191684755384873, + "grad_norm": 0.3655957147179104, + "learning_rate": 9.115386325014396e-06, + "loss": 0.0279, + "step": 2453 + }, + { + "epoch": 0.8195024211053599, + "grad_norm": 0.5595894810042473, + "learning_rate": 9.114282338052695e-06, + "loss": 0.0733, + "step": 2454 + }, + { + "epoch": 0.8198363666722325, + "grad_norm": 0.3916737281680144, + "learning_rate": 9.113177729580203e-06, + "loss": 0.0303, + "step": 2455 + }, + { + "epoch": 0.820170312239105, + "grad_norm": 0.264321562523143, + "learning_rate": 9.112072499763783e-06, + "loss": 0.0263, + "step": 2456 + }, + { + "epoch": 0.8205042578059776, + "grad_norm": 0.43920788251930337, + "learning_rate": 9.110966648770392e-06, + "loss": 0.0367, + "step": 2457 + }, + { + "epoch": 0.8208382033728502, + "grad_norm": 0.3810762130061277, + "learning_rate": 9.109860176767085e-06, + "loss": 0.0374, + "step": 2458 + }, + { + "epoch": 0.8211721489397228, + "grad_norm": 0.37419432634766514, + "learning_rate": 9.108753083921007e-06, + "loss": 0.0269, + "step": 2459 + }, + { + "epoch": 0.8215060945065954, + "grad_norm": 0.3937119028500666, + "learning_rate": 9.107645370399395e-06, + "loss": 0.035, + "step": 2460 + }, + { + "epoch": 0.821840040073468, + "grad_norm": 0.3157086225779351, + "learning_rate": 9.106537036369587e-06, + "loss": 0.027, + "step": 2461 + }, + { + "epoch": 0.8221739856403406, + "grad_norm": 0.36220649413296774, + "learning_rate": 9.105428081999004e-06, + "loss": 0.0377, + "step": 2462 + }, + { + "epoch": 0.8225079312072132, + "grad_norm": 0.29694055654637075, + "learning_rate": 9.10431850745517e-06, + "loss": 0.0274, + "step": 2463 + }, + { + "epoch": 0.8228418767740858, + "grad_norm": 0.5536074318217982, + "learning_rate": 9.103208312905698e-06, + "loss": 0.0339, + "step": 2464 + }, + { + "epoch": 0.8231758223409584, + "grad_norm": 0.31589960925835464, + "learning_rate": 9.102097498518299e-06, + "loss": 0.0328, + "step": 2465 + }, + { + "epoch": 0.823509767907831, + "grad_norm": 0.240873576082435, + "learning_rate": 9.100986064460769e-06, + "loss": 0.0253, + "step": 2466 + }, + { + "epoch": 0.8238437134747036, + "grad_norm": 0.6390537219668236, + "learning_rate": 9.099874010901009e-06, + "loss": 0.0384, + "step": 2467 + }, + { + "epoch": 0.8241776590415762, + "grad_norm": 0.4003867672451697, + "learning_rate": 9.098761338007003e-06, + "loss": 0.0421, + "step": 2468 + }, + { + "epoch": 0.8245116046084489, + "grad_norm": 0.25997349608393655, + "learning_rate": 9.097648045946837e-06, + "loss": 0.0276, + "step": 2469 + }, + { + "epoch": 0.8248455501753215, + "grad_norm": 0.37630087962921577, + "learning_rate": 9.096534134888685e-06, + "loss": 0.0397, + "step": 2470 + }, + { + "epoch": 0.8251794957421941, + "grad_norm": 0.448552322720191, + "learning_rate": 9.095419605000817e-06, + "loss": 0.0542, + "step": 2471 + }, + { + "epoch": 0.8255134413090667, + "grad_norm": 0.40038428905827006, + "learning_rate": 9.094304456451596e-06, + "loss": 0.0329, + "step": 2472 + }, + { + "epoch": 0.8258473868759392, + "grad_norm": 0.35070676220854424, + "learning_rate": 9.093188689409477e-06, + "loss": 0.0305, + "step": 2473 + }, + { + "epoch": 0.8261813324428118, + "grad_norm": 0.6567098569875411, + "learning_rate": 9.09207230404301e-06, + "loss": 0.0433, + "step": 2474 + }, + { + "epoch": 0.8265152780096844, + "grad_norm": 0.4146248814385917, + "learning_rate": 9.090955300520842e-06, + "loss": 0.0359, + "step": 2475 + }, + { + "epoch": 0.826849223576557, + "grad_norm": 0.53906952828287, + "learning_rate": 9.089837679011704e-06, + "loss": 0.0487, + "step": 2476 + }, + { + "epoch": 0.8271831691434296, + "grad_norm": 0.3961223055800164, + "learning_rate": 9.08871943968443e-06, + "loss": 0.0465, + "step": 2477 + }, + { + "epoch": 0.8275171147103022, + "grad_norm": 0.35789536394251836, + "learning_rate": 9.08760058270794e-06, + "loss": 0.0304, + "step": 2478 + }, + { + "epoch": 0.8278510602771748, + "grad_norm": 0.5180262448696479, + "learning_rate": 9.086481108251253e-06, + "loss": 0.0475, + "step": 2479 + }, + { + "epoch": 0.8281850058440474, + "grad_norm": 0.33508836274125614, + "learning_rate": 9.085361016483477e-06, + "loss": 0.0324, + "step": 2480 + }, + { + "epoch": 0.82851895141092, + "grad_norm": 0.46774903934797496, + "learning_rate": 9.084240307573816e-06, + "loss": 0.0418, + "step": 2481 + }, + { + "epoch": 0.8288528969777926, + "grad_norm": 0.4297754725269856, + "learning_rate": 9.083118981691567e-06, + "loss": 0.0387, + "step": 2482 + }, + { + "epoch": 0.8291868425446652, + "grad_norm": 0.4137131594058549, + "learning_rate": 9.081997039006117e-06, + "loss": 0.023, + "step": 2483 + }, + { + "epoch": 0.8295207881115378, + "grad_norm": 0.3159406308326085, + "learning_rate": 9.080874479686952e-06, + "loss": 0.0294, + "step": 2484 + }, + { + "epoch": 0.8298547336784105, + "grad_norm": 0.4318463669325737, + "learning_rate": 9.079751303903646e-06, + "loss": 0.0388, + "step": 2485 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.5132811378489065, + "learning_rate": 9.078627511825866e-06, + "loss": 0.0442, + "step": 2486 + }, + { + "epoch": 0.8305226248121557, + "grad_norm": 0.369999066202171, + "learning_rate": 9.077503103623379e-06, + "loss": 0.0415, + "step": 2487 + }, + { + "epoch": 0.8308565703790283, + "grad_norm": 0.35851599522502636, + "learning_rate": 9.076378079466036e-06, + "loss": 0.0292, + "step": 2488 + }, + { + "epoch": 0.8311905159459008, + "grad_norm": 0.39661769308364786, + "learning_rate": 9.075252439523785e-06, + "loss": 0.0273, + "step": 2489 + }, + { + "epoch": 0.8315244615127734, + "grad_norm": 0.6490263729376547, + "learning_rate": 9.074126183966669e-06, + "loss": 0.0459, + "step": 2490 + }, + { + "epoch": 0.831858407079646, + "grad_norm": 0.4993562408027148, + "learning_rate": 9.072999312964823e-06, + "loss": 0.0322, + "step": 2491 + }, + { + "epoch": 0.8321923526465186, + "grad_norm": 0.5590656530101995, + "learning_rate": 9.071871826688472e-06, + "loss": 0.037, + "step": 2492 + }, + { + "epoch": 0.8325262982133912, + "grad_norm": 0.40610149648410226, + "learning_rate": 9.070743725307937e-06, + "loss": 0.0353, + "step": 2493 + }, + { + "epoch": 0.8328602437802638, + "grad_norm": 0.47595095725779396, + "learning_rate": 9.06961500899363e-06, + "loss": 0.0434, + "step": 2494 + }, + { + "epoch": 0.8331941893471364, + "grad_norm": 0.3155987530334695, + "learning_rate": 9.068485677916059e-06, + "loss": 0.0268, + "step": 2495 + }, + { + "epoch": 0.833528134914009, + "grad_norm": 0.29476634712756444, + "learning_rate": 9.06735573224582e-06, + "loss": 0.0293, + "step": 2496 + }, + { + "epoch": 0.8338620804808816, + "grad_norm": 0.45532023846265596, + "learning_rate": 9.066225172153607e-06, + "loss": 0.0383, + "step": 2497 + }, + { + "epoch": 0.8341960260477542, + "grad_norm": 0.45987924413218256, + "learning_rate": 9.065093997810204e-06, + "loss": 0.0419, + "step": 2498 + }, + { + "epoch": 0.8345299716146268, + "grad_norm": 0.3355220422575681, + "learning_rate": 9.063962209386485e-06, + "loss": 0.0332, + "step": 2499 + }, + { + "epoch": 0.8348639171814994, + "grad_norm": 0.3454349722962344, + "learning_rate": 9.062829807053426e-06, + "loss": 0.0309, + "step": 2500 + }, + { + "epoch": 0.835197862748372, + "grad_norm": 0.35926714475349847, + "learning_rate": 9.061696790982086e-06, + "loss": 0.0392, + "step": 2501 + }, + { + "epoch": 0.8355318083152447, + "grad_norm": 0.4295126451366027, + "learning_rate": 9.060563161343618e-06, + "loss": 0.0349, + "step": 2502 + }, + { + "epoch": 0.8358657538821173, + "grad_norm": 0.29260561635820886, + "learning_rate": 9.059428918309276e-06, + "loss": 0.0257, + "step": 2503 + }, + { + "epoch": 0.8361996994489899, + "grad_norm": 0.33008995795141166, + "learning_rate": 9.058294062050396e-06, + "loss": 0.046, + "step": 2504 + }, + { + "epoch": 0.8365336450158624, + "grad_norm": 0.2517921444583666, + "learning_rate": 9.057158592738414e-06, + "loss": 0.0208, + "step": 2505 + }, + { + "epoch": 0.836867590582735, + "grad_norm": 0.31092700651231453, + "learning_rate": 9.056022510544855e-06, + "loss": 0.0264, + "step": 2506 + }, + { + "epoch": 0.8372015361496076, + "grad_norm": 0.3313582044848553, + "learning_rate": 9.054885815641336e-06, + "loss": 0.0388, + "step": 2507 + }, + { + "epoch": 0.8375354817164802, + "grad_norm": 0.39183559708166843, + "learning_rate": 9.05374850819957e-06, + "loss": 0.0335, + "step": 2508 + }, + { + "epoch": 0.8378694272833528, + "grad_norm": 0.3097698801810843, + "learning_rate": 9.052610588391363e-06, + "loss": 0.0304, + "step": 2509 + }, + { + "epoch": 0.8382033728502254, + "grad_norm": 0.46386041507843345, + "learning_rate": 9.051472056388606e-06, + "loss": 0.0411, + "step": 2510 + }, + { + "epoch": 0.838537318417098, + "grad_norm": 0.3402621546983913, + "learning_rate": 9.050332912363292e-06, + "loss": 0.0464, + "step": 2511 + }, + { + "epoch": 0.8388712639839706, + "grad_norm": 0.42905597080581825, + "learning_rate": 9.049193156487501e-06, + "loss": 0.0347, + "step": 2512 + }, + { + "epoch": 0.8392052095508432, + "grad_norm": 0.36758534229327194, + "learning_rate": 9.048052788933405e-06, + "loss": 0.0373, + "step": 2513 + }, + { + "epoch": 0.8395391551177158, + "grad_norm": 0.3109644251678908, + "learning_rate": 9.046911809873271e-06, + "loss": 0.0359, + "step": 2514 + }, + { + "epoch": 0.8398731006845884, + "grad_norm": 0.270042973528763, + "learning_rate": 9.045770219479457e-06, + "loss": 0.0283, + "step": 2515 + }, + { + "epoch": 0.840207046251461, + "grad_norm": 0.3615870964014073, + "learning_rate": 9.044628017924415e-06, + "loss": 0.0387, + "step": 2516 + }, + { + "epoch": 0.8405409918183336, + "grad_norm": 0.4431650479128026, + "learning_rate": 9.043485205380685e-06, + "loss": 0.0424, + "step": 2517 + }, + { + "epoch": 0.8408749373852062, + "grad_norm": 0.3106387026167272, + "learning_rate": 9.042341782020906e-06, + "loss": 0.0296, + "step": 2518 + }, + { + "epoch": 0.8412088829520789, + "grad_norm": 0.4525540474250504, + "learning_rate": 9.041197748017802e-06, + "loss": 0.0285, + "step": 2519 + }, + { + "epoch": 0.8415428285189515, + "grad_norm": 0.4281303859257426, + "learning_rate": 9.040053103544196e-06, + "loss": 0.0309, + "step": 2520 + }, + { + "epoch": 0.8418767740858241, + "grad_norm": 0.34463536752834206, + "learning_rate": 9.038907848772999e-06, + "loss": 0.0291, + "step": 2521 + }, + { + "epoch": 0.8422107196526966, + "grad_norm": 0.4978034332914274, + "learning_rate": 9.037761983877214e-06, + "loss": 0.0403, + "step": 2522 + }, + { + "epoch": 0.8425446652195692, + "grad_norm": 0.2935442990809466, + "learning_rate": 9.036615509029939e-06, + "loss": 0.0277, + "step": 2523 + }, + { + "epoch": 0.8428786107864418, + "grad_norm": 0.3939069379966287, + "learning_rate": 9.035468424404362e-06, + "loss": 0.0447, + "step": 2524 + }, + { + "epoch": 0.8432125563533144, + "grad_norm": 0.2692561881254716, + "learning_rate": 9.034320730173762e-06, + "loss": 0.032, + "step": 2525 + }, + { + "epoch": 0.843546501920187, + "grad_norm": 0.40534574419674485, + "learning_rate": 9.033172426511515e-06, + "loss": 0.0338, + "step": 2526 + }, + { + "epoch": 0.8438804474870596, + "grad_norm": 0.38824482878056465, + "learning_rate": 9.032023513591081e-06, + "loss": 0.0341, + "step": 2527 + }, + { + "epoch": 0.8442143930539322, + "grad_norm": 0.3807926866614681, + "learning_rate": 9.030873991586021e-06, + "loss": 0.0328, + "step": 2528 + }, + { + "epoch": 0.8445483386208048, + "grad_norm": 0.3329664612627948, + "learning_rate": 9.029723860669983e-06, + "loss": 0.0333, + "step": 2529 + }, + { + "epoch": 0.8448822841876774, + "grad_norm": 0.4923109783222729, + "learning_rate": 9.028573121016707e-06, + "loss": 0.0469, + "step": 2530 + }, + { + "epoch": 0.84521622975455, + "grad_norm": 0.4685760628222625, + "learning_rate": 9.027421772800027e-06, + "loss": 0.0455, + "step": 2531 + }, + { + "epoch": 0.8455501753214226, + "grad_norm": 0.3429466888956345, + "learning_rate": 9.026269816193867e-06, + "loss": 0.0326, + "step": 2532 + }, + { + "epoch": 0.8458841208882952, + "grad_norm": 0.41156545406048173, + "learning_rate": 9.025117251372242e-06, + "loss": 0.0389, + "step": 2533 + }, + { + "epoch": 0.8462180664551678, + "grad_norm": 0.38520743067716473, + "learning_rate": 9.023964078509263e-06, + "loss": 0.0468, + "step": 2534 + }, + { + "epoch": 0.8465520120220404, + "grad_norm": 0.4426812965109251, + "learning_rate": 9.022810297779129e-06, + "loss": 0.0314, + "step": 2535 + }, + { + "epoch": 0.846885957588913, + "grad_norm": 0.4054259111233482, + "learning_rate": 9.021655909356133e-06, + "loss": 0.0319, + "step": 2536 + }, + { + "epoch": 0.8472199031557857, + "grad_norm": 0.275475936813345, + "learning_rate": 9.020500913414658e-06, + "loss": 0.0313, + "step": 2537 + }, + { + "epoch": 0.8475538487226582, + "grad_norm": 0.5625464771479998, + "learning_rate": 9.019345310129179e-06, + "loss": 0.0343, + "step": 2538 + }, + { + "epoch": 0.8478877942895308, + "grad_norm": 0.37284068266175235, + "learning_rate": 9.018189099674266e-06, + "loss": 0.0393, + "step": 2539 + }, + { + "epoch": 0.8482217398564034, + "grad_norm": 0.39007837088611796, + "learning_rate": 9.017032282224577e-06, + "loss": 0.0318, + "step": 2540 + }, + { + "epoch": 0.848555685423276, + "grad_norm": 0.2668760370278359, + "learning_rate": 9.015874857954863e-06, + "loss": 0.0243, + "step": 2541 + }, + { + "epoch": 0.8488896309901486, + "grad_norm": 0.3726617870078607, + "learning_rate": 9.014716827039965e-06, + "loss": 0.0419, + "step": 2542 + }, + { + "epoch": 0.8492235765570212, + "grad_norm": 0.3365880533500763, + "learning_rate": 9.013558189654819e-06, + "loss": 0.0324, + "step": 2543 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 0.3724418543681984, + "learning_rate": 9.01239894597445e-06, + "loss": 0.0321, + "step": 2544 + }, + { + "epoch": 0.8498914676907664, + "grad_norm": 0.36786990425937205, + "learning_rate": 9.011239096173977e-06, + "loss": 0.026, + "step": 2545 + }, + { + "epoch": 0.850225413257639, + "grad_norm": 0.3369252086446057, + "learning_rate": 9.010078640428606e-06, + "loss": 0.0294, + "step": 2546 + }, + { + "epoch": 0.8505593588245116, + "grad_norm": 0.41643199528600433, + "learning_rate": 9.00891757891364e-06, + "loss": 0.0359, + "step": 2547 + }, + { + "epoch": 0.8508933043913842, + "grad_norm": 0.3551459626507266, + "learning_rate": 9.007755911804471e-06, + "loss": 0.0416, + "step": 2548 + }, + { + "epoch": 0.8512272499582568, + "grad_norm": 0.5818091704775687, + "learning_rate": 9.006593639276582e-06, + "loss": 0.0388, + "step": 2549 + }, + { + "epoch": 0.8515611955251294, + "grad_norm": 0.4401470784045499, + "learning_rate": 9.005430761505548e-06, + "loss": 0.0379, + "step": 2550 + }, + { + "epoch": 0.851895141092002, + "grad_norm": 0.34058136620945995, + "learning_rate": 9.004267278667032e-06, + "loss": 0.037, + "step": 2551 + }, + { + "epoch": 0.8522290866588746, + "grad_norm": 0.2688628258869588, + "learning_rate": 9.003103190936797e-06, + "loss": 0.0233, + "step": 2552 + }, + { + "epoch": 0.8525630322257473, + "grad_norm": 0.31540272385338997, + "learning_rate": 9.00193849849069e-06, + "loss": 0.026, + "step": 2553 + }, + { + "epoch": 0.8528969777926197, + "grad_norm": 0.3048526266868853, + "learning_rate": 9.00077320150465e-06, + "loss": 0.0309, + "step": 2554 + }, + { + "epoch": 0.8532309233594924, + "grad_norm": 0.6693321639654497, + "learning_rate": 8.999607300154712e-06, + "loss": 0.0421, + "step": 2555 + }, + { + "epoch": 0.853564868926365, + "grad_norm": 0.25290515928318363, + "learning_rate": 8.998440794616998e-06, + "loss": 0.0235, + "step": 2556 + }, + { + "epoch": 0.8538988144932376, + "grad_norm": 0.3147693099430538, + "learning_rate": 8.99727368506772e-06, + "loss": 0.0276, + "step": 2557 + }, + { + "epoch": 0.8542327600601102, + "grad_norm": 0.7088509591714547, + "learning_rate": 8.996105971683187e-06, + "loss": 0.0278, + "step": 2558 + }, + { + "epoch": 0.8545667056269828, + "grad_norm": 0.6308194130049286, + "learning_rate": 8.994937654639793e-06, + "loss": 0.0442, + "step": 2559 + }, + { + "epoch": 0.8549006511938554, + "grad_norm": 0.44898239754939884, + "learning_rate": 8.993768734114029e-06, + "loss": 0.0309, + "step": 2560 + }, + { + "epoch": 0.855234596760728, + "grad_norm": 0.28749219626684475, + "learning_rate": 8.992599210282471e-06, + "loss": 0.0315, + "step": 2561 + }, + { + "epoch": 0.8555685423276006, + "grad_norm": 0.3812508704110184, + "learning_rate": 8.991429083321792e-06, + "loss": 0.0312, + "step": 2562 + }, + { + "epoch": 0.8559024878944732, + "grad_norm": 1.7031389079076569, + "learning_rate": 8.990258353408754e-06, + "loss": 0.0459, + "step": 2563 + }, + { + "epoch": 0.8562364334613458, + "grad_norm": 0.31544007516151007, + "learning_rate": 8.989087020720204e-06, + "loss": 0.0371, + "step": 2564 + }, + { + "epoch": 0.8565703790282184, + "grad_norm": 0.38767338172827265, + "learning_rate": 8.987915085433092e-06, + "loss": 0.0335, + "step": 2565 + }, + { + "epoch": 0.856904324595091, + "grad_norm": 0.3000791654786373, + "learning_rate": 8.98674254772445e-06, + "loss": 0.0318, + "step": 2566 + }, + { + "epoch": 0.8572382701619636, + "grad_norm": 0.31229740046048454, + "learning_rate": 8.985569407771404e-06, + "loss": 0.0303, + "step": 2567 + }, + { + "epoch": 0.8575722157288362, + "grad_norm": 0.49339490655013896, + "learning_rate": 8.984395665751169e-06, + "loss": 0.0421, + "step": 2568 + }, + { + "epoch": 0.8579061612957088, + "grad_norm": 0.36816145730214384, + "learning_rate": 8.983221321841056e-06, + "loss": 0.0328, + "step": 2569 + }, + { + "epoch": 0.8582401068625815, + "grad_norm": 0.37529493325365043, + "learning_rate": 8.98204637621846e-06, + "loss": 0.0331, + "step": 2570 + }, + { + "epoch": 0.858574052429454, + "grad_norm": 0.45748221301458636, + "learning_rate": 8.980870829060872e-06, + "loss": 0.0321, + "step": 2571 + }, + { + "epoch": 0.8589079979963266, + "grad_norm": 0.5329144888857236, + "learning_rate": 8.979694680545872e-06, + "loss": 0.0339, + "step": 2572 + }, + { + "epoch": 0.8592419435631992, + "grad_norm": 0.33675903890161457, + "learning_rate": 8.978517930851132e-06, + "loss": 0.0284, + "step": 2573 + }, + { + "epoch": 0.8595758891300718, + "grad_norm": 0.44303915509008357, + "learning_rate": 8.977340580154411e-06, + "loss": 0.0367, + "step": 2574 + }, + { + "epoch": 0.8599098346969444, + "grad_norm": 0.4514433634093268, + "learning_rate": 8.976162628633565e-06, + "loss": 0.0457, + "step": 2575 + }, + { + "epoch": 0.860243780263817, + "grad_norm": 0.43410756328034344, + "learning_rate": 8.974984076466537e-06, + "loss": 0.035, + "step": 2576 + }, + { + "epoch": 0.8605777258306896, + "grad_norm": 0.3872709268496956, + "learning_rate": 8.97380492383136e-06, + "loss": 0.0305, + "step": 2577 + }, + { + "epoch": 0.8609116713975622, + "grad_norm": 0.47725018597115404, + "learning_rate": 8.972625170906157e-06, + "loss": 0.0483, + "step": 2578 + }, + { + "epoch": 0.8612456169644348, + "grad_norm": 0.31772576137655567, + "learning_rate": 8.971444817869148e-06, + "loss": 0.0283, + "step": 2579 + }, + { + "epoch": 0.8615795625313074, + "grad_norm": 0.37107286537717504, + "learning_rate": 8.970263864898636e-06, + "loss": 0.0413, + "step": 2580 + }, + { + "epoch": 0.86191350809818, + "grad_norm": 0.5725590523682224, + "learning_rate": 8.969082312173021e-06, + "loss": 0.0423, + "step": 2581 + }, + { + "epoch": 0.8622474536650526, + "grad_norm": 0.3941374912936759, + "learning_rate": 8.967900159870787e-06, + "loss": 0.0295, + "step": 2582 + }, + { + "epoch": 0.8625813992319252, + "grad_norm": 0.36191755909944073, + "learning_rate": 8.966717408170512e-06, + "loss": 0.031, + "step": 2583 + }, + { + "epoch": 0.8629153447987978, + "grad_norm": 0.3429675161089552, + "learning_rate": 8.965534057250866e-06, + "loss": 0.0433, + "step": 2584 + }, + { + "epoch": 0.8632492903656704, + "grad_norm": 0.3028077737377305, + "learning_rate": 8.964350107290609e-06, + "loss": 0.0297, + "step": 2585 + }, + { + "epoch": 0.863583235932543, + "grad_norm": 0.4830327842147132, + "learning_rate": 8.96316555846859e-06, + "loss": 0.0399, + "step": 2586 + }, + { + "epoch": 0.8639171814994155, + "grad_norm": 0.26916796497004913, + "learning_rate": 8.961980410963749e-06, + "loss": 0.02, + "step": 2587 + }, + { + "epoch": 0.8642511270662881, + "grad_norm": 0.4134466211792293, + "learning_rate": 8.960794664955115e-06, + "loss": 0.041, + "step": 2588 + }, + { + "epoch": 0.8645850726331608, + "grad_norm": 0.32659389902593844, + "learning_rate": 8.95960832062181e-06, + "loss": 0.029, + "step": 2589 + }, + { + "epoch": 0.8649190182000334, + "grad_norm": 0.7694702843000258, + "learning_rate": 8.958421378143046e-06, + "loss": 0.0386, + "step": 2590 + }, + { + "epoch": 0.865252963766906, + "grad_norm": 0.5168924274103711, + "learning_rate": 8.957233837698122e-06, + "loss": 0.0411, + "step": 2591 + }, + { + "epoch": 0.8655869093337786, + "grad_norm": 0.43121061815199463, + "learning_rate": 8.956045699466433e-06, + "loss": 0.0525, + "step": 2592 + }, + { + "epoch": 0.8659208549006512, + "grad_norm": 0.3314527135026029, + "learning_rate": 8.95485696362746e-06, + "loss": 0.0358, + "step": 2593 + }, + { + "epoch": 0.8662548004675238, + "grad_norm": 0.463869796250436, + "learning_rate": 8.953667630360778e-06, + "loss": 0.036, + "step": 2594 + }, + { + "epoch": 0.8665887460343964, + "grad_norm": 0.5740069960790621, + "learning_rate": 8.952477699846044e-06, + "loss": 0.0433, + "step": 2595 + }, + { + "epoch": 0.866922691601269, + "grad_norm": 0.6188270632194969, + "learning_rate": 8.951287172263018e-06, + "loss": 0.0291, + "step": 2596 + }, + { + "epoch": 0.8672566371681416, + "grad_norm": 0.422883593465061, + "learning_rate": 8.950096047791539e-06, + "loss": 0.0431, + "step": 2597 + }, + { + "epoch": 0.8675905827350142, + "grad_norm": 0.6846113727170556, + "learning_rate": 8.94890432661154e-06, + "loss": 0.0496, + "step": 2598 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.3031987176226039, + "learning_rate": 8.947712008903045e-06, + "loss": 0.0307, + "step": 2599 + }, + { + "epoch": 0.8682584738687594, + "grad_norm": 0.56108161337743, + "learning_rate": 8.946519094846169e-06, + "loss": 0.0335, + "step": 2600 + }, + { + "epoch": 0.868592419435632, + "grad_norm": 0.37223617309309653, + "learning_rate": 8.945325584621116e-06, + "loss": 0.0297, + "step": 2601 + }, + { + "epoch": 0.8689263650025046, + "grad_norm": 0.5549951575258364, + "learning_rate": 8.944131478408177e-06, + "loss": 0.0449, + "step": 2602 + }, + { + "epoch": 0.8692603105693771, + "grad_norm": 0.45803541602321984, + "learning_rate": 8.942936776387739e-06, + "loss": 0.0352, + "step": 2603 + }, + { + "epoch": 0.8695942561362497, + "grad_norm": 0.4264773699081465, + "learning_rate": 8.941741478740272e-06, + "loss": 0.0369, + "step": 2604 + }, + { + "epoch": 0.8699282017031224, + "grad_norm": 0.3191449035292794, + "learning_rate": 8.940545585646344e-06, + "loss": 0.0297, + "step": 2605 + }, + { + "epoch": 0.870262147269995, + "grad_norm": 0.3198982569663816, + "learning_rate": 8.939349097286608e-06, + "loss": 0.0265, + "step": 2606 + }, + { + "epoch": 0.8705960928368676, + "grad_norm": 0.5553117913988942, + "learning_rate": 8.938152013841803e-06, + "loss": 0.0366, + "step": 2607 + }, + { + "epoch": 0.8709300384037402, + "grad_norm": 0.44719504860903736, + "learning_rate": 8.93695433549277e-06, + "loss": 0.0376, + "step": 2608 + }, + { + "epoch": 0.8712639839706128, + "grad_norm": 0.32679155125163895, + "learning_rate": 8.935756062420426e-06, + "loss": 0.0346, + "step": 2609 + }, + { + "epoch": 0.8715979295374854, + "grad_norm": 0.4609210191387648, + "learning_rate": 8.934557194805787e-06, + "loss": 0.0325, + "step": 2610 + }, + { + "epoch": 0.871931875104358, + "grad_norm": 0.4706308256660193, + "learning_rate": 8.933357732829957e-06, + "loss": 0.0397, + "step": 2611 + }, + { + "epoch": 0.8722658206712306, + "grad_norm": 0.40300368032655215, + "learning_rate": 8.932157676674126e-06, + "loss": 0.0292, + "step": 2612 + }, + { + "epoch": 0.8725997662381032, + "grad_norm": 0.28732447433061004, + "learning_rate": 8.93095702651958e-06, + "loss": 0.0305, + "step": 2613 + }, + { + "epoch": 0.8729337118049758, + "grad_norm": 0.36369387012379667, + "learning_rate": 8.92975578254769e-06, + "loss": 0.0504, + "step": 2614 + }, + { + "epoch": 0.8732676573718484, + "grad_norm": 0.4591704692481582, + "learning_rate": 8.928553944939915e-06, + "loss": 0.0402, + "step": 2615 + }, + { + "epoch": 0.873601602938721, + "grad_norm": 0.33481246908498963, + "learning_rate": 8.92735151387781e-06, + "loss": 0.0287, + "step": 2616 + }, + { + "epoch": 0.8739355485055936, + "grad_norm": 0.357360199668993, + "learning_rate": 8.926148489543018e-06, + "loss": 0.0362, + "step": 2617 + }, + { + "epoch": 0.8742694940724662, + "grad_norm": 0.39042917680217704, + "learning_rate": 8.924944872117264e-06, + "loss": 0.0391, + "step": 2618 + }, + { + "epoch": 0.8746034396393388, + "grad_norm": 0.28540601454276765, + "learning_rate": 8.923740661782376e-06, + "loss": 0.0287, + "step": 2619 + }, + { + "epoch": 0.8749373852062113, + "grad_norm": 0.4140095320834736, + "learning_rate": 8.92253585872026e-06, + "loss": 0.0326, + "step": 2620 + }, + { + "epoch": 0.8752713307730839, + "grad_norm": 0.4121447335969474, + "learning_rate": 8.921330463112915e-06, + "loss": 0.0255, + "step": 2621 + }, + { + "epoch": 0.8756052763399566, + "grad_norm": 0.3554290077776676, + "learning_rate": 8.92012447514243e-06, + "loss": 0.031, + "step": 2622 + }, + { + "epoch": 0.8759392219068292, + "grad_norm": 0.7307532156262747, + "learning_rate": 8.918917894990989e-06, + "loss": 0.0362, + "step": 2623 + }, + { + "epoch": 0.8762731674737018, + "grad_norm": 0.2553936615311305, + "learning_rate": 8.917710722840853e-06, + "loss": 0.0178, + "step": 2624 + }, + { + "epoch": 0.8766071130405744, + "grad_norm": 0.37407512446904145, + "learning_rate": 8.916502958874385e-06, + "loss": 0.0363, + "step": 2625 + }, + { + "epoch": 0.876941058607447, + "grad_norm": 0.42475876395394646, + "learning_rate": 8.915294603274027e-06, + "loss": 0.0368, + "step": 2626 + }, + { + "epoch": 0.8772750041743196, + "grad_norm": 0.3358353924527036, + "learning_rate": 8.91408565622232e-06, + "loss": 0.0446, + "step": 2627 + }, + { + "epoch": 0.8776089497411922, + "grad_norm": 0.40566349975881943, + "learning_rate": 8.912876117901887e-06, + "loss": 0.0372, + "step": 2628 + }, + { + "epoch": 0.8779428953080648, + "grad_norm": 0.3742251940601137, + "learning_rate": 8.911665988495446e-06, + "loss": 0.0417, + "step": 2629 + }, + { + "epoch": 0.8782768408749374, + "grad_norm": 0.3712879563967958, + "learning_rate": 8.910455268185795e-06, + "loss": 0.0339, + "step": 2630 + }, + { + "epoch": 0.87861078644181, + "grad_norm": 0.34386298764350637, + "learning_rate": 8.909243957155835e-06, + "loss": 0.0326, + "step": 2631 + }, + { + "epoch": 0.8789447320086826, + "grad_norm": 0.3236861653841093, + "learning_rate": 8.908032055588544e-06, + "loss": 0.0474, + "step": 2632 + }, + { + "epoch": 0.8792786775755552, + "grad_norm": 0.26739114830091093, + "learning_rate": 8.906819563666997e-06, + "loss": 0.034, + "step": 2633 + }, + { + "epoch": 0.8796126231424278, + "grad_norm": 0.3358869564481119, + "learning_rate": 8.905606481574351e-06, + "loss": 0.0342, + "step": 2634 + }, + { + "epoch": 0.8799465687093004, + "grad_norm": 0.2755597387425229, + "learning_rate": 8.90439280949386e-06, + "loss": 0.0267, + "step": 2635 + }, + { + "epoch": 0.8802805142761729, + "grad_norm": 0.3766510920867843, + "learning_rate": 8.903178547608863e-06, + "loss": 0.0296, + "step": 2636 + }, + { + "epoch": 0.8806144598430455, + "grad_norm": 0.34793544795872505, + "learning_rate": 8.901963696102788e-06, + "loss": 0.034, + "step": 2637 + }, + { + "epoch": 0.8809484054099181, + "grad_norm": 0.275487457116531, + "learning_rate": 8.900748255159152e-06, + "loss": 0.0267, + "step": 2638 + }, + { + "epoch": 0.8812823509767908, + "grad_norm": 0.24705015902803695, + "learning_rate": 8.899532224961562e-06, + "loss": 0.0227, + "step": 2639 + }, + { + "epoch": 0.8816162965436634, + "grad_norm": 0.6536853667142282, + "learning_rate": 8.898315605693715e-06, + "loss": 0.0438, + "step": 2640 + }, + { + "epoch": 0.881950242110536, + "grad_norm": 0.43430130069824485, + "learning_rate": 8.897098397539394e-06, + "loss": 0.0317, + "step": 2641 + }, + { + "epoch": 0.8822841876774086, + "grad_norm": 0.43687515508839014, + "learning_rate": 8.895880600682472e-06, + "loss": 0.0384, + "step": 2642 + }, + { + "epoch": 0.8826181332442812, + "grad_norm": 0.39590306979153195, + "learning_rate": 8.894662215306913e-06, + "loss": 0.033, + "step": 2643 + }, + { + "epoch": 0.8829520788111538, + "grad_norm": 0.7290575707828736, + "learning_rate": 8.89344324159677e-06, + "loss": 0.0364, + "step": 2644 + }, + { + "epoch": 0.8832860243780264, + "grad_norm": 0.3751669108911836, + "learning_rate": 8.89222367973618e-06, + "loss": 0.0311, + "step": 2645 + }, + { + "epoch": 0.883619969944899, + "grad_norm": 0.34397576939891117, + "learning_rate": 8.891003529909375e-06, + "loss": 0.0371, + "step": 2646 + }, + { + "epoch": 0.8839539155117716, + "grad_norm": 0.4109270316750449, + "learning_rate": 8.889782792300672e-06, + "loss": 0.0301, + "step": 2647 + }, + { + "epoch": 0.8842878610786442, + "grad_norm": 0.30637162441557436, + "learning_rate": 8.888561467094476e-06, + "loss": 0.0275, + "step": 2648 + }, + { + "epoch": 0.8846218066455168, + "grad_norm": 0.30622959236678143, + "learning_rate": 8.887339554475284e-06, + "loss": 0.0326, + "step": 2649 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 0.3161478933617422, + "learning_rate": 8.886117054627682e-06, + "loss": 0.032, + "step": 2650 + }, + { + "epoch": 0.885289697779262, + "grad_norm": 0.4208502399428019, + "learning_rate": 8.88489396773634e-06, + "loss": 0.0242, + "step": 2651 + }, + { + "epoch": 0.8856236433461345, + "grad_norm": 0.3681407208121261, + "learning_rate": 8.883670293986019e-06, + "loss": 0.0298, + "step": 2652 + }, + { + "epoch": 0.8859575889130071, + "grad_norm": 0.34878417508861853, + "learning_rate": 8.882446033561576e-06, + "loss": 0.033, + "step": 2653 + }, + { + "epoch": 0.8862915344798797, + "grad_norm": 0.4684780724338432, + "learning_rate": 8.881221186647941e-06, + "loss": 0.045, + "step": 2654 + }, + { + "epoch": 0.8866254800467523, + "grad_norm": 0.3957874804275106, + "learning_rate": 8.879995753430148e-06, + "loss": 0.0347, + "step": 2655 + }, + { + "epoch": 0.886959425613625, + "grad_norm": 0.42665252936857895, + "learning_rate": 8.878769734093312e-06, + "loss": 0.032, + "step": 2656 + }, + { + "epoch": 0.8872933711804976, + "grad_norm": 0.29682241926451486, + "learning_rate": 8.877543128822634e-06, + "loss": 0.0298, + "step": 2657 + }, + { + "epoch": 0.8876273167473702, + "grad_norm": 0.3651957801300177, + "learning_rate": 8.876315937803413e-06, + "loss": 0.0323, + "step": 2658 + }, + { + "epoch": 0.8879612623142428, + "grad_norm": 0.2769485137576066, + "learning_rate": 8.875088161221025e-06, + "loss": 0.0307, + "step": 2659 + }, + { + "epoch": 0.8882952078811154, + "grad_norm": 0.4415688094211487, + "learning_rate": 8.873859799260944e-06, + "loss": 0.0353, + "step": 2660 + }, + { + "epoch": 0.888629153447988, + "grad_norm": 0.33974912444710537, + "learning_rate": 8.872630852108725e-06, + "loss": 0.0358, + "step": 2661 + }, + { + "epoch": 0.8889630990148606, + "grad_norm": 0.2861482838286048, + "learning_rate": 8.87140131995002e-06, + "loss": 0.0331, + "step": 2662 + }, + { + "epoch": 0.8892970445817332, + "grad_norm": 0.33208570178611857, + "learning_rate": 8.870171202970559e-06, + "loss": 0.0248, + "step": 2663 + }, + { + "epoch": 0.8896309901486058, + "grad_norm": 0.35668018176898314, + "learning_rate": 8.868940501356169e-06, + "loss": 0.0359, + "step": 2664 + }, + { + "epoch": 0.8899649357154784, + "grad_norm": 0.2931305648052054, + "learning_rate": 8.86770921529276e-06, + "loss": 0.0255, + "step": 2665 + }, + { + "epoch": 0.890298881282351, + "grad_norm": 0.8325037062693247, + "learning_rate": 8.866477344966334e-06, + "loss": 0.0302, + "step": 2666 + }, + { + "epoch": 0.8906328268492236, + "grad_norm": 0.27860169435527826, + "learning_rate": 8.865244890562978e-06, + "loss": 0.0297, + "step": 2667 + }, + { + "epoch": 0.8909667724160962, + "grad_norm": 0.49230324299758665, + "learning_rate": 8.864011852268872e-06, + "loss": 0.0431, + "step": 2668 + }, + { + "epoch": 0.8913007179829687, + "grad_norm": 0.3231549530220241, + "learning_rate": 8.862778230270276e-06, + "loss": 0.028, + "step": 2669 + }, + { + "epoch": 0.8916346635498413, + "grad_norm": 0.4893422631717919, + "learning_rate": 8.861544024753545e-06, + "loss": 0.0638, + "step": 2670 + }, + { + "epoch": 0.8919686091167139, + "grad_norm": 0.4412187905975192, + "learning_rate": 8.860309235905122e-06, + "loss": 0.0423, + "step": 2671 + }, + { + "epoch": 0.8923025546835865, + "grad_norm": 0.33046413914807304, + "learning_rate": 8.859073863911536e-06, + "loss": 0.0459, + "step": 2672 + }, + { + "epoch": 0.8926365002504592, + "grad_norm": 0.3452233475138895, + "learning_rate": 8.857837908959404e-06, + "loss": 0.0382, + "step": 2673 + }, + { + "epoch": 0.8929704458173318, + "grad_norm": 0.37413800958104054, + "learning_rate": 8.856601371235429e-06, + "loss": 0.0342, + "step": 2674 + }, + { + "epoch": 0.8933043913842044, + "grad_norm": 0.4060950887816249, + "learning_rate": 8.855364250926409e-06, + "loss": 0.0424, + "step": 2675 + }, + { + "epoch": 0.893638336951077, + "grad_norm": 0.35810437403999507, + "learning_rate": 8.854126548219222e-06, + "loss": 0.024, + "step": 2676 + }, + { + "epoch": 0.8939722825179496, + "grad_norm": 0.5647203968417588, + "learning_rate": 8.85288826330084e-06, + "loss": 0.0397, + "step": 2677 + }, + { + "epoch": 0.8943062280848222, + "grad_norm": 0.3599785221058561, + "learning_rate": 8.85164939635832e-06, + "loss": 0.0269, + "step": 2678 + }, + { + "epoch": 0.8946401736516948, + "grad_norm": 0.3624847754468508, + "learning_rate": 8.850409947578806e-06, + "loss": 0.0363, + "step": 2679 + }, + { + "epoch": 0.8949741192185674, + "grad_norm": 0.3693969366357945, + "learning_rate": 8.849169917149532e-06, + "loss": 0.0333, + "step": 2680 + }, + { + "epoch": 0.89530806478544, + "grad_norm": 0.323426425859245, + "learning_rate": 8.847929305257821e-06, + "loss": 0.0341, + "step": 2681 + }, + { + "epoch": 0.8956420103523126, + "grad_norm": 0.4534456123344399, + "learning_rate": 8.846688112091078e-06, + "loss": 0.0354, + "step": 2682 + }, + { + "epoch": 0.8959759559191852, + "grad_norm": 0.43091063353507614, + "learning_rate": 8.845446337836805e-06, + "loss": 0.0306, + "step": 2683 + }, + { + "epoch": 0.8963099014860578, + "grad_norm": 0.4524258538049501, + "learning_rate": 8.844203982682583e-06, + "loss": 0.0394, + "step": 2684 + }, + { + "epoch": 0.8966438470529303, + "grad_norm": 0.436855081098346, + "learning_rate": 8.842961046816085e-06, + "loss": 0.0378, + "step": 2685 + }, + { + "epoch": 0.8969777926198029, + "grad_norm": 0.31495273658629613, + "learning_rate": 8.841717530425071e-06, + "loss": 0.0311, + "step": 2686 + }, + { + "epoch": 0.8973117381866755, + "grad_norm": 0.3807648276384131, + "learning_rate": 8.84047343369739e-06, + "loss": 0.032, + "step": 2687 + }, + { + "epoch": 0.8976456837535481, + "grad_norm": 0.5188136392000254, + "learning_rate": 8.839228756820977e-06, + "loss": 0.0546, + "step": 2688 + }, + { + "epoch": 0.8979796293204207, + "grad_norm": 0.8778263519222621, + "learning_rate": 8.837983499983856e-06, + "loss": 0.0482, + "step": 2689 + }, + { + "epoch": 0.8983135748872934, + "grad_norm": 0.3485019158278814, + "learning_rate": 8.836737663374135e-06, + "loss": 0.0457, + "step": 2690 + }, + { + "epoch": 0.898647520454166, + "grad_norm": 0.314559173548534, + "learning_rate": 8.835491247180012e-06, + "loss": 0.0283, + "step": 2691 + }, + { + "epoch": 0.8989814660210386, + "grad_norm": 0.2484352935221926, + "learning_rate": 8.834244251589778e-06, + "loss": 0.021, + "step": 2692 + }, + { + "epoch": 0.8993154115879112, + "grad_norm": 0.5245493546269812, + "learning_rate": 8.832996676791802e-06, + "loss": 0.0511, + "step": 2693 + }, + { + "epoch": 0.8996493571547838, + "grad_norm": 0.3573273074502577, + "learning_rate": 8.831748522974545e-06, + "loss": 0.0271, + "step": 2694 + }, + { + "epoch": 0.8999833027216564, + "grad_norm": 0.5556433337685233, + "learning_rate": 8.830499790326556e-06, + "loss": 0.0342, + "step": 2695 + }, + { + "epoch": 0.900317248288529, + "grad_norm": 0.25571973762062844, + "learning_rate": 8.829250479036473e-06, + "loss": 0.032, + "step": 2696 + }, + { + "epoch": 0.9006511938554016, + "grad_norm": 0.4030184084917132, + "learning_rate": 8.828000589293016e-06, + "loss": 0.0408, + "step": 2697 + }, + { + "epoch": 0.9009851394222742, + "grad_norm": 0.3486286771438965, + "learning_rate": 8.826750121284998e-06, + "loss": 0.0311, + "step": 2698 + }, + { + "epoch": 0.9013190849891468, + "grad_norm": 0.40150810429876416, + "learning_rate": 8.825499075201314e-06, + "loss": 0.0371, + "step": 2699 + }, + { + "epoch": 0.9016530305560194, + "grad_norm": 0.3876708079046449, + "learning_rate": 8.824247451230949e-06, + "loss": 0.0381, + "step": 2700 + }, + { + "epoch": 0.9019869761228919, + "grad_norm": 0.37203978664868487, + "learning_rate": 8.82299524956298e-06, + "loss": 0.0316, + "step": 2701 + }, + { + "epoch": 0.9023209216897645, + "grad_norm": 0.2943335898487093, + "learning_rate": 8.821742470386565e-06, + "loss": 0.0196, + "step": 2702 + }, + { + "epoch": 0.9026548672566371, + "grad_norm": 0.374572155840365, + "learning_rate": 8.820489113890949e-06, + "loss": 0.0316, + "step": 2703 + }, + { + "epoch": 0.9029888128235097, + "grad_norm": 0.2859222579401617, + "learning_rate": 8.819235180265468e-06, + "loss": 0.0282, + "step": 2704 + }, + { + "epoch": 0.9033227583903823, + "grad_norm": 0.2638982265530527, + "learning_rate": 8.817980669699544e-06, + "loss": 0.0223, + "step": 2705 + }, + { + "epoch": 0.903656703957255, + "grad_norm": 0.3631531219422456, + "learning_rate": 8.816725582382681e-06, + "loss": 0.037, + "step": 2706 + }, + { + "epoch": 0.9039906495241276, + "grad_norm": 0.28931984035548053, + "learning_rate": 8.815469918504482e-06, + "loss": 0.0289, + "step": 2707 + }, + { + "epoch": 0.9043245950910002, + "grad_norm": 0.4812472575051047, + "learning_rate": 8.814213678254624e-06, + "loss": 0.047, + "step": 2708 + }, + { + "epoch": 0.9046585406578728, + "grad_norm": 0.3354390917849965, + "learning_rate": 8.81295686182288e-06, + "loss": 0.0299, + "step": 2709 + }, + { + "epoch": 0.9049924862247454, + "grad_norm": 0.3643640891783243, + "learning_rate": 8.811699469399106e-06, + "loss": 0.0318, + "step": 2710 + }, + { + "epoch": 0.905326431791618, + "grad_norm": 0.5331678735002254, + "learning_rate": 8.810441501173245e-06, + "loss": 0.0381, + "step": 2711 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.3199955531031879, + "learning_rate": 8.809182957335329e-06, + "loss": 0.0358, + "step": 2712 + }, + { + "epoch": 0.9059943229253632, + "grad_norm": 0.3128292176535784, + "learning_rate": 8.807923838075476e-06, + "loss": 0.026, + "step": 2713 + }, + { + "epoch": 0.9063282684922358, + "grad_norm": 0.4896501773103624, + "learning_rate": 8.80666414358389e-06, + "loss": 0.0442, + "step": 2714 + }, + { + "epoch": 0.9066622140591084, + "grad_norm": 0.4854663678009563, + "learning_rate": 8.805403874050864e-06, + "loss": 0.0296, + "step": 2715 + }, + { + "epoch": 0.906996159625981, + "grad_norm": 0.40495799536236976, + "learning_rate": 8.804143029666775e-06, + "loss": 0.0317, + "step": 2716 + }, + { + "epoch": 0.9073301051928536, + "grad_norm": 0.3624131473644916, + "learning_rate": 8.802881610622089e-06, + "loss": 0.0337, + "step": 2717 + }, + { + "epoch": 0.9076640507597261, + "grad_norm": 0.3117849134150128, + "learning_rate": 8.801619617107359e-06, + "loss": 0.0272, + "step": 2718 + }, + { + "epoch": 0.9079979963265987, + "grad_norm": 0.45987734826481586, + "learning_rate": 8.800357049313222e-06, + "loss": 0.0431, + "step": 2719 + }, + { + "epoch": 0.9083319418934713, + "grad_norm": 0.49114593678422225, + "learning_rate": 8.799093907430406e-06, + "loss": 0.0343, + "step": 2720 + }, + { + "epoch": 0.9086658874603439, + "grad_norm": 0.2913713443180512, + "learning_rate": 8.797830191649721e-06, + "loss": 0.0316, + "step": 2721 + }, + { + "epoch": 0.9089998330272165, + "grad_norm": 0.3934277531253081, + "learning_rate": 8.796565902162069e-06, + "loss": 0.039, + "step": 2722 + }, + { + "epoch": 0.9093337785940891, + "grad_norm": 0.44386869889786956, + "learning_rate": 8.795301039158433e-06, + "loss": 0.0358, + "step": 2723 + }, + { + "epoch": 0.9096677241609618, + "grad_norm": 0.45849877618788204, + "learning_rate": 8.794035602829887e-06, + "loss": 0.0458, + "step": 2724 + }, + { + "epoch": 0.9100016697278344, + "grad_norm": 0.33113577952821344, + "learning_rate": 8.792769593367591e-06, + "loss": 0.0332, + "step": 2725 + }, + { + "epoch": 0.910335615294707, + "grad_norm": 0.43368960494681885, + "learning_rate": 8.79150301096279e-06, + "loss": 0.0491, + "step": 2726 + }, + { + "epoch": 0.9106695608615796, + "grad_norm": 0.32205062944743973, + "learning_rate": 8.790235855806814e-06, + "loss": 0.0318, + "step": 2727 + }, + { + "epoch": 0.9110035064284522, + "grad_norm": 0.39229807110022535, + "learning_rate": 8.788968128091084e-06, + "loss": 0.024, + "step": 2728 + }, + { + "epoch": 0.9113374519953248, + "grad_norm": 0.3983195921827163, + "learning_rate": 8.787699828007104e-06, + "loss": 0.0332, + "step": 2729 + }, + { + "epoch": 0.9116713975621974, + "grad_norm": 0.3269993182645894, + "learning_rate": 8.786430955746468e-06, + "loss": 0.022, + "step": 2730 + }, + { + "epoch": 0.91200534312907, + "grad_norm": 0.508155656233159, + "learning_rate": 8.78516151150085e-06, + "loss": 0.0315, + "step": 2731 + }, + { + "epoch": 0.9123392886959426, + "grad_norm": 0.47003450333924607, + "learning_rate": 8.783891495462018e-06, + "loss": 0.0375, + "step": 2732 + }, + { + "epoch": 0.9126732342628152, + "grad_norm": 0.3494164836588301, + "learning_rate": 8.782620907821823e-06, + "loss": 0.04, + "step": 2733 + }, + { + "epoch": 0.9130071798296877, + "grad_norm": 0.3703653271950426, + "learning_rate": 8.781349748772198e-06, + "loss": 0.0297, + "step": 2734 + }, + { + "epoch": 0.9133411253965603, + "grad_norm": 0.613808416623768, + "learning_rate": 8.780078018505172e-06, + "loss": 0.0356, + "step": 2735 + }, + { + "epoch": 0.9136750709634329, + "grad_norm": 0.5787663724611068, + "learning_rate": 8.778805717212853e-06, + "loss": 0.0406, + "step": 2736 + }, + { + "epoch": 0.9140090165303055, + "grad_norm": 0.3541068759399978, + "learning_rate": 8.777532845087434e-06, + "loss": 0.0327, + "step": 2737 + }, + { + "epoch": 0.9143429620971781, + "grad_norm": 0.5111329190044734, + "learning_rate": 8.776259402321201e-06, + "loss": 0.0403, + "step": 2738 + }, + { + "epoch": 0.9146769076640507, + "grad_norm": 0.5241498884651358, + "learning_rate": 8.774985389106521e-06, + "loss": 0.0367, + "step": 2739 + }, + { + "epoch": 0.9150108532309233, + "grad_norm": 0.5279796385901651, + "learning_rate": 8.77371080563585e-06, + "loss": 0.0427, + "step": 2740 + }, + { + "epoch": 0.915344798797796, + "grad_norm": 0.3122021748914317, + "learning_rate": 8.772435652101726e-06, + "loss": 0.0329, + "step": 2741 + }, + { + "epoch": 0.9156787443646686, + "grad_norm": 0.4038853663767032, + "learning_rate": 8.771159928696779e-06, + "loss": 0.032, + "step": 2742 + }, + { + "epoch": 0.9160126899315412, + "grad_norm": 0.40808780028981756, + "learning_rate": 8.76988363561372e-06, + "loss": 0.024, + "step": 2743 + }, + { + "epoch": 0.9163466354984138, + "grad_norm": 0.585171766406125, + "learning_rate": 8.76860677304535e-06, + "loss": 0.0437, + "step": 2744 + }, + { + "epoch": 0.9166805810652864, + "grad_norm": 0.42957640561970534, + "learning_rate": 8.767329341184552e-06, + "loss": 0.0338, + "step": 2745 + }, + { + "epoch": 0.917014526632159, + "grad_norm": 0.3923721591081285, + "learning_rate": 8.766051340224297e-06, + "loss": 0.0463, + "step": 2746 + }, + { + "epoch": 0.9173484721990316, + "grad_norm": 0.5290985574802106, + "learning_rate": 8.764772770357646e-06, + "loss": 0.0374, + "step": 2747 + }, + { + "epoch": 0.9176824177659042, + "grad_norm": 0.3121582743944563, + "learning_rate": 8.763493631777738e-06, + "loss": 0.0243, + "step": 2748 + }, + { + "epoch": 0.9180163633327768, + "grad_norm": 0.3333000144950329, + "learning_rate": 8.762213924677802e-06, + "loss": 0.0251, + "step": 2749 + }, + { + "epoch": 0.9183503088996493, + "grad_norm": 0.5432337404166314, + "learning_rate": 8.760933649251155e-06, + "loss": 0.0443, + "step": 2750 + }, + { + "epoch": 0.9186842544665219, + "grad_norm": 0.3316771561942305, + "learning_rate": 8.759652805691197e-06, + "loss": 0.036, + "step": 2751 + }, + { + "epoch": 0.9190182000333945, + "grad_norm": 0.396723782048542, + "learning_rate": 8.758371394191415e-06, + "loss": 0.0338, + "step": 2752 + }, + { + "epoch": 0.9193521456002671, + "grad_norm": 0.3211967394461863, + "learning_rate": 8.75708941494538e-06, + "loss": 0.0205, + "step": 2753 + }, + { + "epoch": 0.9196860911671397, + "grad_norm": 0.42862694028879433, + "learning_rate": 8.75580686814675e-06, + "loss": 0.0396, + "step": 2754 + }, + { + "epoch": 0.9200200367340123, + "grad_norm": 0.5817811564476707, + "learning_rate": 8.75452375398927e-06, + "loss": 0.0267, + "step": 2755 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 0.294806437662154, + "learning_rate": 8.753240072666769e-06, + "loss": 0.0243, + "step": 2756 + }, + { + "epoch": 0.9206879278677575, + "grad_norm": 0.5437423516003268, + "learning_rate": 8.751955824373161e-06, + "loss": 0.0413, + "step": 2757 + }, + { + "epoch": 0.9210218734346302, + "grad_norm": 0.2887919920681828, + "learning_rate": 8.750671009302448e-06, + "loss": 0.0284, + "step": 2758 + }, + { + "epoch": 0.9213558190015028, + "grad_norm": 0.3030039732206616, + "learning_rate": 8.749385627648717e-06, + "loss": 0.0266, + "step": 2759 + }, + { + "epoch": 0.9216897645683754, + "grad_norm": 0.3945261183205245, + "learning_rate": 8.748099679606139e-06, + "loss": 0.0398, + "step": 2760 + }, + { + "epoch": 0.922023710135248, + "grad_norm": 0.37834962580284026, + "learning_rate": 8.746813165368973e-06, + "loss": 0.0252, + "step": 2761 + }, + { + "epoch": 0.9223576557021206, + "grad_norm": 0.45297643964361894, + "learning_rate": 8.745526085131559e-06, + "loss": 0.0271, + "step": 2762 + }, + { + "epoch": 0.9226916012689932, + "grad_norm": 0.31967657504572755, + "learning_rate": 8.744238439088328e-06, + "loss": 0.0234, + "step": 2763 + }, + { + "epoch": 0.9230255468358658, + "grad_norm": 0.36315923077791945, + "learning_rate": 8.742950227433795e-06, + "loss": 0.0321, + "step": 2764 + }, + { + "epoch": 0.9233594924027384, + "grad_norm": 0.4225791521025416, + "learning_rate": 8.741661450362559e-06, + "loss": 0.0296, + "step": 2765 + }, + { + "epoch": 0.923693437969611, + "grad_norm": 0.3165999016376335, + "learning_rate": 8.740372108069304e-06, + "loss": 0.0279, + "step": 2766 + }, + { + "epoch": 0.9240273835364835, + "grad_norm": 0.36103149782930405, + "learning_rate": 8.739082200748799e-06, + "loss": 0.0353, + "step": 2767 + }, + { + "epoch": 0.9243613291033561, + "grad_norm": 0.4443288162951213, + "learning_rate": 8.737791728595903e-06, + "loss": 0.0383, + "step": 2768 + }, + { + "epoch": 0.9246952746702287, + "grad_norm": 0.2758006368236196, + "learning_rate": 8.736500691805554e-06, + "loss": 0.0322, + "step": 2769 + }, + { + "epoch": 0.9250292202371013, + "grad_norm": 0.617235448945394, + "learning_rate": 8.73520909057278e-06, + "loss": 0.0445, + "step": 2770 + }, + { + "epoch": 0.9253631658039739, + "grad_norm": 0.5465684287209733, + "learning_rate": 8.733916925092691e-06, + "loss": 0.0467, + "step": 2771 + }, + { + "epoch": 0.9256971113708465, + "grad_norm": 0.30496841436394523, + "learning_rate": 8.732624195560487e-06, + "loss": 0.0294, + "step": 2772 + }, + { + "epoch": 0.9260310569377191, + "grad_norm": 0.3323619952467661, + "learning_rate": 8.731330902171447e-06, + "loss": 0.0383, + "step": 2773 + }, + { + "epoch": 0.9263650025045918, + "grad_norm": 0.4887364194723922, + "learning_rate": 8.730037045120941e-06, + "loss": 0.036, + "step": 2774 + }, + { + "epoch": 0.9266989480714644, + "grad_norm": 0.4225404524617371, + "learning_rate": 8.728742624604418e-06, + "loss": 0.0453, + "step": 2775 + }, + { + "epoch": 0.927032893638337, + "grad_norm": 0.6077111166457021, + "learning_rate": 8.727447640817417e-06, + "loss": 0.0415, + "step": 2776 + }, + { + "epoch": 0.9273668392052096, + "grad_norm": 0.3448105629744357, + "learning_rate": 8.726152093955561e-06, + "loss": 0.0245, + "step": 2777 + }, + { + "epoch": 0.9277007847720822, + "grad_norm": 0.42752963555637163, + "learning_rate": 8.724855984214558e-06, + "loss": 0.0355, + "step": 2778 + }, + { + "epoch": 0.9280347303389548, + "grad_norm": 0.3972583917930645, + "learning_rate": 8.723559311790197e-06, + "loss": 0.0535, + "step": 2779 + }, + { + "epoch": 0.9283686759058274, + "grad_norm": 0.29900926228908553, + "learning_rate": 8.722262076878361e-06, + "loss": 0.0276, + "step": 2780 + }, + { + "epoch": 0.9287026214727, + "grad_norm": 0.33922319676365026, + "learning_rate": 8.720964279675009e-06, + "loss": 0.0421, + "step": 2781 + }, + { + "epoch": 0.9290365670395726, + "grad_norm": 0.27641093661892724, + "learning_rate": 8.71966592037619e-06, + "loss": 0.0216, + "step": 2782 + }, + { + "epoch": 0.9293705126064451, + "grad_norm": 0.3760096877264999, + "learning_rate": 8.718366999178037e-06, + "loss": 0.0303, + "step": 2783 + }, + { + "epoch": 0.9297044581733177, + "grad_norm": 0.48778202627788486, + "learning_rate": 8.717067516276764e-06, + "loss": 0.0378, + "step": 2784 + }, + { + "epoch": 0.9300384037401903, + "grad_norm": 0.4340137713205771, + "learning_rate": 8.715767471868679e-06, + "loss": 0.0342, + "step": 2785 + }, + { + "epoch": 0.9303723493070629, + "grad_norm": 0.5365931673610034, + "learning_rate": 8.714466866150162e-06, + "loss": 0.0551, + "step": 2786 + }, + { + "epoch": 0.9307062948739355, + "grad_norm": 0.38121178906320907, + "learning_rate": 8.71316569931769e-06, + "loss": 0.0413, + "step": 2787 + }, + { + "epoch": 0.9310402404408081, + "grad_norm": 0.4073284225625517, + "learning_rate": 8.71186397156782e-06, + "loss": 0.0353, + "step": 2788 + }, + { + "epoch": 0.9313741860076807, + "grad_norm": 0.5559132115434412, + "learning_rate": 8.710561683097189e-06, + "loss": 0.0473, + "step": 2789 + }, + { + "epoch": 0.9317081315745533, + "grad_norm": 0.5318828253027332, + "learning_rate": 8.709258834102525e-06, + "loss": 0.0439, + "step": 2790 + }, + { + "epoch": 0.932042077141426, + "grad_norm": 0.9639026006766994, + "learning_rate": 8.70795542478064e-06, + "loss": 0.0436, + "step": 2791 + }, + { + "epoch": 0.9323760227082986, + "grad_norm": 0.4438159314820279, + "learning_rate": 8.706651455328427e-06, + "loss": 0.0402, + "step": 2792 + }, + { + "epoch": 0.9327099682751712, + "grad_norm": 0.392408891585289, + "learning_rate": 8.70534692594287e-06, + "loss": 0.031, + "step": 2793 + }, + { + "epoch": 0.9330439138420438, + "grad_norm": 0.5651403344284713, + "learning_rate": 8.704041836821029e-06, + "loss": 0.0464, + "step": 2794 + }, + { + "epoch": 0.9333778594089164, + "grad_norm": 0.49450949919654424, + "learning_rate": 8.702736188160055e-06, + "loss": 0.0361, + "step": 2795 + }, + { + "epoch": 0.933711804975789, + "grad_norm": 0.38494955406386794, + "learning_rate": 8.70142998015718e-06, + "loss": 0.032, + "step": 2796 + }, + { + "epoch": 0.9340457505426616, + "grad_norm": 0.4040279285565328, + "learning_rate": 8.700123213009726e-06, + "loss": 0.0405, + "step": 2797 + }, + { + "epoch": 0.9343796961095342, + "grad_norm": 0.45590293833918766, + "learning_rate": 8.698815886915094e-06, + "loss": 0.044, + "step": 2798 + }, + { + "epoch": 0.9347136416764067, + "grad_norm": 0.5284038477980592, + "learning_rate": 8.697508002070766e-06, + "loss": 0.0345, + "step": 2799 + }, + { + "epoch": 0.9350475872432793, + "grad_norm": 0.8424772908860036, + "learning_rate": 8.696199558674321e-06, + "loss": 0.0506, + "step": 2800 + }, + { + "epoch": 0.9353815328101519, + "grad_norm": 0.3998538125832933, + "learning_rate": 8.69489055692341e-06, + "loss": 0.0257, + "step": 2801 + }, + { + "epoch": 0.9357154783770245, + "grad_norm": 0.316886878656811, + "learning_rate": 8.693580997015775e-06, + "loss": 0.0332, + "step": 2802 + }, + { + "epoch": 0.9360494239438971, + "grad_norm": 0.4750431820729923, + "learning_rate": 8.692270879149241e-06, + "loss": 0.0422, + "step": 2803 + }, + { + "epoch": 0.9363833695107697, + "grad_norm": 0.3373304941500603, + "learning_rate": 8.690960203521713e-06, + "loss": 0.0332, + "step": 2804 + }, + { + "epoch": 0.9367173150776423, + "grad_norm": 0.38393173851861656, + "learning_rate": 8.689648970331188e-06, + "loss": 0.0366, + "step": 2805 + }, + { + "epoch": 0.9370512606445149, + "grad_norm": 0.7476014323783545, + "learning_rate": 8.68833717977574e-06, + "loss": 0.0296, + "step": 2806 + }, + { + "epoch": 0.9373852062113875, + "grad_norm": 0.33936631924839616, + "learning_rate": 8.687024832053534e-06, + "loss": 0.031, + "step": 2807 + }, + { + "epoch": 0.9377191517782602, + "grad_norm": 14.084074512678521, + "learning_rate": 8.685711927362815e-06, + "loss": 0.0994, + "step": 2808 + }, + { + "epoch": 0.9380530973451328, + "grad_norm": 13.830511528793766, + "learning_rate": 8.68439846590191e-06, + "loss": 0.1105, + "step": 2809 + }, + { + "epoch": 0.9383870429120054, + "grad_norm": 1.562466728274396, + "learning_rate": 8.683084447869234e-06, + "loss": 0.0396, + "step": 2810 + }, + { + "epoch": 0.938720988478878, + "grad_norm": 0.53221906447519, + "learning_rate": 8.681769873463286e-06, + "loss": 0.0438, + "step": 2811 + }, + { + "epoch": 0.9390549340457506, + "grad_norm": 0.38591734245315623, + "learning_rate": 8.680454742882647e-06, + "loss": 0.0473, + "step": 2812 + }, + { + "epoch": 0.9393888796126232, + "grad_norm": 0.6621567165500295, + "learning_rate": 8.679139056325983e-06, + "loss": 0.0497, + "step": 2813 + }, + { + "epoch": 0.9397228251794958, + "grad_norm": 0.5339382738206233, + "learning_rate": 8.677822813992046e-06, + "loss": 0.0471, + "step": 2814 + }, + { + "epoch": 0.9400567707463683, + "grad_norm": 0.4556400842899425, + "learning_rate": 8.676506016079664e-06, + "loss": 0.0422, + "step": 2815 + }, + { + "epoch": 0.9403907163132409, + "grad_norm": 0.2552204877044466, + "learning_rate": 8.675188662787762e-06, + "loss": 0.024, + "step": 2816 + }, + { + "epoch": 0.9407246618801135, + "grad_norm": 0.48970926286691463, + "learning_rate": 8.673870754315336e-06, + "loss": 0.0254, + "step": 2817 + }, + { + "epoch": 0.9410586074469861, + "grad_norm": 0.35539519724430313, + "learning_rate": 8.672552290861478e-06, + "loss": 0.0308, + "step": 2818 + }, + { + "epoch": 0.9413925530138587, + "grad_norm": 0.42194617584509675, + "learning_rate": 8.67123327262535e-06, + "loss": 0.0317, + "step": 2819 + }, + { + "epoch": 0.9417264985807313, + "grad_norm": 0.35950800586942655, + "learning_rate": 8.669913699806209e-06, + "loss": 0.0326, + "step": 2820 + }, + { + "epoch": 0.9420604441476039, + "grad_norm": 0.5884959091131621, + "learning_rate": 8.668593572603394e-06, + "loss": 0.0596, + "step": 2821 + }, + { + "epoch": 0.9423943897144765, + "grad_norm": 0.8979722718289816, + "learning_rate": 8.667272891216323e-06, + "loss": 0.0354, + "step": 2822 + }, + { + "epoch": 0.9427283352813491, + "grad_norm": 0.5702463862400153, + "learning_rate": 8.6659516558445e-06, + "loss": 0.0379, + "step": 2823 + }, + { + "epoch": 0.9430622808482217, + "grad_norm": 0.429804314201765, + "learning_rate": 8.664629866687514e-06, + "loss": 0.0364, + "step": 2824 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.3164168163392966, + "learning_rate": 8.663307523945038e-06, + "loss": 0.0276, + "step": 2825 + }, + { + "epoch": 0.943730171981967, + "grad_norm": 0.4377506840195495, + "learning_rate": 8.661984627816827e-06, + "loss": 0.0379, + "step": 2826 + }, + { + "epoch": 0.9440641175488396, + "grad_norm": 0.3477662801592816, + "learning_rate": 8.660661178502719e-06, + "loss": 0.0248, + "step": 2827 + }, + { + "epoch": 0.9443980631157122, + "grad_norm": 0.3631585132399093, + "learning_rate": 8.659337176202636e-06, + "loss": 0.0275, + "step": 2828 + }, + { + "epoch": 0.9447320086825848, + "grad_norm": 0.3416351430051519, + "learning_rate": 8.658012621116585e-06, + "loss": 0.0321, + "step": 2829 + }, + { + "epoch": 0.9450659542494574, + "grad_norm": 0.3337198340290349, + "learning_rate": 8.656687513444656e-06, + "loss": 0.0272, + "step": 2830 + }, + { + "epoch": 0.94539989981633, + "grad_norm": 0.35863055812839195, + "learning_rate": 8.655361853387024e-06, + "loss": 0.0385, + "step": 2831 + }, + { + "epoch": 0.9457338453832025, + "grad_norm": 0.5053231448166995, + "learning_rate": 8.654035641143944e-06, + "loss": 0.037, + "step": 2832 + }, + { + "epoch": 0.9460677909500751, + "grad_norm": 0.5382339861527287, + "learning_rate": 8.652708876915752e-06, + "loss": 0.0367, + "step": 2833 + }, + { + "epoch": 0.9464017365169477, + "grad_norm": 0.4457932521194742, + "learning_rate": 8.651381560902876e-06, + "loss": 0.0414, + "step": 2834 + }, + { + "epoch": 0.9467356820838203, + "grad_norm": 0.39745194892382557, + "learning_rate": 8.650053693305824e-06, + "loss": 0.035, + "step": 2835 + }, + { + "epoch": 0.9470696276506929, + "grad_norm": 0.5876270983655634, + "learning_rate": 8.648725274325182e-06, + "loss": 0.0425, + "step": 2836 + }, + { + "epoch": 0.9474035732175655, + "grad_norm": 0.47412194218262077, + "learning_rate": 8.647396304161625e-06, + "loss": 0.0356, + "step": 2837 + }, + { + "epoch": 0.9477375187844381, + "grad_norm": 0.33002487107674067, + "learning_rate": 8.64606678301591e-06, + "loss": 0.0265, + "step": 2838 + }, + { + "epoch": 0.9480714643513107, + "grad_norm": 0.4280598865539549, + "learning_rate": 8.644736711088874e-06, + "loss": 0.0404, + "step": 2839 + }, + { + "epoch": 0.9484054099181833, + "grad_norm": 0.29538546972578494, + "learning_rate": 8.643406088581446e-06, + "loss": 0.0354, + "step": 2840 + }, + { + "epoch": 0.948739355485056, + "grad_norm": 0.33778362712743576, + "learning_rate": 8.642074915694626e-06, + "loss": 0.0328, + "step": 2841 + }, + { + "epoch": 0.9490733010519286, + "grad_norm": 0.385851877180008, + "learning_rate": 8.640743192629507e-06, + "loss": 0.0322, + "step": 2842 + }, + { + "epoch": 0.9494072466188012, + "grad_norm": 0.4905991095903688, + "learning_rate": 8.63941091958726e-06, + "loss": 0.0299, + "step": 2843 + }, + { + "epoch": 0.9497411921856738, + "grad_norm": 0.62659788395832, + "learning_rate": 8.638078096769141e-06, + "loss": 0.0433, + "step": 2844 + }, + { + "epoch": 0.9500751377525464, + "grad_norm": 0.3106239489437533, + "learning_rate": 8.636744724376488e-06, + "loss": 0.0211, + "step": 2845 + }, + { + "epoch": 0.950409083319419, + "grad_norm": 0.5065096378817863, + "learning_rate": 8.635410802610724e-06, + "loss": 0.0392, + "step": 2846 + }, + { + "epoch": 0.9507430288862916, + "grad_norm": 0.37322447884044396, + "learning_rate": 8.634076331673354e-06, + "loss": 0.0372, + "step": 2847 + }, + { + "epoch": 0.9510769744531641, + "grad_norm": 0.37424053275601493, + "learning_rate": 8.632741311765962e-06, + "loss": 0.0269, + "step": 2848 + }, + { + "epoch": 0.9514109200200367, + "grad_norm": 0.4088336584549044, + "learning_rate": 8.631405743090223e-06, + "loss": 0.0335, + "step": 2849 + }, + { + "epoch": 0.9517448655869093, + "grad_norm": 0.42041913563134303, + "learning_rate": 8.630069625847885e-06, + "loss": 0.0343, + "step": 2850 + }, + { + "epoch": 0.9520788111537819, + "grad_norm": 0.37625611217598554, + "learning_rate": 8.628732960240788e-06, + "loss": 0.036, + "step": 2851 + }, + { + "epoch": 0.9524127567206545, + "grad_norm": 0.3041360538014126, + "learning_rate": 8.627395746470852e-06, + "loss": 0.0262, + "step": 2852 + }, + { + "epoch": 0.9527467022875271, + "grad_norm": 0.4914311564689556, + "learning_rate": 8.626057984740077e-06, + "loss": 0.0398, + "step": 2853 + }, + { + "epoch": 0.9530806478543997, + "grad_norm": 0.37470132846015813, + "learning_rate": 8.624719675250547e-06, + "loss": 0.0311, + "step": 2854 + }, + { + "epoch": 0.9534145934212723, + "grad_norm": 0.4379281940128687, + "learning_rate": 8.623380818204431e-06, + "loss": 0.0326, + "step": 2855 + }, + { + "epoch": 0.9537485389881449, + "grad_norm": 0.45115929352751377, + "learning_rate": 8.622041413803979e-06, + "loss": 0.0294, + "step": 2856 + }, + { + "epoch": 0.9540824845550175, + "grad_norm": 0.4001383163578965, + "learning_rate": 8.620701462251522e-06, + "loss": 0.0303, + "step": 2857 + }, + { + "epoch": 0.9544164301218901, + "grad_norm": 0.44020882077328205, + "learning_rate": 8.619360963749478e-06, + "loss": 0.0469, + "step": 2858 + }, + { + "epoch": 0.9547503756887628, + "grad_norm": 0.3479400556473947, + "learning_rate": 8.618019918500342e-06, + "loss": 0.0323, + "step": 2859 + }, + { + "epoch": 0.9550843212556354, + "grad_norm": 0.30866760767552254, + "learning_rate": 8.616678326706698e-06, + "loss": 0.0288, + "step": 2860 + }, + { + "epoch": 0.955418266822508, + "grad_norm": 0.5698954494113244, + "learning_rate": 8.615336188571208e-06, + "loss": 0.042, + "step": 2861 + }, + { + "epoch": 0.9557522123893806, + "grad_norm": 0.3137338000223819, + "learning_rate": 8.613993504296617e-06, + "loss": 0.0234, + "step": 2862 + }, + { + "epoch": 0.9560861579562532, + "grad_norm": 0.5571666589008545, + "learning_rate": 8.612650274085755e-06, + "loss": 0.0322, + "step": 2863 + }, + { + "epoch": 0.9564201035231257, + "grad_norm": 0.3795418938612114, + "learning_rate": 8.61130649814153e-06, + "loss": 0.0307, + "step": 2864 + }, + { + "epoch": 0.9567540490899983, + "grad_norm": 0.2991500991470353, + "learning_rate": 8.609962176666936e-06, + "loss": 0.0281, + "step": 2865 + }, + { + "epoch": 0.9570879946568709, + "grad_norm": 0.3933303996098282, + "learning_rate": 8.608617309865051e-06, + "loss": 0.0354, + "step": 2866 + }, + { + "epoch": 0.9574219402237435, + "grad_norm": 0.4401767972110846, + "learning_rate": 8.60727189793903e-06, + "loss": 0.0328, + "step": 2867 + }, + { + "epoch": 0.9577558857906161, + "grad_norm": 0.35415630693635686, + "learning_rate": 8.605925941092114e-06, + "loss": 0.0349, + "step": 2868 + }, + { + "epoch": 0.9580898313574887, + "grad_norm": 0.38490670451193765, + "learning_rate": 8.604579439527627e-06, + "loss": 0.0432, + "step": 2869 + }, + { + "epoch": 0.9584237769243613, + "grad_norm": 0.4523937847055889, + "learning_rate": 8.603232393448974e-06, + "loss": 0.0354, + "step": 2870 + }, + { + "epoch": 0.9587577224912339, + "grad_norm": 0.5011806427265976, + "learning_rate": 8.601884803059641e-06, + "loss": 0.0376, + "step": 2871 + }, + { + "epoch": 0.9590916680581065, + "grad_norm": 0.34696412310171365, + "learning_rate": 8.600536668563197e-06, + "loss": 0.0381, + "step": 2872 + }, + { + "epoch": 0.9594256136249791, + "grad_norm": 0.4690614391895925, + "learning_rate": 8.599187990163296e-06, + "loss": 0.0333, + "step": 2873 + }, + { + "epoch": 0.9597595591918517, + "grad_norm": 0.490118930140376, + "learning_rate": 8.597838768063667e-06, + "loss": 0.0408, + "step": 2874 + }, + { + "epoch": 0.9600935047587243, + "grad_norm": 0.7046487184714277, + "learning_rate": 8.596489002468132e-06, + "loss": 0.0341, + "step": 2875 + }, + { + "epoch": 0.960427450325597, + "grad_norm": 0.37477769288279206, + "learning_rate": 8.595138693580583e-06, + "loss": 0.0339, + "step": 2876 + }, + { + "epoch": 0.9607613958924696, + "grad_norm": 0.4023564173346981, + "learning_rate": 8.593787841605004e-06, + "loss": 0.0329, + "step": 2877 + }, + { + "epoch": 0.9610953414593422, + "grad_norm": 0.2927118313100555, + "learning_rate": 8.592436446745457e-06, + "loss": 0.0243, + "step": 2878 + }, + { + "epoch": 0.9614292870262148, + "grad_norm": 0.28765916145995385, + "learning_rate": 8.591084509206085e-06, + "loss": 0.0294, + "step": 2879 + }, + { + "epoch": 0.9617632325930874, + "grad_norm": 0.311112204922644, + "learning_rate": 8.589732029191113e-06, + "loss": 0.0373, + "step": 2880 + }, + { + "epoch": 0.9620971781599599, + "grad_norm": 0.5020139795287193, + "learning_rate": 8.588379006904852e-06, + "loss": 0.0298, + "step": 2881 + }, + { + "epoch": 0.9624311237268325, + "grad_norm": 0.39531595803524777, + "learning_rate": 8.587025442551689e-06, + "loss": 0.0327, + "step": 2882 + }, + { + "epoch": 0.9627650692937051, + "grad_norm": 0.3380535054161513, + "learning_rate": 8.585671336336096e-06, + "loss": 0.0238, + "step": 2883 + }, + { + "epoch": 0.9630990148605777, + "grad_norm": 0.328491408305338, + "learning_rate": 8.58431668846263e-06, + "loss": 0.0248, + "step": 2884 + }, + { + "epoch": 0.9634329604274503, + "grad_norm": 0.8318934448018565, + "learning_rate": 8.582961499135925e-06, + "loss": 0.0355, + "step": 2885 + }, + { + "epoch": 0.9637669059943229, + "grad_norm": 0.4144795128834873, + "learning_rate": 8.581605768560694e-06, + "loss": 0.0308, + "step": 2886 + }, + { + "epoch": 0.9641008515611955, + "grad_norm": 0.35680027909560613, + "learning_rate": 8.580249496941742e-06, + "loss": 0.0377, + "step": 2887 + }, + { + "epoch": 0.9644347971280681, + "grad_norm": 0.4077728117900122, + "learning_rate": 8.578892684483947e-06, + "loss": 0.0369, + "step": 2888 + }, + { + "epoch": 0.9647687426949407, + "grad_norm": 0.25562548526002254, + "learning_rate": 8.577535331392272e-06, + "loss": 0.0247, + "step": 2889 + }, + { + "epoch": 0.9651026882618133, + "grad_norm": 0.2902086333221242, + "learning_rate": 8.57617743787176e-06, + "loss": 0.0308, + "step": 2890 + }, + { + "epoch": 0.9654366338286859, + "grad_norm": 0.33474046267090213, + "learning_rate": 8.574819004127539e-06, + "loss": 0.0335, + "step": 2891 + }, + { + "epoch": 0.9657705793955585, + "grad_norm": 0.3320655781521263, + "learning_rate": 8.573460030364816e-06, + "loss": 0.0223, + "step": 2892 + }, + { + "epoch": 0.9661045249624312, + "grad_norm": 0.3968589907940447, + "learning_rate": 8.572100516788878e-06, + "loss": 0.0383, + "step": 2893 + }, + { + "epoch": 0.9664384705293038, + "grad_norm": 0.7192859827729127, + "learning_rate": 8.570740463605096e-06, + "loss": 0.0531, + "step": 2894 + }, + { + "epoch": 0.9667724160961764, + "grad_norm": 0.4164267642835362, + "learning_rate": 8.569379871018925e-06, + "loss": 0.0369, + "step": 2895 + }, + { + "epoch": 0.967106361663049, + "grad_norm": 0.30076241964324807, + "learning_rate": 8.568018739235895e-06, + "loss": 0.0231, + "step": 2896 + }, + { + "epoch": 0.9674403072299215, + "grad_norm": 0.39351832420344696, + "learning_rate": 8.566657068461624e-06, + "loss": 0.0293, + "step": 2897 + }, + { + "epoch": 0.9677742527967941, + "grad_norm": 0.3944978232327262, + "learning_rate": 8.565294858901804e-06, + "loss": 0.0342, + "step": 2898 + }, + { + "epoch": 0.9681081983636667, + "grad_norm": 0.30473112605308045, + "learning_rate": 8.563932110762218e-06, + "loss": 0.0284, + "step": 2899 + }, + { + "epoch": 0.9684421439305393, + "grad_norm": 0.30375003233444114, + "learning_rate": 8.562568824248722e-06, + "loss": 0.0307, + "step": 2900 + }, + { + "epoch": 0.9687760894974119, + "grad_norm": 0.4177704988999483, + "learning_rate": 8.561204999567258e-06, + "loss": 0.0348, + "step": 2901 + }, + { + "epoch": 0.9691100350642845, + "grad_norm": 0.8540798710917039, + "learning_rate": 8.559840636923845e-06, + "loss": 0.0428, + "step": 2902 + }, + { + "epoch": 0.9694439806311571, + "grad_norm": 0.3181147315438223, + "learning_rate": 8.55847573652459e-06, + "loss": 0.0307, + "step": 2903 + }, + { + "epoch": 0.9697779261980297, + "grad_norm": 0.3431224740505919, + "learning_rate": 8.557110298575674e-06, + "loss": 0.0332, + "step": 2904 + }, + { + "epoch": 0.9701118717649023, + "grad_norm": 0.24414566406935073, + "learning_rate": 8.555744323283364e-06, + "loss": 0.0275, + "step": 2905 + }, + { + "epoch": 0.9704458173317749, + "grad_norm": 0.30865069403048434, + "learning_rate": 8.554377810854006e-06, + "loss": 0.0321, + "step": 2906 + }, + { + "epoch": 0.9707797628986475, + "grad_norm": 0.48010232683215975, + "learning_rate": 8.553010761494029e-06, + "loss": 0.0306, + "step": 2907 + }, + { + "epoch": 0.9711137084655201, + "grad_norm": 0.6055659866288445, + "learning_rate": 8.551643175409941e-06, + "loss": 0.031, + "step": 2908 + }, + { + "epoch": 0.9714476540323927, + "grad_norm": 0.377750454693661, + "learning_rate": 8.550275052808332e-06, + "loss": 0.0443, + "step": 2909 + }, + { + "epoch": 0.9717815995992654, + "grad_norm": 0.2952143718133554, + "learning_rate": 8.548906393895876e-06, + "loss": 0.0272, + "step": 2910 + }, + { + "epoch": 0.972115545166138, + "grad_norm": 0.2652892479972758, + "learning_rate": 8.547537198879318e-06, + "loss": 0.0261, + "step": 2911 + }, + { + "epoch": 0.9724494907330106, + "grad_norm": 0.4987918289652696, + "learning_rate": 8.546167467965496e-06, + "loss": 0.0259, + "step": 2912 + }, + { + "epoch": 0.9727834362998831, + "grad_norm": 0.32460862107890287, + "learning_rate": 8.544797201361324e-06, + "loss": 0.0303, + "step": 2913 + }, + { + "epoch": 0.9731173818667557, + "grad_norm": 1.1289177431083712, + "learning_rate": 8.543426399273796e-06, + "loss": 0.0871, + "step": 2914 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 0.5390204663725657, + "learning_rate": 8.542055061909988e-06, + "loss": 0.0384, + "step": 2915 + }, + { + "epoch": 0.9737852730005009, + "grad_norm": 0.39398476412347677, + "learning_rate": 8.540683189477057e-06, + "loss": 0.0344, + "step": 2916 + }, + { + "epoch": 0.9741192185673735, + "grad_norm": 0.2954084404298375, + "learning_rate": 8.539310782182238e-06, + "loss": 0.0265, + "step": 2917 + }, + { + "epoch": 0.9744531641342461, + "grad_norm": 0.3686457963637047, + "learning_rate": 8.537937840232853e-06, + "loss": 0.044, + "step": 2918 + }, + { + "epoch": 0.9747871097011187, + "grad_norm": 0.3781534452843894, + "learning_rate": 8.5365643638363e-06, + "loss": 0.0361, + "step": 2919 + }, + { + "epoch": 0.9751210552679913, + "grad_norm": 0.399554948316432, + "learning_rate": 8.535190353200056e-06, + "loss": 0.0389, + "step": 2920 + }, + { + "epoch": 0.9754550008348639, + "grad_norm": 0.3504184417402766, + "learning_rate": 8.533815808531685e-06, + "loss": 0.0309, + "step": 2921 + }, + { + "epoch": 0.9757889464017365, + "grad_norm": 0.6489535836387182, + "learning_rate": 8.532440730038826e-06, + "loss": 0.0352, + "step": 2922 + }, + { + "epoch": 0.9761228919686091, + "grad_norm": 0.26295253255849527, + "learning_rate": 8.531065117929202e-06, + "loss": 0.0258, + "step": 2923 + }, + { + "epoch": 0.9764568375354817, + "grad_norm": 0.24537007744699338, + "learning_rate": 8.529688972410616e-06, + "loss": 0.0249, + "step": 2924 + }, + { + "epoch": 0.9767907831023543, + "grad_norm": 0.34239462407723953, + "learning_rate": 8.52831229369095e-06, + "loss": 0.0301, + "step": 2925 + }, + { + "epoch": 0.977124728669227, + "grad_norm": 0.35920630314760105, + "learning_rate": 8.526935081978166e-06, + "loss": 0.0403, + "step": 2926 + }, + { + "epoch": 0.9774586742360996, + "grad_norm": 0.33516431454490325, + "learning_rate": 8.52555733748031e-06, + "loss": 0.0332, + "step": 2927 + }, + { + "epoch": 0.9777926198029722, + "grad_norm": 0.9567710656426874, + "learning_rate": 8.524179060405507e-06, + "loss": 0.0478, + "step": 2928 + }, + { + "epoch": 0.9781265653698448, + "grad_norm": 0.4502980530312899, + "learning_rate": 8.52280025096196e-06, + "loss": 0.0492, + "step": 2929 + }, + { + "epoch": 0.9784605109367173, + "grad_norm": 0.3204750578200467, + "learning_rate": 8.521420909357956e-06, + "loss": 0.0338, + "step": 2930 + }, + { + "epoch": 0.9787944565035899, + "grad_norm": 0.695268630715423, + "learning_rate": 8.52004103580186e-06, + "loss": 0.046, + "step": 2931 + }, + { + "epoch": 0.9791284020704625, + "grad_norm": 0.4422763961698212, + "learning_rate": 8.51866063050212e-06, + "loss": 0.0317, + "step": 2932 + }, + { + "epoch": 0.9794623476373351, + "grad_norm": 0.27671788577447276, + "learning_rate": 8.51727969366726e-06, + "loss": 0.023, + "step": 2933 + }, + { + "epoch": 0.9797962932042077, + "grad_norm": 0.28110937671481945, + "learning_rate": 8.515898225505885e-06, + "loss": 0.0239, + "step": 2934 + }, + { + "epoch": 0.9801302387710803, + "grad_norm": 0.30396446778503616, + "learning_rate": 8.514516226226688e-06, + "loss": 0.0341, + "step": 2935 + }, + { + "epoch": 0.9804641843379529, + "grad_norm": 0.3796155250110715, + "learning_rate": 8.513133696038432e-06, + "loss": 0.0404, + "step": 2936 + }, + { + "epoch": 0.9807981299048255, + "grad_norm": 0.2370893181557276, + "learning_rate": 8.511750635149965e-06, + "loss": 0.0227, + "step": 2937 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.25849720854736874, + "learning_rate": 8.510367043770213e-06, + "loss": 0.0272, + "step": 2938 + }, + { + "epoch": 0.9814660210385707, + "grad_norm": 0.3026169531535005, + "learning_rate": 8.508982922108188e-06, + "loss": 0.0319, + "step": 2939 + }, + { + "epoch": 0.9817999666054433, + "grad_norm": 0.34747292936824925, + "learning_rate": 8.507598270372977e-06, + "loss": 0.0386, + "step": 2940 + }, + { + "epoch": 0.9821339121723159, + "grad_norm": 0.25846543228895674, + "learning_rate": 8.506213088773744e-06, + "loss": 0.0264, + "step": 2941 + }, + { + "epoch": 0.9824678577391885, + "grad_norm": 0.32624005236877335, + "learning_rate": 8.504827377519743e-06, + "loss": 0.029, + "step": 2942 + }, + { + "epoch": 0.9828018033060612, + "grad_norm": 0.37189342796157415, + "learning_rate": 8.503441136820296e-06, + "loss": 0.0372, + "step": 2943 + }, + { + "epoch": 0.9831357488729338, + "grad_norm": 0.2590228156928456, + "learning_rate": 8.502054366884813e-06, + "loss": 0.0258, + "step": 2944 + }, + { + "epoch": 0.9834696944398064, + "grad_norm": 0.4958551793393793, + "learning_rate": 8.500667067922784e-06, + "loss": 0.0345, + "step": 2945 + }, + { + "epoch": 0.9838036400066789, + "grad_norm": 0.46911834010090736, + "learning_rate": 8.499279240143776e-06, + "loss": 0.0377, + "step": 2946 + }, + { + "epoch": 0.9841375855735515, + "grad_norm": 0.30981252637620316, + "learning_rate": 8.497890883757434e-06, + "loss": 0.0299, + "step": 2947 + }, + { + "epoch": 0.9844715311404241, + "grad_norm": 0.3114864548517596, + "learning_rate": 8.496501998973489e-06, + "loss": 0.0301, + "step": 2948 + }, + { + "epoch": 0.9848054767072967, + "grad_norm": 0.3763100598918492, + "learning_rate": 8.495112586001747e-06, + "loss": 0.0385, + "step": 2949 + }, + { + "epoch": 0.9851394222741693, + "grad_norm": 0.49195061338951485, + "learning_rate": 8.493722645052093e-06, + "loss": 0.0498, + "step": 2950 + }, + { + "epoch": 0.9854733678410419, + "grad_norm": 0.43350088323905367, + "learning_rate": 8.4923321763345e-06, + "loss": 0.0223, + "step": 2951 + }, + { + "epoch": 0.9858073134079145, + "grad_norm": 0.41645937072227135, + "learning_rate": 8.490941180059009e-06, + "loss": 0.0343, + "step": 2952 + }, + { + "epoch": 0.9861412589747871, + "grad_norm": 0.24031710993391084, + "learning_rate": 8.489549656435748e-06, + "loss": 0.0287, + "step": 2953 + }, + { + "epoch": 0.9864752045416597, + "grad_norm": 0.2867054804285229, + "learning_rate": 8.488157605674924e-06, + "loss": 0.0261, + "step": 2954 + }, + { + "epoch": 0.9868091501085323, + "grad_norm": 0.35742715911016765, + "learning_rate": 8.486765027986821e-06, + "loss": 0.0374, + "step": 2955 + }, + { + "epoch": 0.9871430956754049, + "grad_norm": 0.34186016687613524, + "learning_rate": 8.485371923581807e-06, + "loss": 0.0296, + "step": 2956 + }, + { + "epoch": 0.9874770412422775, + "grad_norm": 0.33322174738473864, + "learning_rate": 8.483978292670324e-06, + "loss": 0.0259, + "step": 2957 + }, + { + "epoch": 0.9878109868091501, + "grad_norm": 0.30139118498858575, + "learning_rate": 8.482584135462896e-06, + "loss": 0.0301, + "step": 2958 + }, + { + "epoch": 0.9881449323760227, + "grad_norm": 0.5939531620552276, + "learning_rate": 8.48118945217013e-06, + "loss": 0.0502, + "step": 2959 + }, + { + "epoch": 0.9884788779428954, + "grad_norm": 0.3115499964256641, + "learning_rate": 8.479794243002707e-06, + "loss": 0.0307, + "step": 2960 + }, + { + "epoch": 0.988812823509768, + "grad_norm": 0.2615585331165329, + "learning_rate": 8.47839850817139e-06, + "loss": 0.0232, + "step": 2961 + }, + { + "epoch": 0.9891467690766405, + "grad_norm": 0.3671074028862246, + "learning_rate": 8.477002247887024e-06, + "loss": 0.0267, + "step": 2962 + }, + { + "epoch": 0.9894807146435131, + "grad_norm": 0.2969673994841143, + "learning_rate": 8.475605462360525e-06, + "loss": 0.0364, + "step": 2963 + }, + { + "epoch": 0.9898146602103857, + "grad_norm": 0.48234835840360146, + "learning_rate": 8.474208151802898e-06, + "loss": 0.0503, + "step": 2964 + }, + { + "epoch": 0.9901486057772583, + "grad_norm": 0.44747298497568694, + "learning_rate": 8.472810316425223e-06, + "loss": 0.0372, + "step": 2965 + }, + { + "epoch": 0.9904825513441309, + "grad_norm": 0.2952379007761491, + "learning_rate": 8.471411956438657e-06, + "loss": 0.0278, + "step": 2966 + }, + { + "epoch": 0.9908164969110035, + "grad_norm": 0.5301810921767381, + "learning_rate": 8.470013072054442e-06, + "loss": 0.0333, + "step": 2967 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 0.39644981737324414, + "learning_rate": 8.468613663483894e-06, + "loss": 0.03, + "step": 2968 + }, + { + "epoch": 0.9914843880447487, + "grad_norm": 0.517759109550299, + "learning_rate": 8.467213730938408e-06, + "loss": 0.0352, + "step": 2969 + }, + { + "epoch": 0.9918183336116213, + "grad_norm": 0.37509790116884606, + "learning_rate": 8.465813274629466e-06, + "loss": 0.0357, + "step": 2970 + }, + { + "epoch": 0.9921522791784939, + "grad_norm": 0.3729478237418451, + "learning_rate": 8.46441229476862e-06, + "loss": 0.0414, + "step": 2971 + }, + { + "epoch": 0.9924862247453665, + "grad_norm": 0.2887147074041474, + "learning_rate": 8.463010791567503e-06, + "loss": 0.0232, + "step": 2972 + }, + { + "epoch": 0.9928201703122391, + "grad_norm": 0.30249947551885503, + "learning_rate": 8.461608765237832e-06, + "loss": 0.0374, + "step": 2973 + }, + { + "epoch": 0.9931541158791117, + "grad_norm": 0.42989384332405156, + "learning_rate": 8.460206215991398e-06, + "loss": 0.0429, + "step": 2974 + }, + { + "epoch": 0.9934880614459843, + "grad_norm": 0.43458607387665993, + "learning_rate": 8.458803144040071e-06, + "loss": 0.0447, + "step": 2975 + }, + { + "epoch": 0.993822007012857, + "grad_norm": 0.32059077440084477, + "learning_rate": 8.457399549595803e-06, + "loss": 0.0285, + "step": 2976 + }, + { + "epoch": 0.9941559525797296, + "grad_norm": 0.43557159634889453, + "learning_rate": 8.455995432870626e-06, + "loss": 0.045, + "step": 2977 + }, + { + "epoch": 0.9944898981466022, + "grad_norm": 0.5529750352646045, + "learning_rate": 8.454590794076642e-06, + "loss": 0.0318, + "step": 2978 + }, + { + "epoch": 0.9948238437134747, + "grad_norm": 0.6860455648765753, + "learning_rate": 8.453185633426044e-06, + "loss": 0.037, + "step": 2979 + }, + { + "epoch": 0.9951577892803473, + "grad_norm": 0.2702880584625079, + "learning_rate": 8.451779951131096e-06, + "loss": 0.0279, + "step": 2980 + }, + { + "epoch": 0.9954917348472199, + "grad_norm": 0.3468127833882437, + "learning_rate": 8.450373747404143e-06, + "loss": 0.0294, + "step": 2981 + }, + { + "epoch": 0.9958256804140925, + "grad_norm": 0.3163357942388814, + "learning_rate": 8.448967022457611e-06, + "loss": 0.0308, + "step": 2982 + }, + { + "epoch": 0.9961596259809651, + "grad_norm": 0.2687120444587892, + "learning_rate": 8.447559776503998e-06, + "loss": 0.0287, + "step": 2983 + }, + { + "epoch": 0.9964935715478377, + "grad_norm": 0.23908782274380247, + "learning_rate": 8.446152009755886e-06, + "loss": 0.0262, + "step": 2984 + }, + { + "epoch": 0.9968275171147103, + "grad_norm": 0.4168061054496135, + "learning_rate": 8.444743722425937e-06, + "loss": 0.0394, + "step": 2985 + }, + { + "epoch": 0.9971614626815829, + "grad_norm": 0.33954139397298466, + "learning_rate": 8.443334914726886e-06, + "loss": 0.0307, + "step": 2986 + }, + { + "epoch": 0.9974954082484555, + "grad_norm": 0.3913146771735054, + "learning_rate": 8.441925586871556e-06, + "loss": 0.0361, + "step": 2987 + }, + { + "epoch": 0.9978293538153281, + "grad_norm": 0.2878594596242855, + "learning_rate": 8.440515739072836e-06, + "loss": 0.0278, + "step": 2988 + }, + { + "epoch": 0.9981632993822007, + "grad_norm": 0.42990855177109644, + "learning_rate": 8.439105371543703e-06, + "loss": 0.0299, + "step": 2989 + }, + { + "epoch": 0.9984972449490733, + "grad_norm": 0.2992324274086244, + "learning_rate": 8.43769448449721e-06, + "loss": 0.0259, + "step": 2990 + }, + { + "epoch": 0.9988311905159459, + "grad_norm": 0.32890318973287186, + "learning_rate": 8.436283078146488e-06, + "loss": 0.0311, + "step": 2991 + }, + { + "epoch": 0.9991651360828185, + "grad_norm": 0.5188806377460586, + "learning_rate": 8.434871152704745e-06, + "loss": 0.0443, + "step": 2992 + }, + { + "epoch": 0.9994990816496911, + "grad_norm": 0.4772461349888854, + "learning_rate": 8.433458708385272e-06, + "loss": 0.0367, + "step": 2993 + }, + { + "epoch": 0.9998330272165638, + "grad_norm": 0.3135369453907103, + "learning_rate": 8.432045745401431e-06, + "loss": 0.0308, + "step": 2994 + }, + { + "epoch": 0.9998330272165638, + "eval_loss": 0.03422769904136658, + "eval_runtime": 183.5729, + "eval_samples_per_second": 109.891, + "eval_steps_per_second": 1.721, + "step": 2994 + }, + { + "epoch": 1.0001669727834364, + "grad_norm": 0.29771575206090833, + "learning_rate": 8.430632263966672e-06, + "loss": 0.0323, + "step": 2995 + }, + { + "epoch": 1.0005009183503089, + "grad_norm": 0.3697550157731697, + "learning_rate": 8.429218264294512e-06, + "loss": 0.0268, + "step": 2996 + }, + { + "epoch": 1.0008348639171816, + "grad_norm": 0.26297889924216067, + "learning_rate": 8.427803746598557e-06, + "loss": 0.0258, + "step": 2997 + }, + { + "epoch": 1.001168809484054, + "grad_norm": 0.25011685660945276, + "learning_rate": 8.426388711092486e-06, + "loss": 0.0274, + "step": 2998 + }, + { + "epoch": 1.0015027550509268, + "grad_norm": 0.27126523572208655, + "learning_rate": 8.424973157990053e-06, + "loss": 0.0318, + "step": 2999 + }, + { + "epoch": 1.0018367006177993, + "grad_norm": 0.3722527919143557, + "learning_rate": 8.4235570875051e-06, + "loss": 0.0288, + "step": 3000 + }, + { + "epoch": 1.002170646184672, + "grad_norm": 0.32588205635847833, + "learning_rate": 8.422140499851536e-06, + "loss": 0.0291, + "step": 3001 + }, + { + "epoch": 1.0025045917515445, + "grad_norm": 0.29852829256692953, + "learning_rate": 8.420723395243356e-06, + "loss": 0.0354, + "step": 3002 + }, + { + "epoch": 1.002838537318417, + "grad_norm": 0.2865517515058864, + "learning_rate": 8.419305773894628e-06, + "loss": 0.0266, + "step": 3003 + }, + { + "epoch": 1.0031724828852897, + "grad_norm": 0.33608424316914015, + "learning_rate": 8.417887636019504e-06, + "loss": 0.0322, + "step": 3004 + }, + { + "epoch": 1.0035064284521622, + "grad_norm": 0.2776393554879091, + "learning_rate": 8.416468981832207e-06, + "loss": 0.0283, + "step": 3005 + }, + { + "epoch": 1.003840374019035, + "grad_norm": 0.36039590633364826, + "learning_rate": 8.415049811547043e-06, + "loss": 0.0381, + "step": 3006 + }, + { + "epoch": 1.0041743195859074, + "grad_norm": 0.3311398415075013, + "learning_rate": 8.413630125378393e-06, + "loss": 0.0261, + "step": 3007 + }, + { + "epoch": 1.0045082651527801, + "grad_norm": 0.26780439680340007, + "learning_rate": 8.412209923540719e-06, + "loss": 0.0283, + "step": 3008 + }, + { + "epoch": 1.0048422107196526, + "grad_norm": 0.342411464839398, + "learning_rate": 8.41078920624856e-06, + "loss": 0.0305, + "step": 3009 + }, + { + "epoch": 1.0051761562865253, + "grad_norm": 0.2657920894694099, + "learning_rate": 8.409367973716527e-06, + "loss": 0.0231, + "step": 3010 + }, + { + "epoch": 1.0055101018533978, + "grad_norm": 0.30883734956705133, + "learning_rate": 8.40794622615932e-06, + "loss": 0.0259, + "step": 3011 + }, + { + "epoch": 1.0058440474202706, + "grad_norm": 0.3813557990190081, + "learning_rate": 8.406523963791709e-06, + "loss": 0.0307, + "step": 3012 + }, + { + "epoch": 1.006177992987143, + "grad_norm": 0.38089081135860814, + "learning_rate": 8.405101186828542e-06, + "loss": 0.0422, + "step": 3013 + }, + { + "epoch": 1.0065119385540158, + "grad_norm": 0.3857972617574587, + "learning_rate": 8.403677895484746e-06, + "loss": 0.0281, + "step": 3014 + }, + { + "epoch": 1.0068458841208883, + "grad_norm": 0.27516406978293884, + "learning_rate": 8.402254089975328e-06, + "loss": 0.0249, + "step": 3015 + }, + { + "epoch": 1.007179829687761, + "grad_norm": 0.4172881502462482, + "learning_rate": 8.400829770515369e-06, + "loss": 0.0275, + "step": 3016 + }, + { + "epoch": 1.0075137752546335, + "grad_norm": 0.2727571111267168, + "learning_rate": 8.399404937320031e-06, + "loss": 0.0212, + "step": 3017 + }, + { + "epoch": 1.0078477208215062, + "grad_norm": 0.42937101245811243, + "learning_rate": 8.397979590604548e-06, + "loss": 0.0387, + "step": 3018 + }, + { + "epoch": 1.0081816663883787, + "grad_norm": 0.322584473450276, + "learning_rate": 8.39655373058424e-06, + "loss": 0.0239, + "step": 3019 + }, + { + "epoch": 1.0085156119552512, + "grad_norm": 0.40625205352551624, + "learning_rate": 8.395127357474498e-06, + "loss": 0.0399, + "step": 3020 + }, + { + "epoch": 1.008849557522124, + "grad_norm": 0.23824946169720115, + "learning_rate": 8.39370047149079e-06, + "loss": 0.0184, + "step": 3021 + }, + { + "epoch": 1.0091835030889964, + "grad_norm": 0.3081783469560006, + "learning_rate": 8.39227307284867e-06, + "loss": 0.0262, + "step": 3022 + }, + { + "epoch": 1.0095174486558691, + "grad_norm": 0.32016315512595445, + "learning_rate": 8.390845161763756e-06, + "loss": 0.0293, + "step": 3023 + }, + { + "epoch": 1.0098513942227416, + "grad_norm": 0.3542949947754674, + "learning_rate": 8.389416738451755e-06, + "loss": 0.0294, + "step": 3024 + }, + { + "epoch": 1.0101853397896143, + "grad_norm": 0.4493078332033422, + "learning_rate": 8.387987803128447e-06, + "loss": 0.0312, + "step": 3025 + }, + { + "epoch": 1.0105192853564868, + "grad_norm": 0.31875130673953356, + "learning_rate": 8.386558356009691e-06, + "loss": 0.0314, + "step": 3026 + }, + { + "epoch": 1.0108532309233595, + "grad_norm": 0.33589429627907247, + "learning_rate": 8.385128397311418e-06, + "loss": 0.0338, + "step": 3027 + }, + { + "epoch": 1.011187176490232, + "grad_norm": 0.2560370442628772, + "learning_rate": 8.383697927249641e-06, + "loss": 0.0225, + "step": 3028 + }, + { + "epoch": 1.0115211220571048, + "grad_norm": 0.520422510941351, + "learning_rate": 8.382266946040453e-06, + "loss": 0.0323, + "step": 3029 + }, + { + "epoch": 1.0118550676239773, + "grad_norm": 0.46973474511439056, + "learning_rate": 8.380835453900017e-06, + "loss": 0.0312, + "step": 3030 + }, + { + "epoch": 1.01218901319085, + "grad_norm": 0.25205200991018967, + "learning_rate": 8.379403451044576e-06, + "loss": 0.0198, + "step": 3031 + }, + { + "epoch": 1.0125229587577225, + "grad_norm": 0.315932615396693, + "learning_rate": 8.377970937690455e-06, + "loss": 0.033, + "step": 3032 + }, + { + "epoch": 1.0128569043245952, + "grad_norm": 0.4030566295570241, + "learning_rate": 8.376537914054048e-06, + "loss": 0.0281, + "step": 3033 + }, + { + "epoch": 1.0131908498914677, + "grad_norm": 0.3763206399060218, + "learning_rate": 8.37510438035183e-06, + "loss": 0.0259, + "step": 3034 + }, + { + "epoch": 1.0135247954583404, + "grad_norm": 0.3039694525112911, + "learning_rate": 8.373670336800358e-06, + "loss": 0.0271, + "step": 3035 + }, + { + "epoch": 1.013858741025213, + "grad_norm": 0.3738516402626686, + "learning_rate": 8.372235783616258e-06, + "loss": 0.0283, + "step": 3036 + }, + { + "epoch": 1.0141926865920854, + "grad_norm": 0.3591033770343096, + "learning_rate": 8.370800721016232e-06, + "loss": 0.0372, + "step": 3037 + }, + { + "epoch": 1.014526632158958, + "grad_norm": 0.39241184515001626, + "learning_rate": 8.369365149217072e-06, + "loss": 0.032, + "step": 3038 + }, + { + "epoch": 1.0148605777258306, + "grad_norm": 0.5526497550193588, + "learning_rate": 8.36792906843563e-06, + "loss": 0.0289, + "step": 3039 + }, + { + "epoch": 1.0151945232927033, + "grad_norm": 0.6452615909755268, + "learning_rate": 8.366492478888849e-06, + "loss": 0.0344, + "step": 3040 + }, + { + "epoch": 1.0155284688595758, + "grad_norm": 0.30586192991505584, + "learning_rate": 8.365055380793737e-06, + "loss": 0.0249, + "step": 3041 + }, + { + "epoch": 1.0158624144264485, + "grad_norm": 0.44331254419930605, + "learning_rate": 8.363617774367389e-06, + "loss": 0.027, + "step": 3042 + }, + { + "epoch": 1.016196359993321, + "grad_norm": 0.27610835252193916, + "learning_rate": 8.36217965982697e-06, + "loss": 0.0213, + "step": 3043 + }, + { + "epoch": 1.0165303055601937, + "grad_norm": 0.3366498975190066, + "learning_rate": 8.360741037389727e-06, + "loss": 0.0265, + "step": 3044 + }, + { + "epoch": 1.0168642511270662, + "grad_norm": 0.370029731400079, + "learning_rate": 8.359301907272976e-06, + "loss": 0.0266, + "step": 3045 + }, + { + "epoch": 1.017198196693939, + "grad_norm": 0.29690579079600554, + "learning_rate": 8.35786226969412e-06, + "loss": 0.0235, + "step": 3046 + }, + { + "epoch": 1.0175321422608115, + "grad_norm": 0.4331409717406702, + "learning_rate": 8.356422124870629e-06, + "loss": 0.0202, + "step": 3047 + }, + { + "epoch": 1.0178660878276842, + "grad_norm": 0.36375446537420747, + "learning_rate": 8.354981473020056e-06, + "loss": 0.0301, + "step": 3048 + }, + { + "epoch": 1.0182000333945567, + "grad_norm": 0.32901432769444916, + "learning_rate": 8.353540314360027e-06, + "loss": 0.0343, + "step": 3049 + }, + { + "epoch": 1.0185339789614294, + "grad_norm": 0.30963182120038696, + "learning_rate": 8.352098649108246e-06, + "loss": 0.0348, + "step": 3050 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.2813886580565502, + "learning_rate": 8.350656477482497e-06, + "loss": 0.0205, + "step": 3051 + }, + { + "epoch": 1.0192018700951744, + "grad_norm": 0.3631125626486861, + "learning_rate": 8.349213799700635e-06, + "loss": 0.0296, + "step": 3052 + }, + { + "epoch": 1.019535815662047, + "grad_norm": 0.37858193548772595, + "learning_rate": 8.34777061598059e-06, + "loss": 0.0286, + "step": 3053 + }, + { + "epoch": 1.0198697612289196, + "grad_norm": 0.32955853947458696, + "learning_rate": 8.346326926540377e-06, + "loss": 0.0321, + "step": 3054 + }, + { + "epoch": 1.0202037067957923, + "grad_norm": 0.768760878982909, + "learning_rate": 8.344882731598079e-06, + "loss": 0.0223, + "step": 3055 + }, + { + "epoch": 1.0205376523626648, + "grad_norm": 0.3193180970107813, + "learning_rate": 8.343438031371858e-06, + "loss": 0.0234, + "step": 3056 + }, + { + "epoch": 1.0208715979295375, + "grad_norm": 0.4827705103087651, + "learning_rate": 8.341992826079956e-06, + "loss": 0.056, + "step": 3057 + }, + { + "epoch": 1.02120554349641, + "grad_norm": 0.4434386097886292, + "learning_rate": 8.340547115940688e-06, + "loss": 0.0242, + "step": 3058 + }, + { + "epoch": 1.0215394890632827, + "grad_norm": 0.5546308377722811, + "learning_rate": 8.339100901172443e-06, + "loss": 0.0362, + "step": 3059 + }, + { + "epoch": 1.0218734346301552, + "grad_norm": 0.5023578367259403, + "learning_rate": 8.337654181993691e-06, + "loss": 0.0199, + "step": 3060 + }, + { + "epoch": 1.022207380197028, + "grad_norm": 1.078937180219304, + "learning_rate": 8.336206958622975e-06, + "loss": 0.0385, + "step": 3061 + }, + { + "epoch": 1.0225413257639004, + "grad_norm": 0.34334042347382643, + "learning_rate": 8.334759231278915e-06, + "loss": 0.0233, + "step": 3062 + }, + { + "epoch": 1.0228752713307732, + "grad_norm": 0.3297745842454771, + "learning_rate": 8.333311000180208e-06, + "loss": 0.0247, + "step": 3063 + }, + { + "epoch": 1.0232092168976457, + "grad_norm": 0.21351540675230476, + "learning_rate": 8.331862265545627e-06, + "loss": 0.0168, + "step": 3064 + }, + { + "epoch": 1.0235431624645184, + "grad_norm": 0.454635462538323, + "learning_rate": 8.330413027594019e-06, + "loss": 0.046, + "step": 3065 + }, + { + "epoch": 1.0238771080313909, + "grad_norm": 0.5560074504521261, + "learning_rate": 8.328963286544309e-06, + "loss": 0.04, + "step": 3066 + }, + { + "epoch": 1.0242110535982636, + "grad_norm": 0.45077409854950257, + "learning_rate": 8.327513042615496e-06, + "loss": 0.0342, + "step": 3067 + }, + { + "epoch": 1.024544999165136, + "grad_norm": 0.37806201311651555, + "learning_rate": 8.326062296026657e-06, + "loss": 0.0236, + "step": 3068 + }, + { + "epoch": 1.0248789447320086, + "grad_norm": 0.3586911886970298, + "learning_rate": 8.324611046996947e-06, + "loss": 0.0292, + "step": 3069 + }, + { + "epoch": 1.0252128902988813, + "grad_norm": 0.36652700517579456, + "learning_rate": 8.32315929574559e-06, + "loss": 0.0255, + "step": 3070 + }, + { + "epoch": 1.0255468358657538, + "grad_norm": 0.24863258101599792, + "learning_rate": 8.321707042491895e-06, + "loss": 0.0213, + "step": 3071 + }, + { + "epoch": 1.0258807814326265, + "grad_norm": 0.3639895846996747, + "learning_rate": 8.320254287455238e-06, + "loss": 0.0223, + "step": 3072 + }, + { + "epoch": 1.026214726999499, + "grad_norm": 0.2728335926701653, + "learning_rate": 8.318801030855078e-06, + "loss": 0.0188, + "step": 3073 + }, + { + "epoch": 1.0265486725663717, + "grad_norm": 0.573002744738013, + "learning_rate": 8.317347272910944e-06, + "loss": 0.0504, + "step": 3074 + }, + { + "epoch": 1.0268826181332442, + "grad_norm": 0.39901750658595253, + "learning_rate": 8.315893013842441e-06, + "loss": 0.0373, + "step": 3075 + }, + { + "epoch": 1.027216563700117, + "grad_norm": 0.41332567428108785, + "learning_rate": 8.31443825386926e-06, + "loss": 0.0369, + "step": 3076 + }, + { + "epoch": 1.0275505092669894, + "grad_norm": 0.4564478963697029, + "learning_rate": 8.312982993211151e-06, + "loss": 0.0486, + "step": 3077 + }, + { + "epoch": 1.0278844548338621, + "grad_norm": 0.26904193285387235, + "learning_rate": 8.311527232087951e-06, + "loss": 0.0269, + "step": 3078 + }, + { + "epoch": 1.0282184004007346, + "grad_norm": 0.3523462902540687, + "learning_rate": 8.310070970719573e-06, + "loss": 0.0281, + "step": 3079 + }, + { + "epoch": 1.0285523459676074, + "grad_norm": 0.30712694816648556, + "learning_rate": 8.308614209325997e-06, + "loss": 0.0311, + "step": 3080 + }, + { + "epoch": 1.0288862915344799, + "grad_norm": 0.4275847869419905, + "learning_rate": 8.30715694812729e-06, + "loss": 0.0244, + "step": 3081 + }, + { + "epoch": 1.0292202371013526, + "grad_norm": 0.44420722595328466, + "learning_rate": 8.305699187343586e-06, + "loss": 0.0352, + "step": 3082 + }, + { + "epoch": 1.029554182668225, + "grad_norm": 0.3178046177388971, + "learning_rate": 8.304240927195094e-06, + "loss": 0.0344, + "step": 3083 + }, + { + "epoch": 1.0298881282350978, + "grad_norm": 0.36535031040615956, + "learning_rate": 8.302782167902103e-06, + "loss": 0.0347, + "step": 3084 + }, + { + "epoch": 1.0302220738019703, + "grad_norm": 0.2985913932941345, + "learning_rate": 8.30132290968498e-06, + "loss": 0.0199, + "step": 3085 + }, + { + "epoch": 1.0305560193688428, + "grad_norm": 0.39983165675579185, + "learning_rate": 8.299863152764158e-06, + "loss": 0.0293, + "step": 3086 + }, + { + "epoch": 1.0308899649357155, + "grad_norm": 0.4044736097086519, + "learning_rate": 8.298402897360152e-06, + "loss": 0.0285, + "step": 3087 + }, + { + "epoch": 1.031223910502588, + "grad_norm": 0.30600013716904445, + "learning_rate": 8.29694214369355e-06, + "loss": 0.0312, + "step": 3088 + }, + { + "epoch": 1.0315578560694607, + "grad_norm": 0.3184850789055068, + "learning_rate": 8.295480891985019e-06, + "loss": 0.0296, + "step": 3089 + }, + { + "epoch": 1.0318918016363332, + "grad_norm": 0.46694113108521274, + "learning_rate": 8.294019142455295e-06, + "loss": 0.0386, + "step": 3090 + }, + { + "epoch": 1.032225747203206, + "grad_norm": 0.3435767279673977, + "learning_rate": 8.292556895325195e-06, + "loss": 0.0226, + "step": 3091 + }, + { + "epoch": 1.0325596927700784, + "grad_norm": 0.43518409317233214, + "learning_rate": 8.291094150815607e-06, + "loss": 0.0302, + "step": 3092 + }, + { + "epoch": 1.0328936383369511, + "grad_norm": 0.2878261284693619, + "learning_rate": 8.289630909147494e-06, + "loss": 0.0217, + "step": 3093 + }, + { + "epoch": 1.0332275839038236, + "grad_norm": 0.32810234847386943, + "learning_rate": 8.2881671705419e-06, + "loss": 0.019, + "step": 3094 + }, + { + "epoch": 1.0335615294706963, + "grad_norm": 0.28794547523903125, + "learning_rate": 8.286702935219936e-06, + "loss": 0.026, + "step": 3095 + }, + { + "epoch": 1.0338954750375688, + "grad_norm": 0.4588925092844552, + "learning_rate": 8.285238203402796e-06, + "loss": 0.0329, + "step": 3096 + }, + { + "epoch": 1.0342294206044416, + "grad_norm": 0.33891803932934295, + "learning_rate": 8.283772975311742e-06, + "loss": 0.0301, + "step": 3097 + }, + { + "epoch": 1.034563366171314, + "grad_norm": 0.3093952371234193, + "learning_rate": 8.282307251168116e-06, + "loss": 0.0234, + "step": 3098 + }, + { + "epoch": 1.0348973117381868, + "grad_norm": 0.3361344519386881, + "learning_rate": 8.28084103119333e-06, + "loss": 0.0254, + "step": 3099 + }, + { + "epoch": 1.0352312573050593, + "grad_norm": 0.4456375711383226, + "learning_rate": 8.279374315608877e-06, + "loss": 0.0361, + "step": 3100 + }, + { + "epoch": 1.0355652028719318, + "grad_norm": 0.36902061126373575, + "learning_rate": 8.27790710463632e-06, + "loss": 0.0288, + "step": 3101 + }, + { + "epoch": 1.0358991484388045, + "grad_norm": 1.2728677691161303, + "learning_rate": 8.276439398497298e-06, + "loss": 0.036, + "step": 3102 + }, + { + "epoch": 1.036233094005677, + "grad_norm": 0.31932718122490666, + "learning_rate": 8.274971197413527e-06, + "loss": 0.0215, + "step": 3103 + }, + { + "epoch": 1.0365670395725497, + "grad_norm": 0.30187842717202285, + "learning_rate": 8.273502501606794e-06, + "loss": 0.028, + "step": 3104 + }, + { + "epoch": 1.0369009851394222, + "grad_norm": 0.41631944796866643, + "learning_rate": 8.272033311298965e-06, + "loss": 0.0366, + "step": 3105 + }, + { + "epoch": 1.037234930706295, + "grad_norm": 0.37127879143926973, + "learning_rate": 8.270563626711979e-06, + "loss": 0.0318, + "step": 3106 + }, + { + "epoch": 1.0375688762731674, + "grad_norm": 0.24229265529666802, + "learning_rate": 8.269093448067845e-06, + "loss": 0.019, + "step": 3107 + }, + { + "epoch": 1.0379028218400401, + "grad_norm": 0.40860081811063315, + "learning_rate": 8.267622775588653e-06, + "loss": 0.0363, + "step": 3108 + }, + { + "epoch": 1.0382367674069126, + "grad_norm": 0.24137702262320918, + "learning_rate": 8.266151609496567e-06, + "loss": 0.0173, + "step": 3109 + }, + { + "epoch": 1.0385707129737853, + "grad_norm": 0.9606289959687807, + "learning_rate": 8.26467995001382e-06, + "loss": 0.0325, + "step": 3110 + }, + { + "epoch": 1.0389046585406578, + "grad_norm": 0.32029074991943135, + "learning_rate": 8.26320779736273e-06, + "loss": 0.0304, + "step": 3111 + }, + { + "epoch": 1.0392386041075306, + "grad_norm": 0.5093859227143862, + "learning_rate": 8.261735151765678e-06, + "loss": 0.0262, + "step": 3112 + }, + { + "epoch": 1.039572549674403, + "grad_norm": 0.38519493130500526, + "learning_rate": 8.260262013445126e-06, + "loss": 0.0216, + "step": 3113 + }, + { + "epoch": 1.0399064952412758, + "grad_norm": 0.2796498288038895, + "learning_rate": 8.258788382623607e-06, + "loss": 0.0261, + "step": 3114 + }, + { + "epoch": 1.0402404408081483, + "grad_norm": 0.4478050010652396, + "learning_rate": 8.257314259523732e-06, + "loss": 0.028, + "step": 3115 + }, + { + "epoch": 1.040574386375021, + "grad_norm": 0.40193088729007864, + "learning_rate": 8.255839644368185e-06, + "loss": 0.0288, + "step": 3116 + }, + { + "epoch": 1.0409083319418935, + "grad_norm": 0.4025068152340384, + "learning_rate": 8.254364537379725e-06, + "loss": 0.0272, + "step": 3117 + }, + { + "epoch": 1.041242277508766, + "grad_norm": 0.4665400693883106, + "learning_rate": 8.25288893878118e-06, + "loss": 0.0312, + "step": 3118 + }, + { + "epoch": 1.0415762230756387, + "grad_norm": 0.3983536390475072, + "learning_rate": 8.251412848795462e-06, + "loss": 0.0253, + "step": 3119 + }, + { + "epoch": 1.0419101686425112, + "grad_norm": 0.2601635485849514, + "learning_rate": 8.249936267645546e-06, + "loss": 0.0202, + "step": 3120 + }, + { + "epoch": 1.042244114209384, + "grad_norm": 0.3267182537592143, + "learning_rate": 8.248459195554492e-06, + "loss": 0.026, + "step": 3121 + }, + { + "epoch": 1.0425780597762564, + "grad_norm": 0.3610038201378598, + "learning_rate": 8.246981632745428e-06, + "loss": 0.0371, + "step": 3122 + }, + { + "epoch": 1.0429120053431291, + "grad_norm": 0.3471189852385901, + "learning_rate": 8.245503579441554e-06, + "loss": 0.0283, + "step": 3123 + }, + { + "epoch": 1.0432459509100016, + "grad_norm": 0.3313712073324069, + "learning_rate": 8.244025035866151e-06, + "loss": 0.0302, + "step": 3124 + }, + { + "epoch": 1.0435798964768743, + "grad_norm": 0.29523361657471914, + "learning_rate": 8.242546002242569e-06, + "loss": 0.0195, + "step": 3125 + }, + { + "epoch": 1.0439138420437468, + "grad_norm": 0.4708060332711091, + "learning_rate": 8.241066478794233e-06, + "loss": 0.0351, + "step": 3126 + }, + { + "epoch": 1.0442477876106195, + "grad_norm": 0.33361012747520763, + "learning_rate": 8.239586465744644e-06, + "loss": 0.0341, + "step": 3127 + }, + { + "epoch": 1.044581733177492, + "grad_norm": 0.44564837446813743, + "learning_rate": 8.238105963317376e-06, + "loss": 0.0407, + "step": 3128 + }, + { + "epoch": 1.0449156787443648, + "grad_norm": 0.2862680825008876, + "learning_rate": 8.236624971736071e-06, + "loss": 0.026, + "step": 3129 + }, + { + "epoch": 1.0452496243112372, + "grad_norm": 0.2474654693302596, + "learning_rate": 8.235143491224458e-06, + "loss": 0.0193, + "step": 3130 + }, + { + "epoch": 1.04558356987811, + "grad_norm": 0.2949794781745856, + "learning_rate": 8.233661522006324e-06, + "loss": 0.0193, + "step": 3131 + }, + { + "epoch": 1.0459175154449825, + "grad_norm": 0.31837585275654734, + "learning_rate": 8.232179064305545e-06, + "loss": 0.0306, + "step": 3132 + }, + { + "epoch": 1.0462514610118552, + "grad_norm": 0.3394771192991278, + "learning_rate": 8.230696118346059e-06, + "loss": 0.0322, + "step": 3133 + }, + { + "epoch": 1.0465854065787277, + "grad_norm": 0.36475144705170737, + "learning_rate": 8.229212684351886e-06, + "loss": 0.0418, + "step": 3134 + }, + { + "epoch": 1.0469193521456002, + "grad_norm": 0.29704859389937754, + "learning_rate": 8.227728762547112e-06, + "loss": 0.0156, + "step": 3135 + }, + { + "epoch": 1.0472532977124729, + "grad_norm": 0.2738307368121378, + "learning_rate": 8.226244353155906e-06, + "loss": 0.0227, + "step": 3136 + }, + { + "epoch": 1.0475872432793454, + "grad_norm": 0.3158699956701117, + "learning_rate": 8.2247594564025e-06, + "loss": 0.0252, + "step": 3137 + }, + { + "epoch": 1.047921188846218, + "grad_norm": 0.2778342337220743, + "learning_rate": 8.22327407251121e-06, + "loss": 0.0304, + "step": 3138 + }, + { + "epoch": 1.0482551344130906, + "grad_norm": 0.30138033930047925, + "learning_rate": 8.221788201706416e-06, + "loss": 0.0296, + "step": 3139 + }, + { + "epoch": 1.0485890799799633, + "grad_norm": 0.29959189001762615, + "learning_rate": 8.22030184421258e-06, + "loss": 0.0263, + "step": 3140 + }, + { + "epoch": 1.0489230255468358, + "grad_norm": 0.33416108029859287, + "learning_rate": 8.218815000254233e-06, + "loss": 0.0343, + "step": 3141 + }, + { + "epoch": 1.0492569711137085, + "grad_norm": 0.4379592928548615, + "learning_rate": 8.21732767005598e-06, + "loss": 0.0383, + "step": 3142 + }, + { + "epoch": 1.049590916680581, + "grad_norm": 0.5810887056619938, + "learning_rate": 8.215839853842498e-06, + "loss": 0.0421, + "step": 3143 + }, + { + "epoch": 1.0499248622474537, + "grad_norm": 0.38367710322524623, + "learning_rate": 8.214351551838541e-06, + "loss": 0.0374, + "step": 3144 + }, + { + "epoch": 1.0502588078143262, + "grad_norm": 0.33266657106849523, + "learning_rate": 8.212862764268936e-06, + "loss": 0.0243, + "step": 3145 + }, + { + "epoch": 1.050592753381199, + "grad_norm": 0.3200406377901553, + "learning_rate": 8.21137349135858e-06, + "loss": 0.0241, + "step": 3146 + }, + { + "epoch": 1.0509266989480714, + "grad_norm": 0.3031139344803291, + "learning_rate": 8.209883733332444e-06, + "loss": 0.0274, + "step": 3147 + }, + { + "epoch": 1.0512606445149442, + "grad_norm": 0.24042528181312012, + "learning_rate": 8.208393490415576e-06, + "loss": 0.0221, + "step": 3148 + }, + { + "epoch": 1.0515945900818167, + "grad_norm": 0.5308126282602463, + "learning_rate": 8.206902762833095e-06, + "loss": 0.0266, + "step": 3149 + }, + { + "epoch": 1.0519285356486892, + "grad_norm": 0.3003822510107234, + "learning_rate": 8.205411550810189e-06, + "loss": 0.0259, + "step": 3150 + }, + { + "epoch": 1.0522624812155619, + "grad_norm": 0.28714582443865505, + "learning_rate": 8.203919854572126e-06, + "loss": 0.025, + "step": 3151 + }, + { + "epoch": 1.0525964267824344, + "grad_norm": 0.3077690157961476, + "learning_rate": 8.202427674344246e-06, + "loss": 0.0243, + "step": 3152 + }, + { + "epoch": 1.052930372349307, + "grad_norm": 0.3751881310366447, + "learning_rate": 8.200935010351958e-06, + "loss": 0.0236, + "step": 3153 + }, + { + "epoch": 1.0532643179161796, + "grad_norm": 0.35076399474792186, + "learning_rate": 8.199441862820746e-06, + "loss": 0.0248, + "step": 3154 + }, + { + "epoch": 1.0535982634830523, + "grad_norm": 0.26307182834482473, + "learning_rate": 8.197948231976169e-06, + "loss": 0.0203, + "step": 3155 + }, + { + "epoch": 1.0539322090499248, + "grad_norm": 0.3044129331544508, + "learning_rate": 8.196454118043856e-06, + "loss": 0.0302, + "step": 3156 + }, + { + "epoch": 1.0542661546167975, + "grad_norm": 0.3627983241007162, + "learning_rate": 8.194959521249512e-06, + "loss": 0.0307, + "step": 3157 + }, + { + "epoch": 1.05460010018367, + "grad_norm": 0.44319883100320684, + "learning_rate": 8.193464441818913e-06, + "loss": 0.0465, + "step": 3158 + }, + { + "epoch": 1.0549340457505427, + "grad_norm": 0.2887795746389143, + "learning_rate": 8.191968879977907e-06, + "loss": 0.0245, + "step": 3159 + }, + { + "epoch": 1.0552679913174152, + "grad_norm": 0.3032079505776515, + "learning_rate": 8.190472835952419e-06, + "loss": 0.0265, + "step": 3160 + }, + { + "epoch": 1.055601936884288, + "grad_norm": 0.2592623562248579, + "learning_rate": 8.188976309968443e-06, + "loss": 0.0202, + "step": 3161 + }, + { + "epoch": 1.0559358824511604, + "grad_norm": 0.36726483272764104, + "learning_rate": 8.187479302252045e-06, + "loss": 0.0294, + "step": 3162 + }, + { + "epoch": 1.0562698280180332, + "grad_norm": 0.24711785591608035, + "learning_rate": 8.185981813029368e-06, + "loss": 0.0202, + "step": 3163 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 0.567690906291969, + "learning_rate": 8.184483842526623e-06, + "loss": 0.0227, + "step": 3164 + }, + { + "epoch": 1.0569377191517784, + "grad_norm": 0.26315373047195273, + "learning_rate": 8.1829853909701e-06, + "loss": 0.0216, + "step": 3165 + }, + { + "epoch": 1.0572716647186509, + "grad_norm": 0.3377394298084983, + "learning_rate": 8.181486458586153e-06, + "loss": 0.0407, + "step": 3166 + }, + { + "epoch": 1.0576056102855234, + "grad_norm": 0.3578886023927057, + "learning_rate": 8.179987045601217e-06, + "loss": 0.0261, + "step": 3167 + }, + { + "epoch": 1.057939555852396, + "grad_norm": 0.32132008387067623, + "learning_rate": 8.178487152241795e-06, + "loss": 0.0295, + "step": 3168 + }, + { + "epoch": 1.0582735014192686, + "grad_norm": 0.2954476552620977, + "learning_rate": 8.17698677873446e-06, + "loss": 0.033, + "step": 3169 + }, + { + "epoch": 1.0586074469861413, + "grad_norm": 0.3270198201043449, + "learning_rate": 8.175485925305867e-06, + "loss": 0.0204, + "step": 3170 + }, + { + "epoch": 1.0589413925530138, + "grad_norm": 0.340683504806724, + "learning_rate": 8.173984592182736e-06, + "loss": 0.0339, + "step": 3171 + }, + { + "epoch": 1.0592753381198865, + "grad_norm": 0.30570215261127376, + "learning_rate": 8.172482779591858e-06, + "loss": 0.0291, + "step": 3172 + }, + { + "epoch": 1.059609283686759, + "grad_norm": 0.37838930187846, + "learning_rate": 8.170980487760101e-06, + "loss": 0.0438, + "step": 3173 + }, + { + "epoch": 1.0599432292536317, + "grad_norm": 0.3383647238519604, + "learning_rate": 8.169477716914405e-06, + "loss": 0.0297, + "step": 3174 + }, + { + "epoch": 1.0602771748205042, + "grad_norm": 0.25213861969089507, + "learning_rate": 8.16797446728178e-06, + "loss": 0.0277, + "step": 3175 + }, + { + "epoch": 1.060611120387377, + "grad_norm": 0.39284531943287526, + "learning_rate": 8.16647073908931e-06, + "loss": 0.0501, + "step": 3176 + }, + { + "epoch": 1.0609450659542494, + "grad_norm": 0.40504969758581055, + "learning_rate": 8.164966532564152e-06, + "loss": 0.0406, + "step": 3177 + }, + { + "epoch": 1.0612790115211221, + "grad_norm": 0.4228096924800613, + "learning_rate": 8.163461847933532e-06, + "loss": 0.0233, + "step": 3178 + }, + { + "epoch": 1.0616129570879946, + "grad_norm": 0.3212886971594315, + "learning_rate": 8.161956685424752e-06, + "loss": 0.0267, + "step": 3179 + }, + { + "epoch": 1.0619469026548674, + "grad_norm": 0.3145132469817971, + "learning_rate": 8.160451045265183e-06, + "loss": 0.0223, + "step": 3180 + }, + { + "epoch": 1.0622808482217398, + "grad_norm": 0.2453353231679829, + "learning_rate": 8.158944927682269e-06, + "loss": 0.0234, + "step": 3181 + }, + { + "epoch": 1.0626147937886126, + "grad_norm": 0.28972504686288075, + "learning_rate": 8.157438332903531e-06, + "loss": 0.0286, + "step": 3182 + }, + { + "epoch": 1.062948739355485, + "grad_norm": 0.24724894945946668, + "learning_rate": 8.155931261156555e-06, + "loss": 0.0248, + "step": 3183 + }, + { + "epoch": 1.0632826849223576, + "grad_norm": 0.3756606442742857, + "learning_rate": 8.154423712669003e-06, + "loss": 0.0321, + "step": 3184 + }, + { + "epoch": 1.0636166304892303, + "grad_norm": 0.22225664087496289, + "learning_rate": 8.152915687668603e-06, + "loss": 0.0201, + "step": 3185 + }, + { + "epoch": 1.0639505760561028, + "grad_norm": 0.31357879955688045, + "learning_rate": 8.151407186383166e-06, + "loss": 0.0221, + "step": 3186 + }, + { + "epoch": 1.0642845216229755, + "grad_norm": 0.325060312972888, + "learning_rate": 8.149898209040568e-06, + "loss": 0.019, + "step": 3187 + }, + { + "epoch": 1.064618467189848, + "grad_norm": 0.336164714357586, + "learning_rate": 8.148388755868757e-06, + "loss": 0.0221, + "step": 3188 + }, + { + "epoch": 1.0649524127567207, + "grad_norm": 0.2754589561530737, + "learning_rate": 8.146878827095751e-06, + "loss": 0.0269, + "step": 3189 + }, + { + "epoch": 1.0652863583235932, + "grad_norm": 0.38697959026680784, + "learning_rate": 8.145368422949647e-06, + "loss": 0.0319, + "step": 3190 + }, + { + "epoch": 1.065620303890466, + "grad_norm": 0.2894212531269538, + "learning_rate": 8.143857543658606e-06, + "loss": 0.0218, + "step": 3191 + }, + { + "epoch": 1.0659542494573384, + "grad_norm": 0.2614254696383892, + "learning_rate": 8.142346189450866e-06, + "loss": 0.023, + "step": 3192 + }, + { + "epoch": 1.0662881950242111, + "grad_norm": 0.2484086201153225, + "learning_rate": 8.140834360554734e-06, + "loss": 0.0231, + "step": 3193 + }, + { + "epoch": 1.0666221405910836, + "grad_norm": 0.34486282980188804, + "learning_rate": 8.13932205719859e-06, + "loss": 0.0333, + "step": 3194 + }, + { + "epoch": 1.0669560861579563, + "grad_norm": 0.3475212405148576, + "learning_rate": 8.137809279610885e-06, + "loss": 0.024, + "step": 3195 + }, + { + "epoch": 1.0672900317248288, + "grad_norm": 0.35655065259988694, + "learning_rate": 8.13629602802014e-06, + "loss": 0.0244, + "step": 3196 + }, + { + "epoch": 1.0676239772917016, + "grad_norm": 0.4429028783957121, + "learning_rate": 8.134782302654953e-06, + "loss": 0.0402, + "step": 3197 + }, + { + "epoch": 1.067957922858574, + "grad_norm": 0.5389386221395466, + "learning_rate": 8.133268103743989e-06, + "loss": 0.0437, + "step": 3198 + }, + { + "epoch": 1.0682918684254465, + "grad_norm": 0.33079685403936093, + "learning_rate": 8.131753431515984e-06, + "loss": 0.0271, + "step": 3199 + }, + { + "epoch": 1.0686258139923193, + "grad_norm": 0.34631306750481533, + "learning_rate": 8.130238286199747e-06, + "loss": 0.0324, + "step": 3200 + }, + { + "epoch": 1.0689597595591918, + "grad_norm": 0.31929619605337367, + "learning_rate": 8.128722668024161e-06, + "loss": 0.0215, + "step": 3201 + }, + { + "epoch": 1.0692937051260645, + "grad_norm": 0.43532157020234624, + "learning_rate": 8.127206577218177e-06, + "loss": 0.0421, + "step": 3202 + }, + { + "epoch": 1.069627650692937, + "grad_norm": 0.2807873183261818, + "learning_rate": 8.125690014010814e-06, + "loss": 0.0251, + "step": 3203 + }, + { + "epoch": 1.0699615962598097, + "grad_norm": 0.405320920289809, + "learning_rate": 8.124172978631173e-06, + "loss": 0.0311, + "step": 3204 + }, + { + "epoch": 1.0702955418266822, + "grad_norm": 0.3051621480936762, + "learning_rate": 8.12265547130842e-06, + "loss": 0.0269, + "step": 3205 + }, + { + "epoch": 1.070629487393555, + "grad_norm": 0.37517077772627283, + "learning_rate": 8.121137492271787e-06, + "loss": 0.0361, + "step": 3206 + }, + { + "epoch": 1.0709634329604274, + "grad_norm": 0.3906482364046291, + "learning_rate": 8.119619041750586e-06, + "loss": 0.0366, + "step": 3207 + }, + { + "epoch": 1.0712973785273001, + "grad_norm": 0.7720388272918915, + "learning_rate": 8.118100119974197e-06, + "loss": 0.0259, + "step": 3208 + }, + { + "epoch": 1.0716313240941726, + "grad_norm": 0.34682141237815584, + "learning_rate": 8.116580727172071e-06, + "loss": 0.0216, + "step": 3209 + }, + { + "epoch": 1.0719652696610453, + "grad_norm": 0.31560666044668617, + "learning_rate": 8.115060863573729e-06, + "loss": 0.021, + "step": 3210 + }, + { + "epoch": 1.0722992152279178, + "grad_norm": 0.4940458994908807, + "learning_rate": 8.113540529408766e-06, + "loss": 0.0371, + "step": 3211 + }, + { + "epoch": 1.0726331607947905, + "grad_norm": 0.30957068605851945, + "learning_rate": 8.112019724906844e-06, + "loss": 0.0258, + "step": 3212 + }, + { + "epoch": 1.072967106361663, + "grad_norm": 0.4848843155643987, + "learning_rate": 8.1104984502977e-06, + "loss": 0.0321, + "step": 3213 + }, + { + "epoch": 1.0733010519285355, + "grad_norm": 1.0662051334114149, + "learning_rate": 8.108976705811138e-06, + "loss": 0.0445, + "step": 3214 + }, + { + "epoch": 1.0736349974954082, + "grad_norm": 0.21981439832497088, + "learning_rate": 8.107454491677041e-06, + "loss": 0.0192, + "step": 3215 + }, + { + "epoch": 1.0739689430622807, + "grad_norm": 0.3710162574426828, + "learning_rate": 8.10593180812535e-06, + "loss": 0.0265, + "step": 3216 + }, + { + "epoch": 1.0743028886291535, + "grad_norm": 0.37385849697364243, + "learning_rate": 8.104408655386092e-06, + "loss": 0.0464, + "step": 3217 + }, + { + "epoch": 1.074636834196026, + "grad_norm": 1.4727901927499285, + "learning_rate": 8.102885033689352e-06, + "loss": 0.0278, + "step": 3218 + }, + { + "epoch": 1.0749707797628987, + "grad_norm": 0.3679842685510149, + "learning_rate": 8.101360943265293e-06, + "loss": 0.0333, + "step": 3219 + }, + { + "epoch": 1.0753047253297712, + "grad_norm": 0.45617417497121054, + "learning_rate": 8.099836384344146e-06, + "loss": 0.0326, + "step": 3220 + }, + { + "epoch": 1.075638670896644, + "grad_norm": 0.3780406313130693, + "learning_rate": 8.098311357156213e-06, + "loss": 0.0273, + "step": 3221 + }, + { + "epoch": 1.0759726164635164, + "grad_norm": 0.38085812855747886, + "learning_rate": 8.096785861931868e-06, + "loss": 0.0254, + "step": 3222 + }, + { + "epoch": 1.076306562030389, + "grad_norm": 0.3362186305688881, + "learning_rate": 8.095259898901557e-06, + "loss": 0.034, + "step": 3223 + }, + { + "epoch": 1.0766405075972616, + "grad_norm": 0.30395187211020996, + "learning_rate": 8.09373346829579e-06, + "loss": 0.0263, + "step": 3224 + }, + { + "epoch": 1.0769744531641343, + "grad_norm": 0.24365452420250513, + "learning_rate": 8.092206570345158e-06, + "loss": 0.0181, + "step": 3225 + }, + { + "epoch": 1.0773083987310068, + "grad_norm": 0.21241122610162672, + "learning_rate": 8.090679205280311e-06, + "loss": 0.0213, + "step": 3226 + }, + { + "epoch": 1.0776423442978795, + "grad_norm": 0.3092399150533459, + "learning_rate": 8.08915137333198e-06, + "loss": 0.0312, + "step": 3227 + }, + { + "epoch": 1.077976289864752, + "grad_norm": 0.45577254513776233, + "learning_rate": 8.08762307473096e-06, + "loss": 0.0458, + "step": 3228 + }, + { + "epoch": 1.0783102354316247, + "grad_norm": 0.41877391528488833, + "learning_rate": 8.08609430970812e-06, + "loss": 0.0356, + "step": 3229 + }, + { + "epoch": 1.0786441809984972, + "grad_norm": 0.2907717077806986, + "learning_rate": 8.084565078494396e-06, + "loss": 0.0231, + "step": 3230 + }, + { + "epoch": 1.07897812656537, + "grad_norm": 0.3393837138554928, + "learning_rate": 8.083035381320798e-06, + "loss": 0.0209, + "step": 3231 + }, + { + "epoch": 1.0793120721322425, + "grad_norm": 0.3478604414360994, + "learning_rate": 8.081505218418403e-06, + "loss": 0.0265, + "step": 3232 + }, + { + "epoch": 1.079646017699115, + "grad_norm": 0.2954603675989673, + "learning_rate": 8.079974590018363e-06, + "loss": 0.0257, + "step": 3233 + }, + { + "epoch": 1.0799799632659877, + "grad_norm": 0.3202770249602899, + "learning_rate": 8.078443496351893e-06, + "loss": 0.0283, + "step": 3234 + }, + { + "epoch": 1.0803139088328602, + "grad_norm": 0.3527776921990553, + "learning_rate": 8.076911937650288e-06, + "loss": 0.0275, + "step": 3235 + }, + { + "epoch": 1.0806478543997329, + "grad_norm": 0.3004797505469835, + "learning_rate": 8.075379914144902e-06, + "loss": 0.0365, + "step": 3236 + }, + { + "epoch": 1.0809817999666054, + "grad_norm": 0.26293437171819317, + "learning_rate": 8.073847426067172e-06, + "loss": 0.0198, + "step": 3237 + }, + { + "epoch": 1.081315745533478, + "grad_norm": 0.30384901190258123, + "learning_rate": 8.072314473648595e-06, + "loss": 0.0363, + "step": 3238 + }, + { + "epoch": 1.0816496911003506, + "grad_norm": 0.3687712801154847, + "learning_rate": 8.07078105712074e-06, + "loss": 0.0327, + "step": 3239 + }, + { + "epoch": 1.0819836366672233, + "grad_norm": 0.3405941440703542, + "learning_rate": 8.06924717671525e-06, + "loss": 0.0366, + "step": 3240 + }, + { + "epoch": 1.0823175822340958, + "grad_norm": 0.24414727479773302, + "learning_rate": 8.067712832663831e-06, + "loss": 0.0237, + "step": 3241 + }, + { + "epoch": 1.0826515278009685, + "grad_norm": 0.24157122723692315, + "learning_rate": 8.066178025198272e-06, + "loss": 0.021, + "step": 3242 + }, + { + "epoch": 1.082985473367841, + "grad_norm": 0.3460307656468123, + "learning_rate": 8.064642754550418e-06, + "loss": 0.0287, + "step": 3243 + }, + { + "epoch": 1.0833194189347137, + "grad_norm": 0.33932834150817975, + "learning_rate": 8.06310702095219e-06, + "loss": 0.0257, + "step": 3244 + }, + { + "epoch": 1.0836533645015862, + "grad_norm": 0.34780098201834897, + "learning_rate": 8.06157082463558e-06, + "loss": 0.0392, + "step": 3245 + }, + { + "epoch": 1.083987310068459, + "grad_norm": 0.3875529529464877, + "learning_rate": 8.060034165832648e-06, + "loss": 0.0353, + "step": 3246 + }, + { + "epoch": 1.0843212556353314, + "grad_norm": 0.3928569828597569, + "learning_rate": 8.058497044775526e-06, + "loss": 0.0269, + "step": 3247 + }, + { + "epoch": 1.084655201202204, + "grad_norm": 0.5550811794335628, + "learning_rate": 8.05695946169641e-06, + "loss": 0.0352, + "step": 3248 + }, + { + "epoch": 1.0849891467690767, + "grad_norm": 0.3021650481495041, + "learning_rate": 8.055421416827575e-06, + "loss": 0.0232, + "step": 3249 + }, + { + "epoch": 1.0853230923359491, + "grad_norm": 0.396961012865088, + "learning_rate": 8.053882910401359e-06, + "loss": 0.0288, + "step": 3250 + }, + { + "epoch": 1.0856570379028219, + "grad_norm": 0.4026659132646484, + "learning_rate": 8.052343942650168e-06, + "loss": 0.0423, + "step": 3251 + }, + { + "epoch": 1.0859909834696944, + "grad_norm": 0.394582221220994, + "learning_rate": 8.050804513806488e-06, + "loss": 0.0343, + "step": 3252 + }, + { + "epoch": 1.086324929036567, + "grad_norm": 0.36225579539614333, + "learning_rate": 8.049264624102862e-06, + "loss": 0.0309, + "step": 3253 + }, + { + "epoch": 1.0866588746034396, + "grad_norm": 0.3280866037074307, + "learning_rate": 8.047724273771909e-06, + "loss": 0.027, + "step": 3254 + }, + { + "epoch": 1.0869928201703123, + "grad_norm": 0.4655193854035031, + "learning_rate": 8.046183463046322e-06, + "loss": 0.0355, + "step": 3255 + }, + { + "epoch": 1.0873267657371848, + "grad_norm": 0.30646720508211767, + "learning_rate": 8.044642192158854e-06, + "loss": 0.0323, + "step": 3256 + }, + { + "epoch": 1.0876607113040575, + "grad_norm": 0.26320109588751534, + "learning_rate": 8.043100461342332e-06, + "loss": 0.0218, + "step": 3257 + }, + { + "epoch": 1.08799465687093, + "grad_norm": 0.30419079651848474, + "learning_rate": 8.041558270829655e-06, + "loss": 0.0334, + "step": 3258 + }, + { + "epoch": 1.0883286024378027, + "grad_norm": 0.37646678852625404, + "learning_rate": 8.04001562085379e-06, + "loss": 0.0248, + "step": 3259 + }, + { + "epoch": 1.0886625480046752, + "grad_norm": 0.29796456459806475, + "learning_rate": 8.038472511647768e-06, + "loss": 0.0275, + "step": 3260 + }, + { + "epoch": 1.088996493571548, + "grad_norm": 0.34171278276903755, + "learning_rate": 8.036928943444698e-06, + "loss": 0.0281, + "step": 3261 + }, + { + "epoch": 1.0893304391384204, + "grad_norm": 0.29628993929471975, + "learning_rate": 8.03538491647775e-06, + "loss": 0.0202, + "step": 3262 + }, + { + "epoch": 1.089664384705293, + "grad_norm": 0.3659770174939796, + "learning_rate": 8.03384043098017e-06, + "loss": 0.026, + "step": 3263 + }, + { + "epoch": 1.0899983302721656, + "grad_norm": 0.5871669117756957, + "learning_rate": 8.032295487185273e-06, + "loss": 0.0488, + "step": 3264 + }, + { + "epoch": 1.0903322758390381, + "grad_norm": 0.2862753129702655, + "learning_rate": 8.030750085326438e-06, + "loss": 0.0314, + "step": 3265 + }, + { + "epoch": 1.0906662214059109, + "grad_norm": 0.341660846931696, + "learning_rate": 8.029204225637114e-06, + "loss": 0.0323, + "step": 3266 + }, + { + "epoch": 1.0910001669727833, + "grad_norm": 0.26031377523360283, + "learning_rate": 8.027657908350826e-06, + "loss": 0.023, + "step": 3267 + }, + { + "epoch": 1.091334112539656, + "grad_norm": 0.4909674930624971, + "learning_rate": 8.026111133701162e-06, + "loss": 0.0394, + "step": 3268 + }, + { + "epoch": 1.0916680581065286, + "grad_norm": 0.4673577196433493, + "learning_rate": 8.02456390192178e-06, + "loss": 0.0422, + "step": 3269 + }, + { + "epoch": 1.0920020036734013, + "grad_norm": 0.5810221713722641, + "learning_rate": 8.023016213246406e-06, + "loss": 0.0242, + "step": 3270 + }, + { + "epoch": 1.0923359492402738, + "grad_norm": 0.37038397677449103, + "learning_rate": 8.021468067908839e-06, + "loss": 0.032, + "step": 3271 + }, + { + "epoch": 1.0926698948071465, + "grad_norm": 0.4574071800556038, + "learning_rate": 8.019919466142945e-06, + "loss": 0.0341, + "step": 3272 + }, + { + "epoch": 1.093003840374019, + "grad_norm": 0.43810029082879615, + "learning_rate": 8.018370408182655e-06, + "loss": 0.0348, + "step": 3273 + }, + { + "epoch": 1.0933377859408917, + "grad_norm": 0.46502945180075905, + "learning_rate": 8.016820894261975e-06, + "loss": 0.0356, + "step": 3274 + }, + { + "epoch": 1.0936717315077642, + "grad_norm": 0.4901221523760667, + "learning_rate": 8.015270924614977e-06, + "loss": 0.0378, + "step": 3275 + }, + { + "epoch": 1.094005677074637, + "grad_norm": 0.43822557031995935, + "learning_rate": 8.013720499475804e-06, + "loss": 0.0426, + "step": 3276 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.3551130156793218, + "learning_rate": 8.012169619078662e-06, + "loss": 0.0277, + "step": 3277 + }, + { + "epoch": 1.0946735682083821, + "grad_norm": 0.34961132896253827, + "learning_rate": 8.010618283657834e-06, + "loss": 0.0337, + "step": 3278 + }, + { + "epoch": 1.0950075137752546, + "grad_norm": 0.3009876818314765, + "learning_rate": 8.009066493447664e-06, + "loss": 0.0257, + "step": 3279 + }, + { + "epoch": 1.0953414593421273, + "grad_norm": 0.35858495140204744, + "learning_rate": 8.00751424868257e-06, + "loss": 0.0303, + "step": 3280 + }, + { + "epoch": 1.0956754049089998, + "grad_norm": 0.3387476640151862, + "learning_rate": 8.005961549597037e-06, + "loss": 0.0324, + "step": 3281 + }, + { + "epoch": 1.0960093504758723, + "grad_norm": 0.4290414442587191, + "learning_rate": 8.004408396425617e-06, + "loss": 0.029, + "step": 3282 + }, + { + "epoch": 1.096343296042745, + "grad_norm": 0.33094967279971516, + "learning_rate": 8.002854789402931e-06, + "loss": 0.0253, + "step": 3283 + }, + { + "epoch": 1.0966772416096175, + "grad_norm": 0.2586770805586634, + "learning_rate": 8.001300728763674e-06, + "loss": 0.0262, + "step": 3284 + }, + { + "epoch": 1.0970111871764903, + "grad_norm": 0.3288852460265123, + "learning_rate": 7.999746214742603e-06, + "loss": 0.0281, + "step": 3285 + }, + { + "epoch": 1.0973451327433628, + "grad_norm": 0.2988088829590742, + "learning_rate": 7.998191247574545e-06, + "loss": 0.0218, + "step": 3286 + }, + { + "epoch": 1.0976790783102355, + "grad_norm": 0.49335428203556075, + "learning_rate": 7.996635827494397e-06, + "loss": 0.0237, + "step": 3287 + }, + { + "epoch": 1.098013023877108, + "grad_norm": 0.38631025593619045, + "learning_rate": 7.995079954737122e-06, + "loss": 0.0312, + "step": 3288 + }, + { + "epoch": 1.0983469694439807, + "grad_norm": 0.3600579178108258, + "learning_rate": 7.993523629537753e-06, + "loss": 0.0338, + "step": 3289 + }, + { + "epoch": 1.0986809150108532, + "grad_norm": 0.30779911947153704, + "learning_rate": 7.991966852131394e-06, + "loss": 0.0219, + "step": 3290 + }, + { + "epoch": 1.099014860577726, + "grad_norm": 0.2905441947150159, + "learning_rate": 7.990409622753212e-06, + "loss": 0.0284, + "step": 3291 + }, + { + "epoch": 1.0993488061445984, + "grad_norm": 0.306190108652269, + "learning_rate": 7.988851941638445e-06, + "loss": 0.0272, + "step": 3292 + }, + { + "epoch": 1.0996827517114711, + "grad_norm": 0.3428238124277849, + "learning_rate": 7.987293809022401e-06, + "loss": 0.0368, + "step": 3293 + }, + { + "epoch": 1.1000166972783436, + "grad_norm": 0.42435268643800517, + "learning_rate": 7.985735225140452e-06, + "loss": 0.0396, + "step": 3294 + }, + { + "epoch": 1.1003506428452163, + "grad_norm": 0.5434448966943168, + "learning_rate": 7.984176190228042e-06, + "loss": 0.0338, + "step": 3295 + }, + { + "epoch": 1.1006845884120888, + "grad_norm": 0.354328827694649, + "learning_rate": 7.98261670452068e-06, + "loss": 0.0386, + "step": 3296 + }, + { + "epoch": 1.1010185339789613, + "grad_norm": 0.35602064466517996, + "learning_rate": 7.981056768253945e-06, + "loss": 0.032, + "step": 3297 + }, + { + "epoch": 1.101352479545834, + "grad_norm": 0.2941162933646615, + "learning_rate": 7.979496381663486e-06, + "loss": 0.0236, + "step": 3298 + }, + { + "epoch": 1.1016864251127065, + "grad_norm": 0.38612453583250106, + "learning_rate": 7.977935544985016e-06, + "loss": 0.0317, + "step": 3299 + }, + { + "epoch": 1.1020203706795793, + "grad_norm": 0.3799863500385795, + "learning_rate": 7.976374258454317e-06, + "loss": 0.0289, + "step": 3300 + }, + { + "epoch": 1.1023543162464517, + "grad_norm": 0.3268097291310616, + "learning_rate": 7.97481252230724e-06, + "loss": 0.0258, + "step": 3301 + }, + { + "epoch": 1.1026882618133245, + "grad_norm": 0.39369341443797184, + "learning_rate": 7.973250336779705e-06, + "loss": 0.0322, + "step": 3302 + }, + { + "epoch": 1.103022207380197, + "grad_norm": 0.3737112857749979, + "learning_rate": 7.971687702107698e-06, + "loss": 0.0279, + "step": 3303 + }, + { + "epoch": 1.1033561529470697, + "grad_norm": 0.35479945345298036, + "learning_rate": 7.970124618527274e-06, + "loss": 0.0299, + "step": 3304 + }, + { + "epoch": 1.1036900985139422, + "grad_norm": 0.31349347013890455, + "learning_rate": 7.968561086274553e-06, + "loss": 0.0238, + "step": 3305 + }, + { + "epoch": 1.104024044080815, + "grad_norm": 0.2481600213298206, + "learning_rate": 7.966997105585727e-06, + "loss": 0.0263, + "step": 3306 + }, + { + "epoch": 1.1043579896476874, + "grad_norm": 0.3634651565674134, + "learning_rate": 7.965432676697052e-06, + "loss": 0.0295, + "step": 3307 + }, + { + "epoch": 1.10469193521456, + "grad_norm": 0.23875067279052692, + "learning_rate": 7.963867799844855e-06, + "loss": 0.0223, + "step": 3308 + }, + { + "epoch": 1.1050258807814326, + "grad_norm": 0.25377720340243354, + "learning_rate": 7.962302475265527e-06, + "loss": 0.0218, + "step": 3309 + }, + { + "epoch": 1.1053598263483053, + "grad_norm": 0.31721599881937135, + "learning_rate": 7.960736703195533e-06, + "loss": 0.0225, + "step": 3310 + }, + { + "epoch": 1.1056937719151778, + "grad_norm": 0.31578452859925404, + "learning_rate": 7.959170483871398e-06, + "loss": 0.0256, + "step": 3311 + }, + { + "epoch": 1.1060277174820503, + "grad_norm": 0.39843821196394125, + "learning_rate": 7.957603817529715e-06, + "loss": 0.0348, + "step": 3312 + }, + { + "epoch": 1.106361663048923, + "grad_norm": 0.4255737697271387, + "learning_rate": 7.956036704407153e-06, + "loss": 0.0411, + "step": 3313 + }, + { + "epoch": 1.1066956086157955, + "grad_norm": 0.2831333730132518, + "learning_rate": 7.954469144740441e-06, + "loss": 0.0224, + "step": 3314 + }, + { + "epoch": 1.1070295541826682, + "grad_norm": 0.3944181528504548, + "learning_rate": 7.952901138766376e-06, + "loss": 0.0358, + "step": 3315 + }, + { + "epoch": 1.1073634997495407, + "grad_norm": 0.3531085915481224, + "learning_rate": 7.951332686721825e-06, + "loss": 0.04, + "step": 3316 + }, + { + "epoch": 1.1076974453164135, + "grad_norm": 0.2681560115879675, + "learning_rate": 7.94976378884372e-06, + "loss": 0.0222, + "step": 3317 + }, + { + "epoch": 1.108031390883286, + "grad_norm": 0.3364026241493204, + "learning_rate": 7.948194445369065e-06, + "loss": 0.0278, + "step": 3318 + }, + { + "epoch": 1.1083653364501587, + "grad_norm": 0.34866005885048446, + "learning_rate": 7.946624656534922e-06, + "loss": 0.0342, + "step": 3319 + }, + { + "epoch": 1.1086992820170312, + "grad_norm": 0.36641364979886615, + "learning_rate": 7.945054422578432e-06, + "loss": 0.0324, + "step": 3320 + }, + { + "epoch": 1.1090332275839039, + "grad_norm": 0.34940742331356994, + "learning_rate": 7.943483743736793e-06, + "loss": 0.033, + "step": 3321 + }, + { + "epoch": 1.1093671731507764, + "grad_norm": 0.355563802184475, + "learning_rate": 7.941912620247276e-06, + "loss": 0.0276, + "step": 3322 + }, + { + "epoch": 1.109701118717649, + "grad_norm": 0.45120068049438616, + "learning_rate": 7.940341052347219e-06, + "loss": 0.0352, + "step": 3323 + }, + { + "epoch": 1.1100350642845216, + "grad_norm": 0.40408839744135716, + "learning_rate": 7.938769040274022e-06, + "loss": 0.0417, + "step": 3324 + }, + { + "epoch": 1.1103690098513943, + "grad_norm": 0.4338686807243482, + "learning_rate": 7.937196584265161e-06, + "loss": 0.0313, + "step": 3325 + }, + { + "epoch": 1.1107029554182668, + "grad_norm": 0.29181078715721387, + "learning_rate": 7.93562368455817e-06, + "loss": 0.0249, + "step": 3326 + }, + { + "epoch": 1.1110369009851395, + "grad_norm": 0.4185676067944589, + "learning_rate": 7.934050341390659e-06, + "loss": 0.0234, + "step": 3327 + }, + { + "epoch": 1.111370846552012, + "grad_norm": 0.2767702565319191, + "learning_rate": 7.932476555000294e-06, + "loss": 0.0243, + "step": 3328 + }, + { + "epoch": 1.1117047921188847, + "grad_norm": 0.29143933795252175, + "learning_rate": 7.930902325624816e-06, + "loss": 0.0217, + "step": 3329 + }, + { + "epoch": 1.1120387376857572, + "grad_norm": 0.27250096630310733, + "learning_rate": 7.929327653502032e-06, + "loss": 0.022, + "step": 3330 + }, + { + "epoch": 1.1123726832526297, + "grad_norm": 0.3517864948807678, + "learning_rate": 7.927752538869816e-06, + "loss": 0.0294, + "step": 3331 + }, + { + "epoch": 1.1127066288195024, + "grad_norm": 0.42682315501132234, + "learning_rate": 7.926176981966102e-06, + "loss": 0.0368, + "step": 3332 + }, + { + "epoch": 1.113040574386375, + "grad_norm": 0.5442250868163462, + "learning_rate": 7.924600983028903e-06, + "loss": 0.0431, + "step": 3333 + }, + { + "epoch": 1.1133745199532477, + "grad_norm": 0.332474222206286, + "learning_rate": 7.92302454229629e-06, + "loss": 0.0246, + "step": 3334 + }, + { + "epoch": 1.1137084655201201, + "grad_norm": 0.3218443871111087, + "learning_rate": 7.9214476600064e-06, + "loss": 0.0318, + "step": 3335 + }, + { + "epoch": 1.1140424110869929, + "grad_norm": 0.4195822755083483, + "learning_rate": 7.919870336397444e-06, + "loss": 0.0353, + "step": 3336 + }, + { + "epoch": 1.1143763566538654, + "grad_norm": 0.37589029930284823, + "learning_rate": 7.918292571707693e-06, + "loss": 0.0255, + "step": 3337 + }, + { + "epoch": 1.114710302220738, + "grad_norm": 0.3726668985785104, + "learning_rate": 7.916714366175487e-06, + "loss": 0.0303, + "step": 3338 + }, + { + "epoch": 1.1150442477876106, + "grad_norm": 0.2869112064253432, + "learning_rate": 7.915135720039233e-06, + "loss": 0.0251, + "step": 3339 + }, + { + "epoch": 1.1153781933544833, + "grad_norm": 0.3122401429651216, + "learning_rate": 7.913556633537403e-06, + "loss": 0.0242, + "step": 3340 + }, + { + "epoch": 1.1157121389213558, + "grad_norm": 0.33236127861053366, + "learning_rate": 7.91197710690854e-06, + "loss": 0.0362, + "step": 3341 + }, + { + "epoch": 1.1160460844882285, + "grad_norm": 0.3136339582547713, + "learning_rate": 7.910397140391244e-06, + "loss": 0.0287, + "step": 3342 + }, + { + "epoch": 1.116380030055101, + "grad_norm": 0.32873912375824965, + "learning_rate": 7.908816734224195e-06, + "loss": 0.0329, + "step": 3343 + }, + { + "epoch": 1.1167139756219737, + "grad_norm": 0.4083856391063721, + "learning_rate": 7.907235888646126e-06, + "loss": 0.0379, + "step": 3344 + }, + { + "epoch": 1.1170479211888462, + "grad_norm": 0.43212270409282794, + "learning_rate": 7.905654603895843e-06, + "loss": 0.0375, + "step": 3345 + }, + { + "epoch": 1.1173818667557187, + "grad_norm": 0.45992624147429745, + "learning_rate": 7.90407288021222e-06, + "loss": 0.0306, + "step": 3346 + }, + { + "epoch": 1.1177158123225914, + "grad_norm": 0.3892224439462069, + "learning_rate": 7.902490717834196e-06, + "loss": 0.037, + "step": 3347 + }, + { + "epoch": 1.118049757889464, + "grad_norm": 0.28294143055491255, + "learning_rate": 7.90090811700077e-06, + "loss": 0.0219, + "step": 3348 + }, + { + "epoch": 1.1183837034563366, + "grad_norm": 0.32420382240075996, + "learning_rate": 7.899325077951018e-06, + "loss": 0.0278, + "step": 3349 + }, + { + "epoch": 1.1187176490232091, + "grad_norm": 0.23351130904306372, + "learning_rate": 7.897741600924073e-06, + "loss": 0.0197, + "step": 3350 + }, + { + "epoch": 1.1190515945900819, + "grad_norm": 0.27507459110299837, + "learning_rate": 7.896157686159142e-06, + "loss": 0.0227, + "step": 3351 + }, + { + "epoch": 1.1193855401569544, + "grad_norm": 0.3928012030302448, + "learning_rate": 7.89457333389549e-06, + "loss": 0.0338, + "step": 3352 + }, + { + "epoch": 1.119719485723827, + "grad_norm": 0.27000272290262073, + "learning_rate": 7.892988544372454e-06, + "loss": 0.0204, + "step": 3353 + }, + { + "epoch": 1.1200534312906996, + "grad_norm": 0.3192793404473544, + "learning_rate": 7.891403317829434e-06, + "loss": 0.0291, + "step": 3354 + }, + { + "epoch": 1.1203873768575723, + "grad_norm": 0.3425808568634987, + "learning_rate": 7.889817654505897e-06, + "loss": 0.0314, + "step": 3355 + }, + { + "epoch": 1.1207213224244448, + "grad_norm": 0.28002198089551705, + "learning_rate": 7.888231554641377e-06, + "loss": 0.0267, + "step": 3356 + }, + { + "epoch": 1.1210552679913175, + "grad_norm": 0.27841002206811855, + "learning_rate": 7.886645018475474e-06, + "loss": 0.0224, + "step": 3357 + }, + { + "epoch": 1.12138921355819, + "grad_norm": 0.9556836898606712, + "learning_rate": 7.885058046247852e-06, + "loss": 0.0561, + "step": 3358 + }, + { + "epoch": 1.1217231591250627, + "grad_norm": 0.2725277534227355, + "learning_rate": 7.88347063819824e-06, + "loss": 0.024, + "step": 3359 + }, + { + "epoch": 1.1220571046919352, + "grad_norm": 0.3506925856290798, + "learning_rate": 7.881882794566438e-06, + "loss": 0.0308, + "step": 3360 + }, + { + "epoch": 1.1223910502588077, + "grad_norm": 0.3017350492143483, + "learning_rate": 7.880294515592304e-06, + "loss": 0.0278, + "step": 3361 + }, + { + "epoch": 1.1227249958256804, + "grad_norm": 0.3173029659624254, + "learning_rate": 7.878705801515772e-06, + "loss": 0.0287, + "step": 3362 + }, + { + "epoch": 1.123058941392553, + "grad_norm": 0.33327651253665935, + "learning_rate": 7.877116652576832e-06, + "loss": 0.0282, + "step": 3363 + }, + { + "epoch": 1.1233928869594256, + "grad_norm": 0.3330771960433005, + "learning_rate": 7.875527069015545e-06, + "loss": 0.0249, + "step": 3364 + }, + { + "epoch": 1.1237268325262981, + "grad_norm": 0.31208907904599537, + "learning_rate": 7.873937051072037e-06, + "loss": 0.0238, + "step": 3365 + }, + { + "epoch": 1.1240607780931708, + "grad_norm": 0.3232661666692226, + "learning_rate": 7.872346598986496e-06, + "loss": 0.0324, + "step": 3366 + }, + { + "epoch": 1.1243947236600433, + "grad_norm": 0.5704434100953263, + "learning_rate": 7.87075571299918e-06, + "loss": 0.0459, + "step": 3367 + }, + { + "epoch": 1.124728669226916, + "grad_norm": 0.25271598829371567, + "learning_rate": 7.869164393350412e-06, + "loss": 0.0191, + "step": 3368 + }, + { + "epoch": 1.1250626147937886, + "grad_norm": 0.3784736957459662, + "learning_rate": 7.86757264028058e-06, + "loss": 0.0307, + "step": 3369 + }, + { + "epoch": 1.1253965603606613, + "grad_norm": 0.27817786100942254, + "learning_rate": 7.865980454030135e-06, + "loss": 0.0329, + "step": 3370 + }, + { + "epoch": 1.1257305059275338, + "grad_norm": 0.3253548797529124, + "learning_rate": 7.864387834839598e-06, + "loss": 0.0256, + "step": 3371 + }, + { + "epoch": 1.1260644514944065, + "grad_norm": 0.37085916865559204, + "learning_rate": 7.86279478294955e-06, + "loss": 0.0291, + "step": 3372 + }, + { + "epoch": 1.126398397061279, + "grad_norm": 0.43750061606167917, + "learning_rate": 7.861201298600642e-06, + "loss": 0.0438, + "step": 3373 + }, + { + "epoch": 1.1267323426281517, + "grad_norm": 0.3429848284373353, + "learning_rate": 7.85960738203359e-06, + "loss": 0.0295, + "step": 3374 + }, + { + "epoch": 1.1270662881950242, + "grad_norm": 0.519164072515975, + "learning_rate": 7.858013033489171e-06, + "loss": 0.039, + "step": 3375 + }, + { + "epoch": 1.1274002337618967, + "grad_norm": 0.3046523315968954, + "learning_rate": 7.856418253208232e-06, + "loss": 0.0454, + "step": 3376 + }, + { + "epoch": 1.1277341793287694, + "grad_norm": 0.33487579971327147, + "learning_rate": 7.85482304143168e-06, + "loss": 0.0272, + "step": 3377 + }, + { + "epoch": 1.1280681248956421, + "grad_norm": 0.33452345204735684, + "learning_rate": 7.853227398400495e-06, + "loss": 0.0221, + "step": 3378 + }, + { + "epoch": 1.1284020704625146, + "grad_norm": 0.2993478347966669, + "learning_rate": 7.851631324355717e-06, + "loss": 0.02, + "step": 3379 + }, + { + "epoch": 1.1287360160293871, + "grad_norm": 0.3101706399966744, + "learning_rate": 7.850034819538448e-06, + "loss": 0.0319, + "step": 3380 + }, + { + "epoch": 1.1290699615962598, + "grad_norm": 0.30946031624112036, + "learning_rate": 7.848437884189864e-06, + "loss": 0.0284, + "step": 3381 + }, + { + "epoch": 1.1294039071631323, + "grad_norm": 0.35003207065103414, + "learning_rate": 7.846840518551197e-06, + "loss": 0.0366, + "step": 3382 + }, + { + "epoch": 1.129737852730005, + "grad_norm": 0.29124555633635885, + "learning_rate": 7.845242722863749e-06, + "loss": 0.031, + "step": 3383 + }, + { + "epoch": 1.1300717982968775, + "grad_norm": 0.330688106020703, + "learning_rate": 7.843644497368886e-06, + "loss": 0.0338, + "step": 3384 + }, + { + "epoch": 1.1304057438637503, + "grad_norm": 0.5756691362504007, + "learning_rate": 7.842045842308038e-06, + "loss": 0.045, + "step": 3385 + }, + { + "epoch": 1.1307396894306228, + "grad_norm": 0.3253252750964825, + "learning_rate": 7.840446757922704e-06, + "loss": 0.022, + "step": 3386 + }, + { + "epoch": 1.1310736349974955, + "grad_norm": 0.339599458339216, + "learning_rate": 7.838847244454441e-06, + "loss": 0.0256, + "step": 3387 + }, + { + "epoch": 1.131407580564368, + "grad_norm": 0.3720567727910156, + "learning_rate": 7.837247302144874e-06, + "loss": 0.0329, + "step": 3388 + }, + { + "epoch": 1.1317415261312407, + "grad_norm": 0.36887312774570036, + "learning_rate": 7.835646931235697e-06, + "loss": 0.0321, + "step": 3389 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.3199150608989669, + "learning_rate": 7.83404613196866e-06, + "loss": 0.0263, + "step": 3390 + }, + { + "epoch": 1.132409417264986, + "grad_norm": 0.3576472141589384, + "learning_rate": 7.832444904585587e-06, + "loss": 0.0298, + "step": 3391 + }, + { + "epoch": 1.1327433628318584, + "grad_norm": 0.33277393919868725, + "learning_rate": 7.83084324932836e-06, + "loss": 0.0412, + "step": 3392 + }, + { + "epoch": 1.133077308398731, + "grad_norm": 0.3431785654729045, + "learning_rate": 7.829241166438925e-06, + "loss": 0.0312, + "step": 3393 + }, + { + "epoch": 1.1334112539656036, + "grad_norm": 0.25972205424157063, + "learning_rate": 7.827638656159302e-06, + "loss": 0.0203, + "step": 3394 + }, + { + "epoch": 1.133745199532476, + "grad_norm": 0.3890664642320907, + "learning_rate": 7.826035718731564e-06, + "loss": 0.0445, + "step": 3395 + }, + { + "epoch": 1.1340791450993488, + "grad_norm": 0.24772805025026107, + "learning_rate": 7.824432354397857e-06, + "loss": 0.0266, + "step": 3396 + }, + { + "epoch": 1.1344130906662213, + "grad_norm": 0.37753984036104876, + "learning_rate": 7.822828563400384e-06, + "loss": 0.0236, + "step": 3397 + }, + { + "epoch": 1.134747036233094, + "grad_norm": 0.3605642048938382, + "learning_rate": 7.82122434598142e-06, + "loss": 0.0342, + "step": 3398 + }, + { + "epoch": 1.1350809817999665, + "grad_norm": 0.3795237258187998, + "learning_rate": 7.819619702383299e-06, + "loss": 0.0304, + "step": 3399 + }, + { + "epoch": 1.1354149273668392, + "grad_norm": 0.35098445157073305, + "learning_rate": 7.818014632848422e-06, + "loss": 0.0318, + "step": 3400 + }, + { + "epoch": 1.1357488729337117, + "grad_norm": 0.24736562010249868, + "learning_rate": 7.816409137619254e-06, + "loss": 0.0278, + "step": 3401 + }, + { + "epoch": 1.1360828185005845, + "grad_norm": 0.34561163672602185, + "learning_rate": 7.814803216938324e-06, + "loss": 0.0283, + "step": 3402 + }, + { + "epoch": 1.136416764067457, + "grad_norm": 0.2811089597579645, + "learning_rate": 7.813196871048226e-06, + "loss": 0.0246, + "step": 3403 + }, + { + "epoch": 1.1367507096343297, + "grad_norm": 0.29229247333057806, + "learning_rate": 7.811590100191613e-06, + "loss": 0.0281, + "step": 3404 + }, + { + "epoch": 1.1370846552012022, + "grad_norm": 0.3233596938603605, + "learning_rate": 7.809982904611213e-06, + "loss": 0.0294, + "step": 3405 + }, + { + "epoch": 1.1374186007680749, + "grad_norm": 0.2938492505209193, + "learning_rate": 7.808375284549807e-06, + "loss": 0.0314, + "step": 3406 + }, + { + "epoch": 1.1377525463349474, + "grad_norm": 0.3974430989729181, + "learning_rate": 7.806767240250248e-06, + "loss": 0.0349, + "step": 3407 + }, + { + "epoch": 1.13808649190182, + "grad_norm": 0.37393621261776927, + "learning_rate": 7.805158771955448e-06, + "loss": 0.0288, + "step": 3408 + }, + { + "epoch": 1.1384204374686926, + "grad_norm": 0.31201214590230125, + "learning_rate": 7.803549879908385e-06, + "loss": 0.0269, + "step": 3409 + }, + { + "epoch": 1.138754383035565, + "grad_norm": 0.23628934183711123, + "learning_rate": 7.801940564352103e-06, + "loss": 0.0238, + "step": 3410 + }, + { + "epoch": 1.1390883286024378, + "grad_norm": 0.3178175273801832, + "learning_rate": 7.800330825529707e-06, + "loss": 0.0289, + "step": 3411 + }, + { + "epoch": 1.1394222741693105, + "grad_norm": 0.28193244763416103, + "learning_rate": 7.798720663684367e-06, + "loss": 0.027, + "step": 3412 + }, + { + "epoch": 1.139756219736183, + "grad_norm": 0.33395340241939425, + "learning_rate": 7.797110079059315e-06, + "loss": 0.0303, + "step": 3413 + }, + { + "epoch": 1.1400901653030555, + "grad_norm": 0.3245023901004787, + "learning_rate": 7.795499071897855e-06, + "loss": 0.025, + "step": 3414 + }, + { + "epoch": 1.1404241108699282, + "grad_norm": 0.9728514347695408, + "learning_rate": 7.79388764244334e-06, + "loss": 0.0245, + "step": 3415 + }, + { + "epoch": 1.1407580564368007, + "grad_norm": 0.32287542439954203, + "learning_rate": 7.792275790939202e-06, + "loss": 0.024, + "step": 3416 + }, + { + "epoch": 1.1410920020036734, + "grad_norm": 0.41902094576109844, + "learning_rate": 7.790663517628927e-06, + "loss": 0.0259, + "step": 3417 + }, + { + "epoch": 1.141425947570546, + "grad_norm": 0.27249682731176167, + "learning_rate": 7.789050822756068e-06, + "loss": 0.0235, + "step": 3418 + }, + { + "epoch": 1.1417598931374187, + "grad_norm": 0.25567186294511735, + "learning_rate": 7.787437706564243e-06, + "loss": 0.0243, + "step": 3419 + }, + { + "epoch": 1.1420938387042912, + "grad_norm": 0.3892942977962352, + "learning_rate": 7.78582416929713e-06, + "loss": 0.0249, + "step": 3420 + }, + { + "epoch": 1.1424277842711639, + "grad_norm": 0.6207323860141518, + "learning_rate": 7.784210211198475e-06, + "loss": 0.0306, + "step": 3421 + }, + { + "epoch": 1.1427617298380364, + "grad_norm": 0.3786748110839557, + "learning_rate": 7.782595832512086e-06, + "loss": 0.0312, + "step": 3422 + }, + { + "epoch": 1.143095675404909, + "grad_norm": 0.44842472367060976, + "learning_rate": 7.780981033481832e-06, + "loss": 0.0214, + "step": 3423 + }, + { + "epoch": 1.1434296209717816, + "grad_norm": 0.4241362850976065, + "learning_rate": 7.779365814351648e-06, + "loss": 0.0382, + "step": 3424 + }, + { + "epoch": 1.143763566538654, + "grad_norm": 0.4863833526126975, + "learning_rate": 7.77775017536553e-06, + "loss": 0.0452, + "step": 3425 + }, + { + "epoch": 1.1440975121055268, + "grad_norm": 0.3739248050143184, + "learning_rate": 7.776134116767544e-06, + "loss": 0.0359, + "step": 3426 + }, + { + "epoch": 1.1444314576723995, + "grad_norm": 0.4540275651438344, + "learning_rate": 7.774517638801808e-06, + "loss": 0.0278, + "step": 3427 + }, + { + "epoch": 1.144765403239272, + "grad_norm": 0.3285748768169526, + "learning_rate": 7.772900741712516e-06, + "loss": 0.0293, + "step": 3428 + }, + { + "epoch": 1.1450993488061445, + "grad_norm": 0.41110587198841414, + "learning_rate": 7.771283425743916e-06, + "loss": 0.0336, + "step": 3429 + }, + { + "epoch": 1.1454332943730172, + "grad_norm": 0.2805074428621494, + "learning_rate": 7.769665691140325e-06, + "loss": 0.0222, + "step": 3430 + }, + { + "epoch": 1.1457672399398897, + "grad_norm": 0.4080248079241784, + "learning_rate": 7.76804753814612e-06, + "loss": 0.04, + "step": 3431 + }, + { + "epoch": 1.1461011855067624, + "grad_norm": 0.34620183085375505, + "learning_rate": 7.76642896700574e-06, + "loss": 0.0237, + "step": 3432 + }, + { + "epoch": 1.146435131073635, + "grad_norm": 0.3722667167027366, + "learning_rate": 7.764809977963692e-06, + "loss": 0.0262, + "step": 3433 + }, + { + "epoch": 1.1467690766405076, + "grad_norm": 0.30275538331877355, + "learning_rate": 7.763190571264542e-06, + "loss": 0.025, + "step": 3434 + }, + { + "epoch": 1.1471030222073801, + "grad_norm": 0.39944709590348804, + "learning_rate": 7.761570747152923e-06, + "loss": 0.0236, + "step": 3435 + }, + { + "epoch": 1.1474369677742529, + "grad_norm": 0.256332459908972, + "learning_rate": 7.759950505873523e-06, + "loss": 0.0233, + "step": 3436 + }, + { + "epoch": 1.1477709133411254, + "grad_norm": 0.48252140488184914, + "learning_rate": 7.758329847671103e-06, + "loss": 0.0362, + "step": 3437 + }, + { + "epoch": 1.148104858907998, + "grad_norm": 0.3171989479466795, + "learning_rate": 7.75670877279048e-06, + "loss": 0.0222, + "step": 3438 + }, + { + "epoch": 1.1484388044748706, + "grad_norm": 0.2702070056143434, + "learning_rate": 7.755087281476539e-06, + "loss": 0.024, + "step": 3439 + }, + { + "epoch": 1.1487727500417433, + "grad_norm": 0.24206119047571278, + "learning_rate": 7.753465373974223e-06, + "loss": 0.0225, + "step": 3440 + }, + { + "epoch": 1.1491066956086158, + "grad_norm": 0.41400502979105464, + "learning_rate": 7.751843050528543e-06, + "loss": 0.0299, + "step": 3441 + }, + { + "epoch": 1.1494406411754885, + "grad_norm": 0.3980616335286519, + "learning_rate": 7.750220311384567e-06, + "loss": 0.0392, + "step": 3442 + }, + { + "epoch": 1.149774586742361, + "grad_norm": 0.26094169338484696, + "learning_rate": 7.748597156787429e-06, + "loss": 0.0231, + "step": 3443 + }, + { + "epoch": 1.1501085323092335, + "grad_norm": 0.3244504831386949, + "learning_rate": 7.746973586982328e-06, + "loss": 0.029, + "step": 3444 + }, + { + "epoch": 1.1504424778761062, + "grad_norm": 0.2547299448602225, + "learning_rate": 7.745349602214522e-06, + "loss": 0.0182, + "step": 3445 + }, + { + "epoch": 1.1507764234429787, + "grad_norm": 0.32000861461781827, + "learning_rate": 7.743725202729335e-06, + "loss": 0.0291, + "step": 3446 + }, + { + "epoch": 1.1511103690098514, + "grad_norm": 0.35683895137351757, + "learning_rate": 7.742100388772148e-06, + "loss": 0.0235, + "step": 3447 + }, + { + "epoch": 1.151444314576724, + "grad_norm": 0.26881767875180934, + "learning_rate": 7.74047516058841e-06, + "loss": 0.0274, + "step": 3448 + }, + { + "epoch": 1.1517782601435966, + "grad_norm": 0.32882443243371134, + "learning_rate": 7.73884951842363e-06, + "loss": 0.0347, + "step": 3449 + }, + { + "epoch": 1.1521122057104691, + "grad_norm": 0.34210788270298503, + "learning_rate": 7.737223462523383e-06, + "loss": 0.0306, + "step": 3450 + }, + { + "epoch": 1.1524461512773418, + "grad_norm": 0.32502201568148453, + "learning_rate": 7.735596993133303e-06, + "loss": 0.0322, + "step": 3451 + }, + { + "epoch": 1.1527800968442143, + "grad_norm": 0.30890188288415493, + "learning_rate": 7.733970110499086e-06, + "loss": 0.0222, + "step": 3452 + }, + { + "epoch": 1.153114042411087, + "grad_norm": 0.35388885412840615, + "learning_rate": 7.732342814866489e-06, + "loss": 0.0264, + "step": 3453 + }, + { + "epoch": 1.1534479879779596, + "grad_norm": 0.33965422979713805, + "learning_rate": 7.730715106481342e-06, + "loss": 0.0338, + "step": 3454 + }, + { + "epoch": 1.1537819335448323, + "grad_norm": 0.24510797290870673, + "learning_rate": 7.729086985589523e-06, + "loss": 0.021, + "step": 3455 + }, + { + "epoch": 1.1541158791117048, + "grad_norm": 0.2540203541644058, + "learning_rate": 7.72745845243698e-06, + "loss": 0.0216, + "step": 3456 + }, + { + "epoch": 1.1544498246785775, + "grad_norm": 0.43230712144777433, + "learning_rate": 7.725829507269723e-06, + "loss": 0.0245, + "step": 3457 + }, + { + "epoch": 1.15478377024545, + "grad_norm": 0.45834909309480487, + "learning_rate": 7.724200150333826e-06, + "loss": 0.0318, + "step": 3458 + }, + { + "epoch": 1.1551177158123225, + "grad_norm": 0.3044753513105623, + "learning_rate": 7.722570381875418e-06, + "loss": 0.0308, + "step": 3459 + }, + { + "epoch": 1.1554516613791952, + "grad_norm": 0.357081990843535, + "learning_rate": 7.720940202140698e-06, + "loss": 0.0356, + "step": 3460 + }, + { + "epoch": 1.155785606946068, + "grad_norm": 0.23549803019765664, + "learning_rate": 7.71930961137592e-06, + "loss": 0.0218, + "step": 3461 + }, + { + "epoch": 1.1561195525129404, + "grad_norm": 0.3376045946718028, + "learning_rate": 7.717678609827409e-06, + "loss": 0.0281, + "step": 3462 + }, + { + "epoch": 1.156453498079813, + "grad_norm": 0.4660996016445457, + "learning_rate": 7.716047197741543e-06, + "loss": 0.0367, + "step": 3463 + }, + { + "epoch": 1.1567874436466856, + "grad_norm": 0.3289329792685201, + "learning_rate": 7.714415375364768e-06, + "loss": 0.0335, + "step": 3464 + }, + { + "epoch": 1.1571213892135581, + "grad_norm": 0.313194622895452, + "learning_rate": 7.712783142943588e-06, + "loss": 0.0304, + "step": 3465 + }, + { + "epoch": 1.1574553347804308, + "grad_norm": 0.31856894335048774, + "learning_rate": 7.711150500724574e-06, + "loss": 0.0314, + "step": 3466 + }, + { + "epoch": 1.1577892803473033, + "grad_norm": 0.290210905224967, + "learning_rate": 7.709517448954353e-06, + "loss": 0.0212, + "step": 3467 + }, + { + "epoch": 1.158123225914176, + "grad_norm": 0.3450403529308227, + "learning_rate": 7.707883987879617e-06, + "loss": 0.024, + "step": 3468 + }, + { + "epoch": 1.1584571714810485, + "grad_norm": 0.2853233655464682, + "learning_rate": 7.70625011774712e-06, + "loss": 0.0204, + "step": 3469 + }, + { + "epoch": 1.1587911170479213, + "grad_norm": 0.30596720190071924, + "learning_rate": 7.70461583880368e-06, + "loss": 0.0246, + "step": 3470 + }, + { + "epoch": 1.1591250626147938, + "grad_norm": 0.45582478478363625, + "learning_rate": 7.70298115129617e-06, + "loss": 0.0315, + "step": 3471 + }, + { + "epoch": 1.1594590081816665, + "grad_norm": 0.3343310583345285, + "learning_rate": 7.701346055471533e-06, + "loss": 0.0238, + "step": 3472 + }, + { + "epoch": 1.159792953748539, + "grad_norm": 0.25549838242569206, + "learning_rate": 7.699710551576763e-06, + "loss": 0.017, + "step": 3473 + }, + { + "epoch": 1.1601268993154115, + "grad_norm": 0.4784591140687851, + "learning_rate": 7.69807463985893e-06, + "loss": 0.0315, + "step": 3474 + }, + { + "epoch": 1.1604608448822842, + "grad_norm": 0.253805038978094, + "learning_rate": 7.696438320565152e-06, + "loss": 0.0234, + "step": 3475 + }, + { + "epoch": 1.160794790449157, + "grad_norm": 0.3239179659179292, + "learning_rate": 7.694801593942615e-06, + "loss": 0.0304, + "step": 3476 + }, + { + "epoch": 1.1611287360160294, + "grad_norm": 0.3700450583545322, + "learning_rate": 7.69316446023857e-06, + "loss": 0.0347, + "step": 3477 + }, + { + "epoch": 1.161462681582902, + "grad_norm": 0.4088219631304055, + "learning_rate": 7.691526919700319e-06, + "loss": 0.0237, + "step": 3478 + }, + { + "epoch": 1.1617966271497746, + "grad_norm": 0.5225888845081335, + "learning_rate": 7.689888972575237e-06, + "loss": 0.0566, + "step": 3479 + }, + { + "epoch": 1.162130572716647, + "grad_norm": 0.30268914331089336, + "learning_rate": 7.688250619110752e-06, + "loss": 0.0279, + "step": 3480 + }, + { + "epoch": 1.1624645182835198, + "grad_norm": 0.33861445710195365, + "learning_rate": 7.686611859554361e-06, + "loss": 0.0362, + "step": 3481 + }, + { + "epoch": 1.1627984638503923, + "grad_norm": 0.21615391449424057, + "learning_rate": 7.684972694153612e-06, + "loss": 0.0218, + "step": 3482 + }, + { + "epoch": 1.163132409417265, + "grad_norm": 0.3213041342802179, + "learning_rate": 7.683333123156122e-06, + "loss": 0.0264, + "step": 3483 + }, + { + "epoch": 1.1634663549841375, + "grad_norm": 0.2391705643336867, + "learning_rate": 7.681693146809572e-06, + "loss": 0.0216, + "step": 3484 + }, + { + "epoch": 1.1638003005510102, + "grad_norm": 0.2327024190047155, + "learning_rate": 7.680052765361693e-06, + "loss": 0.0228, + "step": 3485 + }, + { + "epoch": 1.1641342461178827, + "grad_norm": 0.466746356161113, + "learning_rate": 7.678411979060289e-06, + "loss": 0.0349, + "step": 3486 + }, + { + "epoch": 1.1644681916847555, + "grad_norm": 0.35594369741459175, + "learning_rate": 7.676770788153218e-06, + "loss": 0.0327, + "step": 3487 + }, + { + "epoch": 1.164802137251628, + "grad_norm": 0.4396354659182487, + "learning_rate": 7.6751291928884e-06, + "loss": 0.031, + "step": 3488 + }, + { + "epoch": 1.1651360828185007, + "grad_norm": 0.2986162472432065, + "learning_rate": 7.673487193513821e-06, + "loss": 0.0244, + "step": 3489 + }, + { + "epoch": 1.1654700283853732, + "grad_norm": 0.2855238553660112, + "learning_rate": 7.671844790277522e-06, + "loss": 0.0237, + "step": 3490 + }, + { + "epoch": 1.1658039739522459, + "grad_norm": 0.3669523981505205, + "learning_rate": 7.670201983427606e-06, + "loss": 0.0318, + "step": 3491 + }, + { + "epoch": 1.1661379195191184, + "grad_norm": 0.2802650217978861, + "learning_rate": 7.66855877321224e-06, + "loss": 0.0243, + "step": 3492 + }, + { + "epoch": 1.1664718650859909, + "grad_norm": 0.5173781157672644, + "learning_rate": 7.666915159879651e-06, + "loss": 0.0296, + "step": 3493 + }, + { + "epoch": 1.1668058106528636, + "grad_norm": 0.3631614765686001, + "learning_rate": 7.665271143678125e-06, + "loss": 0.0262, + "step": 3494 + }, + { + "epoch": 1.167139756219736, + "grad_norm": 0.35272600101893303, + "learning_rate": 7.66362672485601e-06, + "loss": 0.0325, + "step": 3495 + }, + { + "epoch": 1.1674737017866088, + "grad_norm": 0.2953204494126073, + "learning_rate": 7.661981903661715e-06, + "loss": 0.0225, + "step": 3496 + }, + { + "epoch": 1.1678076473534813, + "grad_norm": 0.3676966213427377, + "learning_rate": 7.66033668034371e-06, + "loss": 0.0282, + "step": 3497 + }, + { + "epoch": 1.168141592920354, + "grad_norm": 0.3675832189387499, + "learning_rate": 7.658691055150524e-06, + "loss": 0.0366, + "step": 3498 + }, + { + "epoch": 1.1684755384872265, + "grad_norm": 0.32632280987510315, + "learning_rate": 7.65704502833075e-06, + "loss": 0.0269, + "step": 3499 + }, + { + "epoch": 1.1688094840540992, + "grad_norm": 0.4123603700874419, + "learning_rate": 7.655398600133037e-06, + "loss": 0.0395, + "step": 3500 + }, + { + "epoch": 1.1691434296209717, + "grad_norm": 0.34577078396490485, + "learning_rate": 7.653751770806101e-06, + "loss": 0.0321, + "step": 3501 + }, + { + "epoch": 1.1694773751878444, + "grad_norm": 0.2937895703947491, + "learning_rate": 7.652104540598712e-06, + "loss": 0.0245, + "step": 3502 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 0.33622346719358, + "learning_rate": 7.650456909759707e-06, + "loss": 0.0198, + "step": 3503 + }, + { + "epoch": 1.1701452663215897, + "grad_norm": 0.4467975494526781, + "learning_rate": 7.648808878537976e-06, + "loss": 0.0308, + "step": 3504 + }, + { + "epoch": 1.1704792118884622, + "grad_norm": 0.33257231714219554, + "learning_rate": 7.647160447182475e-06, + "loss": 0.0355, + "step": 3505 + }, + { + "epoch": 1.1708131574553349, + "grad_norm": 0.24774628720941608, + "learning_rate": 7.645511615942218e-06, + "loss": 0.0264, + "step": 3506 + }, + { + "epoch": 1.1711471030222074, + "grad_norm": 0.2316830661797605, + "learning_rate": 7.643862385066285e-06, + "loss": 0.0232, + "step": 3507 + }, + { + "epoch": 1.1714810485890799, + "grad_norm": 0.3364496062607349, + "learning_rate": 7.642212754803804e-06, + "loss": 0.0254, + "step": 3508 + }, + { + "epoch": 1.1718149941559526, + "grad_norm": 0.3474891703591838, + "learning_rate": 7.640562725403978e-06, + "loss": 0.0347, + "step": 3509 + }, + { + "epoch": 1.1721489397228253, + "grad_norm": 0.2908152664638895, + "learning_rate": 7.638912297116061e-06, + "loss": 0.0288, + "step": 3510 + }, + { + "epoch": 1.1724828852896978, + "grad_norm": 0.25078594758561484, + "learning_rate": 7.637261470189369e-06, + "loss": 0.0203, + "step": 3511 + }, + { + "epoch": 1.1728168308565703, + "grad_norm": 0.24150363668531105, + "learning_rate": 7.635610244873277e-06, + "loss": 0.0228, + "step": 3512 + }, + { + "epoch": 1.173150776423443, + "grad_norm": 0.3553529908080216, + "learning_rate": 7.633958621417226e-06, + "loss": 0.0321, + "step": 3513 + }, + { + "epoch": 1.1734847219903155, + "grad_norm": 0.2882204107122788, + "learning_rate": 7.632306600070711e-06, + "loss": 0.0276, + "step": 3514 + }, + { + "epoch": 1.1738186675571882, + "grad_norm": 0.27175946355194863, + "learning_rate": 7.63065418108329e-06, + "loss": 0.0199, + "step": 3515 + }, + { + "epoch": 1.1741526131240607, + "grad_norm": 0.29365182270148615, + "learning_rate": 7.62900136470458e-06, + "loss": 0.0247, + "step": 3516 + }, + { + "epoch": 1.1744865586909334, + "grad_norm": 0.2794766476478305, + "learning_rate": 7.627348151184257e-06, + "loss": 0.0238, + "step": 3517 + }, + { + "epoch": 1.174820504257806, + "grad_norm": 0.3659872340269811, + "learning_rate": 7.625694540772062e-06, + "loss": 0.0306, + "step": 3518 + }, + { + "epoch": 1.1751544498246786, + "grad_norm": 0.35167227874267293, + "learning_rate": 7.624040533717789e-06, + "loss": 0.0285, + "step": 3519 + }, + { + "epoch": 1.1754883953915511, + "grad_norm": 0.290174682025725, + "learning_rate": 7.622386130271296e-06, + "loss": 0.0251, + "step": 3520 + }, + { + "epoch": 1.1758223409584239, + "grad_norm": 0.4219865137237148, + "learning_rate": 7.620731330682501e-06, + "loss": 0.0469, + "step": 3521 + }, + { + "epoch": 1.1761562865252964, + "grad_norm": 0.5707703670899809, + "learning_rate": 7.6190761352013795e-06, + "loss": 0.0272, + "step": 3522 + }, + { + "epoch": 1.1764902320921689, + "grad_norm": 0.327756162551458, + "learning_rate": 7.61742054407797e-06, + "loss": 0.0252, + "step": 3523 + }, + { + "epoch": 1.1768241776590416, + "grad_norm": 0.3274956389443555, + "learning_rate": 7.615764557562368e-06, + "loss": 0.028, + "step": 3524 + }, + { + "epoch": 1.1771581232259143, + "grad_norm": 0.4405481427142839, + "learning_rate": 7.6141081759047305e-06, + "loss": 0.0381, + "step": 3525 + }, + { + "epoch": 1.1774920687927868, + "grad_norm": 0.3735302426547869, + "learning_rate": 7.612451399355273e-06, + "loss": 0.0304, + "step": 3526 + }, + { + "epoch": 1.1778260143596593, + "grad_norm": 0.3824426145771473, + "learning_rate": 7.610794228164271e-06, + "loss": 0.0351, + "step": 3527 + }, + { + "epoch": 1.178159959926532, + "grad_norm": 0.26595801059874885, + "learning_rate": 7.60913666258206e-06, + "loss": 0.0219, + "step": 3528 + }, + { + "epoch": 1.1784939054934045, + "grad_norm": 0.24261111244989236, + "learning_rate": 7.6074787028590325e-06, + "loss": 0.0237, + "step": 3529 + }, + { + "epoch": 1.1788278510602772, + "grad_norm": 0.27314138563421714, + "learning_rate": 7.605820349245645e-06, + "loss": 0.0266, + "step": 3530 + }, + { + "epoch": 1.1791617966271497, + "grad_norm": 0.32357852051384983, + "learning_rate": 7.6041616019924125e-06, + "loss": 0.0225, + "step": 3531 + }, + { + "epoch": 1.1794957421940224, + "grad_norm": 0.3636625856897844, + "learning_rate": 7.602502461349907e-06, + "loss": 0.0337, + "step": 3532 + }, + { + "epoch": 1.179829687760895, + "grad_norm": 0.3435214508277399, + "learning_rate": 7.600842927568761e-06, + "loss": 0.0342, + "step": 3533 + }, + { + "epoch": 1.1801636333277676, + "grad_norm": 0.3477842064588296, + "learning_rate": 7.599183000899667e-06, + "loss": 0.0244, + "step": 3534 + }, + { + "epoch": 1.1804975788946401, + "grad_norm": 0.5292869411998767, + "learning_rate": 7.597522681593375e-06, + "loss": 0.0342, + "step": 3535 + }, + { + "epoch": 1.1808315244615128, + "grad_norm": 0.3037457490616166, + "learning_rate": 7.595861969900698e-06, + "loss": 0.0284, + "step": 3536 + }, + { + "epoch": 1.1811654700283853, + "grad_norm": 0.4447622076079352, + "learning_rate": 7.5942008660725065e-06, + "loss": 0.0306, + "step": 3537 + }, + { + "epoch": 1.181499415595258, + "grad_norm": 0.265649322156267, + "learning_rate": 7.5925393703597265e-06, + "loss": 0.0236, + "step": 3538 + }, + { + "epoch": 1.1818333611621306, + "grad_norm": 0.23806273170509104, + "learning_rate": 7.59087748301335e-06, + "loss": 0.0262, + "step": 3539 + }, + { + "epoch": 1.1821673067290033, + "grad_norm": 0.2513056552938845, + "learning_rate": 7.5892152042844224e-06, + "loss": 0.0199, + "step": 3540 + }, + { + "epoch": 1.1825012522958758, + "grad_norm": 0.3183336268239464, + "learning_rate": 7.58755253442405e-06, + "loss": 0.0263, + "step": 3541 + }, + { + "epoch": 1.1828351978627483, + "grad_norm": 0.3071757110631517, + "learning_rate": 7.585889473683401e-06, + "loss": 0.0255, + "step": 3542 + }, + { + "epoch": 1.183169143429621, + "grad_norm": 0.27160021026868736, + "learning_rate": 7.5842260223137e-06, + "loss": 0.0173, + "step": 3543 + }, + { + "epoch": 1.1835030889964935, + "grad_norm": 0.24386094830503965, + "learning_rate": 7.5825621805662285e-06, + "loss": 0.0241, + "step": 3544 + }, + { + "epoch": 1.1838370345633662, + "grad_norm": 0.47224786416194436, + "learning_rate": 7.580897948692332e-06, + "loss": 0.0303, + "step": 3545 + }, + { + "epoch": 1.1841709801302387, + "grad_norm": 0.2639401565130608, + "learning_rate": 7.579233326943412e-06, + "loss": 0.0216, + "step": 3546 + }, + { + "epoch": 1.1845049256971114, + "grad_norm": 0.29087331734446137, + "learning_rate": 7.577568315570925e-06, + "loss": 0.026, + "step": 3547 + }, + { + "epoch": 1.184838871263984, + "grad_norm": 0.6776129769735288, + "learning_rate": 7.5759029148263975e-06, + "loss": 0.0461, + "step": 3548 + }, + { + "epoch": 1.1851728168308566, + "grad_norm": 0.38653065090861605, + "learning_rate": 7.574237124961403e-06, + "loss": 0.0327, + "step": 3549 + }, + { + "epoch": 1.1855067623977291, + "grad_norm": 0.264678448438959, + "learning_rate": 7.572570946227582e-06, + "loss": 0.0289, + "step": 3550 + }, + { + "epoch": 1.1858407079646018, + "grad_norm": 0.530675620958939, + "learning_rate": 7.570904378876627e-06, + "loss": 0.0349, + "step": 3551 + }, + { + "epoch": 1.1861746535314743, + "grad_norm": 0.3488264650847778, + "learning_rate": 7.569237423160294e-06, + "loss": 0.0252, + "step": 3552 + }, + { + "epoch": 1.186508599098347, + "grad_norm": 0.4583056075333567, + "learning_rate": 7.567570079330395e-06, + "loss": 0.0457, + "step": 3553 + }, + { + "epoch": 1.1868425446652195, + "grad_norm": 0.5023824187574232, + "learning_rate": 7.565902347638806e-06, + "loss": 0.0385, + "step": 3554 + }, + { + "epoch": 1.1871764902320923, + "grad_norm": 0.3792435649151429, + "learning_rate": 7.564234228337452e-06, + "loss": 0.0284, + "step": 3555 + }, + { + "epoch": 1.1875104357989648, + "grad_norm": 0.2633029612235313, + "learning_rate": 7.5625657216783276e-06, + "loss": 0.0257, + "step": 3556 + }, + { + "epoch": 1.1878443813658373, + "grad_norm": 0.3584621687245536, + "learning_rate": 7.560896827913478e-06, + "loss": 0.0293, + "step": 3557 + }, + { + "epoch": 1.18817832693271, + "grad_norm": 0.2925668913663093, + "learning_rate": 7.559227547295007e-06, + "loss": 0.0242, + "step": 3558 + }, + { + "epoch": 1.1885122724995827, + "grad_norm": 0.23377559506332798, + "learning_rate": 7.557557880075082e-06, + "loss": 0.0159, + "step": 3559 + }, + { + "epoch": 1.1888462180664552, + "grad_norm": 0.31876519927547686, + "learning_rate": 7.555887826505926e-06, + "loss": 0.0235, + "step": 3560 + }, + { + "epoch": 1.1891801636333277, + "grad_norm": 0.4288110041407026, + "learning_rate": 7.554217386839817e-06, + "loss": 0.0367, + "step": 3561 + }, + { + "epoch": 1.1895141092002004, + "grad_norm": 0.4036764539007261, + "learning_rate": 7.552546561329097e-06, + "loss": 0.0243, + "step": 3562 + }, + { + "epoch": 1.189848054767073, + "grad_norm": 0.3111499091936664, + "learning_rate": 7.550875350226166e-06, + "loss": 0.0298, + "step": 3563 + }, + { + "epoch": 1.1901820003339456, + "grad_norm": 0.2472740508116731, + "learning_rate": 7.549203753783475e-06, + "loss": 0.0213, + "step": 3564 + }, + { + "epoch": 1.190515945900818, + "grad_norm": 0.46078982620520526, + "learning_rate": 7.547531772253542e-06, + "loss": 0.0373, + "step": 3565 + }, + { + "epoch": 1.1908498914676908, + "grad_norm": 0.3765683569247024, + "learning_rate": 7.54585940588894e-06, + "loss": 0.0405, + "step": 3566 + }, + { + "epoch": 1.1911838370345633, + "grad_norm": 0.2646796997015321, + "learning_rate": 7.544186654942296e-06, + "loss": 0.0259, + "step": 3567 + }, + { + "epoch": 1.191517782601436, + "grad_norm": 0.2364250317630789, + "learning_rate": 7.542513519666302e-06, + "loss": 0.0176, + "step": 3568 + }, + { + "epoch": 1.1918517281683085, + "grad_norm": 0.3678649976098643, + "learning_rate": 7.540840000313705e-06, + "loss": 0.035, + "step": 3569 + }, + { + "epoch": 1.1921856737351813, + "grad_norm": 0.22728263615909344, + "learning_rate": 7.539166097137306e-06, + "loss": 0.0189, + "step": 3570 + }, + { + "epoch": 1.1925196193020537, + "grad_norm": 0.37395129436814156, + "learning_rate": 7.537491810389972e-06, + "loss": 0.0359, + "step": 3571 + }, + { + "epoch": 1.1928535648689262, + "grad_norm": 0.42274723036577955, + "learning_rate": 7.535817140324622e-06, + "loss": 0.0344, + "step": 3572 + }, + { + "epoch": 1.193187510435799, + "grad_norm": 0.3830021184430302, + "learning_rate": 7.534142087194234e-06, + "loss": 0.0188, + "step": 3573 + }, + { + "epoch": 1.1935214560026717, + "grad_norm": 0.281046038536792, + "learning_rate": 7.532466651251846e-06, + "loss": 0.0293, + "step": 3574 + }, + { + "epoch": 1.1938554015695442, + "grad_norm": 0.3438638328473842, + "learning_rate": 7.5307908327505506e-06, + "loss": 0.0272, + "step": 3575 + }, + { + "epoch": 1.1941893471364167, + "grad_norm": 0.2938006118359201, + "learning_rate": 7.529114631943501e-06, + "loss": 0.02, + "step": 3576 + }, + { + "epoch": 1.1945232927032894, + "grad_norm": 0.3837708613941296, + "learning_rate": 7.527438049083908e-06, + "loss": 0.0281, + "step": 3577 + }, + { + "epoch": 1.1948572382701619, + "grad_norm": 0.42985674766269283, + "learning_rate": 7.5257610844250385e-06, + "loss": 0.0451, + "step": 3578 + }, + { + "epoch": 1.1951911838370346, + "grad_norm": 0.4765094879000336, + "learning_rate": 7.524083738220214e-06, + "loss": 0.0438, + "step": 3579 + }, + { + "epoch": 1.195525129403907, + "grad_norm": 0.3391737775316126, + "learning_rate": 7.522406010722824e-06, + "loss": 0.0326, + "step": 3580 + }, + { + "epoch": 1.1958590749707798, + "grad_norm": 0.4252817384598663, + "learning_rate": 7.5207279021863045e-06, + "loss": 0.0301, + "step": 3581 + }, + { + "epoch": 1.1961930205376523, + "grad_norm": 0.3537507345301453, + "learning_rate": 7.5190494128641545e-06, + "loss": 0.0264, + "step": 3582 + }, + { + "epoch": 1.196526966104525, + "grad_norm": 0.28006465042221873, + "learning_rate": 7.5173705430099295e-06, + "loss": 0.0217, + "step": 3583 + }, + { + "epoch": 1.1968609116713975, + "grad_norm": 0.31595347079765157, + "learning_rate": 7.515691292877243e-06, + "loss": 0.0351, + "step": 3584 + }, + { + "epoch": 1.1971948572382702, + "grad_norm": 0.3410057754683141, + "learning_rate": 7.514011662719766e-06, + "loss": 0.031, + "step": 3585 + }, + { + "epoch": 1.1975288028051427, + "grad_norm": 0.4029779175853151, + "learning_rate": 7.512331652791226e-06, + "loss": 0.0477, + "step": 3586 + }, + { + "epoch": 1.1978627483720155, + "grad_norm": 0.319223776904647, + "learning_rate": 7.510651263345408e-06, + "loss": 0.0328, + "step": 3587 + }, + { + "epoch": 1.198196693938888, + "grad_norm": 0.2832307786697477, + "learning_rate": 7.508970494636154e-06, + "loss": 0.0241, + "step": 3588 + }, + { + "epoch": 1.1985306395057607, + "grad_norm": 0.3140097991487349, + "learning_rate": 7.507289346917366e-06, + "loss": 0.0205, + "step": 3589 + }, + { + "epoch": 1.1988645850726332, + "grad_norm": 0.2893843548475163, + "learning_rate": 7.505607820442997e-06, + "loss": 0.0237, + "step": 3590 + }, + { + "epoch": 1.1991985306395057, + "grad_norm": 0.43556983943009747, + "learning_rate": 7.503925915467066e-06, + "loss": 0.033, + "step": 3591 + }, + { + "epoch": 1.1995324762063784, + "grad_norm": 0.3413309881274883, + "learning_rate": 7.502243632243645e-06, + "loss": 0.0378, + "step": 3592 + }, + { + "epoch": 1.1998664217732509, + "grad_norm": 0.2820857935235703, + "learning_rate": 7.500560971026856e-06, + "loss": 0.0314, + "step": 3593 + }, + { + "epoch": 1.2002003673401236, + "grad_norm": 0.34793538336750207, + "learning_rate": 7.498877932070892e-06, + "loss": 0.0307, + "step": 3594 + }, + { + "epoch": 1.200534312906996, + "grad_norm": 0.33430281958249236, + "learning_rate": 7.497194515629992e-06, + "loss": 0.0296, + "step": 3595 + }, + { + "epoch": 1.2008682584738688, + "grad_norm": 0.33080673886716166, + "learning_rate": 7.4955107219584575e-06, + "loss": 0.0269, + "step": 3596 + }, + { + "epoch": 1.2012022040407413, + "grad_norm": 0.3219894335146745, + "learning_rate": 7.493826551310645e-06, + "loss": 0.0255, + "step": 3597 + }, + { + "epoch": 1.201536149607614, + "grad_norm": 0.3366922056344637, + "learning_rate": 7.492142003940966e-06, + "loss": 0.0302, + "step": 3598 + }, + { + "epoch": 1.2018700951744865, + "grad_norm": 0.2769343236942904, + "learning_rate": 7.490457080103895e-06, + "loss": 0.0258, + "step": 3599 + }, + { + "epoch": 1.2022040407413592, + "grad_norm": 0.2959540008543708, + "learning_rate": 7.4887717800539584e-06, + "loss": 0.0261, + "step": 3600 + }, + { + "epoch": 1.2025379863082317, + "grad_norm": 0.28120309411333794, + "learning_rate": 7.48708610404574e-06, + "loss": 0.021, + "step": 3601 + }, + { + "epoch": 1.2028719318751044, + "grad_norm": 0.3780547262738609, + "learning_rate": 7.48540005233388e-06, + "loss": 0.0297, + "step": 3602 + }, + { + "epoch": 1.203205877441977, + "grad_norm": 0.37150191881556194, + "learning_rate": 7.483713625173078e-06, + "loss": 0.0218, + "step": 3603 + }, + { + "epoch": 1.2035398230088497, + "grad_norm": 0.2728269051670745, + "learning_rate": 7.482026822818088e-06, + "loss": 0.0199, + "step": 3604 + }, + { + "epoch": 1.2038737685757221, + "grad_norm": 0.4121363752550122, + "learning_rate": 7.480339645523721e-06, + "loss": 0.0345, + "step": 3605 + }, + { + "epoch": 1.2042077141425946, + "grad_norm": 0.3126177722679711, + "learning_rate": 7.478652093544846e-06, + "loss": 0.0237, + "step": 3606 + }, + { + "epoch": 1.2045416597094674, + "grad_norm": 0.28129377355025, + "learning_rate": 7.476964167136388e-06, + "loss": 0.0235, + "step": 3607 + }, + { + "epoch": 1.20487560527634, + "grad_norm": 0.30910687217761773, + "learning_rate": 7.475275866553326e-06, + "loss": 0.0277, + "step": 3608 + }, + { + "epoch": 1.2052095508432126, + "grad_norm": 0.5336132934124976, + "learning_rate": 7.473587192050698e-06, + "loss": 0.0323, + "step": 3609 + }, + { + "epoch": 1.205543496410085, + "grad_norm": 0.26106460006122423, + "learning_rate": 7.471898143883601e-06, + "loss": 0.0197, + "step": 3610 + }, + { + "epoch": 1.2058774419769578, + "grad_norm": 0.30317693876187, + "learning_rate": 7.470208722307183e-06, + "loss": 0.0253, + "step": 3611 + }, + { + "epoch": 1.2062113875438303, + "grad_norm": 0.25193483447797743, + "learning_rate": 7.468518927576653e-06, + "loss": 0.0215, + "step": 3612 + }, + { + "epoch": 1.206545333110703, + "grad_norm": 0.263215237487763, + "learning_rate": 7.466828759947271e-06, + "loss": 0.0227, + "step": 3613 + }, + { + "epoch": 1.2068792786775755, + "grad_norm": 0.29627708566072053, + "learning_rate": 7.465138219674359e-06, + "loss": 0.0288, + "step": 3614 + }, + { + "epoch": 1.2072132242444482, + "grad_norm": 0.3509762238645572, + "learning_rate": 7.463447307013294e-06, + "loss": 0.0331, + "step": 3615 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 0.35838158455769176, + "learning_rate": 7.461756022219507e-06, + "loss": 0.0219, + "step": 3616 + }, + { + "epoch": 1.2078811153781934, + "grad_norm": 0.5607378045821658, + "learning_rate": 7.460064365548486e-06, + "loss": 0.0337, + "step": 3617 + }, + { + "epoch": 1.208215060945066, + "grad_norm": 0.2473096236923032, + "learning_rate": 7.458372337255777e-06, + "loss": 0.0245, + "step": 3618 + }, + { + "epoch": 1.2085490065119386, + "grad_norm": 0.2791939675004245, + "learning_rate": 7.45667993759698e-06, + "loss": 0.0238, + "step": 3619 + }, + { + "epoch": 1.2088829520788111, + "grad_norm": 0.32236033089419347, + "learning_rate": 7.454987166827751e-06, + "loss": 0.033, + "step": 3620 + }, + { + "epoch": 1.2092168976456836, + "grad_norm": 0.34359333802445224, + "learning_rate": 7.4532940252038055e-06, + "loss": 0.0242, + "step": 3621 + }, + { + "epoch": 1.2095508432125563, + "grad_norm": 0.28346174168334615, + "learning_rate": 7.45160051298091e-06, + "loss": 0.0219, + "step": 3622 + }, + { + "epoch": 1.209884788779429, + "grad_norm": 0.3394500848050175, + "learning_rate": 7.4499066304148904e-06, + "loss": 0.0358, + "step": 3623 + }, + { + "epoch": 1.2102187343463016, + "grad_norm": 0.28727684362348355, + "learning_rate": 7.448212377761628e-06, + "loss": 0.0236, + "step": 3624 + }, + { + "epoch": 1.210552679913174, + "grad_norm": 0.23030080548127382, + "learning_rate": 7.4465177552770585e-06, + "loss": 0.0211, + "step": 3625 + }, + { + "epoch": 1.2108866254800468, + "grad_norm": 0.3752054423570739, + "learning_rate": 7.444822763217174e-06, + "loss": 0.0312, + "step": 3626 + }, + { + "epoch": 1.2112205710469193, + "grad_norm": 0.3880340810105748, + "learning_rate": 7.443127401838026e-06, + "loss": 0.0304, + "step": 3627 + }, + { + "epoch": 1.211554516613792, + "grad_norm": 0.3593823234963432, + "learning_rate": 7.441431671395717e-06, + "loss": 0.0252, + "step": 3628 + }, + { + "epoch": 1.2118884621806645, + "grad_norm": 0.28294692967676155, + "learning_rate": 7.439735572146407e-06, + "loss": 0.0224, + "step": 3629 + }, + { + "epoch": 1.2122224077475372, + "grad_norm": 0.4649483517510499, + "learning_rate": 7.438039104346312e-06, + "loss": 0.0338, + "step": 3630 + }, + { + "epoch": 1.2125563533144097, + "grad_norm": 0.28148285679186763, + "learning_rate": 7.436342268251702e-06, + "loss": 0.026, + "step": 3631 + }, + { + "epoch": 1.2128902988812824, + "grad_norm": 0.3519283974511048, + "learning_rate": 7.434645064118906e-06, + "loss": 0.0327, + "step": 3632 + }, + { + "epoch": 1.213224244448155, + "grad_norm": 0.2553342539182069, + "learning_rate": 7.432947492204308e-06, + "loss": 0.0185, + "step": 3633 + }, + { + "epoch": 1.2135581900150276, + "grad_norm": 0.3141381461297296, + "learning_rate": 7.431249552764342e-06, + "loss": 0.0314, + "step": 3634 + }, + { + "epoch": 1.2138921355819001, + "grad_norm": 0.3525165310268248, + "learning_rate": 7.429551246055504e-06, + "loss": 0.0335, + "step": 3635 + }, + { + "epoch": 1.2142260811487728, + "grad_norm": 0.3989324630739906, + "learning_rate": 7.427852572334344e-06, + "loss": 0.0335, + "step": 3636 + }, + { + "epoch": 1.2145600267156453, + "grad_norm": 0.3337331951530839, + "learning_rate": 7.426153531857466e-06, + "loss": 0.0324, + "step": 3637 + }, + { + "epoch": 1.214893972282518, + "grad_norm": 0.20674712407248522, + "learning_rate": 7.424454124881531e-06, + "loss": 0.0189, + "step": 3638 + }, + { + "epoch": 1.2152279178493905, + "grad_norm": 0.3204365065385207, + "learning_rate": 7.422754351663252e-06, + "loss": 0.032, + "step": 3639 + }, + { + "epoch": 1.215561863416263, + "grad_norm": 0.21512092753837306, + "learning_rate": 7.4210542124594e-06, + "loss": 0.0219, + "step": 3640 + }, + { + "epoch": 1.2158958089831358, + "grad_norm": 0.31341545847677554, + "learning_rate": 7.419353707526804e-06, + "loss": 0.0261, + "step": 3641 + }, + { + "epoch": 1.2162297545500083, + "grad_norm": 0.2889569310745874, + "learning_rate": 7.417652837122345e-06, + "loss": 0.0245, + "step": 3642 + }, + { + "epoch": 1.216563700116881, + "grad_norm": 0.2902261356520683, + "learning_rate": 7.4159516015029545e-06, + "loss": 0.0261, + "step": 3643 + }, + { + "epoch": 1.2168976456837535, + "grad_norm": 0.31994923991612734, + "learning_rate": 7.414250000925629e-06, + "loss": 0.0297, + "step": 3644 + }, + { + "epoch": 1.2172315912506262, + "grad_norm": 0.3074415733624934, + "learning_rate": 7.412548035647416e-06, + "loss": 0.0291, + "step": 3645 + }, + { + "epoch": 1.2175655368174987, + "grad_norm": 0.2600369049394611, + "learning_rate": 7.4108457059254135e-06, + "loss": 0.0271, + "step": 3646 + }, + { + "epoch": 1.2178994823843714, + "grad_norm": 0.3618982584590955, + "learning_rate": 7.40914301201678e-06, + "loss": 0.0349, + "step": 3647 + }, + { + "epoch": 1.218233427951244, + "grad_norm": 0.328663757492403, + "learning_rate": 7.407439954178729e-06, + "loss": 0.0312, + "step": 3648 + }, + { + "epoch": 1.2185673735181166, + "grad_norm": 0.24598683290237383, + "learning_rate": 7.405736532668525e-06, + "loss": 0.019, + "step": 3649 + }, + { + "epoch": 1.218901319084989, + "grad_norm": 0.30976253960270267, + "learning_rate": 7.4040327477434926e-06, + "loss": 0.0224, + "step": 3650 + }, + { + "epoch": 1.2192352646518618, + "grad_norm": 0.32080993247481554, + "learning_rate": 7.402328599661006e-06, + "loss": 0.0226, + "step": 3651 + }, + { + "epoch": 1.2195692102187343, + "grad_norm": 0.37200366630708953, + "learning_rate": 7.400624088678497e-06, + "loss": 0.0308, + "step": 3652 + }, + { + "epoch": 1.219903155785607, + "grad_norm": 0.27923380310336365, + "learning_rate": 7.398919215053455e-06, + "loss": 0.0207, + "step": 3653 + }, + { + "epoch": 1.2202371013524795, + "grad_norm": 0.2985047236029938, + "learning_rate": 7.397213979043418e-06, + "loss": 0.028, + "step": 3654 + }, + { + "epoch": 1.220571046919352, + "grad_norm": 0.34394570071340674, + "learning_rate": 7.395508380905983e-06, + "loss": 0.0231, + "step": 3655 + }, + { + "epoch": 1.2209049924862247, + "grad_norm": 0.30248672754705974, + "learning_rate": 7.393802420898801e-06, + "loss": 0.0242, + "step": 3656 + }, + { + "epoch": 1.2212389380530975, + "grad_norm": 0.2503042875788558, + "learning_rate": 7.392096099279579e-06, + "loss": 0.0202, + "step": 3657 + }, + { + "epoch": 1.22157288361997, + "grad_norm": 0.3569871645662349, + "learning_rate": 7.390389416306073e-06, + "loss": 0.0257, + "step": 3658 + }, + { + "epoch": 1.2219068291868425, + "grad_norm": 0.25009609421324414, + "learning_rate": 7.3886823722361e-06, + "loss": 0.0251, + "step": 3659 + }, + { + "epoch": 1.2222407747537152, + "grad_norm": 0.24203243906148403, + "learning_rate": 7.386974967327531e-06, + "loss": 0.0222, + "step": 3660 + }, + { + "epoch": 1.2225747203205877, + "grad_norm": 0.27631277036377877, + "learning_rate": 7.385267201838284e-06, + "loss": 0.0182, + "step": 3661 + }, + { + "epoch": 1.2229086658874604, + "grad_norm": 0.46240374040892274, + "learning_rate": 7.383559076026343e-06, + "loss": 0.0385, + "step": 3662 + }, + { + "epoch": 1.2232426114543329, + "grad_norm": 0.2415788281018439, + "learning_rate": 7.381850590149737e-06, + "loss": 0.02, + "step": 3663 + }, + { + "epoch": 1.2235765570212056, + "grad_norm": 0.34618615305852357, + "learning_rate": 7.380141744466555e-06, + "loss": 0.0246, + "step": 3664 + }, + { + "epoch": 1.223910502588078, + "grad_norm": 0.28776979877352327, + "learning_rate": 7.378432539234936e-06, + "loss": 0.0232, + "step": 3665 + }, + { + "epoch": 1.2242444481549508, + "grad_norm": 0.32124672291576994, + "learning_rate": 7.376722974713078e-06, + "loss": 0.0191, + "step": 3666 + }, + { + "epoch": 1.2245783937218233, + "grad_norm": 0.361088326471107, + "learning_rate": 7.3750130511592275e-06, + "loss": 0.0298, + "step": 3667 + }, + { + "epoch": 1.224912339288696, + "grad_norm": 0.41441701315501017, + "learning_rate": 7.373302768831694e-06, + "loss": 0.0395, + "step": 3668 + }, + { + "epoch": 1.2252462848555685, + "grad_norm": 0.2768492371418646, + "learning_rate": 7.371592127988831e-06, + "loss": 0.0302, + "step": 3669 + }, + { + "epoch": 1.225580230422441, + "grad_norm": 0.2773369556998907, + "learning_rate": 7.369881128889052e-06, + "loss": 0.0171, + "step": 3670 + }, + { + "epoch": 1.2259141759893137, + "grad_norm": 0.2766091810132415, + "learning_rate": 7.368169771790825e-06, + "loss": 0.0234, + "step": 3671 + }, + { + "epoch": 1.2262481215561865, + "grad_norm": 0.3764666912304945, + "learning_rate": 7.366458056952668e-06, + "loss": 0.0275, + "step": 3672 + }, + { + "epoch": 1.226582067123059, + "grad_norm": 0.25978182887351225, + "learning_rate": 7.36474598463316e-06, + "loss": 0.0206, + "step": 3673 + }, + { + "epoch": 1.2269160126899314, + "grad_norm": 0.28853253691773284, + "learning_rate": 7.363033555090925e-06, + "loss": 0.022, + "step": 3674 + }, + { + "epoch": 1.2272499582568042, + "grad_norm": 0.3224334967181729, + "learning_rate": 7.361320768584648e-06, + "loss": 0.0247, + "step": 3675 + }, + { + "epoch": 1.2275839038236767, + "grad_norm": 0.38351597041853136, + "learning_rate": 7.359607625373065e-06, + "loss": 0.0293, + "step": 3676 + }, + { + "epoch": 1.2279178493905494, + "grad_norm": 0.26102522211177076, + "learning_rate": 7.357894125714967e-06, + "loss": 0.0229, + "step": 3677 + }, + { + "epoch": 1.2282517949574219, + "grad_norm": 0.2969561641140478, + "learning_rate": 7.3561802698691976e-06, + "loss": 0.0271, + "step": 3678 + }, + { + "epoch": 1.2285857405242946, + "grad_norm": 0.30718238258239033, + "learning_rate": 7.354466058094656e-06, + "loss": 0.0296, + "step": 3679 + }, + { + "epoch": 1.228919686091167, + "grad_norm": 0.41127526819441096, + "learning_rate": 7.352751490650294e-06, + "loss": 0.041, + "step": 3680 + }, + { + "epoch": 1.2292536316580398, + "grad_norm": 0.34722166038411595, + "learning_rate": 7.3510365677951155e-06, + "loss": 0.0231, + "step": 3681 + }, + { + "epoch": 1.2295875772249123, + "grad_norm": 0.2969655574350491, + "learning_rate": 7.349321289788181e-06, + "loss": 0.0229, + "step": 3682 + }, + { + "epoch": 1.229921522791785, + "grad_norm": 0.36054722563729874, + "learning_rate": 7.3476056568886036e-06, + "loss": 0.0286, + "step": 3683 + }, + { + "epoch": 1.2302554683586575, + "grad_norm": 0.38218968715418694, + "learning_rate": 7.34588966935555e-06, + "loss": 0.0317, + "step": 3684 + }, + { + "epoch": 1.2305894139255302, + "grad_norm": 0.35561484948591066, + "learning_rate": 7.344173327448238e-06, + "loss": 0.0267, + "step": 3685 + }, + { + "epoch": 1.2309233594924027, + "grad_norm": 0.3315702141284436, + "learning_rate": 7.342456631425945e-06, + "loss": 0.0385, + "step": 3686 + }, + { + "epoch": 1.2312573050592754, + "grad_norm": 0.5917973351022848, + "learning_rate": 7.340739581547996e-06, + "loss": 0.0338, + "step": 3687 + }, + { + "epoch": 1.231591250626148, + "grad_norm": 0.36875324150377503, + "learning_rate": 7.339022178073772e-06, + "loss": 0.0227, + "step": 3688 + }, + { + "epoch": 1.2319251961930204, + "grad_norm": 0.36599153753174596, + "learning_rate": 7.337304421262706e-06, + "loss": 0.0271, + "step": 3689 + }, + { + "epoch": 1.2322591417598932, + "grad_norm": 0.28380159214160994, + "learning_rate": 7.335586311374287e-06, + "loss": 0.0294, + "step": 3690 + }, + { + "epoch": 1.2325930873267656, + "grad_norm": 0.3715206835350677, + "learning_rate": 7.3338678486680545e-06, + "loss": 0.0377, + "step": 3691 + }, + { + "epoch": 1.2329270328936384, + "grad_norm": 0.3889552947049423, + "learning_rate": 7.3321490334036035e-06, + "loss": 0.0379, + "step": 3692 + }, + { + "epoch": 1.2332609784605109, + "grad_norm": 0.43657615137252603, + "learning_rate": 7.3304298658405815e-06, + "loss": 0.0374, + "step": 3693 + }, + { + "epoch": 1.2335949240273836, + "grad_norm": 0.4115973135062599, + "learning_rate": 7.328710346238688e-06, + "loss": 0.0285, + "step": 3694 + }, + { + "epoch": 1.233928869594256, + "grad_norm": 0.3322707553589683, + "learning_rate": 7.326990474857676e-06, + "loss": 0.0161, + "step": 3695 + }, + { + "epoch": 1.2342628151611288, + "grad_norm": 0.26815057066610914, + "learning_rate": 7.3252702519573545e-06, + "loss": 0.0207, + "step": 3696 + }, + { + "epoch": 1.2345967607280013, + "grad_norm": 0.41046045045124907, + "learning_rate": 7.323549677797582e-06, + "loss": 0.0268, + "step": 3697 + }, + { + "epoch": 1.234930706294874, + "grad_norm": 0.2957086659234872, + "learning_rate": 7.3218287526382716e-06, + "loss": 0.0279, + "step": 3698 + }, + { + "epoch": 1.2352646518617465, + "grad_norm": 0.35141890241321566, + "learning_rate": 7.320107476739389e-06, + "loss": 0.0353, + "step": 3699 + }, + { + "epoch": 1.2355985974286192, + "grad_norm": 0.3318220269094786, + "learning_rate": 7.318385850360954e-06, + "loss": 0.0291, + "step": 3700 + }, + { + "epoch": 1.2359325429954917, + "grad_norm": 0.4101419387384786, + "learning_rate": 7.316663873763039e-06, + "loss": 0.0257, + "step": 3701 + }, + { + "epoch": 1.2362664885623644, + "grad_norm": 0.3066255629032018, + "learning_rate": 7.314941547205767e-06, + "loss": 0.0258, + "step": 3702 + }, + { + "epoch": 1.236600434129237, + "grad_norm": 0.27561284004815184, + "learning_rate": 7.313218870949317e-06, + "loss": 0.0236, + "step": 3703 + }, + { + "epoch": 1.2369343796961094, + "grad_norm": 0.28450644260498353, + "learning_rate": 7.31149584525392e-06, + "loss": 0.0265, + "step": 3704 + }, + { + "epoch": 1.2372683252629821, + "grad_norm": 0.2990032445567785, + "learning_rate": 7.309772470379856e-06, + "loss": 0.03, + "step": 3705 + }, + { + "epoch": 1.2376022708298549, + "grad_norm": 0.3054920377427087, + "learning_rate": 7.308048746587466e-06, + "loss": 0.0226, + "step": 3706 + }, + { + "epoch": 1.2379362163967274, + "grad_norm": 0.31201169241005794, + "learning_rate": 7.3063246741371365e-06, + "loss": 0.0216, + "step": 3707 + }, + { + "epoch": 1.2382701619635998, + "grad_norm": 0.389669345428009, + "learning_rate": 7.304600253289308e-06, + "loss": 0.0323, + "step": 3708 + }, + { + "epoch": 1.2386041075304726, + "grad_norm": 0.26737606664063485, + "learning_rate": 7.302875484304476e-06, + "loss": 0.0353, + "step": 3709 + }, + { + "epoch": 1.238938053097345, + "grad_norm": 0.3235634216216689, + "learning_rate": 7.301150367443186e-06, + "loss": 0.0283, + "step": 3710 + }, + { + "epoch": 1.2392719986642178, + "grad_norm": 0.33968239674236567, + "learning_rate": 7.299424902966039e-06, + "loss": 0.0314, + "step": 3711 + }, + { + "epoch": 1.2396059442310903, + "grad_norm": 0.24288643222976627, + "learning_rate": 7.297699091133685e-06, + "loss": 0.0251, + "step": 3712 + }, + { + "epoch": 1.239939889797963, + "grad_norm": 0.2603087747863267, + "learning_rate": 7.295972932206827e-06, + "loss": 0.0225, + "step": 3713 + }, + { + "epoch": 1.2402738353648355, + "grad_norm": 0.3163294158642131, + "learning_rate": 7.2942464264462255e-06, + "loss": 0.0244, + "step": 3714 + }, + { + "epoch": 1.2406077809317082, + "grad_norm": 0.299362756794637, + "learning_rate": 7.292519574112688e-06, + "loss": 0.0277, + "step": 3715 + }, + { + "epoch": 1.2409417264985807, + "grad_norm": 0.3554647010376918, + "learning_rate": 7.290792375467074e-06, + "loss": 0.0294, + "step": 3716 + }, + { + "epoch": 1.2412756720654534, + "grad_norm": 0.3630876067157386, + "learning_rate": 7.2890648307702985e-06, + "loss": 0.0349, + "step": 3717 + }, + { + "epoch": 1.241609617632326, + "grad_norm": 0.2755333053761441, + "learning_rate": 7.287336940283327e-06, + "loss": 0.0256, + "step": 3718 + }, + { + "epoch": 1.2419435631991984, + "grad_norm": 0.351516780352397, + "learning_rate": 7.28560870426718e-06, + "loss": 0.0319, + "step": 3719 + }, + { + "epoch": 1.2422775087660711, + "grad_norm": 0.3682681590002886, + "learning_rate": 7.2838801229829245e-06, + "loss": 0.0291, + "step": 3720 + }, + { + "epoch": 1.2426114543329438, + "grad_norm": 0.33590142023212194, + "learning_rate": 7.2821511966916845e-06, + "loss": 0.023, + "step": 3721 + }, + { + "epoch": 1.2429453998998163, + "grad_norm": 0.26643045835727613, + "learning_rate": 7.280421925654635e-06, + "loss": 0.0214, + "step": 3722 + }, + { + "epoch": 1.2432793454666888, + "grad_norm": 0.2776713771622553, + "learning_rate": 7.278692310133003e-06, + "loss": 0.0249, + "step": 3723 + }, + { + "epoch": 1.2436132910335616, + "grad_norm": 0.3039900578868121, + "learning_rate": 7.276962350388067e-06, + "loss": 0.039, + "step": 3724 + }, + { + "epoch": 1.243947236600434, + "grad_norm": 0.3580496006561448, + "learning_rate": 7.275232046681157e-06, + "loss": 0.0294, + "step": 3725 + }, + { + "epoch": 1.2442811821673068, + "grad_norm": 0.3491025317337139, + "learning_rate": 7.273501399273656e-06, + "loss": 0.0342, + "step": 3726 + }, + { + "epoch": 1.2446151277341793, + "grad_norm": 0.33509533968101746, + "learning_rate": 7.271770408427e-06, + "loss": 0.0341, + "step": 3727 + }, + { + "epoch": 1.244949073301052, + "grad_norm": 0.46171379206470553, + "learning_rate": 7.2700390744026735e-06, + "loss": 0.0333, + "step": 3728 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.36499803123861596, + "learning_rate": 7.2683073974622165e-06, + "loss": 0.021, + "step": 3729 + }, + { + "epoch": 1.2456169644347972, + "grad_norm": 0.48052357895113695, + "learning_rate": 7.26657537786722e-06, + "loss": 0.0369, + "step": 3730 + }, + { + "epoch": 1.2459509100016697, + "grad_norm": 0.32479066646778987, + "learning_rate": 7.264843015879321e-06, + "loss": 0.0188, + "step": 3731 + }, + { + "epoch": 1.2462848555685424, + "grad_norm": 0.5374273644484915, + "learning_rate": 7.263110311760221e-06, + "loss": 0.0341, + "step": 3732 + }, + { + "epoch": 1.246618801135415, + "grad_norm": 0.3937749073326078, + "learning_rate": 7.2613772657716585e-06, + "loss": 0.0267, + "step": 3733 + }, + { + "epoch": 1.2469527467022876, + "grad_norm": 0.339198762168034, + "learning_rate": 7.259643878175434e-06, + "loss": 0.0432, + "step": 3734 + }, + { + "epoch": 1.2472866922691601, + "grad_norm": 0.3832528094853473, + "learning_rate": 7.2579101492333956e-06, + "loss": 0.0332, + "step": 3735 + }, + { + "epoch": 1.2476206378360328, + "grad_norm": 0.3728492101697254, + "learning_rate": 7.256176079207442e-06, + "loss": 0.034, + "step": 3736 + }, + { + "epoch": 1.2479545834029053, + "grad_norm": 0.36633795428673205, + "learning_rate": 7.254441668359527e-06, + "loss": 0.0252, + "step": 3737 + }, + { + "epoch": 1.2482885289697778, + "grad_norm": 0.48856680714255346, + "learning_rate": 7.252706916951653e-06, + "loss": 0.0347, + "step": 3738 + }, + { + "epoch": 1.2486224745366505, + "grad_norm": 0.3314686964943417, + "learning_rate": 7.250971825245874e-06, + "loss": 0.0312, + "step": 3739 + }, + { + "epoch": 1.248956420103523, + "grad_norm": 0.2533430483215571, + "learning_rate": 7.249236393504296e-06, + "loss": 0.0219, + "step": 3740 + }, + { + "epoch": 1.2492903656703958, + "grad_norm": 0.31333906688251445, + "learning_rate": 7.247500621989078e-06, + "loss": 0.0306, + "step": 3741 + }, + { + "epoch": 1.2496243112372682, + "grad_norm": 0.374680220817901, + "learning_rate": 7.245764510962426e-06, + "loss": 0.0425, + "step": 3742 + }, + { + "epoch": 1.249958256804141, + "grad_norm": 0.3897165493108914, + "learning_rate": 7.244028060686603e-06, + "loss": 0.0266, + "step": 3743 + }, + { + "epoch": 1.2502922023710135, + "grad_norm": 0.32147299764024756, + "learning_rate": 7.242291271423919e-06, + "loss": 0.0236, + "step": 3744 + }, + { + "epoch": 1.2506261479378862, + "grad_norm": 0.34935109492629934, + "learning_rate": 7.240554143436735e-06, + "loss": 0.0279, + "step": 3745 + }, + { + "epoch": 1.2509600935047587, + "grad_norm": 0.4583164819519443, + "learning_rate": 7.238816676987467e-06, + "loss": 0.0305, + "step": 3746 + }, + { + "epoch": 1.2512940390716314, + "grad_norm": 0.44515078128846375, + "learning_rate": 7.237078872338579e-06, + "loss": 0.0332, + "step": 3747 + }, + { + "epoch": 1.2516279846385039, + "grad_norm": 0.39088623320149796, + "learning_rate": 7.235340729752584e-06, + "loss": 0.0283, + "step": 3748 + }, + { + "epoch": 1.2519619302053766, + "grad_norm": 0.2830828577664718, + "learning_rate": 7.233602249492055e-06, + "loss": 0.0244, + "step": 3749 + }, + { + "epoch": 1.252295875772249, + "grad_norm": 0.29156033814021487, + "learning_rate": 7.2318634318196045e-06, + "loss": 0.0274, + "step": 3750 + }, + { + "epoch": 1.2526298213391218, + "grad_norm": 0.4269714947221831, + "learning_rate": 7.230124276997903e-06, + "loss": 0.0371, + "step": 3751 + }, + { + "epoch": 1.2529637669059943, + "grad_norm": 0.292773984024898, + "learning_rate": 7.228384785289671e-06, + "loss": 0.0283, + "step": 3752 + }, + { + "epoch": 1.2532977124728668, + "grad_norm": 0.2606263972987227, + "learning_rate": 7.2266449569576804e-06, + "loss": 0.0235, + "step": 3753 + }, + { + "epoch": 1.2536316580397395, + "grad_norm": 0.45745548811577635, + "learning_rate": 7.224904792264748e-06, + "loss": 0.0399, + "step": 3754 + }, + { + "epoch": 1.2539656036066122, + "grad_norm": 0.3575217997424679, + "learning_rate": 7.223164291473752e-06, + "loss": 0.0217, + "step": 3755 + }, + { + "epoch": 1.2542995491734847, + "grad_norm": 0.34302189426855073, + "learning_rate": 7.221423454847611e-06, + "loss": 0.0219, + "step": 3756 + }, + { + "epoch": 1.2546334947403572, + "grad_norm": 0.3340657088961129, + "learning_rate": 7.219682282649302e-06, + "loss": 0.0209, + "step": 3757 + }, + { + "epoch": 1.25496744030723, + "grad_norm": 0.3380616537037228, + "learning_rate": 7.2179407751418485e-06, + "loss": 0.0244, + "step": 3758 + }, + { + "epoch": 1.2553013858741024, + "grad_norm": 0.2869316250472297, + "learning_rate": 7.216198932588325e-06, + "loss": 0.021, + "step": 3759 + }, + { + "epoch": 1.2556353314409752, + "grad_norm": 0.31710265418670425, + "learning_rate": 7.214456755251858e-06, + "loss": 0.0246, + "step": 3760 + }, + { + "epoch": 1.2559692770078477, + "grad_norm": 0.2905960517037898, + "learning_rate": 7.212714243395623e-06, + "loss": 0.0293, + "step": 3761 + }, + { + "epoch": 1.2563032225747204, + "grad_norm": 0.3541273326846222, + "learning_rate": 7.210971397282848e-06, + "loss": 0.0265, + "step": 3762 + }, + { + "epoch": 1.2566371681415929, + "grad_norm": 0.2694311729984884, + "learning_rate": 7.20922821717681e-06, + "loss": 0.0244, + "step": 3763 + }, + { + "epoch": 1.2569711137084656, + "grad_norm": 0.2422903599196817, + "learning_rate": 7.207484703340838e-06, + "loss": 0.0204, + "step": 3764 + }, + { + "epoch": 1.257305059275338, + "grad_norm": 0.2544145464785348, + "learning_rate": 7.205740856038308e-06, + "loss": 0.0234, + "step": 3765 + }, + { + "epoch": 1.2576390048422108, + "grad_norm": 0.29299204974133036, + "learning_rate": 7.2039966755326515e-06, + "loss": 0.0253, + "step": 3766 + }, + { + "epoch": 1.2579729504090833, + "grad_norm": 0.3177885966840588, + "learning_rate": 7.2022521620873456e-06, + "loss": 0.0256, + "step": 3767 + }, + { + "epoch": 1.2583068959759558, + "grad_norm": 0.2542501316544053, + "learning_rate": 7.2005073159659186e-06, + "loss": 0.0215, + "step": 3768 + }, + { + "epoch": 1.2586408415428285, + "grad_norm": 0.5497569773621858, + "learning_rate": 7.198762137431952e-06, + "loss": 0.0431, + "step": 3769 + }, + { + "epoch": 1.2589747871097012, + "grad_norm": 0.2553974569267879, + "learning_rate": 7.197016626749076e-06, + "loss": 0.019, + "step": 3770 + }, + { + "epoch": 1.2593087326765737, + "grad_norm": 0.19873663578782186, + "learning_rate": 7.195270784180968e-06, + "loss": 0.0156, + "step": 3771 + }, + { + "epoch": 1.2596426782434462, + "grad_norm": 0.4408211680948502, + "learning_rate": 7.193524609991359e-06, + "loss": 0.0429, + "step": 3772 + }, + { + "epoch": 1.259976623810319, + "grad_norm": 0.3525185783707462, + "learning_rate": 7.191778104444031e-06, + "loss": 0.0297, + "step": 3773 + }, + { + "epoch": 1.2603105693771914, + "grad_norm": 0.3667028187971598, + "learning_rate": 7.190031267802814e-06, + "loss": 0.0344, + "step": 3774 + }, + { + "epoch": 1.2606445149440642, + "grad_norm": 0.3342665533007622, + "learning_rate": 7.188284100331585e-06, + "loss": 0.0319, + "step": 3775 + }, + { + "epoch": 1.2609784605109366, + "grad_norm": 0.27642443399097244, + "learning_rate": 7.186536602294278e-06, + "loss": 0.0287, + "step": 3776 + }, + { + "epoch": 1.2613124060778094, + "grad_norm": 0.35601960051788384, + "learning_rate": 7.184788773954871e-06, + "loss": 0.0266, + "step": 3777 + }, + { + "epoch": 1.2616463516446819, + "grad_norm": 0.17862828576846648, + "learning_rate": 7.1830406155773946e-06, + "loss": 0.0139, + "step": 3778 + }, + { + "epoch": 1.2619802972115546, + "grad_norm": 0.35403715659010926, + "learning_rate": 7.181292127425928e-06, + "loss": 0.0265, + "step": 3779 + }, + { + "epoch": 1.262314242778427, + "grad_norm": 0.3091743906409628, + "learning_rate": 7.179543309764604e-06, + "loss": 0.0275, + "step": 3780 + }, + { + "epoch": 1.2626481883452998, + "grad_norm": 0.24350518113982092, + "learning_rate": 7.177794162857598e-06, + "loss": 0.0216, + "step": 3781 + }, + { + "epoch": 1.2629821339121723, + "grad_norm": 0.3420262121066958, + "learning_rate": 7.176044686969141e-06, + "loss": 0.0346, + "step": 3782 + }, + { + "epoch": 1.2633160794790448, + "grad_norm": 0.29497645371329917, + "learning_rate": 7.174294882363513e-06, + "loss": 0.0244, + "step": 3783 + }, + { + "epoch": 1.2636500250459175, + "grad_norm": 0.27482921785317416, + "learning_rate": 7.172544749305039e-06, + "loss": 0.0258, + "step": 3784 + }, + { + "epoch": 1.2639839706127902, + "grad_norm": 0.39636662140469486, + "learning_rate": 7.170794288058103e-06, + "loss": 0.0282, + "step": 3785 + }, + { + "epoch": 1.2643179161796627, + "grad_norm": 0.2670025994032695, + "learning_rate": 7.169043498887126e-06, + "loss": 0.0224, + "step": 3786 + }, + { + "epoch": 1.2646518617465352, + "grad_norm": 0.31426460403959644, + "learning_rate": 7.1672923820565925e-06, + "loss": 0.0214, + "step": 3787 + }, + { + "epoch": 1.264985807313408, + "grad_norm": 0.35382114056259417, + "learning_rate": 7.165540937831024e-06, + "loss": 0.0209, + "step": 3788 + }, + { + "epoch": 1.2653197528802806, + "grad_norm": 0.2190706486255055, + "learning_rate": 7.163789166474998e-06, + "loss": 0.0196, + "step": 3789 + }, + { + "epoch": 1.2656536984471531, + "grad_norm": 0.3089953610537669, + "learning_rate": 7.162037068253141e-06, + "loss": 0.0181, + "step": 3790 + }, + { + "epoch": 1.2659876440140256, + "grad_norm": 0.29351890815761744, + "learning_rate": 7.160284643430129e-06, + "loss": 0.0314, + "step": 3791 + }, + { + "epoch": 1.2663215895808984, + "grad_norm": 0.5468606345028866, + "learning_rate": 7.158531892270682e-06, + "loss": 0.0397, + "step": 3792 + }, + { + "epoch": 1.2666555351477709, + "grad_norm": 0.22776825216991417, + "learning_rate": 7.156778815039579e-06, + "loss": 0.0218, + "step": 3793 + }, + { + "epoch": 1.2669894807146436, + "grad_norm": 0.3511226987717307, + "learning_rate": 7.15502541200164e-06, + "loss": 0.0332, + "step": 3794 + }, + { + "epoch": 1.267323426281516, + "grad_norm": 0.2909516765573883, + "learning_rate": 7.153271683421738e-06, + "loss": 0.0229, + "step": 3795 + }, + { + "epoch": 1.2676573718483888, + "grad_norm": 0.3054934354023608, + "learning_rate": 7.151517629564795e-06, + "loss": 0.0344, + "step": 3796 + }, + { + "epoch": 1.2679913174152613, + "grad_norm": 0.25579604698663366, + "learning_rate": 7.14976325069578e-06, + "loss": 0.0239, + "step": 3797 + }, + { + "epoch": 1.268325262982134, + "grad_norm": 0.2818385793611901, + "learning_rate": 7.148008547079713e-06, + "loss": 0.0229, + "step": 3798 + }, + { + "epoch": 1.2686592085490065, + "grad_norm": 0.26293039949095737, + "learning_rate": 7.1462535189816636e-06, + "loss": 0.0227, + "step": 3799 + }, + { + "epoch": 1.2689931541158792, + "grad_norm": 0.2627632990160037, + "learning_rate": 7.14449816666675e-06, + "loss": 0.0223, + "step": 3800 + }, + { + "epoch": 1.2693270996827517, + "grad_norm": 0.2428387518039147, + "learning_rate": 7.142742490400135e-06, + "loss": 0.0191, + "step": 3801 + }, + { + "epoch": 1.2696610452496242, + "grad_norm": 0.34986548303294884, + "learning_rate": 7.140986490447039e-06, + "loss": 0.0307, + "step": 3802 + }, + { + "epoch": 1.269994990816497, + "grad_norm": 0.2687958689660581, + "learning_rate": 7.139230167072724e-06, + "loss": 0.0294, + "step": 3803 + }, + { + "epoch": 1.2703289363833696, + "grad_norm": 0.279248991904363, + "learning_rate": 7.137473520542503e-06, + "loss": 0.0263, + "step": 3804 + }, + { + "epoch": 1.2706628819502421, + "grad_norm": 0.3237279440794913, + "learning_rate": 7.135716551121739e-06, + "loss": 0.0355, + "step": 3805 + }, + { + "epoch": 1.2709968275171146, + "grad_norm": 0.32990174059468896, + "learning_rate": 7.133959259075844e-06, + "loss": 0.0377, + "step": 3806 + }, + { + "epoch": 1.2713307730839873, + "grad_norm": 0.27475097706870694, + "learning_rate": 7.132201644670274e-06, + "loss": 0.0196, + "step": 3807 + }, + { + "epoch": 1.2716647186508598, + "grad_norm": 0.2970159606782559, + "learning_rate": 7.13044370817054e-06, + "loss": 0.0329, + "step": 3808 + }, + { + "epoch": 1.2719986642177326, + "grad_norm": 0.3233728797140094, + "learning_rate": 7.128685449842201e-06, + "loss": 0.0235, + "step": 3809 + }, + { + "epoch": 1.272332609784605, + "grad_norm": 0.29928552659389207, + "learning_rate": 7.1269268699508574e-06, + "loss": 0.0235, + "step": 3810 + }, + { + "epoch": 1.2726665553514778, + "grad_norm": 0.269527821898528, + "learning_rate": 7.1251679687621685e-06, + "loss": 0.0193, + "step": 3811 + }, + { + "epoch": 1.2730005009183503, + "grad_norm": 0.3104286192101173, + "learning_rate": 7.123408746541835e-06, + "loss": 0.0293, + "step": 3812 + }, + { + "epoch": 1.273334446485223, + "grad_norm": 0.36794897135182747, + "learning_rate": 7.1216492035556075e-06, + "loss": 0.0273, + "step": 3813 + }, + { + "epoch": 1.2736683920520955, + "grad_norm": 0.3134344825908921, + "learning_rate": 7.119889340069286e-06, + "loss": 0.0231, + "step": 3814 + }, + { + "epoch": 1.2740023376189682, + "grad_norm": 0.2866279702689287, + "learning_rate": 7.1181291563487175e-06, + "loss": 0.0229, + "step": 3815 + }, + { + "epoch": 1.2743362831858407, + "grad_norm": 0.2963489265558395, + "learning_rate": 7.116368652659802e-06, + "loss": 0.0266, + "step": 3816 + }, + { + "epoch": 1.2746702287527132, + "grad_norm": 0.28477888768155046, + "learning_rate": 7.114607829268481e-06, + "loss": 0.0198, + "step": 3817 + }, + { + "epoch": 1.275004174319586, + "grad_norm": 0.28887194985404846, + "learning_rate": 7.1128466864407486e-06, + "loss": 0.0304, + "step": 3818 + }, + { + "epoch": 1.2753381198864586, + "grad_norm": 0.21621104290989435, + "learning_rate": 7.111085224442647e-06, + "loss": 0.017, + "step": 3819 + }, + { + "epoch": 1.2756720654533311, + "grad_norm": 0.25506144939716724, + "learning_rate": 7.109323443540263e-06, + "loss": 0.0245, + "step": 3820 + }, + { + "epoch": 1.2760060110202036, + "grad_norm": 0.3523064108286381, + "learning_rate": 7.107561343999739e-06, + "loss": 0.0225, + "step": 3821 + }, + { + "epoch": 1.2763399565870763, + "grad_norm": 0.3004006393385708, + "learning_rate": 7.105798926087257e-06, + "loss": 0.0324, + "step": 3822 + }, + { + "epoch": 1.2766739021539488, + "grad_norm": 0.2632550484129978, + "learning_rate": 7.104036190069052e-06, + "loss": 0.0213, + "step": 3823 + }, + { + "epoch": 1.2770078477208215, + "grad_norm": 0.23054401128039914, + "learning_rate": 7.102273136211407e-06, + "loss": 0.0226, + "step": 3824 + }, + { + "epoch": 1.277341793287694, + "grad_norm": 0.2628399430254801, + "learning_rate": 7.10050976478065e-06, + "loss": 0.0259, + "step": 3825 + }, + { + "epoch": 1.2776757388545668, + "grad_norm": 0.3727161911287167, + "learning_rate": 7.098746076043162e-06, + "loss": 0.0285, + "step": 3826 + }, + { + "epoch": 1.2780096844214393, + "grad_norm": 0.26894485749895153, + "learning_rate": 7.096982070265366e-06, + "loss": 0.0235, + "step": 3827 + }, + { + "epoch": 1.278343629988312, + "grad_norm": 0.3045997903242515, + "learning_rate": 7.0952177477137374e-06, + "loss": 0.0251, + "step": 3828 + }, + { + "epoch": 1.2786775755551845, + "grad_norm": 0.2708735775382479, + "learning_rate": 7.093453108654798e-06, + "loss": 0.029, + "step": 3829 + }, + { + "epoch": 1.2790115211220572, + "grad_norm": 0.373792728388833, + "learning_rate": 7.091688153355116e-06, + "loss": 0.0243, + "step": 3830 + }, + { + "epoch": 1.2793454666889297, + "grad_norm": 0.34068259251681265, + "learning_rate": 7.08992288208131e-06, + "loss": 0.0312, + "step": 3831 + }, + { + "epoch": 1.2796794122558022, + "grad_norm": 0.30366043592667635, + "learning_rate": 7.088157295100046e-06, + "loss": 0.0226, + "step": 3832 + }, + { + "epoch": 1.280013357822675, + "grad_norm": 0.2695816296727487, + "learning_rate": 7.0863913926780335e-06, + "loss": 0.0294, + "step": 3833 + }, + { + "epoch": 1.2803473033895476, + "grad_norm": 0.2530947877307193, + "learning_rate": 7.084625175082036e-06, + "loss": 0.0204, + "step": 3834 + }, + { + "epoch": 1.28068124895642, + "grad_norm": 0.3598741848865989, + "learning_rate": 7.082858642578861e-06, + "loss": 0.028, + "step": 3835 + }, + { + "epoch": 1.2810151945232926, + "grad_norm": 0.3821041027680648, + "learning_rate": 7.081091795435361e-06, + "loss": 0.0312, + "step": 3836 + }, + { + "epoch": 1.2813491400901653, + "grad_norm": 0.3892097845716495, + "learning_rate": 7.079324633918443e-06, + "loss": 0.0319, + "step": 3837 + }, + { + "epoch": 1.281683085657038, + "grad_norm": 0.2419078489802551, + "learning_rate": 7.077557158295053e-06, + "loss": 0.0201, + "step": 3838 + }, + { + "epoch": 1.2820170312239105, + "grad_norm": 0.3358163798342825, + "learning_rate": 7.075789368832194e-06, + "loss": 0.0292, + "step": 3839 + }, + { + "epoch": 1.282350976790783, + "grad_norm": 0.44046064172342075, + "learning_rate": 7.074021265796909e-06, + "loss": 0.0272, + "step": 3840 + }, + { + "epoch": 1.2826849223576557, + "grad_norm": 0.29494253778608953, + "learning_rate": 7.072252849456291e-06, + "loss": 0.0253, + "step": 3841 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 0.31714296561014493, + "learning_rate": 7.07048412007748e-06, + "loss": 0.0245, + "step": 3842 + }, + { + "epoch": 1.283352813491401, + "grad_norm": 0.4015211112379901, + "learning_rate": 7.068715077927664e-06, + "loss": 0.0233, + "step": 3843 + }, + { + "epoch": 1.2836867590582735, + "grad_norm": 0.32931183887132787, + "learning_rate": 7.066945723274077e-06, + "loss": 0.035, + "step": 3844 + }, + { + "epoch": 1.2840207046251462, + "grad_norm": 0.2152950381478552, + "learning_rate": 7.065176056383999e-06, + "loss": 0.0185, + "step": 3845 + }, + { + "epoch": 1.2843546501920187, + "grad_norm": 0.3725494017257642, + "learning_rate": 7.063406077524764e-06, + "loss": 0.0264, + "step": 3846 + }, + { + "epoch": 1.2846885957588914, + "grad_norm": 0.3170507120197753, + "learning_rate": 7.061635786963743e-06, + "loss": 0.0194, + "step": 3847 + }, + { + "epoch": 1.2850225413257639, + "grad_norm": 0.335061845849936, + "learning_rate": 7.059865184968362e-06, + "loss": 0.0221, + "step": 3848 + }, + { + "epoch": 1.2853564868926366, + "grad_norm": 0.3142541759000103, + "learning_rate": 7.058094271806091e-06, + "loss": 0.0356, + "step": 3849 + }, + { + "epoch": 1.285690432459509, + "grad_norm": 0.2390745082714666, + "learning_rate": 7.056323047744447e-06, + "loss": 0.0177, + "step": 3850 + }, + { + "epoch": 1.2860243780263816, + "grad_norm": 0.4288888245923048, + "learning_rate": 7.054551513050993e-06, + "loss": 0.0458, + "step": 3851 + }, + { + "epoch": 1.2863583235932543, + "grad_norm": 0.4158711399622571, + "learning_rate": 7.052779667993342e-06, + "loss": 0.0388, + "step": 3852 + }, + { + "epoch": 1.286692269160127, + "grad_norm": 0.3481540210344807, + "learning_rate": 7.051007512839153e-06, + "loss": 0.0276, + "step": 3853 + }, + { + "epoch": 1.2870262147269995, + "grad_norm": 0.4434873906513856, + "learning_rate": 7.0492350478561275e-06, + "loss": 0.0296, + "step": 3854 + }, + { + "epoch": 1.287360160293872, + "grad_norm": 0.40788009074663517, + "learning_rate": 7.04746227331202e-06, + "loss": 0.03, + "step": 3855 + }, + { + "epoch": 1.2876941058607447, + "grad_norm": 0.32346129814996305, + "learning_rate": 7.045689189474628e-06, + "loss": 0.0199, + "step": 3856 + }, + { + "epoch": 1.2880280514276172, + "grad_norm": 0.22659696481141453, + "learning_rate": 7.0439157966117955e-06, + "loss": 0.0196, + "step": 3857 + }, + { + "epoch": 1.28836199699449, + "grad_norm": 0.3076910939405401, + "learning_rate": 7.042142094991418e-06, + "loss": 0.0192, + "step": 3858 + }, + { + "epoch": 1.2886959425613624, + "grad_norm": 0.2412962650349266, + "learning_rate": 7.04036808488143e-06, + "loss": 0.0185, + "step": 3859 + }, + { + "epoch": 1.2890298881282352, + "grad_norm": 0.3110724652350511, + "learning_rate": 7.038593766549817e-06, + "loss": 0.0257, + "step": 3860 + }, + { + "epoch": 1.2893638336951077, + "grad_norm": 0.3382309693823327, + "learning_rate": 7.0368191402646145e-06, + "loss": 0.0315, + "step": 3861 + }, + { + "epoch": 1.2896977792619804, + "grad_norm": 0.38629258045217485, + "learning_rate": 7.035044206293898e-06, + "loss": 0.0277, + "step": 3862 + }, + { + "epoch": 1.2900317248288529, + "grad_norm": 0.3378226150888299, + "learning_rate": 7.0332689649057905e-06, + "loss": 0.0287, + "step": 3863 + }, + { + "epoch": 1.2903656703957256, + "grad_norm": 0.31026600784411296, + "learning_rate": 7.031493416368466e-06, + "loss": 0.0256, + "step": 3864 + }, + { + "epoch": 1.290699615962598, + "grad_norm": 0.42079825484514044, + "learning_rate": 7.029717560950141e-06, + "loss": 0.0389, + "step": 3865 + }, + { + "epoch": 1.2910335615294706, + "grad_norm": 0.30751877180824905, + "learning_rate": 7.027941398919078e-06, + "loss": 0.0233, + "step": 3866 + }, + { + "epoch": 1.2913675070963433, + "grad_norm": 0.3354377400321467, + "learning_rate": 7.0261649305435895e-06, + "loss": 0.0337, + "step": 3867 + }, + { + "epoch": 1.291701452663216, + "grad_norm": 0.3288505264109346, + "learning_rate": 7.02438815609203e-06, + "loss": 0.0265, + "step": 3868 + }, + { + "epoch": 1.2920353982300885, + "grad_norm": 0.4817040107976783, + "learning_rate": 7.022611075832804e-06, + "loss": 0.026, + "step": 3869 + }, + { + "epoch": 1.292369343796961, + "grad_norm": 0.44315485160950524, + "learning_rate": 7.02083369003436e-06, + "loss": 0.0412, + "step": 3870 + }, + { + "epoch": 1.2927032893638337, + "grad_norm": 0.24605793521653016, + "learning_rate": 7.019055998965191e-06, + "loss": 0.0198, + "step": 3871 + }, + { + "epoch": 1.2930372349307062, + "grad_norm": 0.4426945239123617, + "learning_rate": 7.017278002893841e-06, + "loss": 0.0282, + "step": 3872 + }, + { + "epoch": 1.293371180497579, + "grad_norm": 0.2725147428081745, + "learning_rate": 7.015499702088896e-06, + "loss": 0.0251, + "step": 3873 + }, + { + "epoch": 1.2937051260644514, + "grad_norm": 0.2762178036510364, + "learning_rate": 7.013721096818988e-06, + "loss": 0.0214, + "step": 3874 + }, + { + "epoch": 1.2940390716313241, + "grad_norm": 0.3484476255070034, + "learning_rate": 7.011942187352798e-06, + "loss": 0.0291, + "step": 3875 + }, + { + "epoch": 1.2943730171981966, + "grad_norm": 0.33750939212648334, + "learning_rate": 7.010162973959052e-06, + "loss": 0.0283, + "step": 3876 + }, + { + "epoch": 1.2947069627650694, + "grad_norm": 0.3273685064522598, + "learning_rate": 7.008383456906518e-06, + "loss": 0.0266, + "step": 3877 + }, + { + "epoch": 1.2950409083319419, + "grad_norm": 0.32685118427331566, + "learning_rate": 7.0066036364640165e-06, + "loss": 0.025, + "step": 3878 + }, + { + "epoch": 1.2953748538988146, + "grad_norm": 0.33583083029365574, + "learning_rate": 7.004823512900408e-06, + "loss": 0.0274, + "step": 3879 + }, + { + "epoch": 1.295708799465687, + "grad_norm": 0.36445779700535097, + "learning_rate": 7.003043086484602e-06, + "loss": 0.0306, + "step": 3880 + }, + { + "epoch": 1.2960427450325596, + "grad_norm": 0.2662035057674562, + "learning_rate": 7.001262357485553e-06, + "loss": 0.0227, + "step": 3881 + }, + { + "epoch": 1.2963766905994323, + "grad_norm": 0.34726810137953923, + "learning_rate": 6.99948132617226e-06, + "loss": 0.0353, + "step": 3882 + }, + { + "epoch": 1.296710636166305, + "grad_norm": 0.3300358273093104, + "learning_rate": 6.99769999281377e-06, + "loss": 0.0334, + "step": 3883 + }, + { + "epoch": 1.2970445817331775, + "grad_norm": 0.2953831654724505, + "learning_rate": 6.9959183576791745e-06, + "loss": 0.0208, + "step": 3884 + }, + { + "epoch": 1.29737852730005, + "grad_norm": 0.33514086178239627, + "learning_rate": 6.9941364210376095e-06, + "loss": 0.0212, + "step": 3885 + }, + { + "epoch": 1.2977124728669227, + "grad_norm": 0.34895797860840005, + "learning_rate": 6.992354183158258e-06, + "loss": 0.0242, + "step": 3886 + }, + { + "epoch": 1.2980464184337954, + "grad_norm": 0.3422201671917834, + "learning_rate": 6.9905716443103475e-06, + "loss": 0.0281, + "step": 3887 + }, + { + "epoch": 1.298380364000668, + "grad_norm": 0.38144608640187583, + "learning_rate": 6.9887888047631525e-06, + "loss": 0.0351, + "step": 3888 + }, + { + "epoch": 1.2987143095675404, + "grad_norm": 0.37988626182907137, + "learning_rate": 6.987005664785991e-06, + "loss": 0.0281, + "step": 3889 + }, + { + "epoch": 1.2990482551344131, + "grad_norm": 0.37015257657339745, + "learning_rate": 6.985222224648227e-06, + "loss": 0.0284, + "step": 3890 + }, + { + "epoch": 1.2993822007012856, + "grad_norm": 0.3035372110152349, + "learning_rate": 6.983438484619272e-06, + "loss": 0.0196, + "step": 3891 + }, + { + "epoch": 1.2997161462681583, + "grad_norm": 0.25495387431951255, + "learning_rate": 6.981654444968578e-06, + "loss": 0.0248, + "step": 3892 + }, + { + "epoch": 1.3000500918350308, + "grad_norm": 0.33620484416742547, + "learning_rate": 6.979870105965648e-06, + "loss": 0.0291, + "step": 3893 + }, + { + "epoch": 1.3003840374019036, + "grad_norm": 0.39856282078247696, + "learning_rate": 6.978085467880027e-06, + "loss": 0.0294, + "step": 3894 + }, + { + "epoch": 1.300717982968776, + "grad_norm": 0.40499896354271187, + "learning_rate": 6.9763005309813025e-06, + "loss": 0.0476, + "step": 3895 + }, + { + "epoch": 1.3010519285356488, + "grad_norm": 0.32998626585092516, + "learning_rate": 6.974515295539115e-06, + "loss": 0.0262, + "step": 3896 + }, + { + "epoch": 1.3013858741025213, + "grad_norm": 0.3444307969984011, + "learning_rate": 6.9727297618231416e-06, + "loss": 0.0296, + "step": 3897 + }, + { + "epoch": 1.301719819669394, + "grad_norm": 0.3279836003629136, + "learning_rate": 6.970943930103109e-06, + "loss": 0.0244, + "step": 3898 + }, + { + "epoch": 1.3020537652362665, + "grad_norm": 0.2944925667220935, + "learning_rate": 6.96915780064879e-06, + "loss": 0.0244, + "step": 3899 + }, + { + "epoch": 1.302387710803139, + "grad_norm": 0.3948240601667736, + "learning_rate": 6.96737137373e-06, + "loss": 0.0317, + "step": 3900 + }, + { + "epoch": 1.3027216563700117, + "grad_norm": 0.3032366799397449, + "learning_rate": 6.965584649616597e-06, + "loss": 0.0302, + "step": 3901 + }, + { + "epoch": 1.3030556019368844, + "grad_norm": 0.38122992390738114, + "learning_rate": 6.963797628578489e-06, + "loss": 0.032, + "step": 3902 + }, + { + "epoch": 1.303389547503757, + "grad_norm": 0.2678076758754484, + "learning_rate": 6.962010310885627e-06, + "loss": 0.0221, + "step": 3903 + }, + { + "epoch": 1.3037234930706294, + "grad_norm": 0.3214451468010525, + "learning_rate": 6.960222696808004e-06, + "loss": 0.0356, + "step": 3904 + }, + { + "epoch": 1.3040574386375021, + "grad_norm": 0.3518423311397084, + "learning_rate": 6.958434786615663e-06, + "loss": 0.0298, + "step": 3905 + }, + { + "epoch": 1.3043913842043746, + "grad_norm": 0.21057233219584512, + "learning_rate": 6.956646580578687e-06, + "loss": 0.021, + "step": 3906 + }, + { + "epoch": 1.3047253297712473, + "grad_norm": 0.5161034386935823, + "learning_rate": 6.954858078967207e-06, + "loss": 0.0426, + "step": 3907 + }, + { + "epoch": 1.3050592753381198, + "grad_norm": 0.26331783533453623, + "learning_rate": 6.953069282051397e-06, + "loss": 0.0232, + "step": 3908 + }, + { + "epoch": 1.3053932209049925, + "grad_norm": 0.23474646321070586, + "learning_rate": 6.951280190101475e-06, + "loss": 0.0212, + "step": 3909 + }, + { + "epoch": 1.305727166471865, + "grad_norm": 0.42496950645456616, + "learning_rate": 6.949490803387704e-06, + "loss": 0.0337, + "step": 3910 + }, + { + "epoch": 1.3060611120387378, + "grad_norm": 0.29339700000989377, + "learning_rate": 6.9477011221803935e-06, + "loss": 0.0309, + "step": 3911 + }, + { + "epoch": 1.3063950576056103, + "grad_norm": 0.6979893125456561, + "learning_rate": 6.945911146749894e-06, + "loss": 0.0268, + "step": 3912 + }, + { + "epoch": 1.306729003172483, + "grad_norm": 0.3445728422912598, + "learning_rate": 6.944120877366605e-06, + "loss": 0.0283, + "step": 3913 + }, + { + "epoch": 1.3070629487393555, + "grad_norm": 0.5227605014625762, + "learning_rate": 6.9423303143009644e-06, + "loss": 0.0412, + "step": 3914 + }, + { + "epoch": 1.307396894306228, + "grad_norm": 0.34993513734593257, + "learning_rate": 6.940539457823459e-06, + "loss": 0.0267, + "step": 3915 + }, + { + "epoch": 1.3077308398731007, + "grad_norm": 0.3492814502001204, + "learning_rate": 6.938748308204622e-06, + "loss": 0.0319, + "step": 3916 + }, + { + "epoch": 1.3080647854399734, + "grad_norm": 0.3836991897907742, + "learning_rate": 6.936956865715024e-06, + "loss": 0.0262, + "step": 3917 + }, + { + "epoch": 1.308398731006846, + "grad_norm": 0.4155299096434416, + "learning_rate": 6.9351651306252836e-06, + "loss": 0.0336, + "step": 3918 + }, + { + "epoch": 1.3087326765737184, + "grad_norm": 0.24702361474860343, + "learning_rate": 6.933373103206064e-06, + "loss": 0.0269, + "step": 3919 + }, + { + "epoch": 1.309066622140591, + "grad_norm": 0.39994215569764924, + "learning_rate": 6.931580783728075e-06, + "loss": 0.0357, + "step": 3920 + }, + { + "epoch": 1.3094005677074636, + "grad_norm": 0.27883276101757154, + "learning_rate": 6.929788172462063e-06, + "loss": 0.0281, + "step": 3921 + }, + { + "epoch": 1.3097345132743363, + "grad_norm": 0.28887807961612816, + "learning_rate": 6.927995269678826e-06, + "loss": 0.0273, + "step": 3922 + }, + { + "epoch": 1.3100684588412088, + "grad_norm": 0.4386138703907347, + "learning_rate": 6.926202075649202e-06, + "loss": 0.0467, + "step": 3923 + }, + { + "epoch": 1.3104024044080815, + "grad_norm": 0.40678813429125205, + "learning_rate": 6.924408590644073e-06, + "loss": 0.0328, + "step": 3924 + }, + { + "epoch": 1.310736349974954, + "grad_norm": 0.3913932790283656, + "learning_rate": 6.922614814934367e-06, + "loss": 0.0242, + "step": 3925 + }, + { + "epoch": 1.3110702955418267, + "grad_norm": 0.38960844261127175, + "learning_rate": 6.920820748791057e-06, + "loss": 0.0325, + "step": 3926 + }, + { + "epoch": 1.3114042411086992, + "grad_norm": 0.29252956824499116, + "learning_rate": 6.919026392485154e-06, + "loss": 0.0261, + "step": 3927 + }, + { + "epoch": 1.311738186675572, + "grad_norm": 0.3039028024201547, + "learning_rate": 6.91723174628772e-06, + "loss": 0.0267, + "step": 3928 + }, + { + "epoch": 1.3120721322424445, + "grad_norm": 0.38398838331243357, + "learning_rate": 6.915436810469856e-06, + "loss": 0.0253, + "step": 3929 + }, + { + "epoch": 1.312406077809317, + "grad_norm": 0.24845265321342241, + "learning_rate": 6.913641585302708e-06, + "loss": 0.0181, + "step": 3930 + }, + { + "epoch": 1.3127400233761897, + "grad_norm": 0.2506277610515143, + "learning_rate": 6.9118460710574665e-06, + "loss": 0.0245, + "step": 3931 + }, + { + "epoch": 1.3130739689430624, + "grad_norm": 0.32113820929456094, + "learning_rate": 6.910050268005364e-06, + "loss": 0.0363, + "step": 3932 + }, + { + "epoch": 1.3134079145099349, + "grad_norm": 0.3677484606300255, + "learning_rate": 6.908254176417679e-06, + "loss": 0.0328, + "step": 3933 + }, + { + "epoch": 1.3137418600768074, + "grad_norm": 0.27919780414072687, + "learning_rate": 6.906457796565732e-06, + "loss": 0.0186, + "step": 3934 + }, + { + "epoch": 1.31407580564368, + "grad_norm": 0.3400969435978823, + "learning_rate": 6.904661128720887e-06, + "loss": 0.027, + "step": 3935 + }, + { + "epoch": 1.3144097512105528, + "grad_norm": 0.3043884951884974, + "learning_rate": 6.902864173154551e-06, + "loss": 0.0263, + "step": 3936 + }, + { + "epoch": 1.3147436967774253, + "grad_norm": 0.293373219215153, + "learning_rate": 6.9010669301381765e-06, + "loss": 0.0289, + "step": 3937 + }, + { + "epoch": 1.3150776423442978, + "grad_norm": 0.39515451698408716, + "learning_rate": 6.899269399943258e-06, + "loss": 0.0413, + "step": 3938 + }, + { + "epoch": 1.3154115879111705, + "grad_norm": 0.3434005867420467, + "learning_rate": 6.897471582841333e-06, + "loss": 0.0264, + "step": 3939 + }, + { + "epoch": 1.315745533478043, + "grad_norm": 0.36105915879525285, + "learning_rate": 6.895673479103983e-06, + "loss": 0.025, + "step": 3940 + }, + { + "epoch": 1.3160794790449157, + "grad_norm": 0.39452187658778015, + "learning_rate": 6.893875089002835e-06, + "loss": 0.0372, + "step": 3941 + }, + { + "epoch": 1.3164134246117882, + "grad_norm": 0.2761487674620673, + "learning_rate": 6.892076412809553e-06, + "loss": 0.0242, + "step": 3942 + }, + { + "epoch": 1.316747370178661, + "grad_norm": 0.3253838307421517, + "learning_rate": 6.890277450795851e-06, + "loss": 0.0296, + "step": 3943 + }, + { + "epoch": 1.3170813157455334, + "grad_norm": 0.30682564746593466, + "learning_rate": 6.888478203233484e-06, + "loss": 0.0207, + "step": 3944 + }, + { + "epoch": 1.3174152613124062, + "grad_norm": 0.25463543711485626, + "learning_rate": 6.886678670394247e-06, + "loss": 0.0231, + "step": 3945 + }, + { + "epoch": 1.3177492068792787, + "grad_norm": 0.28710601370474476, + "learning_rate": 6.884878852549982e-06, + "loss": 0.0228, + "step": 3946 + }, + { + "epoch": 1.3180831524461514, + "grad_norm": 0.42826472453397146, + "learning_rate": 6.883078749972573e-06, + "loss": 0.0312, + "step": 3947 + }, + { + "epoch": 1.3184170980130239, + "grad_norm": 0.3229766959914937, + "learning_rate": 6.881278362933947e-06, + "loss": 0.0294, + "step": 3948 + }, + { + "epoch": 1.3187510435798964, + "grad_norm": 0.2917459954299232, + "learning_rate": 6.879477691706071e-06, + "loss": 0.0241, + "step": 3949 + }, + { + "epoch": 1.319084989146769, + "grad_norm": 0.36512101323717394, + "learning_rate": 6.877676736560961e-06, + "loss": 0.0316, + "step": 3950 + }, + { + "epoch": 1.3194189347136418, + "grad_norm": 0.45980782785937463, + "learning_rate": 6.87587549777067e-06, + "loss": 0.0462, + "step": 3951 + }, + { + "epoch": 1.3197528802805143, + "grad_norm": 0.3148451226110855, + "learning_rate": 6.874073975607298e-06, + "loss": 0.0287, + "step": 3952 + }, + { + "epoch": 1.3200868258473868, + "grad_norm": 0.3383131679801026, + "learning_rate": 6.872272170342985e-06, + "loss": 0.0273, + "step": 3953 + }, + { + "epoch": 1.3204207714142595, + "grad_norm": 0.5932896335075106, + "learning_rate": 6.870470082249917e-06, + "loss": 0.039, + "step": 3954 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.2994717729086685, + "learning_rate": 6.868667711600318e-06, + "loss": 0.0227, + "step": 3955 + }, + { + "epoch": 1.3210886625480047, + "grad_norm": 0.3657604982283384, + "learning_rate": 6.866865058666459e-06, + "loss": 0.0422, + "step": 3956 + }, + { + "epoch": 1.3214226081148772, + "grad_norm": 0.293653773705607, + "learning_rate": 6.86506212372065e-06, + "loss": 0.03, + "step": 3957 + }, + { + "epoch": 1.32175655368175, + "grad_norm": 0.3430768386734613, + "learning_rate": 6.863258907035246e-06, + "loss": 0.0233, + "step": 3958 + }, + { + "epoch": 1.3220904992486224, + "grad_norm": 0.350082700433093, + "learning_rate": 6.861455408882647e-06, + "loss": 0.039, + "step": 3959 + }, + { + "epoch": 1.3224244448154951, + "grad_norm": 0.6032409866829661, + "learning_rate": 6.85965162953529e-06, + "loss": 0.026, + "step": 3960 + }, + { + "epoch": 1.3227583903823676, + "grad_norm": 0.2505725461489505, + "learning_rate": 6.857847569265657e-06, + "loss": 0.0198, + "step": 3961 + }, + { + "epoch": 1.3230923359492404, + "grad_norm": 0.31158792659610374, + "learning_rate": 6.8560432283462745e-06, + "loss": 0.0252, + "step": 3962 + }, + { + "epoch": 1.3234262815161129, + "grad_norm": 0.3607329340646706, + "learning_rate": 6.854238607049707e-06, + "loss": 0.0341, + "step": 3963 + }, + { + "epoch": 1.3237602270829854, + "grad_norm": 0.27647703602120005, + "learning_rate": 6.852433705648566e-06, + "loss": 0.0256, + "step": 3964 + }, + { + "epoch": 1.324094172649858, + "grad_norm": 0.4046546985600388, + "learning_rate": 6.8506285244155e-06, + "loss": 0.0257, + "step": 3965 + }, + { + "epoch": 1.3244281182167308, + "grad_norm": 0.3514736909950181, + "learning_rate": 6.848823063623207e-06, + "loss": 0.0337, + "step": 3966 + }, + { + "epoch": 1.3247620637836033, + "grad_norm": 0.35150070137713213, + "learning_rate": 6.84701732354442e-06, + "loss": 0.0356, + "step": 3967 + }, + { + "epoch": 1.3250960093504758, + "grad_norm": 0.28099789366834593, + "learning_rate": 6.845211304451919e-06, + "loss": 0.0231, + "step": 3968 + }, + { + "epoch": 1.3254299549173485, + "grad_norm": 0.3049624975688236, + "learning_rate": 6.843405006618523e-06, + "loss": 0.0225, + "step": 3969 + }, + { + "epoch": 1.325763900484221, + "grad_norm": 0.23477450061474828, + "learning_rate": 6.841598430317096e-06, + "loss": 0.0226, + "step": 3970 + }, + { + "epoch": 1.3260978460510937, + "grad_norm": 0.3179366600343478, + "learning_rate": 6.839791575820541e-06, + "loss": 0.0189, + "step": 3971 + }, + { + "epoch": 1.3264317916179662, + "grad_norm": 0.3096965137248418, + "learning_rate": 6.837984443401807e-06, + "loss": 0.029, + "step": 3972 + }, + { + "epoch": 1.326765737184839, + "grad_norm": 0.3348005675592078, + "learning_rate": 6.836177033333882e-06, + "loss": 0.0236, + "step": 3973 + }, + { + "epoch": 1.3270996827517114, + "grad_norm": 0.2841523201390299, + "learning_rate": 6.834369345889793e-06, + "loss": 0.0257, + "step": 3974 + }, + { + "epoch": 1.3274336283185841, + "grad_norm": 0.29332746682343497, + "learning_rate": 6.832561381342617e-06, + "loss": 0.0335, + "step": 3975 + }, + { + "epoch": 1.3277675738854566, + "grad_norm": 0.246911495207825, + "learning_rate": 6.830753139965467e-06, + "loss": 0.0223, + "step": 3976 + }, + { + "epoch": 1.3281015194523293, + "grad_norm": 0.30704740336809216, + "learning_rate": 6.828944622031497e-06, + "loss": 0.0296, + "step": 3977 + }, + { + "epoch": 1.3284354650192018, + "grad_norm": 0.365974937372308, + "learning_rate": 6.827135827813909e-06, + "loss": 0.0288, + "step": 3978 + }, + { + "epoch": 1.3287694105860743, + "grad_norm": 0.3236335047280812, + "learning_rate": 6.825326757585939e-06, + "loss": 0.0316, + "step": 3979 + }, + { + "epoch": 1.329103356152947, + "grad_norm": 0.2800451560116284, + "learning_rate": 6.823517411620871e-06, + "loss": 0.0199, + "step": 3980 + }, + { + "epoch": 1.3294373017198198, + "grad_norm": 0.2877918505125145, + "learning_rate": 6.821707790192025e-06, + "loss": 0.0255, + "step": 3981 + }, + { + "epoch": 1.3297712472866923, + "grad_norm": 0.3273448367309883, + "learning_rate": 6.819897893572769e-06, + "loss": 0.025, + "step": 3982 + }, + { + "epoch": 1.3301051928535648, + "grad_norm": 0.2400797163629544, + "learning_rate": 6.818087722036507e-06, + "loss": 0.0233, + "step": 3983 + }, + { + "epoch": 1.3304391384204375, + "grad_norm": 0.3002257737199839, + "learning_rate": 6.8162772758566875e-06, + "loss": 0.0221, + "step": 3984 + }, + { + "epoch": 1.3307730839873102, + "grad_norm": 0.42312864191195376, + "learning_rate": 6.8144665553067975e-06, + "loss": 0.0386, + "step": 3985 + }, + { + "epoch": 1.3311070295541827, + "grad_norm": 0.234612610844101, + "learning_rate": 6.812655560660373e-06, + "loss": 0.0249, + "step": 3986 + }, + { + "epoch": 1.3314409751210552, + "grad_norm": 0.21714699849900038, + "learning_rate": 6.810844292190982e-06, + "loss": 0.0214, + "step": 3987 + }, + { + "epoch": 1.331774920687928, + "grad_norm": 0.36538631652745873, + "learning_rate": 6.809032750172236e-06, + "loss": 0.042, + "step": 3988 + }, + { + "epoch": 1.3321088662548004, + "grad_norm": 0.32197788602142097, + "learning_rate": 6.807220934877794e-06, + "loss": 0.0272, + "step": 3989 + }, + { + "epoch": 1.3324428118216731, + "grad_norm": 0.2526354456491122, + "learning_rate": 6.80540884658135e-06, + "loss": 0.0234, + "step": 3990 + }, + { + "epoch": 1.3327767573885456, + "grad_norm": 0.5313469904936909, + "learning_rate": 6.803596485556643e-06, + "loss": 0.0362, + "step": 3991 + }, + { + "epoch": 1.3331107029554183, + "grad_norm": 0.3438581739382402, + "learning_rate": 6.8017838520774494e-06, + "loss": 0.0213, + "step": 3992 + }, + { + "epoch": 1.3334446485222908, + "grad_norm": 0.36752865327424455, + "learning_rate": 6.79997094641759e-06, + "loss": 0.0298, + "step": 3993 + }, + { + "epoch": 1.3337785940891635, + "grad_norm": 0.5564283943026228, + "learning_rate": 6.798157768850924e-06, + "loss": 0.0283, + "step": 3994 + }, + { + "epoch": 1.334112539656036, + "grad_norm": 0.28383456837215054, + "learning_rate": 6.796344319651356e-06, + "loss": 0.018, + "step": 3995 + }, + { + "epoch": 1.3344464852229088, + "grad_norm": 1.0556049547755992, + "learning_rate": 6.794530599092826e-06, + "loss": 0.0376, + "step": 3996 + }, + { + "epoch": 1.3347804307897813, + "grad_norm": 0.4417443049678084, + "learning_rate": 6.792716607449319e-06, + "loss": 0.0303, + "step": 3997 + }, + { + "epoch": 1.3351143763566538, + "grad_norm": 0.6345478217588778, + "learning_rate": 6.790902344994861e-06, + "loss": 0.0257, + "step": 3998 + }, + { + "epoch": 1.3354483219235265, + "grad_norm": 0.3070025343946019, + "learning_rate": 6.789087812003516e-06, + "loss": 0.025, + "step": 3999 + }, + { + "epoch": 1.3357822674903992, + "grad_norm": 0.24306447100088768, + "learning_rate": 6.787273008749391e-06, + "loss": 0.0231, + "step": 4000 + }, + { + "epoch": 1.3361162130572717, + "grad_norm": 0.4126685742015921, + "learning_rate": 6.785457935506634e-06, + "loss": 0.0345, + "step": 4001 + }, + { + "epoch": 1.3364501586241442, + "grad_norm": 0.2969746284841204, + "learning_rate": 6.783642592549433e-06, + "loss": 0.0254, + "step": 4002 + }, + { + "epoch": 1.336784104191017, + "grad_norm": 0.38972185979025803, + "learning_rate": 6.781826980152015e-06, + "loss": 0.0262, + "step": 4003 + }, + { + "epoch": 1.3371180497578894, + "grad_norm": 0.2857683016397726, + "learning_rate": 6.780011098588654e-06, + "loss": 0.0289, + "step": 4004 + }, + { + "epoch": 1.337451995324762, + "grad_norm": 0.23426567056860534, + "learning_rate": 6.778194948133656e-06, + "loss": 0.0179, + "step": 4005 + }, + { + "epoch": 1.3377859408916346, + "grad_norm": 0.2651808306864631, + "learning_rate": 6.776378529061374e-06, + "loss": 0.0246, + "step": 4006 + }, + { + "epoch": 1.3381198864585073, + "grad_norm": 0.2885585517738164, + "learning_rate": 6.774561841646199e-06, + "loss": 0.0216, + "step": 4007 + }, + { + "epoch": 1.3384538320253798, + "grad_norm": 0.46368001528475267, + "learning_rate": 6.772744886162563e-06, + "loss": 0.0278, + "step": 4008 + }, + { + "epoch": 1.3387877775922525, + "grad_norm": 0.33113431384047287, + "learning_rate": 6.770927662884937e-06, + "loss": 0.034, + "step": 4009 + }, + { + "epoch": 1.339121723159125, + "grad_norm": 0.27994058941053224, + "learning_rate": 6.769110172087838e-06, + "loss": 0.0262, + "step": 4010 + }, + { + "epoch": 1.3394556687259978, + "grad_norm": 0.24977503041778243, + "learning_rate": 6.767292414045816e-06, + "loss": 0.023, + "step": 4011 + }, + { + "epoch": 1.3397896142928702, + "grad_norm": 0.9873981358898284, + "learning_rate": 6.765474389033464e-06, + "loss": 0.0331, + "step": 4012 + }, + { + "epoch": 1.3401235598597427, + "grad_norm": 0.4491882980158148, + "learning_rate": 6.7636560973254195e-06, + "loss": 0.0275, + "step": 4013 + }, + { + "epoch": 1.3404575054266155, + "grad_norm": 0.345020742446994, + "learning_rate": 6.761837539196355e-06, + "loss": 0.0252, + "step": 4014 + }, + { + "epoch": 1.3407914509934882, + "grad_norm": 0.39817735840628704, + "learning_rate": 6.760018714920985e-06, + "loss": 0.0305, + "step": 4015 + }, + { + "epoch": 1.3411253965603607, + "grad_norm": 0.3551586935083602, + "learning_rate": 6.758199624774065e-06, + "loss": 0.026, + "step": 4016 + }, + { + "epoch": 1.3414593421272332, + "grad_norm": 0.32117913628735567, + "learning_rate": 6.7563802690303895e-06, + "loss": 0.0289, + "step": 4017 + }, + { + "epoch": 1.3417932876941059, + "grad_norm": 0.3473114328174935, + "learning_rate": 6.7545606479647915e-06, + "loss": 0.0266, + "step": 4018 + }, + { + "epoch": 1.3421272332609784, + "grad_norm": 0.2941592745160124, + "learning_rate": 6.752740761852151e-06, + "loss": 0.0273, + "step": 4019 + }, + { + "epoch": 1.342461178827851, + "grad_norm": 0.2774360939047979, + "learning_rate": 6.7509206109673794e-06, + "loss": 0.0249, + "step": 4020 + }, + { + "epoch": 1.3427951243947236, + "grad_norm": 0.32894801950482583, + "learning_rate": 6.749100195585433e-06, + "loss": 0.0259, + "step": 4021 + }, + { + "epoch": 1.3431290699615963, + "grad_norm": 0.40103198680095004, + "learning_rate": 6.747279515981307e-06, + "loss": 0.049, + "step": 4022 + }, + { + "epoch": 1.3434630155284688, + "grad_norm": 0.3833407081992172, + "learning_rate": 6.745458572430038e-06, + "loss": 0.0289, + "step": 4023 + }, + { + "epoch": 1.3437969610953415, + "grad_norm": 0.26911043319382494, + "learning_rate": 6.743637365206698e-06, + "loss": 0.0226, + "step": 4024 + }, + { + "epoch": 1.344130906662214, + "grad_norm": 0.3224970861528086, + "learning_rate": 6.741815894586404e-06, + "loss": 0.0332, + "step": 4025 + }, + { + "epoch": 1.3444648522290867, + "grad_norm": 0.4219560992033902, + "learning_rate": 6.7399941608443096e-06, + "loss": 0.0199, + "step": 4026 + }, + { + "epoch": 1.3447987977959592, + "grad_norm": 0.2658775478467294, + "learning_rate": 6.7381721642556095e-06, + "loss": 0.0217, + "step": 4027 + }, + { + "epoch": 1.3451327433628317, + "grad_norm": 0.3586867143621259, + "learning_rate": 6.736349905095538e-06, + "loss": 0.0264, + "step": 4028 + }, + { + "epoch": 1.3454666889297044, + "grad_norm": 0.2978065766752167, + "learning_rate": 6.734527383639369e-06, + "loss": 0.027, + "step": 4029 + }, + { + "epoch": 1.3458006344965772, + "grad_norm": 0.25314422291377997, + "learning_rate": 6.732704600162414e-06, + "loss": 0.0269, + "step": 4030 + }, + { + "epoch": 1.3461345800634497, + "grad_norm": 0.2973028453678317, + "learning_rate": 6.730881554940029e-06, + "loss": 0.0275, + "step": 4031 + }, + { + "epoch": 1.3464685256303222, + "grad_norm": 0.32619727280955996, + "learning_rate": 6.729058248247602e-06, + "loss": 0.0346, + "step": 4032 + }, + { + "epoch": 1.3468024711971949, + "grad_norm": 0.3434849845380133, + "learning_rate": 6.727234680360569e-06, + "loss": 0.0361, + "step": 4033 + }, + { + "epoch": 1.3471364167640676, + "grad_norm": 0.22024543274433755, + "learning_rate": 6.725410851554401e-06, + "loss": 0.0227, + "step": 4034 + }, + { + "epoch": 1.34747036233094, + "grad_norm": 0.4967844811902783, + "learning_rate": 6.7235867621046055e-06, + "loss": 0.0297, + "step": 4035 + }, + { + "epoch": 1.3478043078978126, + "grad_norm": 0.310306987070182, + "learning_rate": 6.721762412286738e-06, + "loss": 0.0293, + "step": 4036 + }, + { + "epoch": 1.3481382534646853, + "grad_norm": 0.3388201384862846, + "learning_rate": 6.719937802376383e-06, + "loss": 0.0256, + "step": 4037 + }, + { + "epoch": 1.3484721990315578, + "grad_norm": 0.270161787617815, + "learning_rate": 6.718112932649171e-06, + "loss": 0.0246, + "step": 4038 + }, + { + "epoch": 1.3488061445984305, + "grad_norm": 0.3748167636011021, + "learning_rate": 6.716287803380771e-06, + "loss": 0.0252, + "step": 4039 + }, + { + "epoch": 1.349140090165303, + "grad_norm": 0.35972495101420165, + "learning_rate": 6.714462414846891e-06, + "loss": 0.0346, + "step": 4040 + }, + { + "epoch": 1.3494740357321757, + "grad_norm": 0.42476931990784994, + "learning_rate": 6.712636767323273e-06, + "loss": 0.0249, + "step": 4041 + }, + { + "epoch": 1.3498079812990482, + "grad_norm": 0.3103294509936811, + "learning_rate": 6.710810861085708e-06, + "loss": 0.0257, + "step": 4042 + }, + { + "epoch": 1.3501419268659207, + "grad_norm": 0.3012245512907342, + "learning_rate": 6.708984696410018e-06, + "loss": 0.0285, + "step": 4043 + }, + { + "epoch": 1.3504758724327934, + "grad_norm": 0.21337508707349964, + "learning_rate": 6.707158273572066e-06, + "loss": 0.0189, + "step": 4044 + }, + { + "epoch": 1.3508098179996662, + "grad_norm": 0.346151871120594, + "learning_rate": 6.7053315928477566e-06, + "loss": 0.0276, + "step": 4045 + }, + { + "epoch": 1.3511437635665386, + "grad_norm": 0.29626553431200836, + "learning_rate": 6.703504654513031e-06, + "loss": 0.0287, + "step": 4046 + }, + { + "epoch": 1.3514777091334111, + "grad_norm": 0.2876935778489747, + "learning_rate": 6.701677458843868e-06, + "loss": 0.0219, + "step": 4047 + }, + { + "epoch": 1.3518116547002839, + "grad_norm": 0.29341604070807464, + "learning_rate": 6.6998500061162884e-06, + "loss": 0.0245, + "step": 4048 + }, + { + "epoch": 1.3521456002671566, + "grad_norm": 0.418776001046412, + "learning_rate": 6.6980222966063516e-06, + "loss": 0.0193, + "step": 4049 + }, + { + "epoch": 1.352479545834029, + "grad_norm": 0.26535454991719315, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.0259, + "step": 4050 + }, + { + "epoch": 1.3528134914009016, + "grad_norm": 0.5962298072412725, + "learning_rate": 6.694366108343827e-06, + "loss": 0.049, + "step": 4051 + }, + { + "epoch": 1.3531474369677743, + "grad_norm": 0.4392971826647376, + "learning_rate": 6.692537630143551e-06, + "loss": 0.0308, + "step": 4052 + }, + { + "epoch": 1.3534813825346468, + "grad_norm": 0.32724956884853096, + "learning_rate": 6.6907088962655375e-06, + "loss": 0.0323, + "step": 4053 + }, + { + "epoch": 1.3538153281015195, + "grad_norm": 0.3628099638430513, + "learning_rate": 6.688879906986036e-06, + "loss": 0.0284, + "step": 4054 + }, + { + "epoch": 1.354149273668392, + "grad_norm": 0.46593214083179785, + "learning_rate": 6.687050662581341e-06, + "loss": 0.0303, + "step": 4055 + }, + { + "epoch": 1.3544832192352647, + "grad_norm": 0.3224708133821626, + "learning_rate": 6.685221163327778e-06, + "loss": 0.0273, + "step": 4056 + }, + { + "epoch": 1.3548171648021372, + "grad_norm": 0.34515194238754754, + "learning_rate": 6.683391409501715e-06, + "loss": 0.0356, + "step": 4057 + }, + { + "epoch": 1.35515111036901, + "grad_norm": 0.46353458831364824, + "learning_rate": 6.6815614013795595e-06, + "loss": 0.0556, + "step": 4058 + }, + { + "epoch": 1.3554850559358824, + "grad_norm": 0.40208487931458303, + "learning_rate": 6.679731139237753e-06, + "loss": 0.0275, + "step": 4059 + }, + { + "epoch": 1.3558190015027551, + "grad_norm": 0.31487963630422133, + "learning_rate": 6.67790062335278e-06, + "loss": 0.0277, + "step": 4060 + }, + { + "epoch": 1.3561529470696276, + "grad_norm": 0.2344332093466108, + "learning_rate": 6.676069854001162e-06, + "loss": 0.0213, + "step": 4061 + }, + { + "epoch": 1.3564868926365001, + "grad_norm": 0.2652788846659976, + "learning_rate": 6.674238831459456e-06, + "loss": 0.033, + "step": 4062 + }, + { + "epoch": 1.3568208382033728, + "grad_norm": 0.3535027844673089, + "learning_rate": 6.672407556004262e-06, + "loss": 0.0308, + "step": 4063 + }, + { + "epoch": 1.3571547837702456, + "grad_norm": 0.3466652983065082, + "learning_rate": 6.670576027912215e-06, + "loss": 0.0246, + "step": 4064 + }, + { + "epoch": 1.357488729337118, + "grad_norm": 0.2872669596381113, + "learning_rate": 6.668744247459988e-06, + "loss": 0.0282, + "step": 4065 + }, + { + "epoch": 1.3578226749039906, + "grad_norm": 0.3195019261678072, + "learning_rate": 6.666912214924295e-06, + "loss": 0.0291, + "step": 4066 + }, + { + "epoch": 1.3581566204708633, + "grad_norm": 0.25006459208573967, + "learning_rate": 6.665079930581883e-06, + "loss": 0.0243, + "step": 4067 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.2879867660869676, + "learning_rate": 6.663247394709542e-06, + "loss": 0.0285, + "step": 4068 + }, + { + "epoch": 1.3588245116046085, + "grad_norm": 0.3299351416990284, + "learning_rate": 6.661414607584099e-06, + "loss": 0.0406, + "step": 4069 + }, + { + "epoch": 1.359158457171481, + "grad_norm": 0.24675535742236313, + "learning_rate": 6.659581569482415e-06, + "loss": 0.0211, + "step": 4070 + }, + { + "epoch": 1.3594924027383537, + "grad_norm": 0.25695780423165604, + "learning_rate": 6.657748280681395e-06, + "loss": 0.0273, + "step": 4071 + }, + { + "epoch": 1.3598263483052262, + "grad_norm": 0.3971129785947468, + "learning_rate": 6.65591474145798e-06, + "loss": 0.0311, + "step": 4072 + }, + { + "epoch": 1.360160293872099, + "grad_norm": 0.3535264899370516, + "learning_rate": 6.6540809520891425e-06, + "loss": 0.029, + "step": 4073 + }, + { + "epoch": 1.3604942394389714, + "grad_norm": 0.2722749459673896, + "learning_rate": 6.652246912851903e-06, + "loss": 0.0255, + "step": 4074 + }, + { + "epoch": 1.3608281850058441, + "grad_norm": 0.3570074437582283, + "learning_rate": 6.650412624023311e-06, + "loss": 0.0283, + "step": 4075 + }, + { + "epoch": 1.3611621305727166, + "grad_norm": 0.3915857756994722, + "learning_rate": 6.648578085880461e-06, + "loss": 0.0238, + "step": 4076 + }, + { + "epoch": 1.3614960761395891, + "grad_norm": 0.3202078213972106, + "learning_rate": 6.64674329870048e-06, + "loss": 0.027, + "step": 4077 + }, + { + "epoch": 1.3618300217064618, + "grad_norm": 0.2641663134144966, + "learning_rate": 6.644908262760531e-06, + "loss": 0.0221, + "step": 4078 + }, + { + "epoch": 1.3621639672733346, + "grad_norm": 0.28433526909822354, + "learning_rate": 6.643072978337823e-06, + "loss": 0.0253, + "step": 4079 + }, + { + "epoch": 1.362497912840207, + "grad_norm": 0.31299018742545603, + "learning_rate": 6.641237445709595e-06, + "loss": 0.0274, + "step": 4080 + }, + { + "epoch": 1.3628318584070795, + "grad_norm": 0.24717740346921763, + "learning_rate": 6.639401665153126e-06, + "loss": 0.0229, + "step": 4081 + }, + { + "epoch": 1.3631658039739523, + "grad_norm": 0.27096646609903063, + "learning_rate": 6.637565636945731e-06, + "loss": 0.0258, + "step": 4082 + }, + { + "epoch": 1.363499749540825, + "grad_norm": 0.2690275667539032, + "learning_rate": 6.635729361364765e-06, + "loss": 0.0213, + "step": 4083 + }, + { + "epoch": 1.3638336951076975, + "grad_norm": 0.26993706166426773, + "learning_rate": 6.633892838687621e-06, + "loss": 0.0186, + "step": 4084 + }, + { + "epoch": 1.36416764067457, + "grad_norm": 0.5005558098460409, + "learning_rate": 6.632056069191723e-06, + "loss": 0.0313, + "step": 4085 + }, + { + "epoch": 1.3645015862414427, + "grad_norm": 0.23785822741383483, + "learning_rate": 6.6302190531545395e-06, + "loss": 0.022, + "step": 4086 + }, + { + "epoch": 1.3648355318083152, + "grad_norm": 0.3218355765369769, + "learning_rate": 6.628381790853573e-06, + "loss": 0.035, + "step": 4087 + }, + { + "epoch": 1.365169477375188, + "grad_norm": 0.30629019108515765, + "learning_rate": 6.626544282566363e-06, + "loss": 0.0287, + "step": 4088 + }, + { + "epoch": 1.3655034229420604, + "grad_norm": 0.3601402344314719, + "learning_rate": 6.624706528570487e-06, + "loss": 0.034, + "step": 4089 + }, + { + "epoch": 1.3658373685089331, + "grad_norm": 0.4365428161116427, + "learning_rate": 6.6228685291435605e-06, + "loss": 0.0538, + "step": 4090 + }, + { + "epoch": 1.3661713140758056, + "grad_norm": 0.31154510065927254, + "learning_rate": 6.621030284563232e-06, + "loss": 0.0225, + "step": 4091 + }, + { + "epoch": 1.366505259642678, + "grad_norm": 0.2619073412486402, + "learning_rate": 6.619191795107195e-06, + "loss": 0.0241, + "step": 4092 + }, + { + "epoch": 1.3668392052095508, + "grad_norm": 0.3073309860724148, + "learning_rate": 6.617353061053171e-06, + "loss": 0.0346, + "step": 4093 + }, + { + "epoch": 1.3671731507764235, + "grad_norm": 0.3651138576018971, + "learning_rate": 6.615514082678922e-06, + "loss": 0.0275, + "step": 4094 + }, + { + "epoch": 1.367507096343296, + "grad_norm": 0.37263957485508603, + "learning_rate": 6.613674860262249e-06, + "loss": 0.0338, + "step": 4095 + }, + { + "epoch": 1.3678410419101685, + "grad_norm": 0.31759700401037727, + "learning_rate": 6.61183539408099e-06, + "loss": 0.0193, + "step": 4096 + }, + { + "epoch": 1.3681749874770412, + "grad_norm": 0.2796330865033312, + "learning_rate": 6.609995684413013e-06, + "loss": 0.0237, + "step": 4097 + }, + { + "epoch": 1.368508933043914, + "grad_norm": 0.3106557266787774, + "learning_rate": 6.608155731536233e-06, + "loss": 0.0282, + "step": 4098 + }, + { + "epoch": 1.3688428786107865, + "grad_norm": 0.23696635035589267, + "learning_rate": 6.606315535728594e-06, + "loss": 0.0242, + "step": 4099 + }, + { + "epoch": 1.369176824177659, + "grad_norm": 0.352664668684995, + "learning_rate": 6.604475097268079e-06, + "loss": 0.0261, + "step": 4100 + }, + { + "epoch": 1.3695107697445317, + "grad_norm": 0.2374625906875964, + "learning_rate": 6.602634416432708e-06, + "loss": 0.0187, + "step": 4101 + }, + { + "epoch": 1.3698447153114042, + "grad_norm": 0.47414479442954915, + "learning_rate": 6.600793493500539e-06, + "loss": 0.0406, + "step": 4102 + }, + { + "epoch": 1.3701786608782769, + "grad_norm": 0.33701968193020054, + "learning_rate": 6.5989523287496645e-06, + "loss": 0.021, + "step": 4103 + }, + { + "epoch": 1.3705126064451494, + "grad_norm": 0.39106877817440683, + "learning_rate": 6.597110922458214e-06, + "loss": 0.0207, + "step": 4104 + }, + { + "epoch": 1.370846552012022, + "grad_norm": 0.2999427539554092, + "learning_rate": 6.595269274904351e-06, + "loss": 0.0227, + "step": 4105 + }, + { + "epoch": 1.3711804975788946, + "grad_norm": 0.3149311012443616, + "learning_rate": 6.593427386366282e-06, + "loss": 0.0273, + "step": 4106 + }, + { + "epoch": 1.3715144431457673, + "grad_norm": 0.35364225049435305, + "learning_rate": 6.591585257122244e-06, + "loss": 0.0385, + "step": 4107 + }, + { + "epoch": 1.3718483887126398, + "grad_norm": 0.34072234254307066, + "learning_rate": 6.589742887450512e-06, + "loss": 0.0256, + "step": 4108 + }, + { + "epoch": 1.3721823342795125, + "grad_norm": 0.25273579340021224, + "learning_rate": 6.5879002776294e-06, + "loss": 0.0222, + "step": 4109 + }, + { + "epoch": 1.372516279846385, + "grad_norm": 0.2705604579708577, + "learning_rate": 6.586057427937252e-06, + "loss": 0.0295, + "step": 4110 + }, + { + "epoch": 1.3728502254132575, + "grad_norm": 0.3050249184143848, + "learning_rate": 6.584214338652455e-06, + "loss": 0.0333, + "step": 4111 + }, + { + "epoch": 1.3731841709801302, + "grad_norm": 0.32920241603327477, + "learning_rate": 6.582371010053429e-06, + "loss": 0.0345, + "step": 4112 + }, + { + "epoch": 1.373518116547003, + "grad_norm": 0.33457957602169924, + "learning_rate": 6.58052744241863e-06, + "loss": 0.0242, + "step": 4113 + }, + { + "epoch": 1.3738520621138754, + "grad_norm": 0.3227834142235845, + "learning_rate": 6.578683636026551e-06, + "loss": 0.0319, + "step": 4114 + }, + { + "epoch": 1.374186007680748, + "grad_norm": 0.3635738039039029, + "learning_rate": 6.576839591155719e-06, + "loss": 0.03, + "step": 4115 + }, + { + "epoch": 1.3745199532476207, + "grad_norm": 0.28247441512053173, + "learning_rate": 6.574995308084702e-06, + "loss": 0.0221, + "step": 4116 + }, + { + "epoch": 1.3748538988144932, + "grad_norm": 0.24864008592072592, + "learning_rate": 6.573150787092097e-06, + "loss": 0.018, + "step": 4117 + }, + { + "epoch": 1.3751878443813659, + "grad_norm": 0.3214042563466242, + "learning_rate": 6.5713060284565435e-06, + "loss": 0.0261, + "step": 4118 + }, + { + "epoch": 1.3755217899482384, + "grad_norm": 0.28233940140350905, + "learning_rate": 6.569461032456713e-06, + "loss": 0.0264, + "step": 4119 + }, + { + "epoch": 1.375855735515111, + "grad_norm": 0.28718907119051273, + "learning_rate": 6.567615799371313e-06, + "loss": 0.0233, + "step": 4120 + }, + { + "epoch": 1.3761896810819836, + "grad_norm": 0.3066969589450228, + "learning_rate": 6.565770329479089e-06, + "loss": 0.0208, + "step": 4121 + }, + { + "epoch": 1.3765236266488563, + "grad_norm": 0.2836497564182992, + "learning_rate": 6.5639246230588205e-06, + "loss": 0.029, + "step": 4122 + }, + { + "epoch": 1.3768575722157288, + "grad_norm": 0.2818740754115631, + "learning_rate": 6.562078680389323e-06, + "loss": 0.0182, + "step": 4123 + }, + { + "epoch": 1.3771915177826015, + "grad_norm": 0.5710079291815752, + "learning_rate": 6.560232501749446e-06, + "loss": 0.0284, + "step": 4124 + }, + { + "epoch": 1.377525463349474, + "grad_norm": 0.2753884512123724, + "learning_rate": 6.558386087418082e-06, + "loss": 0.025, + "step": 4125 + }, + { + "epoch": 1.3778594089163465, + "grad_norm": 0.31365721658402956, + "learning_rate": 6.556539437674147e-06, + "loss": 0.0279, + "step": 4126 + }, + { + "epoch": 1.3781933544832192, + "grad_norm": 0.33371329476070377, + "learning_rate": 6.554692552796604e-06, + "loss": 0.0338, + "step": 4127 + }, + { + "epoch": 1.378527300050092, + "grad_norm": 0.32997381420229777, + "learning_rate": 6.552845433064445e-06, + "loss": 0.0294, + "step": 4128 + }, + { + "epoch": 1.3788612456169644, + "grad_norm": 0.28196085692407397, + "learning_rate": 6.550998078756698e-06, + "loss": 0.026, + "step": 4129 + }, + { + "epoch": 1.379195191183837, + "grad_norm": 0.3099513281295468, + "learning_rate": 6.549150490152429e-06, + "loss": 0.0407, + "step": 4130 + }, + { + "epoch": 1.3795291367507097, + "grad_norm": 0.39082881671939157, + "learning_rate": 6.5473026675307394e-06, + "loss": 0.0366, + "step": 4131 + }, + { + "epoch": 1.3798630823175824, + "grad_norm": 0.35755906278361366, + "learning_rate": 6.545454611170762e-06, + "loss": 0.0288, + "step": 4132 + }, + { + "epoch": 1.3801970278844549, + "grad_norm": 0.31410143934597123, + "learning_rate": 6.543606321351668e-06, + "loss": 0.0305, + "step": 4133 + }, + { + "epoch": 1.3805309734513274, + "grad_norm": 0.3044608100050167, + "learning_rate": 6.541757798352664e-06, + "loss": 0.0279, + "step": 4134 + }, + { + "epoch": 1.3808649190182, + "grad_norm": 0.3787462097567985, + "learning_rate": 6.539909042452991e-06, + "loss": 0.0314, + "step": 4135 + }, + { + "epoch": 1.3811988645850726, + "grad_norm": 0.3948524816613283, + "learning_rate": 6.538060053931925e-06, + "loss": 0.0315, + "step": 4136 + }, + { + "epoch": 1.3815328101519453, + "grad_norm": 0.2848048596769651, + "learning_rate": 6.536210833068779e-06, + "loss": 0.0286, + "step": 4137 + }, + { + "epoch": 1.3818667557188178, + "grad_norm": 0.4213633188225437, + "learning_rate": 6.534361380142896e-06, + "loss": 0.0312, + "step": 4138 + }, + { + "epoch": 1.3822007012856905, + "grad_norm": 0.8024795017243784, + "learning_rate": 6.532511695433662e-06, + "loss": 0.0266, + "step": 4139 + }, + { + "epoch": 1.382534646852563, + "grad_norm": 0.2725350229821373, + "learning_rate": 6.5306617792204915e-06, + "loss": 0.0243, + "step": 4140 + }, + { + "epoch": 1.3828685924194355, + "grad_norm": 0.24386319344336235, + "learning_rate": 6.528811631782835e-06, + "loss": 0.0194, + "step": 4141 + }, + { + "epoch": 1.3832025379863082, + "grad_norm": 0.33063096325815156, + "learning_rate": 6.526961253400181e-06, + "loss": 0.0253, + "step": 4142 + }, + { + "epoch": 1.383536483553181, + "grad_norm": 0.49910265592710185, + "learning_rate": 6.525110644352052e-06, + "loss": 0.0318, + "step": 4143 + }, + { + "epoch": 1.3838704291200534, + "grad_norm": 0.2835546392846156, + "learning_rate": 6.523259804918001e-06, + "loss": 0.0225, + "step": 4144 + }, + { + "epoch": 1.384204374686926, + "grad_norm": 0.29621563311109494, + "learning_rate": 6.52140873537762e-06, + "loss": 0.0233, + "step": 4145 + }, + { + "epoch": 1.3845383202537986, + "grad_norm": 0.40928485304465473, + "learning_rate": 6.519557436010535e-06, + "loss": 0.0285, + "step": 4146 + }, + { + "epoch": 1.3848722658206714, + "grad_norm": 0.4048074209668024, + "learning_rate": 6.51770590709641e-06, + "loss": 0.0316, + "step": 4147 + }, + { + "epoch": 1.3852062113875439, + "grad_norm": 0.31960094426853036, + "learning_rate": 6.515854148914935e-06, + "loss": 0.0274, + "step": 4148 + }, + { + "epoch": 1.3855401569544163, + "grad_norm": 0.38366615291065215, + "learning_rate": 6.514002161745844e-06, + "loss": 0.0357, + "step": 4149 + }, + { + "epoch": 1.385874102521289, + "grad_norm": 0.46633156556378175, + "learning_rate": 6.512149945868898e-06, + "loss": 0.0389, + "step": 4150 + }, + { + "epoch": 1.3862080480881616, + "grad_norm": 0.46532757925687274, + "learning_rate": 6.510297501563899e-06, + "loss": 0.0404, + "step": 4151 + }, + { + "epoch": 1.3865419936550343, + "grad_norm": 0.4292523328715074, + "learning_rate": 6.5084448291106785e-06, + "loss": 0.0324, + "step": 4152 + }, + { + "epoch": 1.3868759392219068, + "grad_norm": 0.24459452855524624, + "learning_rate": 6.506591928789105e-06, + "loss": 0.0241, + "step": 4153 + }, + { + "epoch": 1.3872098847887795, + "grad_norm": 0.3004738237091303, + "learning_rate": 6.504738800879081e-06, + "loss": 0.0237, + "step": 4154 + }, + { + "epoch": 1.387543830355652, + "grad_norm": 0.22475864428390777, + "learning_rate": 6.502885445660544e-06, + "loss": 0.0227, + "step": 4155 + }, + { + "epoch": 1.3878777759225247, + "grad_norm": 0.2255535025880028, + "learning_rate": 6.501031863413464e-06, + "loss": 0.0166, + "step": 4156 + }, + { + "epoch": 1.3882117214893972, + "grad_norm": 0.2839987973562058, + "learning_rate": 6.499178054417847e-06, + "loss": 0.0249, + "step": 4157 + }, + { + "epoch": 1.38854566705627, + "grad_norm": 0.37338379091010143, + "learning_rate": 6.497324018953732e-06, + "loss": 0.0376, + "step": 4158 + }, + { + "epoch": 1.3888796126231424, + "grad_norm": 0.3242868836132737, + "learning_rate": 6.495469757301196e-06, + "loss": 0.0252, + "step": 4159 + }, + { + "epoch": 1.389213558190015, + "grad_norm": 0.3772569942795662, + "learning_rate": 6.493615269740343e-06, + "loss": 0.0459, + "step": 4160 + }, + { + "epoch": 1.3895475037568876, + "grad_norm": 0.3238315246329841, + "learning_rate": 6.491760556551315e-06, + "loss": 0.0263, + "step": 4161 + }, + { + "epoch": 1.3898814493237603, + "grad_norm": 0.2858225011114631, + "learning_rate": 6.489905618014293e-06, + "loss": 0.0281, + "step": 4162 + }, + { + "epoch": 1.3902153948906328, + "grad_norm": 0.28175386943143504, + "learning_rate": 6.488050454409483e-06, + "loss": 0.021, + "step": 4163 + }, + { + "epoch": 1.3905493404575053, + "grad_norm": 0.3189900098389574, + "learning_rate": 6.486195066017129e-06, + "loss": 0.0303, + "step": 4164 + }, + { + "epoch": 1.390883286024378, + "grad_norm": 0.3336994920742012, + "learning_rate": 6.484339453117514e-06, + "loss": 0.0288, + "step": 4165 + }, + { + "epoch": 1.3912172315912505, + "grad_norm": 0.2697524405797859, + "learning_rate": 6.482483615990945e-06, + "loss": 0.0205, + "step": 4166 + }, + { + "epoch": 1.3915511771581233, + "grad_norm": 0.2844755122937119, + "learning_rate": 6.480627554917771e-06, + "loss": 0.0245, + "step": 4167 + }, + { + "epoch": 1.3918851227249958, + "grad_norm": 0.2713321170353821, + "learning_rate": 6.47877127017837e-06, + "loss": 0.0185, + "step": 4168 + }, + { + "epoch": 1.3922190682918685, + "grad_norm": 0.5237458575178805, + "learning_rate": 6.476914762053158e-06, + "loss": 0.0482, + "step": 4169 + }, + { + "epoch": 1.392553013858741, + "grad_norm": 0.36295832297667063, + "learning_rate": 6.47505803082258e-06, + "loss": 0.0313, + "step": 4170 + }, + { + "epoch": 1.3928869594256137, + "grad_norm": 0.3764443012879255, + "learning_rate": 6.473201076767119e-06, + "loss": 0.036, + "step": 4171 + }, + { + "epoch": 1.3932209049924862, + "grad_norm": 0.2582316419690857, + "learning_rate": 6.471343900167289e-06, + "loss": 0.0255, + "step": 4172 + }, + { + "epoch": 1.393554850559359, + "grad_norm": 0.33000479078040534, + "learning_rate": 6.469486501303639e-06, + "loss": 0.0392, + "step": 4173 + }, + { + "epoch": 1.3938887961262314, + "grad_norm": 0.2500769154013191, + "learning_rate": 6.467628880456749e-06, + "loss": 0.0221, + "step": 4174 + }, + { + "epoch": 1.394222741693104, + "grad_norm": 0.4008734050914926, + "learning_rate": 6.465771037907236e-06, + "loss": 0.0483, + "step": 4175 + }, + { + "epoch": 1.3945566872599766, + "grad_norm": 0.20843075979871026, + "learning_rate": 6.463912973935749e-06, + "loss": 0.0192, + "step": 4176 + }, + { + "epoch": 1.3948906328268493, + "grad_norm": 0.39933295292421844, + "learning_rate": 6.462054688822971e-06, + "loss": 0.0326, + "step": 4177 + }, + { + "epoch": 1.3952245783937218, + "grad_norm": 0.2359623271327014, + "learning_rate": 6.460196182849616e-06, + "loss": 0.02, + "step": 4178 + }, + { + "epoch": 1.3955585239605943, + "grad_norm": 0.2774424476758121, + "learning_rate": 6.458337456296434e-06, + "loss": 0.0246, + "step": 4179 + }, + { + "epoch": 1.395892469527467, + "grad_norm": 0.44255909800554805, + "learning_rate": 6.456478509444209e-06, + "loss": 0.025, + "step": 4180 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 0.31440313277685955, + "learning_rate": 6.454619342573756e-06, + "loss": 0.0226, + "step": 4181 + }, + { + "epoch": 1.3965603606612123, + "grad_norm": 0.32671755747136283, + "learning_rate": 6.452759955965922e-06, + "loss": 0.0305, + "step": 4182 + }, + { + "epoch": 1.3968943062280847, + "grad_norm": 0.39497156158711616, + "learning_rate": 6.450900349901592e-06, + "loss": 0.028, + "step": 4183 + }, + { + "epoch": 1.3972282517949575, + "grad_norm": 0.3028024031748205, + "learning_rate": 6.449040524661681e-06, + "loss": 0.0249, + "step": 4184 + }, + { + "epoch": 1.39756219736183, + "grad_norm": 0.34207219241021913, + "learning_rate": 6.447180480527135e-06, + "loss": 0.027, + "step": 4185 + }, + { + "epoch": 1.3978961429287027, + "grad_norm": 0.36238534784500936, + "learning_rate": 6.445320217778939e-06, + "loss": 0.0329, + "step": 4186 + }, + { + "epoch": 1.3982300884955752, + "grad_norm": 0.665449263626439, + "learning_rate": 6.443459736698106e-06, + "loss": 0.0303, + "step": 4187 + }, + { + "epoch": 1.398564034062448, + "grad_norm": 0.3853842127778719, + "learning_rate": 6.4415990375656826e-06, + "loss": 0.0318, + "step": 4188 + }, + { + "epoch": 1.3988979796293204, + "grad_norm": 0.935871160664241, + "learning_rate": 6.4397381206627505e-06, + "loss": 0.0402, + "step": 4189 + }, + { + "epoch": 1.3992319251961929, + "grad_norm": 0.3994369014425904, + "learning_rate": 6.437876986270424e-06, + "loss": 0.0226, + "step": 4190 + }, + { + "epoch": 1.3995658707630656, + "grad_norm": 0.3582334612795359, + "learning_rate": 6.436015634669848e-06, + "loss": 0.0331, + "step": 4191 + }, + { + "epoch": 1.3998998163299383, + "grad_norm": 0.26236389446839914, + "learning_rate": 6.434154066142201e-06, + "loss": 0.0211, + "step": 4192 + }, + { + "epoch": 1.4002337618968108, + "grad_norm": 0.24871708051095404, + "learning_rate": 6.432292280968695e-06, + "loss": 0.0181, + "step": 4193 + }, + { + "epoch": 1.4005677074636833, + "grad_norm": 0.4326489182291974, + "learning_rate": 6.430430279430577e-06, + "loss": 0.0258, + "step": 4194 + }, + { + "epoch": 1.400901653030556, + "grad_norm": 0.25749714153032927, + "learning_rate": 6.428568061809122e-06, + "loss": 0.0249, + "step": 4195 + }, + { + "epoch": 1.4012355985974287, + "grad_norm": 0.35931741482762375, + "learning_rate": 6.426705628385641e-06, + "loss": 0.0252, + "step": 4196 + }, + { + "epoch": 1.4015695441643012, + "grad_norm": 0.5307331310360581, + "learning_rate": 6.4248429794414745e-06, + "loss": 0.0306, + "step": 4197 + }, + { + "epoch": 1.4019034897311737, + "grad_norm": 0.38274339803651247, + "learning_rate": 6.422980115258e-06, + "loss": 0.0339, + "step": 4198 + }, + { + "epoch": 1.4022374352980465, + "grad_norm": 0.3555093435984295, + "learning_rate": 6.421117036116624e-06, + "loss": 0.0236, + "step": 4199 + }, + { + "epoch": 1.402571380864919, + "grad_norm": 0.2907826146941461, + "learning_rate": 6.4192537422987864e-06, + "loss": 0.0295, + "step": 4200 + }, + { + "epoch": 1.4029053264317917, + "grad_norm": 0.30963497215325875, + "learning_rate": 6.417390234085961e-06, + "loss": 0.0277, + "step": 4201 + }, + { + "epoch": 1.4032392719986642, + "grad_norm": 0.35136844329402045, + "learning_rate": 6.415526511759649e-06, + "loss": 0.0259, + "step": 4202 + }, + { + "epoch": 1.4035732175655369, + "grad_norm": 0.2751243696032686, + "learning_rate": 6.413662575601391e-06, + "loss": 0.0365, + "step": 4203 + }, + { + "epoch": 1.4039071631324094, + "grad_norm": 0.22159857145657047, + "learning_rate": 6.4117984258927565e-06, + "loss": 0.0232, + "step": 4204 + }, + { + "epoch": 1.404241108699282, + "grad_norm": 0.29844503255762855, + "learning_rate": 6.409934062915345e-06, + "loss": 0.026, + "step": 4205 + }, + { + "epoch": 1.4045750542661546, + "grad_norm": 0.3961504186942763, + "learning_rate": 6.408069486950793e-06, + "loss": 0.0481, + "step": 4206 + }, + { + "epoch": 1.4049089998330273, + "grad_norm": 0.2538361477160226, + "learning_rate": 6.406204698280766e-06, + "loss": 0.023, + "step": 4207 + }, + { + "epoch": 1.4052429453998998, + "grad_norm": 0.22444211327860183, + "learning_rate": 6.40433969718696e-06, + "loss": 0.0156, + "step": 4208 + }, + { + "epoch": 1.4055768909667723, + "grad_norm": 0.44034457427431095, + "learning_rate": 6.402474483951109e-06, + "loss": 0.0284, + "step": 4209 + }, + { + "epoch": 1.405910836533645, + "grad_norm": 0.21447942513244683, + "learning_rate": 6.400609058854973e-06, + "loss": 0.0198, + "step": 4210 + }, + { + "epoch": 1.4062447821005177, + "grad_norm": 0.3768648995338918, + "learning_rate": 6.398743422180346e-06, + "loss": 0.0368, + "step": 4211 + }, + { + "epoch": 1.4065787276673902, + "grad_norm": 0.24537398824322698, + "learning_rate": 6.396877574209057e-06, + "loss": 0.023, + "step": 4212 + }, + { + "epoch": 1.4069126732342627, + "grad_norm": 0.31548463622046174, + "learning_rate": 6.395011515222962e-06, + "loss": 0.0222, + "step": 4213 + }, + { + "epoch": 1.4072466188011354, + "grad_norm": 0.2911911195637709, + "learning_rate": 6.393145245503951e-06, + "loss": 0.0278, + "step": 4214 + }, + { + "epoch": 1.407580564368008, + "grad_norm": 0.3424927468106257, + "learning_rate": 6.391278765333948e-06, + "loss": 0.0308, + "step": 4215 + }, + { + "epoch": 1.4079145099348807, + "grad_norm": 0.24610239576806028, + "learning_rate": 6.389412074994906e-06, + "loss": 0.0202, + "step": 4216 + }, + { + "epoch": 1.4082484555017531, + "grad_norm": 0.28614382765446894, + "learning_rate": 6.387545174768809e-06, + "loss": 0.034, + "step": 4217 + }, + { + "epoch": 1.4085824010686259, + "grad_norm": 0.34679664912713826, + "learning_rate": 6.385678064937677e-06, + "loss": 0.0354, + "step": 4218 + }, + { + "epoch": 1.4089163466354984, + "grad_norm": 0.28291344104145383, + "learning_rate": 6.383810745783556e-06, + "loss": 0.0239, + "step": 4219 + }, + { + "epoch": 1.409250292202371, + "grad_norm": 0.3148389335644667, + "learning_rate": 6.38194321758853e-06, + "loss": 0.028, + "step": 4220 + }, + { + "epoch": 1.4095842377692436, + "grad_norm": 0.24227708426055747, + "learning_rate": 6.3800754806347065e-06, + "loss": 0.0248, + "step": 4221 + }, + { + "epoch": 1.4099181833361163, + "grad_norm": 0.2842259981650139, + "learning_rate": 6.378207535204234e-06, + "loss": 0.0254, + "step": 4222 + }, + { + "epoch": 1.4102521289029888, + "grad_norm": 0.2360963689254675, + "learning_rate": 6.376339381579285e-06, + "loss": 0.0175, + "step": 4223 + }, + { + "epoch": 1.4105860744698613, + "grad_norm": 0.25162957127237595, + "learning_rate": 6.374471020042067e-06, + "loss": 0.0201, + "step": 4224 + }, + { + "epoch": 1.410920020036734, + "grad_norm": 0.40135490058030626, + "learning_rate": 6.372602450874816e-06, + "loss": 0.0209, + "step": 4225 + }, + { + "epoch": 1.4112539656036067, + "grad_norm": 0.3223280172139753, + "learning_rate": 6.370733674359803e-06, + "loss": 0.0439, + "step": 4226 + }, + { + "epoch": 1.4115879111704792, + "grad_norm": 0.4121077145420317, + "learning_rate": 6.36886469077933e-06, + "loss": 0.0552, + "step": 4227 + }, + { + "epoch": 1.4119218567373517, + "grad_norm": 0.4886731791786846, + "learning_rate": 6.366995500415727e-06, + "loss": 0.0337, + "step": 4228 + }, + { + "epoch": 1.4122558023042244, + "grad_norm": 0.26735245095573773, + "learning_rate": 6.365126103551358e-06, + "loss": 0.0198, + "step": 4229 + }, + { + "epoch": 1.4125897478710971, + "grad_norm": 0.379505760027139, + "learning_rate": 6.363256500468617e-06, + "loss": 0.0379, + "step": 4230 + }, + { + "epoch": 1.4129236934379696, + "grad_norm": 0.28808782349374135, + "learning_rate": 6.3613866914499285e-06, + "loss": 0.0299, + "step": 4231 + }, + { + "epoch": 1.4132576390048421, + "grad_norm": 0.30287090451349447, + "learning_rate": 6.359516676777751e-06, + "loss": 0.0271, + "step": 4232 + }, + { + "epoch": 1.4135915845717149, + "grad_norm": 0.3022571428909766, + "learning_rate": 6.357646456734574e-06, + "loss": 0.0249, + "step": 4233 + }, + { + "epoch": 1.4139255301385873, + "grad_norm": 0.2859251587416214, + "learning_rate": 6.3557760316029115e-06, + "loss": 0.0307, + "step": 4234 + }, + { + "epoch": 1.41425947570546, + "grad_norm": 0.2836402492400007, + "learning_rate": 6.353905401665317e-06, + "loss": 0.0193, + "step": 4235 + }, + { + "epoch": 1.4145934212723326, + "grad_norm": 0.270069829815595, + "learning_rate": 6.35203456720437e-06, + "loss": 0.0244, + "step": 4236 + }, + { + "epoch": 1.4149273668392053, + "grad_norm": 0.4785601138722959, + "learning_rate": 6.35016352850268e-06, + "loss": 0.0312, + "step": 4237 + }, + { + "epoch": 1.4152613124060778, + "grad_norm": 0.2924169655476126, + "learning_rate": 6.3482922858428915e-06, + "loss": 0.0326, + "step": 4238 + }, + { + "epoch": 1.4155952579729503, + "grad_norm": 0.3005980690764834, + "learning_rate": 6.34642083950768e-06, + "loss": 0.0235, + "step": 4239 + }, + { + "epoch": 1.415929203539823, + "grad_norm": 0.28610543510377096, + "learning_rate": 6.344549189779745e-06, + "loss": 0.024, + "step": 4240 + }, + { + "epoch": 1.4162631491066957, + "grad_norm": 0.23991623737660478, + "learning_rate": 6.342677336941825e-06, + "loss": 0.0252, + "step": 4241 + }, + { + "epoch": 1.4165970946735682, + "grad_norm": 0.31781159720858426, + "learning_rate": 6.340805281276683e-06, + "loss": 0.032, + "step": 4242 + }, + { + "epoch": 1.4169310402404407, + "grad_norm": 0.3675808070280987, + "learning_rate": 6.338933023067114e-06, + "loss": 0.0366, + "step": 4243 + }, + { + "epoch": 1.4172649858073134, + "grad_norm": 0.2489197449042772, + "learning_rate": 6.337060562595949e-06, + "loss": 0.0208, + "step": 4244 + }, + { + "epoch": 1.4175989313741861, + "grad_norm": 0.3387588987548785, + "learning_rate": 6.3351879001460425e-06, + "loss": 0.0271, + "step": 4245 + }, + { + "epoch": 1.4179328769410586, + "grad_norm": 0.39569938256230014, + "learning_rate": 6.333315036000281e-06, + "loss": 0.0406, + "step": 4246 + }, + { + "epoch": 1.4182668225079311, + "grad_norm": 0.40449303682465976, + "learning_rate": 6.331441970441585e-06, + "loss": 0.0311, + "step": 4247 + }, + { + "epoch": 1.4186007680748038, + "grad_norm": 0.26889267903113867, + "learning_rate": 6.329568703752902e-06, + "loss": 0.0223, + "step": 4248 + }, + { + "epoch": 1.4189347136416763, + "grad_norm": 0.24388086962817454, + "learning_rate": 6.32769523621721e-06, + "loss": 0.0224, + "step": 4249 + }, + { + "epoch": 1.419268659208549, + "grad_norm": 0.24511992387520704, + "learning_rate": 6.3258215681175215e-06, + "loss": 0.0195, + "step": 4250 + }, + { + "epoch": 1.4196026047754216, + "grad_norm": 0.26285398620511435, + "learning_rate": 6.323947699736873e-06, + "loss": 0.0218, + "step": 4251 + }, + { + "epoch": 1.4199365503422943, + "grad_norm": 0.312074709076044, + "learning_rate": 6.3220736313583345e-06, + "loss": 0.0246, + "step": 4252 + }, + { + "epoch": 1.4202704959091668, + "grad_norm": 0.34952999657564465, + "learning_rate": 6.320199363265008e-06, + "loss": 0.0339, + "step": 4253 + }, + { + "epoch": 1.4206044414760395, + "grad_norm": 0.28860880207619427, + "learning_rate": 6.318324895740023e-06, + "loss": 0.0233, + "step": 4254 + }, + { + "epoch": 1.420938387042912, + "grad_norm": 0.29733049298495257, + "learning_rate": 6.31645022906654e-06, + "loss": 0.0323, + "step": 4255 + }, + { + "epoch": 1.4212723326097847, + "grad_norm": 0.31805717441672404, + "learning_rate": 6.314575363527748e-06, + "loss": 0.0268, + "step": 4256 + }, + { + "epoch": 1.4216062781766572, + "grad_norm": 0.286972328699409, + "learning_rate": 6.312700299406871e-06, + "loss": 0.0287, + "step": 4257 + }, + { + "epoch": 1.4219402237435297, + "grad_norm": 0.2532610207953052, + "learning_rate": 6.310825036987154e-06, + "loss": 0.0223, + "step": 4258 + }, + { + "epoch": 1.4222741693104024, + "grad_norm": 0.35252950002232053, + "learning_rate": 6.308949576551884e-06, + "loss": 0.0317, + "step": 4259 + }, + { + "epoch": 1.4226081148772751, + "grad_norm": 0.3862716954803643, + "learning_rate": 6.3070739183843655e-06, + "loss": 0.0281, + "step": 4260 + }, + { + "epoch": 1.4229420604441476, + "grad_norm": 0.3171157760416562, + "learning_rate": 6.305198062767942e-06, + "loss": 0.0315, + "step": 4261 + }, + { + "epoch": 1.4232760060110201, + "grad_norm": 0.4086440283230938, + "learning_rate": 6.303322009985984e-06, + "loss": 0.0313, + "step": 4262 + }, + { + "epoch": 1.4236099515778928, + "grad_norm": 0.29594655062853514, + "learning_rate": 6.301445760321889e-06, + "loss": 0.0216, + "step": 4263 + }, + { + "epoch": 1.4239438971447653, + "grad_norm": 0.2193500553098334, + "learning_rate": 6.299569314059088e-06, + "loss": 0.0193, + "step": 4264 + }, + { + "epoch": 1.424277842711638, + "grad_norm": 0.25979567694041017, + "learning_rate": 6.297692671481042e-06, + "loss": 0.0247, + "step": 4265 + }, + { + "epoch": 1.4246117882785105, + "grad_norm": 0.5571964084684246, + "learning_rate": 6.295815832871235e-06, + "loss": 0.0257, + "step": 4266 + }, + { + "epoch": 1.4249457338453833, + "grad_norm": 0.27824904953702756, + "learning_rate": 6.2939387985131905e-06, + "loss": 0.0261, + "step": 4267 + }, + { + "epoch": 1.4252796794122558, + "grad_norm": 0.43944047402045616, + "learning_rate": 6.292061568690455e-06, + "loss": 0.0242, + "step": 4268 + }, + { + "epoch": 1.4256136249791285, + "grad_norm": 0.34363125953944945, + "learning_rate": 6.290184143686606e-06, + "loss": 0.0319, + "step": 4269 + }, + { + "epoch": 1.425947570546001, + "grad_norm": 0.367581617819465, + "learning_rate": 6.288306523785252e-06, + "loss": 0.0352, + "step": 4270 + }, + { + "epoch": 1.4262815161128737, + "grad_norm": 0.2806152073022378, + "learning_rate": 6.286428709270026e-06, + "loss": 0.0247, + "step": 4271 + }, + { + "epoch": 1.4266154616797462, + "grad_norm": 0.292103212824337, + "learning_rate": 6.284550700424597e-06, + "loss": 0.0223, + "step": 4272 + }, + { + "epoch": 1.4269494072466187, + "grad_norm": 0.2594443901105272, + "learning_rate": 6.282672497532659e-06, + "loss": 0.0234, + "step": 4273 + }, + { + "epoch": 1.4272833528134914, + "grad_norm": 0.21679403571505187, + "learning_rate": 6.280794100877938e-06, + "loss": 0.0201, + "step": 4274 + }, + { + "epoch": 1.427617298380364, + "grad_norm": 0.3305930506341123, + "learning_rate": 6.278915510744187e-06, + "loss": 0.0199, + "step": 4275 + }, + { + "epoch": 1.4279512439472366, + "grad_norm": 0.36608441028261, + "learning_rate": 6.277036727415189e-06, + "loss": 0.0294, + "step": 4276 + }, + { + "epoch": 1.428285189514109, + "grad_norm": 0.3227092739262827, + "learning_rate": 6.2751577511747575e-06, + "loss": 0.0285, + "step": 4277 + }, + { + "epoch": 1.4286191350809818, + "grad_norm": 0.2798813553302426, + "learning_rate": 6.273278582306732e-06, + "loss": 0.024, + "step": 4278 + }, + { + "epoch": 1.4289530806478545, + "grad_norm": 0.35298879503799635, + "learning_rate": 6.271399221094986e-06, + "loss": 0.0222, + "step": 4279 + }, + { + "epoch": 1.429287026214727, + "grad_norm": 0.31617957096070615, + "learning_rate": 6.269519667823416e-06, + "loss": 0.0328, + "step": 4280 + }, + { + "epoch": 1.4296209717815995, + "grad_norm": 0.2902821305467271, + "learning_rate": 6.267639922775952e-06, + "loss": 0.0192, + "step": 4281 + }, + { + "epoch": 1.4299549173484722, + "grad_norm": 0.25669861326313853, + "learning_rate": 6.265759986236552e-06, + "loss": 0.0195, + "step": 4282 + }, + { + "epoch": 1.4302888629153447, + "grad_norm": 0.29127756161554613, + "learning_rate": 6.263879858489204e-06, + "loss": 0.024, + "step": 4283 + }, + { + "epoch": 1.4306228084822175, + "grad_norm": 0.3462771143572902, + "learning_rate": 6.261999539817919e-06, + "loss": 0.0251, + "step": 4284 + }, + { + "epoch": 1.43095675404909, + "grad_norm": 0.2794947820600223, + "learning_rate": 6.260119030506746e-06, + "loss": 0.0201, + "step": 4285 + }, + { + "epoch": 1.4312906996159627, + "grad_norm": 0.27555656343229007, + "learning_rate": 6.258238330839754e-06, + "loss": 0.0206, + "step": 4286 + }, + { + "epoch": 1.4316246451828352, + "grad_norm": 0.35812082031648257, + "learning_rate": 6.2563574411010485e-06, + "loss": 0.0307, + "step": 4287 + }, + { + "epoch": 1.4319585907497077, + "grad_norm": 0.35617509376580264, + "learning_rate": 6.254476361574757e-06, + "loss": 0.0244, + "step": 4288 + }, + { + "epoch": 1.4322925363165804, + "grad_norm": 0.3051016083373015, + "learning_rate": 6.252595092545042e-06, + "loss": 0.0213, + "step": 4289 + }, + { + "epoch": 1.432626481883453, + "grad_norm": 0.2538639212635603, + "learning_rate": 6.250713634296087e-06, + "loss": 0.0202, + "step": 4290 + }, + { + "epoch": 1.4329604274503256, + "grad_norm": 0.3331413922065278, + "learning_rate": 6.248831987112113e-06, + "loss": 0.0291, + "step": 4291 + }, + { + "epoch": 1.433294373017198, + "grad_norm": 0.22297512890626758, + "learning_rate": 6.246950151277362e-06, + "loss": 0.0197, + "step": 4292 + }, + { + "epoch": 1.4336283185840708, + "grad_norm": 0.31374971852841, + "learning_rate": 6.245068127076109e-06, + "loss": 0.0261, + "step": 4293 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 0.2511216012849922, + "learning_rate": 6.243185914792655e-06, + "loss": 0.0203, + "step": 4294 + }, + { + "epoch": 1.434296209717816, + "grad_norm": 0.6807560465015144, + "learning_rate": 6.2413035147113295e-06, + "loss": 0.0395, + "step": 4295 + }, + { + "epoch": 1.4346301552846885, + "grad_norm": 0.4305937530593603, + "learning_rate": 6.239420927116493e-06, + "loss": 0.0335, + "step": 4296 + }, + { + "epoch": 1.4349641008515612, + "grad_norm": 0.23599825462107743, + "learning_rate": 6.2375381522925325e-06, + "loss": 0.0244, + "step": 4297 + }, + { + "epoch": 1.4352980464184337, + "grad_norm": 0.31721034326424485, + "learning_rate": 6.235655190523862e-06, + "loss": 0.0243, + "step": 4298 + }, + { + "epoch": 1.4356319919853064, + "grad_norm": 0.2833244629213553, + "learning_rate": 6.233772042094924e-06, + "loss": 0.0254, + "step": 4299 + }, + { + "epoch": 1.435965937552179, + "grad_norm": 0.30297019801640057, + "learning_rate": 6.231888707290194e-06, + "loss": 0.0254, + "step": 4300 + }, + { + "epoch": 1.4362998831190517, + "grad_norm": 0.3761343437617929, + "learning_rate": 6.230005186394169e-06, + "loss": 0.0197, + "step": 4301 + }, + { + "epoch": 1.4366338286859242, + "grad_norm": 0.20260845010605857, + "learning_rate": 6.228121479691377e-06, + "loss": 0.0176, + "step": 4302 + }, + { + "epoch": 1.4369677742527969, + "grad_norm": 0.2953658817660671, + "learning_rate": 6.226237587466375e-06, + "loss": 0.0299, + "step": 4303 + }, + { + "epoch": 1.4373017198196694, + "grad_norm": 0.30324716864151124, + "learning_rate": 6.224353510003747e-06, + "loss": 0.0331, + "step": 4304 + }, + { + "epoch": 1.437635665386542, + "grad_norm": 0.6184108877720337, + "learning_rate": 6.222469247588105e-06, + "loss": 0.0343, + "step": 4305 + }, + { + "epoch": 1.4379696109534146, + "grad_norm": 0.2735142999233015, + "learning_rate": 6.220584800504091e-06, + "loss": 0.0269, + "step": 4306 + }, + { + "epoch": 1.438303556520287, + "grad_norm": 0.2607299495322127, + "learning_rate": 6.218700169036368e-06, + "loss": 0.0265, + "step": 4307 + }, + { + "epoch": 1.4386375020871598, + "grad_norm": 0.2656994956847387, + "learning_rate": 6.216815353469636e-06, + "loss": 0.0244, + "step": 4308 + }, + { + "epoch": 1.4389714476540325, + "grad_norm": 0.19914033911661247, + "learning_rate": 6.214930354088618e-06, + "loss": 0.019, + "step": 4309 + }, + { + "epoch": 1.439305393220905, + "grad_norm": 0.29142320482694123, + "learning_rate": 6.213045171178063e-06, + "loss": 0.0238, + "step": 4310 + }, + { + "epoch": 1.4396393387877775, + "grad_norm": 0.21785132569368448, + "learning_rate": 6.2111598050227535e-06, + "loss": 0.0184, + "step": 4311 + }, + { + "epoch": 1.4399732843546502, + "grad_norm": 0.3170393956576559, + "learning_rate": 6.209274255907494e-06, + "loss": 0.0284, + "step": 4312 + }, + { + "epoch": 1.4403072299215227, + "grad_norm": 0.2704015210477321, + "learning_rate": 6.207388524117119e-06, + "loss": 0.029, + "step": 4313 + }, + { + "epoch": 1.4406411754883954, + "grad_norm": 0.33824713125453343, + "learning_rate": 6.205502609936491e-06, + "loss": 0.0247, + "step": 4314 + }, + { + "epoch": 1.440975121055268, + "grad_norm": 0.26962689220866193, + "learning_rate": 6.2036165136505e-06, + "loss": 0.0345, + "step": 4315 + }, + { + "epoch": 1.4413090666221406, + "grad_norm": 0.27196873050146303, + "learning_rate": 6.201730235544062e-06, + "loss": 0.0267, + "step": 4316 + }, + { + "epoch": 1.4416430121890131, + "grad_norm": 0.2806915451900328, + "learning_rate": 6.1998437759021235e-06, + "loss": 0.0215, + "step": 4317 + }, + { + "epoch": 1.4419769577558859, + "grad_norm": 0.4189514378039057, + "learning_rate": 6.197957135009653e-06, + "loss": 0.0321, + "step": 4318 + }, + { + "epoch": 1.4423109033227584, + "grad_norm": 0.3651276750301456, + "learning_rate": 6.196070313151652e-06, + "loss": 0.035, + "step": 4319 + }, + { + "epoch": 1.442644848889631, + "grad_norm": 0.3155492201587994, + "learning_rate": 6.194183310613147e-06, + "loss": 0.0323, + "step": 4320 + }, + { + "epoch": 1.4429787944565036, + "grad_norm": 0.2013473872887474, + "learning_rate": 6.1922961276791925e-06, + "loss": 0.014, + "step": 4321 + }, + { + "epoch": 1.443312740023376, + "grad_norm": 0.3723058797287197, + "learning_rate": 6.190408764634869e-06, + "loss": 0.0231, + "step": 4322 + }, + { + "epoch": 1.4436466855902488, + "grad_norm": 0.39725513943297047, + "learning_rate": 6.188521221765285e-06, + "loss": 0.0317, + "step": 4323 + }, + { + "epoch": 1.4439806311571215, + "grad_norm": 0.2618057425801992, + "learning_rate": 6.186633499355576e-06, + "loss": 0.0182, + "step": 4324 + }, + { + "epoch": 1.444314576723994, + "grad_norm": 0.26419649231389597, + "learning_rate": 6.184745597690903e-06, + "loss": 0.0221, + "step": 4325 + }, + { + "epoch": 1.4446485222908665, + "grad_norm": 0.2752264239920511, + "learning_rate": 6.1828575170564595e-06, + "loss": 0.0229, + "step": 4326 + }, + { + "epoch": 1.4449824678577392, + "grad_norm": 0.366699170884167, + "learning_rate": 6.18096925773746e-06, + "loss": 0.0289, + "step": 4327 + }, + { + "epoch": 1.445316413424612, + "grad_norm": 0.22060280111458522, + "learning_rate": 6.179080820019147e-06, + "loss": 0.0179, + "step": 4328 + }, + { + "epoch": 1.4456503589914844, + "grad_norm": 0.30863112948765364, + "learning_rate": 6.177192204186796e-06, + "loss": 0.0268, + "step": 4329 + }, + { + "epoch": 1.445984304558357, + "grad_norm": 0.2955937241533402, + "learning_rate": 6.1753034105257e-06, + "loss": 0.0265, + "step": 4330 + }, + { + "epoch": 1.4463182501252296, + "grad_norm": 0.2782935487389836, + "learning_rate": 6.173414439321185e-06, + "loss": 0.0284, + "step": 4331 + }, + { + "epoch": 1.4466521956921021, + "grad_norm": 0.3681146222875431, + "learning_rate": 6.171525290858602e-06, + "loss": 0.0282, + "step": 4332 + }, + { + "epoch": 1.4469861412589748, + "grad_norm": 0.3455793107526349, + "learning_rate": 6.169635965423331e-06, + "loss": 0.0314, + "step": 4333 + }, + { + "epoch": 1.4473200868258473, + "grad_norm": 0.29459532408232986, + "learning_rate": 6.167746463300774e-06, + "loss": 0.0252, + "step": 4334 + }, + { + "epoch": 1.44765403239272, + "grad_norm": 0.3417564847463384, + "learning_rate": 6.1658567847763655e-06, + "loss": 0.026, + "step": 4335 + }, + { + "epoch": 1.4479879779595926, + "grad_norm": 0.32711670984725155, + "learning_rate": 6.163966930135561e-06, + "loss": 0.0286, + "step": 4336 + }, + { + "epoch": 1.448321923526465, + "grad_norm": 0.4142744064136188, + "learning_rate": 6.162076899663846e-06, + "loss": 0.0352, + "step": 4337 + }, + { + "epoch": 1.4486558690933378, + "grad_norm": 0.3051924164507242, + "learning_rate": 6.160186693646732e-06, + "loss": 0.0257, + "step": 4338 + }, + { + "epoch": 1.4489898146602105, + "grad_norm": 0.831081294436607, + "learning_rate": 6.158296312369759e-06, + "loss": 0.0355, + "step": 4339 + }, + { + "epoch": 1.449323760227083, + "grad_norm": 0.35720312499269324, + "learning_rate": 6.156405756118489e-06, + "loss": 0.0349, + "step": 4340 + }, + { + "epoch": 1.4496577057939555, + "grad_norm": 0.40600814896802045, + "learning_rate": 6.154515025178511e-06, + "loss": 0.0354, + "step": 4341 + }, + { + "epoch": 1.4499916513608282, + "grad_norm": 0.3844705951190644, + "learning_rate": 6.152624119835447e-06, + "loss": 0.034, + "step": 4342 + }, + { + "epoch": 1.450325596927701, + "grad_norm": 0.4232775372248357, + "learning_rate": 6.150733040374937e-06, + "loss": 0.0365, + "step": 4343 + }, + { + "epoch": 1.4506595424945734, + "grad_norm": 0.25552025928843425, + "learning_rate": 6.148841787082653e-06, + "loss": 0.0231, + "step": 4344 + }, + { + "epoch": 1.450993488061446, + "grad_norm": 0.3526139504264799, + "learning_rate": 6.146950360244288e-06, + "loss": 0.028, + "step": 4345 + }, + { + "epoch": 1.4513274336283186, + "grad_norm": 0.6088722502298349, + "learning_rate": 6.145058760145568e-06, + "loss": 0.0499, + "step": 4346 + }, + { + "epoch": 1.4516613791951911, + "grad_norm": 0.25052877688381175, + "learning_rate": 6.14316698707224e-06, + "loss": 0.0218, + "step": 4347 + }, + { + "epoch": 1.4519953247620638, + "grad_norm": 0.3183343144553221, + "learning_rate": 6.1412750413100754e-06, + "loss": 0.0244, + "step": 4348 + }, + { + "epoch": 1.4523292703289363, + "grad_norm": 0.2834692629233744, + "learning_rate": 6.13938292314488e-06, + "loss": 0.0274, + "step": 4349 + }, + { + "epoch": 1.452663215895809, + "grad_norm": 0.49889728547952156, + "learning_rate": 6.137490632862479e-06, + "loss": 0.027, + "step": 4350 + }, + { + "epoch": 1.4529971614626815, + "grad_norm": 0.32268338807206937, + "learning_rate": 6.135598170748721e-06, + "loss": 0.025, + "step": 4351 + }, + { + "epoch": 1.4533311070295543, + "grad_norm": 0.32154313051083633, + "learning_rate": 6.13370553708949e-06, + "loss": 0.0262, + "step": 4352 + }, + { + "epoch": 1.4536650525964268, + "grad_norm": 0.4029298036487729, + "learning_rate": 6.13181273217069e-06, + "loss": 0.0449, + "step": 4353 + }, + { + "epoch": 1.4539989981632995, + "grad_norm": 0.3048666264152515, + "learning_rate": 6.129919756278248e-06, + "loss": 0.0209, + "step": 4354 + }, + { + "epoch": 1.454332943730172, + "grad_norm": 0.3170989508504314, + "learning_rate": 6.128026609698124e-06, + "loss": 0.0252, + "step": 4355 + }, + { + "epoch": 1.4546668892970445, + "grad_norm": 0.288892382898713, + "learning_rate": 6.126133292716297e-06, + "loss": 0.0312, + "step": 4356 + }, + { + "epoch": 1.4550008348639172, + "grad_norm": 0.24778110269529585, + "learning_rate": 6.124239805618778e-06, + "loss": 0.0273, + "step": 4357 + }, + { + "epoch": 1.45533478043079, + "grad_norm": 0.2340694718381469, + "learning_rate": 6.122346148691598e-06, + "loss": 0.013, + "step": 4358 + }, + { + "epoch": 1.4556687259976624, + "grad_norm": 0.27797544140500174, + "learning_rate": 6.120452322220818e-06, + "loss": 0.0217, + "step": 4359 + }, + { + "epoch": 1.456002671564535, + "grad_norm": 0.5159385172970974, + "learning_rate": 6.11855832649252e-06, + "loss": 0.0392, + "step": 4360 + }, + { + "epoch": 1.4563366171314076, + "grad_norm": 0.26066668128539205, + "learning_rate": 6.116664161792817e-06, + "loss": 0.0206, + "step": 4361 + }, + { + "epoch": 1.45667056269828, + "grad_norm": 0.2582490857089062, + "learning_rate": 6.114769828407845e-06, + "loss": 0.0186, + "step": 4362 + }, + { + "epoch": 1.4570045082651528, + "grad_norm": 0.27525466498956835, + "learning_rate": 6.112875326623763e-06, + "loss": 0.0243, + "step": 4363 + }, + { + "epoch": 1.4573384538320253, + "grad_norm": 0.25254017267673917, + "learning_rate": 6.110980656726759e-06, + "loss": 0.0209, + "step": 4364 + }, + { + "epoch": 1.457672399398898, + "grad_norm": 0.2421009015651126, + "learning_rate": 6.109085819003048e-06, + "loss": 0.0234, + "step": 4365 + }, + { + "epoch": 1.4580063449657705, + "grad_norm": 0.39741005395893025, + "learning_rate": 6.107190813738864e-06, + "loss": 0.0439, + "step": 4366 + }, + { + "epoch": 1.4583402905326432, + "grad_norm": 0.265471907374353, + "learning_rate": 6.10529564122047e-06, + "loss": 0.0219, + "step": 4367 + }, + { + "epoch": 1.4586742360995157, + "grad_norm": 0.35149394439823667, + "learning_rate": 6.103400301734155e-06, + "loss": 0.0262, + "step": 4368 + }, + { + "epoch": 1.4590081816663885, + "grad_norm": 0.28547971804341876, + "learning_rate": 6.101504795566232e-06, + "loss": 0.0246, + "step": 4369 + }, + { + "epoch": 1.459342127233261, + "grad_norm": 0.44836156586191384, + "learning_rate": 6.099609123003041e-06, + "loss": 0.0294, + "step": 4370 + }, + { + "epoch": 1.4596760728001335, + "grad_norm": 0.2108431768776645, + "learning_rate": 6.097713284330944e-06, + "loss": 0.0146, + "step": 4371 + }, + { + "epoch": 1.4600100183670062, + "grad_norm": 0.28812177099641406, + "learning_rate": 6.095817279836329e-06, + "loss": 0.0324, + "step": 4372 + }, + { + "epoch": 1.4603439639338789, + "grad_norm": 0.26926791362816294, + "learning_rate": 6.093921109805612e-06, + "loss": 0.0192, + "step": 4373 + }, + { + "epoch": 1.4606779095007514, + "grad_norm": 0.279596395422423, + "learning_rate": 6.092024774525231e-06, + "loss": 0.0245, + "step": 4374 + }, + { + "epoch": 1.4610118550676239, + "grad_norm": 0.29637531877517337, + "learning_rate": 6.090128274281649e-06, + "loss": 0.0227, + "step": 4375 + }, + { + "epoch": 1.4613458006344966, + "grad_norm": 0.27206098990143734, + "learning_rate": 6.0882316093613555e-06, + "loss": 0.027, + "step": 4376 + }, + { + "epoch": 1.4616797462013693, + "grad_norm": 0.43365534951385276, + "learning_rate": 6.086334780050865e-06, + "loss": 0.0538, + "step": 4377 + }, + { + "epoch": 1.4620136917682418, + "grad_norm": 0.26104853212611145, + "learning_rate": 6.084437786636713e-06, + "loss": 0.0206, + "step": 4378 + }, + { + "epoch": 1.4623476373351143, + "grad_norm": 0.28464550950118167, + "learning_rate": 6.082540629405467e-06, + "loss": 0.0257, + "step": 4379 + }, + { + "epoch": 1.462681582901987, + "grad_norm": 0.27122050896414035, + "learning_rate": 6.08064330864371e-06, + "loss": 0.0226, + "step": 4380 + }, + { + "epoch": 1.4630155284688595, + "grad_norm": 0.4038213204342197, + "learning_rate": 6.078745824638058e-06, + "loss": 0.0395, + "step": 4381 + }, + { + "epoch": 1.4633494740357322, + "grad_norm": 0.2634999359783141, + "learning_rate": 6.076848177675148e-06, + "loss": 0.0215, + "step": 4382 + }, + { + "epoch": 1.4636834196026047, + "grad_norm": 0.42833389864849086, + "learning_rate": 6.07495036804164e-06, + "loss": 0.0404, + "step": 4383 + }, + { + "epoch": 1.4640173651694774, + "grad_norm": 0.46512879630504517, + "learning_rate": 6.073052396024222e-06, + "loss": 0.0284, + "step": 4384 + }, + { + "epoch": 1.46435131073635, + "grad_norm": 0.3623048973487272, + "learning_rate": 6.071154261909605e-06, + "loss": 0.0321, + "step": 4385 + }, + { + "epoch": 1.4646852563032224, + "grad_norm": 0.32199116452410337, + "learning_rate": 6.069255965984524e-06, + "loss": 0.0277, + "step": 4386 + }, + { + "epoch": 1.4650192018700952, + "grad_norm": 0.2653636189061635, + "learning_rate": 6.067357508535741e-06, + "loss": 0.0213, + "step": 4387 + }, + { + "epoch": 1.4653531474369679, + "grad_norm": 0.31178905599193835, + "learning_rate": 6.065458889850037e-06, + "loss": 0.0379, + "step": 4388 + }, + { + "epoch": 1.4656870930038404, + "grad_norm": 0.33518735458407195, + "learning_rate": 6.063560110214224e-06, + "loss": 0.0299, + "step": 4389 + }, + { + "epoch": 1.4660210385707129, + "grad_norm": 0.26262022718549094, + "learning_rate": 6.061661169915132e-06, + "loss": 0.0235, + "step": 4390 + }, + { + "epoch": 1.4663549841375856, + "grad_norm": 0.3173487761294358, + "learning_rate": 6.05976206923962e-06, + "loss": 0.0331, + "step": 4391 + }, + { + "epoch": 1.4666889297044583, + "grad_norm": 0.25361114572533433, + "learning_rate": 6.057862808474569e-06, + "loss": 0.0256, + "step": 4392 + }, + { + "epoch": 1.4670228752713308, + "grad_norm": 0.44289787751970144, + "learning_rate": 6.055963387906884e-06, + "loss": 0.0326, + "step": 4393 + }, + { + "epoch": 1.4673568208382033, + "grad_norm": 0.23277029812391556, + "learning_rate": 6.054063807823497e-06, + "loss": 0.0204, + "step": 4394 + }, + { + "epoch": 1.467690766405076, + "grad_norm": 0.30250490534314056, + "learning_rate": 6.052164068511359e-06, + "loss": 0.0272, + "step": 4395 + }, + { + "epoch": 1.4680247119719485, + "grad_norm": 0.36103894659828506, + "learning_rate": 6.05026417025745e-06, + "loss": 0.0445, + "step": 4396 + }, + { + "epoch": 1.4683586575388212, + "grad_norm": 0.3890491899135918, + "learning_rate": 6.0483641133487736e-06, + "loss": 0.0349, + "step": 4397 + }, + { + "epoch": 1.4686926031056937, + "grad_norm": 0.41602537627815867, + "learning_rate": 6.046463898072351e-06, + "loss": 0.0252, + "step": 4398 + }, + { + "epoch": 1.4690265486725664, + "grad_norm": 0.6512734708596248, + "learning_rate": 6.044563524715237e-06, + "loss": 0.0358, + "step": 4399 + }, + { + "epoch": 1.469360494239439, + "grad_norm": 0.27057183116247335, + "learning_rate": 6.042662993564503e-06, + "loss": 0.0236, + "step": 4400 + }, + { + "epoch": 1.4696944398063116, + "grad_norm": 0.2517911938451049, + "learning_rate": 6.040762304907246e-06, + "loss": 0.0214, + "step": 4401 + }, + { + "epoch": 1.4700283853731841, + "grad_norm": 0.30410284603534155, + "learning_rate": 6.038861459030588e-06, + "loss": 0.0231, + "step": 4402 + }, + { + "epoch": 1.4703623309400569, + "grad_norm": 0.25305959236280945, + "learning_rate": 6.036960456221677e-06, + "loss": 0.0228, + "step": 4403 + }, + { + "epoch": 1.4706962765069294, + "grad_norm": 0.35791353127906433, + "learning_rate": 6.035059296767676e-06, + "loss": 0.0259, + "step": 4404 + }, + { + "epoch": 1.4710302220738019, + "grad_norm": 0.36887185957430985, + "learning_rate": 6.033157980955782e-06, + "loss": 0.0261, + "step": 4405 + }, + { + "epoch": 1.4713641676406746, + "grad_norm": 0.26456754460765436, + "learning_rate": 6.0312565090732115e-06, + "loss": 0.0219, + "step": 4406 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.26907973736125956, + "learning_rate": 6.0293548814072004e-06, + "loss": 0.026, + "step": 4407 + }, + { + "epoch": 1.4720320587744198, + "grad_norm": 0.28627901760653973, + "learning_rate": 6.0274530982450155e-06, + "loss": 0.0264, + "step": 4408 + }, + { + "epoch": 1.4723660043412923, + "grad_norm": 0.37333216269323416, + "learning_rate": 6.025551159873941e-06, + "loss": 0.0307, + "step": 4409 + }, + { + "epoch": 1.472699949908165, + "grad_norm": 0.3661677265346944, + "learning_rate": 6.023649066581288e-06, + "loss": 0.0324, + "step": 4410 + }, + { + "epoch": 1.4730338954750375, + "grad_norm": 0.3341440260192572, + "learning_rate": 6.021746818654393e-06, + "loss": 0.0268, + "step": 4411 + }, + { + "epoch": 1.4733678410419102, + "grad_norm": 0.31758251491581163, + "learning_rate": 6.019844416380609e-06, + "loss": 0.0352, + "step": 4412 + }, + { + "epoch": 1.4737017866087827, + "grad_norm": 0.29246718704932134, + "learning_rate": 6.017941860047318e-06, + "loss": 0.0196, + "step": 4413 + }, + { + "epoch": 1.4740357321756554, + "grad_norm": 0.38209107163761036, + "learning_rate": 6.016039149941924e-06, + "loss": 0.0241, + "step": 4414 + }, + { + "epoch": 1.474369677742528, + "grad_norm": 0.28155226219443474, + "learning_rate": 6.01413628635185e-06, + "loss": 0.0267, + "step": 4415 + }, + { + "epoch": 1.4747036233094006, + "grad_norm": 0.29575840316153873, + "learning_rate": 6.012233269564551e-06, + "loss": 0.0265, + "step": 4416 + }, + { + "epoch": 1.4750375688762731, + "grad_norm": 0.2775488169569573, + "learning_rate": 6.010330099867497e-06, + "loss": 0.021, + "step": 4417 + }, + { + "epoch": 1.4753715144431458, + "grad_norm": 0.3206485863371625, + "learning_rate": 6.008426777548186e-06, + "loss": 0.028, + "step": 4418 + }, + { + "epoch": 1.4757054600100183, + "grad_norm": 0.31890744967601664, + "learning_rate": 6.0065233028941365e-06, + "loss": 0.0236, + "step": 4419 + }, + { + "epoch": 1.4760394055768908, + "grad_norm": 0.23284350322239286, + "learning_rate": 6.00461967619289e-06, + "loss": 0.0249, + "step": 4420 + }, + { + "epoch": 1.4763733511437636, + "grad_norm": 0.24372449701774504, + "learning_rate": 6.002715897732013e-06, + "loss": 0.0173, + "step": 4421 + }, + { + "epoch": 1.4767072967106363, + "grad_norm": 0.3406960607347691, + "learning_rate": 6.000811967799092e-06, + "loss": 0.0267, + "step": 4422 + }, + { + "epoch": 1.4770412422775088, + "grad_norm": 0.31046737270932906, + "learning_rate": 5.99890788668174e-06, + "loss": 0.0253, + "step": 4423 + }, + { + "epoch": 1.4773751878443813, + "grad_norm": 0.263792987727194, + "learning_rate": 5.997003654667589e-06, + "loss": 0.0231, + "step": 4424 + }, + { + "epoch": 1.477709133411254, + "grad_norm": 0.2736562632219921, + "learning_rate": 5.995099272044298e-06, + "loss": 0.0239, + "step": 4425 + }, + { + "epoch": 1.4780430789781267, + "grad_norm": 0.39782948402809115, + "learning_rate": 5.9931947390995435e-06, + "loss": 0.0314, + "step": 4426 + }, + { + "epoch": 1.4783770245449992, + "grad_norm": 0.30590420348196146, + "learning_rate": 5.99129005612103e-06, + "loss": 0.0329, + "step": 4427 + }, + { + "epoch": 1.4787109701118717, + "grad_norm": 0.275694254754224, + "learning_rate": 5.989385223396482e-06, + "loss": 0.0231, + "step": 4428 + }, + { + "epoch": 1.4790449156787444, + "grad_norm": 0.2446036022092954, + "learning_rate": 5.987480241213646e-06, + "loss": 0.0234, + "step": 4429 + }, + { + "epoch": 1.479378861245617, + "grad_norm": 0.6236265871612486, + "learning_rate": 5.985575109860292e-06, + "loss": 0.042, + "step": 4430 + }, + { + "epoch": 1.4797128068124896, + "grad_norm": 0.4350835063370474, + "learning_rate": 5.983669829624214e-06, + "loss": 0.0296, + "step": 4431 + }, + { + "epoch": 1.4800467523793621, + "grad_norm": 0.26353919474889465, + "learning_rate": 5.981764400793224e-06, + "loss": 0.0235, + "step": 4432 + }, + { + "epoch": 1.4803806979462348, + "grad_norm": 0.26317645607539303, + "learning_rate": 5.9798588236551626e-06, + "loss": 0.0203, + "step": 4433 + }, + { + "epoch": 1.4807146435131073, + "grad_norm": 0.28637973774053704, + "learning_rate": 5.977953098497889e-06, + "loss": 0.0239, + "step": 4434 + }, + { + "epoch": 1.4810485890799798, + "grad_norm": 0.27151618029180996, + "learning_rate": 5.976047225609284e-06, + "loss": 0.0242, + "step": 4435 + }, + { + "epoch": 1.4813825346468525, + "grad_norm": 0.23468847255267425, + "learning_rate": 5.974141205277253e-06, + "loss": 0.0212, + "step": 4436 + }, + { + "epoch": 1.4817164802137253, + "grad_norm": 0.32883916141243086, + "learning_rate": 5.972235037789723e-06, + "loss": 0.0271, + "step": 4437 + }, + { + "epoch": 1.4820504257805978, + "grad_norm": 0.24546122673487103, + "learning_rate": 5.970328723434642e-06, + "loss": 0.022, + "step": 4438 + }, + { + "epoch": 1.4823843713474703, + "grad_norm": 0.30865368447552105, + "learning_rate": 5.968422262499983e-06, + "loss": 0.0233, + "step": 4439 + }, + { + "epoch": 1.482718316914343, + "grad_norm": 0.3202865420587498, + "learning_rate": 5.966515655273739e-06, + "loss": 0.0263, + "step": 4440 + }, + { + "epoch": 1.4830522624812157, + "grad_norm": 0.2924941338804397, + "learning_rate": 5.9646089020439245e-06, + "loss": 0.0256, + "step": 4441 + }, + { + "epoch": 1.4833862080480882, + "grad_norm": 0.239778534488851, + "learning_rate": 5.962702003098576e-06, + "loss": 0.0171, + "step": 4442 + }, + { + "epoch": 1.4837201536149607, + "grad_norm": 0.27248970962594404, + "learning_rate": 5.960794958725756e-06, + "loss": 0.0267, + "step": 4443 + }, + { + "epoch": 1.4840540991818334, + "grad_norm": 0.5668079145755521, + "learning_rate": 5.958887769213544e-06, + "loss": 0.0252, + "step": 4444 + }, + { + "epoch": 1.484388044748706, + "grad_norm": 0.2893695129438283, + "learning_rate": 5.956980434850044e-06, + "loss": 0.022, + "step": 4445 + }, + { + "epoch": 1.4847219903155786, + "grad_norm": 0.260698276431372, + "learning_rate": 5.955072955923381e-06, + "loss": 0.0266, + "step": 4446 + }, + { + "epoch": 1.485055935882451, + "grad_norm": 0.25696206994345266, + "learning_rate": 5.9531653327217035e-06, + "loss": 0.0221, + "step": 4447 + }, + { + "epoch": 1.4853898814493238, + "grad_norm": 0.25981626325338775, + "learning_rate": 5.951257565533177e-06, + "loss": 0.0241, + "step": 4448 + }, + { + "epoch": 1.4857238270161963, + "grad_norm": 0.3447712293620019, + "learning_rate": 5.949349654645997e-06, + "loss": 0.0276, + "step": 4449 + }, + { + "epoch": 1.486057772583069, + "grad_norm": 0.378301455185927, + "learning_rate": 5.947441600348373e-06, + "loss": 0.0472, + "step": 4450 + }, + { + "epoch": 1.4863917181499415, + "grad_norm": 0.48273204515525314, + "learning_rate": 5.945533402928537e-06, + "loss": 0.0458, + "step": 4451 + }, + { + "epoch": 1.4867256637168142, + "grad_norm": 0.3150713667270021, + "learning_rate": 5.9436250626747505e-06, + "loss": 0.0467, + "step": 4452 + }, + { + "epoch": 1.4870596092836867, + "grad_norm": 0.29999024142010366, + "learning_rate": 5.941716579875286e-06, + "loss": 0.0278, + "step": 4453 + }, + { + "epoch": 1.4873935548505592, + "grad_norm": 0.2847484281739037, + "learning_rate": 5.939807954818443e-06, + "loss": 0.0267, + "step": 4454 + }, + { + "epoch": 1.487727500417432, + "grad_norm": 1.1781087472656608, + "learning_rate": 5.937899187792544e-06, + "loss": 0.0407, + "step": 4455 + }, + { + "epoch": 1.4880614459843047, + "grad_norm": 0.23953326319386384, + "learning_rate": 5.935990279085928e-06, + "loss": 0.0212, + "step": 4456 + }, + { + "epoch": 1.4883953915511772, + "grad_norm": 0.365271084860815, + "learning_rate": 5.93408122898696e-06, + "loss": 0.0284, + "step": 4457 + }, + { + "epoch": 1.4887293371180497, + "grad_norm": 0.22194315158527386, + "learning_rate": 5.9321720377840245e-06, + "loss": 0.0185, + "step": 4458 + }, + { + "epoch": 1.4890632826849224, + "grad_norm": 0.2578458646307319, + "learning_rate": 5.930262705765526e-06, + "loss": 0.0197, + "step": 4459 + }, + { + "epoch": 1.4893972282517949, + "grad_norm": 0.3628195999931555, + "learning_rate": 5.928353233219893e-06, + "loss": 0.0211, + "step": 4460 + }, + { + "epoch": 1.4897311738186676, + "grad_norm": 0.24962598655161936, + "learning_rate": 5.926443620435572e-06, + "loss": 0.0199, + "step": 4461 + }, + { + "epoch": 1.49006511938554, + "grad_norm": 0.3309447927990789, + "learning_rate": 5.924533867701034e-06, + "loss": 0.022, + "step": 4462 + }, + { + "epoch": 1.4903990649524128, + "grad_norm": 1.1256395660318999, + "learning_rate": 5.922623975304771e-06, + "loss": 0.0288, + "step": 4463 + }, + { + "epoch": 1.4907330105192853, + "grad_norm": 0.24791379266010563, + "learning_rate": 5.920713943535291e-06, + "loss": 0.0231, + "step": 4464 + }, + { + "epoch": 1.491066956086158, + "grad_norm": 0.3540328285682308, + "learning_rate": 5.9188037726811285e-06, + "loss": 0.0296, + "step": 4465 + }, + { + "epoch": 1.4914009016530305, + "grad_norm": 0.40262122809029594, + "learning_rate": 5.9168934630308385e-06, + "loss": 0.0295, + "step": 4466 + }, + { + "epoch": 1.4917348472199032, + "grad_norm": 0.2741715077506556, + "learning_rate": 5.914983014872995e-06, + "loss": 0.0271, + "step": 4467 + }, + { + "epoch": 1.4920687927867757, + "grad_norm": 0.28712406032794907, + "learning_rate": 5.9130724284961924e-06, + "loss": 0.0242, + "step": 4468 + }, + { + "epoch": 1.4924027383536482, + "grad_norm": 0.2788407148776453, + "learning_rate": 5.91116170418905e-06, + "loss": 0.0243, + "step": 4469 + }, + { + "epoch": 1.492736683920521, + "grad_norm": 0.3824905570351801, + "learning_rate": 5.909250842240203e-06, + "loss": 0.0286, + "step": 4470 + }, + { + "epoch": 1.4930706294873937, + "grad_norm": 0.4407368792854273, + "learning_rate": 5.907339842938309e-06, + "loss": 0.0171, + "step": 4471 + }, + { + "epoch": 1.4934045750542662, + "grad_norm": 0.3349997431603896, + "learning_rate": 5.90542870657205e-06, + "loss": 0.0361, + "step": 4472 + }, + { + "epoch": 1.4937385206211387, + "grad_norm": 0.31415251690410684, + "learning_rate": 5.903517433430123e-06, + "loss": 0.0209, + "step": 4473 + }, + { + "epoch": 1.4940724661880114, + "grad_norm": 0.38454030743164036, + "learning_rate": 5.901606023801248e-06, + "loss": 0.027, + "step": 4474 + }, + { + "epoch": 1.494406411754884, + "grad_norm": 0.3702022580921498, + "learning_rate": 5.899694477974168e-06, + "loss": 0.0314, + "step": 4475 + }, + { + "epoch": 1.4947403573217566, + "grad_norm": 0.8169004780549912, + "learning_rate": 5.897782796237645e-06, + "loss": 0.0368, + "step": 4476 + }, + { + "epoch": 1.495074302888629, + "grad_norm": 0.3345481747212127, + "learning_rate": 5.895870978880457e-06, + "loss": 0.0292, + "step": 4477 + }, + { + "epoch": 1.4954082484555018, + "grad_norm": 0.24816626634703076, + "learning_rate": 5.89395902619141e-06, + "loss": 0.0204, + "step": 4478 + }, + { + "epoch": 1.4957421940223743, + "grad_norm": 0.2574079017165514, + "learning_rate": 5.892046938459327e-06, + "loss": 0.021, + "step": 4479 + }, + { + "epoch": 1.496076139589247, + "grad_norm": 0.36519695983560385, + "learning_rate": 5.890134715973049e-06, + "loss": 0.034, + "step": 4480 + }, + { + "epoch": 1.4964100851561195, + "grad_norm": 0.354264536613751, + "learning_rate": 5.888222359021443e-06, + "loss": 0.0285, + "step": 4481 + }, + { + "epoch": 1.4967440307229922, + "grad_norm": 0.4296920160950373, + "learning_rate": 5.8863098678933896e-06, + "loss": 0.0319, + "step": 4482 + }, + { + "epoch": 1.4970779762898647, + "grad_norm": 0.5234360225964875, + "learning_rate": 5.884397242877795e-06, + "loss": 0.033, + "step": 4483 + }, + { + "epoch": 1.4974119218567372, + "grad_norm": 0.44551044784097543, + "learning_rate": 5.882484484263584e-06, + "loss": 0.0294, + "step": 4484 + }, + { + "epoch": 1.49774586742361, + "grad_norm": 0.23448900773796297, + "learning_rate": 5.8805715923397e-06, + "loss": 0.02, + "step": 4485 + }, + { + "epoch": 1.4980798129904827, + "grad_norm": 0.30741229578954143, + "learning_rate": 5.87865856739511e-06, + "loss": 0.0288, + "step": 4486 + }, + { + "epoch": 1.4984137585573551, + "grad_norm": 0.23931479236630407, + "learning_rate": 5.876745409718796e-06, + "loss": 0.0175, + "step": 4487 + }, + { + "epoch": 1.4987477041242276, + "grad_norm": 0.19925632275590355, + "learning_rate": 5.874832119599766e-06, + "loss": 0.0131, + "step": 4488 + }, + { + "epoch": 1.4990816496911004, + "grad_norm": 0.29390092263538264, + "learning_rate": 5.872918697327042e-06, + "loss": 0.0273, + "step": 4489 + }, + { + "epoch": 1.499415595257973, + "grad_norm": 0.3195610194040288, + "learning_rate": 5.871005143189671e-06, + "loss": 0.0357, + "step": 4490 + }, + { + "epoch": 1.4997495408248456, + "grad_norm": 0.3002328256551994, + "learning_rate": 5.869091457476718e-06, + "loss": 0.0273, + "step": 4491 + }, + { + "epoch": 1.500083486391718, + "grad_norm": 0.2754902250643781, + "learning_rate": 5.8671776404772655e-06, + "loss": 0.0229, + "step": 4492 + }, + { + "epoch": 1.5004174319585908, + "grad_norm": 0.30637840916966047, + "learning_rate": 5.8652636924804206e-06, + "loss": 0.0322, + "step": 4493 + }, + { + "epoch": 1.5007513775254635, + "grad_norm": 0.29614098612626816, + "learning_rate": 5.863349613775308e-06, + "loss": 0.0274, + "step": 4494 + }, + { + "epoch": 1.501085323092336, + "grad_norm": 0.2593370898495482, + "learning_rate": 5.861435404651068e-06, + "loss": 0.0216, + "step": 4495 + }, + { + "epoch": 1.5014192686592085, + "grad_norm": 0.24726160704571823, + "learning_rate": 5.859521065396869e-06, + "loss": 0.0255, + "step": 4496 + }, + { + "epoch": 1.5017532142260812, + "grad_norm": 0.36087923078253414, + "learning_rate": 5.857606596301892e-06, + "loss": 0.0219, + "step": 4497 + }, + { + "epoch": 1.5020871597929537, + "grad_norm": 0.29399012235997096, + "learning_rate": 5.85569199765534e-06, + "loss": 0.0264, + "step": 4498 + }, + { + "epoch": 1.5024211053598262, + "grad_norm": 0.283003119902052, + "learning_rate": 5.853777269746438e-06, + "loss": 0.0181, + "step": 4499 + }, + { + "epoch": 1.502755050926699, + "grad_norm": 0.2295637318903182, + "learning_rate": 5.851862412864426e-06, + "loss": 0.0189, + "step": 4500 + }, + { + "epoch": 1.5030889964935716, + "grad_norm": 0.22368830141025758, + "learning_rate": 5.8499474272985654e-06, + "loss": 0.0199, + "step": 4501 + }, + { + "epoch": 1.5034229420604441, + "grad_norm": 0.3658507875277788, + "learning_rate": 5.848032313338139e-06, + "loss": 0.0283, + "step": 4502 + }, + { + "epoch": 1.5037568876273166, + "grad_norm": 0.2717646343434779, + "learning_rate": 5.846117071272444e-06, + "loss": 0.0264, + "step": 4503 + }, + { + "epoch": 1.5040908331941893, + "grad_norm": 0.2963475545938359, + "learning_rate": 5.844201701390806e-06, + "loss": 0.0327, + "step": 4504 + }, + { + "epoch": 1.504424778761062, + "grad_norm": 0.42533245553802934, + "learning_rate": 5.842286203982559e-06, + "loss": 0.0225, + "step": 4505 + }, + { + "epoch": 1.5047587243279346, + "grad_norm": 0.347760145868081, + "learning_rate": 5.840370579337063e-06, + "loss": 0.0319, + "step": 4506 + }, + { + "epoch": 1.505092669894807, + "grad_norm": 0.35348437597389887, + "learning_rate": 5.838454827743697e-06, + "loss": 0.0281, + "step": 4507 + }, + { + "epoch": 1.5054266154616798, + "grad_norm": 0.33239623968706344, + "learning_rate": 5.8365389494918565e-06, + "loss": 0.0234, + "step": 4508 + }, + { + "epoch": 1.5057605610285525, + "grad_norm": 0.27395724996539655, + "learning_rate": 5.834622944870959e-06, + "loss": 0.0248, + "step": 4509 + }, + { + "epoch": 1.506094506595425, + "grad_norm": 0.28594646026098314, + "learning_rate": 5.832706814170437e-06, + "loss": 0.0212, + "step": 4510 + }, + { + "epoch": 1.5064284521622975, + "grad_norm": 0.5007176706738875, + "learning_rate": 5.830790557679746e-06, + "loss": 0.0316, + "step": 4511 + }, + { + "epoch": 1.5067623977291702, + "grad_norm": 0.3828671663025841, + "learning_rate": 5.8288741756883585e-06, + "loss": 0.0361, + "step": 4512 + }, + { + "epoch": 1.5070963432960427, + "grad_norm": 0.3299652216663339, + "learning_rate": 5.826957668485768e-06, + "loss": 0.0226, + "step": 4513 + }, + { + "epoch": 1.5074302888629152, + "grad_norm": 0.2473818410219272, + "learning_rate": 5.825041036361484e-06, + "loss": 0.0184, + "step": 4514 + }, + { + "epoch": 1.507764234429788, + "grad_norm": 0.3635681183868704, + "learning_rate": 5.823124279605037e-06, + "loss": 0.0417, + "step": 4515 + }, + { + "epoch": 1.5080981799966606, + "grad_norm": 0.2588835718038487, + "learning_rate": 5.821207398505976e-06, + "loss": 0.018, + "step": 4516 + }, + { + "epoch": 1.5084321255635331, + "grad_norm": 0.4298291272505506, + "learning_rate": 5.819290393353867e-06, + "loss": 0.0459, + "step": 4517 + }, + { + "epoch": 1.5087660711304056, + "grad_norm": 0.42770580956101134, + "learning_rate": 5.817373264438297e-06, + "loss": 0.0314, + "step": 4518 + }, + { + "epoch": 1.5091000166972783, + "grad_norm": 0.2719253002527101, + "learning_rate": 5.815456012048873e-06, + "loss": 0.0225, + "step": 4519 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.29592703075543736, + "learning_rate": 5.8135386364752154e-06, + "loss": 0.0237, + "step": 4520 + }, + { + "epoch": 1.5097679078310235, + "grad_norm": 0.3969604385518994, + "learning_rate": 5.8116211380069675e-06, + "loss": 0.0282, + "step": 4521 + }, + { + "epoch": 1.510101853397896, + "grad_norm": 0.3272548522243053, + "learning_rate": 5.809703516933791e-06, + "loss": 0.0204, + "step": 4522 + }, + { + "epoch": 1.5104357989647688, + "grad_norm": 0.28527999186507796, + "learning_rate": 5.807785773545364e-06, + "loss": 0.0242, + "step": 4523 + }, + { + "epoch": 1.5107697445316415, + "grad_norm": 0.3083549976096167, + "learning_rate": 5.805867908131384e-06, + "loss": 0.024, + "step": 4524 + }, + { + "epoch": 1.511103690098514, + "grad_norm": 0.2796754045800017, + "learning_rate": 5.803949920981568e-06, + "loss": 0.0216, + "step": 4525 + }, + { + "epoch": 1.5114376356653865, + "grad_norm": 0.3457570056577854, + "learning_rate": 5.802031812385651e-06, + "loss": 0.0265, + "step": 4526 + }, + { + "epoch": 1.5117715812322592, + "grad_norm": 0.6303018675821321, + "learning_rate": 5.800113582633384e-06, + "loss": 0.0337, + "step": 4527 + }, + { + "epoch": 1.512105526799132, + "grad_norm": 0.25098579911670743, + "learning_rate": 5.7981952320145405e-06, + "loss": 0.0247, + "step": 4528 + }, + { + "epoch": 1.5124394723660042, + "grad_norm": 0.2910769566131532, + "learning_rate": 5.796276760818908e-06, + "loss": 0.0226, + "step": 4529 + }, + { + "epoch": 1.512773417932877, + "grad_norm": 0.38280059234355135, + "learning_rate": 5.794358169336295e-06, + "loss": 0.0291, + "step": 4530 + }, + { + "epoch": 1.5131073634997496, + "grad_norm": 0.3133082887840216, + "learning_rate": 5.792439457856528e-06, + "loss": 0.0318, + "step": 4531 + }, + { + "epoch": 1.513441309066622, + "grad_norm": 0.2964147026248281, + "learning_rate": 5.790520626669449e-06, + "loss": 0.0269, + "step": 4532 + }, + { + "epoch": 1.5137752546334946, + "grad_norm": 0.3291293657386248, + "learning_rate": 5.788601676064922e-06, + "loss": 0.0244, + "step": 4533 + }, + { + "epoch": 1.5141092002003673, + "grad_norm": 0.2534649953871449, + "learning_rate": 5.786682606332827e-06, + "loss": 0.021, + "step": 4534 + }, + { + "epoch": 1.51444314576724, + "grad_norm": 0.3956137234206681, + "learning_rate": 5.78476341776306e-06, + "loss": 0.0399, + "step": 4535 + }, + { + "epoch": 1.5147770913341125, + "grad_norm": 0.3521007059702409, + "learning_rate": 5.782844110645539e-06, + "loss": 0.0225, + "step": 4536 + }, + { + "epoch": 1.515111036900985, + "grad_norm": 0.3187280652722967, + "learning_rate": 5.780924685270198e-06, + "loss": 0.0263, + "step": 4537 + }, + { + "epoch": 1.5154449824678577, + "grad_norm": 0.2713978210437977, + "learning_rate": 5.779005141926988e-06, + "loss": 0.0265, + "step": 4538 + }, + { + "epoch": 1.5157789280347305, + "grad_norm": 0.24634928031527029, + "learning_rate": 5.777085480905877e-06, + "loss": 0.0216, + "step": 4539 + }, + { + "epoch": 1.516112873601603, + "grad_norm": 0.27789043202449926, + "learning_rate": 5.7751657024968565e-06, + "loss": 0.0299, + "step": 4540 + }, + { + "epoch": 1.5164468191684755, + "grad_norm": 0.4188611767754047, + "learning_rate": 5.773245806989929e-06, + "loss": 0.0379, + "step": 4541 + }, + { + "epoch": 1.5167807647353482, + "grad_norm": 0.3115149618402707, + "learning_rate": 5.771325794675117e-06, + "loss": 0.0253, + "step": 4542 + }, + { + "epoch": 1.517114710302221, + "grad_norm": 0.2560699542001376, + "learning_rate": 5.769405665842461e-06, + "loss": 0.0185, + "step": 4543 + }, + { + "epoch": 1.5174486558690934, + "grad_norm": 0.3159015963424157, + "learning_rate": 5.767485420782021e-06, + "loss": 0.0265, + "step": 4544 + }, + { + "epoch": 1.5177826014359659, + "grad_norm": 0.3235905206470134, + "learning_rate": 5.7655650597838704e-06, + "loss": 0.0262, + "step": 4545 + }, + { + "epoch": 1.5181165470028386, + "grad_norm": 0.2930813024022368, + "learning_rate": 5.7636445831381034e-06, + "loss": 0.0215, + "step": 4546 + }, + { + "epoch": 1.518450492569711, + "grad_norm": 0.2857122199031493, + "learning_rate": 5.761723991134831e-06, + "loss": 0.029, + "step": 4547 + }, + { + "epoch": 1.5187844381365836, + "grad_norm": 0.29260302770460334, + "learning_rate": 5.759803284064181e-06, + "loss": 0.0244, + "step": 4548 + }, + { + "epoch": 1.5191183837034563, + "grad_norm": 0.2531697490963169, + "learning_rate": 5.757882462216299e-06, + "loss": 0.0219, + "step": 4549 + }, + { + "epoch": 1.519452329270329, + "grad_norm": 0.5624367886881112, + "learning_rate": 5.755961525881345e-06, + "loss": 0.0275, + "step": 4550 + }, + { + "epoch": 1.5197862748372015, + "grad_norm": 0.4082335546430365, + "learning_rate": 5.7540404753495034e-06, + "loss": 0.0344, + "step": 4551 + }, + { + "epoch": 1.520120220404074, + "grad_norm": 0.3735604674461184, + "learning_rate": 5.75211931091097e-06, + "loss": 0.0454, + "step": 4552 + }, + { + "epoch": 1.5204541659709467, + "grad_norm": 0.2818705851229258, + "learning_rate": 5.750198032855956e-06, + "loss": 0.0192, + "step": 4553 + }, + { + "epoch": 1.5207881115378195, + "grad_norm": 0.27003828603018404, + "learning_rate": 5.748276641474698e-06, + "loss": 0.0189, + "step": 4554 + }, + { + "epoch": 1.521122057104692, + "grad_norm": 0.26984539217454406, + "learning_rate": 5.746355137057442e-06, + "loss": 0.0265, + "step": 4555 + }, + { + "epoch": 1.5214560026715644, + "grad_norm": 0.2599567549036431, + "learning_rate": 5.7444335198944555e-06, + "loss": 0.0251, + "step": 4556 + }, + { + "epoch": 1.5217899482384372, + "grad_norm": 0.3143515665008245, + "learning_rate": 5.7425117902760195e-06, + "loss": 0.0244, + "step": 4557 + }, + { + "epoch": 1.5221238938053099, + "grad_norm": 0.3370359114708471, + "learning_rate": 5.7405899484924346e-06, + "loss": 0.0462, + "step": 4558 + }, + { + "epoch": 1.5224578393721824, + "grad_norm": 0.2856420988296376, + "learning_rate": 5.738667994834019e-06, + "loss": 0.0242, + "step": 4559 + }, + { + "epoch": 1.5227917849390549, + "grad_norm": 0.29237378296137156, + "learning_rate": 5.736745929591103e-06, + "loss": 0.0207, + "step": 4560 + }, + { + "epoch": 1.5231257305059276, + "grad_norm": 0.26313380186967383, + "learning_rate": 5.734823753054042e-06, + "loss": 0.0285, + "step": 4561 + }, + { + "epoch": 1.5234596760728, + "grad_norm": 0.25120794571849253, + "learning_rate": 5.732901465513199e-06, + "loss": 0.0211, + "step": 4562 + }, + { + "epoch": 1.5237936216396726, + "grad_norm": 0.26062660804053955, + "learning_rate": 5.73097906725896e-06, + "loss": 0.0185, + "step": 4563 + }, + { + "epoch": 1.5241275672065453, + "grad_norm": 0.24209339902864357, + "learning_rate": 5.729056558581727e-06, + "loss": 0.0217, + "step": 4564 + }, + { + "epoch": 1.524461512773418, + "grad_norm": 0.6038532436961349, + "learning_rate": 5.727133939771915e-06, + "loss": 0.0327, + "step": 4565 + }, + { + "epoch": 1.5247954583402905, + "grad_norm": 0.289937371343371, + "learning_rate": 5.725211211119961e-06, + "loss": 0.0267, + "step": 4566 + }, + { + "epoch": 1.525129403907163, + "grad_norm": 0.247180956626713, + "learning_rate": 5.723288372916315e-06, + "loss": 0.0238, + "step": 4567 + }, + { + "epoch": 1.5254633494740357, + "grad_norm": 0.23066461883261297, + "learning_rate": 5.721365425451442e-06, + "loss": 0.0173, + "step": 4568 + }, + { + "epoch": 1.5257972950409084, + "grad_norm": 0.45808699359164345, + "learning_rate": 5.719442369015828e-06, + "loss": 0.0285, + "step": 4569 + }, + { + "epoch": 1.526131240607781, + "grad_norm": 0.23071180498252666, + "learning_rate": 5.717519203899975e-06, + "loss": 0.0147, + "step": 4570 + }, + { + "epoch": 1.5264651861746534, + "grad_norm": 0.3104740042009261, + "learning_rate": 5.715595930394396e-06, + "loss": 0.029, + "step": 4571 + }, + { + "epoch": 1.5267991317415261, + "grad_norm": 0.3886417927297356, + "learning_rate": 5.713672548789626e-06, + "loss": 0.0333, + "step": 4572 + }, + { + "epoch": 1.5271330773083989, + "grad_norm": 0.31689181981946263, + "learning_rate": 5.711749059376215e-06, + "loss": 0.0248, + "step": 4573 + }, + { + "epoch": 1.5274670228752714, + "grad_norm": 0.268398606883179, + "learning_rate": 5.7098254624447255e-06, + "loss": 0.0257, + "step": 4574 + }, + { + "epoch": 1.5278009684421439, + "grad_norm": 0.3220231489764643, + "learning_rate": 5.707901758285745e-06, + "loss": 0.025, + "step": 4575 + }, + { + "epoch": 1.5281349140090166, + "grad_norm": 0.2688945215404341, + "learning_rate": 5.705977947189868e-06, + "loss": 0.0181, + "step": 4576 + }, + { + "epoch": 1.5284688595758893, + "grad_norm": 0.3202538053443321, + "learning_rate": 5.704054029447708e-06, + "loss": 0.0306, + "step": 4577 + }, + { + "epoch": 1.5288028051427616, + "grad_norm": 0.42388476698429495, + "learning_rate": 5.702130005349899e-06, + "loss": 0.0276, + "step": 4578 + }, + { + "epoch": 1.5291367507096343, + "grad_norm": 0.38182722661325896, + "learning_rate": 5.700205875187084e-06, + "loss": 0.0366, + "step": 4579 + }, + { + "epoch": 1.529470696276507, + "grad_norm": 0.31316755531102236, + "learning_rate": 5.698281639249927e-06, + "loss": 0.022, + "step": 4580 + }, + { + "epoch": 1.5298046418433795, + "grad_norm": 0.2714309659875258, + "learning_rate": 5.696357297829106e-06, + "loss": 0.0231, + "step": 4581 + }, + { + "epoch": 1.530138587410252, + "grad_norm": 0.2712535452604488, + "learning_rate": 5.6944328512153165e-06, + "loss": 0.0237, + "step": 4582 + }, + { + "epoch": 1.5304725329771247, + "grad_norm": 0.3182382636290354, + "learning_rate": 5.692508299699269e-06, + "loss": 0.0275, + "step": 4583 + }, + { + "epoch": 1.5308064785439974, + "grad_norm": 0.238015636588773, + "learning_rate": 5.690583643571687e-06, + "loss": 0.0189, + "step": 4584 + }, + { + "epoch": 1.53114042411087, + "grad_norm": 0.4009935230094596, + "learning_rate": 5.688658883123315e-06, + "loss": 0.0403, + "step": 4585 + }, + { + "epoch": 1.5314743696777424, + "grad_norm": 0.25016289320218604, + "learning_rate": 5.68673401864491e-06, + "loss": 0.0228, + "step": 4586 + }, + { + "epoch": 1.5318083152446151, + "grad_norm": 0.267276585325518, + "learning_rate": 5.684809050427247e-06, + "loss": 0.018, + "step": 4587 + }, + { + "epoch": 1.5321422608114879, + "grad_norm": 0.3936855696850991, + "learning_rate": 5.682883978761111e-06, + "loss": 0.0241, + "step": 4588 + }, + { + "epoch": 1.5324762063783604, + "grad_norm": 0.2877023941912195, + "learning_rate": 5.680958803937311e-06, + "loss": 0.0182, + "step": 4589 + }, + { + "epoch": 1.5328101519452328, + "grad_norm": 0.3574736219479748, + "learning_rate": 5.6790335262466645e-06, + "loss": 0.032, + "step": 4590 + }, + { + "epoch": 1.5331440975121056, + "grad_norm": 0.3512393893416295, + "learning_rate": 5.677108145980008e-06, + "loss": 0.0337, + "step": 4591 + }, + { + "epoch": 1.5334780430789783, + "grad_norm": 0.2925069173622745, + "learning_rate": 5.675182663428196e-06, + "loss": 0.0282, + "step": 4592 + }, + { + "epoch": 1.5338119886458508, + "grad_norm": 0.20860657643540745, + "learning_rate": 5.673257078882091e-06, + "loss": 0.0174, + "step": 4593 + }, + { + "epoch": 1.5341459342127233, + "grad_norm": 0.36380528226657366, + "learning_rate": 5.671331392632577e-06, + "loss": 0.0314, + "step": 4594 + }, + { + "epoch": 1.534479879779596, + "grad_norm": 0.32532710429967887, + "learning_rate": 5.6694056049705506e-06, + "loss": 0.026, + "step": 4595 + }, + { + "epoch": 1.5348138253464685, + "grad_norm": 0.2512125523806715, + "learning_rate": 5.667479716186927e-06, + "loss": 0.0239, + "step": 4596 + }, + { + "epoch": 1.535147770913341, + "grad_norm": 0.313861757323899, + "learning_rate": 5.665553726572631e-06, + "loss": 0.0289, + "step": 4597 + }, + { + "epoch": 1.5354817164802137, + "grad_norm": 0.2554555233620205, + "learning_rate": 5.663627636418611e-06, + "loss": 0.0189, + "step": 4598 + }, + { + "epoch": 1.5358156620470864, + "grad_norm": 0.2700226452622836, + "learning_rate": 5.661701446015821e-06, + "loss": 0.0246, + "step": 4599 + }, + { + "epoch": 1.536149607613959, + "grad_norm": 0.2871125077340285, + "learning_rate": 5.659775155655235e-06, + "loss": 0.0254, + "step": 4600 + }, + { + "epoch": 1.5364835531808314, + "grad_norm": 0.4007450117048198, + "learning_rate": 5.6578487656278446e-06, + "loss": 0.0188, + "step": 4601 + }, + { + "epoch": 1.5368174987477041, + "grad_norm": 0.24601929474702078, + "learning_rate": 5.655922276224652e-06, + "loss": 0.0226, + "step": 4602 + }, + { + "epoch": 1.5371514443145768, + "grad_norm": 0.2495254899972368, + "learning_rate": 5.653995687736676e-06, + "loss": 0.0214, + "step": 4603 + }, + { + "epoch": 1.5374853898814493, + "grad_norm": 0.3368790194729096, + "learning_rate": 5.652069000454951e-06, + "loss": 0.026, + "step": 4604 + }, + { + "epoch": 1.5378193354483218, + "grad_norm": 0.2542367660282918, + "learning_rate": 5.650142214670527e-06, + "loss": 0.0185, + "step": 4605 + }, + { + "epoch": 1.5381532810151946, + "grad_norm": 1.0028351241572269, + "learning_rate": 5.648215330674464e-06, + "loss": 0.0296, + "step": 4606 + }, + { + "epoch": 1.5384872265820673, + "grad_norm": 0.3162108319139779, + "learning_rate": 5.646288348757845e-06, + "loss": 0.0309, + "step": 4607 + }, + { + "epoch": 1.5388211721489398, + "grad_norm": 0.36952837259072446, + "learning_rate": 5.64436126921176e-06, + "loss": 0.0217, + "step": 4608 + }, + { + "epoch": 1.5391551177158123, + "grad_norm": 0.26075476018207117, + "learning_rate": 5.642434092327318e-06, + "loss": 0.0178, + "step": 4609 + }, + { + "epoch": 1.539489063282685, + "grad_norm": 0.2950114033119621, + "learning_rate": 5.640506818395643e-06, + "loss": 0.0254, + "step": 4610 + }, + { + "epoch": 1.5398230088495575, + "grad_norm": 0.34970712078527644, + "learning_rate": 5.638579447707871e-06, + "loss": 0.0233, + "step": 4611 + }, + { + "epoch": 1.54015695441643, + "grad_norm": 0.26247724622548424, + "learning_rate": 5.636651980555153e-06, + "loss": 0.0291, + "step": 4612 + }, + { + "epoch": 1.5404908999833027, + "grad_norm": 0.4277826509695619, + "learning_rate": 5.634724417228658e-06, + "loss": 0.0328, + "step": 4613 + }, + { + "epoch": 1.5408248455501754, + "grad_norm": 0.3751967744435991, + "learning_rate": 5.632796758019566e-06, + "loss": 0.0302, + "step": 4614 + }, + { + "epoch": 1.541158791117048, + "grad_norm": 0.3050719666108031, + "learning_rate": 5.630869003219072e-06, + "loss": 0.0238, + "step": 4615 + }, + { + "epoch": 1.5414927366839204, + "grad_norm": 0.40607807074425567, + "learning_rate": 5.628941153118388e-06, + "loss": 0.0322, + "step": 4616 + }, + { + "epoch": 1.5418266822507931, + "grad_norm": 0.22955755926861565, + "learning_rate": 5.627013208008737e-06, + "loss": 0.0187, + "step": 4617 + }, + { + "epoch": 1.5421606278176658, + "grad_norm": 0.28266256943565177, + "learning_rate": 5.625085168181357e-06, + "loss": 0.0328, + "step": 4618 + }, + { + "epoch": 1.5424945733845383, + "grad_norm": 0.3007434350195192, + "learning_rate": 5.623157033927503e-06, + "loss": 0.0268, + "step": 4619 + }, + { + "epoch": 1.5428285189514108, + "grad_norm": 0.27442694457382366, + "learning_rate": 5.621228805538443e-06, + "loss": 0.0196, + "step": 4620 + }, + { + "epoch": 1.5431624645182835, + "grad_norm": 0.2891208608608508, + "learning_rate": 5.619300483305454e-06, + "loss": 0.0233, + "step": 4621 + }, + { + "epoch": 1.5434964100851563, + "grad_norm": 0.2621804175678766, + "learning_rate": 5.617372067519837e-06, + "loss": 0.0218, + "step": 4622 + }, + { + "epoch": 1.5438303556520288, + "grad_norm": 0.22817704008707426, + "learning_rate": 5.6154435584729e-06, + "loss": 0.0178, + "step": 4623 + }, + { + "epoch": 1.5441643012189012, + "grad_norm": 0.2564681222775103, + "learning_rate": 5.6135149564559665e-06, + "loss": 0.0235, + "step": 4624 + }, + { + "epoch": 1.544498246785774, + "grad_norm": 0.2912918765243214, + "learning_rate": 5.611586261760375e-06, + "loss": 0.0199, + "step": 4625 + }, + { + "epoch": 1.5448321923526467, + "grad_norm": 0.3001962667437202, + "learning_rate": 5.609657474677478e-06, + "loss": 0.0265, + "step": 4626 + }, + { + "epoch": 1.545166137919519, + "grad_norm": 0.37257182980937525, + "learning_rate": 5.607728595498641e-06, + "loss": 0.0268, + "step": 4627 + }, + { + "epoch": 1.5455000834863917, + "grad_norm": 0.25465617494988596, + "learning_rate": 5.6057996245152435e-06, + "loss": 0.0227, + "step": 4628 + }, + { + "epoch": 1.5458340290532644, + "grad_norm": 0.4444859778718718, + "learning_rate": 5.603870562018679e-06, + "loss": 0.0181, + "step": 4629 + }, + { + "epoch": 1.5461679746201369, + "grad_norm": 0.272913967079136, + "learning_rate": 5.601941408300358e-06, + "loss": 0.0298, + "step": 4630 + }, + { + "epoch": 1.5465019201870094, + "grad_norm": 0.2393553678940523, + "learning_rate": 5.600012163651698e-06, + "loss": 0.0179, + "step": 4631 + }, + { + "epoch": 1.546835865753882, + "grad_norm": 0.28587806870771554, + "learning_rate": 5.598082828364134e-06, + "loss": 0.0281, + "step": 4632 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 0.26092692494166153, + "learning_rate": 5.596153402729118e-06, + "loss": 0.026, + "step": 4633 + }, + { + "epoch": 1.5475037568876273, + "grad_norm": 0.2019802370150038, + "learning_rate": 5.594223887038113e-06, + "loss": 0.0215, + "step": 4634 + }, + { + "epoch": 1.5478377024544998, + "grad_norm": 0.40809753995562387, + "learning_rate": 5.592294281582591e-06, + "loss": 0.0192, + "step": 4635 + }, + { + "epoch": 1.5481716480213725, + "grad_norm": 0.21067349549869563, + "learning_rate": 5.590364586654043e-06, + "loss": 0.0167, + "step": 4636 + }, + { + "epoch": 1.5485055935882452, + "grad_norm": 0.4390158755584321, + "learning_rate": 5.588434802543975e-06, + "loss": 0.0263, + "step": 4637 + }, + { + "epoch": 1.5488395391551177, + "grad_norm": 0.25371956661779926, + "learning_rate": 5.5865049295439e-06, + "loss": 0.0193, + "step": 4638 + }, + { + "epoch": 1.5491734847219902, + "grad_norm": 0.34840836287415267, + "learning_rate": 5.584574967945351e-06, + "loss": 0.022, + "step": 4639 + }, + { + "epoch": 1.549507430288863, + "grad_norm": 0.3323124921361567, + "learning_rate": 5.582644918039869e-06, + "loss": 0.0283, + "step": 4640 + }, + { + "epoch": 1.5498413758557357, + "grad_norm": 0.2369539977035479, + "learning_rate": 5.580714780119011e-06, + "loss": 0.0232, + "step": 4641 + }, + { + "epoch": 1.5501753214226082, + "grad_norm": 0.2358771219447518, + "learning_rate": 5.578784554474348e-06, + "loss": 0.018, + "step": 4642 + }, + { + "epoch": 1.5505092669894807, + "grad_norm": 0.7038527335405669, + "learning_rate": 5.5768542413974645e-06, + "loss": 0.0339, + "step": 4643 + }, + { + "epoch": 1.5508432125563534, + "grad_norm": 0.33279921269341783, + "learning_rate": 5.574923841179953e-06, + "loss": 0.0262, + "step": 4644 + }, + { + "epoch": 1.5511771581232259, + "grad_norm": 0.26103378741182265, + "learning_rate": 5.572993354113429e-06, + "loss": 0.0139, + "step": 4645 + }, + { + "epoch": 1.5515111036900984, + "grad_norm": 0.42961469455510687, + "learning_rate": 5.5710627804895105e-06, + "loss": 0.0382, + "step": 4646 + }, + { + "epoch": 1.551845049256971, + "grad_norm": 0.3138532263978365, + "learning_rate": 5.569132120599834e-06, + "loss": 0.0213, + "step": 4647 + }, + { + "epoch": 1.5521789948238438, + "grad_norm": 0.2921545662946304, + "learning_rate": 5.567201374736051e-06, + "loss": 0.0207, + "step": 4648 + }, + { + "epoch": 1.5525129403907163, + "grad_norm": 0.3590977867771232, + "learning_rate": 5.565270543189821e-06, + "loss": 0.0278, + "step": 4649 + }, + { + "epoch": 1.5528468859575888, + "grad_norm": 0.24965628468729104, + "learning_rate": 5.563339626252819e-06, + "loss": 0.0252, + "step": 4650 + }, + { + "epoch": 1.5531808315244615, + "grad_norm": 0.2114257381053459, + "learning_rate": 5.561408624216734e-06, + "loss": 0.0166, + "step": 4651 + }, + { + "epoch": 1.5535147770913342, + "grad_norm": 0.3148892639908185, + "learning_rate": 5.559477537373267e-06, + "loss": 0.033, + "step": 4652 + }, + { + "epoch": 1.5538487226582067, + "grad_norm": 0.39515797368240496, + "learning_rate": 5.557546366014129e-06, + "loss": 0.0249, + "step": 4653 + }, + { + "epoch": 1.5541826682250792, + "grad_norm": 0.221771025652442, + "learning_rate": 5.555615110431049e-06, + "loss": 0.0195, + "step": 4654 + }, + { + "epoch": 1.554516613791952, + "grad_norm": 0.22627484311750407, + "learning_rate": 5.553683770915763e-06, + "loss": 0.0191, + "step": 4655 + }, + { + "epoch": 1.5548505593588247, + "grad_norm": 0.2843287276844521, + "learning_rate": 5.551752347760023e-06, + "loss": 0.0193, + "step": 4656 + }, + { + "epoch": 1.5551845049256972, + "grad_norm": 0.2767401280559718, + "learning_rate": 5.549820841255597e-06, + "loss": 0.0188, + "step": 4657 + }, + { + "epoch": 1.5555184504925696, + "grad_norm": 0.2782344950701206, + "learning_rate": 5.547889251694257e-06, + "loss": 0.0202, + "step": 4658 + }, + { + "epoch": 1.5558523960594424, + "grad_norm": 0.3133332604155637, + "learning_rate": 5.545957579367795e-06, + "loss": 0.0294, + "step": 4659 + }, + { + "epoch": 1.5561863416263149, + "grad_norm": 0.4599047583382848, + "learning_rate": 5.544025824568011e-06, + "loss": 0.0308, + "step": 4660 + }, + { + "epoch": 1.5565202871931874, + "grad_norm": 0.27903961028820656, + "learning_rate": 5.542093987586722e-06, + "loss": 0.0325, + "step": 4661 + }, + { + "epoch": 1.55685423276006, + "grad_norm": 0.3064781549878294, + "learning_rate": 5.540162068715752e-06, + "loss": 0.0232, + "step": 4662 + }, + { + "epoch": 1.5571881783269328, + "grad_norm": 0.40819545051225026, + "learning_rate": 5.538230068246942e-06, + "loss": 0.0298, + "step": 4663 + }, + { + "epoch": 1.5575221238938053, + "grad_norm": 0.3165798642538438, + "learning_rate": 5.536297986472142e-06, + "loss": 0.0221, + "step": 4664 + }, + { + "epoch": 1.5578560694606778, + "grad_norm": 0.2400764108757089, + "learning_rate": 5.534365823683219e-06, + "loss": 0.0215, + "step": 4665 + }, + { + "epoch": 1.5581900150275505, + "grad_norm": 0.4807705650038244, + "learning_rate": 5.532433580172044e-06, + "loss": 0.0334, + "step": 4666 + }, + { + "epoch": 1.5585239605944232, + "grad_norm": 0.26510184630934774, + "learning_rate": 5.5305012562305075e-06, + "loss": 0.0216, + "step": 4667 + }, + { + "epoch": 1.5588579061612957, + "grad_norm": 0.33302149776035345, + "learning_rate": 5.528568852150511e-06, + "loss": 0.0282, + "step": 4668 + }, + { + "epoch": 1.5591918517281682, + "grad_norm": 0.3316923001526142, + "learning_rate": 5.526636368223965e-06, + "loss": 0.0387, + "step": 4669 + }, + { + "epoch": 1.559525797295041, + "grad_norm": 0.2530947372334988, + "learning_rate": 5.524703804742793e-06, + "loss": 0.0235, + "step": 4670 + }, + { + "epoch": 1.5598597428619136, + "grad_norm": 0.23760674435311052, + "learning_rate": 5.522771161998936e-06, + "loss": 0.0183, + "step": 4671 + }, + { + "epoch": 1.5601936884287861, + "grad_norm": 0.29753277466576084, + "learning_rate": 5.52083844028434e-06, + "loss": 0.0237, + "step": 4672 + }, + { + "epoch": 1.5605276339956586, + "grad_norm": 0.2949765784112042, + "learning_rate": 5.518905639890961e-06, + "loss": 0.0253, + "step": 4673 + }, + { + "epoch": 1.5608615795625314, + "grad_norm": 0.27461339126092194, + "learning_rate": 5.516972761110778e-06, + "loss": 0.0262, + "step": 4674 + }, + { + "epoch": 1.561195525129404, + "grad_norm": 0.27062981712833906, + "learning_rate": 5.515039804235772e-06, + "loss": 0.0215, + "step": 4675 + }, + { + "epoch": 1.5615294706962763, + "grad_norm": 0.23118327380251755, + "learning_rate": 5.51310676955794e-06, + "loss": 0.019, + "step": 4676 + }, + { + "epoch": 1.561863416263149, + "grad_norm": 0.3252735016340356, + "learning_rate": 5.511173657369287e-06, + "loss": 0.0306, + "step": 4677 + }, + { + "epoch": 1.5621973618300218, + "grad_norm": 0.29006622084789985, + "learning_rate": 5.509240467961835e-06, + "loss": 0.0223, + "step": 4678 + }, + { + "epoch": 1.5625313073968943, + "grad_norm": 0.18297376051731895, + "learning_rate": 5.507307201627614e-06, + "loss": 0.0154, + "step": 4679 + }, + { + "epoch": 1.5628652529637668, + "grad_norm": 0.25067390514121257, + "learning_rate": 5.505373858658668e-06, + "loss": 0.0225, + "step": 4680 + }, + { + "epoch": 1.5631991985306395, + "grad_norm": 0.21910683859517935, + "learning_rate": 5.503440439347048e-06, + "loss": 0.0179, + "step": 4681 + }, + { + "epoch": 1.5635331440975122, + "grad_norm": 0.30904611155154754, + "learning_rate": 5.501506943984823e-06, + "loss": 0.0167, + "step": 4682 + }, + { + "epoch": 1.5638670896643847, + "grad_norm": 0.30637023547215625, + "learning_rate": 5.4995733728640695e-06, + "loss": 0.0163, + "step": 4683 + }, + { + "epoch": 1.5642010352312572, + "grad_norm": 0.27583062890681237, + "learning_rate": 5.497639726276876e-06, + "loss": 0.023, + "step": 4684 + }, + { + "epoch": 1.56453498079813, + "grad_norm": 0.3082953985784777, + "learning_rate": 5.49570600451534e-06, + "loss": 0.031, + "step": 4685 + }, + { + "epoch": 1.5648689263650026, + "grad_norm": 0.27738685199879104, + "learning_rate": 5.493772207871577e-06, + "loss": 0.0205, + "step": 4686 + }, + { + "epoch": 1.5652028719318751, + "grad_norm": 0.42280758757405773, + "learning_rate": 5.491838336637708e-06, + "loss": 0.035, + "step": 4687 + }, + { + "epoch": 1.5655368174987476, + "grad_norm": 0.2799160244683536, + "learning_rate": 5.4899043911058665e-06, + "loss": 0.0168, + "step": 4688 + }, + { + "epoch": 1.5658707630656203, + "grad_norm": 0.24153761994573122, + "learning_rate": 5.487970371568199e-06, + "loss": 0.0141, + "step": 4689 + }, + { + "epoch": 1.566204708632493, + "grad_norm": 0.19062471668626987, + "learning_rate": 5.486036278316861e-06, + "loss": 0.0116, + "step": 4690 + }, + { + "epoch": 1.5665386541993656, + "grad_norm": 0.39218107389774626, + "learning_rate": 5.48410211164402e-06, + "loss": 0.0194, + "step": 4691 + }, + { + "epoch": 1.566872599766238, + "grad_norm": 0.33593644567960307, + "learning_rate": 5.482167871841855e-06, + "loss": 0.0308, + "step": 4692 + }, + { + "epoch": 1.5672065453331108, + "grad_norm": 0.2418824940638314, + "learning_rate": 5.480233559202556e-06, + "loss": 0.0218, + "step": 4693 + }, + { + "epoch": 1.5675404908999833, + "grad_norm": 0.3160367765089976, + "learning_rate": 5.4782991740183225e-06, + "loss": 0.034, + "step": 4694 + }, + { + "epoch": 1.5678744364668558, + "grad_norm": 0.25420505987501624, + "learning_rate": 5.476364716581367e-06, + "loss": 0.0193, + "step": 4695 + }, + { + "epoch": 1.5682083820337285, + "grad_norm": 0.24988980694723345, + "learning_rate": 5.474430187183912e-06, + "loss": 0.0215, + "step": 4696 + }, + { + "epoch": 1.5685423276006012, + "grad_norm": 0.3130027566208691, + "learning_rate": 5.472495586118192e-06, + "loss": 0.0285, + "step": 4697 + }, + { + "epoch": 1.5688762731674737, + "grad_norm": 0.3616846271941303, + "learning_rate": 5.47056091367645e-06, + "loss": 0.0254, + "step": 4698 + }, + { + "epoch": 1.5692102187343462, + "grad_norm": 0.3860937318368782, + "learning_rate": 5.468626170150942e-06, + "loss": 0.0292, + "step": 4699 + }, + { + "epoch": 1.569544164301219, + "grad_norm": 0.3273696176668461, + "learning_rate": 5.466691355833932e-06, + "loss": 0.0342, + "step": 4700 + }, + { + "epoch": 1.5698781098680916, + "grad_norm": 0.2990149381635294, + "learning_rate": 5.464756471017696e-06, + "loss": 0.0265, + "step": 4701 + }, + { + "epoch": 1.5702120554349641, + "grad_norm": 0.20533438782692248, + "learning_rate": 5.462821515994525e-06, + "loss": 0.0146, + "step": 4702 + }, + { + "epoch": 1.5705460010018366, + "grad_norm": 0.2766583366827619, + "learning_rate": 5.460886491056714e-06, + "loss": 0.0231, + "step": 4703 + }, + { + "epoch": 1.5708799465687093, + "grad_norm": 0.29272686931138153, + "learning_rate": 5.458951396496572e-06, + "loss": 0.0309, + "step": 4704 + }, + { + "epoch": 1.571213892135582, + "grad_norm": 0.2622278349438255, + "learning_rate": 5.457016232606417e-06, + "loss": 0.0241, + "step": 4705 + }, + { + "epoch": 1.5715478377024545, + "grad_norm": 0.31097621246261814, + "learning_rate": 5.455080999678579e-06, + "loss": 0.0317, + "step": 4706 + }, + { + "epoch": 1.571881783269327, + "grad_norm": 0.3514563774469613, + "learning_rate": 5.453145698005399e-06, + "loss": 0.0283, + "step": 4707 + }, + { + "epoch": 1.5722157288361998, + "grad_norm": 0.23391581133573316, + "learning_rate": 5.451210327879223e-06, + "loss": 0.0212, + "step": 4708 + }, + { + "epoch": 1.5725496744030723, + "grad_norm": 0.31739255467856997, + "learning_rate": 5.449274889592416e-06, + "loss": 0.02, + "step": 4709 + }, + { + "epoch": 1.5728836199699447, + "grad_norm": 0.3555040759906985, + "learning_rate": 5.4473393834373466e-06, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.5732175655368175, + "grad_norm": 0.2935354434700193, + "learning_rate": 5.445403809706395e-06, + "loss": 0.0271, + "step": 4711 + }, + { + "epoch": 1.5735515111036902, + "grad_norm": 0.22900019021369852, + "learning_rate": 5.443468168691954e-06, + "loss": 0.0233, + "step": 4712 + }, + { + "epoch": 1.5738854566705627, + "grad_norm": 0.2423720507858526, + "learning_rate": 5.441532460686426e-06, + "loss": 0.0231, + "step": 4713 + }, + { + "epoch": 1.5742194022374352, + "grad_norm": 0.3058308198831014, + "learning_rate": 5.4395966859822195e-06, + "loss": 0.0283, + "step": 4714 + }, + { + "epoch": 1.574553347804308, + "grad_norm": 0.23322716561321696, + "learning_rate": 5.437660844871758e-06, + "loss": 0.0137, + "step": 4715 + }, + { + "epoch": 1.5748872933711806, + "grad_norm": 0.29576286678046054, + "learning_rate": 5.435724937647473e-06, + "loss": 0.0326, + "step": 4716 + }, + { + "epoch": 1.575221238938053, + "grad_norm": 0.3089105400151078, + "learning_rate": 5.433788964601804e-06, + "loss": 0.0232, + "step": 4717 + }, + { + "epoch": 1.5755551845049256, + "grad_norm": 0.2899807206089013, + "learning_rate": 5.431852926027206e-06, + "loss": 0.0251, + "step": 4718 + }, + { + "epoch": 1.5758891300717983, + "grad_norm": 0.29668329095541673, + "learning_rate": 5.429916822216138e-06, + "loss": 0.0372, + "step": 4719 + }, + { + "epoch": 1.576223075638671, + "grad_norm": 0.23919092453680474, + "learning_rate": 5.42798065346107e-06, + "loss": 0.015, + "step": 4720 + }, + { + "epoch": 1.5765570212055435, + "grad_norm": 0.2786262183787538, + "learning_rate": 5.426044420054488e-06, + "loss": 0.0203, + "step": 4721 + }, + { + "epoch": 1.576890966772416, + "grad_norm": 0.3806863772664518, + "learning_rate": 5.424108122288878e-06, + "loss": 0.034, + "step": 4722 + }, + { + "epoch": 1.5772249123392887, + "grad_norm": 0.29144612056718594, + "learning_rate": 5.4221717604567435e-06, + "loss": 0.0266, + "step": 4723 + }, + { + "epoch": 1.5775588579061615, + "grad_norm": 0.30137680589643145, + "learning_rate": 5.420235334850593e-06, + "loss": 0.0251, + "step": 4724 + }, + { + "epoch": 1.5778928034730337, + "grad_norm": 0.29449817772581827, + "learning_rate": 5.418298845762947e-06, + "loss": 0.0253, + "step": 4725 + }, + { + "epoch": 1.5782267490399065, + "grad_norm": 0.3016360861775003, + "learning_rate": 5.416362293486336e-06, + "loss": 0.0205, + "step": 4726 + }, + { + "epoch": 1.5785606946067792, + "grad_norm": 0.32694696518846245, + "learning_rate": 5.4144256783132975e-06, + "loss": 0.0227, + "step": 4727 + }, + { + "epoch": 1.5788946401736517, + "grad_norm": 0.3042867092000363, + "learning_rate": 5.41248900053638e-06, + "loss": 0.034, + "step": 4728 + }, + { + "epoch": 1.5792285857405242, + "grad_norm": 0.25855069164140154, + "learning_rate": 5.4105522604481435e-06, + "loss": 0.0205, + "step": 4729 + }, + { + "epoch": 1.5795625313073969, + "grad_norm": 0.2916706163829039, + "learning_rate": 5.408615458341152e-06, + "loss": 0.0259, + "step": 4730 + }, + { + "epoch": 1.5798964768742696, + "grad_norm": 0.20060261462918372, + "learning_rate": 5.4066785945079855e-06, + "loss": 0.0178, + "step": 4731 + }, + { + "epoch": 1.580230422441142, + "grad_norm": 0.27302644205906934, + "learning_rate": 5.404741669241228e-06, + "loss": 0.0208, + "step": 4732 + }, + { + "epoch": 1.5805643680080146, + "grad_norm": 0.3215309320982359, + "learning_rate": 5.402804682833477e-06, + "loss": 0.0261, + "step": 4733 + }, + { + "epoch": 1.5808983135748873, + "grad_norm": 0.42220882713573354, + "learning_rate": 5.400867635577335e-06, + "loss": 0.0339, + "step": 4734 + }, + { + "epoch": 1.58123225914176, + "grad_norm": 0.36305931152161164, + "learning_rate": 5.398930527765416e-06, + "loss": 0.0336, + "step": 4735 + }, + { + "epoch": 1.5815662047086325, + "grad_norm": 0.26248823150529205, + "learning_rate": 5.396993359690345e-06, + "loss": 0.025, + "step": 4736 + }, + { + "epoch": 1.581900150275505, + "grad_norm": 0.4569090185707832, + "learning_rate": 5.395056131644752e-06, + "loss": 0.022, + "step": 4737 + }, + { + "epoch": 1.5822340958423777, + "grad_norm": 0.2605050720874279, + "learning_rate": 5.393118843921277e-06, + "loss": 0.0248, + "step": 4738 + }, + { + "epoch": 1.5825680414092504, + "grad_norm": 0.7888558177840966, + "learning_rate": 5.391181496812573e-06, + "loss": 0.0196, + "step": 4739 + }, + { + "epoch": 1.582901986976123, + "grad_norm": 0.2629408746991032, + "learning_rate": 5.389244090611298e-06, + "loss": 0.0244, + "step": 4740 + }, + { + "epoch": 1.5832359325429954, + "grad_norm": 0.41574229723430983, + "learning_rate": 5.38730662561012e-06, + "loss": 0.0294, + "step": 4741 + }, + { + "epoch": 1.5835698781098682, + "grad_norm": 0.3006902731232163, + "learning_rate": 5.385369102101716e-06, + "loss": 0.0214, + "step": 4742 + }, + { + "epoch": 1.5839038236767407, + "grad_norm": 0.5117946854034846, + "learning_rate": 5.38343152037877e-06, + "loss": 0.0298, + "step": 4743 + }, + { + "epoch": 1.5842377692436131, + "grad_norm": 0.2865823988910225, + "learning_rate": 5.38149388073398e-06, + "loss": 0.0271, + "step": 4744 + }, + { + "epoch": 1.5845717148104859, + "grad_norm": 0.2404491870137412, + "learning_rate": 5.379556183460047e-06, + "loss": 0.021, + "step": 4745 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.3456730889133306, + "learning_rate": 5.377618428849683e-06, + "loss": 0.0297, + "step": 4746 + }, + { + "epoch": 1.585239605944231, + "grad_norm": 0.4326588378740707, + "learning_rate": 5.375680617195609e-06, + "loss": 0.0401, + "step": 4747 + }, + { + "epoch": 1.5855735515111036, + "grad_norm": 0.3092114747232085, + "learning_rate": 5.373742748790555e-06, + "loss": 0.0276, + "step": 4748 + }, + { + "epoch": 1.5859074970779763, + "grad_norm": 0.32861060357738203, + "learning_rate": 5.371804823927258e-06, + "loss": 0.0231, + "step": 4749 + }, + { + "epoch": 1.586241442644849, + "grad_norm": 0.3309238635446407, + "learning_rate": 5.369866842898465e-06, + "loss": 0.0302, + "step": 4750 + }, + { + "epoch": 1.5865753882117215, + "grad_norm": 0.273042234665651, + "learning_rate": 5.367928805996929e-06, + "loss": 0.0179, + "step": 4751 + }, + { + "epoch": 1.586909333778594, + "grad_norm": 0.22121126306814043, + "learning_rate": 5.365990713515414e-06, + "loss": 0.0155, + "step": 4752 + }, + { + "epoch": 1.5872432793454667, + "grad_norm": 0.36004708447717365, + "learning_rate": 5.364052565746693e-06, + "loss": 0.0339, + "step": 4753 + }, + { + "epoch": 1.5875772249123394, + "grad_norm": 0.35471488508422894, + "learning_rate": 5.362114362983547e-06, + "loss": 0.0285, + "step": 4754 + }, + { + "epoch": 1.587911170479212, + "grad_norm": 0.3278108083323332, + "learning_rate": 5.360176105518761e-06, + "loss": 0.0252, + "step": 4755 + }, + { + "epoch": 1.5882451160460844, + "grad_norm": 0.5442057996894242, + "learning_rate": 5.358237793645133e-06, + "loss": 0.0259, + "step": 4756 + }, + { + "epoch": 1.5885790616129571, + "grad_norm": 0.3066559517227354, + "learning_rate": 5.356299427655469e-06, + "loss": 0.0248, + "step": 4757 + }, + { + "epoch": 1.5889130071798296, + "grad_norm": 0.3509292080854506, + "learning_rate": 5.354361007842581e-06, + "loss": 0.0238, + "step": 4758 + }, + { + "epoch": 1.5892469527467021, + "grad_norm": 0.3127844158130525, + "learning_rate": 5.352422534499291e-06, + "loss": 0.0209, + "step": 4759 + }, + { + "epoch": 1.5895808983135749, + "grad_norm": 0.25436475877059445, + "learning_rate": 5.350484007918428e-06, + "loss": 0.0288, + "step": 4760 + }, + { + "epoch": 1.5899148438804476, + "grad_norm": 0.26940685127458835, + "learning_rate": 5.3485454283928265e-06, + "loss": 0.0202, + "step": 4761 + }, + { + "epoch": 1.59024878944732, + "grad_norm": 0.29915499194764766, + "learning_rate": 5.346606796215335e-06, + "loss": 0.0227, + "step": 4762 + }, + { + "epoch": 1.5905827350141926, + "grad_norm": 0.3150840168448465, + "learning_rate": 5.344668111678805e-06, + "loss": 0.0289, + "step": 4763 + }, + { + "epoch": 1.5909166805810653, + "grad_norm": 0.29708402447560933, + "learning_rate": 5.3427293750761e-06, + "loss": 0.0293, + "step": 4764 + }, + { + "epoch": 1.591250626147938, + "grad_norm": 0.507848113811238, + "learning_rate": 5.340790586700086e-06, + "loss": 0.0444, + "step": 4765 + }, + { + "epoch": 1.5915845717148105, + "grad_norm": 0.47958756147843334, + "learning_rate": 5.338851746843643e-06, + "loss": 0.0504, + "step": 4766 + }, + { + "epoch": 1.591918517281683, + "grad_norm": 0.3857724170103133, + "learning_rate": 5.336912855799652e-06, + "loss": 0.0333, + "step": 4767 + }, + { + "epoch": 1.5922524628485557, + "grad_norm": 0.3256115713330487, + "learning_rate": 5.334973913861008e-06, + "loss": 0.0298, + "step": 4768 + }, + { + "epoch": 1.5925864084154284, + "grad_norm": 0.3210869938593134, + "learning_rate": 5.33303492132061e-06, + "loss": 0.0245, + "step": 4769 + }, + { + "epoch": 1.592920353982301, + "grad_norm": 0.3027694155959764, + "learning_rate": 5.3310958784713655e-06, + "loss": 0.0224, + "step": 4770 + }, + { + "epoch": 1.5932542995491734, + "grad_norm": 0.3374932600880021, + "learning_rate": 5.329156785606191e-06, + "loss": 0.0272, + "step": 4771 + }, + { + "epoch": 1.5935882451160461, + "grad_norm": 0.19829963870156075, + "learning_rate": 5.327217643018008e-06, + "loss": 0.0188, + "step": 4772 + }, + { + "epoch": 1.5939221906829188, + "grad_norm": 0.2875739802901327, + "learning_rate": 5.325278450999747e-06, + "loss": 0.0285, + "step": 4773 + }, + { + "epoch": 1.5942561362497911, + "grad_norm": 0.28658185117328083, + "learning_rate": 5.323339209844346e-06, + "loss": 0.0225, + "step": 4774 + }, + { + "epoch": 1.5945900818166638, + "grad_norm": 0.20728421149729695, + "learning_rate": 5.32139991984475e-06, + "loss": 0.0188, + "step": 4775 + }, + { + "epoch": 1.5949240273835366, + "grad_norm": 0.23157554397758384, + "learning_rate": 5.319460581293911e-06, + "loss": 0.0245, + "step": 4776 + }, + { + "epoch": 1.595257972950409, + "grad_norm": 0.4322591968780855, + "learning_rate": 5.317521194484791e-06, + "loss": 0.0361, + "step": 4777 + }, + { + "epoch": 1.5955919185172815, + "grad_norm": 0.2646817157765945, + "learning_rate": 5.315581759710356e-06, + "loss": 0.0262, + "step": 4778 + }, + { + "epoch": 1.5959258640841543, + "grad_norm": 0.3301913898104641, + "learning_rate": 5.313642277263577e-06, + "loss": 0.032, + "step": 4779 + }, + { + "epoch": 1.596259809651027, + "grad_norm": 0.2703215674687887, + "learning_rate": 5.311702747437443e-06, + "loss": 0.0237, + "step": 4780 + }, + { + "epoch": 1.5965937552178995, + "grad_norm": 0.2530070537761009, + "learning_rate": 5.309763170524937e-06, + "loss": 0.0201, + "step": 4781 + }, + { + "epoch": 1.596927700784772, + "grad_norm": 0.3446708667156327, + "learning_rate": 5.307823546819056e-06, + "loss": 0.0251, + "step": 4782 + }, + { + "epoch": 1.5972616463516447, + "grad_norm": 0.22083629292761292, + "learning_rate": 5.305883876612805e-06, + "loss": 0.0175, + "step": 4783 + }, + { + "epoch": 1.5975955919185174, + "grad_norm": 0.3487952843616271, + "learning_rate": 5.303944160199193e-06, + "loss": 0.0277, + "step": 4784 + }, + { + "epoch": 1.59792953748539, + "grad_norm": 0.3540184288587415, + "learning_rate": 5.302004397871237e-06, + "loss": 0.0268, + "step": 4785 + }, + { + "epoch": 1.5982634830522624, + "grad_norm": 0.3465534816893311, + "learning_rate": 5.3000645899219594e-06, + "loss": 0.035, + "step": 4786 + }, + { + "epoch": 1.5985974286191351, + "grad_norm": 0.3162003508577011, + "learning_rate": 5.298124736644392e-06, + "loss": 0.0262, + "step": 4787 + }, + { + "epoch": 1.5989313741860078, + "grad_norm": 0.35194628643626913, + "learning_rate": 5.296184838331575e-06, + "loss": 0.0272, + "step": 4788 + }, + { + "epoch": 1.5992653197528803, + "grad_norm": 0.32583954152290284, + "learning_rate": 5.2942448952765495e-06, + "loss": 0.0192, + "step": 4789 + }, + { + "epoch": 1.5995992653197528, + "grad_norm": 0.2805320714680657, + "learning_rate": 5.292304907772367e-06, + "loss": 0.02, + "step": 4790 + }, + { + "epoch": 1.5999332108866255, + "grad_norm": 0.40436549839817487, + "learning_rate": 5.290364876112088e-06, + "loss": 0.0296, + "step": 4791 + }, + { + "epoch": 1.600267156453498, + "grad_norm": 0.24443108313728634, + "learning_rate": 5.288424800588775e-06, + "loss": 0.0195, + "step": 4792 + }, + { + "epoch": 1.6006011020203705, + "grad_norm": 0.2727773278746564, + "learning_rate": 5.2864846814955e-06, + "loss": 0.0249, + "step": 4793 + }, + { + "epoch": 1.6009350475872433, + "grad_norm": 0.23916156551333528, + "learning_rate": 5.28454451912534e-06, + "loss": 0.0212, + "step": 4794 + }, + { + "epoch": 1.601268993154116, + "grad_norm": 0.3076333340386845, + "learning_rate": 5.28260431377138e-06, + "loss": 0.0223, + "step": 4795 + }, + { + "epoch": 1.6016029387209885, + "grad_norm": 0.6434255776259884, + "learning_rate": 5.280664065726712e-06, + "loss": 0.0325, + "step": 4796 + }, + { + "epoch": 1.601936884287861, + "grad_norm": 0.2842031209752502, + "learning_rate": 5.278723775284432e-06, + "loss": 0.0297, + "step": 4797 + }, + { + "epoch": 1.6022708298547337, + "grad_norm": 0.25222956898096344, + "learning_rate": 5.276783442737642e-06, + "loss": 0.0255, + "step": 4798 + }, + { + "epoch": 1.6026047754216064, + "grad_norm": 0.321502376766453, + "learning_rate": 5.274843068379456e-06, + "loss": 0.0307, + "step": 4799 + }, + { + "epoch": 1.602938720988479, + "grad_norm": 0.29430310303389207, + "learning_rate": 5.272902652502988e-06, + "loss": 0.0237, + "step": 4800 + }, + { + "epoch": 1.6032726665553514, + "grad_norm": 0.27058933831977283, + "learning_rate": 5.27096219540136e-06, + "loss": 0.0305, + "step": 4801 + }, + { + "epoch": 1.603606612122224, + "grad_norm": 0.2555684774059336, + "learning_rate": 5.269021697367702e-06, + "loss": 0.0242, + "step": 4802 + }, + { + "epoch": 1.6039405576890968, + "grad_norm": 0.3308202615222362, + "learning_rate": 5.26708115869515e-06, + "loss": 0.0239, + "step": 4803 + }, + { + "epoch": 1.6042745032559693, + "grad_norm": 0.2510960013556946, + "learning_rate": 5.265140579676844e-06, + "loss": 0.0258, + "step": 4804 + }, + { + "epoch": 1.6046084488228418, + "grad_norm": 0.21484759970123984, + "learning_rate": 5.263199960605931e-06, + "loss": 0.0194, + "step": 4805 + }, + { + "epoch": 1.6049423943897145, + "grad_norm": 0.3116786428964571, + "learning_rate": 5.261259301775564e-06, + "loss": 0.034, + "step": 4806 + }, + { + "epoch": 1.605276339956587, + "grad_norm": 0.2277523219865577, + "learning_rate": 5.259318603478904e-06, + "loss": 0.0169, + "step": 4807 + }, + { + "epoch": 1.6056102855234595, + "grad_norm": 0.48014624121360283, + "learning_rate": 5.2573778660091156e-06, + "loss": 0.0371, + "step": 4808 + }, + { + "epoch": 1.6059442310903322, + "grad_norm": 0.29796799196893226, + "learning_rate": 5.255437089659371e-06, + "loss": 0.0293, + "step": 4809 + }, + { + "epoch": 1.606278176657205, + "grad_norm": 0.4735160658342941, + "learning_rate": 5.253496274722846e-06, + "loss": 0.0348, + "step": 4810 + }, + { + "epoch": 1.6066121222240775, + "grad_norm": 0.21864767422674275, + "learning_rate": 5.251555421492722e-06, + "loss": 0.0221, + "step": 4811 + }, + { + "epoch": 1.60694606779095, + "grad_norm": 0.22988882576601047, + "learning_rate": 5.249614530262191e-06, + "loss": 0.0168, + "step": 4812 + }, + { + "epoch": 1.6072800133578227, + "grad_norm": 0.23866870118765524, + "learning_rate": 5.2476736013244475e-06, + "loss": 0.0188, + "step": 4813 + }, + { + "epoch": 1.6076139589246954, + "grad_norm": 0.242067290022722, + "learning_rate": 5.245732634972688e-06, + "loss": 0.0207, + "step": 4814 + }, + { + "epoch": 1.6079479044915679, + "grad_norm": 0.27981419296431, + "learning_rate": 5.243791631500122e-06, + "loss": 0.0256, + "step": 4815 + }, + { + "epoch": 1.6082818500584404, + "grad_norm": 0.2874140162528723, + "learning_rate": 5.24185059119996e-06, + "loss": 0.0182, + "step": 4816 + }, + { + "epoch": 1.608615795625313, + "grad_norm": 0.4656270336515884, + "learning_rate": 5.239909514365415e-06, + "loss": 0.0198, + "step": 4817 + }, + { + "epoch": 1.6089497411921858, + "grad_norm": 0.27341091957596597, + "learning_rate": 5.237968401289717e-06, + "loss": 0.0251, + "step": 4818 + }, + { + "epoch": 1.6092836867590583, + "grad_norm": 0.23066541951949823, + "learning_rate": 5.236027252266088e-06, + "loss": 0.0183, + "step": 4819 + }, + { + "epoch": 1.6096176323259308, + "grad_norm": 0.29781999695816197, + "learning_rate": 5.234086067587765e-06, + "loss": 0.0354, + "step": 4820 + }, + { + "epoch": 1.6099515778928035, + "grad_norm": 0.28720055288235125, + "learning_rate": 5.232144847547983e-06, + "loss": 0.0199, + "step": 4821 + }, + { + "epoch": 1.6102855234596762, + "grad_norm": 0.4228859821305198, + "learning_rate": 5.230203592439989e-06, + "loss": 0.0314, + "step": 4822 + }, + { + "epoch": 1.6106194690265485, + "grad_norm": 0.26080625923864204, + "learning_rate": 5.228262302557034e-06, + "loss": 0.0222, + "step": 4823 + }, + { + "epoch": 1.6109534145934212, + "grad_norm": 0.34423070383950344, + "learning_rate": 5.226320978192369e-06, + "loss": 0.0325, + "step": 4824 + }, + { + "epoch": 1.611287360160294, + "grad_norm": 0.2234218708852561, + "learning_rate": 5.224379619639253e-06, + "loss": 0.016, + "step": 4825 + }, + { + "epoch": 1.6116213057271664, + "grad_norm": 0.4742693452016969, + "learning_rate": 5.222438227190957e-06, + "loss": 0.0224, + "step": 4826 + }, + { + "epoch": 1.611955251294039, + "grad_norm": 0.2688246703311745, + "learning_rate": 5.220496801140746e-06, + "loss": 0.0215, + "step": 4827 + }, + { + "epoch": 1.6122891968609117, + "grad_norm": 0.40459376139778924, + "learning_rate": 5.218555341781897e-06, + "loss": 0.0262, + "step": 4828 + }, + { + "epoch": 1.6126231424277844, + "grad_norm": 0.3428041641425851, + "learning_rate": 5.216613849407691e-06, + "loss": 0.0312, + "step": 4829 + }, + { + "epoch": 1.6129570879946569, + "grad_norm": 0.3605583467383309, + "learning_rate": 5.214672324311412e-06, + "loss": 0.0303, + "step": 4830 + }, + { + "epoch": 1.6132910335615294, + "grad_norm": 0.31557417238903585, + "learning_rate": 5.21273076678635e-06, + "loss": 0.0233, + "step": 4831 + }, + { + "epoch": 1.613624979128402, + "grad_norm": 0.2642680826640008, + "learning_rate": 5.210789177125802e-06, + "loss": 0.0184, + "step": 4832 + }, + { + "epoch": 1.6139589246952748, + "grad_norm": 0.3859219655297724, + "learning_rate": 5.208847555623066e-06, + "loss": 0.0286, + "step": 4833 + }, + { + "epoch": 1.6142928702621473, + "grad_norm": 0.25315683691876983, + "learning_rate": 5.206905902571447e-06, + "loss": 0.0182, + "step": 4834 + }, + { + "epoch": 1.6146268158290198, + "grad_norm": 0.25054761531808073, + "learning_rate": 5.204964218264258e-06, + "loss": 0.0198, + "step": 4835 + }, + { + "epoch": 1.6149607613958925, + "grad_norm": 0.26758594323195806, + "learning_rate": 5.203022502994808e-06, + "loss": 0.0188, + "step": 4836 + }, + { + "epoch": 1.6152947069627652, + "grad_norm": 0.24297502186397885, + "learning_rate": 5.201080757056418e-06, + "loss": 0.0233, + "step": 4837 + }, + { + "epoch": 1.6156286525296377, + "grad_norm": 0.3336641968278252, + "learning_rate": 5.1991389807424145e-06, + "loss": 0.0298, + "step": 4838 + }, + { + "epoch": 1.6159625980965102, + "grad_norm": 0.28801638864308404, + "learning_rate": 5.1971971743461215e-06, + "loss": 0.0218, + "step": 4839 + }, + { + "epoch": 1.616296543663383, + "grad_norm": 0.33992786590422136, + "learning_rate": 5.195255338160873e-06, + "loss": 0.0312, + "step": 4840 + }, + { + "epoch": 1.6166304892302554, + "grad_norm": 0.3300505734606772, + "learning_rate": 5.193313472480007e-06, + "loss": 0.0351, + "step": 4841 + }, + { + "epoch": 1.616964434797128, + "grad_norm": 0.2469267585111408, + "learning_rate": 5.191371577596866e-06, + "loss": 0.0212, + "step": 4842 + }, + { + "epoch": 1.6172983803640006, + "grad_norm": 0.25766521677771825, + "learning_rate": 5.189429653804794e-06, + "loss": 0.0227, + "step": 4843 + }, + { + "epoch": 1.6176323259308734, + "grad_norm": 0.2638100017500362, + "learning_rate": 5.187487701397142e-06, + "loss": 0.0232, + "step": 4844 + }, + { + "epoch": 1.6179662714977459, + "grad_norm": 0.30013885783696354, + "learning_rate": 5.185545720667266e-06, + "loss": 0.0254, + "step": 4845 + }, + { + "epoch": 1.6183002170646184, + "grad_norm": 0.2777395363844827, + "learning_rate": 5.183603711908523e-06, + "loss": 0.0343, + "step": 4846 + }, + { + "epoch": 1.618634162631491, + "grad_norm": 0.25097533760935614, + "learning_rate": 5.181661675414278e-06, + "loss": 0.0218, + "step": 4847 + }, + { + "epoch": 1.6189681081983638, + "grad_norm": 0.3333191072539681, + "learning_rate": 5.179719611477898e-06, + "loss": 0.0292, + "step": 4848 + }, + { + "epoch": 1.6193020537652363, + "grad_norm": 0.2892431012234522, + "learning_rate": 5.1777775203927535e-06, + "loss": 0.0259, + "step": 4849 + }, + { + "epoch": 1.6196359993321088, + "grad_norm": 0.2756012623572237, + "learning_rate": 5.175835402452223e-06, + "loss": 0.0247, + "step": 4850 + }, + { + "epoch": 1.6199699448989815, + "grad_norm": 0.314328113990028, + "learning_rate": 5.173893257949683e-06, + "loss": 0.0237, + "step": 4851 + }, + { + "epoch": 1.6203038904658542, + "grad_norm": 0.35503470207424787, + "learning_rate": 5.17195108717852e-06, + "loss": 0.0343, + "step": 4852 + }, + { + "epoch": 1.6206378360327267, + "grad_norm": 0.33815210771986753, + "learning_rate": 5.170008890432121e-06, + "loss": 0.0242, + "step": 4853 + }, + { + "epoch": 1.6209717815995992, + "grad_norm": 0.34596514073694246, + "learning_rate": 5.168066668003876e-06, + "loss": 0.0299, + "step": 4854 + }, + { + "epoch": 1.621305727166472, + "grad_norm": 0.33130168382659214, + "learning_rate": 5.166124420187182e-06, + "loss": 0.0315, + "step": 4855 + }, + { + "epoch": 1.6216396727333444, + "grad_norm": 0.5759503857143327, + "learning_rate": 5.164182147275439e-06, + "loss": 0.034, + "step": 4856 + }, + { + "epoch": 1.621973618300217, + "grad_norm": 0.371117361447823, + "learning_rate": 5.16223984956205e-06, + "loss": 0.0289, + "step": 4857 + }, + { + "epoch": 1.6223075638670896, + "grad_norm": 0.3252396054377313, + "learning_rate": 5.1602975273404196e-06, + "loss": 0.0333, + "step": 4858 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 0.2675817867545674, + "learning_rate": 5.158355180903961e-06, + "loss": 0.0238, + "step": 4859 + }, + { + "epoch": 1.6229754550008348, + "grad_norm": 0.22672990587307784, + "learning_rate": 5.156412810546089e-06, + "loss": 0.0151, + "step": 4860 + }, + { + "epoch": 1.6233094005677073, + "grad_norm": 0.34311026777566395, + "learning_rate": 5.154470416560219e-06, + "loss": 0.0365, + "step": 4861 + }, + { + "epoch": 1.62364334613458, + "grad_norm": 0.2885105262846136, + "learning_rate": 5.152527999239774e-06, + "loss": 0.0196, + "step": 4862 + }, + { + "epoch": 1.6239772917014528, + "grad_norm": 0.33436878170656337, + "learning_rate": 5.150585558878177e-06, + "loss": 0.0288, + "step": 4863 + }, + { + "epoch": 1.6243112372683253, + "grad_norm": 0.2304555832945819, + "learning_rate": 5.148643095768861e-06, + "loss": 0.0182, + "step": 4864 + }, + { + "epoch": 1.6246451828351978, + "grad_norm": 0.22989453735018273, + "learning_rate": 5.146700610205254e-06, + "loss": 0.0193, + "step": 4865 + }, + { + "epoch": 1.6249791284020705, + "grad_norm": 0.3030183725671978, + "learning_rate": 5.144758102480792e-06, + "loss": 0.0272, + "step": 4866 + }, + { + "epoch": 1.6253130739689432, + "grad_norm": 0.2848636087310223, + "learning_rate": 5.142815572888915e-06, + "loss": 0.0179, + "step": 4867 + }, + { + "epoch": 1.6256470195358157, + "grad_norm": 0.30807782885876955, + "learning_rate": 5.140873021723065e-06, + "loss": 0.0164, + "step": 4868 + }, + { + "epoch": 1.6259809651026882, + "grad_norm": 0.2636911885214975, + "learning_rate": 5.138930449276686e-06, + "loss": 0.0213, + "step": 4869 + }, + { + "epoch": 1.626314910669561, + "grad_norm": 0.3594121242116709, + "learning_rate": 5.136987855843226e-06, + "loss": 0.0335, + "step": 4870 + }, + { + "epoch": 1.6266488562364336, + "grad_norm": 0.30992688499839566, + "learning_rate": 5.135045241716138e-06, + "loss": 0.0272, + "step": 4871 + }, + { + "epoch": 1.626982801803306, + "grad_norm": 0.6901160642035612, + "learning_rate": 5.133102607188875e-06, + "loss": 0.0462, + "step": 4872 + }, + { + "epoch": 1.6273167473701786, + "grad_norm": 0.23929753820696184, + "learning_rate": 5.131159952554896e-06, + "loss": 0.023, + "step": 4873 + }, + { + "epoch": 1.6276506929370513, + "grad_norm": 0.2724424655945263, + "learning_rate": 5.129217278107663e-06, + "loss": 0.0282, + "step": 4874 + }, + { + "epoch": 1.6279846385039238, + "grad_norm": 0.282057777681632, + "learning_rate": 5.127274584140636e-06, + "loss": 0.0213, + "step": 4875 + }, + { + "epoch": 1.6283185840707963, + "grad_norm": 0.3379046637925344, + "learning_rate": 5.125331870947287e-06, + "loss": 0.0296, + "step": 4876 + }, + { + "epoch": 1.628652529637669, + "grad_norm": 0.24309572848826755, + "learning_rate": 5.123389138821084e-06, + "loss": 0.0199, + "step": 4877 + }, + { + "epoch": 1.6289864752045418, + "grad_norm": 0.36315804777175525, + "learning_rate": 5.121446388055497e-06, + "loss": 0.0367, + "step": 4878 + }, + { + "epoch": 1.6293204207714143, + "grad_norm": 0.23689374667209845, + "learning_rate": 5.119503618944004e-06, + "loss": 0.0192, + "step": 4879 + }, + { + "epoch": 1.6296543663382868, + "grad_norm": 0.22554833791099554, + "learning_rate": 5.117560831780082e-06, + "loss": 0.0198, + "step": 4880 + }, + { + "epoch": 1.6299883119051595, + "grad_norm": 0.2855831070144288, + "learning_rate": 5.115618026857211e-06, + "loss": 0.0214, + "step": 4881 + }, + { + "epoch": 1.6303222574720322, + "grad_norm": 0.28459196633380857, + "learning_rate": 5.113675204468876e-06, + "loss": 0.0268, + "step": 4882 + }, + { + "epoch": 1.6306562030389047, + "grad_norm": 0.2986998450636787, + "learning_rate": 5.111732364908564e-06, + "loss": 0.027, + "step": 4883 + }, + { + "epoch": 1.6309901486057772, + "grad_norm": 0.21241646747444373, + "learning_rate": 5.109789508469761e-06, + "loss": 0.019, + "step": 4884 + }, + { + "epoch": 1.63132409417265, + "grad_norm": 0.24206957076505653, + "learning_rate": 5.107846635445962e-06, + "loss": 0.026, + "step": 4885 + }, + { + "epoch": 1.6316580397395226, + "grad_norm": 0.37132477665509145, + "learning_rate": 5.1059037461306586e-06, + "loss": 0.0271, + "step": 4886 + }, + { + "epoch": 1.631991985306395, + "grad_norm": 0.3072025606318378, + "learning_rate": 5.103960840817346e-06, + "loss": 0.0236, + "step": 4887 + }, + { + "epoch": 1.6323259308732676, + "grad_norm": 0.2767586912028166, + "learning_rate": 5.1020179197995245e-06, + "loss": 0.03, + "step": 4888 + }, + { + "epoch": 1.6326598764401403, + "grad_norm": 0.3670096785316321, + "learning_rate": 5.1000749833706964e-06, + "loss": 0.0238, + "step": 4889 + }, + { + "epoch": 1.6329938220070128, + "grad_norm": 0.30533979712995624, + "learning_rate": 5.098132031824362e-06, + "loss": 0.0239, + "step": 4890 + }, + { + "epoch": 1.6333277675738853, + "grad_norm": 0.2868800217312553, + "learning_rate": 5.096189065454029e-06, + "loss": 0.0241, + "step": 4891 + }, + { + "epoch": 1.633661713140758, + "grad_norm": 0.298824688834227, + "learning_rate": 5.094246084553206e-06, + "loss": 0.0386, + "step": 4892 + }, + { + "epoch": 1.6339956587076307, + "grad_norm": 0.2670158781325221, + "learning_rate": 5.092303089415403e-06, + "loss": 0.0191, + "step": 4893 + }, + { + "epoch": 1.6343296042745032, + "grad_norm": 0.26080028444541575, + "learning_rate": 5.09036008033413e-06, + "loss": 0.0245, + "step": 4894 + }, + { + "epoch": 1.6346635498413757, + "grad_norm": 0.35152356013618197, + "learning_rate": 5.0884170576029034e-06, + "loss": 0.0425, + "step": 4895 + }, + { + "epoch": 1.6349974954082485, + "grad_norm": 0.37809725267474514, + "learning_rate": 5.086474021515238e-06, + "loss": 0.0287, + "step": 4896 + }, + { + "epoch": 1.6353314409751212, + "grad_norm": 0.3471898600544406, + "learning_rate": 5.084530972364656e-06, + "loss": 0.0259, + "step": 4897 + }, + { + "epoch": 1.6356653865419937, + "grad_norm": 0.3368310914230557, + "learning_rate": 5.082587910444674e-06, + "loss": 0.0273, + "step": 4898 + }, + { + "epoch": 1.6359993321088662, + "grad_norm": 0.36674910929416554, + "learning_rate": 5.080644836048815e-06, + "loss": 0.0237, + "step": 4899 + }, + { + "epoch": 1.6363332776757389, + "grad_norm": 0.234722678580623, + "learning_rate": 5.0787017494706035e-06, + "loss": 0.019, + "step": 4900 + }, + { + "epoch": 1.6366672232426116, + "grad_norm": 0.32775241114624526, + "learning_rate": 5.076758651003567e-06, + "loss": 0.0229, + "step": 4901 + }, + { + "epoch": 1.637001168809484, + "grad_norm": 0.27395380220635784, + "learning_rate": 5.0748155409412325e-06, + "loss": 0.0247, + "step": 4902 + }, + { + "epoch": 1.6373351143763566, + "grad_norm": 0.26414085875841076, + "learning_rate": 5.0728724195771295e-06, + "loss": 0.0253, + "step": 4903 + }, + { + "epoch": 1.6376690599432293, + "grad_norm": 0.32636371422360083, + "learning_rate": 5.070929287204789e-06, + "loss": 0.0207, + "step": 4904 + }, + { + "epoch": 1.6380030055101018, + "grad_norm": 0.3133734134501154, + "learning_rate": 5.068986144117746e-06, + "loss": 0.0236, + "step": 4905 + }, + { + "epoch": 1.6383369510769743, + "grad_norm": 0.38445653705556826, + "learning_rate": 5.067042990609533e-06, + "loss": 0.0308, + "step": 4906 + }, + { + "epoch": 1.638670896643847, + "grad_norm": 0.34188288130237365, + "learning_rate": 5.065099826973685e-06, + "loss": 0.0313, + "step": 4907 + }, + { + "epoch": 1.6390048422107197, + "grad_norm": 0.2945760238756098, + "learning_rate": 5.0631566535037435e-06, + "loss": 0.025, + "step": 4908 + }, + { + "epoch": 1.6393387877775922, + "grad_norm": 0.250507187653982, + "learning_rate": 5.061213470493246e-06, + "loss": 0.0219, + "step": 4909 + }, + { + "epoch": 1.6396727333444647, + "grad_norm": 0.35430706689850594, + "learning_rate": 5.059270278235732e-06, + "loss": 0.0365, + "step": 4910 + }, + { + "epoch": 1.6400066789113374, + "grad_norm": 0.26003382451159485, + "learning_rate": 5.057327077024745e-06, + "loss": 0.023, + "step": 4911 + }, + { + "epoch": 1.6403406244782102, + "grad_norm": 0.32358799810408434, + "learning_rate": 5.055383867153829e-06, + "loss": 0.0288, + "step": 4912 + }, + { + "epoch": 1.6406745700450827, + "grad_norm": 0.24320296939869507, + "learning_rate": 5.053440648916526e-06, + "loss": 0.018, + "step": 4913 + }, + { + "epoch": 1.6410085156119552, + "grad_norm": 0.3309120854791246, + "learning_rate": 5.051497422606385e-06, + "loss": 0.0219, + "step": 4914 + }, + { + "epoch": 1.6413424611788279, + "grad_norm": 0.24222949820992562, + "learning_rate": 5.049554188516952e-06, + "loss": 0.022, + "step": 4915 + }, + { + "epoch": 1.6416764067457006, + "grad_norm": 0.28324159577532293, + "learning_rate": 5.047610946941775e-06, + "loss": 0.0252, + "step": 4916 + }, + { + "epoch": 1.642010352312573, + "grad_norm": 0.2467957660971369, + "learning_rate": 5.045667698174403e-06, + "loss": 0.0197, + "step": 4917 + }, + { + "epoch": 1.6423442978794456, + "grad_norm": 0.270345628312582, + "learning_rate": 5.043724442508388e-06, + "loss": 0.0235, + "step": 4918 + }, + { + "epoch": 1.6426782434463183, + "grad_norm": 0.3826462739790992, + "learning_rate": 5.0417811802372815e-06, + "loss": 0.0253, + "step": 4919 + }, + { + "epoch": 1.643012189013191, + "grad_norm": 0.2954307686434209, + "learning_rate": 5.039837911654637e-06, + "loss": 0.023, + "step": 4920 + }, + { + "epoch": 1.6433461345800633, + "grad_norm": 0.25784881747962524, + "learning_rate": 5.037894637054005e-06, + "loss": 0.0194, + "step": 4921 + }, + { + "epoch": 1.643680080146936, + "grad_norm": 0.3703488806793411, + "learning_rate": 5.035951356728942e-06, + "loss": 0.0263, + "step": 4922 + }, + { + "epoch": 1.6440140257138087, + "grad_norm": 0.23582795623156796, + "learning_rate": 5.034008070973004e-06, + "loss": 0.0207, + "step": 4923 + }, + { + "epoch": 1.6443479712806812, + "grad_norm": 0.4003376452782364, + "learning_rate": 5.032064780079746e-06, + "loss": 0.0342, + "step": 4924 + }, + { + "epoch": 1.6446819168475537, + "grad_norm": 0.2683086827761114, + "learning_rate": 5.030121484342725e-06, + "loss": 0.0159, + "step": 4925 + }, + { + "epoch": 1.6450158624144264, + "grad_norm": 0.2923408313808179, + "learning_rate": 5.0281781840555e-06, + "loss": 0.0165, + "step": 4926 + }, + { + "epoch": 1.6453498079812992, + "grad_norm": 0.2837884708755334, + "learning_rate": 5.026234879511629e-06, + "loss": 0.0255, + "step": 4927 + }, + { + "epoch": 1.6456837535481716, + "grad_norm": 0.3337187235406368, + "learning_rate": 5.024291571004668e-06, + "loss": 0.0299, + "step": 4928 + }, + { + "epoch": 1.6460176991150441, + "grad_norm": 0.2334689315079052, + "learning_rate": 5.022348258828181e-06, + "loss": 0.0188, + "step": 4929 + }, + { + "epoch": 1.6463516446819169, + "grad_norm": 0.35295419225541663, + "learning_rate": 5.020404943275727e-06, + "loss": 0.0246, + "step": 4930 + }, + { + "epoch": 1.6466855902487896, + "grad_norm": 0.46047678634995604, + "learning_rate": 5.018461624640864e-06, + "loss": 0.0273, + "step": 4931 + }, + { + "epoch": 1.647019535815662, + "grad_norm": 0.42576537149936977, + "learning_rate": 5.016518303217157e-06, + "loss": 0.03, + "step": 4932 + }, + { + "epoch": 1.6473534813825346, + "grad_norm": 0.31993065717037794, + "learning_rate": 5.014574979298166e-06, + "loss": 0.0266, + "step": 4933 + }, + { + "epoch": 1.6476874269494073, + "grad_norm": 0.43189933456424034, + "learning_rate": 5.012631653177451e-06, + "loss": 0.0346, + "step": 4934 + }, + { + "epoch": 1.64802137251628, + "grad_norm": 0.24598475765528363, + "learning_rate": 5.010688325148577e-06, + "loss": 0.019, + "step": 4935 + }, + { + "epoch": 1.6483553180831525, + "grad_norm": 0.2454250345953795, + "learning_rate": 5.008744995505107e-06, + "loss": 0.0231, + "step": 4936 + }, + { + "epoch": 1.648689263650025, + "grad_norm": 0.2958006817103818, + "learning_rate": 5.0068016645406e-06, + "loss": 0.0227, + "step": 4937 + }, + { + "epoch": 1.6490232092168977, + "grad_norm": 0.328464060756527, + "learning_rate": 5.0048583325486234e-06, + "loss": 0.029, + "step": 4938 + }, + { + "epoch": 1.6493571547837702, + "grad_norm": 0.2824788317369057, + "learning_rate": 5.002914999822737e-06, + "loss": 0.0262, + "step": 4939 + }, + { + "epoch": 1.6496911003506427, + "grad_norm": 0.3191445345770001, + "learning_rate": 5.000971666656508e-06, + "loss": 0.0189, + "step": 4940 + }, + { + "epoch": 1.6500250459175154, + "grad_norm": 0.3926108442312257, + "learning_rate": 4.999028333343494e-06, + "loss": 0.0322, + "step": 4941 + }, + { + "epoch": 1.6503589914843881, + "grad_norm": 0.26382261560676995, + "learning_rate": 4.9970850001772634e-06, + "loss": 0.0255, + "step": 4942 + }, + { + "epoch": 1.6506929370512606, + "grad_norm": 0.48378684788902976, + "learning_rate": 4.995141667451378e-06, + "loss": 0.038, + "step": 4943 + }, + { + "epoch": 1.6510268826181331, + "grad_norm": 0.25734112215571786, + "learning_rate": 4.993198335459401e-06, + "loss": 0.0204, + "step": 4944 + }, + { + "epoch": 1.6513608281850058, + "grad_norm": 0.427964087391538, + "learning_rate": 4.991255004494896e-06, + "loss": 0.03, + "step": 4945 + }, + { + "epoch": 1.6516947737518786, + "grad_norm": 0.3083034757934975, + "learning_rate": 4.989311674851424e-06, + "loss": 0.025, + "step": 4946 + }, + { + "epoch": 1.652028719318751, + "grad_norm": 0.2904322188828532, + "learning_rate": 4.9873683468225495e-06, + "loss": 0.0349, + "step": 4947 + }, + { + "epoch": 1.6523626648856236, + "grad_norm": 0.25018309190884663, + "learning_rate": 4.985425020701836e-06, + "loss": 0.0232, + "step": 4948 + }, + { + "epoch": 1.6526966104524963, + "grad_norm": 0.2443123792017699, + "learning_rate": 4.983481696782844e-06, + "loss": 0.0228, + "step": 4949 + }, + { + "epoch": 1.653030556019369, + "grad_norm": 0.32784355546403116, + "learning_rate": 4.9815383753591365e-06, + "loss": 0.0224, + "step": 4950 + }, + { + "epoch": 1.6533645015862415, + "grad_norm": 0.3556476588955834, + "learning_rate": 4.9795950567242754e-06, + "loss": 0.0338, + "step": 4951 + }, + { + "epoch": 1.653698447153114, + "grad_norm": 0.21734563076902141, + "learning_rate": 4.9776517411718214e-06, + "loss": 0.0156, + "step": 4952 + }, + { + "epoch": 1.6540323927199867, + "grad_norm": 0.5343447426928528, + "learning_rate": 4.9757084289953325e-06, + "loss": 0.033, + "step": 4953 + }, + { + "epoch": 1.6543663382868592, + "grad_norm": 0.2637916442901931, + "learning_rate": 4.973765120488373e-06, + "loss": 0.0206, + "step": 4954 + }, + { + "epoch": 1.6547002838537317, + "grad_norm": 0.4122999440215748, + "learning_rate": 4.9718218159445015e-06, + "loss": 0.0349, + "step": 4955 + }, + { + "epoch": 1.6550342294206044, + "grad_norm": 0.631519622363533, + "learning_rate": 4.969878515657276e-06, + "loss": 0.0309, + "step": 4956 + }, + { + "epoch": 1.6553681749874771, + "grad_norm": 0.2901519199422888, + "learning_rate": 4.967935219920257e-06, + "loss": 0.0217, + "step": 4957 + }, + { + "epoch": 1.6557021205543496, + "grad_norm": 0.21665088728859036, + "learning_rate": 4.9659919290269986e-06, + "loss": 0.0194, + "step": 4958 + }, + { + "epoch": 1.6560360661212221, + "grad_norm": 0.2702260715546599, + "learning_rate": 4.964048643271058e-06, + "loss": 0.0272, + "step": 4959 + }, + { + "epoch": 1.6563700116880948, + "grad_norm": 0.34327017188350023, + "learning_rate": 4.962105362945996e-06, + "loss": 0.0318, + "step": 4960 + }, + { + "epoch": 1.6567039572549676, + "grad_norm": 0.3175698342141884, + "learning_rate": 4.960162088345365e-06, + "loss": 0.0207, + "step": 4961 + }, + { + "epoch": 1.65703790282184, + "grad_norm": 0.2674284079862701, + "learning_rate": 4.958218819762719e-06, + "loss": 0.0197, + "step": 4962 + }, + { + "epoch": 1.6573718483887125, + "grad_norm": 0.21629682957993326, + "learning_rate": 4.9562755574916125e-06, + "loss": 0.0174, + "step": 4963 + }, + { + "epoch": 1.6577057939555853, + "grad_norm": 0.28107690820445647, + "learning_rate": 4.954332301825597e-06, + "loss": 0.026, + "step": 4964 + }, + { + "epoch": 1.658039739522458, + "grad_norm": 0.34105964033776986, + "learning_rate": 4.952389053058226e-06, + "loss": 0.0251, + "step": 4965 + }, + { + "epoch": 1.6583736850893305, + "grad_norm": 0.2805619123027518, + "learning_rate": 4.95044581148305e-06, + "loss": 0.0165, + "step": 4966 + }, + { + "epoch": 1.658707630656203, + "grad_norm": 0.3159403043348787, + "learning_rate": 4.948502577393617e-06, + "loss": 0.0295, + "step": 4967 + }, + { + "epoch": 1.6590415762230757, + "grad_norm": 0.29156347754202194, + "learning_rate": 4.946559351083475e-06, + "loss": 0.0247, + "step": 4968 + }, + { + "epoch": 1.6593755217899484, + "grad_norm": 0.3093528722391559, + "learning_rate": 4.944616132846174e-06, + "loss": 0.025, + "step": 4969 + }, + { + "epoch": 1.6597094673568207, + "grad_norm": 0.29106247154535825, + "learning_rate": 4.942672922975255e-06, + "loss": 0.0303, + "step": 4970 + }, + { + "epoch": 1.6600434129236934, + "grad_norm": 0.29749313478295963, + "learning_rate": 4.940729721764268e-06, + "loss": 0.0268, + "step": 4971 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 0.29895015574619965, + "learning_rate": 4.938786529506755e-06, + "loss": 0.0286, + "step": 4972 + }, + { + "epoch": 1.6607113040574386, + "grad_norm": 0.3307527429050344, + "learning_rate": 4.936843346496257e-06, + "loss": 0.0334, + "step": 4973 + }, + { + "epoch": 1.661045249624311, + "grad_norm": 0.3007469570717518, + "learning_rate": 4.934900173026316e-06, + "loss": 0.024, + "step": 4974 + }, + { + "epoch": 1.6613791951911838, + "grad_norm": 0.3809458735159086, + "learning_rate": 4.93295700939047e-06, + "loss": 0.0431, + "step": 4975 + }, + { + "epoch": 1.6617131407580565, + "grad_norm": 0.37616593115326924, + "learning_rate": 4.931013855882255e-06, + "loss": 0.0276, + "step": 4976 + }, + { + "epoch": 1.662047086324929, + "grad_norm": 0.33783893099792933, + "learning_rate": 4.929070712795211e-06, + "loss": 0.035, + "step": 4977 + }, + { + "epoch": 1.6623810318918015, + "grad_norm": 0.27044433571857607, + "learning_rate": 4.927127580422871e-06, + "loss": 0.0205, + "step": 4978 + }, + { + "epoch": 1.6627149774586742, + "grad_norm": 0.33689189069593234, + "learning_rate": 4.925184459058769e-06, + "loss": 0.0206, + "step": 4979 + }, + { + "epoch": 1.663048923025547, + "grad_norm": 0.32713079428308717, + "learning_rate": 4.9232413489964345e-06, + "loss": 0.0361, + "step": 4980 + }, + { + "epoch": 1.6633828685924195, + "grad_norm": 0.3093826978462791, + "learning_rate": 4.921298250529398e-06, + "loss": 0.0281, + "step": 4981 + }, + { + "epoch": 1.663716814159292, + "grad_norm": 0.2130044144120047, + "learning_rate": 4.919355163951186e-06, + "loss": 0.02, + "step": 4982 + }, + { + "epoch": 1.6640507597261647, + "grad_norm": 0.27556262868873405, + "learning_rate": 4.917412089555328e-06, + "loss": 0.0252, + "step": 4983 + }, + { + "epoch": 1.6643847052930374, + "grad_norm": 0.2516454214029749, + "learning_rate": 4.915469027635345e-06, + "loss": 0.0227, + "step": 4984 + }, + { + "epoch": 1.6647186508599099, + "grad_norm": 0.2651718544270958, + "learning_rate": 4.9135259784847625e-06, + "loss": 0.0143, + "step": 4985 + }, + { + "epoch": 1.6650525964267824, + "grad_norm": 0.3111543496451446, + "learning_rate": 4.911582942397098e-06, + "loss": 0.0267, + "step": 4986 + }, + { + "epoch": 1.665386541993655, + "grad_norm": 0.3924904379069039, + "learning_rate": 4.909639919665872e-06, + "loss": 0.029, + "step": 4987 + }, + { + "epoch": 1.6657204875605276, + "grad_norm": 0.32837484331891814, + "learning_rate": 4.907696910584599e-06, + "loss": 0.03, + "step": 4988 + }, + { + "epoch": 1.6660544331274, + "grad_norm": 0.22603938931000145, + "learning_rate": 4.905753915446795e-06, + "loss": 0.0176, + "step": 4989 + }, + { + "epoch": 1.6663883786942728, + "grad_norm": 0.3157935549046105, + "learning_rate": 4.903810934545972e-06, + "loss": 0.0266, + "step": 4990 + }, + { + "epoch": 1.6667223242611455, + "grad_norm": 0.2541625230640935, + "learning_rate": 4.90186796817564e-06, + "loss": 0.0247, + "step": 4991 + }, + { + "epoch": 1.667056269828018, + "grad_norm": 0.19296385447867082, + "learning_rate": 4.899925016629307e-06, + "loss": 0.0165, + "step": 4992 + }, + { + "epoch": 1.6673902153948905, + "grad_norm": 0.3805727338041553, + "learning_rate": 4.897982080200477e-06, + "loss": 0.0392, + "step": 4993 + }, + { + "epoch": 1.6677241609617632, + "grad_norm": 0.26836873484687457, + "learning_rate": 4.896039159182655e-06, + "loss": 0.022, + "step": 4994 + }, + { + "epoch": 1.668058106528636, + "grad_norm": 0.26205931655509546, + "learning_rate": 4.894096253869343e-06, + "loss": 0.0217, + "step": 4995 + }, + { + "epoch": 1.6683920520955084, + "grad_norm": 0.34799301349475786, + "learning_rate": 4.89215336455404e-06, + "loss": 0.0252, + "step": 4996 + }, + { + "epoch": 1.668725997662381, + "grad_norm": 0.25375672055132636, + "learning_rate": 4.89021049153024e-06, + "loss": 0.019, + "step": 4997 + }, + { + "epoch": 1.6690599432292537, + "grad_norm": 0.1707851497636513, + "learning_rate": 4.888267635091439e-06, + "loss": 0.0118, + "step": 4998 + }, + { + "epoch": 1.6693938887961264, + "grad_norm": 0.35173002736254716, + "learning_rate": 4.886324795531126e-06, + "loss": 0.0272, + "step": 4999 + }, + { + "epoch": 1.6697278343629989, + "grad_norm": 0.23934163516206688, + "learning_rate": 4.88438197314279e-06, + "loss": 0.0176, + "step": 5000 + }, + { + "epoch": 1.6700617799298714, + "grad_norm": 0.2814792448653315, + "learning_rate": 4.88243916821992e-06, + "loss": 0.0284, + "step": 5001 + }, + { + "epoch": 1.670395725496744, + "grad_norm": 0.2720343620929229, + "learning_rate": 4.880496381055998e-06, + "loss": 0.0244, + "step": 5002 + }, + { + "epoch": 1.6707296710636166, + "grad_norm": 0.32709680479017667, + "learning_rate": 4.878553611944505e-06, + "loss": 0.0237, + "step": 5003 + }, + { + "epoch": 1.671063616630489, + "grad_norm": 0.2350543918677866, + "learning_rate": 4.876610861178918e-06, + "loss": 0.0207, + "step": 5004 + }, + { + "epoch": 1.6713975621973618, + "grad_norm": 0.26545230223903515, + "learning_rate": 4.874668129052712e-06, + "loss": 0.0211, + "step": 5005 + }, + { + "epoch": 1.6717315077642345, + "grad_norm": 0.282447405556621, + "learning_rate": 4.872725415859363e-06, + "loss": 0.0212, + "step": 5006 + }, + { + "epoch": 1.672065453331107, + "grad_norm": 0.24449657408298028, + "learning_rate": 4.8707827218923385e-06, + "loss": 0.0297, + "step": 5007 + }, + { + "epoch": 1.6723993988979795, + "grad_norm": 0.4229927007115372, + "learning_rate": 4.868840047445106e-06, + "loss": 0.0406, + "step": 5008 + }, + { + "epoch": 1.6727333444648522, + "grad_norm": 0.255050706127349, + "learning_rate": 4.866897392811127e-06, + "loss": 0.0208, + "step": 5009 + }, + { + "epoch": 1.673067290031725, + "grad_norm": 0.3434314227523628, + "learning_rate": 4.864954758283865e-06, + "loss": 0.0377, + "step": 5010 + }, + { + "epoch": 1.6734012355985974, + "grad_norm": 0.24896547532933488, + "learning_rate": 4.8630121441567755e-06, + "loss": 0.0289, + "step": 5011 + }, + { + "epoch": 1.67373518116547, + "grad_norm": 0.19163970837587832, + "learning_rate": 4.861069550723316e-06, + "loss": 0.0204, + "step": 5012 + }, + { + "epoch": 1.6740691267323426, + "grad_norm": 0.293813592423613, + "learning_rate": 4.859126978276937e-06, + "loss": 0.0303, + "step": 5013 + }, + { + "epoch": 1.6744030722992154, + "grad_norm": 0.30395564935825325, + "learning_rate": 4.857184427111086e-06, + "loss": 0.0302, + "step": 5014 + }, + { + "epoch": 1.6747370178660879, + "grad_norm": 0.2145584342304078, + "learning_rate": 4.855241897519209e-06, + "loss": 0.0263, + "step": 5015 + }, + { + "epoch": 1.6750709634329604, + "grad_norm": 0.3326339898574843, + "learning_rate": 4.8532993897947464e-06, + "loss": 0.0272, + "step": 5016 + }, + { + "epoch": 1.675404908999833, + "grad_norm": 0.2685224873280316, + "learning_rate": 4.851356904231139e-06, + "loss": 0.0226, + "step": 5017 + }, + { + "epoch": 1.6757388545667058, + "grad_norm": 0.2736298493314978, + "learning_rate": 4.849414441121823e-06, + "loss": 0.0287, + "step": 5018 + }, + { + "epoch": 1.676072800133578, + "grad_norm": 0.29949992752936794, + "learning_rate": 4.847472000760228e-06, + "loss": 0.0203, + "step": 5019 + }, + { + "epoch": 1.6764067457004508, + "grad_norm": 0.4163779178955796, + "learning_rate": 4.845529583439783e-06, + "loss": 0.0331, + "step": 5020 + }, + { + "epoch": 1.6767406912673235, + "grad_norm": 0.28545506524486947, + "learning_rate": 4.843587189453914e-06, + "loss": 0.0274, + "step": 5021 + }, + { + "epoch": 1.677074636834196, + "grad_norm": 0.3181720812252546, + "learning_rate": 4.84164481909604e-06, + "loss": 0.0261, + "step": 5022 + }, + { + "epoch": 1.6774085824010685, + "grad_norm": 0.3070593709975989, + "learning_rate": 4.839702472659581e-06, + "loss": 0.0279, + "step": 5023 + }, + { + "epoch": 1.6777425279679412, + "grad_norm": 0.2729106425334527, + "learning_rate": 4.837760150437952e-06, + "loss": 0.0206, + "step": 5024 + }, + { + "epoch": 1.678076473534814, + "grad_norm": 0.29145481234092924, + "learning_rate": 4.8358178527245625e-06, + "loss": 0.0248, + "step": 5025 + }, + { + "epoch": 1.6784104191016864, + "grad_norm": 0.3115074242308137, + "learning_rate": 4.83387557981282e-06, + "loss": 0.0226, + "step": 5026 + }, + { + "epoch": 1.678744364668559, + "grad_norm": 0.2944538379749023, + "learning_rate": 4.831933331996126e-06, + "loss": 0.0192, + "step": 5027 + }, + { + "epoch": 1.6790783102354316, + "grad_norm": 0.307979192560287, + "learning_rate": 4.8299911095678816e-06, + "loss": 0.0283, + "step": 5028 + }, + { + "epoch": 1.6794122558023044, + "grad_norm": 0.2662280595537268, + "learning_rate": 4.82804891282148e-06, + "loss": 0.0203, + "step": 5029 + }, + { + "epoch": 1.6797462013691769, + "grad_norm": 0.222976795987867, + "learning_rate": 4.8261067420503175e-06, + "loss": 0.0175, + "step": 5030 + }, + { + "epoch": 1.6800801469360493, + "grad_norm": 0.23873787386123677, + "learning_rate": 4.8241645975477785e-06, + "loss": 0.0188, + "step": 5031 + }, + { + "epoch": 1.680414092502922, + "grad_norm": 0.2925453023614851, + "learning_rate": 4.822222479607247e-06, + "loss": 0.0211, + "step": 5032 + }, + { + "epoch": 1.6807480380697948, + "grad_norm": 0.35771726874298143, + "learning_rate": 4.820280388522104e-06, + "loss": 0.0308, + "step": 5033 + }, + { + "epoch": 1.6810819836366673, + "grad_norm": 0.2684815292118438, + "learning_rate": 4.818338324585725e-06, + "loss": 0.0186, + "step": 5034 + }, + { + "epoch": 1.6814159292035398, + "grad_norm": 0.2983882412900993, + "learning_rate": 4.816396288091478e-06, + "loss": 0.0254, + "step": 5035 + }, + { + "epoch": 1.6817498747704125, + "grad_norm": 0.27390334861865234, + "learning_rate": 4.814454279332737e-06, + "loss": 0.0199, + "step": 5036 + }, + { + "epoch": 1.682083820337285, + "grad_norm": 0.41113374231178185, + "learning_rate": 4.81251229860286e-06, + "loss": 0.0191, + "step": 5037 + }, + { + "epoch": 1.6824177659041575, + "grad_norm": 0.6393521000517295, + "learning_rate": 4.810570346195207e-06, + "loss": 0.0328, + "step": 5038 + }, + { + "epoch": 1.6827517114710302, + "grad_norm": 0.30404000290546124, + "learning_rate": 4.808628422403135e-06, + "loss": 0.022, + "step": 5039 + }, + { + "epoch": 1.683085657037903, + "grad_norm": 0.2962533988056783, + "learning_rate": 4.806686527519994e-06, + "loss": 0.0233, + "step": 5040 + }, + { + "epoch": 1.6834196026047754, + "grad_norm": 0.2950371437436768, + "learning_rate": 4.804744661839128e-06, + "loss": 0.0303, + "step": 5041 + }, + { + "epoch": 1.683753548171648, + "grad_norm": 0.2075556348178394, + "learning_rate": 4.80280282565388e-06, + "loss": 0.0168, + "step": 5042 + }, + { + "epoch": 1.6840874937385206, + "grad_norm": 0.2673260805204257, + "learning_rate": 4.800861019257587e-06, + "loss": 0.0225, + "step": 5043 + }, + { + "epoch": 1.6844214393053933, + "grad_norm": 0.34873262895580137, + "learning_rate": 4.798919242943583e-06, + "loss": 0.029, + "step": 5044 + }, + { + "epoch": 1.6847553848722658, + "grad_norm": 0.307026568834567, + "learning_rate": 4.796977497005194e-06, + "loss": 0.0253, + "step": 5045 + }, + { + "epoch": 1.6850893304391383, + "grad_norm": 0.5801891199429996, + "learning_rate": 4.795035781735743e-06, + "loss": 0.0203, + "step": 5046 + }, + { + "epoch": 1.685423276006011, + "grad_norm": 0.3324355995162621, + "learning_rate": 4.793094097428552e-06, + "loss": 0.0297, + "step": 5047 + }, + { + "epoch": 1.6857572215728838, + "grad_norm": 0.23355545241862927, + "learning_rate": 4.7911524443769346e-06, + "loss": 0.0209, + "step": 5048 + }, + { + "epoch": 1.6860911671397563, + "grad_norm": 0.30561981730612925, + "learning_rate": 4.789210822874199e-06, + "loss": 0.0178, + "step": 5049 + }, + { + "epoch": 1.6864251127066288, + "grad_norm": 0.27650548852788287, + "learning_rate": 4.787269233213651e-06, + "loss": 0.0234, + "step": 5050 + }, + { + "epoch": 1.6867590582735015, + "grad_norm": 0.21521902160155762, + "learning_rate": 4.785327675688591e-06, + "loss": 0.0214, + "step": 5051 + }, + { + "epoch": 1.687093003840374, + "grad_norm": 0.24266712803809304, + "learning_rate": 4.7833861505923096e-06, + "loss": 0.021, + "step": 5052 + }, + { + "epoch": 1.6874269494072465, + "grad_norm": 0.31718648799906535, + "learning_rate": 4.781444658218103e-06, + "loss": 0.0345, + "step": 5053 + }, + { + "epoch": 1.6877608949741192, + "grad_norm": 0.5583495864488831, + "learning_rate": 4.779503198859255e-06, + "loss": 0.0264, + "step": 5054 + }, + { + "epoch": 1.688094840540992, + "grad_norm": 0.3001913541199259, + "learning_rate": 4.777561772809045e-06, + "loss": 0.0225, + "step": 5055 + }, + { + "epoch": 1.6884287861078644, + "grad_norm": 0.3979758757656625, + "learning_rate": 4.775620380360747e-06, + "loss": 0.0254, + "step": 5056 + }, + { + "epoch": 1.688762731674737, + "grad_norm": 0.8123384527348672, + "learning_rate": 4.773679021807634e-06, + "loss": 0.033, + "step": 5057 + }, + { + "epoch": 1.6890966772416096, + "grad_norm": 0.23010428257933566, + "learning_rate": 4.771737697442968e-06, + "loss": 0.0181, + "step": 5058 + }, + { + "epoch": 1.6894306228084823, + "grad_norm": 0.28248420722514384, + "learning_rate": 4.7697964075600114e-06, + "loss": 0.0257, + "step": 5059 + }, + { + "epoch": 1.6897645683753548, + "grad_norm": 0.33196036112840116, + "learning_rate": 4.767855152452019e-06, + "loss": 0.0237, + "step": 5060 + }, + { + "epoch": 1.6900985139422273, + "grad_norm": 0.40334280586752824, + "learning_rate": 4.765913932412237e-06, + "loss": 0.0416, + "step": 5061 + }, + { + "epoch": 1.6904324595091, + "grad_norm": 0.2651581033201635, + "learning_rate": 4.763972747733913e-06, + "loss": 0.027, + "step": 5062 + }, + { + "epoch": 1.6907664050759728, + "grad_norm": 0.3741347101468278, + "learning_rate": 4.762031598710285e-06, + "loss": 0.0235, + "step": 5063 + }, + { + "epoch": 1.6911003506428453, + "grad_norm": 0.3128418321858523, + "learning_rate": 4.760090485634584e-06, + "loss": 0.0294, + "step": 5064 + }, + { + "epoch": 1.6914342962097177, + "grad_norm": 0.2593181856558939, + "learning_rate": 4.758149408800042e-06, + "loss": 0.0244, + "step": 5065 + }, + { + "epoch": 1.6917682417765905, + "grad_norm": 0.25238998632163034, + "learning_rate": 4.756208368499879e-06, + "loss": 0.0227, + "step": 5066 + }, + { + "epoch": 1.692102187343463, + "grad_norm": 0.24034790988450105, + "learning_rate": 4.754267365027314e-06, + "loss": 0.0195, + "step": 5067 + }, + { + "epoch": 1.6924361329103355, + "grad_norm": 0.30594466697486583, + "learning_rate": 4.752326398675555e-06, + "loss": 0.023, + "step": 5068 + }, + { + "epoch": 1.6927700784772082, + "grad_norm": 0.3484229596217318, + "learning_rate": 4.750385469737811e-06, + "loss": 0.0269, + "step": 5069 + }, + { + "epoch": 1.693104024044081, + "grad_norm": 0.2868801325419484, + "learning_rate": 4.748444578507278e-06, + "loss": 0.0256, + "step": 5070 + }, + { + "epoch": 1.6934379696109534, + "grad_norm": 0.35948168405650954, + "learning_rate": 4.746503725277156e-06, + "loss": 0.0276, + "step": 5071 + }, + { + "epoch": 1.6937719151778259, + "grad_norm": 0.405572432672421, + "learning_rate": 4.744562910340631e-06, + "loss": 0.035, + "step": 5072 + }, + { + "epoch": 1.6941058607446986, + "grad_norm": 0.2597793140715524, + "learning_rate": 4.742622133990885e-06, + "loss": 0.024, + "step": 5073 + }, + { + "epoch": 1.6944398063115713, + "grad_norm": 0.34892402665041466, + "learning_rate": 4.740681396521097e-06, + "loss": 0.033, + "step": 5074 + }, + { + "epoch": 1.6947737518784438, + "grad_norm": 0.4626790306972768, + "learning_rate": 4.738740698224438e-06, + "loss": 0.0336, + "step": 5075 + }, + { + "epoch": 1.6951076974453163, + "grad_norm": 0.26948137870452266, + "learning_rate": 4.73680003939407e-06, + "loss": 0.0183, + "step": 5076 + }, + { + "epoch": 1.695441643012189, + "grad_norm": 0.3210469611402509, + "learning_rate": 4.734859420323158e-06, + "loss": 0.0298, + "step": 5077 + }, + { + "epoch": 1.6957755885790617, + "grad_norm": 0.36254289631206843, + "learning_rate": 4.7329188413048515e-06, + "loss": 0.0235, + "step": 5078 + }, + { + "epoch": 1.6961095341459342, + "grad_norm": 0.2519877993211459, + "learning_rate": 4.7309783026322995e-06, + "loss": 0.0179, + "step": 5079 + }, + { + "epoch": 1.6964434797128067, + "grad_norm": 0.25224254971516313, + "learning_rate": 4.7290378045986425e-06, + "loss": 0.0264, + "step": 5080 + }, + { + "epoch": 1.6967774252796795, + "grad_norm": 0.33297507889500944, + "learning_rate": 4.727097347497014e-06, + "loss": 0.0309, + "step": 5081 + }, + { + "epoch": 1.6971113708465522, + "grad_norm": 0.34603534888592014, + "learning_rate": 4.7251569316205455e-06, + "loss": 0.0267, + "step": 5082 + }, + { + "epoch": 1.6974453164134247, + "grad_norm": 0.3009500608729052, + "learning_rate": 4.723216557262359e-06, + "loss": 0.0223, + "step": 5083 + }, + { + "epoch": 1.6977792619802972, + "grad_norm": 0.40241970379084385, + "learning_rate": 4.721276224715569e-06, + "loss": 0.028, + "step": 5084 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.2164773872430984, + "learning_rate": 4.719335934273289e-06, + "loss": 0.0147, + "step": 5085 + }, + { + "epoch": 1.6984471531140424, + "grad_norm": 0.34221091426098466, + "learning_rate": 4.717395686228621e-06, + "loss": 0.0233, + "step": 5086 + }, + { + "epoch": 1.6987810986809149, + "grad_norm": 0.24905328977217717, + "learning_rate": 4.715455480874661e-06, + "loss": 0.0239, + "step": 5087 + }, + { + "epoch": 1.6991150442477876, + "grad_norm": 0.31499459708637956, + "learning_rate": 4.713515318504501e-06, + "loss": 0.0228, + "step": 5088 + }, + { + "epoch": 1.6994489898146603, + "grad_norm": 0.28305044429283904, + "learning_rate": 4.711575199411226e-06, + "loss": 0.0242, + "step": 5089 + }, + { + "epoch": 1.6997829353815328, + "grad_norm": 0.3738602532204356, + "learning_rate": 4.7096351238879135e-06, + "loss": 0.033, + "step": 5090 + }, + { + "epoch": 1.7001168809484053, + "grad_norm": 0.640430751504491, + "learning_rate": 4.707695092227634e-06, + "loss": 0.0311, + "step": 5091 + }, + { + "epoch": 1.700450826515278, + "grad_norm": 0.30386975850806625, + "learning_rate": 4.705755104723453e-06, + "loss": 0.0302, + "step": 5092 + }, + { + "epoch": 1.7007847720821507, + "grad_norm": 0.7792120964224993, + "learning_rate": 4.703815161668426e-06, + "loss": 0.0291, + "step": 5093 + }, + { + "epoch": 1.7011187176490232, + "grad_norm": 0.28215177134505537, + "learning_rate": 4.701875263355608e-06, + "loss": 0.0274, + "step": 5094 + }, + { + "epoch": 1.7014526632158957, + "grad_norm": 0.37768556717253043, + "learning_rate": 4.699935410078042e-06, + "loss": 0.0167, + "step": 5095 + }, + { + "epoch": 1.7017866087827684, + "grad_norm": 0.3064811018277969, + "learning_rate": 4.697995602128766e-06, + "loss": 0.0253, + "step": 5096 + }, + { + "epoch": 1.7021205543496412, + "grad_norm": 0.28903167050073053, + "learning_rate": 4.696055839800809e-06, + "loss": 0.0282, + "step": 5097 + }, + { + "epoch": 1.7024544999165137, + "grad_norm": 0.34452536171124826, + "learning_rate": 4.694116123387197e-06, + "loss": 0.0339, + "step": 5098 + }, + { + "epoch": 1.7027884454833861, + "grad_norm": 0.31291977199400417, + "learning_rate": 4.692176453180944e-06, + "loss": 0.0232, + "step": 5099 + }, + { + "epoch": 1.7031223910502589, + "grad_norm": 0.2163732660843581, + "learning_rate": 4.6902368294750644e-06, + "loss": 0.0163, + "step": 5100 + }, + { + "epoch": 1.7034563366171314, + "grad_norm": 0.33786859647608486, + "learning_rate": 4.688297252562559e-06, + "loss": 0.0307, + "step": 5101 + }, + { + "epoch": 1.7037902821840039, + "grad_norm": 0.27604472899960997, + "learning_rate": 4.6863577227364235e-06, + "loss": 0.028, + "step": 5102 + }, + { + "epoch": 1.7041242277508766, + "grad_norm": 0.2840436666898987, + "learning_rate": 4.684418240289648e-06, + "loss": 0.0255, + "step": 5103 + }, + { + "epoch": 1.7044581733177493, + "grad_norm": 0.30621435120215135, + "learning_rate": 4.682478805515212e-06, + "loss": 0.0321, + "step": 5104 + }, + { + "epoch": 1.7047921188846218, + "grad_norm": 0.4193594079022245, + "learning_rate": 4.680539418706091e-06, + "loss": 0.0352, + "step": 5105 + }, + { + "epoch": 1.7051260644514943, + "grad_norm": 0.342921331965035, + "learning_rate": 4.678600080155252e-06, + "loss": 0.0201, + "step": 5106 + }, + { + "epoch": 1.705460010018367, + "grad_norm": 0.23207198540603993, + "learning_rate": 4.676660790155656e-06, + "loss": 0.0189, + "step": 5107 + }, + { + "epoch": 1.7057939555852397, + "grad_norm": 0.26082751138107063, + "learning_rate": 4.674721549000255e-06, + "loss": 0.0215, + "step": 5108 + }, + { + "epoch": 1.7061279011521122, + "grad_norm": 0.3967764398350967, + "learning_rate": 4.6727823569819944e-06, + "loss": 0.0211, + "step": 5109 + }, + { + "epoch": 1.7064618467189847, + "grad_norm": 0.2165363032956635, + "learning_rate": 4.670843214393811e-06, + "loss": 0.0216, + "step": 5110 + }, + { + "epoch": 1.7067957922858574, + "grad_norm": 0.2974595631503717, + "learning_rate": 4.6689041215286344e-06, + "loss": 0.0247, + "step": 5111 + }, + { + "epoch": 1.7071297378527301, + "grad_norm": 0.31041041022123594, + "learning_rate": 4.666965078679391e-06, + "loss": 0.0233, + "step": 5112 + }, + { + "epoch": 1.7074636834196026, + "grad_norm": 0.23339664932116805, + "learning_rate": 4.665026086138993e-06, + "loss": 0.0225, + "step": 5113 + }, + { + "epoch": 1.7077976289864751, + "grad_norm": 0.2414185889361508, + "learning_rate": 4.66308714420035e-06, + "loss": 0.0191, + "step": 5114 + }, + { + "epoch": 1.7081315745533479, + "grad_norm": 0.2714979313342858, + "learning_rate": 4.6611482531563595e-06, + "loss": 0.0157, + "step": 5115 + }, + { + "epoch": 1.7084655201202203, + "grad_norm": 0.3195693561574328, + "learning_rate": 4.659209413299916e-06, + "loss": 0.0328, + "step": 5116 + }, + { + "epoch": 1.7087994656870928, + "grad_norm": 0.31177788487252767, + "learning_rate": 4.657270624923901e-06, + "loss": 0.0197, + "step": 5117 + }, + { + "epoch": 1.7091334112539656, + "grad_norm": 0.43420911966539794, + "learning_rate": 4.6553318883211955e-06, + "loss": 0.032, + "step": 5118 + }, + { + "epoch": 1.7094673568208383, + "grad_norm": 0.24534622116361365, + "learning_rate": 4.653393203784667e-06, + "loss": 0.0231, + "step": 5119 + }, + { + "epoch": 1.7098013023877108, + "grad_norm": 0.21379368492645143, + "learning_rate": 4.651454571607176e-06, + "loss": 0.0146, + "step": 5120 + }, + { + "epoch": 1.7101352479545833, + "grad_norm": 0.37667456250455755, + "learning_rate": 4.649515992081576e-06, + "loss": 0.0216, + "step": 5121 + }, + { + "epoch": 1.710469193521456, + "grad_norm": 0.26418397352790207, + "learning_rate": 4.64757746550071e-06, + "loss": 0.0243, + "step": 5122 + }, + { + "epoch": 1.7108031390883287, + "grad_norm": 0.28943731105183423, + "learning_rate": 4.645638992157419e-06, + "loss": 0.0267, + "step": 5123 + }, + { + "epoch": 1.7111370846552012, + "grad_norm": 0.2594891781153189, + "learning_rate": 4.6437005723445316e-06, + "loss": 0.02, + "step": 5124 + }, + { + "epoch": 1.7114710302220737, + "grad_norm": 0.31454282807529704, + "learning_rate": 4.6417622063548675e-06, + "loss": 0.0281, + "step": 5125 + }, + { + "epoch": 1.7118049757889464, + "grad_norm": 0.656527704033975, + "learning_rate": 4.6398238944812414e-06, + "loss": 0.0293, + "step": 5126 + }, + { + "epoch": 1.7121389213558191, + "grad_norm": 0.2766119451674607, + "learning_rate": 4.637885637016456e-06, + "loss": 0.03, + "step": 5127 + }, + { + "epoch": 1.7124728669226916, + "grad_norm": 0.22158608692699336, + "learning_rate": 4.635947434253308e-06, + "loss": 0.0186, + "step": 5128 + }, + { + "epoch": 1.7128068124895641, + "grad_norm": 0.24689145353828215, + "learning_rate": 4.634009286484586e-06, + "loss": 0.0164, + "step": 5129 + }, + { + "epoch": 1.7131407580564368, + "grad_norm": 0.25846956500234836, + "learning_rate": 4.632071194003073e-06, + "loss": 0.0219, + "step": 5130 + }, + { + "epoch": 1.7134747036233096, + "grad_norm": 0.3741788662744667, + "learning_rate": 4.630133157101537e-06, + "loss": 0.0375, + "step": 5131 + }, + { + "epoch": 1.713808649190182, + "grad_norm": 0.28587885711693023, + "learning_rate": 4.6281951760727435e-06, + "loss": 0.0261, + "step": 5132 + }, + { + "epoch": 1.7141425947570545, + "grad_norm": 0.22445231096509569, + "learning_rate": 4.626257251209446e-06, + "loss": 0.0198, + "step": 5133 + }, + { + "epoch": 1.7144765403239273, + "grad_norm": 0.5551044535713995, + "learning_rate": 4.624319382804391e-06, + "loss": 0.0383, + "step": 5134 + }, + { + "epoch": 1.7148104858907998, + "grad_norm": 0.2488565314772827, + "learning_rate": 4.622381571150317e-06, + "loss": 0.0226, + "step": 5135 + }, + { + "epoch": 1.7151444314576723, + "grad_norm": 0.41971143384081905, + "learning_rate": 4.620443816539954e-06, + "loss": 0.0194, + "step": 5136 + }, + { + "epoch": 1.715478377024545, + "grad_norm": 0.23765851993618284, + "learning_rate": 4.618506119266021e-06, + "loss": 0.0195, + "step": 5137 + }, + { + "epoch": 1.7158123225914177, + "grad_norm": 0.28219873903593257, + "learning_rate": 4.6165684796212306e-06, + "loss": 0.0218, + "step": 5138 + }, + { + "epoch": 1.7161462681582902, + "grad_norm": 0.3627060960703866, + "learning_rate": 4.6146308978982865e-06, + "loss": 0.0201, + "step": 5139 + }, + { + "epoch": 1.7164802137251627, + "grad_norm": 0.24050130170589512, + "learning_rate": 4.612693374389881e-06, + "loss": 0.0207, + "step": 5140 + }, + { + "epoch": 1.7168141592920354, + "grad_norm": 0.291678696132503, + "learning_rate": 4.610755909388703e-06, + "loss": 0.0262, + "step": 5141 + }, + { + "epoch": 1.7171481048589081, + "grad_norm": 0.33128741432440445, + "learning_rate": 4.608818503187428e-06, + "loss": 0.0231, + "step": 5142 + }, + { + "epoch": 1.7174820504257806, + "grad_norm": 0.22460146277547102, + "learning_rate": 4.606881156078725e-06, + "loss": 0.0176, + "step": 5143 + }, + { + "epoch": 1.717815995992653, + "grad_norm": 0.3173399095946577, + "learning_rate": 4.604943868355251e-06, + "loss": 0.0262, + "step": 5144 + }, + { + "epoch": 1.7181499415595258, + "grad_norm": 0.32845666940588364, + "learning_rate": 4.603006640309658e-06, + "loss": 0.0269, + "step": 5145 + }, + { + "epoch": 1.7184838871263985, + "grad_norm": 0.24539305320111826, + "learning_rate": 4.601069472234584e-06, + "loss": 0.0191, + "step": 5146 + }, + { + "epoch": 1.718817832693271, + "grad_norm": 0.24112482557342005, + "learning_rate": 4.599132364422666e-06, + "loss": 0.023, + "step": 5147 + }, + { + "epoch": 1.7191517782601435, + "grad_norm": 0.2584785514803024, + "learning_rate": 4.597195317166525e-06, + "loss": 0.024, + "step": 5148 + }, + { + "epoch": 1.7194857238270163, + "grad_norm": 0.25819795597233125, + "learning_rate": 4.595258330758773e-06, + "loss": 0.021, + "step": 5149 + }, + { + "epoch": 1.7198196693938888, + "grad_norm": 0.2731501980834195, + "learning_rate": 4.593321405492017e-06, + "loss": 0.0202, + "step": 5150 + }, + { + "epoch": 1.7201536149607612, + "grad_norm": 0.2899731157210256, + "learning_rate": 4.59138454165885e-06, + "loss": 0.0251, + "step": 5151 + }, + { + "epoch": 1.720487560527634, + "grad_norm": 0.27693644241374615, + "learning_rate": 4.589447739551857e-06, + "loss": 0.0251, + "step": 5152 + }, + { + "epoch": 1.7208215060945067, + "grad_norm": 0.3480525121627245, + "learning_rate": 4.58751099946362e-06, + "loss": 0.0277, + "step": 5153 + }, + { + "epoch": 1.7211554516613792, + "grad_norm": 0.27701070481131357, + "learning_rate": 4.585574321686704e-06, + "loss": 0.0257, + "step": 5154 + }, + { + "epoch": 1.7214893972282517, + "grad_norm": 0.26125681321982575, + "learning_rate": 4.583637706513665e-06, + "loss": 0.0249, + "step": 5155 + }, + { + "epoch": 1.7218233427951244, + "grad_norm": 0.2415452060756729, + "learning_rate": 4.5817011542370535e-06, + "loss": 0.0227, + "step": 5156 + }, + { + "epoch": 1.722157288361997, + "grad_norm": 0.26543938489252966, + "learning_rate": 4.579764665149409e-06, + "loss": 0.0217, + "step": 5157 + }, + { + "epoch": 1.7224912339288696, + "grad_norm": 0.28644415612875096, + "learning_rate": 4.577828239543257e-06, + "loss": 0.0198, + "step": 5158 + }, + { + "epoch": 1.722825179495742, + "grad_norm": 0.24786963473825338, + "learning_rate": 4.575891877711123e-06, + "loss": 0.0254, + "step": 5159 + }, + { + "epoch": 1.7231591250626148, + "grad_norm": 0.25167514847976147, + "learning_rate": 4.573955579945514e-06, + "loss": 0.0305, + "step": 5160 + }, + { + "epoch": 1.7234930706294875, + "grad_norm": 0.2044952223511773, + "learning_rate": 4.572019346538931e-06, + "loss": 0.0142, + "step": 5161 + }, + { + "epoch": 1.72382701619636, + "grad_norm": 0.3470529283014334, + "learning_rate": 4.570083177783865e-06, + "loss": 0.0294, + "step": 5162 + }, + { + "epoch": 1.7241609617632325, + "grad_norm": 0.28886051693419007, + "learning_rate": 4.568147073972795e-06, + "loss": 0.0324, + "step": 5163 + }, + { + "epoch": 1.7244949073301052, + "grad_norm": 0.2575548960939929, + "learning_rate": 4.566211035398196e-06, + "loss": 0.0272, + "step": 5164 + }, + { + "epoch": 1.7248288528969777, + "grad_norm": 0.23550200627980475, + "learning_rate": 4.564275062352529e-06, + "loss": 0.0193, + "step": 5165 + }, + { + "epoch": 1.7251627984638502, + "grad_norm": 0.3550995831730855, + "learning_rate": 4.5623391551282435e-06, + "loss": 0.0318, + "step": 5166 + }, + { + "epoch": 1.725496744030723, + "grad_norm": 0.2433252411104981, + "learning_rate": 4.560403314017782e-06, + "loss": 0.0218, + "step": 5167 + }, + { + "epoch": 1.7258306895975957, + "grad_norm": 0.24278489909282805, + "learning_rate": 4.558467539313576e-06, + "loss": 0.0204, + "step": 5168 + }, + { + "epoch": 1.7261646351644682, + "grad_norm": 0.2910727371167379, + "learning_rate": 4.556531831308045e-06, + "loss": 0.0289, + "step": 5169 + }, + { + "epoch": 1.7264985807313407, + "grad_norm": 0.30541878123697425, + "learning_rate": 4.554596190293606e-06, + "loss": 0.0231, + "step": 5170 + }, + { + "epoch": 1.7268325262982134, + "grad_norm": 0.35690239159501624, + "learning_rate": 4.552660616562655e-06, + "loss": 0.0173, + "step": 5171 + }, + { + "epoch": 1.727166471865086, + "grad_norm": 0.2581256706438739, + "learning_rate": 4.550725110407586e-06, + "loss": 0.0224, + "step": 5172 + }, + { + "epoch": 1.7275004174319586, + "grad_norm": 0.22722182545082023, + "learning_rate": 4.548789672120779e-06, + "loss": 0.0158, + "step": 5173 + }, + { + "epoch": 1.727834362998831, + "grad_norm": 0.2604577885495867, + "learning_rate": 4.5468543019946045e-06, + "loss": 0.0185, + "step": 5174 + }, + { + "epoch": 1.7281683085657038, + "grad_norm": 0.3221706980927786, + "learning_rate": 4.544919000321421e-06, + "loss": 0.0216, + "step": 5175 + }, + { + "epoch": 1.7285022541325765, + "grad_norm": 0.2266912598497077, + "learning_rate": 4.542983767393584e-06, + "loss": 0.0175, + "step": 5176 + }, + { + "epoch": 1.728836199699449, + "grad_norm": 0.3941274815544478, + "learning_rate": 4.541048603503429e-06, + "loss": 0.0432, + "step": 5177 + }, + { + "epoch": 1.7291701452663215, + "grad_norm": 0.27304423179494175, + "learning_rate": 4.539113508943287e-06, + "loss": 0.0263, + "step": 5178 + }, + { + "epoch": 1.7295040908331942, + "grad_norm": 0.242960209646446, + "learning_rate": 4.537178484005476e-06, + "loss": 0.02, + "step": 5179 + }, + { + "epoch": 1.729838036400067, + "grad_norm": 0.2741842848720956, + "learning_rate": 4.535243528982305e-06, + "loss": 0.0185, + "step": 5180 + }, + { + "epoch": 1.7301719819669394, + "grad_norm": 0.2607738698037124, + "learning_rate": 4.53330864416607e-06, + "loss": 0.0173, + "step": 5181 + }, + { + "epoch": 1.730505927533812, + "grad_norm": 0.255859782690453, + "learning_rate": 4.531373829849061e-06, + "loss": 0.0223, + "step": 5182 + }, + { + "epoch": 1.7308398731006847, + "grad_norm": 0.28134369235214096, + "learning_rate": 4.529439086323552e-06, + "loss": 0.0239, + "step": 5183 + }, + { + "epoch": 1.7311738186675572, + "grad_norm": 0.25464785465833706, + "learning_rate": 4.52750441388181e-06, + "loss": 0.0224, + "step": 5184 + }, + { + "epoch": 1.7315077642344296, + "grad_norm": 0.22273440750512966, + "learning_rate": 4.52556981281609e-06, + "loss": 0.0166, + "step": 5185 + }, + { + "epoch": 1.7318417098013024, + "grad_norm": 0.29297547989847234, + "learning_rate": 4.523635283418635e-06, + "loss": 0.0231, + "step": 5186 + }, + { + "epoch": 1.732175655368175, + "grad_norm": 0.25082038119545436, + "learning_rate": 4.521700825981678e-06, + "loss": 0.0231, + "step": 5187 + }, + { + "epoch": 1.7325096009350476, + "grad_norm": 0.25437856871721237, + "learning_rate": 4.519766440797446e-06, + "loss": 0.0244, + "step": 5188 + }, + { + "epoch": 1.73284354650192, + "grad_norm": 0.27913666196932985, + "learning_rate": 4.517832128158147e-06, + "loss": 0.0307, + "step": 5189 + }, + { + "epoch": 1.7331774920687928, + "grad_norm": 0.20122095846884455, + "learning_rate": 4.515897888355982e-06, + "loss": 0.0168, + "step": 5190 + }, + { + "epoch": 1.7335114376356655, + "grad_norm": 0.27965290295736694, + "learning_rate": 4.513963721683142e-06, + "loss": 0.017, + "step": 5191 + }, + { + "epoch": 1.733845383202538, + "grad_norm": 0.2580682457275597, + "learning_rate": 4.5120296284318035e-06, + "loss": 0.0161, + "step": 5192 + }, + { + "epoch": 1.7341793287694105, + "grad_norm": 0.23191787950935147, + "learning_rate": 4.510095608894134e-06, + "loss": 0.0207, + "step": 5193 + }, + { + "epoch": 1.7345132743362832, + "grad_norm": 0.29488894091518947, + "learning_rate": 4.508161663362294e-06, + "loss": 0.0239, + "step": 5194 + }, + { + "epoch": 1.734847219903156, + "grad_norm": 0.2894669906650829, + "learning_rate": 4.506227792128424e-06, + "loss": 0.0261, + "step": 5195 + }, + { + "epoch": 1.7351811654700284, + "grad_norm": 0.2769926890552985, + "learning_rate": 4.504293995484662e-06, + "loss": 0.0217, + "step": 5196 + }, + { + "epoch": 1.735515111036901, + "grad_norm": 0.2817270523625594, + "learning_rate": 4.502360273723127e-06, + "loss": 0.0228, + "step": 5197 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 0.3528872875931685, + "learning_rate": 4.500426627135933e-06, + "loss": 0.0217, + "step": 5198 + }, + { + "epoch": 1.7361830021706461, + "grad_norm": 0.3912683429319728, + "learning_rate": 4.4984930560151776e-06, + "loss": 0.0207, + "step": 5199 + }, + { + "epoch": 1.7365169477375186, + "grad_norm": 0.26965344769366073, + "learning_rate": 4.496559560652952e-06, + "loss": 0.0231, + "step": 5200 + }, + { + "epoch": 1.7368508933043914, + "grad_norm": 0.3743078925794467, + "learning_rate": 4.494626141341334e-06, + "loss": 0.0229, + "step": 5201 + }, + { + "epoch": 1.737184838871264, + "grad_norm": 0.2502915835025997, + "learning_rate": 4.4926927983723876e-06, + "loss": 0.0261, + "step": 5202 + }, + { + "epoch": 1.7375187844381366, + "grad_norm": 0.34834333501190895, + "learning_rate": 4.490759532038166e-06, + "loss": 0.0271, + "step": 5203 + }, + { + "epoch": 1.737852730005009, + "grad_norm": 0.33511590487420073, + "learning_rate": 4.488826342630714e-06, + "loss": 0.0263, + "step": 5204 + }, + { + "epoch": 1.7381866755718818, + "grad_norm": 0.3954516710449802, + "learning_rate": 4.486893230442062e-06, + "loss": 0.0495, + "step": 5205 + }, + { + "epoch": 1.7385206211387545, + "grad_norm": 0.2259170817390708, + "learning_rate": 4.4849601957642295e-06, + "loss": 0.0151, + "step": 5206 + }, + { + "epoch": 1.738854566705627, + "grad_norm": 0.2672302838807095, + "learning_rate": 4.483027238889223e-06, + "loss": 0.0272, + "step": 5207 + }, + { + "epoch": 1.7391885122724995, + "grad_norm": 0.26479393205927415, + "learning_rate": 4.48109436010904e-06, + "loss": 0.0205, + "step": 5208 + }, + { + "epoch": 1.7395224578393722, + "grad_norm": 0.2575717548051547, + "learning_rate": 4.4791615597156635e-06, + "loss": 0.0182, + "step": 5209 + }, + { + "epoch": 1.739856403406245, + "grad_norm": 0.2696827620570794, + "learning_rate": 4.477228838001065e-06, + "loss": 0.0225, + "step": 5210 + }, + { + "epoch": 1.7401903489731174, + "grad_norm": 0.26423003954609514, + "learning_rate": 4.475296195257206e-06, + "loss": 0.021, + "step": 5211 + }, + { + "epoch": 1.74052429453999, + "grad_norm": 0.27984330617580844, + "learning_rate": 4.4733636317760365e-06, + "loss": 0.0253, + "step": 5212 + }, + { + "epoch": 1.7408582401068626, + "grad_norm": 0.25585669363698704, + "learning_rate": 4.471431147849491e-06, + "loss": 0.0173, + "step": 5213 + }, + { + "epoch": 1.7411921856737351, + "grad_norm": 0.4212922976308605, + "learning_rate": 4.469498743769493e-06, + "loss": 0.0391, + "step": 5214 + }, + { + "epoch": 1.7415261312406076, + "grad_norm": 0.31581981326018854, + "learning_rate": 4.467566419827958e-06, + "loss": 0.0299, + "step": 5215 + }, + { + "epoch": 1.7418600768074803, + "grad_norm": 0.3834912936761464, + "learning_rate": 4.465634176316782e-06, + "loss": 0.0256, + "step": 5216 + }, + { + "epoch": 1.742194022374353, + "grad_norm": 0.21644224182564878, + "learning_rate": 4.463702013527857e-06, + "loss": 0.0172, + "step": 5217 + }, + { + "epoch": 1.7425279679412256, + "grad_norm": 0.330355490467127, + "learning_rate": 4.4617699317530585e-06, + "loss": 0.0303, + "step": 5218 + }, + { + "epoch": 1.742861913508098, + "grad_norm": 0.2143760695592441, + "learning_rate": 4.459837931284249e-06, + "loss": 0.0164, + "step": 5219 + }, + { + "epoch": 1.7431958590749708, + "grad_norm": 0.2588968504882097, + "learning_rate": 4.45790601241328e-06, + "loss": 0.0202, + "step": 5220 + }, + { + "epoch": 1.7435298046418435, + "grad_norm": 0.3282147515400972, + "learning_rate": 4.45597417543199e-06, + "loss": 0.0241, + "step": 5221 + }, + { + "epoch": 1.743863750208716, + "grad_norm": 0.3435669407216119, + "learning_rate": 4.454042420632206e-06, + "loss": 0.0279, + "step": 5222 + }, + { + "epoch": 1.7441976957755885, + "grad_norm": 0.2101296550084173, + "learning_rate": 4.452110748305744e-06, + "loss": 0.017, + "step": 5223 + }, + { + "epoch": 1.7445316413424612, + "grad_norm": 0.21290305536117435, + "learning_rate": 4.450179158744405e-06, + "loss": 0.0174, + "step": 5224 + }, + { + "epoch": 1.744865586909334, + "grad_norm": 0.26165712554358617, + "learning_rate": 4.448247652239978e-06, + "loss": 0.0202, + "step": 5225 + }, + { + "epoch": 1.7451995324762064, + "grad_norm": 0.23896565599813474, + "learning_rate": 4.4463162290842395e-06, + "loss": 0.0152, + "step": 5226 + }, + { + "epoch": 1.745533478043079, + "grad_norm": 0.20755652136602437, + "learning_rate": 4.444384889568954e-06, + "loss": 0.0154, + "step": 5227 + }, + { + "epoch": 1.7458674236099516, + "grad_norm": 0.3630635864615925, + "learning_rate": 4.442453633985872e-06, + "loss": 0.028, + "step": 5228 + }, + { + "epoch": 1.7462013691768243, + "grad_norm": 0.328488739761976, + "learning_rate": 4.4405224626267345e-06, + "loss": 0.0236, + "step": 5229 + }, + { + "epoch": 1.7465353147436968, + "grad_norm": 0.4058031221515435, + "learning_rate": 4.438591375783267e-06, + "loss": 0.0358, + "step": 5230 + }, + { + "epoch": 1.7468692603105693, + "grad_norm": 0.35575753423783746, + "learning_rate": 4.4366603737471825e-06, + "loss": 0.0262, + "step": 5231 + }, + { + "epoch": 1.747203205877442, + "grad_norm": 0.23757363888091992, + "learning_rate": 4.434729456810182e-06, + "loss": 0.0156, + "step": 5232 + }, + { + "epoch": 1.7475371514443145, + "grad_norm": 0.3700115258942405, + "learning_rate": 4.432798625263951e-06, + "loss": 0.0298, + "step": 5233 + }, + { + "epoch": 1.747871097011187, + "grad_norm": 0.2706429041718493, + "learning_rate": 4.430867879400167e-06, + "loss": 0.0145, + "step": 5234 + }, + { + "epoch": 1.7482050425780598, + "grad_norm": 0.27304067087571326, + "learning_rate": 4.428937219510491e-06, + "loss": 0.0202, + "step": 5235 + }, + { + "epoch": 1.7485389881449325, + "grad_norm": 0.38639372012668477, + "learning_rate": 4.427006645886573e-06, + "loss": 0.0234, + "step": 5236 + }, + { + "epoch": 1.748872933711805, + "grad_norm": 0.2612221157674085, + "learning_rate": 4.425076158820048e-06, + "loss": 0.0277, + "step": 5237 + }, + { + "epoch": 1.7492068792786775, + "grad_norm": 0.3044920989736937, + "learning_rate": 4.423145758602538e-06, + "loss": 0.0311, + "step": 5238 + }, + { + "epoch": 1.7495408248455502, + "grad_norm": 0.3040687089932642, + "learning_rate": 4.4212154455256535e-06, + "loss": 0.0252, + "step": 5239 + }, + { + "epoch": 1.749874770412423, + "grad_norm": 0.3793438133424752, + "learning_rate": 4.41928521988099e-06, + "loss": 0.0382, + "step": 5240 + }, + { + "epoch": 1.7502087159792954, + "grad_norm": 0.2999439361917826, + "learning_rate": 4.417355081960133e-06, + "loss": 0.0217, + "step": 5241 + }, + { + "epoch": 1.7505426615461679, + "grad_norm": 0.2658247453435652, + "learning_rate": 4.415425032054651e-06, + "loss": 0.0308, + "step": 5242 + }, + { + "epoch": 1.7508766071130406, + "grad_norm": 0.23317641913026302, + "learning_rate": 4.413495070456101e-06, + "loss": 0.0183, + "step": 5243 + }, + { + "epoch": 1.7512105526799133, + "grad_norm": 0.21068434522021748, + "learning_rate": 4.411565197456027e-06, + "loss": 0.0165, + "step": 5244 + }, + { + "epoch": 1.7515444982467858, + "grad_norm": 0.27416963372766473, + "learning_rate": 4.409635413345956e-06, + "loss": 0.0251, + "step": 5245 + }, + { + "epoch": 1.7518784438136583, + "grad_norm": 0.30041001314863824, + "learning_rate": 4.40770571841741e-06, + "loss": 0.0198, + "step": 5246 + }, + { + "epoch": 1.752212389380531, + "grad_norm": 0.2574052028154111, + "learning_rate": 4.405776112961889e-06, + "loss": 0.0219, + "step": 5247 + }, + { + "epoch": 1.7525463349474035, + "grad_norm": 0.2543463471795309, + "learning_rate": 4.4038465972708824e-06, + "loss": 0.0255, + "step": 5248 + }, + { + "epoch": 1.752880280514276, + "grad_norm": 0.31099364739070234, + "learning_rate": 4.4019171716358675e-06, + "loss": 0.0342, + "step": 5249 + }, + { + "epoch": 1.7532142260811487, + "grad_norm": 0.4155957865786336, + "learning_rate": 4.399987836348305e-06, + "loss": 0.0459, + "step": 5250 + }, + { + "epoch": 1.7535481716480215, + "grad_norm": 0.3225731040108076, + "learning_rate": 4.398058591699645e-06, + "loss": 0.0297, + "step": 5251 + }, + { + "epoch": 1.753882117214894, + "grad_norm": 0.30866483615255197, + "learning_rate": 4.396129437981322e-06, + "loss": 0.026, + "step": 5252 + }, + { + "epoch": 1.7542160627817664, + "grad_norm": 0.2823405793941288, + "learning_rate": 4.394200375484758e-06, + "loss": 0.0203, + "step": 5253 + }, + { + "epoch": 1.7545500083486392, + "grad_norm": 0.3377115568658968, + "learning_rate": 4.392271404501361e-06, + "loss": 0.0323, + "step": 5254 + }, + { + "epoch": 1.7548839539155119, + "grad_norm": 0.30509659035590736, + "learning_rate": 4.390342525322524e-06, + "loss": 0.0275, + "step": 5255 + }, + { + "epoch": 1.7552178994823844, + "grad_norm": 0.22784622096849774, + "learning_rate": 4.3884137382396255e-06, + "loss": 0.0144, + "step": 5256 + }, + { + "epoch": 1.7555518450492569, + "grad_norm": 0.24378374758424304, + "learning_rate": 4.3864850435440335e-06, + "loss": 0.0225, + "step": 5257 + }, + { + "epoch": 1.7558857906161296, + "grad_norm": 0.31573952872689903, + "learning_rate": 4.3845564415271e-06, + "loss": 0.0258, + "step": 5258 + }, + { + "epoch": 1.7562197361830023, + "grad_norm": 0.36788575475331386, + "learning_rate": 4.382627932480164e-06, + "loss": 0.026, + "step": 5259 + }, + { + "epoch": 1.7565536817498748, + "grad_norm": 0.2653527898917707, + "learning_rate": 4.380699516694547e-06, + "loss": 0.0213, + "step": 5260 + }, + { + "epoch": 1.7568876273167473, + "grad_norm": 0.36528065971526696, + "learning_rate": 4.37877119446156e-06, + "loss": 0.0235, + "step": 5261 + }, + { + "epoch": 1.75722157288362, + "grad_norm": 0.33795778085562383, + "learning_rate": 4.3768429660725e-06, + "loss": 0.0231, + "step": 5262 + }, + { + "epoch": 1.7575555184504925, + "grad_norm": 0.2356782285703482, + "learning_rate": 4.374914831818643e-06, + "loss": 0.0161, + "step": 5263 + }, + { + "epoch": 1.757889464017365, + "grad_norm": 0.33317869895051455, + "learning_rate": 4.372986791991265e-06, + "loss": 0.0184, + "step": 5264 + }, + { + "epoch": 1.7582234095842377, + "grad_norm": 0.2287786204950603, + "learning_rate": 4.371058846881614e-06, + "loss": 0.0219, + "step": 5265 + }, + { + "epoch": 1.7585573551511104, + "grad_norm": 0.26532748190311334, + "learning_rate": 4.36913099678093e-06, + "loss": 0.0271, + "step": 5266 + }, + { + "epoch": 1.758891300717983, + "grad_norm": 0.32653349239746027, + "learning_rate": 4.367203241980437e-06, + "loss": 0.0226, + "step": 5267 + }, + { + "epoch": 1.7592252462848554, + "grad_norm": 0.2428359751991708, + "learning_rate": 4.3652755827713456e-06, + "loss": 0.0206, + "step": 5268 + }, + { + "epoch": 1.7595591918517282, + "grad_norm": 0.253140371750707, + "learning_rate": 4.363348019444848e-06, + "loss": 0.0207, + "step": 5269 + }, + { + "epoch": 1.7598931374186009, + "grad_norm": 0.3183427687083458, + "learning_rate": 4.361420552292132e-06, + "loss": 0.0215, + "step": 5270 + }, + { + "epoch": 1.7602270829854734, + "grad_norm": 0.20742994013478552, + "learning_rate": 4.35949318160436e-06, + "loss": 0.0237, + "step": 5271 + }, + { + "epoch": 1.7605610285523459, + "grad_norm": 0.31861864003067447, + "learning_rate": 4.357565907672684e-06, + "loss": 0.0224, + "step": 5272 + }, + { + "epoch": 1.7608949741192186, + "grad_norm": 0.2872871253468191, + "learning_rate": 4.355638730788242e-06, + "loss": 0.0245, + "step": 5273 + }, + { + "epoch": 1.7612289196860913, + "grad_norm": 0.3656731111702706, + "learning_rate": 4.353711651242157e-06, + "loss": 0.0265, + "step": 5274 + }, + { + "epoch": 1.7615628652529638, + "grad_norm": 0.22647542136807475, + "learning_rate": 4.3517846693255365e-06, + "loss": 0.0172, + "step": 5275 + }, + { + "epoch": 1.7618968108198363, + "grad_norm": 0.3169411742043286, + "learning_rate": 4.349857785329475e-06, + "loss": 0.0324, + "step": 5276 + }, + { + "epoch": 1.762230756386709, + "grad_norm": 0.20135404230110754, + "learning_rate": 4.34793099954505e-06, + "loss": 0.014, + "step": 5277 + }, + { + "epoch": 1.7625647019535817, + "grad_norm": 0.32577566003284725, + "learning_rate": 4.3460043122633256e-06, + "loss": 0.0411, + "step": 5278 + }, + { + "epoch": 1.7628986475204542, + "grad_norm": 0.22638661706663452, + "learning_rate": 4.344077723775349e-06, + "loss": 0.0163, + "step": 5279 + }, + { + "epoch": 1.7632325930873267, + "grad_norm": 0.3359730383509722, + "learning_rate": 4.342151234372155e-06, + "loss": 0.0407, + "step": 5280 + }, + { + "epoch": 1.7635665386541994, + "grad_norm": 0.26602280764370484, + "learning_rate": 4.340224844344766e-06, + "loss": 0.0176, + "step": 5281 + }, + { + "epoch": 1.763900484221072, + "grad_norm": 0.2628029157397793, + "learning_rate": 4.338298553984181e-06, + "loss": 0.024, + "step": 5282 + }, + { + "epoch": 1.7642344297879444, + "grad_norm": 0.3145709521741432, + "learning_rate": 4.336372363581391e-06, + "loss": 0.0215, + "step": 5283 + }, + { + "epoch": 1.7645683753548171, + "grad_norm": 0.2618626178881137, + "learning_rate": 4.33444627342737e-06, + "loss": 0.023, + "step": 5284 + }, + { + "epoch": 1.7649023209216899, + "grad_norm": 0.24790118139469644, + "learning_rate": 4.332520283813075e-06, + "loss": 0.0214, + "step": 5285 + }, + { + "epoch": 1.7652362664885624, + "grad_norm": 0.3790678109613269, + "learning_rate": 4.330594395029449e-06, + "loss": 0.0621, + "step": 5286 + }, + { + "epoch": 1.7655702120554349, + "grad_norm": 0.25914103207267886, + "learning_rate": 4.328668607367424e-06, + "loss": 0.0212, + "step": 5287 + }, + { + "epoch": 1.7659041576223076, + "grad_norm": 0.29920527792637985, + "learning_rate": 4.326742921117911e-06, + "loss": 0.0264, + "step": 5288 + }, + { + "epoch": 1.7662381031891803, + "grad_norm": 0.38668903811160105, + "learning_rate": 4.324817336571806e-06, + "loss": 0.0299, + "step": 5289 + }, + { + "epoch": 1.7665720487560528, + "grad_norm": 0.31248843933088094, + "learning_rate": 4.3228918540199926e-06, + "loss": 0.0309, + "step": 5290 + }, + { + "epoch": 1.7669059943229253, + "grad_norm": 0.24132598955109996, + "learning_rate": 4.320966473753337e-06, + "loss": 0.0246, + "step": 5291 + }, + { + "epoch": 1.767239939889798, + "grad_norm": 0.26448154221641196, + "learning_rate": 4.31904119606269e-06, + "loss": 0.0214, + "step": 5292 + }, + { + "epoch": 1.7675738854566707, + "grad_norm": 0.32016062345779206, + "learning_rate": 4.31711602123889e-06, + "loss": 0.0201, + "step": 5293 + }, + { + "epoch": 1.7679078310235432, + "grad_norm": 0.20820968552951152, + "learning_rate": 4.315190949572755e-06, + "loss": 0.0128, + "step": 5294 + }, + { + "epoch": 1.7682417765904157, + "grad_norm": 0.2877356275411102, + "learning_rate": 4.313265981355091e-06, + "loss": 0.0237, + "step": 5295 + }, + { + "epoch": 1.7685757221572884, + "grad_norm": 0.3507237164174799, + "learning_rate": 4.311341116876687e-06, + "loss": 0.0199, + "step": 5296 + }, + { + "epoch": 1.768909667724161, + "grad_norm": 0.2842876528233048, + "learning_rate": 4.309416356428315e-06, + "loss": 0.0198, + "step": 5297 + }, + { + "epoch": 1.7692436132910334, + "grad_norm": 0.3450221107394591, + "learning_rate": 4.307491700300733e-06, + "loss": 0.0233, + "step": 5298 + }, + { + "epoch": 1.7695775588579061, + "grad_norm": 0.25229333425881884, + "learning_rate": 4.305567148784685e-06, + "loss": 0.0187, + "step": 5299 + }, + { + "epoch": 1.7699115044247788, + "grad_norm": 0.20358798555497293, + "learning_rate": 4.3036427021708955e-06, + "loss": 0.0149, + "step": 5300 + }, + { + "epoch": 1.7702454499916513, + "grad_norm": 0.349335170862707, + "learning_rate": 4.301718360750074e-06, + "loss": 0.0235, + "step": 5301 + }, + { + "epoch": 1.7705793955585238, + "grad_norm": 0.21444336137973968, + "learning_rate": 4.299794124812918e-06, + "loss": 0.0185, + "step": 5302 + }, + { + "epoch": 1.7709133411253966, + "grad_norm": 0.34497724819497605, + "learning_rate": 4.297869994650103e-06, + "loss": 0.0337, + "step": 5303 + }, + { + "epoch": 1.7712472866922693, + "grad_norm": 0.24126082129522472, + "learning_rate": 4.295945970552293e-06, + "loss": 0.0209, + "step": 5304 + }, + { + "epoch": 1.7715812322591418, + "grad_norm": 0.2643741860051308, + "learning_rate": 4.294022052810134e-06, + "loss": 0.0262, + "step": 5305 + }, + { + "epoch": 1.7719151778260143, + "grad_norm": 0.338841061616678, + "learning_rate": 4.292098241714256e-06, + "loss": 0.0221, + "step": 5306 + }, + { + "epoch": 1.772249123392887, + "grad_norm": 0.22618246910030593, + "learning_rate": 4.290174537555275e-06, + "loss": 0.0189, + "step": 5307 + }, + { + "epoch": 1.7725830689597597, + "grad_norm": 0.40917282584344794, + "learning_rate": 4.2882509406237885e-06, + "loss": 0.0192, + "step": 5308 + }, + { + "epoch": 1.7729170145266322, + "grad_norm": 0.24258968238238737, + "learning_rate": 4.286327451210377e-06, + "loss": 0.0196, + "step": 5309 + }, + { + "epoch": 1.7732509600935047, + "grad_norm": 0.2887790744063041, + "learning_rate": 4.284404069605605e-06, + "loss": 0.0322, + "step": 5310 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 0.2617785767881327, + "learning_rate": 4.282480796100027e-06, + "loss": 0.0232, + "step": 5311 + }, + { + "epoch": 1.77391885122725, + "grad_norm": 0.24059581061942995, + "learning_rate": 4.280557630984173e-06, + "loss": 0.0178, + "step": 5312 + }, + { + "epoch": 1.7742527967941224, + "grad_norm": 0.3566445937685634, + "learning_rate": 4.27863457454856e-06, + "loss": 0.0309, + "step": 5313 + }, + { + "epoch": 1.7745867423609951, + "grad_norm": 0.2085309533801013, + "learning_rate": 4.276711627083688e-06, + "loss": 0.0166, + "step": 5314 + }, + { + "epoch": 1.7749206879278678, + "grad_norm": 0.29753796148611467, + "learning_rate": 4.274788788880041e-06, + "loss": 0.0274, + "step": 5315 + }, + { + "epoch": 1.7752546334947403, + "grad_norm": 0.310507014057238, + "learning_rate": 4.272866060228084e-06, + "loss": 0.026, + "step": 5316 + }, + { + "epoch": 1.7755885790616128, + "grad_norm": 0.2375018822584037, + "learning_rate": 4.270943441418275e-06, + "loss": 0.0236, + "step": 5317 + }, + { + "epoch": 1.7759225246284855, + "grad_norm": 0.22875405020671685, + "learning_rate": 4.2690209327410406e-06, + "loss": 0.0149, + "step": 5318 + }, + { + "epoch": 1.7762564701953583, + "grad_norm": 0.33326698236459007, + "learning_rate": 4.267098534486803e-06, + "loss": 0.0282, + "step": 5319 + }, + { + "epoch": 1.7765904157622308, + "grad_norm": 0.30360024900284505, + "learning_rate": 4.26517624694596e-06, + "loss": 0.0206, + "step": 5320 + }, + { + "epoch": 1.7769243613291033, + "grad_norm": 0.27982814877797346, + "learning_rate": 4.2632540704088975e-06, + "loss": 0.0234, + "step": 5321 + }, + { + "epoch": 1.777258306895976, + "grad_norm": 0.35480889529813037, + "learning_rate": 4.261332005165984e-06, + "loss": 0.0188, + "step": 5322 + }, + { + "epoch": 1.7775922524628487, + "grad_norm": 0.25259645616514015, + "learning_rate": 4.259410051507567e-06, + "loss": 0.0204, + "step": 5323 + }, + { + "epoch": 1.7779261980297212, + "grad_norm": 0.3132454024785608, + "learning_rate": 4.257488209723981e-06, + "loss": 0.0273, + "step": 5324 + }, + { + "epoch": 1.7782601435965937, + "grad_norm": 0.29434662822649665, + "learning_rate": 4.255566480105546e-06, + "loss": 0.0253, + "step": 5325 + }, + { + "epoch": 1.7785940891634664, + "grad_norm": 0.29831340270595536, + "learning_rate": 4.2536448629425585e-06, + "loss": 0.0173, + "step": 5326 + }, + { + "epoch": 1.7789280347303391, + "grad_norm": 0.2569724696689149, + "learning_rate": 4.2517233585253024e-06, + "loss": 0.0257, + "step": 5327 + }, + { + "epoch": 1.7792619802972116, + "grad_norm": 0.376623295708525, + "learning_rate": 4.2498019671440435e-06, + "loss": 0.0334, + "step": 5328 + }, + { + "epoch": 1.779595925864084, + "grad_norm": 0.25523237722503017, + "learning_rate": 4.247880689089033e-06, + "loss": 0.0242, + "step": 5329 + }, + { + "epoch": 1.7799298714309568, + "grad_norm": 0.2569011744362559, + "learning_rate": 4.245959524650498e-06, + "loss": 0.0223, + "step": 5330 + }, + { + "epoch": 1.7802638169978293, + "grad_norm": 0.26138192449838726, + "learning_rate": 4.244038474118656e-06, + "loss": 0.0188, + "step": 5331 + }, + { + "epoch": 1.7805977625647018, + "grad_norm": 0.22985300788655164, + "learning_rate": 4.242117537783704e-06, + "loss": 0.0199, + "step": 5332 + }, + { + "epoch": 1.7809317081315745, + "grad_norm": 0.29138987527189336, + "learning_rate": 4.2401967159358195e-06, + "loss": 0.0191, + "step": 5333 + }, + { + "epoch": 1.7812656536984472, + "grad_norm": 0.3103929794813883, + "learning_rate": 4.2382760088651696e-06, + "loss": 0.0213, + "step": 5334 + }, + { + "epoch": 1.7815995992653197, + "grad_norm": 0.28698349102347676, + "learning_rate": 4.236355416861897e-06, + "loss": 0.025, + "step": 5335 + }, + { + "epoch": 1.7819335448321922, + "grad_norm": 0.21359581758066254, + "learning_rate": 4.23443494021613e-06, + "loss": 0.0207, + "step": 5336 + }, + { + "epoch": 1.782267490399065, + "grad_norm": 0.2733168254407143, + "learning_rate": 4.232514579217981e-06, + "loss": 0.0205, + "step": 5337 + }, + { + "epoch": 1.7826014359659377, + "grad_norm": 0.31333058790493223, + "learning_rate": 4.23059433415754e-06, + "loss": 0.029, + "step": 5338 + }, + { + "epoch": 1.7829353815328102, + "grad_norm": 0.2553071507076, + "learning_rate": 4.228674205324884e-06, + "loss": 0.0208, + "step": 5339 + }, + { + "epoch": 1.7832693270996827, + "grad_norm": 0.3666602706218142, + "learning_rate": 4.226754193010072e-06, + "loss": 0.02, + "step": 5340 + }, + { + "epoch": 1.7836032726665554, + "grad_norm": 0.1696611479409936, + "learning_rate": 4.224834297503145e-06, + "loss": 0.0142, + "step": 5341 + }, + { + "epoch": 1.783937218233428, + "grad_norm": 0.2690713183795865, + "learning_rate": 4.222914519094124e-06, + "loss": 0.0224, + "step": 5342 + }, + { + "epoch": 1.7842711638003006, + "grad_norm": 0.29383558269603804, + "learning_rate": 4.220994858073014e-06, + "loss": 0.0273, + "step": 5343 + }, + { + "epoch": 1.784605109367173, + "grad_norm": 0.22268131709141537, + "learning_rate": 4.2190753147298044e-06, + "loss": 0.0203, + "step": 5344 + }, + { + "epoch": 1.7849390549340458, + "grad_norm": 0.24896416899627177, + "learning_rate": 4.2171558893544626e-06, + "loss": 0.0233, + "step": 5345 + }, + { + "epoch": 1.7852730005009183, + "grad_norm": 0.2636696161135228, + "learning_rate": 4.215236582236941e-06, + "loss": 0.0254, + "step": 5346 + }, + { + "epoch": 1.7856069460677908, + "grad_norm": 0.18638440619800603, + "learning_rate": 4.213317393667175e-06, + "loss": 0.0168, + "step": 5347 + }, + { + "epoch": 1.7859408916346635, + "grad_norm": 0.25220214295454435, + "learning_rate": 4.211398323935079e-06, + "loss": 0.0227, + "step": 5348 + }, + { + "epoch": 1.7862748372015362, + "grad_norm": 0.5021878889729603, + "learning_rate": 4.209479373330552e-06, + "loss": 0.0391, + "step": 5349 + }, + { + "epoch": 1.7866087827684087, + "grad_norm": 0.2631743106456026, + "learning_rate": 4.207560542143474e-06, + "loss": 0.023, + "step": 5350 + }, + { + "epoch": 1.7869427283352812, + "grad_norm": 0.227006636329623, + "learning_rate": 4.205641830663706e-06, + "loss": 0.0187, + "step": 5351 + }, + { + "epoch": 1.787276673902154, + "grad_norm": 0.3444413732172693, + "learning_rate": 4.2037232391810925e-06, + "loss": 0.0234, + "step": 5352 + }, + { + "epoch": 1.7876106194690267, + "grad_norm": 0.22090864487906997, + "learning_rate": 4.20180476798546e-06, + "loss": 0.0195, + "step": 5353 + }, + { + "epoch": 1.7879445650358992, + "grad_norm": 0.3037373120044528, + "learning_rate": 4.1998864173666174e-06, + "loss": 0.0262, + "step": 5354 + }, + { + "epoch": 1.7882785106027717, + "grad_norm": 0.29325394785793535, + "learning_rate": 4.197968187614351e-06, + "loss": 0.0189, + "step": 5355 + }, + { + "epoch": 1.7886124561696444, + "grad_norm": 0.32265916252831683, + "learning_rate": 4.196050079018433e-06, + "loss": 0.0246, + "step": 5356 + }, + { + "epoch": 1.788946401736517, + "grad_norm": 0.6868177119693587, + "learning_rate": 4.194132091868616e-06, + "loss": 0.0484, + "step": 5357 + }, + { + "epoch": 1.7892803473033896, + "grad_norm": 0.26307189164847017, + "learning_rate": 4.1922142264546365e-06, + "loss": 0.0213, + "step": 5358 + }, + { + "epoch": 1.789614292870262, + "grad_norm": 0.2590407420495831, + "learning_rate": 4.1902964830662104e-06, + "loss": 0.021, + "step": 5359 + }, + { + "epoch": 1.7899482384371348, + "grad_norm": 0.29313996054388525, + "learning_rate": 4.188378861993034e-06, + "loss": 0.0188, + "step": 5360 + }, + { + "epoch": 1.7902821840040073, + "grad_norm": 0.2642773265804162, + "learning_rate": 4.186461363524786e-06, + "loss": 0.0224, + "step": 5361 + }, + { + "epoch": 1.7906161295708798, + "grad_norm": 0.2644774454859336, + "learning_rate": 4.184543987951127e-06, + "loss": 0.0281, + "step": 5362 + }, + { + "epoch": 1.7909500751377525, + "grad_norm": 0.23094529361829833, + "learning_rate": 4.182626735561703e-06, + "loss": 0.0182, + "step": 5363 + }, + { + "epoch": 1.7912840207046252, + "grad_norm": 0.3491309779710296, + "learning_rate": 4.180709606646134e-06, + "loss": 0.0325, + "step": 5364 + }, + { + "epoch": 1.7916179662714977, + "grad_norm": 0.22344855625000304, + "learning_rate": 4.178792601494026e-06, + "loss": 0.0228, + "step": 5365 + }, + { + "epoch": 1.7919519118383702, + "grad_norm": 0.35552602513091575, + "learning_rate": 4.176875720394965e-06, + "loss": 0.0286, + "step": 5366 + }, + { + "epoch": 1.792285857405243, + "grad_norm": 0.37183661349160846, + "learning_rate": 4.174958963638518e-06, + "loss": 0.0419, + "step": 5367 + }, + { + "epoch": 1.7926198029721157, + "grad_norm": 0.2390298845947433, + "learning_rate": 4.173042331514234e-06, + "loss": 0.018, + "step": 5368 + }, + { + "epoch": 1.7929537485389881, + "grad_norm": 0.3338490058455825, + "learning_rate": 4.171125824311642e-06, + "loss": 0.0301, + "step": 5369 + }, + { + "epoch": 1.7932876941058606, + "grad_norm": 0.24368378382488928, + "learning_rate": 4.169209442320255e-06, + "loss": 0.0231, + "step": 5370 + }, + { + "epoch": 1.7936216396727334, + "grad_norm": 0.2817320940247766, + "learning_rate": 4.167293185829565e-06, + "loss": 0.023, + "step": 5371 + }, + { + "epoch": 1.793955585239606, + "grad_norm": 0.18946796263856613, + "learning_rate": 4.165377055129043e-06, + "loss": 0.0169, + "step": 5372 + }, + { + "epoch": 1.7942895308064786, + "grad_norm": 0.29722934666835743, + "learning_rate": 4.163461050508144e-06, + "loss": 0.028, + "step": 5373 + }, + { + "epoch": 1.794623476373351, + "grad_norm": 0.21908600998447844, + "learning_rate": 4.161545172256303e-06, + "loss": 0.0189, + "step": 5374 + }, + { + "epoch": 1.7949574219402238, + "grad_norm": 0.3225208887986041, + "learning_rate": 4.1596294206629375e-06, + "loss": 0.0309, + "step": 5375 + }, + { + "epoch": 1.7952913675070965, + "grad_norm": 0.30967674890293173, + "learning_rate": 4.157713796017442e-06, + "loss": 0.0272, + "step": 5376 + }, + { + "epoch": 1.795625313073969, + "grad_norm": 0.2835020834126306, + "learning_rate": 4.155798298609196e-06, + "loss": 0.0238, + "step": 5377 + }, + { + "epoch": 1.7959592586408415, + "grad_norm": 0.38842210457929394, + "learning_rate": 4.1538829287275565e-06, + "loss": 0.0306, + "step": 5378 + }, + { + "epoch": 1.7962932042077142, + "grad_norm": 0.4373247550605237, + "learning_rate": 4.151967686661864e-06, + "loss": 0.0304, + "step": 5379 + }, + { + "epoch": 1.7966271497745867, + "grad_norm": 0.2178778814345169, + "learning_rate": 4.150052572701435e-06, + "loss": 0.0183, + "step": 5380 + }, + { + "epoch": 1.7969610953414592, + "grad_norm": 0.23803039038479404, + "learning_rate": 4.148137587135575e-06, + "loss": 0.011, + "step": 5381 + }, + { + "epoch": 1.797295040908332, + "grad_norm": 0.27432238572838175, + "learning_rate": 4.146222730253563e-06, + "loss": 0.0174, + "step": 5382 + }, + { + "epoch": 1.7976289864752046, + "grad_norm": 0.2886665548459234, + "learning_rate": 4.1443080023446605e-06, + "loss": 0.025, + "step": 5383 + }, + { + "epoch": 1.7979629320420771, + "grad_norm": 0.2903039365585172, + "learning_rate": 4.1423934036981096e-06, + "loss": 0.0255, + "step": 5384 + }, + { + "epoch": 1.7982968776089496, + "grad_norm": 0.3539049120782072, + "learning_rate": 4.140478934603133e-06, + "loss": 0.036, + "step": 5385 + }, + { + "epoch": 1.7986308231758223, + "grad_norm": 0.22683048673008707, + "learning_rate": 4.138564595348932e-06, + "loss": 0.0213, + "step": 5386 + }, + { + "epoch": 1.798964768742695, + "grad_norm": 0.22565394299325583, + "learning_rate": 4.136650386224694e-06, + "loss": 0.0201, + "step": 5387 + }, + { + "epoch": 1.7992987143095676, + "grad_norm": 0.30882097747445864, + "learning_rate": 4.13473630751958e-06, + "loss": 0.0287, + "step": 5388 + }, + { + "epoch": 1.79963265987644, + "grad_norm": 0.26398179500046115, + "learning_rate": 4.132822359522735e-06, + "loss": 0.0174, + "step": 5389 + }, + { + "epoch": 1.7999666054433128, + "grad_norm": 0.3235547733972141, + "learning_rate": 4.130908542523285e-06, + "loss": 0.0313, + "step": 5390 + }, + { + "epoch": 1.8003005510101855, + "grad_norm": 0.3575902350850217, + "learning_rate": 4.128994856810332e-06, + "loss": 0.0169, + "step": 5391 + }, + { + "epoch": 1.800634496577058, + "grad_norm": 0.27565167641382193, + "learning_rate": 4.127081302672958e-06, + "loss": 0.0218, + "step": 5392 + }, + { + "epoch": 1.8009684421439305, + "grad_norm": 0.2936261877894819, + "learning_rate": 4.125167880400235e-06, + "loss": 0.0216, + "step": 5393 + }, + { + "epoch": 1.8013023877108032, + "grad_norm": 0.3016931264197796, + "learning_rate": 4.1232545902812046e-06, + "loss": 0.0206, + "step": 5394 + }, + { + "epoch": 1.8016363332776757, + "grad_norm": 0.30476717861849295, + "learning_rate": 4.121341432604892e-06, + "loss": 0.0265, + "step": 5395 + }, + { + "epoch": 1.8019702788445482, + "grad_norm": 0.27747897043735814, + "learning_rate": 4.1194284076603004e-06, + "loss": 0.027, + "step": 5396 + }, + { + "epoch": 1.802304224411421, + "grad_norm": 0.31358773125177547, + "learning_rate": 4.117515515736418e-06, + "loss": 0.0253, + "step": 5397 + }, + { + "epoch": 1.8026381699782936, + "grad_norm": 0.23139827292246676, + "learning_rate": 4.1156027571222054e-06, + "loss": 0.0173, + "step": 5398 + }, + { + "epoch": 1.8029721155451661, + "grad_norm": 0.31138422498829915, + "learning_rate": 4.113690132106611e-06, + "loss": 0.0286, + "step": 5399 + }, + { + "epoch": 1.8033060611120386, + "grad_norm": 0.3279422610620659, + "learning_rate": 4.111777640978559e-06, + "loss": 0.025, + "step": 5400 + }, + { + "epoch": 1.8036400066789113, + "grad_norm": 0.23702486611837187, + "learning_rate": 4.109865284026953e-06, + "loss": 0.0194, + "step": 5401 + }, + { + "epoch": 1.803973952245784, + "grad_norm": 0.4036518116576094, + "learning_rate": 4.107953061540676e-06, + "loss": 0.0207, + "step": 5402 + }, + { + "epoch": 1.8043078978126565, + "grad_norm": 0.26599846945307454, + "learning_rate": 4.10604097380859e-06, + "loss": 0.0193, + "step": 5403 + }, + { + "epoch": 1.804641843379529, + "grad_norm": 0.2421582318039491, + "learning_rate": 4.104129021119543e-06, + "loss": 0.0201, + "step": 5404 + }, + { + "epoch": 1.8049757889464018, + "grad_norm": 0.27743796176695185, + "learning_rate": 4.102217203762357e-06, + "loss": 0.029, + "step": 5405 + }, + { + "epoch": 1.8053097345132745, + "grad_norm": 0.28819453175578164, + "learning_rate": 4.1003055220258335e-06, + "loss": 0.0218, + "step": 5406 + }, + { + "epoch": 1.805643680080147, + "grad_norm": 0.257400719403574, + "learning_rate": 4.0983939761987535e-06, + "loss": 0.0246, + "step": 5407 + }, + { + "epoch": 1.8059776256470195, + "grad_norm": 0.2718063590711352, + "learning_rate": 4.09648256656988e-06, + "loss": 0.0237, + "step": 5408 + }, + { + "epoch": 1.8063115712138922, + "grad_norm": 0.31384646806016225, + "learning_rate": 4.094571293427951e-06, + "loss": 0.0288, + "step": 5409 + }, + { + "epoch": 1.8066455167807647, + "grad_norm": 0.2976011549234811, + "learning_rate": 4.092660157061691e-06, + "loss": 0.0236, + "step": 5410 + }, + { + "epoch": 1.8069794623476372, + "grad_norm": 0.3234524195314314, + "learning_rate": 4.090749157759799e-06, + "loss": 0.0326, + "step": 5411 + }, + { + "epoch": 1.80731340791451, + "grad_norm": 0.3429949172983408, + "learning_rate": 4.088838295810952e-06, + "loss": 0.0371, + "step": 5412 + }, + { + "epoch": 1.8076473534813826, + "grad_norm": 0.2363244998146425, + "learning_rate": 4.086927571503808e-06, + "loss": 0.0248, + "step": 5413 + }, + { + "epoch": 1.807981299048255, + "grad_norm": 0.3265329850053635, + "learning_rate": 4.0850169851270075e-06, + "loss": 0.0336, + "step": 5414 + }, + { + "epoch": 1.8083152446151276, + "grad_norm": 0.24605220793741153, + "learning_rate": 4.0831065369691615e-06, + "loss": 0.0206, + "step": 5415 + }, + { + "epoch": 1.8086491901820003, + "grad_norm": 0.3377906714487289, + "learning_rate": 4.0811962273188714e-06, + "loss": 0.0232, + "step": 5416 + }, + { + "epoch": 1.808983135748873, + "grad_norm": 0.28045208147729683, + "learning_rate": 4.0792860564647105e-06, + "loss": 0.0179, + "step": 5417 + }, + { + "epoch": 1.8093170813157455, + "grad_norm": 0.23352590492682648, + "learning_rate": 4.077376024695231e-06, + "loss": 0.0167, + "step": 5418 + }, + { + "epoch": 1.809651026882618, + "grad_norm": 0.34066937785685186, + "learning_rate": 4.075466132298967e-06, + "loss": 0.0299, + "step": 5419 + }, + { + "epoch": 1.8099849724494907, + "grad_norm": 0.29209763808469125, + "learning_rate": 4.073556379564429e-06, + "loss": 0.0288, + "step": 5420 + }, + { + "epoch": 1.8103189180163635, + "grad_norm": 0.28219662731217826, + "learning_rate": 4.071646766780109e-06, + "loss": 0.0195, + "step": 5421 + }, + { + "epoch": 1.810652863583236, + "grad_norm": 0.24070868548418348, + "learning_rate": 4.069737294234475e-06, + "loss": 0.02, + "step": 5422 + }, + { + "epoch": 1.8109868091501085, + "grad_norm": 0.352103957259622, + "learning_rate": 4.067827962215977e-06, + "loss": 0.0238, + "step": 5423 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.3068937119539319, + "learning_rate": 4.065918771013042e-06, + "loss": 0.0255, + "step": 5424 + }, + { + "epoch": 1.811654700283854, + "grad_norm": 0.2297808906169926, + "learning_rate": 4.064009720914074e-06, + "loss": 0.022, + "step": 5425 + }, + { + "epoch": 1.8119886458507264, + "grad_norm": 0.28344916963934125, + "learning_rate": 4.062100812207459e-06, + "loss": 0.0212, + "step": 5426 + }, + { + "epoch": 1.8123225914175989, + "grad_norm": 0.2414624350592072, + "learning_rate": 4.060192045181558e-06, + "loss": 0.0194, + "step": 5427 + }, + { + "epoch": 1.8126565369844716, + "grad_norm": 0.2601365970991397, + "learning_rate": 4.058283420124716e-06, + "loss": 0.021, + "step": 5428 + }, + { + "epoch": 1.812990482551344, + "grad_norm": 0.21451975788454472, + "learning_rate": 4.056374937325251e-06, + "loss": 0.0156, + "step": 5429 + }, + { + "epoch": 1.8133244281182166, + "grad_norm": 0.34697941022019413, + "learning_rate": 4.054466597071464e-06, + "loss": 0.0366, + "step": 5430 + }, + { + "epoch": 1.8136583736850893, + "grad_norm": 0.24356598349836267, + "learning_rate": 4.05255839965163e-06, + "loss": 0.017, + "step": 5431 + }, + { + "epoch": 1.813992319251962, + "grad_norm": 0.3650783429151526, + "learning_rate": 4.050650345354006e-06, + "loss": 0.0227, + "step": 5432 + }, + { + "epoch": 1.8143262648188345, + "grad_norm": 0.36655433912376695, + "learning_rate": 4.048742434466823e-06, + "loss": 0.0311, + "step": 5433 + }, + { + "epoch": 1.814660210385707, + "grad_norm": 0.290337405991827, + "learning_rate": 4.046834667278298e-06, + "loss": 0.0206, + "step": 5434 + }, + { + "epoch": 1.8149941559525797, + "grad_norm": 0.24916852160715636, + "learning_rate": 4.04492704407662e-06, + "loss": 0.0274, + "step": 5435 + }, + { + "epoch": 1.8153281015194525, + "grad_norm": 0.29535786465917413, + "learning_rate": 4.043019565149958e-06, + "loss": 0.027, + "step": 5436 + }, + { + "epoch": 1.815662047086325, + "grad_norm": 0.2021908403005868, + "learning_rate": 4.041112230786458e-06, + "loss": 0.0155, + "step": 5437 + }, + { + "epoch": 1.8159959926531974, + "grad_norm": 0.3498627376980739, + "learning_rate": 4.039205041274247e-06, + "loss": 0.0167, + "step": 5438 + }, + { + "epoch": 1.8163299382200702, + "grad_norm": 0.41619769673398277, + "learning_rate": 4.0372979969014245e-06, + "loss": 0.0343, + "step": 5439 + }, + { + "epoch": 1.8166638837869429, + "grad_norm": 0.25849631373366844, + "learning_rate": 4.035391097956077e-06, + "loss": 0.0255, + "step": 5440 + }, + { + "epoch": 1.8169978293538154, + "grad_norm": 0.32699202396126836, + "learning_rate": 4.0334843447262625e-06, + "loss": 0.0309, + "step": 5441 + }, + { + "epoch": 1.8173317749206879, + "grad_norm": 0.335101820793403, + "learning_rate": 4.0315777375000185e-06, + "loss": 0.0372, + "step": 5442 + }, + { + "epoch": 1.8176657204875606, + "grad_norm": 0.23616061677896705, + "learning_rate": 4.029671276565359e-06, + "loss": 0.0155, + "step": 5443 + }, + { + "epoch": 1.817999666054433, + "grad_norm": 0.3101037907121459, + "learning_rate": 4.027764962210278e-06, + "loss": 0.0167, + "step": 5444 + }, + { + "epoch": 1.8183336116213056, + "grad_norm": 0.22442417044732038, + "learning_rate": 4.025858794722749e-06, + "loss": 0.0204, + "step": 5445 + }, + { + "epoch": 1.8186675571881783, + "grad_norm": 0.2515916617840221, + "learning_rate": 4.0239527743907184e-06, + "loss": 0.0233, + "step": 5446 + }, + { + "epoch": 1.819001502755051, + "grad_norm": 0.26487508507309315, + "learning_rate": 4.022046901502114e-06, + "loss": 0.023, + "step": 5447 + }, + { + "epoch": 1.8193354483219235, + "grad_norm": 0.23250313759564167, + "learning_rate": 4.020141176344839e-06, + "loss": 0.0206, + "step": 5448 + }, + { + "epoch": 1.819669393888796, + "grad_norm": 0.3282770658819903, + "learning_rate": 4.018235599206778e-06, + "loss": 0.025, + "step": 5449 + }, + { + "epoch": 1.8200033394556687, + "grad_norm": 0.3208453089063136, + "learning_rate": 4.016330170375787e-06, + "loss": 0.0274, + "step": 5450 + }, + { + "epoch": 1.8203372850225414, + "grad_norm": 0.29679640130276935, + "learning_rate": 4.014424890139709e-06, + "loss": 0.0276, + "step": 5451 + }, + { + "epoch": 1.820671230589414, + "grad_norm": 0.24060158633305925, + "learning_rate": 4.012519758786355e-06, + "loss": 0.0237, + "step": 5452 + }, + { + "epoch": 1.8210051761562864, + "grad_norm": 0.25024447045270015, + "learning_rate": 4.01061477660352e-06, + "loss": 0.0204, + "step": 5453 + }, + { + "epoch": 1.8213391217231591, + "grad_norm": 0.23602236265844892, + "learning_rate": 4.008709943878971e-06, + "loss": 0.0199, + "step": 5454 + }, + { + "epoch": 1.8216730672900319, + "grad_norm": 0.2680017862209099, + "learning_rate": 4.006805260900458e-06, + "loss": 0.0211, + "step": 5455 + }, + { + "epoch": 1.8220070128569044, + "grad_norm": 0.4033688267476277, + "learning_rate": 4.004900727955703e-06, + "loss": 0.0283, + "step": 5456 + }, + { + "epoch": 1.8223409584237769, + "grad_norm": 0.4268011302014313, + "learning_rate": 4.0029963453324115e-06, + "loss": 0.024, + "step": 5457 + }, + { + "epoch": 1.8226749039906496, + "grad_norm": 0.26721601677980833, + "learning_rate": 4.001092113318261e-06, + "loss": 0.0216, + "step": 5458 + }, + { + "epoch": 1.823008849557522, + "grad_norm": 0.29060048266066063, + "learning_rate": 3.99918803220091e-06, + "loss": 0.0237, + "step": 5459 + }, + { + "epoch": 1.8233427951243946, + "grad_norm": 0.259958566041274, + "learning_rate": 3.99728410226799e-06, + "loss": 0.0284, + "step": 5460 + }, + { + "epoch": 1.8236767406912673, + "grad_norm": 0.19359468067247562, + "learning_rate": 3.995380323807113e-06, + "loss": 0.0143, + "step": 5461 + }, + { + "epoch": 1.82401068625814, + "grad_norm": 0.21317160490398643, + "learning_rate": 3.993476697105864e-06, + "loss": 0.0147, + "step": 5462 + }, + { + "epoch": 1.8243446318250125, + "grad_norm": 0.32302572897565923, + "learning_rate": 3.991573222451815e-06, + "loss": 0.0315, + "step": 5463 + }, + { + "epoch": 1.824678577391885, + "grad_norm": 0.2740700371674136, + "learning_rate": 3.989669900132504e-06, + "loss": 0.0162, + "step": 5464 + }, + { + "epoch": 1.8250125229587577, + "grad_norm": 0.31972446446373626, + "learning_rate": 3.987766730435451e-06, + "loss": 0.037, + "step": 5465 + }, + { + "epoch": 1.8253464685256304, + "grad_norm": 0.23864627315021922, + "learning_rate": 3.9858637136481515e-06, + "loss": 0.0232, + "step": 5466 + }, + { + "epoch": 1.825680414092503, + "grad_norm": 0.21408657342708176, + "learning_rate": 3.98396085005808e-06, + "loss": 0.023, + "step": 5467 + }, + { + "epoch": 1.8260143596593754, + "grad_norm": 0.2684457102186131, + "learning_rate": 3.982058139952684e-06, + "loss": 0.0155, + "step": 5468 + }, + { + "epoch": 1.8263483052262481, + "grad_norm": 0.23174021707235026, + "learning_rate": 3.980155583619392e-06, + "loss": 0.019, + "step": 5469 + }, + { + "epoch": 1.8266822507931209, + "grad_norm": 0.19011721333269616, + "learning_rate": 3.978253181345609e-06, + "loss": 0.016, + "step": 5470 + }, + { + "epoch": 1.8270161963599933, + "grad_norm": 0.2078323252649679, + "learning_rate": 3.9763509334187125e-06, + "loss": 0.016, + "step": 5471 + }, + { + "epoch": 1.8273501419268658, + "grad_norm": 0.3644020722265962, + "learning_rate": 3.974448840126061e-06, + "loss": 0.0251, + "step": 5472 + }, + { + "epoch": 1.8276840874937386, + "grad_norm": 0.23660112447892773, + "learning_rate": 3.972546901754987e-06, + "loss": 0.0181, + "step": 5473 + }, + { + "epoch": 1.8280180330606113, + "grad_norm": 0.25494333574362743, + "learning_rate": 3.9706451185928e-06, + "loss": 0.0254, + "step": 5474 + }, + { + "epoch": 1.8283519786274836, + "grad_norm": 0.27315501958549304, + "learning_rate": 3.968743490926791e-06, + "loss": 0.0181, + "step": 5475 + }, + { + "epoch": 1.8286859241943563, + "grad_norm": 0.24251661486588935, + "learning_rate": 3.966842019044219e-06, + "loss": 0.0246, + "step": 5476 + }, + { + "epoch": 1.829019869761229, + "grad_norm": 0.30620427884970364, + "learning_rate": 3.964940703232326e-06, + "loss": 0.0269, + "step": 5477 + }, + { + "epoch": 1.8293538153281015, + "grad_norm": 0.2927813112869122, + "learning_rate": 3.963039543778327e-06, + "loss": 0.0227, + "step": 5478 + }, + { + "epoch": 1.829687760894974, + "grad_norm": 0.23695367680264587, + "learning_rate": 3.961138540969411e-06, + "loss": 0.0231, + "step": 5479 + }, + { + "epoch": 1.8300217064618467, + "grad_norm": 0.29434700237793276, + "learning_rate": 3.9592376950927545e-06, + "loss": 0.018, + "step": 5480 + }, + { + "epoch": 1.8303556520287194, + "grad_norm": 0.3291314134586716, + "learning_rate": 3.957337006435499e-06, + "loss": 0.02, + "step": 5481 + }, + { + "epoch": 1.830689597595592, + "grad_norm": 0.47854055491961983, + "learning_rate": 3.955436475284764e-06, + "loss": 0.0221, + "step": 5482 + }, + { + "epoch": 1.8310235431624644, + "grad_norm": 0.29821311669194633, + "learning_rate": 3.95353610192765e-06, + "loss": 0.0216, + "step": 5483 + }, + { + "epoch": 1.8313574887293371, + "grad_norm": 0.5937005251560681, + "learning_rate": 3.95163588665123e-06, + "loss": 0.0246, + "step": 5484 + }, + { + "epoch": 1.8316914342962098, + "grad_norm": 0.26448450237898724, + "learning_rate": 3.949735829742549e-06, + "loss": 0.0193, + "step": 5485 + }, + { + "epoch": 1.8320253798630823, + "grad_norm": 0.3381094734753388, + "learning_rate": 3.947835931488642e-06, + "loss": 0.031, + "step": 5486 + }, + { + "epoch": 1.8323593254299548, + "grad_norm": 0.3108842067678793, + "learning_rate": 3.9459361921765045e-06, + "loss": 0.0237, + "step": 5487 + }, + { + "epoch": 1.8326932709968276, + "grad_norm": 0.21893302179562465, + "learning_rate": 3.944036612093117e-06, + "loss": 0.0176, + "step": 5488 + }, + { + "epoch": 1.8330272165637003, + "grad_norm": 0.40966298117956673, + "learning_rate": 3.942137191525434e-06, + "loss": 0.0259, + "step": 5489 + }, + { + "epoch": 1.8333611621305728, + "grad_norm": 0.25483669604333686, + "learning_rate": 3.9402379307603825e-06, + "loss": 0.0156, + "step": 5490 + }, + { + "epoch": 1.8336951076974453, + "grad_norm": 0.3885415497226158, + "learning_rate": 3.93833883008487e-06, + "loss": 0.0289, + "step": 5491 + }, + { + "epoch": 1.834029053264318, + "grad_norm": 0.3160823222538175, + "learning_rate": 3.936439889785778e-06, + "loss": 0.0309, + "step": 5492 + }, + { + "epoch": 1.8343629988311905, + "grad_norm": 0.39990525790638476, + "learning_rate": 3.934541110149964e-06, + "loss": 0.0201, + "step": 5493 + }, + { + "epoch": 1.834696944398063, + "grad_norm": 0.2894636221218856, + "learning_rate": 3.932642491464261e-06, + "loss": 0.0216, + "step": 5494 + }, + { + "epoch": 1.8350308899649357, + "grad_norm": 0.25044931248245683, + "learning_rate": 3.930744034015477e-06, + "loss": 0.0288, + "step": 5495 + }, + { + "epoch": 1.8353648355318084, + "grad_norm": 0.2636980287182773, + "learning_rate": 3.9288457380903954e-06, + "loss": 0.0261, + "step": 5496 + }, + { + "epoch": 1.835698781098681, + "grad_norm": 0.3629198209733116, + "learning_rate": 3.926947603975778e-06, + "loss": 0.0184, + "step": 5497 + }, + { + "epoch": 1.8360327266655534, + "grad_norm": 0.2810675742420417, + "learning_rate": 3.925049631958361e-06, + "loss": 0.0263, + "step": 5498 + }, + { + "epoch": 1.8363666722324261, + "grad_norm": 0.22680372732984186, + "learning_rate": 3.923151822324854e-06, + "loss": 0.0115, + "step": 5499 + }, + { + "epoch": 1.8367006177992988, + "grad_norm": 0.21607623890424313, + "learning_rate": 3.9212541753619435e-06, + "loss": 0.0209, + "step": 5500 + }, + { + "epoch": 1.8370345633661713, + "grad_norm": 0.28889443060475545, + "learning_rate": 3.9193566913562915e-06, + "loss": 0.023, + "step": 5501 + }, + { + "epoch": 1.8373685089330438, + "grad_norm": 0.3346094052197944, + "learning_rate": 3.917459370594537e-06, + "loss": 0.0316, + "step": 5502 + }, + { + "epoch": 1.8377024544999165, + "grad_norm": 0.21726344198060354, + "learning_rate": 3.915562213363287e-06, + "loss": 0.0178, + "step": 5503 + }, + { + "epoch": 1.8380364000667893, + "grad_norm": 0.23047898029796604, + "learning_rate": 3.9136652199491365e-06, + "loss": 0.0271, + "step": 5504 + }, + { + "epoch": 1.8383703456336618, + "grad_norm": 0.3490157278091612, + "learning_rate": 3.911768390638645e-06, + "loss": 0.0211, + "step": 5505 + }, + { + "epoch": 1.8387042912005342, + "grad_norm": 0.3597586877823906, + "learning_rate": 3.909871725718353e-06, + "loss": 0.0306, + "step": 5506 + }, + { + "epoch": 1.839038236767407, + "grad_norm": 0.26121684414929136, + "learning_rate": 3.907975225474771e-06, + "loss": 0.0194, + "step": 5507 + }, + { + "epoch": 1.8393721823342795, + "grad_norm": 0.33833553870163363, + "learning_rate": 3.906078890194391e-06, + "loss": 0.035, + "step": 5508 + }, + { + "epoch": 1.839706127901152, + "grad_norm": 0.25208364698224595, + "learning_rate": 3.904182720163672e-06, + "loss": 0.0195, + "step": 5509 + }, + { + "epoch": 1.8400400734680247, + "grad_norm": 0.2557373009577202, + "learning_rate": 3.902286715669058e-06, + "loss": 0.0222, + "step": 5510 + }, + { + "epoch": 1.8403740190348974, + "grad_norm": 0.3247971345380231, + "learning_rate": 3.9003908769969615e-06, + "loss": 0.0402, + "step": 5511 + }, + { + "epoch": 1.8407079646017699, + "grad_norm": 0.2700198529014962, + "learning_rate": 3.89849520443377e-06, + "loss": 0.0209, + "step": 5512 + }, + { + "epoch": 1.8410419101686424, + "grad_norm": 0.30992211623056726, + "learning_rate": 3.896599698265847e-06, + "loss": 0.0201, + "step": 5513 + }, + { + "epoch": 1.841375855735515, + "grad_norm": 0.23716161756218984, + "learning_rate": 3.894704358779533e-06, + "loss": 0.0191, + "step": 5514 + }, + { + "epoch": 1.8417098013023878, + "grad_norm": 0.24735710770163488, + "learning_rate": 3.892809186261138e-06, + "loss": 0.0244, + "step": 5515 + }, + { + "epoch": 1.8420437468692603, + "grad_norm": 0.4304108744527725, + "learning_rate": 3.890914180996954e-06, + "loss": 0.0257, + "step": 5516 + }, + { + "epoch": 1.8423776924361328, + "grad_norm": 0.30686700018936613, + "learning_rate": 3.889019343273242e-06, + "loss": 0.0154, + "step": 5517 + }, + { + "epoch": 1.8427116380030055, + "grad_norm": 0.22599827439120004, + "learning_rate": 3.887124673376239e-06, + "loss": 0.0151, + "step": 5518 + }, + { + "epoch": 1.8430455835698782, + "grad_norm": 0.31114427910408443, + "learning_rate": 3.885230171592157e-06, + "loss": 0.0282, + "step": 5519 + }, + { + "epoch": 1.8433795291367507, + "grad_norm": 0.24629323970394204, + "learning_rate": 3.883335838207183e-06, + "loss": 0.0212, + "step": 5520 + }, + { + "epoch": 1.8437134747036232, + "grad_norm": 0.3182009568722478, + "learning_rate": 3.881441673507481e-06, + "loss": 0.0244, + "step": 5521 + }, + { + "epoch": 1.844047420270496, + "grad_norm": 0.2564380913013376, + "learning_rate": 3.879547677779184e-06, + "loss": 0.0194, + "step": 5522 + }, + { + "epoch": 1.8443813658373687, + "grad_norm": 0.21173823822645305, + "learning_rate": 3.8776538513084036e-06, + "loss": 0.0205, + "step": 5523 + }, + { + "epoch": 1.844715311404241, + "grad_norm": 0.24248205699015107, + "learning_rate": 3.875760194381224e-06, + "loss": 0.0168, + "step": 5524 + }, + { + "epoch": 1.8450492569711137, + "grad_norm": 0.3347737084558029, + "learning_rate": 3.873866707283704e-06, + "loss": 0.0302, + "step": 5525 + }, + { + "epoch": 1.8453832025379864, + "grad_norm": 0.30224256905798147, + "learning_rate": 3.871973390301876e-06, + "loss": 0.0214, + "step": 5526 + }, + { + "epoch": 1.8457171481048589, + "grad_norm": 0.33206429677499233, + "learning_rate": 3.8700802437217526e-06, + "loss": 0.0214, + "step": 5527 + }, + { + "epoch": 1.8460510936717314, + "grad_norm": 0.22123229129665162, + "learning_rate": 3.8681872678293115e-06, + "loss": 0.0182, + "step": 5528 + }, + { + "epoch": 1.846385039238604, + "grad_norm": 0.3093987292653506, + "learning_rate": 3.866294462910511e-06, + "loss": 0.0162, + "step": 5529 + }, + { + "epoch": 1.8467189848054768, + "grad_norm": 0.26523134207398047, + "learning_rate": 3.86440182925128e-06, + "loss": 0.0196, + "step": 5530 + }, + { + "epoch": 1.8470529303723493, + "grad_norm": 0.2637429917377835, + "learning_rate": 3.862509367137525e-06, + "loss": 0.0235, + "step": 5531 + }, + { + "epoch": 1.8473868759392218, + "grad_norm": 0.2110379872242139, + "learning_rate": 3.86061707685512e-06, + "loss": 0.0171, + "step": 5532 + }, + { + "epoch": 1.8477208215060945, + "grad_norm": 0.2526531718422867, + "learning_rate": 3.8587249586899245e-06, + "loss": 0.0267, + "step": 5533 + }, + { + "epoch": 1.8480547670729672, + "grad_norm": 0.21459919782795164, + "learning_rate": 3.856833012927762e-06, + "loss": 0.0208, + "step": 5534 + }, + { + "epoch": 1.8483887126398397, + "grad_norm": 0.21105754001049573, + "learning_rate": 3.854941239854433e-06, + "loss": 0.0205, + "step": 5535 + }, + { + "epoch": 1.8487226582067122, + "grad_norm": 0.28888844943590203, + "learning_rate": 3.853049639755713e-06, + "loss": 0.0289, + "step": 5536 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 0.3479629629831733, + "learning_rate": 3.8511582129173495e-06, + "loss": 0.0217, + "step": 5537 + }, + { + "epoch": 1.8493905493404577, + "grad_norm": 0.3040582837759172, + "learning_rate": 3.8492669596250636e-06, + "loss": 0.0274, + "step": 5538 + }, + { + "epoch": 1.8497244949073302, + "grad_norm": 0.5059198619227633, + "learning_rate": 3.8473758801645535e-06, + "loss": 0.0432, + "step": 5539 + }, + { + "epoch": 1.8500584404742026, + "grad_norm": 0.31288464994197834, + "learning_rate": 3.84548497482149e-06, + "loss": 0.03, + "step": 5540 + }, + { + "epoch": 1.8503923860410754, + "grad_norm": 0.20476246652095195, + "learning_rate": 3.843594243881513e-06, + "loss": 0.019, + "step": 5541 + }, + { + "epoch": 1.8507263316079479, + "grad_norm": 0.29668482094120385, + "learning_rate": 3.841703687630243e-06, + "loss": 0.0215, + "step": 5542 + }, + { + "epoch": 1.8510602771748204, + "grad_norm": 0.37539580908242665, + "learning_rate": 3.8398133063532685e-06, + "loss": 0.0462, + "step": 5543 + }, + { + "epoch": 1.851394222741693, + "grad_norm": 0.2731567019708558, + "learning_rate": 3.837923100336155e-06, + "loss": 0.0303, + "step": 5544 + }, + { + "epoch": 1.8517281683085658, + "grad_norm": 0.25254166715668286, + "learning_rate": 3.836033069864441e-06, + "loss": 0.0195, + "step": 5545 + }, + { + "epoch": 1.8520621138754383, + "grad_norm": 0.7063946387194417, + "learning_rate": 3.834143215223637e-06, + "loss": 0.0333, + "step": 5546 + }, + { + "epoch": 1.8523960594423108, + "grad_norm": 0.6160870778017123, + "learning_rate": 3.832253536699227e-06, + "loss": 0.0197, + "step": 5547 + }, + { + "epoch": 1.8527300050091835, + "grad_norm": 0.30730347146628395, + "learning_rate": 3.8303640345766714e-06, + "loss": 0.029, + "step": 5548 + }, + { + "epoch": 1.8530639505760562, + "grad_norm": 0.2326576001773235, + "learning_rate": 3.8284747091414e-06, + "loss": 0.0171, + "step": 5549 + }, + { + "epoch": 1.8533978961429287, + "grad_norm": 0.1841606448958243, + "learning_rate": 3.826585560678816e-06, + "loss": 0.0152, + "step": 5550 + }, + { + "epoch": 1.8537318417098012, + "grad_norm": 0.2471422845895843, + "learning_rate": 3.824696589474301e-06, + "loss": 0.0186, + "step": 5551 + }, + { + "epoch": 1.854065787276674, + "grad_norm": 0.2660303246249173, + "learning_rate": 3.8228077958132055e-06, + "loss": 0.0257, + "step": 5552 + }, + { + "epoch": 1.8543997328435466, + "grad_norm": 0.39585633761465444, + "learning_rate": 3.8209191799808535e-06, + "loss": 0.0223, + "step": 5553 + }, + { + "epoch": 1.8547336784104191, + "grad_norm": 0.2511907798723612, + "learning_rate": 3.819030742262542e-06, + "loss": 0.0251, + "step": 5554 + }, + { + "epoch": 1.8550676239772916, + "grad_norm": 0.3425668300199371, + "learning_rate": 3.817142482943543e-06, + "loss": 0.0224, + "step": 5555 + }, + { + "epoch": 1.8554015695441644, + "grad_norm": 0.31983763319282277, + "learning_rate": 3.815254402309097e-06, + "loss": 0.0272, + "step": 5556 + }, + { + "epoch": 1.8557355151110368, + "grad_norm": 0.33181148570323177, + "learning_rate": 3.813366500644426e-06, + "loss": 0.0307, + "step": 5557 + }, + { + "epoch": 1.8560694606779093, + "grad_norm": 0.22575433477709975, + "learning_rate": 3.8114787782347172e-06, + "loss": 0.0185, + "step": 5558 + }, + { + "epoch": 1.856403406244782, + "grad_norm": 0.28208061818423985, + "learning_rate": 3.809591235365133e-06, + "loss": 0.0185, + "step": 5559 + }, + { + "epoch": 1.8567373518116548, + "grad_norm": 0.22743798029102708, + "learning_rate": 3.807703872320809e-06, + "loss": 0.0171, + "step": 5560 + }, + { + "epoch": 1.8570712973785273, + "grad_norm": 0.21158273922660456, + "learning_rate": 3.8058166893868543e-06, + "loss": 0.017, + "step": 5561 + }, + { + "epoch": 1.8574052429453998, + "grad_norm": 0.22146074037145108, + "learning_rate": 3.8039296868483493e-06, + "loss": 0.0168, + "step": 5562 + }, + { + "epoch": 1.8577391885122725, + "grad_norm": 0.2663937377420855, + "learning_rate": 3.802042864990349e-06, + "loss": 0.0288, + "step": 5563 + }, + { + "epoch": 1.8580731340791452, + "grad_norm": 0.2616890466945177, + "learning_rate": 3.8001562240978785e-06, + "loss": 0.025, + "step": 5564 + }, + { + "epoch": 1.8584070796460177, + "grad_norm": 0.26220805872412206, + "learning_rate": 3.7982697644559385e-06, + "loss": 0.0279, + "step": 5565 + }, + { + "epoch": 1.8587410252128902, + "grad_norm": 0.2610702092037741, + "learning_rate": 3.7963834863495013e-06, + "loss": 0.0228, + "step": 5566 + }, + { + "epoch": 1.859074970779763, + "grad_norm": 0.19892127676747182, + "learning_rate": 3.794497390063509e-06, + "loss": 0.0138, + "step": 5567 + }, + { + "epoch": 1.8594089163466356, + "grad_norm": 0.28811953069250973, + "learning_rate": 3.792611475882881e-06, + "loss": 0.0217, + "step": 5568 + }, + { + "epoch": 1.8597428619135081, + "grad_norm": 0.25996900075359325, + "learning_rate": 3.790725744092507e-06, + "loss": 0.0238, + "step": 5569 + }, + { + "epoch": 1.8600768074803806, + "grad_norm": 0.3049636787340111, + "learning_rate": 3.788840194977248e-06, + "loss": 0.0225, + "step": 5570 + }, + { + "epoch": 1.8604107530472533, + "grad_norm": 0.27304032137042455, + "learning_rate": 3.7869548288219383e-06, + "loss": 0.0224, + "step": 5571 + }, + { + "epoch": 1.860744698614126, + "grad_norm": 0.3510042654290772, + "learning_rate": 3.7850696459113845e-06, + "loss": 0.0335, + "step": 5572 + }, + { + "epoch": 1.8610786441809983, + "grad_norm": 0.312305379265438, + "learning_rate": 3.783184646530364e-06, + "loss": 0.0304, + "step": 5573 + }, + { + "epoch": 1.861412589747871, + "grad_norm": 0.2705352857466786, + "learning_rate": 3.7812998309636323e-06, + "loss": 0.0219, + "step": 5574 + }, + { + "epoch": 1.8617465353147438, + "grad_norm": 0.25166842792429045, + "learning_rate": 3.779415199495911e-06, + "loss": 0.0198, + "step": 5575 + }, + { + "epoch": 1.8620804808816163, + "grad_norm": 0.24989128103290548, + "learning_rate": 3.777530752411896e-06, + "loss": 0.0231, + "step": 5576 + }, + { + "epoch": 1.8624144264484888, + "grad_norm": 0.3480029471981483, + "learning_rate": 3.7756464899962546e-06, + "loss": 0.0333, + "step": 5577 + }, + { + "epoch": 1.8627483720153615, + "grad_norm": 0.24458747733387348, + "learning_rate": 3.773762412533627e-06, + "loss": 0.0189, + "step": 5578 + }, + { + "epoch": 1.8630823175822342, + "grad_norm": 0.27455416860199744, + "learning_rate": 3.771878520308624e-06, + "loss": 0.0249, + "step": 5579 + }, + { + "epoch": 1.8634162631491067, + "grad_norm": 0.2784356958064225, + "learning_rate": 3.7699948136058327e-06, + "loss": 0.0202, + "step": 5580 + }, + { + "epoch": 1.8637502087159792, + "grad_norm": 0.22976269059104357, + "learning_rate": 3.768111292709808e-06, + "loss": 0.0157, + "step": 5581 + }, + { + "epoch": 1.864084154282852, + "grad_norm": 0.41097509033896396, + "learning_rate": 3.7662279579050777e-06, + "loss": 0.0219, + "step": 5582 + }, + { + "epoch": 1.8644180998497246, + "grad_norm": 0.23981699775525583, + "learning_rate": 3.764344809476141e-06, + "loss": 0.0187, + "step": 5583 + }, + { + "epoch": 1.8647520454165971, + "grad_norm": 0.281541505320401, + "learning_rate": 3.7624618477074705e-06, + "loss": 0.0204, + "step": 5584 + }, + { + "epoch": 1.8650859909834696, + "grad_norm": 0.23371525454578326, + "learning_rate": 3.760579072883508e-06, + "loss": 0.0192, + "step": 5585 + }, + { + "epoch": 1.8654199365503423, + "grad_norm": 0.26629309955721303, + "learning_rate": 3.758696485288672e-06, + "loss": 0.025, + "step": 5586 + }, + { + "epoch": 1.865753882117215, + "grad_norm": 0.34621385985352066, + "learning_rate": 3.7568140852073464e-06, + "loss": 0.0277, + "step": 5587 + }, + { + "epoch": 1.8660878276840875, + "grad_norm": 0.28514982345251355, + "learning_rate": 3.754931872923892e-06, + "loss": 0.0226, + "step": 5588 + }, + { + "epoch": 1.86642177325096, + "grad_norm": 0.24262838635866096, + "learning_rate": 3.7530498487226384e-06, + "loss": 0.0209, + "step": 5589 + }, + { + "epoch": 1.8667557188178328, + "grad_norm": 0.32330920987705275, + "learning_rate": 3.751168012887888e-06, + "loss": 0.0241, + "step": 5590 + }, + { + "epoch": 1.8670896643847052, + "grad_norm": 0.24429685845411966, + "learning_rate": 3.7492863657039126e-06, + "loss": 0.0208, + "step": 5591 + }, + { + "epoch": 1.8674236099515777, + "grad_norm": 0.3181913740552329, + "learning_rate": 3.7474049074549596e-06, + "loss": 0.0338, + "step": 5592 + }, + { + "epoch": 1.8677575555184505, + "grad_norm": 0.29926290786382986, + "learning_rate": 3.7455236384252435e-06, + "loss": 0.0295, + "step": 5593 + }, + { + "epoch": 1.8680915010853232, + "grad_norm": 0.24550763167551037, + "learning_rate": 3.743642558898953e-06, + "loss": 0.0172, + "step": 5594 + }, + { + "epoch": 1.8684254466521957, + "grad_norm": 0.34331971296626457, + "learning_rate": 3.7417616691602477e-06, + "loss": 0.0268, + "step": 5595 + }, + { + "epoch": 1.8687593922190682, + "grad_norm": 0.2886453361558632, + "learning_rate": 3.739880969493257e-06, + "loss": 0.0217, + "step": 5596 + }, + { + "epoch": 1.869093337785941, + "grad_norm": 0.2564313897081605, + "learning_rate": 3.738000460182081e-06, + "loss": 0.0256, + "step": 5597 + }, + { + "epoch": 1.8694272833528136, + "grad_norm": 0.2634234063519612, + "learning_rate": 3.736120141510798e-06, + "loss": 0.0287, + "step": 5598 + }, + { + "epoch": 1.869761228919686, + "grad_norm": 0.4393229550372479, + "learning_rate": 3.734240013763448e-06, + "loss": 0.0308, + "step": 5599 + }, + { + "epoch": 1.8700951744865586, + "grad_norm": 0.2164664379744827, + "learning_rate": 3.732360077224049e-06, + "loss": 0.0146, + "step": 5600 + }, + { + "epoch": 1.8704291200534313, + "grad_norm": 0.26072485374206744, + "learning_rate": 3.730480332176586e-06, + "loss": 0.02, + "step": 5601 + }, + { + "epoch": 1.870763065620304, + "grad_norm": 0.3526006967683492, + "learning_rate": 3.7286007789050147e-06, + "loss": 0.0308, + "step": 5602 + }, + { + "epoch": 1.8710970111871765, + "grad_norm": 0.3136971586674732, + "learning_rate": 3.726721417693268e-06, + "loss": 0.0279, + "step": 5603 + }, + { + "epoch": 1.871430956754049, + "grad_norm": 0.257007205554229, + "learning_rate": 3.7248422488252433e-06, + "loss": 0.029, + "step": 5604 + }, + { + "epoch": 1.8717649023209217, + "grad_norm": 0.5048281900375152, + "learning_rate": 3.722963272584812e-06, + "loss": 0.026, + "step": 5605 + }, + { + "epoch": 1.8720988478877942, + "grad_norm": 0.26789406283606615, + "learning_rate": 3.721084489255815e-06, + "loss": 0.021, + "step": 5606 + }, + { + "epoch": 1.8724327934546667, + "grad_norm": 0.2747009983922569, + "learning_rate": 3.719205899122064e-06, + "loss": 0.0161, + "step": 5607 + }, + { + "epoch": 1.8727667390215395, + "grad_norm": 0.3052164921402659, + "learning_rate": 3.7173275024673424e-06, + "loss": 0.0282, + "step": 5608 + }, + { + "epoch": 1.8731006845884122, + "grad_norm": 0.5504177680294845, + "learning_rate": 3.7154492995754046e-06, + "loss": 0.0328, + "step": 5609 + }, + { + "epoch": 1.8734346301552847, + "grad_norm": 0.2852296707309642, + "learning_rate": 3.7135712907299753e-06, + "loss": 0.0222, + "step": 5610 + }, + { + "epoch": 1.8737685757221572, + "grad_norm": 0.2528660856970436, + "learning_rate": 3.7116934762147504e-06, + "loss": 0.0165, + "step": 5611 + }, + { + "epoch": 1.8741025212890299, + "grad_norm": 0.28073984732189905, + "learning_rate": 3.709815856313395e-06, + "loss": 0.0232, + "step": 5612 + }, + { + "epoch": 1.8744364668559026, + "grad_norm": 0.3032357761909178, + "learning_rate": 3.7079384313095464e-06, + "loss": 0.0226, + "step": 5613 + }, + { + "epoch": 1.874770412422775, + "grad_norm": 0.27339499967603625, + "learning_rate": 3.70606120148681e-06, + "loss": 0.0223, + "step": 5614 + }, + { + "epoch": 1.8751043579896476, + "grad_norm": 0.8596306710365944, + "learning_rate": 3.7041841671287654e-06, + "loss": 0.0305, + "step": 5615 + }, + { + "epoch": 1.8754383035565203, + "grad_norm": 0.33671063875926993, + "learning_rate": 3.70230732851896e-06, + "loss": 0.0281, + "step": 5616 + }, + { + "epoch": 1.875772249123393, + "grad_norm": 0.3501949992836972, + "learning_rate": 3.7004306859409134e-06, + "loss": 0.0302, + "step": 5617 + }, + { + "epoch": 1.8761061946902655, + "grad_norm": 0.3367802086748022, + "learning_rate": 3.6985542396781127e-06, + "loss": 0.0265, + "step": 5618 + }, + { + "epoch": 1.876440140257138, + "grad_norm": 0.3153966030061333, + "learning_rate": 3.6966779900140193e-06, + "loss": 0.0216, + "step": 5619 + }, + { + "epoch": 1.8767740858240107, + "grad_norm": 1.9249950041547073, + "learning_rate": 3.694801937232058e-06, + "loss": 0.0308, + "step": 5620 + }, + { + "epoch": 1.8771080313908834, + "grad_norm": 0.3497465104917823, + "learning_rate": 3.6929260816156353e-06, + "loss": 0.037, + "step": 5621 + }, + { + "epoch": 1.8774419769577557, + "grad_norm": 0.2232673582977845, + "learning_rate": 3.691050423448118e-06, + "loss": 0.0221, + "step": 5622 + }, + { + "epoch": 1.8777759225246284, + "grad_norm": 0.32552142053258964, + "learning_rate": 3.689174963012847e-06, + "loss": 0.0326, + "step": 5623 + }, + { + "epoch": 1.8781098680915012, + "grad_norm": 0.23739151785510731, + "learning_rate": 3.6872997005931323e-06, + "loss": 0.0162, + "step": 5624 + }, + { + "epoch": 1.8784438136583737, + "grad_norm": 0.367580525575576, + "learning_rate": 3.6854246364722534e-06, + "loss": 0.0258, + "step": 5625 + }, + { + "epoch": 1.8787777592252461, + "grad_norm": 0.2453190407078018, + "learning_rate": 3.683549770933461e-06, + "loss": 0.0221, + "step": 5626 + }, + { + "epoch": 1.8791117047921189, + "grad_norm": 0.4175588122602081, + "learning_rate": 3.6816751042599774e-06, + "loss": 0.0339, + "step": 5627 + }, + { + "epoch": 1.8794456503589916, + "grad_norm": 0.7360282038989799, + "learning_rate": 3.6798006367349926e-06, + "loss": 0.0312, + "step": 5628 + }, + { + "epoch": 1.879779595925864, + "grad_norm": 0.2902616679070814, + "learning_rate": 3.6779263686416668e-06, + "loss": 0.0209, + "step": 5629 + }, + { + "epoch": 1.8801135414927366, + "grad_norm": 0.27211614708357434, + "learning_rate": 3.676052300263129e-06, + "loss": 0.0168, + "step": 5630 + }, + { + "epoch": 1.8804474870596093, + "grad_norm": 0.3081016855972875, + "learning_rate": 3.6741784318824814e-06, + "loss": 0.0249, + "step": 5631 + }, + { + "epoch": 1.880781432626482, + "grad_norm": 0.21709985011962707, + "learning_rate": 3.6723047637827897e-06, + "loss": 0.0152, + "step": 5632 + }, + { + "epoch": 1.8811153781933545, + "grad_norm": 0.28996588861927514, + "learning_rate": 3.670431296247099e-06, + "loss": 0.018, + "step": 5633 + }, + { + "epoch": 1.881449323760227, + "grad_norm": 0.31626106909858126, + "learning_rate": 3.6685580295584162e-06, + "loss": 0.023, + "step": 5634 + }, + { + "epoch": 1.8817832693270997, + "grad_norm": 0.7737782998676195, + "learning_rate": 3.6666849639997205e-06, + "loss": 0.02, + "step": 5635 + }, + { + "epoch": 1.8821172148939724, + "grad_norm": 0.31216587439059384, + "learning_rate": 3.6648120998539596e-06, + "loss": 0.025, + "step": 5636 + }, + { + "epoch": 1.882451160460845, + "grad_norm": 0.31648761971277567, + "learning_rate": 3.662939437404053e-06, + "loss": 0.0392, + "step": 5637 + }, + { + "epoch": 1.8827851060277174, + "grad_norm": 0.36291856789574106, + "learning_rate": 3.6610669769328853e-06, + "loss": 0.0331, + "step": 5638 + }, + { + "epoch": 1.8831190515945901, + "grad_norm": 0.28430850615956293, + "learning_rate": 3.659194718723319e-06, + "loss": 0.0221, + "step": 5639 + }, + { + "epoch": 1.8834529971614626, + "grad_norm": 0.26557372516319366, + "learning_rate": 3.657322663058177e-06, + "loss": 0.0246, + "step": 5640 + }, + { + "epoch": 1.8837869427283351, + "grad_norm": 0.3382648773244678, + "learning_rate": 3.655450810220257e-06, + "loss": 0.0254, + "step": 5641 + }, + { + "epoch": 1.8841208882952079, + "grad_norm": 0.3856131556946965, + "learning_rate": 3.6535791604923225e-06, + "loss": 0.0258, + "step": 5642 + }, + { + "epoch": 1.8844548338620806, + "grad_norm": 0.2625114895455324, + "learning_rate": 3.6517077141571076e-06, + "loss": 0.0184, + "step": 5643 + }, + { + "epoch": 1.884788779428953, + "grad_norm": 0.6705068872465284, + "learning_rate": 3.649836471497321e-06, + "loss": 0.0396, + "step": 5644 + }, + { + "epoch": 1.8851227249958256, + "grad_norm": 0.3247720638992046, + "learning_rate": 3.6479654327956325e-06, + "loss": 0.0246, + "step": 5645 + }, + { + "epoch": 1.8854566705626983, + "grad_norm": 0.4229451507494173, + "learning_rate": 3.646094598334685e-06, + "loss": 0.0267, + "step": 5646 + }, + { + "epoch": 1.885790616129571, + "grad_norm": 0.3830282898171951, + "learning_rate": 3.64422396839709e-06, + "loss": 0.0236, + "step": 5647 + }, + { + "epoch": 1.8861245616964435, + "grad_norm": 0.24184828368269592, + "learning_rate": 3.642353543265429e-06, + "loss": 0.0214, + "step": 5648 + }, + { + "epoch": 1.886458507263316, + "grad_norm": 0.26558508192597324, + "learning_rate": 3.640483323222248e-06, + "loss": 0.0239, + "step": 5649 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.3366403275042395, + "learning_rate": 3.638613308550072e-06, + "loss": 0.0315, + "step": 5650 + }, + { + "epoch": 1.8871263983970614, + "grad_norm": 0.26774828007915513, + "learning_rate": 3.636743499531385e-06, + "loss": 0.0204, + "step": 5651 + }, + { + "epoch": 1.887460343963934, + "grad_norm": 0.243076448644584, + "learning_rate": 3.634873896448644e-06, + "loss": 0.0213, + "step": 5652 + }, + { + "epoch": 1.8877942895308064, + "grad_norm": 0.20326265865240656, + "learning_rate": 3.633004499584275e-06, + "loss": 0.0179, + "step": 5653 + }, + { + "epoch": 1.8881282350976791, + "grad_norm": 0.2484244726530193, + "learning_rate": 3.6311353092206723e-06, + "loss": 0.0201, + "step": 5654 + }, + { + "epoch": 1.8884621806645516, + "grad_norm": 0.28341009012279234, + "learning_rate": 3.6292663256401967e-06, + "loss": 0.0276, + "step": 5655 + }, + { + "epoch": 1.8887961262314241, + "grad_norm": 0.3091307445849336, + "learning_rate": 3.6273975491251844e-06, + "loss": 0.0272, + "step": 5656 + }, + { + "epoch": 1.8891300717982968, + "grad_norm": 0.2813763442913171, + "learning_rate": 3.625528979957935e-06, + "loss": 0.0208, + "step": 5657 + }, + { + "epoch": 1.8894640173651696, + "grad_norm": 0.30988343067989294, + "learning_rate": 3.6236606184207164e-06, + "loss": 0.0221, + "step": 5658 + }, + { + "epoch": 1.889797962932042, + "grad_norm": 0.30786214327412476, + "learning_rate": 3.621792464795767e-06, + "loss": 0.0255, + "step": 5659 + }, + { + "epoch": 1.8901319084989145, + "grad_norm": 0.24506763944937457, + "learning_rate": 3.6199245193652944e-06, + "loss": 0.0217, + "step": 5660 + }, + { + "epoch": 1.8904658540657873, + "grad_norm": 0.24257643910939025, + "learning_rate": 3.6180567824114715e-06, + "loss": 0.0221, + "step": 5661 + }, + { + "epoch": 1.89079979963266, + "grad_norm": 0.2831029912470948, + "learning_rate": 3.6161892542164444e-06, + "loss": 0.0202, + "step": 5662 + }, + { + "epoch": 1.8911337451995325, + "grad_norm": 0.29041372715845953, + "learning_rate": 3.614321935062325e-06, + "loss": 0.0212, + "step": 5663 + }, + { + "epoch": 1.891467690766405, + "grad_norm": 0.27341642196534827, + "learning_rate": 3.6124548252311918e-06, + "loss": 0.0289, + "step": 5664 + }, + { + "epoch": 1.8918016363332777, + "grad_norm": 0.3695378161978298, + "learning_rate": 3.610587925005097e-06, + "loss": 0.0306, + "step": 5665 + }, + { + "epoch": 1.8921355819001504, + "grad_norm": 0.39052305016986566, + "learning_rate": 3.608721234666054e-06, + "loss": 0.0321, + "step": 5666 + }, + { + "epoch": 1.892469527467023, + "grad_norm": 0.25340199290291054, + "learning_rate": 3.6068547544960493e-06, + "loss": 0.0249, + "step": 5667 + }, + { + "epoch": 1.8928034730338954, + "grad_norm": 0.20473054344839864, + "learning_rate": 3.6049884847770396e-06, + "loss": 0.0178, + "step": 5668 + }, + { + "epoch": 1.8931374186007681, + "grad_norm": 0.2541131857332617, + "learning_rate": 3.6031224257909448e-06, + "loss": 0.0185, + "step": 5669 + }, + { + "epoch": 1.8934713641676408, + "grad_norm": 0.32645449183874226, + "learning_rate": 3.6012565778196552e-06, + "loss": 0.0331, + "step": 5670 + }, + { + "epoch": 1.893805309734513, + "grad_norm": 0.4710526248291429, + "learning_rate": 3.5993909411450297e-06, + "loss": 0.0314, + "step": 5671 + }, + { + "epoch": 1.8941392553013858, + "grad_norm": 0.43063289232438423, + "learning_rate": 3.597525516048894e-06, + "loss": 0.0329, + "step": 5672 + }, + { + "epoch": 1.8944732008682585, + "grad_norm": 0.3069695240366432, + "learning_rate": 3.5956603028130397e-06, + "loss": 0.0293, + "step": 5673 + }, + { + "epoch": 1.894807146435131, + "grad_norm": 0.3191714135604289, + "learning_rate": 3.5937953017192356e-06, + "loss": 0.0323, + "step": 5674 + }, + { + "epoch": 1.8951410920020035, + "grad_norm": 0.2955595424077923, + "learning_rate": 3.591930513049208e-06, + "loss": 0.0267, + "step": 5675 + }, + { + "epoch": 1.8954750375688763, + "grad_norm": 0.2569862023773537, + "learning_rate": 3.5900659370846556e-06, + "loss": 0.0184, + "step": 5676 + }, + { + "epoch": 1.895808983135749, + "grad_norm": 0.29791923924704183, + "learning_rate": 3.5882015741072464e-06, + "loss": 0.0295, + "step": 5677 + }, + { + "epoch": 1.8961429287026215, + "grad_norm": 0.33406877157660403, + "learning_rate": 3.586337424398609e-06, + "loss": 0.0292, + "step": 5678 + }, + { + "epoch": 1.896476874269494, + "grad_norm": 0.30303765639025965, + "learning_rate": 3.584473488240352e-06, + "loss": 0.0365, + "step": 5679 + }, + { + "epoch": 1.8968108198363667, + "grad_norm": 0.31733216907511996, + "learning_rate": 3.5826097659140413e-06, + "loss": 0.0259, + "step": 5680 + }, + { + "epoch": 1.8971447654032394, + "grad_norm": 0.22585600941395312, + "learning_rate": 3.5807462577012152e-06, + "loss": 0.0221, + "step": 5681 + }, + { + "epoch": 1.897478710970112, + "grad_norm": 0.25590338746280156, + "learning_rate": 3.5788829638833777e-06, + "loss": 0.0253, + "step": 5682 + }, + { + "epoch": 1.8978126565369844, + "grad_norm": 0.26186866055214336, + "learning_rate": 3.5770198847420016e-06, + "loss": 0.0226, + "step": 5683 + }, + { + "epoch": 1.898146602103857, + "grad_norm": 0.2633618834673288, + "learning_rate": 3.5751570205585264e-06, + "loss": 0.0249, + "step": 5684 + }, + { + "epoch": 1.8984805476707298, + "grad_norm": 0.28940639133878754, + "learning_rate": 3.573294371614361e-06, + "loss": 0.0211, + "step": 5685 + }, + { + "epoch": 1.8988144932376023, + "grad_norm": 0.3674848153022131, + "learning_rate": 3.571431938190879e-06, + "loss": 0.0333, + "step": 5686 + }, + { + "epoch": 1.8991484388044748, + "grad_norm": 0.3772093176187774, + "learning_rate": 3.5695697205694246e-06, + "loss": 0.0207, + "step": 5687 + }, + { + "epoch": 1.8994823843713475, + "grad_norm": 0.279200525858833, + "learning_rate": 3.567707719031306e-06, + "loss": 0.0315, + "step": 5688 + }, + { + "epoch": 1.89981632993822, + "grad_norm": 0.2888839645900449, + "learning_rate": 3.5658459338578016e-06, + "loss": 0.0288, + "step": 5689 + }, + { + "epoch": 1.9001502755050925, + "grad_norm": 0.22384834559700187, + "learning_rate": 3.563984365330153e-06, + "loss": 0.0249, + "step": 5690 + }, + { + "epoch": 1.9004842210719652, + "grad_norm": 0.21465859175580254, + "learning_rate": 3.562123013729577e-06, + "loss": 0.018, + "step": 5691 + }, + { + "epoch": 1.900818166638838, + "grad_norm": 0.2767172704703618, + "learning_rate": 3.56026187933725e-06, + "loss": 0.0262, + "step": 5692 + }, + { + "epoch": 1.9011521122057105, + "grad_norm": 0.23014830000881922, + "learning_rate": 3.5584009624343187e-06, + "loss": 0.0199, + "step": 5693 + }, + { + "epoch": 1.901486057772583, + "grad_norm": 0.2751937117181656, + "learning_rate": 3.5565402633018963e-06, + "loss": 0.0153, + "step": 5694 + }, + { + "epoch": 1.9018200033394557, + "grad_norm": 0.2824563250664518, + "learning_rate": 3.554679782221063e-06, + "loss": 0.0301, + "step": 5695 + }, + { + "epoch": 1.9021539489063284, + "grad_norm": 0.24291693898765357, + "learning_rate": 3.552819519472865e-06, + "loss": 0.0194, + "step": 5696 + }, + { + "epoch": 1.9024878944732009, + "grad_norm": 0.2851782921848308, + "learning_rate": 3.5509594753383202e-06, + "loss": 0.0234, + "step": 5697 + }, + { + "epoch": 1.9028218400400734, + "grad_norm": 0.27319900921435764, + "learning_rate": 3.5490996500984085e-06, + "loss": 0.0306, + "step": 5698 + }, + { + "epoch": 1.903155785606946, + "grad_norm": 0.31762881081794064, + "learning_rate": 3.547240044034079e-06, + "loss": 0.0212, + "step": 5699 + }, + { + "epoch": 1.9034897311738188, + "grad_norm": 0.27509760813151607, + "learning_rate": 3.545380657426247e-06, + "loss": 0.024, + "step": 5700 + }, + { + "epoch": 1.9038236767406913, + "grad_norm": 0.30464699764027325, + "learning_rate": 3.5435214905557937e-06, + "loss": 0.0229, + "step": 5701 + }, + { + "epoch": 1.9041576223075638, + "grad_norm": 0.24258387734140954, + "learning_rate": 3.5416625437035656e-06, + "loss": 0.0237, + "step": 5702 + }, + { + "epoch": 1.9044915678744365, + "grad_norm": 0.23685690862307018, + "learning_rate": 3.539803817150385e-06, + "loss": 0.0214, + "step": 5703 + }, + { + "epoch": 1.904825513441309, + "grad_norm": 0.26934409608958326, + "learning_rate": 3.5379453111770313e-06, + "loss": 0.0258, + "step": 5704 + }, + { + "epoch": 1.9051594590081815, + "grad_norm": 0.30369357691944876, + "learning_rate": 3.536087026064252e-06, + "loss": 0.0269, + "step": 5705 + }, + { + "epoch": 1.9054934045750542, + "grad_norm": 0.25558186749101636, + "learning_rate": 3.534228962092766e-06, + "loss": 0.0225, + "step": 5706 + }, + { + "epoch": 1.905827350141927, + "grad_norm": 0.23940305901461514, + "learning_rate": 3.5323711195432533e-06, + "loss": 0.0237, + "step": 5707 + }, + { + "epoch": 1.9061612957087994, + "grad_norm": 0.20537004199358336, + "learning_rate": 3.530513498696363e-06, + "loss": 0.017, + "step": 5708 + }, + { + "epoch": 1.906495241275672, + "grad_norm": 0.366429410200943, + "learning_rate": 3.5286560998327125e-06, + "loss": 0.0206, + "step": 5709 + }, + { + "epoch": 1.9068291868425447, + "grad_norm": 0.23299796097563769, + "learning_rate": 3.5267989232328827e-06, + "loss": 0.0169, + "step": 5710 + }, + { + "epoch": 1.9071631324094174, + "grad_norm": 0.31482949941738964, + "learning_rate": 3.5249419691774212e-06, + "loss": 0.0187, + "step": 5711 + }, + { + "epoch": 1.9074970779762899, + "grad_norm": 0.2997997130992164, + "learning_rate": 3.523085237946844e-06, + "loss": 0.0223, + "step": 5712 + }, + { + "epoch": 1.9078310235431624, + "grad_norm": 0.2357928221885971, + "learning_rate": 3.5212287298216306e-06, + "loss": 0.0158, + "step": 5713 + }, + { + "epoch": 1.908164969110035, + "grad_norm": 0.4568262748198393, + "learning_rate": 3.5193724450822296e-06, + "loss": 0.0276, + "step": 5714 + }, + { + "epoch": 1.9084989146769078, + "grad_norm": 0.34578103578375924, + "learning_rate": 3.517516384009056e-06, + "loss": 0.0279, + "step": 5715 + }, + { + "epoch": 1.9088328602437803, + "grad_norm": 0.23662446686899274, + "learning_rate": 3.515660546882488e-06, + "loss": 0.0162, + "step": 5716 + }, + { + "epoch": 1.9091668058106528, + "grad_norm": 0.31818349141824004, + "learning_rate": 3.5138049339828718e-06, + "loss": 0.0219, + "step": 5717 + }, + { + "epoch": 1.9095007513775255, + "grad_norm": 0.4035219440624549, + "learning_rate": 3.5119495455905194e-06, + "loss": 0.0229, + "step": 5718 + }, + { + "epoch": 1.9098346969443982, + "grad_norm": 0.3607144084836721, + "learning_rate": 3.5100943819857082e-06, + "loss": 0.0224, + "step": 5719 + }, + { + "epoch": 1.9101686425112705, + "grad_norm": 0.21702268535076308, + "learning_rate": 3.508239443448685e-06, + "loss": 0.0182, + "step": 5720 + }, + { + "epoch": 1.9105025880781432, + "grad_norm": 0.2472802879250286, + "learning_rate": 3.5063847302596587e-06, + "loss": 0.018, + "step": 5721 + }, + { + "epoch": 1.910836533645016, + "grad_norm": 0.29950848375670075, + "learning_rate": 3.504530242698806e-06, + "loss": 0.0241, + "step": 5722 + }, + { + "epoch": 1.9111704792118884, + "grad_norm": 0.27458335393601785, + "learning_rate": 3.5026759810462687e-06, + "loss": 0.0164, + "step": 5723 + }, + { + "epoch": 1.911504424778761, + "grad_norm": 0.21515272952143766, + "learning_rate": 3.5008219455821546e-06, + "loss": 0.0189, + "step": 5724 + }, + { + "epoch": 1.9118383703456336, + "grad_norm": 0.27069095778701835, + "learning_rate": 3.4989681365865363e-06, + "loss": 0.0182, + "step": 5725 + }, + { + "epoch": 1.9121723159125064, + "grad_norm": 0.26467824055167855, + "learning_rate": 3.497114554339457e-06, + "loss": 0.019, + "step": 5726 + }, + { + "epoch": 1.9125062614793789, + "grad_norm": 0.469183792243403, + "learning_rate": 3.4952611991209197e-06, + "loss": 0.0313, + "step": 5727 + }, + { + "epoch": 1.9128402070462514, + "grad_norm": 0.252396180883767, + "learning_rate": 3.4934080712108964e-06, + "loss": 0.0225, + "step": 5728 + }, + { + "epoch": 1.913174152613124, + "grad_norm": 0.2550532671470629, + "learning_rate": 3.4915551708893236e-06, + "loss": 0.0201, + "step": 5729 + }, + { + "epoch": 1.9135080981799968, + "grad_norm": 0.25301491539265647, + "learning_rate": 3.489702498436103e-06, + "loss": 0.0246, + "step": 5730 + }, + { + "epoch": 1.9138420437468693, + "grad_norm": 0.3181839797481373, + "learning_rate": 3.487850054131103e-06, + "loss": 0.0238, + "step": 5731 + }, + { + "epoch": 1.9141759893137418, + "grad_norm": 0.28803121860494485, + "learning_rate": 3.4859978382541575e-06, + "loss": 0.0277, + "step": 5732 + }, + { + "epoch": 1.9145099348806145, + "grad_norm": 0.26992756840531185, + "learning_rate": 3.4841458510850656e-06, + "loss": 0.0208, + "step": 5733 + }, + { + "epoch": 1.9148438804474872, + "grad_norm": 0.25450634681763407, + "learning_rate": 3.482294092903592e-06, + "loss": 0.0176, + "step": 5734 + }, + { + "epoch": 1.9151778260143597, + "grad_norm": 0.27097711118186313, + "learning_rate": 3.480442563989466e-06, + "loss": 0.02, + "step": 5735 + }, + { + "epoch": 1.9155117715812322, + "grad_norm": 0.2543927295526409, + "learning_rate": 3.4785912646223813e-06, + "loss": 0.0248, + "step": 5736 + }, + { + "epoch": 1.915845717148105, + "grad_norm": 0.3572264494090962, + "learning_rate": 3.4767401950820003e-06, + "loss": 0.0348, + "step": 5737 + }, + { + "epoch": 1.9161796627149774, + "grad_norm": 0.2006125898602107, + "learning_rate": 3.4748893556479497e-06, + "loss": 0.0149, + "step": 5738 + }, + { + "epoch": 1.91651360828185, + "grad_norm": 0.17789457397292407, + "learning_rate": 3.4730387465998194e-06, + "loss": 0.0138, + "step": 5739 + }, + { + "epoch": 1.9168475538487226, + "grad_norm": 0.313329535925791, + "learning_rate": 3.4711883682171666e-06, + "loss": 0.0248, + "step": 5740 + }, + { + "epoch": 1.9171814994155953, + "grad_norm": 0.25576284198479743, + "learning_rate": 3.4693382207795114e-06, + "loss": 0.0186, + "step": 5741 + }, + { + "epoch": 1.9175154449824678, + "grad_norm": 0.259449240986749, + "learning_rate": 3.4674883045663404e-06, + "loss": 0.0135, + "step": 5742 + }, + { + "epoch": 1.9178493905493403, + "grad_norm": 0.2631975844534271, + "learning_rate": 3.465638619857104e-06, + "loss": 0.0221, + "step": 5743 + }, + { + "epoch": 1.918183336116213, + "grad_norm": 0.3371311670108778, + "learning_rate": 3.463789166931223e-06, + "loss": 0.0317, + "step": 5744 + }, + { + "epoch": 1.9185172816830858, + "grad_norm": 0.23233644899160083, + "learning_rate": 3.4619399460680757e-06, + "loss": 0.0158, + "step": 5745 + }, + { + "epoch": 1.9188512272499583, + "grad_norm": 0.4168115584854737, + "learning_rate": 3.460090957547011e-06, + "loss": 0.0185, + "step": 5746 + }, + { + "epoch": 1.9191851728168308, + "grad_norm": 0.22650684368857574, + "learning_rate": 3.4582422016473384e-06, + "loss": 0.0171, + "step": 5747 + }, + { + "epoch": 1.9195191183837035, + "grad_norm": 0.2607711815041672, + "learning_rate": 3.4563936786483345e-06, + "loss": 0.02, + "step": 5748 + }, + { + "epoch": 1.9198530639505762, + "grad_norm": 0.2680289256407857, + "learning_rate": 3.454545388829239e-06, + "loss": 0.0245, + "step": 5749 + }, + { + "epoch": 1.9201870095174487, + "grad_norm": 0.30776760651468293, + "learning_rate": 3.4526973324692614e-06, + "loss": 0.0272, + "step": 5750 + }, + { + "epoch": 1.9205209550843212, + "grad_norm": 0.297492885355231, + "learning_rate": 3.4508495098475712e-06, + "loss": 0.0257, + "step": 5751 + }, + { + "epoch": 1.920854900651194, + "grad_norm": 0.3382392401642789, + "learning_rate": 3.4490019212433035e-06, + "loss": 0.0411, + "step": 5752 + }, + { + "epoch": 1.9211888462180664, + "grad_norm": 0.305923167577958, + "learning_rate": 3.447154566935557e-06, + "loss": 0.0247, + "step": 5753 + }, + { + "epoch": 1.921522791784939, + "grad_norm": 0.2524122769731954, + "learning_rate": 3.4453074472033975e-06, + "loss": 0.0188, + "step": 5754 + }, + { + "epoch": 1.9218567373518116, + "grad_norm": 0.28425994275426447, + "learning_rate": 3.443460562325853e-06, + "loss": 0.022, + "step": 5755 + }, + { + "epoch": 1.9221906829186843, + "grad_norm": 0.3118905363265386, + "learning_rate": 3.4416139125819204e-06, + "loss": 0.0264, + "step": 5756 + }, + { + "epoch": 1.9225246284855568, + "grad_norm": 0.3281207577346625, + "learning_rate": 3.4397674982505546e-06, + "loss": 0.0255, + "step": 5757 + }, + { + "epoch": 1.9228585740524293, + "grad_norm": 0.28634068475023605, + "learning_rate": 3.43792131961068e-06, + "loss": 0.0238, + "step": 5758 + }, + { + "epoch": 1.923192519619302, + "grad_norm": 0.21798902076613347, + "learning_rate": 3.4360753769411816e-06, + "loss": 0.0154, + "step": 5759 + }, + { + "epoch": 1.9235264651861748, + "grad_norm": 0.32068310275870915, + "learning_rate": 3.4342296705209112e-06, + "loss": 0.0223, + "step": 5760 + }, + { + "epoch": 1.9238604107530473, + "grad_norm": 0.22021435612465068, + "learning_rate": 3.432384200628688e-06, + "loss": 0.0211, + "step": 5761 + }, + { + "epoch": 1.9241943563199198, + "grad_norm": 0.24396127581095575, + "learning_rate": 3.4305389675432882e-06, + "loss": 0.0197, + "step": 5762 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.36661985082876813, + "learning_rate": 3.4286939715434573e-06, + "loss": 0.0256, + "step": 5763 + }, + { + "epoch": 1.9248622474536652, + "grad_norm": 0.29503627923718295, + "learning_rate": 3.4268492129079047e-06, + "loss": 0.0218, + "step": 5764 + }, + { + "epoch": 1.9251961930205377, + "grad_norm": 0.2542535312199132, + "learning_rate": 3.4250046919153e-06, + "loss": 0.0262, + "step": 5765 + }, + { + "epoch": 1.9255301385874102, + "grad_norm": 0.2847098668501808, + "learning_rate": 3.4231604088442806e-06, + "loss": 0.0219, + "step": 5766 + }, + { + "epoch": 1.925864084154283, + "grad_norm": 0.2625897038985264, + "learning_rate": 3.4213163639734504e-06, + "loss": 0.0169, + "step": 5767 + }, + { + "epoch": 1.9261980297211556, + "grad_norm": 0.2753234418285163, + "learning_rate": 3.4194725575813707e-06, + "loss": 0.0212, + "step": 5768 + }, + { + "epoch": 1.9265319752880279, + "grad_norm": 0.22249139602145235, + "learning_rate": 3.417628989946572e-06, + "loss": 0.0199, + "step": 5769 + }, + { + "epoch": 1.9268659208549006, + "grad_norm": 0.2643160394653972, + "learning_rate": 3.415785661347546e-06, + "loss": 0.0197, + "step": 5770 + }, + { + "epoch": 1.9271998664217733, + "grad_norm": 0.3020632813878166, + "learning_rate": 3.4139425720627494e-06, + "loss": 0.0259, + "step": 5771 + }, + { + "epoch": 1.9275338119886458, + "grad_norm": 0.43705020368872793, + "learning_rate": 3.412099722370601e-06, + "loss": 0.0228, + "step": 5772 + }, + { + "epoch": 1.9278677575555183, + "grad_norm": 0.31263637342794637, + "learning_rate": 3.4102571125494877e-06, + "loss": 0.0274, + "step": 5773 + }, + { + "epoch": 1.928201703122391, + "grad_norm": 0.3005120875419277, + "learning_rate": 3.408414742877757e-06, + "loss": 0.0206, + "step": 5774 + }, + { + "epoch": 1.9285356486892637, + "grad_norm": 0.314114858724423, + "learning_rate": 3.406572613633719e-06, + "loss": 0.0238, + "step": 5775 + }, + { + "epoch": 1.9288695942561362, + "grad_norm": 0.20709489788582736, + "learning_rate": 3.40473072509565e-06, + "loss": 0.0179, + "step": 5776 + }, + { + "epoch": 1.9292035398230087, + "grad_norm": 0.2079678458008911, + "learning_rate": 3.4028890775417887e-06, + "loss": 0.0151, + "step": 5777 + }, + { + "epoch": 1.9295374853898815, + "grad_norm": 0.3596915574099437, + "learning_rate": 3.4010476712503367e-06, + "loss": 0.031, + "step": 5778 + }, + { + "epoch": 1.9298714309567542, + "grad_norm": 0.2911308911076081, + "learning_rate": 3.3992065064994615e-06, + "loss": 0.0204, + "step": 5779 + }, + { + "epoch": 1.9302053765236267, + "grad_norm": 0.2545928557163873, + "learning_rate": 3.3973655835672923e-06, + "loss": 0.0269, + "step": 5780 + }, + { + "epoch": 1.9305393220904992, + "grad_norm": 0.231130465888615, + "learning_rate": 3.3955249027319214e-06, + "loss": 0.0202, + "step": 5781 + }, + { + "epoch": 1.9308732676573719, + "grad_norm": 0.3375892586951769, + "learning_rate": 3.3936844642714073e-06, + "loss": 0.0279, + "step": 5782 + }, + { + "epoch": 1.9312072132242446, + "grad_norm": 0.2477328990343217, + "learning_rate": 3.3918442684637687e-06, + "loss": 0.0273, + "step": 5783 + }, + { + "epoch": 1.931541158791117, + "grad_norm": 0.29131288253390225, + "learning_rate": 3.3900043155869865e-06, + "loss": 0.0218, + "step": 5784 + }, + { + "epoch": 1.9318751043579896, + "grad_norm": 0.240469610356494, + "learning_rate": 3.388164605919012e-06, + "loss": 0.0234, + "step": 5785 + }, + { + "epoch": 1.9322090499248623, + "grad_norm": 0.3167886733801051, + "learning_rate": 3.3863251397377516e-06, + "loss": 0.0195, + "step": 5786 + }, + { + "epoch": 1.9325429954917348, + "grad_norm": 0.2639229475840124, + "learning_rate": 3.3844859173210797e-06, + "loss": 0.0232, + "step": 5787 + }, + { + "epoch": 1.9328769410586073, + "grad_norm": 0.3323833296643505, + "learning_rate": 3.382646938946832e-06, + "loss": 0.0219, + "step": 5788 + }, + { + "epoch": 1.93321088662548, + "grad_norm": 0.2609261762353949, + "learning_rate": 3.3808082048928083e-06, + "loss": 0.0182, + "step": 5789 + }, + { + "epoch": 1.9335448321923527, + "grad_norm": 0.26727327381174226, + "learning_rate": 3.378969715436767e-06, + "loss": 0.0201, + "step": 5790 + }, + { + "epoch": 1.9338787777592252, + "grad_norm": 0.2311478036321507, + "learning_rate": 3.3771314708564408e-06, + "loss": 0.0234, + "step": 5791 + }, + { + "epoch": 1.9342127233260977, + "grad_norm": 0.31724214783039545, + "learning_rate": 3.3752934714295146e-06, + "loss": 0.0282, + "step": 5792 + }, + { + "epoch": 1.9345466688929704, + "grad_norm": 0.3639905507013953, + "learning_rate": 3.373455717433639e-06, + "loss": 0.0358, + "step": 5793 + }, + { + "epoch": 1.9348806144598432, + "grad_norm": 0.29177698252553197, + "learning_rate": 3.3716182091464295e-06, + "loss": 0.0197, + "step": 5794 + }, + { + "epoch": 1.9352145600267157, + "grad_norm": 0.34356793338454994, + "learning_rate": 3.3697809468454634e-06, + "loss": 0.0276, + "step": 5795 + }, + { + "epoch": 1.9355485055935882, + "grad_norm": 0.3375503364780255, + "learning_rate": 3.3679439308082777e-06, + "loss": 0.024, + "step": 5796 + }, + { + "epoch": 1.9358824511604609, + "grad_norm": 0.28808755513578127, + "learning_rate": 3.366107161312381e-06, + "loss": 0.0228, + "step": 5797 + }, + { + "epoch": 1.9362163967273336, + "grad_norm": 0.3256739763787763, + "learning_rate": 3.3642706386352355e-06, + "loss": 0.0317, + "step": 5798 + }, + { + "epoch": 1.936550342294206, + "grad_norm": 0.47020890589467057, + "learning_rate": 3.3624343630542707e-06, + "loss": 0.0198, + "step": 5799 + }, + { + "epoch": 1.9368842878610786, + "grad_norm": 0.2845018000102603, + "learning_rate": 3.3605983348468764e-06, + "loss": 0.0241, + "step": 5800 + }, + { + "epoch": 1.9372182334279513, + "grad_norm": 0.41753467962227386, + "learning_rate": 3.3587625542904063e-06, + "loss": 0.0335, + "step": 5801 + }, + { + "epoch": 1.9375521789948238, + "grad_norm": 0.3515233072582555, + "learning_rate": 3.356927021662178e-06, + "loss": 0.0276, + "step": 5802 + }, + { + "epoch": 1.9378861245616963, + "grad_norm": 0.2916770899627356, + "learning_rate": 3.3550917372394696e-06, + "loss": 0.023, + "step": 5803 + }, + { + "epoch": 1.938220070128569, + "grad_norm": 0.2270008434678482, + "learning_rate": 3.353256701299522e-06, + "loss": 0.0232, + "step": 5804 + }, + { + "epoch": 1.9385540156954417, + "grad_norm": 0.2586504692258007, + "learning_rate": 3.3514219141195404e-06, + "loss": 0.0184, + "step": 5805 + }, + { + "epoch": 1.9388879612623142, + "grad_norm": 0.2470140225476682, + "learning_rate": 3.3495873759766897e-06, + "loss": 0.0204, + "step": 5806 + }, + { + "epoch": 1.9392219068291867, + "grad_norm": 0.2258297573077572, + "learning_rate": 3.347753087148098e-06, + "loss": 0.0213, + "step": 5807 + }, + { + "epoch": 1.9395558523960594, + "grad_norm": 0.33504239696759075, + "learning_rate": 3.3459190479108583e-06, + "loss": 0.0291, + "step": 5808 + }, + { + "epoch": 1.9398897979629321, + "grad_norm": 0.31120048688790203, + "learning_rate": 3.344085258542022e-06, + "loss": 0.028, + "step": 5809 + }, + { + "epoch": 1.9402237435298046, + "grad_norm": 0.26852814800786234, + "learning_rate": 3.3422517193186056e-06, + "loss": 0.0249, + "step": 5810 + }, + { + "epoch": 1.9405576890966771, + "grad_norm": 0.2889792359846112, + "learning_rate": 3.340418430517586e-06, + "loss": 0.0248, + "step": 5811 + }, + { + "epoch": 1.9408916346635499, + "grad_norm": 0.3341469130820314, + "learning_rate": 3.338585392415904e-06, + "loss": 0.0321, + "step": 5812 + }, + { + "epoch": 1.9412255802304226, + "grad_norm": 0.430265299263613, + "learning_rate": 3.3367526052904585e-06, + "loss": 0.0187, + "step": 5813 + }, + { + "epoch": 1.941559525797295, + "grad_norm": 0.46345645303214994, + "learning_rate": 3.3349200694181182e-06, + "loss": 0.0256, + "step": 5814 + }, + { + "epoch": 1.9418934713641676, + "grad_norm": 0.26956771851846767, + "learning_rate": 3.333087785075707e-06, + "loss": 0.0209, + "step": 5815 + }, + { + "epoch": 1.9422274169310403, + "grad_norm": 0.27106704261332304, + "learning_rate": 3.3312557525400133e-06, + "loss": 0.0178, + "step": 5816 + }, + { + "epoch": 1.942561362497913, + "grad_norm": 0.35578560827960454, + "learning_rate": 3.329423972087787e-06, + "loss": 0.0286, + "step": 5817 + }, + { + "epoch": 1.9428953080647853, + "grad_norm": 0.33650992551520637, + "learning_rate": 3.3275924439957397e-06, + "loss": 0.0328, + "step": 5818 + }, + { + "epoch": 1.943229253631658, + "grad_norm": 0.26326183730826697, + "learning_rate": 3.3257611685405444e-06, + "loss": 0.0135, + "step": 5819 + }, + { + "epoch": 1.9435631991985307, + "grad_norm": 0.38701056914123266, + "learning_rate": 3.3239301459988395e-06, + "loss": 0.0445, + "step": 5820 + }, + { + "epoch": 1.9438971447654032, + "grad_norm": 0.354903143809386, + "learning_rate": 3.322099376647221e-06, + "loss": 0.0302, + "step": 5821 + }, + { + "epoch": 1.9442310903322757, + "grad_norm": 0.24735400000002247, + "learning_rate": 3.320268860762249e-06, + "loss": 0.0158, + "step": 5822 + }, + { + "epoch": 1.9445650358991484, + "grad_norm": 0.255502633960381, + "learning_rate": 3.318438598620444e-06, + "loss": 0.0186, + "step": 5823 + }, + { + "epoch": 1.9448989814660211, + "grad_norm": 0.28664866311148385, + "learning_rate": 3.316608590498287e-06, + "loss": 0.0207, + "step": 5824 + }, + { + "epoch": 1.9452329270328936, + "grad_norm": 0.3129524544312331, + "learning_rate": 3.314778836672224e-06, + "loss": 0.0166, + "step": 5825 + }, + { + "epoch": 1.9455668725997661, + "grad_norm": 0.2518869915826915, + "learning_rate": 3.312949337418661e-06, + "loss": 0.0207, + "step": 5826 + }, + { + "epoch": 1.9459008181666388, + "grad_norm": 0.3028117432261122, + "learning_rate": 3.311120093013964e-06, + "loss": 0.0335, + "step": 5827 + }, + { + "epoch": 1.9462347637335116, + "grad_norm": 0.257864600229698, + "learning_rate": 3.3092911037344642e-06, + "loss": 0.0226, + "step": 5828 + }, + { + "epoch": 1.946568709300384, + "grad_norm": 0.23405019122218812, + "learning_rate": 3.30746236985645e-06, + "loss": 0.0178, + "step": 5829 + }, + { + "epoch": 1.9469026548672566, + "grad_norm": 0.31640288842976955, + "learning_rate": 3.305633891656175e-06, + "loss": 0.0256, + "step": 5830 + }, + { + "epoch": 1.9472366004341293, + "grad_norm": 0.2811885023984529, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.0243, + "step": 5831 + }, + { + "epoch": 1.947570546001002, + "grad_norm": 0.2595925379709668, + "learning_rate": 3.3019777033936497e-06, + "loss": 0.0217, + "step": 5832 + }, + { + "epoch": 1.9479044915678745, + "grad_norm": 0.2987458082606157, + "learning_rate": 3.3001499938837124e-06, + "loss": 0.0235, + "step": 5833 + }, + { + "epoch": 1.948238437134747, + "grad_norm": 0.3306504487927109, + "learning_rate": 3.2983225411561338e-06, + "loss": 0.0244, + "step": 5834 + }, + { + "epoch": 1.9485723827016197, + "grad_norm": 0.3396602763806835, + "learning_rate": 3.296495345486971e-06, + "loss": 0.0288, + "step": 5835 + }, + { + "epoch": 1.9489063282684922, + "grad_norm": 0.2226756267363155, + "learning_rate": 3.294668407152245e-06, + "loss": 0.0195, + "step": 5836 + }, + { + "epoch": 1.9492402738353647, + "grad_norm": 0.2801191040536907, + "learning_rate": 3.2928417264279338e-06, + "loss": 0.0215, + "step": 5837 + }, + { + "epoch": 1.9495742194022374, + "grad_norm": 0.24784811196277576, + "learning_rate": 3.2910153035899826e-06, + "loss": 0.0206, + "step": 5838 + }, + { + "epoch": 1.9499081649691101, + "grad_norm": 0.34129288191423274, + "learning_rate": 3.2891891389142933e-06, + "loss": 0.0306, + "step": 5839 + }, + { + "epoch": 1.9502421105359826, + "grad_norm": 0.22300306004602136, + "learning_rate": 3.2873632326767278e-06, + "loss": 0.0215, + "step": 5840 + }, + { + "epoch": 1.9505760561028551, + "grad_norm": 0.23560460974879197, + "learning_rate": 3.2855375851531122e-06, + "loss": 0.0178, + "step": 5841 + }, + { + "epoch": 1.9509100016697278, + "grad_norm": 0.31498906746012384, + "learning_rate": 3.283712196619229e-06, + "loss": 0.0214, + "step": 5842 + }, + { + "epoch": 1.9512439472366006, + "grad_norm": 0.3113196487782054, + "learning_rate": 3.2818870673508297e-06, + "loss": 0.0278, + "step": 5843 + }, + { + "epoch": 1.951577892803473, + "grad_norm": 0.21125492883964472, + "learning_rate": 3.2800621976236184e-06, + "loss": 0.0158, + "step": 5844 + }, + { + "epoch": 1.9519118383703455, + "grad_norm": 0.28307853134694105, + "learning_rate": 3.2782375877132643e-06, + "loss": 0.0205, + "step": 5845 + }, + { + "epoch": 1.9522457839372183, + "grad_norm": 0.2999456377062616, + "learning_rate": 3.276413237895395e-06, + "loss": 0.0232, + "step": 5846 + }, + { + "epoch": 1.952579729504091, + "grad_norm": 0.2567854381982598, + "learning_rate": 3.2745891484456016e-06, + "loss": 0.0245, + "step": 5847 + }, + { + "epoch": 1.9529136750709635, + "grad_norm": 0.38292431688849965, + "learning_rate": 3.2727653196394314e-06, + "loss": 0.0247, + "step": 5848 + }, + { + "epoch": 1.953247620637836, + "grad_norm": 0.2800679231787765, + "learning_rate": 3.270941751752398e-06, + "loss": 0.0264, + "step": 5849 + }, + { + "epoch": 1.9535815662047087, + "grad_norm": 0.19956091896284442, + "learning_rate": 3.269118445059973e-06, + "loss": 0.0167, + "step": 5850 + }, + { + "epoch": 1.9539155117715812, + "grad_norm": 0.26335270714390147, + "learning_rate": 3.267295399837587e-06, + "loss": 0.0254, + "step": 5851 + }, + { + "epoch": 1.9542494573384537, + "grad_norm": 0.23201558038246078, + "learning_rate": 3.2654726163606333e-06, + "loss": 0.024, + "step": 5852 + }, + { + "epoch": 1.9545834029053264, + "grad_norm": 0.33735614001278114, + "learning_rate": 3.2636500949044637e-06, + "loss": 0.0371, + "step": 5853 + }, + { + "epoch": 1.9549173484721991, + "grad_norm": 0.29281664745168995, + "learning_rate": 3.2618278357443913e-06, + "loss": 0.0246, + "step": 5854 + }, + { + "epoch": 1.9552512940390716, + "grad_norm": 0.23201336782090234, + "learning_rate": 3.260005839155691e-06, + "loss": 0.0198, + "step": 5855 + }, + { + "epoch": 1.955585239605944, + "grad_norm": 0.2814654517399288, + "learning_rate": 3.258184105413597e-06, + "loss": 0.0276, + "step": 5856 + }, + { + "epoch": 1.9559191851728168, + "grad_norm": 0.21019308925441066, + "learning_rate": 3.256362634793303e-06, + "loss": 0.0202, + "step": 5857 + }, + { + "epoch": 1.9562531307396895, + "grad_norm": 0.4581378154718263, + "learning_rate": 3.2545414275699638e-06, + "loss": 0.0302, + "step": 5858 + }, + { + "epoch": 1.956587076306562, + "grad_norm": 0.262252359623129, + "learning_rate": 3.2527204840186944e-06, + "loss": 0.0237, + "step": 5859 + }, + { + "epoch": 1.9569210218734345, + "grad_norm": 0.4503845110632144, + "learning_rate": 3.2508998044145674e-06, + "loss": 0.0188, + "step": 5860 + }, + { + "epoch": 1.9572549674403072, + "grad_norm": 0.20579539512288297, + "learning_rate": 3.249079389032621e-06, + "loss": 0.0136, + "step": 5861 + }, + { + "epoch": 1.95758891300718, + "grad_norm": 0.3146515037895394, + "learning_rate": 3.247259238147851e-06, + "loss": 0.0233, + "step": 5862 + }, + { + "epoch": 1.9579228585740525, + "grad_norm": 0.2913048497578694, + "learning_rate": 3.245439352035209e-06, + "loss": 0.0211, + "step": 5863 + }, + { + "epoch": 1.958256804140925, + "grad_norm": 0.22715714845211402, + "learning_rate": 3.243619730969614e-06, + "loss": 0.0168, + "step": 5864 + }, + { + "epoch": 1.9585907497077977, + "grad_norm": 0.21930421251497045, + "learning_rate": 3.2418003752259374e-06, + "loss": 0.0173, + "step": 5865 + }, + { + "epoch": 1.9589246952746704, + "grad_norm": 0.21710261658100735, + "learning_rate": 3.239981285079016e-06, + "loss": 0.0186, + "step": 5866 + }, + { + "epoch": 1.9592586408415427, + "grad_norm": 0.27064666090436496, + "learning_rate": 3.238162460803646e-06, + "loss": 0.028, + "step": 5867 + }, + { + "epoch": 1.9595925864084154, + "grad_norm": 0.23906712649623107, + "learning_rate": 3.2363439026745813e-06, + "loss": 0.0184, + "step": 5868 + }, + { + "epoch": 1.959926531975288, + "grad_norm": 0.2076308046728014, + "learning_rate": 3.2345256109665366e-06, + "loss": 0.0137, + "step": 5869 + }, + { + "epoch": 1.9602604775421606, + "grad_norm": 0.2954053456605062, + "learning_rate": 3.2327075859541867e-06, + "loss": 0.026, + "step": 5870 + }, + { + "epoch": 1.960594423109033, + "grad_norm": 0.25075346173026614, + "learning_rate": 3.2308898279121646e-06, + "loss": 0.0219, + "step": 5871 + }, + { + "epoch": 1.9609283686759058, + "grad_norm": 0.28114262490316694, + "learning_rate": 3.2290723371150627e-06, + "loss": 0.0223, + "step": 5872 + }, + { + "epoch": 1.9612623142427785, + "grad_norm": 0.2210698203821854, + "learning_rate": 3.2272551138374387e-06, + "loss": 0.018, + "step": 5873 + }, + { + "epoch": 1.961596259809651, + "grad_norm": 0.24530615285130294, + "learning_rate": 3.2254381583538025e-06, + "loss": 0.0211, + "step": 5874 + }, + { + "epoch": 1.9619302053765235, + "grad_norm": 0.31106545427697324, + "learning_rate": 3.223621470938628e-06, + "loss": 0.0238, + "step": 5875 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 0.2660833208769417, + "learning_rate": 3.2218050518663457e-06, + "loss": 0.0213, + "step": 5876 + }, + { + "epoch": 1.962598096510269, + "grad_norm": 0.21743227427686262, + "learning_rate": 3.219988901411347e-06, + "loss": 0.0223, + "step": 5877 + }, + { + "epoch": 1.9629320420771414, + "grad_norm": 0.24486708130129758, + "learning_rate": 3.218173019847985e-06, + "loss": 0.0135, + "step": 5878 + }, + { + "epoch": 1.963265987644014, + "grad_norm": 0.35907634828149954, + "learning_rate": 3.2163574074505686e-06, + "loss": 0.027, + "step": 5879 + }, + { + "epoch": 1.9635999332108867, + "grad_norm": 0.32844558924637945, + "learning_rate": 3.214542064493367e-06, + "loss": 0.0219, + "step": 5880 + }, + { + "epoch": 1.9639338787777594, + "grad_norm": 0.35326428299476076, + "learning_rate": 3.2127269912506103e-06, + "loss": 0.0248, + "step": 5881 + }, + { + "epoch": 1.9642678243446319, + "grad_norm": 0.32373207491017764, + "learning_rate": 3.210912187996486e-06, + "loss": 0.0276, + "step": 5882 + }, + { + "epoch": 1.9646017699115044, + "grad_norm": 0.26299833241497556, + "learning_rate": 3.2090976550051393e-06, + "loss": 0.0241, + "step": 5883 + }, + { + "epoch": 1.964935715478377, + "grad_norm": 0.40795756898929836, + "learning_rate": 3.207283392550681e-06, + "loss": 0.0262, + "step": 5884 + }, + { + "epoch": 1.9652696610452496, + "grad_norm": 0.2401971528025246, + "learning_rate": 3.2054694009071753e-06, + "loss": 0.0176, + "step": 5885 + }, + { + "epoch": 1.965603606612122, + "grad_norm": 0.30160405688234604, + "learning_rate": 3.2036556803486465e-06, + "loss": 0.0315, + "step": 5886 + }, + { + "epoch": 1.9659375521789948, + "grad_norm": 0.4733971362045815, + "learning_rate": 3.2018422311490778e-06, + "loss": 0.0231, + "step": 5887 + }, + { + "epoch": 1.9662714977458675, + "grad_norm": 0.27799211423995956, + "learning_rate": 3.200029053582413e-06, + "loss": 0.03, + "step": 5888 + }, + { + "epoch": 1.96660544331274, + "grad_norm": 0.26146796580960074, + "learning_rate": 3.1982161479225514e-06, + "loss": 0.0191, + "step": 5889 + }, + { + "epoch": 1.9669393888796125, + "grad_norm": 0.32179812754743276, + "learning_rate": 3.196403514443358e-06, + "loss": 0.0277, + "step": 5890 + }, + { + "epoch": 1.9672733344464852, + "grad_norm": 0.25901646040108445, + "learning_rate": 3.19459115341865e-06, + "loss": 0.0213, + "step": 5891 + }, + { + "epoch": 1.967607280013358, + "grad_norm": 0.2580956396135416, + "learning_rate": 3.1927790651222073e-06, + "loss": 0.0273, + "step": 5892 + }, + { + "epoch": 1.9679412255802304, + "grad_norm": 0.3359443142087912, + "learning_rate": 3.1909672498277656e-06, + "loss": 0.0222, + "step": 5893 + }, + { + "epoch": 1.968275171147103, + "grad_norm": 0.3314281062252131, + "learning_rate": 3.1891557078090218e-06, + "loss": 0.0309, + "step": 5894 + }, + { + "epoch": 1.9686091167139756, + "grad_norm": 0.34978716289236084, + "learning_rate": 3.187344439339628e-06, + "loss": 0.0361, + "step": 5895 + }, + { + "epoch": 1.9689430622808484, + "grad_norm": 0.34410950313006444, + "learning_rate": 3.1855334446932025e-06, + "loss": 0.0307, + "step": 5896 + }, + { + "epoch": 1.9692770078477209, + "grad_norm": 0.20713580653399105, + "learning_rate": 3.1837227241433145e-06, + "loss": 0.0186, + "step": 5897 + }, + { + "epoch": 1.9696109534145934, + "grad_norm": 0.24913821861245006, + "learning_rate": 3.181912277963495e-06, + "loss": 0.027, + "step": 5898 + }, + { + "epoch": 1.969944898981466, + "grad_norm": 0.2547974035629979, + "learning_rate": 3.180102106427233e-06, + "loss": 0.0261, + "step": 5899 + }, + { + "epoch": 1.9702788445483386, + "grad_norm": 0.8450885560275477, + "learning_rate": 3.178292209807976e-06, + "loss": 0.0271, + "step": 5900 + }, + { + "epoch": 1.970612790115211, + "grad_norm": 0.29206840523406574, + "learning_rate": 3.1764825883791306e-06, + "loss": 0.0229, + "step": 5901 + }, + { + "epoch": 1.9709467356820838, + "grad_norm": 0.30484728775592423, + "learning_rate": 3.174673242414062e-06, + "loss": 0.0257, + "step": 5902 + }, + { + "epoch": 1.9712806812489565, + "grad_norm": 0.37122635945952387, + "learning_rate": 3.1728641721860925e-06, + "loss": 0.0222, + "step": 5903 + }, + { + "epoch": 1.971614626815829, + "grad_norm": 0.2567382911579384, + "learning_rate": 3.1710553779685036e-06, + "loss": 0.0223, + "step": 5904 + }, + { + "epoch": 1.9719485723827015, + "grad_norm": 0.23499162344359104, + "learning_rate": 3.169246860034535e-06, + "loss": 0.0208, + "step": 5905 + }, + { + "epoch": 1.9722825179495742, + "grad_norm": 0.2646541002066373, + "learning_rate": 3.1674386186573853e-06, + "loss": 0.0251, + "step": 5906 + }, + { + "epoch": 1.972616463516447, + "grad_norm": 0.24678009565546116, + "learning_rate": 3.1656306541102073e-06, + "loss": 0.021, + "step": 5907 + }, + { + "epoch": 1.9729504090833194, + "grad_norm": 0.1881145249223792, + "learning_rate": 3.16382296666612e-06, + "loss": 0.0158, + "step": 5908 + }, + { + "epoch": 1.973284354650192, + "grad_norm": 0.2901700905718627, + "learning_rate": 3.1620155565981942e-06, + "loss": 0.0265, + "step": 5909 + }, + { + "epoch": 1.9736183002170646, + "grad_norm": 0.34233287596476747, + "learning_rate": 3.1602084241794595e-06, + "loss": 0.0258, + "step": 5910 + }, + { + "epoch": 1.9739522457839374, + "grad_norm": 0.21072531762872868, + "learning_rate": 3.158401569682906e-06, + "loss": 0.0218, + "step": 5911 + }, + { + "epoch": 1.9742861913508098, + "grad_norm": 0.2662037549693113, + "learning_rate": 3.156594993381479e-06, + "loss": 0.0164, + "step": 5912 + }, + { + "epoch": 1.9746201369176823, + "grad_norm": 0.19624794911578164, + "learning_rate": 3.154788695548082e-06, + "loss": 0.0195, + "step": 5913 + }, + { + "epoch": 1.974954082484555, + "grad_norm": 0.22435623212512112, + "learning_rate": 3.152982676455581e-06, + "loss": 0.0166, + "step": 5914 + }, + { + "epoch": 1.9752880280514278, + "grad_norm": 0.2546770021129933, + "learning_rate": 3.151176936376794e-06, + "loss": 0.0201, + "step": 5915 + }, + { + "epoch": 1.9756219736183, + "grad_norm": 0.25422196414367904, + "learning_rate": 3.1493714755845013e-06, + "loss": 0.0172, + "step": 5916 + }, + { + "epoch": 1.9759559191851728, + "grad_norm": 0.2636805130416644, + "learning_rate": 3.1475662943514366e-06, + "loss": 0.0182, + "step": 5917 + }, + { + "epoch": 1.9762898647520455, + "grad_norm": 0.2726425398062032, + "learning_rate": 3.145761392950293e-06, + "loss": 0.0291, + "step": 5918 + }, + { + "epoch": 1.976623810318918, + "grad_norm": 0.28387231861318524, + "learning_rate": 3.1439567716537268e-06, + "loss": 0.026, + "step": 5919 + }, + { + "epoch": 1.9769577558857905, + "grad_norm": 0.3159811989141398, + "learning_rate": 3.142152430734343e-06, + "loss": 0.0288, + "step": 5920 + }, + { + "epoch": 1.9772917014526632, + "grad_norm": 0.23312569497717564, + "learning_rate": 3.140348370464711e-06, + "loss": 0.0159, + "step": 5921 + }, + { + "epoch": 1.977625647019536, + "grad_norm": 0.4675278517518357, + "learning_rate": 3.138544591117354e-06, + "loss": 0.0214, + "step": 5922 + }, + { + "epoch": 1.9779595925864084, + "grad_norm": 0.2598155604686844, + "learning_rate": 3.1367410929647544e-06, + "loss": 0.0273, + "step": 5923 + }, + { + "epoch": 1.978293538153281, + "grad_norm": 0.3597219327094149, + "learning_rate": 3.1349378762793515e-06, + "loss": 0.0318, + "step": 5924 + }, + { + "epoch": 1.9786274837201536, + "grad_norm": 0.2603932482412402, + "learning_rate": 3.133134941333543e-06, + "loss": 0.0191, + "step": 5925 + }, + { + "epoch": 1.9789614292870263, + "grad_norm": 0.3058024009180753, + "learning_rate": 3.1313322883996833e-06, + "loss": 0.0229, + "step": 5926 + }, + { + "epoch": 1.9792953748538988, + "grad_norm": 0.2246348142136579, + "learning_rate": 3.129529917750085e-06, + "loss": 0.0122, + "step": 5927 + }, + { + "epoch": 1.9796293204207713, + "grad_norm": 0.2996893775026067, + "learning_rate": 3.1277278296570157e-06, + "loss": 0.0307, + "step": 5928 + }, + { + "epoch": 1.979963265987644, + "grad_norm": 0.5614506963389504, + "learning_rate": 3.1259260243927035e-06, + "loss": 0.0244, + "step": 5929 + }, + { + "epoch": 1.9802972115545168, + "grad_norm": 0.3243657307907125, + "learning_rate": 3.12412450222933e-06, + "loss": 0.0229, + "step": 5930 + }, + { + "epoch": 1.9806311571213893, + "grad_norm": 0.19348918220019287, + "learning_rate": 3.12232326343904e-06, + "loss": 0.0127, + "step": 5931 + }, + { + "epoch": 1.9809651026882618, + "grad_norm": 0.28759933936010396, + "learning_rate": 3.1205223082939302e-06, + "loss": 0.0223, + "step": 5932 + }, + { + "epoch": 1.9812990482551345, + "grad_norm": 0.23128262171772607, + "learning_rate": 3.1187216370660558e-06, + "loss": 0.0188, + "step": 5933 + }, + { + "epoch": 1.981632993822007, + "grad_norm": 0.36061628504818183, + "learning_rate": 3.1169212500274294e-06, + "loss": 0.0272, + "step": 5934 + }, + { + "epoch": 1.9819669393888795, + "grad_norm": 0.28025072567451725, + "learning_rate": 3.11512114745002e-06, + "loss": 0.0202, + "step": 5935 + }, + { + "epoch": 1.9823008849557522, + "grad_norm": 0.2682571408976759, + "learning_rate": 3.113321329605754e-06, + "loss": 0.0183, + "step": 5936 + }, + { + "epoch": 1.982634830522625, + "grad_norm": 0.2645470983431173, + "learning_rate": 3.1115217967665174e-06, + "loss": 0.0303, + "step": 5937 + }, + { + "epoch": 1.9829687760894974, + "grad_norm": 0.2340744816177296, + "learning_rate": 3.1097225492041494e-06, + "loss": 0.0178, + "step": 5938 + }, + { + "epoch": 1.98330272165637, + "grad_norm": 0.3225713067830606, + "learning_rate": 3.107923587190448e-06, + "loss": 0.0258, + "step": 5939 + }, + { + "epoch": 1.9836366672232426, + "grad_norm": 0.27858292660121353, + "learning_rate": 3.106124910997168e-06, + "loss": 0.0268, + "step": 5940 + }, + { + "epoch": 1.9839706127901153, + "grad_norm": 0.2682566434750068, + "learning_rate": 3.1043265208960187e-06, + "loss": 0.0215, + "step": 5941 + }, + { + "epoch": 1.9843045583569878, + "grad_norm": 0.33843940950024376, + "learning_rate": 3.102528417158668e-06, + "loss": 0.0219, + "step": 5942 + }, + { + "epoch": 1.9846385039238603, + "grad_norm": 0.2745822590878245, + "learning_rate": 3.1007306000567434e-06, + "loss": 0.0246, + "step": 5943 + }, + { + "epoch": 1.984972449490733, + "grad_norm": 0.24053022466255353, + "learning_rate": 3.0989330698618248e-06, + "loss": 0.0181, + "step": 5944 + }, + { + "epoch": 1.9853063950576058, + "grad_norm": 0.21327671700227108, + "learning_rate": 3.097135826845451e-06, + "loss": 0.0185, + "step": 5945 + }, + { + "epoch": 1.9856403406244783, + "grad_norm": 0.2619959346125703, + "learning_rate": 3.0953388712791155e-06, + "loss": 0.0171, + "step": 5946 + }, + { + "epoch": 1.9859742861913507, + "grad_norm": 0.25444385012693954, + "learning_rate": 3.09354220343427e-06, + "loss": 0.0206, + "step": 5947 + }, + { + "epoch": 1.9863082317582235, + "grad_norm": 0.31650063936320205, + "learning_rate": 3.0917458235823215e-06, + "loss": 0.0247, + "step": 5948 + }, + { + "epoch": 1.986642177325096, + "grad_norm": 0.3739815208954093, + "learning_rate": 3.089949731994637e-06, + "loss": 0.0289, + "step": 5949 + }, + { + "epoch": 1.9869761228919685, + "grad_norm": 0.24452356544852305, + "learning_rate": 3.088153928942535e-06, + "loss": 0.0215, + "step": 5950 + }, + { + "epoch": 1.9873100684588412, + "grad_norm": 0.24877693440300758, + "learning_rate": 3.0863584146972935e-06, + "loss": 0.017, + "step": 5951 + }, + { + "epoch": 1.987644014025714, + "grad_norm": 0.23985073167068344, + "learning_rate": 3.084563189530146e-06, + "loss": 0.0226, + "step": 5952 + }, + { + "epoch": 1.9879779595925864, + "grad_norm": 0.2734363984573269, + "learning_rate": 3.0827682537122817e-06, + "loss": 0.0241, + "step": 5953 + }, + { + "epoch": 1.9883119051594589, + "grad_norm": 0.38896925458025244, + "learning_rate": 3.0809736075148456e-06, + "loss": 0.0246, + "step": 5954 + }, + { + "epoch": 1.9886458507263316, + "grad_norm": 0.3042865422394154, + "learning_rate": 3.0791792512089443e-06, + "loss": 0.0253, + "step": 5955 + }, + { + "epoch": 1.9889797962932043, + "grad_norm": 0.28970310667020616, + "learning_rate": 3.0773851850656335e-06, + "loss": 0.0239, + "step": 5956 + }, + { + "epoch": 1.9893137418600768, + "grad_norm": 0.2598842642261865, + "learning_rate": 3.075591409355929e-06, + "loss": 0.0254, + "step": 5957 + }, + { + "epoch": 1.9896476874269493, + "grad_norm": 0.2377822154904266, + "learning_rate": 3.073797924350801e-06, + "loss": 0.0202, + "step": 5958 + }, + { + "epoch": 1.989981632993822, + "grad_norm": 0.2333692680246371, + "learning_rate": 3.0720047303211746e-06, + "loss": 0.0192, + "step": 5959 + }, + { + "epoch": 1.9903155785606947, + "grad_norm": 0.2423296384346233, + "learning_rate": 3.0702118275379376e-06, + "loss": 0.0278, + "step": 5960 + }, + { + "epoch": 1.9906495241275672, + "grad_norm": 0.25947751515757067, + "learning_rate": 3.0684192162719263e-06, + "loss": 0.0201, + "step": 5961 + }, + { + "epoch": 1.9909834696944397, + "grad_norm": 0.24743408871594888, + "learning_rate": 3.066626896793936e-06, + "loss": 0.0209, + "step": 5962 + }, + { + "epoch": 1.9913174152613125, + "grad_norm": 0.26507318718394685, + "learning_rate": 3.0648348693747177e-06, + "loss": 0.0236, + "step": 5963 + }, + { + "epoch": 1.9916513608281852, + "grad_norm": 0.3655272594041909, + "learning_rate": 3.063043134284979e-06, + "loss": 0.03, + "step": 5964 + }, + { + "epoch": 1.9919853063950574, + "grad_norm": 0.20299432033076675, + "learning_rate": 3.0612516917953783e-06, + "loss": 0.0167, + "step": 5965 + }, + { + "epoch": 1.9923192519619302, + "grad_norm": 0.21254003248625267, + "learning_rate": 3.0594605421765406e-06, + "loss": 0.0156, + "step": 5966 + }, + { + "epoch": 1.9926531975288029, + "grad_norm": 0.32424159826192184, + "learning_rate": 3.057669685699037e-06, + "loss": 0.0309, + "step": 5967 + }, + { + "epoch": 1.9929871430956754, + "grad_norm": 0.3552126929745393, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.0164, + "step": 5968 + }, + { + "epoch": 1.9933210886625479, + "grad_norm": 0.2805002505757546, + "learning_rate": 3.0540888532501075e-06, + "loss": 0.0235, + "step": 5969 + }, + { + "epoch": 1.9936550342294206, + "grad_norm": 0.2822730332647638, + "learning_rate": 3.052298877819608e-06, + "loss": 0.0198, + "step": 5970 + }, + { + "epoch": 1.9939889797962933, + "grad_norm": 0.33483079517361786, + "learning_rate": 3.050509196612297e-06, + "loss": 0.0265, + "step": 5971 + }, + { + "epoch": 1.9943229253631658, + "grad_norm": 0.21161861271095728, + "learning_rate": 3.0487198098985265e-06, + "loss": 0.0155, + "step": 5972 + }, + { + "epoch": 1.9946568709300383, + "grad_norm": 0.32276428423138714, + "learning_rate": 3.046930717948604e-06, + "loss": 0.0209, + "step": 5973 + }, + { + "epoch": 1.994990816496911, + "grad_norm": 0.30491785406405486, + "learning_rate": 3.0451419210327935e-06, + "loss": 0.0148, + "step": 5974 + }, + { + "epoch": 1.9953247620637837, + "grad_norm": 0.2608247963775136, + "learning_rate": 3.0433534194213143e-06, + "loss": 0.017, + "step": 5975 + }, + { + "epoch": 1.9956587076306562, + "grad_norm": 0.254211017514349, + "learning_rate": 3.0415652133843375e-06, + "loss": 0.0194, + "step": 5976 + }, + { + "epoch": 1.9959926531975287, + "grad_norm": 0.28198311333264253, + "learning_rate": 3.0397773031919966e-06, + "loss": 0.0243, + "step": 5977 + }, + { + "epoch": 1.9963265987644014, + "grad_norm": 0.3038115591835136, + "learning_rate": 3.0379896891143746e-06, + "loss": 0.0241, + "step": 5978 + }, + { + "epoch": 1.9966605443312742, + "grad_norm": 0.2950551580736191, + "learning_rate": 3.036202371421513e-06, + "loss": 0.0199, + "step": 5979 + }, + { + "epoch": 1.9969944898981467, + "grad_norm": 0.4063860114054834, + "learning_rate": 3.034415350383405e-06, + "loss": 0.031, + "step": 5980 + }, + { + "epoch": 1.9973284354650191, + "grad_norm": 0.2924471722384363, + "learning_rate": 3.0326286262700035e-06, + "loss": 0.0194, + "step": 5981 + }, + { + "epoch": 1.9976623810318919, + "grad_norm": 0.24206756223897735, + "learning_rate": 3.030842199351212e-06, + "loss": 0.0163, + "step": 5982 + }, + { + "epoch": 1.9979963265987644, + "grad_norm": 0.29146831920685096, + "learning_rate": 3.0290560698968907e-06, + "loss": 0.0238, + "step": 5983 + }, + { + "epoch": 1.9983302721656369, + "grad_norm": 0.29091063864844646, + "learning_rate": 3.0272702381768593e-06, + "loss": 0.0226, + "step": 5984 + }, + { + "epoch": 1.9986642177325096, + "grad_norm": 0.24201912088373048, + "learning_rate": 3.0254847044608872e-06, + "loss": 0.0186, + "step": 5985 + }, + { + "epoch": 1.9989981632993823, + "grad_norm": 0.2498637660186078, + "learning_rate": 3.0236994690186983e-06, + "loss": 0.0182, + "step": 5986 + }, + { + "epoch": 1.9993321088662548, + "grad_norm": 0.31008017670084925, + "learning_rate": 3.0219145321199763e-06, + "loss": 0.0215, + "step": 5987 + }, + { + "epoch": 1.9996660544331273, + "grad_norm": 0.2321229794882306, + "learning_rate": 3.0201298940343543e-06, + "loss": 0.0164, + "step": 5988 + }, + { + "epoch": 2.0, + "grad_norm": 0.31303179367197403, + "learning_rate": 3.018345555031422e-06, + "loss": 0.0276, + "step": 5989 + }, + { + "epoch": 2.0, + "eval_loss": 0.02529776282608509, + "eval_runtime": 180.2975, + "eval_samples_per_second": 111.887, + "eval_steps_per_second": 1.753, + "step": 5989 + }, + { + "epoch": 2.0003339455668727, + "grad_norm": 0.15175628656848728, + "learning_rate": 3.0165615153807293e-06, + "loss": 0.0101, + "step": 5990 + }, + { + "epoch": 2.000667891133745, + "grad_norm": 0.20080372094640112, + "learning_rate": 3.014777775351774e-06, + "loss": 0.0125, + "step": 5991 + }, + { + "epoch": 2.0010018367006177, + "grad_norm": 0.19208222718758047, + "learning_rate": 3.012994335214011e-06, + "loss": 0.0122, + "step": 5992 + }, + { + "epoch": 2.0013357822674904, + "grad_norm": 0.1494873384407777, + "learning_rate": 3.0112111952368496e-06, + "loss": 0.0105, + "step": 5993 + }, + { + "epoch": 2.001669727834363, + "grad_norm": 0.2370525768330149, + "learning_rate": 3.009428355689654e-06, + "loss": 0.0214, + "step": 5994 + }, + { + "epoch": 2.0020036734012354, + "grad_norm": 0.21197485891283063, + "learning_rate": 3.007645816841743e-06, + "loss": 0.0177, + "step": 5995 + }, + { + "epoch": 2.002337618968108, + "grad_norm": 0.2595571537221928, + "learning_rate": 3.0058635789623926e-06, + "loss": 0.0151, + "step": 5996 + }, + { + "epoch": 2.002671564534981, + "grad_norm": 0.22107331988782644, + "learning_rate": 3.0040816423208276e-06, + "loss": 0.0142, + "step": 5997 + }, + { + "epoch": 2.0030055101018536, + "grad_norm": 0.21302220094074045, + "learning_rate": 3.002300007186232e-06, + "loss": 0.0157, + "step": 5998 + }, + { + "epoch": 2.003339455668726, + "grad_norm": 0.2418773728097957, + "learning_rate": 3.0005186738277407e-06, + "loss": 0.0108, + "step": 5999 + }, + { + "epoch": 2.0036734012355986, + "grad_norm": 0.24464390588700685, + "learning_rate": 2.9987376425144477e-06, + "loss": 0.0175, + "step": 6000 + }, + { + "epoch": 2.0040073468024713, + "grad_norm": 0.23346157094483946, + "learning_rate": 2.9969569135153985e-06, + "loss": 0.0125, + "step": 6001 + }, + { + "epoch": 2.004341292369344, + "grad_norm": 0.32085743175992615, + "learning_rate": 2.9951764870995925e-06, + "loss": 0.0238, + "step": 6002 + }, + { + "epoch": 2.0046752379362163, + "grad_norm": 0.30667266904713103, + "learning_rate": 2.9933963635359847e-06, + "loss": 0.0309, + "step": 6003 + }, + { + "epoch": 2.005009183503089, + "grad_norm": 0.25174422534977653, + "learning_rate": 2.991616543093483e-06, + "loss": 0.0203, + "step": 6004 + }, + { + "epoch": 2.0053431290699617, + "grad_norm": 0.25210599501180714, + "learning_rate": 2.9898370260409502e-06, + "loss": 0.0136, + "step": 6005 + }, + { + "epoch": 2.005677074636834, + "grad_norm": 0.32066276200037597, + "learning_rate": 2.9880578126472015e-06, + "loss": 0.0235, + "step": 6006 + }, + { + "epoch": 2.0060110202037067, + "grad_norm": 0.29859901902839975, + "learning_rate": 2.9862789031810126e-06, + "loss": 0.0174, + "step": 6007 + }, + { + "epoch": 2.0063449657705794, + "grad_norm": 0.2444167474205298, + "learning_rate": 2.984500297911106e-06, + "loss": 0.0135, + "step": 6008 + }, + { + "epoch": 2.006678911337452, + "grad_norm": 0.24874680321922657, + "learning_rate": 2.9827219971061607e-06, + "loss": 0.0139, + "step": 6009 + }, + { + "epoch": 2.0070128569043244, + "grad_norm": 0.2718681703918907, + "learning_rate": 2.98094400103481e-06, + "loss": 0.0153, + "step": 6010 + }, + { + "epoch": 2.007346802471197, + "grad_norm": 0.243284484635126, + "learning_rate": 2.9791663099656424e-06, + "loss": 0.0126, + "step": 6011 + }, + { + "epoch": 2.00768074803807, + "grad_norm": 0.3588516268580067, + "learning_rate": 2.977388924167196e-06, + "loss": 0.0181, + "step": 6012 + }, + { + "epoch": 2.0080146936049426, + "grad_norm": 0.2603451593112769, + "learning_rate": 2.975611843907971e-06, + "loss": 0.0165, + "step": 6013 + }, + { + "epoch": 2.008348639171815, + "grad_norm": 0.2297039277694502, + "learning_rate": 2.9738350694564117e-06, + "loss": 0.0087, + "step": 6014 + }, + { + "epoch": 2.0086825847386875, + "grad_norm": 0.19118966348710978, + "learning_rate": 2.9720586010809234e-06, + "loss": 0.0103, + "step": 6015 + }, + { + "epoch": 2.0090165303055603, + "grad_norm": 0.17734445888402203, + "learning_rate": 2.9702824390498615e-06, + "loss": 0.0091, + "step": 6016 + }, + { + "epoch": 2.009350475872433, + "grad_norm": 0.29652862266656277, + "learning_rate": 2.9685065836315362e-06, + "loss": 0.0171, + "step": 6017 + }, + { + "epoch": 2.0096844214393053, + "grad_norm": 0.2501294834147169, + "learning_rate": 2.9667310350942103e-06, + "loss": 0.0093, + "step": 6018 + }, + { + "epoch": 2.010018367006178, + "grad_norm": 0.33681145379379857, + "learning_rate": 2.964955793706104e-06, + "loss": 0.0147, + "step": 6019 + }, + { + "epoch": 2.0103523125730507, + "grad_norm": 0.24074715765915838, + "learning_rate": 2.963180859735387e-06, + "loss": 0.0117, + "step": 6020 + }, + { + "epoch": 2.0106862581399234, + "grad_norm": 0.35974894103064475, + "learning_rate": 2.961406233450184e-06, + "loss": 0.0207, + "step": 6021 + }, + { + "epoch": 2.0110202037067957, + "grad_norm": 0.3357420455626794, + "learning_rate": 2.9596319151185713e-06, + "loss": 0.0188, + "step": 6022 + }, + { + "epoch": 2.0113541492736684, + "grad_norm": 0.3218706265057225, + "learning_rate": 2.9578579050085836e-06, + "loss": 0.0215, + "step": 6023 + }, + { + "epoch": 2.011688094840541, + "grad_norm": 0.2745907153839172, + "learning_rate": 2.956084203388204e-06, + "loss": 0.0196, + "step": 6024 + }, + { + "epoch": 2.0120220404074134, + "grad_norm": 0.21460785258188947, + "learning_rate": 2.9543108105253733e-06, + "loss": 0.0119, + "step": 6025 + }, + { + "epoch": 2.012355985974286, + "grad_norm": 0.2323097710648883, + "learning_rate": 2.9525377266879813e-06, + "loss": 0.0121, + "step": 6026 + }, + { + "epoch": 2.012689931541159, + "grad_norm": 0.2676166773319939, + "learning_rate": 2.950764952143874e-06, + "loss": 0.0162, + "step": 6027 + }, + { + "epoch": 2.0130238771080315, + "grad_norm": 0.27274738950486416, + "learning_rate": 2.9489924871608495e-06, + "loss": 0.0137, + "step": 6028 + }, + { + "epoch": 2.013357822674904, + "grad_norm": 0.3127813298403477, + "learning_rate": 2.9472203320066594e-06, + "loss": 0.0208, + "step": 6029 + }, + { + "epoch": 2.0136917682417765, + "grad_norm": 0.32911083350257675, + "learning_rate": 2.9454484869490074e-06, + "loss": 0.0151, + "step": 6030 + }, + { + "epoch": 2.0140257138086493, + "grad_norm": 0.3737748175194281, + "learning_rate": 2.943676952255554e-06, + "loss": 0.0208, + "step": 6031 + }, + { + "epoch": 2.014359659375522, + "grad_norm": 0.3241492409129578, + "learning_rate": 2.9419057281939106e-06, + "loss": 0.0166, + "step": 6032 + }, + { + "epoch": 2.0146936049423942, + "grad_norm": 0.2091162730671664, + "learning_rate": 2.94013481503164e-06, + "loss": 0.0105, + "step": 6033 + }, + { + "epoch": 2.015027550509267, + "grad_norm": 0.2896212018597147, + "learning_rate": 2.9383642130362596e-06, + "loss": 0.0155, + "step": 6034 + }, + { + "epoch": 2.0153614960761397, + "grad_norm": 0.2938511006885393, + "learning_rate": 2.9365939224752394e-06, + "loss": 0.0142, + "step": 6035 + }, + { + "epoch": 2.0156954416430124, + "grad_norm": 0.3404838851864081, + "learning_rate": 2.934823943616001e-06, + "loss": 0.0163, + "step": 6036 + }, + { + "epoch": 2.0160293872098847, + "grad_norm": 0.2968319342281456, + "learning_rate": 2.933054276725925e-06, + "loss": 0.0182, + "step": 6037 + }, + { + "epoch": 2.0163633327767574, + "grad_norm": 0.3218646982495804, + "learning_rate": 2.9312849220723382e-06, + "loss": 0.0193, + "step": 6038 + }, + { + "epoch": 2.01669727834363, + "grad_norm": 0.27731965017708854, + "learning_rate": 2.929515879922522e-06, + "loss": 0.0149, + "step": 6039 + }, + { + "epoch": 2.0170312239105024, + "grad_norm": 0.29319446372010927, + "learning_rate": 2.9277471505437105e-06, + "loss": 0.0147, + "step": 6040 + }, + { + "epoch": 2.017365169477375, + "grad_norm": 0.32453300186935935, + "learning_rate": 2.925978734203092e-06, + "loss": 0.0171, + "step": 6041 + }, + { + "epoch": 2.017699115044248, + "grad_norm": 0.42720111743976336, + "learning_rate": 2.924210631167807e-06, + "loss": 0.0148, + "step": 6042 + }, + { + "epoch": 2.0180330606111205, + "grad_norm": 0.35981122091536205, + "learning_rate": 2.922442841704948e-06, + "loss": 0.0231, + "step": 6043 + }, + { + "epoch": 2.018367006177993, + "grad_norm": 0.2634298939888762, + "learning_rate": 2.920675366081559e-06, + "loss": 0.0103, + "step": 6044 + }, + { + "epoch": 2.0187009517448655, + "grad_norm": 0.40644227124389015, + "learning_rate": 2.9189082045646404e-06, + "loss": 0.0218, + "step": 6045 + }, + { + "epoch": 2.0190348973117382, + "grad_norm": 0.22790158180052494, + "learning_rate": 2.9171413574211426e-06, + "loss": 0.0157, + "step": 6046 + }, + { + "epoch": 2.019368842878611, + "grad_norm": 0.24907090607140453, + "learning_rate": 2.9153748249179637e-06, + "loss": 0.0139, + "step": 6047 + }, + { + "epoch": 2.0197027884454832, + "grad_norm": 0.1856542662559802, + "learning_rate": 2.9136086073219665e-06, + "loss": 0.01, + "step": 6048 + }, + { + "epoch": 2.020036734012356, + "grad_norm": 0.2218450590776673, + "learning_rate": 2.9118427048999544e-06, + "loss": 0.0122, + "step": 6049 + }, + { + "epoch": 2.0203706795792287, + "grad_norm": 0.33827557376306444, + "learning_rate": 2.9100771179186904e-06, + "loss": 0.0177, + "step": 6050 + }, + { + "epoch": 2.0207046251461014, + "grad_norm": 0.24309427342943882, + "learning_rate": 2.9083118466448845e-06, + "loss": 0.0137, + "step": 6051 + }, + { + "epoch": 2.0210385707129737, + "grad_norm": 0.2970030949584663, + "learning_rate": 2.9065468913452045e-06, + "loss": 0.0157, + "step": 6052 + }, + { + "epoch": 2.0213725162798464, + "grad_norm": 0.293592731613864, + "learning_rate": 2.904782252286264e-06, + "loss": 0.0164, + "step": 6053 + }, + { + "epoch": 2.021706461846719, + "grad_norm": 0.2411401378571078, + "learning_rate": 2.903017929734635e-06, + "loss": 0.011, + "step": 6054 + }, + { + "epoch": 2.0220404074135914, + "grad_norm": 0.34656115399225423, + "learning_rate": 2.9012539239568405e-06, + "loss": 0.0185, + "step": 6055 + }, + { + "epoch": 2.022374352980464, + "grad_norm": 0.2545646721008136, + "learning_rate": 2.899490235219351e-06, + "loss": 0.0113, + "step": 6056 + }, + { + "epoch": 2.022708298547337, + "grad_norm": 0.1859773868262544, + "learning_rate": 2.897726863788595e-06, + "loss": 0.0108, + "step": 6057 + }, + { + "epoch": 2.0230422441142095, + "grad_norm": 0.23682701564000458, + "learning_rate": 2.8959638099309504e-06, + "loss": 0.0139, + "step": 6058 + }, + { + "epoch": 2.023376189681082, + "grad_norm": 0.2757830867378058, + "learning_rate": 2.8942010739127446e-06, + "loss": 0.0158, + "step": 6059 + }, + { + "epoch": 2.0237101352479545, + "grad_norm": 0.3019110217368406, + "learning_rate": 2.8924386560002627e-06, + "loss": 0.0148, + "step": 6060 + }, + { + "epoch": 2.0240440808148272, + "grad_norm": 0.27416394262098626, + "learning_rate": 2.8906765564597384e-06, + "loss": 0.0103, + "step": 6061 + }, + { + "epoch": 2.0243780263817, + "grad_norm": 0.2877330047605113, + "learning_rate": 2.8889147755573556e-06, + "loss": 0.0115, + "step": 6062 + }, + { + "epoch": 2.024711971948572, + "grad_norm": 0.40638905717884094, + "learning_rate": 2.8871533135592544e-06, + "loss": 0.0279, + "step": 6063 + }, + { + "epoch": 2.025045917515445, + "grad_norm": 0.22426651913039228, + "learning_rate": 2.8853921707315215e-06, + "loss": 0.0132, + "step": 6064 + }, + { + "epoch": 2.0253798630823177, + "grad_norm": 0.307370164224585, + "learning_rate": 2.8836313473402e-06, + "loss": 0.0225, + "step": 6065 + }, + { + "epoch": 2.0257138086491904, + "grad_norm": 0.22825809003354988, + "learning_rate": 2.881870843651282e-06, + "loss": 0.0103, + "step": 6066 + }, + { + "epoch": 2.0260477542160626, + "grad_norm": 0.22814925295941907, + "learning_rate": 2.8801106599307164e-06, + "loss": 0.0129, + "step": 6067 + }, + { + "epoch": 2.0263816997829354, + "grad_norm": 0.3003877625096707, + "learning_rate": 2.8783507964443942e-06, + "loss": 0.0147, + "step": 6068 + }, + { + "epoch": 2.026715645349808, + "grad_norm": 0.31922608946006525, + "learning_rate": 2.8765912534581674e-06, + "loss": 0.0219, + "step": 6069 + }, + { + "epoch": 2.027049590916681, + "grad_norm": 0.23767329459980238, + "learning_rate": 2.874832031237833e-06, + "loss": 0.0113, + "step": 6070 + }, + { + "epoch": 2.027383536483553, + "grad_norm": 0.2996470845907607, + "learning_rate": 2.873073130049142e-06, + "loss": 0.0106, + "step": 6071 + }, + { + "epoch": 2.027717482050426, + "grad_norm": 0.2946807365764898, + "learning_rate": 2.8713145501578e-06, + "loss": 0.0142, + "step": 6072 + }, + { + "epoch": 2.0280514276172985, + "grad_norm": 0.4829768510277467, + "learning_rate": 2.869556291829461e-06, + "loss": 0.0135, + "step": 6073 + }, + { + "epoch": 2.028385373184171, + "grad_norm": 0.20999719678359777, + "learning_rate": 2.8677983553297266e-06, + "loss": 0.0087, + "step": 6074 + }, + { + "epoch": 2.0287193187510435, + "grad_norm": 0.290353405698059, + "learning_rate": 2.8660407409241593e-06, + "loss": 0.0113, + "step": 6075 + }, + { + "epoch": 2.029053264317916, + "grad_norm": 0.4619981297584356, + "learning_rate": 2.864283448878262e-06, + "loss": 0.0212, + "step": 6076 + }, + { + "epoch": 2.029387209884789, + "grad_norm": 0.2796731561001804, + "learning_rate": 2.8625264794574975e-06, + "loss": 0.0129, + "step": 6077 + }, + { + "epoch": 2.029721155451661, + "grad_norm": 0.32994452425599774, + "learning_rate": 2.860769832927276e-06, + "loss": 0.0177, + "step": 6078 + }, + { + "epoch": 2.030055101018534, + "grad_norm": 0.30679330425306117, + "learning_rate": 2.8590135095529624e-06, + "loss": 0.0272, + "step": 6079 + }, + { + "epoch": 2.0303890465854066, + "grad_norm": 0.22030996569126765, + "learning_rate": 2.8572575095998646e-06, + "loss": 0.0108, + "step": 6080 + }, + { + "epoch": 2.0307229921522794, + "grad_norm": 0.21127749541792287, + "learning_rate": 2.855501833333253e-06, + "loss": 0.0099, + "step": 6081 + }, + { + "epoch": 2.0310569377191516, + "grad_norm": 0.24058478118174959, + "learning_rate": 2.853746481018337e-06, + "loss": 0.0105, + "step": 6082 + }, + { + "epoch": 2.0313908832860244, + "grad_norm": 0.32205879831670353, + "learning_rate": 2.8519914529202868e-06, + "loss": 0.0211, + "step": 6083 + }, + { + "epoch": 2.031724828852897, + "grad_norm": 0.2832939774790598, + "learning_rate": 2.8502367493042217e-06, + "loss": 0.0197, + "step": 6084 + }, + { + "epoch": 2.03205877441977, + "grad_norm": 0.26575117464371134, + "learning_rate": 2.848482370435206e-06, + "loss": 0.0163, + "step": 6085 + }, + { + "epoch": 2.032392719986642, + "grad_norm": 0.3258080129190906, + "learning_rate": 2.8467283165782643e-06, + "loss": 0.0215, + "step": 6086 + }, + { + "epoch": 2.0327266655535148, + "grad_norm": 0.47511153250582916, + "learning_rate": 2.8449745879983614e-06, + "loss": 0.0181, + "step": 6087 + }, + { + "epoch": 2.0330606111203875, + "grad_norm": 0.2476053327934346, + "learning_rate": 2.8432211849604218e-06, + "loss": 0.0202, + "step": 6088 + }, + { + "epoch": 2.0333945566872598, + "grad_norm": 0.2783084602439737, + "learning_rate": 2.841468107729318e-06, + "loss": 0.0156, + "step": 6089 + }, + { + "epoch": 2.0337285022541325, + "grad_norm": 0.3357539801022698, + "learning_rate": 2.8397153565698744e-06, + "loss": 0.0175, + "step": 6090 + }, + { + "epoch": 2.034062447821005, + "grad_norm": 0.352764606623764, + "learning_rate": 2.8379629317468604e-06, + "loss": 0.0226, + "step": 6091 + }, + { + "epoch": 2.034396393387878, + "grad_norm": 0.22486127145038035, + "learning_rate": 2.8362108335250044e-06, + "loss": 0.0094, + "step": 6092 + }, + { + "epoch": 2.03473033895475, + "grad_norm": 0.33740378604902393, + "learning_rate": 2.834459062168978e-06, + "loss": 0.0234, + "step": 6093 + }, + { + "epoch": 2.035064284521623, + "grad_norm": 0.2876460462444672, + "learning_rate": 2.8327076179434088e-06, + "loss": 0.0142, + "step": 6094 + }, + { + "epoch": 2.0353982300884956, + "grad_norm": 0.24471925855800045, + "learning_rate": 2.8309565011128732e-06, + "loss": 0.0147, + "step": 6095 + }, + { + "epoch": 2.0357321756553683, + "grad_norm": 0.297454363053485, + "learning_rate": 2.8292057119418994e-06, + "loss": 0.012, + "step": 6096 + }, + { + "epoch": 2.0360661212222406, + "grad_norm": 0.2657319967797067, + "learning_rate": 2.827455250694961e-06, + "loss": 0.0143, + "step": 6097 + }, + { + "epoch": 2.0364000667891133, + "grad_norm": 0.279516415732953, + "learning_rate": 2.8257051176364903e-06, + "loss": 0.0113, + "step": 6098 + }, + { + "epoch": 2.036734012355986, + "grad_norm": 0.4482745624442629, + "learning_rate": 2.8239553130308604e-06, + "loss": 0.0326, + "step": 6099 + }, + { + "epoch": 2.0370679579228588, + "grad_norm": 0.27664131408412507, + "learning_rate": 2.8222058371424033e-06, + "loss": 0.0113, + "step": 6100 + }, + { + "epoch": 2.037401903489731, + "grad_norm": 0.35872564722376515, + "learning_rate": 2.820456690235397e-06, + "loss": 0.0214, + "step": 6101 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.3491645170001635, + "learning_rate": 2.8187078725740723e-06, + "loss": 0.0155, + "step": 6102 + }, + { + "epoch": 2.0380697946234765, + "grad_norm": 0.2861994696551622, + "learning_rate": 2.8169593844226063e-06, + "loss": 0.0176, + "step": 6103 + }, + { + "epoch": 2.0384037401903488, + "grad_norm": 0.24515443776860424, + "learning_rate": 2.815211226045131e-06, + "loss": 0.0132, + "step": 6104 + }, + { + "epoch": 2.0387376857572215, + "grad_norm": 0.2921236535017713, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.0192, + "step": 6105 + }, + { + "epoch": 2.039071631324094, + "grad_norm": 0.2680276689556573, + "learning_rate": 2.811715899668415e-06, + "loss": 0.0154, + "step": 6106 + }, + { + "epoch": 2.039405576890967, + "grad_norm": 0.2536113930376566, + "learning_rate": 2.8099687321971887e-06, + "loss": 0.0116, + "step": 6107 + }, + { + "epoch": 2.039739522457839, + "grad_norm": 0.23374677415265926, + "learning_rate": 2.80822189555597e-06, + "loss": 0.0106, + "step": 6108 + }, + { + "epoch": 2.040073468024712, + "grad_norm": 0.2467280502889194, + "learning_rate": 2.8064753900086427e-06, + "loss": 0.0132, + "step": 6109 + }, + { + "epoch": 2.0404074135915846, + "grad_norm": 0.3602627966972387, + "learning_rate": 2.804729215819034e-06, + "loss": 0.03, + "step": 6110 + }, + { + "epoch": 2.0407413591584573, + "grad_norm": 0.27184514001034055, + "learning_rate": 2.8029833732509282e-06, + "loss": 0.0139, + "step": 6111 + }, + { + "epoch": 2.0410753047253296, + "grad_norm": 0.2669730051001783, + "learning_rate": 2.801237862568048e-06, + "loss": 0.0134, + "step": 6112 + }, + { + "epoch": 2.0414092502922023, + "grad_norm": 0.5003174822212786, + "learning_rate": 2.799492684034083e-06, + "loss": 0.0293, + "step": 6113 + }, + { + "epoch": 2.041743195859075, + "grad_norm": 0.33582503522969365, + "learning_rate": 2.797747837912656e-06, + "loss": 0.0172, + "step": 6114 + }, + { + "epoch": 2.0420771414259478, + "grad_norm": 0.2970612700629547, + "learning_rate": 2.796003324467351e-06, + "loss": 0.018, + "step": 6115 + }, + { + "epoch": 2.04241108699282, + "grad_norm": 0.293969635600175, + "learning_rate": 2.794259143961693e-06, + "loss": 0.0139, + "step": 6116 + }, + { + "epoch": 2.0427450325596928, + "grad_norm": 0.28897709967415713, + "learning_rate": 2.7925152966591627e-06, + "loss": 0.0181, + "step": 6117 + }, + { + "epoch": 2.0430789781265655, + "grad_norm": 0.23328926665983554, + "learning_rate": 2.7907717828231893e-06, + "loss": 0.0107, + "step": 6118 + }, + { + "epoch": 2.043412923693438, + "grad_norm": 0.3653229302778942, + "learning_rate": 2.7890286027171532e-06, + "loss": 0.0097, + "step": 6119 + }, + { + "epoch": 2.0437468692603105, + "grad_norm": 0.2549791164964824, + "learning_rate": 2.7872857566043775e-06, + "loss": 0.0137, + "step": 6120 + }, + { + "epoch": 2.044080814827183, + "grad_norm": 0.3701554878191902, + "learning_rate": 2.7855432447481444e-06, + "loss": 0.0197, + "step": 6121 + }, + { + "epoch": 2.044414760394056, + "grad_norm": 0.4363622609730641, + "learning_rate": 2.7838010674116767e-06, + "loss": 0.0261, + "step": 6122 + }, + { + "epoch": 2.044748705960928, + "grad_norm": 0.2544040344814649, + "learning_rate": 2.7820592248581523e-06, + "loss": 0.0092, + "step": 6123 + }, + { + "epoch": 2.045082651527801, + "grad_norm": 0.3189030079520338, + "learning_rate": 2.780317717350697e-06, + "loss": 0.0184, + "step": 6124 + }, + { + "epoch": 2.0454165970946736, + "grad_norm": 0.26266961940353706, + "learning_rate": 2.7785765451523896e-06, + "loss": 0.0141, + "step": 6125 + }, + { + "epoch": 2.0457505426615463, + "grad_norm": 0.28467830927450516, + "learning_rate": 2.7768357085262486e-06, + "loss": 0.0133, + "step": 6126 + }, + { + "epoch": 2.0460844882284186, + "grad_norm": 0.29069874970425696, + "learning_rate": 2.7750952077352534e-06, + "loss": 0.015, + "step": 6127 + }, + { + "epoch": 2.0464184337952913, + "grad_norm": 0.27639028866434623, + "learning_rate": 2.7733550430423216e-06, + "loss": 0.0194, + "step": 6128 + }, + { + "epoch": 2.046752379362164, + "grad_norm": 0.24647568749153773, + "learning_rate": 2.7716152147103292e-06, + "loss": 0.0133, + "step": 6129 + }, + { + "epoch": 2.0470863249290367, + "grad_norm": 0.21392067621442906, + "learning_rate": 2.7698757230020986e-06, + "loss": 0.0107, + "step": 6130 + }, + { + "epoch": 2.047420270495909, + "grad_norm": 0.28322222234106154, + "learning_rate": 2.7681365681803967e-06, + "loss": 0.0155, + "step": 6131 + }, + { + "epoch": 2.0477542160627817, + "grad_norm": 0.29557830115031114, + "learning_rate": 2.7663977505079483e-06, + "loss": 0.0173, + "step": 6132 + }, + { + "epoch": 2.0480881616296545, + "grad_norm": 0.292441068650384, + "learning_rate": 2.764659270247417e-06, + "loss": 0.0216, + "step": 6133 + }, + { + "epoch": 2.048422107196527, + "grad_norm": 0.3001107000648257, + "learning_rate": 2.7629211276614255e-06, + "loss": 0.0152, + "step": 6134 + }, + { + "epoch": 2.0487560527633994, + "grad_norm": 0.30063661297242134, + "learning_rate": 2.761183323012534e-06, + "loss": 0.0134, + "step": 6135 + }, + { + "epoch": 2.049089998330272, + "grad_norm": 0.23132167995057779, + "learning_rate": 2.7594458565632664e-06, + "loss": 0.0125, + "step": 6136 + }, + { + "epoch": 2.049423943897145, + "grad_norm": 0.30096447248142183, + "learning_rate": 2.757708728576083e-06, + "loss": 0.0144, + "step": 6137 + }, + { + "epoch": 2.049757889464017, + "grad_norm": 0.237418866501332, + "learning_rate": 2.7559719393133987e-06, + "loss": 0.011, + "step": 6138 + }, + { + "epoch": 2.05009183503089, + "grad_norm": 0.24139322555604806, + "learning_rate": 2.754235489037575e-06, + "loss": 0.0129, + "step": 6139 + }, + { + "epoch": 2.0504257805977626, + "grad_norm": 0.23864369974629612, + "learning_rate": 2.7524993780109254e-06, + "loss": 0.0113, + "step": 6140 + }, + { + "epoch": 2.0507597261646353, + "grad_norm": 0.31222063886359813, + "learning_rate": 2.750763606495704e-06, + "loss": 0.0136, + "step": 6141 + }, + { + "epoch": 2.0510936717315076, + "grad_norm": 0.3838453344596363, + "learning_rate": 2.7490281747541276e-06, + "loss": 0.0293, + "step": 6142 + }, + { + "epoch": 2.0514276172983803, + "grad_norm": 0.300159067963588, + "learning_rate": 2.747293083048348e-06, + "loss": 0.0176, + "step": 6143 + }, + { + "epoch": 2.051761562865253, + "grad_norm": 0.33554352526393266, + "learning_rate": 2.7455583316404744e-06, + "loss": 0.017, + "step": 6144 + }, + { + "epoch": 2.0520955084321257, + "grad_norm": 0.32068925701366413, + "learning_rate": 2.743823920792559e-06, + "loss": 0.0152, + "step": 6145 + }, + { + "epoch": 2.052429453998998, + "grad_norm": 0.2834314103614763, + "learning_rate": 2.742089850766607e-06, + "loss": 0.0204, + "step": 6146 + }, + { + "epoch": 2.0527633995658707, + "grad_norm": 0.3034670465553409, + "learning_rate": 2.7403561218245654e-06, + "loss": 0.0178, + "step": 6147 + }, + { + "epoch": 2.0530973451327434, + "grad_norm": 0.3080500108252758, + "learning_rate": 2.7386227342283423e-06, + "loss": 0.0199, + "step": 6148 + }, + { + "epoch": 2.053431290699616, + "grad_norm": 0.3562626710488752, + "learning_rate": 2.73688968823978e-06, + "loss": 0.0159, + "step": 6149 + }, + { + "epoch": 2.0537652362664884, + "grad_norm": 0.20246524699762142, + "learning_rate": 2.7351569841206792e-06, + "loss": 0.0112, + "step": 6150 + }, + { + "epoch": 2.054099181833361, + "grad_norm": 0.35429621418538865, + "learning_rate": 2.733424622132782e-06, + "loss": 0.0211, + "step": 6151 + }, + { + "epoch": 2.054433127400234, + "grad_norm": 0.24907192378489898, + "learning_rate": 2.7316926025377855e-06, + "loss": 0.0147, + "step": 6152 + }, + { + "epoch": 2.054767072967106, + "grad_norm": 0.2848960654848761, + "learning_rate": 2.729960925597328e-06, + "loss": 0.0178, + "step": 6153 + }, + { + "epoch": 2.055101018533979, + "grad_norm": 0.22352736192834902, + "learning_rate": 2.7282295915730016e-06, + "loss": 0.0107, + "step": 6154 + }, + { + "epoch": 2.0554349641008516, + "grad_norm": 0.29154379201592695, + "learning_rate": 2.726498600726346e-06, + "loss": 0.0144, + "step": 6155 + }, + { + "epoch": 2.0557689096677243, + "grad_norm": 0.24102221843832142, + "learning_rate": 2.7247679533188446e-06, + "loss": 0.012, + "step": 6156 + }, + { + "epoch": 2.0561028552345966, + "grad_norm": 0.2936137109883559, + "learning_rate": 2.723037649611936e-06, + "loss": 0.0148, + "step": 6157 + }, + { + "epoch": 2.0564368008014693, + "grad_norm": 0.32490189824501264, + "learning_rate": 2.721307689866997e-06, + "loss": 0.0189, + "step": 6158 + }, + { + "epoch": 2.056770746368342, + "grad_norm": 0.2786962057928275, + "learning_rate": 2.719578074345366e-06, + "loss": 0.0165, + "step": 6159 + }, + { + "epoch": 2.0571046919352147, + "grad_norm": 0.29844488405350844, + "learning_rate": 2.7178488033083163e-06, + "loss": 0.0165, + "step": 6160 + }, + { + "epoch": 2.057438637502087, + "grad_norm": 0.39936642719643084, + "learning_rate": 2.7161198770170784e-06, + "loss": 0.0193, + "step": 6161 + }, + { + "epoch": 2.0577725830689597, + "grad_norm": 0.30338747450520953, + "learning_rate": 2.714391295732822e-06, + "loss": 0.0137, + "step": 6162 + }, + { + "epoch": 2.0581065286358324, + "grad_norm": 0.5142800910557641, + "learning_rate": 2.712663059716675e-06, + "loss": 0.0138, + "step": 6163 + }, + { + "epoch": 2.058440474202705, + "grad_norm": 0.27738714768212364, + "learning_rate": 2.7109351692297015e-06, + "loss": 0.0156, + "step": 6164 + }, + { + "epoch": 2.0587744197695774, + "grad_norm": 0.2553756722956166, + "learning_rate": 2.7092076245329273e-06, + "loss": 0.0109, + "step": 6165 + }, + { + "epoch": 2.05910836533645, + "grad_norm": 0.2558447520215222, + "learning_rate": 2.7074804258873127e-06, + "loss": 0.0113, + "step": 6166 + }, + { + "epoch": 2.059442310903323, + "grad_norm": 0.3399173166704573, + "learning_rate": 2.7057535735537754e-06, + "loss": 0.0174, + "step": 6167 + }, + { + "epoch": 2.0597762564701956, + "grad_norm": 0.31623765510457286, + "learning_rate": 2.704027067793173e-06, + "loss": 0.0161, + "step": 6168 + }, + { + "epoch": 2.060110202037068, + "grad_norm": 0.2733253699048302, + "learning_rate": 2.7023009088663176e-06, + "loss": 0.0111, + "step": 6169 + }, + { + "epoch": 2.0604441476039406, + "grad_norm": 0.3468734264286785, + "learning_rate": 2.7005750970339607e-06, + "loss": 0.0149, + "step": 6170 + }, + { + "epoch": 2.0607780931708133, + "grad_norm": 0.34320281938735064, + "learning_rate": 2.698849632556815e-06, + "loss": 0.019, + "step": 6171 + }, + { + "epoch": 2.0611120387376856, + "grad_norm": 0.2604572316624649, + "learning_rate": 2.697124515695524e-06, + "loss": 0.0173, + "step": 6172 + }, + { + "epoch": 2.0614459843045583, + "grad_norm": 0.2774091219209041, + "learning_rate": 2.695399746710693e-06, + "loss": 0.022, + "step": 6173 + }, + { + "epoch": 2.061779929871431, + "grad_norm": 0.310753394931392, + "learning_rate": 2.6936753258628643e-06, + "loss": 0.0208, + "step": 6174 + }, + { + "epoch": 2.0621138754383037, + "grad_norm": 0.30587375121149146, + "learning_rate": 2.691951253412536e-06, + "loss": 0.015, + "step": 6175 + }, + { + "epoch": 2.062447821005176, + "grad_norm": 0.20310024699670445, + "learning_rate": 2.6902275296201445e-06, + "loss": 0.0089, + "step": 6176 + }, + { + "epoch": 2.0627817665720487, + "grad_norm": 0.23136331185968897, + "learning_rate": 2.688504154746082e-06, + "loss": 0.0102, + "step": 6177 + }, + { + "epoch": 2.0631157121389214, + "grad_norm": 0.2088692663882883, + "learning_rate": 2.686781129050685e-06, + "loss": 0.0113, + "step": 6178 + }, + { + "epoch": 2.063449657705794, + "grad_norm": 0.27160561119880056, + "learning_rate": 2.685058452794235e-06, + "loss": 0.0119, + "step": 6179 + }, + { + "epoch": 2.0637836032726664, + "grad_norm": 0.28719766913607175, + "learning_rate": 2.6833361262369644e-06, + "loss": 0.0133, + "step": 6180 + }, + { + "epoch": 2.064117548839539, + "grad_norm": 0.29712817431083016, + "learning_rate": 2.681614149639048e-06, + "loss": 0.0293, + "step": 6181 + }, + { + "epoch": 2.064451494406412, + "grad_norm": 0.2586448746195902, + "learning_rate": 2.679892523260612e-06, + "loss": 0.0138, + "step": 6182 + }, + { + "epoch": 2.0647854399732846, + "grad_norm": 0.278751060322034, + "learning_rate": 2.6781712473617293e-06, + "loss": 0.0161, + "step": 6183 + }, + { + "epoch": 2.065119385540157, + "grad_norm": 0.3270286494815345, + "learning_rate": 2.6764503222024202e-06, + "loss": 0.0158, + "step": 6184 + }, + { + "epoch": 2.0654533311070296, + "grad_norm": 0.28789205731344475, + "learning_rate": 2.674729748042647e-06, + "loss": 0.0135, + "step": 6185 + }, + { + "epoch": 2.0657872766739023, + "grad_norm": 0.2645692272440605, + "learning_rate": 2.673009525142326e-06, + "loss": 0.0135, + "step": 6186 + }, + { + "epoch": 2.0661212222407745, + "grad_norm": 0.2732618694162732, + "learning_rate": 2.6712896537613143e-06, + "loss": 0.0134, + "step": 6187 + }, + { + "epoch": 2.0664551678076473, + "grad_norm": 0.21235744945635984, + "learning_rate": 2.6695701341594193e-06, + "loss": 0.0113, + "step": 6188 + }, + { + "epoch": 2.06678911337452, + "grad_norm": 0.257867563067866, + "learning_rate": 2.667850966596396e-06, + "loss": 0.0113, + "step": 6189 + }, + { + "epoch": 2.0671230589413927, + "grad_norm": 0.24490635522610477, + "learning_rate": 2.6661321513319467e-06, + "loss": 0.0116, + "step": 6190 + }, + { + "epoch": 2.067457004508265, + "grad_norm": 0.2792789359976981, + "learning_rate": 2.6644136886257138e-06, + "loss": 0.0148, + "step": 6191 + }, + { + "epoch": 2.0677909500751377, + "grad_norm": 0.28195978320173876, + "learning_rate": 2.6626955787372962e-06, + "loss": 0.0114, + "step": 6192 + }, + { + "epoch": 2.0681248956420104, + "grad_norm": 0.3160367855950243, + "learning_rate": 2.6609778219262296e-06, + "loss": 0.0126, + "step": 6193 + }, + { + "epoch": 2.068458841208883, + "grad_norm": 0.22697057263532353, + "learning_rate": 2.659260418452005e-06, + "loss": 0.011, + "step": 6194 + }, + { + "epoch": 2.0687927867757554, + "grad_norm": 0.258081658283097, + "learning_rate": 2.6575433685740547e-06, + "loss": 0.0112, + "step": 6195 + }, + { + "epoch": 2.069126732342628, + "grad_norm": 0.2660646367791906, + "learning_rate": 2.655826672551762e-06, + "loss": 0.0109, + "step": 6196 + }, + { + "epoch": 2.069460677909501, + "grad_norm": 0.34897654305382064, + "learning_rate": 2.6541103306444516e-06, + "loss": 0.014, + "step": 6197 + }, + { + "epoch": 2.0697946234763736, + "grad_norm": 0.29879647705197826, + "learning_rate": 2.6523943431113985e-06, + "loss": 0.0155, + "step": 6198 + }, + { + "epoch": 2.070128569043246, + "grad_norm": 0.33255267726468113, + "learning_rate": 2.6506787102118204e-06, + "loss": 0.0136, + "step": 6199 + }, + { + "epoch": 2.0704625146101185, + "grad_norm": 0.24807825968469902, + "learning_rate": 2.6489634322048853e-06, + "loss": 0.0098, + "step": 6200 + }, + { + "epoch": 2.0707964601769913, + "grad_norm": 0.2806514649688107, + "learning_rate": 2.647248509349708e-06, + "loss": 0.011, + "step": 6201 + }, + { + "epoch": 2.0711304057438635, + "grad_norm": 0.4061888496602113, + "learning_rate": 2.645533941905345e-06, + "loss": 0.0188, + "step": 6202 + }, + { + "epoch": 2.0714643513107363, + "grad_norm": 0.27149238375049944, + "learning_rate": 2.6438197301308045e-06, + "loss": 0.0132, + "step": 6203 + }, + { + "epoch": 2.071798296877609, + "grad_norm": 0.3471557875876584, + "learning_rate": 2.6421058742850346e-06, + "loss": 0.0172, + "step": 6204 + }, + { + "epoch": 2.0721322424444817, + "grad_norm": 0.2785416139500432, + "learning_rate": 2.6403923746269368e-06, + "loss": 0.0138, + "step": 6205 + }, + { + "epoch": 2.072466188011354, + "grad_norm": 0.31617376730516916, + "learning_rate": 2.638679231415353e-06, + "loss": 0.0103, + "step": 6206 + }, + { + "epoch": 2.0728001335782267, + "grad_norm": 0.233890516100944, + "learning_rate": 2.636966444909077e-06, + "loss": 0.0109, + "step": 6207 + }, + { + "epoch": 2.0731340791450994, + "grad_norm": 0.22880810709401206, + "learning_rate": 2.635254015366842e-06, + "loss": 0.0131, + "step": 6208 + }, + { + "epoch": 2.073468024711972, + "grad_norm": 0.2688531281020386, + "learning_rate": 2.633541943047334e-06, + "loss": 0.0146, + "step": 6209 + }, + { + "epoch": 2.0738019702788444, + "grad_norm": 0.2998383308147042, + "learning_rate": 2.6318302282091772e-06, + "loss": 0.0128, + "step": 6210 + }, + { + "epoch": 2.074135915845717, + "grad_norm": 0.3200518557729673, + "learning_rate": 2.6301188711109494e-06, + "loss": 0.0155, + "step": 6211 + }, + { + "epoch": 2.07446986141259, + "grad_norm": 0.2882625256509772, + "learning_rate": 2.6284078720111693e-06, + "loss": 0.0168, + "step": 6212 + }, + { + "epoch": 2.0748038069794625, + "grad_norm": 0.3810363900456062, + "learning_rate": 2.626697231168308e-06, + "loss": 0.0183, + "step": 6213 + }, + { + "epoch": 2.075137752546335, + "grad_norm": 0.2600797779262035, + "learning_rate": 2.624986948840772e-06, + "loss": 0.0129, + "step": 6214 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.23901841608496618, + "learning_rate": 2.6232770252869243e-06, + "loss": 0.0108, + "step": 6215 + }, + { + "epoch": 2.0758056436800802, + "grad_norm": 0.2895261670531142, + "learning_rate": 2.6215674607650653e-06, + "loss": 0.0161, + "step": 6216 + }, + { + "epoch": 2.076139589246953, + "grad_norm": 0.24797933486687493, + "learning_rate": 2.619858255533446e-06, + "loss": 0.0134, + "step": 6217 + }, + { + "epoch": 2.0764735348138252, + "grad_norm": 0.23863762950528655, + "learning_rate": 2.6181494098502626e-06, + "loss": 0.0132, + "step": 6218 + }, + { + "epoch": 2.076807480380698, + "grad_norm": 0.23554487880100103, + "learning_rate": 2.616440923973659e-06, + "loss": 0.01, + "step": 6219 + }, + { + "epoch": 2.0771414259475707, + "grad_norm": 0.2629196257938369, + "learning_rate": 2.6147327981617167e-06, + "loss": 0.0109, + "step": 6220 + }, + { + "epoch": 2.077475371514443, + "grad_norm": 0.30347136443509615, + "learning_rate": 2.613025032672472e-06, + "loss": 0.0156, + "step": 6221 + }, + { + "epoch": 2.0778093170813157, + "grad_norm": 0.2758700539876662, + "learning_rate": 2.611317627763901e-06, + "loss": 0.0165, + "step": 6222 + }, + { + "epoch": 2.0781432626481884, + "grad_norm": 0.25343059122580197, + "learning_rate": 2.609610583693928e-06, + "loss": 0.0138, + "step": 6223 + }, + { + "epoch": 2.078477208215061, + "grad_norm": 0.28907889847773965, + "learning_rate": 2.6079039007204238e-06, + "loss": 0.0137, + "step": 6224 + }, + { + "epoch": 2.0788111537819334, + "grad_norm": 0.3238433865568032, + "learning_rate": 2.6061975791011996e-06, + "loss": 0.0173, + "step": 6225 + }, + { + "epoch": 2.079145099348806, + "grad_norm": 0.2417912605480169, + "learning_rate": 2.6044916190940194e-06, + "loss": 0.0142, + "step": 6226 + }, + { + "epoch": 2.079479044915679, + "grad_norm": 0.25117331307032686, + "learning_rate": 2.6027860209565835e-06, + "loss": 0.0148, + "step": 6227 + }, + { + "epoch": 2.0798129904825515, + "grad_norm": 0.3077747308192545, + "learning_rate": 2.6010807849465468e-06, + "loss": 0.0172, + "step": 6228 + }, + { + "epoch": 2.080146936049424, + "grad_norm": 0.29674813218974383, + "learning_rate": 2.5993759113215032e-06, + "loss": 0.0137, + "step": 6229 + }, + { + "epoch": 2.0804808816162965, + "grad_norm": 0.20110443136718312, + "learning_rate": 2.5976714003389963e-06, + "loss": 0.0111, + "step": 6230 + }, + { + "epoch": 2.0808148271831692, + "grad_norm": 0.298389559232403, + "learning_rate": 2.5959672522565095e-06, + "loss": 0.0173, + "step": 6231 + }, + { + "epoch": 2.081148772750042, + "grad_norm": 0.5126092442675864, + "learning_rate": 2.594263467331477e-06, + "loss": 0.021, + "step": 6232 + }, + { + "epoch": 2.0814827183169142, + "grad_norm": 0.32916226434549356, + "learning_rate": 2.592560045821273e-06, + "loss": 0.0196, + "step": 6233 + }, + { + "epoch": 2.081816663883787, + "grad_norm": 0.2980573368158267, + "learning_rate": 2.5908569879832223e-06, + "loss": 0.0144, + "step": 6234 + }, + { + "epoch": 2.0821506094506597, + "grad_norm": 0.28157395797472956, + "learning_rate": 2.5891542940745873e-06, + "loss": 0.0162, + "step": 6235 + }, + { + "epoch": 2.082484555017532, + "grad_norm": 0.3448861362716233, + "learning_rate": 2.5874519643525864e-06, + "loss": 0.0178, + "step": 6236 + }, + { + "epoch": 2.0828185005844047, + "grad_norm": 0.26130095063103065, + "learning_rate": 2.5857499990743706e-06, + "loss": 0.013, + "step": 6237 + }, + { + "epoch": 2.0831524461512774, + "grad_norm": 0.320555994239304, + "learning_rate": 2.584048398497047e-06, + "loss": 0.0208, + "step": 6238 + }, + { + "epoch": 2.08348639171815, + "grad_norm": 0.29398743811544803, + "learning_rate": 2.5823471628776574e-06, + "loss": 0.0117, + "step": 6239 + }, + { + "epoch": 2.0838203372850224, + "grad_norm": 0.263872652087751, + "learning_rate": 2.5806462924731955e-06, + "loss": 0.0122, + "step": 6240 + }, + { + "epoch": 2.084154282851895, + "grad_norm": 0.28444258494075586, + "learning_rate": 2.5789457875405986e-06, + "loss": 0.0201, + "step": 6241 + }, + { + "epoch": 2.084488228418768, + "grad_norm": 0.29910679769368453, + "learning_rate": 2.57724564833675e-06, + "loss": 0.0196, + "step": 6242 + }, + { + "epoch": 2.0848221739856405, + "grad_norm": 0.27112794160906495, + "learning_rate": 2.5755458751184705e-06, + "loss": 0.0188, + "step": 6243 + }, + { + "epoch": 2.085156119552513, + "grad_norm": 0.3007399325807635, + "learning_rate": 2.5738464681425356e-06, + "loss": 0.0139, + "step": 6244 + }, + { + "epoch": 2.0854900651193855, + "grad_norm": 0.3825229399760879, + "learning_rate": 2.5721474276656566e-06, + "loss": 0.0218, + "step": 6245 + }, + { + "epoch": 2.0858240106862582, + "grad_norm": 0.30070767227458534, + "learning_rate": 2.5704487539444956e-06, + "loss": 0.0153, + "step": 6246 + }, + { + "epoch": 2.086157956253131, + "grad_norm": 0.4106733638952333, + "learning_rate": 2.5687504472356596e-06, + "loss": 0.0176, + "step": 6247 + }, + { + "epoch": 2.086491901820003, + "grad_norm": 0.2881483708610056, + "learning_rate": 2.5670525077956944e-06, + "loss": 0.0143, + "step": 6248 + }, + { + "epoch": 2.086825847386876, + "grad_norm": 0.26086768861553566, + "learning_rate": 2.5653549358810957e-06, + "loss": 0.0114, + "step": 6249 + }, + { + "epoch": 2.0871597929537486, + "grad_norm": 0.4159783982692846, + "learning_rate": 2.563657731748299e-06, + "loss": 0.0176, + "step": 6250 + }, + { + "epoch": 2.087493738520621, + "grad_norm": 0.24033871419824848, + "learning_rate": 2.5619608956536895e-06, + "loss": 0.0156, + "step": 6251 + }, + { + "epoch": 2.0878276840874936, + "grad_norm": 0.3776534290505859, + "learning_rate": 2.5602644278535937e-06, + "loss": 0.0273, + "step": 6252 + }, + { + "epoch": 2.0881616296543664, + "grad_norm": 0.29359583761742586, + "learning_rate": 2.558568328604285e-06, + "loss": 0.0152, + "step": 6253 + }, + { + "epoch": 2.088495575221239, + "grad_norm": 0.28565380949311164, + "learning_rate": 2.5568725981619747e-06, + "loss": 0.0193, + "step": 6254 + }, + { + "epoch": 2.0888295207881113, + "grad_norm": 0.4426396794338646, + "learning_rate": 2.5551772367828276e-06, + "loss": 0.0424, + "step": 6255 + }, + { + "epoch": 2.089163466354984, + "grad_norm": 0.21743482548014179, + "learning_rate": 2.5534822447229436e-06, + "loss": 0.0105, + "step": 6256 + }, + { + "epoch": 2.089497411921857, + "grad_norm": 0.2923398831037656, + "learning_rate": 2.551787622238376e-06, + "loss": 0.0131, + "step": 6257 + }, + { + "epoch": 2.0898313574887295, + "grad_norm": 0.3110990186071121, + "learning_rate": 2.5500933695851104e-06, + "loss": 0.0157, + "step": 6258 + }, + { + "epoch": 2.0901653030556018, + "grad_norm": 0.32442775102682153, + "learning_rate": 2.548399487019092e-06, + "loss": 0.0157, + "step": 6259 + }, + { + "epoch": 2.0904992486224745, + "grad_norm": 0.3482995484791825, + "learning_rate": 2.5467059747961953e-06, + "loss": 0.0134, + "step": 6260 + }, + { + "epoch": 2.090833194189347, + "grad_norm": 0.3089071994523559, + "learning_rate": 2.54501283317225e-06, + "loss": 0.016, + "step": 6261 + }, + { + "epoch": 2.09116713975622, + "grad_norm": 0.24004573560699569, + "learning_rate": 2.5433200624030212e-06, + "loss": 0.0107, + "step": 6262 + }, + { + "epoch": 2.091501085323092, + "grad_norm": 0.3106056910743025, + "learning_rate": 2.541627662744225e-06, + "loss": 0.0133, + "step": 6263 + }, + { + "epoch": 2.091835030889965, + "grad_norm": 0.28718205108796635, + "learning_rate": 2.5399356344515138e-06, + "loss": 0.0159, + "step": 6264 + }, + { + "epoch": 2.0921689764568376, + "grad_norm": 0.2687950827191494, + "learning_rate": 2.538243977780494e-06, + "loss": 0.0116, + "step": 6265 + }, + { + "epoch": 2.0925029220237104, + "grad_norm": 0.32488093758238024, + "learning_rate": 2.5365526929867056e-06, + "loss": 0.0161, + "step": 6266 + }, + { + "epoch": 2.0928368675905826, + "grad_norm": 0.4031736692518948, + "learning_rate": 2.534861780325642e-06, + "loss": 0.0153, + "step": 6267 + }, + { + "epoch": 2.0931708131574553, + "grad_norm": 0.2373644499881437, + "learning_rate": 2.53317124005273e-06, + "loss": 0.0099, + "step": 6268 + }, + { + "epoch": 2.093504758724328, + "grad_norm": 0.2597092551565546, + "learning_rate": 2.5314810724233502e-06, + "loss": 0.0107, + "step": 6269 + }, + { + "epoch": 2.0938387042912003, + "grad_norm": 0.36860569074985583, + "learning_rate": 2.529791277692818e-06, + "loss": 0.0175, + "step": 6270 + }, + { + "epoch": 2.094172649858073, + "grad_norm": 0.26038967520554, + "learning_rate": 2.5281018561163996e-06, + "loss": 0.0134, + "step": 6271 + }, + { + "epoch": 2.0945065954249458, + "grad_norm": 0.2659192944639581, + "learning_rate": 2.5264128079493033e-06, + "loss": 0.0144, + "step": 6272 + }, + { + "epoch": 2.0948405409918185, + "grad_norm": 0.29276916811500125, + "learning_rate": 2.524724133446676e-06, + "loss": 0.0155, + "step": 6273 + }, + { + "epoch": 2.0951744865586908, + "grad_norm": 0.2795195143239489, + "learning_rate": 2.523035832863614e-06, + "loss": 0.0118, + "step": 6274 + }, + { + "epoch": 2.0955084321255635, + "grad_norm": 0.21419630636848647, + "learning_rate": 2.521347906455154e-06, + "loss": 0.0087, + "step": 6275 + }, + { + "epoch": 2.095842377692436, + "grad_norm": 0.4287695267783562, + "learning_rate": 2.5196603544762804e-06, + "loss": 0.0228, + "step": 6276 + }, + { + "epoch": 2.096176323259309, + "grad_norm": 0.24742579163324185, + "learning_rate": 2.5179731771819133e-06, + "loss": 0.0114, + "step": 6277 + }, + { + "epoch": 2.096510268826181, + "grad_norm": 0.280917963843559, + "learning_rate": 2.5162863748269247e-06, + "loss": 0.0177, + "step": 6278 + }, + { + "epoch": 2.096844214393054, + "grad_norm": 0.37891106724615875, + "learning_rate": 2.514599947666122e-06, + "loss": 0.0206, + "step": 6279 + }, + { + "epoch": 2.0971781599599266, + "grad_norm": 0.35690210579383713, + "learning_rate": 2.5129138959542633e-06, + "loss": 0.0201, + "step": 6280 + }, + { + "epoch": 2.0975121055267993, + "grad_norm": 0.27909643901718395, + "learning_rate": 2.5112282199460415e-06, + "loss": 0.0178, + "step": 6281 + }, + { + "epoch": 2.0978460510936716, + "grad_norm": 0.4217844694011119, + "learning_rate": 2.5095429198961056e-06, + "loss": 0.0228, + "step": 6282 + }, + { + "epoch": 2.0981799966605443, + "grad_norm": 0.687734506847529, + "learning_rate": 2.507857996059034e-06, + "loss": 0.0286, + "step": 6283 + }, + { + "epoch": 2.098513942227417, + "grad_norm": 0.5163752703538171, + "learning_rate": 2.5061734486893574e-06, + "loss": 0.0168, + "step": 6284 + }, + { + "epoch": 2.0988478877942893, + "grad_norm": 0.26744004222956186, + "learning_rate": 2.504489278041544e-06, + "loss": 0.0149, + "step": 6285 + }, + { + "epoch": 2.099181833361162, + "grad_norm": 0.2885805123873711, + "learning_rate": 2.5028054843700102e-06, + "loss": 0.0154, + "step": 6286 + }, + { + "epoch": 2.0995157789280348, + "grad_norm": 0.2313607874220595, + "learning_rate": 2.501122067929108e-06, + "loss": 0.0107, + "step": 6287 + }, + { + "epoch": 2.0998497244949075, + "grad_norm": 0.20117095925556797, + "learning_rate": 2.4994390289731446e-06, + "loss": 0.011, + "step": 6288 + }, + { + "epoch": 2.1001836700617798, + "grad_norm": 0.38636228293728364, + "learning_rate": 2.497756367756357e-06, + "loss": 0.0169, + "step": 6289 + }, + { + "epoch": 2.1005176156286525, + "grad_norm": 0.26307109841840925, + "learning_rate": 2.496074084532935e-06, + "loss": 0.0151, + "step": 6290 + }, + { + "epoch": 2.100851561195525, + "grad_norm": 0.27202503480199225, + "learning_rate": 2.4943921795570033e-06, + "loss": 0.0134, + "step": 6291 + }, + { + "epoch": 2.101185506762398, + "grad_norm": 0.27228359902656274, + "learning_rate": 2.4927106530826372e-06, + "loss": 0.0179, + "step": 6292 + }, + { + "epoch": 2.10151945232927, + "grad_norm": 0.2976352902965977, + "learning_rate": 2.491029505363848e-06, + "loss": 0.0139, + "step": 6293 + }, + { + "epoch": 2.101853397896143, + "grad_norm": 0.2543321242263, + "learning_rate": 2.489348736654593e-06, + "loss": 0.0136, + "step": 6294 + }, + { + "epoch": 2.1021873434630156, + "grad_norm": 0.2196231601619742, + "learning_rate": 2.4876683472087767e-06, + "loss": 0.0085, + "step": 6295 + }, + { + "epoch": 2.1025212890298883, + "grad_norm": 0.23987061103317567, + "learning_rate": 2.4859883372802357e-06, + "loss": 0.0136, + "step": 6296 + }, + { + "epoch": 2.1028552345967606, + "grad_norm": 0.19884545816103985, + "learning_rate": 2.484308707122758e-06, + "loss": 0.0101, + "step": 6297 + }, + { + "epoch": 2.1031891801636333, + "grad_norm": 0.383114758245921, + "learning_rate": 2.4826294569900725e-06, + "loss": 0.0165, + "step": 6298 + }, + { + "epoch": 2.103523125730506, + "grad_norm": 0.33389410315398704, + "learning_rate": 2.4809505871358476e-06, + "loss": 0.019, + "step": 6299 + }, + { + "epoch": 2.1038570712973783, + "grad_norm": 0.29862187139083723, + "learning_rate": 2.4792720978136967e-06, + "loss": 0.0162, + "step": 6300 + }, + { + "epoch": 2.104191016864251, + "grad_norm": 0.21961769644919973, + "learning_rate": 2.4775939892771787e-06, + "loss": 0.012, + "step": 6301 + }, + { + "epoch": 2.1045249624311237, + "grad_norm": 0.30846576518311836, + "learning_rate": 2.4759162617797873e-06, + "loss": 0.02, + "step": 6302 + }, + { + "epoch": 2.1048589079979965, + "grad_norm": 0.3291334958589151, + "learning_rate": 2.4742389155749657e-06, + "loss": 0.0187, + "step": 6303 + }, + { + "epoch": 2.1051928535648687, + "grad_norm": 0.28348334818111015, + "learning_rate": 2.472561950916094e-06, + "loss": 0.0122, + "step": 6304 + }, + { + "epoch": 2.1055267991317415, + "grad_norm": 0.2821980081552556, + "learning_rate": 2.4708853680565e-06, + "loss": 0.0151, + "step": 6305 + }, + { + "epoch": 2.105860744698614, + "grad_norm": 0.22092780482715804, + "learning_rate": 2.4692091672494494e-06, + "loss": 0.0095, + "step": 6306 + }, + { + "epoch": 2.106194690265487, + "grad_norm": 0.34163952014772775, + "learning_rate": 2.4675333487481558e-06, + "loss": 0.0148, + "step": 6307 + }, + { + "epoch": 2.106528635832359, + "grad_norm": 0.23487466372973265, + "learning_rate": 2.4658579128057665e-06, + "loss": 0.0147, + "step": 6308 + }, + { + "epoch": 2.106862581399232, + "grad_norm": 0.309473110287223, + "learning_rate": 2.4641828596753803e-06, + "loss": 0.0174, + "step": 6309 + }, + { + "epoch": 2.1071965269661046, + "grad_norm": 0.23957152064386736, + "learning_rate": 2.4625081896100294e-06, + "loss": 0.0147, + "step": 6310 + }, + { + "epoch": 2.1075304725329773, + "grad_norm": 0.2356402669385959, + "learning_rate": 2.4608339028626943e-06, + "loss": 0.0185, + "step": 6311 + }, + { + "epoch": 2.1078644180998496, + "grad_norm": 0.2478075683890819, + "learning_rate": 2.4591599996862957e-06, + "loss": 0.0114, + "step": 6312 + }, + { + "epoch": 2.1081983636667223, + "grad_norm": 0.3299389202861783, + "learning_rate": 2.457486480333699e-06, + "loss": 0.0208, + "step": 6313 + }, + { + "epoch": 2.108532309233595, + "grad_norm": 0.32861519914876286, + "learning_rate": 2.4558133450577044e-06, + "loss": 0.0184, + "step": 6314 + }, + { + "epoch": 2.1088662548004677, + "grad_norm": 0.3236467368213756, + "learning_rate": 2.4541405941110626e-06, + "loss": 0.0199, + "step": 6315 + }, + { + "epoch": 2.10920020036734, + "grad_norm": 0.4048303066970319, + "learning_rate": 2.452468227746459e-06, + "loss": 0.0119, + "step": 6316 + }, + { + "epoch": 2.1095341459342127, + "grad_norm": 0.34363621322258253, + "learning_rate": 2.4507962462165254e-06, + "loss": 0.0145, + "step": 6317 + }, + { + "epoch": 2.1098680915010855, + "grad_norm": 0.2583666199765121, + "learning_rate": 2.449124649773835e-06, + "loss": 0.0129, + "step": 6318 + }, + { + "epoch": 2.1102020370679577, + "grad_norm": 0.39906251268362747, + "learning_rate": 2.4474534386709036e-06, + "loss": 0.0192, + "step": 6319 + }, + { + "epoch": 2.1105359826348304, + "grad_norm": 0.30957618565823813, + "learning_rate": 2.4457826131601835e-06, + "loss": 0.0147, + "step": 6320 + }, + { + "epoch": 2.110869928201703, + "grad_norm": 0.37197451281109395, + "learning_rate": 2.444112173494077e-06, + "loss": 0.0206, + "step": 6321 + }, + { + "epoch": 2.111203873768576, + "grad_norm": 0.2476699885184769, + "learning_rate": 2.4424421199249194e-06, + "loss": 0.0118, + "step": 6322 + }, + { + "epoch": 2.111537819335448, + "grad_norm": 0.28164011124237065, + "learning_rate": 2.440772452704993e-06, + "loss": 0.0202, + "step": 6323 + }, + { + "epoch": 2.111871764902321, + "grad_norm": 0.2985031021074347, + "learning_rate": 2.4391031720865246e-06, + "loss": 0.0191, + "step": 6324 + }, + { + "epoch": 2.1122057104691936, + "grad_norm": 0.28205015046644444, + "learning_rate": 2.4374342783216732e-06, + "loss": 0.0122, + "step": 6325 + }, + { + "epoch": 2.1125396560360663, + "grad_norm": 0.28397498027623147, + "learning_rate": 2.435765771662549e-06, + "loss": 0.0157, + "step": 6326 + }, + { + "epoch": 2.1128736016029386, + "grad_norm": 0.2527821376524227, + "learning_rate": 2.4340976523611957e-06, + "loss": 0.0154, + "step": 6327 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 0.2657558328790342, + "learning_rate": 2.4324299206696057e-06, + "loss": 0.0121, + "step": 6328 + }, + { + "epoch": 2.113541492736684, + "grad_norm": 0.328007150050808, + "learning_rate": 2.4307625768397077e-06, + "loss": 0.0196, + "step": 6329 + }, + { + "epoch": 2.1138754383035567, + "grad_norm": 0.2794516305190381, + "learning_rate": 2.4290956211233757e-06, + "loss": 0.0129, + "step": 6330 + }, + { + "epoch": 2.114209383870429, + "grad_norm": 0.2403463497273294, + "learning_rate": 2.42742905377242e-06, + "loss": 0.0089, + "step": 6331 + }, + { + "epoch": 2.1145433294373017, + "grad_norm": 0.32149971714143255, + "learning_rate": 2.4257628750385987e-06, + "loss": 0.0176, + "step": 6332 + }, + { + "epoch": 2.1148772750041744, + "grad_norm": 0.3159822559410141, + "learning_rate": 2.424097085173604e-06, + "loss": 0.0148, + "step": 6333 + }, + { + "epoch": 2.1152112205710467, + "grad_norm": 0.3292482716731008, + "learning_rate": 2.4224316844290747e-06, + "loss": 0.0167, + "step": 6334 + }, + { + "epoch": 2.1155451661379194, + "grad_norm": 0.3620125389575101, + "learning_rate": 2.4207666730565893e-06, + "loss": 0.0178, + "step": 6335 + }, + { + "epoch": 2.115879111704792, + "grad_norm": 0.30644156127925465, + "learning_rate": 2.4191020513076697e-06, + "loss": 0.0142, + "step": 6336 + }, + { + "epoch": 2.116213057271665, + "grad_norm": 0.2866973652860801, + "learning_rate": 2.4174378194337715e-06, + "loss": 0.0131, + "step": 6337 + }, + { + "epoch": 2.116547002838537, + "grad_norm": 0.3570314974441156, + "learning_rate": 2.4157739776863023e-06, + "loss": 0.0186, + "step": 6338 + }, + { + "epoch": 2.11688094840541, + "grad_norm": 0.29606147566208474, + "learning_rate": 2.4141105263166e-06, + "loss": 0.0128, + "step": 6339 + }, + { + "epoch": 2.1172148939722826, + "grad_norm": 0.33670647754402017, + "learning_rate": 2.41244746557595e-06, + "loss": 0.0202, + "step": 6340 + }, + { + "epoch": 2.1175488395391553, + "grad_norm": 0.2600737581622472, + "learning_rate": 2.4107847957155784e-06, + "loss": 0.0139, + "step": 6341 + }, + { + "epoch": 2.1178827851060276, + "grad_norm": 0.2520569871419981, + "learning_rate": 2.409122516986652e-06, + "loss": 0.0133, + "step": 6342 + }, + { + "epoch": 2.1182167306729003, + "grad_norm": 0.2549206631474349, + "learning_rate": 2.4074606296402735e-06, + "loss": 0.0123, + "step": 6343 + }, + { + "epoch": 2.118550676239773, + "grad_norm": 0.3133443756263147, + "learning_rate": 2.405799133927496e-06, + "loss": 0.0168, + "step": 6344 + }, + { + "epoch": 2.1188846218066457, + "grad_norm": 0.36328037869246643, + "learning_rate": 2.404138030099303e-06, + "loss": 0.0163, + "step": 6345 + }, + { + "epoch": 2.119218567373518, + "grad_norm": 0.24759314519351655, + "learning_rate": 2.4024773184066253e-06, + "loss": 0.0154, + "step": 6346 + }, + { + "epoch": 2.1195525129403907, + "grad_norm": 0.27322584324323756, + "learning_rate": 2.4008169991003356e-06, + "loss": 0.0136, + "step": 6347 + }, + { + "epoch": 2.1198864585072634, + "grad_norm": 0.23665414390601153, + "learning_rate": 2.3991570724312405e-06, + "loss": 0.0146, + "step": 6348 + }, + { + "epoch": 2.1202204040741357, + "grad_norm": 0.27824772245001744, + "learning_rate": 2.3974975386500958e-06, + "loss": 0.0163, + "step": 6349 + }, + { + "epoch": 2.1205543496410084, + "grad_norm": 0.2579284454433024, + "learning_rate": 2.3958383980075896e-06, + "loss": 0.0123, + "step": 6350 + }, + { + "epoch": 2.120888295207881, + "grad_norm": 0.2927224426767084, + "learning_rate": 2.394179650754358e-06, + "loss": 0.0138, + "step": 6351 + }, + { + "epoch": 2.121222240774754, + "grad_norm": 0.28411440366395213, + "learning_rate": 2.3925212971409688e-06, + "loss": 0.0166, + "step": 6352 + }, + { + "epoch": 2.121556186341626, + "grad_norm": 0.20964275273940322, + "learning_rate": 2.3908633374179436e-06, + "loss": 0.0089, + "step": 6353 + }, + { + "epoch": 2.121890131908499, + "grad_norm": 0.2808166456045462, + "learning_rate": 2.3892057718357308e-06, + "loss": 0.0146, + "step": 6354 + }, + { + "epoch": 2.1222240774753716, + "grad_norm": 0.22847341818999706, + "learning_rate": 2.3875486006447294e-06, + "loss": 0.0117, + "step": 6355 + }, + { + "epoch": 2.1225580230422443, + "grad_norm": 0.2654764079191746, + "learning_rate": 2.3858918240952703e-06, + "loss": 0.0098, + "step": 6356 + }, + { + "epoch": 2.1228919686091166, + "grad_norm": 0.2741049101290846, + "learning_rate": 2.384235442437632e-06, + "loss": 0.0132, + "step": 6357 + }, + { + "epoch": 2.1232259141759893, + "grad_norm": 0.2961895766669102, + "learning_rate": 2.3825794559220296e-06, + "loss": 0.0194, + "step": 6358 + }, + { + "epoch": 2.123559859742862, + "grad_norm": 0.36911682734464085, + "learning_rate": 2.380923864798621e-06, + "loss": 0.0266, + "step": 6359 + }, + { + "epoch": 2.1238938053097347, + "grad_norm": 0.20844802443615157, + "learning_rate": 2.3792686693174993e-06, + "loss": 0.0106, + "step": 6360 + }, + { + "epoch": 2.124227750876607, + "grad_norm": 0.27269892682133406, + "learning_rate": 2.3776138697287055e-06, + "loss": 0.0145, + "step": 6361 + }, + { + "epoch": 2.1245616964434797, + "grad_norm": 0.26181954181026257, + "learning_rate": 2.3759594662822122e-06, + "loss": 0.0129, + "step": 6362 + }, + { + "epoch": 2.1248956420103524, + "grad_norm": 0.27356183681183144, + "learning_rate": 2.3743054592279386e-06, + "loss": 0.0119, + "step": 6363 + }, + { + "epoch": 2.125229587577225, + "grad_norm": 0.2732613382505249, + "learning_rate": 2.372651848815742e-06, + "loss": 0.0144, + "step": 6364 + }, + { + "epoch": 2.1255635331440974, + "grad_norm": 0.17167852700380207, + "learning_rate": 2.370998635295421e-06, + "loss": 0.0074, + "step": 6365 + }, + { + "epoch": 2.12589747871097, + "grad_norm": 0.27158385445851063, + "learning_rate": 2.3693458189167106e-06, + "loss": 0.0161, + "step": 6366 + }, + { + "epoch": 2.126231424277843, + "grad_norm": 0.32176653140968703, + "learning_rate": 2.3676933999292905e-06, + "loss": 0.0135, + "step": 6367 + }, + { + "epoch": 2.126565369844715, + "grad_norm": 0.3311250748765162, + "learning_rate": 2.366041378582775e-06, + "loss": 0.018, + "step": 6368 + }, + { + "epoch": 2.126899315411588, + "grad_norm": 0.24458509726088312, + "learning_rate": 2.364389755126723e-06, + "loss": 0.0101, + "step": 6369 + }, + { + "epoch": 2.1272332609784605, + "grad_norm": 0.2083650726094157, + "learning_rate": 2.3627385298106344e-06, + "loss": 0.0094, + "step": 6370 + }, + { + "epoch": 2.1275672065453333, + "grad_norm": 0.3471440551333678, + "learning_rate": 2.361087702883941e-06, + "loss": 0.0127, + "step": 6371 + }, + { + "epoch": 2.1279011521122055, + "grad_norm": 0.26878538303276184, + "learning_rate": 2.359437274596024e-06, + "loss": 0.0117, + "step": 6372 + }, + { + "epoch": 2.1282350976790783, + "grad_norm": 0.24044339642405563, + "learning_rate": 2.357787245196197e-06, + "loss": 0.0128, + "step": 6373 + }, + { + "epoch": 2.128569043245951, + "grad_norm": 0.1791980061910617, + "learning_rate": 2.3561376149337188e-06, + "loss": 0.0052, + "step": 6374 + }, + { + "epoch": 2.1289029888128237, + "grad_norm": 0.2726359222798816, + "learning_rate": 2.3544883840577815e-06, + "loss": 0.0139, + "step": 6375 + }, + { + "epoch": 2.129236934379696, + "grad_norm": 0.31427581655705467, + "learning_rate": 2.352839552817527e-06, + "loss": 0.0174, + "step": 6376 + }, + { + "epoch": 2.1295708799465687, + "grad_norm": 0.31105706909069314, + "learning_rate": 2.3511911214620255e-06, + "loss": 0.0116, + "step": 6377 + }, + { + "epoch": 2.1299048255134414, + "grad_norm": 0.3165157283981129, + "learning_rate": 2.3495430902402956e-06, + "loss": 0.0225, + "step": 6378 + }, + { + "epoch": 2.1302387710803137, + "grad_norm": 0.2708561870368086, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.0099, + "step": 6379 + }, + { + "epoch": 2.1305727166471864, + "grad_norm": 0.2567672621447335, + "learning_rate": 2.346248229193901e-06, + "loss": 0.0139, + "step": 6380 + }, + { + "epoch": 2.130906662214059, + "grad_norm": 0.36063235869416643, + "learning_rate": 2.344601399866962e-06, + "loss": 0.0222, + "step": 6381 + }, + { + "epoch": 2.131240607780932, + "grad_norm": 0.2289741706220592, + "learning_rate": 2.342954971669252e-06, + "loss": 0.0139, + "step": 6382 + }, + { + "epoch": 2.131574553347804, + "grad_norm": 0.3242083045253181, + "learning_rate": 2.341308944849477e-06, + "loss": 0.0127, + "step": 6383 + }, + { + "epoch": 2.131908498914677, + "grad_norm": 0.24755677452308103, + "learning_rate": 2.3396633196562924e-06, + "loss": 0.0111, + "step": 6384 + }, + { + "epoch": 2.1322424444815495, + "grad_norm": 0.35538912735353595, + "learning_rate": 2.3380180963382866e-06, + "loss": 0.0194, + "step": 6385 + }, + { + "epoch": 2.1325763900484223, + "grad_norm": 0.333269934452208, + "learning_rate": 2.3363732751439926e-06, + "loss": 0.0152, + "step": 6386 + }, + { + "epoch": 2.1329103356152945, + "grad_norm": 0.38942370033643875, + "learning_rate": 2.334728856321875e-06, + "loss": 0.0165, + "step": 6387 + }, + { + "epoch": 2.1332442811821672, + "grad_norm": 0.3414507275124361, + "learning_rate": 2.33308484012035e-06, + "loss": 0.0178, + "step": 6388 + }, + { + "epoch": 2.13357822674904, + "grad_norm": 0.2320738052381853, + "learning_rate": 2.33144122678776e-06, + "loss": 0.0099, + "step": 6389 + }, + { + "epoch": 2.1339121723159127, + "grad_norm": 0.2678591322809289, + "learning_rate": 2.3297980165723953e-06, + "loss": 0.0108, + "step": 6390 + }, + { + "epoch": 2.134246117882785, + "grad_norm": 0.3449058952106492, + "learning_rate": 2.3281552097224798e-06, + "loss": 0.0173, + "step": 6391 + }, + { + "epoch": 2.1345800634496577, + "grad_norm": 0.3138892815959436, + "learning_rate": 2.326512806486181e-06, + "loss": 0.0149, + "step": 6392 + }, + { + "epoch": 2.1349140090165304, + "grad_norm": 0.21716274667209393, + "learning_rate": 2.3248708071116005e-06, + "loss": 0.0125, + "step": 6393 + }, + { + "epoch": 2.135247954583403, + "grad_norm": 0.23095461058696024, + "learning_rate": 2.323229211846783e-06, + "loss": 0.0108, + "step": 6394 + }, + { + "epoch": 2.1355819001502754, + "grad_norm": 0.27261593654845534, + "learning_rate": 2.3215880209397133e-06, + "loss": 0.0147, + "step": 6395 + }, + { + "epoch": 2.135915845717148, + "grad_norm": 0.3127963098104018, + "learning_rate": 2.319947234638308e-06, + "loss": 0.0144, + "step": 6396 + }, + { + "epoch": 2.136249791284021, + "grad_norm": 0.3241815094646612, + "learning_rate": 2.3183068531904317e-06, + "loss": 0.0116, + "step": 6397 + }, + { + "epoch": 2.136583736850893, + "grad_norm": 0.23018954007064094, + "learning_rate": 2.3166668768438772e-06, + "loss": 0.0104, + "step": 6398 + }, + { + "epoch": 2.136917682417766, + "grad_norm": 0.27751962651823514, + "learning_rate": 2.31502730584639e-06, + "loss": 0.0158, + "step": 6399 + }, + { + "epoch": 2.1372516279846385, + "grad_norm": 0.2480879720731281, + "learning_rate": 2.313388140445641e-06, + "loss": 0.0123, + "step": 6400 + }, + { + "epoch": 2.1375855735515112, + "grad_norm": 0.23526793343388888, + "learning_rate": 2.311749380889249e-06, + "loss": 0.0157, + "step": 6401 + }, + { + "epoch": 2.1379195191183835, + "grad_norm": 0.44094618802044216, + "learning_rate": 2.310111027424764e-06, + "loss": 0.0255, + "step": 6402 + }, + { + "epoch": 2.1382534646852562, + "grad_norm": 0.26614479700607774, + "learning_rate": 2.308473080299683e-06, + "loss": 0.0138, + "step": 6403 + }, + { + "epoch": 2.138587410252129, + "grad_norm": 0.25853358473002913, + "learning_rate": 2.3068355397614313e-06, + "loss": 0.0166, + "step": 6404 + }, + { + "epoch": 2.1389213558190017, + "grad_norm": 0.3646570931116901, + "learning_rate": 2.3051984060573855e-06, + "loss": 0.0156, + "step": 6405 + }, + { + "epoch": 2.139255301385874, + "grad_norm": 0.2844331645715239, + "learning_rate": 2.303561679434849e-06, + "loss": 0.0152, + "step": 6406 + }, + { + "epoch": 2.1395892469527467, + "grad_norm": 0.2704412909914307, + "learning_rate": 2.3019253601410725e-06, + "loss": 0.0125, + "step": 6407 + }, + { + "epoch": 2.1399231925196194, + "grad_norm": 0.26389364475661037, + "learning_rate": 2.300289448423237e-06, + "loss": 0.0142, + "step": 6408 + }, + { + "epoch": 2.140257138086492, + "grad_norm": 0.26890366044211367, + "learning_rate": 2.2986539445284705e-06, + "loss": 0.0127, + "step": 6409 + }, + { + "epoch": 2.1405910836533644, + "grad_norm": 0.29937362645021015, + "learning_rate": 2.2970188487038293e-06, + "loss": 0.0138, + "step": 6410 + }, + { + "epoch": 2.140925029220237, + "grad_norm": 0.46247979897770325, + "learning_rate": 2.295384161196321e-06, + "loss": 0.0169, + "step": 6411 + }, + { + "epoch": 2.14125897478711, + "grad_norm": 0.21502341753824644, + "learning_rate": 2.293749882252879e-06, + "loss": 0.0125, + "step": 6412 + }, + { + "epoch": 2.1415929203539825, + "grad_norm": 0.22272698105449634, + "learning_rate": 2.2921160121203847e-06, + "loss": 0.0118, + "step": 6413 + }, + { + "epoch": 2.141926865920855, + "grad_norm": 0.2998770049748982, + "learning_rate": 2.290482551045649e-06, + "loss": 0.0177, + "step": 6414 + }, + { + "epoch": 2.1422608114877275, + "grad_norm": 0.23166158608170034, + "learning_rate": 2.2888494992754294e-06, + "loss": 0.0105, + "step": 6415 + }, + { + "epoch": 2.1425947570546002, + "grad_norm": 0.2593065690196099, + "learning_rate": 2.2872168570564136e-06, + "loss": 0.0142, + "step": 6416 + }, + { + "epoch": 2.1429287026214725, + "grad_norm": 0.22886712443986532, + "learning_rate": 2.2855846246352335e-06, + "loss": 0.0101, + "step": 6417 + }, + { + "epoch": 2.143262648188345, + "grad_norm": 0.27142361160843476, + "learning_rate": 2.2839528022584596e-06, + "loss": 0.0158, + "step": 6418 + }, + { + "epoch": 2.143596593755218, + "grad_norm": 0.3377025436421419, + "learning_rate": 2.2823213901725927e-06, + "loss": 0.0219, + "step": 6419 + }, + { + "epoch": 2.1439305393220907, + "grad_norm": 0.32254654670742733, + "learning_rate": 2.2806903886240815e-06, + "loss": 0.0219, + "step": 6420 + }, + { + "epoch": 2.144264484888963, + "grad_norm": 0.2978003362475645, + "learning_rate": 2.2790597978593044e-06, + "loss": 0.0161, + "step": 6421 + }, + { + "epoch": 2.1445984304558356, + "grad_norm": 0.4167027119901389, + "learning_rate": 2.2774296181245825e-06, + "loss": 0.027, + "step": 6422 + }, + { + "epoch": 2.1449323760227084, + "grad_norm": 0.31401897722913386, + "learning_rate": 2.275799849666174e-06, + "loss": 0.0151, + "step": 6423 + }, + { + "epoch": 2.145266321589581, + "grad_norm": 0.34065054419097557, + "learning_rate": 2.274170492730277e-06, + "loss": 0.0209, + "step": 6424 + }, + { + "epoch": 2.1456002671564534, + "grad_norm": 0.45491689300781946, + "learning_rate": 2.27254154756302e-06, + "loss": 0.0291, + "step": 6425 + }, + { + "epoch": 2.145934212723326, + "grad_norm": 0.3007007719494336, + "learning_rate": 2.2709130144104795e-06, + "loss": 0.0106, + "step": 6426 + }, + { + "epoch": 2.146268158290199, + "grad_norm": 0.19476559482408035, + "learning_rate": 2.26928489351866e-06, + "loss": 0.008, + "step": 6427 + }, + { + "epoch": 2.146602103857071, + "grad_norm": 0.3470246119824881, + "learning_rate": 2.267657185133511e-06, + "loss": 0.0156, + "step": 6428 + }, + { + "epoch": 2.146936049423944, + "grad_norm": 0.22884004535190874, + "learning_rate": 2.2660298895009157e-06, + "loss": 0.0101, + "step": 6429 + }, + { + "epoch": 2.1472699949908165, + "grad_norm": 0.3585614088561408, + "learning_rate": 2.2644030068666993e-06, + "loss": 0.0214, + "step": 6430 + }, + { + "epoch": 2.147603940557689, + "grad_norm": 0.43823519765365065, + "learning_rate": 2.2627765374766175e-06, + "loss": 0.0155, + "step": 6431 + }, + { + "epoch": 2.1479378861245615, + "grad_norm": 0.3609804258874382, + "learning_rate": 2.2611504815763715e-06, + "loss": 0.0129, + "step": 6432 + }, + { + "epoch": 2.148271831691434, + "grad_norm": 0.30559656784704303, + "learning_rate": 2.259524839411592e-06, + "loss": 0.0134, + "step": 6433 + }, + { + "epoch": 2.148605777258307, + "grad_norm": 0.26901653891642513, + "learning_rate": 2.2578996112278535e-06, + "loss": 0.0133, + "step": 6434 + }, + { + "epoch": 2.1489397228251796, + "grad_norm": 0.35426557485104243, + "learning_rate": 2.2562747972706663e-06, + "loss": 0.0215, + "step": 6435 + }, + { + "epoch": 2.149273668392052, + "grad_norm": 0.35077812604510644, + "learning_rate": 2.254650397785479e-06, + "loss": 0.0172, + "step": 6436 + }, + { + "epoch": 2.1496076139589246, + "grad_norm": 0.3718090048434478, + "learning_rate": 2.253026413017672e-06, + "loss": 0.0187, + "step": 6437 + }, + { + "epoch": 2.1499415595257974, + "grad_norm": 0.23086826190060963, + "learning_rate": 2.2514028432125722e-06, + "loss": 0.0094, + "step": 6438 + }, + { + "epoch": 2.15027550509267, + "grad_norm": 0.3515387319496888, + "learning_rate": 2.249779688615435e-06, + "loss": 0.0219, + "step": 6439 + }, + { + "epoch": 2.1506094506595423, + "grad_norm": 0.24022766804051457, + "learning_rate": 2.248156949471459e-06, + "loss": 0.0162, + "step": 6440 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.3259619450139486, + "learning_rate": 2.2465346260257786e-06, + "loss": 0.0322, + "step": 6441 + }, + { + "epoch": 2.151277341793288, + "grad_norm": 0.27871676383828087, + "learning_rate": 2.2449127185234626e-06, + "loss": 0.0165, + "step": 6442 + }, + { + "epoch": 2.1516112873601605, + "grad_norm": 0.47230038262915325, + "learning_rate": 2.2432912272095227e-06, + "loss": 0.0228, + "step": 6443 + }, + { + "epoch": 2.1519452329270328, + "grad_norm": 0.2745384003892179, + "learning_rate": 2.2416701523288997e-06, + "loss": 0.0171, + "step": 6444 + }, + { + "epoch": 2.1522791784939055, + "grad_norm": 0.19520006217044095, + "learning_rate": 2.240049494126479e-06, + "loss": 0.0123, + "step": 6445 + }, + { + "epoch": 2.152613124060778, + "grad_norm": 0.2682638297511614, + "learning_rate": 2.238429252847079e-06, + "loss": 0.0193, + "step": 6446 + }, + { + "epoch": 2.1529470696276505, + "grad_norm": 0.2798517279219314, + "learning_rate": 2.2368094287354586e-06, + "loss": 0.0128, + "step": 6447 + }, + { + "epoch": 2.153281015194523, + "grad_norm": 0.29017966020638897, + "learning_rate": 2.2351900220363083e-06, + "loss": 0.0167, + "step": 6448 + }, + { + "epoch": 2.153614960761396, + "grad_norm": 0.24358722672839853, + "learning_rate": 2.2335710329942613e-06, + "loss": 0.0135, + "step": 6449 + }, + { + "epoch": 2.1539489063282686, + "grad_norm": 0.27452795615277803, + "learning_rate": 2.2319524618538814e-06, + "loss": 0.0115, + "step": 6450 + }, + { + "epoch": 2.154282851895141, + "grad_norm": 0.32600071812298304, + "learning_rate": 2.2303343088596753e-06, + "loss": 0.0174, + "step": 6451 + }, + { + "epoch": 2.1546167974620136, + "grad_norm": 0.2860901905940594, + "learning_rate": 2.2287165742560828e-06, + "loss": 0.0175, + "step": 6452 + }, + { + "epoch": 2.1549507430288863, + "grad_norm": 0.27689664455515633, + "learning_rate": 2.227099258287485e-06, + "loss": 0.0187, + "step": 6453 + }, + { + "epoch": 2.155284688595759, + "grad_norm": 0.36139584950542614, + "learning_rate": 2.2254823611981926e-06, + "loss": 0.0261, + "step": 6454 + }, + { + "epoch": 2.1556186341626313, + "grad_norm": 0.27794608830824197, + "learning_rate": 2.2238658832324593e-06, + "loss": 0.0134, + "step": 6455 + }, + { + "epoch": 2.155952579729504, + "grad_norm": 0.2172174704558338, + "learning_rate": 2.222249824634471e-06, + "loss": 0.0116, + "step": 6456 + }, + { + "epoch": 2.1562865252963768, + "grad_norm": 0.39581746911098087, + "learning_rate": 2.220634185648354e-06, + "loss": 0.0237, + "step": 6457 + }, + { + "epoch": 2.1566204708632495, + "grad_norm": 0.2841949761379651, + "learning_rate": 2.2190189665181684e-06, + "loss": 0.0154, + "step": 6458 + }, + { + "epoch": 2.1569544164301218, + "grad_norm": 0.2331623300770666, + "learning_rate": 2.2174041674879152e-06, + "loss": 0.0119, + "step": 6459 + }, + { + "epoch": 2.1572883619969945, + "grad_norm": 0.274387595273132, + "learning_rate": 2.2157897888015247e-06, + "loss": 0.01, + "step": 6460 + }, + { + "epoch": 2.157622307563867, + "grad_norm": 0.28893125434374417, + "learning_rate": 2.214175830702871e-06, + "loss": 0.0119, + "step": 6461 + }, + { + "epoch": 2.15795625313074, + "grad_norm": 0.22081395823937297, + "learning_rate": 2.2125622934357588e-06, + "loss": 0.0119, + "step": 6462 + }, + { + "epoch": 2.158290198697612, + "grad_norm": 0.4761637320071046, + "learning_rate": 2.210949177243933e-06, + "loss": 0.0134, + "step": 6463 + }, + { + "epoch": 2.158624144264485, + "grad_norm": 0.37701366853689444, + "learning_rate": 2.209336482371076e-06, + "loss": 0.0286, + "step": 6464 + }, + { + "epoch": 2.1589580898313576, + "grad_norm": 0.30219015519774584, + "learning_rate": 2.2077242090608e-06, + "loss": 0.016, + "step": 6465 + }, + { + "epoch": 2.15929203539823, + "grad_norm": 0.28032324471684783, + "learning_rate": 2.206112357556662e-06, + "loss": 0.0186, + "step": 6466 + }, + { + "epoch": 2.1596259809651026, + "grad_norm": 0.2399235550633597, + "learning_rate": 2.2045009281021486e-06, + "loss": 0.0101, + "step": 6467 + }, + { + "epoch": 2.1599599265319753, + "grad_norm": 0.30685223515071325, + "learning_rate": 2.202889920940685e-06, + "loss": 0.0178, + "step": 6468 + }, + { + "epoch": 2.160293872098848, + "grad_norm": 0.28914116472358037, + "learning_rate": 2.2012793363156337e-06, + "loss": 0.0158, + "step": 6469 + }, + { + "epoch": 2.1606278176657203, + "grad_norm": 0.2973262388915178, + "learning_rate": 2.199669174470295e-06, + "loss": 0.014, + "step": 6470 + }, + { + "epoch": 2.160961763232593, + "grad_norm": 0.34694651618081374, + "learning_rate": 2.1980594356478977e-06, + "loss": 0.0199, + "step": 6471 + }, + { + "epoch": 2.1612957087994658, + "grad_norm": 0.22795188552565387, + "learning_rate": 2.196450120091617e-06, + "loss": 0.0141, + "step": 6472 + }, + { + "epoch": 2.1616296543663385, + "grad_norm": 0.2548834137226818, + "learning_rate": 2.194841228044554e-06, + "loss": 0.0113, + "step": 6473 + }, + { + "epoch": 2.1619635999332107, + "grad_norm": 0.3079495826836035, + "learning_rate": 2.1932327597497537e-06, + "loss": 0.0158, + "step": 6474 + }, + { + "epoch": 2.1622975455000835, + "grad_norm": 0.3183568664049095, + "learning_rate": 2.1916247154501937e-06, + "loss": 0.0148, + "step": 6475 + }, + { + "epoch": 2.162631491066956, + "grad_norm": 0.29722517078008187, + "learning_rate": 2.190017095388789e-06, + "loss": 0.0209, + "step": 6476 + }, + { + "epoch": 2.1629654366338285, + "grad_norm": 0.2374639534191777, + "learning_rate": 2.1884098998083867e-06, + "loss": 0.0115, + "step": 6477 + }, + { + "epoch": 2.163299382200701, + "grad_norm": 0.3003642132969079, + "learning_rate": 2.1868031289517773e-06, + "loss": 0.015, + "step": 6478 + }, + { + "epoch": 2.163633327767574, + "grad_norm": 0.2790728792422586, + "learning_rate": 2.1851967830616773e-06, + "loss": 0.0141, + "step": 6479 + }, + { + "epoch": 2.1639672733344466, + "grad_norm": 0.37289641299565107, + "learning_rate": 2.1835908623807462e-06, + "loss": 0.0155, + "step": 6480 + }, + { + "epoch": 2.164301218901319, + "grad_norm": 0.2828161579770664, + "learning_rate": 2.1819853671515774e-06, + "loss": 0.0144, + "step": 6481 + }, + { + "epoch": 2.1646351644681916, + "grad_norm": 0.35307565715924766, + "learning_rate": 2.180380297616702e-06, + "loss": 0.0195, + "step": 6482 + }, + { + "epoch": 2.1649691100350643, + "grad_norm": 0.3231217864449414, + "learning_rate": 2.178775654018581e-06, + "loss": 0.0229, + "step": 6483 + }, + { + "epoch": 2.165303055601937, + "grad_norm": 0.32269783910999195, + "learning_rate": 2.177171436599618e-06, + "loss": 0.023, + "step": 6484 + }, + { + "epoch": 2.1656370011688093, + "grad_norm": 0.33757397074450257, + "learning_rate": 2.1755676456021454e-06, + "loss": 0.0186, + "step": 6485 + }, + { + "epoch": 2.165970946735682, + "grad_norm": 0.26188053160341745, + "learning_rate": 2.173964281268436e-06, + "loss": 0.0153, + "step": 6486 + }, + { + "epoch": 2.1663048923025547, + "grad_norm": 0.30936393138757456, + "learning_rate": 2.1723613438407e-06, + "loss": 0.0132, + "step": 6487 + }, + { + "epoch": 2.1666388378694275, + "grad_norm": 0.3458044555886651, + "learning_rate": 2.170758833561075e-06, + "loss": 0.016, + "step": 6488 + }, + { + "epoch": 2.1669727834362997, + "grad_norm": 0.31332504262449845, + "learning_rate": 2.1691567506716433e-06, + "loss": 0.01, + "step": 6489 + }, + { + "epoch": 2.1673067290031724, + "grad_norm": 0.32582133447973455, + "learning_rate": 2.1675550954144147e-06, + "loss": 0.0137, + "step": 6490 + }, + { + "epoch": 2.167640674570045, + "grad_norm": 0.2617877479730621, + "learning_rate": 2.1659538680313403e-06, + "loss": 0.0161, + "step": 6491 + }, + { + "epoch": 2.167974620136918, + "grad_norm": 0.3380088068416834, + "learning_rate": 2.1643530687643036e-06, + "loss": 0.0171, + "step": 6492 + }, + { + "epoch": 2.16830856570379, + "grad_norm": 0.484747950505644, + "learning_rate": 2.1627526978551265e-06, + "loss": 0.0187, + "step": 6493 + }, + { + "epoch": 2.168642511270663, + "grad_norm": 0.36607927637539583, + "learning_rate": 2.1611527555455604e-06, + "loss": 0.0152, + "step": 6494 + }, + { + "epoch": 2.1689764568375356, + "grad_norm": 0.2269337363172091, + "learning_rate": 2.159553242077298e-06, + "loss": 0.0122, + "step": 6495 + }, + { + "epoch": 2.169310402404408, + "grad_norm": 0.2705252246172487, + "learning_rate": 2.1579541576919624e-06, + "loss": 0.0147, + "step": 6496 + }, + { + "epoch": 2.1696443479712806, + "grad_norm": 0.46140511901702713, + "learning_rate": 2.1563555026311166e-06, + "loss": 0.0217, + "step": 6497 + }, + { + "epoch": 2.1699782935381533, + "grad_norm": 0.32301551635597753, + "learning_rate": 2.154757277136251e-06, + "loss": 0.0156, + "step": 6498 + }, + { + "epoch": 2.170312239105026, + "grad_norm": 0.25392799693456997, + "learning_rate": 2.153159481448805e-06, + "loss": 0.012, + "step": 6499 + }, + { + "epoch": 2.1706461846718983, + "grad_norm": 0.362154037471294, + "learning_rate": 2.1515621158101372e-06, + "loss": 0.0238, + "step": 6500 + }, + { + "epoch": 2.170980130238771, + "grad_norm": 0.24543762967602123, + "learning_rate": 2.1499651804615534e-06, + "loss": 0.0126, + "step": 6501 + }, + { + "epoch": 2.1713140758056437, + "grad_norm": 0.32792892953249453, + "learning_rate": 2.148368675644285e-06, + "loss": 0.0102, + "step": 6502 + }, + { + "epoch": 2.1716480213725164, + "grad_norm": 0.31516955497983384, + "learning_rate": 2.146772601599507e-06, + "loss": 0.0119, + "step": 6503 + }, + { + "epoch": 2.1719819669393887, + "grad_norm": 0.32905645746958895, + "learning_rate": 2.1451769585683196e-06, + "loss": 0.0162, + "step": 6504 + }, + { + "epoch": 2.1723159125062614, + "grad_norm": 0.2558183977993946, + "learning_rate": 2.14358174679177e-06, + "loss": 0.0126, + "step": 6505 + }, + { + "epoch": 2.172649858073134, + "grad_norm": 0.20034120251384852, + "learning_rate": 2.1419869665108303e-06, + "loss": 0.0092, + "step": 6506 + }, + { + "epoch": 2.172983803640007, + "grad_norm": 0.35462762982860063, + "learning_rate": 2.140392617966412e-06, + "loss": 0.0192, + "step": 6507 + }, + { + "epoch": 2.173317749206879, + "grad_norm": 0.2697304698584604, + "learning_rate": 2.1387987013993583e-06, + "loss": 0.0128, + "step": 6508 + }, + { + "epoch": 2.173651694773752, + "grad_norm": 0.26940891178046966, + "learning_rate": 2.137205217050452e-06, + "loss": 0.0147, + "step": 6509 + }, + { + "epoch": 2.1739856403406246, + "grad_norm": 0.2975749995789792, + "learning_rate": 2.135612165160404e-06, + "loss": 0.0177, + "step": 6510 + }, + { + "epoch": 2.1743195859074973, + "grad_norm": 0.2968568004179634, + "learning_rate": 2.1340195459698653e-06, + "loss": 0.0153, + "step": 6511 + }, + { + "epoch": 2.1746535314743696, + "grad_norm": 0.24343310815444663, + "learning_rate": 2.1324273597194223e-06, + "loss": 0.011, + "step": 6512 + }, + { + "epoch": 2.1749874770412423, + "grad_norm": 0.20837938839405604, + "learning_rate": 2.1308356066495893e-06, + "loss": 0.0087, + "step": 6513 + }, + { + "epoch": 2.175321422608115, + "grad_norm": 0.23374895493572262, + "learning_rate": 2.1292442870008213e-06, + "loss": 0.0112, + "step": 6514 + }, + { + "epoch": 2.1756553681749873, + "grad_norm": 0.35733174135473794, + "learning_rate": 2.1276534010135053e-06, + "loss": 0.0148, + "step": 6515 + }, + { + "epoch": 2.17598931374186, + "grad_norm": 0.2928966567009293, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.0165, + "step": 6516 + }, + { + "epoch": 2.1763232593087327, + "grad_norm": 0.23835990278481167, + "learning_rate": 2.1244729309844564e-06, + "loss": 0.0107, + "step": 6517 + }, + { + "epoch": 2.1766572048756054, + "grad_norm": 0.2586294519234971, + "learning_rate": 2.1228833474231703e-06, + "loss": 0.0107, + "step": 6518 + }, + { + "epoch": 2.1769911504424777, + "grad_norm": 0.30566469616604947, + "learning_rate": 2.1212941984842295e-06, + "loss": 0.0143, + "step": 6519 + }, + { + "epoch": 2.1773250960093504, + "grad_norm": 0.29438725613258704, + "learning_rate": 2.1197054844076975e-06, + "loss": 0.0162, + "step": 6520 + }, + { + "epoch": 2.177659041576223, + "grad_norm": 0.33845121195766575, + "learning_rate": 2.118117205433563e-06, + "loss": 0.0148, + "step": 6521 + }, + { + "epoch": 2.177992987143096, + "grad_norm": 0.2795510097775822, + "learning_rate": 2.1165293618017612e-06, + "loss": 0.0167, + "step": 6522 + }, + { + "epoch": 2.178326932709968, + "grad_norm": 0.29923949401454447, + "learning_rate": 2.1149419537521495e-06, + "loss": 0.0167, + "step": 6523 + }, + { + "epoch": 2.178660878276841, + "grad_norm": 0.31872030152725334, + "learning_rate": 2.1133549815245273e-06, + "loss": 0.0201, + "step": 6524 + }, + { + "epoch": 2.1789948238437136, + "grad_norm": 0.30638147299674867, + "learning_rate": 2.1117684453586236e-06, + "loss": 0.0158, + "step": 6525 + }, + { + "epoch": 2.179328769410586, + "grad_norm": 0.2840187173218848, + "learning_rate": 2.110182345494105e-06, + "loss": 0.0135, + "step": 6526 + }, + { + "epoch": 2.1796627149774586, + "grad_norm": 0.3239255247053515, + "learning_rate": 2.1085966821705662e-06, + "loss": 0.0154, + "step": 6527 + }, + { + "epoch": 2.1799966605443313, + "grad_norm": 0.34214025711008544, + "learning_rate": 2.1070114556275473e-06, + "loss": 0.0144, + "step": 6528 + }, + { + "epoch": 2.180330606111204, + "grad_norm": 0.3442808319286775, + "learning_rate": 2.1054266661045105e-06, + "loss": 0.024, + "step": 6529 + }, + { + "epoch": 2.1806645516780763, + "grad_norm": 0.29995911473421333, + "learning_rate": 2.103842313840859e-06, + "loss": 0.0154, + "step": 6530 + }, + { + "epoch": 2.180998497244949, + "grad_norm": 0.24219781517831027, + "learning_rate": 2.1022583990759265e-06, + "loss": 0.0106, + "step": 6531 + }, + { + "epoch": 2.1813324428118217, + "grad_norm": 0.2571694548375631, + "learning_rate": 2.1006749220489834e-06, + "loss": 0.0114, + "step": 6532 + }, + { + "epoch": 2.1816663883786944, + "grad_norm": 0.25602685008977827, + "learning_rate": 2.0990918829992307e-06, + "loss": 0.0096, + "step": 6533 + }, + { + "epoch": 2.1820003339455667, + "grad_norm": 0.2898189520049827, + "learning_rate": 2.097509282165806e-06, + "loss": 0.0122, + "step": 6534 + }, + { + "epoch": 2.1823342795124394, + "grad_norm": 0.32648803670202425, + "learning_rate": 2.0959271197877816e-06, + "loss": 0.0227, + "step": 6535 + }, + { + "epoch": 2.182668225079312, + "grad_norm": 0.27872952226235354, + "learning_rate": 2.0943453961041587e-06, + "loss": 0.0144, + "step": 6536 + }, + { + "epoch": 2.183002170646185, + "grad_norm": 0.2623052154519515, + "learning_rate": 2.0927641113538764e-06, + "loss": 0.0117, + "step": 6537 + }, + { + "epoch": 2.183336116213057, + "grad_norm": 0.28053985522515906, + "learning_rate": 2.0911832657758086e-06, + "loss": 0.012, + "step": 6538 + }, + { + "epoch": 2.18367006177993, + "grad_norm": 0.22013024035220435, + "learning_rate": 2.089602859608757e-06, + "loss": 0.0107, + "step": 6539 + }, + { + "epoch": 2.1840040073468026, + "grad_norm": 0.298667729904666, + "learning_rate": 2.088022893091462e-06, + "loss": 0.0189, + "step": 6540 + }, + { + "epoch": 2.1843379529136753, + "grad_norm": 0.3372124117393272, + "learning_rate": 2.086443366462598e-06, + "loss": 0.0216, + "step": 6541 + }, + { + "epoch": 2.1846718984805475, + "grad_norm": 0.25179480117190695, + "learning_rate": 2.084864279960768e-06, + "loss": 0.0143, + "step": 6542 + }, + { + "epoch": 2.1850058440474203, + "grad_norm": 0.3368441001252815, + "learning_rate": 2.0832856338245157e-06, + "loss": 0.0162, + "step": 6543 + }, + { + "epoch": 2.185339789614293, + "grad_norm": 0.3397412128187092, + "learning_rate": 2.0817074282923087e-06, + "loss": 0.0125, + "step": 6544 + }, + { + "epoch": 2.1856737351811653, + "grad_norm": 0.2201558676784805, + "learning_rate": 2.080129663602557e-06, + "loss": 0.0111, + "step": 6545 + }, + { + "epoch": 2.186007680748038, + "grad_norm": 0.25782742203882697, + "learning_rate": 2.0785523399935996e-06, + "loss": 0.0115, + "step": 6546 + }, + { + "epoch": 2.1863416263149107, + "grad_norm": 0.431806806501223, + "learning_rate": 2.076975457703712e-06, + "loss": 0.0194, + "step": 6547 + }, + { + "epoch": 2.1866755718817834, + "grad_norm": 0.23690739666194224, + "learning_rate": 2.0753990169710973e-06, + "loss": 0.0098, + "step": 6548 + }, + { + "epoch": 2.1870095174486557, + "grad_norm": 0.2609284901050491, + "learning_rate": 2.0738230180338993e-06, + "loss": 0.0115, + "step": 6549 + }, + { + "epoch": 2.1873434630155284, + "grad_norm": 0.26290840908723817, + "learning_rate": 2.0722474611301868e-06, + "loss": 0.0166, + "step": 6550 + }, + { + "epoch": 2.187677408582401, + "grad_norm": 0.2747028612559764, + "learning_rate": 2.0706723464979687e-06, + "loss": 0.0142, + "step": 6551 + }, + { + "epoch": 2.188011354149274, + "grad_norm": 0.35122297292874205, + "learning_rate": 2.0690976743751844e-06, + "loss": 0.0216, + "step": 6552 + }, + { + "epoch": 2.188345299716146, + "grad_norm": 0.2420889603896751, + "learning_rate": 2.0675234449997085e-06, + "loss": 0.0112, + "step": 6553 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 0.29582887432454824, + "learning_rate": 2.065949658609343e-06, + "loss": 0.0163, + "step": 6554 + }, + { + "epoch": 2.1890131908498915, + "grad_norm": 0.26322567331836094, + "learning_rate": 2.0643763154418304e-06, + "loss": 0.016, + "step": 6555 + }, + { + "epoch": 2.1893471364167643, + "grad_norm": 0.2516184473720276, + "learning_rate": 2.06280341573484e-06, + "loss": 0.0131, + "step": 6556 + }, + { + "epoch": 2.1896810819836365, + "grad_norm": 0.24844362417839239, + "learning_rate": 2.0612309597259776e-06, + "loss": 0.0127, + "step": 6557 + }, + { + "epoch": 2.1900150275505093, + "grad_norm": 0.2824712930720436, + "learning_rate": 2.059658947652784e-06, + "loss": 0.0148, + "step": 6558 + }, + { + "epoch": 2.190348973117382, + "grad_norm": 0.2223810259384668, + "learning_rate": 2.058087379752725e-06, + "loss": 0.012, + "step": 6559 + }, + { + "epoch": 2.1906829186842547, + "grad_norm": 0.33889376103291663, + "learning_rate": 2.056516256263208e-06, + "loss": 0.0141, + "step": 6560 + }, + { + "epoch": 2.191016864251127, + "grad_norm": 0.4351553799654831, + "learning_rate": 2.0549455774215705e-06, + "loss": 0.0215, + "step": 6561 + }, + { + "epoch": 2.1913508098179997, + "grad_norm": 0.23904238429895308, + "learning_rate": 2.0533753434650784e-06, + "loss": 0.0103, + "step": 6562 + }, + { + "epoch": 2.1916847553848724, + "grad_norm": 0.4589855113278374, + "learning_rate": 2.0518055546309362e-06, + "loss": 0.0121, + "step": 6563 + }, + { + "epoch": 2.1920187009517447, + "grad_norm": 0.27028252936995945, + "learning_rate": 2.0502362111562806e-06, + "loss": 0.0126, + "step": 6564 + }, + { + "epoch": 2.1923526465186174, + "grad_norm": 0.21546077643021916, + "learning_rate": 2.048667313278176e-06, + "loss": 0.0103, + "step": 6565 + }, + { + "epoch": 2.19268659208549, + "grad_norm": 0.34168154460500905, + "learning_rate": 2.0470988612336264e-06, + "loss": 0.0121, + "step": 6566 + }, + { + "epoch": 2.193020537652363, + "grad_norm": 0.30497679243547027, + "learning_rate": 2.045530855259561e-06, + "loss": 0.0156, + "step": 6567 + }, + { + "epoch": 2.193354483219235, + "grad_norm": 0.3072013185745378, + "learning_rate": 2.043963295592848e-06, + "loss": 0.0157, + "step": 6568 + }, + { + "epoch": 2.193688428786108, + "grad_norm": 0.3228384471020377, + "learning_rate": 2.042396182470285e-06, + "loss": 0.016, + "step": 6569 + }, + { + "epoch": 2.1940223743529805, + "grad_norm": 0.3016048980193982, + "learning_rate": 2.040829516128605e-06, + "loss": 0.0123, + "step": 6570 + }, + { + "epoch": 2.1943563199198532, + "grad_norm": 0.32031088004186065, + "learning_rate": 2.0392632968044686e-06, + "loss": 0.0192, + "step": 6571 + }, + { + "epoch": 2.1946902654867255, + "grad_norm": 0.2905920654524365, + "learning_rate": 2.0376975247344736e-06, + "loss": 0.0152, + "step": 6572 + }, + { + "epoch": 2.1950242110535982, + "grad_norm": 0.29792214862022215, + "learning_rate": 2.0361322001551466e-06, + "loss": 0.0144, + "step": 6573 + }, + { + "epoch": 2.195358156620471, + "grad_norm": 0.2731855199925054, + "learning_rate": 2.034567323302949e-06, + "loss": 0.0104, + "step": 6574 + }, + { + "epoch": 2.1956921021873432, + "grad_norm": 0.30065602321357754, + "learning_rate": 2.0330028944142736e-06, + "loss": 0.0142, + "step": 6575 + }, + { + "epoch": 2.196026047754216, + "grad_norm": 0.2649649674226487, + "learning_rate": 2.031438913725448e-06, + "loss": 0.0109, + "step": 6576 + }, + { + "epoch": 2.1963599933210887, + "grad_norm": 0.4033152837191047, + "learning_rate": 2.0298753814727267e-06, + "loss": 0.0176, + "step": 6577 + }, + { + "epoch": 2.1966939388879614, + "grad_norm": 0.29091787328055135, + "learning_rate": 2.028312297892303e-06, + "loss": 0.0178, + "step": 6578 + }, + { + "epoch": 2.1970278844548337, + "grad_norm": 0.2882873443886919, + "learning_rate": 2.0267496632202953e-06, + "loss": 0.0148, + "step": 6579 + }, + { + "epoch": 2.1973618300217064, + "grad_norm": 0.34924737508772247, + "learning_rate": 2.0251874776927598e-06, + "loss": 0.0138, + "step": 6580 + }, + { + "epoch": 2.197695775588579, + "grad_norm": 0.2722375379686338, + "learning_rate": 2.0236257415456833e-06, + "loss": 0.012, + "step": 6581 + }, + { + "epoch": 2.198029721155452, + "grad_norm": 0.3017348298669219, + "learning_rate": 2.022064455014986e-06, + "loss": 0.0115, + "step": 6582 + }, + { + "epoch": 2.198363666722324, + "grad_norm": 0.291571694536511, + "learning_rate": 2.0205036183365145e-06, + "loss": 0.0135, + "step": 6583 + }, + { + "epoch": 2.198697612289197, + "grad_norm": 0.22837631193895652, + "learning_rate": 2.018943231746056e-06, + "loss": 0.0143, + "step": 6584 + }, + { + "epoch": 2.1990315578560695, + "grad_norm": 0.32345728882665414, + "learning_rate": 2.0173832954793216e-06, + "loss": 0.0229, + "step": 6585 + }, + { + "epoch": 2.1993655034229422, + "grad_norm": 0.3151011464115596, + "learning_rate": 2.0158238097719597e-06, + "loss": 0.0119, + "step": 6586 + }, + { + "epoch": 2.1996994489898145, + "grad_norm": 0.3343215908705643, + "learning_rate": 2.0142647748595502e-06, + "loss": 0.0197, + "step": 6587 + }, + { + "epoch": 2.2000333945566872, + "grad_norm": 0.40873630806611855, + "learning_rate": 2.0127061909776e-06, + "loss": 0.0225, + "step": 6588 + }, + { + "epoch": 2.20036734012356, + "grad_norm": 0.28452049085939546, + "learning_rate": 2.0111480583615566e-06, + "loss": 0.0138, + "step": 6589 + }, + { + "epoch": 2.2007012856904327, + "grad_norm": 0.29368232293360924, + "learning_rate": 2.00959037724679e-06, + "loss": 0.0151, + "step": 6590 + }, + { + "epoch": 2.201035231257305, + "grad_norm": 0.3738527865316701, + "learning_rate": 2.0080331478686087e-06, + "loss": 0.027, + "step": 6591 + }, + { + "epoch": 2.2013691768241777, + "grad_norm": 0.2954931019724482, + "learning_rate": 2.006476370462247e-06, + "loss": 0.0182, + "step": 6592 + }, + { + "epoch": 2.2017031223910504, + "grad_norm": 0.23965981424437982, + "learning_rate": 2.0049200452628803e-06, + "loss": 0.0113, + "step": 6593 + }, + { + "epoch": 2.2020370679579226, + "grad_norm": 0.2654406426479545, + "learning_rate": 2.0033641725056048e-06, + "loss": 0.0106, + "step": 6594 + }, + { + "epoch": 2.2023710135247954, + "grad_norm": 0.1984422467408501, + "learning_rate": 2.001808752425457e-06, + "loss": 0.0083, + "step": 6595 + }, + { + "epoch": 2.202704959091668, + "grad_norm": 0.27510526024218646, + "learning_rate": 2.000253785257398e-06, + "loss": 0.0151, + "step": 6596 + }, + { + "epoch": 2.203038904658541, + "grad_norm": 0.3277857679527759, + "learning_rate": 1.998699271236326e-06, + "loss": 0.0157, + "step": 6597 + }, + { + "epoch": 2.203372850225413, + "grad_norm": 0.3006917973785943, + "learning_rate": 1.997145210597068e-06, + "loss": 0.0187, + "step": 6598 + }, + { + "epoch": 2.203706795792286, + "grad_norm": 0.2466694275717017, + "learning_rate": 1.9955916035743855e-06, + "loss": 0.0112, + "step": 6599 + }, + { + "epoch": 2.2040407413591585, + "grad_norm": 0.26123473352892995, + "learning_rate": 1.9940384504029647e-06, + "loss": 0.0094, + "step": 6600 + }, + { + "epoch": 2.2043746869260312, + "grad_norm": 0.2895012517364698, + "learning_rate": 1.9924857513174324e-06, + "loss": 0.0129, + "step": 6601 + }, + { + "epoch": 2.2047086324929035, + "grad_norm": 0.2729192583071324, + "learning_rate": 1.990933506552337e-06, + "loss": 0.0106, + "step": 6602 + }, + { + "epoch": 2.205042578059776, + "grad_norm": 0.31083387330033807, + "learning_rate": 1.989381716342167e-06, + "loss": 0.0156, + "step": 6603 + }, + { + "epoch": 2.205376523626649, + "grad_norm": 0.32074137316370627, + "learning_rate": 1.9878303809213367e-06, + "loss": 0.0133, + "step": 6604 + }, + { + "epoch": 2.2057104691935217, + "grad_norm": 0.3382822914612812, + "learning_rate": 1.986279500524197e-06, + "loss": 0.0117, + "step": 6605 + }, + { + "epoch": 2.206044414760394, + "grad_norm": 0.24538926003413847, + "learning_rate": 1.984729075385022e-06, + "loss": 0.0091, + "step": 6606 + }, + { + "epoch": 2.2063783603272666, + "grad_norm": 0.2698418649853723, + "learning_rate": 1.983179105738026e-06, + "loss": 0.0116, + "step": 6607 + }, + { + "epoch": 2.2067123058941394, + "grad_norm": 0.32079606395170346, + "learning_rate": 1.9816295918173462e-06, + "loss": 0.0135, + "step": 6608 + }, + { + "epoch": 2.207046251461012, + "grad_norm": 0.27266592971802145, + "learning_rate": 1.9800805338570562e-06, + "loss": 0.0109, + "step": 6609 + }, + { + "epoch": 2.2073801970278843, + "grad_norm": 0.32347673712171227, + "learning_rate": 1.9785319320911623e-06, + "loss": 0.0145, + "step": 6610 + }, + { + "epoch": 2.207714142594757, + "grad_norm": 0.24981179450852878, + "learning_rate": 1.9769837867535948e-06, + "loss": 0.0132, + "step": 6611 + }, + { + "epoch": 2.20804808816163, + "grad_norm": 0.2783778140995123, + "learning_rate": 1.9754360980782227e-06, + "loss": 0.0141, + "step": 6612 + }, + { + "epoch": 2.208382033728502, + "grad_norm": 0.5386887104628023, + "learning_rate": 1.973888866298839e-06, + "loss": 0.0243, + "step": 6613 + }, + { + "epoch": 2.2087159792953748, + "grad_norm": 0.2840583006409117, + "learning_rate": 1.972342091649176e-06, + "loss": 0.015, + "step": 6614 + }, + { + "epoch": 2.2090499248622475, + "grad_norm": 0.25509634295763384, + "learning_rate": 1.9707957743628854e-06, + "loss": 0.0096, + "step": 6615 + }, + { + "epoch": 2.20938387042912, + "grad_norm": 0.31283208613488706, + "learning_rate": 1.9692499146735646e-06, + "loss": 0.0185, + "step": 6616 + }, + { + "epoch": 2.2097178159959925, + "grad_norm": 0.26442709656835756, + "learning_rate": 1.967704512814728e-06, + "loss": 0.0123, + "step": 6617 + }, + { + "epoch": 2.210051761562865, + "grad_norm": 0.27959036621366046, + "learning_rate": 1.966159569019831e-06, + "loss": 0.0103, + "step": 6618 + }, + { + "epoch": 2.210385707129738, + "grad_norm": 0.26272583128668314, + "learning_rate": 1.9646150835222517e-06, + "loss": 0.0132, + "step": 6619 + }, + { + "epoch": 2.2107196526966106, + "grad_norm": 0.28851377822694657, + "learning_rate": 1.9630710565553063e-06, + "loss": 0.0108, + "step": 6620 + }, + { + "epoch": 2.211053598263483, + "grad_norm": 0.23989503512968924, + "learning_rate": 1.9615274883522327e-06, + "loss": 0.0123, + "step": 6621 + }, + { + "epoch": 2.2113875438303556, + "grad_norm": 0.301524978850988, + "learning_rate": 1.9599843791462123e-06, + "loss": 0.0181, + "step": 6622 + }, + { + "epoch": 2.2117214893972283, + "grad_norm": 0.23221174730581715, + "learning_rate": 1.958441729170345e-06, + "loss": 0.0118, + "step": 6623 + }, + { + "epoch": 2.2120554349641006, + "grad_norm": 0.2665690588320243, + "learning_rate": 1.9568995386576695e-06, + "loss": 0.0111, + "step": 6624 + }, + { + "epoch": 2.2123893805309733, + "grad_norm": 0.22961451400894545, + "learning_rate": 1.9553578078411476e-06, + "loss": 0.0116, + "step": 6625 + }, + { + "epoch": 2.212723326097846, + "grad_norm": 0.26487606187895424, + "learning_rate": 1.953816536953681e-06, + "loss": 0.0101, + "step": 6626 + }, + { + "epoch": 2.2130572716647188, + "grad_norm": 0.27195401533111785, + "learning_rate": 1.95227572622809e-06, + "loss": 0.0132, + "step": 6627 + }, + { + "epoch": 2.213391217231591, + "grad_norm": 0.29527337141955756, + "learning_rate": 1.95073537589714e-06, + "loss": 0.0173, + "step": 6628 + }, + { + "epoch": 2.2137251627984638, + "grad_norm": 0.22944929591469504, + "learning_rate": 1.949195486193514e-06, + "loss": 0.0128, + "step": 6629 + }, + { + "epoch": 2.2140591083653365, + "grad_norm": 0.23158442415002364, + "learning_rate": 1.9476560573498332e-06, + "loss": 0.0108, + "step": 6630 + }, + { + "epoch": 2.214393053932209, + "grad_norm": 0.2807469430714143, + "learning_rate": 1.946117089598644e-06, + "loss": 0.0147, + "step": 6631 + }, + { + "epoch": 2.2147269994990815, + "grad_norm": 0.2590706685467037, + "learning_rate": 1.9445785831724274e-06, + "loss": 0.012, + "step": 6632 + }, + { + "epoch": 2.215060945065954, + "grad_norm": 0.2721975963281122, + "learning_rate": 1.943040538303591e-06, + "loss": 0.014, + "step": 6633 + }, + { + "epoch": 2.215394890632827, + "grad_norm": 0.24478951889187736, + "learning_rate": 1.9415029552244758e-06, + "loss": 0.0131, + "step": 6634 + }, + { + "epoch": 2.2157288361996996, + "grad_norm": 0.37398456573476724, + "learning_rate": 1.939965834167354e-06, + "loss": 0.0248, + "step": 6635 + }, + { + "epoch": 2.216062781766572, + "grad_norm": 0.22501235029711275, + "learning_rate": 1.9384291753644215e-06, + "loss": 0.0118, + "step": 6636 + }, + { + "epoch": 2.2163967273334446, + "grad_norm": 0.2222184565929166, + "learning_rate": 1.9368929790478126e-06, + "loss": 0.0107, + "step": 6637 + }, + { + "epoch": 2.2167306729003173, + "grad_norm": 0.30438470736257894, + "learning_rate": 1.935357245449583e-06, + "loss": 0.0141, + "step": 6638 + }, + { + "epoch": 2.21706461846719, + "grad_norm": 0.3370022723149651, + "learning_rate": 1.9338219748017297e-06, + "loss": 0.0143, + "step": 6639 + }, + { + "epoch": 2.2173985640340623, + "grad_norm": 0.33336519710123375, + "learning_rate": 1.932287167336168e-06, + "loss": 0.0193, + "step": 6640 + }, + { + "epoch": 2.217732509600935, + "grad_norm": 0.2916646274486344, + "learning_rate": 1.9307528232847533e-06, + "loss": 0.014, + "step": 6641 + }, + { + "epoch": 2.2180664551678078, + "grad_norm": 0.25762380383595834, + "learning_rate": 1.9292189428792617e-06, + "loss": 0.0115, + "step": 6642 + }, + { + "epoch": 2.21840040073468, + "grad_norm": 0.36295273735003497, + "learning_rate": 1.927685526351408e-06, + "loss": 0.0201, + "step": 6643 + }, + { + "epoch": 2.2187343463015528, + "grad_norm": 0.26969220260430155, + "learning_rate": 1.9261525739328273e-06, + "loss": 0.0142, + "step": 6644 + }, + { + "epoch": 2.2190682918684255, + "grad_norm": 0.23887853483812735, + "learning_rate": 1.924620085855097e-06, + "loss": 0.0103, + "step": 6645 + }, + { + "epoch": 2.219402237435298, + "grad_norm": 0.3358340061922241, + "learning_rate": 1.923088062349713e-06, + "loss": 0.0172, + "step": 6646 + }, + { + "epoch": 2.2197361830021705, + "grad_norm": 0.27072620057255836, + "learning_rate": 1.9215565036481083e-06, + "loss": 0.0116, + "step": 6647 + }, + { + "epoch": 2.220070128569043, + "grad_norm": 0.2871520252799103, + "learning_rate": 1.920025409981639e-06, + "loss": 0.0157, + "step": 6648 + }, + { + "epoch": 2.220404074135916, + "grad_norm": 0.2663963856217783, + "learning_rate": 1.918494781581599e-06, + "loss": 0.0118, + "step": 6649 + }, + { + "epoch": 2.2207380197027886, + "grad_norm": 0.273154380274795, + "learning_rate": 1.9169646186792025e-06, + "loss": 0.0158, + "step": 6650 + }, + { + "epoch": 2.221071965269661, + "grad_norm": 0.24243409303867358, + "learning_rate": 1.9154349215056052e-06, + "loss": 0.0123, + "step": 6651 + }, + { + "epoch": 2.2214059108365336, + "grad_norm": 0.2917113561997659, + "learning_rate": 1.9139056902918805e-06, + "loss": 0.0164, + "step": 6652 + }, + { + "epoch": 2.2217398564034063, + "grad_norm": 0.2460869043839189, + "learning_rate": 1.912376925269041e-06, + "loss": 0.0085, + "step": 6653 + }, + { + "epoch": 2.222073801970279, + "grad_norm": 0.3132078323676648, + "learning_rate": 1.910848626668021e-06, + "loss": 0.0132, + "step": 6654 + }, + { + "epoch": 2.2224077475371513, + "grad_norm": 0.2701488132093934, + "learning_rate": 1.9093207947196908e-06, + "loss": 0.0157, + "step": 6655 + }, + { + "epoch": 2.222741693104024, + "grad_norm": 0.29168666693425044, + "learning_rate": 1.9077934296548445e-06, + "loss": 0.0153, + "step": 6656 + }, + { + "epoch": 2.2230756386708967, + "grad_norm": 0.2863420988214763, + "learning_rate": 1.9062665317042106e-06, + "loss": 0.0165, + "step": 6657 + }, + { + "epoch": 2.2234095842377695, + "grad_norm": 0.2226448764019503, + "learning_rate": 1.9047401010984456e-06, + "loss": 0.0119, + "step": 6658 + }, + { + "epoch": 2.2237435298046417, + "grad_norm": 0.26354728659444615, + "learning_rate": 1.9032141380681329e-06, + "loss": 0.0094, + "step": 6659 + }, + { + "epoch": 2.2240774753715145, + "grad_norm": 0.2978104319107039, + "learning_rate": 1.9016886428437893e-06, + "loss": 0.0163, + "step": 6660 + }, + { + "epoch": 2.224411420938387, + "grad_norm": 0.3054822295300854, + "learning_rate": 1.9001636156558562e-06, + "loss": 0.0165, + "step": 6661 + }, + { + "epoch": 2.2247453665052594, + "grad_norm": 0.3426405779533461, + "learning_rate": 1.8986390567347085e-06, + "loss": 0.0113, + "step": 6662 + }, + { + "epoch": 2.225079312072132, + "grad_norm": 0.27080000362096207, + "learning_rate": 1.8971149663106482e-06, + "loss": 0.0149, + "step": 6663 + }, + { + "epoch": 2.225413257639005, + "grad_norm": 0.255362712712875, + "learning_rate": 1.8955913446139096e-06, + "loss": 0.0179, + "step": 6664 + }, + { + "epoch": 2.2257472032058776, + "grad_norm": 0.21085822866973594, + "learning_rate": 1.8940681918746495e-06, + "loss": 0.0096, + "step": 6665 + }, + { + "epoch": 2.22608114877275, + "grad_norm": 0.26197638895665815, + "learning_rate": 1.8925455083229622e-06, + "loss": 0.0129, + "step": 6666 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 0.296044755979088, + "learning_rate": 1.891023294188863e-06, + "loss": 0.015, + "step": 6667 + }, + { + "epoch": 2.2267490399064953, + "grad_norm": 0.33847615330580905, + "learning_rate": 1.8895015497023022e-06, + "loss": 0.0174, + "step": 6668 + }, + { + "epoch": 2.227082985473368, + "grad_norm": 0.27161238309725877, + "learning_rate": 1.8879802750931574e-06, + "loss": 0.0131, + "step": 6669 + }, + { + "epoch": 2.2274169310402403, + "grad_norm": 0.2630033886363965, + "learning_rate": 1.886459470591237e-06, + "loss": 0.0184, + "step": 6670 + }, + { + "epoch": 2.227750876607113, + "grad_norm": 0.25221611148972534, + "learning_rate": 1.8849391364262721e-06, + "loss": 0.0102, + "step": 6671 + }, + { + "epoch": 2.2280848221739857, + "grad_norm": 0.2411037462392019, + "learning_rate": 1.883419272827931e-06, + "loss": 0.0119, + "step": 6672 + }, + { + "epoch": 2.228418767740858, + "grad_norm": 0.30842097982208766, + "learning_rate": 1.881899880025802e-06, + "loss": 0.0118, + "step": 6673 + }, + { + "epoch": 2.2287527133077307, + "grad_norm": 0.30269452723790186, + "learning_rate": 1.8803809582494143e-06, + "loss": 0.0132, + "step": 6674 + }, + { + "epoch": 2.2290866588746034, + "grad_norm": 0.2315962193941701, + "learning_rate": 1.878862507728213e-06, + "loss": 0.014, + "step": 6675 + }, + { + "epoch": 2.229420604441476, + "grad_norm": 0.2576225977756485, + "learning_rate": 1.877344528691582e-06, + "loss": 0.0119, + "step": 6676 + }, + { + "epoch": 2.2297545500083484, + "grad_norm": 0.2687141455819776, + "learning_rate": 1.8758270213688263e-06, + "loss": 0.0123, + "step": 6677 + }, + { + "epoch": 2.230088495575221, + "grad_norm": 0.3142648959932829, + "learning_rate": 1.8743099859891866e-06, + "loss": 0.0163, + "step": 6678 + }, + { + "epoch": 2.230422441142094, + "grad_norm": 0.22243140409970208, + "learning_rate": 1.8727934227818255e-06, + "loss": 0.0087, + "step": 6679 + }, + { + "epoch": 2.2307563867089666, + "grad_norm": 0.31782764706341865, + "learning_rate": 1.8712773319758398e-06, + "loss": 0.0175, + "step": 6680 + }, + { + "epoch": 2.231090332275839, + "grad_norm": 0.23012015237340638, + "learning_rate": 1.8697617138002545e-06, + "loss": 0.0113, + "step": 6681 + }, + { + "epoch": 2.2314242778427116, + "grad_norm": 0.2907323691970788, + "learning_rate": 1.8682465684840178e-06, + "loss": 0.0108, + "step": 6682 + }, + { + "epoch": 2.2317582234095843, + "grad_norm": 0.23011189568327983, + "learning_rate": 1.8667318962560137e-06, + "loss": 0.0082, + "step": 6683 + }, + { + "epoch": 2.232092168976457, + "grad_norm": 0.2516721806927547, + "learning_rate": 1.865217697345048e-06, + "loss": 0.0097, + "step": 6684 + }, + { + "epoch": 2.2324261145433293, + "grad_norm": 0.2817613761200602, + "learning_rate": 1.86370397197986e-06, + "loss": 0.012, + "step": 6685 + }, + { + "epoch": 2.232760060110202, + "grad_norm": 0.42020472149998295, + "learning_rate": 1.8621907203891159e-06, + "loss": 0.0237, + "step": 6686 + }, + { + "epoch": 2.2330940056770747, + "grad_norm": 0.43109621633257, + "learning_rate": 1.8606779428014116e-06, + "loss": 0.0168, + "step": 6687 + }, + { + "epoch": 2.2334279512439474, + "grad_norm": 0.3332873336373983, + "learning_rate": 1.8591656394452667e-06, + "loss": 0.0167, + "step": 6688 + }, + { + "epoch": 2.2337618968108197, + "grad_norm": 0.29492481170953533, + "learning_rate": 1.8576538105491359e-06, + "loss": 0.0125, + "step": 6689 + }, + { + "epoch": 2.2340958423776924, + "grad_norm": 0.4297722631827301, + "learning_rate": 1.8561424563413949e-06, + "loss": 0.0137, + "step": 6690 + }, + { + "epoch": 2.234429787944565, + "grad_norm": 0.3126356239382773, + "learning_rate": 1.8546315770503537e-06, + "loss": 0.01, + "step": 6691 + }, + { + "epoch": 2.2347637335114374, + "grad_norm": 0.2659002685378052, + "learning_rate": 1.8531211729042486e-06, + "loss": 0.0119, + "step": 6692 + }, + { + "epoch": 2.23509767907831, + "grad_norm": 0.2515884976630654, + "learning_rate": 1.8516112441312451e-06, + "loss": 0.0102, + "step": 6693 + }, + { + "epoch": 2.235431624645183, + "grad_norm": 0.6584408224358603, + "learning_rate": 1.8501017909594327e-06, + "loss": 0.0292, + "step": 6694 + }, + { + "epoch": 2.2357655702120556, + "grad_norm": 0.32916335912485434, + "learning_rate": 1.8485928136168353e-06, + "loss": 0.0189, + "step": 6695 + }, + { + "epoch": 2.236099515778928, + "grad_norm": 0.30130453888410424, + "learning_rate": 1.8470843123313982e-06, + "loss": 0.0139, + "step": 6696 + }, + { + "epoch": 2.2364334613458006, + "grad_norm": 0.31575108799020773, + "learning_rate": 1.8455762873309995e-06, + "loss": 0.0208, + "step": 6697 + }, + { + "epoch": 2.2367674069126733, + "grad_norm": 0.2634477834138086, + "learning_rate": 1.844068738843446e-06, + "loss": 0.0101, + "step": 6698 + }, + { + "epoch": 2.237101352479546, + "grad_norm": 0.31670778701029184, + "learning_rate": 1.8425616670964702e-06, + "loss": 0.016, + "step": 6699 + }, + { + "epoch": 2.2374352980464183, + "grad_norm": 0.2863805719542064, + "learning_rate": 1.8410550723177306e-06, + "loss": 0.0176, + "step": 6700 + }, + { + "epoch": 2.237769243613291, + "grad_norm": 0.4318660466650565, + "learning_rate": 1.8395489547348193e-06, + "loss": 0.0321, + "step": 6701 + }, + { + "epoch": 2.2381031891801637, + "grad_norm": 0.2541954139214152, + "learning_rate": 1.8380433145752502e-06, + "loss": 0.0181, + "step": 6702 + }, + { + "epoch": 2.2384371347470364, + "grad_norm": 0.25345957940192115, + "learning_rate": 1.8365381520664695e-06, + "loss": 0.0143, + "step": 6703 + }, + { + "epoch": 2.2387710803139087, + "grad_norm": 0.2487211406285926, + "learning_rate": 1.8350334674358505e-06, + "loss": 0.0132, + "step": 6704 + }, + { + "epoch": 2.2391050258807814, + "grad_norm": 0.34324111636550203, + "learning_rate": 1.8335292609106914e-06, + "loss": 0.0147, + "step": 6705 + }, + { + "epoch": 2.239438971447654, + "grad_norm": 0.22447618926400847, + "learning_rate": 1.8320255327182224e-06, + "loss": 0.0108, + "step": 6706 + }, + { + "epoch": 2.239772917014527, + "grad_norm": 0.24868281068883916, + "learning_rate": 1.8305222830855973e-06, + "loss": 0.011, + "step": 6707 + }, + { + "epoch": 2.240106862581399, + "grad_norm": 0.37030940073547103, + "learning_rate": 1.8290195122399007e-06, + "loss": 0.0122, + "step": 6708 + }, + { + "epoch": 2.240440808148272, + "grad_norm": 0.249673743056727, + "learning_rate": 1.8275172204081437e-06, + "loss": 0.0102, + "step": 6709 + }, + { + "epoch": 2.2407747537151446, + "grad_norm": 0.24818510455504927, + "learning_rate": 1.826015407817267e-06, + "loss": 0.0126, + "step": 6710 + }, + { + "epoch": 2.241108699282017, + "grad_norm": 0.3418088337757231, + "learning_rate": 1.8245140746941336e-06, + "loss": 0.0177, + "step": 6711 + }, + { + "epoch": 2.2414426448488896, + "grad_norm": 0.2745073300071627, + "learning_rate": 1.823013221265541e-06, + "loss": 0.013, + "step": 6712 + }, + { + "epoch": 2.2417765904157623, + "grad_norm": 0.4142563888912224, + "learning_rate": 1.8215128477582077e-06, + "loss": 0.0187, + "step": 6713 + }, + { + "epoch": 2.242110535982635, + "grad_norm": 0.3484803774881054, + "learning_rate": 1.8200129543987843e-06, + "loss": 0.0225, + "step": 6714 + }, + { + "epoch": 2.2424444815495073, + "grad_norm": 0.306940495659452, + "learning_rate": 1.818513541413847e-06, + "loss": 0.0199, + "step": 6715 + }, + { + "epoch": 2.24277842711638, + "grad_norm": 0.22152939963141818, + "learning_rate": 1.8170146090299018e-06, + "loss": 0.0096, + "step": 6716 + }, + { + "epoch": 2.2431123726832527, + "grad_norm": 0.2774631148214958, + "learning_rate": 1.8155161574733772e-06, + "loss": 0.0123, + "step": 6717 + }, + { + "epoch": 2.2434463182501254, + "grad_norm": 0.2479872398163057, + "learning_rate": 1.8140181869706341e-06, + "loss": 0.0115, + "step": 6718 + }, + { + "epoch": 2.2437802638169977, + "grad_norm": 0.2731487174337819, + "learning_rate": 1.812520697747956e-06, + "loss": 0.016, + "step": 6719 + }, + { + "epoch": 2.2441142093838704, + "grad_norm": 0.2709296983405702, + "learning_rate": 1.8110236900315582e-06, + "loss": 0.0121, + "step": 6720 + }, + { + "epoch": 2.244448154950743, + "grad_norm": 0.3014712209019079, + "learning_rate": 1.8095271640475802e-06, + "loss": 0.0134, + "step": 6721 + }, + { + "epoch": 2.2447821005176154, + "grad_norm": 0.31515946349656015, + "learning_rate": 1.8080311200220935e-06, + "loss": 0.0127, + "step": 6722 + }, + { + "epoch": 2.245116046084488, + "grad_norm": 0.28613752523689995, + "learning_rate": 1.8065355581810878e-06, + "loss": 0.012, + "step": 6723 + }, + { + "epoch": 2.245449991651361, + "grad_norm": 0.3752827969453257, + "learning_rate": 1.80504047875049e-06, + "loss": 0.0191, + "step": 6724 + }, + { + "epoch": 2.2457839372182336, + "grad_norm": 0.26723048621555223, + "learning_rate": 1.8035458819561453e-06, + "loss": 0.0141, + "step": 6725 + }, + { + "epoch": 2.246117882785106, + "grad_norm": 0.22271638835274343, + "learning_rate": 1.8020517680238326e-06, + "loss": 0.0113, + "step": 6726 + }, + { + "epoch": 2.2464518283519785, + "grad_norm": 0.24665940678808335, + "learning_rate": 1.8005581371792564e-06, + "loss": 0.0115, + "step": 6727 + }, + { + "epoch": 2.2467857739188513, + "grad_norm": 0.3005858141558532, + "learning_rate": 1.799064989648044e-06, + "loss": 0.0129, + "step": 6728 + }, + { + "epoch": 2.247119719485724, + "grad_norm": 0.25348457130729, + "learning_rate": 1.797572325655756e-06, + "loss": 0.0088, + "step": 6729 + }, + { + "epoch": 2.2474536650525963, + "grad_norm": 0.24754307031892908, + "learning_rate": 1.7960801454278742e-06, + "loss": 0.0113, + "step": 6730 + }, + { + "epoch": 2.247787610619469, + "grad_norm": 0.2522656652125552, + "learning_rate": 1.7945884491898119e-06, + "loss": 0.01, + "step": 6731 + }, + { + "epoch": 2.2481215561863417, + "grad_norm": 0.26088498011835287, + "learning_rate": 1.7930972371669064e-06, + "loss": 0.0146, + "step": 6732 + }, + { + "epoch": 2.2484555017532144, + "grad_norm": 0.24391651493052038, + "learning_rate": 1.791606509584425e-06, + "loss": 0.0101, + "step": 6733 + }, + { + "epoch": 2.2487894473200867, + "grad_norm": 0.22394944524557142, + "learning_rate": 1.7901162666675564e-06, + "loss": 0.0093, + "step": 6734 + }, + { + "epoch": 2.2491233928869594, + "grad_norm": 0.370741820473369, + "learning_rate": 1.7886265086414222e-06, + "loss": 0.0241, + "step": 6735 + }, + { + "epoch": 2.249457338453832, + "grad_norm": 0.3234539530147868, + "learning_rate": 1.7871372357310651e-06, + "loss": 0.0184, + "step": 6736 + }, + { + "epoch": 2.249791284020705, + "grad_norm": 0.26048540767066103, + "learning_rate": 1.7856484481614605e-06, + "loss": 0.0137, + "step": 6737 + }, + { + "epoch": 2.250125229587577, + "grad_norm": 0.22129009578567416, + "learning_rate": 1.784160146157502e-06, + "loss": 0.0139, + "step": 6738 + }, + { + "epoch": 2.25045917515445, + "grad_norm": 0.37748350638730316, + "learning_rate": 1.7826723299440224e-06, + "loss": 0.0289, + "step": 6739 + }, + { + "epoch": 2.2507931207213225, + "grad_norm": 0.24091456501707934, + "learning_rate": 1.7811849997457681e-06, + "loss": 0.0144, + "step": 6740 + }, + { + "epoch": 2.251127066288195, + "grad_norm": 0.33289209133700864, + "learning_rate": 1.779698155787422e-06, + "loss": 0.0247, + "step": 6741 + }, + { + "epoch": 2.2514610118550675, + "grad_norm": 0.29840920550364575, + "learning_rate": 1.7782117982935854e-06, + "loss": 0.02, + "step": 6742 + }, + { + "epoch": 2.2517949574219402, + "grad_norm": 0.27802535917937715, + "learning_rate": 1.7767259274887937e-06, + "loss": 0.0163, + "step": 6743 + }, + { + "epoch": 2.252128902988813, + "grad_norm": 0.3311624672647798, + "learning_rate": 1.7752405435975002e-06, + "loss": 0.0159, + "step": 6744 + }, + { + "epoch": 2.2524628485556852, + "grad_norm": 0.2797337923935447, + "learning_rate": 1.7737556468440964e-06, + "loss": 0.0089, + "step": 6745 + }, + { + "epoch": 2.252796794122558, + "grad_norm": 0.28803284395983925, + "learning_rate": 1.7722712374528877e-06, + "loss": 0.0125, + "step": 6746 + }, + { + "epoch": 2.2531307396894307, + "grad_norm": 0.3025555554692831, + "learning_rate": 1.7707873156481158e-06, + "loss": 0.0162, + "step": 6747 + }, + { + "epoch": 2.2534646852563034, + "grad_norm": 0.296463753648962, + "learning_rate": 1.7693038816539416e-06, + "loss": 0.0103, + "step": 6748 + }, + { + "epoch": 2.2537986308231757, + "grad_norm": 0.30204472467144544, + "learning_rate": 1.767820935694457e-06, + "loss": 0.0125, + "step": 6749 + }, + { + "epoch": 2.2541325763900484, + "grad_norm": 0.30579246176501707, + "learning_rate": 1.7663384779936764e-06, + "loss": 0.0158, + "step": 6750 + }, + { + "epoch": 2.254466521956921, + "grad_norm": 0.31808116764866246, + "learning_rate": 1.7648565087755442e-06, + "loss": 0.0152, + "step": 6751 + }, + { + "epoch": 2.2548004675237934, + "grad_norm": 0.31607876908662236, + "learning_rate": 1.76337502826393e-06, + "loss": 0.0122, + "step": 6752 + }, + { + "epoch": 2.255134413090666, + "grad_norm": 0.3078147858767164, + "learning_rate": 1.7618940366826266e-06, + "loss": 0.0152, + "step": 6753 + }, + { + "epoch": 2.255468358657539, + "grad_norm": 0.3184779928496126, + "learning_rate": 1.7604135342553564e-06, + "loss": 0.0199, + "step": 6754 + }, + { + "epoch": 2.2558023042244115, + "grad_norm": 0.29048561038570914, + "learning_rate": 1.7589335212057663e-06, + "loss": 0.0159, + "step": 6755 + }, + { + "epoch": 2.2561362497912842, + "grad_norm": 0.2856138908380283, + "learning_rate": 1.7574539977574323e-06, + "loss": 0.0151, + "step": 6756 + }, + { + "epoch": 2.2564701953581565, + "grad_norm": 0.23988233482648116, + "learning_rate": 1.7559749641338497e-06, + "loss": 0.0138, + "step": 6757 + }, + { + "epoch": 2.2568041409250292, + "grad_norm": 0.2280298279760251, + "learning_rate": 1.7544964205584476e-06, + "loss": 0.0087, + "step": 6758 + }, + { + "epoch": 2.257138086491902, + "grad_norm": 0.24679574669725393, + "learning_rate": 1.7530183672545743e-06, + "loss": 0.0095, + "step": 6759 + }, + { + "epoch": 2.2574720320587742, + "grad_norm": 0.29375683971947864, + "learning_rate": 1.7515408044455102e-06, + "loss": 0.015, + "step": 6760 + }, + { + "epoch": 2.257805977625647, + "grad_norm": 0.28482641688588567, + "learning_rate": 1.7500637323544534e-06, + "loss": 0.0116, + "step": 6761 + }, + { + "epoch": 2.2581399231925197, + "grad_norm": 0.29429403922629754, + "learning_rate": 1.74858715120454e-06, + "loss": 0.0139, + "step": 6762 + }, + { + "epoch": 2.2584738687593924, + "grad_norm": 0.5477196044056735, + "learning_rate": 1.7471110612188203e-06, + "loss": 0.021, + "step": 6763 + }, + { + "epoch": 2.2588078143262647, + "grad_norm": 0.3411849167684714, + "learning_rate": 1.7456354626202775e-06, + "loss": 0.0204, + "step": 6764 + }, + { + "epoch": 2.2591417598931374, + "grad_norm": 0.22717898295287525, + "learning_rate": 1.7441603556318155e-06, + "loss": 0.0107, + "step": 6765 + }, + { + "epoch": 2.25947570546001, + "grad_norm": 0.21815232273039045, + "learning_rate": 1.74268574047627e-06, + "loss": 0.0067, + "step": 6766 + }, + { + "epoch": 2.259809651026883, + "grad_norm": 0.24089500821530055, + "learning_rate": 1.7412116173763931e-06, + "loss": 0.0111, + "step": 6767 + }, + { + "epoch": 2.260143596593755, + "grad_norm": 0.27440544097516567, + "learning_rate": 1.7397379865548758e-06, + "loss": 0.0151, + "step": 6768 + }, + { + "epoch": 2.260477542160628, + "grad_norm": 0.30917108392500187, + "learning_rate": 1.7382648482343229e-06, + "loss": 0.0193, + "step": 6769 + }, + { + "epoch": 2.2608114877275005, + "grad_norm": 0.28570002063275857, + "learning_rate": 1.7367922026372713e-06, + "loss": 0.0172, + "step": 6770 + }, + { + "epoch": 2.261145433294373, + "grad_norm": 0.31167117187193333, + "learning_rate": 1.7353200499861794e-06, + "loss": 0.0189, + "step": 6771 + }, + { + "epoch": 2.2614793788612455, + "grad_norm": 0.22686891157216968, + "learning_rate": 1.733848390503436e-06, + "loss": 0.0101, + "step": 6772 + }, + { + "epoch": 2.261813324428118, + "grad_norm": 0.279376728241647, + "learning_rate": 1.732377224411349e-06, + "loss": 0.0147, + "step": 6773 + }, + { + "epoch": 2.262147269994991, + "grad_norm": 0.31891679122809297, + "learning_rate": 1.7309065519321572e-06, + "loss": 0.0169, + "step": 6774 + }, + { + "epoch": 2.2624812155618637, + "grad_norm": 0.25239017270506375, + "learning_rate": 1.729436373288025e-06, + "loss": 0.0113, + "step": 6775 + }, + { + "epoch": 2.262815161128736, + "grad_norm": 0.32504179795442667, + "learning_rate": 1.7279666887010361e-06, + "loss": 0.0132, + "step": 6776 + }, + { + "epoch": 2.2631491066956086, + "grad_norm": 0.3534521670275748, + "learning_rate": 1.726497498393206e-06, + "loss": 0.0175, + "step": 6777 + }, + { + "epoch": 2.2634830522624814, + "grad_norm": 0.3267800201637023, + "learning_rate": 1.7250288025864747e-06, + "loss": 0.0153, + "step": 6778 + }, + { + "epoch": 2.2638169978293536, + "grad_norm": 0.345343770191701, + "learning_rate": 1.7235606015027029e-06, + "loss": 0.0131, + "step": 6779 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.2891933591093856, + "learning_rate": 1.7220928953636812e-06, + "loss": 0.0109, + "step": 6780 + }, + { + "epoch": 2.264484888963099, + "grad_norm": 0.3172121439753211, + "learning_rate": 1.7206256843911252e-06, + "loss": 0.0164, + "step": 6781 + }, + { + "epoch": 2.264818834529972, + "grad_norm": 0.3066571591895138, + "learning_rate": 1.7191589688066706e-06, + "loss": 0.017, + "step": 6782 + }, + { + "epoch": 2.265152780096844, + "grad_norm": 0.4642163671941375, + "learning_rate": 1.7176927488318868e-06, + "loss": 0.0235, + "step": 6783 + }, + { + "epoch": 2.265486725663717, + "grad_norm": 0.28202389264257427, + "learning_rate": 1.7162270246882595e-06, + "loss": 0.0132, + "step": 6784 + }, + { + "epoch": 2.2658206712305895, + "grad_norm": 0.279908610262276, + "learning_rate": 1.7147617965972052e-06, + "loss": 0.0129, + "step": 6785 + }, + { + "epoch": 2.266154616797462, + "grad_norm": 0.22167584604368668, + "learning_rate": 1.7132970647800639e-06, + "loss": 0.009, + "step": 6786 + }, + { + "epoch": 2.2664885623643345, + "grad_norm": 0.3382200241546776, + "learning_rate": 1.7118328294581028e-06, + "loss": 0.0201, + "step": 6787 + }, + { + "epoch": 2.266822507931207, + "grad_norm": 0.2213767516570011, + "learning_rate": 1.7103690908525072e-06, + "loss": 0.007, + "step": 6788 + }, + { + "epoch": 2.26715645349808, + "grad_norm": 0.36591740613831275, + "learning_rate": 1.7089058491843967e-06, + "loss": 0.0194, + "step": 6789 + }, + { + "epoch": 2.267490399064952, + "grad_norm": 0.29368537764968344, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.0154, + "step": 6790 + }, + { + "epoch": 2.267824344631825, + "grad_norm": 0.4352427002983423, + "learning_rate": 1.7059808575447057e-06, + "loss": 0.0285, + "step": 6791 + }, + { + "epoch": 2.2681582901986976, + "grad_norm": 0.2450375169758912, + "learning_rate": 1.7045191080149815e-06, + "loss": 0.0147, + "step": 6792 + }, + { + "epoch": 2.2684922357655704, + "grad_norm": 0.23659982706957028, + "learning_rate": 1.7030578563064504e-06, + "loss": 0.0136, + "step": 6793 + }, + { + "epoch": 2.2688261813324426, + "grad_norm": 0.2753970165028371, + "learning_rate": 1.7015971026398487e-06, + "loss": 0.0153, + "step": 6794 + }, + { + "epoch": 2.2691601268993153, + "grad_norm": 0.24823421067993465, + "learning_rate": 1.7001368472358442e-06, + "loss": 0.0106, + "step": 6795 + }, + { + "epoch": 2.269494072466188, + "grad_norm": 0.25569894707620683, + "learning_rate": 1.6986770903150213e-06, + "loss": 0.0098, + "step": 6796 + }, + { + "epoch": 2.269828018033061, + "grad_norm": 0.26151944726558574, + "learning_rate": 1.697217832097896e-06, + "loss": 0.0098, + "step": 6797 + }, + { + "epoch": 2.270161963599933, + "grad_norm": 0.3804194185347123, + "learning_rate": 1.6957590728049078e-06, + "loss": 0.0183, + "step": 6798 + }, + { + "epoch": 2.2704959091668058, + "grad_norm": 0.3077685386335056, + "learning_rate": 1.6943008126564164e-06, + "loss": 0.015, + "step": 6799 + }, + { + "epoch": 2.2708298547336785, + "grad_norm": 0.251061230961801, + "learning_rate": 1.6928430518727102e-06, + "loss": 0.0104, + "step": 6800 + }, + { + "epoch": 2.2711638003005508, + "grad_norm": 0.2500679716397881, + "learning_rate": 1.6913857906740033e-06, + "loss": 0.0108, + "step": 6801 + }, + { + "epoch": 2.2714977458674235, + "grad_norm": 0.2668967253339908, + "learning_rate": 1.6899290292804288e-06, + "loss": 0.0138, + "step": 6802 + }, + { + "epoch": 2.271831691434296, + "grad_norm": 0.27352451073187406, + "learning_rate": 1.6884727679120493e-06, + "loss": 0.0123, + "step": 6803 + }, + { + "epoch": 2.272165637001169, + "grad_norm": 0.2851665401691066, + "learning_rate": 1.687017006788852e-06, + "loss": 0.0133, + "step": 6804 + }, + { + "epoch": 2.2724995825680416, + "grad_norm": 0.2827338506966288, + "learning_rate": 1.6855617461307427e-06, + "loss": 0.0179, + "step": 6805 + }, + { + "epoch": 2.272833528134914, + "grad_norm": 0.25163410520849566, + "learning_rate": 1.6841069861575598e-06, + "loss": 0.0126, + "step": 6806 + }, + { + "epoch": 2.2731674737017866, + "grad_norm": 0.2799593201961557, + "learning_rate": 1.6826527270890587e-06, + "loss": 0.0125, + "step": 6807 + }, + { + "epoch": 2.2735014192686593, + "grad_norm": 0.36610892656306426, + "learning_rate": 1.6811989691449232e-06, + "loss": 0.0222, + "step": 6808 + }, + { + "epoch": 2.2738353648355316, + "grad_norm": 0.29707283974348947, + "learning_rate": 1.6797457125447614e-06, + "loss": 0.0132, + "step": 6809 + }, + { + "epoch": 2.2741693104024043, + "grad_norm": 0.2830973072756119, + "learning_rate": 1.678292957508106e-06, + "loss": 0.0176, + "step": 6810 + }, + { + "epoch": 2.274503255969277, + "grad_norm": 0.2518703588316546, + "learning_rate": 1.6768407042544093e-06, + "loss": 0.013, + "step": 6811 + }, + { + "epoch": 2.2748372015361498, + "grad_norm": 0.26358998186935967, + "learning_rate": 1.6753889530030554e-06, + "loss": 0.0108, + "step": 6812 + }, + { + "epoch": 2.275171147103022, + "grad_norm": 0.29076873005784715, + "learning_rate": 1.673937703973344e-06, + "loss": 0.0136, + "step": 6813 + }, + { + "epoch": 2.2755050926698948, + "grad_norm": 0.27140123012108447, + "learning_rate": 1.6724869573845054e-06, + "loss": 0.0095, + "step": 6814 + }, + { + "epoch": 2.2758390382367675, + "grad_norm": 0.3248385125076713, + "learning_rate": 1.6710367134556926e-06, + "loss": 0.0152, + "step": 6815 + }, + { + "epoch": 2.27617298380364, + "grad_norm": 0.42139279076769803, + "learning_rate": 1.6695869724059827e-06, + "loss": 0.0186, + "step": 6816 + }, + { + "epoch": 2.2765069293705125, + "grad_norm": 0.30791526847134654, + "learning_rate": 1.6681377344543737e-06, + "loss": 0.0165, + "step": 6817 + }, + { + "epoch": 2.276840874937385, + "grad_norm": 0.2404710251135645, + "learning_rate": 1.6666889998197927e-06, + "loss": 0.011, + "step": 6818 + }, + { + "epoch": 2.277174820504258, + "grad_norm": 0.296729816970035, + "learning_rate": 1.6652407687210853e-06, + "loss": 0.0143, + "step": 6819 + }, + { + "epoch": 2.27750876607113, + "grad_norm": 0.2629727000599189, + "learning_rate": 1.6637930413770249e-06, + "loss": 0.0102, + "step": 6820 + }, + { + "epoch": 2.277842711638003, + "grad_norm": 0.30968336667798596, + "learning_rate": 1.6623458180063084e-06, + "loss": 0.0124, + "step": 6821 + }, + { + "epoch": 2.2781766572048756, + "grad_norm": 0.23392177958027288, + "learning_rate": 1.6608990988275575e-06, + "loss": 0.0114, + "step": 6822 + }, + { + "epoch": 2.2785106027717483, + "grad_norm": 0.26399538480166607, + "learning_rate": 1.6594528840593128e-06, + "loss": 0.0126, + "step": 6823 + }, + { + "epoch": 2.278844548338621, + "grad_norm": 0.3126256564920955, + "learning_rate": 1.6580071739200448e-06, + "loss": 0.0198, + "step": 6824 + }, + { + "epoch": 2.2791784939054933, + "grad_norm": 0.1980324301544163, + "learning_rate": 1.6565619686281425e-06, + "loss": 0.0086, + "step": 6825 + }, + { + "epoch": 2.279512439472366, + "grad_norm": 0.3800839800161238, + "learning_rate": 1.6551172684019224e-06, + "loss": 0.0219, + "step": 6826 + }, + { + "epoch": 2.2798463850392388, + "grad_norm": 0.34973357722789106, + "learning_rate": 1.6536730734596257e-06, + "loss": 0.0222, + "step": 6827 + }, + { + "epoch": 2.280180330606111, + "grad_norm": 0.3159132113182457, + "learning_rate": 1.652229384019411e-06, + "loss": 0.0142, + "step": 6828 + }, + { + "epoch": 2.2805142761729837, + "grad_norm": 0.28538657841798853, + "learning_rate": 1.650786200299368e-06, + "loss": 0.0143, + "step": 6829 + }, + { + "epoch": 2.2808482217398565, + "grad_norm": 0.383727581012012, + "learning_rate": 1.6493435225175042e-06, + "loss": 0.0165, + "step": 6830 + }, + { + "epoch": 2.281182167306729, + "grad_norm": 0.2018080194084335, + "learning_rate": 1.6479013508917552e-06, + "loss": 0.0079, + "step": 6831 + }, + { + "epoch": 2.2815161128736015, + "grad_norm": 0.2674727772870895, + "learning_rate": 1.6464596856399734e-06, + "loss": 0.0176, + "step": 6832 + }, + { + "epoch": 2.281850058440474, + "grad_norm": 0.23790251702327064, + "learning_rate": 1.6450185269799462e-06, + "loss": 0.0101, + "step": 6833 + }, + { + "epoch": 2.282184004007347, + "grad_norm": 0.2611975481305445, + "learning_rate": 1.6435778751293723e-06, + "loss": 0.0138, + "step": 6834 + }, + { + "epoch": 2.2825179495742196, + "grad_norm": 0.20370042032538926, + "learning_rate": 1.6421377303058829e-06, + "loss": 0.0103, + "step": 6835 + }, + { + "epoch": 2.282851895141092, + "grad_norm": 0.32989165343685595, + "learning_rate": 1.640698092727025e-06, + "loss": 0.0115, + "step": 6836 + }, + { + "epoch": 2.2831858407079646, + "grad_norm": 0.2379040986628446, + "learning_rate": 1.639258962610275e-06, + "loss": 0.0106, + "step": 6837 + }, + { + "epoch": 2.2835197862748373, + "grad_norm": 0.2680402438125095, + "learning_rate": 1.6378203401730303e-06, + "loss": 0.015, + "step": 6838 + }, + { + "epoch": 2.2838537318417096, + "grad_norm": 0.32978647443444303, + "learning_rate": 1.6363822256326128e-06, + "loss": 0.0162, + "step": 6839 + }, + { + "epoch": 2.2841876774085823, + "grad_norm": 0.32164075396309677, + "learning_rate": 1.6349446192062635e-06, + "loss": 0.0152, + "step": 6840 + }, + { + "epoch": 2.284521622975455, + "grad_norm": 0.3271033440192926, + "learning_rate": 1.633507521111154e-06, + "loss": 0.0129, + "step": 6841 + }, + { + "epoch": 2.2848555685423277, + "grad_norm": 0.2508946834293806, + "learning_rate": 1.6320709315643708e-06, + "loss": 0.014, + "step": 6842 + }, + { + "epoch": 2.2851895141092, + "grad_norm": 0.2986067958825717, + "learning_rate": 1.6306348507829294e-06, + "loss": 0.0171, + "step": 6843 + }, + { + "epoch": 2.2855234596760727, + "grad_norm": 0.31711981611601764, + "learning_rate": 1.6291992789837669e-06, + "loss": 0.0142, + "step": 6844 + }, + { + "epoch": 2.2858574052429455, + "grad_norm": 0.3179418395778233, + "learning_rate": 1.6277642163837444e-06, + "loss": 0.0162, + "step": 6845 + }, + { + "epoch": 2.286191350809818, + "grad_norm": 0.27948032372466824, + "learning_rate": 1.6263296631996422e-06, + "loss": 0.0126, + "step": 6846 + }, + { + "epoch": 2.2865252963766904, + "grad_norm": 0.2152095919458289, + "learning_rate": 1.6248956196481701e-06, + "loss": 0.0107, + "step": 6847 + }, + { + "epoch": 2.286859241943563, + "grad_norm": 0.3129421993487875, + "learning_rate": 1.6234620859459537e-06, + "loss": 0.0187, + "step": 6848 + }, + { + "epoch": 2.287193187510436, + "grad_norm": 0.29263364992986585, + "learning_rate": 1.6220290623095463e-06, + "loss": 0.0124, + "step": 6849 + }, + { + "epoch": 2.287527133077308, + "grad_norm": 0.3180497394134938, + "learning_rate": 1.6205965489554248e-06, + "loss": 0.0168, + "step": 6850 + }, + { + "epoch": 2.287861078644181, + "grad_norm": 0.32067412381710797, + "learning_rate": 1.619164546099985e-06, + "loss": 0.0253, + "step": 6851 + }, + { + "epoch": 2.2881950242110536, + "grad_norm": 0.23473276148048866, + "learning_rate": 1.6177330539595493e-06, + "loss": 0.01, + "step": 6852 + }, + { + "epoch": 2.2885289697779263, + "grad_norm": 0.2160113689800733, + "learning_rate": 1.6163020727503592e-06, + "loss": 0.0112, + "step": 6853 + }, + { + "epoch": 2.288862915344799, + "grad_norm": 0.3801212896786191, + "learning_rate": 1.6148716026885847e-06, + "loss": 0.0218, + "step": 6854 + }, + { + "epoch": 2.2891968609116713, + "grad_norm": 0.3801251650044196, + "learning_rate": 1.61344164399031e-06, + "loss": 0.0207, + "step": 6855 + }, + { + "epoch": 2.289530806478544, + "grad_norm": 0.25983595677989696, + "learning_rate": 1.6120121968715535e-06, + "loss": 0.0102, + "step": 6856 + }, + { + "epoch": 2.2898647520454167, + "grad_norm": 0.2972474075437074, + "learning_rate": 1.6105832615482453e-06, + "loss": 0.0161, + "step": 6857 + }, + { + "epoch": 2.290198697612289, + "grad_norm": 0.2326329260414521, + "learning_rate": 1.609154838236246e-06, + "loss": 0.0117, + "step": 6858 + }, + { + "epoch": 2.2905326431791617, + "grad_norm": 0.2507480605395413, + "learning_rate": 1.6077269271513328e-06, + "loss": 0.0125, + "step": 6859 + }, + { + "epoch": 2.2908665887460344, + "grad_norm": 0.290488371092609, + "learning_rate": 1.606299528509212e-06, + "loss": 0.0175, + "step": 6860 + }, + { + "epoch": 2.291200534312907, + "grad_norm": 0.2883163507713603, + "learning_rate": 1.604872642525503e-06, + "loss": 0.0111, + "step": 6861 + }, + { + "epoch": 2.2915344798797794, + "grad_norm": 0.2844972972797543, + "learning_rate": 1.6034462694157615e-06, + "loss": 0.0174, + "step": 6862 + }, + { + "epoch": 2.291868425446652, + "grad_norm": 0.33202652174069175, + "learning_rate": 1.6020204093954523e-06, + "loss": 0.0163, + "step": 6863 + }, + { + "epoch": 2.292202371013525, + "grad_norm": 0.2275984281278442, + "learning_rate": 1.6005950626799716e-06, + "loss": 0.012, + "step": 6864 + }, + { + "epoch": 2.2925363165803976, + "grad_norm": 0.3253942305308068, + "learning_rate": 1.5991702294846318e-06, + "loss": 0.0127, + "step": 6865 + }, + { + "epoch": 2.29287026214727, + "grad_norm": 0.3488086489966652, + "learning_rate": 1.597745910024674e-06, + "loss": 0.0189, + "step": 6866 + }, + { + "epoch": 2.2932042077141426, + "grad_norm": 0.317684008652152, + "learning_rate": 1.5963221045152537e-06, + "loss": 0.0149, + "step": 6867 + }, + { + "epoch": 2.2935381532810153, + "grad_norm": 0.28305997131249266, + "learning_rate": 1.5948988131714594e-06, + "loss": 0.0133, + "step": 6868 + }, + { + "epoch": 2.2938720988478876, + "grad_norm": 0.291190951166451, + "learning_rate": 1.593476036208292e-06, + "loss": 0.0108, + "step": 6869 + }, + { + "epoch": 2.2942060444147603, + "grad_norm": 0.33410447493705175, + "learning_rate": 1.5920537738406811e-06, + "loss": 0.015, + "step": 6870 + }, + { + "epoch": 2.294539989981633, + "grad_norm": 0.29687307023421855, + "learning_rate": 1.5906320262834735e-06, + "loss": 0.0082, + "step": 6871 + }, + { + "epoch": 2.2948739355485057, + "grad_norm": 0.3159385432404704, + "learning_rate": 1.5892107937514424e-06, + "loss": 0.0194, + "step": 6872 + }, + { + "epoch": 2.2952078811153784, + "grad_norm": 0.2783235833559468, + "learning_rate": 1.587790076459283e-06, + "loss": 0.0116, + "step": 6873 + }, + { + "epoch": 2.2955418266822507, + "grad_norm": 0.2997547526346615, + "learning_rate": 1.5863698746216082e-06, + "loss": 0.0137, + "step": 6874 + }, + { + "epoch": 2.2958757722491234, + "grad_norm": 0.31551977953032, + "learning_rate": 1.58495018845296e-06, + "loss": 0.014, + "step": 6875 + }, + { + "epoch": 2.296209717815996, + "grad_norm": 0.2157668011951437, + "learning_rate": 1.5835310181677954e-06, + "loss": 0.0127, + "step": 6876 + }, + { + "epoch": 2.2965436633828684, + "grad_norm": 0.29478318697081235, + "learning_rate": 1.5821123639804992e-06, + "loss": 0.0133, + "step": 6877 + }, + { + "epoch": 2.296877608949741, + "grad_norm": 0.22043845791969274, + "learning_rate": 1.5806942261053715e-06, + "loss": 0.0098, + "step": 6878 + }, + { + "epoch": 2.297211554516614, + "grad_norm": 0.27271994474637484, + "learning_rate": 1.5792766047566455e-06, + "loss": 0.0134, + "step": 6879 + }, + { + "epoch": 2.2975455000834866, + "grad_norm": 0.24570288637658616, + "learning_rate": 1.5778595001484648e-06, + "loss": 0.0119, + "step": 6880 + }, + { + "epoch": 2.297879445650359, + "grad_norm": 0.4306132286684389, + "learning_rate": 1.5764429124949022e-06, + "loss": 0.0267, + "step": 6881 + }, + { + "epoch": 2.2982133912172316, + "grad_norm": 0.2673610459356278, + "learning_rate": 1.5750268420099468e-06, + "loss": 0.011, + "step": 6882 + }, + { + "epoch": 2.2985473367841043, + "grad_norm": 0.29930055585529564, + "learning_rate": 1.5736112889075167e-06, + "loss": 0.0128, + "step": 6883 + }, + { + "epoch": 2.298881282350977, + "grad_norm": 0.27314842131745765, + "learning_rate": 1.5721962534014424e-06, + "loss": 0.0148, + "step": 6884 + }, + { + "epoch": 2.2992152279178493, + "grad_norm": 0.2539822566919215, + "learning_rate": 1.5707817357054882e-06, + "loss": 0.0112, + "step": 6885 + }, + { + "epoch": 2.299549173484722, + "grad_norm": 0.3423133751705591, + "learning_rate": 1.5693677360333293e-06, + "loss": 0.0117, + "step": 6886 + }, + { + "epoch": 2.2998831190515947, + "grad_norm": 0.2775115327127339, + "learning_rate": 1.56795425459857e-06, + "loss": 0.0119, + "step": 6887 + }, + { + "epoch": 2.300217064618467, + "grad_norm": 0.3363374162676071, + "learning_rate": 1.5665412916147298e-06, + "loss": 0.018, + "step": 6888 + }, + { + "epoch": 2.3005510101853397, + "grad_norm": 0.23109839671365773, + "learning_rate": 1.5651288472952564e-06, + "loss": 0.011, + "step": 6889 + }, + { + "epoch": 2.3008849557522124, + "grad_norm": 0.39509648574155504, + "learning_rate": 1.563716921853512e-06, + "loss": 0.0196, + "step": 6890 + }, + { + "epoch": 2.301218901319085, + "grad_norm": 0.30126459375211606, + "learning_rate": 1.562305515502791e-06, + "loss": 0.0143, + "step": 6891 + }, + { + "epoch": 2.3015528468859574, + "grad_norm": 0.2832106918828107, + "learning_rate": 1.5608946284562977e-06, + "loss": 0.0141, + "step": 6892 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 0.33834053618681464, + "learning_rate": 1.559484260927166e-06, + "loss": 0.0184, + "step": 6893 + }, + { + "epoch": 2.302220738019703, + "grad_norm": 0.32368834968773247, + "learning_rate": 1.5580744131284464e-06, + "loss": 0.0188, + "step": 6894 + }, + { + "epoch": 2.3025546835865756, + "grad_norm": 0.2713427793874071, + "learning_rate": 1.5566650852731151e-06, + "loss": 0.0082, + "step": 6895 + }, + { + "epoch": 2.302888629153448, + "grad_norm": 0.19785854455069538, + "learning_rate": 1.5552562775740654e-06, + "loss": 0.0062, + "step": 6896 + }, + { + "epoch": 2.3032225747203205, + "grad_norm": 0.3499223117876873, + "learning_rate": 1.5538479902441156e-06, + "loss": 0.023, + "step": 6897 + }, + { + "epoch": 2.3035565202871933, + "grad_norm": 0.35008911224844896, + "learning_rate": 1.5524402234960056e-06, + "loss": 0.0168, + "step": 6898 + }, + { + "epoch": 2.3038904658540655, + "grad_norm": 0.3280570545798395, + "learning_rate": 1.5510329775423916e-06, + "loss": 0.0188, + "step": 6899 + }, + { + "epoch": 2.3042244114209383, + "grad_norm": 0.309890201802014, + "learning_rate": 1.5496262525958583e-06, + "loss": 0.0129, + "step": 6900 + }, + { + "epoch": 2.304558356987811, + "grad_norm": 0.22455359922836288, + "learning_rate": 1.5482200488689054e-06, + "loss": 0.0091, + "step": 6901 + }, + { + "epoch": 2.3048923025546837, + "grad_norm": 0.359645016511295, + "learning_rate": 1.5468143665739565e-06, + "loss": 0.0131, + "step": 6902 + }, + { + "epoch": 2.3052262481215564, + "grad_norm": 0.3260309762792919, + "learning_rate": 1.5454092059233583e-06, + "loss": 0.0147, + "step": 6903 + }, + { + "epoch": 2.3055601936884287, + "grad_norm": 0.28768781892352635, + "learning_rate": 1.5440045671293774e-06, + "loss": 0.0109, + "step": 6904 + }, + { + "epoch": 2.3058941392553014, + "grad_norm": 0.26557102939232013, + "learning_rate": 1.542600450404198e-06, + "loss": 0.0147, + "step": 6905 + }, + { + "epoch": 2.306228084822174, + "grad_norm": 0.3395811271144134, + "learning_rate": 1.5411968559599317e-06, + "loss": 0.0187, + "step": 6906 + }, + { + "epoch": 2.3065620303890464, + "grad_norm": 0.3522346805217385, + "learning_rate": 1.5397937840086048e-06, + "loss": 0.0198, + "step": 6907 + }, + { + "epoch": 2.306895975955919, + "grad_norm": 0.22118044128481698, + "learning_rate": 1.5383912347621693e-06, + "loss": 0.0093, + "step": 6908 + }, + { + "epoch": 2.307229921522792, + "grad_norm": 0.3118623337917481, + "learning_rate": 1.5369892084324972e-06, + "loss": 0.0112, + "step": 6909 + }, + { + "epoch": 2.3075638670896645, + "grad_norm": 0.2884975298254704, + "learning_rate": 1.5355877052313822e-06, + "loss": 0.0161, + "step": 6910 + }, + { + "epoch": 2.307897812656537, + "grad_norm": 0.28911069870718353, + "learning_rate": 1.534186725370535e-06, + "loss": 0.0129, + "step": 6911 + }, + { + "epoch": 2.3082317582234095, + "grad_norm": 0.31201033265750977, + "learning_rate": 1.532786269061593e-06, + "loss": 0.0115, + "step": 6912 + }, + { + "epoch": 2.3085657037902823, + "grad_norm": 0.3165311675238296, + "learning_rate": 1.531386336516107e-06, + "loss": 0.015, + "step": 6913 + }, + { + "epoch": 2.308899649357155, + "grad_norm": 0.29782598920207437, + "learning_rate": 1.52998692794556e-06, + "loss": 0.0091, + "step": 6914 + }, + { + "epoch": 2.3092335949240272, + "grad_norm": 0.2584111156420597, + "learning_rate": 1.5285880435613438e-06, + "loss": 0.0105, + "step": 6915 + }, + { + "epoch": 2.3095675404909, + "grad_norm": 0.3932468032770264, + "learning_rate": 1.5271896835747795e-06, + "loss": 0.0256, + "step": 6916 + }, + { + "epoch": 2.3099014860577727, + "grad_norm": 0.2470975333731172, + "learning_rate": 1.5257918481971028e-06, + "loss": 0.0131, + "step": 6917 + }, + { + "epoch": 2.310235431624645, + "grad_norm": 0.24886696312141146, + "learning_rate": 1.524394537639477e-06, + "loss": 0.0097, + "step": 6918 + }, + { + "epoch": 2.3105693771915177, + "grad_norm": 0.27187073985292953, + "learning_rate": 1.5229977521129785e-06, + "loss": 0.0129, + "step": 6919 + }, + { + "epoch": 2.3109033227583904, + "grad_norm": 0.3119106316207161, + "learning_rate": 1.5216014918286097e-06, + "loss": 0.0142, + "step": 6920 + }, + { + "epoch": 2.311237268325263, + "grad_norm": 0.3043107936059499, + "learning_rate": 1.5202057569972945e-06, + "loss": 0.015, + "step": 6921 + }, + { + "epoch": 2.311571213892136, + "grad_norm": 0.24181495314185136, + "learning_rate": 1.518810547829871e-06, + "loss": 0.0079, + "step": 6922 + }, + { + "epoch": 2.311905159459008, + "grad_norm": 0.379917378957993, + "learning_rate": 1.517415864537105e-06, + "loss": 0.0153, + "step": 6923 + }, + { + "epoch": 2.312239105025881, + "grad_norm": 0.38406960085102787, + "learning_rate": 1.516021707329678e-06, + "loss": 0.02, + "step": 6924 + }, + { + "epoch": 2.3125730505927535, + "grad_norm": 0.2261195871270627, + "learning_rate": 1.5146280764181942e-06, + "loss": 0.014, + "step": 6925 + }, + { + "epoch": 2.312906996159626, + "grad_norm": 0.3532238162584895, + "learning_rate": 1.5132349720131783e-06, + "loss": 0.0216, + "step": 6926 + }, + { + "epoch": 2.3132409417264985, + "grad_norm": 0.31298472699062174, + "learning_rate": 1.511842394325077e-06, + "loss": 0.0113, + "step": 6927 + }, + { + "epoch": 2.3135748872933712, + "grad_norm": 0.3177922138619716, + "learning_rate": 1.5104503435642526e-06, + "loss": 0.0156, + "step": 6928 + }, + { + "epoch": 2.313908832860244, + "grad_norm": 0.2768380147428541, + "learning_rate": 1.5090588199409927e-06, + "loss": 0.0137, + "step": 6929 + }, + { + "epoch": 2.3142427784271162, + "grad_norm": 0.33564312248612466, + "learning_rate": 1.5076678236655018e-06, + "loss": 0.0191, + "step": 6930 + }, + { + "epoch": 2.314576723993989, + "grad_norm": 0.3354268527242691, + "learning_rate": 1.5062773549479064e-06, + "loss": 0.0173, + "step": 6931 + }, + { + "epoch": 2.3149106695608617, + "grad_norm": 0.25080284224765265, + "learning_rate": 1.504887413998254e-06, + "loss": 0.0124, + "step": 6932 + }, + { + "epoch": 2.3152446151277344, + "grad_norm": 0.28694018057723303, + "learning_rate": 1.5034980010265127e-06, + "loss": 0.0124, + "step": 6933 + }, + { + "epoch": 2.3155785606946067, + "grad_norm": 0.3303089047311808, + "learning_rate": 1.5021091162425672e-06, + "loss": 0.0141, + "step": 6934 + }, + { + "epoch": 2.3159125062614794, + "grad_norm": 0.27309001326669086, + "learning_rate": 1.5007207598562268e-06, + "loss": 0.0175, + "step": 6935 + }, + { + "epoch": 2.316246451828352, + "grad_norm": 0.3623573581505953, + "learning_rate": 1.4993329320772177e-06, + "loss": 0.0245, + "step": 6936 + }, + { + "epoch": 2.3165803973952244, + "grad_norm": 0.27206488443658156, + "learning_rate": 1.4979456331151875e-06, + "loss": 0.0176, + "step": 6937 + }, + { + "epoch": 2.316914342962097, + "grad_norm": 0.561266236606752, + "learning_rate": 1.4965588631797052e-06, + "loss": 0.0149, + "step": 6938 + }, + { + "epoch": 2.31724828852897, + "grad_norm": 0.23942318099071092, + "learning_rate": 1.4951726224802593e-06, + "loss": 0.011, + "step": 6939 + }, + { + "epoch": 2.3175822340958425, + "grad_norm": 0.251998175067427, + "learning_rate": 1.493786911226256e-06, + "loss": 0.0179, + "step": 6940 + }, + { + "epoch": 2.317916179662715, + "grad_norm": 0.276215128862187, + "learning_rate": 1.492401729627025e-06, + "loss": 0.0116, + "step": 6941 + }, + { + "epoch": 2.3182501252295875, + "grad_norm": 0.2651607229741079, + "learning_rate": 1.491017077891812e-06, + "loss": 0.0148, + "step": 6942 + }, + { + "epoch": 2.3185840707964602, + "grad_norm": 0.25248514184534754, + "learning_rate": 1.4896329562297863e-06, + "loss": 0.009, + "step": 6943 + }, + { + "epoch": 2.318918016363333, + "grad_norm": 0.2548273018450235, + "learning_rate": 1.4882493648500373e-06, + "loss": 0.0098, + "step": 6944 + }, + { + "epoch": 2.319251961930205, + "grad_norm": 0.3088948076685334, + "learning_rate": 1.48686630396157e-06, + "loss": 0.0174, + "step": 6945 + }, + { + "epoch": 2.319585907497078, + "grad_norm": 0.30576510424073655, + "learning_rate": 1.4854837737733147e-06, + "loss": 0.0206, + "step": 6946 + }, + { + "epoch": 2.3199198530639507, + "grad_norm": 0.2979807996026274, + "learning_rate": 1.484101774494116e-06, + "loss": 0.0148, + "step": 6947 + }, + { + "epoch": 2.320253798630823, + "grad_norm": 0.2565580544831514, + "learning_rate": 1.4827203063327427e-06, + "loss": 0.0136, + "step": 6948 + }, + { + "epoch": 2.3205877441976956, + "grad_norm": 0.2731189156615786, + "learning_rate": 1.4813393694978812e-06, + "loss": 0.0136, + "step": 6949 + }, + { + "epoch": 2.3209216897645684, + "grad_norm": 0.2849877677152315, + "learning_rate": 1.479958964198141e-06, + "loss": 0.0126, + "step": 6950 + }, + { + "epoch": 2.321255635331441, + "grad_norm": 0.42411171708168294, + "learning_rate": 1.4785790906420445e-06, + "loss": 0.0188, + "step": 6951 + }, + { + "epoch": 2.321589580898314, + "grad_norm": 0.2441349390152868, + "learning_rate": 1.4771997490380414e-06, + "loss": 0.0154, + "step": 6952 + }, + { + "epoch": 2.321923526465186, + "grad_norm": 0.33877273899911525, + "learning_rate": 1.4758209395944945e-06, + "loss": 0.0202, + "step": 6953 + }, + { + "epoch": 2.322257472032059, + "grad_norm": 0.32485886612406134, + "learning_rate": 1.47444266251969e-06, + "loss": 0.0084, + "step": 6954 + }, + { + "epoch": 2.3225914175989315, + "grad_norm": 0.3386213394227128, + "learning_rate": 1.4730649180218337e-06, + "loss": 0.0166, + "step": 6955 + }, + { + "epoch": 2.322925363165804, + "grad_norm": 0.22010222064052062, + "learning_rate": 1.4716877063090517e-06, + "loss": 0.0092, + "step": 6956 + }, + { + "epoch": 2.3232593087326765, + "grad_norm": 0.2629499196798745, + "learning_rate": 1.4703110275893846e-06, + "loss": 0.0156, + "step": 6957 + }, + { + "epoch": 2.323593254299549, + "grad_norm": 0.2552628821030842, + "learning_rate": 1.4689348820707988e-06, + "loss": 0.0125, + "step": 6958 + }, + { + "epoch": 2.323927199866422, + "grad_norm": 0.265968468931718, + "learning_rate": 1.4675592699611741e-06, + "loss": 0.0094, + "step": 6959 + }, + { + "epoch": 2.324261145433294, + "grad_norm": 0.35108506391580413, + "learning_rate": 1.4661841914683156e-06, + "loss": 0.0233, + "step": 6960 + }, + { + "epoch": 2.324595091000167, + "grad_norm": 0.28573449329162687, + "learning_rate": 1.464809646799944e-06, + "loss": 0.0137, + "step": 6961 + }, + { + "epoch": 2.3249290365670396, + "grad_norm": 0.24620526100717358, + "learning_rate": 1.463435636163702e-06, + "loss": 0.0189, + "step": 6962 + }, + { + "epoch": 2.3252629821339124, + "grad_norm": 0.28282179128012264, + "learning_rate": 1.4620621597671476e-06, + "loss": 0.0167, + "step": 6963 + }, + { + "epoch": 2.3255969277007846, + "grad_norm": 0.28613556450697475, + "learning_rate": 1.4606892178177633e-06, + "loss": 0.0156, + "step": 6964 + }, + { + "epoch": 2.3259308732676574, + "grad_norm": 0.24834290335390774, + "learning_rate": 1.459316810522945e-06, + "loss": 0.0124, + "step": 6965 + }, + { + "epoch": 2.32626481883453, + "grad_norm": 0.3049774665147699, + "learning_rate": 1.457944938090013e-06, + "loss": 0.0162, + "step": 6966 + }, + { + "epoch": 2.3265987644014023, + "grad_norm": 0.29986365571686485, + "learning_rate": 1.456573600726206e-06, + "loss": 0.017, + "step": 6967 + }, + { + "epoch": 2.326932709968275, + "grad_norm": 0.28644245612453273, + "learning_rate": 1.4552027986386775e-06, + "loss": 0.0125, + "step": 6968 + }, + { + "epoch": 2.3272666555351478, + "grad_norm": 0.28840902391683015, + "learning_rate": 1.453832532034506e-06, + "loss": 0.024, + "step": 6969 + }, + { + "epoch": 2.3276006011020205, + "grad_norm": 0.28433771644705647, + "learning_rate": 1.4524628011206843e-06, + "loss": 0.0156, + "step": 6970 + }, + { + "epoch": 2.327934546668893, + "grad_norm": 0.38140193932310595, + "learning_rate": 1.4510936061041269e-06, + "loss": 0.0199, + "step": 6971 + }, + { + "epoch": 2.3282684922357655, + "grad_norm": 0.19225305041517793, + "learning_rate": 1.449724947191668e-06, + "loss": 0.0061, + "step": 6972 + }, + { + "epoch": 2.328602437802638, + "grad_norm": 0.28570383362481183, + "learning_rate": 1.4483568245900597e-06, + "loss": 0.0154, + "step": 6973 + }, + { + "epoch": 2.328936383369511, + "grad_norm": 0.2903318790132737, + "learning_rate": 1.4469892385059713e-06, + "loss": 0.0136, + "step": 6974 + }, + { + "epoch": 2.329270328936383, + "grad_norm": 0.2538408576499128, + "learning_rate": 1.4456221891459953e-06, + "loss": 0.0093, + "step": 6975 + }, + { + "epoch": 2.329604274503256, + "grad_norm": 0.2874971317361385, + "learning_rate": 1.4442556767166371e-06, + "loss": 0.0131, + "step": 6976 + }, + { + "epoch": 2.3299382200701286, + "grad_norm": 0.24425712032382957, + "learning_rate": 1.4428897014243288e-06, + "loss": 0.0109, + "step": 6977 + }, + { + "epoch": 2.3302721656370013, + "grad_norm": 0.25296973405459755, + "learning_rate": 1.4415242634754107e-06, + "loss": 0.0123, + "step": 6978 + }, + { + "epoch": 2.3306061112038736, + "grad_norm": 0.2320438547831141, + "learning_rate": 1.4401593630761562e-06, + "loss": 0.012, + "step": 6979 + }, + { + "epoch": 2.3309400567707463, + "grad_norm": 0.3235855326462055, + "learning_rate": 1.4387950004327434e-06, + "loss": 0.013, + "step": 6980 + }, + { + "epoch": 2.331274002337619, + "grad_norm": 0.24662402204525793, + "learning_rate": 1.4374311757512798e-06, + "loss": 0.0116, + "step": 6981 + }, + { + "epoch": 2.3316079479044918, + "grad_norm": 0.33864863398546513, + "learning_rate": 1.4360678892377833e-06, + "loss": 0.0161, + "step": 6982 + }, + { + "epoch": 2.331941893471364, + "grad_norm": 0.3952467987766312, + "learning_rate": 1.434705141098197e-06, + "loss": 0.0213, + "step": 6983 + }, + { + "epoch": 2.3322758390382368, + "grad_norm": 0.2700400125085471, + "learning_rate": 1.4333429315383768e-06, + "loss": 0.0147, + "step": 6984 + }, + { + "epoch": 2.3326097846051095, + "grad_norm": 0.30636721537276057, + "learning_rate": 1.4319812607641055e-06, + "loss": 0.0135, + "step": 6985 + }, + { + "epoch": 2.3329437301719818, + "grad_norm": 0.29262149370274226, + "learning_rate": 1.4306201289810756e-06, + "loss": 0.016, + "step": 6986 + }, + { + "epoch": 2.3332776757388545, + "grad_norm": 0.30306257392613584, + "learning_rate": 1.4292595363949047e-06, + "loss": 0.0155, + "step": 6987 + }, + { + "epoch": 2.333611621305727, + "grad_norm": 0.2660978556325702, + "learning_rate": 1.4278994832111232e-06, + "loss": 0.0127, + "step": 6988 + }, + { + "epoch": 2.3339455668726, + "grad_norm": 0.2369096043320216, + "learning_rate": 1.4265399696351867e-06, + "loss": 0.0094, + "step": 6989 + }, + { + "epoch": 2.334279512439472, + "grad_norm": 0.29684526715594656, + "learning_rate": 1.4251809958724623e-06, + "loss": 0.0151, + "step": 6990 + }, + { + "epoch": 2.334613458006345, + "grad_norm": 0.26183653131005824, + "learning_rate": 1.4238225621282403e-06, + "loss": 0.0129, + "step": 6991 + }, + { + "epoch": 2.3349474035732176, + "grad_norm": 0.2715118542480326, + "learning_rate": 1.4224646686077303e-06, + "loss": 0.0119, + "step": 6992 + }, + { + "epoch": 2.3352813491400903, + "grad_norm": 0.28918724818389474, + "learning_rate": 1.4211073155160544e-06, + "loss": 0.019, + "step": 6993 + }, + { + "epoch": 2.3356152947069626, + "grad_norm": 0.28190471982389226, + "learning_rate": 1.4197505030582588e-06, + "loss": 0.0139, + "step": 6994 + }, + { + "epoch": 2.3359492402738353, + "grad_norm": 0.23096099581309007, + "learning_rate": 1.4183942314393056e-06, + "loss": 0.0118, + "step": 6995 + }, + { + "epoch": 2.336283185840708, + "grad_norm": 0.24940806167635468, + "learning_rate": 1.4170385008640774e-06, + "loss": 0.0108, + "step": 6996 + }, + { + "epoch": 2.3366171314075803, + "grad_norm": 0.2900515861486615, + "learning_rate": 1.4156833115373702e-06, + "loss": 0.0096, + "step": 6997 + }, + { + "epoch": 2.336951076974453, + "grad_norm": 0.29814869780191827, + "learning_rate": 1.4143286636639043e-06, + "loss": 0.0143, + "step": 6998 + }, + { + "epoch": 2.3372850225413258, + "grad_norm": 0.1977826268024735, + "learning_rate": 1.4129745574483123e-06, + "loss": 0.0097, + "step": 6999 + }, + { + "epoch": 2.3376189681081985, + "grad_norm": 0.31898545352338264, + "learning_rate": 1.4116209930951508e-06, + "loss": 0.0104, + "step": 7000 + }, + { + "epoch": 2.337952913675071, + "grad_norm": 0.28063789586595905, + "learning_rate": 1.4102679708088867e-06, + "loss": 0.0127, + "step": 7001 + }, + { + "epoch": 2.3382868592419435, + "grad_norm": 0.20999288065604801, + "learning_rate": 1.4089154907939162e-06, + "loss": 0.0108, + "step": 7002 + }, + { + "epoch": 2.338620804808816, + "grad_norm": 0.41120695251909045, + "learning_rate": 1.4075635532545435e-06, + "loss": 0.0164, + "step": 7003 + }, + { + "epoch": 2.338954750375689, + "grad_norm": 0.30117205030740735, + "learning_rate": 1.4062121583949967e-06, + "loss": 0.0146, + "step": 7004 + }, + { + "epoch": 2.339288695942561, + "grad_norm": 0.3122728919338953, + "learning_rate": 1.4048613064194178e-06, + "loss": 0.0194, + "step": 7005 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 0.23533936551158222, + "learning_rate": 1.4035109975318712e-06, + "loss": 0.012, + "step": 7006 + }, + { + "epoch": 2.3399565870763066, + "grad_norm": 0.29820578850033036, + "learning_rate": 1.4021612319363326e-06, + "loss": 0.0137, + "step": 7007 + }, + { + "epoch": 2.3402905326431793, + "grad_norm": 0.2877623873664818, + "learning_rate": 1.4008120098367062e-06, + "loss": 0.0205, + "step": 7008 + }, + { + "epoch": 2.3406244782100516, + "grad_norm": 0.2736968095086382, + "learning_rate": 1.3994633314368034e-06, + "loss": 0.013, + "step": 7009 + }, + { + "epoch": 2.3409584237769243, + "grad_norm": 0.2743224683523854, + "learning_rate": 1.3981151969403606e-06, + "loss": 0.0099, + "step": 7010 + }, + { + "epoch": 2.341292369343797, + "grad_norm": 0.29674448270865644, + "learning_rate": 1.3967676065510266e-06, + "loss": 0.0121, + "step": 7011 + }, + { + "epoch": 2.3416263149106697, + "grad_norm": 0.24211897585212844, + "learning_rate": 1.3954205604723742e-06, + "loss": 0.0076, + "step": 7012 + }, + { + "epoch": 2.341960260477542, + "grad_norm": 0.25201915901381633, + "learning_rate": 1.3940740589078872e-06, + "loss": 0.0101, + "step": 7013 + }, + { + "epoch": 2.3422942060444147, + "grad_norm": 0.3697045818940729, + "learning_rate": 1.3927281020609712e-06, + "loss": 0.0332, + "step": 7014 + }, + { + "epoch": 2.3426281516112875, + "grad_norm": 0.34153673187121747, + "learning_rate": 1.391382690134952e-06, + "loss": 0.0204, + "step": 7015 + }, + { + "epoch": 2.3429620971781597, + "grad_norm": 0.28501533535633566, + "learning_rate": 1.3900378233330658e-06, + "loss": 0.0119, + "step": 7016 + }, + { + "epoch": 2.3432960427450324, + "grad_norm": 0.346380977016863, + "learning_rate": 1.3886935018584719e-06, + "loss": 0.0205, + "step": 7017 + }, + { + "epoch": 2.343629988311905, + "grad_norm": 0.295754362676055, + "learning_rate": 1.3873497259142483e-06, + "loss": 0.0129, + "step": 7018 + }, + { + "epoch": 2.343963933878778, + "grad_norm": 0.35969859400328646, + "learning_rate": 1.3860064957033847e-06, + "loss": 0.0171, + "step": 7019 + }, + { + "epoch": 2.3442978794456506, + "grad_norm": 0.27965472909276673, + "learning_rate": 1.384663811428793e-06, + "loss": 0.0132, + "step": 7020 + }, + { + "epoch": 2.344631825012523, + "grad_norm": 0.40781867550159484, + "learning_rate": 1.3833216732933035e-06, + "loss": 0.0145, + "step": 7021 + }, + { + "epoch": 2.3449657705793956, + "grad_norm": 0.2365689774849031, + "learning_rate": 1.3819800814996587e-06, + "loss": 0.0099, + "step": 7022 + }, + { + "epoch": 2.3452997161462683, + "grad_norm": 0.3094593364384856, + "learning_rate": 1.3806390362505251e-06, + "loss": 0.0187, + "step": 7023 + }, + { + "epoch": 2.3456336617131406, + "grad_norm": 0.24395158215992865, + "learning_rate": 1.3792985377484796e-06, + "loss": 0.0102, + "step": 7024 + }, + { + "epoch": 2.3459676072800133, + "grad_norm": 0.2884488668419262, + "learning_rate": 1.3779585861960226e-06, + "loss": 0.0131, + "step": 7025 + }, + { + "epoch": 2.346301552846886, + "grad_norm": 0.27578421765993216, + "learning_rate": 1.3766191817955699e-06, + "loss": 0.011, + "step": 7026 + }, + { + "epoch": 2.3466354984137587, + "grad_norm": 0.3523686929773811, + "learning_rate": 1.3752803247494545e-06, + "loss": 0.0165, + "step": 7027 + }, + { + "epoch": 2.346969443980631, + "grad_norm": 0.2952555179298213, + "learning_rate": 1.3739420152599247e-06, + "loss": 0.018, + "step": 7028 + }, + { + "epoch": 2.3473033895475037, + "grad_norm": 0.20665887221493115, + "learning_rate": 1.37260425352915e-06, + "loss": 0.0082, + "step": 7029 + }, + { + "epoch": 2.3476373351143764, + "grad_norm": 0.19721715321930855, + "learning_rate": 1.3712670397592127e-06, + "loss": 0.0085, + "step": 7030 + }, + { + "epoch": 2.347971280681249, + "grad_norm": 0.25467914543662806, + "learning_rate": 1.3699303741521158e-06, + "loss": 0.0135, + "step": 7031 + }, + { + "epoch": 2.3483052262481214, + "grad_norm": 0.38590963097806746, + "learning_rate": 1.3685942569097793e-06, + "loss": 0.0272, + "step": 7032 + }, + { + "epoch": 2.348639171814994, + "grad_norm": 0.26568628385319204, + "learning_rate": 1.3672586882340393e-06, + "loss": 0.0121, + "step": 7033 + }, + { + "epoch": 2.348973117381867, + "grad_norm": 0.3321610478768502, + "learning_rate": 1.3659236683266475e-06, + "loss": 0.0138, + "step": 7034 + }, + { + "epoch": 2.349307062948739, + "grad_norm": 0.25198850580372606, + "learning_rate": 1.3645891973892772e-06, + "loss": 0.0139, + "step": 7035 + }, + { + "epoch": 2.349641008515612, + "grad_norm": 0.27852847233661115, + "learning_rate": 1.3632552756235124e-06, + "loss": 0.0127, + "step": 7036 + }, + { + "epoch": 2.3499749540824846, + "grad_norm": 0.286362480694555, + "learning_rate": 1.3619219032308594e-06, + "loss": 0.0137, + "step": 7037 + }, + { + "epoch": 2.3503088996493573, + "grad_norm": 0.31169443623660936, + "learning_rate": 1.3605890804127415e-06, + "loss": 0.0195, + "step": 7038 + }, + { + "epoch": 2.3506428452162296, + "grad_norm": 0.25787425292668803, + "learning_rate": 1.3592568073704943e-06, + "loss": 0.0147, + "step": 7039 + }, + { + "epoch": 2.3509767907831023, + "grad_norm": 0.26326282020932457, + "learning_rate": 1.3579250843053747e-06, + "loss": 0.0125, + "step": 7040 + }, + { + "epoch": 2.351310736349975, + "grad_norm": 0.2838348168143694, + "learning_rate": 1.3565939114185568e-06, + "loss": 0.0155, + "step": 7041 + }, + { + "epoch": 2.3516446819168477, + "grad_norm": 0.3433587724559374, + "learning_rate": 1.3552632889111266e-06, + "loss": 0.0127, + "step": 7042 + }, + { + "epoch": 2.35197862748372, + "grad_norm": 0.2779079478718898, + "learning_rate": 1.3539332169840918e-06, + "loss": 0.0143, + "step": 7043 + }, + { + "epoch": 2.3523125730505927, + "grad_norm": 0.2840763118001524, + "learning_rate": 1.3526036958383777e-06, + "loss": 0.0109, + "step": 7044 + }, + { + "epoch": 2.3526465186174654, + "grad_norm": 0.2691719799676859, + "learning_rate": 1.35127472567482e-06, + "loss": 0.0111, + "step": 7045 + }, + { + "epoch": 2.3529804641843377, + "grad_norm": 0.24783968197198344, + "learning_rate": 1.3499463066941787e-06, + "loss": 0.0095, + "step": 7046 + }, + { + "epoch": 2.3533144097512104, + "grad_norm": 0.2499373106705042, + "learning_rate": 1.3486184390971246e-06, + "loss": 0.012, + "step": 7047 + }, + { + "epoch": 2.353648355318083, + "grad_norm": 0.3529376993851651, + "learning_rate": 1.347291123084249e-06, + "loss": 0.0257, + "step": 7048 + }, + { + "epoch": 2.353982300884956, + "grad_norm": 0.29267545149309965, + "learning_rate": 1.3459643588560583e-06, + "loss": 0.0185, + "step": 7049 + }, + { + "epoch": 2.3543162464518286, + "grad_norm": 0.23077744975338793, + "learning_rate": 1.3446381466129777e-06, + "loss": 0.0105, + "step": 7050 + }, + { + "epoch": 2.354650192018701, + "grad_norm": 0.33530589063300237, + "learning_rate": 1.3433124865553437e-06, + "loss": 0.0167, + "step": 7051 + }, + { + "epoch": 2.3549841375855736, + "grad_norm": 0.21482688137489903, + "learning_rate": 1.3419873788834164e-06, + "loss": 0.0077, + "step": 7052 + }, + { + "epoch": 2.3553180831524463, + "grad_norm": 0.3833841379576733, + "learning_rate": 1.3406628237973662e-06, + "loss": 0.0211, + "step": 7053 + }, + { + "epoch": 2.3556520287193186, + "grad_norm": 0.33450685389920704, + "learning_rate": 1.339338821497283e-06, + "loss": 0.0168, + "step": 7054 + }, + { + "epoch": 2.3559859742861913, + "grad_norm": 0.2138274774323666, + "learning_rate": 1.3380153721831745e-06, + "loss": 0.0106, + "step": 7055 + }, + { + "epoch": 2.356319919853064, + "grad_norm": 0.2927400855130381, + "learning_rate": 1.3366924760549632e-06, + "loss": 0.0166, + "step": 7056 + }, + { + "epoch": 2.3566538654199367, + "grad_norm": 0.26819134391433797, + "learning_rate": 1.3353701333124863e-06, + "loss": 0.0117, + "step": 7057 + }, + { + "epoch": 2.356987810986809, + "grad_norm": 0.2713273801765072, + "learning_rate": 1.3340483441555024e-06, + "loss": 0.0099, + "step": 7058 + }, + { + "epoch": 2.3573217565536817, + "grad_norm": 0.2613137718575152, + "learning_rate": 1.3327271087836792e-06, + "loss": 0.0155, + "step": 7059 + }, + { + "epoch": 2.3576557021205544, + "grad_norm": 0.23319159108616203, + "learning_rate": 1.331406427396607e-06, + "loss": 0.0116, + "step": 7060 + }, + { + "epoch": 2.357989647687427, + "grad_norm": 0.36631150451240413, + "learning_rate": 1.3300863001937902e-06, + "loss": 0.0181, + "step": 7061 + }, + { + "epoch": 2.3583235932542994, + "grad_norm": 0.28616188309510715, + "learning_rate": 1.3287667273746513e-06, + "loss": 0.0116, + "step": 7062 + }, + { + "epoch": 2.358657538821172, + "grad_norm": 0.6289186504409826, + "learning_rate": 1.3274477091385241e-06, + "loss": 0.026, + "step": 7063 + }, + { + "epoch": 2.358991484388045, + "grad_norm": 0.27430290347825487, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.0197, + "step": 7064 + }, + { + "epoch": 2.359325429954917, + "grad_norm": 0.36644620101865705, + "learning_rate": 1.3248113372122395e-06, + "loss": 0.0207, + "step": 7065 + }, + { + "epoch": 2.35965937552179, + "grad_norm": 0.32346152710879994, + "learning_rate": 1.3234939839203358e-06, + "loss": 0.0125, + "step": 7066 + }, + { + "epoch": 2.3599933210886626, + "grad_norm": 0.32124866089323395, + "learning_rate": 1.3221771860079569e-06, + "loss": 0.0152, + "step": 7067 + }, + { + "epoch": 2.3603272666555353, + "grad_norm": 0.29419654271254136, + "learning_rate": 1.3208609436740178e-06, + "loss": 0.0127, + "step": 7068 + }, + { + "epoch": 2.360661212222408, + "grad_norm": 0.2304331921607892, + "learning_rate": 1.3195452571173551e-06, + "loss": 0.0088, + "step": 7069 + }, + { + "epoch": 2.3609951577892803, + "grad_norm": 0.2631261767561719, + "learning_rate": 1.3182301265367154e-06, + "loss": 0.0144, + "step": 7070 + }, + { + "epoch": 2.361329103356153, + "grad_norm": 0.24281070467974317, + "learning_rate": 1.3169155521307664e-06, + "loss": 0.0082, + "step": 7071 + }, + { + "epoch": 2.3616630489230257, + "grad_norm": 0.2448273905444959, + "learning_rate": 1.3156015340980904e-06, + "loss": 0.0092, + "step": 7072 + }, + { + "epoch": 2.361996994489898, + "grad_norm": 0.23846182020447573, + "learning_rate": 1.3142880726371865e-06, + "loss": 0.0107, + "step": 7073 + }, + { + "epoch": 2.3623309400567707, + "grad_norm": 0.24963247889376136, + "learning_rate": 1.312975167946466e-06, + "loss": 0.0106, + "step": 7074 + }, + { + "epoch": 2.3626648856236434, + "grad_norm": 0.25951704733003367, + "learning_rate": 1.3116628202242603e-06, + "loss": 0.0119, + "step": 7075 + }, + { + "epoch": 2.362998831190516, + "grad_norm": 0.27714867760392925, + "learning_rate": 1.3103510296688137e-06, + "loss": 0.0136, + "step": 7076 + }, + { + "epoch": 2.3633327767573884, + "grad_norm": 0.20879695838851028, + "learning_rate": 1.309039796478288e-06, + "loss": 0.0093, + "step": 7077 + }, + { + "epoch": 2.363666722324261, + "grad_norm": 0.33077264493637937, + "learning_rate": 1.307729120850761e-06, + "loss": 0.0157, + "step": 7078 + }, + { + "epoch": 2.364000667891134, + "grad_norm": 0.25900661975678163, + "learning_rate": 1.306419002984226e-06, + "loss": 0.0114, + "step": 7079 + }, + { + "epoch": 2.3643346134580066, + "grad_norm": 0.2781441894602771, + "learning_rate": 1.3051094430765905e-06, + "loss": 0.0122, + "step": 7080 + }, + { + "epoch": 2.364668559024879, + "grad_norm": 0.3098809962725239, + "learning_rate": 1.3038004413256805e-06, + "loss": 0.0162, + "step": 7081 + }, + { + "epoch": 2.3650025045917515, + "grad_norm": 0.2227075007275612, + "learning_rate": 1.3024919979292338e-06, + "loss": 0.0132, + "step": 7082 + }, + { + "epoch": 2.3653364501586243, + "grad_norm": 0.28777777897750045, + "learning_rate": 1.3011841130849079e-06, + "loss": 0.0114, + "step": 7083 + }, + { + "epoch": 2.3656703957254965, + "grad_norm": 0.25312442289604575, + "learning_rate": 1.2998767869902733e-06, + "loss": 0.0217, + "step": 7084 + }, + { + "epoch": 2.3660043412923693, + "grad_norm": 0.27828982828361787, + "learning_rate": 1.2985700198428197e-06, + "loss": 0.0146, + "step": 7085 + }, + { + "epoch": 2.366338286859242, + "grad_norm": 0.3197384774721434, + "learning_rate": 1.2972638118399456e-06, + "loss": 0.0135, + "step": 7086 + }, + { + "epoch": 2.3666722324261147, + "grad_norm": 0.22957950968851462, + "learning_rate": 1.2959581631789725e-06, + "loss": 0.0099, + "step": 7087 + }, + { + "epoch": 2.367006177992987, + "grad_norm": 0.2988210668936548, + "learning_rate": 1.2946530740571316e-06, + "loss": 0.0143, + "step": 7088 + }, + { + "epoch": 2.3673401235598597, + "grad_norm": 0.29309095316351913, + "learning_rate": 1.293348544671572e-06, + "loss": 0.0158, + "step": 7089 + }, + { + "epoch": 2.3676740691267324, + "grad_norm": 0.254190088622963, + "learning_rate": 1.2920445752193617e-06, + "loss": 0.0163, + "step": 7090 + }, + { + "epoch": 2.368008014693605, + "grad_norm": 0.30099725210470746, + "learning_rate": 1.2907411658974756e-06, + "loss": 0.0153, + "step": 7091 + }, + { + "epoch": 2.3683419602604774, + "grad_norm": 0.253633411322103, + "learning_rate": 1.2894383169028134e-06, + "loss": 0.011, + "step": 7092 + }, + { + "epoch": 2.36867590582735, + "grad_norm": 0.257362772235316, + "learning_rate": 1.2881360284321825e-06, + "loss": 0.015, + "step": 7093 + }, + { + "epoch": 2.369009851394223, + "grad_norm": 0.304343203003211, + "learning_rate": 1.2868343006823113e-06, + "loss": 0.0195, + "step": 7094 + }, + { + "epoch": 2.369343796961095, + "grad_norm": 0.23013095858508806, + "learning_rate": 1.2855331338498377e-06, + "loss": 0.0114, + "step": 7095 + }, + { + "epoch": 2.369677742527968, + "grad_norm": 0.3436576246126651, + "learning_rate": 1.2842325281313233e-06, + "loss": 0.0175, + "step": 7096 + }, + { + "epoch": 2.3700116880948405, + "grad_norm": 0.3026982252998837, + "learning_rate": 1.282932483723236e-06, + "loss": 0.0122, + "step": 7097 + }, + { + "epoch": 2.3703456336617132, + "grad_norm": 0.2957131535759339, + "learning_rate": 1.2816330008219656e-06, + "loss": 0.0135, + "step": 7098 + }, + { + "epoch": 2.370679579228586, + "grad_norm": 0.2721010527776561, + "learning_rate": 1.280334079623811e-06, + "loss": 0.0142, + "step": 7099 + }, + { + "epoch": 2.3710135247954582, + "grad_norm": 0.29651695111407894, + "learning_rate": 1.2790357203249931e-06, + "loss": 0.0195, + "step": 7100 + }, + { + "epoch": 2.371347470362331, + "grad_norm": 0.31437865394416964, + "learning_rate": 1.2777379231216391e-06, + "loss": 0.0178, + "step": 7101 + }, + { + "epoch": 2.3716814159292037, + "grad_norm": 0.371615913467936, + "learning_rate": 1.2764406882098035e-06, + "loss": 0.0245, + "step": 7102 + }, + { + "epoch": 2.372015361496076, + "grad_norm": 0.2632986372930604, + "learning_rate": 1.2751440157854439e-06, + "loss": 0.011, + "step": 7103 + }, + { + "epoch": 2.3723493070629487, + "grad_norm": 0.3588190483629132, + "learning_rate": 1.2738479060444408e-06, + "loss": 0.0273, + "step": 7104 + }, + { + "epoch": 2.3726832526298214, + "grad_norm": 0.20756599791892164, + "learning_rate": 1.2725523591825845e-06, + "loss": 0.0096, + "step": 7105 + }, + { + "epoch": 2.373017198196694, + "grad_norm": 0.2528450889854755, + "learning_rate": 1.2712573753955842e-06, + "loss": 0.0119, + "step": 7106 + }, + { + "epoch": 2.3733511437635664, + "grad_norm": 0.274285984152569, + "learning_rate": 1.2699629548790599e-06, + "loss": 0.0146, + "step": 7107 + }, + { + "epoch": 2.373685089330439, + "grad_norm": 0.32026522896008774, + "learning_rate": 1.2686690978285533e-06, + "loss": 0.0153, + "step": 7108 + }, + { + "epoch": 2.374019034897312, + "grad_norm": 0.2295521880397819, + "learning_rate": 1.267375804439513e-06, + "loss": 0.0098, + "step": 7109 + }, + { + "epoch": 2.3743529804641845, + "grad_norm": 0.31821061012740803, + "learning_rate": 1.2660830749073093e-06, + "loss": 0.0197, + "step": 7110 + }, + { + "epoch": 2.374686926031057, + "grad_norm": 0.31868346130439756, + "learning_rate": 1.2647909094272215e-06, + "loss": 0.0138, + "step": 7111 + }, + { + "epoch": 2.3750208715979295, + "grad_norm": 0.2875321158025383, + "learning_rate": 1.2634993081944469e-06, + "loss": 0.0129, + "step": 7112 + }, + { + "epoch": 2.3753548171648022, + "grad_norm": 0.4186992288337688, + "learning_rate": 1.2622082714040995e-06, + "loss": 0.019, + "step": 7113 + }, + { + "epoch": 2.3756887627316745, + "grad_norm": 0.27442706926786037, + "learning_rate": 1.2609177992512022e-06, + "loss": 0.013, + "step": 7114 + }, + { + "epoch": 2.3760227082985472, + "grad_norm": 0.26981130860632113, + "learning_rate": 1.2596278919306993e-06, + "loss": 0.0101, + "step": 7115 + }, + { + "epoch": 2.37635665386542, + "grad_norm": 0.25333406289441435, + "learning_rate": 1.2583385496374428e-06, + "loss": 0.0128, + "step": 7116 + }, + { + "epoch": 2.3766905994322927, + "grad_norm": 0.32993803245798575, + "learning_rate": 1.2570497725662067e-06, + "loss": 0.0162, + "step": 7117 + }, + { + "epoch": 2.3770245449991654, + "grad_norm": 0.30424642025928195, + "learning_rate": 1.2557615609116713e-06, + "loss": 0.0133, + "step": 7118 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.21437599529078102, + "learning_rate": 1.254473914868442e-06, + "loss": 0.0091, + "step": 7119 + }, + { + "epoch": 2.3776924361329104, + "grad_norm": 0.28701269204362556, + "learning_rate": 1.2531868346310288e-06, + "loss": 0.0118, + "step": 7120 + }, + { + "epoch": 2.378026381699783, + "grad_norm": 0.262028252236464, + "learning_rate": 1.2519003203938628e-06, + "loss": 0.0101, + "step": 7121 + }, + { + "epoch": 2.3783603272666554, + "grad_norm": 0.26227682214931713, + "learning_rate": 1.2506143723512842e-06, + "loss": 0.0107, + "step": 7122 + }, + { + "epoch": 2.378694272833528, + "grad_norm": 0.41353513779341977, + "learning_rate": 1.2493289906975543e-06, + "loss": 0.0201, + "step": 7123 + }, + { + "epoch": 2.379028218400401, + "grad_norm": 0.39013520606720037, + "learning_rate": 1.2480441756268397e-06, + "loss": 0.0216, + "step": 7124 + }, + { + "epoch": 2.3793621639672735, + "grad_norm": 0.39793688390234655, + "learning_rate": 1.2467599273332332e-06, + "loss": 0.0206, + "step": 7125 + }, + { + "epoch": 2.379696109534146, + "grad_norm": 0.3389970768895821, + "learning_rate": 1.245476246010731e-06, + "loss": 0.0182, + "step": 7126 + }, + { + "epoch": 2.3800300551010185, + "grad_norm": 0.24997088072860787, + "learning_rate": 1.244193131853252e-06, + "loss": 0.0128, + "step": 7127 + }, + { + "epoch": 2.3803640006678912, + "grad_norm": 0.35901501094601995, + "learning_rate": 1.2429105850546213e-06, + "loss": 0.0193, + "step": 7128 + }, + { + "epoch": 2.380697946234764, + "grad_norm": 0.26315141431875416, + "learning_rate": 1.241628605808587e-06, + "loss": 0.0096, + "step": 7129 + }, + { + "epoch": 2.381031891801636, + "grad_norm": 0.322861217747456, + "learning_rate": 1.2403471943088018e-06, + "loss": 0.0153, + "step": 7130 + }, + { + "epoch": 2.381365837368509, + "grad_norm": 0.2886616062972126, + "learning_rate": 1.239066350748845e-06, + "loss": 0.0119, + "step": 7131 + }, + { + "epoch": 2.3816997829353816, + "grad_norm": 0.29726272959524785, + "learning_rate": 1.2377860753221976e-06, + "loss": 0.0177, + "step": 7132 + }, + { + "epoch": 2.382033728502254, + "grad_norm": 0.2537239930121517, + "learning_rate": 1.236506368222264e-06, + "loss": 0.0095, + "step": 7133 + }, + { + "epoch": 2.3823676740691266, + "grad_norm": 0.28960540961632786, + "learning_rate": 1.235227229642355e-06, + "loss": 0.0114, + "step": 7134 + }, + { + "epoch": 2.3827016196359994, + "grad_norm": 0.3111297053772571, + "learning_rate": 1.2339486597757038e-06, + "loss": 0.0162, + "step": 7135 + }, + { + "epoch": 2.383035565202872, + "grad_norm": 0.2679547588899558, + "learning_rate": 1.2326706588154496e-06, + "loss": 0.0177, + "step": 7136 + }, + { + "epoch": 2.3833695107697443, + "grad_norm": 0.2949021800531175, + "learning_rate": 1.2313932269546518e-06, + "loss": 0.0125, + "step": 7137 + }, + { + "epoch": 2.383703456336617, + "grad_norm": 0.2565085317899601, + "learning_rate": 1.2301163643862817e-06, + "loss": 0.0114, + "step": 7138 + }, + { + "epoch": 2.38403740190349, + "grad_norm": 0.2386767623144155, + "learning_rate": 1.2288400713032227e-06, + "loss": 0.0106, + "step": 7139 + }, + { + "epoch": 2.3843713474703625, + "grad_norm": 0.37789984358150264, + "learning_rate": 1.2275643478982762e-06, + "loss": 0.0225, + "step": 7140 + }, + { + "epoch": 2.3847052930372348, + "grad_norm": 0.27959514576471695, + "learning_rate": 1.2262891943641526e-06, + "loss": 0.0127, + "step": 7141 + }, + { + "epoch": 2.3850392386041075, + "grad_norm": 0.280060467735272, + "learning_rate": 1.2250146108934802e-06, + "loss": 0.0112, + "step": 7142 + }, + { + "epoch": 2.38537318417098, + "grad_norm": 0.279827979047377, + "learning_rate": 1.2237405976787997e-06, + "loss": 0.0143, + "step": 7143 + }, + { + "epoch": 2.3857071297378525, + "grad_norm": 0.2539864619433211, + "learning_rate": 1.2224671549125673e-06, + "loss": 0.0135, + "step": 7144 + }, + { + "epoch": 2.386041075304725, + "grad_norm": 0.3003236120274151, + "learning_rate": 1.2211942827871486e-06, + "loss": 0.0161, + "step": 7145 + }, + { + "epoch": 2.386375020871598, + "grad_norm": 0.29455212569386563, + "learning_rate": 1.2199219814948294e-06, + "loss": 0.0133, + "step": 7146 + }, + { + "epoch": 2.3867089664384706, + "grad_norm": 0.4499970929383756, + "learning_rate": 1.218650251227802e-06, + "loss": 0.0194, + "step": 7147 + }, + { + "epoch": 2.3870429120053434, + "grad_norm": 0.3853206507772999, + "learning_rate": 1.2173790921781786e-06, + "loss": 0.0145, + "step": 7148 + }, + { + "epoch": 2.3873768575722156, + "grad_norm": 0.4580032725791835, + "learning_rate": 1.2161085045379818e-06, + "loss": 0.0279, + "step": 7149 + }, + { + "epoch": 2.3877108031390883, + "grad_norm": 0.30795167351760605, + "learning_rate": 1.214838488499151e-06, + "loss": 0.0131, + "step": 7150 + }, + { + "epoch": 2.388044748705961, + "grad_norm": 0.2449759315544695, + "learning_rate": 1.2135690442535335e-06, + "loss": 0.0116, + "step": 7151 + }, + { + "epoch": 2.3883786942728333, + "grad_norm": 0.23540419332513418, + "learning_rate": 1.2123001719928972e-06, + "loss": 0.0122, + "step": 7152 + }, + { + "epoch": 2.388712639839706, + "grad_norm": 0.27832047670593196, + "learning_rate": 1.211031871908916e-06, + "loss": 0.0133, + "step": 7153 + }, + { + "epoch": 2.3890465854065788, + "grad_norm": 0.35796883834542315, + "learning_rate": 1.2097641441931868e-06, + "loss": 0.0169, + "step": 7154 + }, + { + "epoch": 2.3893805309734515, + "grad_norm": 0.33903461284328346, + "learning_rate": 1.2084969890372111e-06, + "loss": 0.0122, + "step": 7155 + }, + { + "epoch": 2.3897144765403238, + "grad_norm": 0.3545470345634991, + "learning_rate": 1.2072304066324103e-06, + "loss": 0.0179, + "step": 7156 + }, + { + "epoch": 2.3900484221071965, + "grad_norm": 0.278512013267484, + "learning_rate": 1.205964397170113e-06, + "loss": 0.0116, + "step": 7157 + }, + { + "epoch": 2.390382367674069, + "grad_norm": 0.32364809262854305, + "learning_rate": 1.2046989608415682e-06, + "loss": 0.0136, + "step": 7158 + }, + { + "epoch": 2.390716313240942, + "grad_norm": 0.27568292826293606, + "learning_rate": 1.2034340978379328e-06, + "loss": 0.0151, + "step": 7159 + }, + { + "epoch": 2.391050258807814, + "grad_norm": 0.7998645300973338, + "learning_rate": 1.2021698083502797e-06, + "loss": 0.0176, + "step": 7160 + }, + { + "epoch": 2.391384204374687, + "grad_norm": 0.24883334864576495, + "learning_rate": 1.2009060925695965e-06, + "loss": 0.0129, + "step": 7161 + }, + { + "epoch": 2.3917181499415596, + "grad_norm": 0.24945895571795934, + "learning_rate": 1.1996429506867797e-06, + "loss": 0.0091, + "step": 7162 + }, + { + "epoch": 2.392052095508432, + "grad_norm": 0.31494602785813586, + "learning_rate": 1.1983803828926438e-06, + "loss": 0.013, + "step": 7163 + }, + { + "epoch": 2.3923860410753046, + "grad_norm": 0.31393119034954425, + "learning_rate": 1.1971183893779125e-06, + "loss": 0.013, + "step": 7164 + }, + { + "epoch": 2.3927199866421773, + "grad_norm": 0.3238796788901826, + "learning_rate": 1.1958569703332262e-06, + "loss": 0.0147, + "step": 7165 + }, + { + "epoch": 2.39305393220905, + "grad_norm": 0.342314876593608, + "learning_rate": 1.1945961259491368e-06, + "loss": 0.015, + "step": 7166 + }, + { + "epoch": 2.3933878777759228, + "grad_norm": 0.2750549613113608, + "learning_rate": 1.1933358564161108e-06, + "loss": 0.013, + "step": 7167 + }, + { + "epoch": 2.393721823342795, + "grad_norm": 0.2584896582368959, + "learning_rate": 1.1920761619245246e-06, + "loss": 0.0128, + "step": 7168 + }, + { + "epoch": 2.3940557689096678, + "grad_norm": 0.3167388422063857, + "learning_rate": 1.1908170426646726e-06, + "loss": 0.01, + "step": 7169 + }, + { + "epoch": 2.3943897144765405, + "grad_norm": 0.20762422386899154, + "learning_rate": 1.189558498826756e-06, + "loss": 0.0112, + "step": 7170 + }, + { + "epoch": 2.3947236600434127, + "grad_norm": 0.2825120417049143, + "learning_rate": 1.1883005306008955e-06, + "loss": 0.0153, + "step": 7171 + }, + { + "epoch": 2.3950576056102855, + "grad_norm": 0.26261833220252195, + "learning_rate": 1.1870431381771203e-06, + "loss": 0.0103, + "step": 7172 + }, + { + "epoch": 2.395391551177158, + "grad_norm": 0.2590827538536333, + "learning_rate": 1.185786321745377e-06, + "loss": 0.0119, + "step": 7173 + }, + { + "epoch": 2.395725496744031, + "grad_norm": 0.24902586742990407, + "learning_rate": 1.1845300814955192e-06, + "loss": 0.0126, + "step": 7174 + }, + { + "epoch": 2.396059442310903, + "grad_norm": 0.2858493562375798, + "learning_rate": 1.18327441761732e-06, + "loss": 0.0127, + "step": 7175 + }, + { + "epoch": 2.396393387877776, + "grad_norm": 0.3054522282716761, + "learning_rate": 1.1820193303004584e-06, + "loss": 0.0156, + "step": 7176 + }, + { + "epoch": 2.3967273334446486, + "grad_norm": 0.16667413021003308, + "learning_rate": 1.1807648197345327e-06, + "loss": 0.0064, + "step": 7177 + }, + { + "epoch": 2.3970612790115213, + "grad_norm": 0.2759422229871676, + "learning_rate": 1.1795108861090515e-06, + "loss": 0.0105, + "step": 7178 + }, + { + "epoch": 2.3973952245783936, + "grad_norm": 0.293786695623443, + "learning_rate": 1.1782575296134363e-06, + "loss": 0.0135, + "step": 7179 + }, + { + "epoch": 2.3977291701452663, + "grad_norm": 0.2837512554177609, + "learning_rate": 1.1770047504370197e-06, + "loss": 0.0151, + "step": 7180 + }, + { + "epoch": 2.398063115712139, + "grad_norm": 0.27206695994681046, + "learning_rate": 1.1757525487690513e-06, + "loss": 0.0128, + "step": 7181 + }, + { + "epoch": 2.3983970612790113, + "grad_norm": 0.2886242427903375, + "learning_rate": 1.1745009247986882e-06, + "loss": 0.0187, + "step": 7182 + }, + { + "epoch": 2.398731006845884, + "grad_norm": 0.2743196315802026, + "learning_rate": 1.1732498787150044e-06, + "loss": 0.0088, + "step": 7183 + }, + { + "epoch": 2.3990649524127567, + "grad_norm": 0.27742323748456466, + "learning_rate": 1.171999410706986e-06, + "loss": 0.0121, + "step": 7184 + }, + { + "epoch": 2.3993988979796295, + "grad_norm": 0.28507840321145467, + "learning_rate": 1.1707495209635283e-06, + "loss": 0.0128, + "step": 7185 + }, + { + "epoch": 2.3997328435465017, + "grad_norm": 0.2745559950112873, + "learning_rate": 1.1695002096734454e-06, + "loss": 0.0155, + "step": 7186 + }, + { + "epoch": 2.4000667891133745, + "grad_norm": 0.31055652439516734, + "learning_rate": 1.1682514770254567e-06, + "loss": 0.0114, + "step": 7187 + }, + { + "epoch": 2.400400734680247, + "grad_norm": 0.2619762054457191, + "learning_rate": 1.1670033232081995e-06, + "loss": 0.0119, + "step": 7188 + }, + { + "epoch": 2.40073468024712, + "grad_norm": 0.2915813885638635, + "learning_rate": 1.1657557484102228e-06, + "loss": 0.0157, + "step": 7189 + }, + { + "epoch": 2.401068625813992, + "grad_norm": 0.28864581385503507, + "learning_rate": 1.1645087528199883e-06, + "loss": 0.0148, + "step": 7190 + }, + { + "epoch": 2.401402571380865, + "grad_norm": 0.24759176434505523, + "learning_rate": 1.1632623366258666e-06, + "loss": 0.0111, + "step": 7191 + }, + { + "epoch": 2.4017365169477376, + "grad_norm": 0.2710021724502984, + "learning_rate": 1.162016500016147e-06, + "loss": 0.0088, + "step": 7192 + }, + { + "epoch": 2.40207046251461, + "grad_norm": 0.2917673858939121, + "learning_rate": 1.1607712431790242e-06, + "loss": 0.0164, + "step": 7193 + }, + { + "epoch": 2.4024044080814826, + "grad_norm": 0.24049259677765855, + "learning_rate": 1.15952656630261e-06, + "loss": 0.0097, + "step": 7194 + }, + { + "epoch": 2.4027383536483553, + "grad_norm": 0.44935370061229574, + "learning_rate": 1.158282469574929e-06, + "loss": 0.0232, + "step": 7195 + }, + { + "epoch": 2.403072299215228, + "grad_norm": 0.2765799696859926, + "learning_rate": 1.1570389531839165e-06, + "loss": 0.0145, + "step": 7196 + }, + { + "epoch": 2.4034062447821007, + "grad_norm": 0.24358066177297905, + "learning_rate": 1.1557960173174183e-06, + "loss": 0.0108, + "step": 7197 + }, + { + "epoch": 2.403740190348973, + "grad_norm": 0.25225171377279826, + "learning_rate": 1.154553662163197e-06, + "loss": 0.0133, + "step": 7198 + }, + { + "epoch": 2.4040741359158457, + "grad_norm": 0.28316155252835085, + "learning_rate": 1.1533118879089227e-06, + "loss": 0.0109, + "step": 7199 + }, + { + "epoch": 2.4044080814827185, + "grad_norm": 0.26854994796798737, + "learning_rate": 1.1520706947421806e-06, + "loss": 0.0103, + "step": 7200 + }, + { + "epoch": 2.4047420270495907, + "grad_norm": 0.32211611766440146, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.015, + "step": 7201 + }, + { + "epoch": 2.4050759726164634, + "grad_norm": 0.2574191532961684, + "learning_rate": 1.1495900524211955e-06, + "loss": 0.0137, + "step": 7202 + }, + { + "epoch": 2.405409918183336, + "grad_norm": 0.2597273172537471, + "learning_rate": 1.1483506036416814e-06, + "loss": 0.0142, + "step": 7203 + }, + { + "epoch": 2.405743863750209, + "grad_norm": 0.253138913648762, + "learning_rate": 1.1471117366991613e-06, + "loss": 0.0121, + "step": 7204 + }, + { + "epoch": 2.406077809317081, + "grad_norm": 0.23321722385078825, + "learning_rate": 1.1458734517807785e-06, + "loss": 0.0148, + "step": 7205 + }, + { + "epoch": 2.406411754883954, + "grad_norm": 0.2894150642303496, + "learning_rate": 1.1446357490735921e-06, + "loss": 0.0115, + "step": 7206 + }, + { + "epoch": 2.4067457004508266, + "grad_norm": 0.20410274855343485, + "learning_rate": 1.143398628764572e-06, + "loss": 0.0087, + "step": 7207 + }, + { + "epoch": 2.4070796460176993, + "grad_norm": 0.25520301126643197, + "learning_rate": 1.1421620910405977e-06, + "loss": 0.0124, + "step": 7208 + }, + { + "epoch": 2.4074135915845716, + "grad_norm": 0.2591264065436835, + "learning_rate": 1.1409261360884661e-06, + "loss": 0.0137, + "step": 7209 + }, + { + "epoch": 2.4077475371514443, + "grad_norm": 0.2382586100039912, + "learning_rate": 1.1396907640948785e-06, + "loss": 0.0113, + "step": 7210 + }, + { + "epoch": 2.408081482718317, + "grad_norm": 0.2955676439934013, + "learning_rate": 1.1384559752464553e-06, + "loss": 0.0134, + "step": 7211 + }, + { + "epoch": 2.4084154282851893, + "grad_norm": 0.3206987871751663, + "learning_rate": 1.137221769729725e-06, + "loss": 0.0117, + "step": 7212 + }, + { + "epoch": 2.408749373852062, + "grad_norm": 0.3463327320325806, + "learning_rate": 1.1359881477311301e-06, + "loss": 0.0165, + "step": 7213 + }, + { + "epoch": 2.4090833194189347, + "grad_norm": 0.33288614094209124, + "learning_rate": 1.1347551094370224e-06, + "loss": 0.0216, + "step": 7214 + }, + { + "epoch": 2.4094172649858074, + "grad_norm": 0.2988111602023324, + "learning_rate": 1.1335226550336676e-06, + "loss": 0.0161, + "step": 7215 + }, + { + "epoch": 2.40975121055268, + "grad_norm": 0.22773046890329157, + "learning_rate": 1.1322907847072411e-06, + "loss": 0.0095, + "step": 7216 + }, + { + "epoch": 2.4100851561195524, + "grad_norm": 0.27544140321724186, + "learning_rate": 1.1310594986438339e-06, + "loss": 0.0133, + "step": 7217 + }, + { + "epoch": 2.410419101686425, + "grad_norm": 0.21034760716212833, + "learning_rate": 1.129828797029442e-06, + "loss": 0.0083, + "step": 7218 + }, + { + "epoch": 2.410753047253298, + "grad_norm": 0.3253036600406103, + "learning_rate": 1.128598680049982e-06, + "loss": 0.0158, + "step": 7219 + }, + { + "epoch": 2.41108699282017, + "grad_norm": 0.2774108699535163, + "learning_rate": 1.1273691478912752e-06, + "loss": 0.0111, + "step": 7220 + }, + { + "epoch": 2.411420938387043, + "grad_norm": 0.31876378259886223, + "learning_rate": 1.1261402007390587e-06, + "loss": 0.0172, + "step": 7221 + }, + { + "epoch": 2.4117548839539156, + "grad_norm": 0.3119847861004831, + "learning_rate": 1.1249118387789764e-06, + "loss": 0.0146, + "step": 7222 + }, + { + "epoch": 2.4120888295207883, + "grad_norm": 0.2828378050063791, + "learning_rate": 1.12368406219659e-06, + "loss": 0.0184, + "step": 7223 + }, + { + "epoch": 2.4124227750876606, + "grad_norm": 0.24241244445989907, + "learning_rate": 1.1224568711773653e-06, + "loss": 0.008, + "step": 7224 + }, + { + "epoch": 2.4127567206545333, + "grad_norm": 0.25305626652921254, + "learning_rate": 1.1212302659066898e-06, + "loss": 0.013, + "step": 7225 + }, + { + "epoch": 2.413090666221406, + "grad_norm": 0.29622762532899843, + "learning_rate": 1.1200042465698518e-06, + "loss": 0.0162, + "step": 7226 + }, + { + "epoch": 2.4134246117882787, + "grad_norm": 0.24693155073398468, + "learning_rate": 1.1187788133520594e-06, + "loss": 0.0125, + "step": 7227 + }, + { + "epoch": 2.413758557355151, + "grad_norm": 0.222319670796678, + "learning_rate": 1.1175539664384261e-06, + "loss": 0.0122, + "step": 7228 + }, + { + "epoch": 2.4140925029220237, + "grad_norm": 0.2643043895840703, + "learning_rate": 1.1163297060139815e-06, + "loss": 0.0099, + "step": 7229 + }, + { + "epoch": 2.4144264484888964, + "grad_norm": 0.2755059947823996, + "learning_rate": 1.1151060322636625e-06, + "loss": 0.0109, + "step": 7230 + }, + { + "epoch": 2.4147603940557687, + "grad_norm": 0.3011623464267576, + "learning_rate": 1.1138829453723204e-06, + "loss": 0.0167, + "step": 7231 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 0.24423068821650673, + "learning_rate": 1.112660445524718e-06, + "loss": 0.0114, + "step": 7232 + }, + { + "epoch": 2.415428285189514, + "grad_norm": 0.2468340235552794, + "learning_rate": 1.1114385329055262e-06, + "loss": 0.0099, + "step": 7233 + }, + { + "epoch": 2.415762230756387, + "grad_norm": 0.30319247623744344, + "learning_rate": 1.1102172076993301e-06, + "loss": 0.0091, + "step": 7234 + }, + { + "epoch": 2.416096176323259, + "grad_norm": 0.322759779147581, + "learning_rate": 1.1089964700906257e-06, + "loss": 0.0122, + "step": 7235 + }, + { + "epoch": 2.416430121890132, + "grad_norm": 0.3288493547130975, + "learning_rate": 1.1077763202638208e-06, + "loss": 0.0216, + "step": 7236 + }, + { + "epoch": 2.4167640674570046, + "grad_norm": 0.26661830516489915, + "learning_rate": 1.106556758403231e-06, + "loss": 0.0124, + "step": 7237 + }, + { + "epoch": 2.4170980130238773, + "grad_norm": 0.26447273292117623, + "learning_rate": 1.105337784693088e-06, + "loss": 0.0136, + "step": 7238 + }, + { + "epoch": 2.4174319585907496, + "grad_norm": 0.3060522227695716, + "learning_rate": 1.1041193993175293e-06, + "loss": 0.0147, + "step": 7239 + }, + { + "epoch": 2.4177659041576223, + "grad_norm": 0.38026942731737745, + "learning_rate": 1.1029016024606093e-06, + "loss": 0.0093, + "step": 7240 + }, + { + "epoch": 2.418099849724495, + "grad_norm": 0.32183588906554944, + "learning_rate": 1.101684394306286e-06, + "loss": 0.0145, + "step": 7241 + }, + { + "epoch": 2.4184337952913673, + "grad_norm": 0.2660050271437931, + "learning_rate": 1.100467775038439e-06, + "loss": 0.0132, + "step": 7242 + }, + { + "epoch": 2.41876774085824, + "grad_norm": 0.423035256858228, + "learning_rate": 1.099251744840849e-06, + "loss": 0.0198, + "step": 7243 + }, + { + "epoch": 2.4191016864251127, + "grad_norm": 0.3625454286746034, + "learning_rate": 1.0980363038972141e-06, + "loss": 0.0213, + "step": 7244 + }, + { + "epoch": 2.4194356319919854, + "grad_norm": 0.2424465912500267, + "learning_rate": 1.096821452391138e-06, + "loss": 0.0105, + "step": 7245 + }, + { + "epoch": 2.419769577558858, + "grad_norm": 0.2586066381095055, + "learning_rate": 1.0956071905061415e-06, + "loss": 0.0128, + "step": 7246 + }, + { + "epoch": 2.4201035231257304, + "grad_norm": 0.3333268966603381, + "learning_rate": 1.0943935184256487e-06, + "loss": 0.0177, + "step": 7247 + }, + { + "epoch": 2.420437468692603, + "grad_norm": 0.286084481345273, + "learning_rate": 1.093180436333005e-06, + "loss": 0.0149, + "step": 7248 + }, + { + "epoch": 2.420771414259476, + "grad_norm": 0.2493653138982644, + "learning_rate": 1.091967944411456e-06, + "loss": 0.0107, + "step": 7249 + }, + { + "epoch": 2.421105359826348, + "grad_norm": 0.25159429541466294, + "learning_rate": 1.0907560428441666e-06, + "loss": 0.0087, + "step": 7250 + }, + { + "epoch": 2.421439305393221, + "grad_norm": 0.2513470760755896, + "learning_rate": 1.0895447318142043e-06, + "loss": 0.0121, + "step": 7251 + }, + { + "epoch": 2.4217732509600935, + "grad_norm": 0.20574294406547308, + "learning_rate": 1.0883340115045566e-06, + "loss": 0.0081, + "step": 7252 + }, + { + "epoch": 2.4221071965269663, + "grad_norm": 0.2306528422692096, + "learning_rate": 1.0871238820981133e-06, + "loss": 0.0096, + "step": 7253 + }, + { + "epoch": 2.4224411420938385, + "grad_norm": 0.2616276871365394, + "learning_rate": 1.0859143437776803e-06, + "loss": 0.0138, + "step": 7254 + }, + { + "epoch": 2.4227750876607113, + "grad_norm": 0.321239771478697, + "learning_rate": 1.0847053967259736e-06, + "loss": 0.0132, + "step": 7255 + }, + { + "epoch": 2.423109033227584, + "grad_norm": 0.29953783460510724, + "learning_rate": 1.0834970411256167e-06, + "loss": 0.0155, + "step": 7256 + }, + { + "epoch": 2.4234429787944567, + "grad_norm": 0.3217997164330562, + "learning_rate": 1.082289277159147e-06, + "loss": 0.0111, + "step": 7257 + }, + { + "epoch": 2.423776924361329, + "grad_norm": 0.34331779083669667, + "learning_rate": 1.0810821050090132e-06, + "loss": 0.0223, + "step": 7258 + }, + { + "epoch": 2.4241108699282017, + "grad_norm": 0.29478707084790945, + "learning_rate": 1.0798755248575694e-06, + "loss": 0.0141, + "step": 7259 + }, + { + "epoch": 2.4244448154950744, + "grad_norm": 0.2786668752463381, + "learning_rate": 1.078669536887086e-06, + "loss": 0.0106, + "step": 7260 + }, + { + "epoch": 2.4247787610619467, + "grad_norm": 0.21496456487304513, + "learning_rate": 1.077464141279742e-06, + "loss": 0.0097, + "step": 7261 + }, + { + "epoch": 2.4251127066288194, + "grad_norm": 0.2752509532762129, + "learning_rate": 1.0762593382176244e-06, + "loss": 0.0104, + "step": 7262 + }, + { + "epoch": 2.425446652195692, + "grad_norm": 0.23406747399877303, + "learning_rate": 1.0750551278827365e-06, + "loss": 0.0113, + "step": 7263 + }, + { + "epoch": 2.425780597762565, + "grad_norm": 0.2843164037544598, + "learning_rate": 1.073851510456984e-06, + "loss": 0.013, + "step": 7264 + }, + { + "epoch": 2.4261145433294375, + "grad_norm": 0.29019737142117985, + "learning_rate": 1.0726484861221902e-06, + "loss": 0.0146, + "step": 7265 + }, + { + "epoch": 2.42644848889631, + "grad_norm": 0.32223930603474793, + "learning_rate": 1.0714460550600859e-06, + "loss": 0.0204, + "step": 7266 + }, + { + "epoch": 2.4267824344631825, + "grad_norm": 0.313047643406518, + "learning_rate": 1.0702442174523132e-06, + "loss": 0.016, + "step": 7267 + }, + { + "epoch": 2.4271163800300553, + "grad_norm": 0.2593153263087686, + "learning_rate": 1.0690429734804214e-06, + "loss": 0.0088, + "step": 7268 + }, + { + "epoch": 2.4274503255969275, + "grad_norm": 0.2815875789425934, + "learning_rate": 1.0678423233258755e-06, + "loss": 0.0149, + "step": 7269 + }, + { + "epoch": 2.4277842711638002, + "grad_norm": 0.3730049656620886, + "learning_rate": 1.0666422671700438e-06, + "loss": 0.0155, + "step": 7270 + }, + { + "epoch": 2.428118216730673, + "grad_norm": 0.35707437192802916, + "learning_rate": 1.065442805194214e-06, + "loss": 0.0131, + "step": 7271 + }, + { + "epoch": 2.4284521622975457, + "grad_norm": 0.2455000952029101, + "learning_rate": 1.0642439375795748e-06, + "loss": 0.0108, + "step": 7272 + }, + { + "epoch": 2.428786107864418, + "grad_norm": 0.3720971236342139, + "learning_rate": 1.0630456645072324e-06, + "loss": 0.0213, + "step": 7273 + }, + { + "epoch": 2.4291200534312907, + "grad_norm": 0.29797087934836636, + "learning_rate": 1.0618479861581971e-06, + "loss": 0.0137, + "step": 7274 + }, + { + "epoch": 2.4294539989981634, + "grad_norm": 0.2400773867898801, + "learning_rate": 1.060650902713395e-06, + "loss": 0.0109, + "step": 7275 + }, + { + "epoch": 2.429787944565036, + "grad_norm": 0.3130496837461216, + "learning_rate": 1.0594544143536572e-06, + "loss": 0.0158, + "step": 7276 + }, + { + "epoch": 2.4301218901319084, + "grad_norm": 0.2865455182480135, + "learning_rate": 1.0582585212597286e-06, + "loss": 0.0127, + "step": 7277 + }, + { + "epoch": 2.430455835698781, + "grad_norm": 0.305507595068663, + "learning_rate": 1.0570632236122641e-06, + "loss": 0.0096, + "step": 7278 + }, + { + "epoch": 2.430789781265654, + "grad_norm": 0.22714468428546217, + "learning_rate": 1.0558685215918246e-06, + "loss": 0.0083, + "step": 7279 + }, + { + "epoch": 2.431123726832526, + "grad_norm": 0.30195079622243304, + "learning_rate": 1.0546744153788858e-06, + "loss": 0.013, + "step": 7280 + }, + { + "epoch": 2.431457672399399, + "grad_norm": 0.4330203059273548, + "learning_rate": 1.0534809051538324e-06, + "loss": 0.0173, + "step": 7281 + }, + { + "epoch": 2.4317916179662715, + "grad_norm": 0.3901599505968961, + "learning_rate": 1.0522879910969563e-06, + "loss": 0.0167, + "step": 7282 + }, + { + "epoch": 2.4321255635331442, + "grad_norm": 0.33959626776148355, + "learning_rate": 1.0510956733884614e-06, + "loss": 0.0118, + "step": 7283 + }, + { + "epoch": 2.4324595091000165, + "grad_norm": 0.3049771255951287, + "learning_rate": 1.0499039522084637e-06, + "loss": 0.012, + "step": 7284 + }, + { + "epoch": 2.4327934546668892, + "grad_norm": 0.2925027780827184, + "learning_rate": 1.0487128277369829e-06, + "loss": 0.015, + "step": 7285 + }, + { + "epoch": 2.433127400233762, + "grad_norm": 0.3465757430233572, + "learning_rate": 1.0475223001539564e-06, + "loss": 0.0143, + "step": 7286 + }, + { + "epoch": 2.4334613458006347, + "grad_norm": 0.2921377854090987, + "learning_rate": 1.0463323696392236e-06, + "loss": 0.0131, + "step": 7287 + }, + { + "epoch": 2.433795291367507, + "grad_norm": 0.22218855506720242, + "learning_rate": 1.0451430363725395e-06, + "loss": 0.0098, + "step": 7288 + }, + { + "epoch": 2.4341292369343797, + "grad_norm": 0.24695462801766666, + "learning_rate": 1.043954300533566e-06, + "loss": 0.0117, + "step": 7289 + }, + { + "epoch": 2.4344631825012524, + "grad_norm": 0.28325740225039686, + "learning_rate": 1.0427661623018786e-06, + "loss": 0.0142, + "step": 7290 + }, + { + "epoch": 2.4347971280681246, + "grad_norm": 0.28787787794787173, + "learning_rate": 1.0415786218569557e-06, + "loss": 0.015, + "step": 7291 + }, + { + "epoch": 2.4351310736349974, + "grad_norm": 0.21698008235982977, + "learning_rate": 1.0403916793781922e-06, + "loss": 0.0126, + "step": 7292 + }, + { + "epoch": 2.43546501920187, + "grad_norm": 0.20833791702536347, + "learning_rate": 1.0392053350448867e-06, + "loss": 0.0089, + "step": 7293 + }, + { + "epoch": 2.435798964768743, + "grad_norm": 0.30648024865173745, + "learning_rate": 1.0380195890362527e-06, + "loss": 0.015, + "step": 7294 + }, + { + "epoch": 2.4361329103356155, + "grad_norm": 0.25092182838695504, + "learning_rate": 1.0368344415314101e-06, + "loss": 0.0135, + "step": 7295 + }, + { + "epoch": 2.436466855902488, + "grad_norm": 0.23198410448315185, + "learning_rate": 1.0356498927093916e-06, + "loss": 0.0091, + "step": 7296 + }, + { + "epoch": 2.4368008014693605, + "grad_norm": 0.2375015576403759, + "learning_rate": 1.0344659427491343e-06, + "loss": 0.0114, + "step": 7297 + }, + { + "epoch": 2.4371347470362332, + "grad_norm": 0.2945697420611448, + "learning_rate": 1.0332825918294898e-06, + "loss": 0.013, + "step": 7298 + }, + { + "epoch": 2.4374686926031055, + "grad_norm": 0.2983015023268837, + "learning_rate": 1.0320998401292154e-06, + "loss": 0.0177, + "step": 7299 + }, + { + "epoch": 2.437802638169978, + "grad_norm": 0.3106444295761094, + "learning_rate": 1.0309176878269806e-06, + "loss": 0.0163, + "step": 7300 + }, + { + "epoch": 2.438136583736851, + "grad_norm": 0.27717876792538665, + "learning_rate": 1.0297361351013646e-06, + "loss": 0.0155, + "step": 7301 + }, + { + "epoch": 2.4384705293037237, + "grad_norm": 0.39237590046406856, + "learning_rate": 1.028555182130853e-06, + "loss": 0.0179, + "step": 7302 + }, + { + "epoch": 2.438804474870596, + "grad_norm": 0.25520872453543136, + "learning_rate": 1.027374829093843e-06, + "loss": 0.0153, + "step": 7303 + }, + { + "epoch": 2.4391384204374686, + "grad_norm": 0.26162517217255865, + "learning_rate": 1.0261950761686423e-06, + "loss": 0.0107, + "step": 7304 + }, + { + "epoch": 2.4394723660043414, + "grad_norm": 0.20978647733689199, + "learning_rate": 1.0250159235334645e-06, + "loss": 0.0087, + "step": 7305 + }, + { + "epoch": 2.439806311571214, + "grad_norm": 0.3167807769076685, + "learning_rate": 1.0238373713664351e-06, + "loss": 0.0153, + "step": 7306 + }, + { + "epoch": 2.4401402571380864, + "grad_norm": 0.27678055611181496, + "learning_rate": 1.0226594198455903e-06, + "loss": 0.0155, + "step": 7307 + }, + { + "epoch": 2.440474202704959, + "grad_norm": 0.2565559204887818, + "learning_rate": 1.0214820691488698e-06, + "loss": 0.0088, + "step": 7308 + }, + { + "epoch": 2.440808148271832, + "grad_norm": 0.28826978115615537, + "learning_rate": 1.02030531945413e-06, + "loss": 0.0167, + "step": 7309 + }, + { + "epoch": 2.441142093838704, + "grad_norm": 0.284312603833394, + "learning_rate": 1.0191291709391298e-06, + "loss": 0.0125, + "step": 7310 + }, + { + "epoch": 2.441476039405577, + "grad_norm": 0.3861989846609746, + "learning_rate": 1.0179536237815413e-06, + "loss": 0.0231, + "step": 7311 + }, + { + "epoch": 2.4418099849724495, + "grad_norm": 0.24064228201789706, + "learning_rate": 1.016778678158945e-06, + "loss": 0.0112, + "step": 7312 + }, + { + "epoch": 2.442143930539322, + "grad_norm": 0.3123762844764769, + "learning_rate": 1.015604334248832e-06, + "loss": 0.0163, + "step": 7313 + }, + { + "epoch": 2.442477876106195, + "grad_norm": 0.26853057421196386, + "learning_rate": 1.0144305922285975e-06, + "loss": 0.0102, + "step": 7314 + }, + { + "epoch": 2.442811821673067, + "grad_norm": 0.31314817805475603, + "learning_rate": 1.0132574522755518e-06, + "loss": 0.0206, + "step": 7315 + }, + { + "epoch": 2.44314576723994, + "grad_norm": 0.3021525501381858, + "learning_rate": 1.0120849145669093e-06, + "loss": 0.0176, + "step": 7316 + }, + { + "epoch": 2.4434797128068126, + "grad_norm": 0.2970065454054504, + "learning_rate": 1.010912979279796e-06, + "loss": 0.0143, + "step": 7317 + }, + { + "epoch": 2.443813658373685, + "grad_norm": 0.24825508662583712, + "learning_rate": 1.009741646591248e-06, + "loss": 0.0126, + "step": 7318 + }, + { + "epoch": 2.4441476039405576, + "grad_norm": 0.2597659499615799, + "learning_rate": 1.0085709166782088e-06, + "loss": 0.0144, + "step": 7319 + }, + { + "epoch": 2.4444815495074304, + "grad_norm": 0.25450949330419564, + "learning_rate": 1.0074007897175291e-06, + "loss": 0.0111, + "step": 7320 + }, + { + "epoch": 2.444815495074303, + "grad_norm": 0.3156661941355761, + "learning_rate": 1.0062312658859723e-06, + "loss": 0.0171, + "step": 7321 + }, + { + "epoch": 2.4451494406411753, + "grad_norm": 0.32955147077355185, + "learning_rate": 1.0050623453602075e-06, + "loss": 0.0137, + "step": 7322 + }, + { + "epoch": 2.445483386208048, + "grad_norm": 0.2703462933932539, + "learning_rate": 1.0038940283168136e-06, + "loss": 0.0145, + "step": 7323 + }, + { + "epoch": 2.4458173317749208, + "grad_norm": 0.3294855990312569, + "learning_rate": 1.0027263149322797e-06, + "loss": 0.0192, + "step": 7324 + }, + { + "epoch": 2.4461512773417935, + "grad_norm": 0.24937110062318102, + "learning_rate": 1.001559205383003e-06, + "loss": 0.018, + "step": 7325 + }, + { + "epoch": 2.4464852229086658, + "grad_norm": 0.2548893452856415, + "learning_rate": 1.000392699845288e-06, + "loss": 0.0158, + "step": 7326 + }, + { + "epoch": 2.4468191684755385, + "grad_norm": 0.32509317137948335, + "learning_rate": 9.992267984953503e-07, + "loss": 0.0163, + "step": 7327 + }, + { + "epoch": 2.447153114042411, + "grad_norm": 0.3356595603778868, + "learning_rate": 9.98061501509311e-07, + "loss": 0.0165, + "step": 7328 + }, + { + "epoch": 2.4474870596092835, + "grad_norm": 0.2976814000591848, + "learning_rate": 9.968968090632032e-07, + "loss": 0.0141, + "step": 7329 + }, + { + "epoch": 2.447821005176156, + "grad_norm": 0.31139018416994313, + "learning_rate": 9.957327213329687e-07, + "loss": 0.0202, + "step": 7330 + }, + { + "epoch": 2.448154950743029, + "grad_norm": 0.23872054040481558, + "learning_rate": 9.945692384944544e-07, + "loss": 0.0105, + "step": 7331 + }, + { + "epoch": 2.4484888963099016, + "grad_norm": 0.3190787506388644, + "learning_rate": 9.934063607234202e-07, + "loss": 0.0157, + "step": 7332 + }, + { + "epoch": 2.448822841876774, + "grad_norm": 0.28657011536320304, + "learning_rate": 9.922440881955298e-07, + "loss": 0.0118, + "step": 7333 + }, + { + "epoch": 2.4491567874436466, + "grad_norm": 0.3540342936916669, + "learning_rate": 9.910824210863611e-07, + "loss": 0.0171, + "step": 7334 + }, + { + "epoch": 2.4494907330105193, + "grad_norm": 0.33985510555077725, + "learning_rate": 9.899213595713935e-07, + "loss": 0.024, + "step": 7335 + }, + { + "epoch": 2.449824678577392, + "grad_norm": 0.24922446167159795, + "learning_rate": 9.887609038260243e-07, + "loss": 0.0142, + "step": 7336 + }, + { + "epoch": 2.4501586241442643, + "grad_norm": 0.4667165619725214, + "learning_rate": 9.876010540255504e-07, + "loss": 0.0163, + "step": 7337 + }, + { + "epoch": 2.450492569711137, + "grad_norm": 0.25087403523273744, + "learning_rate": 9.86441810345183e-07, + "loss": 0.0123, + "step": 7338 + }, + { + "epoch": 2.4508265152780098, + "grad_norm": 0.2623893719815298, + "learning_rate": 9.852831729600365e-07, + "loss": 0.0131, + "step": 7339 + }, + { + "epoch": 2.451160460844882, + "grad_norm": 0.18227982549195418, + "learning_rate": 9.841251420451398e-07, + "loss": 0.008, + "step": 7340 + }, + { + "epoch": 2.4514944064117548, + "grad_norm": 0.2928098522990452, + "learning_rate": 9.829677177754231e-07, + "loss": 0.0142, + "step": 7341 + }, + { + "epoch": 2.4518283519786275, + "grad_norm": 0.28737546887449433, + "learning_rate": 9.818109003257348e-07, + "loss": 0.0124, + "step": 7342 + }, + { + "epoch": 2.4521622975455, + "grad_norm": 0.283085700704366, + "learning_rate": 9.806546898708213e-07, + "loss": 0.0089, + "step": 7343 + }, + { + "epoch": 2.452496243112373, + "grad_norm": 0.2503307668814493, + "learning_rate": 9.794990865853444e-07, + "loss": 0.0098, + "step": 7344 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.3123889008674629, + "learning_rate": 9.783440906438686e-07, + "loss": 0.016, + "step": 7345 + }, + { + "epoch": 2.453164134246118, + "grad_norm": 0.2523087173336739, + "learning_rate": 9.771897022208732e-07, + "loss": 0.0085, + "step": 7346 + }, + { + "epoch": 2.4534980798129906, + "grad_norm": 0.2572594641327064, + "learning_rate": 9.760359214907372e-07, + "loss": 0.0114, + "step": 7347 + }, + { + "epoch": 2.453832025379863, + "grad_norm": 0.3432933434003597, + "learning_rate": 9.74882748627759e-07, + "loss": 0.0113, + "step": 7348 + }, + { + "epoch": 2.4541659709467356, + "grad_norm": 0.24869987251894454, + "learning_rate": 9.737301838061342e-07, + "loss": 0.0074, + "step": 7349 + }, + { + "epoch": 2.4544999165136083, + "grad_norm": 0.2459174687004886, + "learning_rate": 9.725782271999744e-07, + "loss": 0.0105, + "step": 7350 + }, + { + "epoch": 2.454833862080481, + "grad_norm": 0.2563920283285319, + "learning_rate": 9.714268789832937e-07, + "loss": 0.0125, + "step": 7351 + }, + { + "epoch": 2.4551678076473533, + "grad_norm": 0.32344159898648145, + "learning_rate": 9.702761393300176e-07, + "loss": 0.0084, + "step": 7352 + }, + { + "epoch": 2.455501753214226, + "grad_norm": 0.263201655566848, + "learning_rate": 9.691260084139802e-07, + "loss": 0.01, + "step": 7353 + }, + { + "epoch": 2.4558356987810988, + "grad_norm": 0.2712607887343785, + "learning_rate": 9.679764864089203e-07, + "loss": 0.0132, + "step": 7354 + }, + { + "epoch": 2.4561696443479715, + "grad_norm": 0.32113476946235286, + "learning_rate": 9.668275734884885e-07, + "loss": 0.0158, + "step": 7355 + }, + { + "epoch": 2.4565035899148437, + "grad_norm": 0.28333905746829324, + "learning_rate": 9.656792698262402e-07, + "loss": 0.0119, + "step": 7356 + }, + { + "epoch": 2.4568375354817165, + "grad_norm": 0.2698329597727835, + "learning_rate": 9.645315755956413e-07, + "loss": 0.0122, + "step": 7357 + }, + { + "epoch": 2.457171481048589, + "grad_norm": 0.23963820262015254, + "learning_rate": 9.633844909700618e-07, + "loss": 0.013, + "step": 7358 + }, + { + "epoch": 2.4575054266154615, + "grad_norm": 0.3335867582433281, + "learning_rate": 9.622380161227873e-07, + "loss": 0.0171, + "step": 7359 + }, + { + "epoch": 2.457839372182334, + "grad_norm": 0.24621102816372686, + "learning_rate": 9.61092151227002e-07, + "loss": 0.0115, + "step": 7360 + }, + { + "epoch": 2.458173317749207, + "grad_norm": 0.25744811295419195, + "learning_rate": 9.599468964558051e-07, + "loss": 0.0081, + "step": 7361 + }, + { + "epoch": 2.4585072633160796, + "grad_norm": 0.2814718685341393, + "learning_rate": 9.588022519821983e-07, + "loss": 0.0226, + "step": 7362 + }, + { + "epoch": 2.4588412088829523, + "grad_norm": 0.25557267803743267, + "learning_rate": 9.576582179790967e-07, + "loss": 0.0108, + "step": 7363 + }, + { + "epoch": 2.4591751544498246, + "grad_norm": 0.25217261839201205, + "learning_rate": 9.565147946193149e-07, + "loss": 0.009, + "step": 7364 + }, + { + "epoch": 2.4595091000166973, + "grad_norm": 0.39641517511590446, + "learning_rate": 9.553719820755869e-07, + "loss": 0.0188, + "step": 7365 + }, + { + "epoch": 2.45984304558357, + "grad_norm": 0.2666369562077089, + "learning_rate": 9.542297805205436e-07, + "loss": 0.0131, + "step": 7366 + }, + { + "epoch": 2.4601769911504423, + "grad_norm": 0.29185457548413524, + "learning_rate": 9.530881901267308e-07, + "loss": 0.0118, + "step": 7367 + }, + { + "epoch": 2.460510936717315, + "grad_norm": 0.34129391396897135, + "learning_rate": 9.519472110665967e-07, + "loss": 0.015, + "step": 7368 + }, + { + "epoch": 2.4608448822841877, + "grad_norm": 0.3359051402514027, + "learning_rate": 9.508068435125012e-07, + "loss": 0.0184, + "step": 7369 + }, + { + "epoch": 2.4611788278510605, + "grad_norm": 0.3149923402450988, + "learning_rate": 9.496670876367076e-07, + "loss": 0.0131, + "step": 7370 + }, + { + "epoch": 2.4615127734179327, + "grad_norm": 0.2595265333132604, + "learning_rate": 9.485279436113942e-07, + "loss": 0.0115, + "step": 7371 + }, + { + "epoch": 2.4618467189848054, + "grad_norm": 0.3149118619492064, + "learning_rate": 9.473894116086379e-07, + "loss": 0.0146, + "step": 7372 + }, + { + "epoch": 2.462180664551678, + "grad_norm": 0.2807543285827548, + "learning_rate": 9.462514918004301e-07, + "loss": 0.0113, + "step": 7373 + }, + { + "epoch": 2.462514610118551, + "grad_norm": 0.37467502315809514, + "learning_rate": 9.451141843586647e-07, + "loss": 0.0135, + "step": 7374 + }, + { + "epoch": 2.462848555685423, + "grad_norm": 0.259262591473988, + "learning_rate": 9.439774894551479e-07, + "loss": 0.0105, + "step": 7375 + }, + { + "epoch": 2.463182501252296, + "grad_norm": 0.34799217970770124, + "learning_rate": 9.428414072615877e-07, + "loss": 0.0202, + "step": 7376 + }, + { + "epoch": 2.4635164468191686, + "grad_norm": 0.2427597504087343, + "learning_rate": 9.417059379496047e-07, + "loss": 0.0096, + "step": 7377 + }, + { + "epoch": 2.463850392386041, + "grad_norm": 0.2867707268451149, + "learning_rate": 9.40571081690726e-07, + "loss": 0.0111, + "step": 7378 + }, + { + "epoch": 2.4641843379529136, + "grad_norm": 0.2845436020853542, + "learning_rate": 9.394368386563823e-07, + "loss": 0.0095, + "step": 7379 + }, + { + "epoch": 2.4645182835197863, + "grad_norm": 0.29399836991743034, + "learning_rate": 9.383032090179173e-07, + "loss": 0.0152, + "step": 7380 + }, + { + "epoch": 2.464852229086659, + "grad_norm": 0.228948456972105, + "learning_rate": 9.371701929465759e-07, + "loss": 0.008, + "step": 7381 + }, + { + "epoch": 2.4651861746535313, + "grad_norm": 0.36012924008642466, + "learning_rate": 9.360377906135148e-07, + "loss": 0.0247, + "step": 7382 + }, + { + "epoch": 2.465520120220404, + "grad_norm": 0.28176533973421675, + "learning_rate": 9.349060021897976e-07, + "loss": 0.0118, + "step": 7383 + }, + { + "epoch": 2.4658540657872767, + "grad_norm": 0.2968451728673293, + "learning_rate": 9.337748278463948e-07, + "loss": 0.0136, + "step": 7384 + }, + { + "epoch": 2.4661880113541494, + "grad_norm": 0.41312460562774683, + "learning_rate": 9.326442677541813e-07, + "loss": 0.0188, + "step": 7385 + }, + { + "epoch": 2.4665219569210217, + "grad_norm": 0.32533184691092876, + "learning_rate": 9.31514322083944e-07, + "loss": 0.0149, + "step": 7386 + }, + { + "epoch": 2.4668559024878944, + "grad_norm": 0.2682778800681192, + "learning_rate": 9.303849910063717e-07, + "loss": 0.014, + "step": 7387 + }, + { + "epoch": 2.467189848054767, + "grad_norm": 0.34155137603953306, + "learning_rate": 9.292562746920647e-07, + "loss": 0.025, + "step": 7388 + }, + { + "epoch": 2.4675237936216394, + "grad_norm": 0.30184382378474317, + "learning_rate": 9.281281733115288e-07, + "loss": 0.0127, + "step": 7389 + }, + { + "epoch": 2.467857739188512, + "grad_norm": 0.28929079886766496, + "learning_rate": 9.270006870351789e-07, + "loss": 0.0133, + "step": 7390 + }, + { + "epoch": 2.468191684755385, + "grad_norm": 0.3159966548414994, + "learning_rate": 9.258738160333314e-07, + "loss": 0.0197, + "step": 7391 + }, + { + "epoch": 2.4685256303222576, + "grad_norm": 0.3834637689221866, + "learning_rate": 9.247475604762168e-07, + "loss": 0.018, + "step": 7392 + }, + { + "epoch": 2.4688595758891303, + "grad_norm": 0.2406444693430883, + "learning_rate": 9.236219205339647e-07, + "loss": 0.0098, + "step": 7393 + }, + { + "epoch": 2.4691935214560026, + "grad_norm": 0.24324079827551298, + "learning_rate": 9.224968963766223e-07, + "loss": 0.0124, + "step": 7394 + }, + { + "epoch": 2.4695274670228753, + "grad_norm": 0.2855170662342513, + "learning_rate": 9.213724881741337e-07, + "loss": 0.011, + "step": 7395 + }, + { + "epoch": 2.469861412589748, + "grad_norm": 0.2398212449249697, + "learning_rate": 9.202486960963559e-07, + "loss": 0.0116, + "step": 7396 + }, + { + "epoch": 2.4701953581566203, + "grad_norm": 0.2823663592353502, + "learning_rate": 9.191255203130489e-07, + "loss": 0.0149, + "step": 7397 + }, + { + "epoch": 2.470529303723493, + "grad_norm": 0.2707020008992273, + "learning_rate": 9.18002960993884e-07, + "loss": 0.0138, + "step": 7398 + }, + { + "epoch": 2.4708632492903657, + "grad_norm": 0.2717785560876566, + "learning_rate": 9.168810183084348e-07, + "loss": 0.0158, + "step": 7399 + }, + { + "epoch": 2.4711971948572384, + "grad_norm": 0.2772250103816418, + "learning_rate": 9.157596924261847e-07, + "loss": 0.0113, + "step": 7400 + }, + { + "epoch": 2.4715311404241107, + "grad_norm": 0.35576545877944604, + "learning_rate": 9.146389835165248e-07, + "loss": 0.0142, + "step": 7401 + }, + { + "epoch": 2.4718650859909834, + "grad_norm": 0.2481302215494884, + "learning_rate": 9.135188917487487e-07, + "loss": 0.0088, + "step": 7402 + }, + { + "epoch": 2.472199031557856, + "grad_norm": 0.23749219989866024, + "learning_rate": 9.12399417292062e-07, + "loss": 0.0082, + "step": 7403 + }, + { + "epoch": 2.472532977124729, + "grad_norm": 0.2543245478587539, + "learning_rate": 9.112805603155716e-07, + "loss": 0.0121, + "step": 7404 + }, + { + "epoch": 2.472866922691601, + "grad_norm": 0.5376307491249888, + "learning_rate": 9.101623209882965e-07, + "loss": 0.0158, + "step": 7405 + }, + { + "epoch": 2.473200868258474, + "grad_norm": 0.26655967629657945, + "learning_rate": 9.090446994791585e-07, + "loss": 0.0147, + "step": 7406 + }, + { + "epoch": 2.4735348138253466, + "grad_norm": 0.4096313618904711, + "learning_rate": 9.079276959569899e-07, + "loss": 0.0204, + "step": 7407 + }, + { + "epoch": 2.473868759392219, + "grad_norm": 0.29010719357614445, + "learning_rate": 9.068113105905235e-07, + "loss": 0.0132, + "step": 7408 + }, + { + "epoch": 2.4742027049590916, + "grad_norm": 0.2664435999033433, + "learning_rate": 9.056955435484061e-07, + "loss": 0.0112, + "step": 7409 + }, + { + "epoch": 2.4745366505259643, + "grad_norm": 0.31839424122550386, + "learning_rate": 9.045803949991843e-07, + "loss": 0.0134, + "step": 7410 + }, + { + "epoch": 2.474870596092837, + "grad_norm": 0.22594237538056391, + "learning_rate": 9.034658651113154e-07, + "loss": 0.0105, + "step": 7411 + }, + { + "epoch": 2.4752045416597097, + "grad_norm": 0.32982060721670076, + "learning_rate": 9.023519540531633e-07, + "loss": 0.0206, + "step": 7412 + }, + { + "epoch": 2.475538487226582, + "grad_norm": 0.22372970550593976, + "learning_rate": 9.01238661992998e-07, + "loss": 0.0111, + "step": 7413 + }, + { + "epoch": 2.4758724327934547, + "grad_norm": 0.2198342491823257, + "learning_rate": 9.001259890989927e-07, + "loss": 0.01, + "step": 7414 + }, + { + "epoch": 2.4762063783603274, + "grad_norm": 0.25854013340820664, + "learning_rate": 8.990139355392324e-07, + "loss": 0.0098, + "step": 7415 + }, + { + "epoch": 2.4765403239271997, + "grad_norm": 0.2059290500377877, + "learning_rate": 8.979025014817039e-07, + "loss": 0.0071, + "step": 7416 + }, + { + "epoch": 2.4768742694940724, + "grad_norm": 0.24791823447152975, + "learning_rate": 8.967916870943028e-07, + "loss": 0.0105, + "step": 7417 + }, + { + "epoch": 2.477208215060945, + "grad_norm": 0.2267840483916544, + "learning_rate": 8.956814925448309e-07, + "loss": 0.012, + "step": 7418 + }, + { + "epoch": 2.477542160627818, + "grad_norm": 0.27758594398830816, + "learning_rate": 8.945719180009977e-07, + "loss": 0.0131, + "step": 7419 + }, + { + "epoch": 2.47787610619469, + "grad_norm": 0.2643542821536824, + "learning_rate": 8.934629636304149e-07, + "loss": 0.0117, + "step": 7420 + }, + { + "epoch": 2.478210051761563, + "grad_norm": 0.5148591572402273, + "learning_rate": 8.923546296006058e-07, + "loss": 0.029, + "step": 7421 + }, + { + "epoch": 2.4785439973284356, + "grad_norm": 0.2317178743468494, + "learning_rate": 8.912469160789944e-07, + "loss": 0.0094, + "step": 7422 + }, + { + "epoch": 2.4788779428953083, + "grad_norm": 0.3059872414456733, + "learning_rate": 8.901398232329156e-07, + "loss": 0.0127, + "step": 7423 + }, + { + "epoch": 2.4792118884621805, + "grad_norm": 0.32666304109913874, + "learning_rate": 8.890333512296095e-07, + "loss": 0.0137, + "step": 7424 + }, + { + "epoch": 2.4795458340290533, + "grad_norm": 0.2492647478725229, + "learning_rate": 8.879275002362197e-07, + "loss": 0.0113, + "step": 7425 + }, + { + "epoch": 2.479879779595926, + "grad_norm": 0.5699061430872919, + "learning_rate": 8.868222704198004e-07, + "loss": 0.0153, + "step": 7426 + }, + { + "epoch": 2.4802137251627983, + "grad_norm": 0.32537831526187644, + "learning_rate": 8.857176619473068e-07, + "loss": 0.0133, + "step": 7427 + }, + { + "epoch": 2.480547670729671, + "grad_norm": 0.2645468568701481, + "learning_rate": 8.846136749856044e-07, + "loss": 0.0106, + "step": 7428 + }, + { + "epoch": 2.4808816162965437, + "grad_norm": 0.28970387862449526, + "learning_rate": 8.835103097014636e-07, + "loss": 0.0119, + "step": 7429 + }, + { + "epoch": 2.4812155618634164, + "grad_norm": 0.3264385633712302, + "learning_rate": 8.824075662615617e-07, + "loss": 0.0164, + "step": 7430 + }, + { + "epoch": 2.4815495074302887, + "grad_norm": 0.3392268866713018, + "learning_rate": 8.813054448324792e-07, + "loss": 0.0216, + "step": 7431 + }, + { + "epoch": 2.4818834529971614, + "grad_norm": 0.24878933053929583, + "learning_rate": 8.80203945580706e-07, + "loss": 0.0111, + "step": 7432 + }, + { + "epoch": 2.482217398564034, + "grad_norm": 0.3115663099998713, + "learning_rate": 8.791030686726349e-07, + "loss": 0.0133, + "step": 7433 + }, + { + "epoch": 2.482551344130907, + "grad_norm": 0.2686470558722783, + "learning_rate": 8.780028142745673e-07, + "loss": 0.013, + "step": 7434 + }, + { + "epoch": 2.482885289697779, + "grad_norm": 0.26978008421913086, + "learning_rate": 8.769031825527097e-07, + "loss": 0.0111, + "step": 7435 + }, + { + "epoch": 2.483219235264652, + "grad_norm": 0.3637520293394349, + "learning_rate": 8.758041736731753e-07, + "loss": 0.0114, + "step": 7436 + }, + { + "epoch": 2.4835531808315245, + "grad_norm": 0.27255003830385555, + "learning_rate": 8.747057878019799e-07, + "loss": 0.014, + "step": 7437 + }, + { + "epoch": 2.483887126398397, + "grad_norm": 0.2747134069292236, + "learning_rate": 8.736080251050505e-07, + "loss": 0.0103, + "step": 7438 + }, + { + "epoch": 2.4842210719652695, + "grad_norm": 0.2372651711736165, + "learning_rate": 8.725108857482145e-07, + "loss": 0.0101, + "step": 7439 + }, + { + "epoch": 2.4845550175321423, + "grad_norm": 0.3305539702320851, + "learning_rate": 8.714143698972083e-07, + "loss": 0.0133, + "step": 7440 + }, + { + "epoch": 2.484888963099015, + "grad_norm": 0.2739958689199965, + "learning_rate": 8.703184777176743e-07, + "loss": 0.0093, + "step": 7441 + }, + { + "epoch": 2.4852229086658877, + "grad_norm": 0.23830424327958363, + "learning_rate": 8.692232093751613e-07, + "loss": 0.011, + "step": 7442 + }, + { + "epoch": 2.48555685423276, + "grad_norm": 0.3024521958358322, + "learning_rate": 8.68128565035119e-07, + "loss": 0.0179, + "step": 7443 + }, + { + "epoch": 2.4858907997996327, + "grad_norm": 0.26137383555521065, + "learning_rate": 8.670345448629097e-07, + "loss": 0.0092, + "step": 7444 + }, + { + "epoch": 2.4862247453665054, + "grad_norm": 0.2872450507530698, + "learning_rate": 8.659411490237951e-07, + "loss": 0.0166, + "step": 7445 + }, + { + "epoch": 2.4865586909333777, + "grad_norm": 0.37505759068410366, + "learning_rate": 8.648483776829469e-07, + "loss": 0.0167, + "step": 7446 + }, + { + "epoch": 2.4868926365002504, + "grad_norm": 0.26292564056472145, + "learning_rate": 8.637562310054425e-07, + "loss": 0.0131, + "step": 7447 + }, + { + "epoch": 2.487226582067123, + "grad_norm": 0.28773112340703616, + "learning_rate": 8.626647091562612e-07, + "loss": 0.0184, + "step": 7448 + }, + { + "epoch": 2.487560527633996, + "grad_norm": 0.2730958661513948, + "learning_rate": 8.61573812300292e-07, + "loss": 0.0106, + "step": 7449 + }, + { + "epoch": 2.487894473200868, + "grad_norm": 0.23593585732233188, + "learning_rate": 8.604835406023254e-07, + "loss": 0.0091, + "step": 7450 + }, + { + "epoch": 2.488228418767741, + "grad_norm": 0.22218945927555991, + "learning_rate": 8.593938942270613e-07, + "loss": 0.0105, + "step": 7451 + }, + { + "epoch": 2.4885623643346135, + "grad_norm": 0.2848038418982785, + "learning_rate": 8.583048733391036e-07, + "loss": 0.0127, + "step": 7452 + }, + { + "epoch": 2.4888963099014862, + "grad_norm": 0.28377579944649034, + "learning_rate": 8.57216478102963e-07, + "loss": 0.0124, + "step": 7453 + }, + { + "epoch": 2.4892302554683585, + "grad_norm": 0.24586441459462724, + "learning_rate": 8.561287086830516e-07, + "loss": 0.0083, + "step": 7454 + }, + { + "epoch": 2.4895642010352312, + "grad_norm": 0.2367840812098278, + "learning_rate": 8.550415652436927e-07, + "loss": 0.0075, + "step": 7455 + }, + { + "epoch": 2.489898146602104, + "grad_norm": 0.2748449301147865, + "learning_rate": 8.539550479491093e-07, + "loss": 0.0171, + "step": 7456 + }, + { + "epoch": 2.4902320921689762, + "grad_norm": 0.2958625824272446, + "learning_rate": 8.528691569634357e-07, + "loss": 0.019, + "step": 7457 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.33993780967279436, + "learning_rate": 8.517838924507039e-07, + "loss": 0.0151, + "step": 7458 + }, + { + "epoch": 2.4908999833027217, + "grad_norm": 0.318394649387788, + "learning_rate": 8.50699254574861e-07, + "loss": 0.0171, + "step": 7459 + }, + { + "epoch": 2.4912339288695944, + "grad_norm": 0.2135113378957332, + "learning_rate": 8.496152434997518e-07, + "loss": 0.0096, + "step": 7460 + }, + { + "epoch": 2.491567874436467, + "grad_norm": 0.26150100790717984, + "learning_rate": 8.485318593891295e-07, + "loss": 0.0112, + "step": 7461 + }, + { + "epoch": 2.4919018200033394, + "grad_norm": 0.5438477535756069, + "learning_rate": 8.474491024066512e-07, + "loss": 0.0295, + "step": 7462 + }, + { + "epoch": 2.492235765570212, + "grad_norm": 0.2324033045906197, + "learning_rate": 8.463669727158819e-07, + "loss": 0.0104, + "step": 7463 + }, + { + "epoch": 2.492569711137085, + "grad_norm": 0.2868012703797143, + "learning_rate": 8.45285470480286e-07, + "loss": 0.0128, + "step": 7464 + }, + { + "epoch": 2.492903656703957, + "grad_norm": 0.34009350810193995, + "learning_rate": 8.442045958632428e-07, + "loss": 0.0119, + "step": 7465 + }, + { + "epoch": 2.49323760227083, + "grad_norm": 0.4513546666143164, + "learning_rate": 8.431243490280267e-07, + "loss": 0.0266, + "step": 7466 + }, + { + "epoch": 2.4935715478377025, + "grad_norm": 0.19228656975813063, + "learning_rate": 8.420447301378249e-07, + "loss": 0.0058, + "step": 7467 + }, + { + "epoch": 2.4939054934045752, + "grad_norm": 0.26433361673700384, + "learning_rate": 8.409657393557236e-07, + "loss": 0.0148, + "step": 7468 + }, + { + "epoch": 2.4942394389714475, + "grad_norm": 0.36446607846846146, + "learning_rate": 8.39887376844718e-07, + "loss": 0.0166, + "step": 7469 + }, + { + "epoch": 2.4945733845383202, + "grad_norm": 0.31405048361481996, + "learning_rate": 8.388096427677095e-07, + "loss": 0.0167, + "step": 7470 + }, + { + "epoch": 2.494907330105193, + "grad_norm": 0.32071936783487687, + "learning_rate": 8.377325372874995e-07, + "loss": 0.0162, + "step": 7471 + }, + { + "epoch": 2.4952412756720657, + "grad_norm": 0.26660919712543363, + "learning_rate": 8.366560605668006e-07, + "loss": 0.0092, + "step": 7472 + }, + { + "epoch": 2.495575221238938, + "grad_norm": 0.2847134033925736, + "learning_rate": 8.355802127682238e-07, + "loss": 0.0115, + "step": 7473 + }, + { + "epoch": 2.4959091668058107, + "grad_norm": 0.26420205422985205, + "learning_rate": 8.345049940542904e-07, + "loss": 0.0088, + "step": 7474 + }, + { + "epoch": 2.4962431123726834, + "grad_norm": 0.26939620063837594, + "learning_rate": 8.334304045874248e-07, + "loss": 0.0112, + "step": 7475 + }, + { + "epoch": 2.4965770579395556, + "grad_norm": 0.21972676943472785, + "learning_rate": 8.323564445299575e-07, + "loss": 0.0089, + "step": 7476 + }, + { + "epoch": 2.4969110035064284, + "grad_norm": 0.21377191805667578, + "learning_rate": 8.312831140441207e-07, + "loss": 0.0075, + "step": 7477 + }, + { + "epoch": 2.497244949073301, + "grad_norm": 0.21299621422449647, + "learning_rate": 8.302104132920552e-07, + "loss": 0.0079, + "step": 7478 + }, + { + "epoch": 2.497578894640174, + "grad_norm": 0.23024827188708863, + "learning_rate": 8.291383424358041e-07, + "loss": 0.0108, + "step": 7479 + }, + { + "epoch": 2.497912840207046, + "grad_norm": 0.33857903181597176, + "learning_rate": 8.280669016373172e-07, + "loss": 0.0172, + "step": 7480 + }, + { + "epoch": 2.498246785773919, + "grad_norm": 0.26519779138516475, + "learning_rate": 8.269960910584457e-07, + "loss": 0.0134, + "step": 7481 + }, + { + "epoch": 2.4985807313407915, + "grad_norm": 0.33444793311586507, + "learning_rate": 8.259259108609524e-07, + "loss": 0.0189, + "step": 7482 + }, + { + "epoch": 2.4989146769076642, + "grad_norm": 0.25943343687083725, + "learning_rate": 8.248563612064969e-07, + "loss": 0.0103, + "step": 7483 + }, + { + "epoch": 2.4992486224745365, + "grad_norm": 0.30237306680255943, + "learning_rate": 8.237874422566505e-07, + "loss": 0.0186, + "step": 7484 + }, + { + "epoch": 2.499582568041409, + "grad_norm": 0.2649511850101908, + "learning_rate": 8.227191541728829e-07, + "loss": 0.0139, + "step": 7485 + }, + { + "epoch": 2.499916513608282, + "grad_norm": 0.32639988819096716, + "learning_rate": 8.21651497116574e-07, + "loss": 0.0125, + "step": 7486 + }, + { + "epoch": 2.500250459175154, + "grad_norm": 0.33547525147974094, + "learning_rate": 8.205844712490024e-07, + "loss": 0.0135, + "step": 7487 + }, + { + "epoch": 2.500584404742027, + "grad_norm": 0.2617696819554997, + "learning_rate": 8.195180767313604e-07, + "loss": 0.0099, + "step": 7488 + }, + { + "epoch": 2.5009183503088996, + "grad_norm": 0.3643538060755637, + "learning_rate": 8.184523137247346e-07, + "loss": 0.0177, + "step": 7489 + }, + { + "epoch": 2.5012522958757724, + "grad_norm": 0.30941708656489425, + "learning_rate": 8.173871823901247e-07, + "loss": 0.012, + "step": 7490 + }, + { + "epoch": 2.501586241442645, + "grad_norm": 0.32032174298152893, + "learning_rate": 8.16322682888428e-07, + "loss": 0.0187, + "step": 7491 + }, + { + "epoch": 2.5019201870095173, + "grad_norm": 0.27503930542634536, + "learning_rate": 8.15258815380453e-07, + "loss": 0.0174, + "step": 7492 + }, + { + "epoch": 2.50225413257639, + "grad_norm": 0.2863134875388935, + "learning_rate": 8.141955800269058e-07, + "loss": 0.0144, + "step": 7493 + }, + { + "epoch": 2.502588078143263, + "grad_norm": 0.3006021417430872, + "learning_rate": 8.131329769884027e-07, + "loss": 0.0118, + "step": 7494 + }, + { + "epoch": 2.502922023710135, + "grad_norm": 0.2758410721079843, + "learning_rate": 8.120710064254634e-07, + "loss": 0.0118, + "step": 7495 + }, + { + "epoch": 2.5032559692770078, + "grad_norm": 0.3139761802139935, + "learning_rate": 8.110096684985086e-07, + "loss": 0.0154, + "step": 7496 + }, + { + "epoch": 2.5035899148438805, + "grad_norm": 0.2696734721830606, + "learning_rate": 8.099489633678676e-07, + "loss": 0.0151, + "step": 7497 + }, + { + "epoch": 2.503923860410753, + "grad_norm": 0.3142316456179232, + "learning_rate": 8.088888911937726e-07, + "loss": 0.0158, + "step": 7498 + }, + { + "epoch": 2.5042578059776255, + "grad_norm": 0.4117660260331043, + "learning_rate": 8.078294521363584e-07, + "loss": 0.0259, + "step": 7499 + }, + { + "epoch": 2.504591751544498, + "grad_norm": 0.3236151848025178, + "learning_rate": 8.067706463556663e-07, + "loss": 0.0125, + "step": 7500 + }, + { + "epoch": 2.504925697111371, + "grad_norm": 0.26476105995267335, + "learning_rate": 8.057124740116434e-07, + "loss": 0.0174, + "step": 7501 + }, + { + "epoch": 2.5052596426782436, + "grad_norm": 0.21317169985024756, + "learning_rate": 8.046549352641359e-07, + "loss": 0.0084, + "step": 7502 + }, + { + "epoch": 2.505593588245116, + "grad_norm": 0.2718079570234273, + "learning_rate": 8.035980302729008e-07, + "loss": 0.0141, + "step": 7503 + }, + { + "epoch": 2.5059275338119886, + "grad_norm": 0.26982581170003705, + "learning_rate": 8.025417591975926e-07, + "loss": 0.0096, + "step": 7504 + }, + { + "epoch": 2.5062614793788613, + "grad_norm": 0.2680654971650118, + "learning_rate": 8.014861221977749e-07, + "loss": 0.0146, + "step": 7505 + }, + { + "epoch": 2.5065954249457336, + "grad_norm": 0.3180238980190388, + "learning_rate": 8.004311194329145e-07, + "loss": 0.0152, + "step": 7506 + }, + { + "epoch": 2.5069293705126063, + "grad_norm": 0.24209967918279646, + "learning_rate": 7.993767510623834e-07, + "loss": 0.0112, + "step": 7507 + }, + { + "epoch": 2.507263316079479, + "grad_norm": 0.3272298526452807, + "learning_rate": 7.983230172454531e-07, + "loss": 0.018, + "step": 7508 + }, + { + "epoch": 2.5075972616463518, + "grad_norm": 0.27835864475758876, + "learning_rate": 7.972699181413058e-07, + "loss": 0.012, + "step": 7509 + }, + { + "epoch": 2.5079312072132245, + "grad_norm": 0.27716962050594207, + "learning_rate": 7.962174539090201e-07, + "loss": 0.0104, + "step": 7510 + }, + { + "epoch": 2.5082651527800968, + "grad_norm": 0.26851036744495926, + "learning_rate": 7.951656247075884e-07, + "loss": 0.0174, + "step": 7511 + }, + { + "epoch": 2.5085990983469695, + "grad_norm": 0.3936971307343381, + "learning_rate": 7.941144306958986e-07, + "loss": 0.0266, + "step": 7512 + }, + { + "epoch": 2.508933043913842, + "grad_norm": 0.38525609535037336, + "learning_rate": 7.930638720327477e-07, + "loss": 0.0187, + "step": 7513 + }, + { + "epoch": 2.5092669894807145, + "grad_norm": 0.30054222328034796, + "learning_rate": 7.920139488768325e-07, + "loss": 0.0158, + "step": 7514 + }, + { + "epoch": 2.509600935047587, + "grad_norm": 0.36527045291501914, + "learning_rate": 7.909646613867594e-07, + "loss": 0.0135, + "step": 7515 + }, + { + "epoch": 2.50993488061446, + "grad_norm": 0.28270674511044247, + "learning_rate": 7.899160097210329e-07, + "loss": 0.0127, + "step": 7516 + }, + { + "epoch": 2.510268826181332, + "grad_norm": 0.26290850558982126, + "learning_rate": 7.888679940380644e-07, + "loss": 0.0108, + "step": 7517 + }, + { + "epoch": 2.510602771748205, + "grad_norm": 0.30135195085473543, + "learning_rate": 7.87820614496172e-07, + "loss": 0.0286, + "step": 7518 + }, + { + "epoch": 2.5109367173150776, + "grad_norm": 0.30532503578447345, + "learning_rate": 7.867738712535711e-07, + "loss": 0.0137, + "step": 7519 + }, + { + "epoch": 2.5112706628819503, + "grad_norm": 0.2941924862667987, + "learning_rate": 7.857277644683858e-07, + "loss": 0.0193, + "step": 7520 + }, + { + "epoch": 2.511604608448823, + "grad_norm": 0.26657489483577435, + "learning_rate": 7.846822942986449e-07, + "loss": 0.0104, + "step": 7521 + }, + { + "epoch": 2.5119385540156953, + "grad_norm": 0.34821001515380123, + "learning_rate": 7.836374609022756e-07, + "loss": 0.0162, + "step": 7522 + }, + { + "epoch": 2.512272499582568, + "grad_norm": 0.25992029250167054, + "learning_rate": 7.825932644371137e-07, + "loss": 0.0119, + "step": 7523 + }, + { + "epoch": 2.5126064451494408, + "grad_norm": 0.26989945031975804, + "learning_rate": 7.815497050608989e-07, + "loss": 0.0107, + "step": 7524 + }, + { + "epoch": 2.512940390716313, + "grad_norm": 0.284167749975026, + "learning_rate": 7.805067829312707e-07, + "loss": 0.0131, + "step": 7525 + }, + { + "epoch": 2.5132743362831858, + "grad_norm": 0.26881029588972233, + "learning_rate": 7.79464498205777e-07, + "loss": 0.0137, + "step": 7526 + }, + { + "epoch": 2.5136082818500585, + "grad_norm": 0.28799350090633236, + "learning_rate": 7.78422851041865e-07, + "loss": 0.0124, + "step": 7527 + }, + { + "epoch": 2.513942227416931, + "grad_norm": 0.347891544805565, + "learning_rate": 7.773818415968887e-07, + "loss": 0.0134, + "step": 7528 + }, + { + "epoch": 2.514276172983804, + "grad_norm": 0.2970741533603355, + "learning_rate": 7.763414700281053e-07, + "loss": 0.0119, + "step": 7529 + }, + { + "epoch": 2.514610118550676, + "grad_norm": 0.3539225000276794, + "learning_rate": 7.753017364926757e-07, + "loss": 0.0208, + "step": 7530 + }, + { + "epoch": 2.514944064117549, + "grad_norm": 0.33629761066323144, + "learning_rate": 7.742626411476617e-07, + "loss": 0.0134, + "step": 7531 + }, + { + "epoch": 2.5152780096844216, + "grad_norm": 0.28155100990311854, + "learning_rate": 7.732241841500332e-07, + "loss": 0.0137, + "step": 7532 + }, + { + "epoch": 2.515611955251294, + "grad_norm": 0.2659163733742089, + "learning_rate": 7.721863656566597e-07, + "loss": 0.0128, + "step": 7533 + }, + { + "epoch": 2.5159459008181666, + "grad_norm": 0.24208404243411683, + "learning_rate": 7.711491858243164e-07, + "loss": 0.0138, + "step": 7534 + }, + { + "epoch": 2.5162798463850393, + "grad_norm": 0.26105239540569763, + "learning_rate": 7.701126448096813e-07, + "loss": 0.0101, + "step": 7535 + }, + { + "epoch": 2.5166137919519116, + "grad_norm": 0.2802534568300955, + "learning_rate": 7.69076742769338e-07, + "loss": 0.0126, + "step": 7536 + }, + { + "epoch": 2.5169477375187843, + "grad_norm": 0.24357848550385142, + "learning_rate": 7.68041479859769e-07, + "loss": 0.0154, + "step": 7537 + }, + { + "epoch": 2.517281683085657, + "grad_norm": 0.35228867344053577, + "learning_rate": 7.670068562373656e-07, + "loss": 0.0183, + "step": 7538 + }, + { + "epoch": 2.5176156286525297, + "grad_norm": 0.332641741046175, + "learning_rate": 7.65972872058417e-07, + "loss": 0.0135, + "step": 7539 + }, + { + "epoch": 2.5179495742194025, + "grad_norm": 0.20734832982401147, + "learning_rate": 7.6493952747912e-07, + "loss": 0.0084, + "step": 7540 + }, + { + "epoch": 2.5182835197862747, + "grad_norm": 0.23680939621094196, + "learning_rate": 7.639068226555751e-07, + "loss": 0.0116, + "step": 7541 + }, + { + "epoch": 2.5186174653531475, + "grad_norm": 0.24809204560322856, + "learning_rate": 7.628747577437817e-07, + "loss": 0.0106, + "step": 7542 + }, + { + "epoch": 2.51895141092002, + "grad_norm": 0.24711898252742784, + "learning_rate": 7.618433328996466e-07, + "loss": 0.0106, + "step": 7543 + }, + { + "epoch": 2.5192853564868924, + "grad_norm": 0.39365759248803117, + "learning_rate": 7.608125482789802e-07, + "loss": 0.0192, + "step": 7544 + }, + { + "epoch": 2.519619302053765, + "grad_norm": 0.41108589361119663, + "learning_rate": 7.597824040374918e-07, + "loss": 0.02, + "step": 7545 + }, + { + "epoch": 2.519953247620638, + "grad_norm": 0.19765272619035332, + "learning_rate": 7.587529003307981e-07, + "loss": 0.0081, + "step": 7546 + }, + { + "epoch": 2.5202871931875106, + "grad_norm": 0.31210370871376003, + "learning_rate": 7.57724037314419e-07, + "loss": 0.0157, + "step": 7547 + }, + { + "epoch": 2.520621138754383, + "grad_norm": 0.3066968391693989, + "learning_rate": 7.566958151437743e-07, + "loss": 0.0146, + "step": 7548 + }, + { + "epoch": 2.5209550843212556, + "grad_norm": 0.299597242559998, + "learning_rate": 7.556682339741911e-07, + "loss": 0.0159, + "step": 7549 + }, + { + "epoch": 2.5212890298881283, + "grad_norm": 0.21504881144512858, + "learning_rate": 7.546412939608955e-07, + "loss": 0.0096, + "step": 7550 + }, + { + "epoch": 2.521622975455001, + "grad_norm": 0.46879823634392637, + "learning_rate": 7.5361499525902e-07, + "loss": 0.0323, + "step": 7551 + }, + { + "epoch": 2.5219569210218733, + "grad_norm": 0.22235446846648538, + "learning_rate": 7.525893380235988e-07, + "loss": 0.0082, + "step": 7552 + }, + { + "epoch": 2.522290866588746, + "grad_norm": 0.32767128932311534, + "learning_rate": 7.515643224095709e-07, + "loss": 0.0212, + "step": 7553 + }, + { + "epoch": 2.5226248121556187, + "grad_norm": 0.3528000327967212, + "learning_rate": 7.505399485717746e-07, + "loss": 0.0254, + "step": 7554 + }, + { + "epoch": 2.522958757722491, + "grad_norm": 0.2639725899059416, + "learning_rate": 7.495162166649561e-07, + "loss": 0.0125, + "step": 7555 + }, + { + "epoch": 2.5232927032893637, + "grad_norm": 0.3484569359303812, + "learning_rate": 7.484931268437595e-07, + "loss": 0.0127, + "step": 7556 + }, + { + "epoch": 2.5236266488562364, + "grad_norm": 0.21416052591129747, + "learning_rate": 7.474706792627362e-07, + "loss": 0.0139, + "step": 7557 + }, + { + "epoch": 2.523960594423109, + "grad_norm": 0.2237026002440248, + "learning_rate": 7.464488740763387e-07, + "loss": 0.01, + "step": 7558 + }, + { + "epoch": 2.524294539989982, + "grad_norm": 0.30163527901392606, + "learning_rate": 7.454277114389241e-07, + "loss": 0.0119, + "step": 7559 + }, + { + "epoch": 2.524628485556854, + "grad_norm": 0.3235806606250762, + "learning_rate": 7.444071915047479e-07, + "loss": 0.0148, + "step": 7560 + }, + { + "epoch": 2.524962431123727, + "grad_norm": 0.2672013720760013, + "learning_rate": 7.433873144279751e-07, + "loss": 0.0124, + "step": 7561 + }, + { + "epoch": 2.5252963766905996, + "grad_norm": 0.4594222545439994, + "learning_rate": 7.42368080362667e-07, + "loss": 0.0211, + "step": 7562 + }, + { + "epoch": 2.525630322257472, + "grad_norm": 0.2933518960107444, + "learning_rate": 7.413494894627926e-07, + "loss": 0.0135, + "step": 7563 + }, + { + "epoch": 2.5259642678243446, + "grad_norm": 0.26549948183498945, + "learning_rate": 7.403315418822215e-07, + "loss": 0.0127, + "step": 7564 + }, + { + "epoch": 2.5262982133912173, + "grad_norm": 0.29135788599720286, + "learning_rate": 7.393142377747287e-07, + "loss": 0.0143, + "step": 7565 + }, + { + "epoch": 2.5266321589580896, + "grad_norm": 0.2077255763095782, + "learning_rate": 7.382975772939866e-07, + "loss": 0.0088, + "step": 7566 + }, + { + "epoch": 2.5269661045249623, + "grad_norm": 0.29887337214834014, + "learning_rate": 7.372815605935763e-07, + "loss": 0.0193, + "step": 7567 + }, + { + "epoch": 2.527300050091835, + "grad_norm": 0.29036737874299484, + "learning_rate": 7.362661878269772e-07, + "loss": 0.0162, + "step": 7568 + }, + { + "epoch": 2.5276339956587077, + "grad_norm": 0.28973666902738576, + "learning_rate": 7.352514591475746e-07, + "loss": 0.0156, + "step": 7569 + }, + { + "epoch": 2.5279679412255804, + "grad_norm": 0.5974911089777487, + "learning_rate": 7.342373747086557e-07, + "loss": 0.0222, + "step": 7570 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.2353122906771022, + "learning_rate": 7.332239346634079e-07, + "loss": 0.0081, + "step": 7571 + }, + { + "epoch": 2.5286358323593254, + "grad_norm": 0.27092979239073095, + "learning_rate": 7.322111391649261e-07, + "loss": 0.0155, + "step": 7572 + }, + { + "epoch": 2.528969777926198, + "grad_norm": 0.262717083733501, + "learning_rate": 7.311989883662018e-07, + "loss": 0.0167, + "step": 7573 + }, + { + "epoch": 2.5293037234930704, + "grad_norm": 0.5500810831817916, + "learning_rate": 7.301874824201349e-07, + "loss": 0.0178, + "step": 7574 + }, + { + "epoch": 2.529637669059943, + "grad_norm": 0.31262105886576164, + "learning_rate": 7.29176621479522e-07, + "loss": 0.0135, + "step": 7575 + }, + { + "epoch": 2.529971614626816, + "grad_norm": 0.3606297777899301, + "learning_rate": 7.2816640569707e-07, + "loss": 0.0207, + "step": 7576 + }, + { + "epoch": 2.5303055601936886, + "grad_norm": 0.2674616062185029, + "learning_rate": 7.271568352253804e-07, + "loss": 0.0112, + "step": 7577 + }, + { + "epoch": 2.5306395057605613, + "grad_norm": 0.3352255147195767, + "learning_rate": 7.261479102169627e-07, + "loss": 0.0131, + "step": 7578 + }, + { + "epoch": 2.5309734513274336, + "grad_norm": 0.24186772682285884, + "learning_rate": 7.251396308242259e-07, + "loss": 0.0077, + "step": 7579 + }, + { + "epoch": 2.5313073968943063, + "grad_norm": 0.2645461840386459, + "learning_rate": 7.241319971994831e-07, + "loss": 0.0123, + "step": 7580 + }, + { + "epoch": 2.531641342461179, + "grad_norm": 0.1929282593891482, + "learning_rate": 7.231250094949472e-07, + "loss": 0.0076, + "step": 7581 + }, + { + "epoch": 2.5319752880280513, + "grad_norm": 0.36436333781062513, + "learning_rate": 7.221186678627389e-07, + "loss": 0.0164, + "step": 7582 + }, + { + "epoch": 2.532309233594924, + "grad_norm": 0.3421473823759679, + "learning_rate": 7.211129724548754e-07, + "loss": 0.0193, + "step": 7583 + }, + { + "epoch": 2.5326431791617967, + "grad_norm": 0.2593095570302387, + "learning_rate": 7.201079234232805e-07, + "loss": 0.0116, + "step": 7584 + }, + { + "epoch": 2.532977124728669, + "grad_norm": 0.28797119661021686, + "learning_rate": 7.191035209197772e-07, + "loss": 0.0164, + "step": 7585 + }, + { + "epoch": 2.5333110702955417, + "grad_norm": 0.3690058847072772, + "learning_rate": 7.180997650960936e-07, + "loss": 0.0241, + "step": 7586 + }, + { + "epoch": 2.5336450158624144, + "grad_norm": 0.3042760633825699, + "learning_rate": 7.170966561038561e-07, + "loss": 0.0159, + "step": 7587 + }, + { + "epoch": 2.533978961429287, + "grad_norm": 0.3038722096925673, + "learning_rate": 7.160941940946009e-07, + "loss": 0.0136, + "step": 7588 + }, + { + "epoch": 2.53431290699616, + "grad_norm": 0.36788491784806165, + "learning_rate": 7.150923792197579e-07, + "loss": 0.0184, + "step": 7589 + }, + { + "epoch": 2.534646852563032, + "grad_norm": 0.26606150918988686, + "learning_rate": 7.140912116306648e-07, + "loss": 0.009, + "step": 7590 + }, + { + "epoch": 2.534980798129905, + "grad_norm": 0.3430438676656165, + "learning_rate": 7.130906914785585e-07, + "loss": 0.0113, + "step": 7591 + }, + { + "epoch": 2.5353147436967776, + "grad_norm": 0.27256466236182747, + "learning_rate": 7.120908189145798e-07, + "loss": 0.0107, + "step": 7592 + }, + { + "epoch": 2.53564868926365, + "grad_norm": 0.28485656773511575, + "learning_rate": 7.110915940897722e-07, + "loss": 0.0095, + "step": 7593 + }, + { + "epoch": 2.5359826348305226, + "grad_norm": 0.3147528853222281, + "learning_rate": 7.100930171550785e-07, + "loss": 0.0186, + "step": 7594 + }, + { + "epoch": 2.5363165803973953, + "grad_norm": 0.30007872100734273, + "learning_rate": 7.090950882613479e-07, + "loss": 0.0131, + "step": 7595 + }, + { + "epoch": 2.536650525964268, + "grad_norm": 0.3003483182019013, + "learning_rate": 7.08097807559327e-07, + "loss": 0.0141, + "step": 7596 + }, + { + "epoch": 2.5369844715311403, + "grad_norm": 0.27311366686798244, + "learning_rate": 7.071011751996687e-07, + "loss": 0.0093, + "step": 7597 + }, + { + "epoch": 2.537318417098013, + "grad_norm": 0.33483066735765116, + "learning_rate": 7.061051913329231e-07, + "loss": 0.0219, + "step": 7598 + }, + { + "epoch": 2.5376523626648857, + "grad_norm": 0.27756377119210085, + "learning_rate": 7.051098561095493e-07, + "loss": 0.0129, + "step": 7599 + }, + { + "epoch": 2.5379863082317584, + "grad_norm": 0.2838138355904635, + "learning_rate": 7.041151696799014e-07, + "loss": 0.013, + "step": 7600 + }, + { + "epoch": 2.5383202537986307, + "grad_norm": 0.2870240267803289, + "learning_rate": 7.031211321942405e-07, + "loss": 0.0124, + "step": 7601 + }, + { + "epoch": 2.5386541993655034, + "grad_norm": 0.22704462870237718, + "learning_rate": 7.021277438027258e-07, + "loss": 0.0104, + "step": 7602 + }, + { + "epoch": 2.538988144932376, + "grad_norm": 0.43760637475478004, + "learning_rate": 7.011350046554227e-07, + "loss": 0.0151, + "step": 7603 + }, + { + "epoch": 2.5393220904992484, + "grad_norm": 0.32335918948087855, + "learning_rate": 7.001429149022915e-07, + "loss": 0.0196, + "step": 7604 + }, + { + "epoch": 2.539656036066121, + "grad_norm": 0.24849127560724107, + "learning_rate": 6.991514746932048e-07, + "loss": 0.0093, + "step": 7605 + }, + { + "epoch": 2.539989981632994, + "grad_norm": 0.289605915782507, + "learning_rate": 6.981606841779281e-07, + "loss": 0.0124, + "step": 7606 + }, + { + "epoch": 2.5403239271998665, + "grad_norm": 0.2405093080198529, + "learning_rate": 6.971705435061333e-07, + "loss": 0.0093, + "step": 7607 + }, + { + "epoch": 2.5406578727667393, + "grad_norm": 0.365772313700415, + "learning_rate": 6.96181052827391e-07, + "loss": 0.0189, + "step": 7608 + }, + { + "epoch": 2.5409918183336115, + "grad_norm": 0.29299179302693745, + "learning_rate": 6.951922122911775e-07, + "loss": 0.0129, + "step": 7609 + }, + { + "epoch": 2.5413257639004843, + "grad_norm": 0.21477693270870274, + "learning_rate": 6.942040220468654e-07, + "loss": 0.0098, + "step": 7610 + }, + { + "epoch": 2.541659709467357, + "grad_norm": 0.2627015365413541, + "learning_rate": 6.932164822437371e-07, + "loss": 0.0092, + "step": 7611 + }, + { + "epoch": 2.5419936550342292, + "grad_norm": 0.3053830771654161, + "learning_rate": 6.922295930309691e-07, + "loss": 0.0128, + "step": 7612 + }, + { + "epoch": 2.542327600601102, + "grad_norm": 0.31507398162041766, + "learning_rate": 6.912433545576446e-07, + "loss": 0.013, + "step": 7613 + }, + { + "epoch": 2.5426615461679747, + "grad_norm": 0.35287209472836883, + "learning_rate": 6.90257766972744e-07, + "loss": 0.0209, + "step": 7614 + }, + { + "epoch": 2.542995491734847, + "grad_norm": 0.23745912593933097, + "learning_rate": 6.892728304251544e-07, + "loss": 0.009, + "step": 7615 + }, + { + "epoch": 2.5433294373017197, + "grad_norm": 0.2910545767108932, + "learning_rate": 6.8828854506366e-07, + "loss": 0.0133, + "step": 7616 + }, + { + "epoch": 2.5436633828685924, + "grad_norm": 0.2950379758085675, + "learning_rate": 6.873049110369495e-07, + "loss": 0.0115, + "step": 7617 + }, + { + "epoch": 2.543997328435465, + "grad_norm": 0.2464590801362038, + "learning_rate": 6.863219284936135e-07, + "loss": 0.0109, + "step": 7618 + }, + { + "epoch": 2.544331274002338, + "grad_norm": 0.3096588368307441, + "learning_rate": 6.853395975821414e-07, + "loss": 0.0144, + "step": 7619 + }, + { + "epoch": 2.54466521956921, + "grad_norm": 0.2714670629343454, + "learning_rate": 6.843579184509275e-07, + "loss": 0.0107, + "step": 7620 + }, + { + "epoch": 2.544999165136083, + "grad_norm": 0.19873774869983954, + "learning_rate": 6.833768912482636e-07, + "loss": 0.0088, + "step": 7621 + }, + { + "epoch": 2.5453331107029555, + "grad_norm": 0.2732883874401984, + "learning_rate": 6.823965161223472e-07, + "loss": 0.0156, + "step": 7622 + }, + { + "epoch": 2.545667056269828, + "grad_norm": 0.2141874848917232, + "learning_rate": 6.814167932212751e-07, + "loss": 0.0074, + "step": 7623 + }, + { + "epoch": 2.5460010018367005, + "grad_norm": 0.2861711486348828, + "learning_rate": 6.804377226930469e-07, + "loss": 0.0179, + "step": 7624 + }, + { + "epoch": 2.5463349474035732, + "grad_norm": 0.24788504031554284, + "learning_rate": 6.794593046855613e-07, + "loss": 0.0093, + "step": 7625 + }, + { + "epoch": 2.546668892970446, + "grad_norm": 0.20039467144689546, + "learning_rate": 6.784815393466215e-07, + "loss": 0.0092, + "step": 7626 + }, + { + "epoch": 2.5470028385373187, + "grad_norm": 0.2830293777900935, + "learning_rate": 6.775044268239278e-07, + "loss": 0.0089, + "step": 7627 + }, + { + "epoch": 2.547336784104191, + "grad_norm": 0.31463979243652335, + "learning_rate": 6.765279672650865e-07, + "loss": 0.0167, + "step": 7628 + }, + { + "epoch": 2.5476707296710637, + "grad_norm": 0.2826136080760445, + "learning_rate": 6.75552160817603e-07, + "loss": 0.0106, + "step": 7629 + }, + { + "epoch": 2.5480046752379364, + "grad_norm": 0.27206352461557476, + "learning_rate": 6.745770076288854e-07, + "loss": 0.0108, + "step": 7630 + }, + { + "epoch": 2.5483386208048087, + "grad_norm": 0.29022417323012484, + "learning_rate": 6.736025078462399e-07, + "loss": 0.013, + "step": 7631 + }, + { + "epoch": 2.5486725663716814, + "grad_norm": 0.22705775467584716, + "learning_rate": 6.726286616168781e-07, + "loss": 0.0095, + "step": 7632 + }, + { + "epoch": 2.549006511938554, + "grad_norm": 0.2822732800886761, + "learning_rate": 6.716554690879085e-07, + "loss": 0.0148, + "step": 7633 + }, + { + "epoch": 2.5493404575054264, + "grad_norm": 0.34597594823694056, + "learning_rate": 6.706829304063467e-07, + "loss": 0.0193, + "step": 7634 + }, + { + "epoch": 2.549674403072299, + "grad_norm": 0.27436227871033536, + "learning_rate": 6.697110457191031e-07, + "loss": 0.0156, + "step": 7635 + }, + { + "epoch": 2.550008348639172, + "grad_norm": 0.29438849725969307, + "learning_rate": 6.687398151729951e-07, + "loss": 0.018, + "step": 7636 + }, + { + "epoch": 2.5503422942060445, + "grad_norm": 0.28789897424501965, + "learning_rate": 6.677692389147355e-07, + "loss": 0.0123, + "step": 7637 + }, + { + "epoch": 2.5506762397729172, + "grad_norm": 0.2913385282077587, + "learning_rate": 6.667993170909437e-07, + "loss": 0.0119, + "step": 7638 + }, + { + "epoch": 2.5510101853397895, + "grad_norm": 0.22954873855808372, + "learning_rate": 6.658300498481363e-07, + "loss": 0.0104, + "step": 7639 + }, + { + "epoch": 2.5513441309066622, + "grad_norm": 0.4022487794590685, + "learning_rate": 6.648614373327328e-07, + "loss": 0.0304, + "step": 7640 + }, + { + "epoch": 2.551678076473535, + "grad_norm": 0.31154895901564816, + "learning_rate": 6.638934796910545e-07, + "loss": 0.0108, + "step": 7641 + }, + { + "epoch": 2.5520120220404072, + "grad_norm": 0.32510097482021605, + "learning_rate": 6.629261770693213e-07, + "loss": 0.0149, + "step": 7642 + }, + { + "epoch": 2.55234596760728, + "grad_norm": 0.3928796312435496, + "learning_rate": 6.619595296136577e-07, + "loss": 0.0131, + "step": 7643 + }, + { + "epoch": 2.5526799131741527, + "grad_norm": 0.26520098808510795, + "learning_rate": 6.609935374700849e-07, + "loss": 0.0104, + "step": 7644 + }, + { + "epoch": 2.5530138587410254, + "grad_norm": 0.28194036719044246, + "learning_rate": 6.600282007845277e-07, + "loss": 0.0122, + "step": 7645 + }, + { + "epoch": 2.5533478043078977, + "grad_norm": 0.23465194951929508, + "learning_rate": 6.590635197028128e-07, + "loss": 0.0084, + "step": 7646 + }, + { + "epoch": 2.5536817498747704, + "grad_norm": 0.2654408510525354, + "learning_rate": 6.580994943706675e-07, + "loss": 0.0124, + "step": 7647 + }, + { + "epoch": 2.554015695441643, + "grad_norm": 0.23877218559479607, + "learning_rate": 6.571361249337161e-07, + "loss": 0.0101, + "step": 7648 + }, + { + "epoch": 2.554349641008516, + "grad_norm": 0.2791182314163454, + "learning_rate": 6.561734115374901e-07, + "loss": 0.0113, + "step": 7649 + }, + { + "epoch": 2.554683586575388, + "grad_norm": 0.24895617590823962, + "learning_rate": 6.552113543274158e-07, + "loss": 0.0101, + "step": 7650 + }, + { + "epoch": 2.555017532142261, + "grad_norm": 0.23747428376138735, + "learning_rate": 6.54249953448825e-07, + "loss": 0.0097, + "step": 7651 + }, + { + "epoch": 2.5553514777091335, + "grad_norm": 0.28487369253520783, + "learning_rate": 6.532892090469484e-07, + "loss": 0.0176, + "step": 7652 + }, + { + "epoch": 2.555685423276006, + "grad_norm": 0.3385339732444899, + "learning_rate": 6.52329121266918e-07, + "loss": 0.0194, + "step": 7653 + }, + { + "epoch": 2.5560193688428785, + "grad_norm": 0.3390754635806754, + "learning_rate": 6.513696902537653e-07, + "loss": 0.018, + "step": 7654 + }, + { + "epoch": 2.556353314409751, + "grad_norm": 0.36453070238608143, + "learning_rate": 6.504109161524257e-07, + "loss": 0.0135, + "step": 7655 + }, + { + "epoch": 2.556687259976624, + "grad_norm": 0.2416387585270207, + "learning_rate": 6.494527991077304e-07, + "loss": 0.0102, + "step": 7656 + }, + { + "epoch": 2.5570212055434967, + "grad_norm": 0.3112773119366655, + "learning_rate": 6.484953392644161e-07, + "loss": 0.0113, + "step": 7657 + }, + { + "epoch": 2.557355151110369, + "grad_norm": 0.2958964808698193, + "learning_rate": 6.475385367671183e-07, + "loss": 0.0158, + "step": 7658 + }, + { + "epoch": 2.5576890966772416, + "grad_norm": 0.3434098314001665, + "learning_rate": 6.465823917603742e-07, + "loss": 0.0199, + "step": 7659 + }, + { + "epoch": 2.5580230422441144, + "grad_norm": 0.30371730468915425, + "learning_rate": 6.456269043886182e-07, + "loss": 0.0133, + "step": 7660 + }, + { + "epoch": 2.5583569878109866, + "grad_norm": 0.2606506237596131, + "learning_rate": 6.446720747961904e-07, + "loss": 0.0104, + "step": 7661 + }, + { + "epoch": 2.5586909333778594, + "grad_norm": 0.22068375526277711, + "learning_rate": 6.437179031273272e-07, + "loss": 0.0114, + "step": 7662 + }, + { + "epoch": 2.559024878944732, + "grad_norm": 0.271297923643344, + "learning_rate": 6.427643895261687e-07, + "loss": 0.0147, + "step": 7663 + }, + { + "epoch": 2.5593588245116043, + "grad_norm": 0.24599932657434925, + "learning_rate": 6.418115341367543e-07, + "loss": 0.0099, + "step": 7664 + }, + { + "epoch": 2.559692770078477, + "grad_norm": 0.2343290015702136, + "learning_rate": 6.408593371030231e-07, + "loss": 0.0092, + "step": 7665 + }, + { + "epoch": 2.56002671564535, + "grad_norm": 0.3017282925307235, + "learning_rate": 6.399077985688168e-07, + "loss": 0.0118, + "step": 7666 + }, + { + "epoch": 2.5603606612122225, + "grad_norm": 0.24802952333676967, + "learning_rate": 6.389569186778754e-07, + "loss": 0.0118, + "step": 7667 + }, + { + "epoch": 2.560694606779095, + "grad_norm": 0.19669040049922745, + "learning_rate": 6.38006697573842e-07, + "loss": 0.0088, + "step": 7668 + }, + { + "epoch": 2.5610285523459675, + "grad_norm": 0.31336593097840215, + "learning_rate": 6.370571354002553e-07, + "loss": 0.0146, + "step": 7669 + }, + { + "epoch": 2.56136249791284, + "grad_norm": 0.2375684635525042, + "learning_rate": 6.361082323005624e-07, + "loss": 0.0105, + "step": 7670 + }, + { + "epoch": 2.561696443479713, + "grad_norm": 0.3266110919657544, + "learning_rate": 6.351599884181037e-07, + "loss": 0.0144, + "step": 7671 + }, + { + "epoch": 2.562030389046585, + "grad_norm": 0.3165245167755812, + "learning_rate": 6.342124038961234e-07, + "loss": 0.0101, + "step": 7672 + }, + { + "epoch": 2.562364334613458, + "grad_norm": 0.24043706621919486, + "learning_rate": 6.332654788777642e-07, + "loss": 0.0112, + "step": 7673 + }, + { + "epoch": 2.5626982801803306, + "grad_norm": 0.3571244729415984, + "learning_rate": 6.323192135060713e-07, + "loss": 0.0168, + "step": 7674 + }, + { + "epoch": 2.5630322257472034, + "grad_norm": 0.36735819824508675, + "learning_rate": 6.31373607923989e-07, + "loss": 0.0211, + "step": 7675 + }, + { + "epoch": 2.563366171314076, + "grad_norm": 0.23117631065589533, + "learning_rate": 6.304286622743627e-07, + "loss": 0.0082, + "step": 7676 + }, + { + "epoch": 2.5637001168809483, + "grad_norm": 0.25786461123132604, + "learning_rate": 6.294843766999364e-07, + "loss": 0.013, + "step": 7677 + }, + { + "epoch": 2.564034062447821, + "grad_norm": 0.21760487266740203, + "learning_rate": 6.285407513433572e-07, + "loss": 0.0089, + "step": 7678 + }, + { + "epoch": 2.564368008014694, + "grad_norm": 0.295119637137543, + "learning_rate": 6.275977863471683e-07, + "loss": 0.0118, + "step": 7679 + }, + { + "epoch": 2.564701953581566, + "grad_norm": 0.4136616325040832, + "learning_rate": 6.266554818538173e-07, + "loss": 0.0167, + "step": 7680 + }, + { + "epoch": 2.5650358991484388, + "grad_norm": 0.2318301327249102, + "learning_rate": 6.257138380056505e-07, + "loss": 0.0116, + "step": 7681 + }, + { + "epoch": 2.5653698447153115, + "grad_norm": 0.2987969519182141, + "learning_rate": 6.24772854944915e-07, + "loss": 0.0144, + "step": 7682 + }, + { + "epoch": 2.5657037902821838, + "grad_norm": 0.2911329749864612, + "learning_rate": 6.238325328137552e-07, + "loss": 0.0139, + "step": 7683 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 0.26056387721801183, + "learning_rate": 6.228928717542205e-07, + "loss": 0.0131, + "step": 7684 + }, + { + "epoch": 2.566371681415929, + "grad_norm": 0.23239858572829802, + "learning_rate": 6.219538719082546e-07, + "loss": 0.0116, + "step": 7685 + }, + { + "epoch": 2.566705626982802, + "grad_norm": 0.24117066736489734, + "learning_rate": 6.210155334177064e-07, + "loss": 0.0088, + "step": 7686 + }, + { + "epoch": 2.5670395725496746, + "grad_norm": 0.24887119804868804, + "learning_rate": 6.200778564243237e-07, + "loss": 0.011, + "step": 7687 + }, + { + "epoch": 2.567373518116547, + "grad_norm": 0.30875433406908454, + "learning_rate": 6.19140841069752e-07, + "loss": 0.0144, + "step": 7688 + }, + { + "epoch": 2.5677074636834196, + "grad_norm": 0.22436605657133116, + "learning_rate": 6.1820448749554e-07, + "loss": 0.0093, + "step": 7689 + }, + { + "epoch": 2.5680414092502923, + "grad_norm": 0.3271448836658194, + "learning_rate": 6.172687958431328e-07, + "loss": 0.0134, + "step": 7690 + }, + { + "epoch": 2.5683753548171646, + "grad_norm": 0.2576727611285128, + "learning_rate": 6.163337662538793e-07, + "loss": 0.0161, + "step": 7691 + }, + { + "epoch": 2.5687093003840373, + "grad_norm": 0.29886937485751547, + "learning_rate": 6.153993988690266e-07, + "loss": 0.0201, + "step": 7692 + }, + { + "epoch": 2.56904324595091, + "grad_norm": 0.3063418935809696, + "learning_rate": 6.144656938297227e-07, + "loss": 0.0167, + "step": 7693 + }, + { + "epoch": 2.5693771915177828, + "grad_norm": 0.32364416951354497, + "learning_rate": 6.135326512770124e-07, + "loss": 0.0126, + "step": 7694 + }, + { + "epoch": 2.569711137084655, + "grad_norm": 0.2810622315265582, + "learning_rate": 6.126002713518453e-07, + "loss": 0.0164, + "step": 7695 + }, + { + "epoch": 2.5700450826515278, + "grad_norm": 0.22022296796365828, + "learning_rate": 6.116685541950663e-07, + "loss": 0.008, + "step": 7696 + }, + { + "epoch": 2.5703790282184005, + "grad_norm": 0.39942266286389627, + "learning_rate": 6.107374999474236e-07, + "loss": 0.0249, + "step": 7697 + }, + { + "epoch": 2.570712973785273, + "grad_norm": 0.21230024467934458, + "learning_rate": 6.098071087495621e-07, + "loss": 0.0106, + "step": 7698 + }, + { + "epoch": 2.5710469193521455, + "grad_norm": 0.3109824929172941, + "learning_rate": 6.088773807420312e-07, + "loss": 0.0141, + "step": 7699 + }, + { + "epoch": 2.571380864919018, + "grad_norm": 0.3816087620716031, + "learning_rate": 6.07948316065275e-07, + "loss": 0.0138, + "step": 7700 + }, + { + "epoch": 2.571714810485891, + "grad_norm": 0.2952428503471391, + "learning_rate": 6.070199148596411e-07, + "loss": 0.0137, + "step": 7701 + }, + { + "epoch": 2.572048756052763, + "grad_norm": 0.3007554892639507, + "learning_rate": 6.060921772653738e-07, + "loss": 0.0108, + "step": 7702 + }, + { + "epoch": 2.572382701619636, + "grad_norm": 0.22970382919460358, + "learning_rate": 6.051651034226208e-07, + "loss": 0.013, + "step": 7703 + }, + { + "epoch": 2.5727166471865086, + "grad_norm": 0.23843549246395956, + "learning_rate": 6.042386934714245e-07, + "loss": 0.012, + "step": 7704 + }, + { + "epoch": 2.5730505927533813, + "grad_norm": 0.29578000624515915, + "learning_rate": 6.03312947551734e-07, + "loss": 0.0152, + "step": 7705 + }, + { + "epoch": 2.573384538320254, + "grad_norm": 0.3037885960494729, + "learning_rate": 6.02387865803391e-07, + "loss": 0.0157, + "step": 7706 + }, + { + "epoch": 2.5737184838871263, + "grad_norm": 0.3112733567743452, + "learning_rate": 6.014634483661419e-07, + "loss": 0.0124, + "step": 7707 + }, + { + "epoch": 2.574052429453999, + "grad_norm": 0.2704221580142933, + "learning_rate": 6.005396953796294e-07, + "loss": 0.0144, + "step": 7708 + }, + { + "epoch": 2.5743863750208718, + "grad_norm": 0.2189766287771203, + "learning_rate": 5.996166069833976e-07, + "loss": 0.0094, + "step": 7709 + }, + { + "epoch": 2.574720320587744, + "grad_norm": 0.3106542306978342, + "learning_rate": 5.986941833168913e-07, + "loss": 0.0149, + "step": 7710 + }, + { + "epoch": 2.5750542661546167, + "grad_norm": 0.23564296655504463, + "learning_rate": 5.97772424519451e-07, + "loss": 0.0154, + "step": 7711 + }, + { + "epoch": 2.5753882117214895, + "grad_norm": 0.30775047343083617, + "learning_rate": 5.96851330730322e-07, + "loss": 0.0129, + "step": 7712 + }, + { + "epoch": 2.5757221572883617, + "grad_norm": 0.23715489706835652, + "learning_rate": 5.959309020886433e-07, + "loss": 0.0087, + "step": 7713 + }, + { + "epoch": 2.5760561028552345, + "grad_norm": 0.25233445364544266, + "learning_rate": 5.950111387334584e-07, + "loss": 0.0119, + "step": 7714 + }, + { + "epoch": 2.576390048422107, + "grad_norm": 0.3186240092501902, + "learning_rate": 5.940920408037081e-07, + "loss": 0.013, + "step": 7715 + }, + { + "epoch": 2.57672399398898, + "grad_norm": 0.2728406137670835, + "learning_rate": 5.93173608438234e-07, + "loss": 0.0125, + "step": 7716 + }, + { + "epoch": 2.5770579395558526, + "grad_norm": 0.25408731449490213, + "learning_rate": 5.92255841775774e-07, + "loss": 0.0113, + "step": 7717 + }, + { + "epoch": 2.577391885122725, + "grad_norm": 0.3087394440844738, + "learning_rate": 5.913387409549693e-07, + "loss": 0.014, + "step": 7718 + }, + { + "epoch": 2.5777258306895976, + "grad_norm": 0.27728454768206234, + "learning_rate": 5.904223061143577e-07, + "loss": 0.016, + "step": 7719 + }, + { + "epoch": 2.5780597762564703, + "grad_norm": 0.2976316052310394, + "learning_rate": 5.895065373923781e-07, + "loss": 0.0142, + "step": 7720 + }, + { + "epoch": 2.5783937218233426, + "grad_norm": 0.28446300269637065, + "learning_rate": 5.885914349273664e-07, + "loss": 0.0142, + "step": 7721 + }, + { + "epoch": 2.5787276673902153, + "grad_norm": 0.25315372764163047, + "learning_rate": 5.876769988575631e-07, + "loss": 0.0095, + "step": 7722 + }, + { + "epoch": 2.579061612957088, + "grad_norm": 0.32400592565478337, + "learning_rate": 5.867632293211011e-07, + "loss": 0.0187, + "step": 7723 + }, + { + "epoch": 2.5793955585239607, + "grad_norm": 0.30809121386222355, + "learning_rate": 5.85850126456019e-07, + "loss": 0.0136, + "step": 7724 + }, + { + "epoch": 2.5797295040908335, + "grad_norm": 0.2820419405285735, + "learning_rate": 5.84937690400249e-07, + "loss": 0.0086, + "step": 7725 + }, + { + "epoch": 2.5800634496577057, + "grad_norm": 0.30018832499031356, + "learning_rate": 5.840259212916277e-07, + "loss": 0.011, + "step": 7726 + }, + { + "epoch": 2.5803973952245784, + "grad_norm": 0.29804342774190334, + "learning_rate": 5.831148192678853e-07, + "loss": 0.0131, + "step": 7727 + }, + { + "epoch": 2.580731340791451, + "grad_norm": 0.21314453819518234, + "learning_rate": 5.822043844666586e-07, + "loss": 0.0079, + "step": 7728 + }, + { + "epoch": 2.5810652863583234, + "grad_norm": 0.3536771226099465, + "learning_rate": 5.812946170254763e-07, + "loss": 0.0219, + "step": 7729 + }, + { + "epoch": 2.581399231925196, + "grad_norm": 0.3179741117100696, + "learning_rate": 5.803855170817718e-07, + "loss": 0.0125, + "step": 7730 + }, + { + "epoch": 2.581733177492069, + "grad_norm": 0.33287077989551483, + "learning_rate": 5.794770847728736e-07, + "loss": 0.015, + "step": 7731 + }, + { + "epoch": 2.582067123058941, + "grad_norm": 0.24447350410250296, + "learning_rate": 5.785693202360121e-07, + "loss": 0.0085, + "step": 7732 + }, + { + "epoch": 2.582401068625814, + "grad_norm": 0.24002018090808191, + "learning_rate": 5.776622236083146e-07, + "loss": 0.0078, + "step": 7733 + }, + { + "epoch": 2.5827350141926866, + "grad_norm": 0.21165752468060922, + "learning_rate": 5.767557950268099e-07, + "loss": 0.0092, + "step": 7734 + }, + { + "epoch": 2.5830689597595593, + "grad_norm": 0.3562783324191469, + "learning_rate": 5.758500346284252e-07, + "loss": 0.016, + "step": 7735 + }, + { + "epoch": 2.583402905326432, + "grad_norm": 0.3204976801978708, + "learning_rate": 5.749449425499843e-07, + "loss": 0.0202, + "step": 7736 + }, + { + "epoch": 2.5837368508933043, + "grad_norm": 0.263270083553635, + "learning_rate": 5.740405189282134e-07, + "loss": 0.0157, + "step": 7737 + }, + { + "epoch": 2.584070796460177, + "grad_norm": 0.27260822303976673, + "learning_rate": 5.73136763899737e-07, + "loss": 0.01, + "step": 7738 + }, + { + "epoch": 2.5844047420270497, + "grad_norm": 0.25270345589887516, + "learning_rate": 5.722336776010756e-07, + "loss": 0.0106, + "step": 7739 + }, + { + "epoch": 2.584738687593922, + "grad_norm": 0.34004623097511627, + "learning_rate": 5.713312601686533e-07, + "loss": 0.0199, + "step": 7740 + }, + { + "epoch": 2.5850726331607947, + "grad_norm": 0.32386090811729507, + "learning_rate": 5.704295117387904e-07, + "loss": 0.0165, + "step": 7741 + }, + { + "epoch": 2.5854065787276674, + "grad_norm": 0.3082270707226719, + "learning_rate": 5.695284324477052e-07, + "loss": 0.0143, + "step": 7742 + }, + { + "epoch": 2.58574052429454, + "grad_norm": 0.25192242369583967, + "learning_rate": 5.686280224315189e-07, + "loss": 0.0121, + "step": 7743 + }, + { + "epoch": 2.5860744698614124, + "grad_norm": 0.2573272403479741, + "learning_rate": 5.677282818262464e-07, + "loss": 0.0112, + "step": 7744 + }, + { + "epoch": 2.586408415428285, + "grad_norm": 0.29357437238210815, + "learning_rate": 5.668292107678048e-07, + "loss": 0.0132, + "step": 7745 + }, + { + "epoch": 2.586742360995158, + "grad_norm": 0.3752654176831875, + "learning_rate": 5.659308093920101e-07, + "loss": 0.0177, + "step": 7746 + }, + { + "epoch": 2.5870763065620306, + "grad_norm": 0.2861699049823195, + "learning_rate": 5.650330778345776e-07, + "loss": 0.013, + "step": 7747 + }, + { + "epoch": 2.587410252128903, + "grad_norm": 0.23728377208801518, + "learning_rate": 5.641360162311171e-07, + "loss": 0.0091, + "step": 7748 + }, + { + "epoch": 2.5877441976957756, + "grad_norm": 0.2538075352426027, + "learning_rate": 5.632396247171429e-07, + "loss": 0.0134, + "step": 7749 + }, + { + "epoch": 2.5880781432626483, + "grad_norm": 0.2775516067571275, + "learning_rate": 5.623439034280625e-07, + "loss": 0.0127, + "step": 7750 + }, + { + "epoch": 2.5884120888295206, + "grad_norm": 0.3580076647398268, + "learning_rate": 5.614488524991896e-07, + "loss": 0.0189, + "step": 7751 + }, + { + "epoch": 2.5887460343963933, + "grad_norm": 0.2606431332039776, + "learning_rate": 5.605544720657286e-07, + "loss": 0.0103, + "step": 7752 + }, + { + "epoch": 2.589079979963266, + "grad_norm": 0.5082055733961746, + "learning_rate": 5.596607622627887e-07, + "loss": 0.0165, + "step": 7753 + }, + { + "epoch": 2.5894139255301387, + "grad_norm": 0.2675982553742719, + "learning_rate": 5.587677232253725e-07, + "loss": 0.0115, + "step": 7754 + }, + { + "epoch": 2.5897478710970114, + "grad_norm": 0.23943591364195121, + "learning_rate": 5.57875355088387e-07, + "loss": 0.0107, + "step": 7755 + }, + { + "epoch": 2.5900818166638837, + "grad_norm": 0.28025763503549567, + "learning_rate": 5.569836579866316e-07, + "loss": 0.0109, + "step": 7756 + }, + { + "epoch": 2.5904157622307564, + "grad_norm": 0.2860556200199481, + "learning_rate": 5.560926320548105e-07, + "loss": 0.0149, + "step": 7757 + }, + { + "epoch": 2.590749707797629, + "grad_norm": 0.28321692167299, + "learning_rate": 5.552022774275228e-07, + "loss": 0.0111, + "step": 7758 + }, + { + "epoch": 2.5910836533645014, + "grad_norm": 0.2748970840787289, + "learning_rate": 5.543125942392664e-07, + "loss": 0.01, + "step": 7759 + }, + { + "epoch": 2.591417598931374, + "grad_norm": 0.22994941830488969, + "learning_rate": 5.534235826244389e-07, + "loss": 0.0103, + "step": 7760 + }, + { + "epoch": 2.591751544498247, + "grad_norm": 0.35041866367625174, + "learning_rate": 5.525352427173369e-07, + "loss": 0.0193, + "step": 7761 + }, + { + "epoch": 2.592085490065119, + "grad_norm": 0.2863518183261444, + "learning_rate": 5.516475746521527e-07, + "loss": 0.0118, + "step": 7762 + }, + { + "epoch": 2.592419435631992, + "grad_norm": 0.2705942370985263, + "learning_rate": 5.507605785629794e-07, + "loss": 0.0122, + "step": 7763 + }, + { + "epoch": 2.5927533811988646, + "grad_norm": 0.29767577101175474, + "learning_rate": 5.498742545838104e-07, + "loss": 0.0156, + "step": 7764 + }, + { + "epoch": 2.5930873267657373, + "grad_norm": 0.19476089992018275, + "learning_rate": 5.48988602848533e-07, + "loss": 0.008, + "step": 7765 + }, + { + "epoch": 2.59342127233261, + "grad_norm": 0.23204861842288593, + "learning_rate": 5.481036234909365e-07, + "loss": 0.0091, + "step": 7766 + }, + { + "epoch": 2.5937552178994823, + "grad_norm": 0.29781530126581196, + "learning_rate": 5.472193166447065e-07, + "loss": 0.0161, + "step": 7767 + }, + { + "epoch": 2.594089163466355, + "grad_norm": 0.2546827814738975, + "learning_rate": 5.463356824434285e-07, + "loss": 0.0112, + "step": 7768 + }, + { + "epoch": 2.5944231090332277, + "grad_norm": 0.19954995236601833, + "learning_rate": 5.454527210205857e-07, + "loss": 0.0071, + "step": 7769 + }, + { + "epoch": 2.5947570546001, + "grad_norm": 0.28657743196987234, + "learning_rate": 5.445704325095613e-07, + "loss": 0.0167, + "step": 7770 + }, + { + "epoch": 2.5950910001669727, + "grad_norm": 0.32990929435981003, + "learning_rate": 5.436888170436327e-07, + "loss": 0.019, + "step": 7771 + }, + { + "epoch": 2.5954249457338454, + "grad_norm": 0.22321175087720183, + "learning_rate": 5.428078747559806e-07, + "loss": 0.0101, + "step": 7772 + }, + { + "epoch": 2.595758891300718, + "grad_norm": 0.2939906935182841, + "learning_rate": 5.419276057796802e-07, + "loss": 0.012, + "step": 7773 + }, + { + "epoch": 2.596092836867591, + "grad_norm": 0.3560238371429349, + "learning_rate": 5.410480102477067e-07, + "loss": 0.0172, + "step": 7774 + }, + { + "epoch": 2.596426782434463, + "grad_norm": 0.30955404912738377, + "learning_rate": 5.401690882929333e-07, + "loss": 0.0207, + "step": 7775 + }, + { + "epoch": 2.596760728001336, + "grad_norm": 0.2687572631681687, + "learning_rate": 5.392908400481334e-07, + "loss": 0.0121, + "step": 7776 + }, + { + "epoch": 2.5970946735682086, + "grad_norm": 0.24242286721787118, + "learning_rate": 5.384132656459745e-07, + "loss": 0.0116, + "step": 7777 + }, + { + "epoch": 2.597428619135081, + "grad_norm": 0.2903164453331221, + "learning_rate": 5.375363652190257e-07, + "loss": 0.0141, + "step": 7778 + }, + { + "epoch": 2.5977625647019535, + "grad_norm": 0.2947254617095242, + "learning_rate": 5.366601388997522e-07, + "loss": 0.015, + "step": 7779 + }, + { + "epoch": 2.5980965102688263, + "grad_norm": 0.23439835279182605, + "learning_rate": 5.357845868205191e-07, + "loss": 0.0079, + "step": 7780 + }, + { + "epoch": 2.5984304558356985, + "grad_norm": 0.2871025774439091, + "learning_rate": 5.34909709113589e-07, + "loss": 0.0132, + "step": 7781 + }, + { + "epoch": 2.5987644014025713, + "grad_norm": 0.32349870967462496, + "learning_rate": 5.340355059111213e-07, + "loss": 0.0138, + "step": 7782 + }, + { + "epoch": 2.599098346969444, + "grad_norm": 0.2303197504499297, + "learning_rate": 5.331619773451757e-07, + "loss": 0.0104, + "step": 7783 + }, + { + "epoch": 2.5994322925363167, + "grad_norm": 0.2917282176252594, + "learning_rate": 5.32289123547709e-07, + "loss": 0.0152, + "step": 7784 + }, + { + "epoch": 2.5997662381031894, + "grad_norm": 0.29936471310078494, + "learning_rate": 5.314169446505757e-07, + "loss": 0.0138, + "step": 7785 + }, + { + "epoch": 2.6001001836700617, + "grad_norm": 0.2808927535783025, + "learning_rate": 5.305454407855282e-07, + "loss": 0.0136, + "step": 7786 + }, + { + "epoch": 2.6004341292369344, + "grad_norm": 0.30781220110910307, + "learning_rate": 5.296746120842189e-07, + "loss": 0.0166, + "step": 7787 + }, + { + "epoch": 2.600768074803807, + "grad_norm": 0.33708264863124765, + "learning_rate": 5.288044586781955e-07, + "loss": 0.0198, + "step": 7788 + }, + { + "epoch": 2.6011020203706794, + "grad_norm": 0.3525456228701188, + "learning_rate": 5.279349806989054e-07, + "loss": 0.0191, + "step": 7789 + }, + { + "epoch": 2.601435965937552, + "grad_norm": 0.29643886549077286, + "learning_rate": 5.270661782776931e-07, + "loss": 0.0134, + "step": 7790 + }, + { + "epoch": 2.601769911504425, + "grad_norm": 0.3522666220172226, + "learning_rate": 5.26198051545801e-07, + "loss": 0.0207, + "step": 7791 + }, + { + "epoch": 2.6021038570712975, + "grad_norm": 0.2335916004126007, + "learning_rate": 5.253306006343706e-07, + "loss": 0.0104, + "step": 7792 + }, + { + "epoch": 2.60243780263817, + "grad_norm": 0.22011570111638332, + "learning_rate": 5.244638256744422e-07, + "loss": 0.0113, + "step": 7793 + }, + { + "epoch": 2.6027717482050425, + "grad_norm": 0.3209341737954237, + "learning_rate": 5.235977267969489e-07, + "loss": 0.0127, + "step": 7794 + }, + { + "epoch": 2.6031056937719153, + "grad_norm": 0.28920014648819403, + "learning_rate": 5.227323041327281e-07, + "loss": 0.0133, + "step": 7795 + }, + { + "epoch": 2.603439639338788, + "grad_norm": 0.2582740540276249, + "learning_rate": 5.218675578125099e-07, + "loss": 0.0102, + "step": 7796 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.3073350534343159, + "learning_rate": 5.210034879669257e-07, + "loss": 0.0162, + "step": 7797 + }, + { + "epoch": 2.604107530472533, + "grad_norm": 0.28066859886880224, + "learning_rate": 5.201400947265029e-07, + "loss": 0.0129, + "step": 7798 + }, + { + "epoch": 2.6044414760394057, + "grad_norm": 0.3188945910965467, + "learning_rate": 5.192773782216681e-07, + "loss": 0.0155, + "step": 7799 + }, + { + "epoch": 2.604775421606278, + "grad_norm": 0.30526122254461424, + "learning_rate": 5.184153385827434e-07, + "loss": 0.0155, + "step": 7800 + }, + { + "epoch": 2.6051093671731507, + "grad_norm": 0.2701536008517957, + "learning_rate": 5.175539759399518e-07, + "loss": 0.0096, + "step": 7801 + }, + { + "epoch": 2.6054433127400234, + "grad_norm": 0.22318834661855, + "learning_rate": 5.166932904234101e-07, + "loss": 0.0097, + "step": 7802 + }, + { + "epoch": 2.605777258306896, + "grad_norm": 0.2435016455260839, + "learning_rate": 5.158332821631362e-07, + "loss": 0.0109, + "step": 7803 + }, + { + "epoch": 2.606111203873769, + "grad_norm": 0.2918131522531103, + "learning_rate": 5.149739512890445e-07, + "loss": 0.0155, + "step": 7804 + }, + { + "epoch": 2.606445149440641, + "grad_norm": 0.30717006423983095, + "learning_rate": 5.141152979309477e-07, + "loss": 0.016, + "step": 7805 + }, + { + "epoch": 2.606779095007514, + "grad_norm": 0.347405470232238, + "learning_rate": 5.132573222185539e-07, + "loss": 0.017, + "step": 7806 + }, + { + "epoch": 2.6071130405743865, + "grad_norm": 0.2976334048782599, + "learning_rate": 5.124000242814725e-07, + "loss": 0.0158, + "step": 7807 + }, + { + "epoch": 2.607446986141259, + "grad_norm": 0.2869317304551244, + "learning_rate": 5.115434042492057e-07, + "loss": 0.0126, + "step": 7808 + }, + { + "epoch": 2.6077809317081315, + "grad_norm": 0.27989027783297954, + "learning_rate": 5.106874622511576e-07, + "loss": 0.0124, + "step": 7809 + }, + { + "epoch": 2.6081148772750042, + "grad_norm": 0.2866276623557583, + "learning_rate": 5.098321984166293e-07, + "loss": 0.0166, + "step": 7810 + }, + { + "epoch": 2.6084488228418765, + "grad_norm": 0.36940599428484794, + "learning_rate": 5.089776128748169e-07, + "loss": 0.0127, + "step": 7811 + }, + { + "epoch": 2.6087827684087492, + "grad_norm": 0.27347853902866026, + "learning_rate": 5.081237057548166e-07, + "loss": 0.0128, + "step": 7812 + }, + { + "epoch": 2.609116713975622, + "grad_norm": 0.501024325812926, + "learning_rate": 5.072704771856201e-07, + "loss": 0.0192, + "step": 7813 + }, + { + "epoch": 2.6094506595424947, + "grad_norm": 0.270871788245993, + "learning_rate": 5.06417927296119e-07, + "loss": 0.009, + "step": 7814 + }, + { + "epoch": 2.6097846051093674, + "grad_norm": 0.26017247693699724, + "learning_rate": 5.055660562150983e-07, + "loss": 0.0114, + "step": 7815 + }, + { + "epoch": 2.6101185506762397, + "grad_norm": 0.2854172663792184, + "learning_rate": 5.047148640712468e-07, + "loss": 0.0101, + "step": 7816 + }, + { + "epoch": 2.6104524962431124, + "grad_norm": 0.30607888740182465, + "learning_rate": 5.038643509931446e-07, + "loss": 0.0186, + "step": 7817 + }, + { + "epoch": 2.610786441809985, + "grad_norm": 0.3272155334397794, + "learning_rate": 5.030145171092732e-07, + "loss": 0.0143, + "step": 7818 + }, + { + "epoch": 2.6111203873768574, + "grad_norm": 0.26239799598991864, + "learning_rate": 5.021653625480089e-07, + "loss": 0.0119, + "step": 7819 + }, + { + "epoch": 2.61145433294373, + "grad_norm": 0.3149126051533802, + "learning_rate": 5.013168874376273e-07, + "loss": 0.0163, + "step": 7820 + }, + { + "epoch": 2.611788278510603, + "grad_norm": 0.33734061301169993, + "learning_rate": 5.004690919062983e-07, + "loss": 0.0138, + "step": 7821 + }, + { + "epoch": 2.6121222240774755, + "grad_norm": 0.3582999727333817, + "learning_rate": 4.996219760820947e-07, + "loss": 0.0145, + "step": 7822 + }, + { + "epoch": 2.6124561696443482, + "grad_norm": 0.36645911870783865, + "learning_rate": 4.987755400929817e-07, + "loss": 0.0124, + "step": 7823 + }, + { + "epoch": 2.6127901152112205, + "grad_norm": 0.263407979707053, + "learning_rate": 4.97929784066824e-07, + "loss": 0.0106, + "step": 7824 + }, + { + "epoch": 2.6131240607780932, + "grad_norm": 0.28669460111882933, + "learning_rate": 4.970847081313818e-07, + "loss": 0.0126, + "step": 7825 + }, + { + "epoch": 2.613458006344966, + "grad_norm": 0.34181469769634143, + "learning_rate": 4.962403124143156e-07, + "loss": 0.0134, + "step": 7826 + }, + { + "epoch": 2.613791951911838, + "grad_norm": 0.2943655509060492, + "learning_rate": 4.953965970431779e-07, + "loss": 0.0156, + "step": 7827 + }, + { + "epoch": 2.614125897478711, + "grad_norm": 0.2693809041343175, + "learning_rate": 4.945535621454268e-07, + "loss": 0.012, + "step": 7828 + }, + { + "epoch": 2.6144598430455837, + "grad_norm": 0.30883783427694184, + "learning_rate": 4.937112078484086e-07, + "loss": 0.0147, + "step": 7829 + }, + { + "epoch": 2.614793788612456, + "grad_norm": 0.30272494392255195, + "learning_rate": 4.928695342793733e-07, + "loss": 0.0163, + "step": 7830 + }, + { + "epoch": 2.6151277341793286, + "grad_norm": 0.3222143371438454, + "learning_rate": 4.92028541565464e-07, + "loss": 0.0099, + "step": 7831 + }, + { + "epoch": 2.6154616797462014, + "grad_norm": 0.35735085101889014, + "learning_rate": 4.911882298337228e-07, + "loss": 0.0175, + "step": 7832 + }, + { + "epoch": 2.615795625313074, + "grad_norm": 0.19956015862823392, + "learning_rate": 4.903485992110901e-07, + "loss": 0.0076, + "step": 7833 + }, + { + "epoch": 2.616129570879947, + "grad_norm": 0.30573601023846625, + "learning_rate": 4.895096498243995e-07, + "loss": 0.0118, + "step": 7834 + }, + { + "epoch": 2.616463516446819, + "grad_norm": 0.3836827700213579, + "learning_rate": 4.886713818003874e-07, + "loss": 0.0147, + "step": 7835 + }, + { + "epoch": 2.616797462013692, + "grad_norm": 0.2902416380310542, + "learning_rate": 4.878337952656809e-07, + "loss": 0.0152, + "step": 7836 + }, + { + "epoch": 2.6171314075805645, + "grad_norm": 0.3225021336855386, + "learning_rate": 4.869968903468092e-07, + "loss": 0.0159, + "step": 7837 + }, + { + "epoch": 2.617465353147437, + "grad_norm": 0.2990407742420705, + "learning_rate": 4.861606671701946e-07, + "loss": 0.0112, + "step": 7838 + }, + { + "epoch": 2.6177992987143095, + "grad_norm": 0.32854258306647427, + "learning_rate": 4.853251258621621e-07, + "loss": 0.0196, + "step": 7839 + }, + { + "epoch": 2.618133244281182, + "grad_norm": 0.3114884955059214, + "learning_rate": 4.844902665489265e-07, + "loss": 0.0143, + "step": 7840 + }, + { + "epoch": 2.618467189848055, + "grad_norm": 0.24263950940313359, + "learning_rate": 4.836560893566056e-07, + "loss": 0.0109, + "step": 7841 + }, + { + "epoch": 2.618801135414927, + "grad_norm": 0.2452556024663025, + "learning_rate": 4.828225944112097e-07, + "loss": 0.0153, + "step": 7842 + }, + { + "epoch": 2.6191350809818, + "grad_norm": 0.3056648642732613, + "learning_rate": 4.819897818386499e-07, + "loss": 0.0121, + "step": 7843 + }, + { + "epoch": 2.6194690265486726, + "grad_norm": 0.23198214125610783, + "learning_rate": 4.811576517647299e-07, + "loss": 0.0097, + "step": 7844 + }, + { + "epoch": 2.6198029721155454, + "grad_norm": 0.34270264884464957, + "learning_rate": 4.803262043151557e-07, + "loss": 0.0205, + "step": 7845 + }, + { + "epoch": 2.6201369176824176, + "grad_norm": 0.3403452881567089, + "learning_rate": 4.794954396155249e-07, + "loss": 0.0163, + "step": 7846 + }, + { + "epoch": 2.6204708632492903, + "grad_norm": 0.28619503287707593, + "learning_rate": 4.786653577913364e-07, + "loss": 0.0144, + "step": 7847 + }, + { + "epoch": 2.620804808816163, + "grad_norm": 0.29811911634736843, + "learning_rate": 4.77835958967981e-07, + "loss": 0.0143, + "step": 7848 + }, + { + "epoch": 2.6211387543830353, + "grad_norm": 0.2733347677589082, + "learning_rate": 4.770072432707523e-07, + "loss": 0.0106, + "step": 7849 + }, + { + "epoch": 2.621472699949908, + "grad_norm": 0.35678348971836715, + "learning_rate": 4.761792108248342e-07, + "loss": 0.0228, + "step": 7850 + }, + { + "epoch": 2.6218066455167808, + "grad_norm": 0.30352719624801333, + "learning_rate": 4.753518617553138e-07, + "loss": 0.0163, + "step": 7851 + }, + { + "epoch": 2.6221405910836535, + "grad_norm": 0.2541955296554193, + "learning_rate": 4.745251961871705e-07, + "loss": 0.0155, + "step": 7852 + }, + { + "epoch": 2.622474536650526, + "grad_norm": 0.3092492674665828, + "learning_rate": 4.736992142452823e-07, + "loss": 0.0149, + "step": 7853 + }, + { + "epoch": 2.6228084822173985, + "grad_norm": 0.3452254837202833, + "learning_rate": 4.728739160544227e-07, + "loss": 0.0132, + "step": 7854 + }, + { + "epoch": 2.623142427784271, + "grad_norm": 0.29395539921865604, + "learning_rate": 4.720493017392641e-07, + "loss": 0.013, + "step": 7855 + }, + { + "epoch": 2.623476373351144, + "grad_norm": 0.2625686656669474, + "learning_rate": 4.712253714243725e-07, + "loss": 0.0122, + "step": 7856 + }, + { + "epoch": 2.623810318918016, + "grad_norm": 0.30476099803603063, + "learning_rate": 4.7040212523421335e-07, + "loss": 0.0155, + "step": 7857 + }, + { + "epoch": 2.624144264484889, + "grad_norm": 0.28600785805079787, + "learning_rate": 4.695795632931477e-07, + "loss": 0.0156, + "step": 7858 + }, + { + "epoch": 2.6244782100517616, + "grad_norm": 0.20763764250099112, + "learning_rate": 4.687576857254328e-07, + "loss": 0.0066, + "step": 7859 + }, + { + "epoch": 2.624812155618634, + "grad_norm": 0.2392660061310057, + "learning_rate": 4.679364926552238e-07, + "loss": 0.0087, + "step": 7860 + }, + { + "epoch": 2.6251461011855066, + "grad_norm": 0.2398332101110139, + "learning_rate": 4.671159842065698e-07, + "loss": 0.0123, + "step": 7861 + }, + { + "epoch": 2.6254800467523793, + "grad_norm": 0.3110648976872475, + "learning_rate": 4.662961605034194e-07, + "loss": 0.0143, + "step": 7862 + }, + { + "epoch": 2.625813992319252, + "grad_norm": 0.25693346836410147, + "learning_rate": 4.654770216696169e-07, + "loss": 0.0099, + "step": 7863 + }, + { + "epoch": 2.6261479378861248, + "grad_norm": 0.28816989474842675, + "learning_rate": 4.646585678289034e-07, + "loss": 0.0179, + "step": 7864 + }, + { + "epoch": 2.626481883452997, + "grad_norm": 0.3194372432316785, + "learning_rate": 4.6384079910491376e-07, + "loss": 0.0139, + "step": 7865 + }, + { + "epoch": 2.6268158290198698, + "grad_norm": 0.2594581126553459, + "learning_rate": 4.630237156211842e-07, + "loss": 0.0107, + "step": 7866 + }, + { + "epoch": 2.6271497745867425, + "grad_norm": 0.32859966823538417, + "learning_rate": 4.6220731750114267e-07, + "loss": 0.0208, + "step": 7867 + }, + { + "epoch": 2.6274837201536148, + "grad_norm": 0.3396579390284738, + "learning_rate": 4.6139160486811663e-07, + "loss": 0.0179, + "step": 7868 + }, + { + "epoch": 2.6278176657204875, + "grad_norm": 0.2345310176014767, + "learning_rate": 4.605765778453292e-07, + "loss": 0.0091, + "step": 7869 + }, + { + "epoch": 2.62815161128736, + "grad_norm": 0.3285333915493347, + "learning_rate": 4.597622365559007e-07, + "loss": 0.0267, + "step": 7870 + }, + { + "epoch": 2.628485556854233, + "grad_norm": 0.26953723072086627, + "learning_rate": 4.5894858112284445e-07, + "loss": 0.0111, + "step": 7871 + }, + { + "epoch": 2.6288195024211056, + "grad_norm": 0.24074633675252588, + "learning_rate": 4.581356116690755e-07, + "loss": 0.0105, + "step": 7872 + }, + { + "epoch": 2.629153447987978, + "grad_norm": 0.1822939536673304, + "learning_rate": 4.573233283173989e-07, + "loss": 0.0077, + "step": 7873 + }, + { + "epoch": 2.6294873935548506, + "grad_norm": 0.2215272780562539, + "learning_rate": 4.5651173119052427e-07, + "loss": 0.0085, + "step": 7874 + }, + { + "epoch": 2.6298213391217233, + "grad_norm": 0.3693724794689987, + "learning_rate": 4.5570082041104915e-07, + "loss": 0.0156, + "step": 7875 + }, + { + "epoch": 2.6301552846885956, + "grad_norm": 0.27095227834966373, + "learning_rate": 4.5489059610147323e-07, + "loss": 0.0128, + "step": 7876 + }, + { + "epoch": 2.6304892302554683, + "grad_norm": 0.2618553764053157, + "learning_rate": 4.5408105838418924e-07, + "loss": 0.01, + "step": 7877 + }, + { + "epoch": 2.630823175822341, + "grad_norm": 0.3314126642674739, + "learning_rate": 4.5327220738148823e-07, + "loss": 0.0177, + "step": 7878 + }, + { + "epoch": 2.6311571213892133, + "grad_norm": 0.2184385277216219, + "learning_rate": 4.524640432155558e-07, + "loss": 0.0124, + "step": 7879 + }, + { + "epoch": 2.631491066956086, + "grad_norm": 0.23886265818361968, + "learning_rate": 4.516565660084754e-07, + "loss": 0.0106, + "step": 7880 + }, + { + "epoch": 2.6318250125229588, + "grad_norm": 0.2093007838288809, + "learning_rate": 4.5084977588222613e-07, + "loss": 0.0102, + "step": 7881 + }, + { + "epoch": 2.6321589580898315, + "grad_norm": 0.22978651514981277, + "learning_rate": 4.500436729586821e-07, + "loss": 0.011, + "step": 7882 + }, + { + "epoch": 2.632492903656704, + "grad_norm": 0.25385386043168123, + "learning_rate": 4.4923825735961604e-07, + "loss": 0.011, + "step": 7883 + }, + { + "epoch": 2.6328268492235765, + "grad_norm": 0.29391073574245624, + "learning_rate": 4.484335292066938e-07, + "loss": 0.0111, + "step": 7884 + }, + { + "epoch": 2.633160794790449, + "grad_norm": 0.21919384575116094, + "learning_rate": 4.476294886214799e-07, + "loss": 0.0062, + "step": 7885 + }, + { + "epoch": 2.633494740357322, + "grad_norm": 0.3645711293848547, + "learning_rate": 4.468261357254339e-07, + "loss": 0.0151, + "step": 7886 + }, + { + "epoch": 2.633828685924194, + "grad_norm": 0.23348420561641633, + "learning_rate": 4.46023470639913e-07, + "loss": 0.0126, + "step": 7887 + }, + { + "epoch": 2.634162631491067, + "grad_norm": 0.4223737283881882, + "learning_rate": 4.452214934861676e-07, + "loss": 0.0233, + "step": 7888 + }, + { + "epoch": 2.6344965770579396, + "grad_norm": 0.4017994011427713, + "learning_rate": 4.4442020438534737e-07, + "loss": 0.0228, + "step": 7889 + }, + { + "epoch": 2.6348305226248123, + "grad_norm": 0.25590162258996446, + "learning_rate": 4.436196034584944e-07, + "loss": 0.0136, + "step": 7890 + }, + { + "epoch": 2.6351644681916846, + "grad_norm": 0.2628167263739924, + "learning_rate": 4.4281969082654976e-07, + "loss": 0.0124, + "step": 7891 + }, + { + "epoch": 2.6354984137585573, + "grad_norm": 0.33306797091433593, + "learning_rate": 4.4202046661035e-07, + "loss": 0.0101, + "step": 7892 + }, + { + "epoch": 2.63583235932543, + "grad_norm": 0.2899528935574976, + "learning_rate": 4.4122193093062815e-07, + "loss": 0.016, + "step": 7893 + }, + { + "epoch": 2.6361663048923027, + "grad_norm": 0.2715359116789147, + "learning_rate": 4.4042408390801097e-07, + "loss": 0.0091, + "step": 7894 + }, + { + "epoch": 2.636500250459175, + "grad_norm": 0.2292529748099387, + "learning_rate": 4.3962692566302366e-07, + "loss": 0.0082, + "step": 7895 + }, + { + "epoch": 2.6368341960260477, + "grad_norm": 0.2674417840369013, + "learning_rate": 4.38830456316085e-07, + "loss": 0.013, + "step": 7896 + }, + { + "epoch": 2.6371681415929205, + "grad_norm": 0.3124380951995918, + "learning_rate": 4.38034675987512e-07, + "loss": 0.0152, + "step": 7897 + }, + { + "epoch": 2.6375020871597927, + "grad_norm": 0.34431549977777304, + "learning_rate": 4.372395847975164e-07, + "loss": 0.0179, + "step": 7898 + }, + { + "epoch": 2.6378360327266654, + "grad_norm": 0.3441224854387786, + "learning_rate": 4.364451828662075e-07, + "loss": 0.0124, + "step": 7899 + }, + { + "epoch": 2.638169978293538, + "grad_norm": 0.34566215561888247, + "learning_rate": 4.356514703135867e-07, + "loss": 0.0211, + "step": 7900 + }, + { + "epoch": 2.638503923860411, + "grad_norm": 0.28016577725788827, + "learning_rate": 4.348584472595557e-07, + "loss": 0.0096, + "step": 7901 + }, + { + "epoch": 2.6388378694272836, + "grad_norm": 0.26713189067875304, + "learning_rate": 4.3406611382390826e-07, + "loss": 0.0105, + "step": 7902 + }, + { + "epoch": 2.639171814994156, + "grad_norm": 0.26716024400740856, + "learning_rate": 4.3327447012633695e-07, + "loss": 0.0118, + "step": 7903 + }, + { + "epoch": 2.6395057605610286, + "grad_norm": 0.23008488659461046, + "learning_rate": 4.324835162864283e-07, + "loss": 0.0075, + "step": 7904 + }, + { + "epoch": 2.6398397061279013, + "grad_norm": 0.25282644215243727, + "learning_rate": 4.31693252423665e-07, + "loss": 0.0109, + "step": 7905 + }, + { + "epoch": 2.6401736516947736, + "grad_norm": 0.2528030423118093, + "learning_rate": 4.3090367865742666e-07, + "loss": 0.0114, + "step": 7906 + }, + { + "epoch": 2.6405075972616463, + "grad_norm": 0.26423103992968655, + "learning_rate": 4.3011479510698615e-07, + "loss": 0.0112, + "step": 7907 + }, + { + "epoch": 2.640841542828519, + "grad_norm": 0.2417972838722505, + "learning_rate": 4.293266018915149e-07, + "loss": 0.0125, + "step": 7908 + }, + { + "epoch": 2.6411754883953913, + "grad_norm": 0.2200353282870818, + "learning_rate": 4.2853909913007807e-07, + "loss": 0.008, + "step": 7909 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.3074011894361888, + "learning_rate": 4.277522869416384e-07, + "loss": 0.0158, + "step": 7910 + }, + { + "epoch": 2.6418433795291367, + "grad_norm": 0.28136241325893485, + "learning_rate": 4.269661654450513e-07, + "loss": 0.0107, + "step": 7911 + }, + { + "epoch": 2.6421773250960094, + "grad_norm": 0.22021278378467113, + "learning_rate": 4.261807347590713e-07, + "loss": 0.0084, + "step": 7912 + }, + { + "epoch": 2.642511270662882, + "grad_norm": 0.26388601100882164, + "learning_rate": 4.253959950023456e-07, + "loss": 0.0143, + "step": 7913 + }, + { + "epoch": 2.6428452162297544, + "grad_norm": 0.2833154500686583, + "learning_rate": 4.246119462934195e-07, + "loss": 0.0131, + "step": 7914 + }, + { + "epoch": 2.643179161796627, + "grad_norm": 0.2923873934305097, + "learning_rate": 4.238285887507315e-07, + "loss": 0.0151, + "step": 7915 + }, + { + "epoch": 2.6435131073635, + "grad_norm": 0.24641108987918847, + "learning_rate": 4.230459224926198e-07, + "loss": 0.0087, + "step": 7916 + }, + { + "epoch": 2.643847052930372, + "grad_norm": 0.3087129446537835, + "learning_rate": 4.222639476373119e-07, + "loss": 0.016, + "step": 7917 + }, + { + "epoch": 2.644180998497245, + "grad_norm": 0.2963660809794913, + "learning_rate": 4.2148266430293627e-07, + "loss": 0.0115, + "step": 7918 + }, + { + "epoch": 2.6445149440641176, + "grad_norm": 0.31529640063222963, + "learning_rate": 4.207020726075145e-07, + "loss": 0.0169, + "step": 7919 + }, + { + "epoch": 2.6448488896309903, + "grad_norm": 0.30567255421916395, + "learning_rate": 4.199221726689634e-07, + "loss": 0.0171, + "step": 7920 + }, + { + "epoch": 2.645182835197863, + "grad_norm": 0.2694320969090033, + "learning_rate": 4.191429646050971e-07, + "loss": 0.011, + "step": 7921 + }, + { + "epoch": 2.6455167807647353, + "grad_norm": 0.24352837601722344, + "learning_rate": 4.1836444853362465e-07, + "loss": 0.01, + "step": 7922 + }, + { + "epoch": 2.645850726331608, + "grad_norm": 0.29336149931524796, + "learning_rate": 4.1758662457214884e-07, + "loss": 0.0086, + "step": 7923 + }, + { + "epoch": 2.6461846718984807, + "grad_norm": 0.2433622100799936, + "learning_rate": 4.1680949283816996e-07, + "loss": 0.0083, + "step": 7924 + }, + { + "epoch": 2.646518617465353, + "grad_norm": 0.3011407689417221, + "learning_rate": 4.160330534490814e-07, + "loss": 0.0138, + "step": 7925 + }, + { + "epoch": 2.6468525630322257, + "grad_norm": 0.3218541200246468, + "learning_rate": 4.152573065221749e-07, + "loss": 0.0115, + "step": 7926 + }, + { + "epoch": 2.6471865085990984, + "grad_norm": 0.27047284576994124, + "learning_rate": 4.1448225217463724e-07, + "loss": 0.0128, + "step": 7927 + }, + { + "epoch": 2.6475204541659707, + "grad_norm": 0.21217979110827842, + "learning_rate": 4.1370789052354644e-07, + "loss": 0.0095, + "step": 7928 + }, + { + "epoch": 2.6478543997328434, + "grad_norm": 0.2692342587483447, + "learning_rate": 4.129342216858817e-07, + "loss": 0.0128, + "step": 7929 + }, + { + "epoch": 2.648188345299716, + "grad_norm": 0.34308220776747567, + "learning_rate": 4.1216124577851293e-07, + "loss": 0.0165, + "step": 7930 + }, + { + "epoch": 2.648522290866589, + "grad_norm": 0.37272679841455425, + "learning_rate": 4.113889629182083e-07, + "loss": 0.015, + "step": 7931 + }, + { + "epoch": 2.6488562364334616, + "grad_norm": 0.3403401664272049, + "learning_rate": 4.106173732216295e-07, + "loss": 0.021, + "step": 7932 + }, + { + "epoch": 2.649190182000334, + "grad_norm": 0.29545181861376985, + "learning_rate": 4.0984647680533564e-07, + "loss": 0.0136, + "step": 7933 + }, + { + "epoch": 2.6495241275672066, + "grad_norm": 0.29643565717233555, + "learning_rate": 4.090762737857784e-07, + "loss": 0.0127, + "step": 7934 + }, + { + "epoch": 2.6498580731340793, + "grad_norm": 0.4484958160602119, + "learning_rate": 4.0830676427930646e-07, + "loss": 0.014, + "step": 7935 + }, + { + "epoch": 2.6501920187009516, + "grad_norm": 0.24193996415630625, + "learning_rate": 4.0753794840216296e-07, + "loss": 0.0123, + "step": 7936 + }, + { + "epoch": 2.6505259642678243, + "grad_norm": 0.34522343178556875, + "learning_rate": 4.067698262704878e-07, + "loss": 0.0194, + "step": 7937 + }, + { + "epoch": 2.650859909834697, + "grad_norm": 0.2816068497944424, + "learning_rate": 4.0600239800031136e-07, + "loss": 0.0163, + "step": 7938 + }, + { + "epoch": 2.6511938554015697, + "grad_norm": 0.28276559725879397, + "learning_rate": 4.0523566370756774e-07, + "loss": 0.0105, + "step": 7939 + }, + { + "epoch": 2.651527800968442, + "grad_norm": 0.3014795509320092, + "learning_rate": 4.044696235080775e-07, + "loss": 0.015, + "step": 7940 + }, + { + "epoch": 2.6518617465353147, + "grad_norm": 0.2974375921754659, + "learning_rate": 4.037042775175626e-07, + "loss": 0.0143, + "step": 7941 + }, + { + "epoch": 2.6521956921021874, + "grad_norm": 0.52780190094282, + "learning_rate": 4.0293962585163493e-07, + "loss": 0.0218, + "step": 7942 + }, + { + "epoch": 2.65252963766906, + "grad_norm": 0.25706312676563936, + "learning_rate": 4.02175668625806e-07, + "loss": 0.0113, + "step": 7943 + }, + { + "epoch": 2.6528635832359324, + "grad_norm": 0.2235437248344753, + "learning_rate": 4.014124059554786e-07, + "loss": 0.0126, + "step": 7944 + }, + { + "epoch": 2.653197528802805, + "grad_norm": 0.2884744355343741, + "learning_rate": 4.006498379559559e-07, + "loss": 0.0126, + "step": 7945 + }, + { + "epoch": 2.653531474369678, + "grad_norm": 0.2805849343560099, + "learning_rate": 3.9988796474242977e-07, + "loss": 0.0143, + "step": 7946 + }, + { + "epoch": 2.65386541993655, + "grad_norm": 0.27667167315839414, + "learning_rate": 3.9912678642999134e-07, + "loss": 0.0159, + "step": 7947 + }, + { + "epoch": 2.654199365503423, + "grad_norm": 0.2572554077178099, + "learning_rate": 3.983663031336249e-07, + "loss": 0.0106, + "step": 7948 + }, + { + "epoch": 2.6545333110702956, + "grad_norm": 0.2528709531305157, + "learning_rate": 3.976065149682112e-07, + "loss": 0.0102, + "step": 7949 + }, + { + "epoch": 2.6548672566371683, + "grad_norm": 0.23314522901235568, + "learning_rate": 3.968474220485252e-07, + "loss": 0.0129, + "step": 7950 + }, + { + "epoch": 2.655201202204041, + "grad_norm": 0.4715283509730433, + "learning_rate": 3.960890244892362e-07, + "loss": 0.0247, + "step": 7951 + }, + { + "epoch": 2.6555351477709133, + "grad_norm": 0.252285031994945, + "learning_rate": 3.953313224049099e-07, + "loss": 0.0123, + "step": 7952 + }, + { + "epoch": 2.655869093337786, + "grad_norm": 0.3095174305167158, + "learning_rate": 3.945743159100046e-07, + "loss": 0.0087, + "step": 7953 + }, + { + "epoch": 2.6562030389046587, + "grad_norm": 0.28143247101252206, + "learning_rate": 3.938180051188756e-07, + "loss": 0.0145, + "step": 7954 + }, + { + "epoch": 2.656536984471531, + "grad_norm": 0.26371545852487455, + "learning_rate": 3.930623901457736e-07, + "loss": 0.0111, + "step": 7955 + }, + { + "epoch": 2.6568709300384037, + "grad_norm": 0.32299396913893724, + "learning_rate": 3.92307471104843e-07, + "loss": 0.0142, + "step": 7956 + }, + { + "epoch": 2.6572048756052764, + "grad_norm": 0.2540323397540815, + "learning_rate": 3.915532481101225e-07, + "loss": 0.0128, + "step": 7957 + }, + { + "epoch": 2.6575388211721487, + "grad_norm": 0.3154603263706469, + "learning_rate": 3.9079972127554657e-07, + "loss": 0.0176, + "step": 7958 + }, + { + "epoch": 2.6578727667390214, + "grad_norm": 0.29047185577927653, + "learning_rate": 3.9004689071494406e-07, + "loss": 0.0144, + "step": 7959 + }, + { + "epoch": 2.658206712305894, + "grad_norm": 0.25129909334646433, + "learning_rate": 3.8929475654203963e-07, + "loss": 0.0111, + "step": 7960 + }, + { + "epoch": 2.658540657872767, + "grad_norm": 0.3390049623505517, + "learning_rate": 3.8854331887045016e-07, + "loss": 0.0158, + "step": 7961 + }, + { + "epoch": 2.6588746034396396, + "grad_norm": 0.2527729454463676, + "learning_rate": 3.877925778136921e-07, + "loss": 0.0143, + "step": 7962 + }, + { + "epoch": 2.659208549006512, + "grad_norm": 0.3583623696241767, + "learning_rate": 3.870425334851713e-07, + "loss": 0.0205, + "step": 7963 + }, + { + "epoch": 2.6595424945733845, + "grad_norm": 0.33013999254450654, + "learning_rate": 3.8629318599819224e-07, + "loss": 0.0208, + "step": 7964 + }, + { + "epoch": 2.6598764401402573, + "grad_norm": 0.23859842709351678, + "learning_rate": 3.855445354659515e-07, + "loss": 0.0128, + "step": 7965 + }, + { + "epoch": 2.6602103857071295, + "grad_norm": 0.2781061623677838, + "learning_rate": 3.847965820015426e-07, + "loss": 0.0138, + "step": 7966 + }, + { + "epoch": 2.6605443312740022, + "grad_norm": 0.2925399044881, + "learning_rate": 3.8404932571795115e-07, + "loss": 0.0116, + "step": 7967 + }, + { + "epoch": 2.660878276840875, + "grad_norm": 0.31565697847506213, + "learning_rate": 3.833027667280614e-07, + "loss": 0.0144, + "step": 7968 + }, + { + "epoch": 2.6612122224077477, + "grad_norm": 0.31843999255394795, + "learning_rate": 3.825569051446476e-07, + "loss": 0.01, + "step": 7969 + }, + { + "epoch": 2.6615461679746204, + "grad_norm": 0.2959583423166002, + "learning_rate": 3.8181174108038286e-07, + "loss": 0.0121, + "step": 7970 + }, + { + "epoch": 2.6618801135414927, + "grad_norm": 0.31047100804786, + "learning_rate": 3.810672746478317e-07, + "loss": 0.0151, + "step": 7971 + }, + { + "epoch": 2.6622140591083654, + "grad_norm": 0.3295440901634914, + "learning_rate": 3.803235059594551e-07, + "loss": 0.0189, + "step": 7972 + }, + { + "epoch": 2.662548004675238, + "grad_norm": 0.3180453972385154, + "learning_rate": 3.795804351276072e-07, + "loss": 0.0173, + "step": 7973 + }, + { + "epoch": 2.6628819502421104, + "grad_norm": 0.2767718062908812, + "learning_rate": 3.788380622645382e-07, + "loss": 0.0161, + "step": 7974 + }, + { + "epoch": 2.663215895808983, + "grad_norm": 0.2735517432168516, + "learning_rate": 3.780963874823934e-07, + "loss": 0.0132, + "step": 7975 + }, + { + "epoch": 2.663549841375856, + "grad_norm": 0.32422867861538396, + "learning_rate": 3.773554108932093e-07, + "loss": 0.0152, + "step": 7976 + }, + { + "epoch": 2.663883786942728, + "grad_norm": 0.37788016034916155, + "learning_rate": 3.7661513260892067e-07, + "loss": 0.0158, + "step": 7977 + }, + { + "epoch": 2.664217732509601, + "grad_norm": 0.3500274988992972, + "learning_rate": 3.7587555274135544e-07, + "loss": 0.0198, + "step": 7978 + }, + { + "epoch": 2.6645516780764735, + "grad_norm": 0.37085429756111626, + "learning_rate": 3.751366714022342e-07, + "loss": 0.019, + "step": 7979 + }, + { + "epoch": 2.6648856236433462, + "grad_norm": 0.25528020499862125, + "learning_rate": 3.7439848870317487e-07, + "loss": 0.0118, + "step": 7980 + }, + { + "epoch": 2.665219569210219, + "grad_norm": 0.3816040838362776, + "learning_rate": 3.7366100475568935e-07, + "loss": 0.022, + "step": 7981 + }, + { + "epoch": 2.6655535147770912, + "grad_norm": 0.7436971589685613, + "learning_rate": 3.7292421967118185e-07, + "loss": 0.0178, + "step": 7982 + }, + { + "epoch": 2.665887460343964, + "grad_norm": 0.26221711860801566, + "learning_rate": 3.72188133560954e-07, + "loss": 0.013, + "step": 7983 + }, + { + "epoch": 2.6662214059108367, + "grad_norm": 0.29950999992828703, + "learning_rate": 3.7145274653619776e-07, + "loss": 0.0136, + "step": 7984 + }, + { + "epoch": 2.666555351477709, + "grad_norm": 0.21266047381305353, + "learning_rate": 3.7071805870800395e-07, + "loss": 0.0088, + "step": 7985 + }, + { + "epoch": 2.6668892970445817, + "grad_norm": 0.30667339393285914, + "learning_rate": 3.6998407018735525e-07, + "loss": 0.0127, + "step": 7986 + }, + { + "epoch": 2.6672232426114544, + "grad_norm": 0.33277628709798035, + "learning_rate": 3.6925078108513033e-07, + "loss": 0.0222, + "step": 7987 + }, + { + "epoch": 2.667557188178327, + "grad_norm": 0.3070737249695833, + "learning_rate": 3.6851819151209947e-07, + "loss": 0.0096, + "step": 7988 + }, + { + "epoch": 2.6678911337451994, + "grad_norm": 0.34693941534621314, + "learning_rate": 3.677863015789307e-07, + "loss": 0.0129, + "step": 7989 + }, + { + "epoch": 2.668225079312072, + "grad_norm": 0.24484842749232208, + "learning_rate": 3.6705511139618177e-07, + "loss": 0.0096, + "step": 7990 + }, + { + "epoch": 2.668559024878945, + "grad_norm": 0.3031856005710634, + "learning_rate": 3.66324621074311e-07, + "loss": 0.0182, + "step": 7991 + }, + { + "epoch": 2.6688929704458175, + "grad_norm": 0.2435595058431979, + "learning_rate": 3.6559483072366506e-07, + "loss": 0.0104, + "step": 7992 + }, + { + "epoch": 2.66922691601269, + "grad_norm": 0.26620775820370635, + "learning_rate": 3.6486574045448973e-07, + "loss": 0.0141, + "step": 7993 + }, + { + "epoch": 2.6695608615795625, + "grad_norm": 0.24179455274565803, + "learning_rate": 3.6413735037691966e-07, + "loss": 0.0087, + "step": 7994 + }, + { + "epoch": 2.6698948071464352, + "grad_norm": 0.30278456733418374, + "learning_rate": 3.634096606009896e-07, + "loss": 0.014, + "step": 7995 + }, + { + "epoch": 2.6702287527133075, + "grad_norm": 0.2328723905753084, + "learning_rate": 3.626826712366233e-07, + "loss": 0.0116, + "step": 7996 + }, + { + "epoch": 2.6705626982801802, + "grad_norm": 0.2423226347129979, + "learning_rate": 3.6195638239364225e-07, + "loss": 0.0106, + "step": 7997 + }, + { + "epoch": 2.670896643847053, + "grad_norm": 0.2854525409037026, + "learning_rate": 3.612307941817622e-07, + "loss": 0.0146, + "step": 7998 + }, + { + "epoch": 2.6712305894139257, + "grad_norm": 0.2186770992320193, + "learning_rate": 3.605059067105887e-07, + "loss": 0.0092, + "step": 7999 + }, + { + "epoch": 2.6715645349807984, + "grad_norm": 0.3058158622552652, + "learning_rate": 3.59781720089627e-07, + "loss": 0.014, + "step": 8000 + }, + { + "epoch": 2.6718984805476707, + "grad_norm": 0.34223709014788484, + "learning_rate": 3.5905823442827393e-07, + "loss": 0.0137, + "step": 8001 + }, + { + "epoch": 2.6722324261145434, + "grad_norm": 0.30259500138328077, + "learning_rate": 3.583354498358188e-07, + "loss": 0.0157, + "step": 8002 + }, + { + "epoch": 2.672566371681416, + "grad_norm": 0.22158030938206486, + "learning_rate": 3.576133664214476e-07, + "loss": 0.0097, + "step": 8003 + }, + { + "epoch": 2.6729003172482884, + "grad_norm": 0.2649250336288251, + "learning_rate": 3.568919842942409e-07, + "loss": 0.012, + "step": 8004 + }, + { + "epoch": 2.673234262815161, + "grad_norm": 0.2637569150240111, + "learning_rate": 3.5617130356316977e-07, + "loss": 0.0102, + "step": 8005 + }, + { + "epoch": 2.673568208382034, + "grad_norm": 0.30541122925855263, + "learning_rate": 3.554513243371038e-07, + "loss": 0.016, + "step": 8006 + }, + { + "epoch": 2.673902153948906, + "grad_norm": 0.27997727959432517, + "learning_rate": 3.5473204672480224e-07, + "loss": 0.0157, + "step": 8007 + }, + { + "epoch": 2.674236099515779, + "grad_norm": 0.30358337684296777, + "learning_rate": 3.5401347083492077e-07, + "loss": 0.0124, + "step": 8008 + }, + { + "epoch": 2.6745700450826515, + "grad_norm": 0.25635752574494675, + "learning_rate": 3.532955967760093e-07, + "loss": 0.0093, + "step": 8009 + }, + { + "epoch": 2.674903990649524, + "grad_norm": 0.2681677235283582, + "learning_rate": 3.5257842465651226e-07, + "loss": 0.0118, + "step": 8010 + }, + { + "epoch": 2.675237936216397, + "grad_norm": 0.28356099347718106, + "learning_rate": 3.5186195458476515e-07, + "loss": 0.0122, + "step": 8011 + }, + { + "epoch": 2.675571881783269, + "grad_norm": 0.2990796627631461, + "learning_rate": 3.5114618666900023e-07, + "loss": 0.0121, + "step": 8012 + }, + { + "epoch": 2.675905827350142, + "grad_norm": 0.29134438124978296, + "learning_rate": 3.5043112101734166e-07, + "loss": 0.0132, + "step": 8013 + }, + { + "epoch": 2.6762397729170146, + "grad_norm": 0.22071778531624958, + "learning_rate": 3.4971675773780913e-07, + "loss": 0.0109, + "step": 8014 + }, + { + "epoch": 2.676573718483887, + "grad_norm": 0.2844547768955682, + "learning_rate": 3.490030969383157e-07, + "loss": 0.0162, + "step": 8015 + }, + { + "epoch": 2.6769076640507596, + "grad_norm": 0.31511100856874213, + "learning_rate": 3.482901387266685e-07, + "loss": 0.0139, + "step": 8016 + }, + { + "epoch": 2.6772416096176324, + "grad_norm": 0.22112655020495478, + "learning_rate": 3.475778832105681e-07, + "loss": 0.0108, + "step": 8017 + }, + { + "epoch": 2.677575555184505, + "grad_norm": 0.3212531174791497, + "learning_rate": 3.468663304976089e-07, + "loss": 0.0172, + "step": 8018 + }, + { + "epoch": 2.677909500751378, + "grad_norm": 0.26223578531047076, + "learning_rate": 3.4615548069527883e-07, + "loss": 0.011, + "step": 8019 + }, + { + "epoch": 2.67824344631825, + "grad_norm": 0.36040554371455963, + "learning_rate": 3.4544533391096093e-07, + "loss": 0.0136, + "step": 8020 + }, + { + "epoch": 2.678577391885123, + "grad_norm": 0.4739960398196577, + "learning_rate": 3.4473589025193155e-07, + "loss": 0.0216, + "step": 8021 + }, + { + "epoch": 2.6789113374519955, + "grad_norm": 0.3343868088008449, + "learning_rate": 3.440271498253589e-07, + "loss": 0.0134, + "step": 8022 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 0.31885474021106164, + "learning_rate": 3.433191127383079e-07, + "loss": 0.0138, + "step": 8023 + }, + { + "epoch": 2.6795792285857405, + "grad_norm": 0.21750168149864377, + "learning_rate": 3.4261177909773624e-07, + "loss": 0.0098, + "step": 8024 + }, + { + "epoch": 2.679913174152613, + "grad_norm": 0.2439737161928737, + "learning_rate": 3.419051490104935e-07, + "loss": 0.0086, + "step": 8025 + }, + { + "epoch": 2.6802471197194855, + "grad_norm": 0.2600754845704635, + "learning_rate": 3.4119922258332496e-07, + "loss": 0.0118, + "step": 8026 + }, + { + "epoch": 2.680581065286358, + "grad_norm": 0.22698588542062947, + "learning_rate": 3.4049399992287067e-07, + "loss": 0.0072, + "step": 8027 + }, + { + "epoch": 2.680915010853231, + "grad_norm": 0.2628552445714976, + "learning_rate": 3.3978948113566056e-07, + "loss": 0.011, + "step": 8028 + }, + { + "epoch": 2.6812489564201036, + "grad_norm": 0.3248050397345352, + "learning_rate": 3.390856663281228e-07, + "loss": 0.0158, + "step": 8029 + }, + { + "epoch": 2.6815829019869764, + "grad_norm": 0.2863939964227281, + "learning_rate": 3.3838255560657453e-07, + "loss": 0.012, + "step": 8030 + }, + { + "epoch": 2.6819168475538486, + "grad_norm": 0.3037715014542918, + "learning_rate": 3.3768014907722966e-07, + "loss": 0.0145, + "step": 8031 + }, + { + "epoch": 2.6822507931207213, + "grad_norm": 0.2524322097338231, + "learning_rate": 3.369784468461956e-07, + "loss": 0.0097, + "step": 8032 + }, + { + "epoch": 2.682584738687594, + "grad_norm": 0.3118841681447238, + "learning_rate": 3.3627744901947313e-07, + "loss": 0.0142, + "step": 8033 + }, + { + "epoch": 2.6829186842544663, + "grad_norm": 0.310488698328538, + "learning_rate": 3.3557715570295523e-07, + "loss": 0.0141, + "step": 8034 + }, + { + "epoch": 2.683252629821339, + "grad_norm": 0.32455197075888503, + "learning_rate": 3.3487756700243014e-07, + "loss": 0.0156, + "step": 8035 + }, + { + "epoch": 2.6835865753882118, + "grad_norm": 0.3049481263981558, + "learning_rate": 3.341786830235777e-07, + "loss": 0.0149, + "step": 8036 + }, + { + "epoch": 2.6839205209550845, + "grad_norm": 0.27165623449050624, + "learning_rate": 3.334805038719735e-07, + "loss": 0.0097, + "step": 8037 + }, + { + "epoch": 2.6842544665219568, + "grad_norm": 0.3288765366179696, + "learning_rate": 3.3278302965308593e-07, + "loss": 0.0115, + "step": 8038 + }, + { + "epoch": 2.6845884120888295, + "grad_norm": 0.28492294659238515, + "learning_rate": 3.3208626047227687e-07, + "loss": 0.0186, + "step": 8039 + }, + { + "epoch": 2.684922357655702, + "grad_norm": 0.2755663138797607, + "learning_rate": 3.313901964348004e-07, + "loss": 0.0127, + "step": 8040 + }, + { + "epoch": 2.685256303222575, + "grad_norm": 0.32871736218643544, + "learning_rate": 3.306948376458069e-07, + "loss": 0.0188, + "step": 8041 + }, + { + "epoch": 2.685590248789447, + "grad_norm": 0.3093111892613027, + "learning_rate": 3.3000018421033675e-07, + "loss": 0.0153, + "step": 8042 + }, + { + "epoch": 2.68592419435632, + "grad_norm": 0.2796736650506178, + "learning_rate": 3.29306236233326e-07, + "loss": 0.0144, + "step": 8043 + }, + { + "epoch": 2.6862581399231926, + "grad_norm": 0.31501408410236686, + "learning_rate": 3.286129938196048e-07, + "loss": 0.0093, + "step": 8044 + }, + { + "epoch": 2.686592085490065, + "grad_norm": 0.28540574385417367, + "learning_rate": 3.279204570738936e-07, + "loss": 0.0154, + "step": 8045 + }, + { + "epoch": 2.6869260310569376, + "grad_norm": 0.2695178465695714, + "learning_rate": 3.272286261008095e-07, + "loss": 0.0107, + "step": 8046 + }, + { + "epoch": 2.6872599766238103, + "grad_norm": 0.2998989004523264, + "learning_rate": 3.2653750100486213e-07, + "loss": 0.0103, + "step": 8047 + }, + { + "epoch": 2.687593922190683, + "grad_norm": 0.2582947189414363, + "learning_rate": 3.25847081890453e-07, + "loss": 0.0116, + "step": 8048 + }, + { + "epoch": 2.6879278677575558, + "grad_norm": 0.24954918084620384, + "learning_rate": 3.251573688618781e-07, + "loss": 0.0131, + "step": 8049 + }, + { + "epoch": 2.688261813324428, + "grad_norm": 0.2927417683379775, + "learning_rate": 3.2446836202332854e-07, + "loss": 0.0122, + "step": 8050 + }, + { + "epoch": 2.6885957588913008, + "grad_norm": 0.2691385750291096, + "learning_rate": 3.237800614788844e-07, + "loss": 0.0134, + "step": 8051 + }, + { + "epoch": 2.6889297044581735, + "grad_norm": 0.2973712814192687, + "learning_rate": 3.230924673325231e-07, + "loss": 0.0133, + "step": 8052 + }, + { + "epoch": 2.6892636500250457, + "grad_norm": 0.2490960157513933, + "learning_rate": 3.2240557968811315e-07, + "loss": 0.0099, + "step": 8053 + }, + { + "epoch": 2.6895975955919185, + "grad_norm": 0.3191588454744281, + "learning_rate": 3.217193986494177e-07, + "loss": 0.0234, + "step": 8054 + }, + { + "epoch": 2.689931541158791, + "grad_norm": 0.25023599468160346, + "learning_rate": 3.2103392432009105e-07, + "loss": 0.0106, + "step": 8055 + }, + { + "epoch": 2.6902654867256635, + "grad_norm": 0.28204142857925146, + "learning_rate": 3.203491568036843e-07, + "loss": 0.0116, + "step": 8056 + }, + { + "epoch": 2.690599432292536, + "grad_norm": 0.2734569800461845, + "learning_rate": 3.196650962036374e-07, + "loss": 0.014, + "step": 8057 + }, + { + "epoch": 2.690933377859409, + "grad_norm": 0.39120310053416835, + "learning_rate": 3.189817426232883e-07, + "loss": 0.0169, + "step": 8058 + }, + { + "epoch": 2.6912673234262816, + "grad_norm": 0.3281417840344649, + "learning_rate": 3.182990961658633e-07, + "loss": 0.017, + "step": 8059 + }, + { + "epoch": 2.6916012689931543, + "grad_norm": 0.3200298450914653, + "learning_rate": 3.1761715693448546e-07, + "loss": 0.0132, + "step": 8060 + }, + { + "epoch": 2.6919352145600266, + "grad_norm": 0.27028344663247766, + "learning_rate": 3.1693592503216795e-07, + "loss": 0.0104, + "step": 8061 + }, + { + "epoch": 2.6922691601268993, + "grad_norm": 0.2550863765463541, + "learning_rate": 3.162554005618218e-07, + "loss": 0.0125, + "step": 8062 + }, + { + "epoch": 2.692603105693772, + "grad_norm": 0.24732052208510927, + "learning_rate": 3.155755836262464e-07, + "loss": 0.0132, + "step": 8063 + }, + { + "epoch": 2.6929370512606443, + "grad_norm": 0.33277953986162934, + "learning_rate": 3.148964743281363e-07, + "loss": 0.0208, + "step": 8064 + }, + { + "epoch": 2.693270996827517, + "grad_norm": 0.2689153505510985, + "learning_rate": 3.1421807277007885e-07, + "loss": 0.018, + "step": 8065 + }, + { + "epoch": 2.6936049423943897, + "grad_norm": 0.24414343837281255, + "learning_rate": 3.1354037905455547e-07, + "loss": 0.0131, + "step": 8066 + }, + { + "epoch": 2.6939388879612625, + "grad_norm": 0.28762204133327557, + "learning_rate": 3.1286339328393755e-07, + "loss": 0.0239, + "step": 8067 + }, + { + "epoch": 2.694272833528135, + "grad_norm": 0.24201984860907172, + "learning_rate": 3.1218711556049494e-07, + "loss": 0.0082, + "step": 8068 + }, + { + "epoch": 2.6946067790950075, + "grad_norm": 0.27568309089650767, + "learning_rate": 3.115115459863849e-07, + "loss": 0.0095, + "step": 8069 + }, + { + "epoch": 2.69494072466188, + "grad_norm": 0.3442774595777744, + "learning_rate": 3.108366846636618e-07, + "loss": 0.0213, + "step": 8070 + }, + { + "epoch": 2.695274670228753, + "grad_norm": 0.3006104524297572, + "learning_rate": 3.101625316942697e-07, + "loss": 0.0156, + "step": 8071 + }, + { + "epoch": 2.695608615795625, + "grad_norm": 0.33543481218275956, + "learning_rate": 3.094890871800488e-07, + "loss": 0.0229, + "step": 8072 + }, + { + "epoch": 2.695942561362498, + "grad_norm": 0.2712218291717025, + "learning_rate": 3.0881635122273047e-07, + "loss": 0.0164, + "step": 8073 + }, + { + "epoch": 2.6962765069293706, + "grad_norm": 0.25249449195822743, + "learning_rate": 3.0814432392393847e-07, + "loss": 0.0116, + "step": 8074 + }, + { + "epoch": 2.696610452496243, + "grad_norm": 0.2302066994843045, + "learning_rate": 3.074730053851921e-07, + "loss": 0.008, + "step": 8075 + }, + { + "epoch": 2.6969443980631156, + "grad_norm": 0.23432412131902264, + "learning_rate": 3.068023957078997e-07, + "loss": 0.0106, + "step": 8076 + }, + { + "epoch": 2.6972783436299883, + "grad_norm": 0.24369353617110326, + "learning_rate": 3.061324949933675e-07, + "loss": 0.0099, + "step": 8077 + }, + { + "epoch": 2.697612289196861, + "grad_norm": 0.3668128503923368, + "learning_rate": 3.054633033427884e-07, + "loss": 0.0151, + "step": 8078 + }, + { + "epoch": 2.6979462347637337, + "grad_norm": 0.3131536970729404, + "learning_rate": 3.0479482085725545e-07, + "loss": 0.0178, + "step": 8079 + }, + { + "epoch": 2.698280180330606, + "grad_norm": 0.37327430626818686, + "learning_rate": 3.0412704763774836e-07, + "loss": 0.0212, + "step": 8080 + }, + { + "epoch": 2.6986141258974787, + "grad_norm": 0.27930249318555983, + "learning_rate": 3.034599837851432e-07, + "loss": 0.0141, + "step": 8081 + }, + { + "epoch": 2.6989480714643515, + "grad_norm": 0.2976113162848265, + "learning_rate": 3.027936294002071e-07, + "loss": 0.0136, + "step": 8082 + }, + { + "epoch": 2.6992820170312237, + "grad_norm": 0.35091913577245337, + "learning_rate": 3.021279845836017e-07, + "loss": 0.0155, + "step": 8083 + }, + { + "epoch": 2.6996159625980964, + "grad_norm": 0.2765008982831201, + "learning_rate": 3.0146304943587833e-07, + "loss": 0.0167, + "step": 8084 + }, + { + "epoch": 2.699949908164969, + "grad_norm": 0.26195630891909977, + "learning_rate": 3.007988240574866e-07, + "loss": 0.014, + "step": 8085 + }, + { + "epoch": 2.7002838537318414, + "grad_norm": 0.29485613821089995, + "learning_rate": 3.0013530854876296e-07, + "loss": 0.0167, + "step": 8086 + }, + { + "epoch": 2.700617799298714, + "grad_norm": 0.3045324367854926, + "learning_rate": 2.9947250300994046e-07, + "loss": 0.0181, + "step": 8087 + }, + { + "epoch": 2.700951744865587, + "grad_norm": 0.20798518186480916, + "learning_rate": 2.98810407541143e-07, + "loss": 0.0092, + "step": 8088 + }, + { + "epoch": 2.7012856904324596, + "grad_norm": 0.2512070057497568, + "learning_rate": 2.9814902224238886e-07, + "loss": 0.0084, + "step": 8089 + }, + { + "epoch": 2.7016196359993323, + "grad_norm": 0.30290084768448594, + "learning_rate": 2.974883472135859e-07, + "loss": 0.011, + "step": 8090 + }, + { + "epoch": 2.7019535815662046, + "grad_norm": 0.2897283208639351, + "learning_rate": 2.968283825545398e-07, + "loss": 0.0141, + "step": 8091 + }, + { + "epoch": 2.7022875271330773, + "grad_norm": 0.26225589520719345, + "learning_rate": 2.961691283649437e-07, + "loss": 0.011, + "step": 8092 + }, + { + "epoch": 2.70262147269995, + "grad_norm": 0.29711547066990696, + "learning_rate": 2.955105847443873e-07, + "loss": 0.0136, + "step": 8093 + }, + { + "epoch": 2.7029554182668223, + "grad_norm": 0.2745414877389005, + "learning_rate": 2.9485275179235e-07, + "loss": 0.0118, + "step": 8094 + }, + { + "epoch": 2.703289363833695, + "grad_norm": 0.2873513473148231, + "learning_rate": 2.9419562960820656e-07, + "loss": 0.0217, + "step": 8095 + }, + { + "epoch": 2.7036233094005677, + "grad_norm": 0.2553205630656262, + "learning_rate": 2.9353921829122167e-07, + "loss": 0.0154, + "step": 8096 + }, + { + "epoch": 2.7039572549674404, + "grad_norm": 0.2285208586329596, + "learning_rate": 2.928835179405548e-07, + "loss": 0.008, + "step": 8097 + }, + { + "epoch": 2.704291200534313, + "grad_norm": 0.25691285277924664, + "learning_rate": 2.922285286552579e-07, + "loss": 0.0105, + "step": 8098 + }, + { + "epoch": 2.7046251461011854, + "grad_norm": 0.24433197907471793, + "learning_rate": 2.915742505342728e-07, + "loss": 0.0139, + "step": 8099 + }, + { + "epoch": 2.704959091668058, + "grad_norm": 0.34876670402277193, + "learning_rate": 2.9092068367643776e-07, + "loss": 0.0189, + "step": 8100 + }, + { + "epoch": 2.705293037234931, + "grad_norm": 0.31597055136604885, + "learning_rate": 2.902678281804805e-07, + "loss": 0.0087, + "step": 8101 + }, + { + "epoch": 2.705626982801803, + "grad_norm": 0.20953360657229994, + "learning_rate": 2.896156841450232e-07, + "loss": 0.0093, + "step": 8102 + }, + { + "epoch": 2.705960928368676, + "grad_norm": 0.2502107260103645, + "learning_rate": 2.8896425166857976e-07, + "loss": 0.0101, + "step": 8103 + }, + { + "epoch": 2.7062948739355486, + "grad_norm": 0.3900922235048903, + "learning_rate": 2.8831353084955717e-07, + "loss": 0.0143, + "step": 8104 + }, + { + "epoch": 2.706628819502421, + "grad_norm": 0.29621728324110896, + "learning_rate": 2.8766352178625387e-07, + "loss": 0.015, + "step": 8105 + }, + { + "epoch": 2.7069627650692936, + "grad_norm": 0.2883050242164719, + "learning_rate": 2.87014224576862e-07, + "loss": 0.0169, + "step": 8106 + }, + { + "epoch": 2.7072967106361663, + "grad_norm": 0.32317815155902635, + "learning_rate": 2.863656393194636e-07, + "loss": 0.0105, + "step": 8107 + }, + { + "epoch": 2.707630656203039, + "grad_norm": 0.22040945097768017, + "learning_rate": 2.8571776611203804e-07, + "loss": 0.0096, + "step": 8108 + }, + { + "epoch": 2.7079646017699117, + "grad_norm": 0.48663802471115114, + "learning_rate": 2.850706050524521e-07, + "loss": 0.0146, + "step": 8109 + }, + { + "epoch": 2.708298547336784, + "grad_norm": 0.2628339874374742, + "learning_rate": 2.844241562384686e-07, + "loss": 0.0132, + "step": 8110 + }, + { + "epoch": 2.7086324929036567, + "grad_norm": 0.2806529826266646, + "learning_rate": 2.8377841976773955e-07, + "loss": 0.0128, + "step": 8111 + }, + { + "epoch": 2.7089664384705294, + "grad_norm": 0.2933929171767404, + "learning_rate": 2.83133395737813e-07, + "loss": 0.0123, + "step": 8112 + }, + { + "epoch": 2.7093003840374017, + "grad_norm": 0.4455140012227884, + "learning_rate": 2.824890842461242e-07, + "loss": 0.0301, + "step": 8113 + }, + { + "epoch": 2.7096343296042744, + "grad_norm": 0.2858646834365565, + "learning_rate": 2.818454853900082e-07, + "loss": 0.0112, + "step": 8114 + }, + { + "epoch": 2.709968275171147, + "grad_norm": 0.2761482203608089, + "learning_rate": 2.8120259926668505e-07, + "loss": 0.0163, + "step": 8115 + }, + { + "epoch": 2.71030222073802, + "grad_norm": 0.35001589040383363, + "learning_rate": 2.8056042597327196e-07, + "loss": 0.0193, + "step": 8116 + }, + { + "epoch": 2.7106361663048926, + "grad_norm": 0.27105912949837924, + "learning_rate": 2.799189656067758e-07, + "loss": 0.0104, + "step": 8117 + }, + { + "epoch": 2.710970111871765, + "grad_norm": 0.24695272701899895, + "learning_rate": 2.792782182640974e-07, + "loss": 0.0138, + "step": 8118 + }, + { + "epoch": 2.7113040574386376, + "grad_norm": 0.2774423352284431, + "learning_rate": 2.7863818404202823e-07, + "loss": 0.0161, + "step": 8119 + }, + { + "epoch": 2.7116380030055103, + "grad_norm": 0.3021631042135275, + "learning_rate": 2.7799886303725376e-07, + "loss": 0.0134, + "step": 8120 + }, + { + "epoch": 2.7119719485723826, + "grad_norm": 0.3236457801146371, + "learning_rate": 2.7736025534635115e-07, + "loss": 0.0106, + "step": 8121 + }, + { + "epoch": 2.7123058941392553, + "grad_norm": 0.23251390289212942, + "learning_rate": 2.767223610657888e-07, + "loss": 0.0103, + "step": 8122 + }, + { + "epoch": 2.712639839706128, + "grad_norm": 0.32552163691099706, + "learning_rate": 2.7608518029192897e-07, + "loss": 0.0153, + "step": 8123 + }, + { + "epoch": 2.7129737852730003, + "grad_norm": 0.346564248573981, + "learning_rate": 2.7544871312102485e-07, + "loss": 0.0157, + "step": 8124 + }, + { + "epoch": 2.713307730839873, + "grad_norm": 0.2567624866081163, + "learning_rate": 2.7481295964922216e-07, + "loss": 0.0107, + "step": 8125 + }, + { + "epoch": 2.7136416764067457, + "grad_norm": 0.253078640282921, + "learning_rate": 2.7417791997255916e-07, + "loss": 0.0104, + "step": 8126 + }, + { + "epoch": 2.7139756219736184, + "grad_norm": 0.3255809496823744, + "learning_rate": 2.735435941869663e-07, + "loss": 0.0145, + "step": 8127 + }, + { + "epoch": 2.714309567540491, + "grad_norm": 0.24491376851842311, + "learning_rate": 2.7290998238826584e-07, + "loss": 0.0095, + "step": 8128 + }, + { + "epoch": 2.7146435131073634, + "grad_norm": 0.3531616503885645, + "learning_rate": 2.7227708467217227e-07, + "loss": 0.0183, + "step": 8129 + }, + { + "epoch": 2.714977458674236, + "grad_norm": 0.2397817905060915, + "learning_rate": 2.71644901134292e-07, + "loss": 0.0102, + "step": 8130 + }, + { + "epoch": 2.715311404241109, + "grad_norm": 0.4127895105540195, + "learning_rate": 2.7101343187012354e-07, + "loss": 0.0125, + "step": 8131 + }, + { + "epoch": 2.715645349807981, + "grad_norm": 0.28047852679318064, + "learning_rate": 2.7038267697505894e-07, + "loss": 0.0145, + "step": 8132 + }, + { + "epoch": 2.715979295374854, + "grad_norm": 0.24697138421435968, + "learning_rate": 2.697526365443803e-07, + "loss": 0.0128, + "step": 8133 + }, + { + "epoch": 2.7163132409417265, + "grad_norm": 0.31241607663246995, + "learning_rate": 2.691233106732627e-07, + "loss": 0.015, + "step": 8134 + }, + { + "epoch": 2.716647186508599, + "grad_norm": 0.2832482923302587, + "learning_rate": 2.684946994567733e-07, + "loss": 0.0136, + "step": 8135 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.27857070735171374, + "learning_rate": 2.678668029898712e-07, + "loss": 0.0113, + "step": 8136 + }, + { + "epoch": 2.7173150776423443, + "grad_norm": 0.2229044900955066, + "learning_rate": 2.672396213674072e-07, + "loss": 0.0119, + "step": 8137 + }, + { + "epoch": 2.717649023209217, + "grad_norm": 0.20152071027655438, + "learning_rate": 2.66613154684125e-07, + "loss": 0.0091, + "step": 8138 + }, + { + "epoch": 2.7179829687760897, + "grad_norm": 0.2938025169107522, + "learning_rate": 2.659874030346604e-07, + "loss": 0.017, + "step": 8139 + }, + { + "epoch": 2.718316914342962, + "grad_norm": 0.35397618109201595, + "learning_rate": 2.653623665135391e-07, + "loss": 0.0153, + "step": 8140 + }, + { + "epoch": 2.7186508599098347, + "grad_norm": 0.2778299889738381, + "learning_rate": 2.6473804521518097e-07, + "loss": 0.0122, + "step": 8141 + }, + { + "epoch": 2.7189848054767074, + "grad_norm": 0.21914334159016047, + "learning_rate": 2.641144392338968e-07, + "loss": 0.0078, + "step": 8142 + }, + { + "epoch": 2.7193187510435797, + "grad_norm": 0.25109535212329154, + "learning_rate": 2.6349154866389e-07, + "loss": 0.0114, + "step": 8143 + }, + { + "epoch": 2.7196526966104524, + "grad_norm": 0.3195333006679426, + "learning_rate": 2.6286937359925545e-07, + "loss": 0.011, + "step": 8144 + }, + { + "epoch": 2.719986642177325, + "grad_norm": 0.19721292850911923, + "learning_rate": 2.622479141339801e-07, + "loss": 0.0084, + "step": 8145 + }, + { + "epoch": 2.720320587744198, + "grad_norm": 0.2693325560794091, + "learning_rate": 2.6162717036194274e-07, + "loss": 0.012, + "step": 8146 + }, + { + "epoch": 2.7206545333110705, + "grad_norm": 0.2868955195589291, + "learning_rate": 2.610071423769128e-07, + "loss": 0.0119, + "step": 8147 + }, + { + "epoch": 2.720988478877943, + "grad_norm": 0.3962024079719642, + "learning_rate": 2.603878302725543e-07, + "loss": 0.0236, + "step": 8148 + }, + { + "epoch": 2.7213224244448155, + "grad_norm": 0.2405097653399732, + "learning_rate": 2.5976923414242126e-07, + "loss": 0.0105, + "step": 8149 + }, + { + "epoch": 2.7216563700116883, + "grad_norm": 0.2654105029663251, + "learning_rate": 2.5915135407996005e-07, + "loss": 0.0113, + "step": 8150 + }, + { + "epoch": 2.7219903155785605, + "grad_norm": 0.2233171349241094, + "learning_rate": 2.585341901785082e-07, + "loss": 0.0113, + "step": 8151 + }, + { + "epoch": 2.7223242611454332, + "grad_norm": 0.3385804106149172, + "learning_rate": 2.579177425312962e-07, + "loss": 0.0137, + "step": 8152 + }, + { + "epoch": 2.722658206712306, + "grad_norm": 0.25119578119040376, + "learning_rate": 2.5730201123144503e-07, + "loss": 0.0135, + "step": 8153 + }, + { + "epoch": 2.7229921522791782, + "grad_norm": 0.33930815236777434, + "learning_rate": 2.566869963719681e-07, + "loss": 0.0135, + "step": 8154 + }, + { + "epoch": 2.723326097846051, + "grad_norm": 0.3114077082281841, + "learning_rate": 2.5607269804577174e-07, + "loss": 0.0125, + "step": 8155 + }, + { + "epoch": 2.7236600434129237, + "grad_norm": 0.2592412741302571, + "learning_rate": 2.5545911634565266e-07, + "loss": 0.0088, + "step": 8156 + }, + { + "epoch": 2.7239939889797964, + "grad_norm": 0.282059793097031, + "learning_rate": 2.5484625136429854e-07, + "loss": 0.0118, + "step": 8157 + }, + { + "epoch": 2.724327934546669, + "grad_norm": 0.27502578295242364, + "learning_rate": 2.5423410319429075e-07, + "loss": 0.0113, + "step": 8158 + }, + { + "epoch": 2.7246618801135414, + "grad_norm": 0.28663454986505266, + "learning_rate": 2.5362267192810095e-07, + "loss": 0.013, + "step": 8159 + }, + { + "epoch": 2.724995825680414, + "grad_norm": 0.3061127537372279, + "learning_rate": 2.530119576580936e-07, + "loss": 0.0128, + "step": 8160 + }, + { + "epoch": 2.725329771247287, + "grad_norm": 0.2231746098020713, + "learning_rate": 2.5240196047652377e-07, + "loss": 0.0083, + "step": 8161 + }, + { + "epoch": 2.725663716814159, + "grad_norm": 0.2902111367982422, + "learning_rate": 2.5179268047553937e-07, + "loss": 0.0147, + "step": 8162 + }, + { + "epoch": 2.725997662381032, + "grad_norm": 0.29400139303921846, + "learning_rate": 2.5118411774717857e-07, + "loss": 0.0125, + "step": 8163 + }, + { + "epoch": 2.7263316079479045, + "grad_norm": 0.31005771623133305, + "learning_rate": 2.5057627238337324e-07, + "loss": 0.0144, + "step": 8164 + }, + { + "epoch": 2.7266655535147772, + "grad_norm": 0.2824070007399706, + "learning_rate": 2.4996914447594334e-07, + "loss": 0.0185, + "step": 8165 + }, + { + "epoch": 2.72699949908165, + "grad_norm": 0.3182981688916357, + "learning_rate": 2.493627341166044e-07, + "loss": 0.0144, + "step": 8166 + }, + { + "epoch": 2.7273334446485222, + "grad_norm": 0.2449660151149918, + "learning_rate": 2.48757041396962e-07, + "loss": 0.011, + "step": 8167 + }, + { + "epoch": 2.727667390215395, + "grad_norm": 0.26265010743791456, + "learning_rate": 2.481520664085113e-07, + "loss": 0.0103, + "step": 8168 + }, + { + "epoch": 2.7280013357822677, + "grad_norm": 0.29750084853772446, + "learning_rate": 2.4754780924264366e-07, + "loss": 0.015, + "step": 8169 + }, + { + "epoch": 2.72833528134914, + "grad_norm": 0.24564357049346475, + "learning_rate": 2.4694426999063657e-07, + "loss": 0.0088, + "step": 8170 + }, + { + "epoch": 2.7286692269160127, + "grad_norm": 0.28270023167926145, + "learning_rate": 2.463414487436633e-07, + "loss": 0.007, + "step": 8171 + }, + { + "epoch": 2.7290031724828854, + "grad_norm": 0.2535544268708272, + "learning_rate": 2.4573934559278646e-07, + "loss": 0.0101, + "step": 8172 + }, + { + "epoch": 2.7293371180497576, + "grad_norm": 0.20093188441982038, + "learning_rate": 2.4513796062896166e-07, + "loss": 0.0078, + "step": 8173 + }, + { + "epoch": 2.7296710636166304, + "grad_norm": 0.3142522270328027, + "learning_rate": 2.4453729394303404e-07, + "loss": 0.0104, + "step": 8174 + }, + { + "epoch": 2.730005009183503, + "grad_norm": 0.3166882283710865, + "learning_rate": 2.439373456257427e-07, + "loss": 0.0115, + "step": 8175 + }, + { + "epoch": 2.730338954750376, + "grad_norm": 0.29365553858337456, + "learning_rate": 2.433381157677156e-07, + "loss": 0.0132, + "step": 8176 + }, + { + "epoch": 2.7306729003172485, + "grad_norm": 0.24433303266143705, + "learning_rate": 2.427396044594743e-07, + "loss": 0.0104, + "step": 8177 + }, + { + "epoch": 2.731006845884121, + "grad_norm": 0.6027711421808059, + "learning_rate": 2.421418117914298e-07, + "loss": 0.0144, + "step": 8178 + }, + { + "epoch": 2.7313407914509935, + "grad_norm": 0.2897866679069052, + "learning_rate": 2.415447378538871e-07, + "loss": 0.0106, + "step": 8179 + }, + { + "epoch": 2.7316747370178662, + "grad_norm": 0.40784173706742166, + "learning_rate": 2.409483827370407e-07, + "loss": 0.023, + "step": 8180 + }, + { + "epoch": 2.7320086825847385, + "grad_norm": 0.3133242765929012, + "learning_rate": 2.4035274653097797e-07, + "loss": 0.0151, + "step": 8181 + }, + { + "epoch": 2.732342628151611, + "grad_norm": 0.379545256249535, + "learning_rate": 2.3975782932567473e-07, + "loss": 0.0286, + "step": 8182 + }, + { + "epoch": 2.732676573718484, + "grad_norm": 0.274653596333824, + "learning_rate": 2.391636312110024e-07, + "loss": 0.0098, + "step": 8183 + }, + { + "epoch": 2.733010519285356, + "grad_norm": 0.24565840561989097, + "learning_rate": 2.385701522767192e-07, + "loss": 0.0114, + "step": 8184 + }, + { + "epoch": 2.733344464852229, + "grad_norm": 0.32159704736477485, + "learning_rate": 2.3797739261247955e-07, + "loss": 0.0168, + "step": 8185 + }, + { + "epoch": 2.7336784104191016, + "grad_norm": 0.3453941063010027, + "learning_rate": 2.3738535230782568e-07, + "loss": 0.0112, + "step": 8186 + }, + { + "epoch": 2.7340123559859744, + "grad_norm": 0.25889253912849375, + "learning_rate": 2.3679403145219214e-07, + "loss": 0.0111, + "step": 8187 + }, + { + "epoch": 2.734346301552847, + "grad_norm": 0.3782274519653321, + "learning_rate": 2.362034301349053e-07, + "loss": 0.0103, + "step": 8188 + }, + { + "epoch": 2.7346802471197194, + "grad_norm": 0.328646201769715, + "learning_rate": 2.3561354844518157e-07, + "loss": 0.0194, + "step": 8189 + }, + { + "epoch": 2.735014192686592, + "grad_norm": 0.29036599921858414, + "learning_rate": 2.3502438647213132e-07, + "loss": 0.0143, + "step": 8190 + }, + { + "epoch": 2.735348138253465, + "grad_norm": 0.23070780893248077, + "learning_rate": 2.3443594430475224e-07, + "loss": 0.0079, + "step": 8191 + }, + { + "epoch": 2.735682083820337, + "grad_norm": 0.24094828888252745, + "learning_rate": 2.3384822203193714e-07, + "loss": 0.0099, + "step": 8192 + }, + { + "epoch": 2.73601602938721, + "grad_norm": 0.2792369213107012, + "learning_rate": 2.332612197424672e-07, + "loss": 0.0118, + "step": 8193 + }, + { + "epoch": 2.7363499749540825, + "grad_norm": 0.3373230905145829, + "learning_rate": 2.32674937525017e-07, + "loss": 0.0164, + "step": 8194 + }, + { + "epoch": 2.736683920520955, + "grad_norm": 0.28816021991983115, + "learning_rate": 2.3208937546815026e-07, + "loss": 0.0142, + "step": 8195 + }, + { + "epoch": 2.737017866087828, + "grad_norm": 0.3125144546949365, + "learning_rate": 2.3150453366032445e-07, + "loss": 0.0138, + "step": 8196 + }, + { + "epoch": 2.7373518116547, + "grad_norm": 0.3136773891008518, + "learning_rate": 2.309204121898856e-07, + "loss": 0.0127, + "step": 8197 + }, + { + "epoch": 2.737685757221573, + "grad_norm": 0.28118525928695043, + "learning_rate": 2.3033701114507313e-07, + "loss": 0.0113, + "step": 8198 + }, + { + "epoch": 2.7380197027884456, + "grad_norm": 0.36287240362798984, + "learning_rate": 2.2975433061401541e-07, + "loss": 0.0128, + "step": 8199 + }, + { + "epoch": 2.738353648355318, + "grad_norm": 0.3175172402768893, + "learning_rate": 2.2917237068473484e-07, + "loss": 0.017, + "step": 8200 + }, + { + "epoch": 2.7386875939221906, + "grad_norm": 0.3288060226239171, + "learning_rate": 2.2859113144514055e-07, + "loss": 0.012, + "step": 8201 + }, + { + "epoch": 2.7390215394890634, + "grad_norm": 0.2694604123570534, + "learning_rate": 2.2801061298303895e-07, + "loss": 0.0121, + "step": 8202 + }, + { + "epoch": 2.7393554850559356, + "grad_norm": 0.23336597870360848, + "learning_rate": 2.2743081538612154e-07, + "loss": 0.0108, + "step": 8203 + }, + { + "epoch": 2.7396894306228083, + "grad_norm": 0.22539188471782481, + "learning_rate": 2.268517387419761e-07, + "loss": 0.0092, + "step": 8204 + }, + { + "epoch": 2.740023376189681, + "grad_norm": 0.34108942387662355, + "learning_rate": 2.2627338313807645e-07, + "loss": 0.014, + "step": 8205 + }, + { + "epoch": 2.7403573217565538, + "grad_norm": 0.2880143722050247, + "learning_rate": 2.2569574866179166e-07, + "loss": 0.0146, + "step": 8206 + }, + { + "epoch": 2.7406912673234265, + "grad_norm": 0.2861275846293823, + "learning_rate": 2.2511883540037805e-07, + "loss": 0.0151, + "step": 8207 + }, + { + "epoch": 2.7410252128902988, + "grad_norm": 0.21978870075720008, + "learning_rate": 2.2454264344098865e-07, + "loss": 0.0081, + "step": 8208 + }, + { + "epoch": 2.7413591584571715, + "grad_norm": 0.28266386451179687, + "learning_rate": 2.2396717287066106e-07, + "loss": 0.0127, + "step": 8209 + }, + { + "epoch": 2.741693104024044, + "grad_norm": 0.2519643336463548, + "learning_rate": 2.233924237763291e-07, + "loss": 0.0101, + "step": 8210 + }, + { + "epoch": 2.7420270495909165, + "grad_norm": 0.27988364118866643, + "learning_rate": 2.2281839624481328e-07, + "loss": 0.0141, + "step": 8211 + }, + { + "epoch": 2.742360995157789, + "grad_norm": 0.20551095327571595, + "learning_rate": 2.222450903628287e-07, + "loss": 0.009, + "step": 8212 + }, + { + "epoch": 2.742694940724662, + "grad_norm": 0.2921509515606145, + "learning_rate": 2.2167250621697944e-07, + "loss": 0.0151, + "step": 8213 + }, + { + "epoch": 2.7430288862915346, + "grad_norm": 0.2811275891975004, + "learning_rate": 2.2110064389376017e-07, + "loss": 0.0128, + "step": 8214 + }, + { + "epoch": 2.7433628318584073, + "grad_norm": 0.3089409785322832, + "learning_rate": 2.205295034795596e-07, + "loss": 0.011, + "step": 8215 + }, + { + "epoch": 2.7436967774252796, + "grad_norm": 0.23001989701544498, + "learning_rate": 2.1995908506065366e-07, + "loss": 0.0069, + "step": 8216 + }, + { + "epoch": 2.7440307229921523, + "grad_norm": 0.273604365015591, + "learning_rate": 2.1938938872321014e-07, + "loss": 0.0105, + "step": 8217 + }, + { + "epoch": 2.744364668559025, + "grad_norm": 0.2765553120627356, + "learning_rate": 2.1882041455329073e-07, + "loss": 0.0154, + "step": 8218 + }, + { + "epoch": 2.7446986141258973, + "grad_norm": 0.35516951383829043, + "learning_rate": 2.1825216263684336e-07, + "loss": 0.0189, + "step": 8219 + }, + { + "epoch": 2.74503255969277, + "grad_norm": 0.32217286641570436, + "learning_rate": 2.176846330597099e-07, + "loss": 0.0158, + "step": 8220 + }, + { + "epoch": 2.7453665052596428, + "grad_norm": 0.24849044606753853, + "learning_rate": 2.1711782590762344e-07, + "loss": 0.0075, + "step": 8221 + }, + { + "epoch": 2.745700450826515, + "grad_norm": 0.20754128360930993, + "learning_rate": 2.165517412662055e-07, + "loss": 0.0085, + "step": 8222 + }, + { + "epoch": 2.7460343963933878, + "grad_norm": 0.3109369589059883, + "learning_rate": 2.1598637922097098e-07, + "loss": 0.0165, + "step": 8223 + }, + { + "epoch": 2.7463683419602605, + "grad_norm": 0.2596669166669222, + "learning_rate": 2.1542173985732274e-07, + "loss": 0.0135, + "step": 8224 + }, + { + "epoch": 2.746702287527133, + "grad_norm": 0.28923075375522145, + "learning_rate": 2.148578232605575e-07, + "loss": 0.0144, + "step": 8225 + }, + { + "epoch": 2.747036233094006, + "grad_norm": 0.2157993011104292, + "learning_rate": 2.14294629515861e-07, + "loss": 0.0084, + "step": 8226 + }, + { + "epoch": 2.747370178660878, + "grad_norm": 0.2542783390431188, + "learning_rate": 2.137321587083119e-07, + "loss": 0.0095, + "step": 8227 + }, + { + "epoch": 2.747704124227751, + "grad_norm": 0.2739989612947763, + "learning_rate": 2.1317041092287548e-07, + "loss": 0.01, + "step": 8228 + }, + { + "epoch": 2.7480380697946236, + "grad_norm": 0.33830015955136855, + "learning_rate": 2.126093862444123e-07, + "loss": 0.0214, + "step": 8229 + }, + { + "epoch": 2.748372015361496, + "grad_norm": 0.273366251968806, + "learning_rate": 2.1204908475767005e-07, + "loss": 0.0134, + "step": 8230 + }, + { + "epoch": 2.7487059609283686, + "grad_norm": 0.2641925398859872, + "learning_rate": 2.114895065472905e-07, + "loss": 0.0106, + "step": 8231 + }, + { + "epoch": 2.7490399064952413, + "grad_norm": 0.5610458409108607, + "learning_rate": 2.109306516978038e-07, + "loss": 0.0159, + "step": 8232 + }, + { + "epoch": 2.7493738520621136, + "grad_norm": 0.25527657542575755, + "learning_rate": 2.1037252029363242e-07, + "loss": 0.0114, + "step": 8233 + }, + { + "epoch": 2.7497077976289863, + "grad_norm": 0.23889412036671726, + "learning_rate": 2.098151124190867e-07, + "loss": 0.0103, + "step": 8234 + }, + { + "epoch": 2.750041743195859, + "grad_norm": 0.37063197825375, + "learning_rate": 2.092584281583715e-07, + "loss": 0.019, + "step": 8235 + }, + { + "epoch": 2.7503756887627318, + "grad_norm": 0.31912294717825307, + "learning_rate": 2.0870246759557956e-07, + "loss": 0.0132, + "step": 8236 + }, + { + "epoch": 2.7507096343296045, + "grad_norm": 0.2984038361409962, + "learning_rate": 2.0814723081469535e-07, + "loss": 0.0172, + "step": 8237 + }, + { + "epoch": 2.7510435798964767, + "grad_norm": 0.31061195911274414, + "learning_rate": 2.0759271789959513e-07, + "loss": 0.0136, + "step": 8238 + }, + { + "epoch": 2.7513775254633495, + "grad_norm": 0.310034734249356, + "learning_rate": 2.0703892893404299e-07, + "loss": 0.0175, + "step": 8239 + }, + { + "epoch": 2.751711471030222, + "grad_norm": 0.3638576227274786, + "learning_rate": 2.064858640016959e-07, + "loss": 0.0114, + "step": 8240 + }, + { + "epoch": 2.7520454165970945, + "grad_norm": 0.3131498484811339, + "learning_rate": 2.0593352318610093e-07, + "loss": 0.0115, + "step": 8241 + }, + { + "epoch": 2.752379362163967, + "grad_norm": 0.23521264084117519, + "learning_rate": 2.0538190657069523e-07, + "loss": 0.0098, + "step": 8242 + }, + { + "epoch": 2.75271330773084, + "grad_norm": 0.3490876568458798, + "learning_rate": 2.048310142388077e-07, + "loss": 0.0195, + "step": 8243 + }, + { + "epoch": 2.7530472532977126, + "grad_norm": 0.22969879460114054, + "learning_rate": 2.0428084627365729e-07, + "loss": 0.0099, + "step": 8244 + }, + { + "epoch": 2.7533811988645853, + "grad_norm": 0.25860538536446154, + "learning_rate": 2.0373140275835203e-07, + "loss": 0.0116, + "step": 8245 + }, + { + "epoch": 2.7537151444314576, + "grad_norm": 0.23864379334135657, + "learning_rate": 2.0318268377589323e-07, + "loss": 0.0088, + "step": 8246 + }, + { + "epoch": 2.7540490899983303, + "grad_norm": 0.31311993559159723, + "learning_rate": 2.026346894091702e-07, + "loss": 0.012, + "step": 8247 + }, + { + "epoch": 2.754383035565203, + "grad_norm": 0.32681308038560614, + "learning_rate": 2.0208741974096445e-07, + "loss": 0.0104, + "step": 8248 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 0.2744665529596475, + "learning_rate": 2.0154087485394713e-07, + "loss": 0.0112, + "step": 8249 + }, + { + "epoch": 2.755050926698948, + "grad_norm": 0.2119489886910582, + "learning_rate": 2.0099505483068216e-07, + "loss": 0.0101, + "step": 8250 + }, + { + "epoch": 2.7553848722658207, + "grad_norm": 0.29936244161533654, + "learning_rate": 2.0044995975361914e-07, + "loss": 0.0173, + "step": 8251 + }, + { + "epoch": 2.755718817832693, + "grad_norm": 0.3434641407558833, + "learning_rate": 1.9990558970510388e-07, + "loss": 0.018, + "step": 8252 + }, + { + "epoch": 2.7560527633995657, + "grad_norm": 0.2883160995611944, + "learning_rate": 1.9936194476736782e-07, + "loss": 0.0159, + "step": 8253 + }, + { + "epoch": 2.7563867089664384, + "grad_norm": 0.2768632663561537, + "learning_rate": 1.9881902502253525e-07, + "loss": 0.0094, + "step": 8254 + }, + { + "epoch": 2.756720654533311, + "grad_norm": 0.4145861037047996, + "learning_rate": 1.9827683055262114e-07, + "loss": 0.0217, + "step": 8255 + }, + { + "epoch": 2.757054600100184, + "grad_norm": 0.22037173957492298, + "learning_rate": 1.977353614395311e-07, + "loss": 0.0076, + "step": 8256 + }, + { + "epoch": 2.757388545667056, + "grad_norm": 0.23478814974502749, + "learning_rate": 1.971946177650591e-07, + "loss": 0.0079, + "step": 8257 + }, + { + "epoch": 2.757722491233929, + "grad_norm": 0.3040970763032444, + "learning_rate": 1.966545996108915e-07, + "loss": 0.0106, + "step": 8258 + }, + { + "epoch": 2.7580564368008016, + "grad_norm": 0.30878299409882476, + "learning_rate": 1.961153070586036e-07, + "loss": 0.0146, + "step": 8259 + }, + { + "epoch": 2.758390382367674, + "grad_norm": 0.4017240722856722, + "learning_rate": 1.9557674018966244e-07, + "loss": 0.0187, + "step": 8260 + }, + { + "epoch": 2.7587243279345466, + "grad_norm": 0.2859867777005715, + "learning_rate": 1.9503889908542572e-07, + "loss": 0.0124, + "step": 8261 + }, + { + "epoch": 2.7590582735014193, + "grad_norm": 0.3568003795269254, + "learning_rate": 1.9450178382713957e-07, + "loss": 0.0175, + "step": 8262 + }, + { + "epoch": 2.759392219068292, + "grad_norm": 0.28907602310094616, + "learning_rate": 1.9396539449594131e-07, + "loss": 0.0116, + "step": 8263 + }, + { + "epoch": 2.7597261646351647, + "grad_norm": 0.2261150234283839, + "learning_rate": 1.9342973117286056e-07, + "loss": 0.0115, + "step": 8264 + }, + { + "epoch": 2.760060110202037, + "grad_norm": 0.3483782701719299, + "learning_rate": 1.9289479393881317e-07, + "loss": 0.0137, + "step": 8265 + }, + { + "epoch": 2.7603940557689097, + "grad_norm": 0.2950855371660985, + "learning_rate": 1.9236058287460946e-07, + "loss": 0.0128, + "step": 8266 + }, + { + "epoch": 2.7607280013357824, + "grad_norm": 0.3326006143018145, + "learning_rate": 1.9182709806094823e-07, + "loss": 0.0205, + "step": 8267 + }, + { + "epoch": 2.7610619469026547, + "grad_norm": 0.24293641324088883, + "learning_rate": 1.9129433957841781e-07, + "loss": 0.0128, + "step": 8268 + }, + { + "epoch": 2.7613958924695274, + "grad_norm": 0.26490891658974086, + "learning_rate": 1.907623075074988e-07, + "loss": 0.0104, + "step": 8269 + }, + { + "epoch": 2.7617298380364, + "grad_norm": 0.3232274630483029, + "learning_rate": 1.9023100192855914e-07, + "loss": 0.015, + "step": 8270 + }, + { + "epoch": 2.7620637836032724, + "grad_norm": 0.28624041980764076, + "learning_rate": 1.897004229218602e-07, + "loss": 0.0114, + "step": 8271 + }, + { + "epoch": 2.762397729170145, + "grad_norm": 0.3475303207976439, + "learning_rate": 1.8917057056755172e-07, + "loss": 0.0129, + "step": 8272 + }, + { + "epoch": 2.762731674737018, + "grad_norm": 0.21519354891121584, + "learning_rate": 1.8864144494567528e-07, + "loss": 0.0087, + "step": 8273 + }, + { + "epoch": 2.7630656203038906, + "grad_norm": 0.3212635662342531, + "learning_rate": 1.881130461361591e-07, + "loss": 0.0132, + "step": 8274 + }, + { + "epoch": 2.7633995658707633, + "grad_norm": 0.20233210592967804, + "learning_rate": 1.8758537421882662e-07, + "loss": 0.0088, + "step": 8275 + }, + { + "epoch": 2.7637335114376356, + "grad_norm": 0.29612235037874146, + "learning_rate": 1.870584292733868e-07, + "loss": 0.017, + "step": 8276 + }, + { + "epoch": 2.7640674570045083, + "grad_norm": 0.36608550574456183, + "learning_rate": 1.8653221137944155e-07, + "loss": 0.0174, + "step": 8277 + }, + { + "epoch": 2.764401402571381, + "grad_norm": 0.24274056332917915, + "learning_rate": 1.8600672061648283e-07, + "loss": 0.0091, + "step": 8278 + }, + { + "epoch": 2.7647353481382533, + "grad_norm": 0.37905160620084416, + "learning_rate": 1.8548195706389272e-07, + "loss": 0.0274, + "step": 8279 + }, + { + "epoch": 2.765069293705126, + "grad_norm": 0.2027616527742171, + "learning_rate": 1.849579208009411e-07, + "loss": 0.0079, + "step": 8280 + }, + { + "epoch": 2.7654032392719987, + "grad_norm": 0.29833224591697277, + "learning_rate": 1.844346119067919e-07, + "loss": 0.0125, + "step": 8281 + }, + { + "epoch": 2.765737184838871, + "grad_norm": 0.2802195270286133, + "learning_rate": 1.8391203046049522e-07, + "loss": 0.0135, + "step": 8282 + }, + { + "epoch": 2.7660711304057437, + "grad_norm": 0.24459429299538646, + "learning_rate": 1.8339017654099344e-07, + "loss": 0.0099, + "step": 8283 + }, + { + "epoch": 2.7664050759726164, + "grad_norm": 0.3075248922555339, + "learning_rate": 1.828690502271202e-07, + "loss": 0.0111, + "step": 8284 + }, + { + "epoch": 2.766739021539489, + "grad_norm": 0.284368159402573, + "learning_rate": 1.823486515975964e-07, + "loss": 0.0195, + "step": 8285 + }, + { + "epoch": 2.767072967106362, + "grad_norm": 0.2786709877146374, + "learning_rate": 1.818289807310347e-07, + "loss": 0.0125, + "step": 8286 + }, + { + "epoch": 2.767406912673234, + "grad_norm": 0.33666991223234466, + "learning_rate": 1.813100377059379e-07, + "loss": 0.0138, + "step": 8287 + }, + { + "epoch": 2.767740858240107, + "grad_norm": 0.2952515007051211, + "learning_rate": 1.8079182260069773e-07, + "loss": 0.0115, + "step": 8288 + }, + { + "epoch": 2.7680748038069796, + "grad_norm": 0.19962157741548053, + "learning_rate": 1.8027433549359764e-07, + "loss": 0.0065, + "step": 8289 + }, + { + "epoch": 2.768408749373852, + "grad_norm": 0.3311269764807552, + "learning_rate": 1.7975757646280955e-07, + "loss": 0.0137, + "step": 8290 + }, + { + "epoch": 2.7687426949407246, + "grad_norm": 0.26512061831780587, + "learning_rate": 1.792415455863955e-07, + "loss": 0.014, + "step": 8291 + }, + { + "epoch": 2.7690766405075973, + "grad_norm": 0.316513736875226, + "learning_rate": 1.7872624294230924e-07, + "loss": 0.0155, + "step": 8292 + }, + { + "epoch": 2.76941058607447, + "grad_norm": 0.3049738888738994, + "learning_rate": 1.7821166860839179e-07, + "loss": 0.0166, + "step": 8293 + }, + { + "epoch": 2.7697445316413427, + "grad_norm": 0.34716067903135284, + "learning_rate": 1.7769782266237767e-07, + "loss": 0.0089, + "step": 8294 + }, + { + "epoch": 2.770078477208215, + "grad_norm": 0.2752261525444329, + "learning_rate": 1.7718470518188645e-07, + "loss": 0.0127, + "step": 8295 + }, + { + "epoch": 2.7704124227750877, + "grad_norm": 0.2474051824336063, + "learning_rate": 1.7667231624443393e-07, + "loss": 0.0103, + "step": 8296 + }, + { + "epoch": 2.7707463683419604, + "grad_norm": 0.2671973771982285, + "learning_rate": 1.7616065592742038e-07, + "loss": 0.0135, + "step": 8297 + }, + { + "epoch": 2.7710803139088327, + "grad_norm": 0.2664023577613685, + "learning_rate": 1.7564972430813899e-07, + "loss": 0.0128, + "step": 8298 + }, + { + "epoch": 2.7714142594757054, + "grad_norm": 0.2864257505428321, + "learning_rate": 1.751395214637708e-07, + "loss": 0.0169, + "step": 8299 + }, + { + "epoch": 2.771748205042578, + "grad_norm": 0.23575076178032306, + "learning_rate": 1.7463004747138967e-07, + "loss": 0.0102, + "step": 8300 + }, + { + "epoch": 2.7720821506094504, + "grad_norm": 0.2798314314257989, + "learning_rate": 1.7412130240795578e-07, + "loss": 0.0122, + "step": 8301 + }, + { + "epoch": 2.772416096176323, + "grad_norm": 0.2858984309856811, + "learning_rate": 1.736132863503226e-07, + "loss": 0.0116, + "step": 8302 + }, + { + "epoch": 2.772750041743196, + "grad_norm": 0.28838342882239176, + "learning_rate": 1.7310599937523153e-07, + "loss": 0.0119, + "step": 8303 + }, + { + "epoch": 2.7730839873100686, + "grad_norm": 0.31156847304098106, + "learning_rate": 1.7259944155931407e-07, + "loss": 0.0133, + "step": 8304 + }, + { + "epoch": 2.7734179328769413, + "grad_norm": 0.27535895094586527, + "learning_rate": 1.720936129790912e-07, + "loss": 0.0144, + "step": 8305 + }, + { + "epoch": 2.7737518784438135, + "grad_norm": 0.25413450315704256, + "learning_rate": 1.7158851371097518e-07, + "loss": 0.0088, + "step": 8306 + }, + { + "epoch": 2.7740858240106863, + "grad_norm": 0.2909353055050728, + "learning_rate": 1.7108414383126658e-07, + "loss": 0.0151, + "step": 8307 + }, + { + "epoch": 2.774419769577559, + "grad_norm": 0.25360047465185825, + "learning_rate": 1.7058050341615783e-07, + "loss": 0.0088, + "step": 8308 + }, + { + "epoch": 2.7747537151444313, + "grad_norm": 0.3036242447424197, + "learning_rate": 1.7007759254172752e-07, + "loss": 0.0177, + "step": 8309 + }, + { + "epoch": 2.775087660711304, + "grad_norm": 0.26463018184094694, + "learning_rate": 1.6957541128394817e-07, + "loss": 0.0095, + "step": 8310 + }, + { + "epoch": 2.7754216062781767, + "grad_norm": 0.3597683368456448, + "learning_rate": 1.6907395971867858e-07, + "loss": 0.0164, + "step": 8311 + }, + { + "epoch": 2.7757555518450494, + "grad_norm": 0.28661055365270777, + "learning_rate": 1.685732379216698e-07, + "loss": 0.0145, + "step": 8312 + }, + { + "epoch": 2.776089497411922, + "grad_norm": 0.24566557974280834, + "learning_rate": 1.680732459685619e-07, + "loss": 0.0092, + "step": 8313 + }, + { + "epoch": 2.7764234429787944, + "grad_norm": 0.3819991270581732, + "learning_rate": 1.6757398393488443e-07, + "loss": 0.0295, + "step": 8314 + }, + { + "epoch": 2.776757388545667, + "grad_norm": 0.25257389005476655, + "learning_rate": 1.6707545189605657e-07, + "loss": 0.0113, + "step": 8315 + }, + { + "epoch": 2.77709133411254, + "grad_norm": 0.2692351510514966, + "learning_rate": 1.6657764992738746e-07, + "loss": 0.0116, + "step": 8316 + }, + { + "epoch": 2.777425279679412, + "grad_norm": 0.2621873406998039, + "learning_rate": 1.6608057810407586e-07, + "loss": 0.0129, + "step": 8317 + }, + { + "epoch": 2.777759225246285, + "grad_norm": 0.2199847937029809, + "learning_rate": 1.6558423650121003e-07, + "loss": 0.0069, + "step": 8318 + }, + { + "epoch": 2.7780931708131575, + "grad_norm": 0.32149850230755, + "learning_rate": 1.6508862519376945e-07, + "loss": 0.0155, + "step": 8319 + }, + { + "epoch": 2.77842711638003, + "grad_norm": 0.25761239616973197, + "learning_rate": 1.6459374425662088e-07, + "loss": 0.0105, + "step": 8320 + }, + { + "epoch": 2.7787610619469025, + "grad_norm": 0.23574463132175358, + "learning_rate": 1.6409959376452289e-07, + "loss": 0.0095, + "step": 8321 + }, + { + "epoch": 2.7790950075137753, + "grad_norm": 0.24001499632074175, + "learning_rate": 1.6360617379212185e-07, + "loss": 0.0111, + "step": 8322 + }, + { + "epoch": 2.779428953080648, + "grad_norm": 0.3145800959517236, + "learning_rate": 1.6311348441395535e-07, + "loss": 0.0163, + "step": 8323 + }, + { + "epoch": 2.7797628986475207, + "grad_norm": 0.2788190083968446, + "learning_rate": 1.6262152570444777e-07, + "loss": 0.0146, + "step": 8324 + }, + { + "epoch": 2.780096844214393, + "grad_norm": 0.2697233591058573, + "learning_rate": 1.6213029773791912e-07, + "loss": 0.014, + "step": 8325 + }, + { + "epoch": 2.7804307897812657, + "grad_norm": 0.35953557372609307, + "learning_rate": 1.6163980058857164e-07, + "loss": 0.0128, + "step": 8326 + }, + { + "epoch": 2.7807647353481384, + "grad_norm": 0.4353401983976432, + "learning_rate": 1.6115003433050336e-07, + "loss": 0.0189, + "step": 8327 + }, + { + "epoch": 2.7810986809150107, + "grad_norm": 0.23561925217782875, + "learning_rate": 1.6066099903769726e-07, + "loss": 0.0112, + "step": 8328 + }, + { + "epoch": 2.7814326264818834, + "grad_norm": 0.20941101105095522, + "learning_rate": 1.6017269478402875e-07, + "loss": 0.0098, + "step": 8329 + }, + { + "epoch": 2.781766572048756, + "grad_norm": 0.32137760512333763, + "learning_rate": 1.59685121643261e-07, + "loss": 0.0177, + "step": 8330 + }, + { + "epoch": 2.7821005176156284, + "grad_norm": 0.263350231782085, + "learning_rate": 1.5919827968904955e-07, + "loss": 0.0082, + "step": 8331 + }, + { + "epoch": 2.782434463182501, + "grad_norm": 0.27889321678011675, + "learning_rate": 1.5871216899493612e-07, + "loss": 0.0136, + "step": 8332 + }, + { + "epoch": 2.782768408749374, + "grad_norm": 0.26586255032676426, + "learning_rate": 1.5822678963435479e-07, + "loss": 0.0201, + "step": 8333 + }, + { + "epoch": 2.7831023543162465, + "grad_norm": 0.30107242446010524, + "learning_rate": 1.5774214168062575e-07, + "loss": 0.0167, + "step": 8334 + }, + { + "epoch": 2.7834362998831192, + "grad_norm": 0.299668277910782, + "learning_rate": 1.5725822520696267e-07, + "loss": 0.0173, + "step": 8335 + }, + { + "epoch": 2.7837702454499915, + "grad_norm": 0.3040714664780066, + "learning_rate": 1.567750402864654e-07, + "loss": 0.0106, + "step": 8336 + }, + { + "epoch": 2.7841041910168642, + "grad_norm": 0.31414191753188925, + "learning_rate": 1.5629258699212613e-07, + "loss": 0.0155, + "step": 8337 + }, + { + "epoch": 2.784438136583737, + "grad_norm": 0.2652252331317683, + "learning_rate": 1.5581086539682433e-07, + "loss": 0.0094, + "step": 8338 + }, + { + "epoch": 2.7847720821506092, + "grad_norm": 0.2914617850460002, + "learning_rate": 1.5532987557332902e-07, + "loss": 0.0135, + "step": 8339 + }, + { + "epoch": 2.785106027717482, + "grad_norm": 0.2576522483730419, + "learning_rate": 1.5484961759430095e-07, + "loss": 0.011, + "step": 8340 + }, + { + "epoch": 2.7854399732843547, + "grad_norm": 0.35448494208890885, + "learning_rate": 1.5437009153228766e-07, + "loss": 0.0115, + "step": 8341 + }, + { + "epoch": 2.7857739188512274, + "grad_norm": 0.24914656785217557, + "learning_rate": 1.538912974597273e-07, + "loss": 0.0101, + "step": 8342 + }, + { + "epoch": 2.7861078644181, + "grad_norm": 0.2527460071849557, + "learning_rate": 1.5341323544894758e-07, + "loss": 0.0092, + "step": 8343 + }, + { + "epoch": 2.7864418099849724, + "grad_norm": 0.28201016644945903, + "learning_rate": 1.5293590557216577e-07, + "loss": 0.0156, + "step": 8344 + }, + { + "epoch": 2.786775755551845, + "grad_norm": 0.28782282799149767, + "learning_rate": 1.5245930790148743e-07, + "loss": 0.0106, + "step": 8345 + }, + { + "epoch": 2.787109701118718, + "grad_norm": 0.23980872769715128, + "learning_rate": 1.5198344250890894e-07, + "loss": 0.0095, + "step": 8346 + }, + { + "epoch": 2.78744364668559, + "grad_norm": 0.24121466281769527, + "learning_rate": 1.515083094663139e-07, + "loss": 0.01, + "step": 8347 + }, + { + "epoch": 2.787777592252463, + "grad_norm": 0.2464676825272029, + "learning_rate": 1.5103390884547931e-07, + "loss": 0.0118, + "step": 8348 + }, + { + "epoch": 2.7881115378193355, + "grad_norm": 0.344368993388975, + "learning_rate": 1.5056024071806674e-07, + "loss": 0.0178, + "step": 8349 + }, + { + "epoch": 2.788445483386208, + "grad_norm": 0.42110579895533357, + "learning_rate": 1.5008730515563064e-07, + "loss": 0.0392, + "step": 8350 + }, + { + "epoch": 2.7887794289530805, + "grad_norm": 0.25930592309704714, + "learning_rate": 1.4961510222961216e-07, + "loss": 0.0134, + "step": 8351 + }, + { + "epoch": 2.7891133745199532, + "grad_norm": 0.20907074816868176, + "learning_rate": 1.4914363201134486e-07, + "loss": 0.0075, + "step": 8352 + }, + { + "epoch": 2.789447320086826, + "grad_norm": 0.3429588995847818, + "learning_rate": 1.4867289457204726e-07, + "loss": 0.0255, + "step": 8353 + }, + { + "epoch": 2.7897812656536987, + "grad_norm": 0.28556945714494547, + "learning_rate": 1.4820288998283304e-07, + "loss": 0.0138, + "step": 8354 + }, + { + "epoch": 2.790115211220571, + "grad_norm": 0.22833053998156072, + "learning_rate": 1.477336183146999e-07, + "loss": 0.0077, + "step": 8355 + }, + { + "epoch": 2.7904491567874437, + "grad_norm": 0.3207524916382697, + "learning_rate": 1.4726507963853776e-07, + "loss": 0.0093, + "step": 8356 + }, + { + "epoch": 2.7907831023543164, + "grad_norm": 0.31784077700257857, + "learning_rate": 1.4679727402512334e-07, + "loss": 0.0168, + "step": 8357 + }, + { + "epoch": 2.7911170479211886, + "grad_norm": 0.22468100786292214, + "learning_rate": 1.4633020154512677e-07, + "loss": 0.0097, + "step": 8358 + }, + { + "epoch": 2.7914509934880614, + "grad_norm": 0.3174227513116866, + "learning_rate": 1.458638622691022e-07, + "loss": 0.016, + "step": 8359 + }, + { + "epoch": 2.791784939054934, + "grad_norm": 0.2639845338570142, + "learning_rate": 1.4539825626749715e-07, + "loss": 0.0147, + "step": 8360 + }, + { + "epoch": 2.792118884621807, + "grad_norm": 0.2627020187115576, + "learning_rate": 1.4493338361064646e-07, + "loss": 0.014, + "step": 8361 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 0.2535252270232833, + "learning_rate": 1.4446924436877507e-07, + "loss": 0.0133, + "step": 8362 + }, + { + "epoch": 2.792786775755552, + "grad_norm": 0.33633229865249903, + "learning_rate": 1.4400583861199636e-07, + "loss": 0.0091, + "step": 8363 + }, + { + "epoch": 2.7931207213224245, + "grad_norm": 0.24451771841418674, + "learning_rate": 1.4354316641031263e-07, + "loss": 0.0132, + "step": 8364 + }, + { + "epoch": 2.7934546668892972, + "grad_norm": 0.31350688222524326, + "learning_rate": 1.4308122783361688e-07, + "loss": 0.0128, + "step": 8365 + }, + { + "epoch": 2.7937886124561695, + "grad_norm": 0.2668994633524597, + "learning_rate": 1.4262002295168997e-07, + "loss": 0.0121, + "step": 8366 + }, + { + "epoch": 2.794122558023042, + "grad_norm": 0.2473686699944472, + "learning_rate": 1.4215955183420282e-07, + "loss": 0.012, + "step": 8367 + }, + { + "epoch": 2.794456503589915, + "grad_norm": 0.23643030610916663, + "learning_rate": 1.4169981455071368e-07, + "loss": 0.0185, + "step": 8368 + }, + { + "epoch": 2.794790449156787, + "grad_norm": 0.25203111907376435, + "learning_rate": 1.4124081117067313e-07, + "loss": 0.0101, + "step": 8369 + }, + { + "epoch": 2.79512439472366, + "grad_norm": 0.3042123226518819, + "learning_rate": 1.4078254176341788e-07, + "loss": 0.0165, + "step": 8370 + }, + { + "epoch": 2.7954583402905326, + "grad_norm": 0.2805553275281972, + "learning_rate": 1.4032500639817426e-07, + "loss": 0.0131, + "step": 8371 + }, + { + "epoch": 2.7957922858574054, + "grad_norm": 0.3299491931551103, + "learning_rate": 1.3986820514405973e-07, + "loss": 0.0143, + "step": 8372 + }, + { + "epoch": 2.796126231424278, + "grad_norm": 0.3957722296616314, + "learning_rate": 1.394121380700797e-07, + "loss": 0.0187, + "step": 8373 + }, + { + "epoch": 2.7964601769911503, + "grad_norm": 0.3194524644252628, + "learning_rate": 1.3895680524512734e-07, + "loss": 0.0158, + "step": 8374 + }, + { + "epoch": 2.796794122558023, + "grad_norm": 0.2865162913913697, + "learning_rate": 1.3850220673798655e-07, + "loss": 0.019, + "step": 8375 + }, + { + "epoch": 2.797128068124896, + "grad_norm": 0.3621511730767648, + "learning_rate": 1.3804834261732957e-07, + "loss": 0.0217, + "step": 8376 + }, + { + "epoch": 2.797462013691768, + "grad_norm": 0.26980065793884134, + "learning_rate": 1.3759521295171773e-07, + "loss": 0.014, + "step": 8377 + }, + { + "epoch": 2.7977959592586408, + "grad_norm": 0.23282010559479888, + "learning_rate": 1.3714281780960237e-07, + "loss": 0.0132, + "step": 8378 + }, + { + "epoch": 2.7981299048255135, + "grad_norm": 0.3347537165698293, + "learning_rate": 1.366911572593227e-07, + "loss": 0.0142, + "step": 8379 + }, + { + "epoch": 2.7984638503923858, + "grad_norm": 0.31495637518135455, + "learning_rate": 1.3624023136910691e-07, + "loss": 0.0136, + "step": 8380 + }, + { + "epoch": 2.7987977959592585, + "grad_norm": 0.25856814855169197, + "learning_rate": 1.3579004020707387e-07, + "loss": 0.0089, + "step": 8381 + }, + { + "epoch": 2.799131741526131, + "grad_norm": 0.3036830138780181, + "learning_rate": 1.3534058384122862e-07, + "loss": 0.0133, + "step": 8382 + }, + { + "epoch": 2.799465687093004, + "grad_norm": 0.3126728041819368, + "learning_rate": 1.3489186233946793e-07, + "loss": 0.0171, + "step": 8383 + }, + { + "epoch": 2.7997996326598766, + "grad_norm": 0.34767110591674416, + "learning_rate": 1.3444387576957706e-07, + "loss": 0.0183, + "step": 8384 + }, + { + "epoch": 2.800133578226749, + "grad_norm": 0.28919378287035513, + "learning_rate": 1.33996624199228e-07, + "loss": 0.0135, + "step": 8385 + }, + { + "epoch": 2.8004675237936216, + "grad_norm": 0.33563019294314644, + "learning_rate": 1.335501076959844e-07, + "loss": 0.0155, + "step": 8386 + }, + { + "epoch": 2.8008014693604943, + "grad_norm": 0.34985493173092885, + "learning_rate": 1.331043263272974e-07, + "loss": 0.0136, + "step": 8387 + }, + { + "epoch": 2.8011354149273666, + "grad_norm": 0.3023121399247982, + "learning_rate": 1.3265928016050756e-07, + "loss": 0.0103, + "step": 8388 + }, + { + "epoch": 2.8014693604942393, + "grad_norm": 0.29181389524536816, + "learning_rate": 1.3221496926284493e-07, + "loss": 0.0131, + "step": 8389 + }, + { + "epoch": 2.801803306061112, + "grad_norm": 0.3205013258648555, + "learning_rate": 1.3177139370142755e-07, + "loss": 0.0156, + "step": 8390 + }, + { + "epoch": 2.8021372516279848, + "grad_norm": 0.26342576122319006, + "learning_rate": 1.3132855354326236e-07, + "loss": 0.0122, + "step": 8391 + }, + { + "epoch": 2.8024711971948575, + "grad_norm": 0.2924159114380533, + "learning_rate": 1.3088644885524637e-07, + "loss": 0.0141, + "step": 8392 + }, + { + "epoch": 2.8028051427617298, + "grad_norm": 0.26825934096652987, + "learning_rate": 1.3044507970416398e-07, + "loss": 0.0123, + "step": 8393 + }, + { + "epoch": 2.8031390883286025, + "grad_norm": 0.29890559772657394, + "learning_rate": 1.3000444615668906e-07, + "loss": 0.0196, + "step": 8394 + }, + { + "epoch": 2.803473033895475, + "grad_norm": 0.26558573606851443, + "learning_rate": 1.2956454827938557e-07, + "loss": 0.0119, + "step": 8395 + }, + { + "epoch": 2.8038069794623475, + "grad_norm": 0.2786752920512263, + "learning_rate": 1.291253861387043e-07, + "loss": 0.0098, + "step": 8396 + }, + { + "epoch": 2.80414092502922, + "grad_norm": 0.23928573084074356, + "learning_rate": 1.28686959800986e-07, + "loss": 0.0088, + "step": 8397 + }, + { + "epoch": 2.804474870596093, + "grad_norm": 0.3268059382040651, + "learning_rate": 1.2824926933246106e-07, + "loss": 0.0134, + "step": 8398 + }, + { + "epoch": 2.804808816162965, + "grad_norm": 0.26133151883908656, + "learning_rate": 1.2781231479924606e-07, + "loss": 0.011, + "step": 8399 + }, + { + "epoch": 2.805142761729838, + "grad_norm": 0.2895260831922916, + "learning_rate": 1.2737609626734927e-07, + "loss": 0.0119, + "step": 8400 + }, + { + "epoch": 2.8054767072967106, + "grad_norm": 0.3425229516058676, + "learning_rate": 1.269406138026663e-07, + "loss": 0.0193, + "step": 8401 + }, + { + "epoch": 2.8058106528635833, + "grad_norm": 0.283686057394506, + "learning_rate": 1.2650586747098238e-07, + "loss": 0.0138, + "step": 8402 + }, + { + "epoch": 2.806144598430456, + "grad_norm": 0.2851033035169662, + "learning_rate": 1.2607185733797044e-07, + "loss": 0.018, + "step": 8403 + }, + { + "epoch": 2.8064785439973283, + "grad_norm": 0.3138832867948211, + "learning_rate": 1.2563858346919365e-07, + "loss": 0.0137, + "step": 8404 + }, + { + "epoch": 2.806812489564201, + "grad_norm": 0.23805023464287908, + "learning_rate": 1.2520604593010189e-07, + "loss": 0.0139, + "step": 8405 + }, + { + "epoch": 2.8071464351310738, + "grad_norm": 0.28574374435213823, + "learning_rate": 1.247742447860356e-07, + "loss": 0.0157, + "step": 8406 + }, + { + "epoch": 2.807480380697946, + "grad_norm": 0.28526161309306497, + "learning_rate": 1.2434318010222434e-07, + "loss": 0.0114, + "step": 8407 + }, + { + "epoch": 2.8078143262648187, + "grad_norm": 0.28554133114268365, + "learning_rate": 1.2391285194378433e-07, + "loss": 0.0122, + "step": 8408 + }, + { + "epoch": 2.8081482718316915, + "grad_norm": 0.2679164720587576, + "learning_rate": 1.2348326037572244e-07, + "loss": 0.0136, + "step": 8409 + }, + { + "epoch": 2.808482217398564, + "grad_norm": 0.34096941221535726, + "learning_rate": 1.2305440546293236e-07, + "loss": 0.0177, + "step": 8410 + }, + { + "epoch": 2.808816162965437, + "grad_norm": 0.23635400336796952, + "learning_rate": 1.2262628727019942e-07, + "loss": 0.011, + "step": 8411 + }, + { + "epoch": 2.809150108532309, + "grad_norm": 0.23210019008562568, + "learning_rate": 1.221989058621942e-07, + "loss": 0.0087, + "step": 8412 + }, + { + "epoch": 2.809484054099182, + "grad_norm": 0.278626224498374, + "learning_rate": 1.2177226130347886e-07, + "loss": 0.0125, + "step": 8413 + }, + { + "epoch": 2.8098179996660546, + "grad_norm": 0.22672558954302258, + "learning_rate": 1.21346353658503e-07, + "loss": 0.015, + "step": 8414 + }, + { + "epoch": 2.810151945232927, + "grad_norm": 0.24113637952206665, + "learning_rate": 1.209211829916046e-07, + "loss": 0.0118, + "step": 8415 + }, + { + "epoch": 2.8104858907997996, + "grad_norm": 0.30406505669797074, + "learning_rate": 1.204967493670106e-07, + "loss": 0.0133, + "step": 8416 + }, + { + "epoch": 2.8108198363666723, + "grad_norm": 0.280063746051557, + "learning_rate": 1.2007305284883696e-07, + "loss": 0.0112, + "step": 8417 + }, + { + "epoch": 2.8111537819335446, + "grad_norm": 0.30297807034631186, + "learning_rate": 1.1965009350108747e-07, + "loss": 0.0271, + "step": 8418 + }, + { + "epoch": 2.8114877275004173, + "grad_norm": 0.27213332098461923, + "learning_rate": 1.1922787138765656e-07, + "loss": 0.0112, + "step": 8419 + }, + { + "epoch": 2.81182167306729, + "grad_norm": 0.26232310304595063, + "learning_rate": 1.188063865723238e-07, + "loss": 0.0114, + "step": 8420 + }, + { + "epoch": 2.8121556186341627, + "grad_norm": 0.22527477009169755, + "learning_rate": 1.1838563911876155e-07, + "loss": 0.0113, + "step": 8421 + }, + { + "epoch": 2.8124895642010355, + "grad_norm": 0.2809461366900473, + "learning_rate": 1.1796562909052734e-07, + "loss": 0.0126, + "step": 8422 + }, + { + "epoch": 2.8128235097679077, + "grad_norm": 0.36457444507388703, + "learning_rate": 1.1754635655106928e-07, + "loss": 0.021, + "step": 8423 + }, + { + "epoch": 2.8131574553347805, + "grad_norm": 0.2571348420697469, + "learning_rate": 1.1712782156372226e-07, + "loss": 0.0121, + "step": 8424 + }, + { + "epoch": 2.813491400901653, + "grad_norm": 0.2642181664400732, + "learning_rate": 1.167100241917124e-07, + "loss": 0.0148, + "step": 8425 + }, + { + "epoch": 2.8138253464685254, + "grad_norm": 0.29227821178334, + "learning_rate": 1.1629296449815197e-07, + "loss": 0.0128, + "step": 8426 + }, + { + "epoch": 2.814159292035398, + "grad_norm": 0.22396646696066097, + "learning_rate": 1.1587664254604336e-07, + "loss": 0.0086, + "step": 8427 + }, + { + "epoch": 2.814493237602271, + "grad_norm": 0.26553623466395604, + "learning_rate": 1.1546105839827626e-07, + "loss": 0.0114, + "step": 8428 + }, + { + "epoch": 2.814827183169143, + "grad_norm": 0.2357038060759402, + "learning_rate": 1.150462121176299e-07, + "loss": 0.0104, + "step": 8429 + }, + { + "epoch": 2.815161128736016, + "grad_norm": 0.34500413522697126, + "learning_rate": 1.1463210376677192e-07, + "loss": 0.0173, + "step": 8430 + }, + { + "epoch": 2.8154950743028886, + "grad_norm": 0.27684281474371836, + "learning_rate": 1.1421873340825729e-07, + "loss": 0.012, + "step": 8431 + }, + { + "epoch": 2.8158290198697613, + "grad_norm": 0.27103118963474787, + "learning_rate": 1.1380610110453217e-07, + "loss": 0.0138, + "step": 8432 + }, + { + "epoch": 2.816162965436634, + "grad_norm": 0.24028946798563416, + "learning_rate": 1.133942069179278e-07, + "loss": 0.0105, + "step": 8433 + }, + { + "epoch": 2.8164969110035063, + "grad_norm": 0.40271768500581506, + "learning_rate": 1.1298305091066664e-07, + "loss": 0.0216, + "step": 8434 + }, + { + "epoch": 2.816830856570379, + "grad_norm": 0.37001445583746134, + "learning_rate": 1.1257263314485844e-07, + "loss": 0.0158, + "step": 8435 + }, + { + "epoch": 2.8171648021372517, + "grad_norm": 0.28539301569455333, + "learning_rate": 1.1216295368250196e-07, + "loss": 0.0149, + "step": 8436 + }, + { + "epoch": 2.817498747704124, + "grad_norm": 0.26953687510639085, + "learning_rate": 1.1175401258548324e-07, + "loss": 0.0124, + "step": 8437 + }, + { + "epoch": 2.8178326932709967, + "grad_norm": 0.2986731476846757, + "learning_rate": 1.1134580991557842e-07, + "loss": 0.0162, + "step": 8438 + }, + { + "epoch": 2.8181666388378694, + "grad_norm": 0.24487272593825893, + "learning_rate": 1.1093834573445094e-07, + "loss": 0.0125, + "step": 8439 + }, + { + "epoch": 2.818500584404742, + "grad_norm": 0.33134490719741383, + "learning_rate": 1.1053162010365326e-07, + "loss": 0.0122, + "step": 8440 + }, + { + "epoch": 2.818834529971615, + "grad_norm": 0.38176389561748614, + "learning_rate": 1.1012563308462565e-07, + "loss": 0.0239, + "step": 8441 + }, + { + "epoch": 2.819168475538487, + "grad_norm": 0.24794065109067973, + "learning_rate": 1.0972038473869795e-07, + "loss": 0.0109, + "step": 8442 + }, + { + "epoch": 2.81950242110536, + "grad_norm": 0.32262308348965707, + "learning_rate": 1.093158751270873e-07, + "loss": 0.0151, + "step": 8443 + }, + { + "epoch": 2.8198363666722326, + "grad_norm": 0.29734626106521483, + "learning_rate": 1.0891210431089983e-07, + "loss": 0.0106, + "step": 8444 + }, + { + "epoch": 2.820170312239105, + "grad_norm": 0.28189267524660805, + "learning_rate": 1.0850907235112895e-07, + "loss": 0.0128, + "step": 8445 + }, + { + "epoch": 2.8205042578059776, + "grad_norm": 0.2869779936280662, + "learning_rate": 1.0810677930865876e-07, + "loss": 0.012, + "step": 8446 + }, + { + "epoch": 2.8208382033728503, + "grad_norm": 0.2901808939607543, + "learning_rate": 1.0770522524425898e-07, + "loss": 0.0134, + "step": 8447 + }, + { + "epoch": 2.8211721489397226, + "grad_norm": 0.32842743169031546, + "learning_rate": 1.0730441021859106e-07, + "loss": 0.0134, + "step": 8448 + }, + { + "epoch": 2.8215060945065953, + "grad_norm": 0.338455094506942, + "learning_rate": 1.0690433429220049e-07, + "loss": 0.0144, + "step": 8449 + }, + { + "epoch": 2.821840040073468, + "grad_norm": 0.27895776404779044, + "learning_rate": 1.0650499752552557e-07, + "loss": 0.0159, + "step": 8450 + }, + { + "epoch": 2.8221739856403407, + "grad_norm": 0.2976618575069383, + "learning_rate": 1.0610639997888917e-07, + "loss": 0.0119, + "step": 8451 + }, + { + "epoch": 2.8225079312072134, + "grad_norm": 0.25465651607196915, + "learning_rate": 1.0570854171250478e-07, + "loss": 0.0129, + "step": 8452 + }, + { + "epoch": 2.8228418767740857, + "grad_norm": 0.3134176234571828, + "learning_rate": 1.0531142278647378e-07, + "loss": 0.0128, + "step": 8453 + }, + { + "epoch": 2.8231758223409584, + "grad_norm": 0.27868733224420017, + "learning_rate": 1.0491504326078483e-07, + "loss": 0.0075, + "step": 8454 + }, + { + "epoch": 2.823509767907831, + "grad_norm": 0.23699800131791296, + "learning_rate": 1.0451940319531728e-07, + "loss": 0.0084, + "step": 8455 + }, + { + "epoch": 2.8238437134747034, + "grad_norm": 0.40073273739297754, + "learning_rate": 1.0412450264983609e-07, + "loss": 0.0185, + "step": 8456 + }, + { + "epoch": 2.824177659041576, + "grad_norm": 0.2813315415535806, + "learning_rate": 1.0373034168399521e-07, + "loss": 0.0207, + "step": 8457 + }, + { + "epoch": 2.824511604608449, + "grad_norm": 0.23131194956899465, + "learning_rate": 1.0333692035733867e-07, + "loss": 0.0108, + "step": 8458 + }, + { + "epoch": 2.8248455501753216, + "grad_norm": 0.4652767329889953, + "learning_rate": 1.0294423872929615e-07, + "loss": 0.017, + "step": 8459 + }, + { + "epoch": 2.8251794957421943, + "grad_norm": 0.36143315764560047, + "learning_rate": 1.0255229685918744e-07, + "loss": 0.0224, + "step": 8460 + }, + { + "epoch": 2.8255134413090666, + "grad_norm": 0.2974311280656284, + "learning_rate": 1.0216109480622017e-07, + "loss": 0.0118, + "step": 8461 + }, + { + "epoch": 2.8258473868759393, + "grad_norm": 1.1702102524851237, + "learning_rate": 1.0177063262948927e-07, + "loss": 0.014, + "step": 8462 + }, + { + "epoch": 2.826181332442812, + "grad_norm": 0.3534851980015013, + "learning_rate": 1.0138091038797982e-07, + "loss": 0.0158, + "step": 8463 + }, + { + "epoch": 2.8265152780096843, + "grad_norm": 0.2723782456039754, + "learning_rate": 1.0099192814056247e-07, + "loss": 0.0143, + "step": 8464 + }, + { + "epoch": 2.826849223576557, + "grad_norm": 0.22216513094016702, + "learning_rate": 1.0060368594599856e-07, + "loss": 0.012, + "step": 8465 + }, + { + "epoch": 2.8271831691434297, + "grad_norm": 0.35377554256356336, + "learning_rate": 1.002161838629362e-07, + "loss": 0.0087, + "step": 8466 + }, + { + "epoch": 2.827517114710302, + "grad_norm": 0.20115279592806767, + "learning_rate": 9.982942194991297e-08, + "loss": 0.0066, + "step": 8467 + }, + { + "epoch": 2.8278510602771747, + "grad_norm": 0.30575556991443903, + "learning_rate": 9.94434002653527e-08, + "loss": 0.0161, + "step": 8468 + }, + { + "epoch": 2.8281850058440474, + "grad_norm": 0.25882505482968193, + "learning_rate": 9.905811886756933e-08, + "loss": 0.0129, + "step": 8469 + }, + { + "epoch": 2.82851895141092, + "grad_norm": 0.2929484761962619, + "learning_rate": 9.867357781476294e-08, + "loss": 0.0139, + "step": 8470 + }, + { + "epoch": 2.828852896977793, + "grad_norm": 0.4300323198699619, + "learning_rate": 9.828977716502486e-08, + "loss": 0.0144, + "step": 8471 + }, + { + "epoch": 2.829186842544665, + "grad_norm": 0.2973213778977374, + "learning_rate": 9.790671697633092e-08, + "loss": 0.0151, + "step": 8472 + }, + { + "epoch": 2.829520788111538, + "grad_norm": 0.3748368631779452, + "learning_rate": 9.752439730654872e-08, + "loss": 0.0224, + "step": 8473 + }, + { + "epoch": 2.8298547336784106, + "grad_norm": 0.3208448130130757, + "learning_rate": 9.714281821343041e-08, + "loss": 0.0121, + "step": 8474 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.2544573923943401, + "learning_rate": 9.676197975461876e-08, + "loss": 0.012, + "step": 8475 + }, + { + "epoch": 2.8305226248121556, + "grad_norm": 0.27098197281819014, + "learning_rate": 9.638188198764387e-08, + "loss": 0.0119, + "step": 8476 + }, + { + "epoch": 2.8308565703790283, + "grad_norm": 0.2879514253745916, + "learning_rate": 9.600252496992369e-08, + "loss": 0.0107, + "step": 8477 + }, + { + "epoch": 2.8311905159459005, + "grad_norm": 0.2322135051338501, + "learning_rate": 9.562390875876515e-08, + "loss": 0.0082, + "step": 8478 + }, + { + "epoch": 2.8315244615127733, + "grad_norm": 0.3014665266822095, + "learning_rate": 9.524603341136251e-08, + "loss": 0.0145, + "step": 8479 + }, + { + "epoch": 2.831858407079646, + "grad_norm": 0.24936357592251743, + "learning_rate": 9.486889898479734e-08, + "loss": 0.0098, + "step": 8480 + }, + { + "epoch": 2.8321923526465187, + "grad_norm": 0.23670990454764407, + "learning_rate": 9.449250553604184e-08, + "loss": 0.0093, + "step": 8481 + }, + { + "epoch": 2.8325262982133914, + "grad_norm": 0.2030634051800307, + "learning_rate": 9.41168531219533e-08, + "loss": 0.0089, + "step": 8482 + }, + { + "epoch": 2.8328602437802637, + "grad_norm": 0.32479644427744586, + "learning_rate": 9.374194179927909e-08, + "loss": 0.0168, + "step": 8483 + }, + { + "epoch": 2.8331941893471364, + "grad_norm": 0.28146344194371314, + "learning_rate": 9.336777162465449e-08, + "loss": 0.0136, + "step": 8484 + }, + { + "epoch": 2.833528134914009, + "grad_norm": 0.28794033571057465, + "learning_rate": 9.299434265460095e-08, + "loss": 0.0138, + "step": 8485 + }, + { + "epoch": 2.8338620804808814, + "grad_norm": 0.2845003959676957, + "learning_rate": 9.262165494553055e-08, + "loss": 0.0154, + "step": 8486 + }, + { + "epoch": 2.834196026047754, + "grad_norm": 0.37816624497676077, + "learning_rate": 9.22497085537416e-08, + "loss": 0.0135, + "step": 8487 + }, + { + "epoch": 2.834529971614627, + "grad_norm": 0.42961347025405255, + "learning_rate": 9.187850353542082e-08, + "loss": 0.0173, + "step": 8488 + }, + { + "epoch": 2.8348639171814995, + "grad_norm": 0.2786534731328175, + "learning_rate": 9.150803994664337e-08, + "loss": 0.0154, + "step": 8489 + }, + { + "epoch": 2.8351978627483723, + "grad_norm": 0.5368650444095259, + "learning_rate": 9.113831784337279e-08, + "loss": 0.0169, + "step": 8490 + }, + { + "epoch": 2.8355318083152445, + "grad_norm": 0.330794427971968, + "learning_rate": 9.076933728145832e-08, + "loss": 0.0132, + "step": 8491 + }, + { + "epoch": 2.8358657538821173, + "grad_norm": 0.3324737949827866, + "learning_rate": 9.040109831664035e-08, + "loss": 0.0182, + "step": 8492 + }, + { + "epoch": 2.83619969944899, + "grad_norm": 0.2736450738801953, + "learning_rate": 9.003360100454495e-08, + "loss": 0.0118, + "step": 8493 + }, + { + "epoch": 2.8365336450158622, + "grad_norm": 0.27565907317194843, + "learning_rate": 8.966684540068659e-08, + "loss": 0.0121, + "step": 8494 + }, + { + "epoch": 2.836867590582735, + "grad_norm": 0.2851302314397976, + "learning_rate": 8.930083156046931e-08, + "loss": 0.011, + "step": 8495 + }, + { + "epoch": 2.8372015361496077, + "grad_norm": 0.3410516278374826, + "learning_rate": 8.893555953918276e-08, + "loss": 0.0143, + "step": 8496 + }, + { + "epoch": 2.83753548171648, + "grad_norm": 0.29575558981494526, + "learning_rate": 8.857102939200557e-08, + "loss": 0.0136, + "step": 8497 + }, + { + "epoch": 2.8378694272833527, + "grad_norm": 0.29374282178627564, + "learning_rate": 8.820724117400536e-08, + "loss": 0.0116, + "step": 8498 + }, + { + "epoch": 2.8382033728502254, + "grad_norm": 0.2226854391655593, + "learning_rate": 8.784419494013541e-08, + "loss": 0.0101, + "step": 8499 + }, + { + "epoch": 2.838537318417098, + "grad_norm": 0.26833583075797063, + "learning_rate": 8.74818907452385e-08, + "loss": 0.0164, + "step": 8500 + }, + { + "epoch": 2.838871263983971, + "grad_norm": 0.25534358032828847, + "learning_rate": 8.712032864404529e-08, + "loss": 0.0124, + "step": 8501 + }, + { + "epoch": 2.839205209550843, + "grad_norm": 0.2242527309536938, + "learning_rate": 8.675950869117323e-08, + "loss": 0.0092, + "step": 8502 + }, + { + "epoch": 2.839539155117716, + "grad_norm": 0.3060378584530344, + "learning_rate": 8.639943094112868e-08, + "loss": 0.0127, + "step": 8503 + }, + { + "epoch": 2.8398731006845885, + "grad_norm": 0.32974861235148545, + "learning_rate": 8.604009544830705e-08, + "loss": 0.0181, + "step": 8504 + }, + { + "epoch": 2.840207046251461, + "grad_norm": 0.27513536522164506, + "learning_rate": 8.568150226698823e-08, + "loss": 0.0121, + "step": 8505 + }, + { + "epoch": 2.8405409918183335, + "grad_norm": 0.36152341510785624, + "learning_rate": 8.532365145134226e-08, + "loss": 0.0157, + "step": 8506 + }, + { + "epoch": 2.8408749373852062, + "grad_norm": 0.31136696128853075, + "learning_rate": 8.496654305542807e-08, + "loss": 0.0162, + "step": 8507 + }, + { + "epoch": 2.841208882952079, + "grad_norm": 0.3697520653983878, + "learning_rate": 8.461017713318976e-08, + "loss": 0.0189, + "step": 8508 + }, + { + "epoch": 2.8415428285189517, + "grad_norm": 0.26632652178921934, + "learning_rate": 8.425455373846147e-08, + "loss": 0.0122, + "step": 8509 + }, + { + "epoch": 2.841876774085824, + "grad_norm": 0.24430852577156545, + "learning_rate": 8.38996729249636e-08, + "loss": 0.0086, + "step": 8510 + }, + { + "epoch": 2.8422107196526967, + "grad_norm": 0.34850018244760905, + "learning_rate": 8.354553474630489e-08, + "loss": 0.0175, + "step": 8511 + }, + { + "epoch": 2.8425446652195694, + "grad_norm": 0.36303384284039225, + "learning_rate": 8.319213925598258e-08, + "loss": 0.0183, + "step": 8512 + }, + { + "epoch": 2.8428786107864417, + "grad_norm": 0.2679445838842964, + "learning_rate": 8.283948650738172e-08, + "loss": 0.0119, + "step": 8513 + }, + { + "epoch": 2.8432125563533144, + "grad_norm": 0.3110846121880377, + "learning_rate": 8.248757655377415e-08, + "loss": 0.0183, + "step": 8514 + }, + { + "epoch": 2.843546501920187, + "grad_norm": 0.22924830934423535, + "learning_rate": 8.213640944831957e-08, + "loss": 0.013, + "step": 8515 + }, + { + "epoch": 2.8438804474870594, + "grad_norm": 0.2661107097521215, + "learning_rate": 8.178598524406667e-08, + "loss": 0.0164, + "step": 8516 + }, + { + "epoch": 2.844214393053932, + "grad_norm": 0.2956356286503423, + "learning_rate": 8.143630399395031e-08, + "loss": 0.0119, + "step": 8517 + }, + { + "epoch": 2.844548338620805, + "grad_norm": 0.3453447186550457, + "learning_rate": 8.108736575079434e-08, + "loss": 0.0189, + "step": 8518 + }, + { + "epoch": 2.8448822841876775, + "grad_norm": 0.24075542788156157, + "learning_rate": 8.073917056731106e-08, + "loss": 0.0116, + "step": 8519 + }, + { + "epoch": 2.8452162297545502, + "grad_norm": 0.35297050546727476, + "learning_rate": 8.039171849609728e-08, + "loss": 0.0133, + "step": 8520 + }, + { + "epoch": 2.8455501753214225, + "grad_norm": 0.2527799302308656, + "learning_rate": 8.004500958964211e-08, + "loss": 0.0104, + "step": 8521 + }, + { + "epoch": 2.8458841208882952, + "grad_norm": 0.27290973030080484, + "learning_rate": 7.969904390031812e-08, + "loss": 0.0117, + "step": 8522 + }, + { + "epoch": 2.846218066455168, + "grad_norm": 0.2921832885408687, + "learning_rate": 7.935382148038794e-08, + "loss": 0.0116, + "step": 8523 + }, + { + "epoch": 2.8465520120220402, + "grad_norm": 0.2988831261199746, + "learning_rate": 7.900934238200265e-08, + "loss": 0.0138, + "step": 8524 + }, + { + "epoch": 2.846885957588913, + "grad_norm": 0.328375647624051, + "learning_rate": 7.866560665719836e-08, + "loss": 0.0147, + "step": 8525 + }, + { + "epoch": 2.8472199031557857, + "grad_norm": 0.3395522286479151, + "learning_rate": 7.832261435790078e-08, + "loss": 0.02, + "step": 8526 + }, + { + "epoch": 2.847553848722658, + "grad_norm": 0.3205350868707443, + "learning_rate": 7.798036553592403e-08, + "loss": 0.0128, + "step": 8527 + }, + { + "epoch": 2.8478877942895306, + "grad_norm": 0.21270356895204348, + "learning_rate": 7.763886024296729e-08, + "loss": 0.0073, + "step": 8528 + }, + { + "epoch": 2.8482217398564034, + "grad_norm": 0.26342686256275705, + "learning_rate": 7.729809853061987e-08, + "loss": 0.0116, + "step": 8529 + }, + { + "epoch": 2.848555685423276, + "grad_norm": 0.3028909044047864, + "learning_rate": 7.69580804503578e-08, + "loss": 0.0151, + "step": 8530 + }, + { + "epoch": 2.848889630990149, + "grad_norm": 0.2404667209106315, + "learning_rate": 7.661880605354444e-08, + "loss": 0.0109, + "step": 8531 + }, + { + "epoch": 2.849223576557021, + "grad_norm": 0.2594877666811875, + "learning_rate": 7.628027539143156e-08, + "loss": 0.0114, + "step": 8532 + }, + { + "epoch": 2.849557522123894, + "grad_norm": 0.2030132717973318, + "learning_rate": 7.594248851515717e-08, + "loss": 0.009, + "step": 8533 + }, + { + "epoch": 2.8498914676907665, + "grad_norm": 0.29151936401500517, + "learning_rate": 7.560544547574988e-08, + "loss": 0.0121, + "step": 8534 + }, + { + "epoch": 2.850225413257639, + "grad_norm": 0.22534914580383095, + "learning_rate": 7.526914632412175e-08, + "loss": 0.0083, + "step": 8535 + }, + { + "epoch": 2.8505593588245115, + "grad_norm": 0.33263694189915644, + "learning_rate": 7.493359111107712e-08, + "loss": 0.0139, + "step": 8536 + }, + { + "epoch": 2.850893304391384, + "grad_norm": 0.30218668753966177, + "learning_rate": 7.459877988730325e-08, + "loss": 0.0175, + "step": 8537 + }, + { + "epoch": 2.851227249958257, + "grad_norm": 0.2430993896365685, + "learning_rate": 7.42647127033791e-08, + "loss": 0.0105, + "step": 8538 + }, + { + "epoch": 2.8515611955251297, + "grad_norm": 0.303225374287104, + "learning_rate": 7.393138960976876e-08, + "loss": 0.0145, + "step": 8539 + }, + { + "epoch": 2.851895141092002, + "grad_norm": 0.2842340670103229, + "learning_rate": 7.359881065682473e-08, + "loss": 0.0151, + "step": 8540 + }, + { + "epoch": 2.8522290866588746, + "grad_norm": 0.24462551342850572, + "learning_rate": 7.32669758947857e-08, + "loss": 0.0112, + "step": 8541 + }, + { + "epoch": 2.8525630322257474, + "grad_norm": 0.24453162496305242, + "learning_rate": 7.29358853737816e-08, + "loss": 0.0105, + "step": 8542 + }, + { + "epoch": 2.8528969777926196, + "grad_norm": 0.2637551487610621, + "learning_rate": 7.260553914382573e-08, + "loss": 0.0103, + "step": 8543 + }, + { + "epoch": 2.8532309233594924, + "grad_norm": 0.2662783522173254, + "learning_rate": 7.227593725482207e-08, + "loss": 0.0133, + "step": 8544 + }, + { + "epoch": 2.853564868926365, + "grad_norm": 0.28591637654753016, + "learning_rate": 7.194707975655912e-08, + "loss": 0.0167, + "step": 8545 + }, + { + "epoch": 2.8538988144932373, + "grad_norm": 0.34518955222935715, + "learning_rate": 7.161896669871605e-08, + "loss": 0.0149, + "step": 8546 + }, + { + "epoch": 2.85423276006011, + "grad_norm": 0.2941960925392155, + "learning_rate": 7.129159813085817e-08, + "loss": 0.0163, + "step": 8547 + }, + { + "epoch": 2.854566705626983, + "grad_norm": 0.2712849523985334, + "learning_rate": 7.096497410243819e-08, + "loss": 0.0122, + "step": 8548 + }, + { + "epoch": 2.8549006511938555, + "grad_norm": 0.3106976369290745, + "learning_rate": 7.063909466279605e-08, + "loss": 0.0139, + "step": 8549 + }, + { + "epoch": 2.855234596760728, + "grad_norm": 0.363119854761904, + "learning_rate": 7.031395986116019e-08, + "loss": 0.0157, + "step": 8550 + }, + { + "epoch": 2.8555685423276005, + "grad_norm": 0.25050871163578137, + "learning_rate": 6.998956974664573e-08, + "loss": 0.0109, + "step": 8551 + }, + { + "epoch": 2.855902487894473, + "grad_norm": 0.3928211617260432, + "learning_rate": 6.966592436825514e-08, + "loss": 0.0276, + "step": 8552 + }, + { + "epoch": 2.856236433461346, + "grad_norm": 0.2503705121237086, + "learning_rate": 6.934302377488045e-08, + "loss": 0.0123, + "step": 8553 + }, + { + "epoch": 2.856570379028218, + "grad_norm": 0.259217195806298, + "learning_rate": 6.902086801529817e-08, + "loss": 0.0102, + "step": 8554 + }, + { + "epoch": 2.856904324595091, + "grad_norm": 0.24940453514628133, + "learning_rate": 6.869945713817438e-08, + "loss": 0.0093, + "step": 8555 + }, + { + "epoch": 2.8572382701619636, + "grad_norm": 0.2735992066774658, + "learning_rate": 6.837879119206192e-08, + "loss": 0.014, + "step": 8556 + }, + { + "epoch": 2.8575722157288364, + "grad_norm": 0.6348134765812143, + "learning_rate": 6.805887022540093e-08, + "loss": 0.017, + "step": 8557 + }, + { + "epoch": 2.857906161295709, + "grad_norm": 0.2972151048234801, + "learning_rate": 6.773969428651883e-08, + "loss": 0.0146, + "step": 8558 + }, + { + "epoch": 2.8582401068625813, + "grad_norm": 0.3751021316558058, + "learning_rate": 6.742126342363153e-08, + "loss": 0.0196, + "step": 8559 + }, + { + "epoch": 2.858574052429454, + "grad_norm": 0.35876437528401917, + "learning_rate": 6.710357768484165e-08, + "loss": 0.017, + "step": 8560 + }, + { + "epoch": 2.8589079979963268, + "grad_norm": 0.22743288700869818, + "learning_rate": 6.67866371181397e-08, + "loss": 0.0089, + "step": 8561 + }, + { + "epoch": 2.859241943563199, + "grad_norm": 0.32869503420014967, + "learning_rate": 6.647044177140293e-08, + "loss": 0.0127, + "step": 8562 + }, + { + "epoch": 2.8595758891300718, + "grad_norm": 0.25432944067935565, + "learning_rate": 6.615499169239647e-08, + "loss": 0.0145, + "step": 8563 + }, + { + "epoch": 2.8599098346969445, + "grad_norm": 0.2673055231272969, + "learning_rate": 6.584028692877164e-08, + "loss": 0.0104, + "step": 8564 + }, + { + "epoch": 2.8602437802638168, + "grad_norm": 0.324891112488638, + "learning_rate": 6.552632752807042e-08, + "loss": 0.0181, + "step": 8565 + }, + { + "epoch": 2.8605777258306895, + "grad_norm": 0.27747951249758, + "learning_rate": 6.52131135377182e-08, + "loss": 0.0157, + "step": 8566 + }, + { + "epoch": 2.860911671397562, + "grad_norm": 0.2685322676877951, + "learning_rate": 6.490064500503102e-08, + "loss": 0.0116, + "step": 8567 + }, + { + "epoch": 2.861245616964435, + "grad_norm": 0.28318602323609887, + "learning_rate": 6.458892197721e-08, + "loss": 0.0129, + "step": 8568 + }, + { + "epoch": 2.8615795625313076, + "grad_norm": 0.2645660557938995, + "learning_rate": 6.427794450134529e-08, + "loss": 0.0103, + "step": 8569 + }, + { + "epoch": 2.86191350809818, + "grad_norm": 0.2570216881372266, + "learning_rate": 6.396771262441259e-08, + "loss": 0.0119, + "step": 8570 + }, + { + "epoch": 2.8622474536650526, + "grad_norm": 0.29667492982733074, + "learning_rate": 6.365822639327724e-08, + "loss": 0.0124, + "step": 8571 + }, + { + "epoch": 2.8625813992319253, + "grad_norm": 0.3254460047612813, + "learning_rate": 6.334948585469014e-08, + "loss": 0.0147, + "step": 8572 + }, + { + "epoch": 2.8629153447987976, + "grad_norm": 0.30839793575310176, + "learning_rate": 6.304149105529067e-08, + "loss": 0.0196, + "step": 8573 + }, + { + "epoch": 2.8632492903656703, + "grad_norm": 0.3010324802122414, + "learning_rate": 6.273424204160438e-08, + "loss": 0.0165, + "step": 8574 + }, + { + "epoch": 2.863583235932543, + "grad_norm": 0.25597586835558983, + "learning_rate": 6.242773886004583e-08, + "loss": 0.0111, + "step": 8575 + }, + { + "epoch": 2.8639171814994153, + "grad_norm": 0.23180622247104044, + "learning_rate": 6.212198155691518e-08, + "loss": 0.009, + "step": 8576 + }, + { + "epoch": 2.864251127066288, + "grad_norm": 0.26706278527959076, + "learning_rate": 6.181697017840049e-08, + "loss": 0.0128, + "step": 8577 + }, + { + "epoch": 2.8645850726331608, + "grad_norm": 0.3062347972340337, + "learning_rate": 6.151270477057825e-08, + "loss": 0.0228, + "step": 8578 + }, + { + "epoch": 2.8649190182000335, + "grad_norm": 0.28246594751599063, + "learning_rate": 6.120918537941001e-08, + "loss": 0.0177, + "step": 8579 + }, + { + "epoch": 2.865252963766906, + "grad_norm": 0.2404341118431869, + "learning_rate": 6.090641205074743e-08, + "loss": 0.0086, + "step": 8580 + }, + { + "epoch": 2.8655869093337785, + "grad_norm": 0.23522856635149827, + "learning_rate": 6.060438483032671e-08, + "loss": 0.0084, + "step": 8581 + }, + { + "epoch": 2.865920854900651, + "grad_norm": 0.3327211212380855, + "learning_rate": 6.030310376377302e-08, + "loss": 0.0131, + "step": 8582 + }, + { + "epoch": 2.866254800467524, + "grad_norm": 0.2355739520729309, + "learning_rate": 6.000256889659883e-08, + "loss": 0.0139, + "step": 8583 + }, + { + "epoch": 2.866588746034396, + "grad_norm": 0.2901331532502492, + "learning_rate": 5.97027802742034e-08, + "loss": 0.0114, + "step": 8584 + }, + { + "epoch": 2.866922691601269, + "grad_norm": 0.29450294292448287, + "learning_rate": 5.940373794187326e-08, + "loss": 0.0117, + "step": 8585 + }, + { + "epoch": 2.8672566371681416, + "grad_norm": 0.19670420235039293, + "learning_rate": 5.910544194478174e-08, + "loss": 0.0072, + "step": 8586 + }, + { + "epoch": 2.8675905827350143, + "grad_norm": 0.3484481470828203, + "learning_rate": 5.880789232799e-08, + "loss": 0.0285, + "step": 8587 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 0.2550703476448516, + "learning_rate": 5.851108913644765e-08, + "loss": 0.0105, + "step": 8588 + }, + { + "epoch": 2.8682584738687593, + "grad_norm": 0.2961267925782829, + "learning_rate": 5.821503241498882e-08, + "loss": 0.0184, + "step": 8589 + }, + { + "epoch": 2.868592419435632, + "grad_norm": 0.27187699589818554, + "learning_rate": 5.791972220833719e-08, + "loss": 0.0109, + "step": 8590 + }, + { + "epoch": 2.8689263650025048, + "grad_norm": 0.25437284434188395, + "learning_rate": 5.762515856110262e-08, + "loss": 0.0099, + "step": 8591 + }, + { + "epoch": 2.869260310569377, + "grad_norm": 0.2543393518130791, + "learning_rate": 5.7331341517782855e-08, + "loss": 0.0126, + "step": 8592 + }, + { + "epoch": 2.8695942561362497, + "grad_norm": 0.1976849610871578, + "learning_rate": 5.703827112276128e-08, + "loss": 0.0082, + "step": 8593 + }, + { + "epoch": 2.8699282017031225, + "grad_norm": 0.22656775373268642, + "learning_rate": 5.674594742031081e-08, + "loss": 0.0082, + "step": 8594 + }, + { + "epoch": 2.8702621472699947, + "grad_norm": 0.3012257046414503, + "learning_rate": 5.6454370454589456e-08, + "loss": 0.0143, + "step": 8595 + }, + { + "epoch": 2.8705960928368675, + "grad_norm": 0.3275795352504451, + "learning_rate": 5.6163540269644215e-08, + "loss": 0.0234, + "step": 8596 + }, + { + "epoch": 2.87093003840374, + "grad_norm": 0.3114172374999129, + "learning_rate": 5.5873456909407706e-08, + "loss": 0.0106, + "step": 8597 + }, + { + "epoch": 2.871263983970613, + "grad_norm": 0.283723519554095, + "learning_rate": 5.5584120417701005e-08, + "loss": 0.0127, + "step": 8598 + }, + { + "epoch": 2.8715979295374856, + "grad_norm": 0.21519421110218015, + "learning_rate": 5.529553083823136e-08, + "loss": 0.0099, + "step": 8599 + }, + { + "epoch": 2.871931875104358, + "grad_norm": 0.34906342767238513, + "learning_rate": 5.50076882145939e-08, + "loss": 0.0165, + "step": 8600 + }, + { + "epoch": 2.8722658206712306, + "grad_norm": 0.27965922090622586, + "learning_rate": 5.472059259027051e-08, + "loss": 0.0168, + "step": 8601 + }, + { + "epoch": 2.8725997662381033, + "grad_norm": 0.42169443466053885, + "learning_rate": 5.44342440086304e-08, + "loss": 0.0177, + "step": 8602 + }, + { + "epoch": 2.8729337118049756, + "grad_norm": 0.23363688731913357, + "learning_rate": 5.414864251293006e-08, + "loss": 0.0085, + "step": 8603 + }, + { + "epoch": 2.8732676573718483, + "grad_norm": 0.32918919326802065, + "learning_rate": 5.386378814631277e-08, + "loss": 0.0182, + "step": 8604 + }, + { + "epoch": 2.873601602938721, + "grad_norm": 0.25972777777421585, + "learning_rate": 5.3579680951808545e-08, + "loss": 0.0097, + "step": 8605 + }, + { + "epoch": 2.8739355485055937, + "grad_norm": 0.31170947328139753, + "learning_rate": 5.329632097233639e-08, + "loss": 0.0095, + "step": 8606 + }, + { + "epoch": 2.8742694940724665, + "grad_norm": 0.30807716276869274, + "learning_rate": 5.3013708250700405e-08, + "loss": 0.0086, + "step": 8607 + }, + { + "epoch": 2.8746034396393387, + "grad_norm": 0.26363665871249625, + "learning_rate": 5.2731842829591984e-08, + "loss": 0.0136, + "step": 8608 + }, + { + "epoch": 2.8749373852062114, + "grad_norm": 0.36052019194724466, + "learning_rate": 5.2450724751592076e-08, + "loss": 0.0138, + "step": 8609 + }, + { + "epoch": 2.875271330773084, + "grad_norm": 0.25209575972945103, + "learning_rate": 5.217035405916449e-08, + "loss": 0.0107, + "step": 8610 + }, + { + "epoch": 2.8756052763399564, + "grad_norm": 0.26828565895104245, + "learning_rate": 5.1890730794664227e-08, + "loss": 0.0105, + "step": 8611 + }, + { + "epoch": 2.875939221906829, + "grad_norm": 0.31240163681450744, + "learning_rate": 5.161185500033139e-08, + "loss": 0.0145, + "step": 8612 + }, + { + "epoch": 2.876273167473702, + "grad_norm": 0.3429539275711395, + "learning_rate": 5.1333726718293396e-08, + "loss": 0.0157, + "step": 8613 + }, + { + "epoch": 2.876607113040574, + "grad_norm": 0.2784040083795381, + "learning_rate": 5.105634599056386e-08, + "loss": 0.0113, + "step": 8614 + }, + { + "epoch": 2.876941058607447, + "grad_norm": 0.3182685741683252, + "learning_rate": 5.077971285904593e-08, + "loss": 0.0187, + "step": 8615 + }, + { + "epoch": 2.8772750041743196, + "grad_norm": 0.28762007501535114, + "learning_rate": 5.050382736552728e-08, + "loss": 0.0125, + "step": 8616 + }, + { + "epoch": 2.8776089497411923, + "grad_norm": 0.2701771129361674, + "learning_rate": 5.022868955168403e-08, + "loss": 0.01, + "step": 8617 + }, + { + "epoch": 2.877942895308065, + "grad_norm": 0.28235383957536087, + "learning_rate": 4.995429945907848e-08, + "loss": 0.0125, + "step": 8618 + }, + { + "epoch": 2.8782768408749373, + "grad_norm": 0.3443954539083251, + "learning_rate": 4.968065712916137e-08, + "loss": 0.0206, + "step": 8619 + }, + { + "epoch": 2.87861078644181, + "grad_norm": 0.26059304888406293, + "learning_rate": 4.940776260326907e-08, + "loss": 0.0167, + "step": 8620 + }, + { + "epoch": 2.8789447320086827, + "grad_norm": 0.25791943630723885, + "learning_rate": 4.913561592262528e-08, + "loss": 0.0121, + "step": 8621 + }, + { + "epoch": 2.879278677575555, + "grad_norm": 0.2655859327898246, + "learning_rate": 4.886421712834155e-08, + "loss": 0.0133, + "step": 8622 + }, + { + "epoch": 2.8796126231424277, + "grad_norm": 0.30223419439100285, + "learning_rate": 4.859356626141509e-08, + "loss": 0.0157, + "step": 8623 + }, + { + "epoch": 2.8799465687093004, + "grad_norm": 0.32348923902848525, + "learning_rate": 4.8323663362732084e-08, + "loss": 0.0198, + "step": 8624 + }, + { + "epoch": 2.8802805142761727, + "grad_norm": 0.31712286439454235, + "learning_rate": 4.8054508473063253e-08, + "loss": 0.0154, + "step": 8625 + }, + { + "epoch": 2.8806144598430454, + "grad_norm": 0.25780039181033787, + "learning_rate": 4.778610163306885e-08, + "loss": 0.0121, + "step": 8626 + }, + { + "epoch": 2.880948405409918, + "grad_norm": 0.35193051371421863, + "learning_rate": 4.751844288329366e-08, + "loss": 0.0232, + "step": 8627 + }, + { + "epoch": 2.881282350976791, + "grad_norm": 0.22856815421864873, + "learning_rate": 4.72515322641709e-08, + "loss": 0.0088, + "step": 8628 + }, + { + "epoch": 2.8816162965436636, + "grad_norm": 0.3197896763682093, + "learning_rate": 4.6985369816021644e-08, + "loss": 0.0082, + "step": 8629 + }, + { + "epoch": 2.881950242110536, + "grad_norm": 0.2553349556509692, + "learning_rate": 4.6719955579052064e-08, + "loss": 0.0177, + "step": 8630 + }, + { + "epoch": 2.8822841876774086, + "grad_norm": 0.26694491821126515, + "learning_rate": 4.6455289593355656e-08, + "loss": 0.0123, + "step": 8631 + }, + { + "epoch": 2.8826181332442813, + "grad_norm": 0.2932190552414894, + "learning_rate": 4.619137189891432e-08, + "loss": 0.0154, + "step": 8632 + }, + { + "epoch": 2.8829520788111536, + "grad_norm": 0.32154559052800763, + "learning_rate": 4.5928202535595044e-08, + "loss": 0.0178, + "step": 8633 + }, + { + "epoch": 2.8832860243780263, + "grad_norm": 0.2685217622127027, + "learning_rate": 4.5665781543153266e-08, + "loss": 0.0105, + "step": 8634 + }, + { + "epoch": 2.883619969944899, + "grad_norm": 0.31063917030906485, + "learning_rate": 4.54041089612306e-08, + "loss": 0.0127, + "step": 8635 + }, + { + "epoch": 2.8839539155117717, + "grad_norm": 0.28607046453238255, + "learning_rate": 4.514318482935598e-08, + "loss": 0.0173, + "step": 8636 + }, + { + "epoch": 2.8842878610786444, + "grad_norm": 0.24263089680815572, + "learning_rate": 4.488300918694455e-08, + "loss": 0.0112, + "step": 8637 + }, + { + "epoch": 2.8846218066455167, + "grad_norm": 0.2736154358704746, + "learning_rate": 4.4623582073299864e-08, + "loss": 0.0121, + "step": 8638 + }, + { + "epoch": 2.8849557522123894, + "grad_norm": 0.28647529293389273, + "learning_rate": 4.4364903527610026e-08, + "loss": 0.014, + "step": 8639 + }, + { + "epoch": 2.885289697779262, + "grad_norm": 0.2810451887262522, + "learning_rate": 4.410697358895211e-08, + "loss": 0.0113, + "step": 8640 + }, + { + "epoch": 2.8856236433461344, + "grad_norm": 0.3365754474126144, + "learning_rate": 4.384979229628994e-08, + "loss": 0.0186, + "step": 8641 + }, + { + "epoch": 2.885957588913007, + "grad_norm": 0.33665519179452863, + "learning_rate": 4.359335968847356e-08, + "loss": 0.0163, + "step": 8642 + }, + { + "epoch": 2.88629153447988, + "grad_norm": 0.32270914265569817, + "learning_rate": 4.333767580423976e-08, + "loss": 0.0134, + "step": 8643 + }, + { + "epoch": 2.886625480046752, + "grad_norm": 0.32316203055753245, + "learning_rate": 4.3082740682213186e-08, + "loss": 0.0164, + "step": 8644 + }, + { + "epoch": 2.886959425613625, + "grad_norm": 0.33441153874827034, + "learning_rate": 4.2828554360904165e-08, + "loss": 0.0194, + "step": 8645 + }, + { + "epoch": 2.8872933711804976, + "grad_norm": 0.27536690878555026, + "learning_rate": 4.25751168787103e-08, + "loss": 0.0141, + "step": 8646 + }, + { + "epoch": 2.8876273167473703, + "grad_norm": 0.23970856884759603, + "learning_rate": 4.2322428273917635e-08, + "loss": 0.0113, + "step": 8647 + }, + { + "epoch": 2.887961262314243, + "grad_norm": 0.2933379710425046, + "learning_rate": 4.2070488584696754e-08, + "loss": 0.0116, + "step": 8648 + }, + { + "epoch": 2.8882952078811153, + "grad_norm": 0.27268144692593416, + "learning_rate": 4.18192978491061e-08, + "loss": 0.0126, + "step": 8649 + }, + { + "epoch": 2.888629153447988, + "grad_norm": 0.23595906502391337, + "learning_rate": 4.1568856105091424e-08, + "loss": 0.009, + "step": 8650 + }, + { + "epoch": 2.8889630990148607, + "grad_norm": 0.29042081610527437, + "learning_rate": 4.1319163390484693e-08, + "loss": 0.0168, + "step": 8651 + }, + { + "epoch": 2.889297044581733, + "grad_norm": 0.3239046850135436, + "learning_rate": 4.107021974300407e-08, + "loss": 0.0151, + "step": 8652 + }, + { + "epoch": 2.8896309901486057, + "grad_norm": 0.2848550150771415, + "learning_rate": 4.082202520025724e-08, + "loss": 0.0162, + "step": 8653 + }, + { + "epoch": 2.8899649357154784, + "grad_norm": 0.2662346842513805, + "learning_rate": 4.0574579799735335e-08, + "loss": 0.0132, + "step": 8654 + }, + { + "epoch": 2.890298881282351, + "grad_norm": 0.3175551371439921, + "learning_rate": 4.0327883578819006e-08, + "loss": 0.0151, + "step": 8655 + }, + { + "epoch": 2.890632826849224, + "grad_norm": 0.29936501600193655, + "learning_rate": 4.008193657477399e-08, + "loss": 0.0139, + "step": 8656 + }, + { + "epoch": 2.890966772416096, + "grad_norm": 0.2675577484150458, + "learning_rate": 3.9836738824753364e-08, + "loss": 0.0157, + "step": 8657 + }, + { + "epoch": 2.891300717982969, + "grad_norm": 0.2185916716171451, + "learning_rate": 3.959229036579748e-08, + "loss": 0.0099, + "step": 8658 + }, + { + "epoch": 2.8916346635498416, + "grad_norm": 0.25632943115240303, + "learning_rate": 3.9348591234832926e-08, + "loss": 0.0126, + "step": 8659 + }, + { + "epoch": 2.891968609116714, + "grad_norm": 0.24724161710315734, + "learning_rate": 3.9105641468673574e-08, + "loss": 0.0106, + "step": 8660 + }, + { + "epoch": 2.8923025546835865, + "grad_norm": 0.2237241762635861, + "learning_rate": 3.886344110402007e-08, + "loss": 0.0098, + "step": 8661 + }, + { + "epoch": 2.8926365002504593, + "grad_norm": 0.280499808435951, + "learning_rate": 3.862199017745871e-08, + "loss": 0.0103, + "step": 8662 + }, + { + "epoch": 2.8929704458173315, + "grad_norm": 0.21940658965183582, + "learning_rate": 3.838128872546421e-08, + "loss": 0.0108, + "step": 8663 + }, + { + "epoch": 2.8933043913842043, + "grad_norm": 0.25735699024605174, + "learning_rate": 3.814133678439691e-08, + "loss": 0.0132, + "step": 8664 + }, + { + "epoch": 2.893638336951077, + "grad_norm": 0.24372094354039864, + "learning_rate": 3.790213439050561e-08, + "loss": 0.0091, + "step": 8665 + }, + { + "epoch": 2.8939722825179497, + "grad_norm": 0.24735252275639746, + "learning_rate": 3.766368157992306e-08, + "loss": 0.0123, + "step": 8666 + }, + { + "epoch": 2.8943062280848224, + "grad_norm": 0.267038414140022, + "learning_rate": 3.7425978388671014e-08, + "loss": 0.0151, + "step": 8667 + }, + { + "epoch": 2.8946401736516947, + "grad_norm": 0.2591770195906735, + "learning_rate": 3.718902485265741e-08, + "loss": 0.0111, + "step": 8668 + }, + { + "epoch": 2.8949741192185674, + "grad_norm": 0.28739627134963536, + "learning_rate": 3.6952821007676943e-08, + "loss": 0.0122, + "step": 8669 + }, + { + "epoch": 2.89530806478544, + "grad_norm": 0.2728938631570337, + "learning_rate": 3.671736688941108e-08, + "loss": 0.0126, + "step": 8670 + }, + { + "epoch": 2.8956420103523124, + "grad_norm": 0.19581358338224103, + "learning_rate": 3.6482662533426914e-08, + "loss": 0.009, + "step": 8671 + }, + { + "epoch": 2.895975955919185, + "grad_norm": 0.2844638849590318, + "learning_rate": 3.6248707975181096e-08, + "loss": 0.0106, + "step": 8672 + }, + { + "epoch": 2.896309901486058, + "grad_norm": 0.2482466704684108, + "learning_rate": 3.601550325001313e-08, + "loss": 0.0092, + "step": 8673 + }, + { + "epoch": 2.89664384705293, + "grad_norm": 0.282092274796016, + "learning_rate": 3.578304839315316e-08, + "loss": 0.0099, + "step": 8674 + }, + { + "epoch": 2.896977792619803, + "grad_norm": 0.28047275914017394, + "learning_rate": 3.5551343439715336e-08, + "loss": 0.0133, + "step": 8675 + }, + { + "epoch": 2.8973117381866755, + "grad_norm": 0.2855626412237467, + "learning_rate": 3.5320388424701644e-08, + "loss": 0.0133, + "step": 8676 + }, + { + "epoch": 2.8976456837535483, + "grad_norm": 0.28058573181733476, + "learning_rate": 3.50901833830003e-08, + "loss": 0.0078, + "step": 8677 + }, + { + "epoch": 2.897979629320421, + "grad_norm": 0.2116113856357964, + "learning_rate": 3.4860728349386807e-08, + "loss": 0.0097, + "step": 8678 + }, + { + "epoch": 2.8983135748872932, + "grad_norm": 0.31618409308896017, + "learning_rate": 3.4632023358522894e-08, + "loss": 0.0156, + "step": 8679 + }, + { + "epoch": 2.898647520454166, + "grad_norm": 0.28344553338861805, + "learning_rate": 3.440406844495758e-08, + "loss": 0.013, + "step": 8680 + }, + { + "epoch": 2.8989814660210387, + "grad_norm": 0.22523993877103798, + "learning_rate": 3.4176863643125e-08, + "loss": 0.0083, + "step": 8681 + }, + { + "epoch": 2.899315411587911, + "grad_norm": 0.27484649593540206, + "learning_rate": 3.395040898734825e-08, + "loss": 0.0148, + "step": 8682 + }, + { + "epoch": 2.8996493571547837, + "grad_norm": 0.3562614267683864, + "learning_rate": 3.372470451183496e-08, + "loss": 0.0215, + "step": 8683 + }, + { + "epoch": 2.8999833027216564, + "grad_norm": 0.24983673297695597, + "learning_rate": 3.349975025068175e-08, + "loss": 0.0119, + "step": 8684 + }, + { + "epoch": 2.900317248288529, + "grad_norm": 0.3660830884604933, + "learning_rate": 3.327554623786977e-08, + "loss": 0.0143, + "step": 8685 + }, + { + "epoch": 2.900651193855402, + "grad_norm": 0.2695513887765066, + "learning_rate": 3.305209250726804e-08, + "loss": 0.0153, + "step": 8686 + }, + { + "epoch": 2.900985139422274, + "grad_norm": 0.2972901671866103, + "learning_rate": 3.282938909263122e-08, + "loss": 0.016, + "step": 8687 + }, + { + "epoch": 2.901319084989147, + "grad_norm": 0.32989993264429485, + "learning_rate": 3.2607436027601854e-08, + "loss": 0.0187, + "step": 8688 + }, + { + "epoch": 2.9016530305560195, + "grad_norm": 0.28538556338583776, + "learning_rate": 3.238623334570812e-08, + "loss": 0.0155, + "step": 8689 + }, + { + "epoch": 2.901986976122892, + "grad_norm": 0.3164212136337197, + "learning_rate": 3.2165781080366054e-08, + "loss": 0.0158, + "step": 8690 + }, + { + "epoch": 2.9023209216897645, + "grad_norm": 0.3062420702595496, + "learning_rate": 3.194607926487681e-08, + "loss": 0.0202, + "step": 8691 + }, + { + "epoch": 2.9026548672566372, + "grad_norm": 0.2767076916236891, + "learning_rate": 3.1727127932429936e-08, + "loss": 0.0134, + "step": 8692 + }, + { + "epoch": 2.9029888128235095, + "grad_norm": 0.2034915095180857, + "learning_rate": 3.150892711609899e-08, + "loss": 0.0064, + "step": 8693 + }, + { + "epoch": 2.9033227583903822, + "grad_norm": 0.23242973300874203, + "learning_rate": 3.129147684884704e-08, + "loss": 0.0089, + "step": 8694 + }, + { + "epoch": 2.903656703957255, + "grad_norm": 0.2663350165078304, + "learning_rate": 3.107477716352225e-08, + "loss": 0.0125, + "step": 8695 + }, + { + "epoch": 2.9039906495241277, + "grad_norm": 0.33861888331435996, + "learning_rate": 3.0858828092859564e-08, + "loss": 0.0173, + "step": 8696 + }, + { + "epoch": 2.9043245950910004, + "grad_norm": 0.3209430979212991, + "learning_rate": 3.0643629669480644e-08, + "loss": 0.0128, + "step": 8697 + }, + { + "epoch": 2.9046585406578727, + "grad_norm": 0.306165968142353, + "learning_rate": 3.042918192589395e-08, + "loss": 0.0141, + "step": 8698 + }, + { + "epoch": 2.9049924862247454, + "grad_norm": 0.26839596857883113, + "learning_rate": 3.021548489449355e-08, + "loss": 0.0134, + "step": 8699 + }, + { + "epoch": 2.905326431791618, + "grad_norm": 0.2358241488476551, + "learning_rate": 3.000253860756197e-08, + "loss": 0.0103, + "step": 8700 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.31773401568435783, + "learning_rate": 2.979034309726625e-08, + "loss": 0.0188, + "step": 8701 + }, + { + "epoch": 2.905994322925363, + "grad_norm": 0.22052354319979584, + "learning_rate": 2.9578898395661858e-08, + "loss": 0.0085, + "step": 8702 + }, + { + "epoch": 2.906328268492236, + "grad_norm": 0.24852538150490014, + "learning_rate": 2.9368204534689916e-08, + "loss": 0.019, + "step": 8703 + }, + { + "epoch": 2.9066622140591085, + "grad_norm": 0.29408670813894333, + "learning_rate": 2.915826154617718e-08, + "loss": 0.0128, + "step": 8704 + }, + { + "epoch": 2.9069961596259812, + "grad_norm": 0.24527733369242583, + "learning_rate": 2.8949069461839952e-08, + "loss": 0.0102, + "step": 8705 + }, + { + "epoch": 2.9073301051928535, + "grad_norm": 0.2969798744619057, + "learning_rate": 2.8740628313276842e-08, + "loss": 0.0147, + "step": 8706 + }, + { + "epoch": 2.9076640507597262, + "grad_norm": 0.24956870936206352, + "learning_rate": 2.853293813197766e-08, + "loss": 0.0131, + "step": 8707 + }, + { + "epoch": 2.907997996326599, + "grad_norm": 0.33572656869942524, + "learning_rate": 2.8325998949314536e-08, + "loss": 0.0148, + "step": 8708 + }, + { + "epoch": 2.908331941893471, + "grad_norm": 0.43030877178110216, + "learning_rate": 2.811981079654913e-08, + "loss": 0.0195, + "step": 8709 + }, + { + "epoch": 2.908665887460344, + "grad_norm": 0.30389642896195596, + "learning_rate": 2.7914373704827634e-08, + "loss": 0.0113, + "step": 8710 + }, + { + "epoch": 2.9089998330272167, + "grad_norm": 0.4144430579268167, + "learning_rate": 2.7709687705185227e-08, + "loss": 0.021, + "step": 8711 + }, + { + "epoch": 2.909333778594089, + "grad_norm": 0.28878617439074666, + "learning_rate": 2.7505752828541065e-08, + "loss": 0.0112, + "step": 8712 + }, + { + "epoch": 2.9096677241609616, + "grad_norm": 0.3441762591137517, + "learning_rate": 2.730256910570217e-08, + "loss": 0.0183, + "step": 8713 + }, + { + "epoch": 2.9100016697278344, + "grad_norm": 0.4018551305354934, + "learning_rate": 2.7100136567361767e-08, + "loss": 0.0203, + "step": 8714 + }, + { + "epoch": 2.910335615294707, + "grad_norm": 0.2742731362558585, + "learning_rate": 2.689845524409984e-08, + "loss": 0.0145, + "step": 8715 + }, + { + "epoch": 2.91066956086158, + "grad_norm": 0.2640693293958381, + "learning_rate": 2.6697525166382575e-08, + "loss": 0.015, + "step": 8716 + }, + { + "epoch": 2.911003506428452, + "grad_norm": 0.20604488465997525, + "learning_rate": 2.649734636456236e-08, + "loss": 0.0087, + "step": 8717 + }, + { + "epoch": 2.911337451995325, + "grad_norm": 0.2602360261733091, + "learning_rate": 2.629791886888e-08, + "loss": 0.0134, + "step": 8718 + }, + { + "epoch": 2.9116713975621975, + "grad_norm": 0.23131539403434442, + "learning_rate": 2.6099242709459737e-08, + "loss": 0.0127, + "step": 8719 + }, + { + "epoch": 2.91200534312907, + "grad_norm": 0.33407922563788495, + "learning_rate": 2.5901317916314783e-08, + "loss": 0.0154, + "step": 8720 + }, + { + "epoch": 2.9123392886959425, + "grad_norm": 0.31518587321229735, + "learning_rate": 2.5704144519344e-08, + "loss": 0.0178, + "step": 8721 + }, + { + "epoch": 2.912673234262815, + "grad_norm": 0.2994628741166894, + "learning_rate": 2.5507722548332446e-08, + "loss": 0.0125, + "step": 8722 + }, + { + "epoch": 2.9130071798296875, + "grad_norm": 0.36807659410784077, + "learning_rate": 2.5312052032952505e-08, + "loss": 0.0134, + "step": 8723 + }, + { + "epoch": 2.91334112539656, + "grad_norm": 0.24185584342283248, + "learning_rate": 2.5117133002762196e-08, + "loss": 0.0104, + "step": 8724 + }, + { + "epoch": 2.913675070963433, + "grad_norm": 0.25499478938167525, + "learning_rate": 2.492296548720574e-08, + "loss": 0.0113, + "step": 8725 + }, + { + "epoch": 2.9140090165303056, + "grad_norm": 0.2535233257687075, + "learning_rate": 2.4729549515615235e-08, + "loss": 0.0128, + "step": 8726 + }, + { + "epoch": 2.9143429620971784, + "grad_norm": 0.28416792676128777, + "learning_rate": 2.453688511720842e-08, + "loss": 0.0099, + "step": 8727 + }, + { + "epoch": 2.9146769076640506, + "grad_norm": 0.3454984001255326, + "learning_rate": 2.4344972321089234e-08, + "loss": 0.016, + "step": 8728 + }, + { + "epoch": 2.9150108532309233, + "grad_norm": 0.26467789417597154, + "learning_rate": 2.415381115624782e-08, + "loss": 0.0105, + "step": 8729 + }, + { + "epoch": 2.915344798797796, + "grad_norm": 0.28623993926336866, + "learning_rate": 2.3963401651562747e-08, + "loss": 0.0131, + "step": 8730 + }, + { + "epoch": 2.9156787443646683, + "grad_norm": 0.22363555013131667, + "learning_rate": 2.3773743835796558e-08, + "loss": 0.0109, + "step": 8731 + }, + { + "epoch": 2.916012689931541, + "grad_norm": 0.26112782302038295, + "learning_rate": 2.358483773759912e-08, + "loss": 0.0101, + "step": 8732 + }, + { + "epoch": 2.9163466354984138, + "grad_norm": 0.3250892536190527, + "learning_rate": 2.33966833855076e-08, + "loss": 0.0147, + "step": 8733 + }, + { + "epoch": 2.9166805810652865, + "grad_norm": 0.30163415328018506, + "learning_rate": 2.320928080794482e-08, + "loss": 0.0125, + "step": 8734 + }, + { + "epoch": 2.917014526632159, + "grad_norm": 0.3275314454711843, + "learning_rate": 2.3022630033219807e-08, + "loss": 0.0131, + "step": 8735 + }, + { + "epoch": 2.9173484721990315, + "grad_norm": 0.37484526774275956, + "learning_rate": 2.2836731089528886e-08, + "loss": 0.0179, + "step": 8736 + }, + { + "epoch": 2.917682417765904, + "grad_norm": 0.3665810232642229, + "learning_rate": 2.2651584004953485e-08, + "loss": 0.0206, + "step": 8737 + }, + { + "epoch": 2.918016363332777, + "grad_norm": 0.34566664698549754, + "learning_rate": 2.2467188807462902e-08, + "loss": 0.0182, + "step": 8738 + }, + { + "epoch": 2.918350308899649, + "grad_norm": 0.2779653906613591, + "learning_rate": 2.2283545524912075e-08, + "loss": 0.0125, + "step": 8739 + }, + { + "epoch": 2.918684254466522, + "grad_norm": 0.4214386655749725, + "learning_rate": 2.210065418504215e-08, + "loss": 0.0313, + "step": 8740 + }, + { + "epoch": 2.9190182000333946, + "grad_norm": 0.26226303616696295, + "learning_rate": 2.1918514815481572e-08, + "loss": 0.0103, + "step": 8741 + }, + { + "epoch": 2.919352145600267, + "grad_norm": 0.23341419489962137, + "learning_rate": 2.17371274437439e-08, + "loss": 0.0153, + "step": 8742 + }, + { + "epoch": 2.9196860911671396, + "grad_norm": 0.3488548374087801, + "learning_rate": 2.155649209723054e-08, + "loss": 0.0196, + "step": 8743 + }, + { + "epoch": 2.9200200367340123, + "grad_norm": 0.2935287958576697, + "learning_rate": 2.137660880322856e-08, + "loss": 0.0099, + "step": 8744 + }, + { + "epoch": 2.920353982300885, + "grad_norm": 0.25887511113137146, + "learning_rate": 2.1197477588910666e-08, + "loss": 0.0111, + "step": 8745 + }, + { + "epoch": 2.9206879278677578, + "grad_norm": 0.32883151961491597, + "learning_rate": 2.101909848133743e-08, + "loss": 0.0164, + "step": 8746 + }, + { + "epoch": 2.92102187343463, + "grad_norm": 0.20466615117086215, + "learning_rate": 2.0841471507455635e-08, + "loss": 0.0074, + "step": 8747 + }, + { + "epoch": 2.9213558190015028, + "grad_norm": 0.3481873657204859, + "learning_rate": 2.0664596694096596e-08, + "loss": 0.016, + "step": 8748 + }, + { + "epoch": 2.9216897645683755, + "grad_norm": 0.2948192719071884, + "learning_rate": 2.0488474067980045e-08, + "loss": 0.0212, + "step": 8749 + }, + { + "epoch": 2.9220237101352478, + "grad_norm": 0.3409574930296834, + "learning_rate": 2.0313103655711373e-08, + "loss": 0.0124, + "step": 8750 + }, + { + "epoch": 2.9223576557021205, + "grad_norm": 0.3038855744380676, + "learning_rate": 2.0138485483782723e-08, + "loss": 0.0106, + "step": 8751 + }, + { + "epoch": 2.922691601268993, + "grad_norm": 0.3159034059422304, + "learning_rate": 1.996461957857132e-08, + "loss": 0.0138, + "step": 8752 + }, + { + "epoch": 2.923025546835866, + "grad_norm": 0.34065442628863085, + "learning_rate": 1.9791505966342273e-08, + "loss": 0.0164, + "step": 8753 + }, + { + "epoch": 2.9233594924027386, + "grad_norm": 0.28514155172896277, + "learning_rate": 1.9619144673246325e-08, + "loss": 0.0149, + "step": 8754 + }, + { + "epoch": 2.923693437969611, + "grad_norm": 0.3061562221831954, + "learning_rate": 1.9447535725320987e-08, + "loss": 0.0148, + "step": 8755 + }, + { + "epoch": 2.9240273835364836, + "grad_norm": 0.3602103456345199, + "learning_rate": 1.9276679148488854e-08, + "loss": 0.0149, + "step": 8756 + }, + { + "epoch": 2.9243613291033563, + "grad_norm": 0.3478663526829974, + "learning_rate": 1.9106574968560943e-08, + "loss": 0.0145, + "step": 8757 + }, + { + "epoch": 2.9246952746702286, + "grad_norm": 0.2915040935327554, + "learning_rate": 1.8937223211232257e-08, + "loss": 0.0156, + "step": 8758 + }, + { + "epoch": 2.9250292202371013, + "grad_norm": 0.3098722563320424, + "learning_rate": 1.876862390208678e-08, + "loss": 0.0134, + "step": 8759 + }, + { + "epoch": 2.925363165803974, + "grad_norm": 0.28094944512030523, + "learning_rate": 1.8600777066593023e-08, + "loss": 0.0184, + "step": 8760 + }, + { + "epoch": 2.9256971113708463, + "grad_norm": 0.2797787627219745, + "learning_rate": 1.8433682730105706e-08, + "loss": 0.0146, + "step": 8761 + }, + { + "epoch": 2.926031056937719, + "grad_norm": 0.2576842763659107, + "learning_rate": 1.8267340917866306e-08, + "loss": 0.0123, + "step": 8762 + }, + { + "epoch": 2.9263650025045918, + "grad_norm": 0.31929204793777843, + "learning_rate": 1.8101751655003053e-08, + "loss": 0.0137, + "step": 8763 + }, + { + "epoch": 2.9266989480714645, + "grad_norm": 0.2631502610137791, + "learning_rate": 1.793691496653094e-08, + "loss": 0.0155, + "step": 8764 + }, + { + "epoch": 2.927032893638337, + "grad_norm": 0.2625030484537923, + "learning_rate": 1.7772830877348933e-08, + "loss": 0.0099, + "step": 8765 + }, + { + "epoch": 2.9273668392052095, + "grad_norm": 0.2231807882324196, + "learning_rate": 1.760949941224499e-08, + "loss": 0.0094, + "step": 8766 + }, + { + "epoch": 2.927700784772082, + "grad_norm": 0.31845048629295863, + "learning_rate": 1.7446920595892147e-08, + "loss": 0.016, + "step": 8767 + }, + { + "epoch": 2.928034730338955, + "grad_norm": 0.2612939278961106, + "learning_rate": 1.7285094452849095e-08, + "loss": 0.0114, + "step": 8768 + }, + { + "epoch": 2.928368675905827, + "grad_norm": 0.3775987112064319, + "learning_rate": 1.7124021007562385e-08, + "loss": 0.0098, + "step": 8769 + }, + { + "epoch": 2.9287026214727, + "grad_norm": 0.31509669595507855, + "learning_rate": 1.696370028436367e-08, + "loss": 0.0143, + "step": 8770 + }, + { + "epoch": 2.9290365670395726, + "grad_norm": 0.29480089335483095, + "learning_rate": 1.6804132307471354e-08, + "loss": 0.0099, + "step": 8771 + }, + { + "epoch": 2.929370512606445, + "grad_norm": 0.2897833691487252, + "learning_rate": 1.6645317100990044e-08, + "loss": 0.0128, + "step": 8772 + }, + { + "epoch": 2.9297044581733176, + "grad_norm": 0.36133706228333407, + "learning_rate": 1.6487254688910546e-08, + "loss": 0.0181, + "step": 8773 + }, + { + "epoch": 2.9300384037401903, + "grad_norm": 0.2404834580281326, + "learning_rate": 1.6329945095110435e-08, + "loss": 0.0102, + "step": 8774 + }, + { + "epoch": 2.930372349307063, + "grad_norm": 0.27565874276431335, + "learning_rate": 1.6173388343352915e-08, + "loss": 0.0248, + "step": 8775 + }, + { + "epoch": 2.9307062948739357, + "grad_norm": 0.32454303115924815, + "learning_rate": 1.601758445728796e-08, + "loss": 0.0101, + "step": 8776 + }, + { + "epoch": 2.931040240440808, + "grad_norm": 0.3657214541168895, + "learning_rate": 1.586253346045119e-08, + "loss": 0.0152, + "step": 8777 + }, + { + "epoch": 2.9313741860076807, + "grad_norm": 0.25646527488828263, + "learning_rate": 1.570823537626498e-08, + "loss": 0.0147, + "step": 8778 + }, + { + "epoch": 2.9317081315745535, + "grad_norm": 0.25589474577330984, + "learning_rate": 1.5554690228037905e-08, + "loss": 0.0096, + "step": 8779 + }, + { + "epoch": 2.9320420771414257, + "grad_norm": 0.21896462623009297, + "learning_rate": 1.5401898038964748e-08, + "loss": 0.0093, + "step": 8780 + }, + { + "epoch": 2.9323760227082984, + "grad_norm": 0.2189527789194369, + "learning_rate": 1.5249858832126486e-08, + "loss": 0.008, + "step": 8781 + }, + { + "epoch": 2.932709968275171, + "grad_norm": 0.32614187488584695, + "learning_rate": 1.5098572630491414e-08, + "loss": 0.0156, + "step": 8782 + }, + { + "epoch": 2.933043913842044, + "grad_norm": 0.2768713733398159, + "learning_rate": 1.4948039456911256e-08, + "loss": 0.0116, + "step": 8783 + }, + { + "epoch": 2.9333778594089166, + "grad_norm": 0.33250918763287496, + "learning_rate": 1.4798259334127263e-08, + "loss": 0.0132, + "step": 8784 + }, + { + "epoch": 2.933711804975789, + "grad_norm": 0.30370025576747817, + "learning_rate": 1.4649232284765225e-08, + "loss": 0.0128, + "step": 8785 + }, + { + "epoch": 2.9340457505426616, + "grad_norm": 0.2236663279924463, + "learning_rate": 1.4500958331337134e-08, + "loss": 0.0083, + "step": 8786 + }, + { + "epoch": 2.9343796961095343, + "grad_norm": 0.2681366571156902, + "learning_rate": 1.435343749624174e-08, + "loss": 0.012, + "step": 8787 + }, + { + "epoch": 2.9347136416764066, + "grad_norm": 0.2907583918296267, + "learning_rate": 1.420666980176344e-08, + "loss": 0.0115, + "step": 8788 + }, + { + "epoch": 2.9350475872432793, + "grad_norm": 0.47247866287833307, + "learning_rate": 1.4060655270073387e-08, + "loss": 0.0155, + "step": 8789 + }, + { + "epoch": 2.935381532810152, + "grad_norm": 0.2662172562483003, + "learning_rate": 1.3915393923228936e-08, + "loss": 0.0123, + "step": 8790 + }, + { + "epoch": 2.9357154783770243, + "grad_norm": 0.250694005734805, + "learning_rate": 1.3770885783173649e-08, + "loss": 0.009, + "step": 8791 + }, + { + "epoch": 2.936049423943897, + "grad_norm": 0.2936251125656518, + "learning_rate": 1.3627130871737282e-08, + "loss": 0.012, + "step": 8792 + }, + { + "epoch": 2.9363833695107697, + "grad_norm": 0.25562172619410156, + "learning_rate": 1.3484129210635243e-08, + "loss": 0.0117, + "step": 8793 + }, + { + "epoch": 2.9367173150776424, + "grad_norm": 0.3303159384027027, + "learning_rate": 1.3341880821469699e-08, + "loss": 0.0179, + "step": 8794 + }, + { + "epoch": 2.937051260644515, + "grad_norm": 0.2584599284615368, + "learning_rate": 1.3200385725729014e-08, + "loss": 0.0139, + "step": 8795 + }, + { + "epoch": 2.9373852062113874, + "grad_norm": 0.35616793549975456, + "learning_rate": 1.3059643944787759e-08, + "loss": 0.0105, + "step": 8796 + }, + { + "epoch": 2.93771915177826, + "grad_norm": 0.2935202521427242, + "learning_rate": 1.2919655499906703e-08, + "loss": 0.012, + "step": 8797 + }, + { + "epoch": 2.938053097345133, + "grad_norm": 0.28669489947757315, + "learning_rate": 1.2780420412232263e-08, + "loss": 0.0101, + "step": 8798 + }, + { + "epoch": 2.938387042912005, + "grad_norm": 0.5376356661696571, + "learning_rate": 1.2641938702798174e-08, + "loss": 0.0175, + "step": 8799 + }, + { + "epoch": 2.938720988478878, + "grad_norm": 0.23862457416839503, + "learning_rate": 1.2504210392523808e-08, + "loss": 0.0089, + "step": 8800 + }, + { + "epoch": 2.9390549340457506, + "grad_norm": 0.3331618786605174, + "learning_rate": 1.2367235502214192e-08, + "loss": 0.0147, + "step": 8801 + }, + { + "epoch": 2.9393888796126233, + "grad_norm": 0.27056080806631566, + "learning_rate": 1.2231014052560553e-08, + "loss": 0.0117, + "step": 8802 + }, + { + "epoch": 2.939722825179496, + "grad_norm": 0.312923999169443, + "learning_rate": 1.2095546064141982e-08, + "loss": 0.016, + "step": 8803 + }, + { + "epoch": 2.9400567707463683, + "grad_norm": 0.37431215586823213, + "learning_rate": 1.196083155742156e-08, + "loss": 0.0146, + "step": 8804 + }, + { + "epoch": 2.940390716313241, + "grad_norm": 0.2726460575793837, + "learning_rate": 1.1826870552749669e-08, + "loss": 0.0118, + "step": 8805 + }, + { + "epoch": 2.9407246618801137, + "grad_norm": 0.2816191408205805, + "learning_rate": 1.169366307036346e-08, + "loss": 0.0135, + "step": 8806 + }, + { + "epoch": 2.941058607446986, + "grad_norm": 0.25846786258633725, + "learning_rate": 1.1561209130384055e-08, + "loss": 0.0094, + "step": 8807 + }, + { + "epoch": 2.9413925530138587, + "grad_norm": 0.22722585849230448, + "learning_rate": 1.1429508752821561e-08, + "loss": 0.0111, + "step": 8808 + }, + { + "epoch": 2.9417264985807314, + "grad_norm": 0.2970481442227012, + "learning_rate": 1.1298561957570065e-08, + "loss": 0.0096, + "step": 8809 + }, + { + "epoch": 2.9420604441476037, + "grad_norm": 0.38339991784619254, + "learning_rate": 1.1168368764410408e-08, + "loss": 0.0209, + "step": 8810 + }, + { + "epoch": 2.9423943897144764, + "grad_norm": 0.25604073011029854, + "learning_rate": 1.103892919301075e-08, + "loss": 0.0115, + "step": 8811 + }, + { + "epoch": 2.942728335281349, + "grad_norm": 0.2727229391793246, + "learning_rate": 1.0910243262923781e-08, + "loss": 0.0152, + "step": 8812 + }, + { + "epoch": 2.943062280848222, + "grad_norm": 0.30034640582846706, + "learning_rate": 1.0782310993589506e-08, + "loss": 0.0147, + "step": 8813 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.2684617169656456, + "learning_rate": 1.0655132404333024e-08, + "loss": 0.0116, + "step": 8814 + }, + { + "epoch": 2.943730171981967, + "grad_norm": 0.33602067958668014, + "learning_rate": 1.0528707514366743e-08, + "loss": 0.0198, + "step": 8815 + }, + { + "epoch": 2.9440641175488396, + "grad_norm": 0.2827167620515951, + "learning_rate": 1.0403036342787609e-08, + "loss": 0.0132, + "step": 8816 + }, + { + "epoch": 2.9443980631157123, + "grad_norm": 0.180604425398328, + "learning_rate": 1.0278118908580992e-08, + "loss": 0.0057, + "step": 8817 + }, + { + "epoch": 2.9447320086825846, + "grad_norm": 0.3180144117107635, + "learning_rate": 1.0153955230616241e-08, + "loss": 0.0146, + "step": 8818 + }, + { + "epoch": 2.9450659542494573, + "grad_norm": 0.2736349316275919, + "learning_rate": 1.0030545327650576e-08, + "loss": 0.0138, + "step": 8819 + }, + { + "epoch": 2.94539989981633, + "grad_norm": 0.2686584094563105, + "learning_rate": 9.907889218325751e-09, + "loss": 0.0172, + "step": 8820 + }, + { + "epoch": 2.9457338453832023, + "grad_norm": 0.32138877629272083, + "learning_rate": 9.78598692117083e-09, + "loss": 0.0206, + "step": 8821 + }, + { + "epoch": 2.946067790950075, + "grad_norm": 0.25011918184665605, + "learning_rate": 9.664838454599978e-09, + "loss": 0.0115, + "step": 8822 + }, + { + "epoch": 2.9464017365169477, + "grad_norm": 0.26070132075448404, + "learning_rate": 9.544443836914664e-09, + "loss": 0.0139, + "step": 8823 + }, + { + "epoch": 2.9467356820838204, + "grad_norm": 0.30632666567619926, + "learning_rate": 9.42480308630256e-09, + "loss": 0.0135, + "step": 8824 + }, + { + "epoch": 2.947069627650693, + "grad_norm": 0.3313239726533775, + "learning_rate": 9.30591622083532e-09, + "loss": 0.0171, + "step": 8825 + }, + { + "epoch": 2.9474035732175654, + "grad_norm": 0.2797843695228306, + "learning_rate": 9.187783258473027e-09, + "loss": 0.0086, + "step": 8826 + }, + { + "epoch": 2.947737518784438, + "grad_norm": 0.5733809244732592, + "learning_rate": 9.070404217061402e-09, + "loss": 0.0195, + "step": 8827 + }, + { + "epoch": 2.948071464351311, + "grad_norm": 0.26359051517832066, + "learning_rate": 8.953779114331262e-09, + "loss": 0.0088, + "step": 8828 + }, + { + "epoch": 2.948405409918183, + "grad_norm": 0.4198663123835468, + "learning_rate": 8.837907967900183e-09, + "loss": 0.0216, + "step": 8829 + }, + { + "epoch": 2.948739355485056, + "grad_norm": 0.342304415582743, + "learning_rate": 8.722790795272495e-09, + "loss": 0.0161, + "step": 8830 + }, + { + "epoch": 2.9490733010519286, + "grad_norm": 0.3091524362828571, + "learning_rate": 8.608427613837622e-09, + "loss": 0.0211, + "step": 8831 + }, + { + "epoch": 2.9494072466188013, + "grad_norm": 0.34980570394693283, + "learning_rate": 8.494818440871189e-09, + "loss": 0.0186, + "step": 8832 + }, + { + "epoch": 2.949741192185674, + "grad_norm": 0.32398566768990994, + "learning_rate": 8.381963293535577e-09, + "loss": 0.0129, + "step": 8833 + }, + { + "epoch": 2.9500751377525463, + "grad_norm": 0.21247373849122486, + "learning_rate": 8.269862188879374e-09, + "loss": 0.0132, + "step": 8834 + }, + { + "epoch": 2.950409083319419, + "grad_norm": 0.3118763549997627, + "learning_rate": 8.158515143835698e-09, + "loss": 0.0124, + "step": 8835 + }, + { + "epoch": 2.9507430288862917, + "grad_norm": 0.2520505010345685, + "learning_rate": 8.047922175225542e-09, + "loss": 0.0107, + "step": 8836 + }, + { + "epoch": 2.951076974453164, + "grad_norm": 0.2962583648791619, + "learning_rate": 7.938083299754984e-09, + "loss": 0.012, + "step": 8837 + }, + { + "epoch": 2.9514109200200367, + "grad_norm": 0.3061490935777087, + "learning_rate": 7.828998534016308e-09, + "loss": 0.0167, + "step": 8838 + }, + { + "epoch": 2.9517448655869094, + "grad_norm": 0.34722479934123923, + "learning_rate": 7.720667894488554e-09, + "loss": 0.0184, + "step": 8839 + }, + { + "epoch": 2.9520788111537817, + "grad_norm": 0.25616124307887617, + "learning_rate": 7.613091397535855e-09, + "loss": 0.0101, + "step": 8840 + }, + { + "epoch": 2.9524127567206544, + "grad_norm": 0.2498636529316553, + "learning_rate": 7.506269059409654e-09, + "loss": 0.0119, + "step": 8841 + }, + { + "epoch": 2.952746702287527, + "grad_norm": 0.24209862700190157, + "learning_rate": 7.400200896245935e-09, + "loss": 0.0115, + "step": 8842 + }, + { + "epoch": 2.9530806478544, + "grad_norm": 0.3096880800803158, + "learning_rate": 7.29488692406799e-09, + "loss": 0.0143, + "step": 8843 + }, + { + "epoch": 2.9534145934212725, + "grad_norm": 0.2995089814872804, + "learning_rate": 7.190327158784205e-09, + "loss": 0.0144, + "step": 8844 + }, + { + "epoch": 2.953748538988145, + "grad_norm": 0.28098736177119993, + "learning_rate": 7.0865216161902785e-09, + "loss": 0.0145, + "step": 8845 + }, + { + "epoch": 2.9540824845550175, + "grad_norm": 0.25440121926030973, + "learning_rate": 6.983470311967e-09, + "loss": 0.0108, + "step": 8846 + }, + { + "epoch": 2.9544164301218903, + "grad_norm": 0.26911353456543136, + "learning_rate": 6.881173261680807e-09, + "loss": 0.0113, + "step": 8847 + }, + { + "epoch": 2.9547503756887625, + "grad_norm": 0.2838865771790412, + "learning_rate": 6.779630480786004e-09, + "loss": 0.0136, + "step": 8848 + }, + { + "epoch": 2.9550843212556352, + "grad_norm": 0.2503857183749329, + "learning_rate": 6.678841984621432e-09, + "loss": 0.0109, + "step": 8849 + }, + { + "epoch": 2.955418266822508, + "grad_norm": 0.30434982450600045, + "learning_rate": 6.578807788411579e-09, + "loss": 0.0151, + "step": 8850 + }, + { + "epoch": 2.9557522123893807, + "grad_norm": 0.2573602736956534, + "learning_rate": 6.479527907268801e-09, + "loss": 0.0127, + "step": 8851 + }, + { + "epoch": 2.9560861579562534, + "grad_norm": 0.29336908656839916, + "learning_rate": 6.381002356189991e-09, + "loss": 0.0118, + "step": 8852 + }, + { + "epoch": 2.9564201035231257, + "grad_norm": 0.31956946539516207, + "learning_rate": 6.283231150058799e-09, + "loss": 0.012, + "step": 8853 + }, + { + "epoch": 2.9567540490899984, + "grad_norm": 0.26904002432170365, + "learning_rate": 6.186214303645077e-09, + "loss": 0.0096, + "step": 8854 + }, + { + "epoch": 2.957087994656871, + "grad_norm": 0.2817797657289491, + "learning_rate": 6.0899518316032135e-09, + "loss": 0.0144, + "step": 8855 + }, + { + "epoch": 2.9574219402237434, + "grad_norm": 0.2891315774175175, + "learning_rate": 5.99444374847602e-09, + "loss": 0.0121, + "step": 8856 + }, + { + "epoch": 2.957755885790616, + "grad_norm": 0.23546706675110624, + "learning_rate": 5.899690068690289e-09, + "loss": 0.0118, + "step": 8857 + }, + { + "epoch": 2.958089831357489, + "grad_norm": 0.2503079524048365, + "learning_rate": 5.805690806560127e-09, + "loss": 0.0109, + "step": 8858 + }, + { + "epoch": 2.958423776924361, + "grad_norm": 0.28038767855348384, + "learning_rate": 5.712445976285286e-09, + "loss": 0.014, + "step": 8859 + }, + { + "epoch": 2.958757722491234, + "grad_norm": 0.3581868050776321, + "learning_rate": 5.619955591951165e-09, + "loss": 0.0271, + "step": 8860 + }, + { + "epoch": 2.9590916680581065, + "grad_norm": 0.28498615376492137, + "learning_rate": 5.528219667529921e-09, + "loss": 0.02, + "step": 8861 + }, + { + "epoch": 2.9594256136249792, + "grad_norm": 0.21941935688277253, + "learning_rate": 5.437238216878804e-09, + "loss": 0.0085, + "step": 8862 + }, + { + "epoch": 2.959759559191852, + "grad_norm": 0.30634642551595337, + "learning_rate": 5.347011253741819e-09, + "loss": 0.0099, + "step": 8863 + }, + { + "epoch": 2.9600935047587242, + "grad_norm": 0.2477810188915607, + "learning_rate": 5.257538791749173e-09, + "loss": 0.0125, + "step": 8864 + }, + { + "epoch": 2.960427450325597, + "grad_norm": 0.27414438700160665, + "learning_rate": 5.168820844416167e-09, + "loss": 0.0136, + "step": 8865 + }, + { + "epoch": 2.9607613958924697, + "grad_norm": 0.22304751891524885, + "learning_rate": 5.080857425145413e-09, + "loss": 0.0096, + "step": 8866 + }, + { + "epoch": 2.961095341459342, + "grad_norm": 0.25385290361558166, + "learning_rate": 4.993648547224062e-09, + "loss": 0.0133, + "step": 8867 + }, + { + "epoch": 2.9614292870262147, + "grad_norm": 0.309799338589133, + "learning_rate": 4.907194223826572e-09, + "loss": 0.017, + "step": 8868 + }, + { + "epoch": 2.9617632325930874, + "grad_norm": 0.309199565152647, + "learning_rate": 4.8214944680125e-09, + "loss": 0.0138, + "step": 8869 + }, + { + "epoch": 2.9620971781599597, + "grad_norm": 0.31027870939859425, + "learning_rate": 4.736549292728154e-09, + "loss": 0.0145, + "step": 8870 + }, + { + "epoch": 2.9624311237268324, + "grad_norm": 0.39834860507149, + "learning_rate": 4.652358710805494e-09, + "loss": 0.0272, + "step": 8871 + }, + { + "epoch": 2.962765069293705, + "grad_norm": 0.3028396026405605, + "learning_rate": 4.5689227349626775e-09, + "loss": 0.013, + "step": 8872 + }, + { + "epoch": 2.963099014860578, + "grad_norm": 0.24185272703681146, + "learning_rate": 4.486241377802958e-09, + "loss": 0.0113, + "step": 8873 + }, + { + "epoch": 2.9634329604274505, + "grad_norm": 0.27679245352313514, + "learning_rate": 4.404314651816344e-09, + "loss": 0.0178, + "step": 8874 + }, + { + "epoch": 2.963766905994323, + "grad_norm": 0.3055707292428465, + "learning_rate": 4.323142569379602e-09, + "loss": 0.016, + "step": 8875 + }, + { + "epoch": 2.9641008515611955, + "grad_norm": 0.3105337545534266, + "learning_rate": 4.242725142754589e-09, + "loss": 0.0128, + "step": 8876 + }, + { + "epoch": 2.9644347971280682, + "grad_norm": 0.2042614322041993, + "learning_rate": 4.163062384088812e-09, + "loss": 0.0067, + "step": 8877 + }, + { + "epoch": 2.9647687426949405, + "grad_norm": 0.42435157442567994, + "learning_rate": 4.0841543054165324e-09, + "loss": 0.0211, + "step": 8878 + }, + { + "epoch": 2.9651026882618132, + "grad_norm": 0.2605864036382383, + "learning_rate": 4.006000918658215e-09, + "loss": 0.0103, + "step": 8879 + }, + { + "epoch": 2.965436633828686, + "grad_norm": 0.2542717122667056, + "learning_rate": 3.928602235618861e-09, + "loss": 0.012, + "step": 8880 + }, + { + "epoch": 2.9657705793955587, + "grad_norm": 0.3105241541659346, + "learning_rate": 3.851958267990785e-09, + "loss": 0.0117, + "step": 8881 + }, + { + "epoch": 2.9661045249624314, + "grad_norm": 0.2810087337567596, + "learning_rate": 3.776069027352503e-09, + "loss": 0.0121, + "step": 8882 + }, + { + "epoch": 2.9664384705293037, + "grad_norm": 0.3579953796513702, + "learning_rate": 3.700934525167621e-09, + "loss": 0.0135, + "step": 8883 + }, + { + "epoch": 2.9667724160961764, + "grad_norm": 0.1942049991957917, + "learning_rate": 3.626554772786506e-09, + "loss": 0.0072, + "step": 8884 + }, + { + "epoch": 2.967106361663049, + "grad_norm": 0.2874251572794949, + "learning_rate": 3.5529297814440587e-09, + "loss": 0.0142, + "step": 8885 + }, + { + "epoch": 2.9674403072299214, + "grad_norm": 0.3079602859417745, + "learning_rate": 3.4800595622630497e-09, + "loss": 0.0164, + "step": 8886 + }, + { + "epoch": 2.967774252796794, + "grad_norm": 0.284304055545771, + "learning_rate": 3.407944126251339e-09, + "loss": 0.0116, + "step": 8887 + }, + { + "epoch": 2.968108198363667, + "grad_norm": 0.3024477077304248, + "learning_rate": 3.336583484301881e-09, + "loss": 0.0124, + "step": 8888 + }, + { + "epoch": 2.968442143930539, + "grad_norm": 0.3452235143357418, + "learning_rate": 3.2659776471960505e-09, + "loss": 0.0243, + "step": 8889 + }, + { + "epoch": 2.968776089497412, + "grad_norm": 0.19849054713234496, + "learning_rate": 3.19612662559865e-09, + "loss": 0.0085, + "step": 8890 + }, + { + "epoch": 2.9691100350642845, + "grad_norm": 0.3413891758042656, + "learning_rate": 3.1270304300617947e-09, + "loss": 0.0185, + "step": 8891 + }, + { + "epoch": 2.969443980631157, + "grad_norm": 0.27201839017148005, + "learning_rate": 3.0586890710232465e-09, + "loss": 0.0154, + "step": 8892 + }, + { + "epoch": 2.96977792619803, + "grad_norm": 0.20336426410852446, + "learning_rate": 2.9911025588069685e-09, + "loss": 0.009, + "step": 8893 + }, + { + "epoch": 2.970111871764902, + "grad_norm": 0.2243150327657297, + "learning_rate": 2.9242709036225723e-09, + "loss": 0.0104, + "step": 8894 + }, + { + "epoch": 2.970445817331775, + "grad_norm": 0.28980795488718264, + "learning_rate": 2.858194115565871e-09, + "loss": 0.0136, + "step": 8895 + }, + { + "epoch": 2.9707797628986476, + "grad_norm": 0.3133702871500427, + "learning_rate": 2.7928722046177692e-09, + "loss": 0.0151, + "step": 8896 + }, + { + "epoch": 2.97111370846552, + "grad_norm": 0.25170122840614795, + "learning_rate": 2.7283051806470394e-09, + "loss": 0.0108, + "step": 8897 + }, + { + "epoch": 2.9714476540323926, + "grad_norm": 0.3569107012442742, + "learning_rate": 2.664493053406436e-09, + "loss": 0.0126, + "step": 8898 + }, + { + "epoch": 2.9717815995992654, + "grad_norm": 0.3503953913843525, + "learning_rate": 2.6014358325360256e-09, + "loss": 0.0171, + "step": 8899 + }, + { + "epoch": 2.972115545166138, + "grad_norm": 0.2662913521255247, + "learning_rate": 2.5391335275609665e-09, + "loss": 0.0107, + "step": 8900 + }, + { + "epoch": 2.972449490733011, + "grad_norm": 0.31383737687940944, + "learning_rate": 2.4775861478937293e-09, + "loss": 0.0137, + "step": 8901 + }, + { + "epoch": 2.972783436299883, + "grad_norm": 0.27899930926298144, + "learning_rate": 2.416793702830211e-09, + "loss": 0.0096, + "step": 8902 + }, + { + "epoch": 2.973117381866756, + "grad_norm": 0.1619463579430933, + "learning_rate": 2.3567562015547328e-09, + "loss": 0.0066, + "step": 8903 + }, + { + "epoch": 2.9734513274336285, + "grad_norm": 0.2975755401252966, + "learning_rate": 2.297473653136706e-09, + "loss": 0.0145, + "step": 8904 + }, + { + "epoch": 2.9737852730005008, + "grad_norm": 0.3154087999668039, + "learning_rate": 2.2389460665317443e-09, + "loss": 0.0158, + "step": 8905 + }, + { + "epoch": 2.9741192185673735, + "grad_norm": 0.3746830196540353, + "learning_rate": 2.1811734505799985e-09, + "loss": 0.0244, + "step": 8906 + }, + { + "epoch": 2.974453164134246, + "grad_norm": 0.24551995169987959, + "learning_rate": 2.1241558140100426e-09, + "loss": 0.0138, + "step": 8907 + }, + { + "epoch": 2.9747871097011185, + "grad_norm": 0.309728247324327, + "learning_rate": 2.0678931654344314e-09, + "loss": 0.0104, + "step": 8908 + }, + { + "epoch": 2.975121055267991, + "grad_norm": 0.22154709674498288, + "learning_rate": 2.012385513351922e-09, + "loss": 0.0088, + "step": 8909 + }, + { + "epoch": 2.975455000834864, + "grad_norm": 0.28139571562754934, + "learning_rate": 1.9576328661480293e-09, + "loss": 0.0161, + "step": 8910 + }, + { + "epoch": 2.9757889464017366, + "grad_norm": 0.25823426789692366, + "learning_rate": 1.9036352320939146e-09, + "loss": 0.0114, + "step": 8911 + }, + { + "epoch": 2.9761228919686094, + "grad_norm": 0.3092041079081828, + "learning_rate": 1.850392619345831e-09, + "loss": 0.0135, + "step": 8912 + }, + { + "epoch": 2.9764568375354816, + "grad_norm": 0.3460372426064356, + "learning_rate": 1.7979050359479e-09, + "loss": 0.015, + "step": 8913 + }, + { + "epoch": 2.9767907831023543, + "grad_norm": 0.2979925804685489, + "learning_rate": 1.746172489828224e-09, + "loss": 0.0171, + "step": 8914 + }, + { + "epoch": 2.977124728669227, + "grad_norm": 0.23833186938846646, + "learning_rate": 1.6951949888016627e-09, + "loss": 0.0101, + "step": 8915 + }, + { + "epoch": 2.9774586742360993, + "grad_norm": 0.27949876327286516, + "learning_rate": 1.6449725405687234e-09, + "loss": 0.0106, + "step": 8916 + }, + { + "epoch": 2.977792619802972, + "grad_norm": 0.38883786417116634, + "learning_rate": 1.59550515271667e-09, + "loss": 0.0138, + "step": 8917 + }, + { + "epoch": 2.9781265653698448, + "grad_norm": 0.2897917268514722, + "learning_rate": 1.5467928327178582e-09, + "loss": 0.0123, + "step": 8918 + }, + { + "epoch": 2.978460510936717, + "grad_norm": 0.30086512667102744, + "learning_rate": 1.498835587930847e-09, + "loss": 0.0119, + "step": 8919 + }, + { + "epoch": 2.9787944565035898, + "grad_norm": 0.23115848105974932, + "learning_rate": 1.4516334256003962e-09, + "loss": 0.0116, + "step": 8920 + }, + { + "epoch": 2.9791284020704625, + "grad_norm": 0.2572533680568834, + "learning_rate": 1.4051863528563581e-09, + "loss": 0.0097, + "step": 8921 + }, + { + "epoch": 2.979462347637335, + "grad_norm": 0.22789782990855917, + "learning_rate": 1.3594943767158974e-09, + "loss": 0.0117, + "step": 8922 + }, + { + "epoch": 2.979796293204208, + "grad_norm": 0.276466857170987, + "learning_rate": 1.3145575040801605e-09, + "loss": 0.0136, + "step": 8923 + }, + { + "epoch": 2.98013023877108, + "grad_norm": 0.2251519102107459, + "learning_rate": 1.2703757417387164e-09, + "loss": 0.0101, + "step": 8924 + }, + { + "epoch": 2.980464184337953, + "grad_norm": 0.2564634115977526, + "learning_rate": 1.2269490963651154e-09, + "loss": 0.0115, + "step": 8925 + }, + { + "epoch": 2.9807981299048256, + "grad_norm": 0.2942127534403624, + "learning_rate": 1.1842775745196655e-09, + "loss": 0.0142, + "step": 8926 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 0.20510329834816302, + "learning_rate": 1.1423611826477665e-09, + "loss": 0.01, + "step": 8927 + }, + { + "epoch": 2.9814660210385706, + "grad_norm": 0.22350720799851165, + "learning_rate": 1.1011999270821305e-09, + "loss": 0.008, + "step": 8928 + }, + { + "epoch": 2.9817999666054433, + "grad_norm": 0.2701959643035672, + "learning_rate": 1.0607938140400064e-09, + "loss": 0.0117, + "step": 8929 + }, + { + "epoch": 2.982133912172316, + "grad_norm": 0.2876642994947557, + "learning_rate": 1.0211428496259557e-09, + "loss": 0.0146, + "step": 8930 + }, + { + "epoch": 2.9824678577391888, + "grad_norm": 0.2424076255433641, + "learning_rate": 9.822470398296312e-10, + "loss": 0.0085, + "step": 8931 + }, + { + "epoch": 2.982801803306061, + "grad_norm": 0.2765927673245172, + "learning_rate": 9.441063905257785e-10, + "loss": 0.0104, + "step": 8932 + }, + { + "epoch": 2.9831357488729338, + "grad_norm": 0.23925778763867475, + "learning_rate": 9.067209074770101e-10, + "loss": 0.0099, + "step": 8933 + }, + { + "epoch": 2.9834696944398065, + "grad_norm": 0.27178512887000095, + "learning_rate": 8.700905963304751e-10, + "loss": 0.018, + "step": 8934 + }, + { + "epoch": 2.9838036400066787, + "grad_norm": 0.28788672421898087, + "learning_rate": 8.342154626195254e-10, + "loss": 0.0128, + "step": 8935 + }, + { + "epoch": 2.9841375855735515, + "grad_norm": 0.31505382086792205, + "learning_rate": 7.990955117631594e-10, + "loss": 0.016, + "step": 8936 + }, + { + "epoch": 2.984471531140424, + "grad_norm": 0.29247050361386245, + "learning_rate": 7.647307490676881e-10, + "loss": 0.0108, + "step": 8937 + }, + { + "epoch": 2.9848054767072965, + "grad_norm": 0.2752972063113647, + "learning_rate": 7.311211797234041e-10, + "loss": 0.0098, + "step": 8938 + }, + { + "epoch": 2.985139422274169, + "grad_norm": 0.35351372704834705, + "learning_rate": 6.982668088079126e-10, + "loss": 0.015, + "step": 8939 + }, + { + "epoch": 2.985473367841042, + "grad_norm": 0.28399803612731106, + "learning_rate": 6.661676412844653e-10, + "loss": 0.0147, + "step": 8940 + }, + { + "epoch": 2.9858073134079146, + "grad_norm": 0.28649577636249296, + "learning_rate": 6.348236820008513e-10, + "loss": 0.0091, + "step": 8941 + }, + { + "epoch": 2.9861412589747873, + "grad_norm": 0.25405506085454094, + "learning_rate": 6.042349356932819e-10, + "loss": 0.0135, + "step": 8942 + }, + { + "epoch": 2.9864752045416596, + "grad_norm": 0.27463384914468136, + "learning_rate": 5.744014069819503e-10, + "loss": 0.0101, + "step": 8943 + }, + { + "epoch": 2.9868091501085323, + "grad_norm": 0.24347676417976405, + "learning_rate": 5.453231003732518e-10, + "loss": 0.0101, + "step": 8944 + }, + { + "epoch": 2.987143095675405, + "grad_norm": 0.2737953735825022, + "learning_rate": 5.170000202608938e-10, + "loss": 0.0123, + "step": 8945 + }, + { + "epoch": 2.9874770412422773, + "grad_norm": 0.2646011800101981, + "learning_rate": 4.894321709220106e-10, + "loss": 0.0124, + "step": 8946 + }, + { + "epoch": 2.98781098680915, + "grad_norm": 0.2703402316729183, + "learning_rate": 4.626195565221592e-10, + "loss": 0.011, + "step": 8947 + }, + { + "epoch": 2.9881449323760227, + "grad_norm": 0.38012935164309164, + "learning_rate": 4.365621811108778e-10, + "loss": 0.0187, + "step": 8948 + }, + { + "epoch": 2.9884788779428955, + "grad_norm": 0.5136560493941013, + "learning_rate": 4.112600486250173e-10, + "loss": 0.0216, + "step": 8949 + }, + { + "epoch": 2.988812823509768, + "grad_norm": 0.3051470139876893, + "learning_rate": 3.867131628865206e-10, + "loss": 0.0193, + "step": 8950 + }, + { + "epoch": 2.9891467690766405, + "grad_norm": 0.28368761760330513, + "learning_rate": 3.629215276035325e-10, + "loss": 0.0177, + "step": 8951 + }, + { + "epoch": 2.989480714643513, + "grad_norm": 0.20997699417754753, + "learning_rate": 3.3988514637040003e-10, + "loss": 0.0119, + "step": 8952 + }, + { + "epoch": 2.989814660210386, + "grad_norm": 0.24432751054133622, + "learning_rate": 3.176040226660071e-10, + "loss": 0.01, + "step": 8953 + }, + { + "epoch": 2.990148605777258, + "grad_norm": 0.2316491881808177, + "learning_rate": 2.960781598576601e-10, + "loss": 0.0097, + "step": 8954 + }, + { + "epoch": 2.990482551344131, + "grad_norm": 0.2472025433832417, + "learning_rate": 2.7530756119609204e-10, + "loss": 0.0099, + "step": 8955 + }, + { + "epoch": 2.9908164969110036, + "grad_norm": 0.31696959508144734, + "learning_rate": 2.5529222981879323e-10, + "loss": 0.0176, + "step": 8956 + }, + { + "epoch": 2.991150442477876, + "grad_norm": 0.24118441142533606, + "learning_rate": 2.360321687500111e-10, + "loss": 0.0104, + "step": 8957 + }, + { + "epoch": 2.9914843880447486, + "grad_norm": 0.2928267346519803, + "learning_rate": 2.175273808985301e-10, + "loss": 0.0134, + "step": 8958 + }, + { + "epoch": 2.9918183336116213, + "grad_norm": 0.3227256092843894, + "learning_rate": 1.9977786906044683e-10, + "loss": 0.0152, + "step": 8959 + }, + { + "epoch": 2.992152279178494, + "grad_norm": 0.22895195359200451, + "learning_rate": 1.827836359163948e-10, + "loss": 0.01, + "step": 8960 + }, + { + "epoch": 2.9924862247453667, + "grad_norm": 0.25063268294709157, + "learning_rate": 1.665446840343199e-10, + "loss": 0.0118, + "step": 8961 + }, + { + "epoch": 2.992820170312239, + "grad_norm": 0.2837991515215964, + "learning_rate": 1.5106101586614963e-10, + "loss": 0.0092, + "step": 8962 + }, + { + "epoch": 2.9931541158791117, + "grad_norm": 0.3109087909957699, + "learning_rate": 1.3633263375223414e-10, + "loss": 0.0184, + "step": 8963 + }, + { + "epoch": 2.9934880614459844, + "grad_norm": 0.28444023257986717, + "learning_rate": 1.223595399163502e-10, + "loss": 0.0114, + "step": 8964 + }, + { + "epoch": 2.9938220070128567, + "grad_norm": 0.34104403056075155, + "learning_rate": 1.091417364695868e-10, + "loss": 0.0133, + "step": 8965 + }, + { + "epoch": 2.9941559525797294, + "grad_norm": 0.27880065197401577, + "learning_rate": 9.667922540868013e-11, + "loss": 0.0136, + "step": 8966 + }, + { + "epoch": 2.994489898146602, + "grad_norm": 0.3617347223905858, + "learning_rate": 8.49720086165684e-11, + "loss": 0.0141, + "step": 8967 + }, + { + "epoch": 2.9948238437134744, + "grad_norm": 0.25073747583367095, + "learning_rate": 7.40200878618369e-11, + "loss": 0.0135, + "step": 8968 + }, + { + "epoch": 2.995157789280347, + "grad_norm": 0.3533225001730111, + "learning_rate": 6.382346479816282e-11, + "loss": 0.0177, + "step": 8969 + }, + { + "epoch": 2.99549173484722, + "grad_norm": 0.3506858654933053, + "learning_rate": 5.438214096653571e-11, + "loss": 0.0203, + "step": 8970 + }, + { + "epoch": 2.9958256804140926, + "grad_norm": 0.2517764691384077, + "learning_rate": 4.569611779248195e-11, + "loss": 0.0088, + "step": 8971 + }, + { + "epoch": 2.9961596259809653, + "grad_norm": 0.22076960965863357, + "learning_rate": 3.776539658939538e-11, + "loss": 0.0111, + "step": 8972 + }, + { + "epoch": 2.9964935715478376, + "grad_norm": 0.35316797391407945, + "learning_rate": 3.0589978553541286e-11, + "loss": 0.0281, + "step": 8973 + }, + { + "epoch": 2.9968275171147103, + "grad_norm": 0.2310932540955058, + "learning_rate": 2.416986477071781e-11, + "loss": 0.0081, + "step": 8974 + }, + { + "epoch": 2.997161462681583, + "grad_norm": 0.436589341577375, + "learning_rate": 1.850505620903942e-11, + "loss": 0.0168, + "step": 8975 + }, + { + "epoch": 2.9974954082484553, + "grad_norm": 0.2866360076380691, + "learning_rate": 1.3595553725598287e-11, + "loss": 0.0143, + "step": 8976 + }, + { + "epoch": 2.997829353815328, + "grad_norm": 0.1983579307956965, + "learning_rate": 9.441358061468286e-12, + "loss": 0.0061, + "step": 8977 + }, + { + "epoch": 2.9981632993822007, + "grad_norm": 0.24569002362550083, + "learning_rate": 6.042469843925425e-12, + "loss": 0.0122, + "step": 8978 + }, + { + "epoch": 2.9984972449490734, + "grad_norm": 0.32003891184141653, + "learning_rate": 3.398889586447851e-12, + "loss": 0.0157, + "step": 8979 + }, + { + "epoch": 2.998831190515946, + "grad_norm": 0.2915456339083111, + "learning_rate": 1.5106176892709656e-12, + "loss": 0.0155, + "step": 8980 + }, + { + "epoch": 2.9991651360828184, + "grad_norm": 0.233203274595632, + "learning_rate": 3.7765443661186283e-13, + "loss": 0.0068, + "step": 8981 + }, + { + "epoch": 2.999499081649691, + "grad_norm": 0.24824277888305468, + "learning_rate": 0.0, + "loss": 0.0116, + "step": 8982 + }, + { + "epoch": 2.999499081649691, + "eval_loss": 0.025284353643655777, + "eval_runtime": 179.6039, + "eval_samples_per_second": 112.319, + "eval_steps_per_second": 1.759, + "step": 8982 + }, + { + "epoch": 2.999499081649691, + "step": 8982, + "total_flos": 2.774679234360541e+18, + "train_loss": 0.03039964318122616, + "train_runtime": 42140.8615, + "train_samples_per_second": 27.284, + "train_steps_per_second": 0.213 + } + ], + "logging_steps": 1, + "max_steps": 8982, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.774679234360541e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}