diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,130513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 18640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.3648068669527894e-05, + "grad_norm": 0.39453125, + "learning_rate": 1.78826895565093e-09, + "loss": 2.4083, + "step": 1 + }, + { + "epoch": 0.00010729613733905579, + "grad_norm": 0.326171875, + "learning_rate": 3.57653791130186e-09, + "loss": 2.498, + "step": 2 + }, + { + "epoch": 0.0001609442060085837, + "grad_norm": 0.390625, + "learning_rate": 5.36480686695279e-09, + "loss": 2.5823, + "step": 3 + }, + { + "epoch": 0.00021459227467811158, + "grad_norm": 1.1953125, + "learning_rate": 7.15307582260372e-09, + "loss": 2.5493, + "step": 4 + }, + { + "epoch": 0.0002682403433476395, + "grad_norm": 0.419921875, + "learning_rate": 8.94134477825465e-09, + "loss": 2.5265, + "step": 5 + }, + { + "epoch": 0.0003218884120171674, + "grad_norm": 0.443359375, + "learning_rate": 1.072961373390558e-08, + "loss": 2.8562, + "step": 6 + }, + { + "epoch": 0.0003755364806866953, + "grad_norm": 0.408203125, + "learning_rate": 1.251788268955651e-08, + "loss": 2.6135, + "step": 7 + }, + { + "epoch": 0.00042918454935622315, + "grad_norm": 0.35546875, + "learning_rate": 1.430615164520744e-08, + "loss": 2.3897, + "step": 8 + }, + { + "epoch": 0.00048283261802575106, + "grad_norm": 0.3828125, + "learning_rate": 1.609442060085837e-08, + "loss": 2.4573, + "step": 9 + }, + { + "epoch": 0.000536480686695279, + "grad_norm": 0.291015625, + "learning_rate": 1.78826895565093e-08, + "loss": 2.036, + "step": 10 + }, + { + "epoch": 0.0005901287553648068, + "grad_norm": 0.98828125, + "learning_rate": 1.967095851216023e-08, + "loss": 2.801, + "step": 11 + }, + { + "epoch": 0.0006437768240343348, + "grad_norm": 0.412109375, + "learning_rate": 2.145922746781116e-08, + "loss": 2.1978, + "step": 12 + }, + { + "epoch": 0.0006974248927038626, + "grad_norm": 0.431640625, + "learning_rate": 2.324749642346209e-08, + "loss": 2.5903, + "step": 13 + }, + { + "epoch": 0.0007510729613733906, + "grad_norm": 1.3671875, + "learning_rate": 2.503576537911302e-08, + "loss": 2.687, + "step": 14 + }, + { + "epoch": 0.0008047210300429185, + "grad_norm": 0.333984375, + "learning_rate": 2.682403433476395e-08, + "loss": 2.4478, + "step": 15 + }, + { + "epoch": 0.0008583690987124463, + "grad_norm": 0.302734375, + "learning_rate": 2.861230329041488e-08, + "loss": 2.2632, + "step": 16 + }, + { + "epoch": 0.0009120171673819743, + "grad_norm": 0.4453125, + "learning_rate": 3.040057224606581e-08, + "loss": 1.7356, + "step": 17 + }, + { + "epoch": 0.0009656652360515021, + "grad_norm": 0.306640625, + "learning_rate": 3.218884120171674e-08, + "loss": 2.4523, + "step": 18 + }, + { + "epoch": 0.00101931330472103, + "grad_norm": 0.478515625, + "learning_rate": 3.397711015736767e-08, + "loss": 2.4249, + "step": 19 + }, + { + "epoch": 0.001072961373390558, + "grad_norm": 0.390625, + "learning_rate": 3.57653791130186e-08, + "loss": 2.5147, + "step": 20 + }, + { + "epoch": 0.0011266094420600858, + "grad_norm": 0.44140625, + "learning_rate": 3.755364806866953e-08, + "loss": 2.5997, + "step": 21 + }, + { + "epoch": 0.0011802575107296136, + "grad_norm": 0.32421875, + "learning_rate": 3.934191702432046e-08, + "loss": 2.5245, + "step": 22 + }, + { + "epoch": 0.0012339055793991417, + "grad_norm": 0.34375, + "learning_rate": 4.1130185979971395e-08, + "loss": 2.3243, + "step": 23 + }, + { + "epoch": 0.0012875536480686696, + "grad_norm": 0.306640625, + "learning_rate": 4.291845493562232e-08, + "loss": 2.5863, + "step": 24 + }, + { + "epoch": 0.0013412017167381974, + "grad_norm": 0.2373046875, + "learning_rate": 4.470672389127325e-08, + "loss": 2.1751, + "step": 25 + }, + { + "epoch": 0.0013948497854077253, + "grad_norm": 0.41015625, + "learning_rate": 4.649499284692418e-08, + "loss": 2.2426, + "step": 26 + }, + { + "epoch": 0.0014484978540772531, + "grad_norm": 0.3515625, + "learning_rate": 4.8283261802575116e-08, + "loss": 2.4159, + "step": 27 + }, + { + "epoch": 0.0015021459227467812, + "grad_norm": 0.51953125, + "learning_rate": 5.007153075822604e-08, + "loss": 2.4058, + "step": 28 + }, + { + "epoch": 0.001555793991416309, + "grad_norm": 0.396484375, + "learning_rate": 5.1859799713876974e-08, + "loss": 2.6801, + "step": 29 + }, + { + "epoch": 0.001609442060085837, + "grad_norm": 0.578125, + "learning_rate": 5.36480686695279e-08, + "loss": 2.4075, + "step": 30 + }, + { + "epoch": 0.0016630901287553648, + "grad_norm": 0.404296875, + "learning_rate": 5.543633762517884e-08, + "loss": 2.6774, + "step": 31 + }, + { + "epoch": 0.0017167381974248926, + "grad_norm": 0.330078125, + "learning_rate": 5.722460658082976e-08, + "loss": 2.5303, + "step": 32 + }, + { + "epoch": 0.0017703862660944207, + "grad_norm": 0.328125, + "learning_rate": 5.9012875536480695e-08, + "loss": 2.596, + "step": 33 + }, + { + "epoch": 0.0018240343347639485, + "grad_norm": 0.4609375, + "learning_rate": 6.080114449213162e-08, + "loss": 2.8418, + "step": 34 + }, + { + "epoch": 0.0018776824034334764, + "grad_norm": 0.34375, + "learning_rate": 6.258941344778256e-08, + "loss": 2.5661, + "step": 35 + }, + { + "epoch": 0.0019313304721030042, + "grad_norm": 0.353515625, + "learning_rate": 6.437768240343348e-08, + "loss": 2.561, + "step": 36 + }, + { + "epoch": 0.0019849785407725323, + "grad_norm": 0.27734375, + "learning_rate": 6.616595135908442e-08, + "loss": 2.2062, + "step": 37 + }, + { + "epoch": 0.00203862660944206, + "grad_norm": 0.359375, + "learning_rate": 6.795422031473535e-08, + "loss": 2.4371, + "step": 38 + }, + { + "epoch": 0.002092274678111588, + "grad_norm": 0.37109375, + "learning_rate": 6.974248927038627e-08, + "loss": 2.4343, + "step": 39 + }, + { + "epoch": 0.002145922746781116, + "grad_norm": 0.36328125, + "learning_rate": 7.15307582260372e-08, + "loss": 2.457, + "step": 40 + }, + { + "epoch": 0.0021995708154506437, + "grad_norm": 0.37109375, + "learning_rate": 7.331902718168813e-08, + "loss": 2.4671, + "step": 41 + }, + { + "epoch": 0.0022532188841201716, + "grad_norm": 0.359375, + "learning_rate": 7.510729613733906e-08, + "loss": 2.6244, + "step": 42 + }, + { + "epoch": 0.0023068669527896994, + "grad_norm": 0.404296875, + "learning_rate": 7.689556509298999e-08, + "loss": 2.3905, + "step": 43 + }, + { + "epoch": 0.0023605150214592273, + "grad_norm": 0.318359375, + "learning_rate": 7.868383404864092e-08, + "loss": 2.4561, + "step": 44 + }, + { + "epoch": 0.0024141630901287556, + "grad_norm": 0.51953125, + "learning_rate": 8.047210300429185e-08, + "loss": 2.5588, + "step": 45 + }, + { + "epoch": 0.0024678111587982834, + "grad_norm": 0.404296875, + "learning_rate": 8.226037195994279e-08, + "loss": 2.2913, + "step": 46 + }, + { + "epoch": 0.0025214592274678113, + "grad_norm": 0.52734375, + "learning_rate": 8.404864091559372e-08, + "loss": 2.3237, + "step": 47 + }, + { + "epoch": 0.002575107296137339, + "grad_norm": 0.5078125, + "learning_rate": 8.583690987124465e-08, + "loss": 2.2655, + "step": 48 + }, + { + "epoch": 0.002628755364806867, + "grad_norm": 0.640625, + "learning_rate": 8.762517882689556e-08, + "loss": 2.682, + "step": 49 + }, + { + "epoch": 0.002682403433476395, + "grad_norm": 0.490234375, + "learning_rate": 8.94134477825465e-08, + "loss": 2.2509, + "step": 50 + }, + { + "epoch": 0.0027360515021459227, + "grad_norm": 0.328125, + "learning_rate": 9.120171673819743e-08, + "loss": 2.5278, + "step": 51 + }, + { + "epoch": 0.0027896995708154505, + "grad_norm": 0.447265625, + "learning_rate": 9.298998569384836e-08, + "loss": 2.4237, + "step": 52 + }, + { + "epoch": 0.0028433476394849784, + "grad_norm": 0.29296875, + "learning_rate": 9.477825464949929e-08, + "loss": 2.2723, + "step": 53 + }, + { + "epoch": 0.0028969957081545063, + "grad_norm": 0.8203125, + "learning_rate": 9.656652360515023e-08, + "loss": 1.4755, + "step": 54 + }, + { + "epoch": 0.0029506437768240345, + "grad_norm": 0.78125, + "learning_rate": 9.835479256080116e-08, + "loss": 2.4364, + "step": 55 + }, + { + "epoch": 0.0030042918454935624, + "grad_norm": 0.291015625, + "learning_rate": 1.0014306151645208e-07, + "loss": 2.7337, + "step": 56 + }, + { + "epoch": 0.0030579399141630902, + "grad_norm": 0.427734375, + "learning_rate": 1.01931330472103e-07, + "loss": 2.5014, + "step": 57 + }, + { + "epoch": 0.003111587982832618, + "grad_norm": 0.337890625, + "learning_rate": 1.0371959942775395e-07, + "loss": 2.0146, + "step": 58 + }, + { + "epoch": 0.003165236051502146, + "grad_norm": 1.40625, + "learning_rate": 1.0550786838340488e-07, + "loss": 2.4555, + "step": 59 + }, + { + "epoch": 0.003218884120171674, + "grad_norm": 0.470703125, + "learning_rate": 1.072961373390558e-07, + "loss": 2.553, + "step": 60 + }, + { + "epoch": 0.0032725321888412017, + "grad_norm": 0.4140625, + "learning_rate": 1.0908440629470673e-07, + "loss": 2.639, + "step": 61 + }, + { + "epoch": 0.0033261802575107295, + "grad_norm": 0.35546875, + "learning_rate": 1.1087267525035768e-07, + "loss": 2.212, + "step": 62 + }, + { + "epoch": 0.0033798283261802574, + "grad_norm": 0.306640625, + "learning_rate": 1.1266094420600859e-07, + "loss": 2.4842, + "step": 63 + }, + { + "epoch": 0.0034334763948497852, + "grad_norm": 0.357421875, + "learning_rate": 1.1444921316165952e-07, + "loss": 2.7548, + "step": 64 + }, + { + "epoch": 0.0034871244635193135, + "grad_norm": 0.400390625, + "learning_rate": 1.1623748211731045e-07, + "loss": 2.8246, + "step": 65 + }, + { + "epoch": 0.0035407725321888414, + "grad_norm": 0.341796875, + "learning_rate": 1.1802575107296139e-07, + "loss": 2.4038, + "step": 66 + }, + { + "epoch": 0.003594420600858369, + "grad_norm": 0.2890625, + "learning_rate": 1.198140200286123e-07, + "loss": 2.2669, + "step": 67 + }, + { + "epoch": 0.003648068669527897, + "grad_norm": 0.34375, + "learning_rate": 1.2160228898426323e-07, + "loss": 2.4509, + "step": 68 + }, + { + "epoch": 0.003701716738197425, + "grad_norm": 0.37109375, + "learning_rate": 1.2339055793991416e-07, + "loss": 2.6045, + "step": 69 + }, + { + "epoch": 0.0037553648068669528, + "grad_norm": 0.51953125, + "learning_rate": 1.2517882689556512e-07, + "loss": 2.3488, + "step": 70 + }, + { + "epoch": 0.0038090128755364806, + "grad_norm": 0.35546875, + "learning_rate": 1.2696709585121605e-07, + "loss": 2.514, + "step": 71 + }, + { + "epoch": 0.0038626609442060085, + "grad_norm": 0.32421875, + "learning_rate": 1.2875536480686695e-07, + "loss": 2.5386, + "step": 72 + }, + { + "epoch": 0.003916309012875536, + "grad_norm": 0.291015625, + "learning_rate": 1.3054363376251788e-07, + "loss": 2.5442, + "step": 73 + }, + { + "epoch": 0.003969957081545065, + "grad_norm": 0.330078125, + "learning_rate": 1.3233190271816883e-07, + "loss": 2.5375, + "step": 74 + }, + { + "epoch": 0.004023605150214592, + "grad_norm": 0.291015625, + "learning_rate": 1.3412017167381976e-07, + "loss": 2.2197, + "step": 75 + }, + { + "epoch": 0.00407725321888412, + "grad_norm": 0.3203125, + "learning_rate": 1.359084406294707e-07, + "loss": 2.4232, + "step": 76 + }, + { + "epoch": 0.004130901287553648, + "grad_norm": 0.5625, + "learning_rate": 1.3769670958512162e-07, + "loss": 2.2159, + "step": 77 + }, + { + "epoch": 0.004184549356223176, + "grad_norm": 0.361328125, + "learning_rate": 1.3948497854077255e-07, + "loss": 2.6874, + "step": 78 + }, + { + "epoch": 0.0042381974248927035, + "grad_norm": 1.0078125, + "learning_rate": 1.4127324749642348e-07, + "loss": 2.5792, + "step": 79 + }, + { + "epoch": 0.004291845493562232, + "grad_norm": 0.451171875, + "learning_rate": 1.430615164520744e-07, + "loss": 2.6089, + "step": 80 + }, + { + "epoch": 0.00434549356223176, + "grad_norm": 0.287109375, + "learning_rate": 1.4484978540772534e-07, + "loss": 2.4192, + "step": 81 + }, + { + "epoch": 0.0043991416309012875, + "grad_norm": 1.015625, + "learning_rate": 1.4663805436337626e-07, + "loss": 1.806, + "step": 82 + }, + { + "epoch": 0.004452789699570816, + "grad_norm": 0.33984375, + "learning_rate": 1.484263233190272e-07, + "loss": 2.5854, + "step": 83 + }, + { + "epoch": 0.004506437768240343, + "grad_norm": 0.3359375, + "learning_rate": 1.5021459227467812e-07, + "loss": 2.5636, + "step": 84 + }, + { + "epoch": 0.0045600858369098714, + "grad_norm": 0.353515625, + "learning_rate": 1.5200286123032905e-07, + "loss": 2.5148, + "step": 85 + }, + { + "epoch": 0.004613733905579399, + "grad_norm": 0.333984375, + "learning_rate": 1.5379113018597998e-07, + "loss": 2.5357, + "step": 86 + }, + { + "epoch": 0.004667381974248927, + "grad_norm": 0.419921875, + "learning_rate": 1.555793991416309e-07, + "loss": 2.6014, + "step": 87 + }, + { + "epoch": 0.004721030042918455, + "grad_norm": 0.5546875, + "learning_rate": 1.5736766809728184e-07, + "loss": 2.6102, + "step": 88 + }, + { + "epoch": 0.004774678111587983, + "grad_norm": 0.62890625, + "learning_rate": 1.591559370529328e-07, + "loss": 2.6584, + "step": 89 + }, + { + "epoch": 0.004828326180257511, + "grad_norm": 0.40625, + "learning_rate": 1.609442060085837e-07, + "loss": 2.5585, + "step": 90 + }, + { + "epoch": 0.0048819742489270386, + "grad_norm": 0.310546875, + "learning_rate": 1.6273247496423465e-07, + "loss": 2.5021, + "step": 91 + }, + { + "epoch": 0.004935622317596567, + "grad_norm": 0.462890625, + "learning_rate": 1.6452074391988558e-07, + "loss": 2.3809, + "step": 92 + }, + { + "epoch": 0.004989270386266094, + "grad_norm": 0.3671875, + "learning_rate": 1.6630901287553648e-07, + "loss": 2.6141, + "step": 93 + }, + { + "epoch": 0.0050429184549356226, + "grad_norm": 0.3046875, + "learning_rate": 1.6809728183118744e-07, + "loss": 2.561, + "step": 94 + }, + { + "epoch": 0.00509656652360515, + "grad_norm": 0.4296875, + "learning_rate": 1.6988555078683834e-07, + "loss": 2.2725, + "step": 95 + }, + { + "epoch": 0.005150214592274678, + "grad_norm": 0.4453125, + "learning_rate": 1.716738197424893e-07, + "loss": 2.2078, + "step": 96 + }, + { + "epoch": 0.005203862660944206, + "grad_norm": 0.30078125, + "learning_rate": 1.7346208869814022e-07, + "loss": 2.7496, + "step": 97 + }, + { + "epoch": 0.005257510729613734, + "grad_norm": 0.412109375, + "learning_rate": 1.7525035765379112e-07, + "loss": 1.7421, + "step": 98 + }, + { + "epoch": 0.005311158798283261, + "grad_norm": 0.3203125, + "learning_rate": 1.7703862660944208e-07, + "loss": 2.4606, + "step": 99 + }, + { + "epoch": 0.00536480686695279, + "grad_norm": 0.32421875, + "learning_rate": 1.78826895565093e-07, + "loss": 2.5337, + "step": 100 + }, + { + "epoch": 0.005418454935622318, + "grad_norm": 0.4453125, + "learning_rate": 1.8061516452074394e-07, + "loss": 2.4455, + "step": 101 + }, + { + "epoch": 0.005472103004291845, + "grad_norm": 1.765625, + "learning_rate": 1.8240343347639487e-07, + "loss": 2.3098, + "step": 102 + }, + { + "epoch": 0.005525751072961374, + "grad_norm": 0.6953125, + "learning_rate": 1.8419170243204577e-07, + "loss": 2.692, + "step": 103 + }, + { + "epoch": 0.005579399141630901, + "grad_norm": 0.98828125, + "learning_rate": 1.8597997138769672e-07, + "loss": 2.61, + "step": 104 + }, + { + "epoch": 0.005633047210300429, + "grad_norm": 0.640625, + "learning_rate": 1.8776824034334768e-07, + "loss": 2.6198, + "step": 105 + }, + { + "epoch": 0.005686695278969957, + "grad_norm": 0.39453125, + "learning_rate": 1.8955650929899858e-07, + "loss": 2.9015, + "step": 106 + }, + { + "epoch": 0.005740343347639485, + "grad_norm": 0.421875, + "learning_rate": 1.913447782546495e-07, + "loss": 2.5938, + "step": 107 + }, + { + "epoch": 0.0057939914163090125, + "grad_norm": 0.43359375, + "learning_rate": 1.9313304721030046e-07, + "loss": 2.2852, + "step": 108 + }, + { + "epoch": 0.005847639484978541, + "grad_norm": 0.310546875, + "learning_rate": 1.9492131616595137e-07, + "loss": 2.4845, + "step": 109 + }, + { + "epoch": 0.005901287553648069, + "grad_norm": 0.93359375, + "learning_rate": 1.9670958512160232e-07, + "loss": 2.4035, + "step": 110 + }, + { + "epoch": 0.0059549356223175965, + "grad_norm": 0.5546875, + "learning_rate": 1.9849785407725322e-07, + "loss": 2.2722, + "step": 111 + }, + { + "epoch": 0.006008583690987125, + "grad_norm": 0.375, + "learning_rate": 2.0028612303290415e-07, + "loss": 2.5463, + "step": 112 + }, + { + "epoch": 0.006062231759656652, + "grad_norm": 0.62109375, + "learning_rate": 2.020743919885551e-07, + "loss": 2.5563, + "step": 113 + }, + { + "epoch": 0.0061158798283261805, + "grad_norm": 0.546875, + "learning_rate": 2.03862660944206e-07, + "loss": 2.3814, + "step": 114 + }, + { + "epoch": 0.006169527896995708, + "grad_norm": 0.35546875, + "learning_rate": 2.0565092989985697e-07, + "loss": 2.4245, + "step": 115 + }, + { + "epoch": 0.006223175965665236, + "grad_norm": 0.52734375, + "learning_rate": 2.074391988555079e-07, + "loss": 1.6676, + "step": 116 + }, + { + "epoch": 0.006276824034334764, + "grad_norm": 0.462890625, + "learning_rate": 2.092274678111588e-07, + "loss": 2.6345, + "step": 117 + }, + { + "epoch": 0.006330472103004292, + "grad_norm": 0.302734375, + "learning_rate": 2.1101573676680975e-07, + "loss": 2.5535, + "step": 118 + }, + { + "epoch": 0.006384120171673819, + "grad_norm": 0.32421875, + "learning_rate": 2.1280400572246065e-07, + "loss": 2.5378, + "step": 119 + }, + { + "epoch": 0.006437768240343348, + "grad_norm": 0.3203125, + "learning_rate": 2.145922746781116e-07, + "loss": 2.6598, + "step": 120 + }, + { + "epoch": 0.006491416309012876, + "grad_norm": 0.310546875, + "learning_rate": 2.1638054363376254e-07, + "loss": 2.5333, + "step": 121 + }, + { + "epoch": 0.006545064377682403, + "grad_norm": 0.39453125, + "learning_rate": 2.1816881258941347e-07, + "loss": 2.5408, + "step": 122 + }, + { + "epoch": 0.006598712446351932, + "grad_norm": 0.4921875, + "learning_rate": 2.199570815450644e-07, + "loss": 2.4658, + "step": 123 + }, + { + "epoch": 0.006652360515021459, + "grad_norm": 0.326171875, + "learning_rate": 2.2174535050071535e-07, + "loss": 2.4675, + "step": 124 + }, + { + "epoch": 0.006706008583690987, + "grad_norm": 0.421875, + "learning_rate": 2.2353361945636625e-07, + "loss": 2.6432, + "step": 125 + }, + { + "epoch": 0.006759656652360515, + "grad_norm": 0.365234375, + "learning_rate": 2.2532188841201718e-07, + "loss": 2.5494, + "step": 126 + }, + { + "epoch": 0.006813304721030043, + "grad_norm": 0.423828125, + "learning_rate": 2.271101573676681e-07, + "loss": 2.7104, + "step": 127 + }, + { + "epoch": 0.0068669527896995704, + "grad_norm": 0.349609375, + "learning_rate": 2.2889842632331904e-07, + "loss": 2.3482, + "step": 128 + }, + { + "epoch": 0.006920600858369099, + "grad_norm": 0.35546875, + "learning_rate": 2.3068669527897e-07, + "loss": 2.4934, + "step": 129 + }, + { + "epoch": 0.006974248927038627, + "grad_norm": 0.71484375, + "learning_rate": 2.324749642346209e-07, + "loss": 2.5158, + "step": 130 + }, + { + "epoch": 0.0070278969957081544, + "grad_norm": 0.453125, + "learning_rate": 2.3426323319027185e-07, + "loss": 2.2247, + "step": 131 + }, + { + "epoch": 0.007081545064377683, + "grad_norm": 0.396484375, + "learning_rate": 2.3605150214592278e-07, + "loss": 1.7295, + "step": 132 + }, + { + "epoch": 0.00713519313304721, + "grad_norm": 0.31640625, + "learning_rate": 2.3783977110157368e-07, + "loss": 2.0676, + "step": 133 + }, + { + "epoch": 0.007188841201716738, + "grad_norm": 0.380859375, + "learning_rate": 2.396280400572246e-07, + "loss": 2.3562, + "step": 134 + }, + { + "epoch": 0.007242489270386266, + "grad_norm": 0.365234375, + "learning_rate": 2.4141630901287554e-07, + "loss": 2.5425, + "step": 135 + }, + { + "epoch": 0.007296137339055794, + "grad_norm": 0.486328125, + "learning_rate": 2.4320457796852647e-07, + "loss": 2.4493, + "step": 136 + }, + { + "epoch": 0.0073497854077253216, + "grad_norm": 0.3359375, + "learning_rate": 2.4499284692417745e-07, + "loss": 2.3006, + "step": 137 + }, + { + "epoch": 0.00740343347639485, + "grad_norm": 0.326171875, + "learning_rate": 2.4678111587982833e-07, + "loss": 2.6299, + "step": 138 + }, + { + "epoch": 0.007457081545064377, + "grad_norm": 0.328125, + "learning_rate": 2.4856938483547926e-07, + "loss": 2.3178, + "step": 139 + }, + { + "epoch": 0.0075107296137339056, + "grad_norm": 0.3515625, + "learning_rate": 2.5035765379113024e-07, + "loss": 2.4958, + "step": 140 + }, + { + "epoch": 0.007564377682403434, + "grad_norm": 0.34375, + "learning_rate": 2.521459227467811e-07, + "loss": 2.4116, + "step": 141 + }, + { + "epoch": 0.007618025751072961, + "grad_norm": 2.890625, + "learning_rate": 2.539341917024321e-07, + "loss": 2.6777, + "step": 142 + }, + { + "epoch": 0.0076716738197424895, + "grad_norm": 0.3515625, + "learning_rate": 2.5572246065808297e-07, + "loss": 2.3124, + "step": 143 + }, + { + "epoch": 0.007725321888412017, + "grad_norm": 0.640625, + "learning_rate": 2.575107296137339e-07, + "loss": 2.6318, + "step": 144 + }, + { + "epoch": 0.007778969957081545, + "grad_norm": 0.3125, + "learning_rate": 2.592989985693849e-07, + "loss": 2.6204, + "step": 145 + }, + { + "epoch": 0.007832618025751073, + "grad_norm": 0.322265625, + "learning_rate": 2.6108726752503576e-07, + "loss": 2.5832, + "step": 146 + }, + { + "epoch": 0.007886266094420601, + "grad_norm": 0.31640625, + "learning_rate": 2.6287553648068674e-07, + "loss": 2.4318, + "step": 147 + }, + { + "epoch": 0.00793991416309013, + "grad_norm": 0.298828125, + "learning_rate": 2.6466380543633767e-07, + "loss": 2.4571, + "step": 148 + }, + { + "epoch": 0.007993562231759656, + "grad_norm": 0.314453125, + "learning_rate": 2.664520743919886e-07, + "loss": 2.6741, + "step": 149 + }, + { + "epoch": 0.008047210300429184, + "grad_norm": 0.5078125, + "learning_rate": 2.682403433476395e-07, + "loss": 2.631, + "step": 150 + }, + { + "epoch": 0.008100858369098712, + "grad_norm": 0.3046875, + "learning_rate": 2.700286123032904e-07, + "loss": 2.5523, + "step": 151 + }, + { + "epoch": 0.00815450643776824, + "grad_norm": 0.326171875, + "learning_rate": 2.718168812589414e-07, + "loss": 2.377, + "step": 152 + }, + { + "epoch": 0.008208154506437769, + "grad_norm": 0.412109375, + "learning_rate": 2.736051502145923e-07, + "loss": 2.5577, + "step": 153 + }, + { + "epoch": 0.008261802575107296, + "grad_norm": 0.326171875, + "learning_rate": 2.7539341917024324e-07, + "loss": 2.6477, + "step": 154 + }, + { + "epoch": 0.008315450643776824, + "grad_norm": 0.384765625, + "learning_rate": 2.7718168812589417e-07, + "loss": 2.4416, + "step": 155 + }, + { + "epoch": 0.008369098712446352, + "grad_norm": 0.49609375, + "learning_rate": 2.789699570815451e-07, + "loss": 2.2736, + "step": 156 + }, + { + "epoch": 0.00842274678111588, + "grad_norm": 0.373046875, + "learning_rate": 2.8075822603719603e-07, + "loss": 2.2339, + "step": 157 + }, + { + "epoch": 0.008476394849785407, + "grad_norm": 0.462890625, + "learning_rate": 2.8254649499284696e-07, + "loss": 2.591, + "step": 158 + }, + { + "epoch": 0.008530042918454935, + "grad_norm": 0.337890625, + "learning_rate": 2.843347639484979e-07, + "loss": 2.188, + "step": 159 + }, + { + "epoch": 0.008583690987124463, + "grad_norm": 0.380859375, + "learning_rate": 2.861230329041488e-07, + "loss": 2.4335, + "step": 160 + }, + { + "epoch": 0.008637339055793992, + "grad_norm": 0.29296875, + "learning_rate": 2.8791130185979974e-07, + "loss": 2.4525, + "step": 161 + }, + { + "epoch": 0.00869098712446352, + "grad_norm": 0.3359375, + "learning_rate": 2.8969957081545067e-07, + "loss": 2.3558, + "step": 162 + }, + { + "epoch": 0.008744635193133047, + "grad_norm": 0.61328125, + "learning_rate": 2.914878397711016e-07, + "loss": 2.1916, + "step": 163 + }, + { + "epoch": 0.008798283261802575, + "grad_norm": 0.42578125, + "learning_rate": 2.9327610872675253e-07, + "loss": 2.4573, + "step": 164 + }, + { + "epoch": 0.008851931330472103, + "grad_norm": 0.42578125, + "learning_rate": 2.9506437768240346e-07, + "loss": 2.3892, + "step": 165 + }, + { + "epoch": 0.008905579399141631, + "grad_norm": 0.3046875, + "learning_rate": 2.968526466380544e-07, + "loss": 2.5658, + "step": 166 + }, + { + "epoch": 0.008959227467811158, + "grad_norm": 0.29296875, + "learning_rate": 2.986409155937053e-07, + "loss": 2.6882, + "step": 167 + }, + { + "epoch": 0.009012875536480686, + "grad_norm": 0.453125, + "learning_rate": 3.0042918454935624e-07, + "loss": 2.4261, + "step": 168 + }, + { + "epoch": 0.009066523605150215, + "grad_norm": 0.396484375, + "learning_rate": 3.0221745350500717e-07, + "loss": 2.564, + "step": 169 + }, + { + "epoch": 0.009120171673819743, + "grad_norm": 0.400390625, + "learning_rate": 3.040057224606581e-07, + "loss": 2.5462, + "step": 170 + }, + { + "epoch": 0.009173819742489271, + "grad_norm": 0.384765625, + "learning_rate": 3.0579399141630903e-07, + "loss": 2.6185, + "step": 171 + }, + { + "epoch": 0.009227467811158798, + "grad_norm": 0.388671875, + "learning_rate": 3.0758226037195996e-07, + "loss": 2.5382, + "step": 172 + }, + { + "epoch": 0.009281115879828326, + "grad_norm": 0.5703125, + "learning_rate": 3.093705293276109e-07, + "loss": 2.6381, + "step": 173 + }, + { + "epoch": 0.009334763948497854, + "grad_norm": 0.34765625, + "learning_rate": 3.111587982832618e-07, + "loss": 2.4119, + "step": 174 + }, + { + "epoch": 0.009388412017167383, + "grad_norm": 0.3203125, + "learning_rate": 3.1294706723891274e-07, + "loss": 2.4732, + "step": 175 + }, + { + "epoch": 0.00944206008583691, + "grad_norm": 0.38671875, + "learning_rate": 3.1473533619456367e-07, + "loss": 2.9193, + "step": 176 + }, + { + "epoch": 0.009495708154506437, + "grad_norm": 11.6875, + "learning_rate": 3.1652360515021465e-07, + "loss": 2.406, + "step": 177 + }, + { + "epoch": 0.009549356223175966, + "grad_norm": 0.38671875, + "learning_rate": 3.183118741058656e-07, + "loss": 2.5553, + "step": 178 + }, + { + "epoch": 0.009603004291845494, + "grad_norm": 0.4921875, + "learning_rate": 3.2010014306151646e-07, + "loss": 2.4548, + "step": 179 + }, + { + "epoch": 0.009656652360515022, + "grad_norm": 0.41015625, + "learning_rate": 3.218884120171674e-07, + "loss": 2.3571, + "step": 180 + }, + { + "epoch": 0.009710300429184549, + "grad_norm": 0.375, + "learning_rate": 3.236766809728183e-07, + "loss": 2.7181, + "step": 181 + }, + { + "epoch": 0.009763948497854077, + "grad_norm": 0.35546875, + "learning_rate": 3.254649499284693e-07, + "loss": 2.4217, + "step": 182 + }, + { + "epoch": 0.009817596566523605, + "grad_norm": 0.357421875, + "learning_rate": 3.2725321888412023e-07, + "loss": 2.8628, + "step": 183 + }, + { + "epoch": 0.009871244635193134, + "grad_norm": 0.33984375, + "learning_rate": 3.2904148783977116e-07, + "loss": 2.3178, + "step": 184 + }, + { + "epoch": 0.00992489270386266, + "grad_norm": 0.287109375, + "learning_rate": 3.3082975679542203e-07, + "loss": 2.4766, + "step": 185 + }, + { + "epoch": 0.009978540772532189, + "grad_norm": 0.38671875, + "learning_rate": 3.3261802575107296e-07, + "loss": 2.5345, + "step": 186 + }, + { + "epoch": 0.010032188841201717, + "grad_norm": 0.42578125, + "learning_rate": 3.3440629470672394e-07, + "loss": 2.5146, + "step": 187 + }, + { + "epoch": 0.010085836909871245, + "grad_norm": 0.40625, + "learning_rate": 3.3619456366237487e-07, + "loss": 2.2619, + "step": 188 + }, + { + "epoch": 0.010139484978540772, + "grad_norm": 0.333984375, + "learning_rate": 3.379828326180258e-07, + "loss": 2.5021, + "step": 189 + }, + { + "epoch": 0.0101931330472103, + "grad_norm": 0.5234375, + "learning_rate": 3.397711015736767e-07, + "loss": 2.5583, + "step": 190 + }, + { + "epoch": 0.010246781115879828, + "grad_norm": 0.349609375, + "learning_rate": 3.415593705293276e-07, + "loss": 2.5187, + "step": 191 + }, + { + "epoch": 0.010300429184549357, + "grad_norm": 0.486328125, + "learning_rate": 3.433476394849786e-07, + "loss": 2.8518, + "step": 192 + }, + { + "epoch": 0.010354077253218885, + "grad_norm": 0.4453125, + "learning_rate": 3.451359084406295e-07, + "loss": 1.8132, + "step": 193 + }, + { + "epoch": 0.010407725321888411, + "grad_norm": 0.33984375, + "learning_rate": 3.4692417739628044e-07, + "loss": 2.6649, + "step": 194 + }, + { + "epoch": 0.01046137339055794, + "grad_norm": 0.30859375, + "learning_rate": 3.487124463519313e-07, + "loss": 2.5046, + "step": 195 + }, + { + "epoch": 0.010515021459227468, + "grad_norm": 0.38671875, + "learning_rate": 3.5050071530758225e-07, + "loss": 2.5836, + "step": 196 + }, + { + "epoch": 0.010568669527896996, + "grad_norm": 0.322265625, + "learning_rate": 3.5228898426323323e-07, + "loss": 2.5395, + "step": 197 + }, + { + "epoch": 0.010622317596566523, + "grad_norm": 0.37109375, + "learning_rate": 3.5407725321888416e-07, + "loss": 2.6288, + "step": 198 + }, + { + "epoch": 0.010675965665236051, + "grad_norm": 0.66796875, + "learning_rate": 3.558655221745351e-07, + "loss": 1.7938, + "step": 199 + }, + { + "epoch": 0.01072961373390558, + "grad_norm": 0.33203125, + "learning_rate": 3.57653791130186e-07, + "loss": 2.5429, + "step": 200 + }, + { + "epoch": 0.010783261802575108, + "grad_norm": 0.349609375, + "learning_rate": 3.594420600858369e-07, + "loss": 2.734, + "step": 201 + }, + { + "epoch": 0.010836909871244636, + "grad_norm": 0.447265625, + "learning_rate": 3.612303290414879e-07, + "loss": 2.4992, + "step": 202 + }, + { + "epoch": 0.010890557939914162, + "grad_norm": 0.421875, + "learning_rate": 3.630185979971388e-07, + "loss": 2.7004, + "step": 203 + }, + { + "epoch": 0.01094420600858369, + "grad_norm": 0.34375, + "learning_rate": 3.6480686695278973e-07, + "loss": 2.1132, + "step": 204 + }, + { + "epoch": 0.010997854077253219, + "grad_norm": 1.171875, + "learning_rate": 3.665951359084407e-07, + "loss": 2.4594, + "step": 205 + }, + { + "epoch": 0.011051502145922747, + "grad_norm": 0.53125, + "learning_rate": 3.6838340486409154e-07, + "loss": 2.5401, + "step": 206 + }, + { + "epoch": 0.011105150214592274, + "grad_norm": 0.40625, + "learning_rate": 3.701716738197425e-07, + "loss": 2.3527, + "step": 207 + }, + { + "epoch": 0.011158798283261802, + "grad_norm": 0.337890625, + "learning_rate": 3.7195994277539345e-07, + "loss": 2.4697, + "step": 208 + }, + { + "epoch": 0.01121244635193133, + "grad_norm": 0.34765625, + "learning_rate": 3.737482117310444e-07, + "loss": 2.5975, + "step": 209 + }, + { + "epoch": 0.011266094420600859, + "grad_norm": 0.279296875, + "learning_rate": 3.7553648068669536e-07, + "loss": 2.4245, + "step": 210 + }, + { + "epoch": 0.011319742489270387, + "grad_norm": 0.466796875, + "learning_rate": 3.7732474964234623e-07, + "loss": 2.5424, + "step": 211 + }, + { + "epoch": 0.011373390557939914, + "grad_norm": 0.3671875, + "learning_rate": 3.7911301859799716e-07, + "loss": 2.3144, + "step": 212 + }, + { + "epoch": 0.011427038626609442, + "grad_norm": 0.302734375, + "learning_rate": 3.809012875536481e-07, + "loss": 2.174, + "step": 213 + }, + { + "epoch": 0.01148068669527897, + "grad_norm": 0.357421875, + "learning_rate": 3.82689556509299e-07, + "loss": 2.5862, + "step": 214 + }, + { + "epoch": 0.011534334763948498, + "grad_norm": 1.1171875, + "learning_rate": 3.8447782546495e-07, + "loss": 2.4637, + "step": 215 + }, + { + "epoch": 0.011587982832618025, + "grad_norm": 0.498046875, + "learning_rate": 3.8626609442060093e-07, + "loss": 2.6565, + "step": 216 + }, + { + "epoch": 0.011641630901287553, + "grad_norm": 0.4921875, + "learning_rate": 3.880543633762518e-07, + "loss": 2.7114, + "step": 217 + }, + { + "epoch": 0.011695278969957082, + "grad_norm": 0.341796875, + "learning_rate": 3.8984263233190273e-07, + "loss": 2.6905, + "step": 218 + }, + { + "epoch": 0.01174892703862661, + "grad_norm": 0.451171875, + "learning_rate": 3.9163090128755366e-07, + "loss": 2.6216, + "step": 219 + }, + { + "epoch": 0.011802575107296138, + "grad_norm": 0.373046875, + "learning_rate": 3.9341917024320464e-07, + "loss": 2.4426, + "step": 220 + }, + { + "epoch": 0.011856223175965665, + "grad_norm": 0.453125, + "learning_rate": 3.9520743919885557e-07, + "loss": 2.42, + "step": 221 + }, + { + "epoch": 0.011909871244635193, + "grad_norm": 0.306640625, + "learning_rate": 3.9699570815450645e-07, + "loss": 2.4862, + "step": 222 + }, + { + "epoch": 0.011963519313304721, + "grad_norm": 0.341796875, + "learning_rate": 3.987839771101574e-07, + "loss": 2.5665, + "step": 223 + }, + { + "epoch": 0.01201716738197425, + "grad_norm": 0.296875, + "learning_rate": 4.005722460658083e-07, + "loss": 2.2103, + "step": 224 + }, + { + "epoch": 0.012070815450643776, + "grad_norm": 0.341796875, + "learning_rate": 4.023605150214593e-07, + "loss": 2.5508, + "step": 225 + }, + { + "epoch": 0.012124463519313304, + "grad_norm": 0.37109375, + "learning_rate": 4.041487839771102e-07, + "loss": 2.4504, + "step": 226 + }, + { + "epoch": 0.012178111587982833, + "grad_norm": 0.52734375, + "learning_rate": 4.059370529327611e-07, + "loss": 2.5348, + "step": 227 + }, + { + "epoch": 0.012231759656652361, + "grad_norm": 0.349609375, + "learning_rate": 4.07725321888412e-07, + "loss": 2.4402, + "step": 228 + }, + { + "epoch": 0.012285407725321888, + "grad_norm": 0.359375, + "learning_rate": 4.0951359084406295e-07, + "loss": 2.6193, + "step": 229 + }, + { + "epoch": 0.012339055793991416, + "grad_norm": 0.6171875, + "learning_rate": 4.1130185979971393e-07, + "loss": 2.4065, + "step": 230 + }, + { + "epoch": 0.012392703862660944, + "grad_norm": 0.36328125, + "learning_rate": 4.1309012875536486e-07, + "loss": 2.3769, + "step": 231 + }, + { + "epoch": 0.012446351931330472, + "grad_norm": 0.462890625, + "learning_rate": 4.148783977110158e-07, + "loss": 2.4518, + "step": 232 + }, + { + "epoch": 0.0125, + "grad_norm": 0.66015625, + "learning_rate": 4.1666666666666667e-07, + "loss": 2.3601, + "step": 233 + }, + { + "epoch": 0.012553648068669527, + "grad_norm": 0.328125, + "learning_rate": 4.184549356223176e-07, + "loss": 2.4887, + "step": 234 + }, + { + "epoch": 0.012607296137339056, + "grad_norm": 0.404296875, + "learning_rate": 4.202432045779686e-07, + "loss": 2.5771, + "step": 235 + }, + { + "epoch": 0.012660944206008584, + "grad_norm": 0.337890625, + "learning_rate": 4.220314735336195e-07, + "loss": 2.3046, + "step": 236 + }, + { + "epoch": 0.012714592274678112, + "grad_norm": 0.333984375, + "learning_rate": 4.2381974248927043e-07, + "loss": 2.5424, + "step": 237 + }, + { + "epoch": 0.012768240343347639, + "grad_norm": 0.30078125, + "learning_rate": 4.256080114449213e-07, + "loss": 2.3938, + "step": 238 + }, + { + "epoch": 0.012821888412017167, + "grad_norm": 0.427734375, + "learning_rate": 4.273962804005723e-07, + "loss": 2.4174, + "step": 239 + }, + { + "epoch": 0.012875536480686695, + "grad_norm": 0.435546875, + "learning_rate": 4.291845493562232e-07, + "loss": 2.646, + "step": 240 + }, + { + "epoch": 0.012929184549356224, + "grad_norm": 0.322265625, + "learning_rate": 4.3097281831187415e-07, + "loss": 2.2584, + "step": 241 + }, + { + "epoch": 0.012982832618025752, + "grad_norm": 0.3125, + "learning_rate": 4.327610872675251e-07, + "loss": 2.1425, + "step": 242 + }, + { + "epoch": 0.013036480686695278, + "grad_norm": 0.298828125, + "learning_rate": 4.3454935622317595e-07, + "loss": 2.5278, + "step": 243 + }, + { + "epoch": 0.013090128755364807, + "grad_norm": 0.361328125, + "learning_rate": 4.3633762517882693e-07, + "loss": 2.6812, + "step": 244 + }, + { + "epoch": 0.013143776824034335, + "grad_norm": 0.8359375, + "learning_rate": 4.3812589413447786e-07, + "loss": 2.5466, + "step": 245 + }, + { + "epoch": 0.013197424892703863, + "grad_norm": 0.455078125, + "learning_rate": 4.399141630901288e-07, + "loss": 2.4388, + "step": 246 + }, + { + "epoch": 0.01325107296137339, + "grad_norm": 0.60546875, + "learning_rate": 4.417024320457797e-07, + "loss": 1.9663, + "step": 247 + }, + { + "epoch": 0.013304721030042918, + "grad_norm": 0.7890625, + "learning_rate": 4.434907010014307e-07, + "loss": 2.0127, + "step": 248 + }, + { + "epoch": 0.013358369098712446, + "grad_norm": 0.337890625, + "learning_rate": 4.452789699570816e-07, + "loss": 2.2317, + "step": 249 + }, + { + "epoch": 0.013412017167381975, + "grad_norm": 0.345703125, + "learning_rate": 4.470672389127325e-07, + "loss": 2.285, + "step": 250 + }, + { + "epoch": 0.013465665236051503, + "grad_norm": 0.58984375, + "learning_rate": 4.4885550786838344e-07, + "loss": 2.5223, + "step": 251 + }, + { + "epoch": 0.01351931330472103, + "grad_norm": 0.328125, + "learning_rate": 4.5064377682403436e-07, + "loss": 2.5103, + "step": 252 + }, + { + "epoch": 0.013572961373390558, + "grad_norm": 0.65234375, + "learning_rate": 4.5243204577968535e-07, + "loss": 2.5283, + "step": 253 + }, + { + "epoch": 0.013626609442060086, + "grad_norm": 0.353515625, + "learning_rate": 4.542203147353362e-07, + "loss": 2.5737, + "step": 254 + }, + { + "epoch": 0.013680257510729614, + "grad_norm": 0.357421875, + "learning_rate": 4.5600858369098715e-07, + "loss": 2.612, + "step": 255 + }, + { + "epoch": 0.013733905579399141, + "grad_norm": 0.53515625, + "learning_rate": 4.577968526466381e-07, + "loss": 2.5613, + "step": 256 + }, + { + "epoch": 0.01378755364806867, + "grad_norm": 0.50390625, + "learning_rate": 4.59585121602289e-07, + "loss": 2.6785, + "step": 257 + }, + { + "epoch": 0.013841201716738197, + "grad_norm": 0.5625, + "learning_rate": 4.6137339055794e-07, + "loss": 2.3022, + "step": 258 + }, + { + "epoch": 0.013894849785407726, + "grad_norm": 0.412109375, + "learning_rate": 4.6316165951359087e-07, + "loss": 2.6421, + "step": 259 + }, + { + "epoch": 0.013948497854077254, + "grad_norm": 0.349609375, + "learning_rate": 4.649499284692418e-07, + "loss": 2.2318, + "step": 260 + }, + { + "epoch": 0.01400214592274678, + "grad_norm": 0.29296875, + "learning_rate": 4.667381974248927e-07, + "loss": 2.3664, + "step": 261 + }, + { + "epoch": 0.014055793991416309, + "grad_norm": 0.396484375, + "learning_rate": 4.685264663805437e-07, + "loss": 2.5308, + "step": 262 + }, + { + "epoch": 0.014109442060085837, + "grad_norm": 0.271484375, + "learning_rate": 4.7031473533619463e-07, + "loss": 2.287, + "step": 263 + }, + { + "epoch": 0.014163090128755365, + "grad_norm": 0.419921875, + "learning_rate": 4.7210300429184556e-07, + "loss": 2.4583, + "step": 264 + }, + { + "epoch": 0.014216738197424892, + "grad_norm": 0.384765625, + "learning_rate": 4.7389127324749644e-07, + "loss": 2.3593, + "step": 265 + }, + { + "epoch": 0.01427038626609442, + "grad_norm": 1.984375, + "learning_rate": 4.7567954220314737e-07, + "loss": 2.7945, + "step": 266 + }, + { + "epoch": 0.014324034334763949, + "grad_norm": 0.51953125, + "learning_rate": 4.774678111587983e-07, + "loss": 2.7293, + "step": 267 + }, + { + "epoch": 0.014377682403433477, + "grad_norm": 0.31640625, + "learning_rate": 4.792560801144492e-07, + "loss": 2.558, + "step": 268 + }, + { + "epoch": 0.014431330472103005, + "grad_norm": 0.341796875, + "learning_rate": 4.810443490701002e-07, + "loss": 2.5879, + "step": 269 + }, + { + "epoch": 0.014484978540772532, + "grad_norm": 0.283203125, + "learning_rate": 4.828326180257511e-07, + "loss": 2.2996, + "step": 270 + }, + { + "epoch": 0.01453862660944206, + "grad_norm": 0.400390625, + "learning_rate": 4.846208869814021e-07, + "loss": 2.1767, + "step": 271 + }, + { + "epoch": 0.014592274678111588, + "grad_norm": 0.3515625, + "learning_rate": 4.864091559370529e-07, + "loss": 2.4629, + "step": 272 + }, + { + "epoch": 0.014645922746781117, + "grad_norm": 0.337890625, + "learning_rate": 4.881974248927039e-07, + "loss": 2.5235, + "step": 273 + }, + { + "epoch": 0.014699570815450643, + "grad_norm": 0.296875, + "learning_rate": 4.899856938483549e-07, + "loss": 2.3942, + "step": 274 + }, + { + "epoch": 0.014753218884120171, + "grad_norm": 0.38671875, + "learning_rate": 4.917739628040058e-07, + "loss": 2.4046, + "step": 275 + }, + { + "epoch": 0.0148068669527897, + "grad_norm": 0.31640625, + "learning_rate": 4.935622317596567e-07, + "loss": 2.1799, + "step": 276 + }, + { + "epoch": 0.014860515021459228, + "grad_norm": 0.35546875, + "learning_rate": 4.953505007153076e-07, + "loss": 2.7221, + "step": 277 + }, + { + "epoch": 0.014914163090128755, + "grad_norm": 0.58203125, + "learning_rate": 4.971387696709585e-07, + "loss": 2.7621, + "step": 278 + }, + { + "epoch": 0.014967811158798283, + "grad_norm": 0.34765625, + "learning_rate": 4.989270386266095e-07, + "loss": 1.3594, + "step": 279 + }, + { + "epoch": 0.015021459227467811, + "grad_norm": 0.318359375, + "learning_rate": 5.007153075822605e-07, + "loss": 2.6645, + "step": 280 + }, + { + "epoch": 0.01507510729613734, + "grad_norm": 0.33984375, + "learning_rate": 5.025035765379114e-07, + "loss": 2.5015, + "step": 281 + }, + { + "epoch": 0.015128755364806868, + "grad_norm": 0.341796875, + "learning_rate": 5.042918454935622e-07, + "loss": 2.5557, + "step": 282 + }, + { + "epoch": 0.015182403433476394, + "grad_norm": 0.46484375, + "learning_rate": 5.060801144492132e-07, + "loss": 2.6156, + "step": 283 + }, + { + "epoch": 0.015236051502145923, + "grad_norm": 0.357421875, + "learning_rate": 5.078683834048642e-07, + "loss": 2.3768, + "step": 284 + }, + { + "epoch": 0.01528969957081545, + "grad_norm": 0.259765625, + "learning_rate": 5.096566523605151e-07, + "loss": 2.6167, + "step": 285 + }, + { + "epoch": 0.015343347639484979, + "grad_norm": 0.83984375, + "learning_rate": 5.114449213161659e-07, + "loss": 2.3451, + "step": 286 + }, + { + "epoch": 0.015396995708154506, + "grad_norm": 0.310546875, + "learning_rate": 5.132331902718169e-07, + "loss": 2.454, + "step": 287 + }, + { + "epoch": 0.015450643776824034, + "grad_norm": 0.376953125, + "learning_rate": 5.150214592274678e-07, + "loss": 2.5051, + "step": 288 + }, + { + "epoch": 0.015504291845493562, + "grad_norm": 0.3203125, + "learning_rate": 5.168097281831188e-07, + "loss": 2.411, + "step": 289 + }, + { + "epoch": 0.01555793991416309, + "grad_norm": 0.4296875, + "learning_rate": 5.185979971387698e-07, + "loss": 2.4192, + "step": 290 + }, + { + "epoch": 0.015611587982832619, + "grad_norm": 0.3984375, + "learning_rate": 5.203862660944206e-07, + "loss": 2.6243, + "step": 291 + }, + { + "epoch": 0.015665236051502145, + "grad_norm": 0.390625, + "learning_rate": 5.221745350500715e-07, + "loss": 2.4333, + "step": 292 + }, + { + "epoch": 0.015718884120171674, + "grad_norm": 0.392578125, + "learning_rate": 5.239628040057225e-07, + "loss": 2.5379, + "step": 293 + }, + { + "epoch": 0.015772532188841202, + "grad_norm": 0.349609375, + "learning_rate": 5.257510729613735e-07, + "loss": 2.5397, + "step": 294 + }, + { + "epoch": 0.01582618025751073, + "grad_norm": 0.37109375, + "learning_rate": 5.275393419170244e-07, + "loss": 2.7619, + "step": 295 + }, + { + "epoch": 0.01587982832618026, + "grad_norm": 0.326171875, + "learning_rate": 5.293276108726753e-07, + "loss": 2.4961, + "step": 296 + }, + { + "epoch": 0.015933476394849787, + "grad_norm": 0.4609375, + "learning_rate": 5.311158798283262e-07, + "loss": 2.3486, + "step": 297 + }, + { + "epoch": 0.01598712446351931, + "grad_norm": 0.318359375, + "learning_rate": 5.329041487839772e-07, + "loss": 2.419, + "step": 298 + }, + { + "epoch": 0.01604077253218884, + "grad_norm": 0.49609375, + "learning_rate": 5.346924177396281e-07, + "loss": 1.5741, + "step": 299 + }, + { + "epoch": 0.016094420600858368, + "grad_norm": 0.337890625, + "learning_rate": 5.36480686695279e-07, + "loss": 2.4041, + "step": 300 + }, + { + "epoch": 0.016148068669527896, + "grad_norm": 0.263671875, + "learning_rate": 5.382689556509299e-07, + "loss": 2.0668, + "step": 301 + }, + { + "epoch": 0.016201716738197425, + "grad_norm": 0.349609375, + "learning_rate": 5.400572246065808e-07, + "loss": 2.4519, + "step": 302 + }, + { + "epoch": 0.016255364806866953, + "grad_norm": 0.380859375, + "learning_rate": 5.418454935622318e-07, + "loss": 2.4274, + "step": 303 + }, + { + "epoch": 0.01630901287553648, + "grad_norm": 0.373046875, + "learning_rate": 5.436337625178828e-07, + "loss": 2.4423, + "step": 304 + }, + { + "epoch": 0.01636266094420601, + "grad_norm": 0.353515625, + "learning_rate": 5.454220314735336e-07, + "loss": 2.4688, + "step": 305 + }, + { + "epoch": 0.016416309012875538, + "grad_norm": 0.271484375, + "learning_rate": 5.472103004291846e-07, + "loss": 2.4001, + "step": 306 + }, + { + "epoch": 0.016469957081545063, + "grad_norm": 0.41015625, + "learning_rate": 5.489985693848355e-07, + "loss": 2.5308, + "step": 307 + }, + { + "epoch": 0.01652360515021459, + "grad_norm": 0.3671875, + "learning_rate": 5.507868383404865e-07, + "loss": 2.442, + "step": 308 + }, + { + "epoch": 0.01657725321888412, + "grad_norm": 0.373046875, + "learning_rate": 5.525751072961374e-07, + "loss": 2.654, + "step": 309 + }, + { + "epoch": 0.016630901287553648, + "grad_norm": 0.349609375, + "learning_rate": 5.543633762517883e-07, + "loss": 2.5613, + "step": 310 + }, + { + "epoch": 0.016684549356223176, + "grad_norm": 0.39453125, + "learning_rate": 5.561516452074392e-07, + "loss": 2.2215, + "step": 311 + }, + { + "epoch": 0.016738197424892704, + "grad_norm": 0.314453125, + "learning_rate": 5.579399141630902e-07, + "loss": 2.4609, + "step": 312 + }, + { + "epoch": 0.016791845493562232, + "grad_norm": 0.310546875, + "learning_rate": 5.597281831187411e-07, + "loss": 2.3655, + "step": 313 + }, + { + "epoch": 0.01684549356223176, + "grad_norm": 0.458984375, + "learning_rate": 5.615164520743921e-07, + "loss": 2.6857, + "step": 314 + }, + { + "epoch": 0.01689914163090129, + "grad_norm": 0.51171875, + "learning_rate": 5.633047210300429e-07, + "loss": 2.7834, + "step": 315 + }, + { + "epoch": 0.016952789699570814, + "grad_norm": 0.3046875, + "learning_rate": 5.650929899856939e-07, + "loss": 2.479, + "step": 316 + }, + { + "epoch": 0.017006437768240342, + "grad_norm": 0.318359375, + "learning_rate": 5.668812589413449e-07, + "loss": 2.0574, + "step": 317 + }, + { + "epoch": 0.01706008583690987, + "grad_norm": 0.390625, + "learning_rate": 5.686695278969958e-07, + "loss": 2.4947, + "step": 318 + }, + { + "epoch": 0.0171137339055794, + "grad_norm": 7.09375, + "learning_rate": 5.704577968526466e-07, + "loss": 2.4121, + "step": 319 + }, + { + "epoch": 0.017167381974248927, + "grad_norm": 0.361328125, + "learning_rate": 5.722460658082976e-07, + "loss": 2.1336, + "step": 320 + }, + { + "epoch": 0.017221030042918455, + "grad_norm": 0.3671875, + "learning_rate": 5.740343347639486e-07, + "loss": 2.4242, + "step": 321 + }, + { + "epoch": 0.017274678111587984, + "grad_norm": 0.326171875, + "learning_rate": 5.758226037195995e-07, + "loss": 2.5378, + "step": 322 + }, + { + "epoch": 0.017328326180257512, + "grad_norm": 0.376953125, + "learning_rate": 5.776108726752504e-07, + "loss": 1.8996, + "step": 323 + }, + { + "epoch": 0.01738197424892704, + "grad_norm": 1.0078125, + "learning_rate": 5.793991416309013e-07, + "loss": 2.0291, + "step": 324 + }, + { + "epoch": 0.017435622317596565, + "grad_norm": 0.404296875, + "learning_rate": 5.811874105865522e-07, + "loss": 2.4706, + "step": 325 + }, + { + "epoch": 0.017489270386266093, + "grad_norm": 1.1328125, + "learning_rate": 5.829756795422032e-07, + "loss": 2.4258, + "step": 326 + }, + { + "epoch": 0.01754291845493562, + "grad_norm": 0.51953125, + "learning_rate": 5.847639484978542e-07, + "loss": 2.1966, + "step": 327 + }, + { + "epoch": 0.01759656652360515, + "grad_norm": 0.40234375, + "learning_rate": 5.865522174535051e-07, + "loss": 2.281, + "step": 328 + }, + { + "epoch": 0.017650214592274678, + "grad_norm": 0.3046875, + "learning_rate": 5.883404864091559e-07, + "loss": 2.3281, + "step": 329 + }, + { + "epoch": 0.017703862660944206, + "grad_norm": 0.27734375, + "learning_rate": 5.901287553648069e-07, + "loss": 2.729, + "step": 330 + }, + { + "epoch": 0.017757510729613735, + "grad_norm": 4.0, + "learning_rate": 5.919170243204579e-07, + "loss": 2.273, + "step": 331 + }, + { + "epoch": 0.017811158798283263, + "grad_norm": 0.314453125, + "learning_rate": 5.937052932761088e-07, + "loss": 2.4466, + "step": 332 + }, + { + "epoch": 0.01786480686695279, + "grad_norm": 0.29296875, + "learning_rate": 5.954935622317598e-07, + "loss": 2.2985, + "step": 333 + }, + { + "epoch": 0.017918454935622316, + "grad_norm": 0.439453125, + "learning_rate": 5.972818311874106e-07, + "loss": 2.7088, + "step": 334 + }, + { + "epoch": 0.017972103004291844, + "grad_norm": 0.470703125, + "learning_rate": 5.990701001430615e-07, + "loss": 2.4683, + "step": 335 + }, + { + "epoch": 0.018025751072961373, + "grad_norm": 0.46484375, + "learning_rate": 6.008583690987125e-07, + "loss": 2.7179, + "step": 336 + }, + { + "epoch": 0.0180793991416309, + "grad_norm": 0.47265625, + "learning_rate": 6.026466380543635e-07, + "loss": 2.5419, + "step": 337 + }, + { + "epoch": 0.01813304721030043, + "grad_norm": 0.41796875, + "learning_rate": 6.044349070100143e-07, + "loss": 2.4026, + "step": 338 + }, + { + "epoch": 0.018186695278969957, + "grad_norm": 0.37890625, + "learning_rate": 6.062231759656652e-07, + "loss": 2.7435, + "step": 339 + }, + { + "epoch": 0.018240343347639486, + "grad_norm": 6.0625, + "learning_rate": 6.080114449213162e-07, + "loss": 2.349, + "step": 340 + }, + { + "epoch": 0.018293991416309014, + "grad_norm": 0.318359375, + "learning_rate": 6.097997138769672e-07, + "loss": 2.4787, + "step": 341 + }, + { + "epoch": 0.018347639484978542, + "grad_norm": 0.310546875, + "learning_rate": 6.115879828326181e-07, + "loss": 2.2879, + "step": 342 + }, + { + "epoch": 0.018401287553648067, + "grad_norm": 0.67578125, + "learning_rate": 6.13376251788269e-07, + "loss": 2.5408, + "step": 343 + }, + { + "epoch": 0.018454935622317595, + "grad_norm": 0.416015625, + "learning_rate": 6.151645207439199e-07, + "loss": 2.3358, + "step": 344 + }, + { + "epoch": 0.018508583690987124, + "grad_norm": 0.36328125, + "learning_rate": 6.169527896995708e-07, + "loss": 2.2695, + "step": 345 + }, + { + "epoch": 0.018562231759656652, + "grad_norm": 0.44921875, + "learning_rate": 6.187410586552218e-07, + "loss": 2.1789, + "step": 346 + }, + { + "epoch": 0.01861587982832618, + "grad_norm": 0.39453125, + "learning_rate": 6.205293276108728e-07, + "loss": 2.6383, + "step": 347 + }, + { + "epoch": 0.01866952789699571, + "grad_norm": 0.291015625, + "learning_rate": 6.223175965665236e-07, + "loss": 2.2311, + "step": 348 + }, + { + "epoch": 0.018723175965665237, + "grad_norm": 0.35546875, + "learning_rate": 6.241058655221746e-07, + "loss": 2.4769, + "step": 349 + }, + { + "epoch": 0.018776824034334765, + "grad_norm": 0.259765625, + "learning_rate": 6.258941344778255e-07, + "loss": 2.528, + "step": 350 + }, + { + "epoch": 0.018830472103004293, + "grad_norm": 0.59375, + "learning_rate": 6.276824034334765e-07, + "loss": 2.2412, + "step": 351 + }, + { + "epoch": 0.01888412017167382, + "grad_norm": 0.373046875, + "learning_rate": 6.294706723891273e-07, + "loss": 2.5979, + "step": 352 + }, + { + "epoch": 0.018937768240343347, + "grad_norm": 0.2734375, + "learning_rate": 6.312589413447782e-07, + "loss": 2.312, + "step": 353 + }, + { + "epoch": 0.018991416309012875, + "grad_norm": 0.50390625, + "learning_rate": 6.330472103004293e-07, + "loss": 2.6484, + "step": 354 + }, + { + "epoch": 0.019045064377682403, + "grad_norm": 0.27734375, + "learning_rate": 6.348354792560802e-07, + "loss": 2.5796, + "step": 355 + }, + { + "epoch": 0.01909871244635193, + "grad_norm": 0.365234375, + "learning_rate": 6.366237482117312e-07, + "loss": 2.5359, + "step": 356 + }, + { + "epoch": 0.01915236051502146, + "grad_norm": 0.3359375, + "learning_rate": 6.38412017167382e-07, + "loss": 2.4539, + "step": 357 + }, + { + "epoch": 0.019206008583690988, + "grad_norm": 0.3828125, + "learning_rate": 6.402002861230329e-07, + "loss": 2.6497, + "step": 358 + }, + { + "epoch": 0.019259656652360516, + "grad_norm": 0.365234375, + "learning_rate": 6.419885550786839e-07, + "loss": 2.5039, + "step": 359 + }, + { + "epoch": 0.019313304721030045, + "grad_norm": 0.40625, + "learning_rate": 6.437768240343348e-07, + "loss": 2.562, + "step": 360 + }, + { + "epoch": 0.01936695278969957, + "grad_norm": 0.41015625, + "learning_rate": 6.455650929899858e-07, + "loss": 2.432, + "step": 361 + }, + { + "epoch": 0.019420600858369098, + "grad_norm": 0.88671875, + "learning_rate": 6.473533619456366e-07, + "loss": 2.4255, + "step": 362 + }, + { + "epoch": 0.019474248927038626, + "grad_norm": 0.5078125, + "learning_rate": 6.491416309012875e-07, + "loss": 2.5831, + "step": 363 + }, + { + "epoch": 0.019527896995708154, + "grad_norm": 0.40625, + "learning_rate": 6.509298998569386e-07, + "loss": 2.524, + "step": 364 + }, + { + "epoch": 0.019581545064377683, + "grad_norm": 0.43359375, + "learning_rate": 6.527181688125895e-07, + "loss": 2.6172, + "step": 365 + }, + { + "epoch": 0.01963519313304721, + "grad_norm": 0.53515625, + "learning_rate": 6.545064377682405e-07, + "loss": 2.3431, + "step": 366 + }, + { + "epoch": 0.01968884120171674, + "grad_norm": 0.3984375, + "learning_rate": 6.562947067238913e-07, + "loss": 2.3687, + "step": 367 + }, + { + "epoch": 0.019742489270386267, + "grad_norm": 1.1796875, + "learning_rate": 6.580829756795423e-07, + "loss": 2.2573, + "step": 368 + }, + { + "epoch": 0.019796137339055796, + "grad_norm": 0.4765625, + "learning_rate": 6.598712446351932e-07, + "loss": 2.4733, + "step": 369 + }, + { + "epoch": 0.01984978540772532, + "grad_norm": 0.310546875, + "learning_rate": 6.616595135908441e-07, + "loss": 2.6257, + "step": 370 + }, + { + "epoch": 0.01990343347639485, + "grad_norm": 0.46875, + "learning_rate": 6.63447782546495e-07, + "loss": 2.3036, + "step": 371 + }, + { + "epoch": 0.019957081545064377, + "grad_norm": 0.400390625, + "learning_rate": 6.652360515021459e-07, + "loss": 2.6102, + "step": 372 + }, + { + "epoch": 0.020010729613733905, + "grad_norm": 0.3125, + "learning_rate": 6.67024320457797e-07, + "loss": 2.5883, + "step": 373 + }, + { + "epoch": 0.020064377682403434, + "grad_norm": 0.91015625, + "learning_rate": 6.688125894134479e-07, + "loss": 2.6761, + "step": 374 + }, + { + "epoch": 0.020118025751072962, + "grad_norm": 0.3046875, + "learning_rate": 6.706008583690988e-07, + "loss": 2.1163, + "step": 375 + }, + { + "epoch": 0.02017167381974249, + "grad_norm": 0.333984375, + "learning_rate": 6.723891273247497e-07, + "loss": 2.4389, + "step": 376 + }, + { + "epoch": 0.02022532188841202, + "grad_norm": 0.34375, + "learning_rate": 6.741773962804006e-07, + "loss": 2.6518, + "step": 377 + }, + { + "epoch": 0.020278969957081543, + "grad_norm": 0.40234375, + "learning_rate": 6.759656652360516e-07, + "loss": 2.5494, + "step": 378 + }, + { + "epoch": 0.02033261802575107, + "grad_norm": 0.291015625, + "learning_rate": 6.777539341917025e-07, + "loss": 2.3615, + "step": 379 + }, + { + "epoch": 0.0203862660944206, + "grad_norm": 0.349609375, + "learning_rate": 6.795422031473534e-07, + "loss": 2.3903, + "step": 380 + }, + { + "epoch": 0.020439914163090128, + "grad_norm": 0.390625, + "learning_rate": 6.813304721030043e-07, + "loss": 2.2546, + "step": 381 + }, + { + "epoch": 0.020493562231759656, + "grad_norm": 0.34375, + "learning_rate": 6.831187410586552e-07, + "loss": 2.762, + "step": 382 + }, + { + "epoch": 0.020547210300429185, + "grad_norm": 0.2890625, + "learning_rate": 6.849070100143063e-07, + "loss": 2.5368, + "step": 383 + }, + { + "epoch": 0.020600858369098713, + "grad_norm": 0.318359375, + "learning_rate": 6.866952789699572e-07, + "loss": 2.6037, + "step": 384 + }, + { + "epoch": 0.02065450643776824, + "grad_norm": 0.404296875, + "learning_rate": 6.88483547925608e-07, + "loss": 2.2548, + "step": 385 + }, + { + "epoch": 0.02070815450643777, + "grad_norm": 0.43359375, + "learning_rate": 6.90271816881259e-07, + "loss": 2.4479, + "step": 386 + }, + { + "epoch": 0.020761802575107294, + "grad_norm": 0.32421875, + "learning_rate": 6.920600858369099e-07, + "loss": 2.3025, + "step": 387 + }, + { + "epoch": 0.020815450643776823, + "grad_norm": 0.54296875, + "learning_rate": 6.938483547925609e-07, + "loss": 2.5665, + "step": 388 + }, + { + "epoch": 0.02086909871244635, + "grad_norm": 0.384765625, + "learning_rate": 6.956366237482118e-07, + "loss": 2.4038, + "step": 389 + }, + { + "epoch": 0.02092274678111588, + "grad_norm": 0.275390625, + "learning_rate": 6.974248927038626e-07, + "loss": 2.4645, + "step": 390 + }, + { + "epoch": 0.020976394849785408, + "grad_norm": 0.7109375, + "learning_rate": 6.992131616595136e-07, + "loss": 2.4951, + "step": 391 + }, + { + "epoch": 0.021030042918454936, + "grad_norm": 0.61328125, + "learning_rate": 7.010014306151645e-07, + "loss": 2.6755, + "step": 392 + }, + { + "epoch": 0.021083690987124464, + "grad_norm": 0.6953125, + "learning_rate": 7.027896995708156e-07, + "loss": 2.8145, + "step": 393 + }, + { + "epoch": 0.021137339055793992, + "grad_norm": 1.1015625, + "learning_rate": 7.045779685264665e-07, + "loss": 2.4966, + "step": 394 + }, + { + "epoch": 0.02119098712446352, + "grad_norm": 0.490234375, + "learning_rate": 7.063662374821173e-07, + "loss": 2.6325, + "step": 395 + }, + { + "epoch": 0.021244635193133046, + "grad_norm": 4.625, + "learning_rate": 7.081545064377683e-07, + "loss": 2.4619, + "step": 396 + }, + { + "epoch": 0.021298283261802574, + "grad_norm": 0.4453125, + "learning_rate": 7.099427753934192e-07, + "loss": 2.2704, + "step": 397 + }, + { + "epoch": 0.021351931330472102, + "grad_norm": 0.482421875, + "learning_rate": 7.117310443490702e-07, + "loss": 2.3632, + "step": 398 + }, + { + "epoch": 0.02140557939914163, + "grad_norm": 0.318359375, + "learning_rate": 7.13519313304721e-07, + "loss": 2.0946, + "step": 399 + }, + { + "epoch": 0.02145922746781116, + "grad_norm": 0.35546875, + "learning_rate": 7.15307582260372e-07, + "loss": 2.6352, + "step": 400 + }, + { + "epoch": 0.021512875536480687, + "grad_norm": 0.328125, + "learning_rate": 7.170958512160229e-07, + "loss": 2.5781, + "step": 401 + }, + { + "epoch": 0.021566523605150215, + "grad_norm": 0.330078125, + "learning_rate": 7.188841201716738e-07, + "loss": 2.3473, + "step": 402 + }, + { + "epoch": 0.021620171673819744, + "grad_norm": 0.314453125, + "learning_rate": 7.206723891273249e-07, + "loss": 2.575, + "step": 403 + }, + { + "epoch": 0.021673819742489272, + "grad_norm": 0.54296875, + "learning_rate": 7.224606580829757e-07, + "loss": 2.1073, + "step": 404 + }, + { + "epoch": 0.021727467811158797, + "grad_norm": 0.326171875, + "learning_rate": 7.242489270386267e-07, + "loss": 2.5382, + "step": 405 + }, + { + "epoch": 0.021781115879828325, + "grad_norm": 0.357421875, + "learning_rate": 7.260371959942776e-07, + "loss": 2.3385, + "step": 406 + }, + { + "epoch": 0.021834763948497853, + "grad_norm": 0.388671875, + "learning_rate": 7.278254649499285e-07, + "loss": 2.1501, + "step": 407 + }, + { + "epoch": 0.02188841201716738, + "grad_norm": 0.26171875, + "learning_rate": 7.296137339055795e-07, + "loss": 2.3243, + "step": 408 + }, + { + "epoch": 0.02194206008583691, + "grad_norm": 0.640625, + "learning_rate": 7.314020028612303e-07, + "loss": 2.1815, + "step": 409 + }, + { + "epoch": 0.021995708154506438, + "grad_norm": 0.52734375, + "learning_rate": 7.331902718168814e-07, + "loss": 2.6204, + "step": 410 + }, + { + "epoch": 0.022049356223175966, + "grad_norm": 0.34375, + "learning_rate": 7.349785407725323e-07, + "loss": 2.4681, + "step": 411 + }, + { + "epoch": 0.022103004291845495, + "grad_norm": 0.375, + "learning_rate": 7.367668097281831e-07, + "loss": 2.7185, + "step": 412 + }, + { + "epoch": 0.022156652360515023, + "grad_norm": 0.484375, + "learning_rate": 7.385550786838342e-07, + "loss": 2.4633, + "step": 413 + }, + { + "epoch": 0.022210300429184548, + "grad_norm": 0.443359375, + "learning_rate": 7.40343347639485e-07, + "loss": 2.9364, + "step": 414 + }, + { + "epoch": 0.022263948497854076, + "grad_norm": 1.546875, + "learning_rate": 7.42131616595136e-07, + "loss": 2.5495, + "step": 415 + }, + { + "epoch": 0.022317596566523604, + "grad_norm": 0.55078125, + "learning_rate": 7.439198855507869e-07, + "loss": 2.6852, + "step": 416 + }, + { + "epoch": 0.022371244635193133, + "grad_norm": 0.50390625, + "learning_rate": 7.457081545064378e-07, + "loss": 1.7639, + "step": 417 + }, + { + "epoch": 0.02242489270386266, + "grad_norm": 0.279296875, + "learning_rate": 7.474964234620888e-07, + "loss": 2.199, + "step": 418 + }, + { + "epoch": 0.02247854077253219, + "grad_norm": 0.337890625, + "learning_rate": 7.492846924177396e-07, + "loss": 2.4641, + "step": 419 + }, + { + "epoch": 0.022532188841201718, + "grad_norm": 0.458984375, + "learning_rate": 7.510729613733907e-07, + "loss": 1.6108, + "step": 420 + }, + { + "epoch": 0.022585836909871246, + "grad_norm": 0.30078125, + "learning_rate": 7.528612303290416e-07, + "loss": 2.4512, + "step": 421 + }, + { + "epoch": 0.022639484978540774, + "grad_norm": 0.35546875, + "learning_rate": 7.546494992846925e-07, + "loss": 2.3345, + "step": 422 + }, + { + "epoch": 0.0226931330472103, + "grad_norm": 0.439453125, + "learning_rate": 7.564377682403434e-07, + "loss": 2.576, + "step": 423 + }, + { + "epoch": 0.022746781115879827, + "grad_norm": 0.55078125, + "learning_rate": 7.582260371959943e-07, + "loss": 2.4091, + "step": 424 + }, + { + "epoch": 0.022800429184549355, + "grad_norm": 0.375, + "learning_rate": 7.600143061516453e-07, + "loss": 2.2566, + "step": 425 + }, + { + "epoch": 0.022854077253218884, + "grad_norm": 0.3828125, + "learning_rate": 7.618025751072962e-07, + "loss": 2.4053, + "step": 426 + }, + { + "epoch": 0.022907725321888412, + "grad_norm": 0.6484375, + "learning_rate": 7.635908440629471e-07, + "loss": 2.7371, + "step": 427 + }, + { + "epoch": 0.02296137339055794, + "grad_norm": 0.328125, + "learning_rate": 7.65379113018598e-07, + "loss": 2.6196, + "step": 428 + }, + { + "epoch": 0.02301502145922747, + "grad_norm": 0.43359375, + "learning_rate": 7.671673819742489e-07, + "loss": 2.4226, + "step": 429 + }, + { + "epoch": 0.023068669527896997, + "grad_norm": 0.412109375, + "learning_rate": 7.689556509299e-07, + "loss": 2.5252, + "step": 430 + }, + { + "epoch": 0.023122317596566525, + "grad_norm": 2.546875, + "learning_rate": 7.707439198855509e-07, + "loss": 2.2297, + "step": 431 + }, + { + "epoch": 0.02317596566523605, + "grad_norm": 0.4453125, + "learning_rate": 7.725321888412019e-07, + "loss": 2.4877, + "step": 432 + }, + { + "epoch": 0.02322961373390558, + "grad_norm": 0.408203125, + "learning_rate": 7.743204577968527e-07, + "loss": 2.5749, + "step": 433 + }, + { + "epoch": 0.023283261802575107, + "grad_norm": 0.32421875, + "learning_rate": 7.761087267525036e-07, + "loss": 2.3846, + "step": 434 + }, + { + "epoch": 0.023336909871244635, + "grad_norm": 1.15625, + "learning_rate": 7.778969957081546e-07, + "loss": 2.5232, + "step": 435 + }, + { + "epoch": 0.023390557939914163, + "grad_norm": 0.271484375, + "learning_rate": 7.796852646638055e-07, + "loss": 2.4935, + "step": 436 + }, + { + "epoch": 0.02344420600858369, + "grad_norm": 0.3359375, + "learning_rate": 7.814735336194564e-07, + "loss": 2.4845, + "step": 437 + }, + { + "epoch": 0.02349785407725322, + "grad_norm": 1.1171875, + "learning_rate": 7.832618025751073e-07, + "loss": 2.6366, + "step": 438 + }, + { + "epoch": 0.023551502145922748, + "grad_norm": 0.306640625, + "learning_rate": 7.850500715307582e-07, + "loss": 2.3498, + "step": 439 + }, + { + "epoch": 0.023605150214592276, + "grad_norm": 0.416015625, + "learning_rate": 7.868383404864093e-07, + "loss": 2.5227, + "step": 440 + }, + { + "epoch": 0.0236587982832618, + "grad_norm": 0.333984375, + "learning_rate": 7.886266094420602e-07, + "loss": 2.5374, + "step": 441 + }, + { + "epoch": 0.02371244635193133, + "grad_norm": 0.322265625, + "learning_rate": 7.904148783977111e-07, + "loss": 2.3822, + "step": 442 + }, + { + "epoch": 0.023766094420600858, + "grad_norm": 0.400390625, + "learning_rate": 7.92203147353362e-07, + "loss": 2.4555, + "step": 443 + }, + { + "epoch": 0.023819742489270386, + "grad_norm": 0.4375, + "learning_rate": 7.939914163090129e-07, + "loss": 2.7176, + "step": 444 + }, + { + "epoch": 0.023873390557939914, + "grad_norm": 0.498046875, + "learning_rate": 7.957796852646639e-07, + "loss": 1.7076, + "step": 445 + }, + { + "epoch": 0.023927038626609443, + "grad_norm": 0.306640625, + "learning_rate": 7.975679542203148e-07, + "loss": 2.5388, + "step": 446 + }, + { + "epoch": 0.02398068669527897, + "grad_norm": 0.34765625, + "learning_rate": 7.993562231759657e-07, + "loss": 2.4425, + "step": 447 + }, + { + "epoch": 0.0240343347639485, + "grad_norm": 0.306640625, + "learning_rate": 8.011444921316166e-07, + "loss": 2.4235, + "step": 448 + }, + { + "epoch": 0.024087982832618027, + "grad_norm": 0.376953125, + "learning_rate": 8.029327610872675e-07, + "loss": 2.1424, + "step": 449 + }, + { + "epoch": 0.024141630901287552, + "grad_norm": 0.478515625, + "learning_rate": 8.047210300429186e-07, + "loss": 2.6018, + "step": 450 + }, + { + "epoch": 0.02419527896995708, + "grad_norm": 0.439453125, + "learning_rate": 8.065092989985695e-07, + "loss": 2.6914, + "step": 451 + }, + { + "epoch": 0.02424892703862661, + "grad_norm": 0.34375, + "learning_rate": 8.082975679542204e-07, + "loss": 2.7616, + "step": 452 + }, + { + "epoch": 0.024302575107296137, + "grad_norm": 0.294921875, + "learning_rate": 8.100858369098713e-07, + "loss": 2.4871, + "step": 453 + }, + { + "epoch": 0.024356223175965665, + "grad_norm": 0.796875, + "learning_rate": 8.118741058655222e-07, + "loss": 2.4732, + "step": 454 + }, + { + "epoch": 0.024409871244635194, + "grad_norm": 0.3671875, + "learning_rate": 8.136623748211732e-07, + "loss": 2.5515, + "step": 455 + }, + { + "epoch": 0.024463519313304722, + "grad_norm": 0.4375, + "learning_rate": 8.15450643776824e-07, + "loss": 2.2687, + "step": 456 + }, + { + "epoch": 0.02451716738197425, + "grad_norm": 0.46484375, + "learning_rate": 8.17238912732475e-07, + "loss": 2.6547, + "step": 457 + }, + { + "epoch": 0.024570815450643775, + "grad_norm": 1.0703125, + "learning_rate": 8.190271816881259e-07, + "loss": 2.6164, + "step": 458 + }, + { + "epoch": 0.024624463519313303, + "grad_norm": 0.33203125, + "learning_rate": 8.208154506437768e-07, + "loss": 2.5067, + "step": 459 + }, + { + "epoch": 0.02467811158798283, + "grad_norm": 0.353515625, + "learning_rate": 8.226037195994279e-07, + "loss": 2.486, + "step": 460 + }, + { + "epoch": 0.02473175965665236, + "grad_norm": 0.5078125, + "learning_rate": 8.243919885550787e-07, + "loss": 2.3203, + "step": 461 + }, + { + "epoch": 0.024785407725321888, + "grad_norm": 0.470703125, + "learning_rate": 8.261802575107297e-07, + "loss": 2.5493, + "step": 462 + }, + { + "epoch": 0.024839055793991417, + "grad_norm": 0.37109375, + "learning_rate": 8.279685264663806e-07, + "loss": 2.4726, + "step": 463 + }, + { + "epoch": 0.024892703862660945, + "grad_norm": 0.341796875, + "learning_rate": 8.297567954220316e-07, + "loss": 2.6852, + "step": 464 + }, + { + "epoch": 0.024946351931330473, + "grad_norm": 0.380859375, + "learning_rate": 8.315450643776825e-07, + "loss": 2.1376, + "step": 465 + }, + { + "epoch": 0.025, + "grad_norm": 0.546875, + "learning_rate": 8.333333333333333e-07, + "loss": 2.3662, + "step": 466 + }, + { + "epoch": 0.025053648068669526, + "grad_norm": 0.421875, + "learning_rate": 8.351216022889844e-07, + "loss": 2.5358, + "step": 467 + }, + { + "epoch": 0.025107296137339054, + "grad_norm": 0.34765625, + "learning_rate": 8.369098712446352e-07, + "loss": 2.5964, + "step": 468 + }, + { + "epoch": 0.025160944206008583, + "grad_norm": 6.46875, + "learning_rate": 8.386981402002863e-07, + "loss": 2.3873, + "step": 469 + }, + { + "epoch": 0.02521459227467811, + "grad_norm": 0.34375, + "learning_rate": 8.404864091559372e-07, + "loss": 2.4337, + "step": 470 + }, + { + "epoch": 0.02526824034334764, + "grad_norm": 0.271484375, + "learning_rate": 8.42274678111588e-07, + "loss": 2.1511, + "step": 471 + }, + { + "epoch": 0.025321888412017168, + "grad_norm": 0.376953125, + "learning_rate": 8.44062947067239e-07, + "loss": 2.2533, + "step": 472 + }, + { + "epoch": 0.025375536480686696, + "grad_norm": 0.396484375, + "learning_rate": 8.458512160228899e-07, + "loss": 2.6373, + "step": 473 + }, + { + "epoch": 0.025429184549356224, + "grad_norm": 0.51171875, + "learning_rate": 8.476394849785409e-07, + "loss": 2.3247, + "step": 474 + }, + { + "epoch": 0.025482832618025753, + "grad_norm": 0.318359375, + "learning_rate": 8.494277539341917e-07, + "loss": 2.7036, + "step": 475 + }, + { + "epoch": 0.025536480686695277, + "grad_norm": 0.443359375, + "learning_rate": 8.512160228898426e-07, + "loss": 2.5895, + "step": 476 + }, + { + "epoch": 0.025590128755364806, + "grad_norm": 0.37890625, + "learning_rate": 8.530042918454937e-07, + "loss": 2.4512, + "step": 477 + }, + { + "epoch": 0.025643776824034334, + "grad_norm": 4.34375, + "learning_rate": 8.547925608011446e-07, + "loss": 1.327, + "step": 478 + }, + { + "epoch": 0.025697424892703862, + "grad_norm": 0.3359375, + "learning_rate": 8.565808297567956e-07, + "loss": 2.5544, + "step": 479 + }, + { + "epoch": 0.02575107296137339, + "grad_norm": 0.392578125, + "learning_rate": 8.583690987124464e-07, + "loss": 2.4943, + "step": 480 + }, + { + "epoch": 0.02580472103004292, + "grad_norm": 0.365234375, + "learning_rate": 8.601573676680973e-07, + "loss": 2.6054, + "step": 481 + }, + { + "epoch": 0.025858369098712447, + "grad_norm": 0.3125, + "learning_rate": 8.619456366237483e-07, + "loss": 2.4367, + "step": 482 + }, + { + "epoch": 0.025912017167381975, + "grad_norm": 0.478515625, + "learning_rate": 8.637339055793992e-07, + "loss": 2.4768, + "step": 483 + }, + { + "epoch": 0.025965665236051504, + "grad_norm": 0.42578125, + "learning_rate": 8.655221745350502e-07, + "loss": 1.9497, + "step": 484 + }, + { + "epoch": 0.02601931330472103, + "grad_norm": 0.33203125, + "learning_rate": 8.67310443490701e-07, + "loss": 2.6494, + "step": 485 + }, + { + "epoch": 0.026072961373390557, + "grad_norm": 0.330078125, + "learning_rate": 8.690987124463519e-07, + "loss": 2.4921, + "step": 486 + }, + { + "epoch": 0.026126609442060085, + "grad_norm": 0.3671875, + "learning_rate": 8.70886981402003e-07, + "loss": 2.4631, + "step": 487 + }, + { + "epoch": 0.026180257510729613, + "grad_norm": 0.36328125, + "learning_rate": 8.726752503576539e-07, + "loss": 1.8175, + "step": 488 + }, + { + "epoch": 0.02623390557939914, + "grad_norm": 0.515625, + "learning_rate": 8.744635193133049e-07, + "loss": 2.4221, + "step": 489 + }, + { + "epoch": 0.02628755364806867, + "grad_norm": 0.3125, + "learning_rate": 8.762517882689557e-07, + "loss": 2.165, + "step": 490 + }, + { + "epoch": 0.026341201716738198, + "grad_norm": 0.3359375, + "learning_rate": 8.780400572246066e-07, + "loss": 2.507, + "step": 491 + }, + { + "epoch": 0.026394849785407726, + "grad_norm": 0.455078125, + "learning_rate": 8.798283261802576e-07, + "loss": 2.4094, + "step": 492 + }, + { + "epoch": 0.026448497854077255, + "grad_norm": 0.322265625, + "learning_rate": 8.816165951359085e-07, + "loss": 2.543, + "step": 493 + }, + { + "epoch": 0.02650214592274678, + "grad_norm": 0.37109375, + "learning_rate": 8.834048640915594e-07, + "loss": 2.3122, + "step": 494 + }, + { + "epoch": 0.026555793991416308, + "grad_norm": 0.498046875, + "learning_rate": 8.851931330472103e-07, + "loss": 2.7045, + "step": 495 + }, + { + "epoch": 0.026609442060085836, + "grad_norm": 0.3046875, + "learning_rate": 8.869814020028614e-07, + "loss": 2.4252, + "step": 496 + }, + { + "epoch": 0.026663090128755364, + "grad_norm": 0.28125, + "learning_rate": 8.887696709585123e-07, + "loss": 2.7205, + "step": 497 + }, + { + "epoch": 0.026716738197424893, + "grad_norm": 0.341796875, + "learning_rate": 8.905579399141632e-07, + "loss": 2.398, + "step": 498 + }, + { + "epoch": 0.02677038626609442, + "grad_norm": 0.353515625, + "learning_rate": 8.923462088698141e-07, + "loss": 2.3684, + "step": 499 + }, + { + "epoch": 0.02682403433476395, + "grad_norm": 0.341796875, + "learning_rate": 8.94134477825465e-07, + "loss": 2.509, + "step": 500 + }, + { + "epoch": 0.026877682403433478, + "grad_norm": 0.328125, + "learning_rate": 8.95922746781116e-07, + "loss": 2.6755, + "step": 501 + }, + { + "epoch": 0.026931330472103006, + "grad_norm": 0.283203125, + "learning_rate": 8.977110157367669e-07, + "loss": 2.4974, + "step": 502 + }, + { + "epoch": 0.02698497854077253, + "grad_norm": 1.1484375, + "learning_rate": 8.994992846924177e-07, + "loss": 2.8117, + "step": 503 + }, + { + "epoch": 0.02703862660944206, + "grad_norm": 0.38671875, + "learning_rate": 9.012875536480687e-07, + "loss": 2.3854, + "step": 504 + }, + { + "epoch": 0.027092274678111587, + "grad_norm": 0.359375, + "learning_rate": 9.030758226037196e-07, + "loss": 2.3714, + "step": 505 + }, + { + "epoch": 0.027145922746781116, + "grad_norm": 0.306640625, + "learning_rate": 9.048640915593707e-07, + "loss": 2.6873, + "step": 506 + }, + { + "epoch": 0.027199570815450644, + "grad_norm": 0.33984375, + "learning_rate": 9.066523605150216e-07, + "loss": 2.3912, + "step": 507 + }, + { + "epoch": 0.027253218884120172, + "grad_norm": 0.29296875, + "learning_rate": 9.084406294706724e-07, + "loss": 2.3976, + "step": 508 + }, + { + "epoch": 0.0273068669527897, + "grad_norm": 0.298828125, + "learning_rate": 9.102288984263234e-07, + "loss": 2.4203, + "step": 509 + }, + { + "epoch": 0.02736051502145923, + "grad_norm": 1.015625, + "learning_rate": 9.120171673819743e-07, + "loss": 2.5275, + "step": 510 + }, + { + "epoch": 0.027414163090128757, + "grad_norm": 0.5078125, + "learning_rate": 9.138054363376253e-07, + "loss": 2.5554, + "step": 511 + }, + { + "epoch": 0.027467811158798282, + "grad_norm": 0.59765625, + "learning_rate": 9.155937052932762e-07, + "loss": 1.7675, + "step": 512 + }, + { + "epoch": 0.02752145922746781, + "grad_norm": 0.357421875, + "learning_rate": 9.17381974248927e-07, + "loss": 2.3573, + "step": 513 + }, + { + "epoch": 0.02757510729613734, + "grad_norm": 0.33984375, + "learning_rate": 9.19170243204578e-07, + "loss": 2.3346, + "step": 514 + }, + { + "epoch": 0.027628755364806867, + "grad_norm": 0.34765625, + "learning_rate": 9.209585121602289e-07, + "loss": 2.3726, + "step": 515 + }, + { + "epoch": 0.027682403433476395, + "grad_norm": 0.42578125, + "learning_rate": 9.2274678111588e-07, + "loss": 2.3747, + "step": 516 + }, + { + "epoch": 0.027736051502145923, + "grad_norm": 1.2421875, + "learning_rate": 9.245350500715309e-07, + "loss": 2.5336, + "step": 517 + }, + { + "epoch": 0.02778969957081545, + "grad_norm": 0.435546875, + "learning_rate": 9.263233190271817e-07, + "loss": 2.3231, + "step": 518 + }, + { + "epoch": 0.02784334763948498, + "grad_norm": 0.375, + "learning_rate": 9.281115879828327e-07, + "loss": 2.4974, + "step": 519 + }, + { + "epoch": 0.027896995708154508, + "grad_norm": 0.36328125, + "learning_rate": 9.298998569384836e-07, + "loss": 2.4507, + "step": 520 + }, + { + "epoch": 0.027950643776824033, + "grad_norm": 0.51953125, + "learning_rate": 9.316881258941346e-07, + "loss": 2.1741, + "step": 521 + }, + { + "epoch": 0.02800429184549356, + "grad_norm": 0.337890625, + "learning_rate": 9.334763948497854e-07, + "loss": 2.6607, + "step": 522 + }, + { + "epoch": 0.02805793991416309, + "grad_norm": 0.3671875, + "learning_rate": 9.352646638054363e-07, + "loss": 2.2669, + "step": 523 + }, + { + "epoch": 0.028111587982832618, + "grad_norm": 0.4921875, + "learning_rate": 9.370529327610874e-07, + "loss": 2.4106, + "step": 524 + }, + { + "epoch": 0.028165236051502146, + "grad_norm": 0.443359375, + "learning_rate": 9.388412017167382e-07, + "loss": 2.3468, + "step": 525 + }, + { + "epoch": 0.028218884120171674, + "grad_norm": 0.462890625, + "learning_rate": 9.406294706723893e-07, + "loss": 2.2069, + "step": 526 + }, + { + "epoch": 0.028272532188841203, + "grad_norm": 0.5625, + "learning_rate": 9.424177396280401e-07, + "loss": 1.9182, + "step": 527 + }, + { + "epoch": 0.02832618025751073, + "grad_norm": 0.40625, + "learning_rate": 9.442060085836911e-07, + "loss": 2.412, + "step": 528 + }, + { + "epoch": 0.02837982832618026, + "grad_norm": 0.5625, + "learning_rate": 9.45994277539342e-07, + "loss": 2.2071, + "step": 529 + }, + { + "epoch": 0.028433476394849784, + "grad_norm": 0.357421875, + "learning_rate": 9.477825464949929e-07, + "loss": 1.9348, + "step": 530 + }, + { + "epoch": 0.028487124463519312, + "grad_norm": 0.404296875, + "learning_rate": 9.495708154506439e-07, + "loss": 2.3738, + "step": 531 + }, + { + "epoch": 0.02854077253218884, + "grad_norm": 0.41796875, + "learning_rate": 9.513590844062947e-07, + "loss": 2.5984, + "step": 532 + }, + { + "epoch": 0.02859442060085837, + "grad_norm": 0.310546875, + "learning_rate": 9.531473533619458e-07, + "loss": 2.3606, + "step": 533 + }, + { + "epoch": 0.028648068669527897, + "grad_norm": 0.4375, + "learning_rate": 9.549356223175967e-07, + "loss": 2.1681, + "step": 534 + }, + { + "epoch": 0.028701716738197425, + "grad_norm": 0.33203125, + "learning_rate": 9.567238912732476e-07, + "loss": 1.9664, + "step": 535 + }, + { + "epoch": 0.028755364806866954, + "grad_norm": 0.439453125, + "learning_rate": 9.585121602288984e-07, + "loss": 2.3031, + "step": 536 + }, + { + "epoch": 0.028809012875536482, + "grad_norm": 0.318359375, + "learning_rate": 9.603004291845493e-07, + "loss": 2.6263, + "step": 537 + }, + { + "epoch": 0.02886266094420601, + "grad_norm": 0.41796875, + "learning_rate": 9.620886981402004e-07, + "loss": 2.5548, + "step": 538 + }, + { + "epoch": 0.028916309012875535, + "grad_norm": 0.310546875, + "learning_rate": 9.638769670958513e-07, + "loss": 2.4333, + "step": 539 + }, + { + "epoch": 0.028969957081545063, + "grad_norm": 3.0625, + "learning_rate": 9.656652360515022e-07, + "loss": 2.4949, + "step": 540 + }, + { + "epoch": 0.02902360515021459, + "grad_norm": 0.412109375, + "learning_rate": 9.674535050071533e-07, + "loss": 2.4585, + "step": 541 + }, + { + "epoch": 0.02907725321888412, + "grad_norm": 0.306640625, + "learning_rate": 9.692417739628041e-07, + "loss": 2.4813, + "step": 542 + }, + { + "epoch": 0.02913090128755365, + "grad_norm": 0.9921875, + "learning_rate": 9.71030042918455e-07, + "loss": 1.7287, + "step": 543 + }, + { + "epoch": 0.029184549356223177, + "grad_norm": 0.474609375, + "learning_rate": 9.728183118741059e-07, + "loss": 2.4169, + "step": 544 + }, + { + "epoch": 0.029238197424892705, + "grad_norm": 0.283203125, + "learning_rate": 9.746065808297568e-07, + "loss": 2.5388, + "step": 545 + }, + { + "epoch": 0.029291845493562233, + "grad_norm": 0.373046875, + "learning_rate": 9.763948497854078e-07, + "loss": 2.5995, + "step": 546 + }, + { + "epoch": 0.029345493562231758, + "grad_norm": 0.3359375, + "learning_rate": 9.781831187410587e-07, + "loss": 2.4117, + "step": 547 + }, + { + "epoch": 0.029399141630901286, + "grad_norm": 0.5390625, + "learning_rate": 9.799713876967098e-07, + "loss": 2.4271, + "step": 548 + }, + { + "epoch": 0.029452789699570815, + "grad_norm": 0.408203125, + "learning_rate": 9.817596566523607e-07, + "loss": 2.4392, + "step": 549 + }, + { + "epoch": 0.029506437768240343, + "grad_norm": 0.369140625, + "learning_rate": 9.835479256080116e-07, + "loss": 2.1713, + "step": 550 + }, + { + "epoch": 0.02956008583690987, + "grad_norm": 0.421875, + "learning_rate": 9.853361945636624e-07, + "loss": 2.2869, + "step": 551 + }, + { + "epoch": 0.0296137339055794, + "grad_norm": 0.59765625, + "learning_rate": 9.871244635193133e-07, + "loss": 2.4478, + "step": 552 + }, + { + "epoch": 0.029667381974248928, + "grad_norm": 0.337890625, + "learning_rate": 9.889127324749644e-07, + "loss": 2.3429, + "step": 553 + }, + { + "epoch": 0.029721030042918456, + "grad_norm": 0.59375, + "learning_rate": 9.907010014306153e-07, + "loss": 2.4669, + "step": 554 + }, + { + "epoch": 0.029774678111587984, + "grad_norm": 0.302734375, + "learning_rate": 9.924892703862661e-07, + "loss": 2.3334, + "step": 555 + }, + { + "epoch": 0.02982832618025751, + "grad_norm": 0.318359375, + "learning_rate": 9.94277539341917e-07, + "loss": 2.376, + "step": 556 + }, + { + "epoch": 0.029881974248927037, + "grad_norm": 0.330078125, + "learning_rate": 9.96065808297568e-07, + "loss": 2.5067, + "step": 557 + }, + { + "epoch": 0.029935622317596566, + "grad_norm": 0.5390625, + "learning_rate": 9.97854077253219e-07, + "loss": 2.5734, + "step": 558 + }, + { + "epoch": 0.029989270386266094, + "grad_norm": 0.345703125, + "learning_rate": 9.996423462088699e-07, + "loss": 2.5279, + "step": 559 + }, + { + "epoch": 0.030042918454935622, + "grad_norm": 0.439453125, + "learning_rate": 1.001430615164521e-06, + "loss": 2.3779, + "step": 560 + }, + { + "epoch": 0.03009656652360515, + "grad_norm": 0.373046875, + "learning_rate": 1.0032188841201718e-06, + "loss": 1.9072, + "step": 561 + }, + { + "epoch": 0.03015021459227468, + "grad_norm": 0.7734375, + "learning_rate": 1.0050071530758227e-06, + "loss": 2.42, + "step": 562 + }, + { + "epoch": 0.030203862660944207, + "grad_norm": 0.470703125, + "learning_rate": 1.0067954220314736e-06, + "loss": 2.6524, + "step": 563 + }, + { + "epoch": 0.030257510729613735, + "grad_norm": 0.458984375, + "learning_rate": 1.0085836909871245e-06, + "loss": 2.6576, + "step": 564 + }, + { + "epoch": 0.03031115879828326, + "grad_norm": 0.53125, + "learning_rate": 1.0103719599427755e-06, + "loss": 2.4042, + "step": 565 + }, + { + "epoch": 0.03036480686695279, + "grad_norm": 0.337890625, + "learning_rate": 1.0121602288984264e-06, + "loss": 2.5033, + "step": 566 + }, + { + "epoch": 0.030418454935622317, + "grad_norm": 0.326171875, + "learning_rate": 1.0139484978540773e-06, + "loss": 2.417, + "step": 567 + }, + { + "epoch": 0.030472103004291845, + "grad_norm": 0.337890625, + "learning_rate": 1.0157367668097284e-06, + "loss": 2.2659, + "step": 568 + }, + { + "epoch": 0.030525751072961373, + "grad_norm": 0.3046875, + "learning_rate": 1.0175250357653793e-06, + "loss": 2.525, + "step": 569 + }, + { + "epoch": 0.0305793991416309, + "grad_norm": 0.396484375, + "learning_rate": 1.0193133047210301e-06, + "loss": 2.4183, + "step": 570 + }, + { + "epoch": 0.03063304721030043, + "grad_norm": 0.408203125, + "learning_rate": 1.021101573676681e-06, + "loss": 2.2936, + "step": 571 + }, + { + "epoch": 0.030686695278969958, + "grad_norm": 0.2890625, + "learning_rate": 1.0228898426323319e-06, + "loss": 2.3341, + "step": 572 + }, + { + "epoch": 0.030740343347639486, + "grad_norm": 0.333984375, + "learning_rate": 1.024678111587983e-06, + "loss": 1.9359, + "step": 573 + }, + { + "epoch": 0.03079399141630901, + "grad_norm": 0.51953125, + "learning_rate": 1.0264663805436338e-06, + "loss": 2.3845, + "step": 574 + }, + { + "epoch": 0.03084763948497854, + "grad_norm": 0.369140625, + "learning_rate": 1.0282546494992847e-06, + "loss": 2.6345, + "step": 575 + }, + { + "epoch": 0.030901287553648068, + "grad_norm": 0.3125, + "learning_rate": 1.0300429184549356e-06, + "loss": 2.3578, + "step": 576 + }, + { + "epoch": 0.030954935622317596, + "grad_norm": 0.41796875, + "learning_rate": 1.0318311874105865e-06, + "loss": 2.3443, + "step": 577 + }, + { + "epoch": 0.031008583690987124, + "grad_norm": 0.369140625, + "learning_rate": 1.0336194563662376e-06, + "loss": 2.5009, + "step": 578 + }, + { + "epoch": 0.031062231759656653, + "grad_norm": 0.4140625, + "learning_rate": 1.0354077253218884e-06, + "loss": 2.5208, + "step": 579 + }, + { + "epoch": 0.03111587982832618, + "grad_norm": 0.271484375, + "learning_rate": 1.0371959942775395e-06, + "loss": 2.4333, + "step": 580 + }, + { + "epoch": 0.03116952789699571, + "grad_norm": 0.515625, + "learning_rate": 1.0389842632331904e-06, + "loss": 2.5238, + "step": 581 + }, + { + "epoch": 0.031223175965665238, + "grad_norm": 0.30078125, + "learning_rate": 1.0407725321888413e-06, + "loss": 2.4288, + "step": 582 + }, + { + "epoch": 0.031276824034334766, + "grad_norm": 0.57421875, + "learning_rate": 1.0425608011444922e-06, + "loss": 2.6612, + "step": 583 + }, + { + "epoch": 0.03133047210300429, + "grad_norm": 0.392578125, + "learning_rate": 1.044349070100143e-06, + "loss": 2.5919, + "step": 584 + }, + { + "epoch": 0.03138412017167382, + "grad_norm": 0.369140625, + "learning_rate": 1.0461373390557941e-06, + "loss": 2.377, + "step": 585 + }, + { + "epoch": 0.03143776824034335, + "grad_norm": 0.41015625, + "learning_rate": 1.047925608011445e-06, + "loss": 2.3663, + "step": 586 + }, + { + "epoch": 0.03149141630901287, + "grad_norm": 0.359375, + "learning_rate": 1.0497138769670959e-06, + "loss": 2.6805, + "step": 587 + }, + { + "epoch": 0.031545064377682404, + "grad_norm": 2.296875, + "learning_rate": 1.051502145922747e-06, + "loss": 2.5538, + "step": 588 + }, + { + "epoch": 0.03159871244635193, + "grad_norm": 0.4140625, + "learning_rate": 1.0532904148783978e-06, + "loss": 2.3455, + "step": 589 + }, + { + "epoch": 0.03165236051502146, + "grad_norm": 0.322265625, + "learning_rate": 1.0550786838340487e-06, + "loss": 2.3263, + "step": 590 + }, + { + "epoch": 0.031706008583690985, + "grad_norm": 0.47265625, + "learning_rate": 1.0568669527896996e-06, + "loss": 1.4262, + "step": 591 + }, + { + "epoch": 0.03175965665236052, + "grad_norm": 0.333984375, + "learning_rate": 1.0586552217453507e-06, + "loss": 2.0717, + "step": 592 + }, + { + "epoch": 0.03181330472103004, + "grad_norm": 0.359375, + "learning_rate": 1.0604434907010015e-06, + "loss": 2.4091, + "step": 593 + }, + { + "epoch": 0.031866952789699574, + "grad_norm": 0.365234375, + "learning_rate": 1.0622317596566524e-06, + "loss": 2.1548, + "step": 594 + }, + { + "epoch": 0.0319206008583691, + "grad_norm": 0.41796875, + "learning_rate": 1.0640200286123035e-06, + "loss": 2.3398, + "step": 595 + }, + { + "epoch": 0.03197424892703862, + "grad_norm": 0.5078125, + "learning_rate": 1.0658082975679544e-06, + "loss": 2.4948, + "step": 596 + }, + { + "epoch": 0.032027896995708155, + "grad_norm": 0.376953125, + "learning_rate": 1.0675965665236053e-06, + "loss": 2.4331, + "step": 597 + }, + { + "epoch": 0.03208154506437768, + "grad_norm": 0.396484375, + "learning_rate": 1.0693848354792561e-06, + "loss": 2.4488, + "step": 598 + }, + { + "epoch": 0.03213519313304721, + "grad_norm": 0.42578125, + "learning_rate": 1.071173104434907e-06, + "loss": 2.4702, + "step": 599 + }, + { + "epoch": 0.032188841201716736, + "grad_norm": 0.333984375, + "learning_rate": 1.072961373390558e-06, + "loss": 1.7696, + "step": 600 + }, + { + "epoch": 0.03224248927038627, + "grad_norm": 0.365234375, + "learning_rate": 1.074749642346209e-06, + "loss": 2.4332, + "step": 601 + }, + { + "epoch": 0.03229613733905579, + "grad_norm": 0.451171875, + "learning_rate": 1.0765379113018599e-06, + "loss": 2.4186, + "step": 602 + }, + { + "epoch": 0.032349785407725325, + "grad_norm": 0.5234375, + "learning_rate": 1.0783261802575107e-06, + "loss": 2.5228, + "step": 603 + }, + { + "epoch": 0.03240343347639485, + "grad_norm": 0.5546875, + "learning_rate": 1.0801144492131616e-06, + "loss": 2.6045, + "step": 604 + }, + { + "epoch": 0.032457081545064374, + "grad_norm": 0.302734375, + "learning_rate": 1.0819027181688127e-06, + "loss": 2.452, + "step": 605 + }, + { + "epoch": 0.032510729613733906, + "grad_norm": 0.6015625, + "learning_rate": 1.0836909871244636e-06, + "loss": 2.4305, + "step": 606 + }, + { + "epoch": 0.03256437768240343, + "grad_norm": 0.2734375, + "learning_rate": 1.0854792560801147e-06, + "loss": 2.3182, + "step": 607 + }, + { + "epoch": 0.03261802575107296, + "grad_norm": 0.470703125, + "learning_rate": 1.0872675250357655e-06, + "loss": 1.7148, + "step": 608 + }, + { + "epoch": 0.03267167381974249, + "grad_norm": 0.41796875, + "learning_rate": 1.0890557939914164e-06, + "loss": 2.6712, + "step": 609 + }, + { + "epoch": 0.03272532188841202, + "grad_norm": 0.310546875, + "learning_rate": 1.0908440629470673e-06, + "loss": 2.4819, + "step": 610 + }, + { + "epoch": 0.032778969957081544, + "grad_norm": 0.322265625, + "learning_rate": 1.0926323319027182e-06, + "loss": 2.2184, + "step": 611 + }, + { + "epoch": 0.032832618025751076, + "grad_norm": 0.671875, + "learning_rate": 1.0944206008583692e-06, + "loss": 2.3071, + "step": 612 + }, + { + "epoch": 0.0328862660944206, + "grad_norm": 0.455078125, + "learning_rate": 1.0962088698140201e-06, + "loss": 2.4986, + "step": 613 + }, + { + "epoch": 0.032939914163090125, + "grad_norm": 0.419921875, + "learning_rate": 1.097997138769671e-06, + "loss": 2.5145, + "step": 614 + }, + { + "epoch": 0.03299356223175966, + "grad_norm": 0.298828125, + "learning_rate": 1.099785407725322e-06, + "loss": 2.6306, + "step": 615 + }, + { + "epoch": 0.03304721030042918, + "grad_norm": 0.330078125, + "learning_rate": 1.101573676680973e-06, + "loss": 2.5035, + "step": 616 + }, + { + "epoch": 0.033100858369098714, + "grad_norm": 0.359375, + "learning_rate": 1.1033619456366238e-06, + "loss": 2.2657, + "step": 617 + }, + { + "epoch": 0.03315450643776824, + "grad_norm": 0.50390625, + "learning_rate": 1.1051502145922747e-06, + "loss": 2.0746, + "step": 618 + }, + { + "epoch": 0.03320815450643777, + "grad_norm": 0.25, + "learning_rate": 1.1069384835479256e-06, + "loss": 2.3618, + "step": 619 + }, + { + "epoch": 0.033261802575107295, + "grad_norm": 0.408203125, + "learning_rate": 1.1087267525035767e-06, + "loss": 2.5551, + "step": 620 + }, + { + "epoch": 0.03331545064377683, + "grad_norm": 0.373046875, + "learning_rate": 1.1105150214592276e-06, + "loss": 2.5902, + "step": 621 + }, + { + "epoch": 0.03336909871244635, + "grad_norm": 0.53515625, + "learning_rate": 1.1123032904148784e-06, + "loss": 2.5069, + "step": 622 + }, + { + "epoch": 0.03342274678111588, + "grad_norm": 0.353515625, + "learning_rate": 1.1140915593705293e-06, + "loss": 2.4267, + "step": 623 + }, + { + "epoch": 0.03347639484978541, + "grad_norm": 0.55859375, + "learning_rate": 1.1158798283261804e-06, + "loss": 2.4193, + "step": 624 + }, + { + "epoch": 0.03353004291845493, + "grad_norm": 0.337890625, + "learning_rate": 1.1176680972818313e-06, + "loss": 2.5159, + "step": 625 + }, + { + "epoch": 0.033583690987124465, + "grad_norm": 0.431640625, + "learning_rate": 1.1194563662374821e-06, + "loss": 2.5467, + "step": 626 + }, + { + "epoch": 0.03363733905579399, + "grad_norm": 0.439453125, + "learning_rate": 1.1212446351931332e-06, + "loss": 2.4166, + "step": 627 + }, + { + "epoch": 0.03369098712446352, + "grad_norm": 0.75390625, + "learning_rate": 1.1230329041487841e-06, + "loss": 2.6872, + "step": 628 + }, + { + "epoch": 0.033744635193133046, + "grad_norm": 0.5, + "learning_rate": 1.124821173104435e-06, + "loss": 2.4373, + "step": 629 + }, + { + "epoch": 0.03379828326180258, + "grad_norm": 1.6953125, + "learning_rate": 1.1266094420600859e-06, + "loss": 2.338, + "step": 630 + }, + { + "epoch": 0.0338519313304721, + "grad_norm": 0.35546875, + "learning_rate": 1.1283977110157367e-06, + "loss": 2.4845, + "step": 631 + }, + { + "epoch": 0.03390557939914163, + "grad_norm": 0.5546875, + "learning_rate": 1.1301859799713878e-06, + "loss": 2.4222, + "step": 632 + }, + { + "epoch": 0.03395922746781116, + "grad_norm": 0.431640625, + "learning_rate": 1.1319742489270387e-06, + "loss": 2.5177, + "step": 633 + }, + { + "epoch": 0.034012875536480684, + "grad_norm": 0.3046875, + "learning_rate": 1.1337625178826898e-06, + "loss": 2.2976, + "step": 634 + }, + { + "epoch": 0.034066523605150216, + "grad_norm": 0.984375, + "learning_rate": 1.1355507868383407e-06, + "loss": 2.3124, + "step": 635 + }, + { + "epoch": 0.03412017167381974, + "grad_norm": 0.36328125, + "learning_rate": 1.1373390557939915e-06, + "loss": 2.3869, + "step": 636 + }, + { + "epoch": 0.03417381974248927, + "grad_norm": 0.388671875, + "learning_rate": 1.1391273247496424e-06, + "loss": 2.7494, + "step": 637 + }, + { + "epoch": 0.0342274678111588, + "grad_norm": 0.4140625, + "learning_rate": 1.1409155937052933e-06, + "loss": 2.3011, + "step": 638 + }, + { + "epoch": 0.03428111587982833, + "grad_norm": 0.408203125, + "learning_rate": 1.1427038626609444e-06, + "loss": 2.4173, + "step": 639 + }, + { + "epoch": 0.034334763948497854, + "grad_norm": 0.44921875, + "learning_rate": 1.1444921316165953e-06, + "loss": 2.4018, + "step": 640 + }, + { + "epoch": 0.03438841201716738, + "grad_norm": 0.484375, + "learning_rate": 1.1462804005722461e-06, + "loss": 2.4956, + "step": 641 + }, + { + "epoch": 0.03444206008583691, + "grad_norm": 0.5078125, + "learning_rate": 1.1480686695278972e-06, + "loss": 2.1901, + "step": 642 + }, + { + "epoch": 0.034495708154506435, + "grad_norm": 0.359375, + "learning_rate": 1.1498569384835479e-06, + "loss": 2.3271, + "step": 643 + }, + { + "epoch": 0.03454935622317597, + "grad_norm": 0.40625, + "learning_rate": 1.151645207439199e-06, + "loss": 2.6526, + "step": 644 + }, + { + "epoch": 0.03460300429184549, + "grad_norm": 0.341796875, + "learning_rate": 1.1534334763948498e-06, + "loss": 2.5722, + "step": 645 + }, + { + "epoch": 0.034656652360515024, + "grad_norm": 0.3203125, + "learning_rate": 1.1552217453505007e-06, + "loss": 2.3245, + "step": 646 + }, + { + "epoch": 0.03471030042918455, + "grad_norm": 0.447265625, + "learning_rate": 1.1570100143061518e-06, + "loss": 2.3918, + "step": 647 + }, + { + "epoch": 0.03476394849785408, + "grad_norm": 8.125, + "learning_rate": 1.1587982832618027e-06, + "loss": 2.543, + "step": 648 + }, + { + "epoch": 0.034817596566523605, + "grad_norm": 0.4140625, + "learning_rate": 1.1605865522174536e-06, + "loss": 2.5161, + "step": 649 + }, + { + "epoch": 0.03487124463519313, + "grad_norm": 0.55859375, + "learning_rate": 1.1623748211731044e-06, + "loss": 2.4413, + "step": 650 + }, + { + "epoch": 0.03492489270386266, + "grad_norm": 0.494140625, + "learning_rate": 1.1641630901287553e-06, + "loss": 1.8642, + "step": 651 + }, + { + "epoch": 0.034978540772532186, + "grad_norm": 0.35546875, + "learning_rate": 1.1659513590844064e-06, + "loss": 2.2716, + "step": 652 + }, + { + "epoch": 0.03503218884120172, + "grad_norm": 0.72265625, + "learning_rate": 1.1677396280400573e-06, + "loss": 2.6355, + "step": 653 + }, + { + "epoch": 0.03508583690987124, + "grad_norm": 0.462890625, + "learning_rate": 1.1695278969957084e-06, + "loss": 2.5791, + "step": 654 + }, + { + "epoch": 0.035139484978540775, + "grad_norm": 0.361328125, + "learning_rate": 1.1713161659513592e-06, + "loss": 2.2059, + "step": 655 + }, + { + "epoch": 0.0351931330472103, + "grad_norm": 0.35546875, + "learning_rate": 1.1731044349070101e-06, + "loss": 2.5133, + "step": 656 + }, + { + "epoch": 0.03524678111587983, + "grad_norm": 0.30859375, + "learning_rate": 1.174892703862661e-06, + "loss": 2.5332, + "step": 657 + }, + { + "epoch": 0.035300429184549356, + "grad_norm": 0.49609375, + "learning_rate": 1.1766809728183119e-06, + "loss": 2.5722, + "step": 658 + }, + { + "epoch": 0.03535407725321888, + "grad_norm": 0.408203125, + "learning_rate": 1.178469241773963e-06, + "loss": 2.9663, + "step": 659 + }, + { + "epoch": 0.03540772532188841, + "grad_norm": 0.30859375, + "learning_rate": 1.1802575107296138e-06, + "loss": 2.3916, + "step": 660 + }, + { + "epoch": 0.03546137339055794, + "grad_norm": 0.78125, + "learning_rate": 1.182045779685265e-06, + "loss": 1.5143, + "step": 661 + }, + { + "epoch": 0.03551502145922747, + "grad_norm": 0.400390625, + "learning_rate": 1.1838340486409158e-06, + "loss": 2.5112, + "step": 662 + }, + { + "epoch": 0.035568669527896994, + "grad_norm": 0.388671875, + "learning_rate": 1.1856223175965667e-06, + "loss": 2.3137, + "step": 663 + }, + { + "epoch": 0.035622317596566526, + "grad_norm": 0.4609375, + "learning_rate": 1.1874105865522175e-06, + "loss": 2.6819, + "step": 664 + }, + { + "epoch": 0.03567596566523605, + "grad_norm": 0.3359375, + "learning_rate": 1.1891988555078684e-06, + "loss": 2.3629, + "step": 665 + }, + { + "epoch": 0.03572961373390558, + "grad_norm": 0.3359375, + "learning_rate": 1.1909871244635195e-06, + "loss": 2.3997, + "step": 666 + }, + { + "epoch": 0.03578326180257511, + "grad_norm": 1.90625, + "learning_rate": 1.1927753934191704e-06, + "loss": 2.6001, + "step": 667 + }, + { + "epoch": 0.03583690987124463, + "grad_norm": 0.392578125, + "learning_rate": 1.1945636623748213e-06, + "loss": 2.6031, + "step": 668 + }, + { + "epoch": 0.035890557939914164, + "grad_norm": 0.57421875, + "learning_rate": 1.1963519313304721e-06, + "loss": 2.6134, + "step": 669 + }, + { + "epoch": 0.03594420600858369, + "grad_norm": 0.388671875, + "learning_rate": 1.198140200286123e-06, + "loss": 2.3309, + "step": 670 + }, + { + "epoch": 0.03599785407725322, + "grad_norm": 0.333984375, + "learning_rate": 1.199928469241774e-06, + "loss": 2.4558, + "step": 671 + }, + { + "epoch": 0.036051502145922745, + "grad_norm": 0.412109375, + "learning_rate": 1.201716738197425e-06, + "loss": 2.2724, + "step": 672 + }, + { + "epoch": 0.03610515021459228, + "grad_norm": 0.34375, + "learning_rate": 1.2035050071530758e-06, + "loss": 2.4542, + "step": 673 + }, + { + "epoch": 0.0361587982832618, + "grad_norm": 0.453125, + "learning_rate": 1.205293276108727e-06, + "loss": 2.6605, + "step": 674 + }, + { + "epoch": 0.036212446351931334, + "grad_norm": 0.36328125, + "learning_rate": 1.2070815450643778e-06, + "loss": 2.3726, + "step": 675 + }, + { + "epoch": 0.03626609442060086, + "grad_norm": 0.373046875, + "learning_rate": 1.2088698140200287e-06, + "loss": 2.3764, + "step": 676 + }, + { + "epoch": 0.03631974248927038, + "grad_norm": 0.33984375, + "learning_rate": 1.2106580829756796e-06, + "loss": 2.5479, + "step": 677 + }, + { + "epoch": 0.036373390557939915, + "grad_norm": 0.27734375, + "learning_rate": 1.2124463519313304e-06, + "loss": 2.7122, + "step": 678 + }, + { + "epoch": 0.03642703862660944, + "grad_norm": 0.32421875, + "learning_rate": 1.2142346208869815e-06, + "loss": 2.6049, + "step": 679 + }, + { + "epoch": 0.03648068669527897, + "grad_norm": 0.40234375, + "learning_rate": 1.2160228898426324e-06, + "loss": 2.3247, + "step": 680 + }, + { + "epoch": 0.036534334763948496, + "grad_norm": 0.6171875, + "learning_rate": 1.2178111587982835e-06, + "loss": 2.465, + "step": 681 + }, + { + "epoch": 0.03658798283261803, + "grad_norm": 0.388671875, + "learning_rate": 1.2195994277539344e-06, + "loss": 2.3235, + "step": 682 + }, + { + "epoch": 0.03664163090128755, + "grad_norm": 0.337890625, + "learning_rate": 1.2213876967095852e-06, + "loss": 2.4169, + "step": 683 + }, + { + "epoch": 0.036695278969957085, + "grad_norm": 0.279296875, + "learning_rate": 1.2231759656652361e-06, + "loss": 2.4014, + "step": 684 + }, + { + "epoch": 0.03674892703862661, + "grad_norm": 0.55078125, + "learning_rate": 1.224964234620887e-06, + "loss": 2.5669, + "step": 685 + }, + { + "epoch": 0.036802575107296134, + "grad_norm": 0.361328125, + "learning_rate": 1.226752503576538e-06, + "loss": 2.2072, + "step": 686 + }, + { + "epoch": 0.036856223175965666, + "grad_norm": 0.3671875, + "learning_rate": 1.228540772532189e-06, + "loss": 2.4111, + "step": 687 + }, + { + "epoch": 0.03690987124463519, + "grad_norm": 0.318359375, + "learning_rate": 1.2303290414878398e-06, + "loss": 2.1474, + "step": 688 + }, + { + "epoch": 0.03696351931330472, + "grad_norm": 0.341796875, + "learning_rate": 1.2321173104434907e-06, + "loss": 2.4994, + "step": 689 + }, + { + "epoch": 0.03701716738197425, + "grad_norm": 0.263671875, + "learning_rate": 1.2339055793991416e-06, + "loss": 2.1547, + "step": 690 + }, + { + "epoch": 0.03707081545064378, + "grad_norm": 0.353515625, + "learning_rate": 1.2356938483547927e-06, + "loss": 2.5118, + "step": 691 + }, + { + "epoch": 0.037124463519313304, + "grad_norm": 0.5546875, + "learning_rate": 1.2374821173104435e-06, + "loss": 2.5398, + "step": 692 + }, + { + "epoch": 0.037178111587982836, + "grad_norm": 0.40234375, + "learning_rate": 1.2392703862660946e-06, + "loss": 2.3984, + "step": 693 + }, + { + "epoch": 0.03723175965665236, + "grad_norm": 0.5703125, + "learning_rate": 1.2410586552217455e-06, + "loss": 2.6697, + "step": 694 + }, + { + "epoch": 0.037285407725321885, + "grad_norm": 0.37109375, + "learning_rate": 1.2428469241773964e-06, + "loss": 2.4184, + "step": 695 + }, + { + "epoch": 0.03733905579399142, + "grad_norm": 0.291015625, + "learning_rate": 1.2446351931330473e-06, + "loss": 2.317, + "step": 696 + }, + { + "epoch": 0.03739270386266094, + "grad_norm": 0.56640625, + "learning_rate": 1.2464234620886981e-06, + "loss": 2.7187, + "step": 697 + }, + { + "epoch": 0.037446351931330474, + "grad_norm": 0.400390625, + "learning_rate": 1.2482117310443492e-06, + "loss": 2.334, + "step": 698 + }, + { + "epoch": 0.0375, + "grad_norm": 0.337890625, + "learning_rate": 1.25e-06, + "loss": 2.26, + "step": 699 + }, + { + "epoch": 0.03755364806866953, + "grad_norm": 0.443359375, + "learning_rate": 1.251788268955651e-06, + "loss": 2.4786, + "step": 700 + }, + { + "epoch": 0.037607296137339055, + "grad_norm": 0.72265625, + "learning_rate": 1.2535765379113019e-06, + "loss": 2.5206, + "step": 701 + }, + { + "epoch": 0.03766094420600859, + "grad_norm": 0.326171875, + "learning_rate": 1.255364806866953e-06, + "loss": 2.6237, + "step": 702 + }, + { + "epoch": 0.03771459227467811, + "grad_norm": 0.302734375, + "learning_rate": 1.2571530758226038e-06, + "loss": 2.6274, + "step": 703 + }, + { + "epoch": 0.03776824034334764, + "grad_norm": 0.4140625, + "learning_rate": 1.2589413447782547e-06, + "loss": 2.5546, + "step": 704 + }, + { + "epoch": 0.03782188841201717, + "grad_norm": 0.318359375, + "learning_rate": 1.2607296137339056e-06, + "loss": 2.6304, + "step": 705 + }, + { + "epoch": 0.03787553648068669, + "grad_norm": 0.404296875, + "learning_rate": 1.2625178826895564e-06, + "loss": 2.7341, + "step": 706 + }, + { + "epoch": 0.037929184549356225, + "grad_norm": 0.578125, + "learning_rate": 1.2643061516452077e-06, + "loss": 2.4848, + "step": 707 + }, + { + "epoch": 0.03798283261802575, + "grad_norm": 1.8515625, + "learning_rate": 1.2660944206008586e-06, + "loss": 2.6051, + "step": 708 + }, + { + "epoch": 0.03803648068669528, + "grad_norm": 0.359375, + "learning_rate": 1.2678826895565095e-06, + "loss": 2.5885, + "step": 709 + }, + { + "epoch": 0.038090128755364806, + "grad_norm": 0.3828125, + "learning_rate": 1.2696709585121604e-06, + "loss": 2.443, + "step": 710 + }, + { + "epoch": 0.03814377682403434, + "grad_norm": 1.1015625, + "learning_rate": 1.271459227467811e-06, + "loss": 2.2928, + "step": 711 + }, + { + "epoch": 0.03819742489270386, + "grad_norm": 0.515625, + "learning_rate": 1.2732474964234623e-06, + "loss": 2.4642, + "step": 712 + }, + { + "epoch": 0.03825107296137339, + "grad_norm": 0.3046875, + "learning_rate": 1.2750357653791132e-06, + "loss": 2.1973, + "step": 713 + }, + { + "epoch": 0.03830472103004292, + "grad_norm": 0.376953125, + "learning_rate": 1.276824034334764e-06, + "loss": 2.5249, + "step": 714 + }, + { + "epoch": 0.038358369098712444, + "grad_norm": 0.462890625, + "learning_rate": 1.278612303290415e-06, + "loss": 2.4157, + "step": 715 + }, + { + "epoch": 0.038412017167381976, + "grad_norm": 0.4609375, + "learning_rate": 1.2804005722460658e-06, + "loss": 2.0761, + "step": 716 + }, + { + "epoch": 0.0384656652360515, + "grad_norm": 2.65625, + "learning_rate": 1.282188841201717e-06, + "loss": 2.0861, + "step": 717 + }, + { + "epoch": 0.03851931330472103, + "grad_norm": 0.40625, + "learning_rate": 1.2839771101573678e-06, + "loss": 2.3872, + "step": 718 + }, + { + "epoch": 0.03857296137339056, + "grad_norm": 0.404296875, + "learning_rate": 1.2857653791130187e-06, + "loss": 2.3345, + "step": 719 + }, + { + "epoch": 0.03862660944206009, + "grad_norm": 0.33984375, + "learning_rate": 1.2875536480686696e-06, + "loss": 2.2226, + "step": 720 + }, + { + "epoch": 0.038680257510729614, + "grad_norm": 0.369140625, + "learning_rate": 1.2893419170243204e-06, + "loss": 2.4944, + "step": 721 + }, + { + "epoch": 0.03873390557939914, + "grad_norm": 0.326171875, + "learning_rate": 1.2911301859799715e-06, + "loss": 2.6303, + "step": 722 + }, + { + "epoch": 0.03878755364806867, + "grad_norm": 2.3125, + "learning_rate": 1.2929184549356224e-06, + "loss": 2.69, + "step": 723 + }, + { + "epoch": 0.038841201716738195, + "grad_norm": 0.359375, + "learning_rate": 1.2947067238912733e-06, + "loss": 2.4801, + "step": 724 + }, + { + "epoch": 0.03889484978540773, + "grad_norm": 0.57421875, + "learning_rate": 1.2964949928469241e-06, + "loss": 2.3558, + "step": 725 + }, + { + "epoch": 0.03894849785407725, + "grad_norm": 0.287109375, + "learning_rate": 1.298283261802575e-06, + "loss": 2.4959, + "step": 726 + }, + { + "epoch": 0.039002145922746784, + "grad_norm": 0.59765625, + "learning_rate": 1.3000715307582263e-06, + "loss": 2.482, + "step": 727 + }, + { + "epoch": 0.03905579399141631, + "grad_norm": 0.322265625, + "learning_rate": 1.3018597997138772e-06, + "loss": 2.4073, + "step": 728 + }, + { + "epoch": 0.03910944206008584, + "grad_norm": 0.412109375, + "learning_rate": 1.303648068669528e-06, + "loss": 2.225, + "step": 729 + }, + { + "epoch": 0.039163090128755365, + "grad_norm": 0.3984375, + "learning_rate": 1.305436337625179e-06, + "loss": 2.3287, + "step": 730 + }, + { + "epoch": 0.03921673819742489, + "grad_norm": 0.419921875, + "learning_rate": 1.30722460658083e-06, + "loss": 2.6098, + "step": 731 + }, + { + "epoch": 0.03927038626609442, + "grad_norm": 0.322265625, + "learning_rate": 1.309012875536481e-06, + "loss": 2.4133, + "step": 732 + }, + { + "epoch": 0.039324034334763946, + "grad_norm": 0.38671875, + "learning_rate": 1.3108011444921318e-06, + "loss": 2.3345, + "step": 733 + }, + { + "epoch": 0.03937768240343348, + "grad_norm": 0.390625, + "learning_rate": 1.3125894134477827e-06, + "loss": 2.3745, + "step": 734 + }, + { + "epoch": 0.039431330472103, + "grad_norm": 0.337890625, + "learning_rate": 1.3143776824034335e-06, + "loss": 2.4408, + "step": 735 + }, + { + "epoch": 0.039484978540772535, + "grad_norm": 0.400390625, + "learning_rate": 1.3161659513590846e-06, + "loss": 2.5597, + "step": 736 + }, + { + "epoch": 0.03953862660944206, + "grad_norm": 0.462890625, + "learning_rate": 1.3179542203147355e-06, + "loss": 2.6759, + "step": 737 + }, + { + "epoch": 0.03959227467811159, + "grad_norm": 0.439453125, + "learning_rate": 1.3197424892703864e-06, + "loss": 1.5004, + "step": 738 + }, + { + "epoch": 0.039645922746781116, + "grad_norm": 0.5078125, + "learning_rate": 1.3215307582260373e-06, + "loss": 1.9843, + "step": 739 + }, + { + "epoch": 0.03969957081545064, + "grad_norm": 0.69921875, + "learning_rate": 1.3233190271816881e-06, + "loss": 2.5284, + "step": 740 + }, + { + "epoch": 0.03975321888412017, + "grad_norm": 0.796875, + "learning_rate": 1.3251072961373392e-06, + "loss": 1.4086, + "step": 741 + }, + { + "epoch": 0.0398068669527897, + "grad_norm": 0.3046875, + "learning_rate": 1.32689556509299e-06, + "loss": 2.4115, + "step": 742 + }, + { + "epoch": 0.03986051502145923, + "grad_norm": 0.39453125, + "learning_rate": 1.328683834048641e-06, + "loss": 2.7345, + "step": 743 + }, + { + "epoch": 0.039914163090128754, + "grad_norm": 0.35546875, + "learning_rate": 1.3304721030042918e-06, + "loss": 2.2906, + "step": 744 + }, + { + "epoch": 0.039967811158798286, + "grad_norm": 0.328125, + "learning_rate": 1.3322603719599427e-06, + "loss": 2.2802, + "step": 745 + }, + { + "epoch": 0.04002145922746781, + "grad_norm": 0.291015625, + "learning_rate": 1.334048640915594e-06, + "loss": 2.1479, + "step": 746 + }, + { + "epoch": 0.04007510729613734, + "grad_norm": 0.390625, + "learning_rate": 1.335836909871245e-06, + "loss": 2.2092, + "step": 747 + }, + { + "epoch": 0.04012875536480687, + "grad_norm": 0.39453125, + "learning_rate": 1.3376251788268958e-06, + "loss": 2.6033, + "step": 748 + }, + { + "epoch": 0.04018240343347639, + "grad_norm": 0.38671875, + "learning_rate": 1.3394134477825466e-06, + "loss": 2.3872, + "step": 749 + }, + { + "epoch": 0.040236051502145924, + "grad_norm": 0.3125, + "learning_rate": 1.3412017167381975e-06, + "loss": 2.3537, + "step": 750 + }, + { + "epoch": 0.04028969957081545, + "grad_norm": 0.4296875, + "learning_rate": 1.3429899856938486e-06, + "loss": 2.5458, + "step": 751 + }, + { + "epoch": 0.04034334763948498, + "grad_norm": 0.408203125, + "learning_rate": 1.3447782546494995e-06, + "loss": 2.1961, + "step": 752 + }, + { + "epoch": 0.040396995708154505, + "grad_norm": 0.3671875, + "learning_rate": 1.3465665236051504e-06, + "loss": 2.4071, + "step": 753 + }, + { + "epoch": 0.04045064377682404, + "grad_norm": 0.376953125, + "learning_rate": 1.3483547925608012e-06, + "loss": 2.5349, + "step": 754 + }, + { + "epoch": 0.04050429184549356, + "grad_norm": 0.40234375, + "learning_rate": 1.3501430615164521e-06, + "loss": 2.155, + "step": 755 + }, + { + "epoch": 0.04055793991416309, + "grad_norm": 0.28125, + "learning_rate": 1.3519313304721032e-06, + "loss": 2.5522, + "step": 756 + }, + { + "epoch": 0.04061158798283262, + "grad_norm": 0.328125, + "learning_rate": 1.353719599427754e-06, + "loss": 2.3316, + "step": 757 + }, + { + "epoch": 0.04066523605150214, + "grad_norm": 0.462890625, + "learning_rate": 1.355507868383405e-06, + "loss": 2.5888, + "step": 758 + }, + { + "epoch": 0.040718884120171675, + "grad_norm": 0.349609375, + "learning_rate": 1.3572961373390558e-06, + "loss": 2.4959, + "step": 759 + }, + { + "epoch": 0.0407725321888412, + "grad_norm": 0.353515625, + "learning_rate": 1.3590844062947067e-06, + "loss": 2.259, + "step": 760 + }, + { + "epoch": 0.04082618025751073, + "grad_norm": 0.361328125, + "learning_rate": 1.3608726752503578e-06, + "loss": 2.2827, + "step": 761 + }, + { + "epoch": 0.040879828326180256, + "grad_norm": 1.3671875, + "learning_rate": 1.3626609442060087e-06, + "loss": 2.4127, + "step": 762 + }, + { + "epoch": 0.04093347639484979, + "grad_norm": 0.359375, + "learning_rate": 1.3644492131616595e-06, + "loss": 2.3973, + "step": 763 + }, + { + "epoch": 0.04098712446351931, + "grad_norm": 0.396484375, + "learning_rate": 1.3662374821173104e-06, + "loss": 2.5341, + "step": 764 + }, + { + "epoch": 0.04104077253218884, + "grad_norm": 0.77734375, + "learning_rate": 1.3680257510729613e-06, + "loss": 2.1937, + "step": 765 + }, + { + "epoch": 0.04109442060085837, + "grad_norm": 0.443359375, + "learning_rate": 1.3698140200286126e-06, + "loss": 2.3396, + "step": 766 + }, + { + "epoch": 0.041148068669527894, + "grad_norm": 0.365234375, + "learning_rate": 1.3716022889842635e-06, + "loss": 1.9297, + "step": 767 + }, + { + "epoch": 0.041201716738197426, + "grad_norm": 0.408203125, + "learning_rate": 1.3733905579399143e-06, + "loss": 2.4015, + "step": 768 + }, + { + "epoch": 0.04125536480686695, + "grad_norm": 0.40234375, + "learning_rate": 1.3751788268955652e-06, + "loss": 2.3264, + "step": 769 + }, + { + "epoch": 0.04130901287553648, + "grad_norm": 0.51953125, + "learning_rate": 1.376967095851216e-06, + "loss": 2.3259, + "step": 770 + }, + { + "epoch": 0.04136266094420601, + "grad_norm": 0.349609375, + "learning_rate": 1.3787553648068672e-06, + "loss": 2.6022, + "step": 771 + }, + { + "epoch": 0.04141630901287554, + "grad_norm": 0.3359375, + "learning_rate": 1.380543633762518e-06, + "loss": 2.4966, + "step": 772 + }, + { + "epoch": 0.041469957081545064, + "grad_norm": 0.54296875, + "learning_rate": 1.382331902718169e-06, + "loss": 2.3775, + "step": 773 + }, + { + "epoch": 0.04152360515021459, + "grad_norm": 0.3515625, + "learning_rate": 1.3841201716738198e-06, + "loss": 2.5992, + "step": 774 + }, + { + "epoch": 0.04157725321888412, + "grad_norm": 0.435546875, + "learning_rate": 1.3859084406294707e-06, + "loss": 2.688, + "step": 775 + }, + { + "epoch": 0.041630901287553645, + "grad_norm": 0.369140625, + "learning_rate": 1.3876967095851218e-06, + "loss": 2.5141, + "step": 776 + }, + { + "epoch": 0.04168454935622318, + "grad_norm": 0.32421875, + "learning_rate": 1.3894849785407727e-06, + "loss": 2.4136, + "step": 777 + }, + { + "epoch": 0.0417381974248927, + "grad_norm": 1.3359375, + "learning_rate": 1.3912732474964235e-06, + "loss": 2.3696, + "step": 778 + }, + { + "epoch": 0.041791845493562234, + "grad_norm": 0.419921875, + "learning_rate": 1.3930615164520744e-06, + "loss": 2.5719, + "step": 779 + }, + { + "epoch": 0.04184549356223176, + "grad_norm": 0.28515625, + "learning_rate": 1.3948497854077253e-06, + "loss": 2.5019, + "step": 780 + }, + { + "epoch": 0.04189914163090129, + "grad_norm": 0.388671875, + "learning_rate": 1.3966380543633764e-06, + "loss": 2.4958, + "step": 781 + }, + { + "epoch": 0.041952789699570815, + "grad_norm": 0.318359375, + "learning_rate": 1.3984263233190272e-06, + "loss": 2.5139, + "step": 782 + }, + { + "epoch": 0.04200643776824034, + "grad_norm": 0.490234375, + "learning_rate": 1.4002145922746781e-06, + "loss": 2.2762, + "step": 783 + }, + { + "epoch": 0.04206008583690987, + "grad_norm": 0.431640625, + "learning_rate": 1.402002861230329e-06, + "loss": 2.0563, + "step": 784 + }, + { + "epoch": 0.0421137339055794, + "grad_norm": 0.244140625, + "learning_rate": 1.4037911301859799e-06, + "loss": 2.9708, + "step": 785 + }, + { + "epoch": 0.04216738197424893, + "grad_norm": 0.375, + "learning_rate": 1.4055793991416312e-06, + "loss": 2.4161, + "step": 786 + }, + { + "epoch": 0.04222103004291845, + "grad_norm": 0.36328125, + "learning_rate": 1.407367668097282e-06, + "loss": 2.2735, + "step": 787 + }, + { + "epoch": 0.042274678111587985, + "grad_norm": 0.40625, + "learning_rate": 1.409155937052933e-06, + "loss": 2.6733, + "step": 788 + }, + { + "epoch": 0.04232832618025751, + "grad_norm": 0.5625, + "learning_rate": 1.4109442060085838e-06, + "loss": 2.3567, + "step": 789 + }, + { + "epoch": 0.04238197424892704, + "grad_norm": 0.447265625, + "learning_rate": 1.4127324749642347e-06, + "loss": 2.7518, + "step": 790 + }, + { + "epoch": 0.042435622317596566, + "grad_norm": 0.4453125, + "learning_rate": 1.4145207439198858e-06, + "loss": 2.4696, + "step": 791 + }, + { + "epoch": 0.04248927038626609, + "grad_norm": 0.62890625, + "learning_rate": 1.4163090128755366e-06, + "loss": 2.4709, + "step": 792 + }, + { + "epoch": 0.04254291845493562, + "grad_norm": 0.474609375, + "learning_rate": 1.4180972818311875e-06, + "loss": 2.2495, + "step": 793 + }, + { + "epoch": 0.04259656652360515, + "grad_norm": 0.423828125, + "learning_rate": 1.4198855507868384e-06, + "loss": 2.5038, + "step": 794 + }, + { + "epoch": 0.04265021459227468, + "grad_norm": 0.318359375, + "learning_rate": 1.4216738197424895e-06, + "loss": 2.377, + "step": 795 + }, + { + "epoch": 0.042703862660944204, + "grad_norm": 0.765625, + "learning_rate": 1.4234620886981404e-06, + "loss": 2.5334, + "step": 796 + }, + { + "epoch": 0.042757510729613736, + "grad_norm": 0.291015625, + "learning_rate": 1.4252503576537912e-06, + "loss": 2.7178, + "step": 797 + }, + { + "epoch": 0.04281115879828326, + "grad_norm": 0.41015625, + "learning_rate": 1.427038626609442e-06, + "loss": 2.5246, + "step": 798 + }, + { + "epoch": 0.04286480686695279, + "grad_norm": 0.35546875, + "learning_rate": 1.428826895565093e-06, + "loss": 2.4296, + "step": 799 + }, + { + "epoch": 0.04291845493562232, + "grad_norm": 0.3125, + "learning_rate": 1.430615164520744e-06, + "loss": 2.3586, + "step": 800 + }, + { + "epoch": 0.04297210300429184, + "grad_norm": 0.32421875, + "learning_rate": 1.432403433476395e-06, + "loss": 2.4828, + "step": 801 + }, + { + "epoch": 0.043025751072961374, + "grad_norm": 0.451171875, + "learning_rate": 1.4341917024320458e-06, + "loss": 2.53, + "step": 802 + }, + { + "epoch": 0.0430793991416309, + "grad_norm": 0.35546875, + "learning_rate": 1.4359799713876967e-06, + "loss": 2.502, + "step": 803 + }, + { + "epoch": 0.04313304721030043, + "grad_norm": 0.3203125, + "learning_rate": 1.4377682403433476e-06, + "loss": 2.3363, + "step": 804 + }, + { + "epoch": 0.043186695278969955, + "grad_norm": 0.431640625, + "learning_rate": 1.4395565092989989e-06, + "loss": 2.6009, + "step": 805 + }, + { + "epoch": 0.04324034334763949, + "grad_norm": 0.63671875, + "learning_rate": 1.4413447782546497e-06, + "loss": 2.6981, + "step": 806 + }, + { + "epoch": 0.04329399141630901, + "grad_norm": 0.64453125, + "learning_rate": 1.4431330472103006e-06, + "loss": 2.6242, + "step": 807 + }, + { + "epoch": 0.043347639484978544, + "grad_norm": 0.330078125, + "learning_rate": 1.4449213161659515e-06, + "loss": 2.4446, + "step": 808 + }, + { + "epoch": 0.04340128755364807, + "grad_norm": 0.33203125, + "learning_rate": 1.4467095851216024e-06, + "loss": 2.4934, + "step": 809 + }, + { + "epoch": 0.04345493562231759, + "grad_norm": 0.34375, + "learning_rate": 1.4484978540772535e-06, + "loss": 2.3758, + "step": 810 + }, + { + "epoch": 0.043508583690987125, + "grad_norm": 0.43359375, + "learning_rate": 1.4502861230329043e-06, + "loss": 2.493, + "step": 811 + }, + { + "epoch": 0.04356223175965665, + "grad_norm": 0.390625, + "learning_rate": 1.4520743919885552e-06, + "loss": 2.5109, + "step": 812 + }, + { + "epoch": 0.04361587982832618, + "grad_norm": 0.314453125, + "learning_rate": 1.453862660944206e-06, + "loss": 2.4121, + "step": 813 + }, + { + "epoch": 0.043669527896995707, + "grad_norm": 0.3671875, + "learning_rate": 1.455650929899857e-06, + "loss": 1.6285, + "step": 814 + }, + { + "epoch": 0.04372317596566524, + "grad_norm": 0.30859375, + "learning_rate": 1.457439198855508e-06, + "loss": 2.5189, + "step": 815 + }, + { + "epoch": 0.04377682403433476, + "grad_norm": 0.359375, + "learning_rate": 1.459227467811159e-06, + "loss": 2.416, + "step": 816 + }, + { + "epoch": 0.043830472103004295, + "grad_norm": 0.3671875, + "learning_rate": 1.4610157367668098e-06, + "loss": 2.4916, + "step": 817 + }, + { + "epoch": 0.04388412017167382, + "grad_norm": 0.361328125, + "learning_rate": 1.4628040057224607e-06, + "loss": 2.301, + "step": 818 + }, + { + "epoch": 0.043937768240343344, + "grad_norm": 0.47265625, + "learning_rate": 1.4645922746781116e-06, + "loss": 2.2703, + "step": 819 + }, + { + "epoch": 0.043991416309012876, + "grad_norm": 1.40625, + "learning_rate": 1.4663805436337629e-06, + "loss": 2.6941, + "step": 820 + }, + { + "epoch": 0.0440450643776824, + "grad_norm": 0.421875, + "learning_rate": 1.4681688125894137e-06, + "loss": 2.6363, + "step": 821 + }, + { + "epoch": 0.04409871244635193, + "grad_norm": 0.396484375, + "learning_rate": 1.4699570815450646e-06, + "loss": 2.6261, + "step": 822 + }, + { + "epoch": 0.04415236051502146, + "grad_norm": 0.640625, + "learning_rate": 1.4717453505007155e-06, + "loss": 2.3352, + "step": 823 + }, + { + "epoch": 0.04420600858369099, + "grad_norm": 0.3359375, + "learning_rate": 1.4735336194563661e-06, + "loss": 2.479, + "step": 824 + }, + { + "epoch": 0.044259656652360514, + "grad_norm": 0.35546875, + "learning_rate": 1.4753218884120174e-06, + "loss": 2.5648, + "step": 825 + }, + { + "epoch": 0.044313304721030046, + "grad_norm": 0.470703125, + "learning_rate": 1.4771101573676683e-06, + "loss": 2.411, + "step": 826 + }, + { + "epoch": 0.04436695278969957, + "grad_norm": 0.275390625, + "learning_rate": 1.4788984263233192e-06, + "loss": 2.8137, + "step": 827 + }, + { + "epoch": 0.044420600858369096, + "grad_norm": 0.482421875, + "learning_rate": 1.48068669527897e-06, + "loss": 2.2456, + "step": 828 + }, + { + "epoch": 0.04447424892703863, + "grad_norm": 0.400390625, + "learning_rate": 1.482474964234621e-06, + "loss": 2.4765, + "step": 829 + }, + { + "epoch": 0.04452789699570815, + "grad_norm": 0.447265625, + "learning_rate": 1.484263233190272e-06, + "loss": 2.3816, + "step": 830 + }, + { + "epoch": 0.044581545064377684, + "grad_norm": 0.32421875, + "learning_rate": 1.486051502145923e-06, + "loss": 2.3729, + "step": 831 + }, + { + "epoch": 0.04463519313304721, + "grad_norm": 0.3125, + "learning_rate": 1.4878397711015738e-06, + "loss": 2.3915, + "step": 832 + }, + { + "epoch": 0.04468884120171674, + "grad_norm": 0.400390625, + "learning_rate": 1.4896280400572247e-06, + "loss": 2.3064, + "step": 833 + }, + { + "epoch": 0.044742489270386265, + "grad_norm": 0.5703125, + "learning_rate": 1.4914163090128755e-06, + "loss": 2.6276, + "step": 834 + }, + { + "epoch": 0.0447961373390558, + "grad_norm": 0.5625, + "learning_rate": 1.4932045779685266e-06, + "loss": 2.6878, + "step": 835 + }, + { + "epoch": 0.04484978540772532, + "grad_norm": 0.470703125, + "learning_rate": 1.4949928469241775e-06, + "loss": 2.4363, + "step": 836 + }, + { + "epoch": 0.04490343347639485, + "grad_norm": 0.408203125, + "learning_rate": 1.4967811158798284e-06, + "loss": 2.3689, + "step": 837 + }, + { + "epoch": 0.04495708154506438, + "grad_norm": 0.451171875, + "learning_rate": 1.4985693848354793e-06, + "loss": 2.2237, + "step": 838 + }, + { + "epoch": 0.0450107296137339, + "grad_norm": 0.48046875, + "learning_rate": 1.5003576537911301e-06, + "loss": 2.4521, + "step": 839 + }, + { + "epoch": 0.045064377682403435, + "grad_norm": 0.76171875, + "learning_rate": 1.5021459227467814e-06, + "loss": 2.3089, + "step": 840 + }, + { + "epoch": 0.04511802575107296, + "grad_norm": 0.3671875, + "learning_rate": 1.5039341917024323e-06, + "loss": 2.4181, + "step": 841 + }, + { + "epoch": 0.04517167381974249, + "grad_norm": 0.375, + "learning_rate": 1.5057224606580832e-06, + "loss": 2.5944, + "step": 842 + }, + { + "epoch": 0.045225321888412016, + "grad_norm": 0.486328125, + "learning_rate": 1.507510729613734e-06, + "loss": 2.4615, + "step": 843 + }, + { + "epoch": 0.04527896995708155, + "grad_norm": 0.421875, + "learning_rate": 1.509298998569385e-06, + "loss": 2.2305, + "step": 844 + }, + { + "epoch": 0.04533261802575107, + "grad_norm": 0.3671875, + "learning_rate": 1.511087267525036e-06, + "loss": 2.54, + "step": 845 + }, + { + "epoch": 0.0453862660944206, + "grad_norm": 0.353515625, + "learning_rate": 1.512875536480687e-06, + "loss": 2.7056, + "step": 846 + }, + { + "epoch": 0.04543991416309013, + "grad_norm": 0.328125, + "learning_rate": 1.5146638054363378e-06, + "loss": 2.3568, + "step": 847 + }, + { + "epoch": 0.045493562231759654, + "grad_norm": 0.421875, + "learning_rate": 1.5164520743919886e-06, + "loss": 2.4975, + "step": 848 + }, + { + "epoch": 0.045547210300429186, + "grad_norm": 0.357421875, + "learning_rate": 1.5182403433476395e-06, + "loss": 2.5089, + "step": 849 + }, + { + "epoch": 0.04560085836909871, + "grad_norm": 0.5234375, + "learning_rate": 1.5200286123032906e-06, + "loss": 2.3455, + "step": 850 + }, + { + "epoch": 0.04565450643776824, + "grad_norm": 0.375, + "learning_rate": 1.5218168812589415e-06, + "loss": 2.4103, + "step": 851 + }, + { + "epoch": 0.04570815450643777, + "grad_norm": 0.427734375, + "learning_rate": 1.5236051502145924e-06, + "loss": 2.4878, + "step": 852 + }, + { + "epoch": 0.0457618025751073, + "grad_norm": 0.69921875, + "learning_rate": 1.5253934191702432e-06, + "loss": 1.5303, + "step": 853 + }, + { + "epoch": 0.045815450643776824, + "grad_norm": 0.486328125, + "learning_rate": 1.5271816881258941e-06, + "loss": 2.4561, + "step": 854 + }, + { + "epoch": 0.04586909871244635, + "grad_norm": 0.578125, + "learning_rate": 1.5289699570815452e-06, + "loss": 2.5645, + "step": 855 + }, + { + "epoch": 0.04592274678111588, + "grad_norm": 0.4453125, + "learning_rate": 1.530758226037196e-06, + "loss": 2.4688, + "step": 856 + }, + { + "epoch": 0.045976394849785406, + "grad_norm": 0.447265625, + "learning_rate": 1.532546494992847e-06, + "loss": 2.4702, + "step": 857 + }, + { + "epoch": 0.04603004291845494, + "grad_norm": 0.310546875, + "learning_rate": 1.5343347639484978e-06, + "loss": 2.3451, + "step": 858 + }, + { + "epoch": 0.04608369098712446, + "grad_norm": 0.314453125, + "learning_rate": 1.5361230329041491e-06, + "loss": 2.3625, + "step": 859 + }, + { + "epoch": 0.046137339055793994, + "grad_norm": 0.33203125, + "learning_rate": 1.5379113018598e-06, + "loss": 2.6092, + "step": 860 + }, + { + "epoch": 0.04619098712446352, + "grad_norm": 0.3203125, + "learning_rate": 1.5396995708154509e-06, + "loss": 2.4964, + "step": 861 + }, + { + "epoch": 0.04624463519313305, + "grad_norm": 0.36328125, + "learning_rate": 1.5414878397711018e-06, + "loss": 2.5576, + "step": 862 + }, + { + "epoch": 0.046298283261802575, + "grad_norm": 0.423828125, + "learning_rate": 1.5432761087267526e-06, + "loss": 2.1949, + "step": 863 + }, + { + "epoch": 0.0463519313304721, + "grad_norm": 0.349609375, + "learning_rate": 1.5450643776824037e-06, + "loss": 2.4155, + "step": 864 + }, + { + "epoch": 0.04640557939914163, + "grad_norm": 0.4296875, + "learning_rate": 1.5468526466380546e-06, + "loss": 2.5361, + "step": 865 + }, + { + "epoch": 0.04645922746781116, + "grad_norm": 0.2890625, + "learning_rate": 1.5486409155937055e-06, + "loss": 2.3784, + "step": 866 + }, + { + "epoch": 0.04651287553648069, + "grad_norm": 0.333984375, + "learning_rate": 1.5504291845493563e-06, + "loss": 2.4383, + "step": 867 + }, + { + "epoch": 0.04656652360515021, + "grad_norm": 0.322265625, + "learning_rate": 1.5522174535050072e-06, + "loss": 2.2894, + "step": 868 + }, + { + "epoch": 0.046620171673819745, + "grad_norm": 0.44921875, + "learning_rate": 1.5540057224606583e-06, + "loss": 2.534, + "step": 869 + }, + { + "epoch": 0.04667381974248927, + "grad_norm": 0.359375, + "learning_rate": 1.5557939914163092e-06, + "loss": 2.1384, + "step": 870 + }, + { + "epoch": 0.0467274678111588, + "grad_norm": 0.345703125, + "learning_rate": 1.55758226037196e-06, + "loss": 2.3173, + "step": 871 + }, + { + "epoch": 0.046781115879828326, + "grad_norm": 0.296875, + "learning_rate": 1.559370529327611e-06, + "loss": 2.4922, + "step": 872 + }, + { + "epoch": 0.04683476394849785, + "grad_norm": 0.419921875, + "learning_rate": 1.5611587982832618e-06, + "loss": 2.47, + "step": 873 + }, + { + "epoch": 0.04688841201716738, + "grad_norm": 0.42578125, + "learning_rate": 1.562947067238913e-06, + "loss": 2.1809, + "step": 874 + }, + { + "epoch": 0.04694206008583691, + "grad_norm": 0.365234375, + "learning_rate": 1.5647353361945638e-06, + "loss": 2.5304, + "step": 875 + }, + { + "epoch": 0.04699570815450644, + "grad_norm": 0.345703125, + "learning_rate": 1.5665236051502147e-06, + "loss": 2.4909, + "step": 876 + }, + { + "epoch": 0.047049356223175964, + "grad_norm": 0.61328125, + "learning_rate": 1.5683118741058655e-06, + "loss": 2.4724, + "step": 877 + }, + { + "epoch": 0.047103004291845496, + "grad_norm": 0.38671875, + "learning_rate": 1.5701001430615164e-06, + "loss": 2.4594, + "step": 878 + }, + { + "epoch": 0.04715665236051502, + "grad_norm": 0.39453125, + "learning_rate": 1.5718884120171677e-06, + "loss": 2.3539, + "step": 879 + }, + { + "epoch": 0.04721030042918455, + "grad_norm": 0.439453125, + "learning_rate": 1.5736766809728186e-06, + "loss": 2.1959, + "step": 880 + }, + { + "epoch": 0.04726394849785408, + "grad_norm": 0.2734375, + "learning_rate": 1.5754649499284695e-06, + "loss": 2.2715, + "step": 881 + }, + { + "epoch": 0.0473175965665236, + "grad_norm": 0.84375, + "learning_rate": 1.5772532188841203e-06, + "loss": 2.4584, + "step": 882 + }, + { + "epoch": 0.047371244635193134, + "grad_norm": 0.3984375, + "learning_rate": 1.5790414878397712e-06, + "loss": 2.4965, + "step": 883 + }, + { + "epoch": 0.04742489270386266, + "grad_norm": 0.447265625, + "learning_rate": 1.5808297567954223e-06, + "loss": 2.4237, + "step": 884 + }, + { + "epoch": 0.04747854077253219, + "grad_norm": 0.482421875, + "learning_rate": 1.5826180257510732e-06, + "loss": 2.5613, + "step": 885 + }, + { + "epoch": 0.047532188841201715, + "grad_norm": 0.419921875, + "learning_rate": 1.584406294706724e-06, + "loss": 2.383, + "step": 886 + }, + { + "epoch": 0.04758583690987125, + "grad_norm": 0.3671875, + "learning_rate": 1.586194563662375e-06, + "loss": 2.474, + "step": 887 + }, + { + "epoch": 0.04763948497854077, + "grad_norm": 0.33984375, + "learning_rate": 1.5879828326180258e-06, + "loss": 2.6701, + "step": 888 + }, + { + "epoch": 0.047693133047210304, + "grad_norm": 0.7265625, + "learning_rate": 1.5897711015736769e-06, + "loss": 2.6443, + "step": 889 + }, + { + "epoch": 0.04774678111587983, + "grad_norm": 0.470703125, + "learning_rate": 1.5915593705293278e-06, + "loss": 2.6659, + "step": 890 + }, + { + "epoch": 0.04780042918454935, + "grad_norm": 0.310546875, + "learning_rate": 1.5933476394849786e-06, + "loss": 2.3637, + "step": 891 + }, + { + "epoch": 0.047854077253218885, + "grad_norm": 0.515625, + "learning_rate": 1.5951359084406295e-06, + "loss": 2.5664, + "step": 892 + }, + { + "epoch": 0.04790772532188841, + "grad_norm": 0.26953125, + "learning_rate": 1.5969241773962804e-06, + "loss": 2.533, + "step": 893 + }, + { + "epoch": 0.04796137339055794, + "grad_norm": 0.357421875, + "learning_rate": 1.5987124463519315e-06, + "loss": 2.4087, + "step": 894 + }, + { + "epoch": 0.04801502145922747, + "grad_norm": 0.326171875, + "learning_rate": 1.6005007153075824e-06, + "loss": 2.5388, + "step": 895 + }, + { + "epoch": 0.048068669527897, + "grad_norm": 0.302734375, + "learning_rate": 1.6022889842632332e-06, + "loss": 2.4701, + "step": 896 + }, + { + "epoch": 0.04812231759656652, + "grad_norm": 0.376953125, + "learning_rate": 1.604077253218884e-06, + "loss": 2.5579, + "step": 897 + }, + { + "epoch": 0.048175965665236055, + "grad_norm": 0.25, + "learning_rate": 1.605865522174535e-06, + "loss": 2.3166, + "step": 898 + }, + { + "epoch": 0.04822961373390558, + "grad_norm": 0.58203125, + "learning_rate": 1.6076537911301863e-06, + "loss": 2.5638, + "step": 899 + }, + { + "epoch": 0.048283261802575105, + "grad_norm": 0.26953125, + "learning_rate": 1.6094420600858372e-06, + "loss": 2.3247, + "step": 900 + }, + { + "epoch": 0.048336909871244636, + "grad_norm": 0.443359375, + "learning_rate": 1.611230329041488e-06, + "loss": 2.5444, + "step": 901 + }, + { + "epoch": 0.04839055793991416, + "grad_norm": 0.41015625, + "learning_rate": 1.613018597997139e-06, + "loss": 2.4365, + "step": 902 + }, + { + "epoch": 0.04844420600858369, + "grad_norm": 0.298828125, + "learning_rate": 1.6148068669527898e-06, + "loss": 2.3902, + "step": 903 + }, + { + "epoch": 0.04849785407725322, + "grad_norm": 0.37109375, + "learning_rate": 1.6165951359084409e-06, + "loss": 2.3938, + "step": 904 + }, + { + "epoch": 0.04855150214592275, + "grad_norm": 0.396484375, + "learning_rate": 1.6183834048640917e-06, + "loss": 2.383, + "step": 905 + }, + { + "epoch": 0.048605150214592274, + "grad_norm": 0.306640625, + "learning_rate": 1.6201716738197426e-06, + "loss": 2.2779, + "step": 906 + }, + { + "epoch": 0.048658798283261806, + "grad_norm": 0.357421875, + "learning_rate": 1.6219599427753935e-06, + "loss": 2.3873, + "step": 907 + }, + { + "epoch": 0.04871244635193133, + "grad_norm": 0.302734375, + "learning_rate": 1.6237482117310444e-06, + "loss": 2.5644, + "step": 908 + }, + { + "epoch": 0.048766094420600856, + "grad_norm": 0.451171875, + "learning_rate": 1.6255364806866955e-06, + "loss": 2.3583, + "step": 909 + }, + { + "epoch": 0.04881974248927039, + "grad_norm": 0.55078125, + "learning_rate": 1.6273247496423463e-06, + "loss": 2.094, + "step": 910 + }, + { + "epoch": 0.04887339055793991, + "grad_norm": 0.337890625, + "learning_rate": 1.6291130185979972e-06, + "loss": 2.0487, + "step": 911 + }, + { + "epoch": 0.048927038626609444, + "grad_norm": 0.28515625, + "learning_rate": 1.630901287553648e-06, + "loss": 2.4457, + "step": 912 + }, + { + "epoch": 0.04898068669527897, + "grad_norm": 0.51171875, + "learning_rate": 1.632689556509299e-06, + "loss": 2.4113, + "step": 913 + }, + { + "epoch": 0.0490343347639485, + "grad_norm": 0.65234375, + "learning_rate": 1.63447782546495e-06, + "loss": 2.4289, + "step": 914 + }, + { + "epoch": 0.049087982832618025, + "grad_norm": 1.6640625, + "learning_rate": 1.636266094420601e-06, + "loss": 2.414, + "step": 915 + }, + { + "epoch": 0.04914163090128755, + "grad_norm": 0.333984375, + "learning_rate": 1.6380543633762518e-06, + "loss": 2.4622, + "step": 916 + }, + { + "epoch": 0.04919527896995708, + "grad_norm": 0.369140625, + "learning_rate": 1.6398426323319027e-06, + "loss": 2.5566, + "step": 917 + }, + { + "epoch": 0.04924892703862661, + "grad_norm": 0.361328125, + "learning_rate": 1.6416309012875536e-06, + "loss": 2.6255, + "step": 918 + }, + { + "epoch": 0.04930257510729614, + "grad_norm": 0.341796875, + "learning_rate": 1.6434191702432049e-06, + "loss": 2.3611, + "step": 919 + }, + { + "epoch": 0.04935622317596566, + "grad_norm": 0.392578125, + "learning_rate": 1.6452074391988557e-06, + "loss": 2.1584, + "step": 920 + }, + { + "epoch": 0.049409871244635195, + "grad_norm": 0.470703125, + "learning_rate": 1.6469957081545066e-06, + "loss": 2.3039, + "step": 921 + }, + { + "epoch": 0.04946351931330472, + "grad_norm": 0.404296875, + "learning_rate": 1.6487839771101575e-06, + "loss": 2.0669, + "step": 922 + }, + { + "epoch": 0.04951716738197425, + "grad_norm": 0.279296875, + "learning_rate": 1.6505722460658086e-06, + "loss": 2.3928, + "step": 923 + }, + { + "epoch": 0.049570815450643776, + "grad_norm": 0.43359375, + "learning_rate": 1.6523605150214594e-06, + "loss": 2.4045, + "step": 924 + }, + { + "epoch": 0.0496244635193133, + "grad_norm": 0.392578125, + "learning_rate": 1.6541487839771103e-06, + "loss": 1.9758, + "step": 925 + }, + { + "epoch": 0.04967811158798283, + "grad_norm": 0.33984375, + "learning_rate": 1.6559370529327612e-06, + "loss": 2.2327, + "step": 926 + }, + { + "epoch": 0.04973175965665236, + "grad_norm": 0.37890625, + "learning_rate": 1.657725321888412e-06, + "loss": 2.3561, + "step": 927 + }, + { + "epoch": 0.04978540772532189, + "grad_norm": 0.33203125, + "learning_rate": 1.6595135908440632e-06, + "loss": 2.4657, + "step": 928 + }, + { + "epoch": 0.049839055793991414, + "grad_norm": 0.349609375, + "learning_rate": 1.661301859799714e-06, + "loss": 2.3788, + "step": 929 + }, + { + "epoch": 0.049892703862660946, + "grad_norm": 0.41796875, + "learning_rate": 1.663090128755365e-06, + "loss": 2.604, + "step": 930 + }, + { + "epoch": 0.04994635193133047, + "grad_norm": 0.6640625, + "learning_rate": 1.6648783977110158e-06, + "loss": 2.4638, + "step": 931 + }, + { + "epoch": 0.05, + "grad_norm": 0.453125, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.456, + "step": 932 + }, + { + "epoch": 0.05005364806866953, + "grad_norm": 0.57421875, + "learning_rate": 1.668454935622318e-06, + "loss": 2.3991, + "step": 933 + }, + { + "epoch": 0.05010729613733905, + "grad_norm": 0.4140625, + "learning_rate": 1.6702432045779688e-06, + "loss": 1.7884, + "step": 934 + }, + { + "epoch": 0.050160944206008584, + "grad_norm": 0.88671875, + "learning_rate": 1.6720314735336197e-06, + "loss": 2.6017, + "step": 935 + }, + { + "epoch": 0.05021459227467811, + "grad_norm": 0.322265625, + "learning_rate": 1.6738197424892704e-06, + "loss": 2.4886, + "step": 936 + }, + { + "epoch": 0.05026824034334764, + "grad_norm": 0.484375, + "learning_rate": 1.6756080114449213e-06, + "loss": 2.6471, + "step": 937 + }, + { + "epoch": 0.050321888412017166, + "grad_norm": 0.80859375, + "learning_rate": 1.6773962804005726e-06, + "loss": 2.4132, + "step": 938 + }, + { + "epoch": 0.0503755364806867, + "grad_norm": 0.51953125, + "learning_rate": 1.6791845493562234e-06, + "loss": 1.9285, + "step": 939 + }, + { + "epoch": 0.05042918454935622, + "grad_norm": 0.376953125, + "learning_rate": 1.6809728183118743e-06, + "loss": 2.2931, + "step": 940 + }, + { + "epoch": 0.050482832618025754, + "grad_norm": 0.58203125, + "learning_rate": 1.6827610872675252e-06, + "loss": 2.5463, + "step": 941 + }, + { + "epoch": 0.05053648068669528, + "grad_norm": 0.40234375, + "learning_rate": 1.684549356223176e-06, + "loss": 2.621, + "step": 942 + }, + { + "epoch": 0.050590128755364804, + "grad_norm": 0.28125, + "learning_rate": 1.6863376251788271e-06, + "loss": 2.2759, + "step": 943 + }, + { + "epoch": 0.050643776824034335, + "grad_norm": 0.380859375, + "learning_rate": 1.688125894134478e-06, + "loss": 2.5368, + "step": 944 + }, + { + "epoch": 0.05069742489270386, + "grad_norm": 0.37109375, + "learning_rate": 1.689914163090129e-06, + "loss": 2.4487, + "step": 945 + }, + { + "epoch": 0.05075107296137339, + "grad_norm": 0.296875, + "learning_rate": 1.6917024320457798e-06, + "loss": 2.407, + "step": 946 + }, + { + "epoch": 0.05080472103004292, + "grad_norm": 0.546875, + "learning_rate": 1.6934907010014306e-06, + "loss": 2.1775, + "step": 947 + }, + { + "epoch": 0.05085836909871245, + "grad_norm": 0.578125, + "learning_rate": 1.6952789699570817e-06, + "loss": 2.2645, + "step": 948 + }, + { + "epoch": 0.05091201716738197, + "grad_norm": 0.2734375, + "learning_rate": 1.6970672389127326e-06, + "loss": 2.3912, + "step": 949 + }, + { + "epoch": 0.050965665236051505, + "grad_norm": 0.3359375, + "learning_rate": 1.6988555078683835e-06, + "loss": 2.4095, + "step": 950 + }, + { + "epoch": 0.05101931330472103, + "grad_norm": 0.41015625, + "learning_rate": 1.7006437768240344e-06, + "loss": 2.4562, + "step": 951 + }, + { + "epoch": 0.051072961373390555, + "grad_norm": 0.6015625, + "learning_rate": 1.7024320457796852e-06, + "loss": 2.4764, + "step": 952 + }, + { + "epoch": 0.051126609442060086, + "grad_norm": 0.54296875, + "learning_rate": 1.7042203147353365e-06, + "loss": 2.3143, + "step": 953 + }, + { + "epoch": 0.05118025751072961, + "grad_norm": 0.376953125, + "learning_rate": 1.7060085836909874e-06, + "loss": 2.4838, + "step": 954 + }, + { + "epoch": 0.05123390557939914, + "grad_norm": 0.86328125, + "learning_rate": 1.7077968526466383e-06, + "loss": 2.3901, + "step": 955 + }, + { + "epoch": 0.05128755364806867, + "grad_norm": 0.23828125, + "learning_rate": 1.7095851216022892e-06, + "loss": 2.4918, + "step": 956 + }, + { + "epoch": 0.0513412017167382, + "grad_norm": 0.26953125, + "learning_rate": 1.71137339055794e-06, + "loss": 2.4895, + "step": 957 + }, + { + "epoch": 0.051394849785407724, + "grad_norm": 0.54296875, + "learning_rate": 1.7131616595135911e-06, + "loss": 2.5146, + "step": 958 + }, + { + "epoch": 0.051448497854077256, + "grad_norm": 0.369140625, + "learning_rate": 1.714949928469242e-06, + "loss": 2.472, + "step": 959 + }, + { + "epoch": 0.05150214592274678, + "grad_norm": 0.48046875, + "learning_rate": 1.7167381974248929e-06, + "loss": 2.4624, + "step": 960 + }, + { + "epoch": 0.051555793991416306, + "grad_norm": 0.38671875, + "learning_rate": 1.7185264663805438e-06, + "loss": 2.4335, + "step": 961 + }, + { + "epoch": 0.05160944206008584, + "grad_norm": 0.3125, + "learning_rate": 1.7203147353361946e-06, + "loss": 2.059, + "step": 962 + }, + { + "epoch": 0.05166309012875536, + "grad_norm": 0.271484375, + "learning_rate": 1.7221030042918457e-06, + "loss": 2.4688, + "step": 963 + }, + { + "epoch": 0.051716738197424894, + "grad_norm": 0.255859375, + "learning_rate": 1.7238912732474966e-06, + "loss": 2.1237, + "step": 964 + }, + { + "epoch": 0.05177038626609442, + "grad_norm": 0.27734375, + "learning_rate": 1.7256795422031475e-06, + "loss": 2.3123, + "step": 965 + }, + { + "epoch": 0.05182403433476395, + "grad_norm": 0.32421875, + "learning_rate": 1.7274678111587983e-06, + "loss": 2.1579, + "step": 966 + }, + { + "epoch": 0.051877682403433475, + "grad_norm": 0.3125, + "learning_rate": 1.7292560801144492e-06, + "loss": 2.2758, + "step": 967 + }, + { + "epoch": 0.05193133047210301, + "grad_norm": 0.419921875, + "learning_rate": 1.7310443490701003e-06, + "loss": 2.4724, + "step": 968 + }, + { + "epoch": 0.05198497854077253, + "grad_norm": 0.443359375, + "learning_rate": 1.7328326180257512e-06, + "loss": 2.5202, + "step": 969 + }, + { + "epoch": 0.05203862660944206, + "grad_norm": 0.40234375, + "learning_rate": 1.734620886981402e-06, + "loss": 1.9985, + "step": 970 + }, + { + "epoch": 0.05209227467811159, + "grad_norm": 0.3046875, + "learning_rate": 1.736409155937053e-06, + "loss": 2.7304, + "step": 971 + }, + { + "epoch": 0.05214592274678111, + "grad_norm": 0.287109375, + "learning_rate": 1.7381974248927038e-06, + "loss": 2.0823, + "step": 972 + }, + { + "epoch": 0.052199570815450645, + "grad_norm": 0.33203125, + "learning_rate": 1.7399856938483551e-06, + "loss": 2.6022, + "step": 973 + }, + { + "epoch": 0.05225321888412017, + "grad_norm": 0.34765625, + "learning_rate": 1.741773962804006e-06, + "loss": 2.1868, + "step": 974 + }, + { + "epoch": 0.0523068669527897, + "grad_norm": 0.50390625, + "learning_rate": 1.7435622317596569e-06, + "loss": 2.7772, + "step": 975 + }, + { + "epoch": 0.05236051502145923, + "grad_norm": 2.59375, + "learning_rate": 1.7453505007153077e-06, + "loss": 2.3234, + "step": 976 + }, + { + "epoch": 0.05241416309012876, + "grad_norm": 0.365234375, + "learning_rate": 1.7471387696709586e-06, + "loss": 2.3316, + "step": 977 + }, + { + "epoch": 0.05246781115879828, + "grad_norm": 0.306640625, + "learning_rate": 1.7489270386266097e-06, + "loss": 2.5826, + "step": 978 + }, + { + "epoch": 0.05252145922746781, + "grad_norm": 0.310546875, + "learning_rate": 1.7507153075822606e-06, + "loss": 2.4483, + "step": 979 + }, + { + "epoch": 0.05257510729613734, + "grad_norm": 0.369140625, + "learning_rate": 1.7525035765379115e-06, + "loss": 2.4893, + "step": 980 + }, + { + "epoch": 0.052628755364806865, + "grad_norm": 0.33984375, + "learning_rate": 1.7542918454935623e-06, + "loss": 2.5281, + "step": 981 + }, + { + "epoch": 0.052682403433476396, + "grad_norm": 0.310546875, + "learning_rate": 1.7560801144492132e-06, + "loss": 2.3841, + "step": 982 + }, + { + "epoch": 0.05273605150214592, + "grad_norm": 0.384765625, + "learning_rate": 1.7578683834048643e-06, + "loss": 2.3863, + "step": 983 + }, + { + "epoch": 0.05278969957081545, + "grad_norm": 0.28515625, + "learning_rate": 1.7596566523605152e-06, + "loss": 2.4567, + "step": 984 + }, + { + "epoch": 0.05284334763948498, + "grad_norm": 0.484375, + "learning_rate": 1.761444921316166e-06, + "loss": 2.6365, + "step": 985 + }, + { + "epoch": 0.05289699570815451, + "grad_norm": 0.474609375, + "learning_rate": 1.763233190271817e-06, + "loss": 2.6337, + "step": 986 + }, + { + "epoch": 0.052950643776824034, + "grad_norm": 0.396484375, + "learning_rate": 1.765021459227468e-06, + "loss": 2.5241, + "step": 987 + }, + { + "epoch": 0.05300429184549356, + "grad_norm": 0.455078125, + "learning_rate": 1.7668097281831189e-06, + "loss": 2.0958, + "step": 988 + }, + { + "epoch": 0.05305793991416309, + "grad_norm": 0.333984375, + "learning_rate": 1.7685979971387698e-06, + "loss": 2.6487, + "step": 989 + }, + { + "epoch": 0.053111587982832616, + "grad_norm": 0.77734375, + "learning_rate": 1.7703862660944206e-06, + "loss": 2.4158, + "step": 990 + }, + { + "epoch": 0.05316523605150215, + "grad_norm": 0.396484375, + "learning_rate": 1.7721745350500715e-06, + "loss": 2.5518, + "step": 991 + }, + { + "epoch": 0.05321888412017167, + "grad_norm": 0.3515625, + "learning_rate": 1.7739628040057228e-06, + "loss": 2.4129, + "step": 992 + }, + { + "epoch": 0.053272532188841204, + "grad_norm": 0.43359375, + "learning_rate": 1.7757510729613737e-06, + "loss": 2.3807, + "step": 993 + }, + { + "epoch": 0.05332618025751073, + "grad_norm": 0.490234375, + "learning_rate": 1.7775393419170246e-06, + "loss": 2.1954, + "step": 994 + }, + { + "epoch": 0.05337982832618026, + "grad_norm": 0.369140625, + "learning_rate": 1.7793276108726754e-06, + "loss": 2.3352, + "step": 995 + }, + { + "epoch": 0.053433476394849785, + "grad_norm": 0.337890625, + "learning_rate": 1.7811158798283263e-06, + "loss": 2.049, + "step": 996 + }, + { + "epoch": 0.05348712446351931, + "grad_norm": 0.26171875, + "learning_rate": 1.7829041487839774e-06, + "loss": 2.2232, + "step": 997 + }, + { + "epoch": 0.05354077253218884, + "grad_norm": 0.55859375, + "learning_rate": 1.7846924177396283e-06, + "loss": 2.9451, + "step": 998 + }, + { + "epoch": 0.05359442060085837, + "grad_norm": 0.451171875, + "learning_rate": 1.7864806866952792e-06, + "loss": 2.3928, + "step": 999 + }, + { + "epoch": 0.0536480686695279, + "grad_norm": 0.384765625, + "learning_rate": 1.78826895565093e-06, + "loss": 2.4644, + "step": 1000 + }, + { + "epoch": 0.05370171673819742, + "grad_norm": 0.287109375, + "learning_rate": 1.790057224606581e-06, + "loss": 2.571, + "step": 1001 + }, + { + "epoch": 0.053755364806866955, + "grad_norm": 0.28515625, + "learning_rate": 1.791845493562232e-06, + "loss": 2.3715, + "step": 1002 + }, + { + "epoch": 0.05380901287553648, + "grad_norm": 0.337890625, + "learning_rate": 1.7936337625178829e-06, + "loss": 2.13, + "step": 1003 + }, + { + "epoch": 0.05386266094420601, + "grad_norm": 0.59375, + "learning_rate": 1.7954220314735337e-06, + "loss": 2.2157, + "step": 1004 + }, + { + "epoch": 0.053916309012875537, + "grad_norm": 0.50390625, + "learning_rate": 1.7972103004291846e-06, + "loss": 2.311, + "step": 1005 + }, + { + "epoch": 0.05396995708154506, + "grad_norm": 0.28515625, + "learning_rate": 1.7989985693848355e-06, + "loss": 2.456, + "step": 1006 + }, + { + "epoch": 0.05402360515021459, + "grad_norm": 1.3046875, + "learning_rate": 1.8007868383404866e-06, + "loss": 2.3761, + "step": 1007 + }, + { + "epoch": 0.05407725321888412, + "grad_norm": 0.333984375, + "learning_rate": 1.8025751072961375e-06, + "loss": 1.6944, + "step": 1008 + }, + { + "epoch": 0.05413090128755365, + "grad_norm": 2.09375, + "learning_rate": 1.8043633762517883e-06, + "loss": 2.435, + "step": 1009 + }, + { + "epoch": 0.054184549356223174, + "grad_norm": 0.984375, + "learning_rate": 1.8061516452074392e-06, + "loss": 2.4782, + "step": 1010 + }, + { + "epoch": 0.054238197424892706, + "grad_norm": 0.451171875, + "learning_rate": 1.80793991416309e-06, + "loss": 2.5311, + "step": 1011 + }, + { + "epoch": 0.05429184549356223, + "grad_norm": 0.333984375, + "learning_rate": 1.8097281831187414e-06, + "loss": 2.3233, + "step": 1012 + }, + { + "epoch": 0.05434549356223176, + "grad_norm": 0.80859375, + "learning_rate": 1.8115164520743923e-06, + "loss": 2.5227, + "step": 1013 + }, + { + "epoch": 0.05439914163090129, + "grad_norm": 0.33984375, + "learning_rate": 1.8133047210300431e-06, + "loss": 2.1428, + "step": 1014 + }, + { + "epoch": 0.05445278969957081, + "grad_norm": 0.6328125, + "learning_rate": 1.815092989985694e-06, + "loss": 2.563, + "step": 1015 + }, + { + "epoch": 0.054506437768240344, + "grad_norm": 0.306640625, + "learning_rate": 1.8168812589413449e-06, + "loss": 2.4096, + "step": 1016 + }, + { + "epoch": 0.05456008583690987, + "grad_norm": 0.353515625, + "learning_rate": 1.818669527896996e-06, + "loss": 2.5096, + "step": 1017 + }, + { + "epoch": 0.0546137339055794, + "grad_norm": 0.3671875, + "learning_rate": 1.8204577968526469e-06, + "loss": 2.3854, + "step": 1018 + }, + { + "epoch": 0.054667381974248926, + "grad_norm": 0.49609375, + "learning_rate": 1.8222460658082977e-06, + "loss": 2.4078, + "step": 1019 + }, + { + "epoch": 0.05472103004291846, + "grad_norm": 0.306640625, + "learning_rate": 1.8240343347639486e-06, + "loss": 2.3344, + "step": 1020 + }, + { + "epoch": 0.05477467811158798, + "grad_norm": 0.451171875, + "learning_rate": 1.8258226037195995e-06, + "loss": 2.1087, + "step": 1021 + }, + { + "epoch": 0.054828326180257514, + "grad_norm": 0.318359375, + "learning_rate": 1.8276108726752506e-06, + "loss": 2.0824, + "step": 1022 + }, + { + "epoch": 0.05488197424892704, + "grad_norm": 0.318359375, + "learning_rate": 1.8293991416309014e-06, + "loss": 2.3063, + "step": 1023 + }, + { + "epoch": 0.054935622317596564, + "grad_norm": 0.53125, + "learning_rate": 1.8311874105865523e-06, + "loss": 2.5562, + "step": 1024 + }, + { + "epoch": 0.054989270386266095, + "grad_norm": 0.59375, + "learning_rate": 1.8329756795422032e-06, + "loss": 2.5037, + "step": 1025 + }, + { + "epoch": 0.05504291845493562, + "grad_norm": 0.3046875, + "learning_rate": 1.834763948497854e-06, + "loss": 2.2668, + "step": 1026 + }, + { + "epoch": 0.05509656652360515, + "grad_norm": 0.66015625, + "learning_rate": 1.8365522174535052e-06, + "loss": 2.4268, + "step": 1027 + }, + { + "epoch": 0.05515021459227468, + "grad_norm": 0.30859375, + "learning_rate": 1.838340486409156e-06, + "loss": 2.1841, + "step": 1028 + }, + { + "epoch": 0.05520386266094421, + "grad_norm": 0.37109375, + "learning_rate": 1.840128755364807e-06, + "loss": 2.277, + "step": 1029 + }, + { + "epoch": 0.05525751072961373, + "grad_norm": 0.318359375, + "learning_rate": 1.8419170243204578e-06, + "loss": 2.4837, + "step": 1030 + }, + { + "epoch": 0.055311158798283265, + "grad_norm": 0.36328125, + "learning_rate": 1.8437052932761087e-06, + "loss": 2.485, + "step": 1031 + }, + { + "epoch": 0.05536480686695279, + "grad_norm": 0.31640625, + "learning_rate": 1.84549356223176e-06, + "loss": 2.4371, + "step": 1032 + }, + { + "epoch": 0.055418454935622315, + "grad_norm": 0.341796875, + "learning_rate": 1.8472818311874108e-06, + "loss": 2.402, + "step": 1033 + }, + { + "epoch": 0.055472103004291846, + "grad_norm": 0.330078125, + "learning_rate": 1.8490701001430617e-06, + "loss": 2.1684, + "step": 1034 + }, + { + "epoch": 0.05552575107296137, + "grad_norm": 0.33984375, + "learning_rate": 1.8508583690987126e-06, + "loss": 2.5576, + "step": 1035 + }, + { + "epoch": 0.0555793991416309, + "grad_norm": 0.35546875, + "learning_rate": 1.8526466380543635e-06, + "loss": 2.4097, + "step": 1036 + }, + { + "epoch": 0.05563304721030043, + "grad_norm": 0.3671875, + "learning_rate": 1.8544349070100146e-06, + "loss": 2.4593, + "step": 1037 + }, + { + "epoch": 0.05568669527896996, + "grad_norm": 0.267578125, + "learning_rate": 1.8562231759656654e-06, + "loss": 2.4015, + "step": 1038 + }, + { + "epoch": 0.055740343347639484, + "grad_norm": 0.30859375, + "learning_rate": 1.8580114449213163e-06, + "loss": 2.549, + "step": 1039 + }, + { + "epoch": 0.055793991416309016, + "grad_norm": 0.435546875, + "learning_rate": 1.8597997138769672e-06, + "loss": 2.3902, + "step": 1040 + }, + { + "epoch": 0.05584763948497854, + "grad_norm": 0.474609375, + "learning_rate": 1.861587982832618e-06, + "loss": 2.2929, + "step": 1041 + }, + { + "epoch": 0.055901287553648066, + "grad_norm": 0.45703125, + "learning_rate": 1.8633762517882691e-06, + "loss": 2.2991, + "step": 1042 + }, + { + "epoch": 0.0559549356223176, + "grad_norm": 0.275390625, + "learning_rate": 1.86516452074392e-06, + "loss": 2.4877, + "step": 1043 + }, + { + "epoch": 0.05600858369098712, + "grad_norm": 0.35546875, + "learning_rate": 1.866952789699571e-06, + "loss": 2.427, + "step": 1044 + }, + { + "epoch": 0.056062231759656654, + "grad_norm": 0.248046875, + "learning_rate": 1.8687410586552218e-06, + "loss": 2.2265, + "step": 1045 + }, + { + "epoch": 0.05611587982832618, + "grad_norm": 0.66796875, + "learning_rate": 1.8705293276108726e-06, + "loss": 2.4849, + "step": 1046 + }, + { + "epoch": 0.05616952789699571, + "grad_norm": 0.333984375, + "learning_rate": 1.872317596566524e-06, + "loss": 2.4121, + "step": 1047 + }, + { + "epoch": 0.056223175965665236, + "grad_norm": 3.375, + "learning_rate": 1.8741058655221748e-06, + "loss": 2.4454, + "step": 1048 + }, + { + "epoch": 0.05627682403433477, + "grad_norm": 0.61328125, + "learning_rate": 1.8758941344778255e-06, + "loss": 2.4874, + "step": 1049 + }, + { + "epoch": 0.05633047210300429, + "grad_norm": 0.310546875, + "learning_rate": 1.8776824034334764e-06, + "loss": 2.2602, + "step": 1050 + }, + { + "epoch": 0.05638412017167382, + "grad_norm": 0.453125, + "learning_rate": 1.8794706723891277e-06, + "loss": 2.7601, + "step": 1051 + }, + { + "epoch": 0.05643776824034335, + "grad_norm": 0.388671875, + "learning_rate": 1.8812589413447785e-06, + "loss": 2.5427, + "step": 1052 + }, + { + "epoch": 0.056491416309012873, + "grad_norm": 0.427734375, + "learning_rate": 1.8830472103004294e-06, + "loss": 2.3579, + "step": 1053 + }, + { + "epoch": 0.056545064377682405, + "grad_norm": 0.30859375, + "learning_rate": 1.8848354792560803e-06, + "loss": 2.3036, + "step": 1054 + }, + { + "epoch": 0.05659871244635193, + "grad_norm": 0.283203125, + "learning_rate": 1.8866237482117312e-06, + "loss": 2.2291, + "step": 1055 + }, + { + "epoch": 0.05665236051502146, + "grad_norm": 0.3828125, + "learning_rate": 1.8884120171673823e-06, + "loss": 2.3804, + "step": 1056 + }, + { + "epoch": 0.05670600858369099, + "grad_norm": 0.29296875, + "learning_rate": 1.8902002861230331e-06, + "loss": 2.2085, + "step": 1057 + }, + { + "epoch": 0.05675965665236052, + "grad_norm": 0.4453125, + "learning_rate": 1.891988555078684e-06, + "loss": 2.4983, + "step": 1058 + }, + { + "epoch": 0.05681330472103004, + "grad_norm": 0.3984375, + "learning_rate": 1.8937768240343349e-06, + "loss": 2.3611, + "step": 1059 + }, + { + "epoch": 0.05686695278969957, + "grad_norm": 0.376953125, + "learning_rate": 1.8955650929899858e-06, + "loss": 2.4902, + "step": 1060 + }, + { + "epoch": 0.0569206008583691, + "grad_norm": 0.427734375, + "learning_rate": 1.8973533619456368e-06, + "loss": 2.3957, + "step": 1061 + }, + { + "epoch": 0.056974248927038625, + "grad_norm": 0.357421875, + "learning_rate": 1.8991416309012877e-06, + "loss": 2.2801, + "step": 1062 + }, + { + "epoch": 0.057027896995708156, + "grad_norm": 0.431640625, + "learning_rate": 1.9009298998569386e-06, + "loss": 2.4692, + "step": 1063 + }, + { + "epoch": 0.05708154506437768, + "grad_norm": 0.482421875, + "learning_rate": 1.9027181688125895e-06, + "loss": 1.8775, + "step": 1064 + }, + { + "epoch": 0.05713519313304721, + "grad_norm": 0.27734375, + "learning_rate": 1.9045064377682403e-06, + "loss": 2.3998, + "step": 1065 + }, + { + "epoch": 0.05718884120171674, + "grad_norm": 0.4375, + "learning_rate": 1.9062947067238916e-06, + "loss": 2.1461, + "step": 1066 + }, + { + "epoch": 0.05724248927038627, + "grad_norm": 0.283203125, + "learning_rate": 1.9080829756795425e-06, + "loss": 2.1151, + "step": 1067 + }, + { + "epoch": 0.057296137339055794, + "grad_norm": 0.44140625, + "learning_rate": 1.9098712446351934e-06, + "loss": 2.4771, + "step": 1068 + }, + { + "epoch": 0.05734978540772532, + "grad_norm": 0.5, + "learning_rate": 1.9116595135908443e-06, + "loss": 2.6075, + "step": 1069 + }, + { + "epoch": 0.05740343347639485, + "grad_norm": 0.4296875, + "learning_rate": 1.913447782546495e-06, + "loss": 2.2719, + "step": 1070 + }, + { + "epoch": 0.057457081545064376, + "grad_norm": 0.341796875, + "learning_rate": 1.915236051502146e-06, + "loss": 2.1579, + "step": 1071 + }, + { + "epoch": 0.05751072961373391, + "grad_norm": 0.2578125, + "learning_rate": 1.917024320457797e-06, + "loss": 2.2327, + "step": 1072 + }, + { + "epoch": 0.05756437768240343, + "grad_norm": 0.30078125, + "learning_rate": 1.9188125894134478e-06, + "loss": 2.3832, + "step": 1073 + }, + { + "epoch": 0.057618025751072964, + "grad_norm": 0.59765625, + "learning_rate": 1.9206008583690987e-06, + "loss": 1.6711, + "step": 1074 + }, + { + "epoch": 0.05767167381974249, + "grad_norm": 0.37890625, + "learning_rate": 1.9223891273247495e-06, + "loss": 2.2593, + "step": 1075 + }, + { + "epoch": 0.05772532188841202, + "grad_norm": 0.326171875, + "learning_rate": 1.924177396280401e-06, + "loss": 2.617, + "step": 1076 + }, + { + "epoch": 0.057778969957081545, + "grad_norm": 0.3046875, + "learning_rate": 1.9259656652360517e-06, + "loss": 2.4137, + "step": 1077 + }, + { + "epoch": 0.05783261802575107, + "grad_norm": 0.416015625, + "learning_rate": 1.9277539341917026e-06, + "loss": 2.3979, + "step": 1078 + }, + { + "epoch": 0.0578862660944206, + "grad_norm": 0.5078125, + "learning_rate": 1.9295422031473535e-06, + "loss": 2.566, + "step": 1079 + }, + { + "epoch": 0.05793991416309013, + "grad_norm": 0.384765625, + "learning_rate": 1.9313304721030043e-06, + "loss": 2.4632, + "step": 1080 + }, + { + "epoch": 0.05799356223175966, + "grad_norm": 0.2890625, + "learning_rate": 1.9331187410586556e-06, + "loss": 2.407, + "step": 1081 + }, + { + "epoch": 0.05804721030042918, + "grad_norm": 0.73046875, + "learning_rate": 1.9349070100143065e-06, + "loss": 1.8138, + "step": 1082 + }, + { + "epoch": 0.058100858369098715, + "grad_norm": 0.31640625, + "learning_rate": 1.9366952789699574e-06, + "loss": 2.3145, + "step": 1083 + }, + { + "epoch": 0.05815450643776824, + "grad_norm": 0.3359375, + "learning_rate": 1.9384835479256083e-06, + "loss": 2.3573, + "step": 1084 + }, + { + "epoch": 0.058208154506437765, + "grad_norm": 0.34375, + "learning_rate": 1.940271816881259e-06, + "loss": 2.4484, + "step": 1085 + }, + { + "epoch": 0.0582618025751073, + "grad_norm": 0.337890625, + "learning_rate": 1.94206008583691e-06, + "loss": 2.6359, + "step": 1086 + }, + { + "epoch": 0.05831545064377682, + "grad_norm": 0.302734375, + "learning_rate": 1.943848354792561e-06, + "loss": 2.3065, + "step": 1087 + }, + { + "epoch": 0.05836909871244635, + "grad_norm": 0.515625, + "learning_rate": 1.9456366237482118e-06, + "loss": 2.5201, + "step": 1088 + }, + { + "epoch": 0.05842274678111588, + "grad_norm": 0.3125, + "learning_rate": 1.9474248927038626e-06, + "loss": 2.5494, + "step": 1089 + }, + { + "epoch": 0.05847639484978541, + "grad_norm": 0.25390625, + "learning_rate": 1.9492131616595135e-06, + "loss": 2.3559, + "step": 1090 + }, + { + "epoch": 0.058530042918454935, + "grad_norm": 0.26171875, + "learning_rate": 1.951001430615165e-06, + "loss": 2.3355, + "step": 1091 + }, + { + "epoch": 0.058583690987124466, + "grad_norm": 0.453125, + "learning_rate": 1.9527896995708157e-06, + "loss": 2.2991, + "step": 1092 + }, + { + "epoch": 0.05863733905579399, + "grad_norm": 0.330078125, + "learning_rate": 1.9545779685264666e-06, + "loss": 2.4964, + "step": 1093 + }, + { + "epoch": 0.058690987124463516, + "grad_norm": 0.27734375, + "learning_rate": 1.9563662374821174e-06, + "loss": 2.376, + "step": 1094 + }, + { + "epoch": 0.05874463519313305, + "grad_norm": 0.369140625, + "learning_rate": 1.9581545064377683e-06, + "loss": 2.3024, + "step": 1095 + }, + { + "epoch": 0.05879828326180257, + "grad_norm": 0.390625, + "learning_rate": 1.9599427753934196e-06, + "loss": 2.3209, + "step": 1096 + }, + { + "epoch": 0.058851931330472104, + "grad_norm": 0.515625, + "learning_rate": 1.9617310443490705e-06, + "loss": 2.1563, + "step": 1097 + }, + { + "epoch": 0.05890557939914163, + "grad_norm": 0.333984375, + "learning_rate": 1.9635193133047214e-06, + "loss": 2.5832, + "step": 1098 + }, + { + "epoch": 0.05895922746781116, + "grad_norm": 0.294921875, + "learning_rate": 1.9653075822603722e-06, + "loss": 2.2935, + "step": 1099 + }, + { + "epoch": 0.059012875536480686, + "grad_norm": 0.349609375, + "learning_rate": 1.967095851216023e-06, + "loss": 2.4748, + "step": 1100 + }, + { + "epoch": 0.05906652360515022, + "grad_norm": 0.51171875, + "learning_rate": 1.968884120171674e-06, + "loss": 2.4425, + "step": 1101 + }, + { + "epoch": 0.05912017167381974, + "grad_norm": 0.306640625, + "learning_rate": 1.970672389127325e-06, + "loss": 2.5559, + "step": 1102 + }, + { + "epoch": 0.05917381974248927, + "grad_norm": 0.64453125, + "learning_rate": 1.9724606580829757e-06, + "loss": 1.3932, + "step": 1103 + }, + { + "epoch": 0.0592274678111588, + "grad_norm": 0.361328125, + "learning_rate": 1.9742489270386266e-06, + "loss": 2.4042, + "step": 1104 + }, + { + "epoch": 0.059281115879828324, + "grad_norm": 0.33984375, + "learning_rate": 1.9760371959942775e-06, + "loss": 2.6409, + "step": 1105 + }, + { + "epoch": 0.059334763948497855, + "grad_norm": 0.498046875, + "learning_rate": 1.977825464949929e-06, + "loss": 2.436, + "step": 1106 + }, + { + "epoch": 0.05938841201716738, + "grad_norm": 0.3671875, + "learning_rate": 1.9796137339055797e-06, + "loss": 2.2128, + "step": 1107 + }, + { + "epoch": 0.05944206008583691, + "grad_norm": 0.59375, + "learning_rate": 1.9814020028612305e-06, + "loss": 1.4469, + "step": 1108 + }, + { + "epoch": 0.05949570815450644, + "grad_norm": 0.609375, + "learning_rate": 1.9831902718168814e-06, + "loss": 2.41, + "step": 1109 + }, + { + "epoch": 0.05954935622317597, + "grad_norm": 0.3828125, + "learning_rate": 1.9849785407725323e-06, + "loss": 2.3678, + "step": 1110 + }, + { + "epoch": 0.05960300429184549, + "grad_norm": 0.451171875, + "learning_rate": 1.986766809728183e-06, + "loss": 2.4742, + "step": 1111 + }, + { + "epoch": 0.05965665236051502, + "grad_norm": 0.33984375, + "learning_rate": 1.988555078683834e-06, + "loss": 2.4125, + "step": 1112 + }, + { + "epoch": 0.05971030042918455, + "grad_norm": 0.478515625, + "learning_rate": 1.990343347639485e-06, + "loss": 2.4311, + "step": 1113 + }, + { + "epoch": 0.059763948497854075, + "grad_norm": 0.30078125, + "learning_rate": 1.992131616595136e-06, + "loss": 2.3403, + "step": 1114 + }, + { + "epoch": 0.059817596566523606, + "grad_norm": 0.36328125, + "learning_rate": 1.993919885550787e-06, + "loss": 2.5221, + "step": 1115 + }, + { + "epoch": 0.05987124463519313, + "grad_norm": 0.337890625, + "learning_rate": 1.995708154506438e-06, + "loss": 2.3283, + "step": 1116 + }, + { + "epoch": 0.05992489270386266, + "grad_norm": 0.365234375, + "learning_rate": 1.997496423462089e-06, + "loss": 2.6577, + "step": 1117 + }, + { + "epoch": 0.05997854077253219, + "grad_norm": 0.46875, + "learning_rate": 1.9992846924177397e-06, + "loss": 2.4224, + "step": 1118 + }, + { + "epoch": 0.06003218884120172, + "grad_norm": 0.35546875, + "learning_rate": 2.0010729613733906e-06, + "loss": 2.4903, + "step": 1119 + }, + { + "epoch": 0.060085836909871244, + "grad_norm": 0.443359375, + "learning_rate": 2.002861230329042e-06, + "loss": 2.3487, + "step": 1120 + }, + { + "epoch": 0.06013948497854077, + "grad_norm": 0.29296875, + "learning_rate": 2.0046494992846928e-06, + "loss": 2.3983, + "step": 1121 + }, + { + "epoch": 0.0601931330472103, + "grad_norm": 0.27734375, + "learning_rate": 2.0064377682403437e-06, + "loss": 2.3932, + "step": 1122 + }, + { + "epoch": 0.060246781115879826, + "grad_norm": 0.71484375, + "learning_rate": 2.0082260371959945e-06, + "loss": 2.48, + "step": 1123 + }, + { + "epoch": 0.06030042918454936, + "grad_norm": 0.373046875, + "learning_rate": 2.0100143061516454e-06, + "loss": 2.4728, + "step": 1124 + }, + { + "epoch": 0.06035407725321888, + "grad_norm": 0.310546875, + "learning_rate": 2.0118025751072963e-06, + "loss": 2.4378, + "step": 1125 + }, + { + "epoch": 0.060407725321888414, + "grad_norm": 0.27734375, + "learning_rate": 2.013590844062947e-06, + "loss": 2.435, + "step": 1126 + }, + { + "epoch": 0.06046137339055794, + "grad_norm": 0.30859375, + "learning_rate": 2.015379113018598e-06, + "loss": 2.2777, + "step": 1127 + }, + { + "epoch": 0.06051502145922747, + "grad_norm": 0.416015625, + "learning_rate": 2.017167381974249e-06, + "loss": 2.3112, + "step": 1128 + }, + { + "epoch": 0.060568669527896996, + "grad_norm": 0.4140625, + "learning_rate": 2.0189556509298998e-06, + "loss": 2.0091, + "step": 1129 + }, + { + "epoch": 0.06062231759656652, + "grad_norm": 0.27734375, + "learning_rate": 2.020743919885551e-06, + "loss": 2.4632, + "step": 1130 + }, + { + "epoch": 0.06067596566523605, + "grad_norm": 0.3828125, + "learning_rate": 2.022532188841202e-06, + "loss": 1.7995, + "step": 1131 + }, + { + "epoch": 0.06072961373390558, + "grad_norm": 0.298828125, + "learning_rate": 2.024320457796853e-06, + "loss": 2.605, + "step": 1132 + }, + { + "epoch": 0.06078326180257511, + "grad_norm": 0.4375, + "learning_rate": 2.0261087267525037e-06, + "loss": 2.4429, + "step": 1133 + }, + { + "epoch": 0.060836909871244634, + "grad_norm": 0.326171875, + "learning_rate": 2.0278969957081546e-06, + "loss": 2.3473, + "step": 1134 + }, + { + "epoch": 0.060890557939914165, + "grad_norm": 0.34375, + "learning_rate": 2.029685264663806e-06, + "loss": 2.2276, + "step": 1135 + }, + { + "epoch": 0.06094420600858369, + "grad_norm": 0.578125, + "learning_rate": 2.0314735336194568e-06, + "loss": 2.0804, + "step": 1136 + }, + { + "epoch": 0.06099785407725322, + "grad_norm": 0.337890625, + "learning_rate": 2.0332618025751076e-06, + "loss": 2.2593, + "step": 1137 + }, + { + "epoch": 0.06105150214592275, + "grad_norm": 0.328125, + "learning_rate": 2.0350500715307585e-06, + "loss": 2.4027, + "step": 1138 + }, + { + "epoch": 0.06110515021459227, + "grad_norm": 0.330078125, + "learning_rate": 2.0368383404864094e-06, + "loss": 2.4081, + "step": 1139 + }, + { + "epoch": 0.0611587982832618, + "grad_norm": 0.32421875, + "learning_rate": 2.0386266094420603e-06, + "loss": 2.6636, + "step": 1140 + }, + { + "epoch": 0.06121244635193133, + "grad_norm": 2.96875, + "learning_rate": 2.040414878397711e-06, + "loss": 2.3214, + "step": 1141 + }, + { + "epoch": 0.06126609442060086, + "grad_norm": 0.375, + "learning_rate": 2.042203147353362e-06, + "loss": 2.5356, + "step": 1142 + }, + { + "epoch": 0.061319742489270385, + "grad_norm": 0.302734375, + "learning_rate": 2.043991416309013e-06, + "loss": 2.385, + "step": 1143 + }, + { + "epoch": 0.061373390557939916, + "grad_norm": 0.453125, + "learning_rate": 2.0457796852646638e-06, + "loss": 2.5885, + "step": 1144 + }, + { + "epoch": 0.06142703862660944, + "grad_norm": 0.333984375, + "learning_rate": 2.047567954220315e-06, + "loss": 2.3704, + "step": 1145 + }, + { + "epoch": 0.06148068669527897, + "grad_norm": 0.4296875, + "learning_rate": 2.049356223175966e-06, + "loss": 2.3786, + "step": 1146 + }, + { + "epoch": 0.0615343347639485, + "grad_norm": 0.337890625, + "learning_rate": 2.051144492131617e-06, + "loss": 2.4855, + "step": 1147 + }, + { + "epoch": 0.06158798283261802, + "grad_norm": 0.376953125, + "learning_rate": 2.0529327610872677e-06, + "loss": 2.3171, + "step": 1148 + }, + { + "epoch": 0.061641630901287554, + "grad_norm": 0.412109375, + "learning_rate": 2.0547210300429186e-06, + "loss": 2.142, + "step": 1149 + }, + { + "epoch": 0.06169527896995708, + "grad_norm": 0.28125, + "learning_rate": 2.0565092989985694e-06, + "loss": 2.5542, + "step": 1150 + }, + { + "epoch": 0.06174892703862661, + "grad_norm": 0.330078125, + "learning_rate": 2.0582975679542203e-06, + "loss": 2.0082, + "step": 1151 + }, + { + "epoch": 0.061802575107296136, + "grad_norm": 0.4140625, + "learning_rate": 2.060085836909871e-06, + "loss": 2.4426, + "step": 1152 + }, + { + "epoch": 0.06185622317596567, + "grad_norm": 0.298828125, + "learning_rate": 2.061874105865522e-06, + "loss": 2.3753, + "step": 1153 + }, + { + "epoch": 0.06190987124463519, + "grad_norm": 0.322265625, + "learning_rate": 2.063662374821173e-06, + "loss": 2.6331, + "step": 1154 + }, + { + "epoch": 0.061963519313304724, + "grad_norm": 0.421875, + "learning_rate": 2.0654506437768243e-06, + "loss": 2.3861, + "step": 1155 + }, + { + "epoch": 0.06201716738197425, + "grad_norm": 0.42578125, + "learning_rate": 2.067238912732475e-06, + "loss": 2.5383, + "step": 1156 + }, + { + "epoch": 0.062070815450643774, + "grad_norm": 0.2734375, + "learning_rate": 2.069027181688126e-06, + "loss": 2.4629, + "step": 1157 + }, + { + "epoch": 0.062124463519313305, + "grad_norm": 0.392578125, + "learning_rate": 2.070815450643777e-06, + "loss": 2.2688, + "step": 1158 + }, + { + "epoch": 0.06217811158798283, + "grad_norm": 0.3515625, + "learning_rate": 2.0726037195994278e-06, + "loss": 2.3472, + "step": 1159 + }, + { + "epoch": 0.06223175965665236, + "grad_norm": 0.56640625, + "learning_rate": 2.074391988555079e-06, + "loss": 1.9622, + "step": 1160 + }, + { + "epoch": 0.06228540772532189, + "grad_norm": 0.341796875, + "learning_rate": 2.07618025751073e-06, + "loss": 2.5177, + "step": 1161 + }, + { + "epoch": 0.06233905579399142, + "grad_norm": 0.51171875, + "learning_rate": 2.077968526466381e-06, + "loss": 2.4929, + "step": 1162 + }, + { + "epoch": 0.06239270386266094, + "grad_norm": 0.322265625, + "learning_rate": 2.0797567954220317e-06, + "loss": 2.0277, + "step": 1163 + }, + { + "epoch": 0.062446351931330475, + "grad_norm": 0.271484375, + "learning_rate": 2.0815450643776826e-06, + "loss": 2.2877, + "step": 1164 + }, + { + "epoch": 0.0625, + "grad_norm": 0.322265625, + "learning_rate": 2.0833333333333334e-06, + "loss": 2.5657, + "step": 1165 + }, + { + "epoch": 0.06255364806866953, + "grad_norm": 0.4453125, + "learning_rate": 2.0851216022889843e-06, + "loss": 2.4232, + "step": 1166 + }, + { + "epoch": 0.06260729613733905, + "grad_norm": 0.369140625, + "learning_rate": 2.086909871244635e-06, + "loss": 2.5732, + "step": 1167 + }, + { + "epoch": 0.06266094420600858, + "grad_norm": 0.314453125, + "learning_rate": 2.088698140200286e-06, + "loss": 2.3661, + "step": 1168 + }, + { + "epoch": 0.06271459227467811, + "grad_norm": 0.267578125, + "learning_rate": 2.090486409155937e-06, + "loss": 2.3541, + "step": 1169 + }, + { + "epoch": 0.06276824034334764, + "grad_norm": 0.298828125, + "learning_rate": 2.0922746781115882e-06, + "loss": 2.5543, + "step": 1170 + }, + { + "epoch": 0.06282188841201716, + "grad_norm": 0.296875, + "learning_rate": 2.094062947067239e-06, + "loss": 2.2346, + "step": 1171 + }, + { + "epoch": 0.0628755364806867, + "grad_norm": 0.330078125, + "learning_rate": 2.09585121602289e-06, + "loss": 2.3261, + "step": 1172 + }, + { + "epoch": 0.06292918454935623, + "grad_norm": 0.359375, + "learning_rate": 2.097639484978541e-06, + "loss": 2.4965, + "step": 1173 + }, + { + "epoch": 0.06298283261802574, + "grad_norm": 0.373046875, + "learning_rate": 2.0994277539341917e-06, + "loss": 2.2068, + "step": 1174 + }, + { + "epoch": 0.06303648068669528, + "grad_norm": 0.287109375, + "learning_rate": 2.101216022889843e-06, + "loss": 2.0726, + "step": 1175 + }, + { + "epoch": 0.06309012875536481, + "grad_norm": 0.271484375, + "learning_rate": 2.103004291845494e-06, + "loss": 2.5033, + "step": 1176 + }, + { + "epoch": 0.06314377682403434, + "grad_norm": 0.330078125, + "learning_rate": 2.104792560801145e-06, + "loss": 2.4512, + "step": 1177 + }, + { + "epoch": 0.06319742489270386, + "grad_norm": 0.333984375, + "learning_rate": 2.1065808297567957e-06, + "loss": 2.4084, + "step": 1178 + }, + { + "epoch": 0.06325107296137339, + "grad_norm": 0.390625, + "learning_rate": 2.1083690987124465e-06, + "loss": 2.4135, + "step": 1179 + }, + { + "epoch": 0.06330472103004292, + "grad_norm": 0.298828125, + "learning_rate": 2.1101573676680974e-06, + "loss": 2.4083, + "step": 1180 + }, + { + "epoch": 0.06335836909871245, + "grad_norm": 0.390625, + "learning_rate": 2.1119456366237483e-06, + "loss": 2.1365, + "step": 1181 + }, + { + "epoch": 0.06341201716738197, + "grad_norm": 0.33203125, + "learning_rate": 2.113733905579399e-06, + "loss": 2.3492, + "step": 1182 + }, + { + "epoch": 0.0634656652360515, + "grad_norm": 1.953125, + "learning_rate": 2.11552217453505e-06, + "loss": 2.4888, + "step": 1183 + }, + { + "epoch": 0.06351931330472103, + "grad_norm": 0.357421875, + "learning_rate": 2.1173104434907013e-06, + "loss": 2.3181, + "step": 1184 + }, + { + "epoch": 0.06357296137339055, + "grad_norm": 0.466796875, + "learning_rate": 2.1190987124463522e-06, + "loss": 2.4765, + "step": 1185 + }, + { + "epoch": 0.06362660944206008, + "grad_norm": 0.39453125, + "learning_rate": 2.120886981402003e-06, + "loss": 2.355, + "step": 1186 + }, + { + "epoch": 0.06368025751072962, + "grad_norm": 0.326171875, + "learning_rate": 2.122675250357654e-06, + "loss": 2.5157, + "step": 1187 + }, + { + "epoch": 0.06373390557939915, + "grad_norm": 0.373046875, + "learning_rate": 2.124463519313305e-06, + "loss": 2.4965, + "step": 1188 + }, + { + "epoch": 0.06378755364806867, + "grad_norm": 0.890625, + "learning_rate": 2.126251788268956e-06, + "loss": 2.5433, + "step": 1189 + }, + { + "epoch": 0.0638412017167382, + "grad_norm": 2.9375, + "learning_rate": 2.128040057224607e-06, + "loss": 2.4044, + "step": 1190 + }, + { + "epoch": 0.06389484978540773, + "grad_norm": 0.279296875, + "learning_rate": 2.129828326180258e-06, + "loss": 2.2361, + "step": 1191 + }, + { + "epoch": 0.06394849785407725, + "grad_norm": 0.298828125, + "learning_rate": 2.1316165951359088e-06, + "loss": 2.4295, + "step": 1192 + }, + { + "epoch": 0.06400214592274678, + "grad_norm": 0.32421875, + "learning_rate": 2.1334048640915597e-06, + "loss": 2.5419, + "step": 1193 + }, + { + "epoch": 0.06405579399141631, + "grad_norm": 0.32421875, + "learning_rate": 2.1351931330472105e-06, + "loss": 2.4801, + "step": 1194 + }, + { + "epoch": 0.06410944206008584, + "grad_norm": 0.40234375, + "learning_rate": 2.1369814020028614e-06, + "loss": 2.5202, + "step": 1195 + }, + { + "epoch": 0.06416309012875536, + "grad_norm": 0.3046875, + "learning_rate": 2.1387696709585123e-06, + "loss": 2.5454, + "step": 1196 + }, + { + "epoch": 0.06421673819742489, + "grad_norm": 0.287109375, + "learning_rate": 2.140557939914163e-06, + "loss": 2.526, + "step": 1197 + }, + { + "epoch": 0.06427038626609442, + "grad_norm": 0.435546875, + "learning_rate": 2.142346208869814e-06, + "loss": 2.5152, + "step": 1198 + }, + { + "epoch": 0.06432403433476395, + "grad_norm": 0.287109375, + "learning_rate": 2.1441344778254653e-06, + "loss": 2.2807, + "step": 1199 + }, + { + "epoch": 0.06437768240343347, + "grad_norm": 0.390625, + "learning_rate": 2.145922746781116e-06, + "loss": 2.2159, + "step": 1200 + }, + { + "epoch": 0.064431330472103, + "grad_norm": 0.32421875, + "learning_rate": 2.147711015736767e-06, + "loss": 2.5431, + "step": 1201 + }, + { + "epoch": 0.06448497854077254, + "grad_norm": 0.349609375, + "learning_rate": 2.149499284692418e-06, + "loss": 2.7205, + "step": 1202 + }, + { + "epoch": 0.06453862660944205, + "grad_norm": 0.35546875, + "learning_rate": 2.151287553648069e-06, + "loss": 2.2434, + "step": 1203 + }, + { + "epoch": 0.06459227467811159, + "grad_norm": 0.357421875, + "learning_rate": 2.1530758226037197e-06, + "loss": 2.5699, + "step": 1204 + }, + { + "epoch": 0.06464592274678112, + "grad_norm": 0.28125, + "learning_rate": 2.1548640915593706e-06, + "loss": 2.4834, + "step": 1205 + }, + { + "epoch": 0.06469957081545065, + "grad_norm": 0.29296875, + "learning_rate": 2.1566523605150215e-06, + "loss": 2.2271, + "step": 1206 + }, + { + "epoch": 0.06475321888412017, + "grad_norm": 0.283203125, + "learning_rate": 2.1584406294706723e-06, + "loss": 2.396, + "step": 1207 + }, + { + "epoch": 0.0648068669527897, + "grad_norm": 0.400390625, + "learning_rate": 2.160228898426323e-06, + "loss": 2.2544, + "step": 1208 + }, + { + "epoch": 0.06486051502145923, + "grad_norm": 0.310546875, + "learning_rate": 2.1620171673819745e-06, + "loss": 2.4526, + "step": 1209 + }, + { + "epoch": 0.06491416309012875, + "grad_norm": 0.27734375, + "learning_rate": 2.1638054363376254e-06, + "loss": 2.3646, + "step": 1210 + }, + { + "epoch": 0.06496781115879828, + "grad_norm": 0.51953125, + "learning_rate": 2.1655937052932763e-06, + "loss": 2.5277, + "step": 1211 + }, + { + "epoch": 0.06502145922746781, + "grad_norm": 0.310546875, + "learning_rate": 2.167381974248927e-06, + "loss": 2.2434, + "step": 1212 + }, + { + "epoch": 0.06507510729613734, + "grad_norm": 0.4375, + "learning_rate": 2.169170243204578e-06, + "loss": 2.3638, + "step": 1213 + }, + { + "epoch": 0.06512875536480686, + "grad_norm": 0.271484375, + "learning_rate": 2.1709585121602293e-06, + "loss": 2.3775, + "step": 1214 + }, + { + "epoch": 0.0651824034334764, + "grad_norm": 0.326171875, + "learning_rate": 2.17274678111588e-06, + "loss": 2.2663, + "step": 1215 + }, + { + "epoch": 0.06523605150214593, + "grad_norm": 0.28515625, + "learning_rate": 2.174535050071531e-06, + "loss": 2.4125, + "step": 1216 + }, + { + "epoch": 0.06528969957081546, + "grad_norm": 0.369140625, + "learning_rate": 2.176323319027182e-06, + "loss": 2.2322, + "step": 1217 + }, + { + "epoch": 0.06534334763948497, + "grad_norm": 0.482421875, + "learning_rate": 2.178111587982833e-06, + "loss": 2.4743, + "step": 1218 + }, + { + "epoch": 0.0653969957081545, + "grad_norm": 0.44921875, + "learning_rate": 2.1798998569384837e-06, + "loss": 1.6516, + "step": 1219 + }, + { + "epoch": 0.06545064377682404, + "grad_norm": 0.46875, + "learning_rate": 2.1816881258941346e-06, + "loss": 1.718, + "step": 1220 + }, + { + "epoch": 0.06550429184549356, + "grad_norm": 0.4765625, + "learning_rate": 2.1834763948497854e-06, + "loss": 2.493, + "step": 1221 + }, + { + "epoch": 0.06555793991416309, + "grad_norm": 0.4765625, + "learning_rate": 2.1852646638054363e-06, + "loss": 1.6151, + "step": 1222 + }, + { + "epoch": 0.06561158798283262, + "grad_norm": 0.490234375, + "learning_rate": 2.187052932761087e-06, + "loss": 2.2422, + "step": 1223 + }, + { + "epoch": 0.06566523605150215, + "grad_norm": 0.283203125, + "learning_rate": 2.1888412017167385e-06, + "loss": 2.3243, + "step": 1224 + }, + { + "epoch": 0.06571888412017167, + "grad_norm": 0.341796875, + "learning_rate": 2.1906294706723894e-06, + "loss": 2.369, + "step": 1225 + }, + { + "epoch": 0.0657725321888412, + "grad_norm": 0.44140625, + "learning_rate": 2.1924177396280402e-06, + "loss": 2.2829, + "step": 1226 + }, + { + "epoch": 0.06582618025751073, + "grad_norm": 0.2890625, + "learning_rate": 2.194206008583691e-06, + "loss": 2.3281, + "step": 1227 + }, + { + "epoch": 0.06587982832618025, + "grad_norm": 0.8203125, + "learning_rate": 2.195994277539342e-06, + "loss": 2.7643, + "step": 1228 + }, + { + "epoch": 0.06593347639484978, + "grad_norm": 0.330078125, + "learning_rate": 2.1977825464949933e-06, + "loss": 2.3876, + "step": 1229 + }, + { + "epoch": 0.06598712446351931, + "grad_norm": 0.341796875, + "learning_rate": 2.199570815450644e-06, + "loss": 2.2415, + "step": 1230 + }, + { + "epoch": 0.06604077253218885, + "grad_norm": 0.94921875, + "learning_rate": 2.201359084406295e-06, + "loss": 2.4427, + "step": 1231 + }, + { + "epoch": 0.06609442060085836, + "grad_norm": 0.8203125, + "learning_rate": 2.203147353361946e-06, + "loss": 2.5501, + "step": 1232 + }, + { + "epoch": 0.0661480686695279, + "grad_norm": 0.337890625, + "learning_rate": 2.204935622317597e-06, + "loss": 2.2966, + "step": 1233 + }, + { + "epoch": 0.06620171673819743, + "grad_norm": 1.5625, + "learning_rate": 2.2067238912732477e-06, + "loss": 2.5513, + "step": 1234 + }, + { + "epoch": 0.06625536480686696, + "grad_norm": 0.3203125, + "learning_rate": 2.2085121602288986e-06, + "loss": 2.294, + "step": 1235 + }, + { + "epoch": 0.06630901287553648, + "grad_norm": 0.4921875, + "learning_rate": 2.2103004291845494e-06, + "loss": 2.2894, + "step": 1236 + }, + { + "epoch": 0.06636266094420601, + "grad_norm": 0.265625, + "learning_rate": 2.2120886981402003e-06, + "loss": 2.4269, + "step": 1237 + }, + { + "epoch": 0.06641630901287554, + "grad_norm": 0.263671875, + "learning_rate": 2.213876967095851e-06, + "loss": 2.2876, + "step": 1238 + }, + { + "epoch": 0.06646995708154506, + "grad_norm": 0.55859375, + "learning_rate": 2.2156652360515025e-06, + "loss": 2.6765, + "step": 1239 + }, + { + "epoch": 0.06652360515021459, + "grad_norm": 0.271484375, + "learning_rate": 2.2174535050071534e-06, + "loss": 2.3873, + "step": 1240 + }, + { + "epoch": 0.06657725321888412, + "grad_norm": 0.59375, + "learning_rate": 2.2192417739628042e-06, + "loss": 2.5207, + "step": 1241 + }, + { + "epoch": 0.06663090128755365, + "grad_norm": 0.60546875, + "learning_rate": 2.221030042918455e-06, + "loss": 1.6185, + "step": 1242 + }, + { + "epoch": 0.06668454935622317, + "grad_norm": 0.470703125, + "learning_rate": 2.222818311874106e-06, + "loss": 2.3341, + "step": 1243 + }, + { + "epoch": 0.0667381974248927, + "grad_norm": 0.53125, + "learning_rate": 2.224606580829757e-06, + "loss": 2.3384, + "step": 1244 + }, + { + "epoch": 0.06679184549356224, + "grad_norm": 0.330078125, + "learning_rate": 2.2263948497854077e-06, + "loss": 2.393, + "step": 1245 + }, + { + "epoch": 0.06684549356223175, + "grad_norm": 0.2890625, + "learning_rate": 2.2281831187410586e-06, + "loss": 2.3016, + "step": 1246 + }, + { + "epoch": 0.06689914163090128, + "grad_norm": 0.53515625, + "learning_rate": 2.2299713876967095e-06, + "loss": 2.3031, + "step": 1247 + }, + { + "epoch": 0.06695278969957082, + "grad_norm": 0.310546875, + "learning_rate": 2.2317596566523608e-06, + "loss": 2.5499, + "step": 1248 + }, + { + "epoch": 0.06700643776824035, + "grad_norm": 0.384765625, + "learning_rate": 2.2335479256080117e-06, + "loss": 2.3637, + "step": 1249 + }, + { + "epoch": 0.06706008583690987, + "grad_norm": 0.28515625, + "learning_rate": 2.2353361945636625e-06, + "loss": 2.3558, + "step": 1250 + }, + { + "epoch": 0.0671137339055794, + "grad_norm": 0.310546875, + "learning_rate": 2.2371244635193134e-06, + "loss": 2.3235, + "step": 1251 + }, + { + "epoch": 0.06716738197424893, + "grad_norm": 0.2734375, + "learning_rate": 2.2389127324749643e-06, + "loss": 2.4403, + "step": 1252 + }, + { + "epoch": 0.06722103004291846, + "grad_norm": 0.3046875, + "learning_rate": 2.2407010014306156e-06, + "loss": 2.4165, + "step": 1253 + }, + { + "epoch": 0.06727467811158798, + "grad_norm": 0.34765625, + "learning_rate": 2.2424892703862665e-06, + "loss": 2.3628, + "step": 1254 + }, + { + "epoch": 0.06732832618025751, + "grad_norm": 5.71875, + "learning_rate": 2.2442775393419173e-06, + "loss": 2.3425, + "step": 1255 + }, + { + "epoch": 0.06738197424892704, + "grad_norm": 0.3828125, + "learning_rate": 2.2460658082975682e-06, + "loss": 2.4959, + "step": 1256 + }, + { + "epoch": 0.06743562231759656, + "grad_norm": 0.4296875, + "learning_rate": 2.247854077253219e-06, + "loss": 2.3719, + "step": 1257 + }, + { + "epoch": 0.06748927038626609, + "grad_norm": 0.27734375, + "learning_rate": 2.24964234620887e-06, + "loss": 2.4724, + "step": 1258 + }, + { + "epoch": 0.06754291845493562, + "grad_norm": 0.310546875, + "learning_rate": 2.251430615164521e-06, + "loss": 2.07, + "step": 1259 + }, + { + "epoch": 0.06759656652360516, + "grad_norm": 0.412109375, + "learning_rate": 2.2532188841201717e-06, + "loss": 2.4505, + "step": 1260 + }, + { + "epoch": 0.06765021459227467, + "grad_norm": 0.41015625, + "learning_rate": 2.2550071530758226e-06, + "loss": 2.3488, + "step": 1261 + }, + { + "epoch": 0.0677038626609442, + "grad_norm": 0.4921875, + "learning_rate": 2.2567954220314735e-06, + "loss": 1.9886, + "step": 1262 + }, + { + "epoch": 0.06775751072961374, + "grad_norm": 0.408203125, + "learning_rate": 2.2585836909871248e-06, + "loss": 2.4132, + "step": 1263 + }, + { + "epoch": 0.06781115879828326, + "grad_norm": 0.87890625, + "learning_rate": 2.2603719599427756e-06, + "loss": 2.586, + "step": 1264 + }, + { + "epoch": 0.06786480686695279, + "grad_norm": 0.28515625, + "learning_rate": 2.2621602288984265e-06, + "loss": 2.4215, + "step": 1265 + }, + { + "epoch": 0.06791845493562232, + "grad_norm": 0.62109375, + "learning_rate": 2.2639484978540774e-06, + "loss": 2.8144, + "step": 1266 + }, + { + "epoch": 0.06797210300429185, + "grad_norm": 0.26171875, + "learning_rate": 2.2657367668097283e-06, + "loss": 2.2076, + "step": 1267 + }, + { + "epoch": 0.06802575107296137, + "grad_norm": 0.34375, + "learning_rate": 2.2675250357653796e-06, + "loss": 2.1318, + "step": 1268 + }, + { + "epoch": 0.0680793991416309, + "grad_norm": 0.494140625, + "learning_rate": 2.2693133047210304e-06, + "loss": 2.4568, + "step": 1269 + }, + { + "epoch": 0.06813304721030043, + "grad_norm": 0.296875, + "learning_rate": 2.2711015736766813e-06, + "loss": 2.5732, + "step": 1270 + }, + { + "epoch": 0.06818669527896996, + "grad_norm": 0.29296875, + "learning_rate": 2.272889842632332e-06, + "loss": 2.4195, + "step": 1271 + }, + { + "epoch": 0.06824034334763948, + "grad_norm": 0.33984375, + "learning_rate": 2.274678111587983e-06, + "loss": 2.2619, + "step": 1272 + }, + { + "epoch": 0.06829399141630901, + "grad_norm": 0.275390625, + "learning_rate": 2.276466380543634e-06, + "loss": 2.353, + "step": 1273 + }, + { + "epoch": 0.06834763948497855, + "grad_norm": 0.302734375, + "learning_rate": 2.278254649499285e-06, + "loss": 2.4534, + "step": 1274 + }, + { + "epoch": 0.06840128755364806, + "grad_norm": 0.3671875, + "learning_rate": 2.2800429184549357e-06, + "loss": 1.7626, + "step": 1275 + }, + { + "epoch": 0.0684549356223176, + "grad_norm": 0.3046875, + "learning_rate": 2.2818311874105866e-06, + "loss": 2.6983, + "step": 1276 + }, + { + "epoch": 0.06850858369098713, + "grad_norm": 0.439453125, + "learning_rate": 2.2836194563662375e-06, + "loss": 2.5449, + "step": 1277 + }, + { + "epoch": 0.06856223175965666, + "grad_norm": 0.37890625, + "learning_rate": 2.2854077253218888e-06, + "loss": 1.8378, + "step": 1278 + }, + { + "epoch": 0.06861587982832618, + "grad_norm": 0.6875, + "learning_rate": 2.2871959942775396e-06, + "loss": 2.1457, + "step": 1279 + }, + { + "epoch": 0.06866952789699571, + "grad_norm": 0.3515625, + "learning_rate": 2.2889842632331905e-06, + "loss": 2.4122, + "step": 1280 + }, + { + "epoch": 0.06872317596566524, + "grad_norm": 0.29296875, + "learning_rate": 2.2907725321888414e-06, + "loss": 2.2969, + "step": 1281 + }, + { + "epoch": 0.06877682403433476, + "grad_norm": 0.50390625, + "learning_rate": 2.2925608011444923e-06, + "loss": 2.4978, + "step": 1282 + }, + { + "epoch": 0.06883047210300429, + "grad_norm": 0.369140625, + "learning_rate": 2.2943490701001436e-06, + "loss": 2.4457, + "step": 1283 + }, + { + "epoch": 0.06888412017167382, + "grad_norm": 0.25, + "learning_rate": 2.2961373390557944e-06, + "loss": 2.2512, + "step": 1284 + }, + { + "epoch": 0.06893776824034335, + "grad_norm": 0.435546875, + "learning_rate": 2.2979256080114453e-06, + "loss": 2.4441, + "step": 1285 + }, + { + "epoch": 0.06899141630901287, + "grad_norm": 0.3984375, + "learning_rate": 2.2997138769670958e-06, + "loss": 2.4678, + "step": 1286 + }, + { + "epoch": 0.0690450643776824, + "grad_norm": 0.291015625, + "learning_rate": 2.3015021459227466e-06, + "loss": 2.6685, + "step": 1287 + }, + { + "epoch": 0.06909871244635193, + "grad_norm": 0.40625, + "learning_rate": 2.303290414878398e-06, + "loss": 2.5109, + "step": 1288 + }, + { + "epoch": 0.06915236051502147, + "grad_norm": 0.65234375, + "learning_rate": 2.305078683834049e-06, + "loss": 2.2062, + "step": 1289 + }, + { + "epoch": 0.06920600858369098, + "grad_norm": 0.97265625, + "learning_rate": 2.3068669527896997e-06, + "loss": 2.5004, + "step": 1290 + }, + { + "epoch": 0.06925965665236052, + "grad_norm": 0.3828125, + "learning_rate": 2.3086552217453506e-06, + "loss": 2.2881, + "step": 1291 + }, + { + "epoch": 0.06931330472103005, + "grad_norm": 0.37890625, + "learning_rate": 2.3104434907010014e-06, + "loss": 1.8552, + "step": 1292 + }, + { + "epoch": 0.06936695278969957, + "grad_norm": 0.408203125, + "learning_rate": 2.3122317596566527e-06, + "loss": 1.5957, + "step": 1293 + }, + { + "epoch": 0.0694206008583691, + "grad_norm": 1.078125, + "learning_rate": 2.3140200286123036e-06, + "loss": 1.8955, + "step": 1294 + }, + { + "epoch": 0.06947424892703863, + "grad_norm": 0.388671875, + "learning_rate": 2.3158082975679545e-06, + "loss": 2.5858, + "step": 1295 + }, + { + "epoch": 0.06952789699570816, + "grad_norm": 0.48046875, + "learning_rate": 2.3175965665236054e-06, + "loss": 1.6048, + "step": 1296 + }, + { + "epoch": 0.06958154506437768, + "grad_norm": 0.30078125, + "learning_rate": 2.3193848354792562e-06, + "loss": 2.3952, + "step": 1297 + }, + { + "epoch": 0.06963519313304721, + "grad_norm": 0.392578125, + "learning_rate": 2.321173104434907e-06, + "loss": 2.1566, + "step": 1298 + }, + { + "epoch": 0.06968884120171674, + "grad_norm": 0.609375, + "learning_rate": 2.322961373390558e-06, + "loss": 2.546, + "step": 1299 + }, + { + "epoch": 0.06974248927038626, + "grad_norm": 0.3046875, + "learning_rate": 2.324749642346209e-06, + "loss": 2.5943, + "step": 1300 + }, + { + "epoch": 0.06979613733905579, + "grad_norm": 0.33984375, + "learning_rate": 2.3265379113018597e-06, + "loss": 2.2548, + "step": 1301 + }, + { + "epoch": 0.06984978540772532, + "grad_norm": 0.453125, + "learning_rate": 2.3283261802575106e-06, + "loss": 2.5333, + "step": 1302 + }, + { + "epoch": 0.06990343347639486, + "grad_norm": 0.23828125, + "learning_rate": 2.330114449213162e-06, + "loss": 2.1772, + "step": 1303 + }, + { + "epoch": 0.06995708154506437, + "grad_norm": 0.48828125, + "learning_rate": 2.331902718168813e-06, + "loss": 2.5498, + "step": 1304 + }, + { + "epoch": 0.0700107296137339, + "grad_norm": 0.28125, + "learning_rate": 2.3336909871244637e-06, + "loss": 2.3038, + "step": 1305 + }, + { + "epoch": 0.07006437768240344, + "grad_norm": 0.349609375, + "learning_rate": 2.3354792560801145e-06, + "loss": 2.48, + "step": 1306 + }, + { + "epoch": 0.07011802575107297, + "grad_norm": 0.33203125, + "learning_rate": 2.337267525035766e-06, + "loss": 2.5535, + "step": 1307 + }, + { + "epoch": 0.07017167381974249, + "grad_norm": 0.2890625, + "learning_rate": 2.3390557939914167e-06, + "loss": 2.2639, + "step": 1308 + }, + { + "epoch": 0.07022532188841202, + "grad_norm": 0.400390625, + "learning_rate": 2.3408440629470676e-06, + "loss": 2.3274, + "step": 1309 + }, + { + "epoch": 0.07027896995708155, + "grad_norm": 0.298828125, + "learning_rate": 2.3426323319027185e-06, + "loss": 2.4543, + "step": 1310 + }, + { + "epoch": 0.07033261802575107, + "grad_norm": 0.341796875, + "learning_rate": 2.3444206008583693e-06, + "loss": 2.2669, + "step": 1311 + }, + { + "epoch": 0.0703862660944206, + "grad_norm": 0.41796875, + "learning_rate": 2.3462088698140202e-06, + "loss": 2.3353, + "step": 1312 + }, + { + "epoch": 0.07043991416309013, + "grad_norm": 0.6171875, + "learning_rate": 2.347997138769671e-06, + "loss": 2.3281, + "step": 1313 + }, + { + "epoch": 0.07049356223175966, + "grad_norm": 0.47265625, + "learning_rate": 2.349785407725322e-06, + "loss": 2.4948, + "step": 1314 + }, + { + "epoch": 0.07054721030042918, + "grad_norm": 0.353515625, + "learning_rate": 2.351573676680973e-06, + "loss": 2.2631, + "step": 1315 + }, + { + "epoch": 0.07060085836909871, + "grad_norm": 0.283203125, + "learning_rate": 2.3533619456366237e-06, + "loss": 2.3839, + "step": 1316 + }, + { + "epoch": 0.07065450643776824, + "grad_norm": 0.359375, + "learning_rate": 2.355150214592275e-06, + "loss": 2.0054, + "step": 1317 + }, + { + "epoch": 0.07070815450643776, + "grad_norm": 0.390625, + "learning_rate": 2.356938483547926e-06, + "loss": 2.5686, + "step": 1318 + }, + { + "epoch": 0.0707618025751073, + "grad_norm": 0.578125, + "learning_rate": 2.3587267525035768e-06, + "loss": 2.5402, + "step": 1319 + }, + { + "epoch": 0.07081545064377683, + "grad_norm": 0.3515625, + "learning_rate": 2.3605150214592277e-06, + "loss": 2.5343, + "step": 1320 + }, + { + "epoch": 0.07086909871244636, + "grad_norm": 0.330078125, + "learning_rate": 2.3623032904148785e-06, + "loss": 2.1977, + "step": 1321 + }, + { + "epoch": 0.07092274678111588, + "grad_norm": 0.271484375, + "learning_rate": 2.36409155937053e-06, + "loss": 2.5157, + "step": 1322 + }, + { + "epoch": 0.0709763948497854, + "grad_norm": 0.30859375, + "learning_rate": 2.3658798283261807e-06, + "loss": 2.3208, + "step": 1323 + }, + { + "epoch": 0.07103004291845494, + "grad_norm": 0.40234375, + "learning_rate": 2.3676680972818316e-06, + "loss": 2.5653, + "step": 1324 + }, + { + "epoch": 0.07108369098712447, + "grad_norm": 0.439453125, + "learning_rate": 2.3694563662374825e-06, + "loss": 2.4658, + "step": 1325 + }, + { + "epoch": 0.07113733905579399, + "grad_norm": 0.5390625, + "learning_rate": 2.3712446351931333e-06, + "loss": 2.4376, + "step": 1326 + }, + { + "epoch": 0.07119098712446352, + "grad_norm": 0.26953125, + "learning_rate": 2.373032904148784e-06, + "loss": 2.3002, + "step": 1327 + }, + { + "epoch": 0.07124463519313305, + "grad_norm": 0.30859375, + "learning_rate": 2.374821173104435e-06, + "loss": 2.6897, + "step": 1328 + }, + { + "epoch": 0.07129828326180257, + "grad_norm": 0.318359375, + "learning_rate": 2.376609442060086e-06, + "loss": 2.3498, + "step": 1329 + }, + { + "epoch": 0.0713519313304721, + "grad_norm": 0.3671875, + "learning_rate": 2.378397711015737e-06, + "loss": 2.2547, + "step": 1330 + }, + { + "epoch": 0.07140557939914163, + "grad_norm": 0.38671875, + "learning_rate": 2.3801859799713877e-06, + "loss": 2.2929, + "step": 1331 + }, + { + "epoch": 0.07145922746781116, + "grad_norm": 0.294921875, + "learning_rate": 2.381974248927039e-06, + "loss": 2.3957, + "step": 1332 + }, + { + "epoch": 0.07151287553648068, + "grad_norm": 0.375, + "learning_rate": 2.38376251788269e-06, + "loss": 2.5716, + "step": 1333 + }, + { + "epoch": 0.07156652360515021, + "grad_norm": 0.310546875, + "learning_rate": 2.3855507868383408e-06, + "loss": 2.4233, + "step": 1334 + }, + { + "epoch": 0.07162017167381975, + "grad_norm": 0.294921875, + "learning_rate": 2.3873390557939916e-06, + "loss": 2.2534, + "step": 1335 + }, + { + "epoch": 0.07167381974248926, + "grad_norm": 0.44140625, + "learning_rate": 2.3891273247496425e-06, + "loss": 2.4689, + "step": 1336 + }, + { + "epoch": 0.0717274678111588, + "grad_norm": 0.5234375, + "learning_rate": 2.3909155937052934e-06, + "loss": 2.3941, + "step": 1337 + }, + { + "epoch": 0.07178111587982833, + "grad_norm": 0.29296875, + "learning_rate": 2.3927038626609443e-06, + "loss": 2.3894, + "step": 1338 + }, + { + "epoch": 0.07183476394849786, + "grad_norm": 0.43359375, + "learning_rate": 2.394492131616595e-06, + "loss": 2.3029, + "step": 1339 + }, + { + "epoch": 0.07188841201716738, + "grad_norm": 0.306640625, + "learning_rate": 2.396280400572246e-06, + "loss": 2.4447, + "step": 1340 + }, + { + "epoch": 0.07194206008583691, + "grad_norm": 0.34375, + "learning_rate": 2.398068669527897e-06, + "loss": 2.4873, + "step": 1341 + }, + { + "epoch": 0.07199570815450644, + "grad_norm": 0.443359375, + "learning_rate": 2.399856938483548e-06, + "loss": 2.2794, + "step": 1342 + }, + { + "epoch": 0.07204935622317596, + "grad_norm": 0.306640625, + "learning_rate": 2.401645207439199e-06, + "loss": 2.126, + "step": 1343 + }, + { + "epoch": 0.07210300429184549, + "grad_norm": 0.31640625, + "learning_rate": 2.40343347639485e-06, + "loss": 2.2857, + "step": 1344 + }, + { + "epoch": 0.07215665236051502, + "grad_norm": 0.400390625, + "learning_rate": 2.405221745350501e-06, + "loss": 2.4254, + "step": 1345 + }, + { + "epoch": 0.07221030042918455, + "grad_norm": 0.2734375, + "learning_rate": 2.4070100143061517e-06, + "loss": 2.2774, + "step": 1346 + }, + { + "epoch": 0.07226394849785407, + "grad_norm": 0.35546875, + "learning_rate": 2.408798283261803e-06, + "loss": 2.4125, + "step": 1347 + }, + { + "epoch": 0.0723175965665236, + "grad_norm": 0.3203125, + "learning_rate": 2.410586552217454e-06, + "loss": 2.3733, + "step": 1348 + }, + { + "epoch": 0.07237124463519314, + "grad_norm": 0.474609375, + "learning_rate": 2.4123748211731047e-06, + "loss": 1.6031, + "step": 1349 + }, + { + "epoch": 0.07242489270386267, + "grad_norm": 0.52734375, + "learning_rate": 2.4141630901287556e-06, + "loss": 2.4995, + "step": 1350 + }, + { + "epoch": 0.07247854077253219, + "grad_norm": 1.3359375, + "learning_rate": 2.4159513590844065e-06, + "loss": 2.3552, + "step": 1351 + }, + { + "epoch": 0.07253218884120172, + "grad_norm": 0.2451171875, + "learning_rate": 2.4177396280400574e-06, + "loss": 2.4324, + "step": 1352 + }, + { + "epoch": 0.07258583690987125, + "grad_norm": 0.283203125, + "learning_rate": 2.4195278969957083e-06, + "loss": 2.4589, + "step": 1353 + }, + { + "epoch": 0.07263948497854077, + "grad_norm": 0.53515625, + "learning_rate": 2.421316165951359e-06, + "loss": 2.5053, + "step": 1354 + }, + { + "epoch": 0.0726931330472103, + "grad_norm": 0.388671875, + "learning_rate": 2.42310443490701e-06, + "loss": 2.3562, + "step": 1355 + }, + { + "epoch": 0.07274678111587983, + "grad_norm": 0.33203125, + "learning_rate": 2.424892703862661e-06, + "loss": 2.2064, + "step": 1356 + }, + { + "epoch": 0.07280042918454936, + "grad_norm": 0.306640625, + "learning_rate": 2.426680972818312e-06, + "loss": 2.2803, + "step": 1357 + }, + { + "epoch": 0.07285407725321888, + "grad_norm": 0.396484375, + "learning_rate": 2.428469241773963e-06, + "loss": 2.4075, + "step": 1358 + }, + { + "epoch": 0.07290772532188841, + "grad_norm": 0.416015625, + "learning_rate": 2.430257510729614e-06, + "loss": 2.5274, + "step": 1359 + }, + { + "epoch": 0.07296137339055794, + "grad_norm": 0.33984375, + "learning_rate": 2.432045779685265e-06, + "loss": 2.3402, + "step": 1360 + }, + { + "epoch": 0.07301502145922746, + "grad_norm": 0.302734375, + "learning_rate": 2.4338340486409157e-06, + "loss": 2.5289, + "step": 1361 + }, + { + "epoch": 0.07306866952789699, + "grad_norm": 0.408203125, + "learning_rate": 2.435622317596567e-06, + "loss": 2.535, + "step": 1362 + }, + { + "epoch": 0.07312231759656652, + "grad_norm": 0.28125, + "learning_rate": 2.437410586552218e-06, + "loss": 2.3481, + "step": 1363 + }, + { + "epoch": 0.07317596566523606, + "grad_norm": 0.33203125, + "learning_rate": 2.4391988555078687e-06, + "loss": 2.2138, + "step": 1364 + }, + { + "epoch": 0.07322961373390557, + "grad_norm": 0.37890625, + "learning_rate": 2.4409871244635196e-06, + "loss": 2.1971, + "step": 1365 + }, + { + "epoch": 0.0732832618025751, + "grad_norm": 0.28125, + "learning_rate": 2.4427753934191705e-06, + "loss": 2.6494, + "step": 1366 + }, + { + "epoch": 0.07333690987124464, + "grad_norm": 0.416015625, + "learning_rate": 2.4445636623748214e-06, + "loss": 1.6314, + "step": 1367 + }, + { + "epoch": 0.07339055793991417, + "grad_norm": 0.376953125, + "learning_rate": 2.4463519313304722e-06, + "loss": 1.6725, + "step": 1368 + }, + { + "epoch": 0.07344420600858369, + "grad_norm": 0.287109375, + "learning_rate": 2.448140200286123e-06, + "loss": 2.4225, + "step": 1369 + }, + { + "epoch": 0.07349785407725322, + "grad_norm": 0.64453125, + "learning_rate": 2.449928469241774e-06, + "loss": 2.4131, + "step": 1370 + }, + { + "epoch": 0.07355150214592275, + "grad_norm": 0.42578125, + "learning_rate": 2.4517167381974253e-06, + "loss": 2.2076, + "step": 1371 + }, + { + "epoch": 0.07360515021459227, + "grad_norm": 0.28125, + "learning_rate": 2.453505007153076e-06, + "loss": 2.3243, + "step": 1372 + }, + { + "epoch": 0.0736587982832618, + "grad_norm": 0.32421875, + "learning_rate": 2.455293276108727e-06, + "loss": 2.3296, + "step": 1373 + }, + { + "epoch": 0.07371244635193133, + "grad_norm": 0.46875, + "learning_rate": 2.457081545064378e-06, + "loss": 2.4342, + "step": 1374 + }, + { + "epoch": 0.07376609442060086, + "grad_norm": 0.4296875, + "learning_rate": 2.458869814020029e-06, + "loss": 2.2965, + "step": 1375 + }, + { + "epoch": 0.07381974248927038, + "grad_norm": 0.34375, + "learning_rate": 2.4606580829756797e-06, + "loss": 1.9686, + "step": 1376 + }, + { + "epoch": 0.07387339055793991, + "grad_norm": 0.482421875, + "learning_rate": 2.4624463519313305e-06, + "loss": 2.622, + "step": 1377 + }, + { + "epoch": 0.07392703862660945, + "grad_norm": 0.271484375, + "learning_rate": 2.4642346208869814e-06, + "loss": 2.4454, + "step": 1378 + }, + { + "epoch": 0.07398068669527896, + "grad_norm": 0.353515625, + "learning_rate": 2.4660228898426323e-06, + "loss": 2.3148, + "step": 1379 + }, + { + "epoch": 0.0740343347639485, + "grad_norm": 0.337890625, + "learning_rate": 2.467811158798283e-06, + "loss": 2.4646, + "step": 1380 + }, + { + "epoch": 0.07408798283261803, + "grad_norm": 0.26953125, + "learning_rate": 2.4695994277539345e-06, + "loss": 2.4344, + "step": 1381 + }, + { + "epoch": 0.07414163090128756, + "grad_norm": 0.306640625, + "learning_rate": 2.4713876967095853e-06, + "loss": 2.2819, + "step": 1382 + }, + { + "epoch": 0.07419527896995708, + "grad_norm": 0.3359375, + "learning_rate": 2.4731759656652362e-06, + "loss": 2.4617, + "step": 1383 + }, + { + "epoch": 0.07424892703862661, + "grad_norm": 0.416015625, + "learning_rate": 2.474964234620887e-06, + "loss": 2.3197, + "step": 1384 + }, + { + "epoch": 0.07430257510729614, + "grad_norm": 0.66015625, + "learning_rate": 2.476752503576538e-06, + "loss": 2.7264, + "step": 1385 + }, + { + "epoch": 0.07435622317596567, + "grad_norm": 0.515625, + "learning_rate": 2.4785407725321893e-06, + "loss": 2.6231, + "step": 1386 + }, + { + "epoch": 0.07440987124463519, + "grad_norm": 0.4140625, + "learning_rate": 2.48032904148784e-06, + "loss": 2.458, + "step": 1387 + }, + { + "epoch": 0.07446351931330472, + "grad_norm": 0.3359375, + "learning_rate": 2.482117310443491e-06, + "loss": 2.3704, + "step": 1388 + }, + { + "epoch": 0.07451716738197425, + "grad_norm": 0.3046875, + "learning_rate": 2.483905579399142e-06, + "loss": 2.1709, + "step": 1389 + }, + { + "epoch": 0.07457081545064377, + "grad_norm": 0.365234375, + "learning_rate": 2.4856938483547928e-06, + "loss": 2.2664, + "step": 1390 + }, + { + "epoch": 0.0746244635193133, + "grad_norm": 0.384765625, + "learning_rate": 2.4874821173104437e-06, + "loss": 2.3177, + "step": 1391 + }, + { + "epoch": 0.07467811158798283, + "grad_norm": 0.35546875, + "learning_rate": 2.4892703862660945e-06, + "loss": 2.3361, + "step": 1392 + }, + { + "epoch": 0.07473175965665237, + "grad_norm": 0.265625, + "learning_rate": 2.4910586552217454e-06, + "loss": 2.3965, + "step": 1393 + }, + { + "epoch": 0.07478540772532188, + "grad_norm": 0.353515625, + "learning_rate": 2.4928469241773963e-06, + "loss": 2.5248, + "step": 1394 + }, + { + "epoch": 0.07483905579399142, + "grad_norm": 0.466796875, + "learning_rate": 2.494635193133047e-06, + "loss": 2.479, + "step": 1395 + }, + { + "epoch": 0.07489270386266095, + "grad_norm": 0.275390625, + "learning_rate": 2.4964234620886985e-06, + "loss": 2.5775, + "step": 1396 + }, + { + "epoch": 0.07494635193133047, + "grad_norm": 0.83203125, + "learning_rate": 2.4982117310443493e-06, + "loss": 2.3551, + "step": 1397 + }, + { + "epoch": 0.075, + "grad_norm": 0.32421875, + "learning_rate": 2.5e-06, + "loss": 2.2707, + "step": 1398 + }, + { + "epoch": 0.07505364806866953, + "grad_norm": 0.35546875, + "learning_rate": 2.501788268955651e-06, + "loss": 2.4277, + "step": 1399 + }, + { + "epoch": 0.07510729613733906, + "grad_norm": 0.341796875, + "learning_rate": 2.503576537911302e-06, + "loss": 2.4192, + "step": 1400 + }, + { + "epoch": 0.07516094420600858, + "grad_norm": 0.2578125, + "learning_rate": 2.505364806866953e-06, + "loss": 2.2467, + "step": 1401 + }, + { + "epoch": 0.07521459227467811, + "grad_norm": 0.443359375, + "learning_rate": 2.5071530758226037e-06, + "loss": 2.4671, + "step": 1402 + }, + { + "epoch": 0.07526824034334764, + "grad_norm": 0.25390625, + "learning_rate": 2.508941344778255e-06, + "loss": 2.1597, + "step": 1403 + }, + { + "epoch": 0.07532188841201717, + "grad_norm": 0.310546875, + "learning_rate": 2.510729613733906e-06, + "loss": 2.339, + "step": 1404 + }, + { + "epoch": 0.07537553648068669, + "grad_norm": 0.3671875, + "learning_rate": 2.5125178826895568e-06, + "loss": 2.6584, + "step": 1405 + }, + { + "epoch": 0.07542918454935622, + "grad_norm": 0.546875, + "learning_rate": 2.5143061516452076e-06, + "loss": 2.4126, + "step": 1406 + }, + { + "epoch": 0.07548283261802576, + "grad_norm": 0.32421875, + "learning_rate": 2.5160944206008585e-06, + "loss": 2.3315, + "step": 1407 + }, + { + "epoch": 0.07553648068669527, + "grad_norm": 0.279296875, + "learning_rate": 2.5178826895565094e-06, + "loss": 1.815, + "step": 1408 + }, + { + "epoch": 0.0755901287553648, + "grad_norm": 0.353515625, + "learning_rate": 2.5196709585121603e-06, + "loss": 2.3813, + "step": 1409 + }, + { + "epoch": 0.07564377682403434, + "grad_norm": 0.41796875, + "learning_rate": 2.521459227467811e-06, + "loss": 2.3991, + "step": 1410 + }, + { + "epoch": 0.07569742489270387, + "grad_norm": 0.353515625, + "learning_rate": 2.523247496423462e-06, + "loss": 2.45, + "step": 1411 + }, + { + "epoch": 0.07575107296137339, + "grad_norm": 0.265625, + "learning_rate": 2.525035765379113e-06, + "loss": 2.6294, + "step": 1412 + }, + { + "epoch": 0.07580472103004292, + "grad_norm": 0.4609375, + "learning_rate": 2.5268240343347646e-06, + "loss": 2.1842, + "step": 1413 + }, + { + "epoch": 0.07585836909871245, + "grad_norm": 0.32421875, + "learning_rate": 2.5286123032904155e-06, + "loss": 2.5732, + "step": 1414 + }, + { + "epoch": 0.07591201716738197, + "grad_norm": 0.349609375, + "learning_rate": 2.5304005722460664e-06, + "loss": 2.4838, + "step": 1415 + }, + { + "epoch": 0.0759656652360515, + "grad_norm": 0.55859375, + "learning_rate": 2.5321888412017172e-06, + "loss": 2.3502, + "step": 1416 + }, + { + "epoch": 0.07601931330472103, + "grad_norm": 0.39453125, + "learning_rate": 2.533977110157368e-06, + "loss": 2.3725, + "step": 1417 + }, + { + "epoch": 0.07607296137339056, + "grad_norm": 0.40625, + "learning_rate": 2.535765379113019e-06, + "loss": 2.3296, + "step": 1418 + }, + { + "epoch": 0.07612660944206008, + "grad_norm": 0.298828125, + "learning_rate": 2.53755364806867e-06, + "loss": 2.3893, + "step": 1419 + }, + { + "epoch": 0.07618025751072961, + "grad_norm": 0.306640625, + "learning_rate": 2.5393419170243207e-06, + "loss": 2.081, + "step": 1420 + }, + { + "epoch": 0.07623390557939914, + "grad_norm": 2.1875, + "learning_rate": 2.5411301859799716e-06, + "loss": 2.5035, + "step": 1421 + }, + { + "epoch": 0.07628755364806868, + "grad_norm": 0.3359375, + "learning_rate": 2.542918454935622e-06, + "loss": 2.4706, + "step": 1422 + }, + { + "epoch": 0.0763412017167382, + "grad_norm": 0.404296875, + "learning_rate": 2.544706723891274e-06, + "loss": 2.0578, + "step": 1423 + }, + { + "epoch": 0.07639484978540773, + "grad_norm": 0.326171875, + "learning_rate": 2.5464949928469247e-06, + "loss": 2.5513, + "step": 1424 + }, + { + "epoch": 0.07644849785407726, + "grad_norm": 0.34765625, + "learning_rate": 2.5482832618025755e-06, + "loss": 2.2776, + "step": 1425 + }, + { + "epoch": 0.07650214592274678, + "grad_norm": 0.294921875, + "learning_rate": 2.5500715307582264e-06, + "loss": 2.3373, + "step": 1426 + }, + { + "epoch": 0.07655579399141631, + "grad_norm": 0.28515625, + "learning_rate": 2.5518597997138773e-06, + "loss": 2.3209, + "step": 1427 + }, + { + "epoch": 0.07660944206008584, + "grad_norm": 0.35546875, + "learning_rate": 2.553648068669528e-06, + "loss": 2.237, + "step": 1428 + }, + { + "epoch": 0.07666309012875537, + "grad_norm": 0.404296875, + "learning_rate": 2.555436337625179e-06, + "loss": 2.4875, + "step": 1429 + }, + { + "epoch": 0.07671673819742489, + "grad_norm": 0.318359375, + "learning_rate": 2.55722460658083e-06, + "loss": 2.1972, + "step": 1430 + }, + { + "epoch": 0.07677038626609442, + "grad_norm": 0.30078125, + "learning_rate": 2.559012875536481e-06, + "loss": 2.5334, + "step": 1431 + }, + { + "epoch": 0.07682403433476395, + "grad_norm": 0.30078125, + "learning_rate": 2.5608011444921317e-06, + "loss": 1.9442, + "step": 1432 + }, + { + "epoch": 0.07687768240343347, + "grad_norm": 0.337890625, + "learning_rate": 2.562589413447783e-06, + "loss": 2.1832, + "step": 1433 + }, + { + "epoch": 0.076931330472103, + "grad_norm": 0.359375, + "learning_rate": 2.564377682403434e-06, + "loss": 2.6028, + "step": 1434 + }, + { + "epoch": 0.07698497854077253, + "grad_norm": 0.60546875, + "learning_rate": 2.5661659513590847e-06, + "loss": 2.4501, + "step": 1435 + }, + { + "epoch": 0.07703862660944207, + "grad_norm": 0.341796875, + "learning_rate": 2.5679542203147356e-06, + "loss": 2.6473, + "step": 1436 + }, + { + "epoch": 0.07709227467811158, + "grad_norm": 0.361328125, + "learning_rate": 2.5697424892703865e-06, + "loss": 2.1789, + "step": 1437 + }, + { + "epoch": 0.07714592274678111, + "grad_norm": 0.41796875, + "learning_rate": 2.5715307582260374e-06, + "loss": 1.6642, + "step": 1438 + }, + { + "epoch": 0.07719957081545065, + "grad_norm": 0.40234375, + "learning_rate": 2.5733190271816882e-06, + "loss": 2.4271, + "step": 1439 + }, + { + "epoch": 0.07725321888412018, + "grad_norm": 0.44140625, + "learning_rate": 2.575107296137339e-06, + "loss": 2.4993, + "step": 1440 + }, + { + "epoch": 0.0773068669527897, + "grad_norm": 0.30859375, + "learning_rate": 2.57689556509299e-06, + "loss": 2.2405, + "step": 1441 + }, + { + "epoch": 0.07736051502145923, + "grad_norm": 0.53125, + "learning_rate": 2.578683834048641e-06, + "loss": 2.4233, + "step": 1442 + }, + { + "epoch": 0.07741416309012876, + "grad_norm": 0.416015625, + "learning_rate": 2.580472103004292e-06, + "loss": 2.3012, + "step": 1443 + }, + { + "epoch": 0.07746781115879828, + "grad_norm": 0.390625, + "learning_rate": 2.582260371959943e-06, + "loss": 2.3787, + "step": 1444 + }, + { + "epoch": 0.07752145922746781, + "grad_norm": 1.78125, + "learning_rate": 2.584048640915594e-06, + "loss": 2.6991, + "step": 1445 + }, + { + "epoch": 0.07757510729613734, + "grad_norm": 0.34375, + "learning_rate": 2.5858369098712448e-06, + "loss": 2.1993, + "step": 1446 + }, + { + "epoch": 0.07762875536480687, + "grad_norm": 0.41015625, + "learning_rate": 2.5876251788268957e-06, + "loss": 2.0444, + "step": 1447 + }, + { + "epoch": 0.07768240343347639, + "grad_norm": 0.28125, + "learning_rate": 2.5894134477825465e-06, + "loss": 2.3355, + "step": 1448 + }, + { + "epoch": 0.07773605150214592, + "grad_norm": 0.32421875, + "learning_rate": 2.5912017167381974e-06, + "loss": 1.9716, + "step": 1449 + }, + { + "epoch": 0.07778969957081545, + "grad_norm": 0.388671875, + "learning_rate": 2.5929899856938483e-06, + "loss": 2.6277, + "step": 1450 + }, + { + "epoch": 0.07784334763948497, + "grad_norm": 0.265625, + "learning_rate": 2.594778254649499e-06, + "loss": 2.413, + "step": 1451 + }, + { + "epoch": 0.0778969957081545, + "grad_norm": 0.50390625, + "learning_rate": 2.59656652360515e-06, + "loss": 2.2826, + "step": 1452 + }, + { + "epoch": 0.07795064377682404, + "grad_norm": 0.361328125, + "learning_rate": 2.5983547925608018e-06, + "loss": 2.2317, + "step": 1453 + }, + { + "epoch": 0.07800429184549357, + "grad_norm": 0.333984375, + "learning_rate": 2.6001430615164526e-06, + "loss": 2.289, + "step": 1454 + }, + { + "epoch": 0.07805793991416309, + "grad_norm": 0.275390625, + "learning_rate": 2.6019313304721035e-06, + "loss": 1.9588, + "step": 1455 + }, + { + "epoch": 0.07811158798283262, + "grad_norm": 0.34375, + "learning_rate": 2.6037195994277544e-06, + "loss": 2.3168, + "step": 1456 + }, + { + "epoch": 0.07816523605150215, + "grad_norm": 0.322265625, + "learning_rate": 2.6055078683834053e-06, + "loss": 2.4448, + "step": 1457 + }, + { + "epoch": 0.07821888412017168, + "grad_norm": 0.482421875, + "learning_rate": 2.607296137339056e-06, + "loss": 1.7533, + "step": 1458 + }, + { + "epoch": 0.0782725321888412, + "grad_norm": 0.443359375, + "learning_rate": 2.609084406294707e-06, + "loss": 1.5643, + "step": 1459 + }, + { + "epoch": 0.07832618025751073, + "grad_norm": 0.419921875, + "learning_rate": 2.610872675250358e-06, + "loss": 2.4983, + "step": 1460 + }, + { + "epoch": 0.07837982832618026, + "grad_norm": 0.462890625, + "learning_rate": 2.6126609442060088e-06, + "loss": 2.6497, + "step": 1461 + }, + { + "epoch": 0.07843347639484978, + "grad_norm": 0.3515625, + "learning_rate": 2.61444921316166e-06, + "loss": 2.4169, + "step": 1462 + }, + { + "epoch": 0.07848712446351931, + "grad_norm": 0.330078125, + "learning_rate": 2.616237482117311e-06, + "loss": 2.2239, + "step": 1463 + }, + { + "epoch": 0.07854077253218884, + "grad_norm": 0.330078125, + "learning_rate": 2.618025751072962e-06, + "loss": 2.7294, + "step": 1464 + }, + { + "epoch": 0.07859442060085838, + "grad_norm": 0.271484375, + "learning_rate": 2.6198140200286127e-06, + "loss": 2.5084, + "step": 1465 + }, + { + "epoch": 0.07864806866952789, + "grad_norm": 0.380859375, + "learning_rate": 2.6216022889842636e-06, + "loss": 2.4401, + "step": 1466 + }, + { + "epoch": 0.07870171673819742, + "grad_norm": 0.33203125, + "learning_rate": 2.6233905579399144e-06, + "loss": 2.5095, + "step": 1467 + }, + { + "epoch": 0.07875536480686696, + "grad_norm": 0.34765625, + "learning_rate": 2.6251788268955653e-06, + "loss": 2.3893, + "step": 1468 + }, + { + "epoch": 0.07880901287553647, + "grad_norm": 0.40234375, + "learning_rate": 2.626967095851216e-06, + "loss": 2.5088, + "step": 1469 + }, + { + "epoch": 0.078862660944206, + "grad_norm": 0.248046875, + "learning_rate": 2.628755364806867e-06, + "loss": 2.4513, + "step": 1470 + }, + { + "epoch": 0.07891630901287554, + "grad_norm": 0.2314453125, + "learning_rate": 2.630543633762518e-06, + "loss": 2.3734, + "step": 1471 + }, + { + "epoch": 0.07896995708154507, + "grad_norm": 0.44921875, + "learning_rate": 2.6323319027181693e-06, + "loss": 2.7865, + "step": 1472 + }, + { + "epoch": 0.07902360515021459, + "grad_norm": 0.279296875, + "learning_rate": 2.63412017167382e-06, + "loss": 2.2712, + "step": 1473 + }, + { + "epoch": 0.07907725321888412, + "grad_norm": 0.52734375, + "learning_rate": 2.635908440629471e-06, + "loss": 2.4487, + "step": 1474 + }, + { + "epoch": 0.07913090128755365, + "grad_norm": 0.306640625, + "learning_rate": 2.637696709585122e-06, + "loss": 2.4802, + "step": 1475 + }, + { + "epoch": 0.07918454935622318, + "grad_norm": 0.2890625, + "learning_rate": 2.6394849785407728e-06, + "loss": 2.3237, + "step": 1476 + }, + { + "epoch": 0.0792381974248927, + "grad_norm": 0.453125, + "learning_rate": 2.6412732474964236e-06, + "loss": 2.3525, + "step": 1477 + }, + { + "epoch": 0.07929184549356223, + "grad_norm": 0.66015625, + "learning_rate": 2.6430615164520745e-06, + "loss": 2.3678, + "step": 1478 + }, + { + "epoch": 0.07934549356223176, + "grad_norm": 1.015625, + "learning_rate": 2.6448497854077254e-06, + "loss": 2.2862, + "step": 1479 + }, + { + "epoch": 0.07939914163090128, + "grad_norm": 0.275390625, + "learning_rate": 2.6466380543633763e-06, + "loss": 2.4603, + "step": 1480 + }, + { + "epoch": 0.07945278969957081, + "grad_norm": 0.3515625, + "learning_rate": 2.648426323319027e-06, + "loss": 2.1415, + "step": 1481 + }, + { + "epoch": 0.07950643776824035, + "grad_norm": 0.275390625, + "learning_rate": 2.6502145922746784e-06, + "loss": 2.406, + "step": 1482 + }, + { + "epoch": 0.07956008583690988, + "grad_norm": 0.37109375, + "learning_rate": 2.6520028612303293e-06, + "loss": 2.2256, + "step": 1483 + }, + { + "epoch": 0.0796137339055794, + "grad_norm": 0.255859375, + "learning_rate": 2.65379113018598e-06, + "loss": 2.1231, + "step": 1484 + }, + { + "epoch": 0.07966738197424893, + "grad_norm": 0.34375, + "learning_rate": 2.655579399141631e-06, + "loss": 2.5046, + "step": 1485 + }, + { + "epoch": 0.07972103004291846, + "grad_norm": 0.279296875, + "learning_rate": 2.657367668097282e-06, + "loss": 2.3656, + "step": 1486 + }, + { + "epoch": 0.07977467811158798, + "grad_norm": 0.46875, + "learning_rate": 2.659155937052933e-06, + "loss": 2.2715, + "step": 1487 + }, + { + "epoch": 0.07982832618025751, + "grad_norm": 0.56640625, + "learning_rate": 2.6609442060085837e-06, + "loss": 2.3546, + "step": 1488 + }, + { + "epoch": 0.07988197424892704, + "grad_norm": 0.345703125, + "learning_rate": 2.6627324749642346e-06, + "loss": 2.1245, + "step": 1489 + }, + { + "epoch": 0.07993562231759657, + "grad_norm": 0.302734375, + "learning_rate": 2.6645207439198854e-06, + "loss": 2.5736, + "step": 1490 + }, + { + "epoch": 0.07998927038626609, + "grad_norm": 0.302734375, + "learning_rate": 2.6663090128755363e-06, + "loss": 2.4807, + "step": 1491 + }, + { + "epoch": 0.08004291845493562, + "grad_norm": 0.359375, + "learning_rate": 2.668097281831188e-06, + "loss": 2.332, + "step": 1492 + }, + { + "epoch": 0.08009656652360515, + "grad_norm": 0.287109375, + "learning_rate": 2.669885550786839e-06, + "loss": 2.4803, + "step": 1493 + }, + { + "epoch": 0.08015021459227469, + "grad_norm": 0.255859375, + "learning_rate": 2.67167381974249e-06, + "loss": 2.2893, + "step": 1494 + }, + { + "epoch": 0.0802038626609442, + "grad_norm": 0.3046875, + "learning_rate": 2.6734620886981407e-06, + "loss": 2.4047, + "step": 1495 + }, + { + "epoch": 0.08025751072961373, + "grad_norm": 0.4765625, + "learning_rate": 2.6752503576537915e-06, + "loss": 2.3819, + "step": 1496 + }, + { + "epoch": 0.08031115879828327, + "grad_norm": 0.53125, + "learning_rate": 2.6770386266094424e-06, + "loss": 2.1909, + "step": 1497 + }, + { + "epoch": 0.08036480686695278, + "grad_norm": 0.3984375, + "learning_rate": 2.6788268955650933e-06, + "loss": 2.3897, + "step": 1498 + }, + { + "epoch": 0.08041845493562232, + "grad_norm": 0.435546875, + "learning_rate": 2.680615164520744e-06, + "loss": 2.3626, + "step": 1499 + }, + { + "epoch": 0.08047210300429185, + "grad_norm": 0.236328125, + "learning_rate": 2.682403433476395e-06, + "loss": 2.489, + "step": 1500 + }, + { + "epoch": 0.08052575107296138, + "grad_norm": 0.453125, + "learning_rate": 2.684191702432046e-06, + "loss": 2.3638, + "step": 1501 + }, + { + "epoch": 0.0805793991416309, + "grad_norm": 0.318359375, + "learning_rate": 2.6859799713876972e-06, + "loss": 2.2639, + "step": 1502 + }, + { + "epoch": 0.08063304721030043, + "grad_norm": 0.390625, + "learning_rate": 2.687768240343348e-06, + "loss": 2.3295, + "step": 1503 + }, + { + "epoch": 0.08068669527896996, + "grad_norm": 0.26953125, + "learning_rate": 2.689556509298999e-06, + "loss": 2.3914, + "step": 1504 + }, + { + "epoch": 0.08074034334763948, + "grad_norm": 0.28125, + "learning_rate": 2.69134477825465e-06, + "loss": 2.1996, + "step": 1505 + }, + { + "epoch": 0.08079399141630901, + "grad_norm": 0.318359375, + "learning_rate": 2.6931330472103007e-06, + "loss": 2.3858, + "step": 1506 + }, + { + "epoch": 0.08084763948497854, + "grad_norm": 0.71875, + "learning_rate": 2.6949213161659516e-06, + "loss": 2.3786, + "step": 1507 + }, + { + "epoch": 0.08090128755364807, + "grad_norm": 0.302734375, + "learning_rate": 2.6967095851216025e-06, + "loss": 2.6032, + "step": 1508 + }, + { + "epoch": 0.08095493562231759, + "grad_norm": 0.28515625, + "learning_rate": 2.6984978540772533e-06, + "loss": 2.4609, + "step": 1509 + }, + { + "epoch": 0.08100858369098712, + "grad_norm": 0.341796875, + "learning_rate": 2.7002861230329042e-06, + "loss": 2.5199, + "step": 1510 + }, + { + "epoch": 0.08106223175965666, + "grad_norm": 0.298828125, + "learning_rate": 2.702074391988555e-06, + "loss": 2.2739, + "step": 1511 + }, + { + "epoch": 0.08111587982832617, + "grad_norm": 0.328125, + "learning_rate": 2.7038626609442064e-06, + "loss": 2.3826, + "step": 1512 + }, + { + "epoch": 0.0811695278969957, + "grad_norm": 0.73046875, + "learning_rate": 2.7056509298998573e-06, + "loss": 2.5041, + "step": 1513 + }, + { + "epoch": 0.08122317596566524, + "grad_norm": 0.2490234375, + "learning_rate": 2.707439198855508e-06, + "loss": 2.5956, + "step": 1514 + }, + { + "epoch": 0.08127682403433477, + "grad_norm": 0.26953125, + "learning_rate": 2.709227467811159e-06, + "loss": 2.6355, + "step": 1515 + }, + { + "epoch": 0.08133047210300429, + "grad_norm": 0.283203125, + "learning_rate": 2.71101573676681e-06, + "loss": 2.513, + "step": 1516 + }, + { + "epoch": 0.08138412017167382, + "grad_norm": 0.275390625, + "learning_rate": 2.7128040057224608e-06, + "loss": 2.3027, + "step": 1517 + }, + { + "epoch": 0.08143776824034335, + "grad_norm": 0.3046875, + "learning_rate": 2.7145922746781117e-06, + "loss": 2.5787, + "step": 1518 + }, + { + "epoch": 0.08149141630901288, + "grad_norm": 0.34765625, + "learning_rate": 2.7163805436337625e-06, + "loss": 2.4882, + "step": 1519 + }, + { + "epoch": 0.0815450643776824, + "grad_norm": 0.400390625, + "learning_rate": 2.7181688125894134e-06, + "loss": 2.5983, + "step": 1520 + }, + { + "epoch": 0.08159871244635193, + "grad_norm": 0.447265625, + "learning_rate": 2.7199570815450643e-06, + "loss": 2.4839, + "step": 1521 + }, + { + "epoch": 0.08165236051502146, + "grad_norm": 0.5625, + "learning_rate": 2.7217453505007156e-06, + "loss": 2.4553, + "step": 1522 + }, + { + "epoch": 0.08170600858369098, + "grad_norm": 0.328125, + "learning_rate": 2.7235336194563665e-06, + "loss": 2.5919, + "step": 1523 + }, + { + "epoch": 0.08175965665236051, + "grad_norm": 0.3125, + "learning_rate": 2.7253218884120173e-06, + "loss": 2.46, + "step": 1524 + }, + { + "epoch": 0.08181330472103004, + "grad_norm": 0.275390625, + "learning_rate": 2.727110157367668e-06, + "loss": 2.5501, + "step": 1525 + }, + { + "epoch": 0.08186695278969958, + "grad_norm": 0.36328125, + "learning_rate": 2.728898426323319e-06, + "loss": 2.2296, + "step": 1526 + }, + { + "epoch": 0.0819206008583691, + "grad_norm": 0.56640625, + "learning_rate": 2.73068669527897e-06, + "loss": 2.0532, + "step": 1527 + }, + { + "epoch": 0.08197424892703863, + "grad_norm": 0.46875, + "learning_rate": 2.732474964234621e-06, + "loss": 2.6117, + "step": 1528 + }, + { + "epoch": 0.08202789699570816, + "grad_norm": 0.302734375, + "learning_rate": 2.7342632331902717e-06, + "loss": 2.4008, + "step": 1529 + }, + { + "epoch": 0.08208154506437768, + "grad_norm": 0.30859375, + "learning_rate": 2.7360515021459226e-06, + "loss": 1.9459, + "step": 1530 + }, + { + "epoch": 0.08213519313304721, + "grad_norm": 0.2734375, + "learning_rate": 2.7378397711015743e-06, + "loss": 2.4995, + "step": 1531 + }, + { + "epoch": 0.08218884120171674, + "grad_norm": 0.3046875, + "learning_rate": 2.739628040057225e-06, + "loss": 2.544, + "step": 1532 + }, + { + "epoch": 0.08224248927038627, + "grad_norm": 0.33203125, + "learning_rate": 2.741416309012876e-06, + "loss": 2.3106, + "step": 1533 + }, + { + "epoch": 0.08229613733905579, + "grad_norm": 0.34375, + "learning_rate": 2.743204577968527e-06, + "loss": 2.2351, + "step": 1534 + }, + { + "epoch": 0.08234978540772532, + "grad_norm": 0.32421875, + "learning_rate": 2.744992846924178e-06, + "loss": 2.3834, + "step": 1535 + }, + { + "epoch": 0.08240343347639485, + "grad_norm": 0.353515625, + "learning_rate": 2.7467811158798287e-06, + "loss": 2.1329, + "step": 1536 + }, + { + "epoch": 0.08245708154506438, + "grad_norm": 0.341796875, + "learning_rate": 2.7485693848354796e-06, + "loss": 2.354, + "step": 1537 + }, + { + "epoch": 0.0825107296137339, + "grad_norm": 0.3125, + "learning_rate": 2.7503576537911304e-06, + "loss": 2.3315, + "step": 1538 + }, + { + "epoch": 0.08256437768240343, + "grad_norm": 0.2412109375, + "learning_rate": 2.7521459227467813e-06, + "loss": 2.18, + "step": 1539 + }, + { + "epoch": 0.08261802575107297, + "grad_norm": 0.359375, + "learning_rate": 2.753934191702432e-06, + "loss": 2.3234, + "step": 1540 + }, + { + "epoch": 0.08267167381974248, + "grad_norm": 0.36328125, + "learning_rate": 2.7557224606580835e-06, + "loss": 2.5048, + "step": 1541 + }, + { + "epoch": 0.08272532188841202, + "grad_norm": 0.345703125, + "learning_rate": 2.7575107296137344e-06, + "loss": 2.6175, + "step": 1542 + }, + { + "epoch": 0.08277896995708155, + "grad_norm": 0.5234375, + "learning_rate": 2.7592989985693852e-06, + "loss": 2.3178, + "step": 1543 + }, + { + "epoch": 0.08283261802575108, + "grad_norm": 0.314453125, + "learning_rate": 2.761087267525036e-06, + "loss": 2.2476, + "step": 1544 + }, + { + "epoch": 0.0828862660944206, + "grad_norm": 0.41796875, + "learning_rate": 2.762875536480687e-06, + "loss": 2.219, + "step": 1545 + }, + { + "epoch": 0.08293991416309013, + "grad_norm": 0.248046875, + "learning_rate": 2.764663805436338e-06, + "loss": 2.3697, + "step": 1546 + }, + { + "epoch": 0.08299356223175966, + "grad_norm": 0.5078125, + "learning_rate": 2.7664520743919887e-06, + "loss": 2.3612, + "step": 1547 + }, + { + "epoch": 0.08304721030042918, + "grad_norm": 0.3671875, + "learning_rate": 2.7682403433476396e-06, + "loss": 2.4247, + "step": 1548 + }, + { + "epoch": 0.08310085836909871, + "grad_norm": 0.271484375, + "learning_rate": 2.7700286123032905e-06, + "loss": 2.3883, + "step": 1549 + }, + { + "epoch": 0.08315450643776824, + "grad_norm": 0.275390625, + "learning_rate": 2.7718168812589414e-06, + "loss": 2.4006, + "step": 1550 + }, + { + "epoch": 0.08320815450643777, + "grad_norm": 1.171875, + "learning_rate": 2.7736051502145927e-06, + "loss": 2.6842, + "step": 1551 + }, + { + "epoch": 0.08326180257510729, + "grad_norm": 0.3828125, + "learning_rate": 2.7753934191702436e-06, + "loss": 2.0033, + "step": 1552 + }, + { + "epoch": 0.08331545064377682, + "grad_norm": 0.294921875, + "learning_rate": 2.7771816881258944e-06, + "loss": 2.4223, + "step": 1553 + }, + { + "epoch": 0.08336909871244635, + "grad_norm": 0.294921875, + "learning_rate": 2.7789699570815453e-06, + "loss": 2.5069, + "step": 1554 + }, + { + "epoch": 0.08342274678111589, + "grad_norm": 0.59375, + "learning_rate": 2.780758226037196e-06, + "loss": 2.3374, + "step": 1555 + }, + { + "epoch": 0.0834763948497854, + "grad_norm": 0.291015625, + "learning_rate": 2.782546494992847e-06, + "loss": 2.2345, + "step": 1556 + }, + { + "epoch": 0.08353004291845494, + "grad_norm": 0.703125, + "learning_rate": 2.784334763948498e-06, + "loss": 2.5835, + "step": 1557 + }, + { + "epoch": 0.08358369098712447, + "grad_norm": 0.412109375, + "learning_rate": 2.786123032904149e-06, + "loss": 2.6222, + "step": 1558 + }, + { + "epoch": 0.08363733905579399, + "grad_norm": 0.345703125, + "learning_rate": 2.7879113018597997e-06, + "loss": 2.4265, + "step": 1559 + }, + { + "epoch": 0.08369098712446352, + "grad_norm": 0.34765625, + "learning_rate": 2.7896995708154506e-06, + "loss": 2.207, + "step": 1560 + }, + { + "epoch": 0.08374463519313305, + "grad_norm": 0.31640625, + "learning_rate": 2.791487839771102e-06, + "loss": 2.4686, + "step": 1561 + }, + { + "epoch": 0.08379828326180258, + "grad_norm": 0.26171875, + "learning_rate": 2.7932761087267527e-06, + "loss": 2.1425, + "step": 1562 + }, + { + "epoch": 0.0838519313304721, + "grad_norm": 0.283203125, + "learning_rate": 2.7950643776824036e-06, + "loss": 2.5028, + "step": 1563 + }, + { + "epoch": 0.08390557939914163, + "grad_norm": 0.330078125, + "learning_rate": 2.7968526466380545e-06, + "loss": 2.1965, + "step": 1564 + }, + { + "epoch": 0.08395922746781116, + "grad_norm": 2.078125, + "learning_rate": 2.7986409155937054e-06, + "loss": 2.3909, + "step": 1565 + }, + { + "epoch": 0.08401287553648068, + "grad_norm": 0.33984375, + "learning_rate": 2.8004291845493562e-06, + "loss": 2.1402, + "step": 1566 + }, + { + "epoch": 0.08406652360515021, + "grad_norm": 0.2578125, + "learning_rate": 2.802217453505007e-06, + "loss": 2.2027, + "step": 1567 + }, + { + "epoch": 0.08412017167381974, + "grad_norm": 0.4453125, + "learning_rate": 2.804005722460658e-06, + "loss": 2.4546, + "step": 1568 + }, + { + "epoch": 0.08417381974248928, + "grad_norm": 0.365234375, + "learning_rate": 2.805793991416309e-06, + "loss": 2.2559, + "step": 1569 + }, + { + "epoch": 0.0842274678111588, + "grad_norm": 0.326171875, + "learning_rate": 2.8075822603719597e-06, + "loss": 2.602, + "step": 1570 + }, + { + "epoch": 0.08428111587982832, + "grad_norm": 0.2890625, + "learning_rate": 2.8093705293276115e-06, + "loss": 2.2797, + "step": 1571 + }, + { + "epoch": 0.08433476394849786, + "grad_norm": 0.375, + "learning_rate": 2.8111587982832623e-06, + "loss": 2.1914, + "step": 1572 + }, + { + "epoch": 0.08438841201716739, + "grad_norm": 0.478515625, + "learning_rate": 2.8129470672389132e-06, + "loss": 2.4491, + "step": 1573 + }, + { + "epoch": 0.0844420600858369, + "grad_norm": 0.267578125, + "learning_rate": 2.814735336194564e-06, + "loss": 2.2267, + "step": 1574 + }, + { + "epoch": 0.08449570815450644, + "grad_norm": 0.478515625, + "learning_rate": 2.816523605150215e-06, + "loss": 2.4832, + "step": 1575 + }, + { + "epoch": 0.08454935622317597, + "grad_norm": 0.5859375, + "learning_rate": 2.818311874105866e-06, + "loss": 2.4369, + "step": 1576 + }, + { + "epoch": 0.08460300429184549, + "grad_norm": 0.6796875, + "learning_rate": 2.8201001430615167e-06, + "loss": 2.3372, + "step": 1577 + }, + { + "epoch": 0.08465665236051502, + "grad_norm": 0.412109375, + "learning_rate": 2.8218884120171676e-06, + "loss": 2.5336, + "step": 1578 + }, + { + "epoch": 0.08471030042918455, + "grad_norm": 0.37890625, + "learning_rate": 2.8236766809728185e-06, + "loss": 2.3271, + "step": 1579 + }, + { + "epoch": 0.08476394849785408, + "grad_norm": 0.2890625, + "learning_rate": 2.8254649499284693e-06, + "loss": 2.3997, + "step": 1580 + }, + { + "epoch": 0.0848175965665236, + "grad_norm": 0.25, + "learning_rate": 2.8272532188841206e-06, + "loss": 2.2547, + "step": 1581 + }, + { + "epoch": 0.08487124463519313, + "grad_norm": 0.283203125, + "learning_rate": 2.8290414878397715e-06, + "loss": 2.3256, + "step": 1582 + }, + { + "epoch": 0.08492489270386266, + "grad_norm": 0.4453125, + "learning_rate": 2.8308297567954224e-06, + "loss": 2.2915, + "step": 1583 + }, + { + "epoch": 0.08497854077253218, + "grad_norm": 0.353515625, + "learning_rate": 2.8326180257510733e-06, + "loss": 2.2529, + "step": 1584 + }, + { + "epoch": 0.08503218884120171, + "grad_norm": 0.34375, + "learning_rate": 2.834406294706724e-06, + "loss": 2.3849, + "step": 1585 + }, + { + "epoch": 0.08508583690987125, + "grad_norm": 0.59765625, + "learning_rate": 2.836194563662375e-06, + "loss": 2.4409, + "step": 1586 + }, + { + "epoch": 0.08513948497854078, + "grad_norm": 0.46875, + "learning_rate": 2.837982832618026e-06, + "loss": 2.6106, + "step": 1587 + }, + { + "epoch": 0.0851931330472103, + "grad_norm": 0.423828125, + "learning_rate": 2.8397711015736768e-06, + "loss": 2.6496, + "step": 1588 + }, + { + "epoch": 0.08524678111587983, + "grad_norm": 0.6796875, + "learning_rate": 2.8415593705293277e-06, + "loss": 2.503, + "step": 1589 + }, + { + "epoch": 0.08530042918454936, + "grad_norm": 0.33203125, + "learning_rate": 2.843347639484979e-06, + "loss": 2.2833, + "step": 1590 + }, + { + "epoch": 0.08535407725321889, + "grad_norm": 0.5703125, + "learning_rate": 2.84513590844063e-06, + "loss": 2.5118, + "step": 1591 + }, + { + "epoch": 0.08540772532188841, + "grad_norm": 0.416015625, + "learning_rate": 2.8469241773962807e-06, + "loss": 2.3155, + "step": 1592 + }, + { + "epoch": 0.08546137339055794, + "grad_norm": 0.294921875, + "learning_rate": 2.8487124463519316e-06, + "loss": 2.4373, + "step": 1593 + }, + { + "epoch": 0.08551502145922747, + "grad_norm": 0.283203125, + "learning_rate": 2.8505007153075825e-06, + "loss": 2.3812, + "step": 1594 + }, + { + "epoch": 0.08556866952789699, + "grad_norm": 0.337890625, + "learning_rate": 2.8522889842632333e-06, + "loss": 2.3048, + "step": 1595 + }, + { + "epoch": 0.08562231759656652, + "grad_norm": 0.328125, + "learning_rate": 2.854077253218884e-06, + "loss": 2.4165, + "step": 1596 + }, + { + "epoch": 0.08567596566523605, + "grad_norm": 0.306640625, + "learning_rate": 2.855865522174535e-06, + "loss": 2.5981, + "step": 1597 + }, + { + "epoch": 0.08572961373390559, + "grad_norm": 0.267578125, + "learning_rate": 2.857653791130186e-06, + "loss": 2.282, + "step": 1598 + }, + { + "epoch": 0.0857832618025751, + "grad_norm": 0.3203125, + "learning_rate": 2.859442060085837e-06, + "loss": 2.2179, + "step": 1599 + }, + { + "epoch": 0.08583690987124463, + "grad_norm": 0.296875, + "learning_rate": 2.861230329041488e-06, + "loss": 2.6469, + "step": 1600 + }, + { + "epoch": 0.08589055793991417, + "grad_norm": 0.4140625, + "learning_rate": 2.863018597997139e-06, + "loss": 1.8423, + "step": 1601 + }, + { + "epoch": 0.08594420600858368, + "grad_norm": 0.578125, + "learning_rate": 2.86480686695279e-06, + "loss": 2.5013, + "step": 1602 + }, + { + "epoch": 0.08599785407725322, + "grad_norm": 0.390625, + "learning_rate": 2.8665951359084408e-06, + "loss": 2.5707, + "step": 1603 + }, + { + "epoch": 0.08605150214592275, + "grad_norm": 0.482421875, + "learning_rate": 2.8683834048640916e-06, + "loss": 2.5692, + "step": 1604 + }, + { + "epoch": 0.08610515021459228, + "grad_norm": 0.5078125, + "learning_rate": 2.8701716738197425e-06, + "loss": 2.49, + "step": 1605 + }, + { + "epoch": 0.0861587982832618, + "grad_norm": 0.578125, + "learning_rate": 2.8719599427753934e-06, + "loss": 1.6159, + "step": 1606 + }, + { + "epoch": 0.08621244635193133, + "grad_norm": 0.333984375, + "learning_rate": 2.8737482117310443e-06, + "loss": 2.4376, + "step": 1607 + }, + { + "epoch": 0.08626609442060086, + "grad_norm": 0.2431640625, + "learning_rate": 2.875536480686695e-06, + "loss": 2.1662, + "step": 1608 + }, + { + "epoch": 0.08631974248927039, + "grad_norm": 0.34375, + "learning_rate": 2.877324749642346e-06, + "loss": 2.355, + "step": 1609 + }, + { + "epoch": 0.08637339055793991, + "grad_norm": 0.298828125, + "learning_rate": 2.8791130185979977e-06, + "loss": 2.4748, + "step": 1610 + }, + { + "epoch": 0.08642703862660944, + "grad_norm": 0.328125, + "learning_rate": 2.8809012875536486e-06, + "loss": 2.4992, + "step": 1611 + }, + { + "epoch": 0.08648068669527897, + "grad_norm": 0.29296875, + "learning_rate": 2.8826895565092995e-06, + "loss": 2.2948, + "step": 1612 + }, + { + "epoch": 0.08653433476394849, + "grad_norm": 0.328125, + "learning_rate": 2.8844778254649504e-06, + "loss": 2.3368, + "step": 1613 + }, + { + "epoch": 0.08658798283261802, + "grad_norm": 0.443359375, + "learning_rate": 2.8862660944206012e-06, + "loss": 2.511, + "step": 1614 + }, + { + "epoch": 0.08664163090128756, + "grad_norm": 0.322265625, + "learning_rate": 2.888054363376252e-06, + "loss": 2.307, + "step": 1615 + }, + { + "epoch": 0.08669527896995709, + "grad_norm": 0.8203125, + "learning_rate": 2.889842632331903e-06, + "loss": 2.4359, + "step": 1616 + }, + { + "epoch": 0.0867489270386266, + "grad_norm": 0.322265625, + "learning_rate": 2.891630901287554e-06, + "loss": 2.6047, + "step": 1617 + }, + { + "epoch": 0.08680257510729614, + "grad_norm": 0.5625, + "learning_rate": 2.8934191702432047e-06, + "loss": 2.3722, + "step": 1618 + }, + { + "epoch": 0.08685622317596567, + "grad_norm": 0.294921875, + "learning_rate": 2.8952074391988556e-06, + "loss": 2.3401, + "step": 1619 + }, + { + "epoch": 0.08690987124463519, + "grad_norm": 0.494140625, + "learning_rate": 2.896995708154507e-06, + "loss": 2.3208, + "step": 1620 + }, + { + "epoch": 0.08696351931330472, + "grad_norm": 0.27734375, + "learning_rate": 2.898783977110158e-06, + "loss": 1.8771, + "step": 1621 + }, + { + "epoch": 0.08701716738197425, + "grad_norm": 0.7265625, + "learning_rate": 2.9005722460658087e-06, + "loss": 2.3384, + "step": 1622 + }, + { + "epoch": 0.08707081545064378, + "grad_norm": 0.365234375, + "learning_rate": 2.9023605150214595e-06, + "loss": 2.347, + "step": 1623 + }, + { + "epoch": 0.0871244635193133, + "grad_norm": 0.31640625, + "learning_rate": 2.9041487839771104e-06, + "loss": 2.3725, + "step": 1624 + }, + { + "epoch": 0.08717811158798283, + "grad_norm": 0.66796875, + "learning_rate": 2.9059370529327613e-06, + "loss": 2.3976, + "step": 1625 + }, + { + "epoch": 0.08723175965665236, + "grad_norm": 0.30078125, + "learning_rate": 2.907725321888412e-06, + "loss": 2.302, + "step": 1626 + }, + { + "epoch": 0.0872854077253219, + "grad_norm": 0.30859375, + "learning_rate": 2.909513590844063e-06, + "loss": 2.0829, + "step": 1627 + }, + { + "epoch": 0.08733905579399141, + "grad_norm": 0.330078125, + "learning_rate": 2.911301859799714e-06, + "loss": 2.3819, + "step": 1628 + }, + { + "epoch": 0.08739270386266094, + "grad_norm": 0.34375, + "learning_rate": 2.913090128755365e-06, + "loss": 2.1782, + "step": 1629 + }, + { + "epoch": 0.08744635193133048, + "grad_norm": 0.5234375, + "learning_rate": 2.914878397711016e-06, + "loss": 2.2776, + "step": 1630 + }, + { + "epoch": 0.0875, + "grad_norm": 0.275390625, + "learning_rate": 2.916666666666667e-06, + "loss": 2.3167, + "step": 1631 + }, + { + "epoch": 0.08755364806866953, + "grad_norm": 0.337890625, + "learning_rate": 2.918454935622318e-06, + "loss": 2.3628, + "step": 1632 + }, + { + "epoch": 0.08760729613733906, + "grad_norm": 0.7734375, + "learning_rate": 2.9202432045779687e-06, + "loss": 2.583, + "step": 1633 + }, + { + "epoch": 0.08766094420600859, + "grad_norm": 0.2890625, + "learning_rate": 2.9220314735336196e-06, + "loss": 2.6267, + "step": 1634 + }, + { + "epoch": 0.08771459227467811, + "grad_norm": 0.302734375, + "learning_rate": 2.9238197424892705e-06, + "loss": 2.3221, + "step": 1635 + }, + { + "epoch": 0.08776824034334764, + "grad_norm": 0.3125, + "learning_rate": 2.9256080114449214e-06, + "loss": 2.3199, + "step": 1636 + }, + { + "epoch": 0.08782188841201717, + "grad_norm": 0.353515625, + "learning_rate": 2.9273962804005722e-06, + "loss": 2.5893, + "step": 1637 + }, + { + "epoch": 0.08787553648068669, + "grad_norm": 0.27734375, + "learning_rate": 2.929184549356223e-06, + "loss": 2.4188, + "step": 1638 + }, + { + "epoch": 0.08792918454935622, + "grad_norm": 0.30859375, + "learning_rate": 2.930972818311874e-06, + "loss": 2.5456, + "step": 1639 + }, + { + "epoch": 0.08798283261802575, + "grad_norm": 1.546875, + "learning_rate": 2.9327610872675257e-06, + "loss": 2.2658, + "step": 1640 + }, + { + "epoch": 0.08803648068669528, + "grad_norm": 0.41796875, + "learning_rate": 2.9345493562231766e-06, + "loss": 2.4275, + "step": 1641 + }, + { + "epoch": 0.0880901287553648, + "grad_norm": 0.28515625, + "learning_rate": 2.9363376251788275e-06, + "loss": 2.2089, + "step": 1642 + }, + { + "epoch": 0.08814377682403433, + "grad_norm": 0.287109375, + "learning_rate": 2.9381258941344783e-06, + "loss": 2.2894, + "step": 1643 + }, + { + "epoch": 0.08819742489270387, + "grad_norm": 0.328125, + "learning_rate": 2.939914163090129e-06, + "loss": 2.4265, + "step": 1644 + }, + { + "epoch": 0.0882510729613734, + "grad_norm": 0.330078125, + "learning_rate": 2.94170243204578e-06, + "loss": 2.4809, + "step": 1645 + }, + { + "epoch": 0.08830472103004292, + "grad_norm": 0.310546875, + "learning_rate": 2.943490701001431e-06, + "loss": 2.3953, + "step": 1646 + }, + { + "epoch": 0.08835836909871245, + "grad_norm": 0.46484375, + "learning_rate": 2.9452789699570814e-06, + "loss": 1.7697, + "step": 1647 + }, + { + "epoch": 0.08841201716738198, + "grad_norm": 0.2890625, + "learning_rate": 2.9470672389127323e-06, + "loss": 2.4435, + "step": 1648 + }, + { + "epoch": 0.0884656652360515, + "grad_norm": 0.2734375, + "learning_rate": 2.948855507868383e-06, + "loss": 2.4252, + "step": 1649 + }, + { + "epoch": 0.08851931330472103, + "grad_norm": 0.373046875, + "learning_rate": 2.950643776824035e-06, + "loss": 2.4754, + "step": 1650 + }, + { + "epoch": 0.08857296137339056, + "grad_norm": 0.4609375, + "learning_rate": 2.9524320457796858e-06, + "loss": 2.6361, + "step": 1651 + }, + { + "epoch": 0.08862660944206009, + "grad_norm": 0.287109375, + "learning_rate": 2.9542203147353366e-06, + "loss": 2.4172, + "step": 1652 + }, + { + "epoch": 0.08868025751072961, + "grad_norm": 0.255859375, + "learning_rate": 2.9560085836909875e-06, + "loss": 2.2565, + "step": 1653 + }, + { + "epoch": 0.08873390557939914, + "grad_norm": 0.365234375, + "learning_rate": 2.9577968526466384e-06, + "loss": 2.3482, + "step": 1654 + }, + { + "epoch": 0.08878755364806867, + "grad_norm": 0.33203125, + "learning_rate": 2.9595851216022893e-06, + "loss": 2.3525, + "step": 1655 + }, + { + "epoch": 0.08884120171673819, + "grad_norm": 0.341796875, + "learning_rate": 2.96137339055794e-06, + "loss": 2.6015, + "step": 1656 + }, + { + "epoch": 0.08889484978540772, + "grad_norm": 0.30859375, + "learning_rate": 2.963161659513591e-06, + "loss": 2.2409, + "step": 1657 + }, + { + "epoch": 0.08894849785407725, + "grad_norm": 0.296875, + "learning_rate": 2.964949928469242e-06, + "loss": 2.3734, + "step": 1658 + }, + { + "epoch": 0.08900214592274679, + "grad_norm": 0.3984375, + "learning_rate": 2.966738197424893e-06, + "loss": 2.6253, + "step": 1659 + }, + { + "epoch": 0.0890557939914163, + "grad_norm": 0.375, + "learning_rate": 2.968526466380544e-06, + "loss": 2.0491, + "step": 1660 + }, + { + "epoch": 0.08910944206008584, + "grad_norm": 0.373046875, + "learning_rate": 2.970314735336195e-06, + "loss": 2.3519, + "step": 1661 + }, + { + "epoch": 0.08916309012875537, + "grad_norm": 0.40625, + "learning_rate": 2.972103004291846e-06, + "loss": 2.4189, + "step": 1662 + }, + { + "epoch": 0.0892167381974249, + "grad_norm": 0.447265625, + "learning_rate": 2.9738912732474967e-06, + "loss": 2.6889, + "step": 1663 + }, + { + "epoch": 0.08927038626609442, + "grad_norm": 0.37890625, + "learning_rate": 2.9756795422031476e-06, + "loss": 2.4061, + "step": 1664 + }, + { + "epoch": 0.08932403433476395, + "grad_norm": 0.29296875, + "learning_rate": 2.9774678111587984e-06, + "loss": 2.2944, + "step": 1665 + }, + { + "epoch": 0.08937768240343348, + "grad_norm": 0.341796875, + "learning_rate": 2.9792560801144493e-06, + "loss": 2.4057, + "step": 1666 + }, + { + "epoch": 0.089431330472103, + "grad_norm": 0.5703125, + "learning_rate": 2.9810443490701e-06, + "loss": 2.5046, + "step": 1667 + }, + { + "epoch": 0.08948497854077253, + "grad_norm": 0.306640625, + "learning_rate": 2.982832618025751e-06, + "loss": 2.4419, + "step": 1668 + }, + { + "epoch": 0.08953862660944206, + "grad_norm": 0.44921875, + "learning_rate": 2.9846208869814024e-06, + "loss": 2.1259, + "step": 1669 + }, + { + "epoch": 0.0895922746781116, + "grad_norm": 0.50390625, + "learning_rate": 2.9864091559370533e-06, + "loss": 2.5487, + "step": 1670 + }, + { + "epoch": 0.08964592274678111, + "grad_norm": 0.298828125, + "learning_rate": 2.988197424892704e-06, + "loss": 2.3095, + "step": 1671 + }, + { + "epoch": 0.08969957081545064, + "grad_norm": 1.8203125, + "learning_rate": 2.989985693848355e-06, + "loss": 2.2443, + "step": 1672 + }, + { + "epoch": 0.08975321888412018, + "grad_norm": 0.419921875, + "learning_rate": 2.991773962804006e-06, + "loss": 2.4987, + "step": 1673 + }, + { + "epoch": 0.0898068669527897, + "grad_norm": 0.294921875, + "learning_rate": 2.9935622317596568e-06, + "loss": 2.513, + "step": 1674 + }, + { + "epoch": 0.08986051502145923, + "grad_norm": 0.27734375, + "learning_rate": 2.9953505007153076e-06, + "loss": 2.574, + "step": 1675 + }, + { + "epoch": 0.08991416309012876, + "grad_norm": 0.291015625, + "learning_rate": 2.9971387696709585e-06, + "loss": 2.2824, + "step": 1676 + }, + { + "epoch": 0.08996781115879829, + "grad_norm": 0.53125, + "learning_rate": 2.9989270386266094e-06, + "loss": 2.5166, + "step": 1677 + }, + { + "epoch": 0.0900214592274678, + "grad_norm": 0.46875, + "learning_rate": 3.0007153075822603e-06, + "loss": 2.4907, + "step": 1678 + }, + { + "epoch": 0.09007510729613734, + "grad_norm": 0.375, + "learning_rate": 3.002503576537912e-06, + "loss": 2.2257, + "step": 1679 + }, + { + "epoch": 0.09012875536480687, + "grad_norm": 0.349609375, + "learning_rate": 3.004291845493563e-06, + "loss": 2.1002, + "step": 1680 + }, + { + "epoch": 0.09018240343347639, + "grad_norm": 0.322265625, + "learning_rate": 3.0060801144492137e-06, + "loss": 2.5217, + "step": 1681 + }, + { + "epoch": 0.09023605150214592, + "grad_norm": 0.341796875, + "learning_rate": 3.0078683834048646e-06, + "loss": 2.471, + "step": 1682 + }, + { + "epoch": 0.09028969957081545, + "grad_norm": 0.46484375, + "learning_rate": 3.0096566523605155e-06, + "loss": 2.5779, + "step": 1683 + }, + { + "epoch": 0.09034334763948498, + "grad_norm": 0.306640625, + "learning_rate": 3.0114449213161664e-06, + "loss": 2.4124, + "step": 1684 + }, + { + "epoch": 0.0903969957081545, + "grad_norm": 0.306640625, + "learning_rate": 3.0132331902718172e-06, + "loss": 2.1771, + "step": 1685 + }, + { + "epoch": 0.09045064377682403, + "grad_norm": 0.443359375, + "learning_rate": 3.015021459227468e-06, + "loss": 1.8479, + "step": 1686 + }, + { + "epoch": 0.09050429184549356, + "grad_norm": 0.765625, + "learning_rate": 3.016809728183119e-06, + "loss": 2.3868, + "step": 1687 + }, + { + "epoch": 0.0905579399141631, + "grad_norm": 0.361328125, + "learning_rate": 3.01859799713877e-06, + "loss": 2.4749, + "step": 1688 + }, + { + "epoch": 0.09061158798283261, + "grad_norm": 0.275390625, + "learning_rate": 3.020386266094421e-06, + "loss": 2.2583, + "step": 1689 + }, + { + "epoch": 0.09066523605150215, + "grad_norm": 0.31640625, + "learning_rate": 3.022174535050072e-06, + "loss": 2.5223, + "step": 1690 + }, + { + "epoch": 0.09071888412017168, + "grad_norm": 0.3359375, + "learning_rate": 3.023962804005723e-06, + "loss": 2.2676, + "step": 1691 + }, + { + "epoch": 0.0907725321888412, + "grad_norm": 0.390625, + "learning_rate": 3.025751072961374e-06, + "loss": 2.4467, + "step": 1692 + }, + { + "epoch": 0.09082618025751073, + "grad_norm": 0.365234375, + "learning_rate": 3.0275393419170247e-06, + "loss": 2.4149, + "step": 1693 + }, + { + "epoch": 0.09087982832618026, + "grad_norm": 0.2734375, + "learning_rate": 3.0293276108726755e-06, + "loss": 2.2264, + "step": 1694 + }, + { + "epoch": 0.09093347639484979, + "grad_norm": 0.30859375, + "learning_rate": 3.0311158798283264e-06, + "loss": 2.2556, + "step": 1695 + }, + { + "epoch": 0.09098712446351931, + "grad_norm": 0.59765625, + "learning_rate": 3.0329041487839773e-06, + "loss": 1.8309, + "step": 1696 + }, + { + "epoch": 0.09104077253218884, + "grad_norm": 0.330078125, + "learning_rate": 3.034692417739628e-06, + "loss": 2.3766, + "step": 1697 + }, + { + "epoch": 0.09109442060085837, + "grad_norm": 0.31640625, + "learning_rate": 3.036480686695279e-06, + "loss": 2.3454, + "step": 1698 + }, + { + "epoch": 0.09114806866952789, + "grad_norm": 0.416015625, + "learning_rate": 3.0382689556509303e-06, + "loss": 2.6418, + "step": 1699 + }, + { + "epoch": 0.09120171673819742, + "grad_norm": 0.416015625, + "learning_rate": 3.0400572246065812e-06, + "loss": 2.3859, + "step": 1700 + }, + { + "epoch": 0.09125536480686695, + "grad_norm": 0.275390625, + "learning_rate": 3.041845493562232e-06, + "loss": 2.5065, + "step": 1701 + }, + { + "epoch": 0.09130901287553649, + "grad_norm": 0.2490234375, + "learning_rate": 3.043633762517883e-06, + "loss": 2.0341, + "step": 1702 + }, + { + "epoch": 0.091362660944206, + "grad_norm": 0.328125, + "learning_rate": 3.045422031473534e-06, + "loss": 2.3479, + "step": 1703 + }, + { + "epoch": 0.09141630901287554, + "grad_norm": 0.314453125, + "learning_rate": 3.0472103004291847e-06, + "loss": 2.178, + "step": 1704 + }, + { + "epoch": 0.09146995708154507, + "grad_norm": 0.326171875, + "learning_rate": 3.0489985693848356e-06, + "loss": 2.2725, + "step": 1705 + }, + { + "epoch": 0.0915236051502146, + "grad_norm": 0.267578125, + "learning_rate": 3.0507868383404865e-06, + "loss": 2.1962, + "step": 1706 + }, + { + "epoch": 0.09157725321888412, + "grad_norm": 0.322265625, + "learning_rate": 3.0525751072961373e-06, + "loss": 2.4466, + "step": 1707 + }, + { + "epoch": 0.09163090128755365, + "grad_norm": 0.287109375, + "learning_rate": 3.0543633762517882e-06, + "loss": 2.4715, + "step": 1708 + }, + { + "epoch": 0.09168454935622318, + "grad_norm": 0.365234375, + "learning_rate": 3.0561516452074395e-06, + "loss": 2.428, + "step": 1709 + }, + { + "epoch": 0.0917381974248927, + "grad_norm": 0.33203125, + "learning_rate": 3.0579399141630904e-06, + "loss": 2.4215, + "step": 1710 + }, + { + "epoch": 0.09179184549356223, + "grad_norm": 0.3125, + "learning_rate": 3.0597281831187413e-06, + "loss": 2.3881, + "step": 1711 + }, + { + "epoch": 0.09184549356223176, + "grad_norm": 0.326171875, + "learning_rate": 3.061516452074392e-06, + "loss": 2.3477, + "step": 1712 + }, + { + "epoch": 0.0918991416309013, + "grad_norm": 0.490234375, + "learning_rate": 3.063304721030043e-06, + "loss": 2.3202, + "step": 1713 + }, + { + "epoch": 0.09195278969957081, + "grad_norm": 0.34375, + "learning_rate": 3.065092989985694e-06, + "loss": 2.2663, + "step": 1714 + }, + { + "epoch": 0.09200643776824034, + "grad_norm": 0.55859375, + "learning_rate": 3.0668812589413448e-06, + "loss": 1.7334, + "step": 1715 + }, + { + "epoch": 0.09206008583690987, + "grad_norm": 0.4765625, + "learning_rate": 3.0686695278969957e-06, + "loss": 2.5669, + "step": 1716 + }, + { + "epoch": 0.09211373390557939, + "grad_norm": 0.361328125, + "learning_rate": 3.0704577968526465e-06, + "loss": 2.0605, + "step": 1717 + }, + { + "epoch": 0.09216738197424892, + "grad_norm": 0.6328125, + "learning_rate": 3.0722460658082983e-06, + "loss": 2.5894, + "step": 1718 + }, + { + "epoch": 0.09222103004291846, + "grad_norm": 0.3515625, + "learning_rate": 3.074034334763949e-06, + "loss": 2.2887, + "step": 1719 + }, + { + "epoch": 0.09227467811158799, + "grad_norm": 0.419921875, + "learning_rate": 3.0758226037196e-06, + "loss": 2.4736, + "step": 1720 + }, + { + "epoch": 0.0923283261802575, + "grad_norm": 0.287109375, + "learning_rate": 3.077610872675251e-06, + "loss": 2.5362, + "step": 1721 + }, + { + "epoch": 0.09238197424892704, + "grad_norm": 0.287109375, + "learning_rate": 3.0793991416309018e-06, + "loss": 2.4599, + "step": 1722 + }, + { + "epoch": 0.09243562231759657, + "grad_norm": 0.37890625, + "learning_rate": 3.0811874105865526e-06, + "loss": 2.4554, + "step": 1723 + }, + { + "epoch": 0.0924892703862661, + "grad_norm": 0.3828125, + "learning_rate": 3.0829756795422035e-06, + "loss": 2.3486, + "step": 1724 + }, + { + "epoch": 0.09254291845493562, + "grad_norm": 0.306640625, + "learning_rate": 3.0847639484978544e-06, + "loss": 2.5251, + "step": 1725 + }, + { + "epoch": 0.09259656652360515, + "grad_norm": 0.2451171875, + "learning_rate": 3.0865522174535053e-06, + "loss": 2.2678, + "step": 1726 + }, + { + "epoch": 0.09265021459227468, + "grad_norm": 0.396484375, + "learning_rate": 3.088340486409156e-06, + "loss": 2.5122, + "step": 1727 + }, + { + "epoch": 0.0927038626609442, + "grad_norm": 0.283203125, + "learning_rate": 3.0901287553648074e-06, + "loss": 2.2722, + "step": 1728 + }, + { + "epoch": 0.09275751072961373, + "grad_norm": 0.53125, + "learning_rate": 3.0919170243204583e-06, + "loss": 2.2476, + "step": 1729 + }, + { + "epoch": 0.09281115879828326, + "grad_norm": 0.333984375, + "learning_rate": 3.093705293276109e-06, + "loss": 2.6554, + "step": 1730 + }, + { + "epoch": 0.0928648068669528, + "grad_norm": 0.3515625, + "learning_rate": 3.09549356223176e-06, + "loss": 2.0587, + "step": 1731 + }, + { + "epoch": 0.09291845493562231, + "grad_norm": 0.3125, + "learning_rate": 3.097281831187411e-06, + "loss": 2.4185, + "step": 1732 + }, + { + "epoch": 0.09297210300429185, + "grad_norm": 0.92578125, + "learning_rate": 3.099070100143062e-06, + "loss": 2.2311, + "step": 1733 + }, + { + "epoch": 0.09302575107296138, + "grad_norm": 0.283203125, + "learning_rate": 3.1008583690987127e-06, + "loss": 2.449, + "step": 1734 + }, + { + "epoch": 0.0930793991416309, + "grad_norm": 0.388671875, + "learning_rate": 3.1026466380543636e-06, + "loss": 2.2951, + "step": 1735 + }, + { + "epoch": 0.09313304721030043, + "grad_norm": 0.435546875, + "learning_rate": 3.1044349070100144e-06, + "loss": 1.8554, + "step": 1736 + }, + { + "epoch": 0.09318669527896996, + "grad_norm": 0.302734375, + "learning_rate": 3.1062231759656653e-06, + "loss": 2.2981, + "step": 1737 + }, + { + "epoch": 0.09324034334763949, + "grad_norm": 0.3515625, + "learning_rate": 3.1080114449213166e-06, + "loss": 2.4538, + "step": 1738 + }, + { + "epoch": 0.09329399141630901, + "grad_norm": 0.384765625, + "learning_rate": 3.1097997138769675e-06, + "loss": 2.2129, + "step": 1739 + }, + { + "epoch": 0.09334763948497854, + "grad_norm": 0.375, + "learning_rate": 3.1115879828326184e-06, + "loss": 2.2269, + "step": 1740 + }, + { + "epoch": 0.09340128755364807, + "grad_norm": 0.361328125, + "learning_rate": 3.1133762517882692e-06, + "loss": 2.0675, + "step": 1741 + }, + { + "epoch": 0.0934549356223176, + "grad_norm": 0.45703125, + "learning_rate": 3.11516452074392e-06, + "loss": 2.0077, + "step": 1742 + }, + { + "epoch": 0.09350858369098712, + "grad_norm": 0.298828125, + "learning_rate": 3.116952789699571e-06, + "loss": 2.5025, + "step": 1743 + }, + { + "epoch": 0.09356223175965665, + "grad_norm": 0.353515625, + "learning_rate": 3.118741058655222e-06, + "loss": 2.1936, + "step": 1744 + }, + { + "epoch": 0.09361587982832618, + "grad_norm": 0.306640625, + "learning_rate": 3.1205293276108727e-06, + "loss": 2.4738, + "step": 1745 + }, + { + "epoch": 0.0936695278969957, + "grad_norm": 0.2578125, + "learning_rate": 3.1223175965665236e-06, + "loss": 2.3554, + "step": 1746 + }, + { + "epoch": 0.09372317596566523, + "grad_norm": 0.283203125, + "learning_rate": 3.1241058655221745e-06, + "loss": 2.4081, + "step": 1747 + }, + { + "epoch": 0.09377682403433477, + "grad_norm": 0.375, + "learning_rate": 3.125894134477826e-06, + "loss": 2.671, + "step": 1748 + }, + { + "epoch": 0.0938304721030043, + "grad_norm": 0.375, + "learning_rate": 3.1276824034334767e-06, + "loss": 2.3079, + "step": 1749 + }, + { + "epoch": 0.09388412017167382, + "grad_norm": 0.30078125, + "learning_rate": 3.1294706723891276e-06, + "loss": 2.2735, + "step": 1750 + }, + { + "epoch": 0.09393776824034335, + "grad_norm": 0.291015625, + "learning_rate": 3.1312589413447784e-06, + "loss": 2.4347, + "step": 1751 + }, + { + "epoch": 0.09399141630901288, + "grad_norm": 0.294921875, + "learning_rate": 3.1330472103004293e-06, + "loss": 2.4219, + "step": 1752 + }, + { + "epoch": 0.0940450643776824, + "grad_norm": 0.30078125, + "learning_rate": 3.13483547925608e-06, + "loss": 2.2839, + "step": 1753 + }, + { + "epoch": 0.09409871244635193, + "grad_norm": 0.3203125, + "learning_rate": 3.136623748211731e-06, + "loss": 2.3425, + "step": 1754 + }, + { + "epoch": 0.09415236051502146, + "grad_norm": 0.28125, + "learning_rate": 3.138412017167382e-06, + "loss": 2.4336, + "step": 1755 + }, + { + "epoch": 0.09420600858369099, + "grad_norm": 0.345703125, + "learning_rate": 3.140200286123033e-06, + "loss": 2.583, + "step": 1756 + }, + { + "epoch": 0.09425965665236051, + "grad_norm": 0.44140625, + "learning_rate": 3.1419885550786837e-06, + "loss": 2.4741, + "step": 1757 + }, + { + "epoch": 0.09431330472103004, + "grad_norm": 0.384765625, + "learning_rate": 3.1437768240343354e-06, + "loss": 2.7355, + "step": 1758 + }, + { + "epoch": 0.09436695278969957, + "grad_norm": 0.439453125, + "learning_rate": 3.1455650929899863e-06, + "loss": 2.3669, + "step": 1759 + }, + { + "epoch": 0.0944206008583691, + "grad_norm": 0.35546875, + "learning_rate": 3.147353361945637e-06, + "loss": 2.1677, + "step": 1760 + }, + { + "epoch": 0.09447424892703862, + "grad_norm": 0.267578125, + "learning_rate": 3.149141630901288e-06, + "loss": 2.4743, + "step": 1761 + }, + { + "epoch": 0.09452789699570815, + "grad_norm": 0.365234375, + "learning_rate": 3.150929899856939e-06, + "loss": 2.588, + "step": 1762 + }, + { + "epoch": 0.09458154506437769, + "grad_norm": 0.283203125, + "learning_rate": 3.1527181688125898e-06, + "loss": 2.2236, + "step": 1763 + }, + { + "epoch": 0.0946351931330472, + "grad_norm": 0.3046875, + "learning_rate": 3.1545064377682407e-06, + "loss": 2.251, + "step": 1764 + }, + { + "epoch": 0.09468884120171674, + "grad_norm": 0.3046875, + "learning_rate": 3.1562947067238915e-06, + "loss": 2.2167, + "step": 1765 + }, + { + "epoch": 0.09474248927038627, + "grad_norm": 0.298828125, + "learning_rate": 3.1580829756795424e-06, + "loss": 2.3791, + "step": 1766 + }, + { + "epoch": 0.0947961373390558, + "grad_norm": 0.337890625, + "learning_rate": 3.1598712446351933e-06, + "loss": 1.9216, + "step": 1767 + }, + { + "epoch": 0.09484978540772532, + "grad_norm": 1.21875, + "learning_rate": 3.1616595135908446e-06, + "loss": 2.3138, + "step": 1768 + }, + { + "epoch": 0.09490343347639485, + "grad_norm": 0.283203125, + "learning_rate": 3.1634477825464955e-06, + "loss": 2.3739, + "step": 1769 + }, + { + "epoch": 0.09495708154506438, + "grad_norm": 0.296875, + "learning_rate": 3.1652360515021463e-06, + "loss": 2.6038, + "step": 1770 + }, + { + "epoch": 0.0950107296137339, + "grad_norm": 0.30859375, + "learning_rate": 3.1670243204577972e-06, + "loss": 2.5266, + "step": 1771 + }, + { + "epoch": 0.09506437768240343, + "grad_norm": 0.283203125, + "learning_rate": 3.168812589413448e-06, + "loss": 2.4506, + "step": 1772 + }, + { + "epoch": 0.09511802575107296, + "grad_norm": 0.28125, + "learning_rate": 3.170600858369099e-06, + "loss": 2.4585, + "step": 1773 + }, + { + "epoch": 0.0951716738197425, + "grad_norm": 0.455078125, + "learning_rate": 3.17238912732475e-06, + "loss": 2.1654, + "step": 1774 + }, + { + "epoch": 0.09522532188841201, + "grad_norm": 0.412109375, + "learning_rate": 3.1741773962804007e-06, + "loss": 2.4272, + "step": 1775 + }, + { + "epoch": 0.09527896995708154, + "grad_norm": 0.310546875, + "learning_rate": 3.1759656652360516e-06, + "loss": 2.5441, + "step": 1776 + }, + { + "epoch": 0.09533261802575108, + "grad_norm": 0.302734375, + "learning_rate": 3.1777539341917025e-06, + "loss": 2.5528, + "step": 1777 + }, + { + "epoch": 0.09538626609442061, + "grad_norm": 0.328125, + "learning_rate": 3.1795422031473538e-06, + "loss": 2.1644, + "step": 1778 + }, + { + "epoch": 0.09543991416309013, + "grad_norm": 0.2890625, + "learning_rate": 3.1813304721030046e-06, + "loss": 2.2783, + "step": 1779 + }, + { + "epoch": 0.09549356223175966, + "grad_norm": 0.29296875, + "learning_rate": 3.1831187410586555e-06, + "loss": 2.3409, + "step": 1780 + }, + { + "epoch": 0.09554721030042919, + "grad_norm": 0.328125, + "learning_rate": 3.1849070100143064e-06, + "loss": 2.3533, + "step": 1781 + }, + { + "epoch": 0.0956008583690987, + "grad_norm": 0.265625, + "learning_rate": 3.1866952789699573e-06, + "loss": 2.2971, + "step": 1782 + }, + { + "epoch": 0.09565450643776824, + "grad_norm": 0.404296875, + "learning_rate": 3.188483547925608e-06, + "loss": 2.3262, + "step": 1783 + }, + { + "epoch": 0.09570815450643777, + "grad_norm": 0.78125, + "learning_rate": 3.190271816881259e-06, + "loss": 2.4741, + "step": 1784 + }, + { + "epoch": 0.0957618025751073, + "grad_norm": 0.3359375, + "learning_rate": 3.19206008583691e-06, + "loss": 2.3702, + "step": 1785 + }, + { + "epoch": 0.09581545064377682, + "grad_norm": 0.62890625, + "learning_rate": 3.1938483547925608e-06, + "loss": 1.6672, + "step": 1786 + }, + { + "epoch": 0.09586909871244635, + "grad_norm": 0.4140625, + "learning_rate": 3.195636623748212e-06, + "loss": 2.1808, + "step": 1787 + }, + { + "epoch": 0.09592274678111588, + "grad_norm": 1.0546875, + "learning_rate": 3.197424892703863e-06, + "loss": 2.1714, + "step": 1788 + }, + { + "epoch": 0.0959763948497854, + "grad_norm": 0.349609375, + "learning_rate": 3.199213161659514e-06, + "loss": 2.555, + "step": 1789 + }, + { + "epoch": 0.09603004291845493, + "grad_norm": 0.34375, + "learning_rate": 3.2010014306151647e-06, + "loss": 2.4018, + "step": 1790 + }, + { + "epoch": 0.09608369098712446, + "grad_norm": 0.3046875, + "learning_rate": 3.2027896995708156e-06, + "loss": 2.2365, + "step": 1791 + }, + { + "epoch": 0.096137339055794, + "grad_norm": 0.58984375, + "learning_rate": 3.2045779685264665e-06, + "loss": 2.2972, + "step": 1792 + }, + { + "epoch": 0.09619098712446351, + "grad_norm": 0.3203125, + "learning_rate": 3.2063662374821173e-06, + "loss": 2.3763, + "step": 1793 + }, + { + "epoch": 0.09624463519313305, + "grad_norm": 0.408203125, + "learning_rate": 3.208154506437768e-06, + "loss": 1.6084, + "step": 1794 + }, + { + "epoch": 0.09629828326180258, + "grad_norm": 0.29296875, + "learning_rate": 3.209942775393419e-06, + "loss": 2.316, + "step": 1795 + }, + { + "epoch": 0.09635193133047211, + "grad_norm": 0.3203125, + "learning_rate": 3.21173104434907e-06, + "loss": 2.3623, + "step": 1796 + }, + { + "epoch": 0.09640557939914163, + "grad_norm": 0.361328125, + "learning_rate": 3.2135193133047217e-06, + "loss": 2.4884, + "step": 1797 + }, + { + "epoch": 0.09645922746781116, + "grad_norm": 0.419921875, + "learning_rate": 3.2153075822603726e-06, + "loss": 2.3853, + "step": 1798 + }, + { + "epoch": 0.09651287553648069, + "grad_norm": 0.326171875, + "learning_rate": 3.2170958512160234e-06, + "loss": 2.3767, + "step": 1799 + }, + { + "epoch": 0.09656652360515021, + "grad_norm": 0.3125, + "learning_rate": 3.2188841201716743e-06, + "loss": 2.3763, + "step": 1800 + }, + { + "epoch": 0.09662017167381974, + "grad_norm": 0.416015625, + "learning_rate": 3.220672389127325e-06, + "loss": 2.2875, + "step": 1801 + }, + { + "epoch": 0.09667381974248927, + "grad_norm": 0.453125, + "learning_rate": 3.222460658082976e-06, + "loss": 2.1212, + "step": 1802 + }, + { + "epoch": 0.0967274678111588, + "grad_norm": 0.30859375, + "learning_rate": 3.224248927038627e-06, + "loss": 2.3382, + "step": 1803 + }, + { + "epoch": 0.09678111587982832, + "grad_norm": 0.37109375, + "learning_rate": 3.226037195994278e-06, + "loss": 2.2166, + "step": 1804 + }, + { + "epoch": 0.09683476394849785, + "grad_norm": 0.341796875, + "learning_rate": 3.2278254649499287e-06, + "loss": 2.3092, + "step": 1805 + }, + { + "epoch": 0.09688841201716739, + "grad_norm": 1.125, + "learning_rate": 3.2296137339055796e-06, + "loss": 1.9457, + "step": 1806 + }, + { + "epoch": 0.0969420600858369, + "grad_norm": 0.314453125, + "learning_rate": 3.231402002861231e-06, + "loss": 2.4845, + "step": 1807 + }, + { + "epoch": 0.09699570815450644, + "grad_norm": 0.3125, + "learning_rate": 3.2331902718168817e-06, + "loss": 2.4129, + "step": 1808 + }, + { + "epoch": 0.09704935622317597, + "grad_norm": 0.314453125, + "learning_rate": 3.2349785407725326e-06, + "loss": 2.3114, + "step": 1809 + }, + { + "epoch": 0.0971030042918455, + "grad_norm": 0.380859375, + "learning_rate": 3.2367668097281835e-06, + "loss": 2.3205, + "step": 1810 + }, + { + "epoch": 0.09715665236051502, + "grad_norm": 0.310546875, + "learning_rate": 3.2385550786838344e-06, + "loss": 2.2329, + "step": 1811 + }, + { + "epoch": 0.09721030042918455, + "grad_norm": 0.34765625, + "learning_rate": 3.2403433476394852e-06, + "loss": 2.4674, + "step": 1812 + }, + { + "epoch": 0.09726394849785408, + "grad_norm": 0.294921875, + "learning_rate": 3.242131616595136e-06, + "loss": 2.433, + "step": 1813 + }, + { + "epoch": 0.09731759656652361, + "grad_norm": 0.287109375, + "learning_rate": 3.243919885550787e-06, + "loss": 2.3941, + "step": 1814 + }, + { + "epoch": 0.09737124463519313, + "grad_norm": 0.453125, + "learning_rate": 3.245708154506438e-06, + "loss": 2.3346, + "step": 1815 + }, + { + "epoch": 0.09742489270386266, + "grad_norm": 0.36328125, + "learning_rate": 3.2474964234620887e-06, + "loss": 2.3223, + "step": 1816 + }, + { + "epoch": 0.0974785407725322, + "grad_norm": 0.68359375, + "learning_rate": 3.24928469241774e-06, + "loss": 2.1036, + "step": 1817 + }, + { + "epoch": 0.09753218884120171, + "grad_norm": 0.380859375, + "learning_rate": 3.251072961373391e-06, + "loss": 2.442, + "step": 1818 + }, + { + "epoch": 0.09758583690987124, + "grad_norm": 0.330078125, + "learning_rate": 3.252861230329042e-06, + "loss": 2.4005, + "step": 1819 + }, + { + "epoch": 0.09763948497854077, + "grad_norm": 0.44140625, + "learning_rate": 3.2546494992846927e-06, + "loss": 2.4433, + "step": 1820 + }, + { + "epoch": 0.0976931330472103, + "grad_norm": 0.47265625, + "learning_rate": 3.2564377682403435e-06, + "loss": 2.0415, + "step": 1821 + }, + { + "epoch": 0.09774678111587982, + "grad_norm": 0.390625, + "learning_rate": 3.2582260371959944e-06, + "loss": 2.6357, + "step": 1822 + }, + { + "epoch": 0.09780042918454936, + "grad_norm": 0.28515625, + "learning_rate": 3.2600143061516453e-06, + "loss": 2.4659, + "step": 1823 + }, + { + "epoch": 0.09785407725321889, + "grad_norm": 0.65234375, + "learning_rate": 3.261802575107296e-06, + "loss": 2.2302, + "step": 1824 + }, + { + "epoch": 0.0979077253218884, + "grad_norm": 0.2490234375, + "learning_rate": 3.263590844062947e-06, + "loss": 2.1841, + "step": 1825 + }, + { + "epoch": 0.09796137339055794, + "grad_norm": 0.36328125, + "learning_rate": 3.265379113018598e-06, + "loss": 2.4522, + "step": 1826 + }, + { + "epoch": 0.09801502145922747, + "grad_norm": 0.275390625, + "learning_rate": 3.2671673819742492e-06, + "loss": 2.3899, + "step": 1827 + }, + { + "epoch": 0.098068669527897, + "grad_norm": 1.4765625, + "learning_rate": 3.2689556509299e-06, + "loss": 2.5442, + "step": 1828 + }, + { + "epoch": 0.09812231759656652, + "grad_norm": 0.291015625, + "learning_rate": 3.270743919885551e-06, + "loss": 2.4684, + "step": 1829 + }, + { + "epoch": 0.09817596566523605, + "grad_norm": 0.5625, + "learning_rate": 3.272532188841202e-06, + "loss": 2.4278, + "step": 1830 + }, + { + "epoch": 0.09822961373390558, + "grad_norm": 0.275390625, + "learning_rate": 3.2743204577968527e-06, + "loss": 2.2508, + "step": 1831 + }, + { + "epoch": 0.0982832618025751, + "grad_norm": 0.353515625, + "learning_rate": 3.2761087267525036e-06, + "loss": 2.3228, + "step": 1832 + }, + { + "epoch": 0.09833690987124463, + "grad_norm": 0.296875, + "learning_rate": 3.2778969957081545e-06, + "loss": 2.2496, + "step": 1833 + }, + { + "epoch": 0.09839055793991416, + "grad_norm": 0.345703125, + "learning_rate": 3.2796852646638054e-06, + "loss": 2.5139, + "step": 1834 + }, + { + "epoch": 0.0984442060085837, + "grad_norm": 0.318359375, + "learning_rate": 3.2814735336194562e-06, + "loss": 2.0646, + "step": 1835 + }, + { + "epoch": 0.09849785407725321, + "grad_norm": 0.271484375, + "learning_rate": 3.283261802575107e-06, + "loss": 2.469, + "step": 1836 + }, + { + "epoch": 0.09855150214592275, + "grad_norm": 0.337890625, + "learning_rate": 3.285050071530759e-06, + "loss": 2.4398, + "step": 1837 + }, + { + "epoch": 0.09860515021459228, + "grad_norm": 0.306640625, + "learning_rate": 3.2868383404864097e-06, + "loss": 2.172, + "step": 1838 + }, + { + "epoch": 0.09865879828326181, + "grad_norm": 0.326171875, + "learning_rate": 3.2886266094420606e-06, + "loss": 2.5456, + "step": 1839 + }, + { + "epoch": 0.09871244635193133, + "grad_norm": 0.365234375, + "learning_rate": 3.2904148783977115e-06, + "loss": 2.3088, + "step": 1840 + }, + { + "epoch": 0.09876609442060086, + "grad_norm": 0.302734375, + "learning_rate": 3.2922031473533623e-06, + "loss": 2.3013, + "step": 1841 + }, + { + "epoch": 0.09881974248927039, + "grad_norm": 0.384765625, + "learning_rate": 3.293991416309013e-06, + "loss": 2.5251, + "step": 1842 + }, + { + "epoch": 0.09887339055793991, + "grad_norm": 0.4140625, + "learning_rate": 3.295779685264664e-06, + "loss": 2.2745, + "step": 1843 + }, + { + "epoch": 0.09892703862660944, + "grad_norm": 0.310546875, + "learning_rate": 3.297567954220315e-06, + "loss": 1.8906, + "step": 1844 + }, + { + "epoch": 0.09898068669527897, + "grad_norm": 0.30859375, + "learning_rate": 3.299356223175966e-06, + "loss": 2.4155, + "step": 1845 + }, + { + "epoch": 0.0990343347639485, + "grad_norm": 0.271484375, + "learning_rate": 3.301144492131617e-06, + "loss": 2.4911, + "step": 1846 + }, + { + "epoch": 0.09908798283261802, + "grad_norm": 0.3984375, + "learning_rate": 3.302932761087268e-06, + "loss": 2.2057, + "step": 1847 + }, + { + "epoch": 0.09914163090128755, + "grad_norm": 0.48828125, + "learning_rate": 3.304721030042919e-06, + "loss": 2.2291, + "step": 1848 + }, + { + "epoch": 0.09919527896995708, + "grad_norm": 0.3046875, + "learning_rate": 3.3065092989985698e-06, + "loss": 2.1683, + "step": 1849 + }, + { + "epoch": 0.0992489270386266, + "grad_norm": 0.392578125, + "learning_rate": 3.3082975679542206e-06, + "loss": 2.0233, + "step": 1850 + }, + { + "epoch": 0.09930257510729613, + "grad_norm": 0.33203125, + "learning_rate": 3.3100858369098715e-06, + "loss": 2.6861, + "step": 1851 + }, + { + "epoch": 0.09935622317596567, + "grad_norm": 1.15625, + "learning_rate": 3.3118741058655224e-06, + "loss": 2.5563, + "step": 1852 + }, + { + "epoch": 0.0994098712446352, + "grad_norm": 0.458984375, + "learning_rate": 3.3136623748211733e-06, + "loss": 2.322, + "step": 1853 + }, + { + "epoch": 0.09946351931330472, + "grad_norm": 0.87890625, + "learning_rate": 3.315450643776824e-06, + "loss": 2.6716, + "step": 1854 + }, + { + "epoch": 0.09951716738197425, + "grad_norm": 0.57421875, + "learning_rate": 3.317238912732475e-06, + "loss": 2.5271, + "step": 1855 + }, + { + "epoch": 0.09957081545064378, + "grad_norm": 0.3359375, + "learning_rate": 3.3190271816881263e-06, + "loss": 2.2757, + "step": 1856 + }, + { + "epoch": 0.09962446351931331, + "grad_norm": 0.29296875, + "learning_rate": 3.320815450643777e-06, + "loss": 2.4664, + "step": 1857 + }, + { + "epoch": 0.09967811158798283, + "grad_norm": 0.423828125, + "learning_rate": 3.322603719599428e-06, + "loss": 2.2297, + "step": 1858 + }, + { + "epoch": 0.09973175965665236, + "grad_norm": 0.337890625, + "learning_rate": 3.324391988555079e-06, + "loss": 2.2805, + "step": 1859 + }, + { + "epoch": 0.09978540772532189, + "grad_norm": 0.396484375, + "learning_rate": 3.32618025751073e-06, + "loss": 2.3565, + "step": 1860 + }, + { + "epoch": 0.09983905579399141, + "grad_norm": 0.2734375, + "learning_rate": 3.3279685264663807e-06, + "loss": 2.5298, + "step": 1861 + }, + { + "epoch": 0.09989270386266094, + "grad_norm": 0.38671875, + "learning_rate": 3.3297567954220316e-06, + "loss": 2.1956, + "step": 1862 + }, + { + "epoch": 0.09994635193133047, + "grad_norm": 2.515625, + "learning_rate": 3.3315450643776824e-06, + "loss": 2.5034, + "step": 1863 + }, + { + "epoch": 0.1, + "grad_norm": 0.36328125, + "learning_rate": 3.3333333333333333e-06, + "loss": 2.4421, + "step": 1864 + }, + { + "epoch": 0.10005364806866952, + "grad_norm": 0.28515625, + "learning_rate": 3.335121602288984e-06, + "loss": 2.2629, + "step": 1865 + }, + { + "epoch": 0.10010729613733906, + "grad_norm": 0.310546875, + "learning_rate": 3.336909871244636e-06, + "loss": 2.3191, + "step": 1866 + }, + { + "epoch": 0.10016094420600859, + "grad_norm": 0.32421875, + "learning_rate": 3.338698140200287e-06, + "loss": 2.3113, + "step": 1867 + }, + { + "epoch": 0.1002145922746781, + "grad_norm": 0.302734375, + "learning_rate": 3.3404864091559377e-06, + "loss": 2.5534, + "step": 1868 + }, + { + "epoch": 0.10026824034334764, + "grad_norm": 0.275390625, + "learning_rate": 3.3422746781115885e-06, + "loss": 2.3333, + "step": 1869 + }, + { + "epoch": 0.10032188841201717, + "grad_norm": 0.41015625, + "learning_rate": 3.3440629470672394e-06, + "loss": 2.4726, + "step": 1870 + }, + { + "epoch": 0.1003755364806867, + "grad_norm": 0.30078125, + "learning_rate": 3.3458512160228903e-06, + "loss": 2.3039, + "step": 1871 + }, + { + "epoch": 0.10042918454935622, + "grad_norm": 0.73046875, + "learning_rate": 3.3476394849785408e-06, + "loss": 2.2414, + "step": 1872 + }, + { + "epoch": 0.10048283261802575, + "grad_norm": 0.37890625, + "learning_rate": 3.3494277539341916e-06, + "loss": 2.3642, + "step": 1873 + }, + { + "epoch": 0.10053648068669528, + "grad_norm": 0.41015625, + "learning_rate": 3.3512160228898425e-06, + "loss": 2.4431, + "step": 1874 + }, + { + "epoch": 0.10059012875536481, + "grad_norm": 0.40625, + "learning_rate": 3.3530042918454934e-06, + "loss": 2.1551, + "step": 1875 + }, + { + "epoch": 0.10064377682403433, + "grad_norm": 0.28125, + "learning_rate": 3.354792560801145e-06, + "loss": 2.6725, + "step": 1876 + }, + { + "epoch": 0.10069742489270386, + "grad_norm": 0.294921875, + "learning_rate": 3.356580829756796e-06, + "loss": 2.6802, + "step": 1877 + }, + { + "epoch": 0.1007510729613734, + "grad_norm": 0.314453125, + "learning_rate": 3.358369098712447e-06, + "loss": 2.1911, + "step": 1878 + }, + { + "epoch": 0.10080472103004291, + "grad_norm": 0.2294921875, + "learning_rate": 3.3601573676680977e-06, + "loss": 2.0293, + "step": 1879 + }, + { + "epoch": 0.10085836909871244, + "grad_norm": 0.2578125, + "learning_rate": 3.3619456366237486e-06, + "loss": 2.2231, + "step": 1880 + }, + { + "epoch": 0.10091201716738198, + "grad_norm": 0.283203125, + "learning_rate": 3.3637339055793995e-06, + "loss": 2.3435, + "step": 1881 + }, + { + "epoch": 0.10096566523605151, + "grad_norm": 0.33203125, + "learning_rate": 3.3655221745350504e-06, + "loss": 2.1474, + "step": 1882 + }, + { + "epoch": 0.10101931330472103, + "grad_norm": 0.32421875, + "learning_rate": 3.3673104434907012e-06, + "loss": 2.338, + "step": 1883 + }, + { + "epoch": 0.10107296137339056, + "grad_norm": 0.2890625, + "learning_rate": 3.369098712446352e-06, + "loss": 2.1223, + "step": 1884 + }, + { + "epoch": 0.10112660944206009, + "grad_norm": 0.296875, + "learning_rate": 3.370886981402003e-06, + "loss": 2.4205, + "step": 1885 + }, + { + "epoch": 0.10118025751072961, + "grad_norm": 0.35546875, + "learning_rate": 3.3726752503576543e-06, + "loss": 2.1381, + "step": 1886 + }, + { + "epoch": 0.10123390557939914, + "grad_norm": 0.380859375, + "learning_rate": 3.374463519313305e-06, + "loss": 2.2856, + "step": 1887 + }, + { + "epoch": 0.10128755364806867, + "grad_norm": 0.47265625, + "learning_rate": 3.376251788268956e-06, + "loss": 2.4862, + "step": 1888 + }, + { + "epoch": 0.1013412017167382, + "grad_norm": 0.314453125, + "learning_rate": 3.378040057224607e-06, + "loss": 2.3237, + "step": 1889 + }, + { + "epoch": 0.10139484978540772, + "grad_norm": 0.38671875, + "learning_rate": 3.379828326180258e-06, + "loss": 2.6489, + "step": 1890 + }, + { + "epoch": 0.10144849785407725, + "grad_norm": 0.353515625, + "learning_rate": 3.3816165951359087e-06, + "loss": 2.3974, + "step": 1891 + }, + { + "epoch": 0.10150214592274678, + "grad_norm": 0.359375, + "learning_rate": 3.3834048640915595e-06, + "loss": 2.7624, + "step": 1892 + }, + { + "epoch": 0.10155579399141632, + "grad_norm": 0.314453125, + "learning_rate": 3.3851931330472104e-06, + "loss": 2.5371, + "step": 1893 + }, + { + "epoch": 0.10160944206008583, + "grad_norm": 0.46875, + "learning_rate": 3.3869814020028613e-06, + "loss": 1.8085, + "step": 1894 + }, + { + "epoch": 0.10166309012875537, + "grad_norm": 0.330078125, + "learning_rate": 3.388769670958512e-06, + "loss": 2.23, + "step": 1895 + }, + { + "epoch": 0.1017167381974249, + "grad_norm": 0.37890625, + "learning_rate": 3.3905579399141635e-06, + "loss": 1.7654, + "step": 1896 + }, + { + "epoch": 0.10177038626609441, + "grad_norm": 0.322265625, + "learning_rate": 3.3923462088698143e-06, + "loss": 2.4983, + "step": 1897 + }, + { + "epoch": 0.10182403433476395, + "grad_norm": 0.423828125, + "learning_rate": 3.3941344778254652e-06, + "loss": 2.3797, + "step": 1898 + }, + { + "epoch": 0.10187768240343348, + "grad_norm": 0.275390625, + "learning_rate": 3.395922746781116e-06, + "loss": 2.548, + "step": 1899 + }, + { + "epoch": 0.10193133047210301, + "grad_norm": 0.353515625, + "learning_rate": 3.397711015736767e-06, + "loss": 2.0337, + "step": 1900 + }, + { + "epoch": 0.10198497854077253, + "grad_norm": 0.380859375, + "learning_rate": 3.399499284692418e-06, + "loss": 2.1763, + "step": 1901 + }, + { + "epoch": 0.10203862660944206, + "grad_norm": 0.82421875, + "learning_rate": 3.4012875536480687e-06, + "loss": 2.5312, + "step": 1902 + }, + { + "epoch": 0.10209227467811159, + "grad_norm": 0.31640625, + "learning_rate": 3.4030758226037196e-06, + "loss": 2.2981, + "step": 1903 + }, + { + "epoch": 0.10214592274678111, + "grad_norm": 0.302734375, + "learning_rate": 3.4048640915593705e-06, + "loss": 1.9508, + "step": 1904 + }, + { + "epoch": 0.10219957081545064, + "grad_norm": 0.375, + "learning_rate": 3.4066523605150213e-06, + "loss": 2.2551, + "step": 1905 + }, + { + "epoch": 0.10225321888412017, + "grad_norm": 0.365234375, + "learning_rate": 3.408440629470673e-06, + "loss": 2.5305, + "step": 1906 + }, + { + "epoch": 0.1023068669527897, + "grad_norm": 0.2890625, + "learning_rate": 3.410228898426324e-06, + "loss": 2.3295, + "step": 1907 + }, + { + "epoch": 0.10236051502145922, + "grad_norm": 0.37890625, + "learning_rate": 3.412017167381975e-06, + "loss": 2.5087, + "step": 1908 + }, + { + "epoch": 0.10241416309012875, + "grad_norm": 0.27734375, + "learning_rate": 3.4138054363376257e-06, + "loss": 1.9473, + "step": 1909 + }, + { + "epoch": 0.10246781115879829, + "grad_norm": 0.310546875, + "learning_rate": 3.4155937052932766e-06, + "loss": 2.1492, + "step": 1910 + }, + { + "epoch": 0.10252145922746782, + "grad_norm": 0.2734375, + "learning_rate": 3.4173819742489275e-06, + "loss": 2.5761, + "step": 1911 + }, + { + "epoch": 0.10257510729613734, + "grad_norm": 0.29296875, + "learning_rate": 3.4191702432045783e-06, + "loss": 2.5124, + "step": 1912 + }, + { + "epoch": 0.10262875536480687, + "grad_norm": 0.302734375, + "learning_rate": 3.420958512160229e-06, + "loss": 2.4523, + "step": 1913 + }, + { + "epoch": 0.1026824034334764, + "grad_norm": 0.298828125, + "learning_rate": 3.42274678111588e-06, + "loss": 2.4263, + "step": 1914 + }, + { + "epoch": 0.10273605150214592, + "grad_norm": 0.349609375, + "learning_rate": 3.4245350500715314e-06, + "loss": 2.1202, + "step": 1915 + }, + { + "epoch": 0.10278969957081545, + "grad_norm": 0.3984375, + "learning_rate": 3.4263233190271823e-06, + "loss": 2.1661, + "step": 1916 + }, + { + "epoch": 0.10284334763948498, + "grad_norm": 0.279296875, + "learning_rate": 3.428111587982833e-06, + "loss": 2.1987, + "step": 1917 + }, + { + "epoch": 0.10289699570815451, + "grad_norm": 0.32421875, + "learning_rate": 3.429899856938484e-06, + "loss": 2.4675, + "step": 1918 + }, + { + "epoch": 0.10295064377682403, + "grad_norm": 0.306640625, + "learning_rate": 3.431688125894135e-06, + "loss": 2.3728, + "step": 1919 + }, + { + "epoch": 0.10300429184549356, + "grad_norm": 0.328125, + "learning_rate": 3.4334763948497858e-06, + "loss": 2.2861, + "step": 1920 + }, + { + "epoch": 0.1030579399141631, + "grad_norm": 0.322265625, + "learning_rate": 3.4352646638054366e-06, + "loss": 2.1771, + "step": 1921 + }, + { + "epoch": 0.10311158798283261, + "grad_norm": 0.296875, + "learning_rate": 3.4370529327610875e-06, + "loss": 2.4603, + "step": 1922 + }, + { + "epoch": 0.10316523605150214, + "grad_norm": 0.36328125, + "learning_rate": 3.4388412017167384e-06, + "loss": 1.7366, + "step": 1923 + }, + { + "epoch": 0.10321888412017168, + "grad_norm": 0.306640625, + "learning_rate": 3.4406294706723893e-06, + "loss": 2.2869, + "step": 1924 + }, + { + "epoch": 0.1032725321888412, + "grad_norm": 0.33203125, + "learning_rate": 3.4424177396280406e-06, + "loss": 2.6591, + "step": 1925 + }, + { + "epoch": 0.10332618025751072, + "grad_norm": 0.384765625, + "learning_rate": 3.4442060085836914e-06, + "loss": 2.212, + "step": 1926 + }, + { + "epoch": 0.10337982832618026, + "grad_norm": 0.330078125, + "learning_rate": 3.4459942775393423e-06, + "loss": 2.4467, + "step": 1927 + }, + { + "epoch": 0.10343347639484979, + "grad_norm": 0.490234375, + "learning_rate": 3.447782546494993e-06, + "loss": 2.4213, + "step": 1928 + }, + { + "epoch": 0.10348712446351932, + "grad_norm": 0.369140625, + "learning_rate": 3.449570815450644e-06, + "loss": 2.4396, + "step": 1929 + }, + { + "epoch": 0.10354077253218884, + "grad_norm": 0.37890625, + "learning_rate": 3.451359084406295e-06, + "loss": 2.2573, + "step": 1930 + }, + { + "epoch": 0.10359442060085837, + "grad_norm": 0.2890625, + "learning_rate": 3.453147353361946e-06, + "loss": 2.3364, + "step": 1931 + }, + { + "epoch": 0.1036480686695279, + "grad_norm": 0.357421875, + "learning_rate": 3.4549356223175967e-06, + "loss": 2.1312, + "step": 1932 + }, + { + "epoch": 0.10370171673819742, + "grad_norm": 0.3125, + "learning_rate": 3.4567238912732476e-06, + "loss": 2.4162, + "step": 1933 + }, + { + "epoch": 0.10375536480686695, + "grad_norm": 0.44921875, + "learning_rate": 3.4585121602288984e-06, + "loss": 2.5472, + "step": 1934 + }, + { + "epoch": 0.10380901287553648, + "grad_norm": 0.291015625, + "learning_rate": 3.4603004291845497e-06, + "loss": 2.4274, + "step": 1935 + }, + { + "epoch": 0.10386266094420601, + "grad_norm": 0.255859375, + "learning_rate": 3.4620886981402006e-06, + "loss": 2.2202, + "step": 1936 + }, + { + "epoch": 0.10391630901287553, + "grad_norm": 0.28515625, + "learning_rate": 3.4638769670958515e-06, + "loss": 2.2816, + "step": 1937 + }, + { + "epoch": 0.10396995708154506, + "grad_norm": 0.326171875, + "learning_rate": 3.4656652360515024e-06, + "loss": 2.227, + "step": 1938 + }, + { + "epoch": 0.1040236051502146, + "grad_norm": 0.369140625, + "learning_rate": 3.4674535050071532e-06, + "loss": 2.4261, + "step": 1939 + }, + { + "epoch": 0.10407725321888411, + "grad_norm": 0.328125, + "learning_rate": 3.469241773962804e-06, + "loss": 2.361, + "step": 1940 + }, + { + "epoch": 0.10413090128755365, + "grad_norm": 0.69140625, + "learning_rate": 3.471030042918455e-06, + "loss": 2.4049, + "step": 1941 + }, + { + "epoch": 0.10418454935622318, + "grad_norm": 0.30859375, + "learning_rate": 3.472818311874106e-06, + "loss": 2.3647, + "step": 1942 + }, + { + "epoch": 0.10423819742489271, + "grad_norm": 0.31640625, + "learning_rate": 3.4746065808297567e-06, + "loss": 2.4357, + "step": 1943 + }, + { + "epoch": 0.10429184549356223, + "grad_norm": 0.53515625, + "learning_rate": 3.4763948497854076e-06, + "loss": 2.227, + "step": 1944 + }, + { + "epoch": 0.10434549356223176, + "grad_norm": 0.29296875, + "learning_rate": 3.4781831187410593e-06, + "loss": 2.3218, + "step": 1945 + }, + { + "epoch": 0.10439914163090129, + "grad_norm": 1.234375, + "learning_rate": 3.4799713876967102e-06, + "loss": 2.3803, + "step": 1946 + }, + { + "epoch": 0.10445278969957082, + "grad_norm": 0.412109375, + "learning_rate": 3.481759656652361e-06, + "loss": 2.5235, + "step": 1947 + }, + { + "epoch": 0.10450643776824034, + "grad_norm": 0.271484375, + "learning_rate": 3.483547925608012e-06, + "loss": 2.0958, + "step": 1948 + }, + { + "epoch": 0.10456008583690987, + "grad_norm": 0.33203125, + "learning_rate": 3.485336194563663e-06, + "loss": 2.2272, + "step": 1949 + }, + { + "epoch": 0.1046137339055794, + "grad_norm": 0.318359375, + "learning_rate": 3.4871244635193137e-06, + "loss": 2.4878, + "step": 1950 + }, + { + "epoch": 0.10466738197424892, + "grad_norm": 0.30078125, + "learning_rate": 3.4889127324749646e-06, + "loss": 2.4345, + "step": 1951 + }, + { + "epoch": 0.10472103004291845, + "grad_norm": 0.392578125, + "learning_rate": 3.4907010014306155e-06, + "loss": 2.5248, + "step": 1952 + }, + { + "epoch": 0.10477467811158798, + "grad_norm": 0.68359375, + "learning_rate": 3.4924892703862664e-06, + "loss": 2.4817, + "step": 1953 + }, + { + "epoch": 0.10482832618025752, + "grad_norm": 0.38671875, + "learning_rate": 3.4942775393419172e-06, + "loss": 2.4703, + "step": 1954 + }, + { + "epoch": 0.10488197424892703, + "grad_norm": 0.2578125, + "learning_rate": 3.4960658082975685e-06, + "loss": 2.1155, + "step": 1955 + }, + { + "epoch": 0.10493562231759657, + "grad_norm": 0.26953125, + "learning_rate": 3.4978540772532194e-06, + "loss": 2.3372, + "step": 1956 + }, + { + "epoch": 0.1049892703862661, + "grad_norm": 0.30078125, + "learning_rate": 3.4996423462088703e-06, + "loss": 2.4501, + "step": 1957 + }, + { + "epoch": 0.10504291845493562, + "grad_norm": 0.283203125, + "learning_rate": 3.501430615164521e-06, + "loss": 2.4934, + "step": 1958 + }, + { + "epoch": 0.10509656652360515, + "grad_norm": 0.353515625, + "learning_rate": 3.503218884120172e-06, + "loss": 2.0084, + "step": 1959 + }, + { + "epoch": 0.10515021459227468, + "grad_norm": 0.349609375, + "learning_rate": 3.505007153075823e-06, + "loss": 2.1249, + "step": 1960 + }, + { + "epoch": 0.10520386266094421, + "grad_norm": 0.369140625, + "learning_rate": 3.5067954220314738e-06, + "loss": 2.0833, + "step": 1961 + }, + { + "epoch": 0.10525751072961373, + "grad_norm": 0.34375, + "learning_rate": 3.5085836909871247e-06, + "loss": 2.4178, + "step": 1962 + }, + { + "epoch": 0.10531115879828326, + "grad_norm": 0.333984375, + "learning_rate": 3.5103719599427755e-06, + "loss": 2.4738, + "step": 1963 + }, + { + "epoch": 0.10536480686695279, + "grad_norm": 0.291015625, + "learning_rate": 3.5121602288984264e-06, + "loss": 2.2389, + "step": 1964 + }, + { + "epoch": 0.10541845493562232, + "grad_norm": 0.3359375, + "learning_rate": 3.5139484978540777e-06, + "loss": 2.3417, + "step": 1965 + }, + { + "epoch": 0.10547210300429184, + "grad_norm": 0.306640625, + "learning_rate": 3.5157367668097286e-06, + "loss": 2.3153, + "step": 1966 + }, + { + "epoch": 0.10552575107296137, + "grad_norm": 0.3125, + "learning_rate": 3.5175250357653795e-06, + "loss": 2.4313, + "step": 1967 + }, + { + "epoch": 0.1055793991416309, + "grad_norm": 0.43359375, + "learning_rate": 3.5193133047210303e-06, + "loss": 2.4879, + "step": 1968 + }, + { + "epoch": 0.10563304721030042, + "grad_norm": 0.328125, + "learning_rate": 3.5211015736766812e-06, + "loss": 2.4743, + "step": 1969 + }, + { + "epoch": 0.10568669527896996, + "grad_norm": 0.3359375, + "learning_rate": 3.522889842632332e-06, + "loss": 2.3775, + "step": 1970 + }, + { + "epoch": 0.10574034334763949, + "grad_norm": 0.263671875, + "learning_rate": 3.524678111587983e-06, + "loss": 2.143, + "step": 1971 + }, + { + "epoch": 0.10579399141630902, + "grad_norm": 0.349609375, + "learning_rate": 3.526466380543634e-06, + "loss": 2.2427, + "step": 1972 + }, + { + "epoch": 0.10584763948497854, + "grad_norm": 0.330078125, + "learning_rate": 3.5282546494992847e-06, + "loss": 2.4789, + "step": 1973 + }, + { + "epoch": 0.10590128755364807, + "grad_norm": 0.2734375, + "learning_rate": 3.530042918454936e-06, + "loss": 2.4512, + "step": 1974 + }, + { + "epoch": 0.1059549356223176, + "grad_norm": 0.318359375, + "learning_rate": 3.531831187410587e-06, + "loss": 2.3281, + "step": 1975 + }, + { + "epoch": 0.10600858369098712, + "grad_norm": 0.337890625, + "learning_rate": 3.5336194563662378e-06, + "loss": 2.2938, + "step": 1976 + }, + { + "epoch": 0.10606223175965665, + "grad_norm": 0.34765625, + "learning_rate": 3.5354077253218886e-06, + "loss": 2.2877, + "step": 1977 + }, + { + "epoch": 0.10611587982832618, + "grad_norm": 0.3359375, + "learning_rate": 3.5371959942775395e-06, + "loss": 2.3506, + "step": 1978 + }, + { + "epoch": 0.10616952789699571, + "grad_norm": 0.28515625, + "learning_rate": 3.5389842632331904e-06, + "loss": 2.3387, + "step": 1979 + }, + { + "epoch": 0.10622317596566523, + "grad_norm": 0.3125, + "learning_rate": 3.5407725321888413e-06, + "loss": 2.3663, + "step": 1980 + }, + { + "epoch": 0.10627682403433476, + "grad_norm": 0.345703125, + "learning_rate": 3.542560801144492e-06, + "loss": 2.4255, + "step": 1981 + }, + { + "epoch": 0.1063304721030043, + "grad_norm": 0.44140625, + "learning_rate": 3.544349070100143e-06, + "loss": 2.3015, + "step": 1982 + }, + { + "epoch": 0.10638412017167383, + "grad_norm": 0.28125, + "learning_rate": 3.546137339055794e-06, + "loss": 2.4187, + "step": 1983 + }, + { + "epoch": 0.10643776824034334, + "grad_norm": 0.2578125, + "learning_rate": 3.5479256080114456e-06, + "loss": 2.2769, + "step": 1984 + }, + { + "epoch": 0.10649141630901288, + "grad_norm": 0.314453125, + "learning_rate": 3.5497138769670965e-06, + "loss": 2.2324, + "step": 1985 + }, + { + "epoch": 0.10654506437768241, + "grad_norm": 0.490234375, + "learning_rate": 3.5515021459227474e-06, + "loss": 1.9225, + "step": 1986 + }, + { + "epoch": 0.10659871244635193, + "grad_norm": 0.259765625, + "learning_rate": 3.5532904148783982e-06, + "loss": 2.0301, + "step": 1987 + }, + { + "epoch": 0.10665236051502146, + "grad_norm": 0.275390625, + "learning_rate": 3.555078683834049e-06, + "loss": 2.3181, + "step": 1988 + }, + { + "epoch": 0.10670600858369099, + "grad_norm": 0.333984375, + "learning_rate": 3.5568669527897e-06, + "loss": 2.3787, + "step": 1989 + }, + { + "epoch": 0.10675965665236052, + "grad_norm": 0.359375, + "learning_rate": 3.558655221745351e-06, + "loss": 2.4299, + "step": 1990 + }, + { + "epoch": 0.10681330472103004, + "grad_norm": 0.296875, + "learning_rate": 3.5604434907010018e-06, + "loss": 2.2037, + "step": 1991 + }, + { + "epoch": 0.10686695278969957, + "grad_norm": 0.318359375, + "learning_rate": 3.5622317596566526e-06, + "loss": 2.3649, + "step": 1992 + }, + { + "epoch": 0.1069206008583691, + "grad_norm": 0.30859375, + "learning_rate": 3.5640200286123035e-06, + "loss": 2.1486, + "step": 1993 + }, + { + "epoch": 0.10697424892703862, + "grad_norm": 0.296875, + "learning_rate": 3.565808297567955e-06, + "loss": 2.4181, + "step": 1994 + }, + { + "epoch": 0.10702789699570815, + "grad_norm": 0.26953125, + "learning_rate": 3.5675965665236057e-06, + "loss": 2.418, + "step": 1995 + }, + { + "epoch": 0.10708154506437768, + "grad_norm": 1.1875, + "learning_rate": 3.5693848354792566e-06, + "loss": 2.1563, + "step": 1996 + }, + { + "epoch": 0.10713519313304722, + "grad_norm": 0.3046875, + "learning_rate": 3.5711731044349074e-06, + "loss": 2.2594, + "step": 1997 + }, + { + "epoch": 0.10718884120171673, + "grad_norm": 0.482421875, + "learning_rate": 3.5729613733905583e-06, + "loss": 2.3455, + "step": 1998 + }, + { + "epoch": 0.10724248927038627, + "grad_norm": 0.28515625, + "learning_rate": 3.574749642346209e-06, + "loss": 2.22, + "step": 1999 + }, + { + "epoch": 0.1072961373390558, + "grad_norm": 0.32421875, + "learning_rate": 3.57653791130186e-06, + "loss": 2.5418, + "step": 2000 + }, + { + "epoch": 0.10734978540772531, + "grad_norm": 0.2734375, + "learning_rate": 3.578326180257511e-06, + "loss": 2.3393, + "step": 2001 + }, + { + "epoch": 0.10740343347639485, + "grad_norm": 0.296875, + "learning_rate": 3.580114449213162e-06, + "loss": 2.1836, + "step": 2002 + }, + { + "epoch": 0.10745708154506438, + "grad_norm": 0.515625, + "learning_rate": 3.5819027181688127e-06, + "loss": 2.3163, + "step": 2003 + }, + { + "epoch": 0.10751072961373391, + "grad_norm": 0.33984375, + "learning_rate": 3.583690987124464e-06, + "loss": 2.2571, + "step": 2004 + }, + { + "epoch": 0.10756437768240343, + "grad_norm": 0.384765625, + "learning_rate": 3.585479256080115e-06, + "loss": 2.1186, + "step": 2005 + }, + { + "epoch": 0.10761802575107296, + "grad_norm": 0.345703125, + "learning_rate": 3.5872675250357657e-06, + "loss": 2.5612, + "step": 2006 + }, + { + "epoch": 0.10767167381974249, + "grad_norm": 0.36328125, + "learning_rate": 3.5890557939914166e-06, + "loss": 2.2638, + "step": 2007 + }, + { + "epoch": 0.10772532188841202, + "grad_norm": 0.44140625, + "learning_rate": 3.5908440629470675e-06, + "loss": 2.5159, + "step": 2008 + }, + { + "epoch": 0.10777896995708154, + "grad_norm": 0.296875, + "learning_rate": 3.5926323319027184e-06, + "loss": 2.4112, + "step": 2009 + }, + { + "epoch": 0.10783261802575107, + "grad_norm": 0.52734375, + "learning_rate": 3.5944206008583692e-06, + "loss": 2.5405, + "step": 2010 + }, + { + "epoch": 0.1078862660944206, + "grad_norm": 0.296875, + "learning_rate": 3.59620886981402e-06, + "loss": 2.4297, + "step": 2011 + }, + { + "epoch": 0.10793991416309012, + "grad_norm": 0.2734375, + "learning_rate": 3.597997138769671e-06, + "loss": 2.2347, + "step": 2012 + }, + { + "epoch": 0.10799356223175965, + "grad_norm": 0.326171875, + "learning_rate": 3.599785407725322e-06, + "loss": 2.4735, + "step": 2013 + }, + { + "epoch": 0.10804721030042919, + "grad_norm": 0.33984375, + "learning_rate": 3.601573676680973e-06, + "loss": 2.5507, + "step": 2014 + }, + { + "epoch": 0.10810085836909872, + "grad_norm": 0.28515625, + "learning_rate": 3.603361945636624e-06, + "loss": 2.2149, + "step": 2015 + }, + { + "epoch": 0.10815450643776824, + "grad_norm": 1.203125, + "learning_rate": 3.605150214592275e-06, + "loss": 2.59, + "step": 2016 + }, + { + "epoch": 0.10820815450643777, + "grad_norm": 0.25, + "learning_rate": 3.606938483547926e-06, + "loss": 2.2556, + "step": 2017 + }, + { + "epoch": 0.1082618025751073, + "grad_norm": 0.390625, + "learning_rate": 3.6087267525035767e-06, + "loss": 2.4696, + "step": 2018 + }, + { + "epoch": 0.10831545064377682, + "grad_norm": 0.453125, + "learning_rate": 3.6105150214592275e-06, + "loss": 2.2095, + "step": 2019 + }, + { + "epoch": 0.10836909871244635, + "grad_norm": 0.357421875, + "learning_rate": 3.6123032904148784e-06, + "loss": 2.3194, + "step": 2020 + }, + { + "epoch": 0.10842274678111588, + "grad_norm": 0.326171875, + "learning_rate": 3.6140915593705293e-06, + "loss": 2.4833, + "step": 2021 + }, + { + "epoch": 0.10847639484978541, + "grad_norm": 0.384765625, + "learning_rate": 3.61587982832618e-06, + "loss": 2.4363, + "step": 2022 + }, + { + "epoch": 0.10853004291845493, + "grad_norm": 0.283203125, + "learning_rate": 3.617668097281831e-06, + "loss": 2.2152, + "step": 2023 + }, + { + "epoch": 0.10858369098712446, + "grad_norm": 0.255859375, + "learning_rate": 3.6194563662374828e-06, + "loss": 2.0057, + "step": 2024 + }, + { + "epoch": 0.108637339055794, + "grad_norm": 0.283203125, + "learning_rate": 3.6212446351931336e-06, + "loss": 2.3656, + "step": 2025 + }, + { + "epoch": 0.10869098712446353, + "grad_norm": 0.30859375, + "learning_rate": 3.6230329041487845e-06, + "loss": 2.5074, + "step": 2026 + }, + { + "epoch": 0.10874463519313304, + "grad_norm": 0.52734375, + "learning_rate": 3.6248211731044354e-06, + "loss": 2.4202, + "step": 2027 + }, + { + "epoch": 0.10879828326180258, + "grad_norm": 0.271484375, + "learning_rate": 3.6266094420600863e-06, + "loss": 2.3536, + "step": 2028 + }, + { + "epoch": 0.10885193133047211, + "grad_norm": 0.283203125, + "learning_rate": 3.628397711015737e-06, + "loss": 2.4317, + "step": 2029 + }, + { + "epoch": 0.10890557939914162, + "grad_norm": 0.396484375, + "learning_rate": 3.630185979971388e-06, + "loss": 2.4109, + "step": 2030 + }, + { + "epoch": 0.10895922746781116, + "grad_norm": 0.376953125, + "learning_rate": 3.631974248927039e-06, + "loss": 2.1464, + "step": 2031 + }, + { + "epoch": 0.10901287553648069, + "grad_norm": 0.474609375, + "learning_rate": 3.6337625178826898e-06, + "loss": 2.3748, + "step": 2032 + }, + { + "epoch": 0.10906652360515022, + "grad_norm": 0.37109375, + "learning_rate": 3.6355507868383407e-06, + "loss": 2.3743, + "step": 2033 + }, + { + "epoch": 0.10912017167381974, + "grad_norm": 0.365234375, + "learning_rate": 3.637339055793992e-06, + "loss": 2.1516, + "step": 2034 + }, + { + "epoch": 0.10917381974248927, + "grad_norm": 0.294921875, + "learning_rate": 3.639127324749643e-06, + "loss": 2.4735, + "step": 2035 + }, + { + "epoch": 0.1092274678111588, + "grad_norm": 0.2578125, + "learning_rate": 3.6409155937052937e-06, + "loss": 2.1733, + "step": 2036 + }, + { + "epoch": 0.10928111587982832, + "grad_norm": 0.2294921875, + "learning_rate": 3.6427038626609446e-06, + "loss": 2.1797, + "step": 2037 + }, + { + "epoch": 0.10933476394849785, + "grad_norm": 0.28125, + "learning_rate": 3.6444921316165955e-06, + "loss": 2.442, + "step": 2038 + }, + { + "epoch": 0.10938841201716738, + "grad_norm": 0.470703125, + "learning_rate": 3.6462804005722463e-06, + "loss": 1.5494, + "step": 2039 + }, + { + "epoch": 0.10944206008583691, + "grad_norm": 0.48828125, + "learning_rate": 3.648068669527897e-06, + "loss": 2.2333, + "step": 2040 + }, + { + "epoch": 0.10949570815450643, + "grad_norm": 0.267578125, + "learning_rate": 3.649856938483548e-06, + "loss": 2.2534, + "step": 2041 + }, + { + "epoch": 0.10954935622317596, + "grad_norm": 0.4375, + "learning_rate": 3.651645207439199e-06, + "loss": 2.3841, + "step": 2042 + }, + { + "epoch": 0.1096030042918455, + "grad_norm": 0.35546875, + "learning_rate": 3.6534334763948503e-06, + "loss": 2.3175, + "step": 2043 + }, + { + "epoch": 0.10965665236051503, + "grad_norm": 0.32421875, + "learning_rate": 3.655221745350501e-06, + "loss": 2.5703, + "step": 2044 + }, + { + "epoch": 0.10971030042918455, + "grad_norm": 0.361328125, + "learning_rate": 3.657010014306152e-06, + "loss": 2.3797, + "step": 2045 + }, + { + "epoch": 0.10976394849785408, + "grad_norm": 0.263671875, + "learning_rate": 3.658798283261803e-06, + "loss": 2.0358, + "step": 2046 + }, + { + "epoch": 0.10981759656652361, + "grad_norm": 0.287109375, + "learning_rate": 3.6605865522174538e-06, + "loss": 2.2185, + "step": 2047 + }, + { + "epoch": 0.10987124463519313, + "grad_norm": 0.33984375, + "learning_rate": 3.6623748211731046e-06, + "loss": 2.0697, + "step": 2048 + }, + { + "epoch": 0.10992489270386266, + "grad_norm": 0.318359375, + "learning_rate": 3.6641630901287555e-06, + "loss": 2.4192, + "step": 2049 + }, + { + "epoch": 0.10997854077253219, + "grad_norm": 0.384765625, + "learning_rate": 3.6659513590844064e-06, + "loss": 2.2785, + "step": 2050 + }, + { + "epoch": 0.11003218884120172, + "grad_norm": 0.279296875, + "learning_rate": 3.6677396280400573e-06, + "loss": 2.2398, + "step": 2051 + }, + { + "epoch": 0.11008583690987124, + "grad_norm": 0.3203125, + "learning_rate": 3.669527896995708e-06, + "loss": 2.3269, + "step": 2052 + }, + { + "epoch": 0.11013948497854077, + "grad_norm": 0.26953125, + "learning_rate": 3.6713161659513594e-06, + "loss": 2.3572, + "step": 2053 + }, + { + "epoch": 0.1101931330472103, + "grad_norm": 0.46484375, + "learning_rate": 3.6731044349070103e-06, + "loss": 2.5661, + "step": 2054 + }, + { + "epoch": 0.11024678111587982, + "grad_norm": 0.408203125, + "learning_rate": 3.674892703862661e-06, + "loss": 2.4468, + "step": 2055 + }, + { + "epoch": 0.11030042918454935, + "grad_norm": 0.314453125, + "learning_rate": 3.676680972818312e-06, + "loss": 2.1503, + "step": 2056 + }, + { + "epoch": 0.11035407725321889, + "grad_norm": 0.384765625, + "learning_rate": 3.678469241773963e-06, + "loss": 2.3768, + "step": 2057 + }, + { + "epoch": 0.11040772532188842, + "grad_norm": 0.357421875, + "learning_rate": 3.680257510729614e-06, + "loss": 2.3515, + "step": 2058 + }, + { + "epoch": 0.11046137339055793, + "grad_norm": 0.328125, + "learning_rate": 3.6820457796852647e-06, + "loss": 2.4019, + "step": 2059 + }, + { + "epoch": 0.11051502145922747, + "grad_norm": 0.341796875, + "learning_rate": 3.6838340486409156e-06, + "loss": 2.1538, + "step": 2060 + }, + { + "epoch": 0.110568669527897, + "grad_norm": 0.369140625, + "learning_rate": 3.6856223175965664e-06, + "loss": 2.6055, + "step": 2061 + }, + { + "epoch": 0.11062231759656653, + "grad_norm": 0.26171875, + "learning_rate": 3.6874105865522173e-06, + "loss": 2.5345, + "step": 2062 + }, + { + "epoch": 0.11067596566523605, + "grad_norm": 0.361328125, + "learning_rate": 3.689198855507869e-06, + "loss": 2.3935, + "step": 2063 + }, + { + "epoch": 0.11072961373390558, + "grad_norm": 0.271484375, + "learning_rate": 3.69098712446352e-06, + "loss": 2.4324, + "step": 2064 + }, + { + "epoch": 0.11078326180257511, + "grad_norm": 0.310546875, + "learning_rate": 3.692775393419171e-06, + "loss": 2.3122, + "step": 2065 + }, + { + "epoch": 0.11083690987124463, + "grad_norm": 0.294921875, + "learning_rate": 3.6945636623748217e-06, + "loss": 2.3617, + "step": 2066 + }, + { + "epoch": 0.11089055793991416, + "grad_norm": 0.337890625, + "learning_rate": 3.6963519313304725e-06, + "loss": 2.4604, + "step": 2067 + }, + { + "epoch": 0.11094420600858369, + "grad_norm": 0.37109375, + "learning_rate": 3.6981402002861234e-06, + "loss": 2.4305, + "step": 2068 + }, + { + "epoch": 0.11099785407725322, + "grad_norm": 0.5234375, + "learning_rate": 3.6999284692417743e-06, + "loss": 2.3713, + "step": 2069 + }, + { + "epoch": 0.11105150214592274, + "grad_norm": 0.29296875, + "learning_rate": 3.701716738197425e-06, + "loss": 2.0554, + "step": 2070 + }, + { + "epoch": 0.11110515021459227, + "grad_norm": 0.306640625, + "learning_rate": 3.703505007153076e-06, + "loss": 2.5994, + "step": 2071 + }, + { + "epoch": 0.1111587982832618, + "grad_norm": 0.32421875, + "learning_rate": 3.705293276108727e-06, + "loss": 2.4616, + "step": 2072 + }, + { + "epoch": 0.11121244635193132, + "grad_norm": 0.330078125, + "learning_rate": 3.7070815450643782e-06, + "loss": 2.2356, + "step": 2073 + }, + { + "epoch": 0.11126609442060086, + "grad_norm": 0.50390625, + "learning_rate": 3.708869814020029e-06, + "loss": 1.6502, + "step": 2074 + }, + { + "epoch": 0.11131974248927039, + "grad_norm": 0.384765625, + "learning_rate": 3.71065808297568e-06, + "loss": 1.4668, + "step": 2075 + }, + { + "epoch": 0.11137339055793992, + "grad_norm": 0.251953125, + "learning_rate": 3.712446351931331e-06, + "loss": 2.4413, + "step": 2076 + }, + { + "epoch": 0.11142703862660944, + "grad_norm": 0.26953125, + "learning_rate": 3.7142346208869817e-06, + "loss": 2.3826, + "step": 2077 + }, + { + "epoch": 0.11148068669527897, + "grad_norm": 0.55078125, + "learning_rate": 3.7160228898426326e-06, + "loss": 2.3511, + "step": 2078 + }, + { + "epoch": 0.1115343347639485, + "grad_norm": 0.306640625, + "learning_rate": 3.7178111587982835e-06, + "loss": 2.2094, + "step": 2079 + }, + { + "epoch": 0.11158798283261803, + "grad_norm": 0.318359375, + "learning_rate": 3.7195994277539344e-06, + "loss": 2.2363, + "step": 2080 + }, + { + "epoch": 0.11164163090128755, + "grad_norm": 0.49609375, + "learning_rate": 3.7213876967095852e-06, + "loss": 2.3397, + "step": 2081 + }, + { + "epoch": 0.11169527896995708, + "grad_norm": 0.337890625, + "learning_rate": 3.723175965665236e-06, + "loss": 2.4028, + "step": 2082 + }, + { + "epoch": 0.11174892703862661, + "grad_norm": 0.294921875, + "learning_rate": 3.7249642346208874e-06, + "loss": 2.4442, + "step": 2083 + }, + { + "epoch": 0.11180257510729613, + "grad_norm": 0.94140625, + "learning_rate": 3.7267525035765383e-06, + "loss": 2.3734, + "step": 2084 + }, + { + "epoch": 0.11185622317596566, + "grad_norm": 0.25390625, + "learning_rate": 3.728540772532189e-06, + "loss": 2.0349, + "step": 2085 + }, + { + "epoch": 0.1119098712446352, + "grad_norm": 0.3203125, + "learning_rate": 3.73032904148784e-06, + "loss": 2.12, + "step": 2086 + }, + { + "epoch": 0.11196351931330473, + "grad_norm": 0.30859375, + "learning_rate": 3.732117310443491e-06, + "loss": 2.4551, + "step": 2087 + }, + { + "epoch": 0.11201716738197424, + "grad_norm": 0.427734375, + "learning_rate": 3.733905579399142e-06, + "loss": 2.2976, + "step": 2088 + }, + { + "epoch": 0.11207081545064378, + "grad_norm": 0.314453125, + "learning_rate": 3.7356938483547927e-06, + "loss": 2.5275, + "step": 2089 + }, + { + "epoch": 0.11212446351931331, + "grad_norm": 1.734375, + "learning_rate": 3.7374821173104435e-06, + "loss": 2.2941, + "step": 2090 + }, + { + "epoch": 0.11217811158798283, + "grad_norm": 0.337890625, + "learning_rate": 3.7392703862660944e-06, + "loss": 2.1755, + "step": 2091 + }, + { + "epoch": 0.11223175965665236, + "grad_norm": 0.353515625, + "learning_rate": 3.7410586552217453e-06, + "loss": 2.4996, + "step": 2092 + }, + { + "epoch": 0.11228540772532189, + "grad_norm": 0.310546875, + "learning_rate": 3.742846924177397e-06, + "loss": 2.2313, + "step": 2093 + }, + { + "epoch": 0.11233905579399142, + "grad_norm": 1.921875, + "learning_rate": 3.744635193133048e-06, + "loss": 2.3327, + "step": 2094 + }, + { + "epoch": 0.11239270386266094, + "grad_norm": 0.48828125, + "learning_rate": 3.7464234620886988e-06, + "loss": 2.0455, + "step": 2095 + }, + { + "epoch": 0.11244635193133047, + "grad_norm": 0.30078125, + "learning_rate": 3.7482117310443496e-06, + "loss": 2.3301, + "step": 2096 + }, + { + "epoch": 0.1125, + "grad_norm": 0.376953125, + "learning_rate": 3.7500000000000005e-06, + "loss": 2.2528, + "step": 2097 + }, + { + "epoch": 0.11255364806866953, + "grad_norm": 0.451171875, + "learning_rate": 3.751788268955651e-06, + "loss": 2.5271, + "step": 2098 + }, + { + "epoch": 0.11260729613733905, + "grad_norm": 0.62890625, + "learning_rate": 3.753576537911302e-06, + "loss": 2.1511, + "step": 2099 + }, + { + "epoch": 0.11266094420600858, + "grad_norm": 0.3203125, + "learning_rate": 3.7553648068669527e-06, + "loss": 2.2892, + "step": 2100 + }, + { + "epoch": 0.11271459227467812, + "grad_norm": 0.271484375, + "learning_rate": 3.7571530758226036e-06, + "loss": 2.1526, + "step": 2101 + }, + { + "epoch": 0.11276824034334763, + "grad_norm": 0.333984375, + "learning_rate": 3.7589413447782553e-06, + "loss": 1.9284, + "step": 2102 + }, + { + "epoch": 0.11282188841201717, + "grad_norm": 0.33203125, + "learning_rate": 3.760729613733906e-06, + "loss": 2.2432, + "step": 2103 + }, + { + "epoch": 0.1128755364806867, + "grad_norm": 0.625, + "learning_rate": 3.762517882689557e-06, + "loss": 2.5191, + "step": 2104 + }, + { + "epoch": 0.11292918454935623, + "grad_norm": 0.375, + "learning_rate": 3.764306151645208e-06, + "loss": 2.4555, + "step": 2105 + }, + { + "epoch": 0.11298283261802575, + "grad_norm": 0.265625, + "learning_rate": 3.766094420600859e-06, + "loss": 2.3928, + "step": 2106 + }, + { + "epoch": 0.11303648068669528, + "grad_norm": 0.58203125, + "learning_rate": 3.7678826895565097e-06, + "loss": 2.5085, + "step": 2107 + }, + { + "epoch": 0.11309012875536481, + "grad_norm": 0.279296875, + "learning_rate": 3.7696709585121606e-06, + "loss": 2.4378, + "step": 2108 + }, + { + "epoch": 0.11314377682403433, + "grad_norm": 0.287109375, + "learning_rate": 3.7714592274678115e-06, + "loss": 2.2744, + "step": 2109 + }, + { + "epoch": 0.11319742489270386, + "grad_norm": 0.333984375, + "learning_rate": 3.7732474964234623e-06, + "loss": 2.3983, + "step": 2110 + }, + { + "epoch": 0.11325107296137339, + "grad_norm": 0.255859375, + "learning_rate": 3.775035765379113e-06, + "loss": 2.2542, + "step": 2111 + }, + { + "epoch": 0.11330472103004292, + "grad_norm": 0.287109375, + "learning_rate": 3.7768240343347645e-06, + "loss": 2.2811, + "step": 2112 + }, + { + "epoch": 0.11335836909871244, + "grad_norm": 0.4140625, + "learning_rate": 3.7786123032904154e-06, + "loss": 2.5827, + "step": 2113 + }, + { + "epoch": 0.11341201716738197, + "grad_norm": 0.5078125, + "learning_rate": 3.7804005722460663e-06, + "loss": 2.3535, + "step": 2114 + }, + { + "epoch": 0.1134656652360515, + "grad_norm": 0.36328125, + "learning_rate": 3.782188841201717e-06, + "loss": 2.4892, + "step": 2115 + }, + { + "epoch": 0.11351931330472104, + "grad_norm": 0.326171875, + "learning_rate": 3.783977110157368e-06, + "loss": 2.5538, + "step": 2116 + }, + { + "epoch": 0.11357296137339055, + "grad_norm": 0.361328125, + "learning_rate": 3.785765379113019e-06, + "loss": 2.2038, + "step": 2117 + }, + { + "epoch": 0.11362660944206009, + "grad_norm": 0.38671875, + "learning_rate": 3.7875536480686698e-06, + "loss": 2.3133, + "step": 2118 + }, + { + "epoch": 0.11368025751072962, + "grad_norm": 0.35546875, + "learning_rate": 3.7893419170243206e-06, + "loss": 2.723, + "step": 2119 + }, + { + "epoch": 0.11373390557939914, + "grad_norm": 0.373046875, + "learning_rate": 3.7911301859799715e-06, + "loss": 2.372, + "step": 2120 + }, + { + "epoch": 0.11378755364806867, + "grad_norm": 0.388671875, + "learning_rate": 3.7929184549356224e-06, + "loss": 2.3645, + "step": 2121 + }, + { + "epoch": 0.1138412017167382, + "grad_norm": 0.287109375, + "learning_rate": 3.7947067238912737e-06, + "loss": 2.3282, + "step": 2122 + }, + { + "epoch": 0.11389484978540773, + "grad_norm": 0.283203125, + "learning_rate": 3.7964949928469246e-06, + "loss": 2.2937, + "step": 2123 + }, + { + "epoch": 0.11394849785407725, + "grad_norm": 0.625, + "learning_rate": 3.7982832618025754e-06, + "loss": 1.9435, + "step": 2124 + }, + { + "epoch": 0.11400214592274678, + "grad_norm": 0.271484375, + "learning_rate": 3.8000715307582263e-06, + "loss": 2.0551, + "step": 2125 + }, + { + "epoch": 0.11405579399141631, + "grad_norm": 0.2734375, + "learning_rate": 3.801859799713877e-06, + "loss": 2.2766, + "step": 2126 + }, + { + "epoch": 0.11410944206008583, + "grad_norm": 0.271484375, + "learning_rate": 3.803648068669528e-06, + "loss": 2.4182, + "step": 2127 + }, + { + "epoch": 0.11416309012875536, + "grad_norm": 0.29296875, + "learning_rate": 3.805436337625179e-06, + "loss": 2.2064, + "step": 2128 + }, + { + "epoch": 0.1142167381974249, + "grad_norm": 0.310546875, + "learning_rate": 3.80722460658083e-06, + "loss": 2.2537, + "step": 2129 + }, + { + "epoch": 0.11427038626609443, + "grad_norm": 0.359375, + "learning_rate": 3.8090128755364807e-06, + "loss": 2.3787, + "step": 2130 + }, + { + "epoch": 0.11432403433476394, + "grad_norm": 0.25390625, + "learning_rate": 3.8108011444921316e-06, + "loss": 2.4839, + "step": 2131 + }, + { + "epoch": 0.11437768240343348, + "grad_norm": 0.314453125, + "learning_rate": 3.8125894134477833e-06, + "loss": 2.3926, + "step": 2132 + }, + { + "epoch": 0.11443133047210301, + "grad_norm": 0.28125, + "learning_rate": 3.814377682403434e-06, + "loss": 2.3715, + "step": 2133 + }, + { + "epoch": 0.11448497854077254, + "grad_norm": 0.4921875, + "learning_rate": 3.816165951359085e-06, + "loss": 2.4406, + "step": 2134 + }, + { + "epoch": 0.11453862660944206, + "grad_norm": 0.78125, + "learning_rate": 3.8179542203147355e-06, + "loss": 2.519, + "step": 2135 + }, + { + "epoch": 0.11459227467811159, + "grad_norm": 0.423828125, + "learning_rate": 3.819742489270387e-06, + "loss": 2.3861, + "step": 2136 + }, + { + "epoch": 0.11464592274678112, + "grad_norm": 0.5, + "learning_rate": 3.821530758226037e-06, + "loss": 2.3669, + "step": 2137 + }, + { + "epoch": 0.11469957081545064, + "grad_norm": 0.34375, + "learning_rate": 3.8233190271816885e-06, + "loss": 2.51, + "step": 2138 + }, + { + "epoch": 0.11475321888412017, + "grad_norm": 0.55859375, + "learning_rate": 3.825107296137339e-06, + "loss": 2.1557, + "step": 2139 + }, + { + "epoch": 0.1148068669527897, + "grad_norm": 0.271484375, + "learning_rate": 3.82689556509299e-06, + "loss": 1.9156, + "step": 2140 + }, + { + "epoch": 0.11486051502145923, + "grad_norm": 0.291015625, + "learning_rate": 3.828683834048641e-06, + "loss": 2.2557, + "step": 2141 + }, + { + "epoch": 0.11491416309012875, + "grad_norm": 0.390625, + "learning_rate": 3.830472103004292e-06, + "loss": 2.4983, + "step": 2142 + }, + { + "epoch": 0.11496781115879828, + "grad_norm": 0.34765625, + "learning_rate": 3.832260371959943e-06, + "loss": 2.3592, + "step": 2143 + }, + { + "epoch": 0.11502145922746781, + "grad_norm": 0.326171875, + "learning_rate": 3.834048640915594e-06, + "loss": 2.425, + "step": 2144 + }, + { + "epoch": 0.11507510729613733, + "grad_norm": 0.400390625, + "learning_rate": 3.835836909871245e-06, + "loss": 2.3276, + "step": 2145 + }, + { + "epoch": 0.11512875536480686, + "grad_norm": 0.302734375, + "learning_rate": 3.8376251788268956e-06, + "loss": 2.3912, + "step": 2146 + }, + { + "epoch": 0.1151824034334764, + "grad_norm": 0.34375, + "learning_rate": 3.839413447782547e-06, + "loss": 2.6002, + "step": 2147 + }, + { + "epoch": 0.11523605150214593, + "grad_norm": 0.302734375, + "learning_rate": 3.841201716738197e-06, + "loss": 2.1401, + "step": 2148 + }, + { + "epoch": 0.11528969957081545, + "grad_norm": 0.3359375, + "learning_rate": 3.842989985693849e-06, + "loss": 1.7984, + "step": 2149 + }, + { + "epoch": 0.11534334763948498, + "grad_norm": 0.24609375, + "learning_rate": 3.844778254649499e-06, + "loss": 2.2028, + "step": 2150 + }, + { + "epoch": 0.11539699570815451, + "grad_norm": 0.2890625, + "learning_rate": 3.84656652360515e-06, + "loss": 2.25, + "step": 2151 + }, + { + "epoch": 0.11545064377682404, + "grad_norm": 0.36328125, + "learning_rate": 3.848354792560802e-06, + "loss": 2.2792, + "step": 2152 + }, + { + "epoch": 0.11550429184549356, + "grad_norm": 0.3359375, + "learning_rate": 3.850143061516453e-06, + "loss": 2.5209, + "step": 2153 + }, + { + "epoch": 0.11555793991416309, + "grad_norm": 0.296875, + "learning_rate": 3.851931330472103e-06, + "loss": 2.3123, + "step": 2154 + }, + { + "epoch": 0.11561158798283262, + "grad_norm": 0.271484375, + "learning_rate": 3.853719599427755e-06, + "loss": 2.4092, + "step": 2155 + }, + { + "epoch": 0.11566523605150214, + "grad_norm": 0.375, + "learning_rate": 3.855507868383405e-06, + "loss": 2.4056, + "step": 2156 + }, + { + "epoch": 0.11571888412017167, + "grad_norm": 0.275390625, + "learning_rate": 3.8572961373390565e-06, + "loss": 2.4188, + "step": 2157 + }, + { + "epoch": 0.1157725321888412, + "grad_norm": 0.310546875, + "learning_rate": 3.859084406294707e-06, + "loss": 2.4943, + "step": 2158 + }, + { + "epoch": 0.11582618025751074, + "grad_norm": 0.291015625, + "learning_rate": 3.860872675250358e-06, + "loss": 2.2302, + "step": 2159 + }, + { + "epoch": 0.11587982832618025, + "grad_norm": 0.25, + "learning_rate": 3.862660944206009e-06, + "loss": 1.9849, + "step": 2160 + }, + { + "epoch": 0.11593347639484979, + "grad_norm": 0.32421875, + "learning_rate": 3.86444921316166e-06, + "loss": 2.3456, + "step": 2161 + }, + { + "epoch": 0.11598712446351932, + "grad_norm": 0.318359375, + "learning_rate": 3.866237482117311e-06, + "loss": 2.3134, + "step": 2162 + }, + { + "epoch": 0.11604077253218884, + "grad_norm": 0.30859375, + "learning_rate": 3.868025751072962e-06, + "loss": 2.2202, + "step": 2163 + }, + { + "epoch": 0.11609442060085837, + "grad_norm": 0.294921875, + "learning_rate": 3.869814020028613e-06, + "loss": 2.4422, + "step": 2164 + }, + { + "epoch": 0.1161480686695279, + "grad_norm": 0.4375, + "learning_rate": 3.8716022889842635e-06, + "loss": 2.3571, + "step": 2165 + }, + { + "epoch": 0.11620171673819743, + "grad_norm": 0.306640625, + "learning_rate": 3.873390557939915e-06, + "loss": 2.5081, + "step": 2166 + }, + { + "epoch": 0.11625536480686695, + "grad_norm": 0.3203125, + "learning_rate": 3.875178826895565e-06, + "loss": 2.4022, + "step": 2167 + }, + { + "epoch": 0.11630901287553648, + "grad_norm": 0.25, + "learning_rate": 3.8769670958512165e-06, + "loss": 2.2048, + "step": 2168 + }, + { + "epoch": 0.11636266094420601, + "grad_norm": 0.318359375, + "learning_rate": 3.878755364806867e-06, + "loss": 2.449, + "step": 2169 + }, + { + "epoch": 0.11641630901287553, + "grad_norm": 0.4375, + "learning_rate": 3.880543633762518e-06, + "loss": 2.4724, + "step": 2170 + }, + { + "epoch": 0.11646995708154506, + "grad_norm": 1.046875, + "learning_rate": 3.8823319027181696e-06, + "loss": 2.3919, + "step": 2171 + }, + { + "epoch": 0.1165236051502146, + "grad_norm": 0.330078125, + "learning_rate": 3.88412017167382e-06, + "loss": 2.1569, + "step": 2172 + }, + { + "epoch": 0.11657725321888412, + "grad_norm": 0.326171875, + "learning_rate": 3.885908440629471e-06, + "loss": 2.0496, + "step": 2173 + }, + { + "epoch": 0.11663090128755364, + "grad_norm": 0.3671875, + "learning_rate": 3.887696709585122e-06, + "loss": 2.6973, + "step": 2174 + }, + { + "epoch": 0.11668454935622317, + "grad_norm": 0.3203125, + "learning_rate": 3.889484978540773e-06, + "loss": 1.9998, + "step": 2175 + }, + { + "epoch": 0.1167381974248927, + "grad_norm": 0.267578125, + "learning_rate": 3.8912732474964235e-06, + "loss": 2.3962, + "step": 2176 + }, + { + "epoch": 0.11679184549356224, + "grad_norm": 0.30859375, + "learning_rate": 3.893061516452075e-06, + "loss": 2.395, + "step": 2177 + }, + { + "epoch": 0.11684549356223176, + "grad_norm": 0.33984375, + "learning_rate": 3.894849785407725e-06, + "loss": 2.2053, + "step": 2178 + }, + { + "epoch": 0.11689914163090129, + "grad_norm": 0.37109375, + "learning_rate": 3.8966380543633766e-06, + "loss": 2.3221, + "step": 2179 + }, + { + "epoch": 0.11695278969957082, + "grad_norm": 0.470703125, + "learning_rate": 3.898426323319027e-06, + "loss": 2.4779, + "step": 2180 + }, + { + "epoch": 0.11700643776824034, + "grad_norm": 0.32421875, + "learning_rate": 3.900214592274678e-06, + "loss": 2.3247, + "step": 2181 + }, + { + "epoch": 0.11706008583690987, + "grad_norm": 3.96875, + "learning_rate": 3.90200286123033e-06, + "loss": 2.5246, + "step": 2182 + }, + { + "epoch": 0.1171137339055794, + "grad_norm": 0.47265625, + "learning_rate": 3.90379113018598e-06, + "loss": 2.2556, + "step": 2183 + }, + { + "epoch": 0.11716738197424893, + "grad_norm": 0.326171875, + "learning_rate": 3.905579399141631e-06, + "loss": 2.6245, + "step": 2184 + }, + { + "epoch": 0.11722103004291845, + "grad_norm": 0.310546875, + "learning_rate": 3.907367668097282e-06, + "loss": 2.3562, + "step": 2185 + }, + { + "epoch": 0.11727467811158798, + "grad_norm": 0.4296875, + "learning_rate": 3.909155937052933e-06, + "loss": 2.4487, + "step": 2186 + }, + { + "epoch": 0.11732832618025751, + "grad_norm": 0.341796875, + "learning_rate": 3.910944206008584e-06, + "loss": 2.1309, + "step": 2187 + }, + { + "epoch": 0.11738197424892703, + "grad_norm": 0.28515625, + "learning_rate": 3.912732474964235e-06, + "loss": 2.2199, + "step": 2188 + }, + { + "epoch": 0.11743562231759656, + "grad_norm": 0.3125, + "learning_rate": 3.914520743919885e-06, + "loss": 2.3453, + "step": 2189 + }, + { + "epoch": 0.1174892703862661, + "grad_norm": 0.33203125, + "learning_rate": 3.916309012875537e-06, + "loss": 2.2479, + "step": 2190 + }, + { + "epoch": 0.11754291845493563, + "grad_norm": 0.5625, + "learning_rate": 3.918097281831188e-06, + "loss": 2.7141, + "step": 2191 + }, + { + "epoch": 0.11759656652360514, + "grad_norm": 0.326171875, + "learning_rate": 3.919885550786839e-06, + "loss": 2.263, + "step": 2192 + }, + { + "epoch": 0.11765021459227468, + "grad_norm": 0.32421875, + "learning_rate": 3.92167381974249e-06, + "loss": 2.2928, + "step": 2193 + }, + { + "epoch": 0.11770386266094421, + "grad_norm": 0.6953125, + "learning_rate": 3.923462088698141e-06, + "loss": 2.3435, + "step": 2194 + }, + { + "epoch": 0.11775751072961374, + "grad_norm": 0.3203125, + "learning_rate": 3.9252503576537914e-06, + "loss": 2.5288, + "step": 2195 + }, + { + "epoch": 0.11781115879828326, + "grad_norm": 0.294921875, + "learning_rate": 3.927038626609443e-06, + "loss": 2.2515, + "step": 2196 + }, + { + "epoch": 0.11786480686695279, + "grad_norm": 0.388671875, + "learning_rate": 3.928826895565093e-06, + "loss": 2.3688, + "step": 2197 + }, + { + "epoch": 0.11791845493562232, + "grad_norm": 0.31640625, + "learning_rate": 3.9306151645207445e-06, + "loss": 2.3643, + "step": 2198 + }, + { + "epoch": 0.11797210300429184, + "grad_norm": 0.494140625, + "learning_rate": 3.932403433476395e-06, + "loss": 2.4439, + "step": 2199 + }, + { + "epoch": 0.11802575107296137, + "grad_norm": 0.359375, + "learning_rate": 3.934191702432046e-06, + "loss": 2.5429, + "step": 2200 + }, + { + "epoch": 0.1180793991416309, + "grad_norm": 0.486328125, + "learning_rate": 3.9359799713876975e-06, + "loss": 2.5077, + "step": 2201 + }, + { + "epoch": 0.11813304721030043, + "grad_norm": 0.54296875, + "learning_rate": 3.937768240343348e-06, + "loss": 2.4334, + "step": 2202 + }, + { + "epoch": 0.11818669527896995, + "grad_norm": 0.3203125, + "learning_rate": 3.939556509298999e-06, + "loss": 2.1058, + "step": 2203 + }, + { + "epoch": 0.11824034334763948, + "grad_norm": 0.4375, + "learning_rate": 3.94134477825465e-06, + "loss": 2.4818, + "step": 2204 + }, + { + "epoch": 0.11829399141630902, + "grad_norm": 0.291015625, + "learning_rate": 3.943133047210301e-06, + "loss": 2.4755, + "step": 2205 + }, + { + "epoch": 0.11834763948497853, + "grad_norm": 0.291015625, + "learning_rate": 3.9449213161659515e-06, + "loss": 2.1629, + "step": 2206 + }, + { + "epoch": 0.11840128755364807, + "grad_norm": 0.31640625, + "learning_rate": 3.946709585121603e-06, + "loss": 2.4157, + "step": 2207 + }, + { + "epoch": 0.1184549356223176, + "grad_norm": 0.26171875, + "learning_rate": 3.948497854077253e-06, + "loss": 2.1229, + "step": 2208 + }, + { + "epoch": 0.11850858369098713, + "grad_norm": 0.365234375, + "learning_rate": 3.9502861230329045e-06, + "loss": 2.0797, + "step": 2209 + }, + { + "epoch": 0.11856223175965665, + "grad_norm": 1.6953125, + "learning_rate": 3.952074391988555e-06, + "loss": 2.4303, + "step": 2210 + }, + { + "epoch": 0.11861587982832618, + "grad_norm": 0.359375, + "learning_rate": 3.953862660944206e-06, + "loss": 2.2895, + "step": 2211 + }, + { + "epoch": 0.11866952789699571, + "grad_norm": 0.37109375, + "learning_rate": 3.955650929899858e-06, + "loss": 2.6209, + "step": 2212 + }, + { + "epoch": 0.11872317596566524, + "grad_norm": 0.41796875, + "learning_rate": 3.957439198855508e-06, + "loss": 2.4826, + "step": 2213 + }, + { + "epoch": 0.11877682403433476, + "grad_norm": 0.369140625, + "learning_rate": 3.959227467811159e-06, + "loss": 2.375, + "step": 2214 + }, + { + "epoch": 0.11883047210300429, + "grad_norm": 0.3125, + "learning_rate": 3.96101573676681e-06, + "loss": 2.3093, + "step": 2215 + }, + { + "epoch": 0.11888412017167382, + "grad_norm": 0.291015625, + "learning_rate": 3.962804005722461e-06, + "loss": 2.4422, + "step": 2216 + }, + { + "epoch": 0.11893776824034334, + "grad_norm": 0.3046875, + "learning_rate": 3.9645922746781115e-06, + "loss": 2.2265, + "step": 2217 + }, + { + "epoch": 0.11899141630901287, + "grad_norm": 0.32421875, + "learning_rate": 3.966380543633763e-06, + "loss": 2.4606, + "step": 2218 + }, + { + "epoch": 0.1190450643776824, + "grad_norm": 0.33984375, + "learning_rate": 3.968168812589413e-06, + "loss": 2.2314, + "step": 2219 + }, + { + "epoch": 0.11909871244635194, + "grad_norm": 0.359375, + "learning_rate": 3.969957081545065e-06, + "loss": 2.3227, + "step": 2220 + }, + { + "epoch": 0.11915236051502145, + "grad_norm": 0.353515625, + "learning_rate": 3.971745350500716e-06, + "loss": 2.5058, + "step": 2221 + }, + { + "epoch": 0.11920600858369099, + "grad_norm": 0.310546875, + "learning_rate": 3.973533619456366e-06, + "loss": 2.4485, + "step": 2222 + }, + { + "epoch": 0.11925965665236052, + "grad_norm": 0.4921875, + "learning_rate": 3.975321888412018e-06, + "loss": 2.1844, + "step": 2223 + }, + { + "epoch": 0.11931330472103004, + "grad_norm": 0.93359375, + "learning_rate": 3.977110157367668e-06, + "loss": 2.0816, + "step": 2224 + }, + { + "epoch": 0.11936695278969957, + "grad_norm": 0.38671875, + "learning_rate": 3.978898426323319e-06, + "loss": 2.2378, + "step": 2225 + }, + { + "epoch": 0.1194206008583691, + "grad_norm": 0.3359375, + "learning_rate": 3.98068669527897e-06, + "loss": 2.347, + "step": 2226 + }, + { + "epoch": 0.11947424892703863, + "grad_norm": 0.37109375, + "learning_rate": 3.982474964234621e-06, + "loss": 2.4598, + "step": 2227 + }, + { + "epoch": 0.11952789699570815, + "grad_norm": 0.326171875, + "learning_rate": 3.984263233190272e-06, + "loss": 2.2469, + "step": 2228 + }, + { + "epoch": 0.11958154506437768, + "grad_norm": 1.0, + "learning_rate": 3.986051502145923e-06, + "loss": 2.3343, + "step": 2229 + }, + { + "epoch": 0.11963519313304721, + "grad_norm": 0.2890625, + "learning_rate": 3.987839771101574e-06, + "loss": 2.1662, + "step": 2230 + }, + { + "epoch": 0.11968884120171674, + "grad_norm": 0.326171875, + "learning_rate": 3.9896280400572255e-06, + "loss": 2.423, + "step": 2231 + }, + { + "epoch": 0.11974248927038626, + "grad_norm": 0.265625, + "learning_rate": 3.991416309012876e-06, + "loss": 2.363, + "step": 2232 + }, + { + "epoch": 0.1197961373390558, + "grad_norm": 0.34765625, + "learning_rate": 3.993204577968527e-06, + "loss": 2.3781, + "step": 2233 + }, + { + "epoch": 0.11984978540772533, + "grad_norm": 0.296875, + "learning_rate": 3.994992846924178e-06, + "loss": 2.2162, + "step": 2234 + }, + { + "epoch": 0.11990343347639484, + "grad_norm": 0.515625, + "learning_rate": 3.996781115879829e-06, + "loss": 2.2439, + "step": 2235 + }, + { + "epoch": 0.11995708154506438, + "grad_norm": 0.44140625, + "learning_rate": 3.9985693848354795e-06, + "loss": 2.1383, + "step": 2236 + }, + { + "epoch": 0.12001072961373391, + "grad_norm": 0.314453125, + "learning_rate": 4.000357653791131e-06, + "loss": 2.255, + "step": 2237 + }, + { + "epoch": 0.12006437768240344, + "grad_norm": 0.291015625, + "learning_rate": 4.002145922746781e-06, + "loss": 2.3466, + "step": 2238 + }, + { + "epoch": 0.12011802575107296, + "grad_norm": 0.5703125, + "learning_rate": 4.0039341917024325e-06, + "loss": 2.0544, + "step": 2239 + }, + { + "epoch": 0.12017167381974249, + "grad_norm": 0.439453125, + "learning_rate": 4.005722460658084e-06, + "loss": 2.4871, + "step": 2240 + }, + { + "epoch": 0.12022532188841202, + "grad_norm": 0.3125, + "learning_rate": 4.007510729613734e-06, + "loss": 2.6536, + "step": 2241 + }, + { + "epoch": 0.12027896995708154, + "grad_norm": 0.2890625, + "learning_rate": 4.0092989985693856e-06, + "loss": 2.2285, + "step": 2242 + }, + { + "epoch": 0.12033261802575107, + "grad_norm": 0.58984375, + "learning_rate": 4.011087267525036e-06, + "loss": 2.4802, + "step": 2243 + }, + { + "epoch": 0.1203862660944206, + "grad_norm": 0.27734375, + "learning_rate": 4.012875536480687e-06, + "loss": 2.4202, + "step": 2244 + }, + { + "epoch": 0.12043991416309013, + "grad_norm": 0.283203125, + "learning_rate": 4.014663805436338e-06, + "loss": 1.9998, + "step": 2245 + }, + { + "epoch": 0.12049356223175965, + "grad_norm": 0.3515625, + "learning_rate": 4.016452074391989e-06, + "loss": 2.4737, + "step": 2246 + }, + { + "epoch": 0.12054721030042918, + "grad_norm": 0.2890625, + "learning_rate": 4.0182403433476395e-06, + "loss": 2.3249, + "step": 2247 + }, + { + "epoch": 0.12060085836909872, + "grad_norm": 0.546875, + "learning_rate": 4.020028612303291e-06, + "loss": 2.4467, + "step": 2248 + }, + { + "epoch": 0.12065450643776825, + "grad_norm": 0.466796875, + "learning_rate": 4.021816881258941e-06, + "loss": 1.607, + "step": 2249 + }, + { + "epoch": 0.12070815450643776, + "grad_norm": 0.365234375, + "learning_rate": 4.0236051502145926e-06, + "loss": 2.5862, + "step": 2250 + }, + { + "epoch": 0.1207618025751073, + "grad_norm": 0.32421875, + "learning_rate": 4.025393419170244e-06, + "loss": 2.5477, + "step": 2251 + }, + { + "epoch": 0.12081545064377683, + "grad_norm": 0.35546875, + "learning_rate": 4.027181688125894e-06, + "loss": 2.2915, + "step": 2252 + }, + { + "epoch": 0.12086909871244635, + "grad_norm": 0.275390625, + "learning_rate": 4.028969957081546e-06, + "loss": 2.0082, + "step": 2253 + }, + { + "epoch": 0.12092274678111588, + "grad_norm": 1.71875, + "learning_rate": 4.030758226037196e-06, + "loss": 2.2484, + "step": 2254 + }, + { + "epoch": 0.12097639484978541, + "grad_norm": 0.31640625, + "learning_rate": 4.032546494992847e-06, + "loss": 2.339, + "step": 2255 + }, + { + "epoch": 0.12103004291845494, + "grad_norm": 0.318359375, + "learning_rate": 4.034334763948498e-06, + "loss": 2.475, + "step": 2256 + }, + { + "epoch": 0.12108369098712446, + "grad_norm": 0.271484375, + "learning_rate": 4.036123032904149e-06, + "loss": 2.2974, + "step": 2257 + }, + { + "epoch": 0.12113733905579399, + "grad_norm": 0.34765625, + "learning_rate": 4.0379113018597996e-06, + "loss": 2.1981, + "step": 2258 + }, + { + "epoch": 0.12119098712446352, + "grad_norm": 0.34375, + "learning_rate": 4.039699570815451e-06, + "loss": 2.528, + "step": 2259 + }, + { + "epoch": 0.12124463519313304, + "grad_norm": 0.314453125, + "learning_rate": 4.041487839771102e-06, + "loss": 2.4562, + "step": 2260 + }, + { + "epoch": 0.12129828326180257, + "grad_norm": 1.2578125, + "learning_rate": 4.043276108726753e-06, + "loss": 2.3322, + "step": 2261 + }, + { + "epoch": 0.1213519313304721, + "grad_norm": 0.349609375, + "learning_rate": 4.045064377682404e-06, + "loss": 2.2515, + "step": 2262 + }, + { + "epoch": 0.12140557939914164, + "grad_norm": 0.73046875, + "learning_rate": 4.046852646638054e-06, + "loss": 2.2698, + "step": 2263 + }, + { + "epoch": 0.12145922746781115, + "grad_norm": 0.318359375, + "learning_rate": 4.048640915593706e-06, + "loss": 2.3795, + "step": 2264 + }, + { + "epoch": 0.12151287553648069, + "grad_norm": 0.30859375, + "learning_rate": 4.050429184549356e-06, + "loss": 2.394, + "step": 2265 + }, + { + "epoch": 0.12156652360515022, + "grad_norm": 0.30078125, + "learning_rate": 4.0522174535050074e-06, + "loss": 2.1762, + "step": 2266 + }, + { + "epoch": 0.12162017167381975, + "grad_norm": 0.314453125, + "learning_rate": 4.054005722460658e-06, + "loss": 2.2588, + "step": 2267 + }, + { + "epoch": 0.12167381974248927, + "grad_norm": 0.361328125, + "learning_rate": 4.055793991416309e-06, + "loss": 2.6372, + "step": 2268 + }, + { + "epoch": 0.1217274678111588, + "grad_norm": 0.310546875, + "learning_rate": 4.05758226037196e-06, + "loss": 2.1577, + "step": 2269 + }, + { + "epoch": 0.12178111587982833, + "grad_norm": 0.29296875, + "learning_rate": 4.059370529327612e-06, + "loss": 2.6252, + "step": 2270 + }, + { + "epoch": 0.12183476394849785, + "grad_norm": 0.30859375, + "learning_rate": 4.061158798283262e-06, + "loss": 2.2482, + "step": 2271 + }, + { + "epoch": 0.12188841201716738, + "grad_norm": 0.349609375, + "learning_rate": 4.0629470672389135e-06, + "loss": 2.2392, + "step": 2272 + }, + { + "epoch": 0.12194206008583691, + "grad_norm": 0.3046875, + "learning_rate": 4.064735336194564e-06, + "loss": 2.3367, + "step": 2273 + }, + { + "epoch": 0.12199570815450644, + "grad_norm": 0.3515625, + "learning_rate": 4.066523605150215e-06, + "loss": 2.3145, + "step": 2274 + }, + { + "epoch": 0.12204935622317596, + "grad_norm": 0.384765625, + "learning_rate": 4.068311874105866e-06, + "loss": 2.1383, + "step": 2275 + }, + { + "epoch": 0.1221030042918455, + "grad_norm": 0.306640625, + "learning_rate": 4.070100143061517e-06, + "loss": 2.4898, + "step": 2276 + }, + { + "epoch": 0.12215665236051503, + "grad_norm": 0.33984375, + "learning_rate": 4.0718884120171675e-06, + "loss": 2.0382, + "step": 2277 + }, + { + "epoch": 0.12221030042918454, + "grad_norm": 0.31640625, + "learning_rate": 4.073676680972819e-06, + "loss": 2.4287, + "step": 2278 + }, + { + "epoch": 0.12226394849785407, + "grad_norm": 0.298828125, + "learning_rate": 4.075464949928469e-06, + "loss": 2.0626, + "step": 2279 + }, + { + "epoch": 0.1223175965665236, + "grad_norm": 0.3359375, + "learning_rate": 4.0772532188841205e-06, + "loss": 2.1176, + "step": 2280 + }, + { + "epoch": 0.12237124463519314, + "grad_norm": 0.34375, + "learning_rate": 4.079041487839772e-06, + "loss": 2.2022, + "step": 2281 + }, + { + "epoch": 0.12242489270386266, + "grad_norm": 0.275390625, + "learning_rate": 4.080829756795422e-06, + "loss": 2.0632, + "step": 2282 + }, + { + "epoch": 0.12247854077253219, + "grad_norm": 0.7734375, + "learning_rate": 4.082618025751074e-06, + "loss": 2.414, + "step": 2283 + }, + { + "epoch": 0.12253218884120172, + "grad_norm": 0.3515625, + "learning_rate": 4.084406294706724e-06, + "loss": 2.3139, + "step": 2284 + }, + { + "epoch": 0.12258583690987125, + "grad_norm": 0.337890625, + "learning_rate": 4.086194563662375e-06, + "loss": 2.4407, + "step": 2285 + }, + { + "epoch": 0.12263948497854077, + "grad_norm": 0.9453125, + "learning_rate": 4.087982832618026e-06, + "loss": 2.3742, + "step": 2286 + }, + { + "epoch": 0.1226931330472103, + "grad_norm": 0.3671875, + "learning_rate": 4.089771101573677e-06, + "loss": 2.2795, + "step": 2287 + }, + { + "epoch": 0.12274678111587983, + "grad_norm": 0.490234375, + "learning_rate": 4.0915593705293275e-06, + "loss": 2.1934, + "step": 2288 + }, + { + "epoch": 0.12280042918454935, + "grad_norm": 0.3984375, + "learning_rate": 4.093347639484979e-06, + "loss": 2.7744, + "step": 2289 + }, + { + "epoch": 0.12285407725321888, + "grad_norm": 0.30078125, + "learning_rate": 4.09513590844063e-06, + "loss": 2.4153, + "step": 2290 + }, + { + "epoch": 0.12290772532188841, + "grad_norm": 0.333984375, + "learning_rate": 4.096924177396281e-06, + "loss": 2.251, + "step": 2291 + }, + { + "epoch": 0.12296137339055795, + "grad_norm": 0.248046875, + "learning_rate": 4.098712446351932e-06, + "loss": 2.2073, + "step": 2292 + }, + { + "epoch": 0.12301502145922746, + "grad_norm": 0.306640625, + "learning_rate": 4.100500715307582e-06, + "loss": 2.1259, + "step": 2293 + }, + { + "epoch": 0.123068669527897, + "grad_norm": 0.29296875, + "learning_rate": 4.102288984263234e-06, + "loss": 2.2025, + "step": 2294 + }, + { + "epoch": 0.12312231759656653, + "grad_norm": 2.625, + "learning_rate": 4.104077253218884e-06, + "loss": 2.3774, + "step": 2295 + }, + { + "epoch": 0.12317596566523605, + "grad_norm": 0.3046875, + "learning_rate": 4.105865522174535e-06, + "loss": 2.4393, + "step": 2296 + }, + { + "epoch": 0.12322961373390558, + "grad_norm": 0.3125, + "learning_rate": 4.107653791130186e-06, + "loss": 2.3066, + "step": 2297 + }, + { + "epoch": 0.12328326180257511, + "grad_norm": 0.326171875, + "learning_rate": 4.109442060085837e-06, + "loss": 2.3295, + "step": 2298 + }, + { + "epoch": 0.12333690987124464, + "grad_norm": 0.390625, + "learning_rate": 4.1112303290414884e-06, + "loss": 2.5174, + "step": 2299 + }, + { + "epoch": 0.12339055793991416, + "grad_norm": 0.3828125, + "learning_rate": 4.113018597997139e-06, + "loss": 2.4497, + "step": 2300 + }, + { + "epoch": 0.12344420600858369, + "grad_norm": 0.294921875, + "learning_rate": 4.11480686695279e-06, + "loss": 2.4167, + "step": 2301 + }, + { + "epoch": 0.12349785407725322, + "grad_norm": 0.326171875, + "learning_rate": 4.116595135908441e-06, + "loss": 2.2234, + "step": 2302 + }, + { + "epoch": 0.12355150214592275, + "grad_norm": 0.275390625, + "learning_rate": 4.118383404864092e-06, + "loss": 2.348, + "step": 2303 + }, + { + "epoch": 0.12360515021459227, + "grad_norm": 0.30078125, + "learning_rate": 4.120171673819742e-06, + "loss": 2.5239, + "step": 2304 + }, + { + "epoch": 0.1236587982832618, + "grad_norm": 0.322265625, + "learning_rate": 4.121959942775394e-06, + "loss": 2.7057, + "step": 2305 + }, + { + "epoch": 0.12371244635193133, + "grad_norm": 0.291015625, + "learning_rate": 4.123748211731044e-06, + "loss": 2.1187, + "step": 2306 + }, + { + "epoch": 0.12376609442060085, + "grad_norm": 0.341796875, + "learning_rate": 4.1255364806866955e-06, + "loss": 2.1877, + "step": 2307 + }, + { + "epoch": 0.12381974248927038, + "grad_norm": 0.330078125, + "learning_rate": 4.127324749642346e-06, + "loss": 2.481, + "step": 2308 + }, + { + "epoch": 0.12387339055793992, + "grad_norm": 0.35546875, + "learning_rate": 4.129113018597998e-06, + "loss": 2.4386, + "step": 2309 + }, + { + "epoch": 0.12392703862660945, + "grad_norm": 0.330078125, + "learning_rate": 4.1309012875536485e-06, + "loss": 1.9539, + "step": 2310 + }, + { + "epoch": 0.12398068669527897, + "grad_norm": 1.21875, + "learning_rate": 4.1326895565093e-06, + "loss": 2.4367, + "step": 2311 + }, + { + "epoch": 0.1240343347639485, + "grad_norm": 0.294921875, + "learning_rate": 4.13447782546495e-06, + "loss": 2.3158, + "step": 2312 + }, + { + "epoch": 0.12408798283261803, + "grad_norm": 0.255859375, + "learning_rate": 4.1362660944206016e-06, + "loss": 2.0182, + "step": 2313 + }, + { + "epoch": 0.12414163090128755, + "grad_norm": 0.318359375, + "learning_rate": 4.138054363376252e-06, + "loss": 2.6264, + "step": 2314 + }, + { + "epoch": 0.12419527896995708, + "grad_norm": 0.3359375, + "learning_rate": 4.139842632331903e-06, + "loss": 2.2984, + "step": 2315 + }, + { + "epoch": 0.12424892703862661, + "grad_norm": 0.310546875, + "learning_rate": 4.141630901287554e-06, + "loss": 2.4359, + "step": 2316 + }, + { + "epoch": 0.12430257510729614, + "grad_norm": 0.3359375, + "learning_rate": 4.143419170243205e-06, + "loss": 2.1293, + "step": 2317 + }, + { + "epoch": 0.12435622317596566, + "grad_norm": 0.3125, + "learning_rate": 4.1452074391988555e-06, + "loss": 2.2591, + "step": 2318 + }, + { + "epoch": 0.12440987124463519, + "grad_norm": 0.294921875, + "learning_rate": 4.146995708154507e-06, + "loss": 2.2768, + "step": 2319 + }, + { + "epoch": 0.12446351931330472, + "grad_norm": 0.357421875, + "learning_rate": 4.148783977110158e-06, + "loss": 2.1906, + "step": 2320 + }, + { + "epoch": 0.12451716738197426, + "grad_norm": 0.30859375, + "learning_rate": 4.1505722460658086e-06, + "loss": 2.2269, + "step": 2321 + }, + { + "epoch": 0.12457081545064377, + "grad_norm": 0.306640625, + "learning_rate": 4.15236051502146e-06, + "loss": 2.451, + "step": 2322 + }, + { + "epoch": 0.1246244635193133, + "grad_norm": 0.314453125, + "learning_rate": 4.15414878397711e-06, + "loss": 2.1325, + "step": 2323 + }, + { + "epoch": 0.12467811158798284, + "grad_norm": 0.34375, + "learning_rate": 4.155937052932762e-06, + "loss": 2.3826, + "step": 2324 + }, + { + "epoch": 0.12473175965665236, + "grad_norm": 0.390625, + "learning_rate": 4.157725321888412e-06, + "loss": 2.205, + "step": 2325 + }, + { + "epoch": 0.12478540772532189, + "grad_norm": 0.29296875, + "learning_rate": 4.159513590844063e-06, + "loss": 2.3052, + "step": 2326 + }, + { + "epoch": 0.12483905579399142, + "grad_norm": 0.427734375, + "learning_rate": 4.161301859799714e-06, + "loss": 2.3904, + "step": 2327 + }, + { + "epoch": 0.12489270386266095, + "grad_norm": 0.49609375, + "learning_rate": 4.163090128755365e-06, + "loss": 2.3911, + "step": 2328 + }, + { + "epoch": 0.12494635193133047, + "grad_norm": 0.337890625, + "learning_rate": 4.164878397711016e-06, + "loss": 2.2883, + "step": 2329 + }, + { + "epoch": 0.125, + "grad_norm": 0.35546875, + "learning_rate": 4.166666666666667e-06, + "loss": 2.2196, + "step": 2330 + }, + { + "epoch": 0.12505364806866953, + "grad_norm": 0.392578125, + "learning_rate": 4.168454935622318e-06, + "loss": 2.2411, + "step": 2331 + }, + { + "epoch": 0.12510729613733906, + "grad_norm": 0.40625, + "learning_rate": 4.170243204577969e-06, + "loss": 2.407, + "step": 2332 + }, + { + "epoch": 0.1251609442060086, + "grad_norm": 0.53125, + "learning_rate": 4.17203147353362e-06, + "loss": 2.3722, + "step": 2333 + }, + { + "epoch": 0.1252145922746781, + "grad_norm": 0.337890625, + "learning_rate": 4.17381974248927e-06, + "loss": 2.4601, + "step": 2334 + }, + { + "epoch": 0.12526824034334763, + "grad_norm": 0.345703125, + "learning_rate": 4.175608011444922e-06, + "loss": 2.4901, + "step": 2335 + }, + { + "epoch": 0.12532188841201716, + "grad_norm": 0.408203125, + "learning_rate": 4.177396280400572e-06, + "loss": 2.5503, + "step": 2336 + }, + { + "epoch": 0.1253755364806867, + "grad_norm": 0.322265625, + "learning_rate": 4.179184549356223e-06, + "loss": 2.3725, + "step": 2337 + }, + { + "epoch": 0.12542918454935623, + "grad_norm": 0.359375, + "learning_rate": 4.180972818311874e-06, + "loss": 2.5572, + "step": 2338 + }, + { + "epoch": 0.12548283261802576, + "grad_norm": 0.3828125, + "learning_rate": 4.182761087267526e-06, + "loss": 2.3689, + "step": 2339 + }, + { + "epoch": 0.1255364806866953, + "grad_norm": 0.38671875, + "learning_rate": 4.1845493562231765e-06, + "loss": 2.1153, + "step": 2340 + }, + { + "epoch": 0.1255901287553648, + "grad_norm": 0.34375, + "learning_rate": 4.186337625178828e-06, + "loss": 2.1709, + "step": 2341 + }, + { + "epoch": 0.12564377682403433, + "grad_norm": 0.56640625, + "learning_rate": 4.188125894134478e-06, + "loss": 1.6597, + "step": 2342 + }, + { + "epoch": 0.12569742489270386, + "grad_norm": 0.328125, + "learning_rate": 4.1899141630901295e-06, + "loss": 2.1081, + "step": 2343 + }, + { + "epoch": 0.1257510729613734, + "grad_norm": 0.36328125, + "learning_rate": 4.19170243204578e-06, + "loss": 2.3789, + "step": 2344 + }, + { + "epoch": 0.12580472103004292, + "grad_norm": 0.302734375, + "learning_rate": 4.193490701001431e-06, + "loss": 2.455, + "step": 2345 + }, + { + "epoch": 0.12585836909871245, + "grad_norm": 0.283203125, + "learning_rate": 4.195278969957082e-06, + "loss": 2.2965, + "step": 2346 + }, + { + "epoch": 0.12591201716738198, + "grad_norm": 0.369140625, + "learning_rate": 4.197067238912732e-06, + "loss": 2.4752, + "step": 2347 + }, + { + "epoch": 0.1259656652360515, + "grad_norm": 0.318359375, + "learning_rate": 4.1988555078683835e-06, + "loss": 2.4816, + "step": 2348 + }, + { + "epoch": 0.12601931330472102, + "grad_norm": 0.326171875, + "learning_rate": 4.200643776824035e-06, + "loss": 2.3087, + "step": 2349 + }, + { + "epoch": 0.12607296137339055, + "grad_norm": 0.3125, + "learning_rate": 4.202432045779686e-06, + "loss": 2.2867, + "step": 2350 + }, + { + "epoch": 0.12612660944206008, + "grad_norm": 0.373046875, + "learning_rate": 4.2042203147353365e-06, + "loss": 2.4131, + "step": 2351 + }, + { + "epoch": 0.12618025751072962, + "grad_norm": 0.3984375, + "learning_rate": 4.206008583690988e-06, + "loss": 2.1236, + "step": 2352 + }, + { + "epoch": 0.12623390557939915, + "grad_norm": 0.30859375, + "learning_rate": 4.207796852646638e-06, + "loss": 2.3046, + "step": 2353 + }, + { + "epoch": 0.12628755364806868, + "grad_norm": 0.310546875, + "learning_rate": 4.20958512160229e-06, + "loss": 2.5478, + "step": 2354 + }, + { + "epoch": 0.1263412017167382, + "grad_norm": 0.27734375, + "learning_rate": 4.21137339055794e-06, + "loss": 2.3794, + "step": 2355 + }, + { + "epoch": 0.12639484978540771, + "grad_norm": 0.27734375, + "learning_rate": 4.213161659513591e-06, + "loss": 2.4166, + "step": 2356 + }, + { + "epoch": 0.12644849785407725, + "grad_norm": 0.322265625, + "learning_rate": 4.214949928469242e-06, + "loss": 2.4934, + "step": 2357 + }, + { + "epoch": 0.12650214592274678, + "grad_norm": 0.359375, + "learning_rate": 4.216738197424893e-06, + "loss": 2.3715, + "step": 2358 + }, + { + "epoch": 0.1265557939914163, + "grad_norm": 0.51953125, + "learning_rate": 4.218526466380544e-06, + "loss": 2.3627, + "step": 2359 + }, + { + "epoch": 0.12660944206008584, + "grad_norm": 0.287109375, + "learning_rate": 4.220314735336195e-06, + "loss": 2.3747, + "step": 2360 + }, + { + "epoch": 0.12666309012875537, + "grad_norm": 0.31640625, + "learning_rate": 4.222103004291846e-06, + "loss": 2.4197, + "step": 2361 + }, + { + "epoch": 0.1267167381974249, + "grad_norm": 0.3046875, + "learning_rate": 4.223891273247497e-06, + "loss": 2.0905, + "step": 2362 + }, + { + "epoch": 0.1267703862660944, + "grad_norm": 0.357421875, + "learning_rate": 4.225679542203148e-06, + "loss": 2.5155, + "step": 2363 + }, + { + "epoch": 0.12682403433476394, + "grad_norm": 0.33203125, + "learning_rate": 4.227467811158798e-06, + "loss": 2.6983, + "step": 2364 + }, + { + "epoch": 0.12687768240343347, + "grad_norm": 0.361328125, + "learning_rate": 4.22925608011445e-06, + "loss": 2.0489, + "step": 2365 + }, + { + "epoch": 0.126931330472103, + "grad_norm": 0.296875, + "learning_rate": 4.2310443490701e-06, + "loss": 2.46, + "step": 2366 + }, + { + "epoch": 0.12698497854077254, + "grad_norm": 0.39453125, + "learning_rate": 4.232832618025751e-06, + "loss": 2.1907, + "step": 2367 + }, + { + "epoch": 0.12703862660944207, + "grad_norm": 0.27734375, + "learning_rate": 4.234620886981403e-06, + "loss": 2.3217, + "step": 2368 + }, + { + "epoch": 0.1270922746781116, + "grad_norm": 0.359375, + "learning_rate": 4.236409155937053e-06, + "loss": 2.4051, + "step": 2369 + }, + { + "epoch": 0.1271459227467811, + "grad_norm": 0.375, + "learning_rate": 4.2381974248927044e-06, + "loss": 2.2565, + "step": 2370 + }, + { + "epoch": 0.12719957081545064, + "grad_norm": 0.515625, + "learning_rate": 4.239985693848355e-06, + "loss": 2.6269, + "step": 2371 + }, + { + "epoch": 0.12725321888412017, + "grad_norm": 0.326171875, + "learning_rate": 4.241773962804006e-06, + "loss": 2.3833, + "step": 2372 + }, + { + "epoch": 0.1273068669527897, + "grad_norm": 0.298828125, + "learning_rate": 4.243562231759657e-06, + "loss": 2.1214, + "step": 2373 + }, + { + "epoch": 0.12736051502145923, + "grad_norm": 0.314453125, + "learning_rate": 4.245350500715308e-06, + "loss": 2.2173, + "step": 2374 + }, + { + "epoch": 0.12741416309012876, + "grad_norm": 0.4765625, + "learning_rate": 4.247138769670958e-06, + "loss": 2.7094, + "step": 2375 + }, + { + "epoch": 0.1274678111587983, + "grad_norm": 0.291015625, + "learning_rate": 4.24892703862661e-06, + "loss": 2.412, + "step": 2376 + }, + { + "epoch": 0.1275214592274678, + "grad_norm": 0.3515625, + "learning_rate": 4.25071530758226e-06, + "loss": 2.1372, + "step": 2377 + }, + { + "epoch": 0.12757510729613733, + "grad_norm": 0.337890625, + "learning_rate": 4.252503576537912e-06, + "loss": 2.3329, + "step": 2378 + }, + { + "epoch": 0.12762875536480686, + "grad_norm": 0.3515625, + "learning_rate": 4.254291845493563e-06, + "loss": 2.4013, + "step": 2379 + }, + { + "epoch": 0.1276824034334764, + "grad_norm": 0.84375, + "learning_rate": 4.256080114449214e-06, + "loss": 2.3499, + "step": 2380 + }, + { + "epoch": 0.12773605150214593, + "grad_norm": 0.283203125, + "learning_rate": 4.2578683834048645e-06, + "loss": 2.3622, + "step": 2381 + }, + { + "epoch": 0.12778969957081546, + "grad_norm": 0.267578125, + "learning_rate": 4.259656652360516e-06, + "loss": 2.3571, + "step": 2382 + }, + { + "epoch": 0.127843347639485, + "grad_norm": 0.337890625, + "learning_rate": 4.261444921316166e-06, + "loss": 2.426, + "step": 2383 + }, + { + "epoch": 0.1278969957081545, + "grad_norm": 0.32421875, + "learning_rate": 4.2632331902718175e-06, + "loss": 2.3873, + "step": 2384 + }, + { + "epoch": 0.12795064377682402, + "grad_norm": 0.267578125, + "learning_rate": 4.265021459227468e-06, + "loss": 2.4855, + "step": 2385 + }, + { + "epoch": 0.12800429184549356, + "grad_norm": 0.27734375, + "learning_rate": 4.266809728183119e-06, + "loss": 2.368, + "step": 2386 + }, + { + "epoch": 0.1280579399141631, + "grad_norm": 0.3125, + "learning_rate": 4.26859799713877e-06, + "loss": 2.3733, + "step": 2387 + }, + { + "epoch": 0.12811158798283262, + "grad_norm": 0.34765625, + "learning_rate": 4.270386266094421e-06, + "loss": 2.4825, + "step": 2388 + }, + { + "epoch": 0.12816523605150215, + "grad_norm": 0.400390625, + "learning_rate": 4.272174535050072e-06, + "loss": 2.3239, + "step": 2389 + }, + { + "epoch": 0.12821888412017168, + "grad_norm": 0.322265625, + "learning_rate": 4.273962804005723e-06, + "loss": 2.3973, + "step": 2390 + }, + { + "epoch": 0.12827253218884122, + "grad_norm": 0.41015625, + "learning_rate": 4.275751072961374e-06, + "loss": 2.6471, + "step": 2391 + }, + { + "epoch": 0.12832618025751072, + "grad_norm": 0.5625, + "learning_rate": 4.2775393419170246e-06, + "loss": 2.04, + "step": 2392 + }, + { + "epoch": 0.12837982832618025, + "grad_norm": 0.296875, + "learning_rate": 4.279327610872676e-06, + "loss": 2.24, + "step": 2393 + }, + { + "epoch": 0.12843347639484978, + "grad_norm": 0.32421875, + "learning_rate": 4.281115879828326e-06, + "loss": 2.2296, + "step": 2394 + }, + { + "epoch": 0.12848712446351931, + "grad_norm": 0.33203125, + "learning_rate": 4.282904148783978e-06, + "loss": 2.0129, + "step": 2395 + }, + { + "epoch": 0.12854077253218885, + "grad_norm": 0.302734375, + "learning_rate": 4.284692417739628e-06, + "loss": 2.2294, + "step": 2396 + }, + { + "epoch": 0.12859442060085838, + "grad_norm": 0.3125, + "learning_rate": 4.286480686695279e-06, + "loss": 2.3946, + "step": 2397 + }, + { + "epoch": 0.1286480686695279, + "grad_norm": 1.078125, + "learning_rate": 4.288268955650931e-06, + "loss": 2.4282, + "step": 2398 + }, + { + "epoch": 0.1287017167381974, + "grad_norm": 0.46484375, + "learning_rate": 4.290057224606581e-06, + "loss": 2.3935, + "step": 2399 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 0.3125, + "learning_rate": 4.291845493562232e-06, + "loss": 2.4373, + "step": 2400 + }, + { + "epoch": 0.12880901287553648, + "grad_norm": 0.4375, + "learning_rate": 4.293633762517883e-06, + "loss": 2.2988, + "step": 2401 + }, + { + "epoch": 0.128862660944206, + "grad_norm": 0.35546875, + "learning_rate": 4.295422031473534e-06, + "loss": 2.1422, + "step": 2402 + }, + { + "epoch": 0.12891630901287554, + "grad_norm": 0.3203125, + "learning_rate": 4.297210300429185e-06, + "loss": 2.2053, + "step": 2403 + }, + { + "epoch": 0.12896995708154507, + "grad_norm": 0.33203125, + "learning_rate": 4.298998569384836e-06, + "loss": 2.355, + "step": 2404 + }, + { + "epoch": 0.1290236051502146, + "grad_norm": 0.365234375, + "learning_rate": 4.300786838340486e-06, + "loss": 2.3379, + "step": 2405 + }, + { + "epoch": 0.1290772532188841, + "grad_norm": 0.361328125, + "learning_rate": 4.302575107296138e-06, + "loss": 2.7169, + "step": 2406 + }, + { + "epoch": 0.12913090128755364, + "grad_norm": 0.291015625, + "learning_rate": 4.304363376251788e-06, + "loss": 2.2074, + "step": 2407 + }, + { + "epoch": 0.12918454935622317, + "grad_norm": 0.3203125, + "learning_rate": 4.306151645207439e-06, + "loss": 2.1688, + "step": 2408 + }, + { + "epoch": 0.1292381974248927, + "grad_norm": 0.3125, + "learning_rate": 4.307939914163091e-06, + "loss": 2.1795, + "step": 2409 + }, + { + "epoch": 0.12929184549356224, + "grad_norm": 0.396484375, + "learning_rate": 4.309728183118741e-06, + "loss": 2.6022, + "step": 2410 + }, + { + "epoch": 0.12934549356223177, + "grad_norm": 0.3125, + "learning_rate": 4.3115164520743925e-06, + "loss": 2.6432, + "step": 2411 + }, + { + "epoch": 0.1293991416309013, + "grad_norm": 0.42578125, + "learning_rate": 4.313304721030043e-06, + "loss": 1.8675, + "step": 2412 + }, + { + "epoch": 0.1294527896995708, + "grad_norm": 0.33203125, + "learning_rate": 4.315092989985694e-06, + "loss": 2.4212, + "step": 2413 + }, + { + "epoch": 0.12950643776824033, + "grad_norm": 0.3359375, + "learning_rate": 4.316881258941345e-06, + "loss": 2.4522, + "step": 2414 + }, + { + "epoch": 0.12956008583690987, + "grad_norm": 0.26953125, + "learning_rate": 4.318669527896996e-06, + "loss": 2.038, + "step": 2415 + }, + { + "epoch": 0.1296137339055794, + "grad_norm": 0.353515625, + "learning_rate": 4.320457796852646e-06, + "loss": 2.3895, + "step": 2416 + }, + { + "epoch": 0.12966738197424893, + "grad_norm": 0.296875, + "learning_rate": 4.3222460658082986e-06, + "loss": 2.3928, + "step": 2417 + }, + { + "epoch": 0.12972103004291846, + "grad_norm": 0.318359375, + "learning_rate": 4.324034334763949e-06, + "loss": 2.3931, + "step": 2418 + }, + { + "epoch": 0.129774678111588, + "grad_norm": 0.3359375, + "learning_rate": 4.3258226037196e-06, + "loss": 2.4203, + "step": 2419 + }, + { + "epoch": 0.1298283261802575, + "grad_norm": 0.283203125, + "learning_rate": 4.327610872675251e-06, + "loss": 2.2639, + "step": 2420 + }, + { + "epoch": 0.12988197424892703, + "grad_norm": 0.341796875, + "learning_rate": 4.329399141630902e-06, + "loss": 2.2074, + "step": 2421 + }, + { + "epoch": 0.12993562231759656, + "grad_norm": 0.341796875, + "learning_rate": 4.3311874105865525e-06, + "loss": 2.2854, + "step": 2422 + }, + { + "epoch": 0.1299892703862661, + "grad_norm": 0.494140625, + "learning_rate": 4.332975679542204e-06, + "loss": 2.1781, + "step": 2423 + }, + { + "epoch": 0.13004291845493562, + "grad_norm": 0.61328125, + "learning_rate": 4.334763948497854e-06, + "loss": 1.6977, + "step": 2424 + }, + { + "epoch": 0.13009656652360516, + "grad_norm": 0.330078125, + "learning_rate": 4.3365522174535056e-06, + "loss": 2.401, + "step": 2425 + }, + { + "epoch": 0.1301502145922747, + "grad_norm": 0.3203125, + "learning_rate": 4.338340486409156e-06, + "loss": 2.4613, + "step": 2426 + }, + { + "epoch": 0.13020386266094422, + "grad_norm": 0.314453125, + "learning_rate": 4.340128755364807e-06, + "loss": 2.3405, + "step": 2427 + }, + { + "epoch": 0.13025751072961372, + "grad_norm": 0.287109375, + "learning_rate": 4.341917024320459e-06, + "loss": 2.3144, + "step": 2428 + }, + { + "epoch": 0.13031115879828326, + "grad_norm": 0.423828125, + "learning_rate": 4.343705293276109e-06, + "loss": 1.2648, + "step": 2429 + }, + { + "epoch": 0.1303648068669528, + "grad_norm": 0.33203125, + "learning_rate": 4.34549356223176e-06, + "loss": 2.4152, + "step": 2430 + }, + { + "epoch": 0.13041845493562232, + "grad_norm": 0.31640625, + "learning_rate": 4.347281831187411e-06, + "loss": 2.3419, + "step": 2431 + }, + { + "epoch": 0.13047210300429185, + "grad_norm": 0.416015625, + "learning_rate": 4.349070100143062e-06, + "loss": 2.2502, + "step": 2432 + }, + { + "epoch": 0.13052575107296138, + "grad_norm": 0.31640625, + "learning_rate": 4.350858369098713e-06, + "loss": 2.5858, + "step": 2433 + }, + { + "epoch": 0.13057939914163091, + "grad_norm": 0.28515625, + "learning_rate": 4.352646638054364e-06, + "loss": 2.371, + "step": 2434 + }, + { + "epoch": 0.13063304721030042, + "grad_norm": 0.3515625, + "learning_rate": 4.354434907010014e-06, + "loss": 2.52, + "step": 2435 + }, + { + "epoch": 0.13068669527896995, + "grad_norm": 0.3671875, + "learning_rate": 4.356223175965666e-06, + "loss": 2.5011, + "step": 2436 + }, + { + "epoch": 0.13074034334763948, + "grad_norm": 0.4453125, + "learning_rate": 4.358011444921317e-06, + "loss": 2.3871, + "step": 2437 + }, + { + "epoch": 0.130793991416309, + "grad_norm": 0.3203125, + "learning_rate": 4.359799713876967e-06, + "loss": 2.3866, + "step": 2438 + }, + { + "epoch": 0.13084763948497855, + "grad_norm": 0.61328125, + "learning_rate": 4.361587982832619e-06, + "loss": 2.3212, + "step": 2439 + }, + { + "epoch": 0.13090128755364808, + "grad_norm": 0.390625, + "learning_rate": 4.363376251788269e-06, + "loss": 2.4741, + "step": 2440 + }, + { + "epoch": 0.1309549356223176, + "grad_norm": 0.32421875, + "learning_rate": 4.3651645207439204e-06, + "loss": 2.4285, + "step": 2441 + }, + { + "epoch": 0.1310085836909871, + "grad_norm": 0.365234375, + "learning_rate": 4.366952789699571e-06, + "loss": 2.4339, + "step": 2442 + }, + { + "epoch": 0.13106223175965664, + "grad_norm": 0.412109375, + "learning_rate": 4.368741058655222e-06, + "loss": 2.3463, + "step": 2443 + }, + { + "epoch": 0.13111587982832618, + "grad_norm": 0.32421875, + "learning_rate": 4.370529327610873e-06, + "loss": 2.4214, + "step": 2444 + }, + { + "epoch": 0.1311695278969957, + "grad_norm": 0.478515625, + "learning_rate": 4.372317596566524e-06, + "loss": 2.1768, + "step": 2445 + }, + { + "epoch": 0.13122317596566524, + "grad_norm": 0.34765625, + "learning_rate": 4.374105865522174e-06, + "loss": 2.4294, + "step": 2446 + }, + { + "epoch": 0.13127682403433477, + "grad_norm": 0.302734375, + "learning_rate": 4.375894134477826e-06, + "loss": 2.4205, + "step": 2447 + }, + { + "epoch": 0.1313304721030043, + "grad_norm": 0.376953125, + "learning_rate": 4.377682403433477e-06, + "loss": 2.4307, + "step": 2448 + }, + { + "epoch": 0.1313841201716738, + "grad_norm": 0.50390625, + "learning_rate": 4.3794706723891274e-06, + "loss": 2.2793, + "step": 2449 + }, + { + "epoch": 0.13143776824034334, + "grad_norm": 0.62109375, + "learning_rate": 4.381258941344779e-06, + "loss": 1.4698, + "step": 2450 + }, + { + "epoch": 0.13149141630901287, + "grad_norm": 0.314453125, + "learning_rate": 4.383047210300429e-06, + "loss": 2.5063, + "step": 2451 + }, + { + "epoch": 0.1315450643776824, + "grad_norm": 0.357421875, + "learning_rate": 4.3848354792560805e-06, + "loss": 2.1089, + "step": 2452 + }, + { + "epoch": 0.13159871244635193, + "grad_norm": 0.298828125, + "learning_rate": 4.386623748211731e-06, + "loss": 2.2249, + "step": 2453 + }, + { + "epoch": 0.13165236051502147, + "grad_norm": 0.326171875, + "learning_rate": 4.388412017167382e-06, + "loss": 2.3682, + "step": 2454 + }, + { + "epoch": 0.131706008583691, + "grad_norm": 0.390625, + "learning_rate": 4.390200286123033e-06, + "loss": 2.6533, + "step": 2455 + }, + { + "epoch": 0.1317596566523605, + "grad_norm": 0.279296875, + "learning_rate": 4.391988555078684e-06, + "loss": 2.187, + "step": 2456 + }, + { + "epoch": 0.13181330472103003, + "grad_norm": 0.33984375, + "learning_rate": 4.393776824034335e-06, + "loss": 2.4068, + "step": 2457 + }, + { + "epoch": 0.13186695278969957, + "grad_norm": 0.419921875, + "learning_rate": 4.395565092989987e-06, + "loss": 2.3627, + "step": 2458 + }, + { + "epoch": 0.1319206008583691, + "grad_norm": 0.27734375, + "learning_rate": 4.397353361945637e-06, + "loss": 2.3878, + "step": 2459 + }, + { + "epoch": 0.13197424892703863, + "grad_norm": 0.42578125, + "learning_rate": 4.399141630901288e-06, + "loss": 2.319, + "step": 2460 + }, + { + "epoch": 0.13202789699570816, + "grad_norm": 0.375, + "learning_rate": 4.400929899856939e-06, + "loss": 2.1403, + "step": 2461 + }, + { + "epoch": 0.1320815450643777, + "grad_norm": 0.35546875, + "learning_rate": 4.40271816881259e-06, + "loss": 2.1959, + "step": 2462 + }, + { + "epoch": 0.13213519313304722, + "grad_norm": 0.3359375, + "learning_rate": 4.4045064377682406e-06, + "loss": 1.9592, + "step": 2463 + }, + { + "epoch": 0.13218884120171673, + "grad_norm": 0.34375, + "learning_rate": 4.406294706723892e-06, + "loss": 2.3746, + "step": 2464 + }, + { + "epoch": 0.13224248927038626, + "grad_norm": 0.34375, + "learning_rate": 4.408082975679542e-06, + "loss": 2.421, + "step": 2465 + }, + { + "epoch": 0.1322961373390558, + "grad_norm": 0.57421875, + "learning_rate": 4.409871244635194e-06, + "loss": 2.345, + "step": 2466 + }, + { + "epoch": 0.13234978540772532, + "grad_norm": 0.34375, + "learning_rate": 4.411659513590845e-06, + "loss": 2.4756, + "step": 2467 + }, + { + "epoch": 0.13240343347639486, + "grad_norm": 0.330078125, + "learning_rate": 4.413447782546495e-06, + "loss": 2.166, + "step": 2468 + }, + { + "epoch": 0.1324570815450644, + "grad_norm": 0.474609375, + "learning_rate": 4.415236051502147e-06, + "loss": 2.3386, + "step": 2469 + }, + { + "epoch": 0.13251072961373392, + "grad_norm": 0.38671875, + "learning_rate": 4.417024320457797e-06, + "loss": 2.3124, + "step": 2470 + }, + { + "epoch": 0.13256437768240342, + "grad_norm": 0.52734375, + "learning_rate": 4.418812589413448e-06, + "loss": 2.3705, + "step": 2471 + }, + { + "epoch": 0.13261802575107295, + "grad_norm": 0.33984375, + "learning_rate": 4.420600858369099e-06, + "loss": 2.4909, + "step": 2472 + }, + { + "epoch": 0.13267167381974249, + "grad_norm": 0.3046875, + "learning_rate": 4.42238912732475e-06, + "loss": 2.3539, + "step": 2473 + }, + { + "epoch": 0.13272532188841202, + "grad_norm": 0.3515625, + "learning_rate": 4.424177396280401e-06, + "loss": 2.444, + "step": 2474 + }, + { + "epoch": 0.13277896995708155, + "grad_norm": 0.453125, + "learning_rate": 4.425965665236052e-06, + "loss": 2.2236, + "step": 2475 + }, + { + "epoch": 0.13283261802575108, + "grad_norm": 0.58203125, + "learning_rate": 4.427753934191702e-06, + "loss": 2.1339, + "step": 2476 + }, + { + "epoch": 0.1328862660944206, + "grad_norm": 0.330078125, + "learning_rate": 4.429542203147354e-06, + "loss": 2.3223, + "step": 2477 + }, + { + "epoch": 0.13293991416309012, + "grad_norm": 0.34375, + "learning_rate": 4.431330472103005e-06, + "loss": 2.2508, + "step": 2478 + }, + { + "epoch": 0.13299356223175965, + "grad_norm": 0.330078125, + "learning_rate": 4.433118741058655e-06, + "loss": 2.2068, + "step": 2479 + }, + { + "epoch": 0.13304721030042918, + "grad_norm": 0.7265625, + "learning_rate": 4.434907010014307e-06, + "loss": 1.6193, + "step": 2480 + }, + { + "epoch": 0.1331008583690987, + "grad_norm": 0.33203125, + "learning_rate": 4.436695278969957e-06, + "loss": 2.4532, + "step": 2481 + }, + { + "epoch": 0.13315450643776824, + "grad_norm": 0.7109375, + "learning_rate": 4.4384835479256085e-06, + "loss": 2.3496, + "step": 2482 + }, + { + "epoch": 0.13320815450643778, + "grad_norm": 0.66796875, + "learning_rate": 4.440271816881259e-06, + "loss": 2.4124, + "step": 2483 + }, + { + "epoch": 0.1332618025751073, + "grad_norm": 0.33984375, + "learning_rate": 4.44206008583691e-06, + "loss": 2.2207, + "step": 2484 + }, + { + "epoch": 0.1333154506437768, + "grad_norm": 0.33984375, + "learning_rate": 4.443848354792561e-06, + "loss": 2.5058, + "step": 2485 + }, + { + "epoch": 0.13336909871244634, + "grad_norm": 0.259765625, + "learning_rate": 4.445636623748212e-06, + "loss": 2.2336, + "step": 2486 + }, + { + "epoch": 0.13342274678111588, + "grad_norm": 0.333984375, + "learning_rate": 4.447424892703863e-06, + "loss": 1.9363, + "step": 2487 + }, + { + "epoch": 0.1334763948497854, + "grad_norm": 0.3359375, + "learning_rate": 4.449213161659514e-06, + "loss": 2.3032, + "step": 2488 + }, + { + "epoch": 0.13353004291845494, + "grad_norm": 0.328125, + "learning_rate": 4.451001430615165e-06, + "loss": 2.3392, + "step": 2489 + }, + { + "epoch": 0.13358369098712447, + "grad_norm": 0.322265625, + "learning_rate": 4.4527896995708155e-06, + "loss": 2.1145, + "step": 2490 + }, + { + "epoch": 0.133637339055794, + "grad_norm": 0.3359375, + "learning_rate": 4.454577968526467e-06, + "loss": 2.1897, + "step": 2491 + }, + { + "epoch": 0.1336909871244635, + "grad_norm": 0.3515625, + "learning_rate": 4.456366237482117e-06, + "loss": 2.2062, + "step": 2492 + }, + { + "epoch": 0.13374463519313304, + "grad_norm": 0.55078125, + "learning_rate": 4.4581545064377685e-06, + "loss": 1.5148, + "step": 2493 + }, + { + "epoch": 0.13379828326180257, + "grad_norm": 0.6484375, + "learning_rate": 4.459942775393419e-06, + "loss": 2.3787, + "step": 2494 + }, + { + "epoch": 0.1338519313304721, + "grad_norm": 0.287109375, + "learning_rate": 4.46173104434907e-06, + "loss": 2.1106, + "step": 2495 + }, + { + "epoch": 0.13390557939914163, + "grad_norm": 0.333984375, + "learning_rate": 4.4635193133047216e-06, + "loss": 2.4752, + "step": 2496 + }, + { + "epoch": 0.13395922746781116, + "grad_norm": 0.384765625, + "learning_rate": 4.465307582260373e-06, + "loss": 2.4259, + "step": 2497 + }, + { + "epoch": 0.1340128755364807, + "grad_norm": 0.373046875, + "learning_rate": 4.467095851216023e-06, + "loss": 2.2329, + "step": 2498 + }, + { + "epoch": 0.1340665236051502, + "grad_norm": 0.3359375, + "learning_rate": 4.468884120171675e-06, + "loss": 2.502, + "step": 2499 + }, + { + "epoch": 0.13412017167381973, + "grad_norm": 0.34765625, + "learning_rate": 4.470672389127325e-06, + "loss": 2.4366, + "step": 2500 + }, + { + "epoch": 0.13417381974248926, + "grad_norm": 0.353515625, + "learning_rate": 4.472460658082976e-06, + "loss": 2.42, + "step": 2501 + }, + { + "epoch": 0.1342274678111588, + "grad_norm": 0.291015625, + "learning_rate": 4.474248927038627e-06, + "loss": 2.0264, + "step": 2502 + }, + { + "epoch": 0.13428111587982833, + "grad_norm": 0.3671875, + "learning_rate": 4.476037195994278e-06, + "loss": 2.3941, + "step": 2503 + }, + { + "epoch": 0.13433476394849786, + "grad_norm": 0.447265625, + "learning_rate": 4.4778254649499286e-06, + "loss": 1.7976, + "step": 2504 + }, + { + "epoch": 0.1343884120171674, + "grad_norm": 0.40625, + "learning_rate": 4.47961373390558e-06, + "loss": 2.5317, + "step": 2505 + }, + { + "epoch": 0.13444206008583692, + "grad_norm": 0.353515625, + "learning_rate": 4.481402002861231e-06, + "loss": 2.2196, + "step": 2506 + }, + { + "epoch": 0.13449570815450643, + "grad_norm": 0.3984375, + "learning_rate": 4.483190271816882e-06, + "loss": 2.512, + "step": 2507 + }, + { + "epoch": 0.13454935622317596, + "grad_norm": 0.306640625, + "learning_rate": 4.484978540772533e-06, + "loss": 2.1115, + "step": 2508 + }, + { + "epoch": 0.1346030042918455, + "grad_norm": 0.65625, + "learning_rate": 4.486766809728183e-06, + "loss": 2.2687, + "step": 2509 + }, + { + "epoch": 0.13465665236051502, + "grad_norm": 0.38671875, + "learning_rate": 4.488555078683835e-06, + "loss": 1.9127, + "step": 2510 + }, + { + "epoch": 0.13471030042918455, + "grad_norm": 0.30078125, + "learning_rate": 4.490343347639485e-06, + "loss": 2.3341, + "step": 2511 + }, + { + "epoch": 0.13476394849785409, + "grad_norm": 0.265625, + "learning_rate": 4.4921316165951364e-06, + "loss": 2.3171, + "step": 2512 + }, + { + "epoch": 0.13481759656652362, + "grad_norm": 0.333984375, + "learning_rate": 4.493919885550787e-06, + "loss": 2.3731, + "step": 2513 + }, + { + "epoch": 0.13487124463519312, + "grad_norm": 0.283203125, + "learning_rate": 4.495708154506438e-06, + "loss": 2.223, + "step": 2514 + }, + { + "epoch": 0.13492489270386265, + "grad_norm": 0.330078125, + "learning_rate": 4.497496423462089e-06, + "loss": 1.994, + "step": 2515 + }, + { + "epoch": 0.13497854077253219, + "grad_norm": 0.28125, + "learning_rate": 4.49928469241774e-06, + "loss": 2.1764, + "step": 2516 + }, + { + "epoch": 0.13503218884120172, + "grad_norm": 1.0859375, + "learning_rate": 4.501072961373391e-06, + "loss": 2.3938, + "step": 2517 + }, + { + "epoch": 0.13508583690987125, + "grad_norm": 0.337890625, + "learning_rate": 4.502861230329042e-06, + "loss": 2.2027, + "step": 2518 + }, + { + "epoch": 0.13513948497854078, + "grad_norm": 0.33984375, + "learning_rate": 4.504649499284693e-06, + "loss": 2.2728, + "step": 2519 + }, + { + "epoch": 0.1351931330472103, + "grad_norm": 1.2265625, + "learning_rate": 4.5064377682403434e-06, + "loss": 2.7375, + "step": 2520 + }, + { + "epoch": 0.13524678111587982, + "grad_norm": 0.333984375, + "learning_rate": 4.508226037195995e-06, + "loss": 2.3979, + "step": 2521 + }, + { + "epoch": 0.13530042918454935, + "grad_norm": 0.41015625, + "learning_rate": 4.510014306151645e-06, + "loss": 2.2997, + "step": 2522 + }, + { + "epoch": 0.13535407725321888, + "grad_norm": 0.32421875, + "learning_rate": 4.5118025751072965e-06, + "loss": 2.5079, + "step": 2523 + }, + { + "epoch": 0.1354077253218884, + "grad_norm": 0.490234375, + "learning_rate": 4.513590844062947e-06, + "loss": 2.3939, + "step": 2524 + }, + { + "epoch": 0.13546137339055794, + "grad_norm": 0.33984375, + "learning_rate": 4.515379113018598e-06, + "loss": 2.3026, + "step": 2525 + }, + { + "epoch": 0.13551502145922747, + "grad_norm": 0.296875, + "learning_rate": 4.5171673819742495e-06, + "loss": 2.637, + "step": 2526 + }, + { + "epoch": 0.135568669527897, + "grad_norm": 0.484375, + "learning_rate": 4.5189556509299e-06, + "loss": 2.266, + "step": 2527 + }, + { + "epoch": 0.1356223175965665, + "grad_norm": 0.349609375, + "learning_rate": 4.520743919885551e-06, + "loss": 2.3067, + "step": 2528 + }, + { + "epoch": 0.13567596566523604, + "grad_norm": 0.5625, + "learning_rate": 4.522532188841202e-06, + "loss": 2.1779, + "step": 2529 + }, + { + "epoch": 0.13572961373390557, + "grad_norm": 0.27734375, + "learning_rate": 4.524320457796853e-06, + "loss": 2.4284, + "step": 2530 + }, + { + "epoch": 0.1357832618025751, + "grad_norm": 0.33203125, + "learning_rate": 4.5261087267525035e-06, + "loss": 2.1247, + "step": 2531 + }, + { + "epoch": 0.13583690987124464, + "grad_norm": 0.33203125, + "learning_rate": 4.527896995708155e-06, + "loss": 2.3699, + "step": 2532 + }, + { + "epoch": 0.13589055793991417, + "grad_norm": 0.439453125, + "learning_rate": 4.529685264663805e-06, + "loss": 2.4085, + "step": 2533 + }, + { + "epoch": 0.1359442060085837, + "grad_norm": 0.2578125, + "learning_rate": 4.5314735336194565e-06, + "loss": 2.3652, + "step": 2534 + }, + { + "epoch": 0.1359978540772532, + "grad_norm": 2.359375, + "learning_rate": 4.533261802575107e-06, + "loss": 2.346, + "step": 2535 + }, + { + "epoch": 0.13605150214592274, + "grad_norm": 0.33203125, + "learning_rate": 4.535050071530759e-06, + "loss": 2.4387, + "step": 2536 + }, + { + "epoch": 0.13610515021459227, + "grad_norm": 0.306640625, + "learning_rate": 4.53683834048641e-06, + "loss": 2.1356, + "step": 2537 + }, + { + "epoch": 0.1361587982832618, + "grad_norm": 0.85546875, + "learning_rate": 4.538626609442061e-06, + "loss": 2.4683, + "step": 2538 + }, + { + "epoch": 0.13621244635193133, + "grad_norm": 0.34765625, + "learning_rate": 4.540414878397711e-06, + "loss": 2.4848, + "step": 2539 + }, + { + "epoch": 0.13626609442060086, + "grad_norm": 0.51171875, + "learning_rate": 4.542203147353363e-06, + "loss": 2.1465, + "step": 2540 + }, + { + "epoch": 0.1363197424892704, + "grad_norm": 0.283203125, + "learning_rate": 4.543991416309013e-06, + "loss": 2.4244, + "step": 2541 + }, + { + "epoch": 0.13637339055793993, + "grad_norm": 0.3671875, + "learning_rate": 4.545779685264664e-06, + "loss": 1.6193, + "step": 2542 + }, + { + "epoch": 0.13642703862660943, + "grad_norm": 0.30078125, + "learning_rate": 4.547567954220315e-06, + "loss": 2.403, + "step": 2543 + }, + { + "epoch": 0.13648068669527896, + "grad_norm": 0.306640625, + "learning_rate": 4.549356223175966e-06, + "loss": 2.3303, + "step": 2544 + }, + { + "epoch": 0.1365343347639485, + "grad_norm": 0.3984375, + "learning_rate": 4.5511444921316174e-06, + "loss": 2.3047, + "step": 2545 + }, + { + "epoch": 0.13658798283261803, + "grad_norm": 0.435546875, + "learning_rate": 4.552932761087268e-06, + "loss": 2.2599, + "step": 2546 + }, + { + "epoch": 0.13664163090128756, + "grad_norm": 0.271484375, + "learning_rate": 4.554721030042919e-06, + "loss": 2.3043, + "step": 2547 + }, + { + "epoch": 0.1366952789699571, + "grad_norm": 0.314453125, + "learning_rate": 4.55650929899857e-06, + "loss": 2.2332, + "step": 2548 + }, + { + "epoch": 0.13674892703862662, + "grad_norm": 0.3046875, + "learning_rate": 4.558297567954221e-06, + "loss": 2.2255, + "step": 2549 + }, + { + "epoch": 0.13680257510729613, + "grad_norm": 0.29296875, + "learning_rate": 4.560085836909871e-06, + "loss": 2.4245, + "step": 2550 + }, + { + "epoch": 0.13685622317596566, + "grad_norm": 0.3515625, + "learning_rate": 4.561874105865523e-06, + "loss": 2.4481, + "step": 2551 + }, + { + "epoch": 0.1369098712446352, + "grad_norm": 0.31640625, + "learning_rate": 4.563662374821173e-06, + "loss": 2.3068, + "step": 2552 + }, + { + "epoch": 0.13696351931330472, + "grad_norm": 0.345703125, + "learning_rate": 4.5654506437768245e-06, + "loss": 2.2526, + "step": 2553 + }, + { + "epoch": 0.13701716738197425, + "grad_norm": 0.296875, + "learning_rate": 4.567238912732475e-06, + "loss": 2.035, + "step": 2554 + }, + { + "epoch": 0.13707081545064378, + "grad_norm": 0.314453125, + "learning_rate": 4.569027181688126e-06, + "loss": 2.2996, + "step": 2555 + }, + { + "epoch": 0.13712446351931332, + "grad_norm": 0.5234375, + "learning_rate": 4.5708154506437775e-06, + "loss": 2.3824, + "step": 2556 + }, + { + "epoch": 0.13717811158798282, + "grad_norm": 0.369140625, + "learning_rate": 4.572603719599428e-06, + "loss": 2.2531, + "step": 2557 + }, + { + "epoch": 0.13723175965665235, + "grad_norm": 0.2890625, + "learning_rate": 4.574391988555079e-06, + "loss": 2.2902, + "step": 2558 + }, + { + "epoch": 0.13728540772532188, + "grad_norm": 0.34765625, + "learning_rate": 4.57618025751073e-06, + "loss": 2.4555, + "step": 2559 + }, + { + "epoch": 0.13733905579399142, + "grad_norm": 0.34765625, + "learning_rate": 4.577968526466381e-06, + "loss": 2.2617, + "step": 2560 + }, + { + "epoch": 0.13739270386266095, + "grad_norm": 0.326171875, + "learning_rate": 4.5797567954220315e-06, + "loss": 2.418, + "step": 2561 + }, + { + "epoch": 0.13744635193133048, + "grad_norm": 0.28125, + "learning_rate": 4.581545064377683e-06, + "loss": 2.2828, + "step": 2562 + }, + { + "epoch": 0.1375, + "grad_norm": 0.330078125, + "learning_rate": 4.583333333333333e-06, + "loss": 2.2, + "step": 2563 + }, + { + "epoch": 0.13755364806866952, + "grad_norm": 0.291015625, + "learning_rate": 4.5851216022889845e-06, + "loss": 2.332, + "step": 2564 + }, + { + "epoch": 0.13760729613733905, + "grad_norm": 0.33203125, + "learning_rate": 4.586909871244636e-06, + "loss": 2.2765, + "step": 2565 + }, + { + "epoch": 0.13766094420600858, + "grad_norm": 0.34765625, + "learning_rate": 4.588698140200287e-06, + "loss": 2.2534, + "step": 2566 + }, + { + "epoch": 0.1377145922746781, + "grad_norm": 0.470703125, + "learning_rate": 4.5904864091559376e-06, + "loss": 2.5869, + "step": 2567 + }, + { + "epoch": 0.13776824034334764, + "grad_norm": 0.376953125, + "learning_rate": 4.592274678111589e-06, + "loss": 2.5798, + "step": 2568 + }, + { + "epoch": 0.13782188841201717, + "grad_norm": 0.6015625, + "learning_rate": 4.594062947067239e-06, + "loss": 2.4364, + "step": 2569 + }, + { + "epoch": 0.1378755364806867, + "grad_norm": 0.359375, + "learning_rate": 4.595851216022891e-06, + "loss": 2.4366, + "step": 2570 + }, + { + "epoch": 0.1379291845493562, + "grad_norm": 0.322265625, + "learning_rate": 4.597639484978541e-06, + "loss": 2.6136, + "step": 2571 + }, + { + "epoch": 0.13798283261802574, + "grad_norm": 0.330078125, + "learning_rate": 4.5994277539341915e-06, + "loss": 2.463, + "step": 2572 + }, + { + "epoch": 0.13803648068669527, + "grad_norm": 0.30859375, + "learning_rate": 4.601216022889843e-06, + "loss": 2.0181, + "step": 2573 + }, + { + "epoch": 0.1380901287553648, + "grad_norm": 0.298828125, + "learning_rate": 4.603004291845493e-06, + "loss": 2.4837, + "step": 2574 + }, + { + "epoch": 0.13814377682403434, + "grad_norm": 0.4609375, + "learning_rate": 4.604792560801145e-06, + "loss": 2.423, + "step": 2575 + }, + { + "epoch": 0.13819742489270387, + "grad_norm": 0.494140625, + "learning_rate": 4.606580829756796e-06, + "loss": 2.565, + "step": 2576 + }, + { + "epoch": 0.1382510729613734, + "grad_norm": 0.427734375, + "learning_rate": 4.608369098712447e-06, + "loss": 2.2916, + "step": 2577 + }, + { + "epoch": 0.13830472103004293, + "grad_norm": 0.29296875, + "learning_rate": 4.610157367668098e-06, + "loss": 2.297, + "step": 2578 + }, + { + "epoch": 0.13835836909871244, + "grad_norm": 0.390625, + "learning_rate": 4.611945636623749e-06, + "loss": 2.2495, + "step": 2579 + }, + { + "epoch": 0.13841201716738197, + "grad_norm": 0.314453125, + "learning_rate": 4.613733905579399e-06, + "loss": 2.4079, + "step": 2580 + }, + { + "epoch": 0.1384656652360515, + "grad_norm": 0.365234375, + "learning_rate": 4.615522174535051e-06, + "loss": 2.2718, + "step": 2581 + }, + { + "epoch": 0.13851931330472103, + "grad_norm": 0.32421875, + "learning_rate": 4.617310443490701e-06, + "loss": 2.1141, + "step": 2582 + }, + { + "epoch": 0.13857296137339056, + "grad_norm": 0.296875, + "learning_rate": 4.6190987124463524e-06, + "loss": 2.4308, + "step": 2583 + }, + { + "epoch": 0.1386266094420601, + "grad_norm": 0.357421875, + "learning_rate": 4.620886981402003e-06, + "loss": 2.5445, + "step": 2584 + }, + { + "epoch": 0.13868025751072963, + "grad_norm": 0.3046875, + "learning_rate": 4.622675250357654e-06, + "loss": 2.1886, + "step": 2585 + }, + { + "epoch": 0.13873390557939913, + "grad_norm": 0.55078125, + "learning_rate": 4.6244635193133055e-06, + "loss": 2.5193, + "step": 2586 + }, + { + "epoch": 0.13878755364806866, + "grad_norm": 0.294921875, + "learning_rate": 4.626251788268956e-06, + "loss": 2.121, + "step": 2587 + }, + { + "epoch": 0.1388412017167382, + "grad_norm": 0.388671875, + "learning_rate": 4.628040057224607e-06, + "loss": 2.2893, + "step": 2588 + }, + { + "epoch": 0.13889484978540773, + "grad_norm": 0.474609375, + "learning_rate": 4.629828326180258e-06, + "loss": 2.405, + "step": 2589 + }, + { + "epoch": 0.13894849785407726, + "grad_norm": 1.71875, + "learning_rate": 4.631616595135909e-06, + "loss": 2.2152, + "step": 2590 + }, + { + "epoch": 0.1390021459227468, + "grad_norm": 0.310546875, + "learning_rate": 4.6334048640915594e-06, + "loss": 2.1278, + "step": 2591 + }, + { + "epoch": 0.13905579399141632, + "grad_norm": 0.357421875, + "learning_rate": 4.635193133047211e-06, + "loss": 2.1979, + "step": 2592 + }, + { + "epoch": 0.13910944206008583, + "grad_norm": 0.625, + "learning_rate": 4.636981402002861e-06, + "loss": 2.4573, + "step": 2593 + }, + { + "epoch": 0.13916309012875536, + "grad_norm": 0.345703125, + "learning_rate": 4.6387696709585125e-06, + "loss": 2.367, + "step": 2594 + }, + { + "epoch": 0.1392167381974249, + "grad_norm": 0.310546875, + "learning_rate": 4.640557939914164e-06, + "loss": 2.2484, + "step": 2595 + }, + { + "epoch": 0.13927038626609442, + "grad_norm": 0.330078125, + "learning_rate": 4.642346208869814e-06, + "loss": 2.5028, + "step": 2596 + }, + { + "epoch": 0.13932403433476395, + "grad_norm": 0.451171875, + "learning_rate": 4.6441344778254655e-06, + "loss": 2.0904, + "step": 2597 + }, + { + "epoch": 0.13937768240343348, + "grad_norm": 0.3671875, + "learning_rate": 4.645922746781116e-06, + "loss": 2.2378, + "step": 2598 + }, + { + "epoch": 0.13943133047210302, + "grad_norm": 0.3515625, + "learning_rate": 4.647711015736767e-06, + "loss": 2.5893, + "step": 2599 + }, + { + "epoch": 0.13948497854077252, + "grad_norm": 0.41796875, + "learning_rate": 4.649499284692418e-06, + "loss": 2.3466, + "step": 2600 + }, + { + "epoch": 0.13953862660944205, + "grad_norm": 0.40625, + "learning_rate": 4.651287553648069e-06, + "loss": 2.2, + "step": 2601 + }, + { + "epoch": 0.13959227467811158, + "grad_norm": 0.3984375, + "learning_rate": 4.6530758226037195e-06, + "loss": 1.4297, + "step": 2602 + }, + { + "epoch": 0.13964592274678111, + "grad_norm": 0.33203125, + "learning_rate": 4.654864091559371e-06, + "loss": 2.4652, + "step": 2603 + }, + { + "epoch": 0.13969957081545065, + "grad_norm": 0.408203125, + "learning_rate": 4.656652360515021e-06, + "loss": 2.4568, + "step": 2604 + }, + { + "epoch": 0.13975321888412018, + "grad_norm": 0.33984375, + "learning_rate": 4.658440629470673e-06, + "loss": 2.3081, + "step": 2605 + }, + { + "epoch": 0.1398068669527897, + "grad_norm": 0.34765625, + "learning_rate": 4.660228898426324e-06, + "loss": 2.3514, + "step": 2606 + }, + { + "epoch": 0.13986051502145921, + "grad_norm": 0.5, + "learning_rate": 4.662017167381975e-06, + "loss": 2.1297, + "step": 2607 + }, + { + "epoch": 0.13991416309012875, + "grad_norm": 0.28515625, + "learning_rate": 4.663805436337626e-06, + "loss": 2.2784, + "step": 2608 + }, + { + "epoch": 0.13996781115879828, + "grad_norm": 0.5234375, + "learning_rate": 4.665593705293277e-06, + "loss": 2.2588, + "step": 2609 + }, + { + "epoch": 0.1400214592274678, + "grad_norm": 0.419921875, + "learning_rate": 4.667381974248927e-06, + "loss": 2.2252, + "step": 2610 + }, + { + "epoch": 0.14007510729613734, + "grad_norm": 0.349609375, + "learning_rate": 4.669170243204579e-06, + "loss": 2.3417, + "step": 2611 + }, + { + "epoch": 0.14012875536480687, + "grad_norm": 0.322265625, + "learning_rate": 4.670958512160229e-06, + "loss": 2.2812, + "step": 2612 + }, + { + "epoch": 0.1401824034334764, + "grad_norm": 0.310546875, + "learning_rate": 4.67274678111588e-06, + "loss": 2.3051, + "step": 2613 + }, + { + "epoch": 0.14023605150214594, + "grad_norm": 0.287109375, + "learning_rate": 4.674535050071532e-06, + "loss": 2.1997, + "step": 2614 + }, + { + "epoch": 0.14028969957081544, + "grad_norm": 0.28125, + "learning_rate": 4.676323319027182e-06, + "loss": 2.2611, + "step": 2615 + }, + { + "epoch": 0.14034334763948497, + "grad_norm": 0.3046875, + "learning_rate": 4.6781115879828334e-06, + "loss": 2.3767, + "step": 2616 + }, + { + "epoch": 0.1403969957081545, + "grad_norm": 0.287109375, + "learning_rate": 4.679899856938484e-06, + "loss": 2.2376, + "step": 2617 + }, + { + "epoch": 0.14045064377682404, + "grad_norm": 0.498046875, + "learning_rate": 4.681688125894135e-06, + "loss": 1.58, + "step": 2618 + }, + { + "epoch": 0.14050429184549357, + "grad_norm": 1.71875, + "learning_rate": 4.683476394849786e-06, + "loss": 2.2382, + "step": 2619 + }, + { + "epoch": 0.1405579399141631, + "grad_norm": 0.3046875, + "learning_rate": 4.685264663805437e-06, + "loss": 2.254, + "step": 2620 + }, + { + "epoch": 0.14061158798283263, + "grad_norm": 0.31640625, + "learning_rate": 4.687052932761087e-06, + "loss": 2.6836, + "step": 2621 + }, + { + "epoch": 0.14066523605150213, + "grad_norm": 0.3984375, + "learning_rate": 4.688841201716739e-06, + "loss": 2.137, + "step": 2622 + }, + { + "epoch": 0.14071888412017167, + "grad_norm": 0.30859375, + "learning_rate": 4.690629470672389e-06, + "loss": 2.2046, + "step": 2623 + }, + { + "epoch": 0.1407725321888412, + "grad_norm": 0.357421875, + "learning_rate": 4.6924177396280405e-06, + "loss": 2.2915, + "step": 2624 + }, + { + "epoch": 0.14082618025751073, + "grad_norm": 0.33203125, + "learning_rate": 4.694206008583692e-06, + "loss": 2.2197, + "step": 2625 + }, + { + "epoch": 0.14087982832618026, + "grad_norm": 0.408203125, + "learning_rate": 4.695994277539342e-06, + "loss": 2.4271, + "step": 2626 + }, + { + "epoch": 0.1409334763948498, + "grad_norm": 0.7734375, + "learning_rate": 4.6977825464949935e-06, + "loss": 1.5871, + "step": 2627 + }, + { + "epoch": 0.14098712446351933, + "grad_norm": 0.32421875, + "learning_rate": 4.699570815450644e-06, + "loss": 2.1468, + "step": 2628 + }, + { + "epoch": 0.14104077253218883, + "grad_norm": 0.326171875, + "learning_rate": 4.701359084406295e-06, + "loss": 2.2049, + "step": 2629 + }, + { + "epoch": 0.14109442060085836, + "grad_norm": 0.283203125, + "learning_rate": 4.703147353361946e-06, + "loss": 2.0035, + "step": 2630 + }, + { + "epoch": 0.1411480686695279, + "grad_norm": 0.294921875, + "learning_rate": 4.704935622317597e-06, + "loss": 2.4976, + "step": 2631 + }, + { + "epoch": 0.14120171673819742, + "grad_norm": 0.294921875, + "learning_rate": 4.7067238912732475e-06, + "loss": 2.3313, + "step": 2632 + }, + { + "epoch": 0.14125536480686696, + "grad_norm": 0.345703125, + "learning_rate": 4.708512160228899e-06, + "loss": 2.3224, + "step": 2633 + }, + { + "epoch": 0.1413090128755365, + "grad_norm": 0.3046875, + "learning_rate": 4.71030042918455e-06, + "loss": 2.365, + "step": 2634 + }, + { + "epoch": 0.14136266094420602, + "grad_norm": 0.349609375, + "learning_rate": 4.7120886981402005e-06, + "loss": 2.1, + "step": 2635 + }, + { + "epoch": 0.14141630901287552, + "grad_norm": 0.400390625, + "learning_rate": 4.713876967095852e-06, + "loss": 2.746, + "step": 2636 + }, + { + "epoch": 0.14146995708154506, + "grad_norm": 0.421875, + "learning_rate": 4.715665236051502e-06, + "loss": 1.5985, + "step": 2637 + }, + { + "epoch": 0.1415236051502146, + "grad_norm": 0.287109375, + "learning_rate": 4.7174535050071536e-06, + "loss": 2.3276, + "step": 2638 + }, + { + "epoch": 0.14157725321888412, + "grad_norm": 0.30078125, + "learning_rate": 4.719241773962804e-06, + "loss": 2.4061, + "step": 2639 + }, + { + "epoch": 0.14163090128755365, + "grad_norm": 0.2890625, + "learning_rate": 4.721030042918455e-06, + "loss": 2.2149, + "step": 2640 + }, + { + "epoch": 0.14168454935622318, + "grad_norm": 0.310546875, + "learning_rate": 4.722818311874106e-06, + "loss": 2.3146, + "step": 2641 + }, + { + "epoch": 0.14173819742489271, + "grad_norm": 0.28125, + "learning_rate": 4.724606580829757e-06, + "loss": 2.1986, + "step": 2642 + }, + { + "epoch": 0.14179184549356222, + "grad_norm": 0.423828125, + "learning_rate": 4.7263948497854075e-06, + "loss": 2.5118, + "step": 2643 + }, + { + "epoch": 0.14184549356223175, + "grad_norm": 0.375, + "learning_rate": 4.72818311874106e-06, + "loss": 2.3738, + "step": 2644 + }, + { + "epoch": 0.14189914163090128, + "grad_norm": 0.306640625, + "learning_rate": 4.72997138769671e-06, + "loss": 2.3896, + "step": 2645 + }, + { + "epoch": 0.1419527896995708, + "grad_norm": 0.59765625, + "learning_rate": 4.731759656652361e-06, + "loss": 2.1322, + "step": 2646 + }, + { + "epoch": 0.14200643776824035, + "grad_norm": 0.462890625, + "learning_rate": 4.733547925608012e-06, + "loss": 1.6399, + "step": 2647 + }, + { + "epoch": 0.14206008583690988, + "grad_norm": 0.369140625, + "learning_rate": 4.735336194563663e-06, + "loss": 2.3551, + "step": 2648 + }, + { + "epoch": 0.1421137339055794, + "grad_norm": 0.341796875, + "learning_rate": 4.737124463519314e-06, + "loss": 2.254, + "step": 2649 + }, + { + "epoch": 0.14216738197424894, + "grad_norm": 0.30078125, + "learning_rate": 4.738912732474965e-06, + "loss": 2.3137, + "step": 2650 + }, + { + "epoch": 0.14222103004291844, + "grad_norm": 0.59765625, + "learning_rate": 4.740701001430615e-06, + "loss": 2.1373, + "step": 2651 + }, + { + "epoch": 0.14227467811158798, + "grad_norm": 0.3515625, + "learning_rate": 4.742489270386267e-06, + "loss": 2.1213, + "step": 2652 + }, + { + "epoch": 0.1423283261802575, + "grad_norm": 0.310546875, + "learning_rate": 4.744277539341917e-06, + "loss": 2.1753, + "step": 2653 + }, + { + "epoch": 0.14238197424892704, + "grad_norm": 0.640625, + "learning_rate": 4.746065808297568e-06, + "loss": 2.2534, + "step": 2654 + }, + { + "epoch": 0.14243562231759657, + "grad_norm": 0.466796875, + "learning_rate": 4.74785407725322e-06, + "loss": 2.2368, + "step": 2655 + }, + { + "epoch": 0.1424892703862661, + "grad_norm": 0.412109375, + "learning_rate": 4.74964234620887e-06, + "loss": 2.4112, + "step": 2656 + }, + { + "epoch": 0.14254291845493564, + "grad_norm": 0.287109375, + "learning_rate": 4.7514306151645215e-06, + "loss": 2.1501, + "step": 2657 + }, + { + "epoch": 0.14259656652360514, + "grad_norm": 0.330078125, + "learning_rate": 4.753218884120172e-06, + "loss": 2.2626, + "step": 2658 + }, + { + "epoch": 0.14265021459227467, + "grad_norm": 0.259765625, + "learning_rate": 4.755007153075823e-06, + "loss": 2.3156, + "step": 2659 + }, + { + "epoch": 0.1427038626609442, + "grad_norm": 0.36328125, + "learning_rate": 4.756795422031474e-06, + "loss": 2.2897, + "step": 2660 + }, + { + "epoch": 0.14275751072961373, + "grad_norm": 0.294921875, + "learning_rate": 4.758583690987125e-06, + "loss": 2.3294, + "step": 2661 + }, + { + "epoch": 0.14281115879828327, + "grad_norm": 0.30859375, + "learning_rate": 4.7603719599427754e-06, + "loss": 2.5314, + "step": 2662 + }, + { + "epoch": 0.1428648068669528, + "grad_norm": 0.330078125, + "learning_rate": 4.762160228898427e-06, + "loss": 2.425, + "step": 2663 + }, + { + "epoch": 0.14291845493562233, + "grad_norm": 0.3984375, + "learning_rate": 4.763948497854078e-06, + "loss": 2.5369, + "step": 2664 + }, + { + "epoch": 0.14297210300429183, + "grad_norm": 0.6640625, + "learning_rate": 4.7657367668097285e-06, + "loss": 2.3594, + "step": 2665 + }, + { + "epoch": 0.14302575107296137, + "grad_norm": 2.109375, + "learning_rate": 4.76752503576538e-06, + "loss": 2.2354, + "step": 2666 + }, + { + "epoch": 0.1430793991416309, + "grad_norm": 0.29296875, + "learning_rate": 4.76931330472103e-06, + "loss": 2.3803, + "step": 2667 + }, + { + "epoch": 0.14313304721030043, + "grad_norm": 0.357421875, + "learning_rate": 4.7711015736766815e-06, + "loss": 2.3382, + "step": 2668 + }, + { + "epoch": 0.14318669527896996, + "grad_norm": 0.310546875, + "learning_rate": 4.772889842632332e-06, + "loss": 2.393, + "step": 2669 + }, + { + "epoch": 0.1432403433476395, + "grad_norm": 0.3125, + "learning_rate": 4.774678111587983e-06, + "loss": 2.3854, + "step": 2670 + }, + { + "epoch": 0.14329399141630902, + "grad_norm": 0.3359375, + "learning_rate": 4.776466380543634e-06, + "loss": 2.3227, + "step": 2671 + }, + { + "epoch": 0.14334763948497853, + "grad_norm": 0.328125, + "learning_rate": 4.778254649499285e-06, + "loss": 2.2748, + "step": 2672 + }, + { + "epoch": 0.14340128755364806, + "grad_norm": 0.318359375, + "learning_rate": 4.780042918454936e-06, + "loss": 2.2692, + "step": 2673 + }, + { + "epoch": 0.1434549356223176, + "grad_norm": 0.296875, + "learning_rate": 4.781831187410587e-06, + "loss": 2.3911, + "step": 2674 + }, + { + "epoch": 0.14350858369098712, + "grad_norm": 0.341796875, + "learning_rate": 4.783619456366238e-06, + "loss": 2.4411, + "step": 2675 + }, + { + "epoch": 0.14356223175965666, + "grad_norm": 0.306640625, + "learning_rate": 4.7854077253218885e-06, + "loss": 2.4293, + "step": 2676 + }, + { + "epoch": 0.1436158798283262, + "grad_norm": 0.498046875, + "learning_rate": 4.78719599427754e-06, + "loss": 2.3404, + "step": 2677 + }, + { + "epoch": 0.14366952789699572, + "grad_norm": 0.291015625, + "learning_rate": 4.78898426323319e-06, + "loss": 2.4107, + "step": 2678 + }, + { + "epoch": 0.14372317596566522, + "grad_norm": 0.375, + "learning_rate": 4.790772532188842e-06, + "loss": 2.3747, + "step": 2679 + }, + { + "epoch": 0.14377682403433475, + "grad_norm": 0.3046875, + "learning_rate": 4.792560801144492e-06, + "loss": 2.3006, + "step": 2680 + }, + { + "epoch": 0.1438304721030043, + "grad_norm": 0.33203125, + "learning_rate": 4.794349070100143e-06, + "loss": 2.4459, + "step": 2681 + }, + { + "epoch": 0.14388412017167382, + "grad_norm": 0.353515625, + "learning_rate": 4.796137339055794e-06, + "loss": 2.3547, + "step": 2682 + }, + { + "epoch": 0.14393776824034335, + "grad_norm": 0.3125, + "learning_rate": 4.797925608011446e-06, + "loss": 2.4161, + "step": 2683 + }, + { + "epoch": 0.14399141630901288, + "grad_norm": 0.326171875, + "learning_rate": 4.799713876967096e-06, + "loss": 2.3509, + "step": 2684 + }, + { + "epoch": 0.1440450643776824, + "grad_norm": 0.421875, + "learning_rate": 4.801502145922748e-06, + "loss": 2.3247, + "step": 2685 + }, + { + "epoch": 0.14409871244635192, + "grad_norm": 0.326171875, + "learning_rate": 4.803290414878398e-06, + "loss": 2.2587, + "step": 2686 + }, + { + "epoch": 0.14415236051502145, + "grad_norm": 0.341796875, + "learning_rate": 4.8050786838340494e-06, + "loss": 2.5253, + "step": 2687 + }, + { + "epoch": 0.14420600858369098, + "grad_norm": 0.349609375, + "learning_rate": 4.8068669527897e-06, + "loss": 2.3822, + "step": 2688 + }, + { + "epoch": 0.1442596566523605, + "grad_norm": 0.337890625, + "learning_rate": 4.808655221745351e-06, + "loss": 2.0962, + "step": 2689 + }, + { + "epoch": 0.14431330472103004, + "grad_norm": 0.404296875, + "learning_rate": 4.810443490701002e-06, + "loss": 2.5429, + "step": 2690 + }, + { + "epoch": 0.14436695278969958, + "grad_norm": 0.4375, + "learning_rate": 4.812231759656653e-06, + "loss": 2.4205, + "step": 2691 + }, + { + "epoch": 0.1444206008583691, + "grad_norm": 0.322265625, + "learning_rate": 4.814020028612303e-06, + "loss": 2.3183, + "step": 2692 + }, + { + "epoch": 0.14447424892703864, + "grad_norm": 0.5625, + "learning_rate": 4.815808297567955e-06, + "loss": 2.3242, + "step": 2693 + }, + { + "epoch": 0.14452789699570814, + "grad_norm": 0.3046875, + "learning_rate": 4.817596566523606e-06, + "loss": 2.2377, + "step": 2694 + }, + { + "epoch": 0.14458154506437768, + "grad_norm": 0.345703125, + "learning_rate": 4.8193848354792564e-06, + "loss": 2.4881, + "step": 2695 + }, + { + "epoch": 0.1446351931330472, + "grad_norm": 0.404296875, + "learning_rate": 4.821173104434908e-06, + "loss": 2.4473, + "step": 2696 + }, + { + "epoch": 0.14468884120171674, + "grad_norm": 0.328125, + "learning_rate": 4.822961373390558e-06, + "loss": 2.3599, + "step": 2697 + }, + { + "epoch": 0.14474248927038627, + "grad_norm": 0.41015625, + "learning_rate": 4.8247496423462095e-06, + "loss": 2.3837, + "step": 2698 + }, + { + "epoch": 0.1447961373390558, + "grad_norm": 0.345703125, + "learning_rate": 4.82653791130186e-06, + "loss": 1.5673, + "step": 2699 + }, + { + "epoch": 0.14484978540772533, + "grad_norm": 0.345703125, + "learning_rate": 4.828326180257511e-06, + "loss": 1.7055, + "step": 2700 + }, + { + "epoch": 0.14490343347639484, + "grad_norm": 0.3046875, + "learning_rate": 4.830114449213162e-06, + "loss": 2.5134, + "step": 2701 + }, + { + "epoch": 0.14495708154506437, + "grad_norm": 1.1015625, + "learning_rate": 4.831902718168813e-06, + "loss": 2.5049, + "step": 2702 + }, + { + "epoch": 0.1450107296137339, + "grad_norm": 0.314453125, + "learning_rate": 4.833690987124464e-06, + "loss": 2.5554, + "step": 2703 + }, + { + "epoch": 0.14506437768240343, + "grad_norm": 0.353515625, + "learning_rate": 4.835479256080115e-06, + "loss": 2.3257, + "step": 2704 + }, + { + "epoch": 0.14511802575107297, + "grad_norm": 0.2734375, + "learning_rate": 4.837267525035766e-06, + "loss": 2.2938, + "step": 2705 + }, + { + "epoch": 0.1451716738197425, + "grad_norm": 0.314453125, + "learning_rate": 4.8390557939914165e-06, + "loss": 2.3984, + "step": 2706 + }, + { + "epoch": 0.14522532188841203, + "grad_norm": 0.333984375, + "learning_rate": 4.840844062947068e-06, + "loss": 2.353, + "step": 2707 + }, + { + "epoch": 0.14527896995708153, + "grad_norm": 0.318359375, + "learning_rate": 4.842632331902718e-06, + "loss": 2.5066, + "step": 2708 + }, + { + "epoch": 0.14533261802575106, + "grad_norm": 0.298828125, + "learning_rate": 4.8444206008583696e-06, + "loss": 2.2283, + "step": 2709 + }, + { + "epoch": 0.1453862660944206, + "grad_norm": 0.271484375, + "learning_rate": 4.84620886981402e-06, + "loss": 2.2358, + "step": 2710 + }, + { + "epoch": 0.14543991416309013, + "grad_norm": 0.3125, + "learning_rate": 4.847997138769671e-06, + "loss": 2.3584, + "step": 2711 + }, + { + "epoch": 0.14549356223175966, + "grad_norm": 0.287109375, + "learning_rate": 4.849785407725322e-06, + "loss": 2.4151, + "step": 2712 + }, + { + "epoch": 0.1455472103004292, + "grad_norm": 0.376953125, + "learning_rate": 4.851573676680973e-06, + "loss": 2.3095, + "step": 2713 + }, + { + "epoch": 0.14560085836909872, + "grad_norm": 0.28515625, + "learning_rate": 4.853361945636624e-06, + "loss": 2.233, + "step": 2714 + }, + { + "epoch": 0.14565450643776823, + "grad_norm": 0.341796875, + "learning_rate": 4.855150214592275e-06, + "loss": 2.6101, + "step": 2715 + }, + { + "epoch": 0.14570815450643776, + "grad_norm": 0.330078125, + "learning_rate": 4.856938483547926e-06, + "loss": 2.2529, + "step": 2716 + }, + { + "epoch": 0.1457618025751073, + "grad_norm": 1.0859375, + "learning_rate": 4.8587267525035766e-06, + "loss": 2.5046, + "step": 2717 + }, + { + "epoch": 0.14581545064377682, + "grad_norm": 0.33984375, + "learning_rate": 4.860515021459228e-06, + "loss": 2.3408, + "step": 2718 + }, + { + "epoch": 0.14586909871244635, + "grad_norm": 0.34375, + "learning_rate": 4.862303290414878e-06, + "loss": 2.3857, + "step": 2719 + }, + { + "epoch": 0.1459227467811159, + "grad_norm": 0.375, + "learning_rate": 4.86409155937053e-06, + "loss": 2.4297, + "step": 2720 + }, + { + "epoch": 0.14597639484978542, + "grad_norm": 0.400390625, + "learning_rate": 4.86587982832618e-06, + "loss": 2.3854, + "step": 2721 + }, + { + "epoch": 0.14603004291845492, + "grad_norm": 0.396484375, + "learning_rate": 4.867668097281831e-06, + "loss": 2.5682, + "step": 2722 + }, + { + "epoch": 0.14608369098712445, + "grad_norm": 0.32421875, + "learning_rate": 4.869456366237483e-06, + "loss": 2.361, + "step": 2723 + }, + { + "epoch": 0.14613733905579399, + "grad_norm": 0.400390625, + "learning_rate": 4.871244635193134e-06, + "loss": 2.3099, + "step": 2724 + }, + { + "epoch": 0.14619098712446352, + "grad_norm": 0.26171875, + "learning_rate": 4.873032904148784e-06, + "loss": 2.2004, + "step": 2725 + }, + { + "epoch": 0.14624463519313305, + "grad_norm": 0.314453125, + "learning_rate": 4.874821173104436e-06, + "loss": 2.1678, + "step": 2726 + }, + { + "epoch": 0.14629828326180258, + "grad_norm": 0.310546875, + "learning_rate": 4.876609442060086e-06, + "loss": 2.1168, + "step": 2727 + }, + { + "epoch": 0.1463519313304721, + "grad_norm": 0.44140625, + "learning_rate": 4.8783977110157375e-06, + "loss": 2.2041, + "step": 2728 + }, + { + "epoch": 0.14640557939914164, + "grad_norm": 0.314453125, + "learning_rate": 4.880185979971388e-06, + "loss": 2.3005, + "step": 2729 + }, + { + "epoch": 0.14645922746781115, + "grad_norm": 0.291015625, + "learning_rate": 4.881974248927039e-06, + "loss": 2.4552, + "step": 2730 + }, + { + "epoch": 0.14651287553648068, + "grad_norm": 0.2890625, + "learning_rate": 4.88376251788269e-06, + "loss": 2.4796, + "step": 2731 + }, + { + "epoch": 0.1465665236051502, + "grad_norm": 0.365234375, + "learning_rate": 4.885550786838341e-06, + "loss": 2.2024, + "step": 2732 + }, + { + "epoch": 0.14662017167381974, + "grad_norm": 0.3203125, + "learning_rate": 4.887339055793992e-06, + "loss": 2.4015, + "step": 2733 + }, + { + "epoch": 0.14667381974248928, + "grad_norm": 0.310546875, + "learning_rate": 4.889127324749643e-06, + "loss": 2.3284, + "step": 2734 + }, + { + "epoch": 0.1467274678111588, + "grad_norm": 0.341796875, + "learning_rate": 4.890915593705294e-06, + "loss": 2.1558, + "step": 2735 + }, + { + "epoch": 0.14678111587982834, + "grad_norm": 0.73046875, + "learning_rate": 4.8927038626609445e-06, + "loss": 2.5159, + "step": 2736 + }, + { + "epoch": 0.14683476394849784, + "grad_norm": 0.337890625, + "learning_rate": 4.894492131616596e-06, + "loss": 2.2842, + "step": 2737 + }, + { + "epoch": 0.14688841201716737, + "grad_norm": 0.328125, + "learning_rate": 4.896280400572246e-06, + "loss": 2.5236, + "step": 2738 + }, + { + "epoch": 0.1469420600858369, + "grad_norm": 0.30859375, + "learning_rate": 4.8980686695278975e-06, + "loss": 2.2813, + "step": 2739 + }, + { + "epoch": 0.14699570815450644, + "grad_norm": 0.25390625, + "learning_rate": 4.899856938483548e-06, + "loss": 2.1114, + "step": 2740 + }, + { + "epoch": 0.14704935622317597, + "grad_norm": 0.365234375, + "learning_rate": 4.901645207439199e-06, + "loss": 2.1121, + "step": 2741 + }, + { + "epoch": 0.1471030042918455, + "grad_norm": 0.3046875, + "learning_rate": 4.9034334763948506e-06, + "loss": 2.4642, + "step": 2742 + }, + { + "epoch": 0.14715665236051503, + "grad_norm": 0.423828125, + "learning_rate": 4.905221745350501e-06, + "loss": 2.3189, + "step": 2743 + }, + { + "epoch": 0.14721030042918454, + "grad_norm": 0.357421875, + "learning_rate": 4.907010014306152e-06, + "loss": 2.4125, + "step": 2744 + }, + { + "epoch": 0.14726394849785407, + "grad_norm": 0.88671875, + "learning_rate": 4.908798283261803e-06, + "loss": 2.0357, + "step": 2745 + }, + { + "epoch": 0.1473175965665236, + "grad_norm": 0.32421875, + "learning_rate": 4.910586552217454e-06, + "loss": 2.532, + "step": 2746 + }, + { + "epoch": 0.14737124463519313, + "grad_norm": 0.296875, + "learning_rate": 4.9123748211731045e-06, + "loss": 2.2029, + "step": 2747 + }, + { + "epoch": 0.14742489270386266, + "grad_norm": 0.384765625, + "learning_rate": 4.914163090128756e-06, + "loss": 2.601, + "step": 2748 + }, + { + "epoch": 0.1474785407725322, + "grad_norm": 0.314453125, + "learning_rate": 4.915951359084406e-06, + "loss": 2.0479, + "step": 2749 + }, + { + "epoch": 0.14753218884120173, + "grad_norm": 0.3203125, + "learning_rate": 4.917739628040058e-06, + "loss": 2.4917, + "step": 2750 + }, + { + "epoch": 0.14758583690987123, + "grad_norm": 0.345703125, + "learning_rate": 4.919527896995708e-06, + "loss": 2.2418, + "step": 2751 + }, + { + "epoch": 0.14763948497854076, + "grad_norm": 0.34375, + "learning_rate": 4.921316165951359e-06, + "loss": 2.196, + "step": 2752 + }, + { + "epoch": 0.1476931330472103, + "grad_norm": 0.390625, + "learning_rate": 4.923104434907011e-06, + "loss": 2.4146, + "step": 2753 + }, + { + "epoch": 0.14774678111587983, + "grad_norm": 0.33203125, + "learning_rate": 4.924892703862661e-06, + "loss": 2.2549, + "step": 2754 + }, + { + "epoch": 0.14780042918454936, + "grad_norm": 0.251953125, + "learning_rate": 4.926680972818312e-06, + "loss": 2.1658, + "step": 2755 + }, + { + "epoch": 0.1478540772532189, + "grad_norm": 0.435546875, + "learning_rate": 4.928469241773963e-06, + "loss": 2.4253, + "step": 2756 + }, + { + "epoch": 0.14790772532188842, + "grad_norm": 0.3984375, + "learning_rate": 4.930257510729614e-06, + "loss": 2.4405, + "step": 2757 + }, + { + "epoch": 0.14796137339055793, + "grad_norm": 0.5390625, + "learning_rate": 4.932045779685265e-06, + "loss": 2.3129, + "step": 2758 + }, + { + "epoch": 0.14801502145922746, + "grad_norm": 0.322265625, + "learning_rate": 4.933834048640916e-06, + "loss": 2.3697, + "step": 2759 + }, + { + "epoch": 0.148068669527897, + "grad_norm": 0.314453125, + "learning_rate": 4.935622317596566e-06, + "loss": 2.3762, + "step": 2760 + }, + { + "epoch": 0.14812231759656652, + "grad_norm": 0.384765625, + "learning_rate": 4.937410586552218e-06, + "loss": 2.5149, + "step": 2761 + }, + { + "epoch": 0.14817596566523605, + "grad_norm": 0.5390625, + "learning_rate": 4.939198855507869e-06, + "loss": 2.2073, + "step": 2762 + }, + { + "epoch": 0.14822961373390559, + "grad_norm": 0.390625, + "learning_rate": 4.94098712446352e-06, + "loss": 2.5632, + "step": 2763 + }, + { + "epoch": 0.14828326180257512, + "grad_norm": 0.314453125, + "learning_rate": 4.942775393419171e-06, + "loss": 2.2833, + "step": 2764 + }, + { + "epoch": 0.14833690987124465, + "grad_norm": 0.294921875, + "learning_rate": 4.944563662374822e-06, + "loss": 2.4316, + "step": 2765 + }, + { + "epoch": 0.14839055793991415, + "grad_norm": 0.455078125, + "learning_rate": 4.9463519313304724e-06, + "loss": 2.4265, + "step": 2766 + }, + { + "epoch": 0.14844420600858368, + "grad_norm": 0.306640625, + "learning_rate": 4.948140200286124e-06, + "loss": 2.2723, + "step": 2767 + }, + { + "epoch": 0.14849785407725322, + "grad_norm": 0.283203125, + "learning_rate": 4.949928469241774e-06, + "loss": 2.3607, + "step": 2768 + }, + { + "epoch": 0.14855150214592275, + "grad_norm": 0.361328125, + "learning_rate": 4.9517167381974255e-06, + "loss": 2.5423, + "step": 2769 + }, + { + "epoch": 0.14860515021459228, + "grad_norm": 0.318359375, + "learning_rate": 4.953505007153076e-06, + "loss": 2.6443, + "step": 2770 + }, + { + "epoch": 0.1486587982832618, + "grad_norm": 0.306640625, + "learning_rate": 4.955293276108727e-06, + "loss": 2.3618, + "step": 2771 + }, + { + "epoch": 0.14871244635193134, + "grad_norm": 0.306640625, + "learning_rate": 4.9570815450643785e-06, + "loss": 2.1007, + "step": 2772 + }, + { + "epoch": 0.14876609442060085, + "grad_norm": 0.3203125, + "learning_rate": 4.958869814020029e-06, + "loss": 2.4026, + "step": 2773 + }, + { + "epoch": 0.14881974248927038, + "grad_norm": 0.53125, + "learning_rate": 4.96065808297568e-06, + "loss": 2.4717, + "step": 2774 + }, + { + "epoch": 0.1488733905579399, + "grad_norm": 0.765625, + "learning_rate": 4.962446351931331e-06, + "loss": 2.331, + "step": 2775 + }, + { + "epoch": 0.14892703862660944, + "grad_norm": 0.333984375, + "learning_rate": 4.964234620886982e-06, + "loss": 2.4633, + "step": 2776 + }, + { + "epoch": 0.14898068669527897, + "grad_norm": 0.412109375, + "learning_rate": 4.9660228898426325e-06, + "loss": 2.3683, + "step": 2777 + }, + { + "epoch": 0.1490343347639485, + "grad_norm": 0.50390625, + "learning_rate": 4.967811158798284e-06, + "loss": 2.324, + "step": 2778 + }, + { + "epoch": 0.14908798283261804, + "grad_norm": 0.275390625, + "learning_rate": 4.969599427753934e-06, + "loss": 2.0219, + "step": 2779 + }, + { + "epoch": 0.14914163090128754, + "grad_norm": 0.4296875, + "learning_rate": 4.9713876967095855e-06, + "loss": 2.6862, + "step": 2780 + }, + { + "epoch": 0.14919527896995707, + "grad_norm": 0.431640625, + "learning_rate": 4.973175965665236e-06, + "loss": 2.3303, + "step": 2781 + }, + { + "epoch": 0.1492489270386266, + "grad_norm": 0.298828125, + "learning_rate": 4.974964234620887e-06, + "loss": 2.3672, + "step": 2782 + }, + { + "epoch": 0.14930257510729614, + "grad_norm": 0.322265625, + "learning_rate": 4.976752503576539e-06, + "loss": 2.3249, + "step": 2783 + }, + { + "epoch": 0.14935622317596567, + "grad_norm": 0.36328125, + "learning_rate": 4.978540772532189e-06, + "loss": 2.5729, + "step": 2784 + }, + { + "epoch": 0.1494098712446352, + "grad_norm": 0.353515625, + "learning_rate": 4.98032904148784e-06, + "loss": 1.9681, + "step": 2785 + }, + { + "epoch": 0.14946351931330473, + "grad_norm": 0.359375, + "learning_rate": 4.982117310443491e-06, + "loss": 2.4205, + "step": 2786 + }, + { + "epoch": 0.14951716738197424, + "grad_norm": 0.421875, + "learning_rate": 4.983905579399142e-06, + "loss": 2.3927, + "step": 2787 + }, + { + "epoch": 0.14957081545064377, + "grad_norm": 0.322265625, + "learning_rate": 4.9856938483547926e-06, + "loss": 2.3645, + "step": 2788 + }, + { + "epoch": 0.1496244635193133, + "grad_norm": 0.7578125, + "learning_rate": 4.987482117310444e-06, + "loss": 2.4078, + "step": 2789 + }, + { + "epoch": 0.14967811158798283, + "grad_norm": 0.36328125, + "learning_rate": 4.989270386266094e-06, + "loss": 2.5312, + "step": 2790 + }, + { + "epoch": 0.14973175965665236, + "grad_norm": 0.400390625, + "learning_rate": 4.991058655221746e-06, + "loss": 2.4083, + "step": 2791 + }, + { + "epoch": 0.1497854077253219, + "grad_norm": 0.375, + "learning_rate": 4.992846924177397e-06, + "loss": 2.3295, + "step": 2792 + }, + { + "epoch": 0.14983905579399143, + "grad_norm": 2.703125, + "learning_rate": 4.994635193133048e-06, + "loss": 2.3189, + "step": 2793 + }, + { + "epoch": 0.14989270386266093, + "grad_norm": 0.30078125, + "learning_rate": 4.996423462088699e-06, + "loss": 2.552, + "step": 2794 + }, + { + "epoch": 0.14994635193133046, + "grad_norm": 0.357421875, + "learning_rate": 4.99821173104435e-06, + "loss": 2.4193, + "step": 2795 + }, + { + "epoch": 0.15, + "grad_norm": 0.43359375, + "learning_rate": 5e-06, + "loss": 2.5891, + "step": 2796 + }, + { + "epoch": 0.15005364806866953, + "grad_norm": 0.51171875, + "learning_rate": 4.999999998490495e-06, + "loss": 2.5787, + "step": 2797 + }, + { + "epoch": 0.15010729613733906, + "grad_norm": 0.3125, + "learning_rate": 4.999999993961981e-06, + "loss": 2.2645, + "step": 2798 + }, + { + "epoch": 0.1501609442060086, + "grad_norm": 0.314453125, + "learning_rate": 4.9999999864144584e-06, + "loss": 2.628, + "step": 2799 + }, + { + "epoch": 0.15021459227467812, + "grad_norm": 0.369140625, + "learning_rate": 4.999999975847926e-06, + "loss": 2.4222, + "step": 2800 + }, + { + "epoch": 0.15026824034334765, + "grad_norm": 0.392578125, + "learning_rate": 4.999999962262384e-06, + "loss": 1.6503, + "step": 2801 + }, + { + "epoch": 0.15032188841201716, + "grad_norm": 0.42578125, + "learning_rate": 4.999999945657832e-06, + "loss": 2.2628, + "step": 2802 + }, + { + "epoch": 0.1503755364806867, + "grad_norm": 0.373046875, + "learning_rate": 4.999999926034271e-06, + "loss": 2.2423, + "step": 2803 + }, + { + "epoch": 0.15042918454935622, + "grad_norm": 0.34765625, + "learning_rate": 4.999999903391702e-06, + "loss": 2.2874, + "step": 2804 + }, + { + "epoch": 0.15048283261802575, + "grad_norm": 0.318359375, + "learning_rate": 4.999999877730122e-06, + "loss": 2.0837, + "step": 2805 + }, + { + "epoch": 0.15053648068669528, + "grad_norm": 0.333984375, + "learning_rate": 4.999999849049534e-06, + "loss": 2.3925, + "step": 2806 + }, + { + "epoch": 0.15059012875536482, + "grad_norm": 0.279296875, + "learning_rate": 4.999999817349936e-06, + "loss": 2.3559, + "step": 2807 + }, + { + "epoch": 0.15064377682403435, + "grad_norm": 0.298828125, + "learning_rate": 4.999999782631329e-06, + "loss": 2.3486, + "step": 2808 + }, + { + "epoch": 0.15069742489270385, + "grad_norm": 0.30859375, + "learning_rate": 4.999999744893713e-06, + "loss": 2.22, + "step": 2809 + }, + { + "epoch": 0.15075107296137338, + "grad_norm": 0.3984375, + "learning_rate": 4.999999704137088e-06, + "loss": 2.1162, + "step": 2810 + }, + { + "epoch": 0.15080472103004292, + "grad_norm": 0.275390625, + "learning_rate": 4.999999660361454e-06, + "loss": 2.3627, + "step": 2811 + }, + { + "epoch": 0.15085836909871245, + "grad_norm": 0.57421875, + "learning_rate": 4.999999613566811e-06, + "loss": 2.297, + "step": 2812 + }, + { + "epoch": 0.15091201716738198, + "grad_norm": 0.328125, + "learning_rate": 4.9999995637531595e-06, + "loss": 2.4804, + "step": 2813 + }, + { + "epoch": 0.1509656652360515, + "grad_norm": 0.30078125, + "learning_rate": 4.999999510920498e-06, + "loss": 2.1224, + "step": 2814 + }, + { + "epoch": 0.15101931330472104, + "grad_norm": 0.28515625, + "learning_rate": 4.999999455068829e-06, + "loss": 2.4671, + "step": 2815 + }, + { + "epoch": 0.15107296137339055, + "grad_norm": 0.32421875, + "learning_rate": 4.999999396198151e-06, + "loss": 2.365, + "step": 2816 + }, + { + "epoch": 0.15112660944206008, + "grad_norm": 0.462890625, + "learning_rate": 4.999999334308464e-06, + "loss": 2.3896, + "step": 2817 + }, + { + "epoch": 0.1511802575107296, + "grad_norm": 0.287109375, + "learning_rate": 4.999999269399769e-06, + "loss": 2.3185, + "step": 2818 + }, + { + "epoch": 0.15123390557939914, + "grad_norm": 0.7421875, + "learning_rate": 4.9999992014720645e-06, + "loss": 2.264, + "step": 2819 + }, + { + "epoch": 0.15128755364806867, + "grad_norm": 0.388671875, + "learning_rate": 4.999999130525352e-06, + "loss": 2.521, + "step": 2820 + }, + { + "epoch": 0.1513412017167382, + "grad_norm": 0.306640625, + "learning_rate": 4.999999056559632e-06, + "loss": 2.274, + "step": 2821 + }, + { + "epoch": 0.15139484978540774, + "grad_norm": 0.34375, + "learning_rate": 4.999998979574903e-06, + "loss": 2.2192, + "step": 2822 + }, + { + "epoch": 0.15144849785407724, + "grad_norm": 0.34765625, + "learning_rate": 4.999998899571166e-06, + "loss": 2.3761, + "step": 2823 + }, + { + "epoch": 0.15150214592274677, + "grad_norm": 0.34765625, + "learning_rate": 4.999998816548421e-06, + "loss": 2.2751, + "step": 2824 + }, + { + "epoch": 0.1515557939914163, + "grad_norm": 0.318359375, + "learning_rate": 4.999998730506668e-06, + "loss": 2.3395, + "step": 2825 + }, + { + "epoch": 0.15160944206008584, + "grad_norm": 0.322265625, + "learning_rate": 4.999998641445907e-06, + "loss": 2.1313, + "step": 2826 + }, + { + "epoch": 0.15166309012875537, + "grad_norm": 0.333984375, + "learning_rate": 4.999998549366139e-06, + "loss": 2.3032, + "step": 2827 + }, + { + "epoch": 0.1517167381974249, + "grad_norm": 0.42578125, + "learning_rate": 4.999998454267363e-06, + "loss": 2.3531, + "step": 2828 + }, + { + "epoch": 0.15177038626609443, + "grad_norm": 0.3359375, + "learning_rate": 4.999998356149579e-06, + "loss": 2.2538, + "step": 2829 + }, + { + "epoch": 0.15182403433476394, + "grad_norm": 0.353515625, + "learning_rate": 4.999998255012788e-06, + "loss": 2.3843, + "step": 2830 + }, + { + "epoch": 0.15187768240343347, + "grad_norm": 0.3828125, + "learning_rate": 4.99999815085699e-06, + "loss": 2.5385, + "step": 2831 + }, + { + "epoch": 0.151931330472103, + "grad_norm": 0.33203125, + "learning_rate": 4.9999980436821844e-06, + "loss": 2.0365, + "step": 2832 + }, + { + "epoch": 0.15198497854077253, + "grad_norm": 0.353515625, + "learning_rate": 4.999997933488372e-06, + "loss": 2.2918, + "step": 2833 + }, + { + "epoch": 0.15203862660944206, + "grad_norm": 0.423828125, + "learning_rate": 4.999997820275553e-06, + "loss": 2.298, + "step": 2834 + }, + { + "epoch": 0.1520922746781116, + "grad_norm": 0.283203125, + "learning_rate": 4.999997704043726e-06, + "loss": 2.138, + "step": 2835 + }, + { + "epoch": 0.15214592274678113, + "grad_norm": 0.333984375, + "learning_rate": 4.999997584792894e-06, + "loss": 2.3109, + "step": 2836 + }, + { + "epoch": 0.15219957081545063, + "grad_norm": 0.421875, + "learning_rate": 4.9999974625230554e-06, + "loss": 2.1306, + "step": 2837 + }, + { + "epoch": 0.15225321888412016, + "grad_norm": 0.419921875, + "learning_rate": 4.999997337234209e-06, + "loss": 2.3245, + "step": 2838 + }, + { + "epoch": 0.1523068669527897, + "grad_norm": 0.326171875, + "learning_rate": 4.999997208926358e-06, + "loss": 2.445, + "step": 2839 + }, + { + "epoch": 0.15236051502145923, + "grad_norm": 0.294921875, + "learning_rate": 4.9999970775995e-06, + "loss": 2.4067, + "step": 2840 + }, + { + "epoch": 0.15241416309012876, + "grad_norm": 0.32421875, + "learning_rate": 4.999996943253636e-06, + "loss": 2.4876, + "step": 2841 + }, + { + "epoch": 0.1524678111587983, + "grad_norm": 0.35546875, + "learning_rate": 4.9999968058887685e-06, + "loss": 2.2376, + "step": 2842 + }, + { + "epoch": 0.15252145922746782, + "grad_norm": 0.35546875, + "learning_rate": 4.999996665504894e-06, + "loss": 2.5607, + "step": 2843 + }, + { + "epoch": 0.15257510729613735, + "grad_norm": 0.375, + "learning_rate": 4.999996522102013e-06, + "loss": 2.3409, + "step": 2844 + }, + { + "epoch": 0.15262875536480686, + "grad_norm": 0.267578125, + "learning_rate": 4.999996375680128e-06, + "loss": 2.0361, + "step": 2845 + }, + { + "epoch": 0.1526824034334764, + "grad_norm": 0.3125, + "learning_rate": 4.999996226239239e-06, + "loss": 2.2849, + "step": 2846 + }, + { + "epoch": 0.15273605150214592, + "grad_norm": 0.79296875, + "learning_rate": 4.999996073779343e-06, + "loss": 2.3811, + "step": 2847 + }, + { + "epoch": 0.15278969957081545, + "grad_norm": 0.359375, + "learning_rate": 4.9999959183004445e-06, + "loss": 2.5932, + "step": 2848 + }, + { + "epoch": 0.15284334763948498, + "grad_norm": 0.28125, + "learning_rate": 4.99999575980254e-06, + "loss": 1.9496, + "step": 2849 + }, + { + "epoch": 0.15289699570815452, + "grad_norm": 0.322265625, + "learning_rate": 4.999995598285632e-06, + "loss": 2.4133, + "step": 2850 + }, + { + "epoch": 0.15295064377682405, + "grad_norm": 0.333984375, + "learning_rate": 4.999995433749719e-06, + "loss": 2.3756, + "step": 2851 + }, + { + "epoch": 0.15300429184549355, + "grad_norm": 0.302734375, + "learning_rate": 4.999995266194804e-06, + "loss": 2.4075, + "step": 2852 + }, + { + "epoch": 0.15305793991416308, + "grad_norm": 0.484375, + "learning_rate": 4.999995095620884e-06, + "loss": 2.4858, + "step": 2853 + }, + { + "epoch": 0.15311158798283261, + "grad_norm": 0.318359375, + "learning_rate": 4.99999492202796e-06, + "loss": 2.4783, + "step": 2854 + }, + { + "epoch": 0.15316523605150215, + "grad_norm": 0.7890625, + "learning_rate": 4.999994745416033e-06, + "loss": 2.1558, + "step": 2855 + }, + { + "epoch": 0.15321888412017168, + "grad_norm": 0.30859375, + "learning_rate": 4.999994565785105e-06, + "loss": 2.2787, + "step": 2856 + }, + { + "epoch": 0.1532725321888412, + "grad_norm": 0.318359375, + "learning_rate": 4.999994383135172e-06, + "loss": 2.2524, + "step": 2857 + }, + { + "epoch": 0.15332618025751074, + "grad_norm": 0.345703125, + "learning_rate": 4.999994197466238e-06, + "loss": 2.2235, + "step": 2858 + }, + { + "epoch": 0.15337982832618025, + "grad_norm": 0.388671875, + "learning_rate": 4.9999940087783e-06, + "loss": 2.1485, + "step": 2859 + }, + { + "epoch": 0.15343347639484978, + "grad_norm": 0.4609375, + "learning_rate": 4.999993817071361e-06, + "loss": 2.3396, + "step": 2860 + }, + { + "epoch": 0.1534871244635193, + "grad_norm": 2.984375, + "learning_rate": 4.9999936223454195e-06, + "loss": 2.4769, + "step": 2861 + }, + { + "epoch": 0.15354077253218884, + "grad_norm": 0.6015625, + "learning_rate": 4.999993424600476e-06, + "loss": 2.4733, + "step": 2862 + }, + { + "epoch": 0.15359442060085837, + "grad_norm": 0.431640625, + "learning_rate": 4.999993223836532e-06, + "loss": 2.0257, + "step": 2863 + }, + { + "epoch": 0.1536480686695279, + "grad_norm": 0.55078125, + "learning_rate": 4.999993020053587e-06, + "loss": 2.4673, + "step": 2864 + }, + { + "epoch": 0.15370171673819744, + "grad_norm": 0.3984375, + "learning_rate": 4.9999928132516404e-06, + "loss": 2.5639, + "step": 2865 + }, + { + "epoch": 0.15375536480686694, + "grad_norm": 0.28515625, + "learning_rate": 4.999992603430693e-06, + "loss": 2.1499, + "step": 2866 + }, + { + "epoch": 0.15380901287553647, + "grad_norm": 0.35546875, + "learning_rate": 4.9999923905907455e-06, + "loss": 2.4858, + "step": 2867 + }, + { + "epoch": 0.153862660944206, + "grad_norm": 0.333984375, + "learning_rate": 4.999992174731798e-06, + "loss": 2.4142, + "step": 2868 + }, + { + "epoch": 0.15391630901287554, + "grad_norm": 0.353515625, + "learning_rate": 4.99999195585385e-06, + "loss": 2.054, + "step": 2869 + }, + { + "epoch": 0.15396995708154507, + "grad_norm": 0.345703125, + "learning_rate": 4.999991733956902e-06, + "loss": 2.0872, + "step": 2870 + }, + { + "epoch": 0.1540236051502146, + "grad_norm": 0.359375, + "learning_rate": 4.999991509040956e-06, + "loss": 2.323, + "step": 2871 + }, + { + "epoch": 0.15407725321888413, + "grad_norm": 0.333984375, + "learning_rate": 4.99999128110601e-06, + "loss": 2.2234, + "step": 2872 + }, + { + "epoch": 0.15413090128755363, + "grad_norm": 0.341796875, + "learning_rate": 4.999991050152065e-06, + "loss": 2.2082, + "step": 2873 + }, + { + "epoch": 0.15418454935622317, + "grad_norm": 0.4296875, + "learning_rate": 4.999990816179122e-06, + "loss": 2.1567, + "step": 2874 + }, + { + "epoch": 0.1542381974248927, + "grad_norm": 0.4296875, + "learning_rate": 4.999990579187181e-06, + "loss": 2.1189, + "step": 2875 + }, + { + "epoch": 0.15429184549356223, + "grad_norm": 0.310546875, + "learning_rate": 4.9999903391762415e-06, + "loss": 2.105, + "step": 2876 + }, + { + "epoch": 0.15434549356223176, + "grad_norm": 0.326171875, + "learning_rate": 4.999990096146304e-06, + "loss": 2.3419, + "step": 2877 + }, + { + "epoch": 0.1543991416309013, + "grad_norm": 0.302734375, + "learning_rate": 4.999989850097369e-06, + "loss": 2.5081, + "step": 2878 + }, + { + "epoch": 0.15445278969957082, + "grad_norm": 1.7734375, + "learning_rate": 4.999989601029437e-06, + "loss": 2.4285, + "step": 2879 + }, + { + "epoch": 0.15450643776824036, + "grad_norm": 0.8203125, + "learning_rate": 4.999989348942509e-06, + "loss": 2.2725, + "step": 2880 + }, + { + "epoch": 0.15456008583690986, + "grad_norm": 0.33984375, + "learning_rate": 4.999989093836584e-06, + "loss": 2.2848, + "step": 2881 + }, + { + "epoch": 0.1546137339055794, + "grad_norm": 0.32421875, + "learning_rate": 4.9999888357116625e-06, + "loss": 2.1026, + "step": 2882 + }, + { + "epoch": 0.15466738197424892, + "grad_norm": 0.310546875, + "learning_rate": 4.999988574567746e-06, + "loss": 2.2956, + "step": 2883 + }, + { + "epoch": 0.15472103004291846, + "grad_norm": 0.318359375, + "learning_rate": 4.999988310404832e-06, + "loss": 2.2701, + "step": 2884 + }, + { + "epoch": 0.154774678111588, + "grad_norm": 0.349609375, + "learning_rate": 4.999988043222924e-06, + "loss": 2.4013, + "step": 2885 + }, + { + "epoch": 0.15482832618025752, + "grad_norm": 1.2734375, + "learning_rate": 4.999987773022021e-06, + "loss": 2.6425, + "step": 2886 + }, + { + "epoch": 0.15488197424892705, + "grad_norm": 0.34375, + "learning_rate": 4.999987499802124e-06, + "loss": 2.1413, + "step": 2887 + }, + { + "epoch": 0.15493562231759656, + "grad_norm": 0.60546875, + "learning_rate": 4.999987223563232e-06, + "loss": 2.3177, + "step": 2888 + }, + { + "epoch": 0.1549892703862661, + "grad_norm": 0.3984375, + "learning_rate": 4.9999869443053465e-06, + "loss": 2.1482, + "step": 2889 + }, + { + "epoch": 0.15504291845493562, + "grad_norm": 0.419921875, + "learning_rate": 4.999986662028468e-06, + "loss": 2.2753, + "step": 2890 + }, + { + "epoch": 0.15509656652360515, + "grad_norm": 0.318359375, + "learning_rate": 4.999986376732595e-06, + "loss": 2.3597, + "step": 2891 + }, + { + "epoch": 0.15515021459227468, + "grad_norm": 0.515625, + "learning_rate": 4.9999860884177295e-06, + "loss": 2.0585, + "step": 2892 + }, + { + "epoch": 0.15520386266094421, + "grad_norm": 0.361328125, + "learning_rate": 4.999985797083871e-06, + "loss": 2.3373, + "step": 2893 + }, + { + "epoch": 0.15525751072961375, + "grad_norm": 0.33203125, + "learning_rate": 4.9999855027310205e-06, + "loss": 2.3513, + "step": 2894 + }, + { + "epoch": 0.15531115879828325, + "grad_norm": 0.8984375, + "learning_rate": 4.999985205359178e-06, + "loss": 2.5046, + "step": 2895 + }, + { + "epoch": 0.15536480686695278, + "grad_norm": 0.375, + "learning_rate": 4.999984904968345e-06, + "loss": 1.2947, + "step": 2896 + }, + { + "epoch": 0.1554184549356223, + "grad_norm": 0.27734375, + "learning_rate": 4.99998460155852e-06, + "loss": 2.3581, + "step": 2897 + }, + { + "epoch": 0.15547210300429185, + "grad_norm": 0.435546875, + "learning_rate": 4.999984295129704e-06, + "loss": 2.2245, + "step": 2898 + }, + { + "epoch": 0.15552575107296138, + "grad_norm": 0.455078125, + "learning_rate": 4.999983985681899e-06, + "loss": 1.9621, + "step": 2899 + }, + { + "epoch": 0.1555793991416309, + "grad_norm": 0.361328125, + "learning_rate": 4.999983673215102e-06, + "loss": 2.2669, + "step": 2900 + }, + { + "epoch": 0.15563304721030044, + "grad_norm": 0.453125, + "learning_rate": 4.999983357729317e-06, + "loss": 1.9368, + "step": 2901 + }, + { + "epoch": 0.15568669527896994, + "grad_norm": 0.3203125, + "learning_rate": 4.999983039224542e-06, + "loss": 2.4738, + "step": 2902 + }, + { + "epoch": 0.15574034334763948, + "grad_norm": 0.30859375, + "learning_rate": 4.999982717700778e-06, + "loss": 2.6183, + "step": 2903 + }, + { + "epoch": 0.155793991416309, + "grad_norm": 0.3828125, + "learning_rate": 4.999982393158026e-06, + "loss": 2.2388, + "step": 2904 + }, + { + "epoch": 0.15584763948497854, + "grad_norm": 0.412109375, + "learning_rate": 4.999982065596286e-06, + "loss": 2.4618, + "step": 2905 + }, + { + "epoch": 0.15590128755364807, + "grad_norm": 0.296875, + "learning_rate": 4.999981735015557e-06, + "loss": 2.3603, + "step": 2906 + }, + { + "epoch": 0.1559549356223176, + "grad_norm": 0.326171875, + "learning_rate": 4.999981401415842e-06, + "loss": 2.4621, + "step": 2907 + }, + { + "epoch": 0.15600858369098713, + "grad_norm": 0.341796875, + "learning_rate": 4.999981064797139e-06, + "loss": 2.1829, + "step": 2908 + }, + { + "epoch": 0.15606223175965664, + "grad_norm": 0.365234375, + "learning_rate": 4.99998072515945e-06, + "loss": 1.814, + "step": 2909 + }, + { + "epoch": 0.15611587982832617, + "grad_norm": 0.267578125, + "learning_rate": 4.999980382502776e-06, + "loss": 2.3075, + "step": 2910 + }, + { + "epoch": 0.1561695278969957, + "grad_norm": 0.359375, + "learning_rate": 4.999980036827115e-06, + "loss": 2.1264, + "step": 2911 + }, + { + "epoch": 0.15622317596566523, + "grad_norm": 0.31640625, + "learning_rate": 4.999979688132469e-06, + "loss": 2.3834, + "step": 2912 + }, + { + "epoch": 0.15627682403433477, + "grad_norm": 0.328125, + "learning_rate": 4.999979336418838e-06, + "loss": 2.3468, + "step": 2913 + }, + { + "epoch": 0.1563304721030043, + "grad_norm": 0.29296875, + "learning_rate": 4.999978981686223e-06, + "loss": 2.4374, + "step": 2914 + }, + { + "epoch": 0.15638412017167383, + "grad_norm": 0.27734375, + "learning_rate": 4.999978623934623e-06, + "loss": 2.1806, + "step": 2915 + }, + { + "epoch": 0.15643776824034336, + "grad_norm": 0.3125, + "learning_rate": 4.9999782631640415e-06, + "loss": 2.442, + "step": 2916 + }, + { + "epoch": 0.15649141630901287, + "grad_norm": 0.4453125, + "learning_rate": 4.999977899374475e-06, + "loss": 2.4856, + "step": 2917 + }, + { + "epoch": 0.1565450643776824, + "grad_norm": 0.322265625, + "learning_rate": 4.999977532565927e-06, + "loss": 2.0513, + "step": 2918 + }, + { + "epoch": 0.15659871244635193, + "grad_norm": 0.5859375, + "learning_rate": 4.999977162738396e-06, + "loss": 2.3431, + "step": 2919 + }, + { + "epoch": 0.15665236051502146, + "grad_norm": 0.26171875, + "learning_rate": 4.999976789891884e-06, + "loss": 2.2557, + "step": 2920 + }, + { + "epoch": 0.156706008583691, + "grad_norm": 0.2890625, + "learning_rate": 4.99997641402639e-06, + "loss": 2.2875, + "step": 2921 + }, + { + "epoch": 0.15675965665236052, + "grad_norm": 0.314453125, + "learning_rate": 4.999976035141915e-06, + "loss": 2.2785, + "step": 2922 + }, + { + "epoch": 0.15681330472103006, + "grad_norm": 0.33203125, + "learning_rate": 4.99997565323846e-06, + "loss": 2.3567, + "step": 2923 + }, + { + "epoch": 0.15686695278969956, + "grad_norm": 0.36328125, + "learning_rate": 4.9999752683160255e-06, + "loss": 2.2314, + "step": 2924 + }, + { + "epoch": 0.1569206008583691, + "grad_norm": 0.392578125, + "learning_rate": 4.99997488037461e-06, + "loss": 2.3358, + "step": 2925 + }, + { + "epoch": 0.15697424892703862, + "grad_norm": 0.32421875, + "learning_rate": 4.9999744894142166e-06, + "loss": 2.4157, + "step": 2926 + }, + { + "epoch": 0.15702789699570815, + "grad_norm": 0.296875, + "learning_rate": 4.999974095434844e-06, + "loss": 2.3627, + "step": 2927 + }, + { + "epoch": 0.1570815450643777, + "grad_norm": 0.376953125, + "learning_rate": 4.9999736984364935e-06, + "loss": 2.711, + "step": 2928 + }, + { + "epoch": 0.15713519313304722, + "grad_norm": 0.3125, + "learning_rate": 4.999973298419166e-06, + "loss": 2.2503, + "step": 2929 + }, + { + "epoch": 0.15718884120171675, + "grad_norm": 0.302734375, + "learning_rate": 4.999972895382861e-06, + "loss": 2.3112, + "step": 2930 + }, + { + "epoch": 0.15724248927038625, + "grad_norm": 0.37890625, + "learning_rate": 4.999972489327579e-06, + "loss": 2.2113, + "step": 2931 + }, + { + "epoch": 0.15729613733905579, + "grad_norm": 0.298828125, + "learning_rate": 4.999972080253321e-06, + "loss": 2.4061, + "step": 2932 + }, + { + "epoch": 0.15734978540772532, + "grad_norm": 0.3203125, + "learning_rate": 4.999971668160088e-06, + "loss": 2.2865, + "step": 2933 + }, + { + "epoch": 0.15740343347639485, + "grad_norm": 0.333984375, + "learning_rate": 4.99997125304788e-06, + "loss": 2.6153, + "step": 2934 + }, + { + "epoch": 0.15745708154506438, + "grad_norm": 0.29296875, + "learning_rate": 4.999970834916696e-06, + "loss": 2.364, + "step": 2935 + }, + { + "epoch": 0.1575107296137339, + "grad_norm": 0.35546875, + "learning_rate": 4.999970413766538e-06, + "loss": 2.3354, + "step": 2936 + }, + { + "epoch": 0.15756437768240344, + "grad_norm": 0.33203125, + "learning_rate": 4.999969989597407e-06, + "loss": 2.5057, + "step": 2937 + }, + { + "epoch": 0.15761802575107295, + "grad_norm": 0.431640625, + "learning_rate": 4.999969562409303e-06, + "loss": 2.2805, + "step": 2938 + }, + { + "epoch": 0.15767167381974248, + "grad_norm": 0.302734375, + "learning_rate": 4.999969132202226e-06, + "loss": 2.2487, + "step": 2939 + }, + { + "epoch": 0.157725321888412, + "grad_norm": 0.306640625, + "learning_rate": 4.999968698976178e-06, + "loss": 2.063, + "step": 2940 + }, + { + "epoch": 0.15777896995708154, + "grad_norm": 0.296875, + "learning_rate": 4.999968262731157e-06, + "loss": 2.3068, + "step": 2941 + }, + { + "epoch": 0.15783261802575108, + "grad_norm": 0.357421875, + "learning_rate": 4.999967823467165e-06, + "loss": 2.5461, + "step": 2942 + }, + { + "epoch": 0.1578862660944206, + "grad_norm": 0.404296875, + "learning_rate": 4.999967381184203e-06, + "loss": 2.5751, + "step": 2943 + }, + { + "epoch": 0.15793991416309014, + "grad_norm": 0.427734375, + "learning_rate": 4.999966935882271e-06, + "loss": 2.3922, + "step": 2944 + }, + { + "epoch": 0.15799356223175964, + "grad_norm": 0.33984375, + "learning_rate": 4.99996648756137e-06, + "loss": 2.4071, + "step": 2945 + }, + { + "epoch": 0.15804721030042918, + "grad_norm": 0.349609375, + "learning_rate": 4.9999660362214996e-06, + "loss": 2.31, + "step": 2946 + }, + { + "epoch": 0.1581008583690987, + "grad_norm": 0.322265625, + "learning_rate": 4.999965581862661e-06, + "loss": 2.2755, + "step": 2947 + }, + { + "epoch": 0.15815450643776824, + "grad_norm": 0.37890625, + "learning_rate": 4.999965124484854e-06, + "loss": 2.5833, + "step": 2948 + }, + { + "epoch": 0.15820815450643777, + "grad_norm": 0.3515625, + "learning_rate": 4.9999646640880805e-06, + "loss": 2.4116, + "step": 2949 + }, + { + "epoch": 0.1582618025751073, + "grad_norm": 0.3203125, + "learning_rate": 4.99996420067234e-06, + "loss": 2.4404, + "step": 2950 + }, + { + "epoch": 0.15831545064377683, + "grad_norm": 0.353515625, + "learning_rate": 4.999963734237634e-06, + "loss": 2.282, + "step": 2951 + }, + { + "epoch": 0.15836909871244637, + "grad_norm": 0.3359375, + "learning_rate": 4.999963264783961e-06, + "loss": 2.265, + "step": 2952 + }, + { + "epoch": 0.15842274678111587, + "grad_norm": 0.41015625, + "learning_rate": 4.999962792311324e-06, + "loss": 2.3552, + "step": 2953 + }, + { + "epoch": 0.1584763948497854, + "grad_norm": 0.3203125, + "learning_rate": 4.999962316819722e-06, + "loss": 2.463, + "step": 2954 + }, + { + "epoch": 0.15853004291845493, + "grad_norm": 0.333984375, + "learning_rate": 4.999961838309156e-06, + "loss": 2.2893, + "step": 2955 + }, + { + "epoch": 0.15858369098712446, + "grad_norm": 0.373046875, + "learning_rate": 4.999961356779628e-06, + "loss": 2.3824, + "step": 2956 + }, + { + "epoch": 0.158637339055794, + "grad_norm": 0.33984375, + "learning_rate": 4.999960872231137e-06, + "loss": 2.5831, + "step": 2957 + }, + { + "epoch": 0.15869098712446353, + "grad_norm": 0.328125, + "learning_rate": 4.999960384663683e-06, + "loss": 2.289, + "step": 2958 + }, + { + "epoch": 0.15874463519313306, + "grad_norm": 0.34765625, + "learning_rate": 4.999959894077267e-06, + "loss": 2.1898, + "step": 2959 + }, + { + "epoch": 0.15879828326180256, + "grad_norm": 1.484375, + "learning_rate": 4.999959400471892e-06, + "loss": 2.5676, + "step": 2960 + }, + { + "epoch": 0.1588519313304721, + "grad_norm": 0.322265625, + "learning_rate": 4.999958903847555e-06, + "loss": 2.4441, + "step": 2961 + }, + { + "epoch": 0.15890557939914163, + "grad_norm": 0.3203125, + "learning_rate": 4.999958404204259e-06, + "loss": 2.4836, + "step": 2962 + }, + { + "epoch": 0.15895922746781116, + "grad_norm": 0.306640625, + "learning_rate": 4.999957901542004e-06, + "loss": 2.2034, + "step": 2963 + }, + { + "epoch": 0.1590128755364807, + "grad_norm": 0.310546875, + "learning_rate": 4.99995739586079e-06, + "loss": 2.205, + "step": 2964 + }, + { + "epoch": 0.15906652360515022, + "grad_norm": 0.357421875, + "learning_rate": 4.999956887160618e-06, + "loss": 2.3255, + "step": 2965 + }, + { + "epoch": 0.15912017167381975, + "grad_norm": 0.345703125, + "learning_rate": 4.999956375441489e-06, + "loss": 2.5908, + "step": 2966 + }, + { + "epoch": 0.15917381974248926, + "grad_norm": 0.322265625, + "learning_rate": 4.999955860703404e-06, + "loss": 2.2493, + "step": 2967 + }, + { + "epoch": 0.1592274678111588, + "grad_norm": 0.384765625, + "learning_rate": 4.9999553429463615e-06, + "loss": 2.4873, + "step": 2968 + }, + { + "epoch": 0.15928111587982832, + "grad_norm": 0.3359375, + "learning_rate": 4.999954822170364e-06, + "loss": 2.3738, + "step": 2969 + }, + { + "epoch": 0.15933476394849785, + "grad_norm": 0.349609375, + "learning_rate": 4.999954298375412e-06, + "loss": 2.2892, + "step": 2970 + }, + { + "epoch": 0.15938841201716739, + "grad_norm": 0.44921875, + "learning_rate": 4.999953771561506e-06, + "loss": 1.9496, + "step": 2971 + }, + { + "epoch": 0.15944206008583692, + "grad_norm": 0.3828125, + "learning_rate": 4.999953241728646e-06, + "loss": 2.3711, + "step": 2972 + }, + { + "epoch": 0.15949570815450645, + "grad_norm": 0.353515625, + "learning_rate": 4.999952708876834e-06, + "loss": 2.2203, + "step": 2973 + }, + { + "epoch": 0.15954935622317595, + "grad_norm": 0.458984375, + "learning_rate": 4.999952173006069e-06, + "loss": 2.4476, + "step": 2974 + }, + { + "epoch": 0.15960300429184548, + "grad_norm": 0.859375, + "learning_rate": 4.999951634116352e-06, + "loss": 2.3961, + "step": 2975 + }, + { + "epoch": 0.15965665236051502, + "grad_norm": 0.365234375, + "learning_rate": 4.9999510922076845e-06, + "loss": 2.3185, + "step": 2976 + }, + { + "epoch": 0.15971030042918455, + "grad_norm": 0.3203125, + "learning_rate": 4.999950547280067e-06, + "loss": 2.4042, + "step": 2977 + }, + { + "epoch": 0.15976394849785408, + "grad_norm": 0.39453125, + "learning_rate": 4.9999499993335e-06, + "loss": 2.2289, + "step": 2978 + }, + { + "epoch": 0.1598175965665236, + "grad_norm": 0.34375, + "learning_rate": 4.9999494483679824e-06, + "loss": 2.3645, + "step": 2979 + }, + { + "epoch": 0.15987124463519314, + "grad_norm": 0.38671875, + "learning_rate": 4.999948894383519e-06, + "loss": 2.1757, + "step": 2980 + }, + { + "epoch": 0.15992489270386265, + "grad_norm": 0.5234375, + "learning_rate": 4.999948337380106e-06, + "loss": 2.2433, + "step": 2981 + }, + { + "epoch": 0.15997854077253218, + "grad_norm": 0.26953125, + "learning_rate": 4.999947777357746e-06, + "loss": 2.2636, + "step": 2982 + }, + { + "epoch": 0.1600321888412017, + "grad_norm": 0.41015625, + "learning_rate": 4.999947214316441e-06, + "loss": 2.299, + "step": 2983 + }, + { + "epoch": 0.16008583690987124, + "grad_norm": 0.31640625, + "learning_rate": 4.9999466482561885e-06, + "loss": 2.3283, + "step": 2984 + }, + { + "epoch": 0.16013948497854077, + "grad_norm": 0.345703125, + "learning_rate": 4.999946079176993e-06, + "loss": 2.4906, + "step": 2985 + }, + { + "epoch": 0.1601931330472103, + "grad_norm": 0.375, + "learning_rate": 4.999945507078852e-06, + "loss": 1.9057, + "step": 2986 + }, + { + "epoch": 0.16024678111587984, + "grad_norm": 0.298828125, + "learning_rate": 4.9999449319617684e-06, + "loss": 2.2467, + "step": 2987 + }, + { + "epoch": 0.16030042918454937, + "grad_norm": 0.412109375, + "learning_rate": 4.999944353825741e-06, + "loss": 2.3052, + "step": 2988 + }, + { + "epoch": 0.16035407725321887, + "grad_norm": 0.33203125, + "learning_rate": 4.999943772670772e-06, + "loss": 2.182, + "step": 2989 + }, + { + "epoch": 0.1604077253218884, + "grad_norm": 0.337890625, + "learning_rate": 4.999943188496862e-06, + "loss": 2.4651, + "step": 2990 + }, + { + "epoch": 0.16046137339055794, + "grad_norm": 1.53125, + "learning_rate": 4.9999426013040095e-06, + "loss": 2.2734, + "step": 2991 + }, + { + "epoch": 0.16051502145922747, + "grad_norm": 0.310546875, + "learning_rate": 4.999942011092218e-06, + "loss": 2.4753, + "step": 2992 + }, + { + "epoch": 0.160568669527897, + "grad_norm": 0.76953125, + "learning_rate": 4.999941417861488e-06, + "loss": 1.1445, + "step": 2993 + }, + { + "epoch": 0.16062231759656653, + "grad_norm": 0.435546875, + "learning_rate": 4.999940821611819e-06, + "loss": 2.3508, + "step": 2994 + }, + { + "epoch": 0.16067596566523606, + "grad_norm": 0.298828125, + "learning_rate": 4.999940222343212e-06, + "loss": 2.2533, + "step": 2995 + }, + { + "epoch": 0.16072961373390557, + "grad_norm": 0.341796875, + "learning_rate": 4.9999396200556665e-06, + "loss": 2.1818, + "step": 2996 + }, + { + "epoch": 0.1607832618025751, + "grad_norm": 0.294921875, + "learning_rate": 4.999939014749186e-06, + "loss": 2.1984, + "step": 2997 + }, + { + "epoch": 0.16083690987124463, + "grad_norm": 0.322265625, + "learning_rate": 4.999938406423769e-06, + "loss": 2.1887, + "step": 2998 + }, + { + "epoch": 0.16089055793991416, + "grad_norm": 0.3125, + "learning_rate": 4.999937795079417e-06, + "loss": 2.448, + "step": 2999 + }, + { + "epoch": 0.1609442060085837, + "grad_norm": 0.3125, + "learning_rate": 4.999937180716132e-06, + "loss": 2.1849, + "step": 3000 + }, + { + "epoch": 0.16099785407725323, + "grad_norm": 0.56640625, + "learning_rate": 4.999936563333912e-06, + "loss": 2.3685, + "step": 3001 + }, + { + "epoch": 0.16105150214592276, + "grad_norm": 0.390625, + "learning_rate": 4.99993594293276e-06, + "loss": 2.2872, + "step": 3002 + }, + { + "epoch": 0.16110515021459226, + "grad_norm": 0.333984375, + "learning_rate": 4.999935319512675e-06, + "loss": 2.2782, + "step": 3003 + }, + { + "epoch": 0.1611587982832618, + "grad_norm": 0.30078125, + "learning_rate": 4.99993469307366e-06, + "loss": 2.0301, + "step": 3004 + }, + { + "epoch": 0.16121244635193133, + "grad_norm": 0.298828125, + "learning_rate": 4.999934063615715e-06, + "loss": 2.313, + "step": 3005 + }, + { + "epoch": 0.16126609442060086, + "grad_norm": 0.33203125, + "learning_rate": 4.999933431138839e-06, + "loss": 2.2765, + "step": 3006 + }, + { + "epoch": 0.1613197424892704, + "grad_norm": 0.328125, + "learning_rate": 4.999932795643034e-06, + "loss": 2.4604, + "step": 3007 + }, + { + "epoch": 0.16137339055793992, + "grad_norm": 0.44140625, + "learning_rate": 4.999932157128301e-06, + "loss": 1.8302, + "step": 3008 + }, + { + "epoch": 0.16142703862660945, + "grad_norm": 0.375, + "learning_rate": 4.99993151559464e-06, + "loss": 2.1337, + "step": 3009 + }, + { + "epoch": 0.16148068669527896, + "grad_norm": 0.380859375, + "learning_rate": 4.9999308710420535e-06, + "loss": 2.3498, + "step": 3010 + }, + { + "epoch": 0.1615343347639485, + "grad_norm": 0.96484375, + "learning_rate": 4.999930223470541e-06, + "loss": 2.1496, + "step": 3011 + }, + { + "epoch": 0.16158798283261802, + "grad_norm": 0.333984375, + "learning_rate": 4.999929572880103e-06, + "loss": 2.5164, + "step": 3012 + }, + { + "epoch": 0.16164163090128755, + "grad_norm": 0.341796875, + "learning_rate": 4.99992891927074e-06, + "loss": 1.9714, + "step": 3013 + }, + { + "epoch": 0.16169527896995708, + "grad_norm": 0.359375, + "learning_rate": 4.999928262642455e-06, + "loss": 2.3774, + "step": 3014 + }, + { + "epoch": 0.16174892703862662, + "grad_norm": 0.341796875, + "learning_rate": 4.999927602995246e-06, + "loss": 2.3247, + "step": 3015 + }, + { + "epoch": 0.16180257510729615, + "grad_norm": 0.3828125, + "learning_rate": 4.999926940329116e-06, + "loss": 2.3886, + "step": 3016 + }, + { + "epoch": 0.16185622317596565, + "grad_norm": 0.515625, + "learning_rate": 4.999926274644064e-06, + "loss": 2.2071, + "step": 3017 + }, + { + "epoch": 0.16190987124463518, + "grad_norm": 0.3828125, + "learning_rate": 4.999925605940092e-06, + "loss": 2.4894, + "step": 3018 + }, + { + "epoch": 0.16196351931330472, + "grad_norm": 0.3984375, + "learning_rate": 4.999924934217201e-06, + "loss": 1.9559, + "step": 3019 + }, + { + "epoch": 0.16201716738197425, + "grad_norm": 0.333984375, + "learning_rate": 4.99992425947539e-06, + "loss": 2.2218, + "step": 3020 + }, + { + "epoch": 0.16207081545064378, + "grad_norm": 0.34375, + "learning_rate": 4.999923581714662e-06, + "loss": 2.432, + "step": 3021 + }, + { + "epoch": 0.1621244635193133, + "grad_norm": 0.322265625, + "learning_rate": 4.999922900935017e-06, + "loss": 2.1085, + "step": 3022 + }, + { + "epoch": 0.16217811158798284, + "grad_norm": 0.44921875, + "learning_rate": 4.999922217136455e-06, + "loss": 2.0134, + "step": 3023 + }, + { + "epoch": 0.16223175965665235, + "grad_norm": 1.4140625, + "learning_rate": 4.999921530318978e-06, + "loss": 2.4152, + "step": 3024 + }, + { + "epoch": 0.16228540772532188, + "grad_norm": 0.27734375, + "learning_rate": 4.999920840482586e-06, + "loss": 2.1376, + "step": 3025 + }, + { + "epoch": 0.1623390557939914, + "grad_norm": 0.3828125, + "learning_rate": 4.999920147627281e-06, + "loss": 2.5019, + "step": 3026 + }, + { + "epoch": 0.16239270386266094, + "grad_norm": 0.306640625, + "learning_rate": 4.999919451753063e-06, + "loss": 2.3584, + "step": 3027 + }, + { + "epoch": 0.16244635193133047, + "grad_norm": 0.35546875, + "learning_rate": 4.999918752859933e-06, + "loss": 2.4754, + "step": 3028 + }, + { + "epoch": 0.1625, + "grad_norm": 0.337890625, + "learning_rate": 4.999918050947891e-06, + "loss": 2.1221, + "step": 3029 + }, + { + "epoch": 0.16255364806866954, + "grad_norm": 0.361328125, + "learning_rate": 4.999917346016939e-06, + "loss": 2.533, + "step": 3030 + }, + { + "epoch": 0.16260729613733907, + "grad_norm": 0.453125, + "learning_rate": 4.999916638067077e-06, + "loss": 2.2593, + "step": 3031 + }, + { + "epoch": 0.16266094420600857, + "grad_norm": 0.33984375, + "learning_rate": 4.9999159270983075e-06, + "loss": 2.363, + "step": 3032 + }, + { + "epoch": 0.1627145922746781, + "grad_norm": 0.328125, + "learning_rate": 4.999915213110629e-06, + "loss": 2.3398, + "step": 3033 + }, + { + "epoch": 0.16276824034334764, + "grad_norm": 0.3046875, + "learning_rate": 4.999914496104044e-06, + "loss": 2.1531, + "step": 3034 + }, + { + "epoch": 0.16282188841201717, + "grad_norm": 0.39453125, + "learning_rate": 4.999913776078552e-06, + "loss": 2.4777, + "step": 3035 + }, + { + "epoch": 0.1628755364806867, + "grad_norm": 0.3125, + "learning_rate": 4.999913053034157e-06, + "loss": 2.2968, + "step": 3036 + }, + { + "epoch": 0.16292918454935623, + "grad_norm": 0.765625, + "learning_rate": 4.999912326970856e-06, + "loss": 2.4817, + "step": 3037 + }, + { + "epoch": 0.16298283261802576, + "grad_norm": 0.36328125, + "learning_rate": 4.999911597888652e-06, + "loss": 2.3244, + "step": 3038 + }, + { + "epoch": 0.16303648068669527, + "grad_norm": 0.330078125, + "learning_rate": 4.999910865787544e-06, + "loss": 2.1467, + "step": 3039 + }, + { + "epoch": 0.1630901287553648, + "grad_norm": 0.267578125, + "learning_rate": 4.999910130667536e-06, + "loss": 2.1953, + "step": 3040 + }, + { + "epoch": 0.16314377682403433, + "grad_norm": 0.55859375, + "learning_rate": 4.999909392528626e-06, + "loss": 2.1382, + "step": 3041 + }, + { + "epoch": 0.16319742489270386, + "grad_norm": 0.384765625, + "learning_rate": 4.999908651370816e-06, + "loss": 2.195, + "step": 3042 + }, + { + "epoch": 0.1632510729613734, + "grad_norm": 0.36328125, + "learning_rate": 4.999907907194108e-06, + "loss": 2.5342, + "step": 3043 + }, + { + "epoch": 0.16330472103004293, + "grad_norm": 0.318359375, + "learning_rate": 4.9999071599985015e-06, + "loss": 2.3359, + "step": 3044 + }, + { + "epoch": 0.16335836909871246, + "grad_norm": 0.5234375, + "learning_rate": 4.999906409783997e-06, + "loss": 2.4478, + "step": 3045 + }, + { + "epoch": 0.16341201716738196, + "grad_norm": 0.345703125, + "learning_rate": 4.999905656550597e-06, + "loss": 2.4441, + "step": 3046 + }, + { + "epoch": 0.1634656652360515, + "grad_norm": 0.33984375, + "learning_rate": 4.999904900298301e-06, + "loss": 2.3957, + "step": 3047 + }, + { + "epoch": 0.16351931330472103, + "grad_norm": 0.359375, + "learning_rate": 4.999904141027111e-06, + "loss": 2.582, + "step": 3048 + }, + { + "epoch": 0.16357296137339056, + "grad_norm": 0.2890625, + "learning_rate": 4.9999033787370275e-06, + "loss": 2.0739, + "step": 3049 + }, + { + "epoch": 0.1636266094420601, + "grad_norm": 0.34765625, + "learning_rate": 4.99990261342805e-06, + "loss": 2.1627, + "step": 3050 + }, + { + "epoch": 0.16368025751072962, + "grad_norm": 0.34375, + "learning_rate": 4.999901845100182e-06, + "loss": 2.1991, + "step": 3051 + }, + { + "epoch": 0.16373390557939915, + "grad_norm": 0.296875, + "learning_rate": 4.999901073753423e-06, + "loss": 2.2931, + "step": 3052 + }, + { + "epoch": 0.16378755364806866, + "grad_norm": 0.32421875, + "learning_rate": 4.999900299387774e-06, + "loss": 2.2556, + "step": 3053 + }, + { + "epoch": 0.1638412017167382, + "grad_norm": 0.32421875, + "learning_rate": 4.9998995220032365e-06, + "loss": 2.2973, + "step": 3054 + }, + { + "epoch": 0.16389484978540772, + "grad_norm": 0.375, + "learning_rate": 4.99989874159981e-06, + "loss": 2.3728, + "step": 3055 + }, + { + "epoch": 0.16394849785407725, + "grad_norm": 0.287109375, + "learning_rate": 4.999897958177497e-06, + "loss": 2.024, + "step": 3056 + }, + { + "epoch": 0.16400214592274678, + "grad_norm": 0.322265625, + "learning_rate": 4.999897171736298e-06, + "loss": 2.1755, + "step": 3057 + }, + { + "epoch": 0.16405579399141632, + "grad_norm": 0.28515625, + "learning_rate": 4.999896382276213e-06, + "loss": 1.9975, + "step": 3058 + }, + { + "epoch": 0.16410944206008585, + "grad_norm": 0.33984375, + "learning_rate": 4.9998955897972445e-06, + "loss": 2.4196, + "step": 3059 + }, + { + "epoch": 0.16416309012875535, + "grad_norm": 0.3515625, + "learning_rate": 4.999894794299393e-06, + "loss": 2.508, + "step": 3060 + }, + { + "epoch": 0.16421673819742488, + "grad_norm": 0.3203125, + "learning_rate": 4.999893995782658e-06, + "loss": 2.2658, + "step": 3061 + }, + { + "epoch": 0.16427038626609441, + "grad_norm": 0.314453125, + "learning_rate": 4.999893194247042e-06, + "loss": 2.2202, + "step": 3062 + }, + { + "epoch": 0.16432403433476395, + "grad_norm": 0.34375, + "learning_rate": 4.999892389692546e-06, + "loss": 2.4654, + "step": 3063 + }, + { + "epoch": 0.16437768240343348, + "grad_norm": 0.29296875, + "learning_rate": 4.999891582119171e-06, + "loss": 2.2596, + "step": 3064 + }, + { + "epoch": 0.164431330472103, + "grad_norm": 0.314453125, + "learning_rate": 4.999890771526917e-06, + "loss": 2.2797, + "step": 3065 + }, + { + "epoch": 0.16448497854077254, + "grad_norm": 0.302734375, + "learning_rate": 4.999889957915786e-06, + "loss": 2.5245, + "step": 3066 + }, + { + "epoch": 0.16453862660944207, + "grad_norm": 0.322265625, + "learning_rate": 4.9998891412857776e-06, + "loss": 2.1606, + "step": 3067 + }, + { + "epoch": 0.16459227467811158, + "grad_norm": 0.396484375, + "learning_rate": 4.999888321636894e-06, + "loss": 2.2127, + "step": 3068 + }, + { + "epoch": 0.1646459227467811, + "grad_norm": 0.388671875, + "learning_rate": 4.999887498969136e-06, + "loss": 2.3409, + "step": 3069 + }, + { + "epoch": 0.16469957081545064, + "grad_norm": 0.34765625, + "learning_rate": 4.9998866732825046e-06, + "loss": 2.3322, + "step": 3070 + }, + { + "epoch": 0.16475321888412017, + "grad_norm": 0.388671875, + "learning_rate": 4.999885844577001e-06, + "loss": 2.6339, + "step": 3071 + }, + { + "epoch": 0.1648068669527897, + "grad_norm": 0.3984375, + "learning_rate": 4.9998850128526255e-06, + "loss": 2.3737, + "step": 3072 + }, + { + "epoch": 0.16486051502145924, + "grad_norm": 0.33984375, + "learning_rate": 4.99988417810938e-06, + "loss": 2.3553, + "step": 3073 + }, + { + "epoch": 0.16491416309012877, + "grad_norm": 0.361328125, + "learning_rate": 4.999883340347264e-06, + "loss": 2.1704, + "step": 3074 + }, + { + "epoch": 0.16496781115879827, + "grad_norm": 0.5546875, + "learning_rate": 4.99988249956628e-06, + "loss": 2.3842, + "step": 3075 + }, + { + "epoch": 0.1650214592274678, + "grad_norm": 1.109375, + "learning_rate": 4.999881655766429e-06, + "loss": 2.4809, + "step": 3076 + }, + { + "epoch": 0.16507510729613734, + "grad_norm": 0.29296875, + "learning_rate": 4.999880808947711e-06, + "loss": 1.9871, + "step": 3077 + }, + { + "epoch": 0.16512875536480687, + "grad_norm": 0.341796875, + "learning_rate": 4.999879959110128e-06, + "loss": 1.7171, + "step": 3078 + }, + { + "epoch": 0.1651824034334764, + "grad_norm": 0.33203125, + "learning_rate": 4.99987910625368e-06, + "loss": 2.4618, + "step": 3079 + }, + { + "epoch": 0.16523605150214593, + "grad_norm": 0.400390625, + "learning_rate": 4.9998782503783695e-06, + "loss": 2.3074, + "step": 3080 + }, + { + "epoch": 0.16528969957081546, + "grad_norm": 0.34375, + "learning_rate": 4.999877391484196e-06, + "loss": 2.4151, + "step": 3081 + }, + { + "epoch": 0.16534334763948497, + "grad_norm": 0.38671875, + "learning_rate": 4.99987652957116e-06, + "loss": 2.4674, + "step": 3082 + }, + { + "epoch": 0.1653969957081545, + "grad_norm": 0.431640625, + "learning_rate": 4.999875664639265e-06, + "loss": 2.2466, + "step": 3083 + }, + { + "epoch": 0.16545064377682403, + "grad_norm": 0.33203125, + "learning_rate": 4.999874796688512e-06, + "loss": 2.2667, + "step": 3084 + }, + { + "epoch": 0.16550429184549356, + "grad_norm": 0.349609375, + "learning_rate": 4.999873925718899e-06, + "loss": 2.3136, + "step": 3085 + }, + { + "epoch": 0.1655579399141631, + "grad_norm": 0.35546875, + "learning_rate": 4.9998730517304295e-06, + "loss": 2.1719, + "step": 3086 + }, + { + "epoch": 0.16561158798283263, + "grad_norm": 0.44140625, + "learning_rate": 4.999872174723104e-06, + "loss": 2.3216, + "step": 3087 + }, + { + "epoch": 0.16566523605150216, + "grad_norm": 0.314453125, + "learning_rate": 4.999871294696924e-06, + "loss": 2.5928, + "step": 3088 + }, + { + "epoch": 0.16571888412017166, + "grad_norm": 0.330078125, + "learning_rate": 4.999870411651889e-06, + "loss": 2.1592, + "step": 3089 + }, + { + "epoch": 0.1657725321888412, + "grad_norm": 0.349609375, + "learning_rate": 4.9998695255880015e-06, + "loss": 2.3946, + "step": 3090 + }, + { + "epoch": 0.16582618025751072, + "grad_norm": 0.314453125, + "learning_rate": 4.999868636505262e-06, + "loss": 2.3731, + "step": 3091 + }, + { + "epoch": 0.16587982832618026, + "grad_norm": 0.314453125, + "learning_rate": 4.999867744403672e-06, + "loss": 2.3471, + "step": 3092 + }, + { + "epoch": 0.1659334763948498, + "grad_norm": 0.306640625, + "learning_rate": 4.999866849283232e-06, + "loss": 2.4329, + "step": 3093 + }, + { + "epoch": 0.16598712446351932, + "grad_norm": 0.435546875, + "learning_rate": 4.999865951143945e-06, + "loss": 2.2495, + "step": 3094 + }, + { + "epoch": 0.16604077253218885, + "grad_norm": 0.373046875, + "learning_rate": 4.999865049985809e-06, + "loss": 2.351, + "step": 3095 + }, + { + "epoch": 0.16609442060085836, + "grad_norm": 0.33984375, + "learning_rate": 4.999864145808827e-06, + "loss": 2.3577, + "step": 3096 + }, + { + "epoch": 0.1661480686695279, + "grad_norm": 0.34765625, + "learning_rate": 4.9998632386129995e-06, + "loss": 2.3916, + "step": 3097 + }, + { + "epoch": 0.16620171673819742, + "grad_norm": 0.369140625, + "learning_rate": 4.999862328398328e-06, + "loss": 2.198, + "step": 3098 + }, + { + "epoch": 0.16625536480686695, + "grad_norm": 0.349609375, + "learning_rate": 4.999861415164814e-06, + "loss": 2.3444, + "step": 3099 + }, + { + "epoch": 0.16630901287553648, + "grad_norm": 0.3203125, + "learning_rate": 4.999860498912457e-06, + "loss": 2.3039, + "step": 3100 + }, + { + "epoch": 0.16636266094420601, + "grad_norm": 1.03125, + "learning_rate": 4.999859579641258e-06, + "loss": 2.2005, + "step": 3101 + }, + { + "epoch": 0.16641630901287555, + "grad_norm": 0.30859375, + "learning_rate": 4.999858657351222e-06, + "loss": 2.4575, + "step": 3102 + }, + { + "epoch": 0.16646995708154508, + "grad_norm": 0.396484375, + "learning_rate": 4.9998577320423455e-06, + "loss": 2.2672, + "step": 3103 + }, + { + "epoch": 0.16652360515021458, + "grad_norm": 0.49609375, + "learning_rate": 4.999856803714632e-06, + "loss": 2.4381, + "step": 3104 + }, + { + "epoch": 0.1665772532188841, + "grad_norm": 0.3046875, + "learning_rate": 4.999855872368081e-06, + "loss": 2.6286, + "step": 3105 + }, + { + "epoch": 0.16663090128755365, + "grad_norm": 0.466796875, + "learning_rate": 4.999854938002696e-06, + "loss": 2.3551, + "step": 3106 + }, + { + "epoch": 0.16668454935622318, + "grad_norm": 0.47265625, + "learning_rate": 4.9998540006184765e-06, + "loss": 2.291, + "step": 3107 + }, + { + "epoch": 0.1667381974248927, + "grad_norm": 0.302734375, + "learning_rate": 4.999853060215424e-06, + "loss": 1.9017, + "step": 3108 + }, + { + "epoch": 0.16679184549356224, + "grad_norm": 0.365234375, + "learning_rate": 4.999852116793539e-06, + "loss": 2.3767, + "step": 3109 + }, + { + "epoch": 0.16684549356223177, + "grad_norm": 0.3203125, + "learning_rate": 4.999851170352824e-06, + "loss": 2.1544, + "step": 3110 + }, + { + "epoch": 0.16689914163090128, + "grad_norm": 0.322265625, + "learning_rate": 4.999850220893279e-06, + "loss": 2.3262, + "step": 3111 + }, + { + "epoch": 0.1669527896995708, + "grad_norm": 0.3125, + "learning_rate": 4.999849268414905e-06, + "loss": 2.1154, + "step": 3112 + }, + { + "epoch": 0.16700643776824034, + "grad_norm": 0.30859375, + "learning_rate": 4.999848312917704e-06, + "loss": 2.1047, + "step": 3113 + }, + { + "epoch": 0.16706008583690987, + "grad_norm": 0.298828125, + "learning_rate": 4.999847354401677e-06, + "loss": 2.2553, + "step": 3114 + }, + { + "epoch": 0.1671137339055794, + "grad_norm": 0.341796875, + "learning_rate": 4.999846392866825e-06, + "loss": 2.3133, + "step": 3115 + }, + { + "epoch": 0.16716738197424894, + "grad_norm": 0.318359375, + "learning_rate": 4.999845428313149e-06, + "loss": 2.1893, + "step": 3116 + }, + { + "epoch": 0.16722103004291847, + "grad_norm": 0.3359375, + "learning_rate": 4.999844460740651e-06, + "loss": 2.3183, + "step": 3117 + }, + { + "epoch": 0.16727467811158797, + "grad_norm": 0.40625, + "learning_rate": 4.99984349014933e-06, + "loss": 2.3593, + "step": 3118 + }, + { + "epoch": 0.1673283261802575, + "grad_norm": 0.365234375, + "learning_rate": 4.9998425165391894e-06, + "loss": 2.2579, + "step": 3119 + }, + { + "epoch": 0.16738197424892703, + "grad_norm": 0.6484375, + "learning_rate": 4.999841539910229e-06, + "loss": 1.241, + "step": 3120 + }, + { + "epoch": 0.16743562231759657, + "grad_norm": 0.482421875, + "learning_rate": 4.999840560262452e-06, + "loss": 2.3806, + "step": 3121 + }, + { + "epoch": 0.1674892703862661, + "grad_norm": 0.4140625, + "learning_rate": 4.999839577595858e-06, + "loss": 2.258, + "step": 3122 + }, + { + "epoch": 0.16754291845493563, + "grad_norm": 0.30078125, + "learning_rate": 4.999838591910448e-06, + "loss": 2.2611, + "step": 3123 + }, + { + "epoch": 0.16759656652360516, + "grad_norm": 0.412109375, + "learning_rate": 4.999837603206222e-06, + "loss": 2.127, + "step": 3124 + }, + { + "epoch": 0.16765021459227467, + "grad_norm": 0.58203125, + "learning_rate": 4.999836611483184e-06, + "loss": 2.3895, + "step": 3125 + }, + { + "epoch": 0.1677038626609442, + "grad_norm": 0.408203125, + "learning_rate": 4.999835616741334e-06, + "loss": 2.4691, + "step": 3126 + }, + { + "epoch": 0.16775751072961373, + "grad_norm": 0.318359375, + "learning_rate": 4.9998346189806735e-06, + "loss": 2.2573, + "step": 3127 + }, + { + "epoch": 0.16781115879828326, + "grad_norm": 0.328125, + "learning_rate": 4.999833618201203e-06, + "loss": 2.2333, + "step": 3128 + }, + { + "epoch": 0.1678648068669528, + "grad_norm": 0.291015625, + "learning_rate": 4.999832614402924e-06, + "loss": 2.3284, + "step": 3129 + }, + { + "epoch": 0.16791845493562232, + "grad_norm": 0.353515625, + "learning_rate": 4.999831607585838e-06, + "loss": 2.3956, + "step": 3130 + }, + { + "epoch": 0.16797210300429186, + "grad_norm": 0.361328125, + "learning_rate": 4.999830597749946e-06, + "loss": 1.5601, + "step": 3131 + }, + { + "epoch": 0.16802575107296136, + "grad_norm": 0.30859375, + "learning_rate": 4.999829584895248e-06, + "loss": 2.3643, + "step": 3132 + }, + { + "epoch": 0.1680793991416309, + "grad_norm": 0.29296875, + "learning_rate": 4.999828569021748e-06, + "loss": 2.2138, + "step": 3133 + }, + { + "epoch": 0.16813304721030042, + "grad_norm": 0.34375, + "learning_rate": 4.999827550129445e-06, + "loss": 2.147, + "step": 3134 + }, + { + "epoch": 0.16818669527896996, + "grad_norm": 0.326171875, + "learning_rate": 4.999826528218341e-06, + "loss": 2.1964, + "step": 3135 + }, + { + "epoch": 0.1682403433476395, + "grad_norm": 0.34375, + "learning_rate": 4.999825503288437e-06, + "loss": 2.5229, + "step": 3136 + }, + { + "epoch": 0.16829399141630902, + "grad_norm": 0.453125, + "learning_rate": 4.999824475339734e-06, + "loss": 2.3371, + "step": 3137 + }, + { + "epoch": 0.16834763948497855, + "grad_norm": 0.3671875, + "learning_rate": 4.9998234443722344e-06, + "loss": 2.1122, + "step": 3138 + }, + { + "epoch": 0.16840128755364808, + "grad_norm": 0.625, + "learning_rate": 4.9998224103859384e-06, + "loss": 2.255, + "step": 3139 + }, + { + "epoch": 0.1684549356223176, + "grad_norm": 0.333984375, + "learning_rate": 4.999821373380847e-06, + "loss": 2.2843, + "step": 3140 + }, + { + "epoch": 0.16850858369098712, + "grad_norm": 0.2734375, + "learning_rate": 4.9998203333569625e-06, + "loss": 2.0095, + "step": 3141 + }, + { + "epoch": 0.16856223175965665, + "grad_norm": 0.291015625, + "learning_rate": 4.999819290314285e-06, + "loss": 2.4354, + "step": 3142 + }, + { + "epoch": 0.16861587982832618, + "grad_norm": 0.3046875, + "learning_rate": 4.999818244252816e-06, + "loss": 2.1551, + "step": 3143 + }, + { + "epoch": 0.1686695278969957, + "grad_norm": 0.376953125, + "learning_rate": 4.999817195172558e-06, + "loss": 2.3181, + "step": 3144 + }, + { + "epoch": 0.16872317596566525, + "grad_norm": 0.33984375, + "learning_rate": 4.999816143073511e-06, + "loss": 2.4014, + "step": 3145 + }, + { + "epoch": 0.16877682403433478, + "grad_norm": 0.34765625, + "learning_rate": 4.999815087955677e-06, + "loss": 2.2641, + "step": 3146 + }, + { + "epoch": 0.16883047210300428, + "grad_norm": 0.48046875, + "learning_rate": 4.999814029819058e-06, + "loss": 2.3694, + "step": 3147 + }, + { + "epoch": 0.1688841201716738, + "grad_norm": 0.302734375, + "learning_rate": 4.9998129686636524e-06, + "loss": 2.2717, + "step": 3148 + }, + { + "epoch": 0.16893776824034334, + "grad_norm": 0.35546875, + "learning_rate": 4.999811904489464e-06, + "loss": 2.3496, + "step": 3149 + }, + { + "epoch": 0.16899141630901288, + "grad_norm": 0.45703125, + "learning_rate": 4.999810837296493e-06, + "loss": 2.505, + "step": 3150 + }, + { + "epoch": 0.1690450643776824, + "grad_norm": 0.341796875, + "learning_rate": 4.999809767084741e-06, + "loss": 2.131, + "step": 3151 + }, + { + "epoch": 0.16909871244635194, + "grad_norm": 0.32421875, + "learning_rate": 4.99980869385421e-06, + "loss": 2.3331, + "step": 3152 + }, + { + "epoch": 0.16915236051502147, + "grad_norm": 0.41796875, + "learning_rate": 4.9998076176049e-06, + "loss": 2.2497, + "step": 3153 + }, + { + "epoch": 0.16920600858369098, + "grad_norm": 0.341796875, + "learning_rate": 4.999806538336814e-06, + "loss": 2.2078, + "step": 3154 + }, + { + "epoch": 0.1692596566523605, + "grad_norm": 0.451171875, + "learning_rate": 4.9998054560499515e-06, + "loss": 2.5175, + "step": 3155 + }, + { + "epoch": 0.16931330472103004, + "grad_norm": 0.37109375, + "learning_rate": 4.9998043707443145e-06, + "loss": 2.3045, + "step": 3156 + }, + { + "epoch": 0.16936695278969957, + "grad_norm": 0.4140625, + "learning_rate": 4.999803282419905e-06, + "loss": 2.1089, + "step": 3157 + }, + { + "epoch": 0.1694206008583691, + "grad_norm": 0.375, + "learning_rate": 4.9998021910767225e-06, + "loss": 2.3731, + "step": 3158 + }, + { + "epoch": 0.16947424892703863, + "grad_norm": 0.32421875, + "learning_rate": 4.99980109671477e-06, + "loss": 2.4786, + "step": 3159 + }, + { + "epoch": 0.16952789699570817, + "grad_norm": 0.337890625, + "learning_rate": 4.999799999334049e-06, + "loss": 2.2357, + "step": 3160 + }, + { + "epoch": 0.16958154506437767, + "grad_norm": 0.63671875, + "learning_rate": 4.999798898934559e-06, + "loss": 2.1435, + "step": 3161 + }, + { + "epoch": 0.1696351931330472, + "grad_norm": 0.3046875, + "learning_rate": 4.9997977955163036e-06, + "loss": 2.2298, + "step": 3162 + }, + { + "epoch": 0.16968884120171673, + "grad_norm": 0.2890625, + "learning_rate": 4.999796689079282e-06, + "loss": 1.968, + "step": 3163 + }, + { + "epoch": 0.16974248927038627, + "grad_norm": 0.34375, + "learning_rate": 4.999795579623498e-06, + "loss": 2.2578, + "step": 3164 + }, + { + "epoch": 0.1697961373390558, + "grad_norm": 0.314453125, + "learning_rate": 4.999794467148951e-06, + "loss": 2.2324, + "step": 3165 + }, + { + "epoch": 0.16984978540772533, + "grad_norm": 0.427734375, + "learning_rate": 4.999793351655642e-06, + "loss": 2.4872, + "step": 3166 + }, + { + "epoch": 0.16990343347639486, + "grad_norm": 0.34765625, + "learning_rate": 4.999792233143573e-06, + "loss": 2.3728, + "step": 3167 + }, + { + "epoch": 0.16995708154506436, + "grad_norm": 0.33984375, + "learning_rate": 4.999791111612747e-06, + "loss": 2.0667, + "step": 3168 + }, + { + "epoch": 0.1700107296137339, + "grad_norm": 0.41796875, + "learning_rate": 4.999789987063163e-06, + "loss": 2.278, + "step": 3169 + }, + { + "epoch": 0.17006437768240343, + "grad_norm": 0.380859375, + "learning_rate": 4.9997888594948226e-06, + "loss": 2.3148, + "step": 3170 + }, + { + "epoch": 0.17011802575107296, + "grad_norm": 0.345703125, + "learning_rate": 4.999787728907729e-06, + "loss": 1.9347, + "step": 3171 + }, + { + "epoch": 0.1701716738197425, + "grad_norm": 0.28515625, + "learning_rate": 4.999786595301882e-06, + "loss": 2.236, + "step": 3172 + }, + { + "epoch": 0.17022532188841202, + "grad_norm": 0.361328125, + "learning_rate": 4.999785458677283e-06, + "loss": 2.3851, + "step": 3173 + }, + { + "epoch": 0.17027896995708156, + "grad_norm": 0.8125, + "learning_rate": 4.999784319033933e-06, + "loss": 2.6344, + "step": 3174 + }, + { + "epoch": 0.17033261802575106, + "grad_norm": 0.296875, + "learning_rate": 4.9997831763718355e-06, + "loss": 1.8797, + "step": 3175 + }, + { + "epoch": 0.1703862660944206, + "grad_norm": 0.341796875, + "learning_rate": 4.99978203069099e-06, + "loss": 2.475, + "step": 3176 + }, + { + "epoch": 0.17043991416309012, + "grad_norm": 0.373046875, + "learning_rate": 4.999780881991398e-06, + "loss": 2.2721, + "step": 3177 + }, + { + "epoch": 0.17049356223175965, + "grad_norm": 0.314453125, + "learning_rate": 4.999779730273062e-06, + "loss": 2.251, + "step": 3178 + }, + { + "epoch": 0.1705472103004292, + "grad_norm": 0.328125, + "learning_rate": 4.999778575535982e-06, + "loss": 2.332, + "step": 3179 + }, + { + "epoch": 0.17060085836909872, + "grad_norm": 0.3125, + "learning_rate": 4.99977741778016e-06, + "loss": 2.1134, + "step": 3180 + }, + { + "epoch": 0.17065450643776825, + "grad_norm": 0.349609375, + "learning_rate": 4.999776257005599e-06, + "loss": 2.3073, + "step": 3181 + }, + { + "epoch": 0.17070815450643778, + "grad_norm": 0.353515625, + "learning_rate": 4.999775093212297e-06, + "loss": 2.2242, + "step": 3182 + }, + { + "epoch": 0.17076180257510729, + "grad_norm": 0.294921875, + "learning_rate": 4.999773926400257e-06, + "loss": 2.2581, + "step": 3183 + }, + { + "epoch": 0.17081545064377682, + "grad_norm": 0.357421875, + "learning_rate": 4.999772756569482e-06, + "loss": 2.3005, + "step": 3184 + }, + { + "epoch": 0.17086909871244635, + "grad_norm": 0.3984375, + "learning_rate": 4.999771583719971e-06, + "loss": 2.5681, + "step": 3185 + }, + { + "epoch": 0.17092274678111588, + "grad_norm": 2.6875, + "learning_rate": 4.999770407851727e-06, + "loss": 1.5555, + "step": 3186 + }, + { + "epoch": 0.1709763948497854, + "grad_norm": 0.298828125, + "learning_rate": 4.999769228964751e-06, + "loss": 2.3159, + "step": 3187 + }, + { + "epoch": 0.17103004291845494, + "grad_norm": 0.328125, + "learning_rate": 4.999768047059044e-06, + "loss": 2.5875, + "step": 3188 + }, + { + "epoch": 0.17108369098712448, + "grad_norm": 0.33984375, + "learning_rate": 4.999766862134607e-06, + "loss": 1.9357, + "step": 3189 + }, + { + "epoch": 0.17113733905579398, + "grad_norm": 0.380859375, + "learning_rate": 4.999765674191444e-06, + "loss": 2.3963, + "step": 3190 + }, + { + "epoch": 0.1711909871244635, + "grad_norm": 0.3046875, + "learning_rate": 4.999764483229553e-06, + "loss": 2.3, + "step": 3191 + }, + { + "epoch": 0.17124463519313304, + "grad_norm": 0.306640625, + "learning_rate": 4.999763289248938e-06, + "loss": 2.2408, + "step": 3192 + }, + { + "epoch": 0.17129828326180258, + "grad_norm": 0.39453125, + "learning_rate": 4.999762092249598e-06, + "loss": 2.1983, + "step": 3193 + }, + { + "epoch": 0.1713519313304721, + "grad_norm": 0.6953125, + "learning_rate": 4.999760892231537e-06, + "loss": 2.2777, + "step": 3194 + }, + { + "epoch": 0.17140557939914164, + "grad_norm": 0.28515625, + "learning_rate": 4.9997596891947555e-06, + "loss": 2.2594, + "step": 3195 + }, + { + "epoch": 0.17145922746781117, + "grad_norm": 0.8203125, + "learning_rate": 4.999758483139255e-06, + "loss": 2.3641, + "step": 3196 + }, + { + "epoch": 0.17151287553648067, + "grad_norm": 0.380859375, + "learning_rate": 4.999757274065037e-06, + "loss": 2.1683, + "step": 3197 + }, + { + "epoch": 0.1715665236051502, + "grad_norm": 0.376953125, + "learning_rate": 4.999756061972102e-06, + "loss": 2.5835, + "step": 3198 + }, + { + "epoch": 0.17162017167381974, + "grad_norm": 0.42578125, + "learning_rate": 4.999754846860452e-06, + "loss": 2.3336, + "step": 3199 + }, + { + "epoch": 0.17167381974248927, + "grad_norm": 0.380859375, + "learning_rate": 4.99975362873009e-06, + "loss": 2.1053, + "step": 3200 + }, + { + "epoch": 0.1717274678111588, + "grad_norm": 0.37109375, + "learning_rate": 4.9997524075810144e-06, + "loss": 2.0307, + "step": 3201 + }, + { + "epoch": 0.17178111587982833, + "grad_norm": 0.353515625, + "learning_rate": 4.99975118341323e-06, + "loss": 2.453, + "step": 3202 + }, + { + "epoch": 0.17183476394849787, + "grad_norm": 0.369140625, + "learning_rate": 4.999749956226736e-06, + "loss": 2.2887, + "step": 3203 + }, + { + "epoch": 0.17188841201716737, + "grad_norm": 0.390625, + "learning_rate": 4.999748726021535e-06, + "loss": 2.2973, + "step": 3204 + }, + { + "epoch": 0.1719420600858369, + "grad_norm": 0.32421875, + "learning_rate": 4.999747492797627e-06, + "loss": 2.4164, + "step": 3205 + }, + { + "epoch": 0.17199570815450643, + "grad_norm": 0.55078125, + "learning_rate": 4.999746256555016e-06, + "loss": 2.1882, + "step": 3206 + }, + { + "epoch": 0.17204935622317596, + "grad_norm": 0.33203125, + "learning_rate": 4.999745017293701e-06, + "loss": 2.5538, + "step": 3207 + }, + { + "epoch": 0.1721030042918455, + "grad_norm": 0.9296875, + "learning_rate": 4.9997437750136845e-06, + "loss": 2.3498, + "step": 3208 + }, + { + "epoch": 0.17215665236051503, + "grad_norm": 0.357421875, + "learning_rate": 4.999742529714968e-06, + "loss": 2.3734, + "step": 3209 + }, + { + "epoch": 0.17221030042918456, + "grad_norm": 0.337890625, + "learning_rate": 4.999741281397554e-06, + "loss": 2.1609, + "step": 3210 + }, + { + "epoch": 0.17226394849785406, + "grad_norm": 0.40234375, + "learning_rate": 4.9997400300614416e-06, + "loss": 2.2774, + "step": 3211 + }, + { + "epoch": 0.1723175965665236, + "grad_norm": 1.25, + "learning_rate": 4.999738775706635e-06, + "loss": 2.423, + "step": 3212 + }, + { + "epoch": 0.17237124463519313, + "grad_norm": 0.310546875, + "learning_rate": 4.999737518333134e-06, + "loss": 2.2135, + "step": 3213 + }, + { + "epoch": 0.17242489270386266, + "grad_norm": 0.373046875, + "learning_rate": 4.999736257940941e-06, + "loss": 2.2934, + "step": 3214 + }, + { + "epoch": 0.1724785407725322, + "grad_norm": 0.34765625, + "learning_rate": 4.999734994530057e-06, + "loss": 2.3543, + "step": 3215 + }, + { + "epoch": 0.17253218884120172, + "grad_norm": 0.34375, + "learning_rate": 4.999733728100483e-06, + "loss": 2.0632, + "step": 3216 + }, + { + "epoch": 0.17258583690987125, + "grad_norm": 0.318359375, + "learning_rate": 4.9997324586522225e-06, + "loss": 2.4013, + "step": 3217 + }, + { + "epoch": 0.17263948497854079, + "grad_norm": 0.330078125, + "learning_rate": 4.999731186185275e-06, + "loss": 2.4675, + "step": 3218 + }, + { + "epoch": 0.1726931330472103, + "grad_norm": 0.34375, + "learning_rate": 4.9997299106996425e-06, + "loss": 2.2307, + "step": 3219 + }, + { + "epoch": 0.17274678111587982, + "grad_norm": 0.41015625, + "learning_rate": 4.999728632195327e-06, + "loss": 2.2844, + "step": 3220 + }, + { + "epoch": 0.17280042918454935, + "grad_norm": 0.53515625, + "learning_rate": 4.99972735067233e-06, + "loss": 2.5044, + "step": 3221 + }, + { + "epoch": 0.17285407725321889, + "grad_norm": 0.28125, + "learning_rate": 4.999726066130652e-06, + "loss": 1.9436, + "step": 3222 + }, + { + "epoch": 0.17290772532188842, + "grad_norm": 0.3125, + "learning_rate": 4.999724778570296e-06, + "loss": 2.2543, + "step": 3223 + }, + { + "epoch": 0.17296137339055795, + "grad_norm": 0.3671875, + "learning_rate": 4.999723487991264e-06, + "loss": 2.2868, + "step": 3224 + }, + { + "epoch": 0.17301502145922748, + "grad_norm": 0.33984375, + "learning_rate": 4.999722194393556e-06, + "loss": 2.2675, + "step": 3225 + }, + { + "epoch": 0.17306866952789698, + "grad_norm": 0.400390625, + "learning_rate": 4.999720897777173e-06, + "loss": 2.4373, + "step": 3226 + }, + { + "epoch": 0.17312231759656652, + "grad_norm": 0.37890625, + "learning_rate": 4.9997195981421185e-06, + "loss": 2.1368, + "step": 3227 + }, + { + "epoch": 0.17317596566523605, + "grad_norm": 0.43359375, + "learning_rate": 4.999718295488393e-06, + "loss": 2.5211, + "step": 3228 + }, + { + "epoch": 0.17322961373390558, + "grad_norm": 0.435546875, + "learning_rate": 4.999716989815999e-06, + "loss": 1.2818, + "step": 3229 + }, + { + "epoch": 0.1732832618025751, + "grad_norm": 0.3125, + "learning_rate": 4.999715681124937e-06, + "loss": 2.287, + "step": 3230 + }, + { + "epoch": 0.17333690987124464, + "grad_norm": 0.283203125, + "learning_rate": 4.9997143694152085e-06, + "loss": 2.3857, + "step": 3231 + }, + { + "epoch": 0.17339055793991417, + "grad_norm": 0.337890625, + "learning_rate": 4.999713054686816e-06, + "loss": 2.4663, + "step": 3232 + }, + { + "epoch": 0.17344420600858368, + "grad_norm": 0.3515625, + "learning_rate": 4.999711736939761e-06, + "loss": 2.3371, + "step": 3233 + }, + { + "epoch": 0.1734978540772532, + "grad_norm": 0.38671875, + "learning_rate": 4.999710416174044e-06, + "loss": 2.1901, + "step": 3234 + }, + { + "epoch": 0.17355150214592274, + "grad_norm": 0.3125, + "learning_rate": 4.9997090923896675e-06, + "loss": 2.308, + "step": 3235 + }, + { + "epoch": 0.17360515021459227, + "grad_norm": 0.369140625, + "learning_rate": 4.999707765586634e-06, + "loss": 2.4501, + "step": 3236 + }, + { + "epoch": 0.1736587982832618, + "grad_norm": 0.421875, + "learning_rate": 4.999706435764943e-06, + "loss": 2.2521, + "step": 3237 + }, + { + "epoch": 0.17371244635193134, + "grad_norm": 0.80078125, + "learning_rate": 4.999705102924597e-06, + "loss": 2.1943, + "step": 3238 + }, + { + "epoch": 0.17376609442060087, + "grad_norm": 0.5625, + "learning_rate": 4.999703767065598e-06, + "loss": 2.4079, + "step": 3239 + }, + { + "epoch": 0.17381974248927037, + "grad_norm": 0.3515625, + "learning_rate": 4.999702428187947e-06, + "loss": 1.9698, + "step": 3240 + }, + { + "epoch": 0.1738733905579399, + "grad_norm": 0.3515625, + "learning_rate": 4.999701086291646e-06, + "loss": 1.9105, + "step": 3241 + }, + { + "epoch": 0.17392703862660944, + "grad_norm": 0.34375, + "learning_rate": 4.999699741376697e-06, + "loss": 1.9771, + "step": 3242 + }, + { + "epoch": 0.17398068669527897, + "grad_norm": 0.41796875, + "learning_rate": 4.999698393443102e-06, + "loss": 2.4524, + "step": 3243 + }, + { + "epoch": 0.1740343347639485, + "grad_norm": 0.322265625, + "learning_rate": 4.9996970424908605e-06, + "loss": 2.4005, + "step": 3244 + }, + { + "epoch": 0.17408798283261803, + "grad_norm": 0.294921875, + "learning_rate": 4.999695688519977e-06, + "loss": 2.1979, + "step": 3245 + }, + { + "epoch": 0.17414163090128756, + "grad_norm": 0.34765625, + "learning_rate": 4.999694331530451e-06, + "loss": 2.2744, + "step": 3246 + }, + { + "epoch": 0.17419527896995707, + "grad_norm": 0.400390625, + "learning_rate": 4.999692971522283e-06, + "loss": 2.3643, + "step": 3247 + }, + { + "epoch": 0.1742489270386266, + "grad_norm": 0.3203125, + "learning_rate": 4.9996916084954785e-06, + "loss": 2.5828, + "step": 3248 + }, + { + "epoch": 0.17430257510729613, + "grad_norm": 0.31640625, + "learning_rate": 4.999690242450036e-06, + "loss": 2.4592, + "step": 3249 + }, + { + "epoch": 0.17435622317596566, + "grad_norm": 0.306640625, + "learning_rate": 4.9996888733859585e-06, + "loss": 2.3639, + "step": 3250 + }, + { + "epoch": 0.1744098712446352, + "grad_norm": 0.384765625, + "learning_rate": 4.999687501303247e-06, + "loss": 2.5285, + "step": 3251 + }, + { + "epoch": 0.17446351931330473, + "grad_norm": 0.357421875, + "learning_rate": 4.999686126201904e-06, + "loss": 2.3332, + "step": 3252 + }, + { + "epoch": 0.17451716738197426, + "grad_norm": 1.15625, + "learning_rate": 4.999684748081931e-06, + "loss": 1.1773, + "step": 3253 + }, + { + "epoch": 0.1745708154506438, + "grad_norm": 0.3046875, + "learning_rate": 4.9996833669433285e-06, + "loss": 2.3244, + "step": 3254 + }, + { + "epoch": 0.1746244635193133, + "grad_norm": 0.34375, + "learning_rate": 4.9996819827861e-06, + "loss": 2.5577, + "step": 3255 + }, + { + "epoch": 0.17467811158798283, + "grad_norm": 0.3671875, + "learning_rate": 4.9996805956102455e-06, + "loss": 2.164, + "step": 3256 + }, + { + "epoch": 0.17473175965665236, + "grad_norm": 0.447265625, + "learning_rate": 4.999679205415767e-06, + "loss": 2.2861, + "step": 3257 + }, + { + "epoch": 0.1747854077253219, + "grad_norm": 0.326171875, + "learning_rate": 4.999677812202667e-06, + "loss": 2.2747, + "step": 3258 + }, + { + "epoch": 0.17483905579399142, + "grad_norm": 0.36328125, + "learning_rate": 4.9996764159709465e-06, + "loss": 2.3115, + "step": 3259 + }, + { + "epoch": 0.17489270386266095, + "grad_norm": 0.3125, + "learning_rate": 4.999675016720608e-06, + "loss": 2.3404, + "step": 3260 + }, + { + "epoch": 0.17494635193133048, + "grad_norm": 0.333984375, + "learning_rate": 4.999673614451652e-06, + "loss": 2.2792, + "step": 3261 + }, + { + "epoch": 0.175, + "grad_norm": 0.37109375, + "learning_rate": 4.9996722091640805e-06, + "loss": 2.2425, + "step": 3262 + }, + { + "epoch": 0.17505364806866952, + "grad_norm": 0.326171875, + "learning_rate": 4.999670800857897e-06, + "loss": 2.1457, + "step": 3263 + }, + { + "epoch": 0.17510729613733905, + "grad_norm": 0.318359375, + "learning_rate": 4.999669389533099e-06, + "loss": 2.3595, + "step": 3264 + }, + { + "epoch": 0.17516094420600858, + "grad_norm": 0.38671875, + "learning_rate": 4.999667975189693e-06, + "loss": 2.4525, + "step": 3265 + }, + { + "epoch": 0.17521459227467812, + "grad_norm": 0.28515625, + "learning_rate": 4.9996665578276785e-06, + "loss": 1.885, + "step": 3266 + }, + { + "epoch": 0.17526824034334765, + "grad_norm": 0.400390625, + "learning_rate": 4.999665137447056e-06, + "loss": 2.3516, + "step": 3267 + }, + { + "epoch": 0.17532188841201718, + "grad_norm": 0.3046875, + "learning_rate": 4.99966371404783e-06, + "loss": 2.2103, + "step": 3268 + }, + { + "epoch": 0.17537553648068668, + "grad_norm": 0.333984375, + "learning_rate": 4.999662287629999e-06, + "loss": 2.3393, + "step": 3269 + }, + { + "epoch": 0.17542918454935622, + "grad_norm": 0.333984375, + "learning_rate": 4.999660858193568e-06, + "loss": 2.2829, + "step": 3270 + }, + { + "epoch": 0.17548283261802575, + "grad_norm": 0.365234375, + "learning_rate": 4.999659425738537e-06, + "loss": 2.4836, + "step": 3271 + }, + { + "epoch": 0.17553648068669528, + "grad_norm": 0.298828125, + "learning_rate": 4.999657990264908e-06, + "loss": 2.3352, + "step": 3272 + }, + { + "epoch": 0.1755901287553648, + "grad_norm": 0.625, + "learning_rate": 4.9996565517726815e-06, + "loss": 2.1433, + "step": 3273 + }, + { + "epoch": 0.17564377682403434, + "grad_norm": 0.322265625, + "learning_rate": 4.999655110261862e-06, + "loss": 2.2799, + "step": 3274 + }, + { + "epoch": 0.17569742489270387, + "grad_norm": 0.34375, + "learning_rate": 4.999653665732448e-06, + "loss": 1.9374, + "step": 3275 + }, + { + "epoch": 0.17575107296137338, + "grad_norm": 0.275390625, + "learning_rate": 4.999652218184444e-06, + "loss": 2.4497, + "step": 3276 + }, + { + "epoch": 0.1758047210300429, + "grad_norm": 0.625, + "learning_rate": 4.99965076761785e-06, + "loss": 2.3395, + "step": 3277 + }, + { + "epoch": 0.17585836909871244, + "grad_norm": 0.349609375, + "learning_rate": 4.999649314032669e-06, + "loss": 2.1741, + "step": 3278 + }, + { + "epoch": 0.17591201716738197, + "grad_norm": 0.484375, + "learning_rate": 4.999647857428901e-06, + "loss": 2.2208, + "step": 3279 + }, + { + "epoch": 0.1759656652360515, + "grad_norm": 0.32421875, + "learning_rate": 4.99964639780655e-06, + "loss": 2.2799, + "step": 3280 + }, + { + "epoch": 0.17601931330472104, + "grad_norm": 0.291015625, + "learning_rate": 4.9996449351656165e-06, + "loss": 1.9881, + "step": 3281 + }, + { + "epoch": 0.17607296137339057, + "grad_norm": 0.455078125, + "learning_rate": 4.999643469506102e-06, + "loss": 2.3862, + "step": 3282 + }, + { + "epoch": 0.17612660944206007, + "grad_norm": 0.3203125, + "learning_rate": 4.999642000828008e-06, + "loss": 2.1831, + "step": 3283 + }, + { + "epoch": 0.1761802575107296, + "grad_norm": 0.333984375, + "learning_rate": 4.999640529131338e-06, + "loss": 2.6248, + "step": 3284 + }, + { + "epoch": 0.17623390557939914, + "grad_norm": 0.447265625, + "learning_rate": 4.999639054416093e-06, + "loss": 2.4783, + "step": 3285 + }, + { + "epoch": 0.17628755364806867, + "grad_norm": 0.345703125, + "learning_rate": 4.9996375766822735e-06, + "loss": 2.3509, + "step": 3286 + }, + { + "epoch": 0.1763412017167382, + "grad_norm": 0.408203125, + "learning_rate": 4.999636095929883e-06, + "loss": 2.5601, + "step": 3287 + }, + { + "epoch": 0.17639484978540773, + "grad_norm": 0.326171875, + "learning_rate": 4.999634612158922e-06, + "loss": 2.369, + "step": 3288 + }, + { + "epoch": 0.17644849785407726, + "grad_norm": 0.34765625, + "learning_rate": 4.999633125369394e-06, + "loss": 2.3526, + "step": 3289 + }, + { + "epoch": 0.1765021459227468, + "grad_norm": 0.42578125, + "learning_rate": 4.999631635561298e-06, + "loss": 2.2429, + "step": 3290 + }, + { + "epoch": 0.1765557939914163, + "grad_norm": 0.314453125, + "learning_rate": 4.999630142734639e-06, + "loss": 2.2739, + "step": 3291 + }, + { + "epoch": 0.17660944206008583, + "grad_norm": 0.3125, + "learning_rate": 4.999628646889416e-06, + "loss": 1.9876, + "step": 3292 + }, + { + "epoch": 0.17666309012875536, + "grad_norm": 0.359375, + "learning_rate": 4.999627148025633e-06, + "loss": 2.634, + "step": 3293 + }, + { + "epoch": 0.1767167381974249, + "grad_norm": 0.326171875, + "learning_rate": 4.999625646143291e-06, + "loss": 2.45, + "step": 3294 + }, + { + "epoch": 0.17677038626609443, + "grad_norm": 0.28515625, + "learning_rate": 4.999624141242391e-06, + "loss": 2.188, + "step": 3295 + }, + { + "epoch": 0.17682403433476396, + "grad_norm": 0.3828125, + "learning_rate": 4.999622633322936e-06, + "loss": 2.1063, + "step": 3296 + }, + { + "epoch": 0.1768776824034335, + "grad_norm": 0.51171875, + "learning_rate": 4.999621122384927e-06, + "loss": 1.9712, + "step": 3297 + }, + { + "epoch": 0.176931330472103, + "grad_norm": 0.44140625, + "learning_rate": 4.999619608428367e-06, + "loss": 2.2042, + "step": 3298 + }, + { + "epoch": 0.17698497854077253, + "grad_norm": 5.28125, + "learning_rate": 4.999618091453256e-06, + "loss": 2.1749, + "step": 3299 + }, + { + "epoch": 0.17703862660944206, + "grad_norm": 0.345703125, + "learning_rate": 4.999616571459597e-06, + "loss": 2.142, + "step": 3300 + }, + { + "epoch": 0.1770922746781116, + "grad_norm": 0.32421875, + "learning_rate": 4.999615048447392e-06, + "loss": 2.2357, + "step": 3301 + }, + { + "epoch": 0.17714592274678112, + "grad_norm": 0.3125, + "learning_rate": 4.999613522416642e-06, + "loss": 2.3569, + "step": 3302 + }, + { + "epoch": 0.17719957081545065, + "grad_norm": 1.2421875, + "learning_rate": 4.999611993367351e-06, + "loss": 2.3319, + "step": 3303 + }, + { + "epoch": 0.17725321888412018, + "grad_norm": 0.376953125, + "learning_rate": 4.999610461299517e-06, + "loss": 2.3566, + "step": 3304 + }, + { + "epoch": 0.1773068669527897, + "grad_norm": 0.37109375, + "learning_rate": 4.999608926213145e-06, + "loss": 2.4683, + "step": 3305 + }, + { + "epoch": 0.17736051502145922, + "grad_norm": 0.326171875, + "learning_rate": 4.999607388108236e-06, + "loss": 1.5518, + "step": 3306 + }, + { + "epoch": 0.17741416309012875, + "grad_norm": 0.34765625, + "learning_rate": 4.999605846984791e-06, + "loss": 2.3326, + "step": 3307 + }, + { + "epoch": 0.17746781115879828, + "grad_norm": 0.34765625, + "learning_rate": 4.999604302842813e-06, + "loss": 2.4353, + "step": 3308 + }, + { + "epoch": 0.17752145922746781, + "grad_norm": 0.7265625, + "learning_rate": 4.999602755682304e-06, + "loss": 2.0704, + "step": 3309 + }, + { + "epoch": 0.17757510729613735, + "grad_norm": 0.390625, + "learning_rate": 4.999601205503265e-06, + "loss": 2.64, + "step": 3310 + }, + { + "epoch": 0.17762875536480688, + "grad_norm": 0.375, + "learning_rate": 4.999599652305698e-06, + "loss": 2.433, + "step": 3311 + }, + { + "epoch": 0.17768240343347638, + "grad_norm": 0.408203125, + "learning_rate": 4.999598096089605e-06, + "loss": 2.0271, + "step": 3312 + }, + { + "epoch": 0.17773605150214591, + "grad_norm": 0.55078125, + "learning_rate": 4.999596536854989e-06, + "loss": 2.2744, + "step": 3313 + }, + { + "epoch": 0.17778969957081545, + "grad_norm": 0.330078125, + "learning_rate": 4.99959497460185e-06, + "loss": 2.4125, + "step": 3314 + }, + { + "epoch": 0.17784334763948498, + "grad_norm": 0.37109375, + "learning_rate": 4.999593409330191e-06, + "loss": 2.4591, + "step": 3315 + }, + { + "epoch": 0.1778969957081545, + "grad_norm": 0.353515625, + "learning_rate": 4.999591841040014e-06, + "loss": 2.4206, + "step": 3316 + }, + { + "epoch": 0.17795064377682404, + "grad_norm": 0.498046875, + "learning_rate": 4.9995902697313195e-06, + "loss": 2.4566, + "step": 3317 + }, + { + "epoch": 0.17800429184549357, + "grad_norm": 0.326171875, + "learning_rate": 4.999588695404111e-06, + "loss": 2.4033, + "step": 3318 + }, + { + "epoch": 0.17805793991416308, + "grad_norm": 0.34375, + "learning_rate": 4.999587118058389e-06, + "loss": 2.4679, + "step": 3319 + }, + { + "epoch": 0.1781115879828326, + "grad_norm": 0.314453125, + "learning_rate": 4.999585537694157e-06, + "loss": 2.208, + "step": 3320 + }, + { + "epoch": 0.17816523605150214, + "grad_norm": 0.388671875, + "learning_rate": 4.999583954311416e-06, + "loss": 1.8692, + "step": 3321 + }, + { + "epoch": 0.17821888412017167, + "grad_norm": 0.30859375, + "learning_rate": 4.999582367910169e-06, + "loss": 2.3098, + "step": 3322 + }, + { + "epoch": 0.1782725321888412, + "grad_norm": 0.33203125, + "learning_rate": 4.999580778490416e-06, + "loss": 2.6037, + "step": 3323 + }, + { + "epoch": 0.17832618025751074, + "grad_norm": 0.345703125, + "learning_rate": 4.99957918605216e-06, + "loss": 2.267, + "step": 3324 + }, + { + "epoch": 0.17837982832618027, + "grad_norm": 0.439453125, + "learning_rate": 4.999577590595402e-06, + "loss": 2.213, + "step": 3325 + }, + { + "epoch": 0.1784334763948498, + "grad_norm": 0.3828125, + "learning_rate": 4.999575992120146e-06, + "loss": 2.3533, + "step": 3326 + }, + { + "epoch": 0.1784871244635193, + "grad_norm": 0.390625, + "learning_rate": 4.999574390626393e-06, + "loss": 2.2569, + "step": 3327 + }, + { + "epoch": 0.17854077253218884, + "grad_norm": 0.396484375, + "learning_rate": 4.999572786114143e-06, + "loss": 2.3025, + "step": 3328 + }, + { + "epoch": 0.17859442060085837, + "grad_norm": 0.5, + "learning_rate": 4.999571178583401e-06, + "loss": 2.3264, + "step": 3329 + }, + { + "epoch": 0.1786480686695279, + "grad_norm": 0.33203125, + "learning_rate": 4.999569568034167e-06, + "loss": 2.2307, + "step": 3330 + }, + { + "epoch": 0.17870171673819743, + "grad_norm": 0.390625, + "learning_rate": 4.999567954466443e-06, + "loss": 2.4987, + "step": 3331 + }, + { + "epoch": 0.17875536480686696, + "grad_norm": 0.462890625, + "learning_rate": 4.999566337880232e-06, + "loss": 2.4274, + "step": 3332 + }, + { + "epoch": 0.1788090128755365, + "grad_norm": 0.4140625, + "learning_rate": 4.999564718275534e-06, + "loss": 2.5596, + "step": 3333 + }, + { + "epoch": 0.178862660944206, + "grad_norm": 0.421875, + "learning_rate": 4.9995630956523535e-06, + "loss": 2.3608, + "step": 3334 + }, + { + "epoch": 0.17891630901287553, + "grad_norm": 0.3828125, + "learning_rate": 4.999561470010691e-06, + "loss": 2.3548, + "step": 3335 + }, + { + "epoch": 0.17896995708154506, + "grad_norm": 0.494140625, + "learning_rate": 4.999559841350549e-06, + "loss": 2.2794, + "step": 3336 + }, + { + "epoch": 0.1790236051502146, + "grad_norm": 0.345703125, + "learning_rate": 4.999558209671928e-06, + "loss": 2.6123, + "step": 3337 + }, + { + "epoch": 0.17907725321888412, + "grad_norm": 0.56640625, + "learning_rate": 4.999556574974833e-06, + "loss": 2.3173, + "step": 3338 + }, + { + "epoch": 0.17913090128755366, + "grad_norm": 0.3515625, + "learning_rate": 4.999554937259263e-06, + "loss": 2.1194, + "step": 3339 + }, + { + "epoch": 0.1791845493562232, + "grad_norm": 0.361328125, + "learning_rate": 4.999553296525222e-06, + "loss": 2.3731, + "step": 3340 + }, + { + "epoch": 0.1792381974248927, + "grad_norm": 0.3828125, + "learning_rate": 4.999551652772709e-06, + "loss": 2.4907, + "step": 3341 + }, + { + "epoch": 0.17929184549356222, + "grad_norm": 0.380859375, + "learning_rate": 4.99955000600173e-06, + "loss": 2.4025, + "step": 3342 + }, + { + "epoch": 0.17934549356223176, + "grad_norm": 0.455078125, + "learning_rate": 4.999548356212284e-06, + "loss": 2.2577, + "step": 3343 + }, + { + "epoch": 0.1793991416309013, + "grad_norm": 0.37890625, + "learning_rate": 4.999546703404374e-06, + "loss": 2.3747, + "step": 3344 + }, + { + "epoch": 0.17945278969957082, + "grad_norm": 0.373046875, + "learning_rate": 4.999545047578002e-06, + "loss": 2.5943, + "step": 3345 + }, + { + "epoch": 0.17950643776824035, + "grad_norm": 0.361328125, + "learning_rate": 4.999543388733171e-06, + "loss": 2.3229, + "step": 3346 + }, + { + "epoch": 0.17956008583690988, + "grad_norm": 0.32421875, + "learning_rate": 4.999541726869882e-06, + "loss": 2.2618, + "step": 3347 + }, + { + "epoch": 0.1796137339055794, + "grad_norm": 0.392578125, + "learning_rate": 4.9995400619881365e-06, + "loss": 2.1445, + "step": 3348 + }, + { + "epoch": 0.17966738197424892, + "grad_norm": 0.466796875, + "learning_rate": 4.9995383940879376e-06, + "loss": 2.1275, + "step": 3349 + }, + { + "epoch": 0.17972103004291845, + "grad_norm": 0.333984375, + "learning_rate": 4.999536723169286e-06, + "loss": 2.3194, + "step": 3350 + }, + { + "epoch": 0.17977467811158798, + "grad_norm": 0.392578125, + "learning_rate": 4.999535049232185e-06, + "loss": 2.3622, + "step": 3351 + }, + { + "epoch": 0.17982832618025751, + "grad_norm": 0.31640625, + "learning_rate": 4.9995333722766355e-06, + "loss": 2.2805, + "step": 3352 + }, + { + "epoch": 0.17988197424892705, + "grad_norm": 0.37890625, + "learning_rate": 4.99953169230264e-06, + "loss": 2.4102, + "step": 3353 + }, + { + "epoch": 0.17993562231759658, + "grad_norm": 0.345703125, + "learning_rate": 4.999530009310201e-06, + "loss": 2.6786, + "step": 3354 + }, + { + "epoch": 0.17998927038626608, + "grad_norm": 0.63671875, + "learning_rate": 4.999528323299321e-06, + "loss": 2.0473, + "step": 3355 + }, + { + "epoch": 0.1800429184549356, + "grad_norm": 0.3046875, + "learning_rate": 4.99952663427e-06, + "loss": 2.1816, + "step": 3356 + }, + { + "epoch": 0.18009656652360514, + "grad_norm": 0.3828125, + "learning_rate": 4.999524942222242e-06, + "loss": 2.2232, + "step": 3357 + }, + { + "epoch": 0.18015021459227468, + "grad_norm": 0.3125, + "learning_rate": 4.999523247156048e-06, + "loss": 2.0789, + "step": 3358 + }, + { + "epoch": 0.1802038626609442, + "grad_norm": 1.0, + "learning_rate": 4.999521549071421e-06, + "loss": 2.4047, + "step": 3359 + }, + { + "epoch": 0.18025751072961374, + "grad_norm": 0.375, + "learning_rate": 4.999519847968362e-06, + "loss": 2.4037, + "step": 3360 + }, + { + "epoch": 0.18031115879828327, + "grad_norm": 0.34375, + "learning_rate": 4.999518143846873e-06, + "loss": 2.2792, + "step": 3361 + }, + { + "epoch": 0.18036480686695278, + "grad_norm": 0.34765625, + "learning_rate": 4.9995164367069575e-06, + "loss": 2.304, + "step": 3362 + }, + { + "epoch": 0.1804184549356223, + "grad_norm": 0.349609375, + "learning_rate": 4.999514726548615e-06, + "loss": 2.2679, + "step": 3363 + }, + { + "epoch": 0.18047210300429184, + "grad_norm": 0.357421875, + "learning_rate": 4.999513013371851e-06, + "loss": 2.4485, + "step": 3364 + }, + { + "epoch": 0.18052575107296137, + "grad_norm": 0.359375, + "learning_rate": 4.999511297176665e-06, + "loss": 2.1135, + "step": 3365 + }, + { + "epoch": 0.1805793991416309, + "grad_norm": 0.412109375, + "learning_rate": 4.99950957796306e-06, + "loss": 2.4306, + "step": 3366 + }, + { + "epoch": 0.18063304721030043, + "grad_norm": 0.322265625, + "learning_rate": 4.999507855731037e-06, + "loss": 2.2778, + "step": 3367 + }, + { + "epoch": 0.18068669527896997, + "grad_norm": 0.3203125, + "learning_rate": 4.999506130480599e-06, + "loss": 2.292, + "step": 3368 + }, + { + "epoch": 0.1807403433476395, + "grad_norm": 0.38671875, + "learning_rate": 4.999504402211749e-06, + "loss": 2.3065, + "step": 3369 + }, + { + "epoch": 0.180793991416309, + "grad_norm": 0.439453125, + "learning_rate": 4.999502670924487e-06, + "loss": 2.4243, + "step": 3370 + }, + { + "epoch": 0.18084763948497853, + "grad_norm": 0.44921875, + "learning_rate": 4.999500936618817e-06, + "loss": 2.2116, + "step": 3371 + }, + { + "epoch": 0.18090128755364807, + "grad_norm": 0.369140625, + "learning_rate": 4.999499199294741e-06, + "loss": 2.5735, + "step": 3372 + }, + { + "epoch": 0.1809549356223176, + "grad_norm": 0.40625, + "learning_rate": 4.999497458952259e-06, + "loss": 2.3608, + "step": 3373 + }, + { + "epoch": 0.18100858369098713, + "grad_norm": 0.5703125, + "learning_rate": 4.999495715591375e-06, + "loss": 2.2326, + "step": 3374 + }, + { + "epoch": 0.18106223175965666, + "grad_norm": 0.283203125, + "learning_rate": 4.999493969212091e-06, + "loss": 2.0893, + "step": 3375 + }, + { + "epoch": 0.1811158798283262, + "grad_norm": 1.3203125, + "learning_rate": 4.999492219814408e-06, + "loss": 2.482, + "step": 3376 + }, + { + "epoch": 0.1811695278969957, + "grad_norm": 0.328125, + "learning_rate": 4.9994904673983295e-06, + "loss": 2.4992, + "step": 3377 + }, + { + "epoch": 0.18122317596566523, + "grad_norm": 0.341796875, + "learning_rate": 4.999488711963857e-06, + "loss": 2.372, + "step": 3378 + }, + { + "epoch": 0.18127682403433476, + "grad_norm": 0.341796875, + "learning_rate": 4.999486953510991e-06, + "loss": 2.4141, + "step": 3379 + }, + { + "epoch": 0.1813304721030043, + "grad_norm": 0.447265625, + "learning_rate": 4.999485192039737e-06, + "loss": 2.26, + "step": 3380 + }, + { + "epoch": 0.18138412017167382, + "grad_norm": 0.2890625, + "learning_rate": 4.999483427550095e-06, + "loss": 2.238, + "step": 3381 + }, + { + "epoch": 0.18143776824034336, + "grad_norm": 0.5625, + "learning_rate": 4.999481660042067e-06, + "loss": 1.9186, + "step": 3382 + }, + { + "epoch": 0.1814914163090129, + "grad_norm": 0.451171875, + "learning_rate": 4.999479889515656e-06, + "loss": 2.2742, + "step": 3383 + }, + { + "epoch": 0.1815450643776824, + "grad_norm": 0.349609375, + "learning_rate": 4.999478115970863e-06, + "loss": 2.2801, + "step": 3384 + }, + { + "epoch": 0.18159871244635192, + "grad_norm": 0.359375, + "learning_rate": 4.999476339407691e-06, + "loss": 2.3801, + "step": 3385 + }, + { + "epoch": 0.18165236051502145, + "grad_norm": 0.3203125, + "learning_rate": 4.999474559826143e-06, + "loss": 2.1989, + "step": 3386 + }, + { + "epoch": 0.181706008583691, + "grad_norm": 0.421875, + "learning_rate": 4.999472777226219e-06, + "loss": 2.1665, + "step": 3387 + }, + { + "epoch": 0.18175965665236052, + "grad_norm": 0.7890625, + "learning_rate": 4.999470991607923e-06, + "loss": 2.3768, + "step": 3388 + }, + { + "epoch": 0.18181330472103005, + "grad_norm": 0.390625, + "learning_rate": 4.999469202971256e-06, + "loss": 2.4013, + "step": 3389 + }, + { + "epoch": 0.18186695278969958, + "grad_norm": 0.328125, + "learning_rate": 4.999467411316221e-06, + "loss": 2.3772, + "step": 3390 + }, + { + "epoch": 0.18192060085836909, + "grad_norm": 0.51171875, + "learning_rate": 4.999465616642819e-06, + "loss": 1.7803, + "step": 3391 + }, + { + "epoch": 0.18197424892703862, + "grad_norm": 0.431640625, + "learning_rate": 4.9994638189510545e-06, + "loss": 2.4562, + "step": 3392 + }, + { + "epoch": 0.18202789699570815, + "grad_norm": 0.349609375, + "learning_rate": 4.999462018240927e-06, + "loss": 2.2297, + "step": 3393 + }, + { + "epoch": 0.18208154506437768, + "grad_norm": 0.3359375, + "learning_rate": 4.999460214512441e-06, + "loss": 2.4821, + "step": 3394 + }, + { + "epoch": 0.1821351931330472, + "grad_norm": 0.369140625, + "learning_rate": 4.9994584077655955e-06, + "loss": 1.8082, + "step": 3395 + }, + { + "epoch": 0.18218884120171674, + "grad_norm": 0.302734375, + "learning_rate": 4.999456598000396e-06, + "loss": 2.3186, + "step": 3396 + }, + { + "epoch": 0.18224248927038628, + "grad_norm": 0.369140625, + "learning_rate": 4.999454785216843e-06, + "loss": 2.5002, + "step": 3397 + }, + { + "epoch": 0.18229613733905578, + "grad_norm": 0.365234375, + "learning_rate": 4.999452969414939e-06, + "loss": 2.3104, + "step": 3398 + }, + { + "epoch": 0.1823497854077253, + "grad_norm": 0.34765625, + "learning_rate": 4.999451150594686e-06, + "loss": 2.0113, + "step": 3399 + }, + { + "epoch": 0.18240343347639484, + "grad_norm": 0.30078125, + "learning_rate": 4.999449328756086e-06, + "loss": 2.3823, + "step": 3400 + }, + { + "epoch": 0.18245708154506438, + "grad_norm": 0.283203125, + "learning_rate": 4.999447503899143e-06, + "loss": 2.1941, + "step": 3401 + }, + { + "epoch": 0.1825107296137339, + "grad_norm": 0.357421875, + "learning_rate": 4.9994456760238576e-06, + "loss": 2.1923, + "step": 3402 + }, + { + "epoch": 0.18256437768240344, + "grad_norm": 0.34765625, + "learning_rate": 4.999443845130231e-06, + "loss": 2.4688, + "step": 3403 + }, + { + "epoch": 0.18261802575107297, + "grad_norm": 0.326171875, + "learning_rate": 4.999442011218268e-06, + "loss": 2.2298, + "step": 3404 + }, + { + "epoch": 0.1826716738197425, + "grad_norm": 0.349609375, + "learning_rate": 4.9994401742879685e-06, + "loss": 2.2121, + "step": 3405 + }, + { + "epoch": 0.182725321888412, + "grad_norm": 0.453125, + "learning_rate": 4.999438334339336e-06, + "loss": 2.335, + "step": 3406 + }, + { + "epoch": 0.18277896995708154, + "grad_norm": 0.345703125, + "learning_rate": 4.999436491372373e-06, + "loss": 2.3449, + "step": 3407 + }, + { + "epoch": 0.18283261802575107, + "grad_norm": 0.373046875, + "learning_rate": 4.99943464538708e-06, + "loss": 2.6075, + "step": 3408 + }, + { + "epoch": 0.1828862660944206, + "grad_norm": 0.51171875, + "learning_rate": 4.999432796383461e-06, + "loss": 2.445, + "step": 3409 + }, + { + "epoch": 0.18293991416309013, + "grad_norm": 0.296875, + "learning_rate": 4.9994309443615174e-06, + "loss": 2.3144, + "step": 3410 + }, + { + "epoch": 0.18299356223175967, + "grad_norm": 0.42578125, + "learning_rate": 4.999429089321252e-06, + "loss": 2.3242, + "step": 3411 + }, + { + "epoch": 0.1830472103004292, + "grad_norm": 0.345703125, + "learning_rate": 4.999427231262667e-06, + "loss": 2.3671, + "step": 3412 + }, + { + "epoch": 0.1831008583690987, + "grad_norm": 0.3359375, + "learning_rate": 4.999425370185763e-06, + "loss": 2.2541, + "step": 3413 + }, + { + "epoch": 0.18315450643776823, + "grad_norm": 0.3046875, + "learning_rate": 4.999423506090545e-06, + "loss": 2.2874, + "step": 3414 + }, + { + "epoch": 0.18320815450643776, + "grad_norm": 0.400390625, + "learning_rate": 4.9994216389770126e-06, + "loss": 2.1851, + "step": 3415 + }, + { + "epoch": 0.1832618025751073, + "grad_norm": 0.37109375, + "learning_rate": 4.99941976884517e-06, + "loss": 2.5008, + "step": 3416 + }, + { + "epoch": 0.18331545064377683, + "grad_norm": 0.318359375, + "learning_rate": 4.999417895695019e-06, + "loss": 2.379, + "step": 3417 + }, + { + "epoch": 0.18336909871244636, + "grad_norm": 0.3671875, + "learning_rate": 4.99941601952656e-06, + "loss": 2.3804, + "step": 3418 + }, + { + "epoch": 0.1834227467811159, + "grad_norm": 3.15625, + "learning_rate": 4.999414140339798e-06, + "loss": 2.4586, + "step": 3419 + }, + { + "epoch": 0.1834763948497854, + "grad_norm": 0.66796875, + "learning_rate": 4.999412258134734e-06, + "loss": 2.3534, + "step": 3420 + }, + { + "epoch": 0.18353004291845493, + "grad_norm": 0.376953125, + "learning_rate": 4.999410372911371e-06, + "loss": 2.2021, + "step": 3421 + }, + { + "epoch": 0.18358369098712446, + "grad_norm": 0.361328125, + "learning_rate": 4.99940848466971e-06, + "loss": 2.3689, + "step": 3422 + }, + { + "epoch": 0.183637339055794, + "grad_norm": 0.375, + "learning_rate": 4.9994065934097535e-06, + "loss": 2.2204, + "step": 3423 + }, + { + "epoch": 0.18369098712446352, + "grad_norm": 0.341796875, + "learning_rate": 4.9994046991315046e-06, + "loss": 2.3684, + "step": 3424 + }, + { + "epoch": 0.18374463519313305, + "grad_norm": 0.3125, + "learning_rate": 4.999402801834966e-06, + "loss": 2.2298, + "step": 3425 + }, + { + "epoch": 0.1837982832618026, + "grad_norm": 0.4296875, + "learning_rate": 4.999400901520138e-06, + "loss": 2.257, + "step": 3426 + }, + { + "epoch": 0.1838519313304721, + "grad_norm": 0.34375, + "learning_rate": 4.999398998187025e-06, + "loss": 2.3133, + "step": 3427 + }, + { + "epoch": 0.18390557939914162, + "grad_norm": 0.703125, + "learning_rate": 4.999397091835628e-06, + "loss": 2.4289, + "step": 3428 + }, + { + "epoch": 0.18395922746781115, + "grad_norm": 0.31640625, + "learning_rate": 4.999395182465951e-06, + "loss": 2.188, + "step": 3429 + }, + { + "epoch": 0.18401287553648069, + "grad_norm": 0.322265625, + "learning_rate": 4.9993932700779926e-06, + "loss": 2.3185, + "step": 3430 + }, + { + "epoch": 0.18406652360515022, + "grad_norm": 0.4921875, + "learning_rate": 4.999391354671759e-06, + "loss": 2.3274, + "step": 3431 + }, + { + "epoch": 0.18412017167381975, + "grad_norm": 0.4296875, + "learning_rate": 4.999389436247251e-06, + "loss": 2.2183, + "step": 3432 + }, + { + "epoch": 0.18417381974248928, + "grad_norm": 0.31640625, + "learning_rate": 4.9993875148044706e-06, + "loss": 2.2863, + "step": 3433 + }, + { + "epoch": 0.18422746781115878, + "grad_norm": 0.310546875, + "learning_rate": 4.999385590343421e-06, + "loss": 1.8773, + "step": 3434 + }, + { + "epoch": 0.18428111587982832, + "grad_norm": 0.3828125, + "learning_rate": 4.999383662864103e-06, + "loss": 2.3493, + "step": 3435 + }, + { + "epoch": 0.18433476394849785, + "grad_norm": 0.96484375, + "learning_rate": 4.999381732366521e-06, + "loss": 2.2495, + "step": 3436 + }, + { + "epoch": 0.18438841201716738, + "grad_norm": 0.34765625, + "learning_rate": 4.999379798850676e-06, + "loss": 2.3449, + "step": 3437 + }, + { + "epoch": 0.1844420600858369, + "grad_norm": 0.80859375, + "learning_rate": 4.99937786231657e-06, + "loss": 2.3799, + "step": 3438 + }, + { + "epoch": 0.18449570815450644, + "grad_norm": 0.3359375, + "learning_rate": 4.9993759227642055e-06, + "loss": 2.461, + "step": 3439 + }, + { + "epoch": 0.18454935622317598, + "grad_norm": 0.44140625, + "learning_rate": 4.999373980193587e-06, + "loss": 2.6467, + "step": 3440 + }, + { + "epoch": 0.1846030042918455, + "grad_norm": 0.314453125, + "learning_rate": 4.999372034604714e-06, + "loss": 2.3603, + "step": 3441 + }, + { + "epoch": 0.184656652360515, + "grad_norm": 0.341796875, + "learning_rate": 4.99937008599759e-06, + "loss": 2.2857, + "step": 3442 + }, + { + "epoch": 0.18471030042918454, + "grad_norm": 0.43359375, + "learning_rate": 4.999368134372217e-06, + "loss": 2.4246, + "step": 3443 + }, + { + "epoch": 0.18476394849785407, + "grad_norm": 0.302734375, + "learning_rate": 4.9993661797285984e-06, + "loss": 2.2866, + "step": 3444 + }, + { + "epoch": 0.1848175965665236, + "grad_norm": 0.314453125, + "learning_rate": 4.999364222066736e-06, + "loss": 2.0998, + "step": 3445 + }, + { + "epoch": 0.18487124463519314, + "grad_norm": 0.369140625, + "learning_rate": 4.999362261386631e-06, + "loss": 2.3095, + "step": 3446 + }, + { + "epoch": 0.18492489270386267, + "grad_norm": 0.359375, + "learning_rate": 4.999360297688287e-06, + "loss": 2.2902, + "step": 3447 + }, + { + "epoch": 0.1849785407725322, + "grad_norm": 0.388671875, + "learning_rate": 4.999358330971707e-06, + "loss": 2.2763, + "step": 3448 + }, + { + "epoch": 0.1850321888412017, + "grad_norm": 0.34375, + "learning_rate": 4.999356361236893e-06, + "loss": 2.2048, + "step": 3449 + }, + { + "epoch": 0.18508583690987124, + "grad_norm": 0.375, + "learning_rate": 4.999354388483845e-06, + "loss": 1.6719, + "step": 3450 + }, + { + "epoch": 0.18513948497854077, + "grad_norm": 0.287109375, + "learning_rate": 4.999352412712568e-06, + "loss": 2.1735, + "step": 3451 + }, + { + "epoch": 0.1851931330472103, + "grad_norm": 0.3515625, + "learning_rate": 4.9993504339230635e-06, + "loss": 2.3448, + "step": 3452 + }, + { + "epoch": 0.18524678111587983, + "grad_norm": 0.404296875, + "learning_rate": 4.999348452115334e-06, + "loss": 1.4639, + "step": 3453 + }, + { + "epoch": 0.18530042918454936, + "grad_norm": 0.365234375, + "learning_rate": 4.9993464672893836e-06, + "loss": 2.2797, + "step": 3454 + }, + { + "epoch": 0.1853540772532189, + "grad_norm": 0.5, + "learning_rate": 4.9993444794452105e-06, + "loss": 2.0837, + "step": 3455 + }, + { + "epoch": 0.1854077253218884, + "grad_norm": 0.3515625, + "learning_rate": 4.999342488582821e-06, + "loss": 2.3752, + "step": 3456 + }, + { + "epoch": 0.18546137339055793, + "grad_norm": 0.33203125, + "learning_rate": 4.999340494702216e-06, + "loss": 2.2828, + "step": 3457 + }, + { + "epoch": 0.18551502145922746, + "grad_norm": 0.3359375, + "learning_rate": 4.999338497803399e-06, + "loss": 2.5626, + "step": 3458 + }, + { + "epoch": 0.185568669527897, + "grad_norm": 0.388671875, + "learning_rate": 4.99933649788637e-06, + "loss": 2.6549, + "step": 3459 + }, + { + "epoch": 0.18562231759656653, + "grad_norm": 0.408203125, + "learning_rate": 4.999334494951134e-06, + "loss": 2.5398, + "step": 3460 + }, + { + "epoch": 0.18567596566523606, + "grad_norm": 0.28125, + "learning_rate": 4.999332488997691e-06, + "loss": 2.1847, + "step": 3461 + }, + { + "epoch": 0.1857296137339056, + "grad_norm": 0.3984375, + "learning_rate": 4.999330480026046e-06, + "loss": 2.416, + "step": 3462 + }, + { + "epoch": 0.1857832618025751, + "grad_norm": 0.26171875, + "learning_rate": 4.999328468036199e-06, + "loss": 1.9335, + "step": 3463 + }, + { + "epoch": 0.18583690987124463, + "grad_norm": 0.52734375, + "learning_rate": 4.9993264530281535e-06, + "loss": 2.2439, + "step": 3464 + }, + { + "epoch": 0.18589055793991416, + "grad_norm": 0.34375, + "learning_rate": 4.999324435001913e-06, + "loss": 2.2019, + "step": 3465 + }, + { + "epoch": 0.1859442060085837, + "grad_norm": 0.40234375, + "learning_rate": 4.999322413957479e-06, + "loss": 2.3207, + "step": 3466 + }, + { + "epoch": 0.18599785407725322, + "grad_norm": 0.255859375, + "learning_rate": 4.999320389894854e-06, + "loss": 2.0921, + "step": 3467 + }, + { + "epoch": 0.18605150214592275, + "grad_norm": 0.34765625, + "learning_rate": 4.999318362814039e-06, + "loss": 2.4678, + "step": 3468 + }, + { + "epoch": 0.18610515021459229, + "grad_norm": 0.33203125, + "learning_rate": 4.999316332715038e-06, + "loss": 2.2175, + "step": 3469 + }, + { + "epoch": 0.1861587982832618, + "grad_norm": 1.0234375, + "learning_rate": 4.999314299597855e-06, + "loss": 2.3363, + "step": 3470 + }, + { + "epoch": 0.18621244635193132, + "grad_norm": 0.330078125, + "learning_rate": 4.999312263462489e-06, + "loss": 2.3586, + "step": 3471 + }, + { + "epoch": 0.18626609442060085, + "grad_norm": 0.33203125, + "learning_rate": 4.999310224308945e-06, + "loss": 2.2399, + "step": 3472 + }, + { + "epoch": 0.18631974248927038, + "grad_norm": 0.455078125, + "learning_rate": 4.999308182137224e-06, + "loss": 1.4328, + "step": 3473 + }, + { + "epoch": 0.18637339055793992, + "grad_norm": 0.369140625, + "learning_rate": 4.999306136947329e-06, + "loss": 2.3049, + "step": 3474 + }, + { + "epoch": 0.18642703862660945, + "grad_norm": 0.35546875, + "learning_rate": 4.999304088739263e-06, + "loss": 2.3299, + "step": 3475 + }, + { + "epoch": 0.18648068669527898, + "grad_norm": 0.359375, + "learning_rate": 4.999302037513029e-06, + "loss": 2.5396, + "step": 3476 + }, + { + "epoch": 0.1865343347639485, + "grad_norm": 0.359375, + "learning_rate": 4.9992999832686265e-06, + "loss": 2.4046, + "step": 3477 + }, + { + "epoch": 0.18658798283261802, + "grad_norm": 0.31640625, + "learning_rate": 4.9992979260060606e-06, + "loss": 2.1985, + "step": 3478 + }, + { + "epoch": 0.18664163090128755, + "grad_norm": 0.361328125, + "learning_rate": 4.999295865725334e-06, + "loss": 2.1355, + "step": 3479 + }, + { + "epoch": 0.18669527896995708, + "grad_norm": 0.365234375, + "learning_rate": 4.999293802426448e-06, + "loss": 2.3734, + "step": 3480 + }, + { + "epoch": 0.1867489270386266, + "grad_norm": 0.314453125, + "learning_rate": 4.999291736109405e-06, + "loss": 2.2939, + "step": 3481 + }, + { + "epoch": 0.18680257510729614, + "grad_norm": 0.369140625, + "learning_rate": 4.999289666774208e-06, + "loss": 2.2704, + "step": 3482 + }, + { + "epoch": 0.18685622317596567, + "grad_norm": 0.291015625, + "learning_rate": 4.99928759442086e-06, + "loss": 2.2917, + "step": 3483 + }, + { + "epoch": 0.1869098712446352, + "grad_norm": 0.3359375, + "learning_rate": 4.999285519049362e-06, + "loss": 2.5451, + "step": 3484 + }, + { + "epoch": 0.1869635193133047, + "grad_norm": 0.5078125, + "learning_rate": 4.999283440659718e-06, + "loss": 2.3403, + "step": 3485 + }, + { + "epoch": 0.18701716738197424, + "grad_norm": 0.330078125, + "learning_rate": 4.999281359251929e-06, + "loss": 2.3792, + "step": 3486 + }, + { + "epoch": 0.18707081545064377, + "grad_norm": 0.4140625, + "learning_rate": 4.999279274826e-06, + "loss": 2.5092, + "step": 3487 + }, + { + "epoch": 0.1871244635193133, + "grad_norm": 0.365234375, + "learning_rate": 4.9992771873819315e-06, + "loss": 2.1716, + "step": 3488 + }, + { + "epoch": 0.18717811158798284, + "grad_norm": 0.33984375, + "learning_rate": 4.999275096919726e-06, + "loss": 2.3364, + "step": 3489 + }, + { + "epoch": 0.18723175965665237, + "grad_norm": 1.0625, + "learning_rate": 4.9992730034393875e-06, + "loss": 2.2777, + "step": 3490 + }, + { + "epoch": 0.1872854077253219, + "grad_norm": 0.44921875, + "learning_rate": 4.999270906940916e-06, + "loss": 2.2686, + "step": 3491 + }, + { + "epoch": 0.1873390557939914, + "grad_norm": 0.423828125, + "learning_rate": 4.999268807424317e-06, + "loss": 1.5979, + "step": 3492 + }, + { + "epoch": 0.18739270386266094, + "grad_norm": 0.62890625, + "learning_rate": 4.999266704889591e-06, + "loss": 2.549, + "step": 3493 + }, + { + "epoch": 0.18744635193133047, + "grad_norm": 0.326171875, + "learning_rate": 4.999264599336742e-06, + "loss": 2.498, + "step": 3494 + }, + { + "epoch": 0.1875, + "grad_norm": 0.375, + "learning_rate": 4.99926249076577e-06, + "loss": 2.3884, + "step": 3495 + }, + { + "epoch": 0.18755364806866953, + "grad_norm": 0.431640625, + "learning_rate": 4.999260379176681e-06, + "loss": 2.2608, + "step": 3496 + }, + { + "epoch": 0.18760729613733906, + "grad_norm": 0.392578125, + "learning_rate": 4.999258264569475e-06, + "loss": 2.4884, + "step": 3497 + }, + { + "epoch": 0.1876609442060086, + "grad_norm": 0.326171875, + "learning_rate": 4.999256146944155e-06, + "loss": 2.3062, + "step": 3498 + }, + { + "epoch": 0.1877145922746781, + "grad_norm": 0.361328125, + "learning_rate": 4.999254026300724e-06, + "loss": 2.2905, + "step": 3499 + }, + { + "epoch": 0.18776824034334763, + "grad_norm": 0.330078125, + "learning_rate": 4.999251902639185e-06, + "loss": 2.4094, + "step": 3500 + }, + { + "epoch": 0.18782188841201716, + "grad_norm": 0.326171875, + "learning_rate": 4.99924977595954e-06, + "loss": 2.2175, + "step": 3501 + }, + { + "epoch": 0.1878755364806867, + "grad_norm": 0.310546875, + "learning_rate": 4.999247646261791e-06, + "loss": 2.101, + "step": 3502 + }, + { + "epoch": 0.18792918454935623, + "grad_norm": 0.328125, + "learning_rate": 4.999245513545942e-06, + "loss": 2.4006, + "step": 3503 + }, + { + "epoch": 0.18798283261802576, + "grad_norm": 0.322265625, + "learning_rate": 4.999243377811994e-06, + "loss": 2.1634, + "step": 3504 + }, + { + "epoch": 0.1880364806866953, + "grad_norm": 0.390625, + "learning_rate": 4.999241239059951e-06, + "loss": 2.3065, + "step": 3505 + }, + { + "epoch": 0.1880901287553648, + "grad_norm": 0.322265625, + "learning_rate": 4.999239097289815e-06, + "loss": 2.4631, + "step": 3506 + }, + { + "epoch": 0.18814377682403433, + "grad_norm": 0.294921875, + "learning_rate": 4.999236952501588e-06, + "loss": 2.256, + "step": 3507 + }, + { + "epoch": 0.18819742489270386, + "grad_norm": 0.330078125, + "learning_rate": 4.999234804695273e-06, + "loss": 2.2992, + "step": 3508 + }, + { + "epoch": 0.1882510729613734, + "grad_norm": 0.400390625, + "learning_rate": 4.999232653870873e-06, + "loss": 2.2621, + "step": 3509 + }, + { + "epoch": 0.18830472103004292, + "grad_norm": 0.423828125, + "learning_rate": 4.9992305000283905e-06, + "loss": 2.3287, + "step": 3510 + }, + { + "epoch": 0.18835836909871245, + "grad_norm": 0.3359375, + "learning_rate": 4.999228343167826e-06, + "loss": 2.3188, + "step": 3511 + }, + { + "epoch": 0.18841201716738198, + "grad_norm": 0.30078125, + "learning_rate": 4.9992261832891865e-06, + "loss": 2.4167, + "step": 3512 + }, + { + "epoch": 0.1884656652360515, + "grad_norm": 0.27734375, + "learning_rate": 4.999224020392472e-06, + "loss": 1.9209, + "step": 3513 + }, + { + "epoch": 0.18851931330472102, + "grad_norm": 0.357421875, + "learning_rate": 4.999221854477684e-06, + "loss": 2.3907, + "step": 3514 + }, + { + "epoch": 0.18857296137339055, + "grad_norm": 0.384765625, + "learning_rate": 4.999219685544826e-06, + "loss": 2.3742, + "step": 3515 + }, + { + "epoch": 0.18862660944206008, + "grad_norm": 0.3203125, + "learning_rate": 4.999217513593901e-06, + "loss": 2.0246, + "step": 3516 + }, + { + "epoch": 0.18868025751072962, + "grad_norm": 0.5234375, + "learning_rate": 4.999215338624913e-06, + "loss": 2.4549, + "step": 3517 + }, + { + "epoch": 0.18873390557939915, + "grad_norm": 0.33203125, + "learning_rate": 4.999213160637862e-06, + "loss": 2.4412, + "step": 3518 + }, + { + "epoch": 0.18878755364806868, + "grad_norm": 1.015625, + "learning_rate": 4.999210979632753e-06, + "loss": 2.0107, + "step": 3519 + }, + { + "epoch": 0.1888412017167382, + "grad_norm": 0.318359375, + "learning_rate": 4.999208795609586e-06, + "loss": 2.3458, + "step": 3520 + }, + { + "epoch": 0.18889484978540771, + "grad_norm": 0.32421875, + "learning_rate": 4.999206608568366e-06, + "loss": 2.2923, + "step": 3521 + }, + { + "epoch": 0.18894849785407725, + "grad_norm": 0.38671875, + "learning_rate": 4.999204418509094e-06, + "loss": 2.3239, + "step": 3522 + }, + { + "epoch": 0.18900214592274678, + "grad_norm": 0.33203125, + "learning_rate": 4.999202225431774e-06, + "loss": 2.3252, + "step": 3523 + }, + { + "epoch": 0.1890557939914163, + "grad_norm": 0.59375, + "learning_rate": 4.999200029336408e-06, + "loss": 2.1715, + "step": 3524 + }, + { + "epoch": 0.18910944206008584, + "grad_norm": 10.3125, + "learning_rate": 4.999197830222998e-06, + "loss": 2.5428, + "step": 3525 + }, + { + "epoch": 0.18916309012875537, + "grad_norm": 0.74609375, + "learning_rate": 4.999195628091548e-06, + "loss": 2.5996, + "step": 3526 + }, + { + "epoch": 0.1892167381974249, + "grad_norm": 0.54296875, + "learning_rate": 4.99919342294206e-06, + "loss": 2.2091, + "step": 3527 + }, + { + "epoch": 0.1892703862660944, + "grad_norm": 0.283203125, + "learning_rate": 4.999191214774537e-06, + "loss": 2.3285, + "step": 3528 + }, + { + "epoch": 0.18932403433476394, + "grad_norm": 0.62109375, + "learning_rate": 4.99918900358898e-06, + "loss": 2.3978, + "step": 3529 + }, + { + "epoch": 0.18937768240343347, + "grad_norm": 0.37109375, + "learning_rate": 4.999186789385394e-06, + "loss": 2.5225, + "step": 3530 + }, + { + "epoch": 0.189431330472103, + "grad_norm": 0.416015625, + "learning_rate": 4.999184572163781e-06, + "loss": 2.2393, + "step": 3531 + }, + { + "epoch": 0.18948497854077254, + "grad_norm": 0.53125, + "learning_rate": 4.999182351924142e-06, + "loss": 2.3109, + "step": 3532 + }, + { + "epoch": 0.18953862660944207, + "grad_norm": 0.369140625, + "learning_rate": 4.999180128666482e-06, + "loss": 2.1772, + "step": 3533 + }, + { + "epoch": 0.1895922746781116, + "grad_norm": 0.349609375, + "learning_rate": 4.999177902390802e-06, + "loss": 2.3418, + "step": 3534 + }, + { + "epoch": 0.1896459227467811, + "grad_norm": 0.34765625, + "learning_rate": 4.999175673097107e-06, + "loss": 2.3384, + "step": 3535 + }, + { + "epoch": 0.18969957081545064, + "grad_norm": 0.322265625, + "learning_rate": 4.999173440785397e-06, + "loss": 2.2618, + "step": 3536 + }, + { + "epoch": 0.18975321888412017, + "grad_norm": 0.341796875, + "learning_rate": 4.999171205455677e-06, + "loss": 2.3976, + "step": 3537 + }, + { + "epoch": 0.1898068669527897, + "grad_norm": 0.431640625, + "learning_rate": 4.999168967107946e-06, + "loss": 2.3531, + "step": 3538 + }, + { + "epoch": 0.18986051502145923, + "grad_norm": 0.2890625, + "learning_rate": 4.999166725742211e-06, + "loss": 2.1376, + "step": 3539 + }, + { + "epoch": 0.18991416309012876, + "grad_norm": 0.341796875, + "learning_rate": 4.9991644813584725e-06, + "loss": 2.1202, + "step": 3540 + }, + { + "epoch": 0.1899678111587983, + "grad_norm": 0.419921875, + "learning_rate": 4.999162233956734e-06, + "loss": 2.3545, + "step": 3541 + }, + { + "epoch": 0.1900214592274678, + "grad_norm": 0.3125, + "learning_rate": 4.999159983536997e-06, + "loss": 2.3493, + "step": 3542 + }, + { + "epoch": 0.19007510729613733, + "grad_norm": 0.349609375, + "learning_rate": 4.999157730099265e-06, + "loss": 2.3253, + "step": 3543 + }, + { + "epoch": 0.19012875536480686, + "grad_norm": 0.40625, + "learning_rate": 4.999155473643541e-06, + "loss": 2.5497, + "step": 3544 + }, + { + "epoch": 0.1901824034334764, + "grad_norm": 0.33984375, + "learning_rate": 4.999153214169828e-06, + "loss": 2.2844, + "step": 3545 + }, + { + "epoch": 0.19023605150214593, + "grad_norm": 0.482421875, + "learning_rate": 4.999150951678128e-06, + "loss": 2.371, + "step": 3546 + }, + { + "epoch": 0.19028969957081546, + "grad_norm": 0.63671875, + "learning_rate": 4.9991486861684434e-06, + "loss": 2.2654, + "step": 3547 + }, + { + "epoch": 0.190343347639485, + "grad_norm": 0.375, + "learning_rate": 4.999146417640778e-06, + "loss": 2.2754, + "step": 3548 + }, + { + "epoch": 0.1903969957081545, + "grad_norm": 0.41796875, + "learning_rate": 4.999144146095134e-06, + "loss": 2.2407, + "step": 3549 + }, + { + "epoch": 0.19045064377682402, + "grad_norm": 0.482421875, + "learning_rate": 4.999141871531514e-06, + "loss": 2.0627, + "step": 3550 + }, + { + "epoch": 0.19050429184549356, + "grad_norm": 0.400390625, + "learning_rate": 4.999139593949921e-06, + "loss": 2.0629, + "step": 3551 + }, + { + "epoch": 0.1905579399141631, + "grad_norm": 0.73046875, + "learning_rate": 4.999137313350358e-06, + "loss": 2.3311, + "step": 3552 + }, + { + "epoch": 0.19061158798283262, + "grad_norm": 0.404296875, + "learning_rate": 4.999135029732827e-06, + "loss": 2.2931, + "step": 3553 + }, + { + "epoch": 0.19066523605150215, + "grad_norm": 1.8984375, + "learning_rate": 4.999132743097331e-06, + "loss": 2.76, + "step": 3554 + }, + { + "epoch": 0.19071888412017168, + "grad_norm": 0.345703125, + "learning_rate": 4.999130453443873e-06, + "loss": 2.5255, + "step": 3555 + }, + { + "epoch": 0.19077253218884122, + "grad_norm": 0.390625, + "learning_rate": 4.999128160772456e-06, + "loss": 2.3594, + "step": 3556 + }, + { + "epoch": 0.19082618025751072, + "grad_norm": 0.30859375, + "learning_rate": 4.999125865083082e-06, + "loss": 2.3338, + "step": 3557 + }, + { + "epoch": 0.19087982832618025, + "grad_norm": 0.333984375, + "learning_rate": 4.999123566375755e-06, + "loss": 2.1156, + "step": 3558 + }, + { + "epoch": 0.19093347639484978, + "grad_norm": 0.34765625, + "learning_rate": 4.999121264650476e-06, + "loss": 2.3692, + "step": 3559 + }, + { + "epoch": 0.19098712446351931, + "grad_norm": 0.4765625, + "learning_rate": 4.99911895990725e-06, + "loss": 2.5257, + "step": 3560 + }, + { + "epoch": 0.19104077253218885, + "grad_norm": 0.671875, + "learning_rate": 4.999116652146078e-06, + "loss": 2.3114, + "step": 3561 + }, + { + "epoch": 0.19109442060085838, + "grad_norm": 0.498046875, + "learning_rate": 4.999114341366963e-06, + "loss": 2.0707, + "step": 3562 + }, + { + "epoch": 0.1911480686695279, + "grad_norm": 0.349609375, + "learning_rate": 4.999112027569908e-06, + "loss": 2.1959, + "step": 3563 + }, + { + "epoch": 0.1912017167381974, + "grad_norm": 0.330078125, + "learning_rate": 4.999109710754917e-06, + "loss": 2.4688, + "step": 3564 + }, + { + "epoch": 0.19125536480686695, + "grad_norm": 0.34765625, + "learning_rate": 4.999107390921991e-06, + "loss": 2.0701, + "step": 3565 + }, + { + "epoch": 0.19130901287553648, + "grad_norm": 0.345703125, + "learning_rate": 4.999105068071134e-06, + "loss": 2.4694, + "step": 3566 + }, + { + "epoch": 0.191362660944206, + "grad_norm": 0.30078125, + "learning_rate": 4.999102742202348e-06, + "loss": 2.0513, + "step": 3567 + }, + { + "epoch": 0.19141630901287554, + "grad_norm": 8.5, + "learning_rate": 4.9991004133156365e-06, + "loss": 2.1997, + "step": 3568 + }, + { + "epoch": 0.19146995708154507, + "grad_norm": 0.3359375, + "learning_rate": 4.999098081411001e-06, + "loss": 2.388, + "step": 3569 + }, + { + "epoch": 0.1915236051502146, + "grad_norm": 0.53125, + "learning_rate": 4.999095746488446e-06, + "loss": 2.2121, + "step": 3570 + }, + { + "epoch": 0.1915772532188841, + "grad_norm": 0.384765625, + "learning_rate": 4.9990934085479735e-06, + "loss": 2.3827, + "step": 3571 + }, + { + "epoch": 0.19163090128755364, + "grad_norm": 0.4140625, + "learning_rate": 4.999091067589587e-06, + "loss": 2.1896, + "step": 3572 + }, + { + "epoch": 0.19168454935622317, + "grad_norm": 0.39453125, + "learning_rate": 4.999088723613288e-06, + "loss": 2.0773, + "step": 3573 + }, + { + "epoch": 0.1917381974248927, + "grad_norm": 0.310546875, + "learning_rate": 4.999086376619081e-06, + "loss": 2.4285, + "step": 3574 + }, + { + "epoch": 0.19179184549356224, + "grad_norm": 0.37109375, + "learning_rate": 4.9990840266069664e-06, + "loss": 2.2971, + "step": 3575 + }, + { + "epoch": 0.19184549356223177, + "grad_norm": 0.3984375, + "learning_rate": 4.99908167357695e-06, + "loss": 2.4197, + "step": 3576 + }, + { + "epoch": 0.1918991416309013, + "grad_norm": 0.314453125, + "learning_rate": 4.999079317529033e-06, + "loss": 2.3048, + "step": 3577 + }, + { + "epoch": 0.1919527896995708, + "grad_norm": 2.8125, + "learning_rate": 4.9990769584632175e-06, + "loss": 2.3682, + "step": 3578 + }, + { + "epoch": 0.19200643776824033, + "grad_norm": 0.359375, + "learning_rate": 4.999074596379509e-06, + "loss": 2.2912, + "step": 3579 + }, + { + "epoch": 0.19206008583690987, + "grad_norm": 0.353515625, + "learning_rate": 4.999072231277906e-06, + "loss": 2.2548, + "step": 3580 + }, + { + "epoch": 0.1921137339055794, + "grad_norm": 0.37109375, + "learning_rate": 4.999069863158417e-06, + "loss": 2.7071, + "step": 3581 + }, + { + "epoch": 0.19216738197424893, + "grad_norm": 0.326171875, + "learning_rate": 4.99906749202104e-06, + "loss": 2.2968, + "step": 3582 + }, + { + "epoch": 0.19222103004291846, + "grad_norm": 0.32421875, + "learning_rate": 4.99906511786578e-06, + "loss": 2.1637, + "step": 3583 + }, + { + "epoch": 0.192274678111588, + "grad_norm": 0.421875, + "learning_rate": 4.999062740692639e-06, + "loss": 2.0415, + "step": 3584 + }, + { + "epoch": 0.1923283261802575, + "grad_norm": 0.40234375, + "learning_rate": 4.999060360501622e-06, + "loss": 2.5261, + "step": 3585 + }, + { + "epoch": 0.19238197424892703, + "grad_norm": 0.357421875, + "learning_rate": 4.999057977292729e-06, + "loss": 2.1958, + "step": 3586 + }, + { + "epoch": 0.19243562231759656, + "grad_norm": 0.74609375, + "learning_rate": 4.999055591065966e-06, + "loss": 2.5282, + "step": 3587 + }, + { + "epoch": 0.1924892703862661, + "grad_norm": 0.333984375, + "learning_rate": 4.999053201821332e-06, + "loss": 2.351, + "step": 3588 + }, + { + "epoch": 0.19254291845493562, + "grad_norm": 0.39453125, + "learning_rate": 4.999050809558833e-06, + "loss": 2.4838, + "step": 3589 + }, + { + "epoch": 0.19259656652360516, + "grad_norm": 0.2734375, + "learning_rate": 4.999048414278471e-06, + "loss": 1.9929, + "step": 3590 + }, + { + "epoch": 0.1926502145922747, + "grad_norm": 0.328125, + "learning_rate": 4.999046015980249e-06, + "loss": 2.1449, + "step": 3591 + }, + { + "epoch": 0.19270386266094422, + "grad_norm": 0.42578125, + "learning_rate": 4.9990436146641695e-06, + "loss": 2.2974, + "step": 3592 + }, + { + "epoch": 0.19275751072961372, + "grad_norm": 0.31640625, + "learning_rate": 4.9990412103302345e-06, + "loss": 2.3795, + "step": 3593 + }, + { + "epoch": 0.19281115879828326, + "grad_norm": 0.353515625, + "learning_rate": 4.999038802978448e-06, + "loss": 2.2729, + "step": 3594 + }, + { + "epoch": 0.1928648068669528, + "grad_norm": 0.333984375, + "learning_rate": 4.999036392608815e-06, + "loss": 1.8608, + "step": 3595 + }, + { + "epoch": 0.19291845493562232, + "grad_norm": 0.3515625, + "learning_rate": 4.999033979221335e-06, + "loss": 2.2807, + "step": 3596 + }, + { + "epoch": 0.19297210300429185, + "grad_norm": 0.412109375, + "learning_rate": 4.999031562816012e-06, + "loss": 2.4529, + "step": 3597 + }, + { + "epoch": 0.19302575107296138, + "grad_norm": 0.55078125, + "learning_rate": 4.999029143392849e-06, + "loss": 2.4017, + "step": 3598 + }, + { + "epoch": 0.19307939914163091, + "grad_norm": 0.419921875, + "learning_rate": 4.999026720951849e-06, + "loss": 2.2852, + "step": 3599 + }, + { + "epoch": 0.19313304721030042, + "grad_norm": 0.380859375, + "learning_rate": 4.999024295493016e-06, + "loss": 2.3689, + "step": 3600 + }, + { + "epoch": 0.19318669527896995, + "grad_norm": 0.435546875, + "learning_rate": 4.99902186701635e-06, + "loss": 2.4554, + "step": 3601 + }, + { + "epoch": 0.19324034334763948, + "grad_norm": 0.35546875, + "learning_rate": 4.999019435521858e-06, + "loss": 2.351, + "step": 3602 + }, + { + "epoch": 0.193293991416309, + "grad_norm": 0.435546875, + "learning_rate": 4.999017001009539e-06, + "loss": 2.4714, + "step": 3603 + }, + { + "epoch": 0.19334763948497855, + "grad_norm": 0.4140625, + "learning_rate": 4.999014563479398e-06, + "loss": 2.5559, + "step": 3604 + }, + { + "epoch": 0.19340128755364808, + "grad_norm": 0.322265625, + "learning_rate": 4.999012122931439e-06, + "loss": 2.3615, + "step": 3605 + }, + { + "epoch": 0.1934549356223176, + "grad_norm": 1.390625, + "learning_rate": 4.999009679365663e-06, + "loss": 1.8281, + "step": 3606 + }, + { + "epoch": 0.1935085836909871, + "grad_norm": 0.45703125, + "learning_rate": 4.999007232782073e-06, + "loss": 2.4625, + "step": 3607 + }, + { + "epoch": 0.19356223175965664, + "grad_norm": 0.32421875, + "learning_rate": 4.999004783180673e-06, + "loss": 2.2341, + "step": 3608 + }, + { + "epoch": 0.19361587982832618, + "grad_norm": 0.306640625, + "learning_rate": 4.999002330561465e-06, + "loss": 2.2692, + "step": 3609 + }, + { + "epoch": 0.1936695278969957, + "grad_norm": 0.408203125, + "learning_rate": 4.998999874924453e-06, + "loss": 2.4344, + "step": 3610 + }, + { + "epoch": 0.19372317596566524, + "grad_norm": 0.47265625, + "learning_rate": 4.998997416269638e-06, + "loss": 2.1732, + "step": 3611 + }, + { + "epoch": 0.19377682403433477, + "grad_norm": 0.345703125, + "learning_rate": 4.998994954597026e-06, + "loss": 2.0586, + "step": 3612 + }, + { + "epoch": 0.1938304721030043, + "grad_norm": 0.34765625, + "learning_rate": 4.9989924899066176e-06, + "loss": 2.3098, + "step": 3613 + }, + { + "epoch": 0.1938841201716738, + "grad_norm": 0.3515625, + "learning_rate": 4.998990022198417e-06, + "loss": 2.131, + "step": 3614 + }, + { + "epoch": 0.19393776824034334, + "grad_norm": 0.328125, + "learning_rate": 4.998987551472426e-06, + "loss": 2.3397, + "step": 3615 + }, + { + "epoch": 0.19399141630901287, + "grad_norm": 0.443359375, + "learning_rate": 4.998985077728649e-06, + "loss": 2.5663, + "step": 3616 + }, + { + "epoch": 0.1940450643776824, + "grad_norm": 0.458984375, + "learning_rate": 4.9989826009670875e-06, + "loss": 2.2089, + "step": 3617 + }, + { + "epoch": 0.19409871244635193, + "grad_norm": 0.361328125, + "learning_rate": 4.998980121187745e-06, + "loss": 2.3569, + "step": 3618 + }, + { + "epoch": 0.19415236051502147, + "grad_norm": 0.31640625, + "learning_rate": 4.998977638390625e-06, + "loss": 2.2526, + "step": 3619 + }, + { + "epoch": 0.194206008583691, + "grad_norm": 1.1796875, + "learning_rate": 4.998975152575731e-06, + "loss": 2.3339, + "step": 3620 + }, + { + "epoch": 0.1942596566523605, + "grad_norm": 0.373046875, + "learning_rate": 4.998972663743064e-06, + "loss": 1.9098, + "step": 3621 + }, + { + "epoch": 0.19431330472103003, + "grad_norm": 0.330078125, + "learning_rate": 4.99897017189263e-06, + "loss": 2.5726, + "step": 3622 + }, + { + "epoch": 0.19436695278969957, + "grad_norm": 0.326171875, + "learning_rate": 4.998967677024429e-06, + "loss": 2.332, + "step": 3623 + }, + { + "epoch": 0.1944206008583691, + "grad_norm": 0.333984375, + "learning_rate": 4.998965179138465e-06, + "loss": 2.1371, + "step": 3624 + }, + { + "epoch": 0.19447424892703863, + "grad_norm": 0.345703125, + "learning_rate": 4.998962678234742e-06, + "loss": 2.5313, + "step": 3625 + }, + { + "epoch": 0.19452789699570816, + "grad_norm": 0.337890625, + "learning_rate": 4.998960174313263e-06, + "loss": 2.4092, + "step": 3626 + }, + { + "epoch": 0.1945815450643777, + "grad_norm": 0.49609375, + "learning_rate": 4.998957667374029e-06, + "loss": 2.5753, + "step": 3627 + }, + { + "epoch": 0.19463519313304722, + "grad_norm": 0.349609375, + "learning_rate": 4.998955157417045e-06, + "loss": 2.4157, + "step": 3628 + }, + { + "epoch": 0.19468884120171673, + "grad_norm": 0.392578125, + "learning_rate": 4.9989526444423135e-06, + "loss": 2.4477, + "step": 3629 + }, + { + "epoch": 0.19474248927038626, + "grad_norm": 0.34765625, + "learning_rate": 4.998950128449837e-06, + "loss": 2.02, + "step": 3630 + }, + { + "epoch": 0.1947961373390558, + "grad_norm": 0.3359375, + "learning_rate": 4.9989476094396185e-06, + "loss": 2.3539, + "step": 3631 + }, + { + "epoch": 0.19484978540772532, + "grad_norm": 0.279296875, + "learning_rate": 4.9989450874116625e-06, + "loss": 2.3531, + "step": 3632 + }, + { + "epoch": 0.19490343347639486, + "grad_norm": 0.328125, + "learning_rate": 4.9989425623659705e-06, + "loss": 2.3169, + "step": 3633 + }, + { + "epoch": 0.1949570815450644, + "grad_norm": 0.373046875, + "learning_rate": 4.9989400343025465e-06, + "loss": 2.2393, + "step": 3634 + }, + { + "epoch": 0.19501072961373392, + "grad_norm": 0.333984375, + "learning_rate": 4.998937503221393e-06, + "loss": 2.4262, + "step": 3635 + }, + { + "epoch": 0.19506437768240342, + "grad_norm": 0.40234375, + "learning_rate": 4.998934969122512e-06, + "loss": 2.4457, + "step": 3636 + }, + { + "epoch": 0.19511802575107295, + "grad_norm": 0.279296875, + "learning_rate": 4.998932432005909e-06, + "loss": 2.197, + "step": 3637 + }, + { + "epoch": 0.19517167381974249, + "grad_norm": 0.3515625, + "learning_rate": 4.9989298918715865e-06, + "loss": 2.3284, + "step": 3638 + }, + { + "epoch": 0.19522532188841202, + "grad_norm": 0.455078125, + "learning_rate": 4.998927348719545e-06, + "loss": 2.3696, + "step": 3639 + }, + { + "epoch": 0.19527896995708155, + "grad_norm": 0.384765625, + "learning_rate": 4.998924802549792e-06, + "loss": 2.2769, + "step": 3640 + }, + { + "epoch": 0.19533261802575108, + "grad_norm": 0.341796875, + "learning_rate": 4.998922253362326e-06, + "loss": 2.5742, + "step": 3641 + }, + { + "epoch": 0.1953862660944206, + "grad_norm": 0.353515625, + "learning_rate": 4.9989197011571525e-06, + "loss": 2.2631, + "step": 3642 + }, + { + "epoch": 0.19543991416309012, + "grad_norm": 0.41015625, + "learning_rate": 4.998917145934274e-06, + "loss": 2.2535, + "step": 3643 + }, + { + "epoch": 0.19549356223175965, + "grad_norm": 0.66015625, + "learning_rate": 4.998914587693695e-06, + "loss": 2.3326, + "step": 3644 + }, + { + "epoch": 0.19554721030042918, + "grad_norm": 0.388671875, + "learning_rate": 4.998912026435416e-06, + "loss": 2.0941, + "step": 3645 + }, + { + "epoch": 0.1956008583690987, + "grad_norm": 0.291015625, + "learning_rate": 4.9989094621594424e-06, + "loss": 2.161, + "step": 3646 + }, + { + "epoch": 0.19565450643776824, + "grad_norm": 0.328125, + "learning_rate": 4.998906894865776e-06, + "loss": 2.4242, + "step": 3647 + }, + { + "epoch": 0.19570815450643778, + "grad_norm": 0.328125, + "learning_rate": 4.99890432455442e-06, + "loss": 2.0964, + "step": 3648 + }, + { + "epoch": 0.1957618025751073, + "grad_norm": 0.34375, + "learning_rate": 4.998901751225378e-06, + "loss": 2.3705, + "step": 3649 + }, + { + "epoch": 0.1958154506437768, + "grad_norm": 0.32421875, + "learning_rate": 4.998899174878653e-06, + "loss": 2.3342, + "step": 3650 + }, + { + "epoch": 0.19586909871244634, + "grad_norm": 0.875, + "learning_rate": 4.998896595514248e-06, + "loss": 2.2027, + "step": 3651 + }, + { + "epoch": 0.19592274678111588, + "grad_norm": 0.3828125, + "learning_rate": 4.998894013132166e-06, + "loss": 2.5297, + "step": 3652 + }, + { + "epoch": 0.1959763948497854, + "grad_norm": 0.33203125, + "learning_rate": 4.998891427732411e-06, + "loss": 2.221, + "step": 3653 + }, + { + "epoch": 0.19603004291845494, + "grad_norm": 0.365234375, + "learning_rate": 4.998888839314984e-06, + "loss": 2.407, + "step": 3654 + }, + { + "epoch": 0.19608369098712447, + "grad_norm": 0.408203125, + "learning_rate": 4.9988862478798905e-06, + "loss": 2.5994, + "step": 3655 + }, + { + "epoch": 0.196137339055794, + "grad_norm": 0.4140625, + "learning_rate": 4.998883653427132e-06, + "loss": 2.1585, + "step": 3656 + }, + { + "epoch": 0.1961909871244635, + "grad_norm": 0.376953125, + "learning_rate": 4.998881055956713e-06, + "loss": 2.2046, + "step": 3657 + }, + { + "epoch": 0.19624463519313304, + "grad_norm": 0.3125, + "learning_rate": 4.9988784554686345e-06, + "loss": 2.2883, + "step": 3658 + }, + { + "epoch": 0.19629828326180257, + "grad_norm": 0.39453125, + "learning_rate": 4.998875851962902e-06, + "loss": 2.3967, + "step": 3659 + }, + { + "epoch": 0.1963519313304721, + "grad_norm": 0.625, + "learning_rate": 4.998873245439517e-06, + "loss": 2.5802, + "step": 3660 + }, + { + "epoch": 0.19640557939914163, + "grad_norm": 0.345703125, + "learning_rate": 4.998870635898484e-06, + "loss": 2.2706, + "step": 3661 + }, + { + "epoch": 0.19645922746781116, + "grad_norm": 0.298828125, + "learning_rate": 4.998868023339805e-06, + "loss": 1.9519, + "step": 3662 + }, + { + "epoch": 0.1965128755364807, + "grad_norm": 0.3671875, + "learning_rate": 4.998865407763484e-06, + "loss": 2.5355, + "step": 3663 + }, + { + "epoch": 0.1965665236051502, + "grad_norm": 0.341796875, + "learning_rate": 4.9988627891695245e-06, + "loss": 2.3811, + "step": 3664 + }, + { + "epoch": 0.19662017167381973, + "grad_norm": 0.416015625, + "learning_rate": 4.998860167557928e-06, + "loss": 2.3244, + "step": 3665 + }, + { + "epoch": 0.19667381974248926, + "grad_norm": 0.310546875, + "learning_rate": 4.998857542928698e-06, + "loss": 2.4252, + "step": 3666 + }, + { + "epoch": 0.1967274678111588, + "grad_norm": 0.494140625, + "learning_rate": 4.998854915281839e-06, + "loss": 2.2045, + "step": 3667 + }, + { + "epoch": 0.19678111587982833, + "grad_norm": 0.38671875, + "learning_rate": 4.998852284617353e-06, + "loss": 2.1697, + "step": 3668 + }, + { + "epoch": 0.19683476394849786, + "grad_norm": 0.333984375, + "learning_rate": 4.9988496509352444e-06, + "loss": 2.5588, + "step": 3669 + }, + { + "epoch": 0.1968884120171674, + "grad_norm": 0.45703125, + "learning_rate": 4.998847014235515e-06, + "loss": 2.356, + "step": 3670 + }, + { + "epoch": 0.19694206008583692, + "grad_norm": 0.4765625, + "learning_rate": 4.998844374518169e-06, + "loss": 1.7579, + "step": 3671 + }, + { + "epoch": 0.19699570815450643, + "grad_norm": 0.330078125, + "learning_rate": 4.998841731783208e-06, + "loss": 2.2309, + "step": 3672 + }, + { + "epoch": 0.19704935622317596, + "grad_norm": 0.369140625, + "learning_rate": 4.998839086030638e-06, + "loss": 2.3103, + "step": 3673 + }, + { + "epoch": 0.1971030042918455, + "grad_norm": 0.337890625, + "learning_rate": 4.9988364372604595e-06, + "loss": 2.2833, + "step": 3674 + }, + { + "epoch": 0.19715665236051502, + "grad_norm": 0.35546875, + "learning_rate": 4.998833785472678e-06, + "loss": 2.2847, + "step": 3675 + }, + { + "epoch": 0.19721030042918455, + "grad_norm": 0.3984375, + "learning_rate": 4.998831130667294e-06, + "loss": 2.5506, + "step": 3676 + }, + { + "epoch": 0.19726394849785409, + "grad_norm": 0.34765625, + "learning_rate": 4.9988284728443125e-06, + "loss": 2.1098, + "step": 3677 + }, + { + "epoch": 0.19731759656652362, + "grad_norm": 0.361328125, + "learning_rate": 4.998825812003737e-06, + "loss": 2.3558, + "step": 3678 + }, + { + "epoch": 0.19737124463519312, + "grad_norm": 0.326171875, + "learning_rate": 4.998823148145569e-06, + "loss": 2.5246, + "step": 3679 + }, + { + "epoch": 0.19742489270386265, + "grad_norm": 0.427734375, + "learning_rate": 4.998820481269813e-06, + "loss": 2.6184, + "step": 3680 + }, + { + "epoch": 0.19747854077253219, + "grad_norm": 0.283203125, + "learning_rate": 4.998817811376473e-06, + "loss": 1.9266, + "step": 3681 + }, + { + "epoch": 0.19753218884120172, + "grad_norm": 0.30859375, + "learning_rate": 4.998815138465551e-06, + "loss": 2.1919, + "step": 3682 + }, + { + "epoch": 0.19758583690987125, + "grad_norm": 0.41796875, + "learning_rate": 4.99881246253705e-06, + "loss": 2.4375, + "step": 3683 + }, + { + "epoch": 0.19763948497854078, + "grad_norm": 0.408203125, + "learning_rate": 4.998809783590974e-06, + "loss": 2.2626, + "step": 3684 + }, + { + "epoch": 0.1976931330472103, + "grad_norm": 0.337890625, + "learning_rate": 4.998807101627325e-06, + "loss": 2.3596, + "step": 3685 + }, + { + "epoch": 0.19774678111587982, + "grad_norm": 0.8046875, + "learning_rate": 4.998804416646109e-06, + "loss": 2.4084, + "step": 3686 + }, + { + "epoch": 0.19780042918454935, + "grad_norm": 0.36328125, + "learning_rate": 4.998801728647326e-06, + "loss": 2.462, + "step": 3687 + }, + { + "epoch": 0.19785407725321888, + "grad_norm": 0.33984375, + "learning_rate": 4.998799037630981e-06, + "loss": 2.0718, + "step": 3688 + }, + { + "epoch": 0.1979077253218884, + "grad_norm": 0.361328125, + "learning_rate": 4.998796343597078e-06, + "loss": 2.4215, + "step": 3689 + }, + { + "epoch": 0.19796137339055794, + "grad_norm": 0.349609375, + "learning_rate": 4.9987936465456175e-06, + "loss": 2.1503, + "step": 3690 + }, + { + "epoch": 0.19801502145922747, + "grad_norm": 0.34375, + "learning_rate": 4.9987909464766046e-06, + "loss": 2.1006, + "step": 3691 + }, + { + "epoch": 0.198068669527897, + "grad_norm": 0.333984375, + "learning_rate": 4.9987882433900436e-06, + "loss": 2.3061, + "step": 3692 + }, + { + "epoch": 0.1981223175965665, + "grad_norm": 0.53515625, + "learning_rate": 4.998785537285935e-06, + "loss": 2.2674, + "step": 3693 + }, + { + "epoch": 0.19817596566523604, + "grad_norm": 0.453125, + "learning_rate": 4.998782828164285e-06, + "loss": 2.2897, + "step": 3694 + }, + { + "epoch": 0.19822961373390557, + "grad_norm": 0.298828125, + "learning_rate": 4.998780116025095e-06, + "loss": 2.1001, + "step": 3695 + }, + { + "epoch": 0.1982832618025751, + "grad_norm": 0.296875, + "learning_rate": 4.9987774008683685e-06, + "loss": 2.1871, + "step": 3696 + }, + { + "epoch": 0.19833690987124464, + "grad_norm": 0.361328125, + "learning_rate": 4.998774682694109e-06, + "loss": 1.9129, + "step": 3697 + }, + { + "epoch": 0.19839055793991417, + "grad_norm": 0.333984375, + "learning_rate": 4.998771961502321e-06, + "loss": 2.2642, + "step": 3698 + }, + { + "epoch": 0.1984442060085837, + "grad_norm": 0.3203125, + "learning_rate": 4.998769237293005e-06, + "loss": 2.2632, + "step": 3699 + }, + { + "epoch": 0.1984978540772532, + "grad_norm": 0.41015625, + "learning_rate": 4.998766510066168e-06, + "loss": 2.4069, + "step": 3700 + }, + { + "epoch": 0.19855150214592274, + "grad_norm": 0.44921875, + "learning_rate": 4.9987637798218095e-06, + "loss": 2.3592, + "step": 3701 + }, + { + "epoch": 0.19860515021459227, + "grad_norm": 0.322265625, + "learning_rate": 4.998761046559934e-06, + "loss": 2.2194, + "step": 3702 + }, + { + "epoch": 0.1986587982832618, + "grad_norm": 0.357421875, + "learning_rate": 4.998758310280547e-06, + "loss": 2.4778, + "step": 3703 + }, + { + "epoch": 0.19871244635193133, + "grad_norm": 0.345703125, + "learning_rate": 4.998755570983649e-06, + "loss": 2.3354, + "step": 3704 + }, + { + "epoch": 0.19876609442060086, + "grad_norm": 0.61328125, + "learning_rate": 4.998752828669244e-06, + "loss": 1.5046, + "step": 3705 + }, + { + "epoch": 0.1988197424892704, + "grad_norm": 0.3125, + "learning_rate": 4.998750083337337e-06, + "loss": 2.508, + "step": 3706 + }, + { + "epoch": 0.19887339055793993, + "grad_norm": 0.32421875, + "learning_rate": 4.998747334987929e-06, + "loss": 2.264, + "step": 3707 + }, + { + "epoch": 0.19892703862660943, + "grad_norm": 0.38671875, + "learning_rate": 4.998744583621025e-06, + "loss": 2.5261, + "step": 3708 + }, + { + "epoch": 0.19898068669527896, + "grad_norm": 0.328125, + "learning_rate": 4.998741829236627e-06, + "loss": 2.2358, + "step": 3709 + }, + { + "epoch": 0.1990343347639485, + "grad_norm": 0.32421875, + "learning_rate": 4.998739071834739e-06, + "loss": 2.2591, + "step": 3710 + }, + { + "epoch": 0.19908798283261803, + "grad_norm": 0.30859375, + "learning_rate": 4.998736311415366e-06, + "loss": 2.4445, + "step": 3711 + }, + { + "epoch": 0.19914163090128756, + "grad_norm": 0.328125, + "learning_rate": 4.9987335479785085e-06, + "loss": 2.2439, + "step": 3712 + }, + { + "epoch": 0.1991952789699571, + "grad_norm": 0.333984375, + "learning_rate": 4.99873078152417e-06, + "loss": 2.4737, + "step": 3713 + }, + { + "epoch": 0.19924892703862662, + "grad_norm": 0.359375, + "learning_rate": 4.9987280120523564e-06, + "loss": 1.7776, + "step": 3714 + }, + { + "epoch": 0.19930257510729613, + "grad_norm": 0.33203125, + "learning_rate": 4.998725239563068e-06, + "loss": 2.3689, + "step": 3715 + }, + { + "epoch": 0.19935622317596566, + "grad_norm": 0.408203125, + "learning_rate": 4.998722464056312e-06, + "loss": 2.3294, + "step": 3716 + }, + { + "epoch": 0.1994098712446352, + "grad_norm": 0.384765625, + "learning_rate": 4.998719685532088e-06, + "loss": 2.37, + "step": 3717 + }, + { + "epoch": 0.19946351931330472, + "grad_norm": 0.384765625, + "learning_rate": 4.998716903990399e-06, + "loss": 2.306, + "step": 3718 + }, + { + "epoch": 0.19951716738197425, + "grad_norm": 0.62890625, + "learning_rate": 4.998714119431253e-06, + "loss": 1.5099, + "step": 3719 + }, + { + "epoch": 0.19957081545064378, + "grad_norm": 0.4140625, + "learning_rate": 4.9987113318546484e-06, + "loss": 2.1718, + "step": 3720 + }, + { + "epoch": 0.19962446351931332, + "grad_norm": 0.404296875, + "learning_rate": 4.998708541260592e-06, + "loss": 2.3903, + "step": 3721 + }, + { + "epoch": 0.19967811158798282, + "grad_norm": 0.416015625, + "learning_rate": 4.9987057476490855e-06, + "loss": 2.509, + "step": 3722 + }, + { + "epoch": 0.19973175965665235, + "grad_norm": 0.3984375, + "learning_rate": 4.998702951020132e-06, + "loss": 2.2881, + "step": 3723 + }, + { + "epoch": 0.19978540772532188, + "grad_norm": 0.458984375, + "learning_rate": 4.998700151373736e-06, + "loss": 2.3485, + "step": 3724 + }, + { + "epoch": 0.19983905579399142, + "grad_norm": 0.34375, + "learning_rate": 4.998697348709899e-06, + "loss": 2.3466, + "step": 3725 + }, + { + "epoch": 0.19989270386266095, + "grad_norm": 0.392578125, + "learning_rate": 4.9986945430286275e-06, + "loss": 2.1849, + "step": 3726 + }, + { + "epoch": 0.19994635193133048, + "grad_norm": 0.369140625, + "learning_rate": 4.998691734329923e-06, + "loss": 2.2618, + "step": 3727 + }, + { + "epoch": 0.2, + "grad_norm": 0.3984375, + "learning_rate": 4.998688922613788e-06, + "loss": 2.4533, + "step": 3728 + }, + { + "epoch": 0.20005364806866952, + "grad_norm": 0.40625, + "learning_rate": 4.998686107880227e-06, + "loss": 2.2267, + "step": 3729 + }, + { + "epoch": 0.20010729613733905, + "grad_norm": 0.306640625, + "learning_rate": 4.998683290129244e-06, + "loss": 2.298, + "step": 3730 + }, + { + "epoch": 0.20016094420600858, + "grad_norm": 0.53515625, + "learning_rate": 4.9986804693608406e-06, + "loss": 2.2704, + "step": 3731 + }, + { + "epoch": 0.2002145922746781, + "grad_norm": 0.3359375, + "learning_rate": 4.998677645575022e-06, + "loss": 2.5978, + "step": 3732 + }, + { + "epoch": 0.20026824034334764, + "grad_norm": 0.37890625, + "learning_rate": 4.998674818771792e-06, + "loss": 2.1734, + "step": 3733 + }, + { + "epoch": 0.20032188841201717, + "grad_norm": 0.337890625, + "learning_rate": 4.998671988951151e-06, + "loss": 2.1405, + "step": 3734 + }, + { + "epoch": 0.2003755364806867, + "grad_norm": 0.37109375, + "learning_rate": 4.998669156113105e-06, + "loss": 2.2729, + "step": 3735 + }, + { + "epoch": 0.2004291845493562, + "grad_norm": 0.357421875, + "learning_rate": 4.998666320257657e-06, + "loss": 2.1119, + "step": 3736 + }, + { + "epoch": 0.20048283261802574, + "grad_norm": 0.3671875, + "learning_rate": 4.99866348138481e-06, + "loss": 2.6904, + "step": 3737 + }, + { + "epoch": 0.20053648068669527, + "grad_norm": 0.361328125, + "learning_rate": 4.998660639494568e-06, + "loss": 2.3007, + "step": 3738 + }, + { + "epoch": 0.2005901287553648, + "grad_norm": 1.796875, + "learning_rate": 4.998657794586934e-06, + "loss": 2.2288, + "step": 3739 + }, + { + "epoch": 0.20064377682403434, + "grad_norm": 0.373046875, + "learning_rate": 4.998654946661911e-06, + "loss": 2.4253, + "step": 3740 + }, + { + "epoch": 0.20069742489270387, + "grad_norm": 0.29296875, + "learning_rate": 4.9986520957195025e-06, + "loss": 2.4607, + "step": 3741 + }, + { + "epoch": 0.2007510729613734, + "grad_norm": 0.357421875, + "learning_rate": 4.998649241759714e-06, + "loss": 2.2238, + "step": 3742 + }, + { + "epoch": 0.20080472103004293, + "grad_norm": 0.314453125, + "learning_rate": 4.998646384782546e-06, + "loss": 2.2562, + "step": 3743 + }, + { + "epoch": 0.20085836909871244, + "grad_norm": 0.3359375, + "learning_rate": 4.998643524788004e-06, + "loss": 2.2903, + "step": 3744 + }, + { + "epoch": 0.20091201716738197, + "grad_norm": 0.431640625, + "learning_rate": 4.9986406617760904e-06, + "loss": 2.3423, + "step": 3745 + }, + { + "epoch": 0.2009656652360515, + "grad_norm": 0.345703125, + "learning_rate": 4.998637795746809e-06, + "loss": 2.4406, + "step": 3746 + }, + { + "epoch": 0.20101931330472103, + "grad_norm": 0.326171875, + "learning_rate": 4.998634926700163e-06, + "loss": 2.4196, + "step": 3747 + }, + { + "epoch": 0.20107296137339056, + "grad_norm": 0.353515625, + "learning_rate": 4.998632054636156e-06, + "loss": 2.4486, + "step": 3748 + }, + { + "epoch": 0.2011266094420601, + "grad_norm": 1.03125, + "learning_rate": 4.998629179554792e-06, + "loss": 2.3189, + "step": 3749 + }, + { + "epoch": 0.20118025751072963, + "grad_norm": 0.361328125, + "learning_rate": 4.998626301456073e-06, + "loss": 2.4865, + "step": 3750 + }, + { + "epoch": 0.20123390557939913, + "grad_norm": 0.36328125, + "learning_rate": 4.9986234203400055e-06, + "loss": 2.4066, + "step": 3751 + }, + { + "epoch": 0.20128755364806866, + "grad_norm": 0.3671875, + "learning_rate": 4.9986205362065895e-06, + "loss": 2.1745, + "step": 3752 + }, + { + "epoch": 0.2013412017167382, + "grad_norm": 0.310546875, + "learning_rate": 4.99861764905583e-06, + "loss": 2.2674, + "step": 3753 + }, + { + "epoch": 0.20139484978540773, + "grad_norm": 0.373046875, + "learning_rate": 4.998614758887731e-06, + "loss": 2.1586, + "step": 3754 + }, + { + "epoch": 0.20144849785407726, + "grad_norm": 0.34765625, + "learning_rate": 4.998611865702295e-06, + "loss": 2.3709, + "step": 3755 + }, + { + "epoch": 0.2015021459227468, + "grad_norm": 0.353515625, + "learning_rate": 4.998608969499526e-06, + "loss": 2.5199, + "step": 3756 + }, + { + "epoch": 0.20155579399141632, + "grad_norm": 0.2890625, + "learning_rate": 4.998606070279428e-06, + "loss": 2.2897, + "step": 3757 + }, + { + "epoch": 0.20160944206008583, + "grad_norm": 0.353515625, + "learning_rate": 4.998603168042004e-06, + "loss": 2.4381, + "step": 3758 + }, + { + "epoch": 0.20166309012875536, + "grad_norm": 0.33984375, + "learning_rate": 4.998600262787256e-06, + "loss": 2.3445, + "step": 3759 + }, + { + "epoch": 0.2017167381974249, + "grad_norm": 0.373046875, + "learning_rate": 4.998597354515191e-06, + "loss": 2.5062, + "step": 3760 + }, + { + "epoch": 0.20177038626609442, + "grad_norm": 0.3671875, + "learning_rate": 4.998594443225809e-06, + "loss": 2.2543, + "step": 3761 + }, + { + "epoch": 0.20182403433476395, + "grad_norm": 0.294921875, + "learning_rate": 4.998591528919116e-06, + "loss": 2.081, + "step": 3762 + }, + { + "epoch": 0.20187768240343348, + "grad_norm": 0.4140625, + "learning_rate": 4.998588611595114e-06, + "loss": 2.414, + "step": 3763 + }, + { + "epoch": 0.20193133047210302, + "grad_norm": 0.40234375, + "learning_rate": 4.998585691253806e-06, + "loss": 1.6603, + "step": 3764 + }, + { + "epoch": 0.20198497854077252, + "grad_norm": 0.32421875, + "learning_rate": 4.998582767895198e-06, + "loss": 2.1271, + "step": 3765 + }, + { + "epoch": 0.20203862660944205, + "grad_norm": 0.34765625, + "learning_rate": 4.998579841519292e-06, + "loss": 2.2006, + "step": 3766 + }, + { + "epoch": 0.20209227467811158, + "grad_norm": 0.2890625, + "learning_rate": 4.998576912126091e-06, + "loss": 2.1217, + "step": 3767 + }, + { + "epoch": 0.20214592274678111, + "grad_norm": 0.39453125, + "learning_rate": 4.998573979715599e-06, + "loss": 2.5188, + "step": 3768 + }, + { + "epoch": 0.20219957081545065, + "grad_norm": 0.71875, + "learning_rate": 4.998571044287821e-06, + "loss": 2.6488, + "step": 3769 + }, + { + "epoch": 0.20225321888412018, + "grad_norm": 0.4375, + "learning_rate": 4.9985681058427585e-06, + "loss": 2.3367, + "step": 3770 + }, + { + "epoch": 0.2023068669527897, + "grad_norm": 0.353515625, + "learning_rate": 4.998565164380415e-06, + "loss": 2.5516, + "step": 3771 + }, + { + "epoch": 0.20236051502145921, + "grad_norm": 0.384765625, + "learning_rate": 4.998562219900795e-06, + "loss": 2.1424, + "step": 3772 + }, + { + "epoch": 0.20241416309012875, + "grad_norm": 1.375, + "learning_rate": 4.998559272403904e-06, + "loss": 2.2496, + "step": 3773 + }, + { + "epoch": 0.20246781115879828, + "grad_norm": 0.353515625, + "learning_rate": 4.998556321889741e-06, + "loss": 2.2902, + "step": 3774 + }, + { + "epoch": 0.2025214592274678, + "grad_norm": 0.412109375, + "learning_rate": 4.998553368358313e-06, + "loss": 2.1833, + "step": 3775 + }, + { + "epoch": 0.20257510729613734, + "grad_norm": 0.3828125, + "learning_rate": 4.9985504118096226e-06, + "loss": 2.4305, + "step": 3776 + }, + { + "epoch": 0.20262875536480687, + "grad_norm": 0.3125, + "learning_rate": 4.998547452243673e-06, + "loss": 2.2489, + "step": 3777 + }, + { + "epoch": 0.2026824034334764, + "grad_norm": 0.375, + "learning_rate": 4.9985444896604684e-06, + "loss": 2.4892, + "step": 3778 + }, + { + "epoch": 0.20273605150214594, + "grad_norm": 0.37109375, + "learning_rate": 4.998541524060012e-06, + "loss": 2.5133, + "step": 3779 + }, + { + "epoch": 0.20278969957081544, + "grad_norm": 0.345703125, + "learning_rate": 4.998538555442308e-06, + "loss": 2.1878, + "step": 3780 + }, + { + "epoch": 0.20284334763948497, + "grad_norm": 0.953125, + "learning_rate": 4.998535583807359e-06, + "loss": 2.0517, + "step": 3781 + }, + { + "epoch": 0.2028969957081545, + "grad_norm": 0.39453125, + "learning_rate": 4.9985326091551685e-06, + "loss": 2.6349, + "step": 3782 + }, + { + "epoch": 0.20295064377682404, + "grad_norm": 1.8515625, + "learning_rate": 4.998529631485741e-06, + "loss": 2.3338, + "step": 3783 + }, + { + "epoch": 0.20300429184549357, + "grad_norm": 0.38671875, + "learning_rate": 4.99852665079908e-06, + "loss": 2.2319, + "step": 3784 + }, + { + "epoch": 0.2030579399141631, + "grad_norm": 1.109375, + "learning_rate": 4.99852366709519e-06, + "loss": 2.4848, + "step": 3785 + }, + { + "epoch": 0.20311158798283263, + "grad_norm": 0.326171875, + "learning_rate": 4.998520680374072e-06, + "loss": 2.3535, + "step": 3786 + }, + { + "epoch": 0.20316523605150213, + "grad_norm": 0.37109375, + "learning_rate": 4.998517690635731e-06, + "loss": 2.1452, + "step": 3787 + }, + { + "epoch": 0.20321888412017167, + "grad_norm": 0.326171875, + "learning_rate": 4.998514697880171e-06, + "loss": 2.1224, + "step": 3788 + }, + { + "epoch": 0.2032725321888412, + "grad_norm": 0.32421875, + "learning_rate": 4.998511702107396e-06, + "loss": 2.0312, + "step": 3789 + }, + { + "epoch": 0.20332618025751073, + "grad_norm": 0.322265625, + "learning_rate": 4.998508703317408e-06, + "loss": 2.0127, + "step": 3790 + }, + { + "epoch": 0.20337982832618026, + "grad_norm": 0.37109375, + "learning_rate": 4.998505701510212e-06, + "loss": 2.3762, + "step": 3791 + }, + { + "epoch": 0.2034334763948498, + "grad_norm": 0.37890625, + "learning_rate": 4.9985026966858105e-06, + "loss": 2.3511, + "step": 3792 + }, + { + "epoch": 0.20348712446351933, + "grad_norm": 0.365234375, + "learning_rate": 4.998499688844208e-06, + "loss": 2.2865, + "step": 3793 + }, + { + "epoch": 0.20354077253218883, + "grad_norm": 0.423828125, + "learning_rate": 4.998496677985409e-06, + "loss": 2.2841, + "step": 3794 + }, + { + "epoch": 0.20359442060085836, + "grad_norm": 0.34765625, + "learning_rate": 4.998493664109414e-06, + "loss": 2.5196, + "step": 3795 + }, + { + "epoch": 0.2036480686695279, + "grad_norm": 0.333984375, + "learning_rate": 4.998490647216231e-06, + "loss": 2.3083, + "step": 3796 + }, + { + "epoch": 0.20370171673819742, + "grad_norm": 0.443359375, + "learning_rate": 4.99848762730586e-06, + "loss": 2.4107, + "step": 3797 + }, + { + "epoch": 0.20375536480686696, + "grad_norm": 0.419921875, + "learning_rate": 4.998484604378306e-06, + "loss": 2.3881, + "step": 3798 + }, + { + "epoch": 0.2038090128755365, + "grad_norm": 0.337890625, + "learning_rate": 4.998481578433573e-06, + "loss": 2.3179, + "step": 3799 + }, + { + "epoch": 0.20386266094420602, + "grad_norm": 0.375, + "learning_rate": 4.998478549471664e-06, + "loss": 2.0142, + "step": 3800 + }, + { + "epoch": 0.20391630901287552, + "grad_norm": 0.34375, + "learning_rate": 4.998475517492582e-06, + "loss": 2.2588, + "step": 3801 + }, + { + "epoch": 0.20396995708154506, + "grad_norm": 0.345703125, + "learning_rate": 4.998472482496334e-06, + "loss": 2.3742, + "step": 3802 + }, + { + "epoch": 0.2040236051502146, + "grad_norm": 0.3515625, + "learning_rate": 4.99846944448292e-06, + "loss": 2.1959, + "step": 3803 + }, + { + "epoch": 0.20407725321888412, + "grad_norm": 0.3515625, + "learning_rate": 4.998466403452344e-06, + "loss": 2.3707, + "step": 3804 + }, + { + "epoch": 0.20413090128755365, + "grad_norm": 0.84375, + "learning_rate": 4.998463359404612e-06, + "loss": 2.1335, + "step": 3805 + }, + { + "epoch": 0.20418454935622318, + "grad_norm": 0.390625, + "learning_rate": 4.998460312339725e-06, + "loss": 2.1404, + "step": 3806 + }, + { + "epoch": 0.20423819742489271, + "grad_norm": 0.375, + "learning_rate": 4.9984572622576884e-06, + "loss": 1.6164, + "step": 3807 + }, + { + "epoch": 0.20429184549356222, + "grad_norm": 0.302734375, + "learning_rate": 4.998454209158506e-06, + "loss": 2.225, + "step": 3808 + }, + { + "epoch": 0.20434549356223175, + "grad_norm": 0.341796875, + "learning_rate": 4.998451153042181e-06, + "loss": 2.3854, + "step": 3809 + }, + { + "epoch": 0.20439914163090128, + "grad_norm": 0.3359375, + "learning_rate": 4.998448093908717e-06, + "loss": 2.2163, + "step": 3810 + }, + { + "epoch": 0.2044527896995708, + "grad_norm": 0.400390625, + "learning_rate": 4.998445031758117e-06, + "loss": 2.2603, + "step": 3811 + }, + { + "epoch": 0.20450643776824035, + "grad_norm": 0.357421875, + "learning_rate": 4.9984419665903865e-06, + "loss": 2.2016, + "step": 3812 + }, + { + "epoch": 0.20456008583690988, + "grad_norm": 0.36328125, + "learning_rate": 4.998438898405527e-06, + "loss": 2.3492, + "step": 3813 + }, + { + "epoch": 0.2046137339055794, + "grad_norm": 0.427734375, + "learning_rate": 4.998435827203544e-06, + "loss": 2.7806, + "step": 3814 + }, + { + "epoch": 0.20466738197424894, + "grad_norm": 0.359375, + "learning_rate": 4.99843275298444e-06, + "loss": 2.5066, + "step": 3815 + }, + { + "epoch": 0.20472103004291844, + "grad_norm": 0.3359375, + "learning_rate": 4.998429675748219e-06, + "loss": 1.9309, + "step": 3816 + }, + { + "epoch": 0.20477467811158798, + "grad_norm": 0.4296875, + "learning_rate": 4.998426595494886e-06, + "loss": 2.4851, + "step": 3817 + }, + { + "epoch": 0.2048283261802575, + "grad_norm": 0.427734375, + "learning_rate": 4.998423512224443e-06, + "loss": 1.9166, + "step": 3818 + }, + { + "epoch": 0.20488197424892704, + "grad_norm": 0.365234375, + "learning_rate": 4.998420425936895e-06, + "loss": 2.2036, + "step": 3819 + }, + { + "epoch": 0.20493562231759657, + "grad_norm": 0.32421875, + "learning_rate": 4.9984173366322445e-06, + "loss": 2.3051, + "step": 3820 + }, + { + "epoch": 0.2049892703862661, + "grad_norm": 0.361328125, + "learning_rate": 4.998414244310496e-06, + "loss": 2.6282, + "step": 3821 + }, + { + "epoch": 0.20504291845493564, + "grad_norm": 0.328125, + "learning_rate": 4.998411148971653e-06, + "loss": 2.0352, + "step": 3822 + }, + { + "epoch": 0.20509656652360514, + "grad_norm": 0.482421875, + "learning_rate": 4.99840805061572e-06, + "loss": 1.9536, + "step": 3823 + }, + { + "epoch": 0.20515021459227467, + "grad_norm": 0.3359375, + "learning_rate": 4.998404949242699e-06, + "loss": 2.3131, + "step": 3824 + }, + { + "epoch": 0.2052038626609442, + "grad_norm": 0.384765625, + "learning_rate": 4.998401844852595e-06, + "loss": 2.2638, + "step": 3825 + }, + { + "epoch": 0.20525751072961373, + "grad_norm": 0.373046875, + "learning_rate": 4.9983987374454125e-06, + "loss": 2.3642, + "step": 3826 + }, + { + "epoch": 0.20531115879828327, + "grad_norm": 0.337890625, + "learning_rate": 4.9983956270211544e-06, + "loss": 2.3778, + "step": 3827 + }, + { + "epoch": 0.2053648068669528, + "grad_norm": 0.333984375, + "learning_rate": 4.998392513579823e-06, + "loss": 2.1531, + "step": 3828 + }, + { + "epoch": 0.20541845493562233, + "grad_norm": 4.5625, + "learning_rate": 4.9983893971214246e-06, + "loss": 2.4674, + "step": 3829 + }, + { + "epoch": 0.20547210300429183, + "grad_norm": 0.326171875, + "learning_rate": 4.998386277645961e-06, + "loss": 2.3772, + "step": 3830 + }, + { + "epoch": 0.20552575107296137, + "grad_norm": 0.3671875, + "learning_rate": 4.998383155153438e-06, + "loss": 2.5074, + "step": 3831 + }, + { + "epoch": 0.2055793991416309, + "grad_norm": 0.3515625, + "learning_rate": 4.998380029643857e-06, + "loss": 2.2774, + "step": 3832 + }, + { + "epoch": 0.20563304721030043, + "grad_norm": 0.3046875, + "learning_rate": 4.998376901117224e-06, + "loss": 2.2666, + "step": 3833 + }, + { + "epoch": 0.20568669527896996, + "grad_norm": 0.349609375, + "learning_rate": 4.99837376957354e-06, + "loss": 2.3217, + "step": 3834 + }, + { + "epoch": 0.2057403433476395, + "grad_norm": 0.388671875, + "learning_rate": 4.998370635012811e-06, + "loss": 2.3125, + "step": 3835 + }, + { + "epoch": 0.20579399141630902, + "grad_norm": 0.3125, + "learning_rate": 4.998367497435041e-06, + "loss": 2.3681, + "step": 3836 + }, + { + "epoch": 0.20584763948497853, + "grad_norm": 0.43359375, + "learning_rate": 4.998364356840233e-06, + "loss": 2.5208, + "step": 3837 + }, + { + "epoch": 0.20590128755364806, + "grad_norm": 0.40625, + "learning_rate": 4.99836121322839e-06, + "loss": 2.4522, + "step": 3838 + }, + { + "epoch": 0.2059549356223176, + "grad_norm": 0.5625, + "learning_rate": 4.998358066599518e-06, + "loss": 1.7315, + "step": 3839 + }, + { + "epoch": 0.20600858369098712, + "grad_norm": 0.37890625, + "learning_rate": 4.998354916953618e-06, + "loss": 2.2592, + "step": 3840 + }, + { + "epoch": 0.20606223175965666, + "grad_norm": 0.70703125, + "learning_rate": 4.998351764290696e-06, + "loss": 2.2865, + "step": 3841 + }, + { + "epoch": 0.2061158798283262, + "grad_norm": 0.33984375, + "learning_rate": 4.998348608610755e-06, + "loss": 2.37, + "step": 3842 + }, + { + "epoch": 0.20616952789699572, + "grad_norm": 0.310546875, + "learning_rate": 4.998345449913799e-06, + "loss": 2.3505, + "step": 3843 + }, + { + "epoch": 0.20622317596566522, + "grad_norm": 0.353515625, + "learning_rate": 4.998342288199831e-06, + "loss": 2.036, + "step": 3844 + }, + { + "epoch": 0.20627682403433475, + "grad_norm": 0.365234375, + "learning_rate": 4.998339123468856e-06, + "loss": 2.3174, + "step": 3845 + }, + { + "epoch": 0.2063304721030043, + "grad_norm": 0.37109375, + "learning_rate": 4.998335955720877e-06, + "loss": 2.3928, + "step": 3846 + }, + { + "epoch": 0.20638412017167382, + "grad_norm": 0.404296875, + "learning_rate": 4.998332784955898e-06, + "loss": 2.517, + "step": 3847 + }, + { + "epoch": 0.20643776824034335, + "grad_norm": 0.326171875, + "learning_rate": 4.998329611173924e-06, + "loss": 2.3277, + "step": 3848 + }, + { + "epoch": 0.20649141630901288, + "grad_norm": 0.515625, + "learning_rate": 4.998326434374957e-06, + "loss": 2.1569, + "step": 3849 + }, + { + "epoch": 0.2065450643776824, + "grad_norm": 0.376953125, + "learning_rate": 4.998323254559002e-06, + "loss": 2.2477, + "step": 3850 + }, + { + "epoch": 0.20659871244635192, + "grad_norm": 0.345703125, + "learning_rate": 4.998320071726062e-06, + "loss": 2.3162, + "step": 3851 + }, + { + "epoch": 0.20665236051502145, + "grad_norm": 0.44140625, + "learning_rate": 4.9983168858761415e-06, + "loss": 2.1794, + "step": 3852 + }, + { + "epoch": 0.20670600858369098, + "grad_norm": 0.46484375, + "learning_rate": 4.998313697009244e-06, + "loss": 2.2518, + "step": 3853 + }, + { + "epoch": 0.2067596566523605, + "grad_norm": 0.42578125, + "learning_rate": 4.998310505125374e-06, + "loss": 2.3789, + "step": 3854 + }, + { + "epoch": 0.20681330472103004, + "grad_norm": 0.447265625, + "learning_rate": 4.9983073102245346e-06, + "loss": 2.2064, + "step": 3855 + }, + { + "epoch": 0.20686695278969958, + "grad_norm": 0.34375, + "learning_rate": 4.99830411230673e-06, + "loss": 2.364, + "step": 3856 + }, + { + "epoch": 0.2069206008583691, + "grad_norm": 0.322265625, + "learning_rate": 4.998300911371964e-06, + "loss": 1.9633, + "step": 3857 + }, + { + "epoch": 0.20697424892703864, + "grad_norm": 0.345703125, + "learning_rate": 4.99829770742024e-06, + "loss": 2.3715, + "step": 3858 + }, + { + "epoch": 0.20702789699570814, + "grad_norm": 0.33203125, + "learning_rate": 4.998294500451563e-06, + "loss": 2.4128, + "step": 3859 + }, + { + "epoch": 0.20708154506437768, + "grad_norm": 0.3828125, + "learning_rate": 4.9982912904659355e-06, + "loss": 2.4717, + "step": 3860 + }, + { + "epoch": 0.2071351931330472, + "grad_norm": 0.31640625, + "learning_rate": 4.998288077463363e-06, + "loss": 2.172, + "step": 3861 + }, + { + "epoch": 0.20718884120171674, + "grad_norm": 0.36328125, + "learning_rate": 4.998284861443848e-06, + "loss": 2.4427, + "step": 3862 + }, + { + "epoch": 0.20724248927038627, + "grad_norm": 0.376953125, + "learning_rate": 4.998281642407394e-06, + "loss": 2.2613, + "step": 3863 + }, + { + "epoch": 0.2072961373390558, + "grad_norm": 0.3203125, + "learning_rate": 4.9982784203540065e-06, + "loss": 2.2279, + "step": 3864 + }, + { + "epoch": 0.20734978540772533, + "grad_norm": 0.40625, + "learning_rate": 4.998275195283689e-06, + "loss": 2.4857, + "step": 3865 + }, + { + "epoch": 0.20740343347639484, + "grad_norm": 1.46875, + "learning_rate": 4.998271967196445e-06, + "loss": 2.2866, + "step": 3866 + }, + { + "epoch": 0.20745708154506437, + "grad_norm": 0.3984375, + "learning_rate": 4.998268736092278e-06, + "loss": 2.2933, + "step": 3867 + }, + { + "epoch": 0.2075107296137339, + "grad_norm": 0.3125, + "learning_rate": 4.998265501971192e-06, + "loss": 2.1317, + "step": 3868 + }, + { + "epoch": 0.20756437768240343, + "grad_norm": 0.33984375, + "learning_rate": 4.998262264833192e-06, + "loss": 2.3518, + "step": 3869 + }, + { + "epoch": 0.20761802575107297, + "grad_norm": 0.318359375, + "learning_rate": 4.99825902467828e-06, + "loss": 2.2185, + "step": 3870 + }, + { + "epoch": 0.2076716738197425, + "grad_norm": 0.337890625, + "learning_rate": 4.998255781506463e-06, + "loss": 2.2066, + "step": 3871 + }, + { + "epoch": 0.20772532188841203, + "grad_norm": 0.3828125, + "learning_rate": 4.998252535317741e-06, + "loss": 2.335, + "step": 3872 + }, + { + "epoch": 0.20777896995708153, + "grad_norm": 0.318359375, + "learning_rate": 4.998249286112121e-06, + "loss": 2.2384, + "step": 3873 + }, + { + "epoch": 0.20783261802575106, + "grad_norm": 0.486328125, + "learning_rate": 4.998246033889605e-06, + "loss": 2.354, + "step": 3874 + }, + { + "epoch": 0.2078862660944206, + "grad_norm": 0.353515625, + "learning_rate": 4.998242778650198e-06, + "loss": 2.3831, + "step": 3875 + }, + { + "epoch": 0.20793991416309013, + "grad_norm": 0.421875, + "learning_rate": 4.998239520393903e-06, + "loss": 2.5176, + "step": 3876 + }, + { + "epoch": 0.20799356223175966, + "grad_norm": 0.3125, + "learning_rate": 4.998236259120726e-06, + "loss": 2.4492, + "step": 3877 + }, + { + "epoch": 0.2080472103004292, + "grad_norm": 0.3671875, + "learning_rate": 4.99823299483067e-06, + "loss": 2.328, + "step": 3878 + }, + { + "epoch": 0.20810085836909872, + "grad_norm": 0.3671875, + "learning_rate": 4.998229727523737e-06, + "loss": 2.2554, + "step": 3879 + }, + { + "epoch": 0.20815450643776823, + "grad_norm": 0.3359375, + "learning_rate": 4.998226457199932e-06, + "loss": 2.2248, + "step": 3880 + }, + { + "epoch": 0.20820815450643776, + "grad_norm": 247.0, + "learning_rate": 4.9982231838592605e-06, + "loss": 2.3378, + "step": 3881 + }, + { + "epoch": 0.2082618025751073, + "grad_norm": 0.326171875, + "learning_rate": 4.998219907501725e-06, + "loss": 2.093, + "step": 3882 + }, + { + "epoch": 0.20831545064377682, + "grad_norm": 0.416015625, + "learning_rate": 4.99821662812733e-06, + "loss": 2.4154, + "step": 3883 + }, + { + "epoch": 0.20836909871244635, + "grad_norm": 0.365234375, + "learning_rate": 4.998213345736079e-06, + "loss": 2.2663, + "step": 3884 + }, + { + "epoch": 0.2084227467811159, + "grad_norm": 0.62890625, + "learning_rate": 4.998210060327976e-06, + "loss": 2.1113, + "step": 3885 + }, + { + "epoch": 0.20847639484978542, + "grad_norm": 0.298828125, + "learning_rate": 4.998206771903025e-06, + "loss": 2.5071, + "step": 3886 + }, + { + "epoch": 0.20853004291845492, + "grad_norm": 0.640625, + "learning_rate": 4.99820348046123e-06, + "loss": 2.3559, + "step": 3887 + }, + { + "epoch": 0.20858369098712445, + "grad_norm": 0.431640625, + "learning_rate": 4.9982001860025965e-06, + "loss": 2.3928, + "step": 3888 + }, + { + "epoch": 0.20863733905579399, + "grad_norm": 0.33203125, + "learning_rate": 4.998196888527126e-06, + "loss": 2.3485, + "step": 3889 + }, + { + "epoch": 0.20869098712446352, + "grad_norm": 0.5, + "learning_rate": 4.998193588034824e-06, + "loss": 2.5808, + "step": 3890 + }, + { + "epoch": 0.20874463519313305, + "grad_norm": 0.46875, + "learning_rate": 4.998190284525694e-06, + "loss": 2.4403, + "step": 3891 + }, + { + "epoch": 0.20879828326180258, + "grad_norm": 0.3828125, + "learning_rate": 4.99818697799974e-06, + "loss": 2.3935, + "step": 3892 + }, + { + "epoch": 0.2088519313304721, + "grad_norm": 0.318359375, + "learning_rate": 4.998183668456965e-06, + "loss": 2.276, + "step": 3893 + }, + { + "epoch": 0.20890557939914164, + "grad_norm": 0.341796875, + "learning_rate": 4.998180355897375e-06, + "loss": 2.4047, + "step": 3894 + }, + { + "epoch": 0.20895922746781115, + "grad_norm": 0.318359375, + "learning_rate": 4.998177040320973e-06, + "loss": 2.4313, + "step": 3895 + }, + { + "epoch": 0.20901287553648068, + "grad_norm": 0.365234375, + "learning_rate": 4.998173721727764e-06, + "loss": 2.1914, + "step": 3896 + }, + { + "epoch": 0.2090665236051502, + "grad_norm": 0.392578125, + "learning_rate": 4.99817040011775e-06, + "loss": 2.2927, + "step": 3897 + }, + { + "epoch": 0.20912017167381974, + "grad_norm": 0.33984375, + "learning_rate": 4.998167075490936e-06, + "loss": 2.2463, + "step": 3898 + }, + { + "epoch": 0.20917381974248928, + "grad_norm": 0.453125, + "learning_rate": 4.998163747847326e-06, + "loss": 2.535, + "step": 3899 + }, + { + "epoch": 0.2092274678111588, + "grad_norm": 0.328125, + "learning_rate": 4.998160417186925e-06, + "loss": 2.0962, + "step": 3900 + }, + { + "epoch": 0.20928111587982834, + "grad_norm": 0.3515625, + "learning_rate": 4.998157083509735e-06, + "loss": 2.2723, + "step": 3901 + }, + { + "epoch": 0.20933476394849784, + "grad_norm": 0.37109375, + "learning_rate": 4.998153746815762e-06, + "loss": 2.0851, + "step": 3902 + }, + { + "epoch": 0.20938841201716737, + "grad_norm": 0.412109375, + "learning_rate": 4.9981504071050076e-06, + "loss": 2.0602, + "step": 3903 + }, + { + "epoch": 0.2094420600858369, + "grad_norm": 0.33203125, + "learning_rate": 4.9981470643774795e-06, + "loss": 2.3225, + "step": 3904 + }, + { + "epoch": 0.20949570815450644, + "grad_norm": 0.376953125, + "learning_rate": 4.998143718633178e-06, + "loss": 2.2877, + "step": 3905 + }, + { + "epoch": 0.20954935622317597, + "grad_norm": 0.40234375, + "learning_rate": 4.99814036987211e-06, + "loss": 2.4064, + "step": 3906 + }, + { + "epoch": 0.2096030042918455, + "grad_norm": 0.439453125, + "learning_rate": 4.998137018094278e-06, + "loss": 2.3032, + "step": 3907 + }, + { + "epoch": 0.20965665236051503, + "grad_norm": 0.376953125, + "learning_rate": 4.998133663299686e-06, + "loss": 2.0184, + "step": 3908 + }, + { + "epoch": 0.20971030042918454, + "grad_norm": 0.412109375, + "learning_rate": 4.998130305488339e-06, + "loss": 2.3746, + "step": 3909 + }, + { + "epoch": 0.20976394849785407, + "grad_norm": 0.369140625, + "learning_rate": 4.99812694466024e-06, + "loss": 2.0001, + "step": 3910 + }, + { + "epoch": 0.2098175965665236, + "grad_norm": 0.384765625, + "learning_rate": 4.998123580815394e-06, + "loss": 2.3673, + "step": 3911 + }, + { + "epoch": 0.20987124463519313, + "grad_norm": 0.294921875, + "learning_rate": 4.998120213953804e-06, + "loss": 2.2298, + "step": 3912 + }, + { + "epoch": 0.20992489270386266, + "grad_norm": 0.375, + "learning_rate": 4.998116844075474e-06, + "loss": 2.141, + "step": 3913 + }, + { + "epoch": 0.2099785407725322, + "grad_norm": 0.365234375, + "learning_rate": 4.99811347118041e-06, + "loss": 2.2594, + "step": 3914 + }, + { + "epoch": 0.21003218884120173, + "grad_norm": 0.32421875, + "learning_rate": 4.998110095268615e-06, + "loss": 2.2526, + "step": 3915 + }, + { + "epoch": 0.21008583690987123, + "grad_norm": 0.384765625, + "learning_rate": 4.998106716340092e-06, + "loss": 2.4891, + "step": 3916 + }, + { + "epoch": 0.21013948497854076, + "grad_norm": 0.35546875, + "learning_rate": 4.998103334394846e-06, + "loss": 1.9348, + "step": 3917 + }, + { + "epoch": 0.2101931330472103, + "grad_norm": 0.33984375, + "learning_rate": 4.998099949432882e-06, + "loss": 2.2799, + "step": 3918 + }, + { + "epoch": 0.21024678111587983, + "grad_norm": 0.359375, + "learning_rate": 4.998096561454202e-06, + "loss": 2.3053, + "step": 3919 + }, + { + "epoch": 0.21030042918454936, + "grad_norm": 0.41796875, + "learning_rate": 4.998093170458812e-06, + "loss": 2.3952, + "step": 3920 + }, + { + "epoch": 0.2103540772532189, + "grad_norm": 0.431640625, + "learning_rate": 4.998089776446715e-06, + "loss": 2.1114, + "step": 3921 + }, + { + "epoch": 0.21040772532188842, + "grad_norm": 0.42578125, + "learning_rate": 4.998086379417915e-06, + "loss": 2.2787, + "step": 3922 + }, + { + "epoch": 0.21046137339055793, + "grad_norm": 0.341796875, + "learning_rate": 4.998082979372418e-06, + "loss": 2.2043, + "step": 3923 + }, + { + "epoch": 0.21051502145922746, + "grad_norm": 0.423828125, + "learning_rate": 4.998079576310225e-06, + "loss": 2.3041, + "step": 3924 + }, + { + "epoch": 0.210568669527897, + "grad_norm": 0.390625, + "learning_rate": 4.998076170231343e-06, + "loss": 2.4187, + "step": 3925 + }, + { + "epoch": 0.21062231759656652, + "grad_norm": 0.365234375, + "learning_rate": 4.998072761135774e-06, + "loss": 2.1564, + "step": 3926 + }, + { + "epoch": 0.21067596566523605, + "grad_norm": 0.404296875, + "learning_rate": 4.998069349023523e-06, + "loss": 2.4847, + "step": 3927 + }, + { + "epoch": 0.21072961373390559, + "grad_norm": 0.345703125, + "learning_rate": 4.998065933894594e-06, + "loss": 2.0966, + "step": 3928 + }, + { + "epoch": 0.21078326180257512, + "grad_norm": 0.369140625, + "learning_rate": 4.998062515748992e-06, + "loss": 2.3758, + "step": 3929 + }, + { + "epoch": 0.21083690987124465, + "grad_norm": 0.357421875, + "learning_rate": 4.99805909458672e-06, + "loss": 2.3133, + "step": 3930 + }, + { + "epoch": 0.21089055793991415, + "grad_norm": 0.380859375, + "learning_rate": 4.998055670407782e-06, + "loss": 2.4737, + "step": 3931 + }, + { + "epoch": 0.21094420600858368, + "grad_norm": 0.326171875, + "learning_rate": 4.998052243212184e-06, + "loss": 2.3436, + "step": 3932 + }, + { + "epoch": 0.21099785407725322, + "grad_norm": 0.306640625, + "learning_rate": 4.998048812999927e-06, + "loss": 2.1672, + "step": 3933 + }, + { + "epoch": 0.21105150214592275, + "grad_norm": 0.34375, + "learning_rate": 4.9980453797710175e-06, + "loss": 2.491, + "step": 3934 + }, + { + "epoch": 0.21110515021459228, + "grad_norm": 0.412109375, + "learning_rate": 4.99804194352546e-06, + "loss": 2.2919, + "step": 3935 + }, + { + "epoch": 0.2111587982832618, + "grad_norm": 0.318359375, + "learning_rate": 4.998038504263256e-06, + "loss": 2.1322, + "step": 3936 + }, + { + "epoch": 0.21121244635193134, + "grad_norm": 0.32421875, + "learning_rate": 4.998035061984413e-06, + "loss": 2.3783, + "step": 3937 + }, + { + "epoch": 0.21126609442060085, + "grad_norm": 0.5546875, + "learning_rate": 4.998031616688932e-06, + "loss": 1.4935, + "step": 3938 + }, + { + "epoch": 0.21131974248927038, + "grad_norm": 0.33984375, + "learning_rate": 4.9980281683768195e-06, + "loss": 2.2284, + "step": 3939 + }, + { + "epoch": 0.2113733905579399, + "grad_norm": 0.318359375, + "learning_rate": 4.998024717048079e-06, + "loss": 1.8369, + "step": 3940 + }, + { + "epoch": 0.21142703862660944, + "grad_norm": 0.40234375, + "learning_rate": 4.9980212627027135e-06, + "loss": 2.181, + "step": 3941 + }, + { + "epoch": 0.21148068669527897, + "grad_norm": 0.357421875, + "learning_rate": 4.9980178053407295e-06, + "loss": 2.4551, + "step": 3942 + }, + { + "epoch": 0.2115343347639485, + "grad_norm": 0.375, + "learning_rate": 4.998014344962128e-06, + "loss": 2.1323, + "step": 3943 + }, + { + "epoch": 0.21158798283261804, + "grad_norm": 0.365234375, + "learning_rate": 4.998010881566917e-06, + "loss": 2.3545, + "step": 3944 + }, + { + "epoch": 0.21164163090128754, + "grad_norm": 0.32421875, + "learning_rate": 4.998007415155097e-06, + "loss": 2.1893, + "step": 3945 + }, + { + "epoch": 0.21169527896995707, + "grad_norm": 0.333984375, + "learning_rate": 4.998003945726675e-06, + "loss": 2.5058, + "step": 3946 + }, + { + "epoch": 0.2117489270386266, + "grad_norm": 0.373046875, + "learning_rate": 4.998000473281654e-06, + "loss": 2.3696, + "step": 3947 + }, + { + "epoch": 0.21180257510729614, + "grad_norm": 0.4921875, + "learning_rate": 4.997996997820038e-06, + "loss": 1.3883, + "step": 3948 + }, + { + "epoch": 0.21185622317596567, + "grad_norm": 0.345703125, + "learning_rate": 4.997993519341831e-06, + "loss": 2.171, + "step": 3949 + }, + { + "epoch": 0.2119098712446352, + "grad_norm": 0.380859375, + "learning_rate": 4.997990037847039e-06, + "loss": 2.2555, + "step": 3950 + }, + { + "epoch": 0.21196351931330473, + "grad_norm": 0.333984375, + "learning_rate": 4.997986553335664e-06, + "loss": 2.3815, + "step": 3951 + }, + { + "epoch": 0.21201716738197424, + "grad_norm": 0.349609375, + "learning_rate": 4.997983065807711e-06, + "loss": 2.4007, + "step": 3952 + }, + { + "epoch": 0.21207081545064377, + "grad_norm": 1.046875, + "learning_rate": 4.9979795752631845e-06, + "loss": 2.2918, + "step": 3953 + }, + { + "epoch": 0.2121244635193133, + "grad_norm": 0.3125, + "learning_rate": 4.997976081702089e-06, + "loss": 2.2767, + "step": 3954 + }, + { + "epoch": 0.21217811158798283, + "grad_norm": 0.353515625, + "learning_rate": 4.997972585124428e-06, + "loss": 2.3588, + "step": 3955 + }, + { + "epoch": 0.21223175965665236, + "grad_norm": 0.33203125, + "learning_rate": 4.997969085530205e-06, + "loss": 2.3985, + "step": 3956 + }, + { + "epoch": 0.2122854077253219, + "grad_norm": 0.439453125, + "learning_rate": 4.9979655829194264e-06, + "loss": 2.3488, + "step": 3957 + }, + { + "epoch": 0.21233905579399143, + "grad_norm": 0.2890625, + "learning_rate": 4.997962077292095e-06, + "loss": 2.1719, + "step": 3958 + }, + { + "epoch": 0.21239270386266093, + "grad_norm": 0.30078125, + "learning_rate": 4.997958568648214e-06, + "loss": 2.3404, + "step": 3959 + }, + { + "epoch": 0.21244635193133046, + "grad_norm": 0.3828125, + "learning_rate": 4.99795505698779e-06, + "loss": 2.3092, + "step": 3960 + }, + { + "epoch": 0.2125, + "grad_norm": 0.337890625, + "learning_rate": 4.9979515423108255e-06, + "loss": 2.3467, + "step": 3961 + }, + { + "epoch": 0.21255364806866953, + "grad_norm": 0.361328125, + "learning_rate": 4.997948024617326e-06, + "loss": 2.3594, + "step": 3962 + }, + { + "epoch": 0.21260729613733906, + "grad_norm": 0.34375, + "learning_rate": 4.9979445039072945e-06, + "loss": 2.2872, + "step": 3963 + }, + { + "epoch": 0.2126609442060086, + "grad_norm": 0.42578125, + "learning_rate": 4.997940980180736e-06, + "loss": 2.1411, + "step": 3964 + }, + { + "epoch": 0.21271459227467812, + "grad_norm": 0.33203125, + "learning_rate": 4.9979374534376546e-06, + "loss": 2.1414, + "step": 3965 + }, + { + "epoch": 0.21276824034334765, + "grad_norm": 0.546875, + "learning_rate": 4.9979339236780545e-06, + "loss": 2.229, + "step": 3966 + }, + { + "epoch": 0.21282188841201716, + "grad_norm": 0.388671875, + "learning_rate": 4.9979303909019405e-06, + "loss": 2.5274, + "step": 3967 + }, + { + "epoch": 0.2128755364806867, + "grad_norm": 0.4140625, + "learning_rate": 4.997926855109316e-06, + "loss": 2.3803, + "step": 3968 + }, + { + "epoch": 0.21292918454935622, + "grad_norm": 0.38671875, + "learning_rate": 4.997923316300186e-06, + "loss": 2.3826, + "step": 3969 + }, + { + "epoch": 0.21298283261802575, + "grad_norm": 0.40625, + "learning_rate": 4.9979197744745544e-06, + "loss": 2.3743, + "step": 3970 + }, + { + "epoch": 0.21303648068669528, + "grad_norm": 0.337890625, + "learning_rate": 4.997916229632425e-06, + "loss": 2.2563, + "step": 3971 + }, + { + "epoch": 0.21309012875536482, + "grad_norm": 0.455078125, + "learning_rate": 4.997912681773803e-06, + "loss": 2.4757, + "step": 3972 + }, + { + "epoch": 0.21314377682403435, + "grad_norm": 0.357421875, + "learning_rate": 4.9979091308986926e-06, + "loss": 2.2695, + "step": 3973 + }, + { + "epoch": 0.21319742489270385, + "grad_norm": 0.439453125, + "learning_rate": 4.9979055770070975e-06, + "loss": 2.5158, + "step": 3974 + }, + { + "epoch": 0.21325107296137338, + "grad_norm": 0.412109375, + "learning_rate": 4.997902020099022e-06, + "loss": 2.1662, + "step": 3975 + }, + { + "epoch": 0.21330472103004292, + "grad_norm": 0.41796875, + "learning_rate": 4.997898460174471e-06, + "loss": 1.9903, + "step": 3976 + }, + { + "epoch": 0.21335836909871245, + "grad_norm": 0.3125, + "learning_rate": 4.997894897233449e-06, + "loss": 2.1737, + "step": 3977 + }, + { + "epoch": 0.21341201716738198, + "grad_norm": 0.6171875, + "learning_rate": 4.997891331275958e-06, + "loss": 2.2818, + "step": 3978 + }, + { + "epoch": 0.2134656652360515, + "grad_norm": 0.33984375, + "learning_rate": 4.997887762302005e-06, + "loss": 2.3255, + "step": 3979 + }, + { + "epoch": 0.21351931330472104, + "grad_norm": 0.37890625, + "learning_rate": 4.9978841903115934e-06, + "loss": 2.3157, + "step": 3980 + }, + { + "epoch": 0.21357296137339055, + "grad_norm": 0.32421875, + "learning_rate": 4.9978806153047274e-06, + "loss": 2.364, + "step": 3981 + }, + { + "epoch": 0.21362660944206008, + "grad_norm": 0.318359375, + "learning_rate": 4.997877037281412e-06, + "loss": 2.2239, + "step": 3982 + }, + { + "epoch": 0.2136802575107296, + "grad_norm": 0.365234375, + "learning_rate": 4.997873456241651e-06, + "loss": 2.2862, + "step": 3983 + }, + { + "epoch": 0.21373390557939914, + "grad_norm": 0.43359375, + "learning_rate": 4.997869872185448e-06, + "loss": 2.4245, + "step": 3984 + }, + { + "epoch": 0.21378755364806867, + "grad_norm": 0.416015625, + "learning_rate": 4.997866285112808e-06, + "loss": 2.289, + "step": 3985 + }, + { + "epoch": 0.2138412017167382, + "grad_norm": 0.5625, + "learning_rate": 4.9978626950237355e-06, + "loss": 2.2769, + "step": 3986 + }, + { + "epoch": 0.21389484978540774, + "grad_norm": 0.326171875, + "learning_rate": 4.997859101918235e-06, + "loss": 2.2194, + "step": 3987 + }, + { + "epoch": 0.21394849785407724, + "grad_norm": 0.39453125, + "learning_rate": 4.99785550579631e-06, + "loss": 2.4305, + "step": 3988 + }, + { + "epoch": 0.21400214592274677, + "grad_norm": 0.29296875, + "learning_rate": 4.997851906657966e-06, + "loss": 2.1487, + "step": 3989 + }, + { + "epoch": 0.2140557939914163, + "grad_norm": 0.44921875, + "learning_rate": 4.997848304503206e-06, + "loss": 2.4681, + "step": 3990 + }, + { + "epoch": 0.21410944206008584, + "grad_norm": 0.38671875, + "learning_rate": 4.997844699332035e-06, + "loss": 2.4863, + "step": 3991 + }, + { + "epoch": 0.21416309012875537, + "grad_norm": 0.376953125, + "learning_rate": 4.997841091144457e-06, + "loss": 2.4268, + "step": 3992 + }, + { + "epoch": 0.2142167381974249, + "grad_norm": 0.337890625, + "learning_rate": 4.9978374799404776e-06, + "loss": 2.2287, + "step": 3993 + }, + { + "epoch": 0.21427038626609443, + "grad_norm": 0.4140625, + "learning_rate": 4.9978338657201e-06, + "loss": 2.327, + "step": 3994 + }, + { + "epoch": 0.21432403433476394, + "grad_norm": 0.373046875, + "learning_rate": 4.997830248483328e-06, + "loss": 2.2019, + "step": 3995 + }, + { + "epoch": 0.21437768240343347, + "grad_norm": 0.4375, + "learning_rate": 4.997826628230168e-06, + "loss": 2.2521, + "step": 3996 + }, + { + "epoch": 0.214431330472103, + "grad_norm": 0.318359375, + "learning_rate": 4.997823004960623e-06, + "loss": 2.3175, + "step": 3997 + }, + { + "epoch": 0.21448497854077253, + "grad_norm": 0.3203125, + "learning_rate": 4.997819378674698e-06, + "loss": 2.3772, + "step": 3998 + }, + { + "epoch": 0.21453862660944206, + "grad_norm": 0.3359375, + "learning_rate": 4.9978157493723964e-06, + "loss": 2.1896, + "step": 3999 + }, + { + "epoch": 0.2145922746781116, + "grad_norm": 0.365234375, + "learning_rate": 4.997812117053723e-06, + "loss": 2.4738, + "step": 4000 + }, + { + "epoch": 0.21464592274678113, + "grad_norm": 0.333984375, + "learning_rate": 4.9978084817186825e-06, + "loss": 2.0489, + "step": 4001 + }, + { + "epoch": 0.21469957081545063, + "grad_norm": 0.310546875, + "learning_rate": 4.997804843367279e-06, + "loss": 2.4799, + "step": 4002 + }, + { + "epoch": 0.21475321888412016, + "grad_norm": 0.41015625, + "learning_rate": 4.997801201999517e-06, + "loss": 2.145, + "step": 4003 + }, + { + "epoch": 0.2148068669527897, + "grad_norm": 0.38671875, + "learning_rate": 4.997797557615401e-06, + "loss": 2.0311, + "step": 4004 + }, + { + "epoch": 0.21486051502145923, + "grad_norm": 0.349609375, + "learning_rate": 4.997793910214935e-06, + "loss": 2.269, + "step": 4005 + }, + { + "epoch": 0.21491416309012876, + "grad_norm": 0.50390625, + "learning_rate": 4.997790259798125e-06, + "loss": 2.242, + "step": 4006 + }, + { + "epoch": 0.2149678111587983, + "grad_norm": 0.427734375, + "learning_rate": 4.997786606364973e-06, + "loss": 1.6027, + "step": 4007 + }, + { + "epoch": 0.21502145922746782, + "grad_norm": 1.2265625, + "learning_rate": 4.997782949915484e-06, + "loss": 2.1967, + "step": 4008 + }, + { + "epoch": 0.21507510729613735, + "grad_norm": 0.375, + "learning_rate": 4.997779290449663e-06, + "loss": 2.3899, + "step": 4009 + }, + { + "epoch": 0.21512875536480686, + "grad_norm": 0.326171875, + "learning_rate": 4.997775627967516e-06, + "loss": 2.3062, + "step": 4010 + }, + { + "epoch": 0.2151824034334764, + "grad_norm": 0.37109375, + "learning_rate": 4.997771962469045e-06, + "loss": 2.31, + "step": 4011 + }, + { + "epoch": 0.21523605150214592, + "grad_norm": 0.345703125, + "learning_rate": 4.9977682939542545e-06, + "loss": 2.3, + "step": 4012 + }, + { + "epoch": 0.21528969957081545, + "grad_norm": 0.62890625, + "learning_rate": 4.99776462242315e-06, + "loss": 2.2512, + "step": 4013 + }, + { + "epoch": 0.21534334763948498, + "grad_norm": 0.74609375, + "learning_rate": 4.997760947875736e-06, + "loss": 1.4578, + "step": 4014 + }, + { + "epoch": 0.21539699570815452, + "grad_norm": 0.32421875, + "learning_rate": 4.997757270312016e-06, + "loss": 2.1911, + "step": 4015 + }, + { + "epoch": 0.21545064377682405, + "grad_norm": 0.3984375, + "learning_rate": 4.997753589731995e-06, + "loss": 2.2334, + "step": 4016 + }, + { + "epoch": 0.21550429184549355, + "grad_norm": 0.353515625, + "learning_rate": 4.997749906135679e-06, + "loss": 2.3351, + "step": 4017 + }, + { + "epoch": 0.21555793991416308, + "grad_norm": 0.408203125, + "learning_rate": 4.997746219523069e-06, + "loss": 2.102, + "step": 4018 + }, + { + "epoch": 0.21561158798283261, + "grad_norm": 0.298828125, + "learning_rate": 4.997742529894171e-06, + "loss": 2.1458, + "step": 4019 + }, + { + "epoch": 0.21566523605150215, + "grad_norm": 0.376953125, + "learning_rate": 4.997738837248991e-06, + "loss": 2.2441, + "step": 4020 + }, + { + "epoch": 0.21571888412017168, + "grad_norm": 0.416015625, + "learning_rate": 4.997735141587532e-06, + "loss": 2.2863, + "step": 4021 + }, + { + "epoch": 0.2157725321888412, + "grad_norm": 0.33984375, + "learning_rate": 4.997731442909798e-06, + "loss": 2.5768, + "step": 4022 + }, + { + "epoch": 0.21582618025751074, + "grad_norm": 0.3515625, + "learning_rate": 4.9977277412157945e-06, + "loss": 2.4309, + "step": 4023 + }, + { + "epoch": 0.21587982832618025, + "grad_norm": 0.4453125, + "learning_rate": 4.997724036505525e-06, + "loss": 2.2762, + "step": 4024 + }, + { + "epoch": 0.21593347639484978, + "grad_norm": 0.322265625, + "learning_rate": 4.997720328778996e-06, + "loss": 2.1867, + "step": 4025 + }, + { + "epoch": 0.2159871244635193, + "grad_norm": 0.435546875, + "learning_rate": 4.99771661803621e-06, + "loss": 2.3371, + "step": 4026 + }, + { + "epoch": 0.21604077253218884, + "grad_norm": 0.37890625, + "learning_rate": 4.997712904277172e-06, + "loss": 2.1668, + "step": 4027 + }, + { + "epoch": 0.21609442060085837, + "grad_norm": 0.416015625, + "learning_rate": 4.9977091875018865e-06, + "loss": 2.126, + "step": 4028 + }, + { + "epoch": 0.2161480686695279, + "grad_norm": 0.390625, + "learning_rate": 4.9977054677103575e-06, + "loss": 2.3522, + "step": 4029 + }, + { + "epoch": 0.21620171673819744, + "grad_norm": 0.3125, + "learning_rate": 4.997701744902591e-06, + "loss": 2.2453, + "step": 4030 + }, + { + "epoch": 0.21625536480686694, + "grad_norm": 0.44140625, + "learning_rate": 4.9976980190785896e-06, + "loss": 2.3728, + "step": 4031 + }, + { + "epoch": 0.21630901287553647, + "grad_norm": 0.34765625, + "learning_rate": 4.99769429023836e-06, + "loss": 2.468, + "step": 4032 + }, + { + "epoch": 0.216362660944206, + "grad_norm": 0.34765625, + "learning_rate": 4.997690558381904e-06, + "loss": 2.1287, + "step": 4033 + }, + { + "epoch": 0.21641630901287554, + "grad_norm": 0.326171875, + "learning_rate": 4.997686823509228e-06, + "loss": 2.1396, + "step": 4034 + }, + { + "epoch": 0.21646995708154507, + "grad_norm": 0.5078125, + "learning_rate": 4.997683085620336e-06, + "loss": 2.2958, + "step": 4035 + }, + { + "epoch": 0.2165236051502146, + "grad_norm": 0.388671875, + "learning_rate": 4.997679344715233e-06, + "loss": 2.6031, + "step": 4036 + }, + { + "epoch": 0.21657725321888413, + "grad_norm": 0.40625, + "learning_rate": 4.997675600793922e-06, + "loss": 1.9906, + "step": 4037 + }, + { + "epoch": 0.21663090128755363, + "grad_norm": 0.3671875, + "learning_rate": 4.9976718538564095e-06, + "loss": 2.3578, + "step": 4038 + }, + { + "epoch": 0.21668454935622317, + "grad_norm": 0.431640625, + "learning_rate": 4.9976681039026996e-06, + "loss": 2.3561, + "step": 4039 + }, + { + "epoch": 0.2167381974248927, + "grad_norm": 0.33984375, + "learning_rate": 4.997664350932795e-06, + "loss": 2.258, + "step": 4040 + }, + { + "epoch": 0.21679184549356223, + "grad_norm": 0.50390625, + "learning_rate": 4.997660594946703e-06, + "loss": 2.3824, + "step": 4041 + }, + { + "epoch": 0.21684549356223176, + "grad_norm": 0.46484375, + "learning_rate": 4.9976568359444255e-06, + "loss": 2.5739, + "step": 4042 + }, + { + "epoch": 0.2168991416309013, + "grad_norm": 0.41796875, + "learning_rate": 4.997653073925969e-06, + "loss": 2.4491, + "step": 4043 + }, + { + "epoch": 0.21695278969957082, + "grad_norm": 0.4140625, + "learning_rate": 4.997649308891336e-06, + "loss": 2.1466, + "step": 4044 + }, + { + "epoch": 0.21700643776824036, + "grad_norm": 0.37109375, + "learning_rate": 4.9976455408405335e-06, + "loss": 2.2835, + "step": 4045 + }, + { + "epoch": 0.21706008583690986, + "grad_norm": 0.298828125, + "learning_rate": 4.997641769773564e-06, + "loss": 2.3029, + "step": 4046 + }, + { + "epoch": 0.2171137339055794, + "grad_norm": 0.34375, + "learning_rate": 4.9976379956904344e-06, + "loss": 2.1545, + "step": 4047 + }, + { + "epoch": 0.21716738197424892, + "grad_norm": 0.361328125, + "learning_rate": 4.997634218591147e-06, + "loss": 2.2468, + "step": 4048 + }, + { + "epoch": 0.21722103004291846, + "grad_norm": 0.609375, + "learning_rate": 4.997630438475707e-06, + "loss": 2.3443, + "step": 4049 + }, + { + "epoch": 0.217274678111588, + "grad_norm": 0.365234375, + "learning_rate": 4.99762665534412e-06, + "loss": 2.2253, + "step": 4050 + }, + { + "epoch": 0.21732832618025752, + "grad_norm": 0.369140625, + "learning_rate": 4.997622869196389e-06, + "loss": 2.3901, + "step": 4051 + }, + { + "epoch": 0.21738197424892705, + "grad_norm": 0.330078125, + "learning_rate": 4.9976190800325184e-06, + "loss": 2.0571, + "step": 4052 + }, + { + "epoch": 0.21743562231759656, + "grad_norm": 0.34765625, + "learning_rate": 4.997615287852514e-06, + "loss": 2.5846, + "step": 4053 + }, + { + "epoch": 0.2174892703862661, + "grad_norm": 0.412109375, + "learning_rate": 4.997611492656381e-06, + "loss": 2.4826, + "step": 4054 + }, + { + "epoch": 0.21754291845493562, + "grad_norm": 0.95703125, + "learning_rate": 4.9976076944441235e-06, + "loss": 2.2685, + "step": 4055 + }, + { + "epoch": 0.21759656652360515, + "grad_norm": 0.390625, + "learning_rate": 4.997603893215744e-06, + "loss": 2.1656, + "step": 4056 + }, + { + "epoch": 0.21765021459227468, + "grad_norm": 0.373046875, + "learning_rate": 4.997600088971249e-06, + "loss": 2.7254, + "step": 4057 + }, + { + "epoch": 0.21770386266094421, + "grad_norm": 0.345703125, + "learning_rate": 4.9975962817106435e-06, + "loss": 2.4529, + "step": 4058 + }, + { + "epoch": 0.21775751072961375, + "grad_norm": 0.43359375, + "learning_rate": 4.9975924714339304e-06, + "loss": 1.562, + "step": 4059 + }, + { + "epoch": 0.21781115879828325, + "grad_norm": 0.328125, + "learning_rate": 4.997588658141116e-06, + "loss": 2.209, + "step": 4060 + }, + { + "epoch": 0.21786480686695278, + "grad_norm": 0.4296875, + "learning_rate": 4.997584841832204e-06, + "loss": 2.5256, + "step": 4061 + }, + { + "epoch": 0.2179184549356223, + "grad_norm": 0.345703125, + "learning_rate": 4.997581022507199e-06, + "loss": 2.2974, + "step": 4062 + }, + { + "epoch": 0.21797210300429185, + "grad_norm": 0.345703125, + "learning_rate": 4.997577200166106e-06, + "loss": 2.1465, + "step": 4063 + }, + { + "epoch": 0.21802575107296138, + "grad_norm": 0.337890625, + "learning_rate": 4.99757337480893e-06, + "loss": 1.8446, + "step": 4064 + }, + { + "epoch": 0.2180793991416309, + "grad_norm": 0.3515625, + "learning_rate": 4.997569546435674e-06, + "loss": 2.2883, + "step": 4065 + }, + { + "epoch": 0.21813304721030044, + "grad_norm": 0.474609375, + "learning_rate": 4.997565715046344e-06, + "loss": 2.3877, + "step": 4066 + }, + { + "epoch": 0.21818669527896994, + "grad_norm": 0.3203125, + "learning_rate": 4.997561880640944e-06, + "loss": 2.4532, + "step": 4067 + }, + { + "epoch": 0.21824034334763948, + "grad_norm": 0.388671875, + "learning_rate": 4.9975580432194795e-06, + "loss": 2.1314, + "step": 4068 + }, + { + "epoch": 0.218293991416309, + "grad_norm": 0.8125, + "learning_rate": 4.997554202781954e-06, + "loss": 2.7232, + "step": 4069 + }, + { + "epoch": 0.21834763948497854, + "grad_norm": 0.357421875, + "learning_rate": 4.997550359328373e-06, + "loss": 2.2616, + "step": 4070 + }, + { + "epoch": 0.21840128755364807, + "grad_norm": 0.349609375, + "learning_rate": 4.997546512858741e-06, + "loss": 2.1862, + "step": 4071 + }, + { + "epoch": 0.2184549356223176, + "grad_norm": 0.36328125, + "learning_rate": 4.9975426633730616e-06, + "loss": 2.2073, + "step": 4072 + }, + { + "epoch": 0.21850858369098713, + "grad_norm": 0.3359375, + "learning_rate": 4.997538810871342e-06, + "loss": 2.0765, + "step": 4073 + }, + { + "epoch": 0.21856223175965664, + "grad_norm": 0.314453125, + "learning_rate": 4.997534955353584e-06, + "loss": 2.2481, + "step": 4074 + }, + { + "epoch": 0.21861587982832617, + "grad_norm": 0.318359375, + "learning_rate": 4.997531096819793e-06, + "loss": 2.4375, + "step": 4075 + }, + { + "epoch": 0.2186695278969957, + "grad_norm": 0.28515625, + "learning_rate": 4.997527235269975e-06, + "loss": 1.93, + "step": 4076 + }, + { + "epoch": 0.21872317596566523, + "grad_norm": 0.33984375, + "learning_rate": 4.997523370704133e-06, + "loss": 2.3142, + "step": 4077 + }, + { + "epoch": 0.21877682403433477, + "grad_norm": 0.294921875, + "learning_rate": 4.9975195031222736e-06, + "loss": 1.9226, + "step": 4078 + }, + { + "epoch": 0.2188304721030043, + "grad_norm": 0.302734375, + "learning_rate": 4.997515632524399e-06, + "loss": 2.2361, + "step": 4079 + }, + { + "epoch": 0.21888412017167383, + "grad_norm": 0.322265625, + "learning_rate": 4.997511758910516e-06, + "loss": 2.4612, + "step": 4080 + }, + { + "epoch": 0.21893776824034336, + "grad_norm": 0.392578125, + "learning_rate": 4.9975078822806275e-06, + "loss": 2.4059, + "step": 4081 + }, + { + "epoch": 0.21899141630901287, + "grad_norm": 0.359375, + "learning_rate": 4.99750400263474e-06, + "loss": 2.4502, + "step": 4082 + }, + { + "epoch": 0.2190450643776824, + "grad_norm": 0.484375, + "learning_rate": 4.9975001199728564e-06, + "loss": 2.518, + "step": 4083 + }, + { + "epoch": 0.21909871244635193, + "grad_norm": 0.3359375, + "learning_rate": 4.997496234294984e-06, + "loss": 2.3763, + "step": 4084 + }, + { + "epoch": 0.21915236051502146, + "grad_norm": 0.45703125, + "learning_rate": 4.9974923456011245e-06, + "loss": 2.4759, + "step": 4085 + }, + { + "epoch": 0.219206008583691, + "grad_norm": 0.62109375, + "learning_rate": 4.997488453891284e-06, + "loss": 2.4035, + "step": 4086 + }, + { + "epoch": 0.21925965665236052, + "grad_norm": 0.3125, + "learning_rate": 4.9974845591654675e-06, + "loss": 2.2103, + "step": 4087 + }, + { + "epoch": 0.21931330472103006, + "grad_norm": 0.412109375, + "learning_rate": 4.997480661423678e-06, + "loss": 2.2447, + "step": 4088 + }, + { + "epoch": 0.21936695278969956, + "grad_norm": 0.291015625, + "learning_rate": 4.997476760665923e-06, + "loss": 1.8772, + "step": 4089 + }, + { + "epoch": 0.2194206008583691, + "grad_norm": 0.48046875, + "learning_rate": 4.997472856892205e-06, + "loss": 2.334, + "step": 4090 + }, + { + "epoch": 0.21947424892703862, + "grad_norm": 0.40234375, + "learning_rate": 4.997468950102529e-06, + "loss": 2.5466, + "step": 4091 + }, + { + "epoch": 0.21952789699570815, + "grad_norm": 0.37109375, + "learning_rate": 4.997465040296901e-06, + "loss": 2.4838, + "step": 4092 + }, + { + "epoch": 0.2195815450643777, + "grad_norm": 0.376953125, + "learning_rate": 4.997461127475325e-06, + "loss": 2.4476, + "step": 4093 + }, + { + "epoch": 0.21963519313304722, + "grad_norm": 0.515625, + "learning_rate": 4.9974572116378056e-06, + "loss": 2.2645, + "step": 4094 + }, + { + "epoch": 0.21968884120171675, + "grad_norm": 0.310546875, + "learning_rate": 4.997453292784347e-06, + "loss": 2.4131, + "step": 4095 + }, + { + "epoch": 0.21974248927038625, + "grad_norm": 0.38671875, + "learning_rate": 4.997449370914955e-06, + "loss": 2.4566, + "step": 4096 + }, + { + "epoch": 0.21979613733905579, + "grad_norm": 0.37109375, + "learning_rate": 4.997445446029633e-06, + "loss": 2.037, + "step": 4097 + }, + { + "epoch": 0.21984978540772532, + "grad_norm": 0.427734375, + "learning_rate": 4.997441518128387e-06, + "loss": 2.3066, + "step": 4098 + }, + { + "epoch": 0.21990343347639485, + "grad_norm": 0.400390625, + "learning_rate": 4.997437587211221e-06, + "loss": 2.4734, + "step": 4099 + }, + { + "epoch": 0.21995708154506438, + "grad_norm": 0.3125, + "learning_rate": 4.99743365327814e-06, + "loss": 2.2844, + "step": 4100 + }, + { + "epoch": 0.2200107296137339, + "grad_norm": 0.306640625, + "learning_rate": 4.997429716329149e-06, + "loss": 2.3598, + "step": 4101 + }, + { + "epoch": 0.22006437768240344, + "grad_norm": 0.380859375, + "learning_rate": 4.997425776364252e-06, + "loss": 2.3107, + "step": 4102 + }, + { + "epoch": 0.22011802575107295, + "grad_norm": 0.3515625, + "learning_rate": 4.997421833383455e-06, + "loss": 2.2064, + "step": 4103 + }, + { + "epoch": 0.22017167381974248, + "grad_norm": 0.32421875, + "learning_rate": 4.997417887386762e-06, + "loss": 2.2864, + "step": 4104 + }, + { + "epoch": 0.220225321888412, + "grad_norm": 0.3671875, + "learning_rate": 4.997413938374177e-06, + "loss": 2.2828, + "step": 4105 + }, + { + "epoch": 0.22027896995708154, + "grad_norm": 0.34375, + "learning_rate": 4.997409986345706e-06, + "loss": 2.3734, + "step": 4106 + }, + { + "epoch": 0.22033261802575108, + "grad_norm": 0.359375, + "learning_rate": 4.997406031301354e-06, + "loss": 1.9961, + "step": 4107 + }, + { + "epoch": 0.2203862660944206, + "grad_norm": 0.3203125, + "learning_rate": 4.997402073241124e-06, + "loss": 2.2787, + "step": 4108 + }, + { + "epoch": 0.22043991416309014, + "grad_norm": 0.466796875, + "learning_rate": 4.997398112165023e-06, + "loss": 2.4275, + "step": 4109 + }, + { + "epoch": 0.22049356223175964, + "grad_norm": 0.30859375, + "learning_rate": 4.997394148073053e-06, + "loss": 2.1917, + "step": 4110 + }, + { + "epoch": 0.22054721030042918, + "grad_norm": 0.86328125, + "learning_rate": 4.997390180965222e-06, + "loss": 2.3926, + "step": 4111 + }, + { + "epoch": 0.2206008583690987, + "grad_norm": 0.34375, + "learning_rate": 4.9973862108415335e-06, + "loss": 2.445, + "step": 4112 + }, + { + "epoch": 0.22065450643776824, + "grad_norm": 0.37890625, + "learning_rate": 4.997382237701991e-06, + "loss": 2.25, + "step": 4113 + }, + { + "epoch": 0.22070815450643777, + "grad_norm": 0.3203125, + "learning_rate": 4.9973782615466e-06, + "loss": 2.0381, + "step": 4114 + }, + { + "epoch": 0.2207618025751073, + "grad_norm": 0.39453125, + "learning_rate": 4.997374282375367e-06, + "loss": 2.389, + "step": 4115 + }, + { + "epoch": 0.22081545064377683, + "grad_norm": 0.361328125, + "learning_rate": 4.9973703001882955e-06, + "loss": 2.2784, + "step": 4116 + }, + { + "epoch": 0.22086909871244637, + "grad_norm": 0.3515625, + "learning_rate": 4.997366314985389e-06, + "loss": 2.166, + "step": 4117 + }, + { + "epoch": 0.22092274678111587, + "grad_norm": 0.447265625, + "learning_rate": 4.997362326766654e-06, + "loss": 1.4802, + "step": 4118 + }, + { + "epoch": 0.2209763948497854, + "grad_norm": 0.392578125, + "learning_rate": 4.997358335532095e-06, + "loss": 2.4287, + "step": 4119 + }, + { + "epoch": 0.22103004291845493, + "grad_norm": 0.380859375, + "learning_rate": 4.997354341281717e-06, + "loss": 2.1453, + "step": 4120 + }, + { + "epoch": 0.22108369098712446, + "grad_norm": 0.423828125, + "learning_rate": 4.9973503440155245e-06, + "loss": 2.2321, + "step": 4121 + }, + { + "epoch": 0.221137339055794, + "grad_norm": 0.380859375, + "learning_rate": 4.9973463437335226e-06, + "loss": 2.4141, + "step": 4122 + }, + { + "epoch": 0.22119098712446353, + "grad_norm": 0.357421875, + "learning_rate": 4.997342340435715e-06, + "loss": 2.3735, + "step": 4123 + }, + { + "epoch": 0.22124463519313306, + "grad_norm": 0.40625, + "learning_rate": 4.997338334122108e-06, + "loss": 2.4722, + "step": 4124 + }, + { + "epoch": 0.22129828326180256, + "grad_norm": 0.37109375, + "learning_rate": 4.997334324792706e-06, + "loss": 2.1414, + "step": 4125 + }, + { + "epoch": 0.2213519313304721, + "grad_norm": 0.455078125, + "learning_rate": 4.997330312447514e-06, + "loss": 2.3678, + "step": 4126 + }, + { + "epoch": 0.22140557939914163, + "grad_norm": 0.359375, + "learning_rate": 4.997326297086536e-06, + "loss": 2.4745, + "step": 4127 + }, + { + "epoch": 0.22145922746781116, + "grad_norm": 0.2890625, + "learning_rate": 4.997322278709777e-06, + "loss": 2.2053, + "step": 4128 + }, + { + "epoch": 0.2215128755364807, + "grad_norm": 0.4296875, + "learning_rate": 4.997318257317243e-06, + "loss": 2.3606, + "step": 4129 + }, + { + "epoch": 0.22156652360515022, + "grad_norm": 0.5, + "learning_rate": 4.997314232908938e-06, + "loss": 2.3505, + "step": 4130 + }, + { + "epoch": 0.22162017167381975, + "grad_norm": 0.37109375, + "learning_rate": 4.997310205484866e-06, + "loss": 2.3237, + "step": 4131 + }, + { + "epoch": 0.22167381974248926, + "grad_norm": 0.341796875, + "learning_rate": 4.997306175045034e-06, + "loss": 2.1099, + "step": 4132 + }, + { + "epoch": 0.2217274678111588, + "grad_norm": 0.3359375, + "learning_rate": 4.997302141589445e-06, + "loss": 2.2394, + "step": 4133 + }, + { + "epoch": 0.22178111587982832, + "grad_norm": 0.330078125, + "learning_rate": 4.997298105118106e-06, + "loss": 2.2243, + "step": 4134 + }, + { + "epoch": 0.22183476394849785, + "grad_norm": 0.47265625, + "learning_rate": 4.997294065631019e-06, + "loss": 2.2589, + "step": 4135 + }, + { + "epoch": 0.22188841201716739, + "grad_norm": 0.5234375, + "learning_rate": 4.99729002312819e-06, + "loss": 2.3076, + "step": 4136 + }, + { + "epoch": 0.22194206008583692, + "grad_norm": 0.333984375, + "learning_rate": 4.997285977609625e-06, + "loss": 2.4285, + "step": 4137 + }, + { + "epoch": 0.22199570815450645, + "grad_norm": 0.357421875, + "learning_rate": 4.997281929075328e-06, + "loss": 2.4437, + "step": 4138 + }, + { + "epoch": 0.22204935622317595, + "grad_norm": 1.203125, + "learning_rate": 4.997277877525304e-06, + "loss": 2.49, + "step": 4139 + }, + { + "epoch": 0.22210300429184548, + "grad_norm": 0.375, + "learning_rate": 4.997273822959558e-06, + "loss": 2.196, + "step": 4140 + }, + { + "epoch": 0.22215665236051502, + "grad_norm": 0.3359375, + "learning_rate": 4.997269765378095e-06, + "loss": 2.1052, + "step": 4141 + }, + { + "epoch": 0.22221030042918455, + "grad_norm": 0.37890625, + "learning_rate": 4.997265704780919e-06, + "loss": 2.2092, + "step": 4142 + }, + { + "epoch": 0.22226394849785408, + "grad_norm": 0.33203125, + "learning_rate": 4.997261641168037e-06, + "loss": 2.3784, + "step": 4143 + }, + { + "epoch": 0.2223175965665236, + "grad_norm": 0.50390625, + "learning_rate": 4.99725757453945e-06, + "loss": 2.4013, + "step": 4144 + }, + { + "epoch": 0.22237124463519314, + "grad_norm": 0.337890625, + "learning_rate": 4.997253504895168e-06, + "loss": 2.1885, + "step": 4145 + }, + { + "epoch": 0.22242489270386265, + "grad_norm": 0.3828125, + "learning_rate": 4.997249432235191e-06, + "loss": 2.4027, + "step": 4146 + }, + { + "epoch": 0.22247854077253218, + "grad_norm": 0.29296875, + "learning_rate": 4.997245356559528e-06, + "loss": 2.4625, + "step": 4147 + }, + { + "epoch": 0.2225321888412017, + "grad_norm": 0.49609375, + "learning_rate": 4.997241277868181e-06, + "loss": 2.4189, + "step": 4148 + }, + { + "epoch": 0.22258583690987124, + "grad_norm": 0.33984375, + "learning_rate": 4.9972371961611565e-06, + "loss": 2.1018, + "step": 4149 + }, + { + "epoch": 0.22263948497854077, + "grad_norm": 0.46875, + "learning_rate": 4.9972331114384595e-06, + "loss": 2.3631, + "step": 4150 + }, + { + "epoch": 0.2226931330472103, + "grad_norm": 0.35546875, + "learning_rate": 4.997229023700094e-06, + "loss": 2.1902, + "step": 4151 + }, + { + "epoch": 0.22274678111587984, + "grad_norm": 0.3515625, + "learning_rate": 4.997224932946065e-06, + "loss": 2.4781, + "step": 4152 + }, + { + "epoch": 0.22280042918454937, + "grad_norm": 0.3671875, + "learning_rate": 4.997220839176379e-06, + "loss": 2.25, + "step": 4153 + }, + { + "epoch": 0.22285407725321887, + "grad_norm": 0.4140625, + "learning_rate": 4.997216742391038e-06, + "loss": 2.0111, + "step": 4154 + }, + { + "epoch": 0.2229077253218884, + "grad_norm": 0.423828125, + "learning_rate": 4.99721264259005e-06, + "loss": 2.5159, + "step": 4155 + }, + { + "epoch": 0.22296137339055794, + "grad_norm": 0.49609375, + "learning_rate": 4.997208539773418e-06, + "loss": 2.3582, + "step": 4156 + }, + { + "epoch": 0.22301502145922747, + "grad_norm": 0.375, + "learning_rate": 4.997204433941148e-06, + "loss": 2.509, + "step": 4157 + }, + { + "epoch": 0.223068669527897, + "grad_norm": 0.396484375, + "learning_rate": 4.997200325093244e-06, + "loss": 2.416, + "step": 4158 + }, + { + "epoch": 0.22312231759656653, + "grad_norm": 0.412109375, + "learning_rate": 4.997196213229713e-06, + "loss": 1.8608, + "step": 4159 + }, + { + "epoch": 0.22317596566523606, + "grad_norm": 0.306640625, + "learning_rate": 4.9971920983505566e-06, + "loss": 2.1949, + "step": 4160 + }, + { + "epoch": 0.22322961373390557, + "grad_norm": 1.5703125, + "learning_rate": 4.997187980455783e-06, + "loss": 2.255, + "step": 4161 + }, + { + "epoch": 0.2232832618025751, + "grad_norm": 0.50390625, + "learning_rate": 4.9971838595453945e-06, + "loss": 2.293, + "step": 4162 + }, + { + "epoch": 0.22333690987124463, + "grad_norm": 0.451171875, + "learning_rate": 4.997179735619399e-06, + "loss": 2.4587, + "step": 4163 + }, + { + "epoch": 0.22339055793991416, + "grad_norm": 0.353515625, + "learning_rate": 4.997175608677799e-06, + "loss": 2.5435, + "step": 4164 + }, + { + "epoch": 0.2234442060085837, + "grad_norm": 0.37109375, + "learning_rate": 4.997171478720601e-06, + "loss": 2.3422, + "step": 4165 + }, + { + "epoch": 0.22349785407725323, + "grad_norm": 0.421875, + "learning_rate": 4.997167345747809e-06, + "loss": 2.1909, + "step": 4166 + }, + { + "epoch": 0.22355150214592276, + "grad_norm": 0.30859375, + "learning_rate": 4.997163209759428e-06, + "loss": 2.2839, + "step": 4167 + }, + { + "epoch": 0.22360515021459226, + "grad_norm": 0.384765625, + "learning_rate": 4.997159070755464e-06, + "loss": 2.201, + "step": 4168 + }, + { + "epoch": 0.2236587982832618, + "grad_norm": 4.53125, + "learning_rate": 4.99715492873592e-06, + "loss": 2.4416, + "step": 4169 + }, + { + "epoch": 0.22371244635193133, + "grad_norm": 0.35546875, + "learning_rate": 4.997150783700804e-06, + "loss": 2.2467, + "step": 4170 + }, + { + "epoch": 0.22376609442060086, + "grad_norm": 1.0703125, + "learning_rate": 4.997146635650119e-06, + "loss": 2.3992, + "step": 4171 + }, + { + "epoch": 0.2238197424892704, + "grad_norm": 0.419921875, + "learning_rate": 4.9971424845838695e-06, + "loss": 2.1445, + "step": 4172 + }, + { + "epoch": 0.22387339055793992, + "grad_norm": 0.404296875, + "learning_rate": 4.997138330502062e-06, + "loss": 2.114, + "step": 4173 + }, + { + "epoch": 0.22392703862660945, + "grad_norm": 0.55859375, + "learning_rate": 4.997134173404701e-06, + "loss": 2.3427, + "step": 4174 + }, + { + "epoch": 0.22398068669527896, + "grad_norm": 0.328125, + "learning_rate": 4.997130013291791e-06, + "loss": 2.0437, + "step": 4175 + }, + { + "epoch": 0.2240343347639485, + "grad_norm": 0.443359375, + "learning_rate": 4.997125850163338e-06, + "loss": 2.4732, + "step": 4176 + }, + { + "epoch": 0.22408798283261802, + "grad_norm": 0.494140625, + "learning_rate": 4.997121684019346e-06, + "loss": 2.1482, + "step": 4177 + }, + { + "epoch": 0.22414163090128755, + "grad_norm": 0.34765625, + "learning_rate": 4.99711751485982e-06, + "loss": 2.3567, + "step": 4178 + }, + { + "epoch": 0.22419527896995708, + "grad_norm": 0.294921875, + "learning_rate": 4.997113342684766e-06, + "loss": 2.1047, + "step": 4179 + }, + { + "epoch": 0.22424892703862662, + "grad_norm": 0.373046875, + "learning_rate": 4.997109167494189e-06, + "loss": 2.2717, + "step": 4180 + }, + { + "epoch": 0.22430257510729615, + "grad_norm": 0.3359375, + "learning_rate": 4.997104989288094e-06, + "loss": 2.3135, + "step": 4181 + }, + { + "epoch": 0.22435622317596565, + "grad_norm": 0.421875, + "learning_rate": 4.997100808066485e-06, + "loss": 2.3569, + "step": 4182 + }, + { + "epoch": 0.22440987124463518, + "grad_norm": 0.3359375, + "learning_rate": 4.997096623829367e-06, + "loss": 2.2812, + "step": 4183 + }, + { + "epoch": 0.22446351931330472, + "grad_norm": 0.326171875, + "learning_rate": 4.997092436576747e-06, + "loss": 2.2852, + "step": 4184 + }, + { + "epoch": 0.22451716738197425, + "grad_norm": 1.1171875, + "learning_rate": 4.997088246308628e-06, + "loss": 2.2118, + "step": 4185 + }, + { + "epoch": 0.22457081545064378, + "grad_norm": 0.376953125, + "learning_rate": 4.997084053025017e-06, + "loss": 2.3466, + "step": 4186 + }, + { + "epoch": 0.2246244635193133, + "grad_norm": 0.310546875, + "learning_rate": 4.997079856725916e-06, + "loss": 2.2519, + "step": 4187 + }, + { + "epoch": 0.22467811158798284, + "grad_norm": 0.6640625, + "learning_rate": 4.997075657411334e-06, + "loss": 2.2376, + "step": 4188 + }, + { + "epoch": 0.22473175965665235, + "grad_norm": 0.482421875, + "learning_rate": 4.997071455081272e-06, + "loss": 2.3722, + "step": 4189 + }, + { + "epoch": 0.22478540772532188, + "grad_norm": 0.306640625, + "learning_rate": 4.997067249735739e-06, + "loss": 2.3682, + "step": 4190 + }, + { + "epoch": 0.2248390557939914, + "grad_norm": 0.3203125, + "learning_rate": 4.997063041374738e-06, + "loss": 2.3158, + "step": 4191 + }, + { + "epoch": 0.22489270386266094, + "grad_norm": 0.3046875, + "learning_rate": 4.997058829998273e-06, + "loss": 2.2059, + "step": 4192 + }, + { + "epoch": 0.22494635193133047, + "grad_norm": 0.39453125, + "learning_rate": 4.997054615606351e-06, + "loss": 2.2753, + "step": 4193 + }, + { + "epoch": 0.225, + "grad_norm": 0.33203125, + "learning_rate": 4.997050398198977e-06, + "loss": 2.3583, + "step": 4194 + }, + { + "epoch": 0.22505364806866954, + "grad_norm": 0.384765625, + "learning_rate": 4.997046177776156e-06, + "loss": 1.6972, + "step": 4195 + }, + { + "epoch": 0.22510729613733907, + "grad_norm": 0.349609375, + "learning_rate": 4.997041954337891e-06, + "loss": 2.3159, + "step": 4196 + }, + { + "epoch": 0.22516094420600857, + "grad_norm": 0.400390625, + "learning_rate": 4.9970377278841905e-06, + "loss": 2.3623, + "step": 4197 + }, + { + "epoch": 0.2252145922746781, + "grad_norm": 0.3828125, + "learning_rate": 4.997033498415057e-06, + "loss": 2.429, + "step": 4198 + }, + { + "epoch": 0.22526824034334764, + "grad_norm": 0.796875, + "learning_rate": 4.9970292659304965e-06, + "loss": 2.2387, + "step": 4199 + }, + { + "epoch": 0.22532188841201717, + "grad_norm": 0.3046875, + "learning_rate": 4.997025030430515e-06, + "loss": 2.0988, + "step": 4200 + }, + { + "epoch": 0.2253755364806867, + "grad_norm": 0.341796875, + "learning_rate": 4.997020791915115e-06, + "loss": 2.1337, + "step": 4201 + }, + { + "epoch": 0.22542918454935623, + "grad_norm": 0.3671875, + "learning_rate": 4.997016550384305e-06, + "loss": 2.2575, + "step": 4202 + }, + { + "epoch": 0.22548283261802576, + "grad_norm": 0.369140625, + "learning_rate": 4.997012305838087e-06, + "loss": 2.342, + "step": 4203 + }, + { + "epoch": 0.22553648068669527, + "grad_norm": 0.365234375, + "learning_rate": 4.997008058276469e-06, + "loss": 2.5416, + "step": 4204 + }, + { + "epoch": 0.2255901287553648, + "grad_norm": 0.359375, + "learning_rate": 4.997003807699454e-06, + "loss": 2.1543, + "step": 4205 + }, + { + "epoch": 0.22564377682403433, + "grad_norm": 0.373046875, + "learning_rate": 4.9969995541070485e-06, + "loss": 2.4598, + "step": 4206 + }, + { + "epoch": 0.22569742489270386, + "grad_norm": 0.353515625, + "learning_rate": 4.996995297499256e-06, + "loss": 2.1357, + "step": 4207 + }, + { + "epoch": 0.2257510729613734, + "grad_norm": 0.314453125, + "learning_rate": 4.996991037876083e-06, + "loss": 1.5575, + "step": 4208 + }, + { + "epoch": 0.22580472103004293, + "grad_norm": 0.283203125, + "learning_rate": 4.996986775237535e-06, + "loss": 2.352, + "step": 4209 + }, + { + "epoch": 0.22585836909871246, + "grad_norm": 0.408203125, + "learning_rate": 4.996982509583615e-06, + "loss": 2.2336, + "step": 4210 + }, + { + "epoch": 0.22591201716738196, + "grad_norm": 0.37109375, + "learning_rate": 4.996978240914329e-06, + "loss": 2.4904, + "step": 4211 + }, + { + "epoch": 0.2259656652360515, + "grad_norm": 0.35546875, + "learning_rate": 4.9969739692296845e-06, + "loss": 1.9298, + "step": 4212 + }, + { + "epoch": 0.22601931330472103, + "grad_norm": 0.447265625, + "learning_rate": 4.996969694529685e-06, + "loss": 2.0481, + "step": 4213 + }, + { + "epoch": 0.22607296137339056, + "grad_norm": 0.357421875, + "learning_rate": 4.996965416814335e-06, + "loss": 2.2185, + "step": 4214 + }, + { + "epoch": 0.2261266094420601, + "grad_norm": 0.482421875, + "learning_rate": 4.9969611360836405e-06, + "loss": 2.3952, + "step": 4215 + }, + { + "epoch": 0.22618025751072962, + "grad_norm": 0.314453125, + "learning_rate": 4.996956852337606e-06, + "loss": 2.407, + "step": 4216 + }, + { + "epoch": 0.22623390557939915, + "grad_norm": 0.384765625, + "learning_rate": 4.996952565576236e-06, + "loss": 2.3033, + "step": 4217 + }, + { + "epoch": 0.22628755364806866, + "grad_norm": 0.34765625, + "learning_rate": 4.996948275799538e-06, + "loss": 1.9608, + "step": 4218 + }, + { + "epoch": 0.2263412017167382, + "grad_norm": 24.375, + "learning_rate": 4.996943983007515e-06, + "loss": 2.5074, + "step": 4219 + }, + { + "epoch": 0.22639484978540772, + "grad_norm": 0.380859375, + "learning_rate": 4.996939687200174e-06, + "loss": 2.3027, + "step": 4220 + }, + { + "epoch": 0.22644849785407725, + "grad_norm": 0.353515625, + "learning_rate": 4.99693538837752e-06, + "loss": 2.1501, + "step": 4221 + }, + { + "epoch": 0.22650214592274678, + "grad_norm": 0.30078125, + "learning_rate": 4.996931086539556e-06, + "loss": 2.233, + "step": 4222 + }, + { + "epoch": 0.22655579399141632, + "grad_norm": 0.421875, + "learning_rate": 4.996926781686289e-06, + "loss": 2.155, + "step": 4223 + }, + { + "epoch": 0.22660944206008585, + "grad_norm": 0.34375, + "learning_rate": 4.996922473817724e-06, + "loss": 2.2284, + "step": 4224 + }, + { + "epoch": 0.22666309012875535, + "grad_norm": 0.353515625, + "learning_rate": 4.996918162933866e-06, + "loss": 2.1372, + "step": 4225 + }, + { + "epoch": 0.22671673819742488, + "grad_norm": 0.45703125, + "learning_rate": 4.99691384903472e-06, + "loss": 1.8424, + "step": 4226 + }, + { + "epoch": 0.22677038626609441, + "grad_norm": 0.388671875, + "learning_rate": 4.996909532120292e-06, + "loss": 1.9846, + "step": 4227 + }, + { + "epoch": 0.22682403433476395, + "grad_norm": 0.423828125, + "learning_rate": 4.996905212190587e-06, + "loss": 1.6471, + "step": 4228 + }, + { + "epoch": 0.22687768240343348, + "grad_norm": 0.373046875, + "learning_rate": 4.99690088924561e-06, + "loss": 2.2717, + "step": 4229 + }, + { + "epoch": 0.226931330472103, + "grad_norm": 0.357421875, + "learning_rate": 4.996896563285365e-06, + "loss": 2.4051, + "step": 4230 + }, + { + "epoch": 0.22698497854077254, + "grad_norm": 0.37109375, + "learning_rate": 4.996892234309859e-06, + "loss": 2.343, + "step": 4231 + }, + { + "epoch": 0.22703862660944207, + "grad_norm": 0.345703125, + "learning_rate": 4.996887902319097e-06, + "loss": 1.8322, + "step": 4232 + }, + { + "epoch": 0.22709227467811158, + "grad_norm": 0.87890625, + "learning_rate": 4.996883567313083e-06, + "loss": 2.1477, + "step": 4233 + }, + { + "epoch": 0.2271459227467811, + "grad_norm": 0.34375, + "learning_rate": 4.996879229291823e-06, + "loss": 2.3502, + "step": 4234 + }, + { + "epoch": 0.22719957081545064, + "grad_norm": 0.46484375, + "learning_rate": 4.996874888255322e-06, + "loss": 2.3956, + "step": 4235 + }, + { + "epoch": 0.22725321888412017, + "grad_norm": 0.431640625, + "learning_rate": 4.996870544203586e-06, + "loss": 2.4711, + "step": 4236 + }, + { + "epoch": 0.2273068669527897, + "grad_norm": 0.62109375, + "learning_rate": 4.996866197136621e-06, + "loss": 2.2162, + "step": 4237 + }, + { + "epoch": 0.22736051502145924, + "grad_norm": 0.55078125, + "learning_rate": 4.996861847054429e-06, + "loss": 2.305, + "step": 4238 + }, + { + "epoch": 0.22741416309012877, + "grad_norm": 0.376953125, + "learning_rate": 4.996857493957019e-06, + "loss": 2.3068, + "step": 4239 + }, + { + "epoch": 0.22746781115879827, + "grad_norm": 0.484375, + "learning_rate": 4.996853137844393e-06, + "loss": 2.3764, + "step": 4240 + }, + { + "epoch": 0.2275214592274678, + "grad_norm": 0.47265625, + "learning_rate": 4.996848778716558e-06, + "loss": 2.2387, + "step": 4241 + }, + { + "epoch": 0.22757510729613734, + "grad_norm": 0.40234375, + "learning_rate": 4.9968444165735204e-06, + "loss": 2.3558, + "step": 4242 + }, + { + "epoch": 0.22762875536480687, + "grad_norm": 0.375, + "learning_rate": 4.996840051415284e-06, + "loss": 2.2759, + "step": 4243 + }, + { + "epoch": 0.2276824034334764, + "grad_norm": 0.59375, + "learning_rate": 4.996835683241853e-06, + "loss": 2.1816, + "step": 4244 + }, + { + "epoch": 0.22773605150214593, + "grad_norm": 0.470703125, + "learning_rate": 4.996831312053234e-06, + "loss": 1.4201, + "step": 4245 + }, + { + "epoch": 0.22778969957081546, + "grad_norm": 0.3671875, + "learning_rate": 4.9968269378494325e-06, + "loss": 2.026, + "step": 4246 + }, + { + "epoch": 0.22784334763948497, + "grad_norm": 0.466796875, + "learning_rate": 4.996822560630454e-06, + "loss": 2.2255, + "step": 4247 + }, + { + "epoch": 0.2278969957081545, + "grad_norm": 0.341796875, + "learning_rate": 4.996818180396303e-06, + "loss": 2.3907, + "step": 4248 + }, + { + "epoch": 0.22795064377682403, + "grad_norm": 0.365234375, + "learning_rate": 4.996813797146984e-06, + "loss": 2.4678, + "step": 4249 + }, + { + "epoch": 0.22800429184549356, + "grad_norm": 0.439453125, + "learning_rate": 4.996809410882504e-06, + "loss": 2.1182, + "step": 4250 + }, + { + "epoch": 0.2280579399141631, + "grad_norm": 0.396484375, + "learning_rate": 4.996805021602868e-06, + "loss": 2.4954, + "step": 4251 + }, + { + "epoch": 0.22811158798283263, + "grad_norm": 0.34375, + "learning_rate": 4.996800629308081e-06, + "loss": 2.3289, + "step": 4252 + }, + { + "epoch": 0.22816523605150216, + "grad_norm": 0.484375, + "learning_rate": 4.9967962339981465e-06, + "loss": 2.0933, + "step": 4253 + }, + { + "epoch": 0.22821888412017166, + "grad_norm": 0.423828125, + "learning_rate": 4.996791835673073e-06, + "loss": 2.5717, + "step": 4254 + }, + { + "epoch": 0.2282725321888412, + "grad_norm": 0.361328125, + "learning_rate": 4.996787434332865e-06, + "loss": 2.3071, + "step": 4255 + }, + { + "epoch": 0.22832618025751072, + "grad_norm": 0.41015625, + "learning_rate": 4.9967830299775255e-06, + "loss": 2.1496, + "step": 4256 + }, + { + "epoch": 0.22837982832618026, + "grad_norm": 0.421875, + "learning_rate": 4.9967786226070625e-06, + "loss": 2.4181, + "step": 4257 + }, + { + "epoch": 0.2284334763948498, + "grad_norm": 0.6015625, + "learning_rate": 4.99677421222148e-06, + "loss": 2.0865, + "step": 4258 + }, + { + "epoch": 0.22848712446351932, + "grad_norm": 0.357421875, + "learning_rate": 4.996769798820783e-06, + "loss": 2.1846, + "step": 4259 + }, + { + "epoch": 0.22854077253218885, + "grad_norm": 0.4609375, + "learning_rate": 4.996765382404978e-06, + "loss": 2.5013, + "step": 4260 + }, + { + "epoch": 0.22859442060085836, + "grad_norm": 0.361328125, + "learning_rate": 4.99676096297407e-06, + "loss": 2.5358, + "step": 4261 + }, + { + "epoch": 0.2286480686695279, + "grad_norm": 0.36328125, + "learning_rate": 4.996756540528064e-06, + "loss": 2.4987, + "step": 4262 + }, + { + "epoch": 0.22870171673819742, + "grad_norm": 0.38671875, + "learning_rate": 4.996752115066965e-06, + "loss": 2.3572, + "step": 4263 + }, + { + "epoch": 0.22875536480686695, + "grad_norm": 0.37890625, + "learning_rate": 4.9967476865907796e-06, + "loss": 2.1733, + "step": 4264 + }, + { + "epoch": 0.22880901287553648, + "grad_norm": 0.38671875, + "learning_rate": 4.996743255099511e-06, + "loss": 2.1379, + "step": 4265 + }, + { + "epoch": 0.22886266094420601, + "grad_norm": 0.359375, + "learning_rate": 4.996738820593167e-06, + "loss": 2.376, + "step": 4266 + }, + { + "epoch": 0.22891630901287555, + "grad_norm": 0.8125, + "learning_rate": 4.996734383071751e-06, + "loss": 2.3207, + "step": 4267 + }, + { + "epoch": 0.22896995708154508, + "grad_norm": 9.6875, + "learning_rate": 4.99672994253527e-06, + "loss": 2.5585, + "step": 4268 + }, + { + "epoch": 0.22902360515021458, + "grad_norm": 0.392578125, + "learning_rate": 4.996725498983729e-06, + "loss": 2.2412, + "step": 4269 + }, + { + "epoch": 0.2290772532188841, + "grad_norm": 0.322265625, + "learning_rate": 4.996721052417132e-06, + "loss": 2.3119, + "step": 4270 + }, + { + "epoch": 0.22913090128755365, + "grad_norm": 0.443359375, + "learning_rate": 4.996716602835485e-06, + "loss": 2.6014, + "step": 4271 + }, + { + "epoch": 0.22918454935622318, + "grad_norm": 0.3828125, + "learning_rate": 4.996712150238794e-06, + "loss": 2.4704, + "step": 4272 + }, + { + "epoch": 0.2292381974248927, + "grad_norm": 0.3984375, + "learning_rate": 4.9967076946270645e-06, + "loss": 2.4205, + "step": 4273 + }, + { + "epoch": 0.22929184549356224, + "grad_norm": 0.58984375, + "learning_rate": 4.996703236000301e-06, + "loss": 1.6981, + "step": 4274 + }, + { + "epoch": 0.22934549356223177, + "grad_norm": 0.353515625, + "learning_rate": 4.99669877435851e-06, + "loss": 2.2291, + "step": 4275 + }, + { + "epoch": 0.22939914163090128, + "grad_norm": 0.345703125, + "learning_rate": 4.996694309701695e-06, + "loss": 2.2809, + "step": 4276 + }, + { + "epoch": 0.2294527896995708, + "grad_norm": 0.6171875, + "learning_rate": 4.996689842029864e-06, + "loss": 2.3663, + "step": 4277 + }, + { + "epoch": 0.22950643776824034, + "grad_norm": 0.275390625, + "learning_rate": 4.99668537134302e-06, + "loss": 1.8146, + "step": 4278 + }, + { + "epoch": 0.22956008583690987, + "grad_norm": 0.353515625, + "learning_rate": 4.99668089764117e-06, + "loss": 2.4173, + "step": 4279 + }, + { + "epoch": 0.2296137339055794, + "grad_norm": 0.3359375, + "learning_rate": 4.996676420924318e-06, + "loss": 2.1545, + "step": 4280 + }, + { + "epoch": 0.22966738197424894, + "grad_norm": 0.380859375, + "learning_rate": 4.996671941192471e-06, + "loss": 2.2793, + "step": 4281 + }, + { + "epoch": 0.22972103004291847, + "grad_norm": 0.421875, + "learning_rate": 4.996667458445633e-06, + "loss": 2.4141, + "step": 4282 + }, + { + "epoch": 0.22977467811158797, + "grad_norm": 0.54296875, + "learning_rate": 4.9966629726838104e-06, + "loss": 2.2975, + "step": 4283 + }, + { + "epoch": 0.2298283261802575, + "grad_norm": 0.294921875, + "learning_rate": 4.996658483907008e-06, + "loss": 2.0498, + "step": 4284 + }, + { + "epoch": 0.22988197424892703, + "grad_norm": 0.359375, + "learning_rate": 4.996653992115232e-06, + "loss": 2.2445, + "step": 4285 + }, + { + "epoch": 0.22993562231759657, + "grad_norm": 0.392578125, + "learning_rate": 4.996649497308487e-06, + "loss": 2.3789, + "step": 4286 + }, + { + "epoch": 0.2299892703862661, + "grad_norm": 0.33203125, + "learning_rate": 4.996644999486778e-06, + "loss": 2.262, + "step": 4287 + }, + { + "epoch": 0.23004291845493563, + "grad_norm": 0.353515625, + "learning_rate": 4.996640498650111e-06, + "loss": 2.3432, + "step": 4288 + }, + { + "epoch": 0.23009656652360516, + "grad_norm": 0.40234375, + "learning_rate": 4.996635994798493e-06, + "loss": 2.2205, + "step": 4289 + }, + { + "epoch": 0.23015021459227467, + "grad_norm": 0.390625, + "learning_rate": 4.996631487931928e-06, + "loss": 1.9318, + "step": 4290 + }, + { + "epoch": 0.2302038626609442, + "grad_norm": 0.357421875, + "learning_rate": 4.9966269780504205e-06, + "loss": 2.3816, + "step": 4291 + }, + { + "epoch": 0.23025751072961373, + "grad_norm": 0.34765625, + "learning_rate": 4.996622465153977e-06, + "loss": 2.3693, + "step": 4292 + }, + { + "epoch": 0.23031115879828326, + "grad_norm": 0.6328125, + "learning_rate": 4.996617949242603e-06, + "loss": 2.3436, + "step": 4293 + }, + { + "epoch": 0.2303648068669528, + "grad_norm": 0.369140625, + "learning_rate": 4.996613430316304e-06, + "loss": 2.4592, + "step": 4294 + }, + { + "epoch": 0.23041845493562232, + "grad_norm": 0.369140625, + "learning_rate": 4.996608908375085e-06, + "loss": 2.6785, + "step": 4295 + }, + { + "epoch": 0.23047210300429186, + "grad_norm": 0.53515625, + "learning_rate": 4.996604383418952e-06, + "loss": 2.2458, + "step": 4296 + }, + { + "epoch": 0.23052575107296136, + "grad_norm": 0.34375, + "learning_rate": 4.99659985544791e-06, + "loss": 2.315, + "step": 4297 + }, + { + "epoch": 0.2305793991416309, + "grad_norm": 0.3515625, + "learning_rate": 4.996595324461965e-06, + "loss": 2.408, + "step": 4298 + }, + { + "epoch": 0.23063304721030042, + "grad_norm": 0.3359375, + "learning_rate": 4.996590790461121e-06, + "loss": 2.4442, + "step": 4299 + }, + { + "epoch": 0.23068669527896996, + "grad_norm": 0.345703125, + "learning_rate": 4.996586253445386e-06, + "loss": 2.3645, + "step": 4300 + }, + { + "epoch": 0.2307403433476395, + "grad_norm": 0.396484375, + "learning_rate": 4.996581713414763e-06, + "loss": 2.0742, + "step": 4301 + }, + { + "epoch": 0.23079399141630902, + "grad_norm": 0.333984375, + "learning_rate": 4.9965771703692595e-06, + "loss": 2.3076, + "step": 4302 + }, + { + "epoch": 0.23084763948497855, + "grad_norm": 0.359375, + "learning_rate": 4.996572624308879e-06, + "loss": 2.3194, + "step": 4303 + }, + { + "epoch": 0.23090128755364808, + "grad_norm": 0.388671875, + "learning_rate": 4.9965680752336295e-06, + "loss": 2.4004, + "step": 4304 + }, + { + "epoch": 0.2309549356223176, + "grad_norm": 0.3515625, + "learning_rate": 4.996563523143514e-06, + "loss": 2.1587, + "step": 4305 + }, + { + "epoch": 0.23100858369098712, + "grad_norm": 0.3125, + "learning_rate": 4.99655896803854e-06, + "loss": 2.2278, + "step": 4306 + }, + { + "epoch": 0.23106223175965665, + "grad_norm": 0.30078125, + "learning_rate": 4.99655440991871e-06, + "loss": 2.0133, + "step": 4307 + }, + { + "epoch": 0.23111587982832618, + "grad_norm": 0.609375, + "learning_rate": 4.996549848784034e-06, + "loss": 1.75, + "step": 4308 + }, + { + "epoch": 0.2311695278969957, + "grad_norm": 0.349609375, + "learning_rate": 4.996545284634513e-06, + "loss": 2.202, + "step": 4309 + }, + { + "epoch": 0.23122317596566525, + "grad_norm": 0.37109375, + "learning_rate": 4.9965407174701555e-06, + "loss": 1.9887, + "step": 4310 + }, + { + "epoch": 0.23127682403433478, + "grad_norm": 0.380859375, + "learning_rate": 4.9965361472909654e-06, + "loss": 2.2903, + "step": 4311 + }, + { + "epoch": 0.23133047210300428, + "grad_norm": 0.357421875, + "learning_rate": 4.99653157409695e-06, + "loss": 2.4258, + "step": 4312 + }, + { + "epoch": 0.2313841201716738, + "grad_norm": 0.361328125, + "learning_rate": 4.996526997888114e-06, + "loss": 2.1743, + "step": 4313 + }, + { + "epoch": 0.23143776824034334, + "grad_norm": 0.33984375, + "learning_rate": 4.996522418664461e-06, + "loss": 2.1457, + "step": 4314 + }, + { + "epoch": 0.23149141630901288, + "grad_norm": 0.37890625, + "learning_rate": 4.996517836425999e-06, + "loss": 2.3442, + "step": 4315 + }, + { + "epoch": 0.2315450643776824, + "grad_norm": 0.4296875, + "learning_rate": 4.996513251172733e-06, + "loss": 2.2321, + "step": 4316 + }, + { + "epoch": 0.23159871244635194, + "grad_norm": 0.419921875, + "learning_rate": 4.996508662904667e-06, + "loss": 2.241, + "step": 4317 + }, + { + "epoch": 0.23165236051502147, + "grad_norm": 0.796875, + "learning_rate": 4.996504071621809e-06, + "loss": 2.4286, + "step": 4318 + }, + { + "epoch": 0.23170600858369098, + "grad_norm": 0.5234375, + "learning_rate": 4.996499477324164e-06, + "loss": 2.53, + "step": 4319 + }, + { + "epoch": 0.2317596566523605, + "grad_norm": 0.330078125, + "learning_rate": 4.996494880011735e-06, + "loss": 2.1664, + "step": 4320 + }, + { + "epoch": 0.23181330472103004, + "grad_norm": 0.345703125, + "learning_rate": 4.996490279684531e-06, + "loss": 2.332, + "step": 4321 + }, + { + "epoch": 0.23186695278969957, + "grad_norm": 0.427734375, + "learning_rate": 4.996485676342555e-06, + "loss": 2.4197, + "step": 4322 + }, + { + "epoch": 0.2319206008583691, + "grad_norm": 0.310546875, + "learning_rate": 4.996481069985814e-06, + "loss": 2.3553, + "step": 4323 + }, + { + "epoch": 0.23197424892703863, + "grad_norm": 0.359375, + "learning_rate": 4.996476460614313e-06, + "loss": 2.4276, + "step": 4324 + }, + { + "epoch": 0.23202789699570817, + "grad_norm": 0.55078125, + "learning_rate": 4.9964718482280576e-06, + "loss": 2.3264, + "step": 4325 + }, + { + "epoch": 0.23208154506437767, + "grad_norm": 0.52734375, + "learning_rate": 4.996467232827053e-06, + "loss": 2.1602, + "step": 4326 + }, + { + "epoch": 0.2321351931330472, + "grad_norm": 0.6484375, + "learning_rate": 4.996462614411306e-06, + "loss": 2.6062, + "step": 4327 + }, + { + "epoch": 0.23218884120171673, + "grad_norm": 0.310546875, + "learning_rate": 4.996457992980821e-06, + "loss": 2.1464, + "step": 4328 + }, + { + "epoch": 0.23224248927038627, + "grad_norm": 0.37109375, + "learning_rate": 4.996453368535605e-06, + "loss": 2.3305, + "step": 4329 + }, + { + "epoch": 0.2322961373390558, + "grad_norm": 0.396484375, + "learning_rate": 4.996448741075661e-06, + "loss": 2.1118, + "step": 4330 + }, + { + "epoch": 0.23234978540772533, + "grad_norm": 0.3359375, + "learning_rate": 4.996444110600997e-06, + "loss": 2.2976, + "step": 4331 + }, + { + "epoch": 0.23240343347639486, + "grad_norm": 0.5234375, + "learning_rate": 4.996439477111618e-06, + "loss": 2.485, + "step": 4332 + }, + { + "epoch": 0.23245708154506436, + "grad_norm": 0.33984375, + "learning_rate": 4.996434840607529e-06, + "loss": 2.5021, + "step": 4333 + }, + { + "epoch": 0.2325107296137339, + "grad_norm": 0.333984375, + "learning_rate": 4.996430201088736e-06, + "loss": 2.069, + "step": 4334 + }, + { + "epoch": 0.23256437768240343, + "grad_norm": 0.34765625, + "learning_rate": 4.9964255585552435e-06, + "loss": 2.3059, + "step": 4335 + }, + { + "epoch": 0.23261802575107296, + "grad_norm": 0.36328125, + "learning_rate": 4.996420913007059e-06, + "loss": 2.466, + "step": 4336 + }, + { + "epoch": 0.2326716738197425, + "grad_norm": 0.388671875, + "learning_rate": 4.996416264444187e-06, + "loss": 2.2537, + "step": 4337 + }, + { + "epoch": 0.23272532188841202, + "grad_norm": 0.384765625, + "learning_rate": 4.996411612866634e-06, + "loss": 2.4731, + "step": 4338 + }, + { + "epoch": 0.23277896995708156, + "grad_norm": 0.373046875, + "learning_rate": 4.996406958274405e-06, + "loss": 2.2731, + "step": 4339 + }, + { + "epoch": 0.23283261802575106, + "grad_norm": 0.419921875, + "learning_rate": 4.996402300667505e-06, + "loss": 2.1868, + "step": 4340 + }, + { + "epoch": 0.2328862660944206, + "grad_norm": 0.384765625, + "learning_rate": 4.9963976400459406e-06, + "loss": 2.496, + "step": 4341 + }, + { + "epoch": 0.23293991416309012, + "grad_norm": 0.3828125, + "learning_rate": 4.996392976409717e-06, + "loss": 2.2628, + "step": 4342 + }, + { + "epoch": 0.23299356223175965, + "grad_norm": 0.361328125, + "learning_rate": 4.996388309758839e-06, + "loss": 2.4299, + "step": 4343 + }, + { + "epoch": 0.2330472103004292, + "grad_norm": 0.390625, + "learning_rate": 4.996383640093314e-06, + "loss": 2.0676, + "step": 4344 + }, + { + "epoch": 0.23310085836909872, + "grad_norm": 0.396484375, + "learning_rate": 4.996378967413146e-06, + "loss": 1.9596, + "step": 4345 + }, + { + "epoch": 0.23315450643776825, + "grad_norm": 0.3671875, + "learning_rate": 4.9963742917183425e-06, + "loss": 2.1436, + "step": 4346 + }, + { + "epoch": 0.23320815450643778, + "grad_norm": 0.384765625, + "learning_rate": 4.996369613008907e-06, + "loss": 2.3112, + "step": 4347 + }, + { + "epoch": 0.23326180257510729, + "grad_norm": 0.4140625, + "learning_rate": 4.996364931284847e-06, + "loss": 2.1911, + "step": 4348 + }, + { + "epoch": 0.23331545064377682, + "grad_norm": 0.326171875, + "learning_rate": 4.996360246546167e-06, + "loss": 2.3145, + "step": 4349 + }, + { + "epoch": 0.23336909871244635, + "grad_norm": 0.40234375, + "learning_rate": 4.996355558792874e-06, + "loss": 2.0818, + "step": 4350 + }, + { + "epoch": 0.23342274678111588, + "grad_norm": 0.77734375, + "learning_rate": 4.996350868024971e-06, + "loss": 2.3827, + "step": 4351 + }, + { + "epoch": 0.2334763948497854, + "grad_norm": 0.455078125, + "learning_rate": 4.9963461742424665e-06, + "loss": 2.5579, + "step": 4352 + }, + { + "epoch": 0.23353004291845494, + "grad_norm": 0.427734375, + "learning_rate": 4.996341477445364e-06, + "loss": 2.2483, + "step": 4353 + }, + { + "epoch": 0.23358369098712448, + "grad_norm": 0.341796875, + "learning_rate": 4.996336777633671e-06, + "loss": 2.2574, + "step": 4354 + }, + { + "epoch": 0.23363733905579398, + "grad_norm": 0.4296875, + "learning_rate": 4.996332074807393e-06, + "loss": 2.3737, + "step": 4355 + }, + { + "epoch": 0.2336909871244635, + "grad_norm": 0.40625, + "learning_rate": 4.996327368966533e-06, + "loss": 2.304, + "step": 4356 + }, + { + "epoch": 0.23374463519313304, + "grad_norm": 0.322265625, + "learning_rate": 4.996322660111101e-06, + "loss": 2.1667, + "step": 4357 + }, + { + "epoch": 0.23379828326180258, + "grad_norm": 0.388671875, + "learning_rate": 4.996317948241099e-06, + "loss": 2.4131, + "step": 4358 + }, + { + "epoch": 0.2338519313304721, + "grad_norm": 0.37109375, + "learning_rate": 4.996313233356535e-06, + "loss": 2.2624, + "step": 4359 + }, + { + "epoch": 0.23390557939914164, + "grad_norm": 0.38671875, + "learning_rate": 4.996308515457413e-06, + "loss": 2.5098, + "step": 4360 + }, + { + "epoch": 0.23395922746781117, + "grad_norm": 0.6875, + "learning_rate": 4.996303794543739e-06, + "loss": 2.2161, + "step": 4361 + }, + { + "epoch": 0.23401287553648067, + "grad_norm": 0.3828125, + "learning_rate": 4.99629907061552e-06, + "loss": 2.3879, + "step": 4362 + }, + { + "epoch": 0.2340665236051502, + "grad_norm": 0.41015625, + "learning_rate": 4.99629434367276e-06, + "loss": 2.4089, + "step": 4363 + }, + { + "epoch": 0.23412017167381974, + "grad_norm": 0.375, + "learning_rate": 4.996289613715467e-06, + "loss": 2.3358, + "step": 4364 + }, + { + "epoch": 0.23417381974248927, + "grad_norm": 0.30078125, + "learning_rate": 4.996284880743645e-06, + "loss": 2.3747, + "step": 4365 + }, + { + "epoch": 0.2342274678111588, + "grad_norm": 0.3671875, + "learning_rate": 4.9962801447573e-06, + "loss": 2.1104, + "step": 4366 + }, + { + "epoch": 0.23428111587982833, + "grad_norm": 0.63671875, + "learning_rate": 4.996275405756436e-06, + "loss": 2.5994, + "step": 4367 + }, + { + "epoch": 0.23433476394849787, + "grad_norm": 0.37109375, + "learning_rate": 4.996270663741063e-06, + "loss": 2.2762, + "step": 4368 + }, + { + "epoch": 0.23438841201716737, + "grad_norm": 0.466796875, + "learning_rate": 4.996265918711183e-06, + "loss": 2.5074, + "step": 4369 + }, + { + "epoch": 0.2344420600858369, + "grad_norm": 0.337890625, + "learning_rate": 4.996261170666803e-06, + "loss": 2.1983, + "step": 4370 + }, + { + "epoch": 0.23449570815450643, + "grad_norm": 0.328125, + "learning_rate": 4.996256419607929e-06, + "loss": 2.2616, + "step": 4371 + }, + { + "epoch": 0.23454935622317596, + "grad_norm": 0.33203125, + "learning_rate": 4.996251665534565e-06, + "loss": 2.3466, + "step": 4372 + }, + { + "epoch": 0.2346030042918455, + "grad_norm": 0.4140625, + "learning_rate": 4.996246908446719e-06, + "loss": 1.7617, + "step": 4373 + }, + { + "epoch": 0.23465665236051503, + "grad_norm": 0.3359375, + "learning_rate": 4.9962421483443965e-06, + "loss": 2.3169, + "step": 4374 + }, + { + "epoch": 0.23471030042918456, + "grad_norm": 4.65625, + "learning_rate": 4.996237385227602e-06, + "loss": 1.3484, + "step": 4375 + }, + { + "epoch": 0.23476394849785406, + "grad_norm": 0.34765625, + "learning_rate": 4.996232619096342e-06, + "loss": 2.2149, + "step": 4376 + }, + { + "epoch": 0.2348175965665236, + "grad_norm": 0.484375, + "learning_rate": 4.996227849950622e-06, + "loss": 2.3391, + "step": 4377 + }, + { + "epoch": 0.23487124463519313, + "grad_norm": 0.345703125, + "learning_rate": 4.996223077790449e-06, + "loss": 2.476, + "step": 4378 + }, + { + "epoch": 0.23492489270386266, + "grad_norm": 0.421875, + "learning_rate": 4.996218302615826e-06, + "loss": 2.1686, + "step": 4379 + }, + { + "epoch": 0.2349785407725322, + "grad_norm": 0.33203125, + "learning_rate": 4.9962135244267615e-06, + "loss": 2.2898, + "step": 4380 + }, + { + "epoch": 0.23503218884120172, + "grad_norm": 0.369140625, + "learning_rate": 4.99620874322326e-06, + "loss": 2.3067, + "step": 4381 + }, + { + "epoch": 0.23508583690987125, + "grad_norm": 0.90234375, + "learning_rate": 4.996203959005327e-06, + "loss": 2.3529, + "step": 4382 + }, + { + "epoch": 0.23513948497854079, + "grad_norm": 0.53125, + "learning_rate": 4.996199171772969e-06, + "loss": 2.1043, + "step": 4383 + }, + { + "epoch": 0.2351931330472103, + "grad_norm": 0.56640625, + "learning_rate": 4.996194381526192e-06, + "loss": 2.2897, + "step": 4384 + }, + { + "epoch": 0.23524678111587982, + "grad_norm": 0.37109375, + "learning_rate": 4.996189588265001e-06, + "loss": 2.5071, + "step": 4385 + }, + { + "epoch": 0.23530042918454935, + "grad_norm": 0.421875, + "learning_rate": 4.9961847919894015e-06, + "loss": 2.6161, + "step": 4386 + }, + { + "epoch": 0.23535407725321889, + "grad_norm": 0.40625, + "learning_rate": 4.9961799926994e-06, + "loss": 2.6021, + "step": 4387 + }, + { + "epoch": 0.23540772532188842, + "grad_norm": 0.3828125, + "learning_rate": 4.9961751903950025e-06, + "loss": 2.3575, + "step": 4388 + }, + { + "epoch": 0.23546137339055795, + "grad_norm": 0.55078125, + "learning_rate": 4.996170385076215e-06, + "loss": 2.4488, + "step": 4389 + }, + { + "epoch": 0.23551502145922748, + "grad_norm": 0.341796875, + "learning_rate": 4.996165576743043e-06, + "loss": 2.2093, + "step": 4390 + }, + { + "epoch": 0.23556866952789698, + "grad_norm": 0.421875, + "learning_rate": 4.996160765395491e-06, + "loss": 2.4794, + "step": 4391 + }, + { + "epoch": 0.23562231759656652, + "grad_norm": 2.125, + "learning_rate": 4.996155951033567e-06, + "loss": 2.4225, + "step": 4392 + }, + { + "epoch": 0.23567596566523605, + "grad_norm": 0.3671875, + "learning_rate": 4.996151133657274e-06, + "loss": 2.4799, + "step": 4393 + }, + { + "epoch": 0.23572961373390558, + "grad_norm": 0.345703125, + "learning_rate": 4.996146313266621e-06, + "loss": 2.6127, + "step": 4394 + }, + { + "epoch": 0.2357832618025751, + "grad_norm": 0.359375, + "learning_rate": 4.9961414898616125e-06, + "loss": 2.2001, + "step": 4395 + }, + { + "epoch": 0.23583690987124464, + "grad_norm": 0.38671875, + "learning_rate": 4.996136663442253e-06, + "loss": 2.5978, + "step": 4396 + }, + { + "epoch": 0.23589055793991417, + "grad_norm": 0.380859375, + "learning_rate": 4.99613183400855e-06, + "loss": 2.2504, + "step": 4397 + }, + { + "epoch": 0.23594420600858368, + "grad_norm": 0.408203125, + "learning_rate": 4.996127001560509e-06, + "loss": 2.2614, + "step": 4398 + }, + { + "epoch": 0.2359978540772532, + "grad_norm": 0.3671875, + "learning_rate": 4.996122166098136e-06, + "loss": 2.3109, + "step": 4399 + }, + { + "epoch": 0.23605150214592274, + "grad_norm": 0.478515625, + "learning_rate": 4.996117327621436e-06, + "loss": 2.381, + "step": 4400 + }, + { + "epoch": 0.23610515021459227, + "grad_norm": 0.3203125, + "learning_rate": 4.996112486130415e-06, + "loss": 2.1477, + "step": 4401 + }, + { + "epoch": 0.2361587982832618, + "grad_norm": 0.35546875, + "learning_rate": 4.99610764162508e-06, + "loss": 2.2478, + "step": 4402 + }, + { + "epoch": 0.23621244635193134, + "grad_norm": 0.390625, + "learning_rate": 4.996102794105436e-06, + "loss": 2.2177, + "step": 4403 + }, + { + "epoch": 0.23626609442060087, + "grad_norm": 0.349609375, + "learning_rate": 4.996097943571489e-06, + "loss": 2.4646, + "step": 4404 + }, + { + "epoch": 0.23631974248927037, + "grad_norm": 0.359375, + "learning_rate": 4.996093090023245e-06, + "loss": 1.8481, + "step": 4405 + }, + { + "epoch": 0.2363733905579399, + "grad_norm": 0.337890625, + "learning_rate": 4.996088233460708e-06, + "loss": 2.2515, + "step": 4406 + }, + { + "epoch": 0.23642703862660944, + "grad_norm": 0.357421875, + "learning_rate": 4.996083373883887e-06, + "loss": 2.4977, + "step": 4407 + }, + { + "epoch": 0.23648068669527897, + "grad_norm": 0.47265625, + "learning_rate": 4.996078511292786e-06, + "loss": 2.2966, + "step": 4408 + }, + { + "epoch": 0.2365343347639485, + "grad_norm": 0.359375, + "learning_rate": 4.996073645687412e-06, + "loss": 2.2711, + "step": 4409 + }, + { + "epoch": 0.23658798283261803, + "grad_norm": 0.40625, + "learning_rate": 4.996068777067769e-06, + "loss": 2.2396, + "step": 4410 + }, + { + "epoch": 0.23664163090128756, + "grad_norm": 0.58984375, + "learning_rate": 4.996063905433865e-06, + "loss": 2.3766, + "step": 4411 + }, + { + "epoch": 0.23669527896995707, + "grad_norm": 0.349609375, + "learning_rate": 4.996059030785703e-06, + "loss": 2.4784, + "step": 4412 + }, + { + "epoch": 0.2367489270386266, + "grad_norm": 0.322265625, + "learning_rate": 4.996054153123292e-06, + "loss": 2.299, + "step": 4413 + }, + { + "epoch": 0.23680257510729613, + "grad_norm": 0.353515625, + "learning_rate": 4.996049272446637e-06, + "loss": 1.838, + "step": 4414 + }, + { + "epoch": 0.23685622317596566, + "grad_norm": 0.38671875, + "learning_rate": 4.996044388755743e-06, + "loss": 2.1311, + "step": 4415 + }, + { + "epoch": 0.2369098712446352, + "grad_norm": 0.3515625, + "learning_rate": 4.996039502050617e-06, + "loss": 2.3122, + "step": 4416 + }, + { + "epoch": 0.23696351931330473, + "grad_norm": 0.298828125, + "learning_rate": 4.9960346123312635e-06, + "loss": 1.9074, + "step": 4417 + }, + { + "epoch": 0.23701716738197426, + "grad_norm": 0.3828125, + "learning_rate": 4.99602971959769e-06, + "loss": 2.3038, + "step": 4418 + }, + { + "epoch": 0.2370708154506438, + "grad_norm": 0.3125, + "learning_rate": 4.996024823849901e-06, + "loss": 2.3365, + "step": 4419 + }, + { + "epoch": 0.2371244635193133, + "grad_norm": 1.4296875, + "learning_rate": 4.996019925087904e-06, + "loss": 1.3779, + "step": 4420 + }, + { + "epoch": 0.23717811158798283, + "grad_norm": 0.380859375, + "learning_rate": 4.996015023311703e-06, + "loss": 2.2404, + "step": 4421 + }, + { + "epoch": 0.23723175965665236, + "grad_norm": 0.318359375, + "learning_rate": 4.996010118521305e-06, + "loss": 2.4014, + "step": 4422 + }, + { + "epoch": 0.2372854077253219, + "grad_norm": 0.359375, + "learning_rate": 4.996005210716717e-06, + "loss": 2.307, + "step": 4423 + }, + { + "epoch": 0.23733905579399142, + "grad_norm": 0.421875, + "learning_rate": 4.996000299897943e-06, + "loss": 2.2976, + "step": 4424 + }, + { + "epoch": 0.23739270386266095, + "grad_norm": 0.427734375, + "learning_rate": 4.995995386064989e-06, + "loss": 2.1347, + "step": 4425 + }, + { + "epoch": 0.23744635193133048, + "grad_norm": 0.3125, + "learning_rate": 4.995990469217862e-06, + "loss": 2.403, + "step": 4426 + }, + { + "epoch": 0.2375, + "grad_norm": 0.38671875, + "learning_rate": 4.995985549356568e-06, + "loss": 2.3395, + "step": 4427 + }, + { + "epoch": 0.23755364806866952, + "grad_norm": 0.37109375, + "learning_rate": 4.995980626481112e-06, + "loss": 2.2455, + "step": 4428 + }, + { + "epoch": 0.23760729613733905, + "grad_norm": 0.34765625, + "learning_rate": 4.995975700591501e-06, + "loss": 2.2237, + "step": 4429 + }, + { + "epoch": 0.23766094420600858, + "grad_norm": 0.375, + "learning_rate": 4.99597077168774e-06, + "loss": 2.3662, + "step": 4430 + }, + { + "epoch": 0.23771459227467812, + "grad_norm": 0.52734375, + "learning_rate": 4.995965839769836e-06, + "loss": 1.7465, + "step": 4431 + }, + { + "epoch": 0.23776824034334765, + "grad_norm": 0.7265625, + "learning_rate": 4.995960904837792e-06, + "loss": 2.4668, + "step": 4432 + }, + { + "epoch": 0.23782188841201718, + "grad_norm": 0.47265625, + "learning_rate": 4.995955966891619e-06, + "loss": 2.5541, + "step": 4433 + }, + { + "epoch": 0.23787553648068668, + "grad_norm": 0.37109375, + "learning_rate": 4.995951025931319e-06, + "loss": 2.239, + "step": 4434 + }, + { + "epoch": 0.23792918454935622, + "grad_norm": 0.353515625, + "learning_rate": 4.9959460819568995e-06, + "loss": 2.0969, + "step": 4435 + }, + { + "epoch": 0.23798283261802575, + "grad_norm": 0.330078125, + "learning_rate": 4.995941134968366e-06, + "loss": 2.0611, + "step": 4436 + }, + { + "epoch": 0.23803648068669528, + "grad_norm": 0.357421875, + "learning_rate": 4.995936184965724e-06, + "loss": 2.0201, + "step": 4437 + }, + { + "epoch": 0.2380901287553648, + "grad_norm": 0.42578125, + "learning_rate": 4.995931231948982e-06, + "loss": 2.3877, + "step": 4438 + }, + { + "epoch": 0.23814377682403434, + "grad_norm": 0.34765625, + "learning_rate": 4.995926275918143e-06, + "loss": 2.5047, + "step": 4439 + }, + { + "epoch": 0.23819742489270387, + "grad_norm": 0.35546875, + "learning_rate": 4.995921316873214e-06, + "loss": 2.1056, + "step": 4440 + }, + { + "epoch": 0.23825107296137338, + "grad_norm": 0.341796875, + "learning_rate": 4.9959163548142e-06, + "loss": 2.3089, + "step": 4441 + }, + { + "epoch": 0.2383047210300429, + "grad_norm": 0.345703125, + "learning_rate": 4.995911389741109e-06, + "loss": 2.3981, + "step": 4442 + }, + { + "epoch": 0.23835836909871244, + "grad_norm": 0.408203125, + "learning_rate": 4.995906421653947e-06, + "loss": 2.5148, + "step": 4443 + }, + { + "epoch": 0.23841201716738197, + "grad_norm": 0.3515625, + "learning_rate": 4.995901450552718e-06, + "loss": 2.0109, + "step": 4444 + }, + { + "epoch": 0.2384656652360515, + "grad_norm": 0.396484375, + "learning_rate": 4.995896476437428e-06, + "loss": 2.234, + "step": 4445 + }, + { + "epoch": 0.23851931330472104, + "grad_norm": 0.3203125, + "learning_rate": 4.995891499308086e-06, + "loss": 2.1744, + "step": 4446 + }, + { + "epoch": 0.23857296137339057, + "grad_norm": 0.3515625, + "learning_rate": 4.995886519164695e-06, + "loss": 2.3024, + "step": 4447 + }, + { + "epoch": 0.23862660944206007, + "grad_norm": 0.341796875, + "learning_rate": 4.995881536007262e-06, + "loss": 2.2192, + "step": 4448 + }, + { + "epoch": 0.2386802575107296, + "grad_norm": 0.4296875, + "learning_rate": 4.995876549835794e-06, + "loss": 1.5131, + "step": 4449 + }, + { + "epoch": 0.23873390557939914, + "grad_norm": 0.453125, + "learning_rate": 4.995871560650296e-06, + "loss": 2.3098, + "step": 4450 + }, + { + "epoch": 0.23878755364806867, + "grad_norm": 0.98046875, + "learning_rate": 4.995866568450774e-06, + "loss": 2.2933, + "step": 4451 + }, + { + "epoch": 0.2388412017167382, + "grad_norm": 0.357421875, + "learning_rate": 4.995861573237234e-06, + "loss": 2.2712, + "step": 4452 + }, + { + "epoch": 0.23889484978540773, + "grad_norm": 0.419921875, + "learning_rate": 4.995856575009682e-06, + "loss": 2.4613, + "step": 4453 + }, + { + "epoch": 0.23894849785407726, + "grad_norm": 0.330078125, + "learning_rate": 4.9958515737681245e-06, + "loss": 2.1801, + "step": 4454 + }, + { + "epoch": 0.2390021459227468, + "grad_norm": 0.345703125, + "learning_rate": 4.995846569512567e-06, + "loss": 2.3378, + "step": 4455 + }, + { + "epoch": 0.2390557939914163, + "grad_norm": 0.34765625, + "learning_rate": 4.995841562243017e-06, + "loss": 2.1578, + "step": 4456 + }, + { + "epoch": 0.23910944206008583, + "grad_norm": 0.62890625, + "learning_rate": 4.995836551959477e-06, + "loss": 2.1712, + "step": 4457 + }, + { + "epoch": 0.23916309012875536, + "grad_norm": 0.38671875, + "learning_rate": 4.9958315386619575e-06, + "loss": 1.8003, + "step": 4458 + }, + { + "epoch": 0.2392167381974249, + "grad_norm": 0.3984375, + "learning_rate": 4.995826522350462e-06, + "loss": 2.4277, + "step": 4459 + }, + { + "epoch": 0.23927038626609443, + "grad_norm": 0.318359375, + "learning_rate": 4.995821503024997e-06, + "loss": 2.2494, + "step": 4460 + }, + { + "epoch": 0.23932403433476396, + "grad_norm": 0.421875, + "learning_rate": 4.995816480685568e-06, + "loss": 2.1969, + "step": 4461 + }, + { + "epoch": 0.2393776824034335, + "grad_norm": 0.78515625, + "learning_rate": 4.995811455332182e-06, + "loss": 2.1063, + "step": 4462 + }, + { + "epoch": 0.239431330472103, + "grad_norm": 0.37109375, + "learning_rate": 4.995806426964846e-06, + "loss": 2.3503, + "step": 4463 + }, + { + "epoch": 0.23948497854077253, + "grad_norm": 0.33203125, + "learning_rate": 4.995801395583563e-06, + "loss": 1.9303, + "step": 4464 + }, + { + "epoch": 0.23953862660944206, + "grad_norm": 0.392578125, + "learning_rate": 4.9957963611883415e-06, + "loss": 2.2407, + "step": 4465 + }, + { + "epoch": 0.2395922746781116, + "grad_norm": 0.361328125, + "learning_rate": 4.995791323779187e-06, + "loss": 2.2884, + "step": 4466 + }, + { + "epoch": 0.23964592274678112, + "grad_norm": 0.349609375, + "learning_rate": 4.995786283356105e-06, + "loss": 2.3338, + "step": 4467 + }, + { + "epoch": 0.23969957081545065, + "grad_norm": 0.392578125, + "learning_rate": 4.995781239919103e-06, + "loss": 2.2185, + "step": 4468 + }, + { + "epoch": 0.23975321888412018, + "grad_norm": 0.33203125, + "learning_rate": 4.9957761934681865e-06, + "loss": 1.8877, + "step": 4469 + }, + { + "epoch": 0.2398068669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.995771144003361e-06, + "loss": 2.3482, + "step": 4470 + }, + { + "epoch": 0.23986051502145922, + "grad_norm": 0.357421875, + "learning_rate": 4.995766091524632e-06, + "loss": 2.1798, + "step": 4471 + }, + { + "epoch": 0.23991416309012875, + "grad_norm": 0.5078125, + "learning_rate": 4.995761036032007e-06, + "loss": 2.3819, + "step": 4472 + }, + { + "epoch": 0.23996781115879828, + "grad_norm": 0.470703125, + "learning_rate": 4.995755977525492e-06, + "loss": 2.2641, + "step": 4473 + }, + { + "epoch": 0.24002145922746781, + "grad_norm": 0.314453125, + "learning_rate": 4.995750916005092e-06, + "loss": 2.1664, + "step": 4474 + }, + { + "epoch": 0.24007510729613735, + "grad_norm": 0.306640625, + "learning_rate": 4.9957458514708155e-06, + "loss": 1.9657, + "step": 4475 + }, + { + "epoch": 0.24012875536480688, + "grad_norm": 0.328125, + "learning_rate": 4.995740783922666e-06, + "loss": 2.0926, + "step": 4476 + }, + { + "epoch": 0.24018240343347638, + "grad_norm": 0.546875, + "learning_rate": 4.99573571336065e-06, + "loss": 1.8623, + "step": 4477 + }, + { + "epoch": 0.24023605150214591, + "grad_norm": 0.388671875, + "learning_rate": 4.9957306397847745e-06, + "loss": 2.2556, + "step": 4478 + }, + { + "epoch": 0.24028969957081545, + "grad_norm": 0.44921875, + "learning_rate": 4.995725563195045e-06, + "loss": 2.2786, + "step": 4479 + }, + { + "epoch": 0.24034334763948498, + "grad_norm": 0.408203125, + "learning_rate": 4.995720483591469e-06, + "loss": 2.3825, + "step": 4480 + }, + { + "epoch": 0.2403969957081545, + "grad_norm": 0.380859375, + "learning_rate": 4.995715400974051e-06, + "loss": 2.3931, + "step": 4481 + }, + { + "epoch": 0.24045064377682404, + "grad_norm": 0.37890625, + "learning_rate": 4.995710315342797e-06, + "loss": 2.1958, + "step": 4482 + }, + { + "epoch": 0.24050429184549357, + "grad_norm": 0.353515625, + "learning_rate": 4.995705226697715e-06, + "loss": 2.4638, + "step": 4483 + }, + { + "epoch": 0.24055793991416308, + "grad_norm": 0.408203125, + "learning_rate": 4.99570013503881e-06, + "loss": 2.2074, + "step": 4484 + }, + { + "epoch": 0.2406115879828326, + "grad_norm": 0.37890625, + "learning_rate": 4.995695040366088e-06, + "loss": 2.4277, + "step": 4485 + }, + { + "epoch": 0.24066523605150214, + "grad_norm": 0.314453125, + "learning_rate": 4.995689942679555e-06, + "loss": 2.0864, + "step": 4486 + }, + { + "epoch": 0.24071888412017167, + "grad_norm": 0.36328125, + "learning_rate": 4.995684841979217e-06, + "loss": 2.4642, + "step": 4487 + }, + { + "epoch": 0.2407725321888412, + "grad_norm": 0.380859375, + "learning_rate": 4.995679738265081e-06, + "loss": 2.3718, + "step": 4488 + }, + { + "epoch": 0.24082618025751074, + "grad_norm": 0.8515625, + "learning_rate": 4.995674631537153e-06, + "loss": 2.5446, + "step": 4489 + }, + { + "epoch": 0.24087982832618027, + "grad_norm": 0.3515625, + "learning_rate": 4.995669521795439e-06, + "loss": 2.1314, + "step": 4490 + }, + { + "epoch": 0.2409334763948498, + "grad_norm": 0.375, + "learning_rate": 4.9956644090399455e-06, + "loss": 2.6526, + "step": 4491 + }, + { + "epoch": 0.2409871244635193, + "grad_norm": 0.384765625, + "learning_rate": 4.995659293270677e-06, + "loss": 2.4692, + "step": 4492 + }, + { + "epoch": 0.24104077253218884, + "grad_norm": 0.37109375, + "learning_rate": 4.995654174487642e-06, + "loss": 2.3226, + "step": 4493 + }, + { + "epoch": 0.24109442060085837, + "grad_norm": 0.341796875, + "learning_rate": 4.995649052690846e-06, + "loss": 2.4409, + "step": 4494 + }, + { + "epoch": 0.2411480686695279, + "grad_norm": 0.369140625, + "learning_rate": 4.995643927880294e-06, + "loss": 2.3634, + "step": 4495 + }, + { + "epoch": 0.24120171673819743, + "grad_norm": 0.359375, + "learning_rate": 4.995638800055993e-06, + "loss": 2.327, + "step": 4496 + }, + { + "epoch": 0.24125536480686696, + "grad_norm": 0.376953125, + "learning_rate": 4.99563366921795e-06, + "loss": 2.1031, + "step": 4497 + }, + { + "epoch": 0.2413090128755365, + "grad_norm": 0.3984375, + "learning_rate": 4.995628535366171e-06, + "loss": 2.2666, + "step": 4498 + }, + { + "epoch": 0.241362660944206, + "grad_norm": 0.490234375, + "learning_rate": 4.99562339850066e-06, + "loss": 2.3488, + "step": 4499 + }, + { + "epoch": 0.24141630901287553, + "grad_norm": 0.322265625, + "learning_rate": 4.995618258621425e-06, + "loss": 2.4031, + "step": 4500 + }, + { + "epoch": 0.24146995708154506, + "grad_norm": 0.373046875, + "learning_rate": 4.995613115728473e-06, + "loss": 2.2496, + "step": 4501 + }, + { + "epoch": 0.2415236051502146, + "grad_norm": 0.349609375, + "learning_rate": 4.995607969821809e-06, + "loss": 2.3826, + "step": 4502 + }, + { + "epoch": 0.24157725321888412, + "grad_norm": 0.296875, + "learning_rate": 4.995602820901439e-06, + "loss": 2.1686, + "step": 4503 + }, + { + "epoch": 0.24163090128755366, + "grad_norm": 0.59765625, + "learning_rate": 4.995597668967371e-06, + "loss": 2.6787, + "step": 4504 + }, + { + "epoch": 0.2416845493562232, + "grad_norm": 0.416015625, + "learning_rate": 4.995592514019609e-06, + "loss": 2.7441, + "step": 4505 + }, + { + "epoch": 0.2417381974248927, + "grad_norm": 0.4140625, + "learning_rate": 4.995587356058159e-06, + "loss": 2.2946, + "step": 4506 + }, + { + "epoch": 0.24179184549356222, + "grad_norm": 0.39453125, + "learning_rate": 4.9955821950830295e-06, + "loss": 2.3741, + "step": 4507 + }, + { + "epoch": 0.24184549356223176, + "grad_norm": 0.373046875, + "learning_rate": 4.995577031094225e-06, + "loss": 2.5099, + "step": 4508 + }, + { + "epoch": 0.2418991416309013, + "grad_norm": 0.408203125, + "learning_rate": 4.995571864091753e-06, + "loss": 2.2865, + "step": 4509 + }, + { + "epoch": 0.24195278969957082, + "grad_norm": 0.427734375, + "learning_rate": 4.995566694075619e-06, + "loss": 2.38, + "step": 4510 + }, + { + "epoch": 0.24200643776824035, + "grad_norm": 0.58984375, + "learning_rate": 4.9955615210458284e-06, + "loss": 2.4925, + "step": 4511 + }, + { + "epoch": 0.24206008583690988, + "grad_norm": 0.439453125, + "learning_rate": 4.99555634500239e-06, + "loss": 2.4641, + "step": 4512 + }, + { + "epoch": 0.2421137339055794, + "grad_norm": 0.333984375, + "learning_rate": 4.995551165945307e-06, + "loss": 2.1134, + "step": 4513 + }, + { + "epoch": 0.24216738197424892, + "grad_norm": 0.36328125, + "learning_rate": 4.995545983874588e-06, + "loss": 1.9623, + "step": 4514 + }, + { + "epoch": 0.24222103004291845, + "grad_norm": 0.85546875, + "learning_rate": 4.9955407987902375e-06, + "loss": 2.3321, + "step": 4515 + }, + { + "epoch": 0.24227467811158798, + "grad_norm": 0.37890625, + "learning_rate": 4.995535610692263e-06, + "loss": 2.1506, + "step": 4516 + }, + { + "epoch": 0.24232832618025751, + "grad_norm": 0.31640625, + "learning_rate": 4.99553041958067e-06, + "loss": 2.1842, + "step": 4517 + }, + { + "epoch": 0.24238197424892705, + "grad_norm": 0.455078125, + "learning_rate": 4.995525225455465e-06, + "loss": 2.2843, + "step": 4518 + }, + { + "epoch": 0.24243562231759658, + "grad_norm": 0.365234375, + "learning_rate": 4.995520028316656e-06, + "loss": 2.3844, + "step": 4519 + }, + { + "epoch": 0.24248927038626608, + "grad_norm": 0.412109375, + "learning_rate": 4.995514828164246e-06, + "loss": 2.4938, + "step": 4520 + }, + { + "epoch": 0.2425429184549356, + "grad_norm": 1.796875, + "learning_rate": 4.995509624998244e-06, + "loss": 2.4699, + "step": 4521 + }, + { + "epoch": 0.24259656652360514, + "grad_norm": 0.34765625, + "learning_rate": 4.995504418818654e-06, + "loss": 2.4737, + "step": 4522 + }, + { + "epoch": 0.24265021459227468, + "grad_norm": 0.34375, + "learning_rate": 4.995499209625484e-06, + "loss": 2.2537, + "step": 4523 + }, + { + "epoch": 0.2427038626609442, + "grad_norm": 0.35546875, + "learning_rate": 4.99549399741874e-06, + "loss": 2.3068, + "step": 4524 + }, + { + "epoch": 0.24275751072961374, + "grad_norm": 0.35546875, + "learning_rate": 4.9954887821984285e-06, + "loss": 2.4009, + "step": 4525 + }, + { + "epoch": 0.24281115879828327, + "grad_norm": 0.345703125, + "learning_rate": 4.995483563964555e-06, + "loss": 2.3111, + "step": 4526 + }, + { + "epoch": 0.24286480686695278, + "grad_norm": 0.31640625, + "learning_rate": 4.995478342717126e-06, + "loss": 1.9589, + "step": 4527 + }, + { + "epoch": 0.2429184549356223, + "grad_norm": 0.373046875, + "learning_rate": 4.9954731184561475e-06, + "loss": 2.2969, + "step": 4528 + }, + { + "epoch": 0.24297210300429184, + "grad_norm": 0.310546875, + "learning_rate": 4.995467891181627e-06, + "loss": 2.5011, + "step": 4529 + }, + { + "epoch": 0.24302575107296137, + "grad_norm": 0.3671875, + "learning_rate": 4.99546266089357e-06, + "loss": 2.6085, + "step": 4530 + }, + { + "epoch": 0.2430793991416309, + "grad_norm": 0.474609375, + "learning_rate": 4.995457427591983e-06, + "loss": 2.4931, + "step": 4531 + }, + { + "epoch": 0.24313304721030043, + "grad_norm": 0.337890625, + "learning_rate": 4.995452191276873e-06, + "loss": 2.1967, + "step": 4532 + }, + { + "epoch": 0.24318669527896997, + "grad_norm": 0.396484375, + "learning_rate": 4.995446951948244e-06, + "loss": 2.4224, + "step": 4533 + }, + { + "epoch": 0.2432403433476395, + "grad_norm": 0.341796875, + "learning_rate": 4.9954417096061055e-06, + "loss": 2.3547, + "step": 4534 + }, + { + "epoch": 0.243293991416309, + "grad_norm": 0.5, + "learning_rate": 4.995436464250461e-06, + "loss": 2.3363, + "step": 4535 + }, + { + "epoch": 0.24334763948497853, + "grad_norm": 0.361328125, + "learning_rate": 4.995431215881319e-06, + "loss": 2.1497, + "step": 4536 + }, + { + "epoch": 0.24340128755364807, + "grad_norm": 0.46875, + "learning_rate": 4.995425964498684e-06, + "loss": 2.4131, + "step": 4537 + }, + { + "epoch": 0.2434549356223176, + "grad_norm": 0.4140625, + "learning_rate": 4.995420710102564e-06, + "loss": 2.1842, + "step": 4538 + }, + { + "epoch": 0.24350858369098713, + "grad_norm": 0.357421875, + "learning_rate": 4.995415452692965e-06, + "loss": 2.465, + "step": 4539 + }, + { + "epoch": 0.24356223175965666, + "grad_norm": 0.953125, + "learning_rate": 4.995410192269892e-06, + "loss": 2.6375, + "step": 4540 + }, + { + "epoch": 0.2436158798283262, + "grad_norm": 0.392578125, + "learning_rate": 4.995404928833351e-06, + "loss": 2.1969, + "step": 4541 + }, + { + "epoch": 0.2436695278969957, + "grad_norm": 0.337890625, + "learning_rate": 4.995399662383352e-06, + "loss": 2.2847, + "step": 4542 + }, + { + "epoch": 0.24372317596566523, + "grad_norm": 0.384765625, + "learning_rate": 4.995394392919899e-06, + "loss": 2.3106, + "step": 4543 + }, + { + "epoch": 0.24377682403433476, + "grad_norm": 0.478515625, + "learning_rate": 4.995389120442997e-06, + "loss": 2.3467, + "step": 4544 + }, + { + "epoch": 0.2438304721030043, + "grad_norm": 0.46484375, + "learning_rate": 4.9953838449526545e-06, + "loss": 2.0159, + "step": 4545 + }, + { + "epoch": 0.24388412017167382, + "grad_norm": 0.455078125, + "learning_rate": 4.9953785664488775e-06, + "loss": 2.2645, + "step": 4546 + }, + { + "epoch": 0.24393776824034336, + "grad_norm": 0.390625, + "learning_rate": 4.995373284931671e-06, + "loss": 2.4472, + "step": 4547 + }, + { + "epoch": 0.2439914163090129, + "grad_norm": 0.310546875, + "learning_rate": 4.995368000401043e-06, + "loss": 2.0745, + "step": 4548 + }, + { + "epoch": 0.2440450643776824, + "grad_norm": 0.388671875, + "learning_rate": 4.995362712856999e-06, + "loss": 2.3355, + "step": 4549 + }, + { + "epoch": 0.24409871244635192, + "grad_norm": 0.474609375, + "learning_rate": 4.995357422299546e-06, + "loss": 2.4018, + "step": 4550 + }, + { + "epoch": 0.24415236051502145, + "grad_norm": 0.46875, + "learning_rate": 4.995352128728689e-06, + "loss": 2.4894, + "step": 4551 + }, + { + "epoch": 0.244206008583691, + "grad_norm": 0.333984375, + "learning_rate": 4.995346832144437e-06, + "loss": 2.3183, + "step": 4552 + }, + { + "epoch": 0.24425965665236052, + "grad_norm": 0.36328125, + "learning_rate": 4.995341532546793e-06, + "loss": 2.3756, + "step": 4553 + }, + { + "epoch": 0.24431330472103005, + "grad_norm": 0.30859375, + "learning_rate": 4.995336229935767e-06, + "loss": 2.2167, + "step": 4554 + }, + { + "epoch": 0.24436695278969958, + "grad_norm": 0.44140625, + "learning_rate": 4.995330924311363e-06, + "loss": 2.5672, + "step": 4555 + }, + { + "epoch": 0.24442060085836909, + "grad_norm": 0.3515625, + "learning_rate": 4.995325615673587e-06, + "loss": 2.1594, + "step": 4556 + }, + { + "epoch": 0.24447424892703862, + "grad_norm": 0.34765625, + "learning_rate": 4.995320304022448e-06, + "loss": 2.3334, + "step": 4557 + }, + { + "epoch": 0.24452789699570815, + "grad_norm": 0.38671875, + "learning_rate": 4.99531498935795e-06, + "loss": 2.5201, + "step": 4558 + }, + { + "epoch": 0.24458154506437768, + "grad_norm": 0.451171875, + "learning_rate": 4.9953096716801e-06, + "loss": 2.2207, + "step": 4559 + }, + { + "epoch": 0.2446351931330472, + "grad_norm": 0.357421875, + "learning_rate": 4.995304350988905e-06, + "loss": 2.0295, + "step": 4560 + }, + { + "epoch": 0.24468884120171674, + "grad_norm": 0.36328125, + "learning_rate": 4.995299027284371e-06, + "loss": 2.2167, + "step": 4561 + }, + { + "epoch": 0.24474248927038628, + "grad_norm": 0.419921875, + "learning_rate": 4.995293700566506e-06, + "loss": 2.3585, + "step": 4562 + }, + { + "epoch": 0.24479613733905578, + "grad_norm": 0.55078125, + "learning_rate": 4.995288370835313e-06, + "loss": 2.246, + "step": 4563 + }, + { + "epoch": 0.2448497854077253, + "grad_norm": 0.310546875, + "learning_rate": 4.995283038090801e-06, + "loss": 2.4302, + "step": 4564 + }, + { + "epoch": 0.24490343347639484, + "grad_norm": 0.515625, + "learning_rate": 4.995277702332976e-06, + "loss": 2.3841, + "step": 4565 + }, + { + "epoch": 0.24495708154506438, + "grad_norm": 0.349609375, + "learning_rate": 4.995272363561845e-06, + "loss": 2.3378, + "step": 4566 + }, + { + "epoch": 0.2450107296137339, + "grad_norm": 0.369140625, + "learning_rate": 4.995267021777413e-06, + "loss": 2.2145, + "step": 4567 + }, + { + "epoch": 0.24506437768240344, + "grad_norm": 0.3515625, + "learning_rate": 4.9952616769796865e-06, + "loss": 2.2366, + "step": 4568 + }, + { + "epoch": 0.24511802575107297, + "grad_norm": 0.396484375, + "learning_rate": 4.995256329168674e-06, + "loss": 2.4716, + "step": 4569 + }, + { + "epoch": 0.2451716738197425, + "grad_norm": 0.58984375, + "learning_rate": 4.995250978344379e-06, + "loss": 2.3167, + "step": 4570 + }, + { + "epoch": 0.245225321888412, + "grad_norm": 0.3515625, + "learning_rate": 4.9952456245068115e-06, + "loss": 2.3981, + "step": 4571 + }, + { + "epoch": 0.24527896995708154, + "grad_norm": 0.421875, + "learning_rate": 4.995240267655975e-06, + "loss": 2.5937, + "step": 4572 + }, + { + "epoch": 0.24533261802575107, + "grad_norm": 0.47265625, + "learning_rate": 4.995234907791877e-06, + "loss": 2.1828, + "step": 4573 + }, + { + "epoch": 0.2453862660944206, + "grad_norm": 1.015625, + "learning_rate": 4.995229544914524e-06, + "loss": 2.2706, + "step": 4574 + }, + { + "epoch": 0.24543991416309013, + "grad_norm": 2.640625, + "learning_rate": 4.995224179023922e-06, + "loss": 2.1034, + "step": 4575 + }, + { + "epoch": 0.24549356223175967, + "grad_norm": 4.0, + "learning_rate": 4.995218810120079e-06, + "loss": 2.5137, + "step": 4576 + }, + { + "epoch": 0.2455472103004292, + "grad_norm": 0.357421875, + "learning_rate": 4.995213438202999e-06, + "loss": 2.2122, + "step": 4577 + }, + { + "epoch": 0.2456008583690987, + "grad_norm": 0.37890625, + "learning_rate": 4.9952080632726914e-06, + "loss": 2.3585, + "step": 4578 + }, + { + "epoch": 0.24565450643776823, + "grad_norm": 0.48046875, + "learning_rate": 4.99520268532916e-06, + "loss": 2.576, + "step": 4579 + }, + { + "epoch": 0.24570815450643776, + "grad_norm": 0.40234375, + "learning_rate": 4.995197304372414e-06, + "loss": 2.4793, + "step": 4580 + }, + { + "epoch": 0.2457618025751073, + "grad_norm": 0.796875, + "learning_rate": 4.995191920402457e-06, + "loss": 2.2395, + "step": 4581 + }, + { + "epoch": 0.24581545064377683, + "grad_norm": 0.3671875, + "learning_rate": 4.995186533419298e-06, + "loss": 2.2984, + "step": 4582 + }, + { + "epoch": 0.24586909871244636, + "grad_norm": 0.35546875, + "learning_rate": 4.9951811434229404e-06, + "loss": 2.2648, + "step": 4583 + }, + { + "epoch": 0.2459227467811159, + "grad_norm": 0.419921875, + "learning_rate": 4.995175750413395e-06, + "loss": 2.4453, + "step": 4584 + }, + { + "epoch": 0.2459763948497854, + "grad_norm": 0.32421875, + "learning_rate": 4.995170354390665e-06, + "loss": 2.2907, + "step": 4585 + }, + { + "epoch": 0.24603004291845493, + "grad_norm": 0.314453125, + "learning_rate": 4.995164955354758e-06, + "loss": 2.004, + "step": 4586 + }, + { + "epoch": 0.24608369098712446, + "grad_norm": 0.6796875, + "learning_rate": 4.99515955330568e-06, + "loss": 2.1667, + "step": 4587 + }, + { + "epoch": 0.246137339055794, + "grad_norm": 0.4296875, + "learning_rate": 4.995154148243438e-06, + "loss": 2.2074, + "step": 4588 + }, + { + "epoch": 0.24619098712446352, + "grad_norm": 0.390625, + "learning_rate": 4.995148740168039e-06, + "loss": 2.2952, + "step": 4589 + }, + { + "epoch": 0.24624463519313305, + "grad_norm": 0.390625, + "learning_rate": 4.99514332907949e-06, + "loss": 2.3976, + "step": 4590 + }, + { + "epoch": 0.2462982832618026, + "grad_norm": 0.3203125, + "learning_rate": 4.9951379149777944e-06, + "loss": 2.2176, + "step": 4591 + }, + { + "epoch": 0.2463519313304721, + "grad_norm": 0.306640625, + "learning_rate": 4.9951324978629625e-06, + "loss": 1.8919, + "step": 4592 + }, + { + "epoch": 0.24640557939914162, + "grad_norm": 0.373046875, + "learning_rate": 4.995127077734998e-06, + "loss": 2.2717, + "step": 4593 + }, + { + "epoch": 0.24645922746781115, + "grad_norm": 0.419921875, + "learning_rate": 4.99512165459391e-06, + "loss": 2.2607, + "step": 4594 + }, + { + "epoch": 0.24651287553648069, + "grad_norm": 0.345703125, + "learning_rate": 4.995116228439702e-06, + "loss": 2.3468, + "step": 4595 + }, + { + "epoch": 0.24656652360515022, + "grad_norm": 0.388671875, + "learning_rate": 4.995110799272384e-06, + "loss": 2.3856, + "step": 4596 + }, + { + "epoch": 0.24662017167381975, + "grad_norm": 0.44921875, + "learning_rate": 4.99510536709196e-06, + "loss": 2.1204, + "step": 4597 + }, + { + "epoch": 0.24667381974248928, + "grad_norm": 0.365234375, + "learning_rate": 4.995099931898438e-06, + "loss": 2.376, + "step": 4598 + }, + { + "epoch": 0.24672746781115878, + "grad_norm": 0.6328125, + "learning_rate": 4.9950944936918236e-06, + "loss": 2.5111, + "step": 4599 + }, + { + "epoch": 0.24678111587982832, + "grad_norm": 0.369140625, + "learning_rate": 4.995089052472124e-06, + "loss": 2.2934, + "step": 4600 + }, + { + "epoch": 0.24683476394849785, + "grad_norm": 0.341796875, + "learning_rate": 4.995083608239345e-06, + "loss": 2.2407, + "step": 4601 + }, + { + "epoch": 0.24688841201716738, + "grad_norm": 0.345703125, + "learning_rate": 4.995078160993494e-06, + "loss": 2.2023, + "step": 4602 + }, + { + "epoch": 0.2469420600858369, + "grad_norm": 0.3359375, + "learning_rate": 4.995072710734578e-06, + "loss": 2.1156, + "step": 4603 + }, + { + "epoch": 0.24699570815450644, + "grad_norm": 0.4453125, + "learning_rate": 4.995067257462601e-06, + "loss": 1.9518, + "step": 4604 + }, + { + "epoch": 0.24704935622317598, + "grad_norm": 0.421875, + "learning_rate": 4.995061801177573e-06, + "loss": 2.1171, + "step": 4605 + }, + { + "epoch": 0.2471030042918455, + "grad_norm": 0.55078125, + "learning_rate": 4.995056341879499e-06, + "loss": 2.5426, + "step": 4606 + }, + { + "epoch": 0.247156652360515, + "grad_norm": 0.39453125, + "learning_rate": 4.9950508795683846e-06, + "loss": 2.4797, + "step": 4607 + }, + { + "epoch": 0.24721030042918454, + "grad_norm": 0.4296875, + "learning_rate": 4.995045414244238e-06, + "loss": 2.1743, + "step": 4608 + }, + { + "epoch": 0.24726394849785407, + "grad_norm": 0.34375, + "learning_rate": 4.9950399459070655e-06, + "loss": 2.2923, + "step": 4609 + }, + { + "epoch": 0.2473175965665236, + "grad_norm": 0.458984375, + "learning_rate": 4.9950344745568724e-06, + "loss": 1.4513, + "step": 4610 + }, + { + "epoch": 0.24737124463519314, + "grad_norm": 1.1640625, + "learning_rate": 4.995029000193667e-06, + "loss": 2.5055, + "step": 4611 + }, + { + "epoch": 0.24742489270386267, + "grad_norm": 0.369140625, + "learning_rate": 4.9950235228174554e-06, + "loss": 2.372, + "step": 4612 + }, + { + "epoch": 0.2474785407725322, + "grad_norm": 0.310546875, + "learning_rate": 4.995018042428243e-06, + "loss": 2.2747, + "step": 4613 + }, + { + "epoch": 0.2475321888412017, + "grad_norm": 0.3125, + "learning_rate": 4.995012559026039e-06, + "loss": 2.0832, + "step": 4614 + }, + { + "epoch": 0.24758583690987124, + "grad_norm": 0.34765625, + "learning_rate": 4.995007072610848e-06, + "loss": 2.1322, + "step": 4615 + }, + { + "epoch": 0.24763948497854077, + "grad_norm": 0.42578125, + "learning_rate": 4.995001583182677e-06, + "loss": 2.5339, + "step": 4616 + }, + { + "epoch": 0.2476931330472103, + "grad_norm": 0.32421875, + "learning_rate": 4.994996090741533e-06, + "loss": 2.282, + "step": 4617 + }, + { + "epoch": 0.24774678111587983, + "grad_norm": 0.341796875, + "learning_rate": 4.9949905952874215e-06, + "loss": 2.1521, + "step": 4618 + }, + { + "epoch": 0.24780042918454936, + "grad_norm": 0.4453125, + "learning_rate": 4.99498509682035e-06, + "loss": 2.384, + "step": 4619 + }, + { + "epoch": 0.2478540772532189, + "grad_norm": 0.330078125, + "learning_rate": 4.994979595340326e-06, + "loss": 1.8678, + "step": 4620 + }, + { + "epoch": 0.2479077253218884, + "grad_norm": 0.400390625, + "learning_rate": 4.994974090847355e-06, + "loss": 2.4976, + "step": 4621 + }, + { + "epoch": 0.24796137339055793, + "grad_norm": 0.4375, + "learning_rate": 4.994968583341444e-06, + "loss": 1.8876, + "step": 4622 + }, + { + "epoch": 0.24801502145922746, + "grad_norm": 0.333984375, + "learning_rate": 4.9949630728226e-06, + "loss": 2.285, + "step": 4623 + }, + { + "epoch": 0.248068669527897, + "grad_norm": 1.3125, + "learning_rate": 4.994957559290828e-06, + "loss": 2.4091, + "step": 4624 + }, + { + "epoch": 0.24812231759656653, + "grad_norm": 0.419921875, + "learning_rate": 4.994952042746138e-06, + "loss": 2.0148, + "step": 4625 + }, + { + "epoch": 0.24817596566523606, + "grad_norm": 0.30859375, + "learning_rate": 4.994946523188532e-06, + "loss": 1.976, + "step": 4626 + }, + { + "epoch": 0.2482296137339056, + "grad_norm": 0.40234375, + "learning_rate": 4.994941000618021e-06, + "loss": 2.299, + "step": 4627 + }, + { + "epoch": 0.2482832618025751, + "grad_norm": 0.341796875, + "learning_rate": 4.994935475034609e-06, + "loss": 2.2321, + "step": 4628 + }, + { + "epoch": 0.24833690987124463, + "grad_norm": 0.4296875, + "learning_rate": 4.994929946438304e-06, + "loss": 1.7756, + "step": 4629 + }, + { + "epoch": 0.24839055793991416, + "grad_norm": 0.431640625, + "learning_rate": 4.994924414829112e-06, + "loss": 2.2981, + "step": 4630 + }, + { + "epoch": 0.2484442060085837, + "grad_norm": 0.333984375, + "learning_rate": 4.99491888020704e-06, + "loss": 2.0803, + "step": 4631 + }, + { + "epoch": 0.24849785407725322, + "grad_norm": 0.4296875, + "learning_rate": 4.994913342572095e-06, + "loss": 2.3724, + "step": 4632 + }, + { + "epoch": 0.24855150214592275, + "grad_norm": 4.125, + "learning_rate": 4.994907801924282e-06, + "loss": 2.1584, + "step": 4633 + }, + { + "epoch": 0.24860515021459229, + "grad_norm": 0.435546875, + "learning_rate": 4.994902258263611e-06, + "loss": 2.3204, + "step": 4634 + }, + { + "epoch": 0.2486587982832618, + "grad_norm": 0.388671875, + "learning_rate": 4.994896711590085e-06, + "loss": 2.2331, + "step": 4635 + }, + { + "epoch": 0.24871244635193132, + "grad_norm": 0.765625, + "learning_rate": 4.994891161903713e-06, + "loss": 2.3358, + "step": 4636 + }, + { + "epoch": 0.24876609442060085, + "grad_norm": 0.32421875, + "learning_rate": 4.994885609204502e-06, + "loss": 2.208, + "step": 4637 + }, + { + "epoch": 0.24881974248927038, + "grad_norm": 0.4296875, + "learning_rate": 4.994880053492456e-06, + "loss": 2.2493, + "step": 4638 + }, + { + "epoch": 0.24887339055793992, + "grad_norm": 0.369140625, + "learning_rate": 4.994874494767585e-06, + "loss": 2.6134, + "step": 4639 + }, + { + "epoch": 0.24892703862660945, + "grad_norm": 0.373046875, + "learning_rate": 4.994868933029894e-06, + "loss": 2.4944, + "step": 4640 + }, + { + "epoch": 0.24898068669527898, + "grad_norm": 0.294921875, + "learning_rate": 4.994863368279389e-06, + "loss": 2.2595, + "step": 4641 + }, + { + "epoch": 0.2490343347639485, + "grad_norm": 0.38671875, + "learning_rate": 4.994857800516078e-06, + "loss": 1.6902, + "step": 4642 + }, + { + "epoch": 0.24908798283261802, + "grad_norm": 0.38671875, + "learning_rate": 4.994852229739968e-06, + "loss": 2.2194, + "step": 4643 + }, + { + "epoch": 0.24914163090128755, + "grad_norm": 0.3359375, + "learning_rate": 4.994846655951064e-06, + "loss": 2.2993, + "step": 4644 + }, + { + "epoch": 0.24919527896995708, + "grad_norm": 0.33203125, + "learning_rate": 4.994841079149375e-06, + "loss": 2.1118, + "step": 4645 + }, + { + "epoch": 0.2492489270386266, + "grad_norm": 0.31640625, + "learning_rate": 4.994835499334906e-06, + "loss": 2.1407, + "step": 4646 + }, + { + "epoch": 0.24930257510729614, + "grad_norm": 0.384765625, + "learning_rate": 4.994829916507664e-06, + "loss": 2.3995, + "step": 4647 + }, + { + "epoch": 0.24935622317596567, + "grad_norm": 0.341796875, + "learning_rate": 4.994824330667657e-06, + "loss": 2.1733, + "step": 4648 + }, + { + "epoch": 0.2494098712446352, + "grad_norm": 0.34375, + "learning_rate": 4.99481874181489e-06, + "loss": 2.4756, + "step": 4649 + }, + { + "epoch": 0.2494635193133047, + "grad_norm": 0.376953125, + "learning_rate": 4.99481314994937e-06, + "loss": 1.8024, + "step": 4650 + }, + { + "epoch": 0.24951716738197424, + "grad_norm": 0.37890625, + "learning_rate": 4.9948075550711055e-06, + "loss": 2.2234, + "step": 4651 + }, + { + "epoch": 0.24957081545064377, + "grad_norm": 1.3125, + "learning_rate": 4.9948019571801015e-06, + "loss": 2.2774, + "step": 4652 + }, + { + "epoch": 0.2496244635193133, + "grad_norm": 0.40625, + "learning_rate": 4.994796356276366e-06, + "loss": 2.5687, + "step": 4653 + }, + { + "epoch": 0.24967811158798284, + "grad_norm": 0.416015625, + "learning_rate": 4.994790752359904e-06, + "loss": 2.2956, + "step": 4654 + }, + { + "epoch": 0.24973175965665237, + "grad_norm": 0.404296875, + "learning_rate": 4.994785145430724e-06, + "loss": 2.3831, + "step": 4655 + }, + { + "epoch": 0.2497854077253219, + "grad_norm": 0.392578125, + "learning_rate": 4.994779535488832e-06, + "loss": 2.2913, + "step": 4656 + }, + { + "epoch": 0.2498390557939914, + "grad_norm": 0.3984375, + "learning_rate": 4.994773922534234e-06, + "loss": 2.2935, + "step": 4657 + }, + { + "epoch": 0.24989270386266094, + "grad_norm": 0.375, + "learning_rate": 4.994768306566939e-06, + "loss": 2.3534, + "step": 4658 + }, + { + "epoch": 0.24994635193133047, + "grad_norm": 0.435546875, + "learning_rate": 4.994762687586951e-06, + "loss": 2.088, + "step": 4659 + }, + { + "epoch": 0.25, + "grad_norm": 0.318359375, + "learning_rate": 4.99475706559428e-06, + "loss": 2.2418, + "step": 4660 + }, + { + "epoch": 0.25005364806866953, + "grad_norm": 0.328125, + "learning_rate": 4.994751440588929e-06, + "loss": 2.0466, + "step": 4661 + }, + { + "epoch": 0.25010729613733906, + "grad_norm": 0.43359375, + "learning_rate": 4.994745812570908e-06, + "loss": 2.4355, + "step": 4662 + }, + { + "epoch": 0.2501609442060086, + "grad_norm": 0.3125, + "learning_rate": 4.994740181540223e-06, + "loss": 2.3079, + "step": 4663 + }, + { + "epoch": 0.2502145922746781, + "grad_norm": 0.6015625, + "learning_rate": 4.99473454749688e-06, + "loss": 2.1142, + "step": 4664 + }, + { + "epoch": 0.25026824034334766, + "grad_norm": 0.39453125, + "learning_rate": 4.9947289104408864e-06, + "loss": 2.5022, + "step": 4665 + }, + { + "epoch": 0.2503218884120172, + "grad_norm": 0.38671875, + "learning_rate": 4.994723270372248e-06, + "loss": 2.4065, + "step": 4666 + }, + { + "epoch": 0.2503755364806867, + "grad_norm": 0.435546875, + "learning_rate": 4.994717627290974e-06, + "loss": 2.1518, + "step": 4667 + }, + { + "epoch": 0.2504291845493562, + "grad_norm": 0.33984375, + "learning_rate": 4.994711981197068e-06, + "loss": 2.3016, + "step": 4668 + }, + { + "epoch": 0.25048283261802573, + "grad_norm": 0.37890625, + "learning_rate": 4.994706332090539e-06, + "loss": 2.2433, + "step": 4669 + }, + { + "epoch": 0.25053648068669526, + "grad_norm": 0.353515625, + "learning_rate": 4.994700679971394e-06, + "loss": 2.3308, + "step": 4670 + }, + { + "epoch": 0.2505901287553648, + "grad_norm": 0.36328125, + "learning_rate": 4.994695024839638e-06, + "loss": 2.3543, + "step": 4671 + }, + { + "epoch": 0.2506437768240343, + "grad_norm": 0.41796875, + "learning_rate": 4.99468936669528e-06, + "loss": 2.3587, + "step": 4672 + }, + { + "epoch": 0.25069742489270386, + "grad_norm": 0.435546875, + "learning_rate": 4.9946837055383245e-06, + "loss": 2.2651, + "step": 4673 + }, + { + "epoch": 0.2507510729613734, + "grad_norm": 0.267578125, + "learning_rate": 4.9946780413687804e-06, + "loss": 2.2036, + "step": 4674 + }, + { + "epoch": 0.2508047210300429, + "grad_norm": 0.44140625, + "learning_rate": 4.994672374186654e-06, + "loss": 2.2135, + "step": 4675 + }, + { + "epoch": 0.25085836909871245, + "grad_norm": 0.376953125, + "learning_rate": 4.994666703991952e-06, + "loss": 1.9915, + "step": 4676 + }, + { + "epoch": 0.250912017167382, + "grad_norm": 0.365234375, + "learning_rate": 4.9946610307846796e-06, + "loss": 2.2106, + "step": 4677 + }, + { + "epoch": 0.2509656652360515, + "grad_norm": 0.4296875, + "learning_rate": 4.994655354564847e-06, + "loss": 2.0647, + "step": 4678 + }, + { + "epoch": 0.25101931330472105, + "grad_norm": 0.34375, + "learning_rate": 4.994649675332458e-06, + "loss": 2.2666, + "step": 4679 + }, + { + "epoch": 0.2510729613733906, + "grad_norm": 0.421875, + "learning_rate": 4.994643993087522e-06, + "loss": 2.3923, + "step": 4680 + }, + { + "epoch": 0.2511266094420601, + "grad_norm": 0.4765625, + "learning_rate": 4.994638307830043e-06, + "loss": 2.4032, + "step": 4681 + }, + { + "epoch": 0.2511802575107296, + "grad_norm": 0.625, + "learning_rate": 4.9946326195600305e-06, + "loss": 2.2781, + "step": 4682 + }, + { + "epoch": 0.2512339055793991, + "grad_norm": 0.416015625, + "learning_rate": 4.99462692827749e-06, + "loss": 2.4921, + "step": 4683 + }, + { + "epoch": 0.25128755364806865, + "grad_norm": 0.47265625, + "learning_rate": 4.9946212339824285e-06, + "loss": 2.3144, + "step": 4684 + }, + { + "epoch": 0.2513412017167382, + "grad_norm": 0.359375, + "learning_rate": 4.994615536674854e-06, + "loss": 2.3492, + "step": 4685 + }, + { + "epoch": 0.2513948497854077, + "grad_norm": 0.318359375, + "learning_rate": 4.994609836354771e-06, + "loss": 2.0215, + "step": 4686 + }, + { + "epoch": 0.25144849785407725, + "grad_norm": 0.388671875, + "learning_rate": 4.994604133022188e-06, + "loss": 2.2617, + "step": 4687 + }, + { + "epoch": 0.2515021459227468, + "grad_norm": 0.359375, + "learning_rate": 4.994598426677113e-06, + "loss": 2.2914, + "step": 4688 + }, + { + "epoch": 0.2515557939914163, + "grad_norm": 0.63671875, + "learning_rate": 4.994592717319551e-06, + "loss": 2.2392, + "step": 4689 + }, + { + "epoch": 0.25160944206008584, + "grad_norm": 0.408203125, + "learning_rate": 4.994587004949509e-06, + "loss": 2.3916, + "step": 4690 + }, + { + "epoch": 0.2516630901287554, + "grad_norm": 0.361328125, + "learning_rate": 4.994581289566994e-06, + "loss": 2.1581, + "step": 4691 + }, + { + "epoch": 0.2517167381974249, + "grad_norm": 0.455078125, + "learning_rate": 4.994575571172015e-06, + "loss": 2.4644, + "step": 4692 + }, + { + "epoch": 0.25177038626609444, + "grad_norm": 0.466796875, + "learning_rate": 4.994569849764576e-06, + "loss": 1.7294, + "step": 4693 + }, + { + "epoch": 0.25182403433476397, + "grad_norm": 0.392578125, + "learning_rate": 4.9945641253446854e-06, + "loss": 2.4405, + "step": 4694 + }, + { + "epoch": 0.2518776824034335, + "grad_norm": 0.384765625, + "learning_rate": 4.994558397912349e-06, + "loss": 2.6195, + "step": 4695 + }, + { + "epoch": 0.251931330472103, + "grad_norm": 0.341796875, + "learning_rate": 4.994552667467576e-06, + "loss": 2.1627, + "step": 4696 + }, + { + "epoch": 0.2519849785407725, + "grad_norm": 0.361328125, + "learning_rate": 4.994546934010371e-06, + "loss": 2.3353, + "step": 4697 + }, + { + "epoch": 0.25203862660944204, + "grad_norm": 0.365234375, + "learning_rate": 4.994541197540741e-06, + "loss": 2.2663, + "step": 4698 + }, + { + "epoch": 0.25209227467811157, + "grad_norm": 0.33984375, + "learning_rate": 4.994535458058695e-06, + "loss": 2.289, + "step": 4699 + }, + { + "epoch": 0.2521459227467811, + "grad_norm": 0.404296875, + "learning_rate": 4.994529715564238e-06, + "loss": 2.3735, + "step": 4700 + }, + { + "epoch": 0.25219957081545064, + "grad_norm": 0.36328125, + "learning_rate": 4.994523970057378e-06, + "loss": 2.244, + "step": 4701 + }, + { + "epoch": 0.25225321888412017, + "grad_norm": 0.3359375, + "learning_rate": 4.99451822153812e-06, + "loss": 2.1531, + "step": 4702 + }, + { + "epoch": 0.2523068669527897, + "grad_norm": 0.439453125, + "learning_rate": 4.994512470006474e-06, + "loss": 2.3836, + "step": 4703 + }, + { + "epoch": 0.25236051502145923, + "grad_norm": 0.4609375, + "learning_rate": 4.994506715462445e-06, + "loss": 1.7941, + "step": 4704 + }, + { + "epoch": 0.25241416309012876, + "grad_norm": 0.337890625, + "learning_rate": 4.994500957906041e-06, + "loss": 2.2046, + "step": 4705 + }, + { + "epoch": 0.2524678111587983, + "grad_norm": 0.361328125, + "learning_rate": 4.994495197337268e-06, + "loss": 2.3168, + "step": 4706 + }, + { + "epoch": 0.2525214592274678, + "grad_norm": 0.359375, + "learning_rate": 4.994489433756132e-06, + "loss": 2.4058, + "step": 4707 + }, + { + "epoch": 0.25257510729613736, + "grad_norm": 0.37890625, + "learning_rate": 4.9944836671626425e-06, + "loss": 2.3135, + "step": 4708 + }, + { + "epoch": 0.2526287553648069, + "grad_norm": 0.33203125, + "learning_rate": 4.9944778975568055e-06, + "loss": 2.1412, + "step": 4709 + }, + { + "epoch": 0.2526824034334764, + "grad_norm": 0.32421875, + "learning_rate": 4.994472124938626e-06, + "loss": 2.3329, + "step": 4710 + }, + { + "epoch": 0.2527360515021459, + "grad_norm": 0.55078125, + "learning_rate": 4.994466349308114e-06, + "loss": 2.3757, + "step": 4711 + }, + { + "epoch": 0.25278969957081543, + "grad_norm": 0.341796875, + "learning_rate": 4.994460570665275e-06, + "loss": 2.1203, + "step": 4712 + }, + { + "epoch": 0.25284334763948496, + "grad_norm": 0.326171875, + "learning_rate": 4.994454789010116e-06, + "loss": 1.9971, + "step": 4713 + }, + { + "epoch": 0.2528969957081545, + "grad_norm": 0.33984375, + "learning_rate": 4.994449004342643e-06, + "loss": 2.4338, + "step": 4714 + }, + { + "epoch": 0.252950643776824, + "grad_norm": 0.37890625, + "learning_rate": 4.994443216662865e-06, + "loss": 2.4304, + "step": 4715 + }, + { + "epoch": 0.25300429184549356, + "grad_norm": 0.42578125, + "learning_rate": 4.9944374259707875e-06, + "loss": 2.1622, + "step": 4716 + }, + { + "epoch": 0.2530579399141631, + "grad_norm": 0.462890625, + "learning_rate": 4.9944316322664185e-06, + "loss": 2.3459, + "step": 4717 + }, + { + "epoch": 0.2531115879828326, + "grad_norm": 0.51953125, + "learning_rate": 4.994425835549764e-06, + "loss": 1.8399, + "step": 4718 + }, + { + "epoch": 0.25316523605150215, + "grad_norm": 0.353515625, + "learning_rate": 4.9944200358208325e-06, + "loss": 2.1955, + "step": 4719 + }, + { + "epoch": 0.2532188841201717, + "grad_norm": 0.388671875, + "learning_rate": 4.99441423307963e-06, + "loss": 2.1003, + "step": 4720 + }, + { + "epoch": 0.2532725321888412, + "grad_norm": 0.33203125, + "learning_rate": 4.994408427326162e-06, + "loss": 2.2772, + "step": 4721 + }, + { + "epoch": 0.25332618025751075, + "grad_norm": 0.3359375, + "learning_rate": 4.9944026185604385e-06, + "loss": 2.1449, + "step": 4722 + }, + { + "epoch": 0.2533798283261803, + "grad_norm": 0.46875, + "learning_rate": 4.994396806782464e-06, + "loss": 1.8377, + "step": 4723 + }, + { + "epoch": 0.2534334763948498, + "grad_norm": 0.466796875, + "learning_rate": 4.9943909919922475e-06, + "loss": 2.2603, + "step": 4724 + }, + { + "epoch": 0.2534871244635193, + "grad_norm": 0.298828125, + "learning_rate": 4.9943851741897945e-06, + "loss": 2.3176, + "step": 4725 + }, + { + "epoch": 0.2535407725321888, + "grad_norm": 0.412109375, + "learning_rate": 4.994379353375113e-06, + "loss": 2.4992, + "step": 4726 + }, + { + "epoch": 0.25359442060085835, + "grad_norm": 0.328125, + "learning_rate": 4.994373529548209e-06, + "loss": 2.1385, + "step": 4727 + }, + { + "epoch": 0.2536480686695279, + "grad_norm": 0.66015625, + "learning_rate": 4.994367702709091e-06, + "loss": 2.4833, + "step": 4728 + }, + { + "epoch": 0.2537017167381974, + "grad_norm": 0.384765625, + "learning_rate": 4.994361872857765e-06, + "loss": 2.444, + "step": 4729 + }, + { + "epoch": 0.25375536480686695, + "grad_norm": 0.478515625, + "learning_rate": 4.994356039994238e-06, + "loss": 2.3972, + "step": 4730 + }, + { + "epoch": 0.2538090128755365, + "grad_norm": 0.3671875, + "learning_rate": 4.994350204118517e-06, + "loss": 2.2105, + "step": 4731 + }, + { + "epoch": 0.253862660944206, + "grad_norm": 0.392578125, + "learning_rate": 4.99434436523061e-06, + "loss": 2.2215, + "step": 4732 + }, + { + "epoch": 0.25391630901287554, + "grad_norm": 0.345703125, + "learning_rate": 4.994338523330523e-06, + "loss": 2.2485, + "step": 4733 + }, + { + "epoch": 0.2539699570815451, + "grad_norm": 0.43359375, + "learning_rate": 4.994332678418263e-06, + "loss": 2.3325, + "step": 4734 + }, + { + "epoch": 0.2540236051502146, + "grad_norm": 0.36328125, + "learning_rate": 4.994326830493839e-06, + "loss": 2.2791, + "step": 4735 + }, + { + "epoch": 0.25407725321888414, + "grad_norm": 0.421875, + "learning_rate": 4.994320979557256e-06, + "loss": 2.5119, + "step": 4736 + }, + { + "epoch": 0.25413090128755367, + "grad_norm": 0.31640625, + "learning_rate": 4.994315125608521e-06, + "loss": 2.3531, + "step": 4737 + }, + { + "epoch": 0.2541845493562232, + "grad_norm": 0.72265625, + "learning_rate": 4.994309268647641e-06, + "loss": 2.3866, + "step": 4738 + }, + { + "epoch": 0.25423819742489273, + "grad_norm": 0.333984375, + "learning_rate": 4.994303408674626e-06, + "loss": 2.395, + "step": 4739 + }, + { + "epoch": 0.2542918454935622, + "grad_norm": 0.3515625, + "learning_rate": 4.994297545689479e-06, + "loss": 2.0571, + "step": 4740 + }, + { + "epoch": 0.25434549356223174, + "grad_norm": 0.3515625, + "learning_rate": 4.9942916796922095e-06, + "loss": 2.0461, + "step": 4741 + }, + { + "epoch": 0.25439914163090127, + "grad_norm": 0.361328125, + "learning_rate": 4.994285810682824e-06, + "loss": 2.373, + "step": 4742 + }, + { + "epoch": 0.2544527896995708, + "grad_norm": 0.35546875, + "learning_rate": 4.994279938661329e-06, + "loss": 2.1919, + "step": 4743 + }, + { + "epoch": 0.25450643776824033, + "grad_norm": 0.375, + "learning_rate": 4.994274063627734e-06, + "loss": 2.4155, + "step": 4744 + }, + { + "epoch": 0.25456008583690987, + "grad_norm": 0.408203125, + "learning_rate": 4.994268185582043e-06, + "loss": 2.4552, + "step": 4745 + }, + { + "epoch": 0.2546137339055794, + "grad_norm": 0.416015625, + "learning_rate": 4.9942623045242645e-06, + "loss": 2.2609, + "step": 4746 + }, + { + "epoch": 0.25466738197424893, + "grad_norm": 0.27734375, + "learning_rate": 4.994256420454405e-06, + "loss": 2.1475, + "step": 4747 + }, + { + "epoch": 0.25472103004291846, + "grad_norm": 0.37109375, + "learning_rate": 4.994250533372473e-06, + "loss": 2.2549, + "step": 4748 + }, + { + "epoch": 0.254774678111588, + "grad_norm": 0.34765625, + "learning_rate": 4.994244643278475e-06, + "loss": 2.5032, + "step": 4749 + }, + { + "epoch": 0.2548283261802575, + "grad_norm": 0.357421875, + "learning_rate": 4.994238750172417e-06, + "loss": 2.4927, + "step": 4750 + }, + { + "epoch": 0.25488197424892706, + "grad_norm": 0.421875, + "learning_rate": 4.994232854054307e-06, + "loss": 2.2787, + "step": 4751 + }, + { + "epoch": 0.2549356223175966, + "grad_norm": 0.423828125, + "learning_rate": 4.994226954924152e-06, + "loss": 2.6359, + "step": 4752 + }, + { + "epoch": 0.2549892703862661, + "grad_norm": 0.3671875, + "learning_rate": 4.99422105278196e-06, + "loss": 2.536, + "step": 4753 + }, + { + "epoch": 0.2550429184549356, + "grad_norm": 0.345703125, + "learning_rate": 4.9942151476277365e-06, + "loss": 2.4047, + "step": 4754 + }, + { + "epoch": 0.25509656652360513, + "grad_norm": 0.404296875, + "learning_rate": 4.9942092394614895e-06, + "loss": 2.1332, + "step": 4755 + }, + { + "epoch": 0.25515021459227466, + "grad_norm": 0.353515625, + "learning_rate": 4.9942033282832256e-06, + "loss": 2.0293, + "step": 4756 + }, + { + "epoch": 0.2552038626609442, + "grad_norm": 0.515625, + "learning_rate": 4.994197414092953e-06, + "loss": 2.2305, + "step": 4757 + }, + { + "epoch": 0.2552575107296137, + "grad_norm": 0.392578125, + "learning_rate": 4.994191496890679e-06, + "loss": 2.0326, + "step": 4758 + }, + { + "epoch": 0.25531115879828326, + "grad_norm": 0.392578125, + "learning_rate": 4.994185576676409e-06, + "loss": 2.2503, + "step": 4759 + }, + { + "epoch": 0.2553648068669528, + "grad_norm": 0.427734375, + "learning_rate": 4.994179653450152e-06, + "loss": 2.3053, + "step": 4760 + }, + { + "epoch": 0.2554184549356223, + "grad_norm": 0.4453125, + "learning_rate": 4.994173727211913e-06, + "loss": 2.4834, + "step": 4761 + }, + { + "epoch": 0.25547210300429185, + "grad_norm": 0.5703125, + "learning_rate": 4.994167797961701e-06, + "loss": 2.5486, + "step": 4762 + }, + { + "epoch": 0.2555257510729614, + "grad_norm": 0.380859375, + "learning_rate": 4.994161865699523e-06, + "loss": 2.2932, + "step": 4763 + }, + { + "epoch": 0.2555793991416309, + "grad_norm": 0.359375, + "learning_rate": 4.994155930425386e-06, + "loss": 2.4229, + "step": 4764 + }, + { + "epoch": 0.25563304721030045, + "grad_norm": 0.416015625, + "learning_rate": 4.9941499921392965e-06, + "loss": 2.1477, + "step": 4765 + }, + { + "epoch": 0.25568669527897, + "grad_norm": 0.330078125, + "learning_rate": 4.994144050841262e-06, + "loss": 2.2445, + "step": 4766 + }, + { + "epoch": 0.2557403433476395, + "grad_norm": 0.30859375, + "learning_rate": 4.99413810653129e-06, + "loss": 2.1012, + "step": 4767 + }, + { + "epoch": 0.255793991416309, + "grad_norm": 0.39453125, + "learning_rate": 4.994132159209387e-06, + "loss": 2.3823, + "step": 4768 + }, + { + "epoch": 0.2558476394849785, + "grad_norm": 0.40234375, + "learning_rate": 4.994126208875561e-06, + "loss": 2.4231, + "step": 4769 + }, + { + "epoch": 0.25590128755364805, + "grad_norm": 0.4140625, + "learning_rate": 4.99412025552982e-06, + "loss": 2.3507, + "step": 4770 + }, + { + "epoch": 0.2559549356223176, + "grad_norm": 0.35546875, + "learning_rate": 4.994114299172168e-06, + "loss": 2.3896, + "step": 4771 + }, + { + "epoch": 0.2560085836909871, + "grad_norm": 0.39453125, + "learning_rate": 4.9941083398026154e-06, + "loss": 2.0742, + "step": 4772 + }, + { + "epoch": 0.25606223175965664, + "grad_norm": 0.35546875, + "learning_rate": 4.994102377421168e-06, + "loss": 2.3616, + "step": 4773 + }, + { + "epoch": 0.2561158798283262, + "grad_norm": 223.0, + "learning_rate": 4.994096412027834e-06, + "loss": 2.4052, + "step": 4774 + }, + { + "epoch": 0.2561695278969957, + "grad_norm": 0.34375, + "learning_rate": 4.994090443622619e-06, + "loss": 2.4093, + "step": 4775 + }, + { + "epoch": 0.25622317596566524, + "grad_norm": 0.400390625, + "learning_rate": 4.994084472205531e-06, + "loss": 2.3542, + "step": 4776 + }, + { + "epoch": 0.25627682403433477, + "grad_norm": 0.3671875, + "learning_rate": 4.994078497776576e-06, + "loss": 2.2216, + "step": 4777 + }, + { + "epoch": 0.2563304721030043, + "grad_norm": 0.5625, + "learning_rate": 4.994072520335765e-06, + "loss": 2.3032, + "step": 4778 + }, + { + "epoch": 0.25638412017167383, + "grad_norm": 0.40234375, + "learning_rate": 4.994066539883101e-06, + "loss": 2.3282, + "step": 4779 + }, + { + "epoch": 0.25643776824034337, + "grad_norm": 0.3671875, + "learning_rate": 4.994060556418594e-06, + "loss": 2.1603, + "step": 4780 + }, + { + "epoch": 0.2564914163090129, + "grad_norm": 0.375, + "learning_rate": 4.99405456994225e-06, + "loss": 2.4368, + "step": 4781 + }, + { + "epoch": 0.25654506437768243, + "grad_norm": 0.478515625, + "learning_rate": 4.994048580454075e-06, + "loss": 2.3092, + "step": 4782 + }, + { + "epoch": 0.2565987124463519, + "grad_norm": 0.53515625, + "learning_rate": 4.994042587954079e-06, + "loss": 2.4096, + "step": 4783 + }, + { + "epoch": 0.25665236051502144, + "grad_norm": 0.34765625, + "learning_rate": 4.994036592442267e-06, + "loss": 2.2702, + "step": 4784 + }, + { + "epoch": 0.25670600858369097, + "grad_norm": 0.451171875, + "learning_rate": 4.994030593918647e-06, + "loss": 1.3757, + "step": 4785 + }, + { + "epoch": 0.2567596566523605, + "grad_norm": 0.3828125, + "learning_rate": 4.994024592383227e-06, + "loss": 2.4489, + "step": 4786 + }, + { + "epoch": 0.25681330472103003, + "grad_norm": 0.3671875, + "learning_rate": 4.994018587836013e-06, + "loss": 2.5165, + "step": 4787 + }, + { + "epoch": 0.25686695278969957, + "grad_norm": 0.400390625, + "learning_rate": 4.994012580277013e-06, + "loss": 2.2621, + "step": 4788 + }, + { + "epoch": 0.2569206008583691, + "grad_norm": 0.400390625, + "learning_rate": 4.994006569706234e-06, + "loss": 2.3618, + "step": 4789 + }, + { + "epoch": 0.25697424892703863, + "grad_norm": 0.306640625, + "learning_rate": 4.994000556123684e-06, + "loss": 2.0198, + "step": 4790 + }, + { + "epoch": 0.25702789699570816, + "grad_norm": 0.330078125, + "learning_rate": 4.993994539529369e-06, + "loss": 2.2585, + "step": 4791 + }, + { + "epoch": 0.2570815450643777, + "grad_norm": 0.3984375, + "learning_rate": 4.993988519923296e-06, + "loss": 2.4709, + "step": 4792 + }, + { + "epoch": 0.2571351931330472, + "grad_norm": 0.48046875, + "learning_rate": 4.993982497305474e-06, + "loss": 2.5599, + "step": 4793 + }, + { + "epoch": 0.25718884120171676, + "grad_norm": 0.37890625, + "learning_rate": 4.99397647167591e-06, + "loss": 2.3292, + "step": 4794 + }, + { + "epoch": 0.2572424892703863, + "grad_norm": 0.58203125, + "learning_rate": 4.993970443034609e-06, + "loss": 2.4437, + "step": 4795 + }, + { + "epoch": 0.2572961373390558, + "grad_norm": 0.353515625, + "learning_rate": 4.993964411381581e-06, + "loss": 2.3087, + "step": 4796 + }, + { + "epoch": 0.2573497854077253, + "grad_norm": 0.388671875, + "learning_rate": 4.993958376716833e-06, + "loss": 2.1554, + "step": 4797 + }, + { + "epoch": 0.2574034334763948, + "grad_norm": 0.34765625, + "learning_rate": 4.9939523390403696e-06, + "loss": 2.1885, + "step": 4798 + }, + { + "epoch": 0.25745708154506436, + "grad_norm": 0.3515625, + "learning_rate": 4.993946298352201e-06, + "loss": 2.1787, + "step": 4799 + }, + { + "epoch": 0.2575107296137339, + "grad_norm": 0.4296875, + "learning_rate": 4.993940254652334e-06, + "loss": 2.259, + "step": 4800 + }, + { + "epoch": 0.2575643776824034, + "grad_norm": 0.37890625, + "learning_rate": 4.993934207940776e-06, + "loss": 2.4025, + "step": 4801 + }, + { + "epoch": 0.25761802575107295, + "grad_norm": 0.404296875, + "learning_rate": 4.993928158217532e-06, + "loss": 2.4618, + "step": 4802 + }, + { + "epoch": 0.2576716738197425, + "grad_norm": 0.390625, + "learning_rate": 4.993922105482611e-06, + "loss": 2.133, + "step": 4803 + }, + { + "epoch": 0.257725321888412, + "grad_norm": 0.365234375, + "learning_rate": 4.9939160497360215e-06, + "loss": 2.0485, + "step": 4804 + }, + { + "epoch": 0.25777896995708155, + "grad_norm": 0.80859375, + "learning_rate": 4.99390999097777e-06, + "loss": 1.4253, + "step": 4805 + }, + { + "epoch": 0.2578326180257511, + "grad_norm": 1.1875, + "learning_rate": 4.993903929207863e-06, + "loss": 2.2591, + "step": 4806 + }, + { + "epoch": 0.2578862660944206, + "grad_norm": 0.427734375, + "learning_rate": 4.993897864426307e-06, + "loss": 2.3127, + "step": 4807 + }, + { + "epoch": 0.25793991416309014, + "grad_norm": 0.37890625, + "learning_rate": 4.9938917966331114e-06, + "loss": 2.4395, + "step": 4808 + }, + { + "epoch": 0.2579935622317597, + "grad_norm": 0.30859375, + "learning_rate": 4.993885725828283e-06, + "loss": 2.3213, + "step": 4809 + }, + { + "epoch": 0.2580472103004292, + "grad_norm": 0.8125, + "learning_rate": 4.993879652011828e-06, + "loss": 1.3861, + "step": 4810 + }, + { + "epoch": 0.25810085836909874, + "grad_norm": 0.3828125, + "learning_rate": 4.9938735751837555e-06, + "loss": 2.2093, + "step": 4811 + }, + { + "epoch": 0.2581545064377682, + "grad_norm": 0.326171875, + "learning_rate": 4.993867495344072e-06, + "loss": 2.3305, + "step": 4812 + }, + { + "epoch": 0.25820815450643775, + "grad_norm": 0.359375, + "learning_rate": 4.993861412492784e-06, + "loss": 2.3882, + "step": 4813 + }, + { + "epoch": 0.2582618025751073, + "grad_norm": 0.337890625, + "learning_rate": 4.993855326629901e-06, + "loss": 2.4507, + "step": 4814 + }, + { + "epoch": 0.2583154506437768, + "grad_norm": 0.39453125, + "learning_rate": 4.993849237755427e-06, + "loss": 2.4912, + "step": 4815 + }, + { + "epoch": 0.25836909871244634, + "grad_norm": 0.3125, + "learning_rate": 4.993843145869372e-06, + "loss": 2.1408, + "step": 4816 + }, + { + "epoch": 0.2584227467811159, + "grad_norm": 0.59375, + "learning_rate": 4.993837050971744e-06, + "loss": 2.2315, + "step": 4817 + }, + { + "epoch": 0.2584763948497854, + "grad_norm": 0.375, + "learning_rate": 4.993830953062548e-06, + "loss": 2.1228, + "step": 4818 + }, + { + "epoch": 0.25853004291845494, + "grad_norm": 0.3359375, + "learning_rate": 4.993824852141792e-06, + "loss": 2.4653, + "step": 4819 + }, + { + "epoch": 0.25858369098712447, + "grad_norm": 0.390625, + "learning_rate": 4.993818748209485e-06, + "loss": 2.3596, + "step": 4820 + }, + { + "epoch": 0.258637339055794, + "grad_norm": 0.337890625, + "learning_rate": 4.993812641265632e-06, + "loss": 2.1944, + "step": 4821 + }, + { + "epoch": 0.25869098712446353, + "grad_norm": 2.015625, + "learning_rate": 4.9938065313102415e-06, + "loss": 2.2679, + "step": 4822 + }, + { + "epoch": 0.25874463519313307, + "grad_norm": 0.40234375, + "learning_rate": 4.993800418343321e-06, + "loss": 2.2612, + "step": 4823 + }, + { + "epoch": 0.2587982832618026, + "grad_norm": 0.357421875, + "learning_rate": 4.993794302364878e-06, + "loss": 2.1651, + "step": 4824 + }, + { + "epoch": 0.25885193133047213, + "grad_norm": 0.38671875, + "learning_rate": 4.99378818337492e-06, + "loss": 2.5119, + "step": 4825 + }, + { + "epoch": 0.2589055793991416, + "grad_norm": 0.439453125, + "learning_rate": 4.993782061373453e-06, + "loss": 2.315, + "step": 4826 + }, + { + "epoch": 0.25895922746781114, + "grad_norm": 0.373046875, + "learning_rate": 4.9937759363604865e-06, + "loss": 2.2105, + "step": 4827 + }, + { + "epoch": 0.25901287553648067, + "grad_norm": 0.34765625, + "learning_rate": 4.9937698083360266e-06, + "loss": 2.2735, + "step": 4828 + }, + { + "epoch": 0.2590665236051502, + "grad_norm": 0.46484375, + "learning_rate": 4.993763677300081e-06, + "loss": 1.4746, + "step": 4829 + }, + { + "epoch": 0.25912017167381973, + "grad_norm": 0.75390625, + "learning_rate": 4.9937575432526566e-06, + "loss": 2.3253, + "step": 4830 + }, + { + "epoch": 0.25917381974248926, + "grad_norm": 0.3515625, + "learning_rate": 4.993751406193762e-06, + "loss": 2.1768, + "step": 4831 + }, + { + "epoch": 0.2592274678111588, + "grad_norm": 1.21875, + "learning_rate": 4.993745266123403e-06, + "loss": 2.494, + "step": 4832 + }, + { + "epoch": 0.2592811158798283, + "grad_norm": 0.44921875, + "learning_rate": 4.9937391230415875e-06, + "loss": 2.2641, + "step": 4833 + }, + { + "epoch": 0.25933476394849786, + "grad_norm": 0.330078125, + "learning_rate": 4.993732976948325e-06, + "loss": 2.4023, + "step": 4834 + }, + { + "epoch": 0.2593884120171674, + "grad_norm": 0.37890625, + "learning_rate": 4.9937268278436196e-06, + "loss": 2.2027, + "step": 4835 + }, + { + "epoch": 0.2594420600858369, + "grad_norm": 0.3828125, + "learning_rate": 4.993720675727481e-06, + "loss": 2.3745, + "step": 4836 + }, + { + "epoch": 0.25949570815450645, + "grad_norm": 0.359375, + "learning_rate": 4.993714520599916e-06, + "loss": 2.2951, + "step": 4837 + }, + { + "epoch": 0.259549356223176, + "grad_norm": 0.359375, + "learning_rate": 4.993708362460931e-06, + "loss": 2.2517, + "step": 4838 + }, + { + "epoch": 0.2596030042918455, + "grad_norm": 0.734375, + "learning_rate": 4.993702201310537e-06, + "loss": 2.5814, + "step": 4839 + }, + { + "epoch": 0.259656652360515, + "grad_norm": 0.396484375, + "learning_rate": 4.9936960371487365e-06, + "loss": 2.4735, + "step": 4840 + }, + { + "epoch": 0.2597103004291845, + "grad_norm": 0.44921875, + "learning_rate": 4.993689869975539e-06, + "loss": 2.2918, + "step": 4841 + }, + { + "epoch": 0.25976394849785406, + "grad_norm": 0.357421875, + "learning_rate": 4.993683699790955e-06, + "loss": 2.0498, + "step": 4842 + }, + { + "epoch": 0.2598175965665236, + "grad_norm": 0.38671875, + "learning_rate": 4.993677526594987e-06, + "loss": 1.9634, + "step": 4843 + }, + { + "epoch": 0.2598712446351931, + "grad_norm": 0.470703125, + "learning_rate": 4.993671350387645e-06, + "loss": 2.2466, + "step": 4844 + }, + { + "epoch": 0.25992489270386265, + "grad_norm": 0.353515625, + "learning_rate": 4.993665171168937e-06, + "loss": 2.5521, + "step": 4845 + }, + { + "epoch": 0.2599785407725322, + "grad_norm": 0.365234375, + "learning_rate": 4.993658988938868e-06, + "loss": 2.361, + "step": 4846 + }, + { + "epoch": 0.2600321888412017, + "grad_norm": 0.34375, + "learning_rate": 4.993652803697448e-06, + "loss": 2.4399, + "step": 4847 + }, + { + "epoch": 0.26008583690987125, + "grad_norm": 0.3515625, + "learning_rate": 4.993646615444684e-06, + "loss": 2.4403, + "step": 4848 + }, + { + "epoch": 0.2601394849785408, + "grad_norm": 0.35546875, + "learning_rate": 4.993640424180582e-06, + "loss": 2.3884, + "step": 4849 + }, + { + "epoch": 0.2601931330472103, + "grad_norm": 0.376953125, + "learning_rate": 4.993634229905151e-06, + "loss": 2.1414, + "step": 4850 + }, + { + "epoch": 0.26024678111587984, + "grad_norm": 0.365234375, + "learning_rate": 4.993628032618398e-06, + "loss": 2.4007, + "step": 4851 + }, + { + "epoch": 0.2603004291845494, + "grad_norm": 0.388671875, + "learning_rate": 4.99362183232033e-06, + "loss": 2.3363, + "step": 4852 + }, + { + "epoch": 0.2603540772532189, + "grad_norm": 0.484375, + "learning_rate": 4.993615629010956e-06, + "loss": 2.3563, + "step": 4853 + }, + { + "epoch": 0.26040772532188844, + "grad_norm": 0.73828125, + "learning_rate": 4.9936094226902815e-06, + "loss": 2.309, + "step": 4854 + }, + { + "epoch": 0.2604613733905579, + "grad_norm": 0.34375, + "learning_rate": 4.993603213358315e-06, + "loss": 2.2151, + "step": 4855 + }, + { + "epoch": 0.26051502145922745, + "grad_norm": 0.39453125, + "learning_rate": 4.993597001015064e-06, + "loss": 1.6039, + "step": 4856 + }, + { + "epoch": 0.260568669527897, + "grad_norm": 0.40234375, + "learning_rate": 4.993590785660536e-06, + "loss": 2.0236, + "step": 4857 + }, + { + "epoch": 0.2606223175965665, + "grad_norm": 0.400390625, + "learning_rate": 4.993584567294738e-06, + "loss": 2.5387, + "step": 4858 + }, + { + "epoch": 0.26067596566523604, + "grad_norm": 0.359375, + "learning_rate": 4.993578345917679e-06, + "loss": 2.2011, + "step": 4859 + }, + { + "epoch": 0.2607296137339056, + "grad_norm": 0.341796875, + "learning_rate": 4.993572121529365e-06, + "loss": 2.325, + "step": 4860 + }, + { + "epoch": 0.2607832618025751, + "grad_norm": 0.5, + "learning_rate": 4.993565894129804e-06, + "loss": 2.0354, + "step": 4861 + }, + { + "epoch": 0.26083690987124464, + "grad_norm": 0.33984375, + "learning_rate": 4.993559663719003e-06, + "loss": 2.453, + "step": 4862 + }, + { + "epoch": 0.26089055793991417, + "grad_norm": 0.3828125, + "learning_rate": 4.99355343029697e-06, + "loss": 2.2747, + "step": 4863 + }, + { + "epoch": 0.2609442060085837, + "grad_norm": 0.462890625, + "learning_rate": 4.993547193863713e-06, + "loss": 2.3946, + "step": 4864 + }, + { + "epoch": 0.26099785407725323, + "grad_norm": 0.5390625, + "learning_rate": 4.993540954419238e-06, + "loss": 2.3723, + "step": 4865 + }, + { + "epoch": 0.26105150214592276, + "grad_norm": 0.349609375, + "learning_rate": 4.993534711963555e-06, + "loss": 2.3052, + "step": 4866 + }, + { + "epoch": 0.2611051502145923, + "grad_norm": 0.361328125, + "learning_rate": 4.99352846649667e-06, + "loss": 2.2389, + "step": 4867 + }, + { + "epoch": 0.26115879828326183, + "grad_norm": 0.33984375, + "learning_rate": 4.9935222180185895e-06, + "loss": 2.2687, + "step": 4868 + }, + { + "epoch": 0.2612124463519313, + "grad_norm": 0.33984375, + "learning_rate": 4.993515966529323e-06, + "loss": 1.5772, + "step": 4869 + }, + { + "epoch": 0.26126609442060084, + "grad_norm": 0.388671875, + "learning_rate": 4.993509712028878e-06, + "loss": 2.1788, + "step": 4870 + }, + { + "epoch": 0.26131974248927037, + "grad_norm": 0.369140625, + "learning_rate": 4.99350345451726e-06, + "loss": 2.3296, + "step": 4871 + }, + { + "epoch": 0.2613733905579399, + "grad_norm": 0.3984375, + "learning_rate": 4.9934971939944785e-06, + "loss": 2.3358, + "step": 4872 + }, + { + "epoch": 0.26142703862660943, + "grad_norm": 0.416015625, + "learning_rate": 4.993490930460541e-06, + "loss": 2.2922, + "step": 4873 + }, + { + "epoch": 0.26148068669527896, + "grad_norm": 0.447265625, + "learning_rate": 4.993484663915453e-06, + "loss": 2.1636, + "step": 4874 + }, + { + "epoch": 0.2615343347639485, + "grad_norm": 0.3515625, + "learning_rate": 4.993478394359225e-06, + "loss": 1.9936, + "step": 4875 + }, + { + "epoch": 0.261587982832618, + "grad_norm": 0.369140625, + "learning_rate": 4.993472121791863e-06, + "loss": 2.4369, + "step": 4876 + }, + { + "epoch": 0.26164163090128756, + "grad_norm": 0.36328125, + "learning_rate": 4.993465846213373e-06, + "loss": 2.2366, + "step": 4877 + }, + { + "epoch": 0.2616952789699571, + "grad_norm": 0.375, + "learning_rate": 4.993459567623766e-06, + "loss": 2.056, + "step": 4878 + }, + { + "epoch": 0.2617489270386266, + "grad_norm": 0.322265625, + "learning_rate": 4.993453286023048e-06, + "loss": 2.1045, + "step": 4879 + }, + { + "epoch": 0.26180257510729615, + "grad_norm": 0.4296875, + "learning_rate": 4.993447001411226e-06, + "loss": 2.4129, + "step": 4880 + }, + { + "epoch": 0.2618562231759657, + "grad_norm": 0.396484375, + "learning_rate": 4.993440713788308e-06, + "loss": 2.3454, + "step": 4881 + }, + { + "epoch": 0.2619098712446352, + "grad_norm": 0.38671875, + "learning_rate": 4.993434423154302e-06, + "loss": 2.1568, + "step": 4882 + }, + { + "epoch": 0.2619635193133047, + "grad_norm": 0.32421875, + "learning_rate": 4.993428129509214e-06, + "loss": 2.2348, + "step": 4883 + }, + { + "epoch": 0.2620171673819742, + "grad_norm": 0.361328125, + "learning_rate": 4.993421832853054e-06, + "loss": 1.8516, + "step": 4884 + }, + { + "epoch": 0.26207081545064376, + "grad_norm": 0.37890625, + "learning_rate": 4.993415533185829e-06, + "loss": 2.3146, + "step": 4885 + }, + { + "epoch": 0.2621244635193133, + "grad_norm": 0.4765625, + "learning_rate": 4.993409230507546e-06, + "loss": 2.3603, + "step": 4886 + }, + { + "epoch": 0.2621781115879828, + "grad_norm": 0.5390625, + "learning_rate": 4.9934029248182105e-06, + "loss": 2.3307, + "step": 4887 + }, + { + "epoch": 0.26223175965665235, + "grad_norm": 0.5, + "learning_rate": 4.993396616117834e-06, + "loss": 2.4954, + "step": 4888 + }, + { + "epoch": 0.2622854077253219, + "grad_norm": 0.435546875, + "learning_rate": 4.993390304406422e-06, + "loss": 2.3809, + "step": 4889 + }, + { + "epoch": 0.2623390557939914, + "grad_norm": 0.45703125, + "learning_rate": 4.993383989683983e-06, + "loss": 2.4799, + "step": 4890 + }, + { + "epoch": 0.26239270386266095, + "grad_norm": 0.33984375, + "learning_rate": 4.993377671950524e-06, + "loss": 2.2439, + "step": 4891 + }, + { + "epoch": 0.2624463519313305, + "grad_norm": 0.287109375, + "learning_rate": 4.993371351206052e-06, + "loss": 2.328, + "step": 4892 + }, + { + "epoch": 0.2625, + "grad_norm": 0.345703125, + "learning_rate": 4.993365027450576e-06, + "loss": 2.4062, + "step": 4893 + }, + { + "epoch": 0.26255364806866954, + "grad_norm": 0.484375, + "learning_rate": 4.993358700684104e-06, + "loss": 2.2748, + "step": 4894 + }, + { + "epoch": 0.2626072961373391, + "grad_norm": 0.765625, + "learning_rate": 4.993352370906641e-06, + "loss": 2.1193, + "step": 4895 + }, + { + "epoch": 0.2626609442060086, + "grad_norm": 0.365234375, + "learning_rate": 4.993346038118197e-06, + "loss": 2.2843, + "step": 4896 + }, + { + "epoch": 0.26271459227467814, + "grad_norm": 0.56640625, + "learning_rate": 4.9933397023187795e-06, + "loss": 2.4005, + "step": 4897 + }, + { + "epoch": 0.2627682403433476, + "grad_norm": 0.376953125, + "learning_rate": 4.993333363508395e-06, + "loss": 2.387, + "step": 4898 + }, + { + "epoch": 0.26282188841201715, + "grad_norm": 0.3515625, + "learning_rate": 4.993327021687052e-06, + "loss": 2.1467, + "step": 4899 + }, + { + "epoch": 0.2628755364806867, + "grad_norm": 0.369140625, + "learning_rate": 4.993320676854758e-06, + "loss": 2.4309, + "step": 4900 + }, + { + "epoch": 0.2629291845493562, + "grad_norm": 0.375, + "learning_rate": 4.99331432901152e-06, + "loss": 2.3343, + "step": 4901 + }, + { + "epoch": 0.26298283261802574, + "grad_norm": 0.3515625, + "learning_rate": 4.993307978157348e-06, + "loss": 2.4626, + "step": 4902 + }, + { + "epoch": 0.2630364806866953, + "grad_norm": 0.39453125, + "learning_rate": 4.993301624292246e-06, + "loss": 2.507, + "step": 4903 + }, + { + "epoch": 0.2630901287553648, + "grad_norm": 0.40625, + "learning_rate": 4.993295267416225e-06, + "loss": 2.2492, + "step": 4904 + }, + { + "epoch": 0.26314377682403434, + "grad_norm": 0.404296875, + "learning_rate": 4.99328890752929e-06, + "loss": 2.2859, + "step": 4905 + }, + { + "epoch": 0.26319742489270387, + "grad_norm": 0.345703125, + "learning_rate": 4.9932825446314515e-06, + "loss": 2.035, + "step": 4906 + }, + { + "epoch": 0.2632510729613734, + "grad_norm": 0.412109375, + "learning_rate": 4.993276178722715e-06, + "loss": 2.6707, + "step": 4907 + }, + { + "epoch": 0.26330472103004293, + "grad_norm": 0.34765625, + "learning_rate": 4.993269809803088e-06, + "loss": 2.185, + "step": 4908 + }, + { + "epoch": 0.26335836909871246, + "grad_norm": 0.333984375, + "learning_rate": 4.99326343787258e-06, + "loss": 2.4817, + "step": 4909 + }, + { + "epoch": 0.263412017167382, + "grad_norm": 0.68359375, + "learning_rate": 4.993257062931197e-06, + "loss": 2.5014, + "step": 4910 + }, + { + "epoch": 0.2634656652360515, + "grad_norm": 0.462890625, + "learning_rate": 4.993250684978948e-06, + "loss": 2.251, + "step": 4911 + }, + { + "epoch": 0.263519313304721, + "grad_norm": 0.349609375, + "learning_rate": 4.993244304015839e-06, + "loss": 2.0167, + "step": 4912 + }, + { + "epoch": 0.26357296137339054, + "grad_norm": 0.47265625, + "learning_rate": 4.993237920041881e-06, + "loss": 2.3506, + "step": 4913 + }, + { + "epoch": 0.26362660944206007, + "grad_norm": 0.33203125, + "learning_rate": 4.993231533057078e-06, + "loss": 2.2546, + "step": 4914 + }, + { + "epoch": 0.2636802575107296, + "grad_norm": 0.396484375, + "learning_rate": 4.99322514306144e-06, + "loss": 2.3556, + "step": 4915 + }, + { + "epoch": 0.26373390557939913, + "grad_norm": 0.3984375, + "learning_rate": 4.993218750054974e-06, + "loss": 2.3459, + "step": 4916 + }, + { + "epoch": 0.26378755364806866, + "grad_norm": 0.35546875, + "learning_rate": 4.993212354037687e-06, + "loss": 2.5977, + "step": 4917 + }, + { + "epoch": 0.2638412017167382, + "grad_norm": 0.322265625, + "learning_rate": 4.993205955009587e-06, + "loss": 2.3182, + "step": 4918 + }, + { + "epoch": 0.2638948497854077, + "grad_norm": 0.3515625, + "learning_rate": 4.993199552970683e-06, + "loss": 2.3904, + "step": 4919 + }, + { + "epoch": 0.26394849785407726, + "grad_norm": 0.45703125, + "learning_rate": 4.993193147920982e-06, + "loss": 2.7417, + "step": 4920 + }, + { + "epoch": 0.2640021459227468, + "grad_norm": 0.404296875, + "learning_rate": 4.993186739860491e-06, + "loss": 2.2467, + "step": 4921 + }, + { + "epoch": 0.2640557939914163, + "grad_norm": 0.349609375, + "learning_rate": 4.993180328789219e-06, + "loss": 2.2097, + "step": 4922 + }, + { + "epoch": 0.26410944206008585, + "grad_norm": 0.36328125, + "learning_rate": 4.9931739147071725e-06, + "loss": 2.3388, + "step": 4923 + }, + { + "epoch": 0.2641630901287554, + "grad_norm": 0.349609375, + "learning_rate": 4.99316749761436e-06, + "loss": 2.1425, + "step": 4924 + }, + { + "epoch": 0.2642167381974249, + "grad_norm": 0.470703125, + "learning_rate": 4.993161077510789e-06, + "loss": 2.3414, + "step": 4925 + }, + { + "epoch": 0.26427038626609445, + "grad_norm": 0.373046875, + "learning_rate": 4.993154654396467e-06, + "loss": 2.3111, + "step": 4926 + }, + { + "epoch": 0.2643240343347639, + "grad_norm": 2.265625, + "learning_rate": 4.9931482282714035e-06, + "loss": 2.3306, + "step": 4927 + }, + { + "epoch": 0.26437768240343346, + "grad_norm": 0.306640625, + "learning_rate": 4.993141799135603e-06, + "loss": 2.1948, + "step": 4928 + }, + { + "epoch": 0.264431330472103, + "grad_norm": 0.455078125, + "learning_rate": 4.993135366989077e-06, + "loss": 2.3245, + "step": 4929 + }, + { + "epoch": 0.2644849785407725, + "grad_norm": 0.400390625, + "learning_rate": 4.99312893183183e-06, + "loss": 2.4166, + "step": 4930 + }, + { + "epoch": 0.26453862660944205, + "grad_norm": 0.3671875, + "learning_rate": 4.993122493663872e-06, + "loss": 2.3027, + "step": 4931 + }, + { + "epoch": 0.2645922746781116, + "grad_norm": 0.408203125, + "learning_rate": 4.99311605248521e-06, + "loss": 2.2934, + "step": 4932 + }, + { + "epoch": 0.2646459227467811, + "grad_norm": 0.4375, + "learning_rate": 4.993109608295851e-06, + "loss": 2.2045, + "step": 4933 + }, + { + "epoch": 0.26469957081545065, + "grad_norm": 1.4765625, + "learning_rate": 4.993103161095803e-06, + "loss": 2.3317, + "step": 4934 + }, + { + "epoch": 0.2647532188841202, + "grad_norm": 0.380859375, + "learning_rate": 4.9930967108850756e-06, + "loss": 2.1631, + "step": 4935 + }, + { + "epoch": 0.2648068669527897, + "grad_norm": 0.44140625, + "learning_rate": 4.993090257663675e-06, + "loss": 2.1547, + "step": 4936 + }, + { + "epoch": 0.26486051502145924, + "grad_norm": 0.326171875, + "learning_rate": 4.993083801431609e-06, + "loss": 2.1155, + "step": 4937 + }, + { + "epoch": 0.2649141630901288, + "grad_norm": 0.36328125, + "learning_rate": 4.993077342188886e-06, + "loss": 2.2934, + "step": 4938 + }, + { + "epoch": 0.2649678111587983, + "grad_norm": 0.36328125, + "learning_rate": 4.993070879935512e-06, + "loss": 2.1962, + "step": 4939 + }, + { + "epoch": 0.26502145922746784, + "grad_norm": 0.376953125, + "learning_rate": 4.993064414671498e-06, + "loss": 2.2259, + "step": 4940 + }, + { + "epoch": 0.2650751072961373, + "grad_norm": 1.1796875, + "learning_rate": 4.9930579463968496e-06, + "loss": 2.0017, + "step": 4941 + }, + { + "epoch": 0.26512875536480685, + "grad_norm": 0.337890625, + "learning_rate": 4.9930514751115745e-06, + "loss": 1.8861, + "step": 4942 + }, + { + "epoch": 0.2651824034334764, + "grad_norm": 0.60546875, + "learning_rate": 4.993045000815682e-06, + "loss": 2.5265, + "step": 4943 + }, + { + "epoch": 0.2652360515021459, + "grad_norm": 0.5234375, + "learning_rate": 4.993038523509178e-06, + "loss": 2.2927, + "step": 4944 + }, + { + "epoch": 0.26528969957081544, + "grad_norm": 0.43359375, + "learning_rate": 4.993032043192072e-06, + "loss": 2.1994, + "step": 4945 + }, + { + "epoch": 0.26534334763948497, + "grad_norm": 0.48046875, + "learning_rate": 4.99302555986437e-06, + "loss": 2.5078, + "step": 4946 + }, + { + "epoch": 0.2653969957081545, + "grad_norm": 0.32421875, + "learning_rate": 4.993019073526083e-06, + "loss": 2.0539, + "step": 4947 + }, + { + "epoch": 0.26545064377682404, + "grad_norm": 0.361328125, + "learning_rate": 4.993012584177216e-06, + "loss": 2.4093, + "step": 4948 + }, + { + "epoch": 0.26550429184549357, + "grad_norm": 0.48828125, + "learning_rate": 4.993006091817777e-06, + "loss": 2.2649, + "step": 4949 + }, + { + "epoch": 0.2655579399141631, + "grad_norm": 0.38671875, + "learning_rate": 4.992999596447775e-06, + "loss": 2.5054, + "step": 4950 + }, + { + "epoch": 0.26561158798283263, + "grad_norm": 0.40234375, + "learning_rate": 4.992993098067218e-06, + "loss": 2.3637, + "step": 4951 + }, + { + "epoch": 0.26566523605150216, + "grad_norm": 0.416015625, + "learning_rate": 4.992986596676112e-06, + "loss": 1.7651, + "step": 4952 + }, + { + "epoch": 0.2657188841201717, + "grad_norm": 0.38671875, + "learning_rate": 4.992980092274466e-06, + "loss": 2.454, + "step": 4953 + }, + { + "epoch": 0.2657725321888412, + "grad_norm": 0.451171875, + "learning_rate": 4.99297358486229e-06, + "loss": 2.4954, + "step": 4954 + }, + { + "epoch": 0.2658261802575107, + "grad_norm": 0.390625, + "learning_rate": 4.992967074439587e-06, + "loss": 2.5539, + "step": 4955 + }, + { + "epoch": 0.26587982832618023, + "grad_norm": 0.337890625, + "learning_rate": 4.9929605610063695e-06, + "loss": 2.3086, + "step": 4956 + }, + { + "epoch": 0.26593347639484977, + "grad_norm": 0.357421875, + "learning_rate": 4.9929540445626425e-06, + "loss": 2.0953, + "step": 4957 + }, + { + "epoch": 0.2659871244635193, + "grad_norm": 0.40234375, + "learning_rate": 4.9929475251084156e-06, + "loss": 2.1411, + "step": 4958 + }, + { + "epoch": 0.26604077253218883, + "grad_norm": 0.376953125, + "learning_rate": 4.9929410026436954e-06, + "loss": 2.2366, + "step": 4959 + }, + { + "epoch": 0.26609442060085836, + "grad_norm": 0.41015625, + "learning_rate": 4.992934477168491e-06, + "loss": 2.4032, + "step": 4960 + }, + { + "epoch": 0.2661480686695279, + "grad_norm": 0.6171875, + "learning_rate": 4.992927948682809e-06, + "loss": 1.9532, + "step": 4961 + }, + { + "epoch": 0.2662017167381974, + "grad_norm": 0.40234375, + "learning_rate": 4.992921417186657e-06, + "loss": 2.0715, + "step": 4962 + }, + { + "epoch": 0.26625536480686696, + "grad_norm": 0.408203125, + "learning_rate": 4.992914882680044e-06, + "loss": 2.1604, + "step": 4963 + }, + { + "epoch": 0.2663090128755365, + "grad_norm": 0.359375, + "learning_rate": 4.992908345162979e-06, + "loss": 2.3723, + "step": 4964 + }, + { + "epoch": 0.266362660944206, + "grad_norm": 0.380859375, + "learning_rate": 4.992901804635468e-06, + "loss": 2.2722, + "step": 4965 + }, + { + "epoch": 0.26641630901287555, + "grad_norm": 0.349609375, + "learning_rate": 4.992895261097519e-06, + "loss": 2.2887, + "step": 4966 + }, + { + "epoch": 0.2664699570815451, + "grad_norm": 1.078125, + "learning_rate": 4.99288871454914e-06, + "loss": 2.2784, + "step": 4967 + }, + { + "epoch": 0.2665236051502146, + "grad_norm": 0.515625, + "learning_rate": 4.99288216499034e-06, + "loss": 2.3544, + "step": 4968 + }, + { + "epoch": 0.26657725321888415, + "grad_norm": 0.396484375, + "learning_rate": 4.992875612421126e-06, + "loss": 2.3615, + "step": 4969 + }, + { + "epoch": 0.2666309012875536, + "grad_norm": 0.373046875, + "learning_rate": 4.9928690568415066e-06, + "loss": 2.4178, + "step": 4970 + }, + { + "epoch": 0.26668454935622316, + "grad_norm": 0.54296875, + "learning_rate": 4.9928624982514885e-06, + "loss": 2.1519, + "step": 4971 + }, + { + "epoch": 0.2667381974248927, + "grad_norm": 0.380859375, + "learning_rate": 4.992855936651081e-06, + "loss": 2.1413, + "step": 4972 + }, + { + "epoch": 0.2667918454935622, + "grad_norm": 0.373046875, + "learning_rate": 4.992849372040291e-06, + "loss": 2.3162, + "step": 4973 + }, + { + "epoch": 0.26684549356223175, + "grad_norm": 0.302734375, + "learning_rate": 4.992842804419126e-06, + "loss": 1.986, + "step": 4974 + }, + { + "epoch": 0.2668991416309013, + "grad_norm": 0.333984375, + "learning_rate": 4.9928362337875944e-06, + "loss": 2.1559, + "step": 4975 + }, + { + "epoch": 0.2669527896995708, + "grad_norm": 0.37109375, + "learning_rate": 4.992829660145706e-06, + "loss": 2.2206, + "step": 4976 + }, + { + "epoch": 0.26700643776824035, + "grad_norm": 3.15625, + "learning_rate": 4.992823083493467e-06, + "loss": 2.1032, + "step": 4977 + }, + { + "epoch": 0.2670600858369099, + "grad_norm": 0.421875, + "learning_rate": 4.992816503830884e-06, + "loss": 2.162, + "step": 4978 + }, + { + "epoch": 0.2671137339055794, + "grad_norm": 0.419921875, + "learning_rate": 4.992809921157967e-06, + "loss": 2.5065, + "step": 4979 + }, + { + "epoch": 0.26716738197424894, + "grad_norm": 0.359375, + "learning_rate": 4.9928033354747245e-06, + "loss": 2.0983, + "step": 4980 + }, + { + "epoch": 0.2672210300429185, + "grad_norm": 0.35546875, + "learning_rate": 4.992796746781162e-06, + "loss": 2.4346, + "step": 4981 + }, + { + "epoch": 0.267274678111588, + "grad_norm": 0.37890625, + "learning_rate": 4.99279015507729e-06, + "loss": 2.4326, + "step": 4982 + }, + { + "epoch": 0.26732832618025754, + "grad_norm": 0.7109375, + "learning_rate": 4.992783560363115e-06, + "loss": 2.4709, + "step": 4983 + }, + { + "epoch": 0.267381974248927, + "grad_norm": 0.37109375, + "learning_rate": 4.992776962638645e-06, + "loss": 2.474, + "step": 4984 + }, + { + "epoch": 0.26743562231759654, + "grad_norm": 0.357421875, + "learning_rate": 4.992770361903888e-06, + "loss": 2.3476, + "step": 4985 + }, + { + "epoch": 0.2674892703862661, + "grad_norm": 0.375, + "learning_rate": 4.992763758158852e-06, + "loss": 2.4992, + "step": 4986 + }, + { + "epoch": 0.2675429184549356, + "grad_norm": 0.3828125, + "learning_rate": 4.992757151403545e-06, + "loss": 2.4473, + "step": 4987 + }, + { + "epoch": 0.26759656652360514, + "grad_norm": 0.455078125, + "learning_rate": 4.992750541637976e-06, + "loss": 2.3801, + "step": 4988 + }, + { + "epoch": 0.26765021459227467, + "grad_norm": 0.57421875, + "learning_rate": 4.992743928862151e-06, + "loss": 2.0942, + "step": 4989 + }, + { + "epoch": 0.2677038626609442, + "grad_norm": 0.32421875, + "learning_rate": 4.99273731307608e-06, + "loss": 2.2158, + "step": 4990 + }, + { + "epoch": 0.26775751072961373, + "grad_norm": 9.8125, + "learning_rate": 4.99273069427977e-06, + "loss": 2.3658, + "step": 4991 + }, + { + "epoch": 0.26781115879828327, + "grad_norm": 0.447265625, + "learning_rate": 4.9927240724732286e-06, + "loss": 2.3456, + "step": 4992 + }, + { + "epoch": 0.2678648068669528, + "grad_norm": 0.369140625, + "learning_rate": 4.992717447656464e-06, + "loss": 2.3782, + "step": 4993 + }, + { + "epoch": 0.26791845493562233, + "grad_norm": 0.3359375, + "learning_rate": 4.992710819829486e-06, + "loss": 2.2415, + "step": 4994 + }, + { + "epoch": 0.26797210300429186, + "grad_norm": 0.373046875, + "learning_rate": 4.9927041889923e-06, + "loss": 2.2869, + "step": 4995 + }, + { + "epoch": 0.2680257510729614, + "grad_norm": 0.392578125, + "learning_rate": 4.992697555144916e-06, + "loss": 2.411, + "step": 4996 + }, + { + "epoch": 0.2680793991416309, + "grad_norm": 0.337890625, + "learning_rate": 4.99269091828734e-06, + "loss": 2.0436, + "step": 4997 + }, + { + "epoch": 0.2681330472103004, + "grad_norm": 0.41796875, + "learning_rate": 4.992684278419581e-06, + "loss": 2.3723, + "step": 4998 + }, + { + "epoch": 0.26818669527896993, + "grad_norm": 0.4140625, + "learning_rate": 4.992677635541647e-06, + "loss": 2.1263, + "step": 4999 + }, + { + "epoch": 0.26824034334763946, + "grad_norm": 0.51953125, + "learning_rate": 4.992670989653546e-06, + "loss": 2.5489, + "step": 5000 + }, + { + "epoch": 0.268293991416309, + "grad_norm": 1.4375, + "learning_rate": 4.992664340755288e-06, + "loss": 2.452, + "step": 5001 + }, + { + "epoch": 0.26834763948497853, + "grad_norm": 0.365234375, + "learning_rate": 4.992657688846877e-06, + "loss": 2.047, + "step": 5002 + }, + { + "epoch": 0.26840128755364806, + "grad_norm": 0.388671875, + "learning_rate": 4.992651033928325e-06, + "loss": 2.3191, + "step": 5003 + }, + { + "epoch": 0.2684549356223176, + "grad_norm": 0.37890625, + "learning_rate": 4.992644375999638e-06, + "loss": 2.2052, + "step": 5004 + }, + { + "epoch": 0.2685085836909871, + "grad_norm": 0.37109375, + "learning_rate": 4.992637715060823e-06, + "loss": 2.2283, + "step": 5005 + }, + { + "epoch": 0.26856223175965666, + "grad_norm": 0.55078125, + "learning_rate": 4.99263105111189e-06, + "loss": 2.2526, + "step": 5006 + }, + { + "epoch": 0.2686158798283262, + "grad_norm": 0.35546875, + "learning_rate": 4.992624384152847e-06, + "loss": 2.1881, + "step": 5007 + }, + { + "epoch": 0.2686695278969957, + "grad_norm": 0.93359375, + "learning_rate": 4.992617714183701e-06, + "loss": 2.2417, + "step": 5008 + }, + { + "epoch": 0.26872317596566525, + "grad_norm": 0.353515625, + "learning_rate": 4.99261104120446e-06, + "loss": 2.5622, + "step": 5009 + }, + { + "epoch": 0.2687768240343348, + "grad_norm": 0.3515625, + "learning_rate": 4.9926043652151336e-06, + "loss": 2.2855, + "step": 5010 + }, + { + "epoch": 0.2688304721030043, + "grad_norm": 0.35546875, + "learning_rate": 4.992597686215728e-06, + "loss": 2.6003, + "step": 5011 + }, + { + "epoch": 0.26888412017167385, + "grad_norm": 0.427734375, + "learning_rate": 4.992591004206253e-06, + "loss": 2.3076, + "step": 5012 + }, + { + "epoch": 0.2689377682403433, + "grad_norm": 0.453125, + "learning_rate": 4.9925843191867155e-06, + "loss": 2.1238, + "step": 5013 + }, + { + "epoch": 0.26899141630901285, + "grad_norm": 0.369140625, + "learning_rate": 4.992577631157123e-06, + "loss": 2.4993, + "step": 5014 + }, + { + "epoch": 0.2690450643776824, + "grad_norm": 0.466796875, + "learning_rate": 4.992570940117485e-06, + "loss": 2.1892, + "step": 5015 + }, + { + "epoch": 0.2690987124463519, + "grad_norm": 0.31640625, + "learning_rate": 4.992564246067809e-06, + "loss": 2.3563, + "step": 5016 + }, + { + "epoch": 0.26915236051502145, + "grad_norm": 0.3515625, + "learning_rate": 4.992557549008102e-06, + "loss": 2.1353, + "step": 5017 + }, + { + "epoch": 0.269206008583691, + "grad_norm": 0.380859375, + "learning_rate": 4.992550848938374e-06, + "loss": 2.3228, + "step": 5018 + }, + { + "epoch": 0.2692596566523605, + "grad_norm": 0.37109375, + "learning_rate": 4.9925441458586335e-06, + "loss": 2.2086, + "step": 5019 + }, + { + "epoch": 0.26931330472103004, + "grad_norm": 0.8828125, + "learning_rate": 4.9925374397688866e-06, + "loss": 2.3077, + "step": 5020 + }, + { + "epoch": 0.2693669527896996, + "grad_norm": 0.8515625, + "learning_rate": 4.992530730669141e-06, + "loss": 2.2715, + "step": 5021 + }, + { + "epoch": 0.2694206008583691, + "grad_norm": 0.38671875, + "learning_rate": 4.992524018559407e-06, + "loss": 2.1349, + "step": 5022 + }, + { + "epoch": 0.26947424892703864, + "grad_norm": 0.396484375, + "learning_rate": 4.992517303439691e-06, + "loss": 2.3978, + "step": 5023 + }, + { + "epoch": 0.26952789699570817, + "grad_norm": 0.3828125, + "learning_rate": 4.992510585310003e-06, + "loss": 2.3116, + "step": 5024 + }, + { + "epoch": 0.2695815450643777, + "grad_norm": 0.30859375, + "learning_rate": 4.992503864170348e-06, + "loss": 2.4622, + "step": 5025 + }, + { + "epoch": 0.26963519313304724, + "grad_norm": 0.34375, + "learning_rate": 4.992497140020738e-06, + "loss": 2.2966, + "step": 5026 + }, + { + "epoch": 0.2696888412017167, + "grad_norm": 0.4453125, + "learning_rate": 4.992490412861178e-06, + "loss": 2.2699, + "step": 5027 + }, + { + "epoch": 0.26974248927038624, + "grad_norm": 0.4140625, + "learning_rate": 4.992483682691677e-06, + "loss": 2.2883, + "step": 5028 + }, + { + "epoch": 0.2697961373390558, + "grad_norm": 0.3828125, + "learning_rate": 4.992476949512244e-06, + "loss": 2.3325, + "step": 5029 + }, + { + "epoch": 0.2698497854077253, + "grad_norm": 0.4140625, + "learning_rate": 4.992470213322887e-06, + "loss": 2.5481, + "step": 5030 + }, + { + "epoch": 0.26990343347639484, + "grad_norm": 0.8125, + "learning_rate": 4.992463474123612e-06, + "loss": 2.3083, + "step": 5031 + }, + { + "epoch": 0.26995708154506437, + "grad_norm": 0.453125, + "learning_rate": 4.99245673191443e-06, + "loss": 2.5927, + "step": 5032 + }, + { + "epoch": 0.2700107296137339, + "grad_norm": 0.96484375, + "learning_rate": 4.992449986695348e-06, + "loss": 2.3018, + "step": 5033 + }, + { + "epoch": 0.27006437768240343, + "grad_norm": 0.76953125, + "learning_rate": 4.992443238466373e-06, + "loss": 2.3058, + "step": 5034 + }, + { + "epoch": 0.27011802575107297, + "grad_norm": 0.48046875, + "learning_rate": 4.9924364872275145e-06, + "loss": 2.4437, + "step": 5035 + }, + { + "epoch": 0.2701716738197425, + "grad_norm": 0.423828125, + "learning_rate": 4.992429732978781e-06, + "loss": 2.3307, + "step": 5036 + }, + { + "epoch": 0.27022532188841203, + "grad_norm": 0.3984375, + "learning_rate": 4.9924229757201795e-06, + "loss": 2.4955, + "step": 5037 + }, + { + "epoch": 0.27027896995708156, + "grad_norm": 0.369140625, + "learning_rate": 4.992416215451718e-06, + "loss": 2.3532, + "step": 5038 + }, + { + "epoch": 0.2703326180257511, + "grad_norm": 0.333984375, + "learning_rate": 4.992409452173407e-06, + "loss": 2.0064, + "step": 5039 + }, + { + "epoch": 0.2703862660944206, + "grad_norm": 0.3984375, + "learning_rate": 4.992402685885253e-06, + "loss": 2.2397, + "step": 5040 + }, + { + "epoch": 0.27043991416309016, + "grad_norm": 0.376953125, + "learning_rate": 4.992395916587263e-06, + "loss": 2.2216, + "step": 5041 + }, + { + "epoch": 0.27049356223175963, + "grad_norm": 0.33984375, + "learning_rate": 4.992389144279447e-06, + "loss": 2.2126, + "step": 5042 + }, + { + "epoch": 0.27054721030042916, + "grad_norm": 0.35546875, + "learning_rate": 4.992382368961812e-06, + "loss": 2.2708, + "step": 5043 + }, + { + "epoch": 0.2706008583690987, + "grad_norm": 0.4296875, + "learning_rate": 4.992375590634367e-06, + "loss": 2.5718, + "step": 5044 + }, + { + "epoch": 0.2706545064377682, + "grad_norm": 1.0234375, + "learning_rate": 4.992368809297119e-06, + "loss": 1.9375, + "step": 5045 + }, + { + "epoch": 0.27070815450643776, + "grad_norm": 0.373046875, + "learning_rate": 4.992362024950079e-06, + "loss": 2.2651, + "step": 5046 + }, + { + "epoch": 0.2707618025751073, + "grad_norm": 0.353515625, + "learning_rate": 4.992355237593252e-06, + "loss": 2.2594, + "step": 5047 + }, + { + "epoch": 0.2708154506437768, + "grad_norm": 0.369140625, + "learning_rate": 4.992348447226648e-06, + "loss": 2.2116, + "step": 5048 + }, + { + "epoch": 0.27086909871244635, + "grad_norm": 0.32421875, + "learning_rate": 4.9923416538502735e-06, + "loss": 1.9138, + "step": 5049 + }, + { + "epoch": 0.2709227467811159, + "grad_norm": 0.42578125, + "learning_rate": 4.9923348574641395e-06, + "loss": 2.5674, + "step": 5050 + }, + { + "epoch": 0.2709763948497854, + "grad_norm": 0.3359375, + "learning_rate": 4.992328058068252e-06, + "loss": 2.1531, + "step": 5051 + }, + { + "epoch": 0.27103004291845495, + "grad_norm": 0.419921875, + "learning_rate": 4.992321255662619e-06, + "loss": 2.4165, + "step": 5052 + }, + { + "epoch": 0.2710836909871245, + "grad_norm": 0.4296875, + "learning_rate": 4.992314450247251e-06, + "loss": 2.4065, + "step": 5053 + }, + { + "epoch": 0.271137339055794, + "grad_norm": 0.427734375, + "learning_rate": 4.992307641822154e-06, + "loss": 2.3825, + "step": 5054 + }, + { + "epoch": 0.27119098712446355, + "grad_norm": 0.3984375, + "learning_rate": 4.992300830387336e-06, + "loss": 2.1139, + "step": 5055 + }, + { + "epoch": 0.271244635193133, + "grad_norm": 0.421875, + "learning_rate": 4.9922940159428066e-06, + "loss": 2.483, + "step": 5056 + }, + { + "epoch": 0.27129828326180255, + "grad_norm": 0.40625, + "learning_rate": 4.9922871984885745e-06, + "loss": 2.3641, + "step": 5057 + }, + { + "epoch": 0.2713519313304721, + "grad_norm": 0.76171875, + "learning_rate": 4.992280378024646e-06, + "loss": 2.349, + "step": 5058 + }, + { + "epoch": 0.2714055793991416, + "grad_norm": 0.34375, + "learning_rate": 4.992273554551031e-06, + "loss": 2.5598, + "step": 5059 + }, + { + "epoch": 0.27145922746781115, + "grad_norm": 0.66015625, + "learning_rate": 4.992266728067737e-06, + "loss": 2.172, + "step": 5060 + }, + { + "epoch": 0.2715128755364807, + "grad_norm": 0.482421875, + "learning_rate": 4.9922598985747725e-06, + "loss": 1.9912, + "step": 5061 + }, + { + "epoch": 0.2715665236051502, + "grad_norm": 0.369140625, + "learning_rate": 4.9922530660721454e-06, + "loss": 2.1428, + "step": 5062 + }, + { + "epoch": 0.27162017167381974, + "grad_norm": 0.3984375, + "learning_rate": 4.9922462305598655e-06, + "loss": 2.5805, + "step": 5063 + }, + { + "epoch": 0.2716738197424893, + "grad_norm": 0.375, + "learning_rate": 4.992239392037938e-06, + "loss": 2.2617, + "step": 5064 + }, + { + "epoch": 0.2717274678111588, + "grad_norm": 0.42578125, + "learning_rate": 4.992232550506373e-06, + "loss": 2.4416, + "step": 5065 + }, + { + "epoch": 0.27178111587982834, + "grad_norm": 0.33984375, + "learning_rate": 4.9922257059651794e-06, + "loss": 2.5612, + "step": 5066 + }, + { + "epoch": 0.27183476394849787, + "grad_norm": 0.64453125, + "learning_rate": 4.992218858414364e-06, + "loss": 2.3146, + "step": 5067 + }, + { + "epoch": 0.2718884120171674, + "grad_norm": 0.404296875, + "learning_rate": 4.992212007853936e-06, + "loss": 2.2454, + "step": 5068 + }, + { + "epoch": 0.27194206008583693, + "grad_norm": 0.357421875, + "learning_rate": 4.9922051542839035e-06, + "loss": 2.124, + "step": 5069 + }, + { + "epoch": 0.2719957081545064, + "grad_norm": 0.380859375, + "learning_rate": 4.992198297704275e-06, + "loss": 2.5205, + "step": 5070 + }, + { + "epoch": 0.27204935622317594, + "grad_norm": 0.67578125, + "learning_rate": 4.992191438115058e-06, + "loss": 2.1473, + "step": 5071 + }, + { + "epoch": 0.2721030042918455, + "grad_norm": 1.4296875, + "learning_rate": 4.992184575516261e-06, + "loss": 2.111, + "step": 5072 + }, + { + "epoch": 0.272156652360515, + "grad_norm": 0.40625, + "learning_rate": 4.992177709907894e-06, + "loss": 2.285, + "step": 5073 + }, + { + "epoch": 0.27221030042918454, + "grad_norm": 0.31640625, + "learning_rate": 4.992170841289963e-06, + "loss": 2.2489, + "step": 5074 + }, + { + "epoch": 0.27226394849785407, + "grad_norm": 0.37109375, + "learning_rate": 4.992163969662477e-06, + "loss": 2.1996, + "step": 5075 + }, + { + "epoch": 0.2723175965665236, + "grad_norm": 0.7734375, + "learning_rate": 4.992157095025444e-06, + "loss": 2.2527, + "step": 5076 + }, + { + "epoch": 0.27237124463519313, + "grad_norm": 0.361328125, + "learning_rate": 4.992150217378873e-06, + "loss": 2.3962, + "step": 5077 + }, + { + "epoch": 0.27242489270386266, + "grad_norm": 0.359375, + "learning_rate": 4.992143336722772e-06, + "loss": 2.3166, + "step": 5078 + }, + { + "epoch": 0.2724785407725322, + "grad_norm": 0.361328125, + "learning_rate": 4.99213645305715e-06, + "loss": 2.1957, + "step": 5079 + }, + { + "epoch": 0.27253218884120173, + "grad_norm": 0.337890625, + "learning_rate": 4.992129566382014e-06, + "loss": 2.2714, + "step": 5080 + }, + { + "epoch": 0.27258583690987126, + "grad_norm": 0.46484375, + "learning_rate": 4.992122676697374e-06, + "loss": 2.3909, + "step": 5081 + }, + { + "epoch": 0.2726394849785408, + "grad_norm": 0.3515625, + "learning_rate": 4.992115784003237e-06, + "loss": 2.2363, + "step": 5082 + }, + { + "epoch": 0.2726931330472103, + "grad_norm": 0.474609375, + "learning_rate": 4.992108888299611e-06, + "loss": 2.5571, + "step": 5083 + }, + { + "epoch": 0.27274678111587985, + "grad_norm": 0.52734375, + "learning_rate": 4.9921019895865055e-06, + "loss": 2.1576, + "step": 5084 + }, + { + "epoch": 0.27280042918454933, + "grad_norm": 0.3125, + "learning_rate": 4.9920950878639275e-06, + "loss": 2.2718, + "step": 5085 + }, + { + "epoch": 0.27285407725321886, + "grad_norm": 0.416015625, + "learning_rate": 4.9920881831318865e-06, + "loss": 2.2929, + "step": 5086 + }, + { + "epoch": 0.2729077253218884, + "grad_norm": 0.353515625, + "learning_rate": 4.9920812753903915e-06, + "loss": 2.1213, + "step": 5087 + }, + { + "epoch": 0.2729613733905579, + "grad_norm": 0.359375, + "learning_rate": 4.992074364639449e-06, + "loss": 2.3679, + "step": 5088 + }, + { + "epoch": 0.27301502145922746, + "grad_norm": 0.33984375, + "learning_rate": 4.992067450879068e-06, + "loss": 2.1537, + "step": 5089 + }, + { + "epoch": 0.273068669527897, + "grad_norm": 0.4296875, + "learning_rate": 4.992060534109257e-06, + "loss": 2.6034, + "step": 5090 + }, + { + "epoch": 0.2731223175965665, + "grad_norm": 0.38671875, + "learning_rate": 4.992053614330025e-06, + "loss": 2.4404, + "step": 5091 + }, + { + "epoch": 0.27317596566523605, + "grad_norm": 0.5078125, + "learning_rate": 4.992046691541379e-06, + "loss": 2.4837, + "step": 5092 + }, + { + "epoch": 0.2732296137339056, + "grad_norm": 0.37890625, + "learning_rate": 4.992039765743328e-06, + "loss": 2.3586, + "step": 5093 + }, + { + "epoch": 0.2732832618025751, + "grad_norm": 0.486328125, + "learning_rate": 4.9920328369358815e-06, + "loss": 2.599, + "step": 5094 + }, + { + "epoch": 0.27333690987124465, + "grad_norm": 0.365234375, + "learning_rate": 4.9920259051190455e-06, + "loss": 2.553, + "step": 5095 + }, + { + "epoch": 0.2733905579399142, + "grad_norm": 0.53125, + "learning_rate": 4.99201897029283e-06, + "loss": 2.3297, + "step": 5096 + }, + { + "epoch": 0.2734442060085837, + "grad_norm": 0.357421875, + "learning_rate": 4.992012032457244e-06, + "loss": 2.3599, + "step": 5097 + }, + { + "epoch": 0.27349785407725324, + "grad_norm": 0.365234375, + "learning_rate": 4.9920050916122945e-06, + "loss": 2.3948, + "step": 5098 + }, + { + "epoch": 0.2735515021459227, + "grad_norm": 0.3984375, + "learning_rate": 4.9919981477579894e-06, + "loss": 2.1227, + "step": 5099 + }, + { + "epoch": 0.27360515021459225, + "grad_norm": 0.39453125, + "learning_rate": 4.991991200894339e-06, + "loss": 2.0401, + "step": 5100 + }, + { + "epoch": 0.2736587982832618, + "grad_norm": 0.404296875, + "learning_rate": 4.99198425102135e-06, + "loss": 2.2095, + "step": 5101 + }, + { + "epoch": 0.2737124463519313, + "grad_norm": 0.375, + "learning_rate": 4.991977298139032e-06, + "loss": 2.0839, + "step": 5102 + }, + { + "epoch": 0.27376609442060085, + "grad_norm": 0.388671875, + "learning_rate": 4.991970342247393e-06, + "loss": 2.2977, + "step": 5103 + }, + { + "epoch": 0.2738197424892704, + "grad_norm": 0.400390625, + "learning_rate": 4.991963383346441e-06, + "loss": 2.4523, + "step": 5104 + }, + { + "epoch": 0.2738733905579399, + "grad_norm": 0.369140625, + "learning_rate": 4.991956421436184e-06, + "loss": 2.4114, + "step": 5105 + }, + { + "epoch": 0.27392703862660944, + "grad_norm": 0.43359375, + "learning_rate": 4.991949456516632e-06, + "loss": 2.3609, + "step": 5106 + }, + { + "epoch": 0.273980686695279, + "grad_norm": 0.4609375, + "learning_rate": 4.991942488587792e-06, + "loss": 2.635, + "step": 5107 + }, + { + "epoch": 0.2740343347639485, + "grad_norm": 0.384765625, + "learning_rate": 4.991935517649673e-06, + "loss": 1.987, + "step": 5108 + }, + { + "epoch": 0.27408798283261804, + "grad_norm": 0.52734375, + "learning_rate": 4.991928543702284e-06, + "loss": 2.3427, + "step": 5109 + }, + { + "epoch": 0.27414163090128757, + "grad_norm": 0.404296875, + "learning_rate": 4.991921566745632e-06, + "loss": 2.262, + "step": 5110 + }, + { + "epoch": 0.2741952789699571, + "grad_norm": 0.9453125, + "learning_rate": 4.991914586779727e-06, + "loss": 2.4553, + "step": 5111 + }, + { + "epoch": 0.27424892703862663, + "grad_norm": 0.41796875, + "learning_rate": 4.991907603804576e-06, + "loss": 2.3784, + "step": 5112 + }, + { + "epoch": 0.27430257510729616, + "grad_norm": 0.373046875, + "learning_rate": 4.9919006178201875e-06, + "loss": 2.3033, + "step": 5113 + }, + { + "epoch": 0.27435622317596564, + "grad_norm": 0.6796875, + "learning_rate": 4.991893628826571e-06, + "loss": 2.2389, + "step": 5114 + }, + { + "epoch": 0.2744098712446352, + "grad_norm": 0.6328125, + "learning_rate": 4.991886636823734e-06, + "loss": 2.431, + "step": 5115 + }, + { + "epoch": 0.2744635193133047, + "grad_norm": 0.3984375, + "learning_rate": 4.991879641811686e-06, + "loss": 2.3363, + "step": 5116 + }, + { + "epoch": 0.27451716738197424, + "grad_norm": 0.3984375, + "learning_rate": 4.991872643790435e-06, + "loss": 2.2569, + "step": 5117 + }, + { + "epoch": 0.27457081545064377, + "grad_norm": 0.396484375, + "learning_rate": 4.9918656427599886e-06, + "loss": 2.2352, + "step": 5118 + }, + { + "epoch": 0.2746244635193133, + "grad_norm": 0.41015625, + "learning_rate": 4.991858638720356e-06, + "loss": 2.1851, + "step": 5119 + }, + { + "epoch": 0.27467811158798283, + "grad_norm": 0.353515625, + "learning_rate": 4.991851631671546e-06, + "loss": 2.2586, + "step": 5120 + }, + { + "epoch": 0.27473175965665236, + "grad_norm": 0.408203125, + "learning_rate": 4.991844621613566e-06, + "loss": 2.3996, + "step": 5121 + }, + { + "epoch": 0.2747854077253219, + "grad_norm": 0.53515625, + "learning_rate": 4.991837608546426e-06, + "loss": 2.1715, + "step": 5122 + }, + { + "epoch": 0.2748390557939914, + "grad_norm": 0.435546875, + "learning_rate": 4.991830592470132e-06, + "loss": 2.4366, + "step": 5123 + }, + { + "epoch": 0.27489270386266096, + "grad_norm": 0.447265625, + "learning_rate": 4.991823573384695e-06, + "loss": 2.4054, + "step": 5124 + }, + { + "epoch": 0.2749463519313305, + "grad_norm": 0.3359375, + "learning_rate": 4.991816551290124e-06, + "loss": 2.3254, + "step": 5125 + }, + { + "epoch": 0.275, + "grad_norm": 0.375, + "learning_rate": 4.991809526186424e-06, + "loss": 2.3396, + "step": 5126 + }, + { + "epoch": 0.27505364806866955, + "grad_norm": 0.466796875, + "learning_rate": 4.991802498073606e-06, + "loss": 2.2835, + "step": 5127 + }, + { + "epoch": 0.27510729613733903, + "grad_norm": 0.396484375, + "learning_rate": 4.991795466951678e-06, + "loss": 2.2784, + "step": 5128 + }, + { + "epoch": 0.27516094420600856, + "grad_norm": 0.392578125, + "learning_rate": 4.9917884328206485e-06, + "loss": 2.0393, + "step": 5129 + }, + { + "epoch": 0.2752145922746781, + "grad_norm": 0.404296875, + "learning_rate": 4.991781395680526e-06, + "loss": 2.626, + "step": 5130 + }, + { + "epoch": 0.2752682403433476, + "grad_norm": 0.361328125, + "learning_rate": 4.991774355531319e-06, + "loss": 2.0335, + "step": 5131 + }, + { + "epoch": 0.27532188841201716, + "grad_norm": 0.63671875, + "learning_rate": 4.991767312373036e-06, + "loss": 2.3117, + "step": 5132 + }, + { + "epoch": 0.2753755364806867, + "grad_norm": 0.36328125, + "learning_rate": 4.991760266205685e-06, + "loss": 2.3114, + "step": 5133 + }, + { + "epoch": 0.2754291845493562, + "grad_norm": 5.15625, + "learning_rate": 4.9917532170292745e-06, + "loss": 1.8617, + "step": 5134 + }, + { + "epoch": 0.27548283261802575, + "grad_norm": 0.375, + "learning_rate": 4.991746164843814e-06, + "loss": 2.2252, + "step": 5135 + }, + { + "epoch": 0.2755364806866953, + "grad_norm": 1.109375, + "learning_rate": 4.991739109649313e-06, + "loss": 2.4509, + "step": 5136 + }, + { + "epoch": 0.2755901287553648, + "grad_norm": 0.46875, + "learning_rate": 4.991732051445777e-06, + "loss": 2.1235, + "step": 5137 + }, + { + "epoch": 0.27564377682403435, + "grad_norm": 0.39453125, + "learning_rate": 4.991724990233216e-06, + "loss": 2.3104, + "step": 5138 + }, + { + "epoch": 0.2756974248927039, + "grad_norm": 0.34375, + "learning_rate": 4.991717926011639e-06, + "loss": 2.2875, + "step": 5139 + }, + { + "epoch": 0.2757510729613734, + "grad_norm": 0.447265625, + "learning_rate": 4.9917108587810535e-06, + "loss": 2.3702, + "step": 5140 + }, + { + "epoch": 0.27580472103004294, + "grad_norm": 0.361328125, + "learning_rate": 4.991703788541469e-06, + "loss": 2.3943, + "step": 5141 + }, + { + "epoch": 0.2758583690987124, + "grad_norm": 0.341796875, + "learning_rate": 4.991696715292894e-06, + "loss": 2.2307, + "step": 5142 + }, + { + "epoch": 0.27591201716738195, + "grad_norm": 0.439453125, + "learning_rate": 4.991689639035337e-06, + "loss": 2.4676, + "step": 5143 + }, + { + "epoch": 0.2759656652360515, + "grad_norm": 0.43359375, + "learning_rate": 4.991682559768805e-06, + "loss": 2.2842, + "step": 5144 + }, + { + "epoch": 0.276019313304721, + "grad_norm": 0.35546875, + "learning_rate": 4.9916754774933085e-06, + "loss": 2.3996, + "step": 5145 + }, + { + "epoch": 0.27607296137339055, + "grad_norm": 0.5859375, + "learning_rate": 4.991668392208855e-06, + "loss": 2.2499, + "step": 5146 + }, + { + "epoch": 0.2761266094420601, + "grad_norm": 0.337890625, + "learning_rate": 4.991661303915454e-06, + "loss": 2.3632, + "step": 5147 + }, + { + "epoch": 0.2761802575107296, + "grad_norm": 0.4453125, + "learning_rate": 4.991654212613113e-06, + "loss": 2.1815, + "step": 5148 + }, + { + "epoch": 0.27623390557939914, + "grad_norm": 0.412109375, + "learning_rate": 4.991647118301842e-06, + "loss": 2.1539, + "step": 5149 + }, + { + "epoch": 0.2762875536480687, + "grad_norm": 1.1015625, + "learning_rate": 4.991640020981647e-06, + "loss": 2.491, + "step": 5150 + }, + { + "epoch": 0.2763412017167382, + "grad_norm": 0.34765625, + "learning_rate": 4.9916329206525395e-06, + "loss": 2.3095, + "step": 5151 + }, + { + "epoch": 0.27639484978540774, + "grad_norm": 0.396484375, + "learning_rate": 4.991625817314526e-06, + "loss": 2.4221, + "step": 5152 + }, + { + "epoch": 0.27644849785407727, + "grad_norm": 0.6484375, + "learning_rate": 4.991618710967616e-06, + "loss": 2.4298, + "step": 5153 + }, + { + "epoch": 0.2765021459227468, + "grad_norm": 0.361328125, + "learning_rate": 4.9916116016118185e-06, + "loss": 2.3836, + "step": 5154 + }, + { + "epoch": 0.27655579399141633, + "grad_norm": 0.361328125, + "learning_rate": 4.991604489247141e-06, + "loss": 2.246, + "step": 5155 + }, + { + "epoch": 0.27660944206008586, + "grad_norm": 0.412109375, + "learning_rate": 4.991597373873592e-06, + "loss": 2.3317, + "step": 5156 + }, + { + "epoch": 0.27666309012875534, + "grad_norm": 0.412109375, + "learning_rate": 4.991590255491182e-06, + "loss": 2.656, + "step": 5157 + }, + { + "epoch": 0.27671673819742487, + "grad_norm": 0.365234375, + "learning_rate": 4.9915831340999175e-06, + "loss": 2.4295, + "step": 5158 + }, + { + "epoch": 0.2767703862660944, + "grad_norm": 0.46875, + "learning_rate": 4.991576009699808e-06, + "loss": 2.4294, + "step": 5159 + }, + { + "epoch": 0.27682403433476394, + "grad_norm": 0.33203125, + "learning_rate": 4.991568882290861e-06, + "loss": 2.3048, + "step": 5160 + }, + { + "epoch": 0.27687768240343347, + "grad_norm": 0.375, + "learning_rate": 4.991561751873087e-06, + "loss": 2.4851, + "step": 5161 + }, + { + "epoch": 0.276931330472103, + "grad_norm": 0.466796875, + "learning_rate": 4.991554618446494e-06, + "loss": 2.5925, + "step": 5162 + }, + { + "epoch": 0.27698497854077253, + "grad_norm": 0.359375, + "learning_rate": 4.99154748201109e-06, + "loss": 2.1018, + "step": 5163 + }, + { + "epoch": 0.27703862660944206, + "grad_norm": 0.361328125, + "learning_rate": 4.991540342566884e-06, + "loss": 2.3189, + "step": 5164 + }, + { + "epoch": 0.2770922746781116, + "grad_norm": 0.48046875, + "learning_rate": 4.991533200113884e-06, + "loss": 2.3458, + "step": 5165 + }, + { + "epoch": 0.2771459227467811, + "grad_norm": 1.296875, + "learning_rate": 4.9915260546521e-06, + "loss": 2.002, + "step": 5166 + }, + { + "epoch": 0.27719957081545066, + "grad_norm": 0.341796875, + "learning_rate": 4.99151890618154e-06, + "loss": 2.3168, + "step": 5167 + }, + { + "epoch": 0.2772532188841202, + "grad_norm": 0.37890625, + "learning_rate": 4.991511754702211e-06, + "loss": 2.0718, + "step": 5168 + }, + { + "epoch": 0.2773068669527897, + "grad_norm": 0.55078125, + "learning_rate": 4.991504600214124e-06, + "loss": 2.5256, + "step": 5169 + }, + { + "epoch": 0.27736051502145925, + "grad_norm": 0.38671875, + "learning_rate": 4.991497442717287e-06, + "loss": 2.541, + "step": 5170 + }, + { + "epoch": 0.27741416309012873, + "grad_norm": 0.359375, + "learning_rate": 4.991490282211707e-06, + "loss": 2.26, + "step": 5171 + }, + { + "epoch": 0.27746781115879826, + "grad_norm": 0.37109375, + "learning_rate": 4.991483118697396e-06, + "loss": 2.275, + "step": 5172 + }, + { + "epoch": 0.2775214592274678, + "grad_norm": 0.3984375, + "learning_rate": 4.99147595217436e-06, + "loss": 2.2902, + "step": 5173 + }, + { + "epoch": 0.2775751072961373, + "grad_norm": 0.37890625, + "learning_rate": 4.9914687826426075e-06, + "loss": 2.3622, + "step": 5174 + }, + { + "epoch": 0.27762875536480686, + "grad_norm": 0.423828125, + "learning_rate": 4.991461610102148e-06, + "loss": 2.3679, + "step": 5175 + }, + { + "epoch": 0.2776824034334764, + "grad_norm": 0.341796875, + "learning_rate": 4.991454434552991e-06, + "loss": 2.2834, + "step": 5176 + }, + { + "epoch": 0.2777360515021459, + "grad_norm": 0.435546875, + "learning_rate": 4.991447255995144e-06, + "loss": 2.1363, + "step": 5177 + }, + { + "epoch": 0.27778969957081545, + "grad_norm": 0.44921875, + "learning_rate": 4.991440074428615e-06, + "loss": 2.2917, + "step": 5178 + }, + { + "epoch": 0.277843347639485, + "grad_norm": 0.486328125, + "learning_rate": 4.991432889853414e-06, + "loss": 2.1723, + "step": 5179 + }, + { + "epoch": 0.2778969957081545, + "grad_norm": 0.392578125, + "learning_rate": 4.99142570226955e-06, + "loss": 2.5419, + "step": 5180 + }, + { + "epoch": 0.27795064377682405, + "grad_norm": 0.37109375, + "learning_rate": 4.991418511677031e-06, + "loss": 2.5307, + "step": 5181 + }, + { + "epoch": 0.2780042918454936, + "grad_norm": 0.37890625, + "learning_rate": 4.991411318075865e-06, + "loss": 2.3334, + "step": 5182 + }, + { + "epoch": 0.2780579399141631, + "grad_norm": 0.4296875, + "learning_rate": 4.9914041214660615e-06, + "loss": 2.152, + "step": 5183 + }, + { + "epoch": 0.27811158798283264, + "grad_norm": 0.376953125, + "learning_rate": 4.99139692184763e-06, + "loss": 2.1475, + "step": 5184 + }, + { + "epoch": 0.2781652360515021, + "grad_norm": 0.4140625, + "learning_rate": 4.9913897192205765e-06, + "loss": 2.5287, + "step": 5185 + }, + { + "epoch": 0.27821888412017165, + "grad_norm": 0.416015625, + "learning_rate": 4.991382513584912e-06, + "loss": 2.3435, + "step": 5186 + }, + { + "epoch": 0.2782725321888412, + "grad_norm": 0.390625, + "learning_rate": 4.991375304940646e-06, + "loss": 2.3325, + "step": 5187 + }, + { + "epoch": 0.2783261802575107, + "grad_norm": 0.37890625, + "learning_rate": 4.991368093287785e-06, + "loss": 2.2279, + "step": 5188 + }, + { + "epoch": 0.27837982832618025, + "grad_norm": 0.609375, + "learning_rate": 4.991360878626338e-06, + "loss": 2.4165, + "step": 5189 + }, + { + "epoch": 0.2784334763948498, + "grad_norm": 0.337890625, + "learning_rate": 4.991353660956315e-06, + "loss": 1.9797, + "step": 5190 + }, + { + "epoch": 0.2784871244635193, + "grad_norm": 0.341796875, + "learning_rate": 4.991346440277723e-06, + "loss": 2.3166, + "step": 5191 + }, + { + "epoch": 0.27854077253218884, + "grad_norm": 0.361328125, + "learning_rate": 4.9913392165905725e-06, + "loss": 2.3497, + "step": 5192 + }, + { + "epoch": 0.2785944206008584, + "grad_norm": 0.365234375, + "learning_rate": 4.991331989894872e-06, + "loss": 2.3766, + "step": 5193 + }, + { + "epoch": 0.2786480686695279, + "grad_norm": 0.376953125, + "learning_rate": 4.991324760190629e-06, + "loss": 2.3642, + "step": 5194 + }, + { + "epoch": 0.27870171673819744, + "grad_norm": 0.50390625, + "learning_rate": 4.9913175274778525e-06, + "loss": 2.445, + "step": 5195 + }, + { + "epoch": 0.27875536480686697, + "grad_norm": 0.447265625, + "learning_rate": 4.991310291756551e-06, + "loss": 2.3595, + "step": 5196 + }, + { + "epoch": 0.2788090128755365, + "grad_norm": 0.4140625, + "learning_rate": 4.991303053026736e-06, + "loss": 2.4638, + "step": 5197 + }, + { + "epoch": 0.27886266094420603, + "grad_norm": 0.55078125, + "learning_rate": 4.991295811288412e-06, + "loss": 1.6884, + "step": 5198 + }, + { + "epoch": 0.27891630901287556, + "grad_norm": 0.412109375, + "learning_rate": 4.991288566541591e-06, + "loss": 2.5506, + "step": 5199 + }, + { + "epoch": 0.27896995708154504, + "grad_norm": 0.6484375, + "learning_rate": 4.9912813187862805e-06, + "loss": 2.0569, + "step": 5200 + }, + { + "epoch": 0.27902360515021457, + "grad_norm": 0.330078125, + "learning_rate": 4.991274068022489e-06, + "loss": 2.2476, + "step": 5201 + }, + { + "epoch": 0.2790772532188841, + "grad_norm": 0.6953125, + "learning_rate": 4.991266814250226e-06, + "loss": 2.2615, + "step": 5202 + }, + { + "epoch": 0.27913090128755363, + "grad_norm": 0.486328125, + "learning_rate": 4.9912595574695e-06, + "loss": 2.4643, + "step": 5203 + }, + { + "epoch": 0.27918454935622317, + "grad_norm": 0.35546875, + "learning_rate": 4.991252297680319e-06, + "loss": 2.282, + "step": 5204 + }, + { + "epoch": 0.2792381974248927, + "grad_norm": 0.345703125, + "learning_rate": 4.9912450348826925e-06, + "loss": 1.9604, + "step": 5205 + }, + { + "epoch": 0.27929184549356223, + "grad_norm": 0.36328125, + "learning_rate": 4.991237769076629e-06, + "loss": 2.3608, + "step": 5206 + }, + { + "epoch": 0.27934549356223176, + "grad_norm": 0.404296875, + "learning_rate": 4.991230500262139e-06, + "loss": 2.3651, + "step": 5207 + }, + { + "epoch": 0.2793991416309013, + "grad_norm": 0.361328125, + "learning_rate": 4.991223228439228e-06, + "loss": 1.7528, + "step": 5208 + }, + { + "epoch": 0.2794527896995708, + "grad_norm": 0.419921875, + "learning_rate": 4.9912159536079066e-06, + "loss": 2.3706, + "step": 5209 + }, + { + "epoch": 0.27950643776824036, + "grad_norm": 0.4609375, + "learning_rate": 4.991208675768184e-06, + "loss": 2.2193, + "step": 5210 + }, + { + "epoch": 0.2795600858369099, + "grad_norm": 0.443359375, + "learning_rate": 4.991201394920068e-06, + "loss": 2.4904, + "step": 5211 + }, + { + "epoch": 0.2796137339055794, + "grad_norm": 0.4453125, + "learning_rate": 4.991194111063569e-06, + "loss": 2.1136, + "step": 5212 + }, + { + "epoch": 0.27966738197424895, + "grad_norm": 0.35546875, + "learning_rate": 4.991186824198693e-06, + "loss": 2.3178, + "step": 5213 + }, + { + "epoch": 0.27972103004291843, + "grad_norm": 0.4765625, + "learning_rate": 4.9911795343254515e-06, + "loss": 2.3938, + "step": 5214 + }, + { + "epoch": 0.27977467811158796, + "grad_norm": 0.87890625, + "learning_rate": 4.991172241443852e-06, + "loss": 2.5493, + "step": 5215 + }, + { + "epoch": 0.2798283261802575, + "grad_norm": 0.30078125, + "learning_rate": 4.991164945553904e-06, + "loss": 2.1791, + "step": 5216 + }, + { + "epoch": 0.279881974248927, + "grad_norm": 0.34375, + "learning_rate": 4.991157646655616e-06, + "loss": 2.1422, + "step": 5217 + }, + { + "epoch": 0.27993562231759656, + "grad_norm": 0.357421875, + "learning_rate": 4.991150344748995e-06, + "loss": 2.2829, + "step": 5218 + }, + { + "epoch": 0.2799892703862661, + "grad_norm": 0.375, + "learning_rate": 4.991143039834053e-06, + "loss": 2.3739, + "step": 5219 + }, + { + "epoch": 0.2800429184549356, + "grad_norm": 0.33203125, + "learning_rate": 4.991135731910796e-06, + "loss": 2.1085, + "step": 5220 + }, + { + "epoch": 0.28009656652360515, + "grad_norm": 0.421875, + "learning_rate": 4.991128420979237e-06, + "loss": 2.1136, + "step": 5221 + }, + { + "epoch": 0.2801502145922747, + "grad_norm": 0.466796875, + "learning_rate": 4.991121107039379e-06, + "loss": 2.423, + "step": 5222 + }, + { + "epoch": 0.2802038626609442, + "grad_norm": 0.49609375, + "learning_rate": 4.991113790091235e-06, + "loss": 2.666, + "step": 5223 + }, + { + "epoch": 0.28025751072961375, + "grad_norm": 0.5078125, + "learning_rate": 4.991106470134813e-06, + "loss": 2.2345, + "step": 5224 + }, + { + "epoch": 0.2803111587982833, + "grad_norm": 0.369140625, + "learning_rate": 4.9910991471701206e-06, + "loss": 2.1098, + "step": 5225 + }, + { + "epoch": 0.2803648068669528, + "grad_norm": 0.380859375, + "learning_rate": 4.991091821197168e-06, + "loss": 2.0164, + "step": 5226 + }, + { + "epoch": 0.28041845493562234, + "grad_norm": 0.3671875, + "learning_rate": 4.991084492215964e-06, + "loss": 2.3235, + "step": 5227 + }, + { + "epoch": 0.2804721030042919, + "grad_norm": 0.310546875, + "learning_rate": 4.991077160226517e-06, + "loss": 2.1892, + "step": 5228 + }, + { + "epoch": 0.28052575107296135, + "grad_norm": 0.33984375, + "learning_rate": 4.991069825228835e-06, + "loss": 2.3213, + "step": 5229 + }, + { + "epoch": 0.2805793991416309, + "grad_norm": 6.03125, + "learning_rate": 4.991062487222928e-06, + "loss": 2.1957, + "step": 5230 + }, + { + "epoch": 0.2806330472103004, + "grad_norm": 0.37890625, + "learning_rate": 4.9910551462088056e-06, + "loss": 2.3296, + "step": 5231 + }, + { + "epoch": 0.28068669527896994, + "grad_norm": 0.373046875, + "learning_rate": 4.991047802186475e-06, + "loss": 2.4027, + "step": 5232 + }, + { + "epoch": 0.2807403433476395, + "grad_norm": 0.4140625, + "learning_rate": 4.991040455155946e-06, + "loss": 2.4158, + "step": 5233 + }, + { + "epoch": 0.280793991416309, + "grad_norm": 0.408203125, + "learning_rate": 4.991033105117227e-06, + "loss": 2.2126, + "step": 5234 + }, + { + "epoch": 0.28084763948497854, + "grad_norm": 0.408203125, + "learning_rate": 4.991025752070326e-06, + "loss": 2.544, + "step": 5235 + }, + { + "epoch": 0.28090128755364807, + "grad_norm": 0.51171875, + "learning_rate": 4.991018396015254e-06, + "loss": 2.4182, + "step": 5236 + }, + { + "epoch": 0.2809549356223176, + "grad_norm": 0.388671875, + "learning_rate": 4.991011036952019e-06, + "loss": 2.37, + "step": 5237 + }, + { + "epoch": 0.28100858369098713, + "grad_norm": 0.36328125, + "learning_rate": 4.991003674880629e-06, + "loss": 2.2613, + "step": 5238 + }, + { + "epoch": 0.28106223175965667, + "grad_norm": 0.400390625, + "learning_rate": 4.990996309801095e-06, + "loss": 2.3923, + "step": 5239 + }, + { + "epoch": 0.2811158798283262, + "grad_norm": 0.462890625, + "learning_rate": 4.990988941713424e-06, + "loss": 1.8826, + "step": 5240 + }, + { + "epoch": 0.28116952789699573, + "grad_norm": 1.390625, + "learning_rate": 4.990981570617624e-06, + "loss": 2.3908, + "step": 5241 + }, + { + "epoch": 0.28122317596566526, + "grad_norm": 0.58984375, + "learning_rate": 4.990974196513707e-06, + "loss": 2.2539, + "step": 5242 + }, + { + "epoch": 0.28127682403433474, + "grad_norm": 0.3828125, + "learning_rate": 4.990966819401679e-06, + "loss": 2.5446, + "step": 5243 + }, + { + "epoch": 0.28133047210300427, + "grad_norm": 0.326171875, + "learning_rate": 4.990959439281551e-06, + "loss": 2.0298, + "step": 5244 + }, + { + "epoch": 0.2813841201716738, + "grad_norm": 0.3671875, + "learning_rate": 4.990952056153331e-06, + "loss": 2.3428, + "step": 5245 + }, + { + "epoch": 0.28143776824034333, + "grad_norm": 0.345703125, + "learning_rate": 4.990944670017028e-06, + "loss": 2.2663, + "step": 5246 + }, + { + "epoch": 0.28149141630901287, + "grad_norm": 0.49609375, + "learning_rate": 4.990937280872651e-06, + "loss": 2.4004, + "step": 5247 + }, + { + "epoch": 0.2815450643776824, + "grad_norm": 0.3046875, + "learning_rate": 4.990929888720208e-06, + "loss": 2.387, + "step": 5248 + }, + { + "epoch": 0.28159871244635193, + "grad_norm": 0.322265625, + "learning_rate": 4.990922493559709e-06, + "loss": 2.2684, + "step": 5249 + }, + { + "epoch": 0.28165236051502146, + "grad_norm": 0.546875, + "learning_rate": 4.990915095391163e-06, + "loss": 2.2317, + "step": 5250 + }, + { + "epoch": 0.281706008583691, + "grad_norm": 0.76953125, + "learning_rate": 4.990907694214578e-06, + "loss": 2.4378, + "step": 5251 + }, + { + "epoch": 0.2817596566523605, + "grad_norm": 0.3984375, + "learning_rate": 4.9909002900299646e-06, + "loss": 2.2003, + "step": 5252 + }, + { + "epoch": 0.28181330472103006, + "grad_norm": 0.349609375, + "learning_rate": 4.99089288283733e-06, + "loss": 2.2571, + "step": 5253 + }, + { + "epoch": 0.2818669527896996, + "grad_norm": 0.388671875, + "learning_rate": 4.990885472636684e-06, + "loss": 2.3789, + "step": 5254 + }, + { + "epoch": 0.2819206008583691, + "grad_norm": 0.376953125, + "learning_rate": 4.9908780594280355e-06, + "loss": 2.3594, + "step": 5255 + }, + { + "epoch": 0.28197424892703865, + "grad_norm": 0.41015625, + "learning_rate": 4.990870643211393e-06, + "loss": 2.2882, + "step": 5256 + }, + { + "epoch": 0.2820278969957081, + "grad_norm": 0.333984375, + "learning_rate": 4.990863223986766e-06, + "loss": 2.3038, + "step": 5257 + }, + { + "epoch": 0.28208154506437766, + "grad_norm": 0.34375, + "learning_rate": 4.990855801754163e-06, + "loss": 2.4229, + "step": 5258 + }, + { + "epoch": 0.2821351931330472, + "grad_norm": 0.384765625, + "learning_rate": 4.990848376513593e-06, + "loss": 2.4497, + "step": 5259 + }, + { + "epoch": 0.2821888412017167, + "grad_norm": 0.3828125, + "learning_rate": 4.990840948265066e-06, + "loss": 2.4063, + "step": 5260 + }, + { + "epoch": 0.28224248927038625, + "grad_norm": 0.384765625, + "learning_rate": 4.990833517008589e-06, + "loss": 2.2818, + "step": 5261 + }, + { + "epoch": 0.2822961373390558, + "grad_norm": 0.4375, + "learning_rate": 4.990826082744173e-06, + "loss": 2.2936, + "step": 5262 + }, + { + "epoch": 0.2823497854077253, + "grad_norm": 0.369140625, + "learning_rate": 4.990818645471826e-06, + "loss": 2.6917, + "step": 5263 + }, + { + "epoch": 0.28240343347639485, + "grad_norm": 0.3828125, + "learning_rate": 4.990811205191557e-06, + "loss": 2.3651, + "step": 5264 + }, + { + "epoch": 0.2824570815450644, + "grad_norm": 0.31640625, + "learning_rate": 4.990803761903374e-06, + "loss": 2.3512, + "step": 5265 + }, + { + "epoch": 0.2825107296137339, + "grad_norm": 0.443359375, + "learning_rate": 4.990796315607289e-06, + "loss": 2.6194, + "step": 5266 + }, + { + "epoch": 0.28256437768240344, + "grad_norm": 0.453125, + "learning_rate": 4.990788866303308e-06, + "loss": 1.801, + "step": 5267 + }, + { + "epoch": 0.282618025751073, + "grad_norm": 0.81640625, + "learning_rate": 4.990781413991441e-06, + "loss": 2.1371, + "step": 5268 + }, + { + "epoch": 0.2826716738197425, + "grad_norm": 0.37890625, + "learning_rate": 4.990773958671697e-06, + "loss": 2.1945, + "step": 5269 + }, + { + "epoch": 0.28272532188841204, + "grad_norm": 0.357421875, + "learning_rate": 4.990766500344085e-06, + "loss": 2.4712, + "step": 5270 + }, + { + "epoch": 0.28277896995708157, + "grad_norm": 0.42578125, + "learning_rate": 4.990759039008613e-06, + "loss": 2.3202, + "step": 5271 + }, + { + "epoch": 0.28283261802575105, + "grad_norm": 0.396484375, + "learning_rate": 4.990751574665293e-06, + "loss": 2.2633, + "step": 5272 + }, + { + "epoch": 0.2828862660944206, + "grad_norm": 0.435546875, + "learning_rate": 4.990744107314132e-06, + "loss": 2.1961, + "step": 5273 + }, + { + "epoch": 0.2829399141630901, + "grad_norm": 0.39453125, + "learning_rate": 4.990736636955138e-06, + "loss": 2.4144, + "step": 5274 + }, + { + "epoch": 0.28299356223175964, + "grad_norm": 0.4609375, + "learning_rate": 4.9907291635883215e-06, + "loss": 2.2331, + "step": 5275 + }, + { + "epoch": 0.2830472103004292, + "grad_norm": 0.384765625, + "learning_rate": 4.990721687213691e-06, + "loss": 2.2926, + "step": 5276 + }, + { + "epoch": 0.2831008583690987, + "grad_norm": 0.404296875, + "learning_rate": 4.990714207831255e-06, + "loss": 2.4427, + "step": 5277 + }, + { + "epoch": 0.28315450643776824, + "grad_norm": 0.3671875, + "learning_rate": 4.990706725441025e-06, + "loss": 2.324, + "step": 5278 + }, + { + "epoch": 0.28320815450643777, + "grad_norm": 0.451171875, + "learning_rate": 4.9906992400430065e-06, + "loss": 2.4456, + "step": 5279 + }, + { + "epoch": 0.2832618025751073, + "grad_norm": 0.34375, + "learning_rate": 4.990691751637211e-06, + "loss": 2.2946, + "step": 5280 + }, + { + "epoch": 0.28331545064377683, + "grad_norm": 0.33203125, + "learning_rate": 4.990684260223647e-06, + "loss": 2.1017, + "step": 5281 + }, + { + "epoch": 0.28336909871244637, + "grad_norm": 0.439453125, + "learning_rate": 4.990676765802322e-06, + "loss": 2.3399, + "step": 5282 + }, + { + "epoch": 0.2834227467811159, + "grad_norm": 0.3515625, + "learning_rate": 4.990669268373248e-06, + "loss": 2.3393, + "step": 5283 + }, + { + "epoch": 0.28347639484978543, + "grad_norm": 0.43359375, + "learning_rate": 4.9906617679364324e-06, + "loss": 2.2261, + "step": 5284 + }, + { + "epoch": 0.28353004291845496, + "grad_norm": 0.326171875, + "learning_rate": 4.9906542644918835e-06, + "loss": 2.1563, + "step": 5285 + }, + { + "epoch": 0.28358369098712444, + "grad_norm": 0.6015625, + "learning_rate": 4.990646758039611e-06, + "loss": 2.413, + "step": 5286 + }, + { + "epoch": 0.28363733905579397, + "grad_norm": 0.44140625, + "learning_rate": 4.990639248579625e-06, + "loss": 2.4587, + "step": 5287 + }, + { + "epoch": 0.2836909871244635, + "grad_norm": 0.3671875, + "learning_rate": 4.990631736111933e-06, + "loss": 2.3553, + "step": 5288 + }, + { + "epoch": 0.28374463519313303, + "grad_norm": 0.33984375, + "learning_rate": 4.9906242206365456e-06, + "loss": 2.1125, + "step": 5289 + }, + { + "epoch": 0.28379828326180256, + "grad_norm": 0.423828125, + "learning_rate": 4.9906167021534704e-06, + "loss": 2.2254, + "step": 5290 + }, + { + "epoch": 0.2838519313304721, + "grad_norm": 0.361328125, + "learning_rate": 4.990609180662718e-06, + "loss": 2.2485, + "step": 5291 + }, + { + "epoch": 0.2839055793991416, + "grad_norm": 0.392578125, + "learning_rate": 4.990601656164296e-06, + "loss": 2.464, + "step": 5292 + }, + { + "epoch": 0.28395922746781116, + "grad_norm": 0.375, + "learning_rate": 4.990594128658214e-06, + "loss": 2.264, + "step": 5293 + }, + { + "epoch": 0.2840128755364807, + "grad_norm": 0.330078125, + "learning_rate": 4.990586598144481e-06, + "loss": 2.1891, + "step": 5294 + }, + { + "epoch": 0.2840665236051502, + "grad_norm": 0.40234375, + "learning_rate": 4.990579064623106e-06, + "loss": 2.4889, + "step": 5295 + }, + { + "epoch": 0.28412017167381975, + "grad_norm": 0.365234375, + "learning_rate": 4.990571528094099e-06, + "loss": 2.3945, + "step": 5296 + }, + { + "epoch": 0.2841738197424893, + "grad_norm": 0.3515625, + "learning_rate": 4.990563988557468e-06, + "loss": 2.2729, + "step": 5297 + }, + { + "epoch": 0.2842274678111588, + "grad_norm": 0.63671875, + "learning_rate": 4.990556446013224e-06, + "loss": 1.7309, + "step": 5298 + }, + { + "epoch": 0.28428111587982835, + "grad_norm": 0.369140625, + "learning_rate": 4.990548900461374e-06, + "loss": 2.2202, + "step": 5299 + }, + { + "epoch": 0.2843347639484979, + "grad_norm": 0.390625, + "learning_rate": 4.990541351901927e-06, + "loss": 2.1883, + "step": 5300 + }, + { + "epoch": 0.28438841201716736, + "grad_norm": 0.376953125, + "learning_rate": 4.990533800334894e-06, + "loss": 2.522, + "step": 5301 + }, + { + "epoch": 0.2844420600858369, + "grad_norm": 0.392578125, + "learning_rate": 4.9905262457602825e-06, + "loss": 2.241, + "step": 5302 + }, + { + "epoch": 0.2844957081545064, + "grad_norm": 0.451171875, + "learning_rate": 4.990518688178101e-06, + "loss": 2.3944, + "step": 5303 + }, + { + "epoch": 0.28454935622317595, + "grad_norm": 0.359375, + "learning_rate": 4.990511127588362e-06, + "loss": 2.2611, + "step": 5304 + }, + { + "epoch": 0.2846030042918455, + "grad_norm": 0.90625, + "learning_rate": 4.990503563991071e-06, + "loss": 2.4663, + "step": 5305 + }, + { + "epoch": 0.284656652360515, + "grad_norm": 0.3828125, + "learning_rate": 4.990495997386239e-06, + "loss": 2.5073, + "step": 5306 + }, + { + "epoch": 0.28471030042918455, + "grad_norm": 1.1328125, + "learning_rate": 4.990488427773874e-06, + "loss": 1.8438, + "step": 5307 + }, + { + "epoch": 0.2847639484978541, + "grad_norm": 2.875, + "learning_rate": 4.990480855153988e-06, + "loss": 2.3314, + "step": 5308 + }, + { + "epoch": 0.2848175965665236, + "grad_norm": 0.40234375, + "learning_rate": 4.990473279526585e-06, + "loss": 2.4014, + "step": 5309 + }, + { + "epoch": 0.28487124463519314, + "grad_norm": 0.306640625, + "learning_rate": 4.990465700891679e-06, + "loss": 2.0302, + "step": 5310 + }, + { + "epoch": 0.2849248927038627, + "grad_norm": 0.462890625, + "learning_rate": 4.990458119249278e-06, + "loss": 2.4262, + "step": 5311 + }, + { + "epoch": 0.2849785407725322, + "grad_norm": 1.0546875, + "learning_rate": 4.990450534599389e-06, + "loss": 2.2515, + "step": 5312 + }, + { + "epoch": 0.28503218884120174, + "grad_norm": 0.36328125, + "learning_rate": 4.990442946942022e-06, + "loss": 2.2093, + "step": 5313 + }, + { + "epoch": 0.28508583690987127, + "grad_norm": 0.423828125, + "learning_rate": 4.990435356277188e-06, + "loss": 2.2816, + "step": 5314 + }, + { + "epoch": 0.28513948497854075, + "grad_norm": 0.3671875, + "learning_rate": 4.990427762604894e-06, + "loss": 2.4412, + "step": 5315 + }, + { + "epoch": 0.2851931330472103, + "grad_norm": 0.4140625, + "learning_rate": 4.990420165925151e-06, + "loss": 2.1935, + "step": 5316 + }, + { + "epoch": 0.2852467811158798, + "grad_norm": 0.37109375, + "learning_rate": 4.990412566237968e-06, + "loss": 2.3885, + "step": 5317 + }, + { + "epoch": 0.28530042918454934, + "grad_norm": 0.3515625, + "learning_rate": 4.990404963543352e-06, + "loss": 2.284, + "step": 5318 + }, + { + "epoch": 0.2853540772532189, + "grad_norm": 0.34765625, + "learning_rate": 4.990397357841315e-06, + "loss": 2.2441, + "step": 5319 + }, + { + "epoch": 0.2854077253218884, + "grad_norm": 0.384765625, + "learning_rate": 4.990389749131863e-06, + "loss": 2.0861, + "step": 5320 + }, + { + "epoch": 0.28546137339055794, + "grad_norm": 0.302734375, + "learning_rate": 4.9903821374150085e-06, + "loss": 2.2295, + "step": 5321 + }, + { + "epoch": 0.28551502145922747, + "grad_norm": 0.85546875, + "learning_rate": 4.990374522690758e-06, + "loss": 2.4435, + "step": 5322 + }, + { + "epoch": 0.285568669527897, + "grad_norm": 0.34765625, + "learning_rate": 4.990366904959123e-06, + "loss": 2.3983, + "step": 5323 + }, + { + "epoch": 0.28562231759656653, + "grad_norm": 0.6015625, + "learning_rate": 4.990359284220111e-06, + "loss": 2.4547, + "step": 5324 + }, + { + "epoch": 0.28567596566523606, + "grad_norm": 0.35546875, + "learning_rate": 4.990351660473732e-06, + "loss": 2.2614, + "step": 5325 + }, + { + "epoch": 0.2857296137339056, + "grad_norm": 0.55078125, + "learning_rate": 4.990344033719995e-06, + "loss": 2.1912, + "step": 5326 + }, + { + "epoch": 0.28578326180257513, + "grad_norm": 0.369140625, + "learning_rate": 4.9903364039589085e-06, + "loss": 1.9429, + "step": 5327 + }, + { + "epoch": 0.28583690987124466, + "grad_norm": 0.30859375, + "learning_rate": 4.990328771190484e-06, + "loss": 2.3764, + "step": 5328 + }, + { + "epoch": 0.28589055793991414, + "grad_norm": 0.388671875, + "learning_rate": 4.990321135414727e-06, + "loss": 2.1535, + "step": 5329 + }, + { + "epoch": 0.28594420600858367, + "grad_norm": 0.423828125, + "learning_rate": 4.99031349663165e-06, + "loss": 2.4104, + "step": 5330 + }, + { + "epoch": 0.2859978540772532, + "grad_norm": 0.390625, + "learning_rate": 4.990305854841262e-06, + "loss": 2.4781, + "step": 5331 + }, + { + "epoch": 0.28605150214592273, + "grad_norm": 0.328125, + "learning_rate": 4.99029821004357e-06, + "loss": 2.217, + "step": 5332 + }, + { + "epoch": 0.28610515021459226, + "grad_norm": 0.421875, + "learning_rate": 4.990290562238585e-06, + "loss": 2.4794, + "step": 5333 + }, + { + "epoch": 0.2861587982832618, + "grad_norm": 0.482421875, + "learning_rate": 4.990282911426317e-06, + "loss": 2.2563, + "step": 5334 + }, + { + "epoch": 0.2862124463519313, + "grad_norm": 0.392578125, + "learning_rate": 4.990275257606773e-06, + "loss": 2.2281, + "step": 5335 + }, + { + "epoch": 0.28626609442060086, + "grad_norm": 0.404296875, + "learning_rate": 4.990267600779962e-06, + "loss": 1.4914, + "step": 5336 + }, + { + "epoch": 0.2863197424892704, + "grad_norm": 0.333984375, + "learning_rate": 4.990259940945896e-06, + "loss": 2.0355, + "step": 5337 + }, + { + "epoch": 0.2863733905579399, + "grad_norm": 0.33984375, + "learning_rate": 4.9902522781045825e-06, + "loss": 2.0822, + "step": 5338 + }, + { + "epoch": 0.28642703862660945, + "grad_norm": 0.3828125, + "learning_rate": 4.990244612256031e-06, + "loss": 2.2299, + "step": 5339 + }, + { + "epoch": 0.286480686695279, + "grad_norm": 0.38671875, + "learning_rate": 4.990236943400251e-06, + "loss": 2.3757, + "step": 5340 + }, + { + "epoch": 0.2865343347639485, + "grad_norm": 0.3828125, + "learning_rate": 4.99022927153725e-06, + "loss": 2.373, + "step": 5341 + }, + { + "epoch": 0.28658798283261805, + "grad_norm": 0.3203125, + "learning_rate": 4.990221596667041e-06, + "loss": 2.3646, + "step": 5342 + }, + { + "epoch": 0.2866416309012876, + "grad_norm": 0.474609375, + "learning_rate": 4.9902139187896295e-06, + "loss": 2.3451, + "step": 5343 + }, + { + "epoch": 0.28669527896995706, + "grad_norm": 0.3828125, + "learning_rate": 4.990206237905027e-06, + "loss": 2.3692, + "step": 5344 + }, + { + "epoch": 0.2867489270386266, + "grad_norm": 0.361328125, + "learning_rate": 4.990198554013242e-06, + "loss": 2.1847, + "step": 5345 + }, + { + "epoch": 0.2868025751072961, + "grad_norm": 0.390625, + "learning_rate": 4.990190867114284e-06, + "loss": 2.0928, + "step": 5346 + }, + { + "epoch": 0.28685622317596565, + "grad_norm": 0.384765625, + "learning_rate": 4.990183177208162e-06, + "loss": 2.0872, + "step": 5347 + }, + { + "epoch": 0.2869098712446352, + "grad_norm": 0.373046875, + "learning_rate": 4.990175484294886e-06, + "loss": 2.4402, + "step": 5348 + }, + { + "epoch": 0.2869635193133047, + "grad_norm": 0.361328125, + "learning_rate": 4.990167788374464e-06, + "loss": 2.4357, + "step": 5349 + }, + { + "epoch": 0.28701716738197425, + "grad_norm": 0.357421875, + "learning_rate": 4.990160089446906e-06, + "loss": 1.7526, + "step": 5350 + }, + { + "epoch": 0.2870708154506438, + "grad_norm": 0.74609375, + "learning_rate": 4.990152387512222e-06, + "loss": 2.4291, + "step": 5351 + }, + { + "epoch": 0.2871244635193133, + "grad_norm": 0.328125, + "learning_rate": 4.99014468257042e-06, + "loss": 2.3306, + "step": 5352 + }, + { + "epoch": 0.28717811158798284, + "grad_norm": 0.375, + "learning_rate": 4.990136974621511e-06, + "loss": 2.4145, + "step": 5353 + }, + { + "epoch": 0.2872317596566524, + "grad_norm": 0.43359375, + "learning_rate": 4.990129263665502e-06, + "loss": 2.5494, + "step": 5354 + }, + { + "epoch": 0.2872854077253219, + "grad_norm": 0.49609375, + "learning_rate": 4.990121549702405e-06, + "loss": 2.4972, + "step": 5355 + }, + { + "epoch": 0.28733905579399144, + "grad_norm": 0.435546875, + "learning_rate": 4.990113832732226e-06, + "loss": 2.3951, + "step": 5356 + }, + { + "epoch": 0.28739270386266097, + "grad_norm": 0.6171875, + "learning_rate": 4.9901061127549774e-06, + "loss": 2.2648, + "step": 5357 + }, + { + "epoch": 0.28744635193133045, + "grad_norm": 0.392578125, + "learning_rate": 4.990098389770668e-06, + "loss": 2.2151, + "step": 5358 + }, + { + "epoch": 0.2875, + "grad_norm": 0.375, + "learning_rate": 4.990090663779305e-06, + "loss": 2.3478, + "step": 5359 + }, + { + "epoch": 0.2875536480686695, + "grad_norm": 0.404296875, + "learning_rate": 4.9900829347809e-06, + "loss": 2.3229, + "step": 5360 + }, + { + "epoch": 0.28760729613733904, + "grad_norm": 0.3828125, + "learning_rate": 4.990075202775461e-06, + "loss": 2.066, + "step": 5361 + }, + { + "epoch": 0.2876609442060086, + "grad_norm": 0.359375, + "learning_rate": 4.990067467762998e-06, + "loss": 2.3278, + "step": 5362 + }, + { + "epoch": 0.2877145922746781, + "grad_norm": 0.361328125, + "learning_rate": 4.99005972974352e-06, + "loss": 2.2461, + "step": 5363 + }, + { + "epoch": 0.28776824034334764, + "grad_norm": 0.396484375, + "learning_rate": 4.990051988717038e-06, + "loss": 2.4729, + "step": 5364 + }, + { + "epoch": 0.28782188841201717, + "grad_norm": 0.369140625, + "learning_rate": 4.990044244683559e-06, + "loss": 2.2799, + "step": 5365 + }, + { + "epoch": 0.2878755364806867, + "grad_norm": 0.359375, + "learning_rate": 4.990036497643092e-06, + "loss": 2.3922, + "step": 5366 + }, + { + "epoch": 0.28792918454935623, + "grad_norm": 0.36328125, + "learning_rate": 4.990028747595649e-06, + "loss": 2.0832, + "step": 5367 + }, + { + "epoch": 0.28798283261802576, + "grad_norm": 0.3671875, + "learning_rate": 4.990020994541237e-06, + "loss": 2.3245, + "step": 5368 + }, + { + "epoch": 0.2880364806866953, + "grad_norm": 0.30859375, + "learning_rate": 4.9900132384798675e-06, + "loss": 2.1104, + "step": 5369 + }, + { + "epoch": 0.2880901287553648, + "grad_norm": 0.470703125, + "learning_rate": 4.990005479411548e-06, + "loss": 2.2977, + "step": 5370 + }, + { + "epoch": 0.28814377682403436, + "grad_norm": 0.408203125, + "learning_rate": 4.989997717336288e-06, + "loss": 2.2238, + "step": 5371 + }, + { + "epoch": 0.28819742489270384, + "grad_norm": 0.33984375, + "learning_rate": 4.9899899522540985e-06, + "loss": 2.2821, + "step": 5372 + }, + { + "epoch": 0.28825107296137337, + "grad_norm": 0.70703125, + "learning_rate": 4.989982184164987e-06, + "loss": 2.2791, + "step": 5373 + }, + { + "epoch": 0.2883047210300429, + "grad_norm": 0.55078125, + "learning_rate": 4.9899744130689645e-06, + "loss": 2.5831, + "step": 5374 + }, + { + "epoch": 0.28835836909871243, + "grad_norm": 0.361328125, + "learning_rate": 4.9899666389660385e-06, + "loss": 2.1, + "step": 5375 + }, + { + "epoch": 0.28841201716738196, + "grad_norm": 0.34375, + "learning_rate": 4.98995886185622e-06, + "loss": 2.1897, + "step": 5376 + }, + { + "epoch": 0.2884656652360515, + "grad_norm": 0.369140625, + "learning_rate": 4.989951081739518e-06, + "loss": 2.2697, + "step": 5377 + }, + { + "epoch": 0.288519313304721, + "grad_norm": 0.435546875, + "learning_rate": 4.989943298615942e-06, + "loss": 2.1869, + "step": 5378 + }, + { + "epoch": 0.28857296137339056, + "grad_norm": 0.337890625, + "learning_rate": 4.9899355124855e-06, + "loss": 2.3268, + "step": 5379 + }, + { + "epoch": 0.2886266094420601, + "grad_norm": 0.408203125, + "learning_rate": 4.9899277233482035e-06, + "loss": 1.2524, + "step": 5380 + }, + { + "epoch": 0.2886802575107296, + "grad_norm": 0.3984375, + "learning_rate": 4.989919931204061e-06, + "loss": 2.075, + "step": 5381 + }, + { + "epoch": 0.28873390557939915, + "grad_norm": 0.341796875, + "learning_rate": 4.989912136053081e-06, + "loss": 2.3823, + "step": 5382 + }, + { + "epoch": 0.2887875536480687, + "grad_norm": 0.37890625, + "learning_rate": 4.989904337895275e-06, + "loss": 2.2023, + "step": 5383 + }, + { + "epoch": 0.2888412017167382, + "grad_norm": 0.419921875, + "learning_rate": 4.989896536730651e-06, + "loss": 2.5842, + "step": 5384 + }, + { + "epoch": 0.28889484978540775, + "grad_norm": 0.388671875, + "learning_rate": 4.9898887325592176e-06, + "loss": 2.2946, + "step": 5385 + }, + { + "epoch": 0.2889484978540773, + "grad_norm": 0.4375, + "learning_rate": 4.989880925380986e-06, + "loss": 2.3125, + "step": 5386 + }, + { + "epoch": 0.28900214592274676, + "grad_norm": 0.392578125, + "learning_rate": 4.989873115195965e-06, + "loss": 2.374, + "step": 5387 + }, + { + "epoch": 0.2890557939914163, + "grad_norm": 0.431640625, + "learning_rate": 4.989865302004163e-06, + "loss": 2.2233, + "step": 5388 + }, + { + "epoch": 0.2891094420600858, + "grad_norm": 0.38671875, + "learning_rate": 4.989857485805591e-06, + "loss": 2.2704, + "step": 5389 + }, + { + "epoch": 0.28916309012875535, + "grad_norm": 0.33984375, + "learning_rate": 4.989849666600258e-06, + "loss": 2.242, + "step": 5390 + }, + { + "epoch": 0.2892167381974249, + "grad_norm": 0.4140625, + "learning_rate": 4.989841844388172e-06, + "loss": 2.331, + "step": 5391 + }, + { + "epoch": 0.2892703862660944, + "grad_norm": 0.349609375, + "learning_rate": 4.989834019169345e-06, + "loss": 2.2554, + "step": 5392 + }, + { + "epoch": 0.28932403433476395, + "grad_norm": 0.52734375, + "learning_rate": 4.989826190943785e-06, + "loss": 2.2347, + "step": 5393 + }, + { + "epoch": 0.2893776824034335, + "grad_norm": 0.341796875, + "learning_rate": 4.989818359711501e-06, + "loss": 2.2001, + "step": 5394 + }, + { + "epoch": 0.289431330472103, + "grad_norm": 0.361328125, + "learning_rate": 4.989810525472504e-06, + "loss": 2.3657, + "step": 5395 + }, + { + "epoch": 0.28948497854077254, + "grad_norm": 0.314453125, + "learning_rate": 4.989802688226801e-06, + "loss": 2.2812, + "step": 5396 + }, + { + "epoch": 0.2895386266094421, + "grad_norm": 0.3984375, + "learning_rate": 4.989794847974404e-06, + "loss": 2.53, + "step": 5397 + }, + { + "epoch": 0.2895922746781116, + "grad_norm": 0.40234375, + "learning_rate": 4.989787004715321e-06, + "loss": 2.4421, + "step": 5398 + }, + { + "epoch": 0.28964592274678114, + "grad_norm": 0.349609375, + "learning_rate": 4.989779158449561e-06, + "loss": 2.4394, + "step": 5399 + }, + { + "epoch": 0.28969957081545067, + "grad_norm": 0.369140625, + "learning_rate": 4.989771309177136e-06, + "loss": 2.4241, + "step": 5400 + }, + { + "epoch": 0.28975321888412015, + "grad_norm": 0.28515625, + "learning_rate": 4.989763456898054e-06, + "loss": 2.2203, + "step": 5401 + }, + { + "epoch": 0.2898068669527897, + "grad_norm": 1.421875, + "learning_rate": 4.989755601612323e-06, + "loss": 1.867, + "step": 5402 + }, + { + "epoch": 0.2898605150214592, + "grad_norm": 0.451171875, + "learning_rate": 4.989747743319954e-06, + "loss": 2.2913, + "step": 5403 + }, + { + "epoch": 0.28991416309012874, + "grad_norm": 0.44921875, + "learning_rate": 4.989739882020957e-06, + "loss": 2.2578, + "step": 5404 + }, + { + "epoch": 0.28996781115879827, + "grad_norm": 0.375, + "learning_rate": 4.98973201771534e-06, + "loss": 2.474, + "step": 5405 + }, + { + "epoch": 0.2900214592274678, + "grad_norm": 0.54296875, + "learning_rate": 4.9897241504031136e-06, + "loss": 2.1995, + "step": 5406 + }, + { + "epoch": 0.29007510729613734, + "grad_norm": 2.4375, + "learning_rate": 4.989716280084287e-06, + "loss": 2.3003, + "step": 5407 + }, + { + "epoch": 0.29012875536480687, + "grad_norm": 0.345703125, + "learning_rate": 4.98970840675887e-06, + "loss": 2.1835, + "step": 5408 + }, + { + "epoch": 0.2901824034334764, + "grad_norm": 0.486328125, + "learning_rate": 4.989700530426871e-06, + "loss": 2.3493, + "step": 5409 + }, + { + "epoch": 0.29023605150214593, + "grad_norm": 0.349609375, + "learning_rate": 4.989692651088301e-06, + "loss": 2.149, + "step": 5410 + }, + { + "epoch": 0.29028969957081546, + "grad_norm": 0.42578125, + "learning_rate": 4.9896847687431686e-06, + "loss": 2.4762, + "step": 5411 + }, + { + "epoch": 0.290343347639485, + "grad_norm": 0.93359375, + "learning_rate": 4.989676883391484e-06, + "loss": 2.3261, + "step": 5412 + }, + { + "epoch": 0.2903969957081545, + "grad_norm": 0.4140625, + "learning_rate": 4.9896689950332555e-06, + "loss": 2.2259, + "step": 5413 + }, + { + "epoch": 0.29045064377682406, + "grad_norm": 0.3203125, + "learning_rate": 4.9896611036684935e-06, + "loss": 2.2694, + "step": 5414 + }, + { + "epoch": 0.2905042918454936, + "grad_norm": 0.6171875, + "learning_rate": 4.9896532092972085e-06, + "loss": 2.4297, + "step": 5415 + }, + { + "epoch": 0.29055793991416307, + "grad_norm": 0.3828125, + "learning_rate": 4.989645311919408e-06, + "loss": 2.1946, + "step": 5416 + }, + { + "epoch": 0.2906115879828326, + "grad_norm": 0.328125, + "learning_rate": 4.989637411535102e-06, + "loss": 2.1855, + "step": 5417 + }, + { + "epoch": 0.29066523605150213, + "grad_norm": 0.423828125, + "learning_rate": 4.989629508144302e-06, + "loss": 2.3573, + "step": 5418 + }, + { + "epoch": 0.29071888412017166, + "grad_norm": 0.486328125, + "learning_rate": 4.989621601747015e-06, + "loss": 1.5677, + "step": 5419 + }, + { + "epoch": 0.2907725321888412, + "grad_norm": 0.326171875, + "learning_rate": 4.989613692343251e-06, + "loss": 2.1862, + "step": 5420 + }, + { + "epoch": 0.2908261802575107, + "grad_norm": 0.515625, + "learning_rate": 4.989605779933021e-06, + "loss": 2.2636, + "step": 5421 + }, + { + "epoch": 0.29087982832618026, + "grad_norm": 0.439453125, + "learning_rate": 4.989597864516335e-06, + "loss": 2.146, + "step": 5422 + }, + { + "epoch": 0.2909334763948498, + "grad_norm": 0.61328125, + "learning_rate": 4.989589946093199e-06, + "loss": 2.2159, + "step": 5423 + }, + { + "epoch": 0.2909871244635193, + "grad_norm": 0.37109375, + "learning_rate": 4.989582024663626e-06, + "loss": 2.278, + "step": 5424 + }, + { + "epoch": 0.29104077253218885, + "grad_norm": 0.423828125, + "learning_rate": 4.989574100227624e-06, + "loss": 2.4465, + "step": 5425 + }, + { + "epoch": 0.2910944206008584, + "grad_norm": 0.439453125, + "learning_rate": 4.989566172785204e-06, + "loss": 2.4134, + "step": 5426 + }, + { + "epoch": 0.2911480686695279, + "grad_norm": 0.353515625, + "learning_rate": 4.989558242336373e-06, + "loss": 2.2872, + "step": 5427 + }, + { + "epoch": 0.29120171673819745, + "grad_norm": 0.357421875, + "learning_rate": 4.989550308881144e-06, + "loss": 2.2792, + "step": 5428 + }, + { + "epoch": 0.291255364806867, + "grad_norm": 0.384765625, + "learning_rate": 4.989542372419524e-06, + "loss": 2.2509, + "step": 5429 + }, + { + "epoch": 0.29130901287553645, + "grad_norm": 0.27734375, + "learning_rate": 4.989534432951522e-06, + "loss": 2.0952, + "step": 5430 + }, + { + "epoch": 0.291362660944206, + "grad_norm": 0.400390625, + "learning_rate": 4.98952649047715e-06, + "loss": 2.5171, + "step": 5431 + }, + { + "epoch": 0.2914163090128755, + "grad_norm": 0.431640625, + "learning_rate": 4.989518544996417e-06, + "loss": 2.0337, + "step": 5432 + }, + { + "epoch": 0.29146995708154505, + "grad_norm": 0.3125, + "learning_rate": 4.989510596509331e-06, + "loss": 2.2799, + "step": 5433 + }, + { + "epoch": 0.2915236051502146, + "grad_norm": 0.60546875, + "learning_rate": 4.9895026450159035e-06, + "loss": 2.4256, + "step": 5434 + }, + { + "epoch": 0.2915772532188841, + "grad_norm": 0.5546875, + "learning_rate": 4.989494690516142e-06, + "loss": 2.2952, + "step": 5435 + }, + { + "epoch": 0.29163090128755365, + "grad_norm": 0.369140625, + "learning_rate": 4.989486733010059e-06, + "loss": 2.4977, + "step": 5436 + }, + { + "epoch": 0.2916845493562232, + "grad_norm": 0.4765625, + "learning_rate": 4.989478772497661e-06, + "loss": 2.5303, + "step": 5437 + }, + { + "epoch": 0.2917381974248927, + "grad_norm": 0.365234375, + "learning_rate": 4.98947080897896e-06, + "loss": 2.3696, + "step": 5438 + }, + { + "epoch": 0.29179184549356224, + "grad_norm": 0.318359375, + "learning_rate": 4.989462842453965e-06, + "loss": 2.1877, + "step": 5439 + }, + { + "epoch": 0.2918454935622318, + "grad_norm": 0.400390625, + "learning_rate": 4.989454872922685e-06, + "loss": 2.1386, + "step": 5440 + }, + { + "epoch": 0.2918991416309013, + "grad_norm": 2.0625, + "learning_rate": 4.98944690038513e-06, + "loss": 2.2841, + "step": 5441 + }, + { + "epoch": 0.29195278969957084, + "grad_norm": 0.43359375, + "learning_rate": 4.98943892484131e-06, + "loss": 2.4931, + "step": 5442 + }, + { + "epoch": 0.29200643776824037, + "grad_norm": 0.3359375, + "learning_rate": 4.989430946291233e-06, + "loss": 2.2385, + "step": 5443 + }, + { + "epoch": 0.29206008583690984, + "grad_norm": 0.66015625, + "learning_rate": 4.989422964734911e-06, + "loss": 2.277, + "step": 5444 + }, + { + "epoch": 0.2921137339055794, + "grad_norm": 0.33203125, + "learning_rate": 4.9894149801723515e-06, + "loss": 2.3475, + "step": 5445 + }, + { + "epoch": 0.2921673819742489, + "grad_norm": 0.62890625, + "learning_rate": 4.989406992603566e-06, + "loss": 2.4725, + "step": 5446 + }, + { + "epoch": 0.29222103004291844, + "grad_norm": 0.734375, + "learning_rate": 4.989399002028563e-06, + "loss": 2.5126, + "step": 5447 + }, + { + "epoch": 0.29227467811158797, + "grad_norm": 0.373046875, + "learning_rate": 4.989391008447353e-06, + "loss": 2.2967, + "step": 5448 + }, + { + "epoch": 0.2923283261802575, + "grad_norm": 0.337890625, + "learning_rate": 4.989383011859944e-06, + "loss": 2.4026, + "step": 5449 + }, + { + "epoch": 0.29238197424892703, + "grad_norm": 0.4453125, + "learning_rate": 4.989375012266347e-06, + "loss": 2.2791, + "step": 5450 + }, + { + "epoch": 0.29243562231759657, + "grad_norm": 0.61328125, + "learning_rate": 4.989367009666572e-06, + "loss": 2.2234, + "step": 5451 + }, + { + "epoch": 0.2924892703862661, + "grad_norm": 0.421875, + "learning_rate": 4.989359004060628e-06, + "loss": 2.3402, + "step": 5452 + }, + { + "epoch": 0.29254291845493563, + "grad_norm": 0.37890625, + "learning_rate": 4.9893509954485245e-06, + "loss": 2.3702, + "step": 5453 + }, + { + "epoch": 0.29259656652360516, + "grad_norm": 0.3828125, + "learning_rate": 4.989342983830271e-06, + "loss": 2.3872, + "step": 5454 + }, + { + "epoch": 0.2926502145922747, + "grad_norm": 0.369140625, + "learning_rate": 4.989334969205878e-06, + "loss": 2.202, + "step": 5455 + }, + { + "epoch": 0.2927038626609442, + "grad_norm": 0.404296875, + "learning_rate": 4.9893269515753555e-06, + "loss": 2.2294, + "step": 5456 + }, + { + "epoch": 0.29275751072961376, + "grad_norm": 0.486328125, + "learning_rate": 4.989318930938711e-06, + "loss": 2.236, + "step": 5457 + }, + { + "epoch": 0.2928111587982833, + "grad_norm": 0.30078125, + "learning_rate": 4.989310907295956e-06, + "loss": 2.2414, + "step": 5458 + }, + { + "epoch": 0.29286480686695276, + "grad_norm": 0.3515625, + "learning_rate": 4.9893028806471e-06, + "loss": 2.202, + "step": 5459 + }, + { + "epoch": 0.2929184549356223, + "grad_norm": 0.64453125, + "learning_rate": 4.989294850992154e-06, + "loss": 2.5076, + "step": 5460 + }, + { + "epoch": 0.29297210300429183, + "grad_norm": 0.412109375, + "learning_rate": 4.989286818331124e-06, + "loss": 2.472, + "step": 5461 + }, + { + "epoch": 0.29302575107296136, + "grad_norm": 0.4296875, + "learning_rate": 4.989278782664023e-06, + "loss": 2.2996, + "step": 5462 + }, + { + "epoch": 0.2930793991416309, + "grad_norm": 0.439453125, + "learning_rate": 4.989270743990859e-06, + "loss": 1.5033, + "step": 5463 + }, + { + "epoch": 0.2931330472103004, + "grad_norm": 0.3828125, + "learning_rate": 4.989262702311643e-06, + "loss": 2.1411, + "step": 5464 + }, + { + "epoch": 0.29318669527896996, + "grad_norm": 0.33984375, + "learning_rate": 4.989254657626384e-06, + "loss": 2.4093, + "step": 5465 + }, + { + "epoch": 0.2932403433476395, + "grad_norm": 0.396484375, + "learning_rate": 4.989246609935091e-06, + "loss": 2.5664, + "step": 5466 + }, + { + "epoch": 0.293293991416309, + "grad_norm": 0.34765625, + "learning_rate": 4.989238559237776e-06, + "loss": 2.1699, + "step": 5467 + }, + { + "epoch": 0.29334763948497855, + "grad_norm": 0.388671875, + "learning_rate": 4.989230505534445e-06, + "loss": 1.8614, + "step": 5468 + }, + { + "epoch": 0.2934012875536481, + "grad_norm": 0.412109375, + "learning_rate": 4.9892224488251114e-06, + "loss": 2.3517, + "step": 5469 + }, + { + "epoch": 0.2934549356223176, + "grad_norm": 0.326171875, + "learning_rate": 4.989214389109783e-06, + "loss": 2.0721, + "step": 5470 + }, + { + "epoch": 0.29350858369098715, + "grad_norm": 0.447265625, + "learning_rate": 4.98920632638847e-06, + "loss": 2.0666, + "step": 5471 + }, + { + "epoch": 0.2935622317596567, + "grad_norm": 0.3828125, + "learning_rate": 4.9891982606611825e-06, + "loss": 2.128, + "step": 5472 + }, + { + "epoch": 0.29361587982832615, + "grad_norm": 0.38671875, + "learning_rate": 4.98919019192793e-06, + "loss": 2.303, + "step": 5473 + }, + { + "epoch": 0.2936695278969957, + "grad_norm": 0.37109375, + "learning_rate": 4.9891821201887214e-06, + "loss": 2.1504, + "step": 5474 + }, + { + "epoch": 0.2937231759656652, + "grad_norm": 0.443359375, + "learning_rate": 4.989174045443567e-06, + "loss": 2.4263, + "step": 5475 + }, + { + "epoch": 0.29377682403433475, + "grad_norm": 0.421875, + "learning_rate": 4.989165967692478e-06, + "loss": 2.3527, + "step": 5476 + }, + { + "epoch": 0.2938304721030043, + "grad_norm": 1.3984375, + "learning_rate": 4.989157886935461e-06, + "loss": 2.3272, + "step": 5477 + }, + { + "epoch": 0.2938841201716738, + "grad_norm": 0.447265625, + "learning_rate": 4.9891498031725285e-06, + "loss": 2.4965, + "step": 5478 + }, + { + "epoch": 0.29393776824034334, + "grad_norm": 0.39453125, + "learning_rate": 4.9891417164036895e-06, + "loss": 2.3009, + "step": 5479 + }, + { + "epoch": 0.2939914163090129, + "grad_norm": 0.412109375, + "learning_rate": 4.989133626628953e-06, + "loss": 2.1696, + "step": 5480 + }, + { + "epoch": 0.2940450643776824, + "grad_norm": 0.435546875, + "learning_rate": 4.98912553384833e-06, + "loss": 2.3476, + "step": 5481 + }, + { + "epoch": 0.29409871244635194, + "grad_norm": 0.396484375, + "learning_rate": 4.98911743806183e-06, + "loss": 2.1736, + "step": 5482 + }, + { + "epoch": 0.29415236051502147, + "grad_norm": 0.306640625, + "learning_rate": 4.989109339269463e-06, + "loss": 2.3924, + "step": 5483 + }, + { + "epoch": 0.294206008583691, + "grad_norm": 0.40234375, + "learning_rate": 4.989101237471237e-06, + "loss": 2.4787, + "step": 5484 + }, + { + "epoch": 0.29425965665236054, + "grad_norm": 0.66015625, + "learning_rate": 4.989093132667163e-06, + "loss": 2.4943, + "step": 5485 + }, + { + "epoch": 0.29431330472103007, + "grad_norm": 0.416015625, + "learning_rate": 4.989085024857251e-06, + "loss": 2.3291, + "step": 5486 + }, + { + "epoch": 0.2943669527896996, + "grad_norm": 0.6015625, + "learning_rate": 4.989076914041511e-06, + "loss": 2.2789, + "step": 5487 + }, + { + "epoch": 0.2944206008583691, + "grad_norm": 0.34765625, + "learning_rate": 4.989068800219951e-06, + "loss": 2.4593, + "step": 5488 + }, + { + "epoch": 0.2944742489270386, + "grad_norm": 0.3671875, + "learning_rate": 4.9890606833925845e-06, + "loss": 2.2417, + "step": 5489 + }, + { + "epoch": 0.29452789699570814, + "grad_norm": 0.462890625, + "learning_rate": 4.989052563559418e-06, + "loss": 2.491, + "step": 5490 + }, + { + "epoch": 0.29458154506437767, + "grad_norm": 0.373046875, + "learning_rate": 4.989044440720462e-06, + "loss": 2.3239, + "step": 5491 + }, + { + "epoch": 0.2946351931330472, + "grad_norm": 0.38671875, + "learning_rate": 4.9890363148757256e-06, + "loss": 2.1522, + "step": 5492 + }, + { + "epoch": 0.29468884120171673, + "grad_norm": 0.375, + "learning_rate": 4.989028186025221e-06, + "loss": 2.6292, + "step": 5493 + }, + { + "epoch": 0.29474248927038627, + "grad_norm": 0.400390625, + "learning_rate": 4.989020054168957e-06, + "loss": 2.1152, + "step": 5494 + }, + { + "epoch": 0.2947961373390558, + "grad_norm": 0.494140625, + "learning_rate": 4.989011919306942e-06, + "loss": 2.3413, + "step": 5495 + }, + { + "epoch": 0.29484978540772533, + "grad_norm": 0.404296875, + "learning_rate": 4.9890037814391875e-06, + "loss": 2.5182, + "step": 5496 + }, + { + "epoch": 0.29490343347639486, + "grad_norm": 0.451171875, + "learning_rate": 4.988995640565702e-06, + "loss": 1.8741, + "step": 5497 + }, + { + "epoch": 0.2949570815450644, + "grad_norm": 0.408203125, + "learning_rate": 4.988987496686497e-06, + "loss": 2.2811, + "step": 5498 + }, + { + "epoch": 0.2950107296137339, + "grad_norm": 0.34375, + "learning_rate": 4.988979349801581e-06, + "loss": 2.2158, + "step": 5499 + }, + { + "epoch": 0.29506437768240346, + "grad_norm": 0.380859375, + "learning_rate": 4.988971199910964e-06, + "loss": 2.5032, + "step": 5500 + }, + { + "epoch": 0.295118025751073, + "grad_norm": 0.388671875, + "learning_rate": 4.988963047014656e-06, + "loss": 2.1703, + "step": 5501 + }, + { + "epoch": 0.29517167381974246, + "grad_norm": 0.388671875, + "learning_rate": 4.988954891112667e-06, + "loss": 2.2838, + "step": 5502 + }, + { + "epoch": 0.295225321888412, + "grad_norm": 0.40234375, + "learning_rate": 4.988946732205007e-06, + "loss": 2.3842, + "step": 5503 + }, + { + "epoch": 0.2952789699570815, + "grad_norm": 0.462890625, + "learning_rate": 4.988938570291686e-06, + "loss": 2.3135, + "step": 5504 + }, + { + "epoch": 0.29533261802575106, + "grad_norm": 0.3203125, + "learning_rate": 4.988930405372712e-06, + "loss": 2.1417, + "step": 5505 + }, + { + "epoch": 0.2953862660944206, + "grad_norm": 0.388671875, + "learning_rate": 4.988922237448097e-06, + "loss": 2.1442, + "step": 5506 + }, + { + "epoch": 0.2954399141630901, + "grad_norm": 0.404296875, + "learning_rate": 4.98891406651785e-06, + "loss": 2.4037, + "step": 5507 + }, + { + "epoch": 0.29549356223175965, + "grad_norm": 0.357421875, + "learning_rate": 4.988905892581982e-06, + "loss": 2.331, + "step": 5508 + }, + { + "epoch": 0.2955472103004292, + "grad_norm": 0.3515625, + "learning_rate": 4.988897715640502e-06, + "loss": 2.156, + "step": 5509 + }, + { + "epoch": 0.2956008583690987, + "grad_norm": 0.490234375, + "learning_rate": 4.988889535693419e-06, + "loss": 2.3425, + "step": 5510 + }, + { + "epoch": 0.29565450643776825, + "grad_norm": 0.416015625, + "learning_rate": 4.988881352740743e-06, + "loss": 2.3246, + "step": 5511 + }, + { + "epoch": 0.2957081545064378, + "grad_norm": 0.37890625, + "learning_rate": 4.988873166782485e-06, + "loss": 2.4224, + "step": 5512 + }, + { + "epoch": 0.2957618025751073, + "grad_norm": 1.265625, + "learning_rate": 4.988864977818655e-06, + "loss": 1.9453, + "step": 5513 + }, + { + "epoch": 0.29581545064377684, + "grad_norm": 0.31640625, + "learning_rate": 4.988856785849262e-06, + "loss": 2.2495, + "step": 5514 + }, + { + "epoch": 0.2958690987124464, + "grad_norm": 0.392578125, + "learning_rate": 4.988848590874317e-06, + "loss": 2.4393, + "step": 5515 + }, + { + "epoch": 0.29592274678111585, + "grad_norm": 0.3828125, + "learning_rate": 4.988840392893828e-06, + "loss": 2.1393, + "step": 5516 + }, + { + "epoch": 0.2959763948497854, + "grad_norm": 0.39453125, + "learning_rate": 4.988832191907807e-06, + "loss": 2.4642, + "step": 5517 + }, + { + "epoch": 0.2960300429184549, + "grad_norm": 0.40625, + "learning_rate": 4.988823987916261e-06, + "loss": 1.7133, + "step": 5518 + }, + { + "epoch": 0.29608369098712445, + "grad_norm": 0.349609375, + "learning_rate": 4.988815780919204e-06, + "loss": 2.2095, + "step": 5519 + }, + { + "epoch": 0.296137339055794, + "grad_norm": 0.470703125, + "learning_rate": 4.988807570916643e-06, + "loss": 2.4877, + "step": 5520 + }, + { + "epoch": 0.2961909871244635, + "grad_norm": 0.3828125, + "learning_rate": 4.988799357908588e-06, + "loss": 2.1672, + "step": 5521 + }, + { + "epoch": 0.29624463519313304, + "grad_norm": 0.369140625, + "learning_rate": 4.988791141895051e-06, + "loss": 2.4011, + "step": 5522 + }, + { + "epoch": 0.2962982832618026, + "grad_norm": 0.384765625, + "learning_rate": 4.98878292287604e-06, + "loss": 2.3577, + "step": 5523 + }, + { + "epoch": 0.2963519313304721, + "grad_norm": 0.5546875, + "learning_rate": 4.988774700851564e-06, + "loss": 2.2826, + "step": 5524 + }, + { + "epoch": 0.29640557939914164, + "grad_norm": 0.60546875, + "learning_rate": 4.9887664758216355e-06, + "loss": 2.3507, + "step": 5525 + }, + { + "epoch": 0.29645922746781117, + "grad_norm": 0.408203125, + "learning_rate": 4.988758247786264e-06, + "loss": 2.4626, + "step": 5526 + }, + { + "epoch": 0.2965128755364807, + "grad_norm": 0.3984375, + "learning_rate": 4.988750016745457e-06, + "loss": 2.4414, + "step": 5527 + }, + { + "epoch": 0.29656652360515023, + "grad_norm": 0.349609375, + "learning_rate": 4.988741782699227e-06, + "loss": 2.399, + "step": 5528 + }, + { + "epoch": 0.29662017167381977, + "grad_norm": 0.412109375, + "learning_rate": 4.988733545647583e-06, + "loss": 2.2451, + "step": 5529 + }, + { + "epoch": 0.2966738197424893, + "grad_norm": 0.431640625, + "learning_rate": 4.988725305590535e-06, + "loss": 2.0191, + "step": 5530 + }, + { + "epoch": 0.2967274678111588, + "grad_norm": 0.39453125, + "learning_rate": 4.988717062528093e-06, + "loss": 2.4675, + "step": 5531 + }, + { + "epoch": 0.2967811158798283, + "grad_norm": 0.390625, + "learning_rate": 4.9887088164602675e-06, + "loss": 2.3073, + "step": 5532 + }, + { + "epoch": 0.29683476394849784, + "grad_norm": 0.41796875, + "learning_rate": 4.988700567387068e-06, + "loss": 2.144, + "step": 5533 + }, + { + "epoch": 0.29688841201716737, + "grad_norm": 0.4140625, + "learning_rate": 4.988692315308503e-06, + "loss": 2.3096, + "step": 5534 + }, + { + "epoch": 0.2969420600858369, + "grad_norm": 0.326171875, + "learning_rate": 4.988684060224585e-06, + "loss": 2.3597, + "step": 5535 + }, + { + "epoch": 0.29699570815450643, + "grad_norm": 0.3359375, + "learning_rate": 4.988675802135323e-06, + "loss": 2.341, + "step": 5536 + }, + { + "epoch": 0.29704935622317596, + "grad_norm": 0.41015625, + "learning_rate": 4.988667541040726e-06, + "loss": 2.3819, + "step": 5537 + }, + { + "epoch": 0.2971030042918455, + "grad_norm": 0.365234375, + "learning_rate": 4.988659276940805e-06, + "loss": 2.4421, + "step": 5538 + }, + { + "epoch": 0.29715665236051503, + "grad_norm": 0.384765625, + "learning_rate": 4.98865100983557e-06, + "loss": 2.1588, + "step": 5539 + }, + { + "epoch": 0.29721030042918456, + "grad_norm": 0.68359375, + "learning_rate": 4.988642739725031e-06, + "loss": 2.0428, + "step": 5540 + }, + { + "epoch": 0.2972639484978541, + "grad_norm": 1.296875, + "learning_rate": 4.9886344666091965e-06, + "loss": 2.167, + "step": 5541 + }, + { + "epoch": 0.2973175965665236, + "grad_norm": 0.392578125, + "learning_rate": 4.9886261904880785e-06, + "loss": 2.4741, + "step": 5542 + }, + { + "epoch": 0.29737124463519315, + "grad_norm": 0.396484375, + "learning_rate": 4.988617911361686e-06, + "loss": 2.2194, + "step": 5543 + }, + { + "epoch": 0.2974248927038627, + "grad_norm": 0.421875, + "learning_rate": 4.988609629230029e-06, + "loss": 2.2442, + "step": 5544 + }, + { + "epoch": 0.29747854077253216, + "grad_norm": 0.5, + "learning_rate": 4.988601344093118e-06, + "loss": 2.4616, + "step": 5545 + }, + { + "epoch": 0.2975321888412017, + "grad_norm": 0.7578125, + "learning_rate": 4.988593055950963e-06, + "loss": 2.2821, + "step": 5546 + }, + { + "epoch": 0.2975858369098712, + "grad_norm": 0.345703125, + "learning_rate": 4.9885847648035736e-06, + "loss": 2.2436, + "step": 5547 + }, + { + "epoch": 0.29763948497854076, + "grad_norm": 0.5703125, + "learning_rate": 4.98857647065096e-06, + "loss": 2.3739, + "step": 5548 + }, + { + "epoch": 0.2976931330472103, + "grad_norm": 0.392578125, + "learning_rate": 4.988568173493131e-06, + "loss": 2.2274, + "step": 5549 + }, + { + "epoch": 0.2977467811158798, + "grad_norm": 0.46484375, + "learning_rate": 4.9885598733300985e-06, + "loss": 2.2085, + "step": 5550 + }, + { + "epoch": 0.29780042918454935, + "grad_norm": 0.396484375, + "learning_rate": 4.988551570161872e-06, + "loss": 2.4653, + "step": 5551 + }, + { + "epoch": 0.2978540772532189, + "grad_norm": 0.5078125, + "learning_rate": 4.988543263988461e-06, + "loss": 2.6924, + "step": 5552 + }, + { + "epoch": 0.2979077253218884, + "grad_norm": 0.3984375, + "learning_rate": 4.988534954809876e-06, + "loss": 2.4047, + "step": 5553 + }, + { + "epoch": 0.29796137339055795, + "grad_norm": 0.376953125, + "learning_rate": 4.988526642626126e-06, + "loss": 2.2127, + "step": 5554 + }, + { + "epoch": 0.2980150214592275, + "grad_norm": 0.53125, + "learning_rate": 4.988518327437223e-06, + "loss": 1.5991, + "step": 5555 + }, + { + "epoch": 0.298068669527897, + "grad_norm": 0.412109375, + "learning_rate": 4.988510009243176e-06, + "loss": 2.2037, + "step": 5556 + }, + { + "epoch": 0.29812231759656654, + "grad_norm": 0.37890625, + "learning_rate": 4.988501688043994e-06, + "loss": 2.2008, + "step": 5557 + }, + { + "epoch": 0.2981759656652361, + "grad_norm": 0.369140625, + "learning_rate": 4.9884933638396895e-06, + "loss": 2.1986, + "step": 5558 + }, + { + "epoch": 0.29822961373390555, + "grad_norm": 0.48828125, + "learning_rate": 4.98848503663027e-06, + "loss": 2.0959, + "step": 5559 + }, + { + "epoch": 0.2982832618025751, + "grad_norm": 0.349609375, + "learning_rate": 4.988476706415747e-06, + "loss": 2.4213, + "step": 5560 + }, + { + "epoch": 0.2983369098712446, + "grad_norm": 0.439453125, + "learning_rate": 4.98846837319613e-06, + "loss": 2.2541, + "step": 5561 + }, + { + "epoch": 0.29839055793991415, + "grad_norm": 0.451171875, + "learning_rate": 4.988460036971428e-06, + "loss": 2.446, + "step": 5562 + }, + { + "epoch": 0.2984442060085837, + "grad_norm": 0.36328125, + "learning_rate": 4.988451697741655e-06, + "loss": 2.371, + "step": 5563 + }, + { + "epoch": 0.2984978540772532, + "grad_norm": 0.37109375, + "learning_rate": 4.9884433555068164e-06, + "loss": 2.1989, + "step": 5564 + }, + { + "epoch": 0.29855150214592274, + "grad_norm": 0.53515625, + "learning_rate": 4.988435010266926e-06, + "loss": 2.5868, + "step": 5565 + }, + { + "epoch": 0.2986051502145923, + "grad_norm": 0.404296875, + "learning_rate": 4.98842666202199e-06, + "loss": 2.5315, + "step": 5566 + }, + { + "epoch": 0.2986587982832618, + "grad_norm": 0.384765625, + "learning_rate": 4.9884183107720215e-06, + "loss": 2.4492, + "step": 5567 + }, + { + "epoch": 0.29871244635193134, + "grad_norm": 0.474609375, + "learning_rate": 4.988409956517031e-06, + "loss": 2.1976, + "step": 5568 + }, + { + "epoch": 0.29876609442060087, + "grad_norm": 1.1484375, + "learning_rate": 4.988401599257025e-06, + "loss": 2.1513, + "step": 5569 + }, + { + "epoch": 0.2988197424892704, + "grad_norm": 0.412109375, + "learning_rate": 4.9883932389920185e-06, + "loss": 2.62, + "step": 5570 + }, + { + "epoch": 0.29887339055793993, + "grad_norm": 0.82421875, + "learning_rate": 4.9883848757220175e-06, + "loss": 2.5152, + "step": 5571 + }, + { + "epoch": 0.29892703862660946, + "grad_norm": 0.4296875, + "learning_rate": 4.988376509447034e-06, + "loss": 2.4853, + "step": 5572 + }, + { + "epoch": 0.298980686695279, + "grad_norm": 0.44921875, + "learning_rate": 4.988368140167077e-06, + "loss": 2.2239, + "step": 5573 + }, + { + "epoch": 0.2990343347639485, + "grad_norm": 0.337890625, + "learning_rate": 4.988359767882158e-06, + "loss": 2.1448, + "step": 5574 + }, + { + "epoch": 0.299087982832618, + "grad_norm": 0.380859375, + "learning_rate": 4.988351392592286e-06, + "loss": 2.4567, + "step": 5575 + }, + { + "epoch": 0.29914163090128754, + "grad_norm": 0.412109375, + "learning_rate": 4.988343014297472e-06, + "loss": 2.5433, + "step": 5576 + }, + { + "epoch": 0.29919527896995707, + "grad_norm": 0.83203125, + "learning_rate": 4.988334632997726e-06, + "loss": 2.2144, + "step": 5577 + }, + { + "epoch": 0.2992489270386266, + "grad_norm": 0.376953125, + "learning_rate": 4.988326248693056e-06, + "loss": 2.3709, + "step": 5578 + }, + { + "epoch": 0.29930257510729613, + "grad_norm": 0.42578125, + "learning_rate": 4.988317861383475e-06, + "loss": 2.4628, + "step": 5579 + }, + { + "epoch": 0.29935622317596566, + "grad_norm": 0.396484375, + "learning_rate": 4.988309471068993e-06, + "loss": 2.2669, + "step": 5580 + }, + { + "epoch": 0.2994098712446352, + "grad_norm": 0.3515625, + "learning_rate": 4.988301077749618e-06, + "loss": 2.2084, + "step": 5581 + }, + { + "epoch": 0.2994635193133047, + "grad_norm": 0.37890625, + "learning_rate": 4.9882926814253615e-06, + "loss": 2.2912, + "step": 5582 + }, + { + "epoch": 0.29951716738197426, + "grad_norm": 0.392578125, + "learning_rate": 4.988284282096233e-06, + "loss": 2.144, + "step": 5583 + }, + { + "epoch": 0.2995708154506438, + "grad_norm": 0.353515625, + "learning_rate": 4.988275879762244e-06, + "loss": 2.2317, + "step": 5584 + }, + { + "epoch": 0.2996244635193133, + "grad_norm": 0.380859375, + "learning_rate": 4.988267474423403e-06, + "loss": 2.2227, + "step": 5585 + }, + { + "epoch": 0.29967811158798285, + "grad_norm": 0.326171875, + "learning_rate": 4.988259066079722e-06, + "loss": 2.2846, + "step": 5586 + }, + { + "epoch": 0.2997317596566524, + "grad_norm": 0.416015625, + "learning_rate": 4.988250654731208e-06, + "loss": 2.1465, + "step": 5587 + }, + { + "epoch": 0.29978540772532186, + "grad_norm": 0.375, + "learning_rate": 4.988242240377875e-06, + "loss": 2.3301, + "step": 5588 + }, + { + "epoch": 0.2998390557939914, + "grad_norm": 0.390625, + "learning_rate": 4.98823382301973e-06, + "loss": 2.3378, + "step": 5589 + }, + { + "epoch": 0.2998927038626609, + "grad_norm": 0.435546875, + "learning_rate": 4.9882254026567855e-06, + "loss": 2.0959, + "step": 5590 + }, + { + "epoch": 0.29994635193133046, + "grad_norm": 0.3984375, + "learning_rate": 4.988216979289051e-06, + "loss": 2.0896, + "step": 5591 + }, + { + "epoch": 0.3, + "grad_norm": 0.41015625, + "learning_rate": 4.988208552916535e-06, + "loss": 2.2988, + "step": 5592 + }, + { + "epoch": 0.3000536480686695, + "grad_norm": 3.375, + "learning_rate": 4.9882001235392505e-06, + "loss": 2.2682, + "step": 5593 + }, + { + "epoch": 0.30010729613733905, + "grad_norm": 0.39453125, + "learning_rate": 4.988191691157205e-06, + "loss": 2.4064, + "step": 5594 + }, + { + "epoch": 0.3001609442060086, + "grad_norm": 0.455078125, + "learning_rate": 4.98818325577041e-06, + "loss": 2.2192, + "step": 5595 + }, + { + "epoch": 0.3002145922746781, + "grad_norm": 0.455078125, + "learning_rate": 4.9881748173788765e-06, + "loss": 2.6275, + "step": 5596 + }, + { + "epoch": 0.30026824034334765, + "grad_norm": 0.5078125, + "learning_rate": 4.988166375982613e-06, + "loss": 2.2648, + "step": 5597 + }, + { + "epoch": 0.3003218884120172, + "grad_norm": 0.404296875, + "learning_rate": 4.988157931581631e-06, + "loss": 2.6306, + "step": 5598 + }, + { + "epoch": 0.3003755364806867, + "grad_norm": 0.384765625, + "learning_rate": 4.988149484175939e-06, + "loss": 2.3033, + "step": 5599 + }, + { + "epoch": 0.30042918454935624, + "grad_norm": 0.52734375, + "learning_rate": 4.98814103376555e-06, + "loss": 2.2222, + "step": 5600 + }, + { + "epoch": 0.3004828326180258, + "grad_norm": 0.390625, + "learning_rate": 4.9881325803504715e-06, + "loss": 2.3246, + "step": 5601 + }, + { + "epoch": 0.3005364806866953, + "grad_norm": 0.482421875, + "learning_rate": 4.988124123930716e-06, + "loss": 2.4433, + "step": 5602 + }, + { + "epoch": 0.3005901287553648, + "grad_norm": 0.453125, + "learning_rate": 4.988115664506291e-06, + "loss": 2.3004, + "step": 5603 + }, + { + "epoch": 0.3006437768240343, + "grad_norm": 0.37109375, + "learning_rate": 4.988107202077208e-06, + "loss": 2.3417, + "step": 5604 + }, + { + "epoch": 0.30069742489270385, + "grad_norm": 0.392578125, + "learning_rate": 4.9880987366434784e-06, + "loss": 2.4214, + "step": 5605 + }, + { + "epoch": 0.3007510729613734, + "grad_norm": 0.40234375, + "learning_rate": 4.988090268205111e-06, + "loss": 2.4072, + "step": 5606 + }, + { + "epoch": 0.3008047210300429, + "grad_norm": 2.53125, + "learning_rate": 4.988081796762116e-06, + "loss": 2.198, + "step": 5607 + }, + { + "epoch": 0.30085836909871244, + "grad_norm": 0.40625, + "learning_rate": 4.988073322314505e-06, + "loss": 2.4198, + "step": 5608 + }, + { + "epoch": 0.300912017167382, + "grad_norm": 0.34375, + "learning_rate": 4.988064844862288e-06, + "loss": 2.1722, + "step": 5609 + }, + { + "epoch": 0.3009656652360515, + "grad_norm": 0.40625, + "learning_rate": 4.988056364405473e-06, + "loss": 2.4885, + "step": 5610 + }, + { + "epoch": 0.30101931330472104, + "grad_norm": 0.404296875, + "learning_rate": 4.988047880944073e-06, + "loss": 2.5419, + "step": 5611 + }, + { + "epoch": 0.30107296137339057, + "grad_norm": 0.373046875, + "learning_rate": 4.988039394478096e-06, + "loss": 2.2525, + "step": 5612 + }, + { + "epoch": 0.3011266094420601, + "grad_norm": 0.37890625, + "learning_rate": 4.988030905007554e-06, + "loss": 2.2694, + "step": 5613 + }, + { + "epoch": 0.30118025751072963, + "grad_norm": 0.369140625, + "learning_rate": 4.988022412532456e-06, + "loss": 2.4119, + "step": 5614 + }, + { + "epoch": 0.30123390557939916, + "grad_norm": 0.40625, + "learning_rate": 4.988013917052813e-06, + "loss": 2.0315, + "step": 5615 + }, + { + "epoch": 0.3012875536480687, + "grad_norm": 0.388671875, + "learning_rate": 4.9880054185686356e-06, + "loss": 2.2087, + "step": 5616 + }, + { + "epoch": 0.30134120171673817, + "grad_norm": 0.4453125, + "learning_rate": 4.987996917079934e-06, + "loss": 1.9753, + "step": 5617 + }, + { + "epoch": 0.3013948497854077, + "grad_norm": 0.3984375, + "learning_rate": 4.987988412586716e-06, + "loss": 2.1734, + "step": 5618 + }, + { + "epoch": 0.30144849785407724, + "grad_norm": 0.326171875, + "learning_rate": 4.987979905088996e-06, + "loss": 2.2211, + "step": 5619 + }, + { + "epoch": 0.30150214592274677, + "grad_norm": 0.412109375, + "learning_rate": 4.98797139458678e-06, + "loss": 2.45, + "step": 5620 + }, + { + "epoch": 0.3015557939914163, + "grad_norm": 0.392578125, + "learning_rate": 4.987962881080082e-06, + "loss": 2.2114, + "step": 5621 + }, + { + "epoch": 0.30160944206008583, + "grad_norm": 0.33203125, + "learning_rate": 4.98795436456891e-06, + "loss": 2.1204, + "step": 5622 + }, + { + "epoch": 0.30166309012875536, + "grad_norm": 0.390625, + "learning_rate": 4.987945845053276e-06, + "loss": 2.4185, + "step": 5623 + }, + { + "epoch": 0.3017167381974249, + "grad_norm": 0.326171875, + "learning_rate": 4.987937322533188e-06, + "loss": 2.2328, + "step": 5624 + }, + { + "epoch": 0.3017703862660944, + "grad_norm": 0.419921875, + "learning_rate": 4.987928797008659e-06, + "loss": 2.2284, + "step": 5625 + }, + { + "epoch": 0.30182403433476396, + "grad_norm": 0.34375, + "learning_rate": 4.987920268479697e-06, + "loss": 2.2375, + "step": 5626 + }, + { + "epoch": 0.3018776824034335, + "grad_norm": 0.51953125, + "learning_rate": 4.987911736946312e-06, + "loss": 2.2394, + "step": 5627 + }, + { + "epoch": 0.301931330472103, + "grad_norm": 0.482421875, + "learning_rate": 4.987903202408517e-06, + "loss": 2.2078, + "step": 5628 + }, + { + "epoch": 0.30198497854077255, + "grad_norm": 0.37890625, + "learning_rate": 4.987894664866321e-06, + "loss": 2.3342, + "step": 5629 + }, + { + "epoch": 0.3020386266094421, + "grad_norm": 0.421875, + "learning_rate": 4.987886124319733e-06, + "loss": 2.1657, + "step": 5630 + }, + { + "epoch": 0.30209227467811156, + "grad_norm": 0.349609375, + "learning_rate": 4.987877580768765e-06, + "loss": 2.213, + "step": 5631 + }, + { + "epoch": 0.3021459227467811, + "grad_norm": 0.4140625, + "learning_rate": 4.987869034213426e-06, + "loss": 2.3023, + "step": 5632 + }, + { + "epoch": 0.3021995708154506, + "grad_norm": 0.4609375, + "learning_rate": 4.9878604846537285e-06, + "loss": 2.1562, + "step": 5633 + }, + { + "epoch": 0.30225321888412016, + "grad_norm": 0.357421875, + "learning_rate": 4.98785193208968e-06, + "loss": 2.2908, + "step": 5634 + }, + { + "epoch": 0.3023068669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.987843376521293e-06, + "loss": 2.1841, + "step": 5635 + }, + { + "epoch": 0.3023605150214592, + "grad_norm": 0.421875, + "learning_rate": 4.987834817948576e-06, + "loss": 2.0346, + "step": 5636 + }, + { + "epoch": 0.30241416309012875, + "grad_norm": 0.408203125, + "learning_rate": 4.987826256371541e-06, + "loss": 2.3717, + "step": 5637 + }, + { + "epoch": 0.3024678111587983, + "grad_norm": 0.384765625, + "learning_rate": 4.987817691790198e-06, + "loss": 2.2898, + "step": 5638 + }, + { + "epoch": 0.3025214592274678, + "grad_norm": 0.328125, + "learning_rate": 4.987809124204557e-06, + "loss": 2.216, + "step": 5639 + }, + { + "epoch": 0.30257510729613735, + "grad_norm": 0.396484375, + "learning_rate": 4.987800553614628e-06, + "loss": 2.5659, + "step": 5640 + }, + { + "epoch": 0.3026287553648069, + "grad_norm": 0.447265625, + "learning_rate": 4.987791980020421e-06, + "loss": 2.2881, + "step": 5641 + }, + { + "epoch": 0.3026824034334764, + "grad_norm": 0.41015625, + "learning_rate": 4.9877834034219486e-06, + "loss": 2.5894, + "step": 5642 + }, + { + "epoch": 0.30273605150214594, + "grad_norm": 0.3671875, + "learning_rate": 4.987774823819219e-06, + "loss": 2.3253, + "step": 5643 + }, + { + "epoch": 0.3027896995708155, + "grad_norm": 0.34765625, + "learning_rate": 4.987766241212243e-06, + "loss": 2.2499, + "step": 5644 + }, + { + "epoch": 0.302843347639485, + "grad_norm": 0.5703125, + "learning_rate": 4.987757655601031e-06, + "loss": 2.0465, + "step": 5645 + }, + { + "epoch": 0.3028969957081545, + "grad_norm": 0.47265625, + "learning_rate": 4.987749066985594e-06, + "loss": 2.4995, + "step": 5646 + }, + { + "epoch": 0.302950643776824, + "grad_norm": 0.37109375, + "learning_rate": 4.987740475365942e-06, + "loss": 2.4702, + "step": 5647 + }, + { + "epoch": 0.30300429184549355, + "grad_norm": 0.333984375, + "learning_rate": 4.987731880742085e-06, + "loss": 2.3058, + "step": 5648 + }, + { + "epoch": 0.3030579399141631, + "grad_norm": 0.41796875, + "learning_rate": 4.987723283114033e-06, + "loss": 2.3935, + "step": 5649 + }, + { + "epoch": 0.3031115879828326, + "grad_norm": 0.412109375, + "learning_rate": 4.987714682481798e-06, + "loss": 2.3572, + "step": 5650 + }, + { + "epoch": 0.30316523605150214, + "grad_norm": 0.408203125, + "learning_rate": 4.987706078845389e-06, + "loss": 2.4312, + "step": 5651 + }, + { + "epoch": 0.3032188841201717, + "grad_norm": 0.404296875, + "learning_rate": 4.9876974722048175e-06, + "loss": 2.5378, + "step": 5652 + }, + { + "epoch": 0.3032725321888412, + "grad_norm": 0.3828125, + "learning_rate": 4.987688862560092e-06, + "loss": 2.1198, + "step": 5653 + }, + { + "epoch": 0.30332618025751074, + "grad_norm": 0.376953125, + "learning_rate": 4.987680249911226e-06, + "loss": 2.3307, + "step": 5654 + }, + { + "epoch": 0.30337982832618027, + "grad_norm": 0.390625, + "learning_rate": 4.987671634258226e-06, + "loss": 2.2746, + "step": 5655 + }, + { + "epoch": 0.3034334763948498, + "grad_norm": 0.578125, + "learning_rate": 4.987663015601105e-06, + "loss": 1.4796, + "step": 5656 + }, + { + "epoch": 0.30348712446351933, + "grad_norm": 0.43359375, + "learning_rate": 4.9876543939398734e-06, + "loss": 2.6031, + "step": 5657 + }, + { + "epoch": 0.30354077253218886, + "grad_norm": 0.396484375, + "learning_rate": 4.987645769274541e-06, + "loss": 2.3557, + "step": 5658 + }, + { + "epoch": 0.3035944206008584, + "grad_norm": 0.361328125, + "learning_rate": 4.987637141605117e-06, + "loss": 2.2857, + "step": 5659 + }, + { + "epoch": 0.30364806866952787, + "grad_norm": 0.361328125, + "learning_rate": 4.987628510931614e-06, + "loss": 2.3047, + "step": 5660 + }, + { + "epoch": 0.3037017167381974, + "grad_norm": 0.341796875, + "learning_rate": 4.987619877254042e-06, + "loss": 2.1268, + "step": 5661 + }, + { + "epoch": 0.30375536480686693, + "grad_norm": 0.455078125, + "learning_rate": 4.98761124057241e-06, + "loss": 2.5907, + "step": 5662 + }, + { + "epoch": 0.30380901287553647, + "grad_norm": 0.419921875, + "learning_rate": 4.98760260088673e-06, + "loss": 2.439, + "step": 5663 + }, + { + "epoch": 0.303862660944206, + "grad_norm": 0.451171875, + "learning_rate": 4.987593958197011e-06, + "loss": 2.357, + "step": 5664 + }, + { + "epoch": 0.30391630901287553, + "grad_norm": 0.447265625, + "learning_rate": 4.987585312503265e-06, + "loss": 2.3085, + "step": 5665 + }, + { + "epoch": 0.30396995708154506, + "grad_norm": 0.451171875, + "learning_rate": 4.987576663805501e-06, + "loss": 2.1313, + "step": 5666 + }, + { + "epoch": 0.3040236051502146, + "grad_norm": 0.84375, + "learning_rate": 4.987568012103731e-06, + "loss": 2.2944, + "step": 5667 + }, + { + "epoch": 0.3040772532188841, + "grad_norm": 0.326171875, + "learning_rate": 4.987559357397963e-06, + "loss": 2.016, + "step": 5668 + }, + { + "epoch": 0.30413090128755366, + "grad_norm": 0.5546875, + "learning_rate": 4.98755069968821e-06, + "loss": 1.7294, + "step": 5669 + }, + { + "epoch": 0.3041845493562232, + "grad_norm": 0.314453125, + "learning_rate": 4.987542038974482e-06, + "loss": 2.2395, + "step": 5670 + }, + { + "epoch": 0.3042381974248927, + "grad_norm": 0.376953125, + "learning_rate": 4.987533375256789e-06, + "loss": 2.4964, + "step": 5671 + }, + { + "epoch": 0.30429184549356225, + "grad_norm": 0.427734375, + "learning_rate": 4.9875247085351405e-06, + "loss": 2.5126, + "step": 5672 + }, + { + "epoch": 0.3043454935622318, + "grad_norm": 0.53125, + "learning_rate": 4.9875160388095476e-06, + "loss": 2.3241, + "step": 5673 + }, + { + "epoch": 0.30439914163090126, + "grad_norm": 0.392578125, + "learning_rate": 4.987507366080022e-06, + "loss": 2.4754, + "step": 5674 + }, + { + "epoch": 0.3044527896995708, + "grad_norm": 0.3828125, + "learning_rate": 4.987498690346572e-06, + "loss": 2.3824, + "step": 5675 + }, + { + "epoch": 0.3045064377682403, + "grad_norm": 0.34375, + "learning_rate": 4.98749001160921e-06, + "loss": 2.2384, + "step": 5676 + }, + { + "epoch": 0.30456008583690986, + "grad_norm": 0.337890625, + "learning_rate": 4.987481329867946e-06, + "loss": 2.2222, + "step": 5677 + }, + { + "epoch": 0.3046137339055794, + "grad_norm": 0.361328125, + "learning_rate": 4.987472645122789e-06, + "loss": 2.3546, + "step": 5678 + }, + { + "epoch": 0.3046673819742489, + "grad_norm": 0.375, + "learning_rate": 4.987463957373753e-06, + "loss": 2.3351, + "step": 5679 + }, + { + "epoch": 0.30472103004291845, + "grad_norm": 0.390625, + "learning_rate": 4.987455266620844e-06, + "loss": 1.9552, + "step": 5680 + }, + { + "epoch": 0.304774678111588, + "grad_norm": 0.390625, + "learning_rate": 4.987446572864075e-06, + "loss": 1.9724, + "step": 5681 + }, + { + "epoch": 0.3048283261802575, + "grad_norm": 1.0703125, + "learning_rate": 4.987437876103457e-06, + "loss": 2.3785, + "step": 5682 + }, + { + "epoch": 0.30488197424892705, + "grad_norm": 0.50390625, + "learning_rate": 4.987429176338999e-06, + "loss": 2.1969, + "step": 5683 + }, + { + "epoch": 0.3049356223175966, + "grad_norm": 0.6484375, + "learning_rate": 4.987420473570713e-06, + "loss": 2.4466, + "step": 5684 + }, + { + "epoch": 0.3049892703862661, + "grad_norm": 0.384765625, + "learning_rate": 4.987411767798609e-06, + "loss": 2.2807, + "step": 5685 + }, + { + "epoch": 0.30504291845493564, + "grad_norm": 0.380859375, + "learning_rate": 4.987403059022696e-06, + "loss": 2.5399, + "step": 5686 + }, + { + "epoch": 0.3050965665236052, + "grad_norm": 0.375, + "learning_rate": 4.9873943472429865e-06, + "loss": 2.0847, + "step": 5687 + }, + { + "epoch": 0.3051502145922747, + "grad_norm": 0.703125, + "learning_rate": 4.98738563245949e-06, + "loss": 2.2599, + "step": 5688 + }, + { + "epoch": 0.3052038626609442, + "grad_norm": 0.40234375, + "learning_rate": 4.987376914672217e-06, + "loss": 2.434, + "step": 5689 + }, + { + "epoch": 0.3052575107296137, + "grad_norm": 0.298828125, + "learning_rate": 4.987368193881179e-06, + "loss": 2.3485, + "step": 5690 + }, + { + "epoch": 0.30531115879828324, + "grad_norm": 0.330078125, + "learning_rate": 4.987359470086385e-06, + "loss": 2.2319, + "step": 5691 + }, + { + "epoch": 0.3053648068669528, + "grad_norm": 0.396484375, + "learning_rate": 4.987350743287848e-06, + "loss": 2.3269, + "step": 5692 + }, + { + "epoch": 0.3054184549356223, + "grad_norm": 0.375, + "learning_rate": 4.9873420134855755e-06, + "loss": 2.4362, + "step": 5693 + }, + { + "epoch": 0.30547210300429184, + "grad_norm": 0.416015625, + "learning_rate": 4.98733328067958e-06, + "loss": 2.5959, + "step": 5694 + }, + { + "epoch": 0.30552575107296137, + "grad_norm": 0.443359375, + "learning_rate": 4.98732454486987e-06, + "loss": 2.3217, + "step": 5695 + }, + { + "epoch": 0.3055793991416309, + "grad_norm": 0.412109375, + "learning_rate": 4.9873158060564594e-06, + "loss": 2.3206, + "step": 5696 + }, + { + "epoch": 0.30563304721030043, + "grad_norm": 0.376953125, + "learning_rate": 4.987307064239356e-06, + "loss": 2.1767, + "step": 5697 + }, + { + "epoch": 0.30568669527896997, + "grad_norm": 0.384765625, + "learning_rate": 4.987298319418572e-06, + "loss": 2.1092, + "step": 5698 + }, + { + "epoch": 0.3057403433476395, + "grad_norm": 0.3671875, + "learning_rate": 4.9872895715941174e-06, + "loss": 2.2966, + "step": 5699 + }, + { + "epoch": 0.30579399141630903, + "grad_norm": 0.408203125, + "learning_rate": 4.9872808207660015e-06, + "loss": 2.2876, + "step": 5700 + }, + { + "epoch": 0.30584763948497856, + "grad_norm": 0.35546875, + "learning_rate": 4.9872720669342365e-06, + "loss": 2.2152, + "step": 5701 + }, + { + "epoch": 0.3059012875536481, + "grad_norm": 0.47265625, + "learning_rate": 4.987263310098832e-06, + "loss": 2.2315, + "step": 5702 + }, + { + "epoch": 0.30595493562231757, + "grad_norm": 0.458984375, + "learning_rate": 4.9872545502598e-06, + "loss": 2.4234, + "step": 5703 + }, + { + "epoch": 0.3060085836909871, + "grad_norm": 0.408203125, + "learning_rate": 4.987245787417149e-06, + "loss": 2.2367, + "step": 5704 + }, + { + "epoch": 0.30606223175965663, + "grad_norm": 0.46484375, + "learning_rate": 4.9872370215708915e-06, + "loss": 2.3065, + "step": 5705 + }, + { + "epoch": 0.30611587982832617, + "grad_norm": 0.52734375, + "learning_rate": 4.987228252721037e-06, + "loss": 2.2953, + "step": 5706 + }, + { + "epoch": 0.3061695278969957, + "grad_norm": 0.390625, + "learning_rate": 4.987219480867596e-06, + "loss": 2.3364, + "step": 5707 + }, + { + "epoch": 0.30622317596566523, + "grad_norm": 0.33203125, + "learning_rate": 4.98721070601058e-06, + "loss": 2.3334, + "step": 5708 + }, + { + "epoch": 0.30627682403433476, + "grad_norm": 0.36328125, + "learning_rate": 4.987201928149998e-06, + "loss": 2.1495, + "step": 5709 + }, + { + "epoch": 0.3063304721030043, + "grad_norm": 0.40625, + "learning_rate": 4.987193147285863e-06, + "loss": 2.5053, + "step": 5710 + }, + { + "epoch": 0.3063841201716738, + "grad_norm": 0.5625, + "learning_rate": 4.987184363418184e-06, + "loss": 2.2766, + "step": 5711 + }, + { + "epoch": 0.30643776824034336, + "grad_norm": 0.353515625, + "learning_rate": 4.987175576546971e-06, + "loss": 2.1887, + "step": 5712 + }, + { + "epoch": 0.3064914163090129, + "grad_norm": 0.421875, + "learning_rate": 4.9871667866722366e-06, + "loss": 2.1731, + "step": 5713 + }, + { + "epoch": 0.3065450643776824, + "grad_norm": 0.384765625, + "learning_rate": 4.987157993793988e-06, + "loss": 2.061, + "step": 5714 + }, + { + "epoch": 0.30659871244635195, + "grad_norm": 0.4609375, + "learning_rate": 4.987149197912241e-06, + "loss": 2.2199, + "step": 5715 + }, + { + "epoch": 0.3066523605150215, + "grad_norm": 0.40625, + "learning_rate": 4.9871403990270015e-06, + "loss": 2.4127, + "step": 5716 + }, + { + "epoch": 0.306706008583691, + "grad_norm": 0.357421875, + "learning_rate": 4.987131597138283e-06, + "loss": 2.4217, + "step": 5717 + }, + { + "epoch": 0.3067596566523605, + "grad_norm": 0.4609375, + "learning_rate": 4.987122792246094e-06, + "loss": 2.2901, + "step": 5718 + }, + { + "epoch": 0.30681330472103, + "grad_norm": 0.388671875, + "learning_rate": 4.987113984350447e-06, + "loss": 2.3649, + "step": 5719 + }, + { + "epoch": 0.30686695278969955, + "grad_norm": 0.37109375, + "learning_rate": 4.987105173451351e-06, + "loss": 2.2748, + "step": 5720 + }, + { + "epoch": 0.3069206008583691, + "grad_norm": 0.38671875, + "learning_rate": 4.987096359548817e-06, + "loss": 2.2874, + "step": 5721 + }, + { + "epoch": 0.3069742489270386, + "grad_norm": 0.3671875, + "learning_rate": 4.987087542642858e-06, + "loss": 2.531, + "step": 5722 + }, + { + "epoch": 0.30702789699570815, + "grad_norm": 0.8046875, + "learning_rate": 4.987078722733482e-06, + "loss": 2.3986, + "step": 5723 + }, + { + "epoch": 0.3070815450643777, + "grad_norm": 0.392578125, + "learning_rate": 4.9870698998207e-06, + "loss": 2.3885, + "step": 5724 + }, + { + "epoch": 0.3071351931330472, + "grad_norm": 0.322265625, + "learning_rate": 4.987061073904523e-06, + "loss": 2.1421, + "step": 5725 + }, + { + "epoch": 0.30718884120171674, + "grad_norm": 0.44140625, + "learning_rate": 4.987052244984962e-06, + "loss": 2.321, + "step": 5726 + }, + { + "epoch": 0.3072424892703863, + "grad_norm": 0.35546875, + "learning_rate": 4.987043413062028e-06, + "loss": 2.33, + "step": 5727 + }, + { + "epoch": 0.3072961373390558, + "grad_norm": 0.37109375, + "learning_rate": 4.98703457813573e-06, + "loss": 2.2865, + "step": 5728 + }, + { + "epoch": 0.30734978540772534, + "grad_norm": 0.4296875, + "learning_rate": 4.987025740206081e-06, + "loss": 2.345, + "step": 5729 + }, + { + "epoch": 0.30740343347639487, + "grad_norm": 0.388671875, + "learning_rate": 4.98701689927309e-06, + "loss": 2.261, + "step": 5730 + }, + { + "epoch": 0.3074570815450644, + "grad_norm": 0.3828125, + "learning_rate": 4.987008055336767e-06, + "loss": 2.2233, + "step": 5731 + }, + { + "epoch": 0.3075107296137339, + "grad_norm": 0.400390625, + "learning_rate": 4.986999208397124e-06, + "loss": 2.4403, + "step": 5732 + }, + { + "epoch": 0.3075643776824034, + "grad_norm": 0.443359375, + "learning_rate": 4.986990358454172e-06, + "loss": 2.2274, + "step": 5733 + }, + { + "epoch": 0.30761802575107294, + "grad_norm": 0.6640625, + "learning_rate": 4.986981505507921e-06, + "loss": 2.0065, + "step": 5734 + }, + { + "epoch": 0.3076716738197425, + "grad_norm": 0.3203125, + "learning_rate": 4.986972649558381e-06, + "loss": 1.9428, + "step": 5735 + }, + { + "epoch": 0.307725321888412, + "grad_norm": 0.40234375, + "learning_rate": 4.986963790605565e-06, + "loss": 2.1853, + "step": 5736 + }, + { + "epoch": 0.30777896995708154, + "grad_norm": 0.349609375, + "learning_rate": 4.986954928649481e-06, + "loss": 2.4052, + "step": 5737 + }, + { + "epoch": 0.30783261802575107, + "grad_norm": 0.400390625, + "learning_rate": 4.9869460636901425e-06, + "loss": 1.9517, + "step": 5738 + }, + { + "epoch": 0.3078862660944206, + "grad_norm": 0.4296875, + "learning_rate": 4.986937195727557e-06, + "loss": 2.3291, + "step": 5739 + }, + { + "epoch": 0.30793991416309013, + "grad_norm": 0.7109375, + "learning_rate": 4.986928324761737e-06, + "loss": 2.5004, + "step": 5740 + }, + { + "epoch": 0.30799356223175967, + "grad_norm": 0.326171875, + "learning_rate": 4.986919450792694e-06, + "loss": 2.133, + "step": 5741 + }, + { + "epoch": 0.3080472103004292, + "grad_norm": 0.412109375, + "learning_rate": 4.986910573820437e-06, + "loss": 2.3774, + "step": 5742 + }, + { + "epoch": 0.30810085836909873, + "grad_norm": 0.3828125, + "learning_rate": 4.986901693844978e-06, + "loss": 2.1724, + "step": 5743 + }, + { + "epoch": 0.30815450643776826, + "grad_norm": 0.427734375, + "learning_rate": 4.986892810866326e-06, + "loss": 2.2415, + "step": 5744 + }, + { + "epoch": 0.3082081545064378, + "grad_norm": 0.396484375, + "learning_rate": 4.986883924884494e-06, + "loss": 2.2236, + "step": 5745 + }, + { + "epoch": 0.30826180257510727, + "grad_norm": 0.353515625, + "learning_rate": 4.986875035899491e-06, + "loss": 2.2123, + "step": 5746 + }, + { + "epoch": 0.3083154506437768, + "grad_norm": 0.392578125, + "learning_rate": 4.986866143911329e-06, + "loss": 2.1365, + "step": 5747 + }, + { + "epoch": 0.30836909871244633, + "grad_norm": 0.53125, + "learning_rate": 4.986857248920019e-06, + "loss": 2.5949, + "step": 5748 + }, + { + "epoch": 0.30842274678111586, + "grad_norm": 0.40234375, + "learning_rate": 4.986848350925569e-06, + "loss": 2.3539, + "step": 5749 + }, + { + "epoch": 0.3084763948497854, + "grad_norm": 0.39453125, + "learning_rate": 4.986839449927992e-06, + "loss": 2.183, + "step": 5750 + }, + { + "epoch": 0.3085300429184549, + "grad_norm": 0.462890625, + "learning_rate": 4.986830545927299e-06, + "loss": 2.416, + "step": 5751 + }, + { + "epoch": 0.30858369098712446, + "grad_norm": 0.35546875, + "learning_rate": 4.9868216389235e-06, + "loss": 2.2651, + "step": 5752 + }, + { + "epoch": 0.308637339055794, + "grad_norm": 0.48046875, + "learning_rate": 4.9868127289166055e-06, + "loss": 2.1406, + "step": 5753 + }, + { + "epoch": 0.3086909871244635, + "grad_norm": 0.4296875, + "learning_rate": 4.986803815906627e-06, + "loss": 2.4142, + "step": 5754 + }, + { + "epoch": 0.30874463519313305, + "grad_norm": 0.474609375, + "learning_rate": 4.9867948998935745e-06, + "loss": 1.4018, + "step": 5755 + }, + { + "epoch": 0.3087982832618026, + "grad_norm": 0.4140625, + "learning_rate": 4.986785980877459e-06, + "loss": 2.269, + "step": 5756 + }, + { + "epoch": 0.3088519313304721, + "grad_norm": 4.0625, + "learning_rate": 4.986777058858292e-06, + "loss": 2.2237, + "step": 5757 + }, + { + "epoch": 0.30890557939914165, + "grad_norm": 0.396484375, + "learning_rate": 4.986768133836084e-06, + "loss": 2.2506, + "step": 5758 + }, + { + "epoch": 0.3089592274678112, + "grad_norm": 0.380859375, + "learning_rate": 4.986759205810845e-06, + "loss": 2.3742, + "step": 5759 + }, + { + "epoch": 0.3090128755364807, + "grad_norm": 0.37109375, + "learning_rate": 4.986750274782586e-06, + "loss": 2.2396, + "step": 5760 + }, + { + "epoch": 0.3090665236051502, + "grad_norm": 0.4921875, + "learning_rate": 4.986741340751319e-06, + "loss": 2.2061, + "step": 5761 + }, + { + "epoch": 0.3091201716738197, + "grad_norm": 0.40234375, + "learning_rate": 4.986732403717053e-06, + "loss": 2.218, + "step": 5762 + }, + { + "epoch": 0.30917381974248925, + "grad_norm": 0.451171875, + "learning_rate": 4.986723463679799e-06, + "loss": 2.0223, + "step": 5763 + }, + { + "epoch": 0.3092274678111588, + "grad_norm": 0.49609375, + "learning_rate": 4.986714520639569e-06, + "loss": 2.5558, + "step": 5764 + }, + { + "epoch": 0.3092811158798283, + "grad_norm": 0.33203125, + "learning_rate": 4.986705574596374e-06, + "loss": 2.1916, + "step": 5765 + }, + { + "epoch": 0.30933476394849785, + "grad_norm": 2.421875, + "learning_rate": 4.986696625550223e-06, + "loss": 2.251, + "step": 5766 + }, + { + "epoch": 0.3093884120171674, + "grad_norm": 0.482421875, + "learning_rate": 4.986687673501129e-06, + "loss": 2.5159, + "step": 5767 + }, + { + "epoch": 0.3094420600858369, + "grad_norm": 0.408203125, + "learning_rate": 4.9866787184491e-06, + "loss": 2.5296, + "step": 5768 + }, + { + "epoch": 0.30949570815450644, + "grad_norm": 0.51953125, + "learning_rate": 4.98666976039415e-06, + "loss": 2.5407, + "step": 5769 + }, + { + "epoch": 0.309549356223176, + "grad_norm": 0.421875, + "learning_rate": 4.986660799336288e-06, + "loss": 2.4714, + "step": 5770 + }, + { + "epoch": 0.3096030042918455, + "grad_norm": 0.3203125, + "learning_rate": 4.9866518352755245e-06, + "loss": 2.2583, + "step": 5771 + }, + { + "epoch": 0.30965665236051504, + "grad_norm": 0.384765625, + "learning_rate": 4.986642868211872e-06, + "loss": 2.2943, + "step": 5772 + }, + { + "epoch": 0.30971030042918457, + "grad_norm": 0.41015625, + "learning_rate": 4.986633898145339e-06, + "loss": 2.1909, + "step": 5773 + }, + { + "epoch": 0.3097639484978541, + "grad_norm": 0.69140625, + "learning_rate": 4.986624925075938e-06, + "loss": 2.4148, + "step": 5774 + }, + { + "epoch": 0.3098175965665236, + "grad_norm": 0.40234375, + "learning_rate": 4.9866159490036795e-06, + "loss": 2.3048, + "step": 5775 + }, + { + "epoch": 0.3098712446351931, + "grad_norm": 0.38671875, + "learning_rate": 4.986606969928574e-06, + "loss": 2.2982, + "step": 5776 + }, + { + "epoch": 0.30992489270386264, + "grad_norm": 0.427734375, + "learning_rate": 4.986597987850633e-06, + "loss": 2.6443, + "step": 5777 + }, + { + "epoch": 0.3099785407725322, + "grad_norm": 0.43359375, + "learning_rate": 4.986589002769867e-06, + "loss": 2.3696, + "step": 5778 + }, + { + "epoch": 0.3100321888412017, + "grad_norm": 0.34375, + "learning_rate": 4.986580014686287e-06, + "loss": 2.3792, + "step": 5779 + }, + { + "epoch": 0.31008583690987124, + "grad_norm": 0.37890625, + "learning_rate": 4.986571023599903e-06, + "loss": 2.3313, + "step": 5780 + }, + { + "epoch": 0.31013948497854077, + "grad_norm": 0.3984375, + "learning_rate": 4.986562029510726e-06, + "loss": 2.2198, + "step": 5781 + }, + { + "epoch": 0.3101931330472103, + "grad_norm": 0.416015625, + "learning_rate": 4.9865530324187686e-06, + "loss": 2.2028, + "step": 5782 + }, + { + "epoch": 0.31024678111587983, + "grad_norm": 0.328125, + "learning_rate": 4.98654403232404e-06, + "loss": 2.2236, + "step": 5783 + }, + { + "epoch": 0.31030042918454936, + "grad_norm": 0.408203125, + "learning_rate": 4.986535029226551e-06, + "loss": 2.1303, + "step": 5784 + }, + { + "epoch": 0.3103540772532189, + "grad_norm": 0.4375, + "learning_rate": 4.9865260231263135e-06, + "loss": 2.3633, + "step": 5785 + }, + { + "epoch": 0.31040772532188843, + "grad_norm": 0.4921875, + "learning_rate": 4.986517014023338e-06, + "loss": 2.4137, + "step": 5786 + }, + { + "epoch": 0.31046137339055796, + "grad_norm": 0.390625, + "learning_rate": 4.986508001917635e-06, + "loss": 2.3615, + "step": 5787 + }, + { + "epoch": 0.3105150214592275, + "grad_norm": 0.384765625, + "learning_rate": 4.986498986809216e-06, + "loss": 2.3853, + "step": 5788 + }, + { + "epoch": 0.310568669527897, + "grad_norm": 0.400390625, + "learning_rate": 4.986489968698091e-06, + "loss": 2.3479, + "step": 5789 + }, + { + "epoch": 0.3106223175965665, + "grad_norm": 0.341796875, + "learning_rate": 4.9864809475842715e-06, + "loss": 2.2468, + "step": 5790 + }, + { + "epoch": 0.31067596566523603, + "grad_norm": 0.353515625, + "learning_rate": 4.986471923467768e-06, + "loss": 2.3263, + "step": 5791 + }, + { + "epoch": 0.31072961373390556, + "grad_norm": 0.3984375, + "learning_rate": 4.986462896348593e-06, + "loss": 2.6901, + "step": 5792 + }, + { + "epoch": 0.3107832618025751, + "grad_norm": 0.390625, + "learning_rate": 4.986453866226755e-06, + "loss": 2.2894, + "step": 5793 + }, + { + "epoch": 0.3108369098712446, + "grad_norm": 0.337890625, + "learning_rate": 4.986444833102265e-06, + "loss": 2.1175, + "step": 5794 + }, + { + "epoch": 0.31089055793991416, + "grad_norm": 0.361328125, + "learning_rate": 4.9864357969751374e-06, + "loss": 2.2978, + "step": 5795 + }, + { + "epoch": 0.3109442060085837, + "grad_norm": 7.6875, + "learning_rate": 4.9864267578453785e-06, + "loss": 2.0307, + "step": 5796 + }, + { + "epoch": 0.3109978540772532, + "grad_norm": 0.76953125, + "learning_rate": 4.986417715713002e-06, + "loss": 2.1174, + "step": 5797 + }, + { + "epoch": 0.31105150214592275, + "grad_norm": 0.400390625, + "learning_rate": 4.986408670578018e-06, + "loss": 2.0813, + "step": 5798 + }, + { + "epoch": 0.3111051502145923, + "grad_norm": 0.3828125, + "learning_rate": 4.986399622440438e-06, + "loss": 2.1931, + "step": 5799 + }, + { + "epoch": 0.3111587982832618, + "grad_norm": 0.41796875, + "learning_rate": 4.986390571300272e-06, + "loss": 2.4634, + "step": 5800 + }, + { + "epoch": 0.31121244635193135, + "grad_norm": 0.373046875, + "learning_rate": 4.986381517157533e-06, + "loss": 2.3993, + "step": 5801 + }, + { + "epoch": 0.3112660944206009, + "grad_norm": 0.419921875, + "learning_rate": 4.9863724600122285e-06, + "loss": 2.5063, + "step": 5802 + }, + { + "epoch": 0.3113197424892704, + "grad_norm": 0.4453125, + "learning_rate": 4.986363399864372e-06, + "loss": 2.3504, + "step": 5803 + }, + { + "epoch": 0.3113733905579399, + "grad_norm": 0.40234375, + "learning_rate": 4.986354336713973e-06, + "loss": 2.4998, + "step": 5804 + }, + { + "epoch": 0.3114270386266094, + "grad_norm": 0.453125, + "learning_rate": 4.986345270561044e-06, + "loss": 2.4202, + "step": 5805 + }, + { + "epoch": 0.31148068669527895, + "grad_norm": 0.375, + "learning_rate": 4.986336201405595e-06, + "loss": 2.0803, + "step": 5806 + }, + { + "epoch": 0.3115343347639485, + "grad_norm": 0.365234375, + "learning_rate": 4.986327129247637e-06, + "loss": 2.3713, + "step": 5807 + }, + { + "epoch": 0.311587982832618, + "grad_norm": 0.65625, + "learning_rate": 4.986318054087181e-06, + "loss": 2.3517, + "step": 5808 + }, + { + "epoch": 0.31164163090128755, + "grad_norm": 0.3984375, + "learning_rate": 4.986308975924238e-06, + "loss": 2.1676, + "step": 5809 + }, + { + "epoch": 0.3116952789699571, + "grad_norm": 0.546875, + "learning_rate": 4.986299894758819e-06, + "loss": 2.2835, + "step": 5810 + }, + { + "epoch": 0.3117489270386266, + "grad_norm": 0.37890625, + "learning_rate": 4.9862908105909354e-06, + "loss": 2.304, + "step": 5811 + }, + { + "epoch": 0.31180257510729614, + "grad_norm": 0.408203125, + "learning_rate": 4.9862817234205975e-06, + "loss": 2.3045, + "step": 5812 + }, + { + "epoch": 0.3118562231759657, + "grad_norm": 0.53125, + "learning_rate": 4.986272633247817e-06, + "loss": 2.5095, + "step": 5813 + }, + { + "epoch": 0.3119098712446352, + "grad_norm": 0.375, + "learning_rate": 4.986263540072602e-06, + "loss": 2.4629, + "step": 5814 + }, + { + "epoch": 0.31196351931330474, + "grad_norm": 0.3359375, + "learning_rate": 4.986254443894967e-06, + "loss": 2.4672, + "step": 5815 + }, + { + "epoch": 0.31201716738197427, + "grad_norm": 0.34375, + "learning_rate": 4.986245344714924e-06, + "loss": 2.2185, + "step": 5816 + }, + { + "epoch": 0.3120708154506438, + "grad_norm": 0.453125, + "learning_rate": 4.986236242532479e-06, + "loss": 2.3174, + "step": 5817 + }, + { + "epoch": 0.3121244635193133, + "grad_norm": 0.43359375, + "learning_rate": 4.986227137347647e-06, + "loss": 2.38, + "step": 5818 + }, + { + "epoch": 0.3121781115879828, + "grad_norm": 0.57421875, + "learning_rate": 4.986218029160438e-06, + "loss": 2.2775, + "step": 5819 + }, + { + "epoch": 0.31223175965665234, + "grad_norm": 0.357421875, + "learning_rate": 4.986208917970862e-06, + "loss": 2.054, + "step": 5820 + }, + { + "epoch": 0.3122854077253219, + "grad_norm": 0.61328125, + "learning_rate": 4.9861998037789315e-06, + "loss": 2.1573, + "step": 5821 + }, + { + "epoch": 0.3123390557939914, + "grad_norm": 0.423828125, + "learning_rate": 4.986190686584657e-06, + "loss": 2.336, + "step": 5822 + }, + { + "epoch": 0.31239270386266094, + "grad_norm": 0.33984375, + "learning_rate": 4.986181566388049e-06, + "loss": 2.2439, + "step": 5823 + }, + { + "epoch": 0.31244635193133047, + "grad_norm": 0.70703125, + "learning_rate": 4.986172443189118e-06, + "loss": 2.1433, + "step": 5824 + }, + { + "epoch": 0.3125, + "grad_norm": 0.369140625, + "learning_rate": 4.986163316987877e-06, + "loss": 2.1988, + "step": 5825 + }, + { + "epoch": 0.31255364806866953, + "grad_norm": 0.419921875, + "learning_rate": 4.9861541877843345e-06, + "loss": 2.5071, + "step": 5826 + }, + { + "epoch": 0.31260729613733906, + "grad_norm": 0.55078125, + "learning_rate": 4.986145055578504e-06, + "loss": 2.2507, + "step": 5827 + }, + { + "epoch": 0.3126609442060086, + "grad_norm": 0.412109375, + "learning_rate": 4.986135920370395e-06, + "loss": 2.4206, + "step": 5828 + }, + { + "epoch": 0.3127145922746781, + "grad_norm": 0.328125, + "learning_rate": 4.986126782160019e-06, + "loss": 1.9057, + "step": 5829 + }, + { + "epoch": 0.31276824034334766, + "grad_norm": 0.390625, + "learning_rate": 4.986117640947388e-06, + "loss": 2.3605, + "step": 5830 + }, + { + "epoch": 0.3128218884120172, + "grad_norm": 0.376953125, + "learning_rate": 4.986108496732511e-06, + "loss": 2.4069, + "step": 5831 + }, + { + "epoch": 0.3128755364806867, + "grad_norm": 0.3671875, + "learning_rate": 4.9860993495154e-06, + "loss": 2.3195, + "step": 5832 + }, + { + "epoch": 0.3129291845493562, + "grad_norm": 0.3828125, + "learning_rate": 4.986090199296066e-06, + "loss": 2.3338, + "step": 5833 + }, + { + "epoch": 0.31298283261802573, + "grad_norm": 0.65625, + "learning_rate": 4.98608104607452e-06, + "loss": 2.3931, + "step": 5834 + }, + { + "epoch": 0.31303648068669526, + "grad_norm": 0.396484375, + "learning_rate": 4.986071889850775e-06, + "loss": 2.3927, + "step": 5835 + }, + { + "epoch": 0.3130901287553648, + "grad_norm": 0.546875, + "learning_rate": 4.986062730624838e-06, + "loss": 2.3354, + "step": 5836 + }, + { + "epoch": 0.3131437768240343, + "grad_norm": 0.435546875, + "learning_rate": 4.986053568396723e-06, + "loss": 2.4742, + "step": 5837 + }, + { + "epoch": 0.31319742489270386, + "grad_norm": 0.365234375, + "learning_rate": 4.986044403166441e-06, + "loss": 2.3433, + "step": 5838 + }, + { + "epoch": 0.3132510729613734, + "grad_norm": 0.490234375, + "learning_rate": 4.986035234934002e-06, + "loss": 1.7634, + "step": 5839 + }, + { + "epoch": 0.3133047210300429, + "grad_norm": 0.326171875, + "learning_rate": 4.9860260636994175e-06, + "loss": 2.094, + "step": 5840 + }, + { + "epoch": 0.31335836909871245, + "grad_norm": 0.421875, + "learning_rate": 4.9860168894626985e-06, + "loss": 2.388, + "step": 5841 + }, + { + "epoch": 0.313412017167382, + "grad_norm": 0.326171875, + "learning_rate": 4.986007712223857e-06, + "loss": 2.098, + "step": 5842 + }, + { + "epoch": 0.3134656652360515, + "grad_norm": 0.35546875, + "learning_rate": 4.985998531982902e-06, + "loss": 2.3066, + "step": 5843 + }, + { + "epoch": 0.31351931330472105, + "grad_norm": 0.33984375, + "learning_rate": 4.985989348739847e-06, + "loss": 2.2516, + "step": 5844 + }, + { + "epoch": 0.3135729613733906, + "grad_norm": 1.0546875, + "learning_rate": 4.985980162494701e-06, + "loss": 2.3719, + "step": 5845 + }, + { + "epoch": 0.3136266094420601, + "grad_norm": 0.423828125, + "learning_rate": 4.9859709732474775e-06, + "loss": 2.3339, + "step": 5846 + }, + { + "epoch": 0.3136802575107296, + "grad_norm": 0.435546875, + "learning_rate": 4.985961780998184e-06, + "loss": 2.1477, + "step": 5847 + }, + { + "epoch": 0.3137339055793991, + "grad_norm": 0.478515625, + "learning_rate": 4.985952585746835e-06, + "loss": 2.2111, + "step": 5848 + }, + { + "epoch": 0.31378755364806865, + "grad_norm": 0.314453125, + "learning_rate": 4.985943387493441e-06, + "loss": 2.1985, + "step": 5849 + }, + { + "epoch": 0.3138412017167382, + "grad_norm": 0.447265625, + "learning_rate": 4.985934186238012e-06, + "loss": 2.461, + "step": 5850 + }, + { + "epoch": 0.3138948497854077, + "grad_norm": 0.3828125, + "learning_rate": 4.985924981980559e-06, + "loss": 2.3494, + "step": 5851 + }, + { + "epoch": 0.31394849785407725, + "grad_norm": 0.41015625, + "learning_rate": 4.985915774721093e-06, + "loss": 2.364, + "step": 5852 + }, + { + "epoch": 0.3140021459227468, + "grad_norm": 0.41796875, + "learning_rate": 4.985906564459627e-06, + "loss": 2.1572, + "step": 5853 + }, + { + "epoch": 0.3140557939914163, + "grad_norm": 0.470703125, + "learning_rate": 4.9858973511961705e-06, + "loss": 2.4252, + "step": 5854 + }, + { + "epoch": 0.31410944206008584, + "grad_norm": 0.345703125, + "learning_rate": 4.985888134930735e-06, + "loss": 2.2127, + "step": 5855 + }, + { + "epoch": 0.3141630901287554, + "grad_norm": 0.326171875, + "learning_rate": 4.9858789156633325e-06, + "loss": 2.1703, + "step": 5856 + }, + { + "epoch": 0.3142167381974249, + "grad_norm": 0.40234375, + "learning_rate": 4.985869693393973e-06, + "loss": 2.3034, + "step": 5857 + }, + { + "epoch": 0.31427038626609444, + "grad_norm": 0.390625, + "learning_rate": 4.9858604681226675e-06, + "loss": 2.4143, + "step": 5858 + }, + { + "epoch": 0.31432403433476397, + "grad_norm": 0.416015625, + "learning_rate": 4.985851239849427e-06, + "loss": 2.4208, + "step": 5859 + }, + { + "epoch": 0.3143776824034335, + "grad_norm": 0.419921875, + "learning_rate": 4.985842008574264e-06, + "loss": 2.3453, + "step": 5860 + }, + { + "epoch": 0.314431330472103, + "grad_norm": 0.44140625, + "learning_rate": 4.985832774297189e-06, + "loss": 2.2741, + "step": 5861 + }, + { + "epoch": 0.3144849785407725, + "grad_norm": 0.427734375, + "learning_rate": 4.985823537018213e-06, + "loss": 2.2315, + "step": 5862 + }, + { + "epoch": 0.31453862660944204, + "grad_norm": 0.4140625, + "learning_rate": 4.985814296737347e-06, + "loss": 2.1982, + "step": 5863 + }, + { + "epoch": 0.31459227467811157, + "grad_norm": 0.34765625, + "learning_rate": 4.985805053454602e-06, + "loss": 2.2574, + "step": 5864 + }, + { + "epoch": 0.3146459227467811, + "grad_norm": 0.35546875, + "learning_rate": 4.9857958071699905e-06, + "loss": 2.1686, + "step": 5865 + }, + { + "epoch": 0.31469957081545064, + "grad_norm": 0.41015625, + "learning_rate": 4.9857865578835215e-06, + "loss": 1.9984, + "step": 5866 + }, + { + "epoch": 0.31475321888412017, + "grad_norm": 0.8125, + "learning_rate": 4.985777305595208e-06, + "loss": 2.1048, + "step": 5867 + }, + { + "epoch": 0.3148068669527897, + "grad_norm": 0.384765625, + "learning_rate": 4.98576805030506e-06, + "loss": 2.1385, + "step": 5868 + }, + { + "epoch": 0.31486051502145923, + "grad_norm": 0.388671875, + "learning_rate": 4.98575879201309e-06, + "loss": 2.1769, + "step": 5869 + }, + { + "epoch": 0.31491416309012876, + "grad_norm": 0.3515625, + "learning_rate": 4.9857495307193074e-06, + "loss": 2.227, + "step": 5870 + }, + { + "epoch": 0.3149678111587983, + "grad_norm": 0.5, + "learning_rate": 4.985740266423725e-06, + "loss": 2.3731, + "step": 5871 + }, + { + "epoch": 0.3150214592274678, + "grad_norm": 0.69140625, + "learning_rate": 4.9857309991263535e-06, + "loss": 2.2409, + "step": 5872 + }, + { + "epoch": 0.31507510729613736, + "grad_norm": 0.396484375, + "learning_rate": 4.9857217288272035e-06, + "loss": 2.1506, + "step": 5873 + }, + { + "epoch": 0.3151287553648069, + "grad_norm": 0.439453125, + "learning_rate": 4.985712455526287e-06, + "loss": 2.2465, + "step": 5874 + }, + { + "epoch": 0.3151824034334764, + "grad_norm": 0.37890625, + "learning_rate": 4.985703179223614e-06, + "loss": 2.4254, + "step": 5875 + }, + { + "epoch": 0.3152360515021459, + "grad_norm": 0.458984375, + "learning_rate": 4.985693899919197e-06, + "loss": 2.3782, + "step": 5876 + }, + { + "epoch": 0.31528969957081543, + "grad_norm": 0.419921875, + "learning_rate": 4.985684617613048e-06, + "loss": 2.4533, + "step": 5877 + }, + { + "epoch": 0.31534334763948496, + "grad_norm": 0.384765625, + "learning_rate": 4.985675332305175e-06, + "loss": 2.4882, + "step": 5878 + }, + { + "epoch": 0.3153969957081545, + "grad_norm": 0.470703125, + "learning_rate": 4.985666043995592e-06, + "loss": 1.9962, + "step": 5879 + }, + { + "epoch": 0.315450643776824, + "grad_norm": 0.53515625, + "learning_rate": 4.985656752684309e-06, + "loss": 2.2772, + "step": 5880 + }, + { + "epoch": 0.31550429184549356, + "grad_norm": 0.4921875, + "learning_rate": 4.9856474583713386e-06, + "loss": 2.4673, + "step": 5881 + }, + { + "epoch": 0.3155579399141631, + "grad_norm": 0.36328125, + "learning_rate": 4.98563816105669e-06, + "loss": 1.9739, + "step": 5882 + }, + { + "epoch": 0.3156115879828326, + "grad_norm": 0.58203125, + "learning_rate": 4.985628860740376e-06, + "loss": 2.3903, + "step": 5883 + }, + { + "epoch": 0.31566523605150215, + "grad_norm": 0.361328125, + "learning_rate": 4.985619557422407e-06, + "loss": 2.1813, + "step": 5884 + }, + { + "epoch": 0.3157188841201717, + "grad_norm": 0.494140625, + "learning_rate": 4.985610251102794e-06, + "loss": 2.4789, + "step": 5885 + }, + { + "epoch": 0.3157725321888412, + "grad_norm": 0.40234375, + "learning_rate": 4.98560094178155e-06, + "loss": 2.4919, + "step": 5886 + }, + { + "epoch": 0.31582618025751075, + "grad_norm": 0.41015625, + "learning_rate": 4.985591629458684e-06, + "loss": 2.2212, + "step": 5887 + }, + { + "epoch": 0.3158798283261803, + "grad_norm": 0.470703125, + "learning_rate": 4.985582314134208e-06, + "loss": 2.6849, + "step": 5888 + }, + { + "epoch": 0.3159334763948498, + "grad_norm": 0.39453125, + "learning_rate": 4.985572995808134e-06, + "loss": 2.4404, + "step": 5889 + }, + { + "epoch": 0.3159871244635193, + "grad_norm": 0.32421875, + "learning_rate": 4.985563674480472e-06, + "loss": 1.8986, + "step": 5890 + }, + { + "epoch": 0.3160407725321888, + "grad_norm": 0.36328125, + "learning_rate": 4.985554350151236e-06, + "loss": 2.2955, + "step": 5891 + }, + { + "epoch": 0.31609442060085835, + "grad_norm": 0.455078125, + "learning_rate": 4.985545022820434e-06, + "loss": 2.3059, + "step": 5892 + }, + { + "epoch": 0.3161480686695279, + "grad_norm": 0.35546875, + "learning_rate": 4.9855356924880784e-06, + "loss": 2.2958, + "step": 5893 + }, + { + "epoch": 0.3162017167381974, + "grad_norm": 0.388671875, + "learning_rate": 4.98552635915418e-06, + "loss": 2.2453, + "step": 5894 + }, + { + "epoch": 0.31625536480686695, + "grad_norm": 0.392578125, + "learning_rate": 4.9855170228187514e-06, + "loss": 2.1983, + "step": 5895 + }, + { + "epoch": 0.3163090128755365, + "grad_norm": 0.46484375, + "learning_rate": 4.9855076834818025e-06, + "loss": 2.2015, + "step": 5896 + }, + { + "epoch": 0.316362660944206, + "grad_norm": 0.3515625, + "learning_rate": 4.9854983411433465e-06, + "loss": 2.2031, + "step": 5897 + }, + { + "epoch": 0.31641630901287554, + "grad_norm": 0.369140625, + "learning_rate": 4.985488995803393e-06, + "loss": 2.3875, + "step": 5898 + }, + { + "epoch": 0.3164699570815451, + "grad_norm": 0.37109375, + "learning_rate": 4.985479647461953e-06, + "loss": 2.4267, + "step": 5899 + }, + { + "epoch": 0.3165236051502146, + "grad_norm": 0.349609375, + "learning_rate": 4.985470296119038e-06, + "loss": 2.2806, + "step": 5900 + }, + { + "epoch": 0.31657725321888414, + "grad_norm": 0.478515625, + "learning_rate": 4.985460941774661e-06, + "loss": 2.4234, + "step": 5901 + }, + { + "epoch": 0.31663090128755367, + "grad_norm": 0.361328125, + "learning_rate": 4.985451584428832e-06, + "loss": 2.2998, + "step": 5902 + }, + { + "epoch": 0.3166845493562232, + "grad_norm": 0.33203125, + "learning_rate": 4.985442224081561e-06, + "loss": 2.1831, + "step": 5903 + }, + { + "epoch": 0.31673819742489273, + "grad_norm": 0.5546875, + "learning_rate": 4.985432860732862e-06, + "loss": 2.5179, + "step": 5904 + }, + { + "epoch": 0.3167918454935622, + "grad_norm": 0.384765625, + "learning_rate": 4.985423494382745e-06, + "loss": 2.6583, + "step": 5905 + }, + { + "epoch": 0.31684549356223174, + "grad_norm": 0.43359375, + "learning_rate": 4.985414125031221e-06, + "loss": 2.3367, + "step": 5906 + }, + { + "epoch": 0.31689914163090127, + "grad_norm": 0.50390625, + "learning_rate": 4.9854047526783015e-06, + "loss": 2.6607, + "step": 5907 + }, + { + "epoch": 0.3169527896995708, + "grad_norm": 0.400390625, + "learning_rate": 4.985395377323997e-06, + "loss": 2.3131, + "step": 5908 + }, + { + "epoch": 0.31700643776824033, + "grad_norm": 0.359375, + "learning_rate": 4.98538599896832e-06, + "loss": 2.2995, + "step": 5909 + }, + { + "epoch": 0.31706008583690987, + "grad_norm": 0.380859375, + "learning_rate": 4.9853766176112824e-06, + "loss": 2.2502, + "step": 5910 + }, + { + "epoch": 0.3171137339055794, + "grad_norm": 0.4296875, + "learning_rate": 4.985367233252895e-06, + "loss": 2.1793, + "step": 5911 + }, + { + "epoch": 0.31716738197424893, + "grad_norm": 0.498046875, + "learning_rate": 4.985357845893168e-06, + "loss": 2.4294, + "step": 5912 + }, + { + "epoch": 0.31722103004291846, + "grad_norm": 0.48828125, + "learning_rate": 4.985348455532114e-06, + "loss": 2.4707, + "step": 5913 + }, + { + "epoch": 0.317274678111588, + "grad_norm": 0.36328125, + "learning_rate": 4.985339062169744e-06, + "loss": 2.7433, + "step": 5914 + }, + { + "epoch": 0.3173283261802575, + "grad_norm": 0.427734375, + "learning_rate": 4.985329665806069e-06, + "loss": 2.3745, + "step": 5915 + }, + { + "epoch": 0.31738197424892706, + "grad_norm": 0.4609375, + "learning_rate": 4.9853202664411e-06, + "loss": 2.2647, + "step": 5916 + }, + { + "epoch": 0.3174356223175966, + "grad_norm": 0.45703125, + "learning_rate": 4.98531086407485e-06, + "loss": 2.3731, + "step": 5917 + }, + { + "epoch": 0.3174892703862661, + "grad_norm": 0.890625, + "learning_rate": 4.985301458707328e-06, + "loss": 1.9234, + "step": 5918 + }, + { + "epoch": 0.3175429184549356, + "grad_norm": 0.466796875, + "learning_rate": 4.985292050338547e-06, + "loss": 2.5201, + "step": 5919 + }, + { + "epoch": 0.31759656652360513, + "grad_norm": 0.384765625, + "learning_rate": 4.985282638968518e-06, + "loss": 2.0289, + "step": 5920 + }, + { + "epoch": 0.31765021459227466, + "grad_norm": 0.337890625, + "learning_rate": 4.985273224597254e-06, + "loss": 2.1281, + "step": 5921 + }, + { + "epoch": 0.3177038626609442, + "grad_norm": 0.36328125, + "learning_rate": 4.985263807224763e-06, + "loss": 2.1489, + "step": 5922 + }, + { + "epoch": 0.3177575107296137, + "grad_norm": 0.416015625, + "learning_rate": 4.985254386851059e-06, + "loss": 2.4584, + "step": 5923 + }, + { + "epoch": 0.31781115879828326, + "grad_norm": 0.37890625, + "learning_rate": 4.985244963476151e-06, + "loss": 2.4212, + "step": 5924 + }, + { + "epoch": 0.3178648068669528, + "grad_norm": 0.439453125, + "learning_rate": 4.985235537100053e-06, + "loss": 2.359, + "step": 5925 + }, + { + "epoch": 0.3179184549356223, + "grad_norm": 0.41015625, + "learning_rate": 4.985226107722775e-06, + "loss": 2.4357, + "step": 5926 + }, + { + "epoch": 0.31797210300429185, + "grad_norm": 0.439453125, + "learning_rate": 4.985216675344329e-06, + "loss": 2.1975, + "step": 5927 + }, + { + "epoch": 0.3180257510729614, + "grad_norm": 0.86328125, + "learning_rate": 4.9852072399647255e-06, + "loss": 2.319, + "step": 5928 + }, + { + "epoch": 0.3180793991416309, + "grad_norm": 0.439453125, + "learning_rate": 4.985197801583978e-06, + "loss": 2.6718, + "step": 5929 + }, + { + "epoch": 0.31813304721030045, + "grad_norm": 0.404296875, + "learning_rate": 4.985188360202095e-06, + "loss": 2.3908, + "step": 5930 + }, + { + "epoch": 0.31818669527897, + "grad_norm": 0.390625, + "learning_rate": 4.9851789158190885e-06, + "loss": 2.323, + "step": 5931 + }, + { + "epoch": 0.3182403433476395, + "grad_norm": 0.3203125, + "learning_rate": 4.985169468434972e-06, + "loss": 2.268, + "step": 5932 + }, + { + "epoch": 0.318293991416309, + "grad_norm": 0.3125, + "learning_rate": 4.9851600180497545e-06, + "loss": 2.2567, + "step": 5933 + }, + { + "epoch": 0.3183476394849785, + "grad_norm": 0.392578125, + "learning_rate": 4.985150564663449e-06, + "loss": 2.5615, + "step": 5934 + }, + { + "epoch": 0.31840128755364805, + "grad_norm": 10.875, + "learning_rate": 4.985141108276066e-06, + "loss": 2.4442, + "step": 5935 + }, + { + "epoch": 0.3184549356223176, + "grad_norm": 0.404296875, + "learning_rate": 4.985131648887618e-06, + "loss": 2.1381, + "step": 5936 + }, + { + "epoch": 0.3185085836909871, + "grad_norm": 0.40625, + "learning_rate": 4.985122186498114e-06, + "loss": 2.345, + "step": 5937 + }, + { + "epoch": 0.31856223175965664, + "grad_norm": 0.38671875, + "learning_rate": 4.985112721107569e-06, + "loss": 2.2735, + "step": 5938 + }, + { + "epoch": 0.3186158798283262, + "grad_norm": 0.369140625, + "learning_rate": 4.985103252715993e-06, + "loss": 2.0373, + "step": 5939 + }, + { + "epoch": 0.3186695278969957, + "grad_norm": 0.66796875, + "learning_rate": 4.985093781323395e-06, + "loss": 2.1862, + "step": 5940 + }, + { + "epoch": 0.31872317596566524, + "grad_norm": 0.31640625, + "learning_rate": 4.985084306929789e-06, + "loss": 2.1257, + "step": 5941 + }, + { + "epoch": 0.31877682403433477, + "grad_norm": 0.400390625, + "learning_rate": 4.985074829535187e-06, + "loss": 2.5643, + "step": 5942 + }, + { + "epoch": 0.3188304721030043, + "grad_norm": 0.4296875, + "learning_rate": 4.985065349139598e-06, + "loss": 2.4467, + "step": 5943 + }, + { + "epoch": 0.31888412017167383, + "grad_norm": 0.3828125, + "learning_rate": 4.985055865743036e-06, + "loss": 2.2425, + "step": 5944 + }, + { + "epoch": 0.31893776824034337, + "grad_norm": 0.35546875, + "learning_rate": 4.985046379345511e-06, + "loss": 2.1112, + "step": 5945 + }, + { + "epoch": 0.3189914163090129, + "grad_norm": 0.349609375, + "learning_rate": 4.985036889947034e-06, + "loss": 1.9154, + "step": 5946 + }, + { + "epoch": 0.31904506437768243, + "grad_norm": 0.39453125, + "learning_rate": 4.985027397547617e-06, + "loss": 2.2502, + "step": 5947 + }, + { + "epoch": 0.3190987124463519, + "grad_norm": 0.53125, + "learning_rate": 4.985017902147272e-06, + "loss": 2.1845, + "step": 5948 + }, + { + "epoch": 0.31915236051502144, + "grad_norm": 0.34765625, + "learning_rate": 4.98500840374601e-06, + "loss": 2.194, + "step": 5949 + }, + { + "epoch": 0.31920600858369097, + "grad_norm": 0.35546875, + "learning_rate": 4.984998902343843e-06, + "loss": 2.4836, + "step": 5950 + }, + { + "epoch": 0.3192596566523605, + "grad_norm": 0.373046875, + "learning_rate": 4.984989397940782e-06, + "loss": 2.2695, + "step": 5951 + }, + { + "epoch": 0.31931330472103003, + "grad_norm": 0.474609375, + "learning_rate": 4.984979890536837e-06, + "loss": 2.2573, + "step": 5952 + }, + { + "epoch": 0.31936695278969957, + "grad_norm": 0.353515625, + "learning_rate": 4.984970380132022e-06, + "loss": 2.0091, + "step": 5953 + }, + { + "epoch": 0.3194206008583691, + "grad_norm": 6.15625, + "learning_rate": 4.984960866726348e-06, + "loss": 2.5704, + "step": 5954 + }, + { + "epoch": 0.31947424892703863, + "grad_norm": 0.3984375, + "learning_rate": 4.984951350319826e-06, + "loss": 2.5705, + "step": 5955 + }, + { + "epoch": 0.31952789699570816, + "grad_norm": 0.361328125, + "learning_rate": 4.984941830912466e-06, + "loss": 2.1979, + "step": 5956 + }, + { + "epoch": 0.3195815450643777, + "grad_norm": 0.33984375, + "learning_rate": 4.984932308504282e-06, + "loss": 1.7189, + "step": 5957 + }, + { + "epoch": 0.3196351931330472, + "grad_norm": 0.345703125, + "learning_rate": 4.984922783095285e-06, + "loss": 2.474, + "step": 5958 + }, + { + "epoch": 0.31968884120171676, + "grad_norm": 0.349609375, + "learning_rate": 4.984913254685485e-06, + "loss": 2.1911, + "step": 5959 + }, + { + "epoch": 0.3197424892703863, + "grad_norm": 0.46875, + "learning_rate": 4.984903723274895e-06, + "loss": 2.2292, + "step": 5960 + }, + { + "epoch": 0.3197961373390558, + "grad_norm": 0.40625, + "learning_rate": 4.984894188863525e-06, + "loss": 2.3043, + "step": 5961 + }, + { + "epoch": 0.3198497854077253, + "grad_norm": 0.40625, + "learning_rate": 4.984884651451388e-06, + "loss": 2.383, + "step": 5962 + }, + { + "epoch": 0.3199034334763948, + "grad_norm": 0.376953125, + "learning_rate": 4.9848751110384955e-06, + "loss": 2.3778, + "step": 5963 + }, + { + "epoch": 0.31995708154506436, + "grad_norm": 0.412109375, + "learning_rate": 4.984865567624858e-06, + "loss": 2.6953, + "step": 5964 + }, + { + "epoch": 0.3200107296137339, + "grad_norm": 0.4140625, + "learning_rate": 4.984856021210488e-06, + "loss": 2.3595, + "step": 5965 + }, + { + "epoch": 0.3200643776824034, + "grad_norm": 0.412109375, + "learning_rate": 4.984846471795396e-06, + "loss": 2.1581, + "step": 5966 + }, + { + "epoch": 0.32011802575107295, + "grad_norm": 0.353515625, + "learning_rate": 4.984836919379594e-06, + "loss": 2.3195, + "step": 5967 + }, + { + "epoch": 0.3201716738197425, + "grad_norm": 0.37109375, + "learning_rate": 4.984827363963094e-06, + "loss": 2.5598, + "step": 5968 + }, + { + "epoch": 0.320225321888412, + "grad_norm": 0.359375, + "learning_rate": 4.984817805545907e-06, + "loss": 2.4331, + "step": 5969 + }, + { + "epoch": 0.32027896995708155, + "grad_norm": 0.484375, + "learning_rate": 4.984808244128044e-06, + "loss": 2.5594, + "step": 5970 + }, + { + "epoch": 0.3203326180257511, + "grad_norm": 0.380859375, + "learning_rate": 4.984798679709519e-06, + "loss": 2.3611, + "step": 5971 + }, + { + "epoch": 0.3203862660944206, + "grad_norm": 0.482421875, + "learning_rate": 4.9847891122903405e-06, + "loss": 2.4124, + "step": 5972 + }, + { + "epoch": 0.32043991416309014, + "grad_norm": 0.42578125, + "learning_rate": 4.9847795418705215e-06, + "loss": 1.8526, + "step": 5973 + }, + { + "epoch": 0.3204935622317597, + "grad_norm": 0.3359375, + "learning_rate": 4.984769968450074e-06, + "loss": 2.6496, + "step": 5974 + }, + { + "epoch": 0.3205472103004292, + "grad_norm": 0.498046875, + "learning_rate": 4.984760392029008e-06, + "loss": 2.3913, + "step": 5975 + }, + { + "epoch": 0.32060085836909874, + "grad_norm": 0.400390625, + "learning_rate": 4.984750812607337e-06, + "loss": 2.4017, + "step": 5976 + }, + { + "epoch": 0.3206545064377682, + "grad_norm": 0.55078125, + "learning_rate": 4.984741230185071e-06, + "loss": 2.3261, + "step": 5977 + }, + { + "epoch": 0.32070815450643775, + "grad_norm": 0.421875, + "learning_rate": 4.984731644762222e-06, + "loss": 2.2891, + "step": 5978 + }, + { + "epoch": 0.3207618025751073, + "grad_norm": 0.4296875, + "learning_rate": 4.9847220563388024e-06, + "loss": 1.9481, + "step": 5979 + }, + { + "epoch": 0.3208154506437768, + "grad_norm": 0.4140625, + "learning_rate": 4.984712464914823e-06, + "loss": 2.2687, + "step": 5980 + }, + { + "epoch": 0.32086909871244634, + "grad_norm": 0.380859375, + "learning_rate": 4.984702870490295e-06, + "loss": 2.2678, + "step": 5981 + }, + { + "epoch": 0.3209227467811159, + "grad_norm": 0.451171875, + "learning_rate": 4.984693273065231e-06, + "loss": 1.5317, + "step": 5982 + }, + { + "epoch": 0.3209763948497854, + "grad_norm": 0.369140625, + "learning_rate": 4.984683672639642e-06, + "loss": 2.3022, + "step": 5983 + }, + { + "epoch": 0.32103004291845494, + "grad_norm": 0.38671875, + "learning_rate": 4.98467406921354e-06, + "loss": 2.1668, + "step": 5984 + }, + { + "epoch": 0.32108369098712447, + "grad_norm": 0.375, + "learning_rate": 4.984664462786936e-06, + "loss": 2.2191, + "step": 5985 + }, + { + "epoch": 0.321137339055794, + "grad_norm": 0.427734375, + "learning_rate": 4.984654853359842e-06, + "loss": 2.0474, + "step": 5986 + }, + { + "epoch": 0.32119098712446353, + "grad_norm": 0.640625, + "learning_rate": 4.9846452409322685e-06, + "loss": 2.4366, + "step": 5987 + }, + { + "epoch": 0.32124463519313307, + "grad_norm": 0.32421875, + "learning_rate": 4.984635625504229e-06, + "loss": 2.1771, + "step": 5988 + }, + { + "epoch": 0.3212982832618026, + "grad_norm": 0.392578125, + "learning_rate": 4.984626007075735e-06, + "loss": 2.4282, + "step": 5989 + }, + { + "epoch": 0.32135193133047213, + "grad_norm": 0.3359375, + "learning_rate": 4.984616385646795e-06, + "loss": 2.2045, + "step": 5990 + }, + { + "epoch": 0.3214055793991416, + "grad_norm": 0.6171875, + "learning_rate": 4.984606761217426e-06, + "loss": 1.5082, + "step": 5991 + }, + { + "epoch": 0.32145922746781114, + "grad_norm": 0.3671875, + "learning_rate": 4.984597133787634e-06, + "loss": 2.3246, + "step": 5992 + }, + { + "epoch": 0.32151287553648067, + "grad_norm": 0.5546875, + "learning_rate": 4.984587503357434e-06, + "loss": 2.3205, + "step": 5993 + }, + { + "epoch": 0.3215665236051502, + "grad_norm": 0.404296875, + "learning_rate": 4.984577869926837e-06, + "loss": 2.4334, + "step": 5994 + }, + { + "epoch": 0.32162017167381973, + "grad_norm": 0.40625, + "learning_rate": 4.984568233495855e-06, + "loss": 2.4818, + "step": 5995 + }, + { + "epoch": 0.32167381974248926, + "grad_norm": 0.453125, + "learning_rate": 4.984558594064498e-06, + "loss": 2.367, + "step": 5996 + }, + { + "epoch": 0.3217274678111588, + "grad_norm": 0.369140625, + "learning_rate": 4.984548951632779e-06, + "loss": 2.3455, + "step": 5997 + }, + { + "epoch": 0.3217811158798283, + "grad_norm": 7.6875, + "learning_rate": 4.98453930620071e-06, + "loss": 2.3099, + "step": 5998 + }, + { + "epoch": 0.32183476394849786, + "grad_norm": 0.6328125, + "learning_rate": 4.984529657768301e-06, + "loss": 2.4125, + "step": 5999 + }, + { + "epoch": 0.3218884120171674, + "grad_norm": 0.455078125, + "learning_rate": 4.984520006335566e-06, + "loss": 2.3744, + "step": 6000 + }, + { + "epoch": 0.3219420600858369, + "grad_norm": 0.4375, + "learning_rate": 4.984510351902514e-06, + "loss": 1.9916, + "step": 6001 + }, + { + "epoch": 0.32199570815450645, + "grad_norm": 0.388671875, + "learning_rate": 4.9845006944691585e-06, + "loss": 2.2674, + "step": 6002 + }, + { + "epoch": 0.322049356223176, + "grad_norm": 0.353515625, + "learning_rate": 4.984491034035512e-06, + "loss": 2.2065, + "step": 6003 + }, + { + "epoch": 0.3221030042918455, + "grad_norm": 0.390625, + "learning_rate": 4.9844813706015826e-06, + "loss": 2.3398, + "step": 6004 + }, + { + "epoch": 0.322156652360515, + "grad_norm": 0.482421875, + "learning_rate": 4.984471704167385e-06, + "loss": 1.772, + "step": 6005 + }, + { + "epoch": 0.3222103004291845, + "grad_norm": 0.380859375, + "learning_rate": 4.984462034732931e-06, + "loss": 2.2829, + "step": 6006 + }, + { + "epoch": 0.32226394849785406, + "grad_norm": 0.458984375, + "learning_rate": 4.98445236229823e-06, + "loss": 1.3883, + "step": 6007 + }, + { + "epoch": 0.3223175965665236, + "grad_norm": 0.4453125, + "learning_rate": 4.984442686863295e-06, + "loss": 2.0986, + "step": 6008 + }, + { + "epoch": 0.3223712446351931, + "grad_norm": 0.33984375, + "learning_rate": 4.984433008428139e-06, + "loss": 2.2329, + "step": 6009 + }, + { + "epoch": 0.32242489270386265, + "grad_norm": 0.388671875, + "learning_rate": 4.984423326992772e-06, + "loss": 2.2425, + "step": 6010 + }, + { + "epoch": 0.3224785407725322, + "grad_norm": 0.35546875, + "learning_rate": 4.984413642557206e-06, + "loss": 1.8316, + "step": 6011 + }, + { + "epoch": 0.3225321888412017, + "grad_norm": 0.376953125, + "learning_rate": 4.984403955121452e-06, + "loss": 2.4854, + "step": 6012 + }, + { + "epoch": 0.32258583690987125, + "grad_norm": 0.40234375, + "learning_rate": 4.984394264685523e-06, + "loss": 2.3785, + "step": 6013 + }, + { + "epoch": 0.3226394849785408, + "grad_norm": 0.361328125, + "learning_rate": 4.98438457124943e-06, + "loss": 2.3305, + "step": 6014 + }, + { + "epoch": 0.3226931330472103, + "grad_norm": 0.5, + "learning_rate": 4.984374874813185e-06, + "loss": 2.3636, + "step": 6015 + }, + { + "epoch": 0.32274678111587984, + "grad_norm": 0.322265625, + "learning_rate": 4.9843651753768e-06, + "loss": 2.2276, + "step": 6016 + }, + { + "epoch": 0.3228004291845494, + "grad_norm": 0.380859375, + "learning_rate": 4.984355472940286e-06, + "loss": 2.1146, + "step": 6017 + }, + { + "epoch": 0.3228540772532189, + "grad_norm": 0.54296875, + "learning_rate": 4.984345767503655e-06, + "loss": 2.3963, + "step": 6018 + }, + { + "epoch": 0.32290772532188844, + "grad_norm": 0.357421875, + "learning_rate": 4.9843360590669185e-06, + "loss": 2.2224, + "step": 6019 + }, + { + "epoch": 0.3229613733905579, + "grad_norm": 0.376953125, + "learning_rate": 4.984326347630088e-06, + "loss": 2.2751, + "step": 6020 + }, + { + "epoch": 0.32301502145922745, + "grad_norm": 0.466796875, + "learning_rate": 4.984316633193177e-06, + "loss": 2.3095, + "step": 6021 + }, + { + "epoch": 0.323068669527897, + "grad_norm": 0.40234375, + "learning_rate": 4.984306915756195e-06, + "loss": 2.1897, + "step": 6022 + }, + { + "epoch": 0.3231223175965665, + "grad_norm": 0.66015625, + "learning_rate": 4.984297195319155e-06, + "loss": 2.3123, + "step": 6023 + }, + { + "epoch": 0.32317596566523604, + "grad_norm": 0.388671875, + "learning_rate": 4.984287471882069e-06, + "loss": 2.4307, + "step": 6024 + }, + { + "epoch": 0.3232296137339056, + "grad_norm": 0.470703125, + "learning_rate": 4.984277745444946e-06, + "loss": 2.1745, + "step": 6025 + }, + { + "epoch": 0.3232832618025751, + "grad_norm": 0.53515625, + "learning_rate": 4.984268016007802e-06, + "loss": 2.1012, + "step": 6026 + }, + { + "epoch": 0.32333690987124464, + "grad_norm": 0.341796875, + "learning_rate": 4.984258283570646e-06, + "loss": 2.1699, + "step": 6027 + }, + { + "epoch": 0.32339055793991417, + "grad_norm": 0.3828125, + "learning_rate": 4.984248548133491e-06, + "loss": 2.3404, + "step": 6028 + }, + { + "epoch": 0.3234442060085837, + "grad_norm": 0.3984375, + "learning_rate": 4.984238809696347e-06, + "loss": 2.1974, + "step": 6029 + }, + { + "epoch": 0.32349785407725323, + "grad_norm": 0.365234375, + "learning_rate": 4.984229068259227e-06, + "loss": 2.1098, + "step": 6030 + }, + { + "epoch": 0.32355150214592276, + "grad_norm": 0.78125, + "learning_rate": 4.984219323822143e-06, + "loss": 2.3628, + "step": 6031 + }, + { + "epoch": 0.3236051502145923, + "grad_norm": 0.578125, + "learning_rate": 4.984209576385107e-06, + "loss": 2.1348, + "step": 6032 + }, + { + "epoch": 0.32365879828326183, + "grad_norm": 0.369140625, + "learning_rate": 4.984199825948129e-06, + "loss": 1.8845, + "step": 6033 + }, + { + "epoch": 0.3237124463519313, + "grad_norm": 0.3828125, + "learning_rate": 4.984190072511223e-06, + "loss": 1.9613, + "step": 6034 + }, + { + "epoch": 0.32376609442060084, + "grad_norm": 0.34375, + "learning_rate": 4.9841803160744e-06, + "loss": 2.4983, + "step": 6035 + }, + { + "epoch": 0.32381974248927037, + "grad_norm": 0.494140625, + "learning_rate": 4.984170556637671e-06, + "loss": 2.2626, + "step": 6036 + }, + { + "epoch": 0.3238733905579399, + "grad_norm": 0.54296875, + "learning_rate": 4.984160794201049e-06, + "loss": 2.3193, + "step": 6037 + }, + { + "epoch": 0.32392703862660943, + "grad_norm": 0.42578125, + "learning_rate": 4.984151028764544e-06, + "loss": 2.441, + "step": 6038 + }, + { + "epoch": 0.32398068669527896, + "grad_norm": 0.359375, + "learning_rate": 4.984141260328169e-06, + "loss": 2.2419, + "step": 6039 + }, + { + "epoch": 0.3240343347639485, + "grad_norm": 0.462890625, + "learning_rate": 4.984131488891936e-06, + "loss": 2.5593, + "step": 6040 + }, + { + "epoch": 0.324087982832618, + "grad_norm": 0.7890625, + "learning_rate": 4.984121714455857e-06, + "loss": 2.2476, + "step": 6041 + }, + { + "epoch": 0.32414163090128756, + "grad_norm": 0.38671875, + "learning_rate": 4.984111937019943e-06, + "loss": 2.0232, + "step": 6042 + }, + { + "epoch": 0.3241952789699571, + "grad_norm": 0.470703125, + "learning_rate": 4.9841021565842054e-06, + "loss": 2.4563, + "step": 6043 + }, + { + "epoch": 0.3242489270386266, + "grad_norm": 0.419921875, + "learning_rate": 4.984092373148658e-06, + "loss": 2.2959, + "step": 6044 + }, + { + "epoch": 0.32430257510729615, + "grad_norm": 0.462890625, + "learning_rate": 4.984082586713311e-06, + "loss": 2.5052, + "step": 6045 + }, + { + "epoch": 0.3243562231759657, + "grad_norm": 0.408203125, + "learning_rate": 4.9840727972781755e-06, + "loss": 2.4114, + "step": 6046 + }, + { + "epoch": 0.3244098712446352, + "grad_norm": 0.400390625, + "learning_rate": 4.984063004843265e-06, + "loss": 2.2378, + "step": 6047 + }, + { + "epoch": 0.3244635193133047, + "grad_norm": 0.4453125, + "learning_rate": 4.984053209408591e-06, + "loss": 2.0535, + "step": 6048 + }, + { + "epoch": 0.3245171673819742, + "grad_norm": 0.37109375, + "learning_rate": 4.984043410974165e-06, + "loss": 2.5913, + "step": 6049 + }, + { + "epoch": 0.32457081545064376, + "grad_norm": 0.38671875, + "learning_rate": 4.984033609539999e-06, + "loss": 2.1787, + "step": 6050 + }, + { + "epoch": 0.3246244635193133, + "grad_norm": 0.3828125, + "learning_rate": 4.984023805106104e-06, + "loss": 2.0544, + "step": 6051 + }, + { + "epoch": 0.3246781115879828, + "grad_norm": 0.37109375, + "learning_rate": 4.984013997672493e-06, + "loss": 2.2416, + "step": 6052 + }, + { + "epoch": 0.32473175965665235, + "grad_norm": 1.0234375, + "learning_rate": 4.984004187239178e-06, + "loss": 2.3876, + "step": 6053 + }, + { + "epoch": 0.3247854077253219, + "grad_norm": 0.373046875, + "learning_rate": 4.983994373806169e-06, + "loss": 2.1747, + "step": 6054 + }, + { + "epoch": 0.3248390557939914, + "grad_norm": 0.38671875, + "learning_rate": 4.9839845573734795e-06, + "loss": 2.2463, + "step": 6055 + }, + { + "epoch": 0.32489270386266095, + "grad_norm": 0.36328125, + "learning_rate": 4.983974737941121e-06, + "loss": 2.2069, + "step": 6056 + }, + { + "epoch": 0.3249463519313305, + "grad_norm": 0.404296875, + "learning_rate": 4.983964915509105e-06, + "loss": 2.2562, + "step": 6057 + }, + { + "epoch": 0.325, + "grad_norm": 0.3515625, + "learning_rate": 4.983955090077445e-06, + "loss": 2.3706, + "step": 6058 + }, + { + "epoch": 0.32505364806866954, + "grad_norm": 0.41796875, + "learning_rate": 4.98394526164615e-06, + "loss": 2.5456, + "step": 6059 + }, + { + "epoch": 0.3251072961373391, + "grad_norm": 0.3984375, + "learning_rate": 4.983935430215234e-06, + "loss": 1.2897, + "step": 6060 + }, + { + "epoch": 0.3251609442060086, + "grad_norm": 0.33984375, + "learning_rate": 4.983925595784709e-06, + "loss": 2.2573, + "step": 6061 + }, + { + "epoch": 0.32521459227467814, + "grad_norm": 0.3828125, + "learning_rate": 4.9839157583545845e-06, + "loss": 2.2692, + "step": 6062 + }, + { + "epoch": 0.3252682403433476, + "grad_norm": 0.68359375, + "learning_rate": 4.983905917924875e-06, + "loss": 2.3191, + "step": 6063 + }, + { + "epoch": 0.32532188841201715, + "grad_norm": 0.3671875, + "learning_rate": 4.983896074495592e-06, + "loss": 2.2904, + "step": 6064 + }, + { + "epoch": 0.3253755364806867, + "grad_norm": 0.431640625, + "learning_rate": 4.983886228066746e-06, + "loss": 2.1929, + "step": 6065 + }, + { + "epoch": 0.3254291845493562, + "grad_norm": 2.140625, + "learning_rate": 4.983876378638349e-06, + "loss": 2.3265, + "step": 6066 + }, + { + "epoch": 0.32548283261802574, + "grad_norm": 0.46484375, + "learning_rate": 4.983866526210415e-06, + "loss": 2.2851, + "step": 6067 + }, + { + "epoch": 0.3255364806866953, + "grad_norm": 0.341796875, + "learning_rate": 4.983856670782954e-06, + "loss": 2.1788, + "step": 6068 + }, + { + "epoch": 0.3255901287553648, + "grad_norm": 0.400390625, + "learning_rate": 4.983846812355978e-06, + "loss": 2.3127, + "step": 6069 + }, + { + "epoch": 0.32564377682403434, + "grad_norm": 0.41796875, + "learning_rate": 4.983836950929499e-06, + "loss": 2.2955, + "step": 6070 + }, + { + "epoch": 0.32569742489270387, + "grad_norm": 0.3828125, + "learning_rate": 4.9838270865035296e-06, + "loss": 1.8971, + "step": 6071 + }, + { + "epoch": 0.3257510729613734, + "grad_norm": 0.380859375, + "learning_rate": 4.983817219078082e-06, + "loss": 2.2845, + "step": 6072 + }, + { + "epoch": 0.32580472103004293, + "grad_norm": 0.39453125, + "learning_rate": 4.983807348653167e-06, + "loss": 2.3261, + "step": 6073 + }, + { + "epoch": 0.32585836909871246, + "grad_norm": 0.361328125, + "learning_rate": 4.983797475228796e-06, + "loss": 2.2779, + "step": 6074 + }, + { + "epoch": 0.325912017167382, + "grad_norm": 0.5703125, + "learning_rate": 4.983787598804983e-06, + "loss": 1.7786, + "step": 6075 + }, + { + "epoch": 0.3259656652360515, + "grad_norm": 0.390625, + "learning_rate": 4.983777719381738e-06, + "loss": 2.3903, + "step": 6076 + }, + { + "epoch": 0.326019313304721, + "grad_norm": 0.474609375, + "learning_rate": 4.9837678369590745e-06, + "loss": 2.2641, + "step": 6077 + }, + { + "epoch": 0.32607296137339054, + "grad_norm": 0.36328125, + "learning_rate": 4.983757951537003e-06, + "loss": 2.1756, + "step": 6078 + }, + { + "epoch": 0.32612660944206007, + "grad_norm": 0.404296875, + "learning_rate": 4.983748063115537e-06, + "loss": 2.0994, + "step": 6079 + }, + { + "epoch": 0.3261802575107296, + "grad_norm": 0.365234375, + "learning_rate": 4.983738171694686e-06, + "loss": 2.3831, + "step": 6080 + }, + { + "epoch": 0.32623390557939913, + "grad_norm": 0.3828125, + "learning_rate": 4.983728277274465e-06, + "loss": 2.1999, + "step": 6081 + }, + { + "epoch": 0.32628755364806866, + "grad_norm": 0.40625, + "learning_rate": 4.983718379854884e-06, + "loss": 2.2356, + "step": 6082 + }, + { + "epoch": 0.3263412017167382, + "grad_norm": 0.40234375, + "learning_rate": 4.9837084794359545e-06, + "loss": 2.2876, + "step": 6083 + }, + { + "epoch": 0.3263948497854077, + "grad_norm": 0.41796875, + "learning_rate": 4.9836985760176904e-06, + "loss": 2.3802, + "step": 6084 + }, + { + "epoch": 0.32644849785407726, + "grad_norm": 0.39453125, + "learning_rate": 4.983688669600102e-06, + "loss": 2.2672, + "step": 6085 + }, + { + "epoch": 0.3265021459227468, + "grad_norm": 0.369140625, + "learning_rate": 4.9836787601832025e-06, + "loss": 2.1317, + "step": 6086 + }, + { + "epoch": 0.3265557939914163, + "grad_norm": 0.42578125, + "learning_rate": 4.983668847767002e-06, + "loss": 2.4496, + "step": 6087 + }, + { + "epoch": 0.32660944206008585, + "grad_norm": 0.361328125, + "learning_rate": 4.983658932351515e-06, + "loss": 2.1165, + "step": 6088 + }, + { + "epoch": 0.3266630901287554, + "grad_norm": 0.35546875, + "learning_rate": 4.983649013936751e-06, + "loss": 2.3683, + "step": 6089 + }, + { + "epoch": 0.3267167381974249, + "grad_norm": 0.474609375, + "learning_rate": 4.983639092522724e-06, + "loss": 2.2887, + "step": 6090 + }, + { + "epoch": 0.32677038626609445, + "grad_norm": 0.375, + "learning_rate": 4.983629168109445e-06, + "loss": 2.1212, + "step": 6091 + }, + { + "epoch": 0.3268240343347639, + "grad_norm": 0.44140625, + "learning_rate": 4.983619240696926e-06, + "loss": 2.4136, + "step": 6092 + }, + { + "epoch": 0.32687768240343346, + "grad_norm": 0.39453125, + "learning_rate": 4.983609310285179e-06, + "loss": 2.6239, + "step": 6093 + }, + { + "epoch": 0.326931330472103, + "grad_norm": 0.353515625, + "learning_rate": 4.983599376874216e-06, + "loss": 2.1012, + "step": 6094 + }, + { + "epoch": 0.3269849785407725, + "grad_norm": 0.380859375, + "learning_rate": 4.983589440464049e-06, + "loss": 2.3796, + "step": 6095 + }, + { + "epoch": 0.32703862660944205, + "grad_norm": 0.451171875, + "learning_rate": 4.983579501054691e-06, + "loss": 2.3716, + "step": 6096 + }, + { + "epoch": 0.3270922746781116, + "grad_norm": 0.64453125, + "learning_rate": 4.983569558646152e-06, + "loss": 2.1501, + "step": 6097 + }, + { + "epoch": 0.3271459227467811, + "grad_norm": 0.330078125, + "learning_rate": 4.983559613238446e-06, + "loss": 2.2381, + "step": 6098 + }, + { + "epoch": 0.32719957081545065, + "grad_norm": 0.376953125, + "learning_rate": 4.983549664831584e-06, + "loss": 2.2368, + "step": 6099 + }, + { + "epoch": 0.3272532188841202, + "grad_norm": 0.421875, + "learning_rate": 4.9835397134255775e-06, + "loss": 2.2678, + "step": 6100 + }, + { + "epoch": 0.3273068669527897, + "grad_norm": 0.341796875, + "learning_rate": 4.983529759020439e-06, + "loss": 2.2287, + "step": 6101 + }, + { + "epoch": 0.32736051502145924, + "grad_norm": 0.388671875, + "learning_rate": 4.983519801616181e-06, + "loss": 2.4386, + "step": 6102 + }, + { + "epoch": 0.3274141630901288, + "grad_norm": 0.37109375, + "learning_rate": 4.983509841212816e-06, + "loss": 2.3142, + "step": 6103 + }, + { + "epoch": 0.3274678111587983, + "grad_norm": 0.376953125, + "learning_rate": 4.983499877810354e-06, + "loss": 2.2367, + "step": 6104 + }, + { + "epoch": 0.32752145922746784, + "grad_norm": 0.37890625, + "learning_rate": 4.983489911408809e-06, + "loss": 2.1676, + "step": 6105 + }, + { + "epoch": 0.3275751072961373, + "grad_norm": 0.3203125, + "learning_rate": 4.983479942008192e-06, + "loss": 2.2163, + "step": 6106 + }, + { + "epoch": 0.32762875536480685, + "grad_norm": 0.400390625, + "learning_rate": 4.983469969608515e-06, + "loss": 2.3757, + "step": 6107 + }, + { + "epoch": 0.3276824034334764, + "grad_norm": 0.400390625, + "learning_rate": 4.983459994209791e-06, + "loss": 2.3252, + "step": 6108 + }, + { + "epoch": 0.3277360515021459, + "grad_norm": 0.462890625, + "learning_rate": 4.9834500158120304e-06, + "loss": 2.3114, + "step": 6109 + }, + { + "epoch": 0.32778969957081544, + "grad_norm": 0.6171875, + "learning_rate": 4.983440034415247e-06, + "loss": 1.291, + "step": 6110 + }, + { + "epoch": 0.32784334763948497, + "grad_norm": 0.4453125, + "learning_rate": 4.983430050019452e-06, + "loss": 2.6579, + "step": 6111 + }, + { + "epoch": 0.3278969957081545, + "grad_norm": 0.373046875, + "learning_rate": 4.983420062624658e-06, + "loss": 2.362, + "step": 6112 + }, + { + "epoch": 0.32795064377682404, + "grad_norm": 0.341796875, + "learning_rate": 4.9834100722308755e-06, + "loss": 2.3397, + "step": 6113 + }, + { + "epoch": 0.32800429184549357, + "grad_norm": 0.359375, + "learning_rate": 4.983400078838119e-06, + "loss": 2.2828, + "step": 6114 + }, + { + "epoch": 0.3280579399141631, + "grad_norm": 0.392578125, + "learning_rate": 4.983390082446398e-06, + "loss": 2.3478, + "step": 6115 + }, + { + "epoch": 0.32811158798283263, + "grad_norm": 0.37890625, + "learning_rate": 4.983380083055727e-06, + "loss": 2.3079, + "step": 6116 + }, + { + "epoch": 0.32816523605150216, + "grad_norm": 0.353515625, + "learning_rate": 4.983370080666116e-06, + "loss": 2.1957, + "step": 6117 + }, + { + "epoch": 0.3282188841201717, + "grad_norm": 0.294921875, + "learning_rate": 4.983360075277578e-06, + "loss": 1.9925, + "step": 6118 + }, + { + "epoch": 0.3282725321888412, + "grad_norm": 0.3828125, + "learning_rate": 4.983350066890126e-06, + "loss": 2.2208, + "step": 6119 + }, + { + "epoch": 0.3283261802575107, + "grad_norm": 0.42578125, + "learning_rate": 4.983340055503771e-06, + "loss": 2.2065, + "step": 6120 + }, + { + "epoch": 0.32837982832618023, + "grad_norm": 0.44921875, + "learning_rate": 4.983330041118524e-06, + "loss": 2.3579, + "step": 6121 + }, + { + "epoch": 0.32843347639484977, + "grad_norm": 0.412109375, + "learning_rate": 4.983320023734399e-06, + "loss": 2.3813, + "step": 6122 + }, + { + "epoch": 0.3284871244635193, + "grad_norm": 0.392578125, + "learning_rate": 4.983310003351408e-06, + "loss": 2.3359, + "step": 6123 + }, + { + "epoch": 0.32854077253218883, + "grad_norm": 0.45703125, + "learning_rate": 4.983299979969562e-06, + "loss": 2.4784, + "step": 6124 + }, + { + "epoch": 0.32859442060085836, + "grad_norm": 0.42578125, + "learning_rate": 4.983289953588873e-06, + "loss": 2.3639, + "step": 6125 + }, + { + "epoch": 0.3286480686695279, + "grad_norm": 0.46875, + "learning_rate": 4.983279924209356e-06, + "loss": 2.4796, + "step": 6126 + }, + { + "epoch": 0.3287017167381974, + "grad_norm": 0.443359375, + "learning_rate": 4.9832698918310185e-06, + "loss": 2.3873, + "step": 6127 + }, + { + "epoch": 0.32875536480686696, + "grad_norm": 0.498046875, + "learning_rate": 4.983259856453876e-06, + "loss": 1.9846, + "step": 6128 + }, + { + "epoch": 0.3288090128755365, + "grad_norm": 0.37109375, + "learning_rate": 4.983249818077939e-06, + "loss": 2.3693, + "step": 6129 + }, + { + "epoch": 0.328862660944206, + "grad_norm": 0.447265625, + "learning_rate": 4.983239776703221e-06, + "loss": 2.2468, + "step": 6130 + }, + { + "epoch": 0.32891630901287555, + "grad_norm": 0.46484375, + "learning_rate": 4.983229732329733e-06, + "loss": 2.4915, + "step": 6131 + }, + { + "epoch": 0.3289699570815451, + "grad_norm": 0.423828125, + "learning_rate": 4.9832196849574876e-06, + "loss": 2.3282, + "step": 6132 + }, + { + "epoch": 0.3290236051502146, + "grad_norm": 0.455078125, + "learning_rate": 4.983209634586497e-06, + "loss": 2.674, + "step": 6133 + }, + { + "epoch": 0.32907725321888415, + "grad_norm": 0.294921875, + "learning_rate": 4.983199581216772e-06, + "loss": 2.2419, + "step": 6134 + }, + { + "epoch": 0.3291309012875536, + "grad_norm": 0.369140625, + "learning_rate": 4.983189524848326e-06, + "loss": 2.42, + "step": 6135 + }, + { + "epoch": 0.32918454935622316, + "grad_norm": 0.349609375, + "learning_rate": 4.983179465481173e-06, + "loss": 2.3226, + "step": 6136 + }, + { + "epoch": 0.3292381974248927, + "grad_norm": 0.427734375, + "learning_rate": 4.983169403115321e-06, + "loss": 2.0487, + "step": 6137 + }, + { + "epoch": 0.3292918454935622, + "grad_norm": 0.337890625, + "learning_rate": 4.983159337750786e-06, + "loss": 2.3074, + "step": 6138 + }, + { + "epoch": 0.32934549356223175, + "grad_norm": 0.85546875, + "learning_rate": 4.983149269387577e-06, + "loss": 2.2781, + "step": 6139 + }, + { + "epoch": 0.3293991416309013, + "grad_norm": 0.45703125, + "learning_rate": 4.983139198025708e-06, + "loss": 2.1307, + "step": 6140 + }, + { + "epoch": 0.3294527896995708, + "grad_norm": 0.357421875, + "learning_rate": 4.9831291236651916e-06, + "loss": 2.4285, + "step": 6141 + }, + { + "epoch": 0.32950643776824035, + "grad_norm": 0.390625, + "learning_rate": 4.9831190463060386e-06, + "loss": 2.1537, + "step": 6142 + }, + { + "epoch": 0.3295600858369099, + "grad_norm": 0.41796875, + "learning_rate": 4.983108965948262e-06, + "loss": 2.3353, + "step": 6143 + }, + { + "epoch": 0.3296137339055794, + "grad_norm": 0.337890625, + "learning_rate": 4.983098882591873e-06, + "loss": 2.3697, + "step": 6144 + }, + { + "epoch": 0.32966738197424894, + "grad_norm": 0.4140625, + "learning_rate": 4.983088796236886e-06, + "loss": 2.6538, + "step": 6145 + }, + { + "epoch": 0.3297210300429185, + "grad_norm": 0.353515625, + "learning_rate": 4.98307870688331e-06, + "loss": 2.232, + "step": 6146 + }, + { + "epoch": 0.329774678111588, + "grad_norm": 2.421875, + "learning_rate": 4.983068614531159e-06, + "loss": 2.4832, + "step": 6147 + }, + { + "epoch": 0.32982832618025754, + "grad_norm": 0.435546875, + "learning_rate": 4.983058519180446e-06, + "loss": 2.3333, + "step": 6148 + }, + { + "epoch": 0.329881974248927, + "grad_norm": 0.421875, + "learning_rate": 4.983048420831181e-06, + "loss": 2.3776, + "step": 6149 + }, + { + "epoch": 0.32993562231759654, + "grad_norm": 0.392578125, + "learning_rate": 4.983038319483379e-06, + "loss": 2.2941, + "step": 6150 + }, + { + "epoch": 0.3299892703862661, + "grad_norm": 0.392578125, + "learning_rate": 4.98302821513705e-06, + "loss": 2.3316, + "step": 6151 + }, + { + "epoch": 0.3300429184549356, + "grad_norm": 0.400390625, + "learning_rate": 4.983018107792206e-06, + "loss": 2.4474, + "step": 6152 + }, + { + "epoch": 0.33009656652360514, + "grad_norm": 0.349609375, + "learning_rate": 4.983007997448861e-06, + "loss": 2.1331, + "step": 6153 + }, + { + "epoch": 0.33015021459227467, + "grad_norm": 0.3359375, + "learning_rate": 4.982997884107026e-06, + "loss": 2.4708, + "step": 6154 + }, + { + "epoch": 0.3302038626609442, + "grad_norm": 0.43359375, + "learning_rate": 4.982987767766713e-06, + "loss": 2.3806, + "step": 6155 + }, + { + "epoch": 0.33025751072961373, + "grad_norm": 0.3984375, + "learning_rate": 4.982977648427935e-06, + "loss": 2.3655, + "step": 6156 + }, + { + "epoch": 0.33031115879828327, + "grad_norm": 0.3828125, + "learning_rate": 4.982967526090704e-06, + "loss": 2.2868, + "step": 6157 + }, + { + "epoch": 0.3303648068669528, + "grad_norm": 0.3828125, + "learning_rate": 4.982957400755032e-06, + "loss": 2.4617, + "step": 6158 + }, + { + "epoch": 0.33041845493562233, + "grad_norm": 0.42578125, + "learning_rate": 4.982947272420932e-06, + "loss": 2.463, + "step": 6159 + }, + { + "epoch": 0.33047210300429186, + "grad_norm": 0.3203125, + "learning_rate": 4.982937141088414e-06, + "loss": 2.0599, + "step": 6160 + }, + { + "epoch": 0.3305257510729614, + "grad_norm": 0.408203125, + "learning_rate": 4.982927006757493e-06, + "loss": 2.6343, + "step": 6161 + }, + { + "epoch": 0.3305793991416309, + "grad_norm": 0.51953125, + "learning_rate": 4.98291686942818e-06, + "loss": 2.3084, + "step": 6162 + }, + { + "epoch": 0.3306330472103004, + "grad_norm": 0.4453125, + "learning_rate": 4.982906729100486e-06, + "loss": 2.2834, + "step": 6163 + }, + { + "epoch": 0.33068669527896993, + "grad_norm": 0.349609375, + "learning_rate": 4.982896585774426e-06, + "loss": 2.3624, + "step": 6164 + }, + { + "epoch": 0.33074034334763946, + "grad_norm": 0.34765625, + "learning_rate": 4.982886439450011e-06, + "loss": 2.2742, + "step": 6165 + }, + { + "epoch": 0.330793991416309, + "grad_norm": 0.40234375, + "learning_rate": 4.982876290127252e-06, + "loss": 2.2644, + "step": 6166 + }, + { + "epoch": 0.33084763948497853, + "grad_norm": 1.1953125, + "learning_rate": 4.982866137806162e-06, + "loss": 2.4005, + "step": 6167 + }, + { + "epoch": 0.33090128755364806, + "grad_norm": 0.39453125, + "learning_rate": 4.982855982486755e-06, + "loss": 2.4294, + "step": 6168 + }, + { + "epoch": 0.3309549356223176, + "grad_norm": 0.3984375, + "learning_rate": 4.982845824169041e-06, + "loss": 2.2048, + "step": 6169 + }, + { + "epoch": 0.3310085836909871, + "grad_norm": 0.353515625, + "learning_rate": 4.9828356628530324e-06, + "loss": 2.0934, + "step": 6170 + }, + { + "epoch": 0.33106223175965666, + "grad_norm": 0.51171875, + "learning_rate": 4.982825498538743e-06, + "loss": 2.5427, + "step": 6171 + }, + { + "epoch": 0.3311158798283262, + "grad_norm": 0.37109375, + "learning_rate": 4.982815331226185e-06, + "loss": 2.3313, + "step": 6172 + }, + { + "epoch": 0.3311695278969957, + "grad_norm": 0.42578125, + "learning_rate": 4.982805160915368e-06, + "loss": 2.4755, + "step": 6173 + }, + { + "epoch": 0.33122317596566525, + "grad_norm": 0.43359375, + "learning_rate": 4.9827949876063076e-06, + "loss": 2.3055, + "step": 6174 + }, + { + "epoch": 0.3312768240343348, + "grad_norm": 0.392578125, + "learning_rate": 4.982784811299014e-06, + "loss": 2.2944, + "step": 6175 + }, + { + "epoch": 0.3313304721030043, + "grad_norm": 0.3984375, + "learning_rate": 4.9827746319935e-06, + "loss": 2.2646, + "step": 6176 + }, + { + "epoch": 0.33138412017167385, + "grad_norm": 0.431640625, + "learning_rate": 4.982764449689779e-06, + "loss": 2.3164, + "step": 6177 + }, + { + "epoch": 0.3314377682403433, + "grad_norm": 0.365234375, + "learning_rate": 4.9827542643878615e-06, + "loss": 2.2809, + "step": 6178 + }, + { + "epoch": 0.33149141630901285, + "grad_norm": 0.416015625, + "learning_rate": 4.982744076087761e-06, + "loss": 2.4459, + "step": 6179 + }, + { + "epoch": 0.3315450643776824, + "grad_norm": 0.49609375, + "learning_rate": 4.98273388478949e-06, + "loss": 2.2354, + "step": 6180 + }, + { + "epoch": 0.3315987124463519, + "grad_norm": 0.419921875, + "learning_rate": 4.9827236904930595e-06, + "loss": 2.437, + "step": 6181 + }, + { + "epoch": 0.33165236051502145, + "grad_norm": 0.435546875, + "learning_rate": 4.982713493198482e-06, + "loss": 2.1946, + "step": 6182 + }, + { + "epoch": 0.331706008583691, + "grad_norm": 0.365234375, + "learning_rate": 4.9827032929057715e-06, + "loss": 2.269, + "step": 6183 + }, + { + "epoch": 0.3317596566523605, + "grad_norm": 0.3671875, + "learning_rate": 4.982693089614939e-06, + "loss": 2.2461, + "step": 6184 + }, + { + "epoch": 0.33181330472103004, + "grad_norm": 0.369140625, + "learning_rate": 4.982682883325996e-06, + "loss": 2.2304, + "step": 6185 + }, + { + "epoch": 0.3318669527896996, + "grad_norm": 0.462890625, + "learning_rate": 4.982672674038957e-06, + "loss": 2.2716, + "step": 6186 + }, + { + "epoch": 0.3319206008583691, + "grad_norm": 0.734375, + "learning_rate": 4.982662461753833e-06, + "loss": 2.0953, + "step": 6187 + }, + { + "epoch": 0.33197424892703864, + "grad_norm": 0.412109375, + "learning_rate": 4.9826522464706364e-06, + "loss": 2.1803, + "step": 6188 + }, + { + "epoch": 0.33202789699570817, + "grad_norm": 0.478515625, + "learning_rate": 4.9826420281893796e-06, + "loss": 2.0118, + "step": 6189 + }, + { + "epoch": 0.3320815450643777, + "grad_norm": 0.416015625, + "learning_rate": 4.982631806910075e-06, + "loss": 2.1027, + "step": 6190 + }, + { + "epoch": 0.33213519313304724, + "grad_norm": 0.490234375, + "learning_rate": 4.982621582632735e-06, + "loss": 2.4729, + "step": 6191 + }, + { + "epoch": 0.3321888412017167, + "grad_norm": 0.353515625, + "learning_rate": 4.982611355357372e-06, + "loss": 2.2429, + "step": 6192 + }, + { + "epoch": 0.33224248927038624, + "grad_norm": 0.51953125, + "learning_rate": 4.982601125083997e-06, + "loss": 2.4343, + "step": 6193 + }, + { + "epoch": 0.3322961373390558, + "grad_norm": 0.427734375, + "learning_rate": 4.982590891812625e-06, + "loss": 2.4006, + "step": 6194 + }, + { + "epoch": 0.3323497854077253, + "grad_norm": 0.49609375, + "learning_rate": 4.982580655543267e-06, + "loss": 2.3188, + "step": 6195 + }, + { + "epoch": 0.33240343347639484, + "grad_norm": 0.373046875, + "learning_rate": 4.982570416275934e-06, + "loss": 2.3659, + "step": 6196 + }, + { + "epoch": 0.33245708154506437, + "grad_norm": 0.388671875, + "learning_rate": 4.982560174010641e-06, + "loss": 2.3628, + "step": 6197 + }, + { + "epoch": 0.3325107296137339, + "grad_norm": 0.302734375, + "learning_rate": 4.982549928747398e-06, + "loss": 2.1509, + "step": 6198 + }, + { + "epoch": 0.33256437768240343, + "grad_norm": 0.39453125, + "learning_rate": 4.982539680486219e-06, + "loss": 2.3014, + "step": 6199 + }, + { + "epoch": 0.33261802575107297, + "grad_norm": 0.369140625, + "learning_rate": 4.982529429227115e-06, + "loss": 2.2125, + "step": 6200 + }, + { + "epoch": 0.3326716738197425, + "grad_norm": 0.41015625, + "learning_rate": 4.9825191749701005e-06, + "loss": 2.2714, + "step": 6201 + }, + { + "epoch": 0.33272532188841203, + "grad_norm": 0.34765625, + "learning_rate": 4.982508917715186e-06, + "loss": 2.3071, + "step": 6202 + }, + { + "epoch": 0.33277896995708156, + "grad_norm": 0.345703125, + "learning_rate": 4.982498657462384e-06, + "loss": 2.3179, + "step": 6203 + }, + { + "epoch": 0.3328326180257511, + "grad_norm": 0.431640625, + "learning_rate": 4.982488394211708e-06, + "loss": 2.3073, + "step": 6204 + }, + { + "epoch": 0.3328862660944206, + "grad_norm": 0.55078125, + "learning_rate": 4.982478127963169e-06, + "loss": 2.2443, + "step": 6205 + }, + { + "epoch": 0.33293991416309016, + "grad_norm": 0.47265625, + "learning_rate": 4.982467858716781e-06, + "loss": 2.3316, + "step": 6206 + }, + { + "epoch": 0.33299356223175963, + "grad_norm": 0.396484375, + "learning_rate": 4.982457586472554e-06, + "loss": 2.2676, + "step": 6207 + }, + { + "epoch": 0.33304721030042916, + "grad_norm": 0.462890625, + "learning_rate": 4.982447311230503e-06, + "loss": 2.276, + "step": 6208 + }, + { + "epoch": 0.3331008583690987, + "grad_norm": 0.5078125, + "learning_rate": 4.982437032990639e-06, + "loss": 2.2825, + "step": 6209 + }, + { + "epoch": 0.3331545064377682, + "grad_norm": 0.3671875, + "learning_rate": 4.982426751752976e-06, + "loss": 2.1818, + "step": 6210 + }, + { + "epoch": 0.33320815450643776, + "grad_norm": 0.416015625, + "learning_rate": 4.982416467517523e-06, + "loss": 2.2493, + "step": 6211 + }, + { + "epoch": 0.3332618025751073, + "grad_norm": 0.353515625, + "learning_rate": 4.982406180284296e-06, + "loss": 2.1804, + "step": 6212 + }, + { + "epoch": 0.3333154506437768, + "grad_norm": 0.44140625, + "learning_rate": 4.9823958900533055e-06, + "loss": 2.6195, + "step": 6213 + }, + { + "epoch": 0.33336909871244635, + "grad_norm": 0.671875, + "learning_rate": 4.982385596824565e-06, + "loss": 2.4457, + "step": 6214 + }, + { + "epoch": 0.3334227467811159, + "grad_norm": 0.361328125, + "learning_rate": 4.9823753005980854e-06, + "loss": 2.4828, + "step": 6215 + }, + { + "epoch": 0.3334763948497854, + "grad_norm": 0.404296875, + "learning_rate": 4.982365001373881e-06, + "loss": 2.1065, + "step": 6216 + }, + { + "epoch": 0.33353004291845495, + "grad_norm": 0.5390625, + "learning_rate": 4.982354699151962e-06, + "loss": 1.8164, + "step": 6217 + }, + { + "epoch": 0.3335836909871245, + "grad_norm": 0.302734375, + "learning_rate": 4.982344393932344e-06, + "loss": 1.9037, + "step": 6218 + }, + { + "epoch": 0.333637339055794, + "grad_norm": 0.419921875, + "learning_rate": 4.982334085715036e-06, + "loss": 2.3238, + "step": 6219 + }, + { + "epoch": 0.33369098712446355, + "grad_norm": 0.47265625, + "learning_rate": 4.982323774500053e-06, + "loss": 2.4664, + "step": 6220 + }, + { + "epoch": 0.333744635193133, + "grad_norm": 0.423828125, + "learning_rate": 4.982313460287407e-06, + "loss": 2.2352, + "step": 6221 + }, + { + "epoch": 0.33379828326180255, + "grad_norm": 0.375, + "learning_rate": 4.982303143077109e-06, + "loss": 2.1402, + "step": 6222 + }, + { + "epoch": 0.3338519313304721, + "grad_norm": 0.408203125, + "learning_rate": 4.982292822869172e-06, + "loss": 2.0841, + "step": 6223 + }, + { + "epoch": 0.3339055793991416, + "grad_norm": 0.470703125, + "learning_rate": 4.9822824996636095e-06, + "loss": 2.2915, + "step": 6224 + }, + { + "epoch": 0.33395922746781115, + "grad_norm": 0.41796875, + "learning_rate": 4.982272173460434e-06, + "loss": 2.0763, + "step": 6225 + }, + { + "epoch": 0.3340128755364807, + "grad_norm": 0.3984375, + "learning_rate": 4.982261844259656e-06, + "loss": 2.1941, + "step": 6226 + }, + { + "epoch": 0.3340665236051502, + "grad_norm": 0.41796875, + "learning_rate": 4.9822515120612905e-06, + "loss": 2.4662, + "step": 6227 + }, + { + "epoch": 0.33412017167381974, + "grad_norm": 0.388671875, + "learning_rate": 4.982241176865348e-06, + "loss": 2.1588, + "step": 6228 + }, + { + "epoch": 0.3341738197424893, + "grad_norm": 0.3984375, + "learning_rate": 4.982230838671842e-06, + "loss": 2.232, + "step": 6229 + }, + { + "epoch": 0.3342274678111588, + "grad_norm": 0.41015625, + "learning_rate": 4.982220497480784e-06, + "loss": 2.1211, + "step": 6230 + }, + { + "epoch": 0.33428111587982834, + "grad_norm": 0.44921875, + "learning_rate": 4.9822101532921885e-06, + "loss": 1.7361, + "step": 6231 + }, + { + "epoch": 0.33433476394849787, + "grad_norm": 0.337890625, + "learning_rate": 4.982199806106066e-06, + "loss": 2.3473, + "step": 6232 + }, + { + "epoch": 0.3343884120171674, + "grad_norm": 0.4375, + "learning_rate": 4.98218945592243e-06, + "loss": 2.2169, + "step": 6233 + }, + { + "epoch": 0.33444206008583693, + "grad_norm": 0.392578125, + "learning_rate": 4.982179102741293e-06, + "loss": 2.1592, + "step": 6234 + }, + { + "epoch": 0.3344957081545064, + "grad_norm": 0.384765625, + "learning_rate": 4.9821687465626665e-06, + "loss": 2.3095, + "step": 6235 + }, + { + "epoch": 0.33454935622317594, + "grad_norm": 0.40625, + "learning_rate": 4.982158387386564e-06, + "loss": 2.2263, + "step": 6236 + }, + { + "epoch": 0.3346030042918455, + "grad_norm": 0.40234375, + "learning_rate": 4.982148025212998e-06, + "loss": 2.434, + "step": 6237 + }, + { + "epoch": 0.334656652360515, + "grad_norm": 0.375, + "learning_rate": 4.982137660041981e-06, + "loss": 2.4396, + "step": 6238 + }, + { + "epoch": 0.33471030042918454, + "grad_norm": 0.427734375, + "learning_rate": 4.982127291873524e-06, + "loss": 2.0189, + "step": 6239 + }, + { + "epoch": 0.33476394849785407, + "grad_norm": 0.392578125, + "learning_rate": 4.9821169207076415e-06, + "loss": 2.3757, + "step": 6240 + }, + { + "epoch": 0.3348175965665236, + "grad_norm": 0.51953125, + "learning_rate": 4.982106546544345e-06, + "loss": 2.1437, + "step": 6241 + }, + { + "epoch": 0.33487124463519313, + "grad_norm": 0.455078125, + "learning_rate": 4.9820961693836476e-06, + "loss": 1.8153, + "step": 6242 + }, + { + "epoch": 0.33492489270386266, + "grad_norm": 0.390625, + "learning_rate": 4.982085789225562e-06, + "loss": 2.4534, + "step": 6243 + }, + { + "epoch": 0.3349785407725322, + "grad_norm": 1.0859375, + "learning_rate": 4.9820754060701e-06, + "loss": 2.6412, + "step": 6244 + }, + { + "epoch": 0.33503218884120173, + "grad_norm": 0.451171875, + "learning_rate": 4.982065019917274e-06, + "loss": 2.0847, + "step": 6245 + }, + { + "epoch": 0.33508583690987126, + "grad_norm": 0.4765625, + "learning_rate": 4.982054630767098e-06, + "loss": 2.1131, + "step": 6246 + }, + { + "epoch": 0.3351394849785408, + "grad_norm": 0.58984375, + "learning_rate": 4.982044238619582e-06, + "loss": 2.5332, + "step": 6247 + }, + { + "epoch": 0.3351931330472103, + "grad_norm": 0.416015625, + "learning_rate": 4.982033843474742e-06, + "loss": 2.4738, + "step": 6248 + }, + { + "epoch": 0.33524678111587985, + "grad_norm": 0.4296875, + "learning_rate": 4.982023445332587e-06, + "loss": 2.2144, + "step": 6249 + }, + { + "epoch": 0.33530042918454933, + "grad_norm": 0.41796875, + "learning_rate": 4.982013044193131e-06, + "loss": 2.4081, + "step": 6250 + }, + { + "epoch": 0.33535407725321886, + "grad_norm": 0.5546875, + "learning_rate": 4.982002640056388e-06, + "loss": 2.2374, + "step": 6251 + }, + { + "epoch": 0.3354077253218884, + "grad_norm": 0.373046875, + "learning_rate": 4.9819922329223694e-06, + "loss": 2.4697, + "step": 6252 + }, + { + "epoch": 0.3354613733905579, + "grad_norm": 0.451171875, + "learning_rate": 4.981981822791087e-06, + "loss": 2.6227, + "step": 6253 + }, + { + "epoch": 0.33551502145922746, + "grad_norm": 0.44140625, + "learning_rate": 4.981971409662554e-06, + "loss": 2.3198, + "step": 6254 + }, + { + "epoch": 0.335568669527897, + "grad_norm": 0.365234375, + "learning_rate": 4.981960993536783e-06, + "loss": 2.331, + "step": 6255 + }, + { + "epoch": 0.3356223175965665, + "grad_norm": 0.3359375, + "learning_rate": 4.981950574413787e-06, + "loss": 2.4817, + "step": 6256 + }, + { + "epoch": 0.33567596566523605, + "grad_norm": 0.388671875, + "learning_rate": 4.981940152293578e-06, + "loss": 2.2822, + "step": 6257 + }, + { + "epoch": 0.3357296137339056, + "grad_norm": 0.55078125, + "learning_rate": 4.981929727176169e-06, + "loss": 2.5526, + "step": 6258 + }, + { + "epoch": 0.3357832618025751, + "grad_norm": 0.65625, + "learning_rate": 4.981919299061572e-06, + "loss": 2.2303, + "step": 6259 + }, + { + "epoch": 0.33583690987124465, + "grad_norm": 0.55078125, + "learning_rate": 4.9819088679498e-06, + "loss": 2.2919, + "step": 6260 + }, + { + "epoch": 0.3358905579399142, + "grad_norm": 0.38671875, + "learning_rate": 4.981898433840865e-06, + "loss": 2.1403, + "step": 6261 + }, + { + "epoch": 0.3359442060085837, + "grad_norm": 0.55859375, + "learning_rate": 4.981887996734781e-06, + "loss": 2.5637, + "step": 6262 + }, + { + "epoch": 0.33599785407725324, + "grad_norm": 0.44140625, + "learning_rate": 4.98187755663156e-06, + "loss": 2.0781, + "step": 6263 + }, + { + "epoch": 0.3360515021459227, + "grad_norm": 0.404296875, + "learning_rate": 4.9818671135312135e-06, + "loss": 2.488, + "step": 6264 + }, + { + "epoch": 0.33610515021459225, + "grad_norm": 0.546875, + "learning_rate": 4.9818566674337556e-06, + "loss": 1.6521, + "step": 6265 + }, + { + "epoch": 0.3361587982832618, + "grad_norm": 0.373046875, + "learning_rate": 4.981846218339198e-06, + "loss": 2.1666, + "step": 6266 + }, + { + "epoch": 0.3362124463519313, + "grad_norm": 0.421875, + "learning_rate": 4.981835766247554e-06, + "loss": 2.4519, + "step": 6267 + }, + { + "epoch": 0.33626609442060085, + "grad_norm": 0.384765625, + "learning_rate": 4.981825311158835e-06, + "loss": 2.4416, + "step": 6268 + }, + { + "epoch": 0.3363197424892704, + "grad_norm": 0.37890625, + "learning_rate": 4.981814853073055e-06, + "loss": 2.2633, + "step": 6269 + }, + { + "epoch": 0.3363733905579399, + "grad_norm": 0.35546875, + "learning_rate": 4.981804391990226e-06, + "loss": 2.136, + "step": 6270 + }, + { + "epoch": 0.33642703862660944, + "grad_norm": 0.5234375, + "learning_rate": 4.9817939279103605e-06, + "loss": 2.2456, + "step": 6271 + }, + { + "epoch": 0.336480686695279, + "grad_norm": 0.41015625, + "learning_rate": 4.981783460833471e-06, + "loss": 2.384, + "step": 6272 + }, + { + "epoch": 0.3365343347639485, + "grad_norm": 0.412109375, + "learning_rate": 4.981772990759572e-06, + "loss": 2.0576, + "step": 6273 + }, + { + "epoch": 0.33658798283261804, + "grad_norm": 0.4296875, + "learning_rate": 4.981762517688673e-06, + "loss": 2.4873, + "step": 6274 + }, + { + "epoch": 0.33664163090128757, + "grad_norm": 0.34375, + "learning_rate": 4.981752041620789e-06, + "loss": 2.037, + "step": 6275 + }, + { + "epoch": 0.3366952789699571, + "grad_norm": 0.4140625, + "learning_rate": 4.981741562555932e-06, + "loss": 2.0247, + "step": 6276 + }, + { + "epoch": 0.33674892703862663, + "grad_norm": 3.140625, + "learning_rate": 4.981731080494114e-06, + "loss": 2.1542, + "step": 6277 + }, + { + "epoch": 0.33680257510729616, + "grad_norm": 0.39453125, + "learning_rate": 4.981720595435349e-06, + "loss": 2.2077, + "step": 6278 + }, + { + "epoch": 0.33685622317596564, + "grad_norm": 0.3984375, + "learning_rate": 4.981710107379649e-06, + "loss": 2.0963, + "step": 6279 + }, + { + "epoch": 0.3369098712446352, + "grad_norm": 0.498046875, + "learning_rate": 4.981699616327026e-06, + "loss": 2.3866, + "step": 6280 + }, + { + "epoch": 0.3369635193133047, + "grad_norm": 0.4453125, + "learning_rate": 4.981689122277494e-06, + "loss": 2.3376, + "step": 6281 + }, + { + "epoch": 0.33701716738197424, + "grad_norm": 0.396484375, + "learning_rate": 4.981678625231065e-06, + "loss": 2.3608, + "step": 6282 + }, + { + "epoch": 0.33707081545064377, + "grad_norm": 0.76171875, + "learning_rate": 4.98166812518775e-06, + "loss": 2.4434, + "step": 6283 + }, + { + "epoch": 0.3371244635193133, + "grad_norm": 0.396484375, + "learning_rate": 4.981657622147564e-06, + "loss": 2.2546, + "step": 6284 + }, + { + "epoch": 0.33717811158798283, + "grad_norm": 0.447265625, + "learning_rate": 4.981647116110519e-06, + "loss": 2.2607, + "step": 6285 + }, + { + "epoch": 0.33723175965665236, + "grad_norm": 0.408203125, + "learning_rate": 4.981636607076629e-06, + "loss": 2.1374, + "step": 6286 + }, + { + "epoch": 0.3372854077253219, + "grad_norm": 1.1015625, + "learning_rate": 4.981626095045904e-06, + "loss": 2.2434, + "step": 6287 + }, + { + "epoch": 0.3373390557939914, + "grad_norm": 0.345703125, + "learning_rate": 4.981615580018358e-06, + "loss": 2.0257, + "step": 6288 + }, + { + "epoch": 0.33739270386266096, + "grad_norm": 0.46484375, + "learning_rate": 4.981605061994004e-06, + "loss": 2.3571, + "step": 6289 + }, + { + "epoch": 0.3374463519313305, + "grad_norm": 0.392578125, + "learning_rate": 4.981594540972854e-06, + "loss": 2.2189, + "step": 6290 + }, + { + "epoch": 0.3375, + "grad_norm": 0.404296875, + "learning_rate": 4.9815840169549216e-06, + "loss": 2.3277, + "step": 6291 + }, + { + "epoch": 0.33755364806866955, + "grad_norm": 0.44921875, + "learning_rate": 4.981573489940219e-06, + "loss": 2.4662, + "step": 6292 + }, + { + "epoch": 0.33760729613733903, + "grad_norm": 0.70703125, + "learning_rate": 4.981562959928759e-06, + "loss": 2.1752, + "step": 6293 + }, + { + "epoch": 0.33766094420600856, + "grad_norm": 0.3984375, + "learning_rate": 4.981552426920554e-06, + "loss": 1.9591, + "step": 6294 + }, + { + "epoch": 0.3377145922746781, + "grad_norm": 0.546875, + "learning_rate": 4.981541890915617e-06, + "loss": 2.3386, + "step": 6295 + }, + { + "epoch": 0.3377682403433476, + "grad_norm": 0.390625, + "learning_rate": 4.981531351913961e-06, + "loss": 2.3421, + "step": 6296 + }, + { + "epoch": 0.33782188841201716, + "grad_norm": 0.376953125, + "learning_rate": 4.981520809915598e-06, + "loss": 2.2087, + "step": 6297 + }, + { + "epoch": 0.3378755364806867, + "grad_norm": 0.51953125, + "learning_rate": 4.9815102649205415e-06, + "loss": 2.3017, + "step": 6298 + }, + { + "epoch": 0.3379291845493562, + "grad_norm": 0.361328125, + "learning_rate": 4.981499716928804e-06, + "loss": 2.2444, + "step": 6299 + }, + { + "epoch": 0.33798283261802575, + "grad_norm": 0.310546875, + "learning_rate": 4.981489165940398e-06, + "loss": 1.9625, + "step": 6300 + }, + { + "epoch": 0.3380364806866953, + "grad_norm": 0.40625, + "learning_rate": 4.981478611955336e-06, + "loss": 2.1885, + "step": 6301 + }, + { + "epoch": 0.3380901287553648, + "grad_norm": 0.330078125, + "learning_rate": 4.981468054973631e-06, + "loss": 2.0947, + "step": 6302 + }, + { + "epoch": 0.33814377682403435, + "grad_norm": 0.46875, + "learning_rate": 4.9814574949952974e-06, + "loss": 2.3617, + "step": 6303 + }, + { + "epoch": 0.3381974248927039, + "grad_norm": 0.86328125, + "learning_rate": 4.981446932020345e-06, + "loss": 2.3611, + "step": 6304 + }, + { + "epoch": 0.3382510729613734, + "grad_norm": 0.70703125, + "learning_rate": 4.9814363660487885e-06, + "loss": 2.3331, + "step": 6305 + }, + { + "epoch": 0.33830472103004294, + "grad_norm": 0.3359375, + "learning_rate": 4.98142579708064e-06, + "loss": 2.2648, + "step": 6306 + }, + { + "epoch": 0.3383583690987124, + "grad_norm": 0.546875, + "learning_rate": 4.981415225115913e-06, + "loss": 2.2553, + "step": 6307 + }, + { + "epoch": 0.33841201716738195, + "grad_norm": 0.447265625, + "learning_rate": 4.981404650154619e-06, + "loss": 2.2818, + "step": 6308 + }, + { + "epoch": 0.3384656652360515, + "grad_norm": 0.4140625, + "learning_rate": 4.981394072196772e-06, + "loss": 2.1893, + "step": 6309 + }, + { + "epoch": 0.338519313304721, + "grad_norm": 0.421875, + "learning_rate": 4.981383491242383e-06, + "loss": 1.9636, + "step": 6310 + }, + { + "epoch": 0.33857296137339055, + "grad_norm": 0.40234375, + "learning_rate": 4.981372907291468e-06, + "loss": 2.5685, + "step": 6311 + }, + { + "epoch": 0.3386266094420601, + "grad_norm": 0.427734375, + "learning_rate": 4.981362320344036e-06, + "loss": 2.5103, + "step": 6312 + }, + { + "epoch": 0.3386802575107296, + "grad_norm": 0.37109375, + "learning_rate": 4.981351730400102e-06, + "loss": 2.4726, + "step": 6313 + }, + { + "epoch": 0.33873390557939914, + "grad_norm": 0.359375, + "learning_rate": 4.9813411374596785e-06, + "loss": 2.1329, + "step": 6314 + }, + { + "epoch": 0.3387875536480687, + "grad_norm": 0.53515625, + "learning_rate": 4.981330541522778e-06, + "loss": 1.7271, + "step": 6315 + }, + { + "epoch": 0.3388412017167382, + "grad_norm": 0.359375, + "learning_rate": 4.9813199425894136e-06, + "loss": 2.1693, + "step": 6316 + }, + { + "epoch": 0.33889484978540774, + "grad_norm": 0.345703125, + "learning_rate": 4.981309340659598e-06, + "loss": 2.3237, + "step": 6317 + }, + { + "epoch": 0.33894849785407727, + "grad_norm": 0.390625, + "learning_rate": 4.9812987357333444e-06, + "loss": 2.491, + "step": 6318 + }, + { + "epoch": 0.3390021459227468, + "grad_norm": 0.53515625, + "learning_rate": 4.981288127810664e-06, + "loss": 2.2532, + "step": 6319 + }, + { + "epoch": 0.33905579399141633, + "grad_norm": 0.3671875, + "learning_rate": 4.981277516891572e-06, + "loss": 2.1667, + "step": 6320 + }, + { + "epoch": 0.33910944206008586, + "grad_norm": 0.8046875, + "learning_rate": 4.981266902976079e-06, + "loss": 2.112, + "step": 6321 + }, + { + "epoch": 0.33916309012875534, + "grad_norm": 0.349609375, + "learning_rate": 4.981256286064199e-06, + "loss": 2.1044, + "step": 6322 + }, + { + "epoch": 0.33921673819742487, + "grad_norm": 0.419921875, + "learning_rate": 4.981245666155945e-06, + "loss": 2.1715, + "step": 6323 + }, + { + "epoch": 0.3392703862660944, + "grad_norm": 0.419921875, + "learning_rate": 4.981235043251329e-06, + "loss": 2.2459, + "step": 6324 + }, + { + "epoch": 0.33932403433476394, + "grad_norm": 0.447265625, + "learning_rate": 4.981224417350364e-06, + "loss": 2.3589, + "step": 6325 + }, + { + "epoch": 0.33937768240343347, + "grad_norm": 0.37109375, + "learning_rate": 4.981213788453064e-06, + "loss": 2.3097, + "step": 6326 + }, + { + "epoch": 0.339431330472103, + "grad_norm": 0.77734375, + "learning_rate": 4.9812031565594406e-06, + "loss": 2.3132, + "step": 6327 + }, + { + "epoch": 0.33948497854077253, + "grad_norm": 0.3515625, + "learning_rate": 4.981192521669507e-06, + "loss": 2.6137, + "step": 6328 + }, + { + "epoch": 0.33953862660944206, + "grad_norm": 0.400390625, + "learning_rate": 4.981181883783275e-06, + "loss": 2.2378, + "step": 6329 + }, + { + "epoch": 0.3395922746781116, + "grad_norm": 0.416015625, + "learning_rate": 4.98117124290076e-06, + "loss": 2.3782, + "step": 6330 + }, + { + "epoch": 0.3396459227467811, + "grad_norm": 0.37109375, + "learning_rate": 4.981160599021973e-06, + "loss": 2.1558, + "step": 6331 + }, + { + "epoch": 0.33969957081545066, + "grad_norm": 0.4296875, + "learning_rate": 4.981149952146927e-06, + "loss": 2.2072, + "step": 6332 + }, + { + "epoch": 0.3397532188841202, + "grad_norm": 0.404296875, + "learning_rate": 4.981139302275634e-06, + "loss": 2.2946, + "step": 6333 + }, + { + "epoch": 0.3398068669527897, + "grad_norm": 0.408203125, + "learning_rate": 4.981128649408109e-06, + "loss": 2.2145, + "step": 6334 + }, + { + "epoch": 0.33986051502145925, + "grad_norm": 0.37109375, + "learning_rate": 4.9811179935443635e-06, + "loss": 2.3604, + "step": 6335 + }, + { + "epoch": 0.33991416309012873, + "grad_norm": 0.390625, + "learning_rate": 4.98110733468441e-06, + "loss": 2.2619, + "step": 6336 + }, + { + "epoch": 0.33996781115879826, + "grad_norm": 0.3671875, + "learning_rate": 4.981096672828263e-06, + "loss": 2.0779, + "step": 6337 + }, + { + "epoch": 0.3400214592274678, + "grad_norm": 0.419921875, + "learning_rate": 4.981086007975934e-06, + "loss": 2.2789, + "step": 6338 + }, + { + "epoch": 0.3400751072961373, + "grad_norm": 0.341796875, + "learning_rate": 4.981075340127436e-06, + "loss": 2.2403, + "step": 6339 + }, + { + "epoch": 0.34012875536480686, + "grad_norm": 0.486328125, + "learning_rate": 4.981064669282782e-06, + "loss": 2.2218, + "step": 6340 + }, + { + "epoch": 0.3401824034334764, + "grad_norm": 0.369140625, + "learning_rate": 4.981053995441985e-06, + "loss": 2.1174, + "step": 6341 + }, + { + "epoch": 0.3402360515021459, + "grad_norm": 0.388671875, + "learning_rate": 4.9810433186050586e-06, + "loss": 2.4048, + "step": 6342 + }, + { + "epoch": 0.34028969957081545, + "grad_norm": 0.451171875, + "learning_rate": 4.981032638772014e-06, + "loss": 2.1118, + "step": 6343 + }, + { + "epoch": 0.340343347639485, + "grad_norm": 0.3828125, + "learning_rate": 4.981021955942866e-06, + "loss": 2.4515, + "step": 6344 + }, + { + "epoch": 0.3403969957081545, + "grad_norm": 0.431640625, + "learning_rate": 4.9810112701176265e-06, + "loss": 2.2746, + "step": 6345 + }, + { + "epoch": 0.34045064377682405, + "grad_norm": 2.53125, + "learning_rate": 4.981000581296308e-06, + "loss": 1.8443, + "step": 6346 + }, + { + "epoch": 0.3405042918454936, + "grad_norm": 0.3671875, + "learning_rate": 4.980989889478924e-06, + "loss": 2.0696, + "step": 6347 + }, + { + "epoch": 0.3405579399141631, + "grad_norm": 0.42578125, + "learning_rate": 4.9809791946654875e-06, + "loss": 2.3379, + "step": 6348 + }, + { + "epoch": 0.34061158798283264, + "grad_norm": 0.3828125, + "learning_rate": 4.980968496856011e-06, + "loss": 2.6259, + "step": 6349 + }, + { + "epoch": 0.3406652360515021, + "grad_norm": 0.423828125, + "learning_rate": 4.980957796050507e-06, + "loss": 2.2757, + "step": 6350 + }, + { + "epoch": 0.34071888412017165, + "grad_norm": 0.40625, + "learning_rate": 4.980947092248991e-06, + "loss": 2.4966, + "step": 6351 + }, + { + "epoch": 0.3407725321888412, + "grad_norm": 0.703125, + "learning_rate": 4.980936385451472e-06, + "loss": 2.2821, + "step": 6352 + }, + { + "epoch": 0.3408261802575107, + "grad_norm": 0.380859375, + "learning_rate": 4.980925675657966e-06, + "loss": 2.2872, + "step": 6353 + }, + { + "epoch": 0.34087982832618025, + "grad_norm": 0.46875, + "learning_rate": 4.980914962868485e-06, + "loss": 2.3598, + "step": 6354 + }, + { + "epoch": 0.3409334763948498, + "grad_norm": 0.40625, + "learning_rate": 4.980904247083041e-06, + "loss": 2.4833, + "step": 6355 + }, + { + "epoch": 0.3409871244635193, + "grad_norm": 0.3671875, + "learning_rate": 4.980893528301648e-06, + "loss": 2.3833, + "step": 6356 + }, + { + "epoch": 0.34104077253218884, + "grad_norm": 0.41015625, + "learning_rate": 4.980882806524319e-06, + "loss": 2.2885, + "step": 6357 + }, + { + "epoch": 0.3410944206008584, + "grad_norm": 0.43359375, + "learning_rate": 4.9808720817510665e-06, + "loss": 2.6119, + "step": 6358 + }, + { + "epoch": 0.3411480686695279, + "grad_norm": 0.5234375, + "learning_rate": 4.980861353981904e-06, + "loss": 2.6435, + "step": 6359 + }, + { + "epoch": 0.34120171673819744, + "grad_norm": 0.41015625, + "learning_rate": 4.980850623216843e-06, + "loss": 2.299, + "step": 6360 + }, + { + "epoch": 0.34125536480686697, + "grad_norm": 0.431640625, + "learning_rate": 4.9808398894558985e-06, + "loss": 2.3774, + "step": 6361 + }, + { + "epoch": 0.3413090128755365, + "grad_norm": 0.357421875, + "learning_rate": 4.980829152699081e-06, + "loss": 2.1218, + "step": 6362 + }, + { + "epoch": 0.34136266094420603, + "grad_norm": 0.37109375, + "learning_rate": 4.980818412946407e-06, + "loss": 2.3273, + "step": 6363 + }, + { + "epoch": 0.34141630901287556, + "grad_norm": 0.40625, + "learning_rate": 4.9808076701978854e-06, + "loss": 2.3034, + "step": 6364 + }, + { + "epoch": 0.34146995708154504, + "grad_norm": 0.423828125, + "learning_rate": 4.980796924453533e-06, + "loss": 2.6145, + "step": 6365 + }, + { + "epoch": 0.34152360515021457, + "grad_norm": 0.443359375, + "learning_rate": 4.9807861757133595e-06, + "loss": 2.4874, + "step": 6366 + }, + { + "epoch": 0.3415772532188841, + "grad_norm": 0.5078125, + "learning_rate": 4.98077542397738e-06, + "loss": 2.1818, + "step": 6367 + }, + { + "epoch": 0.34163090128755363, + "grad_norm": 0.384765625, + "learning_rate": 4.980764669245607e-06, + "loss": 2.3044, + "step": 6368 + }, + { + "epoch": 0.34168454935622317, + "grad_norm": 0.51953125, + "learning_rate": 4.980753911518052e-06, + "loss": 2.3247, + "step": 6369 + }, + { + "epoch": 0.3417381974248927, + "grad_norm": 0.34765625, + "learning_rate": 4.980743150794731e-06, + "loss": 2.3258, + "step": 6370 + }, + { + "epoch": 0.34179184549356223, + "grad_norm": 0.3828125, + "learning_rate": 4.980732387075654e-06, + "loss": 2.2604, + "step": 6371 + }, + { + "epoch": 0.34184549356223176, + "grad_norm": 0.373046875, + "learning_rate": 4.980721620360836e-06, + "loss": 2.0988, + "step": 6372 + }, + { + "epoch": 0.3418991416309013, + "grad_norm": 0.69140625, + "learning_rate": 4.980710850650289e-06, + "loss": 2.3415, + "step": 6373 + }, + { + "epoch": 0.3419527896995708, + "grad_norm": 0.416015625, + "learning_rate": 4.9807000779440265e-06, + "loss": 2.179, + "step": 6374 + }, + { + "epoch": 0.34200643776824036, + "grad_norm": 0.443359375, + "learning_rate": 4.980689302242061e-06, + "loss": 2.3506, + "step": 6375 + }, + { + "epoch": 0.3420600858369099, + "grad_norm": 0.52734375, + "learning_rate": 4.980678523544406e-06, + "loss": 2.2638, + "step": 6376 + }, + { + "epoch": 0.3421137339055794, + "grad_norm": 0.373046875, + "learning_rate": 4.980667741851074e-06, + "loss": 2.1076, + "step": 6377 + }, + { + "epoch": 0.34216738197424895, + "grad_norm": 0.375, + "learning_rate": 4.980656957162079e-06, + "loss": 2.303, + "step": 6378 + }, + { + "epoch": 0.34222103004291843, + "grad_norm": 0.51953125, + "learning_rate": 4.980646169477432e-06, + "loss": 2.1318, + "step": 6379 + }, + { + "epoch": 0.34227467811158796, + "grad_norm": 0.431640625, + "learning_rate": 4.980635378797148e-06, + "loss": 2.3494, + "step": 6380 + }, + { + "epoch": 0.3423283261802575, + "grad_norm": 0.38671875, + "learning_rate": 4.98062458512124e-06, + "loss": 2.3982, + "step": 6381 + }, + { + "epoch": 0.342381974248927, + "grad_norm": 0.37109375, + "learning_rate": 4.980613788449721e-06, + "loss": 2.3793, + "step": 6382 + }, + { + "epoch": 0.34243562231759656, + "grad_norm": 0.365234375, + "learning_rate": 4.980602988782602e-06, + "loss": 1.78, + "step": 6383 + }, + { + "epoch": 0.3424892703862661, + "grad_norm": 0.5859375, + "learning_rate": 4.980592186119899e-06, + "loss": 2.1369, + "step": 6384 + }, + { + "epoch": 0.3425429184549356, + "grad_norm": 0.369140625, + "learning_rate": 4.980581380461622e-06, + "loss": 2.2529, + "step": 6385 + }, + { + "epoch": 0.34259656652360515, + "grad_norm": 0.431640625, + "learning_rate": 4.980570571807787e-06, + "loss": 2.5586, + "step": 6386 + }, + { + "epoch": 0.3426502145922747, + "grad_norm": 0.462890625, + "learning_rate": 4.980559760158404e-06, + "loss": 2.4511, + "step": 6387 + }, + { + "epoch": 0.3427038626609442, + "grad_norm": 0.54296875, + "learning_rate": 4.980548945513489e-06, + "loss": 2.4863, + "step": 6388 + }, + { + "epoch": 0.34275751072961375, + "grad_norm": 0.515625, + "learning_rate": 4.980538127873054e-06, + "loss": 2.0726, + "step": 6389 + }, + { + "epoch": 0.3428111587982833, + "grad_norm": 0.408203125, + "learning_rate": 4.980527307237112e-06, + "loss": 2.3548, + "step": 6390 + }, + { + "epoch": 0.3428648068669528, + "grad_norm": 0.419921875, + "learning_rate": 4.980516483605675e-06, + "loss": 2.2894, + "step": 6391 + }, + { + "epoch": 0.34291845493562234, + "grad_norm": 0.369140625, + "learning_rate": 4.980505656978758e-06, + "loss": 2.2732, + "step": 6392 + }, + { + "epoch": 0.3429721030042919, + "grad_norm": 0.54296875, + "learning_rate": 4.980494827356372e-06, + "loss": 1.5462, + "step": 6393 + }, + { + "epoch": 0.34302575107296135, + "grad_norm": 0.328125, + "learning_rate": 4.980483994738532e-06, + "loss": 2.151, + "step": 6394 + }, + { + "epoch": 0.3430793991416309, + "grad_norm": 0.37109375, + "learning_rate": 4.98047315912525e-06, + "loss": 2.1465, + "step": 6395 + }, + { + "epoch": 0.3431330472103004, + "grad_norm": 0.33203125, + "learning_rate": 4.980462320516539e-06, + "loss": 2.314, + "step": 6396 + }, + { + "epoch": 0.34318669527896994, + "grad_norm": 0.384765625, + "learning_rate": 4.980451478912412e-06, + "loss": 2.2561, + "step": 6397 + }, + { + "epoch": 0.3432403433476395, + "grad_norm": 0.3828125, + "learning_rate": 4.980440634312883e-06, + "loss": 2.2415, + "step": 6398 + }, + { + "epoch": 0.343293991416309, + "grad_norm": 0.4765625, + "learning_rate": 4.9804297867179655e-06, + "loss": 2.475, + "step": 6399 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 0.412109375, + "learning_rate": 4.980418936127671e-06, + "loss": 2.3691, + "step": 6400 + }, + { + "epoch": 0.34340128755364807, + "grad_norm": 0.5234375, + "learning_rate": 4.980408082542012e-06, + "loss": 2.3373, + "step": 6401 + }, + { + "epoch": 0.3434549356223176, + "grad_norm": 0.8125, + "learning_rate": 4.980397225961005e-06, + "loss": 2.3909, + "step": 6402 + }, + { + "epoch": 0.34350858369098713, + "grad_norm": 0.53515625, + "learning_rate": 4.980386366384659e-06, + "loss": 2.248, + "step": 6403 + }, + { + "epoch": 0.34356223175965667, + "grad_norm": 0.76953125, + "learning_rate": 4.980375503812991e-06, + "loss": 2.1161, + "step": 6404 + }, + { + "epoch": 0.3436158798283262, + "grad_norm": 0.45703125, + "learning_rate": 4.980364638246011e-06, + "loss": 2.361, + "step": 6405 + }, + { + "epoch": 0.34366952789699573, + "grad_norm": 0.345703125, + "learning_rate": 4.980353769683734e-06, + "loss": 2.0882, + "step": 6406 + }, + { + "epoch": 0.34372317596566526, + "grad_norm": 0.380859375, + "learning_rate": 4.980342898126172e-06, + "loss": 2.2048, + "step": 6407 + }, + { + "epoch": 0.34377682403433474, + "grad_norm": 0.546875, + "learning_rate": 4.98033202357334e-06, + "loss": 2.228, + "step": 6408 + }, + { + "epoch": 0.34383047210300427, + "grad_norm": 0.439453125, + "learning_rate": 4.980321146025249e-06, + "loss": 2.355, + "step": 6409 + }, + { + "epoch": 0.3438841201716738, + "grad_norm": 0.443359375, + "learning_rate": 4.980310265481912e-06, + "loss": 2.1425, + "step": 6410 + }, + { + "epoch": 0.34393776824034333, + "grad_norm": 0.396484375, + "learning_rate": 4.980299381943343e-06, + "loss": 2.3956, + "step": 6411 + }, + { + "epoch": 0.34399141630901287, + "grad_norm": 0.421875, + "learning_rate": 4.980288495409556e-06, + "loss": 2.4367, + "step": 6412 + }, + { + "epoch": 0.3440450643776824, + "grad_norm": 0.3984375, + "learning_rate": 4.980277605880563e-06, + "loss": 2.402, + "step": 6413 + }, + { + "epoch": 0.34409871244635193, + "grad_norm": 0.47265625, + "learning_rate": 4.980266713356378e-06, + "loss": 2.5473, + "step": 6414 + }, + { + "epoch": 0.34415236051502146, + "grad_norm": 0.44140625, + "learning_rate": 4.980255817837013e-06, + "loss": 2.3422, + "step": 6415 + }, + { + "epoch": 0.344206008583691, + "grad_norm": 0.423828125, + "learning_rate": 4.9802449193224826e-06, + "loss": 2.2635, + "step": 6416 + }, + { + "epoch": 0.3442596566523605, + "grad_norm": 0.400390625, + "learning_rate": 4.980234017812799e-06, + "loss": 2.2819, + "step": 6417 + }, + { + "epoch": 0.34431330472103006, + "grad_norm": 0.369140625, + "learning_rate": 4.980223113307974e-06, + "loss": 2.2514, + "step": 6418 + }, + { + "epoch": 0.3443669527896996, + "grad_norm": 0.45703125, + "learning_rate": 4.980212205808024e-06, + "loss": 2.3496, + "step": 6419 + }, + { + "epoch": 0.3444206008583691, + "grad_norm": 0.375, + "learning_rate": 4.9802012953129595e-06, + "loss": 2.4053, + "step": 6420 + }, + { + "epoch": 0.34447424892703865, + "grad_norm": 0.416015625, + "learning_rate": 4.980190381822795e-06, + "loss": 2.5989, + "step": 6421 + }, + { + "epoch": 0.3445278969957081, + "grad_norm": 0.388671875, + "learning_rate": 4.980179465337542e-06, + "loss": 2.4138, + "step": 6422 + }, + { + "epoch": 0.34458154506437766, + "grad_norm": 0.380859375, + "learning_rate": 4.980168545857217e-06, + "loss": 1.9399, + "step": 6423 + }, + { + "epoch": 0.3446351931330472, + "grad_norm": 0.400390625, + "learning_rate": 4.980157623381831e-06, + "loss": 2.507, + "step": 6424 + }, + { + "epoch": 0.3446888412017167, + "grad_norm": 0.431640625, + "learning_rate": 4.980146697911396e-06, + "loss": 2.2255, + "step": 6425 + }, + { + "epoch": 0.34474248927038625, + "grad_norm": 0.4921875, + "learning_rate": 4.980135769445928e-06, + "loss": 2.2857, + "step": 6426 + }, + { + "epoch": 0.3447961373390558, + "grad_norm": 0.4375, + "learning_rate": 4.9801248379854375e-06, + "loss": 2.3759, + "step": 6427 + }, + { + "epoch": 0.3448497854077253, + "grad_norm": 0.412109375, + "learning_rate": 4.980113903529939e-06, + "loss": 2.2865, + "step": 6428 + }, + { + "epoch": 0.34490343347639485, + "grad_norm": 0.40625, + "learning_rate": 4.9801029660794465e-06, + "loss": 2.1587, + "step": 6429 + }, + { + "epoch": 0.3449570815450644, + "grad_norm": 0.435546875, + "learning_rate": 4.980092025633971e-06, + "loss": 2.5009, + "step": 6430 + }, + { + "epoch": 0.3450107296137339, + "grad_norm": 0.388671875, + "learning_rate": 4.980081082193529e-06, + "loss": 2.2528, + "step": 6431 + }, + { + "epoch": 0.34506437768240344, + "grad_norm": 0.42578125, + "learning_rate": 4.980070135758131e-06, + "loss": 2.3081, + "step": 6432 + }, + { + "epoch": 0.345118025751073, + "grad_norm": 0.40625, + "learning_rate": 4.980059186327791e-06, + "loss": 2.3071, + "step": 6433 + }, + { + "epoch": 0.3451716738197425, + "grad_norm": 0.45703125, + "learning_rate": 4.980048233902521e-06, + "loss": 2.2502, + "step": 6434 + }, + { + "epoch": 0.34522532188841204, + "grad_norm": 0.365234375, + "learning_rate": 4.980037278482337e-06, + "loss": 2.4057, + "step": 6435 + }, + { + "epoch": 0.34527896995708157, + "grad_norm": 0.330078125, + "learning_rate": 4.98002632006725e-06, + "loss": 2.2462, + "step": 6436 + }, + { + "epoch": 0.34533261802575105, + "grad_norm": 0.5, + "learning_rate": 4.9800153586572745e-06, + "loss": 2.1818, + "step": 6437 + }, + { + "epoch": 0.3453862660944206, + "grad_norm": 0.42578125, + "learning_rate": 4.980004394252422e-06, + "loss": 2.323, + "step": 6438 + }, + { + "epoch": 0.3454399141630901, + "grad_norm": 0.43359375, + "learning_rate": 4.979993426852708e-06, + "loss": 2.362, + "step": 6439 + }, + { + "epoch": 0.34549356223175964, + "grad_norm": 0.373046875, + "learning_rate": 4.9799824564581445e-06, + "loss": 2.4948, + "step": 6440 + }, + { + "epoch": 0.3455472103004292, + "grad_norm": 0.427734375, + "learning_rate": 4.979971483068745e-06, + "loss": 2.2637, + "step": 6441 + }, + { + "epoch": 0.3456008583690987, + "grad_norm": 0.61328125, + "learning_rate": 4.979960506684522e-06, + "loss": 2.3037, + "step": 6442 + }, + { + "epoch": 0.34565450643776824, + "grad_norm": 0.390625, + "learning_rate": 4.97994952730549e-06, + "loss": 2.3596, + "step": 6443 + }, + { + "epoch": 0.34570815450643777, + "grad_norm": 0.3984375, + "learning_rate": 4.979938544931661e-06, + "loss": 2.3323, + "step": 6444 + }, + { + "epoch": 0.3457618025751073, + "grad_norm": 0.43359375, + "learning_rate": 4.97992755956305e-06, + "loss": 2.2467, + "step": 6445 + }, + { + "epoch": 0.34581545064377683, + "grad_norm": 0.404296875, + "learning_rate": 4.979916571199668e-06, + "loss": 2.3108, + "step": 6446 + }, + { + "epoch": 0.34586909871244637, + "grad_norm": 0.6328125, + "learning_rate": 4.97990557984153e-06, + "loss": 2.4107, + "step": 6447 + }, + { + "epoch": 0.3459227467811159, + "grad_norm": 0.4296875, + "learning_rate": 4.979894585488649e-06, + "loss": 2.4041, + "step": 6448 + }, + { + "epoch": 0.34597639484978543, + "grad_norm": 0.400390625, + "learning_rate": 4.9798835881410375e-06, + "loss": 2.0966, + "step": 6449 + }, + { + "epoch": 0.34603004291845496, + "grad_norm": 0.3671875, + "learning_rate": 4.97987258779871e-06, + "loss": 2.4556, + "step": 6450 + }, + { + "epoch": 0.34608369098712444, + "grad_norm": 0.439453125, + "learning_rate": 4.979861584461679e-06, + "loss": 2.0538, + "step": 6451 + }, + { + "epoch": 0.34613733905579397, + "grad_norm": 0.419921875, + "learning_rate": 4.9798505781299565e-06, + "loss": 2.1759, + "step": 6452 + }, + { + "epoch": 0.3461909871244635, + "grad_norm": 0.337890625, + "learning_rate": 4.979839568803559e-06, + "loss": 2.1319, + "step": 6453 + }, + { + "epoch": 0.34624463519313303, + "grad_norm": 0.427734375, + "learning_rate": 4.979828556482496e-06, + "loss": 2.2646, + "step": 6454 + }, + { + "epoch": 0.34629828326180256, + "grad_norm": 0.40234375, + "learning_rate": 4.9798175411667835e-06, + "loss": 2.2209, + "step": 6455 + }, + { + "epoch": 0.3463519313304721, + "grad_norm": 2.140625, + "learning_rate": 4.979806522856435e-06, + "loss": 2.2363, + "step": 6456 + }, + { + "epoch": 0.3464055793991416, + "grad_norm": 0.4765625, + "learning_rate": 4.979795501551462e-06, + "loss": 2.4007, + "step": 6457 + }, + { + "epoch": 0.34645922746781116, + "grad_norm": 0.4140625, + "learning_rate": 4.9797844772518785e-06, + "loss": 2.4319, + "step": 6458 + }, + { + "epoch": 0.3465128755364807, + "grad_norm": 0.369140625, + "learning_rate": 4.9797734499576985e-06, + "loss": 2.084, + "step": 6459 + }, + { + "epoch": 0.3465665236051502, + "grad_norm": 0.396484375, + "learning_rate": 4.979762419668934e-06, + "loss": 2.2484, + "step": 6460 + }, + { + "epoch": 0.34662017167381975, + "grad_norm": 0.412109375, + "learning_rate": 4.9797513863856005e-06, + "loss": 2.1538, + "step": 6461 + }, + { + "epoch": 0.3466738197424893, + "grad_norm": 0.408203125, + "learning_rate": 4.979740350107709e-06, + "loss": 2.245, + "step": 6462 + }, + { + "epoch": 0.3467274678111588, + "grad_norm": 0.478515625, + "learning_rate": 4.979729310835274e-06, + "loss": 2.2194, + "step": 6463 + }, + { + "epoch": 0.34678111587982835, + "grad_norm": 0.376953125, + "learning_rate": 4.979718268568308e-06, + "loss": 2.4822, + "step": 6464 + }, + { + "epoch": 0.3468347639484979, + "grad_norm": 0.412109375, + "learning_rate": 4.979707223306825e-06, + "loss": 2.3541, + "step": 6465 + }, + { + "epoch": 0.34688841201716736, + "grad_norm": 0.3984375, + "learning_rate": 4.979696175050839e-06, + "loss": 2.2467, + "step": 6466 + }, + { + "epoch": 0.3469420600858369, + "grad_norm": 0.4609375, + "learning_rate": 4.9796851238003626e-06, + "loss": 2.268, + "step": 6467 + }, + { + "epoch": 0.3469957081545064, + "grad_norm": 0.40234375, + "learning_rate": 4.9796740695554095e-06, + "loss": 2.0618, + "step": 6468 + }, + { + "epoch": 0.34704935622317595, + "grad_norm": 0.65625, + "learning_rate": 4.979663012315991e-06, + "loss": 2.2221, + "step": 6469 + }, + { + "epoch": 0.3471030042918455, + "grad_norm": 0.423828125, + "learning_rate": 4.979651952082123e-06, + "loss": 2.3537, + "step": 6470 + }, + { + "epoch": 0.347156652360515, + "grad_norm": 0.40234375, + "learning_rate": 4.979640888853818e-06, + "loss": 2.5463, + "step": 6471 + }, + { + "epoch": 0.34721030042918455, + "grad_norm": 0.37109375, + "learning_rate": 4.97962982263109e-06, + "loss": 2.2813, + "step": 6472 + }, + { + "epoch": 0.3472639484978541, + "grad_norm": 0.353515625, + "learning_rate": 4.979618753413952e-06, + "loss": 2.2056, + "step": 6473 + }, + { + "epoch": 0.3473175965665236, + "grad_norm": 0.451171875, + "learning_rate": 4.979607681202415e-06, + "loss": 2.2139, + "step": 6474 + }, + { + "epoch": 0.34737124463519314, + "grad_norm": 0.41015625, + "learning_rate": 4.979596605996497e-06, + "loss": 2.4119, + "step": 6475 + }, + { + "epoch": 0.3474248927038627, + "grad_norm": 0.369140625, + "learning_rate": 4.979585527796207e-06, + "loss": 2.3119, + "step": 6476 + }, + { + "epoch": 0.3474785407725322, + "grad_norm": 0.75, + "learning_rate": 4.9795744466015615e-06, + "loss": 2.3023, + "step": 6477 + }, + { + "epoch": 0.34753218884120174, + "grad_norm": 0.37109375, + "learning_rate": 4.9795633624125716e-06, + "loss": 2.2497, + "step": 6478 + }, + { + "epoch": 0.34758583690987127, + "grad_norm": 2.234375, + "learning_rate": 4.979552275229252e-06, + "loss": 2.3562, + "step": 6479 + }, + { + "epoch": 0.34763948497854075, + "grad_norm": 0.40234375, + "learning_rate": 4.979541185051616e-06, + "loss": 2.2883, + "step": 6480 + }, + { + "epoch": 0.3476931330472103, + "grad_norm": 0.3984375, + "learning_rate": 4.979530091879677e-06, + "loss": 2.3261, + "step": 6481 + }, + { + "epoch": 0.3477467811158798, + "grad_norm": 0.38671875, + "learning_rate": 4.979518995713448e-06, + "loss": 2.1198, + "step": 6482 + }, + { + "epoch": 0.34780042918454934, + "grad_norm": 0.40234375, + "learning_rate": 4.979507896552942e-06, + "loss": 2.1769, + "step": 6483 + }, + { + "epoch": 0.3478540772532189, + "grad_norm": 0.37890625, + "learning_rate": 4.979496794398174e-06, + "loss": 2.2345, + "step": 6484 + }, + { + "epoch": 0.3479077253218884, + "grad_norm": 0.33984375, + "learning_rate": 4.979485689249155e-06, + "loss": 2.2612, + "step": 6485 + }, + { + "epoch": 0.34796137339055794, + "grad_norm": 0.326171875, + "learning_rate": 4.979474581105901e-06, + "loss": 1.8923, + "step": 6486 + }, + { + "epoch": 0.34801502145922747, + "grad_norm": 0.5234375, + "learning_rate": 4.979463469968424e-06, + "loss": 2.3103, + "step": 6487 + }, + { + "epoch": 0.348068669527897, + "grad_norm": 0.625, + "learning_rate": 4.979452355836737e-06, + "loss": 2.1151, + "step": 6488 + }, + { + "epoch": 0.34812231759656653, + "grad_norm": 0.392578125, + "learning_rate": 4.979441238710855e-06, + "loss": 2.2843, + "step": 6489 + }, + { + "epoch": 0.34817596566523606, + "grad_norm": 0.390625, + "learning_rate": 4.97943011859079e-06, + "loss": 2.2024, + "step": 6490 + }, + { + "epoch": 0.3482296137339056, + "grad_norm": 0.373046875, + "learning_rate": 4.979418995476555e-06, + "loss": 2.406, + "step": 6491 + }, + { + "epoch": 0.34828326180257513, + "grad_norm": 0.51171875, + "learning_rate": 4.979407869368165e-06, + "loss": 2.1829, + "step": 6492 + }, + { + "epoch": 0.34833690987124466, + "grad_norm": 0.384765625, + "learning_rate": 4.979396740265633e-06, + "loss": 2.1728, + "step": 6493 + }, + { + "epoch": 0.34839055793991414, + "grad_norm": 0.474609375, + "learning_rate": 4.9793856081689725e-06, + "loss": 2.2377, + "step": 6494 + }, + { + "epoch": 0.34844420600858367, + "grad_norm": 1.0859375, + "learning_rate": 4.979374473078196e-06, + "loss": 2.4536, + "step": 6495 + }, + { + "epoch": 0.3484978540772532, + "grad_norm": 0.400390625, + "learning_rate": 4.9793633349933176e-06, + "loss": 2.3029, + "step": 6496 + }, + { + "epoch": 0.34855150214592273, + "grad_norm": 0.47265625, + "learning_rate": 4.9793521939143516e-06, + "loss": 2.2043, + "step": 6497 + }, + { + "epoch": 0.34860515021459226, + "grad_norm": 0.373046875, + "learning_rate": 4.97934104984131e-06, + "loss": 2.5196, + "step": 6498 + }, + { + "epoch": 0.3486587982832618, + "grad_norm": 0.8046875, + "learning_rate": 4.979329902774206e-06, + "loss": 2.411, + "step": 6499 + }, + { + "epoch": 0.3487124463519313, + "grad_norm": 0.482421875, + "learning_rate": 4.9793187527130545e-06, + "loss": 2.1659, + "step": 6500 + }, + { + "epoch": 0.34876609442060086, + "grad_norm": 0.34375, + "learning_rate": 4.979307599657869e-06, + "loss": 2.0299, + "step": 6501 + }, + { + "epoch": 0.3488197424892704, + "grad_norm": 1.3203125, + "learning_rate": 4.9792964436086625e-06, + "loss": 2.3858, + "step": 6502 + }, + { + "epoch": 0.3488733905579399, + "grad_norm": 0.4140625, + "learning_rate": 4.979285284565447e-06, + "loss": 1.9854, + "step": 6503 + }, + { + "epoch": 0.34892703862660945, + "grad_norm": 0.47265625, + "learning_rate": 4.979274122528238e-06, + "loss": 2.215, + "step": 6504 + }, + { + "epoch": 0.348980686695279, + "grad_norm": 0.470703125, + "learning_rate": 4.979262957497049e-06, + "loss": 2.2416, + "step": 6505 + }, + { + "epoch": 0.3490343347639485, + "grad_norm": 0.412109375, + "learning_rate": 4.9792517894718925e-06, + "loss": 2.4079, + "step": 6506 + }, + { + "epoch": 0.34908798283261805, + "grad_norm": 0.353515625, + "learning_rate": 4.979240618452782e-06, + "loss": 2.1287, + "step": 6507 + }, + { + "epoch": 0.3491416309012876, + "grad_norm": 0.3828125, + "learning_rate": 4.979229444439731e-06, + "loss": 2.4231, + "step": 6508 + }, + { + "epoch": 0.34919527896995706, + "grad_norm": 0.45703125, + "learning_rate": 4.9792182674327525e-06, + "loss": 2.1285, + "step": 6509 + }, + { + "epoch": 0.3492489270386266, + "grad_norm": 0.4609375, + "learning_rate": 4.979207087431862e-06, + "loss": 2.3536, + "step": 6510 + }, + { + "epoch": 0.3493025751072961, + "grad_norm": 0.56640625, + "learning_rate": 4.979195904437072e-06, + "loss": 2.0962, + "step": 6511 + }, + { + "epoch": 0.34935622317596565, + "grad_norm": 0.5234375, + "learning_rate": 4.979184718448394e-06, + "loss": 2.4187, + "step": 6512 + }, + { + "epoch": 0.3494098712446352, + "grad_norm": 0.369140625, + "learning_rate": 4.979173529465845e-06, + "loss": 2.3468, + "step": 6513 + }, + { + "epoch": 0.3494635193133047, + "grad_norm": 0.384765625, + "learning_rate": 4.979162337489436e-06, + "loss": 2.3336, + "step": 6514 + }, + { + "epoch": 0.34951716738197425, + "grad_norm": 0.41015625, + "learning_rate": 4.979151142519181e-06, + "loss": 2.2886, + "step": 6515 + }, + { + "epoch": 0.3495708154506438, + "grad_norm": 0.376953125, + "learning_rate": 4.979139944555095e-06, + "loss": 2.2207, + "step": 6516 + }, + { + "epoch": 0.3496244635193133, + "grad_norm": 0.357421875, + "learning_rate": 4.979128743597189e-06, + "loss": 2.2407, + "step": 6517 + }, + { + "epoch": 0.34967811158798284, + "grad_norm": 0.40234375, + "learning_rate": 4.979117539645479e-06, + "loss": 2.3377, + "step": 6518 + }, + { + "epoch": 0.3497317596566524, + "grad_norm": 0.49609375, + "learning_rate": 4.979106332699975e-06, + "loss": 2.3621, + "step": 6519 + }, + { + "epoch": 0.3497854077253219, + "grad_norm": 0.37890625, + "learning_rate": 4.979095122760695e-06, + "loss": 2.0159, + "step": 6520 + }, + { + "epoch": 0.34983905579399144, + "grad_norm": 0.421875, + "learning_rate": 4.97908390982765e-06, + "loss": 2.1022, + "step": 6521 + }, + { + "epoch": 0.34989270386266097, + "grad_norm": 0.439453125, + "learning_rate": 4.979072693900854e-06, + "loss": 2.2935, + "step": 6522 + }, + { + "epoch": 0.34994635193133045, + "grad_norm": 0.353515625, + "learning_rate": 4.9790614749803205e-06, + "loss": 2.2659, + "step": 6523 + }, + { + "epoch": 0.35, + "grad_norm": 0.376953125, + "learning_rate": 4.979050253066064e-06, + "loss": 2.2481, + "step": 6524 + }, + { + "epoch": 0.3500536480686695, + "grad_norm": 0.4609375, + "learning_rate": 4.979039028158095e-06, + "loss": 2.4613, + "step": 6525 + }, + { + "epoch": 0.35010729613733904, + "grad_norm": 2.421875, + "learning_rate": 4.979027800256431e-06, + "loss": 2.1693, + "step": 6526 + }, + { + "epoch": 0.3501609442060086, + "grad_norm": 0.359375, + "learning_rate": 4.979016569361083e-06, + "loss": 2.267, + "step": 6527 + }, + { + "epoch": 0.3502145922746781, + "grad_norm": 0.427734375, + "learning_rate": 4.9790053354720655e-06, + "loss": 2.3672, + "step": 6528 + }, + { + "epoch": 0.35026824034334764, + "grad_norm": 0.458984375, + "learning_rate": 4.9789940985893914e-06, + "loss": 2.1698, + "step": 6529 + }, + { + "epoch": 0.35032188841201717, + "grad_norm": 0.392578125, + "learning_rate": 4.978982858713075e-06, + "loss": 2.1544, + "step": 6530 + }, + { + "epoch": 0.3503755364806867, + "grad_norm": 0.421875, + "learning_rate": 4.97897161584313e-06, + "loss": 2.174, + "step": 6531 + }, + { + "epoch": 0.35042918454935623, + "grad_norm": 0.392578125, + "learning_rate": 4.978960369979569e-06, + "loss": 2.0086, + "step": 6532 + }, + { + "epoch": 0.35048283261802576, + "grad_norm": 0.396484375, + "learning_rate": 4.978949121122407e-06, + "loss": 2.5183, + "step": 6533 + }, + { + "epoch": 0.3505364806866953, + "grad_norm": 0.5390625, + "learning_rate": 4.978937869271656e-06, + "loss": 2.2208, + "step": 6534 + }, + { + "epoch": 0.3505901287553648, + "grad_norm": 0.478515625, + "learning_rate": 4.978926614427331e-06, + "loss": 2.3228, + "step": 6535 + }, + { + "epoch": 0.35064377682403436, + "grad_norm": 0.396484375, + "learning_rate": 4.9789153565894436e-06, + "loss": 2.4722, + "step": 6536 + }, + { + "epoch": 0.35069742489270384, + "grad_norm": 0.466796875, + "learning_rate": 4.97890409575801e-06, + "loss": 2.2423, + "step": 6537 + }, + { + "epoch": 0.35075107296137337, + "grad_norm": 0.349609375, + "learning_rate": 4.978892831933042e-06, + "loss": 2.0805, + "step": 6538 + }, + { + "epoch": 0.3508047210300429, + "grad_norm": 0.447265625, + "learning_rate": 4.978881565114554e-06, + "loss": 2.1613, + "step": 6539 + }, + { + "epoch": 0.35085836909871243, + "grad_norm": 0.412109375, + "learning_rate": 4.9788702953025596e-06, + "loss": 2.2941, + "step": 6540 + }, + { + "epoch": 0.35091201716738196, + "grad_norm": 0.419921875, + "learning_rate": 4.978859022497071e-06, + "loss": 2.2918, + "step": 6541 + }, + { + "epoch": 0.3509656652360515, + "grad_norm": 0.421875, + "learning_rate": 4.978847746698104e-06, + "loss": 2.4136, + "step": 6542 + }, + { + "epoch": 0.351019313304721, + "grad_norm": 0.416015625, + "learning_rate": 4.978836467905671e-06, + "loss": 2.2771, + "step": 6543 + }, + { + "epoch": 0.35107296137339056, + "grad_norm": 0.396484375, + "learning_rate": 4.978825186119786e-06, + "loss": 2.2964, + "step": 6544 + }, + { + "epoch": 0.3511266094420601, + "grad_norm": 0.46484375, + "learning_rate": 4.9788139013404615e-06, + "loss": 2.2716, + "step": 6545 + }, + { + "epoch": 0.3511802575107296, + "grad_norm": 0.453125, + "learning_rate": 4.9788026135677125e-06, + "loss": 2.1341, + "step": 6546 + }, + { + "epoch": 0.35123390557939915, + "grad_norm": 2.6875, + "learning_rate": 4.9787913228015525e-06, + "loss": 2.1417, + "step": 6547 + }, + { + "epoch": 0.3512875536480687, + "grad_norm": 0.400390625, + "learning_rate": 4.978780029041994e-06, + "loss": 2.2856, + "step": 6548 + }, + { + "epoch": 0.3513412017167382, + "grad_norm": 0.44140625, + "learning_rate": 4.978768732289052e-06, + "loss": 2.1848, + "step": 6549 + }, + { + "epoch": 0.35139484978540775, + "grad_norm": 0.345703125, + "learning_rate": 4.97875743254274e-06, + "loss": 2.4389, + "step": 6550 + }, + { + "epoch": 0.3514484978540773, + "grad_norm": 0.462890625, + "learning_rate": 4.978746129803071e-06, + "loss": 2.1225, + "step": 6551 + }, + { + "epoch": 0.35150214592274676, + "grad_norm": 0.451171875, + "learning_rate": 4.9787348240700585e-06, + "loss": 2.1929, + "step": 6552 + }, + { + "epoch": 0.3515557939914163, + "grad_norm": 0.427734375, + "learning_rate": 4.978723515343717e-06, + "loss": 2.3344, + "step": 6553 + }, + { + "epoch": 0.3516094420600858, + "grad_norm": 0.396484375, + "learning_rate": 4.97871220362406e-06, + "loss": 2.4517, + "step": 6554 + }, + { + "epoch": 0.35166309012875535, + "grad_norm": 0.57421875, + "learning_rate": 4.9787008889111e-06, + "loss": 2.3793, + "step": 6555 + }, + { + "epoch": 0.3517167381974249, + "grad_norm": 0.43359375, + "learning_rate": 4.978689571204851e-06, + "loss": 2.4262, + "step": 6556 + }, + { + "epoch": 0.3517703862660944, + "grad_norm": 0.40234375, + "learning_rate": 4.978678250505329e-06, + "loss": 2.3066, + "step": 6557 + }, + { + "epoch": 0.35182403433476395, + "grad_norm": 0.388671875, + "learning_rate": 4.978666926812544e-06, + "loss": 2.3225, + "step": 6558 + }, + { + "epoch": 0.3518776824034335, + "grad_norm": 0.7265625, + "learning_rate": 4.978655600126513e-06, + "loss": 2.356, + "step": 6559 + }, + { + "epoch": 0.351931330472103, + "grad_norm": 0.447265625, + "learning_rate": 4.978644270447248e-06, + "loss": 2.1365, + "step": 6560 + }, + { + "epoch": 0.35198497854077254, + "grad_norm": 0.6171875, + "learning_rate": 4.978632937774762e-06, + "loss": 2.3576, + "step": 6561 + }, + { + "epoch": 0.3520386266094421, + "grad_norm": 0.345703125, + "learning_rate": 4.97862160210907e-06, + "loss": 2.057, + "step": 6562 + }, + { + "epoch": 0.3520922746781116, + "grad_norm": 0.42578125, + "learning_rate": 4.978610263450185e-06, + "loss": 2.2304, + "step": 6563 + }, + { + "epoch": 0.35214592274678114, + "grad_norm": 0.423828125, + "learning_rate": 4.978598921798123e-06, + "loss": 2.2585, + "step": 6564 + }, + { + "epoch": 0.35219957081545067, + "grad_norm": 0.4140625, + "learning_rate": 4.978587577152893e-06, + "loss": 2.3095, + "step": 6565 + }, + { + "epoch": 0.35225321888412015, + "grad_norm": 0.3828125, + "learning_rate": 4.9785762295145125e-06, + "loss": 2.3243, + "step": 6566 + }, + { + "epoch": 0.3523068669527897, + "grad_norm": 0.458984375, + "learning_rate": 4.978564878882993e-06, + "loss": 2.3135, + "step": 6567 + }, + { + "epoch": 0.3523605150214592, + "grad_norm": 0.388671875, + "learning_rate": 4.978553525258351e-06, + "loss": 2.5964, + "step": 6568 + }, + { + "epoch": 0.35241416309012874, + "grad_norm": 0.4296875, + "learning_rate": 4.978542168640598e-06, + "loss": 2.4402, + "step": 6569 + }, + { + "epoch": 0.35246781115879827, + "grad_norm": 0.4140625, + "learning_rate": 4.978530809029747e-06, + "loss": 2.3442, + "step": 6570 + }, + { + "epoch": 0.3525214592274678, + "grad_norm": 0.380859375, + "learning_rate": 4.978519446425814e-06, + "loss": 2.4327, + "step": 6571 + }, + { + "epoch": 0.35257510729613734, + "grad_norm": 0.376953125, + "learning_rate": 4.978508080828811e-06, + "loss": 2.1342, + "step": 6572 + }, + { + "epoch": 0.35262875536480687, + "grad_norm": 0.3984375, + "learning_rate": 4.978496712238753e-06, + "loss": 2.4299, + "step": 6573 + }, + { + "epoch": 0.3526824034334764, + "grad_norm": 0.361328125, + "learning_rate": 4.978485340655653e-06, + "loss": 2.3535, + "step": 6574 + }, + { + "epoch": 0.35273605150214593, + "grad_norm": 0.37109375, + "learning_rate": 4.978473966079524e-06, + "loss": 2.0099, + "step": 6575 + }, + { + "epoch": 0.35278969957081546, + "grad_norm": 0.412109375, + "learning_rate": 4.9784625885103815e-06, + "loss": 2.3325, + "step": 6576 + }, + { + "epoch": 0.352843347639485, + "grad_norm": 0.392578125, + "learning_rate": 4.978451207948238e-06, + "loss": 2.2227, + "step": 6577 + }, + { + "epoch": 0.3528969957081545, + "grad_norm": 0.466796875, + "learning_rate": 4.978439824393107e-06, + "loss": 2.6028, + "step": 6578 + }, + { + "epoch": 0.35295064377682406, + "grad_norm": 0.40625, + "learning_rate": 4.978428437845003e-06, + "loss": 2.2273, + "step": 6579 + }, + { + "epoch": 0.3530042918454936, + "grad_norm": 0.369140625, + "learning_rate": 4.9784170483039405e-06, + "loss": 2.2786, + "step": 6580 + }, + { + "epoch": 0.35305793991416307, + "grad_norm": 0.3984375, + "learning_rate": 4.978405655769931e-06, + "loss": 2.1945, + "step": 6581 + }, + { + "epoch": 0.3531115879828326, + "grad_norm": 0.3515625, + "learning_rate": 4.9783942602429904e-06, + "loss": 2.1408, + "step": 6582 + }, + { + "epoch": 0.35316523605150213, + "grad_norm": 0.8828125, + "learning_rate": 4.9783828617231315e-06, + "loss": 2.3597, + "step": 6583 + }, + { + "epoch": 0.35321888412017166, + "grad_norm": 0.48046875, + "learning_rate": 4.978371460210368e-06, + "loss": 2.238, + "step": 6584 + }, + { + "epoch": 0.3532725321888412, + "grad_norm": 6.59375, + "learning_rate": 4.978360055704714e-06, + "loss": 2.0441, + "step": 6585 + }, + { + "epoch": 0.3533261802575107, + "grad_norm": 0.427734375, + "learning_rate": 4.978348648206183e-06, + "loss": 2.3729, + "step": 6586 + }, + { + "epoch": 0.35337982832618026, + "grad_norm": 0.38671875, + "learning_rate": 4.978337237714789e-06, + "loss": 2.3191, + "step": 6587 + }, + { + "epoch": 0.3534334763948498, + "grad_norm": 0.38671875, + "learning_rate": 4.9783258242305445e-06, + "loss": 2.4802, + "step": 6588 + }, + { + "epoch": 0.3534871244635193, + "grad_norm": 0.578125, + "learning_rate": 4.978314407753466e-06, + "loss": 2.3223, + "step": 6589 + }, + { + "epoch": 0.35354077253218885, + "grad_norm": 0.427734375, + "learning_rate": 4.978302988283565e-06, + "loss": 2.3044, + "step": 6590 + }, + { + "epoch": 0.3535944206008584, + "grad_norm": 0.453125, + "learning_rate": 4.978291565820856e-06, + "loss": 2.145, + "step": 6591 + }, + { + "epoch": 0.3536480686695279, + "grad_norm": 0.453125, + "learning_rate": 4.978280140365353e-06, + "loss": 2.5065, + "step": 6592 + }, + { + "epoch": 0.35370171673819745, + "grad_norm": 0.390625, + "learning_rate": 4.97826871191707e-06, + "loss": 2.3489, + "step": 6593 + }, + { + "epoch": 0.353755364806867, + "grad_norm": 1.2578125, + "learning_rate": 4.97825728047602e-06, + "loss": 2.0133, + "step": 6594 + }, + { + "epoch": 0.35380901287553645, + "grad_norm": 0.3828125, + "learning_rate": 4.978245846042217e-06, + "loss": 2.444, + "step": 6595 + }, + { + "epoch": 0.353862660944206, + "grad_norm": 0.421875, + "learning_rate": 4.9782344086156756e-06, + "loss": 2.2693, + "step": 6596 + }, + { + "epoch": 0.3539163090128755, + "grad_norm": 0.373046875, + "learning_rate": 4.978222968196409e-06, + "loss": 2.4013, + "step": 6597 + }, + { + "epoch": 0.35396995708154505, + "grad_norm": 0.6953125, + "learning_rate": 4.9782115247844295e-06, + "loss": 2.2666, + "step": 6598 + }, + { + "epoch": 0.3540236051502146, + "grad_norm": 0.416015625, + "learning_rate": 4.978200078379754e-06, + "loss": 2.391, + "step": 6599 + }, + { + "epoch": 0.3540772532188841, + "grad_norm": 0.431640625, + "learning_rate": 4.978188628982395e-06, + "loss": 2.3527, + "step": 6600 + }, + { + "epoch": 0.35413090128755365, + "grad_norm": 0.361328125, + "learning_rate": 4.978177176592366e-06, + "loss": 2.2935, + "step": 6601 + }, + { + "epoch": 0.3541845493562232, + "grad_norm": 0.44921875, + "learning_rate": 4.97816572120968e-06, + "loss": 2.3421, + "step": 6602 + }, + { + "epoch": 0.3542381974248927, + "grad_norm": 0.412109375, + "learning_rate": 4.978154262834352e-06, + "loss": 2.3438, + "step": 6603 + }, + { + "epoch": 0.35429184549356224, + "grad_norm": 0.482421875, + "learning_rate": 4.978142801466397e-06, + "loss": 2.3665, + "step": 6604 + }, + { + "epoch": 0.3543454935622318, + "grad_norm": 0.421875, + "learning_rate": 4.978131337105826e-06, + "loss": 2.2892, + "step": 6605 + }, + { + "epoch": 0.3543991416309013, + "grad_norm": 0.376953125, + "learning_rate": 4.978119869752655e-06, + "loss": 2.2426, + "step": 6606 + }, + { + "epoch": 0.35445278969957084, + "grad_norm": 5.03125, + "learning_rate": 4.978108399406897e-06, + "loss": 2.3694, + "step": 6607 + }, + { + "epoch": 0.35450643776824037, + "grad_norm": 0.341796875, + "learning_rate": 4.9780969260685655e-06, + "loss": 1.9909, + "step": 6608 + }, + { + "epoch": 0.35456008583690984, + "grad_norm": 0.361328125, + "learning_rate": 4.978085449737676e-06, + "loss": 2.203, + "step": 6609 + }, + { + "epoch": 0.3546137339055794, + "grad_norm": 0.419921875, + "learning_rate": 4.97807397041424e-06, + "loss": 2.4118, + "step": 6610 + }, + { + "epoch": 0.3546673819742489, + "grad_norm": 3.390625, + "learning_rate": 4.978062488098273e-06, + "loss": 2.1278, + "step": 6611 + }, + { + "epoch": 0.35472103004291844, + "grad_norm": 0.455078125, + "learning_rate": 4.9780510027897885e-06, + "loss": 2.3142, + "step": 6612 + }, + { + "epoch": 0.35477467811158797, + "grad_norm": 0.37109375, + "learning_rate": 4.978039514488801e-06, + "loss": 2.2315, + "step": 6613 + }, + { + "epoch": 0.3548283261802575, + "grad_norm": 0.400390625, + "learning_rate": 4.978028023195323e-06, + "loss": 2.3329, + "step": 6614 + }, + { + "epoch": 0.35488197424892703, + "grad_norm": 0.4140625, + "learning_rate": 4.97801652890937e-06, + "loss": 2.2224, + "step": 6615 + }, + { + "epoch": 0.35493562231759657, + "grad_norm": 0.392578125, + "learning_rate": 4.978005031630954e-06, + "loss": 2.3207, + "step": 6616 + }, + { + "epoch": 0.3549892703862661, + "grad_norm": 0.3671875, + "learning_rate": 4.977993531360089e-06, + "loss": 2.3196, + "step": 6617 + }, + { + "epoch": 0.35504291845493563, + "grad_norm": 0.578125, + "learning_rate": 4.977982028096791e-06, + "loss": 2.2885, + "step": 6618 + }, + { + "epoch": 0.35509656652360516, + "grad_norm": 0.390625, + "learning_rate": 4.977970521841072e-06, + "loss": 2.0451, + "step": 6619 + }, + { + "epoch": 0.3551502145922747, + "grad_norm": 0.64453125, + "learning_rate": 4.977959012592947e-06, + "loss": 2.2431, + "step": 6620 + }, + { + "epoch": 0.3552038626609442, + "grad_norm": 0.462890625, + "learning_rate": 4.977947500352429e-06, + "loss": 2.352, + "step": 6621 + }, + { + "epoch": 0.35525751072961376, + "grad_norm": 0.369140625, + "learning_rate": 4.977935985119533e-06, + "loss": 2.2622, + "step": 6622 + }, + { + "epoch": 0.3553111587982833, + "grad_norm": 0.380859375, + "learning_rate": 4.977924466894272e-06, + "loss": 2.0746, + "step": 6623 + }, + { + "epoch": 0.35536480686695276, + "grad_norm": 0.388671875, + "learning_rate": 4.977912945676659e-06, + "loss": 2.2433, + "step": 6624 + }, + { + "epoch": 0.3554184549356223, + "grad_norm": 0.54296875, + "learning_rate": 4.977901421466711e-06, + "loss": 2.1565, + "step": 6625 + }, + { + "epoch": 0.35547210300429183, + "grad_norm": 0.392578125, + "learning_rate": 4.977889894264438e-06, + "loss": 2.2263, + "step": 6626 + }, + { + "epoch": 0.35552575107296136, + "grad_norm": 0.365234375, + "learning_rate": 4.9778783640698574e-06, + "loss": 2.049, + "step": 6627 + }, + { + "epoch": 0.3555793991416309, + "grad_norm": 0.421875, + "learning_rate": 4.977866830882981e-06, + "loss": 2.2334, + "step": 6628 + }, + { + "epoch": 0.3556330472103004, + "grad_norm": 0.380859375, + "learning_rate": 4.9778552947038234e-06, + "loss": 2.0759, + "step": 6629 + }, + { + "epoch": 0.35568669527896996, + "grad_norm": 0.421875, + "learning_rate": 4.977843755532398e-06, + "loss": 2.2596, + "step": 6630 + }, + { + "epoch": 0.3557403433476395, + "grad_norm": 0.361328125, + "learning_rate": 4.977832213368719e-06, + "loss": 2.1355, + "step": 6631 + }, + { + "epoch": 0.355793991416309, + "grad_norm": 0.384765625, + "learning_rate": 4.977820668212802e-06, + "loss": 2.4183, + "step": 6632 + }, + { + "epoch": 0.35584763948497855, + "grad_norm": 0.388671875, + "learning_rate": 4.977809120064658e-06, + "loss": 2.3046, + "step": 6633 + }, + { + "epoch": 0.3559012875536481, + "grad_norm": 0.419921875, + "learning_rate": 4.977797568924303e-06, + "loss": 2.2475, + "step": 6634 + }, + { + "epoch": 0.3559549356223176, + "grad_norm": 0.443359375, + "learning_rate": 4.977786014791751e-06, + "loss": 2.5129, + "step": 6635 + }, + { + "epoch": 0.35600858369098715, + "grad_norm": 0.3984375, + "learning_rate": 4.977774457667015e-06, + "loss": 2.3443, + "step": 6636 + }, + { + "epoch": 0.3560622317596567, + "grad_norm": 0.375, + "learning_rate": 4.9777628975501085e-06, + "loss": 2.0129, + "step": 6637 + }, + { + "epoch": 0.35611587982832615, + "grad_norm": 0.376953125, + "learning_rate": 4.9777513344410465e-06, + "loss": 2.3795, + "step": 6638 + }, + { + "epoch": 0.3561695278969957, + "grad_norm": 0.375, + "learning_rate": 4.977739768339843e-06, + "loss": 2.128, + "step": 6639 + }, + { + "epoch": 0.3562231759656652, + "grad_norm": 0.396484375, + "learning_rate": 4.977728199246512e-06, + "loss": 2.3061, + "step": 6640 + }, + { + "epoch": 0.35627682403433475, + "grad_norm": 0.50390625, + "learning_rate": 4.977716627161067e-06, + "loss": 1.424, + "step": 6641 + }, + { + "epoch": 0.3563304721030043, + "grad_norm": 0.53125, + "learning_rate": 4.977705052083522e-06, + "loss": 2.242, + "step": 6642 + }, + { + "epoch": 0.3563841201716738, + "grad_norm": 0.373046875, + "learning_rate": 4.97769347401389e-06, + "loss": 2.3037, + "step": 6643 + }, + { + "epoch": 0.35643776824034334, + "grad_norm": 0.427734375, + "learning_rate": 4.977681892952188e-06, + "loss": 2.2954, + "step": 6644 + }, + { + "epoch": 0.3564914163090129, + "grad_norm": 0.8046875, + "learning_rate": 4.977670308898426e-06, + "loss": 1.8741, + "step": 6645 + }, + { + "epoch": 0.3565450643776824, + "grad_norm": 0.431640625, + "learning_rate": 4.977658721852622e-06, + "loss": 2.3413, + "step": 6646 + }, + { + "epoch": 0.35659871244635194, + "grad_norm": 0.333984375, + "learning_rate": 4.977647131814787e-06, + "loss": 2.0883, + "step": 6647 + }, + { + "epoch": 0.35665236051502147, + "grad_norm": 0.384765625, + "learning_rate": 4.9776355387849365e-06, + "loss": 2.0407, + "step": 6648 + }, + { + "epoch": 0.356706008583691, + "grad_norm": 0.396484375, + "learning_rate": 4.977623942763083e-06, + "loss": 2.2976, + "step": 6649 + }, + { + "epoch": 0.35675965665236054, + "grad_norm": 0.60546875, + "learning_rate": 4.977612343749242e-06, + "loss": 2.298, + "step": 6650 + }, + { + "epoch": 0.35681330472103007, + "grad_norm": 0.404296875, + "learning_rate": 4.977600741743428e-06, + "loss": 2.2486, + "step": 6651 + }, + { + "epoch": 0.3568669527896996, + "grad_norm": 0.439453125, + "learning_rate": 4.977589136745653e-06, + "loss": 2.3462, + "step": 6652 + }, + { + "epoch": 0.3569206008583691, + "grad_norm": 0.431640625, + "learning_rate": 4.977577528755932e-06, + "loss": 2.3076, + "step": 6653 + }, + { + "epoch": 0.3569742489270386, + "grad_norm": 0.412109375, + "learning_rate": 4.97756591777428e-06, + "loss": 2.3697, + "step": 6654 + }, + { + "epoch": 0.35702789699570814, + "grad_norm": 0.447265625, + "learning_rate": 4.97755430380071e-06, + "loss": 1.3876, + "step": 6655 + }, + { + "epoch": 0.35708154506437767, + "grad_norm": 0.392578125, + "learning_rate": 4.977542686835236e-06, + "loss": 2.2461, + "step": 6656 + }, + { + "epoch": 0.3571351931330472, + "grad_norm": 0.3984375, + "learning_rate": 4.977531066877872e-06, + "loss": 2.4064, + "step": 6657 + }, + { + "epoch": 0.35718884120171673, + "grad_norm": 0.42578125, + "learning_rate": 4.9775194439286316e-06, + "loss": 2.2999, + "step": 6658 + }, + { + "epoch": 0.35724248927038627, + "grad_norm": 0.85546875, + "learning_rate": 4.97750781798753e-06, + "loss": 2.2076, + "step": 6659 + }, + { + "epoch": 0.3572961373390558, + "grad_norm": 0.345703125, + "learning_rate": 4.977496189054581e-06, + "loss": 2.283, + "step": 6660 + }, + { + "epoch": 0.35734978540772533, + "grad_norm": 0.365234375, + "learning_rate": 4.9774845571297985e-06, + "loss": 2.1803, + "step": 6661 + }, + { + "epoch": 0.35740343347639486, + "grad_norm": 0.376953125, + "learning_rate": 4.977472922213195e-06, + "loss": 2.2338, + "step": 6662 + }, + { + "epoch": 0.3574570815450644, + "grad_norm": 0.404296875, + "learning_rate": 4.977461284304786e-06, + "loss": 2.1599, + "step": 6663 + }, + { + "epoch": 0.3575107296137339, + "grad_norm": 0.404296875, + "learning_rate": 4.977449643404587e-06, + "loss": 2.4874, + "step": 6664 + }, + { + "epoch": 0.35756437768240346, + "grad_norm": 0.390625, + "learning_rate": 4.97743799951261e-06, + "loss": 2.1748, + "step": 6665 + }, + { + "epoch": 0.357618025751073, + "grad_norm": 0.384765625, + "learning_rate": 4.977426352628869e-06, + "loss": 2.3418, + "step": 6666 + }, + { + "epoch": 0.35767167381974246, + "grad_norm": 0.400390625, + "learning_rate": 4.977414702753379e-06, + "loss": 2.0693, + "step": 6667 + }, + { + "epoch": 0.357725321888412, + "grad_norm": 0.447265625, + "learning_rate": 4.9774030498861535e-06, + "loss": 2.5061, + "step": 6668 + }, + { + "epoch": 0.3577789699570815, + "grad_norm": 0.369140625, + "learning_rate": 4.9773913940272074e-06, + "loss": 2.1508, + "step": 6669 + }, + { + "epoch": 0.35783261802575106, + "grad_norm": 0.44921875, + "learning_rate": 4.977379735176554e-06, + "loss": 2.3595, + "step": 6670 + }, + { + "epoch": 0.3578862660944206, + "grad_norm": 0.34765625, + "learning_rate": 4.9773680733342075e-06, + "loss": 2.3074, + "step": 6671 + }, + { + "epoch": 0.3579399141630901, + "grad_norm": 0.419921875, + "learning_rate": 4.977356408500182e-06, + "loss": 2.3892, + "step": 6672 + }, + { + "epoch": 0.35799356223175965, + "grad_norm": 0.484375, + "learning_rate": 4.977344740674491e-06, + "loss": 2.1894, + "step": 6673 + }, + { + "epoch": 0.3580472103004292, + "grad_norm": 0.494140625, + "learning_rate": 4.97733306985715e-06, + "loss": 2.4677, + "step": 6674 + }, + { + "epoch": 0.3581008583690987, + "grad_norm": 0.36328125, + "learning_rate": 4.9773213960481715e-06, + "loss": 2.4473, + "step": 6675 + }, + { + "epoch": 0.35815450643776825, + "grad_norm": 0.380859375, + "learning_rate": 4.977309719247571e-06, + "loss": 2.3196, + "step": 6676 + }, + { + "epoch": 0.3582081545064378, + "grad_norm": 0.390625, + "learning_rate": 4.977298039455362e-06, + "loss": 2.2547, + "step": 6677 + }, + { + "epoch": 0.3582618025751073, + "grad_norm": 0.39453125, + "learning_rate": 4.977286356671559e-06, + "loss": 2.2708, + "step": 6678 + }, + { + "epoch": 0.35831545064377684, + "grad_norm": 0.330078125, + "learning_rate": 4.977274670896175e-06, + "loss": 2.0966, + "step": 6679 + }, + { + "epoch": 0.3583690987124464, + "grad_norm": 0.61328125, + "learning_rate": 4.977262982129225e-06, + "loss": 2.3075, + "step": 6680 + }, + { + "epoch": 0.35842274678111585, + "grad_norm": 0.44140625, + "learning_rate": 4.9772512903707225e-06, + "loss": 2.5076, + "step": 6681 + }, + { + "epoch": 0.3584763948497854, + "grad_norm": 0.40234375, + "learning_rate": 4.977239595620683e-06, + "loss": 2.3817, + "step": 6682 + }, + { + "epoch": 0.3585300429184549, + "grad_norm": 0.380859375, + "learning_rate": 4.977227897879119e-06, + "loss": 2.2842, + "step": 6683 + }, + { + "epoch": 0.35858369098712445, + "grad_norm": 3.875, + "learning_rate": 4.977216197146045e-06, + "loss": 2.2135, + "step": 6684 + }, + { + "epoch": 0.358637339055794, + "grad_norm": 0.388671875, + "learning_rate": 4.9772044934214765e-06, + "loss": 2.4048, + "step": 6685 + }, + { + "epoch": 0.3586909871244635, + "grad_norm": 0.345703125, + "learning_rate": 4.977192786705426e-06, + "loss": 2.3425, + "step": 6686 + }, + { + "epoch": 0.35874463519313304, + "grad_norm": 0.458984375, + "learning_rate": 4.977181076997908e-06, + "loss": 2.3242, + "step": 6687 + }, + { + "epoch": 0.3587982832618026, + "grad_norm": 0.703125, + "learning_rate": 4.977169364298937e-06, + "loss": 2.4195, + "step": 6688 + }, + { + "epoch": 0.3588519313304721, + "grad_norm": 0.390625, + "learning_rate": 4.977157648608527e-06, + "loss": 2.2186, + "step": 6689 + }, + { + "epoch": 0.35890557939914164, + "grad_norm": 0.36328125, + "learning_rate": 4.977145929926692e-06, + "loss": 2.2656, + "step": 6690 + }, + { + "epoch": 0.35895922746781117, + "grad_norm": 0.50390625, + "learning_rate": 4.977134208253447e-06, + "loss": 2.2002, + "step": 6691 + }, + { + "epoch": 0.3590128755364807, + "grad_norm": 0.470703125, + "learning_rate": 4.977122483588805e-06, + "loss": 1.9939, + "step": 6692 + }, + { + "epoch": 0.35906652360515023, + "grad_norm": 0.75390625, + "learning_rate": 4.97711075593278e-06, + "loss": 2.2833, + "step": 6693 + }, + { + "epoch": 0.35912017167381977, + "grad_norm": 0.44140625, + "learning_rate": 4.977099025285388e-06, + "loss": 1.2382, + "step": 6694 + }, + { + "epoch": 0.3591738197424893, + "grad_norm": 0.33203125, + "learning_rate": 4.977087291646641e-06, + "loss": 2.0806, + "step": 6695 + }, + { + "epoch": 0.3592274678111588, + "grad_norm": 0.65234375, + "learning_rate": 4.977075555016554e-06, + "loss": 2.1619, + "step": 6696 + }, + { + "epoch": 0.3592811158798283, + "grad_norm": 0.37890625, + "learning_rate": 4.9770638153951424e-06, + "loss": 2.4469, + "step": 6697 + }, + { + "epoch": 0.35933476394849784, + "grad_norm": 0.453125, + "learning_rate": 4.977052072782418e-06, + "loss": 2.5237, + "step": 6698 + }, + { + "epoch": 0.35938841201716737, + "grad_norm": 0.55859375, + "learning_rate": 4.977040327178397e-06, + "loss": 2.3341, + "step": 6699 + }, + { + "epoch": 0.3594420600858369, + "grad_norm": 0.396484375, + "learning_rate": 4.9770285785830925e-06, + "loss": 2.2409, + "step": 6700 + }, + { + "epoch": 0.35949570815450643, + "grad_norm": 0.361328125, + "learning_rate": 4.977016826996519e-06, + "loss": 2.4489, + "step": 6701 + }, + { + "epoch": 0.35954935622317596, + "grad_norm": 0.392578125, + "learning_rate": 4.977005072418691e-06, + "loss": 2.2339, + "step": 6702 + }, + { + "epoch": 0.3596030042918455, + "grad_norm": 0.380859375, + "learning_rate": 4.9769933148496215e-06, + "loss": 2.3315, + "step": 6703 + }, + { + "epoch": 0.35965665236051503, + "grad_norm": 0.47265625, + "learning_rate": 4.976981554289326e-06, + "loss": 2.2516, + "step": 6704 + }, + { + "epoch": 0.35971030042918456, + "grad_norm": 0.3984375, + "learning_rate": 4.976969790737819e-06, + "loss": 2.285, + "step": 6705 + }, + { + "epoch": 0.3597639484978541, + "grad_norm": 0.63671875, + "learning_rate": 4.976958024195113e-06, + "loss": 2.2661, + "step": 6706 + }, + { + "epoch": 0.3598175965665236, + "grad_norm": 0.396484375, + "learning_rate": 4.9769462546612244e-06, + "loss": 2.4619, + "step": 6707 + }, + { + "epoch": 0.35987124463519315, + "grad_norm": 0.48046875, + "learning_rate": 4.976934482136166e-06, + "loss": 2.2948, + "step": 6708 + }, + { + "epoch": 0.3599248927038627, + "grad_norm": 0.45703125, + "learning_rate": 4.976922706619951e-06, + "loss": 2.1575, + "step": 6709 + }, + { + "epoch": 0.35997854077253216, + "grad_norm": 0.408203125, + "learning_rate": 4.976910928112596e-06, + "loss": 2.4475, + "step": 6710 + }, + { + "epoch": 0.3600321888412017, + "grad_norm": 0.412109375, + "learning_rate": 4.976899146614114e-06, + "loss": 2.4032, + "step": 6711 + }, + { + "epoch": 0.3600858369098712, + "grad_norm": 0.40625, + "learning_rate": 4.976887362124518e-06, + "loss": 2.5078, + "step": 6712 + }, + { + "epoch": 0.36013948497854076, + "grad_norm": 0.515625, + "learning_rate": 4.976875574643825e-06, + "loss": 2.174, + "step": 6713 + }, + { + "epoch": 0.3601931330472103, + "grad_norm": 0.390625, + "learning_rate": 4.976863784172048e-06, + "loss": 2.3022, + "step": 6714 + }, + { + "epoch": 0.3602467811158798, + "grad_norm": 0.38671875, + "learning_rate": 4.9768519907092e-06, + "loss": 2.3837, + "step": 6715 + }, + { + "epoch": 0.36030042918454935, + "grad_norm": 0.361328125, + "learning_rate": 4.976840194255297e-06, + "loss": 2.162, + "step": 6716 + }, + { + "epoch": 0.3603540772532189, + "grad_norm": 0.453125, + "learning_rate": 4.976828394810351e-06, + "loss": 2.3466, + "step": 6717 + }, + { + "epoch": 0.3604077253218884, + "grad_norm": 0.52734375, + "learning_rate": 4.97681659237438e-06, + "loss": 1.9863, + "step": 6718 + }, + { + "epoch": 0.36046137339055795, + "grad_norm": 0.357421875, + "learning_rate": 4.976804786947394e-06, + "loss": 2.3143, + "step": 6719 + }, + { + "epoch": 0.3605150214592275, + "grad_norm": 0.404296875, + "learning_rate": 4.976792978529411e-06, + "loss": 2.3318, + "step": 6720 + }, + { + "epoch": 0.360568669527897, + "grad_norm": 0.5859375, + "learning_rate": 4.9767811671204425e-06, + "loss": 2.3513, + "step": 6721 + }, + { + "epoch": 0.36062231759656654, + "grad_norm": 0.36328125, + "learning_rate": 4.976769352720503e-06, + "loss": 2.2195, + "step": 6722 + }, + { + "epoch": 0.3606759656652361, + "grad_norm": 0.30859375, + "learning_rate": 4.9767575353296095e-06, + "loss": 2.3454, + "step": 6723 + }, + { + "epoch": 0.36072961373390555, + "grad_norm": 0.41796875, + "learning_rate": 4.976745714947773e-06, + "loss": 2.1028, + "step": 6724 + }, + { + "epoch": 0.3607832618025751, + "grad_norm": 0.392578125, + "learning_rate": 4.976733891575009e-06, + "loss": 2.3394, + "step": 6725 + }, + { + "epoch": 0.3608369098712446, + "grad_norm": 0.498046875, + "learning_rate": 4.976722065211332e-06, + "loss": 2.3959, + "step": 6726 + }, + { + "epoch": 0.36089055793991415, + "grad_norm": 0.408203125, + "learning_rate": 4.976710235856756e-06, + "loss": 2.4043, + "step": 6727 + }, + { + "epoch": 0.3609442060085837, + "grad_norm": 0.400390625, + "learning_rate": 4.9766984035112966e-06, + "loss": 2.4398, + "step": 6728 + }, + { + "epoch": 0.3609978540772532, + "grad_norm": 0.51171875, + "learning_rate": 4.976686568174966e-06, + "loss": 2.6048, + "step": 6729 + }, + { + "epoch": 0.36105150214592274, + "grad_norm": 0.578125, + "learning_rate": 4.9766747298477795e-06, + "loss": 2.2228, + "step": 6730 + }, + { + "epoch": 0.3611051502145923, + "grad_norm": 0.39453125, + "learning_rate": 4.9766628885297515e-06, + "loss": 2.1763, + "step": 6731 + }, + { + "epoch": 0.3611587982832618, + "grad_norm": 0.404296875, + "learning_rate": 4.976651044220896e-06, + "loss": 2.2997, + "step": 6732 + }, + { + "epoch": 0.36121244635193134, + "grad_norm": 0.486328125, + "learning_rate": 4.976639196921227e-06, + "loss": 2.1931, + "step": 6733 + }, + { + "epoch": 0.36126609442060087, + "grad_norm": 0.482421875, + "learning_rate": 4.97662734663076e-06, + "loss": 1.9536, + "step": 6734 + }, + { + "epoch": 0.3613197424892704, + "grad_norm": 0.35546875, + "learning_rate": 4.976615493349508e-06, + "loss": 2.2995, + "step": 6735 + }, + { + "epoch": 0.36137339055793993, + "grad_norm": 0.3984375, + "learning_rate": 4.976603637077486e-06, + "loss": 2.3231, + "step": 6736 + }, + { + "epoch": 0.36142703862660946, + "grad_norm": 0.412109375, + "learning_rate": 4.9765917778147085e-06, + "loss": 2.3979, + "step": 6737 + }, + { + "epoch": 0.361480686695279, + "grad_norm": 0.5078125, + "learning_rate": 4.976579915561189e-06, + "loss": 2.1391, + "step": 6738 + }, + { + "epoch": 0.3615343347639485, + "grad_norm": 0.39453125, + "learning_rate": 4.976568050316943e-06, + "loss": 2.2693, + "step": 6739 + }, + { + "epoch": 0.361587982832618, + "grad_norm": 0.375, + "learning_rate": 4.976556182081984e-06, + "loss": 2.5841, + "step": 6740 + }, + { + "epoch": 0.36164163090128754, + "grad_norm": 0.388671875, + "learning_rate": 4.9765443108563255e-06, + "loss": 2.3822, + "step": 6741 + }, + { + "epoch": 0.36169527896995707, + "grad_norm": 0.435546875, + "learning_rate": 4.976532436639983e-06, + "loss": 2.285, + "step": 6742 + }, + { + "epoch": 0.3617489270386266, + "grad_norm": 0.373046875, + "learning_rate": 4.976520559432971e-06, + "loss": 2.0832, + "step": 6743 + }, + { + "epoch": 0.36180257510729613, + "grad_norm": 0.404296875, + "learning_rate": 4.976508679235304e-06, + "loss": 2.5932, + "step": 6744 + }, + { + "epoch": 0.36185622317596566, + "grad_norm": 0.3125, + "learning_rate": 4.9764967960469954e-06, + "loss": 2.2057, + "step": 6745 + }, + { + "epoch": 0.3619098712446352, + "grad_norm": 0.55078125, + "learning_rate": 4.9764849098680605e-06, + "loss": 2.4485, + "step": 6746 + }, + { + "epoch": 0.3619635193133047, + "grad_norm": 0.3515625, + "learning_rate": 4.976473020698512e-06, + "loss": 2.239, + "step": 6747 + }, + { + "epoch": 0.36201716738197426, + "grad_norm": 0.4453125, + "learning_rate": 4.976461128538367e-06, + "loss": 2.4697, + "step": 6748 + }, + { + "epoch": 0.3620708154506438, + "grad_norm": 0.37890625, + "learning_rate": 4.976449233387637e-06, + "loss": 2.2323, + "step": 6749 + }, + { + "epoch": 0.3621244635193133, + "grad_norm": 0.396484375, + "learning_rate": 4.9764373352463374e-06, + "loss": 2.1303, + "step": 6750 + }, + { + "epoch": 0.36217811158798285, + "grad_norm": 0.470703125, + "learning_rate": 4.976425434114483e-06, + "loss": 2.2844, + "step": 6751 + }, + { + "epoch": 0.3622317596566524, + "grad_norm": 0.36328125, + "learning_rate": 4.976413529992089e-06, + "loss": 2.6147, + "step": 6752 + }, + { + "epoch": 0.36228540772532186, + "grad_norm": 0.416015625, + "learning_rate": 4.976401622879168e-06, + "loss": 1.4948, + "step": 6753 + }, + { + "epoch": 0.3623390557939914, + "grad_norm": 0.5625, + "learning_rate": 4.976389712775735e-06, + "loss": 2.4326, + "step": 6754 + }, + { + "epoch": 0.3623927038626609, + "grad_norm": 0.416015625, + "learning_rate": 4.976377799681804e-06, + "loss": 1.9687, + "step": 6755 + }, + { + "epoch": 0.36244635193133046, + "grad_norm": 0.5859375, + "learning_rate": 4.976365883597391e-06, + "loss": 2.1822, + "step": 6756 + }, + { + "epoch": 0.3625, + "grad_norm": 0.373046875, + "learning_rate": 4.976353964522509e-06, + "loss": 2.0957, + "step": 6757 + }, + { + "epoch": 0.3625536480686695, + "grad_norm": 0.380859375, + "learning_rate": 4.976342042457172e-06, + "loss": 2.2932, + "step": 6758 + }, + { + "epoch": 0.36260729613733905, + "grad_norm": 0.3515625, + "learning_rate": 4.976330117401395e-06, + "loss": 1.8567, + "step": 6759 + }, + { + "epoch": 0.3626609442060086, + "grad_norm": 0.34765625, + "learning_rate": 4.976318189355194e-06, + "loss": 2.4214, + "step": 6760 + }, + { + "epoch": 0.3627145922746781, + "grad_norm": 0.404296875, + "learning_rate": 4.976306258318581e-06, + "loss": 2.3802, + "step": 6761 + }, + { + "epoch": 0.36276824034334765, + "grad_norm": 0.38671875, + "learning_rate": 4.976294324291571e-06, + "loss": 2.1973, + "step": 6762 + }, + { + "epoch": 0.3628218884120172, + "grad_norm": 0.3671875, + "learning_rate": 4.9762823872741785e-06, + "loss": 2.2332, + "step": 6763 + }, + { + "epoch": 0.3628755364806867, + "grad_norm": 0.462890625, + "learning_rate": 4.976270447266418e-06, + "loss": 2.3854, + "step": 6764 + }, + { + "epoch": 0.36292918454935624, + "grad_norm": 0.80859375, + "learning_rate": 4.976258504268306e-06, + "loss": 2.4444, + "step": 6765 + }, + { + "epoch": 0.3629828326180258, + "grad_norm": 0.380859375, + "learning_rate": 4.9762465582798525e-06, + "loss": 2.1601, + "step": 6766 + }, + { + "epoch": 0.3630364806866953, + "grad_norm": 0.48046875, + "learning_rate": 4.976234609301076e-06, + "loss": 1.6838, + "step": 6767 + }, + { + "epoch": 0.3630901287553648, + "grad_norm": 0.39453125, + "learning_rate": 4.976222657331988e-06, + "loss": 2.31, + "step": 6768 + }, + { + "epoch": 0.3631437768240343, + "grad_norm": 0.4140625, + "learning_rate": 4.976210702372605e-06, + "loss": 2.2388, + "step": 6769 + }, + { + "epoch": 0.36319742489270385, + "grad_norm": 0.37890625, + "learning_rate": 4.976198744422941e-06, + "loss": 2.1773, + "step": 6770 + }, + { + "epoch": 0.3632510729613734, + "grad_norm": 0.41015625, + "learning_rate": 4.9761867834830094e-06, + "loss": 2.3193, + "step": 6771 + }, + { + "epoch": 0.3633047210300429, + "grad_norm": 0.423828125, + "learning_rate": 4.9761748195528255e-06, + "loss": 2.302, + "step": 6772 + }, + { + "epoch": 0.36335836909871244, + "grad_norm": 0.404296875, + "learning_rate": 4.9761628526324035e-06, + "loss": 2.1091, + "step": 6773 + }, + { + "epoch": 0.363412017167382, + "grad_norm": 0.390625, + "learning_rate": 4.976150882721759e-06, + "loss": 2.6435, + "step": 6774 + }, + { + "epoch": 0.3634656652360515, + "grad_norm": 0.42578125, + "learning_rate": 4.976138909820904e-06, + "loss": 2.4909, + "step": 6775 + }, + { + "epoch": 0.36351931330472104, + "grad_norm": 0.34765625, + "learning_rate": 4.976126933929855e-06, + "loss": 2.2047, + "step": 6776 + }, + { + "epoch": 0.36357296137339057, + "grad_norm": 20.0, + "learning_rate": 4.9761149550486254e-06, + "loss": 2.2801, + "step": 6777 + }, + { + "epoch": 0.3636266094420601, + "grad_norm": 0.369140625, + "learning_rate": 4.976102973177231e-06, + "loss": 2.2084, + "step": 6778 + }, + { + "epoch": 0.36368025751072963, + "grad_norm": 0.42578125, + "learning_rate": 4.9760909883156845e-06, + "loss": 2.4399, + "step": 6779 + }, + { + "epoch": 0.36373390557939916, + "grad_norm": 0.3671875, + "learning_rate": 4.976079000464002e-06, + "loss": 2.4386, + "step": 6780 + }, + { + "epoch": 0.3637875536480687, + "grad_norm": 0.4453125, + "learning_rate": 4.976067009622196e-06, + "loss": 2.1211, + "step": 6781 + }, + { + "epoch": 0.36384120171673817, + "grad_norm": 0.4140625, + "learning_rate": 4.976055015790284e-06, + "loss": 2.3809, + "step": 6782 + }, + { + "epoch": 0.3638948497854077, + "grad_norm": 0.36328125, + "learning_rate": 4.9760430189682775e-06, + "loss": 2.252, + "step": 6783 + }, + { + "epoch": 0.36394849785407724, + "grad_norm": 0.3515625, + "learning_rate": 4.976031019156192e-06, + "loss": 1.707, + "step": 6784 + }, + { + "epoch": 0.36400214592274677, + "grad_norm": 4.375, + "learning_rate": 4.9760190163540435e-06, + "loss": 2.088, + "step": 6785 + }, + { + "epoch": 0.3640557939914163, + "grad_norm": 0.5234375, + "learning_rate": 4.976007010561844e-06, + "loss": 2.643, + "step": 6786 + }, + { + "epoch": 0.36410944206008583, + "grad_norm": 0.419921875, + "learning_rate": 4.9759950017796085e-06, + "loss": 2.4823, + "step": 6787 + }, + { + "epoch": 0.36416309012875536, + "grad_norm": 0.37890625, + "learning_rate": 4.975982990007354e-06, + "loss": 2.4143, + "step": 6788 + }, + { + "epoch": 0.3642167381974249, + "grad_norm": 0.43359375, + "learning_rate": 4.975970975245092e-06, + "loss": 2.3961, + "step": 6789 + }, + { + "epoch": 0.3642703862660944, + "grad_norm": 0.365234375, + "learning_rate": 4.975958957492838e-06, + "loss": 2.2715, + "step": 6790 + }, + { + "epoch": 0.36432403433476396, + "grad_norm": 0.38671875, + "learning_rate": 4.975946936750608e-06, + "loss": 2.44, + "step": 6791 + }, + { + "epoch": 0.3643776824034335, + "grad_norm": 0.37890625, + "learning_rate": 4.975934913018413e-06, + "loss": 2.0424, + "step": 6792 + }, + { + "epoch": 0.364431330472103, + "grad_norm": 0.36328125, + "learning_rate": 4.975922886296271e-06, + "loss": 2.203, + "step": 6793 + }, + { + "epoch": 0.36448497854077255, + "grad_norm": 0.42578125, + "learning_rate": 4.975910856584196e-06, + "loss": 2.3955, + "step": 6794 + }, + { + "epoch": 0.3645386266094421, + "grad_norm": 0.53125, + "learning_rate": 4.975898823882201e-06, + "loss": 2.369, + "step": 6795 + }, + { + "epoch": 0.36459227467811156, + "grad_norm": 0.40234375, + "learning_rate": 4.975886788190301e-06, + "loss": 2.3355, + "step": 6796 + }, + { + "epoch": 0.3646459227467811, + "grad_norm": 0.443359375, + "learning_rate": 4.975874749508511e-06, + "loss": 2.3078, + "step": 6797 + }, + { + "epoch": 0.3646995708154506, + "grad_norm": 0.44140625, + "learning_rate": 4.975862707836846e-06, + "loss": 2.022, + "step": 6798 + }, + { + "epoch": 0.36475321888412016, + "grad_norm": 0.400390625, + "learning_rate": 4.975850663175319e-06, + "loss": 1.8757, + "step": 6799 + }, + { + "epoch": 0.3648068669527897, + "grad_norm": 0.392578125, + "learning_rate": 4.975838615523946e-06, + "loss": 2.2426, + "step": 6800 + }, + { + "epoch": 0.3648605150214592, + "grad_norm": 0.412109375, + "learning_rate": 4.9758265648827406e-06, + "loss": 2.4858, + "step": 6801 + }, + { + "epoch": 0.36491416309012875, + "grad_norm": 0.396484375, + "learning_rate": 4.975814511251718e-06, + "loss": 2.2375, + "step": 6802 + }, + { + "epoch": 0.3649678111587983, + "grad_norm": 0.390625, + "learning_rate": 4.975802454630893e-06, + "loss": 2.4016, + "step": 6803 + }, + { + "epoch": 0.3650214592274678, + "grad_norm": 0.470703125, + "learning_rate": 4.975790395020279e-06, + "loss": 2.2979, + "step": 6804 + }, + { + "epoch": 0.36507510729613735, + "grad_norm": 0.419921875, + "learning_rate": 4.975778332419892e-06, + "loss": 2.2469, + "step": 6805 + }, + { + "epoch": 0.3651287553648069, + "grad_norm": 0.41015625, + "learning_rate": 4.975766266829744e-06, + "loss": 2.2298, + "step": 6806 + }, + { + "epoch": 0.3651824034334764, + "grad_norm": 0.369140625, + "learning_rate": 4.9757541982498535e-06, + "loss": 2.3282, + "step": 6807 + }, + { + "epoch": 0.36523605150214594, + "grad_norm": 0.60546875, + "learning_rate": 4.975742126680232e-06, + "loss": 1.4782, + "step": 6808 + }, + { + "epoch": 0.3652896995708155, + "grad_norm": 0.390625, + "learning_rate": 4.975730052120895e-06, + "loss": 2.5128, + "step": 6809 + }, + { + "epoch": 0.365343347639485, + "grad_norm": 0.5078125, + "learning_rate": 4.975717974571857e-06, + "loss": 2.1067, + "step": 6810 + }, + { + "epoch": 0.3653969957081545, + "grad_norm": 0.3671875, + "learning_rate": 4.9757058940331334e-06, + "loss": 2.351, + "step": 6811 + }, + { + "epoch": 0.365450643776824, + "grad_norm": 0.412109375, + "learning_rate": 4.975693810504738e-06, + "loss": 2.267, + "step": 6812 + }, + { + "epoch": 0.36550429184549355, + "grad_norm": 0.404296875, + "learning_rate": 4.975681723986685e-06, + "loss": 2.439, + "step": 6813 + }, + { + "epoch": 0.3655579399141631, + "grad_norm": 0.392578125, + "learning_rate": 4.97566963447899e-06, + "loss": 2.3487, + "step": 6814 + }, + { + "epoch": 0.3656115879828326, + "grad_norm": 0.455078125, + "learning_rate": 4.975657541981666e-06, + "loss": 2.3504, + "step": 6815 + }, + { + "epoch": 0.36566523605150214, + "grad_norm": 0.447265625, + "learning_rate": 4.975645446494729e-06, + "loss": 2.0361, + "step": 6816 + }, + { + "epoch": 0.3657188841201717, + "grad_norm": 0.447265625, + "learning_rate": 4.975633348018194e-06, + "loss": 2.2876, + "step": 6817 + }, + { + "epoch": 0.3657725321888412, + "grad_norm": 0.40234375, + "learning_rate": 4.9756212465520745e-06, + "loss": 2.4066, + "step": 6818 + }, + { + "epoch": 0.36582618025751074, + "grad_norm": 0.3984375, + "learning_rate": 4.975609142096386e-06, + "loss": 2.3373, + "step": 6819 + }, + { + "epoch": 0.36587982832618027, + "grad_norm": 0.408203125, + "learning_rate": 4.975597034651142e-06, + "loss": 2.2851, + "step": 6820 + }, + { + "epoch": 0.3659334763948498, + "grad_norm": 0.44921875, + "learning_rate": 4.975584924216358e-06, + "loss": 2.331, + "step": 6821 + }, + { + "epoch": 0.36598712446351933, + "grad_norm": 0.45703125, + "learning_rate": 4.9755728107920475e-06, + "loss": 2.4257, + "step": 6822 + }, + { + "epoch": 0.36604077253218886, + "grad_norm": 0.4609375, + "learning_rate": 4.9755606943782275e-06, + "loss": 2.253, + "step": 6823 + }, + { + "epoch": 0.3660944206008584, + "grad_norm": 0.57421875, + "learning_rate": 4.97554857497491e-06, + "loss": 2.2984, + "step": 6824 + }, + { + "epoch": 0.36614806866952787, + "grad_norm": 0.375, + "learning_rate": 4.975536452582111e-06, + "loss": 2.2818, + "step": 6825 + }, + { + "epoch": 0.3662017167381974, + "grad_norm": 0.421875, + "learning_rate": 4.9755243271998455e-06, + "loss": 2.2281, + "step": 6826 + }, + { + "epoch": 0.36625536480686693, + "grad_norm": 0.546875, + "learning_rate": 4.975512198828127e-06, + "loss": 2.2769, + "step": 6827 + }, + { + "epoch": 0.36630901287553647, + "grad_norm": 0.3515625, + "learning_rate": 4.9755000674669705e-06, + "loss": 2.2034, + "step": 6828 + }, + { + "epoch": 0.366362660944206, + "grad_norm": 0.322265625, + "learning_rate": 4.975487933116391e-06, + "loss": 2.2504, + "step": 6829 + }, + { + "epoch": 0.36641630901287553, + "grad_norm": 0.38671875, + "learning_rate": 4.9754757957764035e-06, + "loss": 2.5962, + "step": 6830 + }, + { + "epoch": 0.36646995708154506, + "grad_norm": 0.484375, + "learning_rate": 4.975463655447022e-06, + "loss": 2.1265, + "step": 6831 + }, + { + "epoch": 0.3665236051502146, + "grad_norm": 0.4453125, + "learning_rate": 4.975451512128261e-06, + "loss": 2.4629, + "step": 6832 + }, + { + "epoch": 0.3665772532188841, + "grad_norm": 0.396484375, + "learning_rate": 4.975439365820135e-06, + "loss": 1.6209, + "step": 6833 + }, + { + "epoch": 0.36663090128755366, + "grad_norm": 0.453125, + "learning_rate": 4.97542721652266e-06, + "loss": 2.2907, + "step": 6834 + }, + { + "epoch": 0.3666845493562232, + "grad_norm": 0.419921875, + "learning_rate": 4.97541506423585e-06, + "loss": 2.2874, + "step": 6835 + }, + { + "epoch": 0.3667381974248927, + "grad_norm": 1.1171875, + "learning_rate": 4.975402908959719e-06, + "loss": 2.1849, + "step": 6836 + }, + { + "epoch": 0.36679184549356225, + "grad_norm": 0.416015625, + "learning_rate": 4.975390750694282e-06, + "loss": 2.2262, + "step": 6837 + }, + { + "epoch": 0.3668454935622318, + "grad_norm": 0.466796875, + "learning_rate": 4.975378589439553e-06, + "loss": 2.2796, + "step": 6838 + }, + { + "epoch": 0.36689914163090126, + "grad_norm": 0.37890625, + "learning_rate": 4.975366425195549e-06, + "loss": 2.4515, + "step": 6839 + }, + { + "epoch": 0.3669527896995708, + "grad_norm": 0.37890625, + "learning_rate": 4.975354257962283e-06, + "loss": 2.2198, + "step": 6840 + }, + { + "epoch": 0.3670064377682403, + "grad_norm": 0.4375, + "learning_rate": 4.975342087739769e-06, + "loss": 2.4159, + "step": 6841 + }, + { + "epoch": 0.36706008583690986, + "grad_norm": 0.4296875, + "learning_rate": 4.975329914528025e-06, + "loss": 2.4086, + "step": 6842 + }, + { + "epoch": 0.3671137339055794, + "grad_norm": 0.404296875, + "learning_rate": 4.975317738327061e-06, + "loss": 2.5122, + "step": 6843 + }, + { + "epoch": 0.3671673819742489, + "grad_norm": 0.4765625, + "learning_rate": 4.975305559136895e-06, + "loss": 2.2918, + "step": 6844 + }, + { + "epoch": 0.36722103004291845, + "grad_norm": 0.390625, + "learning_rate": 4.9752933769575396e-06, + "loss": 2.442, + "step": 6845 + }, + { + "epoch": 0.367274678111588, + "grad_norm": 0.4375, + "learning_rate": 4.975281191789012e-06, + "loss": 2.1205, + "step": 6846 + }, + { + "epoch": 0.3673283261802575, + "grad_norm": 0.41015625, + "learning_rate": 4.975269003631324e-06, + "loss": 2.3605, + "step": 6847 + }, + { + "epoch": 0.36738197424892705, + "grad_norm": 0.345703125, + "learning_rate": 4.975256812484494e-06, + "loss": 2.0178, + "step": 6848 + }, + { + "epoch": 0.3674356223175966, + "grad_norm": 0.4921875, + "learning_rate": 4.975244618348533e-06, + "loss": 2.045, + "step": 6849 + }, + { + "epoch": 0.3674892703862661, + "grad_norm": 0.482421875, + "learning_rate": 4.975232421223458e-06, + "loss": 2.4261, + "step": 6850 + }, + { + "epoch": 0.36754291845493564, + "grad_norm": 0.388671875, + "learning_rate": 4.975220221109282e-06, + "loss": 2.3471, + "step": 6851 + }, + { + "epoch": 0.3675965665236052, + "grad_norm": 0.439453125, + "learning_rate": 4.975208018006023e-06, + "loss": 2.2766, + "step": 6852 + }, + { + "epoch": 0.3676502145922747, + "grad_norm": 0.416015625, + "learning_rate": 4.9751958119136914e-06, + "loss": 2.4252, + "step": 6853 + }, + { + "epoch": 0.3677038626609442, + "grad_norm": 0.388671875, + "learning_rate": 4.975183602832305e-06, + "loss": 2.4068, + "step": 6854 + }, + { + "epoch": 0.3677575107296137, + "grad_norm": 0.51953125, + "learning_rate": 4.975171390761877e-06, + "loss": 2.3274, + "step": 6855 + }, + { + "epoch": 0.36781115879828324, + "grad_norm": 0.404296875, + "learning_rate": 4.975159175702423e-06, + "loss": 2.2168, + "step": 6856 + }, + { + "epoch": 0.3678648068669528, + "grad_norm": 0.419921875, + "learning_rate": 4.975146957653958e-06, + "loss": 2.6385, + "step": 6857 + }, + { + "epoch": 0.3679184549356223, + "grad_norm": 0.3984375, + "learning_rate": 4.975134736616496e-06, + "loss": 2.2738, + "step": 6858 + }, + { + "epoch": 0.36797210300429184, + "grad_norm": 0.4140625, + "learning_rate": 4.975122512590053e-06, + "loss": 2.1279, + "step": 6859 + }, + { + "epoch": 0.36802575107296137, + "grad_norm": 0.49609375, + "learning_rate": 4.9751102855746415e-06, + "loss": 2.5478, + "step": 6860 + }, + { + "epoch": 0.3680793991416309, + "grad_norm": 0.40625, + "learning_rate": 4.975098055570277e-06, + "loss": 2.057, + "step": 6861 + }, + { + "epoch": 0.36813304721030043, + "grad_norm": 0.94921875, + "learning_rate": 4.9750858225769764e-06, + "loss": 2.4407, + "step": 6862 + }, + { + "epoch": 0.36818669527896997, + "grad_norm": 0.5, + "learning_rate": 4.9750735865947515e-06, + "loss": 2.415, + "step": 6863 + }, + { + "epoch": 0.3682403433476395, + "grad_norm": 0.35546875, + "learning_rate": 4.975061347623618e-06, + "loss": 2.3506, + "step": 6864 + }, + { + "epoch": 0.36829399141630903, + "grad_norm": 0.388671875, + "learning_rate": 4.975049105663593e-06, + "loss": 2.2141, + "step": 6865 + }, + { + "epoch": 0.36834763948497856, + "grad_norm": 0.50390625, + "learning_rate": 4.975036860714689e-06, + "loss": 2.4209, + "step": 6866 + }, + { + "epoch": 0.3684012875536481, + "grad_norm": 0.388671875, + "learning_rate": 4.975024612776921e-06, + "loss": 2.3542, + "step": 6867 + }, + { + "epoch": 0.36845493562231757, + "grad_norm": 0.4453125, + "learning_rate": 4.975012361850303e-06, + "loss": 2.2208, + "step": 6868 + }, + { + "epoch": 0.3685085836909871, + "grad_norm": 0.423828125, + "learning_rate": 4.975000107934852e-06, + "loss": 2.3214, + "step": 6869 + }, + { + "epoch": 0.36856223175965663, + "grad_norm": 0.314453125, + "learning_rate": 4.974987851030581e-06, + "loss": 1.7984, + "step": 6870 + }, + { + "epoch": 0.36861587982832617, + "grad_norm": 0.412109375, + "learning_rate": 4.9749755911375055e-06, + "loss": 2.4339, + "step": 6871 + }, + { + "epoch": 0.3686695278969957, + "grad_norm": 0.45703125, + "learning_rate": 4.974963328255641e-06, + "loss": 2.1736, + "step": 6872 + }, + { + "epoch": 0.36872317596566523, + "grad_norm": 0.359375, + "learning_rate": 4.9749510623850006e-06, + "loss": 2.2885, + "step": 6873 + }, + { + "epoch": 0.36877682403433476, + "grad_norm": 0.384765625, + "learning_rate": 4.9749387935256e-06, + "loss": 2.0643, + "step": 6874 + }, + { + "epoch": 0.3688304721030043, + "grad_norm": 0.69921875, + "learning_rate": 4.974926521677455e-06, + "loss": 2.2615, + "step": 6875 + }, + { + "epoch": 0.3688841201716738, + "grad_norm": 0.396484375, + "learning_rate": 4.974914246840578e-06, + "loss": 2.4251, + "step": 6876 + }, + { + "epoch": 0.36893776824034336, + "grad_norm": 0.478515625, + "learning_rate": 4.974901969014986e-06, + "loss": 2.3759, + "step": 6877 + }, + { + "epoch": 0.3689914163090129, + "grad_norm": 0.392578125, + "learning_rate": 4.974889688200694e-06, + "loss": 2.3382, + "step": 6878 + }, + { + "epoch": 0.3690450643776824, + "grad_norm": 0.396484375, + "learning_rate": 4.974877404397714e-06, + "loss": 2.3338, + "step": 6879 + }, + { + "epoch": 0.36909871244635195, + "grad_norm": 0.375, + "learning_rate": 4.974865117606064e-06, + "loss": 2.275, + "step": 6880 + }, + { + "epoch": 0.3691523605150215, + "grad_norm": 0.373046875, + "learning_rate": 4.974852827825758e-06, + "loss": 2.0743, + "step": 6881 + }, + { + "epoch": 0.369206008583691, + "grad_norm": 0.400390625, + "learning_rate": 4.97484053505681e-06, + "loss": 2.3374, + "step": 6882 + }, + { + "epoch": 0.3692596566523605, + "grad_norm": 0.4140625, + "learning_rate": 4.9748282392992346e-06, + "loss": 2.1264, + "step": 6883 + }, + { + "epoch": 0.36931330472103, + "grad_norm": 0.33984375, + "learning_rate": 4.974815940553048e-06, + "loss": 2.1017, + "step": 6884 + }, + { + "epoch": 0.36936695278969955, + "grad_norm": 0.34765625, + "learning_rate": 4.974803638818264e-06, + "loss": 2.2042, + "step": 6885 + }, + { + "epoch": 0.3694206008583691, + "grad_norm": 0.5546875, + "learning_rate": 4.9747913340948985e-06, + "loss": 2.2099, + "step": 6886 + }, + { + "epoch": 0.3694742489270386, + "grad_norm": 0.384765625, + "learning_rate": 4.974779026382966e-06, + "loss": 2.377, + "step": 6887 + }, + { + "epoch": 0.36952789699570815, + "grad_norm": 0.322265625, + "learning_rate": 4.97476671568248e-06, + "loss": 2.1031, + "step": 6888 + }, + { + "epoch": 0.3695815450643777, + "grad_norm": 0.416015625, + "learning_rate": 4.9747544019934564e-06, + "loss": 2.5017, + "step": 6889 + }, + { + "epoch": 0.3696351931330472, + "grad_norm": 0.412109375, + "learning_rate": 4.974742085315911e-06, + "loss": 2.2995, + "step": 6890 + }, + { + "epoch": 0.36968884120171674, + "grad_norm": 0.388671875, + "learning_rate": 4.9747297656498575e-06, + "loss": 2.1405, + "step": 6891 + }, + { + "epoch": 0.3697424892703863, + "grad_norm": 0.345703125, + "learning_rate": 4.974717442995311e-06, + "loss": 2.0872, + "step": 6892 + }, + { + "epoch": 0.3697961373390558, + "grad_norm": 0.453125, + "learning_rate": 4.974705117352287e-06, + "loss": 2.2513, + "step": 6893 + }, + { + "epoch": 0.36984978540772534, + "grad_norm": 0.314453125, + "learning_rate": 4.9746927887207984e-06, + "loss": 2.3424, + "step": 6894 + }, + { + "epoch": 0.36990343347639487, + "grad_norm": 0.44140625, + "learning_rate": 4.974680457100862e-06, + "loss": 2.2495, + "step": 6895 + }, + { + "epoch": 0.3699570815450644, + "grad_norm": 0.96484375, + "learning_rate": 4.974668122492493e-06, + "loss": 2.421, + "step": 6896 + }, + { + "epoch": 0.3700107296137339, + "grad_norm": 0.3359375, + "learning_rate": 4.974655784895705e-06, + "loss": 2.0767, + "step": 6897 + }, + { + "epoch": 0.3700643776824034, + "grad_norm": 0.322265625, + "learning_rate": 4.974643444310513e-06, + "loss": 1.9064, + "step": 6898 + }, + { + "epoch": 0.37011802575107294, + "grad_norm": 0.439453125, + "learning_rate": 4.974631100736933e-06, + "loss": 2.2839, + "step": 6899 + }, + { + "epoch": 0.3701716738197425, + "grad_norm": 0.37890625, + "learning_rate": 4.974618754174979e-06, + "loss": 2.0965, + "step": 6900 + }, + { + "epoch": 0.370225321888412, + "grad_norm": 0.42578125, + "learning_rate": 4.974606404624667e-06, + "loss": 2.2364, + "step": 6901 + }, + { + "epoch": 0.37027896995708154, + "grad_norm": 0.392578125, + "learning_rate": 4.974594052086009e-06, + "loss": 2.2326, + "step": 6902 + }, + { + "epoch": 0.37033261802575107, + "grad_norm": 0.39453125, + "learning_rate": 4.974581696559023e-06, + "loss": 2.4757, + "step": 6903 + }, + { + "epoch": 0.3703862660944206, + "grad_norm": 0.3203125, + "learning_rate": 4.974569338043723e-06, + "loss": 2.1913, + "step": 6904 + }, + { + "epoch": 0.37043991416309013, + "grad_norm": 0.4296875, + "learning_rate": 4.9745569765401245e-06, + "loss": 2.2778, + "step": 6905 + }, + { + "epoch": 0.37049356223175967, + "grad_norm": 0.36328125, + "learning_rate": 4.974544612048241e-06, + "loss": 2.2967, + "step": 6906 + }, + { + "epoch": 0.3705472103004292, + "grad_norm": 0.55859375, + "learning_rate": 4.974532244568088e-06, + "loss": 2.2191, + "step": 6907 + }, + { + "epoch": 0.37060085836909873, + "grad_norm": 0.416015625, + "learning_rate": 4.97451987409968e-06, + "loss": 2.5115, + "step": 6908 + }, + { + "epoch": 0.37065450643776826, + "grad_norm": 0.3671875, + "learning_rate": 4.974507500643034e-06, + "loss": 2.1549, + "step": 6909 + }, + { + "epoch": 0.3707081545064378, + "grad_norm": 0.4375, + "learning_rate": 4.974495124198162e-06, + "loss": 2.4114, + "step": 6910 + }, + { + "epoch": 0.37076180257510727, + "grad_norm": 0.462890625, + "learning_rate": 4.974482744765081e-06, + "loss": 2.0806, + "step": 6911 + }, + { + "epoch": 0.3708154506437768, + "grad_norm": 0.4609375, + "learning_rate": 4.974470362343806e-06, + "loss": 2.2015, + "step": 6912 + }, + { + "epoch": 0.37086909871244633, + "grad_norm": 0.478515625, + "learning_rate": 4.974457976934351e-06, + "loss": 2.3766, + "step": 6913 + }, + { + "epoch": 0.37092274678111586, + "grad_norm": 0.5546875, + "learning_rate": 4.974445588536731e-06, + "loss": 2.4321, + "step": 6914 + }, + { + "epoch": 0.3709763948497854, + "grad_norm": 0.447265625, + "learning_rate": 4.974433197150961e-06, + "loss": 2.3437, + "step": 6915 + }, + { + "epoch": 0.3710300429184549, + "grad_norm": 0.421875, + "learning_rate": 4.974420802777056e-06, + "loss": 2.4078, + "step": 6916 + }, + { + "epoch": 0.37108369098712446, + "grad_norm": 0.439453125, + "learning_rate": 4.974408405415032e-06, + "loss": 2.3784, + "step": 6917 + }, + { + "epoch": 0.371137339055794, + "grad_norm": 0.396484375, + "learning_rate": 4.974396005064903e-06, + "loss": 2.3799, + "step": 6918 + }, + { + "epoch": 0.3711909871244635, + "grad_norm": 0.341796875, + "learning_rate": 4.974383601726683e-06, + "loss": 1.919, + "step": 6919 + }, + { + "epoch": 0.37124463519313305, + "grad_norm": 0.44140625, + "learning_rate": 4.97437119540039e-06, + "loss": 1.9071, + "step": 6920 + }, + { + "epoch": 0.3712982832618026, + "grad_norm": 0.48828125, + "learning_rate": 4.974358786086035e-06, + "loss": 2.0394, + "step": 6921 + }, + { + "epoch": 0.3713519313304721, + "grad_norm": 0.431640625, + "learning_rate": 4.974346373783637e-06, + "loss": 1.8474, + "step": 6922 + }, + { + "epoch": 0.37140557939914165, + "grad_norm": 0.380859375, + "learning_rate": 4.974333958493208e-06, + "loss": 1.8861, + "step": 6923 + }, + { + "epoch": 0.3714592274678112, + "grad_norm": 0.392578125, + "learning_rate": 4.974321540214764e-06, + "loss": 2.1243, + "step": 6924 + }, + { + "epoch": 0.3715128755364807, + "grad_norm": 0.431640625, + "learning_rate": 4.9743091189483205e-06, + "loss": 2.4927, + "step": 6925 + }, + { + "epoch": 0.3715665236051502, + "grad_norm": 0.400390625, + "learning_rate": 4.974296694693892e-06, + "loss": 2.3666, + "step": 6926 + }, + { + "epoch": 0.3716201716738197, + "grad_norm": 0.458984375, + "learning_rate": 4.974284267451493e-06, + "loss": 2.4341, + "step": 6927 + }, + { + "epoch": 0.37167381974248925, + "grad_norm": 0.3359375, + "learning_rate": 4.974271837221139e-06, + "loss": 2.0622, + "step": 6928 + }, + { + "epoch": 0.3717274678111588, + "grad_norm": 0.41015625, + "learning_rate": 4.974259404002846e-06, + "loss": 2.4283, + "step": 6929 + }, + { + "epoch": 0.3717811158798283, + "grad_norm": 0.470703125, + "learning_rate": 4.974246967796627e-06, + "loss": 2.3384, + "step": 6930 + }, + { + "epoch": 0.37183476394849785, + "grad_norm": 0.427734375, + "learning_rate": 4.974234528602499e-06, + "loss": 2.3403, + "step": 6931 + }, + { + "epoch": 0.3718884120171674, + "grad_norm": 0.470703125, + "learning_rate": 4.974222086420475e-06, + "loss": 2.1993, + "step": 6932 + }, + { + "epoch": 0.3719420600858369, + "grad_norm": 0.443359375, + "learning_rate": 4.974209641250572e-06, + "loss": 2.1121, + "step": 6933 + }, + { + "epoch": 0.37199570815450644, + "grad_norm": 0.85546875, + "learning_rate": 4.974197193092804e-06, + "loss": 2.3049, + "step": 6934 + }, + { + "epoch": 0.372049356223176, + "grad_norm": 0.474609375, + "learning_rate": 4.974184741947186e-06, + "loss": 2.4605, + "step": 6935 + }, + { + "epoch": 0.3721030042918455, + "grad_norm": 0.376953125, + "learning_rate": 4.974172287813734e-06, + "loss": 2.2224, + "step": 6936 + }, + { + "epoch": 0.37215665236051504, + "grad_norm": 0.462890625, + "learning_rate": 4.974159830692461e-06, + "loss": 2.2858, + "step": 6937 + }, + { + "epoch": 0.37221030042918457, + "grad_norm": 0.388671875, + "learning_rate": 4.974147370583385e-06, + "loss": 2.3696, + "step": 6938 + }, + { + "epoch": 0.3722639484978541, + "grad_norm": 0.5390625, + "learning_rate": 4.974134907486518e-06, + "loss": 2.3892, + "step": 6939 + }, + { + "epoch": 0.3723175965665236, + "grad_norm": 0.40625, + "learning_rate": 4.974122441401877e-06, + "loss": 2.5238, + "step": 6940 + }, + { + "epoch": 0.3723712446351931, + "grad_norm": 0.330078125, + "learning_rate": 4.974109972329476e-06, + "loss": 1.8149, + "step": 6941 + }, + { + "epoch": 0.37242489270386264, + "grad_norm": 0.640625, + "learning_rate": 4.974097500269331e-06, + "loss": 2.4112, + "step": 6942 + }, + { + "epoch": 0.3724785407725322, + "grad_norm": 0.35546875, + "learning_rate": 4.9740850252214565e-06, + "loss": 2.2018, + "step": 6943 + }, + { + "epoch": 0.3725321888412017, + "grad_norm": 0.49609375, + "learning_rate": 4.974072547185868e-06, + "loss": 2.3131, + "step": 6944 + }, + { + "epoch": 0.37258583690987124, + "grad_norm": 0.376953125, + "learning_rate": 4.97406006616258e-06, + "loss": 2.0773, + "step": 6945 + }, + { + "epoch": 0.37263948497854077, + "grad_norm": 0.98828125, + "learning_rate": 4.974047582151606e-06, + "loss": 1.8677, + "step": 6946 + }, + { + "epoch": 0.3726931330472103, + "grad_norm": 0.466796875, + "learning_rate": 4.974035095152965e-06, + "loss": 2.2946, + "step": 6947 + }, + { + "epoch": 0.37274678111587983, + "grad_norm": 0.4140625, + "learning_rate": 4.97402260516667e-06, + "loss": 2.2978, + "step": 6948 + }, + { + "epoch": 0.37280042918454936, + "grad_norm": 0.330078125, + "learning_rate": 4.974010112192734e-06, + "loss": 1.9992, + "step": 6949 + }, + { + "epoch": 0.3728540772532189, + "grad_norm": 0.447265625, + "learning_rate": 4.973997616231176e-06, + "loss": 2.3852, + "step": 6950 + }, + { + "epoch": 0.37290772532188843, + "grad_norm": 0.59765625, + "learning_rate": 4.973985117282009e-06, + "loss": 2.0971, + "step": 6951 + }, + { + "epoch": 0.37296137339055796, + "grad_norm": 0.412109375, + "learning_rate": 4.973972615345248e-06, + "loss": 2.2981, + "step": 6952 + }, + { + "epoch": 0.3730150214592275, + "grad_norm": 0.458984375, + "learning_rate": 4.973960110420908e-06, + "loss": 2.4744, + "step": 6953 + }, + { + "epoch": 0.373068669527897, + "grad_norm": 0.431640625, + "learning_rate": 4.9739476025090046e-06, + "loss": 1.196, + "step": 6954 + }, + { + "epoch": 0.3731223175965665, + "grad_norm": 0.4765625, + "learning_rate": 4.973935091609554e-06, + "loss": 2.1648, + "step": 6955 + }, + { + "epoch": 0.37317596566523603, + "grad_norm": 0.3984375, + "learning_rate": 4.973922577722568e-06, + "loss": 2.192, + "step": 6956 + }, + { + "epoch": 0.37322961373390556, + "grad_norm": 0.494140625, + "learning_rate": 4.973910060848066e-06, + "loss": 2.5079, + "step": 6957 + }, + { + "epoch": 0.3732832618025751, + "grad_norm": 0.369140625, + "learning_rate": 4.97389754098606e-06, + "loss": 2.3727, + "step": 6958 + }, + { + "epoch": 0.3733369098712446, + "grad_norm": 0.41796875, + "learning_rate": 4.973885018136566e-06, + "loss": 2.1315, + "step": 6959 + }, + { + "epoch": 0.37339055793991416, + "grad_norm": 0.37890625, + "learning_rate": 4.973872492299599e-06, + "loss": 2.2941, + "step": 6960 + }, + { + "epoch": 0.3734442060085837, + "grad_norm": 0.392578125, + "learning_rate": 4.973859963475174e-06, + "loss": 2.1137, + "step": 6961 + }, + { + "epoch": 0.3734978540772532, + "grad_norm": 0.400390625, + "learning_rate": 4.973847431663308e-06, + "loss": 2.3168, + "step": 6962 + }, + { + "epoch": 0.37355150214592275, + "grad_norm": 0.396484375, + "learning_rate": 4.973834896864013e-06, + "loss": 2.2209, + "step": 6963 + }, + { + "epoch": 0.3736051502145923, + "grad_norm": 0.390625, + "learning_rate": 4.973822359077306e-06, + "loss": 2.3145, + "step": 6964 + }, + { + "epoch": 0.3736587982832618, + "grad_norm": 0.423828125, + "learning_rate": 4.973809818303203e-06, + "loss": 1.395, + "step": 6965 + }, + { + "epoch": 0.37371244635193135, + "grad_norm": 0.392578125, + "learning_rate": 4.973797274541716e-06, + "loss": 2.1085, + "step": 6966 + }, + { + "epoch": 0.3737660944206009, + "grad_norm": 0.35546875, + "learning_rate": 4.973784727792863e-06, + "loss": 2.2008, + "step": 6967 + }, + { + "epoch": 0.3738197424892704, + "grad_norm": 0.435546875, + "learning_rate": 4.973772178056659e-06, + "loss": 2.2881, + "step": 6968 + }, + { + "epoch": 0.3738733905579399, + "grad_norm": 0.60546875, + "learning_rate": 4.973759625333118e-06, + "loss": 2.1359, + "step": 6969 + }, + { + "epoch": 0.3739270386266094, + "grad_norm": 0.380859375, + "learning_rate": 4.973747069622256e-06, + "loss": 2.241, + "step": 6970 + }, + { + "epoch": 0.37398068669527895, + "grad_norm": 0.4296875, + "learning_rate": 4.973734510924087e-06, + "loss": 1.8089, + "step": 6971 + }, + { + "epoch": 0.3740343347639485, + "grad_norm": 0.43359375, + "learning_rate": 4.973721949238627e-06, + "loss": 2.2164, + "step": 6972 + }, + { + "epoch": 0.374087982832618, + "grad_norm": 0.44921875, + "learning_rate": 4.973709384565891e-06, + "loss": 2.3273, + "step": 6973 + }, + { + "epoch": 0.37414163090128755, + "grad_norm": 0.439453125, + "learning_rate": 4.973696816905896e-06, + "loss": 2.2034, + "step": 6974 + }, + { + "epoch": 0.3741952789699571, + "grad_norm": 0.6015625, + "learning_rate": 4.9736842462586535e-06, + "loss": 2.6564, + "step": 6975 + }, + { + "epoch": 0.3742489270386266, + "grad_norm": 0.357421875, + "learning_rate": 4.97367167262418e-06, + "loss": 2.5283, + "step": 6976 + }, + { + "epoch": 0.37430257510729614, + "grad_norm": 0.38671875, + "learning_rate": 4.973659096002493e-06, + "loss": 2.4363, + "step": 6977 + }, + { + "epoch": 0.3743562231759657, + "grad_norm": 0.408203125, + "learning_rate": 4.973646516393607e-06, + "loss": 2.4353, + "step": 6978 + }, + { + "epoch": 0.3744098712446352, + "grad_norm": 0.376953125, + "learning_rate": 4.9736339337975335e-06, + "loss": 2.3223, + "step": 6979 + }, + { + "epoch": 0.37446351931330474, + "grad_norm": 0.388671875, + "learning_rate": 4.973621348214292e-06, + "loss": 2.1807, + "step": 6980 + }, + { + "epoch": 0.37451716738197427, + "grad_norm": 0.484375, + "learning_rate": 4.973608759643896e-06, + "loss": 2.3822, + "step": 6981 + }, + { + "epoch": 0.3745708154506438, + "grad_norm": 0.396484375, + "learning_rate": 4.973596168086361e-06, + "loss": 2.1162, + "step": 6982 + }, + { + "epoch": 0.3746244635193133, + "grad_norm": 0.41796875, + "learning_rate": 4.973583573541701e-06, + "loss": 2.6139, + "step": 6983 + }, + { + "epoch": 0.3746781115879828, + "grad_norm": 0.333984375, + "learning_rate": 4.973570976009933e-06, + "loss": 2.4712, + "step": 6984 + }, + { + "epoch": 0.37473175965665234, + "grad_norm": 0.44921875, + "learning_rate": 4.973558375491071e-06, + "loss": 2.465, + "step": 6985 + }, + { + "epoch": 0.3747854077253219, + "grad_norm": 0.6328125, + "learning_rate": 4.973545771985131e-06, + "loss": 2.4601, + "step": 6986 + }, + { + "epoch": 0.3748390557939914, + "grad_norm": 0.375, + "learning_rate": 4.973533165492127e-06, + "loss": 2.2448, + "step": 6987 + }, + { + "epoch": 0.37489270386266094, + "grad_norm": 0.39453125, + "learning_rate": 4.973520556012076e-06, + "loss": 2.2427, + "step": 6988 + }, + { + "epoch": 0.37494635193133047, + "grad_norm": 0.37109375, + "learning_rate": 4.973507943544992e-06, + "loss": 2.2347, + "step": 6989 + }, + { + "epoch": 0.375, + "grad_norm": 0.392578125, + "learning_rate": 4.973495328090891e-06, + "loss": 2.3712, + "step": 6990 + }, + { + "epoch": 0.37505364806866953, + "grad_norm": 0.37109375, + "learning_rate": 4.973482709649787e-06, + "loss": 2.2415, + "step": 6991 + }, + { + "epoch": 0.37510729613733906, + "grad_norm": 0.3671875, + "learning_rate": 4.9734700882216954e-06, + "loss": 2.3807, + "step": 6992 + }, + { + "epoch": 0.3751609442060086, + "grad_norm": 0.5625, + "learning_rate": 4.973457463806633e-06, + "loss": 2.246, + "step": 6993 + }, + { + "epoch": 0.3752145922746781, + "grad_norm": 0.38671875, + "learning_rate": 4.973444836404615e-06, + "loss": 2.1981, + "step": 6994 + }, + { + "epoch": 0.37526824034334766, + "grad_norm": 0.423828125, + "learning_rate": 4.973432206015655e-06, + "loss": 2.2594, + "step": 6995 + }, + { + "epoch": 0.3753218884120172, + "grad_norm": 0.375, + "learning_rate": 4.973419572639768e-06, + "loss": 1.8943, + "step": 6996 + }, + { + "epoch": 0.3753755364806867, + "grad_norm": 0.345703125, + "learning_rate": 4.973406936276972e-06, + "loss": 2.3893, + "step": 6997 + }, + { + "epoch": 0.3754291845493562, + "grad_norm": 0.38671875, + "learning_rate": 4.973394296927279e-06, + "loss": 1.9093, + "step": 6998 + }, + { + "epoch": 0.37548283261802573, + "grad_norm": 0.486328125, + "learning_rate": 4.973381654590707e-06, + "loss": 2.4733, + "step": 6999 + }, + { + "epoch": 0.37553648068669526, + "grad_norm": 0.392578125, + "learning_rate": 4.97336900926727e-06, + "loss": 2.2757, + "step": 7000 + }, + { + "epoch": 0.3755901287553648, + "grad_norm": 0.345703125, + "learning_rate": 4.973356360956982e-06, + "loss": 2.1916, + "step": 7001 + }, + { + "epoch": 0.3756437768240343, + "grad_norm": 0.40625, + "learning_rate": 4.97334370965986e-06, + "loss": 2.485, + "step": 7002 + }, + { + "epoch": 0.37569742489270386, + "grad_norm": 0.3984375, + "learning_rate": 4.973331055375919e-06, + "loss": 2.0981, + "step": 7003 + }, + { + "epoch": 0.3757510729613734, + "grad_norm": 0.404296875, + "learning_rate": 4.973318398105175e-06, + "loss": 2.2179, + "step": 7004 + }, + { + "epoch": 0.3758047210300429, + "grad_norm": 0.37109375, + "learning_rate": 4.973305737847641e-06, + "loss": 2.274, + "step": 7005 + }, + { + "epoch": 0.37585836909871245, + "grad_norm": 0.59375, + "learning_rate": 4.973293074603335e-06, + "loss": 2.633, + "step": 7006 + }, + { + "epoch": 0.375912017167382, + "grad_norm": 0.38671875, + "learning_rate": 4.9732804083722705e-06, + "loss": 2.1843, + "step": 7007 + }, + { + "epoch": 0.3759656652360515, + "grad_norm": 0.5078125, + "learning_rate": 4.9732677391544635e-06, + "loss": 2.2365, + "step": 7008 + }, + { + "epoch": 0.37601931330472105, + "grad_norm": 0.57421875, + "learning_rate": 4.973255066949929e-06, + "loss": 2.3077, + "step": 7009 + }, + { + "epoch": 0.3760729613733906, + "grad_norm": 0.44921875, + "learning_rate": 4.9732423917586826e-06, + "loss": 2.3912, + "step": 7010 + }, + { + "epoch": 0.3761266094420601, + "grad_norm": 0.486328125, + "learning_rate": 4.973229713580738e-06, + "loss": 2.137, + "step": 7011 + }, + { + "epoch": 0.3761802575107296, + "grad_norm": 0.40234375, + "learning_rate": 4.9732170324161134e-06, + "loss": 2.4725, + "step": 7012 + }, + { + "epoch": 0.3762339055793991, + "grad_norm": 0.435546875, + "learning_rate": 4.9732043482648225e-06, + "loss": 2.2928, + "step": 7013 + }, + { + "epoch": 0.37628755364806865, + "grad_norm": 0.45703125, + "learning_rate": 4.973191661126881e-06, + "loss": 2.6556, + "step": 7014 + }, + { + "epoch": 0.3763412017167382, + "grad_norm": 0.375, + "learning_rate": 4.973178971002304e-06, + "loss": 2.3176, + "step": 7015 + }, + { + "epoch": 0.3763948497854077, + "grad_norm": 0.375, + "learning_rate": 4.973166277891106e-06, + "loss": 2.2521, + "step": 7016 + }, + { + "epoch": 0.37644849785407725, + "grad_norm": 0.486328125, + "learning_rate": 4.973153581793303e-06, + "loss": 2.427, + "step": 7017 + }, + { + "epoch": 0.3765021459227468, + "grad_norm": 1.875, + "learning_rate": 4.973140882708911e-06, + "loss": 2.3426, + "step": 7018 + }, + { + "epoch": 0.3765557939914163, + "grad_norm": 0.3828125, + "learning_rate": 4.973128180637945e-06, + "loss": 2.3357, + "step": 7019 + }, + { + "epoch": 0.37660944206008584, + "grad_norm": 0.796875, + "learning_rate": 4.97311547558042e-06, + "loss": 2.3495, + "step": 7020 + }, + { + "epoch": 0.3766630901287554, + "grad_norm": 0.50390625, + "learning_rate": 4.973102767536352e-06, + "loss": 2.3665, + "step": 7021 + }, + { + "epoch": 0.3767167381974249, + "grad_norm": 0.63671875, + "learning_rate": 4.973090056505755e-06, + "loss": 2.1446, + "step": 7022 + }, + { + "epoch": 0.37677038626609444, + "grad_norm": 0.4140625, + "learning_rate": 4.973077342488645e-06, + "loss": 2.2574, + "step": 7023 + }, + { + "epoch": 0.37682403433476397, + "grad_norm": 0.474609375, + "learning_rate": 4.973064625485038e-06, + "loss": 2.2097, + "step": 7024 + }, + { + "epoch": 0.3768776824034335, + "grad_norm": 0.50390625, + "learning_rate": 4.973051905494949e-06, + "loss": 2.3199, + "step": 7025 + }, + { + "epoch": 0.376931330472103, + "grad_norm": 0.40234375, + "learning_rate": 4.973039182518393e-06, + "loss": 2.4487, + "step": 7026 + }, + { + "epoch": 0.3769849785407725, + "grad_norm": 0.421875, + "learning_rate": 4.973026456555386e-06, + "loss": 2.4638, + "step": 7027 + }, + { + "epoch": 0.37703862660944204, + "grad_norm": 0.349609375, + "learning_rate": 4.973013727605943e-06, + "loss": 2.1403, + "step": 7028 + }, + { + "epoch": 0.37709227467811157, + "grad_norm": 0.4140625, + "learning_rate": 4.9730009956700784e-06, + "loss": 2.3351, + "step": 7029 + }, + { + "epoch": 0.3771459227467811, + "grad_norm": 0.484375, + "learning_rate": 4.97298826074781e-06, + "loss": 2.469, + "step": 7030 + }, + { + "epoch": 0.37719957081545064, + "grad_norm": 0.328125, + "learning_rate": 4.97297552283915e-06, + "loss": 2.0746, + "step": 7031 + }, + { + "epoch": 0.37725321888412017, + "grad_norm": 0.451171875, + "learning_rate": 4.9729627819441165e-06, + "loss": 2.2719, + "step": 7032 + }, + { + "epoch": 0.3773068669527897, + "grad_norm": 0.40234375, + "learning_rate": 4.9729500380627235e-06, + "loss": 2.3857, + "step": 7033 + }, + { + "epoch": 0.37736051502145923, + "grad_norm": 0.404296875, + "learning_rate": 4.972937291194987e-06, + "loss": 2.4285, + "step": 7034 + }, + { + "epoch": 0.37741416309012876, + "grad_norm": 0.326171875, + "learning_rate": 4.9729245413409225e-06, + "loss": 1.9944, + "step": 7035 + }, + { + "epoch": 0.3774678111587983, + "grad_norm": 0.384765625, + "learning_rate": 4.972911788500545e-06, + "loss": 2.0916, + "step": 7036 + }, + { + "epoch": 0.3775214592274678, + "grad_norm": 0.458984375, + "learning_rate": 4.97289903267387e-06, + "loss": 2.5332, + "step": 7037 + }, + { + "epoch": 0.37757510729613736, + "grad_norm": 0.44140625, + "learning_rate": 4.9728862738609115e-06, + "loss": 2.2524, + "step": 7038 + }, + { + "epoch": 0.3776287553648069, + "grad_norm": 0.34765625, + "learning_rate": 4.972873512061688e-06, + "loss": 2.1768, + "step": 7039 + }, + { + "epoch": 0.3776824034334764, + "grad_norm": 0.458984375, + "learning_rate": 4.972860747276213e-06, + "loss": 2.295, + "step": 7040 + }, + { + "epoch": 0.3777360515021459, + "grad_norm": 0.34765625, + "learning_rate": 4.972847979504502e-06, + "loss": 2.0305, + "step": 7041 + }, + { + "epoch": 0.37778969957081543, + "grad_norm": 0.447265625, + "learning_rate": 4.97283520874657e-06, + "loss": 2.2667, + "step": 7042 + }, + { + "epoch": 0.37784334763948496, + "grad_norm": 0.3828125, + "learning_rate": 4.972822435002432e-06, + "loss": 2.4008, + "step": 7043 + }, + { + "epoch": 0.3778969957081545, + "grad_norm": 0.5, + "learning_rate": 4.972809658272106e-06, + "loss": 2.3243, + "step": 7044 + }, + { + "epoch": 0.377950643776824, + "grad_norm": 0.609375, + "learning_rate": 4.9727968785556044e-06, + "loss": 2.3922, + "step": 7045 + }, + { + "epoch": 0.37800429184549356, + "grad_norm": 0.443359375, + "learning_rate": 4.972784095852945e-06, + "loss": 2.2605, + "step": 7046 + }, + { + "epoch": 0.3780579399141631, + "grad_norm": 0.54296875, + "learning_rate": 4.972771310164143e-06, + "loss": 2.1397, + "step": 7047 + }, + { + "epoch": 0.3781115879828326, + "grad_norm": 0.451171875, + "learning_rate": 4.972758521489211e-06, + "loss": 2.338, + "step": 7048 + }, + { + "epoch": 0.37816523605150215, + "grad_norm": 0.443359375, + "learning_rate": 4.972745729828169e-06, + "loss": 2.2055, + "step": 7049 + }, + { + "epoch": 0.3782188841201717, + "grad_norm": 0.37890625, + "learning_rate": 4.972732935181028e-06, + "loss": 2.2838, + "step": 7050 + }, + { + "epoch": 0.3782725321888412, + "grad_norm": 0.42578125, + "learning_rate": 4.972720137547806e-06, + "loss": 2.4065, + "step": 7051 + }, + { + "epoch": 0.37832618025751075, + "grad_norm": 0.33984375, + "learning_rate": 4.9727073369285186e-06, + "loss": 2.028, + "step": 7052 + }, + { + "epoch": 0.3783798283261803, + "grad_norm": 0.390625, + "learning_rate": 4.97269453332318e-06, + "loss": 2.0486, + "step": 7053 + }, + { + "epoch": 0.3784334763948498, + "grad_norm": 0.5234375, + "learning_rate": 4.972681726731806e-06, + "loss": 2.1498, + "step": 7054 + }, + { + "epoch": 0.3784871244635193, + "grad_norm": 0.4296875, + "learning_rate": 4.972668917154412e-06, + "loss": 2.4562, + "step": 7055 + }, + { + "epoch": 0.3785407725321888, + "grad_norm": 0.439453125, + "learning_rate": 4.9726561045910145e-06, + "loss": 2.429, + "step": 7056 + }, + { + "epoch": 0.37859442060085835, + "grad_norm": 0.416015625, + "learning_rate": 4.9726432890416285e-06, + "loss": 2.2175, + "step": 7057 + }, + { + "epoch": 0.3786480686695279, + "grad_norm": 0.388671875, + "learning_rate": 4.9726304705062685e-06, + "loss": 2.405, + "step": 7058 + }, + { + "epoch": 0.3787017167381974, + "grad_norm": 0.43359375, + "learning_rate": 4.972617648984951e-06, + "loss": 2.366, + "step": 7059 + }, + { + "epoch": 0.37875536480686695, + "grad_norm": 0.33203125, + "learning_rate": 4.97260482447769e-06, + "loss": 2.2983, + "step": 7060 + }, + { + "epoch": 0.3788090128755365, + "grad_norm": 0.36328125, + "learning_rate": 4.972591996984504e-06, + "loss": 2.1874, + "step": 7061 + }, + { + "epoch": 0.378862660944206, + "grad_norm": 0.37109375, + "learning_rate": 4.9725791665054056e-06, + "loss": 2.4753, + "step": 7062 + }, + { + "epoch": 0.37891630901287554, + "grad_norm": 0.38671875, + "learning_rate": 4.972566333040411e-06, + "loss": 2.2042, + "step": 7063 + }, + { + "epoch": 0.3789699570815451, + "grad_norm": 0.369140625, + "learning_rate": 4.972553496589537e-06, + "loss": 2.3195, + "step": 7064 + }, + { + "epoch": 0.3790236051502146, + "grad_norm": 0.37109375, + "learning_rate": 4.972540657152798e-06, + "loss": 2.0113, + "step": 7065 + }, + { + "epoch": 0.37907725321888414, + "grad_norm": 0.3671875, + "learning_rate": 4.972527814730209e-06, + "loss": 2.0691, + "step": 7066 + }, + { + "epoch": 0.37913090128755367, + "grad_norm": 0.416015625, + "learning_rate": 4.972514969321787e-06, + "loss": 2.2011, + "step": 7067 + }, + { + "epoch": 0.3791845493562232, + "grad_norm": 0.474609375, + "learning_rate": 4.972502120927546e-06, + "loss": 2.2321, + "step": 7068 + }, + { + "epoch": 0.37923819742489273, + "grad_norm": 0.3515625, + "learning_rate": 4.972489269547503e-06, + "loss": 2.1991, + "step": 7069 + }, + { + "epoch": 0.3792918454935622, + "grad_norm": 0.39453125, + "learning_rate": 4.972476415181671e-06, + "loss": 2.289, + "step": 7070 + }, + { + "epoch": 0.37934549356223174, + "grad_norm": 0.333984375, + "learning_rate": 4.972463557830069e-06, + "loss": 2.1045, + "step": 7071 + }, + { + "epoch": 0.37939914163090127, + "grad_norm": 0.423828125, + "learning_rate": 4.972450697492709e-06, + "loss": 2.2808, + "step": 7072 + }, + { + "epoch": 0.3794527896995708, + "grad_norm": 0.380859375, + "learning_rate": 4.97243783416961e-06, + "loss": 2.2335, + "step": 7073 + }, + { + "epoch": 0.37950643776824033, + "grad_norm": 0.416015625, + "learning_rate": 4.972424967860784e-06, + "loss": 2.0, + "step": 7074 + }, + { + "epoch": 0.37956008583690987, + "grad_norm": 0.3515625, + "learning_rate": 4.97241209856625e-06, + "loss": 2.1722, + "step": 7075 + }, + { + "epoch": 0.3796137339055794, + "grad_norm": 0.486328125, + "learning_rate": 4.972399226286021e-06, + "loss": 2.397, + "step": 7076 + }, + { + "epoch": 0.37966738197424893, + "grad_norm": 0.439453125, + "learning_rate": 4.972386351020114e-06, + "loss": 2.295, + "step": 7077 + }, + { + "epoch": 0.37972103004291846, + "grad_norm": 0.369140625, + "learning_rate": 4.972373472768544e-06, + "loss": 2.1835, + "step": 7078 + }, + { + "epoch": 0.379774678111588, + "grad_norm": 0.37890625, + "learning_rate": 4.972360591531326e-06, + "loss": 2.0656, + "step": 7079 + }, + { + "epoch": 0.3798283261802575, + "grad_norm": 0.515625, + "learning_rate": 4.972347707308476e-06, + "loss": 1.707, + "step": 7080 + }, + { + "epoch": 0.37988197424892706, + "grad_norm": 0.37890625, + "learning_rate": 4.9723348201000095e-06, + "loss": 2.5063, + "step": 7081 + }, + { + "epoch": 0.3799356223175966, + "grad_norm": 0.79296875, + "learning_rate": 4.972321929905943e-06, + "loss": 2.2989, + "step": 7082 + }, + { + "epoch": 0.3799892703862661, + "grad_norm": 1.625, + "learning_rate": 4.972309036726291e-06, + "loss": 2.482, + "step": 7083 + }, + { + "epoch": 0.3800429184549356, + "grad_norm": 0.421875, + "learning_rate": 4.972296140561069e-06, + "loss": 2.4023, + "step": 7084 + }, + { + "epoch": 0.38009656652360513, + "grad_norm": 0.4296875, + "learning_rate": 4.972283241410293e-06, + "loss": 2.182, + "step": 7085 + }, + { + "epoch": 0.38015021459227466, + "grad_norm": 0.421875, + "learning_rate": 4.9722703392739795e-06, + "loss": 2.2363, + "step": 7086 + }, + { + "epoch": 0.3802038626609442, + "grad_norm": 0.400390625, + "learning_rate": 4.972257434152141e-06, + "loss": 2.2699, + "step": 7087 + }, + { + "epoch": 0.3802575107296137, + "grad_norm": 0.466796875, + "learning_rate": 4.972244526044797e-06, + "loss": 2.1328, + "step": 7088 + }, + { + "epoch": 0.38031115879828326, + "grad_norm": 0.44140625, + "learning_rate": 4.972231614951961e-06, + "loss": 2.2785, + "step": 7089 + }, + { + "epoch": 0.3803648068669528, + "grad_norm": 0.35546875, + "learning_rate": 4.972218700873648e-06, + "loss": 2.1698, + "step": 7090 + }, + { + "epoch": 0.3804184549356223, + "grad_norm": 0.474609375, + "learning_rate": 4.972205783809874e-06, + "loss": 2.4779, + "step": 7091 + }, + { + "epoch": 0.38047210300429185, + "grad_norm": 0.4453125, + "learning_rate": 4.972192863760656e-06, + "loss": 2.3522, + "step": 7092 + }, + { + "epoch": 0.3805257510729614, + "grad_norm": 0.376953125, + "learning_rate": 4.97217994072601e-06, + "loss": 2.4875, + "step": 7093 + }, + { + "epoch": 0.3805793991416309, + "grad_norm": 0.68359375, + "learning_rate": 4.972167014705948e-06, + "loss": 2.537, + "step": 7094 + }, + { + "epoch": 0.38063304721030045, + "grad_norm": 0.482421875, + "learning_rate": 4.972154085700489e-06, + "loss": 2.3838, + "step": 7095 + }, + { + "epoch": 0.38068669527897, + "grad_norm": 0.4296875, + "learning_rate": 4.972141153709647e-06, + "loss": 2.3521, + "step": 7096 + }, + { + "epoch": 0.3807403433476395, + "grad_norm": 0.546875, + "learning_rate": 4.972128218733439e-06, + "loss": 2.3617, + "step": 7097 + }, + { + "epoch": 0.380793991416309, + "grad_norm": 0.384765625, + "learning_rate": 4.972115280771878e-06, + "loss": 2.1456, + "step": 7098 + }, + { + "epoch": 0.3808476394849785, + "grad_norm": 0.361328125, + "learning_rate": 4.9721023398249825e-06, + "loss": 2.3124, + "step": 7099 + }, + { + "epoch": 0.38090128755364805, + "grad_norm": 0.384765625, + "learning_rate": 4.972089395892766e-06, + "loss": 2.3804, + "step": 7100 + }, + { + "epoch": 0.3809549356223176, + "grad_norm": 0.359375, + "learning_rate": 4.972076448975246e-06, + "loss": 2.3229, + "step": 7101 + }, + { + "epoch": 0.3810085836909871, + "grad_norm": 0.404296875, + "learning_rate": 4.972063499072437e-06, + "loss": 2.5004, + "step": 7102 + }, + { + "epoch": 0.38106223175965664, + "grad_norm": 0.384765625, + "learning_rate": 4.972050546184355e-06, + "loss": 2.2201, + "step": 7103 + }, + { + "epoch": 0.3811158798283262, + "grad_norm": 0.42578125, + "learning_rate": 4.9720375903110154e-06, + "loss": 2.3234, + "step": 7104 + }, + { + "epoch": 0.3811695278969957, + "grad_norm": 0.421875, + "learning_rate": 4.9720246314524334e-06, + "loss": 2.6284, + "step": 7105 + }, + { + "epoch": 0.38122317596566524, + "grad_norm": 0.431640625, + "learning_rate": 4.972011669608626e-06, + "loss": 2.7362, + "step": 7106 + }, + { + "epoch": 0.38127682403433477, + "grad_norm": 0.43359375, + "learning_rate": 4.971998704779608e-06, + "loss": 2.4781, + "step": 7107 + }, + { + "epoch": 0.3813304721030043, + "grad_norm": 0.390625, + "learning_rate": 4.971985736965395e-06, + "loss": 2.1324, + "step": 7108 + }, + { + "epoch": 0.38138412017167383, + "grad_norm": 0.388671875, + "learning_rate": 4.971972766166002e-06, + "loss": 2.5452, + "step": 7109 + }, + { + "epoch": 0.38143776824034337, + "grad_norm": 0.408203125, + "learning_rate": 4.971959792381446e-06, + "loss": 2.2418, + "step": 7110 + }, + { + "epoch": 0.3814914163090129, + "grad_norm": 0.515625, + "learning_rate": 4.9719468156117415e-06, + "loss": 2.3692, + "step": 7111 + }, + { + "epoch": 0.38154506437768243, + "grad_norm": 0.38671875, + "learning_rate": 4.971933835856906e-06, + "loss": 2.1894, + "step": 7112 + }, + { + "epoch": 0.3815987124463519, + "grad_norm": 0.38671875, + "learning_rate": 4.971920853116952e-06, + "loss": 2.0978, + "step": 7113 + }, + { + "epoch": 0.38165236051502144, + "grad_norm": 0.42578125, + "learning_rate": 4.9719078673919e-06, + "loss": 2.4108, + "step": 7114 + }, + { + "epoch": 0.38170600858369097, + "grad_norm": 0.345703125, + "learning_rate": 4.971894878681761e-06, + "loss": 2.4164, + "step": 7115 + }, + { + "epoch": 0.3817596566523605, + "grad_norm": 0.376953125, + "learning_rate": 4.971881886986552e-06, + "loss": 2.4666, + "step": 7116 + }, + { + "epoch": 0.38181330472103003, + "grad_norm": 0.55078125, + "learning_rate": 4.97186889230629e-06, + "loss": 2.2857, + "step": 7117 + }, + { + "epoch": 0.38186695278969957, + "grad_norm": 0.4140625, + "learning_rate": 4.97185589464099e-06, + "loss": 2.2304, + "step": 7118 + }, + { + "epoch": 0.3819206008583691, + "grad_norm": 0.458984375, + "learning_rate": 4.971842893990667e-06, + "loss": 2.2917, + "step": 7119 + }, + { + "epoch": 0.38197424892703863, + "grad_norm": 0.453125, + "learning_rate": 4.971829890355337e-06, + "loss": 2.2466, + "step": 7120 + }, + { + "epoch": 0.38202789699570816, + "grad_norm": 0.369140625, + "learning_rate": 4.971816883735017e-06, + "loss": 2.4693, + "step": 7121 + }, + { + "epoch": 0.3820815450643777, + "grad_norm": 0.359375, + "learning_rate": 4.971803874129721e-06, + "loss": 2.1536, + "step": 7122 + }, + { + "epoch": 0.3821351931330472, + "grad_norm": 0.412109375, + "learning_rate": 4.971790861539465e-06, + "loss": 2.2441, + "step": 7123 + }, + { + "epoch": 0.38218884120171676, + "grad_norm": 0.41796875, + "learning_rate": 4.971777845964266e-06, + "loss": 2.2809, + "step": 7124 + }, + { + "epoch": 0.3822424892703863, + "grad_norm": 0.498046875, + "learning_rate": 4.971764827404139e-06, + "loss": 2.2723, + "step": 7125 + }, + { + "epoch": 0.3822961373390558, + "grad_norm": 0.423828125, + "learning_rate": 4.971751805859099e-06, + "loss": 2.3634, + "step": 7126 + }, + { + "epoch": 0.3823497854077253, + "grad_norm": 0.376953125, + "learning_rate": 4.971738781329161e-06, + "loss": 2.0454, + "step": 7127 + }, + { + "epoch": 0.3824034334763948, + "grad_norm": 0.41015625, + "learning_rate": 4.971725753814344e-06, + "loss": 2.1559, + "step": 7128 + }, + { + "epoch": 0.38245708154506436, + "grad_norm": 0.419921875, + "learning_rate": 4.971712723314661e-06, + "loss": 2.3429, + "step": 7129 + }, + { + "epoch": 0.3825107296137339, + "grad_norm": 0.39453125, + "learning_rate": 4.971699689830128e-06, + "loss": 2.5033, + "step": 7130 + }, + { + "epoch": 0.3825643776824034, + "grad_norm": 0.435546875, + "learning_rate": 4.9716866533607614e-06, + "loss": 2.2841, + "step": 7131 + }, + { + "epoch": 0.38261802575107295, + "grad_norm": 0.49609375, + "learning_rate": 4.971673613906577e-06, + "loss": 2.2691, + "step": 7132 + }, + { + "epoch": 0.3826716738197425, + "grad_norm": 0.41796875, + "learning_rate": 4.9716605714675906e-06, + "loss": 2.4972, + "step": 7133 + }, + { + "epoch": 0.382725321888412, + "grad_norm": 0.61328125, + "learning_rate": 4.971647526043817e-06, + "loss": 2.3507, + "step": 7134 + }, + { + "epoch": 0.38277896995708155, + "grad_norm": 0.40234375, + "learning_rate": 4.971634477635272e-06, + "loss": 2.3991, + "step": 7135 + }, + { + "epoch": 0.3828326180257511, + "grad_norm": 0.41796875, + "learning_rate": 4.971621426241972e-06, + "loss": 2.3648, + "step": 7136 + }, + { + "epoch": 0.3828862660944206, + "grad_norm": 0.46875, + "learning_rate": 4.971608371863934e-06, + "loss": 1.2331, + "step": 7137 + }, + { + "epoch": 0.38293991416309014, + "grad_norm": 0.404296875, + "learning_rate": 4.971595314501172e-06, + "loss": 2.2686, + "step": 7138 + }, + { + "epoch": 0.3829935622317597, + "grad_norm": 0.384765625, + "learning_rate": 4.971582254153702e-06, + "loss": 2.2696, + "step": 7139 + }, + { + "epoch": 0.3830472103004292, + "grad_norm": 0.34375, + "learning_rate": 4.97156919082154e-06, + "loss": 2.2344, + "step": 7140 + }, + { + "epoch": 0.38310085836909874, + "grad_norm": 0.43359375, + "learning_rate": 4.9715561245047025e-06, + "loss": 2.4161, + "step": 7141 + }, + { + "epoch": 0.3831545064377682, + "grad_norm": 0.37109375, + "learning_rate": 4.971543055203204e-06, + "loss": 2.1406, + "step": 7142 + }, + { + "epoch": 0.38320815450643775, + "grad_norm": 0.51171875, + "learning_rate": 4.971529982917061e-06, + "loss": 2.2736, + "step": 7143 + }, + { + "epoch": 0.3832618025751073, + "grad_norm": 0.369140625, + "learning_rate": 4.971516907646289e-06, + "loss": 2.2636, + "step": 7144 + }, + { + "epoch": 0.3833154506437768, + "grad_norm": 0.47265625, + "learning_rate": 4.971503829390903e-06, + "loss": 1.5563, + "step": 7145 + }, + { + "epoch": 0.38336909871244634, + "grad_norm": 0.41015625, + "learning_rate": 4.9714907481509214e-06, + "loss": 2.1994, + "step": 7146 + }, + { + "epoch": 0.3834227467811159, + "grad_norm": 0.435546875, + "learning_rate": 4.971477663926357e-06, + "loss": 2.1354, + "step": 7147 + }, + { + "epoch": 0.3834763948497854, + "grad_norm": 0.75390625, + "learning_rate": 4.971464576717228e-06, + "loss": 2.3701, + "step": 7148 + }, + { + "epoch": 0.38353004291845494, + "grad_norm": 0.6328125, + "learning_rate": 4.971451486523548e-06, + "loss": 2.0047, + "step": 7149 + }, + { + "epoch": 0.38358369098712447, + "grad_norm": 0.37109375, + "learning_rate": 4.971438393345335e-06, + "loss": 2.2298, + "step": 7150 + }, + { + "epoch": 0.383637339055794, + "grad_norm": 0.41796875, + "learning_rate": 4.971425297182603e-06, + "loss": 2.6139, + "step": 7151 + }, + { + "epoch": 0.38369098712446353, + "grad_norm": 0.435546875, + "learning_rate": 4.971412198035368e-06, + "loss": 2.076, + "step": 7152 + }, + { + "epoch": 0.38374463519313307, + "grad_norm": 0.51171875, + "learning_rate": 4.971399095903647e-06, + "loss": 2.4839, + "step": 7153 + }, + { + "epoch": 0.3837982832618026, + "grad_norm": 0.48046875, + "learning_rate": 4.971385990787456e-06, + "loss": 2.2269, + "step": 7154 + }, + { + "epoch": 0.38385193133047213, + "grad_norm": 0.357421875, + "learning_rate": 4.971372882686809e-06, + "loss": 2.1772, + "step": 7155 + }, + { + "epoch": 0.3839055793991416, + "grad_norm": 0.412109375, + "learning_rate": 4.971359771601723e-06, + "loss": 2.5647, + "step": 7156 + }, + { + "epoch": 0.38395922746781114, + "grad_norm": 0.458984375, + "learning_rate": 4.971346657532214e-06, + "loss": 2.4423, + "step": 7157 + }, + { + "epoch": 0.38401287553648067, + "grad_norm": 0.41796875, + "learning_rate": 4.971333540478297e-06, + "loss": 2.3204, + "step": 7158 + }, + { + "epoch": 0.3840665236051502, + "grad_norm": 0.435546875, + "learning_rate": 4.971320420439989e-06, + "loss": 2.2262, + "step": 7159 + }, + { + "epoch": 0.38412017167381973, + "grad_norm": 0.361328125, + "learning_rate": 4.971307297417305e-06, + "loss": 1.5546, + "step": 7160 + }, + { + "epoch": 0.38417381974248926, + "grad_norm": 0.396484375, + "learning_rate": 4.971294171410261e-06, + "loss": 2.2744, + "step": 7161 + }, + { + "epoch": 0.3842274678111588, + "grad_norm": 0.37890625, + "learning_rate": 4.971281042418873e-06, + "loss": 2.3603, + "step": 7162 + }, + { + "epoch": 0.3842811158798283, + "grad_norm": 0.45703125, + "learning_rate": 4.971267910443156e-06, + "loss": 2.1471, + "step": 7163 + }, + { + "epoch": 0.38433476394849786, + "grad_norm": 0.5625, + "learning_rate": 4.971254775483127e-06, + "loss": 2.218, + "step": 7164 + }, + { + "epoch": 0.3843884120171674, + "grad_norm": 0.359375, + "learning_rate": 4.971241637538802e-06, + "loss": 2.1042, + "step": 7165 + }, + { + "epoch": 0.3844420600858369, + "grad_norm": 0.48828125, + "learning_rate": 4.971228496610196e-06, + "loss": 2.3388, + "step": 7166 + }, + { + "epoch": 0.38449570815450645, + "grad_norm": 0.458984375, + "learning_rate": 4.971215352697325e-06, + "loss": 2.2415, + "step": 7167 + }, + { + "epoch": 0.384549356223176, + "grad_norm": 60.0, + "learning_rate": 4.971202205800205e-06, + "loss": 1.7043, + "step": 7168 + }, + { + "epoch": 0.3846030042918455, + "grad_norm": 0.376953125, + "learning_rate": 4.971189055918853e-06, + "loss": 2.213, + "step": 7169 + }, + { + "epoch": 0.384656652360515, + "grad_norm": 0.38671875, + "learning_rate": 4.971175903053282e-06, + "loss": 2.6604, + "step": 7170 + }, + { + "epoch": 0.3847103004291845, + "grad_norm": 0.39453125, + "learning_rate": 4.971162747203512e-06, + "loss": 2.4183, + "step": 7171 + }, + { + "epoch": 0.38476394849785406, + "grad_norm": 0.625, + "learning_rate": 4.971149588369555e-06, + "loss": 2.1015, + "step": 7172 + }, + { + "epoch": 0.3848175965665236, + "grad_norm": 0.408203125, + "learning_rate": 4.971136426551429e-06, + "loss": 2.0564, + "step": 7173 + }, + { + "epoch": 0.3848712446351931, + "grad_norm": 0.455078125, + "learning_rate": 4.971123261749149e-06, + "loss": 2.3866, + "step": 7174 + }, + { + "epoch": 0.38492489270386265, + "grad_norm": 0.388671875, + "learning_rate": 4.971110093962732e-06, + "loss": 2.0914, + "step": 7175 + }, + { + "epoch": 0.3849785407725322, + "grad_norm": 0.384765625, + "learning_rate": 4.971096923192192e-06, + "loss": 2.1723, + "step": 7176 + }, + { + "epoch": 0.3850321888412017, + "grad_norm": 0.6171875, + "learning_rate": 4.9710837494375475e-06, + "loss": 2.3698, + "step": 7177 + }, + { + "epoch": 0.38508583690987125, + "grad_norm": 0.4921875, + "learning_rate": 4.9710705726988125e-06, + "loss": 2.1224, + "step": 7178 + }, + { + "epoch": 0.3851394849785408, + "grad_norm": 0.6796875, + "learning_rate": 4.971057392976003e-06, + "loss": 1.8444, + "step": 7179 + }, + { + "epoch": 0.3851931330472103, + "grad_norm": 0.44140625, + "learning_rate": 4.971044210269136e-06, + "loss": 2.2913, + "step": 7180 + }, + { + "epoch": 0.38524678111587984, + "grad_norm": 2.65625, + "learning_rate": 4.971031024578226e-06, + "loss": 2.333, + "step": 7181 + }, + { + "epoch": 0.3853004291845494, + "grad_norm": 0.51953125, + "learning_rate": 4.97101783590329e-06, + "loss": 2.4733, + "step": 7182 + }, + { + "epoch": 0.3853540772532189, + "grad_norm": 0.369140625, + "learning_rate": 4.9710046442443434e-06, + "loss": 2.4432, + "step": 7183 + }, + { + "epoch": 0.38540772532188844, + "grad_norm": 0.298828125, + "learning_rate": 4.970991449601403e-06, + "loss": 1.9196, + "step": 7184 + }, + { + "epoch": 0.3854613733905579, + "grad_norm": 0.4609375, + "learning_rate": 4.970978251974483e-06, + "loss": 2.4908, + "step": 7185 + }, + { + "epoch": 0.38551502145922745, + "grad_norm": 0.392578125, + "learning_rate": 4.9709650513636015e-06, + "loss": 2.3079, + "step": 7186 + }, + { + "epoch": 0.385568669527897, + "grad_norm": 0.451171875, + "learning_rate": 4.970951847768773e-06, + "loss": 2.2438, + "step": 7187 + }, + { + "epoch": 0.3856223175965665, + "grad_norm": 0.390625, + "learning_rate": 4.970938641190013e-06, + "loss": 2.2458, + "step": 7188 + }, + { + "epoch": 0.38567596566523604, + "grad_norm": 0.40625, + "learning_rate": 4.9709254316273394e-06, + "loss": 2.4819, + "step": 7189 + }, + { + "epoch": 0.3857296137339056, + "grad_norm": 0.41796875, + "learning_rate": 4.970912219080767e-06, + "loss": 2.272, + "step": 7190 + }, + { + "epoch": 0.3857832618025751, + "grad_norm": 0.365234375, + "learning_rate": 4.9708990035503105e-06, + "loss": 2.1889, + "step": 7191 + }, + { + "epoch": 0.38583690987124464, + "grad_norm": 0.3828125, + "learning_rate": 4.9708857850359884e-06, + "loss": 2.2867, + "step": 7192 + }, + { + "epoch": 0.38589055793991417, + "grad_norm": 0.412109375, + "learning_rate": 4.970872563537814e-06, + "loss": 2.4879, + "step": 7193 + }, + { + "epoch": 0.3859442060085837, + "grad_norm": 0.37109375, + "learning_rate": 4.970859339055806e-06, + "loss": 2.1779, + "step": 7194 + }, + { + "epoch": 0.38599785407725323, + "grad_norm": 0.45703125, + "learning_rate": 4.970846111589979e-06, + "loss": 2.3466, + "step": 7195 + }, + { + "epoch": 0.38605150214592276, + "grad_norm": 0.419921875, + "learning_rate": 4.9708328811403475e-06, + "loss": 2.389, + "step": 7196 + }, + { + "epoch": 0.3861051502145923, + "grad_norm": 0.435546875, + "learning_rate": 4.97081964770693e-06, + "loss": 2.2325, + "step": 7197 + }, + { + "epoch": 0.38615879828326183, + "grad_norm": 0.3984375, + "learning_rate": 4.9708064112897416e-06, + "loss": 2.3294, + "step": 7198 + }, + { + "epoch": 0.3862124463519313, + "grad_norm": 0.369140625, + "learning_rate": 4.970793171888798e-06, + "loss": 2.1591, + "step": 7199 + }, + { + "epoch": 0.38626609442060084, + "grad_norm": 0.3984375, + "learning_rate": 4.9707799295041145e-06, + "loss": 2.4702, + "step": 7200 + }, + { + "epoch": 0.38631974248927037, + "grad_norm": 0.42578125, + "learning_rate": 4.970766684135709e-06, + "loss": 2.2485, + "step": 7201 + }, + { + "epoch": 0.3863733905579399, + "grad_norm": 0.435546875, + "learning_rate": 4.970753435783596e-06, + "loss": 2.277, + "step": 7202 + }, + { + "epoch": 0.38642703862660943, + "grad_norm": 0.447265625, + "learning_rate": 4.970740184447791e-06, + "loss": 2.2314, + "step": 7203 + }, + { + "epoch": 0.38648068669527896, + "grad_norm": 0.4765625, + "learning_rate": 4.9707269301283125e-06, + "loss": 2.0534, + "step": 7204 + }, + { + "epoch": 0.3865343347639485, + "grad_norm": 0.4765625, + "learning_rate": 4.970713672825174e-06, + "loss": 2.5011, + "step": 7205 + }, + { + "epoch": 0.386587982832618, + "grad_norm": 0.40625, + "learning_rate": 4.970700412538393e-06, + "loss": 2.2704, + "step": 7206 + }, + { + "epoch": 0.38664163090128756, + "grad_norm": 0.380859375, + "learning_rate": 4.970687149267984e-06, + "loss": 2.4894, + "step": 7207 + }, + { + "epoch": 0.3866952789699571, + "grad_norm": 0.48046875, + "learning_rate": 4.970673883013964e-06, + "loss": 2.4007, + "step": 7208 + }, + { + "epoch": 0.3867489270386266, + "grad_norm": 0.435546875, + "learning_rate": 4.97066061377635e-06, + "loss": 2.2059, + "step": 7209 + }, + { + "epoch": 0.38680257510729615, + "grad_norm": 0.40234375, + "learning_rate": 4.9706473415551556e-06, + "loss": 2.2692, + "step": 7210 + }, + { + "epoch": 0.3868562231759657, + "grad_norm": 0.4296875, + "learning_rate": 4.9706340663504e-06, + "loss": 2.1618, + "step": 7211 + }, + { + "epoch": 0.3869098712446352, + "grad_norm": 0.4140625, + "learning_rate": 4.970620788162096e-06, + "loss": 2.2394, + "step": 7212 + }, + { + "epoch": 0.3869635193133047, + "grad_norm": 0.33984375, + "learning_rate": 4.970607506990261e-06, + "loss": 2.1964, + "step": 7213 + }, + { + "epoch": 0.3870171673819742, + "grad_norm": 0.51171875, + "learning_rate": 4.970594222834911e-06, + "loss": 2.3603, + "step": 7214 + }, + { + "epoch": 0.38707081545064376, + "grad_norm": 0.361328125, + "learning_rate": 4.970580935696063e-06, + "loss": 2.1607, + "step": 7215 + }, + { + "epoch": 0.3871244635193133, + "grad_norm": 0.3984375, + "learning_rate": 4.970567645573732e-06, + "loss": 2.2467, + "step": 7216 + }, + { + "epoch": 0.3871781115879828, + "grad_norm": 0.375, + "learning_rate": 4.970554352467935e-06, + "loss": 2.4177, + "step": 7217 + }, + { + "epoch": 0.38723175965665235, + "grad_norm": 0.515625, + "learning_rate": 4.970541056378686e-06, + "loss": 2.2152, + "step": 7218 + }, + { + "epoch": 0.3872854077253219, + "grad_norm": 0.447265625, + "learning_rate": 4.970527757306003e-06, + "loss": 2.3034, + "step": 7219 + }, + { + "epoch": 0.3873390557939914, + "grad_norm": 0.828125, + "learning_rate": 4.970514455249901e-06, + "loss": 1.3975, + "step": 7220 + }, + { + "epoch": 0.38739270386266095, + "grad_norm": 0.4140625, + "learning_rate": 4.9705011502103965e-06, + "loss": 2.5222, + "step": 7221 + }, + { + "epoch": 0.3874463519313305, + "grad_norm": 0.478515625, + "learning_rate": 4.970487842187506e-06, + "loss": 2.044, + "step": 7222 + }, + { + "epoch": 0.3875, + "grad_norm": 0.41796875, + "learning_rate": 4.970474531181245e-06, + "loss": 1.9152, + "step": 7223 + }, + { + "epoch": 0.38755364806866954, + "grad_norm": 0.37890625, + "learning_rate": 4.97046121719163e-06, + "loss": 2.3908, + "step": 7224 + }, + { + "epoch": 0.3876072961373391, + "grad_norm": 0.46484375, + "learning_rate": 4.970447900218676e-06, + "loss": 1.8052, + "step": 7225 + }, + { + "epoch": 0.3876609442060086, + "grad_norm": 0.416015625, + "learning_rate": 4.9704345802624e-06, + "loss": 2.2304, + "step": 7226 + }, + { + "epoch": 0.38771459227467814, + "grad_norm": 0.388671875, + "learning_rate": 4.970421257322818e-06, + "loss": 2.3354, + "step": 7227 + }, + { + "epoch": 0.3877682403433476, + "grad_norm": 0.427734375, + "learning_rate": 4.970407931399947e-06, + "loss": 2.4083, + "step": 7228 + }, + { + "epoch": 0.38782188841201715, + "grad_norm": 0.49609375, + "learning_rate": 4.970394602493801e-06, + "loss": 2.2387, + "step": 7229 + }, + { + "epoch": 0.3878755364806867, + "grad_norm": 0.392578125, + "learning_rate": 4.970381270604398e-06, + "loss": 2.0629, + "step": 7230 + }, + { + "epoch": 0.3879291845493562, + "grad_norm": 0.439453125, + "learning_rate": 4.970367935731753e-06, + "loss": 2.2259, + "step": 7231 + }, + { + "epoch": 0.38798283261802574, + "grad_norm": 0.421875, + "learning_rate": 4.9703545978758814e-06, + "loss": 2.1824, + "step": 7232 + }, + { + "epoch": 0.3880364806866953, + "grad_norm": 0.455078125, + "learning_rate": 4.970341257036802e-06, + "loss": 2.3738, + "step": 7233 + }, + { + "epoch": 0.3880901287553648, + "grad_norm": 0.439453125, + "learning_rate": 4.9703279132145285e-06, + "loss": 1.8181, + "step": 7234 + }, + { + "epoch": 0.38814377682403434, + "grad_norm": 0.4140625, + "learning_rate": 4.970314566409077e-06, + "loss": 2.3089, + "step": 7235 + }, + { + "epoch": 0.38819742489270387, + "grad_norm": 0.42578125, + "learning_rate": 4.970301216620466e-06, + "loss": 2.3052, + "step": 7236 + }, + { + "epoch": 0.3882510729613734, + "grad_norm": 0.431640625, + "learning_rate": 4.970287863848709e-06, + "loss": 2.3736, + "step": 7237 + }, + { + "epoch": 0.38830472103004293, + "grad_norm": 0.36328125, + "learning_rate": 4.970274508093823e-06, + "loss": 2.1999, + "step": 7238 + }, + { + "epoch": 0.38835836909871246, + "grad_norm": 0.451171875, + "learning_rate": 4.970261149355824e-06, + "loss": 2.1875, + "step": 7239 + }, + { + "epoch": 0.388412017167382, + "grad_norm": 0.546875, + "learning_rate": 4.970247787634729e-06, + "loss": 2.1045, + "step": 7240 + }, + { + "epoch": 0.3884656652360515, + "grad_norm": 0.3984375, + "learning_rate": 4.970234422930553e-06, + "loss": 2.3745, + "step": 7241 + }, + { + "epoch": 0.388519313304721, + "grad_norm": 0.9609375, + "learning_rate": 4.970221055243314e-06, + "loss": 2.4308, + "step": 7242 + }, + { + "epoch": 0.38857296137339054, + "grad_norm": 0.337890625, + "learning_rate": 4.970207684573025e-06, + "loss": 1.9982, + "step": 7243 + }, + { + "epoch": 0.38862660944206007, + "grad_norm": 0.44921875, + "learning_rate": 4.970194310919705e-06, + "loss": 2.0566, + "step": 7244 + }, + { + "epoch": 0.3886802575107296, + "grad_norm": 0.34375, + "learning_rate": 4.970180934283369e-06, + "loss": 1.6918, + "step": 7245 + }, + { + "epoch": 0.38873390557939913, + "grad_norm": 0.48828125, + "learning_rate": 4.970167554664033e-06, + "loss": 2.5412, + "step": 7246 + }, + { + "epoch": 0.38878755364806866, + "grad_norm": 0.439453125, + "learning_rate": 4.9701541720617145e-06, + "loss": 2.3166, + "step": 7247 + }, + { + "epoch": 0.3888412017167382, + "grad_norm": 0.51171875, + "learning_rate": 4.970140786476427e-06, + "loss": 2.1394, + "step": 7248 + }, + { + "epoch": 0.3888948497854077, + "grad_norm": 0.40625, + "learning_rate": 4.970127397908189e-06, + "loss": 2.2356, + "step": 7249 + }, + { + "epoch": 0.38894849785407726, + "grad_norm": 0.455078125, + "learning_rate": 4.970114006357015e-06, + "loss": 2.2066, + "step": 7250 + }, + { + "epoch": 0.3890021459227468, + "grad_norm": 0.76953125, + "learning_rate": 4.970100611822924e-06, + "loss": 2.0629, + "step": 7251 + }, + { + "epoch": 0.3890557939914163, + "grad_norm": 0.54296875, + "learning_rate": 4.9700872143059285e-06, + "loss": 2.3704, + "step": 7252 + }, + { + "epoch": 0.38910944206008585, + "grad_norm": 0.40234375, + "learning_rate": 4.9700738138060466e-06, + "loss": 2.3735, + "step": 7253 + }, + { + "epoch": 0.3891630901287554, + "grad_norm": 0.5390625, + "learning_rate": 4.970060410323295e-06, + "loss": 2.3912, + "step": 7254 + }, + { + "epoch": 0.3892167381974249, + "grad_norm": 0.439453125, + "learning_rate": 4.970047003857689e-06, + "loss": 2.2663, + "step": 7255 + }, + { + "epoch": 0.38927038626609445, + "grad_norm": 0.4140625, + "learning_rate": 4.970033594409244e-06, + "loss": 1.9004, + "step": 7256 + }, + { + "epoch": 0.3893240343347639, + "grad_norm": 0.35546875, + "learning_rate": 4.970020181977978e-06, + "loss": 2.2093, + "step": 7257 + }, + { + "epoch": 0.38937768240343346, + "grad_norm": 0.373046875, + "learning_rate": 4.970006766563906e-06, + "loss": 2.4144, + "step": 7258 + }, + { + "epoch": 0.389431330472103, + "grad_norm": 0.384765625, + "learning_rate": 4.969993348167046e-06, + "loss": 2.3434, + "step": 7259 + }, + { + "epoch": 0.3894849785407725, + "grad_norm": 0.451171875, + "learning_rate": 4.969979926787411e-06, + "loss": 2.4466, + "step": 7260 + }, + { + "epoch": 0.38953862660944205, + "grad_norm": 0.458984375, + "learning_rate": 4.96996650242502e-06, + "loss": 2.4355, + "step": 7261 + }, + { + "epoch": 0.3895922746781116, + "grad_norm": 0.34765625, + "learning_rate": 4.969953075079887e-06, + "loss": 2.3112, + "step": 7262 + }, + { + "epoch": 0.3896459227467811, + "grad_norm": 0.376953125, + "learning_rate": 4.969939644752031e-06, + "loss": 2.235, + "step": 7263 + }, + { + "epoch": 0.38969957081545065, + "grad_norm": 0.3984375, + "learning_rate": 4.969926211441466e-06, + "loss": 2.0678, + "step": 7264 + }, + { + "epoch": 0.3897532188841202, + "grad_norm": 0.46484375, + "learning_rate": 4.969912775148209e-06, + "loss": 2.3609, + "step": 7265 + }, + { + "epoch": 0.3898068669527897, + "grad_norm": 0.419921875, + "learning_rate": 4.969899335872276e-06, + "loss": 2.0462, + "step": 7266 + }, + { + "epoch": 0.38986051502145924, + "grad_norm": 0.498046875, + "learning_rate": 4.969885893613683e-06, + "loss": 2.344, + "step": 7267 + }, + { + "epoch": 0.3899141630901288, + "grad_norm": 0.4921875, + "learning_rate": 4.969872448372447e-06, + "loss": 2.4514, + "step": 7268 + }, + { + "epoch": 0.3899678111587983, + "grad_norm": 0.36328125, + "learning_rate": 4.969859000148583e-06, + "loss": 2.3005, + "step": 7269 + }, + { + "epoch": 0.39002145922746784, + "grad_norm": 1.1796875, + "learning_rate": 4.969845548942108e-06, + "loss": 2.5793, + "step": 7270 + }, + { + "epoch": 0.3900751072961373, + "grad_norm": 0.40234375, + "learning_rate": 4.969832094753039e-06, + "loss": 2.3775, + "step": 7271 + }, + { + "epoch": 0.39012875536480685, + "grad_norm": 0.41796875, + "learning_rate": 4.9698186375813916e-06, + "loss": 2.0881, + "step": 7272 + }, + { + "epoch": 0.3901824034334764, + "grad_norm": 0.42578125, + "learning_rate": 4.9698051774271815e-06, + "loss": 2.3244, + "step": 7273 + }, + { + "epoch": 0.3902360515021459, + "grad_norm": 0.38671875, + "learning_rate": 4.969791714290425e-06, + "loss": 2.4709, + "step": 7274 + }, + { + "epoch": 0.39028969957081544, + "grad_norm": 0.451171875, + "learning_rate": 4.96977824817114e-06, + "loss": 2.196, + "step": 7275 + }, + { + "epoch": 0.39034334763948497, + "grad_norm": 0.3984375, + "learning_rate": 4.96976477906934e-06, + "loss": 2.2223, + "step": 7276 + }, + { + "epoch": 0.3903969957081545, + "grad_norm": 0.42578125, + "learning_rate": 4.969751306985045e-06, + "loss": 2.3744, + "step": 7277 + }, + { + "epoch": 0.39045064377682404, + "grad_norm": 0.41015625, + "learning_rate": 4.9697378319182665e-06, + "loss": 2.36, + "step": 7278 + }, + { + "epoch": 0.39050429184549357, + "grad_norm": 0.451171875, + "learning_rate": 4.969724353869025e-06, + "loss": 1.4826, + "step": 7279 + }, + { + "epoch": 0.3905579399141631, + "grad_norm": 0.333984375, + "learning_rate": 4.969710872837334e-06, + "loss": 1.8932, + "step": 7280 + }, + { + "epoch": 0.39061158798283263, + "grad_norm": 0.35546875, + "learning_rate": 4.969697388823212e-06, + "loss": 2.1662, + "step": 7281 + }, + { + "epoch": 0.39066523605150216, + "grad_norm": 0.376953125, + "learning_rate": 4.969683901826674e-06, + "loss": 2.3805, + "step": 7282 + }, + { + "epoch": 0.3907188841201717, + "grad_norm": 0.439453125, + "learning_rate": 4.9696704118477355e-06, + "loss": 2.5809, + "step": 7283 + }, + { + "epoch": 0.3907725321888412, + "grad_norm": 0.84375, + "learning_rate": 4.9696569188864156e-06, + "loss": 2.4046, + "step": 7284 + }, + { + "epoch": 0.3908261802575107, + "grad_norm": 0.412109375, + "learning_rate": 4.969643422942727e-06, + "loss": 2.3834, + "step": 7285 + }, + { + "epoch": 0.39087982832618023, + "grad_norm": 0.458984375, + "learning_rate": 4.969629924016688e-06, + "loss": 2.4188, + "step": 7286 + }, + { + "epoch": 0.39093347639484977, + "grad_norm": 0.390625, + "learning_rate": 4.9696164221083155e-06, + "loss": 2.0414, + "step": 7287 + }, + { + "epoch": 0.3909871244635193, + "grad_norm": 0.45703125, + "learning_rate": 4.969602917217624e-06, + "loss": 2.3513, + "step": 7288 + }, + { + "epoch": 0.39104077253218883, + "grad_norm": 0.408203125, + "learning_rate": 4.969589409344632e-06, + "loss": 2.4017, + "step": 7289 + }, + { + "epoch": 0.39109442060085836, + "grad_norm": 0.38671875, + "learning_rate": 4.969575898489352e-06, + "loss": 2.3466, + "step": 7290 + }, + { + "epoch": 0.3911480686695279, + "grad_norm": 0.365234375, + "learning_rate": 4.969562384651806e-06, + "loss": 2.1313, + "step": 7291 + }, + { + "epoch": 0.3912017167381974, + "grad_norm": 0.392578125, + "learning_rate": 4.969548867832006e-06, + "loss": 2.2739, + "step": 7292 + }, + { + "epoch": 0.39125536480686696, + "grad_norm": 0.390625, + "learning_rate": 4.969535348029969e-06, + "loss": 2.3615, + "step": 7293 + }, + { + "epoch": 0.3913090128755365, + "grad_norm": 0.6171875, + "learning_rate": 4.969521825245712e-06, + "loss": 2.2908, + "step": 7294 + }, + { + "epoch": 0.391362660944206, + "grad_norm": 0.50390625, + "learning_rate": 4.969508299479252e-06, + "loss": 2.5669, + "step": 7295 + }, + { + "epoch": 0.39141630901287555, + "grad_norm": 0.37109375, + "learning_rate": 4.969494770730604e-06, + "loss": 2.2607, + "step": 7296 + }, + { + "epoch": 0.3914699570815451, + "grad_norm": 0.396484375, + "learning_rate": 4.969481238999785e-06, + "loss": 2.3488, + "step": 7297 + }, + { + "epoch": 0.3915236051502146, + "grad_norm": 0.39453125, + "learning_rate": 4.969467704286811e-06, + "loss": 1.9909, + "step": 7298 + }, + { + "epoch": 0.39157725321888415, + "grad_norm": 0.478515625, + "learning_rate": 4.969454166591699e-06, + "loss": 2.3813, + "step": 7299 + }, + { + "epoch": 0.3916309012875536, + "grad_norm": 0.408203125, + "learning_rate": 4.969440625914463e-06, + "loss": 2.44, + "step": 7300 + }, + { + "epoch": 0.39168454935622316, + "grad_norm": 0.400390625, + "learning_rate": 4.969427082255123e-06, + "loss": 1.729, + "step": 7301 + }, + { + "epoch": 0.3917381974248927, + "grad_norm": 0.373046875, + "learning_rate": 4.969413535613694e-06, + "loss": 2.5105, + "step": 7302 + }, + { + "epoch": 0.3917918454935622, + "grad_norm": 0.404296875, + "learning_rate": 4.969399985990191e-06, + "loss": 2.2666, + "step": 7303 + }, + { + "epoch": 0.39184549356223175, + "grad_norm": 0.41015625, + "learning_rate": 4.969386433384631e-06, + "loss": 2.3775, + "step": 7304 + }, + { + "epoch": 0.3918991416309013, + "grad_norm": 0.392578125, + "learning_rate": 4.969372877797032e-06, + "loss": 2.2873, + "step": 7305 + }, + { + "epoch": 0.3919527896995708, + "grad_norm": 0.5078125, + "learning_rate": 4.969359319227408e-06, + "loss": 1.8637, + "step": 7306 + }, + { + "epoch": 0.39200643776824035, + "grad_norm": 0.412109375, + "learning_rate": 4.969345757675777e-06, + "loss": 2.2206, + "step": 7307 + }, + { + "epoch": 0.3920600858369099, + "grad_norm": 0.400390625, + "learning_rate": 4.969332193142154e-06, + "loss": 2.311, + "step": 7308 + }, + { + "epoch": 0.3921137339055794, + "grad_norm": 0.390625, + "learning_rate": 4.969318625626557e-06, + "loss": 2.4605, + "step": 7309 + }, + { + "epoch": 0.39216738197424894, + "grad_norm": 0.376953125, + "learning_rate": 4.969305055129002e-06, + "loss": 2.3191, + "step": 7310 + }, + { + "epoch": 0.3922210300429185, + "grad_norm": 0.4375, + "learning_rate": 4.969291481649504e-06, + "loss": 2.434, + "step": 7311 + }, + { + "epoch": 0.392274678111588, + "grad_norm": 0.388671875, + "learning_rate": 4.969277905188081e-06, + "loss": 2.4169, + "step": 7312 + }, + { + "epoch": 0.39232832618025754, + "grad_norm": 0.408203125, + "learning_rate": 4.969264325744748e-06, + "loss": 2.4501, + "step": 7313 + }, + { + "epoch": 0.392381974248927, + "grad_norm": 30.75, + "learning_rate": 4.969250743319523e-06, + "loss": 2.2282, + "step": 7314 + }, + { + "epoch": 0.39243562231759654, + "grad_norm": 0.30859375, + "learning_rate": 4.969237157912421e-06, + "loss": 2.1037, + "step": 7315 + }, + { + "epoch": 0.3924892703862661, + "grad_norm": 0.80859375, + "learning_rate": 4.969223569523459e-06, + "loss": 1.9681, + "step": 7316 + }, + { + "epoch": 0.3925429184549356, + "grad_norm": 0.52734375, + "learning_rate": 4.969209978152653e-06, + "loss": 2.3258, + "step": 7317 + }, + { + "epoch": 0.39259656652360514, + "grad_norm": 0.46875, + "learning_rate": 4.96919638380002e-06, + "loss": 2.797, + "step": 7318 + }, + { + "epoch": 0.39265021459227467, + "grad_norm": 0.423828125, + "learning_rate": 4.969182786465577e-06, + "loss": 2.3412, + "step": 7319 + }, + { + "epoch": 0.3927038626609442, + "grad_norm": 0.6015625, + "learning_rate": 4.969169186149338e-06, + "loss": 2.4259, + "step": 7320 + }, + { + "epoch": 0.39275751072961373, + "grad_norm": 0.37109375, + "learning_rate": 4.969155582851323e-06, + "loss": 2.0552, + "step": 7321 + }, + { + "epoch": 0.39281115879828327, + "grad_norm": 0.36328125, + "learning_rate": 4.969141976571545e-06, + "loss": 2.3111, + "step": 7322 + }, + { + "epoch": 0.3928648068669528, + "grad_norm": 0.455078125, + "learning_rate": 4.969128367310023e-06, + "loss": 2.5277, + "step": 7323 + }, + { + "epoch": 0.39291845493562233, + "grad_norm": 0.384765625, + "learning_rate": 4.9691147550667714e-06, + "loss": 2.1252, + "step": 7324 + }, + { + "epoch": 0.39297210300429186, + "grad_norm": 0.42578125, + "learning_rate": 4.969101139841808e-06, + "loss": 2.102, + "step": 7325 + }, + { + "epoch": 0.3930257510729614, + "grad_norm": 0.41015625, + "learning_rate": 4.969087521635149e-06, + "loss": 2.2946, + "step": 7326 + }, + { + "epoch": 0.3930793991416309, + "grad_norm": 0.390625, + "learning_rate": 4.96907390044681e-06, + "loss": 2.1636, + "step": 7327 + }, + { + "epoch": 0.3931330472103004, + "grad_norm": 0.451171875, + "learning_rate": 4.9690602762768086e-06, + "loss": 2.4976, + "step": 7328 + }, + { + "epoch": 0.39318669527896993, + "grad_norm": 0.3671875, + "learning_rate": 4.969046649125161e-06, + "loss": 2.1733, + "step": 7329 + }, + { + "epoch": 0.39324034334763946, + "grad_norm": 0.474609375, + "learning_rate": 4.969033018991883e-06, + "loss": 1.5897, + "step": 7330 + }, + { + "epoch": 0.393293991416309, + "grad_norm": 0.376953125, + "learning_rate": 4.969019385876991e-06, + "loss": 2.1793, + "step": 7331 + }, + { + "epoch": 0.39334763948497853, + "grad_norm": 0.373046875, + "learning_rate": 4.9690057497805025e-06, + "loss": 2.2858, + "step": 7332 + }, + { + "epoch": 0.39340128755364806, + "grad_norm": 0.3828125, + "learning_rate": 4.968992110702434e-06, + "loss": 2.3371, + "step": 7333 + }, + { + "epoch": 0.3934549356223176, + "grad_norm": 0.392578125, + "learning_rate": 4.9689784686428e-06, + "loss": 2.3665, + "step": 7334 + }, + { + "epoch": 0.3935085836909871, + "grad_norm": 0.40625, + "learning_rate": 4.96896482360162e-06, + "loss": 2.3282, + "step": 7335 + }, + { + "epoch": 0.39356223175965666, + "grad_norm": 0.392578125, + "learning_rate": 4.968951175578908e-06, + "loss": 2.4321, + "step": 7336 + }, + { + "epoch": 0.3936158798283262, + "grad_norm": 0.40625, + "learning_rate": 4.968937524574681e-06, + "loss": 2.195, + "step": 7337 + }, + { + "epoch": 0.3936695278969957, + "grad_norm": 0.412109375, + "learning_rate": 4.968923870588955e-06, + "loss": 2.4622, + "step": 7338 + }, + { + "epoch": 0.39372317596566525, + "grad_norm": 0.341796875, + "learning_rate": 4.968910213621749e-06, + "loss": 2.0999, + "step": 7339 + }, + { + "epoch": 0.3937768240343348, + "grad_norm": 0.4296875, + "learning_rate": 4.9688965536730774e-06, + "loss": 2.2574, + "step": 7340 + }, + { + "epoch": 0.3938304721030043, + "grad_norm": 0.40625, + "learning_rate": 4.968882890742957e-06, + "loss": 2.3325, + "step": 7341 + }, + { + "epoch": 0.39388412017167385, + "grad_norm": 0.486328125, + "learning_rate": 4.968869224831404e-06, + "loss": 2.2449, + "step": 7342 + }, + { + "epoch": 0.3939377682403433, + "grad_norm": 0.453125, + "learning_rate": 4.968855555938435e-06, + "loss": 2.4122, + "step": 7343 + }, + { + "epoch": 0.39399141630901285, + "grad_norm": 0.490234375, + "learning_rate": 4.9688418840640675e-06, + "loss": 2.131, + "step": 7344 + }, + { + "epoch": 0.3940450643776824, + "grad_norm": 0.318359375, + "learning_rate": 4.968828209208316e-06, + "loss": 2.0677, + "step": 7345 + }, + { + "epoch": 0.3940987124463519, + "grad_norm": 0.470703125, + "learning_rate": 4.9688145313711995e-06, + "loss": 2.3562, + "step": 7346 + }, + { + "epoch": 0.39415236051502145, + "grad_norm": 0.431640625, + "learning_rate": 4.968800850552733e-06, + "loss": 2.3312, + "step": 7347 + }, + { + "epoch": 0.394206008583691, + "grad_norm": 0.38671875, + "learning_rate": 4.968787166752934e-06, + "loss": 2.192, + "step": 7348 + }, + { + "epoch": 0.3942596566523605, + "grad_norm": 0.44921875, + "learning_rate": 4.968773479971818e-06, + "loss": 2.4159, + "step": 7349 + }, + { + "epoch": 0.39431330472103004, + "grad_norm": 0.486328125, + "learning_rate": 4.968759790209401e-06, + "loss": 2.1575, + "step": 7350 + }, + { + "epoch": 0.3943669527896996, + "grad_norm": 0.45703125, + "learning_rate": 4.968746097465701e-06, + "loss": 2.3264, + "step": 7351 + }, + { + "epoch": 0.3944206008583691, + "grad_norm": 0.40234375, + "learning_rate": 4.968732401740734e-06, + "loss": 2.116, + "step": 7352 + }, + { + "epoch": 0.39447424892703864, + "grad_norm": 0.439453125, + "learning_rate": 4.968718703034517e-06, + "loss": 2.0924, + "step": 7353 + }, + { + "epoch": 0.39452789699570817, + "grad_norm": 0.365234375, + "learning_rate": 4.968705001347065e-06, + "loss": 2.234, + "step": 7354 + }, + { + "epoch": 0.3945815450643777, + "grad_norm": 0.4296875, + "learning_rate": 4.9686912966783965e-06, + "loss": 2.1706, + "step": 7355 + }, + { + "epoch": 0.39463519313304724, + "grad_norm": 0.404296875, + "learning_rate": 4.9686775890285275e-06, + "loss": 2.358, + "step": 7356 + }, + { + "epoch": 0.3946888412017167, + "grad_norm": 0.4296875, + "learning_rate": 4.968663878397473e-06, + "loss": 2.2694, + "step": 7357 + }, + { + "epoch": 0.39474248927038624, + "grad_norm": 0.46484375, + "learning_rate": 4.968650164785252e-06, + "loss": 2.3791, + "step": 7358 + }, + { + "epoch": 0.3947961373390558, + "grad_norm": 0.408203125, + "learning_rate": 4.968636448191878e-06, + "loss": 2.4247, + "step": 7359 + }, + { + "epoch": 0.3948497854077253, + "grad_norm": 0.412109375, + "learning_rate": 4.968622728617371e-06, + "loss": 2.1346, + "step": 7360 + }, + { + "epoch": 0.39490343347639484, + "grad_norm": 0.373046875, + "learning_rate": 4.968609006061745e-06, + "loss": 2.4299, + "step": 7361 + }, + { + "epoch": 0.39495708154506437, + "grad_norm": 0.3671875, + "learning_rate": 4.968595280525018e-06, + "loss": 2.2468, + "step": 7362 + }, + { + "epoch": 0.3950107296137339, + "grad_norm": 0.39453125, + "learning_rate": 4.968581552007206e-06, + "loss": 2.2949, + "step": 7363 + }, + { + "epoch": 0.39506437768240343, + "grad_norm": 0.404296875, + "learning_rate": 4.9685678205083255e-06, + "loss": 2.388, + "step": 7364 + }, + { + "epoch": 0.39511802575107297, + "grad_norm": 0.53515625, + "learning_rate": 4.968554086028394e-06, + "loss": 2.2039, + "step": 7365 + }, + { + "epoch": 0.3951716738197425, + "grad_norm": 0.380859375, + "learning_rate": 4.968540348567427e-06, + "loss": 2.5327, + "step": 7366 + }, + { + "epoch": 0.39522532188841203, + "grad_norm": 0.388671875, + "learning_rate": 4.96852660812544e-06, + "loss": 2.3383, + "step": 7367 + }, + { + "epoch": 0.39527896995708156, + "grad_norm": 0.431640625, + "learning_rate": 4.968512864702453e-06, + "loss": 2.4501, + "step": 7368 + }, + { + "epoch": 0.3953326180257511, + "grad_norm": 0.40625, + "learning_rate": 4.9684991182984795e-06, + "loss": 2.3988, + "step": 7369 + }, + { + "epoch": 0.3953862660944206, + "grad_norm": 0.47265625, + "learning_rate": 4.968485368913538e-06, + "loss": 2.301, + "step": 7370 + }, + { + "epoch": 0.39543991416309016, + "grad_norm": 0.44921875, + "learning_rate": 4.9684716165476435e-06, + "loss": 1.7452, + "step": 7371 + }, + { + "epoch": 0.39549356223175963, + "grad_norm": 0.380859375, + "learning_rate": 4.968457861200814e-06, + "loss": 2.3989, + "step": 7372 + }, + { + "epoch": 0.39554721030042916, + "grad_norm": 0.462890625, + "learning_rate": 4.9684441028730655e-06, + "loss": 2.319, + "step": 7373 + }, + { + "epoch": 0.3956008583690987, + "grad_norm": 0.408203125, + "learning_rate": 4.968430341564414e-06, + "loss": 2.0956, + "step": 7374 + }, + { + "epoch": 0.3956545064377682, + "grad_norm": 0.373046875, + "learning_rate": 4.9684165772748775e-06, + "loss": 2.2778, + "step": 7375 + }, + { + "epoch": 0.39570815450643776, + "grad_norm": 0.40234375, + "learning_rate": 4.968402810004471e-06, + "loss": 2.3372, + "step": 7376 + }, + { + "epoch": 0.3957618025751073, + "grad_norm": 0.384765625, + "learning_rate": 4.968389039753213e-06, + "loss": 1.8414, + "step": 7377 + }, + { + "epoch": 0.3958154506437768, + "grad_norm": 0.423828125, + "learning_rate": 4.968375266521119e-06, + "loss": 2.3768, + "step": 7378 + }, + { + "epoch": 0.39586909871244635, + "grad_norm": 0.423828125, + "learning_rate": 4.968361490308205e-06, + "loss": 2.3757, + "step": 7379 + }, + { + "epoch": 0.3959227467811159, + "grad_norm": 0.439453125, + "learning_rate": 4.9683477111144895e-06, + "loss": 2.2917, + "step": 7380 + }, + { + "epoch": 0.3959763948497854, + "grad_norm": 0.44140625, + "learning_rate": 4.968333928939987e-06, + "loss": 2.563, + "step": 7381 + }, + { + "epoch": 0.39603004291845495, + "grad_norm": 0.57421875, + "learning_rate": 4.968320143784716e-06, + "loss": 2.3342, + "step": 7382 + }, + { + "epoch": 0.3960836909871245, + "grad_norm": 0.375, + "learning_rate": 4.968306355648692e-06, + "loss": 2.183, + "step": 7383 + }, + { + "epoch": 0.396137339055794, + "grad_norm": 0.6015625, + "learning_rate": 4.9682925645319325e-06, + "loss": 2.1655, + "step": 7384 + }, + { + "epoch": 0.39619098712446355, + "grad_norm": 0.375, + "learning_rate": 4.968278770434453e-06, + "loss": 2.4049, + "step": 7385 + }, + { + "epoch": 0.396244635193133, + "grad_norm": 0.46875, + "learning_rate": 4.968264973356271e-06, + "loss": 2.2691, + "step": 7386 + }, + { + "epoch": 0.39629828326180255, + "grad_norm": 0.365234375, + "learning_rate": 4.968251173297403e-06, + "loss": 2.3374, + "step": 7387 + }, + { + "epoch": 0.3963519313304721, + "grad_norm": 3.078125, + "learning_rate": 4.968237370257866e-06, + "loss": 2.441, + "step": 7388 + }, + { + "epoch": 0.3964055793991416, + "grad_norm": 0.46875, + "learning_rate": 4.968223564237675e-06, + "loss": 1.3755, + "step": 7389 + }, + { + "epoch": 0.39645922746781115, + "grad_norm": 0.392578125, + "learning_rate": 4.968209755236849e-06, + "loss": 2.3905, + "step": 7390 + }, + { + "epoch": 0.3965128755364807, + "grad_norm": 0.3203125, + "learning_rate": 4.968195943255404e-06, + "loss": 2.1306, + "step": 7391 + }, + { + "epoch": 0.3965665236051502, + "grad_norm": 0.431640625, + "learning_rate": 4.968182128293355e-06, + "loss": 2.1477, + "step": 7392 + }, + { + "epoch": 0.39662017167381974, + "grad_norm": 0.47265625, + "learning_rate": 4.968168310350721e-06, + "loss": 2.3758, + "step": 7393 + }, + { + "epoch": 0.3966738197424893, + "grad_norm": 0.408203125, + "learning_rate": 4.968154489427518e-06, + "loss": 2.2042, + "step": 7394 + }, + { + "epoch": 0.3967274678111588, + "grad_norm": 0.58203125, + "learning_rate": 4.9681406655237615e-06, + "loss": 2.4393, + "step": 7395 + }, + { + "epoch": 0.39678111587982834, + "grad_norm": 0.65625, + "learning_rate": 4.96812683863947e-06, + "loss": 2.3782, + "step": 7396 + }, + { + "epoch": 0.39683476394849787, + "grad_norm": 0.4375, + "learning_rate": 4.968113008774659e-06, + "loss": 2.2697, + "step": 7397 + }, + { + "epoch": 0.3968884120171674, + "grad_norm": 0.423828125, + "learning_rate": 4.968099175929345e-06, + "loss": 2.2104, + "step": 7398 + }, + { + "epoch": 0.39694206008583693, + "grad_norm": 0.380859375, + "learning_rate": 4.968085340103545e-06, + "loss": 2.289, + "step": 7399 + }, + { + "epoch": 0.3969957081545064, + "grad_norm": 0.365234375, + "learning_rate": 4.968071501297277e-06, + "loss": 2.2424, + "step": 7400 + }, + { + "epoch": 0.39704935622317594, + "grad_norm": 0.4453125, + "learning_rate": 4.968057659510556e-06, + "loss": 2.1998, + "step": 7401 + }, + { + "epoch": 0.3971030042918455, + "grad_norm": 0.412109375, + "learning_rate": 4.9680438147434e-06, + "loss": 2.1687, + "step": 7402 + }, + { + "epoch": 0.397156652360515, + "grad_norm": 0.89453125, + "learning_rate": 4.968029966995823e-06, + "loss": 1.4124, + "step": 7403 + }, + { + "epoch": 0.39721030042918454, + "grad_norm": 0.421875, + "learning_rate": 4.9680161162678455e-06, + "loss": 2.1902, + "step": 7404 + }, + { + "epoch": 0.39726394849785407, + "grad_norm": 0.392578125, + "learning_rate": 4.968002262559483e-06, + "loss": 2.0362, + "step": 7405 + }, + { + "epoch": 0.3973175965665236, + "grad_norm": 0.439453125, + "learning_rate": 4.96798840587075e-06, + "loss": 2.3283, + "step": 7406 + }, + { + "epoch": 0.39737124463519313, + "grad_norm": 0.4140625, + "learning_rate": 4.967974546201666e-06, + "loss": 2.2574, + "step": 7407 + }, + { + "epoch": 0.39742489270386266, + "grad_norm": 0.37109375, + "learning_rate": 4.967960683552246e-06, + "loss": 2.2498, + "step": 7408 + }, + { + "epoch": 0.3974785407725322, + "grad_norm": 0.373046875, + "learning_rate": 4.967946817922508e-06, + "loss": 2.2291, + "step": 7409 + }, + { + "epoch": 0.39753218884120173, + "grad_norm": 0.73046875, + "learning_rate": 4.967932949312468e-06, + "loss": 2.2969, + "step": 7410 + }, + { + "epoch": 0.39758583690987126, + "grad_norm": 0.388671875, + "learning_rate": 4.967919077722143e-06, + "loss": 2.3941, + "step": 7411 + }, + { + "epoch": 0.3976394849785408, + "grad_norm": 0.369140625, + "learning_rate": 4.967905203151549e-06, + "loss": 2.3276, + "step": 7412 + }, + { + "epoch": 0.3976931330472103, + "grad_norm": 0.40234375, + "learning_rate": 4.967891325600705e-06, + "loss": 2.4569, + "step": 7413 + }, + { + "epoch": 0.39774678111587985, + "grad_norm": 0.435546875, + "learning_rate": 4.967877445069626e-06, + "loss": 2.4053, + "step": 7414 + }, + { + "epoch": 0.39780042918454933, + "grad_norm": 0.392578125, + "learning_rate": 4.9678635615583274e-06, + "loss": 2.3278, + "step": 7415 + }, + { + "epoch": 0.39785407725321886, + "grad_norm": 0.490234375, + "learning_rate": 4.967849675066828e-06, + "loss": 2.3218, + "step": 7416 + }, + { + "epoch": 0.3979077253218884, + "grad_norm": 0.455078125, + "learning_rate": 4.967835785595145e-06, + "loss": 2.625, + "step": 7417 + }, + { + "epoch": 0.3979613733905579, + "grad_norm": 0.6171875, + "learning_rate": 4.967821893143294e-06, + "loss": 1.8014, + "step": 7418 + }, + { + "epoch": 0.39801502145922746, + "grad_norm": 0.390625, + "learning_rate": 4.967807997711291e-06, + "loss": 2.1269, + "step": 7419 + }, + { + "epoch": 0.398068669527897, + "grad_norm": 0.515625, + "learning_rate": 4.967794099299155e-06, + "loss": 2.4252, + "step": 7420 + }, + { + "epoch": 0.3981223175965665, + "grad_norm": 0.375, + "learning_rate": 4.9677801979069006e-06, + "loss": 2.4486, + "step": 7421 + }, + { + "epoch": 0.39817596566523605, + "grad_norm": 0.40625, + "learning_rate": 4.967766293534546e-06, + "loss": 2.3396, + "step": 7422 + }, + { + "epoch": 0.3982296137339056, + "grad_norm": 0.46875, + "learning_rate": 4.967752386182108e-06, + "loss": 2.3152, + "step": 7423 + }, + { + "epoch": 0.3982832618025751, + "grad_norm": 0.447265625, + "learning_rate": 4.967738475849603e-06, + "loss": 2.2264, + "step": 7424 + }, + { + "epoch": 0.39833690987124465, + "grad_norm": 0.4921875, + "learning_rate": 4.9677245625370475e-06, + "loss": 2.231, + "step": 7425 + }, + { + "epoch": 0.3983905579399142, + "grad_norm": 0.41015625, + "learning_rate": 4.967710646244458e-06, + "loss": 2.1343, + "step": 7426 + }, + { + "epoch": 0.3984442060085837, + "grad_norm": 0.431640625, + "learning_rate": 4.967696726971853e-06, + "loss": 2.2769, + "step": 7427 + }, + { + "epoch": 0.39849785407725324, + "grad_norm": 0.453125, + "learning_rate": 4.967682804719247e-06, + "loss": 2.1541, + "step": 7428 + }, + { + "epoch": 0.3985515021459227, + "grad_norm": 0.7109375, + "learning_rate": 4.967668879486659e-06, + "loss": 2.1952, + "step": 7429 + }, + { + "epoch": 0.39860515021459225, + "grad_norm": 0.427734375, + "learning_rate": 4.967654951274104e-06, + "loss": 2.3439, + "step": 7430 + }, + { + "epoch": 0.3986587982832618, + "grad_norm": 0.416015625, + "learning_rate": 4.9676410200816e-06, + "loss": 2.4244, + "step": 7431 + }, + { + "epoch": 0.3987124463519313, + "grad_norm": 0.484375, + "learning_rate": 4.967627085909163e-06, + "loss": 2.481, + "step": 7432 + }, + { + "epoch": 0.39876609442060085, + "grad_norm": 0.51171875, + "learning_rate": 4.967613148756812e-06, + "loss": 2.3418, + "step": 7433 + }, + { + "epoch": 0.3988197424892704, + "grad_norm": 0.3984375, + "learning_rate": 4.96759920862456e-06, + "loss": 1.6314, + "step": 7434 + }, + { + "epoch": 0.3988733905579399, + "grad_norm": 0.40234375, + "learning_rate": 4.967585265512427e-06, + "loss": 2.1913, + "step": 7435 + }, + { + "epoch": 0.39892703862660944, + "grad_norm": 0.396484375, + "learning_rate": 4.9675713194204285e-06, + "loss": 2.2063, + "step": 7436 + }, + { + "epoch": 0.398980686695279, + "grad_norm": 0.380859375, + "learning_rate": 4.967557370348581e-06, + "loss": 2.2336, + "step": 7437 + }, + { + "epoch": 0.3990343347639485, + "grad_norm": 0.392578125, + "learning_rate": 4.967543418296903e-06, + "loss": 2.354, + "step": 7438 + }, + { + "epoch": 0.39908798283261804, + "grad_norm": 0.51953125, + "learning_rate": 4.96752946326541e-06, + "loss": 2.4519, + "step": 7439 + }, + { + "epoch": 0.39914163090128757, + "grad_norm": 0.439453125, + "learning_rate": 4.967515505254119e-06, + "loss": 2.2544, + "step": 7440 + }, + { + "epoch": 0.3991952789699571, + "grad_norm": 0.484375, + "learning_rate": 4.967501544263048e-06, + "loss": 2.3701, + "step": 7441 + }, + { + "epoch": 0.39924892703862663, + "grad_norm": 0.37109375, + "learning_rate": 4.967487580292212e-06, + "loss": 2.4132, + "step": 7442 + }, + { + "epoch": 0.39930257510729616, + "grad_norm": 0.443359375, + "learning_rate": 4.967473613341629e-06, + "loss": 2.2895, + "step": 7443 + }, + { + "epoch": 0.39935622317596564, + "grad_norm": 0.328125, + "learning_rate": 4.9674596434113155e-06, + "loss": 2.0724, + "step": 7444 + }, + { + "epoch": 0.3994098712446352, + "grad_norm": 0.431640625, + "learning_rate": 4.967445670501288e-06, + "loss": 2.4444, + "step": 7445 + }, + { + "epoch": 0.3994635193133047, + "grad_norm": 0.40625, + "learning_rate": 4.967431694611564e-06, + "loss": 2.0914, + "step": 7446 + }, + { + "epoch": 0.39951716738197424, + "grad_norm": 0.63671875, + "learning_rate": 4.967417715742161e-06, + "loss": 2.1023, + "step": 7447 + }, + { + "epoch": 0.39957081545064377, + "grad_norm": 0.380859375, + "learning_rate": 4.9674037338930946e-06, + "loss": 2.3548, + "step": 7448 + }, + { + "epoch": 0.3996244635193133, + "grad_norm": 0.4453125, + "learning_rate": 4.967389749064382e-06, + "loss": 2.485, + "step": 7449 + }, + { + "epoch": 0.39967811158798283, + "grad_norm": 0.484375, + "learning_rate": 4.967375761256041e-06, + "loss": 2.0846, + "step": 7450 + }, + { + "epoch": 0.39973175965665236, + "grad_norm": 0.361328125, + "learning_rate": 4.9673617704680874e-06, + "loss": 2.1224, + "step": 7451 + }, + { + "epoch": 0.3997854077253219, + "grad_norm": 0.43359375, + "learning_rate": 4.967347776700538e-06, + "loss": 2.4797, + "step": 7452 + }, + { + "epoch": 0.3998390557939914, + "grad_norm": 0.5625, + "learning_rate": 4.967333779953411e-06, + "loss": 2.2107, + "step": 7453 + }, + { + "epoch": 0.39989270386266096, + "grad_norm": 0.3984375, + "learning_rate": 4.967319780226722e-06, + "loss": 2.2776, + "step": 7454 + }, + { + "epoch": 0.3999463519313305, + "grad_norm": 0.34765625, + "learning_rate": 4.967305777520488e-06, + "loss": 2.1775, + "step": 7455 + }, + { + "epoch": 0.4, + "grad_norm": 0.443359375, + "learning_rate": 4.967291771834727e-06, + "loss": 2.2956, + "step": 7456 + }, + { + "epoch": 0.40005364806866955, + "grad_norm": 0.4140625, + "learning_rate": 4.9672777631694555e-06, + "loss": 2.1034, + "step": 7457 + }, + { + "epoch": 0.40010729613733903, + "grad_norm": 0.458984375, + "learning_rate": 4.967263751524689e-06, + "loss": 2.2611, + "step": 7458 + }, + { + "epoch": 0.40016094420600856, + "grad_norm": 0.75390625, + "learning_rate": 4.9672497369004465e-06, + "loss": 2.3016, + "step": 7459 + }, + { + "epoch": 0.4002145922746781, + "grad_norm": 1.0078125, + "learning_rate": 4.967235719296744e-06, + "loss": 2.5395, + "step": 7460 + }, + { + "epoch": 0.4002682403433476, + "grad_norm": 0.53515625, + "learning_rate": 4.967221698713598e-06, + "loss": 2.2762, + "step": 7461 + }, + { + "epoch": 0.40032188841201716, + "grad_norm": 0.46484375, + "learning_rate": 4.967207675151026e-06, + "loss": 2.3356, + "step": 7462 + }, + { + "epoch": 0.4003755364806867, + "grad_norm": 0.515625, + "learning_rate": 4.967193648609044e-06, + "loss": 1.7449, + "step": 7463 + }, + { + "epoch": 0.4004291845493562, + "grad_norm": 0.671875, + "learning_rate": 4.96717961908767e-06, + "loss": 2.3367, + "step": 7464 + }, + { + "epoch": 0.40048283261802575, + "grad_norm": 0.431640625, + "learning_rate": 4.967165586586922e-06, + "loss": 2.3295, + "step": 7465 + }, + { + "epoch": 0.4005364806866953, + "grad_norm": 0.4296875, + "learning_rate": 4.967151551106813e-06, + "loss": 2.3868, + "step": 7466 + }, + { + "epoch": 0.4005901287553648, + "grad_norm": 0.423828125, + "learning_rate": 4.967137512647365e-06, + "loss": 2.2897, + "step": 7467 + }, + { + "epoch": 0.40064377682403435, + "grad_norm": 0.365234375, + "learning_rate": 4.9671234712085916e-06, + "loss": 2.0712, + "step": 7468 + }, + { + "epoch": 0.4006974248927039, + "grad_norm": 0.390625, + "learning_rate": 4.967109426790511e-06, + "loss": 2.3328, + "step": 7469 + }, + { + "epoch": 0.4007510729613734, + "grad_norm": 0.435546875, + "learning_rate": 4.967095379393139e-06, + "loss": 2.5105, + "step": 7470 + }, + { + "epoch": 0.40080472103004294, + "grad_norm": 0.408203125, + "learning_rate": 4.967081329016494e-06, + "loss": 2.4181, + "step": 7471 + }, + { + "epoch": 0.4008583690987124, + "grad_norm": 0.365234375, + "learning_rate": 4.967067275660592e-06, + "loss": 2.0838, + "step": 7472 + }, + { + "epoch": 0.40091201716738195, + "grad_norm": 0.39453125, + "learning_rate": 4.9670532193254505e-06, + "loss": 2.1714, + "step": 7473 + }, + { + "epoch": 0.4009656652360515, + "grad_norm": 0.4453125, + "learning_rate": 4.967039160011087e-06, + "loss": 2.4196, + "step": 7474 + }, + { + "epoch": 0.401019313304721, + "grad_norm": 0.64453125, + "learning_rate": 4.9670250977175165e-06, + "loss": 2.1704, + "step": 7475 + }, + { + "epoch": 0.40107296137339055, + "grad_norm": 0.5703125, + "learning_rate": 4.967011032444758e-06, + "loss": 2.1611, + "step": 7476 + }, + { + "epoch": 0.4011266094420601, + "grad_norm": 0.578125, + "learning_rate": 4.9669969641928275e-06, + "loss": 2.2798, + "step": 7477 + }, + { + "epoch": 0.4011802575107296, + "grad_norm": 0.408203125, + "learning_rate": 4.966982892961742e-06, + "loss": 2.3599, + "step": 7478 + }, + { + "epoch": 0.40123390557939914, + "grad_norm": 0.6484375, + "learning_rate": 4.966968818751518e-06, + "loss": 2.4937, + "step": 7479 + }, + { + "epoch": 0.4012875536480687, + "grad_norm": 0.443359375, + "learning_rate": 4.966954741562176e-06, + "loss": 2.4276, + "step": 7480 + }, + { + "epoch": 0.4013412017167382, + "grad_norm": 0.61328125, + "learning_rate": 4.966940661393728e-06, + "loss": 2.147, + "step": 7481 + }, + { + "epoch": 0.40139484978540774, + "grad_norm": 0.388671875, + "learning_rate": 4.966926578246193e-06, + "loss": 2.2426, + "step": 7482 + }, + { + "epoch": 0.40144849785407727, + "grad_norm": 0.51171875, + "learning_rate": 4.966912492119589e-06, + "loss": 2.221, + "step": 7483 + }, + { + "epoch": 0.4015021459227468, + "grad_norm": 0.345703125, + "learning_rate": 4.966898403013932e-06, + "loss": 2.3964, + "step": 7484 + }, + { + "epoch": 0.40155579399141633, + "grad_norm": 0.427734375, + "learning_rate": 4.966884310929239e-06, + "loss": 2.3794, + "step": 7485 + }, + { + "epoch": 0.40160944206008586, + "grad_norm": 0.494140625, + "learning_rate": 4.966870215865528e-06, + "loss": 1.5436, + "step": 7486 + }, + { + "epoch": 0.40166309012875534, + "grad_norm": 0.9375, + "learning_rate": 4.966856117822814e-06, + "loss": 2.3031, + "step": 7487 + }, + { + "epoch": 0.40171673819742487, + "grad_norm": 0.392578125, + "learning_rate": 4.9668420168011165e-06, + "loss": 2.1342, + "step": 7488 + }, + { + "epoch": 0.4017703862660944, + "grad_norm": 0.32421875, + "learning_rate": 4.966827912800451e-06, + "loss": 1.9142, + "step": 7489 + }, + { + "epoch": 0.40182403433476394, + "grad_norm": 0.39453125, + "learning_rate": 4.966813805820835e-06, + "loss": 2.2079, + "step": 7490 + }, + { + "epoch": 0.40187768240343347, + "grad_norm": 0.43359375, + "learning_rate": 4.966799695862285e-06, + "loss": 2.265, + "step": 7491 + }, + { + "epoch": 0.401931330472103, + "grad_norm": 0.375, + "learning_rate": 4.966785582924819e-06, + "loss": 2.329, + "step": 7492 + }, + { + "epoch": 0.40198497854077253, + "grad_norm": 0.416015625, + "learning_rate": 4.966771467008453e-06, + "loss": 2.2872, + "step": 7493 + }, + { + "epoch": 0.40203862660944206, + "grad_norm": 0.443359375, + "learning_rate": 4.966757348113205e-06, + "loss": 2.3151, + "step": 7494 + }, + { + "epoch": 0.4020922746781116, + "grad_norm": 0.43359375, + "learning_rate": 4.966743226239091e-06, + "loss": 2.5184, + "step": 7495 + }, + { + "epoch": 0.4021459227467811, + "grad_norm": 0.419921875, + "learning_rate": 4.966729101386128e-06, + "loss": 2.3053, + "step": 7496 + }, + { + "epoch": 0.40219957081545066, + "grad_norm": 0.375, + "learning_rate": 4.966714973554335e-06, + "loss": 2.1647, + "step": 7497 + }, + { + "epoch": 0.4022532188841202, + "grad_norm": 0.484375, + "learning_rate": 4.966700842743728e-06, + "loss": 2.4846, + "step": 7498 + }, + { + "epoch": 0.4023068669527897, + "grad_norm": 4.46875, + "learning_rate": 4.966686708954323e-06, + "loss": 2.2385, + "step": 7499 + }, + { + "epoch": 0.40236051502145925, + "grad_norm": 0.416015625, + "learning_rate": 4.966672572186138e-06, + "loss": 2.4243, + "step": 7500 + }, + { + "epoch": 0.40241416309012873, + "grad_norm": 0.44921875, + "learning_rate": 4.96665843243919e-06, + "loss": 2.3233, + "step": 7501 + }, + { + "epoch": 0.40246781115879826, + "grad_norm": 0.470703125, + "learning_rate": 4.966644289713496e-06, + "loss": 2.417, + "step": 7502 + }, + { + "epoch": 0.4025214592274678, + "grad_norm": 0.482421875, + "learning_rate": 4.9666301440090735e-06, + "loss": 2.0871, + "step": 7503 + }, + { + "epoch": 0.4025751072961373, + "grad_norm": 0.40625, + "learning_rate": 4.966615995325939e-06, + "loss": 2.3886, + "step": 7504 + }, + { + "epoch": 0.40262875536480686, + "grad_norm": 0.8828125, + "learning_rate": 4.9666018436641094e-06, + "loss": 2.2989, + "step": 7505 + }, + { + "epoch": 0.4026824034334764, + "grad_norm": 0.388671875, + "learning_rate": 4.966587689023602e-06, + "loss": 2.2712, + "step": 7506 + }, + { + "epoch": 0.4027360515021459, + "grad_norm": 0.384765625, + "learning_rate": 4.966573531404435e-06, + "loss": 2.2151, + "step": 7507 + }, + { + "epoch": 0.40278969957081545, + "grad_norm": 0.43359375, + "learning_rate": 4.966559370806624e-06, + "loss": 2.0082, + "step": 7508 + }, + { + "epoch": 0.402843347639485, + "grad_norm": 0.466796875, + "learning_rate": 4.966545207230187e-06, + "loss": 2.3577, + "step": 7509 + }, + { + "epoch": 0.4028969957081545, + "grad_norm": 0.400390625, + "learning_rate": 4.96653104067514e-06, + "loss": 2.2959, + "step": 7510 + }, + { + "epoch": 0.40295064377682405, + "grad_norm": 0.455078125, + "learning_rate": 4.966516871141502e-06, + "loss": 2.3609, + "step": 7511 + }, + { + "epoch": 0.4030042918454936, + "grad_norm": 0.498046875, + "learning_rate": 4.966502698629287e-06, + "loss": 2.2797, + "step": 7512 + }, + { + "epoch": 0.4030579399141631, + "grad_norm": 0.3515625, + "learning_rate": 4.966488523138516e-06, + "loss": 2.1294, + "step": 7513 + }, + { + "epoch": 0.40311158798283264, + "grad_norm": 0.384765625, + "learning_rate": 4.966474344669203e-06, + "loss": 2.0668, + "step": 7514 + }, + { + "epoch": 0.4031652360515021, + "grad_norm": 0.353515625, + "learning_rate": 4.966460163221368e-06, + "loss": 2.3303, + "step": 7515 + }, + { + "epoch": 0.40321888412017165, + "grad_norm": 0.390625, + "learning_rate": 4.966445978795024e-06, + "loss": 2.2955, + "step": 7516 + }, + { + "epoch": 0.4032725321888412, + "grad_norm": 0.41796875, + "learning_rate": 4.966431791390194e-06, + "loss": 2.3716, + "step": 7517 + }, + { + "epoch": 0.4033261802575107, + "grad_norm": 0.5078125, + "learning_rate": 4.966417601006889e-06, + "loss": 1.3964, + "step": 7518 + }, + { + "epoch": 0.40337982832618025, + "grad_norm": 0.474609375, + "learning_rate": 4.96640340764513e-06, + "loss": 2.3582, + "step": 7519 + }, + { + "epoch": 0.4034334763948498, + "grad_norm": 1.015625, + "learning_rate": 4.966389211304932e-06, + "loss": 2.2271, + "step": 7520 + }, + { + "epoch": 0.4034871244635193, + "grad_norm": 0.455078125, + "learning_rate": 4.9663750119863145e-06, + "loss": 2.5406, + "step": 7521 + }, + { + "epoch": 0.40354077253218884, + "grad_norm": 0.77734375, + "learning_rate": 4.9663608096892926e-06, + "loss": 2.4964, + "step": 7522 + }, + { + "epoch": 0.4035944206008584, + "grad_norm": 0.41015625, + "learning_rate": 4.9663466044138834e-06, + "loss": 2.3964, + "step": 7523 + }, + { + "epoch": 0.4036480686695279, + "grad_norm": 0.625, + "learning_rate": 4.966332396160106e-06, + "loss": 2.4112, + "step": 7524 + }, + { + "epoch": 0.40370171673819744, + "grad_norm": 0.37890625, + "learning_rate": 4.9663181849279755e-06, + "loss": 2.4586, + "step": 7525 + }, + { + "epoch": 0.40375536480686697, + "grad_norm": 0.4609375, + "learning_rate": 4.96630397071751e-06, + "loss": 2.0592, + "step": 7526 + }, + { + "epoch": 0.4038090128755365, + "grad_norm": 0.671875, + "learning_rate": 4.966289753528727e-06, + "loss": 2.3415, + "step": 7527 + }, + { + "epoch": 0.40386266094420603, + "grad_norm": 0.41015625, + "learning_rate": 4.966275533361643e-06, + "loss": 2.4209, + "step": 7528 + }, + { + "epoch": 0.40391630901287556, + "grad_norm": 0.419921875, + "learning_rate": 4.966261310216276e-06, + "loss": 2.3484, + "step": 7529 + }, + { + "epoch": 0.40396995708154504, + "grad_norm": 0.396484375, + "learning_rate": 4.966247084092641e-06, + "loss": 2.2792, + "step": 7530 + }, + { + "epoch": 0.40402360515021457, + "grad_norm": 0.41796875, + "learning_rate": 4.966232854990759e-06, + "loss": 2.2975, + "step": 7531 + }, + { + "epoch": 0.4040772532188841, + "grad_norm": 0.37890625, + "learning_rate": 4.966218622910643e-06, + "loss": 2.4558, + "step": 7532 + }, + { + "epoch": 0.40413090128755363, + "grad_norm": 0.45703125, + "learning_rate": 4.966204387852313e-06, + "loss": 2.2867, + "step": 7533 + }, + { + "epoch": 0.40418454935622317, + "grad_norm": 0.349609375, + "learning_rate": 4.9661901498157865e-06, + "loss": 2.3979, + "step": 7534 + }, + { + "epoch": 0.4042381974248927, + "grad_norm": 0.42578125, + "learning_rate": 4.9661759088010774e-06, + "loss": 2.3384, + "step": 7535 + }, + { + "epoch": 0.40429184549356223, + "grad_norm": 0.388671875, + "learning_rate": 4.9661616648082065e-06, + "loss": 2.3229, + "step": 7536 + }, + { + "epoch": 0.40434549356223176, + "grad_norm": 0.357421875, + "learning_rate": 4.966147417837189e-06, + "loss": 2.1328, + "step": 7537 + }, + { + "epoch": 0.4043991416309013, + "grad_norm": 0.40625, + "learning_rate": 4.966133167888042e-06, + "loss": 2.3168, + "step": 7538 + }, + { + "epoch": 0.4044527896995708, + "grad_norm": 0.388671875, + "learning_rate": 4.966118914960785e-06, + "loss": 1.8927, + "step": 7539 + }, + { + "epoch": 0.40450643776824036, + "grad_norm": 0.384765625, + "learning_rate": 4.966104659055432e-06, + "loss": 2.3352, + "step": 7540 + }, + { + "epoch": 0.4045600858369099, + "grad_norm": 0.419921875, + "learning_rate": 4.966090400172002e-06, + "loss": 2.4954, + "step": 7541 + }, + { + "epoch": 0.4046137339055794, + "grad_norm": 1.515625, + "learning_rate": 4.966076138310512e-06, + "loss": 2.0749, + "step": 7542 + }, + { + "epoch": 0.40466738197424895, + "grad_norm": 0.349609375, + "learning_rate": 4.966061873470981e-06, + "loss": 2.2144, + "step": 7543 + }, + { + "epoch": 0.40472103004291843, + "grad_norm": 0.390625, + "learning_rate": 4.9660476056534226e-06, + "loss": 2.3466, + "step": 7544 + }, + { + "epoch": 0.40477467811158796, + "grad_norm": 0.36328125, + "learning_rate": 4.966033334857856e-06, + "loss": 2.0377, + "step": 7545 + }, + { + "epoch": 0.4048283261802575, + "grad_norm": 0.404296875, + "learning_rate": 4.966019061084298e-06, + "loss": 2.3215, + "step": 7546 + }, + { + "epoch": 0.404881974248927, + "grad_norm": 0.482421875, + "learning_rate": 4.966004784332768e-06, + "loss": 2.2873, + "step": 7547 + }, + { + "epoch": 0.40493562231759656, + "grad_norm": 0.427734375, + "learning_rate": 4.9659905046032796e-06, + "loss": 2.2948, + "step": 7548 + }, + { + "epoch": 0.4049892703862661, + "grad_norm": 0.408203125, + "learning_rate": 4.965976221895852e-06, + "loss": 2.4646, + "step": 7549 + }, + { + "epoch": 0.4050429184549356, + "grad_norm": 0.38671875, + "learning_rate": 4.9659619362105025e-06, + "loss": 2.4103, + "step": 7550 + }, + { + "epoch": 0.40509656652360515, + "grad_norm": 0.43359375, + "learning_rate": 4.965947647547248e-06, + "loss": 2.2631, + "step": 7551 + }, + { + "epoch": 0.4051502145922747, + "grad_norm": 0.39453125, + "learning_rate": 4.965933355906106e-06, + "loss": 2.5403, + "step": 7552 + }, + { + "epoch": 0.4052038626609442, + "grad_norm": 0.390625, + "learning_rate": 4.965919061287095e-06, + "loss": 2.2586, + "step": 7553 + }, + { + "epoch": 0.40525751072961375, + "grad_norm": 0.400390625, + "learning_rate": 4.965904763690229e-06, + "loss": 2.3813, + "step": 7554 + }, + { + "epoch": 0.4053111587982833, + "grad_norm": 0.41796875, + "learning_rate": 4.965890463115528e-06, + "loss": 2.1338, + "step": 7555 + }, + { + "epoch": 0.4053648068669528, + "grad_norm": 0.494140625, + "learning_rate": 4.965876159563008e-06, + "loss": 2.3932, + "step": 7556 + }, + { + "epoch": 0.40541845493562234, + "grad_norm": 0.478515625, + "learning_rate": 4.965861853032687e-06, + "loss": 2.411, + "step": 7557 + }, + { + "epoch": 0.4054721030042919, + "grad_norm": 0.39453125, + "learning_rate": 4.965847543524582e-06, + "loss": 1.9338, + "step": 7558 + }, + { + "epoch": 0.40552575107296135, + "grad_norm": 0.435546875, + "learning_rate": 4.96583323103871e-06, + "loss": 2.3214, + "step": 7559 + }, + { + "epoch": 0.4055793991416309, + "grad_norm": 0.443359375, + "learning_rate": 4.965818915575089e-06, + "loss": 2.3319, + "step": 7560 + }, + { + "epoch": 0.4056330472103004, + "grad_norm": 0.39453125, + "learning_rate": 4.965804597133735e-06, + "loss": 2.2465, + "step": 7561 + }, + { + "epoch": 0.40568669527896994, + "grad_norm": 0.4140625, + "learning_rate": 4.965790275714667e-06, + "loss": 2.4386, + "step": 7562 + }, + { + "epoch": 0.4057403433476395, + "grad_norm": 1.3515625, + "learning_rate": 4.965775951317901e-06, + "loss": 2.3129, + "step": 7563 + }, + { + "epoch": 0.405793991416309, + "grad_norm": 0.34375, + "learning_rate": 4.965761623943455e-06, + "loss": 2.1555, + "step": 7564 + }, + { + "epoch": 0.40584763948497854, + "grad_norm": 0.451171875, + "learning_rate": 4.965747293591346e-06, + "loss": 2.0889, + "step": 7565 + }, + { + "epoch": 0.40590128755364807, + "grad_norm": 0.37890625, + "learning_rate": 4.965732960261591e-06, + "loss": 2.2582, + "step": 7566 + }, + { + "epoch": 0.4059549356223176, + "grad_norm": 0.419921875, + "learning_rate": 4.965718623954208e-06, + "loss": 2.571, + "step": 7567 + }, + { + "epoch": 0.40600858369098713, + "grad_norm": 0.38671875, + "learning_rate": 4.965704284669214e-06, + "loss": 2.3344, + "step": 7568 + }, + { + "epoch": 0.40606223175965667, + "grad_norm": 0.51953125, + "learning_rate": 4.965689942406626e-06, + "loss": 2.4312, + "step": 7569 + }, + { + "epoch": 0.4061158798283262, + "grad_norm": 0.404296875, + "learning_rate": 4.9656755971664615e-06, + "loss": 2.2413, + "step": 7570 + }, + { + "epoch": 0.40616952789699573, + "grad_norm": 0.470703125, + "learning_rate": 4.965661248948738e-06, + "loss": 2.3273, + "step": 7571 + }, + { + "epoch": 0.40622317596566526, + "grad_norm": 0.466796875, + "learning_rate": 4.965646897753473e-06, + "loss": 1.8439, + "step": 7572 + }, + { + "epoch": 0.40627682403433474, + "grad_norm": 0.373046875, + "learning_rate": 4.965632543580683e-06, + "loss": 2.2844, + "step": 7573 + }, + { + "epoch": 0.40633047210300427, + "grad_norm": 0.44140625, + "learning_rate": 4.965618186430387e-06, + "loss": 2.636, + "step": 7574 + }, + { + "epoch": 0.4063841201716738, + "grad_norm": 0.45703125, + "learning_rate": 4.9656038263026e-06, + "loss": 1.6258, + "step": 7575 + }, + { + "epoch": 0.40643776824034333, + "grad_norm": 0.443359375, + "learning_rate": 4.965589463197341e-06, + "loss": 2.2664, + "step": 7576 + }, + { + "epoch": 0.40649141630901287, + "grad_norm": 0.42578125, + "learning_rate": 4.9655750971146266e-06, + "loss": 2.1441, + "step": 7577 + }, + { + "epoch": 0.4065450643776824, + "grad_norm": 0.3984375, + "learning_rate": 4.965560728054475e-06, + "loss": 2.3979, + "step": 7578 + }, + { + "epoch": 0.40659871244635193, + "grad_norm": 0.470703125, + "learning_rate": 4.965546356016903e-06, + "loss": 2.1562, + "step": 7579 + }, + { + "epoch": 0.40665236051502146, + "grad_norm": 0.375, + "learning_rate": 4.965531981001928e-06, + "loss": 2.2658, + "step": 7580 + }, + { + "epoch": 0.406706008583691, + "grad_norm": 0.400390625, + "learning_rate": 4.965517603009567e-06, + "loss": 2.1627, + "step": 7581 + }, + { + "epoch": 0.4067596566523605, + "grad_norm": 0.373046875, + "learning_rate": 4.965503222039838e-06, + "loss": 2.2255, + "step": 7582 + }, + { + "epoch": 0.40681330472103006, + "grad_norm": 0.384765625, + "learning_rate": 4.965488838092758e-06, + "loss": 2.4512, + "step": 7583 + }, + { + "epoch": 0.4068669527896996, + "grad_norm": 0.4375, + "learning_rate": 4.965474451168344e-06, + "loss": 2.2997, + "step": 7584 + }, + { + "epoch": 0.4069206008583691, + "grad_norm": 1.453125, + "learning_rate": 4.965460061266615e-06, + "loss": 2.6037, + "step": 7585 + }, + { + "epoch": 0.40697424892703865, + "grad_norm": 0.484375, + "learning_rate": 4.965445668387586e-06, + "loss": 2.3122, + "step": 7586 + }, + { + "epoch": 0.4070278969957081, + "grad_norm": 0.4921875, + "learning_rate": 4.965431272531276e-06, + "loss": 2.3922, + "step": 7587 + }, + { + "epoch": 0.40708154506437766, + "grad_norm": 8.5, + "learning_rate": 4.9654168736977015e-06, + "loss": 2.5364, + "step": 7588 + }, + { + "epoch": 0.4071351931330472, + "grad_norm": 0.408203125, + "learning_rate": 4.965402471886881e-06, + "loss": 2.4207, + "step": 7589 + }, + { + "epoch": 0.4071888412017167, + "grad_norm": 0.478515625, + "learning_rate": 4.965388067098832e-06, + "loss": 2.2744, + "step": 7590 + }, + { + "epoch": 0.40724248927038625, + "grad_norm": 0.4375, + "learning_rate": 4.965373659333569e-06, + "loss": 2.4882, + "step": 7591 + }, + { + "epoch": 0.4072961373390558, + "grad_norm": 0.416015625, + "learning_rate": 4.9653592485911124e-06, + "loss": 2.3192, + "step": 7592 + }, + { + "epoch": 0.4073497854077253, + "grad_norm": 0.451171875, + "learning_rate": 4.9653448348714795e-06, + "loss": 2.2208, + "step": 7593 + }, + { + "epoch": 0.40740343347639485, + "grad_norm": 0.404296875, + "learning_rate": 4.965330418174687e-06, + "loss": 2.1694, + "step": 7594 + }, + { + "epoch": 0.4074570815450644, + "grad_norm": 0.390625, + "learning_rate": 4.965315998500752e-06, + "loss": 2.3548, + "step": 7595 + }, + { + "epoch": 0.4075107296137339, + "grad_norm": 1.2734375, + "learning_rate": 4.965301575849692e-06, + "loss": 1.6959, + "step": 7596 + }, + { + "epoch": 0.40756437768240344, + "grad_norm": 0.421875, + "learning_rate": 4.9652871502215236e-06, + "loss": 2.1819, + "step": 7597 + }, + { + "epoch": 0.407618025751073, + "grad_norm": 0.400390625, + "learning_rate": 4.965272721616266e-06, + "loss": 2.2649, + "step": 7598 + }, + { + "epoch": 0.4076716738197425, + "grad_norm": 0.6796875, + "learning_rate": 4.965258290033936e-06, + "loss": 2.3468, + "step": 7599 + }, + { + "epoch": 0.40772532188841204, + "grad_norm": 0.62109375, + "learning_rate": 4.965243855474551e-06, + "loss": 2.2447, + "step": 7600 + }, + { + "epoch": 0.40777896995708157, + "grad_norm": 0.416015625, + "learning_rate": 4.965229417938128e-06, + "loss": 2.1922, + "step": 7601 + }, + { + "epoch": 0.40783261802575105, + "grad_norm": 0.458984375, + "learning_rate": 4.9652149774246845e-06, + "loss": 1.9172, + "step": 7602 + }, + { + "epoch": 0.4078862660944206, + "grad_norm": 0.59375, + "learning_rate": 4.965200533934238e-06, + "loss": 2.3532, + "step": 7603 + }, + { + "epoch": 0.4079399141630901, + "grad_norm": 0.400390625, + "learning_rate": 4.965186087466807e-06, + "loss": 2.2291, + "step": 7604 + }, + { + "epoch": 0.40799356223175964, + "grad_norm": 0.421875, + "learning_rate": 4.9651716380224076e-06, + "loss": 2.3842, + "step": 7605 + }, + { + "epoch": 0.4080472103004292, + "grad_norm": 0.404296875, + "learning_rate": 4.9651571856010575e-06, + "loss": 2.5248, + "step": 7606 + }, + { + "epoch": 0.4081008583690987, + "grad_norm": 0.5546875, + "learning_rate": 4.9651427302027745e-06, + "loss": 2.252, + "step": 7607 + }, + { + "epoch": 0.40815450643776824, + "grad_norm": 0.396484375, + "learning_rate": 4.965128271827576e-06, + "loss": 2.4874, + "step": 7608 + }, + { + "epoch": 0.40820815450643777, + "grad_norm": 0.392578125, + "learning_rate": 4.965113810475479e-06, + "loss": 2.2475, + "step": 7609 + }, + { + "epoch": 0.4082618025751073, + "grad_norm": 0.3671875, + "learning_rate": 4.965099346146501e-06, + "loss": 2.0152, + "step": 7610 + }, + { + "epoch": 0.40831545064377683, + "grad_norm": 0.3828125, + "learning_rate": 4.96508487884066e-06, + "loss": 2.4385, + "step": 7611 + }, + { + "epoch": 0.40836909871244637, + "grad_norm": 0.361328125, + "learning_rate": 4.965070408557974e-06, + "loss": 2.2002, + "step": 7612 + }, + { + "epoch": 0.4084227467811159, + "grad_norm": 0.408203125, + "learning_rate": 4.96505593529846e-06, + "loss": 2.4325, + "step": 7613 + }, + { + "epoch": 0.40847639484978543, + "grad_norm": 0.4609375, + "learning_rate": 4.9650414590621346e-06, + "loss": 2.5428, + "step": 7614 + }, + { + "epoch": 0.40853004291845496, + "grad_norm": 0.41015625, + "learning_rate": 4.9650269798490155e-06, + "loss": 2.2517, + "step": 7615 + }, + { + "epoch": 0.40858369098712444, + "grad_norm": 0.46875, + "learning_rate": 4.965012497659121e-06, + "loss": 2.4158, + "step": 7616 + }, + { + "epoch": 0.40863733905579397, + "grad_norm": 0.4609375, + "learning_rate": 4.964998012492468e-06, + "loss": 2.4373, + "step": 7617 + }, + { + "epoch": 0.4086909871244635, + "grad_norm": 0.51171875, + "learning_rate": 4.964983524349074e-06, + "loss": 2.4412, + "step": 7618 + }, + { + "epoch": 0.40874463519313303, + "grad_norm": 0.48828125, + "learning_rate": 4.964969033228957e-06, + "loss": 2.2496, + "step": 7619 + }, + { + "epoch": 0.40879828326180256, + "grad_norm": 2.4375, + "learning_rate": 4.964954539132134e-06, + "loss": 2.4937, + "step": 7620 + }, + { + "epoch": 0.4088519313304721, + "grad_norm": 0.39453125, + "learning_rate": 4.964940042058622e-06, + "loss": 2.5788, + "step": 7621 + }, + { + "epoch": 0.4089055793991416, + "grad_norm": 0.37109375, + "learning_rate": 4.964925542008441e-06, + "loss": 2.3454, + "step": 7622 + }, + { + "epoch": 0.40895922746781116, + "grad_norm": 0.431640625, + "learning_rate": 4.9649110389816054e-06, + "loss": 2.3534, + "step": 7623 + }, + { + "epoch": 0.4090128755364807, + "grad_norm": 0.388671875, + "learning_rate": 4.964896532978134e-06, + "loss": 2.3556, + "step": 7624 + }, + { + "epoch": 0.4090665236051502, + "grad_norm": 0.443359375, + "learning_rate": 4.964882023998045e-06, + "loss": 2.4504, + "step": 7625 + }, + { + "epoch": 0.40912017167381975, + "grad_norm": 0.431640625, + "learning_rate": 4.964867512041354e-06, + "loss": 2.2717, + "step": 7626 + }, + { + "epoch": 0.4091738197424893, + "grad_norm": 0.42578125, + "learning_rate": 4.964852997108081e-06, + "loss": 2.351, + "step": 7627 + }, + { + "epoch": 0.4092274678111588, + "grad_norm": 0.408203125, + "learning_rate": 4.964838479198241e-06, + "loss": 2.4237, + "step": 7628 + }, + { + "epoch": 0.40928111587982835, + "grad_norm": 0.47265625, + "learning_rate": 4.964823958311854e-06, + "loss": 2.2908, + "step": 7629 + }, + { + "epoch": 0.4093347639484979, + "grad_norm": 0.400390625, + "learning_rate": 4.964809434448936e-06, + "loss": 2.2917, + "step": 7630 + }, + { + "epoch": 0.40938841201716736, + "grad_norm": 0.392578125, + "learning_rate": 4.964794907609505e-06, + "loss": 2.1542, + "step": 7631 + }, + { + "epoch": 0.4094420600858369, + "grad_norm": 0.40234375, + "learning_rate": 4.964780377793577e-06, + "loss": 2.4253, + "step": 7632 + }, + { + "epoch": 0.4094957081545064, + "grad_norm": 0.439453125, + "learning_rate": 4.964765845001172e-06, + "loss": 2.2689, + "step": 7633 + }, + { + "epoch": 0.40954935622317595, + "grad_norm": 0.3828125, + "learning_rate": 4.964751309232307e-06, + "loss": 2.2593, + "step": 7634 + }, + { + "epoch": 0.4096030042918455, + "grad_norm": 0.400390625, + "learning_rate": 4.964736770487e-06, + "loss": 2.0394, + "step": 7635 + }, + { + "epoch": 0.409656652360515, + "grad_norm": 0.5390625, + "learning_rate": 4.964722228765266e-06, + "loss": 2.4408, + "step": 7636 + }, + { + "epoch": 0.40971030042918455, + "grad_norm": 0.361328125, + "learning_rate": 4.964707684067125e-06, + "loss": 2.139, + "step": 7637 + }, + { + "epoch": 0.4097639484978541, + "grad_norm": 0.3671875, + "learning_rate": 4.964693136392594e-06, + "loss": 2.5411, + "step": 7638 + }, + { + "epoch": 0.4098175965665236, + "grad_norm": 0.40625, + "learning_rate": 4.964678585741689e-06, + "loss": 2.3484, + "step": 7639 + }, + { + "epoch": 0.40987124463519314, + "grad_norm": 0.37109375, + "learning_rate": 4.96466403211443e-06, + "loss": 2.1945, + "step": 7640 + }, + { + "epoch": 0.4099248927038627, + "grad_norm": 0.435546875, + "learning_rate": 4.964649475510833e-06, + "loss": 2.3992, + "step": 7641 + }, + { + "epoch": 0.4099785407725322, + "grad_norm": 1.0703125, + "learning_rate": 4.964634915930916e-06, + "loss": 2.239, + "step": 7642 + }, + { + "epoch": 0.41003218884120174, + "grad_norm": 0.390625, + "learning_rate": 4.964620353374696e-06, + "loss": 1.9396, + "step": 7643 + }, + { + "epoch": 0.41008583690987127, + "grad_norm": 0.35546875, + "learning_rate": 4.964605787842193e-06, + "loss": 2.1863, + "step": 7644 + }, + { + "epoch": 0.41013948497854075, + "grad_norm": 0.345703125, + "learning_rate": 4.964591219333421e-06, + "loss": 2.3344, + "step": 7645 + }, + { + "epoch": 0.4101931330472103, + "grad_norm": 0.419921875, + "learning_rate": 4.964576647848401e-06, + "loss": 2.3338, + "step": 7646 + }, + { + "epoch": 0.4102467811158798, + "grad_norm": 0.439453125, + "learning_rate": 4.964562073387148e-06, + "loss": 2.3376, + "step": 7647 + }, + { + "epoch": 0.41030042918454934, + "grad_norm": 0.404296875, + "learning_rate": 4.964547495949681e-06, + "loss": 2.0967, + "step": 7648 + }, + { + "epoch": 0.4103540772532189, + "grad_norm": 0.474609375, + "learning_rate": 4.964532915536017e-06, + "loss": 2.3033, + "step": 7649 + }, + { + "epoch": 0.4104077253218884, + "grad_norm": 0.390625, + "learning_rate": 4.964518332146175e-06, + "loss": 2.089, + "step": 7650 + }, + { + "epoch": 0.41046137339055794, + "grad_norm": 0.6015625, + "learning_rate": 4.964503745780169e-06, + "loss": 1.9905, + "step": 7651 + }, + { + "epoch": 0.41051502145922747, + "grad_norm": 0.404296875, + "learning_rate": 4.964489156438021e-06, + "loss": 2.0118, + "step": 7652 + }, + { + "epoch": 0.410568669527897, + "grad_norm": 0.439453125, + "learning_rate": 4.964474564119745e-06, + "loss": 2.3988, + "step": 7653 + }, + { + "epoch": 0.41062231759656653, + "grad_norm": 0.470703125, + "learning_rate": 4.964459968825363e-06, + "loss": 2.5322, + "step": 7654 + }, + { + "epoch": 0.41067596566523606, + "grad_norm": 0.56640625, + "learning_rate": 4.964445370554887e-06, + "loss": 2.4261, + "step": 7655 + }, + { + "epoch": 0.4107296137339056, + "grad_norm": 0.765625, + "learning_rate": 4.964430769308339e-06, + "loss": 2.4543, + "step": 7656 + }, + { + "epoch": 0.41078326180257513, + "grad_norm": 0.88671875, + "learning_rate": 4.9644161650857345e-06, + "loss": 2.2798, + "step": 7657 + }, + { + "epoch": 0.41083690987124466, + "grad_norm": 0.4453125, + "learning_rate": 4.964401557887093e-06, + "loss": 2.2786, + "step": 7658 + }, + { + "epoch": 0.41089055793991414, + "grad_norm": 0.44140625, + "learning_rate": 4.96438694771243e-06, + "loss": 2.407, + "step": 7659 + }, + { + "epoch": 0.41094420600858367, + "grad_norm": 0.5, + "learning_rate": 4.964372334561764e-06, + "loss": 2.4661, + "step": 7660 + }, + { + "epoch": 0.4109978540772532, + "grad_norm": 0.48046875, + "learning_rate": 4.964357718435114e-06, + "loss": 2.4781, + "step": 7661 + }, + { + "epoch": 0.41105150214592273, + "grad_norm": 0.38671875, + "learning_rate": 4.964343099332495e-06, + "loss": 2.2903, + "step": 7662 + }, + { + "epoch": 0.41110515021459226, + "grad_norm": 0.51953125, + "learning_rate": 4.964328477253926e-06, + "loss": 2.2847, + "step": 7663 + }, + { + "epoch": 0.4111587982832618, + "grad_norm": 0.3984375, + "learning_rate": 4.964313852199425e-06, + "loss": 2.4231, + "step": 7664 + }, + { + "epoch": 0.4112124463519313, + "grad_norm": 0.408203125, + "learning_rate": 4.964299224169009e-06, + "loss": 2.253, + "step": 7665 + }, + { + "epoch": 0.41126609442060086, + "grad_norm": 0.5234375, + "learning_rate": 4.964284593162697e-06, + "loss": 2.2595, + "step": 7666 + }, + { + "epoch": 0.4113197424892704, + "grad_norm": 0.39453125, + "learning_rate": 4.964269959180505e-06, + "loss": 2.1699, + "step": 7667 + }, + { + "epoch": 0.4113733905579399, + "grad_norm": 0.298828125, + "learning_rate": 4.9642553222224524e-06, + "loss": 1.8595, + "step": 7668 + }, + { + "epoch": 0.41142703862660945, + "grad_norm": 0.37109375, + "learning_rate": 4.964240682288554e-06, + "loss": 2.2549, + "step": 7669 + }, + { + "epoch": 0.411480686695279, + "grad_norm": 0.369140625, + "learning_rate": 4.96422603937883e-06, + "loss": 2.2169, + "step": 7670 + }, + { + "epoch": 0.4115343347639485, + "grad_norm": 0.447265625, + "learning_rate": 4.964211393493298e-06, + "loss": 2.3419, + "step": 7671 + }, + { + "epoch": 0.41158798283261805, + "grad_norm": 0.3125, + "learning_rate": 4.964196744631975e-06, + "loss": 1.9902, + "step": 7672 + }, + { + "epoch": 0.4116416309012876, + "grad_norm": 0.39453125, + "learning_rate": 4.964182092794878e-06, + "loss": 2.1646, + "step": 7673 + }, + { + "epoch": 0.41169527896995706, + "grad_norm": 0.39453125, + "learning_rate": 4.9641674379820265e-06, + "loss": 2.2533, + "step": 7674 + }, + { + "epoch": 0.4117489270386266, + "grad_norm": 0.42578125, + "learning_rate": 4.964152780193437e-06, + "loss": 2.4239, + "step": 7675 + }, + { + "epoch": 0.4118025751072961, + "grad_norm": 0.44921875, + "learning_rate": 4.964138119429128e-06, + "loss": 1.5931, + "step": 7676 + }, + { + "epoch": 0.41185622317596565, + "grad_norm": 0.439453125, + "learning_rate": 4.964123455689115e-06, + "loss": 2.1554, + "step": 7677 + }, + { + "epoch": 0.4119098712446352, + "grad_norm": 0.416015625, + "learning_rate": 4.964108788973418e-06, + "loss": 2.3527, + "step": 7678 + }, + { + "epoch": 0.4119635193133047, + "grad_norm": 0.400390625, + "learning_rate": 4.9640941192820535e-06, + "loss": 2.3007, + "step": 7679 + }, + { + "epoch": 0.41201716738197425, + "grad_norm": 0.37890625, + "learning_rate": 4.96407944661504e-06, + "loss": 2.2453, + "step": 7680 + }, + { + "epoch": 0.4120708154506438, + "grad_norm": 0.52734375, + "learning_rate": 4.964064770972395e-06, + "loss": 2.4764, + "step": 7681 + }, + { + "epoch": 0.4121244635193133, + "grad_norm": 0.65625, + "learning_rate": 4.964050092354136e-06, + "loss": 2.282, + "step": 7682 + }, + { + "epoch": 0.41217811158798284, + "grad_norm": 0.486328125, + "learning_rate": 4.964035410760281e-06, + "loss": 2.365, + "step": 7683 + }, + { + "epoch": 0.4122317596566524, + "grad_norm": 0.421875, + "learning_rate": 4.964020726190848e-06, + "loss": 2.4713, + "step": 7684 + }, + { + "epoch": 0.4122854077253219, + "grad_norm": 0.3984375, + "learning_rate": 4.964006038645854e-06, + "loss": 2.1981, + "step": 7685 + }, + { + "epoch": 0.41233905579399144, + "grad_norm": 0.486328125, + "learning_rate": 4.963991348125317e-06, + "loss": 2.2168, + "step": 7686 + }, + { + "epoch": 0.41239270386266097, + "grad_norm": 0.47265625, + "learning_rate": 4.9639766546292545e-06, + "loss": 2.3385, + "step": 7687 + }, + { + "epoch": 0.41244635193133045, + "grad_norm": 0.515625, + "learning_rate": 4.963961958157685e-06, + "loss": 2.3988, + "step": 7688 + }, + { + "epoch": 0.4125, + "grad_norm": 0.37109375, + "learning_rate": 4.963947258710626e-06, + "loss": 2.1808, + "step": 7689 + }, + { + "epoch": 0.4125536480686695, + "grad_norm": 0.4765625, + "learning_rate": 4.9639325562880945e-06, + "loss": 2.5911, + "step": 7690 + }, + { + "epoch": 0.41260729613733904, + "grad_norm": 0.41015625, + "learning_rate": 4.96391785089011e-06, + "loss": 2.4672, + "step": 7691 + }, + { + "epoch": 0.4126609442060086, + "grad_norm": 0.43359375, + "learning_rate": 4.963903142516688e-06, + "loss": 2.2612, + "step": 7692 + }, + { + "epoch": 0.4127145922746781, + "grad_norm": 0.408203125, + "learning_rate": 4.963888431167847e-06, + "loss": 2.4462, + "step": 7693 + }, + { + "epoch": 0.41276824034334764, + "grad_norm": 0.396484375, + "learning_rate": 4.963873716843606e-06, + "loss": 2.2483, + "step": 7694 + }, + { + "epoch": 0.41282188841201717, + "grad_norm": 0.40234375, + "learning_rate": 4.9638589995439816e-06, + "loss": 2.2695, + "step": 7695 + }, + { + "epoch": 0.4128755364806867, + "grad_norm": 0.4296875, + "learning_rate": 4.9638442792689914e-06, + "loss": 2.5265, + "step": 7696 + }, + { + "epoch": 0.41292918454935623, + "grad_norm": 0.37890625, + "learning_rate": 4.963829556018654e-06, + "loss": 2.443, + "step": 7697 + }, + { + "epoch": 0.41298283261802576, + "grad_norm": 0.40625, + "learning_rate": 4.9638148297929864e-06, + "loss": 2.2969, + "step": 7698 + }, + { + "epoch": 0.4130364806866953, + "grad_norm": 0.453125, + "learning_rate": 4.963800100592008e-06, + "loss": 2.0775, + "step": 7699 + }, + { + "epoch": 0.4130901287553648, + "grad_norm": 0.3359375, + "learning_rate": 4.963785368415734e-06, + "loss": 1.9737, + "step": 7700 + }, + { + "epoch": 0.41314377682403436, + "grad_norm": 0.42578125, + "learning_rate": 4.963770633264184e-06, + "loss": 2.2606, + "step": 7701 + }, + { + "epoch": 0.41319742489270384, + "grad_norm": 0.3671875, + "learning_rate": 4.963755895137376e-06, + "loss": 2.4266, + "step": 7702 + }, + { + "epoch": 0.41325107296137337, + "grad_norm": 0.447265625, + "learning_rate": 4.963741154035326e-06, + "loss": 1.1922, + "step": 7703 + }, + { + "epoch": 0.4133047210300429, + "grad_norm": 0.412109375, + "learning_rate": 4.9637264099580535e-06, + "loss": 2.5499, + "step": 7704 + }, + { + "epoch": 0.41335836909871243, + "grad_norm": 0.451171875, + "learning_rate": 4.963711662905576e-06, + "loss": 2.3187, + "step": 7705 + }, + { + "epoch": 0.41341201716738196, + "grad_norm": 0.55859375, + "learning_rate": 4.96369691287791e-06, + "loss": 1.6386, + "step": 7706 + }, + { + "epoch": 0.4134656652360515, + "grad_norm": 0.4375, + "learning_rate": 4.9636821598750754e-06, + "loss": 2.5349, + "step": 7707 + }, + { + "epoch": 0.413519313304721, + "grad_norm": 0.412109375, + "learning_rate": 4.963667403897089e-06, + "loss": 2.193, + "step": 7708 + }, + { + "epoch": 0.41357296137339056, + "grad_norm": 0.56640625, + "learning_rate": 4.963652644943968e-06, + "loss": 2.1406, + "step": 7709 + }, + { + "epoch": 0.4136266094420601, + "grad_norm": 0.388671875, + "learning_rate": 4.963637883015732e-06, + "loss": 2.4921, + "step": 7710 + }, + { + "epoch": 0.4136802575107296, + "grad_norm": 0.34765625, + "learning_rate": 4.963623118112396e-06, + "loss": 1.9087, + "step": 7711 + }, + { + "epoch": 0.41373390557939915, + "grad_norm": 0.416015625, + "learning_rate": 4.96360835023398e-06, + "loss": 2.1692, + "step": 7712 + }, + { + "epoch": 0.4137875536480687, + "grad_norm": 0.4609375, + "learning_rate": 4.963593579380502e-06, + "loss": 2.4312, + "step": 7713 + }, + { + "epoch": 0.4138412017167382, + "grad_norm": 0.470703125, + "learning_rate": 4.9635788055519795e-06, + "loss": 2.3612, + "step": 7714 + }, + { + "epoch": 0.41389484978540775, + "grad_norm": 0.39453125, + "learning_rate": 4.963564028748429e-06, + "loss": 2.2987, + "step": 7715 + }, + { + "epoch": 0.4139484978540773, + "grad_norm": 0.39453125, + "learning_rate": 4.96354924896987e-06, + "loss": 2.4443, + "step": 7716 + }, + { + "epoch": 0.41400214592274676, + "grad_norm": 0.423828125, + "learning_rate": 4.963534466216319e-06, + "loss": 2.3827, + "step": 7717 + }, + { + "epoch": 0.4140557939914163, + "grad_norm": 0.73828125, + "learning_rate": 4.963519680487795e-06, + "loss": 2.2757, + "step": 7718 + }, + { + "epoch": 0.4141094420600858, + "grad_norm": 0.48046875, + "learning_rate": 4.963504891784315e-06, + "loss": 2.4528, + "step": 7719 + }, + { + "epoch": 0.41416309012875535, + "grad_norm": 0.392578125, + "learning_rate": 4.963490100105898e-06, + "loss": 2.2931, + "step": 7720 + }, + { + "epoch": 0.4142167381974249, + "grad_norm": 0.361328125, + "learning_rate": 4.963475305452561e-06, + "loss": 2.0861, + "step": 7721 + }, + { + "epoch": 0.4142703862660944, + "grad_norm": 0.39453125, + "learning_rate": 4.9634605078243214e-06, + "loss": 2.2782, + "step": 7722 + }, + { + "epoch": 0.41432403433476395, + "grad_norm": 0.431640625, + "learning_rate": 4.963445707221198e-06, + "loss": 2.228, + "step": 7723 + }, + { + "epoch": 0.4143776824034335, + "grad_norm": 0.375, + "learning_rate": 4.963430903643209e-06, + "loss": 2.4299, + "step": 7724 + }, + { + "epoch": 0.414431330472103, + "grad_norm": 0.375, + "learning_rate": 4.96341609709037e-06, + "loss": 2.3047, + "step": 7725 + }, + { + "epoch": 0.41448497854077254, + "grad_norm": 0.447265625, + "learning_rate": 4.963401287562702e-06, + "loss": 2.3025, + "step": 7726 + }, + { + "epoch": 0.4145386266094421, + "grad_norm": 0.396484375, + "learning_rate": 4.96338647506022e-06, + "loss": 2.3139, + "step": 7727 + }, + { + "epoch": 0.4145922746781116, + "grad_norm": 0.4140625, + "learning_rate": 4.963371659582944e-06, + "loss": 1.8135, + "step": 7728 + }, + { + "epoch": 0.41464592274678114, + "grad_norm": 0.458984375, + "learning_rate": 4.963356841130892e-06, + "loss": 2.4367, + "step": 7729 + }, + { + "epoch": 0.41469957081545067, + "grad_norm": 0.68359375, + "learning_rate": 4.96334201970408e-06, + "loss": 2.3834, + "step": 7730 + }, + { + "epoch": 0.41475321888412015, + "grad_norm": 0.5390625, + "learning_rate": 4.963327195302527e-06, + "loss": 2.4146, + "step": 7731 + }, + { + "epoch": 0.4148068669527897, + "grad_norm": 0.423828125, + "learning_rate": 4.963312367926251e-06, + "loss": 2.3102, + "step": 7732 + }, + { + "epoch": 0.4148605150214592, + "grad_norm": 0.44140625, + "learning_rate": 4.96329753757527e-06, + "loss": 2.4304, + "step": 7733 + }, + { + "epoch": 0.41491416309012874, + "grad_norm": 0.337890625, + "learning_rate": 4.9632827042496015e-06, + "loss": 2.0753, + "step": 7734 + }, + { + "epoch": 0.41496781115879827, + "grad_norm": 0.3984375, + "learning_rate": 4.963267867949263e-06, + "loss": 2.5131, + "step": 7735 + }, + { + "epoch": 0.4150214592274678, + "grad_norm": 0.48828125, + "learning_rate": 4.963253028674274e-06, + "loss": 2.2729, + "step": 7736 + }, + { + "epoch": 0.41507510729613734, + "grad_norm": 0.404296875, + "learning_rate": 4.96323818642465e-06, + "loss": 2.3056, + "step": 7737 + }, + { + "epoch": 0.41512875536480687, + "grad_norm": 0.396484375, + "learning_rate": 4.9632233412004114e-06, + "loss": 2.2235, + "step": 7738 + }, + { + "epoch": 0.4151824034334764, + "grad_norm": 0.3515625, + "learning_rate": 4.963208493001576e-06, + "loss": 2.2733, + "step": 7739 + }, + { + "epoch": 0.41523605150214593, + "grad_norm": 0.3515625, + "learning_rate": 4.963193641828159e-06, + "loss": 2.1338, + "step": 7740 + }, + { + "epoch": 0.41528969957081546, + "grad_norm": 0.5703125, + "learning_rate": 4.963178787680181e-06, + "loss": 2.1552, + "step": 7741 + }, + { + "epoch": 0.415343347639485, + "grad_norm": 0.51171875, + "learning_rate": 4.963163930557658e-06, + "loss": 2.1292, + "step": 7742 + }, + { + "epoch": 0.4153969957081545, + "grad_norm": 0.423828125, + "learning_rate": 4.963149070460611e-06, + "loss": 2.2121, + "step": 7743 + }, + { + "epoch": 0.41545064377682406, + "grad_norm": 0.4140625, + "learning_rate": 4.963134207389054e-06, + "loss": 2.2801, + "step": 7744 + }, + { + "epoch": 0.4155042918454936, + "grad_norm": 0.47265625, + "learning_rate": 4.963119341343008e-06, + "loss": 2.2281, + "step": 7745 + }, + { + "epoch": 0.41555793991416307, + "grad_norm": 0.5625, + "learning_rate": 4.96310447232249e-06, + "loss": 2.2248, + "step": 7746 + }, + { + "epoch": 0.4156115879828326, + "grad_norm": 0.392578125, + "learning_rate": 4.9630896003275175e-06, + "loss": 2.1441, + "step": 7747 + }, + { + "epoch": 0.41566523605150213, + "grad_norm": 0.470703125, + "learning_rate": 4.963074725358108e-06, + "loss": 2.4018, + "step": 7748 + }, + { + "epoch": 0.41571888412017166, + "grad_norm": 0.423828125, + "learning_rate": 4.963059847414281e-06, + "loss": 2.262, + "step": 7749 + }, + { + "epoch": 0.4157725321888412, + "grad_norm": 0.33203125, + "learning_rate": 4.963044966496054e-06, + "loss": 2.0267, + "step": 7750 + }, + { + "epoch": 0.4158261802575107, + "grad_norm": 0.66796875, + "learning_rate": 4.963030082603444e-06, + "loss": 2.2718, + "step": 7751 + }, + { + "epoch": 0.41587982832618026, + "grad_norm": 0.390625, + "learning_rate": 4.9630151957364705e-06, + "loss": 2.2832, + "step": 7752 + }, + { + "epoch": 0.4159334763948498, + "grad_norm": 0.5, + "learning_rate": 4.96300030589515e-06, + "loss": 2.2266, + "step": 7753 + }, + { + "epoch": 0.4159871244635193, + "grad_norm": 0.388671875, + "learning_rate": 4.962985413079501e-06, + "loss": 2.2332, + "step": 7754 + }, + { + "epoch": 0.41604077253218885, + "grad_norm": 0.55859375, + "learning_rate": 4.9629705172895415e-06, + "loss": 2.309, + "step": 7755 + }, + { + "epoch": 0.4160944206008584, + "grad_norm": 0.376953125, + "learning_rate": 4.96295561852529e-06, + "loss": 2.3672, + "step": 7756 + }, + { + "epoch": 0.4161480686695279, + "grad_norm": 0.40625, + "learning_rate": 4.9629407167867634e-06, + "loss": 2.2445, + "step": 7757 + }, + { + "epoch": 0.41620171673819745, + "grad_norm": 0.392578125, + "learning_rate": 4.962925812073981e-06, + "loss": 2.0372, + "step": 7758 + }, + { + "epoch": 0.416255364806867, + "grad_norm": 0.474609375, + "learning_rate": 4.96291090438696e-06, + "loss": 2.2352, + "step": 7759 + }, + { + "epoch": 0.41630901287553645, + "grad_norm": 0.42578125, + "learning_rate": 4.962895993725719e-06, + "loss": 2.4588, + "step": 7760 + }, + { + "epoch": 0.416362660944206, + "grad_norm": 0.4140625, + "learning_rate": 4.962881080090275e-06, + "loss": 2.3786, + "step": 7761 + }, + { + "epoch": 0.4164163090128755, + "grad_norm": 0.35546875, + "learning_rate": 4.962866163480646e-06, + "loss": 2.2153, + "step": 7762 + }, + { + "epoch": 0.41646995708154505, + "grad_norm": 0.435546875, + "learning_rate": 4.962851243896852e-06, + "loss": 2.0991, + "step": 7763 + }, + { + "epoch": 0.4165236051502146, + "grad_norm": 0.44921875, + "learning_rate": 4.962836321338909e-06, + "loss": 2.2565, + "step": 7764 + }, + { + "epoch": 0.4165772532188841, + "grad_norm": 0.47265625, + "learning_rate": 4.962821395806835e-06, + "loss": 2.3717, + "step": 7765 + }, + { + "epoch": 0.41663090128755365, + "grad_norm": 0.375, + "learning_rate": 4.962806467300649e-06, + "loss": 2.3246, + "step": 7766 + }, + { + "epoch": 0.4166845493562232, + "grad_norm": 0.455078125, + "learning_rate": 4.96279153582037e-06, + "loss": 2.5625, + "step": 7767 + }, + { + "epoch": 0.4167381974248927, + "grad_norm": 0.4296875, + "learning_rate": 4.9627766013660126e-06, + "loss": 2.3409, + "step": 7768 + }, + { + "epoch": 0.41679184549356224, + "grad_norm": 0.439453125, + "learning_rate": 4.962761663937598e-06, + "loss": 1.48, + "step": 7769 + }, + { + "epoch": 0.4168454935622318, + "grad_norm": 0.318359375, + "learning_rate": 4.962746723535144e-06, + "loss": 1.9173, + "step": 7770 + }, + { + "epoch": 0.4168991416309013, + "grad_norm": 0.625, + "learning_rate": 4.962731780158666e-06, + "loss": 2.417, + "step": 7771 + }, + { + "epoch": 0.41695278969957084, + "grad_norm": 0.419921875, + "learning_rate": 4.962716833808185e-06, + "loss": 2.5189, + "step": 7772 + }, + { + "epoch": 0.41700643776824037, + "grad_norm": 0.41796875, + "learning_rate": 4.962701884483718e-06, + "loss": 2.2127, + "step": 7773 + }, + { + "epoch": 0.41706008583690984, + "grad_norm": 0.404296875, + "learning_rate": 4.962686932185282e-06, + "loss": 2.3386, + "step": 7774 + }, + { + "epoch": 0.4171137339055794, + "grad_norm": 0.44921875, + "learning_rate": 4.962671976912897e-06, + "loss": 2.4546, + "step": 7775 + }, + { + "epoch": 0.4171673819742489, + "grad_norm": 0.380859375, + "learning_rate": 4.96265701866658e-06, + "loss": 2.6547, + "step": 7776 + }, + { + "epoch": 0.41722103004291844, + "grad_norm": 0.3671875, + "learning_rate": 4.962642057446348e-06, + "loss": 1.9599, + "step": 7777 + }, + { + "epoch": 0.41727467811158797, + "grad_norm": 0.404296875, + "learning_rate": 4.962627093252222e-06, + "loss": 2.3865, + "step": 7778 + }, + { + "epoch": 0.4173283261802575, + "grad_norm": 0.392578125, + "learning_rate": 4.962612126084218e-06, + "loss": 2.3284, + "step": 7779 + }, + { + "epoch": 0.41738197424892703, + "grad_norm": 0.419921875, + "learning_rate": 4.9625971559423535e-06, + "loss": 2.4731, + "step": 7780 + }, + { + "epoch": 0.41743562231759657, + "grad_norm": 0.37109375, + "learning_rate": 4.962582182826647e-06, + "loss": 2.2398, + "step": 7781 + }, + { + "epoch": 0.4174892703862661, + "grad_norm": 0.43359375, + "learning_rate": 4.962567206737119e-06, + "loss": 2.3158, + "step": 7782 + }, + { + "epoch": 0.41754291845493563, + "grad_norm": 0.388671875, + "learning_rate": 4.962552227673784e-06, + "loss": 2.1835, + "step": 7783 + }, + { + "epoch": 0.41759656652360516, + "grad_norm": 0.4453125, + "learning_rate": 4.962537245636662e-06, + "loss": 2.1999, + "step": 7784 + }, + { + "epoch": 0.4176502145922747, + "grad_norm": 0.38671875, + "learning_rate": 4.962522260625771e-06, + "loss": 2.4039, + "step": 7785 + }, + { + "epoch": 0.4177038626609442, + "grad_norm": 0.35546875, + "learning_rate": 4.962507272641129e-06, + "loss": 2.0527, + "step": 7786 + }, + { + "epoch": 0.41775751072961376, + "grad_norm": 0.373046875, + "learning_rate": 4.962492281682754e-06, + "loss": 2.4343, + "step": 7787 + }, + { + "epoch": 0.4178111587982833, + "grad_norm": 0.466796875, + "learning_rate": 4.962477287750663e-06, + "loss": 2.5506, + "step": 7788 + }, + { + "epoch": 0.41786480686695276, + "grad_norm": 0.474609375, + "learning_rate": 4.962462290844877e-06, + "loss": 2.2698, + "step": 7789 + }, + { + "epoch": 0.4179184549356223, + "grad_norm": 0.435546875, + "learning_rate": 4.962447290965411e-06, + "loss": 2.4372, + "step": 7790 + }, + { + "epoch": 0.41797210300429183, + "grad_norm": 0.5078125, + "learning_rate": 4.962432288112285e-06, + "loss": 2.2116, + "step": 7791 + }, + { + "epoch": 0.41802575107296136, + "grad_norm": 0.435546875, + "learning_rate": 4.962417282285517e-06, + "loss": 2.3454, + "step": 7792 + }, + { + "epoch": 0.4180793991416309, + "grad_norm": 0.4296875, + "learning_rate": 4.962402273485123e-06, + "loss": 2.2575, + "step": 7793 + }, + { + "epoch": 0.4181330472103004, + "grad_norm": 0.5234375, + "learning_rate": 4.962387261711124e-06, + "loss": 2.3687, + "step": 7794 + }, + { + "epoch": 0.41818669527896996, + "grad_norm": 0.37109375, + "learning_rate": 4.962372246963537e-06, + "loss": 2.0659, + "step": 7795 + }, + { + "epoch": 0.4182403433476395, + "grad_norm": 0.486328125, + "learning_rate": 4.96235722924238e-06, + "loss": 2.5191, + "step": 7796 + }, + { + "epoch": 0.418293991416309, + "grad_norm": 0.408203125, + "learning_rate": 4.962342208547671e-06, + "loss": 2.4319, + "step": 7797 + }, + { + "epoch": 0.41834763948497855, + "grad_norm": 0.38671875, + "learning_rate": 4.962327184879428e-06, + "loss": 2.3212, + "step": 7798 + }, + { + "epoch": 0.4184012875536481, + "grad_norm": 0.427734375, + "learning_rate": 4.96231215823767e-06, + "loss": 2.0094, + "step": 7799 + }, + { + "epoch": 0.4184549356223176, + "grad_norm": 0.435546875, + "learning_rate": 4.962297128622414e-06, + "loss": 2.2877, + "step": 7800 + }, + { + "epoch": 0.41850858369098715, + "grad_norm": 0.384765625, + "learning_rate": 4.962282096033679e-06, + "loss": 2.3055, + "step": 7801 + }, + { + "epoch": 0.4185622317596567, + "grad_norm": 0.45703125, + "learning_rate": 4.962267060471483e-06, + "loss": 2.1724, + "step": 7802 + }, + { + "epoch": 0.41861587982832615, + "grad_norm": 0.41015625, + "learning_rate": 4.962252021935845e-06, + "loss": 2.1863, + "step": 7803 + }, + { + "epoch": 0.4186695278969957, + "grad_norm": 0.341796875, + "learning_rate": 4.962236980426781e-06, + "loss": 2.042, + "step": 7804 + }, + { + "epoch": 0.4187231759656652, + "grad_norm": 0.484375, + "learning_rate": 4.962221935944311e-06, + "loss": 2.5317, + "step": 7805 + }, + { + "epoch": 0.41877682403433475, + "grad_norm": 0.75, + "learning_rate": 4.962206888488452e-06, + "loss": 2.3494, + "step": 7806 + }, + { + "epoch": 0.4188304721030043, + "grad_norm": 0.44140625, + "learning_rate": 4.962191838059223e-06, + "loss": 2.0442, + "step": 7807 + }, + { + "epoch": 0.4188841201716738, + "grad_norm": 0.38671875, + "learning_rate": 4.962176784656641e-06, + "loss": 2.1931, + "step": 7808 + }, + { + "epoch": 0.41893776824034334, + "grad_norm": 0.41796875, + "learning_rate": 4.962161728280727e-06, + "loss": 2.4124, + "step": 7809 + }, + { + "epoch": 0.4189914163090129, + "grad_norm": 0.369140625, + "learning_rate": 4.962146668931496e-06, + "loss": 2.3255, + "step": 7810 + }, + { + "epoch": 0.4190450643776824, + "grad_norm": 0.46484375, + "learning_rate": 4.962131606608968e-06, + "loss": 2.2176, + "step": 7811 + }, + { + "epoch": 0.41909871244635194, + "grad_norm": 0.435546875, + "learning_rate": 4.96211654131316e-06, + "loss": 2.4167, + "step": 7812 + }, + { + "epoch": 0.41915236051502147, + "grad_norm": 0.337890625, + "learning_rate": 4.962101473044092e-06, + "loss": 1.9923, + "step": 7813 + }, + { + "epoch": 0.419206008583691, + "grad_norm": 0.56640625, + "learning_rate": 4.96208640180178e-06, + "loss": 2.3854, + "step": 7814 + }, + { + "epoch": 0.41925965665236054, + "grad_norm": 2.5, + "learning_rate": 4.962071327586243e-06, + "loss": 2.4462, + "step": 7815 + }, + { + "epoch": 0.41931330472103007, + "grad_norm": 0.490234375, + "learning_rate": 4.9620562503975e-06, + "loss": 2.5271, + "step": 7816 + }, + { + "epoch": 0.4193669527896996, + "grad_norm": 0.41015625, + "learning_rate": 4.9620411702355685e-06, + "loss": 2.2845, + "step": 7817 + }, + { + "epoch": 0.4194206008583691, + "grad_norm": 0.490234375, + "learning_rate": 4.962026087100468e-06, + "loss": 2.3585, + "step": 7818 + }, + { + "epoch": 0.4194742489270386, + "grad_norm": 0.353515625, + "learning_rate": 4.962011000992214e-06, + "loss": 2.2143, + "step": 7819 + }, + { + "epoch": 0.41952789699570814, + "grad_norm": 0.37109375, + "learning_rate": 4.961995911910827e-06, + "loss": 2.3569, + "step": 7820 + }, + { + "epoch": 0.41958154506437767, + "grad_norm": 0.4609375, + "learning_rate": 4.961980819856324e-06, + "loss": 2.3607, + "step": 7821 + }, + { + "epoch": 0.4196351931330472, + "grad_norm": 0.37109375, + "learning_rate": 4.961965724828724e-06, + "loss": 1.8552, + "step": 7822 + }, + { + "epoch": 0.41968884120171673, + "grad_norm": 0.49609375, + "learning_rate": 4.9619506268280445e-06, + "loss": 2.2538, + "step": 7823 + }, + { + "epoch": 0.41974248927038627, + "grad_norm": 0.376953125, + "learning_rate": 4.9619355258543055e-06, + "loss": 2.4059, + "step": 7824 + }, + { + "epoch": 0.4197961373390558, + "grad_norm": 0.392578125, + "learning_rate": 4.961920421907523e-06, + "loss": 2.2968, + "step": 7825 + }, + { + "epoch": 0.41984978540772533, + "grad_norm": 0.396484375, + "learning_rate": 4.961905314987716e-06, + "loss": 1.9111, + "step": 7826 + }, + { + "epoch": 0.41990343347639486, + "grad_norm": 0.404296875, + "learning_rate": 4.961890205094904e-06, + "loss": 2.227, + "step": 7827 + }, + { + "epoch": 0.4199570815450644, + "grad_norm": 0.4296875, + "learning_rate": 4.961875092229103e-06, + "loss": 2.336, + "step": 7828 + }, + { + "epoch": 0.4200107296137339, + "grad_norm": 0.5, + "learning_rate": 4.961859976390333e-06, + "loss": 2.1967, + "step": 7829 + }, + { + "epoch": 0.42006437768240346, + "grad_norm": 0.421875, + "learning_rate": 4.961844857578612e-06, + "loss": 2.2761, + "step": 7830 + }, + { + "epoch": 0.420118025751073, + "grad_norm": 0.4921875, + "learning_rate": 4.961829735793957e-06, + "loss": 1.6034, + "step": 7831 + }, + { + "epoch": 0.42017167381974246, + "grad_norm": 0.412109375, + "learning_rate": 4.961814611036387e-06, + "loss": 2.1209, + "step": 7832 + }, + { + "epoch": 0.420225321888412, + "grad_norm": 0.380859375, + "learning_rate": 4.961799483305922e-06, + "loss": 2.1807, + "step": 7833 + }, + { + "epoch": 0.4202789699570815, + "grad_norm": 0.48046875, + "learning_rate": 4.961784352602576e-06, + "loss": 2.2541, + "step": 7834 + }, + { + "epoch": 0.42033261802575106, + "grad_norm": 0.41015625, + "learning_rate": 4.961769218926372e-06, + "loss": 2.2254, + "step": 7835 + }, + { + "epoch": 0.4203862660944206, + "grad_norm": 0.431640625, + "learning_rate": 4.961754082277327e-06, + "loss": 2.3466, + "step": 7836 + }, + { + "epoch": 0.4204399141630901, + "grad_norm": 0.41796875, + "learning_rate": 4.961738942655457e-06, + "loss": 2.2847, + "step": 7837 + }, + { + "epoch": 0.42049356223175965, + "grad_norm": 0.48046875, + "learning_rate": 4.9617238000607825e-06, + "loss": 2.8409, + "step": 7838 + }, + { + "epoch": 0.4205472103004292, + "grad_norm": 0.349609375, + "learning_rate": 4.961708654493321e-06, + "loss": 2.2808, + "step": 7839 + }, + { + "epoch": 0.4206008583690987, + "grad_norm": 0.38671875, + "learning_rate": 4.9616935059530915e-06, + "loss": 2.0836, + "step": 7840 + }, + { + "epoch": 0.42065450643776825, + "grad_norm": 0.486328125, + "learning_rate": 4.96167835444011e-06, + "loss": 2.266, + "step": 7841 + }, + { + "epoch": 0.4207081545064378, + "grad_norm": 0.34765625, + "learning_rate": 4.961663199954399e-06, + "loss": 1.9926, + "step": 7842 + }, + { + "epoch": 0.4207618025751073, + "grad_norm": 0.4140625, + "learning_rate": 4.961648042495972e-06, + "loss": 2.0401, + "step": 7843 + }, + { + "epoch": 0.42081545064377684, + "grad_norm": 0.4296875, + "learning_rate": 4.961632882064851e-06, + "loss": 2.3802, + "step": 7844 + }, + { + "epoch": 0.4208690987124464, + "grad_norm": 0.7265625, + "learning_rate": 4.961617718661052e-06, + "loss": 2.339, + "step": 7845 + }, + { + "epoch": 0.42092274678111585, + "grad_norm": 0.390625, + "learning_rate": 4.961602552284595e-06, + "loss": 2.2678, + "step": 7846 + }, + { + "epoch": 0.4209763948497854, + "grad_norm": 0.4453125, + "learning_rate": 4.9615873829354965e-06, + "loss": 1.9648, + "step": 7847 + }, + { + "epoch": 0.4210300429184549, + "grad_norm": 0.400390625, + "learning_rate": 4.961572210613777e-06, + "loss": 2.2841, + "step": 7848 + }, + { + "epoch": 0.42108369098712445, + "grad_norm": 1.0859375, + "learning_rate": 4.961557035319453e-06, + "loss": 2.4025, + "step": 7849 + }, + { + "epoch": 0.421137339055794, + "grad_norm": 0.47265625, + "learning_rate": 4.9615418570525435e-06, + "loss": 2.2567, + "step": 7850 + }, + { + "epoch": 0.4211909871244635, + "grad_norm": 0.384765625, + "learning_rate": 4.961526675813067e-06, + "loss": 2.4522, + "step": 7851 + }, + { + "epoch": 0.42124463519313304, + "grad_norm": 0.412109375, + "learning_rate": 4.961511491601042e-06, + "loss": 2.225, + "step": 7852 + }, + { + "epoch": 0.4212982832618026, + "grad_norm": 0.4296875, + "learning_rate": 4.961496304416485e-06, + "loss": 2.2466, + "step": 7853 + }, + { + "epoch": 0.4213519313304721, + "grad_norm": 0.3984375, + "learning_rate": 4.961481114259418e-06, + "loss": 2.2549, + "step": 7854 + }, + { + "epoch": 0.42140557939914164, + "grad_norm": 0.38671875, + "learning_rate": 4.961465921129856e-06, + "loss": 2.2126, + "step": 7855 + }, + { + "epoch": 0.42145922746781117, + "grad_norm": 0.376953125, + "learning_rate": 4.961450725027819e-06, + "loss": 2.2105, + "step": 7856 + }, + { + "epoch": 0.4215128755364807, + "grad_norm": 0.4296875, + "learning_rate": 4.961435525953324e-06, + "loss": 2.2368, + "step": 7857 + }, + { + "epoch": 0.42156652360515023, + "grad_norm": 0.3828125, + "learning_rate": 4.961420323906391e-06, + "loss": 2.3385, + "step": 7858 + }, + { + "epoch": 0.42162017167381977, + "grad_norm": 0.3828125, + "learning_rate": 4.961405118887037e-06, + "loss": 2.1811, + "step": 7859 + }, + { + "epoch": 0.4216738197424893, + "grad_norm": 0.4140625, + "learning_rate": 4.961389910895282e-06, + "loss": 2.4892, + "step": 7860 + }, + { + "epoch": 0.4217274678111588, + "grad_norm": 0.41015625, + "learning_rate": 4.961374699931143e-06, + "loss": 2.4166, + "step": 7861 + }, + { + "epoch": 0.4217811158798283, + "grad_norm": 0.486328125, + "learning_rate": 4.961359485994638e-06, + "loss": 2.1111, + "step": 7862 + }, + { + "epoch": 0.42183476394849784, + "grad_norm": 0.3828125, + "learning_rate": 4.9613442690857864e-06, + "loss": 2.3064, + "step": 7863 + }, + { + "epoch": 0.42188841201716737, + "grad_norm": 0.369140625, + "learning_rate": 4.961329049204606e-06, + "loss": 2.2226, + "step": 7864 + }, + { + "epoch": 0.4219420600858369, + "grad_norm": 0.416015625, + "learning_rate": 4.9613138263511166e-06, + "loss": 2.6043, + "step": 7865 + }, + { + "epoch": 0.42199570815450643, + "grad_norm": 0.41796875, + "learning_rate": 4.9612986005253335e-06, + "loss": 2.2804, + "step": 7866 + }, + { + "epoch": 0.42204935622317596, + "grad_norm": 0.44140625, + "learning_rate": 4.961283371727278e-06, + "loss": 2.401, + "step": 7867 + }, + { + "epoch": 0.4221030042918455, + "grad_norm": 0.412109375, + "learning_rate": 4.961268139956968e-06, + "loss": 1.961, + "step": 7868 + }, + { + "epoch": 0.42215665236051503, + "grad_norm": 0.40625, + "learning_rate": 4.9612529052144205e-06, + "loss": 2.2333, + "step": 7869 + }, + { + "epoch": 0.42221030042918456, + "grad_norm": 0.5234375, + "learning_rate": 4.961237667499656e-06, + "loss": 2.3525, + "step": 7870 + }, + { + "epoch": 0.4222639484978541, + "grad_norm": 0.482421875, + "learning_rate": 4.96122242681269e-06, + "loss": 2.1954, + "step": 7871 + }, + { + "epoch": 0.4223175965665236, + "grad_norm": 0.484375, + "learning_rate": 4.961207183153544e-06, + "loss": 2.1564, + "step": 7872 + }, + { + "epoch": 0.42237124463519315, + "grad_norm": 0.40625, + "learning_rate": 4.9611919365222335e-06, + "loss": 2.1566, + "step": 7873 + }, + { + "epoch": 0.4224248927038627, + "grad_norm": 0.4765625, + "learning_rate": 4.9611766869187796e-06, + "loss": 2.0647, + "step": 7874 + }, + { + "epoch": 0.42247854077253216, + "grad_norm": 0.384765625, + "learning_rate": 4.961161434343199e-06, + "loss": 2.4994, + "step": 7875 + }, + { + "epoch": 0.4225321888412017, + "grad_norm": 0.3984375, + "learning_rate": 4.961146178795511e-06, + "loss": 1.8788, + "step": 7876 + }, + { + "epoch": 0.4225858369098712, + "grad_norm": 0.359375, + "learning_rate": 4.961130920275733e-06, + "loss": 2.3078, + "step": 7877 + }, + { + "epoch": 0.42263948497854076, + "grad_norm": 0.435546875, + "learning_rate": 4.961115658783885e-06, + "loss": 2.4538, + "step": 7878 + }, + { + "epoch": 0.4226931330472103, + "grad_norm": 0.51171875, + "learning_rate": 4.961100394319983e-06, + "loss": 1.3789, + "step": 7879 + }, + { + "epoch": 0.4227467811158798, + "grad_norm": 0.45703125, + "learning_rate": 4.9610851268840486e-06, + "loss": 2.3267, + "step": 7880 + }, + { + "epoch": 0.42280042918454935, + "grad_norm": 0.498046875, + "learning_rate": 4.9610698564760975e-06, + "loss": 2.245, + "step": 7881 + }, + { + "epoch": 0.4228540772532189, + "grad_norm": 0.322265625, + "learning_rate": 4.96105458309615e-06, + "loss": 1.9979, + "step": 7882 + }, + { + "epoch": 0.4229077253218884, + "grad_norm": 0.390625, + "learning_rate": 4.961039306744223e-06, + "loss": 2.1072, + "step": 7883 + }, + { + "epoch": 0.42296137339055795, + "grad_norm": 0.5, + "learning_rate": 4.9610240274203355e-06, + "loss": 2.3876, + "step": 7884 + }, + { + "epoch": 0.4230150214592275, + "grad_norm": 0.33203125, + "learning_rate": 4.961008745124507e-06, + "loss": 1.8568, + "step": 7885 + }, + { + "epoch": 0.423068669527897, + "grad_norm": 0.380859375, + "learning_rate": 4.960993459856756e-06, + "loss": 1.9841, + "step": 7886 + }, + { + "epoch": 0.42312231759656654, + "grad_norm": 0.45703125, + "learning_rate": 4.9609781716170975e-06, + "loss": 2.3894, + "step": 7887 + }, + { + "epoch": 0.4231759656652361, + "grad_norm": 0.380859375, + "learning_rate": 4.960962880405554e-06, + "loss": 2.1788, + "step": 7888 + }, + { + "epoch": 0.42322961373390555, + "grad_norm": 0.408203125, + "learning_rate": 4.960947586222143e-06, + "loss": 2.0377, + "step": 7889 + }, + { + "epoch": 0.4232832618025751, + "grad_norm": 0.423828125, + "learning_rate": 4.960932289066882e-06, + "loss": 2.6103, + "step": 7890 + }, + { + "epoch": 0.4233369098712446, + "grad_norm": 0.3828125, + "learning_rate": 4.96091698893979e-06, + "loss": 2.1484, + "step": 7891 + }, + { + "epoch": 0.42339055793991415, + "grad_norm": 0.421875, + "learning_rate": 4.9609016858408855e-06, + "loss": 2.3982, + "step": 7892 + }, + { + "epoch": 0.4234442060085837, + "grad_norm": 0.412109375, + "learning_rate": 4.960886379770187e-06, + "loss": 2.326, + "step": 7893 + }, + { + "epoch": 0.4234978540772532, + "grad_norm": 0.39453125, + "learning_rate": 4.960871070727712e-06, + "loss": 2.4412, + "step": 7894 + }, + { + "epoch": 0.42355150214592274, + "grad_norm": 0.4921875, + "learning_rate": 4.9608557587134805e-06, + "loss": 2.4618, + "step": 7895 + }, + { + "epoch": 0.4236051502145923, + "grad_norm": 0.373046875, + "learning_rate": 4.96084044372751e-06, + "loss": 2.3485, + "step": 7896 + }, + { + "epoch": 0.4236587982832618, + "grad_norm": 0.40234375, + "learning_rate": 4.96082512576982e-06, + "loss": 2.1653, + "step": 7897 + }, + { + "epoch": 0.42371244635193134, + "grad_norm": 0.3671875, + "learning_rate": 4.960809804840428e-06, + "loss": 2.3013, + "step": 7898 + }, + { + "epoch": 0.42376609442060087, + "grad_norm": 0.4375, + "learning_rate": 4.960794480939353e-06, + "loss": 2.1685, + "step": 7899 + }, + { + "epoch": 0.4238197424892704, + "grad_norm": 0.416015625, + "learning_rate": 4.960779154066614e-06, + "loss": 2.3712, + "step": 7900 + }, + { + "epoch": 0.42387339055793993, + "grad_norm": 0.421875, + "learning_rate": 4.960763824222228e-06, + "loss": 2.3379, + "step": 7901 + }, + { + "epoch": 0.42392703862660946, + "grad_norm": 0.365234375, + "learning_rate": 4.960748491406215e-06, + "loss": 2.2299, + "step": 7902 + }, + { + "epoch": 0.423980686695279, + "grad_norm": 0.37109375, + "learning_rate": 4.9607331556185914e-06, + "loss": 2.2201, + "step": 7903 + }, + { + "epoch": 0.4240343347639485, + "grad_norm": 0.515625, + "learning_rate": 4.9607178168593785e-06, + "loss": 2.3933, + "step": 7904 + }, + { + "epoch": 0.424087982832618, + "grad_norm": 0.3671875, + "learning_rate": 4.960702475128593e-06, + "loss": 2.1518, + "step": 7905 + }, + { + "epoch": 0.42414163090128754, + "grad_norm": 0.43359375, + "learning_rate": 4.960687130426254e-06, + "loss": 2.4916, + "step": 7906 + }, + { + "epoch": 0.42419527896995707, + "grad_norm": 0.5625, + "learning_rate": 4.9606717827523805e-06, + "loss": 2.3881, + "step": 7907 + }, + { + "epoch": 0.4242489270386266, + "grad_norm": 0.392578125, + "learning_rate": 4.9606564321069906e-06, + "loss": 2.191, + "step": 7908 + }, + { + "epoch": 0.42430257510729613, + "grad_norm": 0.4375, + "learning_rate": 4.960641078490102e-06, + "loss": 2.2143, + "step": 7909 + }, + { + "epoch": 0.42435622317596566, + "grad_norm": 0.42578125, + "learning_rate": 4.960625721901735e-06, + "loss": 2.5026, + "step": 7910 + }, + { + "epoch": 0.4244098712446352, + "grad_norm": 0.435546875, + "learning_rate": 4.960610362341906e-06, + "loss": 2.3123, + "step": 7911 + }, + { + "epoch": 0.4244635193133047, + "grad_norm": 0.6484375, + "learning_rate": 4.960594999810636e-06, + "loss": 2.2014, + "step": 7912 + }, + { + "epoch": 0.42451716738197426, + "grad_norm": 0.37109375, + "learning_rate": 4.960579634307941e-06, + "loss": 2.1866, + "step": 7913 + }, + { + "epoch": 0.4245708154506438, + "grad_norm": 0.431640625, + "learning_rate": 4.960564265833842e-06, + "loss": 2.2825, + "step": 7914 + }, + { + "epoch": 0.4246244635193133, + "grad_norm": 0.37890625, + "learning_rate": 4.960548894388355e-06, + "loss": 2.3632, + "step": 7915 + }, + { + "epoch": 0.42467811158798285, + "grad_norm": 0.490234375, + "learning_rate": 4.960533519971501e-06, + "loss": 1.5035, + "step": 7916 + }, + { + "epoch": 0.4247317596566524, + "grad_norm": 0.455078125, + "learning_rate": 4.9605181425832975e-06, + "loss": 2.3493, + "step": 7917 + }, + { + "epoch": 0.42478540772532186, + "grad_norm": 0.416015625, + "learning_rate": 4.960502762223762e-06, + "loss": 2.3792, + "step": 7918 + }, + { + "epoch": 0.4248390557939914, + "grad_norm": 0.412109375, + "learning_rate": 4.960487378892915e-06, + "loss": 2.3741, + "step": 7919 + }, + { + "epoch": 0.4248927038626609, + "grad_norm": 0.3828125, + "learning_rate": 4.960471992590774e-06, + "loss": 2.2964, + "step": 7920 + }, + { + "epoch": 0.42494635193133046, + "grad_norm": 0.443359375, + "learning_rate": 4.960456603317358e-06, + "loss": 2.4697, + "step": 7921 + }, + { + "epoch": 0.425, + "grad_norm": 0.408203125, + "learning_rate": 4.960441211072686e-06, + "loss": 2.268, + "step": 7922 + }, + { + "epoch": 0.4250536480686695, + "grad_norm": 0.453125, + "learning_rate": 4.960425815856776e-06, + "loss": 2.4646, + "step": 7923 + }, + { + "epoch": 0.42510729613733905, + "grad_norm": 0.41015625, + "learning_rate": 4.960410417669646e-06, + "loss": 2.4699, + "step": 7924 + }, + { + "epoch": 0.4251609442060086, + "grad_norm": 0.5390625, + "learning_rate": 4.9603950165113145e-06, + "loss": 1.2588, + "step": 7925 + }, + { + "epoch": 0.4252145922746781, + "grad_norm": 0.455078125, + "learning_rate": 4.960379612381801e-06, + "loss": 2.4682, + "step": 7926 + }, + { + "epoch": 0.42526824034334765, + "grad_norm": 0.388671875, + "learning_rate": 4.960364205281124e-06, + "loss": 2.2737, + "step": 7927 + }, + { + "epoch": 0.4253218884120172, + "grad_norm": 0.421875, + "learning_rate": 4.9603487952093025e-06, + "loss": 2.2998, + "step": 7928 + }, + { + "epoch": 0.4253755364806867, + "grad_norm": 0.3125, + "learning_rate": 4.960333382166354e-06, + "loss": 2.2808, + "step": 7929 + }, + { + "epoch": 0.42542918454935624, + "grad_norm": 0.431640625, + "learning_rate": 4.960317966152298e-06, + "loss": 2.0365, + "step": 7930 + }, + { + "epoch": 0.4254828326180258, + "grad_norm": 0.408203125, + "learning_rate": 4.960302547167153e-06, + "loss": 2.3105, + "step": 7931 + }, + { + "epoch": 0.4255364806866953, + "grad_norm": 0.373046875, + "learning_rate": 4.960287125210936e-06, + "loss": 2.2626, + "step": 7932 + }, + { + "epoch": 0.4255901287553648, + "grad_norm": 0.435546875, + "learning_rate": 4.960271700283669e-06, + "loss": 1.7465, + "step": 7933 + }, + { + "epoch": 0.4256437768240343, + "grad_norm": 0.390625, + "learning_rate": 4.960256272385367e-06, + "loss": 2.192, + "step": 7934 + }, + { + "epoch": 0.42569742489270385, + "grad_norm": 0.546875, + "learning_rate": 4.960240841516052e-06, + "loss": 2.3274, + "step": 7935 + }, + { + "epoch": 0.4257510729613734, + "grad_norm": 0.390625, + "learning_rate": 4.960225407675739e-06, + "loss": 2.3632, + "step": 7936 + }, + { + "epoch": 0.4258047210300429, + "grad_norm": 0.396484375, + "learning_rate": 4.96020997086445e-06, + "loss": 2.4398, + "step": 7937 + }, + { + "epoch": 0.42585836909871244, + "grad_norm": 0.44921875, + "learning_rate": 4.960194531082202e-06, + "loss": 2.4335, + "step": 7938 + }, + { + "epoch": 0.425912017167382, + "grad_norm": 0.419921875, + "learning_rate": 4.960179088329014e-06, + "loss": 2.3527, + "step": 7939 + }, + { + "epoch": 0.4259656652360515, + "grad_norm": 0.44140625, + "learning_rate": 4.960163642604905e-06, + "loss": 1.5932, + "step": 7940 + }, + { + "epoch": 0.42601931330472104, + "grad_norm": 0.431640625, + "learning_rate": 4.960148193909892e-06, + "loss": 2.493, + "step": 7941 + }, + { + "epoch": 0.42607296137339057, + "grad_norm": 0.48828125, + "learning_rate": 4.960132742243995e-06, + "loss": 2.2497, + "step": 7942 + }, + { + "epoch": 0.4261266094420601, + "grad_norm": 0.47265625, + "learning_rate": 4.960117287607233e-06, + "loss": 2.4199, + "step": 7943 + }, + { + "epoch": 0.42618025751072963, + "grad_norm": 0.34765625, + "learning_rate": 4.960101829999623e-06, + "loss": 2.1938, + "step": 7944 + }, + { + "epoch": 0.42623390557939916, + "grad_norm": 0.466796875, + "learning_rate": 4.9600863694211864e-06, + "loss": 2.0799, + "step": 7945 + }, + { + "epoch": 0.4262875536480687, + "grad_norm": 0.8984375, + "learning_rate": 4.960070905871939e-06, + "loss": 2.523, + "step": 7946 + }, + { + "epoch": 0.42634120171673817, + "grad_norm": 0.41015625, + "learning_rate": 4.9600554393519015e-06, + "loss": 2.3288, + "step": 7947 + }, + { + "epoch": 0.4263948497854077, + "grad_norm": 0.546875, + "learning_rate": 4.9600399698610916e-06, + "loss": 2.2673, + "step": 7948 + }, + { + "epoch": 0.42644849785407724, + "grad_norm": 0.4140625, + "learning_rate": 4.960024497399528e-06, + "loss": 2.4253, + "step": 7949 + }, + { + "epoch": 0.42650214592274677, + "grad_norm": 0.44921875, + "learning_rate": 4.960009021967231e-06, + "loss": 2.2579, + "step": 7950 + }, + { + "epoch": 0.4265557939914163, + "grad_norm": 0.37109375, + "learning_rate": 4.959993543564216e-06, + "loss": 2.5843, + "step": 7951 + }, + { + "epoch": 0.42660944206008583, + "grad_norm": 0.439453125, + "learning_rate": 4.959978062190505e-06, + "loss": 2.2202, + "step": 7952 + }, + { + "epoch": 0.42666309012875536, + "grad_norm": 0.400390625, + "learning_rate": 4.959962577846114e-06, + "loss": 1.9632, + "step": 7953 + }, + { + "epoch": 0.4267167381974249, + "grad_norm": 0.412109375, + "learning_rate": 4.9599470905310635e-06, + "loss": 2.063, + "step": 7954 + }, + { + "epoch": 0.4267703862660944, + "grad_norm": 0.37109375, + "learning_rate": 4.959931600245372e-06, + "loss": 2.4682, + "step": 7955 + }, + { + "epoch": 0.42682403433476396, + "grad_norm": 0.435546875, + "learning_rate": 4.959916106989057e-06, + "loss": 2.3457, + "step": 7956 + }, + { + "epoch": 0.4268776824034335, + "grad_norm": 0.373046875, + "learning_rate": 4.959900610762139e-06, + "loss": 2.281, + "step": 7957 + }, + { + "epoch": 0.426931330472103, + "grad_norm": 0.40625, + "learning_rate": 4.959885111564635e-06, + "loss": 2.2625, + "step": 7958 + }, + { + "epoch": 0.42698497854077255, + "grad_norm": 0.349609375, + "learning_rate": 4.959869609396566e-06, + "loss": 2.2899, + "step": 7959 + }, + { + "epoch": 0.4270386266094421, + "grad_norm": 0.416015625, + "learning_rate": 4.959854104257948e-06, + "loss": 2.2625, + "step": 7960 + }, + { + "epoch": 0.42709227467811156, + "grad_norm": 0.4453125, + "learning_rate": 4.959838596148801e-06, + "loss": 2.2374, + "step": 7961 + }, + { + "epoch": 0.4271459227467811, + "grad_norm": 0.390625, + "learning_rate": 4.959823085069144e-06, + "loss": 2.0419, + "step": 7962 + }, + { + "epoch": 0.4271995708154506, + "grad_norm": 0.50390625, + "learning_rate": 4.9598075710189955e-06, + "loss": 2.4946, + "step": 7963 + }, + { + "epoch": 0.42725321888412016, + "grad_norm": 0.458984375, + "learning_rate": 4.959792053998373e-06, + "loss": 1.9956, + "step": 7964 + }, + { + "epoch": 0.4273068669527897, + "grad_norm": 0.41796875, + "learning_rate": 4.959776534007298e-06, + "loss": 2.3471, + "step": 7965 + }, + { + "epoch": 0.4273605150214592, + "grad_norm": 0.359375, + "learning_rate": 4.959761011045787e-06, + "loss": 2.1836, + "step": 7966 + }, + { + "epoch": 0.42741416309012875, + "grad_norm": 0.37890625, + "learning_rate": 4.95974548511386e-06, + "loss": 2.3269, + "step": 7967 + }, + { + "epoch": 0.4274678111587983, + "grad_norm": 0.3984375, + "learning_rate": 4.9597299562115346e-06, + "loss": 2.2184, + "step": 7968 + }, + { + "epoch": 0.4275214592274678, + "grad_norm": 0.427734375, + "learning_rate": 4.959714424338829e-06, + "loss": 2.2061, + "step": 7969 + }, + { + "epoch": 0.42757510729613735, + "grad_norm": 0.423828125, + "learning_rate": 4.959698889495765e-06, + "loss": 2.3496, + "step": 7970 + }, + { + "epoch": 0.4276287553648069, + "grad_norm": 0.349609375, + "learning_rate": 4.959683351682358e-06, + "loss": 1.9672, + "step": 7971 + }, + { + "epoch": 0.4276824034334764, + "grad_norm": 0.58203125, + "learning_rate": 4.959667810898629e-06, + "loss": 2.1551, + "step": 7972 + }, + { + "epoch": 0.42773605150214594, + "grad_norm": 0.423828125, + "learning_rate": 4.959652267144596e-06, + "loss": 2.2166, + "step": 7973 + }, + { + "epoch": 0.4277896995708155, + "grad_norm": 0.44921875, + "learning_rate": 4.959636720420277e-06, + "loss": 2.3635, + "step": 7974 + }, + { + "epoch": 0.427843347639485, + "grad_norm": 0.3828125, + "learning_rate": 4.9596211707256915e-06, + "loss": 2.3172, + "step": 7975 + }, + { + "epoch": 0.4278969957081545, + "grad_norm": 0.40625, + "learning_rate": 4.959605618060859e-06, + "loss": 2.3959, + "step": 7976 + }, + { + "epoch": 0.427950643776824, + "grad_norm": 1.5390625, + "learning_rate": 4.959590062425796e-06, + "loss": 2.3445, + "step": 7977 + }, + { + "epoch": 0.42800429184549355, + "grad_norm": 0.4140625, + "learning_rate": 4.9595745038205245e-06, + "loss": 2.2489, + "step": 7978 + }, + { + "epoch": 0.4280579399141631, + "grad_norm": 0.41015625, + "learning_rate": 4.959558942245061e-06, + "loss": 2.3266, + "step": 7979 + }, + { + "epoch": 0.4281115879828326, + "grad_norm": 1.359375, + "learning_rate": 4.959543377699425e-06, + "loss": 2.3644, + "step": 7980 + }, + { + "epoch": 0.42816523605150214, + "grad_norm": 0.43359375, + "learning_rate": 4.9595278101836355e-06, + "loss": 2.3272, + "step": 7981 + }, + { + "epoch": 0.4282188841201717, + "grad_norm": 0.375, + "learning_rate": 4.95951223969771e-06, + "loss": 2.4884, + "step": 7982 + }, + { + "epoch": 0.4282725321888412, + "grad_norm": 0.498046875, + "learning_rate": 4.95949666624167e-06, + "loss": 2.4526, + "step": 7983 + }, + { + "epoch": 0.42832618025751074, + "grad_norm": 0.400390625, + "learning_rate": 4.959481089815531e-06, + "loss": 2.3709, + "step": 7984 + }, + { + "epoch": 0.42837982832618027, + "grad_norm": 0.375, + "learning_rate": 4.959465510419314e-06, + "loss": 2.2641, + "step": 7985 + }, + { + "epoch": 0.4284334763948498, + "grad_norm": 0.6015625, + "learning_rate": 4.959449928053037e-06, + "loss": 2.284, + "step": 7986 + }, + { + "epoch": 0.42848712446351933, + "grad_norm": 0.4140625, + "learning_rate": 4.959434342716719e-06, + "loss": 2.4182, + "step": 7987 + }, + { + "epoch": 0.42854077253218886, + "grad_norm": 0.396484375, + "learning_rate": 4.9594187544103785e-06, + "loss": 2.2429, + "step": 7988 + }, + { + "epoch": 0.4285944206008584, + "grad_norm": 0.546875, + "learning_rate": 4.959403163134036e-06, + "loss": 1.9282, + "step": 7989 + }, + { + "epoch": 0.42864806866952787, + "grad_norm": 0.423828125, + "learning_rate": 4.959387568887708e-06, + "loss": 2.277, + "step": 7990 + }, + { + "epoch": 0.4287017167381974, + "grad_norm": 0.361328125, + "learning_rate": 4.9593719716714155e-06, + "loss": 1.948, + "step": 7991 + }, + { + "epoch": 0.42875536480686693, + "grad_norm": 0.30859375, + "learning_rate": 4.959356371485175e-06, + "loss": 1.8599, + "step": 7992 + }, + { + "epoch": 0.42880901287553647, + "grad_norm": 0.357421875, + "learning_rate": 4.959340768329007e-06, + "loss": 2.3564, + "step": 7993 + }, + { + "epoch": 0.428862660944206, + "grad_norm": 0.35546875, + "learning_rate": 4.959325162202929e-06, + "loss": 1.8286, + "step": 7994 + }, + { + "epoch": 0.42891630901287553, + "grad_norm": 0.388671875, + "learning_rate": 4.959309553106962e-06, + "loss": 2.4908, + "step": 7995 + }, + { + "epoch": 0.42896995708154506, + "grad_norm": 0.6171875, + "learning_rate": 4.9592939410411235e-06, + "loss": 2.261, + "step": 7996 + }, + { + "epoch": 0.4290236051502146, + "grad_norm": 0.40234375, + "learning_rate": 4.959278326005432e-06, + "loss": 2.2056, + "step": 7997 + }, + { + "epoch": 0.4290772532188841, + "grad_norm": 0.44140625, + "learning_rate": 4.959262707999906e-06, + "loss": 2.3836, + "step": 7998 + }, + { + "epoch": 0.42913090128755366, + "grad_norm": 0.384765625, + "learning_rate": 4.959247087024567e-06, + "loss": 2.1569, + "step": 7999 + }, + { + "epoch": 0.4291845493562232, + "grad_norm": 0.359375, + "learning_rate": 4.95923146307943e-06, + "loss": 2.1582, + "step": 8000 + }, + { + "epoch": 0.4292381974248927, + "grad_norm": 0.7421875, + "learning_rate": 4.959215836164517e-06, + "loss": 2.1968, + "step": 8001 + }, + { + "epoch": 0.42929184549356225, + "grad_norm": 0.408203125, + "learning_rate": 4.959200206279845e-06, + "loss": 2.2173, + "step": 8002 + }, + { + "epoch": 0.4293454935622318, + "grad_norm": 0.734375, + "learning_rate": 4.959184573425434e-06, + "loss": 1.7087, + "step": 8003 + }, + { + "epoch": 0.42939914163090126, + "grad_norm": 0.42578125, + "learning_rate": 4.959168937601303e-06, + "loss": 2.5078, + "step": 8004 + }, + { + "epoch": 0.4294527896995708, + "grad_norm": 0.39453125, + "learning_rate": 4.959153298807469e-06, + "loss": 2.2555, + "step": 8005 + }, + { + "epoch": 0.4295064377682403, + "grad_norm": 0.4375, + "learning_rate": 4.9591376570439544e-06, + "loss": 2.1305, + "step": 8006 + }, + { + "epoch": 0.42956008583690986, + "grad_norm": 0.380859375, + "learning_rate": 4.959122012310775e-06, + "loss": 2.1662, + "step": 8007 + }, + { + "epoch": 0.4296137339055794, + "grad_norm": 0.412109375, + "learning_rate": 4.95910636460795e-06, + "loss": 2.3101, + "step": 8008 + }, + { + "epoch": 0.4296673819742489, + "grad_norm": 0.369140625, + "learning_rate": 4.959090713935499e-06, + "loss": 2.5275, + "step": 8009 + }, + { + "epoch": 0.42972103004291845, + "grad_norm": 0.466796875, + "learning_rate": 4.959075060293442e-06, + "loss": 2.0155, + "step": 8010 + }, + { + "epoch": 0.429774678111588, + "grad_norm": 0.5, + "learning_rate": 4.959059403681795e-06, + "loss": 2.2857, + "step": 8011 + }, + { + "epoch": 0.4298283261802575, + "grad_norm": 0.482421875, + "learning_rate": 4.95904374410058e-06, + "loss": 2.4707, + "step": 8012 + }, + { + "epoch": 0.42988197424892705, + "grad_norm": 0.6796875, + "learning_rate": 4.959028081549814e-06, + "loss": 1.271, + "step": 8013 + }, + { + "epoch": 0.4299356223175966, + "grad_norm": 0.375, + "learning_rate": 4.959012416029517e-06, + "loss": 2.1272, + "step": 8014 + }, + { + "epoch": 0.4299892703862661, + "grad_norm": 0.4765625, + "learning_rate": 4.958996747539707e-06, + "loss": 2.4254, + "step": 8015 + }, + { + "epoch": 0.43004291845493564, + "grad_norm": 0.41015625, + "learning_rate": 4.958981076080404e-06, + "loss": 2.4272, + "step": 8016 + }, + { + "epoch": 0.4300965665236052, + "grad_norm": 0.4140625, + "learning_rate": 4.958965401651625e-06, + "loss": 2.4619, + "step": 8017 + }, + { + "epoch": 0.4301502145922747, + "grad_norm": 0.40625, + "learning_rate": 4.95894972425339e-06, + "loss": 2.0785, + "step": 8018 + }, + { + "epoch": 0.4302038626609442, + "grad_norm": 0.5625, + "learning_rate": 4.9589340438857195e-06, + "loss": 2.3869, + "step": 8019 + }, + { + "epoch": 0.4302575107296137, + "grad_norm": 0.41015625, + "learning_rate": 4.95891836054863e-06, + "loss": 2.3082, + "step": 8020 + }, + { + "epoch": 0.43031115879828324, + "grad_norm": 0.466796875, + "learning_rate": 4.958902674242142e-06, + "loss": 2.3459, + "step": 8021 + }, + { + "epoch": 0.4303648068669528, + "grad_norm": 0.435546875, + "learning_rate": 4.9588869849662735e-06, + "loss": 2.3565, + "step": 8022 + }, + { + "epoch": 0.4304184549356223, + "grad_norm": 0.435546875, + "learning_rate": 4.958871292721045e-06, + "loss": 2.3388, + "step": 8023 + }, + { + "epoch": 0.43047210300429184, + "grad_norm": 0.4609375, + "learning_rate": 4.958855597506473e-06, + "loss": 2.2986, + "step": 8024 + }, + { + "epoch": 0.43052575107296137, + "grad_norm": 0.3828125, + "learning_rate": 4.958839899322578e-06, + "loss": 2.5218, + "step": 8025 + }, + { + "epoch": 0.4305793991416309, + "grad_norm": 0.39453125, + "learning_rate": 4.958824198169379e-06, + "loss": 2.3665, + "step": 8026 + }, + { + "epoch": 0.43063304721030043, + "grad_norm": 0.49609375, + "learning_rate": 4.958808494046895e-06, + "loss": 2.0179, + "step": 8027 + }, + { + "epoch": 0.43068669527896997, + "grad_norm": 0.6171875, + "learning_rate": 4.958792786955144e-06, + "loss": 2.2723, + "step": 8028 + }, + { + "epoch": 0.4307403433476395, + "grad_norm": 0.400390625, + "learning_rate": 4.958777076894146e-06, + "loss": 2.3031, + "step": 8029 + }, + { + "epoch": 0.43079399141630903, + "grad_norm": 0.4140625, + "learning_rate": 4.958761363863919e-06, + "loss": 2.2066, + "step": 8030 + }, + { + "epoch": 0.43084763948497856, + "grad_norm": 0.46484375, + "learning_rate": 4.958745647864483e-06, + "loss": 2.3845, + "step": 8031 + }, + { + "epoch": 0.4309012875536481, + "grad_norm": 0.6328125, + "learning_rate": 4.9587299288958566e-06, + "loss": 2.263, + "step": 8032 + }, + { + "epoch": 0.43095493562231757, + "grad_norm": 0.3984375, + "learning_rate": 4.958714206958058e-06, + "loss": 2.0282, + "step": 8033 + }, + { + "epoch": 0.4310085836909871, + "grad_norm": 0.42578125, + "learning_rate": 4.958698482051109e-06, + "loss": 2.3013, + "step": 8034 + }, + { + "epoch": 0.43106223175965663, + "grad_norm": 0.515625, + "learning_rate": 4.9586827541750245e-06, + "loss": 2.1037, + "step": 8035 + }, + { + "epoch": 0.43111587982832617, + "grad_norm": 0.462890625, + "learning_rate": 4.958667023329826e-06, + "loss": 2.2946, + "step": 8036 + }, + { + "epoch": 0.4311695278969957, + "grad_norm": 0.400390625, + "learning_rate": 4.9586512895155325e-06, + "loss": 2.4553, + "step": 8037 + }, + { + "epoch": 0.43122317596566523, + "grad_norm": 0.56640625, + "learning_rate": 4.958635552732161e-06, + "loss": 2.3896, + "step": 8038 + }, + { + "epoch": 0.43127682403433476, + "grad_norm": 0.453125, + "learning_rate": 4.958619812979734e-06, + "loss": 2.5771, + "step": 8039 + }, + { + "epoch": 0.4313304721030043, + "grad_norm": 0.337890625, + "learning_rate": 4.958604070258267e-06, + "loss": 2.1336, + "step": 8040 + }, + { + "epoch": 0.4313841201716738, + "grad_norm": 0.400390625, + "learning_rate": 4.958588324567781e-06, + "loss": 2.1954, + "step": 8041 + }, + { + "epoch": 0.43143776824034336, + "grad_norm": 0.470703125, + "learning_rate": 4.958572575908295e-06, + "loss": 2.4137, + "step": 8042 + }, + { + "epoch": 0.4314914163090129, + "grad_norm": 0.451171875, + "learning_rate": 4.958556824279827e-06, + "loss": 2.3492, + "step": 8043 + }, + { + "epoch": 0.4315450643776824, + "grad_norm": 0.47265625, + "learning_rate": 4.958541069682397e-06, + "loss": 2.2884, + "step": 8044 + }, + { + "epoch": 0.43159871244635195, + "grad_norm": 0.470703125, + "learning_rate": 4.9585253121160236e-06, + "loss": 2.3819, + "step": 8045 + }, + { + "epoch": 0.4316523605150215, + "grad_norm": 0.390625, + "learning_rate": 4.958509551580725e-06, + "loss": 2.398, + "step": 8046 + }, + { + "epoch": 0.431706008583691, + "grad_norm": 0.36328125, + "learning_rate": 4.958493788076522e-06, + "loss": 2.0067, + "step": 8047 + }, + { + "epoch": 0.4317596566523605, + "grad_norm": 0.83984375, + "learning_rate": 4.958478021603432e-06, + "loss": 2.4478, + "step": 8048 + }, + { + "epoch": 0.43181330472103, + "grad_norm": 0.3828125, + "learning_rate": 4.958462252161475e-06, + "loss": 2.327, + "step": 8049 + }, + { + "epoch": 0.43186695278969955, + "grad_norm": 1.3046875, + "learning_rate": 4.958446479750669e-06, + "loss": 2.4864, + "step": 8050 + }, + { + "epoch": 0.4319206008583691, + "grad_norm": 0.71875, + "learning_rate": 4.958430704371035e-06, + "loss": 2.3356, + "step": 8051 + }, + { + "epoch": 0.4319742489270386, + "grad_norm": 0.392578125, + "learning_rate": 4.95841492602259e-06, + "loss": 2.1103, + "step": 8052 + }, + { + "epoch": 0.43202789699570815, + "grad_norm": 0.4296875, + "learning_rate": 4.958399144705355e-06, + "loss": 2.3007, + "step": 8053 + }, + { + "epoch": 0.4320815450643777, + "grad_norm": 0.42578125, + "learning_rate": 4.958383360419348e-06, + "loss": 2.2864, + "step": 8054 + }, + { + "epoch": 0.4321351931330472, + "grad_norm": 0.474609375, + "learning_rate": 4.958367573164587e-06, + "loss": 2.0699, + "step": 8055 + }, + { + "epoch": 0.43218884120171674, + "grad_norm": 0.412109375, + "learning_rate": 4.958351782941092e-06, + "loss": 2.2826, + "step": 8056 + }, + { + "epoch": 0.4322424892703863, + "grad_norm": 0.412109375, + "learning_rate": 4.958335989748883e-06, + "loss": 2.1728, + "step": 8057 + }, + { + "epoch": 0.4322961373390558, + "grad_norm": 0.404296875, + "learning_rate": 4.9583201935879776e-06, + "loss": 2.3802, + "step": 8058 + }, + { + "epoch": 0.43234978540772534, + "grad_norm": 0.458984375, + "learning_rate": 4.958304394458396e-06, + "loss": 2.6331, + "step": 8059 + }, + { + "epoch": 0.43240343347639487, + "grad_norm": 0.451171875, + "learning_rate": 4.9582885923601564e-06, + "loss": 2.3698, + "step": 8060 + }, + { + "epoch": 0.4324570815450644, + "grad_norm": 0.400390625, + "learning_rate": 4.958272787293279e-06, + "loss": 2.4708, + "step": 8061 + }, + { + "epoch": 0.4325107296137339, + "grad_norm": 0.408203125, + "learning_rate": 4.958256979257782e-06, + "loss": 2.0744, + "step": 8062 + }, + { + "epoch": 0.4325643776824034, + "grad_norm": 0.4609375, + "learning_rate": 4.9582411682536835e-06, + "loss": 2.3632, + "step": 8063 + }, + { + "epoch": 0.43261802575107294, + "grad_norm": 0.396484375, + "learning_rate": 4.9582253542810045e-06, + "loss": 2.1996, + "step": 8064 + }, + { + "epoch": 0.4326716738197425, + "grad_norm": 0.435546875, + "learning_rate": 4.958209537339764e-06, + "loss": 2.0826, + "step": 8065 + }, + { + "epoch": 0.432725321888412, + "grad_norm": 0.455078125, + "learning_rate": 4.9581937174299795e-06, + "loss": 2.2765, + "step": 8066 + }, + { + "epoch": 0.43277896995708154, + "grad_norm": 0.421875, + "learning_rate": 4.958177894551672e-06, + "loss": 2.3064, + "step": 8067 + }, + { + "epoch": 0.43283261802575107, + "grad_norm": 0.42578125, + "learning_rate": 4.958162068704859e-06, + "loss": 2.3817, + "step": 8068 + }, + { + "epoch": 0.4328862660944206, + "grad_norm": 0.462890625, + "learning_rate": 4.95814623988956e-06, + "loss": 2.3497, + "step": 8069 + }, + { + "epoch": 0.43293991416309013, + "grad_norm": 0.474609375, + "learning_rate": 4.958130408105795e-06, + "loss": 2.4784, + "step": 8070 + }, + { + "epoch": 0.43299356223175967, + "grad_norm": 0.7109375, + "learning_rate": 4.9581145733535815e-06, + "loss": 2.3539, + "step": 8071 + }, + { + "epoch": 0.4330472103004292, + "grad_norm": 0.38671875, + "learning_rate": 4.958098735632941e-06, + "loss": 2.3938, + "step": 8072 + }, + { + "epoch": 0.43310085836909873, + "grad_norm": 0.55078125, + "learning_rate": 4.958082894943891e-06, + "loss": 2.112, + "step": 8073 + }, + { + "epoch": 0.43315450643776826, + "grad_norm": 0.41015625, + "learning_rate": 4.9580670512864505e-06, + "loss": 2.1754, + "step": 8074 + }, + { + "epoch": 0.4332081545064378, + "grad_norm": 0.8828125, + "learning_rate": 4.958051204660639e-06, + "loss": 2.4471, + "step": 8075 + }, + { + "epoch": 0.43326180257510727, + "grad_norm": 0.42578125, + "learning_rate": 4.958035355066475e-06, + "loss": 2.2365, + "step": 8076 + }, + { + "epoch": 0.4333154506437768, + "grad_norm": 0.453125, + "learning_rate": 4.95801950250398e-06, + "loss": 2.3936, + "step": 8077 + }, + { + "epoch": 0.43336909871244633, + "grad_norm": 0.57421875, + "learning_rate": 4.958003646973169e-06, + "loss": 2.2648, + "step": 8078 + }, + { + "epoch": 0.43342274678111586, + "grad_norm": 0.65234375, + "learning_rate": 4.957987788474066e-06, + "loss": 2.3681, + "step": 8079 + }, + { + "epoch": 0.4334763948497854, + "grad_norm": 0.375, + "learning_rate": 4.957971927006687e-06, + "loss": 2.4586, + "step": 8080 + }, + { + "epoch": 0.4335300429184549, + "grad_norm": 0.54296875, + "learning_rate": 4.957956062571051e-06, + "loss": 2.2233, + "step": 8081 + }, + { + "epoch": 0.43358369098712446, + "grad_norm": 0.388671875, + "learning_rate": 4.957940195167179e-06, + "loss": 1.8253, + "step": 8082 + }, + { + "epoch": 0.433637339055794, + "grad_norm": 0.390625, + "learning_rate": 4.95792432479509e-06, + "loss": 2.1295, + "step": 8083 + }, + { + "epoch": 0.4336909871244635, + "grad_norm": 0.45703125, + "learning_rate": 4.957908451454801e-06, + "loss": 2.2982, + "step": 8084 + }, + { + "epoch": 0.43374463519313305, + "grad_norm": 0.45703125, + "learning_rate": 4.957892575146332e-06, + "loss": 2.5845, + "step": 8085 + }, + { + "epoch": 0.4337982832618026, + "grad_norm": 0.4609375, + "learning_rate": 4.957876695869703e-06, + "loss": 2.642, + "step": 8086 + }, + { + "epoch": 0.4338519313304721, + "grad_norm": 0.421875, + "learning_rate": 4.957860813624934e-06, + "loss": 2.2777, + "step": 8087 + }, + { + "epoch": 0.43390557939914165, + "grad_norm": 0.52734375, + "learning_rate": 4.957844928412042e-06, + "loss": 1.2979, + "step": 8088 + }, + { + "epoch": 0.4339592274678112, + "grad_norm": 2.8125, + "learning_rate": 4.957829040231048e-06, + "loss": 2.4542, + "step": 8089 + }, + { + "epoch": 0.4340128755364807, + "grad_norm": 0.341796875, + "learning_rate": 4.95781314908197e-06, + "loss": 2.1156, + "step": 8090 + }, + { + "epoch": 0.4340665236051502, + "grad_norm": 0.427734375, + "learning_rate": 4.957797254964829e-06, + "loss": 2.3465, + "step": 8091 + }, + { + "epoch": 0.4341201716738197, + "grad_norm": 0.4296875, + "learning_rate": 4.9577813578796406e-06, + "loss": 2.4259, + "step": 8092 + }, + { + "epoch": 0.43417381974248925, + "grad_norm": 0.70703125, + "learning_rate": 4.957765457826428e-06, + "loss": 1.9396, + "step": 8093 + }, + { + "epoch": 0.4342274678111588, + "grad_norm": 0.4140625, + "learning_rate": 4.957749554805208e-06, + "loss": 2.3744, + "step": 8094 + }, + { + "epoch": 0.4342811158798283, + "grad_norm": 0.380859375, + "learning_rate": 4.957733648816e-06, + "loss": 2.254, + "step": 8095 + }, + { + "epoch": 0.43433476394849785, + "grad_norm": 0.435546875, + "learning_rate": 4.957717739858824e-06, + "loss": 2.432, + "step": 8096 + }, + { + "epoch": 0.4343884120171674, + "grad_norm": 0.328125, + "learning_rate": 4.957701827933699e-06, + "loss": 2.236, + "step": 8097 + }, + { + "epoch": 0.4344420600858369, + "grad_norm": 0.427734375, + "learning_rate": 4.957685913040643e-06, + "loss": 2.1724, + "step": 8098 + }, + { + "epoch": 0.43449570815450644, + "grad_norm": 0.4296875, + "learning_rate": 4.957669995179677e-06, + "loss": 2.2898, + "step": 8099 + }, + { + "epoch": 0.434549356223176, + "grad_norm": 0.40234375, + "learning_rate": 4.957654074350821e-06, + "loss": 2.2533, + "step": 8100 + }, + { + "epoch": 0.4346030042918455, + "grad_norm": 0.52734375, + "learning_rate": 4.95763815055409e-06, + "loss": 2.2275, + "step": 8101 + }, + { + "epoch": 0.43465665236051504, + "grad_norm": 0.484375, + "learning_rate": 4.957622223789508e-06, + "loss": 2.244, + "step": 8102 + }, + { + "epoch": 0.43471030042918457, + "grad_norm": 0.388671875, + "learning_rate": 4.957606294057092e-06, + "loss": 2.2234, + "step": 8103 + }, + { + "epoch": 0.4347639484978541, + "grad_norm": 0.5390625, + "learning_rate": 4.957590361356861e-06, + "loss": 2.2021, + "step": 8104 + }, + { + "epoch": 0.4348175965665236, + "grad_norm": 3.734375, + "learning_rate": 4.957574425688834e-06, + "loss": 2.1746, + "step": 8105 + }, + { + "epoch": 0.4348712446351931, + "grad_norm": 0.478515625, + "learning_rate": 4.9575584870530315e-06, + "loss": 2.2655, + "step": 8106 + }, + { + "epoch": 0.43492489270386264, + "grad_norm": 0.5390625, + "learning_rate": 4.957542545449472e-06, + "loss": 2.3701, + "step": 8107 + }, + { + "epoch": 0.4349785407725322, + "grad_norm": 1.046875, + "learning_rate": 4.957526600878175e-06, + "loss": 2.2571, + "step": 8108 + }, + { + "epoch": 0.4350321888412017, + "grad_norm": 0.439453125, + "learning_rate": 4.9575106533391604e-06, + "loss": 2.3607, + "step": 8109 + }, + { + "epoch": 0.43508583690987124, + "grad_norm": 0.5078125, + "learning_rate": 4.957494702832446e-06, + "loss": 2.3361, + "step": 8110 + }, + { + "epoch": 0.43513948497854077, + "grad_norm": 0.328125, + "learning_rate": 4.957478749358052e-06, + "loss": 2.3465, + "step": 8111 + }, + { + "epoch": 0.4351931330472103, + "grad_norm": 0.392578125, + "learning_rate": 4.957462792915997e-06, + "loss": 2.238, + "step": 8112 + }, + { + "epoch": 0.43524678111587983, + "grad_norm": 0.423828125, + "learning_rate": 4.957446833506301e-06, + "loss": 2.2959, + "step": 8113 + }, + { + "epoch": 0.43530042918454936, + "grad_norm": 0.435546875, + "learning_rate": 4.957430871128983e-06, + "loss": 2.2202, + "step": 8114 + }, + { + "epoch": 0.4353540772532189, + "grad_norm": 0.390625, + "learning_rate": 4.957414905784062e-06, + "loss": 2.2341, + "step": 8115 + }, + { + "epoch": 0.43540772532188843, + "grad_norm": 0.40234375, + "learning_rate": 4.9573989374715585e-06, + "loss": 2.6308, + "step": 8116 + }, + { + "epoch": 0.43546137339055796, + "grad_norm": 0.55859375, + "learning_rate": 4.957382966191491e-06, + "loss": 2.2504, + "step": 8117 + }, + { + "epoch": 0.4355150214592275, + "grad_norm": 0.95703125, + "learning_rate": 4.957366991943877e-06, + "loss": 1.8291, + "step": 8118 + }, + { + "epoch": 0.435568669527897, + "grad_norm": 0.412109375, + "learning_rate": 4.957351014728739e-06, + "loss": 2.1994, + "step": 8119 + }, + { + "epoch": 0.4356223175965665, + "grad_norm": 0.455078125, + "learning_rate": 4.957335034546094e-06, + "loss": 2.2595, + "step": 8120 + }, + { + "epoch": 0.43567596566523603, + "grad_norm": 0.318359375, + "learning_rate": 4.957319051395961e-06, + "loss": 2.2829, + "step": 8121 + }, + { + "epoch": 0.43572961373390556, + "grad_norm": 0.392578125, + "learning_rate": 4.957303065278362e-06, + "loss": 2.1511, + "step": 8122 + }, + { + "epoch": 0.4357832618025751, + "grad_norm": 0.392578125, + "learning_rate": 4.957287076193313e-06, + "loss": 2.1771, + "step": 8123 + }, + { + "epoch": 0.4358369098712446, + "grad_norm": 1.40625, + "learning_rate": 4.957271084140837e-06, + "loss": 2.4061, + "step": 8124 + }, + { + "epoch": 0.43589055793991416, + "grad_norm": 0.375, + "learning_rate": 4.9572550891209495e-06, + "loss": 2.3211, + "step": 8125 + }, + { + "epoch": 0.4359442060085837, + "grad_norm": 0.46484375, + "learning_rate": 4.957239091133672e-06, + "loss": 2.0812, + "step": 8126 + }, + { + "epoch": 0.4359978540772532, + "grad_norm": 0.412109375, + "learning_rate": 4.957223090179023e-06, + "loss": 2.3271, + "step": 8127 + }, + { + "epoch": 0.43605150214592275, + "grad_norm": 0.47265625, + "learning_rate": 4.957207086257023e-06, + "loss": 2.5012, + "step": 8128 + }, + { + "epoch": 0.4361051502145923, + "grad_norm": 0.498046875, + "learning_rate": 4.95719107936769e-06, + "loss": 2.2459, + "step": 8129 + }, + { + "epoch": 0.4361587982832618, + "grad_norm": 0.341796875, + "learning_rate": 4.9571750695110444e-06, + "loss": 2.0812, + "step": 8130 + }, + { + "epoch": 0.43621244635193135, + "grad_norm": 0.54296875, + "learning_rate": 4.957159056687104e-06, + "loss": 2.2873, + "step": 8131 + }, + { + "epoch": 0.4362660944206009, + "grad_norm": 0.4765625, + "learning_rate": 4.95714304089589e-06, + "loss": 2.3767, + "step": 8132 + }, + { + "epoch": 0.4363197424892704, + "grad_norm": 0.5546875, + "learning_rate": 4.9571270221374204e-06, + "loss": 2.395, + "step": 8133 + }, + { + "epoch": 0.4363733905579399, + "grad_norm": 0.43359375, + "learning_rate": 4.957111000411715e-06, + "loss": 2.5504, + "step": 8134 + }, + { + "epoch": 0.4364270386266094, + "grad_norm": 0.353515625, + "learning_rate": 4.9570949757187935e-06, + "loss": 2.2811, + "step": 8135 + }, + { + "epoch": 0.43648068669527895, + "grad_norm": 0.38671875, + "learning_rate": 4.957078948058675e-06, + "loss": 2.2342, + "step": 8136 + }, + { + "epoch": 0.4365343347639485, + "grad_norm": 0.40234375, + "learning_rate": 4.957062917431378e-06, + "loss": 2.317, + "step": 8137 + }, + { + "epoch": 0.436587982832618, + "grad_norm": 0.51171875, + "learning_rate": 4.957046883836924e-06, + "loss": 1.9567, + "step": 8138 + }, + { + "epoch": 0.43664163090128755, + "grad_norm": 0.53515625, + "learning_rate": 4.957030847275329e-06, + "loss": 2.5461, + "step": 8139 + }, + { + "epoch": 0.4366952789699571, + "grad_norm": 0.427734375, + "learning_rate": 4.957014807746615e-06, + "loss": 2.123, + "step": 8140 + }, + { + "epoch": 0.4367489270386266, + "grad_norm": 0.443359375, + "learning_rate": 4.956998765250801e-06, + "loss": 2.3688, + "step": 8141 + }, + { + "epoch": 0.43680257510729614, + "grad_norm": 0.41015625, + "learning_rate": 4.9569827197879065e-06, + "loss": 2.104, + "step": 8142 + }, + { + "epoch": 0.4368562231759657, + "grad_norm": 0.396484375, + "learning_rate": 4.95696667135795e-06, + "loss": 2.3358, + "step": 8143 + }, + { + "epoch": 0.4369098712446352, + "grad_norm": 0.423828125, + "learning_rate": 4.956950619960952e-06, + "loss": 2.328, + "step": 8144 + }, + { + "epoch": 0.43696351931330474, + "grad_norm": 0.408203125, + "learning_rate": 4.95693456559693e-06, + "loss": 2.3218, + "step": 8145 + }, + { + "epoch": 0.43701716738197427, + "grad_norm": 0.59765625, + "learning_rate": 4.956918508265905e-06, + "loss": 2.3454, + "step": 8146 + }, + { + "epoch": 0.4370708154506438, + "grad_norm": 2.3125, + "learning_rate": 4.956902447967897e-06, + "loss": 2.3481, + "step": 8147 + }, + { + "epoch": 0.4371244635193133, + "grad_norm": 0.4140625, + "learning_rate": 4.9568863847029235e-06, + "loss": 2.3435, + "step": 8148 + }, + { + "epoch": 0.4371781115879828, + "grad_norm": 0.490234375, + "learning_rate": 4.956870318471005e-06, + "loss": 2.496, + "step": 8149 + }, + { + "epoch": 0.43723175965665234, + "grad_norm": 0.400390625, + "learning_rate": 4.9568542492721605e-06, + "loss": 2.1905, + "step": 8150 + }, + { + "epoch": 0.4372854077253219, + "grad_norm": 0.412109375, + "learning_rate": 4.95683817710641e-06, + "loss": 2.4712, + "step": 8151 + }, + { + "epoch": 0.4373390557939914, + "grad_norm": 0.62890625, + "learning_rate": 4.956822101973773e-06, + "loss": 2.3061, + "step": 8152 + }, + { + "epoch": 0.43739270386266094, + "grad_norm": 0.435546875, + "learning_rate": 4.956806023874267e-06, + "loss": 2.292, + "step": 8153 + }, + { + "epoch": 0.43744635193133047, + "grad_norm": 0.369140625, + "learning_rate": 4.9567899428079134e-06, + "loss": 2.2152, + "step": 8154 + }, + { + "epoch": 0.4375, + "grad_norm": 0.421875, + "learning_rate": 4.9567738587747314e-06, + "loss": 2.0381, + "step": 8155 + }, + { + "epoch": 0.43755364806866953, + "grad_norm": 0.416015625, + "learning_rate": 4.95675777177474e-06, + "loss": 2.1429, + "step": 8156 + }, + { + "epoch": 0.43760729613733906, + "grad_norm": 0.41796875, + "learning_rate": 4.956741681807959e-06, + "loss": 2.336, + "step": 8157 + }, + { + "epoch": 0.4376609442060086, + "grad_norm": 0.4140625, + "learning_rate": 4.956725588874407e-06, + "loss": 2.182, + "step": 8158 + }, + { + "epoch": 0.4377145922746781, + "grad_norm": 0.4140625, + "learning_rate": 4.956709492974104e-06, + "loss": 2.1457, + "step": 8159 + }, + { + "epoch": 0.43776824034334766, + "grad_norm": 0.37109375, + "learning_rate": 4.956693394107069e-06, + "loss": 2.07, + "step": 8160 + }, + { + "epoch": 0.4378218884120172, + "grad_norm": 0.40625, + "learning_rate": 4.956677292273323e-06, + "loss": 2.1196, + "step": 8161 + }, + { + "epoch": 0.4378755364806867, + "grad_norm": 0.435546875, + "learning_rate": 4.9566611874728835e-06, + "loss": 2.3341, + "step": 8162 + }, + { + "epoch": 0.4379291845493562, + "grad_norm": 0.396484375, + "learning_rate": 4.9566450797057706e-06, + "loss": 2.361, + "step": 8163 + }, + { + "epoch": 0.43798283261802573, + "grad_norm": 0.390625, + "learning_rate": 4.956628968972004e-06, + "loss": 2.5055, + "step": 8164 + }, + { + "epoch": 0.43803648068669526, + "grad_norm": 0.67578125, + "learning_rate": 4.9566128552716035e-06, + "loss": 2.4253, + "step": 8165 + }, + { + "epoch": 0.4380901287553648, + "grad_norm": 0.427734375, + "learning_rate": 4.9565967386045875e-06, + "loss": 2.2375, + "step": 8166 + }, + { + "epoch": 0.4381437768240343, + "grad_norm": 0.41015625, + "learning_rate": 4.956580618970977e-06, + "loss": 1.4941, + "step": 8167 + }, + { + "epoch": 0.43819742489270386, + "grad_norm": 0.38671875, + "learning_rate": 4.95656449637079e-06, + "loss": 2.2021, + "step": 8168 + }, + { + "epoch": 0.4382510729613734, + "grad_norm": 0.328125, + "learning_rate": 4.956548370804047e-06, + "loss": 1.6805, + "step": 8169 + }, + { + "epoch": 0.4383047210300429, + "grad_norm": 0.494140625, + "learning_rate": 4.956532242270766e-06, + "loss": 2.2044, + "step": 8170 + }, + { + "epoch": 0.43835836909871245, + "grad_norm": 0.41015625, + "learning_rate": 4.956516110770968e-06, + "loss": 2.481, + "step": 8171 + }, + { + "epoch": 0.438412017167382, + "grad_norm": 0.474609375, + "learning_rate": 4.956499976304672e-06, + "loss": 2.3914, + "step": 8172 + }, + { + "epoch": 0.4384656652360515, + "grad_norm": 0.49609375, + "learning_rate": 4.9564838388718975e-06, + "loss": 1.5323, + "step": 8173 + }, + { + "epoch": 0.43851931330472105, + "grad_norm": 0.41796875, + "learning_rate": 4.956467698472663e-06, + "loss": 2.2745, + "step": 8174 + }, + { + "epoch": 0.4385729613733906, + "grad_norm": 0.373046875, + "learning_rate": 4.95645155510699e-06, + "loss": 2.25, + "step": 8175 + }, + { + "epoch": 0.4386266094420601, + "grad_norm": 0.453125, + "learning_rate": 4.9564354087748964e-06, + "loss": 2.4289, + "step": 8176 + }, + { + "epoch": 0.4386802575107296, + "grad_norm": 0.3984375, + "learning_rate": 4.956419259476402e-06, + "loss": 2.2181, + "step": 8177 + }, + { + "epoch": 0.4387339055793991, + "grad_norm": 0.419921875, + "learning_rate": 4.956403107211527e-06, + "loss": 2.2624, + "step": 8178 + }, + { + "epoch": 0.43878755364806865, + "grad_norm": 0.451171875, + "learning_rate": 4.95638695198029e-06, + "loss": 2.282, + "step": 8179 + }, + { + "epoch": 0.4388412017167382, + "grad_norm": 5.34375, + "learning_rate": 4.956370793782711e-06, + "loss": 2.2885, + "step": 8180 + }, + { + "epoch": 0.4388948497854077, + "grad_norm": 0.36328125, + "learning_rate": 4.9563546326188095e-06, + "loss": 1.9653, + "step": 8181 + }, + { + "epoch": 0.43894849785407725, + "grad_norm": 0.388671875, + "learning_rate": 4.9563384684886055e-06, + "loss": 1.8764, + "step": 8182 + }, + { + "epoch": 0.4390021459227468, + "grad_norm": 0.423828125, + "learning_rate": 4.956322301392117e-06, + "loss": 2.4123, + "step": 8183 + }, + { + "epoch": 0.4390557939914163, + "grad_norm": 1.1171875, + "learning_rate": 4.956306131329365e-06, + "loss": 2.3141, + "step": 8184 + }, + { + "epoch": 0.43910944206008584, + "grad_norm": 0.53515625, + "learning_rate": 4.9562899583003686e-06, + "loss": 2.4223, + "step": 8185 + }, + { + "epoch": 0.4391630901287554, + "grad_norm": 0.455078125, + "learning_rate": 4.9562737823051464e-06, + "loss": 2.5589, + "step": 8186 + }, + { + "epoch": 0.4392167381974249, + "grad_norm": 0.66015625, + "learning_rate": 4.95625760334372e-06, + "loss": 2.4173, + "step": 8187 + }, + { + "epoch": 0.43927038626609444, + "grad_norm": 0.4375, + "learning_rate": 4.956241421416107e-06, + "loss": 2.4242, + "step": 8188 + }, + { + "epoch": 0.43932403433476397, + "grad_norm": 0.451171875, + "learning_rate": 4.956225236522328e-06, + "loss": 2.3816, + "step": 8189 + }, + { + "epoch": 0.4393776824034335, + "grad_norm": 0.59765625, + "learning_rate": 4.956209048662402e-06, + "loss": 2.4124, + "step": 8190 + }, + { + "epoch": 0.439431330472103, + "grad_norm": 0.41015625, + "learning_rate": 4.956192857836348e-06, + "loss": 2.4362, + "step": 8191 + }, + { + "epoch": 0.4394849785407725, + "grad_norm": 0.3984375, + "learning_rate": 4.956176664044188e-06, + "loss": 2.0693, + "step": 8192 + }, + { + "epoch": 0.43953862660944204, + "grad_norm": 0.41796875, + "learning_rate": 4.956160467285939e-06, + "loss": 2.2607, + "step": 8193 + }, + { + "epoch": 0.43959227467811157, + "grad_norm": 0.42578125, + "learning_rate": 4.9561442675616206e-06, + "loss": 2.2659, + "step": 8194 + }, + { + "epoch": 0.4396459227467811, + "grad_norm": 0.5, + "learning_rate": 4.956128064871254e-06, + "loss": 2.4447, + "step": 8195 + }, + { + "epoch": 0.43969957081545064, + "grad_norm": 0.4140625, + "learning_rate": 4.956111859214857e-06, + "loss": 2.3005, + "step": 8196 + }, + { + "epoch": 0.43975321888412017, + "grad_norm": 0.427734375, + "learning_rate": 4.956095650592452e-06, + "loss": 2.2744, + "step": 8197 + }, + { + "epoch": 0.4398068669527897, + "grad_norm": 0.4453125, + "learning_rate": 4.956079439004056e-06, + "loss": 2.1765, + "step": 8198 + }, + { + "epoch": 0.43986051502145923, + "grad_norm": 0.58203125, + "learning_rate": 4.956063224449688e-06, + "loss": 2.2596, + "step": 8199 + }, + { + "epoch": 0.43991416309012876, + "grad_norm": 0.349609375, + "learning_rate": 4.95604700692937e-06, + "loss": 2.1189, + "step": 8200 + }, + { + "epoch": 0.4399678111587983, + "grad_norm": 0.421875, + "learning_rate": 4.95603078644312e-06, + "loss": 2.256, + "step": 8201 + }, + { + "epoch": 0.4400214592274678, + "grad_norm": 0.408203125, + "learning_rate": 4.956014562990958e-06, + "loss": 2.2661, + "step": 8202 + }, + { + "epoch": 0.44007510729613736, + "grad_norm": 0.431640625, + "learning_rate": 4.955998336572904e-06, + "loss": 2.4287, + "step": 8203 + }, + { + "epoch": 0.4401287553648069, + "grad_norm": 0.61328125, + "learning_rate": 4.9559821071889765e-06, + "loss": 2.3185, + "step": 8204 + }, + { + "epoch": 0.4401824034334764, + "grad_norm": 0.392578125, + "learning_rate": 4.955965874839197e-06, + "loss": 2.2778, + "step": 8205 + }, + { + "epoch": 0.4402360515021459, + "grad_norm": 0.5546875, + "learning_rate": 4.955949639523584e-06, + "loss": 2.5401, + "step": 8206 + }, + { + "epoch": 0.44028969957081543, + "grad_norm": 0.8125, + "learning_rate": 4.9559334012421555e-06, + "loss": 2.4611, + "step": 8207 + }, + { + "epoch": 0.44034334763948496, + "grad_norm": 0.51171875, + "learning_rate": 4.955917159994934e-06, + "loss": 2.31, + "step": 8208 + }, + { + "epoch": 0.4403969957081545, + "grad_norm": 0.455078125, + "learning_rate": 4.955900915781936e-06, + "loss": 2.5952, + "step": 8209 + }, + { + "epoch": 0.440450643776824, + "grad_norm": 0.408203125, + "learning_rate": 4.955884668603185e-06, + "loss": 2.3636, + "step": 8210 + }, + { + "epoch": 0.44050429184549356, + "grad_norm": 0.451171875, + "learning_rate": 4.955868418458697e-06, + "loss": 2.4107, + "step": 8211 + }, + { + "epoch": 0.4405579399141631, + "grad_norm": 0.51171875, + "learning_rate": 4.955852165348493e-06, + "loss": 2.2921, + "step": 8212 + }, + { + "epoch": 0.4406115879828326, + "grad_norm": 0.373046875, + "learning_rate": 4.955835909272594e-06, + "loss": 2.4004, + "step": 8213 + }, + { + "epoch": 0.44066523605150215, + "grad_norm": 0.388671875, + "learning_rate": 4.955819650231017e-06, + "loss": 2.0766, + "step": 8214 + }, + { + "epoch": 0.4407188841201717, + "grad_norm": 0.431640625, + "learning_rate": 4.955803388223784e-06, + "loss": 2.3672, + "step": 8215 + }, + { + "epoch": 0.4407725321888412, + "grad_norm": 0.431640625, + "learning_rate": 4.955787123250912e-06, + "loss": 2.0957, + "step": 8216 + }, + { + "epoch": 0.44082618025751075, + "grad_norm": 0.416015625, + "learning_rate": 4.955770855312424e-06, + "loss": 2.2711, + "step": 8217 + }, + { + "epoch": 0.4408798283261803, + "grad_norm": 0.59375, + "learning_rate": 4.955754584408337e-06, + "loss": 2.2994, + "step": 8218 + }, + { + "epoch": 0.4409334763948498, + "grad_norm": 0.44921875, + "learning_rate": 4.955738310538672e-06, + "loss": 2.1133, + "step": 8219 + }, + { + "epoch": 0.4409871244635193, + "grad_norm": 0.4375, + "learning_rate": 4.955722033703449e-06, + "loss": 2.0573, + "step": 8220 + }, + { + "epoch": 0.4410407725321888, + "grad_norm": 0.484375, + "learning_rate": 4.955705753902686e-06, + "loss": 2.6117, + "step": 8221 + }, + { + "epoch": 0.44109442060085835, + "grad_norm": 0.384765625, + "learning_rate": 4.955689471136403e-06, + "loss": 2.2723, + "step": 8222 + }, + { + "epoch": 0.4411480686695279, + "grad_norm": 5.09375, + "learning_rate": 4.955673185404621e-06, + "loss": 2.3369, + "step": 8223 + }, + { + "epoch": 0.4412017167381974, + "grad_norm": 0.4140625, + "learning_rate": 4.955656896707359e-06, + "loss": 2.6603, + "step": 8224 + }, + { + "epoch": 0.44125536480686695, + "grad_norm": 0.404296875, + "learning_rate": 4.955640605044636e-06, + "loss": 2.4412, + "step": 8225 + }, + { + "epoch": 0.4413090128755365, + "grad_norm": 0.3984375, + "learning_rate": 4.9556243104164725e-06, + "loss": 2.2734, + "step": 8226 + }, + { + "epoch": 0.441362660944206, + "grad_norm": 0.53515625, + "learning_rate": 4.955608012822887e-06, + "loss": 2.3376, + "step": 8227 + }, + { + "epoch": 0.44141630901287554, + "grad_norm": 0.3984375, + "learning_rate": 4.955591712263901e-06, + "loss": 2.2491, + "step": 8228 + }, + { + "epoch": 0.4414699570815451, + "grad_norm": 0.42578125, + "learning_rate": 4.955575408739533e-06, + "loss": 2.316, + "step": 8229 + }, + { + "epoch": 0.4415236051502146, + "grad_norm": 0.384765625, + "learning_rate": 4.955559102249803e-06, + "loss": 2.2169, + "step": 8230 + }, + { + "epoch": 0.44157725321888414, + "grad_norm": 0.470703125, + "learning_rate": 4.955542792794731e-06, + "loss": 2.4944, + "step": 8231 + }, + { + "epoch": 0.44163090128755367, + "grad_norm": 0.474609375, + "learning_rate": 4.955526480374335e-06, + "loss": 2.4031, + "step": 8232 + }, + { + "epoch": 0.4416845493562232, + "grad_norm": 0.458984375, + "learning_rate": 4.955510164988638e-06, + "loss": 2.0629, + "step": 8233 + }, + { + "epoch": 0.44173819742489273, + "grad_norm": 0.392578125, + "learning_rate": 4.955493846637656e-06, + "loss": 2.4796, + "step": 8234 + }, + { + "epoch": 0.4417918454935622, + "grad_norm": 0.6796875, + "learning_rate": 4.955477525321411e-06, + "loss": 2.1923, + "step": 8235 + }, + { + "epoch": 0.44184549356223174, + "grad_norm": 0.4609375, + "learning_rate": 4.955461201039922e-06, + "loss": 2.2051, + "step": 8236 + }, + { + "epoch": 0.44189914163090127, + "grad_norm": 0.412109375, + "learning_rate": 4.9554448737932096e-06, + "loss": 2.3696, + "step": 8237 + }, + { + "epoch": 0.4419527896995708, + "grad_norm": 1.71875, + "learning_rate": 4.955428543581292e-06, + "loss": 2.4745, + "step": 8238 + }, + { + "epoch": 0.44200643776824033, + "grad_norm": 0.37890625, + "learning_rate": 4.95541221040419e-06, + "loss": 2.124, + "step": 8239 + }, + { + "epoch": 0.44206008583690987, + "grad_norm": 0.4375, + "learning_rate": 4.955395874261923e-06, + "loss": 2.177, + "step": 8240 + }, + { + "epoch": 0.4421137339055794, + "grad_norm": 0.419921875, + "learning_rate": 4.955379535154511e-06, + "loss": 2.3753, + "step": 8241 + }, + { + "epoch": 0.44216738197424893, + "grad_norm": 0.431640625, + "learning_rate": 4.955363193081972e-06, + "loss": 2.3628, + "step": 8242 + }, + { + "epoch": 0.44222103004291846, + "grad_norm": 0.41015625, + "learning_rate": 4.955346848044329e-06, + "loss": 2.5075, + "step": 8243 + }, + { + "epoch": 0.442274678111588, + "grad_norm": 0.419921875, + "learning_rate": 4.955330500041599e-06, + "loss": 1.7121, + "step": 8244 + }, + { + "epoch": 0.4423283261802575, + "grad_norm": 0.466796875, + "learning_rate": 4.955314149073803e-06, + "loss": 2.2941, + "step": 8245 + }, + { + "epoch": 0.44238197424892706, + "grad_norm": 0.451171875, + "learning_rate": 4.955297795140961e-06, + "loss": 2.2701, + "step": 8246 + }, + { + "epoch": 0.4424356223175966, + "grad_norm": 0.439453125, + "learning_rate": 4.955281438243091e-06, + "loss": 2.4153, + "step": 8247 + }, + { + "epoch": 0.4424892703862661, + "grad_norm": 0.44140625, + "learning_rate": 4.955265078380215e-06, + "loss": 2.3381, + "step": 8248 + }, + { + "epoch": 0.4425429184549356, + "grad_norm": 0.54296875, + "learning_rate": 4.955248715552351e-06, + "loss": 2.5514, + "step": 8249 + }, + { + "epoch": 0.44259656652360513, + "grad_norm": 0.5390625, + "learning_rate": 4.955232349759519e-06, + "loss": 2.0119, + "step": 8250 + }, + { + "epoch": 0.44265021459227466, + "grad_norm": 0.490234375, + "learning_rate": 4.9552159810017405e-06, + "loss": 2.128, + "step": 8251 + }, + { + "epoch": 0.4427038626609442, + "grad_norm": 0.515625, + "learning_rate": 4.955199609279033e-06, + "loss": 2.2129, + "step": 8252 + }, + { + "epoch": 0.4427575107296137, + "grad_norm": 0.392578125, + "learning_rate": 4.955183234591418e-06, + "loss": 2.1162, + "step": 8253 + }, + { + "epoch": 0.44281115879828326, + "grad_norm": 0.408203125, + "learning_rate": 4.9551668569389145e-06, + "loss": 2.2701, + "step": 8254 + }, + { + "epoch": 0.4428648068669528, + "grad_norm": 0.423828125, + "learning_rate": 4.955150476321541e-06, + "loss": 2.2866, + "step": 8255 + }, + { + "epoch": 0.4429184549356223, + "grad_norm": 0.4765625, + "learning_rate": 4.95513409273932e-06, + "loss": 2.2879, + "step": 8256 + }, + { + "epoch": 0.44297210300429185, + "grad_norm": 0.45703125, + "learning_rate": 4.9551177061922696e-06, + "loss": 2.2668, + "step": 8257 + }, + { + "epoch": 0.4430257510729614, + "grad_norm": 0.4140625, + "learning_rate": 4.95510131668041e-06, + "loss": 2.3721, + "step": 8258 + }, + { + "epoch": 0.4430793991416309, + "grad_norm": 3.28125, + "learning_rate": 4.95508492420376e-06, + "loss": 2.3567, + "step": 8259 + }, + { + "epoch": 0.44313304721030045, + "grad_norm": 0.46484375, + "learning_rate": 4.955068528762341e-06, + "loss": 1.5905, + "step": 8260 + }, + { + "epoch": 0.44318669527897, + "grad_norm": 0.361328125, + "learning_rate": 4.955052130356171e-06, + "loss": 2.1415, + "step": 8261 + }, + { + "epoch": 0.4432403433476395, + "grad_norm": 0.3515625, + "learning_rate": 4.955035728985272e-06, + "loss": 2.2501, + "step": 8262 + }, + { + "epoch": 0.443293991416309, + "grad_norm": 0.388671875, + "learning_rate": 4.955019324649663e-06, + "loss": 2.1241, + "step": 8263 + }, + { + "epoch": 0.4433476394849785, + "grad_norm": 0.431640625, + "learning_rate": 4.955002917349362e-06, + "loss": 2.461, + "step": 8264 + }, + { + "epoch": 0.44340128755364805, + "grad_norm": 0.44921875, + "learning_rate": 4.954986507084391e-06, + "loss": 2.4867, + "step": 8265 + }, + { + "epoch": 0.4434549356223176, + "grad_norm": 3.234375, + "learning_rate": 4.9549700938547696e-06, + "loss": 2.1209, + "step": 8266 + }, + { + "epoch": 0.4435085836909871, + "grad_norm": 0.40625, + "learning_rate": 4.954953677660516e-06, + "loss": 2.3674, + "step": 8267 + }, + { + "epoch": 0.44356223175965664, + "grad_norm": 0.451171875, + "learning_rate": 4.954937258501652e-06, + "loss": 2.4139, + "step": 8268 + }, + { + "epoch": 0.4436158798283262, + "grad_norm": 0.404296875, + "learning_rate": 4.954920836378196e-06, + "loss": 2.6326, + "step": 8269 + }, + { + "epoch": 0.4436695278969957, + "grad_norm": 0.447265625, + "learning_rate": 4.954904411290169e-06, + "loss": 2.4444, + "step": 8270 + }, + { + "epoch": 0.44372317596566524, + "grad_norm": 0.42578125, + "learning_rate": 4.95488798323759e-06, + "loss": 2.4628, + "step": 8271 + }, + { + "epoch": 0.44377682403433477, + "grad_norm": 0.455078125, + "learning_rate": 4.954871552220479e-06, + "loss": 2.427, + "step": 8272 + }, + { + "epoch": 0.4438304721030043, + "grad_norm": 0.41015625, + "learning_rate": 4.954855118238856e-06, + "loss": 2.4154, + "step": 8273 + }, + { + "epoch": 0.44388412017167383, + "grad_norm": 0.58203125, + "learning_rate": 4.954838681292741e-06, + "loss": 2.3887, + "step": 8274 + }, + { + "epoch": 0.44393776824034337, + "grad_norm": 0.408203125, + "learning_rate": 4.954822241382153e-06, + "loss": 2.3256, + "step": 8275 + }, + { + "epoch": 0.4439914163090129, + "grad_norm": 0.416015625, + "learning_rate": 4.954805798507113e-06, + "loss": 2.2242, + "step": 8276 + }, + { + "epoch": 0.44404506437768243, + "grad_norm": 0.447265625, + "learning_rate": 4.95478935266764e-06, + "loss": 2.1115, + "step": 8277 + }, + { + "epoch": 0.4440987124463519, + "grad_norm": 0.61328125, + "learning_rate": 4.954772903863753e-06, + "loss": 2.3902, + "step": 8278 + }, + { + "epoch": 0.44415236051502144, + "grad_norm": 0.416015625, + "learning_rate": 4.954756452095475e-06, + "loss": 2.3317, + "step": 8279 + }, + { + "epoch": 0.44420600858369097, + "grad_norm": 0.89453125, + "learning_rate": 4.954739997362824e-06, + "loss": 2.2889, + "step": 8280 + }, + { + "epoch": 0.4442596566523605, + "grad_norm": 0.451171875, + "learning_rate": 4.954723539665819e-06, + "loss": 2.402, + "step": 8281 + }, + { + "epoch": 0.44431330472103003, + "grad_norm": 0.376953125, + "learning_rate": 4.95470707900448e-06, + "loss": 2.1859, + "step": 8282 + }, + { + "epoch": 0.44436695278969957, + "grad_norm": 0.515625, + "learning_rate": 4.954690615378828e-06, + "loss": 2.5645, + "step": 8283 + }, + { + "epoch": 0.4444206008583691, + "grad_norm": 0.390625, + "learning_rate": 4.954674148788883e-06, + "loss": 2.3554, + "step": 8284 + }, + { + "epoch": 0.44447424892703863, + "grad_norm": 0.494140625, + "learning_rate": 4.954657679234664e-06, + "loss": 2.3413, + "step": 8285 + }, + { + "epoch": 0.44452789699570816, + "grad_norm": 0.451171875, + "learning_rate": 4.954641206716191e-06, + "loss": 2.3518, + "step": 8286 + }, + { + "epoch": 0.4445815450643777, + "grad_norm": 0.4765625, + "learning_rate": 4.954624731233485e-06, + "loss": 2.5301, + "step": 8287 + }, + { + "epoch": 0.4446351931330472, + "grad_norm": 0.419921875, + "learning_rate": 4.954608252786564e-06, + "loss": 2.226, + "step": 8288 + }, + { + "epoch": 0.44468884120171676, + "grad_norm": 0.45703125, + "learning_rate": 4.954591771375449e-06, + "loss": 2.4196, + "step": 8289 + }, + { + "epoch": 0.4447424892703863, + "grad_norm": 0.384765625, + "learning_rate": 4.95457528700016e-06, + "loss": 2.6264, + "step": 8290 + }, + { + "epoch": 0.4447961373390558, + "grad_norm": 0.3671875, + "learning_rate": 4.9545587996607155e-06, + "loss": 2.3125, + "step": 8291 + }, + { + "epoch": 0.4448497854077253, + "grad_norm": 0.427734375, + "learning_rate": 4.954542309357138e-06, + "loss": 2.2748, + "step": 8292 + }, + { + "epoch": 0.4449034334763948, + "grad_norm": 0.41015625, + "learning_rate": 4.954525816089445e-06, + "loss": 2.2501, + "step": 8293 + }, + { + "epoch": 0.44495708154506436, + "grad_norm": 1.1171875, + "learning_rate": 4.954509319857659e-06, + "loss": 2.1654, + "step": 8294 + }, + { + "epoch": 0.4450107296137339, + "grad_norm": 0.435546875, + "learning_rate": 4.954492820661797e-06, + "loss": 1.766, + "step": 8295 + }, + { + "epoch": 0.4450643776824034, + "grad_norm": 0.396484375, + "learning_rate": 4.954476318501881e-06, + "loss": 2.244, + "step": 8296 + }, + { + "epoch": 0.44511802575107295, + "grad_norm": 0.39453125, + "learning_rate": 4.9544598133779296e-06, + "loss": 2.3952, + "step": 8297 + }, + { + "epoch": 0.4451716738197425, + "grad_norm": 0.54296875, + "learning_rate": 4.954443305289963e-06, + "loss": 2.1108, + "step": 8298 + }, + { + "epoch": 0.445225321888412, + "grad_norm": 0.361328125, + "learning_rate": 4.954426794238002e-06, + "loss": 2.2084, + "step": 8299 + }, + { + "epoch": 0.44527896995708155, + "grad_norm": 0.369140625, + "learning_rate": 4.954410280222066e-06, + "loss": 2.2551, + "step": 8300 + }, + { + "epoch": 0.4453326180257511, + "grad_norm": 0.384765625, + "learning_rate": 4.954393763242176e-06, + "loss": 2.3704, + "step": 8301 + }, + { + "epoch": 0.4453862660944206, + "grad_norm": 1.6015625, + "learning_rate": 4.95437724329835e-06, + "loss": 2.3278, + "step": 8302 + }, + { + "epoch": 0.44543991416309014, + "grad_norm": 0.421875, + "learning_rate": 4.954360720390608e-06, + "loss": 2.0803, + "step": 8303 + }, + { + "epoch": 0.4454935622317597, + "grad_norm": 0.4765625, + "learning_rate": 4.954344194518972e-06, + "loss": 2.5164, + "step": 8304 + }, + { + "epoch": 0.4455472103004292, + "grad_norm": 0.390625, + "learning_rate": 4.954327665683459e-06, + "loss": 2.4894, + "step": 8305 + }, + { + "epoch": 0.44560085836909874, + "grad_norm": 0.486328125, + "learning_rate": 4.954311133884093e-06, + "loss": 2.4627, + "step": 8306 + }, + { + "epoch": 0.4456545064377682, + "grad_norm": 0.37109375, + "learning_rate": 4.95429459912089e-06, + "loss": 2.2476, + "step": 8307 + }, + { + "epoch": 0.44570815450643775, + "grad_norm": 0.470703125, + "learning_rate": 4.954278061393872e-06, + "loss": 2.1827, + "step": 8308 + }, + { + "epoch": 0.4457618025751073, + "grad_norm": 0.41796875, + "learning_rate": 4.95426152070306e-06, + "loss": 2.4584, + "step": 8309 + }, + { + "epoch": 0.4458154506437768, + "grad_norm": 0.42578125, + "learning_rate": 4.954244977048471e-06, + "loss": 2.5153, + "step": 8310 + }, + { + "epoch": 0.44586909871244634, + "grad_norm": 0.546875, + "learning_rate": 4.954228430430127e-06, + "loss": 2.2184, + "step": 8311 + }, + { + "epoch": 0.4459227467811159, + "grad_norm": 0.41796875, + "learning_rate": 4.9542118808480475e-06, + "loss": 2.4153, + "step": 8312 + }, + { + "epoch": 0.4459763948497854, + "grad_norm": 0.42578125, + "learning_rate": 4.954195328302253e-06, + "loss": 2.3311, + "step": 8313 + }, + { + "epoch": 0.44603004291845494, + "grad_norm": 0.416015625, + "learning_rate": 4.954178772792762e-06, + "loss": 1.9482, + "step": 8314 + }, + { + "epoch": 0.44608369098712447, + "grad_norm": 0.69140625, + "learning_rate": 4.954162214319597e-06, + "loss": 2.2178, + "step": 8315 + }, + { + "epoch": 0.446137339055794, + "grad_norm": 0.4453125, + "learning_rate": 4.954145652882776e-06, + "loss": 2.2251, + "step": 8316 + }, + { + "epoch": 0.44619098712446353, + "grad_norm": 0.51171875, + "learning_rate": 4.954129088482319e-06, + "loss": 1.388, + "step": 8317 + }, + { + "epoch": 0.44624463519313307, + "grad_norm": 0.47265625, + "learning_rate": 4.954112521118246e-06, + "loss": 2.3695, + "step": 8318 + }, + { + "epoch": 0.4462982832618026, + "grad_norm": 0.51171875, + "learning_rate": 4.954095950790579e-06, + "loss": 2.292, + "step": 8319 + }, + { + "epoch": 0.44635193133047213, + "grad_norm": 0.345703125, + "learning_rate": 4.954079377499336e-06, + "loss": 2.3354, + "step": 8320 + }, + { + "epoch": 0.4464055793991416, + "grad_norm": 0.59765625, + "learning_rate": 4.9540628012445365e-06, + "loss": 2.0463, + "step": 8321 + }, + { + "epoch": 0.44645922746781114, + "grad_norm": 0.37109375, + "learning_rate": 4.9540462220262035e-06, + "loss": 2.4264, + "step": 8322 + }, + { + "epoch": 0.44651287553648067, + "grad_norm": 0.458984375, + "learning_rate": 4.954029639844353e-06, + "loss": 2.3109, + "step": 8323 + }, + { + "epoch": 0.4465665236051502, + "grad_norm": 1.2578125, + "learning_rate": 4.9540130546990085e-06, + "loss": 2.1778, + "step": 8324 + }, + { + "epoch": 0.44662017167381973, + "grad_norm": 0.4921875, + "learning_rate": 4.953996466590189e-06, + "loss": 2.3007, + "step": 8325 + }, + { + "epoch": 0.44667381974248926, + "grad_norm": 0.44921875, + "learning_rate": 4.953979875517913e-06, + "loss": 2.0884, + "step": 8326 + }, + { + "epoch": 0.4467274678111588, + "grad_norm": 0.396484375, + "learning_rate": 4.953963281482201e-06, + "loss": 2.2319, + "step": 8327 + }, + { + "epoch": 0.4467811158798283, + "grad_norm": 0.375, + "learning_rate": 4.953946684483075e-06, + "loss": 2.1237, + "step": 8328 + }, + { + "epoch": 0.44683476394849786, + "grad_norm": 0.43359375, + "learning_rate": 4.953930084520554e-06, + "loss": 2.3998, + "step": 8329 + }, + { + "epoch": 0.4468884120171674, + "grad_norm": 0.333984375, + "learning_rate": 4.953913481594658e-06, + "loss": 2.1827, + "step": 8330 + }, + { + "epoch": 0.4469420600858369, + "grad_norm": 0.451171875, + "learning_rate": 4.953896875705406e-06, + "loss": 2.4067, + "step": 8331 + }, + { + "epoch": 0.44699570815450645, + "grad_norm": 0.43359375, + "learning_rate": 4.953880266852819e-06, + "loss": 2.407, + "step": 8332 + }, + { + "epoch": 0.447049356223176, + "grad_norm": 0.494140625, + "learning_rate": 4.953863655036918e-06, + "loss": 1.4353, + "step": 8333 + }, + { + "epoch": 0.4471030042918455, + "grad_norm": 0.408203125, + "learning_rate": 4.953847040257721e-06, + "loss": 2.2054, + "step": 8334 + }, + { + "epoch": 0.447156652360515, + "grad_norm": 0.404296875, + "learning_rate": 4.95383042251525e-06, + "loss": 2.4311, + "step": 8335 + }, + { + "epoch": 0.4472103004291845, + "grad_norm": 1.265625, + "learning_rate": 4.953813801809523e-06, + "loss": 2.4202, + "step": 8336 + }, + { + "epoch": 0.44726394849785406, + "grad_norm": 0.31640625, + "learning_rate": 4.953797178140562e-06, + "loss": 2.2048, + "step": 8337 + }, + { + "epoch": 0.4473175965665236, + "grad_norm": 0.3359375, + "learning_rate": 4.953780551508387e-06, + "loss": 2.0654, + "step": 8338 + }, + { + "epoch": 0.4473712446351931, + "grad_norm": 0.39453125, + "learning_rate": 4.953763921913017e-06, + "loss": 2.3469, + "step": 8339 + }, + { + "epoch": 0.44742489270386265, + "grad_norm": 0.3984375, + "learning_rate": 4.953747289354471e-06, + "loss": 2.2317, + "step": 8340 + }, + { + "epoch": 0.4474785407725322, + "grad_norm": 0.443359375, + "learning_rate": 4.953730653832772e-06, + "loss": 2.3318, + "step": 8341 + }, + { + "epoch": 0.4475321888412017, + "grad_norm": 0.41796875, + "learning_rate": 4.9537140153479385e-06, + "loss": 1.9569, + "step": 8342 + }, + { + "epoch": 0.44758583690987125, + "grad_norm": 0.478515625, + "learning_rate": 4.9536973738999905e-06, + "loss": 2.3773, + "step": 8343 + }, + { + "epoch": 0.4476394849785408, + "grad_norm": 0.388671875, + "learning_rate": 4.9536807294889485e-06, + "loss": 2.1725, + "step": 8344 + }, + { + "epoch": 0.4476931330472103, + "grad_norm": 0.5, + "learning_rate": 4.953664082114833e-06, + "loss": 2.2599, + "step": 8345 + }, + { + "epoch": 0.44774678111587984, + "grad_norm": 0.419921875, + "learning_rate": 4.953647431777663e-06, + "loss": 2.4084, + "step": 8346 + }, + { + "epoch": 0.4478004291845494, + "grad_norm": 0.4375, + "learning_rate": 4.953630778477459e-06, + "loss": 2.4641, + "step": 8347 + }, + { + "epoch": 0.4478540772532189, + "grad_norm": 0.404296875, + "learning_rate": 4.9536141222142405e-06, + "loss": 2.3564, + "step": 8348 + }, + { + "epoch": 0.44790772532188844, + "grad_norm": 0.478515625, + "learning_rate": 4.95359746298803e-06, + "loss": 2.3456, + "step": 8349 + }, + { + "epoch": 0.4479613733905579, + "grad_norm": 0.421875, + "learning_rate": 4.953580800798845e-06, + "loss": 2.2241, + "step": 8350 + }, + { + "epoch": 0.44801502145922745, + "grad_norm": 0.5078125, + "learning_rate": 4.9535641356467075e-06, + "loss": 1.483, + "step": 8351 + }, + { + "epoch": 0.448068669527897, + "grad_norm": 0.67578125, + "learning_rate": 4.9535474675316364e-06, + "loss": 2.4433, + "step": 8352 + }, + { + "epoch": 0.4481223175965665, + "grad_norm": 0.41015625, + "learning_rate": 4.953530796453652e-06, + "loss": 2.3644, + "step": 8353 + }, + { + "epoch": 0.44817596566523604, + "grad_norm": 0.416015625, + "learning_rate": 4.953514122412775e-06, + "loss": 2.1661, + "step": 8354 + }, + { + "epoch": 0.4482296137339056, + "grad_norm": 0.359375, + "learning_rate": 4.953497445409025e-06, + "loss": 2.2327, + "step": 8355 + }, + { + "epoch": 0.4482832618025751, + "grad_norm": 0.3359375, + "learning_rate": 4.953480765442421e-06, + "loss": 2.237, + "step": 8356 + }, + { + "epoch": 0.44833690987124464, + "grad_norm": 2.140625, + "learning_rate": 4.953464082512986e-06, + "loss": 2.2048, + "step": 8357 + }, + { + "epoch": 0.44839055793991417, + "grad_norm": 0.423828125, + "learning_rate": 4.953447396620738e-06, + "loss": 2.2674, + "step": 8358 + }, + { + "epoch": 0.4484442060085837, + "grad_norm": 0.431640625, + "learning_rate": 4.953430707765698e-06, + "loss": 2.1966, + "step": 8359 + }, + { + "epoch": 0.44849785407725323, + "grad_norm": 1.3203125, + "learning_rate": 4.953414015947885e-06, + "loss": 2.1801, + "step": 8360 + }, + { + "epoch": 0.44855150214592276, + "grad_norm": 0.431640625, + "learning_rate": 4.953397321167322e-06, + "loss": 2.2445, + "step": 8361 + }, + { + "epoch": 0.4486051502145923, + "grad_norm": 0.44140625, + "learning_rate": 4.953380623424026e-06, + "loss": 2.3121, + "step": 8362 + }, + { + "epoch": 0.44865879828326183, + "grad_norm": 0.396484375, + "learning_rate": 4.953363922718017e-06, + "loss": 2.2877, + "step": 8363 + }, + { + "epoch": 0.4487124463519313, + "grad_norm": 1.8359375, + "learning_rate": 4.9533472190493184e-06, + "loss": 2.3953, + "step": 8364 + }, + { + "epoch": 0.44876609442060084, + "grad_norm": 0.5, + "learning_rate": 4.953330512417948e-06, + "loss": 2.1768, + "step": 8365 + }, + { + "epoch": 0.44881974248927037, + "grad_norm": 0.4140625, + "learning_rate": 4.953313802823926e-06, + "loss": 2.4529, + "step": 8366 + }, + { + "epoch": 0.4488733905579399, + "grad_norm": 0.48046875, + "learning_rate": 4.9532970902672735e-06, + "loss": 2.0977, + "step": 8367 + }, + { + "epoch": 0.44892703862660943, + "grad_norm": 0.44140625, + "learning_rate": 4.95328037474801e-06, + "loss": 2.198, + "step": 8368 + }, + { + "epoch": 0.44898068669527896, + "grad_norm": 0.4453125, + "learning_rate": 4.953263656266157e-06, + "loss": 2.1203, + "step": 8369 + }, + { + "epoch": 0.4490343347639485, + "grad_norm": 0.4375, + "learning_rate": 4.9532469348217326e-06, + "loss": 2.4532, + "step": 8370 + }, + { + "epoch": 0.449087982832618, + "grad_norm": 0.41796875, + "learning_rate": 4.9532302104147585e-06, + "loss": 2.2194, + "step": 8371 + }, + { + "epoch": 0.44914163090128756, + "grad_norm": 0.416015625, + "learning_rate": 4.953213483045254e-06, + "loss": 2.3743, + "step": 8372 + }, + { + "epoch": 0.4491952789699571, + "grad_norm": 0.41796875, + "learning_rate": 4.953196752713239e-06, + "loss": 2.2344, + "step": 8373 + }, + { + "epoch": 0.4492489270386266, + "grad_norm": 0.3828125, + "learning_rate": 4.953180019418736e-06, + "loss": 2.3767, + "step": 8374 + }, + { + "epoch": 0.44930257510729615, + "grad_norm": 0.7890625, + "learning_rate": 4.953163283161762e-06, + "loss": 2.0817, + "step": 8375 + }, + { + "epoch": 0.4493562231759657, + "grad_norm": 0.3671875, + "learning_rate": 4.95314654394234e-06, + "loss": 2.3762, + "step": 8376 + }, + { + "epoch": 0.4494098712446352, + "grad_norm": 0.388671875, + "learning_rate": 4.953129801760489e-06, + "loss": 2.2744, + "step": 8377 + }, + { + "epoch": 0.4494635193133047, + "grad_norm": 0.46484375, + "learning_rate": 4.953113056616229e-06, + "loss": 2.2785, + "step": 8378 + }, + { + "epoch": 0.4495171673819742, + "grad_norm": 0.33984375, + "learning_rate": 4.953096308509581e-06, + "loss": 2.2454, + "step": 8379 + }, + { + "epoch": 0.44957081545064376, + "grad_norm": 0.41015625, + "learning_rate": 4.953079557440563e-06, + "loss": 2.2996, + "step": 8380 + }, + { + "epoch": 0.4496244635193133, + "grad_norm": 0.3984375, + "learning_rate": 4.953062803409199e-06, + "loss": 2.2361, + "step": 8381 + }, + { + "epoch": 0.4496781115879828, + "grad_norm": 0.45703125, + "learning_rate": 4.953046046415506e-06, + "loss": 2.1911, + "step": 8382 + }, + { + "epoch": 0.44973175965665235, + "grad_norm": 0.6171875, + "learning_rate": 4.953029286459506e-06, + "loss": 2.2196, + "step": 8383 + }, + { + "epoch": 0.4497854077253219, + "grad_norm": 0.400390625, + "learning_rate": 4.953012523541217e-06, + "loss": 2.3037, + "step": 8384 + }, + { + "epoch": 0.4498390557939914, + "grad_norm": 0.470703125, + "learning_rate": 4.952995757660662e-06, + "loss": 1.6911, + "step": 8385 + }, + { + "epoch": 0.44989270386266095, + "grad_norm": 0.4296875, + "learning_rate": 4.952978988817861e-06, + "loss": 2.4309, + "step": 8386 + }, + { + "epoch": 0.4499463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.952962217012833e-06, + "loss": 2.4652, + "step": 8387 + }, + { + "epoch": 0.45, + "grad_norm": 0.40234375, + "learning_rate": 4.952945442245598e-06, + "loss": 2.3981, + "step": 8388 + }, + { + "epoch": 0.45005364806866954, + "grad_norm": 0.41015625, + "learning_rate": 4.952928664516177e-06, + "loss": 2.0191, + "step": 8389 + }, + { + "epoch": 0.4501072961373391, + "grad_norm": 0.4375, + "learning_rate": 4.9529118838245904e-06, + "loss": 2.28, + "step": 8390 + }, + { + "epoch": 0.4501609442060086, + "grad_norm": 1.0, + "learning_rate": 4.9528951001708585e-06, + "loss": 2.2511, + "step": 8391 + }, + { + "epoch": 0.45021459227467814, + "grad_norm": 0.466796875, + "learning_rate": 4.952878313555001e-06, + "loss": 1.4553, + "step": 8392 + }, + { + "epoch": 0.4502682403433476, + "grad_norm": 0.53515625, + "learning_rate": 4.952861523977038e-06, + "loss": 2.4859, + "step": 8393 + }, + { + "epoch": 0.45032188841201715, + "grad_norm": 0.3984375, + "learning_rate": 4.952844731436991e-06, + "loss": 2.1288, + "step": 8394 + }, + { + "epoch": 0.4503755364806867, + "grad_norm": 0.396484375, + "learning_rate": 4.952827935934878e-06, + "loss": 2.2539, + "step": 8395 + }, + { + "epoch": 0.4504291845493562, + "grad_norm": 0.373046875, + "learning_rate": 4.952811137470724e-06, + "loss": 2.1037, + "step": 8396 + }, + { + "epoch": 0.45048283261802574, + "grad_norm": 0.41796875, + "learning_rate": 4.952794336044543e-06, + "loss": 2.2161, + "step": 8397 + }, + { + "epoch": 0.4505364806866953, + "grad_norm": 0.453125, + "learning_rate": 4.952777531656359e-06, + "loss": 2.4118, + "step": 8398 + }, + { + "epoch": 0.4505901287553648, + "grad_norm": 0.380859375, + "learning_rate": 4.952760724306193e-06, + "loss": 2.3676, + "step": 8399 + }, + { + "epoch": 0.45064377682403434, + "grad_norm": 0.421875, + "learning_rate": 4.952743913994062e-06, + "loss": 2.3102, + "step": 8400 + }, + { + "epoch": 0.45069742489270387, + "grad_norm": 0.474609375, + "learning_rate": 4.9527271007199895e-06, + "loss": 2.361, + "step": 8401 + }, + { + "epoch": 0.4507510729613734, + "grad_norm": 0.470703125, + "learning_rate": 4.952710284483995e-06, + "loss": 2.4784, + "step": 8402 + }, + { + "epoch": 0.45080472103004293, + "grad_norm": 0.416015625, + "learning_rate": 4.952693465286097e-06, + "loss": 2.4767, + "step": 8403 + }, + { + "epoch": 0.45085836909871246, + "grad_norm": 0.443359375, + "learning_rate": 4.952676643126318e-06, + "loss": 2.268, + "step": 8404 + }, + { + "epoch": 0.450912017167382, + "grad_norm": 0.37890625, + "learning_rate": 4.952659818004677e-06, + "loss": 2.2219, + "step": 8405 + }, + { + "epoch": 0.4509656652360515, + "grad_norm": 0.427734375, + "learning_rate": 4.9526429899211956e-06, + "loss": 2.3636, + "step": 8406 + }, + { + "epoch": 0.451019313304721, + "grad_norm": 0.41796875, + "learning_rate": 4.952626158875893e-06, + "loss": 2.329, + "step": 8407 + }, + { + "epoch": 0.45107296137339054, + "grad_norm": 5.15625, + "learning_rate": 4.952609324868789e-06, + "loss": 2.3105, + "step": 8408 + }, + { + "epoch": 0.45112660944206007, + "grad_norm": 0.361328125, + "learning_rate": 4.952592487899906e-06, + "loss": 2.3801, + "step": 8409 + }, + { + "epoch": 0.4511802575107296, + "grad_norm": 0.431640625, + "learning_rate": 4.9525756479692624e-06, + "loss": 2.448, + "step": 8410 + }, + { + "epoch": 0.45123390557939913, + "grad_norm": 0.671875, + "learning_rate": 4.95255880507688e-06, + "loss": 2.59, + "step": 8411 + }, + { + "epoch": 0.45128755364806866, + "grad_norm": 0.462890625, + "learning_rate": 4.952541959222777e-06, + "loss": 2.3858, + "step": 8412 + }, + { + "epoch": 0.4513412017167382, + "grad_norm": 0.72265625, + "learning_rate": 4.952525110406977e-06, + "loss": 2.2176, + "step": 8413 + }, + { + "epoch": 0.4513948497854077, + "grad_norm": 0.357421875, + "learning_rate": 4.952508258629497e-06, + "loss": 1.9599, + "step": 8414 + }, + { + "epoch": 0.45144849785407726, + "grad_norm": 0.404296875, + "learning_rate": 4.95249140389036e-06, + "loss": 2.3862, + "step": 8415 + }, + { + "epoch": 0.4515021459227468, + "grad_norm": 0.46875, + "learning_rate": 4.952474546189584e-06, + "loss": 2.309, + "step": 8416 + }, + { + "epoch": 0.4515557939914163, + "grad_norm": 0.373046875, + "learning_rate": 4.952457685527191e-06, + "loss": 2.3404, + "step": 8417 + }, + { + "epoch": 0.45160944206008585, + "grad_norm": 0.4140625, + "learning_rate": 4.952440821903201e-06, + "loss": 2.1644, + "step": 8418 + }, + { + "epoch": 0.4516630901287554, + "grad_norm": 0.37890625, + "learning_rate": 4.952423955317635e-06, + "loss": 2.0841, + "step": 8419 + }, + { + "epoch": 0.4517167381974249, + "grad_norm": 0.40234375, + "learning_rate": 4.952407085770512e-06, + "loss": 2.2432, + "step": 8420 + }, + { + "epoch": 0.45177038626609445, + "grad_norm": 0.40234375, + "learning_rate": 4.952390213261853e-06, + "loss": 2.3771, + "step": 8421 + }, + { + "epoch": 0.4518240343347639, + "grad_norm": 0.384765625, + "learning_rate": 4.952373337791678e-06, + "loss": 1.8755, + "step": 8422 + }, + { + "epoch": 0.45187768240343346, + "grad_norm": 1.3046875, + "learning_rate": 4.952356459360008e-06, + "loss": 2.3771, + "step": 8423 + }, + { + "epoch": 0.451931330472103, + "grad_norm": 0.5546875, + "learning_rate": 4.9523395779668634e-06, + "loss": 2.3111, + "step": 8424 + }, + { + "epoch": 0.4519849785407725, + "grad_norm": 0.494140625, + "learning_rate": 4.952322693612264e-06, + "loss": 2.3477, + "step": 8425 + }, + { + "epoch": 0.45203862660944205, + "grad_norm": 0.51953125, + "learning_rate": 4.952305806296231e-06, + "loss": 2.1239, + "step": 8426 + }, + { + "epoch": 0.4520922746781116, + "grad_norm": 0.5, + "learning_rate": 4.952288916018784e-06, + "loss": 2.323, + "step": 8427 + }, + { + "epoch": 0.4521459227467811, + "grad_norm": 0.3671875, + "learning_rate": 4.952272022779944e-06, + "loss": 2.3626, + "step": 8428 + }, + { + "epoch": 0.45219957081545065, + "grad_norm": 0.396484375, + "learning_rate": 4.952255126579731e-06, + "loss": 2.3054, + "step": 8429 + }, + { + "epoch": 0.4522532188841202, + "grad_norm": 0.427734375, + "learning_rate": 4.9522382274181646e-06, + "loss": 2.4169, + "step": 8430 + }, + { + "epoch": 0.4523068669527897, + "grad_norm": 0.34375, + "learning_rate": 4.952221325295267e-06, + "loss": 2.3271, + "step": 8431 + }, + { + "epoch": 0.45236051502145924, + "grad_norm": 0.443359375, + "learning_rate": 4.952204420211058e-06, + "loss": 2.3599, + "step": 8432 + }, + { + "epoch": 0.4524141630901288, + "grad_norm": 0.3828125, + "learning_rate": 4.952187512165557e-06, + "loss": 2.2061, + "step": 8433 + }, + { + "epoch": 0.4524678111587983, + "grad_norm": 0.51171875, + "learning_rate": 4.952170601158786e-06, + "loss": 2.2301, + "step": 8434 + }, + { + "epoch": 0.45252145922746784, + "grad_norm": 0.484375, + "learning_rate": 4.952153687190764e-06, + "loss": 1.5833, + "step": 8435 + }, + { + "epoch": 0.4525751072961373, + "grad_norm": 0.369140625, + "learning_rate": 4.952136770261512e-06, + "loss": 2.178, + "step": 8436 + }, + { + "epoch": 0.45262875536480685, + "grad_norm": 0.48046875, + "learning_rate": 4.952119850371051e-06, + "loss": 2.3772, + "step": 8437 + }, + { + "epoch": 0.4526824034334764, + "grad_norm": 0.69921875, + "learning_rate": 4.9521029275194e-06, + "loss": 1.5248, + "step": 8438 + }, + { + "epoch": 0.4527360515021459, + "grad_norm": 0.427734375, + "learning_rate": 4.952086001706581e-06, + "loss": 2.0421, + "step": 8439 + }, + { + "epoch": 0.45278969957081544, + "grad_norm": 0.39453125, + "learning_rate": 4.952069072932614e-06, + "loss": 2.2125, + "step": 8440 + }, + { + "epoch": 0.45284334763948497, + "grad_norm": 0.39453125, + "learning_rate": 4.952052141197518e-06, + "loss": 2.2696, + "step": 8441 + }, + { + "epoch": 0.4528969957081545, + "grad_norm": 0.373046875, + "learning_rate": 4.952035206501316e-06, + "loss": 2.1428, + "step": 8442 + }, + { + "epoch": 0.45295064377682404, + "grad_norm": 0.423828125, + "learning_rate": 4.952018268844027e-06, + "loss": 2.2499, + "step": 8443 + }, + { + "epoch": 0.45300429184549357, + "grad_norm": 0.431640625, + "learning_rate": 4.952001328225671e-06, + "loss": 2.3585, + "step": 8444 + }, + { + "epoch": 0.4530579399141631, + "grad_norm": 0.388671875, + "learning_rate": 4.951984384646269e-06, + "loss": 2.3151, + "step": 8445 + }, + { + "epoch": 0.45311158798283263, + "grad_norm": 0.470703125, + "learning_rate": 4.951967438105841e-06, + "loss": 2.1786, + "step": 8446 + }, + { + "epoch": 0.45316523605150216, + "grad_norm": 0.431640625, + "learning_rate": 4.951950488604409e-06, + "loss": 1.9541, + "step": 8447 + }, + { + "epoch": 0.4532188841201717, + "grad_norm": 0.453125, + "learning_rate": 4.951933536141992e-06, + "loss": 2.199, + "step": 8448 + }, + { + "epoch": 0.4532725321888412, + "grad_norm": 0.421875, + "learning_rate": 4.9519165807186104e-06, + "loss": 2.3362, + "step": 8449 + }, + { + "epoch": 0.4533261802575107, + "grad_norm": 0.486328125, + "learning_rate": 4.9518996223342854e-06, + "loss": 2.1963, + "step": 8450 + }, + { + "epoch": 0.45337982832618023, + "grad_norm": 0.416015625, + "learning_rate": 4.951882660989037e-06, + "loss": 2.4643, + "step": 8451 + }, + { + "epoch": 0.45343347639484977, + "grad_norm": 0.47265625, + "learning_rate": 4.9518656966828865e-06, + "loss": 2.2479, + "step": 8452 + }, + { + "epoch": 0.4534871244635193, + "grad_norm": 0.50390625, + "learning_rate": 4.951848729415853e-06, + "loss": 2.5617, + "step": 8453 + }, + { + "epoch": 0.45354077253218883, + "grad_norm": 0.44921875, + "learning_rate": 4.951831759187959e-06, + "loss": 2.2862, + "step": 8454 + }, + { + "epoch": 0.45359442060085836, + "grad_norm": 0.6015625, + "learning_rate": 4.951814785999223e-06, + "loss": 2.1754, + "step": 8455 + }, + { + "epoch": 0.4536480686695279, + "grad_norm": 0.6484375, + "learning_rate": 4.951797809849667e-06, + "loss": 1.489, + "step": 8456 + }, + { + "epoch": 0.4537017167381974, + "grad_norm": 0.44140625, + "learning_rate": 4.95178083073931e-06, + "loss": 2.3414, + "step": 8457 + }, + { + "epoch": 0.45375536480686696, + "grad_norm": 0.365234375, + "learning_rate": 4.9517638486681725e-06, + "loss": 2.1119, + "step": 8458 + }, + { + "epoch": 0.4538090128755365, + "grad_norm": 0.392578125, + "learning_rate": 4.951746863636277e-06, + "loss": 2.3133, + "step": 8459 + }, + { + "epoch": 0.453862660944206, + "grad_norm": 0.349609375, + "learning_rate": 4.951729875643643e-06, + "loss": 2.2496, + "step": 8460 + }, + { + "epoch": 0.45391630901287555, + "grad_norm": 0.4375, + "learning_rate": 4.9517128846902905e-06, + "loss": 2.3218, + "step": 8461 + }, + { + "epoch": 0.4539699570815451, + "grad_norm": 0.50390625, + "learning_rate": 4.951695890776239e-06, + "loss": 2.5545, + "step": 8462 + }, + { + "epoch": 0.4540236051502146, + "grad_norm": 0.41015625, + "learning_rate": 4.951678893901512e-06, + "loss": 2.2577, + "step": 8463 + }, + { + "epoch": 0.45407725321888415, + "grad_norm": 0.5859375, + "learning_rate": 4.951661894066128e-06, + "loss": 1.6804, + "step": 8464 + }, + { + "epoch": 0.4541309012875536, + "grad_norm": 0.75390625, + "learning_rate": 4.9516448912701074e-06, + "loss": 1.9676, + "step": 8465 + }, + { + "epoch": 0.45418454935622316, + "grad_norm": 2.484375, + "learning_rate": 4.951627885513473e-06, + "loss": 1.6453, + "step": 8466 + }, + { + "epoch": 0.4542381974248927, + "grad_norm": 0.5625, + "learning_rate": 4.951610876796241e-06, + "loss": 2.4588, + "step": 8467 + }, + { + "epoch": 0.4542918454935622, + "grad_norm": 0.515625, + "learning_rate": 4.951593865118436e-06, + "loss": 2.3578, + "step": 8468 + }, + { + "epoch": 0.45434549356223175, + "grad_norm": 0.392578125, + "learning_rate": 4.951576850480077e-06, + "loss": 2.4135, + "step": 8469 + }, + { + "epoch": 0.4543991416309013, + "grad_norm": 0.41015625, + "learning_rate": 4.951559832881185e-06, + "loss": 2.2758, + "step": 8470 + }, + { + "epoch": 0.4544527896995708, + "grad_norm": 0.419921875, + "learning_rate": 4.951542812321779e-06, + "loss": 2.4216, + "step": 8471 + }, + { + "epoch": 0.45450643776824035, + "grad_norm": 0.37109375, + "learning_rate": 4.951525788801882e-06, + "loss": 2.3303, + "step": 8472 + }, + { + "epoch": 0.4545600858369099, + "grad_norm": 0.390625, + "learning_rate": 4.951508762321512e-06, + "loss": 2.3093, + "step": 8473 + }, + { + "epoch": 0.4546137339055794, + "grad_norm": 0.361328125, + "learning_rate": 4.951491732880691e-06, + "loss": 2.0382, + "step": 8474 + }, + { + "epoch": 0.45466738197424894, + "grad_norm": 0.453125, + "learning_rate": 4.95147470047944e-06, + "loss": 2.0122, + "step": 8475 + }, + { + "epoch": 0.4547210300429185, + "grad_norm": 0.39453125, + "learning_rate": 4.951457665117779e-06, + "loss": 2.3454, + "step": 8476 + }, + { + "epoch": 0.454774678111588, + "grad_norm": 0.4609375, + "learning_rate": 4.9514406267957285e-06, + "loss": 2.649, + "step": 8477 + }, + { + "epoch": 0.45482832618025754, + "grad_norm": 0.400390625, + "learning_rate": 4.951423585513309e-06, + "loss": 2.5131, + "step": 8478 + }, + { + "epoch": 0.454881974248927, + "grad_norm": 0.43359375, + "learning_rate": 4.95140654127054e-06, + "loss": 2.4807, + "step": 8479 + }, + { + "epoch": 0.45493562231759654, + "grad_norm": 0.412109375, + "learning_rate": 4.951389494067444e-06, + "loss": 2.258, + "step": 8480 + }, + { + "epoch": 0.4549892703862661, + "grad_norm": 0.40625, + "learning_rate": 4.951372443904042e-06, + "loss": 2.3084, + "step": 8481 + }, + { + "epoch": 0.4550429184549356, + "grad_norm": 0.357421875, + "learning_rate": 4.951355390780353e-06, + "loss": 2.1216, + "step": 8482 + }, + { + "epoch": 0.45509656652360514, + "grad_norm": 0.41796875, + "learning_rate": 4.951338334696397e-06, + "loss": 2.1563, + "step": 8483 + }, + { + "epoch": 0.45515021459227467, + "grad_norm": 0.390625, + "learning_rate": 4.9513212756521964e-06, + "loss": 2.3142, + "step": 8484 + }, + { + "epoch": 0.4552038626609442, + "grad_norm": 0.3671875, + "learning_rate": 4.951304213647772e-06, + "loss": 2.0205, + "step": 8485 + }, + { + "epoch": 0.45525751072961373, + "grad_norm": 0.380859375, + "learning_rate": 4.951287148683141e-06, + "loss": 2.2307, + "step": 8486 + }, + { + "epoch": 0.45531115879828327, + "grad_norm": 0.35546875, + "learning_rate": 4.951270080758329e-06, + "loss": 2.1282, + "step": 8487 + }, + { + "epoch": 0.4553648068669528, + "grad_norm": 0.453125, + "learning_rate": 4.951253009873352e-06, + "loss": 2.1939, + "step": 8488 + }, + { + "epoch": 0.45541845493562233, + "grad_norm": 0.462890625, + "learning_rate": 4.951235936028235e-06, + "loss": 2.3504, + "step": 8489 + }, + { + "epoch": 0.45547210300429186, + "grad_norm": 0.5, + "learning_rate": 4.951218859222994e-06, + "loss": 2.4905, + "step": 8490 + }, + { + "epoch": 0.4555257510729614, + "grad_norm": 0.447265625, + "learning_rate": 4.951201779457653e-06, + "loss": 2.2548, + "step": 8491 + }, + { + "epoch": 0.4555793991416309, + "grad_norm": 0.462890625, + "learning_rate": 4.951184696732232e-06, + "loss": 2.2415, + "step": 8492 + }, + { + "epoch": 0.4556330472103004, + "grad_norm": 0.40234375, + "learning_rate": 4.95116761104675e-06, + "loss": 2.3158, + "step": 8493 + }, + { + "epoch": 0.45568669527896993, + "grad_norm": 0.6484375, + "learning_rate": 4.951150522401229e-06, + "loss": 2.4759, + "step": 8494 + }, + { + "epoch": 0.45574034334763946, + "grad_norm": 0.423828125, + "learning_rate": 4.95113343079569e-06, + "loss": 2.3253, + "step": 8495 + }, + { + "epoch": 0.455793991416309, + "grad_norm": 0.349609375, + "learning_rate": 4.951116336230153e-06, + "loss": 2.3316, + "step": 8496 + }, + { + "epoch": 0.45584763948497853, + "grad_norm": 0.435546875, + "learning_rate": 4.951099238704639e-06, + "loss": 2.0674, + "step": 8497 + }, + { + "epoch": 0.45590128755364806, + "grad_norm": 0.400390625, + "learning_rate": 4.951082138219167e-06, + "loss": 1.8181, + "step": 8498 + }, + { + "epoch": 0.4559549356223176, + "grad_norm": 0.427734375, + "learning_rate": 4.951065034773761e-06, + "loss": 2.1142, + "step": 8499 + }, + { + "epoch": 0.4560085836909871, + "grad_norm": 0.453125, + "learning_rate": 4.951047928368439e-06, + "loss": 2.2311, + "step": 8500 + }, + { + "epoch": 0.45606223175965666, + "grad_norm": 0.39453125, + "learning_rate": 4.951030819003222e-06, + "loss": 2.4563, + "step": 8501 + }, + { + "epoch": 0.4561158798283262, + "grad_norm": 0.361328125, + "learning_rate": 4.95101370667813e-06, + "loss": 2.0559, + "step": 8502 + }, + { + "epoch": 0.4561695278969957, + "grad_norm": 0.40625, + "learning_rate": 4.9509965913931865e-06, + "loss": 1.9675, + "step": 8503 + }, + { + "epoch": 0.45622317596566525, + "grad_norm": 0.46875, + "learning_rate": 4.95097947314841e-06, + "loss": 2.1449, + "step": 8504 + }, + { + "epoch": 0.4562768240343348, + "grad_norm": 0.458984375, + "learning_rate": 4.950962351943821e-06, + "loss": 2.4614, + "step": 8505 + }, + { + "epoch": 0.4563304721030043, + "grad_norm": 0.458984375, + "learning_rate": 4.95094522777944e-06, + "loss": 2.4725, + "step": 8506 + }, + { + "epoch": 0.45638412017167385, + "grad_norm": 0.3828125, + "learning_rate": 4.950928100655289e-06, + "loss": 2.4476, + "step": 8507 + }, + { + "epoch": 0.4564377682403433, + "grad_norm": 0.46484375, + "learning_rate": 4.950910970571389e-06, + "loss": 2.3364, + "step": 8508 + }, + { + "epoch": 0.45649141630901285, + "grad_norm": 0.4140625, + "learning_rate": 4.950893837527758e-06, + "loss": 2.4826, + "step": 8509 + }, + { + "epoch": 0.4565450643776824, + "grad_norm": 0.33984375, + "learning_rate": 4.95087670152442e-06, + "loss": 1.8897, + "step": 8510 + }, + { + "epoch": 0.4565987124463519, + "grad_norm": 0.419921875, + "learning_rate": 4.950859562561393e-06, + "loss": 2.4421, + "step": 8511 + }, + { + "epoch": 0.45665236051502145, + "grad_norm": 0.359375, + "learning_rate": 4.950842420638699e-06, + "loss": 2.134, + "step": 8512 + }, + { + "epoch": 0.456706008583691, + "grad_norm": 0.396484375, + "learning_rate": 4.950825275756359e-06, + "loss": 2.1709, + "step": 8513 + }, + { + "epoch": 0.4567596566523605, + "grad_norm": 0.40625, + "learning_rate": 4.950808127914393e-06, + "loss": 2.1851, + "step": 8514 + }, + { + "epoch": 0.45681330472103004, + "grad_norm": 0.453125, + "learning_rate": 4.950790977112821e-06, + "loss": 2.1584, + "step": 8515 + }, + { + "epoch": 0.4568669527896996, + "grad_norm": 0.46484375, + "learning_rate": 4.950773823351666e-06, + "loss": 2.1122, + "step": 8516 + }, + { + "epoch": 0.4569206008583691, + "grad_norm": 0.412109375, + "learning_rate": 4.950756666630947e-06, + "loss": 2.126, + "step": 8517 + }, + { + "epoch": 0.45697424892703864, + "grad_norm": 0.453125, + "learning_rate": 4.950739506950685e-06, + "loss": 2.3116, + "step": 8518 + }, + { + "epoch": 0.45702789699570817, + "grad_norm": 0.466796875, + "learning_rate": 4.9507223443109005e-06, + "loss": 2.265, + "step": 8519 + }, + { + "epoch": 0.4570815450643777, + "grad_norm": 0.3828125, + "learning_rate": 4.950705178711614e-06, + "loss": 2.2986, + "step": 8520 + }, + { + "epoch": 0.45713519313304724, + "grad_norm": 0.41015625, + "learning_rate": 4.950688010152847e-06, + "loss": 2.4411, + "step": 8521 + }, + { + "epoch": 0.4571888412017167, + "grad_norm": 0.466796875, + "learning_rate": 4.950670838634621e-06, + "loss": 2.2193, + "step": 8522 + }, + { + "epoch": 0.45724248927038624, + "grad_norm": 0.49609375, + "learning_rate": 4.9506536641569556e-06, + "loss": 2.5148, + "step": 8523 + }, + { + "epoch": 0.4572961373390558, + "grad_norm": 0.3671875, + "learning_rate": 4.9506364867198705e-06, + "loss": 2.271, + "step": 8524 + }, + { + "epoch": 0.4573497854077253, + "grad_norm": 0.640625, + "learning_rate": 4.950619306323389e-06, + "loss": 2.2295, + "step": 8525 + }, + { + "epoch": 0.45740343347639484, + "grad_norm": 0.4296875, + "learning_rate": 4.95060212296753e-06, + "loss": 2.1162, + "step": 8526 + }, + { + "epoch": 0.45745708154506437, + "grad_norm": 0.46484375, + "learning_rate": 4.950584936652314e-06, + "loss": 2.2781, + "step": 8527 + }, + { + "epoch": 0.4575107296137339, + "grad_norm": 0.455078125, + "learning_rate": 4.950567747377763e-06, + "loss": 2.0694, + "step": 8528 + }, + { + "epoch": 0.45756437768240343, + "grad_norm": 0.408203125, + "learning_rate": 4.9505505551438965e-06, + "loss": 2.5125, + "step": 8529 + }, + { + "epoch": 0.45761802575107297, + "grad_norm": 0.41796875, + "learning_rate": 4.950533359950737e-06, + "loss": 2.6147, + "step": 8530 + }, + { + "epoch": 0.4576716738197425, + "grad_norm": 0.90234375, + "learning_rate": 4.950516161798303e-06, + "loss": 2.3663, + "step": 8531 + }, + { + "epoch": 0.45772532188841203, + "grad_norm": 0.46875, + "learning_rate": 4.9504989606866175e-06, + "loss": 1.9967, + "step": 8532 + }, + { + "epoch": 0.45777896995708156, + "grad_norm": 0.60546875, + "learning_rate": 4.9504817566156995e-06, + "loss": 2.5631, + "step": 8533 + }, + { + "epoch": 0.4578326180257511, + "grad_norm": 0.318359375, + "learning_rate": 4.950464549585571e-06, + "loss": 1.9578, + "step": 8534 + }, + { + "epoch": 0.4578862660944206, + "grad_norm": 0.427734375, + "learning_rate": 4.950447339596253e-06, + "loss": 2.3303, + "step": 8535 + }, + { + "epoch": 0.45793991416309016, + "grad_norm": 0.640625, + "learning_rate": 4.950430126647765e-06, + "loss": 2.2034, + "step": 8536 + }, + { + "epoch": 0.45799356223175963, + "grad_norm": 0.51953125, + "learning_rate": 4.950412910740129e-06, + "loss": 2.3788, + "step": 8537 + }, + { + "epoch": 0.45804721030042916, + "grad_norm": 0.78125, + "learning_rate": 4.950395691873364e-06, + "loss": 2.3568, + "step": 8538 + }, + { + "epoch": 0.4581008583690987, + "grad_norm": 0.4765625, + "learning_rate": 4.950378470047492e-06, + "loss": 1.9677, + "step": 8539 + }, + { + "epoch": 0.4581545064377682, + "grad_norm": 0.42578125, + "learning_rate": 4.950361245262535e-06, + "loss": 2.3207, + "step": 8540 + }, + { + "epoch": 0.45820815450643776, + "grad_norm": 0.63671875, + "learning_rate": 4.950344017518511e-06, + "loss": 2.0359, + "step": 8541 + }, + { + "epoch": 0.4582618025751073, + "grad_norm": 0.353515625, + "learning_rate": 4.950326786815444e-06, + "loss": 2.0731, + "step": 8542 + }, + { + "epoch": 0.4583154506437768, + "grad_norm": 0.3671875, + "learning_rate": 4.950309553153352e-06, + "loss": 2.2823, + "step": 8543 + }, + { + "epoch": 0.45836909871244635, + "grad_norm": 0.32421875, + "learning_rate": 4.950292316532258e-06, + "loss": 2.1503, + "step": 8544 + }, + { + "epoch": 0.4584227467811159, + "grad_norm": 0.392578125, + "learning_rate": 4.9502750769521816e-06, + "loss": 2.1374, + "step": 8545 + }, + { + "epoch": 0.4584763948497854, + "grad_norm": 0.408203125, + "learning_rate": 4.950257834413143e-06, + "loss": 2.0237, + "step": 8546 + }, + { + "epoch": 0.45853004291845495, + "grad_norm": 0.4921875, + "learning_rate": 4.950240588915164e-06, + "loss": 2.177, + "step": 8547 + }, + { + "epoch": 0.4585836909871245, + "grad_norm": 0.5703125, + "learning_rate": 4.950223340458265e-06, + "loss": 2.5758, + "step": 8548 + }, + { + "epoch": 0.458637339055794, + "grad_norm": 0.65234375, + "learning_rate": 4.9502060890424686e-06, + "loss": 2.5664, + "step": 8549 + }, + { + "epoch": 0.45869098712446355, + "grad_norm": 0.46875, + "learning_rate": 4.9501888346677936e-06, + "loss": 2.3732, + "step": 8550 + }, + { + "epoch": 0.458744635193133, + "grad_norm": 0.384765625, + "learning_rate": 4.9501715773342606e-06, + "loss": 2.226, + "step": 8551 + }, + { + "epoch": 0.45879828326180255, + "grad_norm": 0.4921875, + "learning_rate": 4.950154317041892e-06, + "loss": 2.4912, + "step": 8552 + }, + { + "epoch": 0.4588519313304721, + "grad_norm": 0.416015625, + "learning_rate": 4.950137053790708e-06, + "loss": 2.3971, + "step": 8553 + }, + { + "epoch": 0.4589055793991416, + "grad_norm": 0.5, + "learning_rate": 4.9501197875807285e-06, + "loss": 2.4467, + "step": 8554 + }, + { + "epoch": 0.45895922746781115, + "grad_norm": 0.388671875, + "learning_rate": 4.950102518411976e-06, + "loss": 2.3984, + "step": 8555 + }, + { + "epoch": 0.4590128755364807, + "grad_norm": 1.8046875, + "learning_rate": 4.9500852462844696e-06, + "loss": 2.402, + "step": 8556 + }, + { + "epoch": 0.4590665236051502, + "grad_norm": 1.171875, + "learning_rate": 4.950067971198231e-06, + "loss": 2.0658, + "step": 8557 + }, + { + "epoch": 0.45912017167381974, + "grad_norm": 0.53125, + "learning_rate": 4.9500506931532825e-06, + "loss": 2.212, + "step": 8558 + }, + { + "epoch": 0.4591738197424893, + "grad_norm": 0.37109375, + "learning_rate": 4.9500334121496425e-06, + "loss": 2.3466, + "step": 8559 + }, + { + "epoch": 0.4592274678111588, + "grad_norm": 0.447265625, + "learning_rate": 4.950016128187333e-06, + "loss": 2.6097, + "step": 8560 + }, + { + "epoch": 0.45928111587982834, + "grad_norm": 0.59375, + "learning_rate": 4.949998841266375e-06, + "loss": 1.8574, + "step": 8561 + }, + { + "epoch": 0.45933476394849787, + "grad_norm": 0.37890625, + "learning_rate": 4.94998155138679e-06, + "loss": 2.0553, + "step": 8562 + }, + { + "epoch": 0.4593884120171674, + "grad_norm": 0.58203125, + "learning_rate": 4.949964258548597e-06, + "loss": 2.0493, + "step": 8563 + }, + { + "epoch": 0.45944206008583693, + "grad_norm": 0.390625, + "learning_rate": 4.9499469627518185e-06, + "loss": 1.8952, + "step": 8564 + }, + { + "epoch": 0.4594957081545064, + "grad_norm": 0.36328125, + "learning_rate": 4.9499296639964745e-06, + "loss": 2.2676, + "step": 8565 + }, + { + "epoch": 0.45954935622317594, + "grad_norm": 0.43359375, + "learning_rate": 4.949912362282586e-06, + "loss": 2.3267, + "step": 8566 + }, + { + "epoch": 0.4596030042918455, + "grad_norm": 0.408203125, + "learning_rate": 4.9498950576101754e-06, + "loss": 1.4367, + "step": 8567 + }, + { + "epoch": 0.459656652360515, + "grad_norm": 0.44140625, + "learning_rate": 4.949877749979262e-06, + "loss": 2.2968, + "step": 8568 + }, + { + "epoch": 0.45971030042918454, + "grad_norm": 0.3984375, + "learning_rate": 4.949860439389866e-06, + "loss": 2.3463, + "step": 8569 + }, + { + "epoch": 0.45976394849785407, + "grad_norm": 0.54296875, + "learning_rate": 4.94984312584201e-06, + "loss": 2.2992, + "step": 8570 + }, + { + "epoch": 0.4598175965665236, + "grad_norm": 0.427734375, + "learning_rate": 4.949825809335714e-06, + "loss": 2.1866, + "step": 8571 + }, + { + "epoch": 0.45987124463519313, + "grad_norm": 0.392578125, + "learning_rate": 4.949808489871e-06, + "loss": 2.3001, + "step": 8572 + }, + { + "epoch": 0.45992489270386266, + "grad_norm": 0.427734375, + "learning_rate": 4.949791167447887e-06, + "loss": 2.2255, + "step": 8573 + }, + { + "epoch": 0.4599785407725322, + "grad_norm": 0.41015625, + "learning_rate": 4.9497738420663974e-06, + "loss": 2.3224, + "step": 8574 + }, + { + "epoch": 0.46003218884120173, + "grad_norm": 0.4453125, + "learning_rate": 4.949756513726552e-06, + "loss": 2.3078, + "step": 8575 + }, + { + "epoch": 0.46008583690987126, + "grad_norm": 0.4140625, + "learning_rate": 4.949739182428371e-06, + "loss": 2.2121, + "step": 8576 + }, + { + "epoch": 0.4601394849785408, + "grad_norm": 0.388671875, + "learning_rate": 4.949721848171876e-06, + "loss": 2.1394, + "step": 8577 + }, + { + "epoch": 0.4601931330472103, + "grad_norm": 0.40234375, + "learning_rate": 4.949704510957089e-06, + "loss": 2.3503, + "step": 8578 + }, + { + "epoch": 0.46024678111587985, + "grad_norm": 0.4296875, + "learning_rate": 4.949687170784027e-06, + "loss": 1.4092, + "step": 8579 + }, + { + "epoch": 0.46030042918454933, + "grad_norm": 0.40625, + "learning_rate": 4.949669827652716e-06, + "loss": 2.2703, + "step": 8580 + }, + { + "epoch": 0.46035407725321886, + "grad_norm": 0.4296875, + "learning_rate": 4.949652481563174e-06, + "loss": 2.3553, + "step": 8581 + }, + { + "epoch": 0.4604077253218884, + "grad_norm": 0.375, + "learning_rate": 4.949635132515422e-06, + "loss": 2.182, + "step": 8582 + }, + { + "epoch": 0.4604613733905579, + "grad_norm": 0.44140625, + "learning_rate": 4.9496177805094816e-06, + "loss": 2.4169, + "step": 8583 + }, + { + "epoch": 0.46051502145922746, + "grad_norm": 0.51171875, + "learning_rate": 4.949600425545373e-06, + "loss": 2.6049, + "step": 8584 + }, + { + "epoch": 0.460568669527897, + "grad_norm": 0.431640625, + "learning_rate": 4.949583067623119e-06, + "loss": 2.2837, + "step": 8585 + }, + { + "epoch": 0.4606223175965665, + "grad_norm": 0.42578125, + "learning_rate": 4.9495657067427385e-06, + "loss": 2.4123, + "step": 8586 + }, + { + "epoch": 0.46067596566523605, + "grad_norm": 0.44140625, + "learning_rate": 4.949548342904253e-06, + "loss": 2.4453, + "step": 8587 + }, + { + "epoch": 0.4607296137339056, + "grad_norm": 0.3984375, + "learning_rate": 4.949530976107684e-06, + "loss": 2.4262, + "step": 8588 + }, + { + "epoch": 0.4607832618025751, + "grad_norm": 0.474609375, + "learning_rate": 4.949513606353052e-06, + "loss": 2.1425, + "step": 8589 + }, + { + "epoch": 0.46083690987124465, + "grad_norm": 0.4140625, + "learning_rate": 4.949496233640378e-06, + "loss": 2.418, + "step": 8590 + }, + { + "epoch": 0.4608905579399142, + "grad_norm": 0.478515625, + "learning_rate": 4.949478857969684e-06, + "loss": 1.448, + "step": 8591 + }, + { + "epoch": 0.4609442060085837, + "grad_norm": 0.890625, + "learning_rate": 4.94946147934099e-06, + "loss": 2.2145, + "step": 8592 + }, + { + "epoch": 0.46099785407725324, + "grad_norm": 0.39453125, + "learning_rate": 4.9494440977543165e-06, + "loss": 2.1278, + "step": 8593 + }, + { + "epoch": 0.4610515021459227, + "grad_norm": 0.396484375, + "learning_rate": 4.949426713209687e-06, + "loss": 2.3951, + "step": 8594 + }, + { + "epoch": 0.46110515021459225, + "grad_norm": 0.390625, + "learning_rate": 4.949409325707119e-06, + "loss": 2.3548, + "step": 8595 + }, + { + "epoch": 0.4611587982832618, + "grad_norm": 0.490234375, + "learning_rate": 4.9493919352466345e-06, + "loss": 2.2621, + "step": 8596 + }, + { + "epoch": 0.4612124463519313, + "grad_norm": 0.4140625, + "learning_rate": 4.949374541828255e-06, + "loss": 2.2377, + "step": 8597 + }, + { + "epoch": 0.46126609442060085, + "grad_norm": 0.474609375, + "learning_rate": 4.949357145452003e-06, + "loss": 2.2283, + "step": 8598 + }, + { + "epoch": 0.4613197424892704, + "grad_norm": 0.388671875, + "learning_rate": 4.949339746117898e-06, + "loss": 2.3671, + "step": 8599 + }, + { + "epoch": 0.4613733905579399, + "grad_norm": 0.55078125, + "learning_rate": 4.949322343825961e-06, + "loss": 2.2676, + "step": 8600 + }, + { + "epoch": 0.46142703862660944, + "grad_norm": 0.451171875, + "learning_rate": 4.949304938576213e-06, + "loss": 2.2437, + "step": 8601 + }, + { + "epoch": 0.461480686695279, + "grad_norm": 0.39453125, + "learning_rate": 4.949287530368675e-06, + "loss": 2.267, + "step": 8602 + }, + { + "epoch": 0.4615343347639485, + "grad_norm": 0.33984375, + "learning_rate": 4.949270119203368e-06, + "loss": 2.287, + "step": 8603 + }, + { + "epoch": 0.46158798283261804, + "grad_norm": 0.427734375, + "learning_rate": 4.949252705080314e-06, + "loss": 2.313, + "step": 8604 + }, + { + "epoch": 0.46164163090128757, + "grad_norm": 0.443359375, + "learning_rate": 4.9492352879995325e-06, + "loss": 2.1312, + "step": 8605 + }, + { + "epoch": 0.4616952789699571, + "grad_norm": 0.40234375, + "learning_rate": 4.949217867961046e-06, + "loss": 2.317, + "step": 8606 + }, + { + "epoch": 0.46174892703862663, + "grad_norm": 0.373046875, + "learning_rate": 4.949200444964874e-06, + "loss": 2.2715, + "step": 8607 + }, + { + "epoch": 0.46180257510729616, + "grad_norm": 0.408203125, + "learning_rate": 4.949183019011039e-06, + "loss": 2.2791, + "step": 8608 + }, + { + "epoch": 0.46185622317596564, + "grad_norm": 0.458984375, + "learning_rate": 4.949165590099561e-06, + "loss": 2.5321, + "step": 8609 + }, + { + "epoch": 0.4619098712446352, + "grad_norm": 0.45703125, + "learning_rate": 4.949148158230462e-06, + "loss": 2.1282, + "step": 8610 + }, + { + "epoch": 0.4619635193133047, + "grad_norm": 0.353515625, + "learning_rate": 4.949130723403762e-06, + "loss": 2.0953, + "step": 8611 + }, + { + "epoch": 0.46201716738197424, + "grad_norm": 0.412109375, + "learning_rate": 4.949113285619483e-06, + "loss": 2.246, + "step": 8612 + }, + { + "epoch": 0.46207081545064377, + "grad_norm": 0.376953125, + "learning_rate": 4.949095844877645e-06, + "loss": 2.078, + "step": 8613 + }, + { + "epoch": 0.4621244635193133, + "grad_norm": 0.423828125, + "learning_rate": 4.94907840117827e-06, + "loss": 2.4476, + "step": 8614 + }, + { + "epoch": 0.46217811158798283, + "grad_norm": 0.455078125, + "learning_rate": 4.949060954521379e-06, + "loss": 2.3627, + "step": 8615 + }, + { + "epoch": 0.46223175965665236, + "grad_norm": 0.416015625, + "learning_rate": 4.9490435049069925e-06, + "loss": 2.2935, + "step": 8616 + }, + { + "epoch": 0.4622854077253219, + "grad_norm": 0.392578125, + "learning_rate": 4.9490260523351316e-06, + "loss": 2.4709, + "step": 8617 + }, + { + "epoch": 0.4623390557939914, + "grad_norm": 0.3671875, + "learning_rate": 4.949008596805819e-06, + "loss": 2.1085, + "step": 8618 + }, + { + "epoch": 0.46239270386266096, + "grad_norm": 0.46484375, + "learning_rate": 4.9489911383190724e-06, + "loss": 2.223, + "step": 8619 + }, + { + "epoch": 0.4624463519313305, + "grad_norm": 0.77734375, + "learning_rate": 4.948973676874917e-06, + "loss": 2.4588, + "step": 8620 + }, + { + "epoch": 0.4625, + "grad_norm": 0.5625, + "learning_rate": 4.948956212473371e-06, + "loss": 1.9511, + "step": 8621 + }, + { + "epoch": 0.46255364806866955, + "grad_norm": 0.984375, + "learning_rate": 4.948938745114456e-06, + "loss": 2.1872, + "step": 8622 + }, + { + "epoch": 0.46260729613733903, + "grad_norm": 0.4921875, + "learning_rate": 4.948921274798193e-06, + "loss": 1.1416, + "step": 8623 + }, + { + "epoch": 0.46266094420600856, + "grad_norm": 0.44140625, + "learning_rate": 4.948903801524604e-06, + "loss": 2.2898, + "step": 8624 + }, + { + "epoch": 0.4627145922746781, + "grad_norm": 0.5703125, + "learning_rate": 4.948886325293709e-06, + "loss": 2.5387, + "step": 8625 + }, + { + "epoch": 0.4627682403433476, + "grad_norm": 0.48046875, + "learning_rate": 4.948868846105531e-06, + "loss": 2.1891, + "step": 8626 + }, + { + "epoch": 0.46282188841201716, + "grad_norm": 0.38671875, + "learning_rate": 4.948851363960089e-06, + "loss": 2.2678, + "step": 8627 + }, + { + "epoch": 0.4628755364806867, + "grad_norm": 0.408203125, + "learning_rate": 4.948833878857405e-06, + "loss": 2.1404, + "step": 8628 + }, + { + "epoch": 0.4629291845493562, + "grad_norm": 0.5625, + "learning_rate": 4.9488163907975e-06, + "loss": 1.5915, + "step": 8629 + }, + { + "epoch": 0.46298283261802575, + "grad_norm": 0.365234375, + "learning_rate": 4.948798899780395e-06, + "loss": 2.2786, + "step": 8630 + }, + { + "epoch": 0.4630364806866953, + "grad_norm": 0.4453125, + "learning_rate": 4.948781405806111e-06, + "loss": 2.444, + "step": 8631 + }, + { + "epoch": 0.4630901287553648, + "grad_norm": 0.3828125, + "learning_rate": 4.948763908874671e-06, + "loss": 2.5621, + "step": 8632 + }, + { + "epoch": 0.46314377682403435, + "grad_norm": 0.69140625, + "learning_rate": 4.948746408986092e-06, + "loss": 2.2957, + "step": 8633 + }, + { + "epoch": 0.4631974248927039, + "grad_norm": 0.423828125, + "learning_rate": 4.9487289061404e-06, + "loss": 1.7891, + "step": 8634 + }, + { + "epoch": 0.4632510729613734, + "grad_norm": 0.59375, + "learning_rate": 4.9487114003376125e-06, + "loss": 2.1816, + "step": 8635 + }, + { + "epoch": 0.46330472103004294, + "grad_norm": 0.462890625, + "learning_rate": 4.948693891577752e-06, + "loss": 2.473, + "step": 8636 + }, + { + "epoch": 0.4633583690987124, + "grad_norm": 0.443359375, + "learning_rate": 4.948676379860841e-06, + "loss": 2.5707, + "step": 8637 + }, + { + "epoch": 0.46341201716738195, + "grad_norm": 0.404296875, + "learning_rate": 4.948658865186897e-06, + "loss": 2.4099, + "step": 8638 + }, + { + "epoch": 0.4634656652360515, + "grad_norm": 0.60546875, + "learning_rate": 4.948641347555945e-06, + "loss": 2.3977, + "step": 8639 + }, + { + "epoch": 0.463519313304721, + "grad_norm": 0.439453125, + "learning_rate": 4.948623826968004e-06, + "loss": 2.2429, + "step": 8640 + }, + { + "epoch": 0.46357296137339055, + "grad_norm": 0.56640625, + "learning_rate": 4.9486063034230955e-06, + "loss": 2.3169, + "step": 8641 + }, + { + "epoch": 0.4636266094420601, + "grad_norm": 0.427734375, + "learning_rate": 4.948588776921241e-06, + "loss": 2.155, + "step": 8642 + }, + { + "epoch": 0.4636802575107296, + "grad_norm": 0.412109375, + "learning_rate": 4.948571247462461e-06, + "loss": 2.2483, + "step": 8643 + }, + { + "epoch": 0.46373390557939914, + "grad_norm": 0.408203125, + "learning_rate": 4.9485537150467785e-06, + "loss": 2.308, + "step": 8644 + }, + { + "epoch": 0.4637875536480687, + "grad_norm": 0.404296875, + "learning_rate": 4.948536179674213e-06, + "loss": 2.095, + "step": 8645 + }, + { + "epoch": 0.4638412017167382, + "grad_norm": 0.359375, + "learning_rate": 4.948518641344785e-06, + "loss": 2.078, + "step": 8646 + }, + { + "epoch": 0.46389484978540774, + "grad_norm": 0.416015625, + "learning_rate": 4.948501100058517e-06, + "loss": 2.2442, + "step": 8647 + }, + { + "epoch": 0.46394849785407727, + "grad_norm": 0.4140625, + "learning_rate": 4.94848355581543e-06, + "loss": 2.3839, + "step": 8648 + }, + { + "epoch": 0.4640021459227468, + "grad_norm": 0.44140625, + "learning_rate": 4.948466008615545e-06, + "loss": 2.3316, + "step": 8649 + }, + { + "epoch": 0.46405579399141633, + "grad_norm": 0.416015625, + "learning_rate": 4.9484484584588845e-06, + "loss": 2.5899, + "step": 8650 + }, + { + "epoch": 0.46410944206008586, + "grad_norm": 0.412109375, + "learning_rate": 4.948430905345467e-06, + "loss": 2.4367, + "step": 8651 + }, + { + "epoch": 0.46416309012875534, + "grad_norm": 0.37109375, + "learning_rate": 4.948413349275316e-06, + "loss": 2.1618, + "step": 8652 + }, + { + "epoch": 0.46421673819742487, + "grad_norm": 0.419921875, + "learning_rate": 4.948395790248452e-06, + "loss": 2.3019, + "step": 8653 + }, + { + "epoch": 0.4642703862660944, + "grad_norm": 0.419921875, + "learning_rate": 4.948378228264895e-06, + "loss": 2.2997, + "step": 8654 + }, + { + "epoch": 0.46432403433476394, + "grad_norm": 0.4609375, + "learning_rate": 4.948360663324669e-06, + "loss": 2.4475, + "step": 8655 + }, + { + "epoch": 0.46437768240343347, + "grad_norm": 0.45703125, + "learning_rate": 4.948343095427791e-06, + "loss": 2.4009, + "step": 8656 + }, + { + "epoch": 0.464431330472103, + "grad_norm": 0.390625, + "learning_rate": 4.948325524574287e-06, + "loss": 2.306, + "step": 8657 + }, + { + "epoch": 0.46448497854077253, + "grad_norm": 0.451171875, + "learning_rate": 4.948307950764175e-06, + "loss": 2.4364, + "step": 8658 + }, + { + "epoch": 0.46453862660944206, + "grad_norm": 0.44140625, + "learning_rate": 4.948290373997477e-06, + "loss": 2.462, + "step": 8659 + }, + { + "epoch": 0.4645922746781116, + "grad_norm": 0.546875, + "learning_rate": 4.948272794274215e-06, + "loss": 2.2513, + "step": 8660 + }, + { + "epoch": 0.4646459227467811, + "grad_norm": 0.396484375, + "learning_rate": 4.948255211594408e-06, + "loss": 2.3605, + "step": 8661 + }, + { + "epoch": 0.46469957081545066, + "grad_norm": 0.4453125, + "learning_rate": 4.9482376259580806e-06, + "loss": 2.2251, + "step": 8662 + }, + { + "epoch": 0.4647532188841202, + "grad_norm": 0.515625, + "learning_rate": 4.948220037365252e-06, + "loss": 2.2427, + "step": 8663 + }, + { + "epoch": 0.4648068669527897, + "grad_norm": 0.4140625, + "learning_rate": 4.948202445815942e-06, + "loss": 2.3291, + "step": 8664 + }, + { + "epoch": 0.46486051502145925, + "grad_norm": 0.458984375, + "learning_rate": 4.948184851310176e-06, + "loss": 2.4302, + "step": 8665 + }, + { + "epoch": 0.46491416309012873, + "grad_norm": 3.15625, + "learning_rate": 4.948167253847972e-06, + "loss": 2.4069, + "step": 8666 + }, + { + "epoch": 0.46496781115879826, + "grad_norm": 0.46484375, + "learning_rate": 4.948149653429351e-06, + "loss": 2.1973, + "step": 8667 + }, + { + "epoch": 0.4650214592274678, + "grad_norm": 0.33984375, + "learning_rate": 4.9481320500543364e-06, + "loss": 2.253, + "step": 8668 + }, + { + "epoch": 0.4650751072961373, + "grad_norm": 0.359375, + "learning_rate": 4.948114443722948e-06, + "loss": 2.2617, + "step": 8669 + }, + { + "epoch": 0.46512875536480686, + "grad_norm": 0.408203125, + "learning_rate": 4.9480968344352075e-06, + "loss": 2.1816, + "step": 8670 + }, + { + "epoch": 0.4651824034334764, + "grad_norm": 0.357421875, + "learning_rate": 4.948079222191136e-06, + "loss": 2.1321, + "step": 8671 + }, + { + "epoch": 0.4652360515021459, + "grad_norm": 0.49609375, + "learning_rate": 4.948061606990755e-06, + "loss": 2.1525, + "step": 8672 + }, + { + "epoch": 0.46528969957081545, + "grad_norm": 0.58984375, + "learning_rate": 4.948043988834085e-06, + "loss": 2.0968, + "step": 8673 + }, + { + "epoch": 0.465343347639485, + "grad_norm": 0.44140625, + "learning_rate": 4.948026367721149e-06, + "loss": 1.4518, + "step": 8674 + }, + { + "epoch": 0.4653969957081545, + "grad_norm": 0.4296875, + "learning_rate": 4.948008743651966e-06, + "loss": 2.214, + "step": 8675 + }, + { + "epoch": 0.46545064377682405, + "grad_norm": 0.4453125, + "learning_rate": 4.947991116626559e-06, + "loss": 2.452, + "step": 8676 + }, + { + "epoch": 0.4655042918454936, + "grad_norm": 0.423828125, + "learning_rate": 4.947973486644949e-06, + "loss": 2.1661, + "step": 8677 + }, + { + "epoch": 0.4655579399141631, + "grad_norm": 0.4453125, + "learning_rate": 4.947955853707157e-06, + "loss": 2.3545, + "step": 8678 + }, + { + "epoch": 0.46561158798283264, + "grad_norm": 0.60546875, + "learning_rate": 4.947938217813204e-06, + "loss": 2.5084, + "step": 8679 + }, + { + "epoch": 0.4656652360515021, + "grad_norm": 0.416015625, + "learning_rate": 4.9479205789631125e-06, + "loss": 2.2617, + "step": 8680 + }, + { + "epoch": 0.46571888412017165, + "grad_norm": 0.42578125, + "learning_rate": 4.947902937156902e-06, + "loss": 2.4082, + "step": 8681 + }, + { + "epoch": 0.4657725321888412, + "grad_norm": 0.8984375, + "learning_rate": 4.9478852923945955e-06, + "loss": 1.3799, + "step": 8682 + }, + { + "epoch": 0.4658261802575107, + "grad_norm": 0.392578125, + "learning_rate": 4.947867644676212e-06, + "loss": 2.4117, + "step": 8683 + }, + { + "epoch": 0.46587982832618025, + "grad_norm": 2.1875, + "learning_rate": 4.947849994001776e-06, + "loss": 1.9464, + "step": 8684 + }, + { + "epoch": 0.4659334763948498, + "grad_norm": 0.4375, + "learning_rate": 4.9478323403713065e-06, + "loss": 2.1342, + "step": 8685 + }, + { + "epoch": 0.4659871244635193, + "grad_norm": 0.486328125, + "learning_rate": 4.9478146837848255e-06, + "loss": 2.6123, + "step": 8686 + }, + { + "epoch": 0.46604077253218884, + "grad_norm": 0.443359375, + "learning_rate": 4.9477970242423544e-06, + "loss": 1.4282, + "step": 8687 + }, + { + "epoch": 0.4660944206008584, + "grad_norm": 0.47265625, + "learning_rate": 4.947779361743915e-06, + "loss": 2.2794, + "step": 8688 + }, + { + "epoch": 0.4661480686695279, + "grad_norm": 0.69140625, + "learning_rate": 4.947761696289527e-06, + "loss": 2.4243, + "step": 8689 + }, + { + "epoch": 0.46620171673819744, + "grad_norm": 0.466796875, + "learning_rate": 4.947744027879213e-06, + "loss": 2.5332, + "step": 8690 + }, + { + "epoch": 0.46625536480686697, + "grad_norm": 1.3203125, + "learning_rate": 4.947726356512995e-06, + "loss": 2.2224, + "step": 8691 + }, + { + "epoch": 0.4663090128755365, + "grad_norm": 0.388671875, + "learning_rate": 4.9477086821908924e-06, + "loss": 2.1382, + "step": 8692 + }, + { + "epoch": 0.46636266094420603, + "grad_norm": 0.4375, + "learning_rate": 4.947691004912929e-06, + "loss": 2.2898, + "step": 8693 + }, + { + "epoch": 0.46641630901287556, + "grad_norm": 0.4375, + "learning_rate": 4.947673324679123e-06, + "loss": 2.3597, + "step": 8694 + }, + { + "epoch": 0.46646995708154504, + "grad_norm": 0.390625, + "learning_rate": 4.947655641489499e-06, + "loss": 2.3562, + "step": 8695 + }, + { + "epoch": 0.46652360515021457, + "grad_norm": 0.404296875, + "learning_rate": 4.947637955344076e-06, + "loss": 2.3583, + "step": 8696 + }, + { + "epoch": 0.4665772532188841, + "grad_norm": 0.38671875, + "learning_rate": 4.947620266242877e-06, + "loss": 2.055, + "step": 8697 + }, + { + "epoch": 0.46663090128755363, + "grad_norm": 0.32421875, + "learning_rate": 4.947602574185922e-06, + "loss": 2.1712, + "step": 8698 + }, + { + "epoch": 0.46668454935622317, + "grad_norm": 0.392578125, + "learning_rate": 4.947584879173233e-06, + "loss": 2.0843, + "step": 8699 + }, + { + "epoch": 0.4667381974248927, + "grad_norm": 0.466796875, + "learning_rate": 4.947567181204831e-06, + "loss": 2.6011, + "step": 8700 + }, + { + "epoch": 0.46679184549356223, + "grad_norm": 0.388671875, + "learning_rate": 4.947549480280739e-06, + "loss": 2.1461, + "step": 8701 + }, + { + "epoch": 0.46684549356223176, + "grad_norm": 0.38671875, + "learning_rate": 4.947531776400976e-06, + "loss": 2.2179, + "step": 8702 + }, + { + "epoch": 0.4668991416309013, + "grad_norm": 0.45703125, + "learning_rate": 4.9475140695655645e-06, + "loss": 2.3858, + "step": 8703 + }, + { + "epoch": 0.4669527896995708, + "grad_norm": 1.125, + "learning_rate": 4.947496359774527e-06, + "loss": 2.101, + "step": 8704 + }, + { + "epoch": 0.46700643776824036, + "grad_norm": 0.4140625, + "learning_rate": 4.9474786470278816e-06, + "loss": 2.2892, + "step": 8705 + }, + { + "epoch": 0.4670600858369099, + "grad_norm": 0.423828125, + "learning_rate": 4.947460931325653e-06, + "loss": 2.3345, + "step": 8706 + }, + { + "epoch": 0.4671137339055794, + "grad_norm": 0.7421875, + "learning_rate": 4.947443212667861e-06, + "loss": 2.4907, + "step": 8707 + }, + { + "epoch": 0.46716738197424895, + "grad_norm": 0.53515625, + "learning_rate": 4.947425491054528e-06, + "loss": 2.0454, + "step": 8708 + }, + { + "epoch": 0.46722103004291843, + "grad_norm": 0.412109375, + "learning_rate": 4.947407766485674e-06, + "loss": 2.459, + "step": 8709 + }, + { + "epoch": 0.46727467811158796, + "grad_norm": 0.4140625, + "learning_rate": 4.947390038961322e-06, + "loss": 2.4538, + "step": 8710 + }, + { + "epoch": 0.4673283261802575, + "grad_norm": 0.392578125, + "learning_rate": 4.947372308481493e-06, + "loss": 2.118, + "step": 8711 + }, + { + "epoch": 0.467381974248927, + "grad_norm": 0.466796875, + "learning_rate": 4.947354575046207e-06, + "loss": 2.1973, + "step": 8712 + }, + { + "epoch": 0.46743562231759656, + "grad_norm": 0.435546875, + "learning_rate": 4.947336838655487e-06, + "loss": 2.2637, + "step": 8713 + }, + { + "epoch": 0.4674892703862661, + "grad_norm": 0.375, + "learning_rate": 4.947319099309353e-06, + "loss": 2.209, + "step": 8714 + }, + { + "epoch": 0.4675429184549356, + "grad_norm": 0.384765625, + "learning_rate": 4.947301357007829e-06, + "loss": 2.2485, + "step": 8715 + }, + { + "epoch": 0.46759656652360515, + "grad_norm": 0.404296875, + "learning_rate": 4.947283611750933e-06, + "loss": 2.1262, + "step": 8716 + }, + { + "epoch": 0.4676502145922747, + "grad_norm": 0.421875, + "learning_rate": 4.94726586353869e-06, + "loss": 2.0217, + "step": 8717 + }, + { + "epoch": 0.4677038626609442, + "grad_norm": 0.482421875, + "learning_rate": 4.947248112371117e-06, + "loss": 2.11, + "step": 8718 + }, + { + "epoch": 0.46775751072961375, + "grad_norm": 0.43359375, + "learning_rate": 4.94723035824824e-06, + "loss": 2.3465, + "step": 8719 + }, + { + "epoch": 0.4678111587982833, + "grad_norm": 0.4453125, + "learning_rate": 4.9472126011700776e-06, + "loss": 2.2171, + "step": 8720 + }, + { + "epoch": 0.4678648068669528, + "grad_norm": 0.54296875, + "learning_rate": 4.947194841136653e-06, + "loss": 2.3893, + "step": 8721 + }, + { + "epoch": 0.46791845493562234, + "grad_norm": 0.369140625, + "learning_rate": 4.947177078147986e-06, + "loss": 2.4491, + "step": 8722 + }, + { + "epoch": 0.4679721030042919, + "grad_norm": 0.427734375, + "learning_rate": 4.947159312204098e-06, + "loss": 2.2903, + "step": 8723 + }, + { + "epoch": 0.46802575107296135, + "grad_norm": 0.40625, + "learning_rate": 4.947141543305013e-06, + "loss": 2.0731, + "step": 8724 + }, + { + "epoch": 0.4680793991416309, + "grad_norm": 0.36328125, + "learning_rate": 4.94712377145075e-06, + "loss": 2.2971, + "step": 8725 + }, + { + "epoch": 0.4681330472103004, + "grad_norm": 0.439453125, + "learning_rate": 4.947105996641331e-06, + "loss": 2.2539, + "step": 8726 + }, + { + "epoch": 0.46818669527896994, + "grad_norm": 0.373046875, + "learning_rate": 4.947088218876777e-06, + "loss": 1.9796, + "step": 8727 + }, + { + "epoch": 0.4682403433476395, + "grad_norm": 0.37890625, + "learning_rate": 4.947070438157111e-06, + "loss": 2.4644, + "step": 8728 + }, + { + "epoch": 0.468293991416309, + "grad_norm": 0.32421875, + "learning_rate": 4.947052654482353e-06, + "loss": 1.7644, + "step": 8729 + }, + { + "epoch": 0.46834763948497854, + "grad_norm": 0.52734375, + "learning_rate": 4.947034867852526e-06, + "loss": 2.4002, + "step": 8730 + }, + { + "epoch": 0.46840128755364807, + "grad_norm": 0.494140625, + "learning_rate": 4.9470170782676494e-06, + "loss": 1.8351, + "step": 8731 + }, + { + "epoch": 0.4684549356223176, + "grad_norm": 0.4453125, + "learning_rate": 4.946999285727746e-06, + "loss": 1.9923, + "step": 8732 + }, + { + "epoch": 0.46850858369098713, + "grad_norm": 0.482421875, + "learning_rate": 4.946981490232837e-06, + "loss": 2.0306, + "step": 8733 + }, + { + "epoch": 0.46856223175965667, + "grad_norm": 0.4453125, + "learning_rate": 4.946963691782945e-06, + "loss": 2.0157, + "step": 8734 + }, + { + "epoch": 0.4686158798283262, + "grad_norm": 0.359375, + "learning_rate": 4.9469458903780895e-06, + "loss": 1.9812, + "step": 8735 + }, + { + "epoch": 0.46866952789699573, + "grad_norm": 0.4140625, + "learning_rate": 4.946928086018293e-06, + "loss": 2.3325, + "step": 8736 + }, + { + "epoch": 0.46872317596566526, + "grad_norm": 0.6015625, + "learning_rate": 4.946910278703578e-06, + "loss": 2.199, + "step": 8737 + }, + { + "epoch": 0.46877682403433474, + "grad_norm": 0.462890625, + "learning_rate": 4.946892468433964e-06, + "loss": 2.4099, + "step": 8738 + }, + { + "epoch": 0.46883047210300427, + "grad_norm": 0.478515625, + "learning_rate": 4.946874655209474e-06, + "loss": 2.2578, + "step": 8739 + }, + { + "epoch": 0.4688841201716738, + "grad_norm": 0.431640625, + "learning_rate": 4.946856839030129e-06, + "loss": 2.4344, + "step": 8740 + }, + { + "epoch": 0.46893776824034333, + "grad_norm": 0.474609375, + "learning_rate": 4.94683901989595e-06, + "loss": 2.3279, + "step": 8741 + }, + { + "epoch": 0.46899141630901287, + "grad_norm": 0.54296875, + "learning_rate": 4.946821197806959e-06, + "loss": 2.309, + "step": 8742 + }, + { + "epoch": 0.4690450643776824, + "grad_norm": 0.3828125, + "learning_rate": 4.946803372763178e-06, + "loss": 2.1901, + "step": 8743 + }, + { + "epoch": 0.46909871244635193, + "grad_norm": 0.37109375, + "learning_rate": 4.9467855447646275e-06, + "loss": 2.3878, + "step": 8744 + }, + { + "epoch": 0.46915236051502146, + "grad_norm": 0.353515625, + "learning_rate": 4.94676771381133e-06, + "loss": 2.1814, + "step": 8745 + }, + { + "epoch": 0.469206008583691, + "grad_norm": 0.42578125, + "learning_rate": 4.946749879903306e-06, + "loss": 2.2824, + "step": 8746 + }, + { + "epoch": 0.4692596566523605, + "grad_norm": 0.43359375, + "learning_rate": 4.946732043040579e-06, + "loss": 2.421, + "step": 8747 + }, + { + "epoch": 0.46931330472103006, + "grad_norm": 0.54296875, + "learning_rate": 4.946714203223168e-06, + "loss": 2.7437, + "step": 8748 + }, + { + "epoch": 0.4693669527896996, + "grad_norm": 0.45703125, + "learning_rate": 4.946696360451097e-06, + "loss": 2.0539, + "step": 8749 + }, + { + "epoch": 0.4694206008583691, + "grad_norm": 0.3671875, + "learning_rate": 4.946678514724385e-06, + "loss": 2.029, + "step": 8750 + }, + { + "epoch": 0.46947424892703865, + "grad_norm": 0.43359375, + "learning_rate": 4.946660666043056e-06, + "loss": 2.5437, + "step": 8751 + }, + { + "epoch": 0.4695278969957081, + "grad_norm": 0.64453125, + "learning_rate": 4.94664281440713e-06, + "loss": 2.2276, + "step": 8752 + }, + { + "epoch": 0.46958154506437766, + "grad_norm": 0.412109375, + "learning_rate": 4.9466249598166285e-06, + "loss": 2.3015, + "step": 8753 + }, + { + "epoch": 0.4696351931330472, + "grad_norm": 0.3984375, + "learning_rate": 4.946607102271574e-06, + "loss": 2.5112, + "step": 8754 + }, + { + "epoch": 0.4696888412017167, + "grad_norm": 0.51171875, + "learning_rate": 4.946589241771987e-06, + "loss": 2.4474, + "step": 8755 + }, + { + "epoch": 0.46974248927038625, + "grad_norm": 0.490234375, + "learning_rate": 4.94657137831789e-06, + "loss": 2.1784, + "step": 8756 + }, + { + "epoch": 0.4697961373390558, + "grad_norm": 0.68359375, + "learning_rate": 4.946553511909305e-06, + "loss": 2.2995, + "step": 8757 + }, + { + "epoch": 0.4698497854077253, + "grad_norm": 0.396484375, + "learning_rate": 4.946535642546252e-06, + "loss": 2.3782, + "step": 8758 + }, + { + "epoch": 0.46990343347639485, + "grad_norm": 0.380859375, + "learning_rate": 4.946517770228754e-06, + "loss": 2.3339, + "step": 8759 + }, + { + "epoch": 0.4699570815450644, + "grad_norm": 0.431640625, + "learning_rate": 4.946499894956831e-06, + "loss": 2.2643, + "step": 8760 + }, + { + "epoch": 0.4700107296137339, + "grad_norm": 0.458984375, + "learning_rate": 4.946482016730506e-06, + "loss": 2.4007, + "step": 8761 + }, + { + "epoch": 0.47006437768240344, + "grad_norm": 0.54296875, + "learning_rate": 4.946464135549801e-06, + "loss": 1.6152, + "step": 8762 + }, + { + "epoch": 0.470118025751073, + "grad_norm": 1.8984375, + "learning_rate": 4.946446251414735e-06, + "loss": 2.4195, + "step": 8763 + }, + { + "epoch": 0.4701716738197425, + "grad_norm": 0.52734375, + "learning_rate": 4.946428364325333e-06, + "loss": 1.2953, + "step": 8764 + }, + { + "epoch": 0.47022532188841204, + "grad_norm": 0.439453125, + "learning_rate": 4.946410474281614e-06, + "loss": 2.1249, + "step": 8765 + }, + { + "epoch": 0.47027896995708157, + "grad_norm": 0.421875, + "learning_rate": 4.946392581283601e-06, + "loss": 2.2753, + "step": 8766 + }, + { + "epoch": 0.47033261802575105, + "grad_norm": 0.478515625, + "learning_rate": 4.9463746853313146e-06, + "loss": 2.2803, + "step": 8767 + }, + { + "epoch": 0.4703862660944206, + "grad_norm": 0.58984375, + "learning_rate": 4.946356786424777e-06, + "loss": 2.3123, + "step": 8768 + }, + { + "epoch": 0.4704399141630901, + "grad_norm": 0.54296875, + "learning_rate": 4.94633888456401e-06, + "loss": 2.2969, + "step": 8769 + }, + { + "epoch": 0.47049356223175964, + "grad_norm": 0.365234375, + "learning_rate": 4.946320979749035e-06, + "loss": 2.345, + "step": 8770 + }, + { + "epoch": 0.4705472103004292, + "grad_norm": 0.77734375, + "learning_rate": 4.946303071979874e-06, + "loss": 1.6842, + "step": 8771 + }, + { + "epoch": 0.4706008583690987, + "grad_norm": 0.380859375, + "learning_rate": 4.946285161256548e-06, + "loss": 2.0171, + "step": 8772 + }, + { + "epoch": 0.47065450643776824, + "grad_norm": 0.451171875, + "learning_rate": 4.946267247579079e-06, + "loss": 2.289, + "step": 8773 + }, + { + "epoch": 0.47070815450643777, + "grad_norm": 0.421875, + "learning_rate": 4.946249330947489e-06, + "loss": 2.3974, + "step": 8774 + }, + { + "epoch": 0.4707618025751073, + "grad_norm": 0.4375, + "learning_rate": 4.946231411361798e-06, + "loss": 2.2441, + "step": 8775 + }, + { + "epoch": 0.47081545064377683, + "grad_norm": 0.58984375, + "learning_rate": 4.946213488822029e-06, + "loss": 2.1352, + "step": 8776 + }, + { + "epoch": 0.47086909871244637, + "grad_norm": 0.4296875, + "learning_rate": 4.9461955633282035e-06, + "loss": 2.0421, + "step": 8777 + }, + { + "epoch": 0.4709227467811159, + "grad_norm": 0.466796875, + "learning_rate": 4.9461776348803435e-06, + "loss": 2.4609, + "step": 8778 + }, + { + "epoch": 0.47097639484978543, + "grad_norm": 0.75390625, + "learning_rate": 4.94615970347847e-06, + "loss": 2.5532, + "step": 8779 + }, + { + "epoch": 0.47103004291845496, + "grad_norm": 0.416015625, + "learning_rate": 4.946141769122604e-06, + "loss": 2.3591, + "step": 8780 + }, + { + "epoch": 0.47108369098712444, + "grad_norm": 0.423828125, + "learning_rate": 4.946123831812769e-06, + "loss": 2.3898, + "step": 8781 + }, + { + "epoch": 0.47113733905579397, + "grad_norm": 0.4609375, + "learning_rate": 4.946105891548986e-06, + "loss": 2.3494, + "step": 8782 + }, + { + "epoch": 0.4711909871244635, + "grad_norm": 0.3828125, + "learning_rate": 4.946087948331276e-06, + "loss": 2.2242, + "step": 8783 + }, + { + "epoch": 0.47124463519313303, + "grad_norm": 0.416015625, + "learning_rate": 4.946070002159661e-06, + "loss": 2.2314, + "step": 8784 + }, + { + "epoch": 0.47129828326180256, + "grad_norm": 0.392578125, + "learning_rate": 4.946052053034163e-06, + "loss": 2.3206, + "step": 8785 + }, + { + "epoch": 0.4713519313304721, + "grad_norm": 0.42578125, + "learning_rate": 4.946034100954803e-06, + "loss": 1.8699, + "step": 8786 + }, + { + "epoch": 0.4714055793991416, + "grad_norm": 0.482421875, + "learning_rate": 4.9460161459216025e-06, + "loss": 2.1504, + "step": 8787 + }, + { + "epoch": 0.47145922746781116, + "grad_norm": 0.369140625, + "learning_rate": 4.945998187934584e-06, + "loss": 2.0733, + "step": 8788 + }, + { + "epoch": 0.4715128755364807, + "grad_norm": 0.4921875, + "learning_rate": 4.945980226993769e-06, + "loss": 2.3009, + "step": 8789 + }, + { + "epoch": 0.4715665236051502, + "grad_norm": 0.46875, + "learning_rate": 4.94596226309918e-06, + "loss": 2.3351, + "step": 8790 + }, + { + "epoch": 0.47162017167381975, + "grad_norm": 0.462890625, + "learning_rate": 4.945944296250838e-06, + "loss": 2.0664, + "step": 8791 + }, + { + "epoch": 0.4716738197424893, + "grad_norm": 0.42578125, + "learning_rate": 4.945926326448763e-06, + "loss": 2.0924, + "step": 8792 + }, + { + "epoch": 0.4717274678111588, + "grad_norm": 0.443359375, + "learning_rate": 4.9459083536929785e-06, + "loss": 2.2773, + "step": 8793 + }, + { + "epoch": 0.47178111587982835, + "grad_norm": 0.400390625, + "learning_rate": 4.9458903779835066e-06, + "loss": 2.2111, + "step": 8794 + }, + { + "epoch": 0.4718347639484979, + "grad_norm": 0.447265625, + "learning_rate": 4.945872399320368e-06, + "loss": 2.3542, + "step": 8795 + }, + { + "epoch": 0.47188841201716736, + "grad_norm": 0.486328125, + "learning_rate": 4.945854417703584e-06, + "loss": 2.404, + "step": 8796 + }, + { + "epoch": 0.4719420600858369, + "grad_norm": 0.44921875, + "learning_rate": 4.945836433133178e-06, + "loss": 2.0879, + "step": 8797 + }, + { + "epoch": 0.4719957081545064, + "grad_norm": 0.43359375, + "learning_rate": 4.945818445609171e-06, + "loss": 2.3505, + "step": 8798 + }, + { + "epoch": 0.47204935622317595, + "grad_norm": 0.443359375, + "learning_rate": 4.945800455131584e-06, + "loss": 2.4325, + "step": 8799 + }, + { + "epoch": 0.4721030042918455, + "grad_norm": 0.419921875, + "learning_rate": 4.945782461700439e-06, + "loss": 2.4049, + "step": 8800 + }, + { + "epoch": 0.472156652360515, + "grad_norm": 0.326171875, + "learning_rate": 4.945764465315758e-06, + "loss": 2.1433, + "step": 8801 + }, + { + "epoch": 0.47221030042918455, + "grad_norm": 0.455078125, + "learning_rate": 4.945746465977563e-06, + "loss": 2.3824, + "step": 8802 + }, + { + "epoch": 0.4722639484978541, + "grad_norm": 0.474609375, + "learning_rate": 4.945728463685875e-06, + "loss": 2.3215, + "step": 8803 + }, + { + "epoch": 0.4723175965665236, + "grad_norm": 0.470703125, + "learning_rate": 4.945710458440716e-06, + "loss": 2.1245, + "step": 8804 + }, + { + "epoch": 0.47237124463519314, + "grad_norm": 0.439453125, + "learning_rate": 4.945692450242108e-06, + "loss": 2.2724, + "step": 8805 + }, + { + "epoch": 0.4724248927038627, + "grad_norm": 0.44921875, + "learning_rate": 4.945674439090073e-06, + "loss": 2.1576, + "step": 8806 + }, + { + "epoch": 0.4724785407725322, + "grad_norm": 2.703125, + "learning_rate": 4.945656424984632e-06, + "loss": 2.4053, + "step": 8807 + }, + { + "epoch": 0.47253218884120174, + "grad_norm": 0.35546875, + "learning_rate": 4.945638407925808e-06, + "loss": 1.9733, + "step": 8808 + }, + { + "epoch": 0.47258583690987127, + "grad_norm": 0.369140625, + "learning_rate": 4.945620387913621e-06, + "loss": 2.0461, + "step": 8809 + }, + { + "epoch": 0.47263948497854075, + "grad_norm": 0.490234375, + "learning_rate": 4.9456023649480935e-06, + "loss": 2.4509, + "step": 8810 + }, + { + "epoch": 0.4726931330472103, + "grad_norm": 1.0078125, + "learning_rate": 4.945584339029248e-06, + "loss": 2.4728, + "step": 8811 + }, + { + "epoch": 0.4727467811158798, + "grad_norm": 0.71484375, + "learning_rate": 4.945566310157105e-06, + "loss": 2.3511, + "step": 8812 + }, + { + "epoch": 0.47280042918454934, + "grad_norm": 0.470703125, + "learning_rate": 4.9455482783316874e-06, + "loss": 2.0818, + "step": 8813 + }, + { + "epoch": 0.4728540772532189, + "grad_norm": 0.431640625, + "learning_rate": 4.945530243553017e-06, + "loss": 2.0493, + "step": 8814 + }, + { + "epoch": 0.4729077253218884, + "grad_norm": 0.390625, + "learning_rate": 4.945512205821115e-06, + "loss": 2.5921, + "step": 8815 + }, + { + "epoch": 0.47296137339055794, + "grad_norm": 0.43359375, + "learning_rate": 4.945494165136002e-06, + "loss": 2.4022, + "step": 8816 + }, + { + "epoch": 0.47301502145922747, + "grad_norm": 0.404296875, + "learning_rate": 4.9454761214977034e-06, + "loss": 2.4339, + "step": 8817 + }, + { + "epoch": 0.473068669527897, + "grad_norm": 0.400390625, + "learning_rate": 4.945458074906237e-06, + "loss": 2.4886, + "step": 8818 + }, + { + "epoch": 0.47312231759656653, + "grad_norm": 0.380859375, + "learning_rate": 4.945440025361626e-06, + "loss": 2.2798, + "step": 8819 + }, + { + "epoch": 0.47317596566523606, + "grad_norm": 0.451171875, + "learning_rate": 4.945421972863893e-06, + "loss": 2.327, + "step": 8820 + }, + { + "epoch": 0.4732296137339056, + "grad_norm": 0.447265625, + "learning_rate": 4.94540391741306e-06, + "loss": 2.6489, + "step": 8821 + }, + { + "epoch": 0.47328326180257513, + "grad_norm": 0.474609375, + "learning_rate": 4.945385859009148e-06, + "loss": 2.0168, + "step": 8822 + }, + { + "epoch": 0.47333690987124466, + "grad_norm": 0.390625, + "learning_rate": 4.945367797652177e-06, + "loss": 2.0701, + "step": 8823 + }, + { + "epoch": 0.47339055793991414, + "grad_norm": 0.462890625, + "learning_rate": 4.945349733342172e-06, + "loss": 2.5112, + "step": 8824 + }, + { + "epoch": 0.47344420600858367, + "grad_norm": 0.365234375, + "learning_rate": 4.945331666079154e-06, + "loss": 2.0349, + "step": 8825 + }, + { + "epoch": 0.4734978540772532, + "grad_norm": 0.44921875, + "learning_rate": 4.945313595863144e-06, + "loss": 2.2986, + "step": 8826 + }, + { + "epoch": 0.47355150214592273, + "grad_norm": 0.373046875, + "learning_rate": 4.945295522694163e-06, + "loss": 2.2047, + "step": 8827 + }, + { + "epoch": 0.47360515021459226, + "grad_norm": 0.388671875, + "learning_rate": 4.945277446572236e-06, + "loss": 2.3384, + "step": 8828 + }, + { + "epoch": 0.4736587982832618, + "grad_norm": 0.5703125, + "learning_rate": 4.945259367497381e-06, + "loss": 2.5267, + "step": 8829 + }, + { + "epoch": 0.4737124463519313, + "grad_norm": 0.455078125, + "learning_rate": 4.945241285469622e-06, + "loss": 2.1846, + "step": 8830 + }, + { + "epoch": 0.47376609442060086, + "grad_norm": 0.91796875, + "learning_rate": 4.945223200488981e-06, + "loss": 2.6102, + "step": 8831 + }, + { + "epoch": 0.4738197424892704, + "grad_norm": 0.474609375, + "learning_rate": 4.945205112555479e-06, + "loss": 2.2499, + "step": 8832 + }, + { + "epoch": 0.4738733905579399, + "grad_norm": 0.38671875, + "learning_rate": 4.945187021669138e-06, + "loss": 2.1638, + "step": 8833 + }, + { + "epoch": 0.47392703862660945, + "grad_norm": 0.447265625, + "learning_rate": 4.945168927829981e-06, + "loss": 2.3478, + "step": 8834 + }, + { + "epoch": 0.473980686695279, + "grad_norm": 0.423828125, + "learning_rate": 4.945150831038027e-06, + "loss": 2.2133, + "step": 8835 + }, + { + "epoch": 0.4740343347639485, + "grad_norm": 0.392578125, + "learning_rate": 4.945132731293301e-06, + "loss": 2.2084, + "step": 8836 + }, + { + "epoch": 0.47408798283261805, + "grad_norm": 1.75, + "learning_rate": 4.945114628595824e-06, + "loss": 2.4991, + "step": 8837 + }, + { + "epoch": 0.4741416309012876, + "grad_norm": 0.44921875, + "learning_rate": 4.945096522945616e-06, + "loss": 2.4052, + "step": 8838 + }, + { + "epoch": 0.47419527896995706, + "grad_norm": 0.43359375, + "learning_rate": 4.945078414342701e-06, + "loss": 2.5375, + "step": 8839 + }, + { + "epoch": 0.4742489270386266, + "grad_norm": 0.43359375, + "learning_rate": 4.9450603027871e-06, + "loss": 2.2988, + "step": 8840 + }, + { + "epoch": 0.4743025751072961, + "grad_norm": 0.48828125, + "learning_rate": 4.945042188278835e-06, + "loss": 2.272, + "step": 8841 + }, + { + "epoch": 0.47435622317596565, + "grad_norm": 0.4609375, + "learning_rate": 4.945024070817928e-06, + "loss": 2.3495, + "step": 8842 + }, + { + "epoch": 0.4744098712446352, + "grad_norm": 0.376953125, + "learning_rate": 4.945005950404401e-06, + "loss": 2.2293, + "step": 8843 + }, + { + "epoch": 0.4744635193133047, + "grad_norm": 0.4140625, + "learning_rate": 4.944987827038276e-06, + "loss": 2.3616, + "step": 8844 + }, + { + "epoch": 0.47451716738197425, + "grad_norm": 0.439453125, + "learning_rate": 4.944969700719573e-06, + "loss": 1.6806, + "step": 8845 + }, + { + "epoch": 0.4745708154506438, + "grad_norm": 0.447265625, + "learning_rate": 4.944951571448317e-06, + "loss": 2.1221, + "step": 8846 + }, + { + "epoch": 0.4746244635193133, + "grad_norm": 0.3984375, + "learning_rate": 4.944933439224527e-06, + "loss": 2.3043, + "step": 8847 + }, + { + "epoch": 0.47467811158798284, + "grad_norm": 0.51953125, + "learning_rate": 4.944915304048227e-06, + "loss": 2.4759, + "step": 8848 + }, + { + "epoch": 0.4747317596566524, + "grad_norm": 0.474609375, + "learning_rate": 4.944897165919439e-06, + "loss": 2.316, + "step": 8849 + }, + { + "epoch": 0.4747854077253219, + "grad_norm": 0.3515625, + "learning_rate": 4.944879024838182e-06, + "loss": 2.264, + "step": 8850 + }, + { + "epoch": 0.47483905579399144, + "grad_norm": 0.4140625, + "learning_rate": 4.944860880804481e-06, + "loss": 2.3249, + "step": 8851 + }, + { + "epoch": 0.47489270386266097, + "grad_norm": 0.4609375, + "learning_rate": 4.944842733818357e-06, + "loss": 2.123, + "step": 8852 + }, + { + "epoch": 0.47494635193133045, + "grad_norm": 0.427734375, + "learning_rate": 4.944824583879831e-06, + "loss": 2.41, + "step": 8853 + }, + { + "epoch": 0.475, + "grad_norm": 0.6484375, + "learning_rate": 4.944806430988927e-06, + "loss": 1.0647, + "step": 8854 + }, + { + "epoch": 0.4750536480686695, + "grad_norm": 0.51171875, + "learning_rate": 4.944788275145665e-06, + "loss": 2.4199, + "step": 8855 + }, + { + "epoch": 0.47510729613733904, + "grad_norm": 0.427734375, + "learning_rate": 4.944770116350067e-06, + "loss": 2.1947, + "step": 8856 + }, + { + "epoch": 0.4751609442060086, + "grad_norm": 0.453125, + "learning_rate": 4.944751954602156e-06, + "loss": 2.1879, + "step": 8857 + }, + { + "epoch": 0.4752145922746781, + "grad_norm": 0.486328125, + "learning_rate": 4.944733789901953e-06, + "loss": 2.3992, + "step": 8858 + }, + { + "epoch": 0.47526824034334764, + "grad_norm": 0.3984375, + "learning_rate": 4.94471562224948e-06, + "loss": 2.1054, + "step": 8859 + }, + { + "epoch": 0.47532188841201717, + "grad_norm": 1.1328125, + "learning_rate": 4.94469745164476e-06, + "loss": 2.1622, + "step": 8860 + }, + { + "epoch": 0.4753755364806867, + "grad_norm": 0.5390625, + "learning_rate": 4.944679278087814e-06, + "loss": 2.1699, + "step": 8861 + }, + { + "epoch": 0.47542918454935623, + "grad_norm": 0.56640625, + "learning_rate": 4.944661101578665e-06, + "loss": 2.2377, + "step": 8862 + }, + { + "epoch": 0.47548283261802576, + "grad_norm": 0.40625, + "learning_rate": 4.944642922117332e-06, + "loss": 2.1724, + "step": 8863 + }, + { + "epoch": 0.4755364806866953, + "grad_norm": 0.427734375, + "learning_rate": 4.9446247397038405e-06, + "loss": 2.2562, + "step": 8864 + }, + { + "epoch": 0.4755901287553648, + "grad_norm": 0.361328125, + "learning_rate": 4.944606554338211e-06, + "loss": 2.0826, + "step": 8865 + }, + { + "epoch": 0.47564377682403436, + "grad_norm": 0.4375, + "learning_rate": 4.944588366020466e-06, + "loss": 2.4331, + "step": 8866 + }, + { + "epoch": 0.47569742489270384, + "grad_norm": 0.470703125, + "learning_rate": 4.944570174750625e-06, + "loss": 2.5492, + "step": 8867 + }, + { + "epoch": 0.47575107296137337, + "grad_norm": 0.498046875, + "learning_rate": 4.944551980528714e-06, + "loss": 2.6729, + "step": 8868 + }, + { + "epoch": 0.4758047210300429, + "grad_norm": 0.390625, + "learning_rate": 4.944533783354751e-06, + "loss": 2.1953, + "step": 8869 + }, + { + "epoch": 0.47585836909871243, + "grad_norm": 0.703125, + "learning_rate": 4.944515583228761e-06, + "loss": 2.1878, + "step": 8870 + }, + { + "epoch": 0.47591201716738196, + "grad_norm": 0.419921875, + "learning_rate": 4.944497380150764e-06, + "loss": 2.1381, + "step": 8871 + }, + { + "epoch": 0.4759656652360515, + "grad_norm": 0.51953125, + "learning_rate": 4.944479174120783e-06, + "loss": 2.3032, + "step": 8872 + }, + { + "epoch": 0.476019313304721, + "grad_norm": 0.6796875, + "learning_rate": 4.944460965138841e-06, + "loss": 1.3505, + "step": 8873 + }, + { + "epoch": 0.47607296137339056, + "grad_norm": 0.48046875, + "learning_rate": 4.9444427532049574e-06, + "loss": 2.2773, + "step": 8874 + }, + { + "epoch": 0.4761266094420601, + "grad_norm": 0.412109375, + "learning_rate": 4.944424538319156e-06, + "loss": 2.3156, + "step": 8875 + }, + { + "epoch": 0.4761802575107296, + "grad_norm": 0.474609375, + "learning_rate": 4.944406320481458e-06, + "loss": 2.3368, + "step": 8876 + }, + { + "epoch": 0.47623390557939915, + "grad_norm": 0.41015625, + "learning_rate": 4.944388099691887e-06, + "loss": 2.2277, + "step": 8877 + }, + { + "epoch": 0.4762875536480687, + "grad_norm": 0.41796875, + "learning_rate": 4.944369875950463e-06, + "loss": 2.2986, + "step": 8878 + }, + { + "epoch": 0.4763412017167382, + "grad_norm": 0.40234375, + "learning_rate": 4.944351649257208e-06, + "loss": 2.0862, + "step": 8879 + }, + { + "epoch": 0.47639484978540775, + "grad_norm": 0.4140625, + "learning_rate": 4.944333419612146e-06, + "loss": 2.3175, + "step": 8880 + }, + { + "epoch": 0.4764484978540773, + "grad_norm": 0.455078125, + "learning_rate": 4.944315187015297e-06, + "loss": 2.4188, + "step": 8881 + }, + { + "epoch": 0.47650214592274676, + "grad_norm": 0.388671875, + "learning_rate": 4.944296951466685e-06, + "loss": 2.2673, + "step": 8882 + }, + { + "epoch": 0.4765557939914163, + "grad_norm": 0.46484375, + "learning_rate": 4.944278712966329e-06, + "loss": 2.3835, + "step": 8883 + }, + { + "epoch": 0.4766094420600858, + "grad_norm": 0.46484375, + "learning_rate": 4.944260471514255e-06, + "loss": 2.3446, + "step": 8884 + }, + { + "epoch": 0.47666309012875535, + "grad_norm": 0.34765625, + "learning_rate": 4.944242227110482e-06, + "loss": 2.4847, + "step": 8885 + }, + { + "epoch": 0.4767167381974249, + "grad_norm": 0.416015625, + "learning_rate": 4.9442239797550315e-06, + "loss": 2.2292, + "step": 8886 + }, + { + "epoch": 0.4767703862660944, + "grad_norm": 0.390625, + "learning_rate": 4.944205729447929e-06, + "loss": 2.4284, + "step": 8887 + }, + { + "epoch": 0.47682403433476395, + "grad_norm": 0.431640625, + "learning_rate": 4.944187476189193e-06, + "loss": 2.1611, + "step": 8888 + }, + { + "epoch": 0.4768776824034335, + "grad_norm": 0.44140625, + "learning_rate": 4.9441692199788485e-06, + "loss": 2.189, + "step": 8889 + }, + { + "epoch": 0.476931330472103, + "grad_norm": 0.353515625, + "learning_rate": 4.944150960816915e-06, + "loss": 1.9432, + "step": 8890 + }, + { + "epoch": 0.47698497854077254, + "grad_norm": 0.46875, + "learning_rate": 4.944132698703416e-06, + "loss": 2.0996, + "step": 8891 + }, + { + "epoch": 0.4770386266094421, + "grad_norm": 0.337890625, + "learning_rate": 4.944114433638373e-06, + "loss": 1.8503, + "step": 8892 + }, + { + "epoch": 0.4770922746781116, + "grad_norm": 0.35546875, + "learning_rate": 4.944096165621809e-06, + "loss": 1.7676, + "step": 8893 + }, + { + "epoch": 0.47714592274678114, + "grad_norm": 0.50390625, + "learning_rate": 4.944077894653744e-06, + "loss": 2.3126, + "step": 8894 + }, + { + "epoch": 0.47719957081545067, + "grad_norm": 0.451171875, + "learning_rate": 4.944059620734202e-06, + "loss": 2.0806, + "step": 8895 + }, + { + "epoch": 0.47725321888412015, + "grad_norm": 0.423828125, + "learning_rate": 4.944041343863205e-06, + "loss": 2.4534, + "step": 8896 + }, + { + "epoch": 0.4773068669527897, + "grad_norm": 0.4140625, + "learning_rate": 4.944023064040774e-06, + "loss": 2.4978, + "step": 8897 + }, + { + "epoch": 0.4773605150214592, + "grad_norm": 0.396484375, + "learning_rate": 4.944004781266931e-06, + "loss": 2.3939, + "step": 8898 + }, + { + "epoch": 0.47741416309012874, + "grad_norm": 0.392578125, + "learning_rate": 4.943986495541699e-06, + "loss": 2.2004, + "step": 8899 + }, + { + "epoch": 0.47746781115879827, + "grad_norm": 0.58984375, + "learning_rate": 4.943968206865101e-06, + "loss": 2.2038, + "step": 8900 + }, + { + "epoch": 0.4775214592274678, + "grad_norm": 0.53125, + "learning_rate": 4.943949915237156e-06, + "loss": 2.124, + "step": 8901 + }, + { + "epoch": 0.47757510729613734, + "grad_norm": 0.44921875, + "learning_rate": 4.943931620657889e-06, + "loss": 2.1689, + "step": 8902 + }, + { + "epoch": 0.47762875536480687, + "grad_norm": 0.34765625, + "learning_rate": 4.9439133231273204e-06, + "loss": 2.1004, + "step": 8903 + }, + { + "epoch": 0.4776824034334764, + "grad_norm": 0.41796875, + "learning_rate": 4.943895022645474e-06, + "loss": 2.1806, + "step": 8904 + }, + { + "epoch": 0.47773605150214593, + "grad_norm": 1.7109375, + "learning_rate": 4.94387671921237e-06, + "loss": 2.0749, + "step": 8905 + }, + { + "epoch": 0.47778969957081546, + "grad_norm": 0.421875, + "learning_rate": 4.943858412828031e-06, + "loss": 2.3557, + "step": 8906 + }, + { + "epoch": 0.477843347639485, + "grad_norm": 0.419921875, + "learning_rate": 4.94384010349248e-06, + "loss": 2.2587, + "step": 8907 + }, + { + "epoch": 0.4778969957081545, + "grad_norm": 0.458984375, + "learning_rate": 4.943821791205738e-06, + "loss": 2.2171, + "step": 8908 + }, + { + "epoch": 0.47795064377682406, + "grad_norm": 0.38671875, + "learning_rate": 4.943803475967829e-06, + "loss": 2.1786, + "step": 8909 + }, + { + "epoch": 0.4780042918454936, + "grad_norm": 0.359375, + "learning_rate": 4.9437851577787725e-06, + "loss": 2.2001, + "step": 8910 + }, + { + "epoch": 0.47805793991416307, + "grad_norm": 0.455078125, + "learning_rate": 4.943766836638592e-06, + "loss": 2.3122, + "step": 8911 + }, + { + "epoch": 0.4781115879828326, + "grad_norm": 0.44140625, + "learning_rate": 4.943748512547309e-06, + "loss": 2.1212, + "step": 8912 + }, + { + "epoch": 0.47816523605150213, + "grad_norm": 0.5546875, + "learning_rate": 4.943730185504948e-06, + "loss": 2.3143, + "step": 8913 + }, + { + "epoch": 0.47821888412017166, + "grad_norm": 0.41796875, + "learning_rate": 4.9437118555115275e-06, + "loss": 2.1702, + "step": 8914 + }, + { + "epoch": 0.4782725321888412, + "grad_norm": 0.5, + "learning_rate": 4.943693522567072e-06, + "loss": 2.0561, + "step": 8915 + }, + { + "epoch": 0.4783261802575107, + "grad_norm": 0.384765625, + "learning_rate": 4.943675186671604e-06, + "loss": 2.3774, + "step": 8916 + }, + { + "epoch": 0.47837982832618026, + "grad_norm": 0.373046875, + "learning_rate": 4.9436568478251435e-06, + "loss": 2.1099, + "step": 8917 + }, + { + "epoch": 0.4784334763948498, + "grad_norm": 0.373046875, + "learning_rate": 4.943638506027713e-06, + "loss": 2.1765, + "step": 8918 + }, + { + "epoch": 0.4784871244635193, + "grad_norm": 0.421875, + "learning_rate": 4.943620161279337e-06, + "loss": 2.3736, + "step": 8919 + }, + { + "epoch": 0.47854077253218885, + "grad_norm": 0.50390625, + "learning_rate": 4.9436018135800356e-06, + "loss": 2.2105, + "step": 8920 + }, + { + "epoch": 0.4785944206008584, + "grad_norm": 20.0, + "learning_rate": 4.943583462929832e-06, + "loss": 2.2725, + "step": 8921 + }, + { + "epoch": 0.4786480686695279, + "grad_norm": 0.40234375, + "learning_rate": 4.943565109328747e-06, + "loss": 2.0461, + "step": 8922 + }, + { + "epoch": 0.47870171673819745, + "grad_norm": 0.42578125, + "learning_rate": 4.943546752776804e-06, + "loss": 2.0595, + "step": 8923 + }, + { + "epoch": 0.478755364806867, + "grad_norm": 0.44140625, + "learning_rate": 4.9435283932740255e-06, + "loss": 2.2805, + "step": 8924 + }, + { + "epoch": 0.47880901287553645, + "grad_norm": 0.453125, + "learning_rate": 4.943510030820432e-06, + "loss": 2.3567, + "step": 8925 + }, + { + "epoch": 0.478862660944206, + "grad_norm": 0.3828125, + "learning_rate": 4.943491665416047e-06, + "loss": 2.279, + "step": 8926 + }, + { + "epoch": 0.4789163090128755, + "grad_norm": 0.439453125, + "learning_rate": 4.9434732970608926e-06, + "loss": 2.5623, + "step": 8927 + }, + { + "epoch": 0.47896995708154505, + "grad_norm": 0.462890625, + "learning_rate": 4.94345492575499e-06, + "loss": 2.3498, + "step": 8928 + }, + { + "epoch": 0.4790236051502146, + "grad_norm": 0.39453125, + "learning_rate": 4.943436551498362e-06, + "loss": 2.3476, + "step": 8929 + }, + { + "epoch": 0.4790772532188841, + "grad_norm": 0.490234375, + "learning_rate": 4.943418174291031e-06, + "loss": 1.4392, + "step": 8930 + }, + { + "epoch": 0.47913090128755365, + "grad_norm": 0.4140625, + "learning_rate": 4.94339979413302e-06, + "loss": 2.2981, + "step": 8931 + }, + { + "epoch": 0.4791845493562232, + "grad_norm": 0.50390625, + "learning_rate": 4.943381411024349e-06, + "loss": 2.4451, + "step": 8932 + }, + { + "epoch": 0.4792381974248927, + "grad_norm": 0.384765625, + "learning_rate": 4.943363024965042e-06, + "loss": 2.0318, + "step": 8933 + }, + { + "epoch": 0.47929184549356224, + "grad_norm": 0.40234375, + "learning_rate": 4.943344635955121e-06, + "loss": 2.4546, + "step": 8934 + }, + { + "epoch": 0.4793454935622318, + "grad_norm": 0.451171875, + "learning_rate": 4.9433262439946074e-06, + "loss": 2.3844, + "step": 8935 + }, + { + "epoch": 0.4793991416309013, + "grad_norm": 0.33984375, + "learning_rate": 4.943307849083524e-06, + "loss": 2.1855, + "step": 8936 + }, + { + "epoch": 0.47945278969957084, + "grad_norm": 0.486328125, + "learning_rate": 4.9432894512218935e-06, + "loss": 2.363, + "step": 8937 + }, + { + "epoch": 0.47950643776824037, + "grad_norm": 0.447265625, + "learning_rate": 4.9432710504097365e-06, + "loss": 2.0452, + "step": 8938 + }, + { + "epoch": 0.47956008583690984, + "grad_norm": 0.47265625, + "learning_rate": 4.9432526466470766e-06, + "loss": 1.3487, + "step": 8939 + }, + { + "epoch": 0.4796137339055794, + "grad_norm": 0.439453125, + "learning_rate": 4.943234239933936e-06, + "loss": 2.3998, + "step": 8940 + }, + { + "epoch": 0.4796673819742489, + "grad_norm": 0.7265625, + "learning_rate": 4.943215830270336e-06, + "loss": 2.3344, + "step": 8941 + }, + { + "epoch": 0.47972103004291844, + "grad_norm": 0.400390625, + "learning_rate": 4.943197417656299e-06, + "loss": 2.2451, + "step": 8942 + }, + { + "epoch": 0.47977467811158797, + "grad_norm": 0.390625, + "learning_rate": 4.943179002091849e-06, + "loss": 2.4598, + "step": 8943 + }, + { + "epoch": 0.4798283261802575, + "grad_norm": 0.388671875, + "learning_rate": 4.943160583577006e-06, + "loss": 2.1559, + "step": 8944 + }, + { + "epoch": 0.47988197424892703, + "grad_norm": 0.46484375, + "learning_rate": 4.9431421621117935e-06, + "loss": 2.2971, + "step": 8945 + }, + { + "epoch": 0.47993562231759657, + "grad_norm": 0.48046875, + "learning_rate": 4.9431237376962325e-06, + "loss": 2.1624, + "step": 8946 + }, + { + "epoch": 0.4799892703862661, + "grad_norm": 0.451171875, + "learning_rate": 4.943105310330347e-06, + "loss": 2.2466, + "step": 8947 + }, + { + "epoch": 0.48004291845493563, + "grad_norm": 0.41015625, + "learning_rate": 4.943086880014158e-06, + "loss": 2.1228, + "step": 8948 + }, + { + "epoch": 0.48009656652360516, + "grad_norm": 0.45703125, + "learning_rate": 4.943068446747689e-06, + "loss": 2.3006, + "step": 8949 + }, + { + "epoch": 0.4801502145922747, + "grad_norm": 0.44140625, + "learning_rate": 4.94305001053096e-06, + "loss": 2.3132, + "step": 8950 + }, + { + "epoch": 0.4802038626609442, + "grad_norm": 0.421875, + "learning_rate": 4.943031571363996e-06, + "loss": 2.1231, + "step": 8951 + }, + { + "epoch": 0.48025751072961376, + "grad_norm": 0.40234375, + "learning_rate": 4.9430131292468165e-06, + "loss": 2.2159, + "step": 8952 + }, + { + "epoch": 0.4803111587982833, + "grad_norm": 0.53515625, + "learning_rate": 4.942994684179446e-06, + "loss": 2.5587, + "step": 8953 + }, + { + "epoch": 0.48036480686695276, + "grad_norm": 0.462890625, + "learning_rate": 4.942976236161906e-06, + "loss": 2.2238, + "step": 8954 + }, + { + "epoch": 0.4804184549356223, + "grad_norm": 0.52734375, + "learning_rate": 4.942957785194219e-06, + "loss": 2.2873, + "step": 8955 + }, + { + "epoch": 0.48047210300429183, + "grad_norm": 0.439453125, + "learning_rate": 4.9429393312764065e-06, + "loss": 2.455, + "step": 8956 + }, + { + "epoch": 0.48052575107296136, + "grad_norm": 0.33984375, + "learning_rate": 4.942920874408491e-06, + "loss": 2.0783, + "step": 8957 + }, + { + "epoch": 0.4805793991416309, + "grad_norm": 0.404296875, + "learning_rate": 4.942902414590496e-06, + "loss": 2.2488, + "step": 8958 + }, + { + "epoch": 0.4806330472103004, + "grad_norm": 0.6171875, + "learning_rate": 4.942883951822442e-06, + "loss": 2.1468, + "step": 8959 + }, + { + "epoch": 0.48068669527896996, + "grad_norm": 0.474609375, + "learning_rate": 4.942865486104353e-06, + "loss": 2.2575, + "step": 8960 + }, + { + "epoch": 0.4807403433476395, + "grad_norm": 0.37890625, + "learning_rate": 4.94284701743625e-06, + "loss": 2.0617, + "step": 8961 + }, + { + "epoch": 0.480793991416309, + "grad_norm": 0.44140625, + "learning_rate": 4.942828545818156e-06, + "loss": 2.3212, + "step": 8962 + }, + { + "epoch": 0.48084763948497855, + "grad_norm": 0.46875, + "learning_rate": 4.942810071250093e-06, + "loss": 2.5464, + "step": 8963 + }, + { + "epoch": 0.4809012875536481, + "grad_norm": 0.44921875, + "learning_rate": 4.942791593732084e-06, + "loss": 2.3122, + "step": 8964 + }, + { + "epoch": 0.4809549356223176, + "grad_norm": 0.486328125, + "learning_rate": 4.9427731132641495e-06, + "loss": 2.3914, + "step": 8965 + }, + { + "epoch": 0.48100858369098715, + "grad_norm": 0.5234375, + "learning_rate": 4.942754629846314e-06, + "loss": 1.3401, + "step": 8966 + }, + { + "epoch": 0.4810622317596567, + "grad_norm": 0.41796875, + "learning_rate": 4.942736143478598e-06, + "loss": 2.4228, + "step": 8967 + }, + { + "epoch": 0.48111587982832615, + "grad_norm": 0.45703125, + "learning_rate": 4.942717654161025e-06, + "loss": 2.3495, + "step": 8968 + }, + { + "epoch": 0.4811695278969957, + "grad_norm": 0.5, + "learning_rate": 4.942699161893618e-06, + "loss": 2.3206, + "step": 8969 + }, + { + "epoch": 0.4812231759656652, + "grad_norm": 0.369140625, + "learning_rate": 4.942680666676397e-06, + "loss": 2.3529, + "step": 8970 + }, + { + "epoch": 0.48127682403433475, + "grad_norm": 0.40625, + "learning_rate": 4.942662168509386e-06, + "loss": 2.1432, + "step": 8971 + }, + { + "epoch": 0.4813304721030043, + "grad_norm": 0.474609375, + "learning_rate": 4.942643667392607e-06, + "loss": 2.1974, + "step": 8972 + }, + { + "epoch": 0.4813841201716738, + "grad_norm": 0.60546875, + "learning_rate": 4.942625163326083e-06, + "loss": 2.5468, + "step": 8973 + }, + { + "epoch": 0.48143776824034334, + "grad_norm": 0.482421875, + "learning_rate": 4.942606656309835e-06, + "loss": 2.2405, + "step": 8974 + }, + { + "epoch": 0.4814914163090129, + "grad_norm": 0.453125, + "learning_rate": 4.942588146343886e-06, + "loss": 2.3571, + "step": 8975 + }, + { + "epoch": 0.4815450643776824, + "grad_norm": 0.51171875, + "learning_rate": 4.942569633428259e-06, + "loss": 2.343, + "step": 8976 + }, + { + "epoch": 0.48159871244635194, + "grad_norm": 0.3828125, + "learning_rate": 4.942551117562976e-06, + "loss": 2.3361, + "step": 8977 + }, + { + "epoch": 0.48165236051502147, + "grad_norm": 0.44140625, + "learning_rate": 4.942532598748058e-06, + "loss": 2.0347, + "step": 8978 + }, + { + "epoch": 0.481706008583691, + "grad_norm": 0.703125, + "learning_rate": 4.942514076983529e-06, + "loss": 2.4619, + "step": 8979 + }, + { + "epoch": 0.48175965665236054, + "grad_norm": 0.484375, + "learning_rate": 4.942495552269412e-06, + "loss": 2.3984, + "step": 8980 + }, + { + "epoch": 0.48181330472103007, + "grad_norm": 0.412109375, + "learning_rate": 4.942477024605726e-06, + "loss": 2.1396, + "step": 8981 + }, + { + "epoch": 0.4818669527896996, + "grad_norm": 0.400390625, + "learning_rate": 4.942458493992497e-06, + "loss": 2.2013, + "step": 8982 + }, + { + "epoch": 0.4819206008583691, + "grad_norm": 0.412109375, + "learning_rate": 4.942439960429746e-06, + "loss": 2.4223, + "step": 8983 + }, + { + "epoch": 0.4819742489270386, + "grad_norm": 0.40625, + "learning_rate": 4.942421423917495e-06, + "loss": 2.2706, + "step": 8984 + }, + { + "epoch": 0.48202789699570814, + "grad_norm": 0.419921875, + "learning_rate": 4.942402884455767e-06, + "loss": 2.3509, + "step": 8985 + }, + { + "epoch": 0.48208154506437767, + "grad_norm": 0.462890625, + "learning_rate": 4.942384342044584e-06, + "loss": 2.5542, + "step": 8986 + }, + { + "epoch": 0.4821351931330472, + "grad_norm": 0.451171875, + "learning_rate": 4.942365796683968e-06, + "loss": 2.2918, + "step": 8987 + }, + { + "epoch": 0.48218884120171673, + "grad_norm": 0.439453125, + "learning_rate": 4.942347248373942e-06, + "loss": 2.7007, + "step": 8988 + }, + { + "epoch": 0.48224248927038627, + "grad_norm": 0.400390625, + "learning_rate": 4.942328697114529e-06, + "loss": 2.1694, + "step": 8989 + }, + { + "epoch": 0.4822961373390558, + "grad_norm": 0.384765625, + "learning_rate": 4.942310142905751e-06, + "loss": 2.2759, + "step": 8990 + }, + { + "epoch": 0.48234978540772533, + "grad_norm": 1.2265625, + "learning_rate": 4.942291585747629e-06, + "loss": 2.3074, + "step": 8991 + }, + { + "epoch": 0.48240343347639486, + "grad_norm": 0.458984375, + "learning_rate": 4.942273025640187e-06, + "loss": 2.3264, + "step": 8992 + }, + { + "epoch": 0.4824570815450644, + "grad_norm": 0.466796875, + "learning_rate": 4.942254462583447e-06, + "loss": 2.4458, + "step": 8993 + }, + { + "epoch": 0.4825107296137339, + "grad_norm": 0.466796875, + "learning_rate": 4.9422358965774305e-06, + "loss": 2.4479, + "step": 8994 + }, + { + "epoch": 0.48256437768240346, + "grad_norm": 0.423828125, + "learning_rate": 4.9422173276221615e-06, + "loss": 2.2286, + "step": 8995 + }, + { + "epoch": 0.482618025751073, + "grad_norm": 0.4453125, + "learning_rate": 4.942198755717663e-06, + "loss": 1.8302, + "step": 8996 + }, + { + "epoch": 0.48267167381974246, + "grad_norm": 0.380859375, + "learning_rate": 4.9421801808639545e-06, + "loss": 2.1317, + "step": 8997 + }, + { + "epoch": 0.482725321888412, + "grad_norm": 0.419921875, + "learning_rate": 4.9421616030610605e-06, + "loss": 2.2209, + "step": 8998 + }, + { + "epoch": 0.4827789699570815, + "grad_norm": 0.408203125, + "learning_rate": 4.942143022309003e-06, + "loss": 2.647, + "step": 8999 + }, + { + "epoch": 0.48283261802575106, + "grad_norm": 0.421875, + "learning_rate": 4.942124438607804e-06, + "loss": 2.2264, + "step": 9000 + }, + { + "epoch": 0.4828862660944206, + "grad_norm": 0.447265625, + "learning_rate": 4.9421058519574876e-06, + "loss": 1.9805, + "step": 9001 + }, + { + "epoch": 0.4829399141630901, + "grad_norm": 0.51953125, + "learning_rate": 4.942087262358074e-06, + "loss": 1.4317, + "step": 9002 + }, + { + "epoch": 0.48299356223175965, + "grad_norm": 0.490234375, + "learning_rate": 4.942068669809587e-06, + "loss": 1.2435, + "step": 9003 + }, + { + "epoch": 0.4830472103004292, + "grad_norm": 0.400390625, + "learning_rate": 4.942050074312048e-06, + "loss": 2.3133, + "step": 9004 + }, + { + "epoch": 0.4831008583690987, + "grad_norm": 0.33984375, + "learning_rate": 4.942031475865481e-06, + "loss": 2.0476, + "step": 9005 + }, + { + "epoch": 0.48315450643776825, + "grad_norm": 0.48046875, + "learning_rate": 4.942012874469907e-06, + "loss": 1.8831, + "step": 9006 + }, + { + "epoch": 0.4832081545064378, + "grad_norm": 0.48046875, + "learning_rate": 4.94199427012535e-06, + "loss": 2.5907, + "step": 9007 + }, + { + "epoch": 0.4832618025751073, + "grad_norm": 0.419921875, + "learning_rate": 4.941975662831831e-06, + "loss": 2.3133, + "step": 9008 + }, + { + "epoch": 0.48331545064377684, + "grad_norm": 0.373046875, + "learning_rate": 4.941957052589373e-06, + "loss": 2.1545, + "step": 9009 + }, + { + "epoch": 0.4833690987124464, + "grad_norm": 0.3828125, + "learning_rate": 4.941938439397998e-06, + "loss": 2.1603, + "step": 9010 + }, + { + "epoch": 0.48342274678111585, + "grad_norm": 0.486328125, + "learning_rate": 4.941919823257731e-06, + "loss": 1.5484, + "step": 9011 + }, + { + "epoch": 0.4834763948497854, + "grad_norm": 0.443359375, + "learning_rate": 4.941901204168591e-06, + "loss": 1.4546, + "step": 9012 + }, + { + "epoch": 0.4835300429184549, + "grad_norm": 0.478515625, + "learning_rate": 4.941882582130601e-06, + "loss": 2.2087, + "step": 9013 + }, + { + "epoch": 0.48358369098712445, + "grad_norm": 0.40625, + "learning_rate": 4.941863957143786e-06, + "loss": 2.3684, + "step": 9014 + }, + { + "epoch": 0.483637339055794, + "grad_norm": 0.447265625, + "learning_rate": 4.941845329208166e-06, + "loss": 2.2454, + "step": 9015 + }, + { + "epoch": 0.4836909871244635, + "grad_norm": 0.47265625, + "learning_rate": 4.941826698323765e-06, + "loss": 2.4991, + "step": 9016 + }, + { + "epoch": 0.48374463519313304, + "grad_norm": 0.384765625, + "learning_rate": 4.941808064490605e-06, + "loss": 2.1394, + "step": 9017 + }, + { + "epoch": 0.4837982832618026, + "grad_norm": 0.435546875, + "learning_rate": 4.941789427708708e-06, + "loss": 2.3915, + "step": 9018 + }, + { + "epoch": 0.4838519313304721, + "grad_norm": 0.404296875, + "learning_rate": 4.941770787978098e-06, + "loss": 2.2707, + "step": 9019 + }, + { + "epoch": 0.48390557939914164, + "grad_norm": 0.494140625, + "learning_rate": 4.9417521452987945e-06, + "loss": 2.5184, + "step": 9020 + }, + { + "epoch": 0.48395922746781117, + "grad_norm": 0.451171875, + "learning_rate": 4.941733499670823e-06, + "loss": 2.5929, + "step": 9021 + }, + { + "epoch": 0.4840128755364807, + "grad_norm": 0.4609375, + "learning_rate": 4.9417148510942056e-06, + "loss": 2.268, + "step": 9022 + }, + { + "epoch": 0.48406652360515023, + "grad_norm": 0.43359375, + "learning_rate": 4.941696199568964e-06, + "loss": 2.0659, + "step": 9023 + }, + { + "epoch": 0.48412017167381977, + "grad_norm": 0.423828125, + "learning_rate": 4.941677545095121e-06, + "loss": 2.2459, + "step": 9024 + }, + { + "epoch": 0.4841738197424893, + "grad_norm": 0.40625, + "learning_rate": 4.9416588876726976e-06, + "loss": 2.1965, + "step": 9025 + }, + { + "epoch": 0.4842274678111588, + "grad_norm": 0.453125, + "learning_rate": 4.941640227301719e-06, + "loss": 2.4047, + "step": 9026 + }, + { + "epoch": 0.4842811158798283, + "grad_norm": 0.59375, + "learning_rate": 4.9416215639822065e-06, + "loss": 2.4748, + "step": 9027 + }, + { + "epoch": 0.48433476394849784, + "grad_norm": 0.396484375, + "learning_rate": 4.941602897714182e-06, + "loss": 2.129, + "step": 9028 + }, + { + "epoch": 0.48438841201716737, + "grad_norm": 0.47265625, + "learning_rate": 4.941584228497669e-06, + "loss": 2.2787, + "step": 9029 + }, + { + "epoch": 0.4844420600858369, + "grad_norm": 0.46875, + "learning_rate": 4.94156555633269e-06, + "loss": 2.2277, + "step": 9030 + }, + { + "epoch": 0.48449570815450643, + "grad_norm": 0.427734375, + "learning_rate": 4.9415468812192665e-06, + "loss": 2.4473, + "step": 9031 + }, + { + "epoch": 0.48454935622317596, + "grad_norm": 0.46875, + "learning_rate": 4.941528203157423e-06, + "loss": 2.5249, + "step": 9032 + }, + { + "epoch": 0.4846030042918455, + "grad_norm": 0.4140625, + "learning_rate": 4.94150952214718e-06, + "loss": 2.3381, + "step": 9033 + }, + { + "epoch": 0.48465665236051503, + "grad_norm": 0.423828125, + "learning_rate": 4.9414908381885614e-06, + "loss": 2.3178, + "step": 9034 + }, + { + "epoch": 0.48471030042918456, + "grad_norm": 0.412109375, + "learning_rate": 4.94147215128159e-06, + "loss": 2.2764, + "step": 9035 + }, + { + "epoch": 0.4847639484978541, + "grad_norm": 0.40625, + "learning_rate": 4.941453461426287e-06, + "loss": 2.1235, + "step": 9036 + }, + { + "epoch": 0.4848175965665236, + "grad_norm": 0.3828125, + "learning_rate": 4.941434768622675e-06, + "loss": 2.4506, + "step": 9037 + }, + { + "epoch": 0.48487124463519315, + "grad_norm": 0.443359375, + "learning_rate": 4.9414160728707775e-06, + "loss": 2.2647, + "step": 9038 + }, + { + "epoch": 0.4849248927038627, + "grad_norm": 0.421875, + "learning_rate": 4.941397374170618e-06, + "loss": 2.0963, + "step": 9039 + }, + { + "epoch": 0.48497854077253216, + "grad_norm": 0.4296875, + "learning_rate": 4.9413786725222165e-06, + "loss": 2.118, + "step": 9040 + }, + { + "epoch": 0.4850321888412017, + "grad_norm": 0.42578125, + "learning_rate": 4.941359967925598e-06, + "loss": 2.2155, + "step": 9041 + }, + { + "epoch": 0.4850858369098712, + "grad_norm": 0.46484375, + "learning_rate": 4.9413412603807834e-06, + "loss": 2.5365, + "step": 9042 + }, + { + "epoch": 0.48513948497854076, + "grad_norm": 0.37890625, + "learning_rate": 4.9413225498877955e-06, + "loss": 2.3933, + "step": 9043 + }, + { + "epoch": 0.4851931330472103, + "grad_norm": 0.37890625, + "learning_rate": 4.941303836446658e-06, + "loss": 2.1739, + "step": 9044 + }, + { + "epoch": 0.4852467811158798, + "grad_norm": 0.423828125, + "learning_rate": 4.941285120057392e-06, + "loss": 2.0412, + "step": 9045 + }, + { + "epoch": 0.48530042918454935, + "grad_norm": 0.412109375, + "learning_rate": 4.941266400720022e-06, + "loss": 2.3201, + "step": 9046 + }, + { + "epoch": 0.4853540772532189, + "grad_norm": 0.38671875, + "learning_rate": 4.941247678434569e-06, + "loss": 2.0026, + "step": 9047 + }, + { + "epoch": 0.4854077253218884, + "grad_norm": 0.51171875, + "learning_rate": 4.9412289532010565e-06, + "loss": 2.2587, + "step": 9048 + }, + { + "epoch": 0.48546137339055795, + "grad_norm": 0.3984375, + "learning_rate": 4.941210225019507e-06, + "loss": 1.8683, + "step": 9049 + }, + { + "epoch": 0.4855150214592275, + "grad_norm": 0.40625, + "learning_rate": 4.941191493889942e-06, + "loss": 2.0733, + "step": 9050 + }, + { + "epoch": 0.485568669527897, + "grad_norm": 0.76953125, + "learning_rate": 4.941172759812386e-06, + "loss": 2.4356, + "step": 9051 + }, + { + "epoch": 0.48562231759656654, + "grad_norm": 0.369140625, + "learning_rate": 4.94115402278686e-06, + "loss": 2.4685, + "step": 9052 + }, + { + "epoch": 0.4856759656652361, + "grad_norm": 0.439453125, + "learning_rate": 4.941135282813387e-06, + "loss": 2.4232, + "step": 9053 + }, + { + "epoch": 0.48572961373390555, + "grad_norm": 0.490234375, + "learning_rate": 4.94111653989199e-06, + "loss": 2.2338, + "step": 9054 + }, + { + "epoch": 0.4857832618025751, + "grad_norm": 0.455078125, + "learning_rate": 4.941097794022692e-06, + "loss": 2.2418, + "step": 9055 + }, + { + "epoch": 0.4858369098712446, + "grad_norm": 0.44140625, + "learning_rate": 4.941079045205515e-06, + "loss": 2.3614, + "step": 9056 + }, + { + "epoch": 0.48589055793991415, + "grad_norm": 0.419921875, + "learning_rate": 4.9410602934404816e-06, + "loss": 2.2997, + "step": 9057 + }, + { + "epoch": 0.4859442060085837, + "grad_norm": 0.8359375, + "learning_rate": 4.941041538727615e-06, + "loss": 2.393, + "step": 9058 + }, + { + "epoch": 0.4859978540772532, + "grad_norm": 0.3984375, + "learning_rate": 4.941022781066938e-06, + "loss": 1.7827, + "step": 9059 + }, + { + "epoch": 0.48605150214592274, + "grad_norm": 0.419921875, + "learning_rate": 4.941004020458471e-06, + "loss": 2.2135, + "step": 9060 + }, + { + "epoch": 0.4861051502145923, + "grad_norm": 0.458984375, + "learning_rate": 4.94098525690224e-06, + "loss": 2.1834, + "step": 9061 + }, + { + "epoch": 0.4861587982832618, + "grad_norm": 0.48828125, + "learning_rate": 4.9409664903982656e-06, + "loss": 2.2713, + "step": 9062 + }, + { + "epoch": 0.48621244635193134, + "grad_norm": 0.458984375, + "learning_rate": 4.94094772094657e-06, + "loss": 2.325, + "step": 9063 + }, + { + "epoch": 0.48626609442060087, + "grad_norm": 0.388671875, + "learning_rate": 4.940928948547178e-06, + "loss": 2.2112, + "step": 9064 + }, + { + "epoch": 0.4863197424892704, + "grad_norm": 0.373046875, + "learning_rate": 4.940910173200111e-06, + "loss": 2.2419, + "step": 9065 + }, + { + "epoch": 0.48637339055793993, + "grad_norm": 0.453125, + "learning_rate": 4.940891394905391e-06, + "loss": 1.5618, + "step": 9066 + }, + { + "epoch": 0.48642703862660946, + "grad_norm": 0.421875, + "learning_rate": 4.940872613663041e-06, + "loss": 2.3716, + "step": 9067 + }, + { + "epoch": 0.486480686695279, + "grad_norm": 0.37109375, + "learning_rate": 4.940853829473085e-06, + "loss": 2.4968, + "step": 9068 + }, + { + "epoch": 0.4865343347639485, + "grad_norm": 0.5, + "learning_rate": 4.940835042335545e-06, + "loss": 2.4202, + "step": 9069 + }, + { + "epoch": 0.486587982832618, + "grad_norm": 0.54296875, + "learning_rate": 4.940816252250443e-06, + "loss": 2.4224, + "step": 9070 + }, + { + "epoch": 0.48664163090128754, + "grad_norm": 0.458984375, + "learning_rate": 4.940797459217802e-06, + "loss": 2.1958, + "step": 9071 + }, + { + "epoch": 0.48669527896995707, + "grad_norm": 0.45703125, + "learning_rate": 4.940778663237646e-06, + "loss": 2.4109, + "step": 9072 + }, + { + "epoch": 0.4867489270386266, + "grad_norm": 0.46875, + "learning_rate": 4.940759864309995e-06, + "loss": 2.2996, + "step": 9073 + }, + { + "epoch": 0.48680257510729613, + "grad_norm": 0.515625, + "learning_rate": 4.9407410624348736e-06, + "loss": 1.2525, + "step": 9074 + }, + { + "epoch": 0.48685622317596566, + "grad_norm": 0.498046875, + "learning_rate": 4.940722257612305e-06, + "loss": 2.3855, + "step": 9075 + }, + { + "epoch": 0.4869098712446352, + "grad_norm": 0.357421875, + "learning_rate": 4.94070344984231e-06, + "loss": 2.0967, + "step": 9076 + }, + { + "epoch": 0.4869635193133047, + "grad_norm": 0.4921875, + "learning_rate": 4.940684639124913e-06, + "loss": 2.3836, + "step": 9077 + }, + { + "epoch": 0.48701716738197426, + "grad_norm": 0.5625, + "learning_rate": 4.940665825460136e-06, + "loss": 2.2682, + "step": 9078 + }, + { + "epoch": 0.4870708154506438, + "grad_norm": 0.40234375, + "learning_rate": 4.940647008848002e-06, + "loss": 1.7082, + "step": 9079 + }, + { + "epoch": 0.4871244635193133, + "grad_norm": 0.40625, + "learning_rate": 4.940628189288533e-06, + "loss": 2.3402, + "step": 9080 + }, + { + "epoch": 0.48717811158798285, + "grad_norm": 0.40234375, + "learning_rate": 4.940609366781752e-06, + "loss": 2.3104, + "step": 9081 + }, + { + "epoch": 0.4872317596566524, + "grad_norm": 0.357421875, + "learning_rate": 4.940590541327682e-06, + "loss": 2.2618, + "step": 9082 + }, + { + "epoch": 0.48728540772532186, + "grad_norm": 0.42578125, + "learning_rate": 4.940571712926346e-06, + "loss": 2.2245, + "step": 9083 + }, + { + "epoch": 0.4873390557939914, + "grad_norm": 0.40234375, + "learning_rate": 4.940552881577767e-06, + "loss": 2.2753, + "step": 9084 + }, + { + "epoch": 0.4873927038626609, + "grad_norm": 0.40625, + "learning_rate": 4.940534047281967e-06, + "loss": 2.5116, + "step": 9085 + }, + { + "epoch": 0.48744635193133046, + "grad_norm": 0.455078125, + "learning_rate": 4.940515210038969e-06, + "loss": 2.4417, + "step": 9086 + }, + { + "epoch": 0.4875, + "grad_norm": 0.5546875, + "learning_rate": 4.940496369848795e-06, + "loss": 2.3329, + "step": 9087 + }, + { + "epoch": 0.4875536480686695, + "grad_norm": 0.5390625, + "learning_rate": 4.940477526711469e-06, + "loss": 2.0926, + "step": 9088 + }, + { + "epoch": 0.48760729613733905, + "grad_norm": 0.353515625, + "learning_rate": 4.940458680627013e-06, + "loss": 1.809, + "step": 9089 + }, + { + "epoch": 0.4876609442060086, + "grad_norm": 0.5, + "learning_rate": 4.94043983159545e-06, + "loss": 2.2553, + "step": 9090 + }, + { + "epoch": 0.4877145922746781, + "grad_norm": 0.4296875, + "learning_rate": 4.940420979616804e-06, + "loss": 2.5123, + "step": 9091 + }, + { + "epoch": 0.48776824034334765, + "grad_norm": 0.486328125, + "learning_rate": 4.940402124691094e-06, + "loss": 1.5021, + "step": 9092 + }, + { + "epoch": 0.4878218884120172, + "grad_norm": 0.4921875, + "learning_rate": 4.940383266818347e-06, + "loss": 2.3391, + "step": 9093 + }, + { + "epoch": 0.4878755364806867, + "grad_norm": 0.34375, + "learning_rate": 4.940364405998584e-06, + "loss": 2.1334, + "step": 9094 + }, + { + "epoch": 0.48792918454935624, + "grad_norm": 0.39453125, + "learning_rate": 4.940345542231826e-06, + "loss": 2.2452, + "step": 9095 + }, + { + "epoch": 0.4879828326180258, + "grad_norm": 0.5078125, + "learning_rate": 4.940326675518099e-06, + "loss": 2.3892, + "step": 9096 + }, + { + "epoch": 0.4880364806866953, + "grad_norm": 0.419921875, + "learning_rate": 4.9403078058574245e-06, + "loss": 2.2521, + "step": 9097 + }, + { + "epoch": 0.4880901287553648, + "grad_norm": 0.36328125, + "learning_rate": 4.940288933249825e-06, + "loss": 2.0879, + "step": 9098 + }, + { + "epoch": 0.4881437768240343, + "grad_norm": 0.3828125, + "learning_rate": 4.9402700576953235e-06, + "loss": 2.1122, + "step": 9099 + }, + { + "epoch": 0.48819742489270385, + "grad_norm": 0.423828125, + "learning_rate": 4.940251179193943e-06, + "loss": 2.2136, + "step": 9100 + }, + { + "epoch": 0.4882510729613734, + "grad_norm": 0.44140625, + "learning_rate": 4.940232297745705e-06, + "loss": 2.1626, + "step": 9101 + }, + { + "epoch": 0.4883047210300429, + "grad_norm": 0.423828125, + "learning_rate": 4.940213413350634e-06, + "loss": 2.3719, + "step": 9102 + }, + { + "epoch": 0.48835836909871244, + "grad_norm": 0.47265625, + "learning_rate": 4.940194526008751e-06, + "loss": 2.4061, + "step": 9103 + }, + { + "epoch": 0.488412017167382, + "grad_norm": 0.8671875, + "learning_rate": 4.940175635720082e-06, + "loss": 2.4483, + "step": 9104 + }, + { + "epoch": 0.4884656652360515, + "grad_norm": 0.431640625, + "learning_rate": 4.9401567424846466e-06, + "loss": 2.3247, + "step": 9105 + }, + { + "epoch": 0.48851931330472104, + "grad_norm": 0.41796875, + "learning_rate": 4.9401378463024685e-06, + "loss": 1.9428, + "step": 9106 + }, + { + "epoch": 0.48857296137339057, + "grad_norm": 0.427734375, + "learning_rate": 4.940118947173571e-06, + "loss": 2.4197, + "step": 9107 + }, + { + "epoch": 0.4886266094420601, + "grad_norm": 0.3828125, + "learning_rate": 4.9401000450979776e-06, + "loss": 2.3353, + "step": 9108 + }, + { + "epoch": 0.48868025751072963, + "grad_norm": 0.408203125, + "learning_rate": 4.94008114007571e-06, + "loss": 2.4205, + "step": 9109 + }, + { + "epoch": 0.48873390557939916, + "grad_norm": 0.46875, + "learning_rate": 4.94006223210679e-06, + "loss": 1.6493, + "step": 9110 + }, + { + "epoch": 0.4887875536480687, + "grad_norm": 0.40625, + "learning_rate": 4.940043321191242e-06, + "loss": 2.1291, + "step": 9111 + }, + { + "epoch": 0.48884120171673817, + "grad_norm": 0.421875, + "learning_rate": 4.9400244073290895e-06, + "loss": 2.2951, + "step": 9112 + }, + { + "epoch": 0.4888948497854077, + "grad_norm": 0.359375, + "learning_rate": 4.940005490520354e-06, + "loss": 2.1166, + "step": 9113 + }, + { + "epoch": 0.48894849785407724, + "grad_norm": 0.39453125, + "learning_rate": 4.939986570765059e-06, + "loss": 2.2433, + "step": 9114 + }, + { + "epoch": 0.48900214592274677, + "grad_norm": 0.390625, + "learning_rate": 4.9399676480632264e-06, + "loss": 2.2647, + "step": 9115 + }, + { + "epoch": 0.4890557939914163, + "grad_norm": 0.55859375, + "learning_rate": 4.93994872241488e-06, + "loss": 2.1868, + "step": 9116 + }, + { + "epoch": 0.48910944206008583, + "grad_norm": 0.435546875, + "learning_rate": 4.939929793820043e-06, + "loss": 2.1864, + "step": 9117 + }, + { + "epoch": 0.48916309012875536, + "grad_norm": 0.390625, + "learning_rate": 4.939910862278737e-06, + "loss": 2.3645, + "step": 9118 + }, + { + "epoch": 0.4892167381974249, + "grad_norm": 0.423828125, + "learning_rate": 4.939891927790985e-06, + "loss": 2.3276, + "step": 9119 + }, + { + "epoch": 0.4892703862660944, + "grad_norm": 0.4375, + "learning_rate": 4.939872990356811e-06, + "loss": 2.258, + "step": 9120 + }, + { + "epoch": 0.48932403433476396, + "grad_norm": 0.54296875, + "learning_rate": 4.939854049976237e-06, + "loss": 2.3251, + "step": 9121 + }, + { + "epoch": 0.4893776824034335, + "grad_norm": 0.470703125, + "learning_rate": 4.939835106649287e-06, + "loss": 2.5022, + "step": 9122 + }, + { + "epoch": 0.489431330472103, + "grad_norm": 0.458984375, + "learning_rate": 4.939816160375982e-06, + "loss": 2.3052, + "step": 9123 + }, + { + "epoch": 0.48948497854077255, + "grad_norm": 0.392578125, + "learning_rate": 4.939797211156346e-06, + "loss": 1.9992, + "step": 9124 + }, + { + "epoch": 0.4895386266094421, + "grad_norm": 0.3515625, + "learning_rate": 4.939778258990402e-06, + "loss": 2.0072, + "step": 9125 + }, + { + "epoch": 0.48959227467811156, + "grad_norm": 0.443359375, + "learning_rate": 4.939759303878172e-06, + "loss": 2.2711, + "step": 9126 + }, + { + "epoch": 0.4896459227467811, + "grad_norm": 0.4296875, + "learning_rate": 4.939740345819681e-06, + "loss": 2.2658, + "step": 9127 + }, + { + "epoch": 0.4896995708154506, + "grad_norm": 0.451171875, + "learning_rate": 4.93972138481495e-06, + "loss": 2.0728, + "step": 9128 + }, + { + "epoch": 0.48975321888412016, + "grad_norm": 0.4140625, + "learning_rate": 4.939702420864001e-06, + "loss": 2.1175, + "step": 9129 + }, + { + "epoch": 0.4898068669527897, + "grad_norm": 0.39453125, + "learning_rate": 4.939683453966859e-06, + "loss": 2.3942, + "step": 9130 + }, + { + "epoch": 0.4898605150214592, + "grad_norm": 0.423828125, + "learning_rate": 4.939664484123546e-06, + "loss": 2.4857, + "step": 9131 + }, + { + "epoch": 0.48991416309012875, + "grad_norm": 0.375, + "learning_rate": 4.939645511334086e-06, + "loss": 2.258, + "step": 9132 + }, + { + "epoch": 0.4899678111587983, + "grad_norm": 0.49609375, + "learning_rate": 4.939626535598499e-06, + "loss": 2.4087, + "step": 9133 + }, + { + "epoch": 0.4900214592274678, + "grad_norm": 0.4296875, + "learning_rate": 4.9396075569168115e-06, + "loss": 2.2031, + "step": 9134 + }, + { + "epoch": 0.49007510729613735, + "grad_norm": 0.43359375, + "learning_rate": 4.939588575289044e-06, + "loss": 2.361, + "step": 9135 + }, + { + "epoch": 0.4901287553648069, + "grad_norm": 0.578125, + "learning_rate": 4.93956959071522e-06, + "loss": 2.4059, + "step": 9136 + }, + { + "epoch": 0.4901824034334764, + "grad_norm": 0.478515625, + "learning_rate": 4.939550603195364e-06, + "loss": 2.4272, + "step": 9137 + }, + { + "epoch": 0.49023605150214594, + "grad_norm": 0.390625, + "learning_rate": 4.939531612729496e-06, + "loss": 2.1165, + "step": 9138 + }, + { + "epoch": 0.4902896995708155, + "grad_norm": 0.439453125, + "learning_rate": 4.939512619317642e-06, + "loss": 2.2218, + "step": 9139 + }, + { + "epoch": 0.490343347639485, + "grad_norm": 0.4609375, + "learning_rate": 4.939493622959821e-06, + "loss": 2.1532, + "step": 9140 + }, + { + "epoch": 0.4903969957081545, + "grad_norm": 0.4296875, + "learning_rate": 4.93947462365606e-06, + "loss": 2.4366, + "step": 9141 + }, + { + "epoch": 0.490450643776824, + "grad_norm": 0.423828125, + "learning_rate": 4.939455621406381e-06, + "loss": 2.2713, + "step": 9142 + }, + { + "epoch": 0.49050429184549355, + "grad_norm": 0.392578125, + "learning_rate": 4.939436616210804e-06, + "loss": 2.2055, + "step": 9143 + }, + { + "epoch": 0.4905579399141631, + "grad_norm": 0.4453125, + "learning_rate": 4.939417608069356e-06, + "loss": 2.6301, + "step": 9144 + }, + { + "epoch": 0.4906115879828326, + "grad_norm": 0.69140625, + "learning_rate": 4.939398596982058e-06, + "loss": 2.367, + "step": 9145 + }, + { + "epoch": 0.49066523605150214, + "grad_norm": 0.478515625, + "learning_rate": 4.939379582948933e-06, + "loss": 2.2719, + "step": 9146 + }, + { + "epoch": 0.4907188841201717, + "grad_norm": 0.37890625, + "learning_rate": 4.939360565970004e-06, + "loss": 2.1491, + "step": 9147 + }, + { + "epoch": 0.4907725321888412, + "grad_norm": 0.392578125, + "learning_rate": 4.939341546045293e-06, + "loss": 2.2633, + "step": 9148 + }, + { + "epoch": 0.49082618025751074, + "grad_norm": 0.474609375, + "learning_rate": 4.939322523174825e-06, + "loss": 2.3635, + "step": 9149 + }, + { + "epoch": 0.49087982832618027, + "grad_norm": 0.546875, + "learning_rate": 4.9393034973586225e-06, + "loss": 1.761, + "step": 9150 + }, + { + "epoch": 0.4909334763948498, + "grad_norm": 0.4375, + "learning_rate": 4.939284468596707e-06, + "loss": 2.1599, + "step": 9151 + }, + { + "epoch": 0.49098712446351933, + "grad_norm": 0.412109375, + "learning_rate": 4.939265436889103e-06, + "loss": 2.0407, + "step": 9152 + }, + { + "epoch": 0.49104077253218886, + "grad_norm": 0.423828125, + "learning_rate": 4.939246402235833e-06, + "loss": 2.2541, + "step": 9153 + }, + { + "epoch": 0.4910944206008584, + "grad_norm": 0.373046875, + "learning_rate": 4.9392273646369195e-06, + "loss": 2.2707, + "step": 9154 + }, + { + "epoch": 0.49114806866952787, + "grad_norm": 0.48828125, + "learning_rate": 4.939208324092386e-06, + "loss": 2.4417, + "step": 9155 + }, + { + "epoch": 0.4912017167381974, + "grad_norm": 0.46875, + "learning_rate": 4.939189280602256e-06, + "loss": 2.3158, + "step": 9156 + }, + { + "epoch": 0.49125536480686693, + "grad_norm": 0.4140625, + "learning_rate": 4.939170234166551e-06, + "loss": 1.9382, + "step": 9157 + }, + { + "epoch": 0.49130901287553647, + "grad_norm": 0.41015625, + "learning_rate": 4.9391511847852944e-06, + "loss": 2.0629, + "step": 9158 + }, + { + "epoch": 0.491362660944206, + "grad_norm": 0.59375, + "learning_rate": 4.939132132458511e-06, + "loss": 2.1881, + "step": 9159 + }, + { + "epoch": 0.49141630901287553, + "grad_norm": 0.42578125, + "learning_rate": 4.939113077186222e-06, + "loss": 2.2356, + "step": 9160 + }, + { + "epoch": 0.49146995708154506, + "grad_norm": 0.427734375, + "learning_rate": 4.939094018968451e-06, + "loss": 2.2533, + "step": 9161 + }, + { + "epoch": 0.4915236051502146, + "grad_norm": 0.474609375, + "learning_rate": 4.939074957805221e-06, + "loss": 2.0966, + "step": 9162 + }, + { + "epoch": 0.4915772532188841, + "grad_norm": 0.51171875, + "learning_rate": 4.939055893696554e-06, + "loss": 2.0717, + "step": 9163 + }, + { + "epoch": 0.49163090128755366, + "grad_norm": 0.404296875, + "learning_rate": 4.9390368266424745e-06, + "loss": 2.3147, + "step": 9164 + }, + { + "epoch": 0.4916845493562232, + "grad_norm": 0.474609375, + "learning_rate": 4.939017756643006e-06, + "loss": 2.324, + "step": 9165 + }, + { + "epoch": 0.4917381974248927, + "grad_norm": 0.5390625, + "learning_rate": 4.9389986836981695e-06, + "loss": 2.3451, + "step": 9166 + }, + { + "epoch": 0.49179184549356225, + "grad_norm": 0.4765625, + "learning_rate": 4.93897960780799e-06, + "loss": 2.2775, + "step": 9167 + }, + { + "epoch": 0.4918454935622318, + "grad_norm": 0.455078125, + "learning_rate": 4.938960528972488e-06, + "loss": 2.1526, + "step": 9168 + }, + { + "epoch": 0.49189914163090126, + "grad_norm": 0.466796875, + "learning_rate": 4.938941447191689e-06, + "loss": 2.103, + "step": 9169 + }, + { + "epoch": 0.4919527896995708, + "grad_norm": 0.447265625, + "learning_rate": 4.9389223624656156e-06, + "loss": 2.3087, + "step": 9170 + }, + { + "epoch": 0.4920064377682403, + "grad_norm": 0.482421875, + "learning_rate": 4.938903274794289e-06, + "loss": 2.4381, + "step": 9171 + }, + { + "epoch": 0.49206008583690986, + "grad_norm": 0.41015625, + "learning_rate": 4.938884184177735e-06, + "loss": 2.2061, + "step": 9172 + }, + { + "epoch": 0.4921137339055794, + "grad_norm": 0.396484375, + "learning_rate": 4.9388650906159755e-06, + "loss": 2.2494, + "step": 9173 + }, + { + "epoch": 0.4921673819742489, + "grad_norm": 0.53515625, + "learning_rate": 4.9388459941090324e-06, + "loss": 2.364, + "step": 9174 + }, + { + "epoch": 0.49222103004291845, + "grad_norm": 0.484375, + "learning_rate": 4.93882689465693e-06, + "loss": 1.7887, + "step": 9175 + }, + { + "epoch": 0.492274678111588, + "grad_norm": 0.380859375, + "learning_rate": 4.938807792259691e-06, + "loss": 2.1669, + "step": 9176 + }, + { + "epoch": 0.4923283261802575, + "grad_norm": 0.376953125, + "learning_rate": 4.93878868691734e-06, + "loss": 2.2465, + "step": 9177 + }, + { + "epoch": 0.49238197424892705, + "grad_norm": 0.470703125, + "learning_rate": 4.9387695786298965e-06, + "loss": 2.1988, + "step": 9178 + }, + { + "epoch": 0.4924356223175966, + "grad_norm": 0.44921875, + "learning_rate": 4.938750467397387e-06, + "loss": 2.4354, + "step": 9179 + }, + { + "epoch": 0.4924892703862661, + "grad_norm": 0.322265625, + "learning_rate": 4.9387313532198325e-06, + "loss": 2.4568, + "step": 9180 + }, + { + "epoch": 0.49254291845493564, + "grad_norm": 0.462890625, + "learning_rate": 4.938712236097258e-06, + "loss": 2.0834, + "step": 9181 + }, + { + "epoch": 0.4925965665236052, + "grad_norm": 0.400390625, + "learning_rate": 4.938693116029685e-06, + "loss": 2.1001, + "step": 9182 + }, + { + "epoch": 0.4926502145922747, + "grad_norm": 0.40234375, + "learning_rate": 4.938673993017137e-06, + "loss": 2.5445, + "step": 9183 + }, + { + "epoch": 0.4927038626609442, + "grad_norm": 0.5703125, + "learning_rate": 4.938654867059637e-06, + "loss": 1.2219, + "step": 9184 + }, + { + "epoch": 0.4927575107296137, + "grad_norm": 0.384765625, + "learning_rate": 4.938635738157208e-06, + "loss": 2.3691, + "step": 9185 + }, + { + "epoch": 0.49281115879828324, + "grad_norm": 0.4453125, + "learning_rate": 4.938616606309873e-06, + "loss": 2.4446, + "step": 9186 + }, + { + "epoch": 0.4928648068669528, + "grad_norm": 0.4921875, + "learning_rate": 4.938597471517657e-06, + "loss": 2.1995, + "step": 9187 + }, + { + "epoch": 0.4929184549356223, + "grad_norm": 0.435546875, + "learning_rate": 4.9385783337805795e-06, + "loss": 2.0774, + "step": 9188 + }, + { + "epoch": 0.49297210300429184, + "grad_norm": 0.4453125, + "learning_rate": 4.938559193098668e-06, + "loss": 2.5899, + "step": 9189 + }, + { + "epoch": 0.49302575107296137, + "grad_norm": 0.486328125, + "learning_rate": 4.9385400494719415e-06, + "loss": 2.2868, + "step": 9190 + }, + { + "epoch": 0.4930793991416309, + "grad_norm": 0.365234375, + "learning_rate": 4.938520902900426e-06, + "loss": 2.0357, + "step": 9191 + }, + { + "epoch": 0.49313304721030043, + "grad_norm": 0.51953125, + "learning_rate": 4.938501753384142e-06, + "loss": 2.386, + "step": 9192 + }, + { + "epoch": 0.49318669527896997, + "grad_norm": 0.439453125, + "learning_rate": 4.938482600923116e-06, + "loss": 2.4142, + "step": 9193 + }, + { + "epoch": 0.4932403433476395, + "grad_norm": 0.4140625, + "learning_rate": 4.938463445517367e-06, + "loss": 2.2922, + "step": 9194 + }, + { + "epoch": 0.49329399141630903, + "grad_norm": 0.462890625, + "learning_rate": 4.9384442871669225e-06, + "loss": 2.3264, + "step": 9195 + }, + { + "epoch": 0.49334763948497856, + "grad_norm": 0.345703125, + "learning_rate": 4.938425125871803e-06, + "loss": 2.1848, + "step": 9196 + }, + { + "epoch": 0.4934012875536481, + "grad_norm": 0.357421875, + "learning_rate": 4.938405961632032e-06, + "loss": 2.0879, + "step": 9197 + }, + { + "epoch": 0.49345493562231757, + "grad_norm": 0.40234375, + "learning_rate": 4.9383867944476325e-06, + "loss": 2.2541, + "step": 9198 + }, + { + "epoch": 0.4935085836909871, + "grad_norm": 0.392578125, + "learning_rate": 4.938367624318629e-06, + "loss": 2.2006, + "step": 9199 + }, + { + "epoch": 0.49356223175965663, + "grad_norm": 0.416015625, + "learning_rate": 4.938348451245043e-06, + "loss": 2.1995, + "step": 9200 + }, + { + "epoch": 0.49361587982832617, + "grad_norm": 0.462890625, + "learning_rate": 4.9383292752268975e-06, + "loss": 2.3616, + "step": 9201 + }, + { + "epoch": 0.4936695278969957, + "grad_norm": 0.4375, + "learning_rate": 4.9383100962642175e-06, + "loss": 2.2661, + "step": 9202 + }, + { + "epoch": 0.49372317596566523, + "grad_norm": 0.546875, + "learning_rate": 4.938290914357025e-06, + "loss": 2.341, + "step": 9203 + }, + { + "epoch": 0.49377682403433476, + "grad_norm": 0.486328125, + "learning_rate": 4.9382717295053425e-06, + "loss": 1.5136, + "step": 9204 + }, + { + "epoch": 0.4938304721030043, + "grad_norm": 0.392578125, + "learning_rate": 4.938252541709194e-06, + "loss": 2.2483, + "step": 9205 + }, + { + "epoch": 0.4938841201716738, + "grad_norm": 0.45703125, + "learning_rate": 4.938233350968603e-06, + "loss": 1.9121, + "step": 9206 + }, + { + "epoch": 0.49393776824034336, + "grad_norm": 0.3984375, + "learning_rate": 4.9382141572835926e-06, + "loss": 2.113, + "step": 9207 + }, + { + "epoch": 0.4939914163090129, + "grad_norm": 0.390625, + "learning_rate": 4.9381949606541855e-06, + "loss": 2.2727, + "step": 9208 + }, + { + "epoch": 0.4940450643776824, + "grad_norm": 0.455078125, + "learning_rate": 4.938175761080405e-06, + "loss": 2.1658, + "step": 9209 + }, + { + "epoch": 0.49409871244635195, + "grad_norm": 0.40234375, + "learning_rate": 4.938156558562274e-06, + "loss": 2.2549, + "step": 9210 + }, + { + "epoch": 0.4941523605150215, + "grad_norm": 0.53515625, + "learning_rate": 4.938137353099816e-06, + "loss": 2.356, + "step": 9211 + }, + { + "epoch": 0.494206008583691, + "grad_norm": 0.3671875, + "learning_rate": 4.938118144693054e-06, + "loss": 2.2668, + "step": 9212 + }, + { + "epoch": 0.4942596566523605, + "grad_norm": 0.5859375, + "learning_rate": 4.938098933342011e-06, + "loss": 2.3309, + "step": 9213 + }, + { + "epoch": 0.49431330472103, + "grad_norm": 0.453125, + "learning_rate": 4.938079719046712e-06, + "loss": 2.2002, + "step": 9214 + }, + { + "epoch": 0.49436695278969955, + "grad_norm": 0.451171875, + "learning_rate": 4.938060501807178e-06, + "loss": 2.3143, + "step": 9215 + }, + { + "epoch": 0.4944206008583691, + "grad_norm": 0.404296875, + "learning_rate": 4.938041281623433e-06, + "loss": 2.1065, + "step": 9216 + }, + { + "epoch": 0.4944742489270386, + "grad_norm": 0.41015625, + "learning_rate": 4.938022058495501e-06, + "loss": 1.9861, + "step": 9217 + }, + { + "epoch": 0.49452789699570815, + "grad_norm": 0.51953125, + "learning_rate": 4.938002832423404e-06, + "loss": 2.384, + "step": 9218 + }, + { + "epoch": 0.4945815450643777, + "grad_norm": 0.384765625, + "learning_rate": 4.937983603407165e-06, + "loss": 2.2399, + "step": 9219 + }, + { + "epoch": 0.4946351931330472, + "grad_norm": 0.48046875, + "learning_rate": 4.937964371446809e-06, + "loss": 2.5272, + "step": 9220 + }, + { + "epoch": 0.49468884120171674, + "grad_norm": 0.400390625, + "learning_rate": 4.937945136542357e-06, + "loss": 2.5807, + "step": 9221 + }, + { + "epoch": 0.4947424892703863, + "grad_norm": 0.4453125, + "learning_rate": 4.937925898693833e-06, + "loss": 2.3669, + "step": 9222 + }, + { + "epoch": 0.4947961373390558, + "grad_norm": 0.4453125, + "learning_rate": 4.937906657901261e-06, + "loss": 2.2322, + "step": 9223 + }, + { + "epoch": 0.49484978540772534, + "grad_norm": 0.375, + "learning_rate": 4.937887414164665e-06, + "loss": 2.3571, + "step": 9224 + }, + { + "epoch": 0.49490343347639487, + "grad_norm": 0.404296875, + "learning_rate": 4.937868167484066e-06, + "loss": 2.5793, + "step": 9225 + }, + { + "epoch": 0.4949570815450644, + "grad_norm": 0.45703125, + "learning_rate": 4.937848917859488e-06, + "loss": 2.2253, + "step": 9226 + }, + { + "epoch": 0.4950107296137339, + "grad_norm": 0.478515625, + "learning_rate": 4.937829665290955e-06, + "loss": 2.2446, + "step": 9227 + }, + { + "epoch": 0.4950643776824034, + "grad_norm": 0.5234375, + "learning_rate": 4.937810409778489e-06, + "loss": 2.0009, + "step": 9228 + }, + { + "epoch": 0.49511802575107294, + "grad_norm": 0.41015625, + "learning_rate": 4.937791151322114e-06, + "loss": 2.2268, + "step": 9229 + }, + { + "epoch": 0.4951716738197425, + "grad_norm": 1.0703125, + "learning_rate": 4.937771889921854e-06, + "loss": 2.3363, + "step": 9230 + }, + { + "epoch": 0.495225321888412, + "grad_norm": 0.3828125, + "learning_rate": 4.937752625577732e-06, + "loss": 1.9939, + "step": 9231 + }, + { + "epoch": 0.49527896995708154, + "grad_norm": 0.427734375, + "learning_rate": 4.93773335828977e-06, + "loss": 2.5528, + "step": 9232 + }, + { + "epoch": 0.49533261802575107, + "grad_norm": 0.419921875, + "learning_rate": 4.937714088057992e-06, + "loss": 1.5654, + "step": 9233 + }, + { + "epoch": 0.4953862660944206, + "grad_norm": 0.412109375, + "learning_rate": 4.937694814882422e-06, + "loss": 2.3304, + "step": 9234 + }, + { + "epoch": 0.49543991416309013, + "grad_norm": 0.408203125, + "learning_rate": 4.937675538763082e-06, + "loss": 2.3441, + "step": 9235 + }, + { + "epoch": 0.49549356223175967, + "grad_norm": 0.4609375, + "learning_rate": 4.937656259699996e-06, + "loss": 2.1544, + "step": 9236 + }, + { + "epoch": 0.4955472103004292, + "grad_norm": 0.384765625, + "learning_rate": 4.937636977693188e-06, + "loss": 1.9372, + "step": 9237 + }, + { + "epoch": 0.49560085836909873, + "grad_norm": 0.4296875, + "learning_rate": 4.937617692742679e-06, + "loss": 2.1547, + "step": 9238 + }, + { + "epoch": 0.49565450643776826, + "grad_norm": 0.384765625, + "learning_rate": 4.937598404848495e-06, + "loss": 2.2147, + "step": 9239 + }, + { + "epoch": 0.4957081545064378, + "grad_norm": 0.43359375, + "learning_rate": 4.937579114010657e-06, + "loss": 2.1974, + "step": 9240 + }, + { + "epoch": 0.49576180257510727, + "grad_norm": 0.361328125, + "learning_rate": 4.93755982022919e-06, + "loss": 2.1274, + "step": 9241 + }, + { + "epoch": 0.4958154506437768, + "grad_norm": 1.1484375, + "learning_rate": 4.937540523504116e-06, + "loss": 1.4562, + "step": 9242 + }, + { + "epoch": 0.49586909871244633, + "grad_norm": 0.51953125, + "learning_rate": 4.937521223835459e-06, + "loss": 2.446, + "step": 9243 + }, + { + "epoch": 0.49592274678111586, + "grad_norm": 0.58203125, + "learning_rate": 4.9375019212232426e-06, + "loss": 1.9685, + "step": 9244 + }, + { + "epoch": 0.4959763948497854, + "grad_norm": 0.37109375, + "learning_rate": 4.93748261566749e-06, + "loss": 2.018, + "step": 9245 + }, + { + "epoch": 0.4960300429184549, + "grad_norm": 0.50390625, + "learning_rate": 4.937463307168224e-06, + "loss": 2.354, + "step": 9246 + }, + { + "epoch": 0.49608369098712446, + "grad_norm": 0.4921875, + "learning_rate": 4.9374439957254684e-06, + "loss": 2.4677, + "step": 9247 + }, + { + "epoch": 0.496137339055794, + "grad_norm": 0.39453125, + "learning_rate": 4.937424681339246e-06, + "loss": 2.2487, + "step": 9248 + }, + { + "epoch": 0.4961909871244635, + "grad_norm": 0.443359375, + "learning_rate": 4.93740536400958e-06, + "loss": 2.2739, + "step": 9249 + }, + { + "epoch": 0.49624463519313305, + "grad_norm": 0.4453125, + "learning_rate": 4.937386043736495e-06, + "loss": 2.0443, + "step": 9250 + }, + { + "epoch": 0.4962982832618026, + "grad_norm": 0.427734375, + "learning_rate": 4.937366720520013e-06, + "loss": 2.4192, + "step": 9251 + }, + { + "epoch": 0.4963519313304721, + "grad_norm": 0.462890625, + "learning_rate": 4.937347394360158e-06, + "loss": 2.1999, + "step": 9252 + }, + { + "epoch": 0.49640557939914165, + "grad_norm": 0.4296875, + "learning_rate": 4.937328065256953e-06, + "loss": 2.2163, + "step": 9253 + }, + { + "epoch": 0.4964592274678112, + "grad_norm": 0.74609375, + "learning_rate": 4.937308733210421e-06, + "loss": 2.2796, + "step": 9254 + }, + { + "epoch": 0.4965128755364807, + "grad_norm": 0.486328125, + "learning_rate": 4.937289398220587e-06, + "loss": 2.2428, + "step": 9255 + }, + { + "epoch": 0.4965665236051502, + "grad_norm": 0.44140625, + "learning_rate": 4.937270060287472e-06, + "loss": 2.2205, + "step": 9256 + }, + { + "epoch": 0.4966201716738197, + "grad_norm": 0.408203125, + "learning_rate": 4.937250719411101e-06, + "loss": 2.301, + "step": 9257 + }, + { + "epoch": 0.49667381974248925, + "grad_norm": 0.328125, + "learning_rate": 4.937231375591497e-06, + "loss": 2.3022, + "step": 9258 + }, + { + "epoch": 0.4967274678111588, + "grad_norm": 0.46484375, + "learning_rate": 4.937212028828684e-06, + "loss": 2.0985, + "step": 9259 + }, + { + "epoch": 0.4967811158798283, + "grad_norm": 0.37890625, + "learning_rate": 4.9371926791226836e-06, + "loss": 1.8344, + "step": 9260 + }, + { + "epoch": 0.49683476394849785, + "grad_norm": 0.54296875, + "learning_rate": 4.93717332647352e-06, + "loss": 2.0253, + "step": 9261 + }, + { + "epoch": 0.4968884120171674, + "grad_norm": 0.462890625, + "learning_rate": 4.937153970881218e-06, + "loss": 2.3085, + "step": 9262 + }, + { + "epoch": 0.4969420600858369, + "grad_norm": 0.427734375, + "learning_rate": 4.937134612345799e-06, + "loss": 2.3733, + "step": 9263 + }, + { + "epoch": 0.49699570815450644, + "grad_norm": 0.443359375, + "learning_rate": 4.937115250867286e-06, + "loss": 2.4118, + "step": 9264 + }, + { + "epoch": 0.497049356223176, + "grad_norm": 0.421875, + "learning_rate": 4.937095886445706e-06, + "loss": 2.4625, + "step": 9265 + }, + { + "epoch": 0.4971030042918455, + "grad_norm": 0.50390625, + "learning_rate": 4.937076519081078e-06, + "loss": 2.4592, + "step": 9266 + }, + { + "epoch": 0.49715665236051504, + "grad_norm": 0.400390625, + "learning_rate": 4.937057148773427e-06, + "loss": 2.2343, + "step": 9267 + }, + { + "epoch": 0.49721030042918457, + "grad_norm": 0.51171875, + "learning_rate": 4.9370377755227775e-06, + "loss": 2.4649, + "step": 9268 + }, + { + "epoch": 0.4972639484978541, + "grad_norm": 0.44921875, + "learning_rate": 4.937018399329152e-06, + "loss": 2.2936, + "step": 9269 + }, + { + "epoch": 0.4973175965665236, + "grad_norm": 0.3984375, + "learning_rate": 4.9369990201925735e-06, + "loss": 2.2366, + "step": 9270 + }, + { + "epoch": 0.4973712446351931, + "grad_norm": 0.4375, + "learning_rate": 4.9369796381130666e-06, + "loss": 2.3523, + "step": 9271 + }, + { + "epoch": 0.49742489270386264, + "grad_norm": 0.416015625, + "learning_rate": 4.936960253090654e-06, + "loss": 2.135, + "step": 9272 + }, + { + "epoch": 0.4974785407725322, + "grad_norm": 0.4609375, + "learning_rate": 4.936940865125358e-06, + "loss": 2.2525, + "step": 9273 + }, + { + "epoch": 0.4975321888412017, + "grad_norm": 0.439453125, + "learning_rate": 4.936921474217204e-06, + "loss": 2.219, + "step": 9274 + }, + { + "epoch": 0.49758583690987124, + "grad_norm": 0.6796875, + "learning_rate": 4.936902080366214e-06, + "loss": 2.5226, + "step": 9275 + }, + { + "epoch": 0.49763948497854077, + "grad_norm": 0.458984375, + "learning_rate": 4.936882683572412e-06, + "loss": 2.2646, + "step": 9276 + }, + { + "epoch": 0.4976931330472103, + "grad_norm": 0.44140625, + "learning_rate": 4.936863283835821e-06, + "loss": 2.2649, + "step": 9277 + }, + { + "epoch": 0.49774678111587983, + "grad_norm": 0.50390625, + "learning_rate": 4.936843881156466e-06, + "loss": 2.2198, + "step": 9278 + }, + { + "epoch": 0.49780042918454936, + "grad_norm": 0.455078125, + "learning_rate": 4.936824475534368e-06, + "loss": 2.2354, + "step": 9279 + }, + { + "epoch": 0.4978540772532189, + "grad_norm": 0.470703125, + "learning_rate": 4.936805066969552e-06, + "loss": 2.3302, + "step": 9280 + }, + { + "epoch": 0.49790772532188843, + "grad_norm": 0.376953125, + "learning_rate": 4.9367856554620404e-06, + "loss": 2.2921, + "step": 9281 + }, + { + "epoch": 0.49796137339055796, + "grad_norm": 0.53515625, + "learning_rate": 4.936766241011859e-06, + "loss": 2.2926, + "step": 9282 + }, + { + "epoch": 0.4980150214592275, + "grad_norm": 0.431640625, + "learning_rate": 4.936746823619028e-06, + "loss": 2.2113, + "step": 9283 + }, + { + "epoch": 0.498068669527897, + "grad_norm": 0.46875, + "learning_rate": 4.936727403283573e-06, + "loss": 2.6357, + "step": 9284 + }, + { + "epoch": 0.4981223175965665, + "grad_norm": 0.447265625, + "learning_rate": 4.936707980005517e-06, + "loss": 2.1704, + "step": 9285 + }, + { + "epoch": 0.49817596566523603, + "grad_norm": 0.58984375, + "learning_rate": 4.936688553784883e-06, + "loss": 2.2174, + "step": 9286 + }, + { + "epoch": 0.49822961373390556, + "grad_norm": 0.412109375, + "learning_rate": 4.936669124621695e-06, + "loss": 2.4275, + "step": 9287 + }, + { + "epoch": 0.4982832618025751, + "grad_norm": 0.37109375, + "learning_rate": 4.936649692515977e-06, + "loss": 2.1592, + "step": 9288 + }, + { + "epoch": 0.4983369098712446, + "grad_norm": 0.3671875, + "learning_rate": 4.9366302574677505e-06, + "loss": 2.3577, + "step": 9289 + }, + { + "epoch": 0.49839055793991416, + "grad_norm": 0.5, + "learning_rate": 4.93661081947704e-06, + "loss": 2.1878, + "step": 9290 + }, + { + "epoch": 0.4984442060085837, + "grad_norm": 0.455078125, + "learning_rate": 4.93659137854387e-06, + "loss": 1.9766, + "step": 9291 + }, + { + "epoch": 0.4984978540772532, + "grad_norm": 0.421875, + "learning_rate": 4.936571934668263e-06, + "loss": 2.2163, + "step": 9292 + }, + { + "epoch": 0.49855150214592275, + "grad_norm": 0.484375, + "learning_rate": 4.936552487850243e-06, + "loss": 2.3792, + "step": 9293 + }, + { + "epoch": 0.4986051502145923, + "grad_norm": 0.4140625, + "learning_rate": 4.936533038089831e-06, + "loss": 2.2137, + "step": 9294 + }, + { + "epoch": 0.4986587982832618, + "grad_norm": 0.462890625, + "learning_rate": 4.936513585387055e-06, + "loss": 2.341, + "step": 9295 + }, + { + "epoch": 0.49871244635193135, + "grad_norm": 0.419921875, + "learning_rate": 4.936494129741934e-06, + "loss": 2.3039, + "step": 9296 + }, + { + "epoch": 0.4987660944206009, + "grad_norm": 0.458984375, + "learning_rate": 4.936474671154496e-06, + "loss": 2.2981, + "step": 9297 + }, + { + "epoch": 0.4988197424892704, + "grad_norm": 0.4296875, + "learning_rate": 4.936455209624759e-06, + "loss": 2.3219, + "step": 9298 + }, + { + "epoch": 0.4988733905579399, + "grad_norm": 0.5390625, + "learning_rate": 4.9364357451527525e-06, + "loss": 2.2345, + "step": 9299 + }, + { + "epoch": 0.4989270386266094, + "grad_norm": 0.39453125, + "learning_rate": 4.936416277738496e-06, + "loss": 2.0531, + "step": 9300 + }, + { + "epoch": 0.49898068669527895, + "grad_norm": 0.42578125, + "learning_rate": 4.936396807382013e-06, + "loss": 2.4474, + "step": 9301 + }, + { + "epoch": 0.4990343347639485, + "grad_norm": 0.451171875, + "learning_rate": 4.93637733408333e-06, + "loss": 2.0975, + "step": 9302 + }, + { + "epoch": 0.499087982832618, + "grad_norm": 0.482421875, + "learning_rate": 4.936357857842467e-06, + "loss": 2.351, + "step": 9303 + }, + { + "epoch": 0.49914163090128755, + "grad_norm": 0.400390625, + "learning_rate": 4.93633837865945e-06, + "loss": 2.2893, + "step": 9304 + }, + { + "epoch": 0.4991952789699571, + "grad_norm": 0.703125, + "learning_rate": 4.936318896534301e-06, + "loss": 2.2899, + "step": 9305 + }, + { + "epoch": 0.4992489270386266, + "grad_norm": 0.390625, + "learning_rate": 4.936299411467045e-06, + "loss": 2.1725, + "step": 9306 + }, + { + "epoch": 0.49930257510729614, + "grad_norm": 0.3203125, + "learning_rate": 4.936279923457704e-06, + "loss": 1.8849, + "step": 9307 + }, + { + "epoch": 0.4993562231759657, + "grad_norm": 0.41015625, + "learning_rate": 4.936260432506302e-06, + "loss": 2.0758, + "step": 9308 + }, + { + "epoch": 0.4994098712446352, + "grad_norm": 0.408203125, + "learning_rate": 4.936240938612864e-06, + "loss": 2.4659, + "step": 9309 + }, + { + "epoch": 0.49946351931330474, + "grad_norm": 0.4921875, + "learning_rate": 4.9362214417774115e-06, + "loss": 2.5298, + "step": 9310 + }, + { + "epoch": 0.49951716738197427, + "grad_norm": 0.3828125, + "learning_rate": 4.9362019419999686e-06, + "loss": 2.1557, + "step": 9311 + }, + { + "epoch": 0.4995708154506438, + "grad_norm": 0.55078125, + "learning_rate": 4.93618243928056e-06, + "loss": 2.2923, + "step": 9312 + }, + { + "epoch": 0.4996244635193133, + "grad_norm": 0.39453125, + "learning_rate": 4.936162933619208e-06, + "loss": 2.2036, + "step": 9313 + }, + { + "epoch": 0.4996781115879828, + "grad_norm": 0.447265625, + "learning_rate": 4.936143425015936e-06, + "loss": 2.3133, + "step": 9314 + }, + { + "epoch": 0.49973175965665234, + "grad_norm": 0.380859375, + "learning_rate": 4.936123913470768e-06, + "loss": 2.3036, + "step": 9315 + }, + { + "epoch": 0.4997854077253219, + "grad_norm": 0.365234375, + "learning_rate": 4.936104398983727e-06, + "loss": 2.2649, + "step": 9316 + }, + { + "epoch": 0.4998390557939914, + "grad_norm": 0.40625, + "learning_rate": 4.936084881554839e-06, + "loss": 2.3157, + "step": 9317 + }, + { + "epoch": 0.49989270386266094, + "grad_norm": 0.373046875, + "learning_rate": 4.936065361184125e-06, + "loss": 2.2739, + "step": 9318 + }, + { + "epoch": 0.49994635193133047, + "grad_norm": 0.373046875, + "learning_rate": 4.93604583787161e-06, + "loss": 1.8952, + "step": 9319 + }, + { + "epoch": 0.5, + "grad_norm": 0.423828125, + "learning_rate": 4.936026311617316e-06, + "loss": 2.2677, + "step": 9320 + }, + { + "epoch": 0.5000536480686695, + "grad_norm": 0.42578125, + "learning_rate": 4.936006782421267e-06, + "loss": 2.2763, + "step": 9321 + }, + { + "epoch": 0.5001072961373391, + "grad_norm": 0.341796875, + "learning_rate": 4.935987250283488e-06, + "loss": 2.0095, + "step": 9322 + }, + { + "epoch": 0.5001609442060085, + "grad_norm": 0.484375, + "learning_rate": 4.935967715204002e-06, + "loss": 2.0474, + "step": 9323 + }, + { + "epoch": 0.5002145922746781, + "grad_norm": 0.345703125, + "learning_rate": 4.935948177182831e-06, + "loss": 2.183, + "step": 9324 + }, + { + "epoch": 0.5002682403433476, + "grad_norm": 0.419921875, + "learning_rate": 4.935928636220001e-06, + "loss": 2.4814, + "step": 9325 + }, + { + "epoch": 0.5003218884120172, + "grad_norm": 0.4140625, + "learning_rate": 4.935909092315534e-06, + "loss": 2.1243, + "step": 9326 + }, + { + "epoch": 0.5003755364806867, + "grad_norm": 0.41015625, + "learning_rate": 4.935889545469454e-06, + "loss": 2.3474, + "step": 9327 + }, + { + "epoch": 0.5004291845493563, + "grad_norm": 0.408203125, + "learning_rate": 4.935869995681785e-06, + "loss": 1.9943, + "step": 9328 + }, + { + "epoch": 0.5004828326180257, + "grad_norm": 0.478515625, + "learning_rate": 4.93585044295255e-06, + "loss": 2.1402, + "step": 9329 + }, + { + "epoch": 0.5005364806866953, + "grad_norm": 0.45703125, + "learning_rate": 4.9358308872817724e-06, + "loss": 2.3229, + "step": 9330 + }, + { + "epoch": 0.5005901287553648, + "grad_norm": 0.44140625, + "learning_rate": 4.935811328669476e-06, + "loss": 2.2235, + "step": 9331 + }, + { + "epoch": 0.5006437768240344, + "grad_norm": 0.396484375, + "learning_rate": 4.935791767115687e-06, + "loss": 2.1504, + "step": 9332 + }, + { + "epoch": 0.5006974248927039, + "grad_norm": 0.40625, + "learning_rate": 4.9357722026204245e-06, + "loss": 2.3909, + "step": 9333 + }, + { + "epoch": 0.5007510729613734, + "grad_norm": 0.41796875, + "learning_rate": 4.935752635183715e-06, + "loss": 2.1624, + "step": 9334 + }, + { + "epoch": 0.5008047210300429, + "grad_norm": 0.5234375, + "learning_rate": 4.935733064805581e-06, + "loss": 2.1949, + "step": 9335 + }, + { + "epoch": 0.5008583690987124, + "grad_norm": 0.3984375, + "learning_rate": 4.935713491486048e-06, + "loss": 2.3485, + "step": 9336 + }, + { + "epoch": 0.500912017167382, + "grad_norm": 0.39453125, + "learning_rate": 4.935693915225137e-06, + "loss": 2.2943, + "step": 9337 + }, + { + "epoch": 0.5009656652360515, + "grad_norm": 0.44140625, + "learning_rate": 4.935674336022873e-06, + "loss": 2.6318, + "step": 9338 + }, + { + "epoch": 0.501019313304721, + "grad_norm": 0.439453125, + "learning_rate": 4.93565475387928e-06, + "loss": 2.3921, + "step": 9339 + }, + { + "epoch": 0.5010729613733905, + "grad_norm": 0.3671875, + "learning_rate": 4.935635168794382e-06, + "loss": 2.3463, + "step": 9340 + }, + { + "epoch": 0.5011266094420601, + "grad_norm": 0.44921875, + "learning_rate": 4.9356155807682004e-06, + "loss": 2.2285, + "step": 9341 + }, + { + "epoch": 0.5011802575107296, + "grad_norm": 0.486328125, + "learning_rate": 4.935595989800761e-06, + "loss": 2.3796, + "step": 9342 + }, + { + "epoch": 0.5012339055793992, + "grad_norm": 0.4609375, + "learning_rate": 4.9355763958920865e-06, + "loss": 2.2339, + "step": 9343 + }, + { + "epoch": 0.5012875536480687, + "grad_norm": 0.396484375, + "learning_rate": 4.9355567990422e-06, + "loss": 2.0665, + "step": 9344 + }, + { + "epoch": 0.5013412017167382, + "grad_norm": 0.47265625, + "learning_rate": 4.935537199251128e-06, + "loss": 2.3104, + "step": 9345 + }, + { + "epoch": 0.5013948497854077, + "grad_norm": 0.427734375, + "learning_rate": 4.935517596518891e-06, + "loss": 2.434, + "step": 9346 + }, + { + "epoch": 0.5014484978540773, + "grad_norm": 0.392578125, + "learning_rate": 4.9354979908455135e-06, + "loss": 2.1805, + "step": 9347 + }, + { + "epoch": 0.5015021459227468, + "grad_norm": 0.458984375, + "learning_rate": 4.935478382231019e-06, + "loss": 2.0349, + "step": 9348 + }, + { + "epoch": 0.5015557939914163, + "grad_norm": 0.5859375, + "learning_rate": 4.935458770675433e-06, + "loss": 2.267, + "step": 9349 + }, + { + "epoch": 0.5016094420600858, + "grad_norm": 0.5390625, + "learning_rate": 4.935439156178778e-06, + "loss": 2.3824, + "step": 9350 + }, + { + "epoch": 0.5016630901287553, + "grad_norm": 0.369140625, + "learning_rate": 4.9354195387410765e-06, + "loss": 2.2082, + "step": 9351 + }, + { + "epoch": 0.5017167381974249, + "grad_norm": 0.41796875, + "learning_rate": 4.935399918362353e-06, + "loss": 2.1732, + "step": 9352 + }, + { + "epoch": 0.5017703862660944, + "grad_norm": 0.65234375, + "learning_rate": 4.9353802950426324e-06, + "loss": 1.462, + "step": 9353 + }, + { + "epoch": 0.501824034334764, + "grad_norm": 0.45703125, + "learning_rate": 4.935360668781938e-06, + "loss": 2.5024, + "step": 9354 + }, + { + "epoch": 0.5018776824034334, + "grad_norm": 0.380859375, + "learning_rate": 4.9353410395802916e-06, + "loss": 2.2391, + "step": 9355 + }, + { + "epoch": 0.501931330472103, + "grad_norm": 0.4921875, + "learning_rate": 4.935321407437719e-06, + "loss": 2.4687, + "step": 9356 + }, + { + "epoch": 0.5019849785407725, + "grad_norm": 0.39453125, + "learning_rate": 4.935301772354242e-06, + "loss": 2.4611, + "step": 9357 + }, + { + "epoch": 0.5020386266094421, + "grad_norm": 0.43359375, + "learning_rate": 4.935282134329886e-06, + "loss": 2.0844, + "step": 9358 + }, + { + "epoch": 0.5020922746781116, + "grad_norm": 0.44921875, + "learning_rate": 4.935262493364675e-06, + "loss": 2.2027, + "step": 9359 + }, + { + "epoch": 0.5021459227467812, + "grad_norm": 0.4296875, + "learning_rate": 4.935242849458632e-06, + "loss": 2.2212, + "step": 9360 + }, + { + "epoch": 0.5021995708154506, + "grad_norm": 0.4921875, + "learning_rate": 4.93522320261178e-06, + "loss": 2.6356, + "step": 9361 + }, + { + "epoch": 0.5022532188841202, + "grad_norm": 0.408203125, + "learning_rate": 4.935203552824143e-06, + "loss": 2.2986, + "step": 9362 + }, + { + "epoch": 0.5023068669527897, + "grad_norm": 0.6953125, + "learning_rate": 4.935183900095745e-06, + "loss": 2.1865, + "step": 9363 + }, + { + "epoch": 0.5023605150214592, + "grad_norm": 0.439453125, + "learning_rate": 4.93516424442661e-06, + "loss": 2.2915, + "step": 9364 + }, + { + "epoch": 0.5024141630901288, + "grad_norm": 0.46875, + "learning_rate": 4.9351445858167615e-06, + "loss": 2.4705, + "step": 9365 + }, + { + "epoch": 0.5024678111587982, + "grad_norm": 0.373046875, + "learning_rate": 4.935124924266224e-06, + "loss": 2.1011, + "step": 9366 + }, + { + "epoch": 0.5025214592274678, + "grad_norm": 0.462890625, + "learning_rate": 4.935105259775019e-06, + "loss": 2.4613, + "step": 9367 + }, + { + "epoch": 0.5025751072961373, + "grad_norm": 1.0625, + "learning_rate": 4.9350855923431735e-06, + "loss": 2.3828, + "step": 9368 + }, + { + "epoch": 0.5026287553648069, + "grad_norm": 0.423828125, + "learning_rate": 4.935065921970709e-06, + "loss": 2.2045, + "step": 9369 + }, + { + "epoch": 0.5026824034334764, + "grad_norm": 0.431640625, + "learning_rate": 4.935046248657649e-06, + "loss": 2.31, + "step": 9370 + }, + { + "epoch": 0.502736051502146, + "grad_norm": 0.4140625, + "learning_rate": 4.935026572404018e-06, + "loss": 2.1086, + "step": 9371 + }, + { + "epoch": 0.5027896995708154, + "grad_norm": 0.45703125, + "learning_rate": 4.93500689320984e-06, + "loss": 2.1684, + "step": 9372 + }, + { + "epoch": 0.502843347639485, + "grad_norm": 0.4296875, + "learning_rate": 4.93498721107514e-06, + "loss": 2.2968, + "step": 9373 + }, + { + "epoch": 0.5028969957081545, + "grad_norm": 0.47265625, + "learning_rate": 4.9349675259999385e-06, + "loss": 2.2677, + "step": 9374 + }, + { + "epoch": 0.5029506437768241, + "grad_norm": 0.392578125, + "learning_rate": 4.934947837984262e-06, + "loss": 2.2799, + "step": 9375 + }, + { + "epoch": 0.5030042918454936, + "grad_norm": 0.431640625, + "learning_rate": 4.934928147028133e-06, + "loss": 2.3797, + "step": 9376 + }, + { + "epoch": 0.5030579399141631, + "grad_norm": 0.4609375, + "learning_rate": 4.9349084531315764e-06, + "loss": 2.3654, + "step": 9377 + }, + { + "epoch": 0.5031115879828326, + "grad_norm": 0.43359375, + "learning_rate": 4.934888756294614e-06, + "loss": 2.4059, + "step": 9378 + }, + { + "epoch": 0.5031652360515021, + "grad_norm": 0.40234375, + "learning_rate": 4.9348690565172715e-06, + "loss": 2.2593, + "step": 9379 + }, + { + "epoch": 0.5032188841201717, + "grad_norm": 0.51171875, + "learning_rate": 4.934849353799572e-06, + "loss": 2.2849, + "step": 9380 + }, + { + "epoch": 0.5032725321888412, + "grad_norm": 0.439453125, + "learning_rate": 4.934829648141539e-06, + "loss": 2.3301, + "step": 9381 + }, + { + "epoch": 0.5033261802575107, + "grad_norm": 0.447265625, + "learning_rate": 4.934809939543197e-06, + "loss": 2.3178, + "step": 9382 + }, + { + "epoch": 0.5033798283261802, + "grad_norm": 0.5, + "learning_rate": 4.934790228004569e-06, + "loss": 2.1313, + "step": 9383 + }, + { + "epoch": 0.5034334763948498, + "grad_norm": 0.4140625, + "learning_rate": 4.9347705135256795e-06, + "loss": 2.4243, + "step": 9384 + }, + { + "epoch": 0.5034871244635193, + "grad_norm": 0.4296875, + "learning_rate": 4.934750796106552e-06, + "loss": 2.2823, + "step": 9385 + }, + { + "epoch": 0.5035407725321889, + "grad_norm": 0.408203125, + "learning_rate": 4.93473107574721e-06, + "loss": 2.2749, + "step": 9386 + }, + { + "epoch": 0.5035944206008584, + "grad_norm": 0.4375, + "learning_rate": 4.934711352447678e-06, + "loss": 2.5147, + "step": 9387 + }, + { + "epoch": 0.5036480686695279, + "grad_norm": 0.40234375, + "learning_rate": 4.934691626207979e-06, + "loss": 1.6196, + "step": 9388 + }, + { + "epoch": 0.5037017167381974, + "grad_norm": 0.3828125, + "learning_rate": 4.934671897028138e-06, + "loss": 2.3195, + "step": 9389 + }, + { + "epoch": 0.503755364806867, + "grad_norm": 0.470703125, + "learning_rate": 4.934652164908178e-06, + "loss": 2.2973, + "step": 9390 + }, + { + "epoch": 0.5038090128755365, + "grad_norm": 0.404296875, + "learning_rate": 4.934632429848122e-06, + "loss": 2.1725, + "step": 9391 + }, + { + "epoch": 0.503862660944206, + "grad_norm": 0.443359375, + "learning_rate": 4.934612691847995e-06, + "loss": 2.4729, + "step": 9392 + }, + { + "epoch": 0.5039163090128755, + "grad_norm": 0.388671875, + "learning_rate": 4.934592950907821e-06, + "loss": 2.2073, + "step": 9393 + }, + { + "epoch": 0.503969957081545, + "grad_norm": 0.48046875, + "learning_rate": 4.934573207027624e-06, + "loss": 2.3639, + "step": 9394 + }, + { + "epoch": 0.5040236051502146, + "grad_norm": 0.4375, + "learning_rate": 4.934553460207426e-06, + "loss": 2.2812, + "step": 9395 + }, + { + "epoch": 0.5040772532188841, + "grad_norm": 0.4140625, + "learning_rate": 4.934533710447253e-06, + "loss": 2.2681, + "step": 9396 + }, + { + "epoch": 0.5041309012875537, + "grad_norm": 0.408203125, + "learning_rate": 4.934513957747128e-06, + "loss": 2.1056, + "step": 9397 + }, + { + "epoch": 0.5041845493562231, + "grad_norm": 0.443359375, + "learning_rate": 4.934494202107074e-06, + "loss": 2.3168, + "step": 9398 + }, + { + "epoch": 0.5042381974248927, + "grad_norm": 0.4296875, + "learning_rate": 4.934474443527117e-06, + "loss": 2.3414, + "step": 9399 + }, + { + "epoch": 0.5042918454935622, + "grad_norm": 0.515625, + "learning_rate": 4.934454682007278e-06, + "loss": 2.3189, + "step": 9400 + }, + { + "epoch": 0.5043454935622318, + "grad_norm": 0.359375, + "learning_rate": 4.934434917547584e-06, + "loss": 2.1705, + "step": 9401 + }, + { + "epoch": 0.5043991416309013, + "grad_norm": 0.416015625, + "learning_rate": 4.934415150148056e-06, + "loss": 2.2432, + "step": 9402 + }, + { + "epoch": 0.5044527896995709, + "grad_norm": 0.53125, + "learning_rate": 4.93439537980872e-06, + "loss": 2.6028, + "step": 9403 + }, + { + "epoch": 0.5045064377682403, + "grad_norm": 0.4375, + "learning_rate": 4.9343756065295976e-06, + "loss": 2.4398, + "step": 9404 + }, + { + "epoch": 0.5045600858369099, + "grad_norm": 0.5, + "learning_rate": 4.934355830310715e-06, + "loss": 2.3987, + "step": 9405 + }, + { + "epoch": 0.5046137339055794, + "grad_norm": 0.451171875, + "learning_rate": 4.934336051152096e-06, + "loss": 1.9314, + "step": 9406 + }, + { + "epoch": 0.5046673819742489, + "grad_norm": 0.353515625, + "learning_rate": 4.934316269053763e-06, + "loss": 2.3499, + "step": 9407 + }, + { + "epoch": 0.5047210300429185, + "grad_norm": 0.515625, + "learning_rate": 4.93429648401574e-06, + "loss": 2.291, + "step": 9408 + }, + { + "epoch": 0.5047746781115879, + "grad_norm": 0.46875, + "learning_rate": 4.934276696038052e-06, + "loss": 2.5099, + "step": 9409 + }, + { + "epoch": 0.5048283261802575, + "grad_norm": 0.4140625, + "learning_rate": 4.934256905120723e-06, + "loss": 2.4068, + "step": 9410 + }, + { + "epoch": 0.504881974248927, + "grad_norm": 0.5, + "learning_rate": 4.934237111263774e-06, + "loss": 2.3279, + "step": 9411 + }, + { + "epoch": 0.5049356223175966, + "grad_norm": 0.427734375, + "learning_rate": 4.934217314467234e-06, + "loss": 2.2949, + "step": 9412 + }, + { + "epoch": 0.5049892703862661, + "grad_norm": 0.40234375, + "learning_rate": 4.934197514731122e-06, + "loss": 2.1172, + "step": 9413 + }, + { + "epoch": 0.5050429184549357, + "grad_norm": 0.455078125, + "learning_rate": 4.934177712055464e-06, + "loss": 2.4203, + "step": 9414 + }, + { + "epoch": 0.5050965665236051, + "grad_norm": 0.341796875, + "learning_rate": 4.934157906440285e-06, + "loss": 2.1862, + "step": 9415 + }, + { + "epoch": 0.5051502145922747, + "grad_norm": 0.376953125, + "learning_rate": 4.934138097885607e-06, + "loss": 2.1097, + "step": 9416 + }, + { + "epoch": 0.5052038626609442, + "grad_norm": 0.4140625, + "learning_rate": 4.934118286391455e-06, + "loss": 2.2313, + "step": 9417 + }, + { + "epoch": 0.5052575107296138, + "grad_norm": 1.296875, + "learning_rate": 4.934098471957854e-06, + "loss": 2.5502, + "step": 9418 + }, + { + "epoch": 0.5053111587982833, + "grad_norm": 1.1953125, + "learning_rate": 4.934078654584824e-06, + "loss": 2.1627, + "step": 9419 + }, + { + "epoch": 0.5053648068669528, + "grad_norm": 0.38671875, + "learning_rate": 4.9340588342723925e-06, + "loss": 2.1177, + "step": 9420 + }, + { + "epoch": 0.5054184549356223, + "grad_norm": 0.45703125, + "learning_rate": 4.934039011020583e-06, + "loss": 2.363, + "step": 9421 + }, + { + "epoch": 0.5054721030042918, + "grad_norm": 0.447265625, + "learning_rate": 4.934019184829419e-06, + "loss": 2.3207, + "step": 9422 + }, + { + "epoch": 0.5055257510729614, + "grad_norm": 0.462890625, + "learning_rate": 4.933999355698923e-06, + "loss": 2.4039, + "step": 9423 + }, + { + "epoch": 0.5055793991416309, + "grad_norm": 0.48828125, + "learning_rate": 4.933979523629121e-06, + "loss": 2.2582, + "step": 9424 + }, + { + "epoch": 0.5056330472103004, + "grad_norm": 0.404296875, + "learning_rate": 4.933959688620037e-06, + "loss": 2.1862, + "step": 9425 + }, + { + "epoch": 0.5056866952789699, + "grad_norm": 0.427734375, + "learning_rate": 4.933939850671693e-06, + "loss": 2.3937, + "step": 9426 + }, + { + "epoch": 0.5057403433476395, + "grad_norm": 0.408203125, + "learning_rate": 4.933920009784115e-06, + "loss": 2.1158, + "step": 9427 + }, + { + "epoch": 0.505793991416309, + "grad_norm": 0.443359375, + "learning_rate": 4.933900165957325e-06, + "loss": 2.1401, + "step": 9428 + }, + { + "epoch": 0.5058476394849786, + "grad_norm": 0.42578125, + "learning_rate": 4.933880319191349e-06, + "loss": 2.1899, + "step": 9429 + }, + { + "epoch": 0.505901287553648, + "grad_norm": 0.64453125, + "learning_rate": 4.933860469486209e-06, + "loss": 2.5466, + "step": 9430 + }, + { + "epoch": 0.5059549356223176, + "grad_norm": 0.52734375, + "learning_rate": 4.933840616841931e-06, + "loss": 2.2126, + "step": 9431 + }, + { + "epoch": 0.5060085836909871, + "grad_norm": 0.376953125, + "learning_rate": 4.933820761258538e-06, + "loss": 2.3262, + "step": 9432 + }, + { + "epoch": 0.5060622317596567, + "grad_norm": 0.470703125, + "learning_rate": 4.933800902736053e-06, + "loss": 2.0528, + "step": 9433 + }, + { + "epoch": 0.5061158798283262, + "grad_norm": 0.4375, + "learning_rate": 4.933781041274502e-06, + "loss": 2.2817, + "step": 9434 + }, + { + "epoch": 0.5061695278969958, + "grad_norm": 0.453125, + "learning_rate": 4.933761176873907e-06, + "loss": 2.3064, + "step": 9435 + }, + { + "epoch": 0.5062231759656652, + "grad_norm": 0.3984375, + "learning_rate": 4.933741309534294e-06, + "loss": 2.3558, + "step": 9436 + }, + { + "epoch": 0.5062768240343347, + "grad_norm": 0.5, + "learning_rate": 4.933721439255684e-06, + "loss": 2.4515, + "step": 9437 + }, + { + "epoch": 0.5063304721030043, + "grad_norm": 0.43359375, + "learning_rate": 4.933701566038104e-06, + "loss": 2.1736, + "step": 9438 + }, + { + "epoch": 0.5063841201716738, + "grad_norm": 0.431640625, + "learning_rate": 4.933681689881577e-06, + "loss": 2.3944, + "step": 9439 + }, + { + "epoch": 0.5064377682403434, + "grad_norm": 0.71875, + "learning_rate": 4.933661810786127e-06, + "loss": 2.0828, + "step": 9440 + }, + { + "epoch": 0.5064914163090128, + "grad_norm": 1.109375, + "learning_rate": 4.9336419287517774e-06, + "loss": 2.1455, + "step": 9441 + }, + { + "epoch": 0.5065450643776824, + "grad_norm": 0.47265625, + "learning_rate": 4.933622043778553e-06, + "loss": 2.3745, + "step": 9442 + }, + { + "epoch": 0.5065987124463519, + "grad_norm": 0.59375, + "learning_rate": 4.933602155866477e-06, + "loss": 2.3444, + "step": 9443 + }, + { + "epoch": 0.5066523605150215, + "grad_norm": 0.39453125, + "learning_rate": 4.933582265015574e-06, + "loss": 2.2887, + "step": 9444 + }, + { + "epoch": 0.506706008583691, + "grad_norm": 0.455078125, + "learning_rate": 4.933562371225869e-06, + "loss": 2.2472, + "step": 9445 + }, + { + "epoch": 0.5067596566523606, + "grad_norm": 0.53515625, + "learning_rate": 4.933542474497384e-06, + "loss": 2.5082, + "step": 9446 + }, + { + "epoch": 0.50681330472103, + "grad_norm": 0.375, + "learning_rate": 4.933522574830144e-06, + "loss": 2.353, + "step": 9447 + }, + { + "epoch": 0.5068669527896996, + "grad_norm": 0.40625, + "learning_rate": 4.933502672224173e-06, + "loss": 2.0658, + "step": 9448 + }, + { + "epoch": 0.5069206008583691, + "grad_norm": 0.443359375, + "learning_rate": 4.933482766679495e-06, + "loss": 2.5004, + "step": 9449 + }, + { + "epoch": 0.5069742489270386, + "grad_norm": 0.421875, + "learning_rate": 4.933462858196134e-06, + "loss": 2.2097, + "step": 9450 + }, + { + "epoch": 0.5070278969957082, + "grad_norm": 1.109375, + "learning_rate": 4.933442946774115e-06, + "loss": 2.4353, + "step": 9451 + }, + { + "epoch": 0.5070815450643776, + "grad_norm": 0.3828125, + "learning_rate": 4.93342303241346e-06, + "loss": 1.9674, + "step": 9452 + }, + { + "epoch": 0.5071351931330472, + "grad_norm": 0.46484375, + "learning_rate": 4.933403115114196e-06, + "loss": 2.1256, + "step": 9453 + }, + { + "epoch": 0.5071888412017167, + "grad_norm": 0.455078125, + "learning_rate": 4.9333831948763436e-06, + "loss": 2.3472, + "step": 9454 + }, + { + "epoch": 0.5072424892703863, + "grad_norm": 0.58203125, + "learning_rate": 4.933363271699929e-06, + "loss": 2.2991, + "step": 9455 + }, + { + "epoch": 0.5072961373390558, + "grad_norm": 0.447265625, + "learning_rate": 4.933343345584975e-06, + "loss": 2.4147, + "step": 9456 + }, + { + "epoch": 0.5073497854077254, + "grad_norm": 1.28125, + "learning_rate": 4.933323416531508e-06, + "loss": 2.2726, + "step": 9457 + }, + { + "epoch": 0.5074034334763948, + "grad_norm": 0.404296875, + "learning_rate": 4.93330348453955e-06, + "loss": 2.1894, + "step": 9458 + }, + { + "epoch": 0.5074570815450644, + "grad_norm": 0.447265625, + "learning_rate": 4.933283549609125e-06, + "loss": 1.3205, + "step": 9459 + }, + { + "epoch": 0.5075107296137339, + "grad_norm": 0.455078125, + "learning_rate": 4.933263611740259e-06, + "loss": 2.19, + "step": 9460 + }, + { + "epoch": 0.5075643776824035, + "grad_norm": 1.1171875, + "learning_rate": 4.933243670932974e-06, + "loss": 2.2489, + "step": 9461 + }, + { + "epoch": 0.507618025751073, + "grad_norm": 0.490234375, + "learning_rate": 4.933223727187294e-06, + "loss": 1.692, + "step": 9462 + }, + { + "epoch": 0.5076716738197425, + "grad_norm": 0.4453125, + "learning_rate": 4.933203780503244e-06, + "loss": 2.2031, + "step": 9463 + }, + { + "epoch": 0.507725321888412, + "grad_norm": 0.396484375, + "learning_rate": 4.933183830880849e-06, + "loss": 2.4425, + "step": 9464 + }, + { + "epoch": 0.5077789699570815, + "grad_norm": 0.369140625, + "learning_rate": 4.933163878320132e-06, + "loss": 2.3296, + "step": 9465 + }, + { + "epoch": 0.5078326180257511, + "grad_norm": 0.388671875, + "learning_rate": 4.933143922821116e-06, + "loss": 2.1272, + "step": 9466 + }, + { + "epoch": 0.5078862660944206, + "grad_norm": 0.431640625, + "learning_rate": 4.933123964383827e-06, + "loss": 2.4244, + "step": 9467 + }, + { + "epoch": 0.5079399141630901, + "grad_norm": 0.47265625, + "learning_rate": 4.933104003008289e-06, + "loss": 2.2978, + "step": 9468 + }, + { + "epoch": 0.5079935622317596, + "grad_norm": 0.396484375, + "learning_rate": 4.933084038694525e-06, + "loss": 2.3053, + "step": 9469 + }, + { + "epoch": 0.5080472103004292, + "grad_norm": 0.427734375, + "learning_rate": 4.9330640714425595e-06, + "loss": 2.0927, + "step": 9470 + }, + { + "epoch": 0.5081008583690987, + "grad_norm": 0.42578125, + "learning_rate": 4.9330441012524165e-06, + "loss": 2.1458, + "step": 9471 + }, + { + "epoch": 0.5081545064377683, + "grad_norm": 1.265625, + "learning_rate": 4.933024128124121e-06, + "loss": 2.2109, + "step": 9472 + }, + { + "epoch": 0.5082081545064377, + "grad_norm": 0.419921875, + "learning_rate": 4.933004152057696e-06, + "loss": 2.253, + "step": 9473 + }, + { + "epoch": 0.5082618025751073, + "grad_norm": 0.447265625, + "learning_rate": 4.932984173053166e-06, + "loss": 1.8778, + "step": 9474 + }, + { + "epoch": 0.5083154506437768, + "grad_norm": 0.423828125, + "learning_rate": 4.932964191110555e-06, + "loss": 2.3539, + "step": 9475 + }, + { + "epoch": 0.5083690987124464, + "grad_norm": 0.396484375, + "learning_rate": 4.9329442062298884e-06, + "loss": 2.1395, + "step": 9476 + }, + { + "epoch": 0.5084227467811159, + "grad_norm": 0.5, + "learning_rate": 4.932924218411188e-06, + "loss": 2.4196, + "step": 9477 + }, + { + "epoch": 0.5084763948497855, + "grad_norm": 0.41015625, + "learning_rate": 4.9329042276544795e-06, + "loss": 2.2441, + "step": 9478 + }, + { + "epoch": 0.5085300429184549, + "grad_norm": 0.4140625, + "learning_rate": 4.932884233959787e-06, + "loss": 2.3427, + "step": 9479 + }, + { + "epoch": 0.5085836909871244, + "grad_norm": 0.41015625, + "learning_rate": 4.932864237327135e-06, + "loss": 2.0401, + "step": 9480 + }, + { + "epoch": 0.508637339055794, + "grad_norm": 0.330078125, + "learning_rate": 4.932844237756546e-06, + "loss": 1.9794, + "step": 9481 + }, + { + "epoch": 0.5086909871244635, + "grad_norm": 0.95703125, + "learning_rate": 4.9328242352480455e-06, + "loss": 2.4262, + "step": 9482 + }, + { + "epoch": 0.5087446351931331, + "grad_norm": 0.431640625, + "learning_rate": 4.932804229801658e-06, + "loss": 2.4221, + "step": 9483 + }, + { + "epoch": 0.5087982832618025, + "grad_norm": 0.40625, + "learning_rate": 4.932784221417406e-06, + "loss": 2.0419, + "step": 9484 + }, + { + "epoch": 0.5088519313304721, + "grad_norm": 0.38671875, + "learning_rate": 4.932764210095314e-06, + "loss": 2.0586, + "step": 9485 + }, + { + "epoch": 0.5089055793991416, + "grad_norm": 0.43359375, + "learning_rate": 4.932744195835408e-06, + "loss": 2.2587, + "step": 9486 + }, + { + "epoch": 0.5089592274678112, + "grad_norm": 0.408203125, + "learning_rate": 4.932724178637711e-06, + "loss": 2.2588, + "step": 9487 + }, + { + "epoch": 0.5090128755364807, + "grad_norm": 0.4453125, + "learning_rate": 4.932704158502248e-06, + "loss": 2.2626, + "step": 9488 + }, + { + "epoch": 0.5090665236051503, + "grad_norm": 0.361328125, + "learning_rate": 4.932684135429041e-06, + "loss": 1.9752, + "step": 9489 + }, + { + "epoch": 0.5091201716738197, + "grad_norm": 0.40625, + "learning_rate": 4.932664109418116e-06, + "loss": 2.3369, + "step": 9490 + }, + { + "epoch": 0.5091738197424893, + "grad_norm": 0.48046875, + "learning_rate": 4.932644080469497e-06, + "loss": 1.4644, + "step": 9491 + }, + { + "epoch": 0.5092274678111588, + "grad_norm": 0.53125, + "learning_rate": 4.932624048583207e-06, + "loss": 2.2756, + "step": 9492 + }, + { + "epoch": 0.5092811158798283, + "grad_norm": 0.43359375, + "learning_rate": 4.932604013759271e-06, + "loss": 2.3141, + "step": 9493 + }, + { + "epoch": 0.5093347639484979, + "grad_norm": 0.4140625, + "learning_rate": 4.932583975997714e-06, + "loss": 2.3217, + "step": 9494 + }, + { + "epoch": 0.5093884120171673, + "grad_norm": 0.427734375, + "learning_rate": 4.93256393529856e-06, + "loss": 2.3763, + "step": 9495 + }, + { + "epoch": 0.5094420600858369, + "grad_norm": 0.5390625, + "learning_rate": 4.932543891661832e-06, + "loss": 2.2745, + "step": 9496 + }, + { + "epoch": 0.5094957081545064, + "grad_norm": 0.400390625, + "learning_rate": 4.932523845087554e-06, + "loss": 2.4843, + "step": 9497 + }, + { + "epoch": 0.509549356223176, + "grad_norm": 0.412109375, + "learning_rate": 4.932503795575752e-06, + "loss": 2.2295, + "step": 9498 + }, + { + "epoch": 0.5096030042918455, + "grad_norm": 0.4296875, + "learning_rate": 4.932483743126449e-06, + "loss": 2.2582, + "step": 9499 + }, + { + "epoch": 0.509656652360515, + "grad_norm": 0.435546875, + "learning_rate": 4.9324636877396706e-06, + "loss": 1.9107, + "step": 9500 + }, + { + "epoch": 0.5097103004291845, + "grad_norm": 0.4375, + "learning_rate": 4.9324436294154385e-06, + "loss": 2.4031, + "step": 9501 + }, + { + "epoch": 0.5097639484978541, + "grad_norm": 0.40625, + "learning_rate": 4.932423568153779e-06, + "loss": 2.1025, + "step": 9502 + }, + { + "epoch": 0.5098175965665236, + "grad_norm": 0.392578125, + "learning_rate": 4.9324035039547155e-06, + "loss": 2.3004, + "step": 9503 + }, + { + "epoch": 0.5098712446351932, + "grad_norm": 0.458984375, + "learning_rate": 4.932383436818273e-06, + "loss": 2.0945, + "step": 9504 + }, + { + "epoch": 0.5099248927038627, + "grad_norm": 0.51953125, + "learning_rate": 4.932363366744474e-06, + "loss": 2.2972, + "step": 9505 + }, + { + "epoch": 0.5099785407725322, + "grad_norm": 0.44921875, + "learning_rate": 4.932343293733345e-06, + "loss": 2.4725, + "step": 9506 + }, + { + "epoch": 0.5100321888412017, + "grad_norm": 0.392578125, + "learning_rate": 4.932323217784909e-06, + "loss": 2.188, + "step": 9507 + }, + { + "epoch": 0.5100858369098712, + "grad_norm": 0.380859375, + "learning_rate": 4.932303138899189e-06, + "loss": 2.4479, + "step": 9508 + }, + { + "epoch": 0.5101394849785408, + "grad_norm": 0.408203125, + "learning_rate": 4.932283057076213e-06, + "loss": 2.0628, + "step": 9509 + }, + { + "epoch": 0.5101931330472103, + "grad_norm": 0.53125, + "learning_rate": 4.932262972316001e-06, + "loss": 2.5402, + "step": 9510 + }, + { + "epoch": 0.5102467811158798, + "grad_norm": 0.396484375, + "learning_rate": 4.93224288461858e-06, + "loss": 2.3901, + "step": 9511 + }, + { + "epoch": 0.5103004291845493, + "grad_norm": 0.447265625, + "learning_rate": 4.932222793983973e-06, + "loss": 2.4004, + "step": 9512 + }, + { + "epoch": 0.5103540772532189, + "grad_norm": 0.419921875, + "learning_rate": 4.932202700412204e-06, + "loss": 2.2579, + "step": 9513 + }, + { + "epoch": 0.5104077253218884, + "grad_norm": 0.490234375, + "learning_rate": 4.9321826039032985e-06, + "loss": 2.4916, + "step": 9514 + }, + { + "epoch": 0.510461373390558, + "grad_norm": 0.50390625, + "learning_rate": 4.93216250445728e-06, + "loss": 1.6951, + "step": 9515 + }, + { + "epoch": 0.5105150214592274, + "grad_norm": 0.4765625, + "learning_rate": 4.932142402074174e-06, + "loss": 2.4089, + "step": 9516 + }, + { + "epoch": 0.510568669527897, + "grad_norm": 0.421875, + "learning_rate": 4.932122296754003e-06, + "loss": 2.361, + "step": 9517 + }, + { + "epoch": 0.5106223175965665, + "grad_norm": 0.484375, + "learning_rate": 4.932102188496791e-06, + "loss": 2.0864, + "step": 9518 + }, + { + "epoch": 0.5106759656652361, + "grad_norm": 0.369140625, + "learning_rate": 4.932082077302564e-06, + "loss": 1.8203, + "step": 9519 + }, + { + "epoch": 0.5107296137339056, + "grad_norm": 0.41015625, + "learning_rate": 4.932061963171346e-06, + "loss": 2.3711, + "step": 9520 + }, + { + "epoch": 0.5107832618025752, + "grad_norm": 0.48046875, + "learning_rate": 4.932041846103162e-06, + "loss": 2.3159, + "step": 9521 + }, + { + "epoch": 0.5108369098712446, + "grad_norm": 0.5390625, + "learning_rate": 4.932021726098033e-06, + "loss": 2.2124, + "step": 9522 + }, + { + "epoch": 0.5108905579399141, + "grad_norm": 0.470703125, + "learning_rate": 4.932001603155986e-06, + "loss": 2.3006, + "step": 9523 + }, + { + "epoch": 0.5109442060085837, + "grad_norm": 0.486328125, + "learning_rate": 4.931981477277045e-06, + "loss": 2.3828, + "step": 9524 + }, + { + "epoch": 0.5109978540772532, + "grad_norm": 0.41015625, + "learning_rate": 4.9319613484612335e-06, + "loss": 2.2093, + "step": 9525 + }, + { + "epoch": 0.5110515021459228, + "grad_norm": 0.447265625, + "learning_rate": 4.9319412167085775e-06, + "loss": 2.2854, + "step": 9526 + }, + { + "epoch": 0.5111051502145922, + "grad_norm": 0.357421875, + "learning_rate": 4.9319210820191e-06, + "loss": 2.1398, + "step": 9527 + }, + { + "epoch": 0.5111587982832618, + "grad_norm": 0.443359375, + "learning_rate": 4.9319009443928244e-06, + "loss": 2.2713, + "step": 9528 + }, + { + "epoch": 0.5112124463519313, + "grad_norm": 0.37890625, + "learning_rate": 4.931880803829777e-06, + "loss": 2.1954, + "step": 9529 + }, + { + "epoch": 0.5112660944206009, + "grad_norm": 0.42578125, + "learning_rate": 4.931860660329981e-06, + "loss": 2.4021, + "step": 9530 + }, + { + "epoch": 0.5113197424892704, + "grad_norm": 0.439453125, + "learning_rate": 4.931840513893461e-06, + "loss": 2.216, + "step": 9531 + }, + { + "epoch": 0.51137339055794, + "grad_norm": 0.609375, + "learning_rate": 4.931820364520241e-06, + "loss": 2.2574, + "step": 9532 + }, + { + "epoch": 0.5114270386266094, + "grad_norm": 0.427734375, + "learning_rate": 4.931800212210346e-06, + "loss": 2.2063, + "step": 9533 + }, + { + "epoch": 0.511480686695279, + "grad_norm": 0.4140625, + "learning_rate": 4.931780056963799e-06, + "loss": 2.2291, + "step": 9534 + }, + { + "epoch": 0.5115343347639485, + "grad_norm": 2.328125, + "learning_rate": 4.931759898780627e-06, + "loss": 2.2109, + "step": 9535 + }, + { + "epoch": 0.511587982832618, + "grad_norm": 0.5546875, + "learning_rate": 4.931739737660852e-06, + "loss": 2.4034, + "step": 9536 + }, + { + "epoch": 0.5116416309012876, + "grad_norm": 0.53515625, + "learning_rate": 4.931719573604499e-06, + "loss": 2.3143, + "step": 9537 + }, + { + "epoch": 0.511695278969957, + "grad_norm": 0.451171875, + "learning_rate": 4.931699406611591e-06, + "loss": 2.3254, + "step": 9538 + }, + { + "epoch": 0.5117489270386266, + "grad_norm": 1.046875, + "learning_rate": 4.931679236682156e-06, + "loss": 2.4029, + "step": 9539 + }, + { + "epoch": 0.5118025751072961, + "grad_norm": 0.5625, + "learning_rate": 4.931659063816214e-06, + "loss": 2.3407, + "step": 9540 + }, + { + "epoch": 0.5118562231759657, + "grad_norm": 0.466796875, + "learning_rate": 4.931638888013792e-06, + "loss": 2.127, + "step": 9541 + }, + { + "epoch": 0.5119098712446352, + "grad_norm": 0.36328125, + "learning_rate": 4.931618709274914e-06, + "loss": 2.3659, + "step": 9542 + }, + { + "epoch": 0.5119635193133047, + "grad_norm": 0.48828125, + "learning_rate": 4.931598527599603e-06, + "loss": 2.3962, + "step": 9543 + }, + { + "epoch": 0.5120171673819742, + "grad_norm": 0.359375, + "learning_rate": 4.931578342987887e-06, + "loss": 2.1787, + "step": 9544 + }, + { + "epoch": 0.5120708154506438, + "grad_norm": 0.486328125, + "learning_rate": 4.9315581554397866e-06, + "loss": 2.3772, + "step": 9545 + }, + { + "epoch": 0.5121244635193133, + "grad_norm": 0.42578125, + "learning_rate": 4.931537964955326e-06, + "loss": 2.5828, + "step": 9546 + }, + { + "epoch": 0.5121781115879829, + "grad_norm": 0.462890625, + "learning_rate": 4.931517771534533e-06, + "loss": 2.6213, + "step": 9547 + }, + { + "epoch": 0.5122317596566524, + "grad_norm": 0.412109375, + "learning_rate": 4.93149757517743e-06, + "loss": 2.2766, + "step": 9548 + }, + { + "epoch": 0.5122854077253219, + "grad_norm": 0.44140625, + "learning_rate": 4.93147737588404e-06, + "loss": 2.2671, + "step": 9549 + }, + { + "epoch": 0.5123390557939914, + "grad_norm": 0.466796875, + "learning_rate": 4.93145717365439e-06, + "loss": 2.6164, + "step": 9550 + }, + { + "epoch": 0.5123927038626609, + "grad_norm": 0.6171875, + "learning_rate": 4.931436968488502e-06, + "loss": 2.4214, + "step": 9551 + }, + { + "epoch": 0.5124463519313305, + "grad_norm": 0.5234375, + "learning_rate": 4.931416760386403e-06, + "loss": 2.6149, + "step": 9552 + }, + { + "epoch": 0.5125, + "grad_norm": 0.5078125, + "learning_rate": 4.931396549348115e-06, + "loss": 2.3946, + "step": 9553 + }, + { + "epoch": 0.5125536480686695, + "grad_norm": 0.44921875, + "learning_rate": 4.9313763353736645e-06, + "loss": 2.3335, + "step": 9554 + }, + { + "epoch": 0.512607296137339, + "grad_norm": 0.4765625, + "learning_rate": 4.931356118463074e-06, + "loss": 2.4285, + "step": 9555 + }, + { + "epoch": 0.5126609442060086, + "grad_norm": 0.466796875, + "learning_rate": 4.931335898616368e-06, + "loss": 2.1269, + "step": 9556 + }, + { + "epoch": 0.5127145922746781, + "grad_norm": 0.53125, + "learning_rate": 4.9313156758335735e-06, + "loss": 2.3984, + "step": 9557 + }, + { + "epoch": 0.5127682403433477, + "grad_norm": 1.65625, + "learning_rate": 4.931295450114711e-06, + "loss": 2.3368, + "step": 9558 + }, + { + "epoch": 0.5128218884120171, + "grad_norm": 0.51953125, + "learning_rate": 4.931275221459809e-06, + "loss": 2.4168, + "step": 9559 + }, + { + "epoch": 0.5128755364806867, + "grad_norm": 0.36328125, + "learning_rate": 4.931254989868889e-06, + "loss": 2.16, + "step": 9560 + }, + { + "epoch": 0.5129291845493562, + "grad_norm": 0.412109375, + "learning_rate": 4.931234755341976e-06, + "loss": 2.3466, + "step": 9561 + }, + { + "epoch": 0.5129828326180258, + "grad_norm": 0.431640625, + "learning_rate": 4.931214517879096e-06, + "loss": 2.311, + "step": 9562 + }, + { + "epoch": 0.5130364806866953, + "grad_norm": 1.65625, + "learning_rate": 4.931194277480271e-06, + "loss": 2.1427, + "step": 9563 + }, + { + "epoch": 0.5130901287553649, + "grad_norm": 0.703125, + "learning_rate": 4.931174034145527e-06, + "loss": 2.326, + "step": 9564 + }, + { + "epoch": 0.5131437768240343, + "grad_norm": 0.3984375, + "learning_rate": 4.931153787874888e-06, + "loss": 2.2073, + "step": 9565 + }, + { + "epoch": 0.5131974248927038, + "grad_norm": 0.421875, + "learning_rate": 4.931133538668379e-06, + "loss": 2.41, + "step": 9566 + }, + { + "epoch": 0.5132510729613734, + "grad_norm": 0.546875, + "learning_rate": 4.931113286526024e-06, + "loss": 2.1801, + "step": 9567 + }, + { + "epoch": 0.5133047210300429, + "grad_norm": 0.41796875, + "learning_rate": 4.931093031447847e-06, + "loss": 2.1273, + "step": 9568 + }, + { + "epoch": 0.5133583690987125, + "grad_norm": 0.44140625, + "learning_rate": 4.9310727734338735e-06, + "loss": 2.1987, + "step": 9569 + }, + { + "epoch": 0.5134120171673819, + "grad_norm": 0.4140625, + "learning_rate": 4.931052512484127e-06, + "loss": 2.4394, + "step": 9570 + }, + { + "epoch": 0.5134656652360515, + "grad_norm": 0.416015625, + "learning_rate": 4.931032248598633e-06, + "loss": 2.3071, + "step": 9571 + }, + { + "epoch": 0.513519313304721, + "grad_norm": 0.435546875, + "learning_rate": 4.9310119817774145e-06, + "loss": 2.1666, + "step": 9572 + }, + { + "epoch": 0.5135729613733906, + "grad_norm": 0.400390625, + "learning_rate": 4.930991712020498e-06, + "loss": 2.5265, + "step": 9573 + }, + { + "epoch": 0.5136266094420601, + "grad_norm": 0.408203125, + "learning_rate": 4.930971439327906e-06, + "loss": 2.2059, + "step": 9574 + }, + { + "epoch": 0.5136802575107297, + "grad_norm": 0.40625, + "learning_rate": 4.9309511636996635e-06, + "loss": 2.2736, + "step": 9575 + }, + { + "epoch": 0.5137339055793991, + "grad_norm": 0.447265625, + "learning_rate": 4.930930885135795e-06, + "loss": 2.0847, + "step": 9576 + }, + { + "epoch": 0.5137875536480687, + "grad_norm": 0.4296875, + "learning_rate": 4.930910603636326e-06, + "loss": 2.5187, + "step": 9577 + }, + { + "epoch": 0.5138412017167382, + "grad_norm": 0.388671875, + "learning_rate": 4.93089031920128e-06, + "loss": 2.287, + "step": 9578 + }, + { + "epoch": 0.5138948497854077, + "grad_norm": 1.0625, + "learning_rate": 4.9308700318306826e-06, + "loss": 2.3277, + "step": 9579 + }, + { + "epoch": 0.5139484978540773, + "grad_norm": 0.4140625, + "learning_rate": 4.930849741524557e-06, + "loss": 1.9247, + "step": 9580 + }, + { + "epoch": 0.5140021459227467, + "grad_norm": 0.361328125, + "learning_rate": 4.930829448282927e-06, + "loss": 1.9688, + "step": 9581 + }, + { + "epoch": 0.5140557939914163, + "grad_norm": 1.875, + "learning_rate": 4.9308091521058194e-06, + "loss": 2.1528, + "step": 9582 + }, + { + "epoch": 0.5141094420600858, + "grad_norm": 0.4296875, + "learning_rate": 4.930788852993257e-06, + "loss": 2.4158, + "step": 9583 + }, + { + "epoch": 0.5141630901287554, + "grad_norm": 0.369140625, + "learning_rate": 4.930768550945265e-06, + "loss": 2.4654, + "step": 9584 + }, + { + "epoch": 0.5142167381974249, + "grad_norm": 0.41796875, + "learning_rate": 4.930748245961868e-06, + "loss": 2.2029, + "step": 9585 + }, + { + "epoch": 0.5142703862660944, + "grad_norm": 0.423828125, + "learning_rate": 4.930727938043091e-06, + "loss": 2.2832, + "step": 9586 + }, + { + "epoch": 0.5143240343347639, + "grad_norm": 0.482421875, + "learning_rate": 4.930707627188957e-06, + "loss": 2.1862, + "step": 9587 + }, + { + "epoch": 0.5143776824034335, + "grad_norm": 1.0, + "learning_rate": 4.9306873133994905e-06, + "loss": 2.33, + "step": 9588 + }, + { + "epoch": 0.514431330472103, + "grad_norm": 0.5078125, + "learning_rate": 4.930666996674718e-06, + "loss": 2.0833, + "step": 9589 + }, + { + "epoch": 0.5144849785407726, + "grad_norm": 0.609375, + "learning_rate": 4.930646677014662e-06, + "loss": 2.3614, + "step": 9590 + }, + { + "epoch": 0.514538626609442, + "grad_norm": 0.392578125, + "learning_rate": 4.930626354419349e-06, + "loss": 2.1358, + "step": 9591 + }, + { + "epoch": 0.5145922746781116, + "grad_norm": 0.482421875, + "learning_rate": 4.930606028888803e-06, + "loss": 2.1597, + "step": 9592 + }, + { + "epoch": 0.5146459227467811, + "grad_norm": 0.455078125, + "learning_rate": 4.9305857004230464e-06, + "loss": 2.1527, + "step": 9593 + }, + { + "epoch": 0.5146995708154506, + "grad_norm": 0.4453125, + "learning_rate": 4.930565369022107e-06, + "loss": 2.2622, + "step": 9594 + }, + { + "epoch": 0.5147532188841202, + "grad_norm": 0.435546875, + "learning_rate": 4.930545034686006e-06, + "loss": 2.4044, + "step": 9595 + }, + { + "epoch": 0.5148068669527897, + "grad_norm": 0.4296875, + "learning_rate": 4.9305246974147715e-06, + "loss": 2.2754, + "step": 9596 + }, + { + "epoch": 0.5148605150214592, + "grad_norm": 0.54296875, + "learning_rate": 4.930504357208425e-06, + "loss": 1.5049, + "step": 9597 + }, + { + "epoch": 0.5149141630901287, + "grad_norm": 0.455078125, + "learning_rate": 4.930484014066993e-06, + "loss": 2.1946, + "step": 9598 + }, + { + "epoch": 0.5149678111587983, + "grad_norm": 0.53515625, + "learning_rate": 4.930463667990498e-06, + "loss": 2.4098, + "step": 9599 + }, + { + "epoch": 0.5150214592274678, + "grad_norm": 0.494140625, + "learning_rate": 4.930443318978968e-06, + "loss": 2.2889, + "step": 9600 + }, + { + "epoch": 0.5150751072961374, + "grad_norm": 0.416015625, + "learning_rate": 4.930422967032425e-06, + "loss": 2.3265, + "step": 9601 + }, + { + "epoch": 0.5151287553648068, + "grad_norm": 0.4296875, + "learning_rate": 4.930402612150894e-06, + "loss": 2.3606, + "step": 9602 + }, + { + "epoch": 0.5151824034334764, + "grad_norm": 0.455078125, + "learning_rate": 4.930382254334399e-06, + "loss": 2.3631, + "step": 9603 + }, + { + "epoch": 0.5152360515021459, + "grad_norm": 0.50390625, + "learning_rate": 4.930361893582965e-06, + "loss": 2.0727, + "step": 9604 + }, + { + "epoch": 0.5152896995708155, + "grad_norm": 0.419921875, + "learning_rate": 4.9303415298966185e-06, + "loss": 2.3058, + "step": 9605 + }, + { + "epoch": 0.515343347639485, + "grad_norm": 0.41796875, + "learning_rate": 4.930321163275381e-06, + "loss": 2.3325, + "step": 9606 + }, + { + "epoch": 0.5153969957081546, + "grad_norm": 0.45703125, + "learning_rate": 4.930300793719279e-06, + "loss": 2.1517, + "step": 9607 + }, + { + "epoch": 0.515450643776824, + "grad_norm": 0.41796875, + "learning_rate": 4.930280421228337e-06, + "loss": 2.2228, + "step": 9608 + }, + { + "epoch": 0.5155042918454935, + "grad_norm": 0.9453125, + "learning_rate": 4.93026004580258e-06, + "loss": 2.2918, + "step": 9609 + }, + { + "epoch": 0.5155579399141631, + "grad_norm": 0.443359375, + "learning_rate": 4.930239667442031e-06, + "loss": 2.2784, + "step": 9610 + }, + { + "epoch": 0.5156115879828326, + "grad_norm": 0.470703125, + "learning_rate": 4.930219286146714e-06, + "loss": 2.3661, + "step": 9611 + }, + { + "epoch": 0.5156652360515022, + "grad_norm": 0.41796875, + "learning_rate": 4.930198901916657e-06, + "loss": 2.2596, + "step": 9612 + }, + { + "epoch": 0.5157188841201716, + "grad_norm": 0.3984375, + "learning_rate": 4.930178514751882e-06, + "loss": 2.2703, + "step": 9613 + }, + { + "epoch": 0.5157725321888412, + "grad_norm": 0.6015625, + "learning_rate": 4.930158124652415e-06, + "loss": 2.3742, + "step": 9614 + }, + { + "epoch": 0.5158261802575107, + "grad_norm": 0.37109375, + "learning_rate": 4.930137731618279e-06, + "loss": 2.3312, + "step": 9615 + }, + { + "epoch": 0.5158798283261803, + "grad_norm": 0.3984375, + "learning_rate": 4.9301173356495e-06, + "loss": 2.2751, + "step": 9616 + }, + { + "epoch": 0.5159334763948498, + "grad_norm": 0.388671875, + "learning_rate": 4.9300969367461025e-06, + "loss": 2.3165, + "step": 9617 + }, + { + "epoch": 0.5159871244635194, + "grad_norm": 0.443359375, + "learning_rate": 4.93007653490811e-06, + "loss": 2.4504, + "step": 9618 + }, + { + "epoch": 0.5160407725321888, + "grad_norm": 0.455078125, + "learning_rate": 4.9300561301355485e-06, + "loss": 2.3296, + "step": 9619 + }, + { + "epoch": 0.5160944206008584, + "grad_norm": 0.41015625, + "learning_rate": 4.930035722428442e-06, + "loss": 2.2406, + "step": 9620 + }, + { + "epoch": 0.5161480686695279, + "grad_norm": 0.478515625, + "learning_rate": 4.930015311786816e-06, + "loss": 2.2536, + "step": 9621 + }, + { + "epoch": 0.5162017167381975, + "grad_norm": 0.65625, + "learning_rate": 4.929994898210694e-06, + "loss": 2.5755, + "step": 9622 + }, + { + "epoch": 0.516255364806867, + "grad_norm": 0.37890625, + "learning_rate": 4.9299744817001e-06, + "loss": 2.2956, + "step": 9623 + }, + { + "epoch": 0.5163090128755364, + "grad_norm": 0.3671875, + "learning_rate": 4.929954062255061e-06, + "loss": 2.3299, + "step": 9624 + }, + { + "epoch": 0.516362660944206, + "grad_norm": 0.44140625, + "learning_rate": 4.9299336398756006e-06, + "loss": 2.3292, + "step": 9625 + }, + { + "epoch": 0.5164163090128755, + "grad_norm": 0.443359375, + "learning_rate": 4.929913214561742e-06, + "loss": 2.2175, + "step": 9626 + }, + { + "epoch": 0.5164699570815451, + "grad_norm": 0.466796875, + "learning_rate": 4.929892786313511e-06, + "loss": 2.1061, + "step": 9627 + }, + { + "epoch": 0.5165236051502146, + "grad_norm": 0.97265625, + "learning_rate": 4.929872355130933e-06, + "loss": 2.1381, + "step": 9628 + }, + { + "epoch": 0.5165772532188841, + "grad_norm": 0.3828125, + "learning_rate": 4.929851921014032e-06, + "loss": 2.3487, + "step": 9629 + }, + { + "epoch": 0.5166309012875536, + "grad_norm": 0.5390625, + "learning_rate": 4.929831483962834e-06, + "loss": 2.4093, + "step": 9630 + }, + { + "epoch": 0.5166845493562232, + "grad_norm": 0.376953125, + "learning_rate": 4.92981104397736e-06, + "loss": 2.2615, + "step": 9631 + }, + { + "epoch": 0.5167381974248927, + "grad_norm": 0.81640625, + "learning_rate": 4.929790601057639e-06, + "loss": 2.3152, + "step": 9632 + }, + { + "epoch": 0.5167918454935623, + "grad_norm": 0.64453125, + "learning_rate": 4.929770155203693e-06, + "loss": 1.9806, + "step": 9633 + }, + { + "epoch": 0.5168454935622318, + "grad_norm": 0.384765625, + "learning_rate": 4.929749706415547e-06, + "loss": 2.263, + "step": 9634 + }, + { + "epoch": 0.5168991416309013, + "grad_norm": 0.388671875, + "learning_rate": 4.929729254693226e-06, + "loss": 2.3038, + "step": 9635 + }, + { + "epoch": 0.5169527896995708, + "grad_norm": 0.49609375, + "learning_rate": 4.929708800036756e-06, + "loss": 2.3262, + "step": 9636 + }, + { + "epoch": 0.5170064377682403, + "grad_norm": 0.36328125, + "learning_rate": 4.92968834244616e-06, + "loss": 1.9686, + "step": 9637 + }, + { + "epoch": 0.5170600858369099, + "grad_norm": 0.435546875, + "learning_rate": 4.929667881921464e-06, + "loss": 2.1679, + "step": 9638 + }, + { + "epoch": 0.5171137339055794, + "grad_norm": 0.427734375, + "learning_rate": 4.929647418462691e-06, + "loss": 2.2694, + "step": 9639 + }, + { + "epoch": 0.5171673819742489, + "grad_norm": 0.423828125, + "learning_rate": 4.9296269520698655e-06, + "loss": 2.2972, + "step": 9640 + }, + { + "epoch": 0.5172210300429184, + "grad_norm": 0.421875, + "learning_rate": 4.929606482743015e-06, + "loss": 2.3915, + "step": 9641 + }, + { + "epoch": 0.517274678111588, + "grad_norm": 0.875, + "learning_rate": 4.9295860104821626e-06, + "loss": 1.3019, + "step": 9642 + }, + { + "epoch": 0.5173283261802575, + "grad_norm": 0.3671875, + "learning_rate": 4.9295655352873324e-06, + "loss": 2.3449, + "step": 9643 + }, + { + "epoch": 0.5173819742489271, + "grad_norm": 0.41015625, + "learning_rate": 4.9295450571585504e-06, + "loss": 2.4107, + "step": 9644 + }, + { + "epoch": 0.5174356223175965, + "grad_norm": 0.392578125, + "learning_rate": 4.929524576095841e-06, + "loss": 2.2634, + "step": 9645 + }, + { + "epoch": 0.5174892703862661, + "grad_norm": 0.46484375, + "learning_rate": 4.929504092099227e-06, + "loss": 2.07, + "step": 9646 + }, + { + "epoch": 0.5175429184549356, + "grad_norm": 0.44921875, + "learning_rate": 4.929483605168736e-06, + "loss": 2.4999, + "step": 9647 + }, + { + "epoch": 0.5175965665236052, + "grad_norm": 0.4296875, + "learning_rate": 4.929463115304392e-06, + "loss": 2.2475, + "step": 9648 + }, + { + "epoch": 0.5176502145922747, + "grad_norm": 0.40625, + "learning_rate": 4.929442622506219e-06, + "loss": 2.1822, + "step": 9649 + }, + { + "epoch": 0.5177038626609443, + "grad_norm": 0.46484375, + "learning_rate": 4.929422126774241e-06, + "loss": 2.1292, + "step": 9650 + }, + { + "epoch": 0.5177575107296137, + "grad_norm": 0.482421875, + "learning_rate": 4.929401628108485e-06, + "loss": 2.4015, + "step": 9651 + }, + { + "epoch": 0.5178111587982832, + "grad_norm": 0.5390625, + "learning_rate": 4.929381126508973e-06, + "loss": 2.2019, + "step": 9652 + }, + { + "epoch": 0.5178648068669528, + "grad_norm": 0.39453125, + "learning_rate": 4.929360621975732e-06, + "loss": 2.4101, + "step": 9653 + }, + { + "epoch": 0.5179184549356223, + "grad_norm": 0.435546875, + "learning_rate": 4.9293401145087864e-06, + "loss": 2.2956, + "step": 9654 + }, + { + "epoch": 0.5179721030042919, + "grad_norm": 0.416015625, + "learning_rate": 4.929319604108159e-06, + "loss": 2.2382, + "step": 9655 + }, + { + "epoch": 0.5180257510729613, + "grad_norm": 0.50390625, + "learning_rate": 4.929299090773878e-06, + "loss": 2.3971, + "step": 9656 + }, + { + "epoch": 0.5180793991416309, + "grad_norm": 0.46875, + "learning_rate": 4.929278574505966e-06, + "loss": 2.2931, + "step": 9657 + }, + { + "epoch": 0.5181330472103004, + "grad_norm": 0.470703125, + "learning_rate": 4.929258055304447e-06, + "loss": 2.1739, + "step": 9658 + }, + { + "epoch": 0.51818669527897, + "grad_norm": 0.46875, + "learning_rate": 4.929237533169348e-06, + "loss": 2.2335, + "step": 9659 + }, + { + "epoch": 0.5182403433476395, + "grad_norm": 0.4140625, + "learning_rate": 4.929217008100692e-06, + "loss": 2.4692, + "step": 9660 + }, + { + "epoch": 0.518293991416309, + "grad_norm": 0.4140625, + "learning_rate": 4.929196480098505e-06, + "loss": 2.4005, + "step": 9661 + }, + { + "epoch": 0.5183476394849785, + "grad_norm": 0.53125, + "learning_rate": 4.929175949162811e-06, + "loss": 1.3819, + "step": 9662 + }, + { + "epoch": 0.5184012875536481, + "grad_norm": 0.435546875, + "learning_rate": 4.929155415293634e-06, + "loss": 2.3357, + "step": 9663 + }, + { + "epoch": 0.5184549356223176, + "grad_norm": 0.455078125, + "learning_rate": 4.929134878491001e-06, + "loss": 2.5506, + "step": 9664 + }, + { + "epoch": 0.5185085836909872, + "grad_norm": 0.45703125, + "learning_rate": 4.929114338754936e-06, + "loss": 2.3852, + "step": 9665 + }, + { + "epoch": 0.5185622317596567, + "grad_norm": 0.43359375, + "learning_rate": 4.929093796085462e-06, + "loss": 1.9223, + "step": 9666 + }, + { + "epoch": 0.5186158798283261, + "grad_norm": 0.6875, + "learning_rate": 4.9290732504826065e-06, + "loss": 2.3345, + "step": 9667 + }, + { + "epoch": 0.5186695278969957, + "grad_norm": 0.41015625, + "learning_rate": 4.929052701946393e-06, + "loss": 2.281, + "step": 9668 + }, + { + "epoch": 0.5187231759656652, + "grad_norm": 0.3671875, + "learning_rate": 4.929032150476845e-06, + "loss": 2.0263, + "step": 9669 + }, + { + "epoch": 0.5187768240343348, + "grad_norm": 0.4453125, + "learning_rate": 4.92901159607399e-06, + "loss": 2.2339, + "step": 9670 + }, + { + "epoch": 0.5188304721030043, + "grad_norm": 0.47265625, + "learning_rate": 4.928991038737851e-06, + "loss": 2.291, + "step": 9671 + }, + { + "epoch": 0.5188841201716738, + "grad_norm": 0.5078125, + "learning_rate": 4.928970478468454e-06, + "loss": 1.9158, + "step": 9672 + }, + { + "epoch": 0.5189377682403433, + "grad_norm": 0.435546875, + "learning_rate": 4.928949915265821e-06, + "loss": 2.166, + "step": 9673 + }, + { + "epoch": 0.5189914163090129, + "grad_norm": 0.494140625, + "learning_rate": 4.9289293491299814e-06, + "loss": 2.3255, + "step": 9674 + }, + { + "epoch": 0.5190450643776824, + "grad_norm": 0.5, + "learning_rate": 4.928908780060957e-06, + "loss": 2.345, + "step": 9675 + }, + { + "epoch": 0.519098712446352, + "grad_norm": 0.416015625, + "learning_rate": 4.928888208058772e-06, + "loss": 2.4906, + "step": 9676 + }, + { + "epoch": 0.5191523605150214, + "grad_norm": 0.44921875, + "learning_rate": 4.928867633123454e-06, + "loss": 2.2387, + "step": 9677 + }, + { + "epoch": 0.519206008583691, + "grad_norm": 0.400390625, + "learning_rate": 4.928847055255026e-06, + "loss": 2.3274, + "step": 9678 + }, + { + "epoch": 0.5192596566523605, + "grad_norm": 0.77734375, + "learning_rate": 4.928826474453513e-06, + "loss": 2.2682, + "step": 9679 + }, + { + "epoch": 0.51931330472103, + "grad_norm": 0.416015625, + "learning_rate": 4.92880589071894e-06, + "loss": 2.327, + "step": 9680 + }, + { + "epoch": 0.5193669527896996, + "grad_norm": 0.5078125, + "learning_rate": 4.928785304051332e-06, + "loss": 2.4193, + "step": 9681 + }, + { + "epoch": 0.519420600858369, + "grad_norm": 0.462890625, + "learning_rate": 4.928764714450714e-06, + "loss": 2.3613, + "step": 9682 + }, + { + "epoch": 0.5194742489270386, + "grad_norm": 0.380859375, + "learning_rate": 4.92874412191711e-06, + "loss": 2.337, + "step": 9683 + }, + { + "epoch": 0.5195278969957081, + "grad_norm": 0.412109375, + "learning_rate": 4.928723526450545e-06, + "loss": 2.3214, + "step": 9684 + }, + { + "epoch": 0.5195815450643777, + "grad_norm": 0.388671875, + "learning_rate": 4.928702928051046e-06, + "loss": 2.2166, + "step": 9685 + }, + { + "epoch": 0.5196351931330472, + "grad_norm": 0.53125, + "learning_rate": 4.928682326718636e-06, + "loss": 2.6174, + "step": 9686 + }, + { + "epoch": 0.5196888412017168, + "grad_norm": 0.390625, + "learning_rate": 4.928661722453339e-06, + "loss": 1.9166, + "step": 9687 + }, + { + "epoch": 0.5197424892703862, + "grad_norm": 0.71484375, + "learning_rate": 4.928641115255181e-06, + "loss": 2.4107, + "step": 9688 + }, + { + "epoch": 0.5197961373390558, + "grad_norm": 0.404296875, + "learning_rate": 4.9286205051241884e-06, + "loss": 2.177, + "step": 9689 + }, + { + "epoch": 0.5198497854077253, + "grad_norm": 0.369140625, + "learning_rate": 4.928599892060383e-06, + "loss": 2.1797, + "step": 9690 + }, + { + "epoch": 0.5199034334763949, + "grad_norm": 0.37890625, + "learning_rate": 4.928579276063792e-06, + "loss": 2.2159, + "step": 9691 + }, + { + "epoch": 0.5199570815450644, + "grad_norm": 0.490234375, + "learning_rate": 4.928558657134439e-06, + "loss": 2.5239, + "step": 9692 + }, + { + "epoch": 0.520010729613734, + "grad_norm": 0.439453125, + "learning_rate": 4.92853803527235e-06, + "loss": 2.1951, + "step": 9693 + }, + { + "epoch": 0.5200643776824034, + "grad_norm": 0.412109375, + "learning_rate": 4.92851741047755e-06, + "loss": 2.2664, + "step": 9694 + }, + { + "epoch": 0.5201180257510729, + "grad_norm": 0.50390625, + "learning_rate": 4.928496782750063e-06, + "loss": 2.3493, + "step": 9695 + }, + { + "epoch": 0.5201716738197425, + "grad_norm": 0.79296875, + "learning_rate": 4.928476152089914e-06, + "loss": 2.4783, + "step": 9696 + }, + { + "epoch": 0.520225321888412, + "grad_norm": 0.41796875, + "learning_rate": 4.928455518497127e-06, + "loss": 2.2719, + "step": 9697 + }, + { + "epoch": 0.5202789699570816, + "grad_norm": 0.4453125, + "learning_rate": 4.928434881971729e-06, + "loss": 2.4056, + "step": 9698 + }, + { + "epoch": 0.520332618025751, + "grad_norm": 0.53515625, + "learning_rate": 4.928414242513744e-06, + "loss": 2.385, + "step": 9699 + }, + { + "epoch": 0.5203862660944206, + "grad_norm": 0.5078125, + "learning_rate": 4.928393600123197e-06, + "loss": 1.9342, + "step": 9700 + }, + { + "epoch": 0.5204399141630901, + "grad_norm": 0.486328125, + "learning_rate": 4.928372954800112e-06, + "loss": 2.1612, + "step": 9701 + }, + { + "epoch": 0.5204935622317597, + "grad_norm": 0.64453125, + "learning_rate": 4.928352306544515e-06, + "loss": 2.1338, + "step": 9702 + }, + { + "epoch": 0.5205472103004292, + "grad_norm": 0.419921875, + "learning_rate": 4.928331655356431e-06, + "loss": 2.2414, + "step": 9703 + }, + { + "epoch": 0.5206008583690988, + "grad_norm": 0.447265625, + "learning_rate": 4.928311001235885e-06, + "loss": 2.5428, + "step": 9704 + }, + { + "epoch": 0.5206545064377682, + "grad_norm": 0.39453125, + "learning_rate": 4.9282903441829e-06, + "loss": 2.317, + "step": 9705 + }, + { + "epoch": 0.5207081545064378, + "grad_norm": 0.443359375, + "learning_rate": 4.928269684197504e-06, + "loss": 2.3067, + "step": 9706 + }, + { + "epoch": 0.5207618025751073, + "grad_norm": 0.40234375, + "learning_rate": 4.9282490212797194e-06, + "loss": 2.4464, + "step": 9707 + }, + { + "epoch": 0.5208154506437769, + "grad_norm": 0.65234375, + "learning_rate": 4.928228355429573e-06, + "loss": 2.4566, + "step": 9708 + }, + { + "epoch": 0.5208690987124464, + "grad_norm": 0.40234375, + "learning_rate": 4.928207686647088e-06, + "loss": 2.1119, + "step": 9709 + }, + { + "epoch": 0.5209227467811158, + "grad_norm": 0.470703125, + "learning_rate": 4.928187014932291e-06, + "loss": 2.4167, + "step": 9710 + }, + { + "epoch": 0.5209763948497854, + "grad_norm": 0.70703125, + "learning_rate": 4.928166340285206e-06, + "loss": 2.3057, + "step": 9711 + }, + { + "epoch": 0.5210300429184549, + "grad_norm": 0.44140625, + "learning_rate": 4.928145662705859e-06, + "loss": 2.1661, + "step": 9712 + }, + { + "epoch": 0.5210836909871245, + "grad_norm": 0.3671875, + "learning_rate": 4.9281249821942734e-06, + "loss": 2.0102, + "step": 9713 + }, + { + "epoch": 0.521137339055794, + "grad_norm": 0.443359375, + "learning_rate": 4.928104298750476e-06, + "loss": 2.2184, + "step": 9714 + }, + { + "epoch": 0.5211909871244635, + "grad_norm": 0.3984375, + "learning_rate": 4.928083612374489e-06, + "loss": 2.1426, + "step": 9715 + }, + { + "epoch": 0.521244635193133, + "grad_norm": 0.6640625, + "learning_rate": 4.92806292306634e-06, + "loss": 2.3368, + "step": 9716 + }, + { + "epoch": 0.5212982832618026, + "grad_norm": 0.4296875, + "learning_rate": 4.928042230826053e-06, + "loss": 2.2997, + "step": 9717 + }, + { + "epoch": 0.5213519313304721, + "grad_norm": 0.43359375, + "learning_rate": 4.928021535653654e-06, + "loss": 2.2724, + "step": 9718 + }, + { + "epoch": 0.5214055793991417, + "grad_norm": 0.40625, + "learning_rate": 4.928000837549166e-06, + "loss": 2.0995, + "step": 9719 + }, + { + "epoch": 0.5214592274678111, + "grad_norm": 0.486328125, + "learning_rate": 4.927980136512616e-06, + "loss": 2.3078, + "step": 9720 + }, + { + "epoch": 0.5215128755364807, + "grad_norm": 0.3984375, + "learning_rate": 4.9279594325440275e-06, + "loss": 2.2875, + "step": 9721 + }, + { + "epoch": 0.5215665236051502, + "grad_norm": 0.384765625, + "learning_rate": 4.927938725643425e-06, + "loss": 1.7514, + "step": 9722 + }, + { + "epoch": 0.5216201716738197, + "grad_norm": 0.458984375, + "learning_rate": 4.927918015810836e-06, + "loss": 2.0621, + "step": 9723 + }, + { + "epoch": 0.5216738197424893, + "grad_norm": 1.0078125, + "learning_rate": 4.927897303046284e-06, + "loss": 2.2344, + "step": 9724 + }, + { + "epoch": 0.5217274678111588, + "grad_norm": 0.65625, + "learning_rate": 4.927876587349794e-06, + "loss": 2.3037, + "step": 9725 + }, + { + "epoch": 0.5217811158798283, + "grad_norm": 0.48046875, + "learning_rate": 4.927855868721391e-06, + "loss": 2.5449, + "step": 9726 + }, + { + "epoch": 0.5218347639484978, + "grad_norm": 0.3359375, + "learning_rate": 4.9278351471611e-06, + "loss": 2.1647, + "step": 9727 + }, + { + "epoch": 0.5218884120171674, + "grad_norm": 0.47265625, + "learning_rate": 4.9278144226689465e-06, + "loss": 2.3424, + "step": 9728 + }, + { + "epoch": 0.5219420600858369, + "grad_norm": 0.5390625, + "learning_rate": 4.927793695244955e-06, + "loss": 2.3848, + "step": 9729 + }, + { + "epoch": 0.5219957081545065, + "grad_norm": 0.490234375, + "learning_rate": 4.927772964889151e-06, + "loss": 2.068, + "step": 9730 + }, + { + "epoch": 0.5220493562231759, + "grad_norm": 1.34375, + "learning_rate": 4.927752231601559e-06, + "loss": 1.9165, + "step": 9731 + }, + { + "epoch": 0.5221030042918455, + "grad_norm": 0.46875, + "learning_rate": 4.927731495382205e-06, + "loss": 2.3994, + "step": 9732 + }, + { + "epoch": 0.522156652360515, + "grad_norm": 0.42578125, + "learning_rate": 4.927710756231112e-06, + "loss": 2.2239, + "step": 9733 + }, + { + "epoch": 0.5222103004291846, + "grad_norm": 0.427734375, + "learning_rate": 4.927690014148308e-06, + "loss": 2.2019, + "step": 9734 + }, + { + "epoch": 0.5222639484978541, + "grad_norm": 0.34765625, + "learning_rate": 4.927669269133815e-06, + "loss": 2.2859, + "step": 9735 + }, + { + "epoch": 0.5223175965665237, + "grad_norm": 1.0625, + "learning_rate": 4.92764852118766e-06, + "loss": 2.3747, + "step": 9736 + }, + { + "epoch": 0.5223712446351931, + "grad_norm": 0.474609375, + "learning_rate": 4.927627770309868e-06, + "loss": 2.3034, + "step": 9737 + }, + { + "epoch": 0.5224248927038626, + "grad_norm": 0.474609375, + "learning_rate": 4.927607016500463e-06, + "loss": 2.2322, + "step": 9738 + }, + { + "epoch": 0.5224785407725322, + "grad_norm": 0.412109375, + "learning_rate": 4.927586259759471e-06, + "loss": 2.3282, + "step": 9739 + }, + { + "epoch": 0.5225321888412017, + "grad_norm": 0.470703125, + "learning_rate": 4.9275655000869164e-06, + "loss": 2.2971, + "step": 9740 + }, + { + "epoch": 0.5225858369098713, + "grad_norm": 0.416015625, + "learning_rate": 4.927544737482825e-06, + "loss": 2.2143, + "step": 9741 + }, + { + "epoch": 0.5226394849785407, + "grad_norm": 0.486328125, + "learning_rate": 4.927523971947221e-06, + "loss": 2.3278, + "step": 9742 + }, + { + "epoch": 0.5226931330472103, + "grad_norm": 0.41796875, + "learning_rate": 4.927503203480131e-06, + "loss": 2.1899, + "step": 9743 + }, + { + "epoch": 0.5227467811158798, + "grad_norm": 0.43359375, + "learning_rate": 4.927482432081578e-06, + "loss": 2.3775, + "step": 9744 + }, + { + "epoch": 0.5228004291845494, + "grad_norm": 0.3984375, + "learning_rate": 4.927461657751588e-06, + "loss": 2.0751, + "step": 9745 + }, + { + "epoch": 0.5228540772532189, + "grad_norm": 1.7734375, + "learning_rate": 4.927440880490188e-06, + "loss": 2.3248, + "step": 9746 + }, + { + "epoch": 0.5229077253218885, + "grad_norm": 0.486328125, + "learning_rate": 4.9274201002973985e-06, + "loss": 2.4204, + "step": 9747 + }, + { + "epoch": 0.5229613733905579, + "grad_norm": 0.412109375, + "learning_rate": 4.927399317173249e-06, + "loss": 2.2701, + "step": 9748 + }, + { + "epoch": 0.5230150214592275, + "grad_norm": 0.44921875, + "learning_rate": 4.927378531117764e-06, + "loss": 2.1419, + "step": 9749 + }, + { + "epoch": 0.523068669527897, + "grad_norm": 0.3984375, + "learning_rate": 4.9273577421309655e-06, + "loss": 2.5234, + "step": 9750 + }, + { + "epoch": 0.5231223175965666, + "grad_norm": 0.46484375, + "learning_rate": 4.927336950212882e-06, + "loss": 2.2819, + "step": 9751 + }, + { + "epoch": 0.523175965665236, + "grad_norm": 0.42578125, + "learning_rate": 4.927316155363536e-06, + "loss": 2.3018, + "step": 9752 + }, + { + "epoch": 0.5232296137339055, + "grad_norm": 0.453125, + "learning_rate": 4.927295357582955e-06, + "loss": 2.6996, + "step": 9753 + }, + { + "epoch": 0.5232832618025751, + "grad_norm": 0.443359375, + "learning_rate": 4.927274556871163e-06, + "loss": 2.2391, + "step": 9754 + }, + { + "epoch": 0.5233369098712446, + "grad_norm": 1.0859375, + "learning_rate": 4.927253753228185e-06, + "loss": 2.421, + "step": 9755 + }, + { + "epoch": 0.5233905579399142, + "grad_norm": 0.5703125, + "learning_rate": 4.9272329466540465e-06, + "loss": 2.006, + "step": 9756 + }, + { + "epoch": 0.5234442060085837, + "grad_norm": 0.435546875, + "learning_rate": 4.927212137148772e-06, + "loss": 2.1571, + "step": 9757 + }, + { + "epoch": 0.5234978540772532, + "grad_norm": 0.4375, + "learning_rate": 4.927191324712387e-06, + "loss": 2.2953, + "step": 9758 + }, + { + "epoch": 0.5235515021459227, + "grad_norm": 0.416015625, + "learning_rate": 4.927170509344917e-06, + "loss": 2.3536, + "step": 9759 + }, + { + "epoch": 0.5236051502145923, + "grad_norm": 0.455078125, + "learning_rate": 4.927149691046387e-06, + "loss": 2.0829, + "step": 9760 + }, + { + "epoch": 0.5236587982832618, + "grad_norm": 0.45703125, + "learning_rate": 4.927128869816822e-06, + "loss": 2.4614, + "step": 9761 + }, + { + "epoch": 0.5237124463519314, + "grad_norm": 0.73828125, + "learning_rate": 4.927108045656246e-06, + "loss": 2.3177, + "step": 9762 + }, + { + "epoch": 0.5237660944206008, + "grad_norm": 0.3984375, + "learning_rate": 4.927087218564685e-06, + "loss": 2.2556, + "step": 9763 + }, + { + "epoch": 0.5238197424892704, + "grad_norm": 0.41015625, + "learning_rate": 4.927066388542167e-06, + "loss": 2.3926, + "step": 9764 + }, + { + "epoch": 0.5238733905579399, + "grad_norm": 0.388671875, + "learning_rate": 4.927045555588712e-06, + "loss": 2.2331, + "step": 9765 + }, + { + "epoch": 0.5239270386266094, + "grad_norm": 0.39453125, + "learning_rate": 4.9270247197043485e-06, + "loss": 2.1538, + "step": 9766 + }, + { + "epoch": 0.523980686695279, + "grad_norm": 0.4453125, + "learning_rate": 4.927003880889101e-06, + "loss": 2.455, + "step": 9767 + }, + { + "epoch": 0.5240343347639485, + "grad_norm": 0.466796875, + "learning_rate": 4.926983039142994e-06, + "loss": 2.1331, + "step": 9768 + }, + { + "epoch": 0.524087982832618, + "grad_norm": 0.453125, + "learning_rate": 4.926962194466054e-06, + "loss": 2.2417, + "step": 9769 + }, + { + "epoch": 0.5241416309012875, + "grad_norm": 0.4375, + "learning_rate": 4.926941346858305e-06, + "loss": 2.2096, + "step": 9770 + }, + { + "epoch": 0.5241952789699571, + "grad_norm": 0.380859375, + "learning_rate": 4.926920496319773e-06, + "loss": 2.1199, + "step": 9771 + }, + { + "epoch": 0.5242489270386266, + "grad_norm": 0.40625, + "learning_rate": 4.926899642850481e-06, + "loss": 2.3096, + "step": 9772 + }, + { + "epoch": 0.5243025751072962, + "grad_norm": 0.43359375, + "learning_rate": 4.926878786450457e-06, + "loss": 2.2337, + "step": 9773 + }, + { + "epoch": 0.5243562231759656, + "grad_norm": 0.419921875, + "learning_rate": 4.926857927119726e-06, + "loss": 2.4718, + "step": 9774 + }, + { + "epoch": 0.5244098712446352, + "grad_norm": 0.484375, + "learning_rate": 4.926837064858311e-06, + "loss": 2.3442, + "step": 9775 + }, + { + "epoch": 0.5244635193133047, + "grad_norm": 0.42578125, + "learning_rate": 4.926816199666239e-06, + "loss": 2.0174, + "step": 9776 + }, + { + "epoch": 0.5245171673819743, + "grad_norm": 0.38671875, + "learning_rate": 4.926795331543534e-06, + "loss": 2.2319, + "step": 9777 + }, + { + "epoch": 0.5245708154506438, + "grad_norm": 0.44921875, + "learning_rate": 4.926774460490223e-06, + "loss": 2.4052, + "step": 9778 + }, + { + "epoch": 0.5246244635193134, + "grad_norm": 0.462890625, + "learning_rate": 4.926753586506329e-06, + "loss": 2.3174, + "step": 9779 + }, + { + "epoch": 0.5246781115879828, + "grad_norm": 0.44921875, + "learning_rate": 4.926732709591879e-06, + "loss": 2.0791, + "step": 9780 + }, + { + "epoch": 0.5247317596566523, + "grad_norm": 0.419921875, + "learning_rate": 4.926711829746898e-06, + "loss": 2.4732, + "step": 9781 + }, + { + "epoch": 0.5247854077253219, + "grad_norm": 0.455078125, + "learning_rate": 4.926690946971409e-06, + "loss": 2.2226, + "step": 9782 + }, + { + "epoch": 0.5248390557939914, + "grad_norm": 0.4296875, + "learning_rate": 4.92667006126544e-06, + "loss": 2.4169, + "step": 9783 + }, + { + "epoch": 0.524892703862661, + "grad_norm": 0.39453125, + "learning_rate": 4.926649172629015e-06, + "loss": 2.2138, + "step": 9784 + }, + { + "epoch": 0.5249463519313304, + "grad_norm": 0.447265625, + "learning_rate": 4.9266282810621595e-06, + "loss": 2.4124, + "step": 9785 + }, + { + "epoch": 0.525, + "grad_norm": 0.5078125, + "learning_rate": 4.926607386564898e-06, + "loss": 2.4675, + "step": 9786 + }, + { + "epoch": 0.5250536480686695, + "grad_norm": 0.416015625, + "learning_rate": 4.9265864891372574e-06, + "loss": 2.3861, + "step": 9787 + }, + { + "epoch": 0.5251072961373391, + "grad_norm": 0.515625, + "learning_rate": 4.926565588779261e-06, + "loss": 2.3715, + "step": 9788 + }, + { + "epoch": 0.5251609442060086, + "grad_norm": 0.486328125, + "learning_rate": 4.926544685490935e-06, + "loss": 2.3549, + "step": 9789 + }, + { + "epoch": 0.5252145922746781, + "grad_norm": 0.421875, + "learning_rate": 4.926523779272305e-06, + "loss": 2.1924, + "step": 9790 + }, + { + "epoch": 0.5252682403433476, + "grad_norm": 0.45703125, + "learning_rate": 4.9265028701233956e-06, + "loss": 2.4285, + "step": 9791 + }, + { + "epoch": 0.5253218884120172, + "grad_norm": 0.412109375, + "learning_rate": 4.926481958044231e-06, + "loss": 2.3543, + "step": 9792 + }, + { + "epoch": 0.5253755364806867, + "grad_norm": 0.466796875, + "learning_rate": 4.926461043034839e-06, + "loss": 2.5347, + "step": 9793 + }, + { + "epoch": 0.5254291845493563, + "grad_norm": 0.423828125, + "learning_rate": 4.926440125095244e-06, + "loss": 2.3051, + "step": 9794 + }, + { + "epoch": 0.5254828326180258, + "grad_norm": 0.408203125, + "learning_rate": 4.92641920422547e-06, + "loss": 2.3633, + "step": 9795 + }, + { + "epoch": 0.5255364806866952, + "grad_norm": 0.61328125, + "learning_rate": 4.926398280425543e-06, + "loss": 2.07, + "step": 9796 + }, + { + "epoch": 0.5255901287553648, + "grad_norm": 0.50390625, + "learning_rate": 4.926377353695489e-06, + "loss": 2.1826, + "step": 9797 + }, + { + "epoch": 0.5256437768240343, + "grad_norm": 0.431640625, + "learning_rate": 4.9263564240353326e-06, + "loss": 2.2637, + "step": 9798 + }, + { + "epoch": 0.5256974248927039, + "grad_norm": 0.515625, + "learning_rate": 4.926335491445098e-06, + "loss": 2.3426, + "step": 9799 + }, + { + "epoch": 0.5257510729613734, + "grad_norm": 0.400390625, + "learning_rate": 4.926314555924813e-06, + "loss": 2.2572, + "step": 9800 + }, + { + "epoch": 0.5258047210300429, + "grad_norm": 0.482421875, + "learning_rate": 4.9262936174745e-06, + "loss": 2.3891, + "step": 9801 + }, + { + "epoch": 0.5258583690987124, + "grad_norm": 0.484375, + "learning_rate": 4.926272676094187e-06, + "loss": 2.2564, + "step": 9802 + }, + { + "epoch": 0.525912017167382, + "grad_norm": 0.38671875, + "learning_rate": 4.926251731783897e-06, + "loss": 2.4949, + "step": 9803 + }, + { + "epoch": 0.5259656652360515, + "grad_norm": 0.365234375, + "learning_rate": 4.9262307845436565e-06, + "loss": 2.254, + "step": 9804 + }, + { + "epoch": 0.5260193133047211, + "grad_norm": 0.408203125, + "learning_rate": 4.926209834373491e-06, + "loss": 2.1894, + "step": 9805 + }, + { + "epoch": 0.5260729613733905, + "grad_norm": 0.421875, + "learning_rate": 4.926188881273426e-06, + "loss": 2.2299, + "step": 9806 + }, + { + "epoch": 0.5261266094420601, + "grad_norm": 0.51953125, + "learning_rate": 4.926167925243485e-06, + "loss": 2.3171, + "step": 9807 + }, + { + "epoch": 0.5261802575107296, + "grad_norm": 0.423828125, + "learning_rate": 4.926146966283695e-06, + "loss": 2.4033, + "step": 9808 + }, + { + "epoch": 0.5262339055793992, + "grad_norm": 0.365234375, + "learning_rate": 4.92612600439408e-06, + "loss": 2.3226, + "step": 9809 + }, + { + "epoch": 0.5262875536480687, + "grad_norm": 0.451171875, + "learning_rate": 4.926105039574668e-06, + "loss": 2.442, + "step": 9810 + }, + { + "epoch": 0.5263412017167381, + "grad_norm": 0.55078125, + "learning_rate": 4.9260840718254815e-06, + "loss": 2.5612, + "step": 9811 + }, + { + "epoch": 0.5263948497854077, + "grad_norm": 0.443359375, + "learning_rate": 4.926063101146547e-06, + "loss": 2.333, + "step": 9812 + }, + { + "epoch": 0.5264484978540772, + "grad_norm": 0.443359375, + "learning_rate": 4.926042127537889e-06, + "loss": 2.3029, + "step": 9813 + }, + { + "epoch": 0.5265021459227468, + "grad_norm": 0.462890625, + "learning_rate": 4.926021150999533e-06, + "loss": 2.2945, + "step": 9814 + }, + { + "epoch": 0.5265557939914163, + "grad_norm": 0.423828125, + "learning_rate": 4.926000171531506e-06, + "loss": 2.3054, + "step": 9815 + }, + { + "epoch": 0.5266094420600859, + "grad_norm": 0.4296875, + "learning_rate": 4.925979189133832e-06, + "loss": 2.4955, + "step": 9816 + }, + { + "epoch": 0.5266630901287553, + "grad_norm": 0.37890625, + "learning_rate": 4.925958203806536e-06, + "loss": 2.3226, + "step": 9817 + }, + { + "epoch": 0.5267167381974249, + "grad_norm": 0.45703125, + "learning_rate": 4.9259372155496445e-06, + "loss": 2.2474, + "step": 9818 + }, + { + "epoch": 0.5267703862660944, + "grad_norm": 0.4609375, + "learning_rate": 4.925916224363181e-06, + "loss": 2.4113, + "step": 9819 + }, + { + "epoch": 0.526824034334764, + "grad_norm": 0.380859375, + "learning_rate": 4.925895230247173e-06, + "loss": 2.0948, + "step": 9820 + }, + { + "epoch": 0.5268776824034335, + "grad_norm": 0.390625, + "learning_rate": 4.925874233201644e-06, + "loss": 2.2782, + "step": 9821 + }, + { + "epoch": 0.526931330472103, + "grad_norm": 0.5234375, + "learning_rate": 4.925853233226621e-06, + "loss": 2.4174, + "step": 9822 + }, + { + "epoch": 0.5269849785407725, + "grad_norm": 0.412109375, + "learning_rate": 4.925832230322127e-06, + "loss": 2.2245, + "step": 9823 + }, + { + "epoch": 0.527038626609442, + "grad_norm": 0.408203125, + "learning_rate": 4.92581122448819e-06, + "loss": 2.1925, + "step": 9824 + }, + { + "epoch": 0.5270922746781116, + "grad_norm": 0.8203125, + "learning_rate": 4.925790215724835e-06, + "loss": 1.5035, + "step": 9825 + }, + { + "epoch": 0.5271459227467811, + "grad_norm": 0.470703125, + "learning_rate": 4.925769204032086e-06, + "loss": 2.0462, + "step": 9826 + }, + { + "epoch": 0.5271995708154507, + "grad_norm": 0.51953125, + "learning_rate": 4.9257481894099685e-06, + "loss": 2.5045, + "step": 9827 + }, + { + "epoch": 0.5272532188841201, + "grad_norm": 0.42578125, + "learning_rate": 4.925727171858509e-06, + "loss": 2.5312, + "step": 9828 + }, + { + "epoch": 0.5273068669527897, + "grad_norm": 0.4375, + "learning_rate": 4.925706151377732e-06, + "loss": 2.319, + "step": 9829 + }, + { + "epoch": 0.5273605150214592, + "grad_norm": 0.458984375, + "learning_rate": 4.925685127967663e-06, + "loss": 1.5671, + "step": 9830 + }, + { + "epoch": 0.5274141630901288, + "grad_norm": 0.515625, + "learning_rate": 4.925664101628328e-06, + "loss": 2.1151, + "step": 9831 + }, + { + "epoch": 0.5274678111587983, + "grad_norm": 0.421875, + "learning_rate": 4.925643072359751e-06, + "loss": 2.0352, + "step": 9832 + }, + { + "epoch": 0.5275214592274678, + "grad_norm": 0.416015625, + "learning_rate": 4.92562204016196e-06, + "loss": 2.2009, + "step": 9833 + }, + { + "epoch": 0.5275751072961373, + "grad_norm": 0.455078125, + "learning_rate": 4.9256010050349774e-06, + "loss": 2.2741, + "step": 9834 + }, + { + "epoch": 0.5276287553648069, + "grad_norm": 0.578125, + "learning_rate": 4.925579966978831e-06, + "loss": 2.3213, + "step": 9835 + }, + { + "epoch": 0.5276824034334764, + "grad_norm": 0.498046875, + "learning_rate": 4.925558925993544e-06, + "loss": 2.4248, + "step": 9836 + }, + { + "epoch": 0.527736051502146, + "grad_norm": 0.58984375, + "learning_rate": 4.925537882079143e-06, + "loss": 2.1706, + "step": 9837 + }, + { + "epoch": 0.5277896995708155, + "grad_norm": 0.4453125, + "learning_rate": 4.925516835235654e-06, + "loss": 2.3978, + "step": 9838 + }, + { + "epoch": 0.5278433476394849, + "grad_norm": 0.59375, + "learning_rate": 4.925495785463102e-06, + "loss": 2.3476, + "step": 9839 + }, + { + "epoch": 0.5278969957081545, + "grad_norm": 0.419921875, + "learning_rate": 4.925474732761511e-06, + "loss": 2.3546, + "step": 9840 + }, + { + "epoch": 0.527950643776824, + "grad_norm": 0.412109375, + "learning_rate": 4.925453677130909e-06, + "loss": 2.1139, + "step": 9841 + }, + { + "epoch": 0.5280042918454936, + "grad_norm": 0.470703125, + "learning_rate": 4.925432618571319e-06, + "loss": 2.1379, + "step": 9842 + }, + { + "epoch": 0.528057939914163, + "grad_norm": 0.447265625, + "learning_rate": 4.9254115570827684e-06, + "loss": 2.3386, + "step": 9843 + }, + { + "epoch": 0.5281115879828326, + "grad_norm": 0.55859375, + "learning_rate": 4.925390492665281e-06, + "loss": 2.5367, + "step": 9844 + }, + { + "epoch": 0.5281652360515021, + "grad_norm": 0.453125, + "learning_rate": 4.925369425318883e-06, + "loss": 2.3296, + "step": 9845 + }, + { + "epoch": 0.5282188841201717, + "grad_norm": 0.498046875, + "learning_rate": 4.9253483550436e-06, + "loss": 2.0627, + "step": 9846 + }, + { + "epoch": 0.5282725321888412, + "grad_norm": 0.44140625, + "learning_rate": 4.925327281839457e-06, + "loss": 2.4964, + "step": 9847 + }, + { + "epoch": 0.5283261802575108, + "grad_norm": 0.44140625, + "learning_rate": 4.92530620570648e-06, + "loss": 2.3166, + "step": 9848 + }, + { + "epoch": 0.5283798283261802, + "grad_norm": 0.49609375, + "learning_rate": 4.925285126644694e-06, + "loss": 2.0721, + "step": 9849 + }, + { + "epoch": 0.5284334763948498, + "grad_norm": 0.451171875, + "learning_rate": 4.925264044654125e-06, + "loss": 2.38, + "step": 9850 + }, + { + "epoch": 0.5284871244635193, + "grad_norm": 0.380859375, + "learning_rate": 4.925242959734798e-06, + "loss": 2.4241, + "step": 9851 + }, + { + "epoch": 0.5285407725321889, + "grad_norm": 0.43359375, + "learning_rate": 4.925221871886738e-06, + "loss": 2.4042, + "step": 9852 + }, + { + "epoch": 0.5285944206008584, + "grad_norm": 0.41796875, + "learning_rate": 4.925200781109971e-06, + "loss": 2.3441, + "step": 9853 + }, + { + "epoch": 0.5286480686695278, + "grad_norm": 0.404296875, + "learning_rate": 4.925179687404523e-06, + "loss": 2.2827, + "step": 9854 + }, + { + "epoch": 0.5287017167381974, + "grad_norm": 0.78515625, + "learning_rate": 4.925158590770418e-06, + "loss": 2.221, + "step": 9855 + }, + { + "epoch": 0.5287553648068669, + "grad_norm": 0.39453125, + "learning_rate": 4.925137491207683e-06, + "loss": 2.4698, + "step": 9856 + }, + { + "epoch": 0.5288090128755365, + "grad_norm": 0.51171875, + "learning_rate": 4.925116388716344e-06, + "loss": 2.3985, + "step": 9857 + }, + { + "epoch": 0.528862660944206, + "grad_norm": 0.45703125, + "learning_rate": 4.925095283296423e-06, + "loss": 2.4751, + "step": 9858 + }, + { + "epoch": 0.5289163090128756, + "grad_norm": 0.486328125, + "learning_rate": 4.9250741749479495e-06, + "loss": 2.425, + "step": 9859 + }, + { + "epoch": 0.528969957081545, + "grad_norm": 0.427734375, + "learning_rate": 4.925053063670947e-06, + "loss": 2.5201, + "step": 9860 + }, + { + "epoch": 0.5290236051502146, + "grad_norm": 0.451171875, + "learning_rate": 4.925031949465441e-06, + "loss": 2.4645, + "step": 9861 + }, + { + "epoch": 0.5290772532188841, + "grad_norm": 0.458984375, + "learning_rate": 4.925010832331457e-06, + "loss": 1.914, + "step": 9862 + }, + { + "epoch": 0.5291309012875537, + "grad_norm": 0.94140625, + "learning_rate": 4.924989712269021e-06, + "loss": 2.1505, + "step": 9863 + }, + { + "epoch": 0.5291845493562232, + "grad_norm": 0.412109375, + "learning_rate": 4.9249685892781594e-06, + "loss": 2.1547, + "step": 9864 + }, + { + "epoch": 0.5292381974248928, + "grad_norm": 0.421875, + "learning_rate": 4.9249474633588955e-06, + "loss": 2.3176, + "step": 9865 + }, + { + "epoch": 0.5292918454935622, + "grad_norm": 0.365234375, + "learning_rate": 4.924926334511257e-06, + "loss": 2.1171, + "step": 9866 + }, + { + "epoch": 0.5293454935622317, + "grad_norm": 0.484375, + "learning_rate": 4.924905202735267e-06, + "loss": 2.3194, + "step": 9867 + }, + { + "epoch": 0.5293991416309013, + "grad_norm": 0.4296875, + "learning_rate": 4.924884068030954e-06, + "loss": 2.1493, + "step": 9868 + }, + { + "epoch": 0.5294527896995708, + "grad_norm": 0.48046875, + "learning_rate": 4.924862930398341e-06, + "loss": 2.2219, + "step": 9869 + }, + { + "epoch": 0.5295064377682404, + "grad_norm": 0.380859375, + "learning_rate": 4.924841789837454e-06, + "loss": 2.3321, + "step": 9870 + }, + { + "epoch": 0.5295600858369098, + "grad_norm": 0.40625, + "learning_rate": 4.92482064634832e-06, + "loss": 2.3697, + "step": 9871 + }, + { + "epoch": 0.5296137339055794, + "grad_norm": 1.6171875, + "learning_rate": 4.924799499930963e-06, + "loss": 2.4092, + "step": 9872 + }, + { + "epoch": 0.5296673819742489, + "grad_norm": 0.455078125, + "learning_rate": 4.9247783505854085e-06, + "loss": 2.2352, + "step": 9873 + }, + { + "epoch": 0.5297210300429185, + "grad_norm": 0.498046875, + "learning_rate": 4.924757198311683e-06, + "loss": 2.3343, + "step": 9874 + }, + { + "epoch": 0.529774678111588, + "grad_norm": 0.39453125, + "learning_rate": 4.924736043109813e-06, + "loss": 2.1132, + "step": 9875 + }, + { + "epoch": 0.5298283261802575, + "grad_norm": 0.41796875, + "learning_rate": 4.9247148849798204e-06, + "loss": 2.5157, + "step": 9876 + }, + { + "epoch": 0.529881974248927, + "grad_norm": 0.91796875, + "learning_rate": 4.924693723921734e-06, + "loss": 2.1891, + "step": 9877 + }, + { + "epoch": 0.5299356223175966, + "grad_norm": 0.494140625, + "learning_rate": 4.924672559935578e-06, + "loss": 2.2242, + "step": 9878 + }, + { + "epoch": 0.5299892703862661, + "grad_norm": 0.42578125, + "learning_rate": 4.924651393021379e-06, + "loss": 2.2013, + "step": 9879 + }, + { + "epoch": 0.5300429184549357, + "grad_norm": 0.41015625, + "learning_rate": 4.924630223179161e-06, + "loss": 2.5697, + "step": 9880 + }, + { + "epoch": 0.5300965665236052, + "grad_norm": 0.412109375, + "learning_rate": 4.9246090504089516e-06, + "loss": 2.2534, + "step": 9881 + }, + { + "epoch": 0.5301502145922746, + "grad_norm": 0.451171875, + "learning_rate": 4.924587874710774e-06, + "loss": 2.441, + "step": 9882 + }, + { + "epoch": 0.5302038626609442, + "grad_norm": 0.404296875, + "learning_rate": 4.9245666960846564e-06, + "loss": 2.3412, + "step": 9883 + }, + { + "epoch": 0.5302575107296137, + "grad_norm": 0.69140625, + "learning_rate": 4.924545514530622e-06, + "loss": 2.4537, + "step": 9884 + }, + { + "epoch": 0.5303111587982833, + "grad_norm": 0.4609375, + "learning_rate": 4.924524330048698e-06, + "loss": 2.0349, + "step": 9885 + }, + { + "epoch": 0.5303648068669528, + "grad_norm": 0.46875, + "learning_rate": 4.924503142638909e-06, + "loss": 2.1507, + "step": 9886 + }, + { + "epoch": 0.5304184549356223, + "grad_norm": 0.443359375, + "learning_rate": 4.92448195230128e-06, + "loss": 2.4971, + "step": 9887 + }, + { + "epoch": 0.5304721030042918, + "grad_norm": 0.40234375, + "learning_rate": 4.924460759035839e-06, + "loss": 2.3428, + "step": 9888 + }, + { + "epoch": 0.5305257510729614, + "grad_norm": 0.453125, + "learning_rate": 4.924439562842609e-06, + "loss": 2.517, + "step": 9889 + }, + { + "epoch": 0.5305793991416309, + "grad_norm": 0.43359375, + "learning_rate": 4.9244183637216175e-06, + "loss": 2.2079, + "step": 9890 + }, + { + "epoch": 0.5306330472103005, + "grad_norm": 0.40625, + "learning_rate": 4.924397161672889e-06, + "loss": 2.4883, + "step": 9891 + }, + { + "epoch": 0.5306866952789699, + "grad_norm": 0.453125, + "learning_rate": 4.9243759566964485e-06, + "loss": 2.4766, + "step": 9892 + }, + { + "epoch": 0.5307403433476395, + "grad_norm": 0.43359375, + "learning_rate": 4.9243547487923235e-06, + "loss": 2.2651, + "step": 9893 + }, + { + "epoch": 0.530793991416309, + "grad_norm": 0.42578125, + "learning_rate": 4.924333537960539e-06, + "loss": 2.3347, + "step": 9894 + }, + { + "epoch": 0.5308476394849786, + "grad_norm": 0.3828125, + "learning_rate": 4.92431232420112e-06, + "loss": 2.1201, + "step": 9895 + }, + { + "epoch": 0.5309012875536481, + "grad_norm": 0.46484375, + "learning_rate": 4.924291107514092e-06, + "loss": 2.4076, + "step": 9896 + }, + { + "epoch": 0.5309549356223175, + "grad_norm": 0.384765625, + "learning_rate": 4.924269887899481e-06, + "loss": 2.1707, + "step": 9897 + }, + { + "epoch": 0.5310085836909871, + "grad_norm": 0.39453125, + "learning_rate": 4.924248665357312e-06, + "loss": 2.2916, + "step": 9898 + }, + { + "epoch": 0.5310622317596566, + "grad_norm": 0.435546875, + "learning_rate": 4.924227439887612e-06, + "loss": 2.2651, + "step": 9899 + }, + { + "epoch": 0.5311158798283262, + "grad_norm": 0.423828125, + "learning_rate": 4.924206211490406e-06, + "loss": 2.6368, + "step": 9900 + }, + { + "epoch": 0.5311695278969957, + "grad_norm": 0.4296875, + "learning_rate": 4.924184980165719e-06, + "loss": 2.0271, + "step": 9901 + }, + { + "epoch": 0.5312231759656653, + "grad_norm": 0.482421875, + "learning_rate": 4.924163745913578e-06, + "loss": 2.3549, + "step": 9902 + }, + { + "epoch": 0.5312768240343347, + "grad_norm": 0.63671875, + "learning_rate": 4.924142508734007e-06, + "loss": 2.3033, + "step": 9903 + }, + { + "epoch": 0.5313304721030043, + "grad_norm": 0.490234375, + "learning_rate": 4.924121268627033e-06, + "loss": 2.3924, + "step": 9904 + }, + { + "epoch": 0.5313841201716738, + "grad_norm": 0.416015625, + "learning_rate": 4.9241000255926795e-06, + "loss": 2.3191, + "step": 9905 + }, + { + "epoch": 0.5314377682403434, + "grad_norm": 0.447265625, + "learning_rate": 4.924078779630975e-06, + "loss": 2.3555, + "step": 9906 + }, + { + "epoch": 0.5314914163090129, + "grad_norm": 0.45703125, + "learning_rate": 4.924057530741944e-06, + "loss": 2.2307, + "step": 9907 + }, + { + "epoch": 0.5315450643776825, + "grad_norm": 0.44921875, + "learning_rate": 4.924036278925612e-06, + "loss": 2.4231, + "step": 9908 + }, + { + "epoch": 0.5315987124463519, + "grad_norm": 0.44921875, + "learning_rate": 4.924015024182004e-06, + "loss": 2.3599, + "step": 9909 + }, + { + "epoch": 0.5316523605150214, + "grad_norm": 0.412109375, + "learning_rate": 4.923993766511147e-06, + "loss": 2.3221, + "step": 9910 + }, + { + "epoch": 0.531706008583691, + "grad_norm": 0.345703125, + "learning_rate": 4.923972505913066e-06, + "loss": 1.9992, + "step": 9911 + }, + { + "epoch": 0.5317596566523605, + "grad_norm": 0.515625, + "learning_rate": 4.923951242387787e-06, + "loss": 2.4311, + "step": 9912 + }, + { + "epoch": 0.5318133047210301, + "grad_norm": 0.375, + "learning_rate": 4.923929975935334e-06, + "loss": 1.7927, + "step": 9913 + }, + { + "epoch": 0.5318669527896995, + "grad_norm": 0.375, + "learning_rate": 4.923908706555735e-06, + "loss": 2.1848, + "step": 9914 + }, + { + "epoch": 0.5319206008583691, + "grad_norm": 0.45703125, + "learning_rate": 4.923887434249015e-06, + "loss": 2.0305, + "step": 9915 + }, + { + "epoch": 0.5319742489270386, + "grad_norm": 0.400390625, + "learning_rate": 4.923866159015199e-06, + "loss": 2.2049, + "step": 9916 + }, + { + "epoch": 0.5320278969957082, + "grad_norm": 0.66796875, + "learning_rate": 4.9238448808543136e-06, + "loss": 2.4896, + "step": 9917 + }, + { + "epoch": 0.5320815450643777, + "grad_norm": 0.353515625, + "learning_rate": 4.923823599766384e-06, + "loss": 2.0346, + "step": 9918 + }, + { + "epoch": 0.5321351931330472, + "grad_norm": 0.48046875, + "learning_rate": 4.923802315751436e-06, + "loss": 2.453, + "step": 9919 + }, + { + "epoch": 0.5321888412017167, + "grad_norm": 0.48046875, + "learning_rate": 4.923781028809495e-06, + "loss": 2.2709, + "step": 9920 + }, + { + "epoch": 0.5322424892703863, + "grad_norm": 0.412109375, + "learning_rate": 4.923759738940587e-06, + "loss": 2.2613, + "step": 9921 + }, + { + "epoch": 0.5322961373390558, + "grad_norm": 0.46484375, + "learning_rate": 4.923738446144738e-06, + "loss": 2.2554, + "step": 9922 + }, + { + "epoch": 0.5323497854077254, + "grad_norm": 0.423828125, + "learning_rate": 4.923717150421973e-06, + "loss": 2.318, + "step": 9923 + }, + { + "epoch": 0.5324034334763948, + "grad_norm": 0.392578125, + "learning_rate": 4.923695851772317e-06, + "loss": 2.0766, + "step": 9924 + }, + { + "epoch": 0.5324570815450643, + "grad_norm": 0.4765625, + "learning_rate": 4.923674550195799e-06, + "loss": 2.185, + "step": 9925 + }, + { + "epoch": 0.5325107296137339, + "grad_norm": 0.349609375, + "learning_rate": 4.9236532456924415e-06, + "loss": 2.206, + "step": 9926 + }, + { + "epoch": 0.5325643776824034, + "grad_norm": 0.4140625, + "learning_rate": 4.923631938262271e-06, + "loss": 2.1501, + "step": 9927 + }, + { + "epoch": 0.532618025751073, + "grad_norm": 0.490234375, + "learning_rate": 4.923610627905313e-06, + "loss": 2.3314, + "step": 9928 + }, + { + "epoch": 0.5326716738197425, + "grad_norm": 0.458984375, + "learning_rate": 4.923589314621595e-06, + "loss": 1.8071, + "step": 9929 + }, + { + "epoch": 0.532725321888412, + "grad_norm": 0.546875, + "learning_rate": 4.92356799841114e-06, + "loss": 2.1759, + "step": 9930 + }, + { + "epoch": 0.5327789699570815, + "grad_norm": 0.41796875, + "learning_rate": 4.923546679273977e-06, + "loss": 2.4524, + "step": 9931 + }, + { + "epoch": 0.5328326180257511, + "grad_norm": 0.453125, + "learning_rate": 4.923525357210129e-06, + "loss": 2.4837, + "step": 9932 + }, + { + "epoch": 0.5328862660944206, + "grad_norm": 1.1640625, + "learning_rate": 4.923504032219623e-06, + "loss": 2.1973, + "step": 9933 + }, + { + "epoch": 0.5329399141630902, + "grad_norm": 0.45703125, + "learning_rate": 4.923482704302483e-06, + "loss": 2.3016, + "step": 9934 + }, + { + "epoch": 0.5329935622317596, + "grad_norm": 0.4609375, + "learning_rate": 4.9234613734587386e-06, + "loss": 2.4056, + "step": 9935 + }, + { + "epoch": 0.5330472103004292, + "grad_norm": 0.423828125, + "learning_rate": 4.923440039688412e-06, + "loss": 2.3203, + "step": 9936 + }, + { + "epoch": 0.5331008583690987, + "grad_norm": 0.400390625, + "learning_rate": 4.92341870299153e-06, + "loss": 2.1552, + "step": 9937 + }, + { + "epoch": 0.5331545064377683, + "grad_norm": 0.40625, + "learning_rate": 4.923397363368118e-06, + "loss": 2.4151, + "step": 9938 + }, + { + "epoch": 0.5332081545064378, + "grad_norm": 0.49609375, + "learning_rate": 4.923376020818203e-06, + "loss": 2.4637, + "step": 9939 + }, + { + "epoch": 0.5332618025751072, + "grad_norm": 0.6328125, + "learning_rate": 4.92335467534181e-06, + "loss": 2.391, + "step": 9940 + }, + { + "epoch": 0.5333154506437768, + "grad_norm": 0.474609375, + "learning_rate": 4.923333326938965e-06, + "loss": 2.502, + "step": 9941 + }, + { + "epoch": 0.5333690987124463, + "grad_norm": 0.427734375, + "learning_rate": 4.9233119756096925e-06, + "loss": 2.3144, + "step": 9942 + }, + { + "epoch": 0.5334227467811159, + "grad_norm": 0.4609375, + "learning_rate": 4.9232906213540205e-06, + "loss": 2.3699, + "step": 9943 + }, + { + "epoch": 0.5334763948497854, + "grad_norm": 0.4609375, + "learning_rate": 4.923269264171973e-06, + "loss": 2.2802, + "step": 9944 + }, + { + "epoch": 0.533530042918455, + "grad_norm": 0.353515625, + "learning_rate": 4.923247904063577e-06, + "loss": 2.202, + "step": 9945 + }, + { + "epoch": 0.5335836909871244, + "grad_norm": 0.42578125, + "learning_rate": 4.923226541028857e-06, + "loss": 2.4551, + "step": 9946 + }, + { + "epoch": 0.533637339055794, + "grad_norm": 0.734375, + "learning_rate": 4.923205175067841e-06, + "loss": 2.2686, + "step": 9947 + }, + { + "epoch": 0.5336909871244635, + "grad_norm": 0.3671875, + "learning_rate": 4.923183806180552e-06, + "loss": 1.8833, + "step": 9948 + }, + { + "epoch": 0.5337446351931331, + "grad_norm": 3.578125, + "learning_rate": 4.923162434367017e-06, + "loss": 2.3553, + "step": 9949 + }, + { + "epoch": 0.5337982832618026, + "grad_norm": 0.427734375, + "learning_rate": 4.923141059627263e-06, + "loss": 2.353, + "step": 9950 + }, + { + "epoch": 0.5338519313304722, + "grad_norm": 0.46875, + "learning_rate": 4.923119681961314e-06, + "loss": 2.3074, + "step": 9951 + }, + { + "epoch": 0.5339055793991416, + "grad_norm": 0.462890625, + "learning_rate": 4.923098301369197e-06, + "loss": 2.338, + "step": 9952 + }, + { + "epoch": 0.5339592274678111, + "grad_norm": 0.462890625, + "learning_rate": 4.923076917850937e-06, + "loss": 2.4871, + "step": 9953 + }, + { + "epoch": 0.5340128755364807, + "grad_norm": 0.431640625, + "learning_rate": 4.923055531406561e-06, + "loss": 2.2549, + "step": 9954 + }, + { + "epoch": 0.5340665236051502, + "grad_norm": 0.4375, + "learning_rate": 4.9230341420360934e-06, + "loss": 2.3595, + "step": 9955 + }, + { + "epoch": 0.5341201716738198, + "grad_norm": 0.40234375, + "learning_rate": 4.923012749739561e-06, + "loss": 2.2699, + "step": 9956 + }, + { + "epoch": 0.5341738197424892, + "grad_norm": 0.392578125, + "learning_rate": 4.922991354516989e-06, + "loss": 2.3462, + "step": 9957 + }, + { + "epoch": 0.5342274678111588, + "grad_norm": 0.37109375, + "learning_rate": 4.922969956368404e-06, + "loss": 2.4502, + "step": 9958 + }, + { + "epoch": 0.5342811158798283, + "grad_norm": 0.451171875, + "learning_rate": 4.9229485552938315e-06, + "loss": 2.1964, + "step": 9959 + }, + { + "epoch": 0.5343347639484979, + "grad_norm": 0.423828125, + "learning_rate": 4.922927151293296e-06, + "loss": 2.25, + "step": 9960 + }, + { + "epoch": 0.5343884120171674, + "grad_norm": 0.494140625, + "learning_rate": 4.922905744366826e-06, + "loss": 2.2462, + "step": 9961 + }, + { + "epoch": 0.534442060085837, + "grad_norm": 1.4453125, + "learning_rate": 4.922884334514445e-06, + "loss": 2.4412, + "step": 9962 + }, + { + "epoch": 0.5344957081545064, + "grad_norm": 0.423828125, + "learning_rate": 4.9228629217361805e-06, + "loss": 2.227, + "step": 9963 + }, + { + "epoch": 0.534549356223176, + "grad_norm": 0.46875, + "learning_rate": 4.922841506032058e-06, + "loss": 2.3362, + "step": 9964 + }, + { + "epoch": 0.5346030042918455, + "grad_norm": 0.42578125, + "learning_rate": 4.922820087402102e-06, + "loss": 2.3556, + "step": 9965 + }, + { + "epoch": 0.5346566523605151, + "grad_norm": 0.419921875, + "learning_rate": 4.92279866584634e-06, + "loss": 2.4492, + "step": 9966 + }, + { + "epoch": 0.5347103004291845, + "grad_norm": 0.376953125, + "learning_rate": 4.922777241364796e-06, + "loss": 2.1604, + "step": 9967 + }, + { + "epoch": 0.534763948497854, + "grad_norm": 0.6640625, + "learning_rate": 4.922755813957499e-06, + "loss": 2.2486, + "step": 9968 + }, + { + "epoch": 0.5348175965665236, + "grad_norm": 0.34375, + "learning_rate": 4.922734383624473e-06, + "loss": 2.1133, + "step": 9969 + }, + { + "epoch": 0.5348712446351931, + "grad_norm": 0.466796875, + "learning_rate": 4.922712950365742e-06, + "loss": 2.3968, + "step": 9970 + }, + { + "epoch": 0.5349248927038627, + "grad_norm": 0.74609375, + "learning_rate": 4.922691514181334e-06, + "loss": 2.4334, + "step": 9971 + }, + { + "epoch": 0.5349785407725322, + "grad_norm": 1.15625, + "learning_rate": 4.922670075071276e-06, + "loss": 2.3688, + "step": 9972 + }, + { + "epoch": 0.5350321888412017, + "grad_norm": 0.3515625, + "learning_rate": 4.922648633035593e-06, + "loss": 2.1604, + "step": 9973 + }, + { + "epoch": 0.5350858369098712, + "grad_norm": 0.53515625, + "learning_rate": 4.9226271880743086e-06, + "loss": 2.3275, + "step": 9974 + }, + { + "epoch": 0.5351394849785408, + "grad_norm": 0.44140625, + "learning_rate": 4.922605740187452e-06, + "loss": 2.2388, + "step": 9975 + }, + { + "epoch": 0.5351931330472103, + "grad_norm": 0.423828125, + "learning_rate": 4.922584289375046e-06, + "loss": 2.0382, + "step": 9976 + }, + { + "epoch": 0.5352467811158799, + "grad_norm": 0.38671875, + "learning_rate": 4.92256283563712e-06, + "loss": 2.2806, + "step": 9977 + }, + { + "epoch": 0.5353004291845493, + "grad_norm": 0.48046875, + "learning_rate": 4.922541378973696e-06, + "loss": 2.2952, + "step": 9978 + }, + { + "epoch": 0.5353540772532189, + "grad_norm": 0.498046875, + "learning_rate": 4.9225199193848035e-06, + "loss": 2.2105, + "step": 9979 + }, + { + "epoch": 0.5354077253218884, + "grad_norm": 0.4453125, + "learning_rate": 4.922498456870467e-06, + "loss": 2.2988, + "step": 9980 + }, + { + "epoch": 0.535461373390558, + "grad_norm": 0.3984375, + "learning_rate": 4.922476991430711e-06, + "loss": 2.4805, + "step": 9981 + }, + { + "epoch": 0.5355150214592275, + "grad_norm": 0.6484375, + "learning_rate": 4.922455523065564e-06, + "loss": 2.2229, + "step": 9982 + }, + { + "epoch": 0.535568669527897, + "grad_norm": 0.384765625, + "learning_rate": 4.92243405177505e-06, + "loss": 2.1291, + "step": 9983 + }, + { + "epoch": 0.5356223175965665, + "grad_norm": 0.4765625, + "learning_rate": 4.922412577559196e-06, + "loss": 2.5047, + "step": 9984 + }, + { + "epoch": 0.535675965665236, + "grad_norm": 0.38671875, + "learning_rate": 4.922391100418027e-06, + "loss": 2.0155, + "step": 9985 + }, + { + "epoch": 0.5357296137339056, + "grad_norm": 0.45703125, + "learning_rate": 4.9223696203515695e-06, + "loss": 2.4586, + "step": 9986 + }, + { + "epoch": 0.5357832618025751, + "grad_norm": 0.5234375, + "learning_rate": 4.922348137359849e-06, + "loss": 2.3807, + "step": 9987 + }, + { + "epoch": 0.5358369098712447, + "grad_norm": 0.6484375, + "learning_rate": 4.922326651442893e-06, + "loss": 2.6271, + "step": 9988 + }, + { + "epoch": 0.5358905579399141, + "grad_norm": 0.3984375, + "learning_rate": 4.922305162600725e-06, + "loss": 2.0122, + "step": 9989 + }, + { + "epoch": 0.5359442060085837, + "grad_norm": 0.42578125, + "learning_rate": 4.922283670833373e-06, + "loss": 2.3232, + "step": 9990 + }, + { + "epoch": 0.5359978540772532, + "grad_norm": 0.4765625, + "learning_rate": 4.922262176140862e-06, + "loss": 2.3492, + "step": 9991 + }, + { + "epoch": 0.5360515021459228, + "grad_norm": 0.9921875, + "learning_rate": 4.922240678523218e-06, + "loss": 1.3479, + "step": 9992 + }, + { + "epoch": 0.5361051502145923, + "grad_norm": 0.419921875, + "learning_rate": 4.922219177980467e-06, + "loss": 2.2829, + "step": 9993 + }, + { + "epoch": 0.5361587982832619, + "grad_norm": 0.412109375, + "learning_rate": 4.922197674512635e-06, + "loss": 2.2576, + "step": 9994 + }, + { + "epoch": 0.5362124463519313, + "grad_norm": 0.427734375, + "learning_rate": 4.9221761681197485e-06, + "loss": 2.1258, + "step": 9995 + }, + { + "epoch": 0.5362660944206008, + "grad_norm": 0.400390625, + "learning_rate": 4.9221546588018324e-06, + "loss": 2.3063, + "step": 9996 + }, + { + "epoch": 0.5363197424892704, + "grad_norm": 0.47265625, + "learning_rate": 4.922133146558914e-06, + "loss": 2.1686, + "step": 9997 + }, + { + "epoch": 0.5363733905579399, + "grad_norm": 0.36328125, + "learning_rate": 4.922111631391018e-06, + "loss": 2.3098, + "step": 9998 + }, + { + "epoch": 0.5364270386266095, + "grad_norm": 0.359375, + "learning_rate": 4.92209011329817e-06, + "loss": 2.3359, + "step": 9999 + }, + { + "epoch": 0.5364806866952789, + "grad_norm": 0.50390625, + "learning_rate": 4.922068592280398e-06, + "loss": 2.5495, + "step": 10000 + }, + { + "epoch": 0.5365343347639485, + "grad_norm": 0.50390625, + "learning_rate": 4.922047068337727e-06, + "loss": 2.0196, + "step": 10001 + }, + { + "epoch": 0.536587982832618, + "grad_norm": 0.392578125, + "learning_rate": 4.922025541470182e-06, + "loss": 2.2614, + "step": 10002 + }, + { + "epoch": 0.5366416309012876, + "grad_norm": 0.462890625, + "learning_rate": 4.922004011677791e-06, + "loss": 2.4084, + "step": 10003 + }, + { + "epoch": 0.5366952789699571, + "grad_norm": 0.3984375, + "learning_rate": 4.921982478960578e-06, + "loss": 2.2045, + "step": 10004 + }, + { + "epoch": 0.5367489270386266, + "grad_norm": 0.443359375, + "learning_rate": 4.921960943318571e-06, + "loss": 2.3909, + "step": 10005 + }, + { + "epoch": 0.5368025751072961, + "grad_norm": 1.09375, + "learning_rate": 4.921939404751794e-06, + "loss": 2.1872, + "step": 10006 + }, + { + "epoch": 0.5368562231759657, + "grad_norm": 0.51953125, + "learning_rate": 4.921917863260274e-06, + "loss": 2.3067, + "step": 10007 + }, + { + "epoch": 0.5369098712446352, + "grad_norm": 0.4765625, + "learning_rate": 4.921896318844037e-06, + "loss": 2.2617, + "step": 10008 + }, + { + "epoch": 0.5369635193133048, + "grad_norm": 0.466796875, + "learning_rate": 4.921874771503109e-06, + "loss": 2.4773, + "step": 10009 + }, + { + "epoch": 0.5370171673819742, + "grad_norm": 0.46875, + "learning_rate": 4.9218532212375155e-06, + "loss": 2.5829, + "step": 10010 + }, + { + "epoch": 0.5370708154506437, + "grad_norm": 0.474609375, + "learning_rate": 4.921831668047283e-06, + "loss": 2.2248, + "step": 10011 + }, + { + "epoch": 0.5371244635193133, + "grad_norm": 0.451171875, + "learning_rate": 4.9218101119324376e-06, + "loss": 2.2907, + "step": 10012 + }, + { + "epoch": 0.5371781115879828, + "grad_norm": 0.94140625, + "learning_rate": 4.921788552893006e-06, + "loss": 2.2444, + "step": 10013 + }, + { + "epoch": 0.5372317596566524, + "grad_norm": 0.388671875, + "learning_rate": 4.921766990929013e-06, + "loss": 2.3121, + "step": 10014 + }, + { + "epoch": 0.5372854077253219, + "grad_norm": 0.42578125, + "learning_rate": 4.921745426040485e-06, + "loss": 2.2169, + "step": 10015 + }, + { + "epoch": 0.5373390557939914, + "grad_norm": 0.416015625, + "learning_rate": 4.9217238582274484e-06, + "loss": 2.3065, + "step": 10016 + }, + { + "epoch": 0.5373927038626609, + "grad_norm": 0.447265625, + "learning_rate": 4.921702287489928e-06, + "loss": 1.8805, + "step": 10017 + }, + { + "epoch": 0.5374463519313305, + "grad_norm": 0.408203125, + "learning_rate": 4.921680713827952e-06, + "loss": 2.1403, + "step": 10018 + }, + { + "epoch": 0.5375, + "grad_norm": 0.390625, + "learning_rate": 4.921659137241544e-06, + "loss": 2.2781, + "step": 10019 + }, + { + "epoch": 0.5375536480686696, + "grad_norm": 0.4140625, + "learning_rate": 4.921637557730733e-06, + "loss": 2.3227, + "step": 10020 + }, + { + "epoch": 0.537607296137339, + "grad_norm": 0.54296875, + "learning_rate": 4.921615975295543e-06, + "loss": 2.2092, + "step": 10021 + }, + { + "epoch": 0.5376609442060086, + "grad_norm": 0.50390625, + "learning_rate": 4.921594389936e-06, + "loss": 2.2828, + "step": 10022 + }, + { + "epoch": 0.5377145922746781, + "grad_norm": 0.4609375, + "learning_rate": 4.92157280165213e-06, + "loss": 2.1675, + "step": 10023 + }, + { + "epoch": 0.5377682403433477, + "grad_norm": 0.455078125, + "learning_rate": 4.921551210443961e-06, + "loss": 2.2973, + "step": 10024 + }, + { + "epoch": 0.5378218884120172, + "grad_norm": 0.4921875, + "learning_rate": 4.921529616311517e-06, + "loss": 2.106, + "step": 10025 + }, + { + "epoch": 0.5378755364806866, + "grad_norm": 0.3359375, + "learning_rate": 4.9215080192548246e-06, + "loss": 2.1998, + "step": 10026 + }, + { + "epoch": 0.5379291845493562, + "grad_norm": 0.419921875, + "learning_rate": 4.9214864192739095e-06, + "loss": 2.123, + "step": 10027 + }, + { + "epoch": 0.5379828326180257, + "grad_norm": 0.55078125, + "learning_rate": 4.9214648163687995e-06, + "loss": 2.5795, + "step": 10028 + }, + { + "epoch": 0.5380364806866953, + "grad_norm": 0.439453125, + "learning_rate": 4.921443210539519e-06, + "loss": 2.0183, + "step": 10029 + }, + { + "epoch": 0.5380901287553648, + "grad_norm": 0.37109375, + "learning_rate": 4.921421601786095e-06, + "loss": 2.3056, + "step": 10030 + }, + { + "epoch": 0.5381437768240344, + "grad_norm": 0.478515625, + "learning_rate": 4.9213999901085525e-06, + "loss": 2.5013, + "step": 10031 + }, + { + "epoch": 0.5381974248927038, + "grad_norm": 0.40625, + "learning_rate": 4.921378375506919e-06, + "loss": 2.2297, + "step": 10032 + }, + { + "epoch": 0.5382510729613734, + "grad_norm": 0.4296875, + "learning_rate": 4.92135675798122e-06, + "loss": 2.008, + "step": 10033 + }, + { + "epoch": 0.5383047210300429, + "grad_norm": 0.486328125, + "learning_rate": 4.92133513753148e-06, + "loss": 2.311, + "step": 10034 + }, + { + "epoch": 0.5383583690987125, + "grad_norm": 0.5390625, + "learning_rate": 4.921313514157727e-06, + "loss": 2.2971, + "step": 10035 + }, + { + "epoch": 0.538412017167382, + "grad_norm": 0.431640625, + "learning_rate": 4.921291887859988e-06, + "loss": 2.2749, + "step": 10036 + }, + { + "epoch": 0.5384656652360515, + "grad_norm": 0.4375, + "learning_rate": 4.921270258638287e-06, + "loss": 2.1596, + "step": 10037 + }, + { + "epoch": 0.538519313304721, + "grad_norm": 0.43359375, + "learning_rate": 4.921248626492651e-06, + "loss": 2.3184, + "step": 10038 + }, + { + "epoch": 0.5385729613733906, + "grad_norm": 0.416015625, + "learning_rate": 4.921226991423106e-06, + "loss": 2.3577, + "step": 10039 + }, + { + "epoch": 0.5386266094420601, + "grad_norm": 0.478515625, + "learning_rate": 4.9212053534296785e-06, + "loss": 2.3138, + "step": 10040 + }, + { + "epoch": 0.5386802575107296, + "grad_norm": 0.44140625, + "learning_rate": 4.921183712512395e-06, + "loss": 2.1248, + "step": 10041 + }, + { + "epoch": 0.5387339055793992, + "grad_norm": 0.447265625, + "learning_rate": 4.92116206867128e-06, + "loss": 2.2984, + "step": 10042 + }, + { + "epoch": 0.5387875536480686, + "grad_norm": 0.404296875, + "learning_rate": 4.921140421906361e-06, + "loss": 2.2129, + "step": 10043 + }, + { + "epoch": 0.5388412017167382, + "grad_norm": 0.8046875, + "learning_rate": 4.921118772217663e-06, + "loss": 2.4248, + "step": 10044 + }, + { + "epoch": 0.5388948497854077, + "grad_norm": 0.49609375, + "learning_rate": 4.921097119605214e-06, + "loss": 1.7029, + "step": 10045 + }, + { + "epoch": 0.5389484978540773, + "grad_norm": 0.5859375, + "learning_rate": 4.921075464069038e-06, + "loss": 2.3957, + "step": 10046 + }, + { + "epoch": 0.5390021459227468, + "grad_norm": 0.51171875, + "learning_rate": 4.9210538056091635e-06, + "loss": 2.5276, + "step": 10047 + }, + { + "epoch": 0.5390557939914163, + "grad_norm": 0.41015625, + "learning_rate": 4.9210321442256145e-06, + "loss": 2.2477, + "step": 10048 + }, + { + "epoch": 0.5391094420600858, + "grad_norm": 0.79296875, + "learning_rate": 4.921010479918418e-06, + "loss": 1.842, + "step": 10049 + }, + { + "epoch": 0.5391630901287554, + "grad_norm": 0.5, + "learning_rate": 4.920988812687601e-06, + "loss": 2.648, + "step": 10050 + }, + { + "epoch": 0.5392167381974249, + "grad_norm": 0.37890625, + "learning_rate": 4.920967142533187e-06, + "loss": 2.1015, + "step": 10051 + }, + { + "epoch": 0.5392703862660945, + "grad_norm": 0.578125, + "learning_rate": 4.920945469455206e-06, + "loss": 2.3495, + "step": 10052 + }, + { + "epoch": 0.539324034334764, + "grad_norm": 0.404296875, + "learning_rate": 4.920923793453681e-06, + "loss": 2.4337, + "step": 10053 + }, + { + "epoch": 0.5393776824034334, + "grad_norm": 0.52734375, + "learning_rate": 4.92090211452864e-06, + "loss": 2.108, + "step": 10054 + }, + { + "epoch": 0.539431330472103, + "grad_norm": 0.47265625, + "learning_rate": 4.920880432680107e-06, + "loss": 2.4204, + "step": 10055 + }, + { + "epoch": 0.5394849785407725, + "grad_norm": 0.35546875, + "learning_rate": 4.920858747908112e-06, + "loss": 2.2159, + "step": 10056 + }, + { + "epoch": 0.5395386266094421, + "grad_norm": 0.42578125, + "learning_rate": 4.920837060212678e-06, + "loss": 2.2511, + "step": 10057 + }, + { + "epoch": 0.5395922746781115, + "grad_norm": 0.373046875, + "learning_rate": 4.920815369593832e-06, + "loss": 2.1048, + "step": 10058 + }, + { + "epoch": 0.5396459227467811, + "grad_norm": 0.48828125, + "learning_rate": 4.9207936760515994e-06, + "loss": 2.3037, + "step": 10059 + }, + { + "epoch": 0.5396995708154506, + "grad_norm": 0.486328125, + "learning_rate": 4.920771979586008e-06, + "loss": 2.3603, + "step": 10060 + }, + { + "epoch": 0.5397532188841202, + "grad_norm": 0.72265625, + "learning_rate": 4.920750280197084e-06, + "loss": 2.4734, + "step": 10061 + }, + { + "epoch": 0.5398068669527897, + "grad_norm": 0.48828125, + "learning_rate": 4.920728577884852e-06, + "loss": 2.3646, + "step": 10062 + }, + { + "epoch": 0.5398605150214593, + "grad_norm": 0.419921875, + "learning_rate": 4.920706872649339e-06, + "loss": 2.1325, + "step": 10063 + }, + { + "epoch": 0.5399141630901287, + "grad_norm": 0.404296875, + "learning_rate": 4.920685164490572e-06, + "loss": 2.3592, + "step": 10064 + }, + { + "epoch": 0.5399678111587983, + "grad_norm": 0.412109375, + "learning_rate": 4.920663453408576e-06, + "loss": 2.2217, + "step": 10065 + }, + { + "epoch": 0.5400214592274678, + "grad_norm": 0.4296875, + "learning_rate": 4.920641739403378e-06, + "loss": 2.3633, + "step": 10066 + }, + { + "epoch": 0.5400751072961374, + "grad_norm": 0.494140625, + "learning_rate": 4.920620022475004e-06, + "loss": 2.2654, + "step": 10067 + }, + { + "epoch": 0.5401287553648069, + "grad_norm": 0.42578125, + "learning_rate": 4.9205983026234806e-06, + "loss": 2.3933, + "step": 10068 + }, + { + "epoch": 0.5401824034334763, + "grad_norm": 0.38671875, + "learning_rate": 4.920576579848833e-06, + "loss": 2.2828, + "step": 10069 + }, + { + "epoch": 0.5402360515021459, + "grad_norm": 0.578125, + "learning_rate": 4.920554854151088e-06, + "loss": 2.2971, + "step": 10070 + }, + { + "epoch": 0.5402896995708154, + "grad_norm": 0.46875, + "learning_rate": 4.920533125530272e-06, + "loss": 2.2938, + "step": 10071 + }, + { + "epoch": 0.540343347639485, + "grad_norm": 0.462890625, + "learning_rate": 4.920511393986411e-06, + "loss": 2.4649, + "step": 10072 + }, + { + "epoch": 0.5403969957081545, + "grad_norm": 0.4765625, + "learning_rate": 4.9204896595195315e-06, + "loss": 2.281, + "step": 10073 + }, + { + "epoch": 0.5404506437768241, + "grad_norm": 0.44140625, + "learning_rate": 4.92046792212966e-06, + "loss": 2.1257, + "step": 10074 + }, + { + "epoch": 0.5405042918454935, + "grad_norm": 0.390625, + "learning_rate": 4.920446181816821e-06, + "loss": 2.4108, + "step": 10075 + }, + { + "epoch": 0.5405579399141631, + "grad_norm": 0.44140625, + "learning_rate": 4.920424438581044e-06, + "loss": 2.2979, + "step": 10076 + }, + { + "epoch": 0.5406115879828326, + "grad_norm": 0.41796875, + "learning_rate": 4.920402692422352e-06, + "loss": 2.222, + "step": 10077 + }, + { + "epoch": 0.5406652360515022, + "grad_norm": 0.51171875, + "learning_rate": 4.920380943340774e-06, + "loss": 2.3455, + "step": 10078 + }, + { + "epoch": 0.5407188841201717, + "grad_norm": 0.3359375, + "learning_rate": 4.920359191336333e-06, + "loss": 2.0294, + "step": 10079 + }, + { + "epoch": 0.5407725321888412, + "grad_norm": 0.65234375, + "learning_rate": 4.920337436409059e-06, + "loss": 2.2903, + "step": 10080 + }, + { + "epoch": 0.5408261802575107, + "grad_norm": 0.71484375, + "learning_rate": 4.920315678558975e-06, + "loss": 2.3678, + "step": 10081 + }, + { + "epoch": 0.5408798283261803, + "grad_norm": 0.44140625, + "learning_rate": 4.92029391778611e-06, + "loss": 2.2144, + "step": 10082 + }, + { + "epoch": 0.5409334763948498, + "grad_norm": 0.46484375, + "learning_rate": 4.920272154090488e-06, + "loss": 2.2531, + "step": 10083 + }, + { + "epoch": 0.5409871244635193, + "grad_norm": 0.416015625, + "learning_rate": 4.920250387472137e-06, + "loss": 2.1894, + "step": 10084 + }, + { + "epoch": 0.5410407725321889, + "grad_norm": 0.458984375, + "learning_rate": 4.920228617931082e-06, + "loss": 2.396, + "step": 10085 + }, + { + "epoch": 0.5410944206008583, + "grad_norm": 0.46875, + "learning_rate": 4.9202068454673505e-06, + "loss": 2.1505, + "step": 10086 + }, + { + "epoch": 0.5411480686695279, + "grad_norm": 0.40625, + "learning_rate": 4.920185070080967e-06, + "loss": 2.2435, + "step": 10087 + }, + { + "epoch": 0.5412017167381974, + "grad_norm": 0.423828125, + "learning_rate": 4.92016329177196e-06, + "loss": 2.2449, + "step": 10088 + }, + { + "epoch": 0.541255364806867, + "grad_norm": 0.8359375, + "learning_rate": 4.920141510540354e-06, + "loss": 2.222, + "step": 10089 + }, + { + "epoch": 0.5413090128755365, + "grad_norm": 0.435546875, + "learning_rate": 4.920119726386177e-06, + "loss": 1.6362, + "step": 10090 + }, + { + "epoch": 0.541362660944206, + "grad_norm": 0.466796875, + "learning_rate": 4.920097939309454e-06, + "loss": 2.4392, + "step": 10091 + }, + { + "epoch": 0.5414163090128755, + "grad_norm": 0.4296875, + "learning_rate": 4.920076149310211e-06, + "loss": 2.1255, + "step": 10092 + }, + { + "epoch": 0.5414699570815451, + "grad_norm": 0.4609375, + "learning_rate": 4.920054356388475e-06, + "loss": 2.4702, + "step": 10093 + }, + { + "epoch": 0.5415236051502146, + "grad_norm": 0.5390625, + "learning_rate": 4.920032560544274e-06, + "loss": 2.2034, + "step": 10094 + }, + { + "epoch": 0.5415772532188842, + "grad_norm": 0.400390625, + "learning_rate": 4.92001076177763e-06, + "loss": 2.5093, + "step": 10095 + }, + { + "epoch": 0.5416309012875536, + "grad_norm": 0.44140625, + "learning_rate": 4.919988960088574e-06, + "loss": 2.5057, + "step": 10096 + }, + { + "epoch": 0.5416845493562231, + "grad_norm": 0.46484375, + "learning_rate": 4.919967155477129e-06, + "loss": 2.2028, + "step": 10097 + }, + { + "epoch": 0.5417381974248927, + "grad_norm": 0.380859375, + "learning_rate": 4.919945347943323e-06, + "loss": 2.2103, + "step": 10098 + }, + { + "epoch": 0.5417918454935622, + "grad_norm": 0.384765625, + "learning_rate": 4.919923537487182e-06, + "loss": 2.1766, + "step": 10099 + }, + { + "epoch": 0.5418454935622318, + "grad_norm": 0.3828125, + "learning_rate": 4.919901724108734e-06, + "loss": 2.3393, + "step": 10100 + }, + { + "epoch": 0.5418991416309012, + "grad_norm": 0.40625, + "learning_rate": 4.919879907808001e-06, + "loss": 2.1602, + "step": 10101 + }, + { + "epoch": 0.5419527896995708, + "grad_norm": 0.4140625, + "learning_rate": 4.919858088585012e-06, + "loss": 2.2466, + "step": 10102 + }, + { + "epoch": 0.5420064377682403, + "grad_norm": 0.404296875, + "learning_rate": 4.919836266439795e-06, + "loss": 2.1679, + "step": 10103 + }, + { + "epoch": 0.5420600858369099, + "grad_norm": 0.48828125, + "learning_rate": 4.9198144413723735e-06, + "loss": 2.4745, + "step": 10104 + }, + { + "epoch": 0.5421137339055794, + "grad_norm": 0.58203125, + "learning_rate": 4.919792613382776e-06, + "loss": 2.2847, + "step": 10105 + }, + { + "epoch": 0.542167381974249, + "grad_norm": 0.357421875, + "learning_rate": 4.9197707824710265e-06, + "loss": 2.4205, + "step": 10106 + }, + { + "epoch": 0.5422210300429184, + "grad_norm": 0.41796875, + "learning_rate": 4.919748948637153e-06, + "loss": 2.5977, + "step": 10107 + }, + { + "epoch": 0.542274678111588, + "grad_norm": 0.412109375, + "learning_rate": 4.919727111881183e-06, + "loss": 2.4585, + "step": 10108 + }, + { + "epoch": 0.5423283261802575, + "grad_norm": 0.51171875, + "learning_rate": 4.91970527220314e-06, + "loss": 2.4405, + "step": 10109 + }, + { + "epoch": 0.5423819742489271, + "grad_norm": 0.42578125, + "learning_rate": 4.919683429603052e-06, + "loss": 2.3865, + "step": 10110 + }, + { + "epoch": 0.5424356223175966, + "grad_norm": 0.435546875, + "learning_rate": 4.919661584080946e-06, + "loss": 2.1461, + "step": 10111 + }, + { + "epoch": 0.542489270386266, + "grad_norm": 0.734375, + "learning_rate": 4.919639735636848e-06, + "loss": 2.1581, + "step": 10112 + }, + { + "epoch": 0.5425429184549356, + "grad_norm": 0.33203125, + "learning_rate": 4.919617884270782e-06, + "loss": 2.0338, + "step": 10113 + }, + { + "epoch": 0.5425965665236051, + "grad_norm": 0.41015625, + "learning_rate": 4.919596029982779e-06, + "loss": 2.2717, + "step": 10114 + }, + { + "epoch": 0.5426502145922747, + "grad_norm": 0.4765625, + "learning_rate": 4.919574172772861e-06, + "loss": 2.2559, + "step": 10115 + }, + { + "epoch": 0.5427038626609442, + "grad_norm": 0.46875, + "learning_rate": 4.919552312641056e-06, + "loss": 2.0707, + "step": 10116 + }, + { + "epoch": 0.5427575107296138, + "grad_norm": 0.421875, + "learning_rate": 4.919530449587391e-06, + "loss": 2.4284, + "step": 10117 + }, + { + "epoch": 0.5428111587982832, + "grad_norm": 0.419921875, + "learning_rate": 4.919508583611892e-06, + "loss": 2.0386, + "step": 10118 + }, + { + "epoch": 0.5428648068669528, + "grad_norm": 0.42578125, + "learning_rate": 4.919486714714585e-06, + "loss": 2.1845, + "step": 10119 + }, + { + "epoch": 0.5429184549356223, + "grad_norm": 0.404296875, + "learning_rate": 4.919464842895497e-06, + "loss": 2.0033, + "step": 10120 + }, + { + "epoch": 0.5429721030042919, + "grad_norm": 0.435546875, + "learning_rate": 4.919442968154655e-06, + "loss": 2.2771, + "step": 10121 + }, + { + "epoch": 0.5430257510729614, + "grad_norm": 0.400390625, + "learning_rate": 4.919421090492084e-06, + "loss": 2.1587, + "step": 10122 + }, + { + "epoch": 0.543079399141631, + "grad_norm": 0.50390625, + "learning_rate": 4.919399209907811e-06, + "loss": 2.2916, + "step": 10123 + }, + { + "epoch": 0.5431330472103004, + "grad_norm": 0.55859375, + "learning_rate": 4.919377326401862e-06, + "loss": 2.2104, + "step": 10124 + }, + { + "epoch": 0.54318669527897, + "grad_norm": 0.466796875, + "learning_rate": 4.919355439974265e-06, + "loss": 2.4104, + "step": 10125 + }, + { + "epoch": 0.5432403433476395, + "grad_norm": 0.66796875, + "learning_rate": 4.919333550625044e-06, + "loss": 2.5376, + "step": 10126 + }, + { + "epoch": 0.543293991416309, + "grad_norm": 0.5078125, + "learning_rate": 4.919311658354228e-06, + "loss": 2.4232, + "step": 10127 + }, + { + "epoch": 0.5433476394849786, + "grad_norm": 0.40234375, + "learning_rate": 4.919289763161841e-06, + "loss": 1.7676, + "step": 10128 + }, + { + "epoch": 0.543401287553648, + "grad_norm": 0.43359375, + "learning_rate": 4.919267865047911e-06, + "loss": 2.3589, + "step": 10129 + }, + { + "epoch": 0.5434549356223176, + "grad_norm": 0.388671875, + "learning_rate": 4.919245964012464e-06, + "loss": 2.284, + "step": 10130 + }, + { + "epoch": 0.5435085836909871, + "grad_norm": 0.462890625, + "learning_rate": 4.919224060055527e-06, + "loss": 2.3483, + "step": 10131 + }, + { + "epoch": 0.5435622317596567, + "grad_norm": 0.451171875, + "learning_rate": 4.9192021531771255e-06, + "loss": 2.4118, + "step": 10132 + }, + { + "epoch": 0.5436158798283262, + "grad_norm": 0.494140625, + "learning_rate": 4.919180243377286e-06, + "loss": 2.2642, + "step": 10133 + }, + { + "epoch": 0.5436695278969957, + "grad_norm": 0.33203125, + "learning_rate": 4.919158330656037e-06, + "loss": 2.0956, + "step": 10134 + }, + { + "epoch": 0.5437231759656652, + "grad_norm": 1.046875, + "learning_rate": 4.9191364150134014e-06, + "loss": 2.3875, + "step": 10135 + }, + { + "epoch": 0.5437768240343348, + "grad_norm": 0.4296875, + "learning_rate": 4.919114496449409e-06, + "loss": 2.2892, + "step": 10136 + }, + { + "epoch": 0.5438304721030043, + "grad_norm": 0.390625, + "learning_rate": 4.919092574964084e-06, + "loss": 2.0849, + "step": 10137 + }, + { + "epoch": 0.5438841201716739, + "grad_norm": 0.486328125, + "learning_rate": 4.9190706505574545e-06, + "loss": 2.3678, + "step": 10138 + }, + { + "epoch": 0.5439377682403433, + "grad_norm": 0.44921875, + "learning_rate": 4.9190487232295456e-06, + "loss": 2.3053, + "step": 10139 + }, + { + "epoch": 0.5439914163090128, + "grad_norm": 0.435546875, + "learning_rate": 4.919026792980385e-06, + "loss": 2.4208, + "step": 10140 + }, + { + "epoch": 0.5440450643776824, + "grad_norm": 0.37890625, + "learning_rate": 4.919004859809998e-06, + "loss": 2.5547, + "step": 10141 + }, + { + "epoch": 0.5440987124463519, + "grad_norm": 0.53125, + "learning_rate": 4.918982923718412e-06, + "loss": 2.1805, + "step": 10142 + }, + { + "epoch": 0.5441523605150215, + "grad_norm": 0.466796875, + "learning_rate": 4.918960984705653e-06, + "loss": 2.2761, + "step": 10143 + }, + { + "epoch": 0.544206008583691, + "grad_norm": 0.640625, + "learning_rate": 4.918939042771747e-06, + "loss": 2.3511, + "step": 10144 + }, + { + "epoch": 0.5442596566523605, + "grad_norm": 0.4140625, + "learning_rate": 4.918917097916722e-06, + "loss": 2.1664, + "step": 10145 + }, + { + "epoch": 0.54431330472103, + "grad_norm": 0.67578125, + "learning_rate": 4.918895150140603e-06, + "loss": 2.3157, + "step": 10146 + }, + { + "epoch": 0.5443669527896996, + "grad_norm": 0.51171875, + "learning_rate": 4.918873199443418e-06, + "loss": 1.8681, + "step": 10147 + }, + { + "epoch": 0.5444206008583691, + "grad_norm": 12.875, + "learning_rate": 4.918851245825193e-06, + "loss": 2.2125, + "step": 10148 + }, + { + "epoch": 0.5444742489270387, + "grad_norm": 0.408203125, + "learning_rate": 4.9188292892859525e-06, + "loss": 2.3037, + "step": 10149 + }, + { + "epoch": 0.5445278969957081, + "grad_norm": 0.439453125, + "learning_rate": 4.918807329825726e-06, + "loss": 2.3232, + "step": 10150 + }, + { + "epoch": 0.5445815450643777, + "grad_norm": 0.40234375, + "learning_rate": 4.918785367444538e-06, + "loss": 2.4529, + "step": 10151 + }, + { + "epoch": 0.5446351931330472, + "grad_norm": 0.474609375, + "learning_rate": 4.918763402142416e-06, + "loss": 2.364, + "step": 10152 + }, + { + "epoch": 0.5446888412017168, + "grad_norm": 0.458984375, + "learning_rate": 4.918741433919386e-06, + "loss": 2.2149, + "step": 10153 + }, + { + "epoch": 0.5447424892703863, + "grad_norm": 0.408203125, + "learning_rate": 4.918719462775475e-06, + "loss": 2.4452, + "step": 10154 + }, + { + "epoch": 0.5447961373390557, + "grad_norm": 0.375, + "learning_rate": 4.91869748871071e-06, + "loss": 2.1509, + "step": 10155 + }, + { + "epoch": 0.5448497854077253, + "grad_norm": 0.421875, + "learning_rate": 4.918675511725116e-06, + "loss": 2.4653, + "step": 10156 + }, + { + "epoch": 0.5449034334763948, + "grad_norm": 1.328125, + "learning_rate": 4.918653531818721e-06, + "loss": 2.2838, + "step": 10157 + }, + { + "epoch": 0.5449570815450644, + "grad_norm": 0.82421875, + "learning_rate": 4.918631548991551e-06, + "loss": 2.2264, + "step": 10158 + }, + { + "epoch": 0.5450107296137339, + "grad_norm": 0.4140625, + "learning_rate": 4.918609563243631e-06, + "loss": 2.0218, + "step": 10159 + }, + { + "epoch": 0.5450643776824035, + "grad_norm": 0.421875, + "learning_rate": 4.91858757457499e-06, + "loss": 2.2962, + "step": 10160 + }, + { + "epoch": 0.5451180257510729, + "grad_norm": 0.44921875, + "learning_rate": 4.918565582985654e-06, + "loss": 2.1493, + "step": 10161 + }, + { + "epoch": 0.5451716738197425, + "grad_norm": 0.451171875, + "learning_rate": 4.9185435884756485e-06, + "loss": 2.3404, + "step": 10162 + }, + { + "epoch": 0.545225321888412, + "grad_norm": 0.388671875, + "learning_rate": 4.918521591045002e-06, + "loss": 2.0654, + "step": 10163 + }, + { + "epoch": 0.5452789699570816, + "grad_norm": 0.4453125, + "learning_rate": 4.918499590693738e-06, + "loss": 2.363, + "step": 10164 + }, + { + "epoch": 0.5453326180257511, + "grad_norm": 0.462890625, + "learning_rate": 4.918477587421885e-06, + "loss": 1.9739, + "step": 10165 + }, + { + "epoch": 0.5453862660944206, + "grad_norm": 0.376953125, + "learning_rate": 4.91845558122947e-06, + "loss": 2.4167, + "step": 10166 + }, + { + "epoch": 0.5454399141630901, + "grad_norm": 0.4140625, + "learning_rate": 4.918433572116519e-06, + "loss": 2.4655, + "step": 10167 + }, + { + "epoch": 0.5454935622317597, + "grad_norm": 0.6015625, + "learning_rate": 4.918411560083058e-06, + "loss": 1.615, + "step": 10168 + }, + { + "epoch": 0.5455472103004292, + "grad_norm": 0.435546875, + "learning_rate": 4.918389545129116e-06, + "loss": 2.3658, + "step": 10169 + }, + { + "epoch": 0.5456008583690987, + "grad_norm": 0.41796875, + "learning_rate": 4.918367527254716e-06, + "loss": 2.2719, + "step": 10170 + }, + { + "epoch": 0.5456545064377682, + "grad_norm": 0.80859375, + "learning_rate": 4.918345506459886e-06, + "loss": 2.2568, + "step": 10171 + }, + { + "epoch": 0.5457081545064377, + "grad_norm": 0.365234375, + "learning_rate": 4.918323482744653e-06, + "loss": 2.2493, + "step": 10172 + }, + { + "epoch": 0.5457618025751073, + "grad_norm": 0.5078125, + "learning_rate": 4.918301456109045e-06, + "loss": 2.3739, + "step": 10173 + }, + { + "epoch": 0.5458154506437768, + "grad_norm": 0.54296875, + "learning_rate": 4.918279426553087e-06, + "loss": 2.2207, + "step": 10174 + }, + { + "epoch": 0.5458690987124464, + "grad_norm": 0.5234375, + "learning_rate": 4.918257394076804e-06, + "loss": 2.2448, + "step": 10175 + }, + { + "epoch": 0.5459227467811159, + "grad_norm": 0.419921875, + "learning_rate": 4.918235358680225e-06, + "loss": 2.3899, + "step": 10176 + }, + { + "epoch": 0.5459763948497854, + "grad_norm": 0.423828125, + "learning_rate": 4.918213320363376e-06, + "loss": 2.0611, + "step": 10177 + }, + { + "epoch": 0.5460300429184549, + "grad_norm": 0.443359375, + "learning_rate": 4.918191279126283e-06, + "loss": 2.4359, + "step": 10178 + }, + { + "epoch": 0.5460836909871245, + "grad_norm": 0.412109375, + "learning_rate": 4.918169234968975e-06, + "loss": 2.3437, + "step": 10179 + }, + { + "epoch": 0.546137339055794, + "grad_norm": 0.470703125, + "learning_rate": 4.918147187891476e-06, + "loss": 2.0128, + "step": 10180 + }, + { + "epoch": 0.5461909871244636, + "grad_norm": 0.400390625, + "learning_rate": 4.9181251378938125e-06, + "loss": 2.2897, + "step": 10181 + }, + { + "epoch": 0.546244635193133, + "grad_norm": 0.4609375, + "learning_rate": 4.918103084976012e-06, + "loss": 2.1618, + "step": 10182 + }, + { + "epoch": 0.5462982832618025, + "grad_norm": 0.455078125, + "learning_rate": 4.918081029138101e-06, + "loss": 2.2537, + "step": 10183 + }, + { + "epoch": 0.5463519313304721, + "grad_norm": 0.498046875, + "learning_rate": 4.918058970380107e-06, + "loss": 2.3801, + "step": 10184 + }, + { + "epoch": 0.5464055793991416, + "grad_norm": 0.40625, + "learning_rate": 4.918036908702056e-06, + "loss": 2.1974, + "step": 10185 + }, + { + "epoch": 0.5464592274678112, + "grad_norm": 0.4921875, + "learning_rate": 4.918014844103974e-06, + "loss": 1.8793, + "step": 10186 + }, + { + "epoch": 0.5465128755364806, + "grad_norm": 0.3203125, + "learning_rate": 4.9179927765858895e-06, + "loss": 2.1495, + "step": 10187 + }, + { + "epoch": 0.5465665236051502, + "grad_norm": 0.3984375, + "learning_rate": 4.917970706147826e-06, + "loss": 2.3742, + "step": 10188 + }, + { + "epoch": 0.5466201716738197, + "grad_norm": 0.431640625, + "learning_rate": 4.917948632789814e-06, + "loss": 2.2658, + "step": 10189 + }, + { + "epoch": 0.5466738197424893, + "grad_norm": 0.404296875, + "learning_rate": 4.917926556511877e-06, + "loss": 2.3894, + "step": 10190 + }, + { + "epoch": 0.5467274678111588, + "grad_norm": 0.419921875, + "learning_rate": 4.9179044773140425e-06, + "loss": 1.6453, + "step": 10191 + }, + { + "epoch": 0.5467811158798284, + "grad_norm": 0.376953125, + "learning_rate": 4.917882395196339e-06, + "loss": 2.2102, + "step": 10192 + }, + { + "epoch": 0.5468347639484978, + "grad_norm": 0.4453125, + "learning_rate": 4.917860310158789e-06, + "loss": 2.4419, + "step": 10193 + }, + { + "epoch": 0.5468884120171674, + "grad_norm": 0.458984375, + "learning_rate": 4.917838222201424e-06, + "loss": 2.1636, + "step": 10194 + }, + { + "epoch": 0.5469420600858369, + "grad_norm": 0.412109375, + "learning_rate": 4.917816131324269e-06, + "loss": 2.1927, + "step": 10195 + }, + { + "epoch": 0.5469957081545065, + "grad_norm": 0.46875, + "learning_rate": 4.917794037527348e-06, + "loss": 2.367, + "step": 10196 + }, + { + "epoch": 0.547049356223176, + "grad_norm": 0.41796875, + "learning_rate": 4.917771940810692e-06, + "loss": 2.3051, + "step": 10197 + }, + { + "epoch": 0.5471030042918454, + "grad_norm": 0.478515625, + "learning_rate": 4.917749841174324e-06, + "loss": 2.3277, + "step": 10198 + }, + { + "epoch": 0.547156652360515, + "grad_norm": 1.4609375, + "learning_rate": 4.917727738618273e-06, + "loss": 2.259, + "step": 10199 + }, + { + "epoch": 0.5472103004291845, + "grad_norm": 0.439453125, + "learning_rate": 4.917705633142564e-06, + "loss": 2.1745, + "step": 10200 + }, + { + "epoch": 0.5472639484978541, + "grad_norm": 0.53515625, + "learning_rate": 4.917683524747226e-06, + "loss": 2.0905, + "step": 10201 + }, + { + "epoch": 0.5473175965665236, + "grad_norm": 0.55859375, + "learning_rate": 4.917661413432283e-06, + "loss": 2.5256, + "step": 10202 + }, + { + "epoch": 0.5473712446351932, + "grad_norm": 0.412109375, + "learning_rate": 4.917639299197764e-06, + "loss": 2.4257, + "step": 10203 + }, + { + "epoch": 0.5474248927038626, + "grad_norm": 0.4765625, + "learning_rate": 4.917617182043695e-06, + "loss": 2.2203, + "step": 10204 + }, + { + "epoch": 0.5474785407725322, + "grad_norm": 0.384765625, + "learning_rate": 4.917595061970102e-06, + "loss": 1.9931, + "step": 10205 + }, + { + "epoch": 0.5475321888412017, + "grad_norm": 0.4296875, + "learning_rate": 4.917572938977011e-06, + "loss": 2.0413, + "step": 10206 + }, + { + "epoch": 0.5475858369098713, + "grad_norm": 0.39453125, + "learning_rate": 4.917550813064451e-06, + "loss": 2.2652, + "step": 10207 + }, + { + "epoch": 0.5476394849785408, + "grad_norm": 0.455078125, + "learning_rate": 4.917528684232448e-06, + "loss": 2.2998, + "step": 10208 + }, + { + "epoch": 0.5476931330472103, + "grad_norm": 0.4140625, + "learning_rate": 4.9175065524810274e-06, + "loss": 2.2041, + "step": 10209 + }, + { + "epoch": 0.5477467811158798, + "grad_norm": 0.41796875, + "learning_rate": 4.917484417810216e-06, + "loss": 2.3495, + "step": 10210 + }, + { + "epoch": 0.5478004291845494, + "grad_norm": 0.447265625, + "learning_rate": 4.917462280220043e-06, + "loss": 2.2353, + "step": 10211 + }, + { + "epoch": 0.5478540772532189, + "grad_norm": 0.478515625, + "learning_rate": 4.917440139710533e-06, + "loss": 1.8533, + "step": 10212 + }, + { + "epoch": 0.5479077253218884, + "grad_norm": 0.3359375, + "learning_rate": 4.917417996281714e-06, + "loss": 2.0797, + "step": 10213 + }, + { + "epoch": 0.547961373390558, + "grad_norm": 0.3671875, + "learning_rate": 4.917395849933611e-06, + "loss": 2.2138, + "step": 10214 + }, + { + "epoch": 0.5480150214592274, + "grad_norm": 0.46484375, + "learning_rate": 4.917373700666251e-06, + "loss": 2.1809, + "step": 10215 + }, + { + "epoch": 0.548068669527897, + "grad_norm": 0.458984375, + "learning_rate": 4.917351548479663e-06, + "loss": 2.2467, + "step": 10216 + }, + { + "epoch": 0.5481223175965665, + "grad_norm": 0.42578125, + "learning_rate": 4.917329393373871e-06, + "loss": 2.4615, + "step": 10217 + }, + { + "epoch": 0.5481759656652361, + "grad_norm": 0.359375, + "learning_rate": 4.9173072353489035e-06, + "loss": 2.1868, + "step": 10218 + }, + { + "epoch": 0.5482296137339056, + "grad_norm": 0.3828125, + "learning_rate": 4.917285074404787e-06, + "loss": 2.1235, + "step": 10219 + }, + { + "epoch": 0.5482832618025751, + "grad_norm": 0.5859375, + "learning_rate": 4.917262910541548e-06, + "loss": 2.4053, + "step": 10220 + }, + { + "epoch": 0.5483369098712446, + "grad_norm": 0.408203125, + "learning_rate": 4.917240743759213e-06, + "loss": 2.06, + "step": 10221 + }, + { + "epoch": 0.5483905579399142, + "grad_norm": 0.455078125, + "learning_rate": 4.917218574057808e-06, + "loss": 2.2767, + "step": 10222 + }, + { + "epoch": 0.5484442060085837, + "grad_norm": 0.453125, + "learning_rate": 4.917196401437362e-06, + "loss": 1.9732, + "step": 10223 + }, + { + "epoch": 0.5484978540772533, + "grad_norm": 0.4921875, + "learning_rate": 4.917174225897901e-06, + "loss": 2.3924, + "step": 10224 + }, + { + "epoch": 0.5485515021459227, + "grad_norm": 0.47265625, + "learning_rate": 4.91715204743945e-06, + "loss": 2.2269, + "step": 10225 + }, + { + "epoch": 0.5486051502145923, + "grad_norm": 0.50390625, + "learning_rate": 4.917129866062038e-06, + "loss": 2.3811, + "step": 10226 + }, + { + "epoch": 0.5486587982832618, + "grad_norm": 0.3828125, + "learning_rate": 4.917107681765691e-06, + "loss": 2.2597, + "step": 10227 + }, + { + "epoch": 0.5487124463519313, + "grad_norm": 0.431640625, + "learning_rate": 4.917085494550435e-06, + "loss": 2.1207, + "step": 10228 + }, + { + "epoch": 0.5487660944206009, + "grad_norm": 0.474609375, + "learning_rate": 4.917063304416298e-06, + "loss": 2.3127, + "step": 10229 + }, + { + "epoch": 0.5488197424892703, + "grad_norm": 0.462890625, + "learning_rate": 4.917041111363306e-06, + "loss": 2.3816, + "step": 10230 + }, + { + "epoch": 0.5488733905579399, + "grad_norm": 0.458984375, + "learning_rate": 4.9170189153914866e-06, + "loss": 2.1607, + "step": 10231 + }, + { + "epoch": 0.5489270386266094, + "grad_norm": 0.455078125, + "learning_rate": 4.916996716500866e-06, + "loss": 2.5621, + "step": 10232 + }, + { + "epoch": 0.548980686695279, + "grad_norm": 0.431640625, + "learning_rate": 4.9169745146914705e-06, + "loss": 2.3648, + "step": 10233 + }, + { + "epoch": 0.5490343347639485, + "grad_norm": 0.4296875, + "learning_rate": 4.916952309963328e-06, + "loss": 2.3903, + "step": 10234 + }, + { + "epoch": 0.5490879828326181, + "grad_norm": 0.54296875, + "learning_rate": 4.916930102316465e-06, + "loss": 2.2572, + "step": 10235 + }, + { + "epoch": 0.5491416309012875, + "grad_norm": 0.388671875, + "learning_rate": 4.9169078917509085e-06, + "loss": 2.3152, + "step": 10236 + }, + { + "epoch": 0.5491952789699571, + "grad_norm": 0.41796875, + "learning_rate": 4.916885678266684e-06, + "loss": 2.3589, + "step": 10237 + }, + { + "epoch": 0.5492489270386266, + "grad_norm": 0.50390625, + "learning_rate": 4.916863461863819e-06, + "loss": 2.485, + "step": 10238 + }, + { + "epoch": 0.5493025751072962, + "grad_norm": 0.423828125, + "learning_rate": 4.916841242542342e-06, + "loss": 2.4562, + "step": 10239 + }, + { + "epoch": 0.5493562231759657, + "grad_norm": 0.423828125, + "learning_rate": 4.916819020302277e-06, + "loss": 2.5882, + "step": 10240 + }, + { + "epoch": 0.5494098712446351, + "grad_norm": 0.478515625, + "learning_rate": 4.916796795143654e-06, + "loss": 2.326, + "step": 10241 + }, + { + "epoch": 0.5494635193133047, + "grad_norm": 0.494140625, + "learning_rate": 4.916774567066497e-06, + "loss": 2.3302, + "step": 10242 + }, + { + "epoch": 0.5495171673819742, + "grad_norm": 0.490234375, + "learning_rate": 4.916752336070834e-06, + "loss": 2.3142, + "step": 10243 + }, + { + "epoch": 0.5495708154506438, + "grad_norm": 0.44140625, + "learning_rate": 4.916730102156692e-06, + "loss": 2.3515, + "step": 10244 + }, + { + "epoch": 0.5496244635193133, + "grad_norm": 0.427734375, + "learning_rate": 4.916707865324098e-06, + "loss": 2.3296, + "step": 10245 + }, + { + "epoch": 0.5496781115879829, + "grad_norm": 0.380859375, + "learning_rate": 4.916685625573078e-06, + "loss": 2.0303, + "step": 10246 + }, + { + "epoch": 0.5497317596566523, + "grad_norm": 0.447265625, + "learning_rate": 4.916663382903659e-06, + "loss": 2.4438, + "step": 10247 + }, + { + "epoch": 0.5497854077253219, + "grad_norm": 0.390625, + "learning_rate": 4.91664113731587e-06, + "loss": 2.2706, + "step": 10248 + }, + { + "epoch": 0.5498390557939914, + "grad_norm": 0.40625, + "learning_rate": 4.916618888809734e-06, + "loss": 1.8975, + "step": 10249 + }, + { + "epoch": 0.549892703862661, + "grad_norm": 0.50390625, + "learning_rate": 4.9165966373852815e-06, + "loss": 2.3515, + "step": 10250 + }, + { + "epoch": 0.5499463519313305, + "grad_norm": 0.408203125, + "learning_rate": 4.9165743830425376e-06, + "loss": 2.1178, + "step": 10251 + }, + { + "epoch": 0.55, + "grad_norm": 0.419921875, + "learning_rate": 4.916552125781529e-06, + "loss": 2.1333, + "step": 10252 + }, + { + "epoch": 0.5500536480686695, + "grad_norm": 0.3828125, + "learning_rate": 4.9165298656022824e-06, + "loss": 1.8857, + "step": 10253 + }, + { + "epoch": 0.5501072961373391, + "grad_norm": 0.53515625, + "learning_rate": 4.916507602504826e-06, + "loss": 2.6044, + "step": 10254 + }, + { + "epoch": 0.5501609442060086, + "grad_norm": 0.443359375, + "learning_rate": 4.9164853364891865e-06, + "loss": 2.4238, + "step": 10255 + }, + { + "epoch": 0.5502145922746781, + "grad_norm": 0.416015625, + "learning_rate": 4.91646306755539e-06, + "loss": 2.2393, + "step": 10256 + }, + { + "epoch": 0.5502682403433476, + "grad_norm": 0.400390625, + "learning_rate": 4.9164407957034625e-06, + "loss": 2.495, + "step": 10257 + }, + { + "epoch": 0.5503218884120171, + "grad_norm": 0.44140625, + "learning_rate": 4.916418520933432e-06, + "loss": 2.2957, + "step": 10258 + }, + { + "epoch": 0.5503755364806867, + "grad_norm": 0.458984375, + "learning_rate": 4.916396243245327e-06, + "loss": 2.3705, + "step": 10259 + }, + { + "epoch": 0.5504291845493562, + "grad_norm": 0.416015625, + "learning_rate": 4.916373962639172e-06, + "loss": 2.2933, + "step": 10260 + }, + { + "epoch": 0.5504828326180258, + "grad_norm": 8.125, + "learning_rate": 4.9163516791149955e-06, + "loss": 1.7979, + "step": 10261 + }, + { + "epoch": 0.5505364806866953, + "grad_norm": 0.486328125, + "learning_rate": 4.9163293926728226e-06, + "loss": 2.1207, + "step": 10262 + }, + { + "epoch": 0.5505901287553648, + "grad_norm": 0.4296875, + "learning_rate": 4.916307103312682e-06, + "loss": 1.8756, + "step": 10263 + }, + { + "epoch": 0.5506437768240343, + "grad_norm": 0.408203125, + "learning_rate": 4.9162848110345995e-06, + "loss": 1.9358, + "step": 10264 + }, + { + "epoch": 0.5506974248927039, + "grad_norm": 0.4453125, + "learning_rate": 4.916262515838603e-06, + "loss": 2.4346, + "step": 10265 + }, + { + "epoch": 0.5507510729613734, + "grad_norm": 0.4453125, + "learning_rate": 4.916240217724717e-06, + "loss": 2.4205, + "step": 10266 + }, + { + "epoch": 0.550804721030043, + "grad_norm": 0.4140625, + "learning_rate": 4.916217916692973e-06, + "loss": 2.0968, + "step": 10267 + }, + { + "epoch": 0.5508583690987124, + "grad_norm": 0.3828125, + "learning_rate": 4.916195612743393e-06, + "loss": 2.1276, + "step": 10268 + }, + { + "epoch": 0.550912017167382, + "grad_norm": 0.443359375, + "learning_rate": 4.916173305876006e-06, + "loss": 2.5476, + "step": 10269 + }, + { + "epoch": 0.5509656652360515, + "grad_norm": 0.4296875, + "learning_rate": 4.91615099609084e-06, + "loss": 2.1359, + "step": 10270 + }, + { + "epoch": 0.551019313304721, + "grad_norm": 0.396484375, + "learning_rate": 4.916128683387922e-06, + "loss": 2.2099, + "step": 10271 + }, + { + "epoch": 0.5510729613733906, + "grad_norm": 1.03125, + "learning_rate": 4.9161063677672765e-06, + "loss": 2.3145, + "step": 10272 + }, + { + "epoch": 0.55112660944206, + "grad_norm": 0.447265625, + "learning_rate": 4.916084049228932e-06, + "loss": 2.2771, + "step": 10273 + }, + { + "epoch": 0.5511802575107296, + "grad_norm": 0.48046875, + "learning_rate": 4.916061727772916e-06, + "loss": 2.2289, + "step": 10274 + }, + { + "epoch": 0.5512339055793991, + "grad_norm": 0.38671875, + "learning_rate": 4.916039403399254e-06, + "loss": 2.1035, + "step": 10275 + }, + { + "epoch": 0.5512875536480687, + "grad_norm": 0.44140625, + "learning_rate": 4.916017076107974e-06, + "loss": 2.314, + "step": 10276 + }, + { + "epoch": 0.5513412017167382, + "grad_norm": 0.640625, + "learning_rate": 4.915994745899103e-06, + "loss": 2.4225, + "step": 10277 + }, + { + "epoch": 0.5513948497854078, + "grad_norm": 0.4765625, + "learning_rate": 4.915972412772668e-06, + "loss": 2.5239, + "step": 10278 + }, + { + "epoch": 0.5514484978540772, + "grad_norm": 0.4765625, + "learning_rate": 4.9159500767286954e-06, + "loss": 2.2262, + "step": 10279 + }, + { + "epoch": 0.5515021459227468, + "grad_norm": 0.53125, + "learning_rate": 4.915927737767212e-06, + "loss": 2.4612, + "step": 10280 + }, + { + "epoch": 0.5515557939914163, + "grad_norm": 1.1171875, + "learning_rate": 4.915905395888246e-06, + "loss": 2.2085, + "step": 10281 + }, + { + "epoch": 0.5516094420600859, + "grad_norm": 0.419921875, + "learning_rate": 4.915883051091823e-06, + "loss": 2.2801, + "step": 10282 + }, + { + "epoch": 0.5516630901287554, + "grad_norm": 0.458984375, + "learning_rate": 4.915860703377971e-06, + "loss": 2.4439, + "step": 10283 + }, + { + "epoch": 0.5517167381974248, + "grad_norm": 0.578125, + "learning_rate": 4.915838352746716e-06, + "loss": 2.4364, + "step": 10284 + }, + { + "epoch": 0.5517703862660944, + "grad_norm": 0.5859375, + "learning_rate": 4.915815999198086e-06, + "loss": 2.5926, + "step": 10285 + }, + { + "epoch": 0.5518240343347639, + "grad_norm": 0.41796875, + "learning_rate": 4.9157936427321075e-06, + "loss": 2.37, + "step": 10286 + }, + { + "epoch": 0.5518776824034335, + "grad_norm": 0.3828125, + "learning_rate": 4.915771283348807e-06, + "loss": 2.1505, + "step": 10287 + }, + { + "epoch": 0.551931330472103, + "grad_norm": 0.4765625, + "learning_rate": 4.915748921048212e-06, + "loss": 2.3889, + "step": 10288 + }, + { + "epoch": 0.5519849785407726, + "grad_norm": 0.41015625, + "learning_rate": 4.91572655583035e-06, + "loss": 2.1984, + "step": 10289 + }, + { + "epoch": 0.552038626609442, + "grad_norm": 0.439453125, + "learning_rate": 4.915704187695248e-06, + "loss": 2.325, + "step": 10290 + }, + { + "epoch": 0.5520922746781116, + "grad_norm": 0.388671875, + "learning_rate": 4.915681816642932e-06, + "loss": 1.9013, + "step": 10291 + }, + { + "epoch": 0.5521459227467811, + "grad_norm": 0.4296875, + "learning_rate": 4.9156594426734296e-06, + "loss": 2.402, + "step": 10292 + }, + { + "epoch": 0.5521995708154507, + "grad_norm": 0.59375, + "learning_rate": 4.915637065786768e-06, + "loss": 2.2673, + "step": 10293 + }, + { + "epoch": 0.5522532188841202, + "grad_norm": 0.474609375, + "learning_rate": 4.915614685982974e-06, + "loss": 2.2973, + "step": 10294 + }, + { + "epoch": 0.5523068669527897, + "grad_norm": 0.478515625, + "learning_rate": 4.915592303262074e-06, + "loss": 2.2776, + "step": 10295 + }, + { + "epoch": 0.5523605150214592, + "grad_norm": 0.4453125, + "learning_rate": 4.9155699176240964e-06, + "loss": 2.4365, + "step": 10296 + }, + { + "epoch": 0.5524141630901288, + "grad_norm": 0.546875, + "learning_rate": 4.915547529069068e-06, + "loss": 2.3098, + "step": 10297 + }, + { + "epoch": 0.5524678111587983, + "grad_norm": 0.345703125, + "learning_rate": 4.915525137597015e-06, + "loss": 2.274, + "step": 10298 + }, + { + "epoch": 0.5525214592274678, + "grad_norm": 0.451171875, + "learning_rate": 4.915502743207964e-06, + "loss": 2.3076, + "step": 10299 + }, + { + "epoch": 0.5525751072961373, + "grad_norm": 0.4375, + "learning_rate": 4.9154803459019435e-06, + "loss": 2.5049, + "step": 10300 + }, + { + "epoch": 0.5526287553648068, + "grad_norm": 0.7421875, + "learning_rate": 4.91545794567898e-06, + "loss": 2.3853, + "step": 10301 + }, + { + "epoch": 0.5526824034334764, + "grad_norm": 0.373046875, + "learning_rate": 4.9154355425391e-06, + "loss": 2.2188, + "step": 10302 + }, + { + "epoch": 0.5527360515021459, + "grad_norm": 0.3984375, + "learning_rate": 4.915413136482331e-06, + "loss": 2.3035, + "step": 10303 + }, + { + "epoch": 0.5527896995708155, + "grad_norm": 0.330078125, + "learning_rate": 4.9153907275087e-06, + "loss": 2.054, + "step": 10304 + }, + { + "epoch": 0.552843347639485, + "grad_norm": 0.7265625, + "learning_rate": 4.915368315618235e-06, + "loss": 2.542, + "step": 10305 + }, + { + "epoch": 0.5528969957081545, + "grad_norm": 0.486328125, + "learning_rate": 4.9153459008109614e-06, + "loss": 2.2946, + "step": 10306 + }, + { + "epoch": 0.552950643776824, + "grad_norm": 0.45703125, + "learning_rate": 4.915323483086906e-06, + "loss": 1.4873, + "step": 10307 + }, + { + "epoch": 0.5530042918454936, + "grad_norm": 0.44140625, + "learning_rate": 4.915301062446099e-06, + "loss": 2.2401, + "step": 10308 + }, + { + "epoch": 0.5530579399141631, + "grad_norm": 0.6171875, + "learning_rate": 4.915278638888565e-06, + "loss": 1.8292, + "step": 10309 + }, + { + "epoch": 0.5531115879828327, + "grad_norm": 0.447265625, + "learning_rate": 4.91525621241433e-06, + "loss": 2.2915, + "step": 10310 + }, + { + "epoch": 0.5531652360515021, + "grad_norm": 0.37109375, + "learning_rate": 4.915233783023424e-06, + "loss": 1.9667, + "step": 10311 + }, + { + "epoch": 0.5532188841201717, + "grad_norm": 0.47265625, + "learning_rate": 4.9152113507158714e-06, + "loss": 2.1129, + "step": 10312 + }, + { + "epoch": 0.5532725321888412, + "grad_norm": 0.416015625, + "learning_rate": 4.9151889154917015e-06, + "loss": 2.2943, + "step": 10313 + }, + { + "epoch": 0.5533261802575107, + "grad_norm": 0.494140625, + "learning_rate": 4.91516647735094e-06, + "loss": 2.2978, + "step": 10314 + }, + { + "epoch": 0.5533798283261803, + "grad_norm": 0.416015625, + "learning_rate": 4.9151440362936145e-06, + "loss": 2.4064, + "step": 10315 + }, + { + "epoch": 0.5534334763948497, + "grad_norm": 0.498046875, + "learning_rate": 4.915121592319752e-06, + "loss": 2.0982, + "step": 10316 + }, + { + "epoch": 0.5534871244635193, + "grad_norm": 0.4453125, + "learning_rate": 4.915099145429379e-06, + "loss": 1.941, + "step": 10317 + }, + { + "epoch": 0.5535407725321888, + "grad_norm": 0.388671875, + "learning_rate": 4.915076695622525e-06, + "loss": 2.4468, + "step": 10318 + }, + { + "epoch": 0.5535944206008584, + "grad_norm": 0.42578125, + "learning_rate": 4.915054242899214e-06, + "loss": 2.2168, + "step": 10319 + }, + { + "epoch": 0.5536480686695279, + "grad_norm": 0.443359375, + "learning_rate": 4.915031787259475e-06, + "loss": 2.3342, + "step": 10320 + }, + { + "epoch": 0.5537017167381975, + "grad_norm": 0.451171875, + "learning_rate": 4.915009328703333e-06, + "loss": 1.6392, + "step": 10321 + }, + { + "epoch": 0.5537553648068669, + "grad_norm": 0.62109375, + "learning_rate": 4.9149868672308184e-06, + "loss": 2.2173, + "step": 10322 + }, + { + "epoch": 0.5538090128755365, + "grad_norm": 0.5390625, + "learning_rate": 4.9149644028419565e-06, + "loss": 2.4147, + "step": 10323 + }, + { + "epoch": 0.553862660944206, + "grad_norm": 0.453125, + "learning_rate": 4.914941935536774e-06, + "loss": 2.2912, + "step": 10324 + }, + { + "epoch": 0.5539163090128756, + "grad_norm": 0.419921875, + "learning_rate": 4.914919465315299e-06, + "loss": 2.3636, + "step": 10325 + }, + { + "epoch": 0.5539699570815451, + "grad_norm": 0.435546875, + "learning_rate": 4.914896992177557e-06, + "loss": 2.2689, + "step": 10326 + }, + { + "epoch": 0.5540236051502145, + "grad_norm": 0.4375, + "learning_rate": 4.9148745161235775e-06, + "loss": 2.3153, + "step": 10327 + }, + { + "epoch": 0.5540772532188841, + "grad_norm": 0.431640625, + "learning_rate": 4.9148520371533855e-06, + "loss": 2.2501, + "step": 10328 + }, + { + "epoch": 0.5541309012875536, + "grad_norm": 0.396484375, + "learning_rate": 4.914829555267011e-06, + "loss": 2.3098, + "step": 10329 + }, + { + "epoch": 0.5541845493562232, + "grad_norm": 0.61328125, + "learning_rate": 4.914807070464478e-06, + "loss": 1.7421, + "step": 10330 + }, + { + "epoch": 0.5542381974248927, + "grad_norm": 0.59375, + "learning_rate": 4.914784582745815e-06, + "loss": 2.4577, + "step": 10331 + }, + { + "epoch": 0.5542918454935623, + "grad_norm": 0.42578125, + "learning_rate": 4.914762092111049e-06, + "loss": 2.1595, + "step": 10332 + }, + { + "epoch": 0.5543454935622317, + "grad_norm": 0.43359375, + "learning_rate": 4.914739598560208e-06, + "loss": 2.3093, + "step": 10333 + }, + { + "epoch": 0.5543991416309013, + "grad_norm": 0.470703125, + "learning_rate": 4.914717102093317e-06, + "loss": 2.2176, + "step": 10334 + }, + { + "epoch": 0.5544527896995708, + "grad_norm": 0.44140625, + "learning_rate": 4.914694602710406e-06, + "loss": 2.0305, + "step": 10335 + }, + { + "epoch": 0.5545064377682404, + "grad_norm": 0.53515625, + "learning_rate": 4.914672100411499e-06, + "loss": 2.5945, + "step": 10336 + }, + { + "epoch": 0.5545600858369099, + "grad_norm": 0.443359375, + "learning_rate": 4.914649595196627e-06, + "loss": 2.4746, + "step": 10337 + }, + { + "epoch": 0.5546137339055794, + "grad_norm": 1.1328125, + "learning_rate": 4.9146270870658145e-06, + "loss": 2.1875, + "step": 10338 + }, + { + "epoch": 0.5546673819742489, + "grad_norm": 0.5625, + "learning_rate": 4.914604576019089e-06, + "loss": 2.236, + "step": 10339 + }, + { + "epoch": 0.5547210300429185, + "grad_norm": 0.439453125, + "learning_rate": 4.9145820620564775e-06, + "loss": 2.2652, + "step": 10340 + }, + { + "epoch": 0.554774678111588, + "grad_norm": 0.46484375, + "learning_rate": 4.914559545178008e-06, + "loss": 2.2336, + "step": 10341 + }, + { + "epoch": 0.5548283261802575, + "grad_norm": 0.7421875, + "learning_rate": 4.914537025383707e-06, + "loss": 1.7673, + "step": 10342 + }, + { + "epoch": 0.554881974248927, + "grad_norm": 0.50390625, + "learning_rate": 4.914514502673603e-06, + "loss": 2.1678, + "step": 10343 + }, + { + "epoch": 0.5549356223175965, + "grad_norm": 0.486328125, + "learning_rate": 4.914491977047722e-06, + "loss": 2.4809, + "step": 10344 + }, + { + "epoch": 0.5549892703862661, + "grad_norm": 0.66796875, + "learning_rate": 4.914469448506091e-06, + "loss": 2.2295, + "step": 10345 + }, + { + "epoch": 0.5550429184549356, + "grad_norm": 0.400390625, + "learning_rate": 4.914446917048737e-06, + "loss": 2.0677, + "step": 10346 + }, + { + "epoch": 0.5550965665236052, + "grad_norm": 0.53515625, + "learning_rate": 4.914424382675689e-06, + "loss": 2.1381, + "step": 10347 + }, + { + "epoch": 0.5551502145922746, + "grad_norm": 0.451171875, + "learning_rate": 4.914401845386973e-06, + "loss": 2.2551, + "step": 10348 + }, + { + "epoch": 0.5552038626609442, + "grad_norm": 0.56640625, + "learning_rate": 4.914379305182616e-06, + "loss": 2.1727, + "step": 10349 + }, + { + "epoch": 0.5552575107296137, + "grad_norm": 0.375, + "learning_rate": 4.914356762062645e-06, + "loss": 2.3082, + "step": 10350 + }, + { + "epoch": 0.5553111587982833, + "grad_norm": 0.51171875, + "learning_rate": 4.914334216027088e-06, + "loss": 2.3082, + "step": 10351 + }, + { + "epoch": 0.5553648068669528, + "grad_norm": 0.5078125, + "learning_rate": 4.914311667075972e-06, + "loss": 2.3655, + "step": 10352 + }, + { + "epoch": 0.5554184549356224, + "grad_norm": 0.57421875, + "learning_rate": 4.914289115209325e-06, + "loss": 2.2366, + "step": 10353 + }, + { + "epoch": 0.5554721030042918, + "grad_norm": 0.53515625, + "learning_rate": 4.914266560427173e-06, + "loss": 2.2419, + "step": 10354 + }, + { + "epoch": 0.5555257510729614, + "grad_norm": 0.384765625, + "learning_rate": 4.914244002729543e-06, + "loss": 2.1487, + "step": 10355 + }, + { + "epoch": 0.5555793991416309, + "grad_norm": 0.5625, + "learning_rate": 4.914221442116463e-06, + "loss": 1.379, + "step": 10356 + }, + { + "epoch": 0.5556330472103004, + "grad_norm": 0.396484375, + "learning_rate": 4.91419887858796e-06, + "loss": 2.4974, + "step": 10357 + }, + { + "epoch": 0.55568669527897, + "grad_norm": 0.79296875, + "learning_rate": 4.9141763121440625e-06, + "loss": 2.4082, + "step": 10358 + }, + { + "epoch": 0.5557403433476394, + "grad_norm": 0.42578125, + "learning_rate": 4.914153742784796e-06, + "loss": 2.4784, + "step": 10359 + }, + { + "epoch": 0.555793991416309, + "grad_norm": 0.40234375, + "learning_rate": 4.914131170510188e-06, + "loss": 2.17, + "step": 10360 + }, + { + "epoch": 0.5558476394849785, + "grad_norm": 0.44921875, + "learning_rate": 4.914108595320267e-06, + "loss": 2.3977, + "step": 10361 + }, + { + "epoch": 0.5559012875536481, + "grad_norm": 0.46875, + "learning_rate": 4.914086017215059e-06, + "loss": 2.3848, + "step": 10362 + }, + { + "epoch": 0.5559549356223176, + "grad_norm": 0.427734375, + "learning_rate": 4.9140634361945914e-06, + "loss": 2.2841, + "step": 10363 + }, + { + "epoch": 0.5560085836909872, + "grad_norm": 0.35546875, + "learning_rate": 4.914040852258892e-06, + "loss": 1.9413, + "step": 10364 + }, + { + "epoch": 0.5560622317596566, + "grad_norm": 0.44921875, + "learning_rate": 4.914018265407987e-06, + "loss": 2.3284, + "step": 10365 + }, + { + "epoch": 0.5561158798283262, + "grad_norm": 0.5390625, + "learning_rate": 4.913995675641906e-06, + "loss": 2.2019, + "step": 10366 + }, + { + "epoch": 0.5561695278969957, + "grad_norm": 0.48046875, + "learning_rate": 4.913973082960674e-06, + "loss": 1.8324, + "step": 10367 + }, + { + "epoch": 0.5562231759656653, + "grad_norm": 0.419921875, + "learning_rate": 4.913950487364319e-06, + "loss": 2.2687, + "step": 10368 + }, + { + "epoch": 0.5562768240343348, + "grad_norm": 0.396484375, + "learning_rate": 4.913927888852868e-06, + "loss": 2.0395, + "step": 10369 + }, + { + "epoch": 0.5563304721030042, + "grad_norm": 0.451171875, + "learning_rate": 4.91390528742635e-06, + "loss": 2.1964, + "step": 10370 + }, + { + "epoch": 0.5563841201716738, + "grad_norm": 0.5078125, + "learning_rate": 4.913882683084789e-06, + "loss": 2.3695, + "step": 10371 + }, + { + "epoch": 0.5564377682403433, + "grad_norm": 0.466796875, + "learning_rate": 4.913860075828216e-06, + "loss": 1.9774, + "step": 10372 + }, + { + "epoch": 0.5564914163090129, + "grad_norm": 0.478515625, + "learning_rate": 4.913837465656656e-06, + "loss": 2.259, + "step": 10373 + }, + { + "epoch": 0.5565450643776824, + "grad_norm": 0.39453125, + "learning_rate": 4.913814852570136e-06, + "loss": 2.4223, + "step": 10374 + }, + { + "epoch": 0.556598712446352, + "grad_norm": 0.423828125, + "learning_rate": 4.913792236568685e-06, + "loss": 2.3676, + "step": 10375 + }, + { + "epoch": 0.5566523605150214, + "grad_norm": 0.47265625, + "learning_rate": 4.9137696176523285e-06, + "loss": 2.4565, + "step": 10376 + }, + { + "epoch": 0.556706008583691, + "grad_norm": 0.453125, + "learning_rate": 4.913746995821096e-06, + "loss": 2.465, + "step": 10377 + }, + { + "epoch": 0.5567596566523605, + "grad_norm": 0.51171875, + "learning_rate": 4.913724371075013e-06, + "loss": 2.3732, + "step": 10378 + }, + { + "epoch": 0.5568133047210301, + "grad_norm": 0.53125, + "learning_rate": 4.913701743414108e-06, + "loss": 2.4574, + "step": 10379 + }, + { + "epoch": 0.5568669527896996, + "grad_norm": 0.388671875, + "learning_rate": 4.9136791128384064e-06, + "loss": 2.5114, + "step": 10380 + }, + { + "epoch": 0.5569206008583691, + "grad_norm": 0.56640625, + "learning_rate": 4.913656479347938e-06, + "loss": 2.5182, + "step": 10381 + }, + { + "epoch": 0.5569742489270386, + "grad_norm": 0.41015625, + "learning_rate": 4.913633842942729e-06, + "loss": 2.2981, + "step": 10382 + }, + { + "epoch": 0.5570278969957082, + "grad_norm": 0.392578125, + "learning_rate": 4.9136112036228055e-06, + "loss": 2.416, + "step": 10383 + }, + { + "epoch": 0.5570815450643777, + "grad_norm": 0.48046875, + "learning_rate": 4.913588561388197e-06, + "loss": 2.2896, + "step": 10384 + }, + { + "epoch": 0.5571351931330472, + "grad_norm": 0.625, + "learning_rate": 4.91356591623893e-06, + "loss": 2.1288, + "step": 10385 + }, + { + "epoch": 0.5571888412017167, + "grad_norm": 0.408203125, + "learning_rate": 4.913543268175032e-06, + "loss": 2.3811, + "step": 10386 + }, + { + "epoch": 0.5572424892703862, + "grad_norm": 0.4375, + "learning_rate": 4.913520617196529e-06, + "loss": 2.2003, + "step": 10387 + }, + { + "epoch": 0.5572961373390558, + "grad_norm": 0.70703125, + "learning_rate": 4.91349796330345e-06, + "loss": 2.4067, + "step": 10388 + }, + { + "epoch": 0.5573497854077253, + "grad_norm": 0.462890625, + "learning_rate": 4.913475306495822e-06, + "loss": 2.0121, + "step": 10389 + }, + { + "epoch": 0.5574034334763949, + "grad_norm": 0.44140625, + "learning_rate": 4.913452646773672e-06, + "loss": 2.2949, + "step": 10390 + }, + { + "epoch": 0.5574570815450643, + "grad_norm": 0.431640625, + "learning_rate": 4.9134299841370285e-06, + "loss": 2.3825, + "step": 10391 + }, + { + "epoch": 0.5575107296137339, + "grad_norm": 0.44140625, + "learning_rate": 4.913407318585916e-06, + "loss": 2.3224, + "step": 10392 + }, + { + "epoch": 0.5575643776824034, + "grad_norm": 0.38671875, + "learning_rate": 4.913384650120365e-06, + "loss": 2.467, + "step": 10393 + }, + { + "epoch": 0.557618025751073, + "grad_norm": 0.625, + "learning_rate": 4.913361978740402e-06, + "loss": 2.1402, + "step": 10394 + }, + { + "epoch": 0.5576716738197425, + "grad_norm": 0.431640625, + "learning_rate": 4.913339304446053e-06, + "loss": 2.27, + "step": 10395 + }, + { + "epoch": 0.5577253218884121, + "grad_norm": 0.96875, + "learning_rate": 4.913316627237347e-06, + "loss": 2.4281, + "step": 10396 + }, + { + "epoch": 0.5577789699570815, + "grad_norm": 0.33984375, + "learning_rate": 4.913293947114311e-06, + "loss": 2.0193, + "step": 10397 + }, + { + "epoch": 0.5578326180257511, + "grad_norm": 0.37890625, + "learning_rate": 4.9132712640769716e-06, + "loss": 2.1508, + "step": 10398 + }, + { + "epoch": 0.5578862660944206, + "grad_norm": 0.55078125, + "learning_rate": 4.913248578125357e-06, + "loss": 2.4175, + "step": 10399 + }, + { + "epoch": 0.5579399141630901, + "grad_norm": 0.42578125, + "learning_rate": 4.913225889259495e-06, + "loss": 2.3133, + "step": 10400 + }, + { + "epoch": 0.5579935622317597, + "grad_norm": 0.408203125, + "learning_rate": 4.913203197479412e-06, + "loss": 2.3367, + "step": 10401 + }, + { + "epoch": 0.5580472103004291, + "grad_norm": 0.431640625, + "learning_rate": 4.913180502785136e-06, + "loss": 2.1598, + "step": 10402 + }, + { + "epoch": 0.5581008583690987, + "grad_norm": 0.431640625, + "learning_rate": 4.9131578051766936e-06, + "loss": 2.1613, + "step": 10403 + }, + { + "epoch": 0.5581545064377682, + "grad_norm": 0.451171875, + "learning_rate": 4.913135104654113e-06, + "loss": 2.1884, + "step": 10404 + }, + { + "epoch": 0.5582081545064378, + "grad_norm": 0.462890625, + "learning_rate": 4.913112401217421e-06, + "loss": 2.24, + "step": 10405 + }, + { + "epoch": 0.5582618025751073, + "grad_norm": 0.474609375, + "learning_rate": 4.913089694866646e-06, + "loss": 2.485, + "step": 10406 + }, + { + "epoch": 0.5583154506437769, + "grad_norm": 0.4765625, + "learning_rate": 4.913066985601814e-06, + "loss": 2.5547, + "step": 10407 + }, + { + "epoch": 0.5583690987124463, + "grad_norm": 0.38671875, + "learning_rate": 4.913044273422954e-06, + "loss": 2.287, + "step": 10408 + }, + { + "epoch": 0.5584227467811159, + "grad_norm": 0.470703125, + "learning_rate": 4.913021558330093e-06, + "loss": 2.3967, + "step": 10409 + }, + { + "epoch": 0.5584763948497854, + "grad_norm": 0.447265625, + "learning_rate": 4.912998840323258e-06, + "loss": 2.3499, + "step": 10410 + }, + { + "epoch": 0.558530042918455, + "grad_norm": 0.384765625, + "learning_rate": 4.912976119402476e-06, + "loss": 1.9832, + "step": 10411 + }, + { + "epoch": 0.5585836909871245, + "grad_norm": 0.443359375, + "learning_rate": 4.912953395567775e-06, + "loss": 2.2389, + "step": 10412 + }, + { + "epoch": 0.558637339055794, + "grad_norm": 0.392578125, + "learning_rate": 4.912930668819183e-06, + "loss": 2.0254, + "step": 10413 + }, + { + "epoch": 0.5586909871244635, + "grad_norm": 0.40625, + "learning_rate": 4.912907939156727e-06, + "loss": 1.9275, + "step": 10414 + }, + { + "epoch": 0.558744635193133, + "grad_norm": 0.474609375, + "learning_rate": 4.9128852065804335e-06, + "loss": 2.2732, + "step": 10415 + }, + { + "epoch": 0.5587982832618026, + "grad_norm": 0.51171875, + "learning_rate": 4.912862471090331e-06, + "loss": 2.4335, + "step": 10416 + }, + { + "epoch": 0.5588519313304721, + "grad_norm": 0.5078125, + "learning_rate": 4.912839732686448e-06, + "loss": 2.4185, + "step": 10417 + }, + { + "epoch": 0.5589055793991416, + "grad_norm": 0.45703125, + "learning_rate": 4.9128169913688084e-06, + "loss": 2.2445, + "step": 10418 + }, + { + "epoch": 0.5589592274678111, + "grad_norm": 0.396484375, + "learning_rate": 4.912794247137444e-06, + "loss": 2.1865, + "step": 10419 + }, + { + "epoch": 0.5590128755364807, + "grad_norm": 0.39453125, + "learning_rate": 4.912771499992379e-06, + "loss": 2.2303, + "step": 10420 + }, + { + "epoch": 0.5590665236051502, + "grad_norm": 0.494140625, + "learning_rate": 4.912748749933642e-06, + "loss": 2.1257, + "step": 10421 + }, + { + "epoch": 0.5591201716738198, + "grad_norm": 0.439453125, + "learning_rate": 4.9127259969612615e-06, + "loss": 2.4538, + "step": 10422 + }, + { + "epoch": 0.5591738197424893, + "grad_norm": 0.52734375, + "learning_rate": 4.912703241075264e-06, + "loss": 2.3037, + "step": 10423 + }, + { + "epoch": 0.5592274678111588, + "grad_norm": 0.55078125, + "learning_rate": 4.912680482275676e-06, + "loss": 2.0985, + "step": 10424 + }, + { + "epoch": 0.5592811158798283, + "grad_norm": 0.3984375, + "learning_rate": 4.912657720562527e-06, + "loss": 2.3677, + "step": 10425 + }, + { + "epoch": 0.5593347639484979, + "grad_norm": 0.416015625, + "learning_rate": 4.9126349559358436e-06, + "loss": 1.9189, + "step": 10426 + }, + { + "epoch": 0.5593884120171674, + "grad_norm": 0.462890625, + "learning_rate": 4.912612188395652e-06, + "loss": 2.6177, + "step": 10427 + }, + { + "epoch": 0.5594420600858369, + "grad_norm": 0.41796875, + "learning_rate": 4.912589417941982e-06, + "loss": 2.2003, + "step": 10428 + }, + { + "epoch": 0.5594957081545064, + "grad_norm": 0.42578125, + "learning_rate": 4.91256664457486e-06, + "loss": 2.193, + "step": 10429 + }, + { + "epoch": 0.5595493562231759, + "grad_norm": 0.451171875, + "learning_rate": 4.912543868294313e-06, + "loss": 2.1959, + "step": 10430 + }, + { + "epoch": 0.5596030042918455, + "grad_norm": 0.357421875, + "learning_rate": 4.912521089100369e-06, + "loss": 1.682, + "step": 10431 + }, + { + "epoch": 0.559656652360515, + "grad_norm": 0.458984375, + "learning_rate": 4.912498306993055e-06, + "loss": 2.3859, + "step": 10432 + }, + { + "epoch": 0.5597103004291846, + "grad_norm": 0.43359375, + "learning_rate": 4.912475521972399e-06, + "loss": 2.3091, + "step": 10433 + }, + { + "epoch": 0.559763948497854, + "grad_norm": 0.498046875, + "learning_rate": 4.91245273403843e-06, + "loss": 2.4107, + "step": 10434 + }, + { + "epoch": 0.5598175965665236, + "grad_norm": 0.45703125, + "learning_rate": 4.912429943191173e-06, + "loss": 2.4681, + "step": 10435 + }, + { + "epoch": 0.5598712446351931, + "grad_norm": 0.66015625, + "learning_rate": 4.912407149430657e-06, + "loss": 2.3853, + "step": 10436 + }, + { + "epoch": 0.5599248927038627, + "grad_norm": 0.5546875, + "learning_rate": 4.912384352756908e-06, + "loss": 2.2187, + "step": 10437 + }, + { + "epoch": 0.5599785407725322, + "grad_norm": 0.5078125, + "learning_rate": 4.912361553169956e-06, + "loss": 2.3185, + "step": 10438 + }, + { + "epoch": 0.5600321888412018, + "grad_norm": 0.458984375, + "learning_rate": 4.912338750669826e-06, + "loss": 1.9264, + "step": 10439 + }, + { + "epoch": 0.5600858369098712, + "grad_norm": 0.53125, + "learning_rate": 4.912315945256548e-06, + "loss": 2.1613, + "step": 10440 + }, + { + "epoch": 0.5601394849785408, + "grad_norm": 0.4140625, + "learning_rate": 4.912293136930147e-06, + "loss": 2.378, + "step": 10441 + }, + { + "epoch": 0.5601931330472103, + "grad_norm": 0.447265625, + "learning_rate": 4.912270325690652e-06, + "loss": 2.2051, + "step": 10442 + }, + { + "epoch": 0.5602467811158798, + "grad_norm": 0.439453125, + "learning_rate": 4.912247511538091e-06, + "loss": 2.6013, + "step": 10443 + }, + { + "epoch": 0.5603004291845494, + "grad_norm": 0.55859375, + "learning_rate": 4.91222469447249e-06, + "loss": 1.8456, + "step": 10444 + }, + { + "epoch": 0.5603540772532188, + "grad_norm": 0.52734375, + "learning_rate": 4.912201874493878e-06, + "loss": 2.0707, + "step": 10445 + }, + { + "epoch": 0.5604077253218884, + "grad_norm": 0.412109375, + "learning_rate": 4.912179051602282e-06, + "loss": 2.2318, + "step": 10446 + }, + { + "epoch": 0.5604613733905579, + "grad_norm": 0.5625, + "learning_rate": 4.912156225797729e-06, + "loss": 2.2885, + "step": 10447 + }, + { + "epoch": 0.5605150214592275, + "grad_norm": 0.37890625, + "learning_rate": 4.912133397080248e-06, + "loss": 2.0329, + "step": 10448 + }, + { + "epoch": 0.560568669527897, + "grad_norm": 0.53515625, + "learning_rate": 4.912110565449864e-06, + "loss": 2.2342, + "step": 10449 + }, + { + "epoch": 0.5606223175965666, + "grad_norm": 0.453125, + "learning_rate": 4.9120877309066074e-06, + "loss": 2.3728, + "step": 10450 + }, + { + "epoch": 0.560675965665236, + "grad_norm": 0.484375, + "learning_rate": 4.912064893450506e-06, + "loss": 2.3845, + "step": 10451 + }, + { + "epoch": 0.5607296137339056, + "grad_norm": 0.435546875, + "learning_rate": 4.912042053081583e-06, + "loss": 2.1314, + "step": 10452 + }, + { + "epoch": 0.5607832618025751, + "grad_norm": 0.451171875, + "learning_rate": 4.912019209799871e-06, + "loss": 2.4595, + "step": 10453 + }, + { + "epoch": 0.5608369098712447, + "grad_norm": 1.5859375, + "learning_rate": 4.911996363605395e-06, + "loss": 2.3416, + "step": 10454 + }, + { + "epoch": 0.5608905579399142, + "grad_norm": 0.427734375, + "learning_rate": 4.911973514498183e-06, + "loss": 2.4981, + "step": 10455 + }, + { + "epoch": 0.5609442060085837, + "grad_norm": 0.404296875, + "learning_rate": 4.911950662478263e-06, + "loss": 2.3582, + "step": 10456 + }, + { + "epoch": 0.5609978540772532, + "grad_norm": 0.51953125, + "learning_rate": 4.911927807545663e-06, + "loss": 2.1639, + "step": 10457 + }, + { + "epoch": 0.5610515021459227, + "grad_norm": 0.5, + "learning_rate": 4.9119049497004086e-06, + "loss": 2.4593, + "step": 10458 + }, + { + "epoch": 0.5611051502145923, + "grad_norm": 0.515625, + "learning_rate": 4.91188208894253e-06, + "loss": 2.3322, + "step": 10459 + }, + { + "epoch": 0.5611587982832618, + "grad_norm": 0.36328125, + "learning_rate": 4.9118592252720536e-06, + "loss": 2.1968, + "step": 10460 + }, + { + "epoch": 0.5612124463519313, + "grad_norm": 0.4609375, + "learning_rate": 4.9118363586890055e-06, + "loss": 2.2871, + "step": 10461 + }, + { + "epoch": 0.5612660944206008, + "grad_norm": 0.412109375, + "learning_rate": 4.911813489193415e-06, + "loss": 2.3612, + "step": 10462 + }, + { + "epoch": 0.5613197424892704, + "grad_norm": 0.41796875, + "learning_rate": 4.9117906167853105e-06, + "loss": 2.225, + "step": 10463 + }, + { + "epoch": 0.5613733905579399, + "grad_norm": 0.421875, + "learning_rate": 4.911767741464718e-06, + "loss": 2.3144, + "step": 10464 + }, + { + "epoch": 0.5614270386266095, + "grad_norm": 1.140625, + "learning_rate": 4.911744863231667e-06, + "loss": 2.364, + "step": 10465 + }, + { + "epoch": 0.561480686695279, + "grad_norm": 0.4609375, + "learning_rate": 4.9117219820861825e-06, + "loss": 2.4318, + "step": 10466 + }, + { + "epoch": 0.5615343347639485, + "grad_norm": 0.416015625, + "learning_rate": 4.911699098028293e-06, + "loss": 1.9739, + "step": 10467 + }, + { + "epoch": 0.561587982832618, + "grad_norm": 0.4609375, + "learning_rate": 4.911676211058028e-06, + "loss": 2.3199, + "step": 10468 + }, + { + "epoch": 0.5616416309012876, + "grad_norm": 0.478515625, + "learning_rate": 4.911653321175413e-06, + "loss": 2.384, + "step": 10469 + }, + { + "epoch": 0.5616952789699571, + "grad_norm": 0.45703125, + "learning_rate": 4.9116304283804766e-06, + "loss": 2.5337, + "step": 10470 + }, + { + "epoch": 0.5617489270386266, + "grad_norm": 0.42578125, + "learning_rate": 4.911607532673246e-06, + "loss": 2.361, + "step": 10471 + }, + { + "epoch": 0.5618025751072961, + "grad_norm": 0.435546875, + "learning_rate": 4.91158463405375e-06, + "loss": 2.4393, + "step": 10472 + }, + { + "epoch": 0.5618562231759656, + "grad_norm": 0.435546875, + "learning_rate": 4.911561732522015e-06, + "loss": 2.1309, + "step": 10473 + }, + { + "epoch": 0.5619098712446352, + "grad_norm": 0.486328125, + "learning_rate": 4.911538828078069e-06, + "loss": 2.4627, + "step": 10474 + }, + { + "epoch": 0.5619635193133047, + "grad_norm": 0.431640625, + "learning_rate": 4.9115159207219385e-06, + "loss": 2.3006, + "step": 10475 + }, + { + "epoch": 0.5620171673819743, + "grad_norm": 0.6328125, + "learning_rate": 4.911493010453653e-06, + "loss": 2.1624, + "step": 10476 + }, + { + "epoch": 0.5620708154506437, + "grad_norm": 0.43359375, + "learning_rate": 4.91147009727324e-06, + "loss": 2.2086, + "step": 10477 + }, + { + "epoch": 0.5621244635193133, + "grad_norm": 0.40625, + "learning_rate": 4.911447181180726e-06, + "loss": 2.1272, + "step": 10478 + }, + { + "epoch": 0.5621781115879828, + "grad_norm": 0.4453125, + "learning_rate": 4.911424262176141e-06, + "loss": 2.0839, + "step": 10479 + }, + { + "epoch": 0.5622317596566524, + "grad_norm": 0.396484375, + "learning_rate": 4.911401340259509e-06, + "loss": 2.4086, + "step": 10480 + }, + { + "epoch": 0.5622854077253219, + "grad_norm": 0.859375, + "learning_rate": 4.9113784154308605e-06, + "loss": 2.1722, + "step": 10481 + }, + { + "epoch": 0.5623390557939915, + "grad_norm": 0.412109375, + "learning_rate": 4.911355487690223e-06, + "loss": 2.337, + "step": 10482 + }, + { + "epoch": 0.5623927038626609, + "grad_norm": 0.453125, + "learning_rate": 4.911332557037623e-06, + "loss": 2.31, + "step": 10483 + }, + { + "epoch": 0.5624463519313305, + "grad_norm": 0.453125, + "learning_rate": 4.911309623473088e-06, + "loss": 2.3575, + "step": 10484 + }, + { + "epoch": 0.5625, + "grad_norm": 0.43359375, + "learning_rate": 4.911286686996648e-06, + "loss": 2.2574, + "step": 10485 + }, + { + "epoch": 0.5625536480686695, + "grad_norm": 0.4140625, + "learning_rate": 4.911263747608328e-06, + "loss": 1.9185, + "step": 10486 + }, + { + "epoch": 0.5626072961373391, + "grad_norm": 0.375, + "learning_rate": 4.9112408053081576e-06, + "loss": 2.0371, + "step": 10487 + }, + { + "epoch": 0.5626609442060085, + "grad_norm": 0.51171875, + "learning_rate": 4.911217860096163e-06, + "loss": 2.1064, + "step": 10488 + }, + { + "epoch": 0.5627145922746781, + "grad_norm": 0.482421875, + "learning_rate": 4.911194911972373e-06, + "loss": 2.3591, + "step": 10489 + }, + { + "epoch": 0.5627682403433476, + "grad_norm": 0.36328125, + "learning_rate": 4.911171960936815e-06, + "loss": 2.1036, + "step": 10490 + }, + { + "epoch": 0.5628218884120172, + "grad_norm": 0.396484375, + "learning_rate": 4.911149006989517e-06, + "loss": 2.0115, + "step": 10491 + }, + { + "epoch": 0.5628755364806867, + "grad_norm": 0.41015625, + "learning_rate": 4.9111260501305055e-06, + "loss": 2.3409, + "step": 10492 + }, + { + "epoch": 0.5629291845493563, + "grad_norm": 0.4765625, + "learning_rate": 4.91110309035981e-06, + "loss": 2.1183, + "step": 10493 + }, + { + "epoch": 0.5629828326180257, + "grad_norm": 0.423828125, + "learning_rate": 4.911080127677458e-06, + "loss": 2.3326, + "step": 10494 + }, + { + "epoch": 0.5630364806866953, + "grad_norm": 0.474609375, + "learning_rate": 4.911057162083475e-06, + "loss": 2.0564, + "step": 10495 + }, + { + "epoch": 0.5630901287553648, + "grad_norm": 0.39453125, + "learning_rate": 4.9110341935778915e-06, + "loss": 2.421, + "step": 10496 + }, + { + "epoch": 0.5631437768240344, + "grad_norm": 0.578125, + "learning_rate": 4.911011222160733e-06, + "loss": 2.2086, + "step": 10497 + }, + { + "epoch": 0.5631974248927039, + "grad_norm": 0.455078125, + "learning_rate": 4.91098824783203e-06, + "loss": 2.25, + "step": 10498 + }, + { + "epoch": 0.5632510729613734, + "grad_norm": 0.48046875, + "learning_rate": 4.910965270591808e-06, + "loss": 2.3944, + "step": 10499 + }, + { + "epoch": 0.5633047210300429, + "grad_norm": 0.427734375, + "learning_rate": 4.9109422904400935e-06, + "loss": 2.2465, + "step": 10500 + }, + { + "epoch": 0.5633583690987124, + "grad_norm": 0.4609375, + "learning_rate": 4.9109193073769175e-06, + "loss": 2.2636, + "step": 10501 + }, + { + "epoch": 0.563412017167382, + "grad_norm": 0.59765625, + "learning_rate": 4.910896321402306e-06, + "loss": 2.1036, + "step": 10502 + }, + { + "epoch": 0.5634656652360515, + "grad_norm": 0.419921875, + "learning_rate": 4.910873332516288e-06, + "loss": 2.5222, + "step": 10503 + }, + { + "epoch": 0.563519313304721, + "grad_norm": 0.40625, + "learning_rate": 4.91085034071889e-06, + "loss": 2.5072, + "step": 10504 + }, + { + "epoch": 0.5635729613733905, + "grad_norm": 0.73828125, + "learning_rate": 4.910827346010139e-06, + "loss": 2.0792, + "step": 10505 + }, + { + "epoch": 0.5636266094420601, + "grad_norm": 0.58203125, + "learning_rate": 4.910804348390064e-06, + "loss": 2.3971, + "step": 10506 + }, + { + "epoch": 0.5636802575107296, + "grad_norm": 0.484375, + "learning_rate": 4.910781347858694e-06, + "loss": 2.2961, + "step": 10507 + }, + { + "epoch": 0.5637339055793992, + "grad_norm": 0.58203125, + "learning_rate": 4.910758344416055e-06, + "loss": 2.3071, + "step": 10508 + }, + { + "epoch": 0.5637875536480687, + "grad_norm": 0.52734375, + "learning_rate": 4.910735338062175e-06, + "loss": 2.4189, + "step": 10509 + }, + { + "epoch": 0.5638412017167382, + "grad_norm": 0.412109375, + "learning_rate": 4.910712328797082e-06, + "loss": 2.3476, + "step": 10510 + }, + { + "epoch": 0.5638948497854077, + "grad_norm": 0.52734375, + "learning_rate": 4.910689316620803e-06, + "loss": 2.3512, + "step": 10511 + }, + { + "epoch": 0.5639484978540773, + "grad_norm": 0.5078125, + "learning_rate": 4.910666301533367e-06, + "loss": 2.331, + "step": 10512 + }, + { + "epoch": 0.5640021459227468, + "grad_norm": 0.478515625, + "learning_rate": 4.910643283534802e-06, + "loss": 2.2528, + "step": 10513 + }, + { + "epoch": 0.5640557939914163, + "grad_norm": 0.41015625, + "learning_rate": 4.910620262625134e-06, + "loss": 2.1991, + "step": 10514 + }, + { + "epoch": 0.5641094420600858, + "grad_norm": 0.462890625, + "learning_rate": 4.910597238804393e-06, + "loss": 2.4106, + "step": 10515 + }, + { + "epoch": 0.5641630901287553, + "grad_norm": 0.380859375, + "learning_rate": 4.910574212072605e-06, + "loss": 2.1801, + "step": 10516 + }, + { + "epoch": 0.5642167381974249, + "grad_norm": 0.400390625, + "learning_rate": 4.910551182429799e-06, + "loss": 1.9715, + "step": 10517 + }, + { + "epoch": 0.5642703862660944, + "grad_norm": 0.376953125, + "learning_rate": 4.9105281498760025e-06, + "loss": 2.2446, + "step": 10518 + }, + { + "epoch": 0.564324034334764, + "grad_norm": 0.4375, + "learning_rate": 4.9105051144112425e-06, + "loss": 2.2608, + "step": 10519 + }, + { + "epoch": 0.5643776824034334, + "grad_norm": 0.427734375, + "learning_rate": 4.910482076035548e-06, + "loss": 2.198, + "step": 10520 + }, + { + "epoch": 0.564431330472103, + "grad_norm": 0.52734375, + "learning_rate": 4.910459034748946e-06, + "loss": 2.247, + "step": 10521 + }, + { + "epoch": 0.5644849785407725, + "grad_norm": 0.470703125, + "learning_rate": 4.910435990551465e-06, + "loss": 2.4167, + "step": 10522 + }, + { + "epoch": 0.5645386266094421, + "grad_norm": 0.388671875, + "learning_rate": 4.910412943443132e-06, + "loss": 2.2001, + "step": 10523 + }, + { + "epoch": 0.5645922746781116, + "grad_norm": 0.39453125, + "learning_rate": 4.910389893423976e-06, + "loss": 2.0907, + "step": 10524 + }, + { + "epoch": 0.5646459227467812, + "grad_norm": 0.58984375, + "learning_rate": 4.910366840494023e-06, + "loss": 2.3075, + "step": 10525 + }, + { + "epoch": 0.5646995708154506, + "grad_norm": 0.45703125, + "learning_rate": 4.910343784653303e-06, + "loss": 1.9511, + "step": 10526 + }, + { + "epoch": 0.5647532188841202, + "grad_norm": 0.51953125, + "learning_rate": 4.910320725901843e-06, + "loss": 2.2483, + "step": 10527 + }, + { + "epoch": 0.5648068669527897, + "grad_norm": 0.37890625, + "learning_rate": 4.910297664239669e-06, + "loss": 2.0708, + "step": 10528 + }, + { + "epoch": 0.5648605150214592, + "grad_norm": 0.8359375, + "learning_rate": 4.910274599666812e-06, + "loss": 2.5296, + "step": 10529 + }, + { + "epoch": 0.5649141630901288, + "grad_norm": 0.384765625, + "learning_rate": 4.910251532183298e-06, + "loss": 2.253, + "step": 10530 + }, + { + "epoch": 0.5649678111587982, + "grad_norm": 0.515625, + "learning_rate": 4.910228461789155e-06, + "loss": 2.0785, + "step": 10531 + }, + { + "epoch": 0.5650214592274678, + "grad_norm": 0.4140625, + "learning_rate": 4.910205388484411e-06, + "loss": 2.4044, + "step": 10532 + }, + { + "epoch": 0.5650751072961373, + "grad_norm": 0.71484375, + "learning_rate": 4.910182312269095e-06, + "loss": 2.3353, + "step": 10533 + }, + { + "epoch": 0.5651287553648069, + "grad_norm": 0.44921875, + "learning_rate": 4.910159233143232e-06, + "loss": 2.3192, + "step": 10534 + }, + { + "epoch": 0.5651824034334764, + "grad_norm": 0.546875, + "learning_rate": 4.910136151106853e-06, + "loss": 2.1964, + "step": 10535 + }, + { + "epoch": 0.565236051502146, + "grad_norm": 0.46484375, + "learning_rate": 4.910113066159985e-06, + "loss": 2.4669, + "step": 10536 + }, + { + "epoch": 0.5652896995708154, + "grad_norm": 0.416015625, + "learning_rate": 4.910089978302655e-06, + "loss": 2.4673, + "step": 10537 + }, + { + "epoch": 0.565343347639485, + "grad_norm": 0.345703125, + "learning_rate": 4.91006688753489e-06, + "loss": 2.2189, + "step": 10538 + }, + { + "epoch": 0.5653969957081545, + "grad_norm": 0.79296875, + "learning_rate": 4.91004379385672e-06, + "loss": 2.0187, + "step": 10539 + }, + { + "epoch": 0.5654506437768241, + "grad_norm": 0.423828125, + "learning_rate": 4.910020697268173e-06, + "loss": 2.3156, + "step": 10540 + }, + { + "epoch": 0.5655042918454936, + "grad_norm": 0.46484375, + "learning_rate": 4.9099975977692745e-06, + "loss": 2.2041, + "step": 10541 + }, + { + "epoch": 0.5655579399141631, + "grad_norm": 0.40625, + "learning_rate": 4.909974495360055e-06, + "loss": 1.8554, + "step": 10542 + }, + { + "epoch": 0.5656115879828326, + "grad_norm": 0.482421875, + "learning_rate": 4.909951390040541e-06, + "loss": 2.4094, + "step": 10543 + }, + { + "epoch": 0.5656652360515021, + "grad_norm": 0.421875, + "learning_rate": 4.909928281810761e-06, + "loss": 2.2774, + "step": 10544 + }, + { + "epoch": 0.5657188841201717, + "grad_norm": 0.416015625, + "learning_rate": 4.909905170670742e-06, + "loss": 2.1409, + "step": 10545 + }, + { + "epoch": 0.5657725321888412, + "grad_norm": 0.408203125, + "learning_rate": 4.909882056620512e-06, + "loss": 2.2405, + "step": 10546 + }, + { + "epoch": 0.5658261802575107, + "grad_norm": 0.4140625, + "learning_rate": 4.9098589396601e-06, + "loss": 2.2113, + "step": 10547 + }, + { + "epoch": 0.5658798283261802, + "grad_norm": 0.404296875, + "learning_rate": 4.909835819789533e-06, + "loss": 2.2393, + "step": 10548 + }, + { + "epoch": 0.5659334763948498, + "grad_norm": 0.44921875, + "learning_rate": 4.90981269700884e-06, + "loss": 2.2873, + "step": 10549 + }, + { + "epoch": 0.5659871244635193, + "grad_norm": 0.453125, + "learning_rate": 4.909789571318048e-06, + "loss": 2.3901, + "step": 10550 + }, + { + "epoch": 0.5660407725321889, + "grad_norm": 2.21875, + "learning_rate": 4.9097664427171845e-06, + "loss": 2.3127, + "step": 10551 + }, + { + "epoch": 0.5660944206008584, + "grad_norm": 0.4453125, + "learning_rate": 4.909743311206279e-06, + "loss": 2.3618, + "step": 10552 + }, + { + "epoch": 0.5661480686695279, + "grad_norm": 1.0703125, + "learning_rate": 4.909720176785357e-06, + "loss": 1.8823, + "step": 10553 + }, + { + "epoch": 0.5662017167381974, + "grad_norm": 0.60546875, + "learning_rate": 4.909697039454448e-06, + "loss": 2.3368, + "step": 10554 + }, + { + "epoch": 0.566255364806867, + "grad_norm": 0.376953125, + "learning_rate": 4.909673899213582e-06, + "loss": 2.2203, + "step": 10555 + }, + { + "epoch": 0.5663090128755365, + "grad_norm": 0.470703125, + "learning_rate": 4.909650756062782e-06, + "loss": 2.2598, + "step": 10556 + }, + { + "epoch": 0.566362660944206, + "grad_norm": 0.486328125, + "learning_rate": 4.909627610002081e-06, + "loss": 2.4975, + "step": 10557 + }, + { + "epoch": 0.5664163090128755, + "grad_norm": 0.4609375, + "learning_rate": 4.909604461031503e-06, + "loss": 2.2625, + "step": 10558 + }, + { + "epoch": 0.566469957081545, + "grad_norm": 0.451171875, + "learning_rate": 4.909581309151079e-06, + "loss": 2.3888, + "step": 10559 + }, + { + "epoch": 0.5665236051502146, + "grad_norm": 0.416015625, + "learning_rate": 4.909558154360834e-06, + "loss": 2.0768, + "step": 10560 + }, + { + "epoch": 0.5665772532188841, + "grad_norm": 0.439453125, + "learning_rate": 4.909534996660799e-06, + "loss": 2.4348, + "step": 10561 + }, + { + "epoch": 0.5666309012875537, + "grad_norm": 0.416015625, + "learning_rate": 4.909511836050999e-06, + "loss": 2.2152, + "step": 10562 + }, + { + "epoch": 0.5666845493562231, + "grad_norm": 0.392578125, + "learning_rate": 4.909488672531465e-06, + "loss": 2.2248, + "step": 10563 + }, + { + "epoch": 0.5667381974248927, + "grad_norm": 0.40234375, + "learning_rate": 4.909465506102223e-06, + "loss": 2.2764, + "step": 10564 + }, + { + "epoch": 0.5667918454935622, + "grad_norm": 0.453125, + "learning_rate": 4.9094423367633006e-06, + "loss": 2.3372, + "step": 10565 + }, + { + "epoch": 0.5668454935622318, + "grad_norm": 0.486328125, + "learning_rate": 4.909419164514728e-06, + "loss": 2.2726, + "step": 10566 + }, + { + "epoch": 0.5668991416309013, + "grad_norm": 0.400390625, + "learning_rate": 4.90939598935653e-06, + "loss": 2.2442, + "step": 10567 + }, + { + "epoch": 0.5669527896995709, + "grad_norm": 0.369140625, + "learning_rate": 4.909372811288738e-06, + "loss": 2.2385, + "step": 10568 + }, + { + "epoch": 0.5670064377682403, + "grad_norm": 0.3671875, + "learning_rate": 4.909349630311377e-06, + "loss": 2.0233, + "step": 10569 + }, + { + "epoch": 0.5670600858369099, + "grad_norm": 0.44140625, + "learning_rate": 4.909326446424478e-06, + "loss": 2.4834, + "step": 10570 + }, + { + "epoch": 0.5671137339055794, + "grad_norm": 0.7734375, + "learning_rate": 4.909303259628066e-06, + "loss": 2.2639, + "step": 10571 + }, + { + "epoch": 0.5671673819742489, + "grad_norm": 0.396484375, + "learning_rate": 4.909280069922171e-06, + "loss": 2.3064, + "step": 10572 + }, + { + "epoch": 0.5672210300429185, + "grad_norm": 0.69140625, + "learning_rate": 4.909256877306821e-06, + "loss": 2.1869, + "step": 10573 + }, + { + "epoch": 0.5672746781115879, + "grad_norm": 0.3984375, + "learning_rate": 4.909233681782042e-06, + "loss": 2.2037, + "step": 10574 + }, + { + "epoch": 0.5673283261802575, + "grad_norm": 0.458984375, + "learning_rate": 4.909210483347864e-06, + "loss": 2.2063, + "step": 10575 + }, + { + "epoch": 0.567381974248927, + "grad_norm": 0.5625, + "learning_rate": 4.909187282004314e-06, + "loss": 1.8283, + "step": 10576 + }, + { + "epoch": 0.5674356223175966, + "grad_norm": 0.4609375, + "learning_rate": 4.90916407775142e-06, + "loss": 2.3004, + "step": 10577 + }, + { + "epoch": 0.5674892703862661, + "grad_norm": 0.484375, + "learning_rate": 4.909140870589212e-06, + "loss": 1.8113, + "step": 10578 + }, + { + "epoch": 0.5675429184549357, + "grad_norm": 0.451171875, + "learning_rate": 4.909117660517715e-06, + "loss": 2.3414, + "step": 10579 + }, + { + "epoch": 0.5675965665236051, + "grad_norm": 0.6171875, + "learning_rate": 4.90909444753696e-06, + "loss": 2.3471, + "step": 10580 + }, + { + "epoch": 0.5676502145922747, + "grad_norm": 0.46484375, + "learning_rate": 4.909071231646973e-06, + "loss": 2.1995, + "step": 10581 + }, + { + "epoch": 0.5677038626609442, + "grad_norm": 0.482421875, + "learning_rate": 4.909048012847781e-06, + "loss": 2.2607, + "step": 10582 + }, + { + "epoch": 0.5677575107296138, + "grad_norm": 0.396484375, + "learning_rate": 4.9090247911394154e-06, + "loss": 2.0651, + "step": 10583 + }, + { + "epoch": 0.5678111587982833, + "grad_norm": 0.474609375, + "learning_rate": 4.909001566521902e-06, + "loss": 2.2247, + "step": 10584 + }, + { + "epoch": 0.5678648068669528, + "grad_norm": 0.5234375, + "learning_rate": 4.908978338995269e-06, + "loss": 2.0491, + "step": 10585 + }, + { + "epoch": 0.5679184549356223, + "grad_norm": 0.345703125, + "learning_rate": 4.908955108559544e-06, + "loss": 2.1376, + "step": 10586 + }, + { + "epoch": 0.5679721030042918, + "grad_norm": 0.427734375, + "learning_rate": 4.908931875214757e-06, + "loss": 2.1962, + "step": 10587 + }, + { + "epoch": 0.5680257510729614, + "grad_norm": 0.486328125, + "learning_rate": 4.908908638960934e-06, + "loss": 2.4419, + "step": 10588 + }, + { + "epoch": 0.5680793991416309, + "grad_norm": 1.3203125, + "learning_rate": 4.908885399798105e-06, + "loss": 2.1888, + "step": 10589 + }, + { + "epoch": 0.5681330472103004, + "grad_norm": 0.375, + "learning_rate": 4.908862157726296e-06, + "loss": 2.3526, + "step": 10590 + }, + { + "epoch": 0.5681866952789699, + "grad_norm": 0.458984375, + "learning_rate": 4.908838912745536e-06, + "loss": 2.2932, + "step": 10591 + }, + { + "epoch": 0.5682403433476395, + "grad_norm": 0.443359375, + "learning_rate": 4.9088156648558535e-06, + "loss": 2.4873, + "step": 10592 + }, + { + "epoch": 0.568293991416309, + "grad_norm": 0.462890625, + "learning_rate": 4.9087924140572765e-06, + "loss": 2.4434, + "step": 10593 + }, + { + "epoch": 0.5683476394849786, + "grad_norm": 0.40234375, + "learning_rate": 4.9087691603498325e-06, + "loss": 2.2894, + "step": 10594 + }, + { + "epoch": 0.568401287553648, + "grad_norm": 0.43359375, + "learning_rate": 4.9087459037335495e-06, + "loss": 2.2818, + "step": 10595 + }, + { + "epoch": 0.5684549356223176, + "grad_norm": 0.453125, + "learning_rate": 4.908722644208455e-06, + "loss": 2.3495, + "step": 10596 + }, + { + "epoch": 0.5685085836909871, + "grad_norm": 0.55859375, + "learning_rate": 4.90869938177458e-06, + "loss": 2.4559, + "step": 10597 + }, + { + "epoch": 0.5685622317596567, + "grad_norm": 0.6015625, + "learning_rate": 4.90867611643195e-06, + "loss": 2.1457, + "step": 10598 + }, + { + "epoch": 0.5686158798283262, + "grad_norm": 0.43359375, + "learning_rate": 4.908652848180593e-06, + "loss": 1.9307, + "step": 10599 + }, + { + "epoch": 0.5686695278969958, + "grad_norm": 0.400390625, + "learning_rate": 4.908629577020538e-06, + "loss": 2.2086, + "step": 10600 + }, + { + "epoch": 0.5687231759656652, + "grad_norm": 0.494140625, + "learning_rate": 4.9086063029518136e-06, + "loss": 2.7626, + "step": 10601 + }, + { + "epoch": 0.5687768240343347, + "grad_norm": 0.55078125, + "learning_rate": 4.908583025974447e-06, + "loss": 2.164, + "step": 10602 + }, + { + "epoch": 0.5688304721030043, + "grad_norm": 0.5078125, + "learning_rate": 4.908559746088466e-06, + "loss": 2.3347, + "step": 10603 + }, + { + "epoch": 0.5688841201716738, + "grad_norm": 0.4375, + "learning_rate": 4.9085364632939e-06, + "loss": 2.0469, + "step": 10604 + }, + { + "epoch": 0.5689377682403434, + "grad_norm": 0.37890625, + "learning_rate": 4.908513177590775e-06, + "loss": 2.1044, + "step": 10605 + }, + { + "epoch": 0.5689914163090128, + "grad_norm": 0.43359375, + "learning_rate": 4.908489888979121e-06, + "loss": 2.3568, + "step": 10606 + }, + { + "epoch": 0.5690450643776824, + "grad_norm": 0.5078125, + "learning_rate": 4.908466597458966e-06, + "loss": 2.3465, + "step": 10607 + }, + { + "epoch": 0.5690987124463519, + "grad_norm": 0.4453125, + "learning_rate": 4.908443303030338e-06, + "loss": 2.4606, + "step": 10608 + }, + { + "epoch": 0.5691523605150215, + "grad_norm": 0.44921875, + "learning_rate": 4.908420005693264e-06, + "loss": 2.2641, + "step": 10609 + }, + { + "epoch": 0.569206008583691, + "grad_norm": 0.90625, + "learning_rate": 4.9083967054477735e-06, + "loss": 1.5524, + "step": 10610 + }, + { + "epoch": 0.5692596566523606, + "grad_norm": 0.47265625, + "learning_rate": 4.9083734022938935e-06, + "loss": 2.0027, + "step": 10611 + }, + { + "epoch": 0.56931330472103, + "grad_norm": 0.74609375, + "learning_rate": 4.908350096231653e-06, + "loss": 2.2217, + "step": 10612 + }, + { + "epoch": 0.5693669527896996, + "grad_norm": 0.396484375, + "learning_rate": 4.908326787261081e-06, + "loss": 2.127, + "step": 10613 + }, + { + "epoch": 0.5694206008583691, + "grad_norm": 0.462890625, + "learning_rate": 4.908303475382203e-06, + "loss": 2.4292, + "step": 10614 + }, + { + "epoch": 0.5694742489270386, + "grad_norm": 0.365234375, + "learning_rate": 4.90828016059505e-06, + "loss": 1.9942, + "step": 10615 + }, + { + "epoch": 0.5695278969957082, + "grad_norm": 0.4453125, + "learning_rate": 4.908256842899648e-06, + "loss": 2.3087, + "step": 10616 + }, + { + "epoch": 0.5695815450643776, + "grad_norm": 0.4921875, + "learning_rate": 4.908233522296026e-06, + "loss": 2.165, + "step": 10617 + }, + { + "epoch": 0.5696351931330472, + "grad_norm": 0.41015625, + "learning_rate": 4.9082101987842125e-06, + "loss": 2.2823, + "step": 10618 + }, + { + "epoch": 0.5696888412017167, + "grad_norm": 0.443359375, + "learning_rate": 4.908186872364236e-06, + "loss": 2.37, + "step": 10619 + }, + { + "epoch": 0.5697424892703863, + "grad_norm": 0.404296875, + "learning_rate": 4.908163543036122e-06, + "loss": 2.1622, + "step": 10620 + }, + { + "epoch": 0.5697961373390558, + "grad_norm": 0.427734375, + "learning_rate": 4.908140210799902e-06, + "loss": 2.2022, + "step": 10621 + }, + { + "epoch": 0.5698497854077254, + "grad_norm": 0.455078125, + "learning_rate": 4.908116875655603e-06, + "loss": 2.1902, + "step": 10622 + }, + { + "epoch": 0.5699034334763948, + "grad_norm": 0.5078125, + "learning_rate": 4.908093537603253e-06, + "loss": 2.1775, + "step": 10623 + }, + { + "epoch": 0.5699570815450644, + "grad_norm": 0.76171875, + "learning_rate": 4.90807019664288e-06, + "loss": 2.0874, + "step": 10624 + }, + { + "epoch": 0.5700107296137339, + "grad_norm": 0.455078125, + "learning_rate": 4.908046852774513e-06, + "loss": 2.3897, + "step": 10625 + }, + { + "epoch": 0.5700643776824035, + "grad_norm": 0.423828125, + "learning_rate": 4.908023505998179e-06, + "loss": 2.2153, + "step": 10626 + }, + { + "epoch": 0.570118025751073, + "grad_norm": 0.3828125, + "learning_rate": 4.908000156313907e-06, + "loss": 2.2288, + "step": 10627 + }, + { + "epoch": 0.5701716738197425, + "grad_norm": 0.5234375, + "learning_rate": 4.907976803721724e-06, + "loss": 2.3038, + "step": 10628 + }, + { + "epoch": 0.570225321888412, + "grad_norm": 0.466796875, + "learning_rate": 4.90795344822166e-06, + "loss": 2.5818, + "step": 10629 + }, + { + "epoch": 0.5702789699570815, + "grad_norm": 0.447265625, + "learning_rate": 4.907930089813743e-06, + "loss": 2.5694, + "step": 10630 + }, + { + "epoch": 0.5703326180257511, + "grad_norm": 0.408203125, + "learning_rate": 4.907906728497999e-06, + "loss": 2.2223, + "step": 10631 + }, + { + "epoch": 0.5703862660944206, + "grad_norm": 0.43359375, + "learning_rate": 4.907883364274459e-06, + "loss": 2.4767, + "step": 10632 + }, + { + "epoch": 0.5704399141630901, + "grad_norm": 0.427734375, + "learning_rate": 4.90785999714315e-06, + "loss": 2.4375, + "step": 10633 + }, + { + "epoch": 0.5704935622317596, + "grad_norm": 0.37890625, + "learning_rate": 4.9078366271041e-06, + "loss": 2.1005, + "step": 10634 + }, + { + "epoch": 0.5705472103004292, + "grad_norm": 0.466796875, + "learning_rate": 4.907813254157338e-06, + "loss": 2.3029, + "step": 10635 + }, + { + "epoch": 0.5706008583690987, + "grad_norm": 0.45703125, + "learning_rate": 4.907789878302891e-06, + "loss": 2.2528, + "step": 10636 + }, + { + "epoch": 0.5706545064377683, + "grad_norm": 0.390625, + "learning_rate": 4.907766499540788e-06, + "loss": 2.1434, + "step": 10637 + }, + { + "epoch": 0.5707081545064377, + "grad_norm": 0.4765625, + "learning_rate": 4.907743117871057e-06, + "loss": 2.4177, + "step": 10638 + }, + { + "epoch": 0.5707618025751073, + "grad_norm": 0.55859375, + "learning_rate": 4.907719733293727e-06, + "loss": 2.5035, + "step": 10639 + }, + { + "epoch": 0.5708154506437768, + "grad_norm": 0.515625, + "learning_rate": 4.907696345808825e-06, + "loss": 2.3966, + "step": 10640 + }, + { + "epoch": 0.5708690987124464, + "grad_norm": 0.5078125, + "learning_rate": 4.90767295541638e-06, + "loss": 2.369, + "step": 10641 + }, + { + "epoch": 0.5709227467811159, + "grad_norm": 1.1640625, + "learning_rate": 4.907649562116421e-06, + "loss": 2.3461, + "step": 10642 + }, + { + "epoch": 0.5709763948497855, + "grad_norm": 0.494140625, + "learning_rate": 4.907626165908975e-06, + "loss": 2.2556, + "step": 10643 + }, + { + "epoch": 0.5710300429184549, + "grad_norm": 0.486328125, + "learning_rate": 4.90760276679407e-06, + "loss": 2.3767, + "step": 10644 + }, + { + "epoch": 0.5710836909871244, + "grad_norm": 0.408203125, + "learning_rate": 4.9075793647717355e-06, + "loss": 2.4805, + "step": 10645 + }, + { + "epoch": 0.571137339055794, + "grad_norm": 0.44140625, + "learning_rate": 4.9075559598419984e-06, + "loss": 2.4589, + "step": 10646 + }, + { + "epoch": 0.5711909871244635, + "grad_norm": 0.46875, + "learning_rate": 4.907532552004888e-06, + "loss": 2.2771, + "step": 10647 + }, + { + "epoch": 0.5712446351931331, + "grad_norm": 0.44140625, + "learning_rate": 4.907509141260432e-06, + "loss": 2.5204, + "step": 10648 + }, + { + "epoch": 0.5712982832618025, + "grad_norm": 0.5390625, + "learning_rate": 4.90748572760866e-06, + "loss": 2.5269, + "step": 10649 + }, + { + "epoch": 0.5713519313304721, + "grad_norm": 1.96875, + "learning_rate": 4.907462311049598e-06, + "loss": 2.1695, + "step": 10650 + }, + { + "epoch": 0.5714055793991416, + "grad_norm": 0.49609375, + "learning_rate": 4.907438891583276e-06, + "loss": 2.1606, + "step": 10651 + }, + { + "epoch": 0.5714592274678112, + "grad_norm": 0.453125, + "learning_rate": 4.907415469209722e-06, + "loss": 2.0841, + "step": 10652 + }, + { + "epoch": 0.5715128755364807, + "grad_norm": 0.490234375, + "learning_rate": 4.907392043928964e-06, + "loss": 1.8168, + "step": 10653 + }, + { + "epoch": 0.5715665236051503, + "grad_norm": 0.3984375, + "learning_rate": 4.90736861574103e-06, + "loss": 2.386, + "step": 10654 + }, + { + "epoch": 0.5716201716738197, + "grad_norm": 0.4140625, + "learning_rate": 4.907345184645949e-06, + "loss": 2.1701, + "step": 10655 + }, + { + "epoch": 0.5716738197424893, + "grad_norm": 0.419921875, + "learning_rate": 4.907321750643748e-06, + "loss": 2.091, + "step": 10656 + }, + { + "epoch": 0.5717274678111588, + "grad_norm": 0.365234375, + "learning_rate": 4.907298313734458e-06, + "loss": 2.1297, + "step": 10657 + }, + { + "epoch": 0.5717811158798283, + "grad_norm": 0.6796875, + "learning_rate": 4.907274873918104e-06, + "loss": 2.3923, + "step": 10658 + }, + { + "epoch": 0.5718347639484979, + "grad_norm": 0.40234375, + "learning_rate": 4.907251431194717e-06, + "loss": 2.2941, + "step": 10659 + }, + { + "epoch": 0.5718884120171673, + "grad_norm": 0.53125, + "learning_rate": 4.907227985564323e-06, + "loss": 2.2555, + "step": 10660 + }, + { + "epoch": 0.5719420600858369, + "grad_norm": 0.421875, + "learning_rate": 4.907204537026952e-06, + "loss": 2.2768, + "step": 10661 + }, + { + "epoch": 0.5719957081545064, + "grad_norm": 0.58203125, + "learning_rate": 4.907181085582632e-06, + "loss": 2.319, + "step": 10662 + }, + { + "epoch": 0.572049356223176, + "grad_norm": 0.431640625, + "learning_rate": 4.90715763123139e-06, + "loss": 2.4497, + "step": 10663 + }, + { + "epoch": 0.5721030042918455, + "grad_norm": 0.53125, + "learning_rate": 4.907134173973257e-06, + "loss": 2.4019, + "step": 10664 + }, + { + "epoch": 0.572156652360515, + "grad_norm": 0.462890625, + "learning_rate": 4.907110713808259e-06, + "loss": 2.1833, + "step": 10665 + }, + { + "epoch": 0.5722103004291845, + "grad_norm": 0.423828125, + "learning_rate": 4.907087250736425e-06, + "loss": 2.4105, + "step": 10666 + }, + { + "epoch": 0.5722639484978541, + "grad_norm": 0.5078125, + "learning_rate": 4.907063784757784e-06, + "loss": 1.8279, + "step": 10667 + }, + { + "epoch": 0.5723175965665236, + "grad_norm": 0.392578125, + "learning_rate": 4.907040315872363e-06, + "loss": 2.1554, + "step": 10668 + }, + { + "epoch": 0.5723712446351932, + "grad_norm": 0.734375, + "learning_rate": 4.9070168440801915e-06, + "loss": 2.2253, + "step": 10669 + }, + { + "epoch": 0.5724248927038627, + "grad_norm": 0.44921875, + "learning_rate": 4.906993369381297e-06, + "loss": 2.3039, + "step": 10670 + }, + { + "epoch": 0.5724785407725322, + "grad_norm": 0.5, + "learning_rate": 4.906969891775709e-06, + "loss": 2.3564, + "step": 10671 + }, + { + "epoch": 0.5725321888412017, + "grad_norm": 0.396484375, + "learning_rate": 4.906946411263455e-06, + "loss": 2.3342, + "step": 10672 + }, + { + "epoch": 0.5725858369098712, + "grad_norm": 0.46484375, + "learning_rate": 4.906922927844563e-06, + "loss": 1.9816, + "step": 10673 + }, + { + "epoch": 0.5726394849785408, + "grad_norm": 0.41796875, + "learning_rate": 4.906899441519062e-06, + "loss": 2.261, + "step": 10674 + }, + { + "epoch": 0.5726931330472103, + "grad_norm": 0.412109375, + "learning_rate": 4.90687595228698e-06, + "loss": 2.2582, + "step": 10675 + }, + { + "epoch": 0.5727467811158798, + "grad_norm": 0.46484375, + "learning_rate": 4.9068524601483454e-06, + "loss": 2.2133, + "step": 10676 + }, + { + "epoch": 0.5728004291845493, + "grad_norm": 0.423828125, + "learning_rate": 4.906828965103187e-06, + "loss": 2.0702, + "step": 10677 + }, + { + "epoch": 0.5728540772532189, + "grad_norm": 0.63671875, + "learning_rate": 4.906805467151534e-06, + "loss": 2.3374, + "step": 10678 + }, + { + "epoch": 0.5729077253218884, + "grad_norm": 0.443359375, + "learning_rate": 4.906781966293413e-06, + "loss": 2.2254, + "step": 10679 + }, + { + "epoch": 0.572961373390558, + "grad_norm": 0.404296875, + "learning_rate": 4.906758462528853e-06, + "loss": 1.9499, + "step": 10680 + }, + { + "epoch": 0.5730150214592274, + "grad_norm": 0.427734375, + "learning_rate": 4.906734955857882e-06, + "loss": 2.36, + "step": 10681 + }, + { + "epoch": 0.573068669527897, + "grad_norm": 0.47265625, + "learning_rate": 4.906711446280529e-06, + "loss": 2.4593, + "step": 10682 + }, + { + "epoch": 0.5731223175965665, + "grad_norm": 0.4609375, + "learning_rate": 4.9066879337968235e-06, + "loss": 2.4825, + "step": 10683 + }, + { + "epoch": 0.5731759656652361, + "grad_norm": 0.396484375, + "learning_rate": 4.906664418406791e-06, + "loss": 2.1328, + "step": 10684 + }, + { + "epoch": 0.5732296137339056, + "grad_norm": 0.45703125, + "learning_rate": 4.906640900110462e-06, + "loss": 2.4232, + "step": 10685 + }, + { + "epoch": 0.5732832618025752, + "grad_norm": 0.46875, + "learning_rate": 4.906617378907865e-06, + "loss": 2.2592, + "step": 10686 + }, + { + "epoch": 0.5733369098712446, + "grad_norm": 0.474609375, + "learning_rate": 4.906593854799027e-06, + "loss": 2.4493, + "step": 10687 + }, + { + "epoch": 0.5733905579399141, + "grad_norm": 0.470703125, + "learning_rate": 4.9065703277839775e-06, + "loss": 2.4723, + "step": 10688 + }, + { + "epoch": 0.5734442060085837, + "grad_norm": 0.462890625, + "learning_rate": 4.906546797862745e-06, + "loss": 2.4193, + "step": 10689 + }, + { + "epoch": 0.5734978540772532, + "grad_norm": 0.455078125, + "learning_rate": 4.906523265035358e-06, + "loss": 2.2276, + "step": 10690 + }, + { + "epoch": 0.5735515021459228, + "grad_norm": 0.46875, + "learning_rate": 4.906499729301843e-06, + "loss": 1.6999, + "step": 10691 + }, + { + "epoch": 0.5736051502145922, + "grad_norm": 0.451171875, + "learning_rate": 4.906476190662231e-06, + "loss": 2.3372, + "step": 10692 + }, + { + "epoch": 0.5736587982832618, + "grad_norm": 0.37109375, + "learning_rate": 4.906452649116549e-06, + "loss": 2.4262, + "step": 10693 + }, + { + "epoch": 0.5737124463519313, + "grad_norm": 0.412109375, + "learning_rate": 4.906429104664826e-06, + "loss": 2.4349, + "step": 10694 + }, + { + "epoch": 0.5737660944206009, + "grad_norm": 0.435546875, + "learning_rate": 4.90640555730709e-06, + "loss": 2.2145, + "step": 10695 + }, + { + "epoch": 0.5738197424892704, + "grad_norm": 0.443359375, + "learning_rate": 4.9063820070433696e-06, + "loss": 2.5643, + "step": 10696 + }, + { + "epoch": 0.57387339055794, + "grad_norm": 0.423828125, + "learning_rate": 4.906358453873693e-06, + "loss": 2.4786, + "step": 10697 + }, + { + "epoch": 0.5739270386266094, + "grad_norm": 0.435546875, + "learning_rate": 4.9063348977980895e-06, + "loss": 2.3745, + "step": 10698 + }, + { + "epoch": 0.573980686695279, + "grad_norm": 0.44140625, + "learning_rate": 4.906311338816587e-06, + "loss": 2.3879, + "step": 10699 + }, + { + "epoch": 0.5740343347639485, + "grad_norm": 0.36328125, + "learning_rate": 4.906287776929214e-06, + "loss": 1.9527, + "step": 10700 + }, + { + "epoch": 0.574087982832618, + "grad_norm": 0.423828125, + "learning_rate": 4.906264212135998e-06, + "loss": 2.3689, + "step": 10701 + }, + { + "epoch": 0.5741416309012876, + "grad_norm": 0.494140625, + "learning_rate": 4.906240644436969e-06, + "loss": 2.1944, + "step": 10702 + }, + { + "epoch": 0.574195278969957, + "grad_norm": 0.478515625, + "learning_rate": 4.906217073832155e-06, + "loss": 2.2842, + "step": 10703 + }, + { + "epoch": 0.5742489270386266, + "grad_norm": 0.419921875, + "learning_rate": 4.906193500321583e-06, + "loss": 2.2354, + "step": 10704 + }, + { + "epoch": 0.5743025751072961, + "grad_norm": 0.515625, + "learning_rate": 4.906169923905284e-06, + "loss": 2.4566, + "step": 10705 + }, + { + "epoch": 0.5743562231759657, + "grad_norm": 0.4296875, + "learning_rate": 4.9061463445832846e-06, + "loss": 2.1935, + "step": 10706 + }, + { + "epoch": 0.5744098712446352, + "grad_norm": 0.41796875, + "learning_rate": 4.906122762355614e-06, + "loss": 2.2226, + "step": 10707 + }, + { + "epoch": 0.5744635193133047, + "grad_norm": 0.3984375, + "learning_rate": 4.906099177222301e-06, + "loss": 1.9905, + "step": 10708 + }, + { + "epoch": 0.5745171673819742, + "grad_norm": 0.6328125, + "learning_rate": 4.906075589183373e-06, + "loss": 2.2852, + "step": 10709 + }, + { + "epoch": 0.5745708154506438, + "grad_norm": 0.41796875, + "learning_rate": 4.906051998238859e-06, + "loss": 2.0785, + "step": 10710 + }, + { + "epoch": 0.5746244635193133, + "grad_norm": 0.451171875, + "learning_rate": 4.9060284043887875e-06, + "loss": 2.3405, + "step": 10711 + }, + { + "epoch": 0.5746781115879829, + "grad_norm": 0.498046875, + "learning_rate": 4.906004807633188e-06, + "loss": 2.064, + "step": 10712 + }, + { + "epoch": 0.5747317596566524, + "grad_norm": 0.35546875, + "learning_rate": 4.905981207972087e-06, + "loss": 1.9928, + "step": 10713 + }, + { + "epoch": 0.5747854077253219, + "grad_norm": 0.373046875, + "learning_rate": 4.905957605405515e-06, + "loss": 2.2475, + "step": 10714 + }, + { + "epoch": 0.5748390557939914, + "grad_norm": 0.43359375, + "learning_rate": 4.905933999933499e-06, + "loss": 2.2739, + "step": 10715 + }, + { + "epoch": 0.5748927038626609, + "grad_norm": 0.474609375, + "learning_rate": 4.905910391556068e-06, + "loss": 2.2861, + "step": 10716 + }, + { + "epoch": 0.5749463519313305, + "grad_norm": 0.546875, + "learning_rate": 4.905886780273251e-06, + "loss": 2.6108, + "step": 10717 + }, + { + "epoch": 0.575, + "grad_norm": 0.48828125, + "learning_rate": 4.905863166085076e-06, + "loss": 2.3283, + "step": 10718 + }, + { + "epoch": 0.5750536480686695, + "grad_norm": 0.43359375, + "learning_rate": 4.905839548991572e-06, + "loss": 2.2395, + "step": 10719 + }, + { + "epoch": 0.575107296137339, + "grad_norm": 0.43359375, + "learning_rate": 4.9058159289927665e-06, + "loss": 2.1468, + "step": 10720 + }, + { + "epoch": 0.5751609442060086, + "grad_norm": 0.44921875, + "learning_rate": 4.905792306088689e-06, + "loss": 2.3303, + "step": 10721 + }, + { + "epoch": 0.5752145922746781, + "grad_norm": 0.439453125, + "learning_rate": 4.905768680279367e-06, + "loss": 2.3979, + "step": 10722 + }, + { + "epoch": 0.5752682403433477, + "grad_norm": 0.3828125, + "learning_rate": 4.905745051564831e-06, + "loss": 2.2033, + "step": 10723 + }, + { + "epoch": 0.5753218884120171, + "grad_norm": 1.3125, + "learning_rate": 4.905721419945107e-06, + "loss": 2.3425, + "step": 10724 + }, + { + "epoch": 0.5753755364806867, + "grad_norm": 0.451171875, + "learning_rate": 4.905697785420225e-06, + "loss": 2.3512, + "step": 10725 + }, + { + "epoch": 0.5754291845493562, + "grad_norm": 0.453125, + "learning_rate": 4.905674147990214e-06, + "loss": 2.1507, + "step": 10726 + }, + { + "epoch": 0.5754828326180258, + "grad_norm": 0.474609375, + "learning_rate": 4.905650507655102e-06, + "loss": 2.3602, + "step": 10727 + }, + { + "epoch": 0.5755364806866953, + "grad_norm": 0.408203125, + "learning_rate": 4.905626864414917e-06, + "loss": 2.2381, + "step": 10728 + }, + { + "epoch": 0.5755901287553649, + "grad_norm": 0.4296875, + "learning_rate": 4.905603218269687e-06, + "loss": 2.2413, + "step": 10729 + }, + { + "epoch": 0.5756437768240343, + "grad_norm": 0.328125, + "learning_rate": 4.905579569219443e-06, + "loss": 2.2403, + "step": 10730 + }, + { + "epoch": 0.5756974248927038, + "grad_norm": 0.4140625, + "learning_rate": 4.905555917264212e-06, + "loss": 2.3157, + "step": 10731 + }, + { + "epoch": 0.5757510729613734, + "grad_norm": 0.419921875, + "learning_rate": 4.905532262404022e-06, + "loss": 2.1599, + "step": 10732 + }, + { + "epoch": 0.5758047210300429, + "grad_norm": 0.400390625, + "learning_rate": 4.905508604638903e-06, + "loss": 2.2635, + "step": 10733 + }, + { + "epoch": 0.5758583690987125, + "grad_norm": 0.4921875, + "learning_rate": 4.905484943968882e-06, + "loss": 2.3447, + "step": 10734 + }, + { + "epoch": 0.5759120171673819, + "grad_norm": 0.43359375, + "learning_rate": 4.905461280393988e-06, + "loss": 2.0003, + "step": 10735 + }, + { + "epoch": 0.5759656652360515, + "grad_norm": 0.419921875, + "learning_rate": 4.905437613914251e-06, + "loss": 1.9872, + "step": 10736 + }, + { + "epoch": 0.576019313304721, + "grad_norm": 0.34765625, + "learning_rate": 4.905413944529698e-06, + "loss": 2.0643, + "step": 10737 + }, + { + "epoch": 0.5760729613733906, + "grad_norm": 0.453125, + "learning_rate": 4.905390272240359e-06, + "loss": 2.2045, + "step": 10738 + }, + { + "epoch": 0.5761266094420601, + "grad_norm": 0.98046875, + "learning_rate": 4.90536659704626e-06, + "loss": 2.1081, + "step": 10739 + }, + { + "epoch": 0.5761802575107297, + "grad_norm": 0.443359375, + "learning_rate": 4.905342918947433e-06, + "loss": 2.2029, + "step": 10740 + }, + { + "epoch": 0.5762339055793991, + "grad_norm": 0.47265625, + "learning_rate": 4.905319237943904e-06, + "loss": 2.388, + "step": 10741 + }, + { + "epoch": 0.5762875536480687, + "grad_norm": 0.451171875, + "learning_rate": 4.9052955540357025e-06, + "loss": 2.3128, + "step": 10742 + }, + { + "epoch": 0.5763412017167382, + "grad_norm": 0.470703125, + "learning_rate": 4.905271867222857e-06, + "loss": 1.9276, + "step": 10743 + }, + { + "epoch": 0.5763948497854077, + "grad_norm": 0.4609375, + "learning_rate": 4.905248177505396e-06, + "loss": 2.4647, + "step": 10744 + }, + { + "epoch": 0.5764484978540773, + "grad_norm": 0.59765625, + "learning_rate": 4.905224484883349e-06, + "loss": 2.5024, + "step": 10745 + }, + { + "epoch": 0.5765021459227467, + "grad_norm": 0.72265625, + "learning_rate": 4.9052007893567435e-06, + "loss": 2.3646, + "step": 10746 + }, + { + "epoch": 0.5765557939914163, + "grad_norm": 0.421875, + "learning_rate": 4.9051770909256086e-06, + "loss": 2.3713, + "step": 10747 + }, + { + "epoch": 0.5766094420600858, + "grad_norm": 0.447265625, + "learning_rate": 4.905153389589973e-06, + "loss": 2.3224, + "step": 10748 + }, + { + "epoch": 0.5766630901287554, + "grad_norm": 0.423828125, + "learning_rate": 4.905129685349865e-06, + "loss": 2.1986, + "step": 10749 + }, + { + "epoch": 0.5767167381974249, + "grad_norm": 0.50390625, + "learning_rate": 4.9051059782053125e-06, + "loss": 2.4695, + "step": 10750 + }, + { + "epoch": 0.5767703862660944, + "grad_norm": 0.51171875, + "learning_rate": 4.905082268156346e-06, + "loss": 2.2972, + "step": 10751 + }, + { + "epoch": 0.5768240343347639, + "grad_norm": 0.66015625, + "learning_rate": 4.9050585552029935e-06, + "loss": 1.9357, + "step": 10752 + }, + { + "epoch": 0.5768776824034335, + "grad_norm": 0.609375, + "learning_rate": 4.905034839345282e-06, + "loss": 2.3585, + "step": 10753 + }, + { + "epoch": 0.576931330472103, + "grad_norm": 0.6875, + "learning_rate": 4.905011120583243e-06, + "loss": 2.3726, + "step": 10754 + }, + { + "epoch": 0.5769849785407726, + "grad_norm": 0.4453125, + "learning_rate": 4.904987398916902e-06, + "loss": 1.9493, + "step": 10755 + }, + { + "epoch": 0.577038626609442, + "grad_norm": 0.44921875, + "learning_rate": 4.9049636743462906e-06, + "loss": 2.2261, + "step": 10756 + }, + { + "epoch": 0.5770922746781116, + "grad_norm": 0.427734375, + "learning_rate": 4.904939946871435e-06, + "loss": 2.4055, + "step": 10757 + }, + { + "epoch": 0.5771459227467811, + "grad_norm": 0.408203125, + "learning_rate": 4.904916216492366e-06, + "loss": 2.5306, + "step": 10758 + }, + { + "epoch": 0.5771995708154506, + "grad_norm": 0.7109375, + "learning_rate": 4.90489248320911e-06, + "loss": 2.3503, + "step": 10759 + }, + { + "epoch": 0.5772532188841202, + "grad_norm": 0.58984375, + "learning_rate": 4.904868747021697e-06, + "loss": 2.2025, + "step": 10760 + }, + { + "epoch": 0.5773068669527897, + "grad_norm": 0.474609375, + "learning_rate": 4.904845007930156e-06, + "loss": 2.2571, + "step": 10761 + }, + { + "epoch": 0.5773605150214592, + "grad_norm": 0.453125, + "learning_rate": 4.904821265934514e-06, + "loss": 2.4115, + "step": 10762 + }, + { + "epoch": 0.5774141630901287, + "grad_norm": 0.4375, + "learning_rate": 4.904797521034802e-06, + "loss": 2.2893, + "step": 10763 + }, + { + "epoch": 0.5774678111587983, + "grad_norm": 0.431640625, + "learning_rate": 4.904773773231047e-06, + "loss": 2.1654, + "step": 10764 + }, + { + "epoch": 0.5775214592274678, + "grad_norm": 0.47265625, + "learning_rate": 4.9047500225232785e-06, + "loss": 2.2506, + "step": 10765 + }, + { + "epoch": 0.5775751072961374, + "grad_norm": 0.44140625, + "learning_rate": 4.904726268911525e-06, + "loss": 2.2015, + "step": 10766 + }, + { + "epoch": 0.5776287553648068, + "grad_norm": 0.419921875, + "learning_rate": 4.904702512395815e-06, + "loss": 2.2151, + "step": 10767 + }, + { + "epoch": 0.5776824034334764, + "grad_norm": 0.56640625, + "learning_rate": 4.9046787529761765e-06, + "loss": 2.1298, + "step": 10768 + }, + { + "epoch": 0.5777360515021459, + "grad_norm": 0.3984375, + "learning_rate": 4.90465499065264e-06, + "loss": 2.2135, + "step": 10769 + }, + { + "epoch": 0.5777896995708155, + "grad_norm": 0.62109375, + "learning_rate": 4.904631225425232e-06, + "loss": 2.3677, + "step": 10770 + }, + { + "epoch": 0.577843347639485, + "grad_norm": 0.408203125, + "learning_rate": 4.904607457293983e-06, + "loss": 2.4468, + "step": 10771 + }, + { + "epoch": 0.5778969957081546, + "grad_norm": 0.416015625, + "learning_rate": 4.904583686258921e-06, + "loss": 2.2333, + "step": 10772 + }, + { + "epoch": 0.577950643776824, + "grad_norm": 0.44140625, + "learning_rate": 4.904559912320075e-06, + "loss": 2.5236, + "step": 10773 + }, + { + "epoch": 0.5780042918454935, + "grad_norm": 0.65625, + "learning_rate": 4.904536135477473e-06, + "loss": 2.1617, + "step": 10774 + }, + { + "epoch": 0.5780579399141631, + "grad_norm": 0.44140625, + "learning_rate": 4.904512355731144e-06, + "loss": 2.3369, + "step": 10775 + }, + { + "epoch": 0.5781115879828326, + "grad_norm": 0.42578125, + "learning_rate": 4.904488573081118e-06, + "loss": 2.2694, + "step": 10776 + }, + { + "epoch": 0.5781652360515022, + "grad_norm": 0.43359375, + "learning_rate": 4.904464787527421e-06, + "loss": 2.0846, + "step": 10777 + }, + { + "epoch": 0.5782188841201716, + "grad_norm": 0.4609375, + "learning_rate": 4.904440999070084e-06, + "loss": 2.2614, + "step": 10778 + }, + { + "epoch": 0.5782725321888412, + "grad_norm": 0.349609375, + "learning_rate": 4.904417207709135e-06, + "loss": 1.8824, + "step": 10779 + }, + { + "epoch": 0.5783261802575107, + "grad_norm": 0.4375, + "learning_rate": 4.904393413444602e-06, + "loss": 2.2626, + "step": 10780 + }, + { + "epoch": 0.5783798283261803, + "grad_norm": 0.498046875, + "learning_rate": 4.904369616276516e-06, + "loss": 2.2516, + "step": 10781 + }, + { + "epoch": 0.5784334763948498, + "grad_norm": 0.423828125, + "learning_rate": 4.904345816204904e-06, + "loss": 2.2924, + "step": 10782 + }, + { + "epoch": 0.5784871244635194, + "grad_norm": 0.412109375, + "learning_rate": 4.904322013229794e-06, + "loss": 2.4628, + "step": 10783 + }, + { + "epoch": 0.5785407725321888, + "grad_norm": 0.470703125, + "learning_rate": 4.904298207351217e-06, + "loss": 2.3914, + "step": 10784 + }, + { + "epoch": 0.5785944206008584, + "grad_norm": 0.4765625, + "learning_rate": 4.9042743985692e-06, + "loss": 2.2709, + "step": 10785 + }, + { + "epoch": 0.5786480686695279, + "grad_norm": 0.39453125, + "learning_rate": 4.904250586883771e-06, + "loss": 2.3129, + "step": 10786 + }, + { + "epoch": 0.5787017167381975, + "grad_norm": 0.50390625, + "learning_rate": 4.904226772294961e-06, + "loss": 2.0917, + "step": 10787 + }, + { + "epoch": 0.578755364806867, + "grad_norm": 0.40234375, + "learning_rate": 4.904202954802798e-06, + "loss": 2.2708, + "step": 10788 + }, + { + "epoch": 0.5788090128755364, + "grad_norm": 0.5078125, + "learning_rate": 4.90417913440731e-06, + "loss": 2.2078, + "step": 10789 + }, + { + "epoch": 0.578862660944206, + "grad_norm": 0.37109375, + "learning_rate": 4.904155311108527e-06, + "loss": 2.0708, + "step": 10790 + }, + { + "epoch": 0.5789163090128755, + "grad_norm": 0.455078125, + "learning_rate": 4.904131484906476e-06, + "loss": 2.3444, + "step": 10791 + }, + { + "epoch": 0.5789699570815451, + "grad_norm": 0.74609375, + "learning_rate": 4.904107655801188e-06, + "loss": 1.8166, + "step": 10792 + }, + { + "epoch": 0.5790236051502146, + "grad_norm": 0.439453125, + "learning_rate": 4.9040838237926895e-06, + "loss": 1.5023, + "step": 10793 + }, + { + "epoch": 0.5790772532188841, + "grad_norm": 0.451171875, + "learning_rate": 4.90405998888101e-06, + "loss": 2.3187, + "step": 10794 + }, + { + "epoch": 0.5791309012875536, + "grad_norm": 0.51953125, + "learning_rate": 4.90403615106618e-06, + "loss": 2.567, + "step": 10795 + }, + { + "epoch": 0.5791845493562232, + "grad_norm": 0.4375, + "learning_rate": 4.904012310348226e-06, + "loss": 2.3475, + "step": 10796 + }, + { + "epoch": 0.5792381974248927, + "grad_norm": 0.79296875, + "learning_rate": 4.903988466727179e-06, + "loss": 2.5253, + "step": 10797 + }, + { + "epoch": 0.5792918454935623, + "grad_norm": 0.6328125, + "learning_rate": 4.903964620203065e-06, + "loss": 2.3051, + "step": 10798 + }, + { + "epoch": 0.5793454935622318, + "grad_norm": 0.55859375, + "learning_rate": 4.903940770775914e-06, + "loss": 2.3939, + "step": 10799 + }, + { + "epoch": 0.5793991416309013, + "grad_norm": 0.44921875, + "learning_rate": 4.9039169184457565e-06, + "loss": 2.2457, + "step": 10800 + }, + { + "epoch": 0.5794527896995708, + "grad_norm": 3.78125, + "learning_rate": 4.90389306321262e-06, + "loss": 2.1461, + "step": 10801 + }, + { + "epoch": 0.5795064377682403, + "grad_norm": 0.625, + "learning_rate": 4.9038692050765326e-06, + "loss": 2.3597, + "step": 10802 + }, + { + "epoch": 0.5795600858369099, + "grad_norm": 0.3828125, + "learning_rate": 4.903845344037523e-06, + "loss": 2.4078, + "step": 10803 + }, + { + "epoch": 0.5796137339055794, + "grad_norm": 0.423828125, + "learning_rate": 4.903821480095623e-06, + "loss": 2.1619, + "step": 10804 + }, + { + "epoch": 0.5796673819742489, + "grad_norm": 1.375, + "learning_rate": 4.9037976132508565e-06, + "loss": 2.3271, + "step": 10805 + }, + { + "epoch": 0.5797210300429184, + "grad_norm": 0.458984375, + "learning_rate": 4.903773743503257e-06, + "loss": 2.5284, + "step": 10806 + }, + { + "epoch": 0.579774678111588, + "grad_norm": 0.4296875, + "learning_rate": 4.90374987085285e-06, + "loss": 2.22, + "step": 10807 + }, + { + "epoch": 0.5798283261802575, + "grad_norm": 0.4453125, + "learning_rate": 4.903725995299666e-06, + "loss": 2.2866, + "step": 10808 + }, + { + "epoch": 0.5798819742489271, + "grad_norm": 0.71484375, + "learning_rate": 4.9037021168437335e-06, + "loss": 2.1974, + "step": 10809 + }, + { + "epoch": 0.5799356223175965, + "grad_norm": 0.455078125, + "learning_rate": 4.903678235485082e-06, + "loss": 2.2356, + "step": 10810 + }, + { + "epoch": 0.5799892703862661, + "grad_norm": 0.439453125, + "learning_rate": 4.903654351223739e-06, + "loss": 2.4389, + "step": 10811 + }, + { + "epoch": 0.5800429184549356, + "grad_norm": 0.44140625, + "learning_rate": 4.903630464059734e-06, + "loss": 2.4483, + "step": 10812 + }, + { + "epoch": 0.5800965665236052, + "grad_norm": 0.44140625, + "learning_rate": 4.903606573993096e-06, + "loss": 2.3049, + "step": 10813 + }, + { + "epoch": 0.5801502145922747, + "grad_norm": 0.470703125, + "learning_rate": 4.903582681023854e-06, + "loss": 2.4691, + "step": 10814 + }, + { + "epoch": 0.5802038626609443, + "grad_norm": 0.435546875, + "learning_rate": 4.903558785152036e-06, + "loss": 2.3541, + "step": 10815 + }, + { + "epoch": 0.5802575107296137, + "grad_norm": 0.412109375, + "learning_rate": 4.903534886377672e-06, + "loss": 2.3449, + "step": 10816 + }, + { + "epoch": 0.5803111587982832, + "grad_norm": 0.37890625, + "learning_rate": 4.90351098470079e-06, + "loss": 2.0583, + "step": 10817 + }, + { + "epoch": 0.5803648068669528, + "grad_norm": 0.5, + "learning_rate": 4.903487080121418e-06, + "loss": 2.5684, + "step": 10818 + }, + { + "epoch": 0.5804184549356223, + "grad_norm": 0.439453125, + "learning_rate": 4.903463172639587e-06, + "loss": 2.2119, + "step": 10819 + }, + { + "epoch": 0.5804721030042919, + "grad_norm": 1.0, + "learning_rate": 4.903439262255325e-06, + "loss": 2.3796, + "step": 10820 + }, + { + "epoch": 0.5805257510729613, + "grad_norm": 0.484375, + "learning_rate": 4.9034153489686615e-06, + "loss": 2.3573, + "step": 10821 + }, + { + "epoch": 0.5805793991416309, + "grad_norm": 0.421875, + "learning_rate": 4.903391432779624e-06, + "loss": 2.4347, + "step": 10822 + }, + { + "epoch": 0.5806330472103004, + "grad_norm": 0.41015625, + "learning_rate": 4.903367513688241e-06, + "loss": 2.2293, + "step": 10823 + }, + { + "epoch": 0.58068669527897, + "grad_norm": 0.408203125, + "learning_rate": 4.903343591694544e-06, + "loss": 2.1768, + "step": 10824 + }, + { + "epoch": 0.5807403433476395, + "grad_norm": 0.470703125, + "learning_rate": 4.903319666798559e-06, + "loss": 2.5361, + "step": 10825 + }, + { + "epoch": 0.580793991416309, + "grad_norm": 0.48046875, + "learning_rate": 4.903295739000317e-06, + "loss": 2.1624, + "step": 10826 + }, + { + "epoch": 0.5808476394849785, + "grad_norm": 0.41015625, + "learning_rate": 4.903271808299846e-06, + "loss": 2.4739, + "step": 10827 + }, + { + "epoch": 0.5809012875536481, + "grad_norm": 0.6640625, + "learning_rate": 4.903247874697173e-06, + "loss": 2.3846, + "step": 10828 + }, + { + "epoch": 0.5809549356223176, + "grad_norm": 0.486328125, + "learning_rate": 4.903223938192331e-06, + "loss": 2.3363, + "step": 10829 + }, + { + "epoch": 0.5810085836909872, + "grad_norm": 0.453125, + "learning_rate": 4.903199998785346e-06, + "loss": 2.2443, + "step": 10830 + }, + { + "epoch": 0.5810622317596567, + "grad_norm": 0.369140625, + "learning_rate": 4.903176056476248e-06, + "loss": 1.9817, + "step": 10831 + }, + { + "epoch": 0.5811158798283261, + "grad_norm": 0.482421875, + "learning_rate": 4.903152111265066e-06, + "loss": 2.4969, + "step": 10832 + }, + { + "epoch": 0.5811695278969957, + "grad_norm": 0.43359375, + "learning_rate": 4.903128163151828e-06, + "loss": 1.9815, + "step": 10833 + }, + { + "epoch": 0.5812231759656652, + "grad_norm": 0.6015625, + "learning_rate": 4.903104212136563e-06, + "loss": 2.3594, + "step": 10834 + }, + { + "epoch": 0.5812768240343348, + "grad_norm": 0.51171875, + "learning_rate": 4.9030802582193e-06, + "loss": 2.4288, + "step": 10835 + }, + { + "epoch": 0.5813304721030043, + "grad_norm": 0.48046875, + "learning_rate": 4.90305630140007e-06, + "loss": 2.4065, + "step": 10836 + }, + { + "epoch": 0.5813841201716738, + "grad_norm": 0.458984375, + "learning_rate": 4.903032341678899e-06, + "loss": 2.3228, + "step": 10837 + }, + { + "epoch": 0.5814377682403433, + "grad_norm": 0.42578125, + "learning_rate": 4.903008379055817e-06, + "loss": 2.4811, + "step": 10838 + }, + { + "epoch": 0.5814914163090129, + "grad_norm": 0.4765625, + "learning_rate": 4.902984413530854e-06, + "loss": 2.3773, + "step": 10839 + }, + { + "epoch": 0.5815450643776824, + "grad_norm": 0.515625, + "learning_rate": 4.902960445104037e-06, + "loss": 2.2857, + "step": 10840 + }, + { + "epoch": 0.581598712446352, + "grad_norm": 0.46484375, + "learning_rate": 4.9029364737753966e-06, + "loss": 2.5032, + "step": 10841 + }, + { + "epoch": 0.5816523605150214, + "grad_norm": 0.486328125, + "learning_rate": 4.9029124995449605e-06, + "loss": 2.3989, + "step": 10842 + }, + { + "epoch": 0.581706008583691, + "grad_norm": 0.484375, + "learning_rate": 4.902888522412759e-06, + "loss": 2.2889, + "step": 10843 + }, + { + "epoch": 0.5817596566523605, + "grad_norm": 0.490234375, + "learning_rate": 4.90286454237882e-06, + "loss": 2.3216, + "step": 10844 + }, + { + "epoch": 0.58181330472103, + "grad_norm": 0.453125, + "learning_rate": 4.902840559443173e-06, + "loss": 2.5005, + "step": 10845 + }, + { + "epoch": 0.5818669527896996, + "grad_norm": 0.76953125, + "learning_rate": 4.902816573605846e-06, + "loss": 2.4111, + "step": 10846 + }, + { + "epoch": 0.581920600858369, + "grad_norm": 0.44921875, + "learning_rate": 4.90279258486687e-06, + "loss": 2.3621, + "step": 10847 + }, + { + "epoch": 0.5819742489270386, + "grad_norm": 0.412109375, + "learning_rate": 4.902768593226271e-06, + "loss": 2.2649, + "step": 10848 + }, + { + "epoch": 0.5820278969957081, + "grad_norm": 0.474609375, + "learning_rate": 4.902744598684081e-06, + "loss": 2.3906, + "step": 10849 + }, + { + "epoch": 0.5820815450643777, + "grad_norm": 6.25, + "learning_rate": 4.902720601240327e-06, + "loss": 2.2333, + "step": 10850 + }, + { + "epoch": 0.5821351931330472, + "grad_norm": 0.51953125, + "learning_rate": 4.9026966008950386e-06, + "loss": 2.383, + "step": 10851 + }, + { + "epoch": 0.5821888412017168, + "grad_norm": 0.5234375, + "learning_rate": 4.902672597648245e-06, + "loss": 2.6143, + "step": 10852 + }, + { + "epoch": 0.5822424892703862, + "grad_norm": 0.609375, + "learning_rate": 4.902648591499975e-06, + "loss": 2.365, + "step": 10853 + }, + { + "epoch": 0.5822961373390558, + "grad_norm": 0.396484375, + "learning_rate": 4.902624582450257e-06, + "loss": 2.1109, + "step": 10854 + }, + { + "epoch": 0.5823497854077253, + "grad_norm": 0.443359375, + "learning_rate": 4.902600570499121e-06, + "loss": 2.5076, + "step": 10855 + }, + { + "epoch": 0.5824034334763949, + "grad_norm": 0.470703125, + "learning_rate": 4.902576555646596e-06, + "loss": 2.1775, + "step": 10856 + }, + { + "epoch": 0.5824570815450644, + "grad_norm": 0.4765625, + "learning_rate": 4.90255253789271e-06, + "loss": 2.4687, + "step": 10857 + }, + { + "epoch": 0.582510729613734, + "grad_norm": 0.427734375, + "learning_rate": 4.902528517237493e-06, + "loss": 2.1666, + "step": 10858 + }, + { + "epoch": 0.5825643776824034, + "grad_norm": 0.46484375, + "learning_rate": 4.902504493680973e-06, + "loss": 2.3346, + "step": 10859 + }, + { + "epoch": 0.5826180257510729, + "grad_norm": 0.439453125, + "learning_rate": 4.902480467223179e-06, + "loss": 2.3274, + "step": 10860 + }, + { + "epoch": 0.5826716738197425, + "grad_norm": 0.4453125, + "learning_rate": 4.902456437864142e-06, + "loss": 2.1652, + "step": 10861 + }, + { + "epoch": 0.582725321888412, + "grad_norm": 0.46484375, + "learning_rate": 4.902432405603888e-06, + "loss": 2.3199, + "step": 10862 + }, + { + "epoch": 0.5827789699570816, + "grad_norm": 0.51171875, + "learning_rate": 4.90240837044245e-06, + "loss": 2.4267, + "step": 10863 + }, + { + "epoch": 0.582832618025751, + "grad_norm": 0.4765625, + "learning_rate": 4.902384332379853e-06, + "loss": 2.2087, + "step": 10864 + }, + { + "epoch": 0.5828862660944206, + "grad_norm": 0.447265625, + "learning_rate": 4.902360291416127e-06, + "loss": 2.1823, + "step": 10865 + }, + { + "epoch": 0.5829399141630901, + "grad_norm": 0.455078125, + "learning_rate": 4.902336247551303e-06, + "loss": 2.349, + "step": 10866 + }, + { + "epoch": 0.5829935622317597, + "grad_norm": 0.470703125, + "learning_rate": 4.902312200785409e-06, + "loss": 2.2725, + "step": 10867 + }, + { + "epoch": 0.5830472103004292, + "grad_norm": 0.54296875, + "learning_rate": 4.902288151118473e-06, + "loss": 2.2137, + "step": 10868 + }, + { + "epoch": 0.5831008583690988, + "grad_norm": 0.41015625, + "learning_rate": 4.9022640985505245e-06, + "loss": 2.34, + "step": 10869 + }, + { + "epoch": 0.5831545064377682, + "grad_norm": 0.462890625, + "learning_rate": 4.902240043081594e-06, + "loss": 2.405, + "step": 10870 + }, + { + "epoch": 0.5832081545064378, + "grad_norm": 0.361328125, + "learning_rate": 4.902215984711708e-06, + "loss": 2.3188, + "step": 10871 + }, + { + "epoch": 0.5832618025751073, + "grad_norm": 0.42578125, + "learning_rate": 4.9021919234408975e-06, + "loss": 2.1659, + "step": 10872 + }, + { + "epoch": 0.5833154506437769, + "grad_norm": 0.431640625, + "learning_rate": 4.902167859269192e-06, + "loss": 2.3551, + "step": 10873 + }, + { + "epoch": 0.5833690987124464, + "grad_norm": 0.76953125, + "learning_rate": 4.902143792196619e-06, + "loss": 2.0556, + "step": 10874 + }, + { + "epoch": 0.5834227467811158, + "grad_norm": 0.6484375, + "learning_rate": 4.902119722223208e-06, + "loss": 2.2296, + "step": 10875 + }, + { + "epoch": 0.5834763948497854, + "grad_norm": 0.349609375, + "learning_rate": 4.9020956493489876e-06, + "loss": 2.275, + "step": 10876 + }, + { + "epoch": 0.5835300429184549, + "grad_norm": 0.5703125, + "learning_rate": 4.902071573573989e-06, + "loss": 2.2179, + "step": 10877 + }, + { + "epoch": 0.5835836909871245, + "grad_norm": 0.423828125, + "learning_rate": 4.902047494898238e-06, + "loss": 1.6634, + "step": 10878 + }, + { + "epoch": 0.583637339055794, + "grad_norm": 0.423828125, + "learning_rate": 4.902023413321767e-06, + "loss": 2.1859, + "step": 10879 + }, + { + "epoch": 0.5836909871244635, + "grad_norm": 0.400390625, + "learning_rate": 4.901999328844602e-06, + "loss": 2.3707, + "step": 10880 + }, + { + "epoch": 0.583744635193133, + "grad_norm": 0.99609375, + "learning_rate": 4.9019752414667746e-06, + "loss": 2.244, + "step": 10881 + }, + { + "epoch": 0.5837982832618026, + "grad_norm": 0.51171875, + "learning_rate": 4.901951151188313e-06, + "loss": 2.3305, + "step": 10882 + }, + { + "epoch": 0.5838519313304721, + "grad_norm": 0.3671875, + "learning_rate": 4.901927058009246e-06, + "loss": 2.4097, + "step": 10883 + }, + { + "epoch": 0.5839055793991417, + "grad_norm": 0.416015625, + "learning_rate": 4.901902961929603e-06, + "loss": 2.305, + "step": 10884 + }, + { + "epoch": 0.5839592274678111, + "grad_norm": 0.6171875, + "learning_rate": 4.901878862949413e-06, + "loss": 2.5305, + "step": 10885 + }, + { + "epoch": 0.5840128755364807, + "grad_norm": 0.62109375, + "learning_rate": 4.901854761068705e-06, + "loss": 2.3992, + "step": 10886 + }, + { + "epoch": 0.5840665236051502, + "grad_norm": 0.4375, + "learning_rate": 4.901830656287507e-06, + "loss": 2.4524, + "step": 10887 + }, + { + "epoch": 0.5841201716738197, + "grad_norm": 0.4609375, + "learning_rate": 4.901806548605851e-06, + "loss": 2.3755, + "step": 10888 + }, + { + "epoch": 0.5841738197424893, + "grad_norm": 0.8515625, + "learning_rate": 4.901782438023764e-06, + "loss": 2.1859, + "step": 10889 + }, + { + "epoch": 0.5842274678111588, + "grad_norm": 0.5, + "learning_rate": 4.901758324541275e-06, + "loss": 2.3436, + "step": 10890 + }, + { + "epoch": 0.5842811158798283, + "grad_norm": 0.53125, + "learning_rate": 4.901734208158414e-06, + "loss": 2.5059, + "step": 10891 + }, + { + "epoch": 0.5843347639484978, + "grad_norm": 0.4296875, + "learning_rate": 4.90171008887521e-06, + "loss": 2.0949, + "step": 10892 + }, + { + "epoch": 0.5843884120171674, + "grad_norm": 0.42578125, + "learning_rate": 4.9016859666916915e-06, + "loss": 2.2819, + "step": 10893 + }, + { + "epoch": 0.5844420600858369, + "grad_norm": 0.45703125, + "learning_rate": 4.901661841607888e-06, + "loss": 2.3438, + "step": 10894 + }, + { + "epoch": 0.5844957081545065, + "grad_norm": 0.478515625, + "learning_rate": 4.901637713623829e-06, + "loss": 2.1678, + "step": 10895 + }, + { + "epoch": 0.5845493562231759, + "grad_norm": 0.4453125, + "learning_rate": 4.901613582739543e-06, + "loss": 2.5054, + "step": 10896 + }, + { + "epoch": 0.5846030042918455, + "grad_norm": 0.4140625, + "learning_rate": 4.90158944895506e-06, + "loss": 2.3685, + "step": 10897 + }, + { + "epoch": 0.584656652360515, + "grad_norm": 0.421875, + "learning_rate": 4.901565312270408e-06, + "loss": 2.285, + "step": 10898 + }, + { + "epoch": 0.5847103004291846, + "grad_norm": 0.37109375, + "learning_rate": 4.901541172685617e-06, + "loss": 2.5403, + "step": 10899 + }, + { + "epoch": 0.5847639484978541, + "grad_norm": 0.4375, + "learning_rate": 4.901517030200715e-06, + "loss": 2.2863, + "step": 10900 + }, + { + "epoch": 0.5848175965665237, + "grad_norm": 0.41015625, + "learning_rate": 4.901492884815733e-06, + "loss": 2.2601, + "step": 10901 + }, + { + "epoch": 0.5848712446351931, + "grad_norm": 0.423828125, + "learning_rate": 4.901468736530699e-06, + "loss": 2.3039, + "step": 10902 + }, + { + "epoch": 0.5849248927038626, + "grad_norm": 0.3828125, + "learning_rate": 4.9014445853456424e-06, + "loss": 2.1036, + "step": 10903 + }, + { + "epoch": 0.5849785407725322, + "grad_norm": 0.42578125, + "learning_rate": 4.9014204312605925e-06, + "loss": 2.1816, + "step": 10904 + }, + { + "epoch": 0.5850321888412017, + "grad_norm": 0.421875, + "learning_rate": 4.901396274275578e-06, + "loss": 2.3698, + "step": 10905 + }, + { + "epoch": 0.5850858369098713, + "grad_norm": 0.3671875, + "learning_rate": 4.9013721143906275e-06, + "loss": 2.2403, + "step": 10906 + }, + { + "epoch": 0.5851394849785407, + "grad_norm": 0.421875, + "learning_rate": 4.9013479516057725e-06, + "loss": 2.401, + "step": 10907 + }, + { + "epoch": 0.5851931330472103, + "grad_norm": 0.55078125, + "learning_rate": 4.901323785921041e-06, + "loss": 1.0841, + "step": 10908 + }, + { + "epoch": 0.5852467811158798, + "grad_norm": 0.494140625, + "learning_rate": 4.90129961733646e-06, + "loss": 2.2885, + "step": 10909 + }, + { + "epoch": 0.5853004291845494, + "grad_norm": 0.494140625, + "learning_rate": 4.9012754458520615e-06, + "loss": 2.2457, + "step": 10910 + }, + { + "epoch": 0.5853540772532189, + "grad_norm": 0.41796875, + "learning_rate": 4.901251271467874e-06, + "loss": 1.8118, + "step": 10911 + }, + { + "epoch": 0.5854077253218885, + "grad_norm": 0.427734375, + "learning_rate": 4.9012270941839264e-06, + "loss": 2.0747, + "step": 10912 + }, + { + "epoch": 0.5854613733905579, + "grad_norm": 0.46875, + "learning_rate": 4.901202914000248e-06, + "loss": 2.5035, + "step": 10913 + }, + { + "epoch": 0.5855150214592275, + "grad_norm": 0.41015625, + "learning_rate": 4.901178730916868e-06, + "loss": 2.3184, + "step": 10914 + }, + { + "epoch": 0.585568669527897, + "grad_norm": 0.484375, + "learning_rate": 4.901154544933816e-06, + "loss": 2.0849, + "step": 10915 + }, + { + "epoch": 0.5856223175965666, + "grad_norm": 0.83984375, + "learning_rate": 4.90113035605112e-06, + "loss": 2.2511, + "step": 10916 + }, + { + "epoch": 0.585675965665236, + "grad_norm": 0.462890625, + "learning_rate": 4.9011061642688104e-06, + "loss": 2.4757, + "step": 10917 + }, + { + "epoch": 0.5857296137339055, + "grad_norm": 0.90234375, + "learning_rate": 4.901081969586916e-06, + "loss": 2.3958, + "step": 10918 + }, + { + "epoch": 0.5857832618025751, + "grad_norm": 0.51171875, + "learning_rate": 4.901057772005465e-06, + "loss": 2.4483, + "step": 10919 + }, + { + "epoch": 0.5858369098712446, + "grad_norm": 0.373046875, + "learning_rate": 4.901033571524488e-06, + "loss": 2.4341, + "step": 10920 + }, + { + "epoch": 0.5858905579399142, + "grad_norm": 0.498046875, + "learning_rate": 4.901009368144015e-06, + "loss": 2.3556, + "step": 10921 + }, + { + "epoch": 0.5859442060085837, + "grad_norm": 0.54296875, + "learning_rate": 4.900985161864074e-06, + "loss": 2.3273, + "step": 10922 + }, + { + "epoch": 0.5859978540772532, + "grad_norm": 0.41015625, + "learning_rate": 4.900960952684693e-06, + "loss": 2.1867, + "step": 10923 + }, + { + "epoch": 0.5860515021459227, + "grad_norm": 0.59375, + "learning_rate": 4.900936740605904e-06, + "loss": 2.3185, + "step": 10924 + }, + { + "epoch": 0.5861051502145923, + "grad_norm": 1.03125, + "learning_rate": 4.900912525627733e-06, + "loss": 2.2588, + "step": 10925 + }, + { + "epoch": 0.5861587982832618, + "grad_norm": 0.384765625, + "learning_rate": 4.900888307750212e-06, + "loss": 2.326, + "step": 10926 + }, + { + "epoch": 0.5862124463519314, + "grad_norm": 0.423828125, + "learning_rate": 4.90086408697337e-06, + "loss": 2.2302, + "step": 10927 + }, + { + "epoch": 0.5862660944206008, + "grad_norm": 0.435546875, + "learning_rate": 4.900839863297235e-06, + "loss": 2.3283, + "step": 10928 + }, + { + "epoch": 0.5863197424892704, + "grad_norm": 0.412109375, + "learning_rate": 4.9008156367218365e-06, + "loss": 2.2599, + "step": 10929 + }, + { + "epoch": 0.5863733905579399, + "grad_norm": 0.419921875, + "learning_rate": 4.9007914072472045e-06, + "loss": 2.3509, + "step": 10930 + }, + { + "epoch": 0.5864270386266094, + "grad_norm": 0.44921875, + "learning_rate": 4.9007671748733675e-06, + "loss": 2.2455, + "step": 10931 + }, + { + "epoch": 0.586480686695279, + "grad_norm": 0.494140625, + "learning_rate": 4.9007429396003545e-06, + "loss": 1.883, + "step": 10932 + }, + { + "epoch": 0.5865343347639485, + "grad_norm": 0.44921875, + "learning_rate": 4.900718701428197e-06, + "loss": 2.3666, + "step": 10933 + }, + { + "epoch": 0.586587982832618, + "grad_norm": 0.41015625, + "learning_rate": 4.90069446035692e-06, + "loss": 2.5014, + "step": 10934 + }, + { + "epoch": 0.5866416309012875, + "grad_norm": 0.435546875, + "learning_rate": 4.900670216386559e-06, + "loss": 2.106, + "step": 10935 + }, + { + "epoch": 0.5866952789699571, + "grad_norm": 0.494140625, + "learning_rate": 4.900645969517137e-06, + "loss": 2.5251, + "step": 10936 + }, + { + "epoch": 0.5867489270386266, + "grad_norm": 0.458984375, + "learning_rate": 4.900621719748686e-06, + "loss": 2.4989, + "step": 10937 + }, + { + "epoch": 0.5868025751072962, + "grad_norm": 0.3828125, + "learning_rate": 4.900597467081236e-06, + "loss": 1.9875, + "step": 10938 + }, + { + "epoch": 0.5868562231759656, + "grad_norm": 0.333984375, + "learning_rate": 4.900573211514815e-06, + "loss": 2.1266, + "step": 10939 + }, + { + "epoch": 0.5869098712446352, + "grad_norm": 0.55859375, + "learning_rate": 4.900548953049453e-06, + "loss": 2.4424, + "step": 10940 + }, + { + "epoch": 0.5869635193133047, + "grad_norm": 0.376953125, + "learning_rate": 4.900524691685179e-06, + "loss": 2.0319, + "step": 10941 + }, + { + "epoch": 0.5870171673819743, + "grad_norm": 0.392578125, + "learning_rate": 4.900500427422022e-06, + "loss": 2.0924, + "step": 10942 + }, + { + "epoch": 0.5870708154506438, + "grad_norm": 0.43359375, + "learning_rate": 4.900476160260013e-06, + "loss": 2.5116, + "step": 10943 + }, + { + "epoch": 0.5871244635193134, + "grad_norm": 0.42578125, + "learning_rate": 4.900451890199179e-06, + "loss": 2.4147, + "step": 10944 + }, + { + "epoch": 0.5871781115879828, + "grad_norm": 0.494140625, + "learning_rate": 4.9004276172395505e-06, + "loss": 2.3367, + "step": 10945 + }, + { + "epoch": 0.5872317596566523, + "grad_norm": 1.703125, + "learning_rate": 4.900403341381156e-06, + "loss": 2.3716, + "step": 10946 + }, + { + "epoch": 0.5872854077253219, + "grad_norm": 0.443359375, + "learning_rate": 4.900379062624026e-06, + "loss": 2.291, + "step": 10947 + }, + { + "epoch": 0.5873390557939914, + "grad_norm": 0.53125, + "learning_rate": 4.9003547809681896e-06, + "loss": 2.36, + "step": 10948 + }, + { + "epoch": 0.587392703862661, + "grad_norm": 0.5234375, + "learning_rate": 4.900330496413676e-06, + "loss": 2.3737, + "step": 10949 + }, + { + "epoch": 0.5874463519313304, + "grad_norm": 0.447265625, + "learning_rate": 4.900306208960513e-06, + "loss": 2.2479, + "step": 10950 + }, + { + "epoch": 0.5875, + "grad_norm": 0.421875, + "learning_rate": 4.900281918608732e-06, + "loss": 1.9838, + "step": 10951 + }, + { + "epoch": 0.5875536480686695, + "grad_norm": 0.44140625, + "learning_rate": 4.900257625358362e-06, + "loss": 2.3167, + "step": 10952 + }, + { + "epoch": 0.5876072961373391, + "grad_norm": 0.48828125, + "learning_rate": 4.900233329209431e-06, + "loss": 2.689, + "step": 10953 + }, + { + "epoch": 0.5876609442060086, + "grad_norm": 0.46484375, + "learning_rate": 4.90020903016197e-06, + "loss": 2.4791, + "step": 10954 + }, + { + "epoch": 0.5877145922746781, + "grad_norm": 0.431640625, + "learning_rate": 4.900184728216007e-06, + "loss": 2.4233, + "step": 10955 + }, + { + "epoch": 0.5877682403433476, + "grad_norm": 0.42578125, + "learning_rate": 4.900160423371572e-06, + "loss": 2.0669, + "step": 10956 + }, + { + "epoch": 0.5878218884120172, + "grad_norm": 0.4453125, + "learning_rate": 4.900136115628694e-06, + "loss": 2.4874, + "step": 10957 + }, + { + "epoch": 0.5878755364806867, + "grad_norm": 0.46484375, + "learning_rate": 4.900111804987403e-06, + "loss": 1.6763, + "step": 10958 + }, + { + "epoch": 0.5879291845493563, + "grad_norm": 0.42578125, + "learning_rate": 4.9000874914477284e-06, + "loss": 2.2605, + "step": 10959 + }, + { + "epoch": 0.5879828326180258, + "grad_norm": 0.419921875, + "learning_rate": 4.900063175009699e-06, + "loss": 2.2618, + "step": 10960 + }, + { + "epoch": 0.5880364806866952, + "grad_norm": 0.56640625, + "learning_rate": 4.900038855673344e-06, + "loss": 2.2717, + "step": 10961 + }, + { + "epoch": 0.5880901287553648, + "grad_norm": 0.609375, + "learning_rate": 4.900014533438693e-06, + "loss": 2.2996, + "step": 10962 + }, + { + "epoch": 0.5881437768240343, + "grad_norm": 0.5, + "learning_rate": 4.899990208305776e-06, + "loss": 2.5179, + "step": 10963 + }, + { + "epoch": 0.5881974248927039, + "grad_norm": 0.44140625, + "learning_rate": 4.899965880274621e-06, + "loss": 2.3383, + "step": 10964 + }, + { + "epoch": 0.5882510729613734, + "grad_norm": 0.34765625, + "learning_rate": 4.899941549345259e-06, + "loss": 2.1443, + "step": 10965 + }, + { + "epoch": 0.5883047210300429, + "grad_norm": 0.51953125, + "learning_rate": 4.899917215517719e-06, + "loss": 1.2931, + "step": 10966 + }, + { + "epoch": 0.5883583690987124, + "grad_norm": 0.451171875, + "learning_rate": 4.899892878792028e-06, + "loss": 2.2804, + "step": 10967 + }, + { + "epoch": 0.588412017167382, + "grad_norm": 0.369140625, + "learning_rate": 4.89986853916822e-06, + "loss": 2.3128, + "step": 10968 + }, + { + "epoch": 0.5884656652360515, + "grad_norm": 0.44921875, + "learning_rate": 4.89984419664632e-06, + "loss": 2.2248, + "step": 10969 + }, + { + "epoch": 0.5885193133047211, + "grad_norm": 0.44921875, + "learning_rate": 4.89981985122636e-06, + "loss": 2.4505, + "step": 10970 + }, + { + "epoch": 0.5885729613733905, + "grad_norm": 0.65234375, + "learning_rate": 4.8997955029083675e-06, + "loss": 2.3967, + "step": 10971 + }, + { + "epoch": 0.5886266094420601, + "grad_norm": 0.46484375, + "learning_rate": 4.899771151692373e-06, + "loss": 2.4424, + "step": 10972 + }, + { + "epoch": 0.5886802575107296, + "grad_norm": 1.3125, + "learning_rate": 4.899746797578407e-06, + "loss": 2.5007, + "step": 10973 + }, + { + "epoch": 0.5887339055793992, + "grad_norm": 1.8203125, + "learning_rate": 4.8997224405664964e-06, + "loss": 2.2184, + "step": 10974 + }, + { + "epoch": 0.5887875536480687, + "grad_norm": 0.4609375, + "learning_rate": 4.899698080656674e-06, + "loss": 2.3554, + "step": 10975 + }, + { + "epoch": 0.5888412017167381, + "grad_norm": 0.423828125, + "learning_rate": 4.899673717848965e-06, + "loss": 2.0799, + "step": 10976 + }, + { + "epoch": 0.5888948497854077, + "grad_norm": 0.419921875, + "learning_rate": 4.899649352143403e-06, + "loss": 2.0408, + "step": 10977 + }, + { + "epoch": 0.5889484978540772, + "grad_norm": 0.470703125, + "learning_rate": 4.899624983540014e-06, + "loss": 2.5514, + "step": 10978 + }, + { + "epoch": 0.5890021459227468, + "grad_norm": 0.427734375, + "learning_rate": 4.8996006120388294e-06, + "loss": 2.3744, + "step": 10979 + }, + { + "epoch": 0.5890557939914163, + "grad_norm": 0.443359375, + "learning_rate": 4.899576237639878e-06, + "loss": 2.1057, + "step": 10980 + }, + { + "epoch": 0.5891094420600859, + "grad_norm": 0.4765625, + "learning_rate": 4.899551860343189e-06, + "loss": 2.4161, + "step": 10981 + }, + { + "epoch": 0.5891630901287553, + "grad_norm": 0.5, + "learning_rate": 4.899527480148792e-06, + "loss": 2.4724, + "step": 10982 + }, + { + "epoch": 0.5892167381974249, + "grad_norm": 0.396484375, + "learning_rate": 4.899503097056718e-06, + "loss": 1.9584, + "step": 10983 + }, + { + "epoch": 0.5892703862660944, + "grad_norm": 0.4140625, + "learning_rate": 4.899478711066994e-06, + "loss": 2.343, + "step": 10984 + }, + { + "epoch": 0.589324034334764, + "grad_norm": 0.40625, + "learning_rate": 4.899454322179651e-06, + "loss": 2.3631, + "step": 10985 + }, + { + "epoch": 0.5893776824034335, + "grad_norm": 0.5, + "learning_rate": 4.899429930394718e-06, + "loss": 2.2326, + "step": 10986 + }, + { + "epoch": 0.589431330472103, + "grad_norm": 0.9609375, + "learning_rate": 4.899405535712224e-06, + "loss": 2.1626, + "step": 10987 + }, + { + "epoch": 0.5894849785407725, + "grad_norm": 0.423828125, + "learning_rate": 4.899381138132199e-06, + "loss": 2.3556, + "step": 10988 + }, + { + "epoch": 0.589538626609442, + "grad_norm": 0.43359375, + "learning_rate": 4.899356737654673e-06, + "loss": 2.0862, + "step": 10989 + }, + { + "epoch": 0.5895922746781116, + "grad_norm": 0.453125, + "learning_rate": 4.899332334279674e-06, + "loss": 2.5002, + "step": 10990 + }, + { + "epoch": 0.5896459227467811, + "grad_norm": 0.546875, + "learning_rate": 4.899307928007232e-06, + "loss": 2.4418, + "step": 10991 + }, + { + "epoch": 0.5896995708154507, + "grad_norm": 0.427734375, + "learning_rate": 4.899283518837377e-06, + "loss": 2.3417, + "step": 10992 + }, + { + "epoch": 0.5897532188841201, + "grad_norm": 0.416015625, + "learning_rate": 4.899259106770139e-06, + "loss": 2.2565, + "step": 10993 + }, + { + "epoch": 0.5898068669527897, + "grad_norm": 0.462890625, + "learning_rate": 4.8992346918055455e-06, + "loss": 2.3296, + "step": 10994 + }, + { + "epoch": 0.5898605150214592, + "grad_norm": 0.54296875, + "learning_rate": 4.8992102739436285e-06, + "loss": 2.3005, + "step": 10995 + }, + { + "epoch": 0.5899141630901288, + "grad_norm": 0.4375, + "learning_rate": 4.899185853184415e-06, + "loss": 2.3078, + "step": 10996 + }, + { + "epoch": 0.5899678111587983, + "grad_norm": 0.4765625, + "learning_rate": 4.899161429527936e-06, + "loss": 2.1035, + "step": 10997 + }, + { + "epoch": 0.5900214592274678, + "grad_norm": 0.66796875, + "learning_rate": 4.899137002974221e-06, + "loss": 2.5889, + "step": 10998 + }, + { + "epoch": 0.5900751072961373, + "grad_norm": 0.474609375, + "learning_rate": 4.8991125735232995e-06, + "loss": 2.6797, + "step": 10999 + }, + { + "epoch": 0.5901287553648069, + "grad_norm": 0.421875, + "learning_rate": 4.8990881411752e-06, + "loss": 2.3291, + "step": 11000 + }, + { + "epoch": 0.5901824034334764, + "grad_norm": 0.64453125, + "learning_rate": 4.899063705929953e-06, + "loss": 2.1417, + "step": 11001 + }, + { + "epoch": 0.590236051502146, + "grad_norm": 0.52734375, + "learning_rate": 4.899039267787588e-06, + "loss": 2.3259, + "step": 11002 + }, + { + "epoch": 0.5902896995708155, + "grad_norm": 0.3984375, + "learning_rate": 4.899014826748134e-06, + "loss": 2.2117, + "step": 11003 + }, + { + "epoch": 0.5903433476394849, + "grad_norm": 0.49609375, + "learning_rate": 4.89899038281162e-06, + "loss": 2.2252, + "step": 11004 + }, + { + "epoch": 0.5903969957081545, + "grad_norm": 0.408203125, + "learning_rate": 4.898965935978076e-06, + "loss": 2.2278, + "step": 11005 + }, + { + "epoch": 0.590450643776824, + "grad_norm": 0.39453125, + "learning_rate": 4.898941486247533e-06, + "loss": 2.355, + "step": 11006 + }, + { + "epoch": 0.5905042918454936, + "grad_norm": 0.44140625, + "learning_rate": 4.898917033620019e-06, + "loss": 2.2332, + "step": 11007 + }, + { + "epoch": 0.590557939914163, + "grad_norm": 0.466796875, + "learning_rate": 4.898892578095563e-06, + "loss": 1.9416, + "step": 11008 + }, + { + "epoch": 0.5906115879828326, + "grad_norm": 0.453125, + "learning_rate": 4.898868119674196e-06, + "loss": 2.3184, + "step": 11009 + }, + { + "epoch": 0.5906652360515021, + "grad_norm": 0.44921875, + "learning_rate": 4.898843658355947e-06, + "loss": 2.2417, + "step": 11010 + }, + { + "epoch": 0.5907188841201717, + "grad_norm": 0.44921875, + "learning_rate": 4.898819194140845e-06, + "loss": 1.7658, + "step": 11011 + }, + { + "epoch": 0.5907725321888412, + "grad_norm": 0.5625, + "learning_rate": 4.898794727028921e-06, + "loss": 2.4206, + "step": 11012 + }, + { + "epoch": 0.5908261802575108, + "grad_norm": 0.4140625, + "learning_rate": 4.898770257020201e-06, + "loss": 2.1721, + "step": 11013 + }, + { + "epoch": 0.5908798283261802, + "grad_norm": 0.4375, + "learning_rate": 4.898745784114719e-06, + "loss": 2.2398, + "step": 11014 + }, + { + "epoch": 0.5909334763948498, + "grad_norm": 0.4609375, + "learning_rate": 4.898721308312503e-06, + "loss": 2.4761, + "step": 11015 + }, + { + "epoch": 0.5909871244635193, + "grad_norm": 0.447265625, + "learning_rate": 4.898696829613581e-06, + "loss": 2.2362, + "step": 11016 + }, + { + "epoch": 0.5910407725321889, + "grad_norm": 0.4453125, + "learning_rate": 4.8986723480179845e-06, + "loss": 2.4892, + "step": 11017 + }, + { + "epoch": 0.5910944206008584, + "grad_norm": 0.43359375, + "learning_rate": 4.898647863525742e-06, + "loss": 2.3713, + "step": 11018 + }, + { + "epoch": 0.5911480686695278, + "grad_norm": 0.37890625, + "learning_rate": 4.898623376136883e-06, + "loss": 2.2131, + "step": 11019 + }, + { + "epoch": 0.5912017167381974, + "grad_norm": 0.4375, + "learning_rate": 4.898598885851437e-06, + "loss": 2.152, + "step": 11020 + }, + { + "epoch": 0.5912553648068669, + "grad_norm": 0.40625, + "learning_rate": 4.898574392669435e-06, + "loss": 2.2892, + "step": 11021 + }, + { + "epoch": 0.5913090128755365, + "grad_norm": 0.47265625, + "learning_rate": 4.898549896590905e-06, + "loss": 2.2938, + "step": 11022 + }, + { + "epoch": 0.591362660944206, + "grad_norm": 0.40625, + "learning_rate": 4.898525397615877e-06, + "loss": 2.5107, + "step": 11023 + }, + { + "epoch": 0.5914163090128756, + "grad_norm": 0.361328125, + "learning_rate": 4.898500895744381e-06, + "loss": 2.282, + "step": 11024 + }, + { + "epoch": 0.591469957081545, + "grad_norm": 1.5546875, + "learning_rate": 4.898476390976447e-06, + "loss": 2.257, + "step": 11025 + }, + { + "epoch": 0.5915236051502146, + "grad_norm": 0.392578125, + "learning_rate": 4.898451883312103e-06, + "loss": 2.2444, + "step": 11026 + }, + { + "epoch": 0.5915772532188841, + "grad_norm": 0.50390625, + "learning_rate": 4.89842737275138e-06, + "loss": 2.4575, + "step": 11027 + }, + { + "epoch": 0.5916309012875537, + "grad_norm": 0.41796875, + "learning_rate": 4.898402859294307e-06, + "loss": 2.3446, + "step": 11028 + }, + { + "epoch": 0.5916845493562232, + "grad_norm": 0.396484375, + "learning_rate": 4.898378342940914e-06, + "loss": 2.5588, + "step": 11029 + }, + { + "epoch": 0.5917381974248928, + "grad_norm": 0.44140625, + "learning_rate": 4.89835382369123e-06, + "loss": 2.3513, + "step": 11030 + }, + { + "epoch": 0.5917918454935622, + "grad_norm": 0.75, + "learning_rate": 4.898329301545285e-06, + "loss": 2.412, + "step": 11031 + }, + { + "epoch": 0.5918454935622317, + "grad_norm": 0.369140625, + "learning_rate": 4.8983047765031085e-06, + "loss": 2.1419, + "step": 11032 + }, + { + "epoch": 0.5918991416309013, + "grad_norm": 0.498046875, + "learning_rate": 4.898280248564731e-06, + "loss": 2.3432, + "step": 11033 + }, + { + "epoch": 0.5919527896995708, + "grad_norm": 0.3984375, + "learning_rate": 4.89825571773018e-06, + "loss": 2.321, + "step": 11034 + }, + { + "epoch": 0.5920064377682404, + "grad_norm": 0.462890625, + "learning_rate": 4.898231183999487e-06, + "loss": 2.3956, + "step": 11035 + }, + { + "epoch": 0.5920600858369098, + "grad_norm": 0.478515625, + "learning_rate": 4.898206647372681e-06, + "loss": 2.4456, + "step": 11036 + }, + { + "epoch": 0.5921137339055794, + "grad_norm": 0.4140625, + "learning_rate": 4.898182107849791e-06, + "loss": 2.3531, + "step": 11037 + }, + { + "epoch": 0.5921673819742489, + "grad_norm": 0.494140625, + "learning_rate": 4.898157565430848e-06, + "loss": 2.115, + "step": 11038 + }, + { + "epoch": 0.5922210300429185, + "grad_norm": 0.470703125, + "learning_rate": 4.8981330201158805e-06, + "loss": 2.2907, + "step": 11039 + }, + { + "epoch": 0.592274678111588, + "grad_norm": 0.388671875, + "learning_rate": 4.89810847190492e-06, + "loss": 2.0687, + "step": 11040 + }, + { + "epoch": 0.5923283261802575, + "grad_norm": 0.48046875, + "learning_rate": 4.898083920797993e-06, + "loss": 1.9469, + "step": 11041 + }, + { + "epoch": 0.592381974248927, + "grad_norm": 0.43359375, + "learning_rate": 4.898059366795132e-06, + "loss": 2.3471, + "step": 11042 + }, + { + "epoch": 0.5924356223175966, + "grad_norm": 0.431640625, + "learning_rate": 4.898034809896365e-06, + "loss": 2.34, + "step": 11043 + }, + { + "epoch": 0.5924892703862661, + "grad_norm": 0.9296875, + "learning_rate": 4.898010250101722e-06, + "loss": 2.4549, + "step": 11044 + }, + { + "epoch": 0.5925429184549357, + "grad_norm": 0.482421875, + "learning_rate": 4.8979856874112334e-06, + "loss": 1.8809, + "step": 11045 + }, + { + "epoch": 0.5925965665236052, + "grad_norm": 0.4765625, + "learning_rate": 4.897961121824927e-06, + "loss": 2.3006, + "step": 11046 + }, + { + "epoch": 0.5926502145922746, + "grad_norm": 0.4921875, + "learning_rate": 4.897936553342835e-06, + "loss": 2.4099, + "step": 11047 + }, + { + "epoch": 0.5927038626609442, + "grad_norm": 0.447265625, + "learning_rate": 4.897911981964986e-06, + "loss": 2.5023, + "step": 11048 + }, + { + "epoch": 0.5927575107296137, + "grad_norm": 0.453125, + "learning_rate": 4.897887407691408e-06, + "loss": 2.4582, + "step": 11049 + }, + { + "epoch": 0.5928111587982833, + "grad_norm": 0.45703125, + "learning_rate": 4.897862830522133e-06, + "loss": 2.3364, + "step": 11050 + }, + { + "epoch": 0.5928648068669528, + "grad_norm": 0.369140625, + "learning_rate": 4.897838250457191e-06, + "loss": 2.0391, + "step": 11051 + }, + { + "epoch": 0.5929184549356223, + "grad_norm": 0.38671875, + "learning_rate": 4.897813667496609e-06, + "loss": 2.175, + "step": 11052 + }, + { + "epoch": 0.5929721030042918, + "grad_norm": 0.416015625, + "learning_rate": 4.897789081640419e-06, + "loss": 2.2579, + "step": 11053 + }, + { + "epoch": 0.5930257510729614, + "grad_norm": 0.451171875, + "learning_rate": 4.8977644928886505e-06, + "loss": 2.3753, + "step": 11054 + }, + { + "epoch": 0.5930793991416309, + "grad_norm": 0.4921875, + "learning_rate": 4.897739901241331e-06, + "loss": 2.3342, + "step": 11055 + }, + { + "epoch": 0.5931330472103005, + "grad_norm": 0.466796875, + "learning_rate": 4.897715306698493e-06, + "loss": 2.2662, + "step": 11056 + }, + { + "epoch": 0.5931866952789699, + "grad_norm": 0.421875, + "learning_rate": 4.897690709260164e-06, + "loss": 2.2789, + "step": 11057 + }, + { + "epoch": 0.5932403433476395, + "grad_norm": 0.35546875, + "learning_rate": 4.897666108926376e-06, + "loss": 2.0801, + "step": 11058 + }, + { + "epoch": 0.593293991416309, + "grad_norm": 0.443359375, + "learning_rate": 4.897641505697157e-06, + "loss": 2.2711, + "step": 11059 + }, + { + "epoch": 0.5933476394849786, + "grad_norm": 0.431640625, + "learning_rate": 4.8976168995725374e-06, + "loss": 2.4076, + "step": 11060 + }, + { + "epoch": 0.5934012875536481, + "grad_norm": 0.44140625, + "learning_rate": 4.897592290552545e-06, + "loss": 2.2625, + "step": 11061 + }, + { + "epoch": 0.5934549356223175, + "grad_norm": 0.5, + "learning_rate": 4.897567678637213e-06, + "loss": 2.5009, + "step": 11062 + }, + { + "epoch": 0.5935085836909871, + "grad_norm": 0.5078125, + "learning_rate": 4.897543063826569e-06, + "loss": 2.3784, + "step": 11063 + }, + { + "epoch": 0.5935622317596566, + "grad_norm": 0.486328125, + "learning_rate": 4.897518446120642e-06, + "loss": 2.0313, + "step": 11064 + }, + { + "epoch": 0.5936158798283262, + "grad_norm": 0.439453125, + "learning_rate": 4.897493825519463e-06, + "loss": 2.2931, + "step": 11065 + }, + { + "epoch": 0.5936695278969957, + "grad_norm": 0.62890625, + "learning_rate": 4.897469202023063e-06, + "loss": 2.3455, + "step": 11066 + }, + { + "epoch": 0.5937231759656653, + "grad_norm": 0.466796875, + "learning_rate": 4.8974445756314685e-06, + "loss": 2.1597, + "step": 11067 + }, + { + "epoch": 0.5937768240343347, + "grad_norm": 0.451171875, + "learning_rate": 4.897419946344711e-06, + "loss": 2.2541, + "step": 11068 + }, + { + "epoch": 0.5938304721030043, + "grad_norm": 0.408203125, + "learning_rate": 4.897395314162821e-06, + "loss": 2.2881, + "step": 11069 + }, + { + "epoch": 0.5938841201716738, + "grad_norm": 0.46484375, + "learning_rate": 4.897370679085828e-06, + "loss": 2.3663, + "step": 11070 + }, + { + "epoch": 0.5939377682403434, + "grad_norm": 0.46484375, + "learning_rate": 4.89734604111376e-06, + "loss": 2.2416, + "step": 11071 + }, + { + "epoch": 0.5939914163090129, + "grad_norm": 0.404296875, + "learning_rate": 4.897321400246649e-06, + "loss": 2.3732, + "step": 11072 + }, + { + "epoch": 0.5940450643776825, + "grad_norm": 0.5703125, + "learning_rate": 4.897296756484522e-06, + "loss": 2.2114, + "step": 11073 + }, + { + "epoch": 0.5940987124463519, + "grad_norm": 0.3359375, + "learning_rate": 4.8972721098274115e-06, + "loss": 2.0546, + "step": 11074 + }, + { + "epoch": 0.5941523605150214, + "grad_norm": 0.46484375, + "learning_rate": 4.897247460275346e-06, + "loss": 2.3405, + "step": 11075 + }, + { + "epoch": 0.594206008583691, + "grad_norm": 0.43359375, + "learning_rate": 4.897222807828356e-06, + "loss": 2.4531, + "step": 11076 + }, + { + "epoch": 0.5942596566523605, + "grad_norm": 0.427734375, + "learning_rate": 4.89719815248647e-06, + "loss": 2.2618, + "step": 11077 + }, + { + "epoch": 0.5943133047210301, + "grad_norm": 0.52734375, + "learning_rate": 4.897173494249719e-06, + "loss": 2.3391, + "step": 11078 + }, + { + "epoch": 0.5943669527896995, + "grad_norm": 0.421875, + "learning_rate": 4.897148833118133e-06, + "loss": 2.5782, + "step": 11079 + }, + { + "epoch": 0.5944206008583691, + "grad_norm": 0.4140625, + "learning_rate": 4.89712416909174e-06, + "loss": 2.4893, + "step": 11080 + }, + { + "epoch": 0.5944742489270386, + "grad_norm": 0.439453125, + "learning_rate": 4.897099502170571e-06, + "loss": 1.4792, + "step": 11081 + }, + { + "epoch": 0.5945278969957082, + "grad_norm": 0.423828125, + "learning_rate": 4.8970748323546555e-06, + "loss": 2.4827, + "step": 11082 + }, + { + "epoch": 0.5945815450643777, + "grad_norm": 0.435546875, + "learning_rate": 4.897050159644024e-06, + "loss": 2.0886, + "step": 11083 + }, + { + "epoch": 0.5946351931330472, + "grad_norm": 0.4453125, + "learning_rate": 4.897025484038706e-06, + "loss": 2.3112, + "step": 11084 + }, + { + "epoch": 0.5946888412017167, + "grad_norm": 0.359375, + "learning_rate": 4.89700080553873e-06, + "loss": 2.0947, + "step": 11085 + }, + { + "epoch": 0.5947424892703863, + "grad_norm": 0.3984375, + "learning_rate": 4.896976124144127e-06, + "loss": 2.4048, + "step": 11086 + }, + { + "epoch": 0.5947961373390558, + "grad_norm": 0.57421875, + "learning_rate": 4.896951439854927e-06, + "loss": 2.3467, + "step": 11087 + }, + { + "epoch": 0.5948497854077254, + "grad_norm": 0.40625, + "learning_rate": 4.89692675267116e-06, + "loss": 2.2235, + "step": 11088 + }, + { + "epoch": 0.5949034334763948, + "grad_norm": 0.482421875, + "learning_rate": 4.896902062592854e-06, + "loss": 2.3017, + "step": 11089 + }, + { + "epoch": 0.5949570815450643, + "grad_norm": 0.435546875, + "learning_rate": 4.896877369620041e-06, + "loss": 2.0172, + "step": 11090 + }, + { + "epoch": 0.5950107296137339, + "grad_norm": 1.265625, + "learning_rate": 4.89685267375275e-06, + "loss": 1.5996, + "step": 11091 + }, + { + "epoch": 0.5950643776824034, + "grad_norm": 0.431640625, + "learning_rate": 4.896827974991011e-06, + "loss": 2.3539, + "step": 11092 + }, + { + "epoch": 0.595118025751073, + "grad_norm": 0.404296875, + "learning_rate": 4.896803273334852e-06, + "loss": 2.1274, + "step": 11093 + }, + { + "epoch": 0.5951716738197425, + "grad_norm": 0.49609375, + "learning_rate": 4.896778568784306e-06, + "loss": 2.3396, + "step": 11094 + }, + { + "epoch": 0.595225321888412, + "grad_norm": 0.515625, + "learning_rate": 4.896753861339401e-06, + "loss": 2.5052, + "step": 11095 + }, + { + "epoch": 0.5952789699570815, + "grad_norm": 0.416015625, + "learning_rate": 4.8967291510001665e-06, + "loss": 2.2696, + "step": 11096 + }, + { + "epoch": 0.5953326180257511, + "grad_norm": 0.451171875, + "learning_rate": 4.8967044377666325e-06, + "loss": 1.8538, + "step": 11097 + }, + { + "epoch": 0.5953862660944206, + "grad_norm": 0.388671875, + "learning_rate": 4.896679721638831e-06, + "loss": 2.2579, + "step": 11098 + }, + { + "epoch": 0.5954399141630902, + "grad_norm": 0.46484375, + "learning_rate": 4.896655002616788e-06, + "loss": 2.5381, + "step": 11099 + }, + { + "epoch": 0.5954935622317596, + "grad_norm": 0.55078125, + "learning_rate": 4.896630280700537e-06, + "loss": 2.2556, + "step": 11100 + }, + { + "epoch": 0.5955472103004292, + "grad_norm": 0.423828125, + "learning_rate": 4.896605555890105e-06, + "loss": 2.1523, + "step": 11101 + }, + { + "epoch": 0.5956008583690987, + "grad_norm": 0.41796875, + "learning_rate": 4.896580828185524e-06, + "loss": 2.2024, + "step": 11102 + }, + { + "epoch": 0.5956545064377683, + "grad_norm": 0.4453125, + "learning_rate": 4.896556097586823e-06, + "loss": 2.2758, + "step": 11103 + }, + { + "epoch": 0.5957081545064378, + "grad_norm": 0.5078125, + "learning_rate": 4.896531364094031e-06, + "loss": 2.1881, + "step": 11104 + }, + { + "epoch": 0.5957618025751072, + "grad_norm": 0.40234375, + "learning_rate": 4.89650662770718e-06, + "loss": 2.3213, + "step": 11105 + }, + { + "epoch": 0.5958154506437768, + "grad_norm": 0.421875, + "learning_rate": 4.896481888426298e-06, + "loss": 2.3119, + "step": 11106 + }, + { + "epoch": 0.5958690987124463, + "grad_norm": 0.3515625, + "learning_rate": 4.896457146251416e-06, + "loss": 2.2785, + "step": 11107 + }, + { + "epoch": 0.5959227467811159, + "grad_norm": 0.439453125, + "learning_rate": 4.896432401182562e-06, + "loss": 2.2833, + "step": 11108 + }, + { + "epoch": 0.5959763948497854, + "grad_norm": 0.4296875, + "learning_rate": 4.8964076532197694e-06, + "loss": 2.2464, + "step": 11109 + }, + { + "epoch": 0.596030042918455, + "grad_norm": 0.4375, + "learning_rate": 4.896382902363064e-06, + "loss": 1.9754, + "step": 11110 + }, + { + "epoch": 0.5960836909871244, + "grad_norm": 0.421875, + "learning_rate": 4.896358148612479e-06, + "loss": 2.364, + "step": 11111 + }, + { + "epoch": 0.596137339055794, + "grad_norm": 0.478515625, + "learning_rate": 4.896333391968042e-06, + "loss": 2.2628, + "step": 11112 + }, + { + "epoch": 0.5961909871244635, + "grad_norm": 0.421875, + "learning_rate": 4.896308632429784e-06, + "loss": 2.2699, + "step": 11113 + }, + { + "epoch": 0.5962446351931331, + "grad_norm": 1.1171875, + "learning_rate": 4.896283869997734e-06, + "loss": 2.3059, + "step": 11114 + }, + { + "epoch": 0.5962982832618026, + "grad_norm": 0.404296875, + "learning_rate": 4.896259104671924e-06, + "loss": 2.2203, + "step": 11115 + }, + { + "epoch": 0.5963519313304722, + "grad_norm": 0.470703125, + "learning_rate": 4.896234336452382e-06, + "loss": 1.7437, + "step": 11116 + }, + { + "epoch": 0.5964055793991416, + "grad_norm": 0.6484375, + "learning_rate": 4.896209565339138e-06, + "loss": 2.0526, + "step": 11117 + }, + { + "epoch": 0.5964592274678111, + "grad_norm": 0.42578125, + "learning_rate": 4.896184791332223e-06, + "loss": 2.3444, + "step": 11118 + }, + { + "epoch": 0.5965128755364807, + "grad_norm": 0.47265625, + "learning_rate": 4.896160014431666e-06, + "loss": 2.3665, + "step": 11119 + }, + { + "epoch": 0.5965665236051502, + "grad_norm": 0.41796875, + "learning_rate": 4.896135234637497e-06, + "loss": 2.2989, + "step": 11120 + }, + { + "epoch": 0.5966201716738198, + "grad_norm": 0.482421875, + "learning_rate": 4.896110451949746e-06, + "loss": 2.3324, + "step": 11121 + }, + { + "epoch": 0.5966738197424892, + "grad_norm": 0.38671875, + "learning_rate": 4.896085666368444e-06, + "loss": 1.7687, + "step": 11122 + }, + { + "epoch": 0.5967274678111588, + "grad_norm": 0.455078125, + "learning_rate": 4.896060877893619e-06, + "loss": 2.2109, + "step": 11123 + }, + { + "epoch": 0.5967811158798283, + "grad_norm": 0.43359375, + "learning_rate": 4.896036086525302e-06, + "loss": 2.3134, + "step": 11124 + }, + { + "epoch": 0.5968347639484979, + "grad_norm": 0.404296875, + "learning_rate": 4.896011292263523e-06, + "loss": 2.3695, + "step": 11125 + }, + { + "epoch": 0.5968884120171674, + "grad_norm": 0.478515625, + "learning_rate": 4.895986495108313e-06, + "loss": 2.3994, + "step": 11126 + }, + { + "epoch": 0.596942060085837, + "grad_norm": 0.40234375, + "learning_rate": 4.895961695059699e-06, + "loss": 2.2887, + "step": 11127 + }, + { + "epoch": 0.5969957081545064, + "grad_norm": 0.396484375, + "learning_rate": 4.8959368921177134e-06, + "loss": 2.0581, + "step": 11128 + }, + { + "epoch": 0.597049356223176, + "grad_norm": 0.5, + "learning_rate": 4.895912086282385e-06, + "loss": 2.1444, + "step": 11129 + }, + { + "epoch": 0.5971030042918455, + "grad_norm": 0.5234375, + "learning_rate": 4.895887277553744e-06, + "loss": 2.3117, + "step": 11130 + }, + { + "epoch": 0.5971566523605151, + "grad_norm": 0.58203125, + "learning_rate": 4.895862465931821e-06, + "loss": 2.2634, + "step": 11131 + }, + { + "epoch": 0.5972103004291845, + "grad_norm": 0.4296875, + "learning_rate": 4.8958376514166454e-06, + "loss": 2.222, + "step": 11132 + }, + { + "epoch": 0.597263948497854, + "grad_norm": 0.734375, + "learning_rate": 4.895812834008248e-06, + "loss": 2.183, + "step": 11133 + }, + { + "epoch": 0.5973175965665236, + "grad_norm": 0.43359375, + "learning_rate": 4.895788013706657e-06, + "loss": 2.3269, + "step": 11134 + }, + { + "epoch": 0.5973712446351931, + "grad_norm": 0.4453125, + "learning_rate": 4.895763190511904e-06, + "loss": 2.3474, + "step": 11135 + }, + { + "epoch": 0.5974248927038627, + "grad_norm": 0.447265625, + "learning_rate": 4.895738364424018e-06, + "loss": 2.5157, + "step": 11136 + }, + { + "epoch": 0.5974785407725322, + "grad_norm": 1.1015625, + "learning_rate": 4.89571353544303e-06, + "loss": 2.2486, + "step": 11137 + }, + { + "epoch": 0.5975321888412017, + "grad_norm": 0.486328125, + "learning_rate": 4.895688703568968e-06, + "loss": 2.3395, + "step": 11138 + }, + { + "epoch": 0.5975858369098712, + "grad_norm": 0.345703125, + "learning_rate": 4.895663868801865e-06, + "loss": 2.2868, + "step": 11139 + }, + { + "epoch": 0.5976394849785408, + "grad_norm": 0.5078125, + "learning_rate": 4.8956390311417484e-06, + "loss": 2.4576, + "step": 11140 + }, + { + "epoch": 0.5976931330472103, + "grad_norm": 0.46484375, + "learning_rate": 4.895614190588649e-06, + "loss": 2.6141, + "step": 11141 + }, + { + "epoch": 0.5977467811158799, + "grad_norm": 0.3984375, + "learning_rate": 4.895589347142598e-06, + "loss": 2.2649, + "step": 11142 + }, + { + "epoch": 0.5978004291845493, + "grad_norm": 0.37890625, + "learning_rate": 4.895564500803623e-06, + "loss": 2.1089, + "step": 11143 + }, + { + "epoch": 0.5978540772532189, + "grad_norm": 1.203125, + "learning_rate": 4.8955396515717565e-06, + "loss": 2.2504, + "step": 11144 + }, + { + "epoch": 0.5979077253218884, + "grad_norm": 0.453125, + "learning_rate": 4.895514799447027e-06, + "loss": 2.3383, + "step": 11145 + }, + { + "epoch": 0.597961373390558, + "grad_norm": 1.3125, + "learning_rate": 4.895489944429464e-06, + "loss": 2.2216, + "step": 11146 + }, + { + "epoch": 0.5980150214592275, + "grad_norm": 0.451171875, + "learning_rate": 4.895465086519099e-06, + "loss": 2.4352, + "step": 11147 + }, + { + "epoch": 0.598068669527897, + "grad_norm": 0.46875, + "learning_rate": 4.895440225715962e-06, + "loss": 2.3533, + "step": 11148 + }, + { + "epoch": 0.5981223175965665, + "grad_norm": 0.447265625, + "learning_rate": 4.895415362020081e-06, + "loss": 2.2794, + "step": 11149 + }, + { + "epoch": 0.598175965665236, + "grad_norm": 0.498046875, + "learning_rate": 4.8953904954314884e-06, + "loss": 2.3095, + "step": 11150 + }, + { + "epoch": 0.5982296137339056, + "grad_norm": 0.484375, + "learning_rate": 4.895365625950213e-06, + "loss": 2.4953, + "step": 11151 + }, + { + "epoch": 0.5982832618025751, + "grad_norm": 0.466796875, + "learning_rate": 4.895340753576285e-06, + "loss": 2.056, + "step": 11152 + }, + { + "epoch": 0.5983369098712447, + "grad_norm": 0.478515625, + "learning_rate": 4.895315878309735e-06, + "loss": 2.4469, + "step": 11153 + }, + { + "epoch": 0.5983905579399141, + "grad_norm": 0.484375, + "learning_rate": 4.895291000150592e-06, + "loss": 2.2472, + "step": 11154 + }, + { + "epoch": 0.5984442060085837, + "grad_norm": 0.36328125, + "learning_rate": 4.895266119098887e-06, + "loss": 1.8533, + "step": 11155 + }, + { + "epoch": 0.5984978540772532, + "grad_norm": 0.390625, + "learning_rate": 4.8952412351546495e-06, + "loss": 2.5247, + "step": 11156 + }, + { + "epoch": 0.5985515021459228, + "grad_norm": 0.4765625, + "learning_rate": 4.89521634831791e-06, + "loss": 2.1414, + "step": 11157 + }, + { + "epoch": 0.5986051502145923, + "grad_norm": 0.4375, + "learning_rate": 4.8951914585886975e-06, + "loss": 2.274, + "step": 11158 + }, + { + "epoch": 0.5986587982832619, + "grad_norm": 0.40234375, + "learning_rate": 4.895166565967043e-06, + "loss": 2.1669, + "step": 11159 + }, + { + "epoch": 0.5987124463519313, + "grad_norm": 0.36328125, + "learning_rate": 4.895141670452978e-06, + "loss": 2.2726, + "step": 11160 + }, + { + "epoch": 0.5987660944206008, + "grad_norm": 0.439453125, + "learning_rate": 4.895116772046529e-06, + "loss": 2.1808, + "step": 11161 + }, + { + "epoch": 0.5988197424892704, + "grad_norm": 0.5703125, + "learning_rate": 4.895091870747729e-06, + "loss": 1.9405, + "step": 11162 + }, + { + "epoch": 0.5988733905579399, + "grad_norm": 0.462890625, + "learning_rate": 4.895066966556606e-06, + "loss": 2.3217, + "step": 11163 + }, + { + "epoch": 0.5989270386266095, + "grad_norm": 0.47265625, + "learning_rate": 4.895042059473192e-06, + "loss": 2.1939, + "step": 11164 + }, + { + "epoch": 0.5989806866952789, + "grad_norm": 0.458984375, + "learning_rate": 4.895017149497517e-06, + "loss": 2.4489, + "step": 11165 + }, + { + "epoch": 0.5990343347639485, + "grad_norm": 0.43359375, + "learning_rate": 4.894992236629609e-06, + "loss": 2.4781, + "step": 11166 + }, + { + "epoch": 0.599087982832618, + "grad_norm": 0.3984375, + "learning_rate": 4.894967320869499e-06, + "loss": 2.3382, + "step": 11167 + }, + { + "epoch": 0.5991416309012876, + "grad_norm": 0.42578125, + "learning_rate": 4.894942402217219e-06, + "loss": 2.2221, + "step": 11168 + }, + { + "epoch": 0.5991952789699571, + "grad_norm": 0.376953125, + "learning_rate": 4.894917480672798e-06, + "loss": 2.3964, + "step": 11169 + }, + { + "epoch": 0.5992489270386266, + "grad_norm": 0.38671875, + "learning_rate": 4.894892556236264e-06, + "loss": 2.4411, + "step": 11170 + }, + { + "epoch": 0.5993025751072961, + "grad_norm": 0.43359375, + "learning_rate": 4.894867628907649e-06, + "loss": 2.2089, + "step": 11171 + }, + { + "epoch": 0.5993562231759657, + "grad_norm": 0.4453125, + "learning_rate": 4.894842698686983e-06, + "loss": 2.0028, + "step": 11172 + }, + { + "epoch": 0.5994098712446352, + "grad_norm": 0.443359375, + "learning_rate": 4.894817765574296e-06, + "loss": 2.2447, + "step": 11173 + }, + { + "epoch": 0.5994635193133048, + "grad_norm": 0.470703125, + "learning_rate": 4.894792829569618e-06, + "loss": 2.262, + "step": 11174 + }, + { + "epoch": 0.5995171673819742, + "grad_norm": 0.45703125, + "learning_rate": 4.89476789067298e-06, + "loss": 2.3511, + "step": 11175 + }, + { + "epoch": 0.5995708154506437, + "grad_norm": 0.482421875, + "learning_rate": 4.894742948884411e-06, + "loss": 2.271, + "step": 11176 + }, + { + "epoch": 0.5996244635193133, + "grad_norm": 0.515625, + "learning_rate": 4.894718004203941e-06, + "loss": 2.3631, + "step": 11177 + }, + { + "epoch": 0.5996781115879828, + "grad_norm": 0.369140625, + "learning_rate": 4.894693056631601e-06, + "loss": 1.9559, + "step": 11178 + }, + { + "epoch": 0.5997317596566524, + "grad_norm": 0.494140625, + "learning_rate": 4.89466810616742e-06, + "loss": 2.2944, + "step": 11179 + }, + { + "epoch": 0.5997854077253219, + "grad_norm": 0.400390625, + "learning_rate": 4.89464315281143e-06, + "loss": 2.1204, + "step": 11180 + }, + { + "epoch": 0.5998390557939914, + "grad_norm": 0.486328125, + "learning_rate": 4.894618196563659e-06, + "loss": 2.0028, + "step": 11181 + }, + { + "epoch": 0.5998927038626609, + "grad_norm": 0.478515625, + "learning_rate": 4.894593237424139e-06, + "loss": 2.2674, + "step": 11182 + }, + { + "epoch": 0.5999463519313305, + "grad_norm": 0.5, + "learning_rate": 4.894568275392898e-06, + "loss": 2.2006, + "step": 11183 + }, + { + "epoch": 0.6, + "grad_norm": 0.36328125, + "learning_rate": 4.894543310469968e-06, + "loss": 2.2823, + "step": 11184 + }, + { + "epoch": 0.6000536480686696, + "grad_norm": 0.6328125, + "learning_rate": 4.894518342655379e-06, + "loss": 2.4638, + "step": 11185 + }, + { + "epoch": 0.600107296137339, + "grad_norm": 0.486328125, + "learning_rate": 4.89449337194916e-06, + "loss": 2.4272, + "step": 11186 + }, + { + "epoch": 0.6001609442060086, + "grad_norm": 0.50390625, + "learning_rate": 4.894468398351342e-06, + "loss": 2.1525, + "step": 11187 + }, + { + "epoch": 0.6002145922746781, + "grad_norm": 1.359375, + "learning_rate": 4.894443421861955e-06, + "loss": 1.3858, + "step": 11188 + }, + { + "epoch": 0.6002682403433477, + "grad_norm": 0.515625, + "learning_rate": 4.894418442481029e-06, + "loss": 2.2738, + "step": 11189 + }, + { + "epoch": 0.6003218884120172, + "grad_norm": 0.423828125, + "learning_rate": 4.894393460208594e-06, + "loss": 2.1792, + "step": 11190 + }, + { + "epoch": 0.6003755364806866, + "grad_norm": 0.455078125, + "learning_rate": 4.894368475044682e-06, + "loss": 2.3577, + "step": 11191 + }, + { + "epoch": 0.6004291845493562, + "grad_norm": 0.53125, + "learning_rate": 4.89434348698932e-06, + "loss": 2.5142, + "step": 11192 + }, + { + "epoch": 0.6004828326180257, + "grad_norm": 0.3984375, + "learning_rate": 4.894318496042541e-06, + "loss": 2.3198, + "step": 11193 + }, + { + "epoch": 0.6005364806866953, + "grad_norm": 0.3984375, + "learning_rate": 4.894293502204373e-06, + "loss": 2.3249, + "step": 11194 + }, + { + "epoch": 0.6005901287553648, + "grad_norm": 0.45703125, + "learning_rate": 4.894268505474848e-06, + "loss": 2.2141, + "step": 11195 + }, + { + "epoch": 0.6006437768240344, + "grad_norm": 0.4140625, + "learning_rate": 4.894243505853995e-06, + "loss": 2.2986, + "step": 11196 + }, + { + "epoch": 0.6006974248927038, + "grad_norm": 0.5, + "learning_rate": 4.894218503341844e-06, + "loss": 2.4275, + "step": 11197 + }, + { + "epoch": 0.6007510729613734, + "grad_norm": 0.46484375, + "learning_rate": 4.894193497938426e-06, + "loss": 1.4809, + "step": 11198 + }, + { + "epoch": 0.6008047210300429, + "grad_norm": 0.43359375, + "learning_rate": 4.894168489643772e-06, + "loss": 2.4346, + "step": 11199 + }, + { + "epoch": 0.6008583690987125, + "grad_norm": 1.1171875, + "learning_rate": 4.894143478457911e-06, + "loss": 2.1724, + "step": 11200 + }, + { + "epoch": 0.600912017167382, + "grad_norm": 0.375, + "learning_rate": 4.894118464380873e-06, + "loss": 2.0122, + "step": 11201 + }, + { + "epoch": 0.6009656652360515, + "grad_norm": 0.470703125, + "learning_rate": 4.894093447412688e-06, + "loss": 2.3644, + "step": 11202 + }, + { + "epoch": 0.601019313304721, + "grad_norm": 0.40625, + "learning_rate": 4.894068427553386e-06, + "loss": 2.2053, + "step": 11203 + }, + { + "epoch": 0.6010729613733906, + "grad_norm": 0.703125, + "learning_rate": 4.894043404803e-06, + "loss": 2.1871, + "step": 11204 + }, + { + "epoch": 0.6011266094420601, + "grad_norm": 0.55078125, + "learning_rate": 4.894018379161558e-06, + "loss": 2.5728, + "step": 11205 + }, + { + "epoch": 0.6011802575107296, + "grad_norm": 0.439453125, + "learning_rate": 4.8939933506290896e-06, + "loss": 2.258, + "step": 11206 + }, + { + "epoch": 0.6012339055793992, + "grad_norm": 0.4375, + "learning_rate": 4.893968319205627e-06, + "loss": 2.2607, + "step": 11207 + }, + { + "epoch": 0.6012875536480686, + "grad_norm": 0.47265625, + "learning_rate": 4.893943284891197e-06, + "loss": 1.997, + "step": 11208 + }, + { + "epoch": 0.6013412017167382, + "grad_norm": 0.453125, + "learning_rate": 4.893918247685834e-06, + "loss": 2.295, + "step": 11209 + }, + { + "epoch": 0.6013948497854077, + "grad_norm": 0.455078125, + "learning_rate": 4.8938932075895666e-06, + "loss": 2.0959, + "step": 11210 + }, + { + "epoch": 0.6014484978540773, + "grad_norm": 0.44921875, + "learning_rate": 4.893868164602423e-06, + "loss": 2.2101, + "step": 11211 + }, + { + "epoch": 0.6015021459227468, + "grad_norm": 0.439453125, + "learning_rate": 4.893843118724436e-06, + "loss": 2.2857, + "step": 11212 + }, + { + "epoch": 0.6015557939914163, + "grad_norm": 0.462890625, + "learning_rate": 4.893818069955636e-06, + "loss": 2.379, + "step": 11213 + }, + { + "epoch": 0.6016094420600858, + "grad_norm": 0.5078125, + "learning_rate": 4.893793018296051e-06, + "loss": 2.3547, + "step": 11214 + }, + { + "epoch": 0.6016630901287554, + "grad_norm": 0.40625, + "learning_rate": 4.893767963745714e-06, + "loss": 2.3785, + "step": 11215 + }, + { + "epoch": 0.6017167381974249, + "grad_norm": 0.416015625, + "learning_rate": 4.893742906304653e-06, + "loss": 2.2548, + "step": 11216 + }, + { + "epoch": 0.6017703862660945, + "grad_norm": 0.47265625, + "learning_rate": 4.893717845972899e-06, + "loss": 2.3196, + "step": 11217 + }, + { + "epoch": 0.601824034334764, + "grad_norm": 0.5, + "learning_rate": 4.893692782750484e-06, + "loss": 1.6778, + "step": 11218 + }, + { + "epoch": 0.6018776824034334, + "grad_norm": 0.435546875, + "learning_rate": 4.893667716637434e-06, + "loss": 2.2743, + "step": 11219 + }, + { + "epoch": 0.601931330472103, + "grad_norm": 0.404296875, + "learning_rate": 4.8936426476337825e-06, + "loss": 2.1392, + "step": 11220 + }, + { + "epoch": 0.6019849785407725, + "grad_norm": 0.490234375, + "learning_rate": 4.8936175757395605e-06, + "loss": 2.3903, + "step": 11221 + }, + { + "epoch": 0.6020386266094421, + "grad_norm": 0.3984375, + "learning_rate": 4.893592500954795e-06, + "loss": 2.2524, + "step": 11222 + }, + { + "epoch": 0.6020922746781115, + "grad_norm": 0.443359375, + "learning_rate": 4.89356742327952e-06, + "loss": 2.4361, + "step": 11223 + }, + { + "epoch": 0.6021459227467811, + "grad_norm": 0.42578125, + "learning_rate": 4.893542342713764e-06, + "loss": 2.2623, + "step": 11224 + }, + { + "epoch": 0.6021995708154506, + "grad_norm": 5.0625, + "learning_rate": 4.893517259257556e-06, + "loss": 2.2346, + "step": 11225 + }, + { + "epoch": 0.6022532188841202, + "grad_norm": 0.484375, + "learning_rate": 4.893492172910929e-06, + "loss": 2.5183, + "step": 11226 + }, + { + "epoch": 0.6023068669527897, + "grad_norm": 0.4609375, + "learning_rate": 4.893467083673911e-06, + "loss": 2.3623, + "step": 11227 + }, + { + "epoch": 0.6023605150214593, + "grad_norm": 0.408203125, + "learning_rate": 4.893441991546534e-06, + "loss": 2.3038, + "step": 11228 + }, + { + "epoch": 0.6024141630901287, + "grad_norm": 0.46875, + "learning_rate": 4.893416896528826e-06, + "loss": 2.3718, + "step": 11229 + }, + { + "epoch": 0.6024678111587983, + "grad_norm": 3.0, + "learning_rate": 4.89339179862082e-06, + "loss": 2.3377, + "step": 11230 + }, + { + "epoch": 0.6025214592274678, + "grad_norm": 0.5, + "learning_rate": 4.893366697822545e-06, + "loss": 2.3902, + "step": 11231 + }, + { + "epoch": 0.6025751072961374, + "grad_norm": 0.359375, + "learning_rate": 4.893341594134031e-06, + "loss": 2.2439, + "step": 11232 + }, + { + "epoch": 0.6026287553648069, + "grad_norm": 0.3828125, + "learning_rate": 4.8933164875553084e-06, + "loss": 1.7375, + "step": 11233 + }, + { + "epoch": 0.6026824034334763, + "grad_norm": 0.408203125, + "learning_rate": 4.8932913780864085e-06, + "loss": 2.4737, + "step": 11234 + }, + { + "epoch": 0.6027360515021459, + "grad_norm": 0.439453125, + "learning_rate": 4.893266265727361e-06, + "loss": 2.3512, + "step": 11235 + }, + { + "epoch": 0.6027896995708154, + "grad_norm": 0.462890625, + "learning_rate": 4.893241150478196e-06, + "loss": 2.3059, + "step": 11236 + }, + { + "epoch": 0.602843347639485, + "grad_norm": 0.431640625, + "learning_rate": 4.893216032338944e-06, + "loss": 2.4523, + "step": 11237 + }, + { + "epoch": 0.6028969957081545, + "grad_norm": 0.46484375, + "learning_rate": 4.893190911309635e-06, + "loss": 2.2879, + "step": 11238 + }, + { + "epoch": 0.6029506437768241, + "grad_norm": 0.41796875, + "learning_rate": 4.893165787390301e-06, + "loss": 2.3436, + "step": 11239 + }, + { + "epoch": 0.6030042918454935, + "grad_norm": 0.50390625, + "learning_rate": 4.893140660580969e-06, + "loss": 2.2067, + "step": 11240 + }, + { + "epoch": 0.6030579399141631, + "grad_norm": 0.43359375, + "learning_rate": 4.893115530881672e-06, + "loss": 2.3788, + "step": 11241 + }, + { + "epoch": 0.6031115879828326, + "grad_norm": 0.4375, + "learning_rate": 4.893090398292441e-06, + "loss": 2.2807, + "step": 11242 + }, + { + "epoch": 0.6031652360515022, + "grad_norm": 0.458984375, + "learning_rate": 4.893065262813304e-06, + "loss": 2.2844, + "step": 11243 + }, + { + "epoch": 0.6032188841201717, + "grad_norm": 0.5625, + "learning_rate": 4.893040124444292e-06, + "loss": 2.0803, + "step": 11244 + }, + { + "epoch": 0.6032725321888412, + "grad_norm": 0.41796875, + "learning_rate": 4.893014983185437e-06, + "loss": 2.238, + "step": 11245 + }, + { + "epoch": 0.6033261802575107, + "grad_norm": 0.380859375, + "learning_rate": 4.892989839036768e-06, + "loss": 2.2833, + "step": 11246 + }, + { + "epoch": 0.6033798283261803, + "grad_norm": 0.369140625, + "learning_rate": 4.892964691998315e-06, + "loss": 2.2223, + "step": 11247 + }, + { + "epoch": 0.6034334763948498, + "grad_norm": 2.84375, + "learning_rate": 4.892939542070109e-06, + "loss": 2.2633, + "step": 11248 + }, + { + "epoch": 0.6034871244635193, + "grad_norm": 0.462890625, + "learning_rate": 4.892914389252179e-06, + "loss": 2.2661, + "step": 11249 + }, + { + "epoch": 0.6035407725321889, + "grad_norm": 0.55078125, + "learning_rate": 4.892889233544559e-06, + "loss": 2.2009, + "step": 11250 + }, + { + "epoch": 0.6035944206008583, + "grad_norm": 0.384765625, + "learning_rate": 4.892864074947275e-06, + "loss": 2.33, + "step": 11251 + }, + { + "epoch": 0.6036480686695279, + "grad_norm": 0.455078125, + "learning_rate": 4.892838913460361e-06, + "loss": 2.3228, + "step": 11252 + }, + { + "epoch": 0.6037017167381974, + "grad_norm": 0.4375, + "learning_rate": 4.892813749083844e-06, + "loss": 2.2416, + "step": 11253 + }, + { + "epoch": 0.603755364806867, + "grad_norm": 0.400390625, + "learning_rate": 4.892788581817757e-06, + "loss": 2.2575, + "step": 11254 + }, + { + "epoch": 0.6038090128755365, + "grad_norm": 0.408203125, + "learning_rate": 4.89276341166213e-06, + "loss": 2.2346, + "step": 11255 + }, + { + "epoch": 0.603862660944206, + "grad_norm": 0.5234375, + "learning_rate": 4.892738238616992e-06, + "loss": 2.6144, + "step": 11256 + }, + { + "epoch": 0.6039163090128755, + "grad_norm": 0.419921875, + "learning_rate": 4.892713062682376e-06, + "loss": 2.4239, + "step": 11257 + }, + { + "epoch": 0.6039699570815451, + "grad_norm": 0.45703125, + "learning_rate": 4.8926878838583095e-06, + "loss": 2.1495, + "step": 11258 + }, + { + "epoch": 0.6040236051502146, + "grad_norm": 0.455078125, + "learning_rate": 4.892662702144823e-06, + "loss": 2.3261, + "step": 11259 + }, + { + "epoch": 0.6040772532188842, + "grad_norm": 0.44140625, + "learning_rate": 4.89263751754195e-06, + "loss": 2.2756, + "step": 11260 + }, + { + "epoch": 0.6041309012875536, + "grad_norm": 0.4765625, + "learning_rate": 4.892612330049718e-06, + "loss": 2.3862, + "step": 11261 + }, + { + "epoch": 0.6041845493562231, + "grad_norm": 0.3984375, + "learning_rate": 4.892587139668159e-06, + "loss": 1.8072, + "step": 11262 + }, + { + "epoch": 0.6042381974248927, + "grad_norm": 0.4765625, + "learning_rate": 4.892561946397303e-06, + "loss": 2.2908, + "step": 11263 + }, + { + "epoch": 0.6042918454935622, + "grad_norm": 2.0, + "learning_rate": 4.892536750237179e-06, + "loss": 2.2337, + "step": 11264 + }, + { + "epoch": 0.6043454935622318, + "grad_norm": 0.423828125, + "learning_rate": 4.89251155118782e-06, + "loss": 2.3515, + "step": 11265 + }, + { + "epoch": 0.6043991416309012, + "grad_norm": 0.482421875, + "learning_rate": 4.8924863492492535e-06, + "loss": 2.1024, + "step": 11266 + }, + { + "epoch": 0.6044527896995708, + "grad_norm": 0.52734375, + "learning_rate": 4.892461144421513e-06, + "loss": 2.3231, + "step": 11267 + }, + { + "epoch": 0.6045064377682403, + "grad_norm": 0.4453125, + "learning_rate": 4.892435936704627e-06, + "loss": 2.3216, + "step": 11268 + }, + { + "epoch": 0.6045600858369099, + "grad_norm": 0.56640625, + "learning_rate": 4.892410726098626e-06, + "loss": 2.1879, + "step": 11269 + }, + { + "epoch": 0.6046137339055794, + "grad_norm": 0.50390625, + "learning_rate": 4.892385512603541e-06, + "loss": 2.0713, + "step": 11270 + }, + { + "epoch": 0.604667381974249, + "grad_norm": 0.41796875, + "learning_rate": 4.892360296219403e-06, + "loss": 2.2445, + "step": 11271 + }, + { + "epoch": 0.6047210300429184, + "grad_norm": 0.52734375, + "learning_rate": 4.892335076946241e-06, + "loss": 2.2247, + "step": 11272 + }, + { + "epoch": 0.604774678111588, + "grad_norm": 0.419921875, + "learning_rate": 4.892309854784087e-06, + "loss": 2.2793, + "step": 11273 + }, + { + "epoch": 0.6048283261802575, + "grad_norm": 0.42578125, + "learning_rate": 4.892284629732969e-06, + "loss": 2.2784, + "step": 11274 + }, + { + "epoch": 0.6048819742489271, + "grad_norm": 0.427734375, + "learning_rate": 4.89225940179292e-06, + "loss": 2.1591, + "step": 11275 + }, + { + "epoch": 0.6049356223175966, + "grad_norm": 0.41796875, + "learning_rate": 4.89223417096397e-06, + "loss": 2.36, + "step": 11276 + }, + { + "epoch": 0.604989270386266, + "grad_norm": 0.4765625, + "learning_rate": 4.892208937246149e-06, + "loss": 2.5092, + "step": 11277 + }, + { + "epoch": 0.6050429184549356, + "grad_norm": 0.451171875, + "learning_rate": 4.892183700639487e-06, + "loss": 2.4249, + "step": 11278 + }, + { + "epoch": 0.6050965665236051, + "grad_norm": 0.443359375, + "learning_rate": 4.892158461144015e-06, + "loss": 2.0125, + "step": 11279 + }, + { + "epoch": 0.6051502145922747, + "grad_norm": 0.96875, + "learning_rate": 4.892133218759763e-06, + "loss": 2.4037, + "step": 11280 + }, + { + "epoch": 0.6052038626609442, + "grad_norm": 0.61328125, + "learning_rate": 4.892107973486764e-06, + "loss": 2.2746, + "step": 11281 + }, + { + "epoch": 0.6052575107296138, + "grad_norm": 0.65625, + "learning_rate": 4.892082725325045e-06, + "loss": 2.3481, + "step": 11282 + }, + { + "epoch": 0.6053111587982832, + "grad_norm": 0.62109375, + "learning_rate": 4.8920574742746376e-06, + "loss": 2.2911, + "step": 11283 + }, + { + "epoch": 0.6053648068669528, + "grad_norm": 0.54296875, + "learning_rate": 4.892032220335573e-06, + "loss": 2.4654, + "step": 11284 + }, + { + "epoch": 0.6054184549356223, + "grad_norm": 0.431640625, + "learning_rate": 4.892006963507882e-06, + "loss": 2.2055, + "step": 11285 + }, + { + "epoch": 0.6054721030042919, + "grad_norm": 0.4765625, + "learning_rate": 4.891981703791594e-06, + "loss": 2.4677, + "step": 11286 + }, + { + "epoch": 0.6055257510729614, + "grad_norm": 0.435546875, + "learning_rate": 4.891956441186739e-06, + "loss": 2.4712, + "step": 11287 + }, + { + "epoch": 0.605579399141631, + "grad_norm": 0.46875, + "learning_rate": 4.89193117569335e-06, + "loss": 2.2703, + "step": 11288 + }, + { + "epoch": 0.6056330472103004, + "grad_norm": 0.390625, + "learning_rate": 4.891905907311455e-06, + "loss": 2.0144, + "step": 11289 + }, + { + "epoch": 0.60568669527897, + "grad_norm": 0.498046875, + "learning_rate": 4.8918806360410855e-06, + "loss": 2.4904, + "step": 11290 + }, + { + "epoch": 0.6057403433476395, + "grad_norm": 0.44140625, + "learning_rate": 4.891855361882272e-06, + "loss": 2.0683, + "step": 11291 + }, + { + "epoch": 0.605793991416309, + "grad_norm": 0.447265625, + "learning_rate": 4.8918300848350455e-06, + "loss": 2.485, + "step": 11292 + }, + { + "epoch": 0.6058476394849786, + "grad_norm": 0.80078125, + "learning_rate": 4.891804804899436e-06, + "loss": 1.6163, + "step": 11293 + }, + { + "epoch": 0.605901287553648, + "grad_norm": 0.43359375, + "learning_rate": 4.8917795220754735e-06, + "loss": 2.2337, + "step": 11294 + }, + { + "epoch": 0.6059549356223176, + "grad_norm": 0.419921875, + "learning_rate": 4.891754236363189e-06, + "loss": 2.2121, + "step": 11295 + }, + { + "epoch": 0.6060085836909871, + "grad_norm": 0.369140625, + "learning_rate": 4.891728947762614e-06, + "loss": 2.2552, + "step": 11296 + }, + { + "epoch": 0.6060622317596567, + "grad_norm": 0.482421875, + "learning_rate": 4.891703656273778e-06, + "loss": 1.9662, + "step": 11297 + }, + { + "epoch": 0.6061158798283262, + "grad_norm": 0.4765625, + "learning_rate": 4.8916783618967104e-06, + "loss": 2.2004, + "step": 11298 + }, + { + "epoch": 0.6061695278969957, + "grad_norm": 0.44140625, + "learning_rate": 4.8916530646314445e-06, + "loss": 2.3169, + "step": 11299 + }, + { + "epoch": 0.6062231759656652, + "grad_norm": 0.486328125, + "learning_rate": 4.891627764478009e-06, + "loss": 2.3267, + "step": 11300 + }, + { + "epoch": 0.6062768240343348, + "grad_norm": 0.458984375, + "learning_rate": 4.891602461436434e-06, + "loss": 2.4511, + "step": 11301 + }, + { + "epoch": 0.6063304721030043, + "grad_norm": 0.412109375, + "learning_rate": 4.891577155506751e-06, + "loss": 2.3923, + "step": 11302 + }, + { + "epoch": 0.6063841201716739, + "grad_norm": 0.466796875, + "learning_rate": 4.891551846688992e-06, + "loss": 2.1406, + "step": 11303 + }, + { + "epoch": 0.6064377682403433, + "grad_norm": 0.439453125, + "learning_rate": 4.891526534983185e-06, + "loss": 2.1466, + "step": 11304 + }, + { + "epoch": 0.6064914163090128, + "grad_norm": 0.396484375, + "learning_rate": 4.891501220389361e-06, + "loss": 2.4384, + "step": 11305 + }, + { + "epoch": 0.6065450643776824, + "grad_norm": 0.451171875, + "learning_rate": 4.891475902907552e-06, + "loss": 2.3514, + "step": 11306 + }, + { + "epoch": 0.6065987124463519, + "grad_norm": 0.431640625, + "learning_rate": 4.891450582537788e-06, + "loss": 2.3867, + "step": 11307 + }, + { + "epoch": 0.6066523605150215, + "grad_norm": 0.44921875, + "learning_rate": 4.891425259280099e-06, + "loss": 2.4196, + "step": 11308 + }, + { + "epoch": 0.606706008583691, + "grad_norm": 0.416015625, + "learning_rate": 4.8913999331345156e-06, + "loss": 2.1005, + "step": 11309 + }, + { + "epoch": 0.6067596566523605, + "grad_norm": 0.40234375, + "learning_rate": 4.891374604101069e-06, + "loss": 2.245, + "step": 11310 + }, + { + "epoch": 0.60681330472103, + "grad_norm": 0.38671875, + "learning_rate": 4.89134927217979e-06, + "loss": 1.8727, + "step": 11311 + }, + { + "epoch": 0.6068669527896996, + "grad_norm": 0.51171875, + "learning_rate": 4.891323937370707e-06, + "loss": 2.4308, + "step": 11312 + }, + { + "epoch": 0.6069206008583691, + "grad_norm": 0.5, + "learning_rate": 4.8912985996738534e-06, + "loss": 1.2641, + "step": 11313 + }, + { + "epoch": 0.6069742489270387, + "grad_norm": 0.458984375, + "learning_rate": 4.891273259089258e-06, + "loss": 2.4393, + "step": 11314 + }, + { + "epoch": 0.6070278969957081, + "grad_norm": 0.47265625, + "learning_rate": 4.891247915616953e-06, + "loss": 2.4991, + "step": 11315 + }, + { + "epoch": 0.6070815450643777, + "grad_norm": 0.46875, + "learning_rate": 4.891222569256968e-06, + "loss": 2.437, + "step": 11316 + }, + { + "epoch": 0.6071351931330472, + "grad_norm": 0.6875, + "learning_rate": 4.891197220009333e-06, + "loss": 2.228, + "step": 11317 + }, + { + "epoch": 0.6071888412017168, + "grad_norm": 0.47265625, + "learning_rate": 4.891171867874079e-06, + "loss": 2.4793, + "step": 11318 + }, + { + "epoch": 0.6072424892703863, + "grad_norm": 0.455078125, + "learning_rate": 4.891146512851238e-06, + "loss": 2.3279, + "step": 11319 + }, + { + "epoch": 0.6072961373390557, + "grad_norm": 0.5078125, + "learning_rate": 4.891121154940839e-06, + "loss": 2.2831, + "step": 11320 + }, + { + "epoch": 0.6073497854077253, + "grad_norm": 0.42578125, + "learning_rate": 4.8910957941429125e-06, + "loss": 2.2541, + "step": 11321 + }, + { + "epoch": 0.6074034334763948, + "grad_norm": 0.46875, + "learning_rate": 4.891070430457491e-06, + "loss": 2.1874, + "step": 11322 + }, + { + "epoch": 0.6074570815450644, + "grad_norm": 0.55859375, + "learning_rate": 4.891045063884603e-06, + "loss": 2.3329, + "step": 11323 + }, + { + "epoch": 0.6075107296137339, + "grad_norm": 0.38671875, + "learning_rate": 4.89101969442428e-06, + "loss": 2.3811, + "step": 11324 + }, + { + "epoch": 0.6075643776824035, + "grad_norm": 0.470703125, + "learning_rate": 4.890994322076553e-06, + "loss": 2.4082, + "step": 11325 + }, + { + "epoch": 0.6076180257510729, + "grad_norm": 0.4375, + "learning_rate": 4.890968946841452e-06, + "loss": 2.0136, + "step": 11326 + }, + { + "epoch": 0.6076716738197425, + "grad_norm": 0.400390625, + "learning_rate": 4.890943568719008e-06, + "loss": 2.3564, + "step": 11327 + }, + { + "epoch": 0.607725321888412, + "grad_norm": 0.5859375, + "learning_rate": 4.890918187709252e-06, + "loss": 2.332, + "step": 11328 + }, + { + "epoch": 0.6077789699570816, + "grad_norm": 0.416015625, + "learning_rate": 4.890892803812214e-06, + "loss": 2.1842, + "step": 11329 + }, + { + "epoch": 0.6078326180257511, + "grad_norm": 0.49609375, + "learning_rate": 4.890867417027925e-06, + "loss": 2.411, + "step": 11330 + }, + { + "epoch": 0.6078862660944206, + "grad_norm": 0.46875, + "learning_rate": 4.890842027356415e-06, + "loss": 2.2627, + "step": 11331 + }, + { + "epoch": 0.6079399141630901, + "grad_norm": 0.6640625, + "learning_rate": 4.890816634797716e-06, + "loss": 2.0945, + "step": 11332 + }, + { + "epoch": 0.6079935622317597, + "grad_norm": 0.515625, + "learning_rate": 4.890791239351857e-06, + "loss": 1.9933, + "step": 11333 + }, + { + "epoch": 0.6080472103004292, + "grad_norm": 0.6953125, + "learning_rate": 4.89076584101887e-06, + "loss": 2.1614, + "step": 11334 + }, + { + "epoch": 0.6081008583690987, + "grad_norm": 0.40625, + "learning_rate": 4.890740439798785e-06, + "loss": 2.0532, + "step": 11335 + }, + { + "epoch": 0.6081545064377682, + "grad_norm": 0.40625, + "learning_rate": 4.890715035691633e-06, + "loss": 2.2059, + "step": 11336 + }, + { + "epoch": 0.6082081545064377, + "grad_norm": 0.482421875, + "learning_rate": 4.890689628697446e-06, + "loss": 2.5389, + "step": 11337 + }, + { + "epoch": 0.6082618025751073, + "grad_norm": 0.484375, + "learning_rate": 4.890664218816251e-06, + "loss": 2.2289, + "step": 11338 + }, + { + "epoch": 0.6083154506437768, + "grad_norm": 0.44921875, + "learning_rate": 4.890638806048082e-06, + "loss": 2.2361, + "step": 11339 + }, + { + "epoch": 0.6083690987124464, + "grad_norm": 0.44140625, + "learning_rate": 4.890613390392969e-06, + "loss": 2.4094, + "step": 11340 + }, + { + "epoch": 0.6084227467811159, + "grad_norm": 0.421875, + "learning_rate": 4.890587971850941e-06, + "loss": 2.2881, + "step": 11341 + }, + { + "epoch": 0.6084763948497854, + "grad_norm": 0.44921875, + "learning_rate": 4.890562550422031e-06, + "loss": 2.2805, + "step": 11342 + }, + { + "epoch": 0.6085300429184549, + "grad_norm": 0.6328125, + "learning_rate": 4.890537126106268e-06, + "loss": 2.1327, + "step": 11343 + }, + { + "epoch": 0.6085836909871245, + "grad_norm": 0.392578125, + "learning_rate": 4.890511698903684e-06, + "loss": 2.2801, + "step": 11344 + }, + { + "epoch": 0.608637339055794, + "grad_norm": 0.435546875, + "learning_rate": 4.890486268814309e-06, + "loss": 2.3502, + "step": 11345 + }, + { + "epoch": 0.6086909871244636, + "grad_norm": 0.376953125, + "learning_rate": 4.890460835838174e-06, + "loss": 2.1867, + "step": 11346 + }, + { + "epoch": 0.608744635193133, + "grad_norm": 0.38671875, + "learning_rate": 4.890435399975309e-06, + "loss": 2.2863, + "step": 11347 + }, + { + "epoch": 0.6087982832618025, + "grad_norm": 0.8046875, + "learning_rate": 4.890409961225746e-06, + "loss": 2.2958, + "step": 11348 + }, + { + "epoch": 0.6088519313304721, + "grad_norm": 0.50390625, + "learning_rate": 4.890384519589515e-06, + "loss": 2.3147, + "step": 11349 + }, + { + "epoch": 0.6089055793991416, + "grad_norm": 0.55859375, + "learning_rate": 4.890359075066646e-06, + "loss": 2.2537, + "step": 11350 + }, + { + "epoch": 0.6089592274678112, + "grad_norm": 0.404296875, + "learning_rate": 4.890333627657171e-06, + "loss": 2.1979, + "step": 11351 + }, + { + "epoch": 0.6090128755364806, + "grad_norm": 0.478515625, + "learning_rate": 4.89030817736112e-06, + "loss": 2.324, + "step": 11352 + }, + { + "epoch": 0.6090665236051502, + "grad_norm": 0.4609375, + "learning_rate": 4.890282724178523e-06, + "loss": 1.8083, + "step": 11353 + }, + { + "epoch": 0.6091201716738197, + "grad_norm": 0.3671875, + "learning_rate": 4.890257268109413e-06, + "loss": 2.1647, + "step": 11354 + }, + { + "epoch": 0.6091738197424893, + "grad_norm": 0.466796875, + "learning_rate": 4.8902318091538185e-06, + "loss": 2.3428, + "step": 11355 + }, + { + "epoch": 0.6092274678111588, + "grad_norm": 0.47265625, + "learning_rate": 4.890206347311771e-06, + "loss": 2.3015, + "step": 11356 + }, + { + "epoch": 0.6092811158798284, + "grad_norm": 0.466796875, + "learning_rate": 4.890180882583302e-06, + "loss": 2.3775, + "step": 11357 + }, + { + "epoch": 0.6093347639484978, + "grad_norm": 0.4609375, + "learning_rate": 4.890155414968441e-06, + "loss": 2.296, + "step": 11358 + }, + { + "epoch": 0.6093884120171674, + "grad_norm": 0.404296875, + "learning_rate": 4.89012994446722e-06, + "loss": 1.9925, + "step": 11359 + }, + { + "epoch": 0.6094420600858369, + "grad_norm": 0.53515625, + "learning_rate": 4.890104471079668e-06, + "loss": 2.518, + "step": 11360 + }, + { + "epoch": 0.6094957081545065, + "grad_norm": 0.455078125, + "learning_rate": 4.890078994805819e-06, + "loss": 2.4109, + "step": 11361 + }, + { + "epoch": 0.609549356223176, + "grad_norm": 0.416015625, + "learning_rate": 4.8900535156457e-06, + "loss": 2.2739, + "step": 11362 + }, + { + "epoch": 0.6096030042918454, + "grad_norm": 0.478515625, + "learning_rate": 4.890028033599344e-06, + "loss": 2.1157, + "step": 11363 + }, + { + "epoch": 0.609656652360515, + "grad_norm": 0.369140625, + "learning_rate": 4.89000254866678e-06, + "loss": 2.0105, + "step": 11364 + }, + { + "epoch": 0.6097103004291845, + "grad_norm": 0.388671875, + "learning_rate": 4.889977060848041e-06, + "loss": 2.2488, + "step": 11365 + }, + { + "epoch": 0.6097639484978541, + "grad_norm": 0.431640625, + "learning_rate": 4.889951570143157e-06, + "loss": 2.368, + "step": 11366 + }, + { + "epoch": 0.6098175965665236, + "grad_norm": 0.53515625, + "learning_rate": 4.8899260765521585e-06, + "loss": 1.3463, + "step": 11367 + }, + { + "epoch": 0.6098712446351932, + "grad_norm": 0.51171875, + "learning_rate": 4.8899005800750755e-06, + "loss": 2.3254, + "step": 11368 + }, + { + "epoch": 0.6099248927038626, + "grad_norm": 0.45703125, + "learning_rate": 4.88987508071194e-06, + "loss": 2.463, + "step": 11369 + }, + { + "epoch": 0.6099785407725322, + "grad_norm": 0.5546875, + "learning_rate": 4.8898495784627835e-06, + "loss": 2.1095, + "step": 11370 + }, + { + "epoch": 0.6100321888412017, + "grad_norm": 0.4921875, + "learning_rate": 4.889824073327634e-06, + "loss": 2.0541, + "step": 11371 + }, + { + "epoch": 0.6100858369098713, + "grad_norm": 0.466796875, + "learning_rate": 4.889798565306525e-06, + "loss": 2.5968, + "step": 11372 + }, + { + "epoch": 0.6101394849785408, + "grad_norm": 0.431640625, + "learning_rate": 4.889773054399486e-06, + "loss": 2.3922, + "step": 11373 + }, + { + "epoch": 0.6101931330472103, + "grad_norm": 0.5859375, + "learning_rate": 4.889747540606548e-06, + "loss": 2.251, + "step": 11374 + }, + { + "epoch": 0.6102467811158798, + "grad_norm": 0.65625, + "learning_rate": 4.889722023927742e-06, + "loss": 2.2835, + "step": 11375 + }, + { + "epoch": 0.6103004291845494, + "grad_norm": 0.41015625, + "learning_rate": 4.8896965043630984e-06, + "loss": 2.2215, + "step": 11376 + }, + { + "epoch": 0.6103540772532189, + "grad_norm": 0.412109375, + "learning_rate": 4.889670981912649e-06, + "loss": 2.1847, + "step": 11377 + }, + { + "epoch": 0.6104077253218884, + "grad_norm": 0.546875, + "learning_rate": 4.889645456576423e-06, + "loss": 2.2836, + "step": 11378 + }, + { + "epoch": 0.610461373390558, + "grad_norm": 0.41796875, + "learning_rate": 4.889619928354453e-06, + "loss": 2.2553, + "step": 11379 + }, + { + "epoch": 0.6105150214592274, + "grad_norm": 0.42578125, + "learning_rate": 4.889594397246769e-06, + "loss": 2.2823, + "step": 11380 + }, + { + "epoch": 0.610568669527897, + "grad_norm": 0.3984375, + "learning_rate": 4.889568863253402e-06, + "loss": 1.9768, + "step": 11381 + }, + { + "epoch": 0.6106223175965665, + "grad_norm": 0.47265625, + "learning_rate": 4.889543326374382e-06, + "loss": 2.4839, + "step": 11382 + }, + { + "epoch": 0.6106759656652361, + "grad_norm": 0.42578125, + "learning_rate": 4.889517786609741e-06, + "loss": 2.3096, + "step": 11383 + }, + { + "epoch": 0.6107296137339056, + "grad_norm": 0.546875, + "learning_rate": 4.889492243959509e-06, + "loss": 2.1032, + "step": 11384 + }, + { + "epoch": 0.6107832618025751, + "grad_norm": 0.470703125, + "learning_rate": 4.889466698423718e-06, + "loss": 2.4425, + "step": 11385 + }, + { + "epoch": 0.6108369098712446, + "grad_norm": 0.48828125, + "learning_rate": 4.889441150002398e-06, + "loss": 1.7237, + "step": 11386 + }, + { + "epoch": 0.6108905579399142, + "grad_norm": 0.4296875, + "learning_rate": 4.8894155986955786e-06, + "loss": 2.2569, + "step": 11387 + }, + { + "epoch": 0.6109442060085837, + "grad_norm": 0.51953125, + "learning_rate": 4.889390044503293e-06, + "loss": 2.4406, + "step": 11388 + }, + { + "epoch": 0.6109978540772533, + "grad_norm": 0.458984375, + "learning_rate": 4.889364487425571e-06, + "loss": 2.2136, + "step": 11389 + }, + { + "epoch": 0.6110515021459227, + "grad_norm": 0.47265625, + "learning_rate": 4.889338927462443e-06, + "loss": 2.3472, + "step": 11390 + }, + { + "epoch": 0.6111051502145923, + "grad_norm": 0.49609375, + "learning_rate": 4.889313364613941e-06, + "loss": 2.2886, + "step": 11391 + }, + { + "epoch": 0.6111587982832618, + "grad_norm": 0.50390625, + "learning_rate": 4.889287798880094e-06, + "loss": 2.4447, + "step": 11392 + }, + { + "epoch": 0.6112124463519313, + "grad_norm": 0.4375, + "learning_rate": 4.8892622302609365e-06, + "loss": 2.2329, + "step": 11393 + }, + { + "epoch": 0.6112660944206009, + "grad_norm": 0.36328125, + "learning_rate": 4.889236658756495e-06, + "loss": 1.9204, + "step": 11394 + }, + { + "epoch": 0.6113197424892703, + "grad_norm": 0.439453125, + "learning_rate": 4.889211084366803e-06, + "loss": 2.3585, + "step": 11395 + }, + { + "epoch": 0.6113733905579399, + "grad_norm": 0.47265625, + "learning_rate": 4.88918550709189e-06, + "loss": 2.5905, + "step": 11396 + }, + { + "epoch": 0.6114270386266094, + "grad_norm": 0.427734375, + "learning_rate": 4.889159926931788e-06, + "loss": 2.1771, + "step": 11397 + }, + { + "epoch": 0.611480686695279, + "grad_norm": 0.47265625, + "learning_rate": 4.889134343886528e-06, + "loss": 2.3797, + "step": 11398 + }, + { + "epoch": 0.6115343347639485, + "grad_norm": 0.45703125, + "learning_rate": 4.88910875795614e-06, + "loss": 2.3553, + "step": 11399 + }, + { + "epoch": 0.6115879828326181, + "grad_norm": 0.4453125, + "learning_rate": 4.889083169140656e-06, + "loss": 2.2756, + "step": 11400 + }, + { + "epoch": 0.6116416309012875, + "grad_norm": 0.4375, + "learning_rate": 4.889057577440106e-06, + "loss": 1.9752, + "step": 11401 + }, + { + "epoch": 0.6116952789699571, + "grad_norm": 0.4140625, + "learning_rate": 4.88903198285452e-06, + "loss": 2.3375, + "step": 11402 + }, + { + "epoch": 0.6117489270386266, + "grad_norm": 0.384765625, + "learning_rate": 4.889006385383931e-06, + "loss": 2.1998, + "step": 11403 + }, + { + "epoch": 0.6118025751072962, + "grad_norm": 0.5078125, + "learning_rate": 4.888980785028369e-06, + "loss": 2.363, + "step": 11404 + }, + { + "epoch": 0.6118562231759657, + "grad_norm": 0.423828125, + "learning_rate": 4.888955181787864e-06, + "loss": 2.3752, + "step": 11405 + }, + { + "epoch": 0.6119098712446351, + "grad_norm": 0.427734375, + "learning_rate": 4.8889295756624485e-06, + "loss": 1.9659, + "step": 11406 + }, + { + "epoch": 0.6119635193133047, + "grad_norm": 0.41015625, + "learning_rate": 4.888903966652153e-06, + "loss": 2.4261, + "step": 11407 + }, + { + "epoch": 0.6120171673819742, + "grad_norm": 0.451171875, + "learning_rate": 4.888878354757008e-06, + "loss": 2.478, + "step": 11408 + }, + { + "epoch": 0.6120708154506438, + "grad_norm": 0.376953125, + "learning_rate": 4.888852739977044e-06, + "loss": 2.1211, + "step": 11409 + }, + { + "epoch": 0.6121244635193133, + "grad_norm": 0.43359375, + "learning_rate": 4.888827122312293e-06, + "loss": 2.2261, + "step": 11410 + }, + { + "epoch": 0.6121781115879829, + "grad_norm": 0.4609375, + "learning_rate": 4.8888015017627844e-06, + "loss": 2.2067, + "step": 11411 + }, + { + "epoch": 0.6122317596566523, + "grad_norm": 0.39453125, + "learning_rate": 4.888775878328551e-06, + "loss": 2.2827, + "step": 11412 + }, + { + "epoch": 0.6122854077253219, + "grad_norm": 0.490234375, + "learning_rate": 4.888750252009623e-06, + "loss": 2.1517, + "step": 11413 + }, + { + "epoch": 0.6123390557939914, + "grad_norm": 0.421875, + "learning_rate": 4.888724622806031e-06, + "loss": 2.1131, + "step": 11414 + }, + { + "epoch": 0.612392703862661, + "grad_norm": 0.439453125, + "learning_rate": 4.8886989907178065e-06, + "loss": 2.0492, + "step": 11415 + }, + { + "epoch": 0.6124463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.888673355744979e-06, + "loss": 2.2615, + "step": 11416 + }, + { + "epoch": 0.6125, + "grad_norm": 0.67578125, + "learning_rate": 4.888647717887582e-06, + "loss": 2.4148, + "step": 11417 + }, + { + "epoch": 0.6125536480686695, + "grad_norm": 0.416015625, + "learning_rate": 4.888622077145645e-06, + "loss": 2.3359, + "step": 11418 + }, + { + "epoch": 0.6126072961373391, + "grad_norm": 0.447265625, + "learning_rate": 4.888596433519198e-06, + "loss": 2.2774, + "step": 11419 + }, + { + "epoch": 0.6126609442060086, + "grad_norm": 0.447265625, + "learning_rate": 4.888570787008273e-06, + "loss": 2.2987, + "step": 11420 + }, + { + "epoch": 0.6127145922746781, + "grad_norm": 0.427734375, + "learning_rate": 4.8885451376129025e-06, + "loss": 2.2697, + "step": 11421 + }, + { + "epoch": 0.6127682403433476, + "grad_norm": 0.4453125, + "learning_rate": 4.8885194853331145e-06, + "loss": 2.0808, + "step": 11422 + }, + { + "epoch": 0.6128218884120171, + "grad_norm": 0.6328125, + "learning_rate": 4.888493830168942e-06, + "loss": 2.2783, + "step": 11423 + }, + { + "epoch": 0.6128755364806867, + "grad_norm": 0.4921875, + "learning_rate": 4.888468172120415e-06, + "loss": 2.4597, + "step": 11424 + }, + { + "epoch": 0.6129291845493562, + "grad_norm": 0.5390625, + "learning_rate": 4.8884425111875655e-06, + "loss": 2.3026, + "step": 11425 + }, + { + "epoch": 0.6129828326180258, + "grad_norm": 0.8046875, + "learning_rate": 4.888416847370424e-06, + "loss": 2.2159, + "step": 11426 + }, + { + "epoch": 0.6130364806866953, + "grad_norm": 0.5390625, + "learning_rate": 4.888391180669021e-06, + "loss": 2.387, + "step": 11427 + }, + { + "epoch": 0.6130901287553648, + "grad_norm": 0.36328125, + "learning_rate": 4.8883655110833874e-06, + "loss": 2.1289, + "step": 11428 + }, + { + "epoch": 0.6131437768240343, + "grad_norm": 0.423828125, + "learning_rate": 4.888339838613555e-06, + "loss": 2.3614, + "step": 11429 + }, + { + "epoch": 0.6131974248927039, + "grad_norm": 0.50390625, + "learning_rate": 4.888314163259555e-06, + "loss": 2.5995, + "step": 11430 + }, + { + "epoch": 0.6132510729613734, + "grad_norm": 0.419921875, + "learning_rate": 4.888288485021416e-06, + "loss": 2.2495, + "step": 11431 + }, + { + "epoch": 0.613304721030043, + "grad_norm": 0.408203125, + "learning_rate": 4.888262803899173e-06, + "loss": 2.278, + "step": 11432 + }, + { + "epoch": 0.6133583690987124, + "grad_norm": 0.49609375, + "learning_rate": 4.888237119892854e-06, + "loss": 2.3195, + "step": 11433 + }, + { + "epoch": 0.613412017167382, + "grad_norm": 0.4140625, + "learning_rate": 4.888211433002491e-06, + "loss": 2.2859, + "step": 11434 + }, + { + "epoch": 0.6134656652360515, + "grad_norm": 0.60546875, + "learning_rate": 4.888185743228115e-06, + "loss": 2.3589, + "step": 11435 + }, + { + "epoch": 0.613519313304721, + "grad_norm": 0.43359375, + "learning_rate": 4.8881600505697565e-06, + "loss": 1.9618, + "step": 11436 + }, + { + "epoch": 0.6135729613733906, + "grad_norm": 0.423828125, + "learning_rate": 4.888134355027447e-06, + "loss": 2.3619, + "step": 11437 + }, + { + "epoch": 0.61362660944206, + "grad_norm": 0.435546875, + "learning_rate": 4.888108656601218e-06, + "loss": 2.4455, + "step": 11438 + }, + { + "epoch": 0.6136802575107296, + "grad_norm": 0.4765625, + "learning_rate": 4.888082955291101e-06, + "loss": 2.1453, + "step": 11439 + }, + { + "epoch": 0.6137339055793991, + "grad_norm": 0.51953125, + "learning_rate": 4.888057251097125e-06, + "loss": 2.4146, + "step": 11440 + }, + { + "epoch": 0.6137875536480687, + "grad_norm": 0.39453125, + "learning_rate": 4.888031544019321e-06, + "loss": 2.0722, + "step": 11441 + }, + { + "epoch": 0.6138412017167382, + "grad_norm": 0.427734375, + "learning_rate": 4.888005834057723e-06, + "loss": 2.2394, + "step": 11442 + }, + { + "epoch": 0.6138948497854078, + "grad_norm": 1.0859375, + "learning_rate": 4.887980121212359e-06, + "loss": 2.2721, + "step": 11443 + }, + { + "epoch": 0.6139484978540772, + "grad_norm": 0.439453125, + "learning_rate": 4.887954405483262e-06, + "loss": 2.3885, + "step": 11444 + }, + { + "epoch": 0.6140021459227468, + "grad_norm": 0.478515625, + "learning_rate": 4.887928686870461e-06, + "loss": 1.9825, + "step": 11445 + }, + { + "epoch": 0.6140557939914163, + "grad_norm": 0.46875, + "learning_rate": 4.88790296537399e-06, + "loss": 2.4219, + "step": 11446 + }, + { + "epoch": 0.6141094420600859, + "grad_norm": 0.353515625, + "learning_rate": 4.887877240993878e-06, + "loss": 2.0951, + "step": 11447 + }, + { + "epoch": 0.6141630901287554, + "grad_norm": 0.4609375, + "learning_rate": 4.887851513730156e-06, + "loss": 2.4177, + "step": 11448 + }, + { + "epoch": 0.6142167381974248, + "grad_norm": 0.59375, + "learning_rate": 4.887825783582856e-06, + "loss": 2.0289, + "step": 11449 + }, + { + "epoch": 0.6142703862660944, + "grad_norm": 0.4765625, + "learning_rate": 4.887800050552009e-06, + "loss": 2.5397, + "step": 11450 + }, + { + "epoch": 0.6143240343347639, + "grad_norm": 0.56640625, + "learning_rate": 4.887774314637645e-06, + "loss": 2.0802, + "step": 11451 + }, + { + "epoch": 0.6143776824034335, + "grad_norm": 0.451171875, + "learning_rate": 4.887748575839795e-06, + "loss": 2.1316, + "step": 11452 + }, + { + "epoch": 0.614431330472103, + "grad_norm": 0.390625, + "learning_rate": 4.887722834158493e-06, + "loss": 2.4346, + "step": 11453 + }, + { + "epoch": 0.6144849785407726, + "grad_norm": 0.396484375, + "learning_rate": 4.887697089593767e-06, + "loss": 2.4268, + "step": 11454 + }, + { + "epoch": 0.614538626609442, + "grad_norm": 0.4453125, + "learning_rate": 4.887671342145648e-06, + "loss": 2.4345, + "step": 11455 + }, + { + "epoch": 0.6145922746781116, + "grad_norm": 0.404296875, + "learning_rate": 4.887645591814169e-06, + "loss": 2.2451, + "step": 11456 + }, + { + "epoch": 0.6146459227467811, + "grad_norm": 0.4375, + "learning_rate": 4.887619838599361e-06, + "loss": 2.5977, + "step": 11457 + }, + { + "epoch": 0.6146995708154507, + "grad_norm": 0.453125, + "learning_rate": 4.887594082501254e-06, + "loss": 2.3573, + "step": 11458 + }, + { + "epoch": 0.6147532188841202, + "grad_norm": 0.443359375, + "learning_rate": 4.887568323519878e-06, + "loss": 2.2336, + "step": 11459 + }, + { + "epoch": 0.6148068669527897, + "grad_norm": 0.443359375, + "learning_rate": 4.887542561655267e-06, + "loss": 2.3577, + "step": 11460 + }, + { + "epoch": 0.6148605150214592, + "grad_norm": 0.400390625, + "learning_rate": 4.88751679690745e-06, + "loss": 2.2224, + "step": 11461 + }, + { + "epoch": 0.6149141630901288, + "grad_norm": 0.49609375, + "learning_rate": 4.887491029276459e-06, + "loss": 2.423, + "step": 11462 + }, + { + "epoch": 0.6149678111587983, + "grad_norm": 0.447265625, + "learning_rate": 4.887465258762325e-06, + "loss": 2.1287, + "step": 11463 + }, + { + "epoch": 0.6150214592274678, + "grad_norm": 0.419921875, + "learning_rate": 4.887439485365079e-06, + "loss": 2.3695, + "step": 11464 + }, + { + "epoch": 0.6150751072961373, + "grad_norm": 0.46875, + "learning_rate": 4.887413709084752e-06, + "loss": 2.0306, + "step": 11465 + }, + { + "epoch": 0.6151287553648068, + "grad_norm": 0.4140625, + "learning_rate": 4.887387929921376e-06, + "loss": 2.1684, + "step": 11466 + }, + { + "epoch": 0.6151824034334764, + "grad_norm": 0.6875, + "learning_rate": 4.88736214787498e-06, + "loss": 2.0862, + "step": 11467 + }, + { + "epoch": 0.6152360515021459, + "grad_norm": 0.423828125, + "learning_rate": 4.887336362945597e-06, + "loss": 2.3715, + "step": 11468 + }, + { + "epoch": 0.6152896995708155, + "grad_norm": 0.447265625, + "learning_rate": 4.887310575133258e-06, + "loss": 2.1448, + "step": 11469 + }, + { + "epoch": 0.615343347639485, + "grad_norm": 0.55078125, + "learning_rate": 4.887284784437994e-06, + "loss": 2.7205, + "step": 11470 + }, + { + "epoch": 0.6153969957081545, + "grad_norm": 0.41015625, + "learning_rate": 4.887258990859835e-06, + "loss": 2.2126, + "step": 11471 + }, + { + "epoch": 0.615450643776824, + "grad_norm": 0.408203125, + "learning_rate": 4.887233194398814e-06, + "loss": 2.3991, + "step": 11472 + }, + { + "epoch": 0.6155042918454936, + "grad_norm": 0.5, + "learning_rate": 4.887207395054961e-06, + "loss": 2.498, + "step": 11473 + }, + { + "epoch": 0.6155579399141631, + "grad_norm": 0.455078125, + "learning_rate": 4.887181592828307e-06, + "loss": 2.3854, + "step": 11474 + }, + { + "epoch": 0.6156115879828327, + "grad_norm": 0.43359375, + "learning_rate": 4.887155787718884e-06, + "loss": 2.2397, + "step": 11475 + }, + { + "epoch": 0.6156652360515021, + "grad_norm": 0.47265625, + "learning_rate": 4.887129979726723e-06, + "loss": 2.1733, + "step": 11476 + }, + { + "epoch": 0.6157188841201717, + "grad_norm": 0.478515625, + "learning_rate": 4.887104168851854e-06, + "loss": 2.1485, + "step": 11477 + }, + { + "epoch": 0.6157725321888412, + "grad_norm": 0.4921875, + "learning_rate": 4.88707835509431e-06, + "loss": 2.2087, + "step": 11478 + }, + { + "epoch": 0.6158261802575107, + "grad_norm": 0.59765625, + "learning_rate": 4.88705253845412e-06, + "loss": 2.2591, + "step": 11479 + }, + { + "epoch": 0.6158798283261803, + "grad_norm": 0.384765625, + "learning_rate": 4.887026718931318e-06, + "loss": 2.2004, + "step": 11480 + }, + { + "epoch": 0.6159334763948497, + "grad_norm": 0.46875, + "learning_rate": 4.887000896525933e-06, + "loss": 2.6066, + "step": 11481 + }, + { + "epoch": 0.6159871244635193, + "grad_norm": 0.40234375, + "learning_rate": 4.886975071237996e-06, + "loss": 2.3325, + "step": 11482 + }, + { + "epoch": 0.6160407725321888, + "grad_norm": 1.4609375, + "learning_rate": 4.8869492430675405e-06, + "loss": 2.2285, + "step": 11483 + }, + { + "epoch": 0.6160944206008584, + "grad_norm": 0.45703125, + "learning_rate": 4.886923412014595e-06, + "loss": 2.5393, + "step": 11484 + }, + { + "epoch": 0.6161480686695279, + "grad_norm": 0.515625, + "learning_rate": 4.886897578079192e-06, + "loss": 2.243, + "step": 11485 + }, + { + "epoch": 0.6162017167381975, + "grad_norm": 0.416015625, + "learning_rate": 4.886871741261362e-06, + "loss": 1.9777, + "step": 11486 + }, + { + "epoch": 0.6162553648068669, + "grad_norm": 0.6015625, + "learning_rate": 4.886845901561137e-06, + "loss": 2.5097, + "step": 11487 + }, + { + "epoch": 0.6163090128755365, + "grad_norm": 0.404296875, + "learning_rate": 4.8868200589785484e-06, + "loss": 2.1179, + "step": 11488 + }, + { + "epoch": 0.616362660944206, + "grad_norm": 0.458984375, + "learning_rate": 4.886794213513627e-06, + "loss": 2.1566, + "step": 11489 + }, + { + "epoch": 0.6164163090128756, + "grad_norm": 0.69921875, + "learning_rate": 4.886768365166404e-06, + "loss": 2.2162, + "step": 11490 + }, + { + "epoch": 0.6164699570815451, + "grad_norm": 0.484375, + "learning_rate": 4.8867425139369104e-06, + "loss": 2.2814, + "step": 11491 + }, + { + "epoch": 0.6165236051502145, + "grad_norm": 0.431640625, + "learning_rate": 4.8867166598251775e-06, + "loss": 2.3058, + "step": 11492 + }, + { + "epoch": 0.6165772532188841, + "grad_norm": 0.498046875, + "learning_rate": 4.886690802831236e-06, + "loss": 2.501, + "step": 11493 + }, + { + "epoch": 0.6166309012875536, + "grad_norm": 0.4296875, + "learning_rate": 4.88666494295512e-06, + "loss": 2.2755, + "step": 11494 + }, + { + "epoch": 0.6166845493562232, + "grad_norm": 0.427734375, + "learning_rate": 4.886639080196856e-06, + "loss": 2.2552, + "step": 11495 + }, + { + "epoch": 0.6167381974248927, + "grad_norm": 0.404296875, + "learning_rate": 4.8866132145564784e-06, + "loss": 2.2216, + "step": 11496 + }, + { + "epoch": 0.6167918454935623, + "grad_norm": 0.5, + "learning_rate": 4.886587346034018e-06, + "loss": 2.713, + "step": 11497 + }, + { + "epoch": 0.6168454935622317, + "grad_norm": 0.4375, + "learning_rate": 4.886561474629505e-06, + "loss": 2.288, + "step": 11498 + }, + { + "epoch": 0.6168991416309013, + "grad_norm": 0.4765625, + "learning_rate": 4.886535600342973e-06, + "loss": 2.4681, + "step": 11499 + }, + { + "epoch": 0.6169527896995708, + "grad_norm": 0.40234375, + "learning_rate": 4.886509723174451e-06, + "loss": 2.3172, + "step": 11500 + }, + { + "epoch": 0.6170064377682404, + "grad_norm": 0.484375, + "learning_rate": 4.8864838431239706e-06, + "loss": 2.3955, + "step": 11501 + }, + { + "epoch": 0.6170600858369099, + "grad_norm": 0.65625, + "learning_rate": 4.886457960191563e-06, + "loss": 2.1908, + "step": 11502 + }, + { + "epoch": 0.6171137339055794, + "grad_norm": 0.546875, + "learning_rate": 4.8864320743772604e-06, + "loss": 2.3747, + "step": 11503 + }, + { + "epoch": 0.6171673819742489, + "grad_norm": 0.4609375, + "learning_rate": 4.8864061856810935e-06, + "loss": 2.371, + "step": 11504 + }, + { + "epoch": 0.6172210300429185, + "grad_norm": 1.0234375, + "learning_rate": 4.8863802941030935e-06, + "loss": 2.1678, + "step": 11505 + }, + { + "epoch": 0.617274678111588, + "grad_norm": 0.48046875, + "learning_rate": 4.886354399643292e-06, + "loss": 2.3486, + "step": 11506 + }, + { + "epoch": 0.6173283261802575, + "grad_norm": 0.55859375, + "learning_rate": 4.8863285023017195e-06, + "loss": 1.4225, + "step": 11507 + }, + { + "epoch": 0.617381974248927, + "grad_norm": 0.6328125, + "learning_rate": 4.886302602078407e-06, + "loss": 1.8268, + "step": 11508 + }, + { + "epoch": 0.6174356223175965, + "grad_norm": 0.373046875, + "learning_rate": 4.8862766989733875e-06, + "loss": 2.423, + "step": 11509 + }, + { + "epoch": 0.6174892703862661, + "grad_norm": 0.7578125, + "learning_rate": 4.886250792986692e-06, + "loss": 1.6803, + "step": 11510 + }, + { + "epoch": 0.6175429184549356, + "grad_norm": 0.396484375, + "learning_rate": 4.88622488411835e-06, + "loss": 2.0604, + "step": 11511 + }, + { + "epoch": 0.6175965665236052, + "grad_norm": 0.44921875, + "learning_rate": 4.886198972368394e-06, + "loss": 2.4886, + "step": 11512 + }, + { + "epoch": 0.6176502145922746, + "grad_norm": 0.466796875, + "learning_rate": 4.886173057736856e-06, + "loss": 2.3158, + "step": 11513 + }, + { + "epoch": 0.6177038626609442, + "grad_norm": 0.53515625, + "learning_rate": 4.886147140223766e-06, + "loss": 2.2563, + "step": 11514 + }, + { + "epoch": 0.6177575107296137, + "grad_norm": 0.48046875, + "learning_rate": 4.886121219829155e-06, + "loss": 1.8435, + "step": 11515 + }, + { + "epoch": 0.6178111587982833, + "grad_norm": 0.453125, + "learning_rate": 4.886095296553056e-06, + "loss": 2.3827, + "step": 11516 + }, + { + "epoch": 0.6178648068669528, + "grad_norm": 0.369140625, + "learning_rate": 4.8860693703954985e-06, + "loss": 2.1508, + "step": 11517 + }, + { + "epoch": 0.6179184549356224, + "grad_norm": 0.39453125, + "learning_rate": 4.886043441356516e-06, + "loss": 2.5492, + "step": 11518 + }, + { + "epoch": 0.6179721030042918, + "grad_norm": 0.451171875, + "learning_rate": 4.8860175094361375e-06, + "loss": 2.2134, + "step": 11519 + }, + { + "epoch": 0.6180257510729614, + "grad_norm": 0.455078125, + "learning_rate": 4.885991574634396e-06, + "loss": 2.0502, + "step": 11520 + }, + { + "epoch": 0.6180793991416309, + "grad_norm": 0.419921875, + "learning_rate": 4.885965636951321e-06, + "loss": 2.2403, + "step": 11521 + }, + { + "epoch": 0.6181330472103004, + "grad_norm": 0.486328125, + "learning_rate": 4.885939696386946e-06, + "loss": 2.3829, + "step": 11522 + }, + { + "epoch": 0.61818669527897, + "grad_norm": 0.453125, + "learning_rate": 4.885913752941301e-06, + "loss": 2.5446, + "step": 11523 + }, + { + "epoch": 0.6182403433476394, + "grad_norm": 0.640625, + "learning_rate": 4.8858878066144175e-06, + "loss": 2.5528, + "step": 11524 + }, + { + "epoch": 0.618293991416309, + "grad_norm": 0.47265625, + "learning_rate": 4.885861857406327e-06, + "loss": 2.2779, + "step": 11525 + }, + { + "epoch": 0.6183476394849785, + "grad_norm": 0.380859375, + "learning_rate": 4.885835905317061e-06, + "loss": 2.1349, + "step": 11526 + }, + { + "epoch": 0.6184012875536481, + "grad_norm": 0.44140625, + "learning_rate": 4.8858099503466505e-06, + "loss": 1.9759, + "step": 11527 + }, + { + "epoch": 0.6184549356223176, + "grad_norm": 0.59765625, + "learning_rate": 4.885783992495127e-06, + "loss": 1.9112, + "step": 11528 + }, + { + "epoch": 0.6185085836909872, + "grad_norm": 0.55859375, + "learning_rate": 4.885758031762521e-06, + "loss": 2.2465, + "step": 11529 + }, + { + "epoch": 0.6185622317596566, + "grad_norm": 0.44140625, + "learning_rate": 4.8857320681488656e-06, + "loss": 2.1173, + "step": 11530 + }, + { + "epoch": 0.6186158798283262, + "grad_norm": 0.39453125, + "learning_rate": 4.88570610165419e-06, + "loss": 2.1442, + "step": 11531 + }, + { + "epoch": 0.6186695278969957, + "grad_norm": 0.60546875, + "learning_rate": 4.885680132278528e-06, + "loss": 1.3683, + "step": 11532 + }, + { + "epoch": 0.6187231759656653, + "grad_norm": 0.41015625, + "learning_rate": 4.88565416002191e-06, + "loss": 2.2469, + "step": 11533 + }, + { + "epoch": 0.6187768240343348, + "grad_norm": 0.671875, + "learning_rate": 4.885628184884367e-06, + "loss": 2.3374, + "step": 11534 + }, + { + "epoch": 0.6188304721030042, + "grad_norm": 0.423828125, + "learning_rate": 4.8856022068659294e-06, + "loss": 2.2802, + "step": 11535 + }, + { + "epoch": 0.6188841201716738, + "grad_norm": 0.53125, + "learning_rate": 4.8855762259666305e-06, + "loss": 2.2407, + "step": 11536 + }, + { + "epoch": 0.6189377682403433, + "grad_norm": 0.494140625, + "learning_rate": 4.8855502421865006e-06, + "loss": 2.2477, + "step": 11537 + }, + { + "epoch": 0.6189914163090129, + "grad_norm": 0.390625, + "learning_rate": 4.885524255525571e-06, + "loss": 1.931, + "step": 11538 + }, + { + "epoch": 0.6190450643776824, + "grad_norm": 0.4609375, + "learning_rate": 4.885498265983874e-06, + "loss": 2.2294, + "step": 11539 + }, + { + "epoch": 0.619098712446352, + "grad_norm": 0.4296875, + "learning_rate": 4.88547227356144e-06, + "loss": 2.6007, + "step": 11540 + }, + { + "epoch": 0.6191523605150214, + "grad_norm": 0.6171875, + "learning_rate": 4.8854462782583e-06, + "loss": 2.2275, + "step": 11541 + }, + { + "epoch": 0.619206008583691, + "grad_norm": 0.5234375, + "learning_rate": 4.885420280074488e-06, + "loss": 2.2789, + "step": 11542 + }, + { + "epoch": 0.6192596566523605, + "grad_norm": 0.486328125, + "learning_rate": 4.885394279010031e-06, + "loss": 2.3515, + "step": 11543 + }, + { + "epoch": 0.6193133047210301, + "grad_norm": 0.435546875, + "learning_rate": 4.885368275064965e-06, + "loss": 2.1332, + "step": 11544 + }, + { + "epoch": 0.6193669527896996, + "grad_norm": 2.5625, + "learning_rate": 4.885342268239319e-06, + "loss": 2.5585, + "step": 11545 + }, + { + "epoch": 0.6194206008583691, + "grad_norm": 0.51953125, + "learning_rate": 4.885316258533124e-06, + "loss": 2.2461, + "step": 11546 + }, + { + "epoch": 0.6194742489270386, + "grad_norm": 0.396484375, + "learning_rate": 4.8852902459464125e-06, + "loss": 2.0252, + "step": 11547 + }, + { + "epoch": 0.6195278969957082, + "grad_norm": 0.5078125, + "learning_rate": 4.885264230479216e-06, + "loss": 2.2898, + "step": 11548 + }, + { + "epoch": 0.6195815450643777, + "grad_norm": 0.4375, + "learning_rate": 4.885238212131564e-06, + "loss": 2.0515, + "step": 11549 + }, + { + "epoch": 0.6196351931330472, + "grad_norm": 0.388671875, + "learning_rate": 4.885212190903491e-06, + "loss": 2.0386, + "step": 11550 + }, + { + "epoch": 0.6196888412017167, + "grad_norm": 0.4609375, + "learning_rate": 4.885186166795026e-06, + "loss": 2.299, + "step": 11551 + }, + { + "epoch": 0.6197424892703862, + "grad_norm": 0.447265625, + "learning_rate": 4.885160139806202e-06, + "loss": 2.5281, + "step": 11552 + }, + { + "epoch": 0.6197961373390558, + "grad_norm": 0.46875, + "learning_rate": 4.885134109937049e-06, + "loss": 1.5749, + "step": 11553 + }, + { + "epoch": 0.6198497854077253, + "grad_norm": 0.5625, + "learning_rate": 4.8851080771876e-06, + "loss": 2.4364, + "step": 11554 + }, + { + "epoch": 0.6199034334763949, + "grad_norm": 0.388671875, + "learning_rate": 4.885082041557885e-06, + "loss": 2.0529, + "step": 11555 + }, + { + "epoch": 0.6199570815450643, + "grad_norm": 0.48828125, + "learning_rate": 4.885056003047935e-06, + "loss": 2.129, + "step": 11556 + }, + { + "epoch": 0.6200107296137339, + "grad_norm": 0.396484375, + "learning_rate": 4.885029961657783e-06, + "loss": 2.1471, + "step": 11557 + }, + { + "epoch": 0.6200643776824034, + "grad_norm": 0.59375, + "learning_rate": 4.885003917387461e-06, + "loss": 2.3555, + "step": 11558 + }, + { + "epoch": 0.620118025751073, + "grad_norm": 0.443359375, + "learning_rate": 4.884977870236998e-06, + "loss": 2.2899, + "step": 11559 + }, + { + "epoch": 0.6201716738197425, + "grad_norm": 0.478515625, + "learning_rate": 4.884951820206427e-06, + "loss": 2.3169, + "step": 11560 + }, + { + "epoch": 0.6202253218884121, + "grad_norm": 0.7890625, + "learning_rate": 4.88492576729578e-06, + "loss": 1.7141, + "step": 11561 + }, + { + "epoch": 0.6202789699570815, + "grad_norm": 0.380859375, + "learning_rate": 4.884899711505087e-06, + "loss": 1.8717, + "step": 11562 + }, + { + "epoch": 0.6203326180257511, + "grad_norm": 0.443359375, + "learning_rate": 4.884873652834381e-06, + "loss": 2.6487, + "step": 11563 + }, + { + "epoch": 0.6203862660944206, + "grad_norm": 0.43359375, + "learning_rate": 4.884847591283691e-06, + "loss": 2.275, + "step": 11564 + }, + { + "epoch": 0.6204399141630901, + "grad_norm": 0.427734375, + "learning_rate": 4.884821526853052e-06, + "loss": 2.3478, + "step": 11565 + }, + { + "epoch": 0.6204935622317597, + "grad_norm": 0.466796875, + "learning_rate": 4.884795459542493e-06, + "loss": 2.3139, + "step": 11566 + }, + { + "epoch": 0.6205472103004291, + "grad_norm": 0.494140625, + "learning_rate": 4.8847693893520456e-06, + "loss": 2.4745, + "step": 11567 + }, + { + "epoch": 0.6206008583690987, + "grad_norm": 0.44921875, + "learning_rate": 4.884743316281742e-06, + "loss": 2.2648, + "step": 11568 + }, + { + "epoch": 0.6206545064377682, + "grad_norm": 0.427734375, + "learning_rate": 4.884717240331613e-06, + "loss": 2.4113, + "step": 11569 + }, + { + "epoch": 0.6207081545064378, + "grad_norm": 0.443359375, + "learning_rate": 4.884691161501691e-06, + "loss": 2.2406, + "step": 11570 + }, + { + "epoch": 0.6207618025751073, + "grad_norm": 0.435546875, + "learning_rate": 4.884665079792007e-06, + "loss": 2.4243, + "step": 11571 + }, + { + "epoch": 0.6208154506437769, + "grad_norm": 0.43359375, + "learning_rate": 4.884638995202592e-06, + "loss": 2.239, + "step": 11572 + }, + { + "epoch": 0.6208690987124463, + "grad_norm": 0.53515625, + "learning_rate": 4.884612907733479e-06, + "loss": 2.2545, + "step": 11573 + }, + { + "epoch": 0.6209227467811159, + "grad_norm": 0.498046875, + "learning_rate": 4.8845868173846975e-06, + "loss": 2.2171, + "step": 11574 + }, + { + "epoch": 0.6209763948497854, + "grad_norm": 0.484375, + "learning_rate": 4.884560724156281e-06, + "loss": 2.2868, + "step": 11575 + }, + { + "epoch": 0.621030042918455, + "grad_norm": 0.462890625, + "learning_rate": 4.88453462804826e-06, + "loss": 2.2427, + "step": 11576 + }, + { + "epoch": 0.6210836909871245, + "grad_norm": 0.484375, + "learning_rate": 4.884508529060666e-06, + "loss": 2.4392, + "step": 11577 + }, + { + "epoch": 0.621137339055794, + "grad_norm": 0.42578125, + "learning_rate": 4.884482427193529e-06, + "loss": 2.2338, + "step": 11578 + }, + { + "epoch": 0.6211909871244635, + "grad_norm": 0.474609375, + "learning_rate": 4.884456322446883e-06, + "loss": 2.2219, + "step": 11579 + }, + { + "epoch": 0.621244635193133, + "grad_norm": 0.439453125, + "learning_rate": 4.884430214820759e-06, + "loss": 2.1389, + "step": 11580 + }, + { + "epoch": 0.6212982832618026, + "grad_norm": 0.65625, + "learning_rate": 4.8844041043151876e-06, + "loss": 2.3026, + "step": 11581 + }, + { + "epoch": 0.6213519313304721, + "grad_norm": 0.416015625, + "learning_rate": 4.884377990930202e-06, + "loss": 2.1746, + "step": 11582 + }, + { + "epoch": 0.6214055793991416, + "grad_norm": 0.486328125, + "learning_rate": 4.884351874665831e-06, + "loss": 2.2118, + "step": 11583 + }, + { + "epoch": 0.6214592274678111, + "grad_norm": 0.498046875, + "learning_rate": 4.884325755522109e-06, + "loss": 2.1974, + "step": 11584 + }, + { + "epoch": 0.6215128755364807, + "grad_norm": 0.484375, + "learning_rate": 4.884299633499066e-06, + "loss": 2.1346, + "step": 11585 + }, + { + "epoch": 0.6215665236051502, + "grad_norm": 0.423828125, + "learning_rate": 4.884273508596734e-06, + "loss": 2.4957, + "step": 11586 + }, + { + "epoch": 0.6216201716738198, + "grad_norm": 0.5859375, + "learning_rate": 4.884247380815144e-06, + "loss": 2.2132, + "step": 11587 + }, + { + "epoch": 0.6216738197424893, + "grad_norm": 0.46484375, + "learning_rate": 4.884221250154327e-06, + "loss": 2.1546, + "step": 11588 + }, + { + "epoch": 0.6217274678111588, + "grad_norm": 0.48828125, + "learning_rate": 4.884195116614318e-06, + "loss": 2.2044, + "step": 11589 + }, + { + "epoch": 0.6217811158798283, + "grad_norm": 0.44921875, + "learning_rate": 4.884168980195144e-06, + "loss": 2.1855, + "step": 11590 + }, + { + "epoch": 0.6218347639484979, + "grad_norm": 0.443359375, + "learning_rate": 4.884142840896839e-06, + "loss": 2.4234, + "step": 11591 + }, + { + "epoch": 0.6218884120171674, + "grad_norm": 0.498046875, + "learning_rate": 4.884116698719434e-06, + "loss": 2.3095, + "step": 11592 + }, + { + "epoch": 0.6219420600858369, + "grad_norm": 2.328125, + "learning_rate": 4.884090553662961e-06, + "loss": 2.3284, + "step": 11593 + }, + { + "epoch": 0.6219957081545064, + "grad_norm": 0.478515625, + "learning_rate": 4.884064405727451e-06, + "loss": 2.2969, + "step": 11594 + }, + { + "epoch": 0.6220493562231759, + "grad_norm": 0.44140625, + "learning_rate": 4.8840382549129366e-06, + "loss": 2.4343, + "step": 11595 + }, + { + "epoch": 0.6221030042918455, + "grad_norm": 0.412109375, + "learning_rate": 4.884012101219448e-06, + "loss": 2.4067, + "step": 11596 + }, + { + "epoch": 0.622156652360515, + "grad_norm": 0.6953125, + "learning_rate": 4.883985944647018e-06, + "loss": 2.6129, + "step": 11597 + }, + { + "epoch": 0.6222103004291846, + "grad_norm": 0.439453125, + "learning_rate": 4.883959785195678e-06, + "loss": 2.4445, + "step": 11598 + }, + { + "epoch": 0.622263948497854, + "grad_norm": 0.71875, + "learning_rate": 4.883933622865459e-06, + "loss": 2.302, + "step": 11599 + }, + { + "epoch": 0.6223175965665236, + "grad_norm": 0.56640625, + "learning_rate": 4.8839074576563914e-06, + "loss": 2.2735, + "step": 11600 + }, + { + "epoch": 0.6223712446351931, + "grad_norm": 0.462890625, + "learning_rate": 4.883881289568509e-06, + "loss": 2.5784, + "step": 11601 + }, + { + "epoch": 0.6224248927038627, + "grad_norm": 0.416015625, + "learning_rate": 4.883855118601843e-06, + "loss": 1.6253, + "step": 11602 + }, + { + "epoch": 0.6224785407725322, + "grad_norm": 0.47265625, + "learning_rate": 4.883828944756425e-06, + "loss": 2.4058, + "step": 11603 + }, + { + "epoch": 0.6225321888412018, + "grad_norm": 0.515625, + "learning_rate": 4.883802768032286e-06, + "loss": 2.331, + "step": 11604 + }, + { + "epoch": 0.6225858369098712, + "grad_norm": 0.33984375, + "learning_rate": 4.8837765884294575e-06, + "loss": 2.2578, + "step": 11605 + }, + { + "epoch": 0.6226394849785408, + "grad_norm": 0.5, + "learning_rate": 4.8837504059479715e-06, + "loss": 2.2556, + "step": 11606 + }, + { + "epoch": 0.6226931330472103, + "grad_norm": 0.42578125, + "learning_rate": 4.883724220587859e-06, + "loss": 2.2155, + "step": 11607 + }, + { + "epoch": 0.6227467811158798, + "grad_norm": 0.478515625, + "learning_rate": 4.883698032349154e-06, + "loss": 2.2335, + "step": 11608 + }, + { + "epoch": 0.6228004291845494, + "grad_norm": 0.416015625, + "learning_rate": 4.883671841231884e-06, + "loss": 2.3487, + "step": 11609 + }, + { + "epoch": 0.6228540772532188, + "grad_norm": 0.419921875, + "learning_rate": 4.883645647236084e-06, + "loss": 2.1295, + "step": 11610 + }, + { + "epoch": 0.6229077253218884, + "grad_norm": 0.42578125, + "learning_rate": 4.883619450361785e-06, + "loss": 2.0862, + "step": 11611 + }, + { + "epoch": 0.6229613733905579, + "grad_norm": 0.53125, + "learning_rate": 4.8835932506090175e-06, + "loss": 2.1842, + "step": 11612 + }, + { + "epoch": 0.6230150214592275, + "grad_norm": 0.447265625, + "learning_rate": 4.883567047977815e-06, + "loss": 2.1495, + "step": 11613 + }, + { + "epoch": 0.623068669527897, + "grad_norm": 0.431640625, + "learning_rate": 4.883540842468207e-06, + "loss": 2.0737, + "step": 11614 + }, + { + "epoch": 0.6231223175965666, + "grad_norm": 0.40625, + "learning_rate": 4.883514634080226e-06, + "loss": 1.9345, + "step": 11615 + }, + { + "epoch": 0.623175965665236, + "grad_norm": 0.453125, + "learning_rate": 4.883488422813904e-06, + "loss": 2.3478, + "step": 11616 + }, + { + "epoch": 0.6232296137339056, + "grad_norm": 0.890625, + "learning_rate": 4.883462208669274e-06, + "loss": 2.1922, + "step": 11617 + }, + { + "epoch": 0.6232832618025751, + "grad_norm": 0.44140625, + "learning_rate": 4.883435991646364e-06, + "loss": 2.1402, + "step": 11618 + }, + { + "epoch": 0.6233369098712447, + "grad_norm": 0.43359375, + "learning_rate": 4.883409771745209e-06, + "loss": 2.3376, + "step": 11619 + }, + { + "epoch": 0.6233905579399142, + "grad_norm": 0.345703125, + "learning_rate": 4.883383548965838e-06, + "loss": 2.0865, + "step": 11620 + }, + { + "epoch": 0.6234442060085837, + "grad_norm": 0.69140625, + "learning_rate": 4.883357323308285e-06, + "loss": 2.1325, + "step": 11621 + }, + { + "epoch": 0.6234978540772532, + "grad_norm": 0.458984375, + "learning_rate": 4.883331094772581e-06, + "loss": 2.2553, + "step": 11622 + }, + { + "epoch": 0.6235515021459227, + "grad_norm": 0.47265625, + "learning_rate": 4.883304863358757e-06, + "loss": 2.4351, + "step": 11623 + }, + { + "epoch": 0.6236051502145923, + "grad_norm": 0.453125, + "learning_rate": 4.883278629066846e-06, + "loss": 2.3908, + "step": 11624 + }, + { + "epoch": 0.6236587982832618, + "grad_norm": 0.4453125, + "learning_rate": 4.883252391896878e-06, + "loss": 2.2645, + "step": 11625 + }, + { + "epoch": 0.6237124463519313, + "grad_norm": 0.4765625, + "learning_rate": 4.883226151848885e-06, + "loss": 2.256, + "step": 11626 + }, + { + "epoch": 0.6237660944206008, + "grad_norm": 0.4609375, + "learning_rate": 4.883199908922901e-06, + "loss": 2.1559, + "step": 11627 + }, + { + "epoch": 0.6238197424892704, + "grad_norm": 0.51171875, + "learning_rate": 4.883173663118954e-06, + "loss": 2.2804, + "step": 11628 + }, + { + "epoch": 0.6238733905579399, + "grad_norm": 0.431640625, + "learning_rate": 4.8831474144370785e-06, + "loss": 2.2828, + "step": 11629 + }, + { + "epoch": 0.6239270386266095, + "grad_norm": 0.39453125, + "learning_rate": 4.883121162877304e-06, + "loss": 2.0707, + "step": 11630 + }, + { + "epoch": 0.623980686695279, + "grad_norm": 0.466796875, + "learning_rate": 4.883094908439665e-06, + "loss": 2.3479, + "step": 11631 + }, + { + "epoch": 0.6240343347639485, + "grad_norm": 0.451171875, + "learning_rate": 4.883068651124192e-06, + "loss": 2.4009, + "step": 11632 + }, + { + "epoch": 0.624087982832618, + "grad_norm": 0.435546875, + "learning_rate": 4.883042390930915e-06, + "loss": 2.1793, + "step": 11633 + }, + { + "epoch": 0.6241416309012876, + "grad_norm": 0.453125, + "learning_rate": 4.8830161278598674e-06, + "loss": 2.4986, + "step": 11634 + }, + { + "epoch": 0.6241952789699571, + "grad_norm": 0.4296875, + "learning_rate": 4.8829898619110805e-06, + "loss": 2.3331, + "step": 11635 + }, + { + "epoch": 0.6242489270386266, + "grad_norm": 0.474609375, + "learning_rate": 4.8829635930845865e-06, + "loss": 2.2527, + "step": 11636 + }, + { + "epoch": 0.6243025751072961, + "grad_norm": 0.4609375, + "learning_rate": 4.8829373213804165e-06, + "loss": 2.1674, + "step": 11637 + }, + { + "epoch": 0.6243562231759656, + "grad_norm": 0.470703125, + "learning_rate": 4.882911046798603e-06, + "loss": 2.4679, + "step": 11638 + }, + { + "epoch": 0.6244098712446352, + "grad_norm": 0.474609375, + "learning_rate": 4.882884769339176e-06, + "loss": 2.1293, + "step": 11639 + }, + { + "epoch": 0.6244635193133047, + "grad_norm": 0.41796875, + "learning_rate": 4.882858489002169e-06, + "loss": 2.2101, + "step": 11640 + }, + { + "epoch": 0.6245171673819743, + "grad_norm": 0.41796875, + "learning_rate": 4.8828322057876135e-06, + "loss": 2.2677, + "step": 11641 + }, + { + "epoch": 0.6245708154506437, + "grad_norm": 0.375, + "learning_rate": 4.882805919695541e-06, + "loss": 2.021, + "step": 11642 + }, + { + "epoch": 0.6246244635193133, + "grad_norm": 0.43359375, + "learning_rate": 4.882779630725983e-06, + "loss": 1.9758, + "step": 11643 + }, + { + "epoch": 0.6246781115879828, + "grad_norm": 0.423828125, + "learning_rate": 4.88275333887897e-06, + "loss": 2.3747, + "step": 11644 + }, + { + "epoch": 0.6247317596566524, + "grad_norm": 0.51171875, + "learning_rate": 4.882727044154537e-06, + "loss": 2.451, + "step": 11645 + }, + { + "epoch": 0.6247854077253219, + "grad_norm": 0.37109375, + "learning_rate": 4.882700746552713e-06, + "loss": 2.2596, + "step": 11646 + }, + { + "epoch": 0.6248390557939915, + "grad_norm": 0.55078125, + "learning_rate": 4.8826744460735315e-06, + "loss": 2.2858, + "step": 11647 + }, + { + "epoch": 0.6248927038626609, + "grad_norm": 0.62890625, + "learning_rate": 4.882648142717023e-06, + "loss": 2.2288, + "step": 11648 + }, + { + "epoch": 0.6249463519313305, + "grad_norm": 0.388671875, + "learning_rate": 4.882621836483219e-06, + "loss": 2.1437, + "step": 11649 + }, + { + "epoch": 0.625, + "grad_norm": 0.51953125, + "learning_rate": 4.8825955273721524e-06, + "loss": 2.4253, + "step": 11650 + }, + { + "epoch": 0.6250536480686695, + "grad_norm": 0.49609375, + "learning_rate": 4.8825692153838545e-06, + "loss": 2.2878, + "step": 11651 + }, + { + "epoch": 0.6251072961373391, + "grad_norm": 0.44921875, + "learning_rate": 4.8825429005183575e-06, + "loss": 2.4567, + "step": 11652 + }, + { + "epoch": 0.6251609442060085, + "grad_norm": 0.4765625, + "learning_rate": 4.882516582775693e-06, + "loss": 2.4459, + "step": 11653 + }, + { + "epoch": 0.6252145922746781, + "grad_norm": 0.490234375, + "learning_rate": 4.882490262155891e-06, + "loss": 2.6244, + "step": 11654 + }, + { + "epoch": 0.6252682403433476, + "grad_norm": 0.451171875, + "learning_rate": 4.882463938658985e-06, + "loss": 2.4555, + "step": 11655 + }, + { + "epoch": 0.6253218884120172, + "grad_norm": 0.419921875, + "learning_rate": 4.8824376122850085e-06, + "loss": 1.9677, + "step": 11656 + }, + { + "epoch": 0.6253755364806867, + "grad_norm": 0.462890625, + "learning_rate": 4.88241128303399e-06, + "loss": 2.2968, + "step": 11657 + }, + { + "epoch": 0.6254291845493563, + "grad_norm": 0.447265625, + "learning_rate": 4.882384950905963e-06, + "loss": 2.3199, + "step": 11658 + }, + { + "epoch": 0.6254828326180257, + "grad_norm": 0.48828125, + "learning_rate": 4.88235861590096e-06, + "loss": 2.3846, + "step": 11659 + }, + { + "epoch": 0.6255364806866953, + "grad_norm": 0.515625, + "learning_rate": 4.882332278019009e-06, + "loss": 2.4428, + "step": 11660 + }, + { + "epoch": 0.6255901287553648, + "grad_norm": 0.42578125, + "learning_rate": 4.882305937260148e-06, + "loss": 2.2708, + "step": 11661 + }, + { + "epoch": 0.6256437768240344, + "grad_norm": 0.38671875, + "learning_rate": 4.882279593624404e-06, + "loss": 2.1819, + "step": 11662 + }, + { + "epoch": 0.6256974248927039, + "grad_norm": 0.70703125, + "learning_rate": 4.8822532471118085e-06, + "loss": 2.2352, + "step": 11663 + }, + { + "epoch": 0.6257510729613734, + "grad_norm": 0.5234375, + "learning_rate": 4.882226897722397e-06, + "loss": 2.1836, + "step": 11664 + }, + { + "epoch": 0.6258047210300429, + "grad_norm": 0.48046875, + "learning_rate": 4.8822005454561994e-06, + "loss": 2.2332, + "step": 11665 + }, + { + "epoch": 0.6258583690987124, + "grad_norm": 0.486328125, + "learning_rate": 4.8821741903132465e-06, + "loss": 2.4031, + "step": 11666 + }, + { + "epoch": 0.625912017167382, + "grad_norm": 0.4140625, + "learning_rate": 4.882147832293572e-06, + "loss": 2.3089, + "step": 11667 + }, + { + "epoch": 0.6259656652360515, + "grad_norm": 0.431640625, + "learning_rate": 4.882121471397207e-06, + "loss": 2.1701, + "step": 11668 + }, + { + "epoch": 0.626019313304721, + "grad_norm": 0.455078125, + "learning_rate": 4.882095107624183e-06, + "loss": 2.2405, + "step": 11669 + }, + { + "epoch": 0.6260729613733905, + "grad_norm": 0.412109375, + "learning_rate": 4.882068740974532e-06, + "loss": 2.6176, + "step": 11670 + }, + { + "epoch": 0.6261266094420601, + "grad_norm": 0.546875, + "learning_rate": 4.882042371448285e-06, + "loss": 2.3761, + "step": 11671 + }, + { + "epoch": 0.6261802575107296, + "grad_norm": 0.478515625, + "learning_rate": 4.882015999045475e-06, + "loss": 2.3227, + "step": 11672 + }, + { + "epoch": 0.6262339055793992, + "grad_norm": 0.357421875, + "learning_rate": 4.881989623766135e-06, + "loss": 2.0097, + "step": 11673 + }, + { + "epoch": 0.6262875536480687, + "grad_norm": 0.484375, + "learning_rate": 4.881963245610294e-06, + "loss": 2.3109, + "step": 11674 + }, + { + "epoch": 0.6263412017167382, + "grad_norm": 0.56640625, + "learning_rate": 4.881936864577985e-06, + "loss": 2.2023, + "step": 11675 + }, + { + "epoch": 0.6263948497854077, + "grad_norm": 0.45703125, + "learning_rate": 4.8819104806692404e-06, + "loss": 2.4648, + "step": 11676 + }, + { + "epoch": 0.6264484978540773, + "grad_norm": 0.5078125, + "learning_rate": 4.881884093884093e-06, + "loss": 2.2742, + "step": 11677 + }, + { + "epoch": 0.6265021459227468, + "grad_norm": 0.51953125, + "learning_rate": 4.881857704222572e-06, + "loss": 2.2099, + "step": 11678 + }, + { + "epoch": 0.6265557939914163, + "grad_norm": 0.419921875, + "learning_rate": 4.881831311684712e-06, + "loss": 2.1408, + "step": 11679 + }, + { + "epoch": 0.6266094420600858, + "grad_norm": 0.3984375, + "learning_rate": 4.8818049162705415e-06, + "loss": 2.3515, + "step": 11680 + }, + { + "epoch": 0.6266630901287553, + "grad_norm": 0.458984375, + "learning_rate": 4.881778517980096e-06, + "loss": 2.2079, + "step": 11681 + }, + { + "epoch": 0.6267167381974249, + "grad_norm": 0.470703125, + "learning_rate": 4.881752116813406e-06, + "loss": 2.4768, + "step": 11682 + }, + { + "epoch": 0.6267703862660944, + "grad_norm": 0.419921875, + "learning_rate": 4.881725712770503e-06, + "loss": 2.3192, + "step": 11683 + }, + { + "epoch": 0.626824034334764, + "grad_norm": 1.2734375, + "learning_rate": 4.881699305851419e-06, + "loss": 2.4701, + "step": 11684 + }, + { + "epoch": 0.6268776824034334, + "grad_norm": 0.9765625, + "learning_rate": 4.8816728960561855e-06, + "loss": 2.2907, + "step": 11685 + }, + { + "epoch": 0.626931330472103, + "grad_norm": 0.357421875, + "learning_rate": 4.881646483384835e-06, + "loss": 1.8404, + "step": 11686 + }, + { + "epoch": 0.6269849785407725, + "grad_norm": 0.3828125, + "learning_rate": 4.8816200678374e-06, + "loss": 2.414, + "step": 11687 + }, + { + "epoch": 0.6270386266094421, + "grad_norm": 0.408203125, + "learning_rate": 4.881593649413911e-06, + "loss": 2.2011, + "step": 11688 + }, + { + "epoch": 0.6270922746781116, + "grad_norm": 0.45703125, + "learning_rate": 4.881567228114401e-06, + "loss": 2.3232, + "step": 11689 + }, + { + "epoch": 0.6271459227467812, + "grad_norm": 0.466796875, + "learning_rate": 4.881540803938901e-06, + "loss": 2.557, + "step": 11690 + }, + { + "epoch": 0.6271995708154506, + "grad_norm": 0.416015625, + "learning_rate": 4.881514376887444e-06, + "loss": 2.2275, + "step": 11691 + }, + { + "epoch": 0.6272532188841202, + "grad_norm": 0.392578125, + "learning_rate": 4.881487946960061e-06, + "loss": 2.1678, + "step": 11692 + }, + { + "epoch": 0.6273068669527897, + "grad_norm": 0.50390625, + "learning_rate": 4.881461514156784e-06, + "loss": 2.4071, + "step": 11693 + }, + { + "epoch": 0.6273605150214592, + "grad_norm": 0.47265625, + "learning_rate": 4.881435078477645e-06, + "loss": 1.8481, + "step": 11694 + }, + { + "epoch": 0.6274141630901288, + "grad_norm": 0.48046875, + "learning_rate": 4.8814086399226765e-06, + "loss": 2.2289, + "step": 11695 + }, + { + "epoch": 0.6274678111587982, + "grad_norm": 0.5703125, + "learning_rate": 4.8813821984919106e-06, + "loss": 2.3475, + "step": 11696 + }, + { + "epoch": 0.6275214592274678, + "grad_norm": 0.52734375, + "learning_rate": 4.881355754185378e-06, + "loss": 1.4403, + "step": 11697 + }, + { + "epoch": 0.6275751072961373, + "grad_norm": 0.54296875, + "learning_rate": 4.881329307003111e-06, + "loss": 2.3968, + "step": 11698 + }, + { + "epoch": 0.6276287553648069, + "grad_norm": 0.421875, + "learning_rate": 4.881302856945143e-06, + "loss": 2.058, + "step": 11699 + }, + { + "epoch": 0.6276824034334764, + "grad_norm": 0.4921875, + "learning_rate": 4.881276404011503e-06, + "loss": 1.6995, + "step": 11700 + }, + { + "epoch": 0.627736051502146, + "grad_norm": 0.72265625, + "learning_rate": 4.881249948202226e-06, + "loss": 2.3219, + "step": 11701 + }, + { + "epoch": 0.6277896995708154, + "grad_norm": 0.52734375, + "learning_rate": 4.881223489517341e-06, + "loss": 2.4404, + "step": 11702 + }, + { + "epoch": 0.627843347639485, + "grad_norm": 0.40234375, + "learning_rate": 4.881197027956883e-06, + "loss": 2.2833, + "step": 11703 + }, + { + "epoch": 0.6278969957081545, + "grad_norm": 0.44921875, + "learning_rate": 4.881170563520883e-06, + "loss": 2.2556, + "step": 11704 + }, + { + "epoch": 0.6279506437768241, + "grad_norm": 0.4296875, + "learning_rate": 4.881144096209372e-06, + "loss": 2.1451, + "step": 11705 + }, + { + "epoch": 0.6280042918454936, + "grad_norm": 0.4921875, + "learning_rate": 4.881117626022382e-06, + "loss": 2.5437, + "step": 11706 + }, + { + "epoch": 0.6280579399141631, + "grad_norm": 0.447265625, + "learning_rate": 4.881091152959946e-06, + "loss": 2.1739, + "step": 11707 + }, + { + "epoch": 0.6281115879828326, + "grad_norm": 0.46875, + "learning_rate": 4.881064677022095e-06, + "loss": 1.8445, + "step": 11708 + }, + { + "epoch": 0.6281652360515021, + "grad_norm": 0.4296875, + "learning_rate": 4.881038198208861e-06, + "loss": 2.3074, + "step": 11709 + }, + { + "epoch": 0.6282188841201717, + "grad_norm": 0.390625, + "learning_rate": 4.8810117165202765e-06, + "loss": 2.3671, + "step": 11710 + }, + { + "epoch": 0.6282725321888412, + "grad_norm": 0.46875, + "learning_rate": 4.8809852319563746e-06, + "loss": 2.2994, + "step": 11711 + }, + { + "epoch": 0.6283261802575107, + "grad_norm": 0.427734375, + "learning_rate": 4.8809587445171845e-06, + "loss": 2.091, + "step": 11712 + }, + { + "epoch": 0.6283798283261802, + "grad_norm": 0.46484375, + "learning_rate": 4.88093225420274e-06, + "loss": 2.3575, + "step": 11713 + }, + { + "epoch": 0.6284334763948498, + "grad_norm": 0.453125, + "learning_rate": 4.880905761013073e-06, + "loss": 2.2847, + "step": 11714 + }, + { + "epoch": 0.6284871244635193, + "grad_norm": 0.43359375, + "learning_rate": 4.880879264948216e-06, + "loss": 2.2354, + "step": 11715 + }, + { + "epoch": 0.6285407725321889, + "grad_norm": 0.61328125, + "learning_rate": 4.880852766008199e-06, + "loss": 2.4198, + "step": 11716 + }, + { + "epoch": 0.6285944206008584, + "grad_norm": 0.43359375, + "learning_rate": 4.880826264193056e-06, + "loss": 2.4352, + "step": 11717 + }, + { + "epoch": 0.6286480686695279, + "grad_norm": 0.478515625, + "learning_rate": 4.880799759502819e-06, + "loss": 1.8205, + "step": 11718 + }, + { + "epoch": 0.6287017167381974, + "grad_norm": 0.482421875, + "learning_rate": 4.880773251937518e-06, + "loss": 2.2127, + "step": 11719 + }, + { + "epoch": 0.628755364806867, + "grad_norm": 0.462890625, + "learning_rate": 4.880746741497187e-06, + "loss": 2.4817, + "step": 11720 + }, + { + "epoch": 0.6288090128755365, + "grad_norm": 0.44140625, + "learning_rate": 4.880720228181857e-06, + "loss": 2.1723, + "step": 11721 + }, + { + "epoch": 0.628862660944206, + "grad_norm": 0.458984375, + "learning_rate": 4.88069371199156e-06, + "loss": 2.2111, + "step": 11722 + }, + { + "epoch": 0.6289163090128755, + "grad_norm": 2.015625, + "learning_rate": 4.880667192926328e-06, + "loss": 2.2679, + "step": 11723 + }, + { + "epoch": 0.628969957081545, + "grad_norm": 0.376953125, + "learning_rate": 4.880640670986194e-06, + "loss": 2.2098, + "step": 11724 + }, + { + "epoch": 0.6290236051502146, + "grad_norm": 0.40625, + "learning_rate": 4.88061414617119e-06, + "loss": 2.2554, + "step": 11725 + }, + { + "epoch": 0.6290772532188841, + "grad_norm": 0.484375, + "learning_rate": 4.880587618481347e-06, + "loss": 2.3417, + "step": 11726 + }, + { + "epoch": 0.6291309012875537, + "grad_norm": 0.453125, + "learning_rate": 4.8805610879166975e-06, + "loss": 2.4953, + "step": 11727 + }, + { + "epoch": 0.6291845493562231, + "grad_norm": 0.4453125, + "learning_rate": 4.880534554477273e-06, + "loss": 2.3051, + "step": 11728 + }, + { + "epoch": 0.6292381974248927, + "grad_norm": 0.609375, + "learning_rate": 4.880508018163107e-06, + "loss": 2.3779, + "step": 11729 + }, + { + "epoch": 0.6292918454935622, + "grad_norm": 0.44140625, + "learning_rate": 4.88048147897423e-06, + "loss": 2.1911, + "step": 11730 + }, + { + "epoch": 0.6293454935622318, + "grad_norm": 0.59375, + "learning_rate": 4.880454936910675e-06, + "loss": 2.2581, + "step": 11731 + }, + { + "epoch": 0.6293991416309013, + "grad_norm": 0.53515625, + "learning_rate": 4.8804283919724735e-06, + "loss": 2.3569, + "step": 11732 + }, + { + "epoch": 0.6294527896995709, + "grad_norm": 0.443359375, + "learning_rate": 4.880401844159658e-06, + "loss": 1.6709, + "step": 11733 + }, + { + "epoch": 0.6295064377682403, + "grad_norm": 0.4375, + "learning_rate": 4.880375293472259e-06, + "loss": 1.9374, + "step": 11734 + }, + { + "epoch": 0.6295600858369099, + "grad_norm": 0.435546875, + "learning_rate": 4.880348739910311e-06, + "loss": 1.6321, + "step": 11735 + }, + { + "epoch": 0.6296137339055794, + "grad_norm": 0.388671875, + "learning_rate": 4.880322183473845e-06, + "loss": 2.2682, + "step": 11736 + }, + { + "epoch": 0.6296673819742489, + "grad_norm": 0.44921875, + "learning_rate": 4.880295624162893e-06, + "loss": 2.3649, + "step": 11737 + }, + { + "epoch": 0.6297210300429185, + "grad_norm": 0.48828125, + "learning_rate": 4.880269061977487e-06, + "loss": 2.2683, + "step": 11738 + }, + { + "epoch": 0.6297746781115879, + "grad_norm": 0.4765625, + "learning_rate": 4.880242496917659e-06, + "loss": 2.2554, + "step": 11739 + }, + { + "epoch": 0.6298283261802575, + "grad_norm": 0.404296875, + "learning_rate": 4.880215928983441e-06, + "loss": 2.1311, + "step": 11740 + }, + { + "epoch": 0.629881974248927, + "grad_norm": 0.44140625, + "learning_rate": 4.880189358174866e-06, + "loss": 2.2048, + "step": 11741 + }, + { + "epoch": 0.6299356223175966, + "grad_norm": 0.6484375, + "learning_rate": 4.880162784491965e-06, + "loss": 2.2803, + "step": 11742 + }, + { + "epoch": 0.6299892703862661, + "grad_norm": 0.51171875, + "learning_rate": 4.88013620793477e-06, + "loss": 2.1665, + "step": 11743 + }, + { + "epoch": 0.6300429184549357, + "grad_norm": 0.423828125, + "learning_rate": 4.8801096285033145e-06, + "loss": 2.3927, + "step": 11744 + }, + { + "epoch": 0.6300965665236051, + "grad_norm": 0.39453125, + "learning_rate": 4.8800830461976295e-06, + "loss": 2.2507, + "step": 11745 + }, + { + "epoch": 0.6301502145922747, + "grad_norm": 0.5546875, + "learning_rate": 4.880056461017747e-06, + "loss": 2.2628, + "step": 11746 + }, + { + "epoch": 0.6302038626609442, + "grad_norm": 0.490234375, + "learning_rate": 4.8800298729637e-06, + "loss": 2.3561, + "step": 11747 + }, + { + "epoch": 0.6302575107296138, + "grad_norm": 0.4375, + "learning_rate": 4.880003282035519e-06, + "loss": 1.9534, + "step": 11748 + }, + { + "epoch": 0.6303111587982833, + "grad_norm": 0.515625, + "learning_rate": 4.879976688233238e-06, + "loss": 2.5674, + "step": 11749 + }, + { + "epoch": 0.6303648068669528, + "grad_norm": 0.470703125, + "learning_rate": 4.879950091556888e-06, + "loss": 2.2503, + "step": 11750 + }, + { + "epoch": 0.6304184549356223, + "grad_norm": 0.35546875, + "learning_rate": 4.879923492006501e-06, + "loss": 2.0907, + "step": 11751 + }, + { + "epoch": 0.6304721030042918, + "grad_norm": 0.494140625, + "learning_rate": 4.8798968895821095e-06, + "loss": 2.2917, + "step": 11752 + }, + { + "epoch": 0.6305257510729614, + "grad_norm": 0.37890625, + "learning_rate": 4.879870284283746e-06, + "loss": 1.9699, + "step": 11753 + }, + { + "epoch": 0.6305793991416309, + "grad_norm": 0.439453125, + "learning_rate": 4.879843676111442e-06, + "loss": 2.4737, + "step": 11754 + }, + { + "epoch": 0.6306330472103004, + "grad_norm": 0.4609375, + "learning_rate": 4.87981706506523e-06, + "loss": 2.1372, + "step": 11755 + }, + { + "epoch": 0.6306866952789699, + "grad_norm": 0.515625, + "learning_rate": 4.879790451145141e-06, + "loss": 2.4914, + "step": 11756 + }, + { + "epoch": 0.6307403433476395, + "grad_norm": 0.455078125, + "learning_rate": 4.87976383435121e-06, + "loss": 2.4188, + "step": 11757 + }, + { + "epoch": 0.630793991416309, + "grad_norm": 0.44921875, + "learning_rate": 4.8797372146834655e-06, + "loss": 2.3233, + "step": 11758 + }, + { + "epoch": 0.6308476394849786, + "grad_norm": 0.427734375, + "learning_rate": 4.879710592141943e-06, + "loss": 2.5221, + "step": 11759 + }, + { + "epoch": 0.630901287553648, + "grad_norm": 0.388671875, + "learning_rate": 4.879683966726672e-06, + "loss": 2.0214, + "step": 11760 + }, + { + "epoch": 0.6309549356223176, + "grad_norm": 0.4375, + "learning_rate": 4.879657338437686e-06, + "loss": 2.1715, + "step": 11761 + }, + { + "epoch": 0.6310085836909871, + "grad_norm": 0.53125, + "learning_rate": 4.879630707275017e-06, + "loss": 2.2904, + "step": 11762 + }, + { + "epoch": 0.6310622317596567, + "grad_norm": 0.50390625, + "learning_rate": 4.879604073238697e-06, + "loss": 2.1998, + "step": 11763 + }, + { + "epoch": 0.6311158798283262, + "grad_norm": 0.462890625, + "learning_rate": 4.879577436328758e-06, + "loss": 2.1842, + "step": 11764 + }, + { + "epoch": 0.6311695278969958, + "grad_norm": 0.359375, + "learning_rate": 4.879550796545233e-06, + "loss": 2.0206, + "step": 11765 + }, + { + "epoch": 0.6312231759656652, + "grad_norm": 0.470703125, + "learning_rate": 4.8795241538881525e-06, + "loss": 2.4145, + "step": 11766 + }, + { + "epoch": 0.6312768240343347, + "grad_norm": 0.51953125, + "learning_rate": 4.87949750835755e-06, + "loss": 2.2013, + "step": 11767 + }, + { + "epoch": 0.6313304721030043, + "grad_norm": 0.474609375, + "learning_rate": 4.879470859953458e-06, + "loss": 2.2615, + "step": 11768 + }, + { + "epoch": 0.6313841201716738, + "grad_norm": 0.4375, + "learning_rate": 4.879444208675907e-06, + "loss": 2.6502, + "step": 11769 + }, + { + "epoch": 0.6314377682403434, + "grad_norm": 0.416015625, + "learning_rate": 4.879417554524931e-06, + "loss": 2.3578, + "step": 11770 + }, + { + "epoch": 0.6314914163090128, + "grad_norm": 0.474609375, + "learning_rate": 4.879390897500562e-06, + "loss": 2.5694, + "step": 11771 + }, + { + "epoch": 0.6315450643776824, + "grad_norm": 0.7421875, + "learning_rate": 4.879364237602831e-06, + "loss": 2.1856, + "step": 11772 + }, + { + "epoch": 0.6315987124463519, + "grad_norm": 0.443359375, + "learning_rate": 4.87933757483177e-06, + "loss": 2.173, + "step": 11773 + }, + { + "epoch": 0.6316523605150215, + "grad_norm": 0.443359375, + "learning_rate": 4.879310909187413e-06, + "loss": 2.3273, + "step": 11774 + }, + { + "epoch": 0.631706008583691, + "grad_norm": 0.43359375, + "learning_rate": 4.879284240669792e-06, + "loss": 2.3673, + "step": 11775 + }, + { + "epoch": 0.6317596566523606, + "grad_norm": 0.45703125, + "learning_rate": 4.879257569278937e-06, + "loss": 2.4583, + "step": 11776 + }, + { + "epoch": 0.63181330472103, + "grad_norm": 0.455078125, + "learning_rate": 4.879230895014883e-06, + "loss": 2.2747, + "step": 11777 + }, + { + "epoch": 0.6318669527896996, + "grad_norm": 0.38671875, + "learning_rate": 4.87920421787766e-06, + "loss": 2.4013, + "step": 11778 + }, + { + "epoch": 0.6319206008583691, + "grad_norm": 0.54296875, + "learning_rate": 4.8791775378673016e-06, + "loss": 2.3647, + "step": 11779 + }, + { + "epoch": 0.6319742489270386, + "grad_norm": 0.98046875, + "learning_rate": 4.8791508549838385e-06, + "loss": 2.457, + "step": 11780 + }, + { + "epoch": 0.6320278969957082, + "grad_norm": 1.2265625, + "learning_rate": 4.879124169227304e-06, + "loss": 2.4625, + "step": 11781 + }, + { + "epoch": 0.6320815450643776, + "grad_norm": 0.5234375, + "learning_rate": 4.879097480597731e-06, + "loss": 2.3734, + "step": 11782 + }, + { + "epoch": 0.6321351931330472, + "grad_norm": 0.5703125, + "learning_rate": 4.879070789095151e-06, + "loss": 2.1941, + "step": 11783 + }, + { + "epoch": 0.6321888412017167, + "grad_norm": 0.38671875, + "learning_rate": 4.879044094719595e-06, + "loss": 2.4067, + "step": 11784 + }, + { + "epoch": 0.6322424892703863, + "grad_norm": 0.462890625, + "learning_rate": 4.879017397471097e-06, + "loss": 2.3288, + "step": 11785 + }, + { + "epoch": 0.6322961373390558, + "grad_norm": 0.470703125, + "learning_rate": 4.878990697349689e-06, + "loss": 2.2667, + "step": 11786 + }, + { + "epoch": 0.6323497854077254, + "grad_norm": 0.447265625, + "learning_rate": 4.878963994355404e-06, + "loss": 2.3711, + "step": 11787 + }, + { + "epoch": 0.6324034334763948, + "grad_norm": 0.375, + "learning_rate": 4.878937288488271e-06, + "loss": 2.1579, + "step": 11788 + }, + { + "epoch": 0.6324570815450644, + "grad_norm": 0.515625, + "learning_rate": 4.878910579748326e-06, + "loss": 2.5605, + "step": 11789 + }, + { + "epoch": 0.6325107296137339, + "grad_norm": 0.5234375, + "learning_rate": 4.8788838681355985e-06, + "loss": 2.3089, + "step": 11790 + }, + { + "epoch": 0.6325643776824035, + "grad_norm": 0.921875, + "learning_rate": 4.878857153650123e-06, + "loss": 2.2065, + "step": 11791 + }, + { + "epoch": 0.632618025751073, + "grad_norm": 0.44921875, + "learning_rate": 4.87883043629193e-06, + "loss": 2.1796, + "step": 11792 + }, + { + "epoch": 0.6326716738197425, + "grad_norm": 0.408203125, + "learning_rate": 4.8788037160610526e-06, + "loss": 2.1618, + "step": 11793 + }, + { + "epoch": 0.632725321888412, + "grad_norm": 0.412109375, + "learning_rate": 4.8787769929575235e-06, + "loss": 2.0211, + "step": 11794 + }, + { + "epoch": 0.6327789699570815, + "grad_norm": 0.48828125, + "learning_rate": 4.878750266981374e-06, + "loss": 2.4827, + "step": 11795 + }, + { + "epoch": 0.6328326180257511, + "grad_norm": 0.419921875, + "learning_rate": 4.878723538132636e-06, + "loss": 2.2533, + "step": 11796 + }, + { + "epoch": 0.6328862660944206, + "grad_norm": 0.474609375, + "learning_rate": 4.878696806411344e-06, + "loss": 2.3723, + "step": 11797 + }, + { + "epoch": 0.6329399141630901, + "grad_norm": 0.392578125, + "learning_rate": 4.878670071817527e-06, + "loss": 1.9956, + "step": 11798 + }, + { + "epoch": 0.6329935622317596, + "grad_norm": 0.55859375, + "learning_rate": 4.8786433343512206e-06, + "loss": 1.5947, + "step": 11799 + }, + { + "epoch": 0.6330472103004292, + "grad_norm": 0.53515625, + "learning_rate": 4.878616594012456e-06, + "loss": 2.4018, + "step": 11800 + }, + { + "epoch": 0.6331008583690987, + "grad_norm": 1.2421875, + "learning_rate": 4.878589850801264e-06, + "loss": 2.3683, + "step": 11801 + }, + { + "epoch": 0.6331545064377683, + "grad_norm": 0.44921875, + "learning_rate": 4.878563104717678e-06, + "loss": 2.246, + "step": 11802 + }, + { + "epoch": 0.6332081545064377, + "grad_norm": 0.4453125, + "learning_rate": 4.878536355761731e-06, + "loss": 2.447, + "step": 11803 + }, + { + "epoch": 0.6332618025751073, + "grad_norm": 0.427734375, + "learning_rate": 4.878509603933453e-06, + "loss": 2.2863, + "step": 11804 + }, + { + "epoch": 0.6333154506437768, + "grad_norm": 0.45703125, + "learning_rate": 4.878482849232879e-06, + "loss": 2.4683, + "step": 11805 + }, + { + "epoch": 0.6333690987124464, + "grad_norm": 0.44921875, + "learning_rate": 4.8784560916600394e-06, + "loss": 2.2012, + "step": 11806 + }, + { + "epoch": 0.6334227467811159, + "grad_norm": 0.62109375, + "learning_rate": 4.878429331214969e-06, + "loss": 2.4155, + "step": 11807 + }, + { + "epoch": 0.6334763948497855, + "grad_norm": 0.482421875, + "learning_rate": 4.878402567897698e-06, + "loss": 2.3274, + "step": 11808 + }, + { + "epoch": 0.6335300429184549, + "grad_norm": 0.53125, + "learning_rate": 4.878375801708258e-06, + "loss": 1.9642, + "step": 11809 + }, + { + "epoch": 0.6335836909871244, + "grad_norm": 0.4453125, + "learning_rate": 4.878349032646683e-06, + "loss": 2.4588, + "step": 11810 + }, + { + "epoch": 0.633637339055794, + "grad_norm": 0.41796875, + "learning_rate": 4.878322260713005e-06, + "loss": 2.4855, + "step": 11811 + }, + { + "epoch": 0.6336909871244635, + "grad_norm": 0.462890625, + "learning_rate": 4.878295485907256e-06, + "loss": 2.0803, + "step": 11812 + }, + { + "epoch": 0.6337446351931331, + "grad_norm": 0.5234375, + "learning_rate": 4.878268708229468e-06, + "loss": 2.126, + "step": 11813 + }, + { + "epoch": 0.6337982832618025, + "grad_norm": 0.41796875, + "learning_rate": 4.878241927679675e-06, + "loss": 2.2381, + "step": 11814 + }, + { + "epoch": 0.6338519313304721, + "grad_norm": 0.42578125, + "learning_rate": 4.878215144257907e-06, + "loss": 1.6145, + "step": 11815 + }, + { + "epoch": 0.6339055793991416, + "grad_norm": 0.396484375, + "learning_rate": 4.878188357964197e-06, + "loss": 2.2135, + "step": 11816 + }, + { + "epoch": 0.6339592274678112, + "grad_norm": 0.39453125, + "learning_rate": 4.87816156879858e-06, + "loss": 2.3639, + "step": 11817 + }, + { + "epoch": 0.6340128755364807, + "grad_norm": 0.41796875, + "learning_rate": 4.878134776761084e-06, + "loss": 2.1797, + "step": 11818 + }, + { + "epoch": 0.6340665236051503, + "grad_norm": 0.396484375, + "learning_rate": 4.878107981851745e-06, + "loss": 2.1324, + "step": 11819 + }, + { + "epoch": 0.6341201716738197, + "grad_norm": 0.423828125, + "learning_rate": 4.878081184070593e-06, + "loss": 2.2006, + "step": 11820 + }, + { + "epoch": 0.6341738197424893, + "grad_norm": 0.48046875, + "learning_rate": 4.878054383417662e-06, + "loss": 2.1313, + "step": 11821 + }, + { + "epoch": 0.6342274678111588, + "grad_norm": 0.41796875, + "learning_rate": 4.8780275798929835e-06, + "loss": 2.3998, + "step": 11822 + }, + { + "epoch": 0.6342811158798283, + "grad_norm": 0.455078125, + "learning_rate": 4.878000773496589e-06, + "loss": 2.2859, + "step": 11823 + }, + { + "epoch": 0.6343347639484979, + "grad_norm": 0.43359375, + "learning_rate": 4.877973964228513e-06, + "loss": 1.9177, + "step": 11824 + }, + { + "epoch": 0.6343884120171673, + "grad_norm": 0.486328125, + "learning_rate": 4.8779471520887854e-06, + "loss": 2.2461, + "step": 11825 + }, + { + "epoch": 0.6344420600858369, + "grad_norm": 0.49609375, + "learning_rate": 4.877920337077441e-06, + "loss": 2.1263, + "step": 11826 + }, + { + "epoch": 0.6344957081545064, + "grad_norm": 0.41015625, + "learning_rate": 4.877893519194511e-06, + "loss": 2.2492, + "step": 11827 + }, + { + "epoch": 0.634549356223176, + "grad_norm": 0.67578125, + "learning_rate": 4.877866698440028e-06, + "loss": 2.3071, + "step": 11828 + }, + { + "epoch": 0.6346030042918455, + "grad_norm": 0.4296875, + "learning_rate": 4.877839874814024e-06, + "loss": 2.1393, + "step": 11829 + }, + { + "epoch": 0.634656652360515, + "grad_norm": 0.65625, + "learning_rate": 4.877813048316531e-06, + "loss": 2.3267, + "step": 11830 + }, + { + "epoch": 0.6347103004291845, + "grad_norm": 0.546875, + "learning_rate": 4.877786218947583e-06, + "loss": 2.4507, + "step": 11831 + }, + { + "epoch": 0.6347639484978541, + "grad_norm": 0.5, + "learning_rate": 4.877759386707211e-06, + "loss": 2.3006, + "step": 11832 + }, + { + "epoch": 0.6348175965665236, + "grad_norm": 0.466796875, + "learning_rate": 4.877732551595448e-06, + "loss": 2.33, + "step": 11833 + }, + { + "epoch": 0.6348712446351932, + "grad_norm": 0.404296875, + "learning_rate": 4.877705713612327e-06, + "loss": 2.3092, + "step": 11834 + }, + { + "epoch": 0.6349248927038627, + "grad_norm": 0.443359375, + "learning_rate": 4.877678872757878e-06, + "loss": 2.4515, + "step": 11835 + }, + { + "epoch": 0.6349785407725322, + "grad_norm": 0.447265625, + "learning_rate": 4.877652029032136e-06, + "loss": 2.3205, + "step": 11836 + }, + { + "epoch": 0.6350321888412017, + "grad_norm": 0.443359375, + "learning_rate": 4.8776251824351324e-06, + "loss": 2.0317, + "step": 11837 + }, + { + "epoch": 0.6350858369098712, + "grad_norm": 0.4140625, + "learning_rate": 4.8775983329669e-06, + "loss": 2.2372, + "step": 11838 + }, + { + "epoch": 0.6351394849785408, + "grad_norm": 0.421875, + "learning_rate": 4.877571480627471e-06, + "loss": 2.1135, + "step": 11839 + }, + { + "epoch": 0.6351931330472103, + "grad_norm": 0.49609375, + "learning_rate": 4.877544625416877e-06, + "loss": 2.4208, + "step": 11840 + }, + { + "epoch": 0.6352467811158798, + "grad_norm": 0.44140625, + "learning_rate": 4.877517767335152e-06, + "loss": 2.2109, + "step": 11841 + }, + { + "epoch": 0.6353004291845493, + "grad_norm": 0.4453125, + "learning_rate": 4.877490906382327e-06, + "loss": 2.3259, + "step": 11842 + }, + { + "epoch": 0.6353540772532189, + "grad_norm": 0.451171875, + "learning_rate": 4.877464042558435e-06, + "loss": 2.2421, + "step": 11843 + }, + { + "epoch": 0.6354077253218884, + "grad_norm": 0.6015625, + "learning_rate": 4.877437175863509e-06, + "loss": 2.4803, + "step": 11844 + }, + { + "epoch": 0.635461373390558, + "grad_norm": 0.458984375, + "learning_rate": 4.877410306297582e-06, + "loss": 2.0452, + "step": 11845 + }, + { + "epoch": 0.6355150214592274, + "grad_norm": 0.4453125, + "learning_rate": 4.877383433860684e-06, + "loss": 2.3705, + "step": 11846 + }, + { + "epoch": 0.635568669527897, + "grad_norm": 0.49609375, + "learning_rate": 4.877356558552849e-06, + "loss": 2.466, + "step": 11847 + }, + { + "epoch": 0.6356223175965665, + "grad_norm": 0.451171875, + "learning_rate": 4.87732968037411e-06, + "loss": 1.8462, + "step": 11848 + }, + { + "epoch": 0.6356759656652361, + "grad_norm": 0.4609375, + "learning_rate": 4.877302799324498e-06, + "loss": 2.3517, + "step": 11849 + }, + { + "epoch": 0.6357296137339056, + "grad_norm": 0.486328125, + "learning_rate": 4.8772759154040464e-06, + "loss": 2.358, + "step": 11850 + }, + { + "epoch": 0.6357832618025752, + "grad_norm": 0.40625, + "learning_rate": 4.877249028612787e-06, + "loss": 2.1162, + "step": 11851 + }, + { + "epoch": 0.6358369098712446, + "grad_norm": 0.416015625, + "learning_rate": 4.877222138950753e-06, + "loss": 2.185, + "step": 11852 + }, + { + "epoch": 0.6358905579399141, + "grad_norm": 0.44140625, + "learning_rate": 4.877195246417978e-06, + "loss": 2.1345, + "step": 11853 + }, + { + "epoch": 0.6359442060085837, + "grad_norm": 0.482421875, + "learning_rate": 4.877168351014492e-06, + "loss": 2.4815, + "step": 11854 + }, + { + "epoch": 0.6359978540772532, + "grad_norm": 0.451171875, + "learning_rate": 4.877141452740328e-06, + "loss": 2.3892, + "step": 11855 + }, + { + "epoch": 0.6360515021459228, + "grad_norm": 0.47265625, + "learning_rate": 4.87711455159552e-06, + "loss": 2.3608, + "step": 11856 + }, + { + "epoch": 0.6361051502145922, + "grad_norm": 0.4765625, + "learning_rate": 4.8770876475801e-06, + "loss": 2.1978, + "step": 11857 + }, + { + "epoch": 0.6361587982832618, + "grad_norm": 0.423828125, + "learning_rate": 4.877060740694099e-06, + "loss": 2.0492, + "step": 11858 + }, + { + "epoch": 0.6362124463519313, + "grad_norm": 0.41796875, + "learning_rate": 4.877033830937551e-06, + "loss": 2.349, + "step": 11859 + }, + { + "epoch": 0.6362660944206009, + "grad_norm": 0.609375, + "learning_rate": 4.8770069183104875e-06, + "loss": 2.3584, + "step": 11860 + }, + { + "epoch": 0.6363197424892704, + "grad_norm": 0.53515625, + "learning_rate": 4.8769800028129426e-06, + "loss": 2.3116, + "step": 11861 + }, + { + "epoch": 0.63637339055794, + "grad_norm": 0.37890625, + "learning_rate": 4.876953084444946e-06, + "loss": 2.0673, + "step": 11862 + }, + { + "epoch": 0.6364270386266094, + "grad_norm": 0.45703125, + "learning_rate": 4.876926163206534e-06, + "loss": 2.3335, + "step": 11863 + }, + { + "epoch": 0.636480686695279, + "grad_norm": 0.96875, + "learning_rate": 4.876899239097735e-06, + "loss": 1.5406, + "step": 11864 + }, + { + "epoch": 0.6365343347639485, + "grad_norm": 0.44921875, + "learning_rate": 4.876872312118586e-06, + "loss": 2.1918, + "step": 11865 + }, + { + "epoch": 0.636587982832618, + "grad_norm": 0.40234375, + "learning_rate": 4.876845382269115e-06, + "loss": 2.0408, + "step": 11866 + }, + { + "epoch": 0.6366416309012876, + "grad_norm": 0.48828125, + "learning_rate": 4.8768184495493576e-06, + "loss": 2.4202, + "step": 11867 + }, + { + "epoch": 0.636695278969957, + "grad_norm": 0.447265625, + "learning_rate": 4.876791513959345e-06, + "loss": 2.261, + "step": 11868 + }, + { + "epoch": 0.6367489270386266, + "grad_norm": 0.466796875, + "learning_rate": 4.87676457549911e-06, + "loss": 2.3267, + "step": 11869 + }, + { + "epoch": 0.6368025751072961, + "grad_norm": 0.4296875, + "learning_rate": 4.876737634168686e-06, + "loss": 2.2486, + "step": 11870 + }, + { + "epoch": 0.6368562231759657, + "grad_norm": 0.48828125, + "learning_rate": 4.876710689968104e-06, + "loss": 2.2177, + "step": 11871 + }, + { + "epoch": 0.6369098712446352, + "grad_norm": 0.498046875, + "learning_rate": 4.876683742897397e-06, + "loss": 2.3522, + "step": 11872 + }, + { + "epoch": 0.6369635193133047, + "grad_norm": 0.421875, + "learning_rate": 4.8766567929565986e-06, + "loss": 2.3133, + "step": 11873 + }, + { + "epoch": 0.6370171673819742, + "grad_norm": 0.45703125, + "learning_rate": 4.87662984014574e-06, + "loss": 2.1171, + "step": 11874 + }, + { + "epoch": 0.6370708154506438, + "grad_norm": 0.42578125, + "learning_rate": 4.876602884464854e-06, + "loss": 2.2438, + "step": 11875 + }, + { + "epoch": 0.6371244635193133, + "grad_norm": 0.41015625, + "learning_rate": 4.876575925913975e-06, + "loss": 2.1847, + "step": 11876 + }, + { + "epoch": 0.6371781115879829, + "grad_norm": 0.447265625, + "learning_rate": 4.8765489644931315e-06, + "loss": 2.3364, + "step": 11877 + }, + { + "epoch": 0.6372317596566524, + "grad_norm": 0.443359375, + "learning_rate": 4.87652200020236e-06, + "loss": 1.1997, + "step": 11878 + }, + { + "epoch": 0.6372854077253219, + "grad_norm": 0.486328125, + "learning_rate": 4.876495033041692e-06, + "loss": 2.2854, + "step": 11879 + }, + { + "epoch": 0.6373390557939914, + "grad_norm": 0.40234375, + "learning_rate": 4.876468063011159e-06, + "loss": 2.2361, + "step": 11880 + }, + { + "epoch": 0.6373927038626609, + "grad_norm": 0.4296875, + "learning_rate": 4.876441090110795e-06, + "loss": 2.4235, + "step": 11881 + }, + { + "epoch": 0.6374463519313305, + "grad_norm": 0.419921875, + "learning_rate": 4.876414114340631e-06, + "loss": 2.2496, + "step": 11882 + }, + { + "epoch": 0.6375, + "grad_norm": 0.58984375, + "learning_rate": 4.876387135700701e-06, + "loss": 2.519, + "step": 11883 + }, + { + "epoch": 0.6375536480686695, + "grad_norm": 0.44140625, + "learning_rate": 4.876360154191037e-06, + "loss": 2.1286, + "step": 11884 + }, + { + "epoch": 0.637607296137339, + "grad_norm": 0.3828125, + "learning_rate": 4.876333169811671e-06, + "loss": 2.24, + "step": 11885 + }, + { + "epoch": 0.6376609442060086, + "grad_norm": 0.4375, + "learning_rate": 4.8763061825626365e-06, + "loss": 2.2204, + "step": 11886 + }, + { + "epoch": 0.6377145922746781, + "grad_norm": 0.43359375, + "learning_rate": 4.876279192443966e-06, + "loss": 2.4267, + "step": 11887 + }, + { + "epoch": 0.6377682403433477, + "grad_norm": 0.33203125, + "learning_rate": 4.876252199455691e-06, + "loss": 1.9296, + "step": 11888 + }, + { + "epoch": 0.6378218884120171, + "grad_norm": 0.74609375, + "learning_rate": 4.876225203597845e-06, + "loss": 2.251, + "step": 11889 + }, + { + "epoch": 0.6378755364806867, + "grad_norm": 0.49609375, + "learning_rate": 4.876198204870462e-06, + "loss": 1.9477, + "step": 11890 + }, + { + "epoch": 0.6379291845493562, + "grad_norm": 0.4140625, + "learning_rate": 4.876171203273572e-06, + "loss": 2.3304, + "step": 11891 + }, + { + "epoch": 0.6379828326180258, + "grad_norm": 0.453125, + "learning_rate": 4.876144198807209e-06, + "loss": 2.2492, + "step": 11892 + }, + { + "epoch": 0.6380364806866953, + "grad_norm": 0.3984375, + "learning_rate": 4.876117191471405e-06, + "loss": 2.4401, + "step": 11893 + }, + { + "epoch": 0.6380901287553649, + "grad_norm": 0.451171875, + "learning_rate": 4.876090181266193e-06, + "loss": 2.3311, + "step": 11894 + }, + { + "epoch": 0.6381437768240343, + "grad_norm": 0.890625, + "learning_rate": 4.876063168191606e-06, + "loss": 2.2862, + "step": 11895 + }, + { + "epoch": 0.6381974248927038, + "grad_norm": 0.484375, + "learning_rate": 4.876036152247676e-06, + "loss": 2.19, + "step": 11896 + }, + { + "epoch": 0.6382510729613734, + "grad_norm": 0.423828125, + "learning_rate": 4.876009133434436e-06, + "loss": 2.3617, + "step": 11897 + }, + { + "epoch": 0.6383047210300429, + "grad_norm": 0.44140625, + "learning_rate": 4.875982111751918e-06, + "loss": 2.2852, + "step": 11898 + }, + { + "epoch": 0.6383583690987125, + "grad_norm": 0.41796875, + "learning_rate": 4.875955087200156e-06, + "loss": 2.3747, + "step": 11899 + }, + { + "epoch": 0.6384120171673819, + "grad_norm": 0.484375, + "learning_rate": 4.875928059779181e-06, + "loss": 2.2046, + "step": 11900 + }, + { + "epoch": 0.6384656652360515, + "grad_norm": 0.482421875, + "learning_rate": 4.875901029489027e-06, + "loss": 2.4334, + "step": 11901 + }, + { + "epoch": 0.638519313304721, + "grad_norm": 0.734375, + "learning_rate": 4.875873996329726e-06, + "loss": 2.4366, + "step": 11902 + }, + { + "epoch": 0.6385729613733906, + "grad_norm": 0.42578125, + "learning_rate": 4.875846960301309e-06, + "loss": 2.3238, + "step": 11903 + }, + { + "epoch": 0.6386266094420601, + "grad_norm": 0.373046875, + "learning_rate": 4.8758199214038125e-06, + "loss": 2.2032, + "step": 11904 + }, + { + "epoch": 0.6386802575107297, + "grad_norm": 0.45703125, + "learning_rate": 4.875792879637265e-06, + "loss": 2.4571, + "step": 11905 + }, + { + "epoch": 0.6387339055793991, + "grad_norm": 0.4609375, + "learning_rate": 4.875765835001703e-06, + "loss": 2.3898, + "step": 11906 + }, + { + "epoch": 0.6387875536480687, + "grad_norm": 0.45703125, + "learning_rate": 4.875738787497157e-06, + "loss": 2.2991, + "step": 11907 + }, + { + "epoch": 0.6388412017167382, + "grad_norm": 0.458984375, + "learning_rate": 4.875711737123659e-06, + "loss": 2.1931, + "step": 11908 + }, + { + "epoch": 0.6388948497854077, + "grad_norm": 0.4765625, + "learning_rate": 4.875684683881242e-06, + "loss": 2.4827, + "step": 11909 + }, + { + "epoch": 0.6389484978540773, + "grad_norm": 0.52734375, + "learning_rate": 4.8756576277699406e-06, + "loss": 2.0544, + "step": 11910 + }, + { + "epoch": 0.6390021459227467, + "grad_norm": 0.400390625, + "learning_rate": 4.875630568789786e-06, + "loss": 2.3204, + "step": 11911 + }, + { + "epoch": 0.6390557939914163, + "grad_norm": 0.40625, + "learning_rate": 4.875603506940811e-06, + "loss": 2.3467, + "step": 11912 + }, + { + "epoch": 0.6391094420600858, + "grad_norm": 0.462890625, + "learning_rate": 4.875576442223048e-06, + "loss": 2.3393, + "step": 11913 + }, + { + "epoch": 0.6391630901287554, + "grad_norm": 0.46484375, + "learning_rate": 4.87554937463653e-06, + "loss": 2.6052, + "step": 11914 + }, + { + "epoch": 0.6392167381974249, + "grad_norm": 0.3984375, + "learning_rate": 4.875522304181289e-06, + "loss": 2.3061, + "step": 11915 + }, + { + "epoch": 0.6392703862660944, + "grad_norm": 0.466796875, + "learning_rate": 4.87549523085736e-06, + "loss": 2.2742, + "step": 11916 + }, + { + "epoch": 0.6393240343347639, + "grad_norm": 0.478515625, + "learning_rate": 4.875468154664773e-06, + "loss": 2.1077, + "step": 11917 + }, + { + "epoch": 0.6393776824034335, + "grad_norm": 0.6640625, + "learning_rate": 4.875441075603562e-06, + "loss": 2.1671, + "step": 11918 + }, + { + "epoch": 0.639431330472103, + "grad_norm": 0.419921875, + "learning_rate": 4.875413993673759e-06, + "loss": 2.2107, + "step": 11919 + }, + { + "epoch": 0.6394849785407726, + "grad_norm": 0.478515625, + "learning_rate": 4.875386908875398e-06, + "loss": 2.4184, + "step": 11920 + }, + { + "epoch": 0.639538626609442, + "grad_norm": 0.8359375, + "learning_rate": 4.8753598212085105e-06, + "loss": 2.235, + "step": 11921 + }, + { + "epoch": 0.6395922746781116, + "grad_norm": 0.43359375, + "learning_rate": 4.875332730673129e-06, + "loss": 2.3377, + "step": 11922 + }, + { + "epoch": 0.6396459227467811, + "grad_norm": 0.376953125, + "learning_rate": 4.875305637269288e-06, + "loss": 2.1455, + "step": 11923 + }, + { + "epoch": 0.6396995708154506, + "grad_norm": 0.447265625, + "learning_rate": 4.875278540997017e-06, + "loss": 2.2567, + "step": 11924 + }, + { + "epoch": 0.6397532188841202, + "grad_norm": 0.498046875, + "learning_rate": 4.875251441856352e-06, + "loss": 2.3012, + "step": 11925 + }, + { + "epoch": 0.6398068669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.875224339847325e-06, + "loss": 2.1507, + "step": 11926 + }, + { + "epoch": 0.6398605150214592, + "grad_norm": 0.36328125, + "learning_rate": 4.875197234969968e-06, + "loss": 2.2614, + "step": 11927 + }, + { + "epoch": 0.6399141630901287, + "grad_norm": 0.6015625, + "learning_rate": 4.875170127224314e-06, + "loss": 2.2028, + "step": 11928 + }, + { + "epoch": 0.6399678111587983, + "grad_norm": 0.43359375, + "learning_rate": 4.875143016610394e-06, + "loss": 2.161, + "step": 11929 + }, + { + "epoch": 0.6400214592274678, + "grad_norm": 0.546875, + "learning_rate": 4.875115903128244e-06, + "loss": 2.4515, + "step": 11930 + }, + { + "epoch": 0.6400751072961374, + "grad_norm": 0.48046875, + "learning_rate": 4.8750887867778955e-06, + "loss": 2.5667, + "step": 11931 + }, + { + "epoch": 0.6401287553648068, + "grad_norm": 0.419921875, + "learning_rate": 4.87506166755938e-06, + "loss": 2.3205, + "step": 11932 + }, + { + "epoch": 0.6401824034334764, + "grad_norm": 0.466796875, + "learning_rate": 4.875034545472731e-06, + "loss": 2.3668, + "step": 11933 + }, + { + "epoch": 0.6402360515021459, + "grad_norm": 0.41015625, + "learning_rate": 4.875007420517981e-06, + "loss": 1.8906, + "step": 11934 + }, + { + "epoch": 0.6402896995708155, + "grad_norm": 0.408203125, + "learning_rate": 4.874980292695164e-06, + "loss": 2.1502, + "step": 11935 + }, + { + "epoch": 0.640343347639485, + "grad_norm": 0.421875, + "learning_rate": 4.874953162004312e-06, + "loss": 2.3188, + "step": 11936 + }, + { + "epoch": 0.6403969957081546, + "grad_norm": 0.404296875, + "learning_rate": 4.874926028445458e-06, + "loss": 2.2475, + "step": 11937 + }, + { + "epoch": 0.640450643776824, + "grad_norm": 0.4921875, + "learning_rate": 4.874898892018633e-06, + "loss": 2.2609, + "step": 11938 + }, + { + "epoch": 0.6405042918454935, + "grad_norm": 0.470703125, + "learning_rate": 4.874871752723872e-06, + "loss": 2.3788, + "step": 11939 + }, + { + "epoch": 0.6405579399141631, + "grad_norm": 0.39453125, + "learning_rate": 4.874844610561207e-06, + "loss": 2.0149, + "step": 11940 + }, + { + "epoch": 0.6406115879828326, + "grad_norm": 0.431640625, + "learning_rate": 4.874817465530671e-06, + "loss": 2.1003, + "step": 11941 + }, + { + "epoch": 0.6406652360515022, + "grad_norm": 0.76171875, + "learning_rate": 4.874790317632296e-06, + "loss": 1.5294, + "step": 11942 + }, + { + "epoch": 0.6407188841201716, + "grad_norm": 0.5546875, + "learning_rate": 4.874763166866116e-06, + "loss": 2.5656, + "step": 11943 + }, + { + "epoch": 0.6407725321888412, + "grad_norm": 0.4140625, + "learning_rate": 4.874736013232162e-06, + "loss": 2.3825, + "step": 11944 + }, + { + "epoch": 0.6408261802575107, + "grad_norm": 0.408203125, + "learning_rate": 4.874708856730469e-06, + "loss": 2.2411, + "step": 11945 + }, + { + "epoch": 0.6408798283261803, + "grad_norm": 0.41015625, + "learning_rate": 4.874681697361067e-06, + "loss": 2.2783, + "step": 11946 + }, + { + "epoch": 0.6409334763948498, + "grad_norm": 0.44140625, + "learning_rate": 4.874654535123993e-06, + "loss": 2.4234, + "step": 11947 + }, + { + "epoch": 0.6409871244635194, + "grad_norm": 0.390625, + "learning_rate": 4.874627370019276e-06, + "loss": 2.4619, + "step": 11948 + }, + { + "epoch": 0.6410407725321888, + "grad_norm": 0.431640625, + "learning_rate": 4.87460020204695e-06, + "loss": 2.3207, + "step": 11949 + }, + { + "epoch": 0.6410944206008584, + "grad_norm": 0.421875, + "learning_rate": 4.874573031207048e-06, + "loss": 2.3373, + "step": 11950 + }, + { + "epoch": 0.6411480686695279, + "grad_norm": 0.80078125, + "learning_rate": 4.8745458574996015e-06, + "loss": 2.1705, + "step": 11951 + }, + { + "epoch": 0.6412017167381975, + "grad_norm": 0.453125, + "learning_rate": 4.874518680924646e-06, + "loss": 2.2546, + "step": 11952 + }, + { + "epoch": 0.641255364806867, + "grad_norm": 0.6328125, + "learning_rate": 4.874491501482213e-06, + "loss": 1.9634, + "step": 11953 + }, + { + "epoch": 0.6413090128755364, + "grad_norm": 0.341796875, + "learning_rate": 4.874464319172335e-06, + "loss": 2.182, + "step": 11954 + }, + { + "epoch": 0.641362660944206, + "grad_norm": 0.546875, + "learning_rate": 4.8744371339950445e-06, + "loss": 2.4232, + "step": 11955 + }, + { + "epoch": 0.6414163090128755, + "grad_norm": 0.494140625, + "learning_rate": 4.874409945950375e-06, + "loss": 2.2292, + "step": 11956 + }, + { + "epoch": 0.6414699570815451, + "grad_norm": 0.419921875, + "learning_rate": 4.874382755038359e-06, + "loss": 2.2799, + "step": 11957 + }, + { + "epoch": 0.6415236051502146, + "grad_norm": 0.439453125, + "learning_rate": 4.87435556125903e-06, + "loss": 2.3516, + "step": 11958 + }, + { + "epoch": 0.6415772532188841, + "grad_norm": 0.390625, + "learning_rate": 4.874328364612419e-06, + "loss": 2.2057, + "step": 11959 + }, + { + "epoch": 0.6416309012875536, + "grad_norm": 0.46484375, + "learning_rate": 4.8743011650985616e-06, + "loss": 2.5369, + "step": 11960 + }, + { + "epoch": 0.6416845493562232, + "grad_norm": 0.4609375, + "learning_rate": 4.874273962717489e-06, + "loss": 2.3696, + "step": 11961 + }, + { + "epoch": 0.6417381974248927, + "grad_norm": 0.390625, + "learning_rate": 4.874246757469234e-06, + "loss": 2.3275, + "step": 11962 + }, + { + "epoch": 0.6417918454935623, + "grad_norm": 0.53515625, + "learning_rate": 4.87421954935383e-06, + "loss": 1.5317, + "step": 11963 + }, + { + "epoch": 0.6418454935622318, + "grad_norm": 0.4453125, + "learning_rate": 4.874192338371309e-06, + "loss": 2.3663, + "step": 11964 + }, + { + "epoch": 0.6418991416309013, + "grad_norm": 0.5625, + "learning_rate": 4.8741651245217045e-06, + "loss": 2.2747, + "step": 11965 + }, + { + "epoch": 0.6419527896995708, + "grad_norm": 0.4375, + "learning_rate": 4.87413790780505e-06, + "loss": 2.3496, + "step": 11966 + }, + { + "epoch": 0.6420064377682403, + "grad_norm": 0.494140625, + "learning_rate": 4.8741106882213774e-06, + "loss": 2.2415, + "step": 11967 + }, + { + "epoch": 0.6420600858369099, + "grad_norm": 0.4375, + "learning_rate": 4.87408346577072e-06, + "loss": 2.2715, + "step": 11968 + }, + { + "epoch": 0.6421137339055794, + "grad_norm": 0.578125, + "learning_rate": 4.874056240453109e-06, + "loss": 2.248, + "step": 11969 + }, + { + "epoch": 0.6421673819742489, + "grad_norm": 0.5234375, + "learning_rate": 4.874029012268581e-06, + "loss": 1.8675, + "step": 11970 + }, + { + "epoch": 0.6422210300429184, + "grad_norm": 0.447265625, + "learning_rate": 4.874001781217165e-06, + "loss": 2.4116, + "step": 11971 + }, + { + "epoch": 0.642274678111588, + "grad_norm": 0.90625, + "learning_rate": 4.873974547298896e-06, + "loss": 2.2794, + "step": 11972 + }, + { + "epoch": 0.6423283261802575, + "grad_norm": 0.65625, + "learning_rate": 4.873947310513807e-06, + "loss": 2.5128, + "step": 11973 + }, + { + "epoch": 0.6423819742489271, + "grad_norm": 0.453125, + "learning_rate": 4.873920070861929e-06, + "loss": 2.4211, + "step": 11974 + }, + { + "epoch": 0.6424356223175965, + "grad_norm": 0.47265625, + "learning_rate": 4.873892828343298e-06, + "loss": 2.4086, + "step": 11975 + }, + { + "epoch": 0.6424892703862661, + "grad_norm": 0.462890625, + "learning_rate": 4.873865582957944e-06, + "loss": 2.0983, + "step": 11976 + }, + { + "epoch": 0.6425429184549356, + "grad_norm": 0.4375, + "learning_rate": 4.873838334705901e-06, + "loss": 2.3056, + "step": 11977 + }, + { + "epoch": 0.6425965665236052, + "grad_norm": 0.49609375, + "learning_rate": 4.8738110835872025e-06, + "loss": 2.2619, + "step": 11978 + }, + { + "epoch": 0.6426502145922747, + "grad_norm": 0.56640625, + "learning_rate": 4.873783829601881e-06, + "loss": 2.3306, + "step": 11979 + }, + { + "epoch": 0.6427038626609443, + "grad_norm": 0.46875, + "learning_rate": 4.873756572749968e-06, + "loss": 2.2093, + "step": 11980 + }, + { + "epoch": 0.6427575107296137, + "grad_norm": 0.431640625, + "learning_rate": 4.873729313031499e-06, + "loss": 2.3175, + "step": 11981 + }, + { + "epoch": 0.6428111587982832, + "grad_norm": 0.4296875, + "learning_rate": 4.873702050446505e-06, + "loss": 2.2699, + "step": 11982 + }, + { + "epoch": 0.6428648068669528, + "grad_norm": 0.51953125, + "learning_rate": 4.87367478499502e-06, + "loss": 2.399, + "step": 11983 + }, + { + "epoch": 0.6429184549356223, + "grad_norm": 0.486328125, + "learning_rate": 4.873647516677076e-06, + "loss": 2.2355, + "step": 11984 + }, + { + "epoch": 0.6429721030042919, + "grad_norm": 0.5234375, + "learning_rate": 4.873620245492706e-06, + "loss": 1.5135, + "step": 11985 + }, + { + "epoch": 0.6430257510729613, + "grad_norm": 0.7109375, + "learning_rate": 4.873592971441944e-06, + "loss": 2.2612, + "step": 11986 + }, + { + "epoch": 0.6430793991416309, + "grad_norm": 0.50390625, + "learning_rate": 4.873565694524822e-06, + "loss": 2.3748, + "step": 11987 + }, + { + "epoch": 0.6431330472103004, + "grad_norm": 0.4765625, + "learning_rate": 4.873538414741373e-06, + "loss": 2.3767, + "step": 11988 + }, + { + "epoch": 0.64318669527897, + "grad_norm": 0.486328125, + "learning_rate": 4.8735111320916305e-06, + "loss": 2.5317, + "step": 11989 + }, + { + "epoch": 0.6432403433476395, + "grad_norm": 0.365234375, + "learning_rate": 4.873483846575626e-06, + "loss": 2.1346, + "step": 11990 + }, + { + "epoch": 0.643293991416309, + "grad_norm": 0.390625, + "learning_rate": 4.873456558193395e-06, + "loss": 2.1809, + "step": 11991 + }, + { + "epoch": 0.6433476394849785, + "grad_norm": 0.470703125, + "learning_rate": 4.873429266944968e-06, + "loss": 2.3697, + "step": 11992 + }, + { + "epoch": 0.6434012875536481, + "grad_norm": 0.453125, + "learning_rate": 4.873401972830379e-06, + "loss": 2.1523, + "step": 11993 + }, + { + "epoch": 0.6434549356223176, + "grad_norm": 0.48046875, + "learning_rate": 4.873374675849662e-06, + "loss": 2.1894, + "step": 11994 + }, + { + "epoch": 0.6435085836909872, + "grad_norm": 0.4140625, + "learning_rate": 4.873347376002847e-06, + "loss": 2.3086, + "step": 11995 + }, + { + "epoch": 0.6435622317596567, + "grad_norm": 0.4375, + "learning_rate": 4.87332007328997e-06, + "loss": 1.8426, + "step": 11996 + }, + { + "epoch": 0.6436158798283261, + "grad_norm": 0.462890625, + "learning_rate": 4.873292767711062e-06, + "loss": 2.3313, + "step": 11997 + }, + { + "epoch": 0.6436695278969957, + "grad_norm": 0.458984375, + "learning_rate": 4.873265459266158e-06, + "loss": 2.2297, + "step": 11998 + }, + { + "epoch": 0.6437231759656652, + "grad_norm": 0.45703125, + "learning_rate": 4.873238147955289e-06, + "loss": 1.6657, + "step": 11999 + }, + { + "epoch": 0.6437768240343348, + "grad_norm": 0.46875, + "learning_rate": 4.8732108337784885e-06, + "loss": 2.0471, + "step": 12000 + }, + { + "epoch": 0.6438304721030043, + "grad_norm": 0.443359375, + "learning_rate": 4.87318351673579e-06, + "loss": 2.4456, + "step": 12001 + }, + { + "epoch": 0.6438841201716738, + "grad_norm": 0.4296875, + "learning_rate": 4.873156196827226e-06, + "loss": 2.2482, + "step": 12002 + }, + { + "epoch": 0.6439377682403433, + "grad_norm": 0.443359375, + "learning_rate": 4.87312887405283e-06, + "loss": 2.1915, + "step": 12003 + }, + { + "epoch": 0.6439914163090129, + "grad_norm": 0.6015625, + "learning_rate": 4.873101548412634e-06, + "loss": 2.2766, + "step": 12004 + }, + { + "epoch": 0.6440450643776824, + "grad_norm": 0.46875, + "learning_rate": 4.873074219906673e-06, + "loss": 2.3394, + "step": 12005 + }, + { + "epoch": 0.644098712446352, + "grad_norm": 0.40234375, + "learning_rate": 4.873046888534977e-06, + "loss": 2.3193, + "step": 12006 + }, + { + "epoch": 0.6441523605150214, + "grad_norm": 0.53515625, + "learning_rate": 4.8730195542975815e-06, + "loss": 2.0842, + "step": 12007 + }, + { + "epoch": 0.644206008583691, + "grad_norm": 0.45703125, + "learning_rate": 4.872992217194518e-06, + "loss": 2.5838, + "step": 12008 + }, + { + "epoch": 0.6442596566523605, + "grad_norm": 0.5390625, + "learning_rate": 4.872964877225822e-06, + "loss": 2.354, + "step": 12009 + }, + { + "epoch": 0.64431330472103, + "grad_norm": 0.45703125, + "learning_rate": 4.872937534391523e-06, + "loss": 2.0685, + "step": 12010 + }, + { + "epoch": 0.6443669527896996, + "grad_norm": 0.4609375, + "learning_rate": 4.872910188691657e-06, + "loss": 2.0636, + "step": 12011 + }, + { + "epoch": 0.644420600858369, + "grad_norm": 0.421875, + "learning_rate": 4.872882840126255e-06, + "loss": 2.0566, + "step": 12012 + }, + { + "epoch": 0.6444742489270386, + "grad_norm": 0.462890625, + "learning_rate": 4.87285548869535e-06, + "loss": 2.3334, + "step": 12013 + }, + { + "epoch": 0.6445278969957081, + "grad_norm": 0.427734375, + "learning_rate": 4.872828134398978e-06, + "loss": 2.231, + "step": 12014 + }, + { + "epoch": 0.6445815450643777, + "grad_norm": 0.3828125, + "learning_rate": 4.872800777237168e-06, + "loss": 2.1531, + "step": 12015 + }, + { + "epoch": 0.6446351931330472, + "grad_norm": 0.365234375, + "learning_rate": 4.872773417209956e-06, + "loss": 1.8977, + "step": 12016 + }, + { + "epoch": 0.6446888412017168, + "grad_norm": 1.15625, + "learning_rate": 4.8727460543173734e-06, + "loss": 2.1965, + "step": 12017 + }, + { + "epoch": 0.6447424892703862, + "grad_norm": 1.4453125, + "learning_rate": 4.872718688559453e-06, + "loss": 2.3075, + "step": 12018 + }, + { + "epoch": 0.6447961373390558, + "grad_norm": 0.490234375, + "learning_rate": 4.87269131993623e-06, + "loss": 2.28, + "step": 12019 + }, + { + "epoch": 0.6448497854077253, + "grad_norm": 0.52734375, + "learning_rate": 4.872663948447736e-06, + "loss": 2.41, + "step": 12020 + }, + { + "epoch": 0.6449034334763949, + "grad_norm": 0.458984375, + "learning_rate": 4.872636574094003e-06, + "loss": 2.1708, + "step": 12021 + }, + { + "epoch": 0.6449570815450644, + "grad_norm": 0.4375, + "learning_rate": 4.872609196875065e-06, + "loss": 2.2001, + "step": 12022 + }, + { + "epoch": 0.645010729613734, + "grad_norm": 0.484375, + "learning_rate": 4.872581816790957e-06, + "loss": 2.5217, + "step": 12023 + }, + { + "epoch": 0.6450643776824034, + "grad_norm": 0.47265625, + "learning_rate": 4.872554433841709e-06, + "loss": 2.3895, + "step": 12024 + }, + { + "epoch": 0.6451180257510729, + "grad_norm": 0.56640625, + "learning_rate": 4.872527048027356e-06, + "loss": 2.114, + "step": 12025 + }, + { + "epoch": 0.6451716738197425, + "grad_norm": 0.416015625, + "learning_rate": 4.87249965934793e-06, + "loss": 2.3425, + "step": 12026 + }, + { + "epoch": 0.645225321888412, + "grad_norm": 0.482421875, + "learning_rate": 4.872472267803465e-06, + "loss": 1.5596, + "step": 12027 + }, + { + "epoch": 0.6452789699570816, + "grad_norm": 0.52734375, + "learning_rate": 4.872444873393993e-06, + "loss": 2.3225, + "step": 12028 + }, + { + "epoch": 0.645332618025751, + "grad_norm": 0.435546875, + "learning_rate": 4.872417476119548e-06, + "loss": 2.1502, + "step": 12029 + }, + { + "epoch": 0.6453862660944206, + "grad_norm": 8.8125, + "learning_rate": 4.872390075980163e-06, + "loss": 2.3449, + "step": 12030 + }, + { + "epoch": 0.6454399141630901, + "grad_norm": 0.435546875, + "learning_rate": 4.87236267297587e-06, + "loss": 2.3209, + "step": 12031 + }, + { + "epoch": 0.6454935622317597, + "grad_norm": 0.455078125, + "learning_rate": 4.872335267106703e-06, + "loss": 2.3147, + "step": 12032 + }, + { + "epoch": 0.6455472103004292, + "grad_norm": 0.443359375, + "learning_rate": 4.8723078583726965e-06, + "loss": 2.2644, + "step": 12033 + }, + { + "epoch": 0.6456008583690988, + "grad_norm": 0.4453125, + "learning_rate": 4.872280446773881e-06, + "loss": 2.4722, + "step": 12034 + }, + { + "epoch": 0.6456545064377682, + "grad_norm": 0.64453125, + "learning_rate": 4.872253032310291e-06, + "loss": 2.3444, + "step": 12035 + }, + { + "epoch": 0.6457081545064378, + "grad_norm": 0.427734375, + "learning_rate": 4.872225614981959e-06, + "loss": 1.6992, + "step": 12036 + }, + { + "epoch": 0.6457618025751073, + "grad_norm": 0.419921875, + "learning_rate": 4.8721981947889194e-06, + "loss": 2.3223, + "step": 12037 + }, + { + "epoch": 0.6458154506437769, + "grad_norm": 0.486328125, + "learning_rate": 4.872170771731203e-06, + "loss": 2.0948, + "step": 12038 + }, + { + "epoch": 0.6458690987124464, + "grad_norm": 0.5546875, + "learning_rate": 4.872143345808845e-06, + "loss": 2.309, + "step": 12039 + }, + { + "epoch": 0.6459227467811158, + "grad_norm": 0.5078125, + "learning_rate": 4.8721159170218775e-06, + "loss": 2.4359, + "step": 12040 + }, + { + "epoch": 0.6459763948497854, + "grad_norm": 0.49609375, + "learning_rate": 4.872088485370335e-06, + "loss": 2.435, + "step": 12041 + }, + { + "epoch": 0.6460300429184549, + "grad_norm": 0.447265625, + "learning_rate": 4.872061050854249e-06, + "loss": 2.1348, + "step": 12042 + }, + { + "epoch": 0.6460836909871245, + "grad_norm": 0.4296875, + "learning_rate": 4.872033613473652e-06, + "loss": 2.2337, + "step": 12043 + }, + { + "epoch": 0.646137339055794, + "grad_norm": 0.4609375, + "learning_rate": 4.87200617322858e-06, + "loss": 2.4445, + "step": 12044 + }, + { + "epoch": 0.6461909871244635, + "grad_norm": 0.435546875, + "learning_rate": 4.871978730119064e-06, + "loss": 2.4353, + "step": 12045 + }, + { + "epoch": 0.646244635193133, + "grad_norm": 0.47265625, + "learning_rate": 4.871951284145137e-06, + "loss": 2.2571, + "step": 12046 + }, + { + "epoch": 0.6462982832618026, + "grad_norm": 0.373046875, + "learning_rate": 4.8719238353068335e-06, + "loss": 2.1849, + "step": 12047 + }, + { + "epoch": 0.6463519313304721, + "grad_norm": 0.47265625, + "learning_rate": 4.871896383604185e-06, + "loss": 2.345, + "step": 12048 + }, + { + "epoch": 0.6464055793991417, + "grad_norm": 0.49609375, + "learning_rate": 4.871868929037226e-06, + "loss": 2.3282, + "step": 12049 + }, + { + "epoch": 0.6464592274678111, + "grad_norm": 0.447265625, + "learning_rate": 4.871841471605989e-06, + "loss": 2.3208, + "step": 12050 + }, + { + "epoch": 0.6465128755364807, + "grad_norm": 0.5703125, + "learning_rate": 4.8718140113105085e-06, + "loss": 2.4313, + "step": 12051 + }, + { + "epoch": 0.6465665236051502, + "grad_norm": 0.4375, + "learning_rate": 4.8717865481508155e-06, + "loss": 2.3948, + "step": 12052 + }, + { + "epoch": 0.6466201716738197, + "grad_norm": 0.359375, + "learning_rate": 4.871759082126943e-06, + "loss": 2.1247, + "step": 12053 + }, + { + "epoch": 0.6466738197424893, + "grad_norm": 0.408203125, + "learning_rate": 4.871731613238927e-06, + "loss": 2.3461, + "step": 12054 + }, + { + "epoch": 0.6467274678111588, + "grad_norm": 0.48046875, + "learning_rate": 4.871704141486799e-06, + "loss": 2.003, + "step": 12055 + }, + { + "epoch": 0.6467811158798283, + "grad_norm": 0.48046875, + "learning_rate": 4.871676666870592e-06, + "loss": 2.3144, + "step": 12056 + }, + { + "epoch": 0.6468347639484978, + "grad_norm": 0.89453125, + "learning_rate": 4.871649189390339e-06, + "loss": 2.568, + "step": 12057 + }, + { + "epoch": 0.6468884120171674, + "grad_norm": 0.484375, + "learning_rate": 4.8716217090460735e-06, + "loss": 2.2503, + "step": 12058 + }, + { + "epoch": 0.6469420600858369, + "grad_norm": 0.453125, + "learning_rate": 4.871594225837829e-06, + "loss": 2.4429, + "step": 12059 + }, + { + "epoch": 0.6469957081545065, + "grad_norm": 0.451171875, + "learning_rate": 4.871566739765639e-06, + "loss": 2.2236, + "step": 12060 + }, + { + "epoch": 0.6470493562231759, + "grad_norm": 0.447265625, + "learning_rate": 4.871539250829535e-06, + "loss": 2.3408, + "step": 12061 + }, + { + "epoch": 0.6471030042918455, + "grad_norm": 0.5546875, + "learning_rate": 4.871511759029552e-06, + "loss": 2.2085, + "step": 12062 + }, + { + "epoch": 0.647156652360515, + "grad_norm": 0.59375, + "learning_rate": 4.871484264365722e-06, + "loss": 2.59, + "step": 12063 + }, + { + "epoch": 0.6472103004291846, + "grad_norm": 0.51953125, + "learning_rate": 4.871456766838079e-06, + "loss": 2.4015, + "step": 12064 + }, + { + "epoch": 0.6472639484978541, + "grad_norm": 0.5390625, + "learning_rate": 4.871429266446656e-06, + "loss": 2.3626, + "step": 12065 + }, + { + "epoch": 0.6473175965665237, + "grad_norm": 0.4296875, + "learning_rate": 4.871401763191486e-06, + "loss": 2.2792, + "step": 12066 + }, + { + "epoch": 0.6473712446351931, + "grad_norm": 0.53515625, + "learning_rate": 4.871374257072603e-06, + "loss": 2.2105, + "step": 12067 + }, + { + "epoch": 0.6474248927038626, + "grad_norm": 0.51171875, + "learning_rate": 4.871346748090039e-06, + "loss": 2.7045, + "step": 12068 + }, + { + "epoch": 0.6474785407725322, + "grad_norm": 0.431640625, + "learning_rate": 4.871319236243828e-06, + "loss": 2.0424, + "step": 12069 + }, + { + "epoch": 0.6475321888412017, + "grad_norm": 0.427734375, + "learning_rate": 4.871291721534002e-06, + "loss": 2.1801, + "step": 12070 + }, + { + "epoch": 0.6475858369098713, + "grad_norm": 0.46484375, + "learning_rate": 4.8712642039605965e-06, + "loss": 2.4236, + "step": 12071 + }, + { + "epoch": 0.6476394849785407, + "grad_norm": 0.365234375, + "learning_rate": 4.871236683523643e-06, + "loss": 2.2138, + "step": 12072 + }, + { + "epoch": 0.6476931330472103, + "grad_norm": 0.703125, + "learning_rate": 4.871209160223175e-06, + "loss": 2.0633, + "step": 12073 + }, + { + "epoch": 0.6477467811158798, + "grad_norm": 0.439453125, + "learning_rate": 4.871181634059226e-06, + "loss": 2.443, + "step": 12074 + }, + { + "epoch": 0.6478004291845494, + "grad_norm": 0.4453125, + "learning_rate": 4.871154105031829e-06, + "loss": 2.3606, + "step": 12075 + }, + { + "epoch": 0.6478540772532189, + "grad_norm": 0.515625, + "learning_rate": 4.871126573141018e-06, + "loss": 2.3512, + "step": 12076 + }, + { + "epoch": 0.6479077253218885, + "grad_norm": 0.5625, + "learning_rate": 4.8710990383868255e-06, + "loss": 2.3043, + "step": 12077 + }, + { + "epoch": 0.6479613733905579, + "grad_norm": 0.451171875, + "learning_rate": 4.871071500769284e-06, + "loss": 2.383, + "step": 12078 + }, + { + "epoch": 0.6480150214592275, + "grad_norm": 0.45703125, + "learning_rate": 4.871043960288428e-06, + "loss": 2.4759, + "step": 12079 + }, + { + "epoch": 0.648068669527897, + "grad_norm": 0.478515625, + "learning_rate": 4.871016416944292e-06, + "loss": 2.4158, + "step": 12080 + }, + { + "epoch": 0.6481223175965666, + "grad_norm": 0.47265625, + "learning_rate": 4.870988870736906e-06, + "loss": 2.3924, + "step": 12081 + }, + { + "epoch": 0.648175965665236, + "grad_norm": 0.4375, + "learning_rate": 4.870961321666306e-06, + "loss": 2.475, + "step": 12082 + }, + { + "epoch": 0.6482296137339055, + "grad_norm": 0.68359375, + "learning_rate": 4.870933769732523e-06, + "loss": 2.1704, + "step": 12083 + }, + { + "epoch": 0.6482832618025751, + "grad_norm": 0.41796875, + "learning_rate": 4.870906214935593e-06, + "loss": 2.3809, + "step": 12084 + }, + { + "epoch": 0.6483369098712446, + "grad_norm": 0.44921875, + "learning_rate": 4.870878657275547e-06, + "loss": 2.0431, + "step": 12085 + }, + { + "epoch": 0.6483905579399142, + "grad_norm": 0.5078125, + "learning_rate": 4.870851096752419e-06, + "loss": 2.2374, + "step": 12086 + }, + { + "epoch": 0.6484442060085837, + "grad_norm": 0.37890625, + "learning_rate": 4.870823533366242e-06, + "loss": 2.0748, + "step": 12087 + }, + { + "epoch": 0.6484978540772532, + "grad_norm": 0.6171875, + "learning_rate": 4.87079596711705e-06, + "loss": 2.4054, + "step": 12088 + }, + { + "epoch": 0.6485515021459227, + "grad_norm": 0.439453125, + "learning_rate": 4.8707683980048755e-06, + "loss": 2.2099, + "step": 12089 + }, + { + "epoch": 0.6486051502145923, + "grad_norm": 0.42578125, + "learning_rate": 4.870740826029753e-06, + "loss": 2.1924, + "step": 12090 + }, + { + "epoch": 0.6486587982832618, + "grad_norm": 0.416015625, + "learning_rate": 4.870713251191715e-06, + "loss": 2.2605, + "step": 12091 + }, + { + "epoch": 0.6487124463519314, + "grad_norm": 0.4609375, + "learning_rate": 4.870685673490794e-06, + "loss": 2.1104, + "step": 12092 + }, + { + "epoch": 0.6487660944206008, + "grad_norm": 0.4609375, + "learning_rate": 4.870658092927025e-06, + "loss": 2.2513, + "step": 12093 + }, + { + "epoch": 0.6488197424892704, + "grad_norm": 0.55078125, + "learning_rate": 4.870630509500439e-06, + "loss": 2.3768, + "step": 12094 + }, + { + "epoch": 0.6488733905579399, + "grad_norm": 0.4140625, + "learning_rate": 4.870602923211072e-06, + "loss": 2.3945, + "step": 12095 + }, + { + "epoch": 0.6489270386266094, + "grad_norm": 0.47265625, + "learning_rate": 4.870575334058955e-06, + "loss": 2.5213, + "step": 12096 + }, + { + "epoch": 0.648980686695279, + "grad_norm": 2.546875, + "learning_rate": 4.870547742044124e-06, + "loss": 2.2907, + "step": 12097 + }, + { + "epoch": 0.6490343347639485, + "grad_norm": 0.466796875, + "learning_rate": 4.870520147166609e-06, + "loss": 2.3326, + "step": 12098 + }, + { + "epoch": 0.649087982832618, + "grad_norm": 0.40234375, + "learning_rate": 4.870492549426446e-06, + "loss": 2.0486, + "step": 12099 + }, + { + "epoch": 0.6491416309012875, + "grad_norm": 0.46484375, + "learning_rate": 4.870464948823666e-06, + "loss": 2.207, + "step": 12100 + }, + { + "epoch": 0.6491952789699571, + "grad_norm": 0.486328125, + "learning_rate": 4.870437345358305e-06, + "loss": 2.2027, + "step": 12101 + }, + { + "epoch": 0.6492489270386266, + "grad_norm": 0.51171875, + "learning_rate": 4.870409739030395e-06, + "loss": 2.1949, + "step": 12102 + }, + { + "epoch": 0.6493025751072962, + "grad_norm": 0.37109375, + "learning_rate": 4.8703821298399685e-06, + "loss": 2.2378, + "step": 12103 + }, + { + "epoch": 0.6493562231759656, + "grad_norm": 0.443359375, + "learning_rate": 4.8703545177870605e-06, + "loss": 2.2289, + "step": 12104 + }, + { + "epoch": 0.6494098712446352, + "grad_norm": 0.47265625, + "learning_rate": 4.870326902871703e-06, + "loss": 2.3014, + "step": 12105 + }, + { + "epoch": 0.6494635193133047, + "grad_norm": 0.42578125, + "learning_rate": 4.87029928509393e-06, + "loss": 2.1402, + "step": 12106 + }, + { + "epoch": 0.6495171673819743, + "grad_norm": 0.494140625, + "learning_rate": 4.870271664453774e-06, + "loss": 2.3939, + "step": 12107 + }, + { + "epoch": 0.6495708154506438, + "grad_norm": 0.455078125, + "learning_rate": 4.87024404095127e-06, + "loss": 2.281, + "step": 12108 + }, + { + "epoch": 0.6496244635193134, + "grad_norm": 0.39453125, + "learning_rate": 4.87021641458645e-06, + "loss": 2.36, + "step": 12109 + }, + { + "epoch": 0.6496781115879828, + "grad_norm": 0.431640625, + "learning_rate": 4.870188785359348e-06, + "loss": 2.3363, + "step": 12110 + }, + { + "epoch": 0.6497317596566523, + "grad_norm": 0.396484375, + "learning_rate": 4.870161153269996e-06, + "loss": 2.3969, + "step": 12111 + }, + { + "epoch": 0.6497854077253219, + "grad_norm": 0.5625, + "learning_rate": 4.87013351831843e-06, + "loss": 2.4075, + "step": 12112 + }, + { + "epoch": 0.6498390557939914, + "grad_norm": 0.36328125, + "learning_rate": 4.870105880504681e-06, + "loss": 2.2075, + "step": 12113 + }, + { + "epoch": 0.649892703862661, + "grad_norm": 0.42578125, + "learning_rate": 4.870078239828784e-06, + "loss": 2.3578, + "step": 12114 + }, + { + "epoch": 0.6499463519313304, + "grad_norm": 0.40625, + "learning_rate": 4.870050596290771e-06, + "loss": 2.1551, + "step": 12115 + }, + { + "epoch": 0.65, + "grad_norm": 0.455078125, + "learning_rate": 4.870022949890676e-06, + "loss": 2.4343, + "step": 12116 + }, + { + "epoch": 0.6500536480686695, + "grad_norm": 0.43359375, + "learning_rate": 4.8699953006285334e-06, + "loss": 2.345, + "step": 12117 + }, + { + "epoch": 0.6501072961373391, + "grad_norm": 1.2109375, + "learning_rate": 4.869967648504375e-06, + "loss": 2.2912, + "step": 12118 + }, + { + "epoch": 0.6501609442060086, + "grad_norm": 0.97265625, + "learning_rate": 4.8699399935182345e-06, + "loss": 2.3441, + "step": 12119 + }, + { + "epoch": 0.6502145922746781, + "grad_norm": 1.296875, + "learning_rate": 4.8699123356701455e-06, + "loss": 2.3856, + "step": 12120 + }, + { + "epoch": 0.6502682403433476, + "grad_norm": 0.431640625, + "learning_rate": 4.869884674960141e-06, + "loss": 1.5361, + "step": 12121 + }, + { + "epoch": 0.6503218884120172, + "grad_norm": 0.421875, + "learning_rate": 4.869857011388256e-06, + "loss": 2.3277, + "step": 12122 + }, + { + "epoch": 0.6503755364806867, + "grad_norm": 0.482421875, + "learning_rate": 4.869829344954523e-06, + "loss": 2.3295, + "step": 12123 + }, + { + "epoch": 0.6504291845493563, + "grad_norm": 0.33984375, + "learning_rate": 4.869801675658974e-06, + "loss": 2.0048, + "step": 12124 + }, + { + "epoch": 0.6504828326180258, + "grad_norm": 0.462890625, + "learning_rate": 4.8697740035016444e-06, + "loss": 2.3077, + "step": 12125 + }, + { + "epoch": 0.6505364806866952, + "grad_norm": 0.416015625, + "learning_rate": 4.869746328482566e-06, + "loss": 2.2765, + "step": 12126 + }, + { + "epoch": 0.6505901287553648, + "grad_norm": 0.369140625, + "learning_rate": 4.869718650601774e-06, + "loss": 2.1342, + "step": 12127 + }, + { + "epoch": 0.6506437768240343, + "grad_norm": 0.48828125, + "learning_rate": 4.869690969859301e-06, + "loss": 2.3355, + "step": 12128 + }, + { + "epoch": 0.6506974248927039, + "grad_norm": 0.46875, + "learning_rate": 4.86966328625518e-06, + "loss": 2.4852, + "step": 12129 + }, + { + "epoch": 0.6507510729613734, + "grad_norm": 0.5390625, + "learning_rate": 4.869635599789444e-06, + "loss": 2.1422, + "step": 12130 + }, + { + "epoch": 0.6508047210300429, + "grad_norm": 0.68359375, + "learning_rate": 4.8696079104621285e-06, + "loss": 2.3431, + "step": 12131 + }, + { + "epoch": 0.6508583690987124, + "grad_norm": 0.9296875, + "learning_rate": 4.869580218273265e-06, + "loss": 2.0004, + "step": 12132 + }, + { + "epoch": 0.650912017167382, + "grad_norm": 0.5078125, + "learning_rate": 4.869552523222887e-06, + "loss": 2.3267, + "step": 12133 + }, + { + "epoch": 0.6509656652360515, + "grad_norm": 0.375, + "learning_rate": 4.869524825311029e-06, + "loss": 2.2253, + "step": 12134 + }, + { + "epoch": 0.6510193133047211, + "grad_norm": 0.443359375, + "learning_rate": 4.8694971245377235e-06, + "loss": 2.0741, + "step": 12135 + }, + { + "epoch": 0.6510729613733905, + "grad_norm": 0.54296875, + "learning_rate": 4.869469420903005e-06, + "loss": 2.4077, + "step": 12136 + }, + { + "epoch": 0.6511266094420601, + "grad_norm": 0.451171875, + "learning_rate": 4.869441714406906e-06, + "loss": 2.4227, + "step": 12137 + }, + { + "epoch": 0.6511802575107296, + "grad_norm": 0.4609375, + "learning_rate": 4.86941400504946e-06, + "loss": 2.2918, + "step": 12138 + }, + { + "epoch": 0.6512339055793992, + "grad_norm": 0.423828125, + "learning_rate": 4.8693862928307015e-06, + "loss": 2.3611, + "step": 12139 + }, + { + "epoch": 0.6512875536480687, + "grad_norm": 0.412109375, + "learning_rate": 4.869358577750663e-06, + "loss": 2.2823, + "step": 12140 + }, + { + "epoch": 0.6513412017167381, + "grad_norm": 0.74609375, + "learning_rate": 4.869330859809378e-06, + "loss": 2.3189, + "step": 12141 + }, + { + "epoch": 0.6513948497854077, + "grad_norm": 0.435546875, + "learning_rate": 4.86930313900688e-06, + "loss": 2.295, + "step": 12142 + }, + { + "epoch": 0.6514484978540772, + "grad_norm": 0.388671875, + "learning_rate": 4.869275415343203e-06, + "loss": 2.2468, + "step": 12143 + }, + { + "epoch": 0.6515021459227468, + "grad_norm": 0.52734375, + "learning_rate": 4.8692476888183805e-06, + "loss": 2.3536, + "step": 12144 + }, + { + "epoch": 0.6515557939914163, + "grad_norm": 0.59765625, + "learning_rate": 4.869219959432445e-06, + "loss": 2.0595, + "step": 12145 + }, + { + "epoch": 0.6516094420600859, + "grad_norm": 0.3984375, + "learning_rate": 4.8691922271854305e-06, + "loss": 2.3097, + "step": 12146 + }, + { + "epoch": 0.6516630901287553, + "grad_norm": 0.4765625, + "learning_rate": 4.869164492077371e-06, + "loss": 2.2555, + "step": 12147 + }, + { + "epoch": 0.6517167381974249, + "grad_norm": 0.419921875, + "learning_rate": 4.8691367541083e-06, + "loss": 2.3248, + "step": 12148 + }, + { + "epoch": 0.6517703862660944, + "grad_norm": 0.53125, + "learning_rate": 4.8691090132782496e-06, + "loss": 1.8482, + "step": 12149 + }, + { + "epoch": 0.651824034334764, + "grad_norm": 0.462890625, + "learning_rate": 4.869081269587255e-06, + "loss": 2.2779, + "step": 12150 + }, + { + "epoch": 0.6518776824034335, + "grad_norm": 0.51171875, + "learning_rate": 4.869053523035349e-06, + "loss": 2.2314, + "step": 12151 + }, + { + "epoch": 0.651931330472103, + "grad_norm": 0.60546875, + "learning_rate": 4.869025773622565e-06, + "loss": 2.2397, + "step": 12152 + }, + { + "epoch": 0.6519849785407725, + "grad_norm": 0.486328125, + "learning_rate": 4.868998021348935e-06, + "loss": 2.1552, + "step": 12153 + }, + { + "epoch": 0.652038626609442, + "grad_norm": 1.203125, + "learning_rate": 4.8689702662144965e-06, + "loss": 2.416, + "step": 12154 + }, + { + "epoch": 0.6520922746781116, + "grad_norm": 0.65625, + "learning_rate": 4.868942508219279e-06, + "loss": 2.3617, + "step": 12155 + }, + { + "epoch": 0.6521459227467811, + "grad_norm": 0.51171875, + "learning_rate": 4.868914747363318e-06, + "loss": 2.5216, + "step": 12156 + }, + { + "epoch": 0.6521995708154507, + "grad_norm": 0.47265625, + "learning_rate": 4.868886983646647e-06, + "loss": 2.2412, + "step": 12157 + }, + { + "epoch": 0.6522532188841201, + "grad_norm": 0.36328125, + "learning_rate": 4.868859217069299e-06, + "loss": 2.0532, + "step": 12158 + }, + { + "epoch": 0.6523068669527897, + "grad_norm": 0.51953125, + "learning_rate": 4.868831447631308e-06, + "loss": 2.3039, + "step": 12159 + }, + { + "epoch": 0.6523605150214592, + "grad_norm": 0.43359375, + "learning_rate": 4.8688036753327075e-06, + "loss": 2.1068, + "step": 12160 + }, + { + "epoch": 0.6524141630901288, + "grad_norm": 0.453125, + "learning_rate": 4.86877590017353e-06, + "loss": 2.296, + "step": 12161 + }, + { + "epoch": 0.6524678111587983, + "grad_norm": 0.427734375, + "learning_rate": 4.8687481221538105e-06, + "loss": 1.9132, + "step": 12162 + }, + { + "epoch": 0.6525214592274678, + "grad_norm": 0.9765625, + "learning_rate": 4.868720341273582e-06, + "loss": 2.6714, + "step": 12163 + }, + { + "epoch": 0.6525751072961373, + "grad_norm": 0.453125, + "learning_rate": 4.8686925575328765e-06, + "loss": 2.5256, + "step": 12164 + }, + { + "epoch": 0.6526287553648069, + "grad_norm": 0.357421875, + "learning_rate": 4.868664770931731e-06, + "loss": 2.1516, + "step": 12165 + }, + { + "epoch": 0.6526824034334764, + "grad_norm": 0.4609375, + "learning_rate": 4.868636981470176e-06, + "loss": 2.3297, + "step": 12166 + }, + { + "epoch": 0.652736051502146, + "grad_norm": 0.5546875, + "learning_rate": 4.8686091891482465e-06, + "loss": 2.2057, + "step": 12167 + }, + { + "epoch": 0.6527896995708155, + "grad_norm": 0.57421875, + "learning_rate": 4.868581393965975e-06, + "loss": 2.3079, + "step": 12168 + }, + { + "epoch": 0.6528433476394849, + "grad_norm": 0.46875, + "learning_rate": 4.868553595923396e-06, + "loss": 2.4559, + "step": 12169 + }, + { + "epoch": 0.6528969957081545, + "grad_norm": 0.40625, + "learning_rate": 4.868525795020544e-06, + "loss": 2.15, + "step": 12170 + }, + { + "epoch": 0.652950643776824, + "grad_norm": 0.380859375, + "learning_rate": 4.868497991257449e-06, + "loss": 2.0731, + "step": 12171 + }, + { + "epoch": 0.6530042918454936, + "grad_norm": 0.486328125, + "learning_rate": 4.8684701846341485e-06, + "loss": 2.0798, + "step": 12172 + }, + { + "epoch": 0.653057939914163, + "grad_norm": 0.494140625, + "learning_rate": 4.868442375150675e-06, + "loss": 2.1414, + "step": 12173 + }, + { + "epoch": 0.6531115879828326, + "grad_norm": 0.515625, + "learning_rate": 4.8684145628070606e-06, + "loss": 2.2996, + "step": 12174 + }, + { + "epoch": 0.6531652360515021, + "grad_norm": 0.46875, + "learning_rate": 4.86838674760334e-06, + "loss": 2.1941, + "step": 12175 + }, + { + "epoch": 0.6532188841201717, + "grad_norm": 0.44140625, + "learning_rate": 4.8683589295395475e-06, + "loss": 2.3237, + "step": 12176 + }, + { + "epoch": 0.6532725321888412, + "grad_norm": 0.453125, + "learning_rate": 4.868331108615714e-06, + "loss": 2.5219, + "step": 12177 + }, + { + "epoch": 0.6533261802575108, + "grad_norm": 0.44140625, + "learning_rate": 4.868303284831877e-06, + "loss": 2.3179, + "step": 12178 + }, + { + "epoch": 0.6533798283261802, + "grad_norm": 0.478515625, + "learning_rate": 4.868275458188068e-06, + "loss": 2.1859, + "step": 12179 + }, + { + "epoch": 0.6534334763948498, + "grad_norm": 0.404296875, + "learning_rate": 4.868247628684319e-06, + "loss": 2.3354, + "step": 12180 + }, + { + "epoch": 0.6534871244635193, + "grad_norm": 0.4140625, + "learning_rate": 4.868219796320667e-06, + "loss": 2.2751, + "step": 12181 + }, + { + "epoch": 0.6535407725321889, + "grad_norm": 4.0625, + "learning_rate": 4.868191961097143e-06, + "loss": 2.538, + "step": 12182 + }, + { + "epoch": 0.6535944206008584, + "grad_norm": 0.380859375, + "learning_rate": 4.868164123013782e-06, + "loss": 2.1727, + "step": 12183 + }, + { + "epoch": 0.6536480686695278, + "grad_norm": 0.470703125, + "learning_rate": 4.8681362820706164e-06, + "loss": 2.2947, + "step": 12184 + }, + { + "epoch": 0.6537017167381974, + "grad_norm": 0.431640625, + "learning_rate": 4.868108438267681e-06, + "loss": 2.0522, + "step": 12185 + }, + { + "epoch": 0.6537553648068669, + "grad_norm": 0.515625, + "learning_rate": 4.868080591605009e-06, + "loss": 2.5177, + "step": 12186 + }, + { + "epoch": 0.6538090128755365, + "grad_norm": 0.455078125, + "learning_rate": 4.868052742082634e-06, + "loss": 2.2407, + "step": 12187 + }, + { + "epoch": 0.653862660944206, + "grad_norm": 0.40625, + "learning_rate": 4.868024889700589e-06, + "loss": 2.2818, + "step": 12188 + }, + { + "epoch": 0.6539163090128756, + "grad_norm": 0.6328125, + "learning_rate": 4.86799703445891e-06, + "loss": 2.2152, + "step": 12189 + }, + { + "epoch": 0.653969957081545, + "grad_norm": 0.462890625, + "learning_rate": 4.867969176357627e-06, + "loss": 2.3748, + "step": 12190 + }, + { + "epoch": 0.6540236051502146, + "grad_norm": 0.44921875, + "learning_rate": 4.867941315396776e-06, + "loss": 2.2187, + "step": 12191 + }, + { + "epoch": 0.6540772532188841, + "grad_norm": 0.458984375, + "learning_rate": 4.867913451576391e-06, + "loss": 2.2769, + "step": 12192 + }, + { + "epoch": 0.6541309012875537, + "grad_norm": 0.451171875, + "learning_rate": 4.8678855848965046e-06, + "loss": 1.9258, + "step": 12193 + }, + { + "epoch": 0.6541845493562232, + "grad_norm": 0.52734375, + "learning_rate": 4.86785771535715e-06, + "loss": 2.4738, + "step": 12194 + }, + { + "epoch": 0.6542381974248928, + "grad_norm": 0.37109375, + "learning_rate": 4.8678298429583626e-06, + "loss": 1.8898, + "step": 12195 + }, + { + "epoch": 0.6542918454935622, + "grad_norm": 0.404296875, + "learning_rate": 4.867801967700175e-06, + "loss": 2.3072, + "step": 12196 + }, + { + "epoch": 0.6543454935622317, + "grad_norm": 0.50390625, + "learning_rate": 4.8677740895826195e-06, + "loss": 2.6223, + "step": 12197 + }, + { + "epoch": 0.6543991416309013, + "grad_norm": 0.45703125, + "learning_rate": 4.867746208605732e-06, + "loss": 2.441, + "step": 12198 + }, + { + "epoch": 0.6544527896995708, + "grad_norm": 0.4375, + "learning_rate": 4.867718324769545e-06, + "loss": 2.1275, + "step": 12199 + }, + { + "epoch": 0.6545064377682404, + "grad_norm": 0.416015625, + "learning_rate": 4.867690438074093e-06, + "loss": 2.4188, + "step": 12200 + }, + { + "epoch": 0.6545600858369098, + "grad_norm": 0.4921875, + "learning_rate": 4.867662548519408e-06, + "loss": 2.1024, + "step": 12201 + }, + { + "epoch": 0.6546137339055794, + "grad_norm": 0.439453125, + "learning_rate": 4.867634656105527e-06, + "loss": 2.3339, + "step": 12202 + }, + { + "epoch": 0.6546673819742489, + "grad_norm": 0.435546875, + "learning_rate": 4.86760676083248e-06, + "loss": 2.329, + "step": 12203 + }, + { + "epoch": 0.6547210300429185, + "grad_norm": 0.4453125, + "learning_rate": 4.867578862700302e-06, + "loss": 2.3659, + "step": 12204 + }, + { + "epoch": 0.654774678111588, + "grad_norm": 0.447265625, + "learning_rate": 4.867550961709027e-06, + "loss": 2.2883, + "step": 12205 + }, + { + "epoch": 0.6548283261802575, + "grad_norm": 0.74609375, + "learning_rate": 4.8675230578586895e-06, + "loss": 2.3577, + "step": 12206 + }, + { + "epoch": 0.654881974248927, + "grad_norm": 0.59765625, + "learning_rate": 4.867495151149321e-06, + "loss": 2.3331, + "step": 12207 + }, + { + "epoch": 0.6549356223175966, + "grad_norm": 0.51171875, + "learning_rate": 4.8674672415809575e-06, + "loss": 2.3164, + "step": 12208 + }, + { + "epoch": 0.6549892703862661, + "grad_norm": 0.3515625, + "learning_rate": 4.8674393291536315e-06, + "loss": 2.0063, + "step": 12209 + }, + { + "epoch": 0.6550429184549357, + "grad_norm": 0.51171875, + "learning_rate": 4.867411413867377e-06, + "loss": 2.4769, + "step": 12210 + }, + { + "epoch": 0.6550965665236052, + "grad_norm": 0.482421875, + "learning_rate": 4.867383495722226e-06, + "loss": 2.3052, + "step": 12211 + }, + { + "epoch": 0.6551502145922746, + "grad_norm": 0.453125, + "learning_rate": 4.867355574718216e-06, + "loss": 2.2897, + "step": 12212 + }, + { + "epoch": 0.6552038626609442, + "grad_norm": 0.427734375, + "learning_rate": 4.867327650855378e-06, + "loss": 2.4138, + "step": 12213 + }, + { + "epoch": 0.6552575107296137, + "grad_norm": 0.53515625, + "learning_rate": 4.867299724133746e-06, + "loss": 2.3876, + "step": 12214 + }, + { + "epoch": 0.6553111587982833, + "grad_norm": 0.462890625, + "learning_rate": 4.867271794553353e-06, + "loss": 2.2288, + "step": 12215 + }, + { + "epoch": 0.6553648068669528, + "grad_norm": 0.51171875, + "learning_rate": 4.867243862114235e-06, + "loss": 2.4348, + "step": 12216 + }, + { + "epoch": 0.6554184549356223, + "grad_norm": 0.4140625, + "learning_rate": 4.867215926816425e-06, + "loss": 1.9127, + "step": 12217 + }, + { + "epoch": 0.6554721030042918, + "grad_norm": 2.6875, + "learning_rate": 4.8671879886599545e-06, + "loss": 2.1533, + "step": 12218 + }, + { + "epoch": 0.6555257510729614, + "grad_norm": 0.38671875, + "learning_rate": 4.86716004764486e-06, + "loss": 2.2262, + "step": 12219 + }, + { + "epoch": 0.6555793991416309, + "grad_norm": 0.4296875, + "learning_rate": 4.867132103771174e-06, + "loss": 2.1197, + "step": 12220 + }, + { + "epoch": 0.6556330472103005, + "grad_norm": 0.392578125, + "learning_rate": 4.86710415703893e-06, + "loss": 1.9354, + "step": 12221 + }, + { + "epoch": 0.6556866952789699, + "grad_norm": 0.494140625, + "learning_rate": 4.867076207448162e-06, + "loss": 2.4617, + "step": 12222 + }, + { + "epoch": 0.6557403433476395, + "grad_norm": 0.49609375, + "learning_rate": 4.867048254998904e-06, + "loss": 2.1941, + "step": 12223 + }, + { + "epoch": 0.655793991416309, + "grad_norm": 0.515625, + "learning_rate": 4.86702029969119e-06, + "loss": 2.1914, + "step": 12224 + }, + { + "epoch": 0.6558476394849786, + "grad_norm": 0.482421875, + "learning_rate": 4.866992341525054e-06, + "loss": 2.255, + "step": 12225 + }, + { + "epoch": 0.6559012875536481, + "grad_norm": 0.515625, + "learning_rate": 4.866964380500529e-06, + "loss": 2.1208, + "step": 12226 + }, + { + "epoch": 0.6559549356223175, + "grad_norm": 0.447265625, + "learning_rate": 4.866936416617648e-06, + "loss": 2.4169, + "step": 12227 + }, + { + "epoch": 0.6560085836909871, + "grad_norm": 0.470703125, + "learning_rate": 4.866908449876446e-06, + "loss": 2.4845, + "step": 12228 + }, + { + "epoch": 0.6560622317596566, + "grad_norm": 0.45703125, + "learning_rate": 4.866880480276956e-06, + "loss": 2.3457, + "step": 12229 + }, + { + "epoch": 0.6561158798283262, + "grad_norm": 0.6484375, + "learning_rate": 4.8668525078192136e-06, + "loss": 2.4699, + "step": 12230 + }, + { + "epoch": 0.6561695278969957, + "grad_norm": 0.408203125, + "learning_rate": 4.86682453250325e-06, + "loss": 2.1431, + "step": 12231 + }, + { + "epoch": 0.6562231759656653, + "grad_norm": 0.43359375, + "learning_rate": 4.866796554329102e-06, + "loss": 2.3068, + "step": 12232 + }, + { + "epoch": 0.6562768240343347, + "grad_norm": 0.65234375, + "learning_rate": 4.8667685732967995e-06, + "loss": 2.3657, + "step": 12233 + }, + { + "epoch": 0.6563304721030043, + "grad_norm": 0.453125, + "learning_rate": 4.866740589406379e-06, + "loss": 2.2576, + "step": 12234 + }, + { + "epoch": 0.6563841201716738, + "grad_norm": 0.4375, + "learning_rate": 4.8667126026578745e-06, + "loss": 2.2066, + "step": 12235 + }, + { + "epoch": 0.6564377682403434, + "grad_norm": 0.55859375, + "learning_rate": 4.866684613051319e-06, + "loss": 2.4178, + "step": 12236 + }, + { + "epoch": 0.6564914163090129, + "grad_norm": 1.796875, + "learning_rate": 4.866656620586745e-06, + "loss": 2.3846, + "step": 12237 + }, + { + "epoch": 0.6565450643776825, + "grad_norm": 0.357421875, + "learning_rate": 4.866628625264189e-06, + "loss": 2.0755, + "step": 12238 + }, + { + "epoch": 0.6565987124463519, + "grad_norm": 0.4453125, + "learning_rate": 4.866600627083683e-06, + "loss": 1.9867, + "step": 12239 + }, + { + "epoch": 0.6566523605150214, + "grad_norm": 0.470703125, + "learning_rate": 4.866572626045261e-06, + "loss": 2.3646, + "step": 12240 + }, + { + "epoch": 0.656706008583691, + "grad_norm": 2.28125, + "learning_rate": 4.866544622148957e-06, + "loss": 2.242, + "step": 12241 + }, + { + "epoch": 0.6567596566523605, + "grad_norm": 0.39453125, + "learning_rate": 4.866516615394804e-06, + "loss": 2.2639, + "step": 12242 + }, + { + "epoch": 0.6568133047210301, + "grad_norm": 0.68359375, + "learning_rate": 4.866488605782837e-06, + "loss": 2.0085, + "step": 12243 + }, + { + "epoch": 0.6568669527896995, + "grad_norm": 0.5234375, + "learning_rate": 4.866460593313091e-06, + "loss": 2.2795, + "step": 12244 + }, + { + "epoch": 0.6569206008583691, + "grad_norm": 0.416015625, + "learning_rate": 4.866432577985597e-06, + "loss": 2.2095, + "step": 12245 + }, + { + "epoch": 0.6569742489270386, + "grad_norm": 0.40625, + "learning_rate": 4.866404559800391e-06, + "loss": 2.436, + "step": 12246 + }, + { + "epoch": 0.6570278969957082, + "grad_norm": 0.52734375, + "learning_rate": 4.866376538757505e-06, + "loss": 2.2402, + "step": 12247 + }, + { + "epoch": 0.6570815450643777, + "grad_norm": 0.44921875, + "learning_rate": 4.8663485148569745e-06, + "loss": 2.1256, + "step": 12248 + }, + { + "epoch": 0.6571351931330472, + "grad_norm": 0.625, + "learning_rate": 4.866320488098833e-06, + "loss": 2.4138, + "step": 12249 + }, + { + "epoch": 0.6571888412017167, + "grad_norm": 0.373046875, + "learning_rate": 4.866292458483113e-06, + "loss": 2.1534, + "step": 12250 + }, + { + "epoch": 0.6572424892703863, + "grad_norm": 0.8671875, + "learning_rate": 4.86626442600985e-06, + "loss": 2.3105, + "step": 12251 + }, + { + "epoch": 0.6572961373390558, + "grad_norm": 0.46875, + "learning_rate": 4.866236390679077e-06, + "loss": 2.3178, + "step": 12252 + }, + { + "epoch": 0.6573497854077254, + "grad_norm": 0.43359375, + "learning_rate": 4.866208352490828e-06, + "loss": 2.1978, + "step": 12253 + }, + { + "epoch": 0.6574034334763948, + "grad_norm": 0.3984375, + "learning_rate": 4.866180311445137e-06, + "loss": 2.2392, + "step": 12254 + }, + { + "epoch": 0.6574570815450643, + "grad_norm": 0.46875, + "learning_rate": 4.866152267542038e-06, + "loss": 2.4529, + "step": 12255 + }, + { + "epoch": 0.6575107296137339, + "grad_norm": 0.453125, + "learning_rate": 4.866124220781564e-06, + "loss": 2.1175, + "step": 12256 + }, + { + "epoch": 0.6575643776824034, + "grad_norm": 0.73046875, + "learning_rate": 4.866096171163751e-06, + "loss": 2.4651, + "step": 12257 + }, + { + "epoch": 0.657618025751073, + "grad_norm": 0.431640625, + "learning_rate": 4.86606811868863e-06, + "loss": 2.3068, + "step": 12258 + }, + { + "epoch": 0.6576716738197425, + "grad_norm": 0.490234375, + "learning_rate": 4.8660400633562365e-06, + "loss": 2.0511, + "step": 12259 + }, + { + "epoch": 0.657725321888412, + "grad_norm": 1.8515625, + "learning_rate": 4.8660120051666045e-06, + "loss": 2.308, + "step": 12260 + }, + { + "epoch": 0.6577789699570815, + "grad_norm": 0.6015625, + "learning_rate": 4.865983944119768e-06, + "loss": 2.5052, + "step": 12261 + }, + { + "epoch": 0.6578326180257511, + "grad_norm": 0.45703125, + "learning_rate": 4.865955880215759e-06, + "loss": 2.4425, + "step": 12262 + }, + { + "epoch": 0.6578862660944206, + "grad_norm": 0.392578125, + "learning_rate": 4.865927813454614e-06, + "loss": 2.1146, + "step": 12263 + }, + { + "epoch": 0.6579399141630902, + "grad_norm": 0.6640625, + "learning_rate": 4.865899743836365e-06, + "loss": 2.254, + "step": 12264 + }, + { + "epoch": 0.6579935622317596, + "grad_norm": 0.462890625, + "learning_rate": 4.865871671361047e-06, + "loss": 2.1818, + "step": 12265 + }, + { + "epoch": 0.6580472103004292, + "grad_norm": 0.42578125, + "learning_rate": 4.8658435960286935e-06, + "loss": 2.0859, + "step": 12266 + }, + { + "epoch": 0.6581008583690987, + "grad_norm": 0.369140625, + "learning_rate": 4.865815517839339e-06, + "loss": 2.0291, + "step": 12267 + }, + { + "epoch": 0.6581545064377683, + "grad_norm": 0.60546875, + "learning_rate": 4.865787436793015e-06, + "loss": 2.34, + "step": 12268 + }, + { + "epoch": 0.6582081545064378, + "grad_norm": 0.380859375, + "learning_rate": 4.865759352889758e-06, + "loss": 2.3206, + "step": 12269 + }, + { + "epoch": 0.6582618025751072, + "grad_norm": 0.412109375, + "learning_rate": 4.8657312661296015e-06, + "loss": 2.0935, + "step": 12270 + }, + { + "epoch": 0.6583154506437768, + "grad_norm": 0.470703125, + "learning_rate": 4.86570317651258e-06, + "loss": 2.3718, + "step": 12271 + }, + { + "epoch": 0.6583690987124463, + "grad_norm": 0.41015625, + "learning_rate": 4.865675084038726e-06, + "loss": 2.2422, + "step": 12272 + }, + { + "epoch": 0.6584227467811159, + "grad_norm": 0.5703125, + "learning_rate": 4.865646988708072e-06, + "loss": 2.2561, + "step": 12273 + }, + { + "epoch": 0.6584763948497854, + "grad_norm": 0.466796875, + "learning_rate": 4.865618890520656e-06, + "loss": 2.2279, + "step": 12274 + }, + { + "epoch": 0.658530042918455, + "grad_norm": 0.55859375, + "learning_rate": 4.865590789476509e-06, + "loss": 2.4892, + "step": 12275 + }, + { + "epoch": 0.6585836909871244, + "grad_norm": 0.45703125, + "learning_rate": 4.865562685575665e-06, + "loss": 2.3305, + "step": 12276 + }, + { + "epoch": 0.658637339055794, + "grad_norm": 0.5546875, + "learning_rate": 4.86553457881816e-06, + "loss": 2.302, + "step": 12277 + }, + { + "epoch": 0.6586909871244635, + "grad_norm": 0.408203125, + "learning_rate": 4.865506469204026e-06, + "loss": 2.0901, + "step": 12278 + }, + { + "epoch": 0.6587446351931331, + "grad_norm": 0.447265625, + "learning_rate": 4.865478356733298e-06, + "loss": 2.4, + "step": 12279 + }, + { + "epoch": 0.6587982832618026, + "grad_norm": 0.53125, + "learning_rate": 4.865450241406009e-06, + "loss": 2.4822, + "step": 12280 + }, + { + "epoch": 0.6588519313304722, + "grad_norm": 0.73046875, + "learning_rate": 4.865422123222194e-06, + "loss": 2.1403, + "step": 12281 + }, + { + "epoch": 0.6589055793991416, + "grad_norm": 0.451171875, + "learning_rate": 4.865394002181884e-06, + "loss": 2.1831, + "step": 12282 + }, + { + "epoch": 0.6589592274678111, + "grad_norm": 0.421875, + "learning_rate": 4.865365878285117e-06, + "loss": 2.096, + "step": 12283 + }, + { + "epoch": 0.6590128755364807, + "grad_norm": 0.60546875, + "learning_rate": 4.865337751531926e-06, + "loss": 2.4844, + "step": 12284 + }, + { + "epoch": 0.6590665236051502, + "grad_norm": 0.41796875, + "learning_rate": 4.865309621922344e-06, + "loss": 2.3683, + "step": 12285 + }, + { + "epoch": 0.6591201716738198, + "grad_norm": 0.52734375, + "learning_rate": 4.865281489456405e-06, + "loss": 1.5405, + "step": 12286 + }, + { + "epoch": 0.6591738197424892, + "grad_norm": 0.439453125, + "learning_rate": 4.865253354134143e-06, + "loss": 2.3584, + "step": 12287 + }, + { + "epoch": 0.6592274678111588, + "grad_norm": 0.703125, + "learning_rate": 4.865225215955592e-06, + "loss": 2.1664, + "step": 12288 + }, + { + "epoch": 0.6592811158798283, + "grad_norm": 0.4609375, + "learning_rate": 4.8651970749207864e-06, + "loss": 2.5236, + "step": 12289 + }, + { + "epoch": 0.6593347639484979, + "grad_norm": 0.462890625, + "learning_rate": 4.86516893102976e-06, + "loss": 2.1059, + "step": 12290 + }, + { + "epoch": 0.6593884120171674, + "grad_norm": 0.341796875, + "learning_rate": 4.865140784282547e-06, + "loss": 2.2401, + "step": 12291 + }, + { + "epoch": 0.659442060085837, + "grad_norm": 0.41015625, + "learning_rate": 4.865112634679181e-06, + "loss": 2.3157, + "step": 12292 + }, + { + "epoch": 0.6594957081545064, + "grad_norm": 0.5546875, + "learning_rate": 4.865084482219695e-06, + "loss": 2.2078, + "step": 12293 + }, + { + "epoch": 0.659549356223176, + "grad_norm": 0.494140625, + "learning_rate": 4.865056326904126e-06, + "loss": 2.1956, + "step": 12294 + }, + { + "epoch": 0.6596030042918455, + "grad_norm": 0.43359375, + "learning_rate": 4.865028168732505e-06, + "loss": 2.1983, + "step": 12295 + }, + { + "epoch": 0.6596566523605151, + "grad_norm": 0.50390625, + "learning_rate": 4.8650000077048675e-06, + "loss": 2.0322, + "step": 12296 + }, + { + "epoch": 0.6597103004291845, + "grad_norm": 0.46875, + "learning_rate": 4.864971843821246e-06, + "loss": 2.2667, + "step": 12297 + }, + { + "epoch": 0.659763948497854, + "grad_norm": 0.4140625, + "learning_rate": 4.864943677081677e-06, + "loss": 2.1312, + "step": 12298 + }, + { + "epoch": 0.6598175965665236, + "grad_norm": 4.46875, + "learning_rate": 4.864915507486193e-06, + "loss": 2.5598, + "step": 12299 + }, + { + "epoch": 0.6598712446351931, + "grad_norm": 0.765625, + "learning_rate": 4.864887335034828e-06, + "loss": 2.2492, + "step": 12300 + }, + { + "epoch": 0.6599248927038627, + "grad_norm": 0.470703125, + "learning_rate": 4.864859159727616e-06, + "loss": 2.2669, + "step": 12301 + }, + { + "epoch": 0.6599785407725322, + "grad_norm": 0.4140625, + "learning_rate": 4.8648309815645915e-06, + "loss": 2.3821, + "step": 12302 + }, + { + "epoch": 0.6600321888412017, + "grad_norm": 0.498046875, + "learning_rate": 4.864802800545788e-06, + "loss": 2.3567, + "step": 12303 + }, + { + "epoch": 0.6600858369098712, + "grad_norm": 0.431640625, + "learning_rate": 4.86477461667124e-06, + "loss": 2.4, + "step": 12304 + }, + { + "epoch": 0.6601394849785408, + "grad_norm": 0.439453125, + "learning_rate": 4.8647464299409816e-06, + "loss": 2.0743, + "step": 12305 + }, + { + "epoch": 0.6601931330472103, + "grad_norm": 0.35546875, + "learning_rate": 4.864718240355046e-06, + "loss": 2.3361, + "step": 12306 + }, + { + "epoch": 0.6602467811158799, + "grad_norm": 0.53515625, + "learning_rate": 4.864690047913468e-06, + "loss": 2.3153, + "step": 12307 + }, + { + "epoch": 0.6603004291845493, + "grad_norm": 0.453125, + "learning_rate": 4.864661852616281e-06, + "loss": 2.3998, + "step": 12308 + }, + { + "epoch": 0.6603540772532189, + "grad_norm": 0.4453125, + "learning_rate": 4.86463365446352e-06, + "loss": 2.2911, + "step": 12309 + }, + { + "epoch": 0.6604077253218884, + "grad_norm": 0.45703125, + "learning_rate": 4.864605453455218e-06, + "loss": 2.4683, + "step": 12310 + }, + { + "epoch": 0.660461373390558, + "grad_norm": 0.470703125, + "learning_rate": 4.86457724959141e-06, + "loss": 2.204, + "step": 12311 + }, + { + "epoch": 0.6605150214592275, + "grad_norm": 0.494140625, + "learning_rate": 4.86454904287213e-06, + "loss": 2.1787, + "step": 12312 + }, + { + "epoch": 0.660568669527897, + "grad_norm": 0.3984375, + "learning_rate": 4.864520833297411e-06, + "loss": 2.1063, + "step": 12313 + }, + { + "epoch": 0.6606223175965665, + "grad_norm": 0.4296875, + "learning_rate": 4.8644926208672886e-06, + "loss": 2.2, + "step": 12314 + }, + { + "epoch": 0.660675965665236, + "grad_norm": 0.4296875, + "learning_rate": 4.864464405581795e-06, + "loss": 2.1101, + "step": 12315 + }, + { + "epoch": 0.6607296137339056, + "grad_norm": 0.49609375, + "learning_rate": 4.864436187440966e-06, + "loss": 2.3437, + "step": 12316 + }, + { + "epoch": 0.6607832618025751, + "grad_norm": 0.4453125, + "learning_rate": 4.864407966444835e-06, + "loss": 2.3436, + "step": 12317 + }, + { + "epoch": 0.6608369098712447, + "grad_norm": 0.392578125, + "learning_rate": 4.8643797425934355e-06, + "loss": 2.1285, + "step": 12318 + }, + { + "epoch": 0.6608905579399141, + "grad_norm": 0.60546875, + "learning_rate": 4.864351515886803e-06, + "loss": 2.2166, + "step": 12319 + }, + { + "epoch": 0.6609442060085837, + "grad_norm": 0.48828125, + "learning_rate": 4.86432328632497e-06, + "loss": 2.2007, + "step": 12320 + }, + { + "epoch": 0.6609978540772532, + "grad_norm": 0.5703125, + "learning_rate": 4.864295053907972e-06, + "loss": 2.714, + "step": 12321 + }, + { + "epoch": 0.6610515021459228, + "grad_norm": 0.427734375, + "learning_rate": 4.864266818635842e-06, + "loss": 2.2618, + "step": 12322 + }, + { + "epoch": 0.6611051502145923, + "grad_norm": 0.4765625, + "learning_rate": 4.864238580508614e-06, + "loss": 1.5559, + "step": 12323 + }, + { + "epoch": 0.6611587982832619, + "grad_norm": 0.478515625, + "learning_rate": 4.864210339526324e-06, + "loss": 1.534, + "step": 12324 + }, + { + "epoch": 0.6612124463519313, + "grad_norm": 0.58984375, + "learning_rate": 4.864182095689004e-06, + "loss": 2.3087, + "step": 12325 + }, + { + "epoch": 0.6612660944206008, + "grad_norm": 0.46875, + "learning_rate": 4.864153848996689e-06, + "loss": 2.2179, + "step": 12326 + }, + { + "epoch": 0.6613197424892704, + "grad_norm": 0.41015625, + "learning_rate": 4.864125599449413e-06, + "loss": 2.2525, + "step": 12327 + }, + { + "epoch": 0.6613733905579399, + "grad_norm": 0.50390625, + "learning_rate": 4.86409734704721e-06, + "loss": 2.1338, + "step": 12328 + }, + { + "epoch": 0.6614270386266095, + "grad_norm": 0.515625, + "learning_rate": 4.8640690917901135e-06, + "loss": 2.3609, + "step": 12329 + }, + { + "epoch": 0.6614806866952789, + "grad_norm": 0.453125, + "learning_rate": 4.864040833678159e-06, + "loss": 2.2772, + "step": 12330 + }, + { + "epoch": 0.6615343347639485, + "grad_norm": 0.408203125, + "learning_rate": 4.8640125727113796e-06, + "loss": 2.0843, + "step": 12331 + }, + { + "epoch": 0.661587982832618, + "grad_norm": 0.498046875, + "learning_rate": 4.86398430888981e-06, + "loss": 2.5117, + "step": 12332 + }, + { + "epoch": 0.6616416309012876, + "grad_norm": 0.451171875, + "learning_rate": 4.863956042213484e-06, + "loss": 2.1961, + "step": 12333 + }, + { + "epoch": 0.6616952789699571, + "grad_norm": 0.41015625, + "learning_rate": 4.863927772682436e-06, + "loss": 2.2222, + "step": 12334 + }, + { + "epoch": 0.6617489270386266, + "grad_norm": 0.51171875, + "learning_rate": 4.863899500296701e-06, + "loss": 1.6262, + "step": 12335 + }, + { + "epoch": 0.6618025751072961, + "grad_norm": 0.484375, + "learning_rate": 4.86387122505631e-06, + "loss": 2.1372, + "step": 12336 + }, + { + "epoch": 0.6618562231759657, + "grad_norm": 0.5234375, + "learning_rate": 4.8638429469613e-06, + "loss": 2.279, + "step": 12337 + }, + { + "epoch": 0.6619098712446352, + "grad_norm": 0.44921875, + "learning_rate": 4.863814666011705e-06, + "loss": 2.5454, + "step": 12338 + }, + { + "epoch": 0.6619635193133048, + "grad_norm": 0.455078125, + "learning_rate": 4.8637863822075585e-06, + "loss": 2.2823, + "step": 12339 + }, + { + "epoch": 0.6620171673819742, + "grad_norm": 0.376953125, + "learning_rate": 4.8637580955488935e-06, + "loss": 2.2345, + "step": 12340 + }, + { + "epoch": 0.6620708154506437, + "grad_norm": 0.408203125, + "learning_rate": 4.863729806035746e-06, + "loss": 2.2311, + "step": 12341 + }, + { + "epoch": 0.6621244635193133, + "grad_norm": 0.4140625, + "learning_rate": 4.86370151366815e-06, + "loss": 2.4048, + "step": 12342 + }, + { + "epoch": 0.6621781115879828, + "grad_norm": 0.384765625, + "learning_rate": 4.863673218446139e-06, + "loss": 2.1903, + "step": 12343 + }, + { + "epoch": 0.6622317596566524, + "grad_norm": 0.390625, + "learning_rate": 4.8636449203697475e-06, + "loss": 2.4011, + "step": 12344 + }, + { + "epoch": 0.6622854077253219, + "grad_norm": 0.53515625, + "learning_rate": 4.863616619439009e-06, + "loss": 2.3016, + "step": 12345 + }, + { + "epoch": 0.6623390557939914, + "grad_norm": 0.45703125, + "learning_rate": 4.863588315653959e-06, + "loss": 2.3385, + "step": 12346 + }, + { + "epoch": 0.6623927038626609, + "grad_norm": 0.453125, + "learning_rate": 4.86356000901463e-06, + "loss": 2.1856, + "step": 12347 + }, + { + "epoch": 0.6624463519313305, + "grad_norm": 0.494140625, + "learning_rate": 4.863531699521058e-06, + "loss": 2.2517, + "step": 12348 + }, + { + "epoch": 0.6625, + "grad_norm": 0.390625, + "learning_rate": 4.863503387173276e-06, + "loss": 2.1837, + "step": 12349 + }, + { + "epoch": 0.6625536480686696, + "grad_norm": 0.4296875, + "learning_rate": 4.863475071971317e-06, + "loss": 2.4258, + "step": 12350 + }, + { + "epoch": 0.662607296137339, + "grad_norm": 0.67578125, + "learning_rate": 4.863446753915219e-06, + "loss": 2.0798, + "step": 12351 + }, + { + "epoch": 0.6626609442060086, + "grad_norm": 0.462890625, + "learning_rate": 4.863418433005013e-06, + "loss": 2.5215, + "step": 12352 + }, + { + "epoch": 0.6627145922746781, + "grad_norm": 0.431640625, + "learning_rate": 4.863390109240733e-06, + "loss": 2.1346, + "step": 12353 + }, + { + "epoch": 0.6627682403433477, + "grad_norm": 0.421875, + "learning_rate": 4.8633617826224154e-06, + "loss": 2.2611, + "step": 12354 + }, + { + "epoch": 0.6628218884120172, + "grad_norm": 0.474609375, + "learning_rate": 4.863333453150093e-06, + "loss": 2.4681, + "step": 12355 + }, + { + "epoch": 0.6628755364806866, + "grad_norm": 0.45703125, + "learning_rate": 4.8633051208238e-06, + "loss": 2.4471, + "step": 12356 + }, + { + "epoch": 0.6629291845493562, + "grad_norm": 0.37109375, + "learning_rate": 4.863276785643571e-06, + "loss": 1.9981, + "step": 12357 + }, + { + "epoch": 0.6629828326180257, + "grad_norm": 0.7421875, + "learning_rate": 4.86324844760944e-06, + "loss": 2.1666, + "step": 12358 + }, + { + "epoch": 0.6630364806866953, + "grad_norm": 0.6875, + "learning_rate": 4.863220106721441e-06, + "loss": 2.5872, + "step": 12359 + }, + { + "epoch": 0.6630901287553648, + "grad_norm": 0.58203125, + "learning_rate": 4.8631917629796095e-06, + "loss": 2.2853, + "step": 12360 + }, + { + "epoch": 0.6631437768240344, + "grad_norm": 0.54296875, + "learning_rate": 4.863163416383978e-06, + "loss": 2.0991, + "step": 12361 + }, + { + "epoch": 0.6631974248927038, + "grad_norm": 0.453125, + "learning_rate": 4.863135066934582e-06, + "loss": 2.3962, + "step": 12362 + }, + { + "epoch": 0.6632510729613734, + "grad_norm": 0.38671875, + "learning_rate": 4.863106714631455e-06, + "loss": 2.3247, + "step": 12363 + }, + { + "epoch": 0.6633047210300429, + "grad_norm": 0.85546875, + "learning_rate": 4.8630783594746316e-06, + "loss": 2.3571, + "step": 12364 + }, + { + "epoch": 0.6633583690987125, + "grad_norm": 0.453125, + "learning_rate": 4.8630500014641455e-06, + "loss": 2.2842, + "step": 12365 + }, + { + "epoch": 0.663412017167382, + "grad_norm": 0.4375, + "learning_rate": 4.863021640600032e-06, + "loss": 1.9923, + "step": 12366 + }, + { + "epoch": 0.6634656652360515, + "grad_norm": 0.52734375, + "learning_rate": 4.862993276882324e-06, + "loss": 2.1579, + "step": 12367 + }, + { + "epoch": 0.663519313304721, + "grad_norm": 0.44921875, + "learning_rate": 4.862964910311057e-06, + "loss": 2.1073, + "step": 12368 + }, + { + "epoch": 0.6635729613733906, + "grad_norm": 0.439453125, + "learning_rate": 4.862936540886264e-06, + "loss": 2.2867, + "step": 12369 + }, + { + "epoch": 0.6636266094420601, + "grad_norm": 0.4453125, + "learning_rate": 4.86290816860798e-06, + "loss": 2.3123, + "step": 12370 + }, + { + "epoch": 0.6636802575107296, + "grad_norm": 0.390625, + "learning_rate": 4.86287979347624e-06, + "loss": 2.3451, + "step": 12371 + }, + { + "epoch": 0.6637339055793992, + "grad_norm": 0.494140625, + "learning_rate": 4.862851415491078e-06, + "loss": 2.354, + "step": 12372 + }, + { + "epoch": 0.6637875536480686, + "grad_norm": 0.416015625, + "learning_rate": 4.862823034652527e-06, + "loss": 2.4811, + "step": 12373 + }, + { + "epoch": 0.6638412017167382, + "grad_norm": 0.453125, + "learning_rate": 4.862794650960622e-06, + "loss": 2.2897, + "step": 12374 + }, + { + "epoch": 0.6638948497854077, + "grad_norm": 0.48046875, + "learning_rate": 4.8627662644153975e-06, + "loss": 2.3665, + "step": 12375 + }, + { + "epoch": 0.6639484978540773, + "grad_norm": 0.40234375, + "learning_rate": 4.862737875016888e-06, + "loss": 2.1516, + "step": 12376 + }, + { + "epoch": 0.6640021459227468, + "grad_norm": 0.466796875, + "learning_rate": 4.862709482765126e-06, + "loss": 2.1191, + "step": 12377 + }, + { + "epoch": 0.6640557939914163, + "grad_norm": 0.4375, + "learning_rate": 4.862681087660148e-06, + "loss": 2.2577, + "step": 12378 + }, + { + "epoch": 0.6641094420600858, + "grad_norm": 0.427734375, + "learning_rate": 4.862652689701988e-06, + "loss": 2.4007, + "step": 12379 + }, + { + "epoch": 0.6641630901287554, + "grad_norm": 0.466796875, + "learning_rate": 4.8626242888906795e-06, + "loss": 2.2801, + "step": 12380 + }, + { + "epoch": 0.6642167381974249, + "grad_norm": 0.453125, + "learning_rate": 4.862595885226257e-06, + "loss": 2.2821, + "step": 12381 + }, + { + "epoch": 0.6642703862660945, + "grad_norm": 0.4296875, + "learning_rate": 4.862567478708754e-06, + "loss": 2.2268, + "step": 12382 + }, + { + "epoch": 0.664324034334764, + "grad_norm": 0.431640625, + "learning_rate": 4.8625390693382065e-06, + "loss": 2.18, + "step": 12383 + }, + { + "epoch": 0.6643776824034334, + "grad_norm": 0.47265625, + "learning_rate": 4.862510657114647e-06, + "loss": 2.3442, + "step": 12384 + }, + { + "epoch": 0.664431330472103, + "grad_norm": 0.51953125, + "learning_rate": 4.8624822420381124e-06, + "loss": 2.1374, + "step": 12385 + }, + { + "epoch": 0.6644849785407725, + "grad_norm": 0.376953125, + "learning_rate": 4.862453824108635e-06, + "loss": 2.1159, + "step": 12386 + }, + { + "epoch": 0.6645386266094421, + "grad_norm": 0.435546875, + "learning_rate": 4.8624254033262485e-06, + "loss": 2.4344, + "step": 12387 + }, + { + "epoch": 0.6645922746781115, + "grad_norm": 0.431640625, + "learning_rate": 4.862396979690989e-06, + "loss": 2.3422, + "step": 12388 + }, + { + "epoch": 0.6646459227467811, + "grad_norm": 0.43359375, + "learning_rate": 4.86236855320289e-06, + "loss": 2.1777, + "step": 12389 + }, + { + "epoch": 0.6646995708154506, + "grad_norm": 0.48828125, + "learning_rate": 4.8623401238619864e-06, + "loss": 2.2284, + "step": 12390 + }, + { + "epoch": 0.6647532188841202, + "grad_norm": 0.44140625, + "learning_rate": 4.862311691668311e-06, + "loss": 2.1325, + "step": 12391 + }, + { + "epoch": 0.6648068669527897, + "grad_norm": 0.51171875, + "learning_rate": 4.8622832566218995e-06, + "loss": 2.2284, + "step": 12392 + }, + { + "epoch": 0.6648605150214593, + "grad_norm": 0.5, + "learning_rate": 4.862254818722786e-06, + "loss": 2.0959, + "step": 12393 + }, + { + "epoch": 0.6649141630901287, + "grad_norm": 0.392578125, + "learning_rate": 4.862226377971005e-06, + "loss": 2.3094, + "step": 12394 + }, + { + "epoch": 0.6649678111587983, + "grad_norm": 0.41015625, + "learning_rate": 4.862197934366589e-06, + "loss": 2.1701, + "step": 12395 + }, + { + "epoch": 0.6650214592274678, + "grad_norm": 0.455078125, + "learning_rate": 4.862169487909576e-06, + "loss": 2.0466, + "step": 12396 + }, + { + "epoch": 0.6650751072961374, + "grad_norm": 0.46875, + "learning_rate": 4.862141038599997e-06, + "loss": 2.329, + "step": 12397 + }, + { + "epoch": 0.6651287553648069, + "grad_norm": 0.7265625, + "learning_rate": 4.862112586437888e-06, + "loss": 2.2158, + "step": 12398 + }, + { + "epoch": 0.6651824034334763, + "grad_norm": 0.451171875, + "learning_rate": 4.862084131423283e-06, + "loss": 2.4112, + "step": 12399 + }, + { + "epoch": 0.6652360515021459, + "grad_norm": 0.5, + "learning_rate": 4.862055673556216e-06, + "loss": 2.4372, + "step": 12400 + }, + { + "epoch": 0.6652896995708154, + "grad_norm": 0.419921875, + "learning_rate": 4.862027212836723e-06, + "loss": 2.2595, + "step": 12401 + }, + { + "epoch": 0.665343347639485, + "grad_norm": 0.46875, + "learning_rate": 4.861998749264836e-06, + "loss": 2.2384, + "step": 12402 + }, + { + "epoch": 0.6653969957081545, + "grad_norm": 0.4296875, + "learning_rate": 4.86197028284059e-06, + "loss": 2.2432, + "step": 12403 + }, + { + "epoch": 0.6654506437768241, + "grad_norm": 0.4609375, + "learning_rate": 4.8619418135640205e-06, + "loss": 2.4418, + "step": 12404 + }, + { + "epoch": 0.6655042918454935, + "grad_norm": 0.470703125, + "learning_rate": 4.8619133414351615e-06, + "loss": 2.2409, + "step": 12405 + }, + { + "epoch": 0.6655579399141631, + "grad_norm": 0.46875, + "learning_rate": 4.861884866454047e-06, + "loss": 1.9955, + "step": 12406 + }, + { + "epoch": 0.6656115879828326, + "grad_norm": 0.4921875, + "learning_rate": 4.8618563886207114e-06, + "loss": 1.9396, + "step": 12407 + }, + { + "epoch": 0.6656652360515022, + "grad_norm": 0.423828125, + "learning_rate": 4.861827907935189e-06, + "loss": 2.1195, + "step": 12408 + }, + { + "epoch": 0.6657188841201717, + "grad_norm": 0.546875, + "learning_rate": 4.8617994243975145e-06, + "loss": 2.3636, + "step": 12409 + }, + { + "epoch": 0.6657725321888412, + "grad_norm": 0.39453125, + "learning_rate": 4.8617709380077216e-06, + "loss": 2.4473, + "step": 12410 + }, + { + "epoch": 0.6658261802575107, + "grad_norm": 0.443359375, + "learning_rate": 4.861742448765846e-06, + "loss": 2.4611, + "step": 12411 + }, + { + "epoch": 0.6658798283261803, + "grad_norm": 0.45703125, + "learning_rate": 4.8617139566719205e-06, + "loss": 2.3807, + "step": 12412 + }, + { + "epoch": 0.6659334763948498, + "grad_norm": 0.453125, + "learning_rate": 4.861685461725981e-06, + "loss": 2.2984, + "step": 12413 + }, + { + "epoch": 0.6659871244635193, + "grad_norm": 0.59375, + "learning_rate": 4.861656963928062e-06, + "loss": 2.3963, + "step": 12414 + }, + { + "epoch": 0.6660407725321889, + "grad_norm": 0.49609375, + "learning_rate": 4.861628463278196e-06, + "loss": 2.3943, + "step": 12415 + }, + { + "epoch": 0.6660944206008583, + "grad_norm": 0.431640625, + "learning_rate": 4.86159995977642e-06, + "loss": 2.1844, + "step": 12416 + }, + { + "epoch": 0.6661480686695279, + "grad_norm": 0.474609375, + "learning_rate": 4.861571453422765e-06, + "loss": 2.1834, + "step": 12417 + }, + { + "epoch": 0.6662017167381974, + "grad_norm": 0.62890625, + "learning_rate": 4.861542944217269e-06, + "loss": 2.219, + "step": 12418 + }, + { + "epoch": 0.666255364806867, + "grad_norm": 0.443359375, + "learning_rate": 4.8615144321599645e-06, + "loss": 2.3397, + "step": 12419 + }, + { + "epoch": 0.6663090128755365, + "grad_norm": 0.419921875, + "learning_rate": 4.8614859172508865e-06, + "loss": 2.369, + "step": 12420 + }, + { + "epoch": 0.666362660944206, + "grad_norm": 0.421875, + "learning_rate": 4.86145739949007e-06, + "loss": 2.443, + "step": 12421 + }, + { + "epoch": 0.6664163090128755, + "grad_norm": 0.388671875, + "learning_rate": 4.861428878877546e-06, + "loss": 2.1845, + "step": 12422 + }, + { + "epoch": 0.6664699570815451, + "grad_norm": 0.4609375, + "learning_rate": 4.861400355413354e-06, + "loss": 1.9744, + "step": 12423 + }, + { + "epoch": 0.6665236051502146, + "grad_norm": 0.5, + "learning_rate": 4.861371829097526e-06, + "loss": 2.3083, + "step": 12424 + }, + { + "epoch": 0.6665772532188842, + "grad_norm": 0.38671875, + "learning_rate": 4.8613432999300955e-06, + "loss": 1.8124, + "step": 12425 + }, + { + "epoch": 0.6666309012875536, + "grad_norm": 0.40234375, + "learning_rate": 4.8613147679110985e-06, + "loss": 2.1109, + "step": 12426 + }, + { + "epoch": 0.6666845493562231, + "grad_norm": 0.43359375, + "learning_rate": 4.8612862330405685e-06, + "loss": 2.266, + "step": 12427 + }, + { + "epoch": 0.6667381974248927, + "grad_norm": 0.46484375, + "learning_rate": 4.861257695318541e-06, + "loss": 2.2259, + "step": 12428 + }, + { + "epoch": 0.6667918454935622, + "grad_norm": 0.41796875, + "learning_rate": 4.86122915474505e-06, + "loss": 2.3046, + "step": 12429 + }, + { + "epoch": 0.6668454935622318, + "grad_norm": 0.369140625, + "learning_rate": 4.861200611320129e-06, + "loss": 2.3083, + "step": 12430 + }, + { + "epoch": 0.6668991416309012, + "grad_norm": 0.5390625, + "learning_rate": 4.861172065043814e-06, + "loss": 1.5401, + "step": 12431 + }, + { + "epoch": 0.6669527896995708, + "grad_norm": 0.494140625, + "learning_rate": 4.861143515916139e-06, + "loss": 2.4225, + "step": 12432 + }, + { + "epoch": 0.6670064377682403, + "grad_norm": 0.44921875, + "learning_rate": 4.861114963937137e-06, + "loss": 1.8754, + "step": 12433 + }, + { + "epoch": 0.6670600858369099, + "grad_norm": 0.466796875, + "learning_rate": 4.861086409106846e-06, + "loss": 2.5415, + "step": 12434 + }, + { + "epoch": 0.6671137339055794, + "grad_norm": 0.451171875, + "learning_rate": 4.861057851425296e-06, + "loss": 2.1275, + "step": 12435 + }, + { + "epoch": 0.667167381974249, + "grad_norm": 0.4609375, + "learning_rate": 4.8610292908925236e-06, + "loss": 2.2642, + "step": 12436 + }, + { + "epoch": 0.6672210300429184, + "grad_norm": 0.51171875, + "learning_rate": 4.861000727508565e-06, + "loss": 2.3403, + "step": 12437 + }, + { + "epoch": 0.667274678111588, + "grad_norm": 0.48046875, + "learning_rate": 4.860972161273452e-06, + "loss": 2.4241, + "step": 12438 + }, + { + "epoch": 0.6673283261802575, + "grad_norm": 0.609375, + "learning_rate": 4.860943592187221e-06, + "loss": 1.5696, + "step": 12439 + }, + { + "epoch": 0.6673819742489271, + "grad_norm": 0.455078125, + "learning_rate": 4.8609150202499046e-06, + "loss": 2.2215, + "step": 12440 + }, + { + "epoch": 0.6674356223175966, + "grad_norm": 0.4140625, + "learning_rate": 4.860886445461539e-06, + "loss": 2.285, + "step": 12441 + }, + { + "epoch": 0.667489270386266, + "grad_norm": 1.09375, + "learning_rate": 4.860857867822158e-06, + "loss": 2.2634, + "step": 12442 + }, + { + "epoch": 0.6675429184549356, + "grad_norm": 0.390625, + "learning_rate": 4.8608292873317965e-06, + "loss": 2.1254, + "step": 12443 + }, + { + "epoch": 0.6675965665236051, + "grad_norm": 0.43359375, + "learning_rate": 4.860800703990488e-06, + "loss": 2.4502, + "step": 12444 + }, + { + "epoch": 0.6676502145922747, + "grad_norm": 0.435546875, + "learning_rate": 4.860772117798268e-06, + "loss": 2.2276, + "step": 12445 + }, + { + "epoch": 0.6677038626609442, + "grad_norm": 0.46484375, + "learning_rate": 4.860743528755171e-06, + "loss": 2.3397, + "step": 12446 + }, + { + "epoch": 0.6677575107296138, + "grad_norm": 0.486328125, + "learning_rate": 4.860714936861231e-06, + "loss": 2.5993, + "step": 12447 + }, + { + "epoch": 0.6678111587982832, + "grad_norm": 0.43359375, + "learning_rate": 4.860686342116483e-06, + "loss": 2.1564, + "step": 12448 + }, + { + "epoch": 0.6678648068669528, + "grad_norm": 0.412109375, + "learning_rate": 4.860657744520962e-06, + "loss": 1.9293, + "step": 12449 + }, + { + "epoch": 0.6679184549356223, + "grad_norm": 0.439453125, + "learning_rate": 4.8606291440747e-06, + "loss": 2.2869, + "step": 12450 + }, + { + "epoch": 0.6679721030042919, + "grad_norm": 0.42578125, + "learning_rate": 4.860600540777734e-06, + "loss": 2.4082, + "step": 12451 + }, + { + "epoch": 0.6680257510729614, + "grad_norm": 0.45703125, + "learning_rate": 4.860571934630099e-06, + "loss": 2.3936, + "step": 12452 + }, + { + "epoch": 0.668079399141631, + "grad_norm": 0.4453125, + "learning_rate": 4.860543325631828e-06, + "loss": 2.5485, + "step": 12453 + }, + { + "epoch": 0.6681330472103004, + "grad_norm": 0.466796875, + "learning_rate": 4.8605147137829555e-06, + "loss": 2.1145, + "step": 12454 + }, + { + "epoch": 0.66818669527897, + "grad_norm": 0.345703125, + "learning_rate": 4.860486099083517e-06, + "loss": 2.3245, + "step": 12455 + }, + { + "epoch": 0.6682403433476395, + "grad_norm": 0.458984375, + "learning_rate": 4.860457481533547e-06, + "loss": 2.3219, + "step": 12456 + }, + { + "epoch": 0.668293991416309, + "grad_norm": 0.482421875, + "learning_rate": 4.8604288611330795e-06, + "loss": 2.0359, + "step": 12457 + }, + { + "epoch": 0.6683476394849786, + "grad_norm": 0.458984375, + "learning_rate": 4.860400237882149e-06, + "loss": 2.0282, + "step": 12458 + }, + { + "epoch": 0.668401287553648, + "grad_norm": 0.51171875, + "learning_rate": 4.8603716117807904e-06, + "loss": 2.1264, + "step": 12459 + }, + { + "epoch": 0.6684549356223176, + "grad_norm": 0.4375, + "learning_rate": 4.860342982829038e-06, + "loss": 2.1498, + "step": 12460 + }, + { + "epoch": 0.6685085836909871, + "grad_norm": 0.54296875, + "learning_rate": 4.860314351026927e-06, + "loss": 2.3339, + "step": 12461 + }, + { + "epoch": 0.6685622317596567, + "grad_norm": 0.4453125, + "learning_rate": 4.860285716374491e-06, + "loss": 2.3265, + "step": 12462 + }, + { + "epoch": 0.6686158798283262, + "grad_norm": 0.44921875, + "learning_rate": 4.8602570788717665e-06, + "loss": 2.158, + "step": 12463 + }, + { + "epoch": 0.6686695278969957, + "grad_norm": 0.41796875, + "learning_rate": 4.8602284385187845e-06, + "loss": 2.2969, + "step": 12464 + }, + { + "epoch": 0.6687231759656652, + "grad_norm": 0.3984375, + "learning_rate": 4.860199795315583e-06, + "loss": 2.5209, + "step": 12465 + }, + { + "epoch": 0.6687768240343348, + "grad_norm": 1.109375, + "learning_rate": 4.8601711492621964e-06, + "loss": 2.1997, + "step": 12466 + }, + { + "epoch": 0.6688304721030043, + "grad_norm": 0.455078125, + "learning_rate": 4.8601425003586565e-06, + "loss": 2.2169, + "step": 12467 + }, + { + "epoch": 0.6688841201716739, + "grad_norm": 0.609375, + "learning_rate": 4.860113848605001e-06, + "loss": 2.396, + "step": 12468 + }, + { + "epoch": 0.6689377682403433, + "grad_norm": 0.408203125, + "learning_rate": 4.860085194001262e-06, + "loss": 2.2031, + "step": 12469 + }, + { + "epoch": 0.6689914163090128, + "grad_norm": 1.2421875, + "learning_rate": 4.8600565365474765e-06, + "loss": 2.2396, + "step": 12470 + }, + { + "epoch": 0.6690450643776824, + "grad_norm": 0.4453125, + "learning_rate": 4.860027876243677e-06, + "loss": 2.2618, + "step": 12471 + }, + { + "epoch": 0.6690987124463519, + "grad_norm": 0.83203125, + "learning_rate": 4.859999213089899e-06, + "loss": 2.2615, + "step": 12472 + }, + { + "epoch": 0.6691523605150215, + "grad_norm": 0.48828125, + "learning_rate": 4.859970547086177e-06, + "loss": 2.3953, + "step": 12473 + }, + { + "epoch": 0.669206008583691, + "grad_norm": 0.42578125, + "learning_rate": 4.8599418782325465e-06, + "loss": 2.4158, + "step": 12474 + }, + { + "epoch": 0.6692596566523605, + "grad_norm": 0.45703125, + "learning_rate": 4.859913206529041e-06, + "loss": 2.1432, + "step": 12475 + }, + { + "epoch": 0.66931330472103, + "grad_norm": 0.5, + "learning_rate": 4.859884531975695e-06, + "loss": 2.1535, + "step": 12476 + }, + { + "epoch": 0.6693669527896996, + "grad_norm": 0.484375, + "learning_rate": 4.8598558545725434e-06, + "loss": 2.3907, + "step": 12477 + }, + { + "epoch": 0.6694206008583691, + "grad_norm": 0.5234375, + "learning_rate": 4.859827174319622e-06, + "loss": 2.1619, + "step": 12478 + }, + { + "epoch": 0.6694742489270387, + "grad_norm": 0.39453125, + "learning_rate": 4.859798491216964e-06, + "loss": 2.3204, + "step": 12479 + }, + { + "epoch": 0.6695278969957081, + "grad_norm": 0.486328125, + "learning_rate": 4.859769805264605e-06, + "loss": 2.4046, + "step": 12480 + }, + { + "epoch": 0.6695815450643777, + "grad_norm": 0.447265625, + "learning_rate": 4.859741116462578e-06, + "loss": 2.2409, + "step": 12481 + }, + { + "epoch": 0.6696351931330472, + "grad_norm": 1.078125, + "learning_rate": 4.85971242481092e-06, + "loss": 2.5228, + "step": 12482 + }, + { + "epoch": 0.6696888412017168, + "grad_norm": 0.5078125, + "learning_rate": 4.859683730309664e-06, + "loss": 2.2055, + "step": 12483 + }, + { + "epoch": 0.6697424892703863, + "grad_norm": 0.431640625, + "learning_rate": 4.859655032958845e-06, + "loss": 2.2412, + "step": 12484 + }, + { + "epoch": 0.6697961373390557, + "grad_norm": 0.50390625, + "learning_rate": 4.859626332758498e-06, + "loss": 2.2988, + "step": 12485 + }, + { + "epoch": 0.6698497854077253, + "grad_norm": 0.546875, + "learning_rate": 4.859597629708656e-06, + "loss": 2.2402, + "step": 12486 + }, + { + "epoch": 0.6699034334763948, + "grad_norm": 0.419921875, + "learning_rate": 4.859568923809357e-06, + "loss": 2.2803, + "step": 12487 + }, + { + "epoch": 0.6699570815450644, + "grad_norm": 0.4296875, + "learning_rate": 4.859540215060633e-06, + "loss": 2.029, + "step": 12488 + }, + { + "epoch": 0.6700107296137339, + "grad_norm": 0.421875, + "learning_rate": 4.8595115034625195e-06, + "loss": 2.0959, + "step": 12489 + }, + { + "epoch": 0.6700643776824035, + "grad_norm": 0.4296875, + "learning_rate": 4.859482789015051e-06, + "loss": 2.1956, + "step": 12490 + }, + { + "epoch": 0.6701180257510729, + "grad_norm": 0.51953125, + "learning_rate": 4.859454071718262e-06, + "loss": 2.1623, + "step": 12491 + }, + { + "epoch": 0.6701716738197425, + "grad_norm": 0.4375, + "learning_rate": 4.8594253515721874e-06, + "loss": 2.23, + "step": 12492 + }, + { + "epoch": 0.670225321888412, + "grad_norm": 0.44921875, + "learning_rate": 4.859396628576862e-06, + "loss": 2.3118, + "step": 12493 + }, + { + "epoch": 0.6702789699570816, + "grad_norm": 0.48828125, + "learning_rate": 4.85936790273232e-06, + "loss": 2.2122, + "step": 12494 + }, + { + "epoch": 0.6703326180257511, + "grad_norm": 0.53125, + "learning_rate": 4.859339174038598e-06, + "loss": 2.4373, + "step": 12495 + }, + { + "epoch": 0.6703862660944206, + "grad_norm": 0.51171875, + "learning_rate": 4.8593104424957275e-06, + "loss": 2.2541, + "step": 12496 + }, + { + "epoch": 0.6704399141630901, + "grad_norm": 0.462890625, + "learning_rate": 4.859281708103745e-06, + "loss": 2.462, + "step": 12497 + }, + { + "epoch": 0.6704935622317597, + "grad_norm": 0.423828125, + "learning_rate": 4.859252970862686e-06, + "loss": 2.1569, + "step": 12498 + }, + { + "epoch": 0.6705472103004292, + "grad_norm": 0.349609375, + "learning_rate": 4.859224230772584e-06, + "loss": 2.0464, + "step": 12499 + }, + { + "epoch": 0.6706008583690987, + "grad_norm": 0.37890625, + "learning_rate": 4.859195487833474e-06, + "loss": 2.175, + "step": 12500 + }, + { + "epoch": 0.6706545064377682, + "grad_norm": 0.462890625, + "learning_rate": 4.85916674204539e-06, + "loss": 2.4285, + "step": 12501 + }, + { + "epoch": 0.6707081545064377, + "grad_norm": 0.455078125, + "learning_rate": 4.859137993408368e-06, + "loss": 2.4529, + "step": 12502 + }, + { + "epoch": 0.6707618025751073, + "grad_norm": 0.318359375, + "learning_rate": 4.8591092419224415e-06, + "loss": 1.9056, + "step": 12503 + }, + { + "epoch": 0.6708154506437768, + "grad_norm": 0.431640625, + "learning_rate": 4.859080487587646e-06, + "loss": 2.07, + "step": 12504 + }, + { + "epoch": 0.6708690987124464, + "grad_norm": 0.58203125, + "learning_rate": 4.859051730404017e-06, + "loss": 2.325, + "step": 12505 + }, + { + "epoch": 0.6709227467811159, + "grad_norm": 0.515625, + "learning_rate": 4.859022970371587e-06, + "loss": 2.3246, + "step": 12506 + }, + { + "epoch": 0.6709763948497854, + "grad_norm": 0.47265625, + "learning_rate": 4.8589942074903925e-06, + "loss": 2.4192, + "step": 12507 + }, + { + "epoch": 0.6710300429184549, + "grad_norm": 0.57421875, + "learning_rate": 4.858965441760469e-06, + "loss": 2.4694, + "step": 12508 + }, + { + "epoch": 0.6710836909871245, + "grad_norm": 0.55859375, + "learning_rate": 4.858936673181849e-06, + "loss": 2.0829, + "step": 12509 + }, + { + "epoch": 0.671137339055794, + "grad_norm": 0.45703125, + "learning_rate": 4.858907901754567e-06, + "loss": 2.2833, + "step": 12510 + }, + { + "epoch": 0.6711909871244636, + "grad_norm": 0.435546875, + "learning_rate": 4.85887912747866e-06, + "loss": 2.2418, + "step": 12511 + }, + { + "epoch": 0.671244635193133, + "grad_norm": 0.53515625, + "learning_rate": 4.858850350354163e-06, + "loss": 2.4471, + "step": 12512 + }, + { + "epoch": 0.6712982832618025, + "grad_norm": 0.94921875, + "learning_rate": 4.858821570381108e-06, + "loss": 2.4182, + "step": 12513 + }, + { + "epoch": 0.6713519313304721, + "grad_norm": 0.43359375, + "learning_rate": 4.8587927875595315e-06, + "loss": 2.4544, + "step": 12514 + }, + { + "epoch": 0.6714055793991416, + "grad_norm": 0.43359375, + "learning_rate": 4.858764001889468e-06, + "loss": 2.2094, + "step": 12515 + }, + { + "epoch": 0.6714592274678112, + "grad_norm": 0.48046875, + "learning_rate": 4.8587352133709525e-06, + "loss": 2.5213, + "step": 12516 + }, + { + "epoch": 0.6715128755364806, + "grad_norm": 0.5390625, + "learning_rate": 4.8587064220040196e-06, + "loss": 2.0646, + "step": 12517 + }, + { + "epoch": 0.6715665236051502, + "grad_norm": 0.44921875, + "learning_rate": 4.858677627788704e-06, + "loss": 2.1449, + "step": 12518 + }, + { + "epoch": 0.6716201716738197, + "grad_norm": 0.46875, + "learning_rate": 4.85864883072504e-06, + "loss": 2.2518, + "step": 12519 + }, + { + "epoch": 0.6716738197424893, + "grad_norm": 0.392578125, + "learning_rate": 4.858620030813062e-06, + "loss": 2.3075, + "step": 12520 + }, + { + "epoch": 0.6717274678111588, + "grad_norm": 0.431640625, + "learning_rate": 4.858591228052807e-06, + "loss": 2.2605, + "step": 12521 + }, + { + "epoch": 0.6717811158798284, + "grad_norm": 0.4609375, + "learning_rate": 4.858562422444308e-06, + "loss": 2.4732, + "step": 12522 + }, + { + "epoch": 0.6718347639484978, + "grad_norm": 0.490234375, + "learning_rate": 4.858533613987601e-06, + "loss": 2.4651, + "step": 12523 + }, + { + "epoch": 0.6718884120171674, + "grad_norm": 0.5546875, + "learning_rate": 4.858504802682718e-06, + "loss": 2.3864, + "step": 12524 + }, + { + "epoch": 0.6719420600858369, + "grad_norm": 0.4375, + "learning_rate": 4.858475988529697e-06, + "loss": 2.4988, + "step": 12525 + }, + { + "epoch": 0.6719957081545065, + "grad_norm": 0.57421875, + "learning_rate": 4.858447171528572e-06, + "loss": 2.2601, + "step": 12526 + }, + { + "epoch": 0.672049356223176, + "grad_norm": 0.4453125, + "learning_rate": 4.858418351679377e-06, + "loss": 2.3194, + "step": 12527 + }, + { + "epoch": 0.6721030042918454, + "grad_norm": 0.357421875, + "learning_rate": 4.858389528982147e-06, + "loss": 1.8581, + "step": 12528 + }, + { + "epoch": 0.672156652360515, + "grad_norm": 0.416015625, + "learning_rate": 4.858360703436917e-06, + "loss": 2.2616, + "step": 12529 + }, + { + "epoch": 0.6722103004291845, + "grad_norm": 0.5234375, + "learning_rate": 4.8583318750437216e-06, + "loss": 2.3725, + "step": 12530 + }, + { + "epoch": 0.6722639484978541, + "grad_norm": 0.447265625, + "learning_rate": 4.858303043802596e-06, + "loss": 2.4553, + "step": 12531 + }, + { + "epoch": 0.6723175965665236, + "grad_norm": 0.451171875, + "learning_rate": 4.858274209713575e-06, + "loss": 2.2231, + "step": 12532 + }, + { + "epoch": 0.6723712446351932, + "grad_norm": 0.51171875, + "learning_rate": 4.858245372776693e-06, + "loss": 2.4055, + "step": 12533 + }, + { + "epoch": 0.6724248927038626, + "grad_norm": 0.455078125, + "learning_rate": 4.858216532991985e-06, + "loss": 2.3877, + "step": 12534 + }, + { + "epoch": 0.6724785407725322, + "grad_norm": 0.484375, + "learning_rate": 4.858187690359485e-06, + "loss": 2.288, + "step": 12535 + }, + { + "epoch": 0.6725321888412017, + "grad_norm": 0.43359375, + "learning_rate": 4.85815884487923e-06, + "loss": 2.3496, + "step": 12536 + }, + { + "epoch": 0.6725858369098713, + "grad_norm": 0.37890625, + "learning_rate": 4.858129996551253e-06, + "loss": 2.3066, + "step": 12537 + }, + { + "epoch": 0.6726394849785408, + "grad_norm": 0.5234375, + "learning_rate": 4.85810114537559e-06, + "loss": 2.2828, + "step": 12538 + }, + { + "epoch": 0.6726931330472103, + "grad_norm": 0.486328125, + "learning_rate": 4.858072291352275e-06, + "loss": 2.2332, + "step": 12539 + }, + { + "epoch": 0.6727467811158798, + "grad_norm": 0.51953125, + "learning_rate": 4.858043434481342e-06, + "loss": 2.3695, + "step": 12540 + }, + { + "epoch": 0.6728004291845494, + "grad_norm": 0.41015625, + "learning_rate": 4.858014574762828e-06, + "loss": 2.5244, + "step": 12541 + }, + { + "epoch": 0.6728540772532189, + "grad_norm": 0.431640625, + "learning_rate": 4.857985712196766e-06, + "loss": 2.2106, + "step": 12542 + }, + { + "epoch": 0.6729077253218884, + "grad_norm": 0.4140625, + "learning_rate": 4.857956846783193e-06, + "loss": 2.1612, + "step": 12543 + }, + { + "epoch": 0.672961373390558, + "grad_norm": 0.5234375, + "learning_rate": 4.857927978522141e-06, + "loss": 2.4217, + "step": 12544 + }, + { + "epoch": 0.6730150214592274, + "grad_norm": 0.427734375, + "learning_rate": 4.857899107413648e-06, + "loss": 1.9511, + "step": 12545 + }, + { + "epoch": 0.673068669527897, + "grad_norm": 0.5234375, + "learning_rate": 4.8578702334577455e-06, + "loss": 2.2214, + "step": 12546 + }, + { + "epoch": 0.6731223175965665, + "grad_norm": 0.44140625, + "learning_rate": 4.85784135665447e-06, + "loss": 2.4623, + "step": 12547 + }, + { + "epoch": 0.6731759656652361, + "grad_norm": 0.5625, + "learning_rate": 4.857812477003857e-06, + "loss": 2.1093, + "step": 12548 + }, + { + "epoch": 0.6732296137339056, + "grad_norm": 0.4609375, + "learning_rate": 4.857783594505941e-06, + "loss": 2.4319, + "step": 12549 + }, + { + "epoch": 0.6732832618025751, + "grad_norm": 0.50390625, + "learning_rate": 4.857754709160757e-06, + "loss": 2.6545, + "step": 12550 + }, + { + "epoch": 0.6733369098712446, + "grad_norm": 0.384765625, + "learning_rate": 4.857725820968339e-06, + "loss": 2.4148, + "step": 12551 + }, + { + "epoch": 0.6733905579399142, + "grad_norm": 0.427734375, + "learning_rate": 4.857696929928723e-06, + "loss": 1.9926, + "step": 12552 + }, + { + "epoch": 0.6734442060085837, + "grad_norm": 0.453125, + "learning_rate": 4.857668036041942e-06, + "loss": 2.1087, + "step": 12553 + }, + { + "epoch": 0.6734978540772533, + "grad_norm": 0.416015625, + "learning_rate": 4.857639139308034e-06, + "loss": 2.2824, + "step": 12554 + }, + { + "epoch": 0.6735515021459227, + "grad_norm": 0.451171875, + "learning_rate": 4.8576102397270315e-06, + "loss": 2.3265, + "step": 12555 + }, + { + "epoch": 0.6736051502145923, + "grad_norm": 0.7578125, + "learning_rate": 4.85758133729897e-06, + "loss": 1.7308, + "step": 12556 + }, + { + "epoch": 0.6736587982832618, + "grad_norm": 0.447265625, + "learning_rate": 4.857552432023884e-06, + "loss": 2.3559, + "step": 12557 + }, + { + "epoch": 0.6737124463519313, + "grad_norm": 0.447265625, + "learning_rate": 4.857523523901809e-06, + "loss": 2.3122, + "step": 12558 + }, + { + "epoch": 0.6737660944206009, + "grad_norm": 0.4296875, + "learning_rate": 4.857494612932781e-06, + "loss": 2.3283, + "step": 12559 + }, + { + "epoch": 0.6738197424892703, + "grad_norm": 0.55078125, + "learning_rate": 4.857465699116832e-06, + "loss": 2.382, + "step": 12560 + }, + { + "epoch": 0.6738733905579399, + "grad_norm": 0.51953125, + "learning_rate": 4.857436782454e-06, + "loss": 2.276, + "step": 12561 + }, + { + "epoch": 0.6739270386266094, + "grad_norm": 0.4765625, + "learning_rate": 4.8574078629443174e-06, + "loss": 2.3359, + "step": 12562 + }, + { + "epoch": 0.673980686695279, + "grad_norm": 0.447265625, + "learning_rate": 4.85737894058782e-06, + "loss": 1.3447, + "step": 12563 + }, + { + "epoch": 0.6740343347639485, + "grad_norm": 0.46484375, + "learning_rate": 4.857350015384544e-06, + "loss": 2.2518, + "step": 12564 + }, + { + "epoch": 0.6740879828326181, + "grad_norm": 0.470703125, + "learning_rate": 4.857321087334523e-06, + "loss": 2.3759, + "step": 12565 + }, + { + "epoch": 0.6741416309012875, + "grad_norm": 1.046875, + "learning_rate": 4.8572921564377915e-06, + "loss": 2.1801, + "step": 12566 + }, + { + "epoch": 0.6741952789699571, + "grad_norm": 0.41015625, + "learning_rate": 4.8572632226943864e-06, + "loss": 2.1873, + "step": 12567 + }, + { + "epoch": 0.6742489270386266, + "grad_norm": 0.466796875, + "learning_rate": 4.85723428610434e-06, + "loss": 2.3836, + "step": 12568 + }, + { + "epoch": 0.6743025751072962, + "grad_norm": 0.4453125, + "learning_rate": 4.857205346667689e-06, + "loss": 2.3755, + "step": 12569 + }, + { + "epoch": 0.6743562231759657, + "grad_norm": 0.671875, + "learning_rate": 4.857176404384469e-06, + "loss": 1.4355, + "step": 12570 + }, + { + "epoch": 0.6744098712446351, + "grad_norm": 0.4765625, + "learning_rate": 4.857147459254713e-06, + "loss": 2.3121, + "step": 12571 + }, + { + "epoch": 0.6744635193133047, + "grad_norm": 0.4453125, + "learning_rate": 4.857118511278457e-06, + "loss": 2.1192, + "step": 12572 + }, + { + "epoch": 0.6745171673819742, + "grad_norm": 0.435546875, + "learning_rate": 4.857089560455737e-06, + "loss": 2.3757, + "step": 12573 + }, + { + "epoch": 0.6745708154506438, + "grad_norm": 0.4921875, + "learning_rate": 4.857060606786585e-06, + "loss": 2.5113, + "step": 12574 + }, + { + "epoch": 0.6746244635193133, + "grad_norm": 0.68359375, + "learning_rate": 4.857031650271039e-06, + "loss": 2.4231, + "step": 12575 + }, + { + "epoch": 0.6746781115879829, + "grad_norm": 0.42578125, + "learning_rate": 4.857002690909133e-06, + "loss": 2.2784, + "step": 12576 + }, + { + "epoch": 0.6747317596566523, + "grad_norm": 0.48828125, + "learning_rate": 4.856973728700901e-06, + "loss": 2.3156, + "step": 12577 + }, + { + "epoch": 0.6747854077253219, + "grad_norm": 0.396484375, + "learning_rate": 4.856944763646378e-06, + "loss": 2.4221, + "step": 12578 + }, + { + "epoch": 0.6748390557939914, + "grad_norm": 0.58984375, + "learning_rate": 4.856915795745601e-06, + "loss": 2.4856, + "step": 12579 + }, + { + "epoch": 0.674892703862661, + "grad_norm": 0.4609375, + "learning_rate": 4.856886824998603e-06, + "loss": 2.452, + "step": 12580 + }, + { + "epoch": 0.6749463519313305, + "grad_norm": 0.4765625, + "learning_rate": 4.85685785140542e-06, + "loss": 2.1973, + "step": 12581 + }, + { + "epoch": 0.675, + "grad_norm": 0.4921875, + "learning_rate": 4.856828874966086e-06, + "loss": 2.1959, + "step": 12582 + }, + { + "epoch": 0.6750536480686695, + "grad_norm": 0.435546875, + "learning_rate": 4.856799895680637e-06, + "loss": 2.469, + "step": 12583 + }, + { + "epoch": 0.6751072961373391, + "grad_norm": 0.470703125, + "learning_rate": 4.856770913549107e-06, + "loss": 2.355, + "step": 12584 + }, + { + "epoch": 0.6751609442060086, + "grad_norm": 0.40625, + "learning_rate": 4.856741928571532e-06, + "loss": 2.3163, + "step": 12585 + }, + { + "epoch": 0.6752145922746781, + "grad_norm": 0.455078125, + "learning_rate": 4.856712940747947e-06, + "loss": 2.5179, + "step": 12586 + }, + { + "epoch": 0.6752682403433476, + "grad_norm": 0.416015625, + "learning_rate": 4.856683950078387e-06, + "loss": 2.2818, + "step": 12587 + }, + { + "epoch": 0.6753218884120171, + "grad_norm": 0.400390625, + "learning_rate": 4.8566549565628865e-06, + "loss": 2.2808, + "step": 12588 + }, + { + "epoch": 0.6753755364806867, + "grad_norm": 0.484375, + "learning_rate": 4.85662596020148e-06, + "loss": 2.4493, + "step": 12589 + }, + { + "epoch": 0.6754291845493562, + "grad_norm": 0.50390625, + "learning_rate": 4.8565969609942034e-06, + "loss": 2.1927, + "step": 12590 + }, + { + "epoch": 0.6754828326180258, + "grad_norm": 0.431640625, + "learning_rate": 4.856567958941091e-06, + "loss": 2.2909, + "step": 12591 + }, + { + "epoch": 0.6755364806866953, + "grad_norm": 0.5546875, + "learning_rate": 4.856538954042179e-06, + "loss": 1.5986, + "step": 12592 + }, + { + "epoch": 0.6755901287553648, + "grad_norm": 0.478515625, + "learning_rate": 4.856509946297501e-06, + "loss": 1.9933, + "step": 12593 + }, + { + "epoch": 0.6756437768240343, + "grad_norm": 0.390625, + "learning_rate": 4.856480935707094e-06, + "loss": 2.124, + "step": 12594 + }, + { + "epoch": 0.6756974248927039, + "grad_norm": 0.40625, + "learning_rate": 4.85645192227099e-06, + "loss": 2.3859, + "step": 12595 + }, + { + "epoch": 0.6757510729613734, + "grad_norm": 0.5, + "learning_rate": 4.856422905989228e-06, + "loss": 2.2525, + "step": 12596 + }, + { + "epoch": 0.675804721030043, + "grad_norm": 0.486328125, + "learning_rate": 4.856393886861839e-06, + "loss": 2.4381, + "step": 12597 + }, + { + "epoch": 0.6758583690987124, + "grad_norm": 0.50390625, + "learning_rate": 4.8563648648888605e-06, + "loss": 2.1525, + "step": 12598 + }, + { + "epoch": 0.675912017167382, + "grad_norm": 0.51953125, + "learning_rate": 4.8563358400703274e-06, + "loss": 2.2483, + "step": 12599 + }, + { + "epoch": 0.6759656652360515, + "grad_norm": 0.42578125, + "learning_rate": 4.856306812406274e-06, + "loss": 2.3103, + "step": 12600 + }, + { + "epoch": 0.676019313304721, + "grad_norm": 0.4609375, + "learning_rate": 4.856277781896735e-06, + "loss": 2.0467, + "step": 12601 + }, + { + "epoch": 0.6760729613733906, + "grad_norm": 0.42578125, + "learning_rate": 4.8562487485417474e-06, + "loss": 2.3124, + "step": 12602 + }, + { + "epoch": 0.67612660944206, + "grad_norm": 0.4453125, + "learning_rate": 4.856219712341345e-06, + "loss": 2.2226, + "step": 12603 + }, + { + "epoch": 0.6761802575107296, + "grad_norm": 0.4921875, + "learning_rate": 4.856190673295562e-06, + "loss": 2.3319, + "step": 12604 + }, + { + "epoch": 0.6762339055793991, + "grad_norm": 0.365234375, + "learning_rate": 4.856161631404434e-06, + "loss": 1.9439, + "step": 12605 + }, + { + "epoch": 0.6762875536480687, + "grad_norm": 0.4453125, + "learning_rate": 4.856132586667997e-06, + "loss": 2.336, + "step": 12606 + }, + { + "epoch": 0.6763412017167382, + "grad_norm": 0.5859375, + "learning_rate": 4.856103539086286e-06, + "loss": 2.4578, + "step": 12607 + }, + { + "epoch": 0.6763948497854078, + "grad_norm": 0.46875, + "learning_rate": 4.8560744886593344e-06, + "loss": 2.2679, + "step": 12608 + }, + { + "epoch": 0.6764484978540772, + "grad_norm": 0.40234375, + "learning_rate": 4.856045435387179e-06, + "loss": 2.3595, + "step": 12609 + }, + { + "epoch": 0.6765021459227468, + "grad_norm": 0.96875, + "learning_rate": 4.8560163792698545e-06, + "loss": 2.1481, + "step": 12610 + }, + { + "epoch": 0.6765557939914163, + "grad_norm": 0.435546875, + "learning_rate": 4.8559873203073945e-06, + "loss": 1.3531, + "step": 12611 + }, + { + "epoch": 0.6766094420600859, + "grad_norm": 0.490234375, + "learning_rate": 4.855958258499837e-06, + "loss": 2.1455, + "step": 12612 + }, + { + "epoch": 0.6766630901287554, + "grad_norm": 0.5, + "learning_rate": 4.855929193847215e-06, + "loss": 2.3504, + "step": 12613 + }, + { + "epoch": 0.6767167381974248, + "grad_norm": 0.466796875, + "learning_rate": 4.855900126349564e-06, + "loss": 2.2276, + "step": 12614 + }, + { + "epoch": 0.6767703862660944, + "grad_norm": 0.486328125, + "learning_rate": 4.855871056006919e-06, + "loss": 2.364, + "step": 12615 + }, + { + "epoch": 0.6768240343347639, + "grad_norm": 0.515625, + "learning_rate": 4.855841982819316e-06, + "loss": 2.4897, + "step": 12616 + }, + { + "epoch": 0.6768776824034335, + "grad_norm": 0.49609375, + "learning_rate": 4.855812906786788e-06, + "loss": 2.2919, + "step": 12617 + }, + { + "epoch": 0.676931330472103, + "grad_norm": 0.455078125, + "learning_rate": 4.855783827909373e-06, + "loss": 2.0476, + "step": 12618 + }, + { + "epoch": 0.6769849785407726, + "grad_norm": 0.412109375, + "learning_rate": 4.855754746187104e-06, + "loss": 2.093, + "step": 12619 + }, + { + "epoch": 0.677038626609442, + "grad_norm": 0.59765625, + "learning_rate": 4.855725661620016e-06, + "loss": 1.9557, + "step": 12620 + }, + { + "epoch": 0.6770922746781116, + "grad_norm": 0.44921875, + "learning_rate": 4.855696574208146e-06, + "loss": 2.1739, + "step": 12621 + }, + { + "epoch": 0.6771459227467811, + "grad_norm": 0.5546875, + "learning_rate": 4.855667483951527e-06, + "loss": 2.1745, + "step": 12622 + }, + { + "epoch": 0.6771995708154507, + "grad_norm": 0.38671875, + "learning_rate": 4.855638390850196e-06, + "loss": 2.3284, + "step": 12623 + }, + { + "epoch": 0.6772532188841202, + "grad_norm": 0.625, + "learning_rate": 4.855609294904186e-06, + "loss": 2.3476, + "step": 12624 + }, + { + "epoch": 0.6773068669527897, + "grad_norm": 0.474609375, + "learning_rate": 4.855580196113535e-06, + "loss": 2.3978, + "step": 12625 + }, + { + "epoch": 0.6773605150214592, + "grad_norm": 0.421875, + "learning_rate": 4.855551094478276e-06, + "loss": 2.2696, + "step": 12626 + }, + { + "epoch": 0.6774141630901288, + "grad_norm": 0.416015625, + "learning_rate": 4.855521989998444e-06, + "loss": 2.1374, + "step": 12627 + }, + { + "epoch": 0.6774678111587983, + "grad_norm": 0.416015625, + "learning_rate": 4.855492882674075e-06, + "loss": 2.3093, + "step": 12628 + }, + { + "epoch": 0.6775214592274678, + "grad_norm": 0.4609375, + "learning_rate": 4.855463772505205e-06, + "loss": 2.2369, + "step": 12629 + }, + { + "epoch": 0.6775751072961373, + "grad_norm": 0.484375, + "learning_rate": 4.8554346594918676e-06, + "loss": 2.248, + "step": 12630 + }, + { + "epoch": 0.6776287553648068, + "grad_norm": 0.412109375, + "learning_rate": 4.855405543634098e-06, + "loss": 2.2725, + "step": 12631 + }, + { + "epoch": 0.6776824034334764, + "grad_norm": 0.423828125, + "learning_rate": 4.855376424931932e-06, + "loss": 1.9811, + "step": 12632 + }, + { + "epoch": 0.6777360515021459, + "grad_norm": 0.421875, + "learning_rate": 4.855347303385405e-06, + "loss": 2.1499, + "step": 12633 + }, + { + "epoch": 0.6777896995708155, + "grad_norm": 0.466796875, + "learning_rate": 4.855318178994552e-06, + "loss": 2.6702, + "step": 12634 + }, + { + "epoch": 0.677843347639485, + "grad_norm": 0.5078125, + "learning_rate": 4.855289051759407e-06, + "loss": 2.3291, + "step": 12635 + }, + { + "epoch": 0.6778969957081545, + "grad_norm": 0.443359375, + "learning_rate": 4.855259921680006e-06, + "loss": 2.2084, + "step": 12636 + }, + { + "epoch": 0.677950643776824, + "grad_norm": 0.44921875, + "learning_rate": 4.855230788756385e-06, + "loss": 2.3914, + "step": 12637 + }, + { + "epoch": 0.6780042918454936, + "grad_norm": 0.453125, + "learning_rate": 4.855201652988578e-06, + "loss": 1.9932, + "step": 12638 + }, + { + "epoch": 0.6780579399141631, + "grad_norm": 0.515625, + "learning_rate": 4.855172514376622e-06, + "loss": 2.3948, + "step": 12639 + }, + { + "epoch": 0.6781115879828327, + "grad_norm": 5.15625, + "learning_rate": 4.855143372920549e-06, + "loss": 1.2581, + "step": 12640 + }, + { + "epoch": 0.6781652360515021, + "grad_norm": 0.427734375, + "learning_rate": 4.855114228620396e-06, + "loss": 2.2659, + "step": 12641 + }, + { + "epoch": 0.6782188841201717, + "grad_norm": 0.41796875, + "learning_rate": 4.855085081476199e-06, + "loss": 2.1148, + "step": 12642 + }, + { + "epoch": 0.6782725321888412, + "grad_norm": 0.421875, + "learning_rate": 4.855055931487993e-06, + "loss": 2.3002, + "step": 12643 + }, + { + "epoch": 0.6783261802575107, + "grad_norm": 0.40234375, + "learning_rate": 4.855026778655812e-06, + "loss": 2.2737, + "step": 12644 + }, + { + "epoch": 0.6783798283261803, + "grad_norm": 0.400390625, + "learning_rate": 4.854997622979693e-06, + "loss": 2.2341, + "step": 12645 + }, + { + "epoch": 0.6784334763948497, + "grad_norm": 0.43359375, + "learning_rate": 4.854968464459669e-06, + "loss": 2.2136, + "step": 12646 + }, + { + "epoch": 0.6784871244635193, + "grad_norm": 0.515625, + "learning_rate": 4.8549393030957755e-06, + "loss": 2.1988, + "step": 12647 + }, + { + "epoch": 0.6785407725321888, + "grad_norm": 0.41015625, + "learning_rate": 4.85491013888805e-06, + "loss": 2.344, + "step": 12648 + }, + { + "epoch": 0.6785944206008584, + "grad_norm": 0.47265625, + "learning_rate": 4.854880971836525e-06, + "loss": 2.3994, + "step": 12649 + }, + { + "epoch": 0.6786480686695279, + "grad_norm": 0.453125, + "learning_rate": 4.854851801941237e-06, + "loss": 2.4463, + "step": 12650 + }, + { + "epoch": 0.6787017167381975, + "grad_norm": 0.435546875, + "learning_rate": 4.854822629202222e-06, + "loss": 2.506, + "step": 12651 + }, + { + "epoch": 0.6787553648068669, + "grad_norm": 0.43359375, + "learning_rate": 4.854793453619514e-06, + "loss": 2.4141, + "step": 12652 + }, + { + "epoch": 0.6788090128755365, + "grad_norm": 0.515625, + "learning_rate": 4.854764275193148e-06, + "loss": 2.4908, + "step": 12653 + }, + { + "epoch": 0.678862660944206, + "grad_norm": 0.46875, + "learning_rate": 4.8547350939231605e-06, + "loss": 2.4408, + "step": 12654 + }, + { + "epoch": 0.6789163090128756, + "grad_norm": 0.416015625, + "learning_rate": 4.854705909809586e-06, + "loss": 2.2668, + "step": 12655 + }, + { + "epoch": 0.6789699570815451, + "grad_norm": 0.50390625, + "learning_rate": 4.854676722852459e-06, + "loss": 2.5308, + "step": 12656 + }, + { + "epoch": 0.6790236051502145, + "grad_norm": 0.466796875, + "learning_rate": 4.854647533051817e-06, + "loss": 2.0071, + "step": 12657 + }, + { + "epoch": 0.6790772532188841, + "grad_norm": 0.408203125, + "learning_rate": 4.854618340407693e-06, + "loss": 2.2618, + "step": 12658 + }, + { + "epoch": 0.6791309012875536, + "grad_norm": 1.3359375, + "learning_rate": 4.854589144920123e-06, + "loss": 2.2062, + "step": 12659 + }, + { + "epoch": 0.6791845493562232, + "grad_norm": 1.6328125, + "learning_rate": 4.854559946589142e-06, + "loss": 2.3659, + "step": 12660 + }, + { + "epoch": 0.6792381974248927, + "grad_norm": 0.62890625, + "learning_rate": 4.854530745414786e-06, + "loss": 2.1632, + "step": 12661 + }, + { + "epoch": 0.6792918454935623, + "grad_norm": 0.5078125, + "learning_rate": 4.85450154139709e-06, + "loss": 2.6386, + "step": 12662 + }, + { + "epoch": 0.6793454935622317, + "grad_norm": 0.8203125, + "learning_rate": 4.854472334536089e-06, + "loss": 2.0966, + "step": 12663 + }, + { + "epoch": 0.6793991416309013, + "grad_norm": 0.443359375, + "learning_rate": 4.854443124831819e-06, + "loss": 2.3301, + "step": 12664 + }, + { + "epoch": 0.6794527896995708, + "grad_norm": 0.44921875, + "learning_rate": 4.8544139122843135e-06, + "loss": 2.3595, + "step": 12665 + }, + { + "epoch": 0.6795064377682404, + "grad_norm": 0.46484375, + "learning_rate": 4.85438469689361e-06, + "loss": 2.307, + "step": 12666 + }, + { + "epoch": 0.6795600858369099, + "grad_norm": 0.474609375, + "learning_rate": 4.854355478659742e-06, + "loss": 2.1022, + "step": 12667 + }, + { + "epoch": 0.6796137339055794, + "grad_norm": 0.51171875, + "learning_rate": 4.854326257582745e-06, + "loss": 2.5401, + "step": 12668 + }, + { + "epoch": 0.6796673819742489, + "grad_norm": 0.43359375, + "learning_rate": 4.854297033662656e-06, + "loss": 2.3536, + "step": 12669 + }, + { + "epoch": 0.6797210300429185, + "grad_norm": 0.5, + "learning_rate": 4.854267806899509e-06, + "loss": 2.2706, + "step": 12670 + }, + { + "epoch": 0.679774678111588, + "grad_norm": 0.54296875, + "learning_rate": 4.854238577293339e-06, + "loss": 2.1222, + "step": 12671 + }, + { + "epoch": 0.6798283261802575, + "grad_norm": 0.419921875, + "learning_rate": 4.854209344844181e-06, + "loss": 2.4186, + "step": 12672 + }, + { + "epoch": 0.679881974248927, + "grad_norm": 0.470703125, + "learning_rate": 4.854180109552072e-06, + "loss": 2.2238, + "step": 12673 + }, + { + "epoch": 0.6799356223175965, + "grad_norm": 0.54296875, + "learning_rate": 4.854150871417046e-06, + "loss": 2.3162, + "step": 12674 + }, + { + "epoch": 0.6799892703862661, + "grad_norm": 0.447265625, + "learning_rate": 4.854121630439138e-06, + "loss": 2.3421, + "step": 12675 + }, + { + "epoch": 0.6800429184549356, + "grad_norm": 0.462890625, + "learning_rate": 4.854092386618384e-06, + "loss": 2.4187, + "step": 12676 + }, + { + "epoch": 0.6800965665236052, + "grad_norm": 0.490234375, + "learning_rate": 4.85406313995482e-06, + "loss": 2.4221, + "step": 12677 + }, + { + "epoch": 0.6801502145922746, + "grad_norm": 0.4765625, + "learning_rate": 4.854033890448481e-06, + "loss": 2.4142, + "step": 12678 + }, + { + "epoch": 0.6802038626609442, + "grad_norm": 0.44921875, + "learning_rate": 4.8540046380994e-06, + "loss": 2.2045, + "step": 12679 + }, + { + "epoch": 0.6802575107296137, + "grad_norm": 0.52734375, + "learning_rate": 4.853975382907616e-06, + "loss": 2.561, + "step": 12680 + }, + { + "epoch": 0.6803111587982833, + "grad_norm": 0.53125, + "learning_rate": 4.853946124873161e-06, + "loss": 1.4767, + "step": 12681 + }, + { + "epoch": 0.6803648068669528, + "grad_norm": 0.4453125, + "learning_rate": 4.853916863996072e-06, + "loss": 2.5475, + "step": 12682 + }, + { + "epoch": 0.6804184549356224, + "grad_norm": 0.46484375, + "learning_rate": 4.8538876002763845e-06, + "loss": 2.1947, + "step": 12683 + }, + { + "epoch": 0.6804721030042918, + "grad_norm": 0.345703125, + "learning_rate": 4.853858333714134e-06, + "loss": 2.1924, + "step": 12684 + }, + { + "epoch": 0.6805257510729614, + "grad_norm": 0.51171875, + "learning_rate": 4.8538290643093555e-06, + "loss": 2.3768, + "step": 12685 + }, + { + "epoch": 0.6805793991416309, + "grad_norm": 0.470703125, + "learning_rate": 4.853799792062083e-06, + "loss": 2.3469, + "step": 12686 + }, + { + "epoch": 0.6806330472103004, + "grad_norm": 0.482421875, + "learning_rate": 4.853770516972353e-06, + "loss": 2.1141, + "step": 12687 + }, + { + "epoch": 0.68068669527897, + "grad_norm": 0.4765625, + "learning_rate": 4.853741239040203e-06, + "loss": 2.1777, + "step": 12688 + }, + { + "epoch": 0.6807403433476394, + "grad_norm": 0.423828125, + "learning_rate": 4.8537119582656644e-06, + "loss": 1.6733, + "step": 12689 + }, + { + "epoch": 0.680793991416309, + "grad_norm": 0.447265625, + "learning_rate": 4.853682674648775e-06, + "loss": 2.2257, + "step": 12690 + }, + { + "epoch": 0.6808476394849785, + "grad_norm": 0.4296875, + "learning_rate": 4.85365338818957e-06, + "loss": 2.3214, + "step": 12691 + }, + { + "epoch": 0.6809012875536481, + "grad_norm": 0.349609375, + "learning_rate": 4.853624098888083e-06, + "loss": 2.1201, + "step": 12692 + }, + { + "epoch": 0.6809549356223176, + "grad_norm": 0.58203125, + "learning_rate": 4.853594806744353e-06, + "loss": 2.1543, + "step": 12693 + }, + { + "epoch": 0.6810085836909872, + "grad_norm": 0.4375, + "learning_rate": 4.853565511758411e-06, + "loss": 2.3863, + "step": 12694 + }, + { + "epoch": 0.6810622317596566, + "grad_norm": 0.427734375, + "learning_rate": 4.853536213930295e-06, + "loss": 2.1193, + "step": 12695 + }, + { + "epoch": 0.6811158798283262, + "grad_norm": 0.47265625, + "learning_rate": 4.85350691326004e-06, + "loss": 2.2245, + "step": 12696 + }, + { + "epoch": 0.6811695278969957, + "grad_norm": 0.466796875, + "learning_rate": 4.853477609747681e-06, + "loss": 2.3257, + "step": 12697 + }, + { + "epoch": 0.6812231759656653, + "grad_norm": 0.408203125, + "learning_rate": 4.853448303393254e-06, + "loss": 1.8116, + "step": 12698 + }, + { + "epoch": 0.6812768240343348, + "grad_norm": 0.49609375, + "learning_rate": 4.853418994196794e-06, + "loss": 2.3174, + "step": 12699 + }, + { + "epoch": 0.6813304721030042, + "grad_norm": 0.42578125, + "learning_rate": 4.853389682158336e-06, + "loss": 2.3773, + "step": 12700 + }, + { + "epoch": 0.6813841201716738, + "grad_norm": 0.46484375, + "learning_rate": 4.8533603672779165e-06, + "loss": 2.5045, + "step": 12701 + }, + { + "epoch": 0.6814377682403433, + "grad_norm": 0.4453125, + "learning_rate": 4.85333104955557e-06, + "loss": 2.3892, + "step": 12702 + }, + { + "epoch": 0.6814914163090129, + "grad_norm": 0.4296875, + "learning_rate": 4.853301728991331e-06, + "loss": 2.2981, + "step": 12703 + }, + { + "epoch": 0.6815450643776824, + "grad_norm": 0.4453125, + "learning_rate": 4.8532724055852374e-06, + "loss": 2.1296, + "step": 12704 + }, + { + "epoch": 0.681598712446352, + "grad_norm": 0.4296875, + "learning_rate": 4.853243079337323e-06, + "loss": 2.2329, + "step": 12705 + }, + { + "epoch": 0.6816523605150214, + "grad_norm": 0.458984375, + "learning_rate": 4.853213750247623e-06, + "loss": 2.3145, + "step": 12706 + }, + { + "epoch": 0.681706008583691, + "grad_norm": 0.427734375, + "learning_rate": 4.853184418316173e-06, + "loss": 2.3705, + "step": 12707 + }, + { + "epoch": 0.6817596566523605, + "grad_norm": 0.671875, + "learning_rate": 4.853155083543009e-06, + "loss": 2.3999, + "step": 12708 + }, + { + "epoch": 0.6818133047210301, + "grad_norm": 0.419921875, + "learning_rate": 4.853125745928166e-06, + "loss": 2.4163, + "step": 12709 + }, + { + "epoch": 0.6818669527896996, + "grad_norm": 0.361328125, + "learning_rate": 4.85309640547168e-06, + "loss": 2.1544, + "step": 12710 + }, + { + "epoch": 0.6819206008583691, + "grad_norm": 1.046875, + "learning_rate": 4.853067062173585e-06, + "loss": 2.2486, + "step": 12711 + }, + { + "epoch": 0.6819742489270386, + "grad_norm": 0.5390625, + "learning_rate": 4.8530377160339186e-06, + "loss": 2.4506, + "step": 12712 + }, + { + "epoch": 0.6820278969957082, + "grad_norm": 0.40234375, + "learning_rate": 4.853008367052715e-06, + "loss": 2.131, + "step": 12713 + }, + { + "epoch": 0.6820815450643777, + "grad_norm": 0.490234375, + "learning_rate": 4.852979015230009e-06, + "loss": 2.1259, + "step": 12714 + }, + { + "epoch": 0.6821351931330472, + "grad_norm": 0.37890625, + "learning_rate": 4.8529496605658365e-06, + "loss": 1.9769, + "step": 12715 + }, + { + "epoch": 0.6821888412017167, + "grad_norm": 0.57421875, + "learning_rate": 4.852920303060234e-06, + "loss": 2.2635, + "step": 12716 + }, + { + "epoch": 0.6822424892703862, + "grad_norm": 0.41796875, + "learning_rate": 4.852890942713236e-06, + "loss": 2.247, + "step": 12717 + }, + { + "epoch": 0.6822961373390558, + "grad_norm": 0.4765625, + "learning_rate": 4.852861579524877e-06, + "loss": 2.3567, + "step": 12718 + }, + { + "epoch": 0.6823497854077253, + "grad_norm": 1.703125, + "learning_rate": 4.852832213495194e-06, + "loss": 2.385, + "step": 12719 + }, + { + "epoch": 0.6824034334763949, + "grad_norm": 0.41796875, + "learning_rate": 4.852802844624223e-06, + "loss": 2.4627, + "step": 12720 + }, + { + "epoch": 0.6824570815450643, + "grad_norm": 0.42578125, + "learning_rate": 4.852773472911998e-06, + "loss": 2.151, + "step": 12721 + }, + { + "epoch": 0.6825107296137339, + "grad_norm": 0.388671875, + "learning_rate": 4.852744098358554e-06, + "loss": 2.2107, + "step": 12722 + }, + { + "epoch": 0.6825643776824034, + "grad_norm": 0.41796875, + "learning_rate": 4.852714720963928e-06, + "loss": 1.9504, + "step": 12723 + }, + { + "epoch": 0.682618025751073, + "grad_norm": 0.458984375, + "learning_rate": 4.852685340728155e-06, + "loss": 2.2785, + "step": 12724 + }, + { + "epoch": 0.6826716738197425, + "grad_norm": 0.43359375, + "learning_rate": 4.852655957651271e-06, + "loss": 2.1967, + "step": 12725 + }, + { + "epoch": 0.6827253218884121, + "grad_norm": 0.404296875, + "learning_rate": 4.85262657173331e-06, + "loss": 2.2051, + "step": 12726 + }, + { + "epoch": 0.6827789699570815, + "grad_norm": 0.41015625, + "learning_rate": 4.852597182974308e-06, + "loss": 2.5159, + "step": 12727 + }, + { + "epoch": 0.6828326180257511, + "grad_norm": 0.375, + "learning_rate": 4.852567791374302e-06, + "loss": 2.0664, + "step": 12728 + }, + { + "epoch": 0.6828862660944206, + "grad_norm": 0.431640625, + "learning_rate": 4.852538396933326e-06, + "loss": 2.2745, + "step": 12729 + }, + { + "epoch": 0.6829399141630901, + "grad_norm": 0.4765625, + "learning_rate": 4.852508999651416e-06, + "loss": 2.2205, + "step": 12730 + }, + { + "epoch": 0.6829935622317597, + "grad_norm": 0.4765625, + "learning_rate": 4.852479599528606e-06, + "loss": 2.4256, + "step": 12731 + }, + { + "epoch": 0.6830472103004291, + "grad_norm": 0.5625, + "learning_rate": 4.852450196564934e-06, + "loss": 2.3759, + "step": 12732 + }, + { + "epoch": 0.6831008583690987, + "grad_norm": 0.56640625, + "learning_rate": 4.852420790760434e-06, + "loss": 2.305, + "step": 12733 + }, + { + "epoch": 0.6831545064377682, + "grad_norm": 0.51953125, + "learning_rate": 4.852391382115141e-06, + "loss": 2.3046, + "step": 12734 + }, + { + "epoch": 0.6832081545064378, + "grad_norm": 0.380859375, + "learning_rate": 4.8523619706290924e-06, + "loss": 2.1631, + "step": 12735 + }, + { + "epoch": 0.6832618025751073, + "grad_norm": 0.43359375, + "learning_rate": 4.852332556302323e-06, + "loss": 2.0389, + "step": 12736 + }, + { + "epoch": 0.6833154506437769, + "grad_norm": 0.4296875, + "learning_rate": 4.852303139134868e-06, + "loss": 2.4124, + "step": 12737 + }, + { + "epoch": 0.6833690987124463, + "grad_norm": 4.28125, + "learning_rate": 4.852273719126761e-06, + "loss": 2.2199, + "step": 12738 + }, + { + "epoch": 0.6834227467811159, + "grad_norm": 0.4453125, + "learning_rate": 4.852244296278042e-06, + "loss": 2.2605, + "step": 12739 + }, + { + "epoch": 0.6834763948497854, + "grad_norm": 0.49609375, + "learning_rate": 4.852214870588742e-06, + "loss": 2.3796, + "step": 12740 + }, + { + "epoch": 0.683530042918455, + "grad_norm": 0.453125, + "learning_rate": 4.8521854420588995e-06, + "loss": 2.2782, + "step": 12741 + }, + { + "epoch": 0.6835836909871245, + "grad_norm": 0.4765625, + "learning_rate": 4.852156010688549e-06, + "loss": 2.328, + "step": 12742 + }, + { + "epoch": 0.683637339055794, + "grad_norm": 0.41796875, + "learning_rate": 4.852126576477726e-06, + "loss": 2.122, + "step": 12743 + }, + { + "epoch": 0.6836909871244635, + "grad_norm": 0.375, + "learning_rate": 4.852097139426465e-06, + "loss": 1.9893, + "step": 12744 + }, + { + "epoch": 0.683744635193133, + "grad_norm": 0.439453125, + "learning_rate": 4.852067699534804e-06, + "loss": 2.39, + "step": 12745 + }, + { + "epoch": 0.6837982832618026, + "grad_norm": 0.474609375, + "learning_rate": 4.8520382568027765e-06, + "loss": 2.4284, + "step": 12746 + }, + { + "epoch": 0.6838519313304721, + "grad_norm": 0.427734375, + "learning_rate": 4.852008811230419e-06, + "loss": 2.2292, + "step": 12747 + }, + { + "epoch": 0.6839055793991416, + "grad_norm": 0.65234375, + "learning_rate": 4.851979362817768e-06, + "loss": 2.1811, + "step": 12748 + }, + { + "epoch": 0.6839592274678111, + "grad_norm": 0.4140625, + "learning_rate": 4.851949911564857e-06, + "loss": 2.1949, + "step": 12749 + }, + { + "epoch": 0.6840128755364807, + "grad_norm": 0.451171875, + "learning_rate": 4.851920457471723e-06, + "loss": 2.283, + "step": 12750 + }, + { + "epoch": 0.6840665236051502, + "grad_norm": 0.458984375, + "learning_rate": 4.8518910005384e-06, + "loss": 2.1659, + "step": 12751 + }, + { + "epoch": 0.6841201716738198, + "grad_norm": 0.423828125, + "learning_rate": 4.851861540764925e-06, + "loss": 2.4222, + "step": 12752 + }, + { + "epoch": 0.6841738197424893, + "grad_norm": 0.439453125, + "learning_rate": 4.851832078151333e-06, + "loss": 2.4696, + "step": 12753 + }, + { + "epoch": 0.6842274678111588, + "grad_norm": 0.4765625, + "learning_rate": 4.85180261269766e-06, + "loss": 2.3804, + "step": 12754 + }, + { + "epoch": 0.6842811158798283, + "grad_norm": 0.447265625, + "learning_rate": 4.851773144403941e-06, + "loss": 2.3106, + "step": 12755 + }, + { + "epoch": 0.6843347639484979, + "grad_norm": 0.671875, + "learning_rate": 4.851743673270213e-06, + "loss": 2.3863, + "step": 12756 + }, + { + "epoch": 0.6843884120171674, + "grad_norm": 0.48046875, + "learning_rate": 4.85171419929651e-06, + "loss": 2.5669, + "step": 12757 + }, + { + "epoch": 0.6844420600858369, + "grad_norm": 0.462890625, + "learning_rate": 4.851684722482868e-06, + "loss": 2.424, + "step": 12758 + }, + { + "epoch": 0.6844957081545064, + "grad_norm": 0.4296875, + "learning_rate": 4.851655242829323e-06, + "loss": 2.1961, + "step": 12759 + }, + { + "epoch": 0.6845493562231759, + "grad_norm": 0.439453125, + "learning_rate": 4.851625760335909e-06, + "loss": 2.3124, + "step": 12760 + }, + { + "epoch": 0.6846030042918455, + "grad_norm": 0.5234375, + "learning_rate": 4.851596275002664e-06, + "loss": 2.1225, + "step": 12761 + }, + { + "epoch": 0.684656652360515, + "grad_norm": 0.515625, + "learning_rate": 4.851566786829624e-06, + "loss": 2.2532, + "step": 12762 + }, + { + "epoch": 0.6847103004291846, + "grad_norm": 0.4375, + "learning_rate": 4.8515372958168205e-06, + "loss": 2.4447, + "step": 12763 + }, + { + "epoch": 0.684763948497854, + "grad_norm": 0.4609375, + "learning_rate": 4.8515078019642935e-06, + "loss": 2.0042, + "step": 12764 + }, + { + "epoch": 0.6848175965665236, + "grad_norm": 0.5, + "learning_rate": 4.851478305272076e-06, + "loss": 2.2535, + "step": 12765 + }, + { + "epoch": 0.6848712446351931, + "grad_norm": 0.494140625, + "learning_rate": 4.851448805740204e-06, + "loss": 2.3501, + "step": 12766 + }, + { + "epoch": 0.6849248927038627, + "grad_norm": 0.427734375, + "learning_rate": 4.851419303368715e-06, + "loss": 1.7391, + "step": 12767 + }, + { + "epoch": 0.6849785407725322, + "grad_norm": 0.4140625, + "learning_rate": 4.8513897981576425e-06, + "loss": 2.0992, + "step": 12768 + }, + { + "epoch": 0.6850321888412018, + "grad_norm": 0.490234375, + "learning_rate": 4.851360290107023e-06, + "loss": 2.3258, + "step": 12769 + }, + { + "epoch": 0.6850858369098712, + "grad_norm": 0.46875, + "learning_rate": 4.851330779216892e-06, + "loss": 2.189, + "step": 12770 + }, + { + "epoch": 0.6851394849785408, + "grad_norm": 0.427734375, + "learning_rate": 4.851301265487285e-06, + "loss": 2.1045, + "step": 12771 + }, + { + "epoch": 0.6851931330472103, + "grad_norm": 0.54296875, + "learning_rate": 4.851271748918238e-06, + "loss": 2.2936, + "step": 12772 + }, + { + "epoch": 0.6852467811158798, + "grad_norm": 0.46875, + "learning_rate": 4.851242229509786e-06, + "loss": 2.3077, + "step": 12773 + }, + { + "epoch": 0.6853004291845494, + "grad_norm": 0.474609375, + "learning_rate": 4.851212707261965e-06, + "loss": 2.2978, + "step": 12774 + }, + { + "epoch": 0.6853540772532188, + "grad_norm": 0.5, + "learning_rate": 4.851183182174811e-06, + "loss": 2.2827, + "step": 12775 + }, + { + "epoch": 0.6854077253218884, + "grad_norm": 0.458984375, + "learning_rate": 4.85115365424836e-06, + "loss": 2.0839, + "step": 12776 + }, + { + "epoch": 0.6854613733905579, + "grad_norm": 0.49609375, + "learning_rate": 4.8511241234826465e-06, + "loss": 2.135, + "step": 12777 + }, + { + "epoch": 0.6855150214592275, + "grad_norm": 0.431640625, + "learning_rate": 4.851094589877707e-06, + "loss": 2.3878, + "step": 12778 + }, + { + "epoch": 0.685568669527897, + "grad_norm": 0.478515625, + "learning_rate": 4.851065053433576e-06, + "loss": 2.1973, + "step": 12779 + }, + { + "epoch": 0.6856223175965666, + "grad_norm": 0.396484375, + "learning_rate": 4.85103551415029e-06, + "loss": 2.2258, + "step": 12780 + }, + { + "epoch": 0.685675965665236, + "grad_norm": 0.474609375, + "learning_rate": 4.851005972027886e-06, + "loss": 2.2777, + "step": 12781 + }, + { + "epoch": 0.6857296137339056, + "grad_norm": 0.51171875, + "learning_rate": 4.850976427066398e-06, + "loss": 2.2261, + "step": 12782 + }, + { + "epoch": 0.6857832618025751, + "grad_norm": 0.4921875, + "learning_rate": 4.8509468792658615e-06, + "loss": 2.4445, + "step": 12783 + }, + { + "epoch": 0.6858369098712447, + "grad_norm": 0.546875, + "learning_rate": 4.850917328626313e-06, + "loss": 2.1629, + "step": 12784 + }, + { + "epoch": 0.6858905579399142, + "grad_norm": 0.349609375, + "learning_rate": 4.850887775147788e-06, + "loss": 2.1521, + "step": 12785 + }, + { + "epoch": 0.6859442060085837, + "grad_norm": 1.046875, + "learning_rate": 4.850858218830322e-06, + "loss": 2.295, + "step": 12786 + }, + { + "epoch": 0.6859978540772532, + "grad_norm": 0.384765625, + "learning_rate": 4.85082865967395e-06, + "loss": 2.03, + "step": 12787 + }, + { + "epoch": 0.6860515021459227, + "grad_norm": 0.4765625, + "learning_rate": 4.85079909767871e-06, + "loss": 2.1925, + "step": 12788 + }, + { + "epoch": 0.6861051502145923, + "grad_norm": 0.431640625, + "learning_rate": 4.850769532844635e-06, + "loss": 2.0351, + "step": 12789 + }, + { + "epoch": 0.6861587982832618, + "grad_norm": 0.453125, + "learning_rate": 4.850739965171762e-06, + "loss": 2.3859, + "step": 12790 + }, + { + "epoch": 0.6862124463519313, + "grad_norm": 0.515625, + "learning_rate": 4.850710394660128e-06, + "loss": 2.3576, + "step": 12791 + }, + { + "epoch": 0.6862660944206008, + "grad_norm": 0.59375, + "learning_rate": 4.8506808213097665e-06, + "loss": 1.5238, + "step": 12792 + }, + { + "epoch": 0.6863197424892704, + "grad_norm": 0.490234375, + "learning_rate": 4.850651245120713e-06, + "loss": 2.5347, + "step": 12793 + }, + { + "epoch": 0.6863733905579399, + "grad_norm": 0.376953125, + "learning_rate": 4.850621666093005e-06, + "loss": 1.9016, + "step": 12794 + }, + { + "epoch": 0.6864270386266095, + "grad_norm": 0.400390625, + "learning_rate": 4.850592084226678e-06, + "loss": 2.1588, + "step": 12795 + }, + { + "epoch": 0.686480686695279, + "grad_norm": 0.45703125, + "learning_rate": 4.8505624995217674e-06, + "loss": 2.0633, + "step": 12796 + }, + { + "epoch": 0.6865343347639485, + "grad_norm": 0.5234375, + "learning_rate": 4.850532911978307e-06, + "loss": 2.3722, + "step": 12797 + }, + { + "epoch": 0.686587982832618, + "grad_norm": 0.4453125, + "learning_rate": 4.850503321596336e-06, + "loss": 1.9362, + "step": 12798 + }, + { + "epoch": 0.6866416309012876, + "grad_norm": 0.5390625, + "learning_rate": 4.850473728375888e-06, + "loss": 2.2354, + "step": 12799 + }, + { + "epoch": 0.6866952789699571, + "grad_norm": 0.5078125, + "learning_rate": 4.8504441323169984e-06, + "loss": 2.5566, + "step": 12800 + }, + { + "epoch": 0.6867489270386266, + "grad_norm": 0.46484375, + "learning_rate": 4.850414533419704e-06, + "loss": 2.2593, + "step": 12801 + }, + { + "epoch": 0.6868025751072961, + "grad_norm": 0.765625, + "learning_rate": 4.85038493168404e-06, + "loss": 2.1785, + "step": 12802 + }, + { + "epoch": 0.6868562231759656, + "grad_norm": 0.478515625, + "learning_rate": 4.850355327110043e-06, + "loss": 2.3082, + "step": 12803 + }, + { + "epoch": 0.6869098712446352, + "grad_norm": 0.498046875, + "learning_rate": 4.850325719697747e-06, + "loss": 2.2019, + "step": 12804 + }, + { + "epoch": 0.6869635193133047, + "grad_norm": 0.50390625, + "learning_rate": 4.85029610944719e-06, + "loss": 2.5668, + "step": 12805 + }, + { + "epoch": 0.6870171673819743, + "grad_norm": 0.484375, + "learning_rate": 4.850266496358407e-06, + "loss": 2.3738, + "step": 12806 + }, + { + "epoch": 0.6870708154506437, + "grad_norm": 0.41015625, + "learning_rate": 4.850236880431432e-06, + "loss": 2.2584, + "step": 12807 + }, + { + "epoch": 0.6871244635193133, + "grad_norm": 0.4609375, + "learning_rate": 4.850207261666303e-06, + "loss": 2.0897, + "step": 12808 + }, + { + "epoch": 0.6871781115879828, + "grad_norm": 0.4609375, + "learning_rate": 4.850177640063054e-06, + "loss": 2.2377, + "step": 12809 + }, + { + "epoch": 0.6872317596566524, + "grad_norm": 0.46875, + "learning_rate": 4.850148015621723e-06, + "loss": 1.6721, + "step": 12810 + }, + { + "epoch": 0.6872854077253219, + "grad_norm": 0.447265625, + "learning_rate": 4.850118388342343e-06, + "loss": 2.4062, + "step": 12811 + }, + { + "epoch": 0.6873390557939915, + "grad_norm": 0.4140625, + "learning_rate": 4.850088758224952e-06, + "loss": 2.2891, + "step": 12812 + }, + { + "epoch": 0.6873927038626609, + "grad_norm": 0.41015625, + "learning_rate": 4.850059125269585e-06, + "loss": 2.1556, + "step": 12813 + }, + { + "epoch": 0.6874463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.850029489476278e-06, + "loss": 2.1582, + "step": 12814 + }, + { + "epoch": 0.6875, + "grad_norm": 0.44140625, + "learning_rate": 4.849999850845066e-06, + "loss": 2.3827, + "step": 12815 + }, + { + "epoch": 0.6875536480686695, + "grad_norm": 0.48046875, + "learning_rate": 4.849970209375986e-06, + "loss": 2.2945, + "step": 12816 + }, + { + "epoch": 0.6876072961373391, + "grad_norm": 0.349609375, + "learning_rate": 4.849940565069073e-06, + "loss": 1.7368, + "step": 12817 + }, + { + "epoch": 0.6876609442060085, + "grad_norm": 0.5078125, + "learning_rate": 4.8499109179243624e-06, + "loss": 1.5175, + "step": 12818 + }, + { + "epoch": 0.6877145922746781, + "grad_norm": 0.36328125, + "learning_rate": 4.849881267941891e-06, + "loss": 1.9276, + "step": 12819 + }, + { + "epoch": 0.6877682403433476, + "grad_norm": 0.419921875, + "learning_rate": 4.849851615121694e-06, + "loss": 2.3661, + "step": 12820 + }, + { + "epoch": 0.6878218884120172, + "grad_norm": 0.7578125, + "learning_rate": 4.8498219594638076e-06, + "loss": 2.1904, + "step": 12821 + }, + { + "epoch": 0.6878755364806867, + "grad_norm": 0.419921875, + "learning_rate": 4.849792300968268e-06, + "loss": 2.1767, + "step": 12822 + }, + { + "epoch": 0.6879291845493563, + "grad_norm": 1.296875, + "learning_rate": 4.84976263963511e-06, + "loss": 2.0282, + "step": 12823 + }, + { + "epoch": 0.6879828326180257, + "grad_norm": 0.72265625, + "learning_rate": 4.849732975464369e-06, + "loss": 2.3746, + "step": 12824 + }, + { + "epoch": 0.6880364806866953, + "grad_norm": 0.44921875, + "learning_rate": 4.849703308456082e-06, + "loss": 2.1902, + "step": 12825 + }, + { + "epoch": 0.6880901287553648, + "grad_norm": 0.5, + "learning_rate": 4.849673638610285e-06, + "loss": 2.2943, + "step": 12826 + }, + { + "epoch": 0.6881437768240344, + "grad_norm": 0.453125, + "learning_rate": 4.8496439659270134e-06, + "loss": 2.2981, + "step": 12827 + }, + { + "epoch": 0.6881974248927039, + "grad_norm": 0.4609375, + "learning_rate": 4.849614290406302e-06, + "loss": 2.0788, + "step": 12828 + }, + { + "epoch": 0.6882510729613734, + "grad_norm": 0.52734375, + "learning_rate": 4.849584612048188e-06, + "loss": 2.4239, + "step": 12829 + }, + { + "epoch": 0.6883047210300429, + "grad_norm": 0.5078125, + "learning_rate": 4.849554930852707e-06, + "loss": 2.4132, + "step": 12830 + }, + { + "epoch": 0.6883583690987124, + "grad_norm": 0.376953125, + "learning_rate": 4.849525246819895e-06, + "loss": 2.304, + "step": 12831 + }, + { + "epoch": 0.688412017167382, + "grad_norm": 0.486328125, + "learning_rate": 4.849495559949787e-06, + "loss": 2.3235, + "step": 12832 + }, + { + "epoch": 0.6884656652360515, + "grad_norm": 0.41796875, + "learning_rate": 4.84946587024242e-06, + "loss": 1.867, + "step": 12833 + }, + { + "epoch": 0.688519313304721, + "grad_norm": 0.4453125, + "learning_rate": 4.849436177697828e-06, + "loss": 1.7693, + "step": 12834 + }, + { + "epoch": 0.6885729613733905, + "grad_norm": 0.515625, + "learning_rate": 4.849406482316049e-06, + "loss": 2.4765, + "step": 12835 + }, + { + "epoch": 0.6886266094420601, + "grad_norm": 0.46875, + "learning_rate": 4.849376784097118e-06, + "loss": 2.2362, + "step": 12836 + }, + { + "epoch": 0.6886802575107296, + "grad_norm": 0.423828125, + "learning_rate": 4.84934708304107e-06, + "loss": 2.2853, + "step": 12837 + }, + { + "epoch": 0.6887339055793992, + "grad_norm": 0.490234375, + "learning_rate": 4.849317379147943e-06, + "loss": 2.5206, + "step": 12838 + }, + { + "epoch": 0.6887875536480687, + "grad_norm": 0.58203125, + "learning_rate": 4.849287672417771e-06, + "loss": 2.3197, + "step": 12839 + }, + { + "epoch": 0.6888412017167382, + "grad_norm": 0.42578125, + "learning_rate": 4.849257962850589e-06, + "loss": 2.1594, + "step": 12840 + }, + { + "epoch": 0.6888948497854077, + "grad_norm": 0.458984375, + "learning_rate": 4.849228250446436e-06, + "loss": 2.7862, + "step": 12841 + }, + { + "epoch": 0.6889484978540773, + "grad_norm": 0.51171875, + "learning_rate": 4.849198535205345e-06, + "loss": 2.2945, + "step": 12842 + }, + { + "epoch": 0.6890021459227468, + "grad_norm": 0.43359375, + "learning_rate": 4.849168817127354e-06, + "loss": 2.1874, + "step": 12843 + }, + { + "epoch": 0.6890557939914163, + "grad_norm": 0.478515625, + "learning_rate": 4.849139096212497e-06, + "loss": 2.4017, + "step": 12844 + }, + { + "epoch": 0.6891094420600858, + "grad_norm": 0.45703125, + "learning_rate": 4.849109372460812e-06, + "loss": 2.2928, + "step": 12845 + }, + { + "epoch": 0.6891630901287553, + "grad_norm": 0.376953125, + "learning_rate": 4.849079645872333e-06, + "loss": 2.1026, + "step": 12846 + }, + { + "epoch": 0.6892167381974249, + "grad_norm": 0.46875, + "learning_rate": 4.849049916447096e-06, + "loss": 2.4737, + "step": 12847 + }, + { + "epoch": 0.6892703862660944, + "grad_norm": 0.50390625, + "learning_rate": 4.8490201841851386e-06, + "loss": 2.4402, + "step": 12848 + }, + { + "epoch": 0.689324034334764, + "grad_norm": 0.56640625, + "learning_rate": 4.848990449086496e-06, + "loss": 2.1952, + "step": 12849 + }, + { + "epoch": 0.6893776824034334, + "grad_norm": 0.439453125, + "learning_rate": 4.848960711151202e-06, + "loss": 2.127, + "step": 12850 + }, + { + "epoch": 0.689431330472103, + "grad_norm": 0.443359375, + "learning_rate": 4.848930970379295e-06, + "loss": 2.2638, + "step": 12851 + }, + { + "epoch": 0.6894849785407725, + "grad_norm": 0.7890625, + "learning_rate": 4.8489012267708104e-06, + "loss": 2.1772, + "step": 12852 + }, + { + "epoch": 0.6895386266094421, + "grad_norm": 0.396484375, + "learning_rate": 4.848871480325784e-06, + "loss": 2.3468, + "step": 12853 + }, + { + "epoch": 0.6895922746781116, + "grad_norm": 0.494140625, + "learning_rate": 4.8488417310442515e-06, + "loss": 2.5321, + "step": 12854 + }, + { + "epoch": 0.6896459227467812, + "grad_norm": 0.51953125, + "learning_rate": 4.848811978926249e-06, + "loss": 2.5261, + "step": 12855 + }, + { + "epoch": 0.6896995708154506, + "grad_norm": 0.56640625, + "learning_rate": 4.8487822239718115e-06, + "loss": 1.4485, + "step": 12856 + }, + { + "epoch": 0.6897532188841202, + "grad_norm": 0.42578125, + "learning_rate": 4.848752466180977e-06, + "loss": 2.4075, + "step": 12857 + }, + { + "epoch": 0.6898068669527897, + "grad_norm": 0.423828125, + "learning_rate": 4.84872270555378e-06, + "loss": 2.5739, + "step": 12858 + }, + { + "epoch": 0.6898605150214592, + "grad_norm": 0.439453125, + "learning_rate": 4.848692942090256e-06, + "loss": 2.2718, + "step": 12859 + }, + { + "epoch": 0.6899141630901288, + "grad_norm": 0.57421875, + "learning_rate": 4.8486631757904424e-06, + "loss": 2.5093, + "step": 12860 + }, + { + "epoch": 0.6899678111587982, + "grad_norm": 0.39453125, + "learning_rate": 4.8486334066543735e-06, + "loss": 2.187, + "step": 12861 + }, + { + "epoch": 0.6900214592274678, + "grad_norm": 0.4453125, + "learning_rate": 4.848603634682087e-06, + "loss": 2.3789, + "step": 12862 + }, + { + "epoch": 0.6900751072961373, + "grad_norm": 0.482421875, + "learning_rate": 4.848573859873617e-06, + "loss": 2.3842, + "step": 12863 + }, + { + "epoch": 0.6901287553648069, + "grad_norm": 0.494140625, + "learning_rate": 4.848544082229001e-06, + "loss": 2.156, + "step": 12864 + }, + { + "epoch": 0.6901824034334764, + "grad_norm": 0.404296875, + "learning_rate": 4.8485143017482735e-06, + "loss": 2.1807, + "step": 12865 + }, + { + "epoch": 0.690236051502146, + "grad_norm": 0.578125, + "learning_rate": 4.848484518431472e-06, + "loss": 2.1435, + "step": 12866 + }, + { + "epoch": 0.6902896995708154, + "grad_norm": 2.125, + "learning_rate": 4.848454732278632e-06, + "loss": 1.6514, + "step": 12867 + }, + { + "epoch": 0.690343347639485, + "grad_norm": 0.3671875, + "learning_rate": 4.84842494328979e-06, + "loss": 2.2691, + "step": 12868 + }, + { + "epoch": 0.6903969957081545, + "grad_norm": 0.466796875, + "learning_rate": 4.8483951514649795e-06, + "loss": 2.2521, + "step": 12869 + }, + { + "epoch": 0.6904506437768241, + "grad_norm": 0.419921875, + "learning_rate": 4.848365356804239e-06, + "loss": 2.1408, + "step": 12870 + }, + { + "epoch": 0.6905042918454936, + "grad_norm": 0.45703125, + "learning_rate": 4.848335559307604e-06, + "loss": 2.2229, + "step": 12871 + }, + { + "epoch": 0.6905579399141631, + "grad_norm": 0.41015625, + "learning_rate": 4.84830575897511e-06, + "loss": 2.227, + "step": 12872 + }, + { + "epoch": 0.6906115879828326, + "grad_norm": 0.6484375, + "learning_rate": 4.848275955806793e-06, + "loss": 2.2268, + "step": 12873 + }, + { + "epoch": 0.6906652360515021, + "grad_norm": 0.4765625, + "learning_rate": 4.848246149802689e-06, + "loss": 2.1791, + "step": 12874 + }, + { + "epoch": 0.6907188841201717, + "grad_norm": 0.4921875, + "learning_rate": 4.848216340962835e-06, + "loss": 1.8813, + "step": 12875 + }, + { + "epoch": 0.6907725321888412, + "grad_norm": 0.458984375, + "learning_rate": 4.8481865292872655e-06, + "loss": 2.3463, + "step": 12876 + }, + { + "epoch": 0.6908261802575107, + "grad_norm": 0.4453125, + "learning_rate": 4.848156714776017e-06, + "loss": 2.4028, + "step": 12877 + }, + { + "epoch": 0.6908798283261802, + "grad_norm": 0.404296875, + "learning_rate": 4.848126897429125e-06, + "loss": 2.0865, + "step": 12878 + }, + { + "epoch": 0.6909334763948498, + "grad_norm": 0.4765625, + "learning_rate": 4.848097077246628e-06, + "loss": 2.3442, + "step": 12879 + }, + { + "epoch": 0.6909871244635193, + "grad_norm": 0.50390625, + "learning_rate": 4.848067254228559e-06, + "loss": 2.2068, + "step": 12880 + }, + { + "epoch": 0.6910407725321889, + "grad_norm": 0.4296875, + "learning_rate": 4.848037428374955e-06, + "loss": 2.3763, + "step": 12881 + }, + { + "epoch": 0.6910944206008584, + "grad_norm": 0.60546875, + "learning_rate": 4.848007599685853e-06, + "loss": 2.362, + "step": 12882 + }, + { + "epoch": 0.6911480686695279, + "grad_norm": 0.4765625, + "learning_rate": 4.847977768161288e-06, + "loss": 1.9906, + "step": 12883 + }, + { + "epoch": 0.6912017167381974, + "grad_norm": 0.455078125, + "learning_rate": 4.847947933801296e-06, + "loss": 2.1135, + "step": 12884 + }, + { + "epoch": 0.691255364806867, + "grad_norm": 0.48046875, + "learning_rate": 4.847918096605913e-06, + "loss": 2.4532, + "step": 12885 + }, + { + "epoch": 0.6913090128755365, + "grad_norm": 0.51953125, + "learning_rate": 4.8478882565751756e-06, + "loss": 2.538, + "step": 12886 + }, + { + "epoch": 0.691362660944206, + "grad_norm": 0.443359375, + "learning_rate": 4.8478584137091195e-06, + "loss": 2.1004, + "step": 12887 + }, + { + "epoch": 0.6914163090128755, + "grad_norm": 0.4453125, + "learning_rate": 4.847828568007781e-06, + "loss": 2.3532, + "step": 12888 + }, + { + "epoch": 0.691469957081545, + "grad_norm": 0.50390625, + "learning_rate": 4.847798719471196e-06, + "loss": 2.2594, + "step": 12889 + }, + { + "epoch": 0.6915236051502146, + "grad_norm": 0.5234375, + "learning_rate": 4.8477688680994e-06, + "loss": 2.6574, + "step": 12890 + }, + { + "epoch": 0.6915772532188841, + "grad_norm": 0.490234375, + "learning_rate": 4.84773901389243e-06, + "loss": 2.1851, + "step": 12891 + }, + { + "epoch": 0.6916309012875537, + "grad_norm": 0.68359375, + "learning_rate": 4.847709156850321e-06, + "loss": 2.1063, + "step": 12892 + }, + { + "epoch": 0.6916845493562231, + "grad_norm": 0.4765625, + "learning_rate": 4.84767929697311e-06, + "loss": 2.3974, + "step": 12893 + }, + { + "epoch": 0.6917381974248927, + "grad_norm": 0.400390625, + "learning_rate": 4.847649434260832e-06, + "loss": 2.0893, + "step": 12894 + }, + { + "epoch": 0.6917918454935622, + "grad_norm": 0.74609375, + "learning_rate": 4.847619568713524e-06, + "loss": 2.3117, + "step": 12895 + }, + { + "epoch": 0.6918454935622318, + "grad_norm": 0.4296875, + "learning_rate": 4.847589700331223e-06, + "loss": 1.9125, + "step": 12896 + }, + { + "epoch": 0.6918991416309013, + "grad_norm": 0.69140625, + "learning_rate": 4.847559829113963e-06, + "loss": 1.8522, + "step": 12897 + }, + { + "epoch": 0.6919527896995709, + "grad_norm": 0.416015625, + "learning_rate": 4.847529955061781e-06, + "loss": 2.3649, + "step": 12898 + }, + { + "epoch": 0.6920064377682403, + "grad_norm": 0.443359375, + "learning_rate": 4.847500078174713e-06, + "loss": 2.1918, + "step": 12899 + }, + { + "epoch": 0.6920600858369099, + "grad_norm": 0.466796875, + "learning_rate": 4.847470198452795e-06, + "loss": 2.1375, + "step": 12900 + }, + { + "epoch": 0.6921137339055794, + "grad_norm": 0.451171875, + "learning_rate": 4.847440315896064e-06, + "loss": 2.1688, + "step": 12901 + }, + { + "epoch": 0.6921673819742489, + "grad_norm": 0.515625, + "learning_rate": 4.847410430504554e-06, + "loss": 2.4027, + "step": 12902 + }, + { + "epoch": 0.6922210300429185, + "grad_norm": 0.353515625, + "learning_rate": 4.847380542278303e-06, + "loss": 1.8355, + "step": 12903 + }, + { + "epoch": 0.6922746781115879, + "grad_norm": 1.1015625, + "learning_rate": 4.847350651217346e-06, + "loss": 1.9952, + "step": 12904 + }, + { + "epoch": 0.6923283261802575, + "grad_norm": 0.4765625, + "learning_rate": 4.847320757321721e-06, + "loss": 1.7462, + "step": 12905 + }, + { + "epoch": 0.692381974248927, + "grad_norm": 0.4375, + "learning_rate": 4.8472908605914615e-06, + "loss": 2.3755, + "step": 12906 + }, + { + "epoch": 0.6924356223175966, + "grad_norm": 0.515625, + "learning_rate": 4.847260961026605e-06, + "loss": 2.3253, + "step": 12907 + }, + { + "epoch": 0.6924892703862661, + "grad_norm": 0.474609375, + "learning_rate": 4.847231058627187e-06, + "loss": 2.3225, + "step": 12908 + }, + { + "epoch": 0.6925429184549357, + "grad_norm": 0.56640625, + "learning_rate": 4.847201153393244e-06, + "loss": 2.2375, + "step": 12909 + }, + { + "epoch": 0.6925965665236051, + "grad_norm": 0.40234375, + "learning_rate": 4.847171245324812e-06, + "loss": 2.113, + "step": 12910 + }, + { + "epoch": 0.6926502145922747, + "grad_norm": 0.4375, + "learning_rate": 4.847141334421927e-06, + "loss": 2.2503, + "step": 12911 + }, + { + "epoch": 0.6927038626609442, + "grad_norm": 0.51171875, + "learning_rate": 4.847111420684627e-06, + "loss": 2.5648, + "step": 12912 + }, + { + "epoch": 0.6927575107296138, + "grad_norm": 0.51953125, + "learning_rate": 4.847081504112945e-06, + "loss": 2.3659, + "step": 12913 + }, + { + "epoch": 0.6928111587982833, + "grad_norm": 0.46484375, + "learning_rate": 4.847051584706919e-06, + "loss": 2.2996, + "step": 12914 + }, + { + "epoch": 0.6928648068669528, + "grad_norm": 0.50390625, + "learning_rate": 4.847021662466584e-06, + "loss": 2.3286, + "step": 12915 + }, + { + "epoch": 0.6929184549356223, + "grad_norm": 0.52734375, + "learning_rate": 4.846991737391978e-06, + "loss": 2.3957, + "step": 12916 + }, + { + "epoch": 0.6929721030042918, + "grad_norm": 0.4375, + "learning_rate": 4.846961809483136e-06, + "loss": 2.0813, + "step": 12917 + }, + { + "epoch": 0.6930257510729614, + "grad_norm": 0.373046875, + "learning_rate": 4.846931878740093e-06, + "loss": 2.095, + "step": 12918 + }, + { + "epoch": 0.6930793991416309, + "grad_norm": 0.458984375, + "learning_rate": 4.846901945162888e-06, + "loss": 2.1165, + "step": 12919 + }, + { + "epoch": 0.6931330472103004, + "grad_norm": 0.58203125, + "learning_rate": 4.846872008751553e-06, + "loss": 2.2875, + "step": 12920 + }, + { + "epoch": 0.6931866952789699, + "grad_norm": 0.5703125, + "learning_rate": 4.846842069506128e-06, + "loss": 2.1616, + "step": 12921 + }, + { + "epoch": 0.6932403433476395, + "grad_norm": 0.40625, + "learning_rate": 4.8468121274266475e-06, + "loss": 2.4027, + "step": 12922 + }, + { + "epoch": 0.693293991416309, + "grad_norm": 0.388671875, + "learning_rate": 4.846782182513148e-06, + "loss": 2.0505, + "step": 12923 + }, + { + "epoch": 0.6933476394849786, + "grad_norm": 0.5078125, + "learning_rate": 4.846752234765666e-06, + "loss": 1.9902, + "step": 12924 + }, + { + "epoch": 0.693401287553648, + "grad_norm": 0.447265625, + "learning_rate": 4.846722284184237e-06, + "loss": 2.261, + "step": 12925 + }, + { + "epoch": 0.6934549356223176, + "grad_norm": 0.48046875, + "learning_rate": 4.846692330768896e-06, + "loss": 2.3329, + "step": 12926 + }, + { + "epoch": 0.6935085836909871, + "grad_norm": 0.9140625, + "learning_rate": 4.846662374519682e-06, + "loss": 2.4756, + "step": 12927 + }, + { + "epoch": 0.6935622317596567, + "grad_norm": 0.48046875, + "learning_rate": 4.846632415436629e-06, + "loss": 2.0448, + "step": 12928 + }, + { + "epoch": 0.6936158798283262, + "grad_norm": 0.44140625, + "learning_rate": 4.846602453519774e-06, + "loss": 2.1582, + "step": 12929 + }, + { + "epoch": 0.6936695278969958, + "grad_norm": 0.451171875, + "learning_rate": 4.846572488769154e-06, + "loss": 2.0827, + "step": 12930 + }, + { + "epoch": 0.6937231759656652, + "grad_norm": 0.5546875, + "learning_rate": 4.846542521184804e-06, + "loss": 1.5934, + "step": 12931 + }, + { + "epoch": 0.6937768240343347, + "grad_norm": 0.421875, + "learning_rate": 4.846512550766759e-06, + "loss": 2.0478, + "step": 12932 + }, + { + "epoch": 0.6938304721030043, + "grad_norm": 0.423828125, + "learning_rate": 4.846482577515058e-06, + "loss": 2.3221, + "step": 12933 + }, + { + "epoch": 0.6938841201716738, + "grad_norm": 0.451171875, + "learning_rate": 4.846452601429735e-06, + "loss": 2.593, + "step": 12934 + }, + { + "epoch": 0.6939377682403434, + "grad_norm": 0.453125, + "learning_rate": 4.846422622510828e-06, + "loss": 1.5098, + "step": 12935 + }, + { + "epoch": 0.6939914163090128, + "grad_norm": 0.9375, + "learning_rate": 4.846392640758372e-06, + "loss": 2.3783, + "step": 12936 + }, + { + "epoch": 0.6940450643776824, + "grad_norm": 0.447265625, + "learning_rate": 4.846362656172403e-06, + "loss": 2.2731, + "step": 12937 + }, + { + "epoch": 0.6940987124463519, + "grad_norm": 0.43359375, + "learning_rate": 4.846332668752958e-06, + "loss": 2.3118, + "step": 12938 + }, + { + "epoch": 0.6941523605150215, + "grad_norm": 0.482421875, + "learning_rate": 4.846302678500073e-06, + "loss": 2.2649, + "step": 12939 + }, + { + "epoch": 0.694206008583691, + "grad_norm": 0.384765625, + "learning_rate": 4.8462726854137845e-06, + "loss": 2.0665, + "step": 12940 + }, + { + "epoch": 0.6942596566523606, + "grad_norm": 0.5078125, + "learning_rate": 4.846242689494127e-06, + "loss": 2.1024, + "step": 12941 + }, + { + "epoch": 0.69431330472103, + "grad_norm": 0.50390625, + "learning_rate": 4.846212690741139e-06, + "loss": 2.2367, + "step": 12942 + }, + { + "epoch": 0.6943669527896996, + "grad_norm": 0.46875, + "learning_rate": 4.846182689154856e-06, + "loss": 2.372, + "step": 12943 + }, + { + "epoch": 0.6944206008583691, + "grad_norm": 0.447265625, + "learning_rate": 4.846152684735313e-06, + "loss": 2.3682, + "step": 12944 + }, + { + "epoch": 0.6944742489270386, + "grad_norm": 0.5, + "learning_rate": 4.846122677482548e-06, + "loss": 2.0953, + "step": 12945 + }, + { + "epoch": 0.6945278969957082, + "grad_norm": 0.33984375, + "learning_rate": 4.846092667396596e-06, + "loss": 2.1136, + "step": 12946 + }, + { + "epoch": 0.6945815450643776, + "grad_norm": 0.44921875, + "learning_rate": 4.846062654477494e-06, + "loss": 2.2192, + "step": 12947 + }, + { + "epoch": 0.6946351931330472, + "grad_norm": 0.53515625, + "learning_rate": 4.846032638725278e-06, + "loss": 2.3902, + "step": 12948 + }, + { + "epoch": 0.6946888412017167, + "grad_norm": 0.45703125, + "learning_rate": 4.846002620139984e-06, + "loss": 2.2918, + "step": 12949 + }, + { + "epoch": 0.6947424892703863, + "grad_norm": 0.392578125, + "learning_rate": 4.845972598721648e-06, + "loss": 2.3641, + "step": 12950 + }, + { + "epoch": 0.6947961373390558, + "grad_norm": 0.462890625, + "learning_rate": 4.845942574470307e-06, + "loss": 1.9589, + "step": 12951 + }, + { + "epoch": 0.6948497854077254, + "grad_norm": 0.58203125, + "learning_rate": 4.8459125473859975e-06, + "loss": 2.4055, + "step": 12952 + }, + { + "epoch": 0.6949034334763948, + "grad_norm": 0.4296875, + "learning_rate": 4.845882517468755e-06, + "loss": 2.1277, + "step": 12953 + }, + { + "epoch": 0.6949570815450644, + "grad_norm": 0.416015625, + "learning_rate": 4.845852484718616e-06, + "loss": 2.2294, + "step": 12954 + }, + { + "epoch": 0.6950107296137339, + "grad_norm": 0.5, + "learning_rate": 4.845822449135617e-06, + "loss": 2.4344, + "step": 12955 + }, + { + "epoch": 0.6950643776824035, + "grad_norm": 0.443359375, + "learning_rate": 4.845792410719793e-06, + "loss": 2.3508, + "step": 12956 + }, + { + "epoch": 0.695118025751073, + "grad_norm": 0.4375, + "learning_rate": 4.845762369471182e-06, + "loss": 2.2717, + "step": 12957 + }, + { + "epoch": 0.6951716738197425, + "grad_norm": 9.25, + "learning_rate": 4.845732325389819e-06, + "loss": 1.9862, + "step": 12958 + }, + { + "epoch": 0.695225321888412, + "grad_norm": 1.1328125, + "learning_rate": 4.845702278475741e-06, + "loss": 1.9882, + "step": 12959 + }, + { + "epoch": 0.6952789699570815, + "grad_norm": 0.498046875, + "learning_rate": 4.845672228728985e-06, + "loss": 2.2641, + "step": 12960 + }, + { + "epoch": 0.6953326180257511, + "grad_norm": 0.458984375, + "learning_rate": 4.8456421761495855e-06, + "loss": 2.3232, + "step": 12961 + }, + { + "epoch": 0.6953862660944206, + "grad_norm": 0.5703125, + "learning_rate": 4.845612120737581e-06, + "loss": 1.5266, + "step": 12962 + }, + { + "epoch": 0.6954399141630901, + "grad_norm": 0.408203125, + "learning_rate": 4.845582062493005e-06, + "loss": 2.0904, + "step": 12963 + }, + { + "epoch": 0.6954935622317596, + "grad_norm": 0.5, + "learning_rate": 4.845552001415896e-06, + "loss": 2.1884, + "step": 12964 + }, + { + "epoch": 0.6955472103004292, + "grad_norm": 0.458984375, + "learning_rate": 4.845521937506289e-06, + "loss": 2.3248, + "step": 12965 + }, + { + "epoch": 0.6956008583690987, + "grad_norm": 0.5546875, + "learning_rate": 4.845491870764222e-06, + "loss": 2.47, + "step": 12966 + }, + { + "epoch": 0.6956545064377683, + "grad_norm": 0.482421875, + "learning_rate": 4.845461801189729e-06, + "loss": 2.3374, + "step": 12967 + }, + { + "epoch": 0.6957081545064377, + "grad_norm": 0.46875, + "learning_rate": 4.845431728782849e-06, + "loss": 2.2632, + "step": 12968 + }, + { + "epoch": 0.6957618025751073, + "grad_norm": 0.4609375, + "learning_rate": 4.845401653543616e-06, + "loss": 2.2431, + "step": 12969 + }, + { + "epoch": 0.6958154506437768, + "grad_norm": 0.478515625, + "learning_rate": 4.845371575472067e-06, + "loss": 2.2688, + "step": 12970 + }, + { + "epoch": 0.6958690987124464, + "grad_norm": 0.373046875, + "learning_rate": 4.845341494568239e-06, + "loss": 2.0086, + "step": 12971 + }, + { + "epoch": 0.6959227467811159, + "grad_norm": 0.40234375, + "learning_rate": 4.845311410832167e-06, + "loss": 2.0259, + "step": 12972 + }, + { + "epoch": 0.6959763948497855, + "grad_norm": 0.412109375, + "learning_rate": 4.845281324263889e-06, + "loss": 2.2464, + "step": 12973 + }, + { + "epoch": 0.6960300429184549, + "grad_norm": 0.443359375, + "learning_rate": 4.845251234863441e-06, + "loss": 2.3477, + "step": 12974 + }, + { + "epoch": 0.6960836909871244, + "grad_norm": 0.5234375, + "learning_rate": 4.845221142630857e-06, + "loss": 2.2559, + "step": 12975 + }, + { + "epoch": 0.696137339055794, + "grad_norm": 0.466796875, + "learning_rate": 4.845191047566176e-06, + "loss": 2.2712, + "step": 12976 + }, + { + "epoch": 0.6961909871244635, + "grad_norm": 0.71875, + "learning_rate": 4.845160949669434e-06, + "loss": 2.203, + "step": 12977 + }, + { + "epoch": 0.6962446351931331, + "grad_norm": 0.4140625, + "learning_rate": 4.845130848940667e-06, + "loss": 2.1761, + "step": 12978 + }, + { + "epoch": 0.6962982832618025, + "grad_norm": 0.392578125, + "learning_rate": 4.84510074537991e-06, + "loss": 2.5223, + "step": 12979 + }, + { + "epoch": 0.6963519313304721, + "grad_norm": 0.462890625, + "learning_rate": 4.845070638987202e-06, + "loss": 2.1627, + "step": 12980 + }, + { + "epoch": 0.6964055793991416, + "grad_norm": 0.50390625, + "learning_rate": 4.845040529762577e-06, + "loss": 2.0913, + "step": 12981 + }, + { + "epoch": 0.6964592274678112, + "grad_norm": 0.447265625, + "learning_rate": 4.8450104177060725e-06, + "loss": 2.3983, + "step": 12982 + }, + { + "epoch": 0.6965128755364807, + "grad_norm": 0.51953125, + "learning_rate": 4.844980302817725e-06, + "loss": 1.3666, + "step": 12983 + }, + { + "epoch": 0.6965665236051503, + "grad_norm": 0.423828125, + "learning_rate": 4.84495018509757e-06, + "loss": 1.9106, + "step": 12984 + }, + { + "epoch": 0.6966201716738197, + "grad_norm": 0.5390625, + "learning_rate": 4.844920064545644e-06, + "loss": 2.2455, + "step": 12985 + }, + { + "epoch": 0.6966738197424893, + "grad_norm": 0.5078125, + "learning_rate": 4.844889941161985e-06, + "loss": 2.1422, + "step": 12986 + }, + { + "epoch": 0.6967274678111588, + "grad_norm": 0.486328125, + "learning_rate": 4.844859814946626e-06, + "loss": 2.1032, + "step": 12987 + }, + { + "epoch": 0.6967811158798283, + "grad_norm": 0.54296875, + "learning_rate": 4.844829685899607e-06, + "loss": 2.4033, + "step": 12988 + }, + { + "epoch": 0.6968347639484979, + "grad_norm": 0.486328125, + "learning_rate": 4.844799554020963e-06, + "loss": 2.6246, + "step": 12989 + }, + { + "epoch": 0.6968884120171673, + "grad_norm": 0.453125, + "learning_rate": 4.84476941931073e-06, + "loss": 2.3977, + "step": 12990 + }, + { + "epoch": 0.6969420600858369, + "grad_norm": 0.60546875, + "learning_rate": 4.844739281768945e-06, + "loss": 2.4015, + "step": 12991 + }, + { + "epoch": 0.6969957081545064, + "grad_norm": 0.4296875, + "learning_rate": 4.8447091413956436e-06, + "loss": 2.2905, + "step": 12992 + }, + { + "epoch": 0.697049356223176, + "grad_norm": 0.91015625, + "learning_rate": 4.844678998190863e-06, + "loss": 2.3134, + "step": 12993 + }, + { + "epoch": 0.6971030042918455, + "grad_norm": 0.49609375, + "learning_rate": 4.8446488521546385e-06, + "loss": 2.6127, + "step": 12994 + }, + { + "epoch": 0.697156652360515, + "grad_norm": 0.5703125, + "learning_rate": 4.844618703287008e-06, + "loss": 2.2645, + "step": 12995 + }, + { + "epoch": 0.6972103004291845, + "grad_norm": 0.3671875, + "learning_rate": 4.844588551588007e-06, + "loss": 2.4146, + "step": 12996 + }, + { + "epoch": 0.6972639484978541, + "grad_norm": 0.443359375, + "learning_rate": 4.844558397057672e-06, + "loss": 2.4006, + "step": 12997 + }, + { + "epoch": 0.6973175965665236, + "grad_norm": 0.44921875, + "learning_rate": 4.844528239696039e-06, + "loss": 2.3076, + "step": 12998 + }, + { + "epoch": 0.6973712446351932, + "grad_norm": 0.4296875, + "learning_rate": 4.844498079503146e-06, + "loss": 2.2146, + "step": 12999 + }, + { + "epoch": 0.6974248927038627, + "grad_norm": 0.451171875, + "learning_rate": 4.844467916479028e-06, + "loss": 2.3201, + "step": 13000 + }, + { + "epoch": 0.6974785407725322, + "grad_norm": 0.74609375, + "learning_rate": 4.844437750623721e-06, + "loss": 1.1543, + "step": 13001 + }, + { + "epoch": 0.6975321888412017, + "grad_norm": 0.455078125, + "learning_rate": 4.844407581937262e-06, + "loss": 2.0874, + "step": 13002 + }, + { + "epoch": 0.6975858369098712, + "grad_norm": 0.380859375, + "learning_rate": 4.844377410419689e-06, + "loss": 2.1632, + "step": 13003 + }, + { + "epoch": 0.6976394849785408, + "grad_norm": 0.46875, + "learning_rate": 4.844347236071037e-06, + "loss": 2.0066, + "step": 13004 + }, + { + "epoch": 0.6976931330472103, + "grad_norm": 0.427734375, + "learning_rate": 4.8443170588913414e-06, + "loss": 2.2881, + "step": 13005 + }, + { + "epoch": 0.6977467811158798, + "grad_norm": 0.5078125, + "learning_rate": 4.844286878880639e-06, + "loss": 2.3931, + "step": 13006 + }, + { + "epoch": 0.6978004291845493, + "grad_norm": 0.486328125, + "learning_rate": 4.844256696038969e-06, + "loss": 2.5967, + "step": 13007 + }, + { + "epoch": 0.6978540772532189, + "grad_norm": 0.462890625, + "learning_rate": 4.8442265103663646e-06, + "loss": 1.8807, + "step": 13008 + }, + { + "epoch": 0.6979077253218884, + "grad_norm": 0.90234375, + "learning_rate": 4.844196321862864e-06, + "loss": 1.4596, + "step": 13009 + }, + { + "epoch": 0.697961373390558, + "grad_norm": 0.48046875, + "learning_rate": 4.8441661305285036e-06, + "loss": 2.2404, + "step": 13010 + }, + { + "epoch": 0.6980150214592274, + "grad_norm": 0.45703125, + "learning_rate": 4.8441359363633185e-06, + "loss": 2.234, + "step": 13011 + }, + { + "epoch": 0.698068669527897, + "grad_norm": 0.453125, + "learning_rate": 4.844105739367346e-06, + "loss": 2.1798, + "step": 13012 + }, + { + "epoch": 0.6981223175965665, + "grad_norm": 0.490234375, + "learning_rate": 4.844075539540623e-06, + "loss": 2.5076, + "step": 13013 + }, + { + "epoch": 0.6981759656652361, + "grad_norm": 0.5703125, + "learning_rate": 4.844045336883187e-06, + "loss": 2.5414, + "step": 13014 + }, + { + "epoch": 0.6982296137339056, + "grad_norm": 0.51953125, + "learning_rate": 4.84401513139507e-06, + "loss": 2.4797, + "step": 13015 + }, + { + "epoch": 0.6982832618025752, + "grad_norm": 0.408203125, + "learning_rate": 4.843984923076314e-06, + "loss": 2.3576, + "step": 13016 + }, + { + "epoch": 0.6983369098712446, + "grad_norm": 0.451171875, + "learning_rate": 4.843954711926952e-06, + "loss": 2.0324, + "step": 13017 + }, + { + "epoch": 0.6983905579399141, + "grad_norm": 0.44921875, + "learning_rate": 4.8439244979470215e-06, + "loss": 2.1754, + "step": 13018 + }, + { + "epoch": 0.6984442060085837, + "grad_norm": 0.5390625, + "learning_rate": 4.84389428113656e-06, + "loss": 2.4161, + "step": 13019 + }, + { + "epoch": 0.6984978540772532, + "grad_norm": 0.396484375, + "learning_rate": 4.843864061495601e-06, + "loss": 2.225, + "step": 13020 + }, + { + "epoch": 0.6985515021459228, + "grad_norm": 0.478515625, + "learning_rate": 4.843833839024184e-06, + "loss": 2.1831, + "step": 13021 + }, + { + "epoch": 0.6986051502145922, + "grad_norm": 0.77734375, + "learning_rate": 4.843803613722345e-06, + "loss": 2.0884, + "step": 13022 + }, + { + "epoch": 0.6986587982832618, + "grad_norm": 0.50390625, + "learning_rate": 4.843773385590121e-06, + "loss": 2.3154, + "step": 13023 + }, + { + "epoch": 0.6987124463519313, + "grad_norm": 0.50390625, + "learning_rate": 4.843743154627545e-06, + "loss": 2.3948, + "step": 13024 + }, + { + "epoch": 0.6987660944206009, + "grad_norm": 0.46484375, + "learning_rate": 4.843712920834657e-06, + "loss": 2.4387, + "step": 13025 + }, + { + "epoch": 0.6988197424892704, + "grad_norm": 0.416015625, + "learning_rate": 4.8436826842114924e-06, + "loss": 2.0861, + "step": 13026 + }, + { + "epoch": 0.69887339055794, + "grad_norm": 0.5, + "learning_rate": 4.843652444758088e-06, + "loss": 2.1898, + "step": 13027 + }, + { + "epoch": 0.6989270386266094, + "grad_norm": 0.4921875, + "learning_rate": 4.84362220247448e-06, + "loss": 2.4679, + "step": 13028 + }, + { + "epoch": 0.698980686695279, + "grad_norm": 0.470703125, + "learning_rate": 4.843591957360706e-06, + "loss": 2.0667, + "step": 13029 + }, + { + "epoch": 0.6990343347639485, + "grad_norm": 0.44921875, + "learning_rate": 4.8435617094168e-06, + "loss": 2.2431, + "step": 13030 + }, + { + "epoch": 0.699087982832618, + "grad_norm": 0.4765625, + "learning_rate": 4.843531458642801e-06, + "loss": 2.1968, + "step": 13031 + }, + { + "epoch": 0.6991416309012876, + "grad_norm": 0.443359375, + "learning_rate": 4.8435012050387445e-06, + "loss": 2.2317, + "step": 13032 + }, + { + "epoch": 0.699195278969957, + "grad_norm": 0.455078125, + "learning_rate": 4.8434709486046675e-06, + "loss": 2.127, + "step": 13033 + }, + { + "epoch": 0.6992489270386266, + "grad_norm": 0.392578125, + "learning_rate": 4.843440689340605e-06, + "loss": 1.9233, + "step": 13034 + }, + { + "epoch": 0.6993025751072961, + "grad_norm": 0.546875, + "learning_rate": 4.843410427246596e-06, + "loss": 2.4402, + "step": 13035 + }, + { + "epoch": 0.6993562231759657, + "grad_norm": 0.51953125, + "learning_rate": 4.843380162322675e-06, + "loss": 2.3436, + "step": 13036 + }, + { + "epoch": 0.6994098712446352, + "grad_norm": 0.44921875, + "learning_rate": 4.84334989456888e-06, + "loss": 2.2759, + "step": 13037 + }, + { + "epoch": 0.6994635193133047, + "grad_norm": 0.46875, + "learning_rate": 4.843319623985246e-06, + "loss": 2.3104, + "step": 13038 + }, + { + "epoch": 0.6995171673819742, + "grad_norm": 0.478515625, + "learning_rate": 4.843289350571811e-06, + "loss": 2.4016, + "step": 13039 + }, + { + "epoch": 0.6995708154506438, + "grad_norm": 0.5234375, + "learning_rate": 4.843259074328611e-06, + "loss": 2.3175, + "step": 13040 + }, + { + "epoch": 0.6996244635193133, + "grad_norm": 0.423828125, + "learning_rate": 4.843228795255684e-06, + "loss": 2.1936, + "step": 13041 + }, + { + "epoch": 0.6996781115879829, + "grad_norm": 0.5546875, + "learning_rate": 4.8431985133530625e-06, + "loss": 2.1996, + "step": 13042 + }, + { + "epoch": 0.6997317596566524, + "grad_norm": 0.57421875, + "learning_rate": 4.843168228620787e-06, + "loss": 2.4496, + "step": 13043 + }, + { + "epoch": 0.6997854077253219, + "grad_norm": 0.4296875, + "learning_rate": 4.843137941058893e-06, + "loss": 2.2143, + "step": 13044 + }, + { + "epoch": 0.6998390557939914, + "grad_norm": 0.4765625, + "learning_rate": 4.843107650667416e-06, + "loss": 2.1697, + "step": 13045 + }, + { + "epoch": 0.6998927038626609, + "grad_norm": 0.443359375, + "learning_rate": 4.843077357446394e-06, + "loss": 2.3422, + "step": 13046 + }, + { + "epoch": 0.6999463519313305, + "grad_norm": 0.404296875, + "learning_rate": 4.843047061395863e-06, + "loss": 2.1918, + "step": 13047 + }, + { + "epoch": 0.7, + "grad_norm": 0.46484375, + "learning_rate": 4.84301676251586e-06, + "loss": 2.2475, + "step": 13048 + }, + { + "epoch": 0.7000536480686695, + "grad_norm": 0.53515625, + "learning_rate": 4.842986460806421e-06, + "loss": 2.3083, + "step": 13049 + }, + { + "epoch": 0.700107296137339, + "grad_norm": 0.470703125, + "learning_rate": 4.842956156267582e-06, + "loss": 2.2639, + "step": 13050 + }, + { + "epoch": 0.7001609442060086, + "grad_norm": 0.45703125, + "learning_rate": 4.84292584889938e-06, + "loss": 2.5768, + "step": 13051 + }, + { + "epoch": 0.7002145922746781, + "grad_norm": 0.41015625, + "learning_rate": 4.842895538701853e-06, + "loss": 2.2073, + "step": 13052 + }, + { + "epoch": 0.7002682403433477, + "grad_norm": 0.443359375, + "learning_rate": 4.842865225675036e-06, + "loss": 2.3083, + "step": 13053 + }, + { + "epoch": 0.7003218884120171, + "grad_norm": 0.47265625, + "learning_rate": 4.842834909818967e-06, + "loss": 2.3011, + "step": 13054 + }, + { + "epoch": 0.7003755364806867, + "grad_norm": 0.498046875, + "learning_rate": 4.842804591133681e-06, + "loss": 2.6914, + "step": 13055 + }, + { + "epoch": 0.7004291845493562, + "grad_norm": 0.49609375, + "learning_rate": 4.842774269619216e-06, + "loss": 2.7249, + "step": 13056 + }, + { + "epoch": 0.7004828326180258, + "grad_norm": 0.466796875, + "learning_rate": 4.8427439452756075e-06, + "loss": 2.2104, + "step": 13057 + }, + { + "epoch": 0.7005364806866953, + "grad_norm": 0.50390625, + "learning_rate": 4.842713618102894e-06, + "loss": 2.1667, + "step": 13058 + }, + { + "epoch": 0.7005901287553649, + "grad_norm": 0.490234375, + "learning_rate": 4.842683288101108e-06, + "loss": 2.2452, + "step": 13059 + }, + { + "epoch": 0.7006437768240343, + "grad_norm": 0.375, + "learning_rate": 4.842652955270291e-06, + "loss": 2.1758, + "step": 13060 + }, + { + "epoch": 0.7006974248927038, + "grad_norm": 0.65234375, + "learning_rate": 4.842622619610477e-06, + "loss": 2.491, + "step": 13061 + }, + { + "epoch": 0.7007510729613734, + "grad_norm": 0.52734375, + "learning_rate": 4.842592281121703e-06, + "loss": 2.1912, + "step": 13062 + }, + { + "epoch": 0.7008047210300429, + "grad_norm": 0.412109375, + "learning_rate": 4.8425619398040055e-06, + "loss": 2.3034, + "step": 13063 + }, + { + "epoch": 0.7008583690987125, + "grad_norm": 0.44921875, + "learning_rate": 4.842531595657422e-06, + "loss": 2.1059, + "step": 13064 + }, + { + "epoch": 0.7009120171673819, + "grad_norm": 0.50390625, + "learning_rate": 4.842501248681989e-06, + "loss": 2.6011, + "step": 13065 + }, + { + "epoch": 0.7009656652360515, + "grad_norm": 0.470703125, + "learning_rate": 4.842470898877742e-06, + "loss": 2.3181, + "step": 13066 + }, + { + "epoch": 0.701019313304721, + "grad_norm": 0.490234375, + "learning_rate": 4.842440546244718e-06, + "loss": 2.1879, + "step": 13067 + }, + { + "epoch": 0.7010729613733906, + "grad_norm": 0.396484375, + "learning_rate": 4.842410190782954e-06, + "loss": 2.3802, + "step": 13068 + }, + { + "epoch": 0.7011266094420601, + "grad_norm": 0.400390625, + "learning_rate": 4.842379832492488e-06, + "loss": 2.098, + "step": 13069 + }, + { + "epoch": 0.7011802575107297, + "grad_norm": 0.39453125, + "learning_rate": 4.842349471373354e-06, + "loss": 2.3456, + "step": 13070 + }, + { + "epoch": 0.7012339055793991, + "grad_norm": 0.466796875, + "learning_rate": 4.842319107425591e-06, + "loss": 2.4007, + "step": 13071 + }, + { + "epoch": 0.7012875536480687, + "grad_norm": 0.435546875, + "learning_rate": 4.842288740649234e-06, + "loss": 2.3291, + "step": 13072 + }, + { + "epoch": 0.7013412017167382, + "grad_norm": 0.75390625, + "learning_rate": 4.84225837104432e-06, + "loss": 2.3342, + "step": 13073 + }, + { + "epoch": 0.7013948497854077, + "grad_norm": 0.53515625, + "learning_rate": 4.842227998610887e-06, + "loss": 2.5304, + "step": 13074 + }, + { + "epoch": 0.7014484978540773, + "grad_norm": 0.50390625, + "learning_rate": 4.84219762334897e-06, + "loss": 2.4168, + "step": 13075 + }, + { + "epoch": 0.7015021459227467, + "grad_norm": 0.42578125, + "learning_rate": 4.8421672452586064e-06, + "loss": 2.2003, + "step": 13076 + }, + { + "epoch": 0.7015557939914163, + "grad_norm": 0.5078125, + "learning_rate": 4.8421368643398335e-06, + "loss": 2.3969, + "step": 13077 + }, + { + "epoch": 0.7016094420600858, + "grad_norm": 0.51171875, + "learning_rate": 4.842106480592687e-06, + "loss": 2.317, + "step": 13078 + }, + { + "epoch": 0.7016630901287554, + "grad_norm": 0.416015625, + "learning_rate": 4.842076094017204e-06, + "loss": 2.3993, + "step": 13079 + }, + { + "epoch": 0.7017167381974249, + "grad_norm": 0.462890625, + "learning_rate": 4.842045704613421e-06, + "loss": 2.3755, + "step": 13080 + }, + { + "epoch": 0.7017703862660944, + "grad_norm": 0.53125, + "learning_rate": 4.842015312381375e-06, + "loss": 2.47, + "step": 13081 + }, + { + "epoch": 0.7018240343347639, + "grad_norm": 0.408203125, + "learning_rate": 4.841984917321102e-06, + "loss": 2.3046, + "step": 13082 + }, + { + "epoch": 0.7018776824034335, + "grad_norm": 0.470703125, + "learning_rate": 4.841954519432639e-06, + "loss": 2.2666, + "step": 13083 + }, + { + "epoch": 0.701931330472103, + "grad_norm": 0.419921875, + "learning_rate": 4.841924118716024e-06, + "loss": 1.9717, + "step": 13084 + }, + { + "epoch": 0.7019849785407726, + "grad_norm": 0.47265625, + "learning_rate": 4.841893715171292e-06, + "loss": 2.1692, + "step": 13085 + }, + { + "epoch": 0.702038626609442, + "grad_norm": 0.37109375, + "learning_rate": 4.841863308798481e-06, + "loss": 2.3128, + "step": 13086 + }, + { + "epoch": 0.7020922746781116, + "grad_norm": 0.55859375, + "learning_rate": 4.841832899597626e-06, + "loss": 2.6206, + "step": 13087 + }, + { + "epoch": 0.7021459227467811, + "grad_norm": 0.482421875, + "learning_rate": 4.841802487568765e-06, + "loss": 2.2345, + "step": 13088 + }, + { + "epoch": 0.7021995708154506, + "grad_norm": 0.60546875, + "learning_rate": 4.841772072711936e-06, + "loss": 2.2863, + "step": 13089 + }, + { + "epoch": 0.7022532188841202, + "grad_norm": 0.50390625, + "learning_rate": 4.8417416550271726e-06, + "loss": 2.1871, + "step": 13090 + }, + { + "epoch": 0.7023068669527897, + "grad_norm": 0.412109375, + "learning_rate": 4.841711234514514e-06, + "loss": 2.3036, + "step": 13091 + }, + { + "epoch": 0.7023605150214592, + "grad_norm": 0.443359375, + "learning_rate": 4.841680811173996e-06, + "loss": 2.2541, + "step": 13092 + }, + { + "epoch": 0.7024141630901287, + "grad_norm": 0.421875, + "learning_rate": 4.841650385005655e-06, + "loss": 2.2371, + "step": 13093 + }, + { + "epoch": 0.7024678111587983, + "grad_norm": 0.44140625, + "learning_rate": 4.841619956009529e-06, + "loss": 2.3044, + "step": 13094 + }, + { + "epoch": 0.7025214592274678, + "grad_norm": 0.421875, + "learning_rate": 4.841589524185654e-06, + "loss": 2.1092, + "step": 13095 + }, + { + "epoch": 0.7025751072961374, + "grad_norm": 0.41796875, + "learning_rate": 4.841559089534066e-06, + "loss": 2.0053, + "step": 13096 + }, + { + "epoch": 0.7026287553648068, + "grad_norm": 0.4296875, + "learning_rate": 4.841528652054803e-06, + "loss": 2.2374, + "step": 13097 + }, + { + "epoch": 0.7026824034334764, + "grad_norm": 0.439453125, + "learning_rate": 4.841498211747902e-06, + "loss": 2.336, + "step": 13098 + }, + { + "epoch": 0.7027360515021459, + "grad_norm": 0.33203125, + "learning_rate": 4.841467768613397e-06, + "loss": 1.9505, + "step": 13099 + }, + { + "epoch": 0.7027896995708155, + "grad_norm": 0.443359375, + "learning_rate": 4.841437322651328e-06, + "loss": 2.4485, + "step": 13100 + }, + { + "epoch": 0.702843347639485, + "grad_norm": 2.5, + "learning_rate": 4.8414068738617304e-06, + "loss": 1.6141, + "step": 13101 + }, + { + "epoch": 0.7028969957081546, + "grad_norm": 0.47265625, + "learning_rate": 4.841376422244641e-06, + "loss": 2.1422, + "step": 13102 + }, + { + "epoch": 0.702950643776824, + "grad_norm": 0.439453125, + "learning_rate": 4.8413459678000964e-06, + "loss": 2.4115, + "step": 13103 + }, + { + "epoch": 0.7030042918454935, + "grad_norm": 0.462890625, + "learning_rate": 4.841315510528134e-06, + "loss": 2.2805, + "step": 13104 + }, + { + "epoch": 0.7030579399141631, + "grad_norm": 0.59765625, + "learning_rate": 4.8412850504287904e-06, + "loss": 2.5027, + "step": 13105 + }, + { + "epoch": 0.7031115879828326, + "grad_norm": 0.396484375, + "learning_rate": 4.841254587502101e-06, + "loss": 2.3105, + "step": 13106 + }, + { + "epoch": 0.7031652360515022, + "grad_norm": 0.4296875, + "learning_rate": 4.841224121748105e-06, + "loss": 2.374, + "step": 13107 + }, + { + "epoch": 0.7032188841201716, + "grad_norm": 0.4765625, + "learning_rate": 4.841193653166838e-06, + "loss": 2.566, + "step": 13108 + }, + { + "epoch": 0.7032725321888412, + "grad_norm": 0.458984375, + "learning_rate": 4.841163181758336e-06, + "loss": 2.3609, + "step": 13109 + }, + { + "epoch": 0.7033261802575107, + "grad_norm": 0.478515625, + "learning_rate": 4.841132707522637e-06, + "loss": 2.073, + "step": 13110 + }, + { + "epoch": 0.7033798283261803, + "grad_norm": 0.68359375, + "learning_rate": 4.841102230459778e-06, + "loss": 1.3595, + "step": 13111 + }, + { + "epoch": 0.7034334763948498, + "grad_norm": 0.484375, + "learning_rate": 4.841071750569794e-06, + "loss": 2.3249, + "step": 13112 + }, + { + "epoch": 0.7034871244635194, + "grad_norm": 0.4296875, + "learning_rate": 4.841041267852724e-06, + "loss": 2.2712, + "step": 13113 + }, + { + "epoch": 0.7035407725321888, + "grad_norm": 0.416015625, + "learning_rate": 4.841010782308603e-06, + "loss": 2.0415, + "step": 13114 + }, + { + "epoch": 0.7035944206008584, + "grad_norm": 0.453125, + "learning_rate": 4.840980293937469e-06, + "loss": 2.2143, + "step": 13115 + }, + { + "epoch": 0.7036480686695279, + "grad_norm": 0.431640625, + "learning_rate": 4.840949802739359e-06, + "loss": 1.7461, + "step": 13116 + }, + { + "epoch": 0.7037017167381975, + "grad_norm": 0.353515625, + "learning_rate": 4.840919308714308e-06, + "loss": 1.9974, + "step": 13117 + }, + { + "epoch": 0.703755364806867, + "grad_norm": 0.46484375, + "learning_rate": 4.840888811862354e-06, + "loss": 2.3414, + "step": 13118 + }, + { + "epoch": 0.7038090128755364, + "grad_norm": 0.41015625, + "learning_rate": 4.840858312183535e-06, + "loss": 1.9464, + "step": 13119 + }, + { + "epoch": 0.703862660944206, + "grad_norm": 0.427734375, + "learning_rate": 4.840827809677886e-06, + "loss": 2.348, + "step": 13120 + }, + { + "epoch": 0.7039163090128755, + "grad_norm": 0.51953125, + "learning_rate": 4.8407973043454444e-06, + "loss": 2.3039, + "step": 13121 + }, + { + "epoch": 0.7039699570815451, + "grad_norm": 0.4453125, + "learning_rate": 4.840766796186248e-06, + "loss": 2.4221, + "step": 13122 + }, + { + "epoch": 0.7040236051502146, + "grad_norm": 0.486328125, + "learning_rate": 4.840736285200333e-06, + "loss": 2.4763, + "step": 13123 + }, + { + "epoch": 0.7040772532188841, + "grad_norm": 0.462890625, + "learning_rate": 4.840705771387735e-06, + "loss": 2.3538, + "step": 13124 + }, + { + "epoch": 0.7041309012875536, + "grad_norm": 0.44140625, + "learning_rate": 4.840675254748493e-06, + "loss": 1.8603, + "step": 13125 + }, + { + "epoch": 0.7041845493562232, + "grad_norm": 0.4765625, + "learning_rate": 4.840644735282642e-06, + "loss": 2.4872, + "step": 13126 + }, + { + "epoch": 0.7042381974248927, + "grad_norm": 0.54296875, + "learning_rate": 4.84061421299022e-06, + "loss": 2.2984, + "step": 13127 + }, + { + "epoch": 0.7042918454935623, + "grad_norm": 0.427734375, + "learning_rate": 4.840583687871264e-06, + "loss": 1.9798, + "step": 13128 + }, + { + "epoch": 0.7043454935622318, + "grad_norm": 0.515625, + "learning_rate": 4.8405531599258094e-06, + "loss": 2.3693, + "step": 13129 + }, + { + "epoch": 0.7043991416309013, + "grad_norm": 0.56640625, + "learning_rate": 4.840522629153894e-06, + "loss": 2.2607, + "step": 13130 + }, + { + "epoch": 0.7044527896995708, + "grad_norm": 0.431640625, + "learning_rate": 4.840492095555556e-06, + "loss": 2.1643, + "step": 13131 + }, + { + "epoch": 0.7045064377682403, + "grad_norm": 0.375, + "learning_rate": 4.84046155913083e-06, + "loss": 2.1832, + "step": 13132 + }, + { + "epoch": 0.7045600858369099, + "grad_norm": 0.423828125, + "learning_rate": 4.840431019879754e-06, + "loss": 2.2281, + "step": 13133 + }, + { + "epoch": 0.7046137339055794, + "grad_norm": 0.45703125, + "learning_rate": 4.840400477802365e-06, + "loss": 2.2341, + "step": 13134 + }, + { + "epoch": 0.7046673819742489, + "grad_norm": 0.5390625, + "learning_rate": 4.840369932898699e-06, + "loss": 1.3801, + "step": 13135 + }, + { + "epoch": 0.7047210300429184, + "grad_norm": 0.443359375, + "learning_rate": 4.840339385168794e-06, + "loss": 2.1717, + "step": 13136 + }, + { + "epoch": 0.704774678111588, + "grad_norm": 0.498046875, + "learning_rate": 4.840308834612686e-06, + "loss": 2.2054, + "step": 13137 + }, + { + "epoch": 0.7048283261802575, + "grad_norm": 0.427734375, + "learning_rate": 4.840278281230413e-06, + "loss": 2.249, + "step": 13138 + }, + { + "epoch": 0.7048819742489271, + "grad_norm": 0.498046875, + "learning_rate": 4.84024772502201e-06, + "loss": 2.3785, + "step": 13139 + }, + { + "epoch": 0.7049356223175965, + "grad_norm": 0.4921875, + "learning_rate": 4.840217165987516e-06, + "loss": 2.1667, + "step": 13140 + }, + { + "epoch": 0.7049892703862661, + "grad_norm": 0.4453125, + "learning_rate": 4.840186604126966e-06, + "loss": 2.2209, + "step": 13141 + }, + { + "epoch": 0.7050429184549356, + "grad_norm": 0.453125, + "learning_rate": 4.840156039440399e-06, + "loss": 2.3443, + "step": 13142 + }, + { + "epoch": 0.7050965665236052, + "grad_norm": 1.0078125, + "learning_rate": 4.84012547192785e-06, + "loss": 2.109, + "step": 13143 + }, + { + "epoch": 0.7051502145922747, + "grad_norm": 0.416015625, + "learning_rate": 4.8400949015893576e-06, + "loss": 2.2371, + "step": 13144 + }, + { + "epoch": 0.7052038626609443, + "grad_norm": 0.474609375, + "learning_rate": 4.840064328424958e-06, + "loss": 2.2887, + "step": 13145 + }, + { + "epoch": 0.7052575107296137, + "grad_norm": 0.451171875, + "learning_rate": 4.840033752434687e-06, + "loss": 2.4313, + "step": 13146 + }, + { + "epoch": 0.7053111587982832, + "grad_norm": 0.458984375, + "learning_rate": 4.840003173618583e-06, + "loss": 1.3412, + "step": 13147 + }, + { + "epoch": 0.7053648068669528, + "grad_norm": 0.40625, + "learning_rate": 4.839972591976682e-06, + "loss": 1.9295, + "step": 13148 + }, + { + "epoch": 0.7054184549356223, + "grad_norm": 0.390625, + "learning_rate": 4.8399420075090215e-06, + "loss": 2.2655, + "step": 13149 + }, + { + "epoch": 0.7054721030042919, + "grad_norm": 0.53515625, + "learning_rate": 4.839911420215638e-06, + "loss": 2.5736, + "step": 13150 + }, + { + "epoch": 0.7055257510729613, + "grad_norm": 0.451171875, + "learning_rate": 4.8398808300965695e-06, + "loss": 2.163, + "step": 13151 + }, + { + "epoch": 0.7055793991416309, + "grad_norm": 0.59375, + "learning_rate": 4.839850237151851e-06, + "loss": 2.1744, + "step": 13152 + }, + { + "epoch": 0.7056330472103004, + "grad_norm": 0.51171875, + "learning_rate": 4.839819641381522e-06, + "loss": 2.4633, + "step": 13153 + }, + { + "epoch": 0.70568669527897, + "grad_norm": 0.494140625, + "learning_rate": 4.839789042785617e-06, + "loss": 2.2276, + "step": 13154 + }, + { + "epoch": 0.7057403433476395, + "grad_norm": 0.474609375, + "learning_rate": 4.839758441364174e-06, + "loss": 2.4222, + "step": 13155 + }, + { + "epoch": 0.705793991416309, + "grad_norm": 0.427734375, + "learning_rate": 4.83972783711723e-06, + "loss": 2.1978, + "step": 13156 + }, + { + "epoch": 0.7058476394849785, + "grad_norm": 0.447265625, + "learning_rate": 4.839697230044822e-06, + "loss": 2.5406, + "step": 13157 + }, + { + "epoch": 0.7059012875536481, + "grad_norm": 0.462890625, + "learning_rate": 4.8396666201469865e-06, + "loss": 1.5295, + "step": 13158 + }, + { + "epoch": 0.7059549356223176, + "grad_norm": 0.38671875, + "learning_rate": 4.839636007423761e-06, + "loss": 2.2326, + "step": 13159 + }, + { + "epoch": 0.7060085836909872, + "grad_norm": 1.5390625, + "learning_rate": 4.839605391875183e-06, + "loss": 2.3679, + "step": 13160 + }, + { + "epoch": 0.7060622317596567, + "grad_norm": 0.49609375, + "learning_rate": 4.839574773501289e-06, + "loss": 2.5879, + "step": 13161 + }, + { + "epoch": 0.7061158798283261, + "grad_norm": 0.51953125, + "learning_rate": 4.839544152302115e-06, + "loss": 2.1864, + "step": 13162 + }, + { + "epoch": 0.7061695278969957, + "grad_norm": 0.41796875, + "learning_rate": 4.839513528277698e-06, + "loss": 2.3425, + "step": 13163 + }, + { + "epoch": 0.7062231759656652, + "grad_norm": 0.546875, + "learning_rate": 4.839482901428077e-06, + "loss": 2.1047, + "step": 13164 + }, + { + "epoch": 0.7062768240343348, + "grad_norm": 0.52734375, + "learning_rate": 4.839452271753287e-06, + "loss": 2.3252, + "step": 13165 + }, + { + "epoch": 0.7063304721030043, + "grad_norm": 0.4375, + "learning_rate": 4.839421639253365e-06, + "loss": 2.4954, + "step": 13166 + }, + { + "epoch": 0.7063841201716738, + "grad_norm": 0.453125, + "learning_rate": 4.83939100392835e-06, + "loss": 2.4642, + "step": 13167 + }, + { + "epoch": 0.7064377682403433, + "grad_norm": 0.48828125, + "learning_rate": 4.839360365778277e-06, + "loss": 2.2634, + "step": 13168 + }, + { + "epoch": 0.7064914163090129, + "grad_norm": 0.376953125, + "learning_rate": 4.839329724803183e-06, + "loss": 2.4843, + "step": 13169 + }, + { + "epoch": 0.7065450643776824, + "grad_norm": 0.38671875, + "learning_rate": 4.839299081003107e-06, + "loss": 2.1934, + "step": 13170 + }, + { + "epoch": 0.706598712446352, + "grad_norm": 0.42578125, + "learning_rate": 4.839268434378084e-06, + "loss": 2.1523, + "step": 13171 + }, + { + "epoch": 0.7066523605150214, + "grad_norm": 0.42578125, + "learning_rate": 4.839237784928152e-06, + "loss": 1.9449, + "step": 13172 + }, + { + "epoch": 0.706706008583691, + "grad_norm": 0.494140625, + "learning_rate": 4.839207132653347e-06, + "loss": 2.4337, + "step": 13173 + }, + { + "epoch": 0.7067596566523605, + "grad_norm": 0.4296875, + "learning_rate": 4.8391764775537074e-06, + "loss": 2.2556, + "step": 13174 + }, + { + "epoch": 0.70681330472103, + "grad_norm": 0.478515625, + "learning_rate": 4.839145819629269e-06, + "loss": 2.181, + "step": 13175 + }, + { + "epoch": 0.7068669527896996, + "grad_norm": 0.55078125, + "learning_rate": 4.83911515888007e-06, + "loss": 2.4915, + "step": 13176 + }, + { + "epoch": 0.706920600858369, + "grad_norm": 0.56640625, + "learning_rate": 4.839084495306146e-06, + "loss": 2.3079, + "step": 13177 + }, + { + "epoch": 0.7069742489270386, + "grad_norm": 0.478515625, + "learning_rate": 4.8390538289075354e-06, + "loss": 1.5921, + "step": 13178 + }, + { + "epoch": 0.7070278969957081, + "grad_norm": 0.431640625, + "learning_rate": 4.839023159684274e-06, + "loss": 2.2194, + "step": 13179 + }, + { + "epoch": 0.7070815450643777, + "grad_norm": 0.439453125, + "learning_rate": 4.8389924876364005e-06, + "loss": 2.3039, + "step": 13180 + }, + { + "epoch": 0.7071351931330472, + "grad_norm": 0.46875, + "learning_rate": 4.838961812763949e-06, + "loss": 2.4356, + "step": 13181 + }, + { + "epoch": 0.7071888412017168, + "grad_norm": 0.44921875, + "learning_rate": 4.838931135066961e-06, + "loss": 2.403, + "step": 13182 + }, + { + "epoch": 0.7072424892703862, + "grad_norm": 0.46484375, + "learning_rate": 4.838900454545469e-06, + "loss": 1.5847, + "step": 13183 + }, + { + "epoch": 0.7072961373390558, + "grad_norm": 0.486328125, + "learning_rate": 4.838869771199513e-06, + "loss": 2.3879, + "step": 13184 + }, + { + "epoch": 0.7073497854077253, + "grad_norm": 0.435546875, + "learning_rate": 4.838839085029129e-06, + "loss": 2.1656, + "step": 13185 + }, + { + "epoch": 0.7074034334763949, + "grad_norm": 1.078125, + "learning_rate": 4.838808396034354e-06, + "loss": 2.4373, + "step": 13186 + }, + { + "epoch": 0.7074570815450644, + "grad_norm": 0.5078125, + "learning_rate": 4.838777704215225e-06, + "loss": 1.6554, + "step": 13187 + }, + { + "epoch": 0.707510729613734, + "grad_norm": 0.421875, + "learning_rate": 4.838747009571779e-06, + "loss": 2.408, + "step": 13188 + }, + { + "epoch": 0.7075643776824034, + "grad_norm": 0.4765625, + "learning_rate": 4.8387163121040544e-06, + "loss": 2.1601, + "step": 13189 + }, + { + "epoch": 0.7076180257510729, + "grad_norm": 0.482421875, + "learning_rate": 4.8386856118120864e-06, + "loss": 2.1976, + "step": 13190 + }, + { + "epoch": 0.7076716738197425, + "grad_norm": 0.51171875, + "learning_rate": 4.838654908695913e-06, + "loss": 2.2599, + "step": 13191 + }, + { + "epoch": 0.707725321888412, + "grad_norm": 0.453125, + "learning_rate": 4.838624202755571e-06, + "loss": 2.3179, + "step": 13192 + }, + { + "epoch": 0.7077789699570816, + "grad_norm": 0.447265625, + "learning_rate": 4.838593493991099e-06, + "loss": 2.376, + "step": 13193 + }, + { + "epoch": 0.707832618025751, + "grad_norm": 0.49609375, + "learning_rate": 4.838562782402531e-06, + "loss": 2.4011, + "step": 13194 + }, + { + "epoch": 0.7078862660944206, + "grad_norm": 0.51171875, + "learning_rate": 4.838532067989907e-06, + "loss": 2.4036, + "step": 13195 + }, + { + "epoch": 0.7079399141630901, + "grad_norm": 0.7421875, + "learning_rate": 4.838501350753262e-06, + "loss": 2.3763, + "step": 13196 + }, + { + "epoch": 0.7079935622317597, + "grad_norm": 0.578125, + "learning_rate": 4.838470630692634e-06, + "loss": 2.4551, + "step": 13197 + }, + { + "epoch": 0.7080472103004292, + "grad_norm": 0.45703125, + "learning_rate": 4.838439907808061e-06, + "loss": 1.6532, + "step": 13198 + }, + { + "epoch": 0.7081008583690988, + "grad_norm": 0.46484375, + "learning_rate": 4.838409182099579e-06, + "loss": 2.371, + "step": 13199 + }, + { + "epoch": 0.7081545064377682, + "grad_norm": 0.43359375, + "learning_rate": 4.838378453567225e-06, + "loss": 2.3581, + "step": 13200 + }, + { + "epoch": 0.7082081545064378, + "grad_norm": 0.41015625, + "learning_rate": 4.838347722211037e-06, + "loss": 2.4372, + "step": 13201 + }, + { + "epoch": 0.7082618025751073, + "grad_norm": 0.4609375, + "learning_rate": 4.83831698803105e-06, + "loss": 2.3341, + "step": 13202 + }, + { + "epoch": 0.7083154506437769, + "grad_norm": 0.46875, + "learning_rate": 4.838286251027303e-06, + "loss": 2.3492, + "step": 13203 + }, + { + "epoch": 0.7083690987124464, + "grad_norm": 0.44140625, + "learning_rate": 4.8382555111998345e-06, + "loss": 2.4038, + "step": 13204 + }, + { + "epoch": 0.7084227467811158, + "grad_norm": 0.388671875, + "learning_rate": 4.838224768548678e-06, + "loss": 2.1305, + "step": 13205 + }, + { + "epoch": 0.7084763948497854, + "grad_norm": 0.4296875, + "learning_rate": 4.838194023073874e-06, + "loss": 1.997, + "step": 13206 + }, + { + "epoch": 0.7085300429184549, + "grad_norm": 0.431640625, + "learning_rate": 4.838163274775457e-06, + "loss": 2.1748, + "step": 13207 + }, + { + "epoch": 0.7085836909871245, + "grad_norm": 0.4375, + "learning_rate": 4.838132523653466e-06, + "loss": 2.3011, + "step": 13208 + }, + { + "epoch": 0.708637339055794, + "grad_norm": 0.482421875, + "learning_rate": 4.838101769707937e-06, + "loss": 2.1979, + "step": 13209 + }, + { + "epoch": 0.7086909871244635, + "grad_norm": 0.5390625, + "learning_rate": 4.838071012938908e-06, + "loss": 2.3135, + "step": 13210 + }, + { + "epoch": 0.708744635193133, + "grad_norm": 0.4375, + "learning_rate": 4.838040253346415e-06, + "loss": 2.3451, + "step": 13211 + }, + { + "epoch": 0.7087982832618026, + "grad_norm": 0.45703125, + "learning_rate": 4.838009490930496e-06, + "loss": 2.3695, + "step": 13212 + }, + { + "epoch": 0.7088519313304721, + "grad_norm": 0.4921875, + "learning_rate": 4.837978725691188e-06, + "loss": 2.334, + "step": 13213 + }, + { + "epoch": 0.7089055793991417, + "grad_norm": 0.5234375, + "learning_rate": 4.837947957628528e-06, + "loss": 2.1642, + "step": 13214 + }, + { + "epoch": 0.7089592274678111, + "grad_norm": 0.486328125, + "learning_rate": 4.837917186742553e-06, + "loss": 2.4229, + "step": 13215 + }, + { + "epoch": 0.7090128755364807, + "grad_norm": 0.4296875, + "learning_rate": 4.8378864130333016e-06, + "loss": 2.2436, + "step": 13216 + }, + { + "epoch": 0.7090665236051502, + "grad_norm": 0.4921875, + "learning_rate": 4.837855636500809e-06, + "loss": 2.2395, + "step": 13217 + }, + { + "epoch": 0.7091201716738197, + "grad_norm": 1.28125, + "learning_rate": 4.837824857145114e-06, + "loss": 2.4306, + "step": 13218 + }, + { + "epoch": 0.7091738197424893, + "grad_norm": 0.50390625, + "learning_rate": 4.837794074966252e-06, + "loss": 2.2364, + "step": 13219 + }, + { + "epoch": 0.7092274678111588, + "grad_norm": 0.431640625, + "learning_rate": 4.837763289964261e-06, + "loss": 2.4481, + "step": 13220 + }, + { + "epoch": 0.7092811158798283, + "grad_norm": 0.41015625, + "learning_rate": 4.837732502139179e-06, + "loss": 2.3769, + "step": 13221 + }, + { + "epoch": 0.7093347639484978, + "grad_norm": 0.427734375, + "learning_rate": 4.837701711491042e-06, + "loss": 2.3581, + "step": 13222 + }, + { + "epoch": 0.7093884120171674, + "grad_norm": 0.466796875, + "learning_rate": 4.8376709180198875e-06, + "loss": 2.2127, + "step": 13223 + }, + { + "epoch": 0.7094420600858369, + "grad_norm": 0.8515625, + "learning_rate": 4.837640121725754e-06, + "loss": 1.7498, + "step": 13224 + }, + { + "epoch": 0.7094957081545065, + "grad_norm": 0.44140625, + "learning_rate": 4.837609322608676e-06, + "loss": 2.5095, + "step": 13225 + }, + { + "epoch": 0.7095493562231759, + "grad_norm": 0.47265625, + "learning_rate": 4.837578520668693e-06, + "loss": 2.2571, + "step": 13226 + }, + { + "epoch": 0.7096030042918455, + "grad_norm": 0.5859375, + "learning_rate": 4.837547715905842e-06, + "loss": 2.6914, + "step": 13227 + }, + { + "epoch": 0.709656652360515, + "grad_norm": 0.79296875, + "learning_rate": 4.837516908320159e-06, + "loss": 2.1805, + "step": 13228 + }, + { + "epoch": 0.7097103004291846, + "grad_norm": 0.5390625, + "learning_rate": 4.837486097911681e-06, + "loss": 2.4225, + "step": 13229 + }, + { + "epoch": 0.7097639484978541, + "grad_norm": 0.4453125, + "learning_rate": 4.837455284680447e-06, + "loss": 2.1587, + "step": 13230 + }, + { + "epoch": 0.7098175965665237, + "grad_norm": 0.416015625, + "learning_rate": 4.837424468626493e-06, + "loss": 2.1864, + "step": 13231 + }, + { + "epoch": 0.7098712446351931, + "grad_norm": 0.384765625, + "learning_rate": 4.837393649749856e-06, + "loss": 2.4066, + "step": 13232 + }, + { + "epoch": 0.7099248927038626, + "grad_norm": 0.4375, + "learning_rate": 4.8373628280505745e-06, + "loss": 2.2481, + "step": 13233 + }, + { + "epoch": 0.7099785407725322, + "grad_norm": 0.458984375, + "learning_rate": 4.8373320035286845e-06, + "loss": 2.0951, + "step": 13234 + }, + { + "epoch": 0.7100321888412017, + "grad_norm": 0.4140625, + "learning_rate": 4.837301176184224e-06, + "loss": 2.4905, + "step": 13235 + }, + { + "epoch": 0.7100858369098713, + "grad_norm": 0.54296875, + "learning_rate": 4.8372703460172295e-06, + "loss": 2.3645, + "step": 13236 + }, + { + "epoch": 0.7101394849785407, + "grad_norm": 0.439453125, + "learning_rate": 4.837239513027738e-06, + "loss": 2.2793, + "step": 13237 + }, + { + "epoch": 0.7101931330472103, + "grad_norm": 0.392578125, + "learning_rate": 4.8372086772157885e-06, + "loss": 2.0872, + "step": 13238 + }, + { + "epoch": 0.7102467811158798, + "grad_norm": 0.419921875, + "learning_rate": 4.8371778385814164e-06, + "loss": 2.0936, + "step": 13239 + }, + { + "epoch": 0.7103004291845494, + "grad_norm": 0.416015625, + "learning_rate": 4.8371469971246595e-06, + "loss": 2.2724, + "step": 13240 + }, + { + "epoch": 0.7103540772532189, + "grad_norm": 0.46875, + "learning_rate": 4.837116152845555e-06, + "loss": 2.2577, + "step": 13241 + }, + { + "epoch": 0.7104077253218885, + "grad_norm": 0.494140625, + "learning_rate": 4.837085305744141e-06, + "loss": 2.2883, + "step": 13242 + }, + { + "epoch": 0.7104613733905579, + "grad_norm": 0.44921875, + "learning_rate": 4.837054455820453e-06, + "loss": 2.4285, + "step": 13243 + }, + { + "epoch": 0.7105150214592275, + "grad_norm": 0.48046875, + "learning_rate": 4.83702360307453e-06, + "loss": 2.4403, + "step": 13244 + }, + { + "epoch": 0.710568669527897, + "grad_norm": 0.4140625, + "learning_rate": 4.8369927475064085e-06, + "loss": 1.8247, + "step": 13245 + }, + { + "epoch": 0.7106223175965666, + "grad_norm": 0.61328125, + "learning_rate": 4.836961889116125e-06, + "loss": 2.3418, + "step": 13246 + }, + { + "epoch": 0.710675965665236, + "grad_norm": 0.53125, + "learning_rate": 4.836931027903718e-06, + "loss": 2.3362, + "step": 13247 + }, + { + "epoch": 0.7107296137339055, + "grad_norm": 0.466796875, + "learning_rate": 4.8369001638692244e-06, + "loss": 2.4442, + "step": 13248 + }, + { + "epoch": 0.7107832618025751, + "grad_norm": 0.578125, + "learning_rate": 4.836869297012683e-06, + "loss": 2.4225, + "step": 13249 + }, + { + "epoch": 0.7108369098712446, + "grad_norm": 0.87890625, + "learning_rate": 4.836838427334127e-06, + "loss": 2.2148, + "step": 13250 + }, + { + "epoch": 0.7108905579399142, + "grad_norm": 0.462890625, + "learning_rate": 4.836807554833597e-06, + "loss": 2.4094, + "step": 13251 + }, + { + "epoch": 0.7109442060085837, + "grad_norm": 0.55859375, + "learning_rate": 4.83677667951113e-06, + "loss": 2.0893, + "step": 13252 + }, + { + "epoch": 0.7109978540772532, + "grad_norm": 0.447265625, + "learning_rate": 4.836745801366762e-06, + "loss": 2.2455, + "step": 13253 + }, + { + "epoch": 0.7110515021459227, + "grad_norm": 0.765625, + "learning_rate": 4.836714920400531e-06, + "loss": 2.287, + "step": 13254 + }, + { + "epoch": 0.7111051502145923, + "grad_norm": 0.478515625, + "learning_rate": 4.836684036612476e-06, + "loss": 2.4615, + "step": 13255 + }, + { + "epoch": 0.7111587982832618, + "grad_norm": 0.4140625, + "learning_rate": 4.83665315000263e-06, + "loss": 2.1939, + "step": 13256 + }, + { + "epoch": 0.7112124463519314, + "grad_norm": 0.390625, + "learning_rate": 4.836622260571035e-06, + "loss": 1.9236, + "step": 13257 + }, + { + "epoch": 0.7112660944206008, + "grad_norm": 0.41015625, + "learning_rate": 4.836591368317726e-06, + "loss": 2.4249, + "step": 13258 + }, + { + "epoch": 0.7113197424892704, + "grad_norm": 0.490234375, + "learning_rate": 4.836560473242739e-06, + "loss": 2.6043, + "step": 13259 + }, + { + "epoch": 0.7113733905579399, + "grad_norm": 0.48828125, + "learning_rate": 4.836529575346114e-06, + "loss": 2.0595, + "step": 13260 + }, + { + "epoch": 0.7114270386266094, + "grad_norm": 0.455078125, + "learning_rate": 4.836498674627886e-06, + "loss": 2.4907, + "step": 13261 + }, + { + "epoch": 0.711480686695279, + "grad_norm": 0.416015625, + "learning_rate": 4.836467771088095e-06, + "loss": 2.1509, + "step": 13262 + }, + { + "epoch": 0.7115343347639485, + "grad_norm": 0.55078125, + "learning_rate": 4.836436864726775e-06, + "loss": 2.204, + "step": 13263 + }, + { + "epoch": 0.711587982832618, + "grad_norm": 0.3671875, + "learning_rate": 4.836405955543967e-06, + "loss": 2.239, + "step": 13264 + }, + { + "epoch": 0.7116416309012875, + "grad_norm": 0.48046875, + "learning_rate": 4.836375043539706e-06, + "loss": 2.281, + "step": 13265 + }, + { + "epoch": 0.7116952789699571, + "grad_norm": 0.447265625, + "learning_rate": 4.836344128714029e-06, + "loss": 2.2542, + "step": 13266 + }, + { + "epoch": 0.7117489270386266, + "grad_norm": 0.68359375, + "learning_rate": 4.836313211066974e-06, + "loss": 2.3517, + "step": 13267 + }, + { + "epoch": 0.7118025751072962, + "grad_norm": 0.70703125, + "learning_rate": 4.836282290598579e-06, + "loss": 2.303, + "step": 13268 + }, + { + "epoch": 0.7118562231759656, + "grad_norm": 0.4296875, + "learning_rate": 4.836251367308881e-06, + "loss": 2.5071, + "step": 13269 + }, + { + "epoch": 0.7119098712446352, + "grad_norm": 0.43359375, + "learning_rate": 4.836220441197916e-06, + "loss": 2.3399, + "step": 13270 + }, + { + "epoch": 0.7119635193133047, + "grad_norm": 0.470703125, + "learning_rate": 4.836189512265723e-06, + "loss": 2.4055, + "step": 13271 + }, + { + "epoch": 0.7120171673819743, + "grad_norm": 0.466796875, + "learning_rate": 4.836158580512339e-06, + "loss": 2.1513, + "step": 13272 + }, + { + "epoch": 0.7120708154506438, + "grad_norm": 0.486328125, + "learning_rate": 4.8361276459378005e-06, + "loss": 2.378, + "step": 13273 + }, + { + "epoch": 0.7121244635193134, + "grad_norm": 0.384765625, + "learning_rate": 4.8360967085421465e-06, + "loss": 2.2389, + "step": 13274 + }, + { + "epoch": 0.7121781115879828, + "grad_norm": 1.703125, + "learning_rate": 4.836065768325413e-06, + "loss": 2.3935, + "step": 13275 + }, + { + "epoch": 0.7122317596566523, + "grad_norm": 0.5, + "learning_rate": 4.8360348252876375e-06, + "loss": 2.2135, + "step": 13276 + }, + { + "epoch": 0.7122854077253219, + "grad_norm": 0.47265625, + "learning_rate": 4.836003879428857e-06, + "loss": 2.2611, + "step": 13277 + }, + { + "epoch": 0.7123390557939914, + "grad_norm": 0.53515625, + "learning_rate": 4.835972930749111e-06, + "loss": 2.1137, + "step": 13278 + }, + { + "epoch": 0.712392703862661, + "grad_norm": 0.5625, + "learning_rate": 4.835941979248434e-06, + "loss": 2.0992, + "step": 13279 + }, + { + "epoch": 0.7124463519313304, + "grad_norm": 0.625, + "learning_rate": 4.835911024926865e-06, + "loss": 2.2138, + "step": 13280 + }, + { + "epoch": 0.7125, + "grad_norm": 0.427734375, + "learning_rate": 4.835880067784441e-06, + "loss": 2.368, + "step": 13281 + }, + { + "epoch": 0.7125536480686695, + "grad_norm": 0.41015625, + "learning_rate": 4.8358491078212e-06, + "loss": 2.3086, + "step": 13282 + }, + { + "epoch": 0.7126072961373391, + "grad_norm": 0.4140625, + "learning_rate": 4.835818145037179e-06, + "loss": 2.1532, + "step": 13283 + }, + { + "epoch": 0.7126609442060086, + "grad_norm": 0.447265625, + "learning_rate": 4.835787179432414e-06, + "loss": 2.0909, + "step": 13284 + }, + { + "epoch": 0.7127145922746781, + "grad_norm": 0.423828125, + "learning_rate": 4.835756211006945e-06, + "loss": 2.1154, + "step": 13285 + }, + { + "epoch": 0.7127682403433476, + "grad_norm": 0.4609375, + "learning_rate": 4.8357252397608074e-06, + "loss": 2.2667, + "step": 13286 + }, + { + "epoch": 0.7128218884120172, + "grad_norm": 0.46484375, + "learning_rate": 4.835694265694039e-06, + "loss": 2.3038, + "step": 13287 + }, + { + "epoch": 0.7128755364806867, + "grad_norm": 0.4453125, + "learning_rate": 4.835663288806678e-06, + "loss": 2.4378, + "step": 13288 + }, + { + "epoch": 0.7129291845493563, + "grad_norm": 1.046875, + "learning_rate": 4.835632309098761e-06, + "loss": 2.1537, + "step": 13289 + }, + { + "epoch": 0.7129828326180258, + "grad_norm": 0.470703125, + "learning_rate": 4.835601326570326e-06, + "loss": 2.4043, + "step": 13290 + }, + { + "epoch": 0.7130364806866952, + "grad_norm": 0.498046875, + "learning_rate": 4.835570341221409e-06, + "loss": 2.5134, + "step": 13291 + }, + { + "epoch": 0.7130901287553648, + "grad_norm": 0.50390625, + "learning_rate": 4.83553935305205e-06, + "loss": 1.5813, + "step": 13292 + }, + { + "epoch": 0.7131437768240343, + "grad_norm": 0.451171875, + "learning_rate": 4.835508362062284e-06, + "loss": 2.2004, + "step": 13293 + }, + { + "epoch": 0.7131974248927039, + "grad_norm": 0.341796875, + "learning_rate": 4.83547736825215e-06, + "loss": 1.6695, + "step": 13294 + }, + { + "epoch": 0.7132510729613734, + "grad_norm": 0.4453125, + "learning_rate": 4.835446371621685e-06, + "loss": 2.3222, + "step": 13295 + }, + { + "epoch": 0.7133047210300429, + "grad_norm": 0.44140625, + "learning_rate": 4.835415372170925e-06, + "loss": 1.9533, + "step": 13296 + }, + { + "epoch": 0.7133583690987124, + "grad_norm": 0.390625, + "learning_rate": 4.835384369899909e-06, + "loss": 2.2421, + "step": 13297 + }, + { + "epoch": 0.713412017167382, + "grad_norm": 0.49609375, + "learning_rate": 4.835353364808675e-06, + "loss": 2.1799, + "step": 13298 + }, + { + "epoch": 0.7134656652360515, + "grad_norm": 2.453125, + "learning_rate": 4.835322356897258e-06, + "loss": 2.3855, + "step": 13299 + }, + { + "epoch": 0.7135193133047211, + "grad_norm": 0.4375, + "learning_rate": 4.835291346165698e-06, + "loss": 2.2717, + "step": 13300 + }, + { + "epoch": 0.7135729613733905, + "grad_norm": 0.62109375, + "learning_rate": 4.835260332614031e-06, + "loss": 2.4662, + "step": 13301 + }, + { + "epoch": 0.7136266094420601, + "grad_norm": 0.439453125, + "learning_rate": 4.835229316242296e-06, + "loss": 2.2045, + "step": 13302 + }, + { + "epoch": 0.7136802575107296, + "grad_norm": 0.482421875, + "learning_rate": 4.835198297050529e-06, + "loss": 2.4156, + "step": 13303 + }, + { + "epoch": 0.7137339055793992, + "grad_norm": 0.63671875, + "learning_rate": 4.835167275038766e-06, + "loss": 2.2602, + "step": 13304 + }, + { + "epoch": 0.7137875536480687, + "grad_norm": 0.41796875, + "learning_rate": 4.8351362502070475e-06, + "loss": 2.4519, + "step": 13305 + }, + { + "epoch": 0.7138412017167381, + "grad_norm": 0.49609375, + "learning_rate": 4.83510522255541e-06, + "loss": 2.3057, + "step": 13306 + }, + { + "epoch": 0.7138948497854077, + "grad_norm": 0.46484375, + "learning_rate": 4.83507419208389e-06, + "loss": 2.315, + "step": 13307 + }, + { + "epoch": 0.7139484978540772, + "grad_norm": 0.48828125, + "learning_rate": 4.835043158792526e-06, + "loss": 1.5186, + "step": 13308 + }, + { + "epoch": 0.7140021459227468, + "grad_norm": 0.3984375, + "learning_rate": 4.835012122681356e-06, + "loss": 2.2927, + "step": 13309 + }, + { + "epoch": 0.7140557939914163, + "grad_norm": 0.515625, + "learning_rate": 4.834981083750415e-06, + "loss": 2.4548, + "step": 13310 + }, + { + "epoch": 0.7141094420600859, + "grad_norm": 0.447265625, + "learning_rate": 4.834950041999743e-06, + "loss": 2.2832, + "step": 13311 + }, + { + "epoch": 0.7141630901287553, + "grad_norm": 0.640625, + "learning_rate": 4.8349189974293765e-06, + "loss": 2.1389, + "step": 13312 + }, + { + "epoch": 0.7142167381974249, + "grad_norm": 0.4453125, + "learning_rate": 4.834887950039353e-06, + "loss": 2.1926, + "step": 13313 + }, + { + "epoch": 0.7142703862660944, + "grad_norm": 0.390625, + "learning_rate": 4.83485689982971e-06, + "loss": 2.1311, + "step": 13314 + }, + { + "epoch": 0.714324034334764, + "grad_norm": 0.4140625, + "learning_rate": 4.834825846800486e-06, + "loss": 2.1684, + "step": 13315 + }, + { + "epoch": 0.7143776824034335, + "grad_norm": 0.40625, + "learning_rate": 4.834794790951716e-06, + "loss": 2.2876, + "step": 13316 + }, + { + "epoch": 0.714431330472103, + "grad_norm": 0.453125, + "learning_rate": 4.834763732283439e-06, + "loss": 2.3351, + "step": 13317 + }, + { + "epoch": 0.7144849785407725, + "grad_norm": 0.451171875, + "learning_rate": 4.834732670795694e-06, + "loss": 2.2773, + "step": 13318 + }, + { + "epoch": 0.714538626609442, + "grad_norm": 0.51171875, + "learning_rate": 4.834701606488517e-06, + "loss": 2.4611, + "step": 13319 + }, + { + "epoch": 0.7145922746781116, + "grad_norm": 0.40625, + "learning_rate": 4.834670539361944e-06, + "loss": 2.3108, + "step": 13320 + }, + { + "epoch": 0.7146459227467811, + "grad_norm": 0.48046875, + "learning_rate": 4.834639469416016e-06, + "loss": 2.4186, + "step": 13321 + }, + { + "epoch": 0.7146995708154507, + "grad_norm": 0.3515625, + "learning_rate": 4.834608396650768e-06, + "loss": 1.8473, + "step": 13322 + }, + { + "epoch": 0.7147532188841201, + "grad_norm": 0.453125, + "learning_rate": 4.834577321066238e-06, + "loss": 2.2146, + "step": 13323 + }, + { + "epoch": 0.7148068669527897, + "grad_norm": 0.392578125, + "learning_rate": 4.834546242662463e-06, + "loss": 2.3335, + "step": 13324 + }, + { + "epoch": 0.7148605150214592, + "grad_norm": 0.57421875, + "learning_rate": 4.834515161439482e-06, + "loss": 2.0955, + "step": 13325 + }, + { + "epoch": 0.7149141630901288, + "grad_norm": 0.4921875, + "learning_rate": 4.834484077397331e-06, + "loss": 2.265, + "step": 13326 + }, + { + "epoch": 0.7149678111587983, + "grad_norm": 0.421875, + "learning_rate": 4.834452990536049e-06, + "loss": 1.6156, + "step": 13327 + }, + { + "epoch": 0.7150214592274678, + "grad_norm": 0.3984375, + "learning_rate": 4.834421900855673e-06, + "loss": 2.413, + "step": 13328 + }, + { + "epoch": 0.7150751072961373, + "grad_norm": 0.515625, + "learning_rate": 4.8343908083562396e-06, + "loss": 2.2073, + "step": 13329 + }, + { + "epoch": 0.7151287553648069, + "grad_norm": 0.376953125, + "learning_rate": 4.8343597130377875e-06, + "loss": 2.3023, + "step": 13330 + }, + { + "epoch": 0.7151824034334764, + "grad_norm": 0.44921875, + "learning_rate": 4.834328614900353e-06, + "loss": 2.1491, + "step": 13331 + }, + { + "epoch": 0.715236051502146, + "grad_norm": 0.41796875, + "learning_rate": 4.834297513943976e-06, + "loss": 2.3602, + "step": 13332 + }, + { + "epoch": 0.7152896995708155, + "grad_norm": 0.47265625, + "learning_rate": 4.834266410168692e-06, + "loss": 2.2802, + "step": 13333 + }, + { + "epoch": 0.7153433476394849, + "grad_norm": 0.609375, + "learning_rate": 4.834235303574539e-06, + "loss": 2.2269, + "step": 13334 + }, + { + "epoch": 0.7153969957081545, + "grad_norm": 0.77734375, + "learning_rate": 4.834204194161555e-06, + "loss": 2.3514, + "step": 13335 + }, + { + "epoch": 0.715450643776824, + "grad_norm": 0.46484375, + "learning_rate": 4.8341730819297765e-06, + "loss": 2.2619, + "step": 13336 + }, + { + "epoch": 0.7155042918454936, + "grad_norm": 3.0625, + "learning_rate": 4.834141966879242e-06, + "loss": 2.2514, + "step": 13337 + }, + { + "epoch": 0.715557939914163, + "grad_norm": 0.5078125, + "learning_rate": 4.83411084900999e-06, + "loss": 2.3141, + "step": 13338 + }, + { + "epoch": 0.7156115879828326, + "grad_norm": 0.50390625, + "learning_rate": 4.8340797283220565e-06, + "loss": 2.6727, + "step": 13339 + }, + { + "epoch": 0.7156652360515021, + "grad_norm": 0.43359375, + "learning_rate": 4.834048604815479e-06, + "loss": 2.3489, + "step": 13340 + }, + { + "epoch": 0.7157188841201717, + "grad_norm": 0.470703125, + "learning_rate": 4.8340174784902964e-06, + "loss": 2.1859, + "step": 13341 + }, + { + "epoch": 0.7157725321888412, + "grad_norm": 0.49609375, + "learning_rate": 4.833986349346544e-06, + "loss": 2.3274, + "step": 13342 + }, + { + "epoch": 0.7158261802575108, + "grad_norm": 0.447265625, + "learning_rate": 4.833955217384263e-06, + "loss": 2.0564, + "step": 13343 + }, + { + "epoch": 0.7158798283261802, + "grad_norm": 0.478515625, + "learning_rate": 4.833924082603488e-06, + "loss": 2.2118, + "step": 13344 + }, + { + "epoch": 0.7159334763948498, + "grad_norm": 0.52734375, + "learning_rate": 4.833892945004257e-06, + "loss": 2.5211, + "step": 13345 + }, + { + "epoch": 0.7159871244635193, + "grad_norm": 0.474609375, + "learning_rate": 4.83386180458661e-06, + "loss": 2.342, + "step": 13346 + }, + { + "epoch": 0.7160407725321889, + "grad_norm": 0.92578125, + "learning_rate": 4.8338306613505805e-06, + "loss": 2.1839, + "step": 13347 + }, + { + "epoch": 0.7160944206008584, + "grad_norm": 0.482421875, + "learning_rate": 4.833799515296209e-06, + "loss": 2.5219, + "step": 13348 + }, + { + "epoch": 0.7161480686695278, + "grad_norm": 0.443359375, + "learning_rate": 4.833768366423533e-06, + "loss": 2.2125, + "step": 13349 + }, + { + "epoch": 0.7162017167381974, + "grad_norm": 0.44921875, + "learning_rate": 4.833737214732589e-06, + "loss": 2.385, + "step": 13350 + }, + { + "epoch": 0.7162553648068669, + "grad_norm": 0.46484375, + "learning_rate": 4.8337060602234155e-06, + "loss": 2.3724, + "step": 13351 + }, + { + "epoch": 0.7163090128755365, + "grad_norm": 0.50390625, + "learning_rate": 4.83367490289605e-06, + "loss": 2.5186, + "step": 13352 + }, + { + "epoch": 0.716362660944206, + "grad_norm": 0.4375, + "learning_rate": 4.83364374275053e-06, + "loss": 2.2392, + "step": 13353 + }, + { + "epoch": 0.7164163090128756, + "grad_norm": 0.447265625, + "learning_rate": 4.8336125797868925e-06, + "loss": 2.8449, + "step": 13354 + }, + { + "epoch": 0.716469957081545, + "grad_norm": 0.47265625, + "learning_rate": 4.8335814140051765e-06, + "loss": 2.2814, + "step": 13355 + }, + { + "epoch": 0.7165236051502146, + "grad_norm": 0.470703125, + "learning_rate": 4.8335502454054175e-06, + "loss": 2.2993, + "step": 13356 + }, + { + "epoch": 0.7165772532188841, + "grad_norm": 1.0390625, + "learning_rate": 4.833519073987656e-06, + "loss": 2.1736, + "step": 13357 + }, + { + "epoch": 0.7166309012875537, + "grad_norm": 0.59375, + "learning_rate": 4.833487899751927e-06, + "loss": 1.7813, + "step": 13358 + }, + { + "epoch": 0.7166845493562232, + "grad_norm": 0.40625, + "learning_rate": 4.83345672269827e-06, + "loss": 2.412, + "step": 13359 + }, + { + "epoch": 0.7167381974248928, + "grad_norm": 0.4375, + "learning_rate": 4.833425542826722e-06, + "loss": 2.317, + "step": 13360 + }, + { + "epoch": 0.7167918454935622, + "grad_norm": 0.5546875, + "learning_rate": 4.83339436013732e-06, + "loss": 2.2403, + "step": 13361 + }, + { + "epoch": 0.7168454935622317, + "grad_norm": 0.482421875, + "learning_rate": 4.8333631746301025e-06, + "loss": 2.277, + "step": 13362 + }, + { + "epoch": 0.7168991416309013, + "grad_norm": 0.39453125, + "learning_rate": 4.833331986305106e-06, + "loss": 2.2708, + "step": 13363 + }, + { + "epoch": 0.7169527896995708, + "grad_norm": 0.53515625, + "learning_rate": 4.833300795162371e-06, + "loss": 1.8568, + "step": 13364 + }, + { + "epoch": 0.7170064377682404, + "grad_norm": 0.419921875, + "learning_rate": 4.833269601201931e-06, + "loss": 2.281, + "step": 13365 + }, + { + "epoch": 0.7170600858369098, + "grad_norm": 0.55078125, + "learning_rate": 4.833238404423828e-06, + "loss": 2.1582, + "step": 13366 + }, + { + "epoch": 0.7171137339055794, + "grad_norm": 0.458984375, + "learning_rate": 4.833207204828097e-06, + "loss": 2.4328, + "step": 13367 + }, + { + "epoch": 0.7171673819742489, + "grad_norm": 0.56640625, + "learning_rate": 4.833176002414775e-06, + "loss": 2.2704, + "step": 13368 + }, + { + "epoch": 0.7172210300429185, + "grad_norm": 0.4296875, + "learning_rate": 4.833144797183902e-06, + "loss": 2.2496, + "step": 13369 + }, + { + "epoch": 0.717274678111588, + "grad_norm": 0.43359375, + "learning_rate": 4.833113589135514e-06, + "loss": 2.5104, + "step": 13370 + }, + { + "epoch": 0.7173283261802575, + "grad_norm": 0.412109375, + "learning_rate": 4.833082378269649e-06, + "loss": 2.3002, + "step": 13371 + }, + { + "epoch": 0.717381974248927, + "grad_norm": 0.625, + "learning_rate": 4.833051164586346e-06, + "loss": 1.4585, + "step": 13372 + }, + { + "epoch": 0.7174356223175966, + "grad_norm": 0.50390625, + "learning_rate": 4.833019948085641e-06, + "loss": 2.2905, + "step": 13373 + }, + { + "epoch": 0.7174892703862661, + "grad_norm": 0.50390625, + "learning_rate": 4.832988728767573e-06, + "loss": 2.3892, + "step": 13374 + }, + { + "epoch": 0.7175429184549357, + "grad_norm": 0.490234375, + "learning_rate": 4.832957506632179e-06, + "loss": 2.151, + "step": 13375 + }, + { + "epoch": 0.7175965665236052, + "grad_norm": 0.75390625, + "learning_rate": 4.832926281679496e-06, + "loss": 1.9873, + "step": 13376 + }, + { + "epoch": 0.7176502145922746, + "grad_norm": 0.69140625, + "learning_rate": 4.832895053909563e-06, + "loss": 2.3781, + "step": 13377 + }, + { + "epoch": 0.7177038626609442, + "grad_norm": 0.44921875, + "learning_rate": 4.832863823322417e-06, + "loss": 2.3976, + "step": 13378 + }, + { + "epoch": 0.7177575107296137, + "grad_norm": 0.44140625, + "learning_rate": 4.832832589918096e-06, + "loss": 2.3941, + "step": 13379 + }, + { + "epoch": 0.7178111587982833, + "grad_norm": 0.486328125, + "learning_rate": 4.832801353696638e-06, + "loss": 2.6544, + "step": 13380 + }, + { + "epoch": 0.7178648068669528, + "grad_norm": 0.4921875, + "learning_rate": 4.83277011465808e-06, + "loss": 2.5178, + "step": 13381 + }, + { + "epoch": 0.7179184549356223, + "grad_norm": 0.4765625, + "learning_rate": 4.832738872802459e-06, + "loss": 2.2375, + "step": 13382 + }, + { + "epoch": 0.7179721030042918, + "grad_norm": 0.41796875, + "learning_rate": 4.832707628129816e-06, + "loss": 2.2574, + "step": 13383 + }, + { + "epoch": 0.7180257510729614, + "grad_norm": 0.4921875, + "learning_rate": 4.832676380640185e-06, + "loss": 2.3792, + "step": 13384 + }, + { + "epoch": 0.7180793991416309, + "grad_norm": 0.41796875, + "learning_rate": 4.832645130333605e-06, + "loss": 2.3837, + "step": 13385 + }, + { + "epoch": 0.7181330472103005, + "grad_norm": 0.49609375, + "learning_rate": 4.832613877210115e-06, + "loss": 2.2299, + "step": 13386 + }, + { + "epoch": 0.7181866952789699, + "grad_norm": 0.95703125, + "learning_rate": 4.832582621269752e-06, + "loss": 2.0621, + "step": 13387 + }, + { + "epoch": 0.7182403433476395, + "grad_norm": 0.41015625, + "learning_rate": 4.832551362512552e-06, + "loss": 2.067, + "step": 13388 + }, + { + "epoch": 0.718293991416309, + "grad_norm": 0.478515625, + "learning_rate": 4.832520100938555e-06, + "loss": 2.4883, + "step": 13389 + }, + { + "epoch": 0.7183476394849786, + "grad_norm": 0.484375, + "learning_rate": 4.832488836547799e-06, + "loss": 2.2722, + "step": 13390 + }, + { + "epoch": 0.7184012875536481, + "grad_norm": 0.51953125, + "learning_rate": 4.832457569340319e-06, + "loss": 2.385, + "step": 13391 + }, + { + "epoch": 0.7184549356223175, + "grad_norm": 0.5234375, + "learning_rate": 4.832426299316155e-06, + "loss": 2.0329, + "step": 13392 + }, + { + "epoch": 0.7185085836909871, + "grad_norm": 0.451171875, + "learning_rate": 4.832395026475345e-06, + "loss": 2.3441, + "step": 13393 + }, + { + "epoch": 0.7185622317596566, + "grad_norm": 0.41796875, + "learning_rate": 4.832363750817925e-06, + "loss": 2.3476, + "step": 13394 + }, + { + "epoch": 0.7186158798283262, + "grad_norm": 0.5078125, + "learning_rate": 4.832332472343935e-06, + "loss": 1.8784, + "step": 13395 + }, + { + "epoch": 0.7186695278969957, + "grad_norm": 0.421875, + "learning_rate": 4.8323011910534114e-06, + "loss": 2.2509, + "step": 13396 + }, + { + "epoch": 0.7187231759656653, + "grad_norm": 0.53515625, + "learning_rate": 4.832269906946391e-06, + "loss": 2.4294, + "step": 13397 + }, + { + "epoch": 0.7187768240343347, + "grad_norm": 0.466796875, + "learning_rate": 4.832238620022913e-06, + "loss": 2.4037, + "step": 13398 + }, + { + "epoch": 0.7188304721030043, + "grad_norm": 0.396484375, + "learning_rate": 4.832207330283016e-06, + "loss": 1.9784, + "step": 13399 + }, + { + "epoch": 0.7188841201716738, + "grad_norm": 0.435546875, + "learning_rate": 4.832176037726736e-06, + "loss": 2.2441, + "step": 13400 + }, + { + "epoch": 0.7189377682403434, + "grad_norm": 0.4921875, + "learning_rate": 4.832144742354111e-06, + "loss": 2.2934, + "step": 13401 + }, + { + "epoch": 0.7189914163090129, + "grad_norm": 0.486328125, + "learning_rate": 4.83211344416518e-06, + "loss": 2.261, + "step": 13402 + }, + { + "epoch": 0.7190450643776825, + "grad_norm": 0.5546875, + "learning_rate": 4.83208214315998e-06, + "loss": 2.3135, + "step": 13403 + }, + { + "epoch": 0.7190987124463519, + "grad_norm": 0.70703125, + "learning_rate": 4.832050839338549e-06, + "loss": 2.3576, + "step": 13404 + }, + { + "epoch": 0.7191523605150214, + "grad_norm": 0.341796875, + "learning_rate": 4.832019532700924e-06, + "loss": 2.2658, + "step": 13405 + }, + { + "epoch": 0.719206008583691, + "grad_norm": 0.63671875, + "learning_rate": 4.831988223247144e-06, + "loss": 2.381, + "step": 13406 + }, + { + "epoch": 0.7192596566523605, + "grad_norm": 0.42578125, + "learning_rate": 4.831956910977246e-06, + "loss": 2.3713, + "step": 13407 + }, + { + "epoch": 0.7193133047210301, + "grad_norm": 0.443359375, + "learning_rate": 4.8319255958912684e-06, + "loss": 2.3356, + "step": 13408 + }, + { + "epoch": 0.7193669527896995, + "grad_norm": 0.5, + "learning_rate": 4.831894277989248e-06, + "loss": 2.2173, + "step": 13409 + }, + { + "epoch": 0.7194206008583691, + "grad_norm": 0.412109375, + "learning_rate": 4.831862957271224e-06, + "loss": 2.2113, + "step": 13410 + }, + { + "epoch": 0.7194742489270386, + "grad_norm": 0.4140625, + "learning_rate": 4.831831633737234e-06, + "loss": 2.1781, + "step": 13411 + }, + { + "epoch": 0.7195278969957082, + "grad_norm": 0.53515625, + "learning_rate": 4.831800307387315e-06, + "loss": 2.3924, + "step": 13412 + }, + { + "epoch": 0.7195815450643777, + "grad_norm": 0.4921875, + "learning_rate": 4.831768978221504e-06, + "loss": 2.5603, + "step": 13413 + }, + { + "epoch": 0.7196351931330472, + "grad_norm": 0.482421875, + "learning_rate": 4.831737646239841e-06, + "loss": 2.1959, + "step": 13414 + }, + { + "epoch": 0.7196888412017167, + "grad_norm": 0.54296875, + "learning_rate": 4.8317063114423625e-06, + "loss": 2.2681, + "step": 13415 + }, + { + "epoch": 0.7197424892703863, + "grad_norm": 0.47265625, + "learning_rate": 4.831674973829108e-06, + "loss": 2.3578, + "step": 13416 + }, + { + "epoch": 0.7197961373390558, + "grad_norm": 0.6015625, + "learning_rate": 4.831643633400113e-06, + "loss": 2.6003, + "step": 13417 + }, + { + "epoch": 0.7198497854077254, + "grad_norm": 0.4609375, + "learning_rate": 4.831612290155416e-06, + "loss": 2.3887, + "step": 13418 + }, + { + "epoch": 0.7199034334763948, + "grad_norm": 0.4375, + "learning_rate": 4.8315809440950565e-06, + "loss": 2.4121, + "step": 13419 + }, + { + "epoch": 0.7199570815450643, + "grad_norm": 2.140625, + "learning_rate": 4.8315495952190705e-06, + "loss": 2.1648, + "step": 13420 + }, + { + "epoch": 0.7200107296137339, + "grad_norm": 0.451171875, + "learning_rate": 4.831518243527496e-06, + "loss": 2.4392, + "step": 13421 + }, + { + "epoch": 0.7200643776824034, + "grad_norm": 0.404296875, + "learning_rate": 4.831486889020372e-06, + "loss": 2.1508, + "step": 13422 + }, + { + "epoch": 0.720118025751073, + "grad_norm": 0.453125, + "learning_rate": 4.831455531697735e-06, + "loss": 1.9795, + "step": 13423 + }, + { + "epoch": 0.7201716738197425, + "grad_norm": 0.5625, + "learning_rate": 4.831424171559624e-06, + "loss": 2.4914, + "step": 13424 + }, + { + "epoch": 0.720225321888412, + "grad_norm": 0.5234375, + "learning_rate": 4.831392808606077e-06, + "loss": 2.2944, + "step": 13425 + }, + { + "epoch": 0.7202789699570815, + "grad_norm": 0.5234375, + "learning_rate": 4.83136144283713e-06, + "loss": 2.4007, + "step": 13426 + }, + { + "epoch": 0.7203326180257511, + "grad_norm": 0.427734375, + "learning_rate": 4.8313300742528235e-06, + "loss": 2.3982, + "step": 13427 + }, + { + "epoch": 0.7203862660944206, + "grad_norm": 0.41796875, + "learning_rate": 4.831298702853193e-06, + "loss": 2.054, + "step": 13428 + }, + { + "epoch": 0.7204399141630902, + "grad_norm": 0.435546875, + "learning_rate": 4.831267328638278e-06, + "loss": 2.1408, + "step": 13429 + }, + { + "epoch": 0.7204935622317596, + "grad_norm": 0.50390625, + "learning_rate": 4.831235951608116e-06, + "loss": 2.2727, + "step": 13430 + }, + { + "epoch": 0.7205472103004292, + "grad_norm": 0.443359375, + "learning_rate": 4.831204571762745e-06, + "loss": 2.3934, + "step": 13431 + }, + { + "epoch": 0.7206008583690987, + "grad_norm": 0.41015625, + "learning_rate": 4.831173189102202e-06, + "loss": 2.2639, + "step": 13432 + }, + { + "epoch": 0.7206545064377683, + "grad_norm": 0.365234375, + "learning_rate": 4.831141803626526e-06, + "loss": 2.4264, + "step": 13433 + }, + { + "epoch": 0.7207081545064378, + "grad_norm": 0.40234375, + "learning_rate": 4.831110415335753e-06, + "loss": 2.3164, + "step": 13434 + }, + { + "epoch": 0.7207618025751072, + "grad_norm": 0.54296875, + "learning_rate": 4.831079024229924e-06, + "loss": 2.2811, + "step": 13435 + }, + { + "epoch": 0.7208154506437768, + "grad_norm": 0.443359375, + "learning_rate": 4.831047630309075e-06, + "loss": 2.3366, + "step": 13436 + }, + { + "epoch": 0.7208690987124463, + "grad_norm": 0.337890625, + "learning_rate": 4.831016233573244e-06, + "loss": 1.9628, + "step": 13437 + }, + { + "epoch": 0.7209227467811159, + "grad_norm": 0.45703125, + "learning_rate": 4.830984834022468e-06, + "loss": 2.4711, + "step": 13438 + }, + { + "epoch": 0.7209763948497854, + "grad_norm": 0.4609375, + "learning_rate": 4.830953431656788e-06, + "loss": 2.3322, + "step": 13439 + }, + { + "epoch": 0.721030042918455, + "grad_norm": 0.427734375, + "learning_rate": 4.830922026476238e-06, + "loss": 2.4088, + "step": 13440 + }, + { + "epoch": 0.7210836909871244, + "grad_norm": 0.46484375, + "learning_rate": 4.830890618480859e-06, + "loss": 2.3057, + "step": 13441 + }, + { + "epoch": 0.721137339055794, + "grad_norm": 0.3671875, + "learning_rate": 4.830859207670687e-06, + "loss": 1.8765, + "step": 13442 + }, + { + "epoch": 0.7211909871244635, + "grad_norm": 0.48046875, + "learning_rate": 4.830827794045761e-06, + "loss": 2.3021, + "step": 13443 + }, + { + "epoch": 0.7212446351931331, + "grad_norm": 0.42578125, + "learning_rate": 4.8307963776061184e-06, + "loss": 2.381, + "step": 13444 + }, + { + "epoch": 0.7212982832618026, + "grad_norm": 0.47265625, + "learning_rate": 4.830764958351798e-06, + "loss": 2.3565, + "step": 13445 + }, + { + "epoch": 0.7213519313304722, + "grad_norm": 0.62109375, + "learning_rate": 4.830733536282837e-06, + "loss": 1.9682, + "step": 13446 + }, + { + "epoch": 0.7214055793991416, + "grad_norm": 0.42578125, + "learning_rate": 4.830702111399273e-06, + "loss": 2.3161, + "step": 13447 + }, + { + "epoch": 0.7214592274678111, + "grad_norm": 0.494140625, + "learning_rate": 4.830670683701144e-06, + "loss": 2.3072, + "step": 13448 + }, + { + "epoch": 0.7215128755364807, + "grad_norm": 0.486328125, + "learning_rate": 4.830639253188489e-06, + "loss": 2.3511, + "step": 13449 + }, + { + "epoch": 0.7215665236051502, + "grad_norm": 0.40625, + "learning_rate": 4.830607819861346e-06, + "loss": 2.2254, + "step": 13450 + }, + { + "epoch": 0.7216201716738198, + "grad_norm": 1.4375, + "learning_rate": 4.8305763837197505e-06, + "loss": 2.4117, + "step": 13451 + }, + { + "epoch": 0.7216738197424892, + "grad_norm": 0.50390625, + "learning_rate": 4.8305449447637435e-06, + "loss": 2.4766, + "step": 13452 + }, + { + "epoch": 0.7217274678111588, + "grad_norm": 0.470703125, + "learning_rate": 4.830513502993361e-06, + "loss": 2.3023, + "step": 13453 + }, + { + "epoch": 0.7217811158798283, + "grad_norm": 5.15625, + "learning_rate": 4.830482058408642e-06, + "loss": 2.2015, + "step": 13454 + }, + { + "epoch": 0.7218347639484979, + "grad_norm": 0.515625, + "learning_rate": 4.830450611009624e-06, + "loss": 2.3418, + "step": 13455 + }, + { + "epoch": 0.7218884120171674, + "grad_norm": 0.423828125, + "learning_rate": 4.830419160796346e-06, + "loss": 2.5267, + "step": 13456 + }, + { + "epoch": 0.721942060085837, + "grad_norm": 0.40625, + "learning_rate": 4.830387707768844e-06, + "loss": 2.2745, + "step": 13457 + }, + { + "epoch": 0.7219957081545064, + "grad_norm": 0.56640625, + "learning_rate": 4.830356251927157e-06, + "loss": 2.188, + "step": 13458 + }, + { + "epoch": 0.722049356223176, + "grad_norm": 0.57421875, + "learning_rate": 4.830324793271324e-06, + "loss": 2.3976, + "step": 13459 + }, + { + "epoch": 0.7221030042918455, + "grad_norm": 0.439453125, + "learning_rate": 4.830293331801382e-06, + "loss": 2.2292, + "step": 13460 + }, + { + "epoch": 0.7221566523605151, + "grad_norm": 0.439453125, + "learning_rate": 4.8302618675173686e-06, + "loss": 2.3081, + "step": 13461 + }, + { + "epoch": 0.7222103004291845, + "grad_norm": 0.458984375, + "learning_rate": 4.830230400419322e-06, + "loss": 2.1054, + "step": 13462 + }, + { + "epoch": 0.722263948497854, + "grad_norm": 0.45703125, + "learning_rate": 4.8301989305072805e-06, + "loss": 2.4738, + "step": 13463 + }, + { + "epoch": 0.7223175965665236, + "grad_norm": 0.4375, + "learning_rate": 4.830167457781282e-06, + "loss": 2.3119, + "step": 13464 + }, + { + "epoch": 0.7223712446351931, + "grad_norm": 0.796875, + "learning_rate": 4.8301359822413654e-06, + "loss": 2.3655, + "step": 13465 + }, + { + "epoch": 0.7224248927038627, + "grad_norm": 0.482421875, + "learning_rate": 4.8301045038875665e-06, + "loss": 2.3517, + "step": 13466 + }, + { + "epoch": 0.7224785407725322, + "grad_norm": 0.369140625, + "learning_rate": 4.830073022719926e-06, + "loss": 2.3565, + "step": 13467 + }, + { + "epoch": 0.7225321888412017, + "grad_norm": 0.427734375, + "learning_rate": 4.830041538738479e-06, + "loss": 2.7301, + "step": 13468 + }, + { + "epoch": 0.7225858369098712, + "grad_norm": 0.71484375, + "learning_rate": 4.8300100519432665e-06, + "loss": 2.3037, + "step": 13469 + }, + { + "epoch": 0.7226394849785408, + "grad_norm": 0.4609375, + "learning_rate": 4.829978562334324e-06, + "loss": 2.4617, + "step": 13470 + }, + { + "epoch": 0.7226931330472103, + "grad_norm": 0.52734375, + "learning_rate": 4.8299470699116926e-06, + "loss": 1.9166, + "step": 13471 + }, + { + "epoch": 0.7227467811158799, + "grad_norm": 0.5078125, + "learning_rate": 4.829915574675406e-06, + "loss": 1.9651, + "step": 13472 + }, + { + "epoch": 0.7228004291845493, + "grad_norm": 0.5, + "learning_rate": 4.829884076625506e-06, + "loss": 2.295, + "step": 13473 + }, + { + "epoch": 0.7228540772532189, + "grad_norm": 0.45703125, + "learning_rate": 4.8298525757620295e-06, + "loss": 2.3745, + "step": 13474 + }, + { + "epoch": 0.7229077253218884, + "grad_norm": 0.59765625, + "learning_rate": 4.829821072085013e-06, + "loss": 2.541, + "step": 13475 + }, + { + "epoch": 0.722961373390558, + "grad_norm": 0.44921875, + "learning_rate": 4.829789565594497e-06, + "loss": 2.101, + "step": 13476 + }, + { + "epoch": 0.7230150214592275, + "grad_norm": 0.37890625, + "learning_rate": 4.829758056290518e-06, + "loss": 2.2607, + "step": 13477 + }, + { + "epoch": 0.723068669527897, + "grad_norm": 0.458984375, + "learning_rate": 4.829726544173114e-06, + "loss": 2.2183, + "step": 13478 + }, + { + "epoch": 0.7231223175965665, + "grad_norm": 0.443359375, + "learning_rate": 4.8296950292423244e-06, + "loss": 2.38, + "step": 13479 + }, + { + "epoch": 0.723175965665236, + "grad_norm": 0.486328125, + "learning_rate": 4.829663511498186e-06, + "loss": 2.1703, + "step": 13480 + }, + { + "epoch": 0.7232296137339056, + "grad_norm": 0.455078125, + "learning_rate": 4.829631990940737e-06, + "loss": 1.7885, + "step": 13481 + }, + { + "epoch": 0.7232832618025751, + "grad_norm": 0.50390625, + "learning_rate": 4.8296004675700155e-06, + "loss": 2.5101, + "step": 13482 + }, + { + "epoch": 0.7233369098712447, + "grad_norm": 0.388671875, + "learning_rate": 4.82956894138606e-06, + "loss": 1.4829, + "step": 13483 + }, + { + "epoch": 0.7233905579399141, + "grad_norm": 0.50390625, + "learning_rate": 4.829537412388909e-06, + "loss": 2.1688, + "step": 13484 + }, + { + "epoch": 0.7234442060085837, + "grad_norm": 0.439453125, + "learning_rate": 4.829505880578599e-06, + "loss": 2.3205, + "step": 13485 + }, + { + "epoch": 0.7234978540772532, + "grad_norm": 0.455078125, + "learning_rate": 4.82947434595517e-06, + "loss": 2.2838, + "step": 13486 + }, + { + "epoch": 0.7235515021459228, + "grad_norm": 0.380859375, + "learning_rate": 4.829442808518657e-06, + "loss": 2.0849, + "step": 13487 + }, + { + "epoch": 0.7236051502145923, + "grad_norm": 0.4765625, + "learning_rate": 4.829411268269102e-06, + "loss": 2.1599, + "step": 13488 + }, + { + "epoch": 0.7236587982832619, + "grad_norm": 0.53125, + "learning_rate": 4.829379725206541e-06, + "loss": 2.4598, + "step": 13489 + }, + { + "epoch": 0.7237124463519313, + "grad_norm": 7.4375, + "learning_rate": 4.829348179331011e-06, + "loss": 2.2189, + "step": 13490 + }, + { + "epoch": 0.7237660944206008, + "grad_norm": 0.439453125, + "learning_rate": 4.829316630642553e-06, + "loss": 2.1711, + "step": 13491 + }, + { + "epoch": 0.7238197424892704, + "grad_norm": 0.6328125, + "learning_rate": 4.829285079141202e-06, + "loss": 2.1562, + "step": 13492 + }, + { + "epoch": 0.7238733905579399, + "grad_norm": 0.43359375, + "learning_rate": 4.829253524826999e-06, + "loss": 2.2293, + "step": 13493 + }, + { + "epoch": 0.7239270386266095, + "grad_norm": 0.45703125, + "learning_rate": 4.8292219676999804e-06, + "loss": 2.3749, + "step": 13494 + }, + { + "epoch": 0.7239806866952789, + "grad_norm": 0.5078125, + "learning_rate": 4.829190407760185e-06, + "loss": 2.6038, + "step": 13495 + }, + { + "epoch": 0.7240343347639485, + "grad_norm": 0.474609375, + "learning_rate": 4.8291588450076495e-06, + "loss": 2.0086, + "step": 13496 + }, + { + "epoch": 0.724087982832618, + "grad_norm": 0.58203125, + "learning_rate": 4.829127279442413e-06, + "loss": 2.5217, + "step": 13497 + }, + { + "epoch": 0.7241416309012876, + "grad_norm": 0.48046875, + "learning_rate": 4.8290957110645145e-06, + "loss": 2.136, + "step": 13498 + }, + { + "epoch": 0.7241952789699571, + "grad_norm": 0.427734375, + "learning_rate": 4.829064139873991e-06, + "loss": 2.0456, + "step": 13499 + }, + { + "epoch": 0.7242489270386266, + "grad_norm": 0.482421875, + "learning_rate": 4.829032565870881e-06, + "loss": 2.4294, + "step": 13500 + }, + { + "epoch": 0.7243025751072961, + "grad_norm": 0.369140625, + "learning_rate": 4.829000989055222e-06, + "loss": 2.2927, + "step": 13501 + }, + { + "epoch": 0.7243562231759657, + "grad_norm": 9.6875, + "learning_rate": 4.828969409427053e-06, + "loss": 2.2655, + "step": 13502 + }, + { + "epoch": 0.7244098712446352, + "grad_norm": 0.416015625, + "learning_rate": 4.8289378269864125e-06, + "loss": 2.2439, + "step": 13503 + }, + { + "epoch": 0.7244635193133048, + "grad_norm": 0.4375, + "learning_rate": 4.828906241733337e-06, + "loss": 2.261, + "step": 13504 + }, + { + "epoch": 0.7245171673819742, + "grad_norm": 6.0, + "learning_rate": 4.828874653667866e-06, + "loss": 2.3502, + "step": 13505 + }, + { + "epoch": 0.7245708154506437, + "grad_norm": 0.490234375, + "learning_rate": 4.828843062790037e-06, + "loss": 2.4487, + "step": 13506 + }, + { + "epoch": 0.7246244635193133, + "grad_norm": 0.4453125, + "learning_rate": 4.828811469099889e-06, + "loss": 2.3302, + "step": 13507 + }, + { + "epoch": 0.7246781115879828, + "grad_norm": 0.431640625, + "learning_rate": 4.828779872597459e-06, + "loss": 2.336, + "step": 13508 + }, + { + "epoch": 0.7247317596566524, + "grad_norm": 0.6171875, + "learning_rate": 4.828748273282786e-06, + "loss": 2.3376, + "step": 13509 + }, + { + "epoch": 0.7247854077253219, + "grad_norm": 0.5234375, + "learning_rate": 4.8287166711559075e-06, + "loss": 2.5637, + "step": 13510 + }, + { + "epoch": 0.7248390557939914, + "grad_norm": 0.66796875, + "learning_rate": 4.828685066216862e-06, + "loss": 2.1866, + "step": 13511 + }, + { + "epoch": 0.7248927038626609, + "grad_norm": 0.447265625, + "learning_rate": 4.828653458465688e-06, + "loss": 2.3394, + "step": 13512 + }, + { + "epoch": 0.7249463519313305, + "grad_norm": 0.5234375, + "learning_rate": 4.828621847902423e-06, + "loss": 2.2164, + "step": 13513 + }, + { + "epoch": 0.725, + "grad_norm": 0.4453125, + "learning_rate": 4.828590234527107e-06, + "loss": 2.4284, + "step": 13514 + }, + { + "epoch": 0.7250536480686696, + "grad_norm": 1.234375, + "learning_rate": 4.828558618339775e-06, + "loss": 2.3854, + "step": 13515 + }, + { + "epoch": 0.725107296137339, + "grad_norm": 0.54296875, + "learning_rate": 4.828526999340467e-06, + "loss": 2.2875, + "step": 13516 + }, + { + "epoch": 0.7251609442060086, + "grad_norm": 0.53515625, + "learning_rate": 4.828495377529221e-06, + "loss": 2.1942, + "step": 13517 + }, + { + "epoch": 0.7252145922746781, + "grad_norm": 0.419921875, + "learning_rate": 4.8284637529060765e-06, + "loss": 2.0996, + "step": 13518 + }, + { + "epoch": 0.7252682403433477, + "grad_norm": 0.455078125, + "learning_rate": 4.8284321254710695e-06, + "loss": 2.3366, + "step": 13519 + }, + { + "epoch": 0.7253218884120172, + "grad_norm": 0.486328125, + "learning_rate": 4.828400495224239e-06, + "loss": 2.4074, + "step": 13520 + }, + { + "epoch": 0.7253755364806866, + "grad_norm": 0.5390625, + "learning_rate": 4.828368862165623e-06, + "loss": 1.3858, + "step": 13521 + }, + { + "epoch": 0.7254291845493562, + "grad_norm": 0.5078125, + "learning_rate": 4.8283372262952614e-06, + "loss": 2.4098, + "step": 13522 + }, + { + "epoch": 0.7254828326180257, + "grad_norm": 0.51953125, + "learning_rate": 4.82830558761319e-06, + "loss": 2.4467, + "step": 13523 + }, + { + "epoch": 0.7255364806866953, + "grad_norm": 0.435546875, + "learning_rate": 4.828273946119449e-06, + "loss": 1.8937, + "step": 13524 + }, + { + "epoch": 0.7255901287553648, + "grad_norm": 0.451171875, + "learning_rate": 4.828242301814075e-06, + "loss": 2.2423, + "step": 13525 + }, + { + "epoch": 0.7256437768240344, + "grad_norm": 0.52734375, + "learning_rate": 4.828210654697106e-06, + "loss": 2.2859, + "step": 13526 + }, + { + "epoch": 0.7256974248927038, + "grad_norm": 0.42578125, + "learning_rate": 4.828179004768582e-06, + "loss": 2.4562, + "step": 13527 + }, + { + "epoch": 0.7257510729613734, + "grad_norm": 0.3828125, + "learning_rate": 4.82814735202854e-06, + "loss": 2.0623, + "step": 13528 + }, + { + "epoch": 0.7258047210300429, + "grad_norm": 0.427734375, + "learning_rate": 4.828115696477019e-06, + "loss": 2.2038, + "step": 13529 + }, + { + "epoch": 0.7258583690987125, + "grad_norm": 0.466796875, + "learning_rate": 4.828084038114057e-06, + "loss": 2.3272, + "step": 13530 + }, + { + "epoch": 0.725912017167382, + "grad_norm": 0.44140625, + "learning_rate": 4.828052376939691e-06, + "loss": 2.392, + "step": 13531 + }, + { + "epoch": 0.7259656652360515, + "grad_norm": 0.482421875, + "learning_rate": 4.8280207129539615e-06, + "loss": 2.375, + "step": 13532 + }, + { + "epoch": 0.726019313304721, + "grad_norm": 0.55859375, + "learning_rate": 4.827989046156905e-06, + "loss": 2.0927, + "step": 13533 + }, + { + "epoch": 0.7260729613733906, + "grad_norm": 0.5, + "learning_rate": 4.8279573765485594e-06, + "loss": 2.2254, + "step": 13534 + }, + { + "epoch": 0.7261266094420601, + "grad_norm": 0.41796875, + "learning_rate": 4.827925704128964e-06, + "loss": 2.3205, + "step": 13535 + }, + { + "epoch": 0.7261802575107296, + "grad_norm": 0.4453125, + "learning_rate": 4.827894028898157e-06, + "loss": 2.4517, + "step": 13536 + }, + { + "epoch": 0.7262339055793992, + "grad_norm": 0.53125, + "learning_rate": 4.8278623508561775e-06, + "loss": 2.1989, + "step": 13537 + }, + { + "epoch": 0.7262875536480686, + "grad_norm": 0.3828125, + "learning_rate": 4.827830670003061e-06, + "loss": 2.4182, + "step": 13538 + }, + { + "epoch": 0.7263412017167382, + "grad_norm": 0.470703125, + "learning_rate": 4.827798986338849e-06, + "loss": 2.5488, + "step": 13539 + }, + { + "epoch": 0.7263948497854077, + "grad_norm": 0.474609375, + "learning_rate": 4.827767299863577e-06, + "loss": 2.5419, + "step": 13540 + }, + { + "epoch": 0.7264484978540773, + "grad_norm": 0.46875, + "learning_rate": 4.827735610577284e-06, + "loss": 2.5088, + "step": 13541 + }, + { + "epoch": 0.7265021459227468, + "grad_norm": 0.44921875, + "learning_rate": 4.82770391848001e-06, + "loss": 2.3505, + "step": 13542 + }, + { + "epoch": 0.7265557939914163, + "grad_norm": 0.6328125, + "learning_rate": 4.827672223571792e-06, + "loss": 2.4434, + "step": 13543 + }, + { + "epoch": 0.7266094420600858, + "grad_norm": 0.486328125, + "learning_rate": 4.827640525852668e-06, + "loss": 2.3989, + "step": 13544 + }, + { + "epoch": 0.7266630901287554, + "grad_norm": 0.46875, + "learning_rate": 4.827608825322676e-06, + "loss": 2.3429, + "step": 13545 + }, + { + "epoch": 0.7267167381974249, + "grad_norm": 0.4609375, + "learning_rate": 4.827577121981856e-06, + "loss": 2.4077, + "step": 13546 + }, + { + "epoch": 0.7267703862660945, + "grad_norm": 0.9921875, + "learning_rate": 4.827545415830244e-06, + "loss": 2.481, + "step": 13547 + }, + { + "epoch": 0.726824034334764, + "grad_norm": 0.345703125, + "learning_rate": 4.8275137068678804e-06, + "loss": 2.1512, + "step": 13548 + }, + { + "epoch": 0.7268776824034334, + "grad_norm": 0.41015625, + "learning_rate": 4.8274819950948024e-06, + "loss": 2.3224, + "step": 13549 + }, + { + "epoch": 0.726931330472103, + "grad_norm": 0.3984375, + "learning_rate": 4.827450280511048e-06, + "loss": 2.2012, + "step": 13550 + }, + { + "epoch": 0.7269849785407725, + "grad_norm": 0.53125, + "learning_rate": 4.827418563116656e-06, + "loss": 2.1097, + "step": 13551 + }, + { + "epoch": 0.7270386266094421, + "grad_norm": 0.443359375, + "learning_rate": 4.827386842911664e-06, + "loss": 2.1827, + "step": 13552 + }, + { + "epoch": 0.7270922746781115, + "grad_norm": 0.65234375, + "learning_rate": 4.827355119896112e-06, + "loss": 2.252, + "step": 13553 + }, + { + "epoch": 0.7271459227467811, + "grad_norm": 5.84375, + "learning_rate": 4.827323394070037e-06, + "loss": 1.869, + "step": 13554 + }, + { + "epoch": 0.7271995708154506, + "grad_norm": 0.4609375, + "learning_rate": 4.827291665433478e-06, + "loss": 1.9135, + "step": 13555 + }, + { + "epoch": 0.7272532188841202, + "grad_norm": 0.5546875, + "learning_rate": 4.827259933986471e-06, + "loss": 2.4687, + "step": 13556 + }, + { + "epoch": 0.7273068669527897, + "grad_norm": 0.48046875, + "learning_rate": 4.827228199729058e-06, + "loss": 2.3234, + "step": 13557 + }, + { + "epoch": 0.7273605150214593, + "grad_norm": 0.5078125, + "learning_rate": 4.827196462661275e-06, + "loss": 2.3801, + "step": 13558 + }, + { + "epoch": 0.7274141630901287, + "grad_norm": 1.2421875, + "learning_rate": 4.827164722783161e-06, + "loss": 2.4511, + "step": 13559 + }, + { + "epoch": 0.7274678111587983, + "grad_norm": 0.4609375, + "learning_rate": 4.827132980094754e-06, + "loss": 1.8194, + "step": 13560 + }, + { + "epoch": 0.7275214592274678, + "grad_norm": 0.390625, + "learning_rate": 4.827101234596092e-06, + "loss": 2.0863, + "step": 13561 + }, + { + "epoch": 0.7275751072961374, + "grad_norm": 0.462890625, + "learning_rate": 4.827069486287214e-06, + "loss": 2.2466, + "step": 13562 + }, + { + "epoch": 0.7276287553648069, + "grad_norm": 0.7109375, + "learning_rate": 4.827037735168159e-06, + "loss": 2.199, + "step": 13563 + }, + { + "epoch": 0.7276824034334763, + "grad_norm": 0.427734375, + "learning_rate": 4.827005981238963e-06, + "loss": 2.2059, + "step": 13564 + }, + { + "epoch": 0.7277360515021459, + "grad_norm": 0.474609375, + "learning_rate": 4.8269742244996675e-06, + "loss": 2.2134, + "step": 13565 + }, + { + "epoch": 0.7277896995708154, + "grad_norm": 0.6328125, + "learning_rate": 4.8269424649503085e-06, + "loss": 2.3223, + "step": 13566 + }, + { + "epoch": 0.727843347639485, + "grad_norm": 0.67578125, + "learning_rate": 4.8269107025909245e-06, + "loss": 2.2327, + "step": 13567 + }, + { + "epoch": 0.7278969957081545, + "grad_norm": 0.47265625, + "learning_rate": 4.826878937421555e-06, + "loss": 2.3588, + "step": 13568 + }, + { + "epoch": 0.7279506437768241, + "grad_norm": 0.5078125, + "learning_rate": 4.826847169442238e-06, + "loss": 2.395, + "step": 13569 + }, + { + "epoch": 0.7280042918454935, + "grad_norm": 0.470703125, + "learning_rate": 4.826815398653011e-06, + "loss": 2.3616, + "step": 13570 + }, + { + "epoch": 0.7280579399141631, + "grad_norm": 0.46484375, + "learning_rate": 4.826783625053913e-06, + "loss": 1.6312, + "step": 13571 + }, + { + "epoch": 0.7281115879828326, + "grad_norm": 0.48828125, + "learning_rate": 4.826751848644984e-06, + "loss": 2.298, + "step": 13572 + }, + { + "epoch": 0.7281652360515022, + "grad_norm": 0.36328125, + "learning_rate": 4.82672006942626e-06, + "loss": 2.3098, + "step": 13573 + }, + { + "epoch": 0.7282188841201717, + "grad_norm": 0.40625, + "learning_rate": 4.826688287397778e-06, + "loss": 1.9953, + "step": 13574 + }, + { + "epoch": 0.7282725321888412, + "grad_norm": 0.494140625, + "learning_rate": 4.826656502559581e-06, + "loss": 2.2542, + "step": 13575 + }, + { + "epoch": 0.7283261802575107, + "grad_norm": 0.388671875, + "learning_rate": 4.826624714911704e-06, + "loss": 2.0188, + "step": 13576 + }, + { + "epoch": 0.7283798283261803, + "grad_norm": 0.451171875, + "learning_rate": 4.8265929244541865e-06, + "loss": 2.2086, + "step": 13577 + }, + { + "epoch": 0.7284334763948498, + "grad_norm": 0.57421875, + "learning_rate": 4.826561131187067e-06, + "loss": 2.4192, + "step": 13578 + }, + { + "epoch": 0.7284871244635193, + "grad_norm": 0.443359375, + "learning_rate": 4.826529335110382e-06, + "loss": 2.2381, + "step": 13579 + }, + { + "epoch": 0.7285407725321889, + "grad_norm": 0.427734375, + "learning_rate": 4.826497536224173e-06, + "loss": 2.2976, + "step": 13580 + }, + { + "epoch": 0.7285944206008583, + "grad_norm": 0.5, + "learning_rate": 4.826465734528476e-06, + "loss": 2.4627, + "step": 13581 + }, + { + "epoch": 0.7286480686695279, + "grad_norm": 0.345703125, + "learning_rate": 4.826433930023331e-06, + "loss": 1.9387, + "step": 13582 + }, + { + "epoch": 0.7287017167381974, + "grad_norm": 0.4140625, + "learning_rate": 4.826402122708775e-06, + "loss": 2.2349, + "step": 13583 + }, + { + "epoch": 0.728755364806867, + "grad_norm": 0.62890625, + "learning_rate": 4.826370312584847e-06, + "loss": 2.2508, + "step": 13584 + }, + { + "epoch": 0.7288090128755365, + "grad_norm": 0.447265625, + "learning_rate": 4.826338499651586e-06, + "loss": 2.3336, + "step": 13585 + }, + { + "epoch": 0.728862660944206, + "grad_norm": 0.478515625, + "learning_rate": 4.82630668390903e-06, + "loss": 2.3741, + "step": 13586 + }, + { + "epoch": 0.7289163090128755, + "grad_norm": 0.4609375, + "learning_rate": 4.826274865357216e-06, + "loss": 2.4296, + "step": 13587 + }, + { + "epoch": 0.7289699570815451, + "grad_norm": 0.4375, + "learning_rate": 4.826243043996185e-06, + "loss": 2.3337, + "step": 13588 + }, + { + "epoch": 0.7290236051502146, + "grad_norm": 2.21875, + "learning_rate": 4.826211219825975e-06, + "loss": 2.204, + "step": 13589 + }, + { + "epoch": 0.7290772532188842, + "grad_norm": 0.458984375, + "learning_rate": 4.826179392846622e-06, + "loss": 2.2089, + "step": 13590 + }, + { + "epoch": 0.7291309012875536, + "grad_norm": 0.44140625, + "learning_rate": 4.826147563058167e-06, + "loss": 2.1212, + "step": 13591 + }, + { + "epoch": 0.7291845493562231, + "grad_norm": 0.447265625, + "learning_rate": 4.8261157304606465e-06, + "loss": 2.204, + "step": 13592 + }, + { + "epoch": 0.7292381974248927, + "grad_norm": 0.474609375, + "learning_rate": 4.826083895054101e-06, + "loss": 1.9447, + "step": 13593 + }, + { + "epoch": 0.7292918454935622, + "grad_norm": 0.51171875, + "learning_rate": 4.826052056838567e-06, + "loss": 2.3176, + "step": 13594 + }, + { + "epoch": 0.7293454935622318, + "grad_norm": 0.458984375, + "learning_rate": 4.826020215814084e-06, + "loss": 2.1672, + "step": 13595 + }, + { + "epoch": 0.7293991416309012, + "grad_norm": 2.78125, + "learning_rate": 4.825988371980691e-06, + "loss": 2.2642, + "step": 13596 + }, + { + "epoch": 0.7294527896995708, + "grad_norm": 0.427734375, + "learning_rate": 4.825956525338426e-06, + "loss": 2.395, + "step": 13597 + }, + { + "epoch": 0.7295064377682403, + "grad_norm": 0.5625, + "learning_rate": 4.825924675887326e-06, + "loss": 2.3779, + "step": 13598 + }, + { + "epoch": 0.7295600858369099, + "grad_norm": 0.34765625, + "learning_rate": 4.825892823627431e-06, + "loss": 1.9917, + "step": 13599 + }, + { + "epoch": 0.7296137339055794, + "grad_norm": 0.435546875, + "learning_rate": 4.82586096855878e-06, + "loss": 2.1832, + "step": 13600 + }, + { + "epoch": 0.729667381974249, + "grad_norm": 0.435546875, + "learning_rate": 4.82582911068141e-06, + "loss": 2.1322, + "step": 13601 + }, + { + "epoch": 0.7297210300429184, + "grad_norm": 0.451171875, + "learning_rate": 4.825797249995359e-06, + "loss": 2.2351, + "step": 13602 + }, + { + "epoch": 0.729774678111588, + "grad_norm": 0.4609375, + "learning_rate": 4.825765386500667e-06, + "loss": 2.3805, + "step": 13603 + }, + { + "epoch": 0.7298283261802575, + "grad_norm": 0.419921875, + "learning_rate": 4.825733520197373e-06, + "loss": 2.2146, + "step": 13604 + }, + { + "epoch": 0.7298819742489271, + "grad_norm": 0.462890625, + "learning_rate": 4.8257016510855135e-06, + "loss": 2.2635, + "step": 13605 + }, + { + "epoch": 0.7299356223175966, + "grad_norm": 0.466796875, + "learning_rate": 4.825669779165128e-06, + "loss": 2.0539, + "step": 13606 + }, + { + "epoch": 0.729989270386266, + "grad_norm": 0.3515625, + "learning_rate": 4.825637904436255e-06, + "loss": 1.977, + "step": 13607 + }, + { + "epoch": 0.7300429184549356, + "grad_norm": 0.640625, + "learning_rate": 4.825606026898934e-06, + "loss": 2.1057, + "step": 13608 + }, + { + "epoch": 0.7300965665236051, + "grad_norm": 0.578125, + "learning_rate": 4.8255741465532015e-06, + "loss": 2.3105, + "step": 13609 + }, + { + "epoch": 0.7301502145922747, + "grad_norm": 0.416015625, + "learning_rate": 4.825542263399097e-06, + "loss": 2.0961, + "step": 13610 + }, + { + "epoch": 0.7302038626609442, + "grad_norm": 0.4921875, + "learning_rate": 4.825510377436659e-06, + "loss": 2.3799, + "step": 13611 + }, + { + "epoch": 0.7302575107296138, + "grad_norm": 0.53125, + "learning_rate": 4.825478488665926e-06, + "loss": 2.2163, + "step": 13612 + }, + { + "epoch": 0.7303111587982832, + "grad_norm": 0.435546875, + "learning_rate": 4.825446597086937e-06, + "loss": 2.2334, + "step": 13613 + }, + { + "epoch": 0.7303648068669528, + "grad_norm": 0.44140625, + "learning_rate": 4.825414702699729e-06, + "loss": 2.303, + "step": 13614 + }, + { + "epoch": 0.7304184549356223, + "grad_norm": 0.453125, + "learning_rate": 4.825382805504342e-06, + "loss": 2.2504, + "step": 13615 + }, + { + "epoch": 0.7304721030042919, + "grad_norm": 0.41015625, + "learning_rate": 4.825350905500814e-06, + "loss": 2.219, + "step": 13616 + }, + { + "epoch": 0.7305257510729614, + "grad_norm": 0.609375, + "learning_rate": 4.825319002689184e-06, + "loss": 2.2901, + "step": 13617 + }, + { + "epoch": 0.730579399141631, + "grad_norm": 0.4609375, + "learning_rate": 4.825287097069488e-06, + "loss": 2.3792, + "step": 13618 + }, + { + "epoch": 0.7306330472103004, + "grad_norm": 0.51171875, + "learning_rate": 4.825255188641769e-06, + "loss": 2.3568, + "step": 13619 + }, + { + "epoch": 0.73068669527897, + "grad_norm": 0.51171875, + "learning_rate": 4.825223277406063e-06, + "loss": 1.9622, + "step": 13620 + }, + { + "epoch": 0.7307403433476395, + "grad_norm": 0.419921875, + "learning_rate": 4.825191363362407e-06, + "loss": 2.452, + "step": 13621 + }, + { + "epoch": 0.730793991416309, + "grad_norm": 0.38671875, + "learning_rate": 4.825159446510842e-06, + "loss": 2.1838, + "step": 13622 + }, + { + "epoch": 0.7308476394849786, + "grad_norm": 0.4140625, + "learning_rate": 4.825127526851406e-06, + "loss": 2.3623, + "step": 13623 + }, + { + "epoch": 0.730901287553648, + "grad_norm": 0.44921875, + "learning_rate": 4.8250956043841366e-06, + "loss": 2.3439, + "step": 13624 + }, + { + "epoch": 0.7309549356223176, + "grad_norm": 0.443359375, + "learning_rate": 4.825063679109074e-06, + "loss": 2.3324, + "step": 13625 + }, + { + "epoch": 0.7310085836909871, + "grad_norm": 0.625, + "learning_rate": 4.8250317510262555e-06, + "loss": 2.1726, + "step": 13626 + }, + { + "epoch": 0.7310622317596567, + "grad_norm": 0.384765625, + "learning_rate": 4.824999820135719e-06, + "loss": 2.4344, + "step": 13627 + }, + { + "epoch": 0.7311158798283262, + "grad_norm": 0.47265625, + "learning_rate": 4.824967886437505e-06, + "loss": 2.3771, + "step": 13628 + }, + { + "epoch": 0.7311695278969957, + "grad_norm": 0.41796875, + "learning_rate": 4.8249359499316516e-06, + "loss": 1.8794, + "step": 13629 + }, + { + "epoch": 0.7312231759656652, + "grad_norm": 0.4609375, + "learning_rate": 4.824904010618196e-06, + "loss": 2.421, + "step": 13630 + }, + { + "epoch": 0.7312768240343348, + "grad_norm": 0.43359375, + "learning_rate": 4.824872068497177e-06, + "loss": 2.2412, + "step": 13631 + }, + { + "epoch": 0.7313304721030043, + "grad_norm": 0.48046875, + "learning_rate": 4.824840123568635e-06, + "loss": 2.5004, + "step": 13632 + }, + { + "epoch": 0.7313841201716739, + "grad_norm": 0.37109375, + "learning_rate": 4.824808175832607e-06, + "loss": 2.4248, + "step": 13633 + }, + { + "epoch": 0.7314377682403433, + "grad_norm": 0.44140625, + "learning_rate": 4.824776225289132e-06, + "loss": 2.2628, + "step": 13634 + }, + { + "epoch": 0.7314914163090128, + "grad_norm": 0.443359375, + "learning_rate": 4.824744271938248e-06, + "loss": 2.37, + "step": 13635 + }, + { + "epoch": 0.7315450643776824, + "grad_norm": 0.56640625, + "learning_rate": 4.824712315779995e-06, + "loss": 2.1202, + "step": 13636 + }, + { + "epoch": 0.7315987124463519, + "grad_norm": 0.453125, + "learning_rate": 4.8246803568144095e-06, + "loss": 2.3132, + "step": 13637 + }, + { + "epoch": 0.7316523605150215, + "grad_norm": 0.4296875, + "learning_rate": 4.8246483950415316e-06, + "loss": 2.4207, + "step": 13638 + }, + { + "epoch": 0.731706008583691, + "grad_norm": 0.41015625, + "learning_rate": 4.8246164304614e-06, + "loss": 2.2708, + "step": 13639 + }, + { + "epoch": 0.7317596566523605, + "grad_norm": 0.380859375, + "learning_rate": 4.824584463074053e-06, + "loss": 1.871, + "step": 13640 + }, + { + "epoch": 0.73181330472103, + "grad_norm": 0.439453125, + "learning_rate": 4.824552492879528e-06, + "loss": 2.3593, + "step": 13641 + }, + { + "epoch": 0.7318669527896996, + "grad_norm": 0.439453125, + "learning_rate": 4.824520519877866e-06, + "loss": 2.2653, + "step": 13642 + }, + { + "epoch": 0.7319206008583691, + "grad_norm": 0.54296875, + "learning_rate": 4.8244885440691044e-06, + "loss": 2.4352, + "step": 13643 + }, + { + "epoch": 0.7319742489270387, + "grad_norm": 0.431640625, + "learning_rate": 4.82445656545328e-06, + "loss": 2.1422, + "step": 13644 + }, + { + "epoch": 0.7320278969957081, + "grad_norm": 0.5078125, + "learning_rate": 4.824424584030435e-06, + "loss": 2.1746, + "step": 13645 + }, + { + "epoch": 0.7320815450643777, + "grad_norm": 0.4296875, + "learning_rate": 4.824392599800606e-06, + "loss": 2.2125, + "step": 13646 + }, + { + "epoch": 0.7321351931330472, + "grad_norm": 0.546875, + "learning_rate": 4.824360612763831e-06, + "loss": 2.204, + "step": 13647 + }, + { + "epoch": 0.7321888412017168, + "grad_norm": 0.416015625, + "learning_rate": 4.82432862292015e-06, + "loss": 1.9507, + "step": 13648 + }, + { + "epoch": 0.7322424892703863, + "grad_norm": 0.421875, + "learning_rate": 4.8242966302696e-06, + "loss": 2.3042, + "step": 13649 + }, + { + "epoch": 0.7322961373390557, + "grad_norm": 0.419921875, + "learning_rate": 4.8242646348122215e-06, + "loss": 2.4776, + "step": 13650 + }, + { + "epoch": 0.7323497854077253, + "grad_norm": 0.490234375, + "learning_rate": 4.824232636548052e-06, + "loss": 2.2711, + "step": 13651 + }, + { + "epoch": 0.7324034334763948, + "grad_norm": 0.4765625, + "learning_rate": 4.824200635477131e-06, + "loss": 2.2505, + "step": 13652 + }, + { + "epoch": 0.7324570815450644, + "grad_norm": 19.5, + "learning_rate": 4.824168631599497e-06, + "loss": 2.3827, + "step": 13653 + }, + { + "epoch": 0.7325107296137339, + "grad_norm": 1.296875, + "learning_rate": 4.824136624915187e-06, + "loss": 2.3101, + "step": 13654 + }, + { + "epoch": 0.7325643776824035, + "grad_norm": 0.419921875, + "learning_rate": 4.824104615424241e-06, + "loss": 2.3457, + "step": 13655 + }, + { + "epoch": 0.7326180257510729, + "grad_norm": 0.435546875, + "learning_rate": 4.824072603126698e-06, + "loss": 2.3992, + "step": 13656 + }, + { + "epoch": 0.7326716738197425, + "grad_norm": 0.48046875, + "learning_rate": 4.824040588022596e-06, + "loss": 2.4273, + "step": 13657 + }, + { + "epoch": 0.732725321888412, + "grad_norm": 0.5859375, + "learning_rate": 4.824008570111975e-06, + "loss": 2.2526, + "step": 13658 + }, + { + "epoch": 0.7327789699570816, + "grad_norm": 0.47265625, + "learning_rate": 4.823976549394871e-06, + "loss": 2.4728, + "step": 13659 + }, + { + "epoch": 0.7328326180257511, + "grad_norm": 0.400390625, + "learning_rate": 4.823944525871324e-06, + "loss": 2.4001, + "step": 13660 + }, + { + "epoch": 0.7328862660944206, + "grad_norm": 0.51953125, + "learning_rate": 4.8239124995413746e-06, + "loss": 2.3934, + "step": 13661 + }, + { + "epoch": 0.7329399141630901, + "grad_norm": 0.46484375, + "learning_rate": 4.823880470405059e-06, + "loss": 2.4659, + "step": 13662 + }, + { + "epoch": 0.7329935622317597, + "grad_norm": 0.50390625, + "learning_rate": 4.8238484384624165e-06, + "loss": 2.3209, + "step": 13663 + }, + { + "epoch": 0.7330472103004292, + "grad_norm": 0.421875, + "learning_rate": 4.823816403713486e-06, + "loss": 2.0035, + "step": 13664 + }, + { + "epoch": 0.7331008583690987, + "grad_norm": 0.52734375, + "learning_rate": 4.823784366158306e-06, + "loss": 2.4717, + "step": 13665 + }, + { + "epoch": 0.7331545064377682, + "grad_norm": 0.50390625, + "learning_rate": 4.823752325796916e-06, + "loss": 2.5356, + "step": 13666 + }, + { + "epoch": 0.7332081545064377, + "grad_norm": 0.8984375, + "learning_rate": 4.823720282629353e-06, + "loss": 2.2891, + "step": 13667 + }, + { + "epoch": 0.7332618025751073, + "grad_norm": 0.390625, + "learning_rate": 4.823688236655656e-06, + "loss": 2.5518, + "step": 13668 + }, + { + "epoch": 0.7333154506437768, + "grad_norm": 0.68359375, + "learning_rate": 4.823656187875866e-06, + "loss": 2.4054, + "step": 13669 + }, + { + "epoch": 0.7333690987124464, + "grad_norm": 0.65625, + "learning_rate": 4.823624136290019e-06, + "loss": 2.321, + "step": 13670 + }, + { + "epoch": 0.7334227467811159, + "grad_norm": 0.44140625, + "learning_rate": 4.823592081898155e-06, + "loss": 2.1604, + "step": 13671 + }, + { + "epoch": 0.7334763948497854, + "grad_norm": 0.486328125, + "learning_rate": 4.823560024700313e-06, + "loss": 2.3267, + "step": 13672 + }, + { + "epoch": 0.7335300429184549, + "grad_norm": 0.39453125, + "learning_rate": 4.82352796469653e-06, + "loss": 2.2806, + "step": 13673 + }, + { + "epoch": 0.7335836909871245, + "grad_norm": 0.474609375, + "learning_rate": 4.823495901886848e-06, + "loss": 2.3336, + "step": 13674 + }, + { + "epoch": 0.733637339055794, + "grad_norm": 0.453125, + "learning_rate": 4.823463836271302e-06, + "loss": 2.1559, + "step": 13675 + }, + { + "epoch": 0.7336909871244636, + "grad_norm": 0.50390625, + "learning_rate": 4.823431767849933e-06, + "loss": 2.374, + "step": 13676 + }, + { + "epoch": 0.733744635193133, + "grad_norm": 0.40234375, + "learning_rate": 4.823399696622779e-06, + "loss": 2.0139, + "step": 13677 + }, + { + "epoch": 0.7337982832618025, + "grad_norm": 0.45703125, + "learning_rate": 4.823367622589879e-06, + "loss": 2.3296, + "step": 13678 + }, + { + "epoch": 0.7338519313304721, + "grad_norm": 0.4453125, + "learning_rate": 4.823335545751271e-06, + "loss": 1.715, + "step": 13679 + }, + { + "epoch": 0.7339055793991416, + "grad_norm": 0.4140625, + "learning_rate": 4.823303466106994e-06, + "loss": 2.3595, + "step": 13680 + }, + { + "epoch": 0.7339592274678112, + "grad_norm": 0.546875, + "learning_rate": 4.8232713836570885e-06, + "loss": 2.3426, + "step": 13681 + }, + { + "epoch": 0.7340128755364806, + "grad_norm": 0.416015625, + "learning_rate": 4.823239298401591e-06, + "loss": 2.0661, + "step": 13682 + }, + { + "epoch": 0.7340665236051502, + "grad_norm": 0.39453125, + "learning_rate": 4.823207210340541e-06, + "loss": 2.1786, + "step": 13683 + }, + { + "epoch": 0.7341201716738197, + "grad_norm": 0.66015625, + "learning_rate": 4.823175119473976e-06, + "loss": 2.448, + "step": 13684 + }, + { + "epoch": 0.7341738197424893, + "grad_norm": 0.4921875, + "learning_rate": 4.823143025801937e-06, + "loss": 2.2452, + "step": 13685 + }, + { + "epoch": 0.7342274678111588, + "grad_norm": 0.421875, + "learning_rate": 4.8231109293244625e-06, + "loss": 2.2152, + "step": 13686 + }, + { + "epoch": 0.7342811158798284, + "grad_norm": 0.5, + "learning_rate": 4.82307883004159e-06, + "loss": 2.7476, + "step": 13687 + }, + { + "epoch": 0.7343347639484978, + "grad_norm": 0.4765625, + "learning_rate": 4.8230467279533585e-06, + "loss": 2.3091, + "step": 13688 + }, + { + "epoch": 0.7343884120171674, + "grad_norm": 0.408203125, + "learning_rate": 4.823014623059808e-06, + "loss": 2.293, + "step": 13689 + }, + { + "epoch": 0.7344420600858369, + "grad_norm": 0.482421875, + "learning_rate": 4.822982515360975e-06, + "loss": 2.3996, + "step": 13690 + }, + { + "epoch": 0.7344957081545065, + "grad_norm": 0.494140625, + "learning_rate": 4.822950404856901e-06, + "loss": 2.5701, + "step": 13691 + }, + { + "epoch": 0.734549356223176, + "grad_norm": 0.58984375, + "learning_rate": 4.8229182915476225e-06, + "loss": 2.2643, + "step": 13692 + }, + { + "epoch": 0.7346030042918454, + "grad_norm": 0.52734375, + "learning_rate": 4.822886175433179e-06, + "loss": 2.3997, + "step": 13693 + }, + { + "epoch": 0.734656652360515, + "grad_norm": 0.4375, + "learning_rate": 4.8228540565136105e-06, + "loss": 2.3468, + "step": 13694 + }, + { + "epoch": 0.7347103004291845, + "grad_norm": 0.439453125, + "learning_rate": 4.8228219347889535e-06, + "loss": 2.327, + "step": 13695 + }, + { + "epoch": 0.7347639484978541, + "grad_norm": 0.6328125, + "learning_rate": 4.822789810259249e-06, + "loss": 1.1763, + "step": 13696 + }, + { + "epoch": 0.7348175965665236, + "grad_norm": 0.357421875, + "learning_rate": 4.822757682924534e-06, + "loss": 2.0033, + "step": 13697 + }, + { + "epoch": 0.7348712446351932, + "grad_norm": 0.396484375, + "learning_rate": 4.822725552784849e-06, + "loss": 2.2087, + "step": 13698 + }, + { + "epoch": 0.7349248927038626, + "grad_norm": 0.48046875, + "learning_rate": 4.822693419840231e-06, + "loss": 2.3518, + "step": 13699 + }, + { + "epoch": 0.7349785407725322, + "grad_norm": 0.478515625, + "learning_rate": 4.822661284090721e-06, + "loss": 2.3173, + "step": 13700 + }, + { + "epoch": 0.7350321888412017, + "grad_norm": 0.45703125, + "learning_rate": 4.822629145536355e-06, + "loss": 2.264, + "step": 13701 + }, + { + "epoch": 0.7350858369098713, + "grad_norm": 0.46484375, + "learning_rate": 4.822597004177174e-06, + "loss": 1.7053, + "step": 13702 + }, + { + "epoch": 0.7351394849785408, + "grad_norm": 0.41796875, + "learning_rate": 4.822564860013216e-06, + "loss": 2.0071, + "step": 13703 + }, + { + "epoch": 0.7351931330472103, + "grad_norm": 0.45703125, + "learning_rate": 4.8225327130445204e-06, + "loss": 2.3287, + "step": 13704 + }, + { + "epoch": 0.7352467811158798, + "grad_norm": 0.376953125, + "learning_rate": 4.8225005632711255e-06, + "loss": 2.1355, + "step": 13705 + }, + { + "epoch": 0.7353004291845494, + "grad_norm": 0.44921875, + "learning_rate": 4.82246841069307e-06, + "loss": 2.2502, + "step": 13706 + }, + { + "epoch": 0.7353540772532189, + "grad_norm": 0.431640625, + "learning_rate": 4.822436255310393e-06, + "loss": 2.2273, + "step": 13707 + }, + { + "epoch": 0.7354077253218884, + "grad_norm": 1.1796875, + "learning_rate": 4.822404097123134e-06, + "loss": 2.2985, + "step": 13708 + }, + { + "epoch": 0.735461373390558, + "grad_norm": 0.58203125, + "learning_rate": 4.8223719361313295e-06, + "loss": 2.3232, + "step": 13709 + }, + { + "epoch": 0.7355150214592274, + "grad_norm": 0.4140625, + "learning_rate": 4.82233977233502e-06, + "loss": 2.51, + "step": 13710 + }, + { + "epoch": 0.735568669527897, + "grad_norm": 0.54296875, + "learning_rate": 4.822307605734245e-06, + "loss": 2.4663, + "step": 13711 + }, + { + "epoch": 0.7356223175965665, + "grad_norm": 0.380859375, + "learning_rate": 4.822275436329043e-06, + "loss": 2.298, + "step": 13712 + }, + { + "epoch": 0.7356759656652361, + "grad_norm": 0.4765625, + "learning_rate": 4.822243264119452e-06, + "loss": 2.3584, + "step": 13713 + }, + { + "epoch": 0.7357296137339056, + "grad_norm": 0.59375, + "learning_rate": 4.822211089105512e-06, + "loss": 2.4949, + "step": 13714 + }, + { + "epoch": 0.7357832618025751, + "grad_norm": 0.4140625, + "learning_rate": 4.822178911287261e-06, + "loss": 2.3925, + "step": 13715 + }, + { + "epoch": 0.7358369098712446, + "grad_norm": 0.51953125, + "learning_rate": 4.822146730664738e-06, + "loss": 2.2389, + "step": 13716 + }, + { + "epoch": 0.7358905579399142, + "grad_norm": 0.453125, + "learning_rate": 4.822114547237982e-06, + "loss": 2.3654, + "step": 13717 + }, + { + "epoch": 0.7359442060085837, + "grad_norm": 0.4453125, + "learning_rate": 4.822082361007031e-06, + "loss": 2.3141, + "step": 13718 + }, + { + "epoch": 0.7359978540772533, + "grad_norm": 0.4140625, + "learning_rate": 4.822050171971925e-06, + "loss": 2.0818, + "step": 13719 + }, + { + "epoch": 0.7360515021459227, + "grad_norm": 0.486328125, + "learning_rate": 4.822017980132703e-06, + "loss": 2.5209, + "step": 13720 + }, + { + "epoch": 0.7361051502145923, + "grad_norm": 0.37890625, + "learning_rate": 4.821985785489402e-06, + "loss": 2.0254, + "step": 13721 + }, + { + "epoch": 0.7361587982832618, + "grad_norm": 0.44140625, + "learning_rate": 4.821953588042064e-06, + "loss": 2.2684, + "step": 13722 + }, + { + "epoch": 0.7362124463519313, + "grad_norm": 0.44140625, + "learning_rate": 4.821921387790726e-06, + "loss": 2.0125, + "step": 13723 + }, + { + "epoch": 0.7362660944206009, + "grad_norm": 0.39453125, + "learning_rate": 4.8218891847354254e-06, + "loss": 2.3766, + "step": 13724 + }, + { + "epoch": 0.7363197424892703, + "grad_norm": 0.78125, + "learning_rate": 4.821856978876204e-06, + "loss": 2.355, + "step": 13725 + }, + { + "epoch": 0.7363733905579399, + "grad_norm": 0.486328125, + "learning_rate": 4.821824770213099e-06, + "loss": 2.6, + "step": 13726 + }, + { + "epoch": 0.7364270386266094, + "grad_norm": 0.37109375, + "learning_rate": 4.82179255874615e-06, + "loss": 2.283, + "step": 13727 + }, + { + "epoch": 0.736480686695279, + "grad_norm": 0.51171875, + "learning_rate": 4.821760344475396e-06, + "loss": 2.3389, + "step": 13728 + }, + { + "epoch": 0.7365343347639485, + "grad_norm": 0.5546875, + "learning_rate": 4.8217281274008744e-06, + "loss": 2.1825, + "step": 13729 + }, + { + "epoch": 0.7365879828326181, + "grad_norm": 0.451171875, + "learning_rate": 4.821695907522626e-06, + "loss": 2.6591, + "step": 13730 + }, + { + "epoch": 0.7366416309012875, + "grad_norm": 0.458984375, + "learning_rate": 4.821663684840689e-06, + "loss": 1.9544, + "step": 13731 + }, + { + "epoch": 0.7366952789699571, + "grad_norm": 0.498046875, + "learning_rate": 4.821631459355102e-06, + "loss": 2.219, + "step": 13732 + }, + { + "epoch": 0.7367489270386266, + "grad_norm": 1.734375, + "learning_rate": 4.821599231065903e-06, + "loss": 2.3356, + "step": 13733 + }, + { + "epoch": 0.7368025751072962, + "grad_norm": 0.455078125, + "learning_rate": 4.821566999973134e-06, + "loss": 2.3334, + "step": 13734 + }, + { + "epoch": 0.7368562231759657, + "grad_norm": 0.45703125, + "learning_rate": 4.821534766076831e-06, + "loss": 2.2827, + "step": 13735 + }, + { + "epoch": 0.7369098712446351, + "grad_norm": 0.43359375, + "learning_rate": 4.821502529377034e-06, + "loss": 2.3027, + "step": 13736 + }, + { + "epoch": 0.7369635193133047, + "grad_norm": 1.15625, + "learning_rate": 4.821470289873782e-06, + "loss": 2.0233, + "step": 13737 + }, + { + "epoch": 0.7370171673819742, + "grad_norm": 0.55859375, + "learning_rate": 4.821438047567114e-06, + "loss": 2.3898, + "step": 13738 + }, + { + "epoch": 0.7370708154506438, + "grad_norm": 0.490234375, + "learning_rate": 4.821405802457068e-06, + "loss": 2.2213, + "step": 13739 + }, + { + "epoch": 0.7371244635193133, + "grad_norm": 0.482421875, + "learning_rate": 4.821373554543685e-06, + "loss": 2.2932, + "step": 13740 + }, + { + "epoch": 0.7371781115879829, + "grad_norm": 0.42578125, + "learning_rate": 4.821341303827001e-06, + "loss": 2.4808, + "step": 13741 + }, + { + "epoch": 0.7372317596566523, + "grad_norm": 0.474609375, + "learning_rate": 4.821309050307057e-06, + "loss": 2.232, + "step": 13742 + }, + { + "epoch": 0.7372854077253219, + "grad_norm": 0.439453125, + "learning_rate": 4.821276793983891e-06, + "loss": 2.2012, + "step": 13743 + }, + { + "epoch": 0.7373390557939914, + "grad_norm": 0.41796875, + "learning_rate": 4.821244534857544e-06, + "loss": 2.3265, + "step": 13744 + }, + { + "epoch": 0.737392703862661, + "grad_norm": 0.515625, + "learning_rate": 4.821212272928053e-06, + "loss": 2.4117, + "step": 13745 + }, + { + "epoch": 0.7374463519313305, + "grad_norm": 0.45703125, + "learning_rate": 4.821180008195456e-06, + "loss": 2.6936, + "step": 13746 + }, + { + "epoch": 0.7375, + "grad_norm": 0.4375, + "learning_rate": 4.821147740659795e-06, + "loss": 2.21, + "step": 13747 + }, + { + "epoch": 0.7375536480686695, + "grad_norm": 0.490234375, + "learning_rate": 4.821115470321106e-06, + "loss": 2.2294, + "step": 13748 + }, + { + "epoch": 0.7376072961373391, + "grad_norm": 0.48828125, + "learning_rate": 4.821083197179431e-06, + "loss": 2.4449, + "step": 13749 + }, + { + "epoch": 0.7376609442060086, + "grad_norm": 0.412109375, + "learning_rate": 4.821050921234805e-06, + "loss": 1.8894, + "step": 13750 + }, + { + "epoch": 0.7377145922746781, + "grad_norm": 1.171875, + "learning_rate": 4.82101864248727e-06, + "loss": 2.3862, + "step": 13751 + }, + { + "epoch": 0.7377682403433476, + "grad_norm": 0.458984375, + "learning_rate": 4.820986360936865e-06, + "loss": 2.4238, + "step": 13752 + }, + { + "epoch": 0.7378218884120171, + "grad_norm": 0.455078125, + "learning_rate": 4.820954076583627e-06, + "loss": 2.6324, + "step": 13753 + }, + { + "epoch": 0.7378755364806867, + "grad_norm": 0.494140625, + "learning_rate": 4.820921789427597e-06, + "loss": 2.2189, + "step": 13754 + }, + { + "epoch": 0.7379291845493562, + "grad_norm": 0.47265625, + "learning_rate": 4.820889499468812e-06, + "loss": 1.9847, + "step": 13755 + }, + { + "epoch": 0.7379828326180258, + "grad_norm": 0.478515625, + "learning_rate": 4.820857206707313e-06, + "loss": 2.1127, + "step": 13756 + }, + { + "epoch": 0.7380364806866953, + "grad_norm": 0.482421875, + "learning_rate": 4.820824911143139e-06, + "loss": 2.4267, + "step": 13757 + }, + { + "epoch": 0.7380901287553648, + "grad_norm": 0.546875, + "learning_rate": 4.820792612776327e-06, + "loss": 2.3384, + "step": 13758 + }, + { + "epoch": 0.7381437768240343, + "grad_norm": 1.359375, + "learning_rate": 4.820760311606918e-06, + "loss": 1.9084, + "step": 13759 + }, + { + "epoch": 0.7381974248927039, + "grad_norm": 0.45703125, + "learning_rate": 4.820728007634949e-06, + "loss": 2.2819, + "step": 13760 + }, + { + "epoch": 0.7382510729613734, + "grad_norm": 0.498046875, + "learning_rate": 4.820695700860461e-06, + "loss": 2.4706, + "step": 13761 + }, + { + "epoch": 0.738304721030043, + "grad_norm": 0.53125, + "learning_rate": 4.820663391283492e-06, + "loss": 2.1461, + "step": 13762 + }, + { + "epoch": 0.7383583690987124, + "grad_norm": 0.43359375, + "learning_rate": 4.820631078904081e-06, + "loss": 2.2916, + "step": 13763 + }, + { + "epoch": 0.738412017167382, + "grad_norm": 0.4609375, + "learning_rate": 4.8205987637222674e-06, + "loss": 2.4207, + "step": 13764 + }, + { + "epoch": 0.7384656652360515, + "grad_norm": 0.5, + "learning_rate": 4.8205664457380895e-06, + "loss": 2.3512, + "step": 13765 + }, + { + "epoch": 0.738519313304721, + "grad_norm": 0.58203125, + "learning_rate": 4.820534124951588e-06, + "loss": 2.1926, + "step": 13766 + }, + { + "epoch": 0.7385729613733906, + "grad_norm": 0.478515625, + "learning_rate": 4.8205018013628e-06, + "loss": 2.4243, + "step": 13767 + }, + { + "epoch": 0.73862660944206, + "grad_norm": 0.396484375, + "learning_rate": 4.820469474971765e-06, + "loss": 2.3893, + "step": 13768 + }, + { + "epoch": 0.7386802575107296, + "grad_norm": 0.466796875, + "learning_rate": 4.820437145778523e-06, + "loss": 2.2131, + "step": 13769 + }, + { + "epoch": 0.7387339055793991, + "grad_norm": 0.515625, + "learning_rate": 4.8204048137831125e-06, + "loss": 2.2547, + "step": 13770 + }, + { + "epoch": 0.7387875536480687, + "grad_norm": 0.4609375, + "learning_rate": 4.820372478985572e-06, + "loss": 2.3581, + "step": 13771 + }, + { + "epoch": 0.7388412017167382, + "grad_norm": 0.419921875, + "learning_rate": 4.820340141385941e-06, + "loss": 2.1779, + "step": 13772 + }, + { + "epoch": 0.7388948497854078, + "grad_norm": 0.443359375, + "learning_rate": 4.820307800984259e-06, + "loss": 2.3478, + "step": 13773 + }, + { + "epoch": 0.7389484978540772, + "grad_norm": 0.490234375, + "learning_rate": 4.820275457780564e-06, + "loss": 2.3288, + "step": 13774 + }, + { + "epoch": 0.7390021459227468, + "grad_norm": 0.5234375, + "learning_rate": 4.8202431117748965e-06, + "loss": 2.2763, + "step": 13775 + }, + { + "epoch": 0.7390557939914163, + "grad_norm": 0.466796875, + "learning_rate": 4.820210762967295e-06, + "loss": 2.541, + "step": 13776 + }, + { + "epoch": 0.7391094420600859, + "grad_norm": 0.400390625, + "learning_rate": 4.820178411357797e-06, + "loss": 2.4144, + "step": 13777 + }, + { + "epoch": 0.7391630901287554, + "grad_norm": 0.423828125, + "learning_rate": 4.820146056946443e-06, + "loss": 2.6429, + "step": 13778 + }, + { + "epoch": 0.7392167381974248, + "grad_norm": 0.47265625, + "learning_rate": 4.820113699733272e-06, + "loss": 2.2794, + "step": 13779 + }, + { + "epoch": 0.7392703862660944, + "grad_norm": 0.578125, + "learning_rate": 4.820081339718323e-06, + "loss": 2.3343, + "step": 13780 + }, + { + "epoch": 0.7393240343347639, + "grad_norm": 0.3984375, + "learning_rate": 4.8200489769016355e-06, + "loss": 2.2927, + "step": 13781 + }, + { + "epoch": 0.7393776824034335, + "grad_norm": 0.439453125, + "learning_rate": 4.8200166112832485e-06, + "loss": 2.282, + "step": 13782 + }, + { + "epoch": 0.739431330472103, + "grad_norm": 0.44921875, + "learning_rate": 4.8199842428632e-06, + "loss": 2.0995, + "step": 13783 + }, + { + "epoch": 0.7394849785407726, + "grad_norm": 0.65234375, + "learning_rate": 4.819951871641531e-06, + "loss": 2.254, + "step": 13784 + }, + { + "epoch": 0.739538626609442, + "grad_norm": 0.7578125, + "learning_rate": 4.819919497618277e-06, + "loss": 2.381, + "step": 13785 + }, + { + "epoch": 0.7395922746781116, + "grad_norm": 1.0703125, + "learning_rate": 4.819887120793482e-06, + "loss": 2.4172, + "step": 13786 + }, + { + "epoch": 0.7396459227467811, + "grad_norm": 0.51953125, + "learning_rate": 4.819854741167182e-06, + "loss": 2.2173, + "step": 13787 + }, + { + "epoch": 0.7396995708154507, + "grad_norm": 0.443359375, + "learning_rate": 4.819822358739416e-06, + "loss": 1.9059, + "step": 13788 + }, + { + "epoch": 0.7397532188841202, + "grad_norm": 0.4453125, + "learning_rate": 4.819789973510225e-06, + "loss": 2.189, + "step": 13789 + }, + { + "epoch": 0.7398068669527897, + "grad_norm": 0.51171875, + "learning_rate": 4.819757585479645e-06, + "loss": 1.9083, + "step": 13790 + }, + { + "epoch": 0.7398605150214592, + "grad_norm": 0.59375, + "learning_rate": 4.819725194647719e-06, + "loss": 2.1742, + "step": 13791 + }, + { + "epoch": 0.7399141630901288, + "grad_norm": 0.57421875, + "learning_rate": 4.819692801014483e-06, + "loss": 2.2466, + "step": 13792 + }, + { + "epoch": 0.7399678111587983, + "grad_norm": 0.400390625, + "learning_rate": 4.819660404579978e-06, + "loss": 2.2568, + "step": 13793 + }, + { + "epoch": 0.7400214592274678, + "grad_norm": 0.6328125, + "learning_rate": 4.819628005344242e-06, + "loss": 2.6334, + "step": 13794 + }, + { + "epoch": 0.7400751072961373, + "grad_norm": 1.0546875, + "learning_rate": 4.819595603307314e-06, + "loss": 2.3767, + "step": 13795 + }, + { + "epoch": 0.7401287553648068, + "grad_norm": 0.453125, + "learning_rate": 4.819563198469235e-06, + "loss": 2.2962, + "step": 13796 + }, + { + "epoch": 0.7401824034334764, + "grad_norm": 0.5, + "learning_rate": 4.819530790830042e-06, + "loss": 2.561, + "step": 13797 + }, + { + "epoch": 0.7402360515021459, + "grad_norm": 0.40234375, + "learning_rate": 4.819498380389775e-06, + "loss": 2.157, + "step": 13798 + }, + { + "epoch": 0.7402896995708155, + "grad_norm": 0.400390625, + "learning_rate": 4.819465967148472e-06, + "loss": 2.1045, + "step": 13799 + }, + { + "epoch": 0.740343347639485, + "grad_norm": 0.40625, + "learning_rate": 4.819433551106175e-06, + "loss": 2.3603, + "step": 13800 + }, + { + "epoch": 0.7403969957081545, + "grad_norm": 0.54296875, + "learning_rate": 4.81940113226292e-06, + "loss": 2.3873, + "step": 13801 + }, + { + "epoch": 0.740450643776824, + "grad_norm": 0.447265625, + "learning_rate": 4.8193687106187484e-06, + "loss": 2.2529, + "step": 13802 + }, + { + "epoch": 0.7405042918454936, + "grad_norm": 0.48046875, + "learning_rate": 4.819336286173698e-06, + "loss": 2.3393, + "step": 13803 + }, + { + "epoch": 0.7405579399141631, + "grad_norm": 0.466796875, + "learning_rate": 4.8193038589278085e-06, + "loss": 2.2273, + "step": 13804 + }, + { + "epoch": 0.7406115879828327, + "grad_norm": 0.453125, + "learning_rate": 4.819271428881119e-06, + "loss": 2.0686, + "step": 13805 + }, + { + "epoch": 0.7406652360515021, + "grad_norm": 0.63671875, + "learning_rate": 4.819238996033669e-06, + "loss": 2.3178, + "step": 13806 + }, + { + "epoch": 0.7407188841201717, + "grad_norm": 0.4140625, + "learning_rate": 4.819206560385496e-06, + "loss": 2.3632, + "step": 13807 + }, + { + "epoch": 0.7407725321888412, + "grad_norm": 0.427734375, + "learning_rate": 4.819174121936641e-06, + "loss": 2.3714, + "step": 13808 + }, + { + "epoch": 0.7408261802575107, + "grad_norm": 0.5, + "learning_rate": 4.819141680687144e-06, + "loss": 2.2666, + "step": 13809 + }, + { + "epoch": 0.7408798283261803, + "grad_norm": 0.380859375, + "learning_rate": 4.819109236637042e-06, + "loss": 2.5855, + "step": 13810 + }, + { + "epoch": 0.7409334763948497, + "grad_norm": 0.3828125, + "learning_rate": 4.819076789786374e-06, + "loss": 2.1397, + "step": 13811 + }, + { + "epoch": 0.7409871244635193, + "grad_norm": 0.462890625, + "learning_rate": 4.819044340135182e-06, + "loss": 2.3752, + "step": 13812 + }, + { + "epoch": 0.7410407725321888, + "grad_norm": 0.478515625, + "learning_rate": 4.819011887683503e-06, + "loss": 2.5326, + "step": 13813 + }, + { + "epoch": 0.7410944206008584, + "grad_norm": 5.0625, + "learning_rate": 4.818979432431375e-06, + "loss": 2.5696, + "step": 13814 + }, + { + "epoch": 0.7411480686695279, + "grad_norm": 0.41796875, + "learning_rate": 4.81894697437884e-06, + "loss": 1.7665, + "step": 13815 + }, + { + "epoch": 0.7412017167381975, + "grad_norm": 0.44921875, + "learning_rate": 4.818914513525935e-06, + "loss": 2.3077, + "step": 13816 + }, + { + "epoch": 0.7412553648068669, + "grad_norm": 0.466796875, + "learning_rate": 4.8188820498727016e-06, + "loss": 2.0859, + "step": 13817 + }, + { + "epoch": 0.7413090128755365, + "grad_norm": 0.447265625, + "learning_rate": 4.818849583419177e-06, + "loss": 2.2715, + "step": 13818 + }, + { + "epoch": 0.741362660944206, + "grad_norm": 0.416015625, + "learning_rate": 4.8188171141654e-06, + "loss": 2.3108, + "step": 13819 + }, + { + "epoch": 0.7414163090128756, + "grad_norm": 0.859375, + "learning_rate": 4.818784642111411e-06, + "loss": 2.308, + "step": 13820 + }, + { + "epoch": 0.7414699570815451, + "grad_norm": 0.4453125, + "learning_rate": 4.81875216725725e-06, + "loss": 2.486, + "step": 13821 + }, + { + "epoch": 0.7415236051502145, + "grad_norm": 0.45703125, + "learning_rate": 4.818719689602954e-06, + "loss": 2.4895, + "step": 13822 + }, + { + "epoch": 0.7415772532188841, + "grad_norm": 0.4140625, + "learning_rate": 4.818687209148565e-06, + "loss": 2.2781, + "step": 13823 + }, + { + "epoch": 0.7416309012875536, + "grad_norm": 0.46875, + "learning_rate": 4.818654725894119e-06, + "loss": 2.4792, + "step": 13824 + }, + { + "epoch": 0.7416845493562232, + "grad_norm": 0.5234375, + "learning_rate": 4.818622239839658e-06, + "loss": 2.2707, + "step": 13825 + }, + { + "epoch": 0.7417381974248927, + "grad_norm": 0.5625, + "learning_rate": 4.818589750985219e-06, + "loss": 2.3793, + "step": 13826 + }, + { + "epoch": 0.7417918454935623, + "grad_norm": 0.412109375, + "learning_rate": 4.8185572593308434e-06, + "loss": 2.1386, + "step": 13827 + }, + { + "epoch": 0.7418454935622317, + "grad_norm": 0.4296875, + "learning_rate": 4.818524764876569e-06, + "loss": 2.4534, + "step": 13828 + }, + { + "epoch": 0.7418991416309013, + "grad_norm": 0.51953125, + "learning_rate": 4.818492267622435e-06, + "loss": 2.1962, + "step": 13829 + }, + { + "epoch": 0.7419527896995708, + "grad_norm": 0.67578125, + "learning_rate": 4.818459767568481e-06, + "loss": 2.5351, + "step": 13830 + }, + { + "epoch": 0.7420064377682404, + "grad_norm": 0.54296875, + "learning_rate": 4.818427264714747e-06, + "loss": 2.5398, + "step": 13831 + }, + { + "epoch": 0.7420600858369099, + "grad_norm": 0.482421875, + "learning_rate": 4.8183947590612714e-06, + "loss": 2.2426, + "step": 13832 + }, + { + "epoch": 0.7421137339055794, + "grad_norm": 0.44140625, + "learning_rate": 4.818362250608093e-06, + "loss": 2.2963, + "step": 13833 + }, + { + "epoch": 0.7421673819742489, + "grad_norm": 0.4296875, + "learning_rate": 4.818329739355252e-06, + "loss": 2.2417, + "step": 13834 + }, + { + "epoch": 0.7422210300429185, + "grad_norm": 0.42578125, + "learning_rate": 4.818297225302788e-06, + "loss": 2.3716, + "step": 13835 + }, + { + "epoch": 0.742274678111588, + "grad_norm": 0.408203125, + "learning_rate": 4.818264708450738e-06, + "loss": 2.4405, + "step": 13836 + }, + { + "epoch": 0.7423283261802575, + "grad_norm": 0.384765625, + "learning_rate": 4.818232188799144e-06, + "loss": 2.2937, + "step": 13837 + }, + { + "epoch": 0.742381974248927, + "grad_norm": 0.474609375, + "learning_rate": 4.818199666348044e-06, + "loss": 2.4655, + "step": 13838 + }, + { + "epoch": 0.7424356223175965, + "grad_norm": 0.54296875, + "learning_rate": 4.818167141097477e-06, + "loss": 2.2866, + "step": 13839 + }, + { + "epoch": 0.7424892703862661, + "grad_norm": 0.453125, + "learning_rate": 4.8181346130474836e-06, + "loss": 2.272, + "step": 13840 + }, + { + "epoch": 0.7425429184549356, + "grad_norm": 0.609375, + "learning_rate": 4.818102082198102e-06, + "loss": 1.691, + "step": 13841 + }, + { + "epoch": 0.7425965665236052, + "grad_norm": 0.419921875, + "learning_rate": 4.818069548549371e-06, + "loss": 2.1966, + "step": 13842 + }, + { + "epoch": 0.7426502145922746, + "grad_norm": 0.462890625, + "learning_rate": 4.818037012101331e-06, + "loss": 2.2996, + "step": 13843 + }, + { + "epoch": 0.7427038626609442, + "grad_norm": 0.490234375, + "learning_rate": 4.81800447285402e-06, + "loss": 2.3441, + "step": 13844 + }, + { + "epoch": 0.7427575107296137, + "grad_norm": 0.48046875, + "learning_rate": 4.817971930807479e-06, + "loss": 2.2845, + "step": 13845 + }, + { + "epoch": 0.7428111587982833, + "grad_norm": 0.5234375, + "learning_rate": 4.817939385961746e-06, + "loss": 2.2816, + "step": 13846 + }, + { + "epoch": 0.7428648068669528, + "grad_norm": 0.55078125, + "learning_rate": 4.81790683831686e-06, + "loss": 2.1847, + "step": 13847 + }, + { + "epoch": 0.7429184549356224, + "grad_norm": 0.474609375, + "learning_rate": 4.817874287872862e-06, + "loss": 2.3934, + "step": 13848 + }, + { + "epoch": 0.7429721030042918, + "grad_norm": 2.9375, + "learning_rate": 4.81784173462979e-06, + "loss": 1.379, + "step": 13849 + }, + { + "epoch": 0.7430257510729614, + "grad_norm": 0.392578125, + "learning_rate": 4.817809178587684e-06, + "loss": 2.2398, + "step": 13850 + }, + { + "epoch": 0.7430793991416309, + "grad_norm": 0.453125, + "learning_rate": 4.817776619746582e-06, + "loss": 2.3725, + "step": 13851 + }, + { + "epoch": 0.7431330472103004, + "grad_norm": 0.498046875, + "learning_rate": 4.817744058106526e-06, + "loss": 2.4389, + "step": 13852 + }, + { + "epoch": 0.74318669527897, + "grad_norm": 0.451171875, + "learning_rate": 4.817711493667552e-06, + "loss": 2.229, + "step": 13853 + }, + { + "epoch": 0.7432403433476394, + "grad_norm": 0.48046875, + "learning_rate": 4.817678926429702e-06, + "loss": 2.5244, + "step": 13854 + }, + { + "epoch": 0.743293991416309, + "grad_norm": 0.482421875, + "learning_rate": 4.817646356393013e-06, + "loss": 2.5017, + "step": 13855 + }, + { + "epoch": 0.7433476394849785, + "grad_norm": 1.7734375, + "learning_rate": 4.817613783557526e-06, + "loss": 2.322, + "step": 13856 + }, + { + "epoch": 0.7434012875536481, + "grad_norm": 0.66015625, + "learning_rate": 4.817581207923281e-06, + "loss": 2.3173, + "step": 13857 + }, + { + "epoch": 0.7434549356223176, + "grad_norm": 0.58984375, + "learning_rate": 4.817548629490315e-06, + "loss": 2.3735, + "step": 13858 + }, + { + "epoch": 0.7435085836909872, + "grad_norm": 0.423828125, + "learning_rate": 4.817516048258669e-06, + "loss": 2.5418, + "step": 13859 + }, + { + "epoch": 0.7435622317596566, + "grad_norm": 0.435546875, + "learning_rate": 4.817483464228382e-06, + "loss": 2.206, + "step": 13860 + }, + { + "epoch": 0.7436158798283262, + "grad_norm": 0.474609375, + "learning_rate": 4.817450877399493e-06, + "loss": 2.3668, + "step": 13861 + }, + { + "epoch": 0.7436695278969957, + "grad_norm": 0.75, + "learning_rate": 4.817418287772042e-06, + "loss": 2.4693, + "step": 13862 + }, + { + "epoch": 0.7437231759656653, + "grad_norm": 0.49609375, + "learning_rate": 4.817385695346068e-06, + "loss": 2.2759, + "step": 13863 + }, + { + "epoch": 0.7437768240343348, + "grad_norm": 0.458984375, + "learning_rate": 4.817353100121611e-06, + "loss": 2.4082, + "step": 13864 + }, + { + "epoch": 0.7438304721030042, + "grad_norm": 0.59375, + "learning_rate": 4.817320502098709e-06, + "loss": 2.3491, + "step": 13865 + }, + { + "epoch": 0.7438841201716738, + "grad_norm": 0.408203125, + "learning_rate": 4.817287901277401e-06, + "loss": 2.1899, + "step": 13866 + }, + { + "epoch": 0.7439377682403433, + "grad_norm": 0.486328125, + "learning_rate": 4.817255297657729e-06, + "loss": 2.1358, + "step": 13867 + }, + { + "epoch": 0.7439914163090129, + "grad_norm": 0.51953125, + "learning_rate": 4.81722269123973e-06, + "loss": 2.1378, + "step": 13868 + }, + { + "epoch": 0.7440450643776824, + "grad_norm": 0.439453125, + "learning_rate": 4.8171900820234455e-06, + "loss": 2.2857, + "step": 13869 + }, + { + "epoch": 0.744098712446352, + "grad_norm": 0.59765625, + "learning_rate": 4.817157470008912e-06, + "loss": 2.1812, + "step": 13870 + }, + { + "epoch": 0.7441523605150214, + "grad_norm": 0.470703125, + "learning_rate": 4.817124855196171e-06, + "loss": 2.1748, + "step": 13871 + }, + { + "epoch": 0.744206008583691, + "grad_norm": 0.55078125, + "learning_rate": 4.817092237585261e-06, + "loss": 2.1946, + "step": 13872 + }, + { + "epoch": 0.7442596566523605, + "grad_norm": 0.3984375, + "learning_rate": 4.817059617176222e-06, + "loss": 2.232, + "step": 13873 + }, + { + "epoch": 0.7443133047210301, + "grad_norm": 0.4375, + "learning_rate": 4.8170269939690936e-06, + "loss": 2.329, + "step": 13874 + }, + { + "epoch": 0.7443669527896996, + "grad_norm": 0.3984375, + "learning_rate": 4.816994367963914e-06, + "loss": 2.0368, + "step": 13875 + }, + { + "epoch": 0.7444206008583691, + "grad_norm": 1.34375, + "learning_rate": 4.816961739160724e-06, + "loss": 1.4315, + "step": 13876 + }, + { + "epoch": 0.7444742489270386, + "grad_norm": 0.359375, + "learning_rate": 4.816929107559561e-06, + "loss": 2.0758, + "step": 13877 + }, + { + "epoch": 0.7445278969957082, + "grad_norm": 0.515625, + "learning_rate": 4.816896473160468e-06, + "loss": 2.4137, + "step": 13878 + }, + { + "epoch": 0.7445815450643777, + "grad_norm": 0.486328125, + "learning_rate": 4.81686383596348e-06, + "loss": 2.4658, + "step": 13879 + }, + { + "epoch": 0.7446351931330472, + "grad_norm": 0.4375, + "learning_rate": 4.81683119596864e-06, + "loss": 2.2986, + "step": 13880 + }, + { + "epoch": 0.7446888412017167, + "grad_norm": 0.392578125, + "learning_rate": 4.816798553175985e-06, + "loss": 2.4378, + "step": 13881 + }, + { + "epoch": 0.7447424892703862, + "grad_norm": 0.4375, + "learning_rate": 4.816765907585556e-06, + "loss": 2.2038, + "step": 13882 + }, + { + "epoch": 0.7447961373390558, + "grad_norm": 0.44140625, + "learning_rate": 4.816733259197391e-06, + "loss": 2.4647, + "step": 13883 + }, + { + "epoch": 0.7448497854077253, + "grad_norm": 0.419921875, + "learning_rate": 4.816700608011531e-06, + "loss": 2.4701, + "step": 13884 + }, + { + "epoch": 0.7449034334763949, + "grad_norm": 0.458984375, + "learning_rate": 4.816667954028014e-06, + "loss": 2.5496, + "step": 13885 + }, + { + "epoch": 0.7449570815450643, + "grad_norm": 0.50390625, + "learning_rate": 4.816635297246881e-06, + "loss": 1.9996, + "step": 13886 + }, + { + "epoch": 0.7450107296137339, + "grad_norm": 0.41015625, + "learning_rate": 4.81660263766817e-06, + "loss": 2.3703, + "step": 13887 + }, + { + "epoch": 0.7450643776824034, + "grad_norm": 0.50390625, + "learning_rate": 4.816569975291921e-06, + "loss": 2.3347, + "step": 13888 + }, + { + "epoch": 0.745118025751073, + "grad_norm": 0.5, + "learning_rate": 4.816537310118173e-06, + "loss": 2.1348, + "step": 13889 + }, + { + "epoch": 0.7451716738197425, + "grad_norm": 0.462890625, + "learning_rate": 4.816504642146966e-06, + "loss": 2.0499, + "step": 13890 + }, + { + "epoch": 0.7452253218884121, + "grad_norm": 0.74609375, + "learning_rate": 4.8164719713783404e-06, + "loss": 2.1375, + "step": 13891 + }, + { + "epoch": 0.7452789699570815, + "grad_norm": 0.455078125, + "learning_rate": 4.816439297812333e-06, + "loss": 2.1691, + "step": 13892 + }, + { + "epoch": 0.7453326180257511, + "grad_norm": 0.404296875, + "learning_rate": 4.816406621448986e-06, + "loss": 2.2926, + "step": 13893 + }, + { + "epoch": 0.7453862660944206, + "grad_norm": 0.6171875, + "learning_rate": 4.8163739422883375e-06, + "loss": 2.2391, + "step": 13894 + }, + { + "epoch": 0.7454399141630901, + "grad_norm": 0.45703125, + "learning_rate": 4.816341260330427e-06, + "loss": 2.1602, + "step": 13895 + }, + { + "epoch": 0.7454935622317597, + "grad_norm": 0.47265625, + "learning_rate": 4.816308575575294e-06, + "loss": 2.0994, + "step": 13896 + }, + { + "epoch": 0.7455472103004291, + "grad_norm": 0.53515625, + "learning_rate": 4.816275888022978e-06, + "loss": 2.4502, + "step": 13897 + }, + { + "epoch": 0.7456008583690987, + "grad_norm": 0.5625, + "learning_rate": 4.816243197673519e-06, + "loss": 2.3627, + "step": 13898 + }, + { + "epoch": 0.7456545064377682, + "grad_norm": 0.52734375, + "learning_rate": 4.816210504526956e-06, + "loss": 2.1279, + "step": 13899 + }, + { + "epoch": 0.7457081545064378, + "grad_norm": 0.423828125, + "learning_rate": 4.8161778085833276e-06, + "loss": 2.3916, + "step": 13900 + }, + { + "epoch": 0.7457618025751073, + "grad_norm": 0.384765625, + "learning_rate": 4.8161451098426745e-06, + "loss": 2.1373, + "step": 13901 + }, + { + "epoch": 0.7458154506437769, + "grad_norm": 0.4921875, + "learning_rate": 4.816112408305037e-06, + "loss": 2.2173, + "step": 13902 + }, + { + "epoch": 0.7458690987124463, + "grad_norm": 0.5390625, + "learning_rate": 4.816079703970452e-06, + "loss": 2.0661, + "step": 13903 + }, + { + "epoch": 0.7459227467811159, + "grad_norm": 0.44921875, + "learning_rate": 4.8160469968389615e-06, + "loss": 2.4984, + "step": 13904 + }, + { + "epoch": 0.7459763948497854, + "grad_norm": 0.59375, + "learning_rate": 4.816014286910604e-06, + "loss": 2.3965, + "step": 13905 + }, + { + "epoch": 0.746030042918455, + "grad_norm": 0.59375, + "learning_rate": 4.8159815741854185e-06, + "loss": 2.4537, + "step": 13906 + }, + { + "epoch": 0.7460836909871245, + "grad_norm": 0.390625, + "learning_rate": 4.815948858663445e-06, + "loss": 2.3275, + "step": 13907 + }, + { + "epoch": 0.746137339055794, + "grad_norm": 0.41796875, + "learning_rate": 4.815916140344723e-06, + "loss": 2.4526, + "step": 13908 + }, + { + "epoch": 0.7461909871244635, + "grad_norm": 0.451171875, + "learning_rate": 4.815883419229293e-06, + "loss": 2.2255, + "step": 13909 + }, + { + "epoch": 0.746244635193133, + "grad_norm": 0.494140625, + "learning_rate": 4.815850695317191e-06, + "loss": 1.7053, + "step": 13910 + }, + { + "epoch": 0.7462982832618026, + "grad_norm": 0.49609375, + "learning_rate": 4.815817968608461e-06, + "loss": 2.5233, + "step": 13911 + }, + { + "epoch": 0.7463519313304721, + "grad_norm": 0.474609375, + "learning_rate": 4.81578523910314e-06, + "loss": 2.2018, + "step": 13912 + }, + { + "epoch": 0.7464055793991416, + "grad_norm": 0.388671875, + "learning_rate": 4.815752506801269e-06, + "loss": 2.2184, + "step": 13913 + }, + { + "epoch": 0.7464592274678111, + "grad_norm": 0.6796875, + "learning_rate": 4.815719771702885e-06, + "loss": 2.3341, + "step": 13914 + }, + { + "epoch": 0.7465128755364807, + "grad_norm": 0.44140625, + "learning_rate": 4.815687033808031e-06, + "loss": 2.1727, + "step": 13915 + }, + { + "epoch": 0.7465665236051502, + "grad_norm": 0.490234375, + "learning_rate": 4.815654293116743e-06, + "loss": 2.1352, + "step": 13916 + }, + { + "epoch": 0.7466201716738198, + "grad_norm": 0.48828125, + "learning_rate": 4.815621549629063e-06, + "loss": 2.5746, + "step": 13917 + }, + { + "epoch": 0.7466738197424893, + "grad_norm": 0.546875, + "learning_rate": 4.815588803345029e-06, + "loss": 2.3728, + "step": 13918 + }, + { + "epoch": 0.7467274678111588, + "grad_norm": 0.4296875, + "learning_rate": 4.815556054264682e-06, + "loss": 2.3041, + "step": 13919 + }, + { + "epoch": 0.7467811158798283, + "grad_norm": 0.470703125, + "learning_rate": 4.81552330238806e-06, + "loss": 2.1806, + "step": 13920 + }, + { + "epoch": 0.7468347639484979, + "grad_norm": 0.43359375, + "learning_rate": 4.8154905477152046e-06, + "loss": 2.3722, + "step": 13921 + }, + { + "epoch": 0.7468884120171674, + "grad_norm": 0.51171875, + "learning_rate": 4.815457790246153e-06, + "loss": 2.4891, + "step": 13922 + }, + { + "epoch": 0.7469420600858369, + "grad_norm": 0.453125, + "learning_rate": 4.815425029980947e-06, + "loss": 2.0073, + "step": 13923 + }, + { + "epoch": 0.7469957081545064, + "grad_norm": 0.462890625, + "learning_rate": 4.815392266919624e-06, + "loss": 2.121, + "step": 13924 + }, + { + "epoch": 0.7470493562231759, + "grad_norm": 0.4296875, + "learning_rate": 4.815359501062226e-06, + "loss": 2.2608, + "step": 13925 + }, + { + "epoch": 0.7471030042918455, + "grad_norm": 0.443359375, + "learning_rate": 4.815326732408789e-06, + "loss": 2.4035, + "step": 13926 + }, + { + "epoch": 0.747156652360515, + "grad_norm": 0.451171875, + "learning_rate": 4.815293960959357e-06, + "loss": 2.2912, + "step": 13927 + }, + { + "epoch": 0.7472103004291846, + "grad_norm": 0.390625, + "learning_rate": 4.815261186713966e-06, + "loss": 2.5598, + "step": 13928 + }, + { + "epoch": 0.747263948497854, + "grad_norm": 0.474609375, + "learning_rate": 4.815228409672658e-06, + "loss": 2.4549, + "step": 13929 + }, + { + "epoch": 0.7473175965665236, + "grad_norm": 1.09375, + "learning_rate": 4.81519562983547e-06, + "loss": 1.8618, + "step": 13930 + }, + { + "epoch": 0.7473712446351931, + "grad_norm": 0.474609375, + "learning_rate": 4.815162847202444e-06, + "loss": 2.5468, + "step": 13931 + }, + { + "epoch": 0.7474248927038627, + "grad_norm": 0.431640625, + "learning_rate": 4.8151300617736184e-06, + "loss": 2.4941, + "step": 13932 + }, + { + "epoch": 0.7474785407725322, + "grad_norm": 0.65234375, + "learning_rate": 4.815097273549033e-06, + "loss": 2.2823, + "step": 13933 + }, + { + "epoch": 0.7475321888412018, + "grad_norm": 0.419921875, + "learning_rate": 4.815064482528728e-06, + "loss": 2.2091, + "step": 13934 + }, + { + "epoch": 0.7475858369098712, + "grad_norm": 0.36328125, + "learning_rate": 4.815031688712742e-06, + "loss": 2.1119, + "step": 13935 + }, + { + "epoch": 0.7476394849785408, + "grad_norm": 0.486328125, + "learning_rate": 4.814998892101116e-06, + "loss": 2.3697, + "step": 13936 + }, + { + "epoch": 0.7476931330472103, + "grad_norm": 0.423828125, + "learning_rate": 4.814966092693887e-06, + "loss": 2.3832, + "step": 13937 + }, + { + "epoch": 0.7477467811158798, + "grad_norm": 0.419921875, + "learning_rate": 4.8149332904910975e-06, + "loss": 2.3022, + "step": 13938 + }, + { + "epoch": 0.7478004291845494, + "grad_norm": 0.47265625, + "learning_rate": 4.814900485492785e-06, + "loss": 2.3411, + "step": 13939 + }, + { + "epoch": 0.7478540772532188, + "grad_norm": 0.48828125, + "learning_rate": 4.814867677698991e-06, + "loss": 2.4195, + "step": 13940 + }, + { + "epoch": 0.7479077253218884, + "grad_norm": 0.51171875, + "learning_rate": 4.814834867109753e-06, + "loss": 2.3889, + "step": 13941 + }, + { + "epoch": 0.7479613733905579, + "grad_norm": 0.5703125, + "learning_rate": 4.8148020537251125e-06, + "loss": 2.1563, + "step": 13942 + }, + { + "epoch": 0.7480150214592275, + "grad_norm": 0.56640625, + "learning_rate": 4.814769237545108e-06, + "loss": 2.4205, + "step": 13943 + }, + { + "epoch": 0.748068669527897, + "grad_norm": 0.53515625, + "learning_rate": 4.81473641856978e-06, + "loss": 2.244, + "step": 13944 + }, + { + "epoch": 0.7481223175965666, + "grad_norm": 0.5703125, + "learning_rate": 4.814703596799168e-06, + "loss": 2.1805, + "step": 13945 + }, + { + "epoch": 0.748175965665236, + "grad_norm": 0.33984375, + "learning_rate": 4.814670772233309e-06, + "loss": 2.0767, + "step": 13946 + }, + { + "epoch": 0.7482296137339056, + "grad_norm": 0.515625, + "learning_rate": 4.8146379448722475e-06, + "loss": 2.1455, + "step": 13947 + }, + { + "epoch": 0.7482832618025751, + "grad_norm": 0.4375, + "learning_rate": 4.814605114716019e-06, + "loss": 2.2332, + "step": 13948 + }, + { + "epoch": 0.7483369098712447, + "grad_norm": 0.408203125, + "learning_rate": 4.814572281764666e-06, + "loss": 2.1744, + "step": 13949 + }, + { + "epoch": 0.7483905579399142, + "grad_norm": 0.6484375, + "learning_rate": 4.814539446018226e-06, + "loss": 2.3165, + "step": 13950 + }, + { + "epoch": 0.7484442060085837, + "grad_norm": 0.421875, + "learning_rate": 4.814506607476739e-06, + "loss": 2.3046, + "step": 13951 + }, + { + "epoch": 0.7484978540772532, + "grad_norm": 0.4765625, + "learning_rate": 4.814473766140246e-06, + "loss": 2.6923, + "step": 13952 + }, + { + "epoch": 0.7485515021459227, + "grad_norm": 0.427734375, + "learning_rate": 4.8144409220087854e-06, + "loss": 2.2799, + "step": 13953 + }, + { + "epoch": 0.7486051502145923, + "grad_norm": 0.453125, + "learning_rate": 4.8144080750823974e-06, + "loss": 2.2894, + "step": 13954 + }, + { + "epoch": 0.7486587982832618, + "grad_norm": 0.8203125, + "learning_rate": 4.814375225361122e-06, + "loss": 2.4685, + "step": 13955 + }, + { + "epoch": 0.7487124463519313, + "grad_norm": 0.455078125, + "learning_rate": 4.814342372844998e-06, + "loss": 2.4229, + "step": 13956 + }, + { + "epoch": 0.7487660944206008, + "grad_norm": 0.380859375, + "learning_rate": 4.814309517534065e-06, + "loss": 2.2475, + "step": 13957 + }, + { + "epoch": 0.7488197424892704, + "grad_norm": 0.4921875, + "learning_rate": 4.814276659428363e-06, + "loss": 2.0739, + "step": 13958 + }, + { + "epoch": 0.7488733905579399, + "grad_norm": 0.47265625, + "learning_rate": 4.814243798527933e-06, + "loss": 2.147, + "step": 13959 + }, + { + "epoch": 0.7489270386266095, + "grad_norm": 0.416015625, + "learning_rate": 4.814210934832813e-06, + "loss": 2.2096, + "step": 13960 + }, + { + "epoch": 0.748980686695279, + "grad_norm": 0.5390625, + "learning_rate": 4.8141780683430425e-06, + "loss": 2.4858, + "step": 13961 + }, + { + "epoch": 0.7490343347639485, + "grad_norm": 0.69921875, + "learning_rate": 4.814145199058663e-06, + "loss": 1.3965, + "step": 13962 + }, + { + "epoch": 0.749087982832618, + "grad_norm": 0.376953125, + "learning_rate": 4.8141123269797124e-06, + "loss": 2.2045, + "step": 13963 + }, + { + "epoch": 0.7491416309012876, + "grad_norm": 0.490234375, + "learning_rate": 4.814079452106232e-06, + "loss": 2.2734, + "step": 13964 + }, + { + "epoch": 0.7491952789699571, + "grad_norm": 0.49609375, + "learning_rate": 4.814046574438259e-06, + "loss": 2.4053, + "step": 13965 + }, + { + "epoch": 0.7492489270386266, + "grad_norm": 0.427734375, + "learning_rate": 4.814013693975836e-06, + "loss": 2.647, + "step": 13966 + }, + { + "epoch": 0.7493025751072961, + "grad_norm": 0.388671875, + "learning_rate": 4.813980810719001e-06, + "loss": 1.8591, + "step": 13967 + }, + { + "epoch": 0.7493562231759656, + "grad_norm": 0.4375, + "learning_rate": 4.8139479246677935e-06, + "loss": 2.0673, + "step": 13968 + }, + { + "epoch": 0.7494098712446352, + "grad_norm": 0.443359375, + "learning_rate": 4.813915035822254e-06, + "loss": 1.8195, + "step": 13969 + }, + { + "epoch": 0.7494635193133047, + "grad_norm": 0.703125, + "learning_rate": 4.813882144182423e-06, + "loss": 2.3939, + "step": 13970 + }, + { + "epoch": 0.7495171673819743, + "grad_norm": 0.453125, + "learning_rate": 4.813849249748338e-06, + "loss": 2.3576, + "step": 13971 + }, + { + "epoch": 0.7495708154506437, + "grad_norm": 0.45703125, + "learning_rate": 4.813816352520041e-06, + "loss": 2.1827, + "step": 13972 + }, + { + "epoch": 0.7496244635193133, + "grad_norm": 0.62109375, + "learning_rate": 4.81378345249757e-06, + "loss": 2.4203, + "step": 13973 + }, + { + "epoch": 0.7496781115879828, + "grad_norm": 0.51171875, + "learning_rate": 4.813750549680966e-06, + "loss": 2.3616, + "step": 13974 + }, + { + "epoch": 0.7497317596566524, + "grad_norm": 0.4765625, + "learning_rate": 4.813717644070268e-06, + "loss": 2.12, + "step": 13975 + }, + { + "epoch": 0.7497854077253219, + "grad_norm": 0.50390625, + "learning_rate": 4.813684735665516e-06, + "loss": 2.2397, + "step": 13976 + }, + { + "epoch": 0.7498390557939915, + "grad_norm": 0.5390625, + "learning_rate": 4.81365182446675e-06, + "loss": 2.1905, + "step": 13977 + }, + { + "epoch": 0.7498927038626609, + "grad_norm": 0.357421875, + "learning_rate": 4.813618910474008e-06, + "loss": 2.0472, + "step": 13978 + }, + { + "epoch": 0.7499463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.813585993687332e-06, + "loss": 2.4517, + "step": 13979 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 4.813553074106761e-06, + "loss": 2.4689, + "step": 13980 + }, + { + "epoch": 0.7500536480686695, + "grad_norm": 0.4375, + "learning_rate": 4.813520151732334e-06, + "loss": 2.3643, + "step": 13981 + }, + { + "epoch": 0.7501072961373391, + "grad_norm": 0.4609375, + "learning_rate": 4.8134872265640915e-06, + "loss": 2.3755, + "step": 13982 + }, + { + "epoch": 0.7501609442060085, + "grad_norm": 0.431640625, + "learning_rate": 4.813454298602074e-06, + "loss": 2.4076, + "step": 13983 + }, + { + "epoch": 0.7502145922746781, + "grad_norm": 0.41796875, + "learning_rate": 4.81342136784632e-06, + "loss": 2.3618, + "step": 13984 + }, + { + "epoch": 0.7502682403433476, + "grad_norm": 0.447265625, + "learning_rate": 4.8133884342968695e-06, + "loss": 2.3422, + "step": 13985 + }, + { + "epoch": 0.7503218884120172, + "grad_norm": 0.49609375, + "learning_rate": 4.8133554979537635e-06, + "loss": 2.556, + "step": 13986 + }, + { + "epoch": 0.7503755364806867, + "grad_norm": 0.49609375, + "learning_rate": 4.8133225588170395e-06, + "loss": 2.2921, + "step": 13987 + }, + { + "epoch": 0.7504291845493563, + "grad_norm": 0.49609375, + "learning_rate": 4.813289616886738e-06, + "loss": 1.9726, + "step": 13988 + }, + { + "epoch": 0.7504828326180257, + "grad_norm": 0.43359375, + "learning_rate": 4.8132566721629e-06, + "loss": 2.2608, + "step": 13989 + }, + { + "epoch": 0.7505364806866953, + "grad_norm": 0.458984375, + "learning_rate": 4.813223724645565e-06, + "loss": 2.3391, + "step": 13990 + }, + { + "epoch": 0.7505901287553648, + "grad_norm": 0.45703125, + "learning_rate": 4.8131907743347725e-06, + "loss": 2.2286, + "step": 13991 + }, + { + "epoch": 0.7506437768240344, + "grad_norm": 0.4765625, + "learning_rate": 4.813157821230562e-06, + "loss": 2.267, + "step": 13992 + }, + { + "epoch": 0.7506974248927039, + "grad_norm": 2.71875, + "learning_rate": 4.813124865332973e-06, + "loss": 2.3886, + "step": 13993 + }, + { + "epoch": 0.7507510729613734, + "grad_norm": 0.4921875, + "learning_rate": 4.813091906642046e-06, + "loss": 2.3617, + "step": 13994 + }, + { + "epoch": 0.7508047210300429, + "grad_norm": 0.42578125, + "learning_rate": 4.81305894515782e-06, + "loss": 2.1266, + "step": 13995 + }, + { + "epoch": 0.7508583690987124, + "grad_norm": 0.44921875, + "learning_rate": 4.813025980880336e-06, + "loss": 2.4849, + "step": 13996 + }, + { + "epoch": 0.750912017167382, + "grad_norm": 0.458984375, + "learning_rate": 4.8129930138096325e-06, + "loss": 2.3139, + "step": 13997 + }, + { + "epoch": 0.7509656652360515, + "grad_norm": 0.482421875, + "learning_rate": 4.812960043945751e-06, + "loss": 2.1897, + "step": 13998 + }, + { + "epoch": 0.751019313304721, + "grad_norm": 0.412109375, + "learning_rate": 4.8129270712887296e-06, + "loss": 2.1273, + "step": 13999 + }, + { + "epoch": 0.7510729613733905, + "grad_norm": 0.47265625, + "learning_rate": 4.812894095838609e-06, + "loss": 2.2879, + "step": 14000 + }, + { + "epoch": 0.7511266094420601, + "grad_norm": 0.41796875, + "learning_rate": 4.812861117595429e-06, + "loss": 2.3241, + "step": 14001 + }, + { + "epoch": 0.7511802575107296, + "grad_norm": 0.408203125, + "learning_rate": 4.812828136559228e-06, + "loss": 2.1914, + "step": 14002 + }, + { + "epoch": 0.7512339055793992, + "grad_norm": 0.42578125, + "learning_rate": 4.8127951527300485e-06, + "loss": 2.1414, + "step": 14003 + }, + { + "epoch": 0.7512875536480687, + "grad_norm": 0.419921875, + "learning_rate": 4.812762166107928e-06, + "loss": 2.154, + "step": 14004 + }, + { + "epoch": 0.7513412017167382, + "grad_norm": 0.44921875, + "learning_rate": 4.812729176692908e-06, + "loss": 2.2789, + "step": 14005 + }, + { + "epoch": 0.7513948497854077, + "grad_norm": 0.447265625, + "learning_rate": 4.812696184485027e-06, + "loss": 2.5194, + "step": 14006 + }, + { + "epoch": 0.7514484978540773, + "grad_norm": 0.482421875, + "learning_rate": 4.8126631894843265e-06, + "loss": 2.3426, + "step": 14007 + }, + { + "epoch": 0.7515021459227468, + "grad_norm": 0.5078125, + "learning_rate": 4.812630191690844e-06, + "loss": 2.2453, + "step": 14008 + }, + { + "epoch": 0.7515557939914163, + "grad_norm": 0.4609375, + "learning_rate": 4.812597191104621e-06, + "loss": 2.358, + "step": 14009 + }, + { + "epoch": 0.7516094420600858, + "grad_norm": 0.43359375, + "learning_rate": 4.812564187725696e-06, + "loss": 2.1281, + "step": 14010 + }, + { + "epoch": 0.7516630901287553, + "grad_norm": 0.462890625, + "learning_rate": 4.8125311815541124e-06, + "loss": 2.3356, + "step": 14011 + }, + { + "epoch": 0.7517167381974249, + "grad_norm": 0.44921875, + "learning_rate": 4.812498172589906e-06, + "loss": 2.5792, + "step": 14012 + }, + { + "epoch": 0.7517703862660944, + "grad_norm": 0.490234375, + "learning_rate": 4.812465160833118e-06, + "loss": 2.2613, + "step": 14013 + }, + { + "epoch": 0.751824034334764, + "grad_norm": 0.458984375, + "learning_rate": 4.8124321462837884e-06, + "loss": 2.3402, + "step": 14014 + }, + { + "epoch": 0.7518776824034334, + "grad_norm": 0.392578125, + "learning_rate": 4.812399128941957e-06, + "loss": 2.2156, + "step": 14015 + }, + { + "epoch": 0.751931330472103, + "grad_norm": 0.484375, + "learning_rate": 4.812366108807663e-06, + "loss": 2.4178, + "step": 14016 + }, + { + "epoch": 0.7519849785407725, + "grad_norm": 0.51171875, + "learning_rate": 4.812333085880948e-06, + "loss": 2.385, + "step": 14017 + }, + { + "epoch": 0.7520386266094421, + "grad_norm": 0.455078125, + "learning_rate": 4.812300060161852e-06, + "loss": 2.2388, + "step": 14018 + }, + { + "epoch": 0.7520922746781116, + "grad_norm": 0.44140625, + "learning_rate": 4.812267031650412e-06, + "loss": 2.2034, + "step": 14019 + }, + { + "epoch": 0.7521459227467812, + "grad_norm": 0.4375, + "learning_rate": 4.81223400034667e-06, + "loss": 2.392, + "step": 14020 + }, + { + "epoch": 0.7521995708154506, + "grad_norm": 0.466796875, + "learning_rate": 4.8122009662506665e-06, + "loss": 2.2397, + "step": 14021 + }, + { + "epoch": 0.7522532188841202, + "grad_norm": 0.453125, + "learning_rate": 4.81216792936244e-06, + "loss": 2.3948, + "step": 14022 + }, + { + "epoch": 0.7523068669527897, + "grad_norm": 0.57421875, + "learning_rate": 4.81213488968203e-06, + "loss": 2.0378, + "step": 14023 + }, + { + "epoch": 0.7523605150214592, + "grad_norm": 0.431640625, + "learning_rate": 4.8121018472094785e-06, + "loss": 2.1458, + "step": 14024 + }, + { + "epoch": 0.7524141630901288, + "grad_norm": 0.458984375, + "learning_rate": 4.812068801944823e-06, + "loss": 2.4432, + "step": 14025 + }, + { + "epoch": 0.7524678111587982, + "grad_norm": 0.5078125, + "learning_rate": 4.812035753888105e-06, + "loss": 2.2452, + "step": 14026 + }, + { + "epoch": 0.7525214592274678, + "grad_norm": 0.451171875, + "learning_rate": 4.812002703039364e-06, + "loss": 2.6156, + "step": 14027 + }, + { + "epoch": 0.7525751072961373, + "grad_norm": 0.486328125, + "learning_rate": 4.81196964939864e-06, + "loss": 2.3919, + "step": 14028 + }, + { + "epoch": 0.7526287553648069, + "grad_norm": 0.474609375, + "learning_rate": 4.811936592965973e-06, + "loss": 2.2183, + "step": 14029 + }, + { + "epoch": 0.7526824034334764, + "grad_norm": 0.498046875, + "learning_rate": 4.8119035337414025e-06, + "loss": 2.2745, + "step": 14030 + }, + { + "epoch": 0.752736051502146, + "grad_norm": 0.419921875, + "learning_rate": 4.811870471724968e-06, + "loss": 2.3133, + "step": 14031 + }, + { + "epoch": 0.7527896995708154, + "grad_norm": 0.419921875, + "learning_rate": 4.8118374069167105e-06, + "loss": 2.3495, + "step": 14032 + }, + { + "epoch": 0.752843347639485, + "grad_norm": 0.447265625, + "learning_rate": 4.81180433931667e-06, + "loss": 2.4492, + "step": 14033 + }, + { + "epoch": 0.7528969957081545, + "grad_norm": 0.470703125, + "learning_rate": 4.811771268924885e-06, + "loss": 2.1065, + "step": 14034 + }, + { + "epoch": 0.7529506437768241, + "grad_norm": 0.55859375, + "learning_rate": 4.811738195741397e-06, + "loss": 2.262, + "step": 14035 + }, + { + "epoch": 0.7530042918454936, + "grad_norm": 0.4921875, + "learning_rate": 4.811705119766245e-06, + "loss": 2.213, + "step": 14036 + }, + { + "epoch": 0.7530579399141631, + "grad_norm": 0.376953125, + "learning_rate": 4.8116720409994695e-06, + "loss": 2.161, + "step": 14037 + }, + { + "epoch": 0.7531115879828326, + "grad_norm": 0.43359375, + "learning_rate": 4.8116389594411096e-06, + "loss": 2.0204, + "step": 14038 + }, + { + "epoch": 0.7531652360515021, + "grad_norm": 0.61328125, + "learning_rate": 4.811605875091207e-06, + "loss": 2.4405, + "step": 14039 + }, + { + "epoch": 0.7532188841201717, + "grad_norm": 0.46484375, + "learning_rate": 4.8115727879497995e-06, + "loss": 2.4423, + "step": 14040 + }, + { + "epoch": 0.7532725321888412, + "grad_norm": 0.4296875, + "learning_rate": 4.811539698016928e-06, + "loss": 2.1276, + "step": 14041 + }, + { + "epoch": 0.7533261802575107, + "grad_norm": 0.5078125, + "learning_rate": 4.811506605292633e-06, + "loss": 2.5342, + "step": 14042 + }, + { + "epoch": 0.7533798283261802, + "grad_norm": 0.443359375, + "learning_rate": 4.811473509776954e-06, + "loss": 2.3215, + "step": 14043 + }, + { + "epoch": 0.7534334763948498, + "grad_norm": 0.435546875, + "learning_rate": 4.81144041146993e-06, + "loss": 2.2913, + "step": 14044 + }, + { + "epoch": 0.7534871244635193, + "grad_norm": 0.421875, + "learning_rate": 4.811407310371603e-06, + "loss": 2.3728, + "step": 14045 + }, + { + "epoch": 0.7535407725321889, + "grad_norm": 0.46484375, + "learning_rate": 4.811374206482011e-06, + "loss": 2.2066, + "step": 14046 + }, + { + "epoch": 0.7535944206008584, + "grad_norm": 0.482421875, + "learning_rate": 4.811341099801196e-06, + "loss": 2.3001, + "step": 14047 + }, + { + "epoch": 0.7536480686695279, + "grad_norm": 0.44921875, + "learning_rate": 4.8113079903291955e-06, + "loss": 2.1416, + "step": 14048 + }, + { + "epoch": 0.7537017167381974, + "grad_norm": 0.416015625, + "learning_rate": 4.811274878066051e-06, + "loss": 2.3438, + "step": 14049 + }, + { + "epoch": 0.753755364806867, + "grad_norm": 0.4609375, + "learning_rate": 4.811241763011803e-06, + "loss": 2.394, + "step": 14050 + }, + { + "epoch": 0.7538090128755365, + "grad_norm": 0.51171875, + "learning_rate": 4.811208645166491e-06, + "loss": 2.6688, + "step": 14051 + }, + { + "epoch": 0.753862660944206, + "grad_norm": 0.63671875, + "learning_rate": 4.811175524530154e-06, + "loss": 2.2813, + "step": 14052 + }, + { + "epoch": 0.7539163090128755, + "grad_norm": 0.8671875, + "learning_rate": 4.811142401102833e-06, + "loss": 2.3258, + "step": 14053 + }, + { + "epoch": 0.753969957081545, + "grad_norm": 0.431640625, + "learning_rate": 4.8111092748845675e-06, + "loss": 2.1272, + "step": 14054 + }, + { + "epoch": 0.7540236051502146, + "grad_norm": 0.40625, + "learning_rate": 4.811076145875398e-06, + "loss": 2.1293, + "step": 14055 + }, + { + "epoch": 0.7540772532188841, + "grad_norm": 0.4140625, + "learning_rate": 4.811043014075365e-06, + "loss": 1.9424, + "step": 14056 + }, + { + "epoch": 0.7541309012875537, + "grad_norm": 0.6640625, + "learning_rate": 4.811009879484506e-06, + "loss": 1.1301, + "step": 14057 + }, + { + "epoch": 0.7541845493562231, + "grad_norm": 0.439453125, + "learning_rate": 4.810976742102864e-06, + "loss": 2.2612, + "step": 14058 + }, + { + "epoch": 0.7542381974248927, + "grad_norm": 0.52734375, + "learning_rate": 4.810943601930479e-06, + "loss": 2.476, + "step": 14059 + }, + { + "epoch": 0.7542918454935622, + "grad_norm": 0.427734375, + "learning_rate": 4.810910458967388e-06, + "loss": 2.0257, + "step": 14060 + }, + { + "epoch": 0.7543454935622318, + "grad_norm": 0.69140625, + "learning_rate": 4.810877313213634e-06, + "loss": 2.279, + "step": 14061 + }, + { + "epoch": 0.7543991416309013, + "grad_norm": 0.42578125, + "learning_rate": 4.8108441646692546e-06, + "loss": 2.5204, + "step": 14062 + }, + { + "epoch": 0.7544527896995709, + "grad_norm": 0.53515625, + "learning_rate": 4.810811013334292e-06, + "loss": 2.2321, + "step": 14063 + }, + { + "epoch": 0.7545064377682403, + "grad_norm": 0.4296875, + "learning_rate": 4.810777859208786e-06, + "loss": 2.4926, + "step": 14064 + }, + { + "epoch": 0.7545600858369099, + "grad_norm": 0.376953125, + "learning_rate": 4.810744702292775e-06, + "loss": 1.9745, + "step": 14065 + }, + { + "epoch": 0.7546137339055794, + "grad_norm": 0.5234375, + "learning_rate": 4.8107115425863e-06, + "loss": 2.2391, + "step": 14066 + }, + { + "epoch": 0.7546673819742489, + "grad_norm": 0.466796875, + "learning_rate": 4.810678380089402e-06, + "loss": 2.4559, + "step": 14067 + }, + { + "epoch": 0.7547210300429185, + "grad_norm": 0.486328125, + "learning_rate": 4.81064521480212e-06, + "loss": 2.3848, + "step": 14068 + }, + { + "epoch": 0.7547746781115879, + "grad_norm": 0.51953125, + "learning_rate": 4.8106120467244936e-06, + "loss": 2.4061, + "step": 14069 + }, + { + "epoch": 0.7548283261802575, + "grad_norm": 0.4296875, + "learning_rate": 4.810578875856563e-06, + "loss": 2.2843, + "step": 14070 + }, + { + "epoch": 0.754881974248927, + "grad_norm": 0.37109375, + "learning_rate": 4.810545702198369e-06, + "loss": 2.1866, + "step": 14071 + }, + { + "epoch": 0.7549356223175966, + "grad_norm": 0.50390625, + "learning_rate": 4.8105125257499515e-06, + "loss": 2.4117, + "step": 14072 + }, + { + "epoch": 0.7549892703862661, + "grad_norm": 0.482421875, + "learning_rate": 4.810479346511351e-06, + "loss": 2.1396, + "step": 14073 + }, + { + "epoch": 0.7550429184549357, + "grad_norm": 0.4140625, + "learning_rate": 4.810446164482607e-06, + "loss": 2.3628, + "step": 14074 + }, + { + "epoch": 0.7550965665236051, + "grad_norm": 0.421875, + "learning_rate": 4.810412979663759e-06, + "loss": 2.152, + "step": 14075 + }, + { + "epoch": 0.7551502145922747, + "grad_norm": 0.404296875, + "learning_rate": 4.810379792054847e-06, + "loss": 2.3851, + "step": 14076 + }, + { + "epoch": 0.7552038626609442, + "grad_norm": 0.61328125, + "learning_rate": 4.810346601655913e-06, + "loss": 2.0904, + "step": 14077 + }, + { + "epoch": 0.7552575107296138, + "grad_norm": 0.400390625, + "learning_rate": 4.810313408466995e-06, + "loss": 2.3592, + "step": 14078 + }, + { + "epoch": 0.7553111587982833, + "grad_norm": 0.4453125, + "learning_rate": 4.810280212488134e-06, + "loss": 2.16, + "step": 14079 + }, + { + "epoch": 0.7553648068669528, + "grad_norm": 0.5546875, + "learning_rate": 4.81024701371937e-06, + "loss": 2.4642, + "step": 14080 + }, + { + "epoch": 0.7554184549356223, + "grad_norm": 0.37890625, + "learning_rate": 4.810213812160743e-06, + "loss": 2.202, + "step": 14081 + }, + { + "epoch": 0.7554721030042918, + "grad_norm": 0.4765625, + "learning_rate": 4.810180607812293e-06, + "loss": 2.3982, + "step": 14082 + }, + { + "epoch": 0.7555257510729614, + "grad_norm": 0.58203125, + "learning_rate": 4.81014740067406e-06, + "loss": 2.2093, + "step": 14083 + }, + { + "epoch": 0.7555793991416309, + "grad_norm": 0.490234375, + "learning_rate": 4.810114190746086e-06, + "loss": 2.2817, + "step": 14084 + }, + { + "epoch": 0.7556330472103004, + "grad_norm": 0.50390625, + "learning_rate": 4.810080978028407e-06, + "loss": 2.4462, + "step": 14085 + }, + { + "epoch": 0.7556866952789699, + "grad_norm": 0.412109375, + "learning_rate": 4.8100477625210675e-06, + "loss": 2.3183, + "step": 14086 + }, + { + "epoch": 0.7557403433476395, + "grad_norm": 0.5859375, + "learning_rate": 4.8100145442241045e-06, + "loss": 2.1836, + "step": 14087 + }, + { + "epoch": 0.755793991416309, + "grad_norm": 0.484375, + "learning_rate": 4.8099813231375605e-06, + "loss": 2.3118, + "step": 14088 + }, + { + "epoch": 0.7558476394849786, + "grad_norm": 0.453125, + "learning_rate": 4.8099480992614736e-06, + "loss": 2.3802, + "step": 14089 + }, + { + "epoch": 0.755901287553648, + "grad_norm": 0.5, + "learning_rate": 4.809914872595884e-06, + "loss": 2.3978, + "step": 14090 + }, + { + "epoch": 0.7559549356223176, + "grad_norm": 0.474609375, + "learning_rate": 4.8098816431408335e-06, + "loss": 2.2524, + "step": 14091 + }, + { + "epoch": 0.7560085836909871, + "grad_norm": 0.498046875, + "learning_rate": 4.809848410896361e-06, + "loss": 2.3304, + "step": 14092 + }, + { + "epoch": 0.7560622317596567, + "grad_norm": 0.94921875, + "learning_rate": 4.809815175862506e-06, + "loss": 2.2848, + "step": 14093 + }, + { + "epoch": 0.7561158798283262, + "grad_norm": 0.64453125, + "learning_rate": 4.809781938039312e-06, + "loss": 2.3018, + "step": 14094 + }, + { + "epoch": 0.7561695278969958, + "grad_norm": 0.3828125, + "learning_rate": 4.809748697426815e-06, + "loss": 1.9415, + "step": 14095 + }, + { + "epoch": 0.7562231759656652, + "grad_norm": 0.48046875, + "learning_rate": 4.809715454025057e-06, + "loss": 2.3536, + "step": 14096 + }, + { + "epoch": 0.7562768240343347, + "grad_norm": 0.453125, + "learning_rate": 4.809682207834078e-06, + "loss": 2.3269, + "step": 14097 + }, + { + "epoch": 0.7563304721030043, + "grad_norm": 1.7578125, + "learning_rate": 4.809648958853917e-06, + "loss": 2.3025, + "step": 14098 + }, + { + "epoch": 0.7563841201716738, + "grad_norm": 0.453125, + "learning_rate": 4.809615707084617e-06, + "loss": 2.2556, + "step": 14099 + }, + { + "epoch": 0.7564377682403434, + "grad_norm": 0.62890625, + "learning_rate": 4.809582452526216e-06, + "loss": 2.2544, + "step": 14100 + }, + { + "epoch": 0.7564914163090128, + "grad_norm": 0.63671875, + "learning_rate": 4.8095491951787546e-06, + "loss": 2.3006, + "step": 14101 + }, + { + "epoch": 0.7565450643776824, + "grad_norm": 0.375, + "learning_rate": 4.809515935042272e-06, + "loss": 2.0508, + "step": 14102 + }, + { + "epoch": 0.7565987124463519, + "grad_norm": 0.41796875, + "learning_rate": 4.809482672116809e-06, + "loss": 2.2786, + "step": 14103 + }, + { + "epoch": 0.7566523605150215, + "grad_norm": 0.5703125, + "learning_rate": 4.8094494064024075e-06, + "loss": 2.2618, + "step": 14104 + }, + { + "epoch": 0.756706008583691, + "grad_norm": 0.52734375, + "learning_rate": 4.8094161378991065e-06, + "loss": 2.3999, + "step": 14105 + }, + { + "epoch": 0.7567596566523606, + "grad_norm": 2.078125, + "learning_rate": 4.809382866606945e-06, + "loss": 2.3501, + "step": 14106 + }, + { + "epoch": 0.75681330472103, + "grad_norm": 0.466796875, + "learning_rate": 4.8093495925259644e-06, + "loss": 2.1464, + "step": 14107 + }, + { + "epoch": 0.7568669527896996, + "grad_norm": 0.498046875, + "learning_rate": 4.809316315656204e-06, + "loss": 2.0855, + "step": 14108 + }, + { + "epoch": 0.7569206008583691, + "grad_norm": 0.4453125, + "learning_rate": 4.809283035997705e-06, + "loss": 2.2234, + "step": 14109 + }, + { + "epoch": 0.7569742489270386, + "grad_norm": 0.53515625, + "learning_rate": 4.809249753550508e-06, + "loss": 2.2656, + "step": 14110 + }, + { + "epoch": 0.7570278969957082, + "grad_norm": 0.416015625, + "learning_rate": 4.809216468314651e-06, + "loss": 2.2991, + "step": 14111 + }, + { + "epoch": 0.7570815450643776, + "grad_norm": 0.71875, + "learning_rate": 4.8091831802901766e-06, + "loss": 2.2017, + "step": 14112 + }, + { + "epoch": 0.7571351931330472, + "grad_norm": 0.5078125, + "learning_rate": 4.809149889477123e-06, + "loss": 2.3404, + "step": 14113 + }, + { + "epoch": 0.7571888412017167, + "grad_norm": 0.47265625, + "learning_rate": 4.809116595875532e-06, + "loss": 2.3371, + "step": 14114 + }, + { + "epoch": 0.7572424892703863, + "grad_norm": 0.5234375, + "learning_rate": 4.809083299485442e-06, + "loss": 2.3005, + "step": 14115 + }, + { + "epoch": 0.7572961373390558, + "grad_norm": 0.4453125, + "learning_rate": 4.8090500003068954e-06, + "loss": 2.373, + "step": 14116 + }, + { + "epoch": 0.7573497854077254, + "grad_norm": 0.462890625, + "learning_rate": 4.8090166983399315e-06, + "loss": 2.1256, + "step": 14117 + }, + { + "epoch": 0.7574034334763948, + "grad_norm": 0.40234375, + "learning_rate": 4.808983393584589e-06, + "loss": 2.2335, + "step": 14118 + }, + { + "epoch": 0.7574570815450644, + "grad_norm": 0.66015625, + "learning_rate": 4.808950086040911e-06, + "loss": 2.6147, + "step": 14119 + }, + { + "epoch": 0.7575107296137339, + "grad_norm": 0.462890625, + "learning_rate": 4.808916775708936e-06, + "loss": 2.0911, + "step": 14120 + }, + { + "epoch": 0.7575643776824035, + "grad_norm": 0.453125, + "learning_rate": 4.808883462588705e-06, + "loss": 2.6301, + "step": 14121 + }, + { + "epoch": 0.757618025751073, + "grad_norm": 0.37890625, + "learning_rate": 4.808850146680256e-06, + "loss": 2.4974, + "step": 14122 + }, + { + "epoch": 0.7576716738197425, + "grad_norm": 0.443359375, + "learning_rate": 4.808816827983632e-06, + "loss": 2.5323, + "step": 14123 + }, + { + "epoch": 0.757725321888412, + "grad_norm": 0.5, + "learning_rate": 4.8087835064988725e-06, + "loss": 2.2834, + "step": 14124 + }, + { + "epoch": 0.7577789699570815, + "grad_norm": 0.458984375, + "learning_rate": 4.808750182226017e-06, + "loss": 1.8735, + "step": 14125 + }, + { + "epoch": 0.7578326180257511, + "grad_norm": 0.4375, + "learning_rate": 4.808716855165105e-06, + "loss": 2.2493, + "step": 14126 + }, + { + "epoch": 0.7578862660944206, + "grad_norm": 0.474609375, + "learning_rate": 4.808683525316179e-06, + "loss": 2.3412, + "step": 14127 + }, + { + "epoch": 0.7579399141630901, + "grad_norm": 0.42578125, + "learning_rate": 4.808650192679278e-06, + "loss": 2.334, + "step": 14128 + }, + { + "epoch": 0.7579935622317596, + "grad_norm": 0.482421875, + "learning_rate": 4.808616857254442e-06, + "loss": 2.3732, + "step": 14129 + }, + { + "epoch": 0.7580472103004292, + "grad_norm": 0.5625, + "learning_rate": 4.808583519041713e-06, + "loss": 2.5481, + "step": 14130 + }, + { + "epoch": 0.7581008583690987, + "grad_norm": 0.447265625, + "learning_rate": 4.808550178041129e-06, + "loss": 2.0894, + "step": 14131 + }, + { + "epoch": 0.7581545064377683, + "grad_norm": 0.484375, + "learning_rate": 4.808516834252731e-06, + "loss": 2.4123, + "step": 14132 + }, + { + "epoch": 0.7582081545064377, + "grad_norm": 0.443359375, + "learning_rate": 4.80848348767656e-06, + "loss": 2.2109, + "step": 14133 + }, + { + "epoch": 0.7582618025751073, + "grad_norm": 0.41015625, + "learning_rate": 4.808450138312655e-06, + "loss": 2.4419, + "step": 14134 + }, + { + "epoch": 0.7583154506437768, + "grad_norm": 2.046875, + "learning_rate": 4.808416786161057e-06, + "loss": 2.2633, + "step": 14135 + }, + { + "epoch": 0.7583690987124464, + "grad_norm": 0.42578125, + "learning_rate": 4.808383431221806e-06, + "loss": 2.2984, + "step": 14136 + }, + { + "epoch": 0.7584227467811159, + "grad_norm": 0.44140625, + "learning_rate": 4.8083500734949426e-06, + "loss": 2.1032, + "step": 14137 + }, + { + "epoch": 0.7584763948497855, + "grad_norm": 0.4375, + "learning_rate": 4.808316712980507e-06, + "loss": 2.2689, + "step": 14138 + }, + { + "epoch": 0.7585300429184549, + "grad_norm": 0.396484375, + "learning_rate": 4.80828334967854e-06, + "loss": 2.039, + "step": 14139 + }, + { + "epoch": 0.7585836909871244, + "grad_norm": 0.5234375, + "learning_rate": 4.8082499835890815e-06, + "loss": 2.314, + "step": 14140 + }, + { + "epoch": 0.758637339055794, + "grad_norm": 0.462890625, + "learning_rate": 4.808216614712171e-06, + "loss": 2.3095, + "step": 14141 + }, + { + "epoch": 0.7586909871244635, + "grad_norm": 0.482421875, + "learning_rate": 4.80818324304785e-06, + "loss": 2.5406, + "step": 14142 + }, + { + "epoch": 0.7587446351931331, + "grad_norm": 0.484375, + "learning_rate": 4.808149868596158e-06, + "loss": 2.3516, + "step": 14143 + }, + { + "epoch": 0.7587982832618025, + "grad_norm": 0.484375, + "learning_rate": 4.808116491357136e-06, + "loss": 2.4397, + "step": 14144 + }, + { + "epoch": 0.7588519313304721, + "grad_norm": 0.494140625, + "learning_rate": 4.808083111330823e-06, + "loss": 2.3361, + "step": 14145 + }, + { + "epoch": 0.7589055793991416, + "grad_norm": 0.423828125, + "learning_rate": 4.8080497285172605e-06, + "loss": 2.4435, + "step": 14146 + }, + { + "epoch": 0.7589592274678112, + "grad_norm": 0.49609375, + "learning_rate": 4.8080163429164885e-06, + "loss": 2.4039, + "step": 14147 + }, + { + "epoch": 0.7590128755364807, + "grad_norm": 0.4765625, + "learning_rate": 4.807982954528548e-06, + "loss": 2.2738, + "step": 14148 + }, + { + "epoch": 0.7590665236051503, + "grad_norm": 0.46875, + "learning_rate": 4.8079495633534775e-06, + "loss": 2.0218, + "step": 14149 + }, + { + "epoch": 0.7591201716738197, + "grad_norm": 0.447265625, + "learning_rate": 4.80791616939132e-06, + "loss": 2.3487, + "step": 14150 + }, + { + "epoch": 0.7591738197424893, + "grad_norm": 0.94140625, + "learning_rate": 4.807882772642113e-06, + "loss": 1.9233, + "step": 14151 + }, + { + "epoch": 0.7592274678111588, + "grad_norm": 0.4375, + "learning_rate": 4.807849373105898e-06, + "loss": 2.1726, + "step": 14152 + }, + { + "epoch": 0.7592811158798283, + "grad_norm": 0.447265625, + "learning_rate": 4.807815970782717e-06, + "loss": 2.065, + "step": 14153 + }, + { + "epoch": 0.7593347639484979, + "grad_norm": 0.46484375, + "learning_rate": 4.807782565672607e-06, + "loss": 2.3174, + "step": 14154 + }, + { + "epoch": 0.7593884120171673, + "grad_norm": 0.4609375, + "learning_rate": 4.807749157775611e-06, + "loss": 2.1846, + "step": 14155 + }, + { + "epoch": 0.7594420600858369, + "grad_norm": 0.5703125, + "learning_rate": 4.807715747091768e-06, + "loss": 2.2724, + "step": 14156 + }, + { + "epoch": 0.7594957081545064, + "grad_norm": 0.490234375, + "learning_rate": 4.807682333621119e-06, + "loss": 2.1739, + "step": 14157 + }, + { + "epoch": 0.759549356223176, + "grad_norm": 0.431640625, + "learning_rate": 4.807648917363704e-06, + "loss": 2.1707, + "step": 14158 + }, + { + "epoch": 0.7596030042918455, + "grad_norm": 1.21875, + "learning_rate": 4.807615498319565e-06, + "loss": 1.9428, + "step": 14159 + }, + { + "epoch": 0.759656652360515, + "grad_norm": 0.5390625, + "learning_rate": 4.807582076488739e-06, + "loss": 2.3239, + "step": 14160 + }, + { + "epoch": 0.7597103004291845, + "grad_norm": 0.44921875, + "learning_rate": 4.807548651871269e-06, + "loss": 2.2713, + "step": 14161 + }, + { + "epoch": 0.7597639484978541, + "grad_norm": 0.435546875, + "learning_rate": 4.807515224467194e-06, + "loss": 2.2577, + "step": 14162 + }, + { + "epoch": 0.7598175965665236, + "grad_norm": 0.390625, + "learning_rate": 4.807481794276555e-06, + "loss": 2.1032, + "step": 14163 + }, + { + "epoch": 0.7598712446351932, + "grad_norm": 0.7265625, + "learning_rate": 4.8074483612993934e-06, + "loss": 2.216, + "step": 14164 + }, + { + "epoch": 0.7599248927038627, + "grad_norm": 0.400390625, + "learning_rate": 4.807414925535748e-06, + "loss": 1.8083, + "step": 14165 + }, + { + "epoch": 0.7599785407725322, + "grad_norm": 0.388671875, + "learning_rate": 4.807381486985659e-06, + "loss": 2.2407, + "step": 14166 + }, + { + "epoch": 0.7600321888412017, + "grad_norm": 0.46484375, + "learning_rate": 4.807348045649168e-06, + "loss": 2.3244, + "step": 14167 + }, + { + "epoch": 0.7600858369098712, + "grad_norm": 0.447265625, + "learning_rate": 4.807314601526315e-06, + "loss": 2.5791, + "step": 14168 + }, + { + "epoch": 0.7601394849785408, + "grad_norm": 0.5, + "learning_rate": 4.80728115461714e-06, + "loss": 2.245, + "step": 14169 + }, + { + "epoch": 0.7601931330472103, + "grad_norm": 0.494140625, + "learning_rate": 4.807247704921685e-06, + "loss": 2.5717, + "step": 14170 + }, + { + "epoch": 0.7602467811158798, + "grad_norm": 0.6953125, + "learning_rate": 4.807214252439988e-06, + "loss": 2.4851, + "step": 14171 + }, + { + "epoch": 0.7603004291845493, + "grad_norm": 0.4296875, + "learning_rate": 4.80718079717209e-06, + "loss": 2.3482, + "step": 14172 + }, + { + "epoch": 0.7603540772532189, + "grad_norm": 0.404296875, + "learning_rate": 4.8071473391180315e-06, + "loss": 1.9565, + "step": 14173 + }, + { + "epoch": 0.7604077253218884, + "grad_norm": 0.44140625, + "learning_rate": 4.807113878277855e-06, + "loss": 2.2972, + "step": 14174 + }, + { + "epoch": 0.760461373390558, + "grad_norm": 0.50390625, + "learning_rate": 4.807080414651597e-06, + "loss": 1.938, + "step": 14175 + }, + { + "epoch": 0.7605150214592274, + "grad_norm": 0.8203125, + "learning_rate": 4.8070469482393016e-06, + "loss": 1.4477, + "step": 14176 + }, + { + "epoch": 0.760568669527897, + "grad_norm": 0.458984375, + "learning_rate": 4.807013479041006e-06, + "loss": 2.0543, + "step": 14177 + }, + { + "epoch": 0.7606223175965665, + "grad_norm": 0.458984375, + "learning_rate": 4.806980007056754e-06, + "loss": 2.3945, + "step": 14178 + }, + { + "epoch": 0.7606759656652361, + "grad_norm": 0.6171875, + "learning_rate": 4.806946532286584e-06, + "loss": 2.1727, + "step": 14179 + }, + { + "epoch": 0.7607296137339056, + "grad_norm": 0.443359375, + "learning_rate": 4.806913054730535e-06, + "loss": 2.3614, + "step": 14180 + }, + { + "epoch": 0.7607832618025752, + "grad_norm": 0.50390625, + "learning_rate": 4.806879574388651e-06, + "loss": 2.3514, + "step": 14181 + }, + { + "epoch": 0.7608369098712446, + "grad_norm": 0.55859375, + "learning_rate": 4.806846091260971e-06, + "loss": 2.3091, + "step": 14182 + }, + { + "epoch": 0.7608905579399141, + "grad_norm": 0.49609375, + "learning_rate": 4.8068126053475336e-06, + "loss": 2.3046, + "step": 14183 + }, + { + "epoch": 0.7609442060085837, + "grad_norm": 0.45703125, + "learning_rate": 4.806779116648381e-06, + "loss": 2.3392, + "step": 14184 + }, + { + "epoch": 0.7609978540772532, + "grad_norm": 0.49609375, + "learning_rate": 4.8067456251635535e-06, + "loss": 2.3855, + "step": 14185 + }, + { + "epoch": 0.7610515021459228, + "grad_norm": 0.447265625, + "learning_rate": 4.806712130893091e-06, + "loss": 2.1509, + "step": 14186 + }, + { + "epoch": 0.7611051502145922, + "grad_norm": 0.4609375, + "learning_rate": 4.8066786338370345e-06, + "loss": 2.5275, + "step": 14187 + }, + { + "epoch": 0.7611587982832618, + "grad_norm": 0.470703125, + "learning_rate": 4.806645133995425e-06, + "loss": 2.1422, + "step": 14188 + }, + { + "epoch": 0.7612124463519313, + "grad_norm": 0.447265625, + "learning_rate": 4.806611631368301e-06, + "loss": 2.3167, + "step": 14189 + }, + { + "epoch": 0.7612660944206009, + "grad_norm": 0.4609375, + "learning_rate": 4.806578125955704e-06, + "loss": 2.1603, + "step": 14190 + }, + { + "epoch": 0.7613197424892704, + "grad_norm": 0.53515625, + "learning_rate": 4.806544617757676e-06, + "loss": 2.3344, + "step": 14191 + }, + { + "epoch": 0.76137339055794, + "grad_norm": 1.1328125, + "learning_rate": 4.806511106774256e-06, + "loss": 2.2685, + "step": 14192 + }, + { + "epoch": 0.7614270386266094, + "grad_norm": 0.5625, + "learning_rate": 4.806477593005483e-06, + "loss": 1.9234, + "step": 14193 + }, + { + "epoch": 0.761480686695279, + "grad_norm": 0.515625, + "learning_rate": 4.8064440764514e-06, + "loss": 2.3294, + "step": 14194 + }, + { + "epoch": 0.7615343347639485, + "grad_norm": 0.462890625, + "learning_rate": 4.806410557112046e-06, + "loss": 2.161, + "step": 14195 + }, + { + "epoch": 0.761587982832618, + "grad_norm": 0.453125, + "learning_rate": 4.806377034987463e-06, + "loss": 2.2835, + "step": 14196 + }, + { + "epoch": 0.7616416309012876, + "grad_norm": 0.4296875, + "learning_rate": 4.806343510077689e-06, + "loss": 1.5677, + "step": 14197 + }, + { + "epoch": 0.761695278969957, + "grad_norm": 0.392578125, + "learning_rate": 4.8063099823827675e-06, + "loss": 2.3279, + "step": 14198 + }, + { + "epoch": 0.7617489270386266, + "grad_norm": 0.447265625, + "learning_rate": 4.806276451902736e-06, + "loss": 2.2565, + "step": 14199 + }, + { + "epoch": 0.7618025751072961, + "grad_norm": 0.40234375, + "learning_rate": 4.806242918637637e-06, + "loss": 2.2545, + "step": 14200 + }, + { + "epoch": 0.7618562231759657, + "grad_norm": 0.443359375, + "learning_rate": 4.806209382587511e-06, + "loss": 2.2537, + "step": 14201 + }, + { + "epoch": 0.7619098712446352, + "grad_norm": 0.5078125, + "learning_rate": 4.806175843752398e-06, + "loss": 2.4707, + "step": 14202 + }, + { + "epoch": 0.7619635193133047, + "grad_norm": 0.341796875, + "learning_rate": 4.806142302132337e-06, + "loss": 2.0556, + "step": 14203 + }, + { + "epoch": 0.7620171673819742, + "grad_norm": 0.44140625, + "learning_rate": 4.806108757727371e-06, + "loss": 2.2061, + "step": 14204 + }, + { + "epoch": 0.7620708154506438, + "grad_norm": 2.375, + "learning_rate": 4.806075210537538e-06, + "loss": 2.2436, + "step": 14205 + }, + { + "epoch": 0.7621244635193133, + "grad_norm": 0.64453125, + "learning_rate": 4.8060416605628815e-06, + "loss": 2.0606, + "step": 14206 + }, + { + "epoch": 0.7621781115879829, + "grad_norm": 0.49609375, + "learning_rate": 4.8060081078034405e-06, + "loss": 2.2077, + "step": 14207 + }, + { + "epoch": 0.7622317596566524, + "grad_norm": 0.482421875, + "learning_rate": 4.805974552259255e-06, + "loss": 2.1605, + "step": 14208 + }, + { + "epoch": 0.7622854077253219, + "grad_norm": 1.53125, + "learning_rate": 4.805940993930366e-06, + "loss": 2.3272, + "step": 14209 + }, + { + "epoch": 0.7623390557939914, + "grad_norm": 0.396484375, + "learning_rate": 4.805907432816814e-06, + "loss": 2.0947, + "step": 14210 + }, + { + "epoch": 0.7623927038626609, + "grad_norm": 0.453125, + "learning_rate": 4.805873868918639e-06, + "loss": 2.4697, + "step": 14211 + }, + { + "epoch": 0.7624463519313305, + "grad_norm": 0.46875, + "learning_rate": 4.805840302235882e-06, + "loss": 2.2009, + "step": 14212 + }, + { + "epoch": 0.7625, + "grad_norm": 0.419921875, + "learning_rate": 4.805806732768585e-06, + "loss": 2.2866, + "step": 14213 + }, + { + "epoch": 0.7625536480686695, + "grad_norm": 0.4765625, + "learning_rate": 4.8057731605167865e-06, + "loss": 2.4168, + "step": 14214 + }, + { + "epoch": 0.762607296137339, + "grad_norm": 0.494140625, + "learning_rate": 4.805739585480528e-06, + "loss": 2.2057, + "step": 14215 + }, + { + "epoch": 0.7626609442060086, + "grad_norm": 0.4375, + "learning_rate": 4.805706007659849e-06, + "loss": 2.2335, + "step": 14216 + }, + { + "epoch": 0.7627145922746781, + "grad_norm": 0.625, + "learning_rate": 4.805672427054791e-06, + "loss": 2.3705, + "step": 14217 + }, + { + "epoch": 0.7627682403433477, + "grad_norm": 0.486328125, + "learning_rate": 4.805638843665394e-06, + "loss": 2.3559, + "step": 14218 + }, + { + "epoch": 0.7628218884120171, + "grad_norm": 0.470703125, + "learning_rate": 4.805605257491699e-06, + "loss": 2.1833, + "step": 14219 + }, + { + "epoch": 0.7628755364806867, + "grad_norm": 0.341796875, + "learning_rate": 4.805571668533747e-06, + "loss": 2.0416, + "step": 14220 + }, + { + "epoch": 0.7629291845493562, + "grad_norm": 0.80859375, + "learning_rate": 4.805538076791578e-06, + "loss": 1.2474, + "step": 14221 + }, + { + "epoch": 0.7629828326180258, + "grad_norm": 0.435546875, + "learning_rate": 4.805504482265232e-06, + "loss": 2.3856, + "step": 14222 + }, + { + "epoch": 0.7630364806866953, + "grad_norm": 0.44140625, + "learning_rate": 4.805470884954751e-06, + "loss": 2.1544, + "step": 14223 + }, + { + "epoch": 0.7630901287553649, + "grad_norm": 0.51171875, + "learning_rate": 4.805437284860174e-06, + "loss": 2.1709, + "step": 14224 + }, + { + "epoch": 0.7631437768240343, + "grad_norm": 0.453125, + "learning_rate": 4.805403681981542e-06, + "loss": 2.2973, + "step": 14225 + }, + { + "epoch": 0.7631974248927038, + "grad_norm": 0.54296875, + "learning_rate": 4.805370076318897e-06, + "loss": 2.1101, + "step": 14226 + }, + { + "epoch": 0.7632510729613734, + "grad_norm": 0.66796875, + "learning_rate": 4.805336467872278e-06, + "loss": 2.3108, + "step": 14227 + }, + { + "epoch": 0.7633047210300429, + "grad_norm": 0.51171875, + "learning_rate": 4.8053028566417255e-06, + "loss": 1.9681, + "step": 14228 + }, + { + "epoch": 0.7633583690987125, + "grad_norm": 0.50390625, + "learning_rate": 4.805269242627281e-06, + "loss": 2.186, + "step": 14229 + }, + { + "epoch": 0.7634120171673819, + "grad_norm": 0.52734375, + "learning_rate": 4.8052356258289845e-06, + "loss": 2.2812, + "step": 14230 + }, + { + "epoch": 0.7634656652360515, + "grad_norm": 0.369140625, + "learning_rate": 4.805202006246877e-06, + "loss": 1.8298, + "step": 14231 + }, + { + "epoch": 0.763519313304721, + "grad_norm": 0.5, + "learning_rate": 4.805168383880999e-06, + "loss": 2.3852, + "step": 14232 + }, + { + "epoch": 0.7635729613733906, + "grad_norm": 0.48828125, + "learning_rate": 4.805134758731391e-06, + "loss": 2.4576, + "step": 14233 + }, + { + "epoch": 0.7636266094420601, + "grad_norm": 0.484375, + "learning_rate": 4.8051011307980935e-06, + "loss": 2.3499, + "step": 14234 + }, + { + "epoch": 0.7636802575107297, + "grad_norm": 0.40625, + "learning_rate": 4.805067500081148e-06, + "loss": 2.239, + "step": 14235 + }, + { + "epoch": 0.7637339055793991, + "grad_norm": 0.41796875, + "learning_rate": 4.805033866580593e-06, + "loss": 2.4198, + "step": 14236 + }, + { + "epoch": 0.7637875536480687, + "grad_norm": 0.69140625, + "learning_rate": 4.805000230296472e-06, + "loss": 2.3895, + "step": 14237 + }, + { + "epoch": 0.7638412017167382, + "grad_norm": 0.421875, + "learning_rate": 4.804966591228823e-06, + "loss": 2.2327, + "step": 14238 + }, + { + "epoch": 0.7638948497854077, + "grad_norm": 0.482421875, + "learning_rate": 4.804932949377687e-06, + "loss": 2.5071, + "step": 14239 + }, + { + "epoch": 0.7639484978540773, + "grad_norm": 0.4296875, + "learning_rate": 4.804899304743107e-06, + "loss": 2.1906, + "step": 14240 + }, + { + "epoch": 0.7640021459227467, + "grad_norm": 0.447265625, + "learning_rate": 4.804865657325121e-06, + "loss": 2.4597, + "step": 14241 + }, + { + "epoch": 0.7640557939914163, + "grad_norm": 0.78515625, + "learning_rate": 4.804832007123771e-06, + "loss": 2.2193, + "step": 14242 + }, + { + "epoch": 0.7641094420600858, + "grad_norm": 0.451171875, + "learning_rate": 4.804798354139097e-06, + "loss": 2.5786, + "step": 14243 + }, + { + "epoch": 0.7641630901287554, + "grad_norm": 0.40234375, + "learning_rate": 4.804764698371139e-06, + "loss": 2.1371, + "step": 14244 + }, + { + "epoch": 0.7642167381974249, + "grad_norm": 0.35546875, + "learning_rate": 4.80473103981994e-06, + "loss": 2.159, + "step": 14245 + }, + { + "epoch": 0.7642703862660944, + "grad_norm": 0.4453125, + "learning_rate": 4.804697378485539e-06, + "loss": 2.0532, + "step": 14246 + }, + { + "epoch": 0.7643240343347639, + "grad_norm": 0.4609375, + "learning_rate": 4.804663714367977e-06, + "loss": 2.4508, + "step": 14247 + }, + { + "epoch": 0.7643776824034335, + "grad_norm": 0.578125, + "learning_rate": 4.804630047467293e-06, + "loss": 2.4173, + "step": 14248 + }, + { + "epoch": 0.764431330472103, + "grad_norm": 0.455078125, + "learning_rate": 4.8045963777835305e-06, + "loss": 2.4156, + "step": 14249 + }, + { + "epoch": 0.7644849785407726, + "grad_norm": 0.408203125, + "learning_rate": 4.804562705316728e-06, + "loss": 2.1518, + "step": 14250 + }, + { + "epoch": 0.764538626609442, + "grad_norm": 0.470703125, + "learning_rate": 4.804529030066927e-06, + "loss": 2.4284, + "step": 14251 + }, + { + "epoch": 0.7645922746781116, + "grad_norm": 0.8203125, + "learning_rate": 4.804495352034169e-06, + "loss": 2.1213, + "step": 14252 + }, + { + "epoch": 0.7646459227467811, + "grad_norm": 0.484375, + "learning_rate": 4.804461671218492e-06, + "loss": 2.2906, + "step": 14253 + }, + { + "epoch": 0.7646995708154506, + "grad_norm": 0.53125, + "learning_rate": 4.804427987619939e-06, + "loss": 2.3549, + "step": 14254 + }, + { + "epoch": 0.7647532188841202, + "grad_norm": 0.431640625, + "learning_rate": 4.804394301238552e-06, + "loss": 2.2785, + "step": 14255 + }, + { + "epoch": 0.7648068669527897, + "grad_norm": 0.5078125, + "learning_rate": 4.804360612074368e-06, + "loss": 2.5578, + "step": 14256 + }, + { + "epoch": 0.7648605150214592, + "grad_norm": 0.408203125, + "learning_rate": 4.80432692012743e-06, + "loss": 2.2486, + "step": 14257 + }, + { + "epoch": 0.7649141630901287, + "grad_norm": 0.859375, + "learning_rate": 4.804293225397778e-06, + "loss": 1.3267, + "step": 14258 + }, + { + "epoch": 0.7649678111587983, + "grad_norm": 0.380859375, + "learning_rate": 4.804259527885453e-06, + "loss": 2.2171, + "step": 14259 + }, + { + "epoch": 0.7650214592274678, + "grad_norm": 0.46875, + "learning_rate": 4.804225827590496e-06, + "loss": 2.3089, + "step": 14260 + }, + { + "epoch": 0.7650751072961374, + "grad_norm": 0.404296875, + "learning_rate": 4.8041921245129465e-06, + "loss": 2.3807, + "step": 14261 + }, + { + "epoch": 0.7651287553648068, + "grad_norm": 1.625, + "learning_rate": 4.804158418652846e-06, + "loss": 2.164, + "step": 14262 + }, + { + "epoch": 0.7651824034334764, + "grad_norm": 0.447265625, + "learning_rate": 4.804124710010236e-06, + "loss": 2.322, + "step": 14263 + }, + { + "epoch": 0.7652360515021459, + "grad_norm": 0.54296875, + "learning_rate": 4.804090998585156e-06, + "loss": 2.414, + "step": 14264 + }, + { + "epoch": 0.7652896995708155, + "grad_norm": 0.474609375, + "learning_rate": 4.804057284377647e-06, + "loss": 2.3174, + "step": 14265 + }, + { + "epoch": 0.765343347639485, + "grad_norm": 0.447265625, + "learning_rate": 4.8040235673877485e-06, + "loss": 2.4134, + "step": 14266 + }, + { + "epoch": 0.7653969957081546, + "grad_norm": 0.431640625, + "learning_rate": 4.803989847615504e-06, + "loss": 2.3167, + "step": 14267 + }, + { + "epoch": 0.765450643776824, + "grad_norm": 0.400390625, + "learning_rate": 4.8039561250609526e-06, + "loss": 2.325, + "step": 14268 + }, + { + "epoch": 0.7655042918454935, + "grad_norm": 0.41796875, + "learning_rate": 4.803922399724135e-06, + "loss": 2.3957, + "step": 14269 + }, + { + "epoch": 0.7655579399141631, + "grad_norm": 0.486328125, + "learning_rate": 4.803888671605092e-06, + "loss": 2.2592, + "step": 14270 + }, + { + "epoch": 0.7656115879828326, + "grad_norm": 0.435546875, + "learning_rate": 4.803854940703864e-06, + "loss": 2.252, + "step": 14271 + }, + { + "epoch": 0.7656652360515022, + "grad_norm": 0.67578125, + "learning_rate": 4.803821207020492e-06, + "loss": 2.3094, + "step": 14272 + }, + { + "epoch": 0.7657188841201716, + "grad_norm": 0.498046875, + "learning_rate": 4.803787470555018e-06, + "loss": 2.1258, + "step": 14273 + }, + { + "epoch": 0.7657725321888412, + "grad_norm": 4.8125, + "learning_rate": 4.803753731307481e-06, + "loss": 2.4088, + "step": 14274 + }, + { + "epoch": 0.7658261802575107, + "grad_norm": 0.64453125, + "learning_rate": 4.803719989277923e-06, + "loss": 2.2973, + "step": 14275 + }, + { + "epoch": 0.7658798283261803, + "grad_norm": 0.4609375, + "learning_rate": 4.803686244466383e-06, + "loss": 2.3419, + "step": 14276 + }, + { + "epoch": 0.7659334763948498, + "grad_norm": 0.431640625, + "learning_rate": 4.803652496872904e-06, + "loss": 2.3473, + "step": 14277 + }, + { + "epoch": 0.7659871244635194, + "grad_norm": 0.423828125, + "learning_rate": 4.803618746497525e-06, + "loss": 2.1666, + "step": 14278 + }, + { + "epoch": 0.7660407725321888, + "grad_norm": 0.47265625, + "learning_rate": 4.803584993340286e-06, + "loss": 2.4023, + "step": 14279 + }, + { + "epoch": 0.7660944206008584, + "grad_norm": 0.412109375, + "learning_rate": 4.803551237401232e-06, + "loss": 2.1002, + "step": 14280 + }, + { + "epoch": 0.7661480686695279, + "grad_norm": 0.5078125, + "learning_rate": 4.803517478680399e-06, + "loss": 2.3021, + "step": 14281 + }, + { + "epoch": 0.7662017167381975, + "grad_norm": 0.45703125, + "learning_rate": 4.80348371717783e-06, + "loss": 2.2887, + "step": 14282 + }, + { + "epoch": 0.766255364806867, + "grad_norm": 0.453125, + "learning_rate": 4.803449952893565e-06, + "loss": 2.3776, + "step": 14283 + }, + { + "epoch": 0.7663090128755364, + "grad_norm": 0.498046875, + "learning_rate": 4.803416185827646e-06, + "loss": 2.4115, + "step": 14284 + }, + { + "epoch": 0.766362660944206, + "grad_norm": 0.421875, + "learning_rate": 4.803382415980113e-06, + "loss": 2.2478, + "step": 14285 + }, + { + "epoch": 0.7664163090128755, + "grad_norm": 0.447265625, + "learning_rate": 4.803348643351006e-06, + "loss": 2.3001, + "step": 14286 + }, + { + "epoch": 0.7664699570815451, + "grad_norm": 0.94921875, + "learning_rate": 4.8033148679403676e-06, + "loss": 2.252, + "step": 14287 + }, + { + "epoch": 0.7665236051502146, + "grad_norm": 1.125, + "learning_rate": 4.8032810897482364e-06, + "loss": 2.2536, + "step": 14288 + }, + { + "epoch": 0.7665772532188841, + "grad_norm": 0.7890625, + "learning_rate": 4.803247308774654e-06, + "loss": 2.3619, + "step": 14289 + }, + { + "epoch": 0.7666309012875536, + "grad_norm": 0.439453125, + "learning_rate": 4.803213525019663e-06, + "loss": 2.3687, + "step": 14290 + }, + { + "epoch": 0.7666845493562232, + "grad_norm": 0.443359375, + "learning_rate": 4.803179738483303e-06, + "loss": 2.2048, + "step": 14291 + }, + { + "epoch": 0.7667381974248927, + "grad_norm": 0.462890625, + "learning_rate": 4.803145949165613e-06, + "loss": 2.4001, + "step": 14292 + }, + { + "epoch": 0.7667918454935623, + "grad_norm": 0.45703125, + "learning_rate": 4.803112157066636e-06, + "loss": 2.2274, + "step": 14293 + }, + { + "epoch": 0.7668454935622318, + "grad_norm": 0.369140625, + "learning_rate": 4.8030783621864115e-06, + "loss": 2.2408, + "step": 14294 + }, + { + "epoch": 0.7668991416309013, + "grad_norm": 0.671875, + "learning_rate": 4.803044564524982e-06, + "loss": 2.3735, + "step": 14295 + }, + { + "epoch": 0.7669527896995708, + "grad_norm": 1.5234375, + "learning_rate": 4.803010764082387e-06, + "loss": 2.4847, + "step": 14296 + }, + { + "epoch": 0.7670064377682403, + "grad_norm": 0.85546875, + "learning_rate": 4.802976960858667e-06, + "loss": 2.6512, + "step": 14297 + }, + { + "epoch": 0.7670600858369099, + "grad_norm": 0.41015625, + "learning_rate": 4.802943154853863e-06, + "loss": 2.5271, + "step": 14298 + }, + { + "epoch": 0.7671137339055794, + "grad_norm": 0.474609375, + "learning_rate": 4.802909346068018e-06, + "loss": 2.4473, + "step": 14299 + }, + { + "epoch": 0.7671673819742489, + "grad_norm": 0.431640625, + "learning_rate": 4.802875534501169e-06, + "loss": 2.1782, + "step": 14300 + }, + { + "epoch": 0.7672210300429184, + "grad_norm": 0.484375, + "learning_rate": 4.80284172015336e-06, + "loss": 2.2806, + "step": 14301 + }, + { + "epoch": 0.767274678111588, + "grad_norm": 0.384765625, + "learning_rate": 4.802807903024631e-06, + "loss": 1.9771, + "step": 14302 + }, + { + "epoch": 0.7673283261802575, + "grad_norm": 0.400390625, + "learning_rate": 4.802774083115021e-06, + "loss": 2.4329, + "step": 14303 + }, + { + "epoch": 0.7673819742489271, + "grad_norm": 0.431640625, + "learning_rate": 4.802740260424574e-06, + "loss": 2.122, + "step": 14304 + }, + { + "epoch": 0.7674356223175965, + "grad_norm": 0.5, + "learning_rate": 4.802706434953329e-06, + "loss": 2.3874, + "step": 14305 + }, + { + "epoch": 0.7674892703862661, + "grad_norm": 0.435546875, + "learning_rate": 4.802672606701326e-06, + "loss": 2.3581, + "step": 14306 + }, + { + "epoch": 0.7675429184549356, + "grad_norm": 0.4296875, + "learning_rate": 4.802638775668608e-06, + "loss": 2.1222, + "step": 14307 + }, + { + "epoch": 0.7675965665236052, + "grad_norm": 0.392578125, + "learning_rate": 4.802604941855215e-06, + "loss": 2.1858, + "step": 14308 + }, + { + "epoch": 0.7676502145922747, + "grad_norm": 0.5, + "learning_rate": 4.802571105261187e-06, + "loss": 2.5422, + "step": 14309 + }, + { + "epoch": 0.7677038626609443, + "grad_norm": 0.400390625, + "learning_rate": 4.802537265886566e-06, + "loss": 2.1802, + "step": 14310 + }, + { + "epoch": 0.7677575107296137, + "grad_norm": 0.87109375, + "learning_rate": 4.802503423731391e-06, + "loss": 2.5094, + "step": 14311 + }, + { + "epoch": 0.7678111587982832, + "grad_norm": 0.546875, + "learning_rate": 4.802469578795705e-06, + "loss": 2.6146, + "step": 14312 + }, + { + "epoch": 0.7678648068669528, + "grad_norm": 0.4921875, + "learning_rate": 4.802435731079549e-06, + "loss": 2.5328, + "step": 14313 + }, + { + "epoch": 0.7679184549356223, + "grad_norm": 1.390625, + "learning_rate": 4.802401880582963e-06, + "loss": 2.0177, + "step": 14314 + }, + { + "epoch": 0.7679721030042919, + "grad_norm": 0.48046875, + "learning_rate": 4.8023680273059865e-06, + "loss": 2.3305, + "step": 14315 + }, + { + "epoch": 0.7680257510729613, + "grad_norm": 0.45703125, + "learning_rate": 4.802334171248663e-06, + "loss": 2.2612, + "step": 14316 + }, + { + "epoch": 0.7680793991416309, + "grad_norm": 0.5078125, + "learning_rate": 4.802300312411031e-06, + "loss": 2.246, + "step": 14317 + }, + { + "epoch": 0.7681330472103004, + "grad_norm": 0.51171875, + "learning_rate": 4.8022664507931335e-06, + "loss": 2.3164, + "step": 14318 + }, + { + "epoch": 0.76818669527897, + "grad_norm": 0.388671875, + "learning_rate": 4.802232586395009e-06, + "loss": 1.8741, + "step": 14319 + }, + { + "epoch": 0.7682403433476395, + "grad_norm": 0.81640625, + "learning_rate": 4.802198719216701e-06, + "loss": 2.199, + "step": 14320 + }, + { + "epoch": 0.768293991416309, + "grad_norm": 1.53125, + "learning_rate": 4.8021648492582496e-06, + "loss": 2.2701, + "step": 14321 + }, + { + "epoch": 0.7683476394849785, + "grad_norm": 0.466796875, + "learning_rate": 4.802130976519694e-06, + "loss": 2.5478, + "step": 14322 + }, + { + "epoch": 0.7684012875536481, + "grad_norm": 0.447265625, + "learning_rate": 4.802097101001077e-06, + "loss": 2.2978, + "step": 14323 + }, + { + "epoch": 0.7684549356223176, + "grad_norm": 0.51171875, + "learning_rate": 4.802063222702439e-06, + "loss": 2.3123, + "step": 14324 + }, + { + "epoch": 0.7685085836909872, + "grad_norm": 0.451171875, + "learning_rate": 4.8020293416238215e-06, + "loss": 2.2542, + "step": 14325 + }, + { + "epoch": 0.7685622317596567, + "grad_norm": 0.48046875, + "learning_rate": 4.801995457765264e-06, + "loss": 2.5164, + "step": 14326 + }, + { + "epoch": 0.7686158798283261, + "grad_norm": 0.4453125, + "learning_rate": 4.801961571126808e-06, + "loss": 2.3325, + "step": 14327 + }, + { + "epoch": 0.7686695278969957, + "grad_norm": 0.3671875, + "learning_rate": 4.8019276817084944e-06, + "loss": 2.1418, + "step": 14328 + }, + { + "epoch": 0.7687231759656652, + "grad_norm": 0.38671875, + "learning_rate": 4.801893789510365e-06, + "loss": 2.0988, + "step": 14329 + }, + { + "epoch": 0.7687768240343348, + "grad_norm": 0.4609375, + "learning_rate": 4.80185989453246e-06, + "loss": 2.289, + "step": 14330 + }, + { + "epoch": 0.7688304721030043, + "grad_norm": 0.466796875, + "learning_rate": 4.80182599677482e-06, + "loss": 2.4609, + "step": 14331 + }, + { + "epoch": 0.7688841201716738, + "grad_norm": 0.5078125, + "learning_rate": 4.8017920962374865e-06, + "loss": 2.461, + "step": 14332 + }, + { + "epoch": 0.7689377682403433, + "grad_norm": 0.53125, + "learning_rate": 4.801758192920501e-06, + "loss": 2.1901, + "step": 14333 + }, + { + "epoch": 0.7689914163090129, + "grad_norm": 0.46875, + "learning_rate": 4.8017242868239025e-06, + "loss": 2.4482, + "step": 14334 + }, + { + "epoch": 0.7690450643776824, + "grad_norm": 3.734375, + "learning_rate": 4.801690377947733e-06, + "loss": 2.0743, + "step": 14335 + }, + { + "epoch": 0.769098712446352, + "grad_norm": 0.439453125, + "learning_rate": 4.801656466292034e-06, + "loss": 2.2432, + "step": 14336 + }, + { + "epoch": 0.7691523605150214, + "grad_norm": 0.494140625, + "learning_rate": 4.801622551856846e-06, + "loss": 2.3055, + "step": 14337 + }, + { + "epoch": 0.769206008583691, + "grad_norm": 0.55078125, + "learning_rate": 4.80158863464221e-06, + "loss": 2.1196, + "step": 14338 + }, + { + "epoch": 0.7692596566523605, + "grad_norm": 0.455078125, + "learning_rate": 4.801554714648166e-06, + "loss": 2.4579, + "step": 14339 + }, + { + "epoch": 0.76931330472103, + "grad_norm": 0.578125, + "learning_rate": 4.8015207918747574e-06, + "loss": 2.1373, + "step": 14340 + }, + { + "epoch": 0.7693669527896996, + "grad_norm": 0.421875, + "learning_rate": 4.801486866322023e-06, + "loss": 2.1831, + "step": 14341 + }, + { + "epoch": 0.769420600858369, + "grad_norm": 0.44140625, + "learning_rate": 4.801452937990006e-06, + "loss": 2.4552, + "step": 14342 + }, + { + "epoch": 0.7694742489270386, + "grad_norm": 0.37890625, + "learning_rate": 4.801419006878743e-06, + "loss": 2.3598, + "step": 14343 + }, + { + "epoch": 0.7695278969957081, + "grad_norm": 1.8515625, + "learning_rate": 4.8013850729882794e-06, + "loss": 2.3231, + "step": 14344 + }, + { + "epoch": 0.7695815450643777, + "grad_norm": 0.455078125, + "learning_rate": 4.801351136318655e-06, + "loss": 2.3393, + "step": 14345 + }, + { + "epoch": 0.7696351931330472, + "grad_norm": 0.48046875, + "learning_rate": 4.801317196869909e-06, + "loss": 2.2552, + "step": 14346 + }, + { + "epoch": 0.7696888412017168, + "grad_norm": 0.427734375, + "learning_rate": 4.8012832546420846e-06, + "loss": 2.3234, + "step": 14347 + }, + { + "epoch": 0.7697424892703862, + "grad_norm": 0.5859375, + "learning_rate": 4.801249309635222e-06, + "loss": 2.3598, + "step": 14348 + }, + { + "epoch": 0.7697961373390558, + "grad_norm": 0.423828125, + "learning_rate": 4.8012153618493614e-06, + "loss": 2.2253, + "step": 14349 + }, + { + "epoch": 0.7698497854077253, + "grad_norm": 0.54296875, + "learning_rate": 4.801181411284545e-06, + "loss": 2.2368, + "step": 14350 + }, + { + "epoch": 0.7699034334763949, + "grad_norm": 0.423828125, + "learning_rate": 4.801147457940813e-06, + "loss": 1.9123, + "step": 14351 + }, + { + "epoch": 0.7699570815450644, + "grad_norm": 0.39453125, + "learning_rate": 4.801113501818208e-06, + "loss": 1.9941, + "step": 14352 + }, + { + "epoch": 0.770010729613734, + "grad_norm": 1.15625, + "learning_rate": 4.801079542916768e-06, + "loss": 2.7421, + "step": 14353 + }, + { + "epoch": 0.7700643776824034, + "grad_norm": 0.494140625, + "learning_rate": 4.801045581236536e-06, + "loss": 2.2223, + "step": 14354 + }, + { + "epoch": 0.7701180257510729, + "grad_norm": 0.53515625, + "learning_rate": 4.8010116167775534e-06, + "loss": 2.2624, + "step": 14355 + }, + { + "epoch": 0.7701716738197425, + "grad_norm": 0.44921875, + "learning_rate": 4.80097764953986e-06, + "loss": 2.35, + "step": 14356 + }, + { + "epoch": 0.770225321888412, + "grad_norm": 0.486328125, + "learning_rate": 4.8009436795234975e-06, + "loss": 2.3174, + "step": 14357 + }, + { + "epoch": 0.7702789699570816, + "grad_norm": 0.421875, + "learning_rate": 4.800909706728507e-06, + "loss": 2.2941, + "step": 14358 + }, + { + "epoch": 0.770332618025751, + "grad_norm": 0.515625, + "learning_rate": 4.800875731154929e-06, + "loss": 2.3882, + "step": 14359 + }, + { + "epoch": 0.7703862660944206, + "grad_norm": 0.462890625, + "learning_rate": 4.800841752802805e-06, + "loss": 2.3466, + "step": 14360 + }, + { + "epoch": 0.7704399141630901, + "grad_norm": 0.453125, + "learning_rate": 4.800807771672177e-06, + "loss": 2.3066, + "step": 14361 + }, + { + "epoch": 0.7704935622317597, + "grad_norm": 0.498046875, + "learning_rate": 4.800773787763083e-06, + "loss": 2.3063, + "step": 14362 + }, + { + "epoch": 0.7705472103004292, + "grad_norm": 0.42578125, + "learning_rate": 4.800739801075567e-06, + "loss": 2.0621, + "step": 14363 + }, + { + "epoch": 0.7706008583690988, + "grad_norm": 0.45703125, + "learning_rate": 4.8007058116096685e-06, + "loss": 2.1874, + "step": 14364 + }, + { + "epoch": 0.7706545064377682, + "grad_norm": 0.53515625, + "learning_rate": 4.80067181936543e-06, + "loss": 2.4454, + "step": 14365 + }, + { + "epoch": 0.7707081545064378, + "grad_norm": 0.43359375, + "learning_rate": 4.800637824342891e-06, + "loss": 2.2513, + "step": 14366 + }, + { + "epoch": 0.7707618025751073, + "grad_norm": 0.435546875, + "learning_rate": 4.800603826542093e-06, + "loss": 2.325, + "step": 14367 + }, + { + "epoch": 0.7708154506437769, + "grad_norm": 0.41796875, + "learning_rate": 4.800569825963078e-06, + "loss": 2.3178, + "step": 14368 + }, + { + "epoch": 0.7708690987124464, + "grad_norm": 0.53125, + "learning_rate": 4.800535822605885e-06, + "loss": 2.1727, + "step": 14369 + }, + { + "epoch": 0.7709227467811158, + "grad_norm": 0.466796875, + "learning_rate": 4.800501816470556e-06, + "loss": 2.2397, + "step": 14370 + }, + { + "epoch": 0.7709763948497854, + "grad_norm": 0.408203125, + "learning_rate": 4.800467807557134e-06, + "loss": 2.1482, + "step": 14371 + }, + { + "epoch": 0.7710300429184549, + "grad_norm": 0.53515625, + "learning_rate": 4.800433795865657e-06, + "loss": 1.4842, + "step": 14372 + }, + { + "epoch": 0.7710836909871245, + "grad_norm": 0.73828125, + "learning_rate": 4.800399781396168e-06, + "loss": 2.1175, + "step": 14373 + }, + { + "epoch": 0.771137339055794, + "grad_norm": 0.5234375, + "learning_rate": 4.800365764148708e-06, + "loss": 2.3467, + "step": 14374 + }, + { + "epoch": 0.7711909871244635, + "grad_norm": 0.427734375, + "learning_rate": 4.800331744123317e-06, + "loss": 2.0718, + "step": 14375 + }, + { + "epoch": 0.771244635193133, + "grad_norm": 0.462890625, + "learning_rate": 4.800297721320038e-06, + "loss": 2.3336, + "step": 14376 + }, + { + "epoch": 0.7712982832618026, + "grad_norm": 0.51953125, + "learning_rate": 4.800263695738911e-06, + "loss": 2.2706, + "step": 14377 + }, + { + "epoch": 0.7713519313304721, + "grad_norm": 0.4453125, + "learning_rate": 4.800229667379975e-06, + "loss": 2.2334, + "step": 14378 + }, + { + "epoch": 0.7714055793991417, + "grad_norm": 0.455078125, + "learning_rate": 4.800195636243275e-06, + "loss": 2.3234, + "step": 14379 + }, + { + "epoch": 0.7714592274678111, + "grad_norm": 0.435546875, + "learning_rate": 4.800161602328849e-06, + "loss": 2.1731, + "step": 14380 + }, + { + "epoch": 0.7715128755364807, + "grad_norm": 0.484375, + "learning_rate": 4.800127565636739e-06, + "loss": 2.2278, + "step": 14381 + }, + { + "epoch": 0.7715665236051502, + "grad_norm": 0.43359375, + "learning_rate": 4.800093526166987e-06, + "loss": 2.1003, + "step": 14382 + }, + { + "epoch": 0.7716201716738197, + "grad_norm": 0.4453125, + "learning_rate": 4.800059483919633e-06, + "loss": 2.2566, + "step": 14383 + }, + { + "epoch": 0.7716738197424893, + "grad_norm": 0.4765625, + "learning_rate": 4.800025438894718e-06, + "loss": 2.4948, + "step": 14384 + }, + { + "epoch": 0.7717274678111588, + "grad_norm": 0.38671875, + "learning_rate": 4.7999913910922855e-06, + "loss": 1.855, + "step": 14385 + }, + { + "epoch": 0.7717811158798283, + "grad_norm": 0.412109375, + "learning_rate": 4.799957340512373e-06, + "loss": 2.0059, + "step": 14386 + }, + { + "epoch": 0.7718347639484978, + "grad_norm": 0.55078125, + "learning_rate": 4.799923287155025e-06, + "loss": 2.1216, + "step": 14387 + }, + { + "epoch": 0.7718884120171674, + "grad_norm": 0.42578125, + "learning_rate": 4.79988923102028e-06, + "loss": 2.3228, + "step": 14388 + }, + { + "epoch": 0.7719420600858369, + "grad_norm": 0.53125, + "learning_rate": 4.79985517210818e-06, + "loss": 2.3849, + "step": 14389 + }, + { + "epoch": 0.7719957081545065, + "grad_norm": 0.7578125, + "learning_rate": 4.799821110418766e-06, + "loss": 2.2007, + "step": 14390 + }, + { + "epoch": 0.7720493562231759, + "grad_norm": 0.466796875, + "learning_rate": 4.79978704595208e-06, + "loss": 2.1803, + "step": 14391 + }, + { + "epoch": 0.7721030042918455, + "grad_norm": 0.69140625, + "learning_rate": 4.799752978708163e-06, + "loss": 2.0313, + "step": 14392 + }, + { + "epoch": 0.772156652360515, + "grad_norm": 0.48828125, + "learning_rate": 4.799718908687055e-06, + "loss": 2.3949, + "step": 14393 + }, + { + "epoch": 0.7722103004291846, + "grad_norm": 0.494140625, + "learning_rate": 4.7996848358887986e-06, + "loss": 2.4321, + "step": 14394 + }, + { + "epoch": 0.7722639484978541, + "grad_norm": 0.37890625, + "learning_rate": 4.799650760313433e-06, + "loss": 2.0064, + "step": 14395 + }, + { + "epoch": 0.7723175965665237, + "grad_norm": 0.50390625, + "learning_rate": 4.799616681961002e-06, + "loss": 2.1283, + "step": 14396 + }, + { + "epoch": 0.7723712446351931, + "grad_norm": 0.47265625, + "learning_rate": 4.799582600831543e-06, + "loss": 2.4187, + "step": 14397 + }, + { + "epoch": 0.7724248927038626, + "grad_norm": 0.494140625, + "learning_rate": 4.799548516925101e-06, + "loss": 2.2358, + "step": 14398 + }, + { + "epoch": 0.7724785407725322, + "grad_norm": 0.51171875, + "learning_rate": 4.799514430241716e-06, + "loss": 2.4053, + "step": 14399 + }, + { + "epoch": 0.7725321888412017, + "grad_norm": 0.421875, + "learning_rate": 4.7994803407814275e-06, + "loss": 2.216, + "step": 14400 + }, + { + "epoch": 0.7725858369098713, + "grad_norm": 0.48046875, + "learning_rate": 4.799446248544279e-06, + "loss": 2.3875, + "step": 14401 + }, + { + "epoch": 0.7726394849785407, + "grad_norm": 0.50390625, + "learning_rate": 4.79941215353031e-06, + "loss": 2.2788, + "step": 14402 + }, + { + "epoch": 0.7726931330472103, + "grad_norm": 0.470703125, + "learning_rate": 4.799378055739562e-06, + "loss": 2.2702, + "step": 14403 + }, + { + "epoch": 0.7727467811158798, + "grad_norm": 0.384765625, + "learning_rate": 4.7993439551720775e-06, + "loss": 2.1796, + "step": 14404 + }, + { + "epoch": 0.7728004291845494, + "grad_norm": 0.53125, + "learning_rate": 4.799309851827896e-06, + "loss": 2.2254, + "step": 14405 + }, + { + "epoch": 0.7728540772532189, + "grad_norm": 0.5546875, + "learning_rate": 4.7992757457070594e-06, + "loss": 2.2437, + "step": 14406 + }, + { + "epoch": 0.7729077253218885, + "grad_norm": 0.462890625, + "learning_rate": 4.799241636809608e-06, + "loss": 2.011, + "step": 14407 + }, + { + "epoch": 0.7729613733905579, + "grad_norm": 0.419921875, + "learning_rate": 4.799207525135585e-06, + "loss": 2.4255, + "step": 14408 + }, + { + "epoch": 0.7730150214592275, + "grad_norm": 0.50390625, + "learning_rate": 4.79917341068503e-06, + "loss": 2.2777, + "step": 14409 + }, + { + "epoch": 0.773068669527897, + "grad_norm": 0.48046875, + "learning_rate": 4.7991392934579836e-06, + "loss": 2.3156, + "step": 14410 + }, + { + "epoch": 0.7731223175965666, + "grad_norm": 0.453125, + "learning_rate": 4.7991051734544895e-06, + "loss": 2.4287, + "step": 14411 + }, + { + "epoch": 0.773175965665236, + "grad_norm": 0.46484375, + "learning_rate": 4.7990710506745865e-06, + "loss": 2.1522, + "step": 14412 + }, + { + "epoch": 0.7732296137339055, + "grad_norm": 0.443359375, + "learning_rate": 4.799036925118316e-06, + "loss": 2.2687, + "step": 14413 + }, + { + "epoch": 0.7732832618025751, + "grad_norm": 0.46484375, + "learning_rate": 4.799002796785722e-06, + "loss": 2.2788, + "step": 14414 + }, + { + "epoch": 0.7733369098712446, + "grad_norm": 0.4296875, + "learning_rate": 4.798968665676842e-06, + "loss": 2.5286, + "step": 14415 + }, + { + "epoch": 0.7733905579399142, + "grad_norm": 0.404296875, + "learning_rate": 4.798934531791719e-06, + "loss": 2.0331, + "step": 14416 + }, + { + "epoch": 0.7734442060085837, + "grad_norm": 0.40234375, + "learning_rate": 4.7989003951303945e-06, + "loss": 1.9995, + "step": 14417 + }, + { + "epoch": 0.7734978540772532, + "grad_norm": 0.51171875, + "learning_rate": 4.7988662556929085e-06, + "loss": 2.4235, + "step": 14418 + }, + { + "epoch": 0.7735515021459227, + "grad_norm": 0.4921875, + "learning_rate": 4.798832113479303e-06, + "loss": 2.0641, + "step": 14419 + }, + { + "epoch": 0.7736051502145923, + "grad_norm": 0.4921875, + "learning_rate": 4.7987979684896206e-06, + "loss": 1.7188, + "step": 14420 + }, + { + "epoch": 0.7736587982832618, + "grad_norm": 0.5, + "learning_rate": 4.798763820723901e-06, + "loss": 2.0971, + "step": 14421 + }, + { + "epoch": 0.7737124463519314, + "grad_norm": 0.39453125, + "learning_rate": 4.798729670182184e-06, + "loss": 2.3367, + "step": 14422 + }, + { + "epoch": 0.7737660944206008, + "grad_norm": 0.82421875, + "learning_rate": 4.798695516864513e-06, + "loss": 2.4034, + "step": 14423 + }, + { + "epoch": 0.7738197424892704, + "grad_norm": 0.43359375, + "learning_rate": 4.7986613607709295e-06, + "loss": 2.1593, + "step": 14424 + }, + { + "epoch": 0.7738733905579399, + "grad_norm": 0.51171875, + "learning_rate": 4.7986272019014734e-06, + "loss": 2.3091, + "step": 14425 + }, + { + "epoch": 0.7739270386266094, + "grad_norm": 0.515625, + "learning_rate": 4.798593040256186e-06, + "loss": 2.551, + "step": 14426 + }, + { + "epoch": 0.773980686695279, + "grad_norm": 0.49609375, + "learning_rate": 4.79855887583511e-06, + "loss": 1.8593, + "step": 14427 + }, + { + "epoch": 0.7740343347639485, + "grad_norm": 0.55078125, + "learning_rate": 4.798524708638286e-06, + "loss": 2.1409, + "step": 14428 + }, + { + "epoch": 0.774087982832618, + "grad_norm": 0.384765625, + "learning_rate": 4.7984905386657536e-06, + "loss": 2.1642, + "step": 14429 + }, + { + "epoch": 0.7741416309012875, + "grad_norm": 0.46484375, + "learning_rate": 4.798456365917556e-06, + "loss": 2.3757, + "step": 14430 + }, + { + "epoch": 0.7741952789699571, + "grad_norm": 0.447265625, + "learning_rate": 4.798422190393734e-06, + "loss": 2.0164, + "step": 14431 + }, + { + "epoch": 0.7742489270386266, + "grad_norm": 0.45703125, + "learning_rate": 4.798388012094328e-06, + "loss": 2.1557, + "step": 14432 + }, + { + "epoch": 0.7743025751072962, + "grad_norm": 0.51171875, + "learning_rate": 4.798353831019381e-06, + "loss": 2.319, + "step": 14433 + }, + { + "epoch": 0.7743562231759656, + "grad_norm": 0.453125, + "learning_rate": 4.798319647168933e-06, + "loss": 2.1648, + "step": 14434 + }, + { + "epoch": 0.7744098712446352, + "grad_norm": 0.515625, + "learning_rate": 4.798285460543025e-06, + "loss": 2.3985, + "step": 14435 + }, + { + "epoch": 0.7744635193133047, + "grad_norm": 0.48046875, + "learning_rate": 4.7982512711416995e-06, + "loss": 2.3613, + "step": 14436 + }, + { + "epoch": 0.7745171673819743, + "grad_norm": 0.451171875, + "learning_rate": 4.798217078964997e-06, + "loss": 2.4701, + "step": 14437 + }, + { + "epoch": 0.7745708154506438, + "grad_norm": 0.5625, + "learning_rate": 4.79818288401296e-06, + "loss": 2.2305, + "step": 14438 + }, + { + "epoch": 0.7746244635193134, + "grad_norm": 0.5625, + "learning_rate": 4.798148686285627e-06, + "loss": 2.3828, + "step": 14439 + }, + { + "epoch": 0.7746781115879828, + "grad_norm": 0.443359375, + "learning_rate": 4.798114485783042e-06, + "loss": 2.3226, + "step": 14440 + }, + { + "epoch": 0.7747317596566523, + "grad_norm": 0.66796875, + "learning_rate": 4.798080282505245e-06, + "loss": 1.2811, + "step": 14441 + }, + { + "epoch": 0.7747854077253219, + "grad_norm": 0.48046875, + "learning_rate": 4.798046076452277e-06, + "loss": 2.4583, + "step": 14442 + }, + { + "epoch": 0.7748390557939914, + "grad_norm": 0.5078125, + "learning_rate": 4.7980118676241816e-06, + "loss": 2.3757, + "step": 14443 + }, + { + "epoch": 0.774892703862661, + "grad_norm": 0.451171875, + "learning_rate": 4.797977656020997e-06, + "loss": 2.1204, + "step": 14444 + }, + { + "epoch": 0.7749463519313304, + "grad_norm": 0.4609375, + "learning_rate": 4.7979434416427665e-06, + "loss": 2.3446, + "step": 14445 + }, + { + "epoch": 0.775, + "grad_norm": 0.51171875, + "learning_rate": 4.797909224489531e-06, + "loss": 2.1491, + "step": 14446 + }, + { + "epoch": 0.7750536480686695, + "grad_norm": 0.40234375, + "learning_rate": 4.797875004561331e-06, + "loss": 2.0418, + "step": 14447 + }, + { + "epoch": 0.7751072961373391, + "grad_norm": 0.431640625, + "learning_rate": 4.797840781858209e-06, + "loss": 2.1933, + "step": 14448 + }, + { + "epoch": 0.7751609442060086, + "grad_norm": 0.478515625, + "learning_rate": 4.7978065563802065e-06, + "loss": 2.1989, + "step": 14449 + }, + { + "epoch": 0.7752145922746781, + "grad_norm": 0.486328125, + "learning_rate": 4.797772328127364e-06, + "loss": 2.452, + "step": 14450 + }, + { + "epoch": 0.7752682403433476, + "grad_norm": 0.466796875, + "learning_rate": 4.797738097099722e-06, + "loss": 2.5061, + "step": 14451 + }, + { + "epoch": 0.7753218884120172, + "grad_norm": 0.421875, + "learning_rate": 4.797703863297324e-06, + "loss": 2.2111, + "step": 14452 + }, + { + "epoch": 0.7753755364806867, + "grad_norm": 0.53125, + "learning_rate": 4.79766962672021e-06, + "loss": 2.5677, + "step": 14453 + }, + { + "epoch": 0.7754291845493563, + "grad_norm": 0.4921875, + "learning_rate": 4.797635387368421e-06, + "loss": 2.4606, + "step": 14454 + }, + { + "epoch": 0.7754828326180258, + "grad_norm": 0.427734375, + "learning_rate": 4.797601145241999e-06, + "loss": 2.3931, + "step": 14455 + }, + { + "epoch": 0.7755364806866952, + "grad_norm": 0.44921875, + "learning_rate": 4.797566900340986e-06, + "loss": 2.2295, + "step": 14456 + }, + { + "epoch": 0.7755901287553648, + "grad_norm": 0.443359375, + "learning_rate": 4.797532652665422e-06, + "loss": 2.3341, + "step": 14457 + }, + { + "epoch": 0.7756437768240343, + "grad_norm": 0.4375, + "learning_rate": 4.797498402215349e-06, + "loss": 2.3742, + "step": 14458 + }, + { + "epoch": 0.7756974248927039, + "grad_norm": 0.47265625, + "learning_rate": 4.797464148990808e-06, + "loss": 2.3541, + "step": 14459 + }, + { + "epoch": 0.7757510729613734, + "grad_norm": 0.4609375, + "learning_rate": 4.797429892991841e-06, + "loss": 2.3256, + "step": 14460 + }, + { + "epoch": 0.7758047210300429, + "grad_norm": 0.447265625, + "learning_rate": 4.797395634218489e-06, + "loss": 2.0445, + "step": 14461 + }, + { + "epoch": 0.7758583690987124, + "grad_norm": 0.4921875, + "learning_rate": 4.797361372670793e-06, + "loss": 2.2795, + "step": 14462 + }, + { + "epoch": 0.775912017167382, + "grad_norm": 0.435546875, + "learning_rate": 4.797327108348796e-06, + "loss": 2.4398, + "step": 14463 + }, + { + "epoch": 0.7759656652360515, + "grad_norm": 0.52734375, + "learning_rate": 4.797292841252537e-06, + "loss": 2.4159, + "step": 14464 + }, + { + "epoch": 0.7760193133047211, + "grad_norm": 0.486328125, + "learning_rate": 4.79725857138206e-06, + "loss": 2.3639, + "step": 14465 + }, + { + "epoch": 0.7760729613733905, + "grad_norm": 0.6328125, + "learning_rate": 4.7972242987374034e-06, + "loss": 2.355, + "step": 14466 + }, + { + "epoch": 0.7761266094420601, + "grad_norm": 0.5390625, + "learning_rate": 4.797190023318611e-06, + "loss": 2.3515, + "step": 14467 + }, + { + "epoch": 0.7761802575107296, + "grad_norm": 0.46875, + "learning_rate": 4.797155745125722e-06, + "loss": 2.2748, + "step": 14468 + }, + { + "epoch": 0.7762339055793992, + "grad_norm": 0.494140625, + "learning_rate": 4.797121464158781e-06, + "loss": 2.3, + "step": 14469 + }, + { + "epoch": 0.7762875536480687, + "grad_norm": 0.5234375, + "learning_rate": 4.797087180417827e-06, + "loss": 2.4246, + "step": 14470 + }, + { + "epoch": 0.7763412017167381, + "grad_norm": 0.40234375, + "learning_rate": 4.797052893902901e-06, + "loss": 2.644, + "step": 14471 + }, + { + "epoch": 0.7763948497854077, + "grad_norm": 0.4296875, + "learning_rate": 4.797018604614046e-06, + "loss": 2.3444, + "step": 14472 + }, + { + "epoch": 0.7764484978540772, + "grad_norm": 0.47265625, + "learning_rate": 4.796984312551303e-06, + "loss": 2.017, + "step": 14473 + }, + { + "epoch": 0.7765021459227468, + "grad_norm": 0.4296875, + "learning_rate": 4.796950017714712e-06, + "loss": 2.2044, + "step": 14474 + }, + { + "epoch": 0.7765557939914163, + "grad_norm": 0.4453125, + "learning_rate": 4.7969157201043155e-06, + "loss": 2.3363, + "step": 14475 + }, + { + "epoch": 0.7766094420600859, + "grad_norm": 0.458984375, + "learning_rate": 4.796881419720156e-06, + "loss": 2.4403, + "step": 14476 + }, + { + "epoch": 0.7766630901287553, + "grad_norm": 0.4140625, + "learning_rate": 4.796847116562275e-06, + "loss": 2.2081, + "step": 14477 + }, + { + "epoch": 0.7767167381974249, + "grad_norm": 0.4921875, + "learning_rate": 4.796812810630711e-06, + "loss": 2.395, + "step": 14478 + }, + { + "epoch": 0.7767703862660944, + "grad_norm": 0.439453125, + "learning_rate": 4.796778501925507e-06, + "loss": 2.3647, + "step": 14479 + }, + { + "epoch": 0.776824034334764, + "grad_norm": 0.57421875, + "learning_rate": 4.796744190446706e-06, + "loss": 2.4063, + "step": 14480 + }, + { + "epoch": 0.7768776824034335, + "grad_norm": 0.46484375, + "learning_rate": 4.796709876194347e-06, + "loss": 2.4133, + "step": 14481 + }, + { + "epoch": 0.776931330472103, + "grad_norm": 0.435546875, + "learning_rate": 4.796675559168473e-06, + "loss": 2.1628, + "step": 14482 + }, + { + "epoch": 0.7769849785407725, + "grad_norm": 0.427734375, + "learning_rate": 4.796641239369125e-06, + "loss": 2.2059, + "step": 14483 + }, + { + "epoch": 0.777038626609442, + "grad_norm": 0.4453125, + "learning_rate": 4.796606916796345e-06, + "loss": 2.1382, + "step": 14484 + }, + { + "epoch": 0.7770922746781116, + "grad_norm": 0.443359375, + "learning_rate": 4.796572591450173e-06, + "loss": 2.2868, + "step": 14485 + }, + { + "epoch": 0.7771459227467811, + "grad_norm": 0.6640625, + "learning_rate": 4.796538263330651e-06, + "loss": 2.4208, + "step": 14486 + }, + { + "epoch": 0.7771995708154507, + "grad_norm": 0.484375, + "learning_rate": 4.796503932437821e-06, + "loss": 2.1625, + "step": 14487 + }, + { + "epoch": 0.7772532188841201, + "grad_norm": 0.56640625, + "learning_rate": 4.7964695987717256e-06, + "loss": 2.349, + "step": 14488 + }, + { + "epoch": 0.7773068669527897, + "grad_norm": 0.48046875, + "learning_rate": 4.796435262332404e-06, + "loss": 1.8406, + "step": 14489 + }, + { + "epoch": 0.7773605150214592, + "grad_norm": 0.466796875, + "learning_rate": 4.796400923119898e-06, + "loss": 2.4664, + "step": 14490 + }, + { + "epoch": 0.7774141630901288, + "grad_norm": 0.44921875, + "learning_rate": 4.796366581134251e-06, + "loss": 2.5074, + "step": 14491 + }, + { + "epoch": 0.7774678111587983, + "grad_norm": 0.5, + "learning_rate": 4.796332236375501e-06, + "loss": 2.298, + "step": 14492 + }, + { + "epoch": 0.7775214592274678, + "grad_norm": 0.41796875, + "learning_rate": 4.796297888843694e-06, + "loss": 2.3252, + "step": 14493 + }, + { + "epoch": 0.7775751072961373, + "grad_norm": 0.5703125, + "learning_rate": 4.796263538538868e-06, + "loss": 2.2563, + "step": 14494 + }, + { + "epoch": 0.7776287553648069, + "grad_norm": 0.453125, + "learning_rate": 4.796229185461066e-06, + "loss": 2.0714, + "step": 14495 + }, + { + "epoch": 0.7776824034334764, + "grad_norm": 0.447265625, + "learning_rate": 4.796194829610328e-06, + "loss": 2.2578, + "step": 14496 + }, + { + "epoch": 0.777736051502146, + "grad_norm": 0.953125, + "learning_rate": 4.796160470986697e-06, + "loss": 2.392, + "step": 14497 + }, + { + "epoch": 0.7777896995708155, + "grad_norm": 0.470703125, + "learning_rate": 4.796126109590214e-06, + "loss": 2.1855, + "step": 14498 + }, + { + "epoch": 0.7778433476394849, + "grad_norm": 0.7109375, + "learning_rate": 4.796091745420921e-06, + "loss": 2.338, + "step": 14499 + }, + { + "epoch": 0.7778969957081545, + "grad_norm": 0.55078125, + "learning_rate": 4.796057378478858e-06, + "loss": 2.4692, + "step": 14500 + }, + { + "epoch": 0.777950643776824, + "grad_norm": 0.4921875, + "learning_rate": 4.796023008764068e-06, + "loss": 2.2266, + "step": 14501 + }, + { + "epoch": 0.7780042918454936, + "grad_norm": 0.78125, + "learning_rate": 4.795988636276592e-06, + "loss": 2.2951, + "step": 14502 + }, + { + "epoch": 0.778057939914163, + "grad_norm": 0.392578125, + "learning_rate": 4.795954261016472e-06, + "loss": 2.3415, + "step": 14503 + }, + { + "epoch": 0.7781115879828326, + "grad_norm": 0.46875, + "learning_rate": 4.795919882983748e-06, + "loss": 2.3257, + "step": 14504 + }, + { + "epoch": 0.7781652360515021, + "grad_norm": 0.412109375, + "learning_rate": 4.795885502178463e-06, + "loss": 2.1355, + "step": 14505 + }, + { + "epoch": 0.7782188841201717, + "grad_norm": 0.5078125, + "learning_rate": 4.795851118600658e-06, + "loss": 2.0922, + "step": 14506 + }, + { + "epoch": 0.7782725321888412, + "grad_norm": 0.421875, + "learning_rate": 4.795816732250375e-06, + "loss": 2.2482, + "step": 14507 + }, + { + "epoch": 0.7783261802575108, + "grad_norm": 0.41796875, + "learning_rate": 4.795782343127655e-06, + "loss": 2.3502, + "step": 14508 + }, + { + "epoch": 0.7783798283261802, + "grad_norm": 0.443359375, + "learning_rate": 4.79574795123254e-06, + "loss": 2.2572, + "step": 14509 + }, + { + "epoch": 0.7784334763948498, + "grad_norm": 0.4296875, + "learning_rate": 4.795713556565071e-06, + "loss": 2.2804, + "step": 14510 + }, + { + "epoch": 0.7784871244635193, + "grad_norm": 0.427734375, + "learning_rate": 4.79567915912529e-06, + "loss": 2.3782, + "step": 14511 + }, + { + "epoch": 0.7785407725321889, + "grad_norm": 0.5, + "learning_rate": 4.795644758913237e-06, + "loss": 2.1153, + "step": 14512 + }, + { + "epoch": 0.7785944206008584, + "grad_norm": 0.42578125, + "learning_rate": 4.795610355928956e-06, + "loss": 2.2412, + "step": 14513 + }, + { + "epoch": 0.7786480686695278, + "grad_norm": 0.388671875, + "learning_rate": 4.795575950172486e-06, + "loss": 2.1538, + "step": 14514 + }, + { + "epoch": 0.7787017167381974, + "grad_norm": 0.453125, + "learning_rate": 4.795541541643871e-06, + "loss": 2.5133, + "step": 14515 + }, + { + "epoch": 0.7787553648068669, + "grad_norm": 0.421875, + "learning_rate": 4.795507130343151e-06, + "loss": 2.2768, + "step": 14516 + }, + { + "epoch": 0.7788090128755365, + "grad_norm": 0.4609375, + "learning_rate": 4.7954727162703685e-06, + "loss": 2.257, + "step": 14517 + }, + { + "epoch": 0.778862660944206, + "grad_norm": 0.44921875, + "learning_rate": 4.795438299425564e-06, + "loss": 2.1037, + "step": 14518 + }, + { + "epoch": 0.7789163090128756, + "grad_norm": 0.462890625, + "learning_rate": 4.79540387980878e-06, + "loss": 2.1329, + "step": 14519 + }, + { + "epoch": 0.778969957081545, + "grad_norm": 0.66015625, + "learning_rate": 4.795369457420057e-06, + "loss": 2.2606, + "step": 14520 + }, + { + "epoch": 0.7790236051502146, + "grad_norm": 0.447265625, + "learning_rate": 4.795335032259437e-06, + "loss": 2.2949, + "step": 14521 + }, + { + "epoch": 0.7790772532188841, + "grad_norm": 0.49609375, + "learning_rate": 4.795300604326964e-06, + "loss": 2.2625, + "step": 14522 + }, + { + "epoch": 0.7791309012875537, + "grad_norm": 0.5234375, + "learning_rate": 4.795266173622676e-06, + "loss": 2.4439, + "step": 14523 + }, + { + "epoch": 0.7791845493562232, + "grad_norm": 0.498046875, + "learning_rate": 4.795231740146615e-06, + "loss": 2.0711, + "step": 14524 + }, + { + "epoch": 0.7792381974248928, + "grad_norm": 0.4140625, + "learning_rate": 4.795197303898824e-06, + "loss": 2.3683, + "step": 14525 + }, + { + "epoch": 0.7792918454935622, + "grad_norm": 0.4140625, + "learning_rate": 4.795162864879345e-06, + "loss": 2.2347, + "step": 14526 + }, + { + "epoch": 0.7793454935622317, + "grad_norm": 0.42578125, + "learning_rate": 4.795128423088218e-06, + "loss": 2.4972, + "step": 14527 + }, + { + "epoch": 0.7793991416309013, + "grad_norm": 0.4296875, + "learning_rate": 4.795093978525486e-06, + "loss": 2.4036, + "step": 14528 + }, + { + "epoch": 0.7794527896995708, + "grad_norm": 0.439453125, + "learning_rate": 4.795059531191189e-06, + "loss": 2.3455, + "step": 14529 + }, + { + "epoch": 0.7795064377682404, + "grad_norm": 0.55859375, + "learning_rate": 4.79502508108537e-06, + "loss": 2.5054, + "step": 14530 + }, + { + "epoch": 0.7795600858369098, + "grad_norm": 0.53125, + "learning_rate": 4.79499062820807e-06, + "loss": 2.5391, + "step": 14531 + }, + { + "epoch": 0.7796137339055794, + "grad_norm": 0.54296875, + "learning_rate": 4.794956172559331e-06, + "loss": 2.1563, + "step": 14532 + }, + { + "epoch": 0.7796673819742489, + "grad_norm": 0.4453125, + "learning_rate": 4.794921714139194e-06, + "loss": 2.3001, + "step": 14533 + }, + { + "epoch": 0.7797210300429185, + "grad_norm": 0.443359375, + "learning_rate": 4.794887252947701e-06, + "loss": 2.3595, + "step": 14534 + }, + { + "epoch": 0.779774678111588, + "grad_norm": 0.498046875, + "learning_rate": 4.794852788984894e-06, + "loss": 2.2578, + "step": 14535 + }, + { + "epoch": 0.7798283261802575, + "grad_norm": 0.447265625, + "learning_rate": 4.794818322250814e-06, + "loss": 2.3912, + "step": 14536 + }, + { + "epoch": 0.779881974248927, + "grad_norm": 0.58203125, + "learning_rate": 4.794783852745503e-06, + "loss": 2.4363, + "step": 14537 + }, + { + "epoch": 0.7799356223175966, + "grad_norm": 0.45703125, + "learning_rate": 4.794749380469001e-06, + "loss": 2.0799, + "step": 14538 + }, + { + "epoch": 0.7799892703862661, + "grad_norm": 0.93359375, + "learning_rate": 4.7947149054213525e-06, + "loss": 2.1512, + "step": 14539 + }, + { + "epoch": 0.7800429184549357, + "grad_norm": 0.46875, + "learning_rate": 4.794680427602597e-06, + "loss": 2.3873, + "step": 14540 + }, + { + "epoch": 0.7800965665236052, + "grad_norm": 0.4765625, + "learning_rate": 4.794645947012777e-06, + "loss": 2.3949, + "step": 14541 + }, + { + "epoch": 0.7801502145922746, + "grad_norm": 0.58203125, + "learning_rate": 4.7946114636519346e-06, + "loss": 2.3206, + "step": 14542 + }, + { + "epoch": 0.7802038626609442, + "grad_norm": 0.52734375, + "learning_rate": 4.794576977520109e-06, + "loss": 2.4687, + "step": 14543 + }, + { + "epoch": 0.7802575107296137, + "grad_norm": 0.39453125, + "learning_rate": 4.794542488617345e-06, + "loss": 2.2009, + "step": 14544 + }, + { + "epoch": 0.7803111587982833, + "grad_norm": 0.419921875, + "learning_rate": 4.794507996943684e-06, + "loss": 2.2605, + "step": 14545 + }, + { + "epoch": 0.7803648068669528, + "grad_norm": 0.42578125, + "learning_rate": 4.794473502499164e-06, + "loss": 2.2441, + "step": 14546 + }, + { + "epoch": 0.7804184549356223, + "grad_norm": 0.625, + "learning_rate": 4.794439005283831e-06, + "loss": 2.3667, + "step": 14547 + }, + { + "epoch": 0.7804721030042918, + "grad_norm": 0.474609375, + "learning_rate": 4.794404505297724e-06, + "loss": 2.5692, + "step": 14548 + }, + { + "epoch": 0.7805257510729614, + "grad_norm": 0.45703125, + "learning_rate": 4.794370002540886e-06, + "loss": 2.3148, + "step": 14549 + }, + { + "epoch": 0.7805793991416309, + "grad_norm": 0.48046875, + "learning_rate": 4.794335497013358e-06, + "loss": 2.2902, + "step": 14550 + }, + { + "epoch": 0.7806330472103005, + "grad_norm": 0.51171875, + "learning_rate": 4.794300988715183e-06, + "loss": 2.0916, + "step": 14551 + }, + { + "epoch": 0.7806866952789699, + "grad_norm": 0.52734375, + "learning_rate": 4.7942664776464e-06, + "loss": 2.286, + "step": 14552 + }, + { + "epoch": 0.7807403433476395, + "grad_norm": 0.44140625, + "learning_rate": 4.794231963807052e-06, + "loss": 2.0869, + "step": 14553 + }, + { + "epoch": 0.780793991416309, + "grad_norm": 0.5625, + "learning_rate": 4.794197447197182e-06, + "loss": 2.4849, + "step": 14554 + }, + { + "epoch": 0.7808476394849786, + "grad_norm": 0.5078125, + "learning_rate": 4.79416292781683e-06, + "loss": 2.5819, + "step": 14555 + }, + { + "epoch": 0.7809012875536481, + "grad_norm": 0.478515625, + "learning_rate": 4.794128405666039e-06, + "loss": 2.5375, + "step": 14556 + }, + { + "epoch": 0.7809549356223175, + "grad_norm": 0.42578125, + "learning_rate": 4.794093880744849e-06, + "loss": 2.306, + "step": 14557 + }, + { + "epoch": 0.7810085836909871, + "grad_norm": 0.435546875, + "learning_rate": 4.794059353053303e-06, + "loss": 2.2116, + "step": 14558 + }, + { + "epoch": 0.7810622317596566, + "grad_norm": 0.451171875, + "learning_rate": 4.794024822591442e-06, + "loss": 2.3188, + "step": 14559 + }, + { + "epoch": 0.7811158798283262, + "grad_norm": 0.4609375, + "learning_rate": 4.793990289359308e-06, + "loss": 2.2114, + "step": 14560 + }, + { + "epoch": 0.7811695278969957, + "grad_norm": 0.6328125, + "learning_rate": 4.793955753356943e-06, + "loss": 2.4891, + "step": 14561 + }, + { + "epoch": 0.7812231759656653, + "grad_norm": 0.60546875, + "learning_rate": 4.793921214584388e-06, + "loss": 2.3644, + "step": 14562 + }, + { + "epoch": 0.7812768240343347, + "grad_norm": 1.3828125, + "learning_rate": 4.793886673041686e-06, + "loss": 2.2976, + "step": 14563 + }, + { + "epoch": 0.7813304721030043, + "grad_norm": 0.46875, + "learning_rate": 4.793852128728878e-06, + "loss": 2.2281, + "step": 14564 + }, + { + "epoch": 0.7813841201716738, + "grad_norm": 0.48046875, + "learning_rate": 4.793817581646004e-06, + "loss": 2.4457, + "step": 14565 + }, + { + "epoch": 0.7814377682403434, + "grad_norm": 0.412109375, + "learning_rate": 4.793783031793109e-06, + "loss": 1.7771, + "step": 14566 + }, + { + "epoch": 0.7814914163090129, + "grad_norm": 0.494140625, + "learning_rate": 4.793748479170232e-06, + "loss": 2.0835, + "step": 14567 + }, + { + "epoch": 0.7815450643776825, + "grad_norm": 0.47265625, + "learning_rate": 4.793713923777416e-06, + "loss": 2.2676, + "step": 14568 + }, + { + "epoch": 0.7815987124463519, + "grad_norm": 0.6953125, + "learning_rate": 4.793679365614703e-06, + "loss": 2.0326, + "step": 14569 + }, + { + "epoch": 0.7816523605150214, + "grad_norm": 0.50390625, + "learning_rate": 4.793644804682134e-06, + "loss": 2.3929, + "step": 14570 + }, + { + "epoch": 0.781706008583691, + "grad_norm": 0.46875, + "learning_rate": 4.79361024097975e-06, + "loss": 2.4483, + "step": 14571 + }, + { + "epoch": 0.7817596566523605, + "grad_norm": 0.419921875, + "learning_rate": 4.7935756745075944e-06, + "loss": 2.1311, + "step": 14572 + }, + { + "epoch": 0.7818133047210301, + "grad_norm": 0.50390625, + "learning_rate": 4.793541105265709e-06, + "loss": 2.4549, + "step": 14573 + }, + { + "epoch": 0.7818669527896995, + "grad_norm": 1.0625, + "learning_rate": 4.793506533254133e-06, + "loss": 2.1898, + "step": 14574 + }, + { + "epoch": 0.7819206008583691, + "grad_norm": 0.431640625, + "learning_rate": 4.79347195847291e-06, + "loss": 2.317, + "step": 14575 + }, + { + "epoch": 0.7819742489270386, + "grad_norm": 0.48828125, + "learning_rate": 4.793437380922084e-06, + "loss": 2.1841, + "step": 14576 + }, + { + "epoch": 0.7820278969957082, + "grad_norm": 0.62109375, + "learning_rate": 4.793402800601692e-06, + "loss": 2.3028, + "step": 14577 + }, + { + "epoch": 0.7820815450643777, + "grad_norm": 0.498046875, + "learning_rate": 4.793368217511779e-06, + "loss": 2.1613, + "step": 14578 + }, + { + "epoch": 0.7821351931330472, + "grad_norm": 0.466796875, + "learning_rate": 4.7933336316523865e-06, + "loss": 2.232, + "step": 14579 + }, + { + "epoch": 0.7821888412017167, + "grad_norm": 0.490234375, + "learning_rate": 4.7932990430235555e-06, + "loss": 2.3265, + "step": 14580 + }, + { + "epoch": 0.7822424892703863, + "grad_norm": 0.37890625, + "learning_rate": 4.793264451625328e-06, + "loss": 2.2934, + "step": 14581 + }, + { + "epoch": 0.7822961373390558, + "grad_norm": 0.484375, + "learning_rate": 4.793229857457745e-06, + "loss": 2.3285, + "step": 14582 + }, + { + "epoch": 0.7823497854077254, + "grad_norm": 0.466796875, + "learning_rate": 4.79319526052085e-06, + "loss": 2.277, + "step": 14583 + }, + { + "epoch": 0.7824034334763948, + "grad_norm": 0.41796875, + "learning_rate": 4.793160660814683e-06, + "loss": 2.3666, + "step": 14584 + }, + { + "epoch": 0.7824570815450643, + "grad_norm": 0.458984375, + "learning_rate": 4.793126058339287e-06, + "loss": 2.1824, + "step": 14585 + }, + { + "epoch": 0.7825107296137339, + "grad_norm": 0.46875, + "learning_rate": 4.7930914530947035e-06, + "loss": 2.3356, + "step": 14586 + }, + { + "epoch": 0.7825643776824034, + "grad_norm": 0.45703125, + "learning_rate": 4.793056845080974e-06, + "loss": 2.3284, + "step": 14587 + }, + { + "epoch": 0.782618025751073, + "grad_norm": 0.435546875, + "learning_rate": 4.79302223429814e-06, + "loss": 2.1021, + "step": 14588 + }, + { + "epoch": 0.7826716738197425, + "grad_norm": 0.37890625, + "learning_rate": 4.792987620746245e-06, + "loss": 2.3638, + "step": 14589 + }, + { + "epoch": 0.782725321888412, + "grad_norm": 0.5078125, + "learning_rate": 4.792953004425328e-06, + "loss": 2.0912, + "step": 14590 + }, + { + "epoch": 0.7827789699570815, + "grad_norm": 0.400390625, + "learning_rate": 4.792918385335433e-06, + "loss": 2.0514, + "step": 14591 + }, + { + "epoch": 0.7828326180257511, + "grad_norm": 0.609375, + "learning_rate": 4.792883763476601e-06, + "loss": 2.137, + "step": 14592 + }, + { + "epoch": 0.7828862660944206, + "grad_norm": 1.3046875, + "learning_rate": 4.7928491388488755e-06, + "loss": 2.2562, + "step": 14593 + }, + { + "epoch": 0.7829399141630902, + "grad_norm": 0.3671875, + "learning_rate": 4.792814511452295e-06, + "loss": 2.1781, + "step": 14594 + }, + { + "epoch": 0.7829935622317596, + "grad_norm": 0.4140625, + "learning_rate": 4.792779881286904e-06, + "loss": 2.1157, + "step": 14595 + }, + { + "epoch": 0.7830472103004292, + "grad_norm": 0.44921875, + "learning_rate": 4.792745248352742e-06, + "loss": 2.3413, + "step": 14596 + }, + { + "epoch": 0.7831008583690987, + "grad_norm": 0.478515625, + "learning_rate": 4.792710612649854e-06, + "loss": 2.2012, + "step": 14597 + }, + { + "epoch": 0.7831545064377683, + "grad_norm": 0.50390625, + "learning_rate": 4.7926759741782794e-06, + "loss": 2.3261, + "step": 14598 + }, + { + "epoch": 0.7832081545064378, + "grad_norm": 0.45703125, + "learning_rate": 4.79264133293806e-06, + "loss": 2.2995, + "step": 14599 + }, + { + "epoch": 0.7832618025751072, + "grad_norm": 0.435546875, + "learning_rate": 4.79260668892924e-06, + "loss": 2.2485, + "step": 14600 + }, + { + "epoch": 0.7833154506437768, + "grad_norm": 0.400390625, + "learning_rate": 4.792572042151858e-06, + "loss": 2.1225, + "step": 14601 + }, + { + "epoch": 0.7833690987124463, + "grad_norm": 0.478515625, + "learning_rate": 4.7925373926059584e-06, + "loss": 2.323, + "step": 14602 + }, + { + "epoch": 0.7834227467811159, + "grad_norm": 0.416015625, + "learning_rate": 4.792502740291581e-06, + "loss": 2.1535, + "step": 14603 + }, + { + "epoch": 0.7834763948497854, + "grad_norm": 0.5, + "learning_rate": 4.79246808520877e-06, + "loss": 2.1792, + "step": 14604 + }, + { + "epoch": 0.783530042918455, + "grad_norm": 0.515625, + "learning_rate": 4.792433427357565e-06, + "loss": 2.2012, + "step": 14605 + }, + { + "epoch": 0.7835836909871244, + "grad_norm": 0.5703125, + "learning_rate": 4.792398766738009e-06, + "loss": 2.5013, + "step": 14606 + }, + { + "epoch": 0.783637339055794, + "grad_norm": 0.51953125, + "learning_rate": 4.792364103350145e-06, + "loss": 2.322, + "step": 14607 + }, + { + "epoch": 0.7836909871244635, + "grad_norm": 0.474609375, + "learning_rate": 4.792329437194011e-06, + "loss": 2.3374, + "step": 14608 + }, + { + "epoch": 0.7837446351931331, + "grad_norm": 0.423828125, + "learning_rate": 4.792294768269653e-06, + "loss": 2.5436, + "step": 14609 + }, + { + "epoch": 0.7837982832618026, + "grad_norm": 0.484375, + "learning_rate": 4.792260096577111e-06, + "loss": 2.4556, + "step": 14610 + }, + { + "epoch": 0.7838519313304722, + "grad_norm": 0.45703125, + "learning_rate": 4.792225422116427e-06, + "loss": 2.2135, + "step": 14611 + }, + { + "epoch": 0.7839055793991416, + "grad_norm": 0.47265625, + "learning_rate": 4.792190744887643e-06, + "loss": 2.1305, + "step": 14612 + }, + { + "epoch": 0.7839592274678111, + "grad_norm": 0.458984375, + "learning_rate": 4.7921560648908e-06, + "loss": 2.3914, + "step": 14613 + }, + { + "epoch": 0.7840128755364807, + "grad_norm": 0.48046875, + "learning_rate": 4.792121382125943e-06, + "loss": 2.2441, + "step": 14614 + }, + { + "epoch": 0.7840665236051502, + "grad_norm": 0.388671875, + "learning_rate": 4.79208669659311e-06, + "loss": 1.7687, + "step": 14615 + }, + { + "epoch": 0.7841201716738198, + "grad_norm": 0.408203125, + "learning_rate": 4.792052008292345e-06, + "loss": 2.0941, + "step": 14616 + }, + { + "epoch": 0.7841738197424892, + "grad_norm": 0.462890625, + "learning_rate": 4.792017317223689e-06, + "loss": 2.3881, + "step": 14617 + }, + { + "epoch": 0.7842274678111588, + "grad_norm": 0.484375, + "learning_rate": 4.791982623387185e-06, + "loss": 2.3265, + "step": 14618 + }, + { + "epoch": 0.7842811158798283, + "grad_norm": 0.470703125, + "learning_rate": 4.791947926782873e-06, + "loss": 2.22, + "step": 14619 + }, + { + "epoch": 0.7843347639484979, + "grad_norm": 0.44140625, + "learning_rate": 4.791913227410797e-06, + "loss": 2.2368, + "step": 14620 + }, + { + "epoch": 0.7843884120171674, + "grad_norm": 0.431640625, + "learning_rate": 4.791878525270998e-06, + "loss": 2.1366, + "step": 14621 + }, + { + "epoch": 0.784442060085837, + "grad_norm": 0.466796875, + "learning_rate": 4.791843820363517e-06, + "loss": 2.2587, + "step": 14622 + }, + { + "epoch": 0.7844957081545064, + "grad_norm": 0.52734375, + "learning_rate": 4.791809112688398e-06, + "loss": 2.4076, + "step": 14623 + }, + { + "epoch": 0.784549356223176, + "grad_norm": 0.53515625, + "learning_rate": 4.791774402245681e-06, + "loss": 2.3506, + "step": 14624 + }, + { + "epoch": 0.7846030042918455, + "grad_norm": 0.59765625, + "learning_rate": 4.791739689035408e-06, + "loss": 2.2251, + "step": 14625 + }, + { + "epoch": 0.7846566523605151, + "grad_norm": 0.53515625, + "learning_rate": 4.791704973057623e-06, + "loss": 2.4132, + "step": 14626 + }, + { + "epoch": 0.7847103004291845, + "grad_norm": 0.44921875, + "learning_rate": 4.791670254312366e-06, + "loss": 2.2336, + "step": 14627 + }, + { + "epoch": 0.784763948497854, + "grad_norm": 0.80078125, + "learning_rate": 4.791635532799679e-06, + "loss": 2.411, + "step": 14628 + }, + { + "epoch": 0.7848175965665236, + "grad_norm": 0.412109375, + "learning_rate": 4.791600808519605e-06, + "loss": 2.3299, + "step": 14629 + }, + { + "epoch": 0.7848712446351931, + "grad_norm": 0.490234375, + "learning_rate": 4.791566081472185e-06, + "loss": 2.2813, + "step": 14630 + }, + { + "epoch": 0.7849248927038627, + "grad_norm": 0.470703125, + "learning_rate": 4.7915313516574616e-06, + "loss": 2.1457, + "step": 14631 + }, + { + "epoch": 0.7849785407725322, + "grad_norm": 0.41796875, + "learning_rate": 4.791496619075475e-06, + "loss": 2.1753, + "step": 14632 + }, + { + "epoch": 0.7850321888412017, + "grad_norm": 0.91015625, + "learning_rate": 4.79146188372627e-06, + "loss": 2.5532, + "step": 14633 + }, + { + "epoch": 0.7850858369098712, + "grad_norm": 0.49609375, + "learning_rate": 4.791427145609886e-06, + "loss": 2.0606, + "step": 14634 + }, + { + "epoch": 0.7851394849785408, + "grad_norm": 0.439453125, + "learning_rate": 4.791392404726367e-06, + "loss": 2.3166, + "step": 14635 + }, + { + "epoch": 0.7851931330472103, + "grad_norm": 0.484375, + "learning_rate": 4.791357661075753e-06, + "loss": 2.2268, + "step": 14636 + }, + { + "epoch": 0.7852467811158799, + "grad_norm": 0.4296875, + "learning_rate": 4.791322914658088e-06, + "loss": 2.314, + "step": 14637 + }, + { + "epoch": 0.7853004291845493, + "grad_norm": 0.447265625, + "learning_rate": 4.791288165473411e-06, + "loss": 2.0048, + "step": 14638 + }, + { + "epoch": 0.7853540772532189, + "grad_norm": 0.41015625, + "learning_rate": 4.791253413521768e-06, + "loss": 2.1938, + "step": 14639 + }, + { + "epoch": 0.7854077253218884, + "grad_norm": 0.474609375, + "learning_rate": 4.791218658803197e-06, + "loss": 2.2015, + "step": 14640 + }, + { + "epoch": 0.785461373390558, + "grad_norm": 0.51953125, + "learning_rate": 4.791183901317743e-06, + "loss": 2.2636, + "step": 14641 + }, + { + "epoch": 0.7855150214592275, + "grad_norm": 0.466796875, + "learning_rate": 4.791149141065447e-06, + "loss": 2.4919, + "step": 14642 + }, + { + "epoch": 0.785568669527897, + "grad_norm": 0.69140625, + "learning_rate": 4.791114378046349e-06, + "loss": 2.2087, + "step": 14643 + }, + { + "epoch": 0.7856223175965665, + "grad_norm": 0.5703125, + "learning_rate": 4.791079612260494e-06, + "loss": 2.1224, + "step": 14644 + }, + { + "epoch": 0.785675965665236, + "grad_norm": 0.5546875, + "learning_rate": 4.791044843707922e-06, + "loss": 2.0675, + "step": 14645 + }, + { + "epoch": 0.7857296137339056, + "grad_norm": 0.484375, + "learning_rate": 4.791010072388676e-06, + "loss": 2.1578, + "step": 14646 + }, + { + "epoch": 0.7857832618025751, + "grad_norm": 0.474609375, + "learning_rate": 4.790975298302798e-06, + "loss": 2.1783, + "step": 14647 + }, + { + "epoch": 0.7858369098712447, + "grad_norm": 0.99609375, + "learning_rate": 4.790940521450329e-06, + "loss": 2.3515, + "step": 14648 + }, + { + "epoch": 0.7858905579399141, + "grad_norm": 0.50390625, + "learning_rate": 4.790905741831312e-06, + "loss": 2.3275, + "step": 14649 + }, + { + "epoch": 0.7859442060085837, + "grad_norm": 0.482421875, + "learning_rate": 4.790870959445788e-06, + "loss": 2.1106, + "step": 14650 + }, + { + "epoch": 0.7859978540772532, + "grad_norm": 0.474609375, + "learning_rate": 4.7908361742938e-06, + "loss": 2.3451, + "step": 14651 + }, + { + "epoch": 0.7860515021459228, + "grad_norm": 0.451171875, + "learning_rate": 4.79080138637539e-06, + "loss": 2.1077, + "step": 14652 + }, + { + "epoch": 0.7861051502145923, + "grad_norm": 0.451171875, + "learning_rate": 4.7907665956906e-06, + "loss": 2.2702, + "step": 14653 + }, + { + "epoch": 0.7861587982832619, + "grad_norm": 0.435546875, + "learning_rate": 4.790731802239471e-06, + "loss": 2.1435, + "step": 14654 + }, + { + "epoch": 0.7862124463519313, + "grad_norm": 0.4296875, + "learning_rate": 4.790697006022046e-06, + "loss": 2.2483, + "step": 14655 + }, + { + "epoch": 0.7862660944206008, + "grad_norm": 0.4453125, + "learning_rate": 4.790662207038367e-06, + "loss": 2.3219, + "step": 14656 + }, + { + "epoch": 0.7863197424892704, + "grad_norm": 0.51953125, + "learning_rate": 4.7906274052884746e-06, + "loss": 1.9913, + "step": 14657 + }, + { + "epoch": 0.7863733905579399, + "grad_norm": 0.7578125, + "learning_rate": 4.790592600772413e-06, + "loss": 2.4476, + "step": 14658 + }, + { + "epoch": 0.7864270386266095, + "grad_norm": 0.46484375, + "learning_rate": 4.790557793490223e-06, + "loss": 2.0521, + "step": 14659 + }, + { + "epoch": 0.7864806866952789, + "grad_norm": 0.5625, + "learning_rate": 4.790522983441946e-06, + "loss": 2.195, + "step": 14660 + }, + { + "epoch": 0.7865343347639485, + "grad_norm": 0.494140625, + "learning_rate": 4.790488170627626e-06, + "loss": 2.1466, + "step": 14661 + }, + { + "epoch": 0.786587982832618, + "grad_norm": 0.392578125, + "learning_rate": 4.790453355047304e-06, + "loss": 1.8616, + "step": 14662 + }, + { + "epoch": 0.7866416309012876, + "grad_norm": 0.4921875, + "learning_rate": 4.790418536701022e-06, + "loss": 2.3934, + "step": 14663 + }, + { + "epoch": 0.7866952789699571, + "grad_norm": 0.376953125, + "learning_rate": 4.79038371558882e-06, + "loss": 2.0744, + "step": 14664 + }, + { + "epoch": 0.7867489270386266, + "grad_norm": 0.53515625, + "learning_rate": 4.790348891710745e-06, + "loss": 2.187, + "step": 14665 + }, + { + "epoch": 0.7868025751072961, + "grad_norm": 0.53125, + "learning_rate": 4.790314065066834e-06, + "loss": 2.5473, + "step": 14666 + }, + { + "epoch": 0.7868562231759657, + "grad_norm": 0.435546875, + "learning_rate": 4.790279235657131e-06, + "loss": 2.2804, + "step": 14667 + }, + { + "epoch": 0.7869098712446352, + "grad_norm": 0.5078125, + "learning_rate": 4.790244403481679e-06, + "loss": 2.4452, + "step": 14668 + }, + { + "epoch": 0.7869635193133048, + "grad_norm": 0.46875, + "learning_rate": 4.790209568540519e-06, + "loss": 2.3546, + "step": 14669 + }, + { + "epoch": 0.7870171673819742, + "grad_norm": 1.078125, + "learning_rate": 4.790174730833694e-06, + "loss": 2.1197, + "step": 14670 + }, + { + "epoch": 0.7870708154506437, + "grad_norm": 0.39453125, + "learning_rate": 4.790139890361245e-06, + "loss": 2.1185, + "step": 14671 + }, + { + "epoch": 0.7871244635193133, + "grad_norm": 0.470703125, + "learning_rate": 4.790105047123214e-06, + "loss": 2.341, + "step": 14672 + }, + { + "epoch": 0.7871781115879828, + "grad_norm": 0.4375, + "learning_rate": 4.790070201119644e-06, + "loss": 2.2212, + "step": 14673 + }, + { + "epoch": 0.7872317596566524, + "grad_norm": 0.41015625, + "learning_rate": 4.790035352350577e-06, + "loss": 2.24, + "step": 14674 + }, + { + "epoch": 0.7872854077253219, + "grad_norm": 0.4765625, + "learning_rate": 4.790000500816054e-06, + "loss": 1.608, + "step": 14675 + }, + { + "epoch": 0.7873390557939914, + "grad_norm": 0.4453125, + "learning_rate": 4.789965646516118e-06, + "loss": 2.186, + "step": 14676 + }, + { + "epoch": 0.7873927038626609, + "grad_norm": 0.51171875, + "learning_rate": 4.7899307894508115e-06, + "loss": 2.2727, + "step": 14677 + }, + { + "epoch": 0.7874463519313305, + "grad_norm": 0.52734375, + "learning_rate": 4.789895929620175e-06, + "loss": 2.6985, + "step": 14678 + }, + { + "epoch": 0.7875, + "grad_norm": 0.482421875, + "learning_rate": 4.789861067024253e-06, + "loss": 2.4237, + "step": 14679 + }, + { + "epoch": 0.7875536480686696, + "grad_norm": 0.40625, + "learning_rate": 4.7898262016630844e-06, + "loss": 2.1956, + "step": 14680 + }, + { + "epoch": 0.787607296137339, + "grad_norm": 0.484375, + "learning_rate": 4.789791333536715e-06, + "loss": 2.3287, + "step": 14681 + }, + { + "epoch": 0.7876609442060086, + "grad_norm": 0.47265625, + "learning_rate": 4.789756462645183e-06, + "loss": 2.2545, + "step": 14682 + }, + { + "epoch": 0.7877145922746781, + "grad_norm": 0.4765625, + "learning_rate": 4.789721588988534e-06, + "loss": 2.2114, + "step": 14683 + }, + { + "epoch": 0.7877682403433477, + "grad_norm": 0.37890625, + "learning_rate": 4.789686712566809e-06, + "loss": 2.1673, + "step": 14684 + }, + { + "epoch": 0.7878218884120172, + "grad_norm": 0.45703125, + "learning_rate": 4.789651833380049e-06, + "loss": 2.5978, + "step": 14685 + }, + { + "epoch": 0.7878755364806866, + "grad_norm": 0.435546875, + "learning_rate": 4.789616951428296e-06, + "loss": 2.0426, + "step": 14686 + }, + { + "epoch": 0.7879291845493562, + "grad_norm": 0.46875, + "learning_rate": 4.789582066711595e-06, + "loss": 2.4358, + "step": 14687 + }, + { + "epoch": 0.7879828326180257, + "grad_norm": 0.451171875, + "learning_rate": 4.7895471792299845e-06, + "loss": 2.2802, + "step": 14688 + }, + { + "epoch": 0.7880364806866953, + "grad_norm": 0.427734375, + "learning_rate": 4.789512288983508e-06, + "loss": 2.2262, + "step": 14689 + }, + { + "epoch": 0.7880901287553648, + "grad_norm": 0.62109375, + "learning_rate": 4.789477395972208e-06, + "loss": 2.1964, + "step": 14690 + }, + { + "epoch": 0.7881437768240344, + "grad_norm": 0.71484375, + "learning_rate": 4.789442500196128e-06, + "loss": 2.3911, + "step": 14691 + }, + { + "epoch": 0.7881974248927038, + "grad_norm": 0.49609375, + "learning_rate": 4.789407601655307e-06, + "loss": 2.2212, + "step": 14692 + }, + { + "epoch": 0.7882510729613734, + "grad_norm": 0.51171875, + "learning_rate": 4.789372700349791e-06, + "loss": 2.3366, + "step": 14693 + }, + { + "epoch": 0.7883047210300429, + "grad_norm": 0.36328125, + "learning_rate": 4.789337796279617e-06, + "loss": 2.1454, + "step": 14694 + }, + { + "epoch": 0.7883583690987125, + "grad_norm": 0.53515625, + "learning_rate": 4.789302889444833e-06, + "loss": 2.3455, + "step": 14695 + }, + { + "epoch": 0.788412017167382, + "grad_norm": 0.458984375, + "learning_rate": 4.789267979845476e-06, + "loss": 2.3686, + "step": 14696 + }, + { + "epoch": 0.7884656652360515, + "grad_norm": 0.498046875, + "learning_rate": 4.789233067481591e-06, + "loss": 2.0671, + "step": 14697 + }, + { + "epoch": 0.788519313304721, + "grad_norm": 0.474609375, + "learning_rate": 4.78919815235322e-06, + "loss": 2.4455, + "step": 14698 + }, + { + "epoch": 0.7885729613733906, + "grad_norm": 0.47265625, + "learning_rate": 4.789163234460405e-06, + "loss": 2.4577, + "step": 14699 + }, + { + "epoch": 0.7886266094420601, + "grad_norm": 0.482421875, + "learning_rate": 4.789128313803187e-06, + "loss": 1.8383, + "step": 14700 + }, + { + "epoch": 0.7886802575107296, + "grad_norm": 0.46484375, + "learning_rate": 4.789093390381609e-06, + "loss": 2.0105, + "step": 14701 + }, + { + "epoch": 0.7887339055793992, + "grad_norm": 0.49609375, + "learning_rate": 4.7890584641957134e-06, + "loss": 2.3868, + "step": 14702 + }, + { + "epoch": 0.7887875536480686, + "grad_norm": 0.400390625, + "learning_rate": 4.7890235352455436e-06, + "loss": 2.4835, + "step": 14703 + }, + { + "epoch": 0.7888412017167382, + "grad_norm": 1.40625, + "learning_rate": 4.788988603531139e-06, + "loss": 2.1504, + "step": 14704 + }, + { + "epoch": 0.7888948497854077, + "grad_norm": 0.482421875, + "learning_rate": 4.788953669052544e-06, + "loss": 2.3959, + "step": 14705 + }, + { + "epoch": 0.7889484978540773, + "grad_norm": 0.447265625, + "learning_rate": 4.788918731809799e-06, + "loss": 2.056, + "step": 14706 + }, + { + "epoch": 0.7890021459227468, + "grad_norm": 0.423828125, + "learning_rate": 4.788883791802948e-06, + "loss": 2.236, + "step": 14707 + }, + { + "epoch": 0.7890557939914163, + "grad_norm": 0.46484375, + "learning_rate": 4.788848849032032e-06, + "loss": 2.2771, + "step": 14708 + }, + { + "epoch": 0.7891094420600858, + "grad_norm": 0.435546875, + "learning_rate": 4.788813903497094e-06, + "loss": 2.5129, + "step": 14709 + }, + { + "epoch": 0.7891630901287554, + "grad_norm": 0.4765625, + "learning_rate": 4.788778955198174e-06, + "loss": 2.377, + "step": 14710 + }, + { + "epoch": 0.7892167381974249, + "grad_norm": 0.40234375, + "learning_rate": 4.788744004135318e-06, + "loss": 2.1584, + "step": 14711 + }, + { + "epoch": 0.7892703862660945, + "grad_norm": 0.4765625, + "learning_rate": 4.788709050308566e-06, + "loss": 2.1859, + "step": 14712 + }, + { + "epoch": 0.789324034334764, + "grad_norm": 0.61328125, + "learning_rate": 4.788674093717959e-06, + "loss": 2.2791, + "step": 14713 + }, + { + "epoch": 0.7893776824034334, + "grad_norm": 0.609375, + "learning_rate": 4.7886391343635415e-06, + "loss": 2.0221, + "step": 14714 + }, + { + "epoch": 0.789431330472103, + "grad_norm": 0.53515625, + "learning_rate": 4.788604172245355e-06, + "loss": 2.3941, + "step": 14715 + }, + { + "epoch": 0.7894849785407725, + "grad_norm": 0.55078125, + "learning_rate": 4.7885692073634415e-06, + "loss": 2.2268, + "step": 14716 + }, + { + "epoch": 0.7895386266094421, + "grad_norm": 0.515625, + "learning_rate": 4.788534239717843e-06, + "loss": 2.2491, + "step": 14717 + }, + { + "epoch": 0.7895922746781115, + "grad_norm": 0.48828125, + "learning_rate": 4.788499269308602e-06, + "loss": 2.0192, + "step": 14718 + }, + { + "epoch": 0.7896459227467811, + "grad_norm": 0.498046875, + "learning_rate": 4.78846429613576e-06, + "loss": 2.3247, + "step": 14719 + }, + { + "epoch": 0.7896995708154506, + "grad_norm": 0.458984375, + "learning_rate": 4.788429320199361e-06, + "loss": 2.3875, + "step": 14720 + }, + { + "epoch": 0.7897532188841202, + "grad_norm": 0.5234375, + "learning_rate": 4.788394341499446e-06, + "loss": 2.3601, + "step": 14721 + }, + { + "epoch": 0.7898068669527897, + "grad_norm": 0.57421875, + "learning_rate": 4.788359360036057e-06, + "loss": 1.6083, + "step": 14722 + }, + { + "epoch": 0.7898605150214593, + "grad_norm": 0.470703125, + "learning_rate": 4.788324375809237e-06, + "loss": 2.3425, + "step": 14723 + }, + { + "epoch": 0.7899141630901287, + "grad_norm": 0.4765625, + "learning_rate": 4.788289388819027e-06, + "loss": 2.1538, + "step": 14724 + }, + { + "epoch": 0.7899678111587983, + "grad_norm": 0.41796875, + "learning_rate": 4.788254399065472e-06, + "loss": 2.2972, + "step": 14725 + }, + { + "epoch": 0.7900214592274678, + "grad_norm": 0.427734375, + "learning_rate": 4.788219406548611e-06, + "loss": 2.2272, + "step": 14726 + }, + { + "epoch": 0.7900751072961374, + "grad_norm": 0.5859375, + "learning_rate": 4.788184411268488e-06, + "loss": 2.2821, + "step": 14727 + }, + { + "epoch": 0.7901287553648069, + "grad_norm": 0.357421875, + "learning_rate": 4.788149413225145e-06, + "loss": 2.1045, + "step": 14728 + }, + { + "epoch": 0.7901824034334763, + "grad_norm": 0.435546875, + "learning_rate": 4.788114412418624e-06, + "loss": 1.955, + "step": 14729 + }, + { + "epoch": 0.7902360515021459, + "grad_norm": 0.478515625, + "learning_rate": 4.788079408848968e-06, + "loss": 2.2621, + "step": 14730 + }, + { + "epoch": 0.7902896995708154, + "grad_norm": 0.3984375, + "learning_rate": 4.788044402516218e-06, + "loss": 2.2204, + "step": 14731 + }, + { + "epoch": 0.790343347639485, + "grad_norm": 0.50390625, + "learning_rate": 4.788009393420417e-06, + "loss": 2.2484, + "step": 14732 + }, + { + "epoch": 0.7903969957081545, + "grad_norm": 0.578125, + "learning_rate": 4.787974381561607e-06, + "loss": 2.5005, + "step": 14733 + }, + { + "epoch": 0.7904506437768241, + "grad_norm": 0.48828125, + "learning_rate": 4.787939366939832e-06, + "loss": 2.2152, + "step": 14734 + }, + { + "epoch": 0.7905042918454935, + "grad_norm": 0.37109375, + "learning_rate": 4.787904349555133e-06, + "loss": 1.9078, + "step": 14735 + }, + { + "epoch": 0.7905579399141631, + "grad_norm": 0.408203125, + "learning_rate": 4.78786932940755e-06, + "loss": 2.1781, + "step": 14736 + }, + { + "epoch": 0.7906115879828326, + "grad_norm": 0.42578125, + "learning_rate": 4.7878343064971286e-06, + "loss": 2.2193, + "step": 14737 + }, + { + "epoch": 0.7906652360515022, + "grad_norm": 0.43359375, + "learning_rate": 4.787799280823911e-06, + "loss": 2.3697, + "step": 14738 + }, + { + "epoch": 0.7907188841201717, + "grad_norm": 0.55859375, + "learning_rate": 4.7877642523879365e-06, + "loss": 2.3706, + "step": 14739 + }, + { + "epoch": 0.7907725321888412, + "grad_norm": 0.453125, + "learning_rate": 4.78772922118925e-06, + "loss": 2.2196, + "step": 14740 + }, + { + "epoch": 0.7908261802575107, + "grad_norm": 0.423828125, + "learning_rate": 4.7876941872278935e-06, + "loss": 2.1778, + "step": 14741 + }, + { + "epoch": 0.7908798283261803, + "grad_norm": 0.412109375, + "learning_rate": 4.7876591505039085e-06, + "loss": 2.2161, + "step": 14742 + }, + { + "epoch": 0.7909334763948498, + "grad_norm": 0.52734375, + "learning_rate": 4.787624111017338e-06, + "loss": 2.3273, + "step": 14743 + }, + { + "epoch": 0.7909871244635193, + "grad_norm": 0.484375, + "learning_rate": 4.7875890687682244e-06, + "loss": 2.2548, + "step": 14744 + }, + { + "epoch": 0.7910407725321889, + "grad_norm": 0.50390625, + "learning_rate": 4.787554023756609e-06, + "loss": 2.3058, + "step": 14745 + }, + { + "epoch": 0.7910944206008583, + "grad_norm": 0.49609375, + "learning_rate": 4.787518975982535e-06, + "loss": 2.3265, + "step": 14746 + }, + { + "epoch": 0.7911480686695279, + "grad_norm": 0.4921875, + "learning_rate": 4.7874839254460445e-06, + "loss": 2.244, + "step": 14747 + }, + { + "epoch": 0.7912017167381974, + "grad_norm": 0.447265625, + "learning_rate": 4.78744887214718e-06, + "loss": 2.2606, + "step": 14748 + }, + { + "epoch": 0.791255364806867, + "grad_norm": 0.447265625, + "learning_rate": 4.787413816085983e-06, + "loss": 2.3252, + "step": 14749 + }, + { + "epoch": 0.7913090128755365, + "grad_norm": 0.73828125, + "learning_rate": 4.787378757262497e-06, + "loss": 2.3452, + "step": 14750 + }, + { + "epoch": 0.791362660944206, + "grad_norm": 0.44140625, + "learning_rate": 4.787343695676765e-06, + "loss": 2.4092, + "step": 14751 + }, + { + "epoch": 0.7914163090128755, + "grad_norm": 0.44921875, + "learning_rate": 4.787308631328826e-06, + "loss": 2.4481, + "step": 14752 + }, + { + "epoch": 0.7914699570815451, + "grad_norm": 0.8203125, + "learning_rate": 4.787273564218726e-06, + "loss": 2.2452, + "step": 14753 + }, + { + "epoch": 0.7915236051502146, + "grad_norm": 0.45703125, + "learning_rate": 4.787238494346506e-06, + "loss": 2.2263, + "step": 14754 + }, + { + "epoch": 0.7915772532188842, + "grad_norm": 0.45703125, + "learning_rate": 4.7872034217122074e-06, + "loss": 2.3889, + "step": 14755 + }, + { + "epoch": 0.7916309012875536, + "grad_norm": 0.4921875, + "learning_rate": 4.787168346315874e-06, + "loss": 2.2941, + "step": 14756 + }, + { + "epoch": 0.7916845493562231, + "grad_norm": 0.39453125, + "learning_rate": 4.787133268157547e-06, + "loss": 2.3878, + "step": 14757 + }, + { + "epoch": 0.7917381974248927, + "grad_norm": 0.474609375, + "learning_rate": 4.787098187237269e-06, + "loss": 2.181, + "step": 14758 + }, + { + "epoch": 0.7917918454935622, + "grad_norm": 0.38671875, + "learning_rate": 4.787063103555083e-06, + "loss": 1.8449, + "step": 14759 + }, + { + "epoch": 0.7918454935622318, + "grad_norm": 0.5703125, + "learning_rate": 4.787028017111032e-06, + "loss": 1.8323, + "step": 14760 + }, + { + "epoch": 0.7918991416309012, + "grad_norm": 0.388671875, + "learning_rate": 4.786992927905156e-06, + "loss": 2.1854, + "step": 14761 + }, + { + "epoch": 0.7919527896995708, + "grad_norm": 0.486328125, + "learning_rate": 4.786957835937499e-06, + "loss": 2.2068, + "step": 14762 + }, + { + "epoch": 0.7920064377682403, + "grad_norm": 1.234375, + "learning_rate": 4.786922741208103e-06, + "loss": 2.2273, + "step": 14763 + }, + { + "epoch": 0.7920600858369099, + "grad_norm": 0.423828125, + "learning_rate": 4.786887643717012e-06, + "loss": 2.2915, + "step": 14764 + }, + { + "epoch": 0.7921137339055794, + "grad_norm": 0.4921875, + "learning_rate": 4.786852543464266e-06, + "loss": 2.2017, + "step": 14765 + }, + { + "epoch": 0.792167381974249, + "grad_norm": 0.5234375, + "learning_rate": 4.786817440449908e-06, + "loss": 2.2464, + "step": 14766 + }, + { + "epoch": 0.7922210300429184, + "grad_norm": 0.431640625, + "learning_rate": 4.786782334673981e-06, + "loss": 2.3244, + "step": 14767 + }, + { + "epoch": 0.792274678111588, + "grad_norm": 0.6015625, + "learning_rate": 4.786747226136527e-06, + "loss": 1.4077, + "step": 14768 + }, + { + "epoch": 0.7923283261802575, + "grad_norm": 0.5078125, + "learning_rate": 4.7867121148375885e-06, + "loss": 2.1471, + "step": 14769 + }, + { + "epoch": 0.7923819742489271, + "grad_norm": 0.431640625, + "learning_rate": 4.786677000777208e-06, + "loss": 2.1905, + "step": 14770 + }, + { + "epoch": 0.7924356223175966, + "grad_norm": 0.51171875, + "learning_rate": 4.786641883955427e-06, + "loss": 2.3105, + "step": 14771 + }, + { + "epoch": 0.792489270386266, + "grad_norm": 0.515625, + "learning_rate": 4.78660676437229e-06, + "loss": 2.1614, + "step": 14772 + }, + { + "epoch": 0.7925429184549356, + "grad_norm": 0.4453125, + "learning_rate": 4.786571642027837e-06, + "loss": 2.3878, + "step": 14773 + }, + { + "epoch": 0.7925965665236051, + "grad_norm": 0.466796875, + "learning_rate": 4.786536516922112e-06, + "loss": 2.2979, + "step": 14774 + }, + { + "epoch": 0.7926502145922747, + "grad_norm": 0.458984375, + "learning_rate": 4.786501389055157e-06, + "loss": 2.2327, + "step": 14775 + }, + { + "epoch": 0.7927038626609442, + "grad_norm": 2.640625, + "learning_rate": 4.786466258427014e-06, + "loss": 2.4863, + "step": 14776 + }, + { + "epoch": 0.7927575107296138, + "grad_norm": 0.380859375, + "learning_rate": 4.786431125037726e-06, + "loss": 1.7651, + "step": 14777 + }, + { + "epoch": 0.7928111587982832, + "grad_norm": 0.8046875, + "learning_rate": 4.7863959888873355e-06, + "loss": 2.45, + "step": 14778 + }, + { + "epoch": 0.7928648068669528, + "grad_norm": 0.66015625, + "learning_rate": 4.786360849975884e-06, + "loss": 2.5973, + "step": 14779 + }, + { + "epoch": 0.7929184549356223, + "grad_norm": 0.5859375, + "learning_rate": 4.7863257083034156e-06, + "loss": 2.4694, + "step": 14780 + }, + { + "epoch": 0.7929721030042919, + "grad_norm": 0.6328125, + "learning_rate": 4.786290563869971e-06, + "loss": 1.6103, + "step": 14781 + }, + { + "epoch": 0.7930257510729614, + "grad_norm": 0.470703125, + "learning_rate": 4.786255416675593e-06, + "loss": 2.4431, + "step": 14782 + }, + { + "epoch": 0.793079399141631, + "grad_norm": 0.443359375, + "learning_rate": 4.786220266720325e-06, + "loss": 2.3214, + "step": 14783 + }, + { + "epoch": 0.7931330472103004, + "grad_norm": 0.50390625, + "learning_rate": 4.786185114004208e-06, + "loss": 2.2548, + "step": 14784 + }, + { + "epoch": 0.79318669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.7861499585272864e-06, + "loss": 2.3006, + "step": 14785 + }, + { + "epoch": 0.7932403433476395, + "grad_norm": 0.4921875, + "learning_rate": 4.786114800289601e-06, + "loss": 2.2927, + "step": 14786 + }, + { + "epoch": 0.793293991416309, + "grad_norm": 0.4375, + "learning_rate": 4.786079639291195e-06, + "loss": 2.0355, + "step": 14787 + }, + { + "epoch": 0.7933476394849786, + "grad_norm": 0.486328125, + "learning_rate": 4.78604447553211e-06, + "loss": 2.214, + "step": 14788 + }, + { + "epoch": 0.793401287553648, + "grad_norm": 0.4921875, + "learning_rate": 4.78600930901239e-06, + "loss": 2.5031, + "step": 14789 + }, + { + "epoch": 0.7934549356223176, + "grad_norm": 0.54296875, + "learning_rate": 4.785974139732076e-06, + "loss": 2.5963, + "step": 14790 + }, + { + "epoch": 0.7935085836909871, + "grad_norm": 0.51171875, + "learning_rate": 4.785938967691212e-06, + "loss": 2.0034, + "step": 14791 + }, + { + "epoch": 0.7935622317596567, + "grad_norm": 0.51953125, + "learning_rate": 4.785903792889838e-06, + "loss": 2.2829, + "step": 14792 + }, + { + "epoch": 0.7936158798283262, + "grad_norm": 0.5625, + "learning_rate": 4.785868615327999e-06, + "loss": 1.7131, + "step": 14793 + }, + { + "epoch": 0.7936695278969957, + "grad_norm": 0.427734375, + "learning_rate": 4.785833435005736e-06, + "loss": 2.2258, + "step": 14794 + }, + { + "epoch": 0.7937231759656652, + "grad_norm": 0.462890625, + "learning_rate": 4.785798251923093e-06, + "loss": 2.2309, + "step": 14795 + }, + { + "epoch": 0.7937768240343348, + "grad_norm": 0.46484375, + "learning_rate": 4.78576306608011e-06, + "loss": 2.3664, + "step": 14796 + }, + { + "epoch": 0.7938304721030043, + "grad_norm": 0.56640625, + "learning_rate": 4.785727877476832e-06, + "loss": 2.1314, + "step": 14797 + }, + { + "epoch": 0.7938841201716739, + "grad_norm": 0.416015625, + "learning_rate": 4.7856926861133005e-06, + "loss": 2.4253, + "step": 14798 + }, + { + "epoch": 0.7939377682403433, + "grad_norm": 0.435546875, + "learning_rate": 4.785657491989557e-06, + "loss": 1.9826, + "step": 14799 + }, + { + "epoch": 0.7939914163090128, + "grad_norm": 0.47265625, + "learning_rate": 4.785622295105646e-06, + "loss": 2.3667, + "step": 14800 + }, + { + "epoch": 0.7940450643776824, + "grad_norm": 0.5859375, + "learning_rate": 4.785587095461608e-06, + "loss": 2.3378, + "step": 14801 + }, + { + "epoch": 0.7940987124463519, + "grad_norm": 0.70703125, + "learning_rate": 4.7855518930574875e-06, + "loss": 2.6766, + "step": 14802 + }, + { + "epoch": 0.7941523605150215, + "grad_norm": 0.453125, + "learning_rate": 4.785516687893326e-06, + "loss": 1.6674, + "step": 14803 + }, + { + "epoch": 0.794206008583691, + "grad_norm": 0.49609375, + "learning_rate": 4.785481479969165e-06, + "loss": 1.9075, + "step": 14804 + }, + { + "epoch": 0.7942596566523605, + "grad_norm": 0.455078125, + "learning_rate": 4.785446269285048e-06, + "loss": 2.3536, + "step": 14805 + }, + { + "epoch": 0.79431330472103, + "grad_norm": 0.478515625, + "learning_rate": 4.785411055841018e-06, + "loss": 2.2294, + "step": 14806 + }, + { + "epoch": 0.7943669527896996, + "grad_norm": 0.51953125, + "learning_rate": 4.785375839637117e-06, + "loss": 2.2384, + "step": 14807 + }, + { + "epoch": 0.7944206008583691, + "grad_norm": 0.43359375, + "learning_rate": 4.785340620673388e-06, + "loss": 2.3173, + "step": 14808 + }, + { + "epoch": 0.7944742489270387, + "grad_norm": 0.54296875, + "learning_rate": 4.785305398949872e-06, + "loss": 2.2471, + "step": 14809 + }, + { + "epoch": 0.7945278969957081, + "grad_norm": 1.3046875, + "learning_rate": 4.785270174466614e-06, + "loss": 2.2271, + "step": 14810 + }, + { + "epoch": 0.7945815450643777, + "grad_norm": 0.365234375, + "learning_rate": 4.785234947223654e-06, + "loss": 2.3307, + "step": 14811 + }, + { + "epoch": 0.7946351931330472, + "grad_norm": 0.435546875, + "learning_rate": 4.785199717221036e-06, + "loss": 2.2419, + "step": 14812 + }, + { + "epoch": 0.7946888412017168, + "grad_norm": 0.5, + "learning_rate": 4.785164484458803e-06, + "loss": 2.4145, + "step": 14813 + }, + { + "epoch": 0.7947424892703863, + "grad_norm": 0.47265625, + "learning_rate": 4.785129248936996e-06, + "loss": 2.3186, + "step": 14814 + }, + { + "epoch": 0.7947961373390557, + "grad_norm": 0.5, + "learning_rate": 4.785094010655659e-06, + "loss": 2.2978, + "step": 14815 + }, + { + "epoch": 0.7948497854077253, + "grad_norm": 0.62890625, + "learning_rate": 4.785058769614833e-06, + "loss": 2.6446, + "step": 14816 + }, + { + "epoch": 0.7949034334763948, + "grad_norm": 0.4921875, + "learning_rate": 4.785023525814562e-06, + "loss": 2.2522, + "step": 14817 + }, + { + "epoch": 0.7949570815450644, + "grad_norm": 0.6796875, + "learning_rate": 4.784988279254889e-06, + "loss": 2.3923, + "step": 14818 + }, + { + "epoch": 0.7950107296137339, + "grad_norm": 0.546875, + "learning_rate": 4.784953029935853e-06, + "loss": 2.2476, + "step": 14819 + }, + { + "epoch": 0.7950643776824035, + "grad_norm": 0.50390625, + "learning_rate": 4.7849177778575016e-06, + "loss": 2.5638, + "step": 14820 + }, + { + "epoch": 0.7951180257510729, + "grad_norm": 0.65234375, + "learning_rate": 4.784882523019874e-06, + "loss": 2.3809, + "step": 14821 + }, + { + "epoch": 0.7951716738197425, + "grad_norm": 0.47265625, + "learning_rate": 4.7848472654230135e-06, + "loss": 2.3841, + "step": 14822 + }, + { + "epoch": 0.795225321888412, + "grad_norm": 0.388671875, + "learning_rate": 4.784812005066964e-06, + "loss": 2.165, + "step": 14823 + }, + { + "epoch": 0.7952789699570816, + "grad_norm": 0.5546875, + "learning_rate": 4.784776741951766e-06, + "loss": 2.5189, + "step": 14824 + }, + { + "epoch": 0.7953326180257511, + "grad_norm": 0.734375, + "learning_rate": 4.784741476077462e-06, + "loss": 2.3456, + "step": 14825 + }, + { + "epoch": 0.7953862660944206, + "grad_norm": 0.48046875, + "learning_rate": 4.784706207444097e-06, + "loss": 2.5622, + "step": 14826 + }, + { + "epoch": 0.7954399141630901, + "grad_norm": 0.51953125, + "learning_rate": 4.784670936051713e-06, + "loss": 2.4629, + "step": 14827 + }, + { + "epoch": 0.7954935622317597, + "grad_norm": 0.4453125, + "learning_rate": 4.784635661900351e-06, + "loss": 2.3142, + "step": 14828 + }, + { + "epoch": 0.7955472103004292, + "grad_norm": 0.48828125, + "learning_rate": 4.784600384990053e-06, + "loss": 2.3089, + "step": 14829 + }, + { + "epoch": 0.7956008583690987, + "grad_norm": 0.482421875, + "learning_rate": 4.784565105320864e-06, + "loss": 1.6201, + "step": 14830 + }, + { + "epoch": 0.7956545064377682, + "grad_norm": 0.5234375, + "learning_rate": 4.784529822892827e-06, + "loss": 2.2752, + "step": 14831 + }, + { + "epoch": 0.7957081545064377, + "grad_norm": 0.52734375, + "learning_rate": 4.784494537705981e-06, + "loss": 2.4236, + "step": 14832 + }, + { + "epoch": 0.7957618025751073, + "grad_norm": 0.431640625, + "learning_rate": 4.7844592497603726e-06, + "loss": 2.1601, + "step": 14833 + }, + { + "epoch": 0.7958154506437768, + "grad_norm": 0.69921875, + "learning_rate": 4.784423959056042e-06, + "loss": 2.3016, + "step": 14834 + }, + { + "epoch": 0.7958690987124464, + "grad_norm": 0.466796875, + "learning_rate": 4.7843886655930325e-06, + "loss": 2.1929, + "step": 14835 + }, + { + "epoch": 0.7959227467811159, + "grad_norm": 0.474609375, + "learning_rate": 4.784353369371387e-06, + "loss": 2.3068, + "step": 14836 + }, + { + "epoch": 0.7959763948497854, + "grad_norm": 0.52734375, + "learning_rate": 4.784318070391148e-06, + "loss": 2.3891, + "step": 14837 + }, + { + "epoch": 0.7960300429184549, + "grad_norm": 0.384765625, + "learning_rate": 4.784282768652356e-06, + "loss": 2.0051, + "step": 14838 + }, + { + "epoch": 0.7960836909871245, + "grad_norm": 0.451171875, + "learning_rate": 4.784247464155057e-06, + "loss": 2.4167, + "step": 14839 + }, + { + "epoch": 0.796137339055794, + "grad_norm": 0.4296875, + "learning_rate": 4.784212156899293e-06, + "loss": 2.2968, + "step": 14840 + }, + { + "epoch": 0.7961909871244636, + "grad_norm": 0.55859375, + "learning_rate": 4.7841768468851045e-06, + "loss": 2.3132, + "step": 14841 + }, + { + "epoch": 0.796244635193133, + "grad_norm": 0.546875, + "learning_rate": 4.7841415341125365e-06, + "loss": 2.1249, + "step": 14842 + }, + { + "epoch": 0.7962982832618025, + "grad_norm": 0.494140625, + "learning_rate": 4.78410621858163e-06, + "loss": 2.0234, + "step": 14843 + }, + { + "epoch": 0.7963519313304721, + "grad_norm": 0.416015625, + "learning_rate": 4.784070900292428e-06, + "loss": 2.2308, + "step": 14844 + }, + { + "epoch": 0.7964055793991416, + "grad_norm": 0.47265625, + "learning_rate": 4.784035579244974e-06, + "loss": 1.685, + "step": 14845 + }, + { + "epoch": 0.7964592274678112, + "grad_norm": 0.40234375, + "learning_rate": 4.7840002554393094e-06, + "loss": 2.3522, + "step": 14846 + }, + { + "epoch": 0.7965128755364806, + "grad_norm": 0.69921875, + "learning_rate": 4.783964928875479e-06, + "loss": 2.2155, + "step": 14847 + }, + { + "epoch": 0.7965665236051502, + "grad_norm": 0.62109375, + "learning_rate": 4.783929599553522e-06, + "loss": 1.6529, + "step": 14848 + }, + { + "epoch": 0.7966201716738197, + "grad_norm": 0.361328125, + "learning_rate": 4.783894267473484e-06, + "loss": 1.9483, + "step": 14849 + }, + { + "epoch": 0.7966738197424893, + "grad_norm": 0.484375, + "learning_rate": 4.783858932635407e-06, + "loss": 2.4528, + "step": 14850 + }, + { + "epoch": 0.7967274678111588, + "grad_norm": 0.408203125, + "learning_rate": 4.783823595039333e-06, + "loss": 2.4142, + "step": 14851 + }, + { + "epoch": 0.7967811158798284, + "grad_norm": 0.396484375, + "learning_rate": 4.783788254685305e-06, + "loss": 2.2817, + "step": 14852 + }, + { + "epoch": 0.7968347639484978, + "grad_norm": 0.7890625, + "learning_rate": 4.7837529115733666e-06, + "loss": 1.9745, + "step": 14853 + }, + { + "epoch": 0.7968884120171674, + "grad_norm": 0.546875, + "learning_rate": 4.783717565703559e-06, + "loss": 2.5399, + "step": 14854 + }, + { + "epoch": 0.7969420600858369, + "grad_norm": 0.36328125, + "learning_rate": 4.783682217075925e-06, + "loss": 2.2081, + "step": 14855 + }, + { + "epoch": 0.7969957081545065, + "grad_norm": 0.51953125, + "learning_rate": 4.783646865690509e-06, + "loss": 2.3846, + "step": 14856 + }, + { + "epoch": 0.797049356223176, + "grad_norm": 0.48828125, + "learning_rate": 4.783611511547351e-06, + "loss": 2.2103, + "step": 14857 + }, + { + "epoch": 0.7971030042918454, + "grad_norm": 0.498046875, + "learning_rate": 4.783576154646496e-06, + "loss": 2.3034, + "step": 14858 + }, + { + "epoch": 0.797156652360515, + "grad_norm": 0.51953125, + "learning_rate": 4.783540794987986e-06, + "loss": 2.4907, + "step": 14859 + }, + { + "epoch": 0.7972103004291845, + "grad_norm": 0.578125, + "learning_rate": 4.7835054325718635e-06, + "loss": 2.0207, + "step": 14860 + }, + { + "epoch": 0.7972639484978541, + "grad_norm": 0.494140625, + "learning_rate": 4.7834700673981715e-06, + "loss": 2.2818, + "step": 14861 + }, + { + "epoch": 0.7973175965665236, + "grad_norm": 0.37109375, + "learning_rate": 4.783434699466952e-06, + "loss": 1.9832, + "step": 14862 + }, + { + "epoch": 0.7973712446351932, + "grad_norm": 0.50390625, + "learning_rate": 4.783399328778249e-06, + "loss": 2.4141, + "step": 14863 + }, + { + "epoch": 0.7974248927038626, + "grad_norm": 0.53125, + "learning_rate": 4.7833639553321034e-06, + "loss": 2.5888, + "step": 14864 + }, + { + "epoch": 0.7974785407725322, + "grad_norm": 0.423828125, + "learning_rate": 4.783328579128559e-06, + "loss": 2.2958, + "step": 14865 + }, + { + "epoch": 0.7975321888412017, + "grad_norm": 0.388671875, + "learning_rate": 4.7832932001676595e-06, + "loss": 2.3909, + "step": 14866 + }, + { + "epoch": 0.7975858369098713, + "grad_norm": 0.5703125, + "learning_rate": 4.7832578184494455e-06, + "loss": 2.0702, + "step": 14867 + }, + { + "epoch": 0.7976394849785408, + "grad_norm": 1.3125, + "learning_rate": 4.7832224339739614e-06, + "loss": 2.5711, + "step": 14868 + }, + { + "epoch": 0.7976931330472103, + "grad_norm": 0.54296875, + "learning_rate": 4.783187046741249e-06, + "loss": 2.2882, + "step": 14869 + }, + { + "epoch": 0.7977467811158798, + "grad_norm": 0.4375, + "learning_rate": 4.7831516567513514e-06, + "loss": 2.2641, + "step": 14870 + }, + { + "epoch": 0.7978004291845494, + "grad_norm": 0.484375, + "learning_rate": 4.783116264004312e-06, + "loss": 2.4453, + "step": 14871 + }, + { + "epoch": 0.7978540772532189, + "grad_norm": 0.47265625, + "learning_rate": 4.783080868500172e-06, + "loss": 2.2157, + "step": 14872 + }, + { + "epoch": 0.7979077253218884, + "grad_norm": 0.4765625, + "learning_rate": 4.783045470238975e-06, + "loss": 2.2076, + "step": 14873 + }, + { + "epoch": 0.797961373390558, + "grad_norm": 0.69921875, + "learning_rate": 4.783010069220764e-06, + "loss": 1.373, + "step": 14874 + }, + { + "epoch": 0.7980150214592274, + "grad_norm": 0.455078125, + "learning_rate": 4.782974665445581e-06, + "loss": 2.4967, + "step": 14875 + }, + { + "epoch": 0.798068669527897, + "grad_norm": 0.412109375, + "learning_rate": 4.78293925891347e-06, + "loss": 1.9742, + "step": 14876 + }, + { + "epoch": 0.7981223175965665, + "grad_norm": 0.431640625, + "learning_rate": 4.782903849624473e-06, + "loss": 1.7892, + "step": 14877 + }, + { + "epoch": 0.7981759656652361, + "grad_norm": 0.453125, + "learning_rate": 4.782868437578631e-06, + "loss": 2.3092, + "step": 14878 + }, + { + "epoch": 0.7982296137339056, + "grad_norm": 0.447265625, + "learning_rate": 4.782833022775991e-06, + "loss": 2.2387, + "step": 14879 + }, + { + "epoch": 0.7982832618025751, + "grad_norm": 0.451171875, + "learning_rate": 4.782797605216592e-06, + "loss": 2.6064, + "step": 14880 + }, + { + "epoch": 0.7983369098712446, + "grad_norm": 0.55859375, + "learning_rate": 4.782762184900478e-06, + "loss": 2.0012, + "step": 14881 + }, + { + "epoch": 0.7983905579399142, + "grad_norm": 0.46484375, + "learning_rate": 4.782726761827693e-06, + "loss": 2.3783, + "step": 14882 + }, + { + "epoch": 0.7984442060085837, + "grad_norm": 0.4140625, + "learning_rate": 4.782691335998277e-06, + "loss": 2.2443, + "step": 14883 + }, + { + "epoch": 0.7984978540772533, + "grad_norm": 0.419921875, + "learning_rate": 4.782655907412275e-06, + "loss": 2.3112, + "step": 14884 + }, + { + "epoch": 0.7985515021459227, + "grad_norm": 0.408203125, + "learning_rate": 4.782620476069729e-06, + "loss": 2.5462, + "step": 14885 + }, + { + "epoch": 0.7986051502145923, + "grad_norm": 0.451171875, + "learning_rate": 4.782585041970683e-06, + "loss": 2.3996, + "step": 14886 + }, + { + "epoch": 0.7986587982832618, + "grad_norm": 2.203125, + "learning_rate": 4.782549605115178e-06, + "loss": 2.0981, + "step": 14887 + }, + { + "epoch": 0.7987124463519313, + "grad_norm": 0.546875, + "learning_rate": 4.782514165503257e-06, + "loss": 2.3126, + "step": 14888 + }, + { + "epoch": 0.7987660944206009, + "grad_norm": 0.5, + "learning_rate": 4.782478723134965e-06, + "loss": 2.3191, + "step": 14889 + }, + { + "epoch": 0.7988197424892703, + "grad_norm": 0.55078125, + "learning_rate": 4.782443278010341e-06, + "loss": 2.9391, + "step": 14890 + }, + { + "epoch": 0.7988733905579399, + "grad_norm": 0.455078125, + "learning_rate": 4.782407830129431e-06, + "loss": 2.1724, + "step": 14891 + }, + { + "epoch": 0.7989270386266094, + "grad_norm": 0.796875, + "learning_rate": 4.782372379492277e-06, + "loss": 2.2596, + "step": 14892 + }, + { + "epoch": 0.798980686695279, + "grad_norm": 0.51171875, + "learning_rate": 4.782336926098921e-06, + "loss": 2.4532, + "step": 14893 + }, + { + "epoch": 0.7990343347639485, + "grad_norm": 0.4140625, + "learning_rate": 4.782301469949406e-06, + "loss": 2.4302, + "step": 14894 + }, + { + "epoch": 0.7990879828326181, + "grad_norm": 0.5234375, + "learning_rate": 4.782266011043777e-06, + "loss": 2.4159, + "step": 14895 + }, + { + "epoch": 0.7991416309012875, + "grad_norm": 0.42578125, + "learning_rate": 4.782230549382073e-06, + "loss": 2.5238, + "step": 14896 + }, + { + "epoch": 0.7991952789699571, + "grad_norm": 0.46484375, + "learning_rate": 4.78219508496434e-06, + "loss": 2.2997, + "step": 14897 + }, + { + "epoch": 0.7992489270386266, + "grad_norm": 0.45703125, + "learning_rate": 4.782159617790619e-06, + "loss": 2.2047, + "step": 14898 + }, + { + "epoch": 0.7993025751072962, + "grad_norm": 0.70703125, + "learning_rate": 4.782124147860954e-06, + "loss": 2.4397, + "step": 14899 + }, + { + "epoch": 0.7993562231759657, + "grad_norm": 0.515625, + "learning_rate": 4.782088675175387e-06, + "loss": 2.4215, + "step": 14900 + }, + { + "epoch": 0.7994098712446351, + "grad_norm": 0.48046875, + "learning_rate": 4.782053199733961e-06, + "loss": 2.0735, + "step": 14901 + }, + { + "epoch": 0.7994635193133047, + "grad_norm": 0.458984375, + "learning_rate": 4.78201772153672e-06, + "loss": 2.4429, + "step": 14902 + }, + { + "epoch": 0.7995171673819742, + "grad_norm": 0.4375, + "learning_rate": 4.781982240583705e-06, + "loss": 2.3252, + "step": 14903 + }, + { + "epoch": 0.7995708154506438, + "grad_norm": 0.498046875, + "learning_rate": 4.78194675687496e-06, + "loss": 2.289, + "step": 14904 + }, + { + "epoch": 0.7996244635193133, + "grad_norm": 0.4296875, + "learning_rate": 4.7819112704105276e-06, + "loss": 2.1134, + "step": 14905 + }, + { + "epoch": 0.7996781115879829, + "grad_norm": 0.44921875, + "learning_rate": 4.78187578119045e-06, + "loss": 2.2531, + "step": 14906 + }, + { + "epoch": 0.7997317596566523, + "grad_norm": 0.46484375, + "learning_rate": 4.781840289214772e-06, + "loss": 2.2517, + "step": 14907 + }, + { + "epoch": 0.7997854077253219, + "grad_norm": 0.447265625, + "learning_rate": 4.781804794483533e-06, + "loss": 2.1399, + "step": 14908 + }, + { + "epoch": 0.7998390557939914, + "grad_norm": 0.423828125, + "learning_rate": 4.78176929699678e-06, + "loss": 2.1898, + "step": 14909 + }, + { + "epoch": 0.799892703862661, + "grad_norm": 0.478515625, + "learning_rate": 4.781733796754553e-06, + "loss": 2.2517, + "step": 14910 + }, + { + "epoch": 0.7999463519313305, + "grad_norm": 0.5625, + "learning_rate": 4.781698293756896e-06, + "loss": 2.485, + "step": 14911 + }, + { + "epoch": 0.8, + "grad_norm": 0.50390625, + "learning_rate": 4.781662788003851e-06, + "loss": 2.2626, + "step": 14912 + }, + { + "epoch": 0.8000536480686695, + "grad_norm": 0.462890625, + "learning_rate": 4.781627279495461e-06, + "loss": 2.5236, + "step": 14913 + }, + { + "epoch": 0.8001072961373391, + "grad_norm": 0.46875, + "learning_rate": 4.78159176823177e-06, + "loss": 2.4449, + "step": 14914 + }, + { + "epoch": 0.8001609442060086, + "grad_norm": 0.478515625, + "learning_rate": 4.781556254212821e-06, + "loss": 2.5292, + "step": 14915 + }, + { + "epoch": 0.8002145922746781, + "grad_norm": 0.45703125, + "learning_rate": 4.7815207374386555e-06, + "loss": 2.5472, + "step": 14916 + }, + { + "epoch": 0.8002682403433476, + "grad_norm": 0.494140625, + "learning_rate": 4.781485217909317e-06, + "loss": 2.432, + "step": 14917 + }, + { + "epoch": 0.8003218884120171, + "grad_norm": 0.455078125, + "learning_rate": 4.781449695624849e-06, + "loss": 2.3831, + "step": 14918 + }, + { + "epoch": 0.8003755364806867, + "grad_norm": 0.45703125, + "learning_rate": 4.781414170585292e-06, + "loss": 2.3226, + "step": 14919 + }, + { + "epoch": 0.8004291845493562, + "grad_norm": 0.39453125, + "learning_rate": 4.7813786427906924e-06, + "loss": 2.2885, + "step": 14920 + }, + { + "epoch": 0.8004828326180258, + "grad_norm": 0.490234375, + "learning_rate": 4.781343112241091e-06, + "loss": 2.2447, + "step": 14921 + }, + { + "epoch": 0.8005364806866953, + "grad_norm": 0.435546875, + "learning_rate": 4.781307578936531e-06, + "loss": 2.1995, + "step": 14922 + }, + { + "epoch": 0.8005901287553648, + "grad_norm": 0.56640625, + "learning_rate": 4.7812720428770555e-06, + "loss": 2.3153, + "step": 14923 + }, + { + "epoch": 0.8006437768240343, + "grad_norm": 0.48046875, + "learning_rate": 4.781236504062707e-06, + "loss": 2.3525, + "step": 14924 + }, + { + "epoch": 0.8006974248927039, + "grad_norm": 0.91015625, + "learning_rate": 4.781200962493529e-06, + "loss": 2.0827, + "step": 14925 + }, + { + "epoch": 0.8007510729613734, + "grad_norm": 0.5078125, + "learning_rate": 4.781165418169565e-06, + "loss": 2.4639, + "step": 14926 + }, + { + "epoch": 0.800804721030043, + "grad_norm": 0.408203125, + "learning_rate": 4.781129871090856e-06, + "loss": 2.3646, + "step": 14927 + }, + { + "epoch": 0.8008583690987124, + "grad_norm": 0.5859375, + "learning_rate": 4.781094321257445e-06, + "loss": 1.4788, + "step": 14928 + }, + { + "epoch": 0.800912017167382, + "grad_norm": 0.98046875, + "learning_rate": 4.781058768669377e-06, + "loss": 2.3349, + "step": 14929 + }, + { + "epoch": 0.8009656652360515, + "grad_norm": 0.431640625, + "learning_rate": 4.7810232133266945e-06, + "loss": 2.1254, + "step": 14930 + }, + { + "epoch": 0.801019313304721, + "grad_norm": 0.482421875, + "learning_rate": 4.7809876552294395e-06, + "loss": 2.1574, + "step": 14931 + }, + { + "epoch": 0.8010729613733906, + "grad_norm": 0.734375, + "learning_rate": 4.780952094377655e-06, + "loss": 2.2925, + "step": 14932 + }, + { + "epoch": 0.80112660944206, + "grad_norm": 0.44140625, + "learning_rate": 4.780916530771385e-06, + "loss": 2.5915, + "step": 14933 + }, + { + "epoch": 0.8011802575107296, + "grad_norm": 0.46484375, + "learning_rate": 4.7808809644106715e-06, + "loss": 2.2954, + "step": 14934 + }, + { + "epoch": 0.8012339055793991, + "grad_norm": 0.423828125, + "learning_rate": 4.7808453952955565e-06, + "loss": 2.108, + "step": 14935 + }, + { + "epoch": 0.8012875536480687, + "grad_norm": 0.5078125, + "learning_rate": 4.7808098234260845e-06, + "loss": 2.3527, + "step": 14936 + }, + { + "epoch": 0.8013412017167382, + "grad_norm": 0.470703125, + "learning_rate": 4.780774248802298e-06, + "loss": 2.2865, + "step": 14937 + }, + { + "epoch": 0.8013948497854078, + "grad_norm": 0.40625, + "learning_rate": 4.780738671424241e-06, + "loss": 2.2973, + "step": 14938 + }, + { + "epoch": 0.8014484978540772, + "grad_norm": 0.484375, + "learning_rate": 4.780703091291954e-06, + "loss": 2.7187, + "step": 14939 + }, + { + "epoch": 0.8015021459227468, + "grad_norm": 0.51171875, + "learning_rate": 4.780667508405482e-06, + "loss": 2.1226, + "step": 14940 + }, + { + "epoch": 0.8015557939914163, + "grad_norm": 0.40625, + "learning_rate": 4.780631922764868e-06, + "loss": 2.2727, + "step": 14941 + }, + { + "epoch": 0.8016094420600859, + "grad_norm": 0.4765625, + "learning_rate": 4.7805963343701535e-06, + "loss": 2.4668, + "step": 14942 + }, + { + "epoch": 0.8016630901287554, + "grad_norm": 0.51171875, + "learning_rate": 4.780560743221382e-06, + "loss": 2.1179, + "step": 14943 + }, + { + "epoch": 0.8017167381974248, + "grad_norm": 0.5, + "learning_rate": 4.780525149318598e-06, + "loss": 2.222, + "step": 14944 + }, + { + "epoch": 0.8017703862660944, + "grad_norm": 0.60546875, + "learning_rate": 4.780489552661842e-06, + "loss": 2.2843, + "step": 14945 + }, + { + "epoch": 0.8018240343347639, + "grad_norm": 0.412109375, + "learning_rate": 4.780453953251159e-06, + "loss": 2.1147, + "step": 14946 + }, + { + "epoch": 0.8018776824034335, + "grad_norm": 0.384765625, + "learning_rate": 4.780418351086591e-06, + "loss": 1.9722, + "step": 14947 + }, + { + "epoch": 0.801931330472103, + "grad_norm": 0.455078125, + "learning_rate": 4.780382746168182e-06, + "loss": 2.3675, + "step": 14948 + }, + { + "epoch": 0.8019849785407726, + "grad_norm": 0.5625, + "learning_rate": 4.780347138495973e-06, + "loss": 2.3414, + "step": 14949 + }, + { + "epoch": 0.802038626609442, + "grad_norm": 0.53125, + "learning_rate": 4.78031152807001e-06, + "loss": 1.1946, + "step": 14950 + }, + { + "epoch": 0.8020922746781116, + "grad_norm": 0.45703125, + "learning_rate": 4.780275914890332e-06, + "loss": 2.3053, + "step": 14951 + }, + { + "epoch": 0.8021459227467811, + "grad_norm": 0.52734375, + "learning_rate": 4.780240298956985e-06, + "loss": 2.2702, + "step": 14952 + }, + { + "epoch": 0.8021995708154507, + "grad_norm": 0.53125, + "learning_rate": 4.7802046802700125e-06, + "loss": 2.3875, + "step": 14953 + }, + { + "epoch": 0.8022532188841202, + "grad_norm": 0.5, + "learning_rate": 4.780169058829456e-06, + "loss": 2.3918, + "step": 14954 + }, + { + "epoch": 0.8023068669527897, + "grad_norm": 0.416015625, + "learning_rate": 4.780133434635358e-06, + "loss": 2.2797, + "step": 14955 + }, + { + "epoch": 0.8023605150214592, + "grad_norm": 0.56640625, + "learning_rate": 4.780097807687762e-06, + "loss": 2.3098, + "step": 14956 + }, + { + "epoch": 0.8024141630901288, + "grad_norm": 0.4609375, + "learning_rate": 4.780062177986712e-06, + "loss": 2.1874, + "step": 14957 + }, + { + "epoch": 0.8024678111587983, + "grad_norm": 0.486328125, + "learning_rate": 4.78002654553225e-06, + "loss": 2.1552, + "step": 14958 + }, + { + "epoch": 0.8025214592274678, + "grad_norm": 0.45703125, + "learning_rate": 4.77999091032442e-06, + "loss": 2.2004, + "step": 14959 + }, + { + "epoch": 0.8025751072961373, + "grad_norm": 0.44921875, + "learning_rate": 4.779955272363264e-06, + "loss": 2.1938, + "step": 14960 + }, + { + "epoch": 0.8026287553648068, + "grad_norm": 0.5234375, + "learning_rate": 4.779919631648826e-06, + "loss": 2.2372, + "step": 14961 + }, + { + "epoch": 0.8026824034334764, + "grad_norm": 0.466796875, + "learning_rate": 4.779883988181148e-06, + "loss": 2.2111, + "step": 14962 + }, + { + "epoch": 0.8027360515021459, + "grad_norm": 0.416015625, + "learning_rate": 4.779848341960274e-06, + "loss": 2.2774, + "step": 14963 + }, + { + "epoch": 0.8027896995708155, + "grad_norm": 0.47265625, + "learning_rate": 4.779812692986246e-06, + "loss": 2.2319, + "step": 14964 + }, + { + "epoch": 0.802843347639485, + "grad_norm": 0.43359375, + "learning_rate": 4.779777041259108e-06, + "loss": 1.9921, + "step": 14965 + }, + { + "epoch": 0.8028969957081545, + "grad_norm": 0.5078125, + "learning_rate": 4.779741386778902e-06, + "loss": 2.1925, + "step": 14966 + }, + { + "epoch": 0.802950643776824, + "grad_norm": 0.41796875, + "learning_rate": 4.779705729545673e-06, + "loss": 2.2143, + "step": 14967 + }, + { + "epoch": 0.8030042918454936, + "grad_norm": 0.447265625, + "learning_rate": 4.779670069559463e-06, + "loss": 2.1349, + "step": 14968 + }, + { + "epoch": 0.8030579399141631, + "grad_norm": 0.486328125, + "learning_rate": 4.779634406820313e-06, + "loss": 2.4154, + "step": 14969 + }, + { + "epoch": 0.8031115879828327, + "grad_norm": 0.49609375, + "learning_rate": 4.779598741328269e-06, + "loss": 2.3654, + "step": 14970 + }, + { + "epoch": 0.8031652360515021, + "grad_norm": 0.47265625, + "learning_rate": 4.779563073083373e-06, + "loss": 2.4706, + "step": 14971 + }, + { + "epoch": 0.8032188841201717, + "grad_norm": 1.0, + "learning_rate": 4.779527402085668e-06, + "loss": 1.8608, + "step": 14972 + }, + { + "epoch": 0.8032725321888412, + "grad_norm": 0.458984375, + "learning_rate": 4.779491728335197e-06, + "loss": 2.2798, + "step": 14973 + }, + { + "epoch": 0.8033261802575107, + "grad_norm": 0.45703125, + "learning_rate": 4.779456051832004e-06, + "loss": 2.3841, + "step": 14974 + }, + { + "epoch": 0.8033798283261803, + "grad_norm": 0.57421875, + "learning_rate": 4.779420372576131e-06, + "loss": 2.2556, + "step": 14975 + }, + { + "epoch": 0.8034334763948497, + "grad_norm": 0.423828125, + "learning_rate": 4.779384690567621e-06, + "loss": 2.3862, + "step": 14976 + }, + { + "epoch": 0.8034871244635193, + "grad_norm": 0.5703125, + "learning_rate": 4.779349005806518e-06, + "loss": 2.3022, + "step": 14977 + }, + { + "epoch": 0.8035407725321888, + "grad_norm": 0.55078125, + "learning_rate": 4.7793133182928646e-06, + "loss": 2.5139, + "step": 14978 + }, + { + "epoch": 0.8035944206008584, + "grad_norm": 0.50390625, + "learning_rate": 4.779277628026703e-06, + "loss": 2.4865, + "step": 14979 + }, + { + "epoch": 0.8036480686695279, + "grad_norm": 0.44921875, + "learning_rate": 4.779241935008077e-06, + "loss": 1.9392, + "step": 14980 + }, + { + "epoch": 0.8037017167381975, + "grad_norm": 0.5859375, + "learning_rate": 4.779206239237032e-06, + "loss": 2.2365, + "step": 14981 + }, + { + "epoch": 0.8037553648068669, + "grad_norm": 0.4453125, + "learning_rate": 4.779170540713608e-06, + "loss": 2.6186, + "step": 14982 + }, + { + "epoch": 0.8038090128755365, + "grad_norm": 0.4453125, + "learning_rate": 4.779134839437849e-06, + "loss": 2.3575, + "step": 14983 + }, + { + "epoch": 0.803862660944206, + "grad_norm": 0.482421875, + "learning_rate": 4.779099135409797e-06, + "loss": 2.3692, + "step": 14984 + }, + { + "epoch": 0.8039163090128756, + "grad_norm": 0.48828125, + "learning_rate": 4.7790634286294975e-06, + "loss": 2.2627, + "step": 14985 + }, + { + "epoch": 0.8039699570815451, + "grad_norm": 0.359375, + "learning_rate": 4.779027719096993e-06, + "loss": 2.2672, + "step": 14986 + }, + { + "epoch": 0.8040236051502145, + "grad_norm": 0.4921875, + "learning_rate": 4.778992006812325e-06, + "loss": 2.3455, + "step": 14987 + }, + { + "epoch": 0.8040772532188841, + "grad_norm": 0.5, + "learning_rate": 4.7789562917755386e-06, + "loss": 2.3442, + "step": 14988 + }, + { + "epoch": 0.8041309012875536, + "grad_norm": 0.486328125, + "learning_rate": 4.778920573986676e-06, + "loss": 2.3226, + "step": 14989 + }, + { + "epoch": 0.8041845493562232, + "grad_norm": 0.451171875, + "learning_rate": 4.77888485344578e-06, + "loss": 2.3025, + "step": 14990 + }, + { + "epoch": 0.8042381974248927, + "grad_norm": 0.51953125, + "learning_rate": 4.778849130152894e-06, + "loss": 2.0717, + "step": 14991 + }, + { + "epoch": 0.8042918454935623, + "grad_norm": 0.5, + "learning_rate": 4.778813404108062e-06, + "loss": 2.0998, + "step": 14992 + }, + { + "epoch": 0.8043454935622317, + "grad_norm": 0.4765625, + "learning_rate": 4.778777675311325e-06, + "loss": 2.0595, + "step": 14993 + }, + { + "epoch": 0.8043991416309013, + "grad_norm": 0.484375, + "learning_rate": 4.778741943762728e-06, + "loss": 2.3306, + "step": 14994 + }, + { + "epoch": 0.8044527896995708, + "grad_norm": 0.625, + "learning_rate": 4.778706209462314e-06, + "loss": 2.5378, + "step": 14995 + }, + { + "epoch": 0.8045064377682404, + "grad_norm": 0.470703125, + "learning_rate": 4.778670472410126e-06, + "loss": 2.2265, + "step": 14996 + }, + { + "epoch": 0.8045600858369099, + "grad_norm": 0.451171875, + "learning_rate": 4.778634732606207e-06, + "loss": 2.1966, + "step": 14997 + }, + { + "epoch": 0.8046137339055794, + "grad_norm": 0.462890625, + "learning_rate": 4.7785989900506e-06, + "loss": 2.2377, + "step": 14998 + }, + { + "epoch": 0.8046673819742489, + "grad_norm": 0.46875, + "learning_rate": 4.778563244743348e-06, + "loss": 2.1684, + "step": 14999 + }, + { + "epoch": 0.8047210300429185, + "grad_norm": 0.451171875, + "learning_rate": 4.778527496684495e-06, + "loss": 2.2456, + "step": 15000 + }, + { + "epoch": 0.804774678111588, + "grad_norm": 0.46875, + "learning_rate": 4.778491745874083e-06, + "loss": 2.5625, + "step": 15001 + }, + { + "epoch": 0.8048283261802575, + "grad_norm": 4.03125, + "learning_rate": 4.778455992312157e-06, + "loss": 2.221, + "step": 15002 + }, + { + "epoch": 0.804881974248927, + "grad_norm": 0.42578125, + "learning_rate": 4.778420235998758e-06, + "loss": 2.1369, + "step": 15003 + }, + { + "epoch": 0.8049356223175965, + "grad_norm": 0.447265625, + "learning_rate": 4.778384476933931e-06, + "loss": 2.4255, + "step": 15004 + }, + { + "epoch": 0.8049892703862661, + "grad_norm": 0.7734375, + "learning_rate": 4.7783487151177175e-06, + "loss": 2.1463, + "step": 15005 + }, + { + "epoch": 0.8050429184549356, + "grad_norm": 0.4921875, + "learning_rate": 4.778312950550162e-06, + "loss": 2.3315, + "step": 15006 + }, + { + "epoch": 0.8050965665236052, + "grad_norm": 0.359375, + "learning_rate": 4.7782771832313065e-06, + "loss": 2.1212, + "step": 15007 + }, + { + "epoch": 0.8051502145922746, + "grad_norm": 0.474609375, + "learning_rate": 4.778241413161196e-06, + "loss": 2.2241, + "step": 15008 + }, + { + "epoch": 0.8052038626609442, + "grad_norm": 0.447265625, + "learning_rate": 4.778205640339872e-06, + "loss": 2.0931, + "step": 15009 + }, + { + "epoch": 0.8052575107296137, + "grad_norm": 0.388671875, + "learning_rate": 4.778169864767379e-06, + "loss": 2.1142, + "step": 15010 + }, + { + "epoch": 0.8053111587982833, + "grad_norm": 0.57421875, + "learning_rate": 4.778134086443758e-06, + "loss": 2.1097, + "step": 15011 + }, + { + "epoch": 0.8053648068669528, + "grad_norm": 1.5859375, + "learning_rate": 4.778098305369056e-06, + "loss": 2.3049, + "step": 15012 + }, + { + "epoch": 0.8054184549356224, + "grad_norm": 0.4609375, + "learning_rate": 4.778062521543312e-06, + "loss": 2.2633, + "step": 15013 + }, + { + "epoch": 0.8054721030042918, + "grad_norm": 0.390625, + "learning_rate": 4.778026734966572e-06, + "loss": 1.9816, + "step": 15014 + }, + { + "epoch": 0.8055257510729614, + "grad_norm": 0.376953125, + "learning_rate": 4.777990945638878e-06, + "loss": 2.2987, + "step": 15015 + }, + { + "epoch": 0.8055793991416309, + "grad_norm": 0.416015625, + "learning_rate": 4.777955153560274e-06, + "loss": 2.2844, + "step": 15016 + }, + { + "epoch": 0.8056330472103004, + "grad_norm": 0.3984375, + "learning_rate": 4.777919358730803e-06, + "loss": 2.2354, + "step": 15017 + }, + { + "epoch": 0.80568669527897, + "grad_norm": 0.4296875, + "learning_rate": 4.777883561150508e-06, + "loss": 2.3892, + "step": 15018 + }, + { + "epoch": 0.8057403433476394, + "grad_norm": 0.466796875, + "learning_rate": 4.777847760819432e-06, + "loss": 2.5137, + "step": 15019 + }, + { + "epoch": 0.805793991416309, + "grad_norm": 0.431640625, + "learning_rate": 4.777811957737619e-06, + "loss": 2.1984, + "step": 15020 + }, + { + "epoch": 0.8058476394849785, + "grad_norm": 0.4765625, + "learning_rate": 4.77777615190511e-06, + "loss": 2.4052, + "step": 15021 + }, + { + "epoch": 0.8059012875536481, + "grad_norm": 0.435546875, + "learning_rate": 4.7777403433219525e-06, + "loss": 2.2766, + "step": 15022 + }, + { + "epoch": 0.8059549356223176, + "grad_norm": 0.74609375, + "learning_rate": 4.7777045319881855e-06, + "loss": 2.422, + "step": 15023 + }, + { + "epoch": 0.8060085836909872, + "grad_norm": 0.51171875, + "learning_rate": 4.777668717903855e-06, + "loss": 2.2787, + "step": 15024 + }, + { + "epoch": 0.8060622317596566, + "grad_norm": 0.4453125, + "learning_rate": 4.777632901069002e-06, + "loss": 2.2137, + "step": 15025 + }, + { + "epoch": 0.8061158798283262, + "grad_norm": 0.478515625, + "learning_rate": 4.777597081483672e-06, + "loss": 2.6921, + "step": 15026 + }, + { + "epoch": 0.8061695278969957, + "grad_norm": 0.51953125, + "learning_rate": 4.777561259147907e-06, + "loss": 1.7681, + "step": 15027 + }, + { + "epoch": 0.8062231759656653, + "grad_norm": 0.4453125, + "learning_rate": 4.7775254340617505e-06, + "loss": 2.0463, + "step": 15028 + }, + { + "epoch": 0.8062768240343348, + "grad_norm": 0.3671875, + "learning_rate": 4.777489606225245e-06, + "loss": 1.8435, + "step": 15029 + }, + { + "epoch": 0.8063304721030042, + "grad_norm": 0.4375, + "learning_rate": 4.777453775638435e-06, + "loss": 2.0454, + "step": 15030 + }, + { + "epoch": 0.8063841201716738, + "grad_norm": 0.42578125, + "learning_rate": 4.777417942301364e-06, + "loss": 2.3927, + "step": 15031 + }, + { + "epoch": 0.8064377682403433, + "grad_norm": 0.44140625, + "learning_rate": 4.777382106214074e-06, + "loss": 2.3013, + "step": 15032 + }, + { + "epoch": 0.8064914163090129, + "grad_norm": 0.546875, + "learning_rate": 4.777346267376609e-06, + "loss": 2.5261, + "step": 15033 + }, + { + "epoch": 0.8065450643776824, + "grad_norm": 0.58203125, + "learning_rate": 4.777310425789012e-06, + "loss": 2.0738, + "step": 15034 + }, + { + "epoch": 0.806598712446352, + "grad_norm": 0.42578125, + "learning_rate": 4.777274581451326e-06, + "loss": 2.3927, + "step": 15035 + }, + { + "epoch": 0.8066523605150214, + "grad_norm": 0.486328125, + "learning_rate": 4.777238734363595e-06, + "loss": 2.2729, + "step": 15036 + }, + { + "epoch": 0.806706008583691, + "grad_norm": 0.490234375, + "learning_rate": 4.777202884525862e-06, + "loss": 2.3251, + "step": 15037 + }, + { + "epoch": 0.8067596566523605, + "grad_norm": 0.71484375, + "learning_rate": 4.7771670319381704e-06, + "loss": 1.4422, + "step": 15038 + }, + { + "epoch": 0.8068133047210301, + "grad_norm": 0.431640625, + "learning_rate": 4.777131176600563e-06, + "loss": 2.1636, + "step": 15039 + }, + { + "epoch": 0.8068669527896996, + "grad_norm": 0.3828125, + "learning_rate": 4.777095318513084e-06, + "loss": 2.2576, + "step": 15040 + }, + { + "epoch": 0.8069206008583691, + "grad_norm": 0.490234375, + "learning_rate": 4.777059457675776e-06, + "loss": 2.3289, + "step": 15041 + }, + { + "epoch": 0.8069742489270386, + "grad_norm": 0.5390625, + "learning_rate": 4.777023594088682e-06, + "loss": 2.3151, + "step": 15042 + }, + { + "epoch": 0.8070278969957082, + "grad_norm": 0.462890625, + "learning_rate": 4.776987727751846e-06, + "loss": 2.3277, + "step": 15043 + }, + { + "epoch": 0.8070815450643777, + "grad_norm": 0.6953125, + "learning_rate": 4.776951858665311e-06, + "loss": 1.9036, + "step": 15044 + }, + { + "epoch": 0.8071351931330472, + "grad_norm": 0.41796875, + "learning_rate": 4.77691598682912e-06, + "loss": 2.3633, + "step": 15045 + }, + { + "epoch": 0.8071888412017167, + "grad_norm": 0.376953125, + "learning_rate": 4.7768801122433185e-06, + "loss": 2.3043, + "step": 15046 + }, + { + "epoch": 0.8072424892703862, + "grad_norm": 0.470703125, + "learning_rate": 4.776844234907946e-06, + "loss": 2.2447, + "step": 15047 + }, + { + "epoch": 0.8072961373390558, + "grad_norm": 0.609375, + "learning_rate": 4.776808354823049e-06, + "loss": 2.3439, + "step": 15048 + }, + { + "epoch": 0.8073497854077253, + "grad_norm": 0.46484375, + "learning_rate": 4.776772471988669e-06, + "loss": 1.9946, + "step": 15049 + }, + { + "epoch": 0.8074034334763949, + "grad_norm": 0.51171875, + "learning_rate": 4.776736586404851e-06, + "loss": 2.4924, + "step": 15050 + }, + { + "epoch": 0.8074570815450643, + "grad_norm": 0.4921875, + "learning_rate": 4.776700698071637e-06, + "loss": 2.2564, + "step": 15051 + }, + { + "epoch": 0.8075107296137339, + "grad_norm": 0.51171875, + "learning_rate": 4.77666480698907e-06, + "loss": 2.3283, + "step": 15052 + }, + { + "epoch": 0.8075643776824034, + "grad_norm": 0.55078125, + "learning_rate": 4.776628913157194e-06, + "loss": 2.4132, + "step": 15053 + }, + { + "epoch": 0.807618025751073, + "grad_norm": 0.462890625, + "learning_rate": 4.776593016576053e-06, + "loss": 2.3791, + "step": 15054 + }, + { + "epoch": 0.8076716738197425, + "grad_norm": 0.466796875, + "learning_rate": 4.776557117245691e-06, + "loss": 2.3347, + "step": 15055 + }, + { + "epoch": 0.8077253218884121, + "grad_norm": 0.44921875, + "learning_rate": 4.7765212151661475e-06, + "loss": 2.2724, + "step": 15056 + }, + { + "epoch": 0.8077789699570815, + "grad_norm": 0.58203125, + "learning_rate": 4.77648531033747e-06, + "loss": 1.7292, + "step": 15057 + }, + { + "epoch": 0.8078326180257511, + "grad_norm": 0.53125, + "learning_rate": 4.7764494027597004e-06, + "loss": 2.3112, + "step": 15058 + }, + { + "epoch": 0.8078862660944206, + "grad_norm": 0.443359375, + "learning_rate": 4.776413492432881e-06, + "loss": 2.1523, + "step": 15059 + }, + { + "epoch": 0.8079399141630901, + "grad_norm": 0.458984375, + "learning_rate": 4.776377579357057e-06, + "loss": 2.2028, + "step": 15060 + }, + { + "epoch": 0.8079935622317597, + "grad_norm": 0.45703125, + "learning_rate": 4.776341663532271e-06, + "loss": 2.1899, + "step": 15061 + }, + { + "epoch": 0.8080472103004291, + "grad_norm": 0.65625, + "learning_rate": 4.776305744958566e-06, + "loss": 2.14, + "step": 15062 + }, + { + "epoch": 0.8081008583690987, + "grad_norm": 0.5078125, + "learning_rate": 4.776269823635985e-06, + "loss": 2.3352, + "step": 15063 + }, + { + "epoch": 0.8081545064377682, + "grad_norm": 0.46875, + "learning_rate": 4.776233899564573e-06, + "loss": 1.7284, + "step": 15064 + }, + { + "epoch": 0.8082081545064378, + "grad_norm": 0.44921875, + "learning_rate": 4.776197972744372e-06, + "loss": 2.1119, + "step": 15065 + }, + { + "epoch": 0.8082618025751073, + "grad_norm": 0.41015625, + "learning_rate": 4.776162043175425e-06, + "loss": 2.068, + "step": 15066 + }, + { + "epoch": 0.8083154506437769, + "grad_norm": 0.546875, + "learning_rate": 4.7761261108577775e-06, + "loss": 2.4325, + "step": 15067 + }, + { + "epoch": 0.8083690987124463, + "grad_norm": 0.640625, + "learning_rate": 4.776090175791471e-06, + "loss": 2.2317, + "step": 15068 + }, + { + "epoch": 0.8084227467811159, + "grad_norm": 0.490234375, + "learning_rate": 4.776054237976549e-06, + "loss": 2.4719, + "step": 15069 + }, + { + "epoch": 0.8084763948497854, + "grad_norm": 0.50390625, + "learning_rate": 4.776018297413057e-06, + "loss": 2.2792, + "step": 15070 + }, + { + "epoch": 0.808530042918455, + "grad_norm": 0.431640625, + "learning_rate": 4.775982354101036e-06, + "loss": 2.4166, + "step": 15071 + }, + { + "epoch": 0.8085836909871245, + "grad_norm": 0.4765625, + "learning_rate": 4.775946408040529e-06, + "loss": 2.4003, + "step": 15072 + }, + { + "epoch": 0.808637339055794, + "grad_norm": 0.43359375, + "learning_rate": 4.775910459231582e-06, + "loss": 2.3922, + "step": 15073 + }, + { + "epoch": 0.8086909871244635, + "grad_norm": 0.490234375, + "learning_rate": 4.775874507674236e-06, + "loss": 2.378, + "step": 15074 + }, + { + "epoch": 0.808744635193133, + "grad_norm": 0.5390625, + "learning_rate": 4.775838553368536e-06, + "loss": 2.1628, + "step": 15075 + }, + { + "epoch": 0.8087982832618026, + "grad_norm": 0.400390625, + "learning_rate": 4.775802596314525e-06, + "loss": 2.2236, + "step": 15076 + }, + { + "epoch": 0.8088519313304721, + "grad_norm": 0.462890625, + "learning_rate": 4.7757666365122455e-06, + "loss": 2.1184, + "step": 15077 + }, + { + "epoch": 0.8089055793991416, + "grad_norm": 0.439453125, + "learning_rate": 4.775730673961743e-06, + "loss": 1.9073, + "step": 15078 + }, + { + "epoch": 0.8089592274678111, + "grad_norm": 0.4609375, + "learning_rate": 4.775694708663058e-06, + "loss": 2.188, + "step": 15079 + }, + { + "epoch": 0.8090128755364807, + "grad_norm": 1.03125, + "learning_rate": 4.7756587406162365e-06, + "loss": 2.2946, + "step": 15080 + }, + { + "epoch": 0.8090665236051502, + "grad_norm": 0.443359375, + "learning_rate": 4.775622769821321e-06, + "loss": 1.974, + "step": 15081 + }, + { + "epoch": 0.8091201716738198, + "grad_norm": 0.625, + "learning_rate": 4.775586796278354e-06, + "loss": 2.4099, + "step": 15082 + }, + { + "epoch": 0.8091738197424893, + "grad_norm": 0.50390625, + "learning_rate": 4.775550819987381e-06, + "loss": 1.4813, + "step": 15083 + }, + { + "epoch": 0.8092274678111588, + "grad_norm": 0.46875, + "learning_rate": 4.775514840948444e-06, + "loss": 2.3324, + "step": 15084 + }, + { + "epoch": 0.8092811158798283, + "grad_norm": 0.458984375, + "learning_rate": 4.775478859161587e-06, + "loss": 2.271, + "step": 15085 + }, + { + "epoch": 0.8093347639484979, + "grad_norm": 0.396484375, + "learning_rate": 4.775442874626852e-06, + "loss": 2.4117, + "step": 15086 + }, + { + "epoch": 0.8093884120171674, + "grad_norm": 0.515625, + "learning_rate": 4.775406887344285e-06, + "loss": 2.0731, + "step": 15087 + }, + { + "epoch": 0.8094420600858369, + "grad_norm": 0.47265625, + "learning_rate": 4.775370897313927e-06, + "loss": 2.3715, + "step": 15088 + }, + { + "epoch": 0.8094957081545064, + "grad_norm": 0.48828125, + "learning_rate": 4.775334904535823e-06, + "loss": 2.2396, + "step": 15089 + }, + { + "epoch": 0.8095493562231759, + "grad_norm": 0.56640625, + "learning_rate": 4.775298909010016e-06, + "loss": 2.3837, + "step": 15090 + }, + { + "epoch": 0.8096030042918455, + "grad_norm": 0.44140625, + "learning_rate": 4.77526291073655e-06, + "loss": 2.3332, + "step": 15091 + }, + { + "epoch": 0.809656652360515, + "grad_norm": 0.44140625, + "learning_rate": 4.775226909715468e-06, + "loss": 2.3646, + "step": 15092 + }, + { + "epoch": 0.8097103004291846, + "grad_norm": 0.52734375, + "learning_rate": 4.7751909059468126e-06, + "loss": 2.4356, + "step": 15093 + }, + { + "epoch": 0.809763948497854, + "grad_norm": 0.5, + "learning_rate": 4.7751548994306294e-06, + "loss": 2.1486, + "step": 15094 + }, + { + "epoch": 0.8098175965665236, + "grad_norm": 0.404296875, + "learning_rate": 4.7751188901669595e-06, + "loss": 1.8424, + "step": 15095 + }, + { + "epoch": 0.8098712446351931, + "grad_norm": 0.52734375, + "learning_rate": 4.7750828781558475e-06, + "loss": 2.3697, + "step": 15096 + }, + { + "epoch": 0.8099248927038627, + "grad_norm": 0.60546875, + "learning_rate": 4.7750468633973375e-06, + "loss": 2.448, + "step": 15097 + }, + { + "epoch": 0.8099785407725322, + "grad_norm": 0.453125, + "learning_rate": 4.775010845891472e-06, + "loss": 2.3659, + "step": 15098 + }, + { + "epoch": 0.8100321888412018, + "grad_norm": 0.45703125, + "learning_rate": 4.774974825638296e-06, + "loss": 2.1729, + "step": 15099 + }, + { + "epoch": 0.8100858369098712, + "grad_norm": 0.4453125, + "learning_rate": 4.774938802637851e-06, + "loss": 2.2699, + "step": 15100 + }, + { + "epoch": 0.8101394849785408, + "grad_norm": 0.458984375, + "learning_rate": 4.7749027768901805e-06, + "loss": 2.3731, + "step": 15101 + }, + { + "epoch": 0.8101931330472103, + "grad_norm": 0.423828125, + "learning_rate": 4.77486674839533e-06, + "loss": 2.2718, + "step": 15102 + }, + { + "epoch": 0.8102467811158798, + "grad_norm": 0.453125, + "learning_rate": 4.774830717153342e-06, + "loss": 2.14, + "step": 15103 + }, + { + "epoch": 0.8103004291845494, + "grad_norm": 0.470703125, + "learning_rate": 4.774794683164259e-06, + "loss": 2.2531, + "step": 15104 + }, + { + "epoch": 0.8103540772532188, + "grad_norm": 0.4921875, + "learning_rate": 4.774758646428126e-06, + "loss": 2.1885, + "step": 15105 + }, + { + "epoch": 0.8104077253218884, + "grad_norm": 0.48046875, + "learning_rate": 4.774722606944985e-06, + "loss": 2.4504, + "step": 15106 + }, + { + "epoch": 0.8104613733905579, + "grad_norm": 0.52734375, + "learning_rate": 4.7746865647148824e-06, + "loss": 2.0186, + "step": 15107 + }, + { + "epoch": 0.8105150214592275, + "grad_norm": 0.470703125, + "learning_rate": 4.774650519737858e-06, + "loss": 2.2704, + "step": 15108 + }, + { + "epoch": 0.810568669527897, + "grad_norm": 0.5, + "learning_rate": 4.774614472013957e-06, + "loss": 2.3405, + "step": 15109 + }, + { + "epoch": 0.8106223175965666, + "grad_norm": 0.49609375, + "learning_rate": 4.774578421543225e-06, + "loss": 2.3512, + "step": 15110 + }, + { + "epoch": 0.810675965665236, + "grad_norm": 0.474609375, + "learning_rate": 4.774542368325702e-06, + "loss": 2.2645, + "step": 15111 + }, + { + "epoch": 0.8107296137339056, + "grad_norm": 0.392578125, + "learning_rate": 4.774506312361433e-06, + "loss": 2.1136, + "step": 15112 + }, + { + "epoch": 0.8107832618025751, + "grad_norm": 0.470703125, + "learning_rate": 4.774470253650463e-06, + "loss": 2.3407, + "step": 15113 + }, + { + "epoch": 0.8108369098712447, + "grad_norm": 0.380859375, + "learning_rate": 4.774434192192833e-06, + "loss": 2.2786, + "step": 15114 + }, + { + "epoch": 0.8108905579399142, + "grad_norm": 0.5078125, + "learning_rate": 4.7743981279885875e-06, + "loss": 1.997, + "step": 15115 + }, + { + "epoch": 0.8109442060085837, + "grad_norm": 0.48828125, + "learning_rate": 4.774362061037772e-06, + "loss": 2.2354, + "step": 15116 + }, + { + "epoch": 0.8109978540772532, + "grad_norm": 0.490234375, + "learning_rate": 4.774325991340427e-06, + "loss": 2.1993, + "step": 15117 + }, + { + "epoch": 0.8110515021459227, + "grad_norm": 0.4453125, + "learning_rate": 4.774289918896597e-06, + "loss": 2.2995, + "step": 15118 + }, + { + "epoch": 0.8111051502145923, + "grad_norm": 0.5, + "learning_rate": 4.774253843706327e-06, + "loss": 2.2236, + "step": 15119 + }, + { + "epoch": 0.8111587982832618, + "grad_norm": 0.5234375, + "learning_rate": 4.774217765769659e-06, + "loss": 2.2058, + "step": 15120 + }, + { + "epoch": 0.8112124463519313, + "grad_norm": 0.4921875, + "learning_rate": 4.774181685086637e-06, + "loss": 2.2516, + "step": 15121 + }, + { + "epoch": 0.8112660944206008, + "grad_norm": 0.49609375, + "learning_rate": 4.7741456016573055e-06, + "loss": 2.5028, + "step": 15122 + }, + { + "epoch": 0.8113197424892704, + "grad_norm": 0.365234375, + "learning_rate": 4.7741095154817064e-06, + "loss": 2.0789, + "step": 15123 + }, + { + "epoch": 0.8113733905579399, + "grad_norm": 0.6015625, + "learning_rate": 4.774073426559884e-06, + "loss": 1.4295, + "step": 15124 + }, + { + "epoch": 0.8114270386266095, + "grad_norm": 0.455078125, + "learning_rate": 4.774037334891883e-06, + "loss": 2.2696, + "step": 15125 + }, + { + "epoch": 0.811480686695279, + "grad_norm": 0.44921875, + "learning_rate": 4.774001240477745e-06, + "loss": 2.2375, + "step": 15126 + }, + { + "epoch": 0.8115343347639485, + "grad_norm": 0.77734375, + "learning_rate": 4.773965143317515e-06, + "loss": 2.233, + "step": 15127 + }, + { + "epoch": 0.811587982832618, + "grad_norm": 0.49609375, + "learning_rate": 4.773929043411236e-06, + "loss": 2.0752, + "step": 15128 + }, + { + "epoch": 0.8116416309012876, + "grad_norm": 0.375, + "learning_rate": 4.773892940758952e-06, + "loss": 2.3974, + "step": 15129 + }, + { + "epoch": 0.8116952789699571, + "grad_norm": 0.41015625, + "learning_rate": 4.773856835360706e-06, + "loss": 2.3384, + "step": 15130 + }, + { + "epoch": 0.8117489270386266, + "grad_norm": 0.515625, + "learning_rate": 4.7738207272165425e-06, + "loss": 2.2478, + "step": 15131 + }, + { + "epoch": 0.8118025751072961, + "grad_norm": 0.451171875, + "learning_rate": 4.773784616326504e-06, + "loss": 2.5398, + "step": 15132 + }, + { + "epoch": 0.8118562231759656, + "grad_norm": 1.1953125, + "learning_rate": 4.773748502690636e-06, + "loss": 2.4463, + "step": 15133 + }, + { + "epoch": 0.8119098712446352, + "grad_norm": 0.515625, + "learning_rate": 4.773712386308979e-06, + "loss": 2.3298, + "step": 15134 + }, + { + "epoch": 0.8119635193133047, + "grad_norm": 0.443359375, + "learning_rate": 4.7736762671815794e-06, + "loss": 2.3064, + "step": 15135 + }, + { + "epoch": 0.8120171673819743, + "grad_norm": 0.546875, + "learning_rate": 4.77364014530848e-06, + "loss": 1.6181, + "step": 15136 + }, + { + "epoch": 0.8120708154506437, + "grad_norm": 0.453125, + "learning_rate": 4.773604020689724e-06, + "loss": 2.2248, + "step": 15137 + }, + { + "epoch": 0.8121244635193133, + "grad_norm": 0.43359375, + "learning_rate": 4.7735678933253545e-06, + "loss": 2.2404, + "step": 15138 + }, + { + "epoch": 0.8121781115879828, + "grad_norm": 0.53125, + "learning_rate": 4.773531763215417e-06, + "loss": 2.2584, + "step": 15139 + }, + { + "epoch": 0.8122317596566524, + "grad_norm": 0.55078125, + "learning_rate": 4.773495630359953e-06, + "loss": 1.785, + "step": 15140 + }, + { + "epoch": 0.8122854077253219, + "grad_norm": 0.5078125, + "learning_rate": 4.7734594947590075e-06, + "loss": 2.38, + "step": 15141 + }, + { + "epoch": 0.8123390557939915, + "grad_norm": 0.47265625, + "learning_rate": 4.773423356412624e-06, + "loss": 2.3511, + "step": 15142 + }, + { + "epoch": 0.8123927038626609, + "grad_norm": 0.453125, + "learning_rate": 4.773387215320846e-06, + "loss": 2.2579, + "step": 15143 + }, + { + "epoch": 0.8124463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.773351071483718e-06, + "loss": 2.3263, + "step": 15144 + }, + { + "epoch": 0.8125, + "grad_norm": 0.53125, + "learning_rate": 4.773314924901281e-06, + "loss": 2.1028, + "step": 15145 + }, + { + "epoch": 0.8125536480686695, + "grad_norm": 0.470703125, + "learning_rate": 4.773278775573581e-06, + "loss": 1.9924, + "step": 15146 + }, + { + "epoch": 0.8126072961373391, + "grad_norm": 0.455078125, + "learning_rate": 4.773242623500661e-06, + "loss": 1.8978, + "step": 15147 + }, + { + "epoch": 0.8126609442060085, + "grad_norm": 0.453125, + "learning_rate": 4.773206468682565e-06, + "loss": 2.2471, + "step": 15148 + }, + { + "epoch": 0.8127145922746781, + "grad_norm": 0.40625, + "learning_rate": 4.773170311119336e-06, + "loss": 2.4024, + "step": 15149 + }, + { + "epoch": 0.8127682403433476, + "grad_norm": 0.44921875, + "learning_rate": 4.773134150811018e-06, + "loss": 2.2911, + "step": 15150 + }, + { + "epoch": 0.8128218884120172, + "grad_norm": 0.447265625, + "learning_rate": 4.773097987757655e-06, + "loss": 2.108, + "step": 15151 + }, + { + "epoch": 0.8128755364806867, + "grad_norm": 0.37890625, + "learning_rate": 4.77306182195929e-06, + "loss": 2.0619, + "step": 15152 + }, + { + "epoch": 0.8129291845493563, + "grad_norm": 0.44921875, + "learning_rate": 4.7730256534159665e-06, + "loss": 2.2398, + "step": 15153 + }, + { + "epoch": 0.8129828326180257, + "grad_norm": 0.40234375, + "learning_rate": 4.77298948212773e-06, + "loss": 1.9308, + "step": 15154 + }, + { + "epoch": 0.8130364806866953, + "grad_norm": 0.5, + "learning_rate": 4.772953308094622e-06, + "loss": 2.3427, + "step": 15155 + }, + { + "epoch": 0.8130901287553648, + "grad_norm": 0.48828125, + "learning_rate": 4.772917131316688e-06, + "loss": 2.3513, + "step": 15156 + }, + { + "epoch": 0.8131437768240344, + "grad_norm": 0.9609375, + "learning_rate": 4.77288095179397e-06, + "loss": 2.3525, + "step": 15157 + }, + { + "epoch": 0.8131974248927039, + "grad_norm": 0.435546875, + "learning_rate": 4.772844769526513e-06, + "loss": 2.3432, + "step": 15158 + }, + { + "epoch": 0.8132510729613734, + "grad_norm": 0.41796875, + "learning_rate": 4.772808584514359e-06, + "loss": 2.4844, + "step": 15159 + }, + { + "epoch": 0.8133047210300429, + "grad_norm": 0.490234375, + "learning_rate": 4.772772396757554e-06, + "loss": 1.9541, + "step": 15160 + }, + { + "epoch": 0.8133583690987124, + "grad_norm": 0.609375, + "learning_rate": 4.77273620625614e-06, + "loss": 2.5801, + "step": 15161 + }, + { + "epoch": 0.813412017167382, + "grad_norm": 0.53515625, + "learning_rate": 4.772700013010161e-06, + "loss": 2.1293, + "step": 15162 + }, + { + "epoch": 0.8134656652360515, + "grad_norm": 0.6796875, + "learning_rate": 4.772663817019662e-06, + "loss": 2.2429, + "step": 15163 + }, + { + "epoch": 0.813519313304721, + "grad_norm": 0.474609375, + "learning_rate": 4.772627618284684e-06, + "loss": 2.3913, + "step": 15164 + }, + { + "epoch": 0.8135729613733905, + "grad_norm": 0.470703125, + "learning_rate": 4.772591416805274e-06, + "loss": 2.3551, + "step": 15165 + }, + { + "epoch": 0.8136266094420601, + "grad_norm": 0.5390625, + "learning_rate": 4.772555212581473e-06, + "loss": 2.1631, + "step": 15166 + }, + { + "epoch": 0.8136802575107296, + "grad_norm": 0.45703125, + "learning_rate": 4.772519005613326e-06, + "loss": 2.1993, + "step": 15167 + }, + { + "epoch": 0.8137339055793992, + "grad_norm": 0.5546875, + "learning_rate": 4.772482795900877e-06, + "loss": 2.5725, + "step": 15168 + }, + { + "epoch": 0.8137875536480687, + "grad_norm": 0.462890625, + "learning_rate": 4.772446583444169e-06, + "loss": 2.2296, + "step": 15169 + }, + { + "epoch": 0.8138412017167382, + "grad_norm": 0.45703125, + "learning_rate": 4.772410368243246e-06, + "loss": 2.0027, + "step": 15170 + }, + { + "epoch": 0.8138948497854077, + "grad_norm": 0.478515625, + "learning_rate": 4.772374150298152e-06, + "loss": 2.2699, + "step": 15171 + }, + { + "epoch": 0.8139484978540773, + "grad_norm": 0.451171875, + "learning_rate": 4.77233792960893e-06, + "loss": 2.3144, + "step": 15172 + }, + { + "epoch": 0.8140021459227468, + "grad_norm": 0.44921875, + "learning_rate": 4.772301706175625e-06, + "loss": 2.1922, + "step": 15173 + }, + { + "epoch": 0.8140557939914163, + "grad_norm": 0.48828125, + "learning_rate": 4.772265479998279e-06, + "loss": 2.3627, + "step": 15174 + }, + { + "epoch": 0.8141094420600858, + "grad_norm": 0.466796875, + "learning_rate": 4.772229251076936e-06, + "loss": 2.273, + "step": 15175 + }, + { + "epoch": 0.8141630901287553, + "grad_norm": 0.609375, + "learning_rate": 4.772193019411642e-06, + "loss": 2.3407, + "step": 15176 + }, + { + "epoch": 0.8142167381974249, + "grad_norm": 0.419921875, + "learning_rate": 4.772156785002439e-06, + "loss": 2.2113, + "step": 15177 + }, + { + "epoch": 0.8142703862660944, + "grad_norm": 0.466796875, + "learning_rate": 4.7721205478493705e-06, + "loss": 2.3591, + "step": 15178 + }, + { + "epoch": 0.814324034334764, + "grad_norm": 0.384765625, + "learning_rate": 4.7720843079524804e-06, + "loss": 2.0915, + "step": 15179 + }, + { + "epoch": 0.8143776824034334, + "grad_norm": 0.40234375, + "learning_rate": 4.7720480653118136e-06, + "loss": 2.3681, + "step": 15180 + }, + { + "epoch": 0.814431330472103, + "grad_norm": 0.43359375, + "learning_rate": 4.772011819927412e-06, + "loss": 1.9815, + "step": 15181 + }, + { + "epoch": 0.8144849785407725, + "grad_norm": 0.48046875, + "learning_rate": 4.771975571799322e-06, + "loss": 2.2191, + "step": 15182 + }, + { + "epoch": 0.8145386266094421, + "grad_norm": 0.373046875, + "learning_rate": 4.771939320927584e-06, + "loss": 2.1934, + "step": 15183 + }, + { + "epoch": 0.8145922746781116, + "grad_norm": 0.466796875, + "learning_rate": 4.7719030673122445e-06, + "loss": 2.3421, + "step": 15184 + }, + { + "epoch": 0.8146459227467812, + "grad_norm": 0.5078125, + "learning_rate": 4.771866810953346e-06, + "loss": 2.4754, + "step": 15185 + }, + { + "epoch": 0.8146995708154506, + "grad_norm": 0.4609375, + "learning_rate": 4.771830551850934e-06, + "loss": 2.2241, + "step": 15186 + }, + { + "epoch": 0.8147532188841202, + "grad_norm": 0.455078125, + "learning_rate": 4.771794290005049e-06, + "loss": 2.3121, + "step": 15187 + }, + { + "epoch": 0.8148068669527897, + "grad_norm": 0.43359375, + "learning_rate": 4.771758025415738e-06, + "loss": 2.188, + "step": 15188 + }, + { + "epoch": 0.8148605150214592, + "grad_norm": 0.375, + "learning_rate": 4.771721758083042e-06, + "loss": 2.1505, + "step": 15189 + }, + { + "epoch": 0.8149141630901288, + "grad_norm": 0.49609375, + "learning_rate": 4.771685488007008e-06, + "loss": 2.3241, + "step": 15190 + }, + { + "epoch": 0.8149678111587982, + "grad_norm": 0.419921875, + "learning_rate": 4.771649215187677e-06, + "loss": 1.8503, + "step": 15191 + }, + { + "epoch": 0.8150214592274678, + "grad_norm": 0.37109375, + "learning_rate": 4.771612939625094e-06, + "loss": 2.0851, + "step": 15192 + }, + { + "epoch": 0.8150751072961373, + "grad_norm": 0.54296875, + "learning_rate": 4.771576661319303e-06, + "loss": 2.1994, + "step": 15193 + }, + { + "epoch": 0.8151287553648069, + "grad_norm": 0.466796875, + "learning_rate": 4.771540380270348e-06, + "loss": 2.3224, + "step": 15194 + }, + { + "epoch": 0.8151824034334764, + "grad_norm": 0.76953125, + "learning_rate": 4.771504096478271e-06, + "loss": 2.2727, + "step": 15195 + }, + { + "epoch": 0.815236051502146, + "grad_norm": 0.5625, + "learning_rate": 4.771467809943117e-06, + "loss": 2.3851, + "step": 15196 + }, + { + "epoch": 0.8152896995708154, + "grad_norm": 0.4375, + "learning_rate": 4.771431520664932e-06, + "loss": 2.355, + "step": 15197 + }, + { + "epoch": 0.815343347639485, + "grad_norm": 0.64453125, + "learning_rate": 4.771395228643757e-06, + "loss": 2.3812, + "step": 15198 + }, + { + "epoch": 0.8153969957081545, + "grad_norm": 0.55078125, + "learning_rate": 4.771358933879636e-06, + "loss": 2.2375, + "step": 15199 + }, + { + "epoch": 0.8154506437768241, + "grad_norm": 0.474609375, + "learning_rate": 4.771322636372614e-06, + "loss": 2.2744, + "step": 15200 + }, + { + "epoch": 0.8155042918454936, + "grad_norm": 0.45703125, + "learning_rate": 4.771286336122733e-06, + "loss": 2.3876, + "step": 15201 + }, + { + "epoch": 0.8155579399141631, + "grad_norm": 0.50390625, + "learning_rate": 4.771250033130039e-06, + "loss": 2.6186, + "step": 15202 + }, + { + "epoch": 0.8156115879828326, + "grad_norm": 0.431640625, + "learning_rate": 4.7712137273945756e-06, + "loss": 2.2935, + "step": 15203 + }, + { + "epoch": 0.8156652360515021, + "grad_norm": 0.361328125, + "learning_rate": 4.771177418916385e-06, + "loss": 2.0543, + "step": 15204 + }, + { + "epoch": 0.8157188841201717, + "grad_norm": 0.51171875, + "learning_rate": 4.771141107695512e-06, + "loss": 2.6428, + "step": 15205 + }, + { + "epoch": 0.8157725321888412, + "grad_norm": 0.41015625, + "learning_rate": 4.771104793732002e-06, + "loss": 2.336, + "step": 15206 + }, + { + "epoch": 0.8158261802575107, + "grad_norm": 0.453125, + "learning_rate": 4.771068477025896e-06, + "loss": 2.2847, + "step": 15207 + }, + { + "epoch": 0.8158798283261802, + "grad_norm": 0.46484375, + "learning_rate": 4.77103215757724e-06, + "loss": 2.3655, + "step": 15208 + }, + { + "epoch": 0.8159334763948498, + "grad_norm": 0.359375, + "learning_rate": 4.770995835386077e-06, + "loss": 1.907, + "step": 15209 + }, + { + "epoch": 0.8159871244635193, + "grad_norm": 3.296875, + "learning_rate": 4.77095951045245e-06, + "loss": 2.1379, + "step": 15210 + }, + { + "epoch": 0.8160407725321889, + "grad_norm": 0.4453125, + "learning_rate": 4.770923182776404e-06, + "loss": 2.4602, + "step": 15211 + }, + { + "epoch": 0.8160944206008584, + "grad_norm": 0.53125, + "learning_rate": 4.770886852357983e-06, + "loss": 2.3512, + "step": 15212 + }, + { + "epoch": 0.8161480686695279, + "grad_norm": 0.60546875, + "learning_rate": 4.77085051919723e-06, + "loss": 2.0954, + "step": 15213 + }, + { + "epoch": 0.8162017167381974, + "grad_norm": 0.5, + "learning_rate": 4.77081418329419e-06, + "loss": 2.4977, + "step": 15214 + }, + { + "epoch": 0.816255364806867, + "grad_norm": 0.42578125, + "learning_rate": 4.770777844648906e-06, + "loss": 2.0574, + "step": 15215 + }, + { + "epoch": 0.8163090128755365, + "grad_norm": 0.431640625, + "learning_rate": 4.7707415032614225e-06, + "loss": 2.2834, + "step": 15216 + }, + { + "epoch": 0.816362660944206, + "grad_norm": 0.416015625, + "learning_rate": 4.770705159131783e-06, + "loss": 1.8007, + "step": 15217 + }, + { + "epoch": 0.8164163090128755, + "grad_norm": 0.51171875, + "learning_rate": 4.7706688122600305e-06, + "loss": 2.1356, + "step": 15218 + }, + { + "epoch": 0.816469957081545, + "grad_norm": 0.58203125, + "learning_rate": 4.770632462646211e-06, + "loss": 2.0295, + "step": 15219 + }, + { + "epoch": 0.8165236051502146, + "grad_norm": 0.443359375, + "learning_rate": 4.7705961102903665e-06, + "loss": 2.227, + "step": 15220 + }, + { + "epoch": 0.8165772532188841, + "grad_norm": 0.447265625, + "learning_rate": 4.770559755192542e-06, + "loss": 2.393, + "step": 15221 + }, + { + "epoch": 0.8166309012875537, + "grad_norm": 0.466796875, + "learning_rate": 4.770523397352781e-06, + "loss": 2.5425, + "step": 15222 + }, + { + "epoch": 0.8166845493562231, + "grad_norm": 0.5078125, + "learning_rate": 4.770487036771127e-06, + "loss": 2.1413, + "step": 15223 + }, + { + "epoch": 0.8167381974248927, + "grad_norm": 0.53515625, + "learning_rate": 4.770450673447625e-06, + "loss": 1.3895, + "step": 15224 + }, + { + "epoch": 0.8167918454935622, + "grad_norm": 0.48046875, + "learning_rate": 4.770414307382318e-06, + "loss": 2.4448, + "step": 15225 + }, + { + "epoch": 0.8168454935622318, + "grad_norm": 0.404296875, + "learning_rate": 4.77037793857525e-06, + "loss": 2.3979, + "step": 15226 + }, + { + "epoch": 0.8168991416309013, + "grad_norm": 0.494140625, + "learning_rate": 4.770341567026466e-06, + "loss": 2.1568, + "step": 15227 + }, + { + "epoch": 0.8169527896995709, + "grad_norm": 0.439453125, + "learning_rate": 4.770305192736008e-06, + "loss": 2.3209, + "step": 15228 + }, + { + "epoch": 0.8170064377682403, + "grad_norm": 0.470703125, + "learning_rate": 4.770268815703922e-06, + "loss": 2.2676, + "step": 15229 + }, + { + "epoch": 0.8170600858369099, + "grad_norm": 0.5, + "learning_rate": 4.77023243593025e-06, + "loss": 2.3618, + "step": 15230 + }, + { + "epoch": 0.8171137339055794, + "grad_norm": 0.447265625, + "learning_rate": 4.7701960534150365e-06, + "loss": 2.3488, + "step": 15231 + }, + { + "epoch": 0.8171673819742489, + "grad_norm": 0.39453125, + "learning_rate": 4.770159668158326e-06, + "loss": 2.6652, + "step": 15232 + }, + { + "epoch": 0.8172210300429185, + "grad_norm": 0.458984375, + "learning_rate": 4.770123280160163e-06, + "loss": 2.2662, + "step": 15233 + }, + { + "epoch": 0.8172746781115879, + "grad_norm": 0.474609375, + "learning_rate": 4.77008688942059e-06, + "loss": 2.5127, + "step": 15234 + }, + { + "epoch": 0.8173283261802575, + "grad_norm": 0.470703125, + "learning_rate": 4.770050495939651e-06, + "loss": 2.3439, + "step": 15235 + }, + { + "epoch": 0.817381974248927, + "grad_norm": 0.400390625, + "learning_rate": 4.770014099717391e-06, + "loss": 1.9863, + "step": 15236 + }, + { + "epoch": 0.8174356223175966, + "grad_norm": 0.478515625, + "learning_rate": 4.7699777007538535e-06, + "loss": 2.3357, + "step": 15237 + }, + { + "epoch": 0.8174892703862661, + "grad_norm": 0.49609375, + "learning_rate": 4.769941299049082e-06, + "loss": 2.1666, + "step": 15238 + }, + { + "epoch": 0.8175429184549357, + "grad_norm": 0.5078125, + "learning_rate": 4.769904894603121e-06, + "loss": 2.2131, + "step": 15239 + }, + { + "epoch": 0.8175965665236051, + "grad_norm": 0.4609375, + "learning_rate": 4.769868487416015e-06, + "loss": 1.9724, + "step": 15240 + }, + { + "epoch": 0.8176502145922747, + "grad_norm": 0.4296875, + "learning_rate": 4.769832077487807e-06, + "loss": 2.231, + "step": 15241 + }, + { + "epoch": 0.8177038626609442, + "grad_norm": 0.44921875, + "learning_rate": 4.769795664818542e-06, + "loss": 2.2599, + "step": 15242 + }, + { + "epoch": 0.8177575107296138, + "grad_norm": 0.474609375, + "learning_rate": 4.769759249408262e-06, + "loss": 2.4111, + "step": 15243 + }, + { + "epoch": 0.8178111587982833, + "grad_norm": 0.546875, + "learning_rate": 4.769722831257012e-06, + "loss": 2.4881, + "step": 15244 + }, + { + "epoch": 0.8178648068669528, + "grad_norm": 0.470703125, + "learning_rate": 4.769686410364837e-06, + "loss": 2.3946, + "step": 15245 + }, + { + "epoch": 0.8179184549356223, + "grad_norm": 0.88671875, + "learning_rate": 4.7696499867317804e-06, + "loss": 2.3104, + "step": 15246 + }, + { + "epoch": 0.8179721030042918, + "grad_norm": 0.59375, + "learning_rate": 4.769613560357886e-06, + "loss": 2.5074, + "step": 15247 + }, + { + "epoch": 0.8180257510729614, + "grad_norm": 0.46875, + "learning_rate": 4.769577131243197e-06, + "loss": 2.2523, + "step": 15248 + }, + { + "epoch": 0.8180793991416309, + "grad_norm": 0.48828125, + "learning_rate": 4.769540699387759e-06, + "loss": 2.243, + "step": 15249 + }, + { + "epoch": 0.8181330472103004, + "grad_norm": 0.56640625, + "learning_rate": 4.769504264791614e-06, + "loss": 2.2524, + "step": 15250 + }, + { + "epoch": 0.8181866952789699, + "grad_norm": 0.408203125, + "learning_rate": 4.7694678274548085e-06, + "loss": 2.2976, + "step": 15251 + }, + { + "epoch": 0.8182403433476395, + "grad_norm": 0.48046875, + "learning_rate": 4.769431387377385e-06, + "loss": 2.5022, + "step": 15252 + }, + { + "epoch": 0.818293991416309, + "grad_norm": 0.39453125, + "learning_rate": 4.769394944559386e-06, + "loss": 2.0831, + "step": 15253 + }, + { + "epoch": 0.8183476394849786, + "grad_norm": 0.4765625, + "learning_rate": 4.769358499000859e-06, + "loss": 2.1477, + "step": 15254 + }, + { + "epoch": 0.818401287553648, + "grad_norm": 0.375, + "learning_rate": 4.769322050701846e-06, + "loss": 2.0774, + "step": 15255 + }, + { + "epoch": 0.8184549356223176, + "grad_norm": 0.4921875, + "learning_rate": 4.76928559966239e-06, + "loss": 2.2709, + "step": 15256 + }, + { + "epoch": 0.8185085836909871, + "grad_norm": 0.4921875, + "learning_rate": 4.769249145882537e-06, + "loss": 2.421, + "step": 15257 + }, + { + "epoch": 0.8185622317596567, + "grad_norm": 0.33984375, + "learning_rate": 4.76921268936233e-06, + "loss": 2.1379, + "step": 15258 + }, + { + "epoch": 0.8186158798283262, + "grad_norm": 0.462890625, + "learning_rate": 4.769176230101813e-06, + "loss": 2.2871, + "step": 15259 + }, + { + "epoch": 0.8186695278969958, + "grad_norm": 0.3984375, + "learning_rate": 4.769139768101031e-06, + "loss": 2.1887, + "step": 15260 + }, + { + "epoch": 0.8187231759656652, + "grad_norm": 0.59375, + "learning_rate": 4.769103303360027e-06, + "loss": 2.107, + "step": 15261 + }, + { + "epoch": 0.8187768240343347, + "grad_norm": 0.58984375, + "learning_rate": 4.769066835878845e-06, + "loss": 1.3624, + "step": 15262 + }, + { + "epoch": 0.8188304721030043, + "grad_norm": 0.482421875, + "learning_rate": 4.76903036565753e-06, + "loss": 2.3689, + "step": 15263 + }, + { + "epoch": 0.8188841201716738, + "grad_norm": 0.53125, + "learning_rate": 4.7689938926961246e-06, + "loss": 2.3182, + "step": 15264 + }, + { + "epoch": 0.8189377682403434, + "grad_norm": 0.50390625, + "learning_rate": 4.768957416994674e-06, + "loss": 2.3538, + "step": 15265 + }, + { + "epoch": 0.8189914163090128, + "grad_norm": 0.60546875, + "learning_rate": 4.768920938553222e-06, + "loss": 1.62, + "step": 15266 + }, + { + "epoch": 0.8190450643776824, + "grad_norm": 0.443359375, + "learning_rate": 4.768884457371813e-06, + "loss": 2.1049, + "step": 15267 + }, + { + "epoch": 0.8190987124463519, + "grad_norm": 0.443359375, + "learning_rate": 4.7688479734504905e-06, + "loss": 2.1122, + "step": 15268 + }, + { + "epoch": 0.8191523605150215, + "grad_norm": 0.38671875, + "learning_rate": 4.768811486789299e-06, + "loss": 2.2548, + "step": 15269 + }, + { + "epoch": 0.819206008583691, + "grad_norm": 0.4921875, + "learning_rate": 4.768774997388281e-06, + "loss": 2.2323, + "step": 15270 + }, + { + "epoch": 0.8192596566523606, + "grad_norm": 0.4921875, + "learning_rate": 4.7687385052474835e-06, + "loss": 2.3171, + "step": 15271 + }, + { + "epoch": 0.81931330472103, + "grad_norm": 0.41015625, + "learning_rate": 4.7687020103669474e-06, + "loss": 2.0284, + "step": 15272 + }, + { + "epoch": 0.8193669527896996, + "grad_norm": 0.38671875, + "learning_rate": 4.768665512746719e-06, + "loss": 2.1529, + "step": 15273 + }, + { + "epoch": 0.8194206008583691, + "grad_norm": 0.7109375, + "learning_rate": 4.768629012386841e-06, + "loss": 2.2503, + "step": 15274 + }, + { + "epoch": 0.8194742489270386, + "grad_norm": 0.56640625, + "learning_rate": 4.768592509287359e-06, + "loss": 2.0177, + "step": 15275 + }, + { + "epoch": 0.8195278969957082, + "grad_norm": 0.443359375, + "learning_rate": 4.768556003448315e-06, + "loss": 2.2692, + "step": 15276 + }, + { + "epoch": 0.8195815450643776, + "grad_norm": 0.392578125, + "learning_rate": 4.768519494869755e-06, + "loss": 2.3267, + "step": 15277 + }, + { + "epoch": 0.8196351931330472, + "grad_norm": 0.494140625, + "learning_rate": 4.7684829835517225e-06, + "loss": 2.224, + "step": 15278 + }, + { + "epoch": 0.8196888412017167, + "grad_norm": 0.482421875, + "learning_rate": 4.768446469494261e-06, + "loss": 2.2157, + "step": 15279 + }, + { + "epoch": 0.8197424892703863, + "grad_norm": 0.416015625, + "learning_rate": 4.768409952697416e-06, + "loss": 2.4004, + "step": 15280 + }, + { + "epoch": 0.8197961373390558, + "grad_norm": 0.4765625, + "learning_rate": 4.76837343316123e-06, + "loss": 2.3358, + "step": 15281 + }, + { + "epoch": 0.8198497854077254, + "grad_norm": 0.73046875, + "learning_rate": 4.768336910885747e-06, + "loss": 2.3434, + "step": 15282 + }, + { + "epoch": 0.8199034334763948, + "grad_norm": 0.46484375, + "learning_rate": 4.768300385871013e-06, + "loss": 2.4556, + "step": 15283 + }, + { + "epoch": 0.8199570815450644, + "grad_norm": 0.365234375, + "learning_rate": 4.768263858117071e-06, + "loss": 2.0566, + "step": 15284 + }, + { + "epoch": 0.8200107296137339, + "grad_norm": 0.419921875, + "learning_rate": 4.7682273276239635e-06, + "loss": 2.1475, + "step": 15285 + }, + { + "epoch": 0.8200643776824035, + "grad_norm": 0.388671875, + "learning_rate": 4.768190794391737e-06, + "loss": 2.3134, + "step": 15286 + }, + { + "epoch": 0.820118025751073, + "grad_norm": 0.53515625, + "learning_rate": 4.768154258420435e-06, + "loss": 2.2213, + "step": 15287 + }, + { + "epoch": 0.8201716738197425, + "grad_norm": 2.765625, + "learning_rate": 4.768117719710102e-06, + "loss": 2.1864, + "step": 15288 + }, + { + "epoch": 0.820225321888412, + "grad_norm": 0.52734375, + "learning_rate": 4.7680811782607805e-06, + "loss": 2.0093, + "step": 15289 + }, + { + "epoch": 0.8202789699570815, + "grad_norm": 0.4375, + "learning_rate": 4.768044634072516e-06, + "loss": 2.2903, + "step": 15290 + }, + { + "epoch": 0.8203326180257511, + "grad_norm": 0.44921875, + "learning_rate": 4.768008087145353e-06, + "loss": 2.1558, + "step": 15291 + }, + { + "epoch": 0.8203862660944206, + "grad_norm": 0.447265625, + "learning_rate": 4.767971537479335e-06, + "loss": 2.352, + "step": 15292 + }, + { + "epoch": 0.8204399141630901, + "grad_norm": 0.38671875, + "learning_rate": 4.767934985074505e-06, + "loss": 2.3172, + "step": 15293 + }, + { + "epoch": 0.8204935622317596, + "grad_norm": 0.46484375, + "learning_rate": 4.7678984299309085e-06, + "loss": 2.1912, + "step": 15294 + }, + { + "epoch": 0.8205472103004292, + "grad_norm": 0.466796875, + "learning_rate": 4.7678618720485895e-06, + "loss": 2.3458, + "step": 15295 + }, + { + "epoch": 0.8206008583690987, + "grad_norm": 0.427734375, + "learning_rate": 4.767825311427592e-06, + "loss": 2.1984, + "step": 15296 + }, + { + "epoch": 0.8206545064377683, + "grad_norm": 0.4609375, + "learning_rate": 4.767788748067961e-06, + "loss": 2.3722, + "step": 15297 + }, + { + "epoch": 0.8207081545064377, + "grad_norm": 0.48828125, + "learning_rate": 4.767752181969738e-06, + "loss": 2.2805, + "step": 15298 + }, + { + "epoch": 0.8207618025751073, + "grad_norm": 0.51171875, + "learning_rate": 4.7677156131329705e-06, + "loss": 2.2054, + "step": 15299 + }, + { + "epoch": 0.8208154506437768, + "grad_norm": 0.474609375, + "learning_rate": 4.7676790415577e-06, + "loss": 2.3807, + "step": 15300 + }, + { + "epoch": 0.8208690987124464, + "grad_norm": 0.435546875, + "learning_rate": 4.7676424672439725e-06, + "loss": 2.247, + "step": 15301 + }, + { + "epoch": 0.8209227467811159, + "grad_norm": 0.51953125, + "learning_rate": 4.767605890191832e-06, + "loss": 2.4393, + "step": 15302 + }, + { + "epoch": 0.8209763948497855, + "grad_norm": 0.453125, + "learning_rate": 4.767569310401321e-06, + "loss": 2.496, + "step": 15303 + }, + { + "epoch": 0.8210300429184549, + "grad_norm": 0.400390625, + "learning_rate": 4.767532727872485e-06, + "loss": 2.2781, + "step": 15304 + }, + { + "epoch": 0.8210836909871244, + "grad_norm": 0.44921875, + "learning_rate": 4.767496142605369e-06, + "loss": 2.4737, + "step": 15305 + }, + { + "epoch": 0.821137339055794, + "grad_norm": 0.62109375, + "learning_rate": 4.7674595546000145e-06, + "loss": 2.0356, + "step": 15306 + }, + { + "epoch": 0.8211909871244635, + "grad_norm": 0.478515625, + "learning_rate": 4.767422963856469e-06, + "loss": 2.3103, + "step": 15307 + }, + { + "epoch": 0.8212446351931331, + "grad_norm": 0.466796875, + "learning_rate": 4.767386370374774e-06, + "loss": 2.134, + "step": 15308 + }, + { + "epoch": 0.8212982832618025, + "grad_norm": 0.419921875, + "learning_rate": 4.767349774154974e-06, + "loss": 2.0649, + "step": 15309 + }, + { + "epoch": 0.8213519313304721, + "grad_norm": 0.5078125, + "learning_rate": 4.767313175197116e-06, + "loss": 2.3698, + "step": 15310 + }, + { + "epoch": 0.8214055793991416, + "grad_norm": 0.5546875, + "learning_rate": 4.767276573501241e-06, + "loss": 2.1351, + "step": 15311 + }, + { + "epoch": 0.8214592274678112, + "grad_norm": 0.40234375, + "learning_rate": 4.7672399690673945e-06, + "loss": 1.9976, + "step": 15312 + }, + { + "epoch": 0.8215128755364807, + "grad_norm": 0.42578125, + "learning_rate": 4.76720336189562e-06, + "loss": 2.1604, + "step": 15313 + }, + { + "epoch": 0.8215665236051503, + "grad_norm": 0.54296875, + "learning_rate": 4.7671667519859625e-06, + "loss": 2.2694, + "step": 15314 + }, + { + "epoch": 0.8216201716738197, + "grad_norm": 0.51953125, + "learning_rate": 4.7671301393384666e-06, + "loss": 2.4296, + "step": 15315 + }, + { + "epoch": 0.8216738197424893, + "grad_norm": 0.498046875, + "learning_rate": 4.767093523953175e-06, + "loss": 2.4957, + "step": 15316 + }, + { + "epoch": 0.8217274678111588, + "grad_norm": 0.3984375, + "learning_rate": 4.767056905830133e-06, + "loss": 2.1285, + "step": 15317 + }, + { + "epoch": 0.8217811158798283, + "grad_norm": 0.5078125, + "learning_rate": 4.767020284969385e-06, + "loss": 2.3236, + "step": 15318 + }, + { + "epoch": 0.8218347639484979, + "grad_norm": 0.443359375, + "learning_rate": 4.766983661370974e-06, + "loss": 2.0892, + "step": 15319 + }, + { + "epoch": 0.8218884120171673, + "grad_norm": 0.6953125, + "learning_rate": 4.766947035034946e-06, + "loss": 2.4366, + "step": 15320 + }, + { + "epoch": 0.8219420600858369, + "grad_norm": 0.51171875, + "learning_rate": 4.766910405961343e-06, + "loss": 2.1933, + "step": 15321 + }, + { + "epoch": 0.8219957081545064, + "grad_norm": 0.44140625, + "learning_rate": 4.766873774150211e-06, + "loss": 2.2143, + "step": 15322 + }, + { + "epoch": 0.822049356223176, + "grad_norm": 0.404296875, + "learning_rate": 4.766837139601594e-06, + "loss": 2.0645, + "step": 15323 + }, + { + "epoch": 0.8221030042918455, + "grad_norm": 0.443359375, + "learning_rate": 4.766800502315536e-06, + "loss": 2.315, + "step": 15324 + }, + { + "epoch": 0.822156652360515, + "grad_norm": 0.8125, + "learning_rate": 4.766763862292082e-06, + "loss": 2.3956, + "step": 15325 + }, + { + "epoch": 0.8222103004291845, + "grad_norm": 0.48046875, + "learning_rate": 4.7667272195312745e-06, + "loss": 2.2692, + "step": 15326 + }, + { + "epoch": 0.8222639484978541, + "grad_norm": 0.51953125, + "learning_rate": 4.766690574033158e-06, + "loss": 2.4415, + "step": 15327 + }, + { + "epoch": 0.8223175965665236, + "grad_norm": 0.48828125, + "learning_rate": 4.766653925797778e-06, + "loss": 2.1076, + "step": 15328 + }, + { + "epoch": 0.8223712446351932, + "grad_norm": 0.458984375, + "learning_rate": 4.7666172748251795e-06, + "loss": 2.4077, + "step": 15329 + }, + { + "epoch": 0.8224248927038627, + "grad_norm": 0.484375, + "learning_rate": 4.766580621115404e-06, + "loss": 2.3814, + "step": 15330 + }, + { + "epoch": 0.8224785407725322, + "grad_norm": 0.62890625, + "learning_rate": 4.766543964668498e-06, + "loss": 2.2899, + "step": 15331 + }, + { + "epoch": 0.8225321888412017, + "grad_norm": 0.404296875, + "learning_rate": 4.766507305484504e-06, + "loss": 2.228, + "step": 15332 + }, + { + "epoch": 0.8225858369098712, + "grad_norm": 0.443359375, + "learning_rate": 4.766470643563469e-06, + "loss": 2.1317, + "step": 15333 + }, + { + "epoch": 0.8226394849785408, + "grad_norm": 0.7265625, + "learning_rate": 4.766433978905434e-06, + "loss": 2.2609, + "step": 15334 + }, + { + "epoch": 0.8226931330472103, + "grad_norm": 0.453125, + "learning_rate": 4.766397311510446e-06, + "loss": 2.2959, + "step": 15335 + }, + { + "epoch": 0.8227467811158798, + "grad_norm": 0.515625, + "learning_rate": 4.766360641378547e-06, + "loss": 2.2818, + "step": 15336 + }, + { + "epoch": 0.8228004291845493, + "grad_norm": 0.427734375, + "learning_rate": 4.7663239685097835e-06, + "loss": 2.2705, + "step": 15337 + }, + { + "epoch": 0.8228540772532189, + "grad_norm": 0.8359375, + "learning_rate": 4.766287292904198e-06, + "loss": 2.3714, + "step": 15338 + }, + { + "epoch": 0.8229077253218884, + "grad_norm": 0.52734375, + "learning_rate": 4.766250614561836e-06, + "loss": 2.346, + "step": 15339 + }, + { + "epoch": 0.822961373390558, + "grad_norm": 0.625, + "learning_rate": 4.766213933482741e-06, + "loss": 2.7081, + "step": 15340 + }, + { + "epoch": 0.8230150214592274, + "grad_norm": 0.4609375, + "learning_rate": 4.7661772496669564e-06, + "loss": 2.4447, + "step": 15341 + }, + { + "epoch": 0.823068669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.766140563114529e-06, + "loss": 2.1986, + "step": 15342 + }, + { + "epoch": 0.8231223175965665, + "grad_norm": 0.70703125, + "learning_rate": 4.766103873825501e-06, + "loss": 2.1623, + "step": 15343 + }, + { + "epoch": 0.8231759656652361, + "grad_norm": 0.55859375, + "learning_rate": 4.766067181799918e-06, + "loss": 2.2462, + "step": 15344 + }, + { + "epoch": 0.8232296137339056, + "grad_norm": 0.455078125, + "learning_rate": 4.766030487037824e-06, + "loss": 2.4118, + "step": 15345 + }, + { + "epoch": 0.8232832618025752, + "grad_norm": 0.49609375, + "learning_rate": 4.765993789539263e-06, + "loss": 2.2744, + "step": 15346 + }, + { + "epoch": 0.8233369098712446, + "grad_norm": 0.47265625, + "learning_rate": 4.765957089304279e-06, + "loss": 2.2798, + "step": 15347 + }, + { + "epoch": 0.8233905579399141, + "grad_norm": 0.416015625, + "learning_rate": 4.7659203863329174e-06, + "loss": 2.3186, + "step": 15348 + }, + { + "epoch": 0.8234442060085837, + "grad_norm": 0.58203125, + "learning_rate": 4.765883680625221e-06, + "loss": 2.2532, + "step": 15349 + }, + { + "epoch": 0.8234978540772532, + "grad_norm": 0.478515625, + "learning_rate": 4.765846972181235e-06, + "loss": 2.1852, + "step": 15350 + }, + { + "epoch": 0.8235515021459228, + "grad_norm": 0.5, + "learning_rate": 4.765810261001005e-06, + "loss": 2.2109, + "step": 15351 + }, + { + "epoch": 0.8236051502145922, + "grad_norm": 0.4296875, + "learning_rate": 4.765773547084573e-06, + "loss": 2.1995, + "step": 15352 + }, + { + "epoch": 0.8236587982832618, + "grad_norm": 0.546875, + "learning_rate": 4.765736830431984e-06, + "loss": 2.3797, + "step": 15353 + }, + { + "epoch": 0.8237124463519313, + "grad_norm": 0.392578125, + "learning_rate": 4.7657001110432834e-06, + "loss": 2.1544, + "step": 15354 + }, + { + "epoch": 0.8237660944206009, + "grad_norm": 0.451171875, + "learning_rate": 4.765663388918515e-06, + "loss": 2.2685, + "step": 15355 + }, + { + "epoch": 0.8238197424892704, + "grad_norm": 0.41796875, + "learning_rate": 4.765626664057722e-06, + "loss": 2.2873, + "step": 15356 + }, + { + "epoch": 0.82387339055794, + "grad_norm": 0.484375, + "learning_rate": 4.765589936460951e-06, + "loss": 2.3642, + "step": 15357 + }, + { + "epoch": 0.8239270386266094, + "grad_norm": 0.50390625, + "learning_rate": 4.765553206128244e-06, + "loss": 2.3018, + "step": 15358 + }, + { + "epoch": 0.823980686695279, + "grad_norm": 0.72265625, + "learning_rate": 4.765516473059647e-06, + "loss": 2.1141, + "step": 15359 + }, + { + "epoch": 0.8240343347639485, + "grad_norm": 0.416015625, + "learning_rate": 4.765479737255204e-06, + "loss": 2.2647, + "step": 15360 + }, + { + "epoch": 0.824087982832618, + "grad_norm": 0.4765625, + "learning_rate": 4.765442998714959e-06, + "loss": 2.1487, + "step": 15361 + }, + { + "epoch": 0.8241416309012876, + "grad_norm": 0.498046875, + "learning_rate": 4.7654062574389565e-06, + "loss": 2.2573, + "step": 15362 + }, + { + "epoch": 0.824195278969957, + "grad_norm": 0.439453125, + "learning_rate": 4.76536951342724e-06, + "loss": 2.2012, + "step": 15363 + }, + { + "epoch": 0.8242489270386266, + "grad_norm": 0.50390625, + "learning_rate": 4.765332766679856e-06, + "loss": 2.164, + "step": 15364 + }, + { + "epoch": 0.8243025751072961, + "grad_norm": 0.447265625, + "learning_rate": 4.765296017196847e-06, + "loss": 2.1215, + "step": 15365 + }, + { + "epoch": 0.8243562231759657, + "grad_norm": 0.4921875, + "learning_rate": 4.765259264978259e-06, + "loss": 2.5166, + "step": 15366 + }, + { + "epoch": 0.8244098712446352, + "grad_norm": 0.59375, + "learning_rate": 4.765222510024135e-06, + "loss": 2.4165, + "step": 15367 + }, + { + "epoch": 0.8244635193133047, + "grad_norm": 0.455078125, + "learning_rate": 4.765185752334519e-06, + "loss": 1.8234, + "step": 15368 + }, + { + "epoch": 0.8245171673819742, + "grad_norm": 0.58203125, + "learning_rate": 4.7651489919094574e-06, + "loss": 2.3583, + "step": 15369 + }, + { + "epoch": 0.8245708154506438, + "grad_norm": 0.55859375, + "learning_rate": 4.7651122287489925e-06, + "loss": 2.3873, + "step": 15370 + }, + { + "epoch": 0.8246244635193133, + "grad_norm": 0.62890625, + "learning_rate": 4.76507546285317e-06, + "loss": 2.2227, + "step": 15371 + }, + { + "epoch": 0.8246781115879829, + "grad_norm": 0.439453125, + "learning_rate": 4.765038694222034e-06, + "loss": 2.2703, + "step": 15372 + }, + { + "epoch": 0.8247317596566524, + "grad_norm": 0.439453125, + "learning_rate": 4.765001922855629e-06, + "loss": 2.2791, + "step": 15373 + }, + { + "epoch": 0.8247854077253219, + "grad_norm": 0.494140625, + "learning_rate": 4.7649651487539986e-06, + "loss": 2.3653, + "step": 15374 + }, + { + "epoch": 0.8248390557939914, + "grad_norm": 0.609375, + "learning_rate": 4.764928371917188e-06, + "loss": 2.0868, + "step": 15375 + }, + { + "epoch": 0.8248927038626609, + "grad_norm": 0.4765625, + "learning_rate": 4.764891592345241e-06, + "loss": 2.2889, + "step": 15376 + }, + { + "epoch": 0.8249463519313305, + "grad_norm": 0.416015625, + "learning_rate": 4.764854810038203e-06, + "loss": 2.2854, + "step": 15377 + }, + { + "epoch": 0.825, + "grad_norm": 0.51171875, + "learning_rate": 4.764818024996117e-06, + "loss": 2.1827, + "step": 15378 + }, + { + "epoch": 0.8250536480686695, + "grad_norm": 1.3125, + "learning_rate": 4.764781237219029e-06, + "loss": 2.2415, + "step": 15379 + }, + { + "epoch": 0.825107296137339, + "grad_norm": 0.42578125, + "learning_rate": 4.764744446706983e-06, + "loss": 2.2536, + "step": 15380 + }, + { + "epoch": 0.8251609442060086, + "grad_norm": 0.4140625, + "learning_rate": 4.764707653460022e-06, + "loss": 2.1692, + "step": 15381 + }, + { + "epoch": 0.8252145922746781, + "grad_norm": 0.478515625, + "learning_rate": 4.764670857478193e-06, + "loss": 2.2638, + "step": 15382 + }, + { + "epoch": 0.8252682403433477, + "grad_norm": 0.50390625, + "learning_rate": 4.764634058761538e-06, + "loss": 2.2741, + "step": 15383 + }, + { + "epoch": 0.8253218884120171, + "grad_norm": 0.4140625, + "learning_rate": 4.764597257310103e-06, + "loss": 2.0138, + "step": 15384 + }, + { + "epoch": 0.8253755364806867, + "grad_norm": 0.46484375, + "learning_rate": 4.764560453123932e-06, + "loss": 2.2687, + "step": 15385 + }, + { + "epoch": 0.8254291845493562, + "grad_norm": 0.458984375, + "learning_rate": 4.764523646203068e-06, + "loss": 2.3246, + "step": 15386 + }, + { + "epoch": 0.8254828326180258, + "grad_norm": 0.455078125, + "learning_rate": 4.764486836547558e-06, + "loss": 2.2386, + "step": 15387 + }, + { + "epoch": 0.8255364806866953, + "grad_norm": 0.4140625, + "learning_rate": 4.764450024157445e-06, + "loss": 2.2081, + "step": 15388 + }, + { + "epoch": 0.8255901287553649, + "grad_norm": 0.4921875, + "learning_rate": 4.764413209032773e-06, + "loss": 2.1777, + "step": 15389 + }, + { + "epoch": 0.8256437768240343, + "grad_norm": 0.443359375, + "learning_rate": 4.764376391173588e-06, + "loss": 2.36, + "step": 15390 + }, + { + "epoch": 0.8256974248927038, + "grad_norm": 0.466796875, + "learning_rate": 4.7643395705799326e-06, + "loss": 2.4947, + "step": 15391 + }, + { + "epoch": 0.8257510729613734, + "grad_norm": 0.5078125, + "learning_rate": 4.7643027472518536e-06, + "loss": 2.3288, + "step": 15392 + }, + { + "epoch": 0.8258047210300429, + "grad_norm": 0.458984375, + "learning_rate": 4.764265921189393e-06, + "loss": 2.1154, + "step": 15393 + }, + { + "epoch": 0.8258583690987125, + "grad_norm": 0.486328125, + "learning_rate": 4.7642290923925965e-06, + "loss": 2.2685, + "step": 15394 + }, + { + "epoch": 0.8259120171673819, + "grad_norm": 0.40625, + "learning_rate": 4.764192260861509e-06, + "loss": 2.1519, + "step": 15395 + }, + { + "epoch": 0.8259656652360515, + "grad_norm": 0.466796875, + "learning_rate": 4.764155426596174e-06, + "loss": 2.2984, + "step": 15396 + }, + { + "epoch": 0.826019313304721, + "grad_norm": 0.4453125, + "learning_rate": 4.764118589596637e-06, + "loss": 2.2026, + "step": 15397 + }, + { + "epoch": 0.8260729613733906, + "grad_norm": 0.5703125, + "learning_rate": 4.764081749862941e-06, + "loss": 2.236, + "step": 15398 + }, + { + "epoch": 0.8261266094420601, + "grad_norm": 0.4375, + "learning_rate": 4.7640449073951315e-06, + "loss": 2.4739, + "step": 15399 + }, + { + "epoch": 0.8261802575107297, + "grad_norm": 0.5546875, + "learning_rate": 4.764008062193254e-06, + "loss": 2.6317, + "step": 15400 + }, + { + "epoch": 0.8262339055793991, + "grad_norm": 0.609375, + "learning_rate": 4.763971214257351e-06, + "loss": 2.0816, + "step": 15401 + }, + { + "epoch": 0.8262875536480687, + "grad_norm": 0.50390625, + "learning_rate": 4.763934363587468e-06, + "loss": 2.3848, + "step": 15402 + }, + { + "epoch": 0.8263412017167382, + "grad_norm": 1.4765625, + "learning_rate": 4.763897510183649e-06, + "loss": 2.2667, + "step": 15403 + }, + { + "epoch": 0.8263948497854077, + "grad_norm": 0.5, + "learning_rate": 4.763860654045939e-06, + "loss": 2.2832, + "step": 15404 + }, + { + "epoch": 0.8264484978540773, + "grad_norm": 0.390625, + "learning_rate": 4.763823795174383e-06, + "loss": 2.0142, + "step": 15405 + }, + { + "epoch": 0.8265021459227467, + "grad_norm": 0.494140625, + "learning_rate": 4.763786933569025e-06, + "loss": 2.3898, + "step": 15406 + }, + { + "epoch": 0.8265557939914163, + "grad_norm": 0.47265625, + "learning_rate": 4.7637500692299085e-06, + "loss": 2.1925, + "step": 15407 + }, + { + "epoch": 0.8266094420600858, + "grad_norm": 0.4453125, + "learning_rate": 4.763713202157079e-06, + "loss": 2.1494, + "step": 15408 + }, + { + "epoch": 0.8266630901287554, + "grad_norm": 0.6015625, + "learning_rate": 4.763676332350582e-06, + "loss": 1.8301, + "step": 15409 + }, + { + "epoch": 0.8267167381974249, + "grad_norm": 0.419921875, + "learning_rate": 4.7636394598104594e-06, + "loss": 2.1894, + "step": 15410 + }, + { + "epoch": 0.8267703862660944, + "grad_norm": 0.38671875, + "learning_rate": 4.763602584536759e-06, + "loss": 2.1971, + "step": 15411 + }, + { + "epoch": 0.8268240343347639, + "grad_norm": 0.4765625, + "learning_rate": 4.763565706529523e-06, + "loss": 2.3654, + "step": 15412 + }, + { + "epoch": 0.8268776824034335, + "grad_norm": 0.498046875, + "learning_rate": 4.763528825788796e-06, + "loss": 2.3457, + "step": 15413 + }, + { + "epoch": 0.826931330472103, + "grad_norm": 0.58203125, + "learning_rate": 4.763491942314624e-06, + "loss": 2.1926, + "step": 15414 + }, + { + "epoch": 0.8269849785407726, + "grad_norm": 0.431640625, + "learning_rate": 4.76345505610705e-06, + "loss": 2.3304, + "step": 15415 + }, + { + "epoch": 0.827038626609442, + "grad_norm": 0.396484375, + "learning_rate": 4.76341816716612e-06, + "loss": 2.3595, + "step": 15416 + }, + { + "epoch": 0.8270922746781116, + "grad_norm": 0.453125, + "learning_rate": 4.763381275491876e-06, + "loss": 2.2406, + "step": 15417 + }, + { + "epoch": 0.8271459227467811, + "grad_norm": 0.44140625, + "learning_rate": 4.763344381084366e-06, + "loss": 2.1637, + "step": 15418 + }, + { + "epoch": 0.8271995708154506, + "grad_norm": 0.6015625, + "learning_rate": 4.763307483943632e-06, + "loss": 2.2737, + "step": 15419 + }, + { + "epoch": 0.8272532188841202, + "grad_norm": 0.462890625, + "learning_rate": 4.76327058406972e-06, + "loss": 2.0246, + "step": 15420 + }, + { + "epoch": 0.8273068669527897, + "grad_norm": 0.5078125, + "learning_rate": 4.763233681462673e-06, + "loss": 2.216, + "step": 15421 + }, + { + "epoch": 0.8273605150214592, + "grad_norm": 0.4140625, + "learning_rate": 4.7631967761225374e-06, + "loss": 2.2918, + "step": 15422 + }, + { + "epoch": 0.8274141630901287, + "grad_norm": 0.412109375, + "learning_rate": 4.763159868049357e-06, + "loss": 2.2496, + "step": 15423 + }, + { + "epoch": 0.8274678111587983, + "grad_norm": 0.421875, + "learning_rate": 4.763122957243176e-06, + "loss": 2.3686, + "step": 15424 + }, + { + "epoch": 0.8275214592274678, + "grad_norm": 0.486328125, + "learning_rate": 4.763086043704039e-06, + "loss": 2.257, + "step": 15425 + }, + { + "epoch": 0.8275751072961374, + "grad_norm": 0.453125, + "learning_rate": 4.763049127431991e-06, + "loss": 2.1786, + "step": 15426 + }, + { + "epoch": 0.8276287553648068, + "grad_norm": 0.5078125, + "learning_rate": 4.763012208427076e-06, + "loss": 2.2073, + "step": 15427 + }, + { + "epoch": 0.8276824034334764, + "grad_norm": 0.408203125, + "learning_rate": 4.7629752866893394e-06, + "loss": 2.1874, + "step": 15428 + }, + { + "epoch": 0.8277360515021459, + "grad_norm": 0.455078125, + "learning_rate": 4.762938362218825e-06, + "loss": 2.2544, + "step": 15429 + }, + { + "epoch": 0.8277896995708155, + "grad_norm": 0.61328125, + "learning_rate": 4.762901435015579e-06, + "loss": 2.132, + "step": 15430 + }, + { + "epoch": 0.827843347639485, + "grad_norm": 0.3515625, + "learning_rate": 4.762864505079643e-06, + "loss": 2.1137, + "step": 15431 + }, + { + "epoch": 0.8278969957081546, + "grad_norm": 0.51171875, + "learning_rate": 4.762827572411064e-06, + "loss": 2.1943, + "step": 15432 + }, + { + "epoch": 0.827950643776824, + "grad_norm": 0.40625, + "learning_rate": 4.762790637009887e-06, + "loss": 2.2847, + "step": 15433 + }, + { + "epoch": 0.8280042918454935, + "grad_norm": 0.4296875, + "learning_rate": 4.762753698876154e-06, + "loss": 2.3425, + "step": 15434 + }, + { + "epoch": 0.8280579399141631, + "grad_norm": 0.38671875, + "learning_rate": 4.762716758009912e-06, + "loss": 1.6981, + "step": 15435 + }, + { + "epoch": 0.8281115879828326, + "grad_norm": 0.42578125, + "learning_rate": 4.762679814411204e-06, + "loss": 2.145, + "step": 15436 + }, + { + "epoch": 0.8281652360515022, + "grad_norm": 0.466796875, + "learning_rate": 4.762642868080076e-06, + "loss": 2.1504, + "step": 15437 + }, + { + "epoch": 0.8282188841201716, + "grad_norm": 0.4921875, + "learning_rate": 4.762605919016572e-06, + "loss": 2.1786, + "step": 15438 + }, + { + "epoch": 0.8282725321888412, + "grad_norm": 0.490234375, + "learning_rate": 4.7625689672207365e-06, + "loss": 2.3317, + "step": 15439 + }, + { + "epoch": 0.8283261802575107, + "grad_norm": 0.46484375, + "learning_rate": 4.762532012692615e-06, + "loss": 2.1406, + "step": 15440 + }, + { + "epoch": 0.8283798283261803, + "grad_norm": 0.65625, + "learning_rate": 4.76249505543225e-06, + "loss": 2.3714, + "step": 15441 + }, + { + "epoch": 0.8284334763948498, + "grad_norm": 0.5, + "learning_rate": 4.762458095439688e-06, + "loss": 2.3649, + "step": 15442 + }, + { + "epoch": 0.8284871244635194, + "grad_norm": 0.431640625, + "learning_rate": 4.762421132714973e-06, + "loss": 2.4963, + "step": 15443 + }, + { + "epoch": 0.8285407725321888, + "grad_norm": 0.39453125, + "learning_rate": 4.7623841672581504e-06, + "loss": 1.9692, + "step": 15444 + }, + { + "epoch": 0.8285944206008584, + "grad_norm": 0.45703125, + "learning_rate": 4.7623471990692635e-06, + "loss": 2.2365, + "step": 15445 + }, + { + "epoch": 0.8286480686695279, + "grad_norm": 0.36328125, + "learning_rate": 4.762310228148358e-06, + "loss": 1.7543, + "step": 15446 + }, + { + "epoch": 0.8287017167381975, + "grad_norm": 0.41796875, + "learning_rate": 4.762273254495479e-06, + "loss": 2.1763, + "step": 15447 + }, + { + "epoch": 0.828755364806867, + "grad_norm": 0.453125, + "learning_rate": 4.762236278110669e-06, + "loss": 2.4123, + "step": 15448 + }, + { + "epoch": 0.8288090128755364, + "grad_norm": 0.423828125, + "learning_rate": 4.762199298993974e-06, + "loss": 2.185, + "step": 15449 + }, + { + "epoch": 0.828862660944206, + "grad_norm": 0.490234375, + "learning_rate": 4.762162317145439e-06, + "loss": 1.801, + "step": 15450 + }, + { + "epoch": 0.8289163090128755, + "grad_norm": 0.431640625, + "learning_rate": 4.762125332565109e-06, + "loss": 2.2621, + "step": 15451 + }, + { + "epoch": 0.8289699570815451, + "grad_norm": 0.453125, + "learning_rate": 4.762088345253027e-06, + "loss": 2.26, + "step": 15452 + }, + { + "epoch": 0.8290236051502146, + "grad_norm": 0.43359375, + "learning_rate": 4.762051355209239e-06, + "loss": 2.2418, + "step": 15453 + }, + { + "epoch": 0.8290772532188841, + "grad_norm": 0.42578125, + "learning_rate": 4.76201436243379e-06, + "loss": 2.0396, + "step": 15454 + }, + { + "epoch": 0.8291309012875536, + "grad_norm": 0.5, + "learning_rate": 4.761977366926722e-06, + "loss": 2.2187, + "step": 15455 + }, + { + "epoch": 0.8291845493562232, + "grad_norm": 0.51953125, + "learning_rate": 4.761940368688084e-06, + "loss": 2.2082, + "step": 15456 + }, + { + "epoch": 0.8292381974248927, + "grad_norm": 2.046875, + "learning_rate": 4.761903367717917e-06, + "loss": 2.4708, + "step": 15457 + }, + { + "epoch": 0.8292918454935623, + "grad_norm": 0.498046875, + "learning_rate": 4.761866364016268e-06, + "loss": 2.3786, + "step": 15458 + }, + { + "epoch": 0.8293454935622318, + "grad_norm": 0.416015625, + "learning_rate": 4.76182935758318e-06, + "loss": 2.2404, + "step": 15459 + }, + { + "epoch": 0.8293991416309013, + "grad_norm": 0.43359375, + "learning_rate": 4.761792348418698e-06, + "loss": 2.2665, + "step": 15460 + }, + { + "epoch": 0.8294527896995708, + "grad_norm": 0.6171875, + "learning_rate": 4.761755336522867e-06, + "loss": 2.2251, + "step": 15461 + }, + { + "epoch": 0.8295064377682403, + "grad_norm": 1.046875, + "learning_rate": 4.761718321895733e-06, + "loss": 2.4228, + "step": 15462 + }, + { + "epoch": 0.8295600858369099, + "grad_norm": 0.51953125, + "learning_rate": 4.761681304537339e-06, + "loss": 2.4983, + "step": 15463 + }, + { + "epoch": 0.8296137339055794, + "grad_norm": 0.46484375, + "learning_rate": 4.761644284447729e-06, + "loss": 2.4488, + "step": 15464 + }, + { + "epoch": 0.8296673819742489, + "grad_norm": 0.5703125, + "learning_rate": 4.76160726162695e-06, + "loss": 2.5004, + "step": 15465 + }, + { + "epoch": 0.8297210300429184, + "grad_norm": 0.443359375, + "learning_rate": 4.761570236075046e-06, + "loss": 2.3599, + "step": 15466 + }, + { + "epoch": 0.829774678111588, + "grad_norm": 0.44140625, + "learning_rate": 4.7615332077920604e-06, + "loss": 2.1454, + "step": 15467 + }, + { + "epoch": 0.8298283261802575, + "grad_norm": 0.44140625, + "learning_rate": 4.7614961767780385e-06, + "loss": 2.4706, + "step": 15468 + }, + { + "epoch": 0.8298819742489271, + "grad_norm": 0.62890625, + "learning_rate": 4.7614591430330264e-06, + "loss": 2.2957, + "step": 15469 + }, + { + "epoch": 0.8299356223175965, + "grad_norm": 0.5078125, + "learning_rate": 4.7614221065570675e-06, + "loss": 2.0807, + "step": 15470 + }, + { + "epoch": 0.8299892703862661, + "grad_norm": 0.4921875, + "learning_rate": 4.761385067350207e-06, + "loss": 2.1495, + "step": 15471 + }, + { + "epoch": 0.8300429184549356, + "grad_norm": 0.71875, + "learning_rate": 4.761348025412489e-06, + "loss": 2.0896, + "step": 15472 + }, + { + "epoch": 0.8300965665236052, + "grad_norm": 0.42578125, + "learning_rate": 4.7613109807439584e-06, + "loss": 2.1737, + "step": 15473 + }, + { + "epoch": 0.8301502145922747, + "grad_norm": 0.6328125, + "learning_rate": 4.76127393334466e-06, + "loss": 2.3746, + "step": 15474 + }, + { + "epoch": 0.8302038626609443, + "grad_norm": 0.46875, + "learning_rate": 4.761236883214639e-06, + "loss": 2.1082, + "step": 15475 + }, + { + "epoch": 0.8302575107296137, + "grad_norm": 0.51953125, + "learning_rate": 4.761199830353941e-06, + "loss": 2.4486, + "step": 15476 + }, + { + "epoch": 0.8303111587982832, + "grad_norm": 0.44140625, + "learning_rate": 4.7611627747626084e-06, + "loss": 2.3838, + "step": 15477 + }, + { + "epoch": 0.8303648068669528, + "grad_norm": 0.455078125, + "learning_rate": 4.761125716440687e-06, + "loss": 2.4361, + "step": 15478 + }, + { + "epoch": 0.8304184549356223, + "grad_norm": 0.4609375, + "learning_rate": 4.761088655388223e-06, + "loss": 2.1332, + "step": 15479 + }, + { + "epoch": 0.8304721030042919, + "grad_norm": 0.5234375, + "learning_rate": 4.761051591605259e-06, + "loss": 2.6931, + "step": 15480 + }, + { + "epoch": 0.8305257510729613, + "grad_norm": 0.412109375, + "learning_rate": 4.76101452509184e-06, + "loss": 2.4599, + "step": 15481 + }, + { + "epoch": 0.8305793991416309, + "grad_norm": 0.5, + "learning_rate": 4.760977455848013e-06, + "loss": 1.5384, + "step": 15482 + }, + { + "epoch": 0.8306330472103004, + "grad_norm": 0.451171875, + "learning_rate": 4.76094038387382e-06, + "loss": 2.2435, + "step": 15483 + }, + { + "epoch": 0.83068669527897, + "grad_norm": 0.59375, + "learning_rate": 4.760903309169307e-06, + "loss": 2.3353, + "step": 15484 + }, + { + "epoch": 0.8307403433476395, + "grad_norm": 0.4296875, + "learning_rate": 4.760866231734519e-06, + "loss": 2.2715, + "step": 15485 + }, + { + "epoch": 0.830793991416309, + "grad_norm": 0.380859375, + "learning_rate": 4.7608291515695e-06, + "loss": 2.4145, + "step": 15486 + }, + { + "epoch": 0.8308476394849785, + "grad_norm": 0.4140625, + "learning_rate": 4.760792068674296e-06, + "loss": 2.5204, + "step": 15487 + }, + { + "epoch": 0.8309012875536481, + "grad_norm": 0.90625, + "learning_rate": 4.760754983048949e-06, + "loss": 2.4537, + "step": 15488 + }, + { + "epoch": 0.8309549356223176, + "grad_norm": 0.44921875, + "learning_rate": 4.760717894693508e-06, + "loss": 2.3976, + "step": 15489 + }, + { + "epoch": 0.8310085836909872, + "grad_norm": 0.4140625, + "learning_rate": 4.7606808036080154e-06, + "loss": 2.1567, + "step": 15490 + }, + { + "epoch": 0.8310622317596567, + "grad_norm": 0.455078125, + "learning_rate": 4.760643709792515e-06, + "loss": 1.8012, + "step": 15491 + }, + { + "epoch": 0.8311158798283261, + "grad_norm": 0.765625, + "learning_rate": 4.760606613247054e-06, + "loss": 2.3484, + "step": 15492 + }, + { + "epoch": 0.8311695278969957, + "grad_norm": 0.51171875, + "learning_rate": 4.760569513971676e-06, + "loss": 2.2702, + "step": 15493 + }, + { + "epoch": 0.8312231759656652, + "grad_norm": 0.4375, + "learning_rate": 4.760532411966425e-06, + "loss": 2.4532, + "step": 15494 + }, + { + "epoch": 0.8312768240343348, + "grad_norm": 0.375, + "learning_rate": 4.760495307231346e-06, + "loss": 2.1451, + "step": 15495 + }, + { + "epoch": 0.8313304721030043, + "grad_norm": 0.44140625, + "learning_rate": 4.760458199766485e-06, + "loss": 1.8767, + "step": 15496 + }, + { + "epoch": 0.8313841201716738, + "grad_norm": 0.466796875, + "learning_rate": 4.760421089571887e-06, + "loss": 2.3781, + "step": 15497 + }, + { + "epoch": 0.8314377682403433, + "grad_norm": 0.51953125, + "learning_rate": 4.760383976647595e-06, + "loss": 2.5758, + "step": 15498 + }, + { + "epoch": 0.8314914163090129, + "grad_norm": 0.53125, + "learning_rate": 4.760346860993656e-06, + "loss": 2.281, + "step": 15499 + }, + { + "epoch": 0.8315450643776824, + "grad_norm": 0.40234375, + "learning_rate": 4.760309742610113e-06, + "loss": 2.4509, + "step": 15500 + }, + { + "epoch": 0.831598712446352, + "grad_norm": 1.1015625, + "learning_rate": 4.7602726214970115e-06, + "loss": 2.4119, + "step": 15501 + }, + { + "epoch": 0.8316523605150214, + "grad_norm": 0.5, + "learning_rate": 4.7602354976543966e-06, + "loss": 2.4264, + "step": 15502 + }, + { + "epoch": 0.831706008583691, + "grad_norm": 0.474609375, + "learning_rate": 4.760198371082312e-06, + "loss": 2.3562, + "step": 15503 + }, + { + "epoch": 0.8317596566523605, + "grad_norm": 0.50390625, + "learning_rate": 4.760161241780804e-06, + "loss": 2.2693, + "step": 15504 + }, + { + "epoch": 0.83181330472103, + "grad_norm": 0.52734375, + "learning_rate": 4.760124109749918e-06, + "loss": 2.2613, + "step": 15505 + }, + { + "epoch": 0.8318669527896996, + "grad_norm": 0.50390625, + "learning_rate": 4.760086974989695e-06, + "loss": 2.2874, + "step": 15506 + }, + { + "epoch": 0.831920600858369, + "grad_norm": 0.72265625, + "learning_rate": 4.760049837500184e-06, + "loss": 2.3304, + "step": 15507 + }, + { + "epoch": 0.8319742489270386, + "grad_norm": 0.7734375, + "learning_rate": 4.760012697281428e-06, + "loss": 2.2315, + "step": 15508 + }, + { + "epoch": 0.8320278969957081, + "grad_norm": 0.412109375, + "learning_rate": 4.759975554333473e-06, + "loss": 2.3139, + "step": 15509 + }, + { + "epoch": 0.8320815450643777, + "grad_norm": 0.4921875, + "learning_rate": 4.759938408656363e-06, + "loss": 2.3698, + "step": 15510 + }, + { + "epoch": 0.8321351931330472, + "grad_norm": 0.48828125, + "learning_rate": 4.759901260250141e-06, + "loss": 2.4528, + "step": 15511 + }, + { + "epoch": 0.8321888412017168, + "grad_norm": 0.5078125, + "learning_rate": 4.759864109114855e-06, + "loss": 2.2806, + "step": 15512 + }, + { + "epoch": 0.8322424892703862, + "grad_norm": 0.498046875, + "learning_rate": 4.759826955250549e-06, + "loss": 2.5171, + "step": 15513 + }, + { + "epoch": 0.8322961373390558, + "grad_norm": 0.359375, + "learning_rate": 4.759789798657268e-06, + "loss": 2.1315, + "step": 15514 + }, + { + "epoch": 0.8323497854077253, + "grad_norm": 0.515625, + "learning_rate": 4.759752639335055e-06, + "loss": 2.4621, + "step": 15515 + }, + { + "epoch": 0.8324034334763949, + "grad_norm": 0.494140625, + "learning_rate": 4.759715477283957e-06, + "loss": 2.2541, + "step": 15516 + }, + { + "epoch": 0.8324570815450644, + "grad_norm": 0.419921875, + "learning_rate": 4.759678312504019e-06, + "loss": 2.1583, + "step": 15517 + }, + { + "epoch": 0.832510729613734, + "grad_norm": 0.58203125, + "learning_rate": 4.759641144995284e-06, + "loss": 2.0785, + "step": 15518 + }, + { + "epoch": 0.8325643776824034, + "grad_norm": 0.59765625, + "learning_rate": 4.759603974757797e-06, + "loss": 2.1221, + "step": 15519 + }, + { + "epoch": 0.8326180257510729, + "grad_norm": 0.50390625, + "learning_rate": 4.759566801791605e-06, + "loss": 2.3379, + "step": 15520 + }, + { + "epoch": 0.8326716738197425, + "grad_norm": 0.59765625, + "learning_rate": 4.759529626096752e-06, + "loss": 1.0114, + "step": 15521 + }, + { + "epoch": 0.832725321888412, + "grad_norm": 0.70703125, + "learning_rate": 4.759492447673282e-06, + "loss": 2.3815, + "step": 15522 + }, + { + "epoch": 0.8327789699570816, + "grad_norm": 0.453125, + "learning_rate": 4.75945526652124e-06, + "loss": 2.2158, + "step": 15523 + }, + { + "epoch": 0.832832618025751, + "grad_norm": 0.5078125, + "learning_rate": 4.759418082640671e-06, + "loss": 2.2529, + "step": 15524 + }, + { + "epoch": 0.8328862660944206, + "grad_norm": 0.4921875, + "learning_rate": 4.759380896031621e-06, + "loss": 2.147, + "step": 15525 + }, + { + "epoch": 0.8329399141630901, + "grad_norm": 0.4921875, + "learning_rate": 4.7593437066941344e-06, + "loss": 2.1088, + "step": 15526 + }, + { + "epoch": 0.8329935622317597, + "grad_norm": 0.470703125, + "learning_rate": 4.759306514628255e-06, + "loss": 2.2916, + "step": 15527 + }, + { + "epoch": 0.8330472103004292, + "grad_norm": 0.50390625, + "learning_rate": 4.75926931983403e-06, + "loss": 2.439, + "step": 15528 + }, + { + "epoch": 0.8331008583690988, + "grad_norm": 0.51171875, + "learning_rate": 4.759232122311503e-06, + "loss": 2.3838, + "step": 15529 + }, + { + "epoch": 0.8331545064377682, + "grad_norm": 0.447265625, + "learning_rate": 4.759194922060717e-06, + "loss": 2.0787, + "step": 15530 + }, + { + "epoch": 0.8332081545064378, + "grad_norm": 0.416015625, + "learning_rate": 4.75915771908172e-06, + "loss": 2.3871, + "step": 15531 + }, + { + "epoch": 0.8332618025751073, + "grad_norm": 0.52734375, + "learning_rate": 4.759120513374555e-06, + "loss": 2.5711, + "step": 15532 + }, + { + "epoch": 0.8333154506437769, + "grad_norm": 0.63671875, + "learning_rate": 4.759083304939268e-06, + "loss": 1.7148, + "step": 15533 + }, + { + "epoch": 0.8333690987124464, + "grad_norm": 0.515625, + "learning_rate": 4.759046093775903e-06, + "loss": 2.245, + "step": 15534 + }, + { + "epoch": 0.8334227467811158, + "grad_norm": 0.3828125, + "learning_rate": 4.759008879884507e-06, + "loss": 1.7036, + "step": 15535 + }, + { + "epoch": 0.8334763948497854, + "grad_norm": 0.4765625, + "learning_rate": 4.758971663265122e-06, + "loss": 2.2422, + "step": 15536 + }, + { + "epoch": 0.8335300429184549, + "grad_norm": 0.423828125, + "learning_rate": 4.758934443917794e-06, + "loss": 2.1646, + "step": 15537 + }, + { + "epoch": 0.8335836909871245, + "grad_norm": 0.484375, + "learning_rate": 4.7588972218425685e-06, + "loss": 2.0599, + "step": 15538 + }, + { + "epoch": 0.833637339055794, + "grad_norm": 0.431640625, + "learning_rate": 4.758859997039491e-06, + "loss": 2.1375, + "step": 15539 + }, + { + "epoch": 0.8336909871244635, + "grad_norm": 0.486328125, + "learning_rate": 4.758822769508606e-06, + "loss": 2.366, + "step": 15540 + }, + { + "epoch": 0.833744635193133, + "grad_norm": 0.408203125, + "learning_rate": 4.758785539249957e-06, + "loss": 2.1596, + "step": 15541 + }, + { + "epoch": 0.8337982832618026, + "grad_norm": 0.447265625, + "learning_rate": 4.758748306263591e-06, + "loss": 2.2048, + "step": 15542 + }, + { + "epoch": 0.8338519313304721, + "grad_norm": 0.369140625, + "learning_rate": 4.758711070549552e-06, + "loss": 2.1128, + "step": 15543 + }, + { + "epoch": 0.8339055793991417, + "grad_norm": 0.462890625, + "learning_rate": 4.7586738321078855e-06, + "loss": 2.4783, + "step": 15544 + }, + { + "epoch": 0.8339592274678111, + "grad_norm": 0.447265625, + "learning_rate": 4.758636590938635e-06, + "loss": 2.3139, + "step": 15545 + }, + { + "epoch": 0.8340128755364807, + "grad_norm": 0.427734375, + "learning_rate": 4.758599347041847e-06, + "loss": 2.07, + "step": 15546 + }, + { + "epoch": 0.8340665236051502, + "grad_norm": 0.44140625, + "learning_rate": 4.758562100417565e-06, + "loss": 2.1843, + "step": 15547 + }, + { + "epoch": 0.8341201716738197, + "grad_norm": 0.412109375, + "learning_rate": 4.758524851065836e-06, + "loss": 2.1831, + "step": 15548 + }, + { + "epoch": 0.8341738197424893, + "grad_norm": 0.451171875, + "learning_rate": 4.758487598986704e-06, + "loss": 2.1761, + "step": 15549 + }, + { + "epoch": 0.8342274678111588, + "grad_norm": 0.484375, + "learning_rate": 4.758450344180214e-06, + "loss": 2.1873, + "step": 15550 + }, + { + "epoch": 0.8342811158798283, + "grad_norm": 0.439453125, + "learning_rate": 4.758413086646409e-06, + "loss": 2.1664, + "step": 15551 + }, + { + "epoch": 0.8343347639484978, + "grad_norm": 0.5859375, + "learning_rate": 4.758375826385338e-06, + "loss": 2.3695, + "step": 15552 + }, + { + "epoch": 0.8343884120171674, + "grad_norm": 0.361328125, + "learning_rate": 4.758338563397043e-06, + "loss": 2.2688, + "step": 15553 + }, + { + "epoch": 0.8344420600858369, + "grad_norm": 0.458984375, + "learning_rate": 4.758301297681571e-06, + "loss": 2.5159, + "step": 15554 + }, + { + "epoch": 0.8344957081545065, + "grad_norm": 0.55078125, + "learning_rate": 4.758264029238965e-06, + "loss": 2.238, + "step": 15555 + }, + { + "epoch": 0.8345493562231759, + "grad_norm": 0.41015625, + "learning_rate": 4.758226758069271e-06, + "loss": 2.2809, + "step": 15556 + }, + { + "epoch": 0.8346030042918455, + "grad_norm": 0.59765625, + "learning_rate": 4.758189484172534e-06, + "loss": 1.9762, + "step": 15557 + }, + { + "epoch": 0.834656652360515, + "grad_norm": 0.423828125, + "learning_rate": 4.758152207548798e-06, + "loss": 2.2581, + "step": 15558 + }, + { + "epoch": 0.8347103004291846, + "grad_norm": 0.416015625, + "learning_rate": 4.75811492819811e-06, + "loss": 2.2306, + "step": 15559 + }, + { + "epoch": 0.8347639484978541, + "grad_norm": 0.400390625, + "learning_rate": 4.758077646120514e-06, + "loss": 2.4212, + "step": 15560 + }, + { + "epoch": 0.8348175965665237, + "grad_norm": 0.412109375, + "learning_rate": 4.7580403613160545e-06, + "loss": 2.2795, + "step": 15561 + }, + { + "epoch": 0.8348712446351931, + "grad_norm": 0.421875, + "learning_rate": 4.758003073784777e-06, + "loss": 2.5594, + "step": 15562 + }, + { + "epoch": 0.8349248927038626, + "grad_norm": 0.435546875, + "learning_rate": 4.757965783526727e-06, + "loss": 2.2616, + "step": 15563 + }, + { + "epoch": 0.8349785407725322, + "grad_norm": 0.4453125, + "learning_rate": 4.757928490541949e-06, + "loss": 2.4127, + "step": 15564 + }, + { + "epoch": 0.8350321888412017, + "grad_norm": 0.484375, + "learning_rate": 4.757891194830488e-06, + "loss": 2.2546, + "step": 15565 + }, + { + "epoch": 0.8350858369098713, + "grad_norm": 0.41796875, + "learning_rate": 4.757853896392388e-06, + "loss": 2.5716, + "step": 15566 + }, + { + "epoch": 0.8351394849785407, + "grad_norm": 0.33203125, + "learning_rate": 4.757816595227697e-06, + "loss": 2.2153, + "step": 15567 + }, + { + "epoch": 0.8351931330472103, + "grad_norm": 0.408203125, + "learning_rate": 4.757779291336456e-06, + "loss": 2.1351, + "step": 15568 + }, + { + "epoch": 0.8352467811158798, + "grad_norm": 0.56640625, + "learning_rate": 4.757741984718714e-06, + "loss": 2.3528, + "step": 15569 + }, + { + "epoch": 0.8353004291845494, + "grad_norm": 0.4453125, + "learning_rate": 4.757704675374513e-06, + "loss": 1.6626, + "step": 15570 + }, + { + "epoch": 0.8353540772532189, + "grad_norm": 0.51953125, + "learning_rate": 4.7576673633039015e-06, + "loss": 2.2591, + "step": 15571 + }, + { + "epoch": 0.8354077253218885, + "grad_norm": 0.51171875, + "learning_rate": 4.757630048506921e-06, + "loss": 1.9719, + "step": 15572 + }, + { + "epoch": 0.8354613733905579, + "grad_norm": 0.50390625, + "learning_rate": 4.757592730983618e-06, + "loss": 2.1078, + "step": 15573 + }, + { + "epoch": 0.8355150214592275, + "grad_norm": 0.478515625, + "learning_rate": 4.7575554107340386e-06, + "loss": 2.3931, + "step": 15574 + }, + { + "epoch": 0.835568669527897, + "grad_norm": 0.482421875, + "learning_rate": 4.757518087758226e-06, + "loss": 2.3844, + "step": 15575 + }, + { + "epoch": 0.8356223175965666, + "grad_norm": 0.49609375, + "learning_rate": 4.757480762056226e-06, + "loss": 2.4034, + "step": 15576 + }, + { + "epoch": 0.835675965665236, + "grad_norm": 0.42578125, + "learning_rate": 4.757443433628083e-06, + "loss": 2.4203, + "step": 15577 + }, + { + "epoch": 0.8357296137339055, + "grad_norm": 0.47265625, + "learning_rate": 4.757406102473844e-06, + "loss": 2.4865, + "step": 15578 + }, + { + "epoch": 0.8357832618025751, + "grad_norm": 0.5, + "learning_rate": 4.757368768593553e-06, + "loss": 2.3864, + "step": 15579 + }, + { + "epoch": 0.8358369098712446, + "grad_norm": 0.44140625, + "learning_rate": 4.7573314319872545e-06, + "loss": 2.4628, + "step": 15580 + }, + { + "epoch": 0.8358905579399142, + "grad_norm": 0.50390625, + "learning_rate": 4.757294092654994e-06, + "loss": 2.1662, + "step": 15581 + }, + { + "epoch": 0.8359442060085837, + "grad_norm": 0.52734375, + "learning_rate": 4.757256750596817e-06, + "loss": 1.9531, + "step": 15582 + }, + { + "epoch": 0.8359978540772532, + "grad_norm": 0.419921875, + "learning_rate": 4.757219405812769e-06, + "loss": 2.2217, + "step": 15583 + }, + { + "epoch": 0.8360515021459227, + "grad_norm": 0.6796875, + "learning_rate": 4.757182058302893e-06, + "loss": 2.315, + "step": 15584 + }, + { + "epoch": 0.8361051502145923, + "grad_norm": 0.462890625, + "learning_rate": 4.757144708067236e-06, + "loss": 1.7489, + "step": 15585 + }, + { + "epoch": 0.8361587982832618, + "grad_norm": 0.490234375, + "learning_rate": 4.757107355105843e-06, + "loss": 2.3548, + "step": 15586 + }, + { + "epoch": 0.8362124463519314, + "grad_norm": 0.45703125, + "learning_rate": 4.7570699994187585e-06, + "loss": 2.2763, + "step": 15587 + }, + { + "epoch": 0.8362660944206008, + "grad_norm": 0.380859375, + "learning_rate": 4.757032641006027e-06, + "loss": 2.442, + "step": 15588 + }, + { + "epoch": 0.8363197424892704, + "grad_norm": 0.51171875, + "learning_rate": 4.756995279867695e-06, + "loss": 2.3524, + "step": 15589 + }, + { + "epoch": 0.8363733905579399, + "grad_norm": 0.4765625, + "learning_rate": 4.756957916003808e-06, + "loss": 2.0541, + "step": 15590 + }, + { + "epoch": 0.8364270386266094, + "grad_norm": 0.62890625, + "learning_rate": 4.756920549414409e-06, + "loss": 2.3367, + "step": 15591 + }, + { + "epoch": 0.836480686695279, + "grad_norm": 0.416015625, + "learning_rate": 4.756883180099545e-06, + "loss": 1.6427, + "step": 15592 + }, + { + "epoch": 0.8365343347639485, + "grad_norm": 0.404296875, + "learning_rate": 4.756845808059259e-06, + "loss": 2.3411, + "step": 15593 + }, + { + "epoch": 0.836587982832618, + "grad_norm": 0.45703125, + "learning_rate": 4.756808433293599e-06, + "loss": 2.1586, + "step": 15594 + }, + { + "epoch": 0.8366416309012875, + "grad_norm": 0.7421875, + "learning_rate": 4.756771055802608e-06, + "loss": 2.632, + "step": 15595 + }, + { + "epoch": 0.8366952789699571, + "grad_norm": 0.6328125, + "learning_rate": 4.756733675586332e-06, + "loss": 2.2381, + "step": 15596 + }, + { + "epoch": 0.8367489270386266, + "grad_norm": 0.451171875, + "learning_rate": 4.756696292644817e-06, + "loss": 2.3202, + "step": 15597 + }, + { + "epoch": 0.8368025751072962, + "grad_norm": 0.470703125, + "learning_rate": 4.756658906978106e-06, + "loss": 2.4018, + "step": 15598 + }, + { + "epoch": 0.8368562231759656, + "grad_norm": 0.45703125, + "learning_rate": 4.756621518586245e-06, + "loss": 2.2978, + "step": 15599 + }, + { + "epoch": 0.8369098712446352, + "grad_norm": 0.443359375, + "learning_rate": 4.75658412746928e-06, + "loss": 2.4125, + "step": 15600 + }, + { + "epoch": 0.8369635193133047, + "grad_norm": 0.486328125, + "learning_rate": 4.756546733627255e-06, + "loss": 2.2824, + "step": 15601 + }, + { + "epoch": 0.8370171673819743, + "grad_norm": 0.5, + "learning_rate": 4.756509337060217e-06, + "loss": 2.3232, + "step": 15602 + }, + { + "epoch": 0.8370708154506438, + "grad_norm": 0.400390625, + "learning_rate": 4.756471937768208e-06, + "loss": 2.3013, + "step": 15603 + }, + { + "epoch": 0.8371244635193134, + "grad_norm": 0.482421875, + "learning_rate": 4.756434535751276e-06, + "loss": 2.3294, + "step": 15604 + }, + { + "epoch": 0.8371781115879828, + "grad_norm": 0.44921875, + "learning_rate": 4.756397131009466e-06, + "loss": 2.4301, + "step": 15605 + }, + { + "epoch": 0.8372317596566523, + "grad_norm": 0.57421875, + "learning_rate": 4.756359723542821e-06, + "loss": 2.0946, + "step": 15606 + }, + { + "epoch": 0.8372854077253219, + "grad_norm": 0.484375, + "learning_rate": 4.756322313351388e-06, + "loss": 1.9812, + "step": 15607 + }, + { + "epoch": 0.8373390557939914, + "grad_norm": 0.46484375, + "learning_rate": 4.756284900435212e-06, + "loss": 2.2365, + "step": 15608 + }, + { + "epoch": 0.837392703862661, + "grad_norm": 0.466796875, + "learning_rate": 4.7562474847943375e-06, + "loss": 2.559, + "step": 15609 + }, + { + "epoch": 0.8374463519313304, + "grad_norm": 0.53125, + "learning_rate": 4.756210066428811e-06, + "loss": 2.3582, + "step": 15610 + }, + { + "epoch": 0.8375, + "grad_norm": 0.453125, + "learning_rate": 4.756172645338675e-06, + "loss": 2.1683, + "step": 15611 + }, + { + "epoch": 0.8375536480686695, + "grad_norm": 0.408203125, + "learning_rate": 4.756135221523978e-06, + "loss": 2.0181, + "step": 15612 + }, + { + "epoch": 0.8376072961373391, + "grad_norm": 0.51953125, + "learning_rate": 4.756097794984763e-06, + "loss": 2.2495, + "step": 15613 + }, + { + "epoch": 0.8376609442060086, + "grad_norm": 0.46484375, + "learning_rate": 4.756060365721076e-06, + "loss": 2.454, + "step": 15614 + }, + { + "epoch": 0.8377145922746781, + "grad_norm": 0.4921875, + "learning_rate": 4.756022933732962e-06, + "loss": 2.3596, + "step": 15615 + }, + { + "epoch": 0.8377682403433476, + "grad_norm": 0.451171875, + "learning_rate": 4.755985499020466e-06, + "loss": 2.3436, + "step": 15616 + }, + { + "epoch": 0.8378218884120172, + "grad_norm": 0.51171875, + "learning_rate": 4.755948061583634e-06, + "loss": 2.192, + "step": 15617 + }, + { + "epoch": 0.8378755364806867, + "grad_norm": 0.42578125, + "learning_rate": 4.75591062142251e-06, + "loss": 2.4001, + "step": 15618 + }, + { + "epoch": 0.8379291845493563, + "grad_norm": 0.390625, + "learning_rate": 4.75587317853714e-06, + "loss": 2.2379, + "step": 15619 + }, + { + "epoch": 0.8379828326180258, + "grad_norm": 0.5234375, + "learning_rate": 4.75583573292757e-06, + "loss": 2.5281, + "step": 15620 + }, + { + "epoch": 0.8380364806866952, + "grad_norm": 0.3828125, + "learning_rate": 4.755798284593843e-06, + "loss": 2.2051, + "step": 15621 + }, + { + "epoch": 0.8380901287553648, + "grad_norm": 0.47265625, + "learning_rate": 4.7557608335360075e-06, + "loss": 2.1278, + "step": 15622 + }, + { + "epoch": 0.8381437768240343, + "grad_norm": 0.4453125, + "learning_rate": 4.755723379754104e-06, + "loss": 2.2209, + "step": 15623 + }, + { + "epoch": 0.8381974248927039, + "grad_norm": 0.5, + "learning_rate": 4.755685923248183e-06, + "loss": 2.3838, + "step": 15624 + }, + { + "epoch": 0.8382510729613734, + "grad_norm": 0.447265625, + "learning_rate": 4.755648464018287e-06, + "loss": 2.2817, + "step": 15625 + }, + { + "epoch": 0.8383047210300429, + "grad_norm": 0.48046875, + "learning_rate": 4.75561100206446e-06, + "loss": 2.1373, + "step": 15626 + }, + { + "epoch": 0.8383583690987124, + "grad_norm": 0.75390625, + "learning_rate": 4.755573537386749e-06, + "loss": 1.9314, + "step": 15627 + }, + { + "epoch": 0.838412017167382, + "grad_norm": 0.55859375, + "learning_rate": 4.755536069985199e-06, + "loss": 2.2473, + "step": 15628 + }, + { + "epoch": 0.8384656652360515, + "grad_norm": 0.52734375, + "learning_rate": 4.7554985998598555e-06, + "loss": 1.661, + "step": 15629 + }, + { + "epoch": 0.8385193133047211, + "grad_norm": 1.0546875, + "learning_rate": 4.755461127010764e-06, + "loss": 2.3067, + "step": 15630 + }, + { + "epoch": 0.8385729613733905, + "grad_norm": 0.47265625, + "learning_rate": 4.755423651437968e-06, + "loss": 1.9417, + "step": 15631 + }, + { + "epoch": 0.8386266094420601, + "grad_norm": 0.5546875, + "learning_rate": 4.7553861731415154e-06, + "loss": 2.4257, + "step": 15632 + }, + { + "epoch": 0.8386802575107296, + "grad_norm": 0.51171875, + "learning_rate": 4.7553486921214485e-06, + "loss": 2.3177, + "step": 15633 + }, + { + "epoch": 0.8387339055793992, + "grad_norm": 0.478515625, + "learning_rate": 4.755311208377815e-06, + "loss": 2.2925, + "step": 15634 + }, + { + "epoch": 0.8387875536480687, + "grad_norm": 0.48828125, + "learning_rate": 4.755273721910659e-06, + "loss": 2.5574, + "step": 15635 + }, + { + "epoch": 0.8388412017167381, + "grad_norm": 0.462890625, + "learning_rate": 4.755236232720026e-06, + "loss": 2.1968, + "step": 15636 + }, + { + "epoch": 0.8388948497854077, + "grad_norm": 0.5625, + "learning_rate": 4.755198740805961e-06, + "loss": 2.2908, + "step": 15637 + }, + { + "epoch": 0.8389484978540772, + "grad_norm": 8.5, + "learning_rate": 4.7551612461685105e-06, + "loss": 1.6123, + "step": 15638 + }, + { + "epoch": 0.8390021459227468, + "grad_norm": 0.625, + "learning_rate": 4.755123748807718e-06, + "loss": 2.1598, + "step": 15639 + }, + { + "epoch": 0.8390557939914163, + "grad_norm": 0.484375, + "learning_rate": 4.755086248723629e-06, + "loss": 2.3306, + "step": 15640 + }, + { + "epoch": 0.8391094420600859, + "grad_norm": 0.4921875, + "learning_rate": 4.755048745916291e-06, + "loss": 2.3557, + "step": 15641 + }, + { + "epoch": 0.8391630901287553, + "grad_norm": 0.42578125, + "learning_rate": 4.755011240385746e-06, + "loss": 1.7808, + "step": 15642 + }, + { + "epoch": 0.8392167381974249, + "grad_norm": 0.458984375, + "learning_rate": 4.7549737321320425e-06, + "loss": 2.3784, + "step": 15643 + }, + { + "epoch": 0.8392703862660944, + "grad_norm": 0.421875, + "learning_rate": 4.754936221155223e-06, + "loss": 1.8477, + "step": 15644 + }, + { + "epoch": 0.839324034334764, + "grad_norm": 0.443359375, + "learning_rate": 4.754898707455336e-06, + "loss": 2.101, + "step": 15645 + }, + { + "epoch": 0.8393776824034335, + "grad_norm": 0.65234375, + "learning_rate": 4.754861191032423e-06, + "loss": 2.2564, + "step": 15646 + }, + { + "epoch": 0.839431330472103, + "grad_norm": 0.515625, + "learning_rate": 4.7548236718865316e-06, + "loss": 2.2297, + "step": 15647 + }, + { + "epoch": 0.8394849785407725, + "grad_norm": 0.5234375, + "learning_rate": 4.754786150017706e-06, + "loss": 2.3661, + "step": 15648 + }, + { + "epoch": 0.839538626609442, + "grad_norm": 0.46875, + "learning_rate": 4.754748625425993e-06, + "loss": 2.3934, + "step": 15649 + }, + { + "epoch": 0.8395922746781116, + "grad_norm": 0.431640625, + "learning_rate": 4.754711098111437e-06, + "loss": 2.1946, + "step": 15650 + }, + { + "epoch": 0.8396459227467811, + "grad_norm": 0.515625, + "learning_rate": 4.754673568074084e-06, + "loss": 2.4806, + "step": 15651 + }, + { + "epoch": 0.8396995708154507, + "grad_norm": 0.37109375, + "learning_rate": 4.754636035313979e-06, + "loss": 2.1867, + "step": 15652 + }, + { + "epoch": 0.8397532188841201, + "grad_norm": 0.5, + "learning_rate": 4.754598499831165e-06, + "loss": 2.3941, + "step": 15653 + }, + { + "epoch": 0.8398068669527897, + "grad_norm": 0.427734375, + "learning_rate": 4.754560961625692e-06, + "loss": 2.2912, + "step": 15654 + }, + { + "epoch": 0.8398605150214592, + "grad_norm": 0.43359375, + "learning_rate": 4.754523420697601e-06, + "loss": 2.2888, + "step": 15655 + }, + { + "epoch": 0.8399141630901288, + "grad_norm": 0.392578125, + "learning_rate": 4.7544858770469395e-06, + "loss": 2.1871, + "step": 15656 + }, + { + "epoch": 0.8399678111587983, + "grad_norm": 0.78515625, + "learning_rate": 4.754448330673752e-06, + "loss": 2.1473, + "step": 15657 + }, + { + "epoch": 0.8400214592274678, + "grad_norm": 0.4453125, + "learning_rate": 4.7544107815780845e-06, + "loss": 2.2228, + "step": 15658 + }, + { + "epoch": 0.8400751072961373, + "grad_norm": 0.40625, + "learning_rate": 4.7543732297599825e-06, + "loss": 2.1595, + "step": 15659 + }, + { + "epoch": 0.8401287553648069, + "grad_norm": 0.42578125, + "learning_rate": 4.754335675219491e-06, + "loss": 2.2146, + "step": 15660 + }, + { + "epoch": 0.8401824034334764, + "grad_norm": 0.447265625, + "learning_rate": 4.754298117956655e-06, + "loss": 2.3724, + "step": 15661 + }, + { + "epoch": 0.840236051502146, + "grad_norm": 0.474609375, + "learning_rate": 4.75426055797152e-06, + "loss": 2.2932, + "step": 15662 + }, + { + "epoch": 0.8402896995708155, + "grad_norm": 0.443359375, + "learning_rate": 4.754222995264132e-06, + "loss": 2.3596, + "step": 15663 + }, + { + "epoch": 0.8403433476394849, + "grad_norm": 0.5390625, + "learning_rate": 4.7541854298345355e-06, + "loss": 2.2528, + "step": 15664 + }, + { + "epoch": 0.8403969957081545, + "grad_norm": 0.50390625, + "learning_rate": 4.754147861682776e-06, + "loss": 2.2251, + "step": 15665 + }, + { + "epoch": 0.840450643776824, + "grad_norm": 0.453125, + "learning_rate": 4.754110290808899e-06, + "loss": 2.3852, + "step": 15666 + }, + { + "epoch": 0.8405042918454936, + "grad_norm": 0.42578125, + "learning_rate": 4.754072717212951e-06, + "loss": 2.4096, + "step": 15667 + }, + { + "epoch": 0.840557939914163, + "grad_norm": 0.51953125, + "learning_rate": 4.7540351408949756e-06, + "loss": 2.2146, + "step": 15668 + }, + { + "epoch": 0.8406115879828326, + "grad_norm": 0.45703125, + "learning_rate": 4.7539975618550184e-06, + "loss": 2.3316, + "step": 15669 + }, + { + "epoch": 0.8406652360515021, + "grad_norm": 0.470703125, + "learning_rate": 4.753959980093126e-06, + "loss": 2.5095, + "step": 15670 + }, + { + "epoch": 0.8407188841201717, + "grad_norm": 0.453125, + "learning_rate": 4.753922395609343e-06, + "loss": 2.2417, + "step": 15671 + }, + { + "epoch": 0.8407725321888412, + "grad_norm": 0.4765625, + "learning_rate": 4.753884808403715e-06, + "loss": 2.3195, + "step": 15672 + }, + { + "epoch": 0.8408261802575108, + "grad_norm": 0.51171875, + "learning_rate": 4.753847218476287e-06, + "loss": 2.4431, + "step": 15673 + }, + { + "epoch": 0.8408798283261802, + "grad_norm": 0.416015625, + "learning_rate": 4.753809625827105e-06, + "loss": 2.1902, + "step": 15674 + }, + { + "epoch": 0.8409334763948498, + "grad_norm": 0.453125, + "learning_rate": 4.753772030456213e-06, + "loss": 2.1643, + "step": 15675 + }, + { + "epoch": 0.8409871244635193, + "grad_norm": 0.44140625, + "learning_rate": 4.753734432363659e-06, + "loss": 2.2782, + "step": 15676 + }, + { + "epoch": 0.8410407725321889, + "grad_norm": 0.6015625, + "learning_rate": 4.7536968315494855e-06, + "loss": 1.5372, + "step": 15677 + }, + { + "epoch": 0.8410944206008584, + "grad_norm": 0.4375, + "learning_rate": 4.75365922801374e-06, + "loss": 2.4515, + "step": 15678 + }, + { + "epoch": 0.8411480686695278, + "grad_norm": 0.48828125, + "learning_rate": 4.7536216217564666e-06, + "loss": 2.1859, + "step": 15679 + }, + { + "epoch": 0.8412017167381974, + "grad_norm": 0.4140625, + "learning_rate": 4.753584012777712e-06, + "loss": 2.1039, + "step": 15680 + }, + { + "epoch": 0.8412553648068669, + "grad_norm": 0.5546875, + "learning_rate": 4.753546401077521e-06, + "loss": 1.4437, + "step": 15681 + }, + { + "epoch": 0.8413090128755365, + "grad_norm": 0.51953125, + "learning_rate": 4.753508786655938e-06, + "loss": 2.4012, + "step": 15682 + }, + { + "epoch": 0.841362660944206, + "grad_norm": 0.48046875, + "learning_rate": 4.75347116951301e-06, + "loss": 2.3043, + "step": 15683 + }, + { + "epoch": 0.8414163090128756, + "grad_norm": 0.494140625, + "learning_rate": 4.753433549648781e-06, + "loss": 1.86, + "step": 15684 + }, + { + "epoch": 0.841469957081545, + "grad_norm": 0.56640625, + "learning_rate": 4.753395927063298e-06, + "loss": 2.5044, + "step": 15685 + }, + { + "epoch": 0.8415236051502146, + "grad_norm": 0.376953125, + "learning_rate": 4.753358301756606e-06, + "loss": 2.4138, + "step": 15686 + }, + { + "epoch": 0.8415772532188841, + "grad_norm": 0.498046875, + "learning_rate": 4.75332067372875e-06, + "loss": 2.2478, + "step": 15687 + }, + { + "epoch": 0.8416309012875537, + "grad_norm": 0.431640625, + "learning_rate": 4.753283042979774e-06, + "loss": 2.3384, + "step": 15688 + }, + { + "epoch": 0.8416845493562232, + "grad_norm": 0.365234375, + "learning_rate": 4.753245409509726e-06, + "loss": 2.2754, + "step": 15689 + }, + { + "epoch": 0.8417381974248928, + "grad_norm": 0.486328125, + "learning_rate": 4.753207773318651e-06, + "loss": 2.4514, + "step": 15690 + }, + { + "epoch": 0.8417918454935622, + "grad_norm": 0.455078125, + "learning_rate": 4.753170134406593e-06, + "loss": 2.308, + "step": 15691 + }, + { + "epoch": 0.8418454935622317, + "grad_norm": 0.5625, + "learning_rate": 4.7531324927735986e-06, + "loss": 2.2229, + "step": 15692 + }, + { + "epoch": 0.8418991416309013, + "grad_norm": 0.5078125, + "learning_rate": 4.753094848419713e-06, + "loss": 2.5024, + "step": 15693 + }, + { + "epoch": 0.8419527896995708, + "grad_norm": 0.431640625, + "learning_rate": 4.753057201344982e-06, + "loss": 2.1563, + "step": 15694 + }, + { + "epoch": 0.8420064377682404, + "grad_norm": 0.427734375, + "learning_rate": 4.7530195515494495e-06, + "loss": 1.968, + "step": 15695 + }, + { + "epoch": 0.8420600858369098, + "grad_norm": 1.6640625, + "learning_rate": 4.752981899033164e-06, + "loss": 2.285, + "step": 15696 + }, + { + "epoch": 0.8421137339055794, + "grad_norm": 0.5390625, + "learning_rate": 4.752944243796167e-06, + "loss": 2.4805, + "step": 15697 + }, + { + "epoch": 0.8421673819742489, + "grad_norm": 0.421875, + "learning_rate": 4.752906585838507e-06, + "loss": 2.2149, + "step": 15698 + }, + { + "epoch": 0.8422210300429185, + "grad_norm": 0.498046875, + "learning_rate": 4.752868925160229e-06, + "loss": 2.3006, + "step": 15699 + }, + { + "epoch": 0.842274678111588, + "grad_norm": 0.37109375, + "learning_rate": 4.752831261761377e-06, + "loss": 2.008, + "step": 15700 + }, + { + "epoch": 0.8423283261802575, + "grad_norm": 0.57421875, + "learning_rate": 4.752793595641999e-06, + "loss": 2.417, + "step": 15701 + }, + { + "epoch": 0.842381974248927, + "grad_norm": 0.4765625, + "learning_rate": 4.752755926802137e-06, + "loss": 2.3366, + "step": 15702 + }, + { + "epoch": 0.8424356223175966, + "grad_norm": 0.515625, + "learning_rate": 4.752718255241841e-06, + "loss": 2.476, + "step": 15703 + }, + { + "epoch": 0.8424892703862661, + "grad_norm": 0.4375, + "learning_rate": 4.752680580961152e-06, + "loss": 2.3549, + "step": 15704 + }, + { + "epoch": 0.8425429184549357, + "grad_norm": 0.5078125, + "learning_rate": 4.752642903960118e-06, + "loss": 2.0809, + "step": 15705 + }, + { + "epoch": 0.8425965665236052, + "grad_norm": 0.423828125, + "learning_rate": 4.752605224238784e-06, + "loss": 2.2167, + "step": 15706 + }, + { + "epoch": 0.8426502145922746, + "grad_norm": 0.4765625, + "learning_rate": 4.752567541797196e-06, + "loss": 2.1609, + "step": 15707 + }, + { + "epoch": 0.8427038626609442, + "grad_norm": 0.431640625, + "learning_rate": 4.752529856635398e-06, + "loss": 2.4438, + "step": 15708 + }, + { + "epoch": 0.8427575107296137, + "grad_norm": 0.4296875, + "learning_rate": 4.752492168753438e-06, + "loss": 2.2726, + "step": 15709 + }, + { + "epoch": 0.8428111587982833, + "grad_norm": 0.40625, + "learning_rate": 4.752454478151359e-06, + "loss": 2.1059, + "step": 15710 + }, + { + "epoch": 0.8428648068669528, + "grad_norm": 0.5390625, + "learning_rate": 4.752416784829208e-06, + "loss": 2.1951, + "step": 15711 + }, + { + "epoch": 0.8429184549356223, + "grad_norm": 0.4375, + "learning_rate": 4.752379088787029e-06, + "loss": 2.3453, + "step": 15712 + }, + { + "epoch": 0.8429721030042918, + "grad_norm": 0.3671875, + "learning_rate": 4.7523413900248695e-06, + "loss": 1.9137, + "step": 15713 + }, + { + "epoch": 0.8430257510729614, + "grad_norm": 0.458984375, + "learning_rate": 4.7523036885427745e-06, + "loss": 2.4932, + "step": 15714 + }, + { + "epoch": 0.8430793991416309, + "grad_norm": 1.109375, + "learning_rate": 4.752265984340788e-06, + "loss": 2.1829, + "step": 15715 + }, + { + "epoch": 0.8431330472103005, + "grad_norm": 0.359375, + "learning_rate": 4.752228277418957e-06, + "loss": 2.0385, + "step": 15716 + }, + { + "epoch": 0.8431866952789699, + "grad_norm": 0.423828125, + "learning_rate": 4.752190567777327e-06, + "loss": 2.2084, + "step": 15717 + }, + { + "epoch": 0.8432403433476395, + "grad_norm": 0.478515625, + "learning_rate": 4.752152855415944e-06, + "loss": 2.11, + "step": 15718 + }, + { + "epoch": 0.843293991416309, + "grad_norm": 0.494140625, + "learning_rate": 4.752115140334851e-06, + "loss": 2.0566, + "step": 15719 + }, + { + "epoch": 0.8433476394849786, + "grad_norm": 0.447265625, + "learning_rate": 4.752077422534096e-06, + "loss": 2.3729, + "step": 15720 + }, + { + "epoch": 0.8434012875536481, + "grad_norm": 0.51171875, + "learning_rate": 4.752039702013724e-06, + "loss": 2.2912, + "step": 15721 + }, + { + "epoch": 0.8434549356223175, + "grad_norm": 0.6015625, + "learning_rate": 4.752001978773781e-06, + "loss": 1.4097, + "step": 15722 + }, + { + "epoch": 0.8435085836909871, + "grad_norm": 0.51953125, + "learning_rate": 4.751964252814311e-06, + "loss": 2.3249, + "step": 15723 + }, + { + "epoch": 0.8435622317596566, + "grad_norm": 0.375, + "learning_rate": 4.751926524135361e-06, + "loss": 2.0867, + "step": 15724 + }, + { + "epoch": 0.8436158798283262, + "grad_norm": 0.4609375, + "learning_rate": 4.7518887927369765e-06, + "loss": 2.3159, + "step": 15725 + }, + { + "epoch": 0.8436695278969957, + "grad_norm": 0.51171875, + "learning_rate": 4.751851058619203e-06, + "loss": 2.3614, + "step": 15726 + }, + { + "epoch": 0.8437231759656653, + "grad_norm": 0.37890625, + "learning_rate": 4.751813321782084e-06, + "loss": 2.1156, + "step": 15727 + }, + { + "epoch": 0.8437768240343347, + "grad_norm": 0.421875, + "learning_rate": 4.751775582225668e-06, + "loss": 2.1633, + "step": 15728 + }, + { + "epoch": 0.8438304721030043, + "grad_norm": 0.4375, + "learning_rate": 4.7517378399499994e-06, + "loss": 2.3322, + "step": 15729 + }, + { + "epoch": 0.8438841201716738, + "grad_norm": 0.66015625, + "learning_rate": 4.751700094955123e-06, + "loss": 2.3337, + "step": 15730 + }, + { + "epoch": 0.8439377682403434, + "grad_norm": 0.53125, + "learning_rate": 4.7516623472410856e-06, + "loss": 2.392, + "step": 15731 + }, + { + "epoch": 0.8439914163090129, + "grad_norm": 0.49609375, + "learning_rate": 4.7516245968079325e-06, + "loss": 2.2783, + "step": 15732 + }, + { + "epoch": 0.8440450643776825, + "grad_norm": 0.498046875, + "learning_rate": 4.751586843655709e-06, + "loss": 2.3108, + "step": 15733 + }, + { + "epoch": 0.8440987124463519, + "grad_norm": 0.50390625, + "learning_rate": 4.751549087784461e-06, + "loss": 2.352, + "step": 15734 + }, + { + "epoch": 0.8441523605150214, + "grad_norm": 0.48828125, + "learning_rate": 4.751511329194232e-06, + "loss": 2.0687, + "step": 15735 + }, + { + "epoch": 0.844206008583691, + "grad_norm": 0.458984375, + "learning_rate": 4.751473567885072e-06, + "loss": 2.2653, + "step": 15736 + }, + { + "epoch": 0.8442596566523605, + "grad_norm": 0.4609375, + "learning_rate": 4.751435803857023e-06, + "loss": 2.3166, + "step": 15737 + }, + { + "epoch": 0.8443133047210301, + "grad_norm": 0.51953125, + "learning_rate": 4.751398037110132e-06, + "loss": 1.3712, + "step": 15738 + }, + { + "epoch": 0.8443669527896995, + "grad_norm": 0.451171875, + "learning_rate": 4.751360267644444e-06, + "loss": 2.4694, + "step": 15739 + }, + { + "epoch": 0.8444206008583691, + "grad_norm": 0.3984375, + "learning_rate": 4.7513224954600055e-06, + "loss": 2.3113, + "step": 15740 + }, + { + "epoch": 0.8444742489270386, + "grad_norm": 0.439453125, + "learning_rate": 4.75128472055686e-06, + "loss": 2.3028, + "step": 15741 + }, + { + "epoch": 0.8445278969957082, + "grad_norm": 0.458984375, + "learning_rate": 4.751246942935056e-06, + "loss": 2.4082, + "step": 15742 + }, + { + "epoch": 0.8445815450643777, + "grad_norm": 0.5, + "learning_rate": 4.751209162594638e-06, + "loss": 2.4278, + "step": 15743 + }, + { + "epoch": 0.8446351931330472, + "grad_norm": 0.51171875, + "learning_rate": 4.751171379535651e-06, + "loss": 2.3992, + "step": 15744 + }, + { + "epoch": 0.8446888412017167, + "grad_norm": 1.1953125, + "learning_rate": 4.7511335937581406e-06, + "loss": 2.4998, + "step": 15745 + }, + { + "epoch": 0.8447424892703863, + "grad_norm": 0.41015625, + "learning_rate": 4.7510958052621525e-06, + "loss": 2.1786, + "step": 15746 + }, + { + "epoch": 0.8447961373390558, + "grad_norm": 0.50390625, + "learning_rate": 4.751058014047733e-06, + "loss": 2.1155, + "step": 15747 + }, + { + "epoch": 0.8448497854077254, + "grad_norm": 0.453125, + "learning_rate": 4.751020220114928e-06, + "loss": 2.4618, + "step": 15748 + }, + { + "epoch": 0.8449034334763948, + "grad_norm": 0.47265625, + "learning_rate": 4.750982423463782e-06, + "loss": 2.2758, + "step": 15749 + }, + { + "epoch": 0.8449570815450643, + "grad_norm": 0.484375, + "learning_rate": 4.7509446240943415e-06, + "loss": 2.1721, + "step": 15750 + }, + { + "epoch": 0.8450107296137339, + "grad_norm": 0.51953125, + "learning_rate": 4.750906822006652e-06, + "loss": 2.2575, + "step": 15751 + }, + { + "epoch": 0.8450643776824034, + "grad_norm": 0.51171875, + "learning_rate": 4.750869017200759e-06, + "loss": 2.4472, + "step": 15752 + }, + { + "epoch": 0.845118025751073, + "grad_norm": 0.5234375, + "learning_rate": 4.750831209676708e-06, + "loss": 2.0275, + "step": 15753 + }, + { + "epoch": 0.8451716738197425, + "grad_norm": 0.478515625, + "learning_rate": 4.750793399434544e-06, + "loss": 2.3622, + "step": 15754 + }, + { + "epoch": 0.845225321888412, + "grad_norm": 0.453125, + "learning_rate": 4.750755586474314e-06, + "loss": 2.3545, + "step": 15755 + }, + { + "epoch": 0.8452789699570815, + "grad_norm": 0.470703125, + "learning_rate": 4.750717770796063e-06, + "loss": 2.3528, + "step": 15756 + }, + { + "epoch": 0.8453326180257511, + "grad_norm": 0.484375, + "learning_rate": 4.750679952399837e-06, + "loss": 2.3065, + "step": 15757 + }, + { + "epoch": 0.8453862660944206, + "grad_norm": 0.40234375, + "learning_rate": 4.750642131285682e-06, + "loss": 2.3359, + "step": 15758 + }, + { + "epoch": 0.8454399141630902, + "grad_norm": 0.4765625, + "learning_rate": 4.750604307453642e-06, + "loss": 2.3271, + "step": 15759 + }, + { + "epoch": 0.8454935622317596, + "grad_norm": 0.51171875, + "learning_rate": 4.750566480903764e-06, + "loss": 2.2256, + "step": 15760 + }, + { + "epoch": 0.8455472103004292, + "grad_norm": 0.462890625, + "learning_rate": 4.750528651636094e-06, + "loss": 2.1589, + "step": 15761 + }, + { + "epoch": 0.8456008583690987, + "grad_norm": 0.53515625, + "learning_rate": 4.7504908196506775e-06, + "loss": 2.2964, + "step": 15762 + }, + { + "epoch": 0.8456545064377683, + "grad_norm": 1.1015625, + "learning_rate": 4.750452984947559e-06, + "loss": 2.3625, + "step": 15763 + }, + { + "epoch": 0.8457081545064378, + "grad_norm": 0.4765625, + "learning_rate": 4.750415147526785e-06, + "loss": 2.4857, + "step": 15764 + }, + { + "epoch": 0.8457618025751072, + "grad_norm": 0.61328125, + "learning_rate": 4.750377307388402e-06, + "loss": 2.3834, + "step": 15765 + }, + { + "epoch": 0.8458154506437768, + "grad_norm": 0.6015625, + "learning_rate": 4.750339464532454e-06, + "loss": 2.175, + "step": 15766 + }, + { + "epoch": 0.8458690987124463, + "grad_norm": 0.474609375, + "learning_rate": 4.750301618958989e-06, + "loss": 2.2757, + "step": 15767 + }, + { + "epoch": 0.8459227467811159, + "grad_norm": 0.42578125, + "learning_rate": 4.75026377066805e-06, + "loss": 2.4521, + "step": 15768 + }, + { + "epoch": 0.8459763948497854, + "grad_norm": 0.453125, + "learning_rate": 4.750225919659685e-06, + "loss": 2.5083, + "step": 15769 + }, + { + "epoch": 0.846030042918455, + "grad_norm": 0.578125, + "learning_rate": 4.750188065933939e-06, + "loss": 2.426, + "step": 15770 + }, + { + "epoch": 0.8460836909871244, + "grad_norm": 0.384765625, + "learning_rate": 4.7501502094908556e-06, + "loss": 2.0678, + "step": 15771 + }, + { + "epoch": 0.846137339055794, + "grad_norm": 0.50390625, + "learning_rate": 4.750112350330483e-06, + "loss": 2.1201, + "step": 15772 + }, + { + "epoch": 0.8461909871244635, + "grad_norm": 0.416015625, + "learning_rate": 4.750074488452868e-06, + "loss": 2.3854, + "step": 15773 + }, + { + "epoch": 0.8462446351931331, + "grad_norm": 0.515625, + "learning_rate": 4.7500366238580535e-06, + "loss": 2.23, + "step": 15774 + }, + { + "epoch": 0.8462982832618026, + "grad_norm": 0.47265625, + "learning_rate": 4.749998756546086e-06, + "loss": 2.0209, + "step": 15775 + }, + { + "epoch": 0.8463519313304722, + "grad_norm": 0.490234375, + "learning_rate": 4.749960886517012e-06, + "loss": 2.1672, + "step": 15776 + }, + { + "epoch": 0.8464055793991416, + "grad_norm": 0.47265625, + "learning_rate": 4.749923013770877e-06, + "loss": 2.1391, + "step": 15777 + }, + { + "epoch": 0.8464592274678111, + "grad_norm": 0.484375, + "learning_rate": 4.749885138307726e-06, + "loss": 2.243, + "step": 15778 + }, + { + "epoch": 0.8465128755364807, + "grad_norm": 0.45703125, + "learning_rate": 4.749847260127605e-06, + "loss": 2.1778, + "step": 15779 + }, + { + "epoch": 0.8465665236051502, + "grad_norm": 0.4921875, + "learning_rate": 4.749809379230561e-06, + "loss": 2.2085, + "step": 15780 + }, + { + "epoch": 0.8466201716738198, + "grad_norm": 0.4765625, + "learning_rate": 4.749771495616638e-06, + "loss": 2.4868, + "step": 15781 + }, + { + "epoch": 0.8466738197424892, + "grad_norm": 2.734375, + "learning_rate": 4.749733609285883e-06, + "loss": 1.8886, + "step": 15782 + }, + { + "epoch": 0.8467274678111588, + "grad_norm": 0.46484375, + "learning_rate": 4.749695720238341e-06, + "loss": 2.2105, + "step": 15783 + }, + { + "epoch": 0.8467811158798283, + "grad_norm": 0.515625, + "learning_rate": 4.749657828474058e-06, + "loss": 2.3084, + "step": 15784 + }, + { + "epoch": 0.8468347639484979, + "grad_norm": 0.52734375, + "learning_rate": 4.74961993399308e-06, + "loss": 2.3585, + "step": 15785 + }, + { + "epoch": 0.8468884120171674, + "grad_norm": 0.408203125, + "learning_rate": 4.749582036795452e-06, + "loss": 2.1603, + "step": 15786 + }, + { + "epoch": 0.846942060085837, + "grad_norm": 0.365234375, + "learning_rate": 4.749544136881221e-06, + "loss": 2.2064, + "step": 15787 + }, + { + "epoch": 0.8469957081545064, + "grad_norm": 0.4609375, + "learning_rate": 4.749506234250432e-06, + "loss": 2.3137, + "step": 15788 + }, + { + "epoch": 0.847049356223176, + "grad_norm": 0.373046875, + "learning_rate": 4.7494683289031306e-06, + "loss": 2.0133, + "step": 15789 + }, + { + "epoch": 0.8471030042918455, + "grad_norm": 0.423828125, + "learning_rate": 4.749430420839362e-06, + "loss": 2.3212, + "step": 15790 + }, + { + "epoch": 0.8471566523605151, + "grad_norm": 0.451171875, + "learning_rate": 4.7493925100591745e-06, + "loss": 2.2679, + "step": 15791 + }, + { + "epoch": 0.8472103004291845, + "grad_norm": 0.72265625, + "learning_rate": 4.749354596562611e-06, + "loss": 2.2395, + "step": 15792 + }, + { + "epoch": 0.847263948497854, + "grad_norm": 0.462890625, + "learning_rate": 4.749316680349718e-06, + "loss": 2.058, + "step": 15793 + }, + { + "epoch": 0.8473175965665236, + "grad_norm": 0.408203125, + "learning_rate": 4.7492787614205425e-06, + "loss": 2.1376, + "step": 15794 + }, + { + "epoch": 0.8473712446351931, + "grad_norm": 0.62109375, + "learning_rate": 4.74924083977513e-06, + "loss": 2.4068, + "step": 15795 + }, + { + "epoch": 0.8474248927038627, + "grad_norm": 0.416015625, + "learning_rate": 4.749202915413526e-06, + "loss": 2.0154, + "step": 15796 + }, + { + "epoch": 0.8474785407725322, + "grad_norm": 0.40234375, + "learning_rate": 4.749164988335775e-06, + "loss": 2.0567, + "step": 15797 + }, + { + "epoch": 0.8475321888412017, + "grad_norm": 0.4375, + "learning_rate": 4.749127058541924e-06, + "loss": 2.2471, + "step": 15798 + }, + { + "epoch": 0.8475858369098712, + "grad_norm": 0.45703125, + "learning_rate": 4.749089126032019e-06, + "loss": 2.1963, + "step": 15799 + }, + { + "epoch": 0.8476394849785408, + "grad_norm": 0.4921875, + "learning_rate": 4.749051190806105e-06, + "loss": 2.4619, + "step": 15800 + }, + { + "epoch": 0.8476931330472103, + "grad_norm": 0.51953125, + "learning_rate": 4.749013252864229e-06, + "loss": 2.2281, + "step": 15801 + }, + { + "epoch": 0.8477467811158799, + "grad_norm": 0.37109375, + "learning_rate": 4.748975312206437e-06, + "loss": 2.0761, + "step": 15802 + }, + { + "epoch": 0.8478004291845493, + "grad_norm": 0.447265625, + "learning_rate": 4.7489373688327736e-06, + "loss": 2.2297, + "step": 15803 + }, + { + "epoch": 0.8478540772532189, + "grad_norm": 0.375, + "learning_rate": 4.7488994227432835e-06, + "loss": 2.2093, + "step": 15804 + }, + { + "epoch": 0.8479077253218884, + "grad_norm": 0.60546875, + "learning_rate": 4.748861473938016e-06, + "loss": 2.2191, + "step": 15805 + }, + { + "epoch": 0.847961373390558, + "grad_norm": 0.578125, + "learning_rate": 4.7488235224170135e-06, + "loss": 2.3137, + "step": 15806 + }, + { + "epoch": 0.8480150214592275, + "grad_norm": 3.28125, + "learning_rate": 4.748785568180324e-06, + "loss": 2.3551, + "step": 15807 + }, + { + "epoch": 0.848068669527897, + "grad_norm": 0.423828125, + "learning_rate": 4.748747611227993e-06, + "loss": 2.2486, + "step": 15808 + }, + { + "epoch": 0.8481223175965665, + "grad_norm": 0.451171875, + "learning_rate": 4.748709651560065e-06, + "loss": 2.2204, + "step": 15809 + }, + { + "epoch": 0.848175965665236, + "grad_norm": 0.859375, + "learning_rate": 4.748671689176587e-06, + "loss": 2.4545, + "step": 15810 + }, + { + "epoch": 0.8482296137339056, + "grad_norm": 0.515625, + "learning_rate": 4.748633724077605e-06, + "loss": 2.2116, + "step": 15811 + }, + { + "epoch": 0.8482832618025751, + "grad_norm": 0.486328125, + "learning_rate": 4.748595756263165e-06, + "loss": 2.2226, + "step": 15812 + }, + { + "epoch": 0.8483369098712447, + "grad_norm": 0.4375, + "learning_rate": 4.7485577857333105e-06, + "loss": 2.3869, + "step": 15813 + }, + { + "epoch": 0.8483905579399141, + "grad_norm": 0.421875, + "learning_rate": 4.748519812488091e-06, + "loss": 2.2552, + "step": 15814 + }, + { + "epoch": 0.8484442060085837, + "grad_norm": 0.4296875, + "learning_rate": 4.74848183652755e-06, + "loss": 2.1821, + "step": 15815 + }, + { + "epoch": 0.8484978540772532, + "grad_norm": 0.44921875, + "learning_rate": 4.748443857851734e-06, + "loss": 2.2694, + "step": 15816 + }, + { + "epoch": 0.8485515021459228, + "grad_norm": 0.48828125, + "learning_rate": 4.748405876460689e-06, + "loss": 2.4083, + "step": 15817 + }, + { + "epoch": 0.8486051502145923, + "grad_norm": 0.47265625, + "learning_rate": 4.74836789235446e-06, + "loss": 1.9768, + "step": 15818 + }, + { + "epoch": 0.8486587982832619, + "grad_norm": 0.396484375, + "learning_rate": 4.748329905533093e-06, + "loss": 2.2372, + "step": 15819 + }, + { + "epoch": 0.8487124463519313, + "grad_norm": 0.578125, + "learning_rate": 4.748291915996635e-06, + "loss": 2.1333, + "step": 15820 + }, + { + "epoch": 0.8487660944206008, + "grad_norm": 0.70703125, + "learning_rate": 4.748253923745132e-06, + "loss": 1.423, + "step": 15821 + }, + { + "epoch": 0.8488197424892704, + "grad_norm": 0.45703125, + "learning_rate": 4.748215928778629e-06, + "loss": 2.0276, + "step": 15822 + }, + { + "epoch": 0.8488733905579399, + "grad_norm": 0.458984375, + "learning_rate": 4.748177931097171e-06, + "loss": 2.4252, + "step": 15823 + }, + { + "epoch": 0.8489270386266095, + "grad_norm": 0.462890625, + "learning_rate": 4.7481399307008055e-06, + "loss": 2.3966, + "step": 15824 + }, + { + "epoch": 0.8489806866952789, + "grad_norm": 0.51953125, + "learning_rate": 4.7481019275895775e-06, + "loss": 2.3091, + "step": 15825 + }, + { + "epoch": 0.8490343347639485, + "grad_norm": 0.4765625, + "learning_rate": 4.7480639217635335e-06, + "loss": 2.205, + "step": 15826 + }, + { + "epoch": 0.849087982832618, + "grad_norm": 0.478515625, + "learning_rate": 4.748025913222719e-06, + "loss": 2.4231, + "step": 15827 + }, + { + "epoch": 0.8491416309012876, + "grad_norm": 0.4453125, + "learning_rate": 4.7479879019671805e-06, + "loss": 2.2696, + "step": 15828 + }, + { + "epoch": 0.8491952789699571, + "grad_norm": 0.76953125, + "learning_rate": 4.747949887996963e-06, + "loss": 2.4802, + "step": 15829 + }, + { + "epoch": 0.8492489270386266, + "grad_norm": 0.55859375, + "learning_rate": 4.7479118713121124e-06, + "loss": 2.43, + "step": 15830 + }, + { + "epoch": 0.8493025751072961, + "grad_norm": 0.58984375, + "learning_rate": 4.7478738519126755e-06, + "loss": 2.4333, + "step": 15831 + }, + { + "epoch": 0.8493562231759657, + "grad_norm": 0.4453125, + "learning_rate": 4.747835829798697e-06, + "loss": 2.2237, + "step": 15832 + }, + { + "epoch": 0.8494098712446352, + "grad_norm": 0.455078125, + "learning_rate": 4.747797804970224e-06, + "loss": 2.5033, + "step": 15833 + }, + { + "epoch": 0.8494635193133048, + "grad_norm": 0.435546875, + "learning_rate": 4.7477597774273025e-06, + "loss": 2.2745, + "step": 15834 + }, + { + "epoch": 0.8495171673819742, + "grad_norm": 0.53515625, + "learning_rate": 4.747721747169977e-06, + "loss": 1.9625, + "step": 15835 + }, + { + "epoch": 0.8495708154506437, + "grad_norm": 0.51953125, + "learning_rate": 4.747683714198295e-06, + "loss": 2.3531, + "step": 15836 + }, + { + "epoch": 0.8496244635193133, + "grad_norm": 0.4296875, + "learning_rate": 4.747645678512302e-06, + "loss": 2.3084, + "step": 15837 + }, + { + "epoch": 0.8496781115879828, + "grad_norm": 0.43359375, + "learning_rate": 4.747607640112042e-06, + "loss": 2.3538, + "step": 15838 + }, + { + "epoch": 0.8497317596566524, + "grad_norm": 0.427734375, + "learning_rate": 4.747569598997564e-06, + "loss": 2.4625, + "step": 15839 + }, + { + "epoch": 0.8497854077253219, + "grad_norm": 0.4609375, + "learning_rate": 4.747531555168912e-06, + "loss": 2.4765, + "step": 15840 + }, + { + "epoch": 0.8498390557939914, + "grad_norm": 0.5546875, + "learning_rate": 4.747493508626132e-06, + "loss": 2.3, + "step": 15841 + }, + { + "epoch": 0.8498927038626609, + "grad_norm": 0.359375, + "learning_rate": 4.747455459369272e-06, + "loss": 2.2039, + "step": 15842 + }, + { + "epoch": 0.8499463519313305, + "grad_norm": 0.640625, + "learning_rate": 4.747417407398375e-06, + "loss": 2.4356, + "step": 15843 + }, + { + "epoch": 0.85, + "grad_norm": 0.443359375, + "learning_rate": 4.747379352713489e-06, + "loss": 2.3711, + "step": 15844 + }, + { + "epoch": 0.8500536480686696, + "grad_norm": 0.462890625, + "learning_rate": 4.747341295314659e-06, + "loss": 2.2933, + "step": 15845 + }, + { + "epoch": 0.850107296137339, + "grad_norm": 0.4453125, + "learning_rate": 4.747303235201931e-06, + "loss": 2.0626, + "step": 15846 + }, + { + "epoch": 0.8501609442060086, + "grad_norm": 0.375, + "learning_rate": 4.747265172375351e-06, + "loss": 1.8262, + "step": 15847 + }, + { + "epoch": 0.8502145922746781, + "grad_norm": 0.4609375, + "learning_rate": 4.747227106834966e-06, + "loss": 2.1868, + "step": 15848 + }, + { + "epoch": 0.8502682403433477, + "grad_norm": 0.453125, + "learning_rate": 4.747189038580821e-06, + "loss": 2.4542, + "step": 15849 + }, + { + "epoch": 0.8503218884120172, + "grad_norm": 0.412109375, + "learning_rate": 4.747150967612961e-06, + "loss": 2.1684, + "step": 15850 + }, + { + "epoch": 0.8503755364806866, + "grad_norm": 0.427734375, + "learning_rate": 4.747112893931434e-06, + "loss": 2.0438, + "step": 15851 + }, + { + "epoch": 0.8504291845493562, + "grad_norm": 0.5, + "learning_rate": 4.747074817536285e-06, + "loss": 2.4103, + "step": 15852 + }, + { + "epoch": 0.8504828326180257, + "grad_norm": 0.462890625, + "learning_rate": 4.74703673842756e-06, + "loss": 2.3037, + "step": 15853 + }, + { + "epoch": 0.8505364806866953, + "grad_norm": 0.515625, + "learning_rate": 4.746998656605305e-06, + "loss": 2.2087, + "step": 15854 + }, + { + "epoch": 0.8505901287553648, + "grad_norm": 0.46875, + "learning_rate": 4.746960572069565e-06, + "loss": 2.2923, + "step": 15855 + }, + { + "epoch": 0.8506437768240344, + "grad_norm": 0.458984375, + "learning_rate": 4.746922484820389e-06, + "loss": 2.2455, + "step": 15856 + }, + { + "epoch": 0.8506974248927038, + "grad_norm": 0.453125, + "learning_rate": 4.74688439485782e-06, + "loss": 2.2518, + "step": 15857 + }, + { + "epoch": 0.8507510729613734, + "grad_norm": 0.5390625, + "learning_rate": 4.746846302181904e-06, + "loss": 2.2101, + "step": 15858 + }, + { + "epoch": 0.8508047210300429, + "grad_norm": 0.47265625, + "learning_rate": 4.746808206792689e-06, + "loss": 1.6049, + "step": 15859 + }, + { + "epoch": 0.8508583690987125, + "grad_norm": 0.50390625, + "learning_rate": 4.7467701086902205e-06, + "loss": 2.2073, + "step": 15860 + }, + { + "epoch": 0.850912017167382, + "grad_norm": 1.8046875, + "learning_rate": 4.746732007874543e-06, + "loss": 2.3623, + "step": 15861 + }, + { + "epoch": 0.8509656652360515, + "grad_norm": 0.59375, + "learning_rate": 4.7466939043457035e-06, + "loss": 1.7253, + "step": 15862 + }, + { + "epoch": 0.851019313304721, + "grad_norm": 0.45703125, + "learning_rate": 4.746655798103749e-06, + "loss": 2.1889, + "step": 15863 + }, + { + "epoch": 0.8510729613733906, + "grad_norm": 0.546875, + "learning_rate": 4.7466176891487234e-06, + "loss": 2.4205, + "step": 15864 + }, + { + "epoch": 0.8511266094420601, + "grad_norm": 0.498046875, + "learning_rate": 4.746579577480675e-06, + "loss": 2.2862, + "step": 15865 + }, + { + "epoch": 0.8511802575107296, + "grad_norm": 0.49609375, + "learning_rate": 4.746541463099648e-06, + "loss": 2.1745, + "step": 15866 + }, + { + "epoch": 0.8512339055793992, + "grad_norm": 0.4765625, + "learning_rate": 4.746503346005689e-06, + "loss": 2.2497, + "step": 15867 + }, + { + "epoch": 0.8512875536480686, + "grad_norm": 0.390625, + "learning_rate": 4.7464652261988444e-06, + "loss": 2.2489, + "step": 15868 + }, + { + "epoch": 0.8513412017167382, + "grad_norm": 0.419921875, + "learning_rate": 4.74642710367916e-06, + "loss": 2.3089, + "step": 15869 + }, + { + "epoch": 0.8513948497854077, + "grad_norm": 0.4375, + "learning_rate": 4.7463889784466824e-06, + "loss": 2.4136, + "step": 15870 + }, + { + "epoch": 0.8514484978540773, + "grad_norm": 0.4140625, + "learning_rate": 4.7463508505014556e-06, + "loss": 2.1319, + "step": 15871 + }, + { + "epoch": 0.8515021459227468, + "grad_norm": 1.1875, + "learning_rate": 4.746312719843528e-06, + "loss": 2.1887, + "step": 15872 + }, + { + "epoch": 0.8515557939914163, + "grad_norm": 0.5234375, + "learning_rate": 4.746274586472944e-06, + "loss": 2.1816, + "step": 15873 + }, + { + "epoch": 0.8516094420600858, + "grad_norm": 0.419921875, + "learning_rate": 4.7462364503897515e-06, + "loss": 2.2274, + "step": 15874 + }, + { + "epoch": 0.8516630901287554, + "grad_norm": 0.53515625, + "learning_rate": 4.7461983115939944e-06, + "loss": 2.2441, + "step": 15875 + }, + { + "epoch": 0.8517167381974249, + "grad_norm": 0.419921875, + "learning_rate": 4.746160170085721e-06, + "loss": 2.1408, + "step": 15876 + }, + { + "epoch": 0.8517703862660945, + "grad_norm": 0.490234375, + "learning_rate": 4.7461220258649756e-06, + "loss": 2.1905, + "step": 15877 + }, + { + "epoch": 0.851824034334764, + "grad_norm": 0.416015625, + "learning_rate": 4.746083878931804e-06, + "loss": 2.1324, + "step": 15878 + }, + { + "epoch": 0.8518776824034334, + "grad_norm": 0.53515625, + "learning_rate": 4.746045729286254e-06, + "loss": 2.1043, + "step": 15879 + }, + { + "epoch": 0.851931330472103, + "grad_norm": 0.41796875, + "learning_rate": 4.7460075769283705e-06, + "loss": 2.1509, + "step": 15880 + }, + { + "epoch": 0.8519849785407725, + "grad_norm": 0.4765625, + "learning_rate": 4.7459694218582e-06, + "loss": 2.3404, + "step": 15881 + }, + { + "epoch": 0.8520386266094421, + "grad_norm": 0.427734375, + "learning_rate": 4.745931264075789e-06, + "loss": 2.2135, + "step": 15882 + }, + { + "epoch": 0.8520922746781115, + "grad_norm": 0.5, + "learning_rate": 4.745893103581181e-06, + "loss": 2.4058, + "step": 15883 + }, + { + "epoch": 0.8521459227467811, + "grad_norm": 0.53125, + "learning_rate": 4.745854940374426e-06, + "loss": 2.2408, + "step": 15884 + }, + { + "epoch": 0.8521995708154506, + "grad_norm": 0.48046875, + "learning_rate": 4.745816774455569e-06, + "loss": 2.3536, + "step": 15885 + }, + { + "epoch": 0.8522532188841202, + "grad_norm": 0.58984375, + "learning_rate": 4.745778605824652e-06, + "loss": 2.3336, + "step": 15886 + }, + { + "epoch": 0.8523068669527897, + "grad_norm": 0.494140625, + "learning_rate": 4.745740434481727e-06, + "loss": 2.2412, + "step": 15887 + }, + { + "epoch": 0.8523605150214593, + "grad_norm": 0.458984375, + "learning_rate": 4.745702260426836e-06, + "loss": 2.1283, + "step": 15888 + }, + { + "epoch": 0.8524141630901287, + "grad_norm": 0.546875, + "learning_rate": 4.745664083660028e-06, + "loss": 2.3902, + "step": 15889 + }, + { + "epoch": 0.8524678111587983, + "grad_norm": 0.48046875, + "learning_rate": 4.745625904181345e-06, + "loss": 2.3342, + "step": 15890 + }, + { + "epoch": 0.8525214592274678, + "grad_norm": 0.51953125, + "learning_rate": 4.7455877219908386e-06, + "loss": 2.0295, + "step": 15891 + }, + { + "epoch": 0.8525751072961374, + "grad_norm": 0.404296875, + "learning_rate": 4.745549537088551e-06, + "loss": 2.3118, + "step": 15892 + }, + { + "epoch": 0.8526287553648069, + "grad_norm": 0.4375, + "learning_rate": 4.745511349474529e-06, + "loss": 1.8996, + "step": 15893 + }, + { + "epoch": 0.8526824034334763, + "grad_norm": 0.447265625, + "learning_rate": 4.74547315914882e-06, + "loss": 2.3803, + "step": 15894 + }, + { + "epoch": 0.8527360515021459, + "grad_norm": 0.390625, + "learning_rate": 4.745434966111468e-06, + "loss": 2.0102, + "step": 15895 + }, + { + "epoch": 0.8527896995708154, + "grad_norm": 0.419921875, + "learning_rate": 4.745396770362521e-06, + "loss": 2.2651, + "step": 15896 + }, + { + "epoch": 0.852843347639485, + "grad_norm": 0.625, + "learning_rate": 4.745358571902025e-06, + "loss": 2.4302, + "step": 15897 + }, + { + "epoch": 0.8528969957081545, + "grad_norm": 0.478515625, + "learning_rate": 4.745320370730025e-06, + "loss": 2.2927, + "step": 15898 + }, + { + "epoch": 0.8529506437768241, + "grad_norm": 0.361328125, + "learning_rate": 4.745282166846566e-06, + "loss": 2.0959, + "step": 15899 + }, + { + "epoch": 0.8530042918454935, + "grad_norm": 0.51171875, + "learning_rate": 4.7452439602516985e-06, + "loss": 2.2176, + "step": 15900 + }, + { + "epoch": 0.8530579399141631, + "grad_norm": 0.62109375, + "learning_rate": 4.745205750945465e-06, + "loss": 2.265, + "step": 15901 + }, + { + "epoch": 0.8531115879828326, + "grad_norm": 0.55859375, + "learning_rate": 4.745167538927912e-06, + "loss": 2.1366, + "step": 15902 + }, + { + "epoch": 0.8531652360515022, + "grad_norm": 0.546875, + "learning_rate": 4.745129324199087e-06, + "loss": 2.0696, + "step": 15903 + }, + { + "epoch": 0.8532188841201717, + "grad_norm": 0.57421875, + "learning_rate": 4.745091106759036e-06, + "loss": 2.296, + "step": 15904 + }, + { + "epoch": 0.8532725321888412, + "grad_norm": 0.49609375, + "learning_rate": 4.745052886607803e-06, + "loss": 2.3112, + "step": 15905 + }, + { + "epoch": 0.8533261802575107, + "grad_norm": 0.453125, + "learning_rate": 4.745014663745436e-06, + "loss": 2.4604, + "step": 15906 + }, + { + "epoch": 0.8533798283261803, + "grad_norm": 0.39453125, + "learning_rate": 4.744976438171982e-06, + "loss": 1.9949, + "step": 15907 + }, + { + "epoch": 0.8534334763948498, + "grad_norm": 0.419921875, + "learning_rate": 4.744938209887485e-06, + "loss": 2.3016, + "step": 15908 + }, + { + "epoch": 0.8534871244635193, + "grad_norm": 0.45703125, + "learning_rate": 4.744899978891993e-06, + "loss": 2.1106, + "step": 15909 + }, + { + "epoch": 0.8535407725321889, + "grad_norm": 0.486328125, + "learning_rate": 4.744861745185551e-06, + "loss": 2.3277, + "step": 15910 + }, + { + "epoch": 0.8535944206008583, + "grad_norm": 0.453125, + "learning_rate": 4.744823508768206e-06, + "loss": 2.4992, + "step": 15911 + }, + { + "epoch": 0.8536480686695279, + "grad_norm": 0.4765625, + "learning_rate": 4.744785269640003e-06, + "loss": 2.1477, + "step": 15912 + }, + { + "epoch": 0.8537017167381974, + "grad_norm": 0.9453125, + "learning_rate": 4.744747027800989e-06, + "loss": 2.2194, + "step": 15913 + }, + { + "epoch": 0.853755364806867, + "grad_norm": 0.55078125, + "learning_rate": 4.74470878325121e-06, + "loss": 2.381, + "step": 15914 + }, + { + "epoch": 0.8538090128755365, + "grad_norm": 0.462890625, + "learning_rate": 4.744670535990713e-06, + "loss": 2.1906, + "step": 15915 + }, + { + "epoch": 0.853862660944206, + "grad_norm": 0.45703125, + "learning_rate": 4.744632286019543e-06, + "loss": 2.2693, + "step": 15916 + }, + { + "epoch": 0.8539163090128755, + "grad_norm": 0.412109375, + "learning_rate": 4.744594033337746e-06, + "loss": 1.8295, + "step": 15917 + }, + { + "epoch": 0.8539699570815451, + "grad_norm": 0.435546875, + "learning_rate": 4.7445557779453695e-06, + "loss": 2.1967, + "step": 15918 + }, + { + "epoch": 0.8540236051502146, + "grad_norm": 0.458984375, + "learning_rate": 4.74451751984246e-06, + "loss": 2.2544, + "step": 15919 + }, + { + "epoch": 0.8540772532188842, + "grad_norm": 0.52734375, + "learning_rate": 4.744479259029061e-06, + "loss": 2.5571, + "step": 15920 + }, + { + "epoch": 0.8541309012875536, + "grad_norm": 0.48046875, + "learning_rate": 4.744440995505221e-06, + "loss": 2.2807, + "step": 15921 + }, + { + "epoch": 0.8541845493562231, + "grad_norm": 0.48046875, + "learning_rate": 4.744402729270986e-06, + "loss": 2.1818, + "step": 15922 + }, + { + "epoch": 0.8542381974248927, + "grad_norm": 0.6015625, + "learning_rate": 4.744364460326402e-06, + "loss": 2.1816, + "step": 15923 + }, + { + "epoch": 0.8542918454935622, + "grad_norm": 0.54296875, + "learning_rate": 4.744326188671514e-06, + "loss": 2.7351, + "step": 15924 + }, + { + "epoch": 0.8543454935622318, + "grad_norm": 0.478515625, + "learning_rate": 4.744287914306371e-06, + "loss": 2.3719, + "step": 15925 + }, + { + "epoch": 0.8543991416309012, + "grad_norm": 0.421875, + "learning_rate": 4.744249637231015e-06, + "loss": 2.1673, + "step": 15926 + }, + { + "epoch": 0.8544527896995708, + "grad_norm": 0.46875, + "learning_rate": 4.744211357445497e-06, + "loss": 2.31, + "step": 15927 + }, + { + "epoch": 0.8545064377682403, + "grad_norm": 0.515625, + "learning_rate": 4.744173074949861e-06, + "loss": 2.3979, + "step": 15928 + }, + { + "epoch": 0.8545600858369099, + "grad_norm": 0.5078125, + "learning_rate": 4.744134789744151e-06, + "loss": 2.3157, + "step": 15929 + }, + { + "epoch": 0.8546137339055794, + "grad_norm": 0.431640625, + "learning_rate": 4.744096501828417e-06, + "loss": 2.2577, + "step": 15930 + }, + { + "epoch": 0.854667381974249, + "grad_norm": 0.4453125, + "learning_rate": 4.744058211202704e-06, + "loss": 2.4478, + "step": 15931 + }, + { + "epoch": 0.8547210300429184, + "grad_norm": 0.349609375, + "learning_rate": 4.744019917867057e-06, + "loss": 2.1987, + "step": 15932 + }, + { + "epoch": 0.854774678111588, + "grad_norm": 0.57421875, + "learning_rate": 4.743981621821523e-06, + "loss": 1.3968, + "step": 15933 + }, + { + "epoch": 0.8548283261802575, + "grad_norm": 0.38671875, + "learning_rate": 4.743943323066149e-06, + "loss": 2.2005, + "step": 15934 + }, + { + "epoch": 0.8548819742489271, + "grad_norm": 0.484375, + "learning_rate": 4.743905021600981e-06, + "loss": 2.3938, + "step": 15935 + }, + { + "epoch": 0.8549356223175966, + "grad_norm": 0.427734375, + "learning_rate": 4.743866717426064e-06, + "loss": 2.4793, + "step": 15936 + }, + { + "epoch": 0.854989270386266, + "grad_norm": 0.482421875, + "learning_rate": 4.743828410541445e-06, + "loss": 2.7063, + "step": 15937 + }, + { + "epoch": 0.8550429184549356, + "grad_norm": 1.1875, + "learning_rate": 4.743790100947171e-06, + "loss": 2.1428, + "step": 15938 + }, + { + "epoch": 0.8550965665236051, + "grad_norm": 10.1875, + "learning_rate": 4.743751788643288e-06, + "loss": 3.0857, + "step": 15939 + }, + { + "epoch": 0.8551502145922747, + "grad_norm": 0.3828125, + "learning_rate": 4.743713473629842e-06, + "loss": 2.2867, + "step": 15940 + }, + { + "epoch": 0.8552038626609442, + "grad_norm": 0.5546875, + "learning_rate": 4.743675155906878e-06, + "loss": 2.7735, + "step": 15941 + }, + { + "epoch": 0.8552575107296138, + "grad_norm": 0.451171875, + "learning_rate": 4.743636835474444e-06, + "loss": 2.2915, + "step": 15942 + }, + { + "epoch": 0.8553111587982832, + "grad_norm": 0.47265625, + "learning_rate": 4.743598512332586e-06, + "loss": 2.4443, + "step": 15943 + }, + { + "epoch": 0.8553648068669528, + "grad_norm": 0.4140625, + "learning_rate": 4.74356018648135e-06, + "loss": 2.4922, + "step": 15944 + }, + { + "epoch": 0.8554184549356223, + "grad_norm": 0.4765625, + "learning_rate": 4.743521857920782e-06, + "loss": 1.9469, + "step": 15945 + }, + { + "epoch": 0.8554721030042919, + "grad_norm": 0.439453125, + "learning_rate": 4.74348352665093e-06, + "loss": 2.1032, + "step": 15946 + }, + { + "epoch": 0.8555257510729614, + "grad_norm": 0.470703125, + "learning_rate": 4.7434451926718375e-06, + "loss": 2.4103, + "step": 15947 + }, + { + "epoch": 0.855579399141631, + "grad_norm": 0.43359375, + "learning_rate": 4.743406855983552e-06, + "loss": 2.3222, + "step": 15948 + }, + { + "epoch": 0.8556330472103004, + "grad_norm": 0.47265625, + "learning_rate": 4.7433685165861204e-06, + "loss": 2.4901, + "step": 15949 + }, + { + "epoch": 0.85568669527897, + "grad_norm": 0.45703125, + "learning_rate": 4.743330174479589e-06, + "loss": 2.3615, + "step": 15950 + }, + { + "epoch": 0.8557403433476395, + "grad_norm": 0.5234375, + "learning_rate": 4.7432918296640025e-06, + "loss": 2.203, + "step": 15951 + }, + { + "epoch": 0.855793991416309, + "grad_norm": 0.46875, + "learning_rate": 4.743253482139409e-06, + "loss": 2.4082, + "step": 15952 + }, + { + "epoch": 0.8558476394849786, + "grad_norm": 0.51171875, + "learning_rate": 4.743215131905855e-06, + "loss": 2.3864, + "step": 15953 + }, + { + "epoch": 0.855901287553648, + "grad_norm": 0.421875, + "learning_rate": 4.743176778963386e-06, + "loss": 2.162, + "step": 15954 + }, + { + "epoch": 0.8559549356223176, + "grad_norm": 0.4453125, + "learning_rate": 4.743138423312047e-06, + "loss": 2.1048, + "step": 15955 + }, + { + "epoch": 0.8560085836909871, + "grad_norm": 0.46484375, + "learning_rate": 4.743100064951885e-06, + "loss": 2.3436, + "step": 15956 + }, + { + "epoch": 0.8560622317596567, + "grad_norm": 0.48828125, + "learning_rate": 4.743061703882949e-06, + "loss": 2.2494, + "step": 15957 + }, + { + "epoch": 0.8561158798283262, + "grad_norm": 0.4296875, + "learning_rate": 4.7430233401052825e-06, + "loss": 2.2183, + "step": 15958 + }, + { + "epoch": 0.8561695278969957, + "grad_norm": 0.43359375, + "learning_rate": 4.742984973618933e-06, + "loss": 2.2999, + "step": 15959 + }, + { + "epoch": 0.8562231759656652, + "grad_norm": 0.419921875, + "learning_rate": 4.742946604423946e-06, + "loss": 2.2428, + "step": 15960 + }, + { + "epoch": 0.8562768240343348, + "grad_norm": 0.498046875, + "learning_rate": 4.742908232520368e-06, + "loss": 2.3828, + "step": 15961 + }, + { + "epoch": 0.8563304721030043, + "grad_norm": 0.384765625, + "learning_rate": 4.742869857908246e-06, + "loss": 1.8307, + "step": 15962 + }, + { + "epoch": 0.8563841201716739, + "grad_norm": 0.52734375, + "learning_rate": 4.742831480587626e-06, + "loss": 2.1752, + "step": 15963 + }, + { + "epoch": 0.8564377682403433, + "grad_norm": 0.78515625, + "learning_rate": 4.7427931005585535e-06, + "loss": 2.2881, + "step": 15964 + }, + { + "epoch": 0.8564914163090128, + "grad_norm": 0.5859375, + "learning_rate": 4.742754717821077e-06, + "loss": 2.0376, + "step": 15965 + }, + { + "epoch": 0.8565450643776824, + "grad_norm": 1.25, + "learning_rate": 4.7427163323752406e-06, + "loss": 1.3768, + "step": 15966 + }, + { + "epoch": 0.8565987124463519, + "grad_norm": 0.474609375, + "learning_rate": 4.742677944221092e-06, + "loss": 2.2069, + "step": 15967 + }, + { + "epoch": 0.8566523605150215, + "grad_norm": 0.546875, + "learning_rate": 4.742639553358677e-06, + "loss": 2.2814, + "step": 15968 + }, + { + "epoch": 0.856706008583691, + "grad_norm": 0.45703125, + "learning_rate": 4.742601159788042e-06, + "loss": 2.2773, + "step": 15969 + }, + { + "epoch": 0.8567596566523605, + "grad_norm": 1.0390625, + "learning_rate": 4.742562763509233e-06, + "loss": 2.2486, + "step": 15970 + }, + { + "epoch": 0.85681330472103, + "grad_norm": 0.6171875, + "learning_rate": 4.742524364522297e-06, + "loss": 2.2679, + "step": 15971 + }, + { + "epoch": 0.8568669527896996, + "grad_norm": 0.51953125, + "learning_rate": 4.742485962827281e-06, + "loss": 2.2355, + "step": 15972 + }, + { + "epoch": 0.8569206008583691, + "grad_norm": 0.90625, + "learning_rate": 4.742447558424229e-06, + "loss": 2.3514, + "step": 15973 + }, + { + "epoch": 0.8569742489270387, + "grad_norm": 0.44140625, + "learning_rate": 4.74240915131319e-06, + "loss": 2.2739, + "step": 15974 + }, + { + "epoch": 0.8570278969957081, + "grad_norm": 0.47265625, + "learning_rate": 4.742370741494209e-06, + "loss": 2.3662, + "step": 15975 + }, + { + "epoch": 0.8570815450643777, + "grad_norm": 0.39453125, + "learning_rate": 4.7423323289673335e-06, + "loss": 2.2523, + "step": 15976 + }, + { + "epoch": 0.8571351931330472, + "grad_norm": 0.65234375, + "learning_rate": 4.742293913732609e-06, + "loss": 2.2476, + "step": 15977 + }, + { + "epoch": 0.8571888412017168, + "grad_norm": 0.48046875, + "learning_rate": 4.7422554957900805e-06, + "loss": 2.3707, + "step": 15978 + }, + { + "epoch": 0.8572424892703863, + "grad_norm": 0.439453125, + "learning_rate": 4.742217075139796e-06, + "loss": 2.3568, + "step": 15979 + }, + { + "epoch": 0.8572961373390557, + "grad_norm": 0.43359375, + "learning_rate": 4.742178651781804e-06, + "loss": 2.3667, + "step": 15980 + }, + { + "epoch": 0.8573497854077253, + "grad_norm": 0.498046875, + "learning_rate": 4.7421402257161465e-06, + "loss": 2.1036, + "step": 15981 + }, + { + "epoch": 0.8574034334763948, + "grad_norm": 0.36328125, + "learning_rate": 4.742101796942873e-06, + "loss": 2.3705, + "step": 15982 + }, + { + "epoch": 0.8574570815450644, + "grad_norm": 0.4921875, + "learning_rate": 4.742063365462028e-06, + "loss": 2.2703, + "step": 15983 + }, + { + "epoch": 0.8575107296137339, + "grad_norm": 0.451171875, + "learning_rate": 4.74202493127366e-06, + "loss": 2.1598, + "step": 15984 + }, + { + "epoch": 0.8575643776824035, + "grad_norm": 0.51953125, + "learning_rate": 4.741986494377814e-06, + "loss": 2.362, + "step": 15985 + }, + { + "epoch": 0.8576180257510729, + "grad_norm": 0.91015625, + "learning_rate": 4.7419480547745366e-06, + "loss": 2.3252, + "step": 15986 + }, + { + "epoch": 0.8576716738197425, + "grad_norm": 0.73828125, + "learning_rate": 4.741909612463874e-06, + "loss": 2.4608, + "step": 15987 + }, + { + "epoch": 0.857725321888412, + "grad_norm": 0.65234375, + "learning_rate": 4.7418711674458735e-06, + "loss": 2.2539, + "step": 15988 + }, + { + "epoch": 0.8577789699570816, + "grad_norm": 0.470703125, + "learning_rate": 4.74183271972058e-06, + "loss": 2.3626, + "step": 15989 + }, + { + "epoch": 0.8578326180257511, + "grad_norm": 0.455078125, + "learning_rate": 4.741794269288042e-06, + "loss": 2.3326, + "step": 15990 + }, + { + "epoch": 0.8578862660944206, + "grad_norm": 0.546875, + "learning_rate": 4.741755816148304e-06, + "loss": 2.3163, + "step": 15991 + }, + { + "epoch": 0.8579399141630901, + "grad_norm": 0.57421875, + "learning_rate": 4.741717360301414e-06, + "loss": 2.0449, + "step": 15992 + }, + { + "epoch": 0.8579935622317597, + "grad_norm": 0.625, + "learning_rate": 4.741678901747417e-06, + "loss": 1.9352, + "step": 15993 + }, + { + "epoch": 0.8580472103004292, + "grad_norm": 0.51171875, + "learning_rate": 4.741640440486361e-06, + "loss": 2.0259, + "step": 15994 + }, + { + "epoch": 0.8581008583690987, + "grad_norm": 0.3984375, + "learning_rate": 4.741601976518292e-06, + "loss": 2.2382, + "step": 15995 + }, + { + "epoch": 0.8581545064377682, + "grad_norm": 0.470703125, + "learning_rate": 4.741563509843255e-06, + "loss": 1.4793, + "step": 15996 + }, + { + "epoch": 0.8582081545064377, + "grad_norm": 0.443359375, + "learning_rate": 4.741525040461298e-06, + "loss": 2.218, + "step": 15997 + }, + { + "epoch": 0.8582618025751073, + "grad_norm": 0.5546875, + "learning_rate": 4.7414865683724665e-06, + "loss": 2.2599, + "step": 15998 + }, + { + "epoch": 0.8583154506437768, + "grad_norm": 0.498046875, + "learning_rate": 4.741448093576807e-06, + "loss": 2.1112, + "step": 15999 + }, + { + "epoch": 0.8583690987124464, + "grad_norm": 0.41015625, + "learning_rate": 4.741409616074367e-06, + "loss": 2.2718, + "step": 16000 + }, + { + "epoch": 0.8584227467811159, + "grad_norm": 0.451171875, + "learning_rate": 4.7413711358651925e-06, + "loss": 2.1596, + "step": 16001 + }, + { + "epoch": 0.8584763948497854, + "grad_norm": 0.55859375, + "learning_rate": 4.74133265294933e-06, + "loss": 2.4565, + "step": 16002 + }, + { + "epoch": 0.8585300429184549, + "grad_norm": 0.44140625, + "learning_rate": 4.7412941673268255e-06, + "loss": 2.2112, + "step": 16003 + }, + { + "epoch": 0.8585836909871245, + "grad_norm": 0.365234375, + "learning_rate": 4.741255678997726e-06, + "loss": 2.1162, + "step": 16004 + }, + { + "epoch": 0.858637339055794, + "grad_norm": 0.51171875, + "learning_rate": 4.741217187962077e-06, + "loss": 2.2476, + "step": 16005 + }, + { + "epoch": 0.8586909871244636, + "grad_norm": 0.48828125, + "learning_rate": 4.741178694219927e-06, + "loss": 2.2134, + "step": 16006 + }, + { + "epoch": 0.858744635193133, + "grad_norm": 0.484375, + "learning_rate": 4.74114019777132e-06, + "loss": 2.1756, + "step": 16007 + }, + { + "epoch": 0.8587982832618025, + "grad_norm": 0.5, + "learning_rate": 4.741101698616304e-06, + "loss": 2.2411, + "step": 16008 + }, + { + "epoch": 0.8588519313304721, + "grad_norm": 0.474609375, + "learning_rate": 4.741063196754925e-06, + "loss": 2.3175, + "step": 16009 + }, + { + "epoch": 0.8589055793991416, + "grad_norm": 0.6796875, + "learning_rate": 4.74102469218723e-06, + "loss": 2.4347, + "step": 16010 + }, + { + "epoch": 0.8589592274678112, + "grad_norm": 0.451171875, + "learning_rate": 4.740986184913266e-06, + "loss": 2.4184, + "step": 16011 + }, + { + "epoch": 0.8590128755364806, + "grad_norm": 0.59375, + "learning_rate": 4.740947674933078e-06, + "loss": 2.4437, + "step": 16012 + }, + { + "epoch": 0.8590665236051502, + "grad_norm": 0.6328125, + "learning_rate": 4.7409091622467125e-06, + "loss": 2.4567, + "step": 16013 + }, + { + "epoch": 0.8591201716738197, + "grad_norm": 0.48828125, + "learning_rate": 4.740870646854218e-06, + "loss": 2.279, + "step": 16014 + }, + { + "epoch": 0.8591738197424893, + "grad_norm": 0.482421875, + "learning_rate": 4.740832128755638e-06, + "loss": 2.3773, + "step": 16015 + }, + { + "epoch": 0.8592274678111588, + "grad_norm": 0.4375, + "learning_rate": 4.740793607951022e-06, + "loss": 2.3317, + "step": 16016 + }, + { + "epoch": 0.8592811158798284, + "grad_norm": 0.48828125, + "learning_rate": 4.7407550844404145e-06, + "loss": 2.1749, + "step": 16017 + }, + { + "epoch": 0.8593347639484978, + "grad_norm": 0.435546875, + "learning_rate": 4.7407165582238635e-06, + "loss": 2.3476, + "step": 16018 + }, + { + "epoch": 0.8593884120171674, + "grad_norm": 0.5, + "learning_rate": 4.740678029301414e-06, + "loss": 2.307, + "step": 16019 + }, + { + "epoch": 0.8594420600858369, + "grad_norm": 0.66796875, + "learning_rate": 4.740639497673114e-06, + "loss": 2.1577, + "step": 16020 + }, + { + "epoch": 0.8594957081545065, + "grad_norm": 0.44140625, + "learning_rate": 4.740600963339009e-06, + "loss": 2.1581, + "step": 16021 + }, + { + "epoch": 0.859549356223176, + "grad_norm": 0.439453125, + "learning_rate": 4.7405624262991465e-06, + "loss": 2.0356, + "step": 16022 + }, + { + "epoch": 0.8596030042918454, + "grad_norm": 0.423828125, + "learning_rate": 4.740523886553572e-06, + "loss": 2.2892, + "step": 16023 + }, + { + "epoch": 0.859656652360515, + "grad_norm": 0.482421875, + "learning_rate": 4.7404853441023325e-06, + "loss": 2.3676, + "step": 16024 + }, + { + "epoch": 0.8597103004291845, + "grad_norm": 0.435546875, + "learning_rate": 4.7404467989454735e-06, + "loss": 2.1533, + "step": 16025 + }, + { + "epoch": 0.8597639484978541, + "grad_norm": 0.451171875, + "learning_rate": 4.740408251083044e-06, + "loss": 2.4032, + "step": 16026 + }, + { + "epoch": 0.8598175965665236, + "grad_norm": 0.41015625, + "learning_rate": 4.7403697005150885e-06, + "loss": 2.29, + "step": 16027 + }, + { + "epoch": 0.8598712446351932, + "grad_norm": 0.55078125, + "learning_rate": 4.740331147241655e-06, + "loss": 2.4176, + "step": 16028 + }, + { + "epoch": 0.8599248927038626, + "grad_norm": 0.51953125, + "learning_rate": 4.740292591262787e-06, + "loss": 2.078, + "step": 16029 + }, + { + "epoch": 0.8599785407725322, + "grad_norm": 0.453125, + "learning_rate": 4.740254032578535e-06, + "loss": 2.2664, + "step": 16030 + }, + { + "epoch": 0.8600321888412017, + "grad_norm": 0.60546875, + "learning_rate": 4.740215471188943e-06, + "loss": 2.1346, + "step": 16031 + }, + { + "epoch": 0.8600858369098713, + "grad_norm": 0.4609375, + "learning_rate": 4.74017690709406e-06, + "loss": 2.1605, + "step": 16032 + }, + { + "epoch": 0.8601394849785408, + "grad_norm": 0.490234375, + "learning_rate": 4.740138340293929e-06, + "loss": 2.469, + "step": 16033 + }, + { + "epoch": 0.8601931330472103, + "grad_norm": 0.462890625, + "learning_rate": 4.7400997707886e-06, + "loss": 2.2435, + "step": 16034 + }, + { + "epoch": 0.8602467811158798, + "grad_norm": 0.439453125, + "learning_rate": 4.7400611985781176e-06, + "loss": 2.6332, + "step": 16035 + }, + { + "epoch": 0.8603004291845494, + "grad_norm": 0.46484375, + "learning_rate": 4.740022623662529e-06, + "loss": 1.8484, + "step": 16036 + }, + { + "epoch": 0.8603540772532189, + "grad_norm": 0.482421875, + "learning_rate": 4.73998404604188e-06, + "loss": 2.1589, + "step": 16037 + }, + { + "epoch": 0.8604077253218884, + "grad_norm": 0.43359375, + "learning_rate": 4.739945465716219e-06, + "loss": 2.1218, + "step": 16038 + }, + { + "epoch": 0.860461373390558, + "grad_norm": 1.3671875, + "learning_rate": 4.739906882685591e-06, + "loss": 2.3202, + "step": 16039 + }, + { + "epoch": 0.8605150214592274, + "grad_norm": 0.42578125, + "learning_rate": 4.739868296950043e-06, + "loss": 2.384, + "step": 16040 + }, + { + "epoch": 0.860568669527897, + "grad_norm": 0.47265625, + "learning_rate": 4.739829708509621e-06, + "loss": 2.055, + "step": 16041 + }, + { + "epoch": 0.8606223175965665, + "grad_norm": 0.5703125, + "learning_rate": 4.739791117364373e-06, + "loss": 2.4718, + "step": 16042 + }, + { + "epoch": 0.8606759656652361, + "grad_norm": 0.578125, + "learning_rate": 4.739752523514345e-06, + "loss": 2.1893, + "step": 16043 + }, + { + "epoch": 0.8607296137339056, + "grad_norm": 0.5390625, + "learning_rate": 4.739713926959583e-06, + "loss": 2.4773, + "step": 16044 + }, + { + "epoch": 0.8607832618025751, + "grad_norm": 0.443359375, + "learning_rate": 4.739675327700134e-06, + "loss": 2.2409, + "step": 16045 + }, + { + "epoch": 0.8608369098712446, + "grad_norm": 0.4765625, + "learning_rate": 4.739636725736045e-06, + "loss": 2.1323, + "step": 16046 + }, + { + "epoch": 0.8608905579399142, + "grad_norm": 0.474609375, + "learning_rate": 4.739598121067363e-06, + "loss": 2.4302, + "step": 16047 + }, + { + "epoch": 0.8609442060085837, + "grad_norm": 0.9609375, + "learning_rate": 4.739559513694132e-06, + "loss": 2.3395, + "step": 16048 + }, + { + "epoch": 0.8609978540772533, + "grad_norm": 0.482421875, + "learning_rate": 4.739520903616402e-06, + "loss": 2.2386, + "step": 16049 + }, + { + "epoch": 0.8610515021459227, + "grad_norm": 0.51953125, + "learning_rate": 4.739482290834217e-06, + "loss": 2.2519, + "step": 16050 + }, + { + "epoch": 0.8611051502145923, + "grad_norm": 1.0390625, + "learning_rate": 4.739443675347627e-06, + "loss": 2.3502, + "step": 16051 + }, + { + "epoch": 0.8611587982832618, + "grad_norm": 0.46875, + "learning_rate": 4.739405057156674e-06, + "loss": 2.4679, + "step": 16052 + }, + { + "epoch": 0.8612124463519313, + "grad_norm": 0.609375, + "learning_rate": 4.739366436261409e-06, + "loss": 2.2221, + "step": 16053 + }, + { + "epoch": 0.8612660944206009, + "grad_norm": 0.484375, + "learning_rate": 4.739327812661875e-06, + "loss": 2.3802, + "step": 16054 + }, + { + "epoch": 0.8613197424892703, + "grad_norm": 0.458984375, + "learning_rate": 4.739289186358121e-06, + "loss": 2.2695, + "step": 16055 + }, + { + "epoch": 0.8613733905579399, + "grad_norm": 0.5078125, + "learning_rate": 4.739250557350194e-06, + "loss": 2.4719, + "step": 16056 + }, + { + "epoch": 0.8614270386266094, + "grad_norm": 0.4296875, + "learning_rate": 4.739211925638139e-06, + "loss": 1.9472, + "step": 16057 + }, + { + "epoch": 0.861480686695279, + "grad_norm": 0.44921875, + "learning_rate": 4.739173291222002e-06, + "loss": 2.102, + "step": 16058 + }, + { + "epoch": 0.8615343347639485, + "grad_norm": 0.47265625, + "learning_rate": 4.739134654101833e-06, + "loss": 2.3453, + "step": 16059 + }, + { + "epoch": 0.8615879828326181, + "grad_norm": 0.388671875, + "learning_rate": 4.7390960142776746e-06, + "loss": 2.21, + "step": 16060 + }, + { + "epoch": 0.8616416309012875, + "grad_norm": 0.4921875, + "learning_rate": 4.739057371749577e-06, + "loss": 2.4388, + "step": 16061 + }, + { + "epoch": 0.8616952789699571, + "grad_norm": 0.458984375, + "learning_rate": 4.739018726517585e-06, + "loss": 2.4618, + "step": 16062 + }, + { + "epoch": 0.8617489270386266, + "grad_norm": 0.4375, + "learning_rate": 4.738980078581745e-06, + "loss": 2.268, + "step": 16063 + }, + { + "epoch": 0.8618025751072962, + "grad_norm": 37.25, + "learning_rate": 4.738941427942104e-06, + "loss": 2.2462, + "step": 16064 + }, + { + "epoch": 0.8618562231759657, + "grad_norm": 0.59375, + "learning_rate": 4.73890277459871e-06, + "loss": 2.4255, + "step": 16065 + }, + { + "epoch": 0.8619098712446351, + "grad_norm": 0.44140625, + "learning_rate": 4.738864118551608e-06, + "loss": 2.2443, + "step": 16066 + }, + { + "epoch": 0.8619635193133047, + "grad_norm": 0.46875, + "learning_rate": 4.7388254598008455e-06, + "loss": 2.2943, + "step": 16067 + }, + { + "epoch": 0.8620171673819742, + "grad_norm": 0.44921875, + "learning_rate": 4.7387867983464695e-06, + "loss": 2.3255, + "step": 16068 + }, + { + "epoch": 0.8620708154506438, + "grad_norm": 0.5, + "learning_rate": 4.738748134188525e-06, + "loss": 2.419, + "step": 16069 + }, + { + "epoch": 0.8621244635193133, + "grad_norm": 0.482421875, + "learning_rate": 4.73870946732706e-06, + "loss": 2.2061, + "step": 16070 + }, + { + "epoch": 0.8621781115879829, + "grad_norm": 0.498046875, + "learning_rate": 4.738670797762123e-06, + "loss": 2.3227, + "step": 16071 + }, + { + "epoch": 0.8622317596566523, + "grad_norm": 0.474609375, + "learning_rate": 4.738632125493757e-06, + "loss": 2.1801, + "step": 16072 + }, + { + "epoch": 0.8622854077253219, + "grad_norm": 0.53125, + "learning_rate": 4.738593450522011e-06, + "loss": 2.5233, + "step": 16073 + }, + { + "epoch": 0.8623390557939914, + "grad_norm": 0.5546875, + "learning_rate": 4.738554772846931e-06, + "loss": 2.4235, + "step": 16074 + }, + { + "epoch": 0.862392703862661, + "grad_norm": 0.462890625, + "learning_rate": 4.738516092468563e-06, + "loss": 2.3824, + "step": 16075 + }, + { + "epoch": 0.8624463519313305, + "grad_norm": 0.44921875, + "learning_rate": 4.738477409386955e-06, + "loss": 2.1734, + "step": 16076 + }, + { + "epoch": 0.8625, + "grad_norm": 0.4375, + "learning_rate": 4.738438723602154e-06, + "loss": 1.9114, + "step": 16077 + }, + { + "epoch": 0.8625536480686695, + "grad_norm": 0.416015625, + "learning_rate": 4.738400035114206e-06, + "loss": 2.2932, + "step": 16078 + }, + { + "epoch": 0.8626072961373391, + "grad_norm": 0.43359375, + "learning_rate": 4.738361343923158e-06, + "loss": 2.2231, + "step": 16079 + }, + { + "epoch": 0.8626609442060086, + "grad_norm": 0.40625, + "learning_rate": 4.738322650029056e-06, + "loss": 2.3534, + "step": 16080 + }, + { + "epoch": 0.8627145922746781, + "grad_norm": 0.376953125, + "learning_rate": 4.738283953431946e-06, + "loss": 2.1781, + "step": 16081 + }, + { + "epoch": 0.8627682403433476, + "grad_norm": 0.53515625, + "learning_rate": 4.738245254131877e-06, + "loss": 2.3511, + "step": 16082 + }, + { + "epoch": 0.8628218884120171, + "grad_norm": 0.41015625, + "learning_rate": 4.7382065521288945e-06, + "loss": 2.2328, + "step": 16083 + }, + { + "epoch": 0.8628755364806867, + "grad_norm": 0.46484375, + "learning_rate": 4.738167847423046e-06, + "loss": 2.1798, + "step": 16084 + }, + { + "epoch": 0.8629291845493562, + "grad_norm": 0.4609375, + "learning_rate": 4.738129140014377e-06, + "loss": 2.227, + "step": 16085 + }, + { + "epoch": 0.8629828326180258, + "grad_norm": 0.45703125, + "learning_rate": 4.738090429902935e-06, + "loss": 2.121, + "step": 16086 + }, + { + "epoch": 0.8630364806866953, + "grad_norm": 0.36328125, + "learning_rate": 4.738051717088766e-06, + "loss": 2.0907, + "step": 16087 + }, + { + "epoch": 0.8630901287553648, + "grad_norm": 0.43359375, + "learning_rate": 4.738013001571919e-06, + "loss": 2.2259, + "step": 16088 + }, + { + "epoch": 0.8631437768240343, + "grad_norm": 0.431640625, + "learning_rate": 4.737974283352437e-06, + "loss": 2.3161, + "step": 16089 + }, + { + "epoch": 0.8631974248927039, + "grad_norm": 0.515625, + "learning_rate": 4.73793556243037e-06, + "loss": 2.2171, + "step": 16090 + }, + { + "epoch": 0.8632510729613734, + "grad_norm": 0.478515625, + "learning_rate": 4.7378968388057635e-06, + "loss": 2.0835, + "step": 16091 + }, + { + "epoch": 0.863304721030043, + "grad_norm": 0.46875, + "learning_rate": 4.737858112478665e-06, + "loss": 2.2779, + "step": 16092 + }, + { + "epoch": 0.8633583690987124, + "grad_norm": 0.43359375, + "learning_rate": 4.73781938344912e-06, + "loss": 2.2909, + "step": 16093 + }, + { + "epoch": 0.863412017167382, + "grad_norm": 0.515625, + "learning_rate": 4.737780651717175e-06, + "loss": 2.2676, + "step": 16094 + }, + { + "epoch": 0.8634656652360515, + "grad_norm": 0.5, + "learning_rate": 4.73774191728288e-06, + "loss": 2.0911, + "step": 16095 + }, + { + "epoch": 0.863519313304721, + "grad_norm": 0.49609375, + "learning_rate": 4.737703180146277e-06, + "loss": 2.2118, + "step": 16096 + }, + { + "epoch": 0.8635729613733906, + "grad_norm": 0.44140625, + "learning_rate": 4.737664440307417e-06, + "loss": 2.2778, + "step": 16097 + }, + { + "epoch": 0.86362660944206, + "grad_norm": 0.4453125, + "learning_rate": 4.737625697766345e-06, + "loss": 2.3596, + "step": 16098 + }, + { + "epoch": 0.8636802575107296, + "grad_norm": 0.53515625, + "learning_rate": 4.737586952523107e-06, + "loss": 2.0895, + "step": 16099 + }, + { + "epoch": 0.8637339055793991, + "grad_norm": 0.53125, + "learning_rate": 4.737548204577751e-06, + "loss": 2.1661, + "step": 16100 + }, + { + "epoch": 0.8637875536480687, + "grad_norm": 0.427734375, + "learning_rate": 4.737509453930324e-06, + "loss": 2.2915, + "step": 16101 + }, + { + "epoch": 0.8638412017167382, + "grad_norm": 0.443359375, + "learning_rate": 4.737470700580871e-06, + "loss": 1.8711, + "step": 16102 + }, + { + "epoch": 0.8638948497854078, + "grad_norm": 0.3984375, + "learning_rate": 4.73743194452944e-06, + "loss": 2.2005, + "step": 16103 + }, + { + "epoch": 0.8639484978540772, + "grad_norm": 0.62109375, + "learning_rate": 4.737393185776078e-06, + "loss": 2.1533, + "step": 16104 + }, + { + "epoch": 0.8640021459227468, + "grad_norm": 0.443359375, + "learning_rate": 4.737354424320833e-06, + "loss": 2.4258, + "step": 16105 + }, + { + "epoch": 0.8640557939914163, + "grad_norm": 0.455078125, + "learning_rate": 4.737315660163748e-06, + "loss": 2.3426, + "step": 16106 + }, + { + "epoch": 0.8641094420600859, + "grad_norm": 0.5078125, + "learning_rate": 4.737276893304874e-06, + "loss": 2.2376, + "step": 16107 + }, + { + "epoch": 0.8641630901287554, + "grad_norm": 0.498046875, + "learning_rate": 4.737238123744255e-06, + "loss": 2.3137, + "step": 16108 + }, + { + "epoch": 0.8642167381974248, + "grad_norm": 0.61328125, + "learning_rate": 4.737199351481941e-06, + "loss": 2.0734, + "step": 16109 + }, + { + "epoch": 0.8642703862660944, + "grad_norm": 0.49609375, + "learning_rate": 4.737160576517974e-06, + "loss": 2.5085, + "step": 16110 + }, + { + "epoch": 0.8643240343347639, + "grad_norm": 0.48828125, + "learning_rate": 4.737121798852405e-06, + "loss": 2.1387, + "step": 16111 + }, + { + "epoch": 0.8643776824034335, + "grad_norm": 0.46484375, + "learning_rate": 4.737083018485279e-06, + "loss": 2.1598, + "step": 16112 + }, + { + "epoch": 0.864431330472103, + "grad_norm": 0.48828125, + "learning_rate": 4.737044235416643e-06, + "loss": 2.2226, + "step": 16113 + }, + { + "epoch": 0.8644849785407726, + "grad_norm": 0.6796875, + "learning_rate": 4.737005449646544e-06, + "loss": 1.9435, + "step": 16114 + }, + { + "epoch": 0.864538626609442, + "grad_norm": 0.55078125, + "learning_rate": 4.736966661175029e-06, + "loss": 2.3176, + "step": 16115 + }, + { + "epoch": 0.8645922746781116, + "grad_norm": 0.421875, + "learning_rate": 4.736927870002145e-06, + "loss": 2.2145, + "step": 16116 + }, + { + "epoch": 0.8646459227467811, + "grad_norm": 0.55078125, + "learning_rate": 4.736889076127938e-06, + "loss": 2.1062, + "step": 16117 + }, + { + "epoch": 0.8646995708154507, + "grad_norm": 0.484375, + "learning_rate": 4.736850279552455e-06, + "loss": 2.5156, + "step": 16118 + }, + { + "epoch": 0.8647532188841202, + "grad_norm": 0.470703125, + "learning_rate": 4.736811480275743e-06, + "loss": 2.1973, + "step": 16119 + }, + { + "epoch": 0.8648068669527897, + "grad_norm": 0.478515625, + "learning_rate": 4.736772678297851e-06, + "loss": 2.3051, + "step": 16120 + }, + { + "epoch": 0.8648605150214592, + "grad_norm": 0.55078125, + "learning_rate": 4.736733873618823e-06, + "loss": 2.4905, + "step": 16121 + }, + { + "epoch": 0.8649141630901288, + "grad_norm": 0.484375, + "learning_rate": 4.736695066238705e-06, + "loss": 2.4649, + "step": 16122 + }, + { + "epoch": 0.8649678111587983, + "grad_norm": 0.498046875, + "learning_rate": 4.736656256157548e-06, + "loss": 2.5072, + "step": 16123 + }, + { + "epoch": 0.8650214592274678, + "grad_norm": 1.0859375, + "learning_rate": 4.736617443375396e-06, + "loss": 2.5309, + "step": 16124 + }, + { + "epoch": 0.8650751072961373, + "grad_norm": 0.37890625, + "learning_rate": 4.736578627892296e-06, + "loss": 2.0651, + "step": 16125 + }, + { + "epoch": 0.8651287553648068, + "grad_norm": 0.51171875, + "learning_rate": 4.736539809708295e-06, + "loss": 2.2098, + "step": 16126 + }, + { + "epoch": 0.8651824034334764, + "grad_norm": 0.46875, + "learning_rate": 4.736500988823441e-06, + "loss": 2.1331, + "step": 16127 + }, + { + "epoch": 0.8652360515021459, + "grad_norm": 0.55859375, + "learning_rate": 4.736462165237779e-06, + "loss": 2.3736, + "step": 16128 + }, + { + "epoch": 0.8652896995708155, + "grad_norm": 0.423828125, + "learning_rate": 4.7364233389513584e-06, + "loss": 2.1553, + "step": 16129 + }, + { + "epoch": 0.865343347639485, + "grad_norm": 0.46484375, + "learning_rate": 4.736384509964223e-06, + "loss": 2.1132, + "step": 16130 + }, + { + "epoch": 0.8653969957081545, + "grad_norm": 0.41796875, + "learning_rate": 4.736345678276422e-06, + "loss": 2.1515, + "step": 16131 + }, + { + "epoch": 0.865450643776824, + "grad_norm": 0.56640625, + "learning_rate": 4.736306843888002e-06, + "loss": 2.3105, + "step": 16132 + }, + { + "epoch": 0.8655042918454936, + "grad_norm": 0.79296875, + "learning_rate": 4.736268006799009e-06, + "loss": 2.1385, + "step": 16133 + }, + { + "epoch": 0.8655579399141631, + "grad_norm": 0.4375, + "learning_rate": 4.73622916700949e-06, + "loss": 2.2212, + "step": 16134 + }, + { + "epoch": 0.8656115879828327, + "grad_norm": 0.51953125, + "learning_rate": 4.736190324519493e-06, + "loss": 2.2106, + "step": 16135 + }, + { + "epoch": 0.8656652360515021, + "grad_norm": 0.455078125, + "learning_rate": 4.736151479329063e-06, + "loss": 2.163, + "step": 16136 + }, + { + "epoch": 0.8657188841201717, + "grad_norm": 0.48046875, + "learning_rate": 4.73611263143825e-06, + "loss": 2.3667, + "step": 16137 + }, + { + "epoch": 0.8657725321888412, + "grad_norm": 0.470703125, + "learning_rate": 4.736073780847097e-06, + "loss": 2.2156, + "step": 16138 + }, + { + "epoch": 0.8658261802575107, + "grad_norm": 0.380859375, + "learning_rate": 4.736034927555654e-06, + "loss": 1.9898, + "step": 16139 + }, + { + "epoch": 0.8658798283261803, + "grad_norm": 0.47265625, + "learning_rate": 4.735996071563966e-06, + "loss": 2.1736, + "step": 16140 + }, + { + "epoch": 0.8659334763948497, + "grad_norm": 1.9375, + "learning_rate": 4.735957212872082e-06, + "loss": 2.5765, + "step": 16141 + }, + { + "epoch": 0.8659871244635193, + "grad_norm": 0.36328125, + "learning_rate": 4.7359183514800475e-06, + "loss": 1.8862, + "step": 16142 + }, + { + "epoch": 0.8660407725321888, + "grad_norm": 0.435546875, + "learning_rate": 4.7358794873879086e-06, + "loss": 2.1555, + "step": 16143 + }, + { + "epoch": 0.8660944206008584, + "grad_norm": 0.45703125, + "learning_rate": 4.735840620595714e-06, + "loss": 2.1733, + "step": 16144 + }, + { + "epoch": 0.8661480686695279, + "grad_norm": 2.03125, + "learning_rate": 4.7358017511035105e-06, + "loss": 2.2055, + "step": 16145 + }, + { + "epoch": 0.8662017167381975, + "grad_norm": 1.125, + "learning_rate": 4.735762878911344e-06, + "loss": 2.255, + "step": 16146 + }, + { + "epoch": 0.8662553648068669, + "grad_norm": 0.42578125, + "learning_rate": 4.735724004019261e-06, + "loss": 2.2608, + "step": 16147 + }, + { + "epoch": 0.8663090128755365, + "grad_norm": 0.515625, + "learning_rate": 4.735685126427309e-06, + "loss": 2.2679, + "step": 16148 + }, + { + "epoch": 0.866362660944206, + "grad_norm": 0.3359375, + "learning_rate": 4.7356462461355366e-06, + "loss": 1.9109, + "step": 16149 + }, + { + "epoch": 0.8664163090128756, + "grad_norm": 0.498046875, + "learning_rate": 4.735607363143989e-06, + "loss": 2.4609, + "step": 16150 + }, + { + "epoch": 0.8664699570815451, + "grad_norm": 0.515625, + "learning_rate": 4.7355684774527145e-06, + "loss": 2.5038, + "step": 16151 + }, + { + "epoch": 0.8665236051502145, + "grad_norm": 0.53515625, + "learning_rate": 4.7355295890617575e-06, + "loss": 2.2028, + "step": 16152 + }, + { + "epoch": 0.8665772532188841, + "grad_norm": 0.5, + "learning_rate": 4.735490697971168e-06, + "loss": 2.3347, + "step": 16153 + }, + { + "epoch": 0.8666309012875536, + "grad_norm": 0.44140625, + "learning_rate": 4.7354518041809905e-06, + "loss": 2.2546, + "step": 16154 + }, + { + "epoch": 0.8666845493562232, + "grad_norm": 0.4609375, + "learning_rate": 4.735412907691273e-06, + "loss": 2.2319, + "step": 16155 + }, + { + "epoch": 0.8667381974248927, + "grad_norm": 0.5, + "learning_rate": 4.735374008502063e-06, + "loss": 1.8664, + "step": 16156 + }, + { + "epoch": 0.8667918454935623, + "grad_norm": 0.37109375, + "learning_rate": 4.735335106613407e-06, + "loss": 2.2296, + "step": 16157 + }, + { + "epoch": 0.8668454935622317, + "grad_norm": 0.9140625, + "learning_rate": 4.7352962020253515e-06, + "loss": 1.4953, + "step": 16158 + }, + { + "epoch": 0.8668991416309013, + "grad_norm": 0.412109375, + "learning_rate": 4.735257294737944e-06, + "loss": 2.2091, + "step": 16159 + }, + { + "epoch": 0.8669527896995708, + "grad_norm": 0.5546875, + "learning_rate": 4.735218384751232e-06, + "loss": 2.3106, + "step": 16160 + }, + { + "epoch": 0.8670064377682404, + "grad_norm": 0.482421875, + "learning_rate": 4.735179472065261e-06, + "loss": 2.3012, + "step": 16161 + }, + { + "epoch": 0.8670600858369099, + "grad_norm": 0.498046875, + "learning_rate": 4.73514055668008e-06, + "loss": 2.5067, + "step": 16162 + }, + { + "epoch": 0.8671137339055794, + "grad_norm": 0.416015625, + "learning_rate": 4.7351016385957335e-06, + "loss": 2.3641, + "step": 16163 + }, + { + "epoch": 0.8671673819742489, + "grad_norm": 0.51171875, + "learning_rate": 4.735062717812271e-06, + "loss": 2.6828, + "step": 16164 + }, + { + "epoch": 0.8672210300429185, + "grad_norm": 0.5234375, + "learning_rate": 4.735023794329738e-06, + "loss": 2.5088, + "step": 16165 + }, + { + "epoch": 0.867274678111588, + "grad_norm": 0.46875, + "learning_rate": 4.734984868148183e-06, + "loss": 2.2188, + "step": 16166 + }, + { + "epoch": 0.8673283261802575, + "grad_norm": 0.546875, + "learning_rate": 4.73494593926765e-06, + "loss": 2.4729, + "step": 16167 + }, + { + "epoch": 0.867381974248927, + "grad_norm": 0.484375, + "learning_rate": 4.734907007688189e-06, + "loss": 2.0514, + "step": 16168 + }, + { + "epoch": 0.8674356223175965, + "grad_norm": 0.486328125, + "learning_rate": 4.734868073409845e-06, + "loss": 2.2371, + "step": 16169 + }, + { + "epoch": 0.8674892703862661, + "grad_norm": 0.439453125, + "learning_rate": 4.734829136432667e-06, + "loss": 2.1226, + "step": 16170 + }, + { + "epoch": 0.8675429184549356, + "grad_norm": 0.466796875, + "learning_rate": 4.734790196756701e-06, + "loss": 2.3728, + "step": 16171 + }, + { + "epoch": 0.8675965665236052, + "grad_norm": 0.494140625, + "learning_rate": 4.734751254381993e-06, + "loss": 2.3332, + "step": 16172 + }, + { + "epoch": 0.8676502145922746, + "grad_norm": 0.45703125, + "learning_rate": 4.734712309308592e-06, + "loss": 2.3154, + "step": 16173 + }, + { + "epoch": 0.8677038626609442, + "grad_norm": 0.5703125, + "learning_rate": 4.734673361536544e-06, + "loss": 2.1104, + "step": 16174 + }, + { + "epoch": 0.8677575107296137, + "grad_norm": 0.53515625, + "learning_rate": 4.734634411065895e-06, + "loss": 2.2734, + "step": 16175 + }, + { + "epoch": 0.8678111587982833, + "grad_norm": 0.431640625, + "learning_rate": 4.734595457896693e-06, + "loss": 2.2576, + "step": 16176 + }, + { + "epoch": 0.8678648068669528, + "grad_norm": 0.4765625, + "learning_rate": 4.734556502028987e-06, + "loss": 1.204, + "step": 16177 + }, + { + "epoch": 0.8679184549356224, + "grad_norm": 0.490234375, + "learning_rate": 4.734517543462821e-06, + "loss": 2.2199, + "step": 16178 + }, + { + "epoch": 0.8679721030042918, + "grad_norm": 0.53125, + "learning_rate": 4.734478582198243e-06, + "loss": 2.4259, + "step": 16179 + }, + { + "epoch": 0.8680257510729614, + "grad_norm": 0.3984375, + "learning_rate": 4.734439618235302e-06, + "loss": 1.5815, + "step": 16180 + }, + { + "epoch": 0.8680793991416309, + "grad_norm": 0.4765625, + "learning_rate": 4.734400651574041e-06, + "loss": 2.2412, + "step": 16181 + }, + { + "epoch": 0.8681330472103004, + "grad_norm": 0.52734375, + "learning_rate": 4.734361682214511e-06, + "loss": 2.4472, + "step": 16182 + }, + { + "epoch": 0.86818669527897, + "grad_norm": 0.53515625, + "learning_rate": 4.734322710156756e-06, + "loss": 2.3987, + "step": 16183 + }, + { + "epoch": 0.8682403433476394, + "grad_norm": 0.47265625, + "learning_rate": 4.734283735400826e-06, + "loss": 2.2632, + "step": 16184 + }, + { + "epoch": 0.868293991416309, + "grad_norm": 0.44140625, + "learning_rate": 4.734244757946765e-06, + "loss": 2.2033, + "step": 16185 + }, + { + "epoch": 0.8683476394849785, + "grad_norm": 0.447265625, + "learning_rate": 4.734205777794624e-06, + "loss": 2.41, + "step": 16186 + }, + { + "epoch": 0.8684012875536481, + "grad_norm": 0.470703125, + "learning_rate": 4.734166794944446e-06, + "loss": 2.4802, + "step": 16187 + }, + { + "epoch": 0.8684549356223176, + "grad_norm": 0.51953125, + "learning_rate": 4.73412780939628e-06, + "loss": 2.329, + "step": 16188 + }, + { + "epoch": 0.8685085836909872, + "grad_norm": 0.4765625, + "learning_rate": 4.734088821150173e-06, + "loss": 2.2145, + "step": 16189 + }, + { + "epoch": 0.8685622317596566, + "grad_norm": 0.5625, + "learning_rate": 4.7340498302061725e-06, + "loss": 2.0311, + "step": 16190 + }, + { + "epoch": 0.8686158798283262, + "grad_norm": 0.5234375, + "learning_rate": 4.7340108365643245e-06, + "loss": 2.1693, + "step": 16191 + }, + { + "epoch": 0.8686695278969957, + "grad_norm": 0.4296875, + "learning_rate": 4.733971840224677e-06, + "loss": 2.2516, + "step": 16192 + }, + { + "epoch": 0.8687231759656653, + "grad_norm": 0.451171875, + "learning_rate": 4.733932841187276e-06, + "loss": 2.1702, + "step": 16193 + }, + { + "epoch": 0.8687768240343348, + "grad_norm": 0.546875, + "learning_rate": 4.73389383945217e-06, + "loss": 1.2244, + "step": 16194 + }, + { + "epoch": 0.8688304721030042, + "grad_norm": 0.44921875, + "learning_rate": 4.733854835019405e-06, + "loss": 2.3824, + "step": 16195 + }, + { + "epoch": 0.8688841201716738, + "grad_norm": 0.47265625, + "learning_rate": 4.733815827889028e-06, + "loss": 2.31, + "step": 16196 + }, + { + "epoch": 0.8689377682403433, + "grad_norm": 0.482421875, + "learning_rate": 4.733776818061088e-06, + "loss": 2.274, + "step": 16197 + }, + { + "epoch": 0.8689914163090129, + "grad_norm": 0.490234375, + "learning_rate": 4.73373780553563e-06, + "loss": 2.2036, + "step": 16198 + }, + { + "epoch": 0.8690450643776824, + "grad_norm": 0.458984375, + "learning_rate": 4.733698790312702e-06, + "loss": 2.2761, + "step": 16199 + }, + { + "epoch": 0.869098712446352, + "grad_norm": 0.380859375, + "learning_rate": 4.73365977239235e-06, + "loss": 1.9815, + "step": 16200 + }, + { + "epoch": 0.8691523605150214, + "grad_norm": 0.44921875, + "learning_rate": 4.733620751774624e-06, + "loss": 2.133, + "step": 16201 + }, + { + "epoch": 0.869206008583691, + "grad_norm": 0.53515625, + "learning_rate": 4.733581728459568e-06, + "loss": 2.1794, + "step": 16202 + }, + { + "epoch": 0.8692596566523605, + "grad_norm": 0.484375, + "learning_rate": 4.733542702447231e-06, + "loss": 2.3222, + "step": 16203 + }, + { + "epoch": 0.8693133047210301, + "grad_norm": 0.51171875, + "learning_rate": 4.733503673737659e-06, + "loss": 2.4082, + "step": 16204 + }, + { + "epoch": 0.8693669527896996, + "grad_norm": 0.84375, + "learning_rate": 4.733464642330899e-06, + "loss": 2.2511, + "step": 16205 + }, + { + "epoch": 0.8694206008583691, + "grad_norm": 0.5703125, + "learning_rate": 4.733425608227e-06, + "loss": 2.2299, + "step": 16206 + }, + { + "epoch": 0.8694742489270386, + "grad_norm": 0.7109375, + "learning_rate": 4.733386571426007e-06, + "loss": 2.0396, + "step": 16207 + }, + { + "epoch": 0.8695278969957082, + "grad_norm": 0.55078125, + "learning_rate": 4.7333475319279684e-06, + "loss": 2.408, + "step": 16208 + }, + { + "epoch": 0.8695815450643777, + "grad_norm": 0.45703125, + "learning_rate": 4.733308489732931e-06, + "loss": 2.1631, + "step": 16209 + }, + { + "epoch": 0.8696351931330472, + "grad_norm": 0.44140625, + "learning_rate": 4.733269444840941e-06, + "loss": 2.1235, + "step": 16210 + }, + { + "epoch": 0.8696888412017167, + "grad_norm": 0.439453125, + "learning_rate": 4.733230397252048e-06, + "loss": 1.9867, + "step": 16211 + }, + { + "epoch": 0.8697424892703862, + "grad_norm": 0.419921875, + "learning_rate": 4.733191346966296e-06, + "loss": 2.1343, + "step": 16212 + }, + { + "epoch": 0.8697961373390558, + "grad_norm": 0.392578125, + "learning_rate": 4.733152293983735e-06, + "loss": 2.3282, + "step": 16213 + }, + { + "epoch": 0.8698497854077253, + "grad_norm": 0.53125, + "learning_rate": 4.73311323830441e-06, + "loss": 2.1746, + "step": 16214 + }, + { + "epoch": 0.8699034334763949, + "grad_norm": 0.416015625, + "learning_rate": 4.733074179928369e-06, + "loss": 2.3811, + "step": 16215 + }, + { + "epoch": 0.8699570815450643, + "grad_norm": 0.46875, + "learning_rate": 4.73303511885566e-06, + "loss": 2.156, + "step": 16216 + }, + { + "epoch": 0.8700107296137339, + "grad_norm": 0.455078125, + "learning_rate": 4.73299605508633e-06, + "loss": 2.2319, + "step": 16217 + }, + { + "epoch": 0.8700643776824034, + "grad_norm": 0.498046875, + "learning_rate": 4.732956988620424e-06, + "loss": 2.1062, + "step": 16218 + }, + { + "epoch": 0.870118025751073, + "grad_norm": 0.41796875, + "learning_rate": 4.732917919457992e-06, + "loss": 2.3941, + "step": 16219 + }, + { + "epoch": 0.8701716738197425, + "grad_norm": 0.421875, + "learning_rate": 4.732878847599079e-06, + "loss": 2.2916, + "step": 16220 + }, + { + "epoch": 0.8702253218884121, + "grad_norm": 0.439453125, + "learning_rate": 4.732839773043734e-06, + "loss": 2.1455, + "step": 16221 + }, + { + "epoch": 0.8702789699570815, + "grad_norm": 0.66796875, + "learning_rate": 4.732800695792002e-06, + "loss": 2.3452, + "step": 16222 + }, + { + "epoch": 0.8703326180257511, + "grad_norm": 0.71875, + "learning_rate": 4.732761615843934e-06, + "loss": 2.3055, + "step": 16223 + }, + { + "epoch": 0.8703862660944206, + "grad_norm": 0.44140625, + "learning_rate": 4.732722533199573e-06, + "loss": 2.09, + "step": 16224 + }, + { + "epoch": 0.8704399141630901, + "grad_norm": 0.56640625, + "learning_rate": 4.732683447858968e-06, + "loss": 2.2565, + "step": 16225 + }, + { + "epoch": 0.8704935622317597, + "grad_norm": 0.53125, + "learning_rate": 4.7326443598221665e-06, + "loss": 2.1602, + "step": 16226 + }, + { + "epoch": 0.8705472103004291, + "grad_norm": 0.427734375, + "learning_rate": 4.732605269089214e-06, + "loss": 2.4423, + "step": 16227 + }, + { + "epoch": 0.8706008583690987, + "grad_norm": 0.453125, + "learning_rate": 4.7325661756601606e-06, + "loss": 2.4568, + "step": 16228 + }, + { + "epoch": 0.8706545064377682, + "grad_norm": 0.40625, + "learning_rate": 4.732527079535051e-06, + "loss": 1.8182, + "step": 16229 + }, + { + "epoch": 0.8707081545064378, + "grad_norm": 0.59765625, + "learning_rate": 4.732487980713934e-06, + "loss": 2.467, + "step": 16230 + }, + { + "epoch": 0.8707618025751073, + "grad_norm": 0.486328125, + "learning_rate": 4.732448879196855e-06, + "loss": 2.2069, + "step": 16231 + }, + { + "epoch": 0.8708154506437769, + "grad_norm": 0.455078125, + "learning_rate": 4.732409774983863e-06, + "loss": 2.1856, + "step": 16232 + }, + { + "epoch": 0.8708690987124463, + "grad_norm": 0.42578125, + "learning_rate": 4.732370668075005e-06, + "loss": 2.1901, + "step": 16233 + }, + { + "epoch": 0.8709227467811159, + "grad_norm": 0.435546875, + "learning_rate": 4.732331558470328e-06, + "loss": 2.3575, + "step": 16234 + }, + { + "epoch": 0.8709763948497854, + "grad_norm": 0.5859375, + "learning_rate": 4.732292446169878e-06, + "loss": 1.3674, + "step": 16235 + }, + { + "epoch": 0.871030042918455, + "grad_norm": 0.4609375, + "learning_rate": 4.732253331173704e-06, + "loss": 2.2972, + "step": 16236 + }, + { + "epoch": 0.8710836909871245, + "grad_norm": 0.57421875, + "learning_rate": 4.7322142134818525e-06, + "loss": 2.2526, + "step": 16237 + }, + { + "epoch": 0.871137339055794, + "grad_norm": 0.466796875, + "learning_rate": 4.732175093094371e-06, + "loss": 2.174, + "step": 16238 + }, + { + "epoch": 0.8711909871244635, + "grad_norm": 0.6484375, + "learning_rate": 4.732135970011306e-06, + "loss": 2.2787, + "step": 16239 + }, + { + "epoch": 0.871244635193133, + "grad_norm": 0.51171875, + "learning_rate": 4.732096844232705e-06, + "loss": 2.0985, + "step": 16240 + }, + { + "epoch": 0.8712982832618026, + "grad_norm": 0.494140625, + "learning_rate": 4.732057715758616e-06, + "loss": 2.646, + "step": 16241 + }, + { + "epoch": 0.8713519313304721, + "grad_norm": 0.447265625, + "learning_rate": 4.732018584589086e-06, + "loss": 2.4647, + "step": 16242 + }, + { + "epoch": 0.8714055793991416, + "grad_norm": 0.486328125, + "learning_rate": 4.731979450724161e-06, + "loss": 2.2826, + "step": 16243 + }, + { + "epoch": 0.8714592274678111, + "grad_norm": 0.49609375, + "learning_rate": 4.731940314163891e-06, + "loss": 2.2483, + "step": 16244 + }, + { + "epoch": 0.8715128755364807, + "grad_norm": 0.75, + "learning_rate": 4.73190117490832e-06, + "loss": 1.8352, + "step": 16245 + }, + { + "epoch": 0.8715665236051502, + "grad_norm": 0.4765625, + "learning_rate": 4.731862032957497e-06, + "loss": 2.5003, + "step": 16246 + }, + { + "epoch": 0.8716201716738198, + "grad_norm": 0.4921875, + "learning_rate": 4.7318228883114694e-06, + "loss": 2.3847, + "step": 16247 + }, + { + "epoch": 0.8716738197424893, + "grad_norm": 0.462890625, + "learning_rate": 4.731783740970284e-06, + "loss": 2.3879, + "step": 16248 + }, + { + "epoch": 0.8717274678111588, + "grad_norm": 0.46484375, + "learning_rate": 4.731744590933988e-06, + "loss": 2.2693, + "step": 16249 + }, + { + "epoch": 0.8717811158798283, + "grad_norm": 0.396484375, + "learning_rate": 4.7317054382026285e-06, + "loss": 2.2298, + "step": 16250 + }, + { + "epoch": 0.8718347639484979, + "grad_norm": 0.5546875, + "learning_rate": 4.731666282776253e-06, + "loss": 2.428, + "step": 16251 + }, + { + "epoch": 0.8718884120171674, + "grad_norm": 0.5078125, + "learning_rate": 4.73162712465491e-06, + "loss": 2.2805, + "step": 16252 + }, + { + "epoch": 0.8719420600858369, + "grad_norm": 0.50390625, + "learning_rate": 4.731587963838645e-06, + "loss": 2.2185, + "step": 16253 + }, + { + "epoch": 0.8719957081545064, + "grad_norm": 0.55859375, + "learning_rate": 4.7315488003275065e-06, + "loss": 2.2564, + "step": 16254 + }, + { + "epoch": 0.8720493562231759, + "grad_norm": 0.46875, + "learning_rate": 4.731509634121541e-06, + "loss": 2.1544, + "step": 16255 + }, + { + "epoch": 0.8721030042918455, + "grad_norm": 0.52734375, + "learning_rate": 4.731470465220796e-06, + "loss": 2.5443, + "step": 16256 + }, + { + "epoch": 0.872156652360515, + "grad_norm": 0.56640625, + "learning_rate": 4.7314312936253196e-06, + "loss": 2.3801, + "step": 16257 + }, + { + "epoch": 0.8722103004291846, + "grad_norm": 0.462890625, + "learning_rate": 4.731392119335157e-06, + "loss": 2.2574, + "step": 16258 + }, + { + "epoch": 0.872263948497854, + "grad_norm": 0.4609375, + "learning_rate": 4.731352942350358e-06, + "loss": 2.4634, + "step": 16259 + }, + { + "epoch": 0.8723175965665236, + "grad_norm": 0.62109375, + "learning_rate": 4.731313762670969e-06, + "loss": 2.4834, + "step": 16260 + }, + { + "epoch": 0.8723712446351931, + "grad_norm": 0.48828125, + "learning_rate": 4.731274580297037e-06, + "loss": 2.5936, + "step": 16261 + }, + { + "epoch": 0.8724248927038627, + "grad_norm": 0.451171875, + "learning_rate": 4.731235395228609e-06, + "loss": 2.3465, + "step": 16262 + }, + { + "epoch": 0.8724785407725322, + "grad_norm": 0.55078125, + "learning_rate": 4.7311962074657335e-06, + "loss": 2.2139, + "step": 16263 + }, + { + "epoch": 0.8725321888412018, + "grad_norm": 0.51171875, + "learning_rate": 4.731157017008457e-06, + "loss": 2.3767, + "step": 16264 + }, + { + "epoch": 0.8725858369098712, + "grad_norm": 0.48828125, + "learning_rate": 4.731117823856826e-06, + "loss": 2.0574, + "step": 16265 + }, + { + "epoch": 0.8726394849785408, + "grad_norm": 0.51171875, + "learning_rate": 4.731078628010889e-06, + "loss": 2.2764, + "step": 16266 + }, + { + "epoch": 0.8726931330472103, + "grad_norm": 1.671875, + "learning_rate": 4.731039429470693e-06, + "loss": 2.5203, + "step": 16267 + }, + { + "epoch": 0.8727467811158798, + "grad_norm": 0.40625, + "learning_rate": 4.731000228236286e-06, + "loss": 2.0125, + "step": 16268 + }, + { + "epoch": 0.8728004291845494, + "grad_norm": 0.400390625, + "learning_rate": 4.7309610243077145e-06, + "loss": 2.1089, + "step": 16269 + }, + { + "epoch": 0.8728540772532188, + "grad_norm": 0.423828125, + "learning_rate": 4.730921817685027e-06, + "loss": 2.1711, + "step": 16270 + }, + { + "epoch": 0.8729077253218884, + "grad_norm": 0.453125, + "learning_rate": 4.730882608368268e-06, + "loss": 2.1471, + "step": 16271 + }, + { + "epoch": 0.8729613733905579, + "grad_norm": 0.4296875, + "learning_rate": 4.7308433963574885e-06, + "loss": 2.1033, + "step": 16272 + }, + { + "epoch": 0.8730150214592275, + "grad_norm": 0.455078125, + "learning_rate": 4.730804181652733e-06, + "loss": 2.3065, + "step": 16273 + }, + { + "epoch": 0.873068669527897, + "grad_norm": 0.435546875, + "learning_rate": 4.730764964254051e-06, + "loss": 2.2693, + "step": 16274 + }, + { + "epoch": 0.8731223175965666, + "grad_norm": 0.4921875, + "learning_rate": 4.730725744161488e-06, + "loss": 2.3322, + "step": 16275 + }, + { + "epoch": 0.873175965665236, + "grad_norm": 0.484375, + "learning_rate": 4.730686521375093e-06, + "loss": 2.1912, + "step": 16276 + }, + { + "epoch": 0.8732296137339056, + "grad_norm": 0.40234375, + "learning_rate": 4.7306472958949126e-06, + "loss": 2.2481, + "step": 16277 + }, + { + "epoch": 0.8732832618025751, + "grad_norm": 0.44140625, + "learning_rate": 4.730608067720993e-06, + "loss": 2.2522, + "step": 16278 + }, + { + "epoch": 0.8733369098712447, + "grad_norm": 0.439453125, + "learning_rate": 4.730568836853384e-06, + "loss": 2.3632, + "step": 16279 + }, + { + "epoch": 0.8733905579399142, + "grad_norm": 0.46875, + "learning_rate": 4.730529603292131e-06, + "loss": 2.3769, + "step": 16280 + }, + { + "epoch": 0.8734442060085837, + "grad_norm": 0.59765625, + "learning_rate": 4.730490367037282e-06, + "loss": 1.5526, + "step": 16281 + }, + { + "epoch": 0.8734978540772532, + "grad_norm": 0.451171875, + "learning_rate": 4.7304511280888855e-06, + "loss": 2.3333, + "step": 16282 + }, + { + "epoch": 0.8735515021459227, + "grad_norm": 0.396484375, + "learning_rate": 4.7304118864469875e-06, + "loss": 2.2426, + "step": 16283 + }, + { + "epoch": 0.8736051502145923, + "grad_norm": 0.4921875, + "learning_rate": 4.730372642111636e-06, + "loss": 2.3454, + "step": 16284 + }, + { + "epoch": 0.8736587982832618, + "grad_norm": 0.56640625, + "learning_rate": 4.730333395082877e-06, + "loss": 2.2076, + "step": 16285 + }, + { + "epoch": 0.8737124463519313, + "grad_norm": 0.451171875, + "learning_rate": 4.7302941453607595e-06, + "loss": 2.1566, + "step": 16286 + }, + { + "epoch": 0.8737660944206008, + "grad_norm": 0.62109375, + "learning_rate": 4.730254892945331e-06, + "loss": 2.2096, + "step": 16287 + }, + { + "epoch": 0.8738197424892704, + "grad_norm": 0.451171875, + "learning_rate": 4.730215637836637e-06, + "loss": 2.2807, + "step": 16288 + }, + { + "epoch": 0.8738733905579399, + "grad_norm": 0.482421875, + "learning_rate": 4.730176380034728e-06, + "loss": 2.4151, + "step": 16289 + }, + { + "epoch": 0.8739270386266095, + "grad_norm": 0.447265625, + "learning_rate": 4.730137119539648e-06, + "loss": 2.0483, + "step": 16290 + }, + { + "epoch": 0.873980686695279, + "grad_norm": 0.515625, + "learning_rate": 4.730097856351447e-06, + "loss": 2.2633, + "step": 16291 + }, + { + "epoch": 0.8740343347639485, + "grad_norm": 0.5, + "learning_rate": 4.730058590470172e-06, + "loss": 1.8068, + "step": 16292 + }, + { + "epoch": 0.874087982832618, + "grad_norm": 0.515625, + "learning_rate": 4.730019321895869e-06, + "loss": 2.3338, + "step": 16293 + }, + { + "epoch": 0.8741416309012876, + "grad_norm": 0.4375, + "learning_rate": 4.729980050628586e-06, + "loss": 2.2128, + "step": 16294 + }, + { + "epoch": 0.8741952789699571, + "grad_norm": 0.58984375, + "learning_rate": 4.729940776668372e-06, + "loss": 2.2581, + "step": 16295 + }, + { + "epoch": 0.8742489270386266, + "grad_norm": 0.78515625, + "learning_rate": 4.729901500015271e-06, + "loss": 2.31, + "step": 16296 + }, + { + "epoch": 0.8743025751072961, + "grad_norm": 0.41015625, + "learning_rate": 4.729862220669335e-06, + "loss": 2.2249, + "step": 16297 + }, + { + "epoch": 0.8743562231759656, + "grad_norm": 0.51953125, + "learning_rate": 4.729822938630608e-06, + "loss": 2.2727, + "step": 16298 + }, + { + "epoch": 0.8744098712446352, + "grad_norm": 0.5546875, + "learning_rate": 4.729783653899138e-06, + "loss": 2.4156, + "step": 16299 + }, + { + "epoch": 0.8744635193133047, + "grad_norm": 0.68359375, + "learning_rate": 4.729744366474974e-06, + "loss": 2.3764, + "step": 16300 + }, + { + "epoch": 0.8745171673819743, + "grad_norm": 0.46875, + "learning_rate": 4.729705076358161e-06, + "loss": 2.0917, + "step": 16301 + }, + { + "epoch": 0.8745708154506437, + "grad_norm": 0.458984375, + "learning_rate": 4.729665783548749e-06, + "loss": 2.2785, + "step": 16302 + }, + { + "epoch": 0.8746244635193133, + "grad_norm": 0.4453125, + "learning_rate": 4.729626488046783e-06, + "loss": 2.0628, + "step": 16303 + }, + { + "epoch": 0.8746781115879828, + "grad_norm": 0.515625, + "learning_rate": 4.729587189852313e-06, + "loss": 2.2636, + "step": 16304 + }, + { + "epoch": 0.8747317596566524, + "grad_norm": 0.44140625, + "learning_rate": 4.729547888965385e-06, + "loss": 2.2525, + "step": 16305 + }, + { + "epoch": 0.8747854077253219, + "grad_norm": 0.482421875, + "learning_rate": 4.729508585386047e-06, + "loss": 2.2571, + "step": 16306 + }, + { + "epoch": 0.8748390557939915, + "grad_norm": 0.63671875, + "learning_rate": 4.729469279114345e-06, + "loss": 2.0406, + "step": 16307 + }, + { + "epoch": 0.8748927038626609, + "grad_norm": 0.58984375, + "learning_rate": 4.729429970150328e-06, + "loss": 2.3148, + "step": 16308 + }, + { + "epoch": 0.8749463519313305, + "grad_norm": 0.515625, + "learning_rate": 4.729390658494042e-06, + "loss": 1.8872, + "step": 16309 + }, + { + "epoch": 0.875, + "grad_norm": 0.423828125, + "learning_rate": 4.729351344145536e-06, + "loss": 2.2012, + "step": 16310 + }, + { + "epoch": 0.8750536480686695, + "grad_norm": 0.462890625, + "learning_rate": 4.729312027104858e-06, + "loss": 2.1605, + "step": 16311 + }, + { + "epoch": 0.8751072961373391, + "grad_norm": 0.5234375, + "learning_rate": 4.729272707372053e-06, + "loss": 2.3841, + "step": 16312 + }, + { + "epoch": 0.8751609442060085, + "grad_norm": 0.42578125, + "learning_rate": 4.729233384947171e-06, + "loss": 2.3695, + "step": 16313 + }, + { + "epoch": 0.8752145922746781, + "grad_norm": 0.515625, + "learning_rate": 4.7291940598302584e-06, + "loss": 2.4318, + "step": 16314 + }, + { + "epoch": 0.8752682403433476, + "grad_norm": 0.50390625, + "learning_rate": 4.729154732021362e-06, + "loss": 2.3995, + "step": 16315 + }, + { + "epoch": 0.8753218884120172, + "grad_norm": 0.470703125, + "learning_rate": 4.729115401520531e-06, + "loss": 2.3581, + "step": 16316 + }, + { + "epoch": 0.8753755364806867, + "grad_norm": 0.875, + "learning_rate": 4.729076068327811e-06, + "loss": 2.5068, + "step": 16317 + }, + { + "epoch": 0.8754291845493563, + "grad_norm": 0.390625, + "learning_rate": 4.7290367324432504e-06, + "loss": 2.0655, + "step": 16318 + }, + { + "epoch": 0.8754828326180257, + "grad_norm": 0.7265625, + "learning_rate": 4.728997393866898e-06, + "loss": 2.5516, + "step": 16319 + }, + { + "epoch": 0.8755364806866953, + "grad_norm": 0.54296875, + "learning_rate": 4.728958052598798e-06, + "loss": 2.119, + "step": 16320 + }, + { + "epoch": 0.8755901287553648, + "grad_norm": 0.462890625, + "learning_rate": 4.728918708639001e-06, + "loss": 2.07, + "step": 16321 + }, + { + "epoch": 0.8756437768240344, + "grad_norm": 0.412109375, + "learning_rate": 4.728879361987554e-06, + "loss": 2.3478, + "step": 16322 + }, + { + "epoch": 0.8756974248927039, + "grad_norm": 0.4921875, + "learning_rate": 4.7288400126445025e-06, + "loss": 2.4422, + "step": 16323 + }, + { + "epoch": 0.8757510729613734, + "grad_norm": 0.431640625, + "learning_rate": 4.728800660609896e-06, + "loss": 2.322, + "step": 16324 + }, + { + "epoch": 0.8758047210300429, + "grad_norm": 0.47265625, + "learning_rate": 4.7287613058837815e-06, + "loss": 2.398, + "step": 16325 + }, + { + "epoch": 0.8758583690987124, + "grad_norm": 0.5078125, + "learning_rate": 4.728721948466207e-06, + "loss": 2.4751, + "step": 16326 + }, + { + "epoch": 0.875912017167382, + "grad_norm": 0.53125, + "learning_rate": 4.7286825883572184e-06, + "loss": 2.2814, + "step": 16327 + }, + { + "epoch": 0.8759656652360515, + "grad_norm": 0.68359375, + "learning_rate": 4.728643225556865e-06, + "loss": 2.2777, + "step": 16328 + }, + { + "epoch": 0.876019313304721, + "grad_norm": 0.48828125, + "learning_rate": 4.728603860065194e-06, + "loss": 2.34, + "step": 16329 + }, + { + "epoch": 0.8760729613733905, + "grad_norm": 0.55078125, + "learning_rate": 4.7285644918822525e-06, + "loss": 1.3401, + "step": 16330 + }, + { + "epoch": 0.8761266094420601, + "grad_norm": 0.44140625, + "learning_rate": 4.728525121008089e-06, + "loss": 2.2227, + "step": 16331 + }, + { + "epoch": 0.8761802575107296, + "grad_norm": 0.40234375, + "learning_rate": 4.728485747442748e-06, + "loss": 2.3253, + "step": 16332 + }, + { + "epoch": 0.8762339055793992, + "grad_norm": 1.3984375, + "learning_rate": 4.728446371186281e-06, + "loss": 2.4403, + "step": 16333 + }, + { + "epoch": 0.8762875536480687, + "grad_norm": 0.55859375, + "learning_rate": 4.728406992238733e-06, + "loss": 2.247, + "step": 16334 + }, + { + "epoch": 0.8763412017167382, + "grad_norm": 0.53125, + "learning_rate": 4.728367610600153e-06, + "loss": 2.2502, + "step": 16335 + }, + { + "epoch": 0.8763948497854077, + "grad_norm": 0.44921875, + "learning_rate": 4.728328226270588e-06, + "loss": 2.3163, + "step": 16336 + }, + { + "epoch": 0.8764484978540773, + "grad_norm": 0.419921875, + "learning_rate": 4.728288839250085e-06, + "loss": 2.3977, + "step": 16337 + }, + { + "epoch": 0.8765021459227468, + "grad_norm": 0.42578125, + "learning_rate": 4.728249449538692e-06, + "loss": 2.2391, + "step": 16338 + }, + { + "epoch": 0.8765557939914163, + "grad_norm": 0.90234375, + "learning_rate": 4.7282100571364564e-06, + "loss": 2.3131, + "step": 16339 + }, + { + "epoch": 0.8766094420600858, + "grad_norm": 0.5390625, + "learning_rate": 4.728170662043426e-06, + "loss": 2.2965, + "step": 16340 + }, + { + "epoch": 0.8766630901287553, + "grad_norm": 0.41796875, + "learning_rate": 4.728131264259649e-06, + "loss": 2.4084, + "step": 16341 + }, + { + "epoch": 0.8767167381974249, + "grad_norm": 0.4921875, + "learning_rate": 4.728091863785173e-06, + "loss": 2.0385, + "step": 16342 + }, + { + "epoch": 0.8767703862660944, + "grad_norm": 0.50390625, + "learning_rate": 4.728052460620044e-06, + "loss": 2.4489, + "step": 16343 + }, + { + "epoch": 0.876824034334764, + "grad_norm": 0.400390625, + "learning_rate": 4.72801305476431e-06, + "loss": 2.1927, + "step": 16344 + }, + { + "epoch": 0.8768776824034334, + "grad_norm": 0.42578125, + "learning_rate": 4.727973646218019e-06, + "loss": 2.2975, + "step": 16345 + }, + { + "epoch": 0.876931330472103, + "grad_norm": 0.62890625, + "learning_rate": 4.7279342349812194e-06, + "loss": 1.4975, + "step": 16346 + }, + { + "epoch": 0.8769849785407725, + "grad_norm": 0.455078125, + "learning_rate": 4.727894821053958e-06, + "loss": 2.1954, + "step": 16347 + }, + { + "epoch": 0.8770386266094421, + "grad_norm": 0.458984375, + "learning_rate": 4.727855404436283e-06, + "loss": 2.2411, + "step": 16348 + }, + { + "epoch": 0.8770922746781116, + "grad_norm": 0.390625, + "learning_rate": 4.7278159851282405e-06, + "loss": 2.1311, + "step": 16349 + }, + { + "epoch": 0.8771459227467812, + "grad_norm": 0.462890625, + "learning_rate": 4.727776563129879e-06, + "loss": 2.1715, + "step": 16350 + }, + { + "epoch": 0.8771995708154506, + "grad_norm": 0.474609375, + "learning_rate": 4.727737138441247e-06, + "loss": 2.1835, + "step": 16351 + }, + { + "epoch": 0.8772532188841202, + "grad_norm": 0.48828125, + "learning_rate": 4.72769771106239e-06, + "loss": 2.3419, + "step": 16352 + }, + { + "epoch": 0.8773068669527897, + "grad_norm": 0.48046875, + "learning_rate": 4.727658280993358e-06, + "loss": 2.545, + "step": 16353 + }, + { + "epoch": 0.8773605150214592, + "grad_norm": 0.4296875, + "learning_rate": 4.727618848234198e-06, + "loss": 2.1419, + "step": 16354 + }, + { + "epoch": 0.8774141630901288, + "grad_norm": 0.458984375, + "learning_rate": 4.727579412784956e-06, + "loss": 2.3553, + "step": 16355 + }, + { + "epoch": 0.8774678111587982, + "grad_norm": 0.482421875, + "learning_rate": 4.727539974645681e-06, + "loss": 2.4087, + "step": 16356 + }, + { + "epoch": 0.8775214592274678, + "grad_norm": 0.427734375, + "learning_rate": 4.7275005338164205e-06, + "loss": 2.3149, + "step": 16357 + }, + { + "epoch": 0.8775751072961373, + "grad_norm": 0.60546875, + "learning_rate": 4.727461090297222e-06, + "loss": 2.3017, + "step": 16358 + }, + { + "epoch": 0.8776287553648069, + "grad_norm": 0.40625, + "learning_rate": 4.727421644088134e-06, + "loss": 1.8492, + "step": 16359 + }, + { + "epoch": 0.8776824034334764, + "grad_norm": 0.43359375, + "learning_rate": 4.7273821951892015e-06, + "loss": 2.211, + "step": 16360 + }, + { + "epoch": 0.877736051502146, + "grad_norm": 0.5078125, + "learning_rate": 4.7273427436004745e-06, + "loss": 1.6825, + "step": 16361 + }, + { + "epoch": 0.8777896995708154, + "grad_norm": 0.515625, + "learning_rate": 4.727303289322001e-06, + "loss": 2.37, + "step": 16362 + }, + { + "epoch": 0.877843347639485, + "grad_norm": 0.357421875, + "learning_rate": 4.727263832353827e-06, + "loss": 2.022, + "step": 16363 + }, + { + "epoch": 0.8778969957081545, + "grad_norm": 0.427734375, + "learning_rate": 4.727224372696001e-06, + "loss": 2.262, + "step": 16364 + }, + { + "epoch": 0.8779506437768241, + "grad_norm": 0.490234375, + "learning_rate": 4.727184910348571e-06, + "loss": 2.143, + "step": 16365 + }, + { + "epoch": 0.8780042918454936, + "grad_norm": 0.890625, + "learning_rate": 4.727145445311583e-06, + "loss": 2.3503, + "step": 16366 + }, + { + "epoch": 0.8780579399141631, + "grad_norm": 0.470703125, + "learning_rate": 4.727105977585087e-06, + "loss": 2.3659, + "step": 16367 + }, + { + "epoch": 0.8781115879828326, + "grad_norm": 0.462890625, + "learning_rate": 4.7270665071691285e-06, + "loss": 1.9458, + "step": 16368 + }, + { + "epoch": 0.8781652360515021, + "grad_norm": 0.54296875, + "learning_rate": 4.727027034063757e-06, + "loss": 2.2772, + "step": 16369 + }, + { + "epoch": 0.8782188841201717, + "grad_norm": 0.51171875, + "learning_rate": 4.726987558269018e-06, + "loss": 2.5537, + "step": 16370 + }, + { + "epoch": 0.8782725321888412, + "grad_norm": 0.494140625, + "learning_rate": 4.726948079784962e-06, + "loss": 2.3826, + "step": 16371 + }, + { + "epoch": 0.8783261802575107, + "grad_norm": 0.458984375, + "learning_rate": 4.7269085986116345e-06, + "loss": 2.3189, + "step": 16372 + }, + { + "epoch": 0.8783798283261802, + "grad_norm": 0.546875, + "learning_rate": 4.726869114749084e-06, + "loss": 2.4812, + "step": 16373 + }, + { + "epoch": 0.8784334763948498, + "grad_norm": 0.421875, + "learning_rate": 4.7268296281973575e-06, + "loss": 2.2198, + "step": 16374 + }, + { + "epoch": 0.8784871244635193, + "grad_norm": 0.455078125, + "learning_rate": 4.726790138956503e-06, + "loss": 2.2894, + "step": 16375 + }, + { + "epoch": 0.8785407725321889, + "grad_norm": 0.59765625, + "learning_rate": 4.726750647026569e-06, + "loss": 1.9866, + "step": 16376 + }, + { + "epoch": 0.8785944206008584, + "grad_norm": 0.3671875, + "learning_rate": 4.726711152407602e-06, + "loss": 2.1283, + "step": 16377 + }, + { + "epoch": 0.8786480686695279, + "grad_norm": 0.47265625, + "learning_rate": 4.726671655099652e-06, + "loss": 2.3605, + "step": 16378 + }, + { + "epoch": 0.8787017167381974, + "grad_norm": 0.44921875, + "learning_rate": 4.726632155102763e-06, + "loss": 2.3469, + "step": 16379 + }, + { + "epoch": 0.878755364806867, + "grad_norm": 0.455078125, + "learning_rate": 4.726592652416986e-06, + "loss": 2.0517, + "step": 16380 + }, + { + "epoch": 0.8788090128755365, + "grad_norm": 0.5390625, + "learning_rate": 4.726553147042366e-06, + "loss": 2.2361, + "step": 16381 + }, + { + "epoch": 0.878862660944206, + "grad_norm": 0.4296875, + "learning_rate": 4.726513638978953e-06, + "loss": 2.1154, + "step": 16382 + }, + { + "epoch": 0.8789163090128755, + "grad_norm": 0.490234375, + "learning_rate": 4.726474128226794e-06, + "loss": 2.5161, + "step": 16383 + }, + { + "epoch": 0.878969957081545, + "grad_norm": 0.40625, + "learning_rate": 4.726434614785936e-06, + "loss": 2.282, + "step": 16384 + }, + { + "epoch": 0.8790236051502146, + "grad_norm": 0.5078125, + "learning_rate": 4.726395098656427e-06, + "loss": 2.6406, + "step": 16385 + }, + { + "epoch": 0.8790772532188841, + "grad_norm": 0.4453125, + "learning_rate": 4.726355579838315e-06, + "loss": 2.2109, + "step": 16386 + }, + { + "epoch": 0.8791309012875537, + "grad_norm": 0.4609375, + "learning_rate": 4.726316058331648e-06, + "loss": 2.3403, + "step": 16387 + }, + { + "epoch": 0.8791845493562231, + "grad_norm": 0.44140625, + "learning_rate": 4.726276534136474e-06, + "loss": 2.1609, + "step": 16388 + }, + { + "epoch": 0.8792381974248927, + "grad_norm": 0.427734375, + "learning_rate": 4.726237007252839e-06, + "loss": 2.1859, + "step": 16389 + }, + { + "epoch": 0.8792918454935622, + "grad_norm": 0.51171875, + "learning_rate": 4.726197477680792e-06, + "loss": 2.291, + "step": 16390 + }, + { + "epoch": 0.8793454935622318, + "grad_norm": 0.384765625, + "learning_rate": 4.726157945420381e-06, + "loss": 2.1448, + "step": 16391 + }, + { + "epoch": 0.8793991416309013, + "grad_norm": 0.6796875, + "learning_rate": 4.726118410471653e-06, + "loss": 2.1147, + "step": 16392 + }, + { + "epoch": 0.8794527896995709, + "grad_norm": 2.734375, + "learning_rate": 4.726078872834656e-06, + "loss": 2.246, + "step": 16393 + }, + { + "epoch": 0.8795064377682403, + "grad_norm": 0.4453125, + "learning_rate": 4.726039332509439e-06, + "loss": 2.4351, + "step": 16394 + }, + { + "epoch": 0.8795600858369099, + "grad_norm": 0.51953125, + "learning_rate": 4.725999789496047e-06, + "loss": 2.4841, + "step": 16395 + }, + { + "epoch": 0.8796137339055794, + "grad_norm": 0.46875, + "learning_rate": 4.7259602437945305e-06, + "loss": 2.2046, + "step": 16396 + }, + { + "epoch": 0.8796673819742489, + "grad_norm": 0.400390625, + "learning_rate": 4.725920695404935e-06, + "loss": 1.8241, + "step": 16397 + }, + { + "epoch": 0.8797210300429185, + "grad_norm": 0.515625, + "learning_rate": 4.72588114432731e-06, + "loss": 1.8696, + "step": 16398 + }, + { + "epoch": 0.8797746781115879, + "grad_norm": 0.5, + "learning_rate": 4.725841590561703e-06, + "loss": 2.5771, + "step": 16399 + }, + { + "epoch": 0.8798283261802575, + "grad_norm": 0.37890625, + "learning_rate": 4.72580203410816e-06, + "loss": 2.1634, + "step": 16400 + }, + { + "epoch": 0.879881974248927, + "grad_norm": 0.4609375, + "learning_rate": 4.725762474966731e-06, + "loss": 2.3388, + "step": 16401 + }, + { + "epoch": 0.8799356223175966, + "grad_norm": 0.3671875, + "learning_rate": 4.725722913137462e-06, + "loss": 2.2891, + "step": 16402 + }, + { + "epoch": 0.8799892703862661, + "grad_norm": 0.455078125, + "learning_rate": 4.725683348620404e-06, + "loss": 1.8716, + "step": 16403 + }, + { + "epoch": 0.8800429184549357, + "grad_norm": 0.81640625, + "learning_rate": 4.7256437814156e-06, + "loss": 2.0613, + "step": 16404 + }, + { + "epoch": 0.8800965665236051, + "grad_norm": 0.4296875, + "learning_rate": 4.7256042115231015e-06, + "loss": 2.2003, + "step": 16405 + }, + { + "epoch": 0.8801502145922747, + "grad_norm": 0.484375, + "learning_rate": 4.725564638942954e-06, + "loss": 2.19, + "step": 16406 + }, + { + "epoch": 0.8802038626609442, + "grad_norm": 0.45703125, + "learning_rate": 4.725525063675207e-06, + "loss": 2.393, + "step": 16407 + }, + { + "epoch": 0.8802575107296138, + "grad_norm": 0.45703125, + "learning_rate": 4.725485485719908e-06, + "loss": 2.3229, + "step": 16408 + }, + { + "epoch": 0.8803111587982833, + "grad_norm": 0.421875, + "learning_rate": 4.725445905077104e-06, + "loss": 2.1427, + "step": 16409 + }, + { + "epoch": 0.8803648068669528, + "grad_norm": 1.09375, + "learning_rate": 4.7254063217468426e-06, + "loss": 2.0658, + "step": 16410 + }, + { + "epoch": 0.8804184549356223, + "grad_norm": 0.443359375, + "learning_rate": 4.725366735729173e-06, + "loss": 2.19, + "step": 16411 + }, + { + "epoch": 0.8804721030042918, + "grad_norm": 0.48046875, + "learning_rate": 4.725327147024142e-06, + "loss": 2.4263, + "step": 16412 + }, + { + "epoch": 0.8805257510729614, + "grad_norm": 0.54296875, + "learning_rate": 4.725287555631797e-06, + "loss": 2.321, + "step": 16413 + }, + { + "epoch": 0.8805793991416309, + "grad_norm": 0.71484375, + "learning_rate": 4.7252479615521875e-06, + "loss": 2.1463, + "step": 16414 + }, + { + "epoch": 0.8806330472103004, + "grad_norm": 0.5234375, + "learning_rate": 4.725208364785359e-06, + "loss": 2.3944, + "step": 16415 + }, + { + "epoch": 0.8806866952789699, + "grad_norm": 0.458984375, + "learning_rate": 4.725168765331362e-06, + "loss": 2.3283, + "step": 16416 + }, + { + "epoch": 0.8807403433476395, + "grad_norm": 0.5703125, + "learning_rate": 4.725129163190241e-06, + "loss": 1.4441, + "step": 16417 + }, + { + "epoch": 0.880793991416309, + "grad_norm": 0.3671875, + "learning_rate": 4.725089558362047e-06, + "loss": 1.9807, + "step": 16418 + }, + { + "epoch": 0.8808476394849786, + "grad_norm": 0.57421875, + "learning_rate": 4.725049950846826e-06, + "loss": 2.2144, + "step": 16419 + }, + { + "epoch": 0.880901287553648, + "grad_norm": 0.5078125, + "learning_rate": 4.725010340644627e-06, + "loss": 2.0861, + "step": 16420 + }, + { + "epoch": 0.8809549356223176, + "grad_norm": 1.7734375, + "learning_rate": 4.7249707277554965e-06, + "loss": 2.5426, + "step": 16421 + }, + { + "epoch": 0.8810085836909871, + "grad_norm": 0.46484375, + "learning_rate": 4.7249311121794825e-06, + "loss": 2.0962, + "step": 16422 + }, + { + "epoch": 0.8810622317596567, + "grad_norm": 0.51953125, + "learning_rate": 4.724891493916634e-06, + "loss": 2.5349, + "step": 16423 + }, + { + "epoch": 0.8811158798283262, + "grad_norm": 0.48828125, + "learning_rate": 4.724851872966999e-06, + "loss": 2.4559, + "step": 16424 + }, + { + "epoch": 0.8811695278969958, + "grad_norm": 0.4453125, + "learning_rate": 4.724812249330624e-06, + "loss": 2.2682, + "step": 16425 + }, + { + "epoch": 0.8812231759656652, + "grad_norm": 0.64453125, + "learning_rate": 4.724772623007557e-06, + "loss": 2.2705, + "step": 16426 + }, + { + "epoch": 0.8812768240343347, + "grad_norm": 0.4453125, + "learning_rate": 4.724732993997846e-06, + "loss": 2.0501, + "step": 16427 + }, + { + "epoch": 0.8813304721030043, + "grad_norm": 0.5546875, + "learning_rate": 4.7246933623015405e-06, + "loss": 2.1777, + "step": 16428 + }, + { + "epoch": 0.8813841201716738, + "grad_norm": 0.90234375, + "learning_rate": 4.724653727918685e-06, + "loss": 2.5921, + "step": 16429 + }, + { + "epoch": 0.8814377682403434, + "grad_norm": 0.447265625, + "learning_rate": 4.724614090849331e-06, + "loss": 2.2988, + "step": 16430 + }, + { + "epoch": 0.8814914163090128, + "grad_norm": 0.435546875, + "learning_rate": 4.724574451093524e-06, + "loss": 2.2949, + "step": 16431 + }, + { + "epoch": 0.8815450643776824, + "grad_norm": 1.6171875, + "learning_rate": 4.724534808651312e-06, + "loss": 2.4338, + "step": 16432 + }, + { + "epoch": 0.8815987124463519, + "grad_norm": 0.494140625, + "learning_rate": 4.7244951635227444e-06, + "loss": 2.3398, + "step": 16433 + }, + { + "epoch": 0.8816523605150215, + "grad_norm": 0.44140625, + "learning_rate": 4.724455515707868e-06, + "loss": 2.3864, + "step": 16434 + }, + { + "epoch": 0.881706008583691, + "grad_norm": 0.4296875, + "learning_rate": 4.72441586520673e-06, + "loss": 2.4605, + "step": 16435 + }, + { + "epoch": 0.8817596566523606, + "grad_norm": 0.474609375, + "learning_rate": 4.724376212019379e-06, + "loss": 2.3215, + "step": 16436 + }, + { + "epoch": 0.88181330472103, + "grad_norm": 0.494140625, + "learning_rate": 4.724336556145864e-06, + "loss": 2.1774, + "step": 16437 + }, + { + "epoch": 0.8818669527896996, + "grad_norm": 0.55078125, + "learning_rate": 4.724296897586231e-06, + "loss": 2.0653, + "step": 16438 + }, + { + "epoch": 0.8819206008583691, + "grad_norm": 0.50390625, + "learning_rate": 4.724257236340529e-06, + "loss": 2.1144, + "step": 16439 + }, + { + "epoch": 0.8819742489270386, + "grad_norm": 0.478515625, + "learning_rate": 4.7242175724088054e-06, + "loss": 2.5431, + "step": 16440 + }, + { + "epoch": 0.8820278969957082, + "grad_norm": 0.46484375, + "learning_rate": 4.7241779057911095e-06, + "loss": 2.313, + "step": 16441 + }, + { + "epoch": 0.8820815450643776, + "grad_norm": 0.478515625, + "learning_rate": 4.7241382364874865e-06, + "loss": 1.8721, + "step": 16442 + }, + { + "epoch": 0.8821351931330472, + "grad_norm": 0.458984375, + "learning_rate": 4.724098564497987e-06, + "loss": 2.2043, + "step": 16443 + }, + { + "epoch": 0.8821888412017167, + "grad_norm": 0.419921875, + "learning_rate": 4.724058889822657e-06, + "loss": 2.157, + "step": 16444 + }, + { + "epoch": 0.8822424892703863, + "grad_norm": 0.6015625, + "learning_rate": 4.7240192124615455e-06, + "loss": 2.2684, + "step": 16445 + }, + { + "epoch": 0.8822961373390558, + "grad_norm": 0.46484375, + "learning_rate": 4.7239795324147e-06, + "loss": 2.263, + "step": 16446 + }, + { + "epoch": 0.8823497854077254, + "grad_norm": 0.53515625, + "learning_rate": 4.723939849682169e-06, + "loss": 2.3136, + "step": 16447 + }, + { + "epoch": 0.8824034334763948, + "grad_norm": 0.625, + "learning_rate": 4.723900164263999e-06, + "loss": 2.5362, + "step": 16448 + }, + { + "epoch": 0.8824570815450644, + "grad_norm": 0.44921875, + "learning_rate": 4.723860476160238e-06, + "loss": 1.9976, + "step": 16449 + }, + { + "epoch": 0.8825107296137339, + "grad_norm": 0.45703125, + "learning_rate": 4.723820785370937e-06, + "loss": 2.3853, + "step": 16450 + }, + { + "epoch": 0.8825643776824035, + "grad_norm": 0.41796875, + "learning_rate": 4.72378109189614e-06, + "loss": 2.2189, + "step": 16451 + }, + { + "epoch": 0.882618025751073, + "grad_norm": 0.447265625, + "learning_rate": 4.723741395735898e-06, + "loss": 2.2349, + "step": 16452 + }, + { + "epoch": 0.8826716738197425, + "grad_norm": 0.56640625, + "learning_rate": 4.723701696890256e-06, + "loss": 2.4318, + "step": 16453 + }, + { + "epoch": 0.882725321888412, + "grad_norm": 0.451171875, + "learning_rate": 4.7236619953592644e-06, + "loss": 1.8156, + "step": 16454 + }, + { + "epoch": 0.8827789699570815, + "grad_norm": 0.400390625, + "learning_rate": 4.7236222911429695e-06, + "loss": 2.1871, + "step": 16455 + }, + { + "epoch": 0.8828326180257511, + "grad_norm": 0.486328125, + "learning_rate": 4.723582584241422e-06, + "loss": 2.3214, + "step": 16456 + }, + { + "epoch": 0.8828862660944206, + "grad_norm": 0.462890625, + "learning_rate": 4.723542874654665e-06, + "loss": 2.4489, + "step": 16457 + }, + { + "epoch": 0.8829399141630901, + "grad_norm": 0.490234375, + "learning_rate": 4.723503162382751e-06, + "loss": 2.3063, + "step": 16458 + }, + { + "epoch": 0.8829935622317596, + "grad_norm": 0.5078125, + "learning_rate": 4.723463447425725e-06, + "loss": 2.0961, + "step": 16459 + }, + { + "epoch": 0.8830472103004292, + "grad_norm": 0.443359375, + "learning_rate": 4.723423729783638e-06, + "loss": 2.2652, + "step": 16460 + }, + { + "epoch": 0.8831008583690987, + "grad_norm": 0.5078125, + "learning_rate": 4.723384009456535e-06, + "loss": 2.339, + "step": 16461 + }, + { + "epoch": 0.8831545064377683, + "grad_norm": 0.3671875, + "learning_rate": 4.723344286444465e-06, + "loss": 2.2367, + "step": 16462 + }, + { + "epoch": 0.8832081545064377, + "grad_norm": 0.451171875, + "learning_rate": 4.7233045607474775e-06, + "loss": 2.3269, + "step": 16463 + }, + { + "epoch": 0.8832618025751073, + "grad_norm": 0.451171875, + "learning_rate": 4.723264832365618e-06, + "loss": 2.39, + "step": 16464 + }, + { + "epoch": 0.8833154506437768, + "grad_norm": 0.51171875, + "learning_rate": 4.723225101298936e-06, + "loss": 2.5264, + "step": 16465 + }, + { + "epoch": 0.8833690987124464, + "grad_norm": 0.396484375, + "learning_rate": 4.723185367547479e-06, + "loss": 2.3668, + "step": 16466 + }, + { + "epoch": 0.8834227467811159, + "grad_norm": 0.4375, + "learning_rate": 4.723145631111295e-06, + "loss": 2.3752, + "step": 16467 + }, + { + "epoch": 0.8834763948497855, + "grad_norm": 0.41796875, + "learning_rate": 4.723105891990431e-06, + "loss": 2.176, + "step": 16468 + }, + { + "epoch": 0.8835300429184549, + "grad_norm": 0.455078125, + "learning_rate": 4.723066150184937e-06, + "loss": 2.2862, + "step": 16469 + }, + { + "epoch": 0.8835836909871244, + "grad_norm": 0.4453125, + "learning_rate": 4.7230264056948596e-06, + "loss": 2.2568, + "step": 16470 + }, + { + "epoch": 0.883637339055794, + "grad_norm": 0.51953125, + "learning_rate": 4.722986658520248e-06, + "loss": 2.4269, + "step": 16471 + }, + { + "epoch": 0.8836909871244635, + "grad_norm": 0.470703125, + "learning_rate": 4.722946908661148e-06, + "loss": 2.3, + "step": 16472 + }, + { + "epoch": 0.8837446351931331, + "grad_norm": 0.66015625, + "learning_rate": 4.72290715611761e-06, + "loss": 1.976, + "step": 16473 + }, + { + "epoch": 0.8837982832618025, + "grad_norm": 0.478515625, + "learning_rate": 4.722867400889681e-06, + "loss": 2.3009, + "step": 16474 + }, + { + "epoch": 0.8838519313304721, + "grad_norm": 0.65234375, + "learning_rate": 4.722827642977408e-06, + "loss": 2.3304, + "step": 16475 + }, + { + "epoch": 0.8839055793991416, + "grad_norm": 0.46484375, + "learning_rate": 4.722787882380841e-06, + "loss": 2.1485, + "step": 16476 + }, + { + "epoch": 0.8839592274678112, + "grad_norm": 0.404296875, + "learning_rate": 4.722748119100027e-06, + "loss": 2.0086, + "step": 16477 + }, + { + "epoch": 0.8840128755364807, + "grad_norm": 0.51171875, + "learning_rate": 4.7227083531350135e-06, + "loss": 2.3367, + "step": 16478 + }, + { + "epoch": 0.8840665236051503, + "grad_norm": 0.416015625, + "learning_rate": 4.72266858448585e-06, + "loss": 2.5173, + "step": 16479 + }, + { + "epoch": 0.8841201716738197, + "grad_norm": 0.51171875, + "learning_rate": 4.7226288131525825e-06, + "loss": 2.3073, + "step": 16480 + }, + { + "epoch": 0.8841738197424893, + "grad_norm": 0.63671875, + "learning_rate": 4.72258903913526e-06, + "loss": 2.5121, + "step": 16481 + }, + { + "epoch": 0.8842274678111588, + "grad_norm": 0.4609375, + "learning_rate": 4.722549262433931e-06, + "loss": 2.426, + "step": 16482 + }, + { + "epoch": 0.8842811158798283, + "grad_norm": 0.46484375, + "learning_rate": 4.722509483048644e-06, + "loss": 2.4662, + "step": 16483 + }, + { + "epoch": 0.8843347639484979, + "grad_norm": 0.546875, + "learning_rate": 4.722469700979445e-06, + "loss": 1.991, + "step": 16484 + }, + { + "epoch": 0.8843884120171673, + "grad_norm": 0.41015625, + "learning_rate": 4.722429916226384e-06, + "loss": 2.0598, + "step": 16485 + }, + { + "epoch": 0.8844420600858369, + "grad_norm": 0.5703125, + "learning_rate": 4.722390128789508e-06, + "loss": 2.4333, + "step": 16486 + }, + { + "epoch": 0.8844957081545064, + "grad_norm": 0.53515625, + "learning_rate": 4.7223503386688654e-06, + "loss": 2.2312, + "step": 16487 + }, + { + "epoch": 0.884549356223176, + "grad_norm": 1.046875, + "learning_rate": 4.722310545864505e-06, + "loss": 2.3265, + "step": 16488 + }, + { + "epoch": 0.8846030042918455, + "grad_norm": 0.482421875, + "learning_rate": 4.722270750376473e-06, + "loss": 2.3887, + "step": 16489 + }, + { + "epoch": 0.884656652360515, + "grad_norm": 0.53515625, + "learning_rate": 4.722230952204818e-06, + "loss": 2.2348, + "step": 16490 + }, + { + "epoch": 0.8847103004291845, + "grad_norm": 0.3515625, + "learning_rate": 4.72219115134959e-06, + "loss": 2.0265, + "step": 16491 + }, + { + "epoch": 0.8847639484978541, + "grad_norm": 0.462890625, + "learning_rate": 4.722151347810835e-06, + "loss": 2.1039, + "step": 16492 + }, + { + "epoch": 0.8848175965665236, + "grad_norm": 0.5625, + "learning_rate": 4.722111541588602e-06, + "loss": 2.2308, + "step": 16493 + }, + { + "epoch": 0.8848712446351932, + "grad_norm": 0.482421875, + "learning_rate": 4.722071732682939e-06, + "loss": 2.2876, + "step": 16494 + }, + { + "epoch": 0.8849248927038627, + "grad_norm": 0.43359375, + "learning_rate": 4.722031921093893e-06, + "loss": 2.3647, + "step": 16495 + }, + { + "epoch": 0.8849785407725322, + "grad_norm": 0.47265625, + "learning_rate": 4.721992106821513e-06, + "loss": 2.4094, + "step": 16496 + }, + { + "epoch": 0.8850321888412017, + "grad_norm": 0.48046875, + "learning_rate": 4.721952289865848e-06, + "loss": 2.2452, + "step": 16497 + }, + { + "epoch": 0.8850858369098712, + "grad_norm": 0.4140625, + "learning_rate": 4.721912470226944e-06, + "loss": 2.3343, + "step": 16498 + }, + { + "epoch": 0.8851394849785408, + "grad_norm": 0.408203125, + "learning_rate": 4.721872647904851e-06, + "loss": 2.4394, + "step": 16499 + }, + { + "epoch": 0.8851931330472103, + "grad_norm": 0.470703125, + "learning_rate": 4.721832822899616e-06, + "loss": 2.4101, + "step": 16500 + }, + { + "epoch": 0.8852467811158798, + "grad_norm": 0.451171875, + "learning_rate": 4.721792995211287e-06, + "loss": 2.3371, + "step": 16501 + }, + { + "epoch": 0.8853004291845493, + "grad_norm": 0.57421875, + "learning_rate": 4.721753164839912e-06, + "loss": 2.3241, + "step": 16502 + }, + { + "epoch": 0.8853540772532189, + "grad_norm": 0.373046875, + "learning_rate": 4.721713331785541e-06, + "loss": 1.8056, + "step": 16503 + }, + { + "epoch": 0.8854077253218884, + "grad_norm": 0.462890625, + "learning_rate": 4.7216734960482195e-06, + "loss": 2.3391, + "step": 16504 + }, + { + "epoch": 0.885461373390558, + "grad_norm": 0.486328125, + "learning_rate": 4.721633657627997e-06, + "loss": 2.4016, + "step": 16505 + }, + { + "epoch": 0.8855150214592274, + "grad_norm": 0.5234375, + "learning_rate": 4.721593816524922e-06, + "loss": 2.3436, + "step": 16506 + }, + { + "epoch": 0.885568669527897, + "grad_norm": 0.486328125, + "learning_rate": 4.7215539727390415e-06, + "loss": 2.2552, + "step": 16507 + }, + { + "epoch": 0.8856223175965665, + "grad_norm": 0.458984375, + "learning_rate": 4.721514126270404e-06, + "loss": 1.7141, + "step": 16508 + }, + { + "epoch": 0.8856759656652361, + "grad_norm": 0.609375, + "learning_rate": 4.7214742771190575e-06, + "loss": 2.4842, + "step": 16509 + }, + { + "epoch": 0.8857296137339056, + "grad_norm": 0.5390625, + "learning_rate": 4.721434425285051e-06, + "loss": 2.2346, + "step": 16510 + }, + { + "epoch": 0.8857832618025752, + "grad_norm": 0.44921875, + "learning_rate": 4.7213945707684315e-06, + "loss": 2.0905, + "step": 16511 + }, + { + "epoch": 0.8858369098712446, + "grad_norm": 0.431640625, + "learning_rate": 4.721354713569247e-06, + "loss": 1.8545, + "step": 16512 + }, + { + "epoch": 0.8858905579399141, + "grad_norm": 0.419921875, + "learning_rate": 4.721314853687547e-06, + "loss": 1.9777, + "step": 16513 + }, + { + "epoch": 0.8859442060085837, + "grad_norm": 0.466796875, + "learning_rate": 4.721274991123379e-06, + "loss": 2.3883, + "step": 16514 + }, + { + "epoch": 0.8859978540772532, + "grad_norm": 0.5390625, + "learning_rate": 4.721235125876791e-06, + "loss": 2.206, + "step": 16515 + }, + { + "epoch": 0.8860515021459228, + "grad_norm": 0.54296875, + "learning_rate": 4.7211952579478306e-06, + "loss": 2.2918, + "step": 16516 + }, + { + "epoch": 0.8861051502145922, + "grad_norm": 0.578125, + "learning_rate": 4.721155387336546e-06, + "loss": 2.3035, + "step": 16517 + }, + { + "epoch": 0.8861587982832618, + "grad_norm": 0.76171875, + "learning_rate": 4.7211155140429875e-06, + "loss": 2.6304, + "step": 16518 + }, + { + "epoch": 0.8862124463519313, + "grad_norm": 0.423828125, + "learning_rate": 4.7210756380672006e-06, + "loss": 2.3696, + "step": 16519 + }, + { + "epoch": 0.8862660944206009, + "grad_norm": 0.5078125, + "learning_rate": 4.721035759409235e-06, + "loss": 2.241, + "step": 16520 + }, + { + "epoch": 0.8863197424892704, + "grad_norm": 0.484375, + "learning_rate": 4.720995878069138e-06, + "loss": 2.3996, + "step": 16521 + }, + { + "epoch": 0.88637339055794, + "grad_norm": 0.515625, + "learning_rate": 4.720955994046957e-06, + "loss": 2.4835, + "step": 16522 + }, + { + "epoch": 0.8864270386266094, + "grad_norm": 0.6796875, + "learning_rate": 4.720916107342743e-06, + "loss": 2.4694, + "step": 16523 + }, + { + "epoch": 0.886480686695279, + "grad_norm": 0.455078125, + "learning_rate": 4.720876217956541e-06, + "loss": 2.2357, + "step": 16524 + }, + { + "epoch": 0.8865343347639485, + "grad_norm": 0.4921875, + "learning_rate": 4.720836325888401e-06, + "loss": 2.2049, + "step": 16525 + }, + { + "epoch": 0.886587982832618, + "grad_norm": 0.578125, + "learning_rate": 4.7207964311383705e-06, + "loss": 2.2432, + "step": 16526 + }, + { + "epoch": 0.8866416309012876, + "grad_norm": 0.62890625, + "learning_rate": 4.720756533706499e-06, + "loss": 2.293, + "step": 16527 + }, + { + "epoch": 0.886695278969957, + "grad_norm": 0.455078125, + "learning_rate": 4.7207166335928325e-06, + "loss": 2.3337, + "step": 16528 + }, + { + "epoch": 0.8867489270386266, + "grad_norm": 0.453125, + "learning_rate": 4.72067673079742e-06, + "loss": 2.2507, + "step": 16529 + }, + { + "epoch": 0.8868025751072961, + "grad_norm": 0.48046875, + "learning_rate": 4.720636825320311e-06, + "loss": 2.3507, + "step": 16530 + }, + { + "epoch": 0.8868562231759657, + "grad_norm": 0.482421875, + "learning_rate": 4.720596917161552e-06, + "loss": 2.4114, + "step": 16531 + }, + { + "epoch": 0.8869098712446352, + "grad_norm": 0.48828125, + "learning_rate": 4.720557006321192e-06, + "loss": 2.3692, + "step": 16532 + }, + { + "epoch": 0.8869635193133047, + "grad_norm": 1.21875, + "learning_rate": 4.720517092799279e-06, + "loss": 2.1435, + "step": 16533 + }, + { + "epoch": 0.8870171673819742, + "grad_norm": 0.73828125, + "learning_rate": 4.720477176595862e-06, + "loss": 2.2638, + "step": 16534 + }, + { + "epoch": 0.8870708154506438, + "grad_norm": 0.4140625, + "learning_rate": 4.720437257710987e-06, + "loss": 1.9433, + "step": 16535 + }, + { + "epoch": 0.8871244635193133, + "grad_norm": 0.4765625, + "learning_rate": 4.720397336144704e-06, + "loss": 2.2112, + "step": 16536 + }, + { + "epoch": 0.8871781115879829, + "grad_norm": 0.408203125, + "learning_rate": 4.720357411897062e-06, + "loss": 2.1033, + "step": 16537 + }, + { + "epoch": 0.8872317596566524, + "grad_norm": 0.365234375, + "learning_rate": 4.720317484968107e-06, + "loss": 2.0405, + "step": 16538 + }, + { + "epoch": 0.8872854077253219, + "grad_norm": 0.443359375, + "learning_rate": 4.720277555357889e-06, + "loss": 2.2456, + "step": 16539 + }, + { + "epoch": 0.8873390557939914, + "grad_norm": 0.439453125, + "learning_rate": 4.720237623066454e-06, + "loss": 2.3165, + "step": 16540 + }, + { + "epoch": 0.8873927038626609, + "grad_norm": 0.3984375, + "learning_rate": 4.720197688093852e-06, + "loss": 1.9759, + "step": 16541 + }, + { + "epoch": 0.8874463519313305, + "grad_norm": 0.5546875, + "learning_rate": 4.720157750440133e-06, + "loss": 2.3237, + "step": 16542 + }, + { + "epoch": 0.8875, + "grad_norm": 0.478515625, + "learning_rate": 4.720117810105341e-06, + "loss": 2.3415, + "step": 16543 + }, + { + "epoch": 0.8875536480686695, + "grad_norm": 0.439453125, + "learning_rate": 4.720077867089528e-06, + "loss": 2.2163, + "step": 16544 + }, + { + "epoch": 0.887607296137339, + "grad_norm": 0.482421875, + "learning_rate": 4.72003792139274e-06, + "loss": 2.2476, + "step": 16545 + }, + { + "epoch": 0.8876609442060086, + "grad_norm": 0.486328125, + "learning_rate": 4.719997973015026e-06, + "loss": 2.3072, + "step": 16546 + }, + { + "epoch": 0.8877145922746781, + "grad_norm": 0.490234375, + "learning_rate": 4.719958021956433e-06, + "loss": 2.4217, + "step": 16547 + }, + { + "epoch": 0.8877682403433477, + "grad_norm": 0.4765625, + "learning_rate": 4.719918068217012e-06, + "loss": 2.5857, + "step": 16548 + }, + { + "epoch": 0.8878218884120171, + "grad_norm": 0.52734375, + "learning_rate": 4.719878111796809e-06, + "loss": 2.3105, + "step": 16549 + }, + { + "epoch": 0.8878755364806867, + "grad_norm": 0.421875, + "learning_rate": 4.719838152695873e-06, + "loss": 2.1671, + "step": 16550 + }, + { + "epoch": 0.8879291845493562, + "grad_norm": 0.44921875, + "learning_rate": 4.719798190914252e-06, + "loss": 2.4106, + "step": 16551 + }, + { + "epoch": 0.8879828326180258, + "grad_norm": 0.73046875, + "learning_rate": 4.7197582264519946e-06, + "loss": 2.3365, + "step": 16552 + }, + { + "epoch": 0.8880364806866953, + "grad_norm": 0.5078125, + "learning_rate": 4.7197182593091486e-06, + "loss": 2.006, + "step": 16553 + }, + { + "epoch": 0.8880901287553649, + "grad_norm": 0.44921875, + "learning_rate": 4.719678289485763e-06, + "loss": 2.5343, + "step": 16554 + }, + { + "epoch": 0.8881437768240343, + "grad_norm": 8.9375, + "learning_rate": 4.719638316981884e-06, + "loss": 2.2675, + "step": 16555 + }, + { + "epoch": 0.8881974248927038, + "grad_norm": 0.353515625, + "learning_rate": 4.719598341797563e-06, + "loss": 2.165, + "step": 16556 + }, + { + "epoch": 0.8882510729613734, + "grad_norm": 0.8515625, + "learning_rate": 4.719558363932845e-06, + "loss": 2.3101, + "step": 16557 + }, + { + "epoch": 0.8883047210300429, + "grad_norm": 0.671875, + "learning_rate": 4.7195183833877824e-06, + "loss": 2.1603, + "step": 16558 + }, + { + "epoch": 0.8883583690987125, + "grad_norm": 0.62890625, + "learning_rate": 4.71947840016242e-06, + "loss": 1.6322, + "step": 16559 + }, + { + "epoch": 0.8884120171673819, + "grad_norm": 0.482421875, + "learning_rate": 4.7194384142568076e-06, + "loss": 2.3531, + "step": 16560 + }, + { + "epoch": 0.8884656652360515, + "grad_norm": 0.5, + "learning_rate": 4.719398425670992e-06, + "loss": 2.3317, + "step": 16561 + }, + { + "epoch": 0.888519313304721, + "grad_norm": 0.484375, + "learning_rate": 4.719358434405024e-06, + "loss": 2.2959, + "step": 16562 + }, + { + "epoch": 0.8885729613733906, + "grad_norm": 0.51953125, + "learning_rate": 4.7193184404589485e-06, + "loss": 2.404, + "step": 16563 + }, + { + "epoch": 0.8886266094420601, + "grad_norm": 0.48828125, + "learning_rate": 4.719278443832817e-06, + "loss": 2.5441, + "step": 16564 + }, + { + "epoch": 0.8886802575107297, + "grad_norm": 0.60546875, + "learning_rate": 4.719238444526676e-06, + "loss": 2.2593, + "step": 16565 + }, + { + "epoch": 0.8887339055793991, + "grad_norm": 0.494140625, + "learning_rate": 4.719198442540575e-06, + "loss": 2.3379, + "step": 16566 + }, + { + "epoch": 0.8887875536480687, + "grad_norm": 0.62890625, + "learning_rate": 4.719158437874561e-06, + "loss": 2.2331, + "step": 16567 + }, + { + "epoch": 0.8888412017167382, + "grad_norm": 0.45703125, + "learning_rate": 4.719118430528684e-06, + "loss": 2.3725, + "step": 16568 + }, + { + "epoch": 0.8888948497854077, + "grad_norm": 0.46484375, + "learning_rate": 4.7190784205029905e-06, + "loss": 2.2896, + "step": 16569 + }, + { + "epoch": 0.8889484978540773, + "grad_norm": 0.55859375, + "learning_rate": 4.719038407797529e-06, + "loss": 1.4731, + "step": 16570 + }, + { + "epoch": 0.8890021459227467, + "grad_norm": 0.46875, + "learning_rate": 4.718998392412349e-06, + "loss": 2.4014, + "step": 16571 + }, + { + "epoch": 0.8890557939914163, + "grad_norm": 0.38671875, + "learning_rate": 4.718958374347499e-06, + "loss": 2.102, + "step": 16572 + }, + { + "epoch": 0.8891094420600858, + "grad_norm": 0.365234375, + "learning_rate": 4.7189183536030254e-06, + "loss": 2.2489, + "step": 16573 + }, + { + "epoch": 0.8891630901287554, + "grad_norm": 0.439453125, + "learning_rate": 4.718878330178978e-06, + "loss": 2.4661, + "step": 16574 + }, + { + "epoch": 0.8892167381974249, + "grad_norm": 0.5234375, + "learning_rate": 4.718838304075405e-06, + "loss": 2.4087, + "step": 16575 + }, + { + "epoch": 0.8892703862660944, + "grad_norm": 0.42578125, + "learning_rate": 4.7187982752923545e-06, + "loss": 2.1745, + "step": 16576 + }, + { + "epoch": 0.8893240343347639, + "grad_norm": 0.455078125, + "learning_rate": 4.718758243829875e-06, + "loss": 2.381, + "step": 16577 + }, + { + "epoch": 0.8893776824034335, + "grad_norm": 0.4296875, + "learning_rate": 4.718718209688015e-06, + "loss": 2.569, + "step": 16578 + }, + { + "epoch": 0.889431330472103, + "grad_norm": 0.60546875, + "learning_rate": 4.718678172866822e-06, + "loss": 2.0991, + "step": 16579 + }, + { + "epoch": 0.8894849785407726, + "grad_norm": 0.44140625, + "learning_rate": 4.7186381333663445e-06, + "loss": 2.2847, + "step": 16580 + }, + { + "epoch": 0.889538626609442, + "grad_norm": 0.384765625, + "learning_rate": 4.718598091186632e-06, + "loss": 2.2526, + "step": 16581 + }, + { + "epoch": 0.8895922746781116, + "grad_norm": 0.4453125, + "learning_rate": 4.718558046327732e-06, + "loss": 2.2741, + "step": 16582 + }, + { + "epoch": 0.8896459227467811, + "grad_norm": 0.412109375, + "learning_rate": 4.718517998789693e-06, + "loss": 2.2577, + "step": 16583 + }, + { + "epoch": 0.8896995708154506, + "grad_norm": 0.609375, + "learning_rate": 4.718477948572563e-06, + "loss": 2.363, + "step": 16584 + }, + { + "epoch": 0.8897532188841202, + "grad_norm": 0.447265625, + "learning_rate": 4.718437895676391e-06, + "loss": 2.2134, + "step": 16585 + }, + { + "epoch": 0.8898068669527897, + "grad_norm": 0.458984375, + "learning_rate": 4.718397840101226e-06, + "loss": 1.8647, + "step": 16586 + }, + { + "epoch": 0.8898605150214592, + "grad_norm": 0.47265625, + "learning_rate": 4.718357781847114e-06, + "loss": 2.3702, + "step": 16587 + }, + { + "epoch": 0.8899141630901287, + "grad_norm": 0.4765625, + "learning_rate": 4.718317720914105e-06, + "loss": 2.2717, + "step": 16588 + }, + { + "epoch": 0.8899678111587983, + "grad_norm": 0.59765625, + "learning_rate": 4.7182776573022476e-06, + "loss": 2.3281, + "step": 16589 + }, + { + "epoch": 0.8900214592274678, + "grad_norm": 0.474609375, + "learning_rate": 4.71823759101159e-06, + "loss": 2.237, + "step": 16590 + }, + { + "epoch": 0.8900751072961374, + "grad_norm": 0.453125, + "learning_rate": 4.71819752204218e-06, + "loss": 2.463, + "step": 16591 + }, + { + "epoch": 0.8901287553648068, + "grad_norm": 0.4453125, + "learning_rate": 4.718157450394066e-06, + "loss": 2.2325, + "step": 16592 + }, + { + "epoch": 0.8901824034334764, + "grad_norm": 0.4609375, + "learning_rate": 4.718117376067298e-06, + "loss": 1.8055, + "step": 16593 + }, + { + "epoch": 0.8902360515021459, + "grad_norm": 0.76171875, + "learning_rate": 4.718077299061922e-06, + "loss": 2.2875, + "step": 16594 + }, + { + "epoch": 0.8902896995708155, + "grad_norm": 0.5859375, + "learning_rate": 4.7180372193779884e-06, + "loss": 2.2996, + "step": 16595 + }, + { + "epoch": 0.890343347639485, + "grad_norm": 0.4609375, + "learning_rate": 4.717997137015544e-06, + "loss": 2.132, + "step": 16596 + }, + { + "epoch": 0.8903969957081546, + "grad_norm": 0.451171875, + "learning_rate": 4.717957051974639e-06, + "loss": 2.2325, + "step": 16597 + }, + { + "epoch": 0.890450643776824, + "grad_norm": 0.65234375, + "learning_rate": 4.717916964255319e-06, + "loss": 2.2566, + "step": 16598 + }, + { + "epoch": 0.8905042918454935, + "grad_norm": 0.625, + "learning_rate": 4.717876873857635e-06, + "loss": 2.3344, + "step": 16599 + }, + { + "epoch": 0.8905579399141631, + "grad_norm": 0.48046875, + "learning_rate": 4.717836780781635e-06, + "loss": 2.329, + "step": 16600 + }, + { + "epoch": 0.8906115879828326, + "grad_norm": 0.59375, + "learning_rate": 4.717796685027367e-06, + "loss": 2.3308, + "step": 16601 + }, + { + "epoch": 0.8906652360515022, + "grad_norm": 0.427734375, + "learning_rate": 4.7177565865948795e-06, + "loss": 2.1511, + "step": 16602 + }, + { + "epoch": 0.8907188841201716, + "grad_norm": 0.482421875, + "learning_rate": 4.71771648548422e-06, + "loss": 2.5601, + "step": 16603 + }, + { + "epoch": 0.8907725321888412, + "grad_norm": 0.7265625, + "learning_rate": 4.717676381695438e-06, + "loss": 2.4124, + "step": 16604 + }, + { + "epoch": 0.8908261802575107, + "grad_norm": 0.515625, + "learning_rate": 4.717636275228582e-06, + "loss": 2.2489, + "step": 16605 + }, + { + "epoch": 0.8908798283261803, + "grad_norm": 0.4921875, + "learning_rate": 4.7175961660837e-06, + "loss": 2.2117, + "step": 16606 + }, + { + "epoch": 0.8909334763948498, + "grad_norm": 0.447265625, + "learning_rate": 4.717556054260841e-06, + "loss": 2.2215, + "step": 16607 + }, + { + "epoch": 0.8909871244635194, + "grad_norm": 0.609375, + "learning_rate": 4.7175159397600525e-06, + "loss": 2.7231, + "step": 16608 + }, + { + "epoch": 0.8910407725321888, + "grad_norm": 0.46875, + "learning_rate": 4.717475822581383e-06, + "loss": 1.9627, + "step": 16609 + }, + { + "epoch": 0.8910944206008584, + "grad_norm": 0.3984375, + "learning_rate": 4.717435702724882e-06, + "loss": 2.0445, + "step": 16610 + }, + { + "epoch": 0.8911480686695279, + "grad_norm": 0.466796875, + "learning_rate": 4.717395580190598e-06, + "loss": 2.4561, + "step": 16611 + }, + { + "epoch": 0.8912017167381975, + "grad_norm": 0.44921875, + "learning_rate": 4.7173554549785775e-06, + "loss": 2.078, + "step": 16612 + }, + { + "epoch": 0.891255364806867, + "grad_norm": 0.58984375, + "learning_rate": 4.7173153270888715e-06, + "loss": 2.0659, + "step": 16613 + }, + { + "epoch": 0.8913090128755364, + "grad_norm": 0.48046875, + "learning_rate": 4.717275196521526e-06, + "loss": 2.354, + "step": 16614 + }, + { + "epoch": 0.891362660944206, + "grad_norm": 0.51171875, + "learning_rate": 4.717235063276591e-06, + "loss": 2.178, + "step": 16615 + }, + { + "epoch": 0.8914163090128755, + "grad_norm": 0.435546875, + "learning_rate": 4.717194927354115e-06, + "loss": 2.0946, + "step": 16616 + }, + { + "epoch": 0.8914699570815451, + "grad_norm": 0.466796875, + "learning_rate": 4.717154788754146e-06, + "loss": 2.2124, + "step": 16617 + }, + { + "epoch": 0.8915236051502146, + "grad_norm": 0.4375, + "learning_rate": 4.717114647476732e-06, + "loss": 2.4341, + "step": 16618 + }, + { + "epoch": 0.8915772532188841, + "grad_norm": 0.462890625, + "learning_rate": 4.717074503521923e-06, + "loss": 2.0636, + "step": 16619 + }, + { + "epoch": 0.8916309012875536, + "grad_norm": 0.484375, + "learning_rate": 4.717034356889766e-06, + "loss": 2.4862, + "step": 16620 + }, + { + "epoch": 0.8916845493562232, + "grad_norm": 0.380859375, + "learning_rate": 4.71699420758031e-06, + "loss": 2.2379, + "step": 16621 + }, + { + "epoch": 0.8917381974248927, + "grad_norm": 0.52734375, + "learning_rate": 4.716954055593603e-06, + "loss": 2.4634, + "step": 16622 + }, + { + "epoch": 0.8917918454935623, + "grad_norm": 0.451171875, + "learning_rate": 4.716913900929695e-06, + "loss": 2.304, + "step": 16623 + }, + { + "epoch": 0.8918454935622318, + "grad_norm": 0.54296875, + "learning_rate": 4.716873743588633e-06, + "loss": 2.2361, + "step": 16624 + }, + { + "epoch": 0.8918991416309013, + "grad_norm": 0.64453125, + "learning_rate": 4.7168335835704655e-06, + "loss": 1.9312, + "step": 16625 + }, + { + "epoch": 0.8919527896995708, + "grad_norm": 0.47265625, + "learning_rate": 4.716793420875242e-06, + "loss": 2.1, + "step": 16626 + }, + { + "epoch": 0.8920064377682403, + "grad_norm": 0.412109375, + "learning_rate": 4.71675325550301e-06, + "loss": 2.3143, + "step": 16627 + }, + { + "epoch": 0.8920600858369099, + "grad_norm": 0.55078125, + "learning_rate": 4.716713087453819e-06, + "loss": 2.318, + "step": 16628 + }, + { + "epoch": 0.8921137339055794, + "grad_norm": 0.416015625, + "learning_rate": 4.716672916727717e-06, + "loss": 2.1641, + "step": 16629 + }, + { + "epoch": 0.8921673819742489, + "grad_norm": 0.4921875, + "learning_rate": 4.716632743324751e-06, + "loss": 2.4145, + "step": 16630 + }, + { + "epoch": 0.8922210300429184, + "grad_norm": 0.435546875, + "learning_rate": 4.716592567244973e-06, + "loss": 2.032, + "step": 16631 + }, + { + "epoch": 0.892274678111588, + "grad_norm": 0.4140625, + "learning_rate": 4.716552388488429e-06, + "loss": 2.2537, + "step": 16632 + }, + { + "epoch": 0.8923283261802575, + "grad_norm": 0.427734375, + "learning_rate": 4.716512207055167e-06, + "loss": 2.3227, + "step": 16633 + }, + { + "epoch": 0.8923819742489271, + "grad_norm": 0.458984375, + "learning_rate": 4.716472022945238e-06, + "loss": 2.1608, + "step": 16634 + }, + { + "epoch": 0.8924356223175965, + "grad_norm": 0.494140625, + "learning_rate": 4.716431836158688e-06, + "loss": 2.3651, + "step": 16635 + }, + { + "epoch": 0.8924892703862661, + "grad_norm": 0.392578125, + "learning_rate": 4.716391646695567e-06, + "loss": 1.7982, + "step": 16636 + }, + { + "epoch": 0.8925429184549356, + "grad_norm": 0.384765625, + "learning_rate": 4.716351454555923e-06, + "loss": 2.3052, + "step": 16637 + }, + { + "epoch": 0.8925965665236052, + "grad_norm": 0.427734375, + "learning_rate": 4.716311259739806e-06, + "loss": 1.9553, + "step": 16638 + }, + { + "epoch": 0.8926502145922747, + "grad_norm": 0.52734375, + "learning_rate": 4.716271062247261e-06, + "loss": 2.5554, + "step": 16639 + }, + { + "epoch": 0.8927038626609443, + "grad_norm": 0.3984375, + "learning_rate": 4.716230862078341e-06, + "loss": 2.1858, + "step": 16640 + }, + { + "epoch": 0.8927575107296137, + "grad_norm": 0.4765625, + "learning_rate": 4.71619065923309e-06, + "loss": 2.2193, + "step": 16641 + }, + { + "epoch": 0.8928111587982832, + "grad_norm": 0.43359375, + "learning_rate": 4.71615045371156e-06, + "loss": 2.2625, + "step": 16642 + }, + { + "epoch": 0.8928648068669528, + "grad_norm": 0.494140625, + "learning_rate": 4.716110245513799e-06, + "loss": 2.4416, + "step": 16643 + }, + { + "epoch": 0.8929184549356223, + "grad_norm": 0.546875, + "learning_rate": 4.716070034639854e-06, + "loss": 2.336, + "step": 16644 + }, + { + "epoch": 0.8929721030042919, + "grad_norm": 0.494140625, + "learning_rate": 4.716029821089775e-06, + "loss": 2.1339, + "step": 16645 + }, + { + "epoch": 0.8930257510729613, + "grad_norm": 0.466796875, + "learning_rate": 4.71598960486361e-06, + "loss": 2.3116, + "step": 16646 + }, + { + "epoch": 0.8930793991416309, + "grad_norm": 0.50390625, + "learning_rate": 4.715949385961407e-06, + "loss": 2.2236, + "step": 16647 + }, + { + "epoch": 0.8931330472103004, + "grad_norm": 0.470703125, + "learning_rate": 4.715909164383215e-06, + "loss": 2.2362, + "step": 16648 + }, + { + "epoch": 0.89318669527897, + "grad_norm": 0.49609375, + "learning_rate": 4.715868940129085e-06, + "loss": 2.2867, + "step": 16649 + }, + { + "epoch": 0.8932403433476395, + "grad_norm": 0.439453125, + "learning_rate": 4.715828713199061e-06, + "loss": 2.3485, + "step": 16650 + }, + { + "epoch": 0.893293991416309, + "grad_norm": 0.453125, + "learning_rate": 4.7157884835931955e-06, + "loss": 2.2146, + "step": 16651 + }, + { + "epoch": 0.8933476394849785, + "grad_norm": 0.42578125, + "learning_rate": 4.715748251311535e-06, + "loss": 2.2375, + "step": 16652 + }, + { + "epoch": 0.8934012875536481, + "grad_norm": 0.53515625, + "learning_rate": 4.715708016354128e-06, + "loss": 2.3867, + "step": 16653 + }, + { + "epoch": 0.8934549356223176, + "grad_norm": 0.51171875, + "learning_rate": 4.715667778721025e-06, + "loss": 2.2094, + "step": 16654 + }, + { + "epoch": 0.8935085836909872, + "grad_norm": 0.486328125, + "learning_rate": 4.715627538412272e-06, + "loss": 2.4704, + "step": 16655 + }, + { + "epoch": 0.8935622317596567, + "grad_norm": 0.451171875, + "learning_rate": 4.715587295427919e-06, + "loss": 2.1172, + "step": 16656 + }, + { + "epoch": 0.8936158798283261, + "grad_norm": 0.482421875, + "learning_rate": 4.715547049768015e-06, + "loss": 2.3882, + "step": 16657 + }, + { + "epoch": 0.8936695278969957, + "grad_norm": 0.423828125, + "learning_rate": 4.715506801432608e-06, + "loss": 2.2842, + "step": 16658 + }, + { + "epoch": 0.8937231759656652, + "grad_norm": 0.416015625, + "learning_rate": 4.715466550421747e-06, + "loss": 2.087, + "step": 16659 + }, + { + "epoch": 0.8937768240343348, + "grad_norm": 0.466796875, + "learning_rate": 4.71542629673548e-06, + "loss": 2.2785, + "step": 16660 + }, + { + "epoch": 0.8938304721030043, + "grad_norm": 0.4609375, + "learning_rate": 4.715386040373856e-06, + "loss": 2.2315, + "step": 16661 + }, + { + "epoch": 0.8938841201716738, + "grad_norm": 0.70703125, + "learning_rate": 4.715345781336923e-06, + "loss": 2.4957, + "step": 16662 + }, + { + "epoch": 0.8939377682403433, + "grad_norm": 0.5234375, + "learning_rate": 4.71530551962473e-06, + "loss": 2.4036, + "step": 16663 + }, + { + "epoch": 0.8939914163090129, + "grad_norm": 0.486328125, + "learning_rate": 4.715265255237327e-06, + "loss": 2.2096, + "step": 16664 + }, + { + "epoch": 0.8940450643776824, + "grad_norm": 0.498046875, + "learning_rate": 4.7152249881747605e-06, + "loss": 2.2854, + "step": 16665 + }, + { + "epoch": 0.894098712446352, + "grad_norm": 0.45703125, + "learning_rate": 4.71518471843708e-06, + "loss": 2.4846, + "step": 16666 + }, + { + "epoch": 0.8941523605150214, + "grad_norm": 0.4921875, + "learning_rate": 4.715144446024333e-06, + "loss": 2.3479, + "step": 16667 + }, + { + "epoch": 0.894206008583691, + "grad_norm": 0.490234375, + "learning_rate": 4.715104170936572e-06, + "loss": 2.2703, + "step": 16668 + }, + { + "epoch": 0.8942596566523605, + "grad_norm": 0.71875, + "learning_rate": 4.715063893173841e-06, + "loss": 2.0941, + "step": 16669 + }, + { + "epoch": 0.89431330472103, + "grad_norm": 0.5703125, + "learning_rate": 4.7150236127361905e-06, + "loss": 2.2287, + "step": 16670 + }, + { + "epoch": 0.8943669527896996, + "grad_norm": 0.490234375, + "learning_rate": 4.71498332962367e-06, + "loss": 2.349, + "step": 16671 + }, + { + "epoch": 0.894420600858369, + "grad_norm": 0.45703125, + "learning_rate": 4.714943043836326e-06, + "loss": 2.5314, + "step": 16672 + }, + { + "epoch": 0.8944742489270386, + "grad_norm": 0.365234375, + "learning_rate": 4.71490275537421e-06, + "loss": 2.181, + "step": 16673 + }, + { + "epoch": 0.8945278969957081, + "grad_norm": 0.44921875, + "learning_rate": 4.714862464237369e-06, + "loss": 2.1284, + "step": 16674 + }, + { + "epoch": 0.8945815450643777, + "grad_norm": 0.44921875, + "learning_rate": 4.714822170425851e-06, + "loss": 2.3908, + "step": 16675 + }, + { + "epoch": 0.8946351931330472, + "grad_norm": 0.484375, + "learning_rate": 4.714781873939706e-06, + "loss": 2.4071, + "step": 16676 + }, + { + "epoch": 0.8946888412017168, + "grad_norm": 0.5, + "learning_rate": 4.714741574778982e-06, + "loss": 2.46, + "step": 16677 + }, + { + "epoch": 0.8947424892703862, + "grad_norm": 0.51953125, + "learning_rate": 4.714701272943727e-06, + "loss": 2.3036, + "step": 16678 + }, + { + "epoch": 0.8947961373390558, + "grad_norm": 0.64453125, + "learning_rate": 4.7146609684339915e-06, + "loss": 2.1309, + "step": 16679 + }, + { + "epoch": 0.8948497854077253, + "grad_norm": 0.40234375, + "learning_rate": 4.714620661249822e-06, + "loss": 2.2952, + "step": 16680 + }, + { + "epoch": 0.8949034334763949, + "grad_norm": 0.412109375, + "learning_rate": 4.71458035139127e-06, + "loss": 1.9572, + "step": 16681 + }, + { + "epoch": 0.8949570815450644, + "grad_norm": 0.73828125, + "learning_rate": 4.714540038858382e-06, + "loss": 2.3002, + "step": 16682 + }, + { + "epoch": 0.895010729613734, + "grad_norm": 0.43359375, + "learning_rate": 4.714499723651206e-06, + "loss": 2.4292, + "step": 16683 + }, + { + "epoch": 0.8950643776824034, + "grad_norm": 0.4609375, + "learning_rate": 4.714459405769792e-06, + "loss": 2.0382, + "step": 16684 + }, + { + "epoch": 0.8951180257510729, + "grad_norm": 0.404296875, + "learning_rate": 4.714419085214189e-06, + "loss": 2.0845, + "step": 16685 + }, + { + "epoch": 0.8951716738197425, + "grad_norm": 0.478515625, + "learning_rate": 4.714378761984446e-06, + "loss": 2.188, + "step": 16686 + }, + { + "epoch": 0.895225321888412, + "grad_norm": 0.416015625, + "learning_rate": 4.71433843608061e-06, + "loss": 2.0758, + "step": 16687 + }, + { + "epoch": 0.8952789699570816, + "grad_norm": 0.6953125, + "learning_rate": 4.714298107502731e-06, + "loss": 2.404, + "step": 16688 + }, + { + "epoch": 0.895332618025751, + "grad_norm": 0.46484375, + "learning_rate": 4.7142577762508565e-06, + "loss": 2.3459, + "step": 16689 + }, + { + "epoch": 0.8953862660944206, + "grad_norm": 0.52734375, + "learning_rate": 4.714217442325036e-06, + "loss": 2.1147, + "step": 16690 + }, + { + "epoch": 0.8954399141630901, + "grad_norm": 0.462890625, + "learning_rate": 4.714177105725319e-06, + "loss": 2.3578, + "step": 16691 + }, + { + "epoch": 0.8954935622317597, + "grad_norm": 0.59765625, + "learning_rate": 4.714136766451753e-06, + "loss": 2.7477, + "step": 16692 + }, + { + "epoch": 0.8955472103004292, + "grad_norm": 0.640625, + "learning_rate": 4.714096424504388e-06, + "loss": 2.2688, + "step": 16693 + }, + { + "epoch": 0.8956008583690988, + "grad_norm": 0.462890625, + "learning_rate": 4.7140560798832715e-06, + "loss": 2.0698, + "step": 16694 + }, + { + "epoch": 0.8956545064377682, + "grad_norm": 0.62109375, + "learning_rate": 4.7140157325884516e-06, + "loss": 2.4399, + "step": 16695 + }, + { + "epoch": 0.8957081545064378, + "grad_norm": 0.4765625, + "learning_rate": 4.713975382619979e-06, + "loss": 2.3546, + "step": 16696 + }, + { + "epoch": 0.8957618025751073, + "grad_norm": 0.5703125, + "learning_rate": 4.713935029977901e-06, + "loss": 2.4918, + "step": 16697 + }, + { + "epoch": 0.8958154506437769, + "grad_norm": 0.4375, + "learning_rate": 4.713894674662267e-06, + "loss": 2.1669, + "step": 16698 + }, + { + "epoch": 0.8958690987124464, + "grad_norm": 0.490234375, + "learning_rate": 4.713854316673126e-06, + "loss": 2.4297, + "step": 16699 + }, + { + "epoch": 0.8959227467811158, + "grad_norm": 0.447265625, + "learning_rate": 4.713813956010524e-06, + "loss": 2.4203, + "step": 16700 + }, + { + "epoch": 0.8959763948497854, + "grad_norm": 0.443359375, + "learning_rate": 4.713773592674514e-06, + "loss": 2.2068, + "step": 16701 + }, + { + "epoch": 0.8960300429184549, + "grad_norm": 0.5, + "learning_rate": 4.713733226665142e-06, + "loss": 2.1845, + "step": 16702 + }, + { + "epoch": 0.8960836909871245, + "grad_norm": 0.478515625, + "learning_rate": 4.713692857982458e-06, + "loss": 2.3749, + "step": 16703 + }, + { + "epoch": 0.896137339055794, + "grad_norm": 0.53515625, + "learning_rate": 4.7136524866265106e-06, + "loss": 2.6247, + "step": 16704 + }, + { + "epoch": 0.8961909871244635, + "grad_norm": 0.51171875, + "learning_rate": 4.713612112597347e-06, + "loss": 2.0409, + "step": 16705 + }, + { + "epoch": 0.896244635193133, + "grad_norm": 0.466796875, + "learning_rate": 4.713571735895018e-06, + "loss": 2.2321, + "step": 16706 + }, + { + "epoch": 0.8962982832618026, + "grad_norm": 0.390625, + "learning_rate": 4.7135313565195705e-06, + "loss": 1.9408, + "step": 16707 + }, + { + "epoch": 0.8963519313304721, + "grad_norm": 0.470703125, + "learning_rate": 4.713490974471056e-06, + "loss": 2.0805, + "step": 16708 + }, + { + "epoch": 0.8964055793991417, + "grad_norm": 0.48046875, + "learning_rate": 4.71345058974952e-06, + "loss": 2.4156, + "step": 16709 + }, + { + "epoch": 0.8964592274678111, + "grad_norm": 0.451171875, + "learning_rate": 4.7134102023550135e-06, + "loss": 2.1129, + "step": 16710 + }, + { + "epoch": 0.8965128755364807, + "grad_norm": 0.400390625, + "learning_rate": 4.713369812287584e-06, + "loss": 2.1391, + "step": 16711 + }, + { + "epoch": 0.8965665236051502, + "grad_norm": 0.478515625, + "learning_rate": 4.71332941954728e-06, + "loss": 2.4057, + "step": 16712 + }, + { + "epoch": 0.8966201716738197, + "grad_norm": 0.46875, + "learning_rate": 4.7132890241341525e-06, + "loss": 2.245, + "step": 16713 + }, + { + "epoch": 0.8966738197424893, + "grad_norm": 0.8515625, + "learning_rate": 4.713248626048249e-06, + "loss": 2.2575, + "step": 16714 + }, + { + "epoch": 0.8967274678111588, + "grad_norm": 0.5078125, + "learning_rate": 4.713208225289617e-06, + "loss": 2.4806, + "step": 16715 + }, + { + "epoch": 0.8967811158798283, + "grad_norm": 0.4609375, + "learning_rate": 4.713167821858307e-06, + "loss": 2.286, + "step": 16716 + }, + { + "epoch": 0.8968347639484978, + "grad_norm": 0.494140625, + "learning_rate": 4.713127415754367e-06, + "loss": 2.3337, + "step": 16717 + }, + { + "epoch": 0.8968884120171674, + "grad_norm": 0.4609375, + "learning_rate": 4.7130870069778465e-06, + "loss": 2.2295, + "step": 16718 + }, + { + "epoch": 0.8969420600858369, + "grad_norm": 0.47265625, + "learning_rate": 4.713046595528794e-06, + "loss": 2.2045, + "step": 16719 + }, + { + "epoch": 0.8969957081545065, + "grad_norm": 0.578125, + "learning_rate": 4.713006181407258e-06, + "loss": 2.1637, + "step": 16720 + }, + { + "epoch": 0.8970493562231759, + "grad_norm": 0.419921875, + "learning_rate": 4.712965764613287e-06, + "loss": 2.1184, + "step": 16721 + }, + { + "epoch": 0.8971030042918455, + "grad_norm": 1.890625, + "learning_rate": 4.712925345146931e-06, + "loss": 1.9566, + "step": 16722 + }, + { + "epoch": 0.897156652360515, + "grad_norm": 0.55078125, + "learning_rate": 4.712884923008237e-06, + "loss": 2.1995, + "step": 16723 + }, + { + "epoch": 0.8972103004291846, + "grad_norm": 1.765625, + "learning_rate": 4.712844498197256e-06, + "loss": 2.6244, + "step": 16724 + }, + { + "epoch": 0.8972639484978541, + "grad_norm": 0.4765625, + "learning_rate": 4.712804070714035e-06, + "loss": 2.5658, + "step": 16725 + }, + { + "epoch": 0.8973175965665237, + "grad_norm": 0.431640625, + "learning_rate": 4.7127636405586236e-06, + "loss": 2.4908, + "step": 16726 + }, + { + "epoch": 0.8973712446351931, + "grad_norm": 0.482421875, + "learning_rate": 4.7127232077310705e-06, + "loss": 2.2916, + "step": 16727 + }, + { + "epoch": 0.8974248927038626, + "grad_norm": 0.44921875, + "learning_rate": 4.712682772231425e-06, + "loss": 2.1469, + "step": 16728 + }, + { + "epoch": 0.8974785407725322, + "grad_norm": 0.48046875, + "learning_rate": 4.712642334059735e-06, + "loss": 2.2783, + "step": 16729 + }, + { + "epoch": 0.8975321888412017, + "grad_norm": 0.4453125, + "learning_rate": 4.71260189321605e-06, + "loss": 2.3101, + "step": 16730 + }, + { + "epoch": 0.8975858369098713, + "grad_norm": 0.498046875, + "learning_rate": 4.712561449700418e-06, + "loss": 1.9226, + "step": 16731 + }, + { + "epoch": 0.8976394849785407, + "grad_norm": 0.48046875, + "learning_rate": 4.71252100351289e-06, + "loss": 2.2849, + "step": 16732 + }, + { + "epoch": 0.8976931330472103, + "grad_norm": 0.474609375, + "learning_rate": 4.712480554653513e-06, + "loss": 2.3194, + "step": 16733 + }, + { + "epoch": 0.8977467811158798, + "grad_norm": 0.427734375, + "learning_rate": 4.712440103122335e-06, + "loss": 2.2085, + "step": 16734 + }, + { + "epoch": 0.8978004291845494, + "grad_norm": 0.375, + "learning_rate": 4.712399648919407e-06, + "loss": 2.1898, + "step": 16735 + }, + { + "epoch": 0.8978540772532189, + "grad_norm": 1.2890625, + "learning_rate": 4.712359192044776e-06, + "loss": 2.1538, + "step": 16736 + }, + { + "epoch": 0.8979077253218885, + "grad_norm": 0.5234375, + "learning_rate": 4.712318732498493e-06, + "loss": 2.166, + "step": 16737 + }, + { + "epoch": 0.8979613733905579, + "grad_norm": 0.48828125, + "learning_rate": 4.712278270280605e-06, + "loss": 2.3937, + "step": 16738 + }, + { + "epoch": 0.8980150214592275, + "grad_norm": 0.4296875, + "learning_rate": 4.712237805391161e-06, + "loss": 2.198, + "step": 16739 + }, + { + "epoch": 0.898068669527897, + "grad_norm": 0.50390625, + "learning_rate": 4.712197337830211e-06, + "loss": 2.2305, + "step": 16740 + }, + { + "epoch": 0.8981223175965666, + "grad_norm": 0.54296875, + "learning_rate": 4.712156867597803e-06, + "loss": 2.1755, + "step": 16741 + }, + { + "epoch": 0.898175965665236, + "grad_norm": 1.4296875, + "learning_rate": 4.712116394693986e-06, + "loss": 2.2719, + "step": 16742 + }, + { + "epoch": 0.8982296137339055, + "grad_norm": 0.494140625, + "learning_rate": 4.712075919118809e-06, + "loss": 2.4433, + "step": 16743 + }, + { + "epoch": 0.8982832618025751, + "grad_norm": 0.41796875, + "learning_rate": 4.712035440872321e-06, + "loss": 2.3465, + "step": 16744 + }, + { + "epoch": 0.8983369098712446, + "grad_norm": 0.458984375, + "learning_rate": 4.71199495995457e-06, + "loss": 2.3354, + "step": 16745 + }, + { + "epoch": 0.8983905579399142, + "grad_norm": 0.470703125, + "learning_rate": 4.711954476365607e-06, + "loss": 2.3363, + "step": 16746 + }, + { + "epoch": 0.8984442060085837, + "grad_norm": 0.396484375, + "learning_rate": 4.7119139901054775e-06, + "loss": 2.0453, + "step": 16747 + }, + { + "epoch": 0.8984978540772532, + "grad_norm": 0.458984375, + "learning_rate": 4.711873501174234e-06, + "loss": 2.231, + "step": 16748 + }, + { + "epoch": 0.8985515021459227, + "grad_norm": 0.423828125, + "learning_rate": 4.7118330095719236e-06, + "loss": 1.6363, + "step": 16749 + }, + { + "epoch": 0.8986051502145923, + "grad_norm": 0.53125, + "learning_rate": 4.711792515298595e-06, + "loss": 2.1137, + "step": 16750 + }, + { + "epoch": 0.8986587982832618, + "grad_norm": 0.44921875, + "learning_rate": 4.7117520183542975e-06, + "loss": 2.4051, + "step": 16751 + }, + { + "epoch": 0.8987124463519314, + "grad_norm": 0.546875, + "learning_rate": 4.711711518739079e-06, + "loss": 2.2405, + "step": 16752 + }, + { + "epoch": 0.8987660944206008, + "grad_norm": 0.4921875, + "learning_rate": 4.711671016452991e-06, + "loss": 2.3241, + "step": 16753 + }, + { + "epoch": 0.8988197424892704, + "grad_norm": 0.515625, + "learning_rate": 4.71163051149608e-06, + "loss": 2.3062, + "step": 16754 + }, + { + "epoch": 0.8988733905579399, + "grad_norm": 0.4921875, + "learning_rate": 4.711590003868396e-06, + "loss": 2.3132, + "step": 16755 + }, + { + "epoch": 0.8989270386266094, + "grad_norm": 0.4375, + "learning_rate": 4.7115494935699875e-06, + "loss": 2.3508, + "step": 16756 + }, + { + "epoch": 0.898980686695279, + "grad_norm": 10.625, + "learning_rate": 4.711508980600904e-06, + "loss": 2.0821, + "step": 16757 + }, + { + "epoch": 0.8990343347639485, + "grad_norm": 0.4609375, + "learning_rate": 4.711468464961193e-06, + "loss": 2.1557, + "step": 16758 + }, + { + "epoch": 0.899087982832618, + "grad_norm": 0.421875, + "learning_rate": 4.711427946650905e-06, + "loss": 2.2296, + "step": 16759 + }, + { + "epoch": 0.8991416309012875, + "grad_norm": 0.470703125, + "learning_rate": 4.7113874256700885e-06, + "loss": 2.2824, + "step": 16760 + }, + { + "epoch": 0.8991952789699571, + "grad_norm": 0.408203125, + "learning_rate": 4.711346902018792e-06, + "loss": 2.2768, + "step": 16761 + }, + { + "epoch": 0.8992489270386266, + "grad_norm": 0.53515625, + "learning_rate": 4.711306375697064e-06, + "loss": 2.5995, + "step": 16762 + }, + { + "epoch": 0.8993025751072962, + "grad_norm": 0.46875, + "learning_rate": 4.711265846704955e-06, + "loss": 2.3122, + "step": 16763 + }, + { + "epoch": 0.8993562231759656, + "grad_norm": 0.412109375, + "learning_rate": 4.711225315042513e-06, + "loss": 2.422, + "step": 16764 + }, + { + "epoch": 0.8994098712446352, + "grad_norm": 0.46875, + "learning_rate": 4.7111847807097875e-06, + "loss": 2.3984, + "step": 16765 + }, + { + "epoch": 0.8994635193133047, + "grad_norm": 0.4609375, + "learning_rate": 4.7111442437068255e-06, + "loss": 2.3181, + "step": 16766 + }, + { + "epoch": 0.8995171673819743, + "grad_norm": 0.4453125, + "learning_rate": 4.7111037040336784e-06, + "loss": 2.3211, + "step": 16767 + }, + { + "epoch": 0.8995708154506438, + "grad_norm": 0.474609375, + "learning_rate": 4.711063161690395e-06, + "loss": 2.2875, + "step": 16768 + }, + { + "epoch": 0.8996244635193134, + "grad_norm": 0.4140625, + "learning_rate": 4.711022616677023e-06, + "loss": 2.0459, + "step": 16769 + }, + { + "epoch": 0.8996781115879828, + "grad_norm": 0.462890625, + "learning_rate": 4.71098206899361e-06, + "loss": 2.2598, + "step": 16770 + }, + { + "epoch": 0.8997317596566523, + "grad_norm": 0.54296875, + "learning_rate": 4.710941518640209e-06, + "loss": 2.0259, + "step": 16771 + }, + { + "epoch": 0.8997854077253219, + "grad_norm": 0.455078125, + "learning_rate": 4.710900965616865e-06, + "loss": 2.3071, + "step": 16772 + }, + { + "epoch": 0.8998390557939914, + "grad_norm": 0.5234375, + "learning_rate": 4.71086040992363e-06, + "loss": 2.6727, + "step": 16773 + }, + { + "epoch": 0.899892703862661, + "grad_norm": 0.498046875, + "learning_rate": 4.710819851560551e-06, + "loss": 2.3419, + "step": 16774 + }, + { + "epoch": 0.8999463519313304, + "grad_norm": 1.875, + "learning_rate": 4.710779290527678e-06, + "loss": 2.4541, + "step": 16775 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 4.710738726825059e-06, + "loss": 2.3535, + "step": 16776 + }, + { + "epoch": 0.9000536480686695, + "grad_norm": 0.416015625, + "learning_rate": 4.710698160452745e-06, + "loss": 2.1425, + "step": 16777 + }, + { + "epoch": 0.9001072961373391, + "grad_norm": 0.51953125, + "learning_rate": 4.710657591410782e-06, + "loss": 2.2354, + "step": 16778 + }, + { + "epoch": 0.9001609442060086, + "grad_norm": 0.55859375, + "learning_rate": 4.710617019699222e-06, + "loss": 2.4895, + "step": 16779 + }, + { + "epoch": 0.9002145922746781, + "grad_norm": 0.4921875, + "learning_rate": 4.710576445318111e-06, + "loss": 2.2857, + "step": 16780 + }, + { + "epoch": 0.9002682403433476, + "grad_norm": 0.53125, + "learning_rate": 4.710535868267502e-06, + "loss": 2.1785, + "step": 16781 + }, + { + "epoch": 0.9003218884120172, + "grad_norm": 0.486328125, + "learning_rate": 4.71049528854744e-06, + "loss": 2.0971, + "step": 16782 + }, + { + "epoch": 0.9003755364806867, + "grad_norm": 0.41796875, + "learning_rate": 4.710454706157975e-06, + "loss": 2.1797, + "step": 16783 + }, + { + "epoch": 0.9004291845493563, + "grad_norm": 0.419921875, + "learning_rate": 4.710414121099158e-06, + "loss": 2.0552, + "step": 16784 + }, + { + "epoch": 0.9004828326180258, + "grad_norm": 0.52734375, + "learning_rate": 4.7103735333710354e-06, + "loss": 2.0116, + "step": 16785 + }, + { + "epoch": 0.9005364806866952, + "grad_norm": 0.5, + "learning_rate": 4.710332942973658e-06, + "loss": 2.073, + "step": 16786 + }, + { + "epoch": 0.9005901287553648, + "grad_norm": 0.5234375, + "learning_rate": 4.710292349907075e-06, + "loss": 2.2605, + "step": 16787 + }, + { + "epoch": 0.9006437768240343, + "grad_norm": 0.48828125, + "learning_rate": 4.710251754171333e-06, + "loss": 2.3634, + "step": 16788 + }, + { + "epoch": 0.9006974248927039, + "grad_norm": 0.4765625, + "learning_rate": 4.7102111557664845e-06, + "loss": 2.3547, + "step": 16789 + }, + { + "epoch": 0.9007510729613734, + "grad_norm": 0.59765625, + "learning_rate": 4.710170554692576e-06, + "loss": 2.1238, + "step": 16790 + }, + { + "epoch": 0.9008047210300429, + "grad_norm": 0.515625, + "learning_rate": 4.7101299509496565e-06, + "loss": 2.1912, + "step": 16791 + }, + { + "epoch": 0.9008583690987124, + "grad_norm": 0.43359375, + "learning_rate": 4.710089344537777e-06, + "loss": 2.3444, + "step": 16792 + }, + { + "epoch": 0.900912017167382, + "grad_norm": 0.484375, + "learning_rate": 4.7100487354569845e-06, + "loss": 2.2445, + "step": 16793 + }, + { + "epoch": 0.9009656652360515, + "grad_norm": 0.49609375, + "learning_rate": 4.710008123707329e-06, + "loss": 2.425, + "step": 16794 + }, + { + "epoch": 0.9010193133047211, + "grad_norm": 0.5546875, + "learning_rate": 4.70996750928886e-06, + "loss": 2.5078, + "step": 16795 + }, + { + "epoch": 0.9010729613733905, + "grad_norm": 0.45703125, + "learning_rate": 4.709926892201625e-06, + "loss": 2.3537, + "step": 16796 + }, + { + "epoch": 0.9011266094420601, + "grad_norm": 1.0703125, + "learning_rate": 4.7098862724456755e-06, + "loss": 2.2973, + "step": 16797 + }, + { + "epoch": 0.9011802575107296, + "grad_norm": 0.4765625, + "learning_rate": 4.709845650021058e-06, + "loss": 2.4202, + "step": 16798 + }, + { + "epoch": 0.9012339055793992, + "grad_norm": 0.4296875, + "learning_rate": 4.709805024927823e-06, + "loss": 2.2141, + "step": 16799 + }, + { + "epoch": 0.9012875536480687, + "grad_norm": 0.404296875, + "learning_rate": 4.709764397166018e-06, + "loss": 2.1066, + "step": 16800 + }, + { + "epoch": 0.9013412017167381, + "grad_norm": 0.55859375, + "learning_rate": 4.709723766735695e-06, + "loss": 2.585, + "step": 16801 + }, + { + "epoch": 0.9013948497854077, + "grad_norm": 0.56640625, + "learning_rate": 4.7096831336369e-06, + "loss": 2.3248, + "step": 16802 + }, + { + "epoch": 0.9014484978540772, + "grad_norm": 0.50390625, + "learning_rate": 4.709642497869683e-06, + "loss": 2.3664, + "step": 16803 + }, + { + "epoch": 0.9015021459227468, + "grad_norm": 1.171875, + "learning_rate": 4.709601859434094e-06, + "loss": 2.3364, + "step": 16804 + }, + { + "epoch": 0.9015557939914163, + "grad_norm": 0.4921875, + "learning_rate": 4.709561218330182e-06, + "loss": 2.3182, + "step": 16805 + }, + { + "epoch": 0.9016094420600859, + "grad_norm": 0.482421875, + "learning_rate": 4.7095205745579955e-06, + "loss": 2.1542, + "step": 16806 + }, + { + "epoch": 0.9016630901287553, + "grad_norm": 17.75, + "learning_rate": 4.709479928117584e-06, + "loss": 2.2111, + "step": 16807 + }, + { + "epoch": 0.9017167381974249, + "grad_norm": 0.46875, + "learning_rate": 4.709439279008995e-06, + "loss": 2.2702, + "step": 16808 + }, + { + "epoch": 0.9017703862660944, + "grad_norm": 0.5390625, + "learning_rate": 4.70939862723228e-06, + "loss": 2.3394, + "step": 16809 + }, + { + "epoch": 0.901824034334764, + "grad_norm": 0.435546875, + "learning_rate": 4.709357972787486e-06, + "loss": 2.3195, + "step": 16810 + }, + { + "epoch": 0.9018776824034335, + "grad_norm": 0.392578125, + "learning_rate": 4.7093173156746645e-06, + "loss": 2.2443, + "step": 16811 + }, + { + "epoch": 0.901931330472103, + "grad_norm": 0.486328125, + "learning_rate": 4.709276655893862e-06, + "loss": 2.2586, + "step": 16812 + }, + { + "epoch": 0.9019849785407725, + "grad_norm": 0.546875, + "learning_rate": 4.709235993445128e-06, + "loss": 2.1631, + "step": 16813 + }, + { + "epoch": 0.902038626609442, + "grad_norm": 0.40625, + "learning_rate": 4.709195328328514e-06, + "loss": 1.9777, + "step": 16814 + }, + { + "epoch": 0.9020922746781116, + "grad_norm": 0.423828125, + "learning_rate": 4.709154660544066e-06, + "loss": 2.0425, + "step": 16815 + }, + { + "epoch": 0.9021459227467811, + "grad_norm": 0.431640625, + "learning_rate": 4.709113990091836e-06, + "loss": 2.2952, + "step": 16816 + }, + { + "epoch": 0.9021995708154507, + "grad_norm": 0.404296875, + "learning_rate": 4.709073316971871e-06, + "loss": 2.4809, + "step": 16817 + }, + { + "epoch": 0.9022532188841201, + "grad_norm": 0.423828125, + "learning_rate": 4.709032641184221e-06, + "loss": 2.1937, + "step": 16818 + }, + { + "epoch": 0.9023068669527897, + "grad_norm": 0.59375, + "learning_rate": 4.708991962728934e-06, + "loss": 2.3952, + "step": 16819 + }, + { + "epoch": 0.9023605150214592, + "grad_norm": 0.84765625, + "learning_rate": 4.7089512816060605e-06, + "loss": 1.3163, + "step": 16820 + }, + { + "epoch": 0.9024141630901288, + "grad_norm": 0.46875, + "learning_rate": 4.708910597815649e-06, + "loss": 2.2905, + "step": 16821 + }, + { + "epoch": 0.9024678111587983, + "grad_norm": 0.51171875, + "learning_rate": 4.70886991135775e-06, + "loss": 2.2135, + "step": 16822 + }, + { + "epoch": 0.9025214592274678, + "grad_norm": 0.515625, + "learning_rate": 4.70882922223241e-06, + "loss": 2.6306, + "step": 16823 + }, + { + "epoch": 0.9025751072961373, + "grad_norm": 0.51171875, + "learning_rate": 4.70878853043968e-06, + "loss": 2.3586, + "step": 16824 + }, + { + "epoch": 0.9026287553648069, + "grad_norm": 0.453125, + "learning_rate": 4.708747835979608e-06, + "loss": 2.3762, + "step": 16825 + }, + { + "epoch": 0.9026824034334764, + "grad_norm": 0.451171875, + "learning_rate": 4.708707138852245e-06, + "loss": 2.2058, + "step": 16826 + }, + { + "epoch": 0.902736051502146, + "grad_norm": 0.392578125, + "learning_rate": 4.708666439057638e-06, + "loss": 1.9084, + "step": 16827 + }, + { + "epoch": 0.9027896995708155, + "grad_norm": 0.498046875, + "learning_rate": 4.708625736595838e-06, + "loss": 2.4415, + "step": 16828 + }, + { + "epoch": 0.9028433476394849, + "grad_norm": 0.56640625, + "learning_rate": 4.708585031466894e-06, + "loss": 2.2687, + "step": 16829 + }, + { + "epoch": 0.9028969957081545, + "grad_norm": 0.41796875, + "learning_rate": 4.7085443236708525e-06, + "loss": 2.2242, + "step": 16830 + }, + { + "epoch": 0.902950643776824, + "grad_norm": 0.404296875, + "learning_rate": 4.7085036132077655e-06, + "loss": 2.1514, + "step": 16831 + }, + { + "epoch": 0.9030042918454936, + "grad_norm": 0.443359375, + "learning_rate": 4.708462900077682e-06, + "loss": 2.3484, + "step": 16832 + }, + { + "epoch": 0.903057939914163, + "grad_norm": 0.5546875, + "learning_rate": 4.708422184280649e-06, + "loss": 2.3717, + "step": 16833 + }, + { + "epoch": 0.9031115879828326, + "grad_norm": 0.4296875, + "learning_rate": 4.708381465816718e-06, + "loss": 2.3771, + "step": 16834 + }, + { + "epoch": 0.9031652360515021, + "grad_norm": 0.5078125, + "learning_rate": 4.708340744685937e-06, + "loss": 2.3733, + "step": 16835 + }, + { + "epoch": 0.9032188841201717, + "grad_norm": 0.87109375, + "learning_rate": 4.708300020888355e-06, + "loss": 2.185, + "step": 16836 + }, + { + "epoch": 0.9032725321888412, + "grad_norm": 0.4453125, + "learning_rate": 4.708259294424022e-06, + "loss": 2.5128, + "step": 16837 + }, + { + "epoch": 0.9033261802575108, + "grad_norm": 0.5859375, + "learning_rate": 4.708218565292987e-06, + "loss": 2.5605, + "step": 16838 + }, + { + "epoch": 0.9033798283261802, + "grad_norm": 0.4140625, + "learning_rate": 4.7081778334953e-06, + "loss": 2.3107, + "step": 16839 + }, + { + "epoch": 0.9034334763948498, + "grad_norm": 0.4765625, + "learning_rate": 4.708137099031007e-06, + "loss": 2.0206, + "step": 16840 + }, + { + "epoch": 0.9034871244635193, + "grad_norm": 0.451171875, + "learning_rate": 4.708096361900161e-06, + "loss": 2.2788, + "step": 16841 + }, + { + "epoch": 0.9035407725321889, + "grad_norm": 0.51171875, + "learning_rate": 4.708055622102809e-06, + "loss": 2.3957, + "step": 16842 + }, + { + "epoch": 0.9035944206008584, + "grad_norm": 0.431640625, + "learning_rate": 4.708014879639e-06, + "loss": 2.3427, + "step": 16843 + }, + { + "epoch": 0.9036480686695278, + "grad_norm": 1.046875, + "learning_rate": 4.707974134508785e-06, + "loss": 2.175, + "step": 16844 + }, + { + "epoch": 0.9037017167381974, + "grad_norm": 0.5, + "learning_rate": 4.7079333867122125e-06, + "loss": 2.4912, + "step": 16845 + }, + { + "epoch": 0.9037553648068669, + "grad_norm": 0.51171875, + "learning_rate": 4.70789263624933e-06, + "loss": 2.1661, + "step": 16846 + }, + { + "epoch": 0.9038090128755365, + "grad_norm": 0.45703125, + "learning_rate": 4.707851883120189e-06, + "loss": 2.4656, + "step": 16847 + }, + { + "epoch": 0.903862660944206, + "grad_norm": 0.498046875, + "learning_rate": 4.7078111273248374e-06, + "loss": 2.0302, + "step": 16848 + }, + { + "epoch": 0.9039163090128756, + "grad_norm": 0.40625, + "learning_rate": 4.707770368863325e-06, + "loss": 1.6158, + "step": 16849 + }, + { + "epoch": 0.903969957081545, + "grad_norm": 0.4765625, + "learning_rate": 4.707729607735702e-06, + "loss": 2.3366, + "step": 16850 + }, + { + "epoch": 0.9040236051502146, + "grad_norm": 0.48828125, + "learning_rate": 4.707688843942014e-06, + "loss": 2.2047, + "step": 16851 + }, + { + "epoch": 0.9040772532188841, + "grad_norm": 0.9609375, + "learning_rate": 4.707648077482314e-06, + "loss": 2.3696, + "step": 16852 + }, + { + "epoch": 0.9041309012875537, + "grad_norm": 0.4296875, + "learning_rate": 4.7076073083566494e-06, + "loss": 2.2099, + "step": 16853 + }, + { + "epoch": 0.9041845493562232, + "grad_norm": 0.52734375, + "learning_rate": 4.707566536565071e-06, + "loss": 2.464, + "step": 16854 + }, + { + "epoch": 0.9042381974248928, + "grad_norm": 0.5625, + "learning_rate": 4.707525762107627e-06, + "loss": 2.3567, + "step": 16855 + }, + { + "epoch": 0.9042918454935622, + "grad_norm": 0.5078125, + "learning_rate": 4.707484984984365e-06, + "loss": 2.3522, + "step": 16856 + }, + { + "epoch": 0.9043454935622317, + "grad_norm": 0.53515625, + "learning_rate": 4.707444205195337e-06, + "loss": 2.3934, + "step": 16857 + }, + { + "epoch": 0.9043991416309013, + "grad_norm": 0.5390625, + "learning_rate": 4.707403422740591e-06, + "loss": 2.1816, + "step": 16858 + }, + { + "epoch": 0.9044527896995708, + "grad_norm": 0.494140625, + "learning_rate": 4.707362637620176e-06, + "loss": 2.5101, + "step": 16859 + }, + { + "epoch": 0.9045064377682404, + "grad_norm": 0.625, + "learning_rate": 4.7073218498341426e-06, + "loss": 2.3294, + "step": 16860 + }, + { + "epoch": 0.9045600858369098, + "grad_norm": 0.98046875, + "learning_rate": 4.7072810593825385e-06, + "loss": 2.2213, + "step": 16861 + }, + { + "epoch": 0.9046137339055794, + "grad_norm": 0.44921875, + "learning_rate": 4.707240266265414e-06, + "loss": 1.9858, + "step": 16862 + }, + { + "epoch": 0.9046673819742489, + "grad_norm": 0.78125, + "learning_rate": 4.707199470482817e-06, + "loss": 2.104, + "step": 16863 + }, + { + "epoch": 0.9047210300429185, + "grad_norm": 0.474609375, + "learning_rate": 4.707158672034797e-06, + "loss": 2.5625, + "step": 16864 + }, + { + "epoch": 0.904774678111588, + "grad_norm": 0.453125, + "learning_rate": 4.707117870921406e-06, + "loss": 2.1967, + "step": 16865 + }, + { + "epoch": 0.9048283261802575, + "grad_norm": 0.54296875, + "learning_rate": 4.707077067142689e-06, + "loss": 2.2689, + "step": 16866 + }, + { + "epoch": 0.904881974248927, + "grad_norm": 0.44921875, + "learning_rate": 4.7070362606986995e-06, + "loss": 1.6632, + "step": 16867 + }, + { + "epoch": 0.9049356223175966, + "grad_norm": 0.52734375, + "learning_rate": 4.706995451589483e-06, + "loss": 2.5191, + "step": 16868 + }, + { + "epoch": 0.9049892703862661, + "grad_norm": 0.62890625, + "learning_rate": 4.706954639815091e-06, + "loss": 2.2582, + "step": 16869 + }, + { + "epoch": 0.9050429184549357, + "grad_norm": 0.4609375, + "learning_rate": 4.706913825375573e-06, + "loss": 2.1387, + "step": 16870 + }, + { + "epoch": 0.9050965665236052, + "grad_norm": 0.40234375, + "learning_rate": 4.706873008270977e-06, + "loss": 2.1594, + "step": 16871 + }, + { + "epoch": 0.9051502145922746, + "grad_norm": 0.46484375, + "learning_rate": 4.7068321885013536e-06, + "loss": 1.7409, + "step": 16872 + }, + { + "epoch": 0.9052038626609442, + "grad_norm": 0.38671875, + "learning_rate": 4.706791366066751e-06, + "loss": 1.9926, + "step": 16873 + }, + { + "epoch": 0.9052575107296137, + "grad_norm": 0.4609375, + "learning_rate": 4.706750540967219e-06, + "loss": 2.2946, + "step": 16874 + }, + { + "epoch": 0.9053111587982833, + "grad_norm": 0.52734375, + "learning_rate": 4.706709713202806e-06, + "loss": 2.4508, + "step": 16875 + }, + { + "epoch": 0.9053648068669528, + "grad_norm": 0.380859375, + "learning_rate": 4.706668882773563e-06, + "loss": 2.3105, + "step": 16876 + }, + { + "epoch": 0.9054184549356223, + "grad_norm": 0.53125, + "learning_rate": 4.706628049679538e-06, + "loss": 2.2373, + "step": 16877 + }, + { + "epoch": 0.9054721030042918, + "grad_norm": 0.474609375, + "learning_rate": 4.70658721392078e-06, + "loss": 2.1114, + "step": 16878 + }, + { + "epoch": 0.9055257510729614, + "grad_norm": 0.46484375, + "learning_rate": 4.7065463754973396e-06, + "loss": 2.1206, + "step": 16879 + }, + { + "epoch": 0.9055793991416309, + "grad_norm": 0.474609375, + "learning_rate": 4.706505534409266e-06, + "loss": 1.975, + "step": 16880 + }, + { + "epoch": 0.9056330472103005, + "grad_norm": 0.392578125, + "learning_rate": 4.706464690656607e-06, + "loss": 2.2067, + "step": 16881 + }, + { + "epoch": 0.9056866952789699, + "grad_norm": 0.65234375, + "learning_rate": 4.706423844239414e-06, + "loss": 2.1431, + "step": 16882 + }, + { + "epoch": 0.9057403433476395, + "grad_norm": 0.48046875, + "learning_rate": 4.706382995157734e-06, + "loss": 2.149, + "step": 16883 + }, + { + "epoch": 0.905793991416309, + "grad_norm": 0.56640625, + "learning_rate": 4.706342143411619e-06, + "loss": 2.2955, + "step": 16884 + }, + { + "epoch": 0.9058476394849786, + "grad_norm": 0.47265625, + "learning_rate": 4.7063012890011165e-06, + "loss": 2.4915, + "step": 16885 + }, + { + "epoch": 0.9059012875536481, + "grad_norm": 0.7265625, + "learning_rate": 4.7062604319262766e-06, + "loss": 2.2061, + "step": 16886 + }, + { + "epoch": 0.9059549356223175, + "grad_norm": 0.59375, + "learning_rate": 4.706219572187149e-06, + "loss": 2.4193, + "step": 16887 + }, + { + "epoch": 0.9060085836909871, + "grad_norm": 0.46484375, + "learning_rate": 4.7061787097837804e-06, + "loss": 2.2849, + "step": 16888 + }, + { + "epoch": 0.9060622317596566, + "grad_norm": 0.435546875, + "learning_rate": 4.706137844716223e-06, + "loss": 2.3148, + "step": 16889 + }, + { + "epoch": 0.9061158798283262, + "grad_norm": 0.482421875, + "learning_rate": 4.706096976984526e-06, + "loss": 2.5239, + "step": 16890 + }, + { + "epoch": 0.9061695278969957, + "grad_norm": 0.466796875, + "learning_rate": 4.7060561065887365e-06, + "loss": 2.3804, + "step": 16891 + }, + { + "epoch": 0.9062231759656653, + "grad_norm": 0.55859375, + "learning_rate": 4.706015233528907e-06, + "loss": 2.4246, + "step": 16892 + }, + { + "epoch": 0.9062768240343347, + "grad_norm": 0.47265625, + "learning_rate": 4.705974357805084e-06, + "loss": 2.2744, + "step": 16893 + }, + { + "epoch": 0.9063304721030043, + "grad_norm": 0.51953125, + "learning_rate": 4.705933479417319e-06, + "loss": 2.3492, + "step": 16894 + }, + { + "epoch": 0.9063841201716738, + "grad_norm": 0.94140625, + "learning_rate": 4.70589259836566e-06, + "loss": 1.6949, + "step": 16895 + }, + { + "epoch": 0.9064377682403434, + "grad_norm": 0.96875, + "learning_rate": 4.705851714650157e-06, + "loss": 2.3471, + "step": 16896 + }, + { + "epoch": 0.9064914163090129, + "grad_norm": 0.56640625, + "learning_rate": 4.705810828270859e-06, + "loss": 2.192, + "step": 16897 + }, + { + "epoch": 0.9065450643776825, + "grad_norm": 0.470703125, + "learning_rate": 4.705769939227816e-06, + "loss": 2.2715, + "step": 16898 + }, + { + "epoch": 0.9065987124463519, + "grad_norm": 0.458984375, + "learning_rate": 4.705729047521077e-06, + "loss": 2.2306, + "step": 16899 + }, + { + "epoch": 0.9066523605150214, + "grad_norm": 0.453125, + "learning_rate": 4.705688153150691e-06, + "loss": 2.0636, + "step": 16900 + }, + { + "epoch": 0.906706008583691, + "grad_norm": 0.455078125, + "learning_rate": 4.705647256116707e-06, + "loss": 2.2029, + "step": 16901 + }, + { + "epoch": 0.9067596566523605, + "grad_norm": 0.59765625, + "learning_rate": 4.7056063564191755e-06, + "loss": 2.4436, + "step": 16902 + }, + { + "epoch": 0.9068133047210301, + "grad_norm": 0.474609375, + "learning_rate": 4.705565454058146e-06, + "loss": 2.6332, + "step": 16903 + }, + { + "epoch": 0.9068669527896995, + "grad_norm": 0.63671875, + "learning_rate": 4.705524549033668e-06, + "loss": 2.6436, + "step": 16904 + }, + { + "epoch": 0.9069206008583691, + "grad_norm": 0.494140625, + "learning_rate": 4.70548364134579e-06, + "loss": 2.3273, + "step": 16905 + }, + { + "epoch": 0.9069742489270386, + "grad_norm": 0.44921875, + "learning_rate": 4.70544273099456e-06, + "loss": 2.2994, + "step": 16906 + }, + { + "epoch": 0.9070278969957082, + "grad_norm": 0.408203125, + "learning_rate": 4.705401817980031e-06, + "loss": 2.2351, + "step": 16907 + }, + { + "epoch": 0.9070815450643777, + "grad_norm": 0.55859375, + "learning_rate": 4.70536090230225e-06, + "loss": 2.387, + "step": 16908 + }, + { + "epoch": 0.9071351931330472, + "grad_norm": 0.48828125, + "learning_rate": 4.705319983961266e-06, + "loss": 2.3738, + "step": 16909 + }, + { + "epoch": 0.9071888412017167, + "grad_norm": 0.431640625, + "learning_rate": 4.70527906295713e-06, + "loss": 2.1139, + "step": 16910 + }, + { + "epoch": 0.9072424892703863, + "grad_norm": 0.65234375, + "learning_rate": 4.70523813928989e-06, + "loss": 2.3741, + "step": 16911 + }, + { + "epoch": 0.9072961373390558, + "grad_norm": 0.53125, + "learning_rate": 4.705197212959598e-06, + "loss": 2.4257, + "step": 16912 + }, + { + "epoch": 0.9073497854077254, + "grad_norm": 0.51953125, + "learning_rate": 4.7051562839663005e-06, + "loss": 2.2235, + "step": 16913 + }, + { + "epoch": 0.9074034334763948, + "grad_norm": 0.52734375, + "learning_rate": 4.705115352310048e-06, + "loss": 2.5829, + "step": 16914 + }, + { + "epoch": 0.9074570815450643, + "grad_norm": 0.5234375, + "learning_rate": 4.7050744179908895e-06, + "loss": 2.3515, + "step": 16915 + }, + { + "epoch": 0.9075107296137339, + "grad_norm": 0.94921875, + "learning_rate": 4.705033481008876e-06, + "loss": 2.3749, + "step": 16916 + }, + { + "epoch": 0.9075643776824034, + "grad_norm": 0.5859375, + "learning_rate": 4.7049925413640545e-06, + "loss": 2.3726, + "step": 16917 + }, + { + "epoch": 0.907618025751073, + "grad_norm": 0.44921875, + "learning_rate": 4.704951599056476e-06, + "loss": 2.1851, + "step": 16918 + }, + { + "epoch": 0.9076716738197425, + "grad_norm": 0.46484375, + "learning_rate": 4.70491065408619e-06, + "loss": 2.4512, + "step": 16919 + }, + { + "epoch": 0.907725321888412, + "grad_norm": 0.50390625, + "learning_rate": 4.704869706453246e-06, + "loss": 2.2423, + "step": 16920 + }, + { + "epoch": 0.9077789699570815, + "grad_norm": 0.435546875, + "learning_rate": 4.704828756157693e-06, + "loss": 2.5735, + "step": 16921 + }, + { + "epoch": 0.9078326180257511, + "grad_norm": 0.458984375, + "learning_rate": 4.704787803199581e-06, + "loss": 2.2705, + "step": 16922 + }, + { + "epoch": 0.9078862660944206, + "grad_norm": 0.443359375, + "learning_rate": 4.7047468475789574e-06, + "loss": 2.4136, + "step": 16923 + }, + { + "epoch": 0.9079399141630902, + "grad_norm": 0.439453125, + "learning_rate": 4.704705889295874e-06, + "loss": 2.2655, + "step": 16924 + }, + { + "epoch": 0.9079935622317596, + "grad_norm": 0.47265625, + "learning_rate": 4.70466492835038e-06, + "loss": 2.155, + "step": 16925 + }, + { + "epoch": 0.9080472103004292, + "grad_norm": 0.55078125, + "learning_rate": 4.704623964742524e-06, + "loss": 2.3338, + "step": 16926 + }, + { + "epoch": 0.9081008583690987, + "grad_norm": 0.455078125, + "learning_rate": 4.704582998472356e-06, + "loss": 2.4539, + "step": 16927 + }, + { + "epoch": 0.9081545064377683, + "grad_norm": 0.427734375, + "learning_rate": 4.7045420295399245e-06, + "loss": 2.2604, + "step": 16928 + }, + { + "epoch": 0.9082081545064378, + "grad_norm": 0.51953125, + "learning_rate": 4.704501057945281e-06, + "loss": 2.6077, + "step": 16929 + }, + { + "epoch": 0.9082618025751072, + "grad_norm": 0.484375, + "learning_rate": 4.704460083688473e-06, + "loss": 2.4635, + "step": 16930 + }, + { + "epoch": 0.9083154506437768, + "grad_norm": 0.4765625, + "learning_rate": 4.704419106769551e-06, + "loss": 2.1803, + "step": 16931 + }, + { + "epoch": 0.9083690987124463, + "grad_norm": 0.4140625, + "learning_rate": 4.704378127188563e-06, + "loss": 2.4555, + "step": 16932 + }, + { + "epoch": 0.9084227467811159, + "grad_norm": 0.474609375, + "learning_rate": 4.7043371449455624e-06, + "loss": 2.2668, + "step": 16933 + }, + { + "epoch": 0.9084763948497854, + "grad_norm": 0.443359375, + "learning_rate": 4.704296160040594e-06, + "loss": 2.4278, + "step": 16934 + }, + { + "epoch": 0.908530042918455, + "grad_norm": 1.953125, + "learning_rate": 4.7042551724737096e-06, + "loss": 2.1079, + "step": 16935 + }, + { + "epoch": 0.9085836909871244, + "grad_norm": 0.5234375, + "learning_rate": 4.704214182244958e-06, + "loss": 2.4723, + "step": 16936 + }, + { + "epoch": 0.908637339055794, + "grad_norm": 0.484375, + "learning_rate": 4.70417318935439e-06, + "loss": 2.4098, + "step": 16937 + }, + { + "epoch": 0.9086909871244635, + "grad_norm": 0.40625, + "learning_rate": 4.704132193802054e-06, + "loss": 2.3017, + "step": 16938 + }, + { + "epoch": 0.9087446351931331, + "grad_norm": 0.55859375, + "learning_rate": 4.704091195588e-06, + "loss": 2.1075, + "step": 16939 + }, + { + "epoch": 0.9087982832618026, + "grad_norm": 0.482421875, + "learning_rate": 4.704050194712277e-06, + "loss": 2.3091, + "step": 16940 + }, + { + "epoch": 0.9088519313304722, + "grad_norm": 0.484375, + "learning_rate": 4.7040091911749345e-06, + "loss": 2.4143, + "step": 16941 + }, + { + "epoch": 0.9089055793991416, + "grad_norm": 0.470703125, + "learning_rate": 4.703968184976022e-06, + "loss": 2.4435, + "step": 16942 + }, + { + "epoch": 0.9089592274678111, + "grad_norm": 0.44921875, + "learning_rate": 4.703927176115589e-06, + "loss": 1.998, + "step": 16943 + }, + { + "epoch": 0.9090128755364807, + "grad_norm": 0.484375, + "learning_rate": 4.703886164593686e-06, + "loss": 2.3498, + "step": 16944 + }, + { + "epoch": 0.9090665236051502, + "grad_norm": 0.4765625, + "learning_rate": 4.703845150410361e-06, + "loss": 2.476, + "step": 16945 + }, + { + "epoch": 0.9091201716738198, + "grad_norm": 0.462890625, + "learning_rate": 4.703804133565665e-06, + "loss": 2.5432, + "step": 16946 + }, + { + "epoch": 0.9091738197424892, + "grad_norm": 0.5078125, + "learning_rate": 4.703763114059648e-06, + "loss": 2.4439, + "step": 16947 + }, + { + "epoch": 0.9092274678111588, + "grad_norm": 0.5546875, + "learning_rate": 4.703722091892356e-06, + "loss": 1.1261, + "step": 16948 + }, + { + "epoch": 0.9092811158798283, + "grad_norm": 0.486328125, + "learning_rate": 4.703681067063842e-06, + "loss": 2.1705, + "step": 16949 + }, + { + "epoch": 0.9093347639484979, + "grad_norm": 0.498046875, + "learning_rate": 4.703640039574156e-06, + "loss": 2.3561, + "step": 16950 + }, + { + "epoch": 0.9093884120171674, + "grad_norm": 0.56640625, + "learning_rate": 4.703599009423344e-06, + "loss": 1.2869, + "step": 16951 + }, + { + "epoch": 0.909442060085837, + "grad_norm": 0.498046875, + "learning_rate": 4.703557976611458e-06, + "loss": 2.2646, + "step": 16952 + }, + { + "epoch": 0.9094957081545064, + "grad_norm": 0.578125, + "learning_rate": 4.703516941138547e-06, + "loss": 2.2995, + "step": 16953 + }, + { + "epoch": 0.909549356223176, + "grad_norm": 0.451171875, + "learning_rate": 4.7034759030046615e-06, + "loss": 2.138, + "step": 16954 + }, + { + "epoch": 0.9096030042918455, + "grad_norm": 0.4453125, + "learning_rate": 4.70343486220985e-06, + "loss": 2.1084, + "step": 16955 + }, + { + "epoch": 0.9096566523605151, + "grad_norm": 1.0, + "learning_rate": 4.703393818754162e-06, + "loss": 2.569, + "step": 16956 + }, + { + "epoch": 0.9097103004291845, + "grad_norm": 0.47265625, + "learning_rate": 4.703352772637647e-06, + "loss": 2.3181, + "step": 16957 + }, + { + "epoch": 0.909763948497854, + "grad_norm": 0.498046875, + "learning_rate": 4.703311723860356e-06, + "loss": 2.4446, + "step": 16958 + }, + { + "epoch": 0.9098175965665236, + "grad_norm": 0.484375, + "learning_rate": 4.703270672422336e-06, + "loss": 2.1471, + "step": 16959 + }, + { + "epoch": 0.9098712446351931, + "grad_norm": 0.484375, + "learning_rate": 4.703229618323639e-06, + "loss": 2.405, + "step": 16960 + }, + { + "epoch": 0.9099248927038627, + "grad_norm": 0.54296875, + "learning_rate": 4.703188561564314e-06, + "loss": 2.1857, + "step": 16961 + }, + { + "epoch": 0.9099785407725322, + "grad_norm": 0.46875, + "learning_rate": 4.70314750214441e-06, + "loss": 2.2655, + "step": 16962 + }, + { + "epoch": 0.9100321888412017, + "grad_norm": 0.55078125, + "learning_rate": 4.703106440063977e-06, + "loss": 2.3637, + "step": 16963 + }, + { + "epoch": 0.9100858369098712, + "grad_norm": 0.427734375, + "learning_rate": 4.703065375323064e-06, + "loss": 2.1627, + "step": 16964 + }, + { + "epoch": 0.9101394849785408, + "grad_norm": 0.4453125, + "learning_rate": 4.703024307921721e-06, + "loss": 2.2918, + "step": 16965 + }, + { + "epoch": 0.9101931330472103, + "grad_norm": 0.490234375, + "learning_rate": 4.702983237859997e-06, + "loss": 2.435, + "step": 16966 + }, + { + "epoch": 0.9102467811158799, + "grad_norm": 0.484375, + "learning_rate": 4.702942165137943e-06, + "loss": 2.3212, + "step": 16967 + }, + { + "epoch": 0.9103004291845493, + "grad_norm": 0.69140625, + "learning_rate": 4.702901089755608e-06, + "loss": 2.3487, + "step": 16968 + }, + { + "epoch": 0.9103540772532189, + "grad_norm": 0.380859375, + "learning_rate": 4.702860011713041e-06, + "loss": 2.1914, + "step": 16969 + }, + { + "epoch": 0.9104077253218884, + "grad_norm": 0.458984375, + "learning_rate": 4.7028189310102926e-06, + "loss": 2.1317, + "step": 16970 + }, + { + "epoch": 0.910461373390558, + "grad_norm": 1.140625, + "learning_rate": 4.702777847647411e-06, + "loss": 2.0916, + "step": 16971 + }, + { + "epoch": 0.9105150214592275, + "grad_norm": 0.470703125, + "learning_rate": 4.702736761624447e-06, + "loss": 2.2853, + "step": 16972 + }, + { + "epoch": 0.910568669527897, + "grad_norm": 0.4296875, + "learning_rate": 4.7026956729414494e-06, + "loss": 2.4337, + "step": 16973 + }, + { + "epoch": 0.9106223175965665, + "grad_norm": 0.5078125, + "learning_rate": 4.702654581598468e-06, + "loss": 2.2304, + "step": 16974 + }, + { + "epoch": 0.910675965665236, + "grad_norm": 0.48046875, + "learning_rate": 4.702613487595554e-06, + "loss": 2.3412, + "step": 16975 + }, + { + "epoch": 0.9107296137339056, + "grad_norm": 0.58984375, + "learning_rate": 4.702572390932755e-06, + "loss": 2.5127, + "step": 16976 + }, + { + "epoch": 0.9107832618025751, + "grad_norm": 0.5, + "learning_rate": 4.702531291610121e-06, + "loss": 2.4245, + "step": 16977 + }, + { + "epoch": 0.9108369098712447, + "grad_norm": 0.482421875, + "learning_rate": 4.7024901896277016e-06, + "loss": 2.5677, + "step": 16978 + }, + { + "epoch": 0.9108905579399141, + "grad_norm": 0.9609375, + "learning_rate": 4.702449084985547e-06, + "loss": 2.2836, + "step": 16979 + }, + { + "epoch": 0.9109442060085837, + "grad_norm": 0.5234375, + "learning_rate": 4.702407977683708e-06, + "loss": 2.4232, + "step": 16980 + }, + { + "epoch": 0.9109978540772532, + "grad_norm": 0.578125, + "learning_rate": 4.702366867722231e-06, + "loss": 2.4609, + "step": 16981 + }, + { + "epoch": 0.9110515021459228, + "grad_norm": 0.392578125, + "learning_rate": 4.702325755101169e-06, + "loss": 2.3374, + "step": 16982 + }, + { + "epoch": 0.9111051502145923, + "grad_norm": 0.43359375, + "learning_rate": 4.70228463982057e-06, + "loss": 2.1105, + "step": 16983 + }, + { + "epoch": 0.9111587982832619, + "grad_norm": 0.51171875, + "learning_rate": 4.7022435218804826e-06, + "loss": 2.3209, + "step": 16984 + }, + { + "epoch": 0.9112124463519313, + "grad_norm": 0.462890625, + "learning_rate": 4.7022024012809585e-06, + "loss": 2.2401, + "step": 16985 + }, + { + "epoch": 0.9112660944206008, + "grad_norm": 0.5078125, + "learning_rate": 4.702161278022047e-06, + "loss": 2.2961, + "step": 16986 + }, + { + "epoch": 0.9113197424892704, + "grad_norm": 0.42578125, + "learning_rate": 4.702120152103796e-06, + "loss": 2.0982, + "step": 16987 + }, + { + "epoch": 0.9113733905579399, + "grad_norm": 0.546875, + "learning_rate": 4.702079023526258e-06, + "loss": 1.9645, + "step": 16988 + }, + { + "epoch": 0.9114270386266095, + "grad_norm": 0.4921875, + "learning_rate": 4.70203789228948e-06, + "loss": 1.7023, + "step": 16989 + }, + { + "epoch": 0.9114806866952789, + "grad_norm": 0.35546875, + "learning_rate": 4.701996758393513e-06, + "loss": 2.2666, + "step": 16990 + }, + { + "epoch": 0.9115343347639485, + "grad_norm": 0.392578125, + "learning_rate": 4.701955621838407e-06, + "loss": 2.1145, + "step": 16991 + }, + { + "epoch": 0.911587982832618, + "grad_norm": 0.48046875, + "learning_rate": 4.70191448262421e-06, + "loss": 2.1629, + "step": 16992 + }, + { + "epoch": 0.9116416309012876, + "grad_norm": 0.45703125, + "learning_rate": 4.7018733407509744e-06, + "loss": 2.3473, + "step": 16993 + }, + { + "epoch": 0.9116952789699571, + "grad_norm": 1.015625, + "learning_rate": 4.701832196218747e-06, + "loss": 2.6355, + "step": 16994 + }, + { + "epoch": 0.9117489270386266, + "grad_norm": 0.4375, + "learning_rate": 4.70179104902758e-06, + "loss": 2.1323, + "step": 16995 + }, + { + "epoch": 0.9118025751072961, + "grad_norm": 0.6484375, + "learning_rate": 4.70174989917752e-06, + "loss": 1.4922, + "step": 16996 + }, + { + "epoch": 0.9118562231759657, + "grad_norm": 0.5, + "learning_rate": 4.70170874666862e-06, + "loss": 2.3226, + "step": 16997 + }, + { + "epoch": 0.9119098712446352, + "grad_norm": 0.484375, + "learning_rate": 4.701667591500928e-06, + "loss": 2.3582, + "step": 16998 + }, + { + "epoch": 0.9119635193133048, + "grad_norm": 0.423828125, + "learning_rate": 4.7016264336744945e-06, + "loss": 2.1402, + "step": 16999 + }, + { + "epoch": 0.9120171673819742, + "grad_norm": 0.44140625, + "learning_rate": 4.701585273189367e-06, + "loss": 2.2928, + "step": 17000 + }, + { + "epoch": 0.9120708154506437, + "grad_norm": 0.439453125, + "learning_rate": 4.701544110045598e-06, + "loss": 2.5351, + "step": 17001 + }, + { + "epoch": 0.9121244635193133, + "grad_norm": 0.50390625, + "learning_rate": 4.701502944243237e-06, + "loss": 2.4516, + "step": 17002 + }, + { + "epoch": 0.9121781115879828, + "grad_norm": 0.50390625, + "learning_rate": 4.701461775782331e-06, + "loss": 2.2966, + "step": 17003 + }, + { + "epoch": 0.9122317596566524, + "grad_norm": 0.51171875, + "learning_rate": 4.701420604662933e-06, + "loss": 2.2023, + "step": 17004 + }, + { + "epoch": 0.9122854077253219, + "grad_norm": 0.40625, + "learning_rate": 4.70137943088509e-06, + "loss": 2.2316, + "step": 17005 + }, + { + "epoch": 0.9123390557939914, + "grad_norm": 0.431640625, + "learning_rate": 4.701338254448854e-06, + "loss": 2.3547, + "step": 17006 + }, + { + "epoch": 0.9123927038626609, + "grad_norm": 0.48046875, + "learning_rate": 4.701297075354273e-06, + "loss": 2.2036, + "step": 17007 + }, + { + "epoch": 0.9124463519313305, + "grad_norm": 0.4296875, + "learning_rate": 4.701255893601398e-06, + "loss": 2.1483, + "step": 17008 + }, + { + "epoch": 0.9125, + "grad_norm": 0.458984375, + "learning_rate": 4.701214709190277e-06, + "loss": 2.4044, + "step": 17009 + }, + { + "epoch": 0.9125536480686696, + "grad_norm": 0.3828125, + "learning_rate": 4.701173522120962e-06, + "loss": 2.0014, + "step": 17010 + }, + { + "epoch": 0.912607296137339, + "grad_norm": 0.4921875, + "learning_rate": 4.701132332393501e-06, + "loss": 2.0218, + "step": 17011 + }, + { + "epoch": 0.9126609442060086, + "grad_norm": 0.470703125, + "learning_rate": 4.701091140007945e-06, + "loss": 2.3036, + "step": 17012 + }, + { + "epoch": 0.9127145922746781, + "grad_norm": 0.439453125, + "learning_rate": 4.701049944964342e-06, + "loss": 2.2931, + "step": 17013 + }, + { + "epoch": 0.9127682403433477, + "grad_norm": 0.47265625, + "learning_rate": 4.701008747262744e-06, + "loss": 2.3222, + "step": 17014 + }, + { + "epoch": 0.9128218884120172, + "grad_norm": 0.46484375, + "learning_rate": 4.700967546903199e-06, + "loss": 2.1964, + "step": 17015 + }, + { + "epoch": 0.9128755364806866, + "grad_norm": 0.48046875, + "learning_rate": 4.700926343885758e-06, + "loss": 2.2047, + "step": 17016 + }, + { + "epoch": 0.9129291845493562, + "grad_norm": 0.44140625, + "learning_rate": 4.700885138210469e-06, + "loss": 2.4821, + "step": 17017 + }, + { + "epoch": 0.9129828326180257, + "grad_norm": 0.453125, + "learning_rate": 4.700843929877383e-06, + "loss": 2.2376, + "step": 17018 + }, + { + "epoch": 0.9130364806866953, + "grad_norm": 0.55078125, + "learning_rate": 4.70080271888655e-06, + "loss": 2.2115, + "step": 17019 + }, + { + "epoch": 0.9130901287553648, + "grad_norm": 0.68359375, + "learning_rate": 4.700761505238019e-06, + "loss": 2.1931, + "step": 17020 + }, + { + "epoch": 0.9131437768240344, + "grad_norm": 0.453125, + "learning_rate": 4.700720288931842e-06, + "loss": 2.1333, + "step": 17021 + }, + { + "epoch": 0.9131974248927038, + "grad_norm": 0.96875, + "learning_rate": 4.7006790699680646e-06, + "loss": 2.4584, + "step": 17022 + }, + { + "epoch": 0.9132510729613734, + "grad_norm": 0.546875, + "learning_rate": 4.700637848346739e-06, + "loss": 2.2625, + "step": 17023 + }, + { + "epoch": 0.9133047210300429, + "grad_norm": 0.5234375, + "learning_rate": 4.700596624067917e-06, + "loss": 2.166, + "step": 17024 + }, + { + "epoch": 0.9133583690987125, + "grad_norm": 0.5625, + "learning_rate": 4.700555397131644e-06, + "loss": 2.346, + "step": 17025 + }, + { + "epoch": 0.913412017167382, + "grad_norm": 0.4765625, + "learning_rate": 4.700514167537973e-06, + "loss": 2.2653, + "step": 17026 + }, + { + "epoch": 0.9134656652360515, + "grad_norm": 0.443359375, + "learning_rate": 4.7004729352869535e-06, + "loss": 2.3207, + "step": 17027 + }, + { + "epoch": 0.913519313304721, + "grad_norm": 0.416015625, + "learning_rate": 4.700431700378634e-06, + "loss": 2.2127, + "step": 17028 + }, + { + "epoch": 0.9135729613733906, + "grad_norm": 0.49609375, + "learning_rate": 4.700390462813065e-06, + "loss": 2.3742, + "step": 17029 + }, + { + "epoch": 0.9136266094420601, + "grad_norm": 0.76953125, + "learning_rate": 4.700349222590296e-06, + "loss": 2.3778, + "step": 17030 + }, + { + "epoch": 0.9136802575107296, + "grad_norm": 0.44140625, + "learning_rate": 4.700307979710377e-06, + "loss": 2.3079, + "step": 17031 + }, + { + "epoch": 0.9137339055793992, + "grad_norm": 0.5625, + "learning_rate": 4.7002667341733575e-06, + "loss": 2.321, + "step": 17032 + }, + { + "epoch": 0.9137875536480686, + "grad_norm": 0.5234375, + "learning_rate": 4.700225485979288e-06, + "loss": 2.1029, + "step": 17033 + }, + { + "epoch": 0.9138412017167382, + "grad_norm": 0.427734375, + "learning_rate": 4.700184235128219e-06, + "loss": 2.1164, + "step": 17034 + }, + { + "epoch": 0.9138948497854077, + "grad_norm": 0.45703125, + "learning_rate": 4.7001429816201984e-06, + "loss": 1.9037, + "step": 17035 + }, + { + "epoch": 0.9139484978540773, + "grad_norm": 0.486328125, + "learning_rate": 4.700101725455276e-06, + "loss": 1.8428, + "step": 17036 + }, + { + "epoch": 0.9140021459227468, + "grad_norm": 0.77734375, + "learning_rate": 4.700060466633504e-06, + "loss": 2.3522, + "step": 17037 + }, + { + "epoch": 0.9140557939914163, + "grad_norm": 0.494140625, + "learning_rate": 4.70001920515493e-06, + "loss": 2.3238, + "step": 17038 + }, + { + "epoch": 0.9141094420600858, + "grad_norm": 0.458984375, + "learning_rate": 4.699977941019605e-06, + "loss": 2.2212, + "step": 17039 + }, + { + "epoch": 0.9141630901287554, + "grad_norm": 0.49609375, + "learning_rate": 4.699936674227578e-06, + "loss": 2.0266, + "step": 17040 + }, + { + "epoch": 0.9142167381974249, + "grad_norm": 0.498046875, + "learning_rate": 4.699895404778899e-06, + "loss": 2.1464, + "step": 17041 + }, + { + "epoch": 0.9142703862660945, + "grad_norm": 0.447265625, + "learning_rate": 4.699854132673619e-06, + "loss": 2.1538, + "step": 17042 + }, + { + "epoch": 0.914324034334764, + "grad_norm": 0.455078125, + "learning_rate": 4.699812857911786e-06, + "loss": 2.4823, + "step": 17043 + }, + { + "epoch": 0.9143776824034334, + "grad_norm": 0.462890625, + "learning_rate": 4.699771580493451e-06, + "loss": 2.3933, + "step": 17044 + }, + { + "epoch": 0.914431330472103, + "grad_norm": 0.490234375, + "learning_rate": 4.699730300418663e-06, + "loss": 2.5348, + "step": 17045 + }, + { + "epoch": 0.9144849785407725, + "grad_norm": 0.474609375, + "learning_rate": 4.6996890176874735e-06, + "loss": 2.2396, + "step": 17046 + }, + { + "epoch": 0.9145386266094421, + "grad_norm": 0.51171875, + "learning_rate": 4.699647732299932e-06, + "loss": 2.2085, + "step": 17047 + }, + { + "epoch": 0.9145922746781115, + "grad_norm": 0.498046875, + "learning_rate": 4.699606444256085e-06, + "loss": 2.2451, + "step": 17048 + }, + { + "epoch": 0.9146459227467811, + "grad_norm": 0.4453125, + "learning_rate": 4.699565153555987e-06, + "loss": 2.1632, + "step": 17049 + }, + { + "epoch": 0.9146995708154506, + "grad_norm": 0.48828125, + "learning_rate": 4.699523860199685e-06, + "loss": 2.3482, + "step": 17050 + }, + { + "epoch": 0.9147532188841202, + "grad_norm": 0.40625, + "learning_rate": 4.69948256418723e-06, + "loss": 2.3051, + "step": 17051 + }, + { + "epoch": 0.9148068669527897, + "grad_norm": 0.7734375, + "learning_rate": 4.6994412655186725e-06, + "loss": 2.1614, + "step": 17052 + }, + { + "epoch": 0.9148605150214593, + "grad_norm": 0.40625, + "learning_rate": 4.699399964194061e-06, + "loss": 1.991, + "step": 17053 + }, + { + "epoch": 0.9149141630901287, + "grad_norm": 0.4453125, + "learning_rate": 4.699358660213445e-06, + "loss": 2.2612, + "step": 17054 + }, + { + "epoch": 0.9149678111587983, + "grad_norm": 0.515625, + "learning_rate": 4.699317353576876e-06, + "loss": 2.1575, + "step": 17055 + }, + { + "epoch": 0.9150214592274678, + "grad_norm": 0.59765625, + "learning_rate": 4.699276044284403e-06, + "loss": 2.2772, + "step": 17056 + }, + { + "epoch": 0.9150751072961374, + "grad_norm": 0.484375, + "learning_rate": 4.699234732336076e-06, + "loss": 2.3087, + "step": 17057 + }, + { + "epoch": 0.9151287553648069, + "grad_norm": 0.451171875, + "learning_rate": 4.699193417731946e-06, + "loss": 2.2234, + "step": 17058 + }, + { + "epoch": 0.9151824034334763, + "grad_norm": 0.498046875, + "learning_rate": 4.69915210047206e-06, + "loss": 1.6029, + "step": 17059 + }, + { + "epoch": 0.9152360515021459, + "grad_norm": 0.44140625, + "learning_rate": 4.69911078055647e-06, + "loss": 2.3207, + "step": 17060 + }, + { + "epoch": 0.9152896995708154, + "grad_norm": 0.458984375, + "learning_rate": 4.699069457985226e-06, + "loss": 2.4322, + "step": 17061 + }, + { + "epoch": 0.915343347639485, + "grad_norm": 0.5546875, + "learning_rate": 4.699028132758378e-06, + "loss": 2.2606, + "step": 17062 + }, + { + "epoch": 0.9153969957081545, + "grad_norm": 0.61328125, + "learning_rate": 4.698986804875974e-06, + "loss": 2.2615, + "step": 17063 + }, + { + "epoch": 0.9154506437768241, + "grad_norm": 0.443359375, + "learning_rate": 4.6989454743380665e-06, + "loss": 2.1232, + "step": 17064 + }, + { + "epoch": 0.9155042918454935, + "grad_norm": 0.515625, + "learning_rate": 4.698904141144703e-06, + "loss": 2.1888, + "step": 17065 + }, + { + "epoch": 0.9155579399141631, + "grad_norm": 0.6796875, + "learning_rate": 4.698862805295936e-06, + "loss": 2.1976, + "step": 17066 + }, + { + "epoch": 0.9156115879828326, + "grad_norm": 0.51953125, + "learning_rate": 4.698821466791813e-06, + "loss": 2.3735, + "step": 17067 + }, + { + "epoch": 0.9156652360515022, + "grad_norm": 0.5, + "learning_rate": 4.698780125632386e-06, + "loss": 2.5135, + "step": 17068 + }, + { + "epoch": 0.9157188841201717, + "grad_norm": 0.59765625, + "learning_rate": 4.698738781817703e-06, + "loss": 2.1416, + "step": 17069 + }, + { + "epoch": 0.9157725321888412, + "grad_norm": 0.447265625, + "learning_rate": 4.698697435347815e-06, + "loss": 2.3041, + "step": 17070 + }, + { + "epoch": 0.9158261802575107, + "grad_norm": 0.494140625, + "learning_rate": 4.698656086222771e-06, + "loss": 2.3315, + "step": 17071 + }, + { + "epoch": 0.9158798283261803, + "grad_norm": 0.4375, + "learning_rate": 4.6986147344426235e-06, + "loss": 1.4249, + "step": 17072 + }, + { + "epoch": 0.9159334763948498, + "grad_norm": 0.490234375, + "learning_rate": 4.69857338000742e-06, + "loss": 2.1788, + "step": 17073 + }, + { + "epoch": 0.9159871244635193, + "grad_norm": 0.416015625, + "learning_rate": 4.698532022917211e-06, + "loss": 2.3844, + "step": 17074 + }, + { + "epoch": 0.9160407725321889, + "grad_norm": 0.5546875, + "learning_rate": 4.698490663172045e-06, + "loss": 2.294, + "step": 17075 + }, + { + "epoch": 0.9160944206008583, + "grad_norm": 1.0390625, + "learning_rate": 4.698449300771974e-06, + "loss": 2.2782, + "step": 17076 + }, + { + "epoch": 0.9161480686695279, + "grad_norm": 0.439453125, + "learning_rate": 4.698407935717048e-06, + "loss": 2.2478, + "step": 17077 + }, + { + "epoch": 0.9162017167381974, + "grad_norm": 0.51171875, + "learning_rate": 4.698366568007317e-06, + "loss": 2.2997, + "step": 17078 + }, + { + "epoch": 0.916255364806867, + "grad_norm": 0.55078125, + "learning_rate": 4.6983251976428305e-06, + "loss": 2.2601, + "step": 17079 + }, + { + "epoch": 0.9163090128755365, + "grad_norm": 0.404296875, + "learning_rate": 4.698283824623637e-06, + "loss": 2.2351, + "step": 17080 + }, + { + "epoch": 0.916362660944206, + "grad_norm": 0.765625, + "learning_rate": 4.698242448949788e-06, + "loss": 2.2184, + "step": 17081 + }, + { + "epoch": 0.9164163090128755, + "grad_norm": 0.455078125, + "learning_rate": 4.6982010706213334e-06, + "loss": 2.365, + "step": 17082 + }, + { + "epoch": 0.9164699570815451, + "grad_norm": 0.55078125, + "learning_rate": 4.698159689638322e-06, + "loss": 2.5088, + "step": 17083 + }, + { + "epoch": 0.9165236051502146, + "grad_norm": 0.50390625, + "learning_rate": 4.6981183060008065e-06, + "loss": 2.4173, + "step": 17084 + }, + { + "epoch": 0.9165772532188842, + "grad_norm": 0.443359375, + "learning_rate": 4.698076919708834e-06, + "loss": 2.1697, + "step": 17085 + }, + { + "epoch": 0.9166309012875536, + "grad_norm": 0.796875, + "learning_rate": 4.698035530762455e-06, + "loss": 1.7488, + "step": 17086 + }, + { + "epoch": 0.9166845493562231, + "grad_norm": 0.41796875, + "learning_rate": 4.697994139161721e-06, + "loss": 2.334, + "step": 17087 + }, + { + "epoch": 0.9167381974248927, + "grad_norm": 0.48828125, + "learning_rate": 4.6979527449066804e-06, + "loss": 2.2526, + "step": 17088 + }, + { + "epoch": 0.9167918454935622, + "grad_norm": 0.51171875, + "learning_rate": 4.697911347997385e-06, + "loss": 2.2138, + "step": 17089 + }, + { + "epoch": 0.9168454935622318, + "grad_norm": 0.765625, + "learning_rate": 4.697869948433883e-06, + "loss": 2.2537, + "step": 17090 + }, + { + "epoch": 0.9168991416309012, + "grad_norm": 0.4765625, + "learning_rate": 4.697828546216224e-06, + "loss": 2.0962, + "step": 17091 + }, + { + "epoch": 0.9169527896995708, + "grad_norm": 0.474609375, + "learning_rate": 4.6977871413444595e-06, + "loss": 1.5613, + "step": 17092 + }, + { + "epoch": 0.9170064377682403, + "grad_norm": 0.447265625, + "learning_rate": 4.69774573381864e-06, + "loss": 2.3176, + "step": 17093 + }, + { + "epoch": 0.9170600858369099, + "grad_norm": 0.419921875, + "learning_rate": 4.697704323638814e-06, + "loss": 1.5768, + "step": 17094 + }, + { + "epoch": 0.9171137339055794, + "grad_norm": 0.400390625, + "learning_rate": 4.697662910805032e-06, + "loss": 2.0833, + "step": 17095 + }, + { + "epoch": 0.917167381974249, + "grad_norm": 0.474609375, + "learning_rate": 4.697621495317344e-06, + "loss": 2.066, + "step": 17096 + }, + { + "epoch": 0.9172210300429184, + "grad_norm": 0.494140625, + "learning_rate": 4.697580077175799e-06, + "loss": 2.41, + "step": 17097 + }, + { + "epoch": 0.917274678111588, + "grad_norm": 0.55859375, + "learning_rate": 4.69753865638045e-06, + "loss": 2.5491, + "step": 17098 + }, + { + "epoch": 0.9173283261802575, + "grad_norm": 0.474609375, + "learning_rate": 4.697497232931344e-06, + "loss": 2.6717, + "step": 17099 + }, + { + "epoch": 0.9173819742489271, + "grad_norm": 0.5546875, + "learning_rate": 4.697455806828532e-06, + "loss": 2.4551, + "step": 17100 + }, + { + "epoch": 0.9174356223175966, + "grad_norm": 0.53125, + "learning_rate": 4.697414378072064e-06, + "loss": 2.3071, + "step": 17101 + }, + { + "epoch": 0.917489270386266, + "grad_norm": 0.419921875, + "learning_rate": 4.697372946661991e-06, + "loss": 2.1574, + "step": 17102 + }, + { + "epoch": 0.9175429184549356, + "grad_norm": 0.427734375, + "learning_rate": 4.697331512598362e-06, + "loss": 2.5167, + "step": 17103 + }, + { + "epoch": 0.9175965665236051, + "grad_norm": 0.4609375, + "learning_rate": 4.697290075881226e-06, + "loss": 2.0828, + "step": 17104 + }, + { + "epoch": 0.9176502145922747, + "grad_norm": 0.5859375, + "learning_rate": 4.697248636510636e-06, + "loss": 2.1312, + "step": 17105 + }, + { + "epoch": 0.9177038626609442, + "grad_norm": 0.458984375, + "learning_rate": 4.697207194486639e-06, + "loss": 2.2498, + "step": 17106 + }, + { + "epoch": 0.9177575107296138, + "grad_norm": 0.48828125, + "learning_rate": 4.697165749809287e-06, + "loss": 2.3527, + "step": 17107 + }, + { + "epoch": 0.9178111587982832, + "grad_norm": 0.5703125, + "learning_rate": 4.697124302478629e-06, + "loss": 2.0723, + "step": 17108 + }, + { + "epoch": 0.9178648068669528, + "grad_norm": 0.625, + "learning_rate": 4.697082852494717e-06, + "loss": 2.574, + "step": 17109 + }, + { + "epoch": 0.9179184549356223, + "grad_norm": 0.40625, + "learning_rate": 4.697041399857598e-06, + "loss": 1.9969, + "step": 17110 + }, + { + "epoch": 0.9179721030042919, + "grad_norm": 0.4609375, + "learning_rate": 4.696999944567324e-06, + "loss": 2.1793, + "step": 17111 + }, + { + "epoch": 0.9180257510729614, + "grad_norm": 0.49609375, + "learning_rate": 4.696958486623944e-06, + "loss": 2.4037, + "step": 17112 + }, + { + "epoch": 0.918079399141631, + "grad_norm": 0.4296875, + "learning_rate": 4.696917026027509e-06, + "loss": 2.1741, + "step": 17113 + }, + { + "epoch": 0.9181330472103004, + "grad_norm": 0.4921875, + "learning_rate": 4.696875562778069e-06, + "loss": 2.2557, + "step": 17114 + }, + { + "epoch": 0.91818669527897, + "grad_norm": 0.5390625, + "learning_rate": 4.696834096875673e-06, + "loss": 2.3138, + "step": 17115 + }, + { + "epoch": 0.9182403433476395, + "grad_norm": 0.50390625, + "learning_rate": 4.696792628320374e-06, + "loss": 2.3575, + "step": 17116 + }, + { + "epoch": 0.918293991416309, + "grad_norm": 0.52734375, + "learning_rate": 4.6967511571122186e-06, + "loss": 2.254, + "step": 17117 + }, + { + "epoch": 0.9183476394849786, + "grad_norm": 0.58984375, + "learning_rate": 4.696709683251258e-06, + "loss": 2.3426, + "step": 17118 + }, + { + "epoch": 0.918401287553648, + "grad_norm": 0.4765625, + "learning_rate": 4.696668206737543e-06, + "loss": 2.4069, + "step": 17119 + }, + { + "epoch": 0.9184549356223176, + "grad_norm": 0.4609375, + "learning_rate": 4.696626727571123e-06, + "loss": 2.327, + "step": 17120 + }, + { + "epoch": 0.9185085836909871, + "grad_norm": 0.451171875, + "learning_rate": 4.696585245752049e-06, + "loss": 2.2844, + "step": 17121 + }, + { + "epoch": 0.9185622317596567, + "grad_norm": 0.515625, + "learning_rate": 4.696543761280369e-06, + "loss": 2.2719, + "step": 17122 + }, + { + "epoch": 0.9186158798283262, + "grad_norm": 0.421875, + "learning_rate": 4.696502274156136e-06, + "loss": 2.3616, + "step": 17123 + }, + { + "epoch": 0.9186695278969957, + "grad_norm": 0.447265625, + "learning_rate": 4.696460784379398e-06, + "loss": 2.2528, + "step": 17124 + }, + { + "epoch": 0.9187231759656652, + "grad_norm": 0.404296875, + "learning_rate": 4.696419291950206e-06, + "loss": 2.1346, + "step": 17125 + }, + { + "epoch": 0.9187768240343348, + "grad_norm": 0.45703125, + "learning_rate": 4.696377796868609e-06, + "loss": 2.2305, + "step": 17126 + }, + { + "epoch": 0.9188304721030043, + "grad_norm": 0.490234375, + "learning_rate": 4.696336299134659e-06, + "loss": 2.247, + "step": 17127 + }, + { + "epoch": 0.9188841201716739, + "grad_norm": 0.55078125, + "learning_rate": 4.696294798748404e-06, + "loss": 2.3162, + "step": 17128 + }, + { + "epoch": 0.9189377682403433, + "grad_norm": 0.474609375, + "learning_rate": 4.696253295709896e-06, + "loss": 2.1514, + "step": 17129 + }, + { + "epoch": 0.9189914163090128, + "grad_norm": 0.478515625, + "learning_rate": 4.696211790019184e-06, + "loss": 2.4157, + "step": 17130 + }, + { + "epoch": 0.9190450643776824, + "grad_norm": 0.443359375, + "learning_rate": 4.696170281676319e-06, + "loss": 2.1855, + "step": 17131 + }, + { + "epoch": 0.9190987124463519, + "grad_norm": 0.54296875, + "learning_rate": 4.69612877068135e-06, + "loss": 1.5206, + "step": 17132 + }, + { + "epoch": 0.9191523605150215, + "grad_norm": 0.494140625, + "learning_rate": 4.696087257034327e-06, + "loss": 2.3839, + "step": 17133 + }, + { + "epoch": 0.919206008583691, + "grad_norm": 0.421875, + "learning_rate": 4.6960457407353024e-06, + "loss": 2.1383, + "step": 17134 + }, + { + "epoch": 0.9192596566523605, + "grad_norm": 8.5, + "learning_rate": 4.696004221784324e-06, + "loss": 2.3062, + "step": 17135 + }, + { + "epoch": 0.91931330472103, + "grad_norm": 0.482421875, + "learning_rate": 4.695962700181442e-06, + "loss": 2.1192, + "step": 17136 + }, + { + "epoch": 0.9193669527896996, + "grad_norm": 0.494140625, + "learning_rate": 4.695921175926708e-06, + "loss": 2.2909, + "step": 17137 + }, + { + "epoch": 0.9194206008583691, + "grad_norm": 0.390625, + "learning_rate": 4.695879649020171e-06, + "loss": 2.1239, + "step": 17138 + }, + { + "epoch": 0.9194742489270387, + "grad_norm": 0.490234375, + "learning_rate": 4.695838119461881e-06, + "loss": 2.3372, + "step": 17139 + }, + { + "epoch": 0.9195278969957081, + "grad_norm": 0.44921875, + "learning_rate": 4.695796587251889e-06, + "loss": 2.2803, + "step": 17140 + }, + { + "epoch": 0.9195815450643777, + "grad_norm": 0.8125, + "learning_rate": 4.695755052390245e-06, + "loss": 1.8878, + "step": 17141 + }, + { + "epoch": 0.9196351931330472, + "grad_norm": 0.515625, + "learning_rate": 4.695713514877e-06, + "loss": 2.8479, + "step": 17142 + }, + { + "epoch": 0.9196888412017168, + "grad_norm": 0.376953125, + "learning_rate": 4.695671974712201e-06, + "loss": 1.9298, + "step": 17143 + }, + { + "epoch": 0.9197424892703863, + "grad_norm": 0.482421875, + "learning_rate": 4.695630431895902e-06, + "loss": 2.3901, + "step": 17144 + }, + { + "epoch": 0.9197961373390557, + "grad_norm": 0.5234375, + "learning_rate": 4.695588886428151e-06, + "loss": 2.3437, + "step": 17145 + }, + { + "epoch": 0.9198497854077253, + "grad_norm": 0.46875, + "learning_rate": 4.695547338308999e-06, + "loss": 1.9046, + "step": 17146 + }, + { + "epoch": 0.9199034334763948, + "grad_norm": 1.109375, + "learning_rate": 4.6955057875384945e-06, + "loss": 2.2473, + "step": 17147 + }, + { + "epoch": 0.9199570815450644, + "grad_norm": 0.490234375, + "learning_rate": 4.69546423411669e-06, + "loss": 2.2614, + "step": 17148 + }, + { + "epoch": 0.9200107296137339, + "grad_norm": 0.5078125, + "learning_rate": 4.695422678043634e-06, + "loss": 2.1566, + "step": 17149 + }, + { + "epoch": 0.9200643776824035, + "grad_norm": 0.5546875, + "learning_rate": 4.695381119319379e-06, + "loss": 2.4845, + "step": 17150 + }, + { + "epoch": 0.9201180257510729, + "grad_norm": 0.482421875, + "learning_rate": 4.695339557943972e-06, + "loss": 2.1872, + "step": 17151 + }, + { + "epoch": 0.9201716738197425, + "grad_norm": 0.42578125, + "learning_rate": 4.695297993917465e-06, + "loss": 2.2114, + "step": 17152 + }, + { + "epoch": 0.920225321888412, + "grad_norm": 0.55859375, + "learning_rate": 4.695256427239908e-06, + "loss": 2.2492, + "step": 17153 + }, + { + "epoch": 0.9202789699570816, + "grad_norm": 0.466796875, + "learning_rate": 4.695214857911351e-06, + "loss": 2.4019, + "step": 17154 + }, + { + "epoch": 0.9203326180257511, + "grad_norm": 0.45703125, + "learning_rate": 4.695173285931845e-06, + "loss": 2.5404, + "step": 17155 + }, + { + "epoch": 0.9203862660944206, + "grad_norm": 0.6328125, + "learning_rate": 4.695131711301438e-06, + "loss": 2.3475, + "step": 17156 + }, + { + "epoch": 0.9204399141630901, + "grad_norm": 0.39453125, + "learning_rate": 4.695090134020182e-06, + "loss": 2.3497, + "step": 17157 + }, + { + "epoch": 0.9204935622317597, + "grad_norm": 0.47265625, + "learning_rate": 4.695048554088128e-06, + "loss": 2.1927, + "step": 17158 + }, + { + "epoch": 0.9205472103004292, + "grad_norm": 0.400390625, + "learning_rate": 4.6950069715053245e-06, + "loss": 1.6789, + "step": 17159 + }, + { + "epoch": 0.9206008583690987, + "grad_norm": 0.435546875, + "learning_rate": 4.694965386271823e-06, + "loss": 2.1864, + "step": 17160 + }, + { + "epoch": 0.9206545064377682, + "grad_norm": 0.431640625, + "learning_rate": 4.6949237983876724e-06, + "loss": 2.1859, + "step": 17161 + }, + { + "epoch": 0.9207081545064377, + "grad_norm": 0.458984375, + "learning_rate": 4.694882207852924e-06, + "loss": 2.4276, + "step": 17162 + }, + { + "epoch": 0.9207618025751073, + "grad_norm": 0.466796875, + "learning_rate": 4.694840614667628e-06, + "loss": 2.4465, + "step": 17163 + }, + { + "epoch": 0.9208154506437768, + "grad_norm": 0.65234375, + "learning_rate": 4.694799018831834e-06, + "loss": 2.3165, + "step": 17164 + }, + { + "epoch": 0.9208690987124464, + "grad_norm": 0.51953125, + "learning_rate": 4.694757420345592e-06, + "loss": 2.3468, + "step": 17165 + }, + { + "epoch": 0.9209227467811159, + "grad_norm": 0.4921875, + "learning_rate": 4.694715819208953e-06, + "loss": 2.2492, + "step": 17166 + }, + { + "epoch": 0.9209763948497854, + "grad_norm": 0.5390625, + "learning_rate": 4.694674215421967e-06, + "loss": 2.1305, + "step": 17167 + }, + { + "epoch": 0.9210300429184549, + "grad_norm": 2.34375, + "learning_rate": 4.694632608984684e-06, + "loss": 2.2966, + "step": 17168 + }, + { + "epoch": 0.9210836909871245, + "grad_norm": 0.462890625, + "learning_rate": 4.6945909998971536e-06, + "loss": 2.5417, + "step": 17169 + }, + { + "epoch": 0.921137339055794, + "grad_norm": 0.494140625, + "learning_rate": 4.694549388159428e-06, + "loss": 2.511, + "step": 17170 + }, + { + "epoch": 0.9211909871244636, + "grad_norm": 0.5234375, + "learning_rate": 4.694507773771556e-06, + "loss": 2.0688, + "step": 17171 + }, + { + "epoch": 0.921244635193133, + "grad_norm": 0.439453125, + "learning_rate": 4.694466156733588e-06, + "loss": 2.0239, + "step": 17172 + }, + { + "epoch": 0.9212982832618025, + "grad_norm": 0.390625, + "learning_rate": 4.694424537045575e-06, + "loss": 2.361, + "step": 17173 + }, + { + "epoch": 0.9213519313304721, + "grad_norm": 0.4296875, + "learning_rate": 4.694382914707567e-06, + "loss": 2.5299, + "step": 17174 + }, + { + "epoch": 0.9214055793991416, + "grad_norm": 0.57421875, + "learning_rate": 4.694341289719613e-06, + "loss": 2.2951, + "step": 17175 + }, + { + "epoch": 0.9214592274678112, + "grad_norm": 0.470703125, + "learning_rate": 4.694299662081765e-06, + "loss": 2.3427, + "step": 17176 + }, + { + "epoch": 0.9215128755364806, + "grad_norm": 0.5390625, + "learning_rate": 4.694258031794072e-06, + "loss": 2.0986, + "step": 17177 + }, + { + "epoch": 0.9215665236051502, + "grad_norm": 0.49609375, + "learning_rate": 4.694216398856584e-06, + "loss": 1.9963, + "step": 17178 + }, + { + "epoch": 0.9216201716738197, + "grad_norm": 0.486328125, + "learning_rate": 4.694174763269354e-06, + "loss": 2.3049, + "step": 17179 + }, + { + "epoch": 0.9216738197424893, + "grad_norm": 0.515625, + "learning_rate": 4.69413312503243e-06, + "loss": 2.2498, + "step": 17180 + }, + { + "epoch": 0.9217274678111588, + "grad_norm": 0.5390625, + "learning_rate": 4.694091484145861e-06, + "loss": 2.1627, + "step": 17181 + }, + { + "epoch": 0.9217811158798284, + "grad_norm": 0.5078125, + "learning_rate": 4.6940498406097e-06, + "loss": 2.2517, + "step": 17182 + }, + { + "epoch": 0.9218347639484978, + "grad_norm": 0.451171875, + "learning_rate": 4.694008194423996e-06, + "loss": 2.2557, + "step": 17183 + }, + { + "epoch": 0.9218884120171674, + "grad_norm": 0.47265625, + "learning_rate": 4.693966545588799e-06, + "loss": 1.6167, + "step": 17184 + }, + { + "epoch": 0.9219420600858369, + "grad_norm": 0.4609375, + "learning_rate": 4.693924894104161e-06, + "loss": 2.0925, + "step": 17185 + }, + { + "epoch": 0.9219957081545065, + "grad_norm": 0.42578125, + "learning_rate": 4.6938832399701305e-06, + "loss": 2.0887, + "step": 17186 + }, + { + "epoch": 0.922049356223176, + "grad_norm": 0.5546875, + "learning_rate": 4.693841583186758e-06, + "loss": 2.3151, + "step": 17187 + }, + { + "epoch": 0.9221030042918454, + "grad_norm": 0.625, + "learning_rate": 4.6937999237540945e-06, + "loss": 2.2665, + "step": 17188 + }, + { + "epoch": 0.922156652360515, + "grad_norm": 0.4296875, + "learning_rate": 4.69375826167219e-06, + "loss": 2.4222, + "step": 17189 + }, + { + "epoch": 0.9222103004291845, + "grad_norm": 0.55859375, + "learning_rate": 4.693716596941094e-06, + "loss": 2.0608, + "step": 17190 + }, + { + "epoch": 0.9222639484978541, + "grad_norm": 0.46484375, + "learning_rate": 4.693674929560858e-06, + "loss": 2.0428, + "step": 17191 + }, + { + "epoch": 0.9223175965665236, + "grad_norm": 0.435546875, + "learning_rate": 4.693633259531533e-06, + "loss": 1.7248, + "step": 17192 + }, + { + "epoch": 0.9223712446351932, + "grad_norm": 0.435546875, + "learning_rate": 4.693591586853168e-06, + "loss": 2.1989, + "step": 17193 + }, + { + "epoch": 0.9224248927038626, + "grad_norm": 0.458984375, + "learning_rate": 4.693549911525813e-06, + "loss": 2.3986, + "step": 17194 + }, + { + "epoch": 0.9224785407725322, + "grad_norm": 0.59765625, + "learning_rate": 4.693508233549518e-06, + "loss": 2.1128, + "step": 17195 + }, + { + "epoch": 0.9225321888412017, + "grad_norm": 0.427734375, + "learning_rate": 4.693466552924336e-06, + "loss": 2.3889, + "step": 17196 + }, + { + "epoch": 0.9225858369098713, + "grad_norm": 0.470703125, + "learning_rate": 4.693424869650315e-06, + "loss": 2.1986, + "step": 17197 + }, + { + "epoch": 0.9226394849785408, + "grad_norm": 0.482421875, + "learning_rate": 4.693383183727506e-06, + "loss": 2.2569, + "step": 17198 + }, + { + "epoch": 0.9226931330472103, + "grad_norm": 0.48828125, + "learning_rate": 4.693341495155959e-06, + "loss": 2.3744, + "step": 17199 + }, + { + "epoch": 0.9227467811158798, + "grad_norm": 0.431640625, + "learning_rate": 4.693299803935724e-06, + "loss": 2.6345, + "step": 17200 + }, + { + "epoch": 0.9228004291845494, + "grad_norm": 0.5, + "learning_rate": 4.693258110066853e-06, + "loss": 2.3768, + "step": 17201 + }, + { + "epoch": 0.9228540772532189, + "grad_norm": 0.6796875, + "learning_rate": 4.693216413549394e-06, + "loss": 1.7885, + "step": 17202 + }, + { + "epoch": 0.9229077253218884, + "grad_norm": 0.5390625, + "learning_rate": 4.693174714383399e-06, + "loss": 2.4486, + "step": 17203 + }, + { + "epoch": 0.922961373390558, + "grad_norm": 0.44921875, + "learning_rate": 4.693133012568919e-06, + "loss": 2.4721, + "step": 17204 + }, + { + "epoch": 0.9230150214592274, + "grad_norm": 0.443359375, + "learning_rate": 4.693091308106002e-06, + "loss": 2.2104, + "step": 17205 + }, + { + "epoch": 0.923068669527897, + "grad_norm": 0.5703125, + "learning_rate": 4.6930496009947014e-06, + "loss": 2.4692, + "step": 17206 + }, + { + "epoch": 0.9231223175965665, + "grad_norm": 0.57421875, + "learning_rate": 4.693007891235065e-06, + "loss": 2.2101, + "step": 17207 + }, + { + "epoch": 0.9231759656652361, + "grad_norm": 0.421875, + "learning_rate": 4.692966178827144e-06, + "loss": 2.1896, + "step": 17208 + }, + { + "epoch": 0.9232296137339056, + "grad_norm": 0.62890625, + "learning_rate": 4.69292446377099e-06, + "loss": 1.4237, + "step": 17209 + }, + { + "epoch": 0.9232832618025751, + "grad_norm": 0.482421875, + "learning_rate": 4.69288274606665e-06, + "loss": 2.1933, + "step": 17210 + }, + { + "epoch": 0.9233369098712446, + "grad_norm": 0.400390625, + "learning_rate": 4.692841025714178e-06, + "loss": 2.0797, + "step": 17211 + }, + { + "epoch": 0.9233905579399142, + "grad_norm": 0.5, + "learning_rate": 4.692799302713622e-06, + "loss": 2.2832, + "step": 17212 + }, + { + "epoch": 0.9234442060085837, + "grad_norm": 0.4609375, + "learning_rate": 4.692757577065034e-06, + "loss": 2.2228, + "step": 17213 + }, + { + "epoch": 0.9234978540772533, + "grad_norm": 0.54296875, + "learning_rate": 4.692715848768464e-06, + "loss": 2.3186, + "step": 17214 + }, + { + "epoch": 0.9235515021459227, + "grad_norm": 0.453125, + "learning_rate": 4.6926741178239615e-06, + "loss": 2.1801, + "step": 17215 + }, + { + "epoch": 0.9236051502145923, + "grad_norm": 0.38671875, + "learning_rate": 4.692632384231578e-06, + "loss": 2.2086, + "step": 17216 + }, + { + "epoch": 0.9236587982832618, + "grad_norm": 0.54296875, + "learning_rate": 4.692590647991363e-06, + "loss": 2.5804, + "step": 17217 + }, + { + "epoch": 0.9237124463519313, + "grad_norm": 0.455078125, + "learning_rate": 4.6925489091033675e-06, + "loss": 2.1532, + "step": 17218 + }, + { + "epoch": 0.9237660944206009, + "grad_norm": 0.5859375, + "learning_rate": 4.6925071675676415e-06, + "loss": 2.2231, + "step": 17219 + }, + { + "epoch": 0.9238197424892703, + "grad_norm": 0.51171875, + "learning_rate": 4.692465423384236e-06, + "loss": 2.027, + "step": 17220 + }, + { + "epoch": 0.9238733905579399, + "grad_norm": 0.484375, + "learning_rate": 4.692423676553201e-06, + "loss": 2.4149, + "step": 17221 + }, + { + "epoch": 0.9239270386266094, + "grad_norm": 0.443359375, + "learning_rate": 4.6923819270745875e-06, + "loss": 2.2133, + "step": 17222 + }, + { + "epoch": 0.923980686695279, + "grad_norm": 0.482421875, + "learning_rate": 4.692340174948444e-06, + "loss": 2.3776, + "step": 17223 + }, + { + "epoch": 0.9240343347639485, + "grad_norm": 0.41015625, + "learning_rate": 4.692298420174823e-06, + "loss": 2.1064, + "step": 17224 + }, + { + "epoch": 0.9240879828326181, + "grad_norm": 0.447265625, + "learning_rate": 4.692256662753774e-06, + "loss": 2.2157, + "step": 17225 + }, + { + "epoch": 0.9241416309012875, + "grad_norm": 0.451171875, + "learning_rate": 4.692214902685348e-06, + "loss": 2.1785, + "step": 17226 + }, + { + "epoch": 0.9241952789699571, + "grad_norm": 0.5390625, + "learning_rate": 4.692173139969596e-06, + "loss": 2.4523, + "step": 17227 + }, + { + "epoch": 0.9242489270386266, + "grad_norm": 0.421875, + "learning_rate": 4.692131374606565e-06, + "loss": 2.4106, + "step": 17228 + }, + { + "epoch": 0.9243025751072962, + "grad_norm": 0.41015625, + "learning_rate": 4.6920896065963105e-06, + "loss": 2.3383, + "step": 17229 + }, + { + "epoch": 0.9243562231759657, + "grad_norm": 0.474609375, + "learning_rate": 4.6920478359388785e-06, + "loss": 2.5121, + "step": 17230 + }, + { + "epoch": 0.9244098712446351, + "grad_norm": 0.5546875, + "learning_rate": 4.692006062634322e-06, + "loss": 1.9979, + "step": 17231 + }, + { + "epoch": 0.9244635193133047, + "grad_norm": 0.458984375, + "learning_rate": 4.691964286682691e-06, + "loss": 2.0812, + "step": 17232 + }, + { + "epoch": 0.9245171673819742, + "grad_norm": 0.515625, + "learning_rate": 4.691922508084036e-06, + "loss": 2.2748, + "step": 17233 + }, + { + "epoch": 0.9245708154506438, + "grad_norm": 0.453125, + "learning_rate": 4.691880726838407e-06, + "loss": 2.2734, + "step": 17234 + }, + { + "epoch": 0.9246244635193133, + "grad_norm": 0.51953125, + "learning_rate": 4.691838942945854e-06, + "loss": 2.3656, + "step": 17235 + }, + { + "epoch": 0.9246781115879829, + "grad_norm": 0.5390625, + "learning_rate": 4.691797156406428e-06, + "loss": 2.0882, + "step": 17236 + }, + { + "epoch": 0.9247317596566523, + "grad_norm": 0.5, + "learning_rate": 4.69175536722018e-06, + "loss": 2.0625, + "step": 17237 + }, + { + "epoch": 0.9247854077253219, + "grad_norm": 0.353515625, + "learning_rate": 4.69171357538716e-06, + "loss": 2.0844, + "step": 17238 + }, + { + "epoch": 0.9248390557939914, + "grad_norm": 0.51171875, + "learning_rate": 4.691671780907419e-06, + "loss": 2.1943, + "step": 17239 + }, + { + "epoch": 0.924892703862661, + "grad_norm": 0.36328125, + "learning_rate": 4.691629983781006e-06, + "loss": 2.118, + "step": 17240 + }, + { + "epoch": 0.9249463519313305, + "grad_norm": 0.46484375, + "learning_rate": 4.691588184007973e-06, + "loss": 2.3588, + "step": 17241 + }, + { + "epoch": 0.925, + "grad_norm": 0.37109375, + "learning_rate": 4.69154638158837e-06, + "loss": 1.8774, + "step": 17242 + }, + { + "epoch": 0.9250536480686695, + "grad_norm": 0.5, + "learning_rate": 4.691504576522247e-06, + "loss": 2.2567, + "step": 17243 + }, + { + "epoch": 0.9251072961373391, + "grad_norm": 0.48828125, + "learning_rate": 4.691462768809655e-06, + "loss": 2.3949, + "step": 17244 + }, + { + "epoch": 0.9251609442060086, + "grad_norm": 0.65625, + "learning_rate": 4.691420958450644e-06, + "loss": 2.4208, + "step": 17245 + }, + { + "epoch": 0.9252145922746781, + "grad_norm": 0.42578125, + "learning_rate": 4.6913791454452655e-06, + "loss": 2.2579, + "step": 17246 + }, + { + "epoch": 0.9252682403433476, + "grad_norm": 0.4296875, + "learning_rate": 4.691337329793568e-06, + "loss": 2.2802, + "step": 17247 + }, + { + "epoch": 0.9253218884120171, + "grad_norm": 0.478515625, + "learning_rate": 4.691295511495605e-06, + "loss": 2.2964, + "step": 17248 + }, + { + "epoch": 0.9253755364806867, + "grad_norm": 0.447265625, + "learning_rate": 4.691253690551424e-06, + "loss": 2.2422, + "step": 17249 + }, + { + "epoch": 0.9254291845493562, + "grad_norm": 0.890625, + "learning_rate": 4.691211866961078e-06, + "loss": 2.2486, + "step": 17250 + }, + { + "epoch": 0.9254828326180258, + "grad_norm": 0.5390625, + "learning_rate": 4.691170040724616e-06, + "loss": 2.3117, + "step": 17251 + }, + { + "epoch": 0.9255364806866953, + "grad_norm": 0.56640625, + "learning_rate": 4.691128211842089e-06, + "loss": 2.3708, + "step": 17252 + }, + { + "epoch": 0.9255901287553648, + "grad_norm": 0.431640625, + "learning_rate": 4.691086380313546e-06, + "loss": 2.319, + "step": 17253 + }, + { + "epoch": 0.9256437768240343, + "grad_norm": 0.640625, + "learning_rate": 4.6910445461390405e-06, + "loss": 2.451, + "step": 17254 + }, + { + "epoch": 0.9256974248927039, + "grad_norm": 0.5625, + "learning_rate": 4.691002709318621e-06, + "loss": 2.215, + "step": 17255 + }, + { + "epoch": 0.9257510729613734, + "grad_norm": 0.46484375, + "learning_rate": 4.690960869852338e-06, + "loss": 2.1974, + "step": 17256 + }, + { + "epoch": 0.925804721030043, + "grad_norm": 0.453125, + "learning_rate": 4.690919027740242e-06, + "loss": 2.2848, + "step": 17257 + }, + { + "epoch": 0.9258583690987124, + "grad_norm": 0.58984375, + "learning_rate": 4.690877182982385e-06, + "loss": 2.3048, + "step": 17258 + }, + { + "epoch": 0.925912017167382, + "grad_norm": 0.412109375, + "learning_rate": 4.690835335578816e-06, + "loss": 2.4513, + "step": 17259 + }, + { + "epoch": 0.9259656652360515, + "grad_norm": 0.50390625, + "learning_rate": 4.690793485529586e-06, + "loss": 2.3365, + "step": 17260 + }, + { + "epoch": 0.926019313304721, + "grad_norm": 0.365234375, + "learning_rate": 4.690751632834746e-06, + "loss": 2.2876, + "step": 17261 + }, + { + "epoch": 0.9260729613733906, + "grad_norm": 0.494140625, + "learning_rate": 4.690709777494345e-06, + "loss": 2.2699, + "step": 17262 + }, + { + "epoch": 0.92612660944206, + "grad_norm": 0.515625, + "learning_rate": 4.690667919508436e-06, + "loss": 2.5102, + "step": 17263 + }, + { + "epoch": 0.9261802575107296, + "grad_norm": 0.52734375, + "learning_rate": 4.690626058877068e-06, + "loss": 2.1372, + "step": 17264 + }, + { + "epoch": 0.9262339055793991, + "grad_norm": 0.5078125, + "learning_rate": 4.690584195600291e-06, + "loss": 2.3306, + "step": 17265 + }, + { + "epoch": 0.9262875536480687, + "grad_norm": 0.5, + "learning_rate": 4.690542329678157e-06, + "loss": 2.1942, + "step": 17266 + }, + { + "epoch": 0.9263412017167382, + "grad_norm": 0.53125, + "learning_rate": 4.690500461110716e-06, + "loss": 2.3363, + "step": 17267 + }, + { + "epoch": 0.9263948497854078, + "grad_norm": 0.474609375, + "learning_rate": 4.690458589898018e-06, + "loss": 2.3945, + "step": 17268 + }, + { + "epoch": 0.9264484978540772, + "grad_norm": 0.416015625, + "learning_rate": 4.690416716040114e-06, + "loss": 2.345, + "step": 17269 + }, + { + "epoch": 0.9265021459227468, + "grad_norm": 0.56640625, + "learning_rate": 4.690374839537054e-06, + "loss": 2.4127, + "step": 17270 + }, + { + "epoch": 0.9265557939914163, + "grad_norm": 0.458984375, + "learning_rate": 4.6903329603888905e-06, + "loss": 2.3497, + "step": 17271 + }, + { + "epoch": 0.9266094420600859, + "grad_norm": 0.486328125, + "learning_rate": 4.6902910785956715e-06, + "loss": 2.5957, + "step": 17272 + }, + { + "epoch": 0.9266630901287554, + "grad_norm": 0.435546875, + "learning_rate": 4.69024919415745e-06, + "loss": 2.2694, + "step": 17273 + }, + { + "epoch": 0.9267167381974248, + "grad_norm": 0.400390625, + "learning_rate": 4.6902073070742746e-06, + "loss": 2.3187, + "step": 17274 + }, + { + "epoch": 0.9267703862660944, + "grad_norm": 0.6875, + "learning_rate": 4.6901654173461966e-06, + "loss": 2.3751, + "step": 17275 + }, + { + "epoch": 0.9268240343347639, + "grad_norm": 0.490234375, + "learning_rate": 4.690123524973267e-06, + "loss": 2.0938, + "step": 17276 + }, + { + "epoch": 0.9268776824034335, + "grad_norm": 0.859375, + "learning_rate": 4.690081629955535e-06, + "loss": 1.849, + "step": 17277 + }, + { + "epoch": 0.926931330472103, + "grad_norm": 0.42578125, + "learning_rate": 4.690039732293054e-06, + "loss": 1.9534, + "step": 17278 + }, + { + "epoch": 0.9269849785407726, + "grad_norm": 0.453125, + "learning_rate": 4.68999783198587e-06, + "loss": 2.3351, + "step": 17279 + }, + { + "epoch": 0.927038626609442, + "grad_norm": 0.51171875, + "learning_rate": 4.689955929034039e-06, + "loss": 2.2804, + "step": 17280 + }, + { + "epoch": 0.9270922746781116, + "grad_norm": 1.21875, + "learning_rate": 4.689914023437608e-06, + "loss": 2.2372, + "step": 17281 + }, + { + "epoch": 0.9271459227467811, + "grad_norm": 0.474609375, + "learning_rate": 4.689872115196629e-06, + "loss": 2.2449, + "step": 17282 + }, + { + "epoch": 0.9271995708154507, + "grad_norm": 0.42578125, + "learning_rate": 4.689830204311152e-06, + "loss": 2.2263, + "step": 17283 + }, + { + "epoch": 0.9272532188841202, + "grad_norm": 0.5078125, + "learning_rate": 4.689788290781228e-06, + "loss": 2.4385, + "step": 17284 + }, + { + "epoch": 0.9273068669527897, + "grad_norm": 0.58203125, + "learning_rate": 4.689746374606907e-06, + "loss": 2.6569, + "step": 17285 + }, + { + "epoch": 0.9273605150214592, + "grad_norm": 0.43359375, + "learning_rate": 4.689704455788241e-06, + "loss": 2.1562, + "step": 17286 + }, + { + "epoch": 0.9274141630901288, + "grad_norm": 0.408203125, + "learning_rate": 4.6896625343252784e-06, + "loss": 2.3139, + "step": 17287 + }, + { + "epoch": 0.9274678111587983, + "grad_norm": 0.419921875, + "learning_rate": 4.689620610218072e-06, + "loss": 2.1527, + "step": 17288 + }, + { + "epoch": 0.9275214592274678, + "grad_norm": 0.5546875, + "learning_rate": 4.689578683466671e-06, + "loss": 2.301, + "step": 17289 + }, + { + "epoch": 0.9275751072961373, + "grad_norm": 0.51171875, + "learning_rate": 4.689536754071128e-06, + "loss": 2.2408, + "step": 17290 + }, + { + "epoch": 0.9276287553648068, + "grad_norm": 0.671875, + "learning_rate": 4.689494822031491e-06, + "loss": 2.1796, + "step": 17291 + }, + { + "epoch": 0.9276824034334764, + "grad_norm": 0.43359375, + "learning_rate": 4.689452887347812e-06, + "loss": 2.271, + "step": 17292 + }, + { + "epoch": 0.9277360515021459, + "grad_norm": 0.486328125, + "learning_rate": 4.689410950020141e-06, + "loss": 2.5314, + "step": 17293 + }, + { + "epoch": 0.9277896995708155, + "grad_norm": 1.0859375, + "learning_rate": 4.68936901004853e-06, + "loss": 2.3972, + "step": 17294 + }, + { + "epoch": 0.927843347639485, + "grad_norm": 0.439453125, + "learning_rate": 4.689327067433028e-06, + "loss": 2.1222, + "step": 17295 + }, + { + "epoch": 0.9278969957081545, + "grad_norm": 0.42578125, + "learning_rate": 4.689285122173687e-06, + "loss": 2.371, + "step": 17296 + }, + { + "epoch": 0.927950643776824, + "grad_norm": 0.47265625, + "learning_rate": 4.689243174270557e-06, + "loss": 2.2188, + "step": 17297 + }, + { + "epoch": 0.9280042918454936, + "grad_norm": 0.75390625, + "learning_rate": 4.689201223723689e-06, + "loss": 1.955, + "step": 17298 + }, + { + "epoch": 0.9280579399141631, + "grad_norm": 0.443359375, + "learning_rate": 4.6891592705331325e-06, + "loss": 2.3515, + "step": 17299 + }, + { + "epoch": 0.9281115879828327, + "grad_norm": 0.419921875, + "learning_rate": 4.68911731469894e-06, + "loss": 2.4492, + "step": 17300 + }, + { + "epoch": 0.9281652360515021, + "grad_norm": 0.42578125, + "learning_rate": 4.689075356221161e-06, + "loss": 2.3028, + "step": 17301 + }, + { + "epoch": 0.9282188841201717, + "grad_norm": 0.4453125, + "learning_rate": 4.689033395099845e-06, + "loss": 2.1484, + "step": 17302 + }, + { + "epoch": 0.9282725321888412, + "grad_norm": 0.55078125, + "learning_rate": 4.688991431335046e-06, + "loss": 2.147, + "step": 17303 + }, + { + "epoch": 0.9283261802575107, + "grad_norm": 0.515625, + "learning_rate": 4.6889494649268116e-06, + "loss": 2.2788, + "step": 17304 + }, + { + "epoch": 0.9283798283261803, + "grad_norm": 0.5234375, + "learning_rate": 4.688907495875195e-06, + "loss": 2.5973, + "step": 17305 + }, + { + "epoch": 0.9284334763948497, + "grad_norm": 0.439453125, + "learning_rate": 4.6888655241802435e-06, + "loss": 2.053, + "step": 17306 + }, + { + "epoch": 0.9284871244635193, + "grad_norm": 0.51953125, + "learning_rate": 4.688823549842011e-06, + "loss": 2.4308, + "step": 17307 + }, + { + "epoch": 0.9285407725321888, + "grad_norm": 0.478515625, + "learning_rate": 4.688781572860548e-06, + "loss": 2.1419, + "step": 17308 + }, + { + "epoch": 0.9285944206008584, + "grad_norm": 0.65234375, + "learning_rate": 4.6887395932359025e-06, + "loss": 2.2245, + "step": 17309 + }, + { + "epoch": 0.9286480686695279, + "grad_norm": 0.43359375, + "learning_rate": 4.6886976109681274e-06, + "loss": 2.4606, + "step": 17310 + }, + { + "epoch": 0.9287017167381975, + "grad_norm": 0.462890625, + "learning_rate": 4.688655626057272e-06, + "loss": 2.1979, + "step": 17311 + }, + { + "epoch": 0.9287553648068669, + "grad_norm": 0.43359375, + "learning_rate": 4.68861363850339e-06, + "loss": 2.3147, + "step": 17312 + }, + { + "epoch": 0.9288090128755365, + "grad_norm": 0.470703125, + "learning_rate": 4.688571648306528e-06, + "loss": 2.0954, + "step": 17313 + }, + { + "epoch": 0.928862660944206, + "grad_norm": 0.435546875, + "learning_rate": 4.6885296554667395e-06, + "loss": 2.1617, + "step": 17314 + }, + { + "epoch": 0.9289163090128756, + "grad_norm": 0.48828125, + "learning_rate": 4.688487659984075e-06, + "loss": 2.0764, + "step": 17315 + }, + { + "epoch": 0.9289699570815451, + "grad_norm": 0.43359375, + "learning_rate": 4.6884456618585825e-06, + "loss": 2.3244, + "step": 17316 + }, + { + "epoch": 0.9290236051502145, + "grad_norm": 0.494140625, + "learning_rate": 4.688403661090317e-06, + "loss": 2.2082, + "step": 17317 + }, + { + "epoch": 0.9290772532188841, + "grad_norm": 0.4453125, + "learning_rate": 4.6883616576793255e-06, + "loss": 1.4786, + "step": 17318 + }, + { + "epoch": 0.9291309012875536, + "grad_norm": 0.57421875, + "learning_rate": 4.688319651625662e-06, + "loss": 2.4386, + "step": 17319 + }, + { + "epoch": 0.9291845493562232, + "grad_norm": 1.0703125, + "learning_rate": 4.688277642929374e-06, + "loss": 2.1325, + "step": 17320 + }, + { + "epoch": 0.9292381974248927, + "grad_norm": 0.578125, + "learning_rate": 4.6882356315905146e-06, + "loss": 2.2286, + "step": 17321 + }, + { + "epoch": 0.9292918454935623, + "grad_norm": 0.55078125, + "learning_rate": 4.688193617609132e-06, + "loss": 2.4061, + "step": 17322 + }, + { + "epoch": 0.9293454935622317, + "grad_norm": 0.5546875, + "learning_rate": 4.68815160098528e-06, + "loss": 2.3897, + "step": 17323 + }, + { + "epoch": 0.9293991416309013, + "grad_norm": 0.458984375, + "learning_rate": 4.688109581719008e-06, + "loss": 1.9039, + "step": 17324 + }, + { + "epoch": 0.9294527896995708, + "grad_norm": 0.4375, + "learning_rate": 4.688067559810366e-06, + "loss": 2.4182, + "step": 17325 + }, + { + "epoch": 0.9295064377682404, + "grad_norm": 0.484375, + "learning_rate": 4.688025535259406e-06, + "loss": 2.247, + "step": 17326 + }, + { + "epoch": 0.9295600858369099, + "grad_norm": 0.435546875, + "learning_rate": 4.687983508066178e-06, + "loss": 2.2912, + "step": 17327 + }, + { + "epoch": 0.9296137339055794, + "grad_norm": 0.41796875, + "learning_rate": 4.687941478230733e-06, + "loss": 2.1705, + "step": 17328 + }, + { + "epoch": 0.9296673819742489, + "grad_norm": 0.57421875, + "learning_rate": 4.687899445753122e-06, + "loss": 2.4432, + "step": 17329 + }, + { + "epoch": 0.9297210300429185, + "grad_norm": 0.43359375, + "learning_rate": 4.687857410633394e-06, + "loss": 2.1777, + "step": 17330 + }, + { + "epoch": 0.929774678111588, + "grad_norm": 0.4296875, + "learning_rate": 4.6878153728716025e-06, + "loss": 2.386, + "step": 17331 + }, + { + "epoch": 0.9298283261802575, + "grad_norm": 0.4609375, + "learning_rate": 4.6877733324677965e-06, + "loss": 2.3935, + "step": 17332 + }, + { + "epoch": 0.929881974248927, + "grad_norm": 0.404296875, + "learning_rate": 4.687731289422027e-06, + "loss": 2.2131, + "step": 17333 + }, + { + "epoch": 0.9299356223175965, + "grad_norm": 0.419921875, + "learning_rate": 4.6876892437343455e-06, + "loss": 2.2046, + "step": 17334 + }, + { + "epoch": 0.9299892703862661, + "grad_norm": 0.6015625, + "learning_rate": 4.687647195404802e-06, + "loss": 2.0071, + "step": 17335 + }, + { + "epoch": 0.9300429184549356, + "grad_norm": 0.4140625, + "learning_rate": 4.687605144433448e-06, + "loss": 2.2628, + "step": 17336 + }, + { + "epoch": 0.9300965665236052, + "grad_norm": 0.484375, + "learning_rate": 4.687563090820334e-06, + "loss": 2.185, + "step": 17337 + }, + { + "epoch": 0.9301502145922746, + "grad_norm": 0.37109375, + "learning_rate": 4.68752103456551e-06, + "loss": 2.0795, + "step": 17338 + }, + { + "epoch": 0.9302038626609442, + "grad_norm": 0.482421875, + "learning_rate": 4.687478975669027e-06, + "loss": 2.3482, + "step": 17339 + }, + { + "epoch": 0.9302575107296137, + "grad_norm": 0.482421875, + "learning_rate": 4.687436914130937e-06, + "loss": 2.4763, + "step": 17340 + }, + { + "epoch": 0.9303111587982833, + "grad_norm": 0.5, + "learning_rate": 4.68739484995129e-06, + "loss": 2.3021, + "step": 17341 + }, + { + "epoch": 0.9303648068669528, + "grad_norm": 0.4765625, + "learning_rate": 4.687352783130136e-06, + "loss": 2.2565, + "step": 17342 + }, + { + "epoch": 0.9304184549356224, + "grad_norm": 0.41796875, + "learning_rate": 4.687310713667527e-06, + "loss": 2.193, + "step": 17343 + }, + { + "epoch": 0.9304721030042918, + "grad_norm": 0.455078125, + "learning_rate": 4.687268641563514e-06, + "loss": 2.1179, + "step": 17344 + }, + { + "epoch": 0.9305257510729614, + "grad_norm": 0.45703125, + "learning_rate": 4.687226566818146e-06, + "loss": 2.4068, + "step": 17345 + }, + { + "epoch": 0.9305793991416309, + "grad_norm": 0.9296875, + "learning_rate": 4.687184489431476e-06, + "loss": 2.4231, + "step": 17346 + }, + { + "epoch": 0.9306330472103004, + "grad_norm": 0.57421875, + "learning_rate": 4.687142409403553e-06, + "loss": 2.4376, + "step": 17347 + }, + { + "epoch": 0.93068669527897, + "grad_norm": 0.431640625, + "learning_rate": 4.687100326734429e-06, + "loss": 2.22, + "step": 17348 + }, + { + "epoch": 0.9307403433476394, + "grad_norm": 0.76171875, + "learning_rate": 4.687058241424155e-06, + "loss": 2.2564, + "step": 17349 + }, + { + "epoch": 0.930793991416309, + "grad_norm": 0.45703125, + "learning_rate": 4.6870161534727806e-06, + "loss": 2.1507, + "step": 17350 + }, + { + "epoch": 0.9308476394849785, + "grad_norm": 0.451171875, + "learning_rate": 4.686974062880358e-06, + "loss": 2.2334, + "step": 17351 + }, + { + "epoch": 0.9309012875536481, + "grad_norm": 0.5078125, + "learning_rate": 4.686931969646937e-06, + "loss": 2.343, + "step": 17352 + }, + { + "epoch": 0.9309549356223176, + "grad_norm": 0.5078125, + "learning_rate": 4.6868898737725686e-06, + "loss": 2.2686, + "step": 17353 + }, + { + "epoch": 0.9310085836909872, + "grad_norm": 0.48828125, + "learning_rate": 4.686847775257303e-06, + "loss": 2.53, + "step": 17354 + }, + { + "epoch": 0.9310622317596566, + "grad_norm": 1.71875, + "learning_rate": 4.686805674101193e-06, + "loss": 2.4241, + "step": 17355 + }, + { + "epoch": 0.9311158798283262, + "grad_norm": 0.4453125, + "learning_rate": 4.686763570304288e-06, + "loss": 2.3035, + "step": 17356 + }, + { + "epoch": 0.9311695278969957, + "grad_norm": 0.462890625, + "learning_rate": 4.686721463866639e-06, + "loss": 2.3161, + "step": 17357 + }, + { + "epoch": 0.9312231759656653, + "grad_norm": 0.416015625, + "learning_rate": 4.686679354788297e-06, + "loss": 2.2278, + "step": 17358 + }, + { + "epoch": 0.9312768240343348, + "grad_norm": 0.421875, + "learning_rate": 4.6866372430693125e-06, + "loss": 2.3322, + "step": 17359 + }, + { + "epoch": 0.9313304721030042, + "grad_norm": 0.46484375, + "learning_rate": 4.686595128709737e-06, + "loss": 2.4139, + "step": 17360 + }, + { + "epoch": 0.9313841201716738, + "grad_norm": 0.39453125, + "learning_rate": 4.686553011709621e-06, + "loss": 2.0169, + "step": 17361 + }, + { + "epoch": 0.9314377682403433, + "grad_norm": 0.427734375, + "learning_rate": 4.686510892069015e-06, + "loss": 2.1778, + "step": 17362 + }, + { + "epoch": 0.9314914163090129, + "grad_norm": 0.455078125, + "learning_rate": 4.686468769787971e-06, + "loss": 2.2346, + "step": 17363 + }, + { + "epoch": 0.9315450643776824, + "grad_norm": 0.443359375, + "learning_rate": 4.686426644866539e-06, + "loss": 2.3296, + "step": 17364 + }, + { + "epoch": 0.931598712446352, + "grad_norm": 0.4140625, + "learning_rate": 4.6863845173047694e-06, + "loss": 2.3682, + "step": 17365 + }, + { + "epoch": 0.9316523605150214, + "grad_norm": 0.44921875, + "learning_rate": 4.686342387102714e-06, + "loss": 1.9272, + "step": 17366 + }, + { + "epoch": 0.931706008583691, + "grad_norm": 0.53515625, + "learning_rate": 4.686300254260423e-06, + "loss": 2.2976, + "step": 17367 + }, + { + "epoch": 0.9317596566523605, + "grad_norm": 0.478515625, + "learning_rate": 4.686258118777948e-06, + "loss": 2.3323, + "step": 17368 + }, + { + "epoch": 0.9318133047210301, + "grad_norm": 0.353515625, + "learning_rate": 4.68621598065534e-06, + "loss": 2.1322, + "step": 17369 + }, + { + "epoch": 0.9318669527896996, + "grad_norm": 0.45703125, + "learning_rate": 4.686173839892648e-06, + "loss": 2.2306, + "step": 17370 + }, + { + "epoch": 0.9319206008583691, + "grad_norm": 0.7734375, + "learning_rate": 4.686131696489925e-06, + "loss": 2.17, + "step": 17371 + }, + { + "epoch": 0.9319742489270386, + "grad_norm": 0.77734375, + "learning_rate": 4.686089550447222e-06, + "loss": 2.028, + "step": 17372 + }, + { + "epoch": 0.9320278969957082, + "grad_norm": 0.427734375, + "learning_rate": 4.686047401764589e-06, + "loss": 2.159, + "step": 17373 + }, + { + "epoch": 0.9320815450643777, + "grad_norm": 0.466796875, + "learning_rate": 4.686005250442076e-06, + "loss": 2.2047, + "step": 17374 + }, + { + "epoch": 0.9321351931330472, + "grad_norm": 0.80078125, + "learning_rate": 4.6859630964797345e-06, + "loss": 2.2762, + "step": 17375 + }, + { + "epoch": 0.9321888412017167, + "grad_norm": 0.51171875, + "learning_rate": 4.685920939877617e-06, + "loss": 2.2997, + "step": 17376 + }, + { + "epoch": 0.9322424892703862, + "grad_norm": 0.46484375, + "learning_rate": 4.685878780635772e-06, + "loss": 1.9534, + "step": 17377 + }, + { + "epoch": 0.9322961373390558, + "grad_norm": 0.44140625, + "learning_rate": 4.685836618754253e-06, + "loss": 2.3866, + "step": 17378 + }, + { + "epoch": 0.9323497854077253, + "grad_norm": 0.515625, + "learning_rate": 4.685794454233108e-06, + "loss": 2.1553, + "step": 17379 + }, + { + "epoch": 0.9324034334763949, + "grad_norm": 0.5, + "learning_rate": 4.68575228707239e-06, + "loss": 2.4934, + "step": 17380 + }, + { + "epoch": 0.9324570815450643, + "grad_norm": 0.6484375, + "learning_rate": 4.68571011727215e-06, + "loss": 2.34, + "step": 17381 + }, + { + "epoch": 0.9325107296137339, + "grad_norm": 0.48046875, + "learning_rate": 4.685667944832437e-06, + "loss": 1.9479, + "step": 17382 + }, + { + "epoch": 0.9325643776824034, + "grad_norm": 0.421875, + "learning_rate": 4.685625769753305e-06, + "loss": 2.1584, + "step": 17383 + }, + { + "epoch": 0.932618025751073, + "grad_norm": 0.53125, + "learning_rate": 4.685583592034802e-06, + "loss": 2.12, + "step": 17384 + }, + { + "epoch": 0.9326716738197425, + "grad_norm": 0.5234375, + "learning_rate": 4.685541411676981e-06, + "loss": 2.2927, + "step": 17385 + }, + { + "epoch": 0.9327253218884121, + "grad_norm": 0.52734375, + "learning_rate": 4.685499228679891e-06, + "loss": 2.4059, + "step": 17386 + }, + { + "epoch": 0.9327789699570815, + "grad_norm": 0.47265625, + "learning_rate": 4.685457043043583e-06, + "loss": 2.3335, + "step": 17387 + }, + { + "epoch": 0.9328326180257511, + "grad_norm": 0.43359375, + "learning_rate": 4.68541485476811e-06, + "loss": 2.2318, + "step": 17388 + }, + { + "epoch": 0.9328862660944206, + "grad_norm": 0.37890625, + "learning_rate": 4.685372663853523e-06, + "loss": 2.2189, + "step": 17389 + }, + { + "epoch": 0.9329399141630901, + "grad_norm": 0.44921875, + "learning_rate": 4.68533047029987e-06, + "loss": 2.2322, + "step": 17390 + }, + { + "epoch": 0.9329935622317597, + "grad_norm": 0.625, + "learning_rate": 4.685288274107204e-06, + "loss": 2.139, + "step": 17391 + }, + { + "epoch": 0.9330472103004291, + "grad_norm": 0.6796875, + "learning_rate": 4.685246075275576e-06, + "loss": 2.5342, + "step": 17392 + }, + { + "epoch": 0.9331008583690987, + "grad_norm": 0.498046875, + "learning_rate": 4.685203873805037e-06, + "loss": 2.3827, + "step": 17393 + }, + { + "epoch": 0.9331545064377682, + "grad_norm": 0.546875, + "learning_rate": 4.685161669695637e-06, + "loss": 2.3344, + "step": 17394 + }, + { + "epoch": 0.9332081545064378, + "grad_norm": 0.482421875, + "learning_rate": 4.685119462947428e-06, + "loss": 2.2627, + "step": 17395 + }, + { + "epoch": 0.9332618025751073, + "grad_norm": 0.48046875, + "learning_rate": 4.68507725356046e-06, + "loss": 2.4274, + "step": 17396 + }, + { + "epoch": 0.9333154506437769, + "grad_norm": 0.49609375, + "learning_rate": 4.685035041534786e-06, + "loss": 2.2985, + "step": 17397 + }, + { + "epoch": 0.9333690987124463, + "grad_norm": 0.47265625, + "learning_rate": 4.6849928268704535e-06, + "loss": 2.1183, + "step": 17398 + }, + { + "epoch": 0.9334227467811159, + "grad_norm": 0.478515625, + "learning_rate": 4.684950609567517e-06, + "loss": 2.1028, + "step": 17399 + }, + { + "epoch": 0.9334763948497854, + "grad_norm": 0.44140625, + "learning_rate": 4.684908389626025e-06, + "loss": 2.4135, + "step": 17400 + }, + { + "epoch": 0.933530042918455, + "grad_norm": 0.486328125, + "learning_rate": 4.68486616704603e-06, + "loss": 2.2013, + "step": 17401 + }, + { + "epoch": 0.9335836909871245, + "grad_norm": 0.48828125, + "learning_rate": 4.684823941827582e-06, + "loss": 2.385, + "step": 17402 + }, + { + "epoch": 0.933637339055794, + "grad_norm": 0.4609375, + "learning_rate": 4.684781713970733e-06, + "loss": 2.0611, + "step": 17403 + }, + { + "epoch": 0.9336909871244635, + "grad_norm": 0.50390625, + "learning_rate": 4.684739483475533e-06, + "loss": 2.6556, + "step": 17404 + }, + { + "epoch": 0.933744635193133, + "grad_norm": 0.443359375, + "learning_rate": 4.684697250342034e-06, + "loss": 2.4015, + "step": 17405 + }, + { + "epoch": 0.9337982832618026, + "grad_norm": 0.3984375, + "learning_rate": 4.684655014570285e-06, + "loss": 2.375, + "step": 17406 + }, + { + "epoch": 0.9338519313304721, + "grad_norm": 0.458984375, + "learning_rate": 4.68461277616034e-06, + "loss": 2.0891, + "step": 17407 + }, + { + "epoch": 0.9339055793991416, + "grad_norm": 0.396484375, + "learning_rate": 4.6845705351122475e-06, + "loss": 2.3596, + "step": 17408 + }, + { + "epoch": 0.9339592274678111, + "grad_norm": 0.404296875, + "learning_rate": 4.684528291426059e-06, + "loss": 2.3244, + "step": 17409 + }, + { + "epoch": 0.9340128755364807, + "grad_norm": 0.466796875, + "learning_rate": 4.684486045101827e-06, + "loss": 2.1955, + "step": 17410 + }, + { + "epoch": 0.9340665236051502, + "grad_norm": 0.5390625, + "learning_rate": 4.684443796139602e-06, + "loss": 2.0946, + "step": 17411 + }, + { + "epoch": 0.9341201716738198, + "grad_norm": 0.455078125, + "learning_rate": 4.684401544539433e-06, + "loss": 2.0712, + "step": 17412 + }, + { + "epoch": 0.9341738197424893, + "grad_norm": 0.447265625, + "learning_rate": 4.6843592903013725e-06, + "loss": 2.4151, + "step": 17413 + }, + { + "epoch": 0.9342274678111588, + "grad_norm": 0.50390625, + "learning_rate": 4.684317033425473e-06, + "loss": 2.5932, + "step": 17414 + }, + { + "epoch": 0.9342811158798283, + "grad_norm": 0.427734375, + "learning_rate": 4.684274773911783e-06, + "loss": 2.2673, + "step": 17415 + }, + { + "epoch": 0.9343347639484979, + "grad_norm": 0.7734375, + "learning_rate": 4.684232511760355e-06, + "loss": 2.3864, + "step": 17416 + }, + { + "epoch": 0.9343884120171674, + "grad_norm": 0.6015625, + "learning_rate": 4.684190246971239e-06, + "loss": 2.2081, + "step": 17417 + }, + { + "epoch": 0.9344420600858369, + "grad_norm": 0.439453125, + "learning_rate": 4.6841479795444874e-06, + "loss": 2.4297, + "step": 17418 + }, + { + "epoch": 0.9344957081545064, + "grad_norm": 0.60546875, + "learning_rate": 4.68410570948015e-06, + "loss": 2.1528, + "step": 17419 + }, + { + "epoch": 0.9345493562231759, + "grad_norm": 0.470703125, + "learning_rate": 4.684063436778278e-06, + "loss": 2.2571, + "step": 17420 + }, + { + "epoch": 0.9346030042918455, + "grad_norm": 0.470703125, + "learning_rate": 4.684021161438924e-06, + "loss": 2.4106, + "step": 17421 + }, + { + "epoch": 0.934656652360515, + "grad_norm": 0.46875, + "learning_rate": 4.683978883462137e-06, + "loss": 2.4423, + "step": 17422 + }, + { + "epoch": 0.9347103004291846, + "grad_norm": 0.384765625, + "learning_rate": 4.6839366028479695e-06, + "loss": 1.9858, + "step": 17423 + }, + { + "epoch": 0.934763948497854, + "grad_norm": 0.484375, + "learning_rate": 4.683894319596472e-06, + "loss": 2.2193, + "step": 17424 + }, + { + "epoch": 0.9348175965665236, + "grad_norm": 0.5390625, + "learning_rate": 4.6838520337076945e-06, + "loss": 2.4561, + "step": 17425 + }, + { + "epoch": 0.9348712446351931, + "grad_norm": 0.400390625, + "learning_rate": 4.68380974518169e-06, + "loss": 2.3659, + "step": 17426 + }, + { + "epoch": 0.9349248927038627, + "grad_norm": 0.498046875, + "learning_rate": 4.683767454018508e-06, + "loss": 2.2608, + "step": 17427 + }, + { + "epoch": 0.9349785407725322, + "grad_norm": 0.45703125, + "learning_rate": 4.683725160218201e-06, + "loss": 2.3394, + "step": 17428 + }, + { + "epoch": 0.9350321888412018, + "grad_norm": 0.6171875, + "learning_rate": 4.683682863780819e-06, + "loss": 2.2429, + "step": 17429 + }, + { + "epoch": 0.9350858369098712, + "grad_norm": 0.451171875, + "learning_rate": 4.683640564706413e-06, + "loss": 2.2141, + "step": 17430 + }, + { + "epoch": 0.9351394849785408, + "grad_norm": 0.458984375, + "learning_rate": 4.683598262995035e-06, + "loss": 2.2491, + "step": 17431 + }, + { + "epoch": 0.9351931330472103, + "grad_norm": 0.484375, + "learning_rate": 4.683555958646735e-06, + "loss": 2.2421, + "step": 17432 + }, + { + "epoch": 0.9352467811158798, + "grad_norm": 0.380859375, + "learning_rate": 4.6835136516615656e-06, + "loss": 2.0621, + "step": 17433 + }, + { + "epoch": 0.9353004291845494, + "grad_norm": 0.47265625, + "learning_rate": 4.6834713420395765e-06, + "loss": 2.2428, + "step": 17434 + }, + { + "epoch": 0.9353540772532188, + "grad_norm": 0.498046875, + "learning_rate": 4.683429029780819e-06, + "loss": 2.2843, + "step": 17435 + }, + { + "epoch": 0.9354077253218884, + "grad_norm": 1.6015625, + "learning_rate": 4.683386714885344e-06, + "loss": 2.4204, + "step": 17436 + }, + { + "epoch": 0.9354613733905579, + "grad_norm": 0.4921875, + "learning_rate": 4.683344397353204e-06, + "loss": 2.2065, + "step": 17437 + }, + { + "epoch": 0.9355150214592275, + "grad_norm": 0.5390625, + "learning_rate": 4.6833020771844486e-06, + "loss": 1.5003, + "step": 17438 + }, + { + "epoch": 0.935568669527897, + "grad_norm": 0.46875, + "learning_rate": 4.6832597543791294e-06, + "loss": 2.119, + "step": 17439 + }, + { + "epoch": 0.9356223175965666, + "grad_norm": 0.50390625, + "learning_rate": 4.6832174289372975e-06, + "loss": 2.6228, + "step": 17440 + }, + { + "epoch": 0.935675965665236, + "grad_norm": 0.75, + "learning_rate": 4.683175100859004e-06, + "loss": 2.288, + "step": 17441 + }, + { + "epoch": 0.9357296137339056, + "grad_norm": 0.45703125, + "learning_rate": 4.683132770144301e-06, + "loss": 2.1078, + "step": 17442 + }, + { + "epoch": 0.9357832618025751, + "grad_norm": 0.546875, + "learning_rate": 4.683090436793238e-06, + "loss": 2.4298, + "step": 17443 + }, + { + "epoch": 0.9358369098712447, + "grad_norm": 0.5078125, + "learning_rate": 4.683048100805866e-06, + "loss": 2.5322, + "step": 17444 + }, + { + "epoch": 0.9358905579399142, + "grad_norm": 0.5546875, + "learning_rate": 4.683005762182238e-06, + "loss": 2.1926, + "step": 17445 + }, + { + "epoch": 0.9359442060085837, + "grad_norm": 0.47265625, + "learning_rate": 4.682963420922404e-06, + "loss": 2.1845, + "step": 17446 + }, + { + "epoch": 0.9359978540772532, + "grad_norm": 0.453125, + "learning_rate": 4.6829210770264145e-06, + "loss": 2.1989, + "step": 17447 + }, + { + "epoch": 0.9360515021459227, + "grad_norm": 0.439453125, + "learning_rate": 4.682878730494322e-06, + "loss": 2.1515, + "step": 17448 + }, + { + "epoch": 0.9361051502145923, + "grad_norm": 0.48828125, + "learning_rate": 4.682836381326177e-06, + "loss": 2.2987, + "step": 17449 + }, + { + "epoch": 0.9361587982832618, + "grad_norm": 0.455078125, + "learning_rate": 4.6827940295220295e-06, + "loss": 2.5076, + "step": 17450 + }, + { + "epoch": 0.9362124463519313, + "grad_norm": 0.392578125, + "learning_rate": 4.682751675081933e-06, + "loss": 2.4035, + "step": 17451 + }, + { + "epoch": 0.9362660944206008, + "grad_norm": 0.55859375, + "learning_rate": 4.682709318005937e-06, + "loss": 2.4449, + "step": 17452 + }, + { + "epoch": 0.9363197424892704, + "grad_norm": 0.44921875, + "learning_rate": 4.682666958294093e-06, + "loss": 2.2897, + "step": 17453 + }, + { + "epoch": 0.9363733905579399, + "grad_norm": 8.75, + "learning_rate": 4.682624595946452e-06, + "loss": 2.3232, + "step": 17454 + }, + { + "epoch": 0.9364270386266095, + "grad_norm": 0.5546875, + "learning_rate": 4.6825822309630654e-06, + "loss": 2.4464, + "step": 17455 + }, + { + "epoch": 0.936480686695279, + "grad_norm": 0.51171875, + "learning_rate": 4.682539863343984e-06, + "loss": 2.4058, + "step": 17456 + }, + { + "epoch": 0.9365343347639485, + "grad_norm": 0.890625, + "learning_rate": 4.682497493089261e-06, + "loss": 2.5826, + "step": 17457 + }, + { + "epoch": 0.936587982832618, + "grad_norm": 0.427734375, + "learning_rate": 4.682455120198945e-06, + "loss": 2.3539, + "step": 17458 + }, + { + "epoch": 0.9366416309012876, + "grad_norm": 0.396484375, + "learning_rate": 4.682412744673087e-06, + "loss": 2.5317, + "step": 17459 + }, + { + "epoch": 0.9366952789699571, + "grad_norm": 0.48828125, + "learning_rate": 4.682370366511739e-06, + "loss": 2.2068, + "step": 17460 + }, + { + "epoch": 0.9367489270386266, + "grad_norm": 0.478515625, + "learning_rate": 4.682327985714954e-06, + "loss": 2.2039, + "step": 17461 + }, + { + "epoch": 0.9368025751072961, + "grad_norm": 0.498046875, + "learning_rate": 4.68228560228278e-06, + "loss": 2.2177, + "step": 17462 + }, + { + "epoch": 0.9368562231759656, + "grad_norm": 0.408203125, + "learning_rate": 4.682243216215271e-06, + "loss": 2.1356, + "step": 17463 + }, + { + "epoch": 0.9369098712446352, + "grad_norm": 0.60546875, + "learning_rate": 4.6822008275124765e-06, + "loss": 2.341, + "step": 17464 + }, + { + "epoch": 0.9369635193133047, + "grad_norm": 0.435546875, + "learning_rate": 4.6821584361744475e-06, + "loss": 2.409, + "step": 17465 + }, + { + "epoch": 0.9370171673819743, + "grad_norm": 0.44140625, + "learning_rate": 4.682116042201237e-06, + "loss": 2.1412, + "step": 17466 + }, + { + "epoch": 0.9370708154506437, + "grad_norm": 0.4453125, + "learning_rate": 4.682073645592894e-06, + "loss": 2.4776, + "step": 17467 + }, + { + "epoch": 0.9371244635193133, + "grad_norm": 0.67578125, + "learning_rate": 4.682031246349471e-06, + "loss": 2.3431, + "step": 17468 + }, + { + "epoch": 0.9371781115879828, + "grad_norm": 0.6875, + "learning_rate": 4.681988844471018e-06, + "loss": 2.1019, + "step": 17469 + }, + { + "epoch": 0.9372317596566524, + "grad_norm": 0.4765625, + "learning_rate": 4.681946439957589e-06, + "loss": 2.3489, + "step": 17470 + }, + { + "epoch": 0.9372854077253219, + "grad_norm": 0.796875, + "learning_rate": 4.681904032809232e-06, + "loss": 2.5057, + "step": 17471 + }, + { + "epoch": 0.9373390557939915, + "grad_norm": 0.484375, + "learning_rate": 4.681861623025999e-06, + "loss": 2.228, + "step": 17472 + }, + { + "epoch": 0.9373927038626609, + "grad_norm": 0.5, + "learning_rate": 4.681819210607943e-06, + "loss": 2.3004, + "step": 17473 + }, + { + "epoch": 0.9374463519313305, + "grad_norm": 0.40625, + "learning_rate": 4.681776795555114e-06, + "loss": 2.3885, + "step": 17474 + }, + { + "epoch": 0.9375, + "grad_norm": 0.51171875, + "learning_rate": 4.681734377867562e-06, + "loss": 2.3576, + "step": 17475 + }, + { + "epoch": 0.9375536480686695, + "grad_norm": 0.38671875, + "learning_rate": 4.68169195754534e-06, + "loss": 2.1367, + "step": 17476 + }, + { + "epoch": 0.9376072961373391, + "grad_norm": 0.578125, + "learning_rate": 4.681649534588498e-06, + "loss": 2.4512, + "step": 17477 + }, + { + "epoch": 0.9376609442060085, + "grad_norm": 0.4375, + "learning_rate": 4.681607108997089e-06, + "loss": 2.5496, + "step": 17478 + }, + { + "epoch": 0.9377145922746781, + "grad_norm": 0.478515625, + "learning_rate": 4.6815646807711625e-06, + "loss": 2.2896, + "step": 17479 + }, + { + "epoch": 0.9377682403433476, + "grad_norm": 0.49609375, + "learning_rate": 4.68152224991077e-06, + "loss": 2.5103, + "step": 17480 + }, + { + "epoch": 0.9378218884120172, + "grad_norm": 0.54296875, + "learning_rate": 4.681479816415964e-06, + "loss": 2.1478, + "step": 17481 + }, + { + "epoch": 0.9378755364806867, + "grad_norm": 0.48828125, + "learning_rate": 4.681437380286794e-06, + "loss": 2.4232, + "step": 17482 + }, + { + "epoch": 0.9379291845493563, + "grad_norm": 0.494140625, + "learning_rate": 4.681394941523312e-06, + "loss": 2.2622, + "step": 17483 + }, + { + "epoch": 0.9379828326180257, + "grad_norm": 0.46484375, + "learning_rate": 4.68135250012557e-06, + "loss": 2.2876, + "step": 17484 + }, + { + "epoch": 0.9380364806866953, + "grad_norm": 0.5546875, + "learning_rate": 4.681310056093618e-06, + "loss": 2.193, + "step": 17485 + }, + { + "epoch": 0.9380901287553648, + "grad_norm": 0.4609375, + "learning_rate": 4.681267609427508e-06, + "loss": 2.3922, + "step": 17486 + }, + { + "epoch": 0.9381437768240344, + "grad_norm": 0.466796875, + "learning_rate": 4.68122516012729e-06, + "loss": 2.1763, + "step": 17487 + }, + { + "epoch": 0.9381974248927039, + "grad_norm": 0.6953125, + "learning_rate": 4.681182708193018e-06, + "loss": 2.2889, + "step": 17488 + }, + { + "epoch": 0.9382510729613734, + "grad_norm": 0.5625, + "learning_rate": 4.681140253624741e-06, + "loss": 2.2257, + "step": 17489 + }, + { + "epoch": 0.9383047210300429, + "grad_norm": 0.73046875, + "learning_rate": 4.681097796422511e-06, + "loss": 2.5271, + "step": 17490 + }, + { + "epoch": 0.9383583690987124, + "grad_norm": 0.466796875, + "learning_rate": 4.681055336586379e-06, + "loss": 2.4408, + "step": 17491 + }, + { + "epoch": 0.938412017167382, + "grad_norm": 0.46875, + "learning_rate": 4.6810128741163955e-06, + "loss": 2.5416, + "step": 17492 + }, + { + "epoch": 0.9384656652360515, + "grad_norm": 0.484375, + "learning_rate": 4.6809704090126135e-06, + "loss": 2.319, + "step": 17493 + }, + { + "epoch": 0.938519313304721, + "grad_norm": 0.41796875, + "learning_rate": 4.680927941275083e-06, + "loss": 2.284, + "step": 17494 + }, + { + "epoch": 0.9385729613733905, + "grad_norm": 0.470703125, + "learning_rate": 4.6808854709038565e-06, + "loss": 2.3385, + "step": 17495 + }, + { + "epoch": 0.9386266094420601, + "grad_norm": 0.796875, + "learning_rate": 4.680842997898984e-06, + "loss": 1.8793, + "step": 17496 + }, + { + "epoch": 0.9386802575107296, + "grad_norm": 0.421875, + "learning_rate": 4.680800522260518e-06, + "loss": 2.3117, + "step": 17497 + }, + { + "epoch": 0.9387339055793992, + "grad_norm": 0.5703125, + "learning_rate": 4.680758043988508e-06, + "loss": 2.2477, + "step": 17498 + }, + { + "epoch": 0.9387875536480687, + "grad_norm": 0.41015625, + "learning_rate": 4.680715563083007e-06, + "loss": 2.079, + "step": 17499 + }, + { + "epoch": 0.9388412017167382, + "grad_norm": 0.498046875, + "learning_rate": 4.6806730795440656e-06, + "loss": 2.2317, + "step": 17500 + }, + { + "epoch": 0.9388948497854077, + "grad_norm": 0.443359375, + "learning_rate": 4.680630593371736e-06, + "loss": 2.1793, + "step": 17501 + }, + { + "epoch": 0.9389484978540773, + "grad_norm": 0.53125, + "learning_rate": 4.680588104566068e-06, + "loss": 2.3825, + "step": 17502 + }, + { + "epoch": 0.9390021459227468, + "grad_norm": 0.453125, + "learning_rate": 4.680545613127113e-06, + "loss": 2.3836, + "step": 17503 + }, + { + "epoch": 0.9390557939914163, + "grad_norm": 0.44921875, + "learning_rate": 4.6805031190549245e-06, + "loss": 2.2103, + "step": 17504 + }, + { + "epoch": 0.9391094420600858, + "grad_norm": 3.703125, + "learning_rate": 4.680460622349551e-06, + "loss": 1.2311, + "step": 17505 + }, + { + "epoch": 0.9391630901287553, + "grad_norm": 0.45703125, + "learning_rate": 4.680418123011046e-06, + "loss": 2.5425, + "step": 17506 + }, + { + "epoch": 0.9392167381974249, + "grad_norm": 0.44921875, + "learning_rate": 4.680375621039459e-06, + "loss": 2.3418, + "step": 17507 + }, + { + "epoch": 0.9392703862660944, + "grad_norm": 0.51953125, + "learning_rate": 4.680333116434842e-06, + "loss": 2.2579, + "step": 17508 + }, + { + "epoch": 0.939324034334764, + "grad_norm": 0.490234375, + "learning_rate": 4.680290609197248e-06, + "loss": 2.0995, + "step": 17509 + }, + { + "epoch": 0.9393776824034334, + "grad_norm": 0.47265625, + "learning_rate": 4.680248099326726e-06, + "loss": 2.2567, + "step": 17510 + }, + { + "epoch": 0.939431330472103, + "grad_norm": 0.474609375, + "learning_rate": 4.680205586823327e-06, + "loss": 2.451, + "step": 17511 + }, + { + "epoch": 0.9394849785407725, + "grad_norm": 0.4296875, + "learning_rate": 4.6801630716871064e-06, + "loss": 2.3811, + "step": 17512 + }, + { + "epoch": 0.9395386266094421, + "grad_norm": 1.859375, + "learning_rate": 4.680120553918111e-06, + "loss": 1.7076, + "step": 17513 + }, + { + "epoch": 0.9395922746781116, + "grad_norm": 0.40234375, + "learning_rate": 4.680078033516393e-06, + "loss": 2.1767, + "step": 17514 + }, + { + "epoch": 0.9396459227467812, + "grad_norm": 0.490234375, + "learning_rate": 4.680035510482006e-06, + "loss": 2.4134, + "step": 17515 + }, + { + "epoch": 0.9396995708154506, + "grad_norm": 0.462890625, + "learning_rate": 4.679992984814999e-06, + "loss": 2.3526, + "step": 17516 + }, + { + "epoch": 0.9397532188841202, + "grad_norm": 0.404296875, + "learning_rate": 4.679950456515425e-06, + "loss": 2.2851, + "step": 17517 + }, + { + "epoch": 0.9398068669527897, + "grad_norm": 0.3984375, + "learning_rate": 4.679907925583335e-06, + "loss": 2.1502, + "step": 17518 + }, + { + "epoch": 0.9398605150214592, + "grad_norm": 0.47265625, + "learning_rate": 4.67986539201878e-06, + "loss": 2.1606, + "step": 17519 + }, + { + "epoch": 0.9399141630901288, + "grad_norm": 0.486328125, + "learning_rate": 4.679822855821811e-06, + "loss": 2.2718, + "step": 17520 + }, + { + "epoch": 0.9399678111587982, + "grad_norm": 0.94140625, + "learning_rate": 4.679780316992479e-06, + "loss": 2.2476, + "step": 17521 + }, + { + "epoch": 0.9400214592274678, + "grad_norm": 0.44921875, + "learning_rate": 4.679737775530837e-06, + "loss": 2.2964, + "step": 17522 + }, + { + "epoch": 0.9400751072961373, + "grad_norm": 0.40625, + "learning_rate": 4.679695231436936e-06, + "loss": 2.1187, + "step": 17523 + }, + { + "epoch": 0.9401287553648069, + "grad_norm": 0.46484375, + "learning_rate": 4.679652684710825e-06, + "loss": 2.1178, + "step": 17524 + }, + { + "epoch": 0.9401824034334764, + "grad_norm": 0.56640625, + "learning_rate": 4.679610135352559e-06, + "loss": 2.1845, + "step": 17525 + }, + { + "epoch": 0.940236051502146, + "grad_norm": 0.64453125, + "learning_rate": 4.679567583362187e-06, + "loss": 2.0508, + "step": 17526 + }, + { + "epoch": 0.9402896995708154, + "grad_norm": 0.484375, + "learning_rate": 4.679525028739761e-06, + "loss": 2.2645, + "step": 17527 + }, + { + "epoch": 0.940343347639485, + "grad_norm": 0.796875, + "learning_rate": 4.679482471485332e-06, + "loss": 2.0591, + "step": 17528 + }, + { + "epoch": 0.9403969957081545, + "grad_norm": 0.431640625, + "learning_rate": 4.679439911598952e-06, + "loss": 2.2579, + "step": 17529 + }, + { + "epoch": 0.9404506437768241, + "grad_norm": 0.451171875, + "learning_rate": 4.679397349080673e-06, + "loss": 2.1679, + "step": 17530 + }, + { + "epoch": 0.9405042918454936, + "grad_norm": 0.4765625, + "learning_rate": 4.679354783930545e-06, + "loss": 2.4728, + "step": 17531 + }, + { + "epoch": 0.9405579399141631, + "grad_norm": 0.408203125, + "learning_rate": 4.67931221614862e-06, + "loss": 2.3862, + "step": 17532 + }, + { + "epoch": 0.9406115879828326, + "grad_norm": 0.40625, + "learning_rate": 4.6792696457349495e-06, + "loss": 2.0105, + "step": 17533 + }, + { + "epoch": 0.9406652360515021, + "grad_norm": 0.5234375, + "learning_rate": 4.679227072689585e-06, + "loss": 2.3822, + "step": 17534 + }, + { + "epoch": 0.9407188841201717, + "grad_norm": 0.5, + "learning_rate": 4.679184497012578e-06, + "loss": 2.254, + "step": 17535 + }, + { + "epoch": 0.9407725321888412, + "grad_norm": 0.462890625, + "learning_rate": 4.679141918703979e-06, + "loss": 2.2589, + "step": 17536 + }, + { + "epoch": 0.9408261802575107, + "grad_norm": 0.55859375, + "learning_rate": 4.679099337763839e-06, + "loss": 2.3575, + "step": 17537 + }, + { + "epoch": 0.9408798283261802, + "grad_norm": 0.51953125, + "learning_rate": 4.679056754192212e-06, + "loss": 2.4794, + "step": 17538 + }, + { + "epoch": 0.9409334763948498, + "grad_norm": 0.494140625, + "learning_rate": 4.679014167989148e-06, + "loss": 2.3794, + "step": 17539 + }, + { + "epoch": 0.9409871244635193, + "grad_norm": 0.404296875, + "learning_rate": 4.678971579154698e-06, + "loss": 2.2924, + "step": 17540 + }, + { + "epoch": 0.9410407725321889, + "grad_norm": 0.515625, + "learning_rate": 4.6789289876889135e-06, + "loss": 2.5395, + "step": 17541 + }, + { + "epoch": 0.9410944206008584, + "grad_norm": 0.48046875, + "learning_rate": 4.678886393591846e-06, + "loss": 2.4227, + "step": 17542 + }, + { + "epoch": 0.9411480686695279, + "grad_norm": 0.435546875, + "learning_rate": 4.678843796863548e-06, + "loss": 2.459, + "step": 17543 + }, + { + "epoch": 0.9412017167381974, + "grad_norm": 0.466796875, + "learning_rate": 4.678801197504069e-06, + "loss": 2.2478, + "step": 17544 + }, + { + "epoch": 0.941255364806867, + "grad_norm": 0.4609375, + "learning_rate": 4.678758595513462e-06, + "loss": 2.249, + "step": 17545 + }, + { + "epoch": 0.9413090128755365, + "grad_norm": 0.474609375, + "learning_rate": 4.678715990891778e-06, + "loss": 2.1262, + "step": 17546 + }, + { + "epoch": 0.941362660944206, + "grad_norm": 0.5859375, + "learning_rate": 4.678673383639069e-06, + "loss": 2.5117, + "step": 17547 + }, + { + "epoch": 0.9414163090128755, + "grad_norm": 0.4765625, + "learning_rate": 4.678630773755384e-06, + "loss": 2.3561, + "step": 17548 + }, + { + "epoch": 0.941469957081545, + "grad_norm": 0.466796875, + "learning_rate": 4.678588161240778e-06, + "loss": 2.2165, + "step": 17549 + }, + { + "epoch": 0.9415236051502146, + "grad_norm": 0.51953125, + "learning_rate": 4.6785455460953e-06, + "loss": 2.3094, + "step": 17550 + }, + { + "epoch": 0.9415772532188841, + "grad_norm": 0.470703125, + "learning_rate": 4.6785029283190026e-06, + "loss": 2.3956, + "step": 17551 + }, + { + "epoch": 0.9416309012875537, + "grad_norm": 0.39453125, + "learning_rate": 4.678460307911937e-06, + "loss": 2.2286, + "step": 17552 + }, + { + "epoch": 0.9416845493562231, + "grad_norm": 0.470703125, + "learning_rate": 4.678417684874155e-06, + "loss": 2.412, + "step": 17553 + }, + { + "epoch": 0.9417381974248927, + "grad_norm": 0.4375, + "learning_rate": 4.678375059205706e-06, + "loss": 2.4189, + "step": 17554 + }, + { + "epoch": 0.9417918454935622, + "grad_norm": 0.5546875, + "learning_rate": 4.6783324309066444e-06, + "loss": 2.2537, + "step": 17555 + }, + { + "epoch": 0.9418454935622318, + "grad_norm": 0.48046875, + "learning_rate": 4.678289799977021e-06, + "loss": 2.1249, + "step": 17556 + }, + { + "epoch": 0.9418991416309013, + "grad_norm": 0.86328125, + "learning_rate": 4.678247166416885e-06, + "loss": 2.3183, + "step": 17557 + }, + { + "epoch": 0.9419527896995709, + "grad_norm": 0.5546875, + "learning_rate": 4.6782045302262904e-06, + "loss": 2.2015, + "step": 17558 + }, + { + "epoch": 0.9420064377682403, + "grad_norm": 0.484375, + "learning_rate": 4.678161891405288e-06, + "loss": 2.161, + "step": 17559 + }, + { + "epoch": 0.9420600858369099, + "grad_norm": 0.4296875, + "learning_rate": 4.6781192499539285e-06, + "loss": 2.244, + "step": 17560 + }, + { + "epoch": 0.9421137339055794, + "grad_norm": 0.48828125, + "learning_rate": 4.678076605872265e-06, + "loss": 2.4145, + "step": 17561 + }, + { + "epoch": 0.9421673819742489, + "grad_norm": 0.49609375, + "learning_rate": 4.6780339591603476e-06, + "loss": 2.1738, + "step": 17562 + }, + { + "epoch": 0.9422210300429185, + "grad_norm": 0.482421875, + "learning_rate": 4.6779913098182275e-06, + "loss": 2.1413, + "step": 17563 + }, + { + "epoch": 0.9422746781115879, + "grad_norm": 0.546875, + "learning_rate": 4.6779486578459576e-06, + "loss": 2.3309, + "step": 17564 + }, + { + "epoch": 0.9423283261802575, + "grad_norm": 0.61328125, + "learning_rate": 4.677906003243589e-06, + "loss": 2.2436, + "step": 17565 + }, + { + "epoch": 0.942381974248927, + "grad_norm": 0.498046875, + "learning_rate": 4.677863346011173e-06, + "loss": 2.1568, + "step": 17566 + }, + { + "epoch": 0.9424356223175966, + "grad_norm": 0.416015625, + "learning_rate": 4.67782068614876e-06, + "loss": 2.2249, + "step": 17567 + }, + { + "epoch": 0.9424892703862661, + "grad_norm": 0.447265625, + "learning_rate": 4.677778023656404e-06, + "loss": 2.1654, + "step": 17568 + }, + { + "epoch": 0.9425429184549357, + "grad_norm": 0.5234375, + "learning_rate": 4.677735358534155e-06, + "loss": 1.6549, + "step": 17569 + }, + { + "epoch": 0.9425965665236051, + "grad_norm": 0.51171875, + "learning_rate": 4.677692690782063e-06, + "loss": 2.3495, + "step": 17570 + }, + { + "epoch": 0.9426502145922747, + "grad_norm": 0.443359375, + "learning_rate": 4.677650020400182e-06, + "loss": 2.1646, + "step": 17571 + }, + { + "epoch": 0.9427038626609442, + "grad_norm": 0.50390625, + "learning_rate": 4.677607347388563e-06, + "loss": 2.2599, + "step": 17572 + }, + { + "epoch": 0.9427575107296138, + "grad_norm": 0.58203125, + "learning_rate": 4.677564671747257e-06, + "loss": 2.3875, + "step": 17573 + }, + { + "epoch": 0.9428111587982833, + "grad_norm": 0.6015625, + "learning_rate": 4.6775219934763164e-06, + "loss": 2.2923, + "step": 17574 + }, + { + "epoch": 0.9428648068669528, + "grad_norm": 0.48046875, + "learning_rate": 4.677479312575791e-06, + "loss": 2.1187, + "step": 17575 + }, + { + "epoch": 0.9429184549356223, + "grad_norm": 0.58203125, + "learning_rate": 4.677436629045734e-06, + "loss": 2.1622, + "step": 17576 + }, + { + "epoch": 0.9429721030042918, + "grad_norm": 0.421875, + "learning_rate": 4.677393942886196e-06, + "loss": 1.9146, + "step": 17577 + }, + { + "epoch": 0.9430257510729614, + "grad_norm": 0.67578125, + "learning_rate": 4.67735125409723e-06, + "loss": 1.9976, + "step": 17578 + }, + { + "epoch": 0.9430793991416309, + "grad_norm": 0.42578125, + "learning_rate": 4.677308562678885e-06, + "loss": 2.2989, + "step": 17579 + }, + { + "epoch": 0.9431330472103004, + "grad_norm": 0.578125, + "learning_rate": 4.677265868631215e-06, + "loss": 2.1893, + "step": 17580 + }, + { + "epoch": 0.9431866952789699, + "grad_norm": 0.50390625, + "learning_rate": 4.67722317195427e-06, + "loss": 2.0241, + "step": 17581 + }, + { + "epoch": 0.9432403433476395, + "grad_norm": 0.451171875, + "learning_rate": 4.677180472648103e-06, + "loss": 2.186, + "step": 17582 + }, + { + "epoch": 0.943293991416309, + "grad_norm": 0.427734375, + "learning_rate": 4.677137770712764e-06, + "loss": 2.2781, + "step": 17583 + }, + { + "epoch": 0.9433476394849786, + "grad_norm": 0.458984375, + "learning_rate": 4.677095066148305e-06, + "loss": 2.4137, + "step": 17584 + }, + { + "epoch": 0.943401287553648, + "grad_norm": 0.396484375, + "learning_rate": 4.677052358954778e-06, + "loss": 2.0561, + "step": 17585 + }, + { + "epoch": 0.9434549356223176, + "grad_norm": 0.373046875, + "learning_rate": 4.6770096491322346e-06, + "loss": 2.0955, + "step": 17586 + }, + { + "epoch": 0.9435085836909871, + "grad_norm": 0.482421875, + "learning_rate": 4.676966936680726e-06, + "loss": 2.3536, + "step": 17587 + }, + { + "epoch": 0.9435622317596567, + "grad_norm": 0.453125, + "learning_rate": 4.676924221600304e-06, + "loss": 2.0219, + "step": 17588 + }, + { + "epoch": 0.9436158798283262, + "grad_norm": 0.625, + "learning_rate": 4.676881503891021e-06, + "loss": 2.4451, + "step": 17589 + }, + { + "epoch": 0.9436695278969958, + "grad_norm": 0.48828125, + "learning_rate": 4.6768387835529275e-06, + "loss": 2.5062, + "step": 17590 + }, + { + "epoch": 0.9437231759656652, + "grad_norm": 0.7109375, + "learning_rate": 4.676796060586074e-06, + "loss": 2.3574, + "step": 17591 + }, + { + "epoch": 0.9437768240343347, + "grad_norm": 0.47265625, + "learning_rate": 4.676753334990515e-06, + "loss": 2.379, + "step": 17592 + }, + { + "epoch": 0.9438304721030043, + "grad_norm": 0.60546875, + "learning_rate": 4.676710606766299e-06, + "loss": 1.7875, + "step": 17593 + }, + { + "epoch": 0.9438841201716738, + "grad_norm": 0.48828125, + "learning_rate": 4.67666787591348e-06, + "loss": 2.3558, + "step": 17594 + }, + { + "epoch": 0.9439377682403434, + "grad_norm": 0.447265625, + "learning_rate": 4.676625142432108e-06, + "loss": 2.2594, + "step": 17595 + }, + { + "epoch": 0.9439914163090128, + "grad_norm": 0.515625, + "learning_rate": 4.676582406322236e-06, + "loss": 2.371, + "step": 17596 + }, + { + "epoch": 0.9440450643776824, + "grad_norm": 0.470703125, + "learning_rate": 4.676539667583916e-06, + "loss": 2.1469, + "step": 17597 + }, + { + "epoch": 0.9440987124463519, + "grad_norm": 0.453125, + "learning_rate": 4.676496926217197e-06, + "loss": 2.32, + "step": 17598 + }, + { + "epoch": 0.9441523605150215, + "grad_norm": 0.498046875, + "learning_rate": 4.676454182222133e-06, + "loss": 2.4927, + "step": 17599 + }, + { + "epoch": 0.944206008583691, + "grad_norm": 0.44140625, + "learning_rate": 4.676411435598774e-06, + "loss": 2.1153, + "step": 17600 + }, + { + "epoch": 0.9442596566523606, + "grad_norm": 0.4375, + "learning_rate": 4.6763686863471725e-06, + "loss": 2.1279, + "step": 17601 + }, + { + "epoch": 0.94431330472103, + "grad_norm": 0.53125, + "learning_rate": 4.676325934467381e-06, + "loss": 2.5651, + "step": 17602 + }, + { + "epoch": 0.9443669527896996, + "grad_norm": 0.478515625, + "learning_rate": 4.676283179959449e-06, + "loss": 2.5047, + "step": 17603 + }, + { + "epoch": 0.9444206008583691, + "grad_norm": 0.419921875, + "learning_rate": 4.67624042282343e-06, + "loss": 2.0441, + "step": 17604 + }, + { + "epoch": 0.9444742489270386, + "grad_norm": 0.6484375, + "learning_rate": 4.676197663059374e-06, + "loss": 2.1627, + "step": 17605 + }, + { + "epoch": 0.9445278969957082, + "grad_norm": 0.74609375, + "learning_rate": 4.676154900667335e-06, + "loss": 2.4068, + "step": 17606 + }, + { + "epoch": 0.9445815450643776, + "grad_norm": 0.53515625, + "learning_rate": 4.6761121356473615e-06, + "loss": 2.1501, + "step": 17607 + }, + { + "epoch": 0.9446351931330472, + "grad_norm": 1.1953125, + "learning_rate": 4.6760693679995086e-06, + "loss": 2.417, + "step": 17608 + }, + { + "epoch": 0.9446888412017167, + "grad_norm": 0.462890625, + "learning_rate": 4.676026597723825e-06, + "loss": 2.3736, + "step": 17609 + }, + { + "epoch": 0.9447424892703863, + "grad_norm": 0.66796875, + "learning_rate": 4.675983824820364e-06, + "loss": 2.169, + "step": 17610 + }, + { + "epoch": 0.9447961373390558, + "grad_norm": 0.494140625, + "learning_rate": 4.675941049289176e-06, + "loss": 2.5438, + "step": 17611 + }, + { + "epoch": 0.9448497854077254, + "grad_norm": 0.53125, + "learning_rate": 4.675898271130313e-06, + "loss": 2.2648, + "step": 17612 + }, + { + "epoch": 0.9449034334763948, + "grad_norm": 0.51953125, + "learning_rate": 4.675855490343829e-06, + "loss": 2.3863, + "step": 17613 + }, + { + "epoch": 0.9449570815450644, + "grad_norm": 0.546875, + "learning_rate": 4.675812706929773e-06, + "loss": 1.9642, + "step": 17614 + }, + { + "epoch": 0.9450107296137339, + "grad_norm": 0.41796875, + "learning_rate": 4.6757699208881966e-06, + "loss": 2.1284, + "step": 17615 + }, + { + "epoch": 0.9450643776824035, + "grad_norm": 0.470703125, + "learning_rate": 4.675727132219153e-06, + "loss": 2.2819, + "step": 17616 + }, + { + "epoch": 0.945118025751073, + "grad_norm": 0.4609375, + "learning_rate": 4.675684340922694e-06, + "loss": 2.2514, + "step": 17617 + }, + { + "epoch": 0.9451716738197425, + "grad_norm": 0.47265625, + "learning_rate": 4.675641546998869e-06, + "loss": 2.6714, + "step": 17618 + }, + { + "epoch": 0.945225321888412, + "grad_norm": 0.66015625, + "learning_rate": 4.675598750447732e-06, + "loss": 1.5334, + "step": 17619 + }, + { + "epoch": 0.9452789699570815, + "grad_norm": 0.5390625, + "learning_rate": 4.675555951269333e-06, + "loss": 2.4488, + "step": 17620 + }, + { + "epoch": 0.9453326180257511, + "grad_norm": 0.82421875, + "learning_rate": 4.675513149463725e-06, + "loss": 2.2437, + "step": 17621 + }, + { + "epoch": 0.9453862660944206, + "grad_norm": 0.61328125, + "learning_rate": 4.675470345030959e-06, + "loss": 1.5038, + "step": 17622 + }, + { + "epoch": 0.9454399141630901, + "grad_norm": 0.515625, + "learning_rate": 4.675427537971087e-06, + "loss": 2.0865, + "step": 17623 + }, + { + "epoch": 0.9454935622317596, + "grad_norm": 0.41796875, + "learning_rate": 4.675384728284161e-06, + "loss": 2.4615, + "step": 17624 + }, + { + "epoch": 0.9455472103004292, + "grad_norm": 0.49609375, + "learning_rate": 4.675341915970232e-06, + "loss": 2.3302, + "step": 17625 + }, + { + "epoch": 0.9456008583690987, + "grad_norm": 0.43359375, + "learning_rate": 4.675299101029351e-06, + "loss": 2.3182, + "step": 17626 + }, + { + "epoch": 0.9456545064377683, + "grad_norm": 0.4765625, + "learning_rate": 4.675256283461572e-06, + "loss": 2.3121, + "step": 17627 + }, + { + "epoch": 0.9457081545064377, + "grad_norm": 0.484375, + "learning_rate": 4.675213463266944e-06, + "loss": 2.4289, + "step": 17628 + }, + { + "epoch": 0.9457618025751073, + "grad_norm": 0.46875, + "learning_rate": 4.675170640445521e-06, + "loss": 2.349, + "step": 17629 + }, + { + "epoch": 0.9458154506437768, + "grad_norm": 0.453125, + "learning_rate": 4.675127814997353e-06, + "loss": 2.2948, + "step": 17630 + }, + { + "epoch": 0.9458690987124464, + "grad_norm": 0.427734375, + "learning_rate": 4.675084986922493e-06, + "loss": 2.2045, + "step": 17631 + }, + { + "epoch": 0.9459227467811159, + "grad_norm": 0.625, + "learning_rate": 4.675042156220993e-06, + "loss": 2.3161, + "step": 17632 + }, + { + "epoch": 0.9459763948497855, + "grad_norm": 1.0625, + "learning_rate": 4.674999322892903e-06, + "loss": 2.0993, + "step": 17633 + }, + { + "epoch": 0.9460300429184549, + "grad_norm": 0.466796875, + "learning_rate": 4.674956486938276e-06, + "loss": 2.4357, + "step": 17634 + }, + { + "epoch": 0.9460836909871244, + "grad_norm": 0.9296875, + "learning_rate": 4.674913648357164e-06, + "loss": 2.4085, + "step": 17635 + }, + { + "epoch": 0.946137339055794, + "grad_norm": 0.427734375, + "learning_rate": 4.674870807149617e-06, + "loss": 2.2776, + "step": 17636 + }, + { + "epoch": 0.9461909871244635, + "grad_norm": 0.494140625, + "learning_rate": 4.674827963315688e-06, + "loss": 2.3961, + "step": 17637 + }, + { + "epoch": 0.9462446351931331, + "grad_norm": 0.478515625, + "learning_rate": 4.674785116855429e-06, + "loss": 2.336, + "step": 17638 + }, + { + "epoch": 0.9462982832618025, + "grad_norm": 0.52734375, + "learning_rate": 4.674742267768891e-06, + "loss": 2.4184, + "step": 17639 + }, + { + "epoch": 0.9463519313304721, + "grad_norm": 0.56640625, + "learning_rate": 4.674699416056127e-06, + "loss": 1.6024, + "step": 17640 + }, + { + "epoch": 0.9464055793991416, + "grad_norm": 0.453125, + "learning_rate": 4.674656561717187e-06, + "loss": 2.076, + "step": 17641 + }, + { + "epoch": 0.9464592274678112, + "grad_norm": 0.453125, + "learning_rate": 4.674613704752124e-06, + "loss": 2.3244, + "step": 17642 + }, + { + "epoch": 0.9465128755364807, + "grad_norm": 0.455078125, + "learning_rate": 4.674570845160988e-06, + "loss": 2.4787, + "step": 17643 + }, + { + "epoch": 0.9465665236051503, + "grad_norm": 0.9609375, + "learning_rate": 4.6745279829438335e-06, + "loss": 2.5088, + "step": 17644 + }, + { + "epoch": 0.9466201716738197, + "grad_norm": 0.43359375, + "learning_rate": 4.674485118100711e-06, + "loss": 2.1777, + "step": 17645 + }, + { + "epoch": 0.9466738197424893, + "grad_norm": 0.52734375, + "learning_rate": 4.674442250631671e-06, + "loss": 2.2226, + "step": 17646 + }, + { + "epoch": 0.9467274678111588, + "grad_norm": 0.62109375, + "learning_rate": 4.674399380536767e-06, + "loss": 2.2295, + "step": 17647 + }, + { + "epoch": 0.9467811158798283, + "grad_norm": 0.462890625, + "learning_rate": 4.6743565078160506e-06, + "loss": 2.1733, + "step": 17648 + }, + { + "epoch": 0.9468347639484979, + "grad_norm": 0.423828125, + "learning_rate": 4.674313632469573e-06, + "loss": 2.4258, + "step": 17649 + }, + { + "epoch": 0.9468884120171673, + "grad_norm": 0.482421875, + "learning_rate": 4.674270754497385e-06, + "loss": 2.3242, + "step": 17650 + }, + { + "epoch": 0.9469420600858369, + "grad_norm": 0.60546875, + "learning_rate": 4.674227873899541e-06, + "loss": 2.5546, + "step": 17651 + }, + { + "epoch": 0.9469957081545064, + "grad_norm": 0.46875, + "learning_rate": 4.674184990676091e-06, + "loss": 2.1586, + "step": 17652 + }, + { + "epoch": 0.947049356223176, + "grad_norm": 0.416015625, + "learning_rate": 4.674142104827086e-06, + "loss": 2.2933, + "step": 17653 + }, + { + "epoch": 0.9471030042918455, + "grad_norm": 0.482421875, + "learning_rate": 4.67409921635258e-06, + "loss": 2.3316, + "step": 17654 + }, + { + "epoch": 0.947156652360515, + "grad_norm": 0.447265625, + "learning_rate": 4.674056325252623e-06, + "loss": 2.2557, + "step": 17655 + }, + { + "epoch": 0.9472103004291845, + "grad_norm": 0.494140625, + "learning_rate": 4.674013431527268e-06, + "loss": 2.3362, + "step": 17656 + }, + { + "epoch": 0.9472639484978541, + "grad_norm": 0.5234375, + "learning_rate": 4.673970535176565e-06, + "loss": 2.272, + "step": 17657 + }, + { + "epoch": 0.9473175965665236, + "grad_norm": 0.498046875, + "learning_rate": 4.673927636200568e-06, + "loss": 2.2075, + "step": 17658 + }, + { + "epoch": 0.9473712446351932, + "grad_norm": 0.482421875, + "learning_rate": 4.673884734599328e-06, + "loss": 2.2975, + "step": 17659 + }, + { + "epoch": 0.9474248927038627, + "grad_norm": 0.60546875, + "learning_rate": 4.6738418303728965e-06, + "loss": 2.4723, + "step": 17660 + }, + { + "epoch": 0.9474785407725322, + "grad_norm": 0.6015625, + "learning_rate": 4.6737989235213255e-06, + "loss": 2.4783, + "step": 17661 + }, + { + "epoch": 0.9475321888412017, + "grad_norm": 0.474609375, + "learning_rate": 4.673756014044666e-06, + "loss": 2.4154, + "step": 17662 + }, + { + "epoch": 0.9475858369098712, + "grad_norm": 0.58203125, + "learning_rate": 4.673713101942972e-06, + "loss": 1.569, + "step": 17663 + }, + { + "epoch": 0.9476394849785408, + "grad_norm": 0.44921875, + "learning_rate": 4.673670187216293e-06, + "loss": 2.4146, + "step": 17664 + }, + { + "epoch": 0.9476931330472103, + "grad_norm": 0.390625, + "learning_rate": 4.673627269864681e-06, + "loss": 2.2973, + "step": 17665 + }, + { + "epoch": 0.9477467811158798, + "grad_norm": 0.462890625, + "learning_rate": 4.67358434988819e-06, + "loss": 2.1561, + "step": 17666 + }, + { + "epoch": 0.9478004291845493, + "grad_norm": 0.53125, + "learning_rate": 4.6735414272868696e-06, + "loss": 2.2162, + "step": 17667 + }, + { + "epoch": 0.9478540772532189, + "grad_norm": 0.51171875, + "learning_rate": 4.673498502060773e-06, + "loss": 2.1689, + "step": 17668 + }, + { + "epoch": 0.9479077253218884, + "grad_norm": 0.482421875, + "learning_rate": 4.673455574209951e-06, + "loss": 2.0554, + "step": 17669 + }, + { + "epoch": 0.947961373390558, + "grad_norm": 0.4140625, + "learning_rate": 4.673412643734455e-06, + "loss": 2.4154, + "step": 17670 + }, + { + "epoch": 0.9480150214592274, + "grad_norm": 0.5234375, + "learning_rate": 4.673369710634339e-06, + "loss": 2.3805, + "step": 17671 + }, + { + "epoch": 0.948068669527897, + "grad_norm": 0.74609375, + "learning_rate": 4.673326774909653e-06, + "loss": 2.3269, + "step": 17672 + }, + { + "epoch": 0.9481223175965665, + "grad_norm": 0.4609375, + "learning_rate": 4.67328383656045e-06, + "loss": 2.4638, + "step": 17673 + }, + { + "epoch": 0.9481759656652361, + "grad_norm": 0.515625, + "learning_rate": 4.6732408955867814e-06, + "loss": 2.3006, + "step": 17674 + }, + { + "epoch": 0.9482296137339056, + "grad_norm": 0.396484375, + "learning_rate": 4.673197951988699e-06, + "loss": 2.4958, + "step": 17675 + }, + { + "epoch": 0.9482832618025752, + "grad_norm": 0.62890625, + "learning_rate": 4.673155005766254e-06, + "loss": 2.3594, + "step": 17676 + }, + { + "epoch": 0.9483369098712446, + "grad_norm": 0.50390625, + "learning_rate": 4.673112056919499e-06, + "loss": 2.2661, + "step": 17677 + }, + { + "epoch": 0.9483905579399141, + "grad_norm": 0.443359375, + "learning_rate": 4.6730691054484856e-06, + "loss": 2.0571, + "step": 17678 + }, + { + "epoch": 0.9484442060085837, + "grad_norm": 0.4609375, + "learning_rate": 4.673026151353266e-06, + "loss": 2.1984, + "step": 17679 + }, + { + "epoch": 0.9484978540772532, + "grad_norm": 0.48046875, + "learning_rate": 4.672983194633892e-06, + "loss": 2.3811, + "step": 17680 + }, + { + "epoch": 0.9485515021459228, + "grad_norm": 0.439453125, + "learning_rate": 4.6729402352904155e-06, + "loss": 2.3689, + "step": 17681 + }, + { + "epoch": 0.9486051502145922, + "grad_norm": 0.43359375, + "learning_rate": 4.672897273322888e-06, + "loss": 2.0243, + "step": 17682 + }, + { + "epoch": 0.9486587982832618, + "grad_norm": 0.4453125, + "learning_rate": 4.672854308731361e-06, + "loss": 2.3173, + "step": 17683 + }, + { + "epoch": 0.9487124463519313, + "grad_norm": 0.423828125, + "learning_rate": 4.672811341515888e-06, + "loss": 1.8696, + "step": 17684 + }, + { + "epoch": 0.9487660944206009, + "grad_norm": 0.466796875, + "learning_rate": 4.672768371676519e-06, + "loss": 2.4498, + "step": 17685 + }, + { + "epoch": 0.9488197424892704, + "grad_norm": 0.58984375, + "learning_rate": 4.672725399213307e-06, + "loss": 2.2172, + "step": 17686 + }, + { + "epoch": 0.94887339055794, + "grad_norm": 0.41015625, + "learning_rate": 4.672682424126304e-06, + "loss": 2.4583, + "step": 17687 + }, + { + "epoch": 0.9489270386266094, + "grad_norm": 0.44140625, + "learning_rate": 4.672639446415561e-06, + "loss": 1.9716, + "step": 17688 + }, + { + "epoch": 0.948980686695279, + "grad_norm": 0.443359375, + "learning_rate": 4.672596466081132e-06, + "loss": 2.3287, + "step": 17689 + }, + { + "epoch": 0.9490343347639485, + "grad_norm": 0.431640625, + "learning_rate": 4.672553483123065e-06, + "loss": 2.1995, + "step": 17690 + }, + { + "epoch": 0.949087982832618, + "grad_norm": 0.443359375, + "learning_rate": 4.672510497541416e-06, + "loss": 2.4459, + "step": 17691 + }, + { + "epoch": 0.9491416309012876, + "grad_norm": 0.48046875, + "learning_rate": 4.672467509336235e-06, + "loss": 2.3475, + "step": 17692 + }, + { + "epoch": 0.949195278969957, + "grad_norm": 0.345703125, + "learning_rate": 4.672424518507574e-06, + "loss": 2.2736, + "step": 17693 + }, + { + "epoch": 0.9492489270386266, + "grad_norm": 0.51953125, + "learning_rate": 4.672381525055485e-06, + "loss": 2.4275, + "step": 17694 + }, + { + "epoch": 0.9493025751072961, + "grad_norm": 0.46875, + "learning_rate": 4.67233852898002e-06, + "loss": 2.1459, + "step": 17695 + }, + { + "epoch": 0.9493562231759657, + "grad_norm": 0.48828125, + "learning_rate": 4.67229553028123e-06, + "loss": 2.3832, + "step": 17696 + }, + { + "epoch": 0.9494098712446352, + "grad_norm": 0.41796875, + "learning_rate": 4.672252528959169e-06, + "loss": 2.245, + "step": 17697 + }, + { + "epoch": 0.9494635193133047, + "grad_norm": 0.462890625, + "learning_rate": 4.672209525013887e-06, + "loss": 2.3327, + "step": 17698 + }, + { + "epoch": 0.9495171673819742, + "grad_norm": 0.59765625, + "learning_rate": 4.672166518445437e-06, + "loss": 2.3333, + "step": 17699 + }, + { + "epoch": 0.9495708154506438, + "grad_norm": 0.51953125, + "learning_rate": 4.6721235092538695e-06, + "loss": 2.4332, + "step": 17700 + }, + { + "epoch": 0.9496244635193133, + "grad_norm": 0.453125, + "learning_rate": 4.672080497439239e-06, + "loss": 2.3138, + "step": 17701 + }, + { + "epoch": 0.9496781115879829, + "grad_norm": 0.7421875, + "learning_rate": 4.672037483001595e-06, + "loss": 2.0316, + "step": 17702 + }, + { + "epoch": 0.9497317596566524, + "grad_norm": 0.56640625, + "learning_rate": 4.67199446594099e-06, + "loss": 2.5245, + "step": 17703 + }, + { + "epoch": 0.9497854077253219, + "grad_norm": 0.44140625, + "learning_rate": 4.6719514462574774e-06, + "loss": 2.2531, + "step": 17704 + }, + { + "epoch": 0.9498390557939914, + "grad_norm": 0.44140625, + "learning_rate": 4.671908423951108e-06, + "loss": 2.2538, + "step": 17705 + }, + { + "epoch": 0.9498927038626609, + "grad_norm": 0.46875, + "learning_rate": 4.6718653990219336e-06, + "loss": 2.3439, + "step": 17706 + }, + { + "epoch": 0.9499463519313305, + "grad_norm": 0.443359375, + "learning_rate": 4.6718223714700066e-06, + "loss": 2.2391, + "step": 17707 + }, + { + "epoch": 0.95, + "grad_norm": 0.609375, + "learning_rate": 4.671779341295378e-06, + "loss": 2.3037, + "step": 17708 + }, + { + "epoch": 0.9500536480686695, + "grad_norm": 0.48828125, + "learning_rate": 4.671736308498101e-06, + "loss": 2.3707, + "step": 17709 + }, + { + "epoch": 0.950107296137339, + "grad_norm": 0.4921875, + "learning_rate": 4.671693273078227e-06, + "loss": 2.3577, + "step": 17710 + }, + { + "epoch": 0.9501609442060086, + "grad_norm": 0.43359375, + "learning_rate": 4.671650235035808e-06, + "loss": 2.1663, + "step": 17711 + }, + { + "epoch": 0.9502145922746781, + "grad_norm": 2.5625, + "learning_rate": 4.671607194370896e-06, + "loss": 2.4535, + "step": 17712 + }, + { + "epoch": 0.9502682403433477, + "grad_norm": 0.515625, + "learning_rate": 4.671564151083544e-06, + "loss": 2.232, + "step": 17713 + }, + { + "epoch": 0.9503218884120171, + "grad_norm": 0.4609375, + "learning_rate": 4.671521105173802e-06, + "loss": 2.3944, + "step": 17714 + }, + { + "epoch": 0.9503755364806867, + "grad_norm": 0.51171875, + "learning_rate": 4.671478056641723e-06, + "loss": 2.4427, + "step": 17715 + }, + { + "epoch": 0.9504291845493562, + "grad_norm": 0.361328125, + "learning_rate": 4.6714350054873596e-06, + "loss": 2.147, + "step": 17716 + }, + { + "epoch": 0.9504828326180258, + "grad_norm": 0.451171875, + "learning_rate": 4.671391951710763e-06, + "loss": 2.5017, + "step": 17717 + }, + { + "epoch": 0.9505364806866953, + "grad_norm": 0.447265625, + "learning_rate": 4.671348895311985e-06, + "loss": 2.3607, + "step": 17718 + }, + { + "epoch": 0.9505901287553649, + "grad_norm": 0.6171875, + "learning_rate": 4.671305836291078e-06, + "loss": 2.2515, + "step": 17719 + }, + { + "epoch": 0.9506437768240343, + "grad_norm": 0.484375, + "learning_rate": 4.671262774648094e-06, + "loss": 2.4014, + "step": 17720 + }, + { + "epoch": 0.9506974248927038, + "grad_norm": 0.46484375, + "learning_rate": 4.671219710383085e-06, + "loss": 2.2972, + "step": 17721 + }, + { + "epoch": 0.9507510729613734, + "grad_norm": 0.466796875, + "learning_rate": 4.6711766434961025e-06, + "loss": 2.2917, + "step": 17722 + }, + { + "epoch": 0.9508047210300429, + "grad_norm": 0.431640625, + "learning_rate": 4.6711335739872e-06, + "loss": 2.3161, + "step": 17723 + }, + { + "epoch": 0.9508583690987125, + "grad_norm": 0.61328125, + "learning_rate": 4.671090501856427e-06, + "loss": 2.2815, + "step": 17724 + }, + { + "epoch": 0.9509120171673819, + "grad_norm": 0.498046875, + "learning_rate": 4.671047427103838e-06, + "loss": 2.8428, + "step": 17725 + }, + { + "epoch": 0.9509656652360515, + "grad_norm": 0.494140625, + "learning_rate": 4.671004349729484e-06, + "loss": 2.2197, + "step": 17726 + }, + { + "epoch": 0.951019313304721, + "grad_norm": 0.466796875, + "learning_rate": 4.670961269733416e-06, + "loss": 2.2837, + "step": 17727 + }, + { + "epoch": 0.9510729613733906, + "grad_norm": 0.48828125, + "learning_rate": 4.670918187115688e-06, + "loss": 2.2087, + "step": 17728 + }, + { + "epoch": 0.9511266094420601, + "grad_norm": 0.482421875, + "learning_rate": 4.670875101876351e-06, + "loss": 2.2309, + "step": 17729 + }, + { + "epoch": 0.9511802575107297, + "grad_norm": 0.5078125, + "learning_rate": 4.670832014015456e-06, + "loss": 2.3335, + "step": 17730 + }, + { + "epoch": 0.9512339055793991, + "grad_norm": 0.498046875, + "learning_rate": 4.670788923533058e-06, + "loss": 2.0628, + "step": 17731 + }, + { + "epoch": 0.9512875536480687, + "grad_norm": 0.60546875, + "learning_rate": 4.670745830429206e-06, + "loss": 2.3384, + "step": 17732 + }, + { + "epoch": 0.9513412017167382, + "grad_norm": 0.4375, + "learning_rate": 4.670702734703954e-06, + "loss": 2.277, + "step": 17733 + }, + { + "epoch": 0.9513948497854077, + "grad_norm": 0.58984375, + "learning_rate": 4.670659636357352e-06, + "loss": 2.4134, + "step": 17734 + }, + { + "epoch": 0.9514484978540773, + "grad_norm": 0.435546875, + "learning_rate": 4.670616535389454e-06, + "loss": 2.3504, + "step": 17735 + }, + { + "epoch": 0.9515021459227467, + "grad_norm": 0.365234375, + "learning_rate": 4.670573431800311e-06, + "loss": 2.2333, + "step": 17736 + }, + { + "epoch": 0.9515557939914163, + "grad_norm": 0.498046875, + "learning_rate": 4.670530325589976e-06, + "loss": 2.4197, + "step": 17737 + }, + { + "epoch": 0.9516094420600858, + "grad_norm": 0.6328125, + "learning_rate": 4.6704872167585e-06, + "loss": 2.4402, + "step": 17738 + }, + { + "epoch": 0.9516630901287554, + "grad_norm": 0.44921875, + "learning_rate": 4.670444105305936e-06, + "loss": 2.4907, + "step": 17739 + }, + { + "epoch": 0.9517167381974249, + "grad_norm": 0.451171875, + "learning_rate": 4.6704009912323345e-06, + "loss": 2.2876, + "step": 17740 + }, + { + "epoch": 0.9517703862660944, + "grad_norm": 0.474609375, + "learning_rate": 4.6703578745377495e-06, + "loss": 2.0598, + "step": 17741 + }, + { + "epoch": 0.9518240343347639, + "grad_norm": 0.45703125, + "learning_rate": 4.670314755222232e-06, + "loss": 2.2764, + "step": 17742 + }, + { + "epoch": 0.9518776824034335, + "grad_norm": 0.498046875, + "learning_rate": 4.6702716332858346e-06, + "loss": 2.1527, + "step": 17743 + }, + { + "epoch": 0.951931330472103, + "grad_norm": 0.49609375, + "learning_rate": 4.670228508728608e-06, + "loss": 2.0266, + "step": 17744 + }, + { + "epoch": 0.9519849785407726, + "grad_norm": 0.43359375, + "learning_rate": 4.670185381550606e-06, + "loss": 2.1893, + "step": 17745 + }, + { + "epoch": 0.952038626609442, + "grad_norm": 0.453125, + "learning_rate": 4.67014225175188e-06, + "loss": 2.2744, + "step": 17746 + }, + { + "epoch": 0.9520922746781116, + "grad_norm": 0.5, + "learning_rate": 4.670099119332482e-06, + "loss": 2.26, + "step": 17747 + }, + { + "epoch": 0.9521459227467811, + "grad_norm": 0.474609375, + "learning_rate": 4.670055984292464e-06, + "loss": 2.4299, + "step": 17748 + }, + { + "epoch": 0.9521995708154506, + "grad_norm": 0.50390625, + "learning_rate": 4.6700128466318785e-06, + "loss": 2.3774, + "step": 17749 + }, + { + "epoch": 0.9522532188841202, + "grad_norm": 0.498046875, + "learning_rate": 4.669969706350778e-06, + "loss": 2.397, + "step": 17750 + }, + { + "epoch": 0.9523068669527897, + "grad_norm": 0.64453125, + "learning_rate": 4.669926563449213e-06, + "loss": 1.2123, + "step": 17751 + }, + { + "epoch": 0.9523605150214592, + "grad_norm": 0.91015625, + "learning_rate": 4.669883417927237e-06, + "loss": 2.2807, + "step": 17752 + }, + { + "epoch": 0.9524141630901287, + "grad_norm": 0.5390625, + "learning_rate": 4.669840269784901e-06, + "loss": 2.3976, + "step": 17753 + }, + { + "epoch": 0.9524678111587983, + "grad_norm": 0.439453125, + "learning_rate": 4.669797119022259e-06, + "loss": 2.0201, + "step": 17754 + }, + { + "epoch": 0.9525214592274678, + "grad_norm": 0.5, + "learning_rate": 4.669753965639361e-06, + "loss": 2.3915, + "step": 17755 + }, + { + "epoch": 0.9525751072961374, + "grad_norm": 0.486328125, + "learning_rate": 4.669710809636261e-06, + "loss": 2.5378, + "step": 17756 + }, + { + "epoch": 0.9526287553648068, + "grad_norm": 0.88671875, + "learning_rate": 4.669667651013009e-06, + "loss": 2.363, + "step": 17757 + }, + { + "epoch": 0.9526824034334764, + "grad_norm": 0.5, + "learning_rate": 4.669624489769658e-06, + "loss": 2.2271, + "step": 17758 + }, + { + "epoch": 0.9527360515021459, + "grad_norm": 0.55859375, + "learning_rate": 4.669581325906261e-06, + "loss": 2.3797, + "step": 17759 + }, + { + "epoch": 0.9527896995708155, + "grad_norm": 0.486328125, + "learning_rate": 4.669538159422869e-06, + "loss": 2.3051, + "step": 17760 + }, + { + "epoch": 0.952843347639485, + "grad_norm": 0.421875, + "learning_rate": 4.669494990319535e-06, + "loss": 2.0563, + "step": 17761 + }, + { + "epoch": 0.9528969957081546, + "grad_norm": 0.404296875, + "learning_rate": 4.66945181859631e-06, + "loss": 2.1109, + "step": 17762 + }, + { + "epoch": 0.952950643776824, + "grad_norm": 0.486328125, + "learning_rate": 4.669408644253249e-06, + "loss": 2.4816, + "step": 17763 + }, + { + "epoch": 0.9530042918454935, + "grad_norm": 0.46875, + "learning_rate": 4.669365467290399e-06, + "loss": 2.2783, + "step": 17764 + }, + { + "epoch": 0.9530579399141631, + "grad_norm": 0.515625, + "learning_rate": 4.669322287707817e-06, + "loss": 2.4452, + "step": 17765 + }, + { + "epoch": 0.9531115879828326, + "grad_norm": 0.50390625, + "learning_rate": 4.669279105505553e-06, + "loss": 2.469, + "step": 17766 + }, + { + "epoch": 0.9531652360515022, + "grad_norm": 0.380859375, + "learning_rate": 4.669235920683659e-06, + "loss": 1.8192, + "step": 17767 + }, + { + "epoch": 0.9532188841201716, + "grad_norm": 0.388671875, + "learning_rate": 4.669192733242188e-06, + "loss": 2.0291, + "step": 17768 + }, + { + "epoch": 0.9532725321888412, + "grad_norm": 0.84765625, + "learning_rate": 4.669149543181191e-06, + "loss": 2.2618, + "step": 17769 + }, + { + "epoch": 0.9533261802575107, + "grad_norm": 0.43359375, + "learning_rate": 4.669106350500722e-06, + "loss": 2.2735, + "step": 17770 + }, + { + "epoch": 0.9533798283261803, + "grad_norm": 0.47265625, + "learning_rate": 4.669063155200832e-06, + "loss": 2.6165, + "step": 17771 + }, + { + "epoch": 0.9534334763948498, + "grad_norm": 0.5, + "learning_rate": 4.669019957281572e-06, + "loss": 2.3344, + "step": 17772 + }, + { + "epoch": 0.9534871244635194, + "grad_norm": 0.4609375, + "learning_rate": 4.668976756742996e-06, + "loss": 2.3425, + "step": 17773 + }, + { + "epoch": 0.9535407725321888, + "grad_norm": 0.62109375, + "learning_rate": 4.668933553585155e-06, + "loss": 2.2297, + "step": 17774 + }, + { + "epoch": 0.9535944206008584, + "grad_norm": 0.466796875, + "learning_rate": 4.668890347808103e-06, + "loss": 2.3055, + "step": 17775 + }, + { + "epoch": 0.9536480686695279, + "grad_norm": 0.375, + "learning_rate": 4.668847139411889e-06, + "loss": 2.2759, + "step": 17776 + }, + { + "epoch": 0.9537017167381975, + "grad_norm": 0.5, + "learning_rate": 4.668803928396569e-06, + "loss": 2.3727, + "step": 17777 + }, + { + "epoch": 0.953755364806867, + "grad_norm": 0.453125, + "learning_rate": 4.668760714762193e-06, + "loss": 2.1305, + "step": 17778 + }, + { + "epoch": 0.9538090128755364, + "grad_norm": 0.63671875, + "learning_rate": 4.668717498508812e-06, + "loss": 2.4295, + "step": 17779 + }, + { + "epoch": 0.953862660944206, + "grad_norm": 0.384765625, + "learning_rate": 4.668674279636481e-06, + "loss": 2.1053, + "step": 17780 + }, + { + "epoch": 0.9539163090128755, + "grad_norm": 0.63671875, + "learning_rate": 4.66863105814525e-06, + "loss": 2.1757, + "step": 17781 + }, + { + "epoch": 0.9539699570815451, + "grad_norm": 0.4765625, + "learning_rate": 4.668587834035172e-06, + "loss": 2.004, + "step": 17782 + }, + { + "epoch": 0.9540236051502146, + "grad_norm": 0.51171875, + "learning_rate": 4.6685446073063e-06, + "loss": 2.2865, + "step": 17783 + }, + { + "epoch": 0.9540772532188841, + "grad_norm": 0.4375, + "learning_rate": 4.668501377958685e-06, + "loss": 2.2347, + "step": 17784 + }, + { + "epoch": 0.9541309012875536, + "grad_norm": 0.490234375, + "learning_rate": 4.668458145992379e-06, + "loss": 2.3226, + "step": 17785 + }, + { + "epoch": 0.9541845493562232, + "grad_norm": 0.48046875, + "learning_rate": 4.668414911407436e-06, + "loss": 2.1917, + "step": 17786 + }, + { + "epoch": 0.9542381974248927, + "grad_norm": 0.48828125, + "learning_rate": 4.668371674203907e-06, + "loss": 2.176, + "step": 17787 + }, + { + "epoch": 0.9542918454935623, + "grad_norm": 0.453125, + "learning_rate": 4.668328434381844e-06, + "loss": 2.3225, + "step": 17788 + }, + { + "epoch": 0.9543454935622318, + "grad_norm": 0.48046875, + "learning_rate": 4.668285191941298e-06, + "loss": 2.2614, + "step": 17789 + }, + { + "epoch": 0.9543991416309013, + "grad_norm": 0.62109375, + "learning_rate": 4.6682419468823245e-06, + "loss": 2.2065, + "step": 17790 + }, + { + "epoch": 0.9544527896995708, + "grad_norm": 0.41015625, + "learning_rate": 4.668198699204974e-06, + "loss": 2.0477, + "step": 17791 + }, + { + "epoch": 0.9545064377682403, + "grad_norm": 0.462890625, + "learning_rate": 4.6681554489092964e-06, + "loss": 2.2284, + "step": 17792 + }, + { + "epoch": 0.9545600858369099, + "grad_norm": 0.5234375, + "learning_rate": 4.668112195995348e-06, + "loss": 2.2762, + "step": 17793 + }, + { + "epoch": 0.9546137339055794, + "grad_norm": 0.5, + "learning_rate": 4.668068940463179e-06, + "loss": 2.3979, + "step": 17794 + }, + { + "epoch": 0.9546673819742489, + "grad_norm": 0.470703125, + "learning_rate": 4.668025682312841e-06, + "loss": 2.4425, + "step": 17795 + }, + { + "epoch": 0.9547210300429184, + "grad_norm": 0.5859375, + "learning_rate": 4.667982421544388e-06, + "loss": 1.9595, + "step": 17796 + }, + { + "epoch": 0.954774678111588, + "grad_norm": 0.361328125, + "learning_rate": 4.667939158157871e-06, + "loss": 2.0722, + "step": 17797 + }, + { + "epoch": 0.9548283261802575, + "grad_norm": 0.59765625, + "learning_rate": 4.667895892153342e-06, + "loss": 2.2138, + "step": 17798 + }, + { + "epoch": 0.9548819742489271, + "grad_norm": 0.43359375, + "learning_rate": 4.667852623530855e-06, + "loss": 2.3499, + "step": 17799 + }, + { + "epoch": 0.9549356223175965, + "grad_norm": 0.443359375, + "learning_rate": 4.66780935229046e-06, + "loss": 2.1612, + "step": 17800 + }, + { + "epoch": 0.9549892703862661, + "grad_norm": 0.490234375, + "learning_rate": 4.667766078432211e-06, + "loss": 2.3547, + "step": 17801 + }, + { + "epoch": 0.9550429184549356, + "grad_norm": 1.015625, + "learning_rate": 4.667722801956159e-06, + "loss": 2.2162, + "step": 17802 + }, + { + "epoch": 0.9550965665236052, + "grad_norm": 0.48828125, + "learning_rate": 4.667679522862357e-06, + "loss": 2.2287, + "step": 17803 + }, + { + "epoch": 0.9551502145922747, + "grad_norm": 0.421875, + "learning_rate": 4.667636241150857e-06, + "loss": 2.1874, + "step": 17804 + }, + { + "epoch": 0.9552038626609443, + "grad_norm": 0.39453125, + "learning_rate": 4.667592956821711e-06, + "loss": 2.1661, + "step": 17805 + }, + { + "epoch": 0.9552575107296137, + "grad_norm": 0.69140625, + "learning_rate": 4.667549669874973e-06, + "loss": 2.4888, + "step": 17806 + }, + { + "epoch": 0.9553111587982832, + "grad_norm": 0.478515625, + "learning_rate": 4.667506380310692e-06, + "loss": 2.5963, + "step": 17807 + }, + { + "epoch": 0.9553648068669528, + "grad_norm": 0.5, + "learning_rate": 4.667463088128923e-06, + "loss": 2.2806, + "step": 17808 + }, + { + "epoch": 0.9554184549356223, + "grad_norm": 0.46484375, + "learning_rate": 4.667419793329718e-06, + "loss": 2.4153, + "step": 17809 + }, + { + "epoch": 0.9554721030042919, + "grad_norm": 0.47265625, + "learning_rate": 4.667376495913128e-06, + "loss": 2.3438, + "step": 17810 + }, + { + "epoch": 0.9555257510729613, + "grad_norm": 0.5546875, + "learning_rate": 4.667333195879207e-06, + "loss": 2.4212, + "step": 17811 + }, + { + "epoch": 0.9555793991416309, + "grad_norm": 0.427734375, + "learning_rate": 4.667289893228005e-06, + "loss": 2.1834, + "step": 17812 + }, + { + "epoch": 0.9556330472103004, + "grad_norm": 0.4609375, + "learning_rate": 4.667246587959577e-06, + "loss": 2.2095, + "step": 17813 + }, + { + "epoch": 0.95568669527897, + "grad_norm": 0.51171875, + "learning_rate": 4.667203280073973e-06, + "loss": 2.3813, + "step": 17814 + }, + { + "epoch": 0.9557403433476395, + "grad_norm": 0.47265625, + "learning_rate": 4.6671599695712466e-06, + "loss": 2.0616, + "step": 17815 + }, + { + "epoch": 0.955793991416309, + "grad_norm": 0.53515625, + "learning_rate": 4.667116656451449e-06, + "loss": 2.0706, + "step": 17816 + }, + { + "epoch": 0.9558476394849785, + "grad_norm": 0.69140625, + "learning_rate": 4.667073340714634e-06, + "loss": 2.1913, + "step": 17817 + }, + { + "epoch": 0.9559012875536481, + "grad_norm": 0.6953125, + "learning_rate": 4.667030022360853e-06, + "loss": 2.3662, + "step": 17818 + }, + { + "epoch": 0.9559549356223176, + "grad_norm": 0.416015625, + "learning_rate": 4.666986701390158e-06, + "loss": 2.2546, + "step": 17819 + }, + { + "epoch": 0.9560085836909872, + "grad_norm": 0.462890625, + "learning_rate": 4.666943377802602e-06, + "loss": 2.4478, + "step": 17820 + }, + { + "epoch": 0.9560622317596567, + "grad_norm": 0.48828125, + "learning_rate": 4.666900051598237e-06, + "loss": 2.3468, + "step": 17821 + }, + { + "epoch": 0.9561158798283261, + "grad_norm": 0.498046875, + "learning_rate": 4.666856722777116e-06, + "loss": 2.346, + "step": 17822 + }, + { + "epoch": 0.9561695278969957, + "grad_norm": 0.5078125, + "learning_rate": 4.666813391339289e-06, + "loss": 1.7465, + "step": 17823 + }, + { + "epoch": 0.9562231759656652, + "grad_norm": 0.427734375, + "learning_rate": 4.6667700572848115e-06, + "loss": 2.4238, + "step": 17824 + }, + { + "epoch": 0.9562768240343348, + "grad_norm": 0.50390625, + "learning_rate": 4.666726720613734e-06, + "loss": 2.448, + "step": 17825 + }, + { + "epoch": 0.9563304721030043, + "grad_norm": 0.3828125, + "learning_rate": 4.666683381326109e-06, + "loss": 2.0573, + "step": 17826 + }, + { + "epoch": 0.9563841201716738, + "grad_norm": 0.4453125, + "learning_rate": 4.666640039421989e-06, + "loss": 2.1582, + "step": 17827 + }, + { + "epoch": 0.9564377682403433, + "grad_norm": 0.453125, + "learning_rate": 4.6665966949014264e-06, + "loss": 2.2258, + "step": 17828 + }, + { + "epoch": 0.9564914163090129, + "grad_norm": 0.66015625, + "learning_rate": 4.666553347764474e-06, + "loss": 2.4949, + "step": 17829 + }, + { + "epoch": 0.9565450643776824, + "grad_norm": 0.4609375, + "learning_rate": 4.6665099980111836e-06, + "loss": 2.2261, + "step": 17830 + }, + { + "epoch": 0.956598712446352, + "grad_norm": 0.52734375, + "learning_rate": 4.666466645641607e-06, + "loss": 2.3395, + "step": 17831 + }, + { + "epoch": 0.9566523605150214, + "grad_norm": 0.546875, + "learning_rate": 4.666423290655798e-06, + "loss": 2.2474, + "step": 17832 + }, + { + "epoch": 0.956706008583691, + "grad_norm": 0.48046875, + "learning_rate": 4.666379933053808e-06, + "loss": 2.1317, + "step": 17833 + }, + { + "epoch": 0.9567596566523605, + "grad_norm": 0.5078125, + "learning_rate": 4.666336572835688e-06, + "loss": 2.3581, + "step": 17834 + }, + { + "epoch": 0.95681330472103, + "grad_norm": 4.90625, + "learning_rate": 4.666293210001494e-06, + "loss": 2.5209, + "step": 17835 + }, + { + "epoch": 0.9568669527896996, + "grad_norm": 0.51171875, + "learning_rate": 4.666249844551275e-06, + "loss": 2.2747, + "step": 17836 + }, + { + "epoch": 0.956920600858369, + "grad_norm": 0.8046875, + "learning_rate": 4.6662064764850844e-06, + "loss": 2.3146, + "step": 17837 + }, + { + "epoch": 0.9569742489270386, + "grad_norm": 0.390625, + "learning_rate": 4.666163105802974e-06, + "loss": 1.8342, + "step": 17838 + }, + { + "epoch": 0.9570278969957081, + "grad_norm": 0.56640625, + "learning_rate": 4.666119732504999e-06, + "loss": 2.6517, + "step": 17839 + }, + { + "epoch": 0.9570815450643777, + "grad_norm": 0.47265625, + "learning_rate": 4.666076356591208e-06, + "loss": 2.3902, + "step": 17840 + }, + { + "epoch": 0.9571351931330472, + "grad_norm": 0.412109375, + "learning_rate": 4.666032978061656e-06, + "loss": 2.1366, + "step": 17841 + }, + { + "epoch": 0.9571888412017168, + "grad_norm": 0.412109375, + "learning_rate": 4.665989596916395e-06, + "loss": 2.3877, + "step": 17842 + }, + { + "epoch": 0.9572424892703862, + "grad_norm": 0.640625, + "learning_rate": 4.665946213155476e-06, + "loss": 2.481, + "step": 17843 + }, + { + "epoch": 0.9572961373390558, + "grad_norm": 0.4453125, + "learning_rate": 4.665902826778952e-06, + "loss": 2.1459, + "step": 17844 + }, + { + "epoch": 0.9573497854077253, + "grad_norm": 0.435546875, + "learning_rate": 4.665859437786876e-06, + "loss": 2.2057, + "step": 17845 + }, + { + "epoch": 0.9574034334763949, + "grad_norm": 0.5234375, + "learning_rate": 4.665816046179299e-06, + "loss": 2.3876, + "step": 17846 + }, + { + "epoch": 0.9574570815450644, + "grad_norm": 0.36328125, + "learning_rate": 4.665772651956276e-06, + "loss": 2.0955, + "step": 17847 + }, + { + "epoch": 0.957510729613734, + "grad_norm": 0.703125, + "learning_rate": 4.665729255117857e-06, + "loss": 2.1213, + "step": 17848 + }, + { + "epoch": 0.9575643776824034, + "grad_norm": 0.5703125, + "learning_rate": 4.665685855664095e-06, + "loss": 2.3521, + "step": 17849 + }, + { + "epoch": 0.9576180257510729, + "grad_norm": 0.57421875, + "learning_rate": 4.665642453595043e-06, + "loss": 2.1939, + "step": 17850 + }, + { + "epoch": 0.9576716738197425, + "grad_norm": 0.455078125, + "learning_rate": 4.665599048910753e-06, + "loss": 2.4719, + "step": 17851 + }, + { + "epoch": 0.957725321888412, + "grad_norm": 0.43359375, + "learning_rate": 4.665555641611278e-06, + "loss": 2.2006, + "step": 17852 + }, + { + "epoch": 0.9577789699570816, + "grad_norm": 0.478515625, + "learning_rate": 4.66551223169667e-06, + "loss": 2.1455, + "step": 17853 + }, + { + "epoch": 0.957832618025751, + "grad_norm": 0.5234375, + "learning_rate": 4.665468819166981e-06, + "loss": 2.2386, + "step": 17854 + }, + { + "epoch": 0.9578862660944206, + "grad_norm": 0.453125, + "learning_rate": 4.665425404022262e-06, + "loss": 2.2293, + "step": 17855 + }, + { + "epoch": 0.9579399141630901, + "grad_norm": 0.408203125, + "learning_rate": 4.665381986262569e-06, + "loss": 2.0173, + "step": 17856 + }, + { + "epoch": 0.9579935622317597, + "grad_norm": 0.455078125, + "learning_rate": 4.665338565887952e-06, + "loss": 2.1544, + "step": 17857 + }, + { + "epoch": 0.9580472103004292, + "grad_norm": 0.498046875, + "learning_rate": 4.665295142898465e-06, + "loss": 2.3091, + "step": 17858 + }, + { + "epoch": 0.9581008583690988, + "grad_norm": 0.4921875, + "learning_rate": 4.6652517172941586e-06, + "loss": 2.1978, + "step": 17859 + }, + { + "epoch": 0.9581545064377682, + "grad_norm": 0.91796875, + "learning_rate": 4.665208289075086e-06, + "loss": 2.3814, + "step": 17860 + }, + { + "epoch": 0.9582081545064378, + "grad_norm": 0.54296875, + "learning_rate": 4.6651648582413e-06, + "loss": 2.506, + "step": 17861 + }, + { + "epoch": 0.9582618025751073, + "grad_norm": 0.4765625, + "learning_rate": 4.6651214247928536e-06, + "loss": 2.5139, + "step": 17862 + }, + { + "epoch": 0.9583154506437769, + "grad_norm": 0.484375, + "learning_rate": 4.665077988729797e-06, + "loss": 2.171, + "step": 17863 + }, + { + "epoch": 0.9583690987124464, + "grad_norm": 0.48828125, + "learning_rate": 4.665034550052185e-06, + "loss": 2.3052, + "step": 17864 + }, + { + "epoch": 0.9584227467811158, + "grad_norm": 1.1640625, + "learning_rate": 4.6649911087600695e-06, + "loss": 2.5694, + "step": 17865 + }, + { + "epoch": 0.9584763948497854, + "grad_norm": 0.58203125, + "learning_rate": 4.664947664853502e-06, + "loss": 2.2674, + "step": 17866 + }, + { + "epoch": 0.9585300429184549, + "grad_norm": 0.4609375, + "learning_rate": 4.664904218332536e-06, + "loss": 2.0701, + "step": 17867 + }, + { + "epoch": 0.9585836909871245, + "grad_norm": 0.48046875, + "learning_rate": 4.664860769197223e-06, + "loss": 2.2611, + "step": 17868 + }, + { + "epoch": 0.958637339055794, + "grad_norm": 0.5, + "learning_rate": 4.664817317447616e-06, + "loss": 2.2912, + "step": 17869 + }, + { + "epoch": 0.9586909871244635, + "grad_norm": 0.376953125, + "learning_rate": 4.664773863083768e-06, + "loss": 2.2924, + "step": 17870 + }, + { + "epoch": 0.958744635193133, + "grad_norm": 0.5234375, + "learning_rate": 4.664730406105731e-06, + "loss": 2.3563, + "step": 17871 + }, + { + "epoch": 0.9587982832618026, + "grad_norm": 0.5078125, + "learning_rate": 4.664686946513557e-06, + "loss": 2.4982, + "step": 17872 + }, + { + "epoch": 0.9588519313304721, + "grad_norm": 0.4765625, + "learning_rate": 4.6646434843072984e-06, + "loss": 2.3215, + "step": 17873 + }, + { + "epoch": 0.9589055793991417, + "grad_norm": 0.41015625, + "learning_rate": 4.66460001948701e-06, + "loss": 2.2173, + "step": 17874 + }, + { + "epoch": 0.9589592274678111, + "grad_norm": 0.53515625, + "learning_rate": 4.664556552052741e-06, + "loss": 2.2479, + "step": 17875 + }, + { + "epoch": 0.9590128755364807, + "grad_norm": 0.4140625, + "learning_rate": 4.664513082004546e-06, + "loss": 2.2588, + "step": 17876 + }, + { + "epoch": 0.9590665236051502, + "grad_norm": 0.478515625, + "learning_rate": 4.664469609342476e-06, + "loss": 2.1312, + "step": 17877 + }, + { + "epoch": 0.9591201716738197, + "grad_norm": 0.4765625, + "learning_rate": 4.664426134066585e-06, + "loss": 2.3766, + "step": 17878 + }, + { + "epoch": 0.9591738197424893, + "grad_norm": 0.458984375, + "learning_rate": 4.664382656176926e-06, + "loss": 2.315, + "step": 17879 + }, + { + "epoch": 0.9592274678111588, + "grad_norm": 0.47265625, + "learning_rate": 4.664339175673549e-06, + "loss": 2.3653, + "step": 17880 + }, + { + "epoch": 0.9592811158798283, + "grad_norm": 0.5, + "learning_rate": 4.664295692556509e-06, + "loss": 2.2742, + "step": 17881 + }, + { + "epoch": 0.9593347639484978, + "grad_norm": 0.427734375, + "learning_rate": 4.6642522068258555e-06, + "loss": 2.1996, + "step": 17882 + }, + { + "epoch": 0.9593884120171674, + "grad_norm": 0.419921875, + "learning_rate": 4.664208718481644e-06, + "loss": 2.3342, + "step": 17883 + }, + { + "epoch": 0.9594420600858369, + "grad_norm": 0.5625, + "learning_rate": 4.664165227523927e-06, + "loss": 2.5298, + "step": 17884 + }, + { + "epoch": 0.9594957081545065, + "grad_norm": 0.55078125, + "learning_rate": 4.664121733952754e-06, + "loss": 2.4906, + "step": 17885 + }, + { + "epoch": 0.9595493562231759, + "grad_norm": 0.400390625, + "learning_rate": 4.6640782377681815e-06, + "loss": 2.0652, + "step": 17886 + }, + { + "epoch": 0.9596030042918455, + "grad_norm": 0.341796875, + "learning_rate": 4.664034738970259e-06, + "loss": 2.055, + "step": 17887 + }, + { + "epoch": 0.959656652360515, + "grad_norm": 0.451171875, + "learning_rate": 4.66399123755904e-06, + "loss": 2.3918, + "step": 17888 + }, + { + "epoch": 0.9597103004291846, + "grad_norm": 0.421875, + "learning_rate": 4.663947733534577e-06, + "loss": 2.3045, + "step": 17889 + }, + { + "epoch": 0.9597639484978541, + "grad_norm": 0.46484375, + "learning_rate": 4.6639042268969226e-06, + "loss": 2.0995, + "step": 17890 + }, + { + "epoch": 0.9598175965665237, + "grad_norm": 0.58984375, + "learning_rate": 4.6638607176461295e-06, + "loss": 1.9653, + "step": 17891 + }, + { + "epoch": 0.9598712446351931, + "grad_norm": 0.48828125, + "learning_rate": 4.66381720578225e-06, + "loss": 2.095, + "step": 17892 + }, + { + "epoch": 0.9599248927038626, + "grad_norm": 0.4765625, + "learning_rate": 4.663773691305336e-06, + "loss": 2.1157, + "step": 17893 + }, + { + "epoch": 0.9599785407725322, + "grad_norm": 0.453125, + "learning_rate": 4.663730174215443e-06, + "loss": 2.1883, + "step": 17894 + }, + { + "epoch": 0.9600321888412017, + "grad_norm": 0.412109375, + "learning_rate": 4.663686654512619e-06, + "loss": 2.0253, + "step": 17895 + }, + { + "epoch": 0.9600858369098713, + "grad_norm": 0.458984375, + "learning_rate": 4.66364313219692e-06, + "loss": 2.3904, + "step": 17896 + }, + { + "epoch": 0.9601394849785407, + "grad_norm": 0.5, + "learning_rate": 4.663599607268397e-06, + "loss": 2.2621, + "step": 17897 + }, + { + "epoch": 0.9601931330472103, + "grad_norm": 0.5625, + "learning_rate": 4.663556079727104e-06, + "loss": 2.8706, + "step": 17898 + }, + { + "epoch": 0.9602467811158798, + "grad_norm": 0.5234375, + "learning_rate": 4.6635125495730915e-06, + "loss": 2.4845, + "step": 17899 + }, + { + "epoch": 0.9603004291845494, + "grad_norm": 0.40234375, + "learning_rate": 4.663469016806413e-06, + "loss": 2.3991, + "step": 17900 + }, + { + "epoch": 0.9603540772532189, + "grad_norm": 0.46875, + "learning_rate": 4.663425481427122e-06, + "loss": 2.6408, + "step": 17901 + }, + { + "epoch": 0.9604077253218885, + "grad_norm": 0.51171875, + "learning_rate": 4.66338194343527e-06, + "loss": 2.6081, + "step": 17902 + }, + { + "epoch": 0.9604613733905579, + "grad_norm": 0.49609375, + "learning_rate": 4.66333840283091e-06, + "loss": 2.2986, + "step": 17903 + }, + { + "epoch": 0.9605150214592275, + "grad_norm": 0.58203125, + "learning_rate": 4.663294859614095e-06, + "loss": 2.3463, + "step": 17904 + }, + { + "epoch": 0.960568669527897, + "grad_norm": 0.47265625, + "learning_rate": 4.663251313784876e-06, + "loss": 2.2568, + "step": 17905 + }, + { + "epoch": 0.9606223175965666, + "grad_norm": 0.4765625, + "learning_rate": 4.6632077653433065e-06, + "loss": 2.5032, + "step": 17906 + }, + { + "epoch": 0.960675965665236, + "grad_norm": 0.50390625, + "learning_rate": 4.663164214289439e-06, + "loss": 2.3227, + "step": 17907 + }, + { + "epoch": 0.9607296137339055, + "grad_norm": 0.44921875, + "learning_rate": 4.663120660623327e-06, + "loss": 2.2312, + "step": 17908 + }, + { + "epoch": 0.9607832618025751, + "grad_norm": 0.55859375, + "learning_rate": 4.663077104345023e-06, + "loss": 2.4935, + "step": 17909 + }, + { + "epoch": 0.9608369098712446, + "grad_norm": 6.0625, + "learning_rate": 4.663033545454578e-06, + "loss": 2.5312, + "step": 17910 + }, + { + "epoch": 0.9608905579399142, + "grad_norm": 0.55078125, + "learning_rate": 4.662989983952045e-06, + "loss": 1.9605, + "step": 17911 + }, + { + "epoch": 0.9609442060085837, + "grad_norm": 0.447265625, + "learning_rate": 4.662946419837478e-06, + "loss": 2.3152, + "step": 17912 + }, + { + "epoch": 0.9609978540772532, + "grad_norm": 0.44921875, + "learning_rate": 4.662902853110929e-06, + "loss": 2.3839, + "step": 17913 + }, + { + "epoch": 0.9610515021459227, + "grad_norm": 0.55078125, + "learning_rate": 4.6628592837724505e-06, + "loss": 2.3359, + "step": 17914 + }, + { + "epoch": 0.9611051502145923, + "grad_norm": 0.41015625, + "learning_rate": 4.662815711822095e-06, + "loss": 2.0931, + "step": 17915 + }, + { + "epoch": 0.9611587982832618, + "grad_norm": 0.482421875, + "learning_rate": 4.662772137259914e-06, + "loss": 2.2148, + "step": 17916 + }, + { + "epoch": 0.9612124463519314, + "grad_norm": 0.51171875, + "learning_rate": 4.6627285600859625e-06, + "loss": 2.2154, + "step": 17917 + }, + { + "epoch": 0.9612660944206008, + "grad_norm": 0.44140625, + "learning_rate": 4.662684980300291e-06, + "loss": 2.2285, + "step": 17918 + }, + { + "epoch": 0.9613197424892704, + "grad_norm": 0.75, + "learning_rate": 4.662641397902954e-06, + "loss": 1.908, + "step": 17919 + }, + { + "epoch": 0.9613733905579399, + "grad_norm": 0.54296875, + "learning_rate": 4.662597812894003e-06, + "loss": 2.303, + "step": 17920 + }, + { + "epoch": 0.9614270386266094, + "grad_norm": 0.51953125, + "learning_rate": 4.66255422527349e-06, + "loss": 2.4434, + "step": 17921 + }, + { + "epoch": 0.961480686695279, + "grad_norm": 0.478515625, + "learning_rate": 4.6625106350414694e-06, + "loss": 2.2799, + "step": 17922 + }, + { + "epoch": 0.9615343347639485, + "grad_norm": 0.43359375, + "learning_rate": 4.662467042197993e-06, + "loss": 2.1446, + "step": 17923 + }, + { + "epoch": 0.961587982832618, + "grad_norm": 0.451171875, + "learning_rate": 4.6624234467431115e-06, + "loss": 2.2036, + "step": 17924 + }, + { + "epoch": 0.9616416309012875, + "grad_norm": 0.5390625, + "learning_rate": 4.662379848676881e-06, + "loss": 2.3422, + "step": 17925 + }, + { + "epoch": 0.9616952789699571, + "grad_norm": 0.412109375, + "learning_rate": 4.662336247999352e-06, + "loss": 2.179, + "step": 17926 + }, + { + "epoch": 0.9617489270386266, + "grad_norm": 0.462890625, + "learning_rate": 4.662292644710578e-06, + "loss": 2.3648, + "step": 17927 + }, + { + "epoch": 0.9618025751072962, + "grad_norm": 0.484375, + "learning_rate": 4.66224903881061e-06, + "loss": 2.2384, + "step": 17928 + }, + { + "epoch": 0.9618562231759656, + "grad_norm": 0.470703125, + "learning_rate": 4.662205430299504e-06, + "loss": 2.4247, + "step": 17929 + }, + { + "epoch": 0.9619098712446352, + "grad_norm": 0.67578125, + "learning_rate": 4.662161819177309e-06, + "loss": 2.423, + "step": 17930 + }, + { + "epoch": 0.9619635193133047, + "grad_norm": 0.5, + "learning_rate": 4.6621182054440796e-06, + "loss": 1.9977, + "step": 17931 + }, + { + "epoch": 0.9620171673819743, + "grad_norm": 0.546875, + "learning_rate": 4.662074589099868e-06, + "loss": 2.0691, + "step": 17932 + }, + { + "epoch": 0.9620708154506438, + "grad_norm": 0.52734375, + "learning_rate": 4.662030970144727e-06, + "loss": 2.1087, + "step": 17933 + }, + { + "epoch": 0.9621244635193134, + "grad_norm": 0.55859375, + "learning_rate": 4.6619873485787105e-06, + "loss": 2.2873, + "step": 17934 + }, + { + "epoch": 0.9621781115879828, + "grad_norm": 0.546875, + "learning_rate": 4.661943724401868e-06, + "loss": 2.3634, + "step": 17935 + }, + { + "epoch": 0.9622317596566523, + "grad_norm": 0.37109375, + "learning_rate": 4.661900097614256e-06, + "loss": 2.0998, + "step": 17936 + }, + { + "epoch": 0.9622854077253219, + "grad_norm": 0.55859375, + "learning_rate": 4.661856468215924e-06, + "loss": 1.7207, + "step": 17937 + }, + { + "epoch": 0.9623390557939914, + "grad_norm": 0.43359375, + "learning_rate": 4.661812836206927e-06, + "loss": 2.2912, + "step": 17938 + }, + { + "epoch": 0.962392703862661, + "grad_norm": 0.40234375, + "learning_rate": 4.661769201587317e-06, + "loss": 2.1708, + "step": 17939 + }, + { + "epoch": 0.9624463519313304, + "grad_norm": 0.423828125, + "learning_rate": 4.661725564357146e-06, + "loss": 2.1242, + "step": 17940 + }, + { + "epoch": 0.9625, + "grad_norm": 0.423828125, + "learning_rate": 4.661681924516466e-06, + "loss": 2.0708, + "step": 17941 + }, + { + "epoch": 0.9625536480686695, + "grad_norm": 0.419921875, + "learning_rate": 4.661638282065332e-06, + "loss": 2.2372, + "step": 17942 + }, + { + "epoch": 0.9626072961373391, + "grad_norm": 0.435546875, + "learning_rate": 4.661594637003795e-06, + "loss": 2.2896, + "step": 17943 + }, + { + "epoch": 0.9626609442060086, + "grad_norm": 0.49609375, + "learning_rate": 4.6615509893319085e-06, + "loss": 2.4894, + "step": 17944 + }, + { + "epoch": 0.9627145922746781, + "grad_norm": 0.435546875, + "learning_rate": 4.661507339049725e-06, + "loss": 1.7269, + "step": 17945 + }, + { + "epoch": 0.9627682403433476, + "grad_norm": 0.4453125, + "learning_rate": 4.661463686157297e-06, + "loss": 2.3667, + "step": 17946 + }, + { + "epoch": 0.9628218884120172, + "grad_norm": 0.453125, + "learning_rate": 4.661420030654677e-06, + "loss": 2.4267, + "step": 17947 + }, + { + "epoch": 0.9628755364806867, + "grad_norm": 0.490234375, + "learning_rate": 4.661376372541918e-06, + "loss": 2.2758, + "step": 17948 + }, + { + "epoch": 0.9629291845493563, + "grad_norm": 0.455078125, + "learning_rate": 4.661332711819074e-06, + "loss": 2.2129, + "step": 17949 + }, + { + "epoch": 0.9629828326180258, + "grad_norm": 0.5078125, + "learning_rate": 4.661289048486195e-06, + "loss": 2.3918, + "step": 17950 + }, + { + "epoch": 0.9630364806866952, + "grad_norm": 0.462890625, + "learning_rate": 4.661245382543336e-06, + "loss": 2.1831, + "step": 17951 + }, + { + "epoch": 0.9630901287553648, + "grad_norm": 0.44921875, + "learning_rate": 4.661201713990549e-06, + "loss": 2.1745, + "step": 17952 + }, + { + "epoch": 0.9631437768240343, + "grad_norm": 0.470703125, + "learning_rate": 4.661158042827887e-06, + "loss": 2.0545, + "step": 17953 + }, + { + "epoch": 0.9631974248927039, + "grad_norm": 0.5078125, + "learning_rate": 4.661114369055402e-06, + "loss": 2.3423, + "step": 17954 + }, + { + "epoch": 0.9632510729613734, + "grad_norm": 0.48046875, + "learning_rate": 4.6610706926731465e-06, + "loss": 2.2726, + "step": 17955 + }, + { + "epoch": 0.9633047210300429, + "grad_norm": 0.4765625, + "learning_rate": 4.661027013681175e-06, + "loss": 2.0591, + "step": 17956 + }, + { + "epoch": 0.9633583690987124, + "grad_norm": 0.47265625, + "learning_rate": 4.66098333207954e-06, + "loss": 2.2182, + "step": 17957 + }, + { + "epoch": 0.963412017167382, + "grad_norm": 0.52734375, + "learning_rate": 4.660939647868291e-06, + "loss": 2.3336, + "step": 17958 + }, + { + "epoch": 0.9634656652360515, + "grad_norm": 0.578125, + "learning_rate": 4.6608959610474844e-06, + "loss": 2.2744, + "step": 17959 + }, + { + "epoch": 0.9635193133047211, + "grad_norm": 0.3984375, + "learning_rate": 4.660852271617172e-06, + "loss": 2.3341, + "step": 17960 + }, + { + "epoch": 0.9635729613733905, + "grad_norm": 0.482421875, + "learning_rate": 4.6608085795774065e-06, + "loss": 2.2689, + "step": 17961 + }, + { + "epoch": 0.9636266094420601, + "grad_norm": 0.443359375, + "learning_rate": 4.66076488492824e-06, + "loss": 2.4305, + "step": 17962 + }, + { + "epoch": 0.9636802575107296, + "grad_norm": 0.4375, + "learning_rate": 4.660721187669725e-06, + "loss": 2.1985, + "step": 17963 + }, + { + "epoch": 0.9637339055793992, + "grad_norm": 0.408203125, + "learning_rate": 4.660677487801917e-06, + "loss": 2.0304, + "step": 17964 + }, + { + "epoch": 0.9637875536480687, + "grad_norm": 0.41015625, + "learning_rate": 4.660633785324865e-06, + "loss": 2.3126, + "step": 17965 + }, + { + "epoch": 0.9638412017167381, + "grad_norm": 0.365234375, + "learning_rate": 4.660590080238625e-06, + "loss": 1.9451, + "step": 17966 + }, + { + "epoch": 0.9638948497854077, + "grad_norm": 0.67578125, + "learning_rate": 4.660546372543247e-06, + "loss": 2.3733, + "step": 17967 + }, + { + "epoch": 0.9639484978540772, + "grad_norm": 0.498046875, + "learning_rate": 4.660502662238785e-06, + "loss": 2.196, + "step": 17968 + }, + { + "epoch": 0.9640021459227468, + "grad_norm": 0.46875, + "learning_rate": 4.660458949325293e-06, + "loss": 2.4398, + "step": 17969 + }, + { + "epoch": 0.9640557939914163, + "grad_norm": 0.546875, + "learning_rate": 4.660415233802822e-06, + "loss": 1.3928, + "step": 17970 + }, + { + "epoch": 0.9641094420600859, + "grad_norm": 0.51953125, + "learning_rate": 4.660371515671426e-06, + "loss": 2.1278, + "step": 17971 + }, + { + "epoch": 0.9641630901287553, + "grad_norm": 0.7734375, + "learning_rate": 4.660327794931157e-06, + "loss": 2.3782, + "step": 17972 + }, + { + "epoch": 0.9642167381974249, + "grad_norm": 0.49609375, + "learning_rate": 4.660284071582067e-06, + "loss": 2.2306, + "step": 17973 + }, + { + "epoch": 0.9642703862660944, + "grad_norm": 0.4609375, + "learning_rate": 4.660240345624211e-06, + "loss": 2.3412, + "step": 17974 + }, + { + "epoch": 0.964324034334764, + "grad_norm": 0.51171875, + "learning_rate": 4.66019661705764e-06, + "loss": 2.1774, + "step": 17975 + }, + { + "epoch": 0.9643776824034335, + "grad_norm": 0.466796875, + "learning_rate": 4.660152885882408e-06, + "loss": 2.2601, + "step": 17976 + }, + { + "epoch": 0.964431330472103, + "grad_norm": 0.515625, + "learning_rate": 4.660109152098567e-06, + "loss": 2.3757, + "step": 17977 + }, + { + "epoch": 0.9644849785407725, + "grad_norm": 0.50390625, + "learning_rate": 4.660065415706171e-06, + "loss": 2.3939, + "step": 17978 + }, + { + "epoch": 0.964538626609442, + "grad_norm": 0.494140625, + "learning_rate": 4.660021676705271e-06, + "loss": 2.4246, + "step": 17979 + }, + { + "epoch": 0.9645922746781116, + "grad_norm": 0.4609375, + "learning_rate": 4.659977935095921e-06, + "loss": 2.362, + "step": 17980 + }, + { + "epoch": 0.9646459227467811, + "grad_norm": 0.455078125, + "learning_rate": 4.659934190878174e-06, + "loss": 1.8477, + "step": 17981 + }, + { + "epoch": 0.9646995708154507, + "grad_norm": 0.54296875, + "learning_rate": 4.659890444052081e-06, + "loss": 2.3595, + "step": 17982 + }, + { + "epoch": 0.9647532188841201, + "grad_norm": 5.03125, + "learning_rate": 4.659846694617697e-06, + "loss": 2.2807, + "step": 17983 + }, + { + "epoch": 0.9648068669527897, + "grad_norm": 0.43359375, + "learning_rate": 4.659802942575075e-06, + "loss": 2.2645, + "step": 17984 + }, + { + "epoch": 0.9648605150214592, + "grad_norm": 0.48828125, + "learning_rate": 4.6597591879242655e-06, + "loss": 2.1295, + "step": 17985 + }, + { + "epoch": 0.9649141630901288, + "grad_norm": 0.4765625, + "learning_rate": 4.659715430665322e-06, + "loss": 2.3274, + "step": 17986 + }, + { + "epoch": 0.9649678111587983, + "grad_norm": 0.48046875, + "learning_rate": 4.6596716707983e-06, + "loss": 2.3047, + "step": 17987 + }, + { + "epoch": 0.9650214592274678, + "grad_norm": 0.49609375, + "learning_rate": 4.65962790832325e-06, + "loss": 2.3755, + "step": 17988 + }, + { + "epoch": 0.9650751072961373, + "grad_norm": 0.703125, + "learning_rate": 4.6595841432402245e-06, + "loss": 2.4964, + "step": 17989 + }, + { + "epoch": 0.9651287553648069, + "grad_norm": 0.3984375, + "learning_rate": 4.659540375549278e-06, + "loss": 2.1963, + "step": 17990 + }, + { + "epoch": 0.9651824034334764, + "grad_norm": 0.51953125, + "learning_rate": 4.659496605250461e-06, + "loss": 2.3464, + "step": 17991 + }, + { + "epoch": 0.965236051502146, + "grad_norm": 0.58984375, + "learning_rate": 4.659452832343829e-06, + "loss": 2.4125, + "step": 17992 + }, + { + "epoch": 0.9652896995708155, + "grad_norm": 0.4140625, + "learning_rate": 4.659409056829434e-06, + "loss": 2.3591, + "step": 17993 + }, + { + "epoch": 0.9653433476394849, + "grad_norm": 0.52734375, + "learning_rate": 4.659365278707327e-06, + "loss": 2.3184, + "step": 17994 + }, + { + "epoch": 0.9653969957081545, + "grad_norm": 0.470703125, + "learning_rate": 4.659321497977563e-06, + "loss": 2.3464, + "step": 17995 + }, + { + "epoch": 0.965450643776824, + "grad_norm": 0.45703125, + "learning_rate": 4.659277714640195e-06, + "loss": 2.1157, + "step": 17996 + }, + { + "epoch": 0.9655042918454936, + "grad_norm": 0.44921875, + "learning_rate": 4.659233928695275e-06, + "loss": 2.313, + "step": 17997 + }, + { + "epoch": 0.965557939914163, + "grad_norm": 0.53125, + "learning_rate": 4.659190140142855e-06, + "loss": 2.2136, + "step": 17998 + }, + { + "epoch": 0.9656115879828326, + "grad_norm": 0.50390625, + "learning_rate": 4.65914634898299e-06, + "loss": 2.2089, + "step": 17999 + }, + { + "epoch": 0.9656652360515021, + "grad_norm": 0.458984375, + "learning_rate": 4.659102555215732e-06, + "loss": 2.1177, + "step": 18000 + }, + { + "epoch": 0.9657188841201717, + "grad_norm": 0.51171875, + "learning_rate": 4.659058758841133e-06, + "loss": 2.153, + "step": 18001 + }, + { + "epoch": 0.9657725321888412, + "grad_norm": 0.39453125, + "learning_rate": 4.659014959859246e-06, + "loss": 1.9536, + "step": 18002 + }, + { + "epoch": 0.9658261802575108, + "grad_norm": 0.58984375, + "learning_rate": 4.658971158270125e-06, + "loss": 2.1033, + "step": 18003 + }, + { + "epoch": 0.9658798283261802, + "grad_norm": 0.51953125, + "learning_rate": 4.658927354073823e-06, + "loss": 1.3716, + "step": 18004 + }, + { + "epoch": 0.9659334763948498, + "grad_norm": 0.51953125, + "learning_rate": 4.658883547270392e-06, + "loss": 2.3048, + "step": 18005 + }, + { + "epoch": 0.9659871244635193, + "grad_norm": 0.47265625, + "learning_rate": 4.658839737859884e-06, + "loss": 2.4142, + "step": 18006 + }, + { + "epoch": 0.9660407725321889, + "grad_norm": 0.43359375, + "learning_rate": 4.658795925842354e-06, + "loss": 1.9933, + "step": 18007 + }, + { + "epoch": 0.9660944206008584, + "grad_norm": 0.482421875, + "learning_rate": 4.658752111217853e-06, + "loss": 2.3474, + "step": 18008 + }, + { + "epoch": 0.9661480686695278, + "grad_norm": 0.486328125, + "learning_rate": 4.658708293986436e-06, + "loss": 2.0425, + "step": 18009 + }, + { + "epoch": 0.9662017167381974, + "grad_norm": 0.455078125, + "learning_rate": 4.658664474148155e-06, + "loss": 2.4052, + "step": 18010 + }, + { + "epoch": 0.9662553648068669, + "grad_norm": 0.46484375, + "learning_rate": 4.658620651703062e-06, + "loss": 2.1841, + "step": 18011 + }, + { + "epoch": 0.9663090128755365, + "grad_norm": 0.486328125, + "learning_rate": 4.65857682665121e-06, + "loss": 2.3241, + "step": 18012 + }, + { + "epoch": 0.966362660944206, + "grad_norm": 0.546875, + "learning_rate": 4.658532998992654e-06, + "loss": 2.2221, + "step": 18013 + }, + { + "epoch": 0.9664163090128756, + "grad_norm": 0.5625, + "learning_rate": 4.658489168727445e-06, + "loss": 2.4998, + "step": 18014 + }, + { + "epoch": 0.966469957081545, + "grad_norm": 0.451171875, + "learning_rate": 4.658445335855635e-06, + "loss": 2.2404, + "step": 18015 + }, + { + "epoch": 0.9665236051502146, + "grad_norm": 0.431640625, + "learning_rate": 4.65840150037728e-06, + "loss": 2.1405, + "step": 18016 + }, + { + "epoch": 0.9665772532188841, + "grad_norm": 0.59375, + "learning_rate": 4.658357662292431e-06, + "loss": 1.8169, + "step": 18017 + }, + { + "epoch": 0.9666309012875537, + "grad_norm": 0.5390625, + "learning_rate": 4.65831382160114e-06, + "loss": 1.6235, + "step": 18018 + }, + { + "epoch": 0.9666845493562232, + "grad_norm": 0.455078125, + "learning_rate": 4.658269978303462e-06, + "loss": 2.3632, + "step": 18019 + }, + { + "epoch": 0.9667381974248928, + "grad_norm": 0.4921875, + "learning_rate": 4.658226132399449e-06, + "loss": 2.6164, + "step": 18020 + }, + { + "epoch": 0.9667918454935622, + "grad_norm": 0.56640625, + "learning_rate": 4.658182283889154e-06, + "loss": 2.4559, + "step": 18021 + }, + { + "epoch": 0.9668454935622317, + "grad_norm": 0.439453125, + "learning_rate": 4.658138432772631e-06, + "loss": 2.1884, + "step": 18022 + }, + { + "epoch": 0.9668991416309013, + "grad_norm": 0.58984375, + "learning_rate": 4.658094579049931e-06, + "loss": 1.9105, + "step": 18023 + }, + { + "epoch": 0.9669527896995708, + "grad_norm": 0.5390625, + "learning_rate": 4.658050722721108e-06, + "loss": 2.3518, + "step": 18024 + }, + { + "epoch": 0.9670064377682404, + "grad_norm": 0.66015625, + "learning_rate": 4.6580068637862144e-06, + "loss": 2.3164, + "step": 18025 + }, + { + "epoch": 0.9670600858369098, + "grad_norm": 0.3828125, + "learning_rate": 4.657963002245304e-06, + "loss": 2.03, + "step": 18026 + }, + { + "epoch": 0.9671137339055794, + "grad_norm": 0.515625, + "learning_rate": 4.65791913809843e-06, + "loss": 1.8207, + "step": 18027 + }, + { + "epoch": 0.9671673819742489, + "grad_norm": 0.443359375, + "learning_rate": 4.657875271345644e-06, + "loss": 2.1337, + "step": 18028 + }, + { + "epoch": 0.9672210300429185, + "grad_norm": 0.375, + "learning_rate": 4.657831401987e-06, + "loss": 2.2519, + "step": 18029 + }, + { + "epoch": 0.967274678111588, + "grad_norm": 0.44140625, + "learning_rate": 4.6577875300225505e-06, + "loss": 2.3338, + "step": 18030 + }, + { + "epoch": 0.9673283261802575, + "grad_norm": 0.466796875, + "learning_rate": 4.657743655452348e-06, + "loss": 2.1954, + "step": 18031 + }, + { + "epoch": 0.967381974248927, + "grad_norm": 0.50390625, + "learning_rate": 4.657699778276448e-06, + "loss": 2.2925, + "step": 18032 + }, + { + "epoch": 0.9674356223175966, + "grad_norm": 0.439453125, + "learning_rate": 4.6576558984948995e-06, + "loss": 2.3537, + "step": 18033 + }, + { + "epoch": 0.9674892703862661, + "grad_norm": 0.390625, + "learning_rate": 4.65761201610776e-06, + "loss": 2.1996, + "step": 18034 + }, + { + "epoch": 0.9675429184549357, + "grad_norm": 14.25, + "learning_rate": 4.6575681311150785e-06, + "loss": 2.4458, + "step": 18035 + }, + { + "epoch": 0.9675965665236052, + "grad_norm": 0.50390625, + "learning_rate": 4.65752424351691e-06, + "loss": 2.3078, + "step": 18036 + }, + { + "epoch": 0.9676502145922746, + "grad_norm": 0.4296875, + "learning_rate": 4.657480353313307e-06, + "loss": 2.3982, + "step": 18037 + }, + { + "epoch": 0.9677038626609442, + "grad_norm": 0.396484375, + "learning_rate": 4.657436460504323e-06, + "loss": 2.2387, + "step": 18038 + }, + { + "epoch": 0.9677575107296137, + "grad_norm": 0.51171875, + "learning_rate": 4.657392565090011e-06, + "loss": 2.3062, + "step": 18039 + }, + { + "epoch": 0.9678111587982833, + "grad_norm": 0.7421875, + "learning_rate": 4.657348667070423e-06, + "loss": 2.2644, + "step": 18040 + }, + { + "epoch": 0.9678648068669528, + "grad_norm": 0.458984375, + "learning_rate": 4.657304766445613e-06, + "loss": 2.3402, + "step": 18041 + }, + { + "epoch": 0.9679184549356223, + "grad_norm": 0.490234375, + "learning_rate": 4.657260863215633e-06, + "loss": 2.2737, + "step": 18042 + }, + { + "epoch": 0.9679721030042918, + "grad_norm": 0.412109375, + "learning_rate": 4.657216957380537e-06, + "loss": 2.06, + "step": 18043 + }, + { + "epoch": 0.9680257510729614, + "grad_norm": 0.51171875, + "learning_rate": 4.657173048940378e-06, + "loss": 2.1611, + "step": 18044 + }, + { + "epoch": 0.9680793991416309, + "grad_norm": 0.412109375, + "learning_rate": 4.657129137895209e-06, + "loss": 2.3196, + "step": 18045 + }, + { + "epoch": 0.9681330472103005, + "grad_norm": 0.5, + "learning_rate": 4.657085224245083e-06, + "loss": 2.3179, + "step": 18046 + }, + { + "epoch": 0.9681866952789699, + "grad_norm": 0.392578125, + "learning_rate": 4.6570413079900516e-06, + "loss": 2.0607, + "step": 18047 + }, + { + "epoch": 0.9682403433476395, + "grad_norm": 0.5078125, + "learning_rate": 4.65699738913017e-06, + "loss": 1.838, + "step": 18048 + }, + { + "epoch": 0.968293991416309, + "grad_norm": 0.408203125, + "learning_rate": 4.6569534676654896e-06, + "loss": 2.1502, + "step": 18049 + }, + { + "epoch": 0.9683476394849786, + "grad_norm": 0.431640625, + "learning_rate": 4.6569095435960645e-06, + "loss": 2.22, + "step": 18050 + }, + { + "epoch": 0.9684012875536481, + "grad_norm": 0.81640625, + "learning_rate": 4.656865616921947e-06, + "loss": 2.3097, + "step": 18051 + }, + { + "epoch": 0.9684549356223175, + "grad_norm": 0.69140625, + "learning_rate": 4.656821687643191e-06, + "loss": 2.2386, + "step": 18052 + }, + { + "epoch": 0.9685085836909871, + "grad_norm": 0.4765625, + "learning_rate": 4.6567777557598495e-06, + "loss": 2.2099, + "step": 18053 + }, + { + "epoch": 0.9685622317596566, + "grad_norm": 0.52734375, + "learning_rate": 4.656733821271973e-06, + "loss": 2.6154, + "step": 18054 + }, + { + "epoch": 0.9686158798283262, + "grad_norm": 0.474609375, + "learning_rate": 4.656689884179619e-06, + "loss": 2.3901, + "step": 18055 + }, + { + "epoch": 0.9686695278969957, + "grad_norm": 0.390625, + "learning_rate": 4.656645944482837e-06, + "loss": 2.203, + "step": 18056 + }, + { + "epoch": 0.9687231759656653, + "grad_norm": 0.435546875, + "learning_rate": 4.656602002181682e-06, + "loss": 2.2611, + "step": 18057 + }, + { + "epoch": 0.9687768240343347, + "grad_norm": 0.431640625, + "learning_rate": 4.656558057276206e-06, + "loss": 2.4351, + "step": 18058 + }, + { + "epoch": 0.9688304721030043, + "grad_norm": 0.46875, + "learning_rate": 4.656514109766462e-06, + "loss": 2.1938, + "step": 18059 + }, + { + "epoch": 0.9688841201716738, + "grad_norm": 0.43359375, + "learning_rate": 4.656470159652504e-06, + "loss": 2.1494, + "step": 18060 + }, + { + "epoch": 0.9689377682403434, + "grad_norm": 0.48828125, + "learning_rate": 4.6564262069343845e-06, + "loss": 2.3746, + "step": 18061 + }, + { + "epoch": 0.9689914163090129, + "grad_norm": 0.57421875, + "learning_rate": 4.656382251612157e-06, + "loss": 2.4649, + "step": 18062 + }, + { + "epoch": 0.9690450643776825, + "grad_norm": 0.3984375, + "learning_rate": 4.656338293685873e-06, + "loss": 2.182, + "step": 18063 + }, + { + "epoch": 0.9690987124463519, + "grad_norm": 0.65234375, + "learning_rate": 4.6562943331555875e-06, + "loss": 2.4837, + "step": 18064 + }, + { + "epoch": 0.9691523605150214, + "grad_norm": 0.482421875, + "learning_rate": 4.656250370021353e-06, + "loss": 2.3742, + "step": 18065 + }, + { + "epoch": 0.969206008583691, + "grad_norm": 0.458984375, + "learning_rate": 4.656206404283222e-06, + "loss": 2.372, + "step": 18066 + }, + { + "epoch": 0.9692596566523605, + "grad_norm": 0.59375, + "learning_rate": 4.656162435941249e-06, + "loss": 2.567, + "step": 18067 + }, + { + "epoch": 0.9693133047210301, + "grad_norm": 0.4765625, + "learning_rate": 4.656118464995486e-06, + "loss": 2.0909, + "step": 18068 + }, + { + "epoch": 0.9693669527896995, + "grad_norm": 0.56640625, + "learning_rate": 4.656074491445986e-06, + "loss": 2.3224, + "step": 18069 + }, + { + "epoch": 0.9694206008583691, + "grad_norm": 0.53515625, + "learning_rate": 4.656030515292801e-06, + "loss": 2.1253, + "step": 18070 + }, + { + "epoch": 0.9694742489270386, + "grad_norm": 0.443359375, + "learning_rate": 4.655986536535987e-06, + "loss": 2.243, + "step": 18071 + }, + { + "epoch": 0.9695278969957082, + "grad_norm": 0.51953125, + "learning_rate": 4.655942555175595e-06, + "loss": 2.4214, + "step": 18072 + }, + { + "epoch": 0.9695815450643777, + "grad_norm": 0.408203125, + "learning_rate": 4.655898571211679e-06, + "loss": 2.0153, + "step": 18073 + }, + { + "epoch": 0.9696351931330472, + "grad_norm": 0.453125, + "learning_rate": 4.655854584644292e-06, + "loss": 2.4078, + "step": 18074 + }, + { + "epoch": 0.9696888412017167, + "grad_norm": 0.3984375, + "learning_rate": 4.655810595473486e-06, + "loss": 2.0457, + "step": 18075 + }, + { + "epoch": 0.9697424892703863, + "grad_norm": 0.52734375, + "learning_rate": 4.655766603699317e-06, + "loss": 2.404, + "step": 18076 + }, + { + "epoch": 0.9697961373390558, + "grad_norm": 0.5703125, + "learning_rate": 4.655722609321835e-06, + "loss": 2.1763, + "step": 18077 + }, + { + "epoch": 0.9698497854077254, + "grad_norm": 0.48046875, + "learning_rate": 4.6556786123410935e-06, + "loss": 2.4248, + "step": 18078 + }, + { + "epoch": 0.9699034334763948, + "grad_norm": 0.51953125, + "learning_rate": 4.655634612757147e-06, + "loss": 2.5723, + "step": 18079 + }, + { + "epoch": 0.9699570815450643, + "grad_norm": 0.5234375, + "learning_rate": 4.6555906105700485e-06, + "loss": 2.2587, + "step": 18080 + }, + { + "epoch": 0.9700107296137339, + "grad_norm": 0.45703125, + "learning_rate": 4.655546605779851e-06, + "loss": 1.3247, + "step": 18081 + }, + { + "epoch": 0.9700643776824034, + "grad_norm": 0.68359375, + "learning_rate": 4.655502598386606e-06, + "loss": 2.2216, + "step": 18082 + }, + { + "epoch": 0.970118025751073, + "grad_norm": 0.412109375, + "learning_rate": 4.655458588390369e-06, + "loss": 2.1003, + "step": 18083 + }, + { + "epoch": 0.9701716738197425, + "grad_norm": 0.44921875, + "learning_rate": 4.655414575791192e-06, + "loss": 2.2038, + "step": 18084 + }, + { + "epoch": 0.970225321888412, + "grad_norm": 0.5078125, + "learning_rate": 4.655370560589128e-06, + "loss": 2.0069, + "step": 18085 + }, + { + "epoch": 0.9702789699570815, + "grad_norm": 0.48046875, + "learning_rate": 4.655326542784231e-06, + "loss": 2.3204, + "step": 18086 + }, + { + "epoch": 0.9703326180257511, + "grad_norm": 0.435546875, + "learning_rate": 4.655282522376553e-06, + "loss": 2.2134, + "step": 18087 + }, + { + "epoch": 0.9703862660944206, + "grad_norm": 0.4296875, + "learning_rate": 4.655238499366147e-06, + "loss": 2.4361, + "step": 18088 + }, + { + "epoch": 0.9704399141630902, + "grad_norm": 0.4921875, + "learning_rate": 4.655194473753069e-06, + "loss": 2.1808, + "step": 18089 + }, + { + "epoch": 0.9704935622317596, + "grad_norm": 0.63671875, + "learning_rate": 4.655150445537368e-06, + "loss": 2.5085, + "step": 18090 + }, + { + "epoch": 0.9705472103004292, + "grad_norm": 0.490234375, + "learning_rate": 4.655106414719101e-06, + "loss": 2.4444, + "step": 18091 + }, + { + "epoch": 0.9706008583690987, + "grad_norm": 0.482421875, + "learning_rate": 4.655062381298318e-06, + "loss": 1.9547, + "step": 18092 + }, + { + "epoch": 0.9706545064377683, + "grad_norm": 0.462890625, + "learning_rate": 4.655018345275074e-06, + "loss": 2.2122, + "step": 18093 + }, + { + "epoch": 0.9707081545064378, + "grad_norm": 0.4765625, + "learning_rate": 4.654974306649422e-06, + "loss": 2.3438, + "step": 18094 + }, + { + "epoch": 0.9707618025751072, + "grad_norm": 0.53515625, + "learning_rate": 4.654930265421414e-06, + "loss": 2.269, + "step": 18095 + }, + { + "epoch": 0.9708154506437768, + "grad_norm": 0.423828125, + "learning_rate": 4.654886221591106e-06, + "loss": 2.1738, + "step": 18096 + }, + { + "epoch": 0.9708690987124463, + "grad_norm": 0.482421875, + "learning_rate": 4.6548421751585486e-06, + "loss": 2.322, + "step": 18097 + }, + { + "epoch": 0.9709227467811159, + "grad_norm": 0.478515625, + "learning_rate": 4.654798126123795e-06, + "loss": 1.9793, + "step": 18098 + }, + { + "epoch": 0.9709763948497854, + "grad_norm": 0.515625, + "learning_rate": 4.6547540744869e-06, + "loss": 2.3975, + "step": 18099 + }, + { + "epoch": 0.971030042918455, + "grad_norm": 0.41796875, + "learning_rate": 4.654710020247915e-06, + "loss": 2.2891, + "step": 18100 + }, + { + "epoch": 0.9710836909871244, + "grad_norm": 0.470703125, + "learning_rate": 4.654665963406895e-06, + "loss": 2.1569, + "step": 18101 + }, + { + "epoch": 0.971137339055794, + "grad_norm": 0.439453125, + "learning_rate": 4.654621903963892e-06, + "loss": 2.2407, + "step": 18102 + }, + { + "epoch": 0.9711909871244635, + "grad_norm": 0.494140625, + "learning_rate": 4.6545778419189594e-06, + "loss": 2.3203, + "step": 18103 + }, + { + "epoch": 0.9712446351931331, + "grad_norm": 0.69921875, + "learning_rate": 4.65453377727215e-06, + "loss": 2.0406, + "step": 18104 + }, + { + "epoch": 0.9712982832618026, + "grad_norm": 0.52734375, + "learning_rate": 4.654489710023517e-06, + "loss": 2.1972, + "step": 18105 + }, + { + "epoch": 0.9713519313304722, + "grad_norm": 0.53515625, + "learning_rate": 4.654445640173116e-06, + "loss": 2.4296, + "step": 18106 + }, + { + "epoch": 0.9714055793991416, + "grad_norm": 0.46484375, + "learning_rate": 4.654401567720997e-06, + "loss": 1.663, + "step": 18107 + }, + { + "epoch": 0.9714592274678111, + "grad_norm": 0.455078125, + "learning_rate": 4.654357492667216e-06, + "loss": 2.1008, + "step": 18108 + }, + { + "epoch": 0.9715128755364807, + "grad_norm": 0.458984375, + "learning_rate": 4.654313415011824e-06, + "loss": 2.217, + "step": 18109 + }, + { + "epoch": 0.9715665236051502, + "grad_norm": 0.423828125, + "learning_rate": 4.654269334754875e-06, + "loss": 1.6328, + "step": 18110 + }, + { + "epoch": 0.9716201716738198, + "grad_norm": 0.494140625, + "learning_rate": 4.654225251896422e-06, + "loss": 2.3437, + "step": 18111 + }, + { + "epoch": 0.9716738197424892, + "grad_norm": 0.4453125, + "learning_rate": 4.654181166436519e-06, + "loss": 2.1845, + "step": 18112 + }, + { + "epoch": 0.9717274678111588, + "grad_norm": 0.44921875, + "learning_rate": 4.654137078375218e-06, + "loss": 2.4662, + "step": 18113 + }, + { + "epoch": 0.9717811158798283, + "grad_norm": 0.66015625, + "learning_rate": 4.654092987712574e-06, + "loss": 2.4455, + "step": 18114 + }, + { + "epoch": 0.9718347639484979, + "grad_norm": 0.51953125, + "learning_rate": 4.654048894448639e-06, + "loss": 2.0912, + "step": 18115 + }, + { + "epoch": 0.9718884120171674, + "grad_norm": 0.46484375, + "learning_rate": 4.6540047985834664e-06, + "loss": 2.3746, + "step": 18116 + }, + { + "epoch": 0.971942060085837, + "grad_norm": 0.671875, + "learning_rate": 4.653960700117109e-06, + "loss": 2.5161, + "step": 18117 + }, + { + "epoch": 0.9719957081545064, + "grad_norm": 0.419921875, + "learning_rate": 4.653916599049622e-06, + "loss": 2.2236, + "step": 18118 + }, + { + "epoch": 0.972049356223176, + "grad_norm": 0.470703125, + "learning_rate": 4.653872495381056e-06, + "loss": 2.2477, + "step": 18119 + }, + { + "epoch": 0.9721030042918455, + "grad_norm": 0.458984375, + "learning_rate": 4.653828389111465e-06, + "loss": 2.1763, + "step": 18120 + }, + { + "epoch": 0.9721566523605151, + "grad_norm": 0.400390625, + "learning_rate": 4.653784280240904e-06, + "loss": 2.0979, + "step": 18121 + }, + { + "epoch": 0.9722103004291845, + "grad_norm": 0.6484375, + "learning_rate": 4.653740168769424e-06, + "loss": 2.3458, + "step": 18122 + }, + { + "epoch": 0.972263948497854, + "grad_norm": 0.4921875, + "learning_rate": 4.65369605469708e-06, + "loss": 2.2088, + "step": 18123 + }, + { + "epoch": 0.9723175965665236, + "grad_norm": 0.453125, + "learning_rate": 4.653651938023924e-06, + "loss": 2.2375, + "step": 18124 + }, + { + "epoch": 0.9723712446351931, + "grad_norm": 0.478515625, + "learning_rate": 4.65360781875001e-06, + "loss": 2.1786, + "step": 18125 + }, + { + "epoch": 0.9724248927038627, + "grad_norm": 0.5390625, + "learning_rate": 4.653563696875392e-06, + "loss": 2.4229, + "step": 18126 + }, + { + "epoch": 0.9724785407725322, + "grad_norm": 0.56640625, + "learning_rate": 4.653519572400121e-06, + "loss": 2.2809, + "step": 18127 + }, + { + "epoch": 0.9725321888412017, + "grad_norm": 0.51953125, + "learning_rate": 4.653475445324254e-06, + "loss": 2.2048, + "step": 18128 + }, + { + "epoch": 0.9725858369098712, + "grad_norm": 0.53515625, + "learning_rate": 4.653431315647839e-06, + "loss": 2.6341, + "step": 18129 + }, + { + "epoch": 0.9726394849785408, + "grad_norm": 0.515625, + "learning_rate": 4.653387183370934e-06, + "loss": 2.2509, + "step": 18130 + }, + { + "epoch": 0.9726931330472103, + "grad_norm": 0.427734375, + "learning_rate": 4.65334304849359e-06, + "loss": 2.3241, + "step": 18131 + }, + { + "epoch": 0.9727467811158799, + "grad_norm": 0.404296875, + "learning_rate": 4.653298911015862e-06, + "loss": 2.1099, + "step": 18132 + }, + { + "epoch": 0.9728004291845493, + "grad_norm": 0.40625, + "learning_rate": 4.653254770937801e-06, + "loss": 1.9049, + "step": 18133 + }, + { + "epoch": 0.9728540772532189, + "grad_norm": 0.423828125, + "learning_rate": 4.653210628259462e-06, + "loss": 2.1538, + "step": 18134 + }, + { + "epoch": 0.9729077253218884, + "grad_norm": 0.46875, + "learning_rate": 4.653166482980898e-06, + "loss": 2.3054, + "step": 18135 + }, + { + "epoch": 0.972961373390558, + "grad_norm": 0.443359375, + "learning_rate": 4.653122335102161e-06, + "loss": 1.981, + "step": 18136 + }, + { + "epoch": 0.9730150214592275, + "grad_norm": 0.50390625, + "learning_rate": 4.653078184623306e-06, + "loss": 2.2661, + "step": 18137 + }, + { + "epoch": 0.973068669527897, + "grad_norm": 0.46875, + "learning_rate": 4.653034031544386e-06, + "loss": 2.1664, + "step": 18138 + }, + { + "epoch": 0.9731223175965665, + "grad_norm": 0.4375, + "learning_rate": 4.652989875865454e-06, + "loss": 2.1137, + "step": 18139 + }, + { + "epoch": 0.973175965665236, + "grad_norm": 0.486328125, + "learning_rate": 4.652945717586563e-06, + "loss": 2.1091, + "step": 18140 + }, + { + "epoch": 0.9732296137339056, + "grad_norm": 0.40234375, + "learning_rate": 4.652901556707767e-06, + "loss": 2.4271, + "step": 18141 + }, + { + "epoch": 0.9732832618025751, + "grad_norm": 0.7421875, + "learning_rate": 4.652857393229119e-06, + "loss": 2.2855, + "step": 18142 + }, + { + "epoch": 0.9733369098712447, + "grad_norm": 0.380859375, + "learning_rate": 4.652813227150672e-06, + "loss": 2.5566, + "step": 18143 + }, + { + "epoch": 0.9733905579399141, + "grad_norm": 0.498046875, + "learning_rate": 4.652769058472481e-06, + "loss": 2.2366, + "step": 18144 + }, + { + "epoch": 0.9734442060085837, + "grad_norm": 0.515625, + "learning_rate": 4.652724887194596e-06, + "loss": 2.2173, + "step": 18145 + }, + { + "epoch": 0.9734978540772532, + "grad_norm": 0.51171875, + "learning_rate": 4.652680713317073e-06, + "loss": 2.1894, + "step": 18146 + }, + { + "epoch": 0.9735515021459228, + "grad_norm": 0.71484375, + "learning_rate": 4.652636536839965e-06, + "loss": 2.5423, + "step": 18147 + }, + { + "epoch": 0.9736051502145923, + "grad_norm": 0.435546875, + "learning_rate": 4.652592357763325e-06, + "loss": 2.3142, + "step": 18148 + }, + { + "epoch": 0.9736587982832619, + "grad_norm": 0.421875, + "learning_rate": 4.652548176087207e-06, + "loss": 2.2363, + "step": 18149 + }, + { + "epoch": 0.9737124463519313, + "grad_norm": 0.5078125, + "learning_rate": 4.652503991811663e-06, + "loss": 2.3838, + "step": 18150 + }, + { + "epoch": 0.9737660944206008, + "grad_norm": 0.408203125, + "learning_rate": 4.6524598049367475e-06, + "loss": 1.9243, + "step": 18151 + }, + { + "epoch": 0.9738197424892704, + "grad_norm": 0.4375, + "learning_rate": 4.652415615462513e-06, + "loss": 2.4713, + "step": 18152 + }, + { + "epoch": 0.9738733905579399, + "grad_norm": 0.431640625, + "learning_rate": 4.652371423389014e-06, + "loss": 2.1426, + "step": 18153 + }, + { + "epoch": 0.9739270386266095, + "grad_norm": 0.458984375, + "learning_rate": 4.652327228716302e-06, + "loss": 2.4506, + "step": 18154 + }, + { + "epoch": 0.9739806866952789, + "grad_norm": 0.5, + "learning_rate": 4.6522830314444326e-06, + "loss": 2.3126, + "step": 18155 + }, + { + "epoch": 0.9740343347639485, + "grad_norm": 0.53125, + "learning_rate": 4.652238831573458e-06, + "loss": 2.0453, + "step": 18156 + }, + { + "epoch": 0.974087982832618, + "grad_norm": 0.4609375, + "learning_rate": 4.652194629103431e-06, + "loss": 2.2988, + "step": 18157 + }, + { + "epoch": 0.9741416309012876, + "grad_norm": 0.49609375, + "learning_rate": 4.652150424034407e-06, + "loss": 2.2391, + "step": 18158 + }, + { + "epoch": 0.9741952789699571, + "grad_norm": 0.458984375, + "learning_rate": 4.652106216366438e-06, + "loss": 1.6757, + "step": 18159 + }, + { + "epoch": 0.9742489270386266, + "grad_norm": 0.470703125, + "learning_rate": 4.652062006099577e-06, + "loss": 2.3188, + "step": 18160 + }, + { + "epoch": 0.9743025751072961, + "grad_norm": 0.6484375, + "learning_rate": 4.652017793233877e-06, + "loss": 2.4624, + "step": 18161 + }, + { + "epoch": 0.9743562231759657, + "grad_norm": 0.5078125, + "learning_rate": 4.651973577769393e-06, + "loss": 2.2515, + "step": 18162 + }, + { + "epoch": 0.9744098712446352, + "grad_norm": 0.462890625, + "learning_rate": 4.651929359706178e-06, + "loss": 2.549, + "step": 18163 + }, + { + "epoch": 0.9744635193133048, + "grad_norm": 0.5703125, + "learning_rate": 4.6518851390442844e-06, + "loss": 2.2224, + "step": 18164 + }, + { + "epoch": 0.9745171673819742, + "grad_norm": 0.494140625, + "learning_rate": 4.651840915783766e-06, + "loss": 2.186, + "step": 18165 + }, + { + "epoch": 0.9745708154506437, + "grad_norm": 0.498046875, + "learning_rate": 4.651796689924677e-06, + "loss": 2.3443, + "step": 18166 + }, + { + "epoch": 0.9746244635193133, + "grad_norm": 0.50390625, + "learning_rate": 4.65175246146707e-06, + "loss": 2.3074, + "step": 18167 + }, + { + "epoch": 0.9746781115879828, + "grad_norm": 0.63671875, + "learning_rate": 4.651708230410999e-06, + "loss": 2.3677, + "step": 18168 + }, + { + "epoch": 0.9747317596566524, + "grad_norm": 0.43359375, + "learning_rate": 4.651663996756518e-06, + "loss": 2.1601, + "step": 18169 + }, + { + "epoch": 0.9747854077253219, + "grad_norm": 0.4609375, + "learning_rate": 4.651619760503678e-06, + "loss": 2.1669, + "step": 18170 + }, + { + "epoch": 0.9748390557939914, + "grad_norm": 0.5, + "learning_rate": 4.651575521652535e-06, + "loss": 2.1406, + "step": 18171 + }, + { + "epoch": 0.9748927038626609, + "grad_norm": 2.3125, + "learning_rate": 4.651531280203141e-06, + "loss": 1.4426, + "step": 18172 + }, + { + "epoch": 0.9749463519313305, + "grad_norm": 0.515625, + "learning_rate": 4.6514870361555485e-06, + "loss": 2.2544, + "step": 18173 + }, + { + "epoch": 0.975, + "grad_norm": 0.423828125, + "learning_rate": 4.651442789509813e-06, + "loss": 2.2366, + "step": 18174 + }, + { + "epoch": 0.9750536480686696, + "grad_norm": 0.4765625, + "learning_rate": 4.651398540265988e-06, + "loss": 2.2975, + "step": 18175 + }, + { + "epoch": 0.975107296137339, + "grad_norm": 0.52734375, + "learning_rate": 4.651354288424125e-06, + "loss": 2.3749, + "step": 18176 + }, + { + "epoch": 0.9751609442060086, + "grad_norm": 0.423828125, + "learning_rate": 4.651310033984279e-06, + "loss": 2.1592, + "step": 18177 + }, + { + "epoch": 0.9752145922746781, + "grad_norm": 0.416015625, + "learning_rate": 4.651265776946503e-06, + "loss": 2.1454, + "step": 18178 + }, + { + "epoch": 0.9752682403433477, + "grad_norm": 0.490234375, + "learning_rate": 4.65122151731085e-06, + "loss": 2.3477, + "step": 18179 + }, + { + "epoch": 0.9753218884120172, + "grad_norm": 0.5390625, + "learning_rate": 4.651177255077374e-06, + "loss": 2.1678, + "step": 18180 + }, + { + "epoch": 0.9753755364806866, + "grad_norm": 0.52734375, + "learning_rate": 4.651132990246128e-06, + "loss": 2.5118, + "step": 18181 + }, + { + "epoch": 0.9754291845493562, + "grad_norm": 0.470703125, + "learning_rate": 4.651088722817166e-06, + "loss": 2.2754, + "step": 18182 + }, + { + "epoch": 0.9754828326180257, + "grad_norm": 0.55078125, + "learning_rate": 4.651044452790542e-06, + "loss": 1.0661, + "step": 18183 + }, + { + "epoch": 0.9755364806866953, + "grad_norm": 0.7734375, + "learning_rate": 4.6510001801663075e-06, + "loss": 2.3529, + "step": 18184 + }, + { + "epoch": 0.9755901287553648, + "grad_norm": 0.478515625, + "learning_rate": 4.650955904944517e-06, + "loss": 2.1827, + "step": 18185 + }, + { + "epoch": 0.9756437768240344, + "grad_norm": 0.48046875, + "learning_rate": 4.650911627125225e-06, + "loss": 2.1981, + "step": 18186 + }, + { + "epoch": 0.9756974248927038, + "grad_norm": 0.49609375, + "learning_rate": 4.6508673467084835e-06, + "loss": 2.5798, + "step": 18187 + }, + { + "epoch": 0.9757510729613734, + "grad_norm": 0.390625, + "learning_rate": 4.650823063694347e-06, + "loss": 2.2576, + "step": 18188 + }, + { + "epoch": 0.9758047210300429, + "grad_norm": 0.53125, + "learning_rate": 4.650778778082868e-06, + "loss": 2.441, + "step": 18189 + }, + { + "epoch": 0.9758583690987125, + "grad_norm": 3.21875, + "learning_rate": 4.650734489874101e-06, + "loss": 2.1906, + "step": 18190 + }, + { + "epoch": 0.975912017167382, + "grad_norm": 0.45703125, + "learning_rate": 4.650690199068099e-06, + "loss": 2.5424, + "step": 18191 + }, + { + "epoch": 0.9759656652360515, + "grad_norm": 0.466796875, + "learning_rate": 4.650645905664914e-06, + "loss": 2.3822, + "step": 18192 + }, + { + "epoch": 0.976019313304721, + "grad_norm": 0.5234375, + "learning_rate": 4.650601609664603e-06, + "loss": 2.3046, + "step": 18193 + }, + { + "epoch": 0.9760729613733906, + "grad_norm": 0.5234375, + "learning_rate": 4.650557311067216e-06, + "loss": 2.4315, + "step": 18194 + }, + { + "epoch": 0.9761266094420601, + "grad_norm": 0.4296875, + "learning_rate": 4.650513009872809e-06, + "loss": 2.1504, + "step": 18195 + }, + { + "epoch": 0.9761802575107296, + "grad_norm": 0.462890625, + "learning_rate": 4.650468706081433e-06, + "loss": 2.5573, + "step": 18196 + }, + { + "epoch": 0.9762339055793992, + "grad_norm": 0.5234375, + "learning_rate": 4.6504243996931445e-06, + "loss": 2.1392, + "step": 18197 + }, + { + "epoch": 0.9762875536480686, + "grad_norm": 0.494140625, + "learning_rate": 4.650380090707994e-06, + "loss": 2.2641, + "step": 18198 + }, + { + "epoch": 0.9763412017167382, + "grad_norm": 0.52734375, + "learning_rate": 4.650335779126037e-06, + "loss": 2.433, + "step": 18199 + }, + { + "epoch": 0.9763948497854077, + "grad_norm": 0.54296875, + "learning_rate": 4.650291464947327e-06, + "loss": 2.1759, + "step": 18200 + }, + { + "epoch": 0.9764484978540773, + "grad_norm": 0.359375, + "learning_rate": 4.650247148171917e-06, + "loss": 1.9313, + "step": 18201 + }, + { + "epoch": 0.9765021459227468, + "grad_norm": 0.484375, + "learning_rate": 4.6502028287998605e-06, + "loss": 2.2011, + "step": 18202 + }, + { + "epoch": 0.9765557939914163, + "grad_norm": 0.8359375, + "learning_rate": 4.6501585068312095e-06, + "loss": 2.2535, + "step": 18203 + }, + { + "epoch": 0.9766094420600858, + "grad_norm": 0.474609375, + "learning_rate": 4.650114182266021e-06, + "loss": 2.3686, + "step": 18204 + }, + { + "epoch": 0.9766630901287554, + "grad_norm": 0.439453125, + "learning_rate": 4.650069855104345e-06, + "loss": 2.169, + "step": 18205 + }, + { + "epoch": 0.9767167381974249, + "grad_norm": 0.6484375, + "learning_rate": 4.650025525346237e-06, + "loss": 2.2155, + "step": 18206 + }, + { + "epoch": 0.9767703862660945, + "grad_norm": 0.490234375, + "learning_rate": 4.64998119299175e-06, + "loss": 2.1116, + "step": 18207 + }, + { + "epoch": 0.976824034334764, + "grad_norm": 0.486328125, + "learning_rate": 4.649936858040939e-06, + "loss": 2.3199, + "step": 18208 + }, + { + "epoch": 0.9768776824034334, + "grad_norm": 0.55859375, + "learning_rate": 4.649892520493855e-06, + "loss": 2.2738, + "step": 18209 + }, + { + "epoch": 0.976931330472103, + "grad_norm": 0.49609375, + "learning_rate": 4.649848180350553e-06, + "loss": 2.2883, + "step": 18210 + }, + { + "epoch": 0.9769849785407725, + "grad_norm": 0.51953125, + "learning_rate": 4.649803837611086e-06, + "loss": 1.9225, + "step": 18211 + }, + { + "epoch": 0.9770386266094421, + "grad_norm": 0.455078125, + "learning_rate": 4.6497594922755084e-06, + "loss": 2.0953, + "step": 18212 + }, + { + "epoch": 0.9770922746781115, + "grad_norm": 0.5234375, + "learning_rate": 4.649715144343873e-06, + "loss": 2.2886, + "step": 18213 + }, + { + "epoch": 0.9771459227467811, + "grad_norm": 0.46484375, + "learning_rate": 4.649670793816233e-06, + "loss": 2.5382, + "step": 18214 + }, + { + "epoch": 0.9771995708154506, + "grad_norm": 0.447265625, + "learning_rate": 4.649626440692643e-06, + "loss": 1.9611, + "step": 18215 + }, + { + "epoch": 0.9772532188841202, + "grad_norm": 0.5546875, + "learning_rate": 4.649582084973156e-06, + "loss": 1.4438, + "step": 18216 + }, + { + "epoch": 0.9773068669527897, + "grad_norm": 0.5, + "learning_rate": 4.649537726657825e-06, + "loss": 2.2857, + "step": 18217 + }, + { + "epoch": 0.9773605150214593, + "grad_norm": 0.54296875, + "learning_rate": 4.649493365746706e-06, + "loss": 2.3959, + "step": 18218 + }, + { + "epoch": 0.9774141630901287, + "grad_norm": 0.431640625, + "learning_rate": 4.649449002239849e-06, + "loss": 2.0896, + "step": 18219 + }, + { + "epoch": 0.9774678111587983, + "grad_norm": 0.58203125, + "learning_rate": 4.649404636137309e-06, + "loss": 2.7634, + "step": 18220 + }, + { + "epoch": 0.9775214592274678, + "grad_norm": 0.41015625, + "learning_rate": 4.6493602674391416e-06, + "loss": 2.2758, + "step": 18221 + }, + { + "epoch": 0.9775751072961374, + "grad_norm": 0.4765625, + "learning_rate": 4.649315896145398e-06, + "loss": 2.2252, + "step": 18222 + }, + { + "epoch": 0.9776287553648069, + "grad_norm": 0.640625, + "learning_rate": 4.649271522256132e-06, + "loss": 2.2666, + "step": 18223 + }, + { + "epoch": 0.9776824034334763, + "grad_norm": 0.50390625, + "learning_rate": 4.649227145771398e-06, + "loss": 2.4013, + "step": 18224 + }, + { + "epoch": 0.9777360515021459, + "grad_norm": 0.5078125, + "learning_rate": 4.649182766691249e-06, + "loss": 2.5494, + "step": 18225 + }, + { + "epoch": 0.9777896995708154, + "grad_norm": 0.4375, + "learning_rate": 4.649138385015739e-06, + "loss": 2.3414, + "step": 18226 + }, + { + "epoch": 0.977843347639485, + "grad_norm": 0.419921875, + "learning_rate": 4.649094000744922e-06, + "loss": 2.0143, + "step": 18227 + }, + { + "epoch": 0.9778969957081545, + "grad_norm": 0.45703125, + "learning_rate": 4.64904961387885e-06, + "loss": 2.2907, + "step": 18228 + }, + { + "epoch": 0.9779506437768241, + "grad_norm": 0.5546875, + "learning_rate": 4.649005224417577e-06, + "loss": 2.2548, + "step": 18229 + }, + { + "epoch": 0.9780042918454935, + "grad_norm": 0.490234375, + "learning_rate": 4.648960832361159e-06, + "loss": 2.4886, + "step": 18230 + }, + { + "epoch": 0.9780579399141631, + "grad_norm": 0.462890625, + "learning_rate": 4.6489164377096475e-06, + "loss": 2.3767, + "step": 18231 + }, + { + "epoch": 0.9781115879828326, + "grad_norm": 1.2109375, + "learning_rate": 4.648872040463096e-06, + "loss": 2.3347, + "step": 18232 + }, + { + "epoch": 0.9781652360515022, + "grad_norm": 0.609375, + "learning_rate": 4.6488276406215585e-06, + "loss": 2.3944, + "step": 18233 + }, + { + "epoch": 0.9782188841201717, + "grad_norm": 0.44140625, + "learning_rate": 4.648783238185089e-06, + "loss": 2.3078, + "step": 18234 + }, + { + "epoch": 0.9782725321888412, + "grad_norm": 0.44921875, + "learning_rate": 4.6487388331537405e-06, + "loss": 2.2085, + "step": 18235 + }, + { + "epoch": 0.9783261802575107, + "grad_norm": 0.48828125, + "learning_rate": 4.648694425527568e-06, + "loss": 2.3156, + "step": 18236 + }, + { + "epoch": 0.9783798283261803, + "grad_norm": 0.50390625, + "learning_rate": 4.648650015306623e-06, + "loss": 2.6501, + "step": 18237 + }, + { + "epoch": 0.9784334763948498, + "grad_norm": 0.4609375, + "learning_rate": 4.64860560249096e-06, + "loss": 2.075, + "step": 18238 + }, + { + "epoch": 0.9784871244635193, + "grad_norm": 0.466796875, + "learning_rate": 4.648561187080634e-06, + "loss": 2.4001, + "step": 18239 + }, + { + "epoch": 0.9785407725321889, + "grad_norm": 0.4609375, + "learning_rate": 4.648516769075696e-06, + "loss": 2.0609, + "step": 18240 + }, + { + "epoch": 0.9785944206008583, + "grad_norm": 0.466796875, + "learning_rate": 4.648472348476202e-06, + "loss": 2.5051, + "step": 18241 + }, + { + "epoch": 0.9786480686695279, + "grad_norm": 0.6171875, + "learning_rate": 4.648427925282205e-06, + "loss": 2.2255, + "step": 18242 + }, + { + "epoch": 0.9787017167381974, + "grad_norm": 0.427734375, + "learning_rate": 4.648383499493757e-06, + "loss": 2.0176, + "step": 18243 + }, + { + "epoch": 0.978755364806867, + "grad_norm": 0.455078125, + "learning_rate": 4.648339071110915e-06, + "loss": 2.3311, + "step": 18244 + }, + { + "epoch": 0.9788090128755365, + "grad_norm": 0.4296875, + "learning_rate": 4.64829464013373e-06, + "loss": 2.1623, + "step": 18245 + }, + { + "epoch": 0.978862660944206, + "grad_norm": 0.65625, + "learning_rate": 4.6482502065622546e-06, + "loss": 2.266, + "step": 18246 + }, + { + "epoch": 0.9789163090128755, + "grad_norm": 0.5390625, + "learning_rate": 4.6482057703965455e-06, + "loss": 2.4698, + "step": 18247 + }, + { + "epoch": 0.9789699570815451, + "grad_norm": 0.578125, + "learning_rate": 4.648161331636656e-06, + "loss": 2.0524, + "step": 18248 + }, + { + "epoch": 0.9790236051502146, + "grad_norm": 0.5625, + "learning_rate": 4.648116890282638e-06, + "loss": 2.2455, + "step": 18249 + }, + { + "epoch": 0.9790772532188842, + "grad_norm": 0.44921875, + "learning_rate": 4.648072446334545e-06, + "loss": 2.5042, + "step": 18250 + }, + { + "epoch": 0.9791309012875536, + "grad_norm": 0.451171875, + "learning_rate": 4.648027999792433e-06, + "loss": 2.0393, + "step": 18251 + }, + { + "epoch": 0.9791845493562231, + "grad_norm": 0.53515625, + "learning_rate": 4.647983550656354e-06, + "loss": 2.4963, + "step": 18252 + }, + { + "epoch": 0.9792381974248927, + "grad_norm": 0.486328125, + "learning_rate": 4.647939098926362e-06, + "loss": 2.3148, + "step": 18253 + }, + { + "epoch": 0.9792918454935622, + "grad_norm": 0.55859375, + "learning_rate": 4.647894644602511e-06, + "loss": 2.2771, + "step": 18254 + }, + { + "epoch": 0.9793454935622318, + "grad_norm": 0.498046875, + "learning_rate": 4.647850187684854e-06, + "loss": 2.4723, + "step": 18255 + }, + { + "epoch": 0.9793991416309012, + "grad_norm": 0.48828125, + "learning_rate": 4.647805728173445e-06, + "loss": 2.3855, + "step": 18256 + }, + { + "epoch": 0.9794527896995708, + "grad_norm": 0.357421875, + "learning_rate": 4.6477612660683374e-06, + "loss": 2.1113, + "step": 18257 + }, + { + "epoch": 0.9795064377682403, + "grad_norm": 0.498046875, + "learning_rate": 4.647716801369586e-06, + "loss": 2.3064, + "step": 18258 + }, + { + "epoch": 0.9795600858369099, + "grad_norm": 0.5859375, + "learning_rate": 4.647672334077244e-06, + "loss": 2.2296, + "step": 18259 + }, + { + "epoch": 0.9796137339055794, + "grad_norm": 0.46875, + "learning_rate": 4.647627864191364e-06, + "loss": 2.1086, + "step": 18260 + }, + { + "epoch": 0.979667381974249, + "grad_norm": 0.51953125, + "learning_rate": 4.647583391712e-06, + "loss": 2.3541, + "step": 18261 + }, + { + "epoch": 0.9797210300429184, + "grad_norm": 0.400390625, + "learning_rate": 4.647538916639207e-06, + "loss": 2.1923, + "step": 18262 + }, + { + "epoch": 0.979774678111588, + "grad_norm": 0.5078125, + "learning_rate": 4.647494438973039e-06, + "loss": 2.1968, + "step": 18263 + }, + { + "epoch": 0.9798283261802575, + "grad_norm": 0.55859375, + "learning_rate": 4.647449958713547e-06, + "loss": 2.251, + "step": 18264 + }, + { + "epoch": 0.9798819742489271, + "grad_norm": 0.72265625, + "learning_rate": 4.647405475860787e-06, + "loss": 2.2026, + "step": 18265 + }, + { + "epoch": 0.9799356223175966, + "grad_norm": 0.384765625, + "learning_rate": 4.647360990414812e-06, + "loss": 2.1744, + "step": 18266 + }, + { + "epoch": 0.979989270386266, + "grad_norm": 0.51953125, + "learning_rate": 4.647316502375676e-06, + "loss": 2.4697, + "step": 18267 + }, + { + "epoch": 0.9800429184549356, + "grad_norm": 0.478515625, + "learning_rate": 4.647272011743433e-06, + "loss": 2.3245, + "step": 18268 + }, + { + "epoch": 0.9800965665236051, + "grad_norm": 0.443359375, + "learning_rate": 4.647227518518136e-06, + "loss": 2.197, + "step": 18269 + }, + { + "epoch": 0.9801502145922747, + "grad_norm": 0.40234375, + "learning_rate": 4.647183022699839e-06, + "loss": 2.1656, + "step": 18270 + }, + { + "epoch": 0.9802038626609442, + "grad_norm": 0.50390625, + "learning_rate": 4.6471385242885955e-06, + "loss": 2.3542, + "step": 18271 + }, + { + "epoch": 0.9802575107296138, + "grad_norm": 0.5078125, + "learning_rate": 4.64709402328446e-06, + "loss": 2.4734, + "step": 18272 + }, + { + "epoch": 0.9803111587982832, + "grad_norm": 1.078125, + "learning_rate": 4.6470495196874844e-06, + "loss": 2.3718, + "step": 18273 + }, + { + "epoch": 0.9803648068669528, + "grad_norm": 0.51171875, + "learning_rate": 4.647005013497724e-06, + "loss": 2.3551, + "step": 18274 + }, + { + "epoch": 0.9804184549356223, + "grad_norm": 0.37890625, + "learning_rate": 4.646960504715233e-06, + "loss": 2.1816, + "step": 18275 + }, + { + "epoch": 0.9804721030042919, + "grad_norm": 0.5078125, + "learning_rate": 4.6469159933400645e-06, + "loss": 2.4149, + "step": 18276 + }, + { + "epoch": 0.9805257510729614, + "grad_norm": 0.396484375, + "learning_rate": 4.646871479372273e-06, + "loss": 1.9634, + "step": 18277 + }, + { + "epoch": 0.980579399141631, + "grad_norm": 0.4921875, + "learning_rate": 4.64682696281191e-06, + "loss": 2.2865, + "step": 18278 + }, + { + "epoch": 0.9806330472103004, + "grad_norm": 0.46484375, + "learning_rate": 4.6467824436590315e-06, + "loss": 2.4355, + "step": 18279 + }, + { + "epoch": 0.98068669527897, + "grad_norm": 0.50390625, + "learning_rate": 4.64673792191369e-06, + "loss": 2.3421, + "step": 18280 + }, + { + "epoch": 0.9807403433476395, + "grad_norm": 0.494140625, + "learning_rate": 4.64669339757594e-06, + "loss": 2.3861, + "step": 18281 + }, + { + "epoch": 0.980793991416309, + "grad_norm": 0.421875, + "learning_rate": 4.646648870645835e-06, + "loss": 2.0817, + "step": 18282 + }, + { + "epoch": 0.9808476394849786, + "grad_norm": 0.4453125, + "learning_rate": 4.646604341123429e-06, + "loss": 2.0823, + "step": 18283 + }, + { + "epoch": 0.980901287553648, + "grad_norm": 0.447265625, + "learning_rate": 4.646559809008775e-06, + "loss": 2.2566, + "step": 18284 + }, + { + "epoch": 0.9809549356223176, + "grad_norm": 0.404296875, + "learning_rate": 4.646515274301927e-06, + "loss": 2.0561, + "step": 18285 + }, + { + "epoch": 0.9810085836909871, + "grad_norm": 0.5546875, + "learning_rate": 4.6464707370029396e-06, + "loss": 2.3044, + "step": 18286 + }, + { + "epoch": 0.9810622317596567, + "grad_norm": 0.4921875, + "learning_rate": 4.646426197111866e-06, + "loss": 2.0746, + "step": 18287 + }, + { + "epoch": 0.9811158798283262, + "grad_norm": 0.37109375, + "learning_rate": 4.646381654628761e-06, + "loss": 1.8967, + "step": 18288 + }, + { + "epoch": 0.9811695278969957, + "grad_norm": 0.6015625, + "learning_rate": 4.646337109553677e-06, + "loss": 2.2778, + "step": 18289 + }, + { + "epoch": 0.9812231759656652, + "grad_norm": 0.443359375, + "learning_rate": 4.646292561886668e-06, + "loss": 2.387, + "step": 18290 + }, + { + "epoch": 0.9812768240343348, + "grad_norm": 0.44140625, + "learning_rate": 4.646248011627787e-06, + "loss": 2.3607, + "step": 18291 + }, + { + "epoch": 0.9813304721030043, + "grad_norm": 0.64453125, + "learning_rate": 4.646203458777091e-06, + "loss": 2.7044, + "step": 18292 + }, + { + "epoch": 0.9813841201716739, + "grad_norm": 0.47265625, + "learning_rate": 4.64615890333463e-06, + "loss": 1.894, + "step": 18293 + }, + { + "epoch": 0.9814377682403433, + "grad_norm": 0.470703125, + "learning_rate": 4.6461143453004605e-06, + "loss": 2.3953, + "step": 18294 + }, + { + "epoch": 0.9814914163090128, + "grad_norm": 0.453125, + "learning_rate": 4.646069784674635e-06, + "loss": 2.2603, + "step": 18295 + }, + { + "epoch": 0.9815450643776824, + "grad_norm": 0.52734375, + "learning_rate": 4.646025221457206e-06, + "loss": 2.3868, + "step": 18296 + }, + { + "epoch": 0.9815987124463519, + "grad_norm": 0.478515625, + "learning_rate": 4.645980655648231e-06, + "loss": 2.2801, + "step": 18297 + }, + { + "epoch": 0.9816523605150215, + "grad_norm": 0.45703125, + "learning_rate": 4.645936087247761e-06, + "loss": 1.9398, + "step": 18298 + }, + { + "epoch": 0.981706008583691, + "grad_norm": 0.5234375, + "learning_rate": 4.645891516255851e-06, + "loss": 2.3853, + "step": 18299 + }, + { + "epoch": 0.9817596566523605, + "grad_norm": 0.486328125, + "learning_rate": 4.645846942672554e-06, + "loss": 2.1435, + "step": 18300 + }, + { + "epoch": 0.98181330472103, + "grad_norm": 0.47265625, + "learning_rate": 4.645802366497924e-06, + "loss": 2.4931, + "step": 18301 + }, + { + "epoch": 0.9818669527896996, + "grad_norm": 0.55078125, + "learning_rate": 4.645757787732015e-06, + "loss": 2.4519, + "step": 18302 + }, + { + "epoch": 0.9819206008583691, + "grad_norm": 0.51953125, + "learning_rate": 4.645713206374881e-06, + "loss": 2.3879, + "step": 18303 + }, + { + "epoch": 0.9819742489270387, + "grad_norm": 0.53125, + "learning_rate": 4.645668622426575e-06, + "loss": 2.2247, + "step": 18304 + }, + { + "epoch": 0.9820278969957081, + "grad_norm": 0.408203125, + "learning_rate": 4.645624035887153e-06, + "loss": 2.2715, + "step": 18305 + }, + { + "epoch": 0.9820815450643777, + "grad_norm": 0.61328125, + "learning_rate": 4.645579446756667e-06, + "loss": 2.4953, + "step": 18306 + }, + { + "epoch": 0.9821351931330472, + "grad_norm": 0.44921875, + "learning_rate": 4.645534855035171e-06, + "loss": 2.3847, + "step": 18307 + }, + { + "epoch": 0.9821888412017168, + "grad_norm": 1.6640625, + "learning_rate": 4.645490260722718e-06, + "loss": 2.4102, + "step": 18308 + }, + { + "epoch": 0.9822424892703863, + "grad_norm": 0.46484375, + "learning_rate": 4.645445663819364e-06, + "loss": 2.3608, + "step": 18309 + }, + { + "epoch": 0.9822961373390557, + "grad_norm": 0.482421875, + "learning_rate": 4.645401064325162e-06, + "loss": 2.4284, + "step": 18310 + }, + { + "epoch": 0.9823497854077253, + "grad_norm": 0.69140625, + "learning_rate": 4.645356462240165e-06, + "loss": 2.403, + "step": 18311 + }, + { + "epoch": 0.9824034334763948, + "grad_norm": 5.375, + "learning_rate": 4.6453118575644275e-06, + "loss": 1.8188, + "step": 18312 + }, + { + "epoch": 0.9824570815450644, + "grad_norm": 0.59765625, + "learning_rate": 4.645267250298004e-06, + "loss": 2.2467, + "step": 18313 + }, + { + "epoch": 0.9825107296137339, + "grad_norm": 0.53515625, + "learning_rate": 4.6452226404409475e-06, + "loss": 2.1741, + "step": 18314 + }, + { + "epoch": 0.9825643776824035, + "grad_norm": 0.46484375, + "learning_rate": 4.645178027993311e-06, + "loss": 2.3103, + "step": 18315 + }, + { + "epoch": 0.9826180257510729, + "grad_norm": 0.51171875, + "learning_rate": 4.64513341295515e-06, + "loss": 2.4759, + "step": 18316 + }, + { + "epoch": 0.9826716738197425, + "grad_norm": 0.45703125, + "learning_rate": 4.645088795326519e-06, + "loss": 2.3173, + "step": 18317 + }, + { + "epoch": 0.982725321888412, + "grad_norm": 0.45703125, + "learning_rate": 4.64504417510747e-06, + "loss": 2.4701, + "step": 18318 + }, + { + "epoch": 0.9827789699570816, + "grad_norm": 0.50390625, + "learning_rate": 4.644999552298057e-06, + "loss": 2.5217, + "step": 18319 + }, + { + "epoch": 0.9828326180257511, + "grad_norm": 0.41015625, + "learning_rate": 4.644954926898336e-06, + "loss": 2.3592, + "step": 18320 + }, + { + "epoch": 0.9828862660944206, + "grad_norm": 0.498046875, + "learning_rate": 4.644910298908358e-06, + "loss": 1.675, + "step": 18321 + }, + { + "epoch": 0.9829399141630901, + "grad_norm": 0.39453125, + "learning_rate": 4.644865668328179e-06, + "loss": 1.9448, + "step": 18322 + }, + { + "epoch": 0.9829935622317597, + "grad_norm": 1.3359375, + "learning_rate": 4.6448210351578515e-06, + "loss": 2.4443, + "step": 18323 + }, + { + "epoch": 0.9830472103004292, + "grad_norm": 0.703125, + "learning_rate": 4.644776399397431e-06, + "loss": 2.3392, + "step": 18324 + }, + { + "epoch": 0.9831008583690987, + "grad_norm": 0.5390625, + "learning_rate": 4.64473176104697e-06, + "loss": 2.3152, + "step": 18325 + }, + { + "epoch": 0.9831545064377682, + "grad_norm": 0.5078125, + "learning_rate": 4.6446871201065225e-06, + "loss": 2.0434, + "step": 18326 + }, + { + "epoch": 0.9832081545064377, + "grad_norm": 0.49609375, + "learning_rate": 4.644642476576143e-06, + "loss": 2.3069, + "step": 18327 + }, + { + "epoch": 0.9832618025751073, + "grad_norm": 0.466796875, + "learning_rate": 4.644597830455886e-06, + "loss": 2.3584, + "step": 18328 + }, + { + "epoch": 0.9833154506437768, + "grad_norm": 0.4140625, + "learning_rate": 4.644553181745804e-06, + "loss": 2.3054, + "step": 18329 + }, + { + "epoch": 0.9833690987124464, + "grad_norm": 0.478515625, + "learning_rate": 4.644508530445951e-06, + "loss": 2.3191, + "step": 18330 + }, + { + "epoch": 0.9834227467811159, + "grad_norm": 0.51953125, + "learning_rate": 4.644463876556382e-06, + "loss": 2.4645, + "step": 18331 + }, + { + "epoch": 0.9834763948497854, + "grad_norm": 0.5703125, + "learning_rate": 4.644419220077151e-06, + "loss": 2.0096, + "step": 18332 + }, + { + "epoch": 0.9835300429184549, + "grad_norm": 0.4375, + "learning_rate": 4.6443745610083106e-06, + "loss": 2.3732, + "step": 18333 + }, + { + "epoch": 0.9835836909871245, + "grad_norm": 0.373046875, + "learning_rate": 4.644329899349915e-06, + "loss": 2.0889, + "step": 18334 + }, + { + "epoch": 0.983637339055794, + "grad_norm": 0.46484375, + "learning_rate": 4.64428523510202e-06, + "loss": 2.4048, + "step": 18335 + }, + { + "epoch": 0.9836909871244636, + "grad_norm": 0.86328125, + "learning_rate": 4.644240568264677e-06, + "loss": 1.9291, + "step": 18336 + }, + { + "epoch": 0.983744635193133, + "grad_norm": 0.88671875, + "learning_rate": 4.6441958988379406e-06, + "loss": 2.4201, + "step": 18337 + }, + { + "epoch": 0.9837982832618025, + "grad_norm": 0.42578125, + "learning_rate": 4.644151226821866e-06, + "loss": 2.433, + "step": 18338 + }, + { + "epoch": 0.9838519313304721, + "grad_norm": 0.435546875, + "learning_rate": 4.644106552216506e-06, + "loss": 2.3299, + "step": 18339 + }, + { + "epoch": 0.9839055793991416, + "grad_norm": 0.451171875, + "learning_rate": 4.644061875021916e-06, + "loss": 2.1225, + "step": 18340 + }, + { + "epoch": 0.9839592274678112, + "grad_norm": 0.58984375, + "learning_rate": 4.644017195238147e-06, + "loss": 2.1866, + "step": 18341 + }, + { + "epoch": 0.9840128755364806, + "grad_norm": 0.62890625, + "learning_rate": 4.643972512865257e-06, + "loss": 2.1716, + "step": 18342 + }, + { + "epoch": 0.9840665236051502, + "grad_norm": 0.51171875, + "learning_rate": 4.643927827903296e-06, + "loss": 2.2265, + "step": 18343 + }, + { + "epoch": 0.9841201716738197, + "grad_norm": 0.55859375, + "learning_rate": 4.643883140352321e-06, + "loss": 2.2083, + "step": 18344 + }, + { + "epoch": 0.9841738197424893, + "grad_norm": 0.72265625, + "learning_rate": 4.643838450212383e-06, + "loss": 2.3521, + "step": 18345 + }, + { + "epoch": 0.9842274678111588, + "grad_norm": 0.439453125, + "learning_rate": 4.643793757483539e-06, + "loss": 2.0907, + "step": 18346 + }, + { + "epoch": 0.9842811158798284, + "grad_norm": 0.4375, + "learning_rate": 4.643749062165841e-06, + "loss": 2.0301, + "step": 18347 + }, + { + "epoch": 0.9843347639484978, + "grad_norm": 0.455078125, + "learning_rate": 4.643704364259344e-06, + "loss": 2.1274, + "step": 18348 + }, + { + "epoch": 0.9843884120171674, + "grad_norm": 0.40625, + "learning_rate": 4.643659663764102e-06, + "loss": 2.2335, + "step": 18349 + }, + { + "epoch": 0.9844420600858369, + "grad_norm": 0.44921875, + "learning_rate": 4.643614960680167e-06, + "loss": 2.1252, + "step": 18350 + }, + { + "epoch": 0.9844957081545065, + "grad_norm": 0.47265625, + "learning_rate": 4.6435702550075954e-06, + "loss": 2.5558, + "step": 18351 + }, + { + "epoch": 0.984549356223176, + "grad_norm": 0.44140625, + "learning_rate": 4.643525546746442e-06, + "loss": 2.5125, + "step": 18352 + }, + { + "epoch": 0.9846030042918454, + "grad_norm": 0.5, + "learning_rate": 4.643480835896757e-06, + "loss": 2.2009, + "step": 18353 + }, + { + "epoch": 0.984656652360515, + "grad_norm": 0.458984375, + "learning_rate": 4.643436122458597e-06, + "loss": 2.2877, + "step": 18354 + }, + { + "epoch": 0.9847103004291845, + "grad_norm": 0.62890625, + "learning_rate": 4.6433914064320154e-06, + "loss": 2.4353, + "step": 18355 + }, + { + "epoch": 0.9847639484978541, + "grad_norm": 0.498046875, + "learning_rate": 4.643346687817066e-06, + "loss": 2.46, + "step": 18356 + }, + { + "epoch": 0.9848175965665236, + "grad_norm": 0.5703125, + "learning_rate": 4.643301966613804e-06, + "loss": 2.3611, + "step": 18357 + }, + { + "epoch": 0.9848712446351932, + "grad_norm": 0.4765625, + "learning_rate": 4.6432572428222816e-06, + "loss": 2.5812, + "step": 18358 + }, + { + "epoch": 0.9849248927038626, + "grad_norm": 0.48046875, + "learning_rate": 4.643212516442554e-06, + "loss": 2.2744, + "step": 18359 + }, + { + "epoch": 0.9849785407725322, + "grad_norm": 0.447265625, + "learning_rate": 4.6431677874746756e-06, + "loss": 2.2875, + "step": 18360 + }, + { + "epoch": 0.9850321888412017, + "grad_norm": 0.443359375, + "learning_rate": 4.643123055918699e-06, + "loss": 2.2003, + "step": 18361 + }, + { + "epoch": 0.9850858369098713, + "grad_norm": 0.4921875, + "learning_rate": 4.6430783217746795e-06, + "loss": 2.2876, + "step": 18362 + }, + { + "epoch": 0.9851394849785408, + "grad_norm": 0.5, + "learning_rate": 4.64303358504267e-06, + "loss": 2.2513, + "step": 18363 + }, + { + "epoch": 0.9851931330472103, + "grad_norm": 0.4921875, + "learning_rate": 4.642988845722725e-06, + "loss": 2.1817, + "step": 18364 + }, + { + "epoch": 0.9852467811158798, + "grad_norm": 0.458984375, + "learning_rate": 4.6429441038148995e-06, + "loss": 2.262, + "step": 18365 + }, + { + "epoch": 0.9853004291845494, + "grad_norm": 0.462890625, + "learning_rate": 4.642899359319246e-06, + "loss": 1.6052, + "step": 18366 + }, + { + "epoch": 0.9853540772532189, + "grad_norm": 0.609375, + "learning_rate": 4.64285461223582e-06, + "loss": 2.2638, + "step": 18367 + }, + { + "epoch": 0.9854077253218884, + "grad_norm": 1.796875, + "learning_rate": 4.642809862564675e-06, + "loss": 2.34, + "step": 18368 + }, + { + "epoch": 0.985461373390558, + "grad_norm": 0.546875, + "learning_rate": 4.642765110305863e-06, + "loss": 2.3192, + "step": 18369 + }, + { + "epoch": 0.9855150214592274, + "grad_norm": 0.51171875, + "learning_rate": 4.6427203554594415e-06, + "loss": 2.5325, + "step": 18370 + }, + { + "epoch": 0.985568669527897, + "grad_norm": 1.03125, + "learning_rate": 4.642675598025462e-06, + "loss": 2.2122, + "step": 18371 + }, + { + "epoch": 0.9856223175965665, + "grad_norm": 0.7265625, + "learning_rate": 4.642630838003979e-06, + "loss": 2.12, + "step": 18372 + }, + { + "epoch": 0.9856759656652361, + "grad_norm": 0.40234375, + "learning_rate": 4.642586075395048e-06, + "loss": 2.3173, + "step": 18373 + }, + { + "epoch": 0.9857296137339056, + "grad_norm": 0.404296875, + "learning_rate": 4.642541310198722e-06, + "loss": 1.7969, + "step": 18374 + }, + { + "epoch": 0.9857832618025751, + "grad_norm": 0.51171875, + "learning_rate": 4.642496542415054e-06, + "loss": 2.3282, + "step": 18375 + }, + { + "epoch": 0.9858369098712446, + "grad_norm": 0.42578125, + "learning_rate": 4.6424517720441e-06, + "loss": 2.0497, + "step": 18376 + }, + { + "epoch": 0.9858905579399142, + "grad_norm": 0.4375, + "learning_rate": 4.642406999085913e-06, + "loss": 2.3351, + "step": 18377 + }, + { + "epoch": 0.9859442060085837, + "grad_norm": 0.55859375, + "learning_rate": 4.642362223540547e-06, + "loss": 2.0642, + "step": 18378 + }, + { + "epoch": 0.9859978540772533, + "grad_norm": 0.40625, + "learning_rate": 4.6423174454080565e-06, + "loss": 1.9755, + "step": 18379 + }, + { + "epoch": 0.9860515021459227, + "grad_norm": 1.0859375, + "learning_rate": 4.642272664688497e-06, + "loss": 2.3104, + "step": 18380 + }, + { + "epoch": 0.9861051502145923, + "grad_norm": 0.48046875, + "learning_rate": 4.642227881381919e-06, + "loss": 2.3195, + "step": 18381 + }, + { + "epoch": 0.9861587982832618, + "grad_norm": 0.44140625, + "learning_rate": 4.6421830954883784e-06, + "loss": 2.2079, + "step": 18382 + }, + { + "epoch": 0.9862124463519313, + "grad_norm": 0.44921875, + "learning_rate": 4.642138307007931e-06, + "loss": 2.4301, + "step": 18383 + }, + { + "epoch": 0.9862660944206009, + "grad_norm": 0.384765625, + "learning_rate": 4.642093515940629e-06, + "loss": 1.9266, + "step": 18384 + }, + { + "epoch": 0.9863197424892703, + "grad_norm": 0.47265625, + "learning_rate": 4.642048722286526e-06, + "loss": 2.0899, + "step": 18385 + }, + { + "epoch": 0.9863733905579399, + "grad_norm": 0.5078125, + "learning_rate": 4.642003926045677e-06, + "loss": 2.3328, + "step": 18386 + }, + { + "epoch": 0.9864270386266094, + "grad_norm": 1.1328125, + "learning_rate": 4.641959127218137e-06, + "loss": 2.5108, + "step": 18387 + }, + { + "epoch": 0.986480686695279, + "grad_norm": 0.66796875, + "learning_rate": 4.641914325803958e-06, + "loss": 2.3346, + "step": 18388 + }, + { + "epoch": 0.9865343347639485, + "grad_norm": 0.48828125, + "learning_rate": 4.641869521803196e-06, + "loss": 2.2384, + "step": 18389 + }, + { + "epoch": 0.9865879828326181, + "grad_norm": 1.359375, + "learning_rate": 4.641824715215904e-06, + "loss": 2.2263, + "step": 18390 + }, + { + "epoch": 0.9866416309012875, + "grad_norm": 0.4609375, + "learning_rate": 4.641779906042136e-06, + "loss": 2.3532, + "step": 18391 + }, + { + "epoch": 0.9866952789699571, + "grad_norm": 0.46484375, + "learning_rate": 4.641735094281947e-06, + "loss": 2.31, + "step": 18392 + }, + { + "epoch": 0.9867489270386266, + "grad_norm": 0.44140625, + "learning_rate": 4.641690279935391e-06, + "loss": 2.3238, + "step": 18393 + }, + { + "epoch": 0.9868025751072962, + "grad_norm": 0.490234375, + "learning_rate": 4.6416454630025225e-06, + "loss": 2.0891, + "step": 18394 + }, + { + "epoch": 0.9868562231759657, + "grad_norm": 0.486328125, + "learning_rate": 4.641600643483393e-06, + "loss": 2.4459, + "step": 18395 + }, + { + "epoch": 0.9869098712446351, + "grad_norm": 0.64453125, + "learning_rate": 4.64155582137806e-06, + "loss": 2.2839, + "step": 18396 + }, + { + "epoch": 0.9869635193133047, + "grad_norm": 0.53125, + "learning_rate": 4.641510996686575e-06, + "loss": 2.3408, + "step": 18397 + }, + { + "epoch": 0.9870171673819742, + "grad_norm": 0.69140625, + "learning_rate": 4.641466169408995e-06, + "loss": 2.2278, + "step": 18398 + }, + { + "epoch": 0.9870708154506438, + "grad_norm": 0.3828125, + "learning_rate": 4.641421339545371e-06, + "loss": 2.0439, + "step": 18399 + }, + { + "epoch": 0.9871244635193133, + "grad_norm": 0.494140625, + "learning_rate": 4.6413765070957585e-06, + "loss": 2.4207, + "step": 18400 + }, + { + "epoch": 0.9871781115879829, + "grad_norm": 0.40234375, + "learning_rate": 4.641331672060213e-06, + "loss": 2.4205, + "step": 18401 + }, + { + "epoch": 0.9872317596566523, + "grad_norm": 0.46875, + "learning_rate": 4.641286834438786e-06, + "loss": 2.4843, + "step": 18402 + }, + { + "epoch": 0.9872854077253219, + "grad_norm": 0.63671875, + "learning_rate": 4.641241994231533e-06, + "loss": 2.3621, + "step": 18403 + }, + { + "epoch": 0.9873390557939914, + "grad_norm": 0.53515625, + "learning_rate": 4.641197151438509e-06, + "loss": 2.5753, + "step": 18404 + }, + { + "epoch": 0.987392703862661, + "grad_norm": 0.412109375, + "learning_rate": 4.641152306059768e-06, + "loss": 2.0776, + "step": 18405 + }, + { + "epoch": 0.9874463519313305, + "grad_norm": 1.3359375, + "learning_rate": 4.641107458095362e-06, + "loss": 2.387, + "step": 18406 + }, + { + "epoch": 0.9875, + "grad_norm": 0.515625, + "learning_rate": 4.641062607545347e-06, + "loss": 2.0552, + "step": 18407 + }, + { + "epoch": 0.9875536480686695, + "grad_norm": 0.458984375, + "learning_rate": 4.641017754409776e-06, + "loss": 2.4363, + "step": 18408 + }, + { + "epoch": 0.9876072961373391, + "grad_norm": 0.5078125, + "learning_rate": 4.6409728986887046e-06, + "loss": 2.3307, + "step": 18409 + }, + { + "epoch": 0.9876609442060086, + "grad_norm": 0.478515625, + "learning_rate": 4.640928040382186e-06, + "loss": 2.3797, + "step": 18410 + }, + { + "epoch": 0.9877145922746781, + "grad_norm": 0.400390625, + "learning_rate": 4.640883179490276e-06, + "loss": 2.3315, + "step": 18411 + }, + { + "epoch": 0.9877682403433476, + "grad_norm": 0.4765625, + "learning_rate": 4.640838316013027e-06, + "loss": 2.2896, + "step": 18412 + }, + { + "epoch": 0.9878218884120171, + "grad_norm": 0.50390625, + "learning_rate": 4.640793449950493e-06, + "loss": 2.5304, + "step": 18413 + }, + { + "epoch": 0.9878755364806867, + "grad_norm": 0.49609375, + "learning_rate": 4.640748581302729e-06, + "loss": 2.335, + "step": 18414 + }, + { + "epoch": 0.9879291845493562, + "grad_norm": 0.490234375, + "learning_rate": 4.640703710069788e-06, + "loss": 2.523, + "step": 18415 + }, + { + "epoch": 0.9879828326180258, + "grad_norm": 0.470703125, + "learning_rate": 4.640658836251727e-06, + "loss": 2.2468, + "step": 18416 + }, + { + "epoch": 0.9880364806866953, + "grad_norm": 0.52734375, + "learning_rate": 4.640613959848598e-06, + "loss": 2.2626, + "step": 18417 + }, + { + "epoch": 0.9880901287553648, + "grad_norm": 0.427734375, + "learning_rate": 4.640569080860454e-06, + "loss": 2.0258, + "step": 18418 + }, + { + "epoch": 0.9881437768240343, + "grad_norm": 0.458984375, + "learning_rate": 4.640524199287353e-06, + "loss": 2.2647, + "step": 18419 + }, + { + "epoch": 0.9881974248927039, + "grad_norm": 0.431640625, + "learning_rate": 4.640479315129345e-06, + "loss": 2.2784, + "step": 18420 + }, + { + "epoch": 0.9882510729613734, + "grad_norm": 0.546875, + "learning_rate": 4.640434428386486e-06, + "loss": 1.4126, + "step": 18421 + }, + { + "epoch": 0.988304721030043, + "grad_norm": 0.484375, + "learning_rate": 4.640389539058832e-06, + "loss": 2.2709, + "step": 18422 + }, + { + "epoch": 0.9883583690987124, + "grad_norm": 0.412109375, + "learning_rate": 4.640344647146435e-06, + "loss": 2.3514, + "step": 18423 + }, + { + "epoch": 0.988412017167382, + "grad_norm": 0.439453125, + "learning_rate": 4.64029975264935e-06, + "loss": 2.1818, + "step": 18424 + }, + { + "epoch": 0.9884656652360515, + "grad_norm": 0.46875, + "learning_rate": 4.640254855567631e-06, + "loss": 2.4063, + "step": 18425 + }, + { + "epoch": 0.988519313304721, + "grad_norm": 0.455078125, + "learning_rate": 4.640209955901331e-06, + "loss": 2.1131, + "step": 18426 + }, + { + "epoch": 0.9885729613733906, + "grad_norm": 0.466796875, + "learning_rate": 4.640165053650507e-06, + "loss": 2.3453, + "step": 18427 + }, + { + "epoch": 0.98862660944206, + "grad_norm": 0.474609375, + "learning_rate": 4.640120148815211e-06, + "loss": 2.309, + "step": 18428 + }, + { + "epoch": 0.9886802575107296, + "grad_norm": 1.2734375, + "learning_rate": 4.640075241395498e-06, + "loss": 2.2755, + "step": 18429 + }, + { + "epoch": 0.9887339055793991, + "grad_norm": 0.52734375, + "learning_rate": 4.640030331391422e-06, + "loss": 2.311, + "step": 18430 + }, + { + "epoch": 0.9887875536480687, + "grad_norm": 0.625, + "learning_rate": 4.639985418803037e-06, + "loss": 2.1015, + "step": 18431 + }, + { + "epoch": 0.9888412017167382, + "grad_norm": 0.5078125, + "learning_rate": 4.639940503630398e-06, + "loss": 2.1983, + "step": 18432 + }, + { + "epoch": 0.9888948497854078, + "grad_norm": 0.443359375, + "learning_rate": 4.6398955858735584e-06, + "loss": 2.2116, + "step": 18433 + }, + { + "epoch": 0.9889484978540772, + "grad_norm": 0.5, + "learning_rate": 4.639850665532574e-06, + "loss": 2.5622, + "step": 18434 + }, + { + "epoch": 0.9890021459227468, + "grad_norm": 0.478515625, + "learning_rate": 4.639805742607497e-06, + "loss": 2.4134, + "step": 18435 + }, + { + "epoch": 0.9890557939914163, + "grad_norm": 0.48828125, + "learning_rate": 4.639760817098382e-06, + "loss": 2.3762, + "step": 18436 + }, + { + "epoch": 0.9891094420600859, + "grad_norm": 0.53515625, + "learning_rate": 4.639715889005285e-06, + "loss": 2.2242, + "step": 18437 + }, + { + "epoch": 0.9891630901287554, + "grad_norm": 0.451171875, + "learning_rate": 4.639670958328258e-06, + "loss": 2.2098, + "step": 18438 + }, + { + "epoch": 0.9892167381974248, + "grad_norm": 0.53515625, + "learning_rate": 4.639626025067357e-06, + "loss": 2.1885, + "step": 18439 + }, + { + "epoch": 0.9892703862660944, + "grad_norm": 0.9453125, + "learning_rate": 4.639581089222636e-06, + "loss": 1.8377, + "step": 18440 + }, + { + "epoch": 0.9893240343347639, + "grad_norm": 0.53125, + "learning_rate": 4.639536150794149e-06, + "loss": 2.345, + "step": 18441 + }, + { + "epoch": 0.9893776824034335, + "grad_norm": 0.4921875, + "learning_rate": 4.639491209781949e-06, + "loss": 2.3659, + "step": 18442 + }, + { + "epoch": 0.989431330472103, + "grad_norm": 0.4765625, + "learning_rate": 4.639446266186091e-06, + "loss": 2.155, + "step": 18443 + }, + { + "epoch": 0.9894849785407726, + "grad_norm": 0.421875, + "learning_rate": 4.639401320006631e-06, + "loss": 2.2412, + "step": 18444 + }, + { + "epoch": 0.989538626609442, + "grad_norm": 0.53515625, + "learning_rate": 4.639356371243622e-06, + "loss": 2.24, + "step": 18445 + }, + { + "epoch": 0.9895922746781116, + "grad_norm": 0.5078125, + "learning_rate": 4.639311419897118e-06, + "loss": 2.4263, + "step": 18446 + }, + { + "epoch": 0.9896459227467811, + "grad_norm": 0.453125, + "learning_rate": 4.639266465967172e-06, + "loss": 2.5268, + "step": 18447 + }, + { + "epoch": 0.9896995708154507, + "grad_norm": 0.431640625, + "learning_rate": 4.6392215094538415e-06, + "loss": 1.9019, + "step": 18448 + }, + { + "epoch": 0.9897532188841202, + "grad_norm": 0.376953125, + "learning_rate": 4.63917655035718e-06, + "loss": 2.1554, + "step": 18449 + }, + { + "epoch": 0.9898068669527897, + "grad_norm": 0.453125, + "learning_rate": 4.639131588677238e-06, + "loss": 2.2699, + "step": 18450 + }, + { + "epoch": 0.9898605150214592, + "grad_norm": 0.462890625, + "learning_rate": 4.6390866244140745e-06, + "loss": 2.2872, + "step": 18451 + }, + { + "epoch": 0.9899141630901288, + "grad_norm": 2.171875, + "learning_rate": 4.639041657567742e-06, + "loss": 2.272, + "step": 18452 + }, + { + "epoch": 0.9899678111587983, + "grad_norm": 0.48828125, + "learning_rate": 4.638996688138294e-06, + "loss": 2.3619, + "step": 18453 + }, + { + "epoch": 0.9900214592274678, + "grad_norm": 0.466796875, + "learning_rate": 4.638951716125787e-06, + "loss": 2.0418, + "step": 18454 + }, + { + "epoch": 0.9900751072961373, + "grad_norm": 0.4375, + "learning_rate": 4.638906741530273e-06, + "loss": 2.1857, + "step": 18455 + }, + { + "epoch": 0.9901287553648068, + "grad_norm": 0.609375, + "learning_rate": 4.638861764351806e-06, + "loss": 2.2169, + "step": 18456 + }, + { + "epoch": 0.9901824034334764, + "grad_norm": 0.41015625, + "learning_rate": 4.638816784590443e-06, + "loss": 1.9544, + "step": 18457 + }, + { + "epoch": 0.9902360515021459, + "grad_norm": 0.408203125, + "learning_rate": 4.638771802246237e-06, + "loss": 2.3212, + "step": 18458 + }, + { + "epoch": 0.9902896995708155, + "grad_norm": 0.494140625, + "learning_rate": 4.638726817319241e-06, + "loss": 2.3257, + "step": 18459 + }, + { + "epoch": 0.990343347639485, + "grad_norm": 0.5078125, + "learning_rate": 4.638681829809511e-06, + "loss": 2.0873, + "step": 18460 + }, + { + "epoch": 0.9903969957081545, + "grad_norm": 0.421875, + "learning_rate": 4.638636839717101e-06, + "loss": 2.3783, + "step": 18461 + }, + { + "epoch": 0.990450643776824, + "grad_norm": 0.4609375, + "learning_rate": 4.638591847042065e-06, + "loss": 2.2434, + "step": 18462 + }, + { + "epoch": 0.9905042918454936, + "grad_norm": 0.369140625, + "learning_rate": 4.638546851784458e-06, + "loss": 2.0599, + "step": 18463 + }, + { + "epoch": 0.9905579399141631, + "grad_norm": 0.46875, + "learning_rate": 4.638501853944334e-06, + "loss": 2.0638, + "step": 18464 + }, + { + "epoch": 0.9906115879828327, + "grad_norm": 0.51171875, + "learning_rate": 4.638456853521746e-06, + "loss": 2.6954, + "step": 18465 + }, + { + "epoch": 0.9906652360515021, + "grad_norm": 0.88671875, + "learning_rate": 4.638411850516749e-06, + "loss": 2.4783, + "step": 18466 + }, + { + "epoch": 0.9907188841201717, + "grad_norm": 0.427734375, + "learning_rate": 4.638366844929399e-06, + "loss": 2.294, + "step": 18467 + }, + { + "epoch": 0.9907725321888412, + "grad_norm": 0.51171875, + "learning_rate": 4.63832183675975e-06, + "loss": 1.556, + "step": 18468 + }, + { + "epoch": 0.9908261802575107, + "grad_norm": 0.47265625, + "learning_rate": 4.638276826007855e-06, + "loss": 1.9315, + "step": 18469 + }, + { + "epoch": 0.9908798283261803, + "grad_norm": 0.45703125, + "learning_rate": 4.638231812673768e-06, + "loss": 2.4161, + "step": 18470 + }, + { + "epoch": 0.9909334763948497, + "grad_norm": 0.5546875, + "learning_rate": 4.638186796757545e-06, + "loss": 2.0786, + "step": 18471 + }, + { + "epoch": 0.9909871244635193, + "grad_norm": 0.47265625, + "learning_rate": 4.638141778259239e-06, + "loss": 2.4437, + "step": 18472 + }, + { + "epoch": 0.9910407725321888, + "grad_norm": 0.52734375, + "learning_rate": 4.638096757178905e-06, + "loss": 2.1273, + "step": 18473 + }, + { + "epoch": 0.9910944206008584, + "grad_norm": 0.435546875, + "learning_rate": 4.638051733516598e-06, + "loss": 2.4259, + "step": 18474 + }, + { + "epoch": 0.9911480686695279, + "grad_norm": 0.5078125, + "learning_rate": 4.638006707272371e-06, + "loss": 2.1839, + "step": 18475 + }, + { + "epoch": 0.9912017167381975, + "grad_norm": 0.58984375, + "learning_rate": 4.637961678446279e-06, + "loss": 2.3231, + "step": 18476 + }, + { + "epoch": 0.9912553648068669, + "grad_norm": 0.4609375, + "learning_rate": 4.637916647038377e-06, + "loss": 2.3497, + "step": 18477 + }, + { + "epoch": 0.9913090128755365, + "grad_norm": 0.765625, + "learning_rate": 4.637871613048719e-06, + "loss": 1.4131, + "step": 18478 + }, + { + "epoch": 0.991362660944206, + "grad_norm": 0.474609375, + "learning_rate": 4.637826576477359e-06, + "loss": 2.2083, + "step": 18479 + }, + { + "epoch": 0.9914163090128756, + "grad_norm": 0.478515625, + "learning_rate": 4.637781537324352e-06, + "loss": 2.3453, + "step": 18480 + }, + { + "epoch": 0.9914699570815451, + "grad_norm": 0.4765625, + "learning_rate": 4.637736495589751e-06, + "loss": 2.1214, + "step": 18481 + }, + { + "epoch": 0.9915236051502145, + "grad_norm": 0.4921875, + "learning_rate": 4.637691451273613e-06, + "loss": 2.195, + "step": 18482 + }, + { + "epoch": 0.9915772532188841, + "grad_norm": 0.412109375, + "learning_rate": 4.637646404375989e-06, + "loss": 2.1244, + "step": 18483 + }, + { + "epoch": 0.9916309012875536, + "grad_norm": 0.455078125, + "learning_rate": 4.637601354896936e-06, + "loss": 2.2528, + "step": 18484 + }, + { + "epoch": 0.9916845493562232, + "grad_norm": 0.5859375, + "learning_rate": 4.637556302836508e-06, + "loss": 2.3156, + "step": 18485 + }, + { + "epoch": 0.9917381974248927, + "grad_norm": 0.51953125, + "learning_rate": 4.6375112481947585e-06, + "loss": 2.0302, + "step": 18486 + }, + { + "epoch": 0.9917918454935623, + "grad_norm": 0.4296875, + "learning_rate": 4.637466190971742e-06, + "loss": 2.2013, + "step": 18487 + }, + { + "epoch": 0.9918454935622317, + "grad_norm": 0.439453125, + "learning_rate": 4.637421131167514e-06, + "loss": 1.8367, + "step": 18488 + }, + { + "epoch": 0.9918991416309013, + "grad_norm": 0.5, + "learning_rate": 4.637376068782128e-06, + "loss": 2.2049, + "step": 18489 + }, + { + "epoch": 0.9919527896995708, + "grad_norm": 1.03125, + "learning_rate": 4.637331003815638e-06, + "loss": 2.5601, + "step": 18490 + }, + { + "epoch": 0.9920064377682404, + "grad_norm": 0.435546875, + "learning_rate": 4.6372859362681e-06, + "loss": 2.4137, + "step": 18491 + }, + { + "epoch": 0.9920600858369099, + "grad_norm": 0.466796875, + "learning_rate": 4.637240866139567e-06, + "loss": 2.2295, + "step": 18492 + }, + { + "epoch": 0.9921137339055794, + "grad_norm": 0.46875, + "learning_rate": 4.637195793430095e-06, + "loss": 2.2801, + "step": 18493 + }, + { + "epoch": 0.9921673819742489, + "grad_norm": 0.48828125, + "learning_rate": 4.637150718139736e-06, + "loss": 2.5397, + "step": 18494 + }, + { + "epoch": 0.9922210300429185, + "grad_norm": 0.59765625, + "learning_rate": 4.637105640268547e-06, + "loss": 2.0078, + "step": 18495 + }, + { + "epoch": 0.992274678111588, + "grad_norm": 0.4609375, + "learning_rate": 4.637060559816579e-06, + "loss": 2.1839, + "step": 18496 + }, + { + "epoch": 0.9923283261802575, + "grad_norm": 0.458984375, + "learning_rate": 4.637015476783891e-06, + "loss": 2.4861, + "step": 18497 + }, + { + "epoch": 0.992381974248927, + "grad_norm": 0.50390625, + "learning_rate": 4.6369703911705345e-06, + "loss": 2.406, + "step": 18498 + }, + { + "epoch": 0.9924356223175965, + "grad_norm": 0.4375, + "learning_rate": 4.636925302976564e-06, + "loss": 2.2573, + "step": 18499 + }, + { + "epoch": 0.9924892703862661, + "grad_norm": 0.5, + "learning_rate": 4.636880212202035e-06, + "loss": 2.341, + "step": 18500 + }, + { + "epoch": 0.9925429184549356, + "grad_norm": 0.408203125, + "learning_rate": 4.6368351188470015e-06, + "loss": 2.2043, + "step": 18501 + }, + { + "epoch": 0.9925965665236052, + "grad_norm": 0.431640625, + "learning_rate": 4.636790022911517e-06, + "loss": 2.4666, + "step": 18502 + }, + { + "epoch": 0.9926502145922746, + "grad_norm": 3.453125, + "learning_rate": 4.636744924395638e-06, + "loss": 2.2664, + "step": 18503 + }, + { + "epoch": 0.9927038626609442, + "grad_norm": 0.55859375, + "learning_rate": 4.6366998232994176e-06, + "loss": 2.3533, + "step": 18504 + }, + { + "epoch": 0.9927575107296137, + "grad_norm": 0.52734375, + "learning_rate": 4.6366547196229095e-06, + "loss": 2.1668, + "step": 18505 + }, + { + "epoch": 0.9928111587982833, + "grad_norm": 0.51953125, + "learning_rate": 4.6366096133661705e-06, + "loss": 2.3851, + "step": 18506 + }, + { + "epoch": 0.9928648068669528, + "grad_norm": 0.466796875, + "learning_rate": 4.636564504529253e-06, + "loss": 2.261, + "step": 18507 + }, + { + "epoch": 0.9929184549356224, + "grad_norm": 0.5078125, + "learning_rate": 4.636519393112212e-06, + "loss": 2.333, + "step": 18508 + }, + { + "epoch": 0.9929721030042918, + "grad_norm": 0.6796875, + "learning_rate": 4.636474279115103e-06, + "loss": 2.3285, + "step": 18509 + }, + { + "epoch": 0.9930257510729614, + "grad_norm": 0.53125, + "learning_rate": 4.6364291625379785e-06, + "loss": 2.4331, + "step": 18510 + }, + { + "epoch": 0.9930793991416309, + "grad_norm": 0.51171875, + "learning_rate": 4.636384043380895e-06, + "loss": 2.3475, + "step": 18511 + }, + { + "epoch": 0.9931330472103004, + "grad_norm": 0.546875, + "learning_rate": 4.636338921643906e-06, + "loss": 2.301, + "step": 18512 + }, + { + "epoch": 0.99318669527897, + "grad_norm": 0.490234375, + "learning_rate": 4.636293797327066e-06, + "loss": 2.2414, + "step": 18513 + }, + { + "epoch": 0.9932403433476394, + "grad_norm": 0.453125, + "learning_rate": 4.6362486704304294e-06, + "loss": 2.1805, + "step": 18514 + }, + { + "epoch": 0.993293991416309, + "grad_norm": 0.51953125, + "learning_rate": 4.636203540954051e-06, + "loss": 2.3307, + "step": 18515 + }, + { + "epoch": 0.9933476394849785, + "grad_norm": 0.48046875, + "learning_rate": 4.636158408897985e-06, + "loss": 2.2297, + "step": 18516 + }, + { + "epoch": 0.9934012875536481, + "grad_norm": 0.70703125, + "learning_rate": 4.636113274262286e-06, + "loss": 2.1286, + "step": 18517 + }, + { + "epoch": 0.9934549356223176, + "grad_norm": 0.59375, + "learning_rate": 4.63606813704701e-06, + "loss": 2.2771, + "step": 18518 + }, + { + "epoch": 0.9935085836909872, + "grad_norm": 0.51171875, + "learning_rate": 4.636022997252209e-06, + "loss": 2.4226, + "step": 18519 + }, + { + "epoch": 0.9935622317596566, + "grad_norm": 0.484375, + "learning_rate": 4.635977854877939e-06, + "loss": 2.279, + "step": 18520 + }, + { + "epoch": 0.9936158798283262, + "grad_norm": 0.71484375, + "learning_rate": 4.6359327099242535e-06, + "loss": 2.4439, + "step": 18521 + }, + { + "epoch": 0.9936695278969957, + "grad_norm": 0.5, + "learning_rate": 4.6358875623912085e-06, + "loss": 2.2861, + "step": 18522 + }, + { + "epoch": 0.9937231759656653, + "grad_norm": 0.388671875, + "learning_rate": 4.635842412278857e-06, + "loss": 2.3415, + "step": 18523 + }, + { + "epoch": 0.9937768240343348, + "grad_norm": 0.462890625, + "learning_rate": 4.635797259587254e-06, + "loss": 2.2722, + "step": 18524 + }, + { + "epoch": 0.9938304721030042, + "grad_norm": 0.455078125, + "learning_rate": 4.6357521043164545e-06, + "loss": 2.1334, + "step": 18525 + }, + { + "epoch": 0.9938841201716738, + "grad_norm": 0.400390625, + "learning_rate": 4.6357069464665135e-06, + "loss": 2.2731, + "step": 18526 + }, + { + "epoch": 0.9939377682403433, + "grad_norm": 0.470703125, + "learning_rate": 4.635661786037484e-06, + "loss": 2.36, + "step": 18527 + }, + { + "epoch": 0.9939914163090129, + "grad_norm": 0.447265625, + "learning_rate": 4.635616623029422e-06, + "loss": 2.1463, + "step": 18528 + }, + { + "epoch": 0.9940450643776824, + "grad_norm": 0.439453125, + "learning_rate": 4.63557145744238e-06, + "loss": 2.0941, + "step": 18529 + }, + { + "epoch": 0.994098712446352, + "grad_norm": 0.48828125, + "learning_rate": 4.6355262892764155e-06, + "loss": 2.3246, + "step": 18530 + }, + { + "epoch": 0.9941523605150214, + "grad_norm": 0.4453125, + "learning_rate": 4.63548111853158e-06, + "loss": 2.2348, + "step": 18531 + }, + { + "epoch": 0.994206008583691, + "grad_norm": 0.73046875, + "learning_rate": 4.6354359452079305e-06, + "loss": 2.0869, + "step": 18532 + }, + { + "epoch": 0.9942596566523605, + "grad_norm": 0.484375, + "learning_rate": 4.63539076930552e-06, + "loss": 2.56, + "step": 18533 + }, + { + "epoch": 0.9943133047210301, + "grad_norm": 0.455078125, + "learning_rate": 4.6353455908244035e-06, + "loss": 2.3095, + "step": 18534 + }, + { + "epoch": 0.9943669527896996, + "grad_norm": 0.49609375, + "learning_rate": 4.6353004097646366e-06, + "loss": 2.2348, + "step": 18535 + }, + { + "epoch": 0.9944206008583691, + "grad_norm": 0.58203125, + "learning_rate": 4.635255226126273e-06, + "loss": 2.0877, + "step": 18536 + }, + { + "epoch": 0.9944742489270386, + "grad_norm": 0.41015625, + "learning_rate": 4.635210039909366e-06, + "loss": 2.4045, + "step": 18537 + }, + { + "epoch": 0.9945278969957082, + "grad_norm": 0.53125, + "learning_rate": 4.6351648511139714e-06, + "loss": 2.4566, + "step": 18538 + }, + { + "epoch": 0.9945815450643777, + "grad_norm": 0.5078125, + "learning_rate": 4.635119659740145e-06, + "loss": 1.8445, + "step": 18539 + }, + { + "epoch": 0.9946351931330472, + "grad_norm": 0.4296875, + "learning_rate": 4.635074465787939e-06, + "loss": 2.3432, + "step": 18540 + }, + { + "epoch": 0.9946888412017167, + "grad_norm": 0.462890625, + "learning_rate": 4.63502926925741e-06, + "loss": 2.3522, + "step": 18541 + }, + { + "epoch": 0.9947424892703862, + "grad_norm": 0.4765625, + "learning_rate": 4.634984070148611e-06, + "loss": 2.4215, + "step": 18542 + }, + { + "epoch": 0.9947961373390558, + "grad_norm": 0.466796875, + "learning_rate": 4.634938868461597e-06, + "loss": 1.9957, + "step": 18543 + }, + { + "epoch": 0.9948497854077253, + "grad_norm": 0.76953125, + "learning_rate": 4.634893664196423e-06, + "loss": 2.3136, + "step": 18544 + }, + { + "epoch": 0.9949034334763949, + "grad_norm": 0.4140625, + "learning_rate": 4.634848457353143e-06, + "loss": 2.3308, + "step": 18545 + }, + { + "epoch": 0.9949570815450643, + "grad_norm": 0.52734375, + "learning_rate": 4.634803247931813e-06, + "loss": 2.3265, + "step": 18546 + }, + { + "epoch": 0.9950107296137339, + "grad_norm": 0.40625, + "learning_rate": 4.634758035932486e-06, + "loss": 2.5011, + "step": 18547 + }, + { + "epoch": 0.9950643776824034, + "grad_norm": 0.439453125, + "learning_rate": 4.634712821355217e-06, + "loss": 2.0595, + "step": 18548 + }, + { + "epoch": 0.995118025751073, + "grad_norm": 0.51953125, + "learning_rate": 4.634667604200061e-06, + "loss": 2.5218, + "step": 18549 + }, + { + "epoch": 0.9951716738197425, + "grad_norm": 0.45703125, + "learning_rate": 4.634622384467073e-06, + "loss": 2.3134, + "step": 18550 + }, + { + "epoch": 0.9952253218884121, + "grad_norm": 0.4609375, + "learning_rate": 4.634577162156306e-06, + "loss": 2.2172, + "step": 18551 + }, + { + "epoch": 0.9952789699570815, + "grad_norm": 0.435546875, + "learning_rate": 4.634531937267817e-06, + "loss": 2.0138, + "step": 18552 + }, + { + "epoch": 0.9953326180257511, + "grad_norm": 0.51953125, + "learning_rate": 4.634486709801658e-06, + "loss": 2.3406, + "step": 18553 + }, + { + "epoch": 0.9953862660944206, + "grad_norm": 0.458984375, + "learning_rate": 4.6344414797578855e-06, + "loss": 2.1366, + "step": 18554 + }, + { + "epoch": 0.9954399141630901, + "grad_norm": 1.09375, + "learning_rate": 4.634396247136552e-06, + "loss": 2.1051, + "step": 18555 + }, + { + "epoch": 0.9954935622317597, + "grad_norm": 0.423828125, + "learning_rate": 4.634351011937716e-06, + "loss": 2.226, + "step": 18556 + }, + { + "epoch": 0.9955472103004291, + "grad_norm": 0.5546875, + "learning_rate": 4.634305774161428e-06, + "loss": 2.2205, + "step": 18557 + }, + { + "epoch": 0.9956008583690987, + "grad_norm": 0.4453125, + "learning_rate": 4.634260533807745e-06, + "loss": 2.4483, + "step": 18558 + }, + { + "epoch": 0.9956545064377682, + "grad_norm": 3.6875, + "learning_rate": 4.63421529087672e-06, + "loss": 1.5685, + "step": 18559 + }, + { + "epoch": 0.9957081545064378, + "grad_norm": 0.470703125, + "learning_rate": 4.63417004536841e-06, + "loss": 2.3141, + "step": 18560 + }, + { + "epoch": 0.9957618025751073, + "grad_norm": 0.5390625, + "learning_rate": 4.634124797282867e-06, + "loss": 1.7588, + "step": 18561 + }, + { + "epoch": 0.9958154506437769, + "grad_norm": 0.42578125, + "learning_rate": 4.634079546620148e-06, + "loss": 2.3839, + "step": 18562 + }, + { + "epoch": 0.9958690987124463, + "grad_norm": 0.39453125, + "learning_rate": 4.634034293380307e-06, + "loss": 1.2817, + "step": 18563 + }, + { + "epoch": 0.9959227467811159, + "grad_norm": 0.482421875, + "learning_rate": 4.633989037563396e-06, + "loss": 2.262, + "step": 18564 + }, + { + "epoch": 0.9959763948497854, + "grad_norm": 0.455078125, + "learning_rate": 4.633943779169474e-06, + "loss": 2.2673, + "step": 18565 + }, + { + "epoch": 0.996030042918455, + "grad_norm": 0.37890625, + "learning_rate": 4.6338985181985925e-06, + "loss": 2.1201, + "step": 18566 + }, + { + "epoch": 0.9960836909871245, + "grad_norm": 0.546875, + "learning_rate": 4.6338532546508074e-06, + "loss": 2.3382, + "step": 18567 + }, + { + "epoch": 0.996137339055794, + "grad_norm": 0.59375, + "learning_rate": 4.633807988526174e-06, + "loss": 1.1146, + "step": 18568 + }, + { + "epoch": 0.9961909871244635, + "grad_norm": 0.443359375, + "learning_rate": 4.6337627198247445e-06, + "loss": 2.6407, + "step": 18569 + }, + { + "epoch": 0.996244635193133, + "grad_norm": 0.408203125, + "learning_rate": 4.633717448546575e-06, + "loss": 2.2518, + "step": 18570 + }, + { + "epoch": 0.9962982832618026, + "grad_norm": 1.1953125, + "learning_rate": 4.633672174691721e-06, + "loss": 2.3065, + "step": 18571 + }, + { + "epoch": 0.9963519313304721, + "grad_norm": 1.109375, + "learning_rate": 4.633626898260237e-06, + "loss": 2.1976, + "step": 18572 + }, + { + "epoch": 0.9964055793991416, + "grad_norm": 0.447265625, + "learning_rate": 4.6335816192521764e-06, + "loss": 1.8336, + "step": 18573 + }, + { + "epoch": 0.9964592274678111, + "grad_norm": 0.462890625, + "learning_rate": 4.6335363376675956e-06, + "loss": 2.2248, + "step": 18574 + }, + { + "epoch": 0.9965128755364807, + "grad_norm": 0.86328125, + "learning_rate": 4.633491053506547e-06, + "loss": 1.5772, + "step": 18575 + }, + { + "epoch": 0.9965665236051502, + "grad_norm": 0.451171875, + "learning_rate": 4.633445766769088e-06, + "loss": 2.4071, + "step": 18576 + }, + { + "epoch": 0.9966201716738198, + "grad_norm": 0.47265625, + "learning_rate": 4.633400477455271e-06, + "loss": 2.2095, + "step": 18577 + }, + { + "epoch": 0.9966738197424893, + "grad_norm": 0.74609375, + "learning_rate": 4.633355185565151e-06, + "loss": 2.281, + "step": 18578 + }, + { + "epoch": 0.9967274678111588, + "grad_norm": 0.45703125, + "learning_rate": 4.633309891098785e-06, + "loss": 1.8198, + "step": 18579 + }, + { + "epoch": 0.9967811158798283, + "grad_norm": 0.470703125, + "learning_rate": 4.633264594056225e-06, + "loss": 2.4718, + "step": 18580 + }, + { + "epoch": 0.9968347639484979, + "grad_norm": 0.439453125, + "learning_rate": 4.633219294437526e-06, + "loss": 2.1427, + "step": 18581 + }, + { + "epoch": 0.9968884120171674, + "grad_norm": 0.4140625, + "learning_rate": 4.633173992242744e-06, + "loss": 2.0254, + "step": 18582 + }, + { + "epoch": 0.9969420600858369, + "grad_norm": 0.5, + "learning_rate": 4.633128687471934e-06, + "loss": 2.6161, + "step": 18583 + }, + { + "epoch": 0.9969957081545064, + "grad_norm": 0.47265625, + "learning_rate": 4.633083380125148e-06, + "loss": 2.4282, + "step": 18584 + }, + { + "epoch": 0.9970493562231759, + "grad_norm": 0.482421875, + "learning_rate": 4.633038070202444e-06, + "loss": 2.2225, + "step": 18585 + }, + { + "epoch": 0.9971030042918455, + "grad_norm": 0.4609375, + "learning_rate": 4.6329927577038745e-06, + "loss": 2.189, + "step": 18586 + }, + { + "epoch": 0.997156652360515, + "grad_norm": 0.494140625, + "learning_rate": 4.6329474426294955e-06, + "loss": 1.7412, + "step": 18587 + }, + { + "epoch": 0.9972103004291846, + "grad_norm": 0.357421875, + "learning_rate": 4.632902124979361e-06, + "loss": 1.9748, + "step": 18588 + }, + { + "epoch": 0.997263948497854, + "grad_norm": 0.60546875, + "learning_rate": 4.632856804753526e-06, + "loss": 2.1228, + "step": 18589 + }, + { + "epoch": 0.9973175965665236, + "grad_norm": 0.5859375, + "learning_rate": 4.632811481952045e-06, + "loss": 2.3545, + "step": 18590 + }, + { + "epoch": 0.9973712446351931, + "grad_norm": 0.52734375, + "learning_rate": 4.632766156574972e-06, + "loss": 2.0743, + "step": 18591 + }, + { + "epoch": 0.9974248927038627, + "grad_norm": 0.396484375, + "learning_rate": 4.632720828622364e-06, + "loss": 2.0964, + "step": 18592 + }, + { + "epoch": 0.9974785407725322, + "grad_norm": 0.50390625, + "learning_rate": 4.6326754980942735e-06, + "loss": 2.4147, + "step": 18593 + }, + { + "epoch": 0.9975321888412018, + "grad_norm": 0.435546875, + "learning_rate": 4.632630164990756e-06, + "loss": 2.3403, + "step": 18594 + }, + { + "epoch": 0.9975858369098712, + "grad_norm": 0.46875, + "learning_rate": 4.632584829311867e-06, + "loss": 2.1093, + "step": 18595 + }, + { + "epoch": 0.9976394849785408, + "grad_norm": 0.482421875, + "learning_rate": 4.63253949105766e-06, + "loss": 2.4088, + "step": 18596 + }, + { + "epoch": 0.9976931330472103, + "grad_norm": 0.5390625, + "learning_rate": 4.632494150228191e-06, + "loss": 2.1453, + "step": 18597 + }, + { + "epoch": 0.9977467811158798, + "grad_norm": 0.453125, + "learning_rate": 4.632448806823513e-06, + "loss": 2.3289, + "step": 18598 + }, + { + "epoch": 0.9978004291845494, + "grad_norm": 0.609375, + "learning_rate": 4.632403460843682e-06, + "loss": 2.4617, + "step": 18599 + }, + { + "epoch": 0.9978540772532188, + "grad_norm": 0.58984375, + "learning_rate": 4.632358112288753e-06, + "loss": 2.4396, + "step": 18600 + }, + { + "epoch": 0.9979077253218884, + "grad_norm": 0.51953125, + "learning_rate": 4.63231276115878e-06, + "loss": 2.2861, + "step": 18601 + }, + { + "epoch": 0.9979613733905579, + "grad_norm": 0.3984375, + "learning_rate": 4.632267407453819e-06, + "loss": 2.282, + "step": 18602 + }, + { + "epoch": 0.9980150214592275, + "grad_norm": 0.515625, + "learning_rate": 4.632222051173924e-06, + "loss": 2.3248, + "step": 18603 + }, + { + "epoch": 0.998068669527897, + "grad_norm": 0.49609375, + "learning_rate": 4.632176692319148e-06, + "loss": 2.3974, + "step": 18604 + }, + { + "epoch": 0.9981223175965666, + "grad_norm": 0.439453125, + "learning_rate": 4.632131330889549e-06, + "loss": 2.2842, + "step": 18605 + }, + { + "epoch": 0.998175965665236, + "grad_norm": 0.431640625, + "learning_rate": 4.632085966885179e-06, + "loss": 2.2313, + "step": 18606 + }, + { + "epoch": 0.9982296137339056, + "grad_norm": 0.41796875, + "learning_rate": 4.632040600306095e-06, + "loss": 2.4737, + "step": 18607 + }, + { + "epoch": 0.9982832618025751, + "grad_norm": 0.494140625, + "learning_rate": 4.631995231152351e-06, + "loss": 2.0868, + "step": 18608 + }, + { + "epoch": 0.9983369098712447, + "grad_norm": 0.42578125, + "learning_rate": 4.631949859424001e-06, + "loss": 1.9142, + "step": 18609 + }, + { + "epoch": 0.9983905579399142, + "grad_norm": 0.490234375, + "learning_rate": 4.6319044851211e-06, + "loss": 2.2672, + "step": 18610 + }, + { + "epoch": 0.9984442060085837, + "grad_norm": 0.609375, + "learning_rate": 4.631859108243704e-06, + "loss": 2.4555, + "step": 18611 + }, + { + "epoch": 0.9984978540772532, + "grad_norm": 0.47265625, + "learning_rate": 4.631813728791866e-06, + "loss": 2.4307, + "step": 18612 + }, + { + "epoch": 0.9985515021459227, + "grad_norm": 0.41015625, + "learning_rate": 4.631768346765642e-06, + "loss": 2.1144, + "step": 18613 + }, + { + "epoch": 0.9986051502145923, + "grad_norm": 0.392578125, + "learning_rate": 4.631722962165087e-06, + "loss": 2.3457, + "step": 18614 + }, + { + "epoch": 0.9986587982832618, + "grad_norm": 0.451171875, + "learning_rate": 4.631677574990255e-06, + "loss": 2.3586, + "step": 18615 + }, + { + "epoch": 0.9987124463519313, + "grad_norm": 0.5859375, + "learning_rate": 4.631632185241202e-06, + "loss": 2.3716, + "step": 18616 + }, + { + "epoch": 0.9987660944206008, + "grad_norm": 0.482421875, + "learning_rate": 4.631586792917981e-06, + "loss": 2.3199, + "step": 18617 + }, + { + "epoch": 0.9988197424892704, + "grad_norm": 0.400390625, + "learning_rate": 4.631541398020647e-06, + "loss": 2.3497, + "step": 18618 + }, + { + "epoch": 0.9988733905579399, + "grad_norm": 0.5234375, + "learning_rate": 4.631496000549257e-06, + "loss": 2.4496, + "step": 18619 + }, + { + "epoch": 0.9989270386266095, + "grad_norm": 0.482421875, + "learning_rate": 4.631450600503864e-06, + "loss": 2.4108, + "step": 18620 + }, + { + "epoch": 0.998980686695279, + "grad_norm": 0.96875, + "learning_rate": 4.631405197884524e-06, + "loss": 2.512, + "step": 18621 + }, + { + "epoch": 0.9990343347639485, + "grad_norm": 0.423828125, + "learning_rate": 4.63135979269129e-06, + "loss": 2.2648, + "step": 18622 + }, + { + "epoch": 0.999087982832618, + "grad_norm": 0.470703125, + "learning_rate": 4.6313143849242185e-06, + "loss": 2.2955, + "step": 18623 + }, + { + "epoch": 0.9991416309012876, + "grad_norm": 0.53515625, + "learning_rate": 4.631268974583364e-06, + "loss": 2.348, + "step": 18624 + }, + { + "epoch": 0.9991952789699571, + "grad_norm": 1.40625, + "learning_rate": 4.631223561668781e-06, + "loss": 1.472, + "step": 18625 + }, + { + "epoch": 0.9992489270386266, + "grad_norm": 0.419921875, + "learning_rate": 4.631178146180524e-06, + "loss": 2.2418, + "step": 18626 + }, + { + "epoch": 0.9993025751072961, + "grad_norm": 0.44140625, + "learning_rate": 4.631132728118649e-06, + "loss": 2.0519, + "step": 18627 + }, + { + "epoch": 0.9993562231759656, + "grad_norm": 0.4921875, + "learning_rate": 4.631087307483209e-06, + "loss": 2.5763, + "step": 18628 + }, + { + "epoch": 0.9994098712446352, + "grad_norm": 0.5390625, + "learning_rate": 4.631041884274261e-06, + "loss": 2.3582, + "step": 18629 + }, + { + "epoch": 0.9994635193133047, + "grad_norm": 0.671875, + "learning_rate": 4.630996458491858e-06, + "loss": 2.0687, + "step": 18630 + }, + { + "epoch": 0.9995171673819743, + "grad_norm": 0.44921875, + "learning_rate": 4.630951030136056e-06, + "loss": 2.4045, + "step": 18631 + }, + { + "epoch": 0.9995708154506437, + "grad_norm": 0.447265625, + "learning_rate": 4.63090559920691e-06, + "loss": 2.0803, + "step": 18632 + }, + { + "epoch": 0.9996244635193133, + "grad_norm": 0.63671875, + "learning_rate": 4.630860165704474e-06, + "loss": 1.8399, + "step": 18633 + }, + { + "epoch": 0.9996781115879828, + "grad_norm": 0.5, + "learning_rate": 4.6308147296288035e-06, + "loss": 2.5026, + "step": 18634 + }, + { + "epoch": 0.9997317596566524, + "grad_norm": 0.4921875, + "learning_rate": 4.630769290979953e-06, + "loss": 2.2246, + "step": 18635 + }, + { + "epoch": 0.9997854077253219, + "grad_norm": 0.6875, + "learning_rate": 4.630723849757978e-06, + "loss": 2.3125, + "step": 18636 + }, + { + "epoch": 0.9998390557939915, + "grad_norm": 0.4921875, + "learning_rate": 4.6306784059629315e-06, + "loss": 2.3083, + "step": 18637 + }, + { + "epoch": 0.9998927038626609, + "grad_norm": 0.515625, + "learning_rate": 4.630632959594871e-06, + "loss": 2.1613, + "step": 18638 + }, + { + "epoch": 0.9999463519313305, + "grad_norm": 0.6171875, + "learning_rate": 4.63058751065385e-06, + "loss": 2.377, + "step": 18639 + }, + { + "epoch": 1.0, + "grad_norm": 0.494140625, + "learning_rate": 4.630542059139923e-06, + "loss": 2.3591, + "step": 18640 + } + ], + "logging_steps": 1, + "max_steps": 93200, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5966987500157665e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}