{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20144203657455892, "eval_steps": 500, "global_step": 5001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1379.3720991798398, "learning_rate": 2.6845637583892618e-08, "loss": 15.4268, "step": 1 }, { "epoch": 0.0, "grad_norm": 12.8534206219106, "learning_rate": 5.3691275167785235e-08, "loss": 1.5295, "step": 2 }, { "epoch": 0.0, "grad_norm": 1367.2156892086268, "learning_rate": 8.053691275167787e-08, "loss": 14.914, "step": 3 }, { "epoch": 0.0, "grad_norm": 1356.0751967393617, "learning_rate": 1.0738255033557047e-07, "loss": 15.3003, "step": 4 }, { "epoch": 0.0, "grad_norm": 1332.7461428880738, "learning_rate": 1.342281879194631e-07, "loss": 15.3399, "step": 5 }, { "epoch": 0.0, "grad_norm": 1327.7087244291388, "learning_rate": 1.6107382550335574e-07, "loss": 15.6574, "step": 6 }, { "epoch": 0.0, "grad_norm": 1357.2838160327776, "learning_rate": 1.8791946308724833e-07, "loss": 15.3619, "step": 7 }, { "epoch": 0.0, "grad_norm": 1460.9880894767196, "learning_rate": 2.1476510067114094e-07, "loss": 14.8984, "step": 8 }, { "epoch": 0.0, "grad_norm": 13.685025255387016, "learning_rate": 2.4161073825503355e-07, "loss": 1.6682, "step": 9 }, { "epoch": 0.0, "grad_norm": 1433.944513190723, "learning_rate": 2.684563758389262e-07, "loss": 15.0011, "step": 10 }, { "epoch": 0.0, "grad_norm": 1302.7623411450743, "learning_rate": 2.9530201342281884e-07, "loss": 15.6583, "step": 11 }, { "epoch": 0.0, "grad_norm": 1345.8042090611784, "learning_rate": 3.221476510067115e-07, "loss": 15.4581, "step": 12 }, { "epoch": 0.0, "grad_norm": 1208.1489105112164, "learning_rate": 3.48993288590604e-07, "loss": 14.4632, "step": 13 }, { "epoch": 0.0, "grad_norm": 1122.3733969914717, "learning_rate": 3.7583892617449665e-07, "loss": 13.5493, "step": 14 }, { "epoch": 0.0, "grad_norm": 1036.9703513692707, "learning_rate": 4.026845637583893e-07, "loss": 13.0069, "step": 15 }, { "epoch": 0.0, "grad_norm": 989.8774194879742, "learning_rate": 4.295302013422819e-07, "loss": 13.0841, "step": 16 }, { "epoch": 0.0, "grad_norm": 1070.3499952623663, "learning_rate": 4.563758389261745e-07, "loss": 13.8049, "step": 17 }, { "epoch": 0.0, "grad_norm": 1022.0857382159567, "learning_rate": 4.832214765100671e-07, "loss": 13.1312, "step": 18 }, { "epoch": 0.0, "grad_norm": 922.0295847794329, "learning_rate": 5.100671140939598e-07, "loss": 12.2413, "step": 19 }, { "epoch": 0.0, "grad_norm": 626.8057262987901, "learning_rate": 5.369127516778524e-07, "loss": 10.251, "step": 20 }, { "epoch": 0.0, "grad_norm": 574.7762030056041, "learning_rate": 5.63758389261745e-07, "loss": 10.5845, "step": 21 }, { "epoch": 0.0, "grad_norm": 621.5321839899642, "learning_rate": 5.906040268456377e-07, "loss": 10.1517, "step": 22 }, { "epoch": 0.0, "grad_norm": 627.1730841493847, "learning_rate": 6.174496644295303e-07, "loss": 10.0554, "step": 23 }, { "epoch": 0.0, "grad_norm": 510.36730831743716, "learning_rate": 6.44295302013423e-07, "loss": 10.2728, "step": 24 }, { "epoch": 0.0, "grad_norm": 519.3995865128837, "learning_rate": 6.711409395973155e-07, "loss": 9.6979, "step": 25 }, { "epoch": 0.0, "grad_norm": 457.2100248409554, "learning_rate": 6.97986577181208e-07, "loss": 8.404, "step": 26 }, { "epoch": 0.0, "grad_norm": 333.7980179207978, "learning_rate": 7.248322147651007e-07, "loss": 7.0196, "step": 27 }, { "epoch": 0.0, "grad_norm": 230.65095048133824, "learning_rate": 7.516778523489933e-07, "loss": 7.0535, "step": 28 }, { "epoch": 0.0, "grad_norm": 202.45785726173838, "learning_rate": 7.78523489932886e-07, "loss": 7.3524, "step": 29 }, { "epoch": 0.0, "grad_norm": 141.1784162971984, "learning_rate": 8.053691275167786e-07, "loss": 5.9181, "step": 30 }, { "epoch": 0.0, "grad_norm": 160.36072995219004, "learning_rate": 8.322147651006713e-07, "loss": 6.7424, "step": 31 }, { "epoch": 0.0, "grad_norm": 153.3983638890247, "learning_rate": 8.590604026845638e-07, "loss": 5.3766, "step": 32 }, { "epoch": 0.0, "grad_norm": 160.0499921485999, "learning_rate": 8.859060402684565e-07, "loss": 5.067, "step": 33 }, { "epoch": 0.0, "grad_norm": 144.8789271164616, "learning_rate": 9.12751677852349e-07, "loss": 5.6964, "step": 34 }, { "epoch": 0.0, "grad_norm": 142.97309552434834, "learning_rate": 9.395973154362417e-07, "loss": 5.2783, "step": 35 }, { "epoch": 0.0, "grad_norm": 120.28867130605691, "learning_rate": 9.664429530201342e-07, "loss": 5.8628, "step": 36 }, { "epoch": 0.0, "grad_norm": 117.39289909868887, "learning_rate": 9.93288590604027e-07, "loss": 6.0932, "step": 37 }, { "epoch": 0.0, "grad_norm": 109.1450812867474, "learning_rate": 1.0201342281879196e-06, "loss": 5.2595, "step": 38 }, { "epoch": 0.0, "grad_norm": 119.76168539749345, "learning_rate": 1.0469798657718122e-06, "loss": 5.8302, "step": 39 }, { "epoch": 0.0, "grad_norm": 138.6191466863621, "learning_rate": 1.0738255033557048e-06, "loss": 4.4134, "step": 40 }, { "epoch": 0.0, "grad_norm": 104.40519142078718, "learning_rate": 1.1006711409395974e-06, "loss": 5.6586, "step": 41 }, { "epoch": 0.0, "grad_norm": 70.673119264975, "learning_rate": 1.12751677852349e-06, "loss": 4.8559, "step": 42 }, { "epoch": 0.0, "grad_norm": 88.61579685891138, "learning_rate": 1.1543624161073828e-06, "loss": 4.5503, "step": 43 }, { "epoch": 0.0, "grad_norm": 55.93590312064801, "learning_rate": 1.1812080536912753e-06, "loss": 4.311, "step": 44 }, { "epoch": 0.0, "grad_norm": 118.27309796259414, "learning_rate": 1.208053691275168e-06, "loss": 3.9019, "step": 45 }, { "epoch": 0.0, "grad_norm": 78.85209279862124, "learning_rate": 1.2348993288590605e-06, "loss": 4.1337, "step": 46 }, { "epoch": 0.0, "grad_norm": 98.12405486611367, "learning_rate": 1.2617449664429531e-06, "loss": 4.5948, "step": 47 }, { "epoch": 0.0, "grad_norm": 63.959240832161754, "learning_rate": 1.288590604026846e-06, "loss": 3.5753, "step": 48 }, { "epoch": 0.0, "grad_norm": 56.58840985353587, "learning_rate": 1.3154362416107383e-06, "loss": 3.6236, "step": 49 }, { "epoch": 0.0, "grad_norm": 55.24870913585886, "learning_rate": 1.342281879194631e-06, "loss": 3.4079, "step": 50 }, { "epoch": 0.0, "grad_norm": 47.87746896035972, "learning_rate": 1.3691275167785237e-06, "loss": 3.6738, "step": 51 }, { "epoch": 0.0, "grad_norm": 51.97487694440137, "learning_rate": 1.395973154362416e-06, "loss": 3.8814, "step": 52 }, { "epoch": 0.0, "grad_norm": 42.98929196517629, "learning_rate": 1.4228187919463088e-06, "loss": 3.3119, "step": 53 }, { "epoch": 0.0, "grad_norm": 41.24350579976683, "learning_rate": 1.4496644295302014e-06, "loss": 3.0046, "step": 54 }, { "epoch": 0.0, "grad_norm": 45.75625157306433, "learning_rate": 1.4765100671140942e-06, "loss": 3.3642, "step": 55 }, { "epoch": 0.0, "grad_norm": 45.14172474761506, "learning_rate": 1.5033557046979866e-06, "loss": 3.3027, "step": 56 }, { "epoch": 0.0, "grad_norm": 53.02674522833074, "learning_rate": 1.5302013422818792e-06, "loss": 3.4546, "step": 57 }, { "epoch": 0.0, "grad_norm": 37.83432483620663, "learning_rate": 1.557046979865772e-06, "loss": 3.2131, "step": 58 }, { "epoch": 0.0, "grad_norm": 32.0919898696877, "learning_rate": 1.5838926174496646e-06, "loss": 3.2642, "step": 59 }, { "epoch": 0.0, "grad_norm": 42.87961228631677, "learning_rate": 1.6107382550335572e-06, "loss": 3.0634, "step": 60 }, { "epoch": 0.0, "grad_norm": 42.971138864100254, "learning_rate": 1.6375838926174498e-06, "loss": 3.3706, "step": 61 }, { "epoch": 0.0, "grad_norm": 48.06803919950474, "learning_rate": 1.6644295302013426e-06, "loss": 3.4155, "step": 62 }, { "epoch": 0.0, "grad_norm": 33.48043635353635, "learning_rate": 1.6912751677852351e-06, "loss": 2.839, "step": 63 }, { "epoch": 0.0, "grad_norm": 30.969839843504406, "learning_rate": 1.7181208053691275e-06, "loss": 2.7469, "step": 64 }, { "epoch": 0.0, "grad_norm": 26.65704331081703, "learning_rate": 1.7449664429530203e-06, "loss": 2.6614, "step": 65 }, { "epoch": 0.0, "grad_norm": 30.509605083858826, "learning_rate": 1.771812080536913e-06, "loss": 2.5327, "step": 66 }, { "epoch": 0.0, "grad_norm": 27.797477758110855, "learning_rate": 1.7986577181208057e-06, "loss": 2.5899, "step": 67 }, { "epoch": 0.0, "grad_norm": 32.33846675375741, "learning_rate": 1.825503355704698e-06, "loss": 2.9863, "step": 68 }, { "epoch": 0.0, "grad_norm": 26.92341785904472, "learning_rate": 1.8523489932885907e-06, "loss": 2.7372, "step": 69 }, { "epoch": 0.0, "grad_norm": 28.679791412739323, "learning_rate": 1.8791946308724835e-06, "loss": 3.014, "step": 70 }, { "epoch": 0.0, "grad_norm": 25.13369507776243, "learning_rate": 1.906040268456376e-06, "loss": 2.602, "step": 71 }, { "epoch": 0.0, "grad_norm": 34.02156210830726, "learning_rate": 1.9328859060402684e-06, "loss": 3.0853, "step": 72 }, { "epoch": 0.0, "grad_norm": 30.70853474511914, "learning_rate": 1.959731543624161e-06, "loss": 2.1585, "step": 73 }, { "epoch": 0.0, "grad_norm": 27.27251874476091, "learning_rate": 1.986577181208054e-06, "loss": 2.4983, "step": 74 }, { "epoch": 0.0, "grad_norm": 26.32702895458006, "learning_rate": 2.013422818791946e-06, "loss": 2.4734, "step": 75 }, { "epoch": 0.0, "grad_norm": 24.51841515770848, "learning_rate": 2.0402684563758392e-06, "loss": 2.162, "step": 76 }, { "epoch": 0.0, "grad_norm": 22.064990372140535, "learning_rate": 2.067114093959732e-06, "loss": 2.3108, "step": 77 }, { "epoch": 0.0, "grad_norm": 28.064437504570606, "learning_rate": 2.0939597315436244e-06, "loss": 2.9155, "step": 78 }, { "epoch": 0.0, "grad_norm": 12.536350362321539, "learning_rate": 2.120805369127517e-06, "loss": 1.4456, "step": 79 }, { "epoch": 0.0, "grad_norm": 25.978376246878465, "learning_rate": 2.1476510067114096e-06, "loss": 2.6396, "step": 80 }, { "epoch": 0.0, "grad_norm": 25.562256155967653, "learning_rate": 2.174496644295302e-06, "loss": 2.215, "step": 81 }, { "epoch": 0.0, "grad_norm": 19.77336038919066, "learning_rate": 2.2013422818791947e-06, "loss": 2.0772, "step": 82 }, { "epoch": 0.0, "grad_norm": 27.049999307203677, "learning_rate": 2.2281879194630873e-06, "loss": 2.3279, "step": 83 }, { "epoch": 0.0, "grad_norm": 12.24508052793857, "learning_rate": 2.25503355704698e-06, "loss": 1.4648, "step": 84 }, { "epoch": 0.0, "grad_norm": 23.313229785568076, "learning_rate": 2.2818791946308725e-06, "loss": 2.7208, "step": 85 }, { "epoch": 0.0, "grad_norm": 23.129347964745936, "learning_rate": 2.3087248322147655e-06, "loss": 1.9661, "step": 86 }, { "epoch": 0.0, "grad_norm": 20.958667114185133, "learning_rate": 2.3355704697986577e-06, "loss": 2.2202, "step": 87 }, { "epoch": 0.0, "grad_norm": 24.74421340201436, "learning_rate": 2.3624161073825507e-06, "loss": 2.124, "step": 88 }, { "epoch": 0.0, "grad_norm": 12.638125408192659, "learning_rate": 2.3892617449664433e-06, "loss": 1.4054, "step": 89 }, { "epoch": 0.0, "grad_norm": 22.547612050266068, "learning_rate": 2.416107382550336e-06, "loss": 1.8086, "step": 90 }, { "epoch": 0.0, "grad_norm": 19.728919128173345, "learning_rate": 2.4429530201342285e-06, "loss": 2.1664, "step": 91 }, { "epoch": 0.0, "grad_norm": 22.818225129062817, "learning_rate": 2.469798657718121e-06, "loss": 2.0583, "step": 92 }, { "epoch": 0.0, "grad_norm": 24.987351354977292, "learning_rate": 2.4966442953020136e-06, "loss": 2.2603, "step": 93 }, { "epoch": 0.0, "grad_norm": 20.296590866071373, "learning_rate": 2.5234899328859062e-06, "loss": 2.1282, "step": 94 }, { "epoch": 0.0, "grad_norm": 18.214574496499722, "learning_rate": 2.5503355704697992e-06, "loss": 2.0341, "step": 95 }, { "epoch": 0.0, "grad_norm": 19.256988084631885, "learning_rate": 2.577181208053692e-06, "loss": 2.1784, "step": 96 }, { "epoch": 0.0, "grad_norm": 14.750386975252132, "learning_rate": 2.604026845637584e-06, "loss": 1.7499, "step": 97 }, { "epoch": 0.0, "grad_norm": 19.10847467657068, "learning_rate": 2.6308724832214766e-06, "loss": 1.9537, "step": 98 }, { "epoch": 0.0, "grad_norm": 20.306771416543913, "learning_rate": 2.657718120805369e-06, "loss": 2.0663, "step": 99 }, { "epoch": 0.0, "grad_norm": 22.84553552279815, "learning_rate": 2.684563758389262e-06, "loss": 2.1094, "step": 100 }, { "epoch": 0.0, "grad_norm": 19.510941228250555, "learning_rate": 2.7114093959731548e-06, "loss": 2.0442, "step": 101 }, { "epoch": 0.0, "grad_norm": 15.797589490096449, "learning_rate": 2.7382550335570473e-06, "loss": 1.7963, "step": 102 }, { "epoch": 0.0, "grad_norm": 17.723040840925055, "learning_rate": 2.76510067114094e-06, "loss": 2.0927, "step": 103 }, { "epoch": 0.0, "grad_norm": 20.56979983597794, "learning_rate": 2.791946308724832e-06, "loss": 1.9796, "step": 104 }, { "epoch": 0.0, "grad_norm": 14.38876629404607, "learning_rate": 2.8187919463087247e-06, "loss": 1.9783, "step": 105 }, { "epoch": 0.0, "grad_norm": 16.199703074114808, "learning_rate": 2.8456375838926177e-06, "loss": 1.9211, "step": 106 }, { "epoch": 0.0, "grad_norm": 14.789568611069523, "learning_rate": 2.8724832214765103e-06, "loss": 1.767, "step": 107 }, { "epoch": 0.0, "grad_norm": 14.58163221726849, "learning_rate": 2.899328859060403e-06, "loss": 1.9911, "step": 108 }, { "epoch": 0.0, "grad_norm": 19.91176329218805, "learning_rate": 2.9261744966442955e-06, "loss": 1.8932, "step": 109 }, { "epoch": 0.0, "grad_norm": 20.956470177399684, "learning_rate": 2.9530201342281885e-06, "loss": 1.8611, "step": 110 }, { "epoch": 0.0, "grad_norm": 14.733270877123367, "learning_rate": 2.979865771812081e-06, "loss": 2.0041, "step": 111 }, { "epoch": 0.0, "grad_norm": 15.18002295197271, "learning_rate": 3.0067114093959732e-06, "loss": 1.5112, "step": 112 }, { "epoch": 0.0, "grad_norm": 25.70936859670564, "learning_rate": 3.033557046979866e-06, "loss": 2.4545, "step": 113 }, { "epoch": 0.0, "grad_norm": 13.926470122370823, "learning_rate": 3.0604026845637584e-06, "loss": 1.7448, "step": 114 }, { "epoch": 0.0, "grad_norm": 15.943350791797702, "learning_rate": 3.0872483221476514e-06, "loss": 1.9604, "step": 115 }, { "epoch": 0.0, "grad_norm": 20.197824441019495, "learning_rate": 3.114093959731544e-06, "loss": 2.0546, "step": 116 }, { "epoch": 0.0, "grad_norm": 16.3138879828343, "learning_rate": 3.1409395973154366e-06, "loss": 1.2785, "step": 117 }, { "epoch": 0.0, "grad_norm": 21.35395249592559, "learning_rate": 3.167785234899329e-06, "loss": 2.7419, "step": 118 }, { "epoch": 0.0, "grad_norm": 12.69111880960645, "learning_rate": 3.194630872483222e-06, "loss": 1.6434, "step": 119 }, { "epoch": 0.0, "grad_norm": 12.982455849431444, "learning_rate": 3.2214765100671143e-06, "loss": 2.1976, "step": 120 }, { "epoch": 0.0, "grad_norm": 11.518962918485057, "learning_rate": 3.248322147651007e-06, "loss": 1.5123, "step": 121 }, { "epoch": 0.0, "grad_norm": 20.064316885133437, "learning_rate": 3.2751677852348995e-06, "loss": 2.206, "step": 122 }, { "epoch": 0.0, "grad_norm": 11.196567443999271, "learning_rate": 3.302013422818792e-06, "loss": 1.4922, "step": 123 }, { "epoch": 0.0, "grad_norm": 6.813290931368686, "learning_rate": 3.328859060402685e-06, "loss": 1.1576, "step": 124 }, { "epoch": 0.01, "grad_norm": 14.643274264416528, "learning_rate": 3.3557046979865777e-06, "loss": 1.9169, "step": 125 }, { "epoch": 0.01, "grad_norm": 12.764007126181044, "learning_rate": 3.3825503355704703e-06, "loss": 1.8815, "step": 126 }, { "epoch": 0.01, "grad_norm": 14.096268086959679, "learning_rate": 3.4093959731543625e-06, "loss": 1.567, "step": 127 }, { "epoch": 0.01, "grad_norm": 18.250938395234062, "learning_rate": 3.436241610738255e-06, "loss": 1.668, "step": 128 }, { "epoch": 0.01, "grad_norm": 11.573891263834847, "learning_rate": 3.4630872483221476e-06, "loss": 1.6717, "step": 129 }, { "epoch": 0.01, "grad_norm": 24.141761954303966, "learning_rate": 3.4899328859060407e-06, "loss": 1.7085, "step": 130 }, { "epoch": 0.01, "grad_norm": 12.860055788677856, "learning_rate": 3.5167785234899332e-06, "loss": 1.8231, "step": 131 }, { "epoch": 0.01, "grad_norm": 14.50483829259475, "learning_rate": 3.543624161073826e-06, "loss": 2.04, "step": 132 }, { "epoch": 0.01, "grad_norm": 13.340768519036331, "learning_rate": 3.5704697986577184e-06, "loss": 1.4616, "step": 133 }, { "epoch": 0.01, "grad_norm": 12.286266651787656, "learning_rate": 3.5973154362416114e-06, "loss": 1.4944, "step": 134 }, { "epoch": 0.01, "grad_norm": 12.223707110673606, "learning_rate": 3.6241610738255036e-06, "loss": 1.6233, "step": 135 }, { "epoch": 0.01, "grad_norm": 14.48345026987458, "learning_rate": 3.651006711409396e-06, "loss": 1.9901, "step": 136 }, { "epoch": 0.01, "grad_norm": 20.26552789758307, "learning_rate": 3.6778523489932888e-06, "loss": 2.0787, "step": 137 }, { "epoch": 0.01, "grad_norm": 19.72525228652948, "learning_rate": 3.7046979865771814e-06, "loss": 2.0334, "step": 138 }, { "epoch": 0.01, "grad_norm": 12.780049120791178, "learning_rate": 3.7315436241610744e-06, "loss": 1.6787, "step": 139 }, { "epoch": 0.01, "grad_norm": 14.546074380485582, "learning_rate": 3.758389261744967e-06, "loss": 1.8702, "step": 140 }, { "epoch": 0.01, "grad_norm": 6.709097755903235, "learning_rate": 3.7852348993288595e-06, "loss": 1.2595, "step": 141 }, { "epoch": 0.01, "grad_norm": 12.063162323488752, "learning_rate": 3.812080536912752e-06, "loss": 1.784, "step": 142 }, { "epoch": 0.01, "grad_norm": 13.957625227865835, "learning_rate": 3.838926174496644e-06, "loss": 1.8175, "step": 143 }, { "epoch": 0.01, "grad_norm": 15.085079054839767, "learning_rate": 3.865771812080537e-06, "loss": 1.6911, "step": 144 }, { "epoch": 0.01, "grad_norm": 8.31556562796878, "learning_rate": 3.8926174496644295e-06, "loss": 1.3445, "step": 145 }, { "epoch": 0.01, "grad_norm": 10.540520671108675, "learning_rate": 3.919463087248322e-06, "loss": 1.5777, "step": 146 }, { "epoch": 0.01, "grad_norm": 15.576257909876029, "learning_rate": 3.9463087248322155e-06, "loss": 1.5895, "step": 147 }, { "epoch": 0.01, "grad_norm": 15.052530982896844, "learning_rate": 3.973154362416108e-06, "loss": 1.6578, "step": 148 }, { "epoch": 0.01, "grad_norm": 12.71377622166453, "learning_rate": 4.000000000000001e-06, "loss": 1.8639, "step": 149 }, { "epoch": 0.01, "grad_norm": 14.918372148849897, "learning_rate": 4.026845637583892e-06, "loss": 1.4887, "step": 150 }, { "epoch": 0.01, "grad_norm": 11.821435242090589, "learning_rate": 4.053691275167785e-06, "loss": 1.8509, "step": 151 }, { "epoch": 0.01, "grad_norm": 14.446613557133626, "learning_rate": 4.0805369127516784e-06, "loss": 1.643, "step": 152 }, { "epoch": 0.01, "grad_norm": 15.579093564838542, "learning_rate": 4.107382550335571e-06, "loss": 1.339, "step": 153 }, { "epoch": 0.01, "grad_norm": 13.299064293707799, "learning_rate": 4.134228187919464e-06, "loss": 1.8641, "step": 154 }, { "epoch": 0.01, "grad_norm": 12.139099152761217, "learning_rate": 4.161073825503356e-06, "loss": 1.6654, "step": 155 }, { "epoch": 0.01, "grad_norm": 11.686682477529438, "learning_rate": 4.187919463087249e-06, "loss": 1.412, "step": 156 }, { "epoch": 0.01, "grad_norm": 14.51065934826115, "learning_rate": 4.214765100671141e-06, "loss": 2.0085, "step": 157 }, { "epoch": 0.01, "grad_norm": 17.265061302846355, "learning_rate": 4.241610738255034e-06, "loss": 2.324, "step": 158 }, { "epoch": 0.01, "grad_norm": 10.807505565522677, "learning_rate": 4.2684563758389265e-06, "loss": 1.7117, "step": 159 }, { "epoch": 0.01, "grad_norm": 16.153231537480185, "learning_rate": 4.295302013422819e-06, "loss": 1.9677, "step": 160 }, { "epoch": 0.01, "grad_norm": 15.489437743942664, "learning_rate": 4.322147651006712e-06, "loss": 1.9075, "step": 161 }, { "epoch": 0.01, "grad_norm": 14.627439994995036, "learning_rate": 4.348993288590604e-06, "loss": 1.7908, "step": 162 }, { "epoch": 0.01, "grad_norm": 12.314315042757203, "learning_rate": 4.375838926174497e-06, "loss": 1.628, "step": 163 }, { "epoch": 0.01, "grad_norm": 18.894482362082506, "learning_rate": 4.4026845637583895e-06, "loss": 1.6236, "step": 164 }, { "epoch": 0.01, "grad_norm": 16.345393611291524, "learning_rate": 4.429530201342283e-06, "loss": 2.0266, "step": 165 }, { "epoch": 0.01, "grad_norm": 11.603244940585046, "learning_rate": 4.456375838926175e-06, "loss": 1.5495, "step": 166 }, { "epoch": 0.01, "grad_norm": 10.345492404743105, "learning_rate": 4.483221476510067e-06, "loss": 1.4911, "step": 167 }, { "epoch": 0.01, "grad_norm": 12.440021844240855, "learning_rate": 4.51006711409396e-06, "loss": 1.8804, "step": 168 }, { "epoch": 0.01, "grad_norm": 11.531648439709466, "learning_rate": 4.536912751677852e-06, "loss": 1.697, "step": 169 }, { "epoch": 0.01, "grad_norm": 11.08337602148859, "learning_rate": 4.563758389261745e-06, "loss": 1.5035, "step": 170 }, { "epoch": 0.01, "grad_norm": 13.535791614438475, "learning_rate": 4.5906040268456384e-06, "loss": 1.8912, "step": 171 }, { "epoch": 0.01, "grad_norm": 11.881529290309787, "learning_rate": 4.617449664429531e-06, "loss": 1.6131, "step": 172 }, { "epoch": 0.01, "grad_norm": 16.17970250569803, "learning_rate": 4.644295302013423e-06, "loss": 1.7546, "step": 173 }, { "epoch": 0.01, "grad_norm": 15.143087308647942, "learning_rate": 4.671140939597315e-06, "loss": 1.5703, "step": 174 }, { "epoch": 0.01, "grad_norm": 14.28331408382497, "learning_rate": 4.697986577181208e-06, "loss": 2.0012, "step": 175 }, { "epoch": 0.01, "grad_norm": 10.884699065314573, "learning_rate": 4.724832214765101e-06, "loss": 1.5294, "step": 176 }, { "epoch": 0.01, "grad_norm": 10.722896782080753, "learning_rate": 4.751677852348994e-06, "loss": 1.5955, "step": 177 }, { "epoch": 0.01, "grad_norm": 14.347335472195118, "learning_rate": 4.7785234899328866e-06, "loss": 1.6365, "step": 178 }, { "epoch": 0.01, "grad_norm": 15.851492785664655, "learning_rate": 4.805369127516779e-06, "loss": 1.9486, "step": 179 }, { "epoch": 0.01, "grad_norm": 19.971523057766166, "learning_rate": 4.832214765100672e-06, "loss": 1.8916, "step": 180 }, { "epoch": 0.01, "grad_norm": 13.29731024397963, "learning_rate": 4.859060402684564e-06, "loss": 1.5239, "step": 181 }, { "epoch": 0.01, "grad_norm": 14.239374412559837, "learning_rate": 4.885906040268457e-06, "loss": 1.6987, "step": 182 }, { "epoch": 0.01, "grad_norm": 14.579998991834582, "learning_rate": 4.9127516778523495e-06, "loss": 1.7507, "step": 183 }, { "epoch": 0.01, "grad_norm": 11.827398285369991, "learning_rate": 4.939597315436242e-06, "loss": 1.8709, "step": 184 }, { "epoch": 0.01, "grad_norm": 13.180745238261542, "learning_rate": 4.966442953020135e-06, "loss": 1.4853, "step": 185 }, { "epoch": 0.01, "grad_norm": 9.77188804594341, "learning_rate": 4.993288590604027e-06, "loss": 1.2317, "step": 186 }, { "epoch": 0.01, "grad_norm": 10.764297684473298, "learning_rate": 5.02013422818792e-06, "loss": 1.529, "step": 187 }, { "epoch": 0.01, "grad_norm": 11.410177769159763, "learning_rate": 5.0469798657718124e-06, "loss": 1.2431, "step": 188 }, { "epoch": 0.01, "grad_norm": 18.296675687538578, "learning_rate": 5.073825503355705e-06, "loss": 1.4392, "step": 189 }, { "epoch": 0.01, "grad_norm": 15.151768475375025, "learning_rate": 5.1006711409395985e-06, "loss": 1.7412, "step": 190 }, { "epoch": 0.01, "grad_norm": 15.601085935647076, "learning_rate": 5.12751677852349e-06, "loss": 2.1132, "step": 191 }, { "epoch": 0.01, "grad_norm": 21.31842403632272, "learning_rate": 5.154362416107384e-06, "loss": 1.5016, "step": 192 }, { "epoch": 0.01, "grad_norm": 23.061021935372647, "learning_rate": 5.181208053691275e-06, "loss": 1.8035, "step": 193 }, { "epoch": 0.01, "grad_norm": 8.943096562508227, "learning_rate": 5.208053691275168e-06, "loss": 1.6704, "step": 194 }, { "epoch": 0.01, "grad_norm": 9.91609745320432, "learning_rate": 5.234899328859061e-06, "loss": 1.4274, "step": 195 }, { "epoch": 0.01, "grad_norm": 10.043075066679481, "learning_rate": 5.261744966442953e-06, "loss": 1.5762, "step": 196 }, { "epoch": 0.01, "grad_norm": 16.17379490776072, "learning_rate": 5.2885906040268466e-06, "loss": 1.6552, "step": 197 }, { "epoch": 0.01, "grad_norm": 10.652944866848367, "learning_rate": 5.315436241610738e-06, "loss": 1.516, "step": 198 }, { "epoch": 0.01, "grad_norm": 10.061365814480085, "learning_rate": 5.342281879194632e-06, "loss": 1.6705, "step": 199 }, { "epoch": 0.01, "grad_norm": 14.024462008727843, "learning_rate": 5.369127516778524e-06, "loss": 1.7595, "step": 200 }, { "epoch": 0.01, "grad_norm": 9.35316912645751, "learning_rate": 5.395973154362416e-06, "loss": 1.4622, "step": 201 }, { "epoch": 0.01, "grad_norm": 16.825025297260247, "learning_rate": 5.4228187919463095e-06, "loss": 2.0792, "step": 202 }, { "epoch": 0.01, "grad_norm": 13.765515690477317, "learning_rate": 5.449664429530201e-06, "loss": 1.6338, "step": 203 }, { "epoch": 0.01, "grad_norm": 12.8581687685068, "learning_rate": 5.476510067114095e-06, "loss": 1.7885, "step": 204 }, { "epoch": 0.01, "grad_norm": 12.189091030248958, "learning_rate": 5.503355704697987e-06, "loss": 1.828, "step": 205 }, { "epoch": 0.01, "grad_norm": 7.667264981245788, "learning_rate": 5.53020134228188e-06, "loss": 1.3538, "step": 206 }, { "epoch": 0.01, "grad_norm": 12.86597105681116, "learning_rate": 5.5570469798657725e-06, "loss": 1.6074, "step": 207 }, { "epoch": 0.01, "grad_norm": 12.725591120764053, "learning_rate": 5.583892617449664e-06, "loss": 1.4364, "step": 208 }, { "epoch": 0.01, "grad_norm": 19.989212120333164, "learning_rate": 5.610738255033558e-06, "loss": 1.4904, "step": 209 }, { "epoch": 0.01, "grad_norm": 12.446222742017182, "learning_rate": 5.637583892617449e-06, "loss": 1.4446, "step": 210 }, { "epoch": 0.01, "grad_norm": 4.8196189133302445, "learning_rate": 5.664429530201343e-06, "loss": 1.155, "step": 211 }, { "epoch": 0.01, "grad_norm": 12.57547711753834, "learning_rate": 5.691275167785235e-06, "loss": 1.764, "step": 212 }, { "epoch": 0.01, "grad_norm": 11.485919062651456, "learning_rate": 5.718120805369128e-06, "loss": 1.3615, "step": 213 }, { "epoch": 0.01, "grad_norm": 14.24774036826691, "learning_rate": 5.7449664429530206e-06, "loss": 1.3537, "step": 214 }, { "epoch": 0.01, "grad_norm": 10.484321234211297, "learning_rate": 5.771812080536914e-06, "loss": 1.6412, "step": 215 }, { "epoch": 0.01, "grad_norm": 9.507940120152089, "learning_rate": 5.798657718120806e-06, "loss": 1.3493, "step": 216 }, { "epoch": 0.01, "grad_norm": 10.754718070726947, "learning_rate": 5.825503355704698e-06, "loss": 1.8523, "step": 217 }, { "epoch": 0.01, "grad_norm": 9.401522533753987, "learning_rate": 5.852348993288591e-06, "loss": 1.6014, "step": 218 }, { "epoch": 0.01, "grad_norm": 9.353450906498342, "learning_rate": 5.8791946308724835e-06, "loss": 1.5356, "step": 219 }, { "epoch": 0.01, "grad_norm": 10.596158219614574, "learning_rate": 5.906040268456377e-06, "loss": 1.5215, "step": 220 }, { "epoch": 0.01, "grad_norm": 12.028007652005108, "learning_rate": 5.932885906040269e-06, "loss": 1.5457, "step": 221 }, { "epoch": 0.01, "grad_norm": 9.60145639881299, "learning_rate": 5.959731543624162e-06, "loss": 1.5872, "step": 222 }, { "epoch": 0.01, "grad_norm": 8.957821931272354, "learning_rate": 5.986577181208054e-06, "loss": 1.4347, "step": 223 }, { "epoch": 0.01, "grad_norm": 11.133189588283791, "learning_rate": 6.0134228187919464e-06, "loss": 1.7833, "step": 224 }, { "epoch": 0.01, "grad_norm": 14.41756147762758, "learning_rate": 6.04026845637584e-06, "loss": 1.8138, "step": 225 }, { "epoch": 0.01, "grad_norm": 13.610841931715774, "learning_rate": 6.067114093959732e-06, "loss": 1.8726, "step": 226 }, { "epoch": 0.01, "grad_norm": 11.495916536168222, "learning_rate": 6.093959731543625e-06, "loss": 1.7612, "step": 227 }, { "epoch": 0.01, "grad_norm": 10.156497571691345, "learning_rate": 6.120805369127517e-06, "loss": 1.51, "step": 228 }, { "epoch": 0.01, "grad_norm": 9.983284316136041, "learning_rate": 6.14765100671141e-06, "loss": 1.4645, "step": 229 }, { "epoch": 0.01, "grad_norm": 13.12661480251222, "learning_rate": 6.174496644295303e-06, "loss": 1.658, "step": 230 }, { "epoch": 0.01, "grad_norm": 8.552454706231153, "learning_rate": 6.2013422818791946e-06, "loss": 1.2123, "step": 231 }, { "epoch": 0.01, "grad_norm": 10.123196858785338, "learning_rate": 6.228187919463088e-06, "loss": 1.7071, "step": 232 }, { "epoch": 0.01, "grad_norm": 8.580620826008001, "learning_rate": 6.25503355704698e-06, "loss": 1.3564, "step": 233 }, { "epoch": 0.01, "grad_norm": 10.263579162132157, "learning_rate": 6.281879194630873e-06, "loss": 1.6229, "step": 234 }, { "epoch": 0.01, "grad_norm": 13.598528740649439, "learning_rate": 6.308724832214766e-06, "loss": 2.0826, "step": 235 }, { "epoch": 0.01, "grad_norm": 11.671772121388846, "learning_rate": 6.335570469798658e-06, "loss": 1.9092, "step": 236 }, { "epoch": 0.01, "grad_norm": 5.134116326125955, "learning_rate": 6.362416107382551e-06, "loss": 1.1773, "step": 237 }, { "epoch": 0.01, "grad_norm": 9.00806840581287, "learning_rate": 6.389261744966444e-06, "loss": 1.2386, "step": 238 }, { "epoch": 0.01, "grad_norm": 13.6349220086916, "learning_rate": 6.416107382550336e-06, "loss": 1.2972, "step": 239 }, { "epoch": 0.01, "grad_norm": 8.870003388516592, "learning_rate": 6.442953020134229e-06, "loss": 1.4886, "step": 240 }, { "epoch": 0.01, "grad_norm": 9.703293154317208, "learning_rate": 6.469798657718121e-06, "loss": 1.5812, "step": 241 }, { "epoch": 0.01, "grad_norm": 13.477762977854425, "learning_rate": 6.496644295302014e-06, "loss": 2.0074, "step": 242 }, { "epoch": 0.01, "grad_norm": 9.364091339313735, "learning_rate": 6.523489932885907e-06, "loss": 1.5767, "step": 243 }, { "epoch": 0.01, "grad_norm": 5.061778043450656, "learning_rate": 6.550335570469799e-06, "loss": 1.1556, "step": 244 }, { "epoch": 0.01, "grad_norm": 10.235527266761915, "learning_rate": 6.5771812080536925e-06, "loss": 1.5016, "step": 245 }, { "epoch": 0.01, "grad_norm": 11.76044715570968, "learning_rate": 6.604026845637584e-06, "loss": 1.6558, "step": 246 }, { "epoch": 0.01, "grad_norm": 13.211734022710534, "learning_rate": 6.630872483221477e-06, "loss": 2.0527, "step": 247 }, { "epoch": 0.01, "grad_norm": 11.4877471188776, "learning_rate": 6.65771812080537e-06, "loss": 1.7114, "step": 248 }, { "epoch": 0.01, "grad_norm": 13.952922265439943, "learning_rate": 6.684563758389262e-06, "loss": 1.4065, "step": 249 }, { "epoch": 0.01, "grad_norm": 15.752231909907449, "learning_rate": 6.711409395973155e-06, "loss": 1.7585, "step": 250 }, { "epoch": 0.01, "grad_norm": 7.964973395053712, "learning_rate": 6.738255033557047e-06, "loss": 1.758, "step": 251 }, { "epoch": 0.01, "grad_norm": 8.964169821449593, "learning_rate": 6.765100671140941e-06, "loss": 1.1376, "step": 252 }, { "epoch": 0.01, "grad_norm": 16.68719837794281, "learning_rate": 6.791946308724832e-06, "loss": 1.282, "step": 253 }, { "epoch": 0.01, "grad_norm": 9.346014460083014, "learning_rate": 6.818791946308725e-06, "loss": 1.4305, "step": 254 }, { "epoch": 0.01, "grad_norm": 12.393880235366709, "learning_rate": 6.845637583892618e-06, "loss": 1.4726, "step": 255 }, { "epoch": 0.01, "grad_norm": 14.574973700346327, "learning_rate": 6.87248322147651e-06, "loss": 1.5764, "step": 256 }, { "epoch": 0.01, "grad_norm": 9.5889622707587, "learning_rate": 6.8993288590604035e-06, "loss": 1.5886, "step": 257 }, { "epoch": 0.01, "grad_norm": 13.053009784693288, "learning_rate": 6.926174496644295e-06, "loss": 1.3649, "step": 258 }, { "epoch": 0.01, "grad_norm": 8.569822734540786, "learning_rate": 6.953020134228189e-06, "loss": 1.7326, "step": 259 }, { "epoch": 0.01, "grad_norm": 13.014095218785567, "learning_rate": 6.979865771812081e-06, "loss": 1.3926, "step": 260 }, { "epoch": 0.01, "grad_norm": 14.626409654072866, "learning_rate": 7.006711409395974e-06, "loss": 1.909, "step": 261 }, { "epoch": 0.01, "grad_norm": 12.946460199204028, "learning_rate": 7.0335570469798665e-06, "loss": 1.5285, "step": 262 }, { "epoch": 0.01, "grad_norm": 15.367394362597063, "learning_rate": 7.060402684563758e-06, "loss": 1.6252, "step": 263 }, { "epoch": 0.01, "grad_norm": 10.128127598162862, "learning_rate": 7.087248322147652e-06, "loss": 1.7979, "step": 264 }, { "epoch": 0.01, "grad_norm": 12.493707412532908, "learning_rate": 7.114093959731544e-06, "loss": 1.4242, "step": 265 }, { "epoch": 0.01, "grad_norm": 11.54427329173879, "learning_rate": 7.140939597315437e-06, "loss": 1.9175, "step": 266 }, { "epoch": 0.01, "grad_norm": 15.302456341955967, "learning_rate": 7.167785234899329e-06, "loss": 1.354, "step": 267 }, { "epoch": 0.01, "grad_norm": 9.965065298588401, "learning_rate": 7.194630872483223e-06, "loss": 1.8103, "step": 268 }, { "epoch": 0.01, "grad_norm": 10.773296535998654, "learning_rate": 7.221476510067115e-06, "loss": 1.2827, "step": 269 }, { "epoch": 0.01, "grad_norm": 10.965233599793144, "learning_rate": 7.248322147651007e-06, "loss": 1.6364, "step": 270 }, { "epoch": 0.01, "grad_norm": 11.18406341014796, "learning_rate": 7.2751677852349e-06, "loss": 1.2708, "step": 271 }, { "epoch": 0.01, "grad_norm": 7.635674408248562, "learning_rate": 7.302013422818792e-06, "loss": 1.4547, "step": 272 }, { "epoch": 0.01, "grad_norm": 11.059144232433749, "learning_rate": 7.328859060402686e-06, "loss": 1.4559, "step": 273 }, { "epoch": 0.01, "grad_norm": 10.207189326824452, "learning_rate": 7.3557046979865775e-06, "loss": 1.9118, "step": 274 }, { "epoch": 0.01, "grad_norm": 10.658580091519708, "learning_rate": 7.382550335570471e-06, "loss": 1.8017, "step": 275 }, { "epoch": 0.01, "grad_norm": 8.913271685246535, "learning_rate": 7.409395973154363e-06, "loss": 1.5528, "step": 276 }, { "epoch": 0.01, "grad_norm": 9.383906560364299, "learning_rate": 7.436241610738255e-06, "loss": 1.3903, "step": 277 }, { "epoch": 0.01, "grad_norm": 8.784505758233623, "learning_rate": 7.463087248322149e-06, "loss": 1.5208, "step": 278 }, { "epoch": 0.01, "grad_norm": 9.79469433278323, "learning_rate": 7.4899328859060405e-06, "loss": 1.6199, "step": 279 }, { "epoch": 0.01, "grad_norm": 8.56381120904719, "learning_rate": 7.516778523489934e-06, "loss": 1.3765, "step": 280 }, { "epoch": 0.01, "grad_norm": 14.094434540039053, "learning_rate": 7.543624161073826e-06, "loss": 1.4641, "step": 281 }, { "epoch": 0.01, "grad_norm": 12.34365961384772, "learning_rate": 7.570469798657719e-06, "loss": 2.1032, "step": 282 }, { "epoch": 0.01, "grad_norm": 9.035124596658779, "learning_rate": 7.597315436241612e-06, "loss": 1.5777, "step": 283 }, { "epoch": 0.01, "grad_norm": 8.997683545546973, "learning_rate": 7.624161073825504e-06, "loss": 1.6408, "step": 284 }, { "epoch": 0.01, "grad_norm": 10.065155618120626, "learning_rate": 7.651006711409396e-06, "loss": 1.4344, "step": 285 }, { "epoch": 0.01, "grad_norm": 8.361079722623494, "learning_rate": 7.677852348993289e-06, "loss": 1.4586, "step": 286 }, { "epoch": 0.01, "grad_norm": 8.695438794503787, "learning_rate": 7.704697986577183e-06, "loss": 1.6616, "step": 287 }, { "epoch": 0.01, "grad_norm": 11.299179387335887, "learning_rate": 7.731543624161074e-06, "loss": 1.785, "step": 288 }, { "epoch": 0.01, "grad_norm": 9.568308646025505, "learning_rate": 7.758389261744968e-06, "loss": 1.4455, "step": 289 }, { "epoch": 0.01, "grad_norm": 7.732738386479873, "learning_rate": 7.785234899328859e-06, "loss": 1.5233, "step": 290 }, { "epoch": 0.01, "grad_norm": 8.833384094395637, "learning_rate": 7.812080536912753e-06, "loss": 1.4055, "step": 291 }, { "epoch": 0.01, "grad_norm": 4.54529072920182, "learning_rate": 7.838926174496644e-06, "loss": 1.1067, "step": 292 }, { "epoch": 0.01, "grad_norm": 11.48750810634312, "learning_rate": 7.865771812080537e-06, "loss": 2.0899, "step": 293 }, { "epoch": 0.01, "grad_norm": 10.906559405220005, "learning_rate": 7.892617449664431e-06, "loss": 1.6334, "step": 294 }, { "epoch": 0.01, "grad_norm": 8.699846895401677, "learning_rate": 7.919463087248322e-06, "loss": 1.5975, "step": 295 }, { "epoch": 0.01, "grad_norm": 8.39621817297778, "learning_rate": 7.946308724832216e-06, "loss": 1.5089, "step": 296 }, { "epoch": 0.01, "grad_norm": 14.789277280291532, "learning_rate": 7.973154362416107e-06, "loss": 2.038, "step": 297 }, { "epoch": 0.01, "grad_norm": 6.295142628075283, "learning_rate": 8.000000000000001e-06, "loss": 1.2353, "step": 298 }, { "epoch": 0.01, "grad_norm": 8.807328434768754, "learning_rate": 8.026845637583894e-06, "loss": 1.5409, "step": 299 }, { "epoch": 0.01, "grad_norm": 12.473396773857809, "learning_rate": 8.053691275167785e-06, "loss": 1.6243, "step": 300 }, { "epoch": 0.01, "grad_norm": 9.904258622721057, "learning_rate": 8.080536912751679e-06, "loss": 2.04, "step": 301 }, { "epoch": 0.01, "grad_norm": 8.673097454710115, "learning_rate": 8.10738255033557e-06, "loss": 1.2377, "step": 302 }, { "epoch": 0.01, "grad_norm": 15.044854709390615, "learning_rate": 8.134228187919464e-06, "loss": 1.7277, "step": 303 }, { "epoch": 0.01, "grad_norm": 10.900336713035701, "learning_rate": 8.161073825503357e-06, "loss": 1.5998, "step": 304 }, { "epoch": 0.01, "grad_norm": 13.174609108609152, "learning_rate": 8.18791946308725e-06, "loss": 1.7433, "step": 305 }, { "epoch": 0.01, "grad_norm": 11.844639630043693, "learning_rate": 8.214765100671142e-06, "loss": 1.6152, "step": 306 }, { "epoch": 0.01, "grad_norm": 12.139723228716852, "learning_rate": 8.241610738255035e-06, "loss": 1.6333, "step": 307 }, { "epoch": 0.01, "grad_norm": 13.727332261052586, "learning_rate": 8.268456375838927e-06, "loss": 1.5606, "step": 308 }, { "epoch": 0.01, "grad_norm": 9.663628255979177, "learning_rate": 8.29530201342282e-06, "loss": 1.7879, "step": 309 }, { "epoch": 0.01, "grad_norm": 11.29019286777234, "learning_rate": 8.322147651006712e-06, "loss": 1.5749, "step": 310 }, { "epoch": 0.01, "grad_norm": 9.775092407800736, "learning_rate": 8.348993288590605e-06, "loss": 1.5944, "step": 311 }, { "epoch": 0.01, "grad_norm": 8.67006642374828, "learning_rate": 8.375838926174498e-06, "loss": 1.6951, "step": 312 }, { "epoch": 0.01, "grad_norm": 12.313097629781833, "learning_rate": 8.40268456375839e-06, "loss": 1.119, "step": 313 }, { "epoch": 0.01, "grad_norm": 8.740240080737678, "learning_rate": 8.429530201342283e-06, "loss": 1.5149, "step": 314 }, { "epoch": 0.01, "grad_norm": 10.340689028123274, "learning_rate": 8.456375838926175e-06, "loss": 1.9048, "step": 315 }, { "epoch": 0.01, "grad_norm": 9.273432871141575, "learning_rate": 8.483221476510068e-06, "loss": 1.6202, "step": 316 }, { "epoch": 0.01, "grad_norm": 9.644259856017797, "learning_rate": 8.51006711409396e-06, "loss": 1.6766, "step": 317 }, { "epoch": 0.01, "grad_norm": 10.657875111729533, "learning_rate": 8.536912751677853e-06, "loss": 1.9949, "step": 318 }, { "epoch": 0.01, "grad_norm": 5.246742026987623, "learning_rate": 8.563758389261746e-06, "loss": 1.1047, "step": 319 }, { "epoch": 0.01, "grad_norm": 9.18864242956882, "learning_rate": 8.590604026845638e-06, "loss": 1.2312, "step": 320 }, { "epoch": 0.01, "grad_norm": 9.090116088685697, "learning_rate": 8.617449664429531e-06, "loss": 1.5047, "step": 321 }, { "epoch": 0.01, "grad_norm": 11.564187038931198, "learning_rate": 8.644295302013423e-06, "loss": 1.4331, "step": 322 }, { "epoch": 0.01, "grad_norm": 9.496333819942201, "learning_rate": 8.671140939597316e-06, "loss": 1.5035, "step": 323 }, { "epoch": 0.01, "grad_norm": 9.272267597231734, "learning_rate": 8.697986577181209e-06, "loss": 1.2826, "step": 324 }, { "epoch": 0.01, "grad_norm": 11.576186066897877, "learning_rate": 8.724832214765101e-06, "loss": 1.7093, "step": 325 }, { "epoch": 0.01, "grad_norm": 7.864505869737435, "learning_rate": 8.751677852348994e-06, "loss": 1.3858, "step": 326 }, { "epoch": 0.01, "grad_norm": 9.182285214380252, "learning_rate": 8.778523489932886e-06, "loss": 1.2842, "step": 327 }, { "epoch": 0.01, "grad_norm": 11.14634103815586, "learning_rate": 8.805369127516779e-06, "loss": 1.1746, "step": 328 }, { "epoch": 0.01, "grad_norm": 8.688655254608909, "learning_rate": 8.832214765100672e-06, "loss": 1.5103, "step": 329 }, { "epoch": 0.01, "grad_norm": 7.554938803093232, "learning_rate": 8.859060402684566e-06, "loss": 1.2835, "step": 330 }, { "epoch": 0.01, "grad_norm": 15.255985803647404, "learning_rate": 8.885906040268457e-06, "loss": 1.9845, "step": 331 }, { "epoch": 0.01, "grad_norm": 9.181282883855086, "learning_rate": 8.91275167785235e-06, "loss": 1.661, "step": 332 }, { "epoch": 0.01, "grad_norm": 12.36346818139929, "learning_rate": 8.939597315436242e-06, "loss": 1.5244, "step": 333 }, { "epoch": 0.01, "grad_norm": 8.388853705491451, "learning_rate": 8.966442953020134e-06, "loss": 1.4875, "step": 334 }, { "epoch": 0.01, "grad_norm": 8.737028575065583, "learning_rate": 8.993288590604027e-06, "loss": 1.4506, "step": 335 }, { "epoch": 0.01, "grad_norm": 10.432560448611623, "learning_rate": 9.02013422818792e-06, "loss": 1.6521, "step": 336 }, { "epoch": 0.01, "grad_norm": 5.427772696312779, "learning_rate": 9.046979865771814e-06, "loss": 1.3052, "step": 337 }, { "epoch": 0.01, "grad_norm": 8.344212539496946, "learning_rate": 9.073825503355705e-06, "loss": 1.4408, "step": 338 }, { "epoch": 0.01, "grad_norm": 13.725747313321792, "learning_rate": 9.100671140939597e-06, "loss": 1.8887, "step": 339 }, { "epoch": 0.01, "grad_norm": 8.311809411600889, "learning_rate": 9.12751677852349e-06, "loss": 1.3505, "step": 340 }, { "epoch": 0.01, "grad_norm": 9.558711767623944, "learning_rate": 9.154362416107383e-06, "loss": 1.3347, "step": 341 }, { "epoch": 0.01, "grad_norm": 11.543979641674689, "learning_rate": 9.181208053691277e-06, "loss": 1.8786, "step": 342 }, { "epoch": 0.01, "grad_norm": 11.767906823118505, "learning_rate": 9.208053691275168e-06, "loss": 1.6337, "step": 343 }, { "epoch": 0.01, "grad_norm": 8.349390558250093, "learning_rate": 9.234899328859062e-06, "loss": 1.2518, "step": 344 }, { "epoch": 0.01, "grad_norm": 14.295244084871817, "learning_rate": 9.261744966442953e-06, "loss": 1.646, "step": 345 }, { "epoch": 0.01, "grad_norm": 9.067964926192612, "learning_rate": 9.288590604026846e-06, "loss": 1.5712, "step": 346 }, { "epoch": 0.01, "grad_norm": 11.988413240516904, "learning_rate": 9.31543624161074e-06, "loss": 1.8023, "step": 347 }, { "epoch": 0.01, "grad_norm": 7.725082292402449, "learning_rate": 9.34228187919463e-06, "loss": 1.3904, "step": 348 }, { "epoch": 0.01, "grad_norm": 13.538385697186461, "learning_rate": 9.369127516778525e-06, "loss": 1.2176, "step": 349 }, { "epoch": 0.01, "grad_norm": 8.954462854500267, "learning_rate": 9.395973154362416e-06, "loss": 1.3867, "step": 350 }, { "epoch": 0.01, "grad_norm": 4.69562591117132, "learning_rate": 9.42281879194631e-06, "loss": 1.2014, "step": 351 }, { "epoch": 0.01, "grad_norm": 10.907438289910699, "learning_rate": 9.449664429530203e-06, "loss": 1.3065, "step": 352 }, { "epoch": 0.01, "grad_norm": 13.331833282275065, "learning_rate": 9.476510067114095e-06, "loss": 1.8804, "step": 353 }, { "epoch": 0.01, "grad_norm": 8.335076987176812, "learning_rate": 9.503355704697988e-06, "loss": 1.4837, "step": 354 }, { "epoch": 0.01, "grad_norm": 8.267759303994664, "learning_rate": 9.530201342281879e-06, "loss": 1.7094, "step": 355 }, { "epoch": 0.01, "grad_norm": 10.952949430161986, "learning_rate": 9.557046979865773e-06, "loss": 1.4585, "step": 356 }, { "epoch": 0.01, "grad_norm": 8.981061721370573, "learning_rate": 9.583892617449666e-06, "loss": 1.6428, "step": 357 }, { "epoch": 0.01, "grad_norm": 11.750352290003713, "learning_rate": 9.610738255033558e-06, "loss": 1.4157, "step": 358 }, { "epoch": 0.01, "grad_norm": 8.537089893461811, "learning_rate": 9.637583892617451e-06, "loss": 1.5694, "step": 359 }, { "epoch": 0.01, "grad_norm": 7.721233187634588, "learning_rate": 9.664429530201343e-06, "loss": 1.428, "step": 360 }, { "epoch": 0.01, "grad_norm": 8.894043355202989, "learning_rate": 9.691275167785236e-06, "loss": 1.2399, "step": 361 }, { "epoch": 0.01, "grad_norm": 9.89419547831076, "learning_rate": 9.718120805369129e-06, "loss": 1.4593, "step": 362 }, { "epoch": 0.01, "grad_norm": 10.444037192684798, "learning_rate": 9.744966442953021e-06, "loss": 1.84, "step": 363 }, { "epoch": 0.01, "grad_norm": 7.6825192853110265, "learning_rate": 9.771812080536914e-06, "loss": 1.291, "step": 364 }, { "epoch": 0.01, "grad_norm": 7.913625663843959, "learning_rate": 9.798657718120806e-06, "loss": 1.4084, "step": 365 }, { "epoch": 0.01, "grad_norm": 11.955604322140983, "learning_rate": 9.825503355704699e-06, "loss": 1.7061, "step": 366 }, { "epoch": 0.01, "grad_norm": 11.62089959227162, "learning_rate": 9.852348993288592e-06, "loss": 1.7951, "step": 367 }, { "epoch": 0.01, "grad_norm": 9.29940556199203, "learning_rate": 9.879194630872484e-06, "loss": 1.644, "step": 368 }, { "epoch": 0.01, "grad_norm": 8.888406606746397, "learning_rate": 9.906040268456377e-06, "loss": 1.4007, "step": 369 }, { "epoch": 0.01, "grad_norm": 9.324435974424938, "learning_rate": 9.93288590604027e-06, "loss": 1.5139, "step": 370 }, { "epoch": 0.01, "grad_norm": 15.092469886188237, "learning_rate": 9.959731543624162e-06, "loss": 1.658, "step": 371 }, { "epoch": 0.01, "grad_norm": 9.798239784673768, "learning_rate": 9.986577181208055e-06, "loss": 1.5002, "step": 372 }, { "epoch": 0.02, "grad_norm": 11.174780869324398, "learning_rate": 1.0013422818791947e-05, "loss": 1.9242, "step": 373 }, { "epoch": 0.02, "grad_norm": 12.21923255015902, "learning_rate": 1.004026845637584e-05, "loss": 1.3241, "step": 374 }, { "epoch": 0.02, "grad_norm": 8.46030760835296, "learning_rate": 1.0067114093959734e-05, "loss": 1.1467, "step": 375 }, { "epoch": 0.02, "grad_norm": 10.104507369826752, "learning_rate": 1.0093959731543625e-05, "loss": 1.4861, "step": 376 }, { "epoch": 0.02, "grad_norm": 7.82452396622584, "learning_rate": 1.0120805369127517e-05, "loss": 1.2164, "step": 377 }, { "epoch": 0.02, "grad_norm": 9.237099080996998, "learning_rate": 1.014765100671141e-05, "loss": 1.7492, "step": 378 }, { "epoch": 0.02, "grad_norm": 11.650935437381616, "learning_rate": 1.0174496644295303e-05, "loss": 1.1814, "step": 379 }, { "epoch": 0.02, "grad_norm": 8.035852084864297, "learning_rate": 1.0201342281879197e-05, "loss": 1.6595, "step": 380 }, { "epoch": 0.02, "grad_norm": 11.41862935575625, "learning_rate": 1.0228187919463088e-05, "loss": 1.4773, "step": 381 }, { "epoch": 0.02, "grad_norm": 11.095710098628684, "learning_rate": 1.025503355704698e-05, "loss": 1.7986, "step": 382 }, { "epoch": 0.02, "grad_norm": 9.909356812828454, "learning_rate": 1.0281879194630873e-05, "loss": 1.4372, "step": 383 }, { "epoch": 0.02, "grad_norm": 9.819748827331054, "learning_rate": 1.0308724832214767e-05, "loss": 1.5856, "step": 384 }, { "epoch": 0.02, "grad_norm": 15.327594488710123, "learning_rate": 1.033557046979866e-05, "loss": 1.826, "step": 385 }, { "epoch": 0.02, "grad_norm": 9.13503962517979, "learning_rate": 1.036241610738255e-05, "loss": 1.6619, "step": 386 }, { "epoch": 0.02, "grad_norm": 8.377050703898767, "learning_rate": 1.0389261744966443e-05, "loss": 1.2555, "step": 387 }, { "epoch": 0.02, "grad_norm": 13.072628247604198, "learning_rate": 1.0416107382550336e-05, "loss": 1.5963, "step": 388 }, { "epoch": 0.02, "grad_norm": 8.297840978482256, "learning_rate": 1.044295302013423e-05, "loss": 1.3466, "step": 389 }, { "epoch": 0.02, "grad_norm": 8.414736317088886, "learning_rate": 1.0469798657718123e-05, "loss": 1.5266, "step": 390 }, { "epoch": 0.02, "grad_norm": 10.46327313730743, "learning_rate": 1.0496644295302014e-05, "loss": 1.3284, "step": 391 }, { "epoch": 0.02, "grad_norm": 7.251013017465008, "learning_rate": 1.0523489932885906e-05, "loss": 1.2931, "step": 392 }, { "epoch": 0.02, "grad_norm": 8.578294615207088, "learning_rate": 1.0550335570469799e-05, "loss": 1.6163, "step": 393 }, { "epoch": 0.02, "grad_norm": 6.911961559019799, "learning_rate": 1.0577181208053693e-05, "loss": 1.3804, "step": 394 }, { "epoch": 0.02, "grad_norm": 11.316430892253736, "learning_rate": 1.0604026845637586e-05, "loss": 1.3293, "step": 395 }, { "epoch": 0.02, "grad_norm": 7.383118553593106, "learning_rate": 1.0630872483221477e-05, "loss": 1.3164, "step": 396 }, { "epoch": 0.02, "grad_norm": 8.844864708845973, "learning_rate": 1.065771812080537e-05, "loss": 1.4031, "step": 397 }, { "epoch": 0.02, "grad_norm": 11.950385801454448, "learning_rate": 1.0684563758389264e-05, "loss": 1.4896, "step": 398 }, { "epoch": 0.02, "grad_norm": 8.393193707835785, "learning_rate": 1.0711409395973156e-05, "loss": 1.3456, "step": 399 }, { "epoch": 0.02, "grad_norm": 8.909450442572261, "learning_rate": 1.0738255033557049e-05, "loss": 1.4749, "step": 400 }, { "epoch": 0.02, "grad_norm": 7.082460375776544, "learning_rate": 1.076510067114094e-05, "loss": 1.284, "step": 401 }, { "epoch": 0.02, "grad_norm": 10.675805571191248, "learning_rate": 1.0791946308724832e-05, "loss": 1.5959, "step": 402 }, { "epoch": 0.02, "grad_norm": 10.12779406435023, "learning_rate": 1.0818791946308726e-05, "loss": 1.8493, "step": 403 }, { "epoch": 0.02, "grad_norm": 10.335440232238135, "learning_rate": 1.0845637583892619e-05, "loss": 1.6222, "step": 404 }, { "epoch": 0.02, "grad_norm": 14.573394308412109, "learning_rate": 1.0872483221476512e-05, "loss": 1.4336, "step": 405 }, { "epoch": 0.02, "grad_norm": 7.977355478954481, "learning_rate": 1.0899328859060403e-05, "loss": 1.5827, "step": 406 }, { "epoch": 0.02, "grad_norm": 9.696029088146323, "learning_rate": 1.0926174496644297e-05, "loss": 1.924, "step": 407 }, { "epoch": 0.02, "grad_norm": 9.201859611500188, "learning_rate": 1.095302013422819e-05, "loss": 1.5042, "step": 408 }, { "epoch": 0.02, "grad_norm": 8.00011203211513, "learning_rate": 1.0979865771812082e-05, "loss": 1.3995, "step": 409 }, { "epoch": 0.02, "grad_norm": 9.551486110298208, "learning_rate": 1.1006711409395975e-05, "loss": 1.6962, "step": 410 }, { "epoch": 0.02, "grad_norm": 12.675182744412622, "learning_rate": 1.1033557046979865e-05, "loss": 1.0448, "step": 411 }, { "epoch": 0.02, "grad_norm": 10.097196057725439, "learning_rate": 1.106040268456376e-05, "loss": 1.4764, "step": 412 }, { "epoch": 0.02, "grad_norm": 8.32493336522974, "learning_rate": 1.1087248322147652e-05, "loss": 1.6457, "step": 413 }, { "epoch": 0.02, "grad_norm": 8.253346597613184, "learning_rate": 1.1114093959731545e-05, "loss": 1.2916, "step": 414 }, { "epoch": 0.02, "grad_norm": 9.323028106691497, "learning_rate": 1.1140939597315436e-05, "loss": 1.3274, "step": 415 }, { "epoch": 0.02, "grad_norm": 10.09821426153491, "learning_rate": 1.1167785234899328e-05, "loss": 1.6463, "step": 416 }, { "epoch": 0.02, "grad_norm": 8.277224133702717, "learning_rate": 1.1194630872483223e-05, "loss": 1.4417, "step": 417 }, { "epoch": 0.02, "grad_norm": 9.220819649715269, "learning_rate": 1.1221476510067115e-05, "loss": 1.8499, "step": 418 }, { "epoch": 0.02, "grad_norm": 9.50967380209388, "learning_rate": 1.1248322147651008e-05, "loss": 1.6063, "step": 419 }, { "epoch": 0.02, "grad_norm": 8.494909302554124, "learning_rate": 1.1275167785234899e-05, "loss": 1.1973, "step": 420 }, { "epoch": 0.02, "grad_norm": 8.268122149772186, "learning_rate": 1.1302013422818795e-05, "loss": 1.5379, "step": 421 }, { "epoch": 0.02, "grad_norm": 10.067738434152155, "learning_rate": 1.1328859060402686e-05, "loss": 1.4953, "step": 422 }, { "epoch": 0.02, "grad_norm": 7.427611759674664, "learning_rate": 1.1355704697986578e-05, "loss": 1.5466, "step": 423 }, { "epoch": 0.02, "grad_norm": 11.31902039398064, "learning_rate": 1.138255033557047e-05, "loss": 1.4364, "step": 424 }, { "epoch": 0.02, "grad_norm": 8.99627372078485, "learning_rate": 1.1409395973154362e-05, "loss": 1.5627, "step": 425 }, { "epoch": 0.02, "grad_norm": 10.604063946254929, "learning_rate": 1.1436241610738256e-05, "loss": 1.8446, "step": 426 }, { "epoch": 0.02, "grad_norm": 11.569096113951334, "learning_rate": 1.1463087248322149e-05, "loss": 1.3505, "step": 427 }, { "epoch": 0.02, "grad_norm": 8.485028254891345, "learning_rate": 1.1489932885906041e-05, "loss": 1.2645, "step": 428 }, { "epoch": 0.02, "grad_norm": 6.653880927228366, "learning_rate": 1.1516778523489934e-05, "loss": 1.3512, "step": 429 }, { "epoch": 0.02, "grad_norm": 9.573778927063621, "learning_rate": 1.1543624161073828e-05, "loss": 1.4285, "step": 430 }, { "epoch": 0.02, "grad_norm": 14.414640141107622, "learning_rate": 1.1570469798657719e-05, "loss": 1.4266, "step": 431 }, { "epoch": 0.02, "grad_norm": 8.229242957248958, "learning_rate": 1.1597315436241611e-05, "loss": 1.6794, "step": 432 }, { "epoch": 0.02, "grad_norm": 8.421603714600863, "learning_rate": 1.1624161073825504e-05, "loss": 1.4813, "step": 433 }, { "epoch": 0.02, "grad_norm": 9.865355602236088, "learning_rate": 1.1651006711409397e-05, "loss": 1.4764, "step": 434 }, { "epoch": 0.02, "grad_norm": 14.035964443677242, "learning_rate": 1.1677852348993291e-05, "loss": 2.1341, "step": 435 }, { "epoch": 0.02, "grad_norm": 8.2320562576311, "learning_rate": 1.1704697986577182e-05, "loss": 1.6651, "step": 436 }, { "epoch": 0.02, "grad_norm": 9.252894253733398, "learning_rate": 1.1731543624161074e-05, "loss": 1.5525, "step": 437 }, { "epoch": 0.02, "grad_norm": 8.839349451519443, "learning_rate": 1.1758389261744967e-05, "loss": 1.3654, "step": 438 }, { "epoch": 0.02, "grad_norm": 6.079801548707691, "learning_rate": 1.178523489932886e-05, "loss": 1.1164, "step": 439 }, { "epoch": 0.02, "grad_norm": 10.385378902213176, "learning_rate": 1.1812080536912754e-05, "loss": 1.5868, "step": 440 }, { "epoch": 0.02, "grad_norm": 9.999054327947057, "learning_rate": 1.1838926174496645e-05, "loss": 1.5177, "step": 441 }, { "epoch": 0.02, "grad_norm": 7.47426552667939, "learning_rate": 1.1865771812080537e-05, "loss": 1.3416, "step": 442 }, { "epoch": 0.02, "grad_norm": 8.670943549709241, "learning_rate": 1.189261744966443e-05, "loss": 1.479, "step": 443 }, { "epoch": 0.02, "grad_norm": 9.154212116148663, "learning_rate": 1.1919463087248324e-05, "loss": 1.3498, "step": 444 }, { "epoch": 0.02, "grad_norm": 9.532682033293778, "learning_rate": 1.1946308724832217e-05, "loss": 1.3744, "step": 445 }, { "epoch": 0.02, "grad_norm": 8.34991911053489, "learning_rate": 1.1973154362416108e-05, "loss": 1.4076, "step": 446 }, { "epoch": 0.02, "grad_norm": 7.438665318262275, "learning_rate": 1.2e-05, "loss": 1.4545, "step": 447 }, { "epoch": 0.02, "grad_norm": 8.316898371045484, "learning_rate": 1.2026845637583893e-05, "loss": 1.58, "step": 448 }, { "epoch": 0.02, "grad_norm": 14.145984928059145, "learning_rate": 1.2053691275167787e-05, "loss": 1.6299, "step": 449 }, { "epoch": 0.02, "grad_norm": 9.17763415555075, "learning_rate": 1.208053691275168e-05, "loss": 1.6198, "step": 450 }, { "epoch": 0.02, "grad_norm": 6.8501911393393575, "learning_rate": 1.210738255033557e-05, "loss": 1.3471, "step": 451 }, { "epoch": 0.02, "grad_norm": 9.710169940848424, "learning_rate": 1.2134228187919463e-05, "loss": 1.6027, "step": 452 }, { "epoch": 0.02, "grad_norm": 9.480272569957146, "learning_rate": 1.2161073825503358e-05, "loss": 1.6442, "step": 453 }, { "epoch": 0.02, "grad_norm": 5.061662406799057, "learning_rate": 1.218791946308725e-05, "loss": 1.1247, "step": 454 }, { "epoch": 0.02, "grad_norm": 9.796630545625053, "learning_rate": 1.2214765100671143e-05, "loss": 1.6916, "step": 455 }, { "epoch": 0.02, "grad_norm": 6.916702945209782, "learning_rate": 1.2241610738255034e-05, "loss": 1.4768, "step": 456 }, { "epoch": 0.02, "grad_norm": 8.301497312682491, "learning_rate": 1.2268456375838926e-05, "loss": 1.5865, "step": 457 }, { "epoch": 0.02, "grad_norm": 6.757332208179617, "learning_rate": 1.229530201342282e-05, "loss": 1.1263, "step": 458 }, { "epoch": 0.02, "grad_norm": 7.172429477316673, "learning_rate": 1.2322147651006713e-05, "loss": 1.5916, "step": 459 }, { "epoch": 0.02, "grad_norm": 8.764459373362694, "learning_rate": 1.2348993288590606e-05, "loss": 1.3644, "step": 460 }, { "epoch": 0.02, "grad_norm": 7.793327597742426, "learning_rate": 1.2375838926174497e-05, "loss": 1.6128, "step": 461 }, { "epoch": 0.02, "grad_norm": 9.153653293119383, "learning_rate": 1.2402684563758389e-05, "loss": 1.3732, "step": 462 }, { "epoch": 0.02, "grad_norm": 6.982019540661202, "learning_rate": 1.2429530201342283e-05, "loss": 1.4621, "step": 463 }, { "epoch": 0.02, "grad_norm": 11.745604520467719, "learning_rate": 1.2456375838926176e-05, "loss": 1.5017, "step": 464 }, { "epoch": 0.02, "grad_norm": 9.907962121888232, "learning_rate": 1.2483221476510069e-05, "loss": 1.4954, "step": 465 }, { "epoch": 0.02, "grad_norm": 7.025972186983962, "learning_rate": 1.251006711409396e-05, "loss": 1.3027, "step": 466 }, { "epoch": 0.02, "grad_norm": 5.148161832078976, "learning_rate": 1.2536912751677854e-05, "loss": 1.1679, "step": 467 }, { "epoch": 0.02, "grad_norm": 6.191485177006818, "learning_rate": 1.2563758389261746e-05, "loss": 1.3467, "step": 468 }, { "epoch": 0.02, "grad_norm": 7.38838657551247, "learning_rate": 1.2590604026845639e-05, "loss": 1.3592, "step": 469 }, { "epoch": 0.02, "grad_norm": 11.428320566074404, "learning_rate": 1.2617449664429532e-05, "loss": 1.9306, "step": 470 }, { "epoch": 0.02, "grad_norm": 8.670737252850074, "learning_rate": 1.2644295302013422e-05, "loss": 1.4233, "step": 471 }, { "epoch": 0.02, "grad_norm": 12.375400769249934, "learning_rate": 1.2671140939597317e-05, "loss": 1.3632, "step": 472 }, { "epoch": 0.02, "grad_norm": 4.113950100576592, "learning_rate": 1.269798657718121e-05, "loss": 0.9677, "step": 473 }, { "epoch": 0.02, "grad_norm": 7.101881978912405, "learning_rate": 1.2724832214765102e-05, "loss": 1.3013, "step": 474 }, { "epoch": 0.02, "grad_norm": 9.566489460814012, "learning_rate": 1.2751677852348994e-05, "loss": 1.5764, "step": 475 }, { "epoch": 0.02, "grad_norm": 11.81306177129485, "learning_rate": 1.2778523489932889e-05, "loss": 1.5302, "step": 476 }, { "epoch": 0.02, "grad_norm": 8.103011929112583, "learning_rate": 1.280536912751678e-05, "loss": 1.5293, "step": 477 }, { "epoch": 0.02, "grad_norm": 9.034707274716586, "learning_rate": 1.2832214765100672e-05, "loss": 1.605, "step": 478 }, { "epoch": 0.02, "grad_norm": 10.78190053653468, "learning_rate": 1.2859060402684565e-05, "loss": 1.8285, "step": 479 }, { "epoch": 0.02, "grad_norm": 7.832945997807883, "learning_rate": 1.2885906040268457e-05, "loss": 1.2634, "step": 480 }, { "epoch": 0.02, "grad_norm": 7.28774557452969, "learning_rate": 1.2912751677852352e-05, "loss": 1.4308, "step": 481 }, { "epoch": 0.02, "grad_norm": 7.778102054842449, "learning_rate": 1.2939597315436243e-05, "loss": 1.4375, "step": 482 }, { "epoch": 0.02, "grad_norm": 11.007807198159542, "learning_rate": 1.2966442953020135e-05, "loss": 1.5594, "step": 483 }, { "epoch": 0.02, "grad_norm": 8.555946738014196, "learning_rate": 1.2993288590604028e-05, "loss": 1.6496, "step": 484 }, { "epoch": 0.02, "grad_norm": 8.648784491062047, "learning_rate": 1.302013422818792e-05, "loss": 1.7619, "step": 485 }, { "epoch": 0.02, "grad_norm": 6.805502468961682, "learning_rate": 1.3046979865771815e-05, "loss": 1.2919, "step": 486 }, { "epoch": 0.02, "grad_norm": 10.248174788555783, "learning_rate": 1.3073825503355706e-05, "loss": 1.1833, "step": 487 }, { "epoch": 0.02, "grad_norm": 13.810163711229256, "learning_rate": 1.3100671140939598e-05, "loss": 1.5023, "step": 488 }, { "epoch": 0.02, "grad_norm": 9.037078629041334, "learning_rate": 1.312751677852349e-05, "loss": 1.6849, "step": 489 }, { "epoch": 0.02, "grad_norm": 7.665302782234119, "learning_rate": 1.3154362416107385e-05, "loss": 1.1481, "step": 490 }, { "epoch": 0.02, "grad_norm": 7.062356207185401, "learning_rate": 1.3181208053691278e-05, "loss": 1.2876, "step": 491 }, { "epoch": 0.02, "grad_norm": 9.486829501735757, "learning_rate": 1.3208053691275168e-05, "loss": 1.2023, "step": 492 }, { "epoch": 0.02, "grad_norm": 8.019840304216956, "learning_rate": 1.3234899328859061e-05, "loss": 1.5196, "step": 493 }, { "epoch": 0.02, "grad_norm": 8.336089736601796, "learning_rate": 1.3261744966442954e-05, "loss": 1.5339, "step": 494 }, { "epoch": 0.02, "grad_norm": 10.415309981188626, "learning_rate": 1.3288590604026848e-05, "loss": 1.3937, "step": 495 }, { "epoch": 0.02, "grad_norm": 7.283136231413104, "learning_rate": 1.331543624161074e-05, "loss": 1.5499, "step": 496 }, { "epoch": 0.02, "grad_norm": 8.072416553588251, "learning_rate": 1.3342281879194631e-05, "loss": 1.681, "step": 497 }, { "epoch": 0.02, "grad_norm": 7.856148105266359, "learning_rate": 1.3369127516778524e-05, "loss": 1.2094, "step": 498 }, { "epoch": 0.02, "grad_norm": 8.16530480403354, "learning_rate": 1.3395973154362418e-05, "loss": 1.4894, "step": 499 }, { "epoch": 0.02, "grad_norm": 8.695566334632854, "learning_rate": 1.342281879194631e-05, "loss": 1.1902, "step": 500 }, { "epoch": 0.02, "grad_norm": 8.94896777394868, "learning_rate": 1.3449664429530202e-05, "loss": 1.582, "step": 501 }, { "epoch": 0.02, "grad_norm": 10.416645425890673, "learning_rate": 1.3476510067114094e-05, "loss": 1.7216, "step": 502 }, { "epoch": 0.02, "grad_norm": 7.111735282585837, "learning_rate": 1.3503355704697987e-05, "loss": 1.5494, "step": 503 }, { "epoch": 0.02, "grad_norm": 8.0788022868739, "learning_rate": 1.3530201342281881e-05, "loss": 1.395, "step": 504 }, { "epoch": 0.02, "grad_norm": 8.67167001885283, "learning_rate": 1.3557046979865774e-05, "loss": 1.3778, "step": 505 }, { "epoch": 0.02, "grad_norm": 6.408622900237423, "learning_rate": 1.3583892617449665e-05, "loss": 1.4904, "step": 506 }, { "epoch": 0.02, "grad_norm": 7.723404671606452, "learning_rate": 1.3610738255033557e-05, "loss": 1.4276, "step": 507 }, { "epoch": 0.02, "grad_norm": 7.538371607976068, "learning_rate": 1.363758389261745e-05, "loss": 1.254, "step": 508 }, { "epoch": 0.02, "grad_norm": 8.152355195574536, "learning_rate": 1.3664429530201344e-05, "loss": 1.3815, "step": 509 }, { "epoch": 0.02, "grad_norm": 8.075484270640901, "learning_rate": 1.3691275167785237e-05, "loss": 1.5075, "step": 510 }, { "epoch": 0.02, "grad_norm": 6.929160859051723, "learning_rate": 1.3718120805369128e-05, "loss": 1.5102, "step": 511 }, { "epoch": 0.02, "grad_norm": 7.521422411126925, "learning_rate": 1.374496644295302e-05, "loss": 1.3788, "step": 512 }, { "epoch": 0.02, "grad_norm": 9.662761368199288, "learning_rate": 1.3771812080536914e-05, "loss": 1.1429, "step": 513 }, { "epoch": 0.02, "grad_norm": 9.568898730051162, "learning_rate": 1.3798657718120807e-05, "loss": 1.4793, "step": 514 }, { "epoch": 0.02, "grad_norm": 7.31628783591004, "learning_rate": 1.38255033557047e-05, "loss": 1.4851, "step": 515 }, { "epoch": 0.02, "grad_norm": 13.932900830669347, "learning_rate": 1.385234899328859e-05, "loss": 1.5712, "step": 516 }, { "epoch": 0.02, "grad_norm": 7.249113168548576, "learning_rate": 1.3879194630872483e-05, "loss": 1.3454, "step": 517 }, { "epoch": 0.02, "grad_norm": 9.881798747617315, "learning_rate": 1.3906040268456377e-05, "loss": 1.8441, "step": 518 }, { "epoch": 0.02, "grad_norm": 20.155168464491197, "learning_rate": 1.393288590604027e-05, "loss": 1.5913, "step": 519 }, { "epoch": 0.02, "grad_norm": 10.453412654675237, "learning_rate": 1.3959731543624163e-05, "loss": 1.5557, "step": 520 }, { "epoch": 0.02, "grad_norm": 15.041093106879847, "learning_rate": 1.3986577181208053e-05, "loss": 1.156, "step": 521 }, { "epoch": 0.02, "grad_norm": 5.137265776600472, "learning_rate": 1.4013422818791948e-05, "loss": 1.1797, "step": 522 }, { "epoch": 0.02, "grad_norm": 7.870416818990308, "learning_rate": 1.404026845637584e-05, "loss": 1.3991, "step": 523 }, { "epoch": 0.02, "grad_norm": 12.5708998645491, "learning_rate": 1.4067114093959733e-05, "loss": 1.4046, "step": 524 }, { "epoch": 0.02, "grad_norm": 7.745510809295142, "learning_rate": 1.4093959731543626e-05, "loss": 1.3977, "step": 525 }, { "epoch": 0.02, "grad_norm": 7.943868135824137, "learning_rate": 1.4120805369127516e-05, "loss": 1.3955, "step": 526 }, { "epoch": 0.02, "grad_norm": 5.996875242097958, "learning_rate": 1.414765100671141e-05, "loss": 1.2694, "step": 527 }, { "epoch": 0.02, "grad_norm": 10.496375775379063, "learning_rate": 1.4174496644295303e-05, "loss": 1.4434, "step": 528 }, { "epoch": 0.02, "grad_norm": 7.317487581743314, "learning_rate": 1.4201342281879196e-05, "loss": 1.3041, "step": 529 }, { "epoch": 0.02, "grad_norm": 7.2485209867741345, "learning_rate": 1.4228187919463088e-05, "loss": 1.4963, "step": 530 }, { "epoch": 0.02, "grad_norm": 7.560976821678934, "learning_rate": 1.425503355704698e-05, "loss": 1.5704, "step": 531 }, { "epoch": 0.02, "grad_norm": 6.811254700919813, "learning_rate": 1.4281879194630874e-05, "loss": 1.5331, "step": 532 }, { "epoch": 0.02, "grad_norm": 15.465413006052787, "learning_rate": 1.4308724832214766e-05, "loss": 1.5635, "step": 533 }, { "epoch": 0.02, "grad_norm": 7.101774379901071, "learning_rate": 1.4335570469798659e-05, "loss": 1.1154, "step": 534 }, { "epoch": 0.02, "grad_norm": 11.159371358502538, "learning_rate": 1.4362416107382551e-05, "loss": 1.4458, "step": 535 }, { "epoch": 0.02, "grad_norm": 10.174260961502805, "learning_rate": 1.4389261744966446e-05, "loss": 1.5627, "step": 536 }, { "epoch": 0.02, "grad_norm": 7.545421669624485, "learning_rate": 1.4416107382550337e-05, "loss": 1.2197, "step": 537 }, { "epoch": 0.02, "grad_norm": 9.708844544090837, "learning_rate": 1.444295302013423e-05, "loss": 1.5157, "step": 538 }, { "epoch": 0.02, "grad_norm": 8.17160886194052, "learning_rate": 1.4469798657718122e-05, "loss": 1.3182, "step": 539 }, { "epoch": 0.02, "grad_norm": 8.419679652994198, "learning_rate": 1.4496644295302014e-05, "loss": 1.3181, "step": 540 }, { "epoch": 0.02, "grad_norm": 9.125527777852835, "learning_rate": 1.4523489932885909e-05, "loss": 1.3603, "step": 541 }, { "epoch": 0.02, "grad_norm": 7.6874506097130375, "learning_rate": 1.45503355704698e-05, "loss": 1.0992, "step": 542 }, { "epoch": 0.02, "grad_norm": 11.270491378056562, "learning_rate": 1.4577181208053692e-05, "loss": 1.2715, "step": 543 }, { "epoch": 0.02, "grad_norm": 7.583169553596723, "learning_rate": 1.4604026845637585e-05, "loss": 1.2255, "step": 544 }, { "epoch": 0.02, "grad_norm": 8.406054115096355, "learning_rate": 1.4630872483221479e-05, "loss": 1.2699, "step": 545 }, { "epoch": 0.02, "grad_norm": 7.353522345368765, "learning_rate": 1.4657718120805372e-05, "loss": 1.4357, "step": 546 }, { "epoch": 0.02, "grad_norm": 6.511655148061715, "learning_rate": 1.4684563758389262e-05, "loss": 1.3119, "step": 547 }, { "epoch": 0.02, "grad_norm": 8.851821039613545, "learning_rate": 1.4711409395973155e-05, "loss": 1.4612, "step": 548 }, { "epoch": 0.02, "grad_norm": 7.47303434090239, "learning_rate": 1.4738255033557048e-05, "loss": 1.3566, "step": 549 }, { "epoch": 0.02, "grad_norm": 6.510422595093781, "learning_rate": 1.4765100671140942e-05, "loss": 1.5438, "step": 550 }, { "epoch": 0.02, "grad_norm": 11.739921214927813, "learning_rate": 1.4791946308724835e-05, "loss": 1.8695, "step": 551 }, { "epoch": 0.02, "grad_norm": 7.119992350323915, "learning_rate": 1.4818791946308725e-05, "loss": 1.1185, "step": 552 }, { "epoch": 0.02, "grad_norm": 7.520701360288286, "learning_rate": 1.4845637583892618e-05, "loss": 1.6391, "step": 553 }, { "epoch": 0.02, "grad_norm": 7.383802738815257, "learning_rate": 1.487248322147651e-05, "loss": 1.328, "step": 554 }, { "epoch": 0.02, "grad_norm": 6.62589206342016, "learning_rate": 1.4899328859060405e-05, "loss": 1.2322, "step": 555 }, { "epoch": 0.02, "grad_norm": 9.30216351796237, "learning_rate": 1.4926174496644297e-05, "loss": 1.2199, "step": 556 }, { "epoch": 0.02, "grad_norm": 8.228721412554105, "learning_rate": 1.4953020134228188e-05, "loss": 1.2169, "step": 557 }, { "epoch": 0.02, "grad_norm": 7.742477711805231, "learning_rate": 1.4979865771812081e-05, "loss": 1.4394, "step": 558 }, { "epoch": 0.02, "grad_norm": 6.937906435587678, "learning_rate": 1.5006711409395975e-05, "loss": 1.2919, "step": 559 }, { "epoch": 0.02, "grad_norm": 7.4501171918562985, "learning_rate": 1.5033557046979868e-05, "loss": 1.424, "step": 560 }, { "epoch": 0.02, "grad_norm": 8.316736783331523, "learning_rate": 1.506040268456376e-05, "loss": 1.565, "step": 561 }, { "epoch": 0.02, "grad_norm": 7.749884676260847, "learning_rate": 1.5087248322147651e-05, "loss": 1.2053, "step": 562 }, { "epoch": 0.02, "grad_norm": 9.81823142785112, "learning_rate": 1.5114093959731544e-05, "loss": 1.5824, "step": 563 }, { "epoch": 0.02, "grad_norm": 7.7225795080504875, "learning_rate": 1.5140939597315438e-05, "loss": 1.3721, "step": 564 }, { "epoch": 0.02, "grad_norm": 4.3485080045617135, "learning_rate": 1.516778523489933e-05, "loss": 1.0952, "step": 565 }, { "epoch": 0.02, "grad_norm": 11.37887808090431, "learning_rate": 1.5194630872483223e-05, "loss": 1.3754, "step": 566 }, { "epoch": 0.02, "grad_norm": 9.87861084672472, "learning_rate": 1.5221476510067114e-05, "loss": 1.7421, "step": 567 }, { "epoch": 0.02, "grad_norm": 9.778248564677485, "learning_rate": 1.5248322147651009e-05, "loss": 1.711, "step": 568 }, { "epoch": 0.02, "grad_norm": 8.62918022335809, "learning_rate": 1.5275167785234903e-05, "loss": 1.4417, "step": 569 }, { "epoch": 0.02, "grad_norm": 8.834517594911215, "learning_rate": 1.5302013422818792e-05, "loss": 1.8575, "step": 570 }, { "epoch": 0.02, "grad_norm": 9.17769990533976, "learning_rate": 1.5328859060402685e-05, "loss": 1.5623, "step": 571 }, { "epoch": 0.02, "grad_norm": 10.211237958354836, "learning_rate": 1.5355704697986577e-05, "loss": 1.7896, "step": 572 }, { "epoch": 0.02, "grad_norm": 7.3952703994595534, "learning_rate": 1.5382550335570473e-05, "loss": 1.4155, "step": 573 }, { "epoch": 0.02, "grad_norm": 6.583809823056281, "learning_rate": 1.5409395973154366e-05, "loss": 1.236, "step": 574 }, { "epoch": 0.02, "grad_norm": 8.116588935418603, "learning_rate": 1.5436241610738255e-05, "loss": 1.5395, "step": 575 }, { "epoch": 0.02, "grad_norm": 9.834567717938464, "learning_rate": 1.5463087248322148e-05, "loss": 1.6032, "step": 576 }, { "epoch": 0.02, "grad_norm": 9.313015440352208, "learning_rate": 1.548993288590604e-05, "loss": 1.531, "step": 577 }, { "epoch": 0.02, "grad_norm": 11.761453775322154, "learning_rate": 1.5516778523489936e-05, "loss": 1.4034, "step": 578 }, { "epoch": 0.02, "grad_norm": 7.733988907897115, "learning_rate": 1.554362416107383e-05, "loss": 1.4758, "step": 579 }, { "epoch": 0.02, "grad_norm": 7.723471651705433, "learning_rate": 1.5570469798657718e-05, "loss": 1.6519, "step": 580 }, { "epoch": 0.02, "grad_norm": 7.683024073717439, "learning_rate": 1.559731543624161e-05, "loss": 1.1023, "step": 581 }, { "epoch": 0.02, "grad_norm": 6.771382530687474, "learning_rate": 1.5624161073825506e-05, "loss": 1.2879, "step": 582 }, { "epoch": 0.02, "grad_norm": 8.333162236828377, "learning_rate": 1.56510067114094e-05, "loss": 1.8738, "step": 583 }, { "epoch": 0.02, "grad_norm": 6.726603314237087, "learning_rate": 1.5677852348993288e-05, "loss": 1.2181, "step": 584 }, { "epoch": 0.02, "grad_norm": 8.742760599928726, "learning_rate": 1.570469798657718e-05, "loss": 1.501, "step": 585 }, { "epoch": 0.02, "grad_norm": 7.966656136793154, "learning_rate": 1.5731543624161073e-05, "loss": 1.5028, "step": 586 }, { "epoch": 0.02, "grad_norm": 10.18356366854356, "learning_rate": 1.575838926174497e-05, "loss": 1.3861, "step": 587 }, { "epoch": 0.02, "grad_norm": 11.947862619777618, "learning_rate": 1.5785234899328862e-05, "loss": 1.1892, "step": 588 }, { "epoch": 0.02, "grad_norm": 4.4826065850478445, "learning_rate": 1.581208053691275e-05, "loss": 1.1479, "step": 589 }, { "epoch": 0.02, "grad_norm": 8.320773479811276, "learning_rate": 1.5838926174496644e-05, "loss": 1.3655, "step": 590 }, { "epoch": 0.02, "grad_norm": 7.914291018541771, "learning_rate": 1.586577181208054e-05, "loss": 1.3441, "step": 591 }, { "epoch": 0.02, "grad_norm": 9.516654803971699, "learning_rate": 1.5892617449664432e-05, "loss": 1.8111, "step": 592 }, { "epoch": 0.02, "grad_norm": 8.51706100074725, "learning_rate": 1.5919463087248325e-05, "loss": 1.6239, "step": 593 }, { "epoch": 0.02, "grad_norm": 8.671907382657079, "learning_rate": 1.5946308724832214e-05, "loss": 1.3289, "step": 594 }, { "epoch": 0.02, "grad_norm": 8.026837225200687, "learning_rate": 1.5973154362416107e-05, "loss": 1.3409, "step": 595 }, { "epoch": 0.02, "grad_norm": 6.869668467611291, "learning_rate": 1.6000000000000003e-05, "loss": 1.3098, "step": 596 }, { "epoch": 0.02, "grad_norm": 10.282826546264316, "learning_rate": 1.6026845637583895e-05, "loss": 1.4488, "step": 597 }, { "epoch": 0.02, "grad_norm": 8.690974793437086, "learning_rate": 1.6053691275167788e-05, "loss": 1.0872, "step": 598 }, { "epoch": 0.02, "grad_norm": 9.356392804768612, "learning_rate": 1.6080536912751677e-05, "loss": 1.3607, "step": 599 }, { "epoch": 0.02, "grad_norm": 7.048842024036013, "learning_rate": 1.610738255033557e-05, "loss": 1.4449, "step": 600 }, { "epoch": 0.02, "grad_norm": 9.155717759001321, "learning_rate": 1.6134228187919466e-05, "loss": 1.5859, "step": 601 }, { "epoch": 0.02, "grad_norm": 8.858674187094271, "learning_rate": 1.6161073825503358e-05, "loss": 1.1479, "step": 602 }, { "epoch": 0.02, "grad_norm": 6.558384009973683, "learning_rate": 1.618791946308725e-05, "loss": 1.653, "step": 603 }, { "epoch": 0.02, "grad_norm": 9.775744137326498, "learning_rate": 1.621476510067114e-05, "loss": 1.4949, "step": 604 }, { "epoch": 0.02, "grad_norm": 9.464495909492927, "learning_rate": 1.6241610738255036e-05, "loss": 1.505, "step": 605 }, { "epoch": 0.02, "grad_norm": 13.282144098657279, "learning_rate": 1.626845637583893e-05, "loss": 1.4929, "step": 606 }, { "epoch": 0.02, "grad_norm": 9.774937894051398, "learning_rate": 1.629530201342282e-05, "loss": 1.0399, "step": 607 }, { "epoch": 0.02, "grad_norm": 10.515020436246944, "learning_rate": 1.6322147651006714e-05, "loss": 1.9131, "step": 608 }, { "epoch": 0.02, "grad_norm": 10.02323023790672, "learning_rate": 1.6348993288590603e-05, "loss": 1.6139, "step": 609 }, { "epoch": 0.02, "grad_norm": 3.622876690045713, "learning_rate": 1.63758389261745e-05, "loss": 1.0577, "step": 610 }, { "epoch": 0.02, "grad_norm": 9.776784253430833, "learning_rate": 1.640268456375839e-05, "loss": 1.9954, "step": 611 }, { "epoch": 0.02, "grad_norm": 7.695960454965206, "learning_rate": 1.6429530201342284e-05, "loss": 1.1116, "step": 612 }, { "epoch": 0.02, "grad_norm": 7.016577281751768, "learning_rate": 1.6456375838926177e-05, "loss": 1.5672, "step": 613 }, { "epoch": 0.02, "grad_norm": 8.28260412750175, "learning_rate": 1.648322147651007e-05, "loss": 1.5242, "step": 614 }, { "epoch": 0.02, "grad_norm": 7.842548267981796, "learning_rate": 1.6510067114093962e-05, "loss": 1.4001, "step": 615 }, { "epoch": 0.02, "grad_norm": 7.797958727681141, "learning_rate": 1.6536912751677854e-05, "loss": 1.5679, "step": 616 }, { "epoch": 0.02, "grad_norm": 7.134279512769903, "learning_rate": 1.6563758389261747e-05, "loss": 1.4982, "step": 617 }, { "epoch": 0.02, "grad_norm": 10.93637686416352, "learning_rate": 1.659060402684564e-05, "loss": 1.6387, "step": 618 }, { "epoch": 0.02, "grad_norm": 8.911455523137747, "learning_rate": 1.6617449664429532e-05, "loss": 1.3803, "step": 619 }, { "epoch": 0.02, "grad_norm": 7.111460667724544, "learning_rate": 1.6644295302013425e-05, "loss": 1.2912, "step": 620 }, { "epoch": 0.03, "grad_norm": 12.84826865811465, "learning_rate": 1.6671140939597317e-05, "loss": 1.3408, "step": 621 }, { "epoch": 0.03, "grad_norm": 7.811612554386186, "learning_rate": 1.669798657718121e-05, "loss": 1.4216, "step": 622 }, { "epoch": 0.03, "grad_norm": 9.600228955930593, "learning_rate": 1.6724832214765103e-05, "loss": 1.6831, "step": 623 }, { "epoch": 0.03, "grad_norm": 8.237916989274682, "learning_rate": 1.6751677852348995e-05, "loss": 1.5187, "step": 624 }, { "epoch": 0.03, "grad_norm": 7.690984881873106, "learning_rate": 1.6778523489932888e-05, "loss": 1.2515, "step": 625 }, { "epoch": 0.03, "grad_norm": 9.629821009030652, "learning_rate": 1.680536912751678e-05, "loss": 1.7682, "step": 626 }, { "epoch": 0.03, "grad_norm": 8.768552078380317, "learning_rate": 1.6832214765100673e-05, "loss": 1.6374, "step": 627 }, { "epoch": 0.03, "grad_norm": 7.826202406673499, "learning_rate": 1.6859060402684565e-05, "loss": 1.5675, "step": 628 }, { "epoch": 0.03, "grad_norm": 10.343875400904523, "learning_rate": 1.6885906040268458e-05, "loss": 1.9749, "step": 629 }, { "epoch": 0.03, "grad_norm": 8.442940900110244, "learning_rate": 1.691275167785235e-05, "loss": 1.3851, "step": 630 }, { "epoch": 0.03, "grad_norm": 8.49809278340363, "learning_rate": 1.6939597315436243e-05, "loss": 1.5163, "step": 631 }, { "epoch": 0.03, "grad_norm": 8.87558836941394, "learning_rate": 1.6966442953020136e-05, "loss": 1.4327, "step": 632 }, { "epoch": 0.03, "grad_norm": 8.879065893643617, "learning_rate": 1.699328859060403e-05, "loss": 1.6641, "step": 633 }, { "epoch": 0.03, "grad_norm": 6.970638435658616, "learning_rate": 1.702013422818792e-05, "loss": 1.3324, "step": 634 }, { "epoch": 0.03, "grad_norm": 8.150071994776928, "learning_rate": 1.7046979865771814e-05, "loss": 1.2787, "step": 635 }, { "epoch": 0.03, "grad_norm": 11.721376703536926, "learning_rate": 1.7073825503355706e-05, "loss": 1.6006, "step": 636 }, { "epoch": 0.03, "grad_norm": 8.798452481275602, "learning_rate": 1.71006711409396e-05, "loss": 1.7199, "step": 637 }, { "epoch": 0.03, "grad_norm": 7.493434676423813, "learning_rate": 1.712751677852349e-05, "loss": 1.5992, "step": 638 }, { "epoch": 0.03, "grad_norm": 7.584695602309373, "learning_rate": 1.7154362416107384e-05, "loss": 1.5779, "step": 639 }, { "epoch": 0.03, "grad_norm": 7.211231575961541, "learning_rate": 1.7181208053691277e-05, "loss": 1.2577, "step": 640 }, { "epoch": 0.03, "grad_norm": 13.607595585767653, "learning_rate": 1.720805369127517e-05, "loss": 1.7324, "step": 641 }, { "epoch": 0.03, "grad_norm": 7.043581327014544, "learning_rate": 1.7234899328859062e-05, "loss": 1.587, "step": 642 }, { "epoch": 0.03, "grad_norm": 9.914377064064201, "learning_rate": 1.7261744966442954e-05, "loss": 1.7005, "step": 643 }, { "epoch": 0.03, "grad_norm": 11.383295887873215, "learning_rate": 1.7288590604026847e-05, "loss": 1.5572, "step": 644 }, { "epoch": 0.03, "grad_norm": 7.998904184716423, "learning_rate": 1.731543624161074e-05, "loss": 1.2511, "step": 645 }, { "epoch": 0.03, "grad_norm": 7.115506829839945, "learning_rate": 1.7342281879194632e-05, "loss": 1.3593, "step": 646 }, { "epoch": 0.03, "grad_norm": 6.054047838124271, "learning_rate": 1.7369127516778525e-05, "loss": 1.3131, "step": 647 }, { "epoch": 0.03, "grad_norm": 8.129573932882154, "learning_rate": 1.7395973154362417e-05, "loss": 1.3129, "step": 648 }, { "epoch": 0.03, "grad_norm": 10.761645123310167, "learning_rate": 1.742281879194631e-05, "loss": 1.3701, "step": 649 }, { "epoch": 0.03, "grad_norm": 6.9777522086385035, "learning_rate": 1.7449664429530202e-05, "loss": 1.4474, "step": 650 }, { "epoch": 0.03, "grad_norm": 8.003757630729373, "learning_rate": 1.7476510067114095e-05, "loss": 1.1496, "step": 651 }, { "epoch": 0.03, "grad_norm": 10.243792899357986, "learning_rate": 1.7503355704697988e-05, "loss": 1.8197, "step": 652 }, { "epoch": 0.03, "grad_norm": 7.751956755261069, "learning_rate": 1.753020134228188e-05, "loss": 1.5824, "step": 653 }, { "epoch": 0.03, "grad_norm": 12.096351528690645, "learning_rate": 1.7557046979865773e-05, "loss": 1.497, "step": 654 }, { "epoch": 0.03, "grad_norm": 6.694022663612402, "learning_rate": 1.7583892617449665e-05, "loss": 1.3844, "step": 655 }, { "epoch": 0.03, "grad_norm": 6.58025552246381, "learning_rate": 1.7610738255033558e-05, "loss": 1.5863, "step": 656 }, { "epoch": 0.03, "grad_norm": 13.547482488927036, "learning_rate": 1.763758389261745e-05, "loss": 1.0899, "step": 657 }, { "epoch": 0.03, "grad_norm": 7.416158381154374, "learning_rate": 1.7664429530201343e-05, "loss": 1.5087, "step": 658 }, { "epoch": 0.03, "grad_norm": 9.294839563754406, "learning_rate": 1.7691275167785236e-05, "loss": 1.8399, "step": 659 }, { "epoch": 0.03, "grad_norm": 14.630333333437656, "learning_rate": 1.771812080536913e-05, "loss": 1.2181, "step": 660 }, { "epoch": 0.03, "grad_norm": 8.113713147515277, "learning_rate": 1.774496644295302e-05, "loss": 1.4961, "step": 661 }, { "epoch": 0.03, "grad_norm": 10.248928916187554, "learning_rate": 1.7771812080536913e-05, "loss": 1.506, "step": 662 }, { "epoch": 0.03, "grad_norm": 13.843347056037487, "learning_rate": 1.7798657718120806e-05, "loss": 1.7997, "step": 663 }, { "epoch": 0.03, "grad_norm": 7.595666452267448, "learning_rate": 1.78255033557047e-05, "loss": 1.4877, "step": 664 }, { "epoch": 0.03, "grad_norm": 6.573118819718279, "learning_rate": 1.7852348993288595e-05, "loss": 1.4087, "step": 665 }, { "epoch": 0.03, "grad_norm": 10.990766588736186, "learning_rate": 1.7879194630872484e-05, "loss": 1.8962, "step": 666 }, { "epoch": 0.03, "grad_norm": 7.799215913727351, "learning_rate": 1.7906040268456376e-05, "loss": 1.3199, "step": 667 }, { "epoch": 0.03, "grad_norm": 4.683206986329194, "learning_rate": 1.793288590604027e-05, "loss": 1.0974, "step": 668 }, { "epoch": 0.03, "grad_norm": 9.351257159317212, "learning_rate": 1.795973154362416e-05, "loss": 1.3103, "step": 669 }, { "epoch": 0.03, "grad_norm": 11.950992592804711, "learning_rate": 1.7986577181208054e-05, "loss": 1.4563, "step": 670 }, { "epoch": 0.03, "grad_norm": 10.550490056202355, "learning_rate": 1.8013422818791947e-05, "loss": 1.6629, "step": 671 }, { "epoch": 0.03, "grad_norm": 8.37748526166874, "learning_rate": 1.804026845637584e-05, "loss": 1.4842, "step": 672 }, { "epoch": 0.03, "grad_norm": 7.969724880593675, "learning_rate": 1.8067114093959732e-05, "loss": 1.1297, "step": 673 }, { "epoch": 0.03, "grad_norm": 9.320598290114177, "learning_rate": 1.8093959731543628e-05, "loss": 1.4965, "step": 674 }, { "epoch": 0.03, "grad_norm": 7.66158812672837, "learning_rate": 1.8120805369127517e-05, "loss": 1.5017, "step": 675 }, { "epoch": 0.03, "grad_norm": 9.024446627271338, "learning_rate": 1.814765100671141e-05, "loss": 1.7178, "step": 676 }, { "epoch": 0.03, "grad_norm": 9.722779569375733, "learning_rate": 1.8174496644295302e-05, "loss": 1.4876, "step": 677 }, { "epoch": 0.03, "grad_norm": 7.997168282674599, "learning_rate": 1.8201342281879195e-05, "loss": 1.22, "step": 678 }, { "epoch": 0.03, "grad_norm": 8.814857150025032, "learning_rate": 1.822818791946309e-05, "loss": 1.3637, "step": 679 }, { "epoch": 0.03, "grad_norm": 7.720562058525064, "learning_rate": 1.825503355704698e-05, "loss": 1.4558, "step": 680 }, { "epoch": 0.03, "grad_norm": 16.095251042606805, "learning_rate": 1.8281879194630873e-05, "loss": 1.6135, "step": 681 }, { "epoch": 0.03, "grad_norm": 6.379690528812599, "learning_rate": 1.8308724832214765e-05, "loss": 1.4294, "step": 682 }, { "epoch": 0.03, "grad_norm": 6.237698216152336, "learning_rate": 1.833557046979866e-05, "loss": 1.4633, "step": 683 }, { "epoch": 0.03, "grad_norm": 11.920617156667952, "learning_rate": 1.8362416107382554e-05, "loss": 1.3385, "step": 684 }, { "epoch": 0.03, "grad_norm": 11.978482030153378, "learning_rate": 1.8389261744966443e-05, "loss": 1.4339, "step": 685 }, { "epoch": 0.03, "grad_norm": 10.675762067153812, "learning_rate": 1.8416107382550336e-05, "loss": 1.7475, "step": 686 }, { "epoch": 0.03, "grad_norm": 6.91537531960125, "learning_rate": 1.8442953020134228e-05, "loss": 1.3903, "step": 687 }, { "epoch": 0.03, "grad_norm": 8.39191489746502, "learning_rate": 1.8469798657718124e-05, "loss": 1.1273, "step": 688 }, { "epoch": 0.03, "grad_norm": 7.698657895502848, "learning_rate": 1.8496644295302017e-05, "loss": 1.2752, "step": 689 }, { "epoch": 0.03, "grad_norm": 10.040502632693768, "learning_rate": 1.8523489932885906e-05, "loss": 1.3141, "step": 690 }, { "epoch": 0.03, "grad_norm": 11.41594770225767, "learning_rate": 1.85503355704698e-05, "loss": 1.8202, "step": 691 }, { "epoch": 0.03, "grad_norm": 9.059940052491621, "learning_rate": 1.857718120805369e-05, "loss": 1.6422, "step": 692 }, { "epoch": 0.03, "grad_norm": 6.998902675284476, "learning_rate": 1.8604026845637587e-05, "loss": 1.6059, "step": 693 }, { "epoch": 0.03, "grad_norm": 9.060663849685163, "learning_rate": 1.863087248322148e-05, "loss": 1.3515, "step": 694 }, { "epoch": 0.03, "grad_norm": 10.44681127573044, "learning_rate": 1.865771812080537e-05, "loss": 1.4895, "step": 695 }, { "epoch": 0.03, "grad_norm": 8.924586200696046, "learning_rate": 1.868456375838926e-05, "loss": 1.4124, "step": 696 }, { "epoch": 0.03, "grad_norm": 9.753865952420195, "learning_rate": 1.8711409395973157e-05, "loss": 1.2813, "step": 697 }, { "epoch": 0.03, "grad_norm": 8.585563320327854, "learning_rate": 1.873825503355705e-05, "loss": 1.2427, "step": 698 }, { "epoch": 0.03, "grad_norm": 11.28238335349942, "learning_rate": 1.8765100671140943e-05, "loss": 1.6496, "step": 699 }, { "epoch": 0.03, "grad_norm": 7.320841708865357, "learning_rate": 1.8791946308724832e-05, "loss": 1.4567, "step": 700 }, { "epoch": 0.03, "grad_norm": 8.720938619177465, "learning_rate": 1.8818791946308724e-05, "loss": 1.4006, "step": 701 }, { "epoch": 0.03, "grad_norm": 8.087994039308818, "learning_rate": 1.884563758389262e-05, "loss": 1.738, "step": 702 }, { "epoch": 0.03, "grad_norm": 6.348231111030693, "learning_rate": 1.8872483221476513e-05, "loss": 1.1448, "step": 703 }, { "epoch": 0.03, "grad_norm": 7.934053643986842, "learning_rate": 1.8899328859060406e-05, "loss": 1.4155, "step": 704 }, { "epoch": 0.03, "grad_norm": 10.804127198362616, "learning_rate": 1.8926174496644295e-05, "loss": 1.2542, "step": 705 }, { "epoch": 0.03, "grad_norm": 8.454901448724996, "learning_rate": 1.895302013422819e-05, "loss": 1.5187, "step": 706 }, { "epoch": 0.03, "grad_norm": 8.667501023111416, "learning_rate": 1.8979865771812083e-05, "loss": 1.3869, "step": 707 }, { "epoch": 0.03, "grad_norm": 6.685829475079491, "learning_rate": 1.9006711409395976e-05, "loss": 1.4483, "step": 708 }, { "epoch": 0.03, "grad_norm": 7.851068920924944, "learning_rate": 1.903355704697987e-05, "loss": 1.5229, "step": 709 }, { "epoch": 0.03, "grad_norm": 12.360581657943092, "learning_rate": 1.9060402684563758e-05, "loss": 1.5678, "step": 710 }, { "epoch": 0.03, "grad_norm": 8.838471833579849, "learning_rate": 1.9087248322147654e-05, "loss": 1.5814, "step": 711 }, { "epoch": 0.03, "grad_norm": 11.538814897314536, "learning_rate": 1.9114093959731546e-05, "loss": 1.5919, "step": 712 }, { "epoch": 0.03, "grad_norm": 6.973865796091992, "learning_rate": 1.914093959731544e-05, "loss": 1.5036, "step": 713 }, { "epoch": 0.03, "grad_norm": 8.942099494799981, "learning_rate": 1.916778523489933e-05, "loss": 1.346, "step": 714 }, { "epoch": 0.03, "grad_norm": 6.928561960545991, "learning_rate": 1.919463087248322e-05, "loss": 1.5113, "step": 715 }, { "epoch": 0.03, "grad_norm": 8.327535615218974, "learning_rate": 1.9221476510067117e-05, "loss": 1.4813, "step": 716 }, { "epoch": 0.03, "grad_norm": 10.111254074950109, "learning_rate": 1.924832214765101e-05, "loss": 1.3902, "step": 717 }, { "epoch": 0.03, "grad_norm": 8.795375983098658, "learning_rate": 1.9275167785234902e-05, "loss": 1.4768, "step": 718 }, { "epoch": 0.03, "grad_norm": 6.106205038701756, "learning_rate": 1.9302013422818794e-05, "loss": 1.3735, "step": 719 }, { "epoch": 0.03, "grad_norm": 4.282268008053936, "learning_rate": 1.9328859060402687e-05, "loss": 0.9056, "step": 720 }, { "epoch": 0.03, "grad_norm": 11.643635341717253, "learning_rate": 1.935570469798658e-05, "loss": 1.9102, "step": 721 }, { "epoch": 0.03, "grad_norm": 6.40585713760294, "learning_rate": 1.9382550335570472e-05, "loss": 1.1561, "step": 722 }, { "epoch": 0.03, "grad_norm": 9.501838932148951, "learning_rate": 1.9409395973154365e-05, "loss": 1.1236, "step": 723 }, { "epoch": 0.03, "grad_norm": 8.98236565771981, "learning_rate": 1.9436241610738257e-05, "loss": 1.8247, "step": 724 }, { "epoch": 0.03, "grad_norm": 5.727741036215868, "learning_rate": 1.946308724832215e-05, "loss": 1.2078, "step": 725 }, { "epoch": 0.03, "grad_norm": 10.416768621462328, "learning_rate": 1.9489932885906042e-05, "loss": 1.7006, "step": 726 }, { "epoch": 0.03, "grad_norm": 7.513142860504843, "learning_rate": 1.9516778523489935e-05, "loss": 1.1217, "step": 727 }, { "epoch": 0.03, "grad_norm": 8.072295401301412, "learning_rate": 1.9543624161073828e-05, "loss": 1.38, "step": 728 }, { "epoch": 0.03, "grad_norm": 7.6748219663576736, "learning_rate": 1.957046979865772e-05, "loss": 1.7309, "step": 729 }, { "epoch": 0.03, "grad_norm": 10.254250623148174, "learning_rate": 1.9597315436241613e-05, "loss": 1.972, "step": 730 }, { "epoch": 0.03, "grad_norm": 6.830264236163746, "learning_rate": 1.9624161073825505e-05, "loss": 1.1723, "step": 731 }, { "epoch": 0.03, "grad_norm": 7.223272899053619, "learning_rate": 1.9651006711409398e-05, "loss": 1.6147, "step": 732 }, { "epoch": 0.03, "grad_norm": 6.85084381573008, "learning_rate": 1.967785234899329e-05, "loss": 1.3413, "step": 733 }, { "epoch": 0.03, "grad_norm": 7.742970632841864, "learning_rate": 1.9704697986577183e-05, "loss": 1.3835, "step": 734 }, { "epoch": 0.03, "grad_norm": 6.360610460935072, "learning_rate": 1.9731543624161076e-05, "loss": 1.08, "step": 735 }, { "epoch": 0.03, "grad_norm": 7.285160081855017, "learning_rate": 1.975838926174497e-05, "loss": 1.6392, "step": 736 }, { "epoch": 0.03, "grad_norm": 7.1117153040527645, "learning_rate": 1.978523489932886e-05, "loss": 1.2834, "step": 737 }, { "epoch": 0.03, "grad_norm": 3.785895560013108, "learning_rate": 1.9812080536912754e-05, "loss": 1.1575, "step": 738 }, { "epoch": 0.03, "grad_norm": 7.703110716339846, "learning_rate": 1.9838926174496646e-05, "loss": 1.7975, "step": 739 }, { "epoch": 0.03, "grad_norm": 6.237705248850261, "learning_rate": 1.986577181208054e-05, "loss": 1.291, "step": 740 }, { "epoch": 0.03, "grad_norm": 9.116558227935354, "learning_rate": 1.989261744966443e-05, "loss": 2.0239, "step": 741 }, { "epoch": 0.03, "grad_norm": 6.90484734944185, "learning_rate": 1.9919463087248324e-05, "loss": 1.3373, "step": 742 }, { "epoch": 0.03, "grad_norm": 6.5733463525343225, "learning_rate": 1.9946308724832216e-05, "loss": 1.3538, "step": 743 }, { "epoch": 0.03, "grad_norm": 7.116657752010826, "learning_rate": 1.997315436241611e-05, "loss": 1.5555, "step": 744 }, { "epoch": 0.03, "grad_norm": 6.469860543505684, "learning_rate": 2e-05, "loss": 1.1794, "step": 745 }, { "epoch": 0.03, "grad_norm": 5.706829985744727, "learning_rate": 1.9999999914901735e-05, "loss": 1.2363, "step": 746 }, { "epoch": 0.03, "grad_norm": 8.206991344480622, "learning_rate": 1.9999999659606937e-05, "loss": 1.599, "step": 747 }, { "epoch": 0.03, "grad_norm": 9.706840586256803, "learning_rate": 1.999999923411561e-05, "loss": 1.4984, "step": 748 }, { "epoch": 0.03, "grad_norm": 7.264141889429116, "learning_rate": 1.9999998638427763e-05, "loss": 1.716, "step": 749 }, { "epoch": 0.03, "grad_norm": 9.274320343622609, "learning_rate": 1.9999997872543408e-05, "loss": 1.7843, "step": 750 }, { "epoch": 0.03, "grad_norm": 7.8960701568138205, "learning_rate": 1.9999996936462554e-05, "loss": 1.7174, "step": 751 }, { "epoch": 0.03, "grad_norm": 3.598929548023946, "learning_rate": 1.9999995830185223e-05, "loss": 1.0959, "step": 752 }, { "epoch": 0.03, "grad_norm": 7.780972960595795, "learning_rate": 1.999999455371143e-05, "loss": 1.1483, "step": 753 }, { "epoch": 0.03, "grad_norm": 10.184131294296247, "learning_rate": 1.9999993107041193e-05, "loss": 1.7082, "step": 754 }, { "epoch": 0.03, "grad_norm": 11.781349946752146, "learning_rate": 1.9999991490174538e-05, "loss": 2.0316, "step": 755 }, { "epoch": 0.03, "grad_norm": 8.640662260274956, "learning_rate": 1.99999897031115e-05, "loss": 1.3356, "step": 756 }, { "epoch": 0.03, "grad_norm": 8.88159666557774, "learning_rate": 1.99999877458521e-05, "loss": 1.4823, "step": 757 }, { "epoch": 0.03, "grad_norm": 9.44327637549819, "learning_rate": 1.9999985618396377e-05, "loss": 1.3769, "step": 758 }, { "epoch": 0.03, "grad_norm": 6.395754990201366, "learning_rate": 1.999998332074437e-05, "loss": 1.1389, "step": 759 }, { "epoch": 0.03, "grad_norm": 8.37935042952115, "learning_rate": 1.9999980852896106e-05, "loss": 1.3471, "step": 760 }, { "epoch": 0.03, "grad_norm": 7.228485876678234, "learning_rate": 1.999997821485164e-05, "loss": 1.042, "step": 761 }, { "epoch": 0.03, "grad_norm": 5.7488458530356, "learning_rate": 1.999997540661101e-05, "loss": 1.3817, "step": 762 }, { "epoch": 0.03, "grad_norm": 8.917897366076083, "learning_rate": 1.9999972428174262e-05, "loss": 1.6128, "step": 763 }, { "epoch": 0.03, "grad_norm": 7.550449975188234, "learning_rate": 1.9999969279541456e-05, "loss": 1.3963, "step": 764 }, { "epoch": 0.03, "grad_norm": 7.13585240811636, "learning_rate": 1.9999965960712635e-05, "loss": 1.2469, "step": 765 }, { "epoch": 0.03, "grad_norm": 7.7245083934014644, "learning_rate": 1.9999962471687865e-05, "loss": 1.3719, "step": 766 }, { "epoch": 0.03, "grad_norm": 10.296826926203583, "learning_rate": 1.9999958812467196e-05, "loss": 2.0527, "step": 767 }, { "epoch": 0.03, "grad_norm": 6.024472436586068, "learning_rate": 1.9999954983050696e-05, "loss": 1.4095, "step": 768 }, { "epoch": 0.03, "grad_norm": 9.260654994613297, "learning_rate": 1.9999950983438433e-05, "loss": 1.6362, "step": 769 }, { "epoch": 0.03, "grad_norm": 11.186866729649955, "learning_rate": 1.9999946813630468e-05, "loss": 1.6968, "step": 770 }, { "epoch": 0.03, "grad_norm": 5.813063620379354, "learning_rate": 1.999994247362687e-05, "loss": 1.4802, "step": 771 }, { "epoch": 0.03, "grad_norm": 7.981096229430154, "learning_rate": 1.9999937963427728e-05, "loss": 1.4514, "step": 772 }, { "epoch": 0.03, "grad_norm": 7.03754199655554, "learning_rate": 1.99999332830331e-05, "loss": 1.3815, "step": 773 }, { "epoch": 0.03, "grad_norm": 9.023180925830445, "learning_rate": 1.9999928432443077e-05, "loss": 1.498, "step": 774 }, { "epoch": 0.03, "grad_norm": 6.131724684558895, "learning_rate": 1.9999923411657742e-05, "loss": 1.4913, "step": 775 }, { "epoch": 0.03, "grad_norm": 6.818180068795018, "learning_rate": 1.9999918220677176e-05, "loss": 1.4362, "step": 776 }, { "epoch": 0.03, "grad_norm": 6.546406079156196, "learning_rate": 1.9999912859501466e-05, "loss": 1.3896, "step": 777 }, { "epoch": 0.03, "grad_norm": 8.86169432171516, "learning_rate": 1.9999907328130708e-05, "loss": 1.2364, "step": 778 }, { "epoch": 0.03, "grad_norm": 10.166797940058492, "learning_rate": 1.9999901626564993e-05, "loss": 1.8125, "step": 779 }, { "epoch": 0.03, "grad_norm": 8.681415371586207, "learning_rate": 1.9999895754804423e-05, "loss": 1.6726, "step": 780 }, { "epoch": 0.03, "grad_norm": 8.785825979286638, "learning_rate": 1.999988971284909e-05, "loss": 1.4207, "step": 781 }, { "epoch": 0.03, "grad_norm": 6.096810940912092, "learning_rate": 1.9999883500699103e-05, "loss": 1.1938, "step": 782 }, { "epoch": 0.03, "grad_norm": 14.439366157570156, "learning_rate": 1.9999877118354564e-05, "loss": 1.5035, "step": 783 }, { "epoch": 0.03, "grad_norm": 14.785683165772193, "learning_rate": 1.9999870565815582e-05, "loss": 1.619, "step": 784 }, { "epoch": 0.03, "grad_norm": 6.644861384471609, "learning_rate": 1.9999863843082274e-05, "loss": 1.4064, "step": 785 }, { "epoch": 0.03, "grad_norm": 6.9769501926234625, "learning_rate": 1.9999856950154747e-05, "loss": 1.5519, "step": 786 }, { "epoch": 0.03, "grad_norm": 11.067900778530898, "learning_rate": 1.999984988703312e-05, "loss": 1.6625, "step": 787 }, { "epoch": 0.03, "grad_norm": 8.061841807549257, "learning_rate": 1.999984265371752e-05, "loss": 1.585, "step": 788 }, { "epoch": 0.03, "grad_norm": 9.023966626054792, "learning_rate": 1.999983525020806e-05, "loss": 1.619, "step": 789 }, { "epoch": 0.03, "grad_norm": 6.374809501106074, "learning_rate": 1.9999827676504874e-05, "loss": 1.3745, "step": 790 }, { "epoch": 0.03, "grad_norm": 6.212699114977627, "learning_rate": 1.999981993260809e-05, "loss": 1.4163, "step": 791 }, { "epoch": 0.03, "grad_norm": 9.955192400510422, "learning_rate": 1.999981201851783e-05, "loss": 1.4073, "step": 792 }, { "epoch": 0.03, "grad_norm": 4.131343692873796, "learning_rate": 1.999980393423424e-05, "loss": 1.1097, "step": 793 }, { "epoch": 0.03, "grad_norm": 7.389692371614512, "learning_rate": 1.9999795679757456e-05, "loss": 1.4518, "step": 794 }, { "epoch": 0.03, "grad_norm": 9.418405174133042, "learning_rate": 1.9999787255087613e-05, "loss": 1.1745, "step": 795 }, { "epoch": 0.03, "grad_norm": 8.112785581933728, "learning_rate": 1.9999778660224863e-05, "loss": 1.5497, "step": 796 }, { "epoch": 0.03, "grad_norm": 8.979924218349447, "learning_rate": 1.999976989516934e-05, "loss": 1.6604, "step": 797 }, { "epoch": 0.03, "grad_norm": 5.744524953387931, "learning_rate": 1.9999760959921204e-05, "loss": 1.1642, "step": 798 }, { "epoch": 0.03, "grad_norm": 9.497287409860748, "learning_rate": 1.9999751854480604e-05, "loss": 1.7451, "step": 799 }, { "epoch": 0.03, "grad_norm": 5.37670110198183, "learning_rate": 1.9999742578847692e-05, "loss": 1.1948, "step": 800 }, { "epoch": 0.03, "grad_norm": 12.60652184238769, "learning_rate": 1.999973313302263e-05, "loss": 1.633, "step": 801 }, { "epoch": 0.03, "grad_norm": 5.566660149471107, "learning_rate": 1.9999723517005574e-05, "loss": 1.1491, "step": 802 }, { "epoch": 0.03, "grad_norm": 11.582237190888199, "learning_rate": 1.9999713730796693e-05, "loss": 1.8714, "step": 803 }, { "epoch": 0.03, "grad_norm": 9.649713840485242, "learning_rate": 1.999970377439615e-05, "loss": 1.8329, "step": 804 }, { "epoch": 0.03, "grad_norm": 9.302606868619392, "learning_rate": 1.9999693647804115e-05, "loss": 1.332, "step": 805 }, { "epoch": 0.03, "grad_norm": 8.3297002103004, "learning_rate": 1.9999683351020765e-05, "loss": 1.3406, "step": 806 }, { "epoch": 0.03, "grad_norm": 7.387169241072521, "learning_rate": 1.9999672884046265e-05, "loss": 1.2219, "step": 807 }, { "epoch": 0.03, "grad_norm": 6.21946052432099, "learning_rate": 1.9999662246880804e-05, "loss": 1.125, "step": 808 }, { "epoch": 0.03, "grad_norm": 6.686452505019419, "learning_rate": 1.9999651439524553e-05, "loss": 1.5402, "step": 809 }, { "epoch": 0.03, "grad_norm": 8.943442565837149, "learning_rate": 1.9999640461977706e-05, "loss": 1.5689, "step": 810 }, { "epoch": 0.03, "grad_norm": 7.555902913489458, "learning_rate": 1.999962931424044e-05, "loss": 1.4912, "step": 811 }, { "epoch": 0.03, "grad_norm": 7.916068438554028, "learning_rate": 1.999961799631296e-05, "loss": 1.4335, "step": 812 }, { "epoch": 0.03, "grad_norm": 9.13841769076635, "learning_rate": 1.999960650819544e-05, "loss": 1.306, "step": 813 }, { "epoch": 0.03, "grad_norm": 6.509678119174156, "learning_rate": 1.9999594849888083e-05, "loss": 1.2639, "step": 814 }, { "epoch": 0.03, "grad_norm": 7.671413232654003, "learning_rate": 1.9999583021391093e-05, "loss": 1.533, "step": 815 }, { "epoch": 0.03, "grad_norm": 6.170979308483551, "learning_rate": 1.9999571022704665e-05, "loss": 1.3236, "step": 816 }, { "epoch": 0.03, "grad_norm": 8.965625377361226, "learning_rate": 1.9999558853829003e-05, "loss": 1.3139, "step": 817 }, { "epoch": 0.03, "grad_norm": 7.3054427065930865, "learning_rate": 1.999954651476432e-05, "loss": 1.2987, "step": 818 }, { "epoch": 0.03, "grad_norm": 6.710995486022545, "learning_rate": 1.999953400551082e-05, "loss": 1.4309, "step": 819 }, { "epoch": 0.03, "grad_norm": 7.881028038869501, "learning_rate": 1.999952132606872e-05, "loss": 1.4738, "step": 820 }, { "epoch": 0.03, "grad_norm": 7.814300233674139, "learning_rate": 1.999950847643823e-05, "loss": 1.2071, "step": 821 }, { "epoch": 0.03, "grad_norm": 9.62747515332661, "learning_rate": 1.9999495456619572e-05, "loss": 1.3801, "step": 822 }, { "epoch": 0.03, "grad_norm": 7.505578199197759, "learning_rate": 1.999948226661297e-05, "loss": 1.3126, "step": 823 }, { "epoch": 0.03, "grad_norm": 8.957752844563435, "learning_rate": 1.9999468906418647e-05, "loss": 1.3338, "step": 824 }, { "epoch": 0.03, "grad_norm": 8.933677100266538, "learning_rate": 1.999945537603683e-05, "loss": 1.2617, "step": 825 }, { "epoch": 0.03, "grad_norm": 7.0164568690635996, "learning_rate": 1.9999441675467748e-05, "loss": 1.3407, "step": 826 }, { "epoch": 0.03, "grad_norm": 7.020236962220256, "learning_rate": 1.9999427804711638e-05, "loss": 1.7017, "step": 827 }, { "epoch": 0.03, "grad_norm": 7.146532602788197, "learning_rate": 1.999941376376873e-05, "loss": 1.5422, "step": 828 }, { "epoch": 0.03, "grad_norm": 6.600247752067845, "learning_rate": 1.9999399552639268e-05, "loss": 1.449, "step": 829 }, { "epoch": 0.03, "grad_norm": 7.858498831341781, "learning_rate": 1.9999385171323492e-05, "loss": 1.5386, "step": 830 }, { "epoch": 0.03, "grad_norm": 6.460704209311796, "learning_rate": 1.9999370619821646e-05, "loss": 1.3679, "step": 831 }, { "epoch": 0.03, "grad_norm": 9.63946032846754, "learning_rate": 1.999935589813398e-05, "loss": 1.795, "step": 832 }, { "epoch": 0.03, "grad_norm": 8.027083400066376, "learning_rate": 1.999934100626074e-05, "loss": 1.3929, "step": 833 }, { "epoch": 0.03, "grad_norm": 7.039673303531014, "learning_rate": 1.9999325944202187e-05, "loss": 1.3605, "step": 834 }, { "epoch": 0.03, "grad_norm": 7.1670429422022215, "learning_rate": 1.999931071195857e-05, "loss": 1.2207, "step": 835 }, { "epoch": 0.03, "grad_norm": 7.710611359121355, "learning_rate": 1.9999295309530155e-05, "loss": 1.3509, "step": 836 }, { "epoch": 0.03, "grad_norm": 6.9800599831532715, "learning_rate": 1.99992797369172e-05, "loss": 1.4956, "step": 837 }, { "epoch": 0.03, "grad_norm": 7.076902527027667, "learning_rate": 1.999926399411997e-05, "loss": 1.4288, "step": 838 }, { "epoch": 0.03, "grad_norm": 6.945709014067419, "learning_rate": 1.999924808113873e-05, "loss": 1.5127, "step": 839 }, { "epoch": 0.03, "grad_norm": 8.946234903174766, "learning_rate": 1.9999231997973756e-05, "loss": 1.2649, "step": 840 }, { "epoch": 0.03, "grad_norm": 7.261989801534517, "learning_rate": 1.999921574462532e-05, "loss": 1.3994, "step": 841 }, { "epoch": 0.03, "grad_norm": 9.364302618827185, "learning_rate": 1.99991993210937e-05, "loss": 1.4293, "step": 842 }, { "epoch": 0.03, "grad_norm": 9.348881751103375, "learning_rate": 1.9999182727379173e-05, "loss": 1.8878, "step": 843 }, { "epoch": 0.03, "grad_norm": 3.822638419472803, "learning_rate": 1.9999165963482025e-05, "loss": 0.9755, "step": 844 }, { "epoch": 0.03, "grad_norm": 9.823924191768002, "learning_rate": 1.9999149029402533e-05, "loss": 1.1731, "step": 845 }, { "epoch": 0.03, "grad_norm": 7.441260653035092, "learning_rate": 1.9999131925140994e-05, "loss": 1.2607, "step": 846 }, { "epoch": 0.03, "grad_norm": 6.137848826561035, "learning_rate": 1.99991146506977e-05, "loss": 1.3407, "step": 847 }, { "epoch": 0.03, "grad_norm": 7.348558348268355, "learning_rate": 1.9999097206072936e-05, "loss": 1.55, "step": 848 }, { "epoch": 0.03, "grad_norm": 6.90711494647997, "learning_rate": 1.9999079591267008e-05, "loss": 1.5706, "step": 849 }, { "epoch": 0.03, "grad_norm": 6.531440156458352, "learning_rate": 1.999906180628021e-05, "loss": 1.1552, "step": 850 }, { "epoch": 0.03, "grad_norm": 8.423840931270778, "learning_rate": 1.9999043851112848e-05, "loss": 1.0926, "step": 851 }, { "epoch": 0.03, "grad_norm": 6.470800628277083, "learning_rate": 1.9999025725765223e-05, "loss": 1.6047, "step": 852 }, { "epoch": 0.03, "grad_norm": 8.481258890278603, "learning_rate": 1.9999007430237648e-05, "loss": 1.2276, "step": 853 }, { "epoch": 0.03, "grad_norm": 5.599166988714669, "learning_rate": 1.9998988964530434e-05, "loss": 1.2592, "step": 854 }, { "epoch": 0.03, "grad_norm": 5.998311713897696, "learning_rate": 1.9998970328643895e-05, "loss": 1.4086, "step": 855 }, { "epoch": 0.03, "grad_norm": 8.81383571468493, "learning_rate": 1.9998951522578344e-05, "loss": 1.2964, "step": 856 }, { "epoch": 0.03, "grad_norm": 10.357954055525788, "learning_rate": 1.999893254633411e-05, "loss": 1.4832, "step": 857 }, { "epoch": 0.03, "grad_norm": 5.593746315651964, "learning_rate": 1.9998913399911507e-05, "loss": 1.1391, "step": 858 }, { "epoch": 0.03, "grad_norm": 6.1076663544153185, "learning_rate": 1.9998894083310864e-05, "loss": 1.2948, "step": 859 }, { "epoch": 0.03, "grad_norm": 6.99522056733465, "learning_rate": 1.9998874596532512e-05, "loss": 1.1518, "step": 860 }, { "epoch": 0.03, "grad_norm": 9.14299161592401, "learning_rate": 1.999885493957678e-05, "loss": 1.2017, "step": 861 }, { "epoch": 0.03, "grad_norm": 5.656668942223748, "learning_rate": 1.9998835112444005e-05, "loss": 1.2726, "step": 862 }, { "epoch": 0.03, "grad_norm": 7.4357093681983795, "learning_rate": 1.999881511513452e-05, "loss": 1.3603, "step": 863 }, { "epoch": 0.03, "grad_norm": 9.527095207488873, "learning_rate": 1.9998794947648667e-05, "loss": 1.6349, "step": 864 }, { "epoch": 0.03, "grad_norm": 8.529976740925223, "learning_rate": 1.9998774609986794e-05, "loss": 1.2691, "step": 865 }, { "epoch": 0.03, "grad_norm": 6.486917256158089, "learning_rate": 1.9998754102149243e-05, "loss": 1.4847, "step": 866 }, { "epoch": 0.03, "grad_norm": 6.3840159519485535, "learning_rate": 1.9998733424136362e-05, "loss": 1.2487, "step": 867 }, { "epoch": 0.03, "grad_norm": 6.786659163729986, "learning_rate": 1.9998712575948508e-05, "loss": 1.4958, "step": 868 }, { "epoch": 0.04, "grad_norm": 3.9899254222054235, "learning_rate": 1.999869155758603e-05, "loss": 1.0337, "step": 869 }, { "epoch": 0.04, "grad_norm": 11.466241359445439, "learning_rate": 1.9998670369049287e-05, "loss": 1.4944, "step": 870 }, { "epoch": 0.04, "grad_norm": 7.281120623943987, "learning_rate": 1.999864901033864e-05, "loss": 1.4187, "step": 871 }, { "epoch": 0.04, "grad_norm": 3.743635079779636, "learning_rate": 1.9998627481454455e-05, "loss": 1.0909, "step": 872 }, { "epoch": 0.04, "grad_norm": 7.8700526621491536, "learning_rate": 1.9998605782397095e-05, "loss": 1.5574, "step": 873 }, { "epoch": 0.04, "grad_norm": 6.178490473313165, "learning_rate": 1.9998583913166937e-05, "loss": 1.2867, "step": 874 }, { "epoch": 0.04, "grad_norm": 7.707439956768373, "learning_rate": 1.999856187376434e-05, "loss": 1.6709, "step": 875 }, { "epoch": 0.04, "grad_norm": 6.869414675758467, "learning_rate": 1.999853966418969e-05, "loss": 1.2027, "step": 876 }, { "epoch": 0.04, "grad_norm": 7.862271494855543, "learning_rate": 1.999851728444336e-05, "loss": 1.3763, "step": 877 }, { "epoch": 0.04, "grad_norm": 9.92081644880729, "learning_rate": 1.999849473452573e-05, "loss": 1.5602, "step": 878 }, { "epoch": 0.04, "grad_norm": 7.068815683800614, "learning_rate": 1.999847201443719e-05, "loss": 1.4446, "step": 879 }, { "epoch": 0.04, "grad_norm": 6.570821762072489, "learning_rate": 1.9998449124178122e-05, "loss": 1.3241, "step": 880 }, { "epoch": 0.04, "grad_norm": 5.490764591285981, "learning_rate": 1.9998426063748915e-05, "loss": 1.3843, "step": 881 }, { "epoch": 0.04, "grad_norm": 7.792883202515735, "learning_rate": 1.9998402833149962e-05, "loss": 1.0749, "step": 882 }, { "epoch": 0.04, "grad_norm": 5.772212550643707, "learning_rate": 1.999837943238166e-05, "loss": 1.2109, "step": 883 }, { "epoch": 0.04, "grad_norm": 7.523506950103476, "learning_rate": 1.9998355861444408e-05, "loss": 1.747, "step": 884 }, { "epoch": 0.04, "grad_norm": 6.616832527487637, "learning_rate": 1.9998332120338603e-05, "loss": 1.5266, "step": 885 }, { "epoch": 0.04, "grad_norm": 6.840466841293714, "learning_rate": 1.999830820906465e-05, "loss": 1.4726, "step": 886 }, { "epoch": 0.04, "grad_norm": 6.769396106789772, "learning_rate": 1.9998284127622963e-05, "loss": 1.5841, "step": 887 }, { "epoch": 0.04, "grad_norm": 7.458809827355406, "learning_rate": 1.999825987601394e-05, "loss": 1.4778, "step": 888 }, { "epoch": 0.04, "grad_norm": 5.839335232516739, "learning_rate": 1.9998235454238006e-05, "loss": 1.1632, "step": 889 }, { "epoch": 0.04, "grad_norm": 3.924268410262047, "learning_rate": 1.9998210862295568e-05, "loss": 1.0121, "step": 890 }, { "epoch": 0.04, "grad_norm": 6.132790765966691, "learning_rate": 1.999818610018705e-05, "loss": 1.2826, "step": 891 }, { "epoch": 0.04, "grad_norm": 9.638807362039483, "learning_rate": 1.9998161167912865e-05, "loss": 1.8299, "step": 892 }, { "epoch": 0.04, "grad_norm": 8.348266224987619, "learning_rate": 1.9998136065473447e-05, "loss": 1.5679, "step": 893 }, { "epoch": 0.04, "grad_norm": 6.1697973469581155, "learning_rate": 1.999811079286922e-05, "loss": 1.3121, "step": 894 }, { "epoch": 0.04, "grad_norm": 7.675506559745794, "learning_rate": 1.9998085350100614e-05, "loss": 1.2425, "step": 895 }, { "epoch": 0.04, "grad_norm": 6.009689644410746, "learning_rate": 1.999805973716806e-05, "loss": 1.0485, "step": 896 }, { "epoch": 0.04, "grad_norm": 6.241446875853482, "learning_rate": 1.9998033954071997e-05, "loss": 1.5707, "step": 897 }, { "epoch": 0.04, "grad_norm": 6.80428414964604, "learning_rate": 1.999800800081286e-05, "loss": 1.2991, "step": 898 }, { "epoch": 0.04, "grad_norm": 7.61366199279083, "learning_rate": 1.999798187739109e-05, "loss": 1.7387, "step": 899 }, { "epoch": 0.04, "grad_norm": 5.313297502305878, "learning_rate": 1.999795558380714e-05, "loss": 1.3132, "step": 900 }, { "epoch": 0.04, "grad_norm": 7.3202478094818995, "learning_rate": 1.999792912006145e-05, "loss": 1.7162, "step": 901 }, { "epoch": 0.04, "grad_norm": 7.37181925081806, "learning_rate": 1.9997902486154475e-05, "loss": 1.5806, "step": 902 }, { "epoch": 0.04, "grad_norm": 6.338108099523437, "learning_rate": 1.9997875682086666e-05, "loss": 1.3067, "step": 903 }, { "epoch": 0.04, "grad_norm": 8.758093689236459, "learning_rate": 1.9997848707858476e-05, "loss": 1.1247, "step": 904 }, { "epoch": 0.04, "grad_norm": 7.069832365251186, "learning_rate": 1.9997821563470366e-05, "loss": 1.5612, "step": 905 }, { "epoch": 0.04, "grad_norm": 7.920955496689187, "learning_rate": 1.99977942489228e-05, "loss": 1.7987, "step": 906 }, { "epoch": 0.04, "grad_norm": 8.071893410173617, "learning_rate": 1.9997766764216243e-05, "loss": 1.2922, "step": 907 }, { "epoch": 0.04, "grad_norm": 6.7541456085918705, "learning_rate": 1.9997739109351164e-05, "loss": 1.4874, "step": 908 }, { "epoch": 0.04, "grad_norm": 7.223701762165308, "learning_rate": 1.999771128432803e-05, "loss": 1.5645, "step": 909 }, { "epoch": 0.04, "grad_norm": 8.091512140312028, "learning_rate": 1.9997683289147314e-05, "loss": 1.0261, "step": 910 }, { "epoch": 0.04, "grad_norm": 8.646671091088304, "learning_rate": 1.9997655123809495e-05, "loss": 1.6216, "step": 911 }, { "epoch": 0.04, "grad_norm": 10.17266627377912, "learning_rate": 1.9997626788315056e-05, "loss": 1.3076, "step": 912 }, { "epoch": 0.04, "grad_norm": 7.879290927576934, "learning_rate": 1.9997598282664467e-05, "loss": 1.4595, "step": 913 }, { "epoch": 0.04, "grad_norm": 5.775935161279197, "learning_rate": 1.9997569606858227e-05, "loss": 1.6569, "step": 914 }, { "epoch": 0.04, "grad_norm": 7.834018863311575, "learning_rate": 1.9997540760896817e-05, "loss": 1.0737, "step": 915 }, { "epoch": 0.04, "grad_norm": 8.574795828722781, "learning_rate": 1.9997511744780732e-05, "loss": 1.5492, "step": 916 }, { "epoch": 0.04, "grad_norm": 7.93404331579417, "learning_rate": 1.999748255851046e-05, "loss": 1.6876, "step": 917 }, { "epoch": 0.04, "grad_norm": 6.144081585405643, "learning_rate": 1.99974532020865e-05, "loss": 1.3208, "step": 918 }, { "epoch": 0.04, "grad_norm": 9.365463924681675, "learning_rate": 1.9997423675509354e-05, "loss": 1.6095, "step": 919 }, { "epoch": 0.04, "grad_norm": 15.84847043991765, "learning_rate": 1.9997393978779524e-05, "loss": 1.878, "step": 920 }, { "epoch": 0.04, "grad_norm": 6.625038671080887, "learning_rate": 1.9997364111897512e-05, "loss": 1.3495, "step": 921 }, { "epoch": 0.04, "grad_norm": 9.459081527348825, "learning_rate": 1.999733407486383e-05, "loss": 1.7466, "step": 922 }, { "epoch": 0.04, "grad_norm": 7.798782472513242, "learning_rate": 1.9997303867678986e-05, "loss": 1.2825, "step": 923 }, { "epoch": 0.04, "grad_norm": 9.811219471879745, "learning_rate": 1.99972734903435e-05, "loss": 1.197, "step": 924 }, { "epoch": 0.04, "grad_norm": 7.449292848412114, "learning_rate": 1.9997242942857883e-05, "loss": 1.5957, "step": 925 }, { "epoch": 0.04, "grad_norm": 8.077292673562203, "learning_rate": 1.9997212225222657e-05, "loss": 1.6007, "step": 926 }, { "epoch": 0.04, "grad_norm": 7.920999201052989, "learning_rate": 1.9997181337438346e-05, "loss": 1.7541, "step": 927 }, { "epoch": 0.04, "grad_norm": 7.803679177788871, "learning_rate": 1.9997150279505476e-05, "loss": 1.2353, "step": 928 }, { "epoch": 0.04, "grad_norm": 6.885007517961068, "learning_rate": 1.999711905142457e-05, "loss": 1.3159, "step": 929 }, { "epoch": 0.04, "grad_norm": 7.795986035123478, "learning_rate": 1.9997087653196166e-05, "loss": 1.486, "step": 930 }, { "epoch": 0.04, "grad_norm": 6.694050399489354, "learning_rate": 1.9997056084820797e-05, "loss": 1.3817, "step": 931 }, { "epoch": 0.04, "grad_norm": 10.540591863416799, "learning_rate": 1.9997024346298996e-05, "loss": 1.4426, "step": 932 }, { "epoch": 0.04, "grad_norm": 8.114919773107593, "learning_rate": 1.999699243763131e-05, "loss": 1.4093, "step": 933 }, { "epoch": 0.04, "grad_norm": 8.317358439773628, "learning_rate": 1.9996960358818277e-05, "loss": 1.1704, "step": 934 }, { "epoch": 0.04, "grad_norm": 7.624932923310686, "learning_rate": 1.9996928109860442e-05, "loss": 1.5912, "step": 935 }, { "epoch": 0.04, "grad_norm": 8.52554577414443, "learning_rate": 1.9996895690758358e-05, "loss": 1.48, "step": 936 }, { "epoch": 0.04, "grad_norm": 7.018647240373812, "learning_rate": 1.9996863101512576e-05, "loss": 1.2244, "step": 937 }, { "epoch": 0.04, "grad_norm": 6.9780436074401395, "learning_rate": 1.9996830342123648e-05, "loss": 1.6141, "step": 938 }, { "epoch": 0.04, "grad_norm": 6.816056822306007, "learning_rate": 1.9996797412592137e-05, "loss": 1.1869, "step": 939 }, { "epoch": 0.04, "grad_norm": 8.184321348035454, "learning_rate": 1.99967643129186e-05, "loss": 1.4055, "step": 940 }, { "epoch": 0.04, "grad_norm": 8.261313600473951, "learning_rate": 1.999673104310359e-05, "loss": 1.6616, "step": 941 }, { "epoch": 0.04, "grad_norm": 6.1815032758984145, "learning_rate": 1.9996697603147696e-05, "loss": 1.3349, "step": 942 }, { "epoch": 0.04, "grad_norm": 9.980776032792113, "learning_rate": 1.999666399305147e-05, "loss": 1.5637, "step": 943 }, { "epoch": 0.04, "grad_norm": 6.415747393477452, "learning_rate": 1.9996630212815486e-05, "loss": 1.2148, "step": 944 }, { "epoch": 0.04, "grad_norm": 5.98050192302215, "learning_rate": 1.999659626244032e-05, "loss": 1.3023, "step": 945 }, { "epoch": 0.04, "grad_norm": 7.472954369019742, "learning_rate": 1.9996562141926557e-05, "loss": 1.5546, "step": 946 }, { "epoch": 0.04, "grad_norm": 7.559545023053875, "learning_rate": 1.999652785127477e-05, "loss": 1.6487, "step": 947 }, { "epoch": 0.04, "grad_norm": 8.144467829269692, "learning_rate": 1.9996493390485545e-05, "loss": 1.4299, "step": 948 }, { "epoch": 0.04, "grad_norm": 6.671945924005451, "learning_rate": 1.9996458759559468e-05, "loss": 1.4013, "step": 949 }, { "epoch": 0.04, "grad_norm": 8.3728415567973, "learning_rate": 1.999642395849713e-05, "loss": 1.8359, "step": 950 }, { "epoch": 0.04, "grad_norm": 6.200381167291844, "learning_rate": 1.9996388987299117e-05, "loss": 1.6116, "step": 951 }, { "epoch": 0.04, "grad_norm": 7.141567012674161, "learning_rate": 1.9996353845966033e-05, "loss": 1.3249, "step": 952 }, { "epoch": 0.04, "grad_norm": 7.616278529581532, "learning_rate": 1.999631853449847e-05, "loss": 1.4081, "step": 953 }, { "epoch": 0.04, "grad_norm": 6.010885292179237, "learning_rate": 1.9996283052897035e-05, "loss": 1.2937, "step": 954 }, { "epoch": 0.04, "grad_norm": 6.593531996326042, "learning_rate": 1.9996247401162324e-05, "loss": 1.3638, "step": 955 }, { "epoch": 0.04, "grad_norm": 6.240717284488935, "learning_rate": 1.9996211579294947e-05, "loss": 1.4056, "step": 956 }, { "epoch": 0.04, "grad_norm": 6.393327986303736, "learning_rate": 1.999617558729552e-05, "loss": 1.4116, "step": 957 }, { "epoch": 0.04, "grad_norm": 7.011379452200711, "learning_rate": 1.9996139425164644e-05, "loss": 1.2791, "step": 958 }, { "epoch": 0.04, "grad_norm": 11.075644223576186, "learning_rate": 1.9996103092902944e-05, "loss": 1.5012, "step": 959 }, { "epoch": 0.04, "grad_norm": 9.966514585664333, "learning_rate": 1.9996066590511032e-05, "loss": 1.3111, "step": 960 }, { "epoch": 0.04, "grad_norm": 8.011836781072017, "learning_rate": 1.9996029917989537e-05, "loss": 1.68, "step": 961 }, { "epoch": 0.04, "grad_norm": 9.747085036174793, "learning_rate": 1.999599307533907e-05, "loss": 1.396, "step": 962 }, { "epoch": 0.04, "grad_norm": 8.875664391433856, "learning_rate": 1.9995956062560274e-05, "loss": 1.5381, "step": 963 }, { "epoch": 0.04, "grad_norm": 5.612536592671347, "learning_rate": 1.999591887965377e-05, "loss": 1.3646, "step": 964 }, { "epoch": 0.04, "grad_norm": 6.0717609666381005, "learning_rate": 1.999588152662019e-05, "loss": 1.2591, "step": 965 }, { "epoch": 0.04, "grad_norm": 10.2311529315973, "learning_rate": 1.9995844003460173e-05, "loss": 1.6544, "step": 966 }, { "epoch": 0.04, "grad_norm": 6.683708922212679, "learning_rate": 1.9995806310174357e-05, "loss": 1.1797, "step": 967 }, { "epoch": 0.04, "grad_norm": 5.892594769306384, "learning_rate": 1.9995768446763383e-05, "loss": 1.0544, "step": 968 }, { "epoch": 0.04, "grad_norm": 8.680636210293175, "learning_rate": 1.9995730413227894e-05, "loss": 1.6938, "step": 969 }, { "epoch": 0.04, "grad_norm": 7.679661199812549, "learning_rate": 1.999569220956854e-05, "loss": 1.7427, "step": 970 }, { "epoch": 0.04, "grad_norm": 7.513474271290572, "learning_rate": 1.999565383578597e-05, "loss": 1.3672, "step": 971 }, { "epoch": 0.04, "grad_norm": 6.74898764281988, "learning_rate": 1.9995615291880838e-05, "loss": 1.4479, "step": 972 }, { "epoch": 0.04, "grad_norm": 11.706665898881583, "learning_rate": 1.9995576577853794e-05, "loss": 1.7845, "step": 973 }, { "epoch": 0.04, "grad_norm": 6.444991628194857, "learning_rate": 1.9995537693705506e-05, "loss": 1.5322, "step": 974 }, { "epoch": 0.04, "grad_norm": 11.829607735225315, "learning_rate": 1.9995498639436634e-05, "loss": 1.5932, "step": 975 }, { "epoch": 0.04, "grad_norm": 7.42211434977875, "learning_rate": 1.9995459415047835e-05, "loss": 1.3356, "step": 976 }, { "epoch": 0.04, "grad_norm": 7.837787731391683, "learning_rate": 1.9995420020539783e-05, "loss": 1.684, "step": 977 }, { "epoch": 0.04, "grad_norm": 6.901813918701222, "learning_rate": 1.9995380455913153e-05, "loss": 1.5276, "step": 978 }, { "epoch": 0.04, "grad_norm": 8.79887242832733, "learning_rate": 1.999534072116861e-05, "loss": 1.5357, "step": 979 }, { "epoch": 0.04, "grad_norm": 7.534717527296219, "learning_rate": 1.9995300816306832e-05, "loss": 1.5181, "step": 980 }, { "epoch": 0.04, "grad_norm": 8.040834051294794, "learning_rate": 1.9995260741328496e-05, "loss": 1.5596, "step": 981 }, { "epoch": 0.04, "grad_norm": 6.180512668344116, "learning_rate": 1.9995220496234292e-05, "loss": 1.3328, "step": 982 }, { "epoch": 0.04, "grad_norm": 6.772007216600503, "learning_rate": 1.99951800810249e-05, "loss": 1.5205, "step": 983 }, { "epoch": 0.04, "grad_norm": 6.6853981053446905, "learning_rate": 1.999513949570101e-05, "loss": 1.3577, "step": 984 }, { "epoch": 0.04, "grad_norm": 6.22475066772042, "learning_rate": 1.9995098740263308e-05, "loss": 1.3033, "step": 985 }, { "epoch": 0.04, "grad_norm": 7.833301425126583, "learning_rate": 1.9995057814712488e-05, "loss": 1.5611, "step": 986 }, { "epoch": 0.04, "grad_norm": 8.434479476544725, "learning_rate": 1.9995016719049255e-05, "loss": 1.6259, "step": 987 }, { "epoch": 0.04, "grad_norm": 8.871294747787312, "learning_rate": 1.99949754532743e-05, "loss": 1.4339, "step": 988 }, { "epoch": 0.04, "grad_norm": 8.140778691577871, "learning_rate": 1.9994934017388326e-05, "loss": 1.5787, "step": 989 }, { "epoch": 0.04, "grad_norm": 7.505653973831968, "learning_rate": 1.9994892411392043e-05, "loss": 1.2681, "step": 990 }, { "epoch": 0.04, "grad_norm": 5.700502500654452, "learning_rate": 1.9994850635286153e-05, "loss": 1.2548, "step": 991 }, { "epoch": 0.04, "grad_norm": 6.448845610114987, "learning_rate": 1.9994808689071374e-05, "loss": 1.475, "step": 992 }, { "epoch": 0.04, "grad_norm": 7.137376055457139, "learning_rate": 1.9994766572748413e-05, "loss": 1.5991, "step": 993 }, { "epoch": 0.04, "grad_norm": 10.697492373898864, "learning_rate": 1.9994724286317996e-05, "loss": 1.5039, "step": 994 }, { "epoch": 0.04, "grad_norm": 5.322005883310929, "learning_rate": 1.999468182978083e-05, "loss": 1.1898, "step": 995 }, { "epoch": 0.04, "grad_norm": 6.125618165303836, "learning_rate": 1.9994639203137647e-05, "loss": 1.2526, "step": 996 }, { "epoch": 0.04, "grad_norm": 6.727283980793183, "learning_rate": 1.999459640638917e-05, "loss": 1.3563, "step": 997 }, { "epoch": 0.04, "grad_norm": 8.516814097533212, "learning_rate": 1.9994553439536125e-05, "loss": 1.651, "step": 998 }, { "epoch": 0.04, "grad_norm": 7.174027831910559, "learning_rate": 1.9994510302579248e-05, "loss": 1.445, "step": 999 }, { "epoch": 0.04, "grad_norm": 6.600799823324763, "learning_rate": 1.999446699551927e-05, "loss": 1.242, "step": 1000 }, { "epoch": 0.04, "grad_norm": 6.054816642540922, "learning_rate": 1.9994423518356926e-05, "loss": 1.3757, "step": 1001 }, { "epoch": 0.04, "grad_norm": 11.19061321706073, "learning_rate": 1.999437987109296e-05, "loss": 1.5227, "step": 1002 }, { "epoch": 0.04, "grad_norm": 6.280727576755848, "learning_rate": 1.9994336053728118e-05, "loss": 1.2921, "step": 1003 }, { "epoch": 0.04, "grad_norm": 6.419717502732885, "learning_rate": 1.9994292066263138e-05, "loss": 1.3911, "step": 1004 }, { "epoch": 0.04, "grad_norm": 13.801942725902244, "learning_rate": 1.999424790869877e-05, "loss": 1.8957, "step": 1005 }, { "epoch": 0.04, "grad_norm": 7.873363949094151, "learning_rate": 1.999420358103577e-05, "loss": 1.7875, "step": 1006 }, { "epoch": 0.04, "grad_norm": 6.1484383855677125, "learning_rate": 1.9994159083274886e-05, "loss": 1.3423, "step": 1007 }, { "epoch": 0.04, "grad_norm": 5.896922711767914, "learning_rate": 1.9994114415416884e-05, "loss": 1.0747, "step": 1008 }, { "epoch": 0.04, "grad_norm": 6.178020421771567, "learning_rate": 1.9994069577462517e-05, "loss": 1.3024, "step": 1009 }, { "epoch": 0.04, "grad_norm": 7.426796083788023, "learning_rate": 1.999402456941255e-05, "loss": 1.3702, "step": 1010 }, { "epoch": 0.04, "grad_norm": 7.196536156645012, "learning_rate": 1.9993979391267752e-05, "loss": 1.6199, "step": 1011 }, { "epoch": 0.04, "grad_norm": 9.268881628158054, "learning_rate": 1.9993934043028886e-05, "loss": 1.5233, "step": 1012 }, { "epoch": 0.04, "grad_norm": 7.732571140657865, "learning_rate": 1.999388852469673e-05, "loss": 1.3941, "step": 1013 }, { "epoch": 0.04, "grad_norm": 9.521183511351005, "learning_rate": 1.9993842836272054e-05, "loss": 1.6458, "step": 1014 }, { "epoch": 0.04, "grad_norm": 6.114005870907644, "learning_rate": 1.999379697775564e-05, "loss": 1.376, "step": 1015 }, { "epoch": 0.04, "grad_norm": 8.712721168899753, "learning_rate": 1.9993750949148266e-05, "loss": 1.4283, "step": 1016 }, { "epoch": 0.04, "grad_norm": 6.536587072851685, "learning_rate": 1.999370475045072e-05, "loss": 1.4373, "step": 1017 }, { "epoch": 0.04, "grad_norm": 9.59251515402904, "learning_rate": 1.9993658381663773e-05, "loss": 1.6832, "step": 1018 }, { "epoch": 0.04, "grad_norm": 7.311404974494539, "learning_rate": 1.9993611842788233e-05, "loss": 1.5442, "step": 1019 }, { "epoch": 0.04, "grad_norm": 9.654078000564674, "learning_rate": 1.9993565133824882e-05, "loss": 1.3342, "step": 1020 }, { "epoch": 0.04, "grad_norm": 5.867182038706527, "learning_rate": 1.9993518254774517e-05, "loss": 1.3485, "step": 1021 }, { "epoch": 0.04, "grad_norm": 6.481603750153433, "learning_rate": 1.9993471205637934e-05, "loss": 1.1429, "step": 1022 }, { "epoch": 0.04, "grad_norm": 5.691290119472744, "learning_rate": 1.999342398641594e-05, "loss": 1.3617, "step": 1023 }, { "epoch": 0.04, "grad_norm": 8.788521076069781, "learning_rate": 1.9993376597109332e-05, "loss": 1.3175, "step": 1024 }, { "epoch": 0.04, "grad_norm": 10.858053878268503, "learning_rate": 1.9993329037718916e-05, "loss": 1.2998, "step": 1025 }, { "epoch": 0.04, "grad_norm": 9.147726962331111, "learning_rate": 1.9993281308245507e-05, "loss": 1.8507, "step": 1026 }, { "epoch": 0.04, "grad_norm": 7.046559033580077, "learning_rate": 1.9993233408689915e-05, "loss": 1.5495, "step": 1027 }, { "epoch": 0.04, "grad_norm": 8.497128773485002, "learning_rate": 1.9993185339052956e-05, "loss": 1.3018, "step": 1028 }, { "epoch": 0.04, "grad_norm": 13.00542676787873, "learning_rate": 1.9993137099335445e-05, "loss": 1.8672, "step": 1029 }, { "epoch": 0.04, "grad_norm": 10.014732185832456, "learning_rate": 1.9993088689538206e-05, "loss": 1.4667, "step": 1030 }, { "epoch": 0.04, "grad_norm": 9.444986002334135, "learning_rate": 1.999304010966206e-05, "loss": 1.1086, "step": 1031 }, { "epoch": 0.04, "grad_norm": 6.662606660008082, "learning_rate": 1.9992991359707837e-05, "loss": 1.4395, "step": 1032 }, { "epoch": 0.04, "grad_norm": 8.489408846017827, "learning_rate": 1.999294243967637e-05, "loss": 1.1524, "step": 1033 }, { "epoch": 0.04, "grad_norm": 10.203708282250208, "learning_rate": 1.9992893349568482e-05, "loss": 1.4725, "step": 1034 }, { "epoch": 0.04, "grad_norm": 9.135229336920883, "learning_rate": 1.999284408938501e-05, "loss": 1.6593, "step": 1035 }, { "epoch": 0.04, "grad_norm": 8.035658867456814, "learning_rate": 1.9992794659126806e-05, "loss": 1.4783, "step": 1036 }, { "epoch": 0.04, "grad_norm": 9.54452779435401, "learning_rate": 1.9992745058794694e-05, "loss": 1.5312, "step": 1037 }, { "epoch": 0.04, "grad_norm": 6.7457926274126985, "learning_rate": 1.9992695288389527e-05, "loss": 1.3954, "step": 1038 }, { "epoch": 0.04, "grad_norm": 7.725314644713983, "learning_rate": 1.999264534791215e-05, "loss": 1.2731, "step": 1039 }, { "epoch": 0.04, "grad_norm": 11.177025012280493, "learning_rate": 1.9992595237363415e-05, "loss": 1.6093, "step": 1040 }, { "epoch": 0.04, "grad_norm": 6.587725964785506, "learning_rate": 1.9992544956744174e-05, "loss": 1.3668, "step": 1041 }, { "epoch": 0.04, "grad_norm": 7.827510771105642, "learning_rate": 1.999249450605528e-05, "loss": 1.41, "step": 1042 }, { "epoch": 0.04, "grad_norm": 10.951484119297973, "learning_rate": 1.9992443885297592e-05, "loss": 1.7087, "step": 1043 }, { "epoch": 0.04, "grad_norm": 6.675934905959847, "learning_rate": 1.9992393094471976e-05, "loss": 1.3769, "step": 1044 }, { "epoch": 0.04, "grad_norm": 9.617488507898361, "learning_rate": 1.9992342133579295e-05, "loss": 1.3218, "step": 1045 }, { "epoch": 0.04, "grad_norm": 6.424149798609915, "learning_rate": 1.9992291002620412e-05, "loss": 1.384, "step": 1046 }, { "epoch": 0.04, "grad_norm": 6.125964742977812, "learning_rate": 1.9992239701596202e-05, "loss": 1.6464, "step": 1047 }, { "epoch": 0.04, "grad_norm": 8.711751849837585, "learning_rate": 1.999218823050754e-05, "loss": 1.3026, "step": 1048 }, { "epoch": 0.04, "grad_norm": 7.7981425952959835, "learning_rate": 1.9992136589355292e-05, "loss": 1.4123, "step": 1049 }, { "epoch": 0.04, "grad_norm": 7.174696792080061, "learning_rate": 1.9992084778140347e-05, "loss": 1.5308, "step": 1050 }, { "epoch": 0.04, "grad_norm": 7.15037456468554, "learning_rate": 1.9992032796863584e-05, "loss": 1.4467, "step": 1051 }, { "epoch": 0.04, "grad_norm": 6.769767446998471, "learning_rate": 1.9991980645525882e-05, "loss": 1.3248, "step": 1052 }, { "epoch": 0.04, "grad_norm": 6.989813356447352, "learning_rate": 1.999192832412814e-05, "loss": 1.4501, "step": 1053 }, { "epoch": 0.04, "grad_norm": 8.08183533405334, "learning_rate": 1.999187583267124e-05, "loss": 1.3976, "step": 1054 }, { "epoch": 0.04, "grad_norm": 8.202110512643497, "learning_rate": 1.9991823171156072e-05, "loss": 1.5613, "step": 1055 }, { "epoch": 0.04, "grad_norm": 7.552436474798843, "learning_rate": 1.9991770339583544e-05, "loss": 1.5689, "step": 1056 }, { "epoch": 0.04, "grad_norm": 5.75306964654212, "learning_rate": 1.9991717337954545e-05, "loss": 1.3197, "step": 1057 }, { "epoch": 0.04, "grad_norm": 6.270917445460211, "learning_rate": 1.999166416626998e-05, "loss": 1.3265, "step": 1058 }, { "epoch": 0.04, "grad_norm": 5.846781645425409, "learning_rate": 1.999161082453076e-05, "loss": 1.3841, "step": 1059 }, { "epoch": 0.04, "grad_norm": 8.422739618053509, "learning_rate": 1.9991557312737787e-05, "loss": 1.2695, "step": 1060 }, { "epoch": 0.04, "grad_norm": 5.790371697998435, "learning_rate": 1.9991503630891967e-05, "loss": 1.1158, "step": 1061 }, { "epoch": 0.04, "grad_norm": 9.221746567787163, "learning_rate": 1.9991449778994223e-05, "loss": 1.6151, "step": 1062 }, { "epoch": 0.04, "grad_norm": 6.592692375084283, "learning_rate": 1.9991395757045467e-05, "loss": 1.0144, "step": 1063 }, { "epoch": 0.04, "grad_norm": 7.434188342257871, "learning_rate": 1.999134156504662e-05, "loss": 1.5019, "step": 1064 }, { "epoch": 0.04, "grad_norm": 6.8651413437217945, "learning_rate": 1.9991287202998602e-05, "loss": 1.454, "step": 1065 }, { "epoch": 0.04, "grad_norm": 10.232482694551734, "learning_rate": 1.9991232670902343e-05, "loss": 1.5478, "step": 1066 }, { "epoch": 0.04, "grad_norm": 7.632265934540515, "learning_rate": 1.9991177968758764e-05, "loss": 1.5036, "step": 1067 }, { "epoch": 0.04, "grad_norm": 6.04449328756143, "learning_rate": 1.9991123096568802e-05, "loss": 1.2138, "step": 1068 }, { "epoch": 0.04, "grad_norm": 8.275985218567797, "learning_rate": 1.9991068054333387e-05, "loss": 1.7905, "step": 1069 }, { "epoch": 0.04, "grad_norm": 6.266771284315821, "learning_rate": 1.999101284205346e-05, "loss": 1.4896, "step": 1070 }, { "epoch": 0.04, "grad_norm": 9.00377099296883, "learning_rate": 1.9990957459729956e-05, "loss": 1.1116, "step": 1071 }, { "epoch": 0.04, "grad_norm": 9.550448812268904, "learning_rate": 1.999090190736382e-05, "loss": 1.6304, "step": 1072 }, { "epoch": 0.04, "grad_norm": 6.439208143688307, "learning_rate": 1.9990846184955998e-05, "loss": 1.5372, "step": 1073 }, { "epoch": 0.04, "grad_norm": 6.904235022439844, "learning_rate": 1.9990790292507434e-05, "loss": 1.6194, "step": 1074 }, { "epoch": 0.04, "grad_norm": 5.177638332439812, "learning_rate": 1.9990734230019086e-05, "loss": 1.2, "step": 1075 }, { "epoch": 0.04, "grad_norm": 11.96615675756491, "learning_rate": 1.9990677997491906e-05, "loss": 1.9356, "step": 1076 }, { "epoch": 0.04, "grad_norm": 6.300314222220389, "learning_rate": 1.999062159492685e-05, "loss": 1.3247, "step": 1077 }, { "epoch": 0.04, "grad_norm": 5.3795764096347645, "learning_rate": 1.9990565022324877e-05, "loss": 1.3849, "step": 1078 }, { "epoch": 0.04, "grad_norm": 7.5033674087850635, "learning_rate": 1.9990508279686952e-05, "loss": 1.5566, "step": 1079 }, { "epoch": 0.04, "grad_norm": 9.516357839503339, "learning_rate": 1.9990451367014033e-05, "loss": 1.4577, "step": 1080 }, { "epoch": 0.04, "grad_norm": 9.895091733776823, "learning_rate": 1.9990394284307105e-05, "loss": 1.3428, "step": 1081 }, { "epoch": 0.04, "grad_norm": 4.982195042920531, "learning_rate": 1.9990337031567124e-05, "loss": 1.0707, "step": 1082 }, { "epoch": 0.04, "grad_norm": 6.703179984539122, "learning_rate": 1.999027960879507e-05, "loss": 1.4839, "step": 1083 }, { "epoch": 0.04, "grad_norm": 8.091692019999666, "learning_rate": 1.9990222015991925e-05, "loss": 1.1921, "step": 1084 }, { "epoch": 0.04, "grad_norm": 6.073775033096179, "learning_rate": 1.999016425315866e-05, "loss": 1.1458, "step": 1085 }, { "epoch": 0.04, "grad_norm": 8.893720118063953, "learning_rate": 1.9990106320296266e-05, "loss": 1.2642, "step": 1086 }, { "epoch": 0.04, "grad_norm": 7.829825252599104, "learning_rate": 1.9990048217405725e-05, "loss": 1.366, "step": 1087 }, { "epoch": 0.04, "grad_norm": 7.689235875990117, "learning_rate": 1.9989989944488028e-05, "loss": 1.2178, "step": 1088 }, { "epoch": 0.04, "grad_norm": 9.37470020278995, "learning_rate": 1.998993150154416e-05, "loss": 1.2546, "step": 1089 }, { "epoch": 0.04, "grad_norm": 8.497384595560334, "learning_rate": 1.998987288857513e-05, "loss": 1.493, "step": 1090 }, { "epoch": 0.04, "grad_norm": 7.027493470938268, "learning_rate": 1.9989814105581923e-05, "loss": 1.2675, "step": 1091 }, { "epoch": 0.04, "grad_norm": 6.421792511325926, "learning_rate": 1.998975515256554e-05, "loss": 1.4063, "step": 1092 }, { "epoch": 0.04, "grad_norm": 8.436662927099091, "learning_rate": 1.998969602952699e-05, "loss": 1.4665, "step": 1093 }, { "epoch": 0.04, "grad_norm": 7.0810915679503, "learning_rate": 1.9989636736467278e-05, "loss": 1.3567, "step": 1094 }, { "epoch": 0.04, "grad_norm": 5.641916997220099, "learning_rate": 1.9989577273387412e-05, "loss": 1.251, "step": 1095 }, { "epoch": 0.04, "grad_norm": 9.373019408079179, "learning_rate": 1.9989517640288402e-05, "loss": 1.3061, "step": 1096 }, { "epoch": 0.04, "grad_norm": 8.162205263710801, "learning_rate": 1.9989457837171266e-05, "loss": 1.783, "step": 1097 }, { "epoch": 0.04, "grad_norm": 5.3618504008006935, "learning_rate": 1.998939786403702e-05, "loss": 1.2785, "step": 1098 }, { "epoch": 0.04, "grad_norm": 6.9690183771913095, "learning_rate": 1.9989337720886685e-05, "loss": 1.3401, "step": 1099 }, { "epoch": 0.04, "grad_norm": 6.571866725799089, "learning_rate": 1.9989277407721287e-05, "loss": 1.2627, "step": 1100 }, { "epoch": 0.04, "grad_norm": 5.376809660231449, "learning_rate": 1.998921692454185e-05, "loss": 1.3289, "step": 1101 }, { "epoch": 0.04, "grad_norm": 6.199059706561627, "learning_rate": 1.9989156271349405e-05, "loss": 1.1648, "step": 1102 }, { "epoch": 0.04, "grad_norm": 5.121096665549203, "learning_rate": 1.998909544814498e-05, "loss": 1.2336, "step": 1103 }, { "epoch": 0.04, "grad_norm": 6.943995396406024, "learning_rate": 1.9989034454929616e-05, "loss": 1.4747, "step": 1104 }, { "epoch": 0.04, "grad_norm": 6.553381325124945, "learning_rate": 1.998897329170435e-05, "loss": 1.4922, "step": 1105 }, { "epoch": 0.04, "grad_norm": 6.764048438199058, "learning_rate": 1.998891195847022e-05, "loss": 1.335, "step": 1106 }, { "epoch": 0.04, "grad_norm": 7.041997050681997, "learning_rate": 1.9988850455228273e-05, "loss": 1.4317, "step": 1107 }, { "epoch": 0.04, "grad_norm": 5.960319512562217, "learning_rate": 1.9988788781979554e-05, "loss": 1.327, "step": 1108 }, { "epoch": 0.04, "grad_norm": 7.731941091844931, "learning_rate": 1.9988726938725113e-05, "loss": 1.4945, "step": 1109 }, { "epoch": 0.04, "grad_norm": 7.900597433402245, "learning_rate": 1.9988664925465998e-05, "loss": 1.5907, "step": 1110 }, { "epoch": 0.04, "grad_norm": 6.306397185153227, "learning_rate": 1.9988602742203275e-05, "loss": 1.2107, "step": 1111 }, { "epoch": 0.04, "grad_norm": 6.40629404572251, "learning_rate": 1.9988540388937993e-05, "loss": 1.0309, "step": 1112 }, { "epoch": 0.04, "grad_norm": 8.374400902175227, "learning_rate": 1.9988477865671217e-05, "loss": 1.613, "step": 1113 }, { "epoch": 0.04, "grad_norm": 9.523030994512139, "learning_rate": 1.998841517240401e-05, "loss": 1.4283, "step": 1114 }, { "epoch": 0.04, "grad_norm": 11.830620363542188, "learning_rate": 1.998835230913744e-05, "loss": 1.655, "step": 1115 }, { "epoch": 0.04, "grad_norm": 7.353142651649498, "learning_rate": 1.9988289275872576e-05, "loss": 1.3767, "step": 1116 }, { "epoch": 0.04, "grad_norm": 9.21912231695581, "learning_rate": 1.9988226072610494e-05, "loss": 1.3033, "step": 1117 }, { "epoch": 0.05, "grad_norm": 6.814991805506658, "learning_rate": 1.9988162699352264e-05, "loss": 1.3728, "step": 1118 }, { "epoch": 0.05, "grad_norm": 7.288018848422014, "learning_rate": 1.9988099156098966e-05, "loss": 1.1978, "step": 1119 }, { "epoch": 0.05, "grad_norm": 7.420668415998671, "learning_rate": 1.9988035442851686e-05, "loss": 1.6142, "step": 1120 }, { "epoch": 0.05, "grad_norm": 8.451789256923396, "learning_rate": 1.9987971559611507e-05, "loss": 1.3732, "step": 1121 }, { "epoch": 0.05, "grad_norm": 7.408693781857968, "learning_rate": 1.9987907506379513e-05, "loss": 1.4327, "step": 1122 }, { "epoch": 0.05, "grad_norm": 9.32806376782246, "learning_rate": 1.9987843283156795e-05, "loss": 1.4698, "step": 1123 }, { "epoch": 0.05, "grad_norm": 6.8335292889247725, "learning_rate": 1.9987778889944448e-05, "loss": 1.4701, "step": 1124 }, { "epoch": 0.05, "grad_norm": 7.6765760616675385, "learning_rate": 1.9987714326743566e-05, "loss": 1.0584, "step": 1125 }, { "epoch": 0.05, "grad_norm": 6.871944992225571, "learning_rate": 1.998764959355525e-05, "loss": 1.3631, "step": 1126 }, { "epoch": 0.05, "grad_norm": 7.758705382655558, "learning_rate": 1.99875846903806e-05, "loss": 1.5984, "step": 1127 }, { "epoch": 0.05, "grad_norm": 7.337426195414603, "learning_rate": 1.998751961722072e-05, "loss": 1.5327, "step": 1128 }, { "epoch": 0.05, "grad_norm": 8.045436515490048, "learning_rate": 1.998745437407672e-05, "loss": 1.6327, "step": 1129 }, { "epoch": 0.05, "grad_norm": 7.146993945227617, "learning_rate": 1.998738896094971e-05, "loss": 1.5954, "step": 1130 }, { "epoch": 0.05, "grad_norm": 10.01110302002289, "learning_rate": 1.9987323377840802e-05, "loss": 1.7811, "step": 1131 }, { "epoch": 0.05, "grad_norm": 7.410782251349829, "learning_rate": 1.9987257624751112e-05, "loss": 1.1588, "step": 1132 }, { "epoch": 0.05, "grad_norm": 3.963648512349303, "learning_rate": 1.9987191701681757e-05, "loss": 1.1468, "step": 1133 }, { "epoch": 0.05, "grad_norm": 7.707446085421224, "learning_rate": 1.9987125608633863e-05, "loss": 1.5631, "step": 1134 }, { "epoch": 0.05, "grad_norm": 6.65735247981638, "learning_rate": 1.9987059345608557e-05, "loss": 1.4013, "step": 1135 }, { "epoch": 0.05, "grad_norm": 7.726827874090467, "learning_rate": 1.998699291260696e-05, "loss": 1.7462, "step": 1136 }, { "epoch": 0.05, "grad_norm": 11.7340754261406, "learning_rate": 1.9986926309630208e-05, "loss": 1.4953, "step": 1137 }, { "epoch": 0.05, "grad_norm": 7.017154576768546, "learning_rate": 1.998685953667943e-05, "loss": 1.2793, "step": 1138 }, { "epoch": 0.05, "grad_norm": 7.6742403901590786, "learning_rate": 1.998679259375577e-05, "loss": 1.1419, "step": 1139 }, { "epoch": 0.05, "grad_norm": 7.576496007981129, "learning_rate": 1.9986725480860355e-05, "loss": 1.2548, "step": 1140 }, { "epoch": 0.05, "grad_norm": 6.145869110067948, "learning_rate": 1.998665819799434e-05, "loss": 1.4126, "step": 1141 }, { "epoch": 0.05, "grad_norm": 9.513297955882855, "learning_rate": 1.9986590745158864e-05, "loss": 1.8085, "step": 1142 }, { "epoch": 0.05, "grad_norm": 6.178169495109167, "learning_rate": 1.9986523122355073e-05, "loss": 1.3133, "step": 1143 }, { "epoch": 0.05, "grad_norm": 8.17626904446269, "learning_rate": 1.998645532958412e-05, "loss": 1.3063, "step": 1144 }, { "epoch": 0.05, "grad_norm": 6.556993280300207, "learning_rate": 1.9986387366847164e-05, "loss": 1.0484, "step": 1145 }, { "epoch": 0.05, "grad_norm": 6.659703484377311, "learning_rate": 1.9986319234145356e-05, "loss": 1.4591, "step": 1146 }, { "epoch": 0.05, "grad_norm": 6.240316368792554, "learning_rate": 1.9986250931479857e-05, "loss": 1.3616, "step": 1147 }, { "epoch": 0.05, "grad_norm": 6.995479266482959, "learning_rate": 1.9986182458851824e-05, "loss": 1.4465, "step": 1148 }, { "epoch": 0.05, "grad_norm": 5.754078073630623, "learning_rate": 1.9986113816262433e-05, "loss": 1.3476, "step": 1149 }, { "epoch": 0.05, "grad_norm": 6.213156906259343, "learning_rate": 1.9986045003712845e-05, "loss": 1.3594, "step": 1150 }, { "epoch": 0.05, "grad_norm": 8.44331677429967, "learning_rate": 1.9985976021204236e-05, "loss": 1.5596, "step": 1151 }, { "epoch": 0.05, "grad_norm": 7.722628412584643, "learning_rate": 1.998590686873777e-05, "loss": 1.2554, "step": 1152 }, { "epoch": 0.05, "grad_norm": 7.201822462865456, "learning_rate": 1.9985837546314637e-05, "loss": 1.6201, "step": 1153 }, { "epoch": 0.05, "grad_norm": 6.1481882654429505, "learning_rate": 1.9985768053936012e-05, "loss": 1.3285, "step": 1154 }, { "epoch": 0.05, "grad_norm": 6.697784955203706, "learning_rate": 1.9985698391603076e-05, "loss": 1.2146, "step": 1155 }, { "epoch": 0.05, "grad_norm": 5.575946803991559, "learning_rate": 1.998562855931701e-05, "loss": 1.4095, "step": 1156 }, { "epoch": 0.05, "grad_norm": 6.6831928515601575, "learning_rate": 1.9985558557079013e-05, "loss": 1.5531, "step": 1157 }, { "epoch": 0.05, "grad_norm": 6.055767479507656, "learning_rate": 1.9985488384890268e-05, "loss": 1.153, "step": 1158 }, { "epoch": 0.05, "grad_norm": 5.647366893362251, "learning_rate": 1.9985418042751975e-05, "loss": 1.4351, "step": 1159 }, { "epoch": 0.05, "grad_norm": 7.379921029593668, "learning_rate": 1.9985347530665327e-05, "loss": 1.281, "step": 1160 }, { "epoch": 0.05, "grad_norm": 6.122118194239811, "learning_rate": 1.9985276848631526e-05, "loss": 1.5779, "step": 1161 }, { "epoch": 0.05, "grad_norm": 8.080564586428036, "learning_rate": 1.9985205996651776e-05, "loss": 1.4505, "step": 1162 }, { "epoch": 0.05, "grad_norm": 6.3925020052979225, "learning_rate": 1.998513497472728e-05, "loss": 1.3472, "step": 1163 }, { "epoch": 0.05, "grad_norm": 5.336455626270903, "learning_rate": 1.998506378285925e-05, "loss": 1.3863, "step": 1164 }, { "epoch": 0.05, "grad_norm": 6.466951551893215, "learning_rate": 1.9984992421048898e-05, "loss": 1.1468, "step": 1165 }, { "epoch": 0.05, "grad_norm": 4.858953394755017, "learning_rate": 1.9984920889297432e-05, "loss": 1.4421, "step": 1166 }, { "epoch": 0.05, "grad_norm": 6.400100271260996, "learning_rate": 1.9984849187606078e-05, "loss": 1.3454, "step": 1167 }, { "epoch": 0.05, "grad_norm": 7.120267437658351, "learning_rate": 1.998477731597605e-05, "loss": 1.3566, "step": 1168 }, { "epoch": 0.05, "grad_norm": 6.066949897519249, "learning_rate": 1.9984705274408574e-05, "loss": 1.282, "step": 1169 }, { "epoch": 0.05, "grad_norm": 8.666664242902167, "learning_rate": 1.9984633062904873e-05, "loss": 1.7577, "step": 1170 }, { "epoch": 0.05, "grad_norm": 7.282790140133076, "learning_rate": 1.9984560681466184e-05, "loss": 1.3367, "step": 1171 }, { "epoch": 0.05, "grad_norm": 7.451778489057455, "learning_rate": 1.9984488130093732e-05, "loss": 1.231, "step": 1172 }, { "epoch": 0.05, "grad_norm": 6.793557985622377, "learning_rate": 1.998441540878875e-05, "loss": 1.2448, "step": 1173 }, { "epoch": 0.05, "grad_norm": 5.816243707827969, "learning_rate": 1.9984342517552485e-05, "loss": 1.3335, "step": 1174 }, { "epoch": 0.05, "grad_norm": 10.6671012390821, "learning_rate": 1.998426945638617e-05, "loss": 1.6317, "step": 1175 }, { "epoch": 0.05, "grad_norm": 4.344876307572289, "learning_rate": 1.9984196225291052e-05, "loss": 1.0844, "step": 1176 }, { "epoch": 0.05, "grad_norm": 3.4696141731361325, "learning_rate": 1.9984122824268372e-05, "loss": 0.9681, "step": 1177 }, { "epoch": 0.05, "grad_norm": 7.009083209285846, "learning_rate": 1.9984049253319383e-05, "loss": 1.4656, "step": 1178 }, { "epoch": 0.05, "grad_norm": 8.534199764603587, "learning_rate": 1.9983975512445337e-05, "loss": 1.6483, "step": 1179 }, { "epoch": 0.05, "grad_norm": 11.50637888443161, "learning_rate": 1.998390160164749e-05, "loss": 1.6886, "step": 1180 }, { "epoch": 0.05, "grad_norm": 6.642440148319819, "learning_rate": 1.99838275209271e-05, "loss": 1.5254, "step": 1181 }, { "epoch": 0.05, "grad_norm": 8.678685568390385, "learning_rate": 1.9983753270285423e-05, "loss": 1.7299, "step": 1182 }, { "epoch": 0.05, "grad_norm": 8.50557903745635, "learning_rate": 1.998367884972373e-05, "loss": 1.7553, "step": 1183 }, { "epoch": 0.05, "grad_norm": 10.985319747550658, "learning_rate": 1.9983604259243284e-05, "loss": 1.482, "step": 1184 }, { "epoch": 0.05, "grad_norm": 9.555131735123524, "learning_rate": 1.9983529498845355e-05, "loss": 1.3364, "step": 1185 }, { "epoch": 0.05, "grad_norm": 6.143094192730966, "learning_rate": 1.9983454568531215e-05, "loss": 1.5031, "step": 1186 }, { "epoch": 0.05, "grad_norm": 6.310516474342615, "learning_rate": 1.9983379468302136e-05, "loss": 1.4171, "step": 1187 }, { "epoch": 0.05, "grad_norm": 5.0725500170010855, "learning_rate": 1.9983304198159404e-05, "loss": 0.9257, "step": 1188 }, { "epoch": 0.05, "grad_norm": 5.083362159453924, "learning_rate": 1.9983228758104295e-05, "loss": 1.1782, "step": 1189 }, { "epoch": 0.05, "grad_norm": 8.463812605050002, "learning_rate": 1.9983153148138094e-05, "loss": 1.6367, "step": 1190 }, { "epoch": 0.05, "grad_norm": 8.618756562321986, "learning_rate": 1.9983077368262084e-05, "loss": 1.6135, "step": 1191 }, { "epoch": 0.05, "grad_norm": 5.545773539425761, "learning_rate": 1.998300141847756e-05, "loss": 1.2085, "step": 1192 }, { "epoch": 0.05, "grad_norm": 9.125736597001142, "learning_rate": 1.9982925298785813e-05, "loss": 1.6518, "step": 1193 }, { "epoch": 0.05, "grad_norm": 5.8792477957971245, "learning_rate": 1.9982849009188138e-05, "loss": 1.3272, "step": 1194 }, { "epoch": 0.05, "grad_norm": 6.952758240196858, "learning_rate": 1.9982772549685834e-05, "loss": 1.6702, "step": 1195 }, { "epoch": 0.05, "grad_norm": 6.558048025667784, "learning_rate": 1.9982695920280203e-05, "loss": 1.3478, "step": 1196 }, { "epoch": 0.05, "grad_norm": 6.130211683446369, "learning_rate": 1.9982619120972547e-05, "loss": 1.4685, "step": 1197 }, { "epoch": 0.05, "grad_norm": 6.94752275980667, "learning_rate": 1.9982542151764176e-05, "loss": 1.5225, "step": 1198 }, { "epoch": 0.05, "grad_norm": 7.051667879884348, "learning_rate": 1.9982465012656395e-05, "loss": 1.1539, "step": 1199 }, { "epoch": 0.05, "grad_norm": 6.111269567684271, "learning_rate": 1.998238770365052e-05, "loss": 1.412, "step": 1200 }, { "epoch": 0.05, "grad_norm": 7.781262453022093, "learning_rate": 1.998231022474787e-05, "loss": 1.2496, "step": 1201 }, { "epoch": 0.05, "grad_norm": 7.797413020456786, "learning_rate": 1.9982232575949764e-05, "loss": 1.2036, "step": 1202 }, { "epoch": 0.05, "grad_norm": 6.015556562711532, "learning_rate": 1.9982154757257515e-05, "loss": 1.4245, "step": 1203 }, { "epoch": 0.05, "grad_norm": 5.866238147136575, "learning_rate": 1.9982076768672453e-05, "loss": 1.2116, "step": 1204 }, { "epoch": 0.05, "grad_norm": 5.561758853761197, "learning_rate": 1.998199861019591e-05, "loss": 1.4311, "step": 1205 }, { "epoch": 0.05, "grad_norm": 6.174754019457683, "learning_rate": 1.9981920281829207e-05, "loss": 1.3279, "step": 1206 }, { "epoch": 0.05, "grad_norm": 5.619812424309057, "learning_rate": 1.998184178357368e-05, "loss": 1.4269, "step": 1207 }, { "epoch": 0.05, "grad_norm": 7.394987705318756, "learning_rate": 1.9981763115430666e-05, "loss": 1.6773, "step": 1208 }, { "epoch": 0.05, "grad_norm": 8.426016299014888, "learning_rate": 1.9981684277401506e-05, "loss": 1.3274, "step": 1209 }, { "epoch": 0.05, "grad_norm": 6.294887037371584, "learning_rate": 1.9981605269487543e-05, "loss": 1.5305, "step": 1210 }, { "epoch": 0.05, "grad_norm": 7.796464300055534, "learning_rate": 1.9981526091690117e-05, "loss": 1.5138, "step": 1211 }, { "epoch": 0.05, "grad_norm": 6.893280444083912, "learning_rate": 1.9981446744010577e-05, "loss": 1.3877, "step": 1212 }, { "epoch": 0.05, "grad_norm": 6.212464041628949, "learning_rate": 1.9981367226450272e-05, "loss": 1.2192, "step": 1213 }, { "epoch": 0.05, "grad_norm": 7.352197947992044, "learning_rate": 1.998128753901056e-05, "loss": 1.2298, "step": 1214 }, { "epoch": 0.05, "grad_norm": 7.4543117523989, "learning_rate": 1.998120768169279e-05, "loss": 1.3555, "step": 1215 }, { "epoch": 0.05, "grad_norm": 7.132273085524825, "learning_rate": 1.9981127654498335e-05, "loss": 1.2322, "step": 1216 }, { "epoch": 0.05, "grad_norm": 4.388557301176299, "learning_rate": 1.9981047457428538e-05, "loss": 1.1142, "step": 1217 }, { "epoch": 0.05, "grad_norm": 8.177780432484276, "learning_rate": 1.9980967090484776e-05, "loss": 1.4194, "step": 1218 }, { "epoch": 0.05, "grad_norm": 6.676748628908222, "learning_rate": 1.9980886553668418e-05, "loss": 1.3767, "step": 1219 }, { "epoch": 0.05, "grad_norm": 7.278656816329185, "learning_rate": 1.9980805846980828e-05, "loss": 1.4169, "step": 1220 }, { "epoch": 0.05, "grad_norm": 6.608800775152978, "learning_rate": 1.9980724970423383e-05, "loss": 1.2306, "step": 1221 }, { "epoch": 0.05, "grad_norm": 10.035396988791753, "learning_rate": 1.9980643923997462e-05, "loss": 1.4411, "step": 1222 }, { "epoch": 0.05, "grad_norm": 7.000639820694462, "learning_rate": 1.9980562707704436e-05, "loss": 1.3169, "step": 1223 }, { "epoch": 0.05, "grad_norm": 6.81233099634652, "learning_rate": 1.9980481321545695e-05, "loss": 1.3637, "step": 1224 }, { "epoch": 0.05, "grad_norm": 6.842308611076228, "learning_rate": 1.998039976552262e-05, "loss": 1.5162, "step": 1225 }, { "epoch": 0.05, "grad_norm": 6.58499874189754, "learning_rate": 1.9980318039636607e-05, "loss": 1.4434, "step": 1226 }, { "epoch": 0.05, "grad_norm": 7.367101475806799, "learning_rate": 1.9980236143889038e-05, "loss": 1.6265, "step": 1227 }, { "epoch": 0.05, "grad_norm": 9.98801771822008, "learning_rate": 1.998015407828131e-05, "loss": 1.5983, "step": 1228 }, { "epoch": 0.05, "grad_norm": 7.009980584536178, "learning_rate": 1.998007184281482e-05, "loss": 1.378, "step": 1229 }, { "epoch": 0.05, "grad_norm": 7.495230931097211, "learning_rate": 1.9979989437490967e-05, "loss": 1.3763, "step": 1230 }, { "epoch": 0.05, "grad_norm": 5.519108385378533, "learning_rate": 1.9979906862311153e-05, "loss": 1.0846, "step": 1231 }, { "epoch": 0.05, "grad_norm": 5.997626650944431, "learning_rate": 1.9979824117276784e-05, "loss": 1.436, "step": 1232 }, { "epoch": 0.05, "grad_norm": 6.327859077093838, "learning_rate": 1.9979741202389266e-05, "loss": 1.4388, "step": 1233 }, { "epoch": 0.05, "grad_norm": 5.721771545577175, "learning_rate": 1.997965811765002e-05, "loss": 1.3019, "step": 1234 }, { "epoch": 0.05, "grad_norm": 9.48673392795667, "learning_rate": 1.9979574863060444e-05, "loss": 1.3877, "step": 1235 }, { "epoch": 0.05, "grad_norm": 9.276455521815421, "learning_rate": 1.9979491438621967e-05, "loss": 1.4666, "step": 1236 }, { "epoch": 0.05, "grad_norm": 5.403137209355509, "learning_rate": 1.9979407844336008e-05, "loss": 1.228, "step": 1237 }, { "epoch": 0.05, "grad_norm": 8.021694275945022, "learning_rate": 1.9979324080203987e-05, "loss": 1.3729, "step": 1238 }, { "epoch": 0.05, "grad_norm": 6.491883420784804, "learning_rate": 1.9979240146227326e-05, "loss": 1.2864, "step": 1239 }, { "epoch": 0.05, "grad_norm": 4.742516425050354, "learning_rate": 1.997915604240746e-05, "loss": 1.1955, "step": 1240 }, { "epoch": 0.05, "grad_norm": 7.438750471792213, "learning_rate": 1.9979071768745816e-05, "loss": 1.2317, "step": 1241 }, { "epoch": 0.05, "grad_norm": 4.840317232397019, "learning_rate": 1.9978987325243834e-05, "loss": 1.0969, "step": 1242 }, { "epoch": 0.05, "grad_norm": 6.9528970834182, "learning_rate": 1.9978902711902944e-05, "loss": 1.2987, "step": 1243 }, { "epoch": 0.05, "grad_norm": 5.521246901237202, "learning_rate": 1.9978817928724588e-05, "loss": 1.2004, "step": 1244 }, { "epoch": 0.05, "grad_norm": 6.865414721252148, "learning_rate": 1.997873297571021e-05, "loss": 1.2301, "step": 1245 }, { "epoch": 0.05, "grad_norm": 5.49196308520183, "learning_rate": 1.997864785286126e-05, "loss": 1.2564, "step": 1246 }, { "epoch": 0.05, "grad_norm": 5.505189403364644, "learning_rate": 1.9978562560179183e-05, "loss": 1.4784, "step": 1247 }, { "epoch": 0.05, "grad_norm": 6.182909692563706, "learning_rate": 1.997847709766543e-05, "loss": 1.5122, "step": 1248 }, { "epoch": 0.05, "grad_norm": 7.702263858571601, "learning_rate": 1.9978391465321453e-05, "loss": 1.1709, "step": 1249 }, { "epoch": 0.05, "grad_norm": 9.204931571719213, "learning_rate": 1.9978305663148717e-05, "loss": 1.4563, "step": 1250 }, { "epoch": 0.05, "grad_norm": 6.532946374106809, "learning_rate": 1.9978219691148676e-05, "loss": 1.5769, "step": 1251 }, { "epoch": 0.05, "grad_norm": 7.299408141140899, "learning_rate": 1.9978133549322792e-05, "loss": 1.6734, "step": 1252 }, { "epoch": 0.05, "grad_norm": 4.048391828637755, "learning_rate": 1.997804723767254e-05, "loss": 1.1673, "step": 1253 }, { "epoch": 0.05, "grad_norm": 5.308771093030084, "learning_rate": 1.997796075619938e-05, "loss": 1.281, "step": 1254 }, { "epoch": 0.05, "grad_norm": 4.135544222229945, "learning_rate": 1.997787410490479e-05, "loss": 1.1979, "step": 1255 }, { "epoch": 0.05, "grad_norm": 7.109479217772318, "learning_rate": 1.997778728379024e-05, "loss": 1.5992, "step": 1256 }, { "epoch": 0.05, "grad_norm": 4.9237348275568715, "learning_rate": 1.9977700292857207e-05, "loss": 1.2393, "step": 1257 }, { "epoch": 0.05, "grad_norm": 5.993804642586544, "learning_rate": 1.9977613132107177e-05, "loss": 1.5255, "step": 1258 }, { "epoch": 0.05, "grad_norm": 7.798595554244061, "learning_rate": 1.9977525801541628e-05, "loss": 1.4897, "step": 1259 }, { "epoch": 0.05, "grad_norm": 5.104167810309012, "learning_rate": 1.997743830116205e-05, "loss": 1.1029, "step": 1260 }, { "epoch": 0.05, "grad_norm": 6.4435354330140395, "learning_rate": 1.9977350630969933e-05, "loss": 1.477, "step": 1261 }, { "epoch": 0.05, "grad_norm": 5.924202902590135, "learning_rate": 1.9977262790966768e-05, "loss": 1.1487, "step": 1262 }, { "epoch": 0.05, "grad_norm": 5.594166592787112, "learning_rate": 1.9977174781154046e-05, "loss": 0.9261, "step": 1263 }, { "epoch": 0.05, "grad_norm": 9.289278572250316, "learning_rate": 1.9977086601533272e-05, "loss": 1.7257, "step": 1264 }, { "epoch": 0.05, "grad_norm": 5.781225272463888, "learning_rate": 1.997699825210594e-05, "loss": 1.4252, "step": 1265 }, { "epoch": 0.05, "grad_norm": 8.984428742447479, "learning_rate": 1.9976909732873556e-05, "loss": 1.602, "step": 1266 }, { "epoch": 0.05, "grad_norm": 6.021206146700822, "learning_rate": 1.997682104383763e-05, "loss": 1.4888, "step": 1267 }, { "epoch": 0.05, "grad_norm": 7.280136120764609, "learning_rate": 1.9976732184999665e-05, "loss": 1.6424, "step": 1268 }, { "epoch": 0.05, "grad_norm": 6.251768817059288, "learning_rate": 1.997664315636118e-05, "loss": 1.451, "step": 1269 }, { "epoch": 0.05, "grad_norm": 12.174717110567478, "learning_rate": 1.9976553957923688e-05, "loss": 1.0553, "step": 1270 }, { "epoch": 0.05, "grad_norm": 8.438107690991323, "learning_rate": 1.99764645896887e-05, "loss": 1.4878, "step": 1271 }, { "epoch": 0.05, "grad_norm": 6.662200244215648, "learning_rate": 1.9976375051657753e-05, "loss": 1.4619, "step": 1272 }, { "epoch": 0.05, "grad_norm": 6.548822899378578, "learning_rate": 1.9976285343832353e-05, "loss": 1.1558, "step": 1273 }, { "epoch": 0.05, "grad_norm": 6.636682695192623, "learning_rate": 1.9976195466214042e-05, "loss": 1.323, "step": 1274 }, { "epoch": 0.05, "grad_norm": 8.130434143017432, "learning_rate": 1.9976105418804338e-05, "loss": 1.3274, "step": 1275 }, { "epoch": 0.05, "grad_norm": 8.545043591919084, "learning_rate": 1.997601520160478e-05, "loss": 1.4263, "step": 1276 }, { "epoch": 0.05, "grad_norm": 9.412584827796017, "learning_rate": 1.99759248146169e-05, "loss": 1.6126, "step": 1277 }, { "epoch": 0.05, "grad_norm": 7.42135890707537, "learning_rate": 1.9975834257842242e-05, "loss": 1.3288, "step": 1278 }, { "epoch": 0.05, "grad_norm": 7.13431366627439, "learning_rate": 1.997574353128234e-05, "loss": 1.3081, "step": 1279 }, { "epoch": 0.05, "grad_norm": 9.001090135687505, "learning_rate": 1.997565263493874e-05, "loss": 1.4558, "step": 1280 }, { "epoch": 0.05, "grad_norm": 8.635698819584958, "learning_rate": 1.9975561568812994e-05, "loss": 1.2734, "step": 1281 }, { "epoch": 0.05, "grad_norm": 7.9314357646433615, "learning_rate": 1.997547033290665e-05, "loss": 1.4843, "step": 1282 }, { "epoch": 0.05, "grad_norm": 7.480072415970632, "learning_rate": 1.9975378927221257e-05, "loss": 1.5015, "step": 1283 }, { "epoch": 0.05, "grad_norm": 7.505497398699267, "learning_rate": 1.9975287351758373e-05, "loss": 1.3908, "step": 1284 }, { "epoch": 0.05, "grad_norm": 4.902488915356631, "learning_rate": 1.9975195606519556e-05, "loss": 1.1291, "step": 1285 }, { "epoch": 0.05, "grad_norm": 7.941932615270398, "learning_rate": 1.9975103691506368e-05, "loss": 1.3984, "step": 1286 }, { "epoch": 0.05, "grad_norm": 8.47877951575868, "learning_rate": 1.9975011606720374e-05, "loss": 0.9977, "step": 1287 }, { "epoch": 0.05, "grad_norm": 8.566348842636724, "learning_rate": 1.997491935216314e-05, "loss": 1.5744, "step": 1288 }, { "epoch": 0.05, "grad_norm": 6.223989082166951, "learning_rate": 1.9974826927836236e-05, "loss": 1.3923, "step": 1289 }, { "epoch": 0.05, "grad_norm": 6.339019579066466, "learning_rate": 1.997473433374124e-05, "loss": 1.5108, "step": 1290 }, { "epoch": 0.05, "grad_norm": 7.319203663600513, "learning_rate": 1.997464156987972e-05, "loss": 1.5018, "step": 1291 }, { "epoch": 0.05, "grad_norm": 6.526390276771469, "learning_rate": 1.997454863625326e-05, "loss": 1.2867, "step": 1292 }, { "epoch": 0.05, "grad_norm": 6.668274949695046, "learning_rate": 1.997445553286344e-05, "loss": 1.3103, "step": 1293 }, { "epoch": 0.05, "grad_norm": 7.672380716918383, "learning_rate": 1.9974362259711844e-05, "loss": 1.5279, "step": 1294 }, { "epoch": 0.05, "grad_norm": 6.990287659986135, "learning_rate": 1.997426881680006e-05, "loss": 1.4137, "step": 1295 }, { "epoch": 0.05, "grad_norm": 9.861426070700963, "learning_rate": 1.997417520412968e-05, "loss": 1.8529, "step": 1296 }, { "epoch": 0.05, "grad_norm": 7.619709724346227, "learning_rate": 1.9974081421702296e-05, "loss": 1.342, "step": 1297 }, { "epoch": 0.05, "grad_norm": 7.281780397698333, "learning_rate": 1.9973987469519504e-05, "loss": 1.4139, "step": 1298 }, { "epoch": 0.05, "grad_norm": 7.309080637932666, "learning_rate": 1.9973893347582904e-05, "loss": 1.6374, "step": 1299 }, { "epoch": 0.05, "grad_norm": 6.04046855663974, "learning_rate": 1.9973799055894094e-05, "loss": 1.603, "step": 1300 }, { "epoch": 0.05, "grad_norm": 6.500141862449772, "learning_rate": 1.997370459445468e-05, "loss": 1.5292, "step": 1301 }, { "epoch": 0.05, "grad_norm": 5.922333711416567, "learning_rate": 1.9973609963266277e-05, "loss": 1.2449, "step": 1302 }, { "epoch": 0.05, "grad_norm": 9.276098913412875, "learning_rate": 1.997351516233049e-05, "loss": 1.5701, "step": 1303 }, { "epoch": 0.05, "grad_norm": 5.414216014922543, "learning_rate": 1.9973420191648928e-05, "loss": 1.3047, "step": 1304 }, { "epoch": 0.05, "grad_norm": 6.840338640641262, "learning_rate": 1.9973325051223215e-05, "loss": 1.7686, "step": 1305 }, { "epoch": 0.05, "grad_norm": 8.549427471224725, "learning_rate": 1.997322974105497e-05, "loss": 1.4307, "step": 1306 }, { "epoch": 0.05, "grad_norm": 7.877854618742849, "learning_rate": 1.9973134261145807e-05, "loss": 1.4297, "step": 1307 }, { "epoch": 0.05, "grad_norm": 5.835052890888657, "learning_rate": 1.997303861149736e-05, "loss": 1.2405, "step": 1308 }, { "epoch": 0.05, "grad_norm": 5.870056983620416, "learning_rate": 1.997294279211125e-05, "loss": 1.2845, "step": 1309 }, { "epoch": 0.05, "grad_norm": 5.136738218515481, "learning_rate": 1.9972846802989113e-05, "loss": 1.3764, "step": 1310 }, { "epoch": 0.05, "grad_norm": 6.5352214864921745, "learning_rate": 1.997275064413258e-05, "loss": 1.3409, "step": 1311 }, { "epoch": 0.05, "grad_norm": 6.149582103843652, "learning_rate": 1.9972654315543293e-05, "loss": 1.3812, "step": 1312 }, { "epoch": 0.05, "grad_norm": 7.398368386300991, "learning_rate": 1.9972557817222885e-05, "loss": 1.2974, "step": 1313 }, { "epoch": 0.05, "grad_norm": 9.248684577690094, "learning_rate": 1.9972461149172997e-05, "loss": 1.373, "step": 1314 }, { "epoch": 0.05, "grad_norm": 6.048471522523522, "learning_rate": 1.997236431139528e-05, "loss": 1.5065, "step": 1315 }, { "epoch": 0.05, "grad_norm": 7.842490955727602, "learning_rate": 1.9972267303891378e-05, "loss": 1.4348, "step": 1316 }, { "epoch": 0.05, "grad_norm": 6.367193806942393, "learning_rate": 1.9972170126662945e-05, "loss": 1.3992, "step": 1317 }, { "epoch": 0.05, "grad_norm": 7.339883110487712, "learning_rate": 1.9972072779711634e-05, "loss": 1.6242, "step": 1318 }, { "epoch": 0.05, "grad_norm": 8.185983676218573, "learning_rate": 1.9971975263039102e-05, "loss": 1.0995, "step": 1319 }, { "epoch": 0.05, "grad_norm": 6.205812040527204, "learning_rate": 1.9971877576647005e-05, "loss": 1.2978, "step": 1320 }, { "epoch": 0.05, "grad_norm": 3.702314679345192, "learning_rate": 1.997177972053701e-05, "loss": 0.9797, "step": 1321 }, { "epoch": 0.05, "grad_norm": 6.587133445717258, "learning_rate": 1.9971681694710784e-05, "loss": 1.3709, "step": 1322 }, { "epoch": 0.05, "grad_norm": 5.408010095518702, "learning_rate": 1.9971583499169988e-05, "loss": 1.325, "step": 1323 }, { "epoch": 0.05, "grad_norm": 6.668658527773487, "learning_rate": 1.99714851339163e-05, "loss": 1.2033, "step": 1324 }, { "epoch": 0.05, "grad_norm": 6.909971208772203, "learning_rate": 1.9971386598951395e-05, "loss": 1.2628, "step": 1325 }, { "epoch": 0.05, "grad_norm": 6.061806241487598, "learning_rate": 1.9971287894276946e-05, "loss": 1.2654, "step": 1326 }, { "epoch": 0.05, "grad_norm": 6.0032750031536075, "learning_rate": 1.997118901989463e-05, "loss": 1.2757, "step": 1327 }, { "epoch": 0.05, "grad_norm": 7.1985029655935335, "learning_rate": 1.9971089975806135e-05, "loss": 1.4569, "step": 1328 }, { "epoch": 0.05, "grad_norm": 5.657985684216344, "learning_rate": 1.997099076201315e-05, "loss": 1.4145, "step": 1329 }, { "epoch": 0.05, "grad_norm": 5.226220000720181, "learning_rate": 1.9970891378517355e-05, "loss": 1.2174, "step": 1330 }, { "epoch": 0.05, "grad_norm": 9.750864239524953, "learning_rate": 1.9970791825320444e-05, "loss": 1.4603, "step": 1331 }, { "epoch": 0.05, "grad_norm": 6.958585449582653, "learning_rate": 1.9970692102424115e-05, "loss": 1.4582, "step": 1332 }, { "epoch": 0.05, "grad_norm": 6.35505735223703, "learning_rate": 1.9970592209830064e-05, "loss": 1.333, "step": 1333 }, { "epoch": 0.05, "grad_norm": 9.78835772535074, "learning_rate": 1.997049214753999e-05, "loss": 1.6088, "step": 1334 }, { "epoch": 0.05, "grad_norm": 6.159407594791994, "learning_rate": 1.9970391915555595e-05, "loss": 1.4315, "step": 1335 }, { "epoch": 0.05, "grad_norm": 5.905671630994518, "learning_rate": 1.997029151387859e-05, "loss": 1.3165, "step": 1336 }, { "epoch": 0.05, "grad_norm": 6.631150416929365, "learning_rate": 1.9970190942510674e-05, "loss": 1.4673, "step": 1337 }, { "epoch": 0.05, "grad_norm": 5.200698391171232, "learning_rate": 1.997009020145357e-05, "loss": 1.2484, "step": 1338 }, { "epoch": 0.05, "grad_norm": 8.392567442450567, "learning_rate": 1.9969989290708982e-05, "loss": 1.5052, "step": 1339 }, { "epoch": 0.05, "grad_norm": 8.794361035108619, "learning_rate": 1.9969888210278637e-05, "loss": 1.3799, "step": 1340 }, { "epoch": 0.05, "grad_norm": 6.465148174357662, "learning_rate": 1.996978696016425e-05, "loss": 1.5378, "step": 1341 }, { "epoch": 0.05, "grad_norm": 7.045859333313788, "learning_rate": 1.9969685540367544e-05, "loss": 1.6475, "step": 1342 }, { "epoch": 0.05, "grad_norm": 6.5918414474142955, "learning_rate": 1.9969583950890245e-05, "loss": 1.4184, "step": 1343 }, { "epoch": 0.05, "grad_norm": 7.825143488116609, "learning_rate": 1.9969482191734086e-05, "loss": 1.6977, "step": 1344 }, { "epoch": 0.05, "grad_norm": 6.674388114273383, "learning_rate": 1.9969380262900796e-05, "loss": 1.3704, "step": 1345 }, { "epoch": 0.05, "grad_norm": 6.616565869653101, "learning_rate": 1.9969278164392116e-05, "loss": 1.3853, "step": 1346 }, { "epoch": 0.05, "grad_norm": 7.431165634106452, "learning_rate": 1.996917589620977e-05, "loss": 1.5915, "step": 1347 }, { "epoch": 0.05, "grad_norm": 6.948670467241131, "learning_rate": 1.996907345835551e-05, "loss": 1.4315, "step": 1348 }, { "epoch": 0.05, "grad_norm": 6.871450895343091, "learning_rate": 1.996897085083107e-05, "loss": 1.2682, "step": 1349 }, { "epoch": 0.05, "grad_norm": 9.434795215898948, "learning_rate": 1.996886807363821e-05, "loss": 1.634, "step": 1350 }, { "epoch": 0.05, "grad_norm": 5.974471061751191, "learning_rate": 1.9968765126778666e-05, "loss": 1.3612, "step": 1351 }, { "epoch": 0.05, "grad_norm": 9.598605382044358, "learning_rate": 1.9968662010254198e-05, "loss": 1.7867, "step": 1352 }, { "epoch": 0.05, "grad_norm": 5.991346003621144, "learning_rate": 1.996855872406656e-05, "loss": 1.3096, "step": 1353 }, { "epoch": 0.05, "grad_norm": 7.135931785313358, "learning_rate": 1.9968455268217506e-05, "loss": 1.1562, "step": 1354 }, { "epoch": 0.05, "grad_norm": 9.163616841890807, "learning_rate": 1.99683516427088e-05, "loss": 1.6955, "step": 1355 }, { "epoch": 0.05, "grad_norm": 7.504699794197886, "learning_rate": 1.9968247847542205e-05, "loss": 1.562, "step": 1356 }, { "epoch": 0.05, "grad_norm": 4.174707742827942, "learning_rate": 1.9968143882719486e-05, "loss": 1.1909, "step": 1357 }, { "epoch": 0.05, "grad_norm": 7.756016632837737, "learning_rate": 1.996803974824242e-05, "loss": 1.3687, "step": 1358 }, { "epoch": 0.05, "grad_norm": 11.659431660539806, "learning_rate": 1.9967935444112766e-05, "loss": 1.573, "step": 1359 }, { "epoch": 0.05, "grad_norm": 8.397334151266401, "learning_rate": 1.9967830970332305e-05, "loss": 1.2908, "step": 1360 }, { "epoch": 0.05, "grad_norm": 7.019052399944366, "learning_rate": 1.9967726326902825e-05, "loss": 1.395, "step": 1361 }, { "epoch": 0.05, "grad_norm": 5.179139620525485, "learning_rate": 1.9967621513826092e-05, "loss": 1.2932, "step": 1362 }, { "epoch": 0.05, "grad_norm": 9.62573276640963, "learning_rate": 1.99675165311039e-05, "loss": 1.6737, "step": 1363 }, { "epoch": 0.05, "grad_norm": 7.473040022097914, "learning_rate": 1.996741137873803e-05, "loss": 1.2636, "step": 1364 }, { "epoch": 0.05, "grad_norm": 4.422617529299315, "learning_rate": 1.9967306056730276e-05, "loss": 1.1477, "step": 1365 }, { "epoch": 0.06, "grad_norm": 6.69153725119968, "learning_rate": 1.9967200565082426e-05, "loss": 1.3223, "step": 1366 }, { "epoch": 0.06, "grad_norm": 9.260358551013038, "learning_rate": 1.9967094903796282e-05, "loss": 1.744, "step": 1367 }, { "epoch": 0.06, "grad_norm": 7.053172632007911, "learning_rate": 1.9966989072873636e-05, "loss": 1.2347, "step": 1368 }, { "epoch": 0.06, "grad_norm": 5.628946146524508, "learning_rate": 1.996688307231629e-05, "loss": 1.1935, "step": 1369 }, { "epoch": 0.06, "grad_norm": 9.7071394218458, "learning_rate": 1.996677690212605e-05, "loss": 1.677, "step": 1370 }, { "epoch": 0.06, "grad_norm": 4.011482228103249, "learning_rate": 1.996667056230472e-05, "loss": 1.0503, "step": 1371 }, { "epoch": 0.06, "grad_norm": 8.294646943173595, "learning_rate": 1.996656405285412e-05, "loss": 1.6745, "step": 1372 }, { "epoch": 0.06, "grad_norm": 7.359235718467013, "learning_rate": 1.9966457373776048e-05, "loss": 1.5218, "step": 1373 }, { "epoch": 0.06, "grad_norm": 7.083855847892963, "learning_rate": 1.9966350525072328e-05, "loss": 1.6229, "step": 1374 }, { "epoch": 0.06, "grad_norm": 6.903834660424176, "learning_rate": 1.996624350674478e-05, "loss": 1.4559, "step": 1375 }, { "epoch": 0.06, "grad_norm": 8.982463557818338, "learning_rate": 1.9966136318795216e-05, "loss": 1.4578, "step": 1376 }, { "epoch": 0.06, "grad_norm": 8.133956956738988, "learning_rate": 1.9966028961225475e-05, "loss": 1.2484, "step": 1377 }, { "epoch": 0.06, "grad_norm": 5.871959477089958, "learning_rate": 1.996592143403737e-05, "loss": 1.4797, "step": 1378 }, { "epoch": 0.06, "grad_norm": 7.681588409948699, "learning_rate": 1.996581373723274e-05, "loss": 1.3188, "step": 1379 }, { "epoch": 0.06, "grad_norm": 6.342168302014338, "learning_rate": 1.9965705870813416e-05, "loss": 1.6737, "step": 1380 }, { "epoch": 0.06, "grad_norm": 10.337915034132228, "learning_rate": 1.9965597834781235e-05, "loss": 1.7259, "step": 1381 }, { "epoch": 0.06, "grad_norm": 7.060582252549427, "learning_rate": 1.9965489629138032e-05, "loss": 1.3318, "step": 1382 }, { "epoch": 0.06, "grad_norm": 7.8255501345718335, "learning_rate": 1.9965381253885646e-05, "loss": 1.8348, "step": 1383 }, { "epoch": 0.06, "grad_norm": 6.884381742112118, "learning_rate": 1.996527270902593e-05, "loss": 1.3294, "step": 1384 }, { "epoch": 0.06, "grad_norm": 8.829997124670875, "learning_rate": 1.996516399456073e-05, "loss": 1.4629, "step": 1385 }, { "epoch": 0.06, "grad_norm": 10.116597884062246, "learning_rate": 1.996505511049189e-05, "loss": 1.71, "step": 1386 }, { "epoch": 0.06, "grad_norm": 9.555178881607057, "learning_rate": 1.9964946056821266e-05, "loss": 1.4158, "step": 1387 }, { "epoch": 0.06, "grad_norm": 6.90115804167898, "learning_rate": 1.9964836833550715e-05, "loss": 1.4301, "step": 1388 }, { "epoch": 0.06, "grad_norm": 6.291682362882667, "learning_rate": 1.9964727440682097e-05, "loss": 1.4802, "step": 1389 }, { "epoch": 0.06, "grad_norm": 6.021321365084674, "learning_rate": 1.9964617878217273e-05, "loss": 1.554, "step": 1390 }, { "epoch": 0.06, "grad_norm": 6.831753044292781, "learning_rate": 1.9964508146158107e-05, "loss": 1.3284, "step": 1391 }, { "epoch": 0.06, "grad_norm": 4.687964436965135, "learning_rate": 1.9964398244506468e-05, "loss": 1.1057, "step": 1392 }, { "epoch": 0.06, "grad_norm": 6.730245361736973, "learning_rate": 1.996428817326422e-05, "loss": 1.2512, "step": 1393 }, { "epoch": 0.06, "grad_norm": 8.591611226009329, "learning_rate": 1.996417793243325e-05, "loss": 1.2635, "step": 1394 }, { "epoch": 0.06, "grad_norm": 5.445458182302627, "learning_rate": 1.996406752201542e-05, "loss": 1.3182, "step": 1395 }, { "epoch": 0.06, "grad_norm": 6.519498646951889, "learning_rate": 1.9963956942012615e-05, "loss": 1.3275, "step": 1396 }, { "epoch": 0.06, "grad_norm": 6.438288964606097, "learning_rate": 1.996384619242672e-05, "loss": 1.4527, "step": 1397 }, { "epoch": 0.06, "grad_norm": 13.506660684958048, "learning_rate": 1.9963735273259618e-05, "loss": 1.509, "step": 1398 }, { "epoch": 0.06, "grad_norm": 7.578134223133227, "learning_rate": 1.9963624184513194e-05, "loss": 1.2091, "step": 1399 }, { "epoch": 0.06, "grad_norm": 13.557938957712244, "learning_rate": 1.9963512926189337e-05, "loss": 1.4336, "step": 1400 }, { "epoch": 0.06, "grad_norm": 8.309800897864221, "learning_rate": 1.9963401498289952e-05, "loss": 1.6587, "step": 1401 }, { "epoch": 0.06, "grad_norm": 7.275750734392879, "learning_rate": 1.9963289900816923e-05, "loss": 1.5638, "step": 1402 }, { "epoch": 0.06, "grad_norm": 7.980202899940684, "learning_rate": 1.9963178133772153e-05, "loss": 1.5954, "step": 1403 }, { "epoch": 0.06, "grad_norm": 7.670835507005246, "learning_rate": 1.996306619715755e-05, "loss": 1.5459, "step": 1404 }, { "epoch": 0.06, "grad_norm": 9.65282837425003, "learning_rate": 1.9962954090975007e-05, "loss": 1.3876, "step": 1405 }, { "epoch": 0.06, "grad_norm": 6.645863043399876, "learning_rate": 1.9962841815226447e-05, "loss": 1.2799, "step": 1406 }, { "epoch": 0.06, "grad_norm": 7.542246425417072, "learning_rate": 1.996272936991377e-05, "loss": 1.339, "step": 1407 }, { "epoch": 0.06, "grad_norm": 10.551154463507817, "learning_rate": 1.9962616755038892e-05, "loss": 1.6935, "step": 1408 }, { "epoch": 0.06, "grad_norm": 6.7856825501857925, "learning_rate": 1.9962503970603733e-05, "loss": 1.3675, "step": 1409 }, { "epoch": 0.06, "grad_norm": 9.344699352998903, "learning_rate": 1.9962391016610207e-05, "loss": 1.2029, "step": 1410 }, { "epoch": 0.06, "grad_norm": 8.49984258139571, "learning_rate": 1.9962277893060244e-05, "loss": 1.1598, "step": 1411 }, { "epoch": 0.06, "grad_norm": 8.621348140223294, "learning_rate": 1.9962164599955762e-05, "loss": 1.5974, "step": 1412 }, { "epoch": 0.06, "grad_norm": 8.378576489746932, "learning_rate": 1.9962051137298693e-05, "loss": 1.4864, "step": 1413 }, { "epoch": 0.06, "grad_norm": 6.38889274891855, "learning_rate": 1.9961937505090968e-05, "loss": 1.3161, "step": 1414 }, { "epoch": 0.06, "grad_norm": 8.317106323828197, "learning_rate": 1.9961823703334518e-05, "loss": 1.341, "step": 1415 }, { "epoch": 0.06, "grad_norm": 5.524252607842972, "learning_rate": 1.9961709732031284e-05, "loss": 1.4369, "step": 1416 }, { "epoch": 0.06, "grad_norm": 7.055702442798704, "learning_rate": 1.9961595591183205e-05, "loss": 1.4266, "step": 1417 }, { "epoch": 0.06, "grad_norm": 6.063598853684944, "learning_rate": 1.996148128079222e-05, "loss": 1.1427, "step": 1418 }, { "epoch": 0.06, "grad_norm": 8.235377435692664, "learning_rate": 1.996136680086028e-05, "loss": 1.3123, "step": 1419 }, { "epoch": 0.06, "grad_norm": 5.859673502151679, "learning_rate": 1.9961252151389328e-05, "loss": 1.3697, "step": 1420 }, { "epoch": 0.06, "grad_norm": 8.995510368566718, "learning_rate": 1.996113733238132e-05, "loss": 1.7422, "step": 1421 }, { "epoch": 0.06, "grad_norm": 8.28704253852011, "learning_rate": 1.9961022343838206e-05, "loss": 1.7036, "step": 1422 }, { "epoch": 0.06, "grad_norm": 5.700495649750436, "learning_rate": 1.9960907185761945e-05, "loss": 1.2192, "step": 1423 }, { "epoch": 0.06, "grad_norm": 5.635391920762192, "learning_rate": 1.99607918581545e-05, "loss": 1.1413, "step": 1424 }, { "epoch": 0.06, "grad_norm": 5.56365265542115, "learning_rate": 1.9960676361017827e-05, "loss": 1.2294, "step": 1425 }, { "epoch": 0.06, "grad_norm": 9.721291352004608, "learning_rate": 1.99605606943539e-05, "loss": 1.613, "step": 1426 }, { "epoch": 0.06, "grad_norm": 7.745725039514284, "learning_rate": 1.9960444858164678e-05, "loss": 1.342, "step": 1427 }, { "epoch": 0.06, "grad_norm": 7.25555042936773, "learning_rate": 1.996032885245214e-05, "loss": 1.8052, "step": 1428 }, { "epoch": 0.06, "grad_norm": 8.873998256905931, "learning_rate": 1.996021267721826e-05, "loss": 1.4539, "step": 1429 }, { "epoch": 0.06, "grad_norm": 7.182174424982235, "learning_rate": 1.996009633246501e-05, "loss": 1.3021, "step": 1430 }, { "epoch": 0.06, "grad_norm": 6.290054311175747, "learning_rate": 1.9959979818194373e-05, "loss": 1.471, "step": 1431 }, { "epoch": 0.06, "grad_norm": 7.760008867452599, "learning_rate": 1.9959863134408337e-05, "loss": 1.0637, "step": 1432 }, { "epoch": 0.06, "grad_norm": 5.90735698477957, "learning_rate": 1.995974628110888e-05, "loss": 1.2686, "step": 1433 }, { "epoch": 0.06, "grad_norm": 6.19552721123613, "learning_rate": 1.9959629258297995e-05, "loss": 1.1687, "step": 1434 }, { "epoch": 0.06, "grad_norm": 8.108050288538617, "learning_rate": 1.9959512065977673e-05, "loss": 1.7228, "step": 1435 }, { "epoch": 0.06, "grad_norm": 7.316459531445684, "learning_rate": 1.9959394704149908e-05, "loss": 1.2216, "step": 1436 }, { "epoch": 0.06, "grad_norm": 4.487244895961331, "learning_rate": 1.9959277172816698e-05, "loss": 1.1444, "step": 1437 }, { "epoch": 0.06, "grad_norm": 10.258707138502286, "learning_rate": 1.9959159471980045e-05, "loss": 1.4857, "step": 1438 }, { "epoch": 0.06, "grad_norm": 5.170354732182427, "learning_rate": 1.9959041601641948e-05, "loss": 1.1608, "step": 1439 }, { "epoch": 0.06, "grad_norm": 5.102090795510799, "learning_rate": 1.9958923561804417e-05, "loss": 1.1645, "step": 1440 }, { "epoch": 0.06, "grad_norm": 5.737535906696977, "learning_rate": 1.995880535246946e-05, "loss": 1.398, "step": 1441 }, { "epoch": 0.06, "grad_norm": 5.852537355125488, "learning_rate": 1.9958686973639088e-05, "loss": 1.291, "step": 1442 }, { "epoch": 0.06, "grad_norm": 8.623558687956683, "learning_rate": 1.9958568425315316e-05, "loss": 1.3199, "step": 1443 }, { "epoch": 0.06, "grad_norm": 9.62514592417538, "learning_rate": 1.995844970750016e-05, "loss": 1.4647, "step": 1444 }, { "epoch": 0.06, "grad_norm": 7.127517128104072, "learning_rate": 1.9958330820195644e-05, "loss": 1.522, "step": 1445 }, { "epoch": 0.06, "grad_norm": 6.460841400722731, "learning_rate": 1.995821176340379e-05, "loss": 1.4628, "step": 1446 }, { "epoch": 0.06, "grad_norm": 7.417868220660701, "learning_rate": 1.9958092537126627e-05, "loss": 1.3646, "step": 1447 }, { "epoch": 0.06, "grad_norm": 7.733327752591579, "learning_rate": 1.9957973141366177e-05, "loss": 1.51, "step": 1448 }, { "epoch": 0.06, "grad_norm": 5.582031803694507, "learning_rate": 1.9957853576124478e-05, "loss": 1.1981, "step": 1449 }, { "epoch": 0.06, "grad_norm": 7.040515310500903, "learning_rate": 1.9957733841403564e-05, "loss": 1.2464, "step": 1450 }, { "epoch": 0.06, "grad_norm": 5.593812483583821, "learning_rate": 1.995761393720547e-05, "loss": 1.2816, "step": 1451 }, { "epoch": 0.06, "grad_norm": 6.642966983467225, "learning_rate": 1.9957493863532242e-05, "loss": 1.4182, "step": 1452 }, { "epoch": 0.06, "grad_norm": 7.6762412941854175, "learning_rate": 1.9957373620385917e-05, "loss": 1.8309, "step": 1453 }, { "epoch": 0.06, "grad_norm": 5.29165738375946, "learning_rate": 1.9957253207768547e-05, "loss": 1.3729, "step": 1454 }, { "epoch": 0.06, "grad_norm": 5.453883533880601, "learning_rate": 1.9957132625682177e-05, "loss": 1.1561, "step": 1455 }, { "epoch": 0.06, "grad_norm": 5.00676070441879, "learning_rate": 1.9957011874128866e-05, "loss": 1.0691, "step": 1456 }, { "epoch": 0.06, "grad_norm": 8.06257821384189, "learning_rate": 1.9956890953110662e-05, "loss": 1.3211, "step": 1457 }, { "epoch": 0.06, "grad_norm": 6.494155595739766, "learning_rate": 1.995676986262963e-05, "loss": 1.7024, "step": 1458 }, { "epoch": 0.06, "grad_norm": 7.036051458018038, "learning_rate": 1.9956648602687824e-05, "loss": 1.4724, "step": 1459 }, { "epoch": 0.06, "grad_norm": 5.421741318285281, "learning_rate": 1.995652717328731e-05, "loss": 1.0659, "step": 1460 }, { "epoch": 0.06, "grad_norm": 6.445206413407089, "learning_rate": 1.9956405574430157e-05, "loss": 1.1788, "step": 1461 }, { "epoch": 0.06, "grad_norm": 5.42401125335881, "learning_rate": 1.995628380611843e-05, "loss": 1.1567, "step": 1462 }, { "epoch": 0.06, "grad_norm": 7.909183887961802, "learning_rate": 1.9956161868354207e-05, "loss": 1.4171, "step": 1463 }, { "epoch": 0.06, "grad_norm": 6.142008319180877, "learning_rate": 1.9956039761139558e-05, "loss": 1.457, "step": 1464 }, { "epoch": 0.06, "grad_norm": 6.939658985538146, "learning_rate": 1.9955917484476567e-05, "loss": 1.4544, "step": 1465 }, { "epoch": 0.06, "grad_norm": 8.971951006394367, "learning_rate": 1.995579503836731e-05, "loss": 1.1686, "step": 1466 }, { "epoch": 0.06, "grad_norm": 9.633004642298342, "learning_rate": 1.9955672422813873e-05, "loss": 1.6463, "step": 1467 }, { "epoch": 0.06, "grad_norm": 5.265404154294693, "learning_rate": 1.9955549637818343e-05, "loss": 1.3659, "step": 1468 }, { "epoch": 0.06, "grad_norm": 9.122616632776786, "learning_rate": 1.995542668338281e-05, "loss": 1.528, "step": 1469 }, { "epoch": 0.06, "grad_norm": 6.270860678395396, "learning_rate": 1.9955303559509365e-05, "loss": 1.2669, "step": 1470 }, { "epoch": 0.06, "grad_norm": 6.907035937604945, "learning_rate": 1.9955180266200107e-05, "loss": 1.79, "step": 1471 }, { "epoch": 0.06, "grad_norm": 7.6045138801594385, "learning_rate": 1.995505680345713e-05, "loss": 1.5054, "step": 1472 }, { "epoch": 0.06, "grad_norm": 6.840056759815627, "learning_rate": 1.9954933171282542e-05, "loss": 1.3857, "step": 1473 }, { "epoch": 0.06, "grad_norm": 7.215959167176934, "learning_rate": 1.9954809369678435e-05, "loss": 1.5313, "step": 1474 }, { "epoch": 0.06, "grad_norm": 6.712740973719321, "learning_rate": 1.995468539864693e-05, "loss": 1.6483, "step": 1475 }, { "epoch": 0.06, "grad_norm": 5.632704324433016, "learning_rate": 1.995456125819013e-05, "loss": 1.3361, "step": 1476 }, { "epoch": 0.06, "grad_norm": 5.991152607595547, "learning_rate": 1.9954436948310144e-05, "loss": 1.3354, "step": 1477 }, { "epoch": 0.06, "grad_norm": 4.956666104002904, "learning_rate": 1.9954312469009095e-05, "loss": 1.3973, "step": 1478 }, { "epoch": 0.06, "grad_norm": 5.663911308799256, "learning_rate": 1.9954187820289102e-05, "loss": 1.3989, "step": 1479 }, { "epoch": 0.06, "grad_norm": 8.08768547949442, "learning_rate": 1.9954063002152278e-05, "loss": 1.2128, "step": 1480 }, { "epoch": 0.06, "grad_norm": 6.084507535424869, "learning_rate": 1.9953938014600757e-05, "loss": 1.3111, "step": 1481 }, { "epoch": 0.06, "grad_norm": 7.248581230638397, "learning_rate": 1.995381285763666e-05, "loss": 1.2533, "step": 1482 }, { "epoch": 0.06, "grad_norm": 8.741567905340336, "learning_rate": 1.9953687531262117e-05, "loss": 1.5419, "step": 1483 }, { "epoch": 0.06, "grad_norm": 9.001529962007812, "learning_rate": 1.9953562035479267e-05, "loss": 1.2979, "step": 1484 }, { "epoch": 0.06, "grad_norm": 7.511616847772875, "learning_rate": 1.995343637029024e-05, "loss": 1.118, "step": 1485 }, { "epoch": 0.06, "grad_norm": 8.581325701859086, "learning_rate": 1.9953310535697173e-05, "loss": 1.5552, "step": 1486 }, { "epoch": 0.06, "grad_norm": 6.724582240646722, "learning_rate": 1.9953184531702215e-05, "loss": 1.5721, "step": 1487 }, { "epoch": 0.06, "grad_norm": 7.372612877513773, "learning_rate": 1.9953058358307512e-05, "loss": 1.5329, "step": 1488 }, { "epoch": 0.06, "grad_norm": 8.311344892737381, "learning_rate": 1.9952932015515198e-05, "loss": 1.8343, "step": 1489 }, { "epoch": 0.06, "grad_norm": 8.0780310089049, "learning_rate": 1.995280550332744e-05, "loss": 1.5196, "step": 1490 }, { "epoch": 0.06, "grad_norm": 7.184315064769623, "learning_rate": 1.995267882174637e-05, "loss": 1.5368, "step": 1491 }, { "epoch": 0.06, "grad_norm": 8.09806800037185, "learning_rate": 1.9952551970774166e-05, "loss": 1.2071, "step": 1492 }, { "epoch": 0.06, "grad_norm": 7.924723751243145, "learning_rate": 1.9952424950412978e-05, "loss": 1.3975, "step": 1493 }, { "epoch": 0.06, "grad_norm": 9.413953116505157, "learning_rate": 1.9952297760664966e-05, "loss": 1.6823, "step": 1494 }, { "epoch": 0.06, "grad_norm": 7.802940647087575, "learning_rate": 1.9952170401532296e-05, "loss": 1.3648, "step": 1495 }, { "epoch": 0.06, "grad_norm": 3.7516086853913877, "learning_rate": 1.9952042873017136e-05, "loss": 0.9918, "step": 1496 }, { "epoch": 0.06, "grad_norm": 13.035630361418338, "learning_rate": 1.995191517512166e-05, "loss": 1.607, "step": 1497 }, { "epoch": 0.06, "grad_norm": 7.201077764268946, "learning_rate": 1.995178730784803e-05, "loss": 1.1725, "step": 1498 }, { "epoch": 0.06, "grad_norm": 13.985591555531078, "learning_rate": 1.9951659271198437e-05, "loss": 2.1272, "step": 1499 }, { "epoch": 0.06, "grad_norm": 6.513750348852558, "learning_rate": 1.9951531065175054e-05, "loss": 1.2252, "step": 1500 }, { "epoch": 0.06, "grad_norm": 6.683200390746712, "learning_rate": 1.995140268978006e-05, "loss": 1.6882, "step": 1501 }, { "epoch": 0.06, "grad_norm": 5.848679770613399, "learning_rate": 1.995127414501564e-05, "loss": 1.556, "step": 1502 }, { "epoch": 0.06, "grad_norm": 6.3941788132556345, "learning_rate": 1.9951145430883986e-05, "loss": 1.3069, "step": 1503 }, { "epoch": 0.06, "grad_norm": 8.194172717057551, "learning_rate": 1.9951016547387286e-05, "loss": 1.3932, "step": 1504 }, { "epoch": 0.06, "grad_norm": 6.597246146880204, "learning_rate": 1.9950887494527735e-05, "loss": 1.4012, "step": 1505 }, { "epoch": 0.06, "grad_norm": 6.054413820458573, "learning_rate": 1.995075827230753e-05, "loss": 1.4402, "step": 1506 }, { "epoch": 0.06, "grad_norm": 6.086833181709755, "learning_rate": 1.9950628880728868e-05, "loss": 1.12, "step": 1507 }, { "epoch": 0.06, "grad_norm": 6.806341080716645, "learning_rate": 1.995049931979395e-05, "loss": 1.2329, "step": 1508 }, { "epoch": 0.06, "grad_norm": 7.392345445619857, "learning_rate": 1.9950369589504987e-05, "loss": 1.4104, "step": 1509 }, { "epoch": 0.06, "grad_norm": 8.098024377502341, "learning_rate": 1.9950239689864182e-05, "loss": 1.3549, "step": 1510 }, { "epoch": 0.06, "grad_norm": 4.226256123252309, "learning_rate": 1.9950109620873746e-05, "loss": 1.0022, "step": 1511 }, { "epoch": 0.06, "grad_norm": 5.6360826043349155, "learning_rate": 1.9949979382535894e-05, "loss": 1.4581, "step": 1512 }, { "epoch": 0.06, "grad_norm": 9.544144963732201, "learning_rate": 1.9949848974852843e-05, "loss": 1.6059, "step": 1513 }, { "epoch": 0.06, "grad_norm": 7.165575590524718, "learning_rate": 1.994971839782681e-05, "loss": 1.4665, "step": 1514 }, { "epoch": 0.06, "grad_norm": 7.272202055463505, "learning_rate": 1.994958765146002e-05, "loss": 1.2935, "step": 1515 }, { "epoch": 0.06, "grad_norm": 9.56313624952119, "learning_rate": 1.9949456735754697e-05, "loss": 1.1577, "step": 1516 }, { "epoch": 0.06, "grad_norm": 7.539039786243627, "learning_rate": 1.9949325650713072e-05, "loss": 1.5931, "step": 1517 }, { "epoch": 0.06, "grad_norm": 6.685723790866573, "learning_rate": 1.9949194396337374e-05, "loss": 1.1646, "step": 1518 }, { "epoch": 0.06, "grad_norm": 3.223923164237922, "learning_rate": 1.9949062972629835e-05, "loss": 0.9914, "step": 1519 }, { "epoch": 0.06, "grad_norm": 7.023196817327144, "learning_rate": 1.9948931379592692e-05, "loss": 1.2849, "step": 1520 }, { "epoch": 0.06, "grad_norm": 7.479767445041039, "learning_rate": 1.994879961722819e-05, "loss": 1.3771, "step": 1521 }, { "epoch": 0.06, "grad_norm": 5.865406509365259, "learning_rate": 1.9948667685538564e-05, "loss": 1.2343, "step": 1522 }, { "epoch": 0.06, "grad_norm": 6.870246739719876, "learning_rate": 1.9948535584526066e-05, "loss": 1.1168, "step": 1523 }, { "epoch": 0.06, "grad_norm": 7.141149797676787, "learning_rate": 1.9948403314192938e-05, "loss": 1.295, "step": 1524 }, { "epoch": 0.06, "grad_norm": 7.335691686724497, "learning_rate": 1.9948270874541438e-05, "loss": 1.3613, "step": 1525 }, { "epoch": 0.06, "grad_norm": 8.545276913963258, "learning_rate": 1.9948138265573815e-05, "loss": 1.4848, "step": 1526 }, { "epoch": 0.06, "grad_norm": 5.638456702770252, "learning_rate": 1.994800548729233e-05, "loss": 1.0175, "step": 1527 }, { "epoch": 0.06, "grad_norm": 4.351059645817972, "learning_rate": 1.9947872539699236e-05, "loss": 1.0924, "step": 1528 }, { "epoch": 0.06, "grad_norm": 9.658952670205018, "learning_rate": 1.9947739422796803e-05, "loss": 1.2638, "step": 1529 }, { "epoch": 0.06, "grad_norm": 6.824508558053438, "learning_rate": 1.9947606136587293e-05, "loss": 1.6483, "step": 1530 }, { "epoch": 0.06, "grad_norm": 8.760923289785385, "learning_rate": 1.9947472681072974e-05, "loss": 1.4872, "step": 1531 }, { "epoch": 0.06, "grad_norm": 8.143965236547297, "learning_rate": 1.994733905625612e-05, "loss": 1.4459, "step": 1532 }, { "epoch": 0.06, "grad_norm": 7.4480628440869205, "learning_rate": 1.9947205262139006e-05, "loss": 1.5218, "step": 1533 }, { "epoch": 0.06, "grad_norm": 7.631708833704664, "learning_rate": 1.9947071298723904e-05, "loss": 1.5779, "step": 1534 }, { "epoch": 0.06, "grad_norm": 7.485497750679615, "learning_rate": 1.99469371660131e-05, "loss": 1.6444, "step": 1535 }, { "epoch": 0.06, "grad_norm": 5.722309497031518, "learning_rate": 1.994680286400887e-05, "loss": 1.4287, "step": 1536 }, { "epoch": 0.06, "grad_norm": 7.992546940333161, "learning_rate": 1.9946668392713503e-05, "loss": 1.6253, "step": 1537 }, { "epoch": 0.06, "grad_norm": 5.342102541772421, "learning_rate": 1.9946533752129292e-05, "loss": 1.2453, "step": 1538 }, { "epoch": 0.06, "grad_norm": 5.654906082102196, "learning_rate": 1.9946398942258523e-05, "loss": 1.3879, "step": 1539 }, { "epoch": 0.06, "grad_norm": 6.208205537170746, "learning_rate": 1.9946263963103493e-05, "loss": 1.2924, "step": 1540 }, { "epoch": 0.06, "grad_norm": 8.28143628400916, "learning_rate": 1.9946128814666496e-05, "loss": 1.3187, "step": 1541 }, { "epoch": 0.06, "grad_norm": 7.0501705117234135, "learning_rate": 1.994599349694984e-05, "loss": 1.4295, "step": 1542 }, { "epoch": 0.06, "grad_norm": 5.079207926087998, "learning_rate": 1.994585800995582e-05, "loss": 1.0657, "step": 1543 }, { "epoch": 0.06, "grad_norm": 6.529469507098824, "learning_rate": 1.9945722353686743e-05, "loss": 1.4487, "step": 1544 }, { "epoch": 0.06, "grad_norm": 9.35757365334837, "learning_rate": 1.9945586528144923e-05, "loss": 1.4333, "step": 1545 }, { "epoch": 0.06, "grad_norm": 11.571609072814372, "learning_rate": 1.9945450533332663e-05, "loss": 1.9596, "step": 1546 }, { "epoch": 0.06, "grad_norm": 11.032395663945293, "learning_rate": 1.994531436925229e-05, "loss": 1.717, "step": 1547 }, { "epoch": 0.06, "grad_norm": 6.7972952803423725, "learning_rate": 1.994517803590611e-05, "loss": 1.2925, "step": 1548 }, { "epoch": 0.06, "grad_norm": 7.08034214608452, "learning_rate": 1.9945041533296448e-05, "loss": 1.3213, "step": 1549 }, { "epoch": 0.06, "grad_norm": 7.539644529234815, "learning_rate": 1.9944904861425626e-05, "loss": 1.4502, "step": 1550 }, { "epoch": 0.06, "grad_norm": 9.31531444417317, "learning_rate": 1.994476802029597e-05, "loss": 1.7819, "step": 1551 }, { "epoch": 0.06, "grad_norm": 9.921529556124726, "learning_rate": 1.9944631009909815e-05, "loss": 1.7807, "step": 1552 }, { "epoch": 0.06, "grad_norm": 6.5716019322330785, "learning_rate": 1.9944493830269487e-05, "loss": 1.6417, "step": 1553 }, { "epoch": 0.06, "grad_norm": 8.519358117715136, "learning_rate": 1.994435648137732e-05, "loss": 1.9101, "step": 1554 }, { "epoch": 0.06, "grad_norm": 9.532529571551585, "learning_rate": 1.9944218963235653e-05, "loss": 1.4121, "step": 1555 }, { "epoch": 0.06, "grad_norm": 5.907671414538133, "learning_rate": 1.9944081275846826e-05, "loss": 1.112, "step": 1556 }, { "epoch": 0.06, "grad_norm": 8.32721150332738, "learning_rate": 1.9943943419213182e-05, "loss": 1.1694, "step": 1557 }, { "epoch": 0.06, "grad_norm": 8.11909419991208, "learning_rate": 1.9943805393337073e-05, "loss": 1.45, "step": 1558 }, { "epoch": 0.06, "grad_norm": 7.286928875595552, "learning_rate": 1.994366719822084e-05, "loss": 1.2329, "step": 1559 }, { "epoch": 0.06, "grad_norm": 3.7927439174877033, "learning_rate": 1.994352883386684e-05, "loss": 1.0477, "step": 1560 }, { "epoch": 0.06, "grad_norm": 10.958128889538223, "learning_rate": 1.9943390300277427e-05, "loss": 1.603, "step": 1561 }, { "epoch": 0.06, "grad_norm": 6.170189289590508, "learning_rate": 1.994325159745496e-05, "loss": 1.4423, "step": 1562 }, { "epoch": 0.06, "grad_norm": 6.50071470960183, "learning_rate": 1.9943112725401793e-05, "loss": 1.1979, "step": 1563 }, { "epoch": 0.06, "grad_norm": 6.67036573703531, "learning_rate": 1.9942973684120297e-05, "loss": 1.3554, "step": 1564 }, { "epoch": 0.06, "grad_norm": 10.899842506755757, "learning_rate": 1.9942834473612836e-05, "loss": 1.2885, "step": 1565 }, { "epoch": 0.06, "grad_norm": 8.670129116540577, "learning_rate": 1.9942695093881782e-05, "loss": 1.4537, "step": 1566 }, { "epoch": 0.06, "grad_norm": 6.7312256194945554, "learning_rate": 1.9942555544929497e-05, "loss": 1.263, "step": 1567 }, { "epoch": 0.06, "grad_norm": 5.185744260633248, "learning_rate": 1.994241582675837e-05, "loss": 1.2302, "step": 1568 }, { "epoch": 0.06, "grad_norm": 9.098596941884958, "learning_rate": 1.994227593937077e-05, "loss": 1.3897, "step": 1569 }, { "epoch": 0.06, "grad_norm": 9.522767975133329, "learning_rate": 1.9942135882769083e-05, "loss": 1.5482, "step": 1570 }, { "epoch": 0.06, "grad_norm": 7.714566595500402, "learning_rate": 1.9941995656955687e-05, "loss": 1.1318, "step": 1571 }, { "epoch": 0.06, "grad_norm": 5.2981293350866645, "learning_rate": 1.9941855261932975e-05, "loss": 1.2576, "step": 1572 }, { "epoch": 0.06, "grad_norm": 6.2250672496196415, "learning_rate": 1.9941714697703333e-05, "loss": 1.0657, "step": 1573 }, { "epoch": 0.06, "grad_norm": 6.094744842113075, "learning_rate": 1.994157396426915e-05, "loss": 1.2815, "step": 1574 }, { "epoch": 0.06, "grad_norm": 6.525858474415809, "learning_rate": 1.9941433061632827e-05, "loss": 1.3282, "step": 1575 }, { "epoch": 0.06, "grad_norm": 6.461860155676494, "learning_rate": 1.9941291989796756e-05, "loss": 1.2583, "step": 1576 }, { "epoch": 0.06, "grad_norm": 7.7787951119602505, "learning_rate": 1.9941150748763344e-05, "loss": 1.022, "step": 1577 }, { "epoch": 0.06, "grad_norm": 10.401451954512998, "learning_rate": 1.9941009338534995e-05, "loss": 1.8156, "step": 1578 }, { "epoch": 0.06, "grad_norm": 9.58227379164404, "learning_rate": 1.9940867759114113e-05, "loss": 1.3461, "step": 1579 }, { "epoch": 0.06, "grad_norm": 6.690634179228754, "learning_rate": 1.9940726010503105e-05, "loss": 1.4187, "step": 1580 }, { "epoch": 0.06, "grad_norm": 8.77280965565021, "learning_rate": 1.994058409270439e-05, "loss": 1.3978, "step": 1581 }, { "epoch": 0.06, "grad_norm": 8.083777859803659, "learning_rate": 1.994044200572038e-05, "loss": 1.2461, "step": 1582 }, { "epoch": 0.06, "grad_norm": 7.856843418443755, "learning_rate": 1.9940299749553492e-05, "loss": 1.458, "step": 1583 }, { "epoch": 0.06, "grad_norm": 7.31283912339637, "learning_rate": 1.994015732420615e-05, "loss": 1.4488, "step": 1584 }, { "epoch": 0.06, "grad_norm": 6.057728244559561, "learning_rate": 1.9940014729680775e-05, "loss": 1.2361, "step": 1585 }, { "epoch": 0.06, "grad_norm": 6.595304766012098, "learning_rate": 1.9939871965979794e-05, "loss": 1.5832, "step": 1586 }, { "epoch": 0.06, "grad_norm": 5.60111861956935, "learning_rate": 1.993972903310564e-05, "loss": 1.1715, "step": 1587 }, { "epoch": 0.06, "grad_norm": 6.989689784232833, "learning_rate": 1.9939585931060744e-05, "loss": 1.1792, "step": 1588 }, { "epoch": 0.06, "grad_norm": 7.17453886949581, "learning_rate": 1.993944265984754e-05, "loss": 1.3627, "step": 1589 }, { "epoch": 0.06, "grad_norm": 6.409664314217006, "learning_rate": 1.9939299219468467e-05, "loss": 1.5373, "step": 1590 }, { "epoch": 0.06, "grad_norm": 6.075693106741561, "learning_rate": 1.993915560992597e-05, "loss": 1.1777, "step": 1591 }, { "epoch": 0.06, "grad_norm": 6.192403901172698, "learning_rate": 1.993901183122249e-05, "loss": 1.5035, "step": 1592 }, { "epoch": 0.06, "grad_norm": 6.640393897120675, "learning_rate": 1.9938867883360472e-05, "loss": 1.3254, "step": 1593 }, { "epoch": 0.06, "grad_norm": 5.809090601586043, "learning_rate": 1.993872376634237e-05, "loss": 1.3879, "step": 1594 }, { "epoch": 0.06, "grad_norm": 5.355182527982758, "learning_rate": 1.9938579480170633e-05, "loss": 1.1732, "step": 1595 }, { "epoch": 0.06, "grad_norm": 7.322432509837842, "learning_rate": 1.9938435024847723e-05, "loss": 1.075, "step": 1596 }, { "epoch": 0.06, "grad_norm": 6.981143220828466, "learning_rate": 1.9938290400376094e-05, "loss": 1.3783, "step": 1597 }, { "epoch": 0.06, "grad_norm": 7.619628262979774, "learning_rate": 1.9938145606758202e-05, "loss": 1.3298, "step": 1598 }, { "epoch": 0.06, "grad_norm": 5.623960361259372, "learning_rate": 1.9938000643996522e-05, "loss": 1.5042, "step": 1599 }, { "epoch": 0.06, "grad_norm": 6.5739225559456145, "learning_rate": 1.9937855512093516e-05, "loss": 1.3372, "step": 1600 }, { "epoch": 0.06, "grad_norm": 7.266145277377041, "learning_rate": 1.9937710211051653e-05, "loss": 1.509, "step": 1601 }, { "epoch": 0.06, "grad_norm": 6.469312510968996, "learning_rate": 1.993756474087341e-05, "loss": 1.4383, "step": 1602 }, { "epoch": 0.06, "grad_norm": 6.978471542836006, "learning_rate": 1.9937419101561258e-05, "loss": 1.2334, "step": 1603 }, { "epoch": 0.06, "grad_norm": 7.392335320682871, "learning_rate": 1.9937273293117675e-05, "loss": 1.4389, "step": 1604 }, { "epoch": 0.06, "grad_norm": 7.158813927089674, "learning_rate": 1.9937127315545153e-05, "loss": 1.3663, "step": 1605 }, { "epoch": 0.06, "grad_norm": 6.378488620897281, "learning_rate": 1.9936981168846166e-05, "loss": 1.5565, "step": 1606 }, { "epoch": 0.06, "grad_norm": 7.62754883387644, "learning_rate": 1.99368348530232e-05, "loss": 1.4329, "step": 1607 }, { "epoch": 0.06, "grad_norm": 7.943410745982573, "learning_rate": 1.9936688368078756e-05, "loss": 1.3482, "step": 1608 }, { "epoch": 0.06, "grad_norm": 6.451168088621057, "learning_rate": 1.9936541714015317e-05, "loss": 1.5532, "step": 1609 }, { "epoch": 0.06, "grad_norm": 7.559605427719264, "learning_rate": 1.9936394890835387e-05, "loss": 1.4402, "step": 1610 }, { "epoch": 0.06, "grad_norm": 6.142675489715711, "learning_rate": 1.9936247898541457e-05, "loss": 1.3562, "step": 1611 }, { "epoch": 0.06, "grad_norm": 6.390666476163092, "learning_rate": 1.9936100737136034e-05, "loss": 1.048, "step": 1612 }, { "epoch": 0.06, "grad_norm": 7.527392428232301, "learning_rate": 1.993595340662162e-05, "loss": 1.2397, "step": 1613 }, { "epoch": 0.07, "grad_norm": 5.8514051690548365, "learning_rate": 1.993580590700073e-05, "loss": 1.1742, "step": 1614 }, { "epoch": 0.07, "grad_norm": 6.478998822309698, "learning_rate": 1.9935658238275858e-05, "loss": 1.1014, "step": 1615 }, { "epoch": 0.07, "grad_norm": 6.608661681964139, "learning_rate": 1.9935510400449534e-05, "loss": 1.092, "step": 1616 }, { "epoch": 0.07, "grad_norm": 7.089407265384593, "learning_rate": 1.9935362393524262e-05, "loss": 1.7726, "step": 1617 }, { "epoch": 0.07, "grad_norm": 6.774298791242347, "learning_rate": 1.993521421750257e-05, "loss": 1.571, "step": 1618 }, { "epoch": 0.07, "grad_norm": 6.781802512135609, "learning_rate": 1.9935065872386977e-05, "loss": 1.6905, "step": 1619 }, { "epoch": 0.07, "grad_norm": 6.0197893745602835, "learning_rate": 1.993491735818001e-05, "loss": 1.3041, "step": 1620 }, { "epoch": 0.07, "grad_norm": 6.544392804378859, "learning_rate": 1.9934768674884187e-05, "loss": 1.5142, "step": 1621 }, { "epoch": 0.07, "grad_norm": 7.805665932824067, "learning_rate": 1.993461982250205e-05, "loss": 1.322, "step": 1622 }, { "epoch": 0.07, "grad_norm": 5.805394971521824, "learning_rate": 1.993447080103613e-05, "loss": 1.3419, "step": 1623 }, { "epoch": 0.07, "grad_norm": 5.865843585738184, "learning_rate": 1.9934321610488955e-05, "loss": 1.1941, "step": 1624 }, { "epoch": 0.07, "grad_norm": 7.677701994350402, "learning_rate": 1.9934172250863074e-05, "loss": 1.6682, "step": 1625 }, { "epoch": 0.07, "grad_norm": 3.656563521687858, "learning_rate": 1.9934022722161025e-05, "loss": 1.1174, "step": 1626 }, { "epoch": 0.07, "grad_norm": 10.074739077062604, "learning_rate": 1.9933873024385352e-05, "loss": 1.2706, "step": 1627 }, { "epoch": 0.07, "grad_norm": 10.116254423287817, "learning_rate": 1.9933723157538606e-05, "loss": 1.5758, "step": 1628 }, { "epoch": 0.07, "grad_norm": 8.43568384602253, "learning_rate": 1.993357312162333e-05, "loss": 1.2362, "step": 1629 }, { "epoch": 0.07, "grad_norm": 5.290182240848821, "learning_rate": 1.9933422916642093e-05, "loss": 1.3019, "step": 1630 }, { "epoch": 0.07, "grad_norm": 9.120901478111954, "learning_rate": 1.9933272542597432e-05, "loss": 1.5725, "step": 1631 }, { "epoch": 0.07, "grad_norm": 8.111223430836484, "learning_rate": 1.9933121999491923e-05, "loss": 1.4675, "step": 1632 }, { "epoch": 0.07, "grad_norm": 9.071313114492433, "learning_rate": 1.993297128732812e-05, "loss": 1.4817, "step": 1633 }, { "epoch": 0.07, "grad_norm": 7.710873816747146, "learning_rate": 1.9932820406108586e-05, "loss": 1.4678, "step": 1634 }, { "epoch": 0.07, "grad_norm": 5.327068150483543, "learning_rate": 1.9932669355835893e-05, "loss": 1.1287, "step": 1635 }, { "epoch": 0.07, "grad_norm": 5.856110823537345, "learning_rate": 1.9932518136512612e-05, "loss": 1.2326, "step": 1636 }, { "epoch": 0.07, "grad_norm": 5.5763091821901805, "learning_rate": 1.9932366748141318e-05, "loss": 1.3405, "step": 1637 }, { "epoch": 0.07, "grad_norm": 6.49273271358971, "learning_rate": 1.9932215190724582e-05, "loss": 1.15, "step": 1638 }, { "epoch": 0.07, "grad_norm": 4.398065629815027, "learning_rate": 1.993206346426499e-05, "loss": 1.0625, "step": 1639 }, { "epoch": 0.07, "grad_norm": 7.750787524045484, "learning_rate": 1.993191156876512e-05, "loss": 1.7256, "step": 1640 }, { "epoch": 0.07, "grad_norm": 7.730581934470894, "learning_rate": 1.9931759504227555e-05, "loss": 1.5309, "step": 1641 }, { "epoch": 0.07, "grad_norm": 7.2088961963128195, "learning_rate": 1.993160727065489e-05, "loss": 1.2774, "step": 1642 }, { "epoch": 0.07, "grad_norm": 11.651327937181227, "learning_rate": 1.9931454868049713e-05, "loss": 1.4965, "step": 1643 }, { "epoch": 0.07, "grad_norm": 5.92699279504504, "learning_rate": 1.9931302296414615e-05, "loss": 1.3109, "step": 1644 }, { "epoch": 0.07, "grad_norm": 8.309450043311207, "learning_rate": 1.9931149555752195e-05, "loss": 1.5808, "step": 1645 }, { "epoch": 0.07, "grad_norm": 8.749626345156699, "learning_rate": 1.9930996646065055e-05, "loss": 1.6116, "step": 1646 }, { "epoch": 0.07, "grad_norm": 5.112384855649576, "learning_rate": 1.9930843567355794e-05, "loss": 1.4236, "step": 1647 }, { "epoch": 0.07, "grad_norm": 7.568626866352402, "learning_rate": 1.9930690319627017e-05, "loss": 1.3355, "step": 1648 }, { "epoch": 0.07, "grad_norm": 5.644718595756402, "learning_rate": 1.9930536902881333e-05, "loss": 1.4223, "step": 1649 }, { "epoch": 0.07, "grad_norm": 5.894857796854881, "learning_rate": 1.9930383317121355e-05, "loss": 1.3005, "step": 1650 }, { "epoch": 0.07, "grad_norm": 6.006740734699825, "learning_rate": 1.9930229562349697e-05, "loss": 1.2257, "step": 1651 }, { "epoch": 0.07, "grad_norm": 6.4882022628806, "learning_rate": 1.9930075638568973e-05, "loss": 1.7219, "step": 1652 }, { "epoch": 0.07, "grad_norm": 5.938144432325534, "learning_rate": 1.9929921545781805e-05, "loss": 1.6193, "step": 1653 }, { "epoch": 0.07, "grad_norm": 6.37878731753769, "learning_rate": 1.9929767283990814e-05, "loss": 1.468, "step": 1654 }, { "epoch": 0.07, "grad_norm": 5.712880185992707, "learning_rate": 1.9929612853198628e-05, "loss": 1.4272, "step": 1655 }, { "epoch": 0.07, "grad_norm": 5.082371310995654, "learning_rate": 1.992945825340787e-05, "loss": 1.185, "step": 1656 }, { "epoch": 0.07, "grad_norm": 5.985156578753303, "learning_rate": 1.9929303484621177e-05, "loss": 1.3921, "step": 1657 }, { "epoch": 0.07, "grad_norm": 7.033665445543282, "learning_rate": 1.992914854684118e-05, "loss": 1.5983, "step": 1658 }, { "epoch": 0.07, "grad_norm": 8.326442660226737, "learning_rate": 1.9928993440070522e-05, "loss": 1.4395, "step": 1659 }, { "epoch": 0.07, "grad_norm": 5.870200774828051, "learning_rate": 1.9928838164311834e-05, "loss": 1.0925, "step": 1660 }, { "epoch": 0.07, "grad_norm": 7.072785921063068, "learning_rate": 1.9928682719567762e-05, "loss": 1.2887, "step": 1661 }, { "epoch": 0.07, "grad_norm": 5.888497958751173, "learning_rate": 1.9928527105840952e-05, "loss": 1.3087, "step": 1662 }, { "epoch": 0.07, "grad_norm": 6.534549608752247, "learning_rate": 1.9928371323134055e-05, "loss": 1.2264, "step": 1663 }, { "epoch": 0.07, "grad_norm": 6.7763821422269785, "learning_rate": 1.992821537144972e-05, "loss": 1.1615, "step": 1664 }, { "epoch": 0.07, "grad_norm": 5.073106206221608, "learning_rate": 1.99280592507906e-05, "loss": 1.293, "step": 1665 }, { "epoch": 0.07, "grad_norm": 8.074976451536681, "learning_rate": 1.9927902961159353e-05, "loss": 1.1778, "step": 1666 }, { "epoch": 0.07, "grad_norm": 5.909053159561319, "learning_rate": 1.992774650255864e-05, "loss": 1.4633, "step": 1667 }, { "epoch": 0.07, "grad_norm": 6.385949656089326, "learning_rate": 1.9927589874991125e-05, "loss": 1.2306, "step": 1668 }, { "epoch": 0.07, "grad_norm": 5.715482874351792, "learning_rate": 1.992743307845947e-05, "loss": 1.1861, "step": 1669 }, { "epoch": 0.07, "grad_norm": 7.778867765226718, "learning_rate": 1.9927276112966347e-05, "loss": 1.3787, "step": 1670 }, { "epoch": 0.07, "grad_norm": 6.648479773841119, "learning_rate": 1.9927118978514426e-05, "loss": 1.6182, "step": 1671 }, { "epoch": 0.07, "grad_norm": 8.15540401833208, "learning_rate": 1.992696167510638e-05, "loss": 1.4082, "step": 1672 }, { "epoch": 0.07, "grad_norm": 7.254012974044487, "learning_rate": 1.992680420274489e-05, "loss": 1.3287, "step": 1673 }, { "epoch": 0.07, "grad_norm": 8.518865110024242, "learning_rate": 1.9926646561432634e-05, "loss": 1.653, "step": 1674 }, { "epoch": 0.07, "grad_norm": 8.923994973352675, "learning_rate": 1.992648875117229e-05, "loss": 1.662, "step": 1675 }, { "epoch": 0.07, "grad_norm": 7.2048350053733365, "learning_rate": 1.9926330771966554e-05, "loss": 1.2386, "step": 1676 }, { "epoch": 0.07, "grad_norm": 7.699161398714065, "learning_rate": 1.9926172623818108e-05, "loss": 1.2519, "step": 1677 }, { "epoch": 0.07, "grad_norm": 5.34755248933042, "learning_rate": 1.9926014306729645e-05, "loss": 1.316, "step": 1678 }, { "epoch": 0.07, "grad_norm": 7.152965599653538, "learning_rate": 1.9925855820703862e-05, "loss": 1.3969, "step": 1679 }, { "epoch": 0.07, "grad_norm": 7.69933018387624, "learning_rate": 1.9925697165743453e-05, "loss": 1.2576, "step": 1680 }, { "epoch": 0.07, "grad_norm": 4.031168273676834, "learning_rate": 1.992553834185112e-05, "loss": 1.1726, "step": 1681 }, { "epoch": 0.07, "grad_norm": 6.496433759523101, "learning_rate": 1.992537934902956e-05, "loss": 1.1888, "step": 1682 }, { "epoch": 0.07, "grad_norm": 6.729325085418545, "learning_rate": 1.992522018728149e-05, "loss": 1.1776, "step": 1683 }, { "epoch": 0.07, "grad_norm": 7.8770115341714195, "learning_rate": 1.9925060856609614e-05, "loss": 1.6919, "step": 1684 }, { "epoch": 0.07, "grad_norm": 6.0878552238194095, "learning_rate": 1.992490135701664e-05, "loss": 1.4694, "step": 1685 }, { "epoch": 0.07, "grad_norm": 5.478208461704142, "learning_rate": 1.9924741688505284e-05, "loss": 1.2665, "step": 1686 }, { "epoch": 0.07, "grad_norm": 11.24583709703786, "learning_rate": 1.9924581851078272e-05, "loss": 1.5426, "step": 1687 }, { "epoch": 0.07, "grad_norm": 8.190483850669683, "learning_rate": 1.992442184473831e-05, "loss": 1.4105, "step": 1688 }, { "epoch": 0.07, "grad_norm": 5.248388133122466, "learning_rate": 1.992426166948813e-05, "loss": 1.2239, "step": 1689 }, { "epoch": 0.07, "grad_norm": 7.0589390011053395, "learning_rate": 1.992410132533046e-05, "loss": 1.6185, "step": 1690 }, { "epoch": 0.07, "grad_norm": 6.145311335723086, "learning_rate": 1.992394081226803e-05, "loss": 1.5274, "step": 1691 }, { "epoch": 0.07, "grad_norm": 6.45199735989477, "learning_rate": 1.992378013030356e-05, "loss": 1.2775, "step": 1692 }, { "epoch": 0.07, "grad_norm": 9.10188760423342, "learning_rate": 1.99236192794398e-05, "loss": 1.0359, "step": 1693 }, { "epoch": 0.07, "grad_norm": 7.680400597548671, "learning_rate": 1.9923458259679473e-05, "loss": 1.3229, "step": 1694 }, { "epoch": 0.07, "grad_norm": 7.346560758139622, "learning_rate": 1.992329707102533e-05, "loss": 1.3244, "step": 1695 }, { "epoch": 0.07, "grad_norm": 7.556713394630111, "learning_rate": 1.9923135713480112e-05, "loss": 1.1206, "step": 1696 }, { "epoch": 0.07, "grad_norm": 5.9654618664773045, "learning_rate": 1.9922974187046563e-05, "loss": 1.3522, "step": 1697 }, { "epoch": 0.07, "grad_norm": 10.133248190517277, "learning_rate": 1.992281249172743e-05, "loss": 1.2864, "step": 1698 }, { "epoch": 0.07, "grad_norm": 10.377822359872365, "learning_rate": 1.9922650627525475e-05, "loss": 1.2607, "step": 1699 }, { "epoch": 0.07, "grad_norm": 7.768660810924517, "learning_rate": 1.9922488594443445e-05, "loss": 1.5015, "step": 1700 }, { "epoch": 0.07, "grad_norm": 6.853470733131885, "learning_rate": 1.99223263924841e-05, "loss": 1.21, "step": 1701 }, { "epoch": 0.07, "grad_norm": 5.520803231728888, "learning_rate": 1.9922164021650195e-05, "loss": 1.3648, "step": 1702 }, { "epoch": 0.07, "grad_norm": 7.822571381701583, "learning_rate": 1.99220014819445e-05, "loss": 1.3823, "step": 1703 }, { "epoch": 0.07, "grad_norm": 7.917994216538084, "learning_rate": 1.9921838773369782e-05, "loss": 1.43, "step": 1704 }, { "epoch": 0.07, "grad_norm": 6.8088448831396216, "learning_rate": 1.9921675895928806e-05, "loss": 1.4698, "step": 1705 }, { "epoch": 0.07, "grad_norm": 5.946736114359994, "learning_rate": 1.9921512849624345e-05, "loss": 1.5381, "step": 1706 }, { "epoch": 0.07, "grad_norm": 15.79832887747123, "learning_rate": 1.9921349634459176e-05, "loss": 1.4969, "step": 1707 }, { "epoch": 0.07, "grad_norm": 8.304559368916484, "learning_rate": 1.9921186250436075e-05, "loss": 1.1857, "step": 1708 }, { "epoch": 0.07, "grad_norm": 9.3264970553984, "learning_rate": 1.9921022697557824e-05, "loss": 1.3402, "step": 1709 }, { "epoch": 0.07, "grad_norm": 11.149684063867502, "learning_rate": 1.9920858975827207e-05, "loss": 1.4257, "step": 1710 }, { "epoch": 0.07, "grad_norm": 5.451057532094731, "learning_rate": 1.9920695085247012e-05, "loss": 1.3762, "step": 1711 }, { "epoch": 0.07, "grad_norm": 7.3843049881880995, "learning_rate": 1.992053102582002e-05, "loss": 1.5508, "step": 1712 }, { "epoch": 0.07, "grad_norm": 8.117902170348703, "learning_rate": 1.9920366797549032e-05, "loss": 1.2479, "step": 1713 }, { "epoch": 0.07, "grad_norm": 7.319116056589735, "learning_rate": 1.9920202400436843e-05, "loss": 1.5845, "step": 1714 }, { "epoch": 0.07, "grad_norm": 6.00822832168836, "learning_rate": 1.9920037834486245e-05, "loss": 1.5245, "step": 1715 }, { "epoch": 0.07, "grad_norm": 7.2934451740367665, "learning_rate": 1.9919873099700045e-05, "loss": 1.4578, "step": 1716 }, { "epoch": 0.07, "grad_norm": 6.159304496800559, "learning_rate": 1.991970819608104e-05, "loss": 1.4281, "step": 1717 }, { "epoch": 0.07, "grad_norm": 4.952747536882591, "learning_rate": 1.9919543123632044e-05, "loss": 1.1817, "step": 1718 }, { "epoch": 0.07, "grad_norm": 6.211977119387033, "learning_rate": 1.9919377882355863e-05, "loss": 1.44, "step": 1719 }, { "epoch": 0.07, "grad_norm": 6.331147977211939, "learning_rate": 1.9919212472255305e-05, "loss": 1.2883, "step": 1720 }, { "epoch": 0.07, "grad_norm": 5.3346054916498185, "learning_rate": 1.9919046893333194e-05, "loss": 1.3507, "step": 1721 }, { "epoch": 0.07, "grad_norm": 5.731406890546832, "learning_rate": 1.991888114559234e-05, "loss": 1.6637, "step": 1722 }, { "epoch": 0.07, "grad_norm": 12.134700365445143, "learning_rate": 1.991871522903557e-05, "loss": 1.7547, "step": 1723 }, { "epoch": 0.07, "grad_norm": 6.136288929620964, "learning_rate": 1.9918549143665705e-05, "loss": 1.1381, "step": 1724 }, { "epoch": 0.07, "grad_norm": 8.603047677756543, "learning_rate": 1.9918382889485578e-05, "loss": 1.6167, "step": 1725 }, { "epoch": 0.07, "grad_norm": 11.557569322985294, "learning_rate": 1.9918216466498004e-05, "loss": 2.2483, "step": 1726 }, { "epoch": 0.07, "grad_norm": 8.109442732408645, "learning_rate": 1.991804987470583e-05, "loss": 1.4275, "step": 1727 }, { "epoch": 0.07, "grad_norm": 6.3797191129216815, "learning_rate": 1.9917883114111884e-05, "loss": 1.4498, "step": 1728 }, { "epoch": 0.07, "grad_norm": 7.506771305809261, "learning_rate": 1.9917716184719e-05, "loss": 1.6498, "step": 1729 }, { "epoch": 0.07, "grad_norm": 6.310880322621589, "learning_rate": 1.9917549086530034e-05, "loss": 1.2824, "step": 1730 }, { "epoch": 0.07, "grad_norm": 6.811035140124221, "learning_rate": 1.9917381819547814e-05, "loss": 1.0696, "step": 1731 }, { "epoch": 0.07, "grad_norm": 5.608228013349219, "learning_rate": 1.99172143837752e-05, "loss": 1.2104, "step": 1732 }, { "epoch": 0.07, "grad_norm": 7.027231981468973, "learning_rate": 1.991704677921503e-05, "loss": 1.4918, "step": 1733 }, { "epoch": 0.07, "grad_norm": 7.0489565763917765, "learning_rate": 1.9916879005870164e-05, "loss": 1.3998, "step": 1734 }, { "epoch": 0.07, "grad_norm": 6.42963262852247, "learning_rate": 1.9916711063743456e-05, "loss": 1.4929, "step": 1735 }, { "epoch": 0.07, "grad_norm": 7.017362157096668, "learning_rate": 1.9916542952837764e-05, "loss": 1.4392, "step": 1736 }, { "epoch": 0.07, "grad_norm": 8.045204604129488, "learning_rate": 1.9916374673155945e-05, "loss": 1.1419, "step": 1737 }, { "epoch": 0.07, "grad_norm": 6.145834818494214, "learning_rate": 1.991620622470087e-05, "loss": 1.1655, "step": 1738 }, { "epoch": 0.07, "grad_norm": 7.081079699341444, "learning_rate": 1.9916037607475402e-05, "loss": 1.2701, "step": 1739 }, { "epoch": 0.07, "grad_norm": 5.226159187826429, "learning_rate": 1.991586882148241e-05, "loss": 1.2725, "step": 1740 }, { "epoch": 0.07, "grad_norm": 8.349359066104512, "learning_rate": 1.9915699866724773e-05, "loss": 1.4241, "step": 1741 }, { "epoch": 0.07, "grad_norm": 8.926721703308994, "learning_rate": 1.9915530743205358e-05, "loss": 1.59, "step": 1742 }, { "epoch": 0.07, "grad_norm": 7.999344282514739, "learning_rate": 1.991536145092705e-05, "loss": 1.8143, "step": 1743 }, { "epoch": 0.07, "grad_norm": 5.818289030500545, "learning_rate": 1.9915191989892725e-05, "loss": 1.2075, "step": 1744 }, { "epoch": 0.07, "grad_norm": 15.601293365287694, "learning_rate": 1.991502236010527e-05, "loss": 1.4127, "step": 1745 }, { "epoch": 0.07, "grad_norm": 8.298343420921267, "learning_rate": 1.9914852561567574e-05, "loss": 1.0145, "step": 1746 }, { "epoch": 0.07, "grad_norm": 5.474593316675642, "learning_rate": 1.9914682594282526e-05, "loss": 1.388, "step": 1747 }, { "epoch": 0.07, "grad_norm": 6.195612080501038, "learning_rate": 1.9914512458253014e-05, "loss": 1.2526, "step": 1748 }, { "epoch": 0.07, "grad_norm": 4.324070732912341, "learning_rate": 1.9914342153481935e-05, "loss": 1.222, "step": 1749 }, { "epoch": 0.07, "grad_norm": 6.895319279343103, "learning_rate": 1.9914171679972196e-05, "loss": 1.424, "step": 1750 }, { "epoch": 0.07, "grad_norm": 7.353797203841676, "learning_rate": 1.991400103772669e-05, "loss": 1.3574, "step": 1751 }, { "epoch": 0.07, "grad_norm": 6.042433336959572, "learning_rate": 1.9913830226748322e-05, "loss": 1.3862, "step": 1752 }, { "epoch": 0.07, "grad_norm": 10.998261005360554, "learning_rate": 1.9913659247039998e-05, "loss": 1.4126, "step": 1753 }, { "epoch": 0.07, "grad_norm": 5.898870494013171, "learning_rate": 1.9913488098604636e-05, "loss": 1.3144, "step": 1754 }, { "epoch": 0.07, "grad_norm": 7.833619034022559, "learning_rate": 1.991331678144514e-05, "loss": 1.4492, "step": 1755 }, { "epoch": 0.07, "grad_norm": 6.803320290471051, "learning_rate": 1.991314529556443e-05, "loss": 1.3331, "step": 1756 }, { "epoch": 0.07, "grad_norm": 8.191269472544466, "learning_rate": 1.9912973640965423e-05, "loss": 1.3654, "step": 1757 }, { "epoch": 0.07, "grad_norm": 9.89152675124716, "learning_rate": 1.991280181765104e-05, "loss": 1.3809, "step": 1758 }, { "epoch": 0.07, "grad_norm": 6.386224116385923, "learning_rate": 1.9912629825624212e-05, "loss": 1.2998, "step": 1759 }, { "epoch": 0.07, "grad_norm": 8.277462624411385, "learning_rate": 1.9912457664887855e-05, "loss": 1.0777, "step": 1760 }, { "epoch": 0.07, "grad_norm": 7.639883888556958, "learning_rate": 1.991228533544491e-05, "loss": 1.3858, "step": 1761 }, { "epoch": 0.07, "grad_norm": 7.71621459688264, "learning_rate": 1.9912112837298304e-05, "loss": 1.1989, "step": 1762 }, { "epoch": 0.07, "grad_norm": 8.556813046672799, "learning_rate": 1.9911940170450972e-05, "loss": 1.5478, "step": 1763 }, { "epoch": 0.07, "grad_norm": 8.382872366027941, "learning_rate": 1.9911767334905858e-05, "loss": 1.5322, "step": 1764 }, { "epoch": 0.07, "grad_norm": 11.784716661821816, "learning_rate": 1.9911594330665895e-05, "loss": 1.7565, "step": 1765 }, { "epoch": 0.07, "grad_norm": 6.628468038670719, "learning_rate": 1.9911421157734036e-05, "loss": 1.5098, "step": 1766 }, { "epoch": 0.07, "grad_norm": 9.177711320719025, "learning_rate": 1.9911247816113224e-05, "loss": 1.1364, "step": 1767 }, { "epoch": 0.07, "grad_norm": 15.561688252484943, "learning_rate": 1.991107430580641e-05, "loss": 1.2444, "step": 1768 }, { "epoch": 0.07, "grad_norm": 8.4568799363013, "learning_rate": 1.991090062681655e-05, "loss": 1.4806, "step": 1769 }, { "epoch": 0.07, "grad_norm": 6.394164282685899, "learning_rate": 1.9910726779146597e-05, "loss": 1.3055, "step": 1770 }, { "epoch": 0.07, "grad_norm": 11.07188287178954, "learning_rate": 1.9910552762799506e-05, "loss": 1.2697, "step": 1771 }, { "epoch": 0.07, "grad_norm": 9.277541721371888, "learning_rate": 1.9910378577778248e-05, "loss": 1.6219, "step": 1772 }, { "epoch": 0.07, "grad_norm": 5.432089175637452, "learning_rate": 1.9910204224085777e-05, "loss": 1.474, "step": 1773 }, { "epoch": 0.07, "grad_norm": 6.569606106332312, "learning_rate": 1.991002970172507e-05, "loss": 1.3646, "step": 1774 }, { "epoch": 0.07, "grad_norm": 5.596953282022571, "learning_rate": 1.990985501069909e-05, "loss": 1.0274, "step": 1775 }, { "epoch": 0.07, "grad_norm": 7.357675756044595, "learning_rate": 1.9909680151010815e-05, "loss": 1.3306, "step": 1776 }, { "epoch": 0.07, "grad_norm": 5.377739488685105, "learning_rate": 1.9909505122663222e-05, "loss": 1.3393, "step": 1777 }, { "epoch": 0.07, "grad_norm": 7.6138783849353775, "learning_rate": 1.990932992565928e-05, "loss": 1.2284, "step": 1778 }, { "epoch": 0.07, "grad_norm": 5.60313478546979, "learning_rate": 1.9909154560001986e-05, "loss": 1.5439, "step": 1779 }, { "epoch": 0.07, "grad_norm": 7.972049318222225, "learning_rate": 1.9908979025694312e-05, "loss": 1.6239, "step": 1780 }, { "epoch": 0.07, "grad_norm": 4.858085993130201, "learning_rate": 1.990880332273925e-05, "loss": 1.1449, "step": 1781 }, { "epoch": 0.07, "grad_norm": 5.423170453073888, "learning_rate": 1.9908627451139795e-05, "loss": 1.2426, "step": 1782 }, { "epoch": 0.07, "grad_norm": 6.9883928300979745, "learning_rate": 1.9908451410898936e-05, "loss": 1.5397, "step": 1783 }, { "epoch": 0.07, "grad_norm": 5.542155496212223, "learning_rate": 1.9908275202019664e-05, "loss": 1.4943, "step": 1784 }, { "epoch": 0.07, "grad_norm": 5.452378534502823, "learning_rate": 1.9908098824504986e-05, "loss": 1.2322, "step": 1785 }, { "epoch": 0.07, "grad_norm": 7.2635571017663505, "learning_rate": 1.99079222783579e-05, "loss": 1.3524, "step": 1786 }, { "epoch": 0.07, "grad_norm": 4.703191484416512, "learning_rate": 1.9907745563581414e-05, "loss": 1.1532, "step": 1787 }, { "epoch": 0.07, "grad_norm": 9.309888596478867, "learning_rate": 1.990756868017853e-05, "loss": 1.4145, "step": 1788 }, { "epoch": 0.07, "grad_norm": 8.425418491896762, "learning_rate": 1.9907391628152265e-05, "loss": 1.6318, "step": 1789 }, { "epoch": 0.07, "grad_norm": 6.748837398982095, "learning_rate": 1.9907214407505626e-05, "loss": 1.5433, "step": 1790 }, { "epoch": 0.07, "grad_norm": 6.610148343171284, "learning_rate": 1.9907037018241636e-05, "loss": 1.4925, "step": 1791 }, { "epoch": 0.07, "grad_norm": 5.886287135748048, "learning_rate": 1.9906859460363307e-05, "loss": 1.3725, "step": 1792 }, { "epoch": 0.07, "grad_norm": 7.934101973483032, "learning_rate": 1.9906681733873667e-05, "loss": 1.2098, "step": 1793 }, { "epoch": 0.07, "grad_norm": 5.821121171717437, "learning_rate": 1.990650383877574e-05, "loss": 1.5023, "step": 1794 }, { "epoch": 0.07, "grad_norm": 9.074021152154163, "learning_rate": 1.990632577507255e-05, "loss": 1.4374, "step": 1795 }, { "epoch": 0.07, "grad_norm": 7.933658849380015, "learning_rate": 1.990614754276713e-05, "loss": 1.9002, "step": 1796 }, { "epoch": 0.07, "grad_norm": 7.871279041465596, "learning_rate": 1.9905969141862512e-05, "loss": 1.5813, "step": 1797 }, { "epoch": 0.07, "grad_norm": 5.941290167208677, "learning_rate": 1.9905790572361736e-05, "loss": 1.3539, "step": 1798 }, { "epoch": 0.07, "grad_norm": 5.316097887667811, "learning_rate": 1.9905611834267834e-05, "loss": 1.2964, "step": 1799 }, { "epoch": 0.07, "grad_norm": 7.522166752742381, "learning_rate": 1.9905432927583858e-05, "loss": 1.7959, "step": 1800 }, { "epoch": 0.07, "grad_norm": 7.78242110236598, "learning_rate": 1.9905253852312845e-05, "loss": 1.5231, "step": 1801 }, { "epoch": 0.07, "grad_norm": 5.157257795621279, "learning_rate": 1.9905074608457843e-05, "loss": 1.0522, "step": 1802 }, { "epoch": 0.07, "grad_norm": 6.289875723341624, "learning_rate": 1.990489519602191e-05, "loss": 1.2414, "step": 1803 }, { "epoch": 0.07, "grad_norm": 8.894820781939416, "learning_rate": 1.9904715615008092e-05, "loss": 1.6934, "step": 1804 }, { "epoch": 0.07, "grad_norm": 6.429420225885721, "learning_rate": 1.990453586541945e-05, "loss": 1.2645, "step": 1805 }, { "epoch": 0.07, "grad_norm": 5.206983720684911, "learning_rate": 1.990435594725904e-05, "loss": 1.1669, "step": 1806 }, { "epoch": 0.07, "grad_norm": 6.429944245912889, "learning_rate": 1.9904175860529922e-05, "loss": 1.0718, "step": 1807 }, { "epoch": 0.07, "grad_norm": 8.344498414761276, "learning_rate": 1.9903995605235172e-05, "loss": 1.657, "step": 1808 }, { "epoch": 0.07, "grad_norm": 5.1099072782204615, "learning_rate": 1.9903815181377847e-05, "loss": 1.2116, "step": 1809 }, { "epoch": 0.07, "grad_norm": 7.612526679436819, "learning_rate": 1.990363458896102e-05, "loss": 1.504, "step": 1810 }, { "epoch": 0.07, "grad_norm": 7.185900296286562, "learning_rate": 1.9903453827987765e-05, "loss": 1.3022, "step": 1811 }, { "epoch": 0.07, "grad_norm": 4.95702676462048, "learning_rate": 1.990327289846116e-05, "loss": 1.2666, "step": 1812 }, { "epoch": 0.07, "grad_norm": 6.220178227341689, "learning_rate": 1.9903091800384283e-05, "loss": 1.2833, "step": 1813 }, { "epoch": 0.07, "grad_norm": 7.082063465426698, "learning_rate": 1.9902910533760217e-05, "loss": 1.4117, "step": 1814 }, { "epoch": 0.07, "grad_norm": 6.884736141590249, "learning_rate": 1.990272909859205e-05, "loss": 1.2908, "step": 1815 }, { "epoch": 0.07, "grad_norm": 6.5416372703163175, "learning_rate": 1.9902547494882865e-05, "loss": 1.7995, "step": 1816 }, { "epoch": 0.07, "grad_norm": 6.925866921431485, "learning_rate": 1.9902365722635753e-05, "loss": 1.6132, "step": 1817 }, { "epoch": 0.07, "grad_norm": 5.922915269005915, "learning_rate": 1.990218378185381e-05, "loss": 1.4446, "step": 1818 }, { "epoch": 0.07, "grad_norm": 5.795380954368281, "learning_rate": 1.990200167254013e-05, "loss": 1.3096, "step": 1819 }, { "epoch": 0.07, "grad_norm": 5.195386670293692, "learning_rate": 1.990181939469782e-05, "loss": 1.3163, "step": 1820 }, { "epoch": 0.07, "grad_norm": 5.912284694575628, "learning_rate": 1.9901636948329974e-05, "loss": 1.4034, "step": 1821 }, { "epoch": 0.07, "grad_norm": 7.32285080006908, "learning_rate": 1.99014543334397e-05, "loss": 1.4367, "step": 1822 }, { "epoch": 0.07, "grad_norm": 5.598316673353837, "learning_rate": 1.9901271550030107e-05, "loss": 1.5952, "step": 1823 }, { "epoch": 0.07, "grad_norm": 5.514889830843357, "learning_rate": 1.9901088598104305e-05, "loss": 1.3828, "step": 1824 }, { "epoch": 0.07, "grad_norm": 8.739268656764798, "learning_rate": 1.99009054776654e-05, "loss": 1.2074, "step": 1825 }, { "epoch": 0.07, "grad_norm": 6.741307080442253, "learning_rate": 1.9900722188716526e-05, "loss": 1.7741, "step": 1826 }, { "epoch": 0.07, "grad_norm": 7.838050987195043, "learning_rate": 1.9900538731260786e-05, "loss": 1.873, "step": 1827 }, { "epoch": 0.07, "grad_norm": 6.035000673892894, "learning_rate": 1.9900355105301314e-05, "loss": 1.4176, "step": 1828 }, { "epoch": 0.07, "grad_norm": 7.0587549412363995, "learning_rate": 1.990017131084123e-05, "loss": 1.513, "step": 1829 }, { "epoch": 0.07, "grad_norm": 5.687590853577961, "learning_rate": 1.9899987347883662e-05, "loss": 1.2701, "step": 1830 }, { "epoch": 0.07, "grad_norm": 7.001211163053075, "learning_rate": 1.989980321643174e-05, "loss": 1.5076, "step": 1831 }, { "epoch": 0.07, "grad_norm": 5.544513499111193, "learning_rate": 1.98996189164886e-05, "loss": 1.4251, "step": 1832 }, { "epoch": 0.07, "grad_norm": 3.843430567631214, "learning_rate": 1.989943444805738e-05, "loss": 1.1513, "step": 1833 }, { "epoch": 0.07, "grad_norm": 4.899542014326216, "learning_rate": 1.9899249811141215e-05, "loss": 1.2903, "step": 1834 }, { "epoch": 0.07, "grad_norm": 6.877236704650846, "learning_rate": 1.989906500574325e-05, "loss": 1.3307, "step": 1835 }, { "epoch": 0.07, "grad_norm": 7.51169605999953, "learning_rate": 1.9898880031866632e-05, "loss": 1.4188, "step": 1836 }, { "epoch": 0.07, "grad_norm": 6.844452648018087, "learning_rate": 1.9898694889514506e-05, "loss": 1.647, "step": 1837 }, { "epoch": 0.07, "grad_norm": 10.61160811105431, "learning_rate": 1.9898509578690027e-05, "loss": 1.2722, "step": 1838 }, { "epoch": 0.07, "grad_norm": 9.356485262920222, "learning_rate": 1.9898324099396346e-05, "loss": 1.5551, "step": 1839 }, { "epoch": 0.07, "grad_norm": 4.711797400099966, "learning_rate": 1.9898138451636618e-05, "loss": 1.2139, "step": 1840 }, { "epoch": 0.07, "grad_norm": 8.250110614898936, "learning_rate": 1.9897952635414007e-05, "loss": 1.3928, "step": 1841 }, { "epoch": 0.07, "grad_norm": 6.960504154851947, "learning_rate": 1.9897766650731677e-05, "loss": 1.422, "step": 1842 }, { "epoch": 0.07, "grad_norm": 5.551607915883396, "learning_rate": 1.9897580497592784e-05, "loss": 1.4695, "step": 1843 }, { "epoch": 0.07, "grad_norm": 5.377224571925624, "learning_rate": 1.9897394176000508e-05, "loss": 1.1154, "step": 1844 }, { "epoch": 0.07, "grad_norm": 5.60280508667392, "learning_rate": 1.989720768595801e-05, "loss": 1.2947, "step": 1845 }, { "epoch": 0.07, "grad_norm": 5.756358955303432, "learning_rate": 1.989702102746847e-05, "loss": 1.2054, "step": 1846 }, { "epoch": 0.07, "grad_norm": 7.571809357760466, "learning_rate": 1.9896834200535062e-05, "loss": 1.4072, "step": 1847 }, { "epoch": 0.07, "grad_norm": 7.238190866052183, "learning_rate": 1.989664720516097e-05, "loss": 1.249, "step": 1848 }, { "epoch": 0.07, "grad_norm": 5.007015988014855, "learning_rate": 1.989646004134937e-05, "loss": 1.3853, "step": 1849 }, { "epoch": 0.07, "grad_norm": 10.08241283981454, "learning_rate": 1.9896272709103455e-05, "loss": 1.8438, "step": 1850 }, { "epoch": 0.07, "grad_norm": 5.4160827198627715, "learning_rate": 1.9896085208426405e-05, "loss": 1.1178, "step": 1851 }, { "epoch": 0.07, "grad_norm": 6.167960950431204, "learning_rate": 1.9895897539321418e-05, "loss": 1.382, "step": 1852 }, { "epoch": 0.07, "grad_norm": 7.653280038247735, "learning_rate": 1.9895709701791687e-05, "loss": 1.4701, "step": 1853 }, { "epoch": 0.07, "grad_norm": 6.878093247756022, "learning_rate": 1.9895521695840406e-05, "loss": 1.1745, "step": 1854 }, { "epoch": 0.07, "grad_norm": 6.541696865083197, "learning_rate": 1.9895333521470775e-05, "loss": 1.1126, "step": 1855 }, { "epoch": 0.07, "grad_norm": 6.723480103350438, "learning_rate": 1.9895145178686e-05, "loss": 1.5545, "step": 1856 }, { "epoch": 0.07, "grad_norm": 10.660580261372116, "learning_rate": 1.9894956667489286e-05, "loss": 1.3655, "step": 1857 }, { "epoch": 0.07, "grad_norm": 6.37325426878096, "learning_rate": 1.9894767987883835e-05, "loss": 1.2893, "step": 1858 }, { "epoch": 0.07, "grad_norm": 6.905116105498393, "learning_rate": 1.9894579139872868e-05, "loss": 1.3496, "step": 1859 }, { "epoch": 0.07, "grad_norm": 3.9599799363560337, "learning_rate": 1.9894390123459592e-05, "loss": 1.2177, "step": 1860 }, { "epoch": 0.07, "grad_norm": 6.216751001268892, "learning_rate": 1.9894200938647224e-05, "loss": 1.2694, "step": 1861 }, { "epoch": 0.08, "grad_norm": 8.087187324380208, "learning_rate": 1.989401158543899e-05, "loss": 0.9905, "step": 1862 }, { "epoch": 0.08, "grad_norm": 7.405171557016595, "learning_rate": 1.9893822063838108e-05, "loss": 1.4704, "step": 1863 }, { "epoch": 0.08, "grad_norm": 7.728880050427066, "learning_rate": 1.9893632373847803e-05, "loss": 1.5873, "step": 1864 }, { "epoch": 0.08, "grad_norm": 9.6554711554294, "learning_rate": 1.989344251547131e-05, "loss": 1.5349, "step": 1865 }, { "epoch": 0.08, "grad_norm": 4.216398772182142, "learning_rate": 1.989325248871185e-05, "loss": 1.0868, "step": 1866 }, { "epoch": 0.08, "grad_norm": 7.735331624295228, "learning_rate": 1.989306229357266e-05, "loss": 1.2725, "step": 1867 }, { "epoch": 0.08, "grad_norm": 5.472735416115238, "learning_rate": 1.9892871930056987e-05, "loss": 1.3095, "step": 1868 }, { "epoch": 0.08, "grad_norm": 6.414520252337363, "learning_rate": 1.9892681398168062e-05, "loss": 1.0833, "step": 1869 }, { "epoch": 0.08, "grad_norm": 5.034319195959007, "learning_rate": 1.9892490697909127e-05, "loss": 1.3502, "step": 1870 }, { "epoch": 0.08, "grad_norm": 5.290917496925557, "learning_rate": 1.989229982928343e-05, "loss": 1.3774, "step": 1871 }, { "epoch": 0.08, "grad_norm": 6.5135485726364815, "learning_rate": 1.989210879229422e-05, "loss": 1.1419, "step": 1872 }, { "epoch": 0.08, "grad_norm": 9.521960658831057, "learning_rate": 1.989191758694475e-05, "loss": 1.8081, "step": 1873 }, { "epoch": 0.08, "grad_norm": 6.330767326989305, "learning_rate": 1.9891726213238267e-05, "loss": 1.246, "step": 1874 }, { "epoch": 0.08, "grad_norm": 5.909940074987961, "learning_rate": 1.9891534671178035e-05, "loss": 1.5355, "step": 1875 }, { "epoch": 0.08, "grad_norm": 5.391192753244623, "learning_rate": 1.9891342960767312e-05, "loss": 1.2977, "step": 1876 }, { "epoch": 0.08, "grad_norm": 6.809855256836489, "learning_rate": 1.9891151082009364e-05, "loss": 1.4352, "step": 1877 }, { "epoch": 0.08, "grad_norm": 6.645338893387367, "learning_rate": 1.9890959034907452e-05, "loss": 1.3508, "step": 1878 }, { "epoch": 0.08, "grad_norm": 6.472118473117855, "learning_rate": 1.9890766819464847e-05, "loss": 1.5358, "step": 1879 }, { "epoch": 0.08, "grad_norm": 7.074032989461991, "learning_rate": 1.9890574435684818e-05, "loss": 1.5075, "step": 1880 }, { "epoch": 0.08, "grad_norm": 6.992202967631186, "learning_rate": 1.989038188357064e-05, "loss": 1.4892, "step": 1881 }, { "epoch": 0.08, "grad_norm": 32.29535162413436, "learning_rate": 1.9890189163125594e-05, "loss": 1.2364, "step": 1882 }, { "epoch": 0.08, "grad_norm": 9.303141467011429, "learning_rate": 1.9889996274352955e-05, "loss": 1.4995, "step": 1883 }, { "epoch": 0.08, "grad_norm": 9.961460165241721, "learning_rate": 1.988980321725601e-05, "loss": 1.2742, "step": 1884 }, { "epoch": 0.08, "grad_norm": 10.903422782355507, "learning_rate": 1.9889609991838042e-05, "loss": 1.7074, "step": 1885 }, { "epoch": 0.08, "grad_norm": 7.291231553119069, "learning_rate": 1.988941659810234e-05, "loss": 1.2152, "step": 1886 }, { "epoch": 0.08, "grad_norm": 7.835324326467648, "learning_rate": 1.9889223036052198e-05, "loss": 1.5583, "step": 1887 }, { "epoch": 0.08, "grad_norm": 8.607203745269564, "learning_rate": 1.988902930569091e-05, "loss": 1.1908, "step": 1888 }, { "epoch": 0.08, "grad_norm": 9.733304406953474, "learning_rate": 1.9888835407021765e-05, "loss": 1.5966, "step": 1889 }, { "epoch": 0.08, "grad_norm": 5.715246070931457, "learning_rate": 1.9888641340048075e-05, "loss": 1.5123, "step": 1890 }, { "epoch": 0.08, "grad_norm": 6.3958815925201735, "learning_rate": 1.9888447104773133e-05, "loss": 1.277, "step": 1891 }, { "epoch": 0.08, "grad_norm": 8.214163076957846, "learning_rate": 1.9888252701200253e-05, "loss": 1.5097, "step": 1892 }, { "epoch": 0.08, "grad_norm": 7.043811599076709, "learning_rate": 1.988805812933274e-05, "loss": 1.3704, "step": 1893 }, { "epoch": 0.08, "grad_norm": 6.016743686393313, "learning_rate": 1.9887863389173908e-05, "loss": 1.3935, "step": 1894 }, { "epoch": 0.08, "grad_norm": 8.426743542356977, "learning_rate": 1.9887668480727066e-05, "loss": 1.3407, "step": 1895 }, { "epoch": 0.08, "grad_norm": 6.797615012238835, "learning_rate": 1.9887473403995534e-05, "loss": 1.2813, "step": 1896 }, { "epoch": 0.08, "grad_norm": 10.0018845640442, "learning_rate": 1.988727815898263e-05, "loss": 1.6259, "step": 1897 }, { "epoch": 0.08, "grad_norm": 5.484024172122658, "learning_rate": 1.9887082745691685e-05, "loss": 1.3847, "step": 1898 }, { "epoch": 0.08, "grad_norm": 5.581495327619253, "learning_rate": 1.988688716412602e-05, "loss": 1.2899, "step": 1899 }, { "epoch": 0.08, "grad_norm": 6.148368174049432, "learning_rate": 1.988669141428896e-05, "loss": 1.492, "step": 1900 }, { "epoch": 0.08, "grad_norm": 5.058422598656176, "learning_rate": 1.988649549618384e-05, "loss": 1.2799, "step": 1901 }, { "epoch": 0.08, "grad_norm": 7.099246718032856, "learning_rate": 1.988629940981399e-05, "loss": 1.3575, "step": 1902 }, { "epoch": 0.08, "grad_norm": 6.75550331744537, "learning_rate": 1.988610315518276e-05, "loss": 1.4692, "step": 1903 }, { "epoch": 0.08, "grad_norm": 5.6383837643469485, "learning_rate": 1.9885906732293473e-05, "loss": 1.5472, "step": 1904 }, { "epoch": 0.08, "grad_norm": 6.307233565069014, "learning_rate": 1.9885710141149488e-05, "loss": 1.2707, "step": 1905 }, { "epoch": 0.08, "grad_norm": 6.250488980604428, "learning_rate": 1.988551338175414e-05, "loss": 1.3601, "step": 1906 }, { "epoch": 0.08, "grad_norm": 6.94454103123784, "learning_rate": 1.988531645411078e-05, "loss": 1.5703, "step": 1907 }, { "epoch": 0.08, "grad_norm": 5.47266529671148, "learning_rate": 1.9885119358222765e-05, "loss": 1.2022, "step": 1908 }, { "epoch": 0.08, "grad_norm": 9.304969827303509, "learning_rate": 1.9884922094093445e-05, "loss": 1.9959, "step": 1909 }, { "epoch": 0.08, "grad_norm": 6.195758014392338, "learning_rate": 1.9884724661726176e-05, "loss": 1.5371, "step": 1910 }, { "epoch": 0.08, "grad_norm": 7.536575092917892, "learning_rate": 1.988452706112432e-05, "loss": 1.3836, "step": 1911 }, { "epoch": 0.08, "grad_norm": 5.915181813887974, "learning_rate": 1.988432929229124e-05, "loss": 1.4546, "step": 1912 }, { "epoch": 0.08, "grad_norm": 9.081173340000653, "learning_rate": 1.9884131355230303e-05, "loss": 1.3403, "step": 1913 }, { "epoch": 0.08, "grad_norm": 6.793320254187598, "learning_rate": 1.988393324994488e-05, "loss": 1.444, "step": 1914 }, { "epoch": 0.08, "grad_norm": 4.313101418786349, "learning_rate": 1.9883734976438335e-05, "loss": 1.2467, "step": 1915 }, { "epoch": 0.08, "grad_norm": 5.947451746894204, "learning_rate": 1.988353653471405e-05, "loss": 1.1859, "step": 1916 }, { "epoch": 0.08, "grad_norm": 6.045131783586373, "learning_rate": 1.9883337924775397e-05, "loss": 1.4838, "step": 1917 }, { "epoch": 0.08, "grad_norm": 6.623865516138536, "learning_rate": 1.9883139146625763e-05, "loss": 1.472, "step": 1918 }, { "epoch": 0.08, "grad_norm": 7.848043277361003, "learning_rate": 1.9882940200268525e-05, "loss": 1.2968, "step": 1919 }, { "epoch": 0.08, "grad_norm": 7.028172657779085, "learning_rate": 1.988274108570707e-05, "loss": 1.4765, "step": 1920 }, { "epoch": 0.08, "grad_norm": 6.305263445746768, "learning_rate": 1.988254180294479e-05, "loss": 1.4632, "step": 1921 }, { "epoch": 0.08, "grad_norm": 6.448802281639567, "learning_rate": 1.9882342351985072e-05, "loss": 1.4273, "step": 1922 }, { "epoch": 0.08, "grad_norm": 6.101088181749497, "learning_rate": 1.9882142732831316e-05, "loss": 1.4528, "step": 1923 }, { "epoch": 0.08, "grad_norm": 5.727522198138094, "learning_rate": 1.9881942945486916e-05, "loss": 1.3927, "step": 1924 }, { "epoch": 0.08, "grad_norm": 5.65456179362216, "learning_rate": 1.988174298995527e-05, "loss": 1.4535, "step": 1925 }, { "epoch": 0.08, "grad_norm": 7.244434902691974, "learning_rate": 1.9881542866239786e-05, "loss": 1.1475, "step": 1926 }, { "epoch": 0.08, "grad_norm": 5.339281823390445, "learning_rate": 1.988134257434387e-05, "loss": 1.0999, "step": 1927 }, { "epoch": 0.08, "grad_norm": 6.4584276036091195, "learning_rate": 1.9881142114270928e-05, "loss": 1.2863, "step": 1928 }, { "epoch": 0.08, "grad_norm": 5.344711810640193, "learning_rate": 1.9880941486024375e-05, "loss": 1.3002, "step": 1929 }, { "epoch": 0.08, "grad_norm": 6.351686169973188, "learning_rate": 1.988074068960762e-05, "loss": 1.3365, "step": 1930 }, { "epoch": 0.08, "grad_norm": 6.769722034786174, "learning_rate": 1.9880539725024082e-05, "loss": 1.5159, "step": 1931 }, { "epoch": 0.08, "grad_norm": 5.795344799719702, "learning_rate": 1.9880338592277188e-05, "loss": 1.4249, "step": 1932 }, { "epoch": 0.08, "grad_norm": 49.66664152571337, "learning_rate": 1.9880137291370356e-05, "loss": 1.5723, "step": 1933 }, { "epoch": 0.08, "grad_norm": 7.549454416769481, "learning_rate": 1.987993582230701e-05, "loss": 1.1613, "step": 1934 }, { "epoch": 0.08, "grad_norm": 8.131330768663847, "learning_rate": 1.9879734185090585e-05, "loss": 1.3872, "step": 1935 }, { "epoch": 0.08, "grad_norm": 12.411637757687284, "learning_rate": 1.9879532379724507e-05, "loss": 1.9966, "step": 1936 }, { "epoch": 0.08, "grad_norm": 7.771896070647335, "learning_rate": 1.9879330406212214e-05, "loss": 1.5774, "step": 1937 }, { "epoch": 0.08, "grad_norm": 7.3398532727425145, "learning_rate": 1.9879128264557142e-05, "loss": 1.2974, "step": 1938 }, { "epoch": 0.08, "grad_norm": 20.52315372868552, "learning_rate": 1.987892595476273e-05, "loss": 1.4551, "step": 1939 }, { "epoch": 0.08, "grad_norm": 6.4556568476893545, "learning_rate": 1.9878723476832427e-05, "loss": 1.4021, "step": 1940 }, { "epoch": 0.08, "grad_norm": 6.155423123645429, "learning_rate": 1.9878520830769675e-05, "loss": 1.6988, "step": 1941 }, { "epoch": 0.08, "grad_norm": 8.36117962714811, "learning_rate": 1.9878318016577917e-05, "loss": 1.2595, "step": 1942 }, { "epoch": 0.08, "grad_norm": 9.08492112780158, "learning_rate": 1.9878115034260616e-05, "loss": 1.5837, "step": 1943 }, { "epoch": 0.08, "grad_norm": 7.869138684675926, "learning_rate": 1.9877911883821223e-05, "loss": 1.5631, "step": 1944 }, { "epoch": 0.08, "grad_norm": 10.0845420574577, "learning_rate": 1.987770856526319e-05, "loss": 1.3692, "step": 1945 }, { "epoch": 0.08, "grad_norm": 8.225301032281044, "learning_rate": 1.9877505078589988e-05, "loss": 1.2714, "step": 1946 }, { "epoch": 0.08, "grad_norm": 8.046137332716192, "learning_rate": 1.9877301423805068e-05, "loss": 1.3963, "step": 1947 }, { "epoch": 0.08, "grad_norm": 11.325081274730213, "learning_rate": 1.9877097600911903e-05, "loss": 1.4354, "step": 1948 }, { "epoch": 0.08, "grad_norm": 6.458713001584281, "learning_rate": 1.9876893609913965e-05, "loss": 1.0767, "step": 1949 }, { "epoch": 0.08, "grad_norm": 6.7971848401223545, "learning_rate": 1.9876689450814718e-05, "loss": 1.3194, "step": 1950 }, { "epoch": 0.08, "grad_norm": 8.667725455264923, "learning_rate": 1.9876485123617643e-05, "loss": 2.0666, "step": 1951 }, { "epoch": 0.08, "grad_norm": 8.004045602884313, "learning_rate": 1.9876280628326215e-05, "loss": 1.3416, "step": 1952 }, { "epoch": 0.08, "grad_norm": 7.164829069345282, "learning_rate": 1.987607596494391e-05, "loss": 1.437, "step": 1953 }, { "epoch": 0.08, "grad_norm": 5.952126644912351, "learning_rate": 1.9875871133474223e-05, "loss": 1.5062, "step": 1954 }, { "epoch": 0.08, "grad_norm": 5.2870983332966714, "learning_rate": 1.987566613392063e-05, "loss": 1.294, "step": 1955 }, { "epoch": 0.08, "grad_norm": 14.100405620816359, "learning_rate": 1.987546096628662e-05, "loss": 1.7477, "step": 1956 }, { "epoch": 0.08, "grad_norm": 6.342764950449752, "learning_rate": 1.9875255630575693e-05, "loss": 1.4715, "step": 1957 }, { "epoch": 0.08, "grad_norm": 6.137647740131131, "learning_rate": 1.9875050126791334e-05, "loss": 1.2572, "step": 1958 }, { "epoch": 0.08, "grad_norm": 6.747413580892788, "learning_rate": 1.987484445493705e-05, "loss": 1.5153, "step": 1959 }, { "epoch": 0.08, "grad_norm": 5.827275100563909, "learning_rate": 1.9874638615016337e-05, "loss": 1.2175, "step": 1960 }, { "epoch": 0.08, "grad_norm": 9.819189551363388, "learning_rate": 1.9874432607032696e-05, "loss": 1.5515, "step": 1961 }, { "epoch": 0.08, "grad_norm": 6.391896935253675, "learning_rate": 1.9874226430989637e-05, "loss": 1.6351, "step": 1962 }, { "epoch": 0.08, "grad_norm": 50.76994407845396, "learning_rate": 1.9874020086890664e-05, "loss": 1.8266, "step": 1963 }, { "epoch": 0.08, "grad_norm": 12.729947535579848, "learning_rate": 1.9873813574739293e-05, "loss": 1.32, "step": 1964 }, { "epoch": 0.08, "grad_norm": 71.17585192663728, "learning_rate": 1.9873606894539043e-05, "loss": 2.0312, "step": 1965 }, { "epoch": 0.08, "grad_norm": 9.982800640683587, "learning_rate": 1.9873400046293422e-05, "loss": 1.2879, "step": 1966 }, { "epoch": 0.08, "grad_norm": 5.2020414650697235, "learning_rate": 1.987319303000595e-05, "loss": 1.4783, "step": 1967 }, { "epoch": 0.08, "grad_norm": 9.610199168823236, "learning_rate": 1.9872985845680164e-05, "loss": 1.484, "step": 1968 }, { "epoch": 0.08, "grad_norm": 11.680698284824642, "learning_rate": 1.987277849331958e-05, "loss": 1.7848, "step": 1969 }, { "epoch": 0.08, "grad_norm": 6.356293134043534, "learning_rate": 1.987257097292773e-05, "loss": 1.4553, "step": 1970 }, { "epoch": 0.08, "grad_norm": 5.543439321803046, "learning_rate": 1.9872363284508143e-05, "loss": 1.5538, "step": 1971 }, { "epoch": 0.08, "grad_norm": 7.886208274946951, "learning_rate": 1.9872155428064356e-05, "loss": 1.2102, "step": 1972 }, { "epoch": 0.08, "grad_norm": 7.030215927281574, "learning_rate": 1.9871947403599904e-05, "loss": 1.4892, "step": 1973 }, { "epoch": 0.08, "grad_norm": 9.427589184249705, "learning_rate": 1.9871739211118332e-05, "loss": 1.3802, "step": 1974 }, { "epoch": 0.08, "grad_norm": 18.9571556013096, "learning_rate": 1.987153085062318e-05, "loss": 2.2489, "step": 1975 }, { "epoch": 0.08, "grad_norm": 8.13438091681017, "learning_rate": 1.9871322322117998e-05, "loss": 0.9751, "step": 1976 }, { "epoch": 0.08, "grad_norm": 11.340738806307808, "learning_rate": 1.987111362560633e-05, "loss": 1.4222, "step": 1977 }, { "epoch": 0.08, "grad_norm": 9.066015523174409, "learning_rate": 1.9870904761091728e-05, "loss": 1.4344, "step": 1978 }, { "epoch": 0.08, "grad_norm": 7.376372344209014, "learning_rate": 1.987069572857775e-05, "loss": 1.2253, "step": 1979 }, { "epoch": 0.08, "grad_norm": 8.567556856907022, "learning_rate": 1.9870486528067957e-05, "loss": 1.4916, "step": 1980 }, { "epoch": 0.08, "grad_norm": 8.352045204240113, "learning_rate": 1.9870277159565902e-05, "loss": 1.4423, "step": 1981 }, { "epoch": 0.08, "grad_norm": 11.758888762307567, "learning_rate": 1.987006762307515e-05, "loss": 1.7159, "step": 1982 }, { "epoch": 0.08, "grad_norm": 6.705161845761703, "learning_rate": 1.9869857918599273e-05, "loss": 1.4703, "step": 1983 }, { "epoch": 0.08, "grad_norm": 6.080581947700933, "learning_rate": 1.986964804614183e-05, "loss": 1.3423, "step": 1984 }, { "epoch": 0.08, "grad_norm": 12.801902283726717, "learning_rate": 1.9869438005706404e-05, "loss": 1.5858, "step": 1985 }, { "epoch": 0.08, "grad_norm": 7.194431286339335, "learning_rate": 1.9869227797296565e-05, "loss": 1.6454, "step": 1986 }, { "epoch": 0.08, "grad_norm": 7.033776752601409, "learning_rate": 1.9869017420915888e-05, "loss": 1.5596, "step": 1987 }, { "epoch": 0.08, "grad_norm": 8.258548302769505, "learning_rate": 1.9868806876567954e-05, "loss": 1.4228, "step": 1988 }, { "epoch": 0.08, "grad_norm": 16.175625414139162, "learning_rate": 1.9868596164256352e-05, "loss": 1.729, "step": 1989 }, { "epoch": 0.08, "grad_norm": 6.4132349723929, "learning_rate": 1.9868385283984665e-05, "loss": 1.4251, "step": 1990 }, { "epoch": 0.08, "grad_norm": 7.928297429089372, "learning_rate": 1.986817423575648e-05, "loss": 1.6279, "step": 1991 }, { "epoch": 0.08, "grad_norm": 11.732851828082064, "learning_rate": 1.986796301957539e-05, "loss": 1.5941, "step": 1992 }, { "epoch": 0.08, "grad_norm": 9.577160730653409, "learning_rate": 1.986775163544499e-05, "loss": 1.7442, "step": 1993 }, { "epoch": 0.08, "grad_norm": 6.3395855298044035, "learning_rate": 1.986754008336888e-05, "loss": 1.1593, "step": 1994 }, { "epoch": 0.08, "grad_norm": 6.9369983081322015, "learning_rate": 1.9867328363350658e-05, "loss": 1.2943, "step": 1995 }, { "epoch": 0.08, "grad_norm": 8.393487589835019, "learning_rate": 1.986711647539393e-05, "loss": 1.6736, "step": 1996 }, { "epoch": 0.08, "grad_norm": 7.560275518553596, "learning_rate": 1.98669044195023e-05, "loss": 1.3487, "step": 1997 }, { "epoch": 0.08, "grad_norm": 7.126566641487436, "learning_rate": 1.9866692195679375e-05, "loss": 1.6894, "step": 1998 }, { "epoch": 0.08, "grad_norm": 6.119523502501558, "learning_rate": 1.986647980392877e-05, "loss": 1.355, "step": 1999 }, { "epoch": 0.08, "grad_norm": 7.682670636332944, "learning_rate": 1.9866267244254103e-05, "loss": 1.1045, "step": 2000 }, { "epoch": 0.08, "grad_norm": 7.478065499779025, "learning_rate": 1.9866054516658985e-05, "loss": 1.3317, "step": 2001 }, { "epoch": 0.08, "grad_norm": 6.898600535774286, "learning_rate": 1.986584162114704e-05, "loss": 1.4485, "step": 2002 }, { "epoch": 0.08, "grad_norm": 6.062876729158292, "learning_rate": 1.986562855772189e-05, "loss": 1.358, "step": 2003 }, { "epoch": 0.08, "grad_norm": 6.273172695020129, "learning_rate": 1.986541532638716e-05, "loss": 1.2718, "step": 2004 }, { "epoch": 0.08, "grad_norm": 9.613439102606318, "learning_rate": 1.9865201927146486e-05, "loss": 1.4793, "step": 2005 }, { "epoch": 0.08, "grad_norm": 8.441628410394232, "learning_rate": 1.986498836000349e-05, "loss": 0.9922, "step": 2006 }, { "epoch": 0.08, "grad_norm": 5.202889276326259, "learning_rate": 1.986477462496182e-05, "loss": 1.355, "step": 2007 }, { "epoch": 0.08, "grad_norm": 7.3635050064863385, "learning_rate": 1.9864560722025098e-05, "loss": 1.5329, "step": 2008 }, { "epoch": 0.08, "grad_norm": 8.41000838214148, "learning_rate": 1.9864346651196976e-05, "loss": 1.6468, "step": 2009 }, { "epoch": 0.08, "grad_norm": 6.692878247565852, "learning_rate": 1.9864132412481094e-05, "loss": 1.0813, "step": 2010 }, { "epoch": 0.08, "grad_norm": 7.738002300884941, "learning_rate": 1.98639180058811e-05, "loss": 1.0475, "step": 2011 }, { "epoch": 0.08, "grad_norm": 6.0385714517372096, "learning_rate": 1.986370343140064e-05, "loss": 1.407, "step": 2012 }, { "epoch": 0.08, "grad_norm": 13.823605859600802, "learning_rate": 1.9863488689043363e-05, "loss": 1.8481, "step": 2013 }, { "epoch": 0.08, "grad_norm": 8.329536391505625, "learning_rate": 1.9863273778812934e-05, "loss": 1.7969, "step": 2014 }, { "epoch": 0.08, "grad_norm": 6.236965320588755, "learning_rate": 1.9863058700713e-05, "loss": 1.3936, "step": 2015 }, { "epoch": 0.08, "grad_norm": 18.457369005091717, "learning_rate": 1.986284345474723e-05, "loss": 1.8438, "step": 2016 }, { "epoch": 0.08, "grad_norm": 7.718536519614692, "learning_rate": 1.986262804091928e-05, "loss": 1.0857, "step": 2017 }, { "epoch": 0.08, "grad_norm": 12.324054694663023, "learning_rate": 1.986241245923282e-05, "loss": 1.4625, "step": 2018 }, { "epoch": 0.08, "grad_norm": 7.984901507079154, "learning_rate": 1.9862196709691522e-05, "loss": 1.1212, "step": 2019 }, { "epoch": 0.08, "grad_norm": 5.7095705009407975, "learning_rate": 1.9861980792299056e-05, "loss": 1.2673, "step": 2020 }, { "epoch": 0.08, "grad_norm": 6.938045672856821, "learning_rate": 1.9861764707059094e-05, "loss": 1.3506, "step": 2021 }, { "epoch": 0.08, "grad_norm": 11.212348386769186, "learning_rate": 1.9861548453975315e-05, "loss": 1.355, "step": 2022 }, { "epoch": 0.08, "grad_norm": 8.95386858016036, "learning_rate": 1.9861332033051402e-05, "loss": 1.4413, "step": 2023 }, { "epoch": 0.08, "grad_norm": 7.3363356268296815, "learning_rate": 1.9861115444291034e-05, "loss": 1.6278, "step": 2024 }, { "epoch": 0.08, "grad_norm": 7.391962594516691, "learning_rate": 1.9860898687697904e-05, "loss": 1.3893, "step": 2025 }, { "epoch": 0.08, "grad_norm": 10.361945359189987, "learning_rate": 1.9860681763275693e-05, "loss": 1.3278, "step": 2026 }, { "epoch": 0.08, "grad_norm": 7.691938593058895, "learning_rate": 1.98604646710281e-05, "loss": 1.7167, "step": 2027 }, { "epoch": 0.08, "grad_norm": 5.936805249320885, "learning_rate": 1.9860247410958812e-05, "loss": 1.3407, "step": 2028 }, { "epoch": 0.08, "grad_norm": 8.343126756429472, "learning_rate": 1.9860029983071537e-05, "loss": 1.8915, "step": 2029 }, { "epoch": 0.08, "grad_norm": 6.7400645903005385, "learning_rate": 1.9859812387369965e-05, "loss": 1.4172, "step": 2030 }, { "epoch": 0.08, "grad_norm": 5.734630976959931, "learning_rate": 1.9859594623857806e-05, "loss": 1.3161, "step": 2031 }, { "epoch": 0.08, "grad_norm": 6.2376251754373575, "learning_rate": 1.985937669253877e-05, "loss": 1.5185, "step": 2032 }, { "epoch": 0.08, "grad_norm": 4.3593288744757475, "learning_rate": 1.9859158593416554e-05, "loss": 1.119, "step": 2033 }, { "epoch": 0.08, "grad_norm": 7.0016955696470555, "learning_rate": 1.9858940326494877e-05, "loss": 1.4015, "step": 2034 }, { "epoch": 0.08, "grad_norm": 6.093204002074062, "learning_rate": 1.9858721891777457e-05, "loss": 1.4231, "step": 2035 }, { "epoch": 0.08, "grad_norm": 6.546781331476035, "learning_rate": 1.9858503289268007e-05, "loss": 1.0932, "step": 2036 }, { "epoch": 0.08, "grad_norm": 12.104858795656092, "learning_rate": 1.985828451897025e-05, "loss": 1.8787, "step": 2037 }, { "epoch": 0.08, "grad_norm": 7.74825242973996, "learning_rate": 1.9858065580887907e-05, "loss": 1.5114, "step": 2038 }, { "epoch": 0.08, "grad_norm": 6.28404120686469, "learning_rate": 1.9857846475024704e-05, "loss": 1.4589, "step": 2039 }, { "epoch": 0.08, "grad_norm": 6.865812391855606, "learning_rate": 1.9857627201384372e-05, "loss": 1.391, "step": 2040 }, { "epoch": 0.08, "grad_norm": 6.422128495422681, "learning_rate": 1.9857407759970645e-05, "loss": 1.3031, "step": 2041 }, { "epoch": 0.08, "grad_norm": 6.740899762848786, "learning_rate": 1.9857188150787254e-05, "loss": 1.5544, "step": 2042 }, { "epoch": 0.08, "grad_norm": 7.40047595690418, "learning_rate": 1.985696837383794e-05, "loss": 1.4017, "step": 2043 }, { "epoch": 0.08, "grad_norm": 12.454280308096974, "learning_rate": 1.9856748429126438e-05, "loss": 1.0161, "step": 2044 }, { "epoch": 0.08, "grad_norm": 6.502385374574426, "learning_rate": 1.98565283166565e-05, "loss": 1.1975, "step": 2045 }, { "epoch": 0.08, "grad_norm": 7.404091452828083, "learning_rate": 1.985630803643186e-05, "loss": 1.6209, "step": 2046 }, { "epoch": 0.08, "grad_norm": 6.595585731363609, "learning_rate": 1.9856087588456278e-05, "loss": 1.3848, "step": 2047 }, { "epoch": 0.08, "grad_norm": 6.248203717006554, "learning_rate": 1.9855866972733505e-05, "loss": 1.3173, "step": 2048 }, { "epoch": 0.08, "grad_norm": 7.753662816339862, "learning_rate": 1.985564618926729e-05, "loss": 1.1622, "step": 2049 }, { "epoch": 0.08, "grad_norm": 6.480586541317035, "learning_rate": 1.9855425238061393e-05, "loss": 1.5633, "step": 2050 }, { "epoch": 0.08, "grad_norm": 6.341995987455431, "learning_rate": 1.9855204119119572e-05, "loss": 1.356, "step": 2051 }, { "epoch": 0.08, "grad_norm": 5.475278231967259, "learning_rate": 1.98549828324456e-05, "loss": 1.177, "step": 2052 }, { "epoch": 0.08, "grad_norm": 6.932490616127661, "learning_rate": 1.9854761378043234e-05, "loss": 1.2633, "step": 2053 }, { "epoch": 0.08, "grad_norm": 5.451900146705143, "learning_rate": 1.9854539755916245e-05, "loss": 1.4707, "step": 2054 }, { "epoch": 0.08, "grad_norm": 6.21180800605403, "learning_rate": 1.985431796606841e-05, "loss": 1.3845, "step": 2055 }, { "epoch": 0.08, "grad_norm": 5.555010389264111, "learning_rate": 1.9854096008503495e-05, "loss": 1.3051, "step": 2056 }, { "epoch": 0.08, "grad_norm": 8.766848976093007, "learning_rate": 1.9853873883225282e-05, "loss": 1.6173, "step": 2057 }, { "epoch": 0.08, "grad_norm": 8.161336498618304, "learning_rate": 1.9853651590237554e-05, "loss": 1.7824, "step": 2058 }, { "epoch": 0.08, "grad_norm": 10.043646570582553, "learning_rate": 1.9853429129544092e-05, "loss": 1.3753, "step": 2059 }, { "epoch": 0.08, "grad_norm": 9.335500231804678, "learning_rate": 1.985320650114868e-05, "loss": 1.248, "step": 2060 }, { "epoch": 0.08, "grad_norm": 6.486632047340316, "learning_rate": 1.9852983705055112e-05, "loss": 1.1288, "step": 2061 }, { "epoch": 0.08, "grad_norm": 8.015814769319924, "learning_rate": 1.9852760741267177e-05, "loss": 1.395, "step": 2062 }, { "epoch": 0.08, "grad_norm": 7.471297949698899, "learning_rate": 1.985253760978867e-05, "loss": 1.593, "step": 2063 }, { "epoch": 0.08, "grad_norm": 7.615892953807744, "learning_rate": 1.9852314310623385e-05, "loss": 1.3446, "step": 2064 }, { "epoch": 0.08, "grad_norm": 5.6764861541814495, "learning_rate": 1.985209084377513e-05, "loss": 1.224, "step": 2065 }, { "epoch": 0.08, "grad_norm": 7.984682132826466, "learning_rate": 1.9851867209247704e-05, "loss": 1.2029, "step": 2066 }, { "epoch": 0.08, "grad_norm": 8.286963968874016, "learning_rate": 1.9851643407044914e-05, "loss": 1.6169, "step": 2067 }, { "epoch": 0.08, "grad_norm": 5.521578102390017, "learning_rate": 1.9851419437170566e-05, "loss": 1.2492, "step": 2068 }, { "epoch": 0.08, "grad_norm": 6.7339445328126, "learning_rate": 1.9851195299628476e-05, "loss": 1.3129, "step": 2069 }, { "epoch": 0.08, "grad_norm": 9.420731971776329, "learning_rate": 1.985097099442246e-05, "loss": 1.297, "step": 2070 }, { "epoch": 0.08, "grad_norm": 6.925840543569701, "learning_rate": 1.9850746521556328e-05, "loss": 1.4849, "step": 2071 }, { "epoch": 0.08, "grad_norm": 5.524618399040382, "learning_rate": 1.9850521881033907e-05, "loss": 1.6939, "step": 2072 }, { "epoch": 0.08, "grad_norm": 7.213465774504431, "learning_rate": 1.985029707285902e-05, "loss": 1.224, "step": 2073 }, { "epoch": 0.08, "grad_norm": 6.8137052616530065, "learning_rate": 1.9850072097035493e-05, "loss": 1.4034, "step": 2074 }, { "epoch": 0.08, "grad_norm": 7.439276558837327, "learning_rate": 1.984984695356715e-05, "loss": 1.2804, "step": 2075 }, { "epoch": 0.08, "grad_norm": 5.23039211308543, "learning_rate": 1.984962164245783e-05, "loss": 1.1844, "step": 2076 }, { "epoch": 0.08, "grad_norm": 4.997671619595233, "learning_rate": 1.9849396163711363e-05, "loss": 1.1771, "step": 2077 }, { "epoch": 0.08, "grad_norm": 7.899571590860373, "learning_rate": 1.9849170517331587e-05, "loss": 0.9749, "step": 2078 }, { "epoch": 0.08, "grad_norm": 7.4484078862909255, "learning_rate": 1.9848944703322345e-05, "loss": 1.4435, "step": 2079 }, { "epoch": 0.08, "grad_norm": 7.553159947970482, "learning_rate": 1.9848718721687478e-05, "loss": 1.4433, "step": 2080 }, { "epoch": 0.08, "grad_norm": 10.02600219154172, "learning_rate": 1.9848492572430832e-05, "loss": 1.2229, "step": 2081 }, { "epoch": 0.08, "grad_norm": 8.659733282924702, "learning_rate": 1.9848266255556255e-05, "loss": 1.3973, "step": 2082 }, { "epoch": 0.08, "grad_norm": 7.49003489161571, "learning_rate": 1.9848039771067604e-05, "loss": 1.2507, "step": 2083 }, { "epoch": 0.08, "grad_norm": 9.072160922255867, "learning_rate": 1.9847813118968727e-05, "loss": 1.5853, "step": 2084 }, { "epoch": 0.08, "grad_norm": 6.184045547019883, "learning_rate": 1.9847586299263487e-05, "loss": 1.2774, "step": 2085 }, { "epoch": 0.08, "grad_norm": 6.218850798424014, "learning_rate": 1.9847359311955743e-05, "loss": 1.443, "step": 2086 }, { "epoch": 0.08, "grad_norm": 5.081527867587629, "learning_rate": 1.9847132157049357e-05, "loss": 1.2054, "step": 2087 }, { "epoch": 0.08, "grad_norm": 7.058633030220518, "learning_rate": 1.9846904834548196e-05, "loss": 1.159, "step": 2088 }, { "epoch": 0.08, "grad_norm": 7.169304717094344, "learning_rate": 1.9846677344456127e-05, "loss": 1.6185, "step": 2089 }, { "epoch": 0.08, "grad_norm": 7.8593668527065965, "learning_rate": 1.9846449686777025e-05, "loss": 1.284, "step": 2090 }, { "epoch": 0.08, "grad_norm": 6.323128931275353, "learning_rate": 1.984622186151476e-05, "loss": 1.3999, "step": 2091 }, { "epoch": 0.08, "grad_norm": 6.559913890501203, "learning_rate": 1.9845993868673216e-05, "loss": 1.742, "step": 2092 }, { "epoch": 0.08, "grad_norm": 5.8157740668673625, "learning_rate": 1.9845765708256265e-05, "loss": 1.2514, "step": 2093 }, { "epoch": 0.08, "grad_norm": 6.3539166770356195, "learning_rate": 1.98455373802678e-05, "loss": 1.2092, "step": 2094 }, { "epoch": 0.08, "grad_norm": 8.199856207051168, "learning_rate": 1.98453088847117e-05, "loss": 1.2876, "step": 2095 }, { "epoch": 0.08, "grad_norm": 9.521129345934831, "learning_rate": 1.984508022159186e-05, "loss": 1.3859, "step": 2096 }, { "epoch": 0.08, "grad_norm": 58.11827962530425, "learning_rate": 1.9844851390912163e-05, "loss": 1.9576, "step": 2097 }, { "epoch": 0.08, "grad_norm": 10.766235199335897, "learning_rate": 1.984462239267651e-05, "loss": 1.2589, "step": 2098 }, { "epoch": 0.08, "grad_norm": 13.718447267821798, "learning_rate": 1.9844393226888793e-05, "loss": 1.5563, "step": 2099 }, { "epoch": 0.08, "grad_norm": 16.998866019231098, "learning_rate": 1.9844163893552923e-05, "loss": 1.5821, "step": 2100 }, { "epoch": 0.08, "grad_norm": 9.6937655745645, "learning_rate": 1.984393439267279e-05, "loss": 1.4352, "step": 2101 }, { "epoch": 0.08, "grad_norm": 6.05595963983662, "learning_rate": 1.9843704724252308e-05, "loss": 1.4137, "step": 2102 }, { "epoch": 0.08, "grad_norm": 10.119930206321781, "learning_rate": 1.9843474888295388e-05, "loss": 1.185, "step": 2103 }, { "epoch": 0.08, "grad_norm": 12.126472705493255, "learning_rate": 1.9843244884805935e-05, "loss": 1.3151, "step": 2104 }, { "epoch": 0.08, "grad_norm": 15.848231348397727, "learning_rate": 1.9843014713787866e-05, "loss": 1.7134, "step": 2105 }, { "epoch": 0.08, "grad_norm": 5.107833459485515, "learning_rate": 1.98427843752451e-05, "loss": 1.3304, "step": 2106 }, { "epoch": 0.08, "grad_norm": 10.27156983478511, "learning_rate": 1.9842553869181555e-05, "loss": 1.6161, "step": 2107 }, { "epoch": 0.08, "grad_norm": 10.346621323021084, "learning_rate": 1.984232319560116e-05, "loss": 1.5305, "step": 2108 }, { "epoch": 0.08, "grad_norm": 10.06230000514128, "learning_rate": 1.9842092354507835e-05, "loss": 1.4227, "step": 2109 }, { "epoch": 0.08, "grad_norm": 7.405568751567361, "learning_rate": 1.9841861345905505e-05, "loss": 1.2207, "step": 2110 }, { "epoch": 0.09, "grad_norm": 13.997328427118234, "learning_rate": 1.984163016979811e-05, "loss": 2.2131, "step": 2111 }, { "epoch": 0.09, "grad_norm": 10.96372065153358, "learning_rate": 1.9841398826189585e-05, "loss": 1.2718, "step": 2112 }, { "epoch": 0.09, "grad_norm": 6.167961223986975, "learning_rate": 1.9841167315083858e-05, "loss": 1.3156, "step": 2113 }, { "epoch": 0.09, "grad_norm": 7.449055423860171, "learning_rate": 1.9840935636484877e-05, "loss": 1.1934, "step": 2114 }, { "epoch": 0.09, "grad_norm": 9.254981984664196, "learning_rate": 1.9840703790396583e-05, "loss": 1.537, "step": 2115 }, { "epoch": 0.09, "grad_norm": 8.784466640951036, "learning_rate": 1.9840471776822927e-05, "loss": 1.6054, "step": 2116 }, { "epoch": 0.09, "grad_norm": 10.197839595501872, "learning_rate": 1.9840239595767847e-05, "loss": 1.2749, "step": 2117 }, { "epoch": 0.09, "grad_norm": 8.418675478498772, "learning_rate": 1.9840007247235303e-05, "loss": 1.4637, "step": 2118 }, { "epoch": 0.09, "grad_norm": 10.807871732267039, "learning_rate": 1.9839774731229247e-05, "loss": 1.2672, "step": 2119 }, { "epoch": 0.09, "grad_norm": 11.69749765223561, "learning_rate": 1.9839542047753634e-05, "loss": 1.6809, "step": 2120 }, { "epoch": 0.09, "grad_norm": 7.431462192437819, "learning_rate": 1.9839309196812428e-05, "loss": 1.0529, "step": 2121 }, { "epoch": 0.09, "grad_norm": 9.588562292992778, "learning_rate": 1.9839076178409593e-05, "loss": 1.4438, "step": 2122 }, { "epoch": 0.09, "grad_norm": 9.89138412003577, "learning_rate": 1.983884299254909e-05, "loss": 1.3326, "step": 2123 }, { "epoch": 0.09, "grad_norm": 6.851090815440423, "learning_rate": 1.9838609639234892e-05, "loss": 1.1389, "step": 2124 }, { "epoch": 0.09, "grad_norm": 7.115450581213647, "learning_rate": 1.9838376118470965e-05, "loss": 1.3287, "step": 2125 }, { "epoch": 0.09, "grad_norm": 6.598472923197704, "learning_rate": 1.9838142430261293e-05, "loss": 1.2751, "step": 2126 }, { "epoch": 0.09, "grad_norm": 9.309393209448436, "learning_rate": 1.9837908574609842e-05, "loss": 1.7558, "step": 2127 }, { "epoch": 0.09, "grad_norm": 11.218686822109227, "learning_rate": 1.98376745515206e-05, "loss": 1.2729, "step": 2128 }, { "epoch": 0.09, "grad_norm": 7.943066851311584, "learning_rate": 1.983744036099755e-05, "loss": 1.4527, "step": 2129 }, { "epoch": 0.09, "grad_norm": 6.215902907933666, "learning_rate": 1.9837206003044674e-05, "loss": 1.3402, "step": 2130 }, { "epoch": 0.09, "grad_norm": 8.973109497639802, "learning_rate": 1.9836971477665962e-05, "loss": 1.5662, "step": 2131 }, { "epoch": 0.09, "grad_norm": 7.21149359569075, "learning_rate": 1.9836736784865406e-05, "loss": 1.5492, "step": 2132 }, { "epoch": 0.09, "grad_norm": 9.018203124705373, "learning_rate": 1.9836501924646998e-05, "loss": 1.3717, "step": 2133 }, { "epoch": 0.09, "grad_norm": 10.737189938770085, "learning_rate": 1.983626689701474e-05, "loss": 1.1464, "step": 2134 }, { "epoch": 0.09, "grad_norm": 12.515106223177204, "learning_rate": 1.983603170197263e-05, "loss": 1.5919, "step": 2135 }, { "epoch": 0.09, "grad_norm": 7.715792116718323, "learning_rate": 1.983579633952467e-05, "loss": 1.7391, "step": 2136 }, { "epoch": 0.09, "grad_norm": 5.351822784282908, "learning_rate": 1.983556080967486e-05, "loss": 1.2781, "step": 2137 }, { "epoch": 0.09, "grad_norm": 8.121850344172062, "learning_rate": 1.9835325112427223e-05, "loss": 1.3982, "step": 2138 }, { "epoch": 0.09, "grad_norm": 6.5617134171531655, "learning_rate": 1.9835089247785763e-05, "loss": 1.473, "step": 2139 }, { "epoch": 0.09, "grad_norm": 8.812970555955127, "learning_rate": 1.9834853215754488e-05, "loss": 1.5728, "step": 2140 }, { "epoch": 0.09, "grad_norm": 5.853917566749489, "learning_rate": 1.9834617016337424e-05, "loss": 1.2893, "step": 2141 }, { "epoch": 0.09, "grad_norm": 5.8519349787536, "learning_rate": 1.9834380649538586e-05, "loss": 1.1881, "step": 2142 }, { "epoch": 0.09, "grad_norm": 8.536763702885692, "learning_rate": 1.9834144115362e-05, "loss": 1.5859, "step": 2143 }, { "epoch": 0.09, "grad_norm": 5.405226438623203, "learning_rate": 1.983390741381169e-05, "loss": 1.5649, "step": 2144 }, { "epoch": 0.09, "grad_norm": 5.679808971907038, "learning_rate": 1.9833670544891685e-05, "loss": 1.3384, "step": 2145 }, { "epoch": 0.09, "grad_norm": 7.450287416090553, "learning_rate": 1.9833433508606018e-05, "loss": 1.529, "step": 2146 }, { "epoch": 0.09, "grad_norm": 24.249720411959476, "learning_rate": 1.983319630495872e-05, "loss": 1.3843, "step": 2147 }, { "epoch": 0.09, "grad_norm": 7.818429244666359, "learning_rate": 1.983295893395383e-05, "loss": 1.4132, "step": 2148 }, { "epoch": 0.09, "grad_norm": 9.392360692370362, "learning_rate": 1.983272139559539e-05, "loss": 1.5431, "step": 2149 }, { "epoch": 0.09, "grad_norm": 11.029588964620906, "learning_rate": 1.9832483689887438e-05, "loss": 1.2658, "step": 2150 }, { "epoch": 0.09, "grad_norm": 6.551146151310274, "learning_rate": 1.9832245816834023e-05, "loss": 1.4433, "step": 2151 }, { "epoch": 0.09, "grad_norm": 7.922382842549953, "learning_rate": 1.9832007776439194e-05, "loss": 1.4921, "step": 2152 }, { "epoch": 0.09, "grad_norm": 10.601642708939318, "learning_rate": 1.9831769568707e-05, "loss": 1.4252, "step": 2153 }, { "epoch": 0.09, "grad_norm": 7.613467230248213, "learning_rate": 1.98315311936415e-05, "loss": 1.7025, "step": 2154 }, { "epoch": 0.09, "grad_norm": 10.044813847208802, "learning_rate": 1.9831292651246742e-05, "loss": 1.4549, "step": 2155 }, { "epoch": 0.09, "grad_norm": 6.990937121081952, "learning_rate": 1.9831053941526796e-05, "loss": 1.5714, "step": 2156 }, { "epoch": 0.09, "grad_norm": 5.797955484443314, "learning_rate": 1.983081506448572e-05, "loss": 1.3849, "step": 2157 }, { "epoch": 0.09, "grad_norm": 5.243140945057242, "learning_rate": 1.9830576020127577e-05, "loss": 1.1203, "step": 2158 }, { "epoch": 0.09, "grad_norm": 7.194466445992653, "learning_rate": 1.983033680845644e-05, "loss": 1.2226, "step": 2159 }, { "epoch": 0.09, "grad_norm": 5.5177857425702586, "learning_rate": 1.9830097429476377e-05, "loss": 1.3112, "step": 2160 }, { "epoch": 0.09, "grad_norm": 6.037636701157427, "learning_rate": 1.9829857883191466e-05, "loss": 1.655, "step": 2161 }, { "epoch": 0.09, "grad_norm": 7.897443426397933, "learning_rate": 1.982961816960578e-05, "loss": 1.4438, "step": 2162 }, { "epoch": 0.09, "grad_norm": 4.874977978404665, "learning_rate": 1.98293782887234e-05, "loss": 1.255, "step": 2163 }, { "epoch": 0.09, "grad_norm": 9.646579244033633, "learning_rate": 1.9829138240548413e-05, "loss": 1.4475, "step": 2164 }, { "epoch": 0.09, "grad_norm": 5.681079010827928, "learning_rate": 1.9828898025084898e-05, "loss": 1.3579, "step": 2165 }, { "epoch": 0.09, "grad_norm": 8.078049597857948, "learning_rate": 1.9828657642336947e-05, "loss": 1.6121, "step": 2166 }, { "epoch": 0.09, "grad_norm": 5.734300445580301, "learning_rate": 1.9828417092308646e-05, "loss": 1.2257, "step": 2167 }, { "epoch": 0.09, "grad_norm": 5.315772418202527, "learning_rate": 1.9828176375004097e-05, "loss": 1.2124, "step": 2168 }, { "epoch": 0.09, "grad_norm": 33.484953359548484, "learning_rate": 1.9827935490427393e-05, "loss": 1.6639, "step": 2169 }, { "epoch": 0.09, "grad_norm": 6.441566891901978, "learning_rate": 1.9827694438582635e-05, "loss": 1.2254, "step": 2170 }, { "epoch": 0.09, "grad_norm": 8.779562889206817, "learning_rate": 1.9827453219473925e-05, "loss": 1.286, "step": 2171 }, { "epoch": 0.09, "grad_norm": 13.422568988701434, "learning_rate": 1.982721183310537e-05, "loss": 1.4057, "step": 2172 }, { "epoch": 0.09, "grad_norm": 15.170934217107806, "learning_rate": 1.9826970279481072e-05, "loss": 1.5012, "step": 2173 }, { "epoch": 0.09, "grad_norm": 8.619348316438499, "learning_rate": 1.982672855860515e-05, "loss": 1.4628, "step": 2174 }, { "epoch": 0.09, "grad_norm": 119.68899419043944, "learning_rate": 1.9826486670481712e-05, "loss": 3.5284, "step": 2175 }, { "epoch": 0.09, "grad_norm": 25.33062875343638, "learning_rate": 1.982624461511488e-05, "loss": 1.6982, "step": 2176 }, { "epoch": 0.09, "grad_norm": 21.931613305513558, "learning_rate": 1.982600239250877e-05, "loss": 1.3873, "step": 2177 }, { "epoch": 0.09, "grad_norm": 7.827554319094738, "learning_rate": 1.9825760002667506e-05, "loss": 1.3944, "step": 2178 }, { "epoch": 0.09, "grad_norm": 7.107013922183744, "learning_rate": 1.9825517445595212e-05, "loss": 1.5091, "step": 2179 }, { "epoch": 0.09, "grad_norm": 7.081900530013561, "learning_rate": 1.982527472129602e-05, "loss": 1.2978, "step": 2180 }, { "epoch": 0.09, "grad_norm": 12.860203157097322, "learning_rate": 1.9825031829774057e-05, "loss": 1.5169, "step": 2181 }, { "epoch": 0.09, "grad_norm": 11.914802620545139, "learning_rate": 1.9824788771033462e-05, "loss": 1.548, "step": 2182 }, { "epoch": 0.09, "grad_norm": 6.590894387333212, "learning_rate": 1.9824545545078366e-05, "loss": 0.9895, "step": 2183 }, { "epoch": 0.09, "grad_norm": 11.032637597648499, "learning_rate": 1.9824302151912907e-05, "loss": 1.6021, "step": 2184 }, { "epoch": 0.09, "grad_norm": 6.177045173185003, "learning_rate": 1.9824058591541234e-05, "loss": 1.6869, "step": 2185 }, { "epoch": 0.09, "grad_norm": 10.843339249264051, "learning_rate": 1.982381486396749e-05, "loss": 1.4369, "step": 2186 }, { "epoch": 0.09, "grad_norm": 18.37092532836726, "learning_rate": 1.9823570969195822e-05, "loss": 1.5016, "step": 2187 }, { "epoch": 0.09, "grad_norm": 9.132022144009358, "learning_rate": 1.9823326907230384e-05, "loss": 1.5888, "step": 2188 }, { "epoch": 0.09, "grad_norm": 5.716857741974544, "learning_rate": 1.9823082678075323e-05, "loss": 1.3036, "step": 2189 }, { "epoch": 0.09, "grad_norm": 11.557280712698354, "learning_rate": 1.9822838281734807e-05, "loss": 1.9741, "step": 2190 }, { "epoch": 0.09, "grad_norm": 9.67467083225736, "learning_rate": 1.9822593718212985e-05, "loss": 1.6161, "step": 2191 }, { "epoch": 0.09, "grad_norm": 9.74868170786922, "learning_rate": 1.9822348987514025e-05, "loss": 1.1185, "step": 2192 }, { "epoch": 0.09, "grad_norm": 8.65723469052163, "learning_rate": 1.982210408964209e-05, "loss": 1.4581, "step": 2193 }, { "epoch": 0.09, "grad_norm": 8.242940490403177, "learning_rate": 1.9821859024601345e-05, "loss": 1.0819, "step": 2194 }, { "epoch": 0.09, "grad_norm": 9.27109662109477, "learning_rate": 1.9821613792395967e-05, "loss": 0.9394, "step": 2195 }, { "epoch": 0.09, "grad_norm": 9.463391549800129, "learning_rate": 1.9821368393030127e-05, "loss": 1.3728, "step": 2196 }, { "epoch": 0.09, "grad_norm": 7.570914939074665, "learning_rate": 1.9821122826508e-05, "loss": 1.2208, "step": 2197 }, { "epoch": 0.09, "grad_norm": 7.714525603016291, "learning_rate": 1.9820877092833773e-05, "loss": 1.5355, "step": 2198 }, { "epoch": 0.09, "grad_norm": 9.842768182807974, "learning_rate": 1.9820631192011618e-05, "loss": 1.9009, "step": 2199 }, { "epoch": 0.09, "grad_norm": 6.6021668328893774, "learning_rate": 1.9820385124045725e-05, "loss": 1.1475, "step": 2200 }, { "epoch": 0.09, "grad_norm": 6.926957060186036, "learning_rate": 1.9820138888940286e-05, "loss": 1.5232, "step": 2201 }, { "epoch": 0.09, "grad_norm": 10.12948475778242, "learning_rate": 1.9819892486699485e-05, "loss": 1.6132, "step": 2202 }, { "epoch": 0.09, "grad_norm": 6.4955331526551845, "learning_rate": 1.9819645917327516e-05, "loss": 1.4578, "step": 2203 }, { "epoch": 0.09, "grad_norm": 6.037800495883764, "learning_rate": 1.981939918082858e-05, "loss": 1.6176, "step": 2204 }, { "epoch": 0.09, "grad_norm": 5.833791577414573, "learning_rate": 1.9819152277206874e-05, "loss": 1.2693, "step": 2205 }, { "epoch": 0.09, "grad_norm": 12.519088830593718, "learning_rate": 1.98189052064666e-05, "loss": 1.2969, "step": 2206 }, { "epoch": 0.09, "grad_norm": 6.776263505534228, "learning_rate": 1.9818657968611963e-05, "loss": 1.4722, "step": 2207 }, { "epoch": 0.09, "grad_norm": 4.907905931489199, "learning_rate": 1.9818410563647172e-05, "loss": 1.2125, "step": 2208 }, { "epoch": 0.09, "grad_norm": 7.011724837052742, "learning_rate": 1.981816299157644e-05, "loss": 1.0828, "step": 2209 }, { "epoch": 0.09, "grad_norm": 6.75490711624516, "learning_rate": 1.9817915252403973e-05, "loss": 1.326, "step": 2210 }, { "epoch": 0.09, "grad_norm": 6.85572245274576, "learning_rate": 1.9817667346133995e-05, "loss": 1.2886, "step": 2211 }, { "epoch": 0.09, "grad_norm": 5.358118676115781, "learning_rate": 1.9817419272770718e-05, "loss": 1.3831, "step": 2212 }, { "epoch": 0.09, "grad_norm": 4.574388800189298, "learning_rate": 1.981717103231837e-05, "loss": 1.2479, "step": 2213 }, { "epoch": 0.09, "grad_norm": 5.930142040638645, "learning_rate": 1.981692262478118e-05, "loss": 1.1691, "step": 2214 }, { "epoch": 0.09, "grad_norm": 6.293104411838765, "learning_rate": 1.9816674050163366e-05, "loss": 1.6866, "step": 2215 }, { "epoch": 0.09, "grad_norm": 7.979976522826998, "learning_rate": 1.9816425308469163e-05, "loss": 1.9124, "step": 2216 }, { "epoch": 0.09, "grad_norm": 5.775595961718293, "learning_rate": 1.9816176399702806e-05, "loss": 1.2914, "step": 2217 }, { "epoch": 0.09, "grad_norm": 7.821757472794938, "learning_rate": 1.9815927323868528e-05, "loss": 1.6291, "step": 2218 }, { "epoch": 0.09, "grad_norm": 5.464951897816095, "learning_rate": 1.9815678080970574e-05, "loss": 1.3126, "step": 2219 }, { "epoch": 0.09, "grad_norm": 7.623158993669226, "learning_rate": 1.981542867101318e-05, "loss": 1.5193, "step": 2220 }, { "epoch": 0.09, "grad_norm": 11.52193403309273, "learning_rate": 1.9815179094000588e-05, "loss": 1.8671, "step": 2221 }, { "epoch": 0.09, "grad_norm": 5.713833295812381, "learning_rate": 1.9814929349937054e-05, "loss": 1.0225, "step": 2222 }, { "epoch": 0.09, "grad_norm": 8.216328518432011, "learning_rate": 1.9814679438826826e-05, "loss": 1.3291, "step": 2223 }, { "epoch": 0.09, "grad_norm": 7.930286917116389, "learning_rate": 1.9814429360674158e-05, "loss": 1.5232, "step": 2224 }, { "epoch": 0.09, "grad_norm": 5.6446798731208645, "learning_rate": 1.9814179115483303e-05, "loss": 1.3697, "step": 2225 }, { "epoch": 0.09, "grad_norm": 5.398420866200483, "learning_rate": 1.981392870325852e-05, "loss": 1.0964, "step": 2226 }, { "epoch": 0.09, "grad_norm": 6.675559635444843, "learning_rate": 1.9813678124004078e-05, "loss": 1.2761, "step": 2227 }, { "epoch": 0.09, "grad_norm": 7.286088278388259, "learning_rate": 1.981342737772423e-05, "loss": 1.4092, "step": 2228 }, { "epoch": 0.09, "grad_norm": 8.329069935826217, "learning_rate": 1.9813176464423255e-05, "loss": 1.6465, "step": 2229 }, { "epoch": 0.09, "grad_norm": 5.122298097986522, "learning_rate": 1.9812925384105416e-05, "loss": 0.9894, "step": 2230 }, { "epoch": 0.09, "grad_norm": 4.799462363435874, "learning_rate": 1.981267413677499e-05, "loss": 1.3434, "step": 2231 }, { "epoch": 0.09, "grad_norm": 6.972480107966845, "learning_rate": 1.9812422722436254e-05, "loss": 1.443, "step": 2232 }, { "epoch": 0.09, "grad_norm": 4.650724217988361, "learning_rate": 1.9812171141093484e-05, "loss": 1.0992, "step": 2233 }, { "epoch": 0.09, "grad_norm": 6.935915311958544, "learning_rate": 1.981191939275096e-05, "loss": 1.6155, "step": 2234 }, { "epoch": 0.09, "grad_norm": 3.700876084149738, "learning_rate": 1.9811667477412972e-05, "loss": 1.1604, "step": 2235 }, { "epoch": 0.09, "grad_norm": 5.896066610940687, "learning_rate": 1.9811415395083803e-05, "loss": 1.3248, "step": 2236 }, { "epoch": 0.09, "grad_norm": 7.050652963231763, "learning_rate": 1.981116314576775e-05, "loss": 1.3466, "step": 2237 }, { "epoch": 0.09, "grad_norm": 6.051142127930135, "learning_rate": 1.98109107294691e-05, "loss": 1.5764, "step": 2238 }, { "epoch": 0.09, "grad_norm": 7.22467871666585, "learning_rate": 1.981065814619215e-05, "loss": 1.4366, "step": 2239 }, { "epoch": 0.09, "grad_norm": 5.3612733858837895, "learning_rate": 1.98104053959412e-05, "loss": 1.1638, "step": 2240 }, { "epoch": 0.09, "grad_norm": 9.677878483592012, "learning_rate": 1.981015247872055e-05, "loss": 1.6954, "step": 2241 }, { "epoch": 0.09, "grad_norm": 7.735076921680322, "learning_rate": 1.9809899394534505e-05, "loss": 1.9427, "step": 2242 }, { "epoch": 0.09, "grad_norm": 5.64146162873456, "learning_rate": 1.9809646143387374e-05, "loss": 1.5667, "step": 2243 }, { "epoch": 0.09, "grad_norm": 6.252219092040173, "learning_rate": 1.9809392725283466e-05, "loss": 1.3843, "step": 2244 }, { "epoch": 0.09, "grad_norm": 5.382678712283265, "learning_rate": 1.9809139140227097e-05, "loss": 1.4533, "step": 2245 }, { "epoch": 0.09, "grad_norm": 5.704497503856482, "learning_rate": 1.9808885388222578e-05, "loss": 1.2716, "step": 2246 }, { "epoch": 0.09, "grad_norm": 6.643919518517507, "learning_rate": 1.9808631469274234e-05, "loss": 1.2191, "step": 2247 }, { "epoch": 0.09, "grad_norm": 6.104766447960723, "learning_rate": 1.980837738338638e-05, "loss": 1.3034, "step": 2248 }, { "epoch": 0.09, "grad_norm": 6.264482665874045, "learning_rate": 1.9808123130563343e-05, "loss": 1.4667, "step": 2249 }, { "epoch": 0.09, "grad_norm": 7.262810167578607, "learning_rate": 1.980786871080945e-05, "loss": 1.3823, "step": 2250 }, { "epoch": 0.09, "grad_norm": 5.229917165581827, "learning_rate": 1.9807614124129034e-05, "loss": 1.2753, "step": 2251 }, { "epoch": 0.09, "grad_norm": 7.8002524729154, "learning_rate": 1.9807359370526425e-05, "loss": 1.4502, "step": 2252 }, { "epoch": 0.09, "grad_norm": 6.514224622960106, "learning_rate": 1.9807104450005962e-05, "loss": 1.2071, "step": 2253 }, { "epoch": 0.09, "grad_norm": 7.117451877344915, "learning_rate": 1.9806849362571978e-05, "loss": 1.7524, "step": 2254 }, { "epoch": 0.09, "grad_norm": 6.446803291189573, "learning_rate": 1.980659410822882e-05, "loss": 1.4324, "step": 2255 }, { "epoch": 0.09, "grad_norm": 6.204477024225661, "learning_rate": 1.9806338686980825e-05, "loss": 1.2728, "step": 2256 }, { "epoch": 0.09, "grad_norm": 5.513502680165082, "learning_rate": 1.980608309883235e-05, "loss": 1.1821, "step": 2257 }, { "epoch": 0.09, "grad_norm": 6.864596488954305, "learning_rate": 1.980582734378774e-05, "loss": 1.174, "step": 2258 }, { "epoch": 0.09, "grad_norm": 6.57540194428217, "learning_rate": 1.9805571421851345e-05, "loss": 1.3875, "step": 2259 }, { "epoch": 0.09, "grad_norm": 7.871270072300933, "learning_rate": 1.980531533302753e-05, "loss": 1.6042, "step": 2260 }, { "epoch": 0.09, "grad_norm": 7.153764303858159, "learning_rate": 1.9805059077320643e-05, "loss": 1.2587, "step": 2261 }, { "epoch": 0.09, "grad_norm": 8.634854873773975, "learning_rate": 1.9804802654735048e-05, "loss": 1.2827, "step": 2262 }, { "epoch": 0.09, "grad_norm": 7.436011980111569, "learning_rate": 1.9804546065275116e-05, "loss": 1.4932, "step": 2263 }, { "epoch": 0.09, "grad_norm": 7.1529211812132525, "learning_rate": 1.9804289308945207e-05, "loss": 1.5369, "step": 2264 }, { "epoch": 0.09, "grad_norm": 5.5276050598796, "learning_rate": 1.9804032385749693e-05, "loss": 1.3374, "step": 2265 }, { "epoch": 0.09, "grad_norm": 5.955919974843701, "learning_rate": 1.9803775295692944e-05, "loss": 1.4445, "step": 2266 }, { "epoch": 0.09, "grad_norm": 6.735865948029496, "learning_rate": 1.980351803877934e-05, "loss": 1.6915, "step": 2267 }, { "epoch": 0.09, "grad_norm": 7.091024703482934, "learning_rate": 1.9803260615013257e-05, "loss": 1.6724, "step": 2268 }, { "epoch": 0.09, "grad_norm": 5.299801763832618, "learning_rate": 1.980300302439908e-05, "loss": 1.2007, "step": 2269 }, { "epoch": 0.09, "grad_norm": 6.086150301725009, "learning_rate": 1.980274526694119e-05, "loss": 1.398, "step": 2270 }, { "epoch": 0.09, "grad_norm": 7.154678537577464, "learning_rate": 1.980248734264397e-05, "loss": 1.513, "step": 2271 }, { "epoch": 0.09, "grad_norm": 7.670652538745526, "learning_rate": 1.9802229251511815e-05, "loss": 1.3006, "step": 2272 }, { "epoch": 0.09, "grad_norm": 7.738828673167921, "learning_rate": 1.980197099354912e-05, "loss": 1.2935, "step": 2273 }, { "epoch": 0.09, "grad_norm": 5.670523290173077, "learning_rate": 1.9801712568760273e-05, "loss": 1.2846, "step": 2274 }, { "epoch": 0.09, "grad_norm": 6.814570880174142, "learning_rate": 1.9801453977149674e-05, "loss": 1.2825, "step": 2275 }, { "epoch": 0.09, "grad_norm": 6.053366255786789, "learning_rate": 1.9801195218721734e-05, "loss": 1.3144, "step": 2276 }, { "epoch": 0.09, "grad_norm": 8.113979128022825, "learning_rate": 1.9800936293480847e-05, "loss": 1.6025, "step": 2277 }, { "epoch": 0.09, "grad_norm": 7.273020099675095, "learning_rate": 1.9800677201431417e-05, "loss": 1.3955, "step": 2278 }, { "epoch": 0.09, "grad_norm": 6.9208848384884165, "learning_rate": 1.9800417942577865e-05, "loss": 1.5187, "step": 2279 }, { "epoch": 0.09, "grad_norm": 6.336788340396834, "learning_rate": 1.9800158516924593e-05, "loss": 1.3084, "step": 2280 }, { "epoch": 0.09, "grad_norm": 5.722623191784169, "learning_rate": 1.9799898924476022e-05, "loss": 1.2517, "step": 2281 }, { "epoch": 0.09, "grad_norm": 6.837008197305857, "learning_rate": 1.9799639165236568e-05, "loss": 1.3607, "step": 2282 }, { "epoch": 0.09, "grad_norm": 5.47358606709877, "learning_rate": 1.9799379239210654e-05, "loss": 1.1804, "step": 2283 }, { "epoch": 0.09, "grad_norm": 5.388501971117802, "learning_rate": 1.9799119146402703e-05, "loss": 1.3098, "step": 2284 }, { "epoch": 0.09, "grad_norm": 6.549232759883415, "learning_rate": 1.979885888681714e-05, "loss": 1.4922, "step": 2285 }, { "epoch": 0.09, "grad_norm": 4.969302243087315, "learning_rate": 1.9798598460458394e-05, "loss": 1.2463, "step": 2286 }, { "epoch": 0.09, "grad_norm": 5.292745131968066, "learning_rate": 1.9798337867330907e-05, "loss": 1.1767, "step": 2287 }, { "epoch": 0.09, "grad_norm": 8.23170360050032, "learning_rate": 1.97980771074391e-05, "loss": 1.4472, "step": 2288 }, { "epoch": 0.09, "grad_norm": 6.623997762440592, "learning_rate": 1.9797816180787418e-05, "loss": 1.3854, "step": 2289 }, { "epoch": 0.09, "grad_norm": 5.792177052243116, "learning_rate": 1.97975550873803e-05, "loss": 1.3113, "step": 2290 }, { "epoch": 0.09, "grad_norm": 5.089261522258525, "learning_rate": 1.9797293827222194e-05, "loss": 1.1542, "step": 2291 }, { "epoch": 0.09, "grad_norm": 4.787551284696548, "learning_rate": 1.9797032400317543e-05, "loss": 1.1647, "step": 2292 }, { "epoch": 0.09, "grad_norm": 7.6498494989094725, "learning_rate": 1.9796770806670795e-05, "loss": 1.7065, "step": 2293 }, { "epoch": 0.09, "grad_norm": 8.225555189168992, "learning_rate": 1.979650904628641e-05, "loss": 1.2545, "step": 2294 }, { "epoch": 0.09, "grad_norm": 7.359320306566103, "learning_rate": 1.9796247119168832e-05, "loss": 1.3106, "step": 2295 }, { "epoch": 0.09, "grad_norm": 7.01055905853141, "learning_rate": 1.9795985025322525e-05, "loss": 1.7033, "step": 2296 }, { "epoch": 0.09, "grad_norm": 5.891219777461577, "learning_rate": 1.979572276475195e-05, "loss": 1.2933, "step": 2297 }, { "epoch": 0.09, "grad_norm": 10.589534035004933, "learning_rate": 1.979546033746157e-05, "loss": 1.3456, "step": 2298 }, { "epoch": 0.09, "grad_norm": 7.828405956998997, "learning_rate": 1.979519774345585e-05, "loss": 1.3914, "step": 2299 }, { "epoch": 0.09, "grad_norm": 6.407218418186931, "learning_rate": 1.979493498273926e-05, "loss": 1.3972, "step": 2300 }, { "epoch": 0.09, "grad_norm": 14.300819827888258, "learning_rate": 1.979467205531627e-05, "loss": 1.5014, "step": 2301 }, { "epoch": 0.09, "grad_norm": 10.369858996830136, "learning_rate": 1.9794408961191362e-05, "loss": 1.2413, "step": 2302 }, { "epoch": 0.09, "grad_norm": 6.470620070124043, "learning_rate": 1.9794145700369007e-05, "loss": 1.2812, "step": 2303 }, { "epoch": 0.09, "grad_norm": 13.05400563217698, "learning_rate": 1.9793882272853688e-05, "loss": 1.7169, "step": 2304 }, { "epoch": 0.09, "grad_norm": 8.148956365044722, "learning_rate": 1.9793618678649888e-05, "loss": 1.2752, "step": 2305 }, { "epoch": 0.09, "grad_norm": 8.055338612325986, "learning_rate": 1.9793354917762096e-05, "loss": 1.3632, "step": 2306 }, { "epoch": 0.09, "grad_norm": 5.951062331253587, "learning_rate": 1.9793090990194795e-05, "loss": 1.3419, "step": 2307 }, { "epoch": 0.09, "grad_norm": 7.204507590274996, "learning_rate": 1.979282689595248e-05, "loss": 1.2839, "step": 2308 }, { "epoch": 0.09, "grad_norm": 14.807214933211405, "learning_rate": 1.979256263503965e-05, "loss": 1.5369, "step": 2309 }, { "epoch": 0.09, "grad_norm": 9.020449480062286, "learning_rate": 1.9792298207460793e-05, "loss": 1.4177, "step": 2310 }, { "epoch": 0.09, "grad_norm": 4.625559408723252, "learning_rate": 1.9792033613220422e-05, "loss": 1.0572, "step": 2311 }, { "epoch": 0.09, "grad_norm": 7.277135341343127, "learning_rate": 1.979176885232303e-05, "loss": 1.5635, "step": 2312 }, { "epoch": 0.09, "grad_norm": 6.258081832944338, "learning_rate": 1.9791503924773126e-05, "loss": 1.2927, "step": 2313 }, { "epoch": 0.09, "grad_norm": 9.211124612288353, "learning_rate": 1.979123883057522e-05, "loss": 1.1576, "step": 2314 }, { "epoch": 0.09, "grad_norm": 6.810563655285875, "learning_rate": 1.9790973569733825e-05, "loss": 1.5758, "step": 2315 }, { "epoch": 0.09, "grad_norm": 6.235994249562045, "learning_rate": 1.9790708142253456e-05, "loss": 1.261, "step": 2316 }, { "epoch": 0.09, "grad_norm": 7.769641259717488, "learning_rate": 1.9790442548138626e-05, "loss": 1.4045, "step": 2317 }, { "epoch": 0.09, "grad_norm": 6.338530163532364, "learning_rate": 1.979017678739386e-05, "loss": 1.4369, "step": 2318 }, { "epoch": 0.09, "grad_norm": 7.023206439533541, "learning_rate": 1.9789910860023677e-05, "loss": 1.3618, "step": 2319 }, { "epoch": 0.09, "grad_norm": 8.176496351653952, "learning_rate": 1.9789644766032604e-05, "loss": 1.4055, "step": 2320 }, { "epoch": 0.09, "grad_norm": 7.370978828060599, "learning_rate": 1.9789378505425173e-05, "loss": 1.4803, "step": 2321 }, { "epoch": 0.09, "grad_norm": 7.49617763684483, "learning_rate": 1.9789112078205918e-05, "loss": 1.4739, "step": 2322 }, { "epoch": 0.09, "grad_norm": 11.563416070876334, "learning_rate": 1.978884548437936e-05, "loss": 1.1277, "step": 2323 }, { "epoch": 0.09, "grad_norm": 7.5005460012442375, "learning_rate": 1.9788578723950053e-05, "loss": 1.4434, "step": 2324 }, { "epoch": 0.09, "grad_norm": 6.290455558746135, "learning_rate": 1.9788311796922527e-05, "loss": 1.582, "step": 2325 }, { "epoch": 0.09, "grad_norm": 10.288105712047095, "learning_rate": 1.9788044703301327e-05, "loss": 1.86, "step": 2326 }, { "epoch": 0.09, "grad_norm": 7.746960049002564, "learning_rate": 1.9787777443091e-05, "loss": 1.4346, "step": 2327 }, { "epoch": 0.09, "grad_norm": 4.935723944753187, "learning_rate": 1.9787510016296095e-05, "loss": 1.0585, "step": 2328 }, { "epoch": 0.09, "grad_norm": 13.803863764278566, "learning_rate": 1.9787242422921163e-05, "loss": 1.4575, "step": 2329 }, { "epoch": 0.09, "grad_norm": 5.392008227940538, "learning_rate": 1.9786974662970758e-05, "loss": 1.3128, "step": 2330 }, { "epoch": 0.09, "grad_norm": 6.654788540628237, "learning_rate": 1.9786706736449436e-05, "loss": 1.2252, "step": 2331 }, { "epoch": 0.09, "grad_norm": 7.962868483444048, "learning_rate": 1.978643864336176e-05, "loss": 1.5635, "step": 2332 }, { "epoch": 0.09, "grad_norm": 9.890644268893915, "learning_rate": 1.978617038371229e-05, "loss": 1.6848, "step": 2333 }, { "epoch": 0.09, "grad_norm": 6.978357958792903, "learning_rate": 1.9785901957505592e-05, "loss": 1.4512, "step": 2334 }, { "epoch": 0.09, "grad_norm": 8.157856887797164, "learning_rate": 1.9785633364746234e-05, "loss": 1.4105, "step": 2335 }, { "epoch": 0.09, "grad_norm": 10.818300383704187, "learning_rate": 1.978536460543879e-05, "loss": 1.3506, "step": 2336 }, { "epoch": 0.09, "grad_norm": 11.0187521832239, "learning_rate": 1.9785095679587836e-05, "loss": 1.5442, "step": 2337 }, { "epoch": 0.09, "grad_norm": 7.336400475364567, "learning_rate": 1.9784826587197942e-05, "loss": 1.607, "step": 2338 }, { "epoch": 0.09, "grad_norm": 8.46176835177031, "learning_rate": 1.9784557328273696e-05, "loss": 1.5016, "step": 2339 }, { "epoch": 0.09, "grad_norm": 7.436280238490885, "learning_rate": 1.9784287902819673e-05, "loss": 1.1873, "step": 2340 }, { "epoch": 0.09, "grad_norm": 7.395440181414065, "learning_rate": 1.978401831084046e-05, "loss": 1.455, "step": 2341 }, { "epoch": 0.09, "grad_norm": 5.49295549620925, "learning_rate": 1.978374855234065e-05, "loss": 1.0875, "step": 2342 }, { "epoch": 0.09, "grad_norm": 7.553754161335859, "learning_rate": 1.9783478627324833e-05, "loss": 1.7287, "step": 2343 }, { "epoch": 0.09, "grad_norm": 6.091099174184596, "learning_rate": 1.9783208535797602e-05, "loss": 1.3867, "step": 2344 }, { "epoch": 0.09, "grad_norm": 6.816272472701812, "learning_rate": 1.9782938277763547e-05, "loss": 1.3163, "step": 2345 }, { "epoch": 0.09, "grad_norm": 6.325533281728228, "learning_rate": 1.9782667853227278e-05, "loss": 1.2925, "step": 2346 }, { "epoch": 0.09, "grad_norm": 7.774503574457326, "learning_rate": 1.9782397262193398e-05, "loss": 1.4665, "step": 2347 }, { "epoch": 0.09, "grad_norm": 8.239155409430976, "learning_rate": 1.9782126504666502e-05, "loss": 1.5979, "step": 2348 }, { "epoch": 0.09, "grad_norm": 7.308282359683663, "learning_rate": 1.9781855580651207e-05, "loss": 1.519, "step": 2349 }, { "epoch": 0.09, "grad_norm": 7.782115078365031, "learning_rate": 1.978158449015212e-05, "loss": 1.6723, "step": 2350 }, { "epoch": 0.09, "grad_norm": 8.286537351194205, "learning_rate": 1.9781313233173858e-05, "loss": 1.4596, "step": 2351 }, { "epoch": 0.09, "grad_norm": 7.737455978053662, "learning_rate": 1.9781041809721032e-05, "loss": 1.5336, "step": 2352 }, { "epoch": 0.09, "grad_norm": 6.909025841860506, "learning_rate": 1.978077021979827e-05, "loss": 1.6065, "step": 2353 }, { "epoch": 0.09, "grad_norm": 7.041378923459224, "learning_rate": 1.9780498463410187e-05, "loss": 1.7557, "step": 2354 }, { "epoch": 0.09, "grad_norm": 8.697575369707412, "learning_rate": 1.9780226540561413e-05, "loss": 1.286, "step": 2355 }, { "epoch": 0.09, "grad_norm": 7.645396053787499, "learning_rate": 1.977995445125657e-05, "loss": 1.5277, "step": 2356 }, { "epoch": 0.09, "grad_norm": 6.139478509198994, "learning_rate": 1.9779682195500297e-05, "loss": 1.404, "step": 2357 }, { "epoch": 0.09, "grad_norm": 5.608353730408532, "learning_rate": 1.977940977329722e-05, "loss": 1.3109, "step": 2358 }, { "epoch": 0.1, "grad_norm": 5.446476810984435, "learning_rate": 1.9779137184651983e-05, "loss": 1.252, "step": 2359 }, { "epoch": 0.1, "grad_norm": 6.681720462200904, "learning_rate": 1.9778864429569218e-05, "loss": 1.5071, "step": 2360 }, { "epoch": 0.1, "grad_norm": 8.113596827536437, "learning_rate": 1.9778591508053573e-05, "loss": 1.4697, "step": 2361 }, { "epoch": 0.1, "grad_norm": 6.365578117426721, "learning_rate": 1.977831842010969e-05, "loss": 1.2852, "step": 2362 }, { "epoch": 0.1, "grad_norm": 5.872350184857425, "learning_rate": 1.9778045165742216e-05, "loss": 1.1824, "step": 2363 }, { "epoch": 0.1, "grad_norm": 7.115627604945798, "learning_rate": 1.9777771744955804e-05, "loss": 1.3251, "step": 2364 }, { "epoch": 0.1, "grad_norm": 7.197142668936221, "learning_rate": 1.9777498157755105e-05, "loss": 1.5379, "step": 2365 }, { "epoch": 0.1, "grad_norm": 7.069140332784883, "learning_rate": 1.977722440414478e-05, "loss": 1.6133, "step": 2366 }, { "epoch": 0.1, "grad_norm": 5.282741907563678, "learning_rate": 1.9776950484129484e-05, "loss": 1.3122, "step": 2367 }, { "epoch": 0.1, "grad_norm": 5.579124253701632, "learning_rate": 1.9776676397713882e-05, "loss": 1.5344, "step": 2368 }, { "epoch": 0.1, "grad_norm": 6.234074244359075, "learning_rate": 1.9776402144902636e-05, "loss": 1.5122, "step": 2369 }, { "epoch": 0.1, "grad_norm": 5.104892642666924, "learning_rate": 1.9776127725700413e-05, "loss": 1.1049, "step": 2370 }, { "epoch": 0.1, "grad_norm": 7.101666703954131, "learning_rate": 1.9775853140111887e-05, "loss": 1.4982, "step": 2371 }, { "epoch": 0.1, "grad_norm": 5.818277688615716, "learning_rate": 1.977557838814173e-05, "loss": 1.2647, "step": 2372 }, { "epoch": 0.1, "grad_norm": 7.542974421187077, "learning_rate": 1.977530346979462e-05, "loss": 1.4095, "step": 2373 }, { "epoch": 0.1, "grad_norm": 5.439748550519008, "learning_rate": 1.977502838507523e-05, "loss": 0.8275, "step": 2374 }, { "epoch": 0.1, "grad_norm": 6.025433194786921, "learning_rate": 1.9774753133988246e-05, "loss": 1.1935, "step": 2375 }, { "epoch": 0.1, "grad_norm": 6.323258265745918, "learning_rate": 1.9774477716538352e-05, "loss": 1.3953, "step": 2376 }, { "epoch": 0.1, "grad_norm": 7.234839979396902, "learning_rate": 1.977420213273024e-05, "loss": 1.6676, "step": 2377 }, { "epoch": 0.1, "grad_norm": 6.1756150533742895, "learning_rate": 1.9773926382568592e-05, "loss": 1.4695, "step": 2378 }, { "epoch": 0.1, "grad_norm": 4.6299358078533155, "learning_rate": 1.9773650466058107e-05, "loss": 1.1667, "step": 2379 }, { "epoch": 0.1, "grad_norm": 4.709647800653659, "learning_rate": 1.977337438320348e-05, "loss": 1.1521, "step": 2380 }, { "epoch": 0.1, "grad_norm": 6.352227388811451, "learning_rate": 1.977309813400941e-05, "loss": 0.992, "step": 2381 }, { "epoch": 0.1, "grad_norm": 7.0209300111154835, "learning_rate": 1.9772821718480596e-05, "loss": 1.1522, "step": 2382 }, { "epoch": 0.1, "grad_norm": 5.970260572492284, "learning_rate": 1.9772545136621746e-05, "loss": 1.5545, "step": 2383 }, { "epoch": 0.1, "grad_norm": 7.2576045814755465, "learning_rate": 1.9772268388437562e-05, "loss": 1.2986, "step": 2384 }, { "epoch": 0.1, "grad_norm": 6.753774365377731, "learning_rate": 1.9771991473932764e-05, "loss": 1.5024, "step": 2385 }, { "epoch": 0.1, "grad_norm": 5.879106583684735, "learning_rate": 1.9771714393112057e-05, "loss": 1.3666, "step": 2386 }, { "epoch": 0.1, "grad_norm": 4.816612076620179, "learning_rate": 1.9771437145980155e-05, "loss": 1.3493, "step": 2387 }, { "epoch": 0.1, "grad_norm": 8.275079342119255, "learning_rate": 1.9771159732541783e-05, "loss": 1.6081, "step": 2388 }, { "epoch": 0.1, "grad_norm": 5.240677902769774, "learning_rate": 1.9770882152801662e-05, "loss": 1.2584, "step": 2389 }, { "epoch": 0.1, "grad_norm": 5.3073863214931025, "learning_rate": 1.977060440676451e-05, "loss": 1.149, "step": 2390 }, { "epoch": 0.1, "grad_norm": 4.8025112277602275, "learning_rate": 1.977032649443506e-05, "loss": 1.2676, "step": 2391 }, { "epoch": 0.1, "grad_norm": 8.286296544568502, "learning_rate": 1.9770048415818042e-05, "loss": 1.4232, "step": 2392 }, { "epoch": 0.1, "grad_norm": 5.863040857611469, "learning_rate": 1.9769770170918187e-05, "loss": 0.8656, "step": 2393 }, { "epoch": 0.1, "grad_norm": 9.539837149411529, "learning_rate": 1.976949175974023e-05, "loss": 1.8224, "step": 2394 }, { "epoch": 0.1, "grad_norm": 6.589612391885002, "learning_rate": 1.976921318228891e-05, "loss": 1.4666, "step": 2395 }, { "epoch": 0.1, "grad_norm": 6.289036891601582, "learning_rate": 1.9768934438568967e-05, "loss": 1.2597, "step": 2396 }, { "epoch": 0.1, "grad_norm": 7.100475058053136, "learning_rate": 1.9768655528585147e-05, "loss": 1.3084, "step": 2397 }, { "epoch": 0.1, "grad_norm": 6.727862760370415, "learning_rate": 1.9768376452342197e-05, "loss": 1.332, "step": 2398 }, { "epoch": 0.1, "grad_norm": 5.633192839215411, "learning_rate": 1.9768097209844867e-05, "loss": 0.99, "step": 2399 }, { "epoch": 0.1, "grad_norm": 6.30743139076143, "learning_rate": 1.976781780109791e-05, "loss": 1.2059, "step": 2400 }, { "epoch": 0.1, "grad_norm": 7.947547214362693, "learning_rate": 1.9767538226106078e-05, "loss": 1.3394, "step": 2401 }, { "epoch": 0.1, "grad_norm": 11.573479943471469, "learning_rate": 1.9767258484874136e-05, "loss": 1.7008, "step": 2402 }, { "epoch": 0.1, "grad_norm": 7.664828573057713, "learning_rate": 1.9766978577406834e-05, "loss": 1.3304, "step": 2403 }, { "epoch": 0.1, "grad_norm": 5.6189670903477635, "learning_rate": 1.9766698503708945e-05, "loss": 1.33, "step": 2404 }, { "epoch": 0.1, "grad_norm": 5.96458374837972, "learning_rate": 1.9766418263785237e-05, "loss": 1.2183, "step": 2405 }, { "epoch": 0.1, "grad_norm": 5.487236312312979, "learning_rate": 1.976613785764047e-05, "loss": 1.469, "step": 2406 }, { "epoch": 0.1, "grad_norm": 5.4274288304752005, "learning_rate": 1.9765857285279432e-05, "loss": 1.4245, "step": 2407 }, { "epoch": 0.1, "grad_norm": 8.20778941634519, "learning_rate": 1.976557654670688e-05, "loss": 1.2476, "step": 2408 }, { "epoch": 0.1, "grad_norm": 7.328902456172008, "learning_rate": 1.9765295641927604e-05, "loss": 1.3873, "step": 2409 }, { "epoch": 0.1, "grad_norm": 6.993270302209906, "learning_rate": 1.9765014570946383e-05, "loss": 1.5852, "step": 2410 }, { "epoch": 0.1, "grad_norm": 5.513559657351155, "learning_rate": 1.9764733333767998e-05, "loss": 1.4948, "step": 2411 }, { "epoch": 0.1, "grad_norm": 7.111248953835878, "learning_rate": 1.9764451930397235e-05, "loss": 1.4332, "step": 2412 }, { "epoch": 0.1, "grad_norm": 7.1986550466949675, "learning_rate": 1.9764170360838886e-05, "loss": 1.2678, "step": 2413 }, { "epoch": 0.1, "grad_norm": 5.315287515718362, "learning_rate": 1.976388862509774e-05, "loss": 1.0943, "step": 2414 }, { "epoch": 0.1, "grad_norm": 5.4549355052132205, "learning_rate": 1.9763606723178602e-05, "loss": 1.084, "step": 2415 }, { "epoch": 0.1, "grad_norm": 6.4637995556816215, "learning_rate": 1.976332465508626e-05, "loss": 1.6363, "step": 2416 }, { "epoch": 0.1, "grad_norm": 7.07483323233534, "learning_rate": 1.9763042420825513e-05, "loss": 1.3429, "step": 2417 }, { "epoch": 0.1, "grad_norm": 6.212156582204302, "learning_rate": 1.9762760020401172e-05, "loss": 1.3462, "step": 2418 }, { "epoch": 0.1, "grad_norm": 6.6254916782672195, "learning_rate": 1.976247745381804e-05, "loss": 1.2216, "step": 2419 }, { "epoch": 0.1, "grad_norm": 3.3778045481600985, "learning_rate": 1.9762194721080926e-05, "loss": 1.1359, "step": 2420 }, { "epoch": 0.1, "grad_norm": 7.8702363290839275, "learning_rate": 1.9761911822194643e-05, "loss": 1.6536, "step": 2421 }, { "epoch": 0.1, "grad_norm": 5.773962552878389, "learning_rate": 1.9761628757164002e-05, "loss": 1.2052, "step": 2422 }, { "epoch": 0.1, "grad_norm": 7.963056452181607, "learning_rate": 1.9761345525993826e-05, "loss": 1.7302, "step": 2423 }, { "epoch": 0.1, "grad_norm": 5.416849015630392, "learning_rate": 1.9761062128688932e-05, "loss": 1.3611, "step": 2424 }, { "epoch": 0.1, "grad_norm": 6.564680309555223, "learning_rate": 1.9760778565254148e-05, "loss": 1.1967, "step": 2425 }, { "epoch": 0.1, "grad_norm": 6.846261665463614, "learning_rate": 1.976049483569429e-05, "loss": 1.4635, "step": 2426 }, { "epoch": 0.1, "grad_norm": 5.712065082087612, "learning_rate": 1.97602109400142e-05, "loss": 1.421, "step": 2427 }, { "epoch": 0.1, "grad_norm": 5.089016323328259, "learning_rate": 1.9759926878218703e-05, "loss": 1.1574, "step": 2428 }, { "epoch": 0.1, "grad_norm": 6.347513902152747, "learning_rate": 1.9759642650312633e-05, "loss": 1.3844, "step": 2429 }, { "epoch": 0.1, "grad_norm": 8.073420672555265, "learning_rate": 1.9759358256300827e-05, "loss": 1.1729, "step": 2430 }, { "epoch": 0.1, "grad_norm": 7.995792486720135, "learning_rate": 1.9759073696188127e-05, "loss": 1.1315, "step": 2431 }, { "epoch": 0.1, "grad_norm": 5.287106373752574, "learning_rate": 1.975878896997938e-05, "loss": 1.3228, "step": 2432 }, { "epoch": 0.1, "grad_norm": 8.138740153735842, "learning_rate": 1.9758504077679425e-05, "loss": 1.4879, "step": 2433 }, { "epoch": 0.1, "grad_norm": 5.174036650312244, "learning_rate": 1.9758219019293115e-05, "loss": 1.3063, "step": 2434 }, { "epoch": 0.1, "grad_norm": 8.668052127880092, "learning_rate": 1.97579337948253e-05, "loss": 1.2467, "step": 2435 }, { "epoch": 0.1, "grad_norm": 7.883660742892354, "learning_rate": 1.975764840428083e-05, "loss": 1.2907, "step": 2436 }, { "epoch": 0.1, "grad_norm": 5.487756411917772, "learning_rate": 1.9757362847664574e-05, "loss": 1.1775, "step": 2437 }, { "epoch": 0.1, "grad_norm": 7.711475171237999, "learning_rate": 1.9757077124981383e-05, "loss": 1.0746, "step": 2438 }, { "epoch": 0.1, "grad_norm": 7.333239556863978, "learning_rate": 1.975679123623612e-05, "loss": 1.5081, "step": 2439 }, { "epoch": 0.1, "grad_norm": 6.560943980113215, "learning_rate": 1.9756505181433654e-05, "loss": 1.3991, "step": 2440 }, { "epoch": 0.1, "grad_norm": 6.063242720032983, "learning_rate": 1.9756218960578854e-05, "loss": 1.4994, "step": 2441 }, { "epoch": 0.1, "grad_norm": 8.82615857484061, "learning_rate": 1.9755932573676587e-05, "loss": 1.6439, "step": 2442 }, { "epoch": 0.1, "grad_norm": 7.051744772190796, "learning_rate": 1.9755646020731728e-05, "loss": 1.2911, "step": 2443 }, { "epoch": 0.1, "grad_norm": 6.685018516542887, "learning_rate": 1.975535930174916e-05, "loss": 1.4109, "step": 2444 }, { "epoch": 0.1, "grad_norm": 5.2151942726645, "learning_rate": 1.9755072416733756e-05, "loss": 1.2421, "step": 2445 }, { "epoch": 0.1, "grad_norm": 7.207488791982005, "learning_rate": 1.9754785365690403e-05, "loss": 1.3384, "step": 2446 }, { "epoch": 0.1, "grad_norm": 7.190489932409248, "learning_rate": 1.9754498148623985e-05, "loss": 1.4638, "step": 2447 }, { "epoch": 0.1, "grad_norm": 7.696121120359858, "learning_rate": 1.975421076553939e-05, "loss": 1.4681, "step": 2448 }, { "epoch": 0.1, "grad_norm": 7.070757299860934, "learning_rate": 1.975392321644151e-05, "loss": 1.2239, "step": 2449 }, { "epoch": 0.1, "grad_norm": 5.100002574189488, "learning_rate": 1.9753635501335236e-05, "loss": 1.3039, "step": 2450 }, { "epoch": 0.1, "grad_norm": 8.445605511542439, "learning_rate": 1.975334762022547e-05, "loss": 1.4239, "step": 2451 }, { "epoch": 0.1, "grad_norm": 7.984800751996431, "learning_rate": 1.9753059573117105e-05, "loss": 1.1344, "step": 2452 }, { "epoch": 0.1, "grad_norm": 7.445415734833298, "learning_rate": 1.9752771360015053e-05, "loss": 1.1388, "step": 2453 }, { "epoch": 0.1, "grad_norm": 6.536766513256212, "learning_rate": 1.975248298092421e-05, "loss": 1.5078, "step": 2454 }, { "epoch": 0.1, "grad_norm": 10.94465643624607, "learning_rate": 1.975219443584949e-05, "loss": 1.371, "step": 2455 }, { "epoch": 0.1, "grad_norm": 6.57138242400799, "learning_rate": 1.97519057247958e-05, "loss": 1.5209, "step": 2456 }, { "epoch": 0.1, "grad_norm": 5.6006928040398885, "learning_rate": 1.9751616847768053e-05, "loss": 1.2998, "step": 2457 }, { "epoch": 0.1, "grad_norm": 10.47192036814895, "learning_rate": 1.9751327804771172e-05, "loss": 1.3711, "step": 2458 }, { "epoch": 0.1, "grad_norm": 8.643635758346635, "learning_rate": 1.9751038595810068e-05, "loss": 1.4132, "step": 2459 }, { "epoch": 0.1, "grad_norm": 7.883925815532739, "learning_rate": 1.9750749220889673e-05, "loss": 1.4994, "step": 2460 }, { "epoch": 0.1, "grad_norm": 7.755740301849055, "learning_rate": 1.9750459680014904e-05, "loss": 1.3762, "step": 2461 }, { "epoch": 0.1, "grad_norm": 8.89682322350582, "learning_rate": 1.975016997319069e-05, "loss": 1.7078, "step": 2462 }, { "epoch": 0.1, "grad_norm": 6.575981286381875, "learning_rate": 1.9749880100421967e-05, "loss": 1.2477, "step": 2463 }, { "epoch": 0.1, "grad_norm": 5.184497377969023, "learning_rate": 1.9749590061713663e-05, "loss": 1.3656, "step": 2464 }, { "epoch": 0.1, "grad_norm": 5.958324399610432, "learning_rate": 1.9749299857070716e-05, "loss": 1.3767, "step": 2465 }, { "epoch": 0.1, "grad_norm": 6.12023312740827, "learning_rate": 1.9749009486498064e-05, "loss": 1.5188, "step": 2466 }, { "epoch": 0.1, "grad_norm": 5.497908280092942, "learning_rate": 1.9748718950000653e-05, "loss": 1.3222, "step": 2467 }, { "epoch": 0.1, "grad_norm": 5.2370476186823005, "learning_rate": 1.9748428247583423e-05, "loss": 1.2115, "step": 2468 }, { "epoch": 0.1, "grad_norm": 5.965137462496832, "learning_rate": 1.9748137379251324e-05, "loss": 1.2071, "step": 2469 }, { "epoch": 0.1, "grad_norm": 5.514050449460222, "learning_rate": 1.9747846345009306e-05, "loss": 1.1752, "step": 2470 }, { "epoch": 0.1, "grad_norm": 5.341545435811534, "learning_rate": 1.9747555144862327e-05, "loss": 1.3015, "step": 2471 }, { "epoch": 0.1, "grad_norm": 7.423157995644816, "learning_rate": 1.9747263778815335e-05, "loss": 1.5654, "step": 2472 }, { "epoch": 0.1, "grad_norm": 5.409162197740796, "learning_rate": 1.974697224687329e-05, "loss": 1.0982, "step": 2473 }, { "epoch": 0.1, "grad_norm": 6.586100238607348, "learning_rate": 1.9746680549041163e-05, "loss": 1.5776, "step": 2474 }, { "epoch": 0.1, "grad_norm": 6.848624016694678, "learning_rate": 1.9746388685323907e-05, "loss": 1.3578, "step": 2475 }, { "epoch": 0.1, "grad_norm": 7.300857202920487, "learning_rate": 1.9746096655726497e-05, "loss": 1.3031, "step": 2476 }, { "epoch": 0.1, "grad_norm": 6.480346173294212, "learning_rate": 1.9745804460253903e-05, "loss": 0.8772, "step": 2477 }, { "epoch": 0.1, "grad_norm": 4.832706186773727, "learning_rate": 1.9745512098911092e-05, "loss": 1.3348, "step": 2478 }, { "epoch": 0.1, "grad_norm": 7.009632440307114, "learning_rate": 1.9745219571703042e-05, "loss": 1.7949, "step": 2479 }, { "epoch": 0.1, "grad_norm": 5.954167734482992, "learning_rate": 1.974492687863474e-05, "loss": 1.1958, "step": 2480 }, { "epoch": 0.1, "grad_norm": 5.935506098639694, "learning_rate": 1.9744634019711155e-05, "loss": 1.3224, "step": 2481 }, { "epoch": 0.1, "grad_norm": 6.407714859873503, "learning_rate": 1.974434099493728e-05, "loss": 1.5656, "step": 2482 }, { "epoch": 0.1, "grad_norm": 5.636815000135334, "learning_rate": 1.9744047804318097e-05, "loss": 1.6051, "step": 2483 }, { "epoch": 0.1, "grad_norm": 3.4316843546847715, "learning_rate": 1.97437544478586e-05, "loss": 1.062, "step": 2484 }, { "epoch": 0.1, "grad_norm": 6.835939241298115, "learning_rate": 1.9743460925563783e-05, "loss": 1.1627, "step": 2485 }, { "epoch": 0.1, "grad_norm": 6.537300126249334, "learning_rate": 1.9743167237438637e-05, "loss": 1.597, "step": 2486 }, { "epoch": 0.1, "grad_norm": 9.64606556085158, "learning_rate": 1.9742873383488163e-05, "loss": 1.4732, "step": 2487 }, { "epoch": 0.1, "grad_norm": 6.4696880934698004, "learning_rate": 1.974257936371736e-05, "loss": 1.4101, "step": 2488 }, { "epoch": 0.1, "grad_norm": 6.734206369883296, "learning_rate": 1.9742285178131233e-05, "loss": 1.5091, "step": 2489 }, { "epoch": 0.1, "grad_norm": 9.646663051702214, "learning_rate": 1.9741990826734793e-05, "loss": 1.8179, "step": 2490 }, { "epoch": 0.1, "grad_norm": 7.949321939249056, "learning_rate": 1.9741696309533042e-05, "loss": 1.4236, "step": 2491 }, { "epoch": 0.1, "grad_norm": 6.615022293077521, "learning_rate": 1.9741401626531003e-05, "loss": 1.0857, "step": 2492 }, { "epoch": 0.1, "grad_norm": 6.190171276770775, "learning_rate": 1.974110677773368e-05, "loss": 1.3074, "step": 2493 }, { "epoch": 0.1, "grad_norm": 8.019877268896954, "learning_rate": 1.97408117631461e-05, "loss": 1.3625, "step": 2494 }, { "epoch": 0.1, "grad_norm": 4.6182056571985575, "learning_rate": 1.9740516582773283e-05, "loss": 1.1987, "step": 2495 }, { "epoch": 0.1, "grad_norm": 6.416483147531896, "learning_rate": 1.9740221236620245e-05, "loss": 1.2295, "step": 2496 }, { "epoch": 0.1, "grad_norm": 4.839186228160043, "learning_rate": 1.973992572469202e-05, "loss": 1.1997, "step": 2497 }, { "epoch": 0.1, "grad_norm": 5.138526379447447, "learning_rate": 1.973963004699364e-05, "loss": 1.1367, "step": 2498 }, { "epoch": 0.1, "grad_norm": 7.71341067783331, "learning_rate": 1.973933420353013e-05, "loss": 1.5714, "step": 2499 }, { "epoch": 0.1, "grad_norm": 5.293538561554527, "learning_rate": 1.9739038194306528e-05, "loss": 1.297, "step": 2500 }, { "epoch": 0.1, "grad_norm": 6.291853478249205, "learning_rate": 1.9738742019327875e-05, "loss": 1.089, "step": 2501 }, { "epoch": 0.1, "grad_norm": 6.070496979564334, "learning_rate": 1.9738445678599206e-05, "loss": 1.543, "step": 2502 }, { "epoch": 0.1, "grad_norm": 5.454494272585979, "learning_rate": 1.9738149172125566e-05, "loss": 1.1625, "step": 2503 }, { "epoch": 0.1, "grad_norm": 5.455262123675351, "learning_rate": 1.9737852499912006e-05, "loss": 1.4755, "step": 2504 }, { "epoch": 0.1, "grad_norm": 6.840162638767774, "learning_rate": 1.9737555661963573e-05, "loss": 1.6069, "step": 2505 }, { "epoch": 0.1, "grad_norm": 5.780915941394309, "learning_rate": 1.973725865828532e-05, "loss": 1.4268, "step": 2506 }, { "epoch": 0.1, "grad_norm": 6.320601516683957, "learning_rate": 1.97369614888823e-05, "loss": 1.2379, "step": 2507 }, { "epoch": 0.1, "grad_norm": 3.720291432432725, "learning_rate": 1.9736664153759565e-05, "loss": 1.0744, "step": 2508 }, { "epoch": 0.1, "grad_norm": 8.987449368240055, "learning_rate": 1.9736366652922188e-05, "loss": 1.6249, "step": 2509 }, { "epoch": 0.1, "grad_norm": 8.532215216540594, "learning_rate": 1.9736068986375222e-05, "loss": 1.5233, "step": 2510 }, { "epoch": 0.1, "grad_norm": 7.069008746314104, "learning_rate": 1.9735771154123742e-05, "loss": 1.213, "step": 2511 }, { "epoch": 0.1, "grad_norm": 5.824199356299683, "learning_rate": 1.9735473156172808e-05, "loss": 1.5735, "step": 2512 }, { "epoch": 0.1, "grad_norm": 7.082075189932723, "learning_rate": 1.9735174992527498e-05, "loss": 1.0434, "step": 2513 }, { "epoch": 0.1, "grad_norm": 8.923635342204987, "learning_rate": 1.9734876663192883e-05, "loss": 1.3727, "step": 2514 }, { "epoch": 0.1, "grad_norm": 7.04703745672627, "learning_rate": 1.9734578168174044e-05, "loss": 1.3505, "step": 2515 }, { "epoch": 0.1, "grad_norm": 5.779894182461942, "learning_rate": 1.9734279507476057e-05, "loss": 1.0725, "step": 2516 }, { "epoch": 0.1, "grad_norm": 7.658138648044491, "learning_rate": 1.973398068110401e-05, "loss": 1.3463, "step": 2517 }, { "epoch": 0.1, "grad_norm": 8.839489411405488, "learning_rate": 1.973368168906299e-05, "loss": 1.3811, "step": 2518 }, { "epoch": 0.1, "grad_norm": 6.118705737640947, "learning_rate": 1.9733382531358073e-05, "loss": 1.1019, "step": 2519 }, { "epoch": 0.1, "grad_norm": 4.842196335714272, "learning_rate": 1.9733083207994367e-05, "loss": 1.126, "step": 2520 }, { "epoch": 0.1, "grad_norm": 6.795827233205269, "learning_rate": 1.9732783718976957e-05, "loss": 1.2167, "step": 2521 }, { "epoch": 0.1, "grad_norm": 6.34291033975161, "learning_rate": 1.973248406431094e-05, "loss": 1.3082, "step": 2522 }, { "epoch": 0.1, "grad_norm": 10.485082348198171, "learning_rate": 1.973218424400142e-05, "loss": 1.5184, "step": 2523 }, { "epoch": 0.1, "grad_norm": 7.201681497338545, "learning_rate": 1.97318842580535e-05, "loss": 1.4952, "step": 2524 }, { "epoch": 0.1, "grad_norm": 6.786421578703849, "learning_rate": 1.973158410647228e-05, "loss": 1.5083, "step": 2525 }, { "epoch": 0.1, "grad_norm": 8.170509306928906, "learning_rate": 1.9731283789262877e-05, "loss": 1.3967, "step": 2526 }, { "epoch": 0.1, "grad_norm": 8.485243579930072, "learning_rate": 1.9730983306430396e-05, "loss": 1.3816, "step": 2527 }, { "epoch": 0.1, "grad_norm": 5.132264137214185, "learning_rate": 1.973068265797995e-05, "loss": 1.3127, "step": 2528 }, { "epoch": 0.1, "grad_norm": 6.977391876260524, "learning_rate": 1.973038184391666e-05, "loss": 1.5879, "step": 2529 }, { "epoch": 0.1, "grad_norm": 6.652429282391596, "learning_rate": 1.9730080864245647e-05, "loss": 1.53, "step": 2530 }, { "epoch": 0.1, "grad_norm": 7.30106402723896, "learning_rate": 1.972977971897203e-05, "loss": 1.515, "step": 2531 }, { "epoch": 0.1, "grad_norm": 8.057634385169399, "learning_rate": 1.9729478408100933e-05, "loss": 1.4551, "step": 2532 }, { "epoch": 0.1, "grad_norm": 6.2130591298821765, "learning_rate": 1.9729176931637487e-05, "loss": 1.1922, "step": 2533 }, { "epoch": 0.1, "grad_norm": 7.811616563202031, "learning_rate": 1.9728875289586824e-05, "loss": 1.3936, "step": 2534 }, { "epoch": 0.1, "grad_norm": 6.0430246481648595, "learning_rate": 1.9728573481954075e-05, "loss": 1.3444, "step": 2535 }, { "epoch": 0.1, "grad_norm": 6.093425293607571, "learning_rate": 1.972827150874438e-05, "loss": 1.3472, "step": 2536 }, { "epoch": 0.1, "grad_norm": 5.73977166428045, "learning_rate": 1.9727969369962877e-05, "loss": 1.2742, "step": 2537 }, { "epoch": 0.1, "grad_norm": 5.402948572001233, "learning_rate": 1.9727667065614703e-05, "loss": 1.3686, "step": 2538 }, { "epoch": 0.1, "grad_norm": 5.522379616520328, "learning_rate": 1.9727364595705012e-05, "loss": 1.4152, "step": 2539 }, { "epoch": 0.1, "grad_norm": 6.45297160480035, "learning_rate": 1.9727061960238948e-05, "loss": 1.0922, "step": 2540 }, { "epoch": 0.1, "grad_norm": 12.116096096203538, "learning_rate": 1.9726759159221662e-05, "loss": 1.0823, "step": 2541 }, { "epoch": 0.1, "grad_norm": 7.123595476813397, "learning_rate": 1.972645619265831e-05, "loss": 1.4807, "step": 2542 }, { "epoch": 0.1, "grad_norm": 5.5752367803110765, "learning_rate": 1.972615306055404e-05, "loss": 1.4202, "step": 2543 }, { "epoch": 0.1, "grad_norm": 8.228200499914623, "learning_rate": 1.9725849762914015e-05, "loss": 1.1249, "step": 2544 }, { "epoch": 0.1, "grad_norm": 6.2736127969757005, "learning_rate": 1.9725546299743407e-05, "loss": 1.3513, "step": 2545 }, { "epoch": 0.1, "grad_norm": 5.962823614733744, "learning_rate": 1.9725242671047365e-05, "loss": 1.312, "step": 2546 }, { "epoch": 0.1, "grad_norm": 5.480483980456306, "learning_rate": 1.9724938876831067e-05, "loss": 1.1961, "step": 2547 }, { "epoch": 0.1, "grad_norm": 7.577530021198025, "learning_rate": 1.9724634917099683e-05, "loss": 1.3993, "step": 2548 }, { "epoch": 0.1, "grad_norm": 5.618886043389187, "learning_rate": 1.9724330791858382e-05, "loss": 1.443, "step": 2549 }, { "epoch": 0.1, "grad_norm": 7.479859292745246, "learning_rate": 1.9724026501112343e-05, "loss": 1.2056, "step": 2550 }, { "epoch": 0.1, "grad_norm": 9.97982456503693, "learning_rate": 1.9723722044866745e-05, "loss": 1.3498, "step": 2551 }, { "epoch": 0.1, "grad_norm": 6.940270348244879, "learning_rate": 1.972341742312677e-05, "loss": 1.4322, "step": 2552 }, { "epoch": 0.1, "grad_norm": 6.4979506781185945, "learning_rate": 1.9723112635897595e-05, "loss": 1.7645, "step": 2553 }, { "epoch": 0.1, "grad_norm": 5.4608654751304675, "learning_rate": 1.9722807683184418e-05, "loss": 1.4636, "step": 2554 }, { "epoch": 0.1, "grad_norm": 6.577685968851478, "learning_rate": 1.9722502564992428e-05, "loss": 1.37, "step": 2555 }, { "epoch": 0.1, "grad_norm": 6.396471204200455, "learning_rate": 1.972219728132681e-05, "loss": 1.245, "step": 2556 }, { "epoch": 0.1, "grad_norm": 7.261552051716292, "learning_rate": 1.9721891832192764e-05, "loss": 1.4803, "step": 2557 }, { "epoch": 0.1, "grad_norm": 5.844550835074468, "learning_rate": 1.9721586217595496e-05, "loss": 1.4101, "step": 2558 }, { "epoch": 0.1, "grad_norm": 12.111566223673272, "learning_rate": 1.9721280437540197e-05, "loss": 1.3702, "step": 2559 }, { "epoch": 0.1, "grad_norm": 7.964504024698751, "learning_rate": 1.9720974492032073e-05, "loss": 1.4542, "step": 2560 }, { "epoch": 0.1, "grad_norm": 6.0427961928055804, "learning_rate": 1.9720668381076337e-05, "loss": 1.2805, "step": 2561 }, { "epoch": 0.1, "grad_norm": 7.890183302965444, "learning_rate": 1.9720362104678193e-05, "loss": 1.4862, "step": 2562 }, { "epoch": 0.1, "grad_norm": 5.760601390188951, "learning_rate": 1.9720055662842856e-05, "loss": 1.3909, "step": 2563 }, { "epoch": 0.1, "grad_norm": 6.670609260341819, "learning_rate": 1.9719749055575544e-05, "loss": 1.1174, "step": 2564 }, { "epoch": 0.1, "grad_norm": 8.10808418327401, "learning_rate": 1.971944228288147e-05, "loss": 1.3224, "step": 2565 }, { "epoch": 0.1, "grad_norm": 9.581019467013325, "learning_rate": 1.971913534476586e-05, "loss": 1.6409, "step": 2566 }, { "epoch": 0.1, "grad_norm": 9.147327137615482, "learning_rate": 1.9718828241233935e-05, "loss": 1.4296, "step": 2567 }, { "epoch": 0.1, "grad_norm": 8.925467265580668, "learning_rate": 1.971852097229092e-05, "loss": 1.304, "step": 2568 }, { "epoch": 0.1, "grad_norm": 7.947998284566251, "learning_rate": 1.971821353794205e-05, "loss": 1.3643, "step": 2569 }, { "epoch": 0.1, "grad_norm": 6.210875232762943, "learning_rate": 1.9717905938192558e-05, "loss": 1.5467, "step": 2570 }, { "epoch": 0.1, "grad_norm": 6.0957404310164, "learning_rate": 1.9717598173047673e-05, "loss": 1.2647, "step": 2571 }, { "epoch": 0.1, "grad_norm": 7.397809068048734, "learning_rate": 1.9717290242512632e-05, "loss": 1.3148, "step": 2572 }, { "epoch": 0.1, "grad_norm": 3.6121276094373314, "learning_rate": 1.9716982146592684e-05, "loss": 1.133, "step": 2573 }, { "epoch": 0.1, "grad_norm": 8.90533337923404, "learning_rate": 1.9716673885293066e-05, "loss": 1.4559, "step": 2574 }, { "epoch": 0.1, "grad_norm": 7.522578140924586, "learning_rate": 1.971636545861903e-05, "loss": 1.3052, "step": 2575 }, { "epoch": 0.1, "grad_norm": 5.666716266027498, "learning_rate": 1.9716056866575822e-05, "loss": 1.2982, "step": 2576 }, { "epoch": 0.1, "grad_norm": 5.718605776142715, "learning_rate": 1.9715748109168696e-05, "loss": 0.9182, "step": 2577 }, { "epoch": 0.1, "grad_norm": 13.48792606287642, "learning_rate": 1.9715439186402903e-05, "loss": 1.4046, "step": 2578 }, { "epoch": 0.1, "grad_norm": 8.205638488319808, "learning_rate": 1.9715130098283702e-05, "loss": 1.0977, "step": 2579 }, { "epoch": 0.1, "grad_norm": 5.626440399506317, "learning_rate": 1.9714820844816358e-05, "loss": 1.2659, "step": 2580 }, { "epoch": 0.1, "grad_norm": 7.111753445647643, "learning_rate": 1.971451142600613e-05, "loss": 1.3977, "step": 2581 }, { "epoch": 0.1, "grad_norm": 7.7160376626386595, "learning_rate": 1.971420184185829e-05, "loss": 1.5231, "step": 2582 }, { "epoch": 0.1, "grad_norm": 4.384911200035775, "learning_rate": 1.9713892092378093e-05, "loss": 1.3073, "step": 2583 }, { "epoch": 0.1, "grad_norm": 6.371804409585002, "learning_rate": 1.9713582177570826e-05, "loss": 1.2322, "step": 2584 }, { "epoch": 0.1, "grad_norm": 4.232292813034817, "learning_rate": 1.9713272097441755e-05, "loss": 1.2428, "step": 2585 }, { "epoch": 0.1, "grad_norm": 4.609874843452016, "learning_rate": 1.9712961851996163e-05, "loss": 1.2985, "step": 2586 }, { "epoch": 0.1, "grad_norm": 3.647828417674238, "learning_rate": 1.9712651441239328e-05, "loss": 1.0808, "step": 2587 }, { "epoch": 0.1, "grad_norm": 6.546445954421866, "learning_rate": 1.9712340865176532e-05, "loss": 1.3253, "step": 2588 }, { "epoch": 0.1, "grad_norm": 5.299834495007353, "learning_rate": 1.9712030123813058e-05, "loss": 1.2245, "step": 2589 }, { "epoch": 0.1, "grad_norm": 10.84215651674454, "learning_rate": 1.9711719217154204e-05, "loss": 1.669, "step": 2590 }, { "epoch": 0.1, "grad_norm": 5.8009027873066845, "learning_rate": 1.9711408145205253e-05, "loss": 1.1867, "step": 2591 }, { "epoch": 0.1, "grad_norm": 7.361569737684312, "learning_rate": 1.97110969079715e-05, "loss": 1.469, "step": 2592 }, { "epoch": 0.1, "grad_norm": 9.482124003750835, "learning_rate": 1.971078550545825e-05, "loss": 1.2488, "step": 2593 }, { "epoch": 0.1, "grad_norm": 5.617585624826382, "learning_rate": 1.9710473937670792e-05, "loss": 1.287, "step": 2594 }, { "epoch": 0.1, "grad_norm": 6.562722403308764, "learning_rate": 1.9710162204614435e-05, "loss": 1.4877, "step": 2595 }, { "epoch": 0.1, "grad_norm": 7.639918939120277, "learning_rate": 1.9709850306294485e-05, "loss": 1.5055, "step": 2596 }, { "epoch": 0.1, "grad_norm": 5.378697982189352, "learning_rate": 1.970953824271625e-05, "loss": 1.1987, "step": 2597 }, { "epoch": 0.1, "grad_norm": 4.917312702005351, "learning_rate": 1.970922601388504e-05, "loss": 1.1321, "step": 2598 }, { "epoch": 0.1, "grad_norm": 4.872735214192032, "learning_rate": 1.970891361980617e-05, "loss": 1.267, "step": 2599 }, { "epoch": 0.1, "grad_norm": 4.983803978661779, "learning_rate": 1.970860106048495e-05, "loss": 1.1915, "step": 2600 }, { "epoch": 0.1, "grad_norm": 7.536393733639486, "learning_rate": 1.9708288335926714e-05, "loss": 1.3072, "step": 2601 }, { "epoch": 0.1, "grad_norm": 7.8302395304708945, "learning_rate": 1.970797544613677e-05, "loss": 1.3533, "step": 2602 }, { "epoch": 0.1, "grad_norm": 7.772681751432056, "learning_rate": 1.9707662391120456e-05, "loss": 1.3297, "step": 2603 }, { "epoch": 0.1, "grad_norm": 6.003462753384896, "learning_rate": 1.9707349170883087e-05, "loss": 1.4141, "step": 2604 }, { "epoch": 0.1, "grad_norm": 4.975087336141571, "learning_rate": 1.9707035785430005e-05, "loss": 1.2704, "step": 2605 }, { "epoch": 0.1, "grad_norm": 13.857051644240189, "learning_rate": 1.9706722234766537e-05, "loss": 1.3668, "step": 2606 }, { "epoch": 0.11, "grad_norm": 8.753991782344862, "learning_rate": 1.9706408518898025e-05, "loss": 1.456, "step": 2607 }, { "epoch": 0.11, "grad_norm": 6.571156882821755, "learning_rate": 1.9706094637829797e-05, "loss": 1.7173, "step": 2608 }, { "epoch": 0.11, "grad_norm": 9.185155369222365, "learning_rate": 1.9705780591567213e-05, "loss": 1.5724, "step": 2609 }, { "epoch": 0.11, "grad_norm": 5.278671248295658, "learning_rate": 1.9705466380115603e-05, "loss": 1.2371, "step": 2610 }, { "epoch": 0.11, "grad_norm": 7.582594741907626, "learning_rate": 1.970515200348032e-05, "loss": 1.3111, "step": 2611 }, { "epoch": 0.11, "grad_norm": 8.290958544940048, "learning_rate": 1.9704837461666715e-05, "loss": 1.2586, "step": 2612 }, { "epoch": 0.11, "grad_norm": 7.007766331586256, "learning_rate": 1.9704522754680143e-05, "loss": 1.3112, "step": 2613 }, { "epoch": 0.11, "grad_norm": 8.71701695075993, "learning_rate": 1.9704207882525955e-05, "loss": 1.6345, "step": 2614 }, { "epoch": 0.11, "grad_norm": 7.445357235939104, "learning_rate": 1.9703892845209513e-05, "loss": 1.3071, "step": 2615 }, { "epoch": 0.11, "grad_norm": 6.129241481667356, "learning_rate": 1.9703577642736182e-05, "loss": 1.2857, "step": 2616 }, { "epoch": 0.11, "grad_norm": 5.794384650257385, "learning_rate": 1.970326227511132e-05, "loss": 1.3616, "step": 2617 }, { "epoch": 0.11, "grad_norm": 6.893052752147094, "learning_rate": 1.97029467423403e-05, "loss": 1.2112, "step": 2618 }, { "epoch": 0.11, "grad_norm": 4.814211109622403, "learning_rate": 1.9702631044428492e-05, "loss": 1.221, "step": 2619 }, { "epoch": 0.11, "grad_norm": 6.972677239655188, "learning_rate": 1.9702315181381264e-05, "loss": 1.2337, "step": 2620 }, { "epoch": 0.11, "grad_norm": 8.196352352861117, "learning_rate": 1.9701999153203997e-05, "loss": 1.4647, "step": 2621 }, { "epoch": 0.11, "grad_norm": 5.905569600372065, "learning_rate": 1.970168295990207e-05, "loss": 1.4314, "step": 2622 }, { "epoch": 0.11, "grad_norm": 4.843204010319253, "learning_rate": 1.970136660148086e-05, "loss": 1.3417, "step": 2623 }, { "epoch": 0.11, "grad_norm": 8.093669898283489, "learning_rate": 1.9701050077945754e-05, "loss": 0.9207, "step": 2624 }, { "epoch": 0.11, "grad_norm": 10.371296593125958, "learning_rate": 1.970073338930214e-05, "loss": 1.364, "step": 2625 }, { "epoch": 0.11, "grad_norm": 5.456231878503099, "learning_rate": 1.9700416535555404e-05, "loss": 1.3838, "step": 2626 }, { "epoch": 0.11, "grad_norm": 10.367962455969566, "learning_rate": 1.9700099516710944e-05, "loss": 1.6727, "step": 2627 }, { "epoch": 0.11, "grad_norm": 8.092825874018594, "learning_rate": 1.9699782332774154e-05, "loss": 1.3588, "step": 2628 }, { "epoch": 0.11, "grad_norm": 5.648597273201228, "learning_rate": 1.969946498375043e-05, "loss": 1.0985, "step": 2629 }, { "epoch": 0.11, "grad_norm": 7.244211271625733, "learning_rate": 1.969914746964517e-05, "loss": 1.2563, "step": 2630 }, { "epoch": 0.11, "grad_norm": 7.5734040374852025, "learning_rate": 1.9698829790463792e-05, "loss": 1.5872, "step": 2631 }, { "epoch": 0.11, "grad_norm": 4.9132671764684295, "learning_rate": 1.969851194621169e-05, "loss": 1.3093, "step": 2632 }, { "epoch": 0.11, "grad_norm": 4.869013547994214, "learning_rate": 1.969819393689427e-05, "loss": 1.3391, "step": 2633 }, { "epoch": 0.11, "grad_norm": 7.935787370378075, "learning_rate": 1.969787576251696e-05, "loss": 1.3522, "step": 2634 }, { "epoch": 0.11, "grad_norm": 11.693761333803899, "learning_rate": 1.9697557423085163e-05, "loss": 1.2907, "step": 2635 }, { "epoch": 0.11, "grad_norm": 7.946515616282997, "learning_rate": 1.9697238918604303e-05, "loss": 1.4278, "step": 2636 }, { "epoch": 0.11, "grad_norm": 7.508275403223399, "learning_rate": 1.9696920249079795e-05, "loss": 1.3119, "step": 2637 }, { "epoch": 0.11, "grad_norm": 10.6256961738392, "learning_rate": 1.969660141451707e-05, "loss": 1.6451, "step": 2638 }, { "epoch": 0.11, "grad_norm": 5.446599124142236, "learning_rate": 1.969628241492155e-05, "loss": 1.2343, "step": 2639 }, { "epoch": 0.11, "grad_norm": 5.452133046659204, "learning_rate": 1.969596325029866e-05, "loss": 1.4679, "step": 2640 }, { "epoch": 0.11, "grad_norm": 5.592834168939742, "learning_rate": 1.9695643920653845e-05, "loss": 1.2738, "step": 2641 }, { "epoch": 0.11, "grad_norm": 7.432758786585461, "learning_rate": 1.9695324425992527e-05, "loss": 1.2182, "step": 2642 }, { "epoch": 0.11, "grad_norm": 6.188941705415736, "learning_rate": 1.9695004766320148e-05, "loss": 1.1953, "step": 2643 }, { "epoch": 0.11, "grad_norm": 5.4436755639824215, "learning_rate": 1.969468494164215e-05, "loss": 1.0823, "step": 2644 }, { "epoch": 0.11, "grad_norm": 5.490861642998686, "learning_rate": 1.9694364951963972e-05, "loss": 1.3339, "step": 2645 }, { "epoch": 0.11, "grad_norm": 5.848845453207603, "learning_rate": 1.9694044797291068e-05, "loss": 1.2896, "step": 2646 }, { "epoch": 0.11, "grad_norm": 5.194653474198149, "learning_rate": 1.9693724477628882e-05, "loss": 1.3313, "step": 2647 }, { "epoch": 0.11, "grad_norm": 5.72612689262227, "learning_rate": 1.9693403992982865e-05, "loss": 1.3308, "step": 2648 }, { "epoch": 0.11, "grad_norm": 7.21344519617811, "learning_rate": 1.9693083343358473e-05, "loss": 1.4584, "step": 2649 }, { "epoch": 0.11, "grad_norm": 6.86341747952875, "learning_rate": 1.969276252876116e-05, "loss": 1.2772, "step": 2650 }, { "epoch": 0.11, "grad_norm": 8.786564555091909, "learning_rate": 1.9692441549196393e-05, "loss": 1.2751, "step": 2651 }, { "epoch": 0.11, "grad_norm": 4.8928547299243546, "learning_rate": 1.969212040466963e-05, "loss": 1.3377, "step": 2652 }, { "epoch": 0.11, "grad_norm": 11.149200571263584, "learning_rate": 1.9691799095186338e-05, "loss": 1.5266, "step": 2653 }, { "epoch": 0.11, "grad_norm": 8.625527245206051, "learning_rate": 1.9691477620751985e-05, "loss": 1.9414, "step": 2654 }, { "epoch": 0.11, "grad_norm": 4.649545192142785, "learning_rate": 1.9691155981372043e-05, "loss": 1.0669, "step": 2655 }, { "epoch": 0.11, "grad_norm": 6.100652777122418, "learning_rate": 1.9690834177051984e-05, "loss": 1.4298, "step": 2656 }, { "epoch": 0.11, "grad_norm": 9.42682685603318, "learning_rate": 1.9690512207797287e-05, "loss": 1.4025, "step": 2657 }, { "epoch": 0.11, "grad_norm": 6.485111633744882, "learning_rate": 1.9690190073613434e-05, "loss": 1.3335, "step": 2658 }, { "epoch": 0.11, "grad_norm": 5.344349622565846, "learning_rate": 1.9689867774505906e-05, "loss": 1.1668, "step": 2659 }, { "epoch": 0.11, "grad_norm": 5.503404580393387, "learning_rate": 1.9689545310480186e-05, "loss": 1.4361, "step": 2660 }, { "epoch": 0.11, "grad_norm": 6.697256452857929, "learning_rate": 1.9689222681541766e-05, "loss": 1.7503, "step": 2661 }, { "epoch": 0.11, "grad_norm": 5.745235298324686, "learning_rate": 1.9688899887696134e-05, "loss": 1.2887, "step": 2662 }, { "epoch": 0.11, "grad_norm": 7.710038701970412, "learning_rate": 1.9688576928948783e-05, "loss": 1.3471, "step": 2663 }, { "epoch": 0.11, "grad_norm": 6.107554587208237, "learning_rate": 1.968825380530521e-05, "loss": 1.4368, "step": 2664 }, { "epoch": 0.11, "grad_norm": 5.757835486457908, "learning_rate": 1.9687930516770922e-05, "loss": 1.2799, "step": 2665 }, { "epoch": 0.11, "grad_norm": 7.116851055686256, "learning_rate": 1.9687607063351414e-05, "loss": 1.7312, "step": 2666 }, { "epoch": 0.11, "grad_norm": 5.5675377980561445, "learning_rate": 1.9687283445052192e-05, "loss": 1.1898, "step": 2667 }, { "epoch": 0.11, "grad_norm": 3.5652486917275397, "learning_rate": 1.9686959661878763e-05, "loss": 1.1707, "step": 2668 }, { "epoch": 0.11, "grad_norm": 7.608071758920661, "learning_rate": 1.9686635713836638e-05, "loss": 1.5562, "step": 2669 }, { "epoch": 0.11, "grad_norm": 5.698834640255228, "learning_rate": 1.9686311600931334e-05, "loss": 1.3651, "step": 2670 }, { "epoch": 0.11, "grad_norm": 6.793840444509706, "learning_rate": 1.968598732316836e-05, "loss": 1.3487, "step": 2671 }, { "epoch": 0.11, "grad_norm": 7.77514727182542, "learning_rate": 1.9685662880553242e-05, "loss": 1.3989, "step": 2672 }, { "epoch": 0.11, "grad_norm": 7.203341460102947, "learning_rate": 1.96853382730915e-05, "loss": 1.4155, "step": 2673 }, { "epoch": 0.11, "grad_norm": 6.21709180764204, "learning_rate": 1.968501350078866e-05, "loss": 1.0167, "step": 2674 }, { "epoch": 0.11, "grad_norm": 7.801862324736657, "learning_rate": 1.9684688563650247e-05, "loss": 1.3805, "step": 2675 }, { "epoch": 0.11, "grad_norm": 5.03908982357433, "learning_rate": 1.9684363461681792e-05, "loss": 1.4309, "step": 2676 }, { "epoch": 0.11, "grad_norm": 7.681365024019717, "learning_rate": 1.9684038194888827e-05, "loss": 1.1512, "step": 2677 }, { "epoch": 0.11, "grad_norm": 7.669043516411221, "learning_rate": 1.968371276327689e-05, "loss": 1.2958, "step": 2678 }, { "epoch": 0.11, "grad_norm": 5.542748346643057, "learning_rate": 1.968338716685152e-05, "loss": 1.4848, "step": 2679 }, { "epoch": 0.11, "grad_norm": 7.623009384238525, "learning_rate": 1.968306140561826e-05, "loss": 1.5436, "step": 2680 }, { "epoch": 0.11, "grad_norm": 6.3282366296029755, "learning_rate": 1.9682735479582648e-05, "loss": 1.3925, "step": 2681 }, { "epoch": 0.11, "grad_norm": 4.990307853502858, "learning_rate": 1.9682409388750234e-05, "loss": 1.074, "step": 2682 }, { "epoch": 0.11, "grad_norm": 5.819775468128765, "learning_rate": 1.9682083133126572e-05, "loss": 1.293, "step": 2683 }, { "epoch": 0.11, "grad_norm": 12.763354357104793, "learning_rate": 1.968175671271721e-05, "loss": 1.4267, "step": 2684 }, { "epoch": 0.11, "grad_norm": 6.975424420129589, "learning_rate": 1.9681430127527705e-05, "loss": 1.5483, "step": 2685 }, { "epoch": 0.11, "grad_norm": 5.555192594316251, "learning_rate": 1.9681103377563617e-05, "loss": 1.1957, "step": 2686 }, { "epoch": 0.11, "grad_norm": 6.180674169508494, "learning_rate": 1.9680776462830506e-05, "loss": 1.5546, "step": 2687 }, { "epoch": 0.11, "grad_norm": 5.31878764622501, "learning_rate": 1.968044938333393e-05, "loss": 1.2565, "step": 2688 }, { "epoch": 0.11, "grad_norm": 6.533048670545851, "learning_rate": 1.968012213907947e-05, "loss": 1.0506, "step": 2689 }, { "epoch": 0.11, "grad_norm": 6.735540246500043, "learning_rate": 1.9679794730072684e-05, "loss": 1.0938, "step": 2690 }, { "epoch": 0.11, "grad_norm": 6.024936087787572, "learning_rate": 1.9679467156319148e-05, "loss": 1.6703, "step": 2691 }, { "epoch": 0.11, "grad_norm": 7.045798465348672, "learning_rate": 1.9679139417824435e-05, "loss": 1.1684, "step": 2692 }, { "epoch": 0.11, "grad_norm": 8.579634980579847, "learning_rate": 1.9678811514594124e-05, "loss": 1.3536, "step": 2693 }, { "epoch": 0.11, "grad_norm": 7.745255431764085, "learning_rate": 1.9678483446633797e-05, "loss": 1.1109, "step": 2694 }, { "epoch": 0.11, "grad_norm": 6.542947265265834, "learning_rate": 1.967815521394904e-05, "loss": 1.4376, "step": 2695 }, { "epoch": 0.11, "grad_norm": 5.164446908759423, "learning_rate": 1.9677826816545434e-05, "loss": 1.1871, "step": 2696 }, { "epoch": 0.11, "grad_norm": 9.82784491661167, "learning_rate": 1.9677498254428572e-05, "loss": 1.6281, "step": 2697 }, { "epoch": 0.11, "grad_norm": 5.353131912383838, "learning_rate": 1.9677169527604047e-05, "loss": 1.3564, "step": 2698 }, { "epoch": 0.11, "grad_norm": 5.755077056494895, "learning_rate": 1.9676840636077447e-05, "loss": 1.3584, "step": 2699 }, { "epoch": 0.11, "grad_norm": 4.424367356868997, "learning_rate": 1.9676511579854375e-05, "loss": 1.0231, "step": 2700 }, { "epoch": 0.11, "grad_norm": 14.08466756546122, "learning_rate": 1.9676182358940434e-05, "loss": 1.6196, "step": 2701 }, { "epoch": 0.11, "grad_norm": 6.343602729307134, "learning_rate": 1.967585297334122e-05, "loss": 1.4593, "step": 2702 }, { "epoch": 0.11, "grad_norm": 7.058906287184488, "learning_rate": 1.9675523423062347e-05, "loss": 1.4502, "step": 2703 }, { "epoch": 0.11, "grad_norm": 6.768534045984661, "learning_rate": 1.9675193708109415e-05, "loss": 1.4222, "step": 2704 }, { "epoch": 0.11, "grad_norm": 6.325415876580892, "learning_rate": 1.967486382848804e-05, "loss": 1.4382, "step": 2705 }, { "epoch": 0.11, "grad_norm": 5.489690987699771, "learning_rate": 1.967453378420384e-05, "loss": 1.2465, "step": 2706 }, { "epoch": 0.11, "grad_norm": 6.098141216297506, "learning_rate": 1.967420357526243e-05, "loss": 1.2797, "step": 2707 }, { "epoch": 0.11, "grad_norm": 5.95364769501616, "learning_rate": 1.9673873201669428e-05, "loss": 1.37, "step": 2708 }, { "epoch": 0.11, "grad_norm": 5.117781522531783, "learning_rate": 1.9673542663430457e-05, "loss": 1.3079, "step": 2709 }, { "epoch": 0.11, "grad_norm": 6.485972373108887, "learning_rate": 1.967321196055114e-05, "loss": 1.1891, "step": 2710 }, { "epoch": 0.11, "grad_norm": 10.53221064131135, "learning_rate": 1.9672881093037114e-05, "loss": 1.5583, "step": 2711 }, { "epoch": 0.11, "grad_norm": 5.310811151339031, "learning_rate": 1.9672550060894003e-05, "loss": 1.2058, "step": 2712 }, { "epoch": 0.11, "grad_norm": 6.422410736448932, "learning_rate": 1.9672218864127445e-05, "loss": 1.3707, "step": 2713 }, { "epoch": 0.11, "grad_norm": 5.176430183471184, "learning_rate": 1.9671887502743075e-05, "loss": 1.3765, "step": 2714 }, { "epoch": 0.11, "grad_norm": 5.590180176362584, "learning_rate": 1.9671555976746536e-05, "loss": 1.4214, "step": 2715 }, { "epoch": 0.11, "grad_norm": 6.0084746626061385, "learning_rate": 1.9671224286143462e-05, "loss": 1.335, "step": 2716 }, { "epoch": 0.11, "grad_norm": 7.98658812168835, "learning_rate": 1.9670892430939504e-05, "loss": 1.4309, "step": 2717 }, { "epoch": 0.11, "grad_norm": 6.108684537305699, "learning_rate": 1.967056041114031e-05, "loss": 1.1104, "step": 2718 }, { "epoch": 0.11, "grad_norm": 4.905700178962875, "learning_rate": 1.967022822675153e-05, "loss": 1.2274, "step": 2719 }, { "epoch": 0.11, "grad_norm": 6.893630922726997, "learning_rate": 1.966989587777882e-05, "loss": 1.3914, "step": 2720 }, { "epoch": 0.11, "grad_norm": 6.305385370644963, "learning_rate": 1.9669563364227834e-05, "loss": 1.5114, "step": 2721 }, { "epoch": 0.11, "grad_norm": 4.793130231758179, "learning_rate": 1.9669230686104233e-05, "loss": 1.1672, "step": 2722 }, { "epoch": 0.11, "grad_norm": 6.47311372819021, "learning_rate": 1.9668897843413676e-05, "loss": 1.3378, "step": 2723 }, { "epoch": 0.11, "grad_norm": 7.264044963137334, "learning_rate": 1.966856483616183e-05, "loss": 1.3463, "step": 2724 }, { "epoch": 0.11, "grad_norm": 8.010400668513547, "learning_rate": 1.9668231664354365e-05, "loss": 0.8856, "step": 2725 }, { "epoch": 0.11, "grad_norm": 6.8070727574670595, "learning_rate": 1.9667898327996947e-05, "loss": 1.592, "step": 2726 }, { "epoch": 0.11, "grad_norm": 6.185424625114049, "learning_rate": 1.9667564827095248e-05, "loss": 1.4691, "step": 2727 }, { "epoch": 0.11, "grad_norm": 6.790047347138205, "learning_rate": 1.9667231161654952e-05, "loss": 1.187, "step": 2728 }, { "epoch": 0.11, "grad_norm": 7.035057492653119, "learning_rate": 1.9666897331681732e-05, "loss": 1.4052, "step": 2729 }, { "epoch": 0.11, "grad_norm": 6.931391081861229, "learning_rate": 1.966656333718127e-05, "loss": 1.3659, "step": 2730 }, { "epoch": 0.11, "grad_norm": 6.169570998747281, "learning_rate": 1.966622917815925e-05, "loss": 1.3168, "step": 2731 }, { "epoch": 0.11, "grad_norm": 5.877566618953312, "learning_rate": 1.966589485462136e-05, "loss": 1.1932, "step": 2732 }, { "epoch": 0.11, "grad_norm": 5.3657600811029935, "learning_rate": 1.9665560366573292e-05, "loss": 1.2801, "step": 2733 }, { "epoch": 0.11, "grad_norm": 6.071324244342275, "learning_rate": 1.966522571402074e-05, "loss": 1.4703, "step": 2734 }, { "epoch": 0.11, "grad_norm": 5.515860057439115, "learning_rate": 1.9664890896969397e-05, "loss": 1.1866, "step": 2735 }, { "epoch": 0.11, "grad_norm": 7.984958897150914, "learning_rate": 1.9664555915424957e-05, "loss": 1.3443, "step": 2736 }, { "epoch": 0.11, "grad_norm": 5.334276366194055, "learning_rate": 1.9664220769393127e-05, "loss": 1.5344, "step": 2737 }, { "epoch": 0.11, "grad_norm": 7.557632553846091, "learning_rate": 1.9663885458879614e-05, "loss": 1.4431, "step": 2738 }, { "epoch": 0.11, "grad_norm": 7.754010481727069, "learning_rate": 1.9663549983890117e-05, "loss": 1.286, "step": 2739 }, { "epoch": 0.11, "grad_norm": 6.657492117621428, "learning_rate": 1.966321434443035e-05, "loss": 1.0604, "step": 2740 }, { "epoch": 0.11, "grad_norm": 6.484174994932781, "learning_rate": 1.9662878540506026e-05, "loss": 1.5294, "step": 2741 }, { "epoch": 0.11, "grad_norm": 5.462947855182102, "learning_rate": 1.966254257212286e-05, "loss": 1.5214, "step": 2742 }, { "epoch": 0.11, "grad_norm": 6.748969895592326, "learning_rate": 1.9662206439286566e-05, "loss": 1.3349, "step": 2743 }, { "epoch": 0.11, "grad_norm": 4.846406109520501, "learning_rate": 1.966187014200287e-05, "loss": 1.2443, "step": 2744 }, { "epoch": 0.11, "grad_norm": 5.42855112727872, "learning_rate": 1.9661533680277495e-05, "loss": 1.2886, "step": 2745 }, { "epoch": 0.11, "grad_norm": 4.489372436870935, "learning_rate": 1.9661197054116165e-05, "loss": 1.2051, "step": 2746 }, { "epoch": 0.11, "grad_norm": 7.097993313979912, "learning_rate": 1.9660860263524612e-05, "loss": 1.344, "step": 2747 }, { "epoch": 0.11, "grad_norm": 6.503484134715703, "learning_rate": 1.9660523308508565e-05, "loss": 1.552, "step": 2748 }, { "epoch": 0.11, "grad_norm": 6.951035554240658, "learning_rate": 1.9660186189073763e-05, "loss": 1.6213, "step": 2749 }, { "epoch": 0.11, "grad_norm": 8.213123144938951, "learning_rate": 1.965984890522594e-05, "loss": 1.4438, "step": 2750 }, { "epoch": 0.11, "grad_norm": 8.347535284783067, "learning_rate": 1.9659511456970833e-05, "loss": 1.4343, "step": 2751 }, { "epoch": 0.11, "grad_norm": 6.027988667183151, "learning_rate": 1.9659173844314194e-05, "loss": 1.7236, "step": 2752 }, { "epoch": 0.11, "grad_norm": 5.989486344905474, "learning_rate": 1.9658836067261766e-05, "loss": 0.9638, "step": 2753 }, { "epoch": 0.11, "grad_norm": 6.083279259266226, "learning_rate": 1.9658498125819293e-05, "loss": 1.3267, "step": 2754 }, { "epoch": 0.11, "grad_norm": 5.775133600965877, "learning_rate": 1.9658160019992534e-05, "loss": 1.2873, "step": 2755 }, { "epoch": 0.11, "grad_norm": 6.320594295275376, "learning_rate": 1.9657821749787237e-05, "loss": 1.2924, "step": 2756 }, { "epoch": 0.11, "grad_norm": 9.572308131025181, "learning_rate": 1.9657483315209162e-05, "loss": 1.5642, "step": 2757 }, { "epoch": 0.11, "grad_norm": 6.083294500059953, "learning_rate": 1.9657144716264073e-05, "loss": 1.5336, "step": 2758 }, { "epoch": 0.11, "grad_norm": 9.28399374268687, "learning_rate": 1.9656805952957726e-05, "loss": 1.4155, "step": 2759 }, { "epoch": 0.11, "grad_norm": 5.029630607930598, "learning_rate": 1.965646702529589e-05, "loss": 1.3834, "step": 2760 }, { "epoch": 0.11, "grad_norm": 7.006328454238826, "learning_rate": 1.9656127933284334e-05, "loss": 1.6977, "step": 2761 }, { "epoch": 0.11, "grad_norm": 8.858457426903794, "learning_rate": 1.9655788676928826e-05, "loss": 1.465, "step": 2762 }, { "epoch": 0.11, "grad_norm": 5.285633985799881, "learning_rate": 1.9655449256235143e-05, "loss": 1.2721, "step": 2763 }, { "epoch": 0.11, "grad_norm": 6.890785900426548, "learning_rate": 1.9655109671209062e-05, "loss": 1.0141, "step": 2764 }, { "epoch": 0.11, "grad_norm": 5.104977486636603, "learning_rate": 1.965476992185636e-05, "loss": 1.3042, "step": 2765 }, { "epoch": 0.11, "grad_norm": 5.987633889899133, "learning_rate": 1.9654430008182824e-05, "loss": 1.1932, "step": 2766 }, { "epoch": 0.11, "grad_norm": 5.894923459308026, "learning_rate": 1.9654089930194234e-05, "loss": 1.3016, "step": 2767 }, { "epoch": 0.11, "grad_norm": 6.821217418502508, "learning_rate": 1.965374968789638e-05, "loss": 1.4806, "step": 2768 }, { "epoch": 0.11, "grad_norm": 5.702403826431151, "learning_rate": 1.9653409281295053e-05, "loss": 1.5811, "step": 2769 }, { "epoch": 0.11, "grad_norm": 5.519920633596432, "learning_rate": 1.965306871039605e-05, "loss": 1.1832, "step": 2770 }, { "epoch": 0.11, "grad_norm": 6.426337340831826, "learning_rate": 1.9652727975205163e-05, "loss": 1.5694, "step": 2771 }, { "epoch": 0.11, "grad_norm": 7.456171605167301, "learning_rate": 1.965238707572819e-05, "loss": 1.2235, "step": 2772 }, { "epoch": 0.11, "grad_norm": 6.6982703207900745, "learning_rate": 1.9652046011970937e-05, "loss": 1.2918, "step": 2773 }, { "epoch": 0.11, "grad_norm": 5.294801423525001, "learning_rate": 1.9651704783939206e-05, "loss": 1.5028, "step": 2774 }, { "epoch": 0.11, "grad_norm": 6.842042936543816, "learning_rate": 1.9651363391638804e-05, "loss": 1.0369, "step": 2775 }, { "epoch": 0.11, "grad_norm": 8.80397930166349, "learning_rate": 1.9651021835075545e-05, "loss": 1.7982, "step": 2776 }, { "epoch": 0.11, "grad_norm": 6.889976636647789, "learning_rate": 1.9650680114255244e-05, "loss": 1.1349, "step": 2777 }, { "epoch": 0.11, "grad_norm": 5.26751993008654, "learning_rate": 1.9650338229183708e-05, "loss": 1.2002, "step": 2778 }, { "epoch": 0.11, "grad_norm": 4.749358005633514, "learning_rate": 1.964999617986676e-05, "loss": 1.1048, "step": 2779 }, { "epoch": 0.11, "grad_norm": 5.528745578930723, "learning_rate": 1.9649653966310227e-05, "loss": 1.3761, "step": 2780 }, { "epoch": 0.11, "grad_norm": 4.225904791585423, "learning_rate": 1.9649311588519927e-05, "loss": 1.3215, "step": 2781 }, { "epoch": 0.11, "grad_norm": 3.4567084198471654, "learning_rate": 1.9648969046501688e-05, "loss": 1.1459, "step": 2782 }, { "epoch": 0.11, "grad_norm": 6.729173500930345, "learning_rate": 1.9648626340261346e-05, "loss": 1.3909, "step": 2783 }, { "epoch": 0.11, "grad_norm": 5.795136701138339, "learning_rate": 1.9648283469804723e-05, "loss": 1.3115, "step": 2784 }, { "epoch": 0.11, "grad_norm": 5.2922720186001655, "learning_rate": 1.964794043513766e-05, "loss": 1.2719, "step": 2785 }, { "epoch": 0.11, "grad_norm": 7.541398203476618, "learning_rate": 1.9647597236266e-05, "loss": 1.3593, "step": 2786 }, { "epoch": 0.11, "grad_norm": 7.796024424037487, "learning_rate": 1.9647253873195578e-05, "loss": 1.3737, "step": 2787 }, { "epoch": 0.11, "grad_norm": 5.717613274294006, "learning_rate": 1.9646910345932237e-05, "loss": 1.4179, "step": 2788 }, { "epoch": 0.11, "grad_norm": 5.789135693081646, "learning_rate": 1.9646566654481833e-05, "loss": 1.0573, "step": 2789 }, { "epoch": 0.11, "grad_norm": 6.8996782589804075, "learning_rate": 1.96462227988502e-05, "loss": 1.6745, "step": 2790 }, { "epoch": 0.11, "grad_norm": 6.386067592439238, "learning_rate": 1.9645878779043203e-05, "loss": 1.5231, "step": 2791 }, { "epoch": 0.11, "grad_norm": 5.52714251171493, "learning_rate": 1.9645534595066697e-05, "loss": 1.1773, "step": 2792 }, { "epoch": 0.11, "grad_norm": 5.974401519825416, "learning_rate": 1.9645190246926532e-05, "loss": 1.4538, "step": 2793 }, { "epoch": 0.11, "grad_norm": 5.425727593177253, "learning_rate": 1.9644845734628573e-05, "loss": 1.3373, "step": 2794 }, { "epoch": 0.11, "grad_norm": 4.457637090283118, "learning_rate": 1.9644501058178686e-05, "loss": 1.1194, "step": 2795 }, { "epoch": 0.11, "grad_norm": 5.261110202674557, "learning_rate": 1.964415621758273e-05, "loss": 1.2325, "step": 2796 }, { "epoch": 0.11, "grad_norm": 6.727085281587915, "learning_rate": 1.9643811212846583e-05, "loss": 1.4017, "step": 2797 }, { "epoch": 0.11, "grad_norm": 5.669661160780835, "learning_rate": 1.964346604397611e-05, "loss": 1.1224, "step": 2798 }, { "epoch": 0.11, "grad_norm": 5.603343268310057, "learning_rate": 1.964312071097719e-05, "loss": 1.2152, "step": 2799 }, { "epoch": 0.11, "grad_norm": 6.584081278612616, "learning_rate": 1.9642775213855696e-05, "loss": 1.4744, "step": 2800 }, { "epoch": 0.11, "grad_norm": 4.861484195069706, "learning_rate": 1.964242955261751e-05, "loss": 1.0113, "step": 2801 }, { "epoch": 0.11, "grad_norm": 5.140797000024259, "learning_rate": 1.9642083727268522e-05, "loss": 0.9178, "step": 2802 }, { "epoch": 0.11, "grad_norm": 5.953665568602358, "learning_rate": 1.9641737737814608e-05, "loss": 1.3652, "step": 2803 }, { "epoch": 0.11, "grad_norm": 6.727611758723014, "learning_rate": 1.964139158426166e-05, "loss": 1.2361, "step": 2804 }, { "epoch": 0.11, "grad_norm": 8.022872001862229, "learning_rate": 1.964104526661557e-05, "loss": 1.3895, "step": 2805 }, { "epoch": 0.11, "grad_norm": 5.760873124369996, "learning_rate": 1.9640698784882234e-05, "loss": 1.3244, "step": 2806 }, { "epoch": 0.11, "grad_norm": 5.445761211332446, "learning_rate": 1.9640352139067544e-05, "loss": 1.4343, "step": 2807 }, { "epoch": 0.11, "grad_norm": 6.959768970519632, "learning_rate": 1.9640005329177405e-05, "loss": 1.3001, "step": 2808 }, { "epoch": 0.11, "grad_norm": 8.077627419149165, "learning_rate": 1.9639658355217716e-05, "loss": 1.4133, "step": 2809 }, { "epoch": 0.11, "grad_norm": 5.489544276868499, "learning_rate": 1.9639311217194383e-05, "loss": 0.9325, "step": 2810 }, { "epoch": 0.11, "grad_norm": 3.954737840251701, "learning_rate": 1.9638963915113317e-05, "loss": 1.1183, "step": 2811 }, { "epoch": 0.11, "grad_norm": 6.510906637869646, "learning_rate": 1.9638616448980428e-05, "loss": 1.276, "step": 2812 }, { "epoch": 0.11, "grad_norm": 7.620640085005474, "learning_rate": 1.9638268818801624e-05, "loss": 1.4719, "step": 2813 }, { "epoch": 0.11, "grad_norm": 7.964273454987224, "learning_rate": 1.963792102458283e-05, "loss": 1.2706, "step": 2814 }, { "epoch": 0.11, "grad_norm": 4.453076955862479, "learning_rate": 1.963757306632996e-05, "loss": 1.2797, "step": 2815 }, { "epoch": 0.11, "grad_norm": 5.65631590131971, "learning_rate": 1.9637224944048938e-05, "loss": 1.2578, "step": 2816 }, { "epoch": 0.11, "grad_norm": 5.949128000237121, "learning_rate": 1.9636876657745687e-05, "loss": 1.2016, "step": 2817 }, { "epoch": 0.11, "grad_norm": 7.2880866058449465, "learning_rate": 1.963652820742614e-05, "loss": 1.219, "step": 2818 }, { "epoch": 0.11, "grad_norm": 5.884923106688556, "learning_rate": 1.963617959309622e-05, "loss": 1.1964, "step": 2819 }, { "epoch": 0.11, "grad_norm": 6.005678290210574, "learning_rate": 1.9635830814761864e-05, "loss": 1.2213, "step": 2820 }, { "epoch": 0.11, "grad_norm": 7.737778248751051, "learning_rate": 1.963548187242901e-05, "loss": 1.7709, "step": 2821 }, { "epoch": 0.11, "grad_norm": 7.557764276695044, "learning_rate": 1.9635132766103596e-05, "loss": 1.587, "step": 2822 }, { "epoch": 0.11, "grad_norm": 8.990558305498386, "learning_rate": 1.963478349579156e-05, "loss": 1.196, "step": 2823 }, { "epoch": 0.11, "grad_norm": 6.728168765473515, "learning_rate": 1.963443406149885e-05, "loss": 1.4557, "step": 2824 }, { "epoch": 0.11, "grad_norm": 6.435177731692842, "learning_rate": 1.9634084463231414e-05, "loss": 1.5095, "step": 2825 }, { "epoch": 0.11, "grad_norm": 6.19998539063743, "learning_rate": 1.96337347009952e-05, "loss": 1.2355, "step": 2826 }, { "epoch": 0.11, "grad_norm": 3.6980594723944615, "learning_rate": 1.963338477479616e-05, "loss": 1.0948, "step": 2827 }, { "epoch": 0.11, "grad_norm": 5.665371280262307, "learning_rate": 1.9633034684640248e-05, "loss": 1.4163, "step": 2828 }, { "epoch": 0.11, "grad_norm": 5.8403540953147495, "learning_rate": 1.9632684430533432e-05, "loss": 1.4926, "step": 2829 }, { "epoch": 0.11, "grad_norm": 7.121091710661659, "learning_rate": 1.9632334012481662e-05, "loss": 1.576, "step": 2830 }, { "epoch": 0.11, "grad_norm": 6.82688596177203, "learning_rate": 1.9631983430490907e-05, "loss": 1.3069, "step": 2831 }, { "epoch": 0.11, "grad_norm": 6.257572257648184, "learning_rate": 1.9631632684567133e-05, "loss": 1.3457, "step": 2832 }, { "epoch": 0.11, "grad_norm": 6.523104392929014, "learning_rate": 1.963128177471631e-05, "loss": 1.7196, "step": 2833 }, { "epoch": 0.11, "grad_norm": 4.997030793879166, "learning_rate": 1.9630930700944413e-05, "loss": 1.5087, "step": 2834 }, { "epoch": 0.11, "grad_norm": 6.302666638407603, "learning_rate": 1.9630579463257414e-05, "loss": 1.2299, "step": 2835 }, { "epoch": 0.11, "grad_norm": 6.920095918709255, "learning_rate": 1.963022806166129e-05, "loss": 1.2553, "step": 2836 }, { "epoch": 0.11, "grad_norm": 4.177563167127155, "learning_rate": 1.962987649616202e-05, "loss": 1.3203, "step": 2837 }, { "epoch": 0.11, "grad_norm": 6.246860194724832, "learning_rate": 1.9629524766765593e-05, "loss": 1.4603, "step": 2838 }, { "epoch": 0.11, "grad_norm": 4.7638675826899615, "learning_rate": 1.9629172873477995e-05, "loss": 1.2964, "step": 2839 }, { "epoch": 0.11, "grad_norm": 6.418423443124514, "learning_rate": 1.9628820816305213e-05, "loss": 1.3539, "step": 2840 }, { "epoch": 0.11, "grad_norm": 7.089433770382736, "learning_rate": 1.9628468595253233e-05, "loss": 1.5603, "step": 2841 }, { "epoch": 0.11, "grad_norm": 5.918410746020326, "learning_rate": 1.9628116210328062e-05, "loss": 1.5613, "step": 2842 }, { "epoch": 0.11, "grad_norm": 5.617093920342838, "learning_rate": 1.9627763661535687e-05, "loss": 1.7142, "step": 2843 }, { "epoch": 0.11, "grad_norm": 5.836870930690821, "learning_rate": 1.9627410948882115e-05, "loss": 1.1748, "step": 2844 }, { "epoch": 0.11, "grad_norm": 5.740540623794242, "learning_rate": 1.9627058072373347e-05, "loss": 1.3688, "step": 2845 }, { "epoch": 0.11, "grad_norm": 4.78262268344502, "learning_rate": 1.9626705032015386e-05, "loss": 0.9966, "step": 2846 }, { "epoch": 0.11, "grad_norm": 4.5933736281000765, "learning_rate": 1.9626351827814242e-05, "loss": 1.1126, "step": 2847 }, { "epoch": 0.11, "grad_norm": 4.504756426907731, "learning_rate": 1.962599845977593e-05, "loss": 1.1528, "step": 2848 }, { "epoch": 0.11, "grad_norm": 3.6605424783366542, "learning_rate": 1.9625644927906458e-05, "loss": 1.2022, "step": 2849 }, { "epoch": 0.11, "grad_norm": 5.127310216186128, "learning_rate": 1.962529123221185e-05, "loss": 1.1252, "step": 2850 }, { "epoch": 0.11, "grad_norm": 5.429840219216996, "learning_rate": 1.962493737269812e-05, "loss": 1.3465, "step": 2851 }, { "epoch": 0.11, "grad_norm": 4.8463620064832575, "learning_rate": 1.962458334937129e-05, "loss": 1.2957, "step": 2852 }, { "epoch": 0.11, "grad_norm": 6.386598681357348, "learning_rate": 1.9624229162237394e-05, "loss": 1.4593, "step": 2853 }, { "epoch": 0.11, "grad_norm": 5.809225249662573, "learning_rate": 1.9623874811302454e-05, "loss": 1.3638, "step": 2854 }, { "epoch": 0.12, "grad_norm": 6.045619293079117, "learning_rate": 1.9623520296572497e-05, "loss": 1.4809, "step": 2855 }, { "epoch": 0.12, "grad_norm": 4.776247598722774, "learning_rate": 1.962316561805356e-05, "loss": 1.0748, "step": 2856 }, { "epoch": 0.12, "grad_norm": 6.177123213498283, "learning_rate": 1.9622810775751688e-05, "loss": 1.4757, "step": 2857 }, { "epoch": 0.12, "grad_norm": 5.21474537755649, "learning_rate": 1.9622455769672904e-05, "loss": 1.3392, "step": 2858 }, { "epoch": 0.12, "grad_norm": 4.931093733180505, "learning_rate": 1.962210059982326e-05, "loss": 1.1643, "step": 2859 }, { "epoch": 0.12, "grad_norm": 5.401431797382821, "learning_rate": 1.9621745266208803e-05, "loss": 1.0452, "step": 2860 }, { "epoch": 0.12, "grad_norm": 5.876646164511465, "learning_rate": 1.962138976883558e-05, "loss": 1.1907, "step": 2861 }, { "epoch": 0.12, "grad_norm": 6.6906775656598, "learning_rate": 1.9621034107709635e-05, "loss": 1.6123, "step": 2862 }, { "epoch": 0.12, "grad_norm": 5.690848291450209, "learning_rate": 1.9620678282837023e-05, "loss": 1.1326, "step": 2863 }, { "epoch": 0.12, "grad_norm": 5.6236062903143145, "learning_rate": 1.9620322294223805e-05, "loss": 1.2682, "step": 2864 }, { "epoch": 0.12, "grad_norm": 7.315662759418646, "learning_rate": 1.9619966141876035e-05, "loss": 1.5175, "step": 2865 }, { "epoch": 0.12, "grad_norm": 5.142642505500878, "learning_rate": 1.961960982579978e-05, "loss": 1.2871, "step": 2866 }, { "epoch": 0.12, "grad_norm": 4.436541144391051, "learning_rate": 1.9619253346001096e-05, "loss": 1.2387, "step": 2867 }, { "epoch": 0.12, "grad_norm": 8.105759302399907, "learning_rate": 1.9618896702486055e-05, "loss": 1.4664, "step": 2868 }, { "epoch": 0.12, "grad_norm": 6.351610552585991, "learning_rate": 1.961853989526073e-05, "loss": 1.3793, "step": 2869 }, { "epoch": 0.12, "grad_norm": 4.6163534310487275, "learning_rate": 1.9618182924331187e-05, "loss": 1.2818, "step": 2870 }, { "epoch": 0.12, "grad_norm": 7.672578767145308, "learning_rate": 1.961782578970351e-05, "loss": 1.6266, "step": 2871 }, { "epoch": 0.12, "grad_norm": 9.668347899213302, "learning_rate": 1.9617468491383768e-05, "loss": 1.4731, "step": 2872 }, { "epoch": 0.12, "grad_norm": 8.42370501469077, "learning_rate": 1.961711102937805e-05, "loss": 1.4226, "step": 2873 }, { "epoch": 0.12, "grad_norm": 7.414895713083783, "learning_rate": 1.9616753403692433e-05, "loss": 1.5084, "step": 2874 }, { "epoch": 0.12, "grad_norm": 7.1980634292722065, "learning_rate": 1.961639561433301e-05, "loss": 1.4234, "step": 2875 }, { "epoch": 0.12, "grad_norm": 5.571689709055699, "learning_rate": 1.9616037661305867e-05, "loss": 1.0961, "step": 2876 }, { "epoch": 0.12, "grad_norm": 4.132075438621875, "learning_rate": 1.9615679544617094e-05, "loss": 1.1206, "step": 2877 }, { "epoch": 0.12, "grad_norm": 4.3487393555883775, "learning_rate": 1.9615321264272795e-05, "loss": 1.2646, "step": 2878 }, { "epoch": 0.12, "grad_norm": 7.324496386006089, "learning_rate": 1.9614962820279056e-05, "loss": 1.4227, "step": 2879 }, { "epoch": 0.12, "grad_norm": 6.659609525772356, "learning_rate": 1.9614604212641985e-05, "loss": 1.298, "step": 2880 }, { "epoch": 0.12, "grad_norm": 8.695250021756731, "learning_rate": 1.9614245441367682e-05, "loss": 1.4738, "step": 2881 }, { "epoch": 0.12, "grad_norm": 4.6184096888809165, "learning_rate": 1.9613886506462253e-05, "loss": 1.1892, "step": 2882 }, { "epoch": 0.12, "grad_norm": 4.837886466306968, "learning_rate": 1.9613527407931815e-05, "loss": 1.2968, "step": 2883 }, { "epoch": 0.12, "grad_norm": 4.498531798787343, "learning_rate": 1.9613168145782468e-05, "loss": 1.1254, "step": 2884 }, { "epoch": 0.12, "grad_norm": 8.498342773474677, "learning_rate": 1.9612808720020335e-05, "loss": 1.2909, "step": 2885 }, { "epoch": 0.12, "grad_norm": 5.71947128340825, "learning_rate": 1.9612449130651528e-05, "loss": 1.3005, "step": 2886 }, { "epoch": 0.12, "grad_norm": 6.550068955528291, "learning_rate": 1.961208937768217e-05, "loss": 1.3433, "step": 2887 }, { "epoch": 0.12, "grad_norm": 6.117216278986792, "learning_rate": 1.9611729461118384e-05, "loss": 1.3155, "step": 2888 }, { "epoch": 0.12, "grad_norm": 5.401941916760231, "learning_rate": 1.9611369380966297e-05, "loss": 1.2425, "step": 2889 }, { "epoch": 0.12, "grad_norm": 16.26270200597905, "learning_rate": 1.961100913723203e-05, "loss": 1.8019, "step": 2890 }, { "epoch": 0.12, "grad_norm": 7.76039505527485, "learning_rate": 1.961064872992172e-05, "loss": 1.5588, "step": 2891 }, { "epoch": 0.12, "grad_norm": 7.29796211946602, "learning_rate": 1.9610288159041503e-05, "loss": 1.3669, "step": 2892 }, { "epoch": 0.12, "grad_norm": 9.848291883336264, "learning_rate": 1.9609927424597515e-05, "loss": 1.6341, "step": 2893 }, { "epoch": 0.12, "grad_norm": 8.2425399622976, "learning_rate": 1.960956652659589e-05, "loss": 1.4863, "step": 2894 }, { "epoch": 0.12, "grad_norm": 7.127132957625464, "learning_rate": 1.9609205465042778e-05, "loss": 1.3095, "step": 2895 }, { "epoch": 0.12, "grad_norm": 8.169493650840664, "learning_rate": 1.9608844239944317e-05, "loss": 1.5671, "step": 2896 }, { "epoch": 0.12, "grad_norm": 10.693782543691986, "learning_rate": 1.960848285130666e-05, "loss": 1.3775, "step": 2897 }, { "epoch": 0.12, "grad_norm": 7.329634978392567, "learning_rate": 1.9608121299135954e-05, "loss": 1.3524, "step": 2898 }, { "epoch": 0.12, "grad_norm": 4.996100142190463, "learning_rate": 1.9607759583438357e-05, "loss": 1.2188, "step": 2899 }, { "epoch": 0.12, "grad_norm": 6.38763252106279, "learning_rate": 1.960739770422002e-05, "loss": 1.2908, "step": 2900 }, { "epoch": 0.12, "grad_norm": 7.0256171393472675, "learning_rate": 1.9607035661487107e-05, "loss": 1.3418, "step": 2901 }, { "epoch": 0.12, "grad_norm": 7.807626817356524, "learning_rate": 1.9606673455245778e-05, "loss": 1.2549, "step": 2902 }, { "epoch": 0.12, "grad_norm": 5.736051956146063, "learning_rate": 1.9606311085502198e-05, "loss": 1.3496, "step": 2903 }, { "epoch": 0.12, "grad_norm": 7.5476951421386325, "learning_rate": 1.960594855226253e-05, "loss": 1.4083, "step": 2904 }, { "epoch": 0.12, "grad_norm": 4.510502432609987, "learning_rate": 1.9605585855532953e-05, "loss": 1.1249, "step": 2905 }, { "epoch": 0.12, "grad_norm": 8.55168109731261, "learning_rate": 1.960522299531963e-05, "loss": 1.2036, "step": 2906 }, { "epoch": 0.12, "grad_norm": 6.544873416622423, "learning_rate": 1.9604859971628743e-05, "loss": 1.4241, "step": 2907 }, { "epoch": 0.12, "grad_norm": 5.030756357373765, "learning_rate": 1.960449678446647e-05, "loss": 1.3429, "step": 2908 }, { "epoch": 0.12, "grad_norm": 5.788832050066066, "learning_rate": 1.960413343383899e-05, "loss": 1.1346, "step": 2909 }, { "epoch": 0.12, "grad_norm": 4.777799628065474, "learning_rate": 1.9603769919752493e-05, "loss": 1.1554, "step": 2910 }, { "epoch": 0.12, "grad_norm": 6.969208262810722, "learning_rate": 1.9603406242213156e-05, "loss": 1.3351, "step": 2911 }, { "epoch": 0.12, "grad_norm": 6.935278337666744, "learning_rate": 1.9603042401227178e-05, "loss": 1.4676, "step": 2912 }, { "epoch": 0.12, "grad_norm": 6.891660418861789, "learning_rate": 1.9602678396800744e-05, "loss": 1.1546, "step": 2913 }, { "epoch": 0.12, "grad_norm": 5.804670352643215, "learning_rate": 1.9602314228940055e-05, "loss": 1.2661, "step": 2914 }, { "epoch": 0.12, "grad_norm": 7.365024282921082, "learning_rate": 1.9601949897651308e-05, "loss": 1.3712, "step": 2915 }, { "epoch": 0.12, "grad_norm": 8.467231080517495, "learning_rate": 1.96015854029407e-05, "loss": 1.2419, "step": 2916 }, { "epoch": 0.12, "grad_norm": 7.606533187938361, "learning_rate": 1.960122074481444e-05, "loss": 1.4522, "step": 2917 }, { "epoch": 0.12, "grad_norm": 4.894385989418491, "learning_rate": 1.960085592327873e-05, "loss": 1.1086, "step": 2918 }, { "epoch": 0.12, "grad_norm": 4.733907321614068, "learning_rate": 1.9600490938339778e-05, "loss": 1.2835, "step": 2919 }, { "epoch": 0.12, "grad_norm": 5.664715080744172, "learning_rate": 1.9600125790003802e-05, "loss": 1.1556, "step": 2920 }, { "epoch": 0.12, "grad_norm": 6.328742808705013, "learning_rate": 1.9599760478277014e-05, "loss": 1.5612, "step": 2921 }, { "epoch": 0.12, "grad_norm": 7.004330361434522, "learning_rate": 1.959939500316563e-05, "loss": 1.2427, "step": 2922 }, { "epoch": 0.12, "grad_norm": 7.083135901483623, "learning_rate": 1.959902936467587e-05, "loss": 1.5178, "step": 2923 }, { "epoch": 0.12, "grad_norm": 5.745019161552831, "learning_rate": 1.9598663562813957e-05, "loss": 1.1685, "step": 2924 }, { "epoch": 0.12, "grad_norm": 5.892650304796013, "learning_rate": 1.959829759758612e-05, "loss": 1.3835, "step": 2925 }, { "epoch": 0.12, "grad_norm": 5.372157940105152, "learning_rate": 1.959793146899858e-05, "loss": 1.3857, "step": 2926 }, { "epoch": 0.12, "grad_norm": 5.591501721097316, "learning_rate": 1.9597565177057582e-05, "loss": 1.2305, "step": 2927 }, { "epoch": 0.12, "grad_norm": 4.615674859692715, "learning_rate": 1.9597198721769345e-05, "loss": 1.1188, "step": 2928 }, { "epoch": 0.12, "grad_norm": 4.89080026928698, "learning_rate": 1.9596832103140118e-05, "loss": 1.1787, "step": 2929 }, { "epoch": 0.12, "grad_norm": 6.58841971043338, "learning_rate": 1.9596465321176136e-05, "loss": 1.4848, "step": 2930 }, { "epoch": 0.12, "grad_norm": 7.148700622030725, "learning_rate": 1.9596098375883635e-05, "loss": 1.188, "step": 2931 }, { "epoch": 0.12, "grad_norm": 6.153032019131572, "learning_rate": 1.9595731267268874e-05, "loss": 1.5297, "step": 2932 }, { "epoch": 0.12, "grad_norm": 5.863641315739823, "learning_rate": 1.959536399533809e-05, "loss": 1.3637, "step": 2933 }, { "epoch": 0.12, "grad_norm": 5.283600782479322, "learning_rate": 1.9594996560097538e-05, "loss": 1.2388, "step": 2934 }, { "epoch": 0.12, "grad_norm": 7.352890084338497, "learning_rate": 1.959462896155347e-05, "loss": 1.5276, "step": 2935 }, { "epoch": 0.12, "grad_norm": 8.811615391789324, "learning_rate": 1.9594261199712147e-05, "loss": 1.4688, "step": 2936 }, { "epoch": 0.12, "grad_norm": 5.578410856272615, "learning_rate": 1.959389327457982e-05, "loss": 1.0948, "step": 2937 }, { "epoch": 0.12, "grad_norm": 6.313321012670502, "learning_rate": 1.9593525186162758e-05, "loss": 1.3573, "step": 2938 }, { "epoch": 0.12, "grad_norm": 6.589468440784611, "learning_rate": 1.9593156934467222e-05, "loss": 1.3421, "step": 2939 }, { "epoch": 0.12, "grad_norm": 7.910560080529263, "learning_rate": 1.959278851949948e-05, "loss": 1.2332, "step": 2940 }, { "epoch": 0.12, "grad_norm": 6.876533685395539, "learning_rate": 1.9592419941265804e-05, "loss": 1.0218, "step": 2941 }, { "epoch": 0.12, "grad_norm": 8.39355662596893, "learning_rate": 1.9592051199772467e-05, "loss": 1.6571, "step": 2942 }, { "epoch": 0.12, "grad_norm": 7.026297884341463, "learning_rate": 1.959168229502574e-05, "loss": 1.2139, "step": 2943 }, { "epoch": 0.12, "grad_norm": 5.812318755952964, "learning_rate": 1.9591313227031913e-05, "loss": 1.3714, "step": 2944 }, { "epoch": 0.12, "grad_norm": 7.210162094672603, "learning_rate": 1.9590943995797252e-05, "loss": 1.5276, "step": 2945 }, { "epoch": 0.12, "grad_norm": 7.187543343787898, "learning_rate": 1.9590574601328056e-05, "loss": 1.3987, "step": 2946 }, { "epoch": 0.12, "grad_norm": 7.271347237694114, "learning_rate": 1.9590205043630604e-05, "loss": 1.2376, "step": 2947 }, { "epoch": 0.12, "grad_norm": 5.898819521750856, "learning_rate": 1.9589835322711183e-05, "loss": 1.5076, "step": 2948 }, { "epoch": 0.12, "grad_norm": 6.733115833031504, "learning_rate": 1.9589465438576093e-05, "loss": 1.08, "step": 2949 }, { "epoch": 0.12, "grad_norm": 8.760782388702946, "learning_rate": 1.9589095391231625e-05, "loss": 1.4793, "step": 2950 }, { "epoch": 0.12, "grad_norm": 5.416027700542925, "learning_rate": 1.9588725180684077e-05, "loss": 1.3132, "step": 2951 }, { "epoch": 0.12, "grad_norm": 3.5557127270216236, "learning_rate": 1.9588354806939752e-05, "loss": 1.1222, "step": 2952 }, { "epoch": 0.12, "grad_norm": 5.04421466213256, "learning_rate": 1.958798427000495e-05, "loss": 1.0641, "step": 2953 }, { "epoch": 0.12, "grad_norm": 5.094901227752529, "learning_rate": 1.9587613569885984e-05, "loss": 1.3609, "step": 2954 }, { "epoch": 0.12, "grad_norm": 5.322958656263504, "learning_rate": 1.9587242706589155e-05, "loss": 1.4906, "step": 2955 }, { "epoch": 0.12, "grad_norm": 4.343438138792018, "learning_rate": 1.958687168012078e-05, "loss": 1.0237, "step": 2956 }, { "epoch": 0.12, "grad_norm": 7.626874726987884, "learning_rate": 1.9586500490487174e-05, "loss": 1.7888, "step": 2957 }, { "epoch": 0.12, "grad_norm": 6.044770377191504, "learning_rate": 1.9586129137694653e-05, "loss": 1.6433, "step": 2958 }, { "epoch": 0.12, "grad_norm": 6.902634948759568, "learning_rate": 1.9585757621749536e-05, "loss": 1.315, "step": 2959 }, { "epoch": 0.12, "grad_norm": 6.059915875074615, "learning_rate": 1.958538594265815e-05, "loss": 1.0832, "step": 2960 }, { "epoch": 0.12, "grad_norm": 4.640305931146878, "learning_rate": 1.9585014100426813e-05, "loss": 1.3399, "step": 2961 }, { "epoch": 0.12, "grad_norm": 5.540717001933553, "learning_rate": 1.958464209506186e-05, "loss": 1.3087, "step": 2962 }, { "epoch": 0.12, "grad_norm": 8.386719805919371, "learning_rate": 1.9584269926569626e-05, "loss": 1.3857, "step": 2963 }, { "epoch": 0.12, "grad_norm": 5.231246768925852, "learning_rate": 1.958389759495644e-05, "loss": 1.3003, "step": 2964 }, { "epoch": 0.12, "grad_norm": 3.4351415414924475, "learning_rate": 1.9583525100228638e-05, "loss": 1.0146, "step": 2965 }, { "epoch": 0.12, "grad_norm": 4.524253826156421, "learning_rate": 1.958315244239256e-05, "loss": 1.2228, "step": 2966 }, { "epoch": 0.12, "grad_norm": 6.365544611681355, "learning_rate": 1.958277962145455e-05, "loss": 1.4578, "step": 2967 }, { "epoch": 0.12, "grad_norm": 5.237198047052827, "learning_rate": 1.9582406637420954e-05, "loss": 1.1376, "step": 2968 }, { "epoch": 0.12, "grad_norm": 5.6663691464538575, "learning_rate": 1.958203349029812e-05, "loss": 1.2138, "step": 2969 }, { "epoch": 0.12, "grad_norm": 6.130814467633388, "learning_rate": 1.9581660180092394e-05, "loss": 1.293, "step": 2970 }, { "epoch": 0.12, "grad_norm": 3.4531606997283935, "learning_rate": 1.958128670681014e-05, "loss": 0.9759, "step": 2971 }, { "epoch": 0.12, "grad_norm": 5.8434341914874315, "learning_rate": 1.9580913070457705e-05, "loss": 1.5946, "step": 2972 }, { "epoch": 0.12, "grad_norm": 5.871851083499828, "learning_rate": 1.958053927104145e-05, "loss": 1.1846, "step": 2973 }, { "epoch": 0.12, "grad_norm": 5.3896781752599425, "learning_rate": 1.9580165308567737e-05, "loss": 1.3184, "step": 2974 }, { "epoch": 0.12, "grad_norm": 6.09958137414239, "learning_rate": 1.9579791183042933e-05, "loss": 1.2932, "step": 2975 }, { "epoch": 0.12, "grad_norm": 5.693450824601202, "learning_rate": 1.9579416894473407e-05, "loss": 1.2035, "step": 2976 }, { "epoch": 0.12, "grad_norm": 5.571185821447305, "learning_rate": 1.957904244286552e-05, "loss": 1.119, "step": 2977 }, { "epoch": 0.12, "grad_norm": 8.232555225273163, "learning_rate": 1.9578667828225657e-05, "loss": 1.5448, "step": 2978 }, { "epoch": 0.12, "grad_norm": 5.642243778391934, "learning_rate": 1.9578293050560185e-05, "loss": 1.3442, "step": 2979 }, { "epoch": 0.12, "grad_norm": 4.875077957848286, "learning_rate": 1.957791810987549e-05, "loss": 1.1479, "step": 2980 }, { "epoch": 0.12, "grad_norm": 7.327984929629464, "learning_rate": 1.9577543006177948e-05, "loss": 1.5649, "step": 2981 }, { "epoch": 0.12, "grad_norm": 7.2493053310253535, "learning_rate": 1.9577167739473945e-05, "loss": 1.0351, "step": 2982 }, { "epoch": 0.12, "grad_norm": 5.690070603375901, "learning_rate": 1.9576792309769863e-05, "loss": 1.5636, "step": 2983 }, { "epoch": 0.12, "grad_norm": 4.472415118386781, "learning_rate": 1.95764167170721e-05, "loss": 1.2785, "step": 2984 }, { "epoch": 0.12, "grad_norm": 4.7858990556448, "learning_rate": 1.9576040961387043e-05, "loss": 1.194, "step": 2985 }, { "epoch": 0.12, "grad_norm": 5.788544842937453, "learning_rate": 1.957566504272109e-05, "loss": 1.0681, "step": 2986 }, { "epoch": 0.12, "grad_norm": 4.593798199226103, "learning_rate": 1.9575288961080637e-05, "loss": 1.1911, "step": 2987 }, { "epoch": 0.12, "grad_norm": 6.671293214817249, "learning_rate": 1.9574912716472086e-05, "loss": 1.2671, "step": 2988 }, { "epoch": 0.12, "grad_norm": 6.417457574391445, "learning_rate": 1.957453630890184e-05, "loss": 1.5844, "step": 2989 }, { "epoch": 0.12, "grad_norm": 8.007017462945115, "learning_rate": 1.9574159738376304e-05, "loss": 1.3579, "step": 2990 }, { "epoch": 0.12, "grad_norm": 6.57823210297244, "learning_rate": 1.957378300490189e-05, "loss": 0.9591, "step": 2991 }, { "epoch": 0.12, "grad_norm": 7.3944908180625015, "learning_rate": 1.9573406108485006e-05, "loss": 1.4853, "step": 2992 }, { "epoch": 0.12, "grad_norm": 4.631522651851654, "learning_rate": 1.957302904913207e-05, "loss": 1.1462, "step": 2993 }, { "epoch": 0.12, "grad_norm": 7.088771004797125, "learning_rate": 1.95726518268495e-05, "loss": 1.5986, "step": 2994 }, { "epoch": 0.12, "grad_norm": 7.3358824000871286, "learning_rate": 1.9572274441643715e-05, "loss": 1.3787, "step": 2995 }, { "epoch": 0.12, "grad_norm": 6.8932885477376535, "learning_rate": 1.9571896893521135e-05, "loss": 1.6467, "step": 2996 }, { "epoch": 0.12, "grad_norm": 5.234227830629614, "learning_rate": 1.957151918248819e-05, "loss": 1.1829, "step": 2997 }, { "epoch": 0.12, "grad_norm": 6.908963945306555, "learning_rate": 1.9571141308551306e-05, "loss": 1.5723, "step": 2998 }, { "epoch": 0.12, "grad_norm": 5.791431990796311, "learning_rate": 1.957076327171692e-05, "loss": 1.1533, "step": 2999 }, { "epoch": 0.12, "grad_norm": 6.0185539576857785, "learning_rate": 1.9570385071991455e-05, "loss": 1.3065, "step": 3000 }, { "epoch": 0.12, "grad_norm": 7.530716919064747, "learning_rate": 1.9570006709381352e-05, "loss": 1.6475, "step": 3001 }, { "epoch": 0.12, "grad_norm": 7.266439417205829, "learning_rate": 1.9569628183893058e-05, "loss": 1.5153, "step": 3002 }, { "epoch": 0.12, "grad_norm": 5.4523597666476675, "learning_rate": 1.956924949553301e-05, "loss": 1.119, "step": 3003 }, { "epoch": 0.12, "grad_norm": 5.670794335594339, "learning_rate": 1.956887064430765e-05, "loss": 1.4032, "step": 3004 }, { "epoch": 0.12, "grad_norm": 6.752957253056093, "learning_rate": 1.956849163022343e-05, "loss": 1.1935, "step": 3005 }, { "epoch": 0.12, "grad_norm": 4.085954711966279, "learning_rate": 1.95681124532868e-05, "loss": 0.9464, "step": 3006 }, { "epoch": 0.12, "grad_norm": 5.384637233585121, "learning_rate": 1.9567733113504213e-05, "loss": 1.1936, "step": 3007 }, { "epoch": 0.12, "grad_norm": 6.646012852902901, "learning_rate": 1.9567353610882126e-05, "loss": 1.0686, "step": 3008 }, { "epoch": 0.12, "grad_norm": 7.194704576514314, "learning_rate": 1.9566973945427e-05, "loss": 1.7048, "step": 3009 }, { "epoch": 0.12, "grad_norm": 6.976242487833728, "learning_rate": 1.956659411714529e-05, "loss": 1.4069, "step": 3010 }, { "epoch": 0.12, "grad_norm": 8.577012794802915, "learning_rate": 1.9566214126043467e-05, "loss": 1.8297, "step": 3011 }, { "epoch": 0.12, "grad_norm": 6.446067739588253, "learning_rate": 1.9565833972127996e-05, "loss": 1.0778, "step": 3012 }, { "epoch": 0.12, "grad_norm": 5.741439398314465, "learning_rate": 1.9565453655405342e-05, "loss": 1.2241, "step": 3013 }, { "epoch": 0.12, "grad_norm": 5.3475571467680005, "learning_rate": 1.9565073175881986e-05, "loss": 1.2833, "step": 3014 }, { "epoch": 0.12, "grad_norm": 4.955427454994196, "learning_rate": 1.9564692533564404e-05, "loss": 1.2292, "step": 3015 }, { "epoch": 0.12, "grad_norm": 6.64942812661909, "learning_rate": 1.956431172845907e-05, "loss": 1.3799, "step": 3016 }, { "epoch": 0.12, "grad_norm": 8.163095006138523, "learning_rate": 1.9563930760572462e-05, "loss": 1.5989, "step": 3017 }, { "epoch": 0.12, "grad_norm": 7.540869041804654, "learning_rate": 1.956354962991107e-05, "loss": 1.4673, "step": 3018 }, { "epoch": 0.12, "grad_norm": 8.010788689807262, "learning_rate": 1.9563168336481384e-05, "loss": 1.3297, "step": 3019 }, { "epoch": 0.12, "grad_norm": 5.876265498028198, "learning_rate": 1.9562786880289883e-05, "loss": 1.4176, "step": 3020 }, { "epoch": 0.12, "grad_norm": 5.575270548171447, "learning_rate": 1.9562405261343066e-05, "loss": 1.514, "step": 3021 }, { "epoch": 0.12, "grad_norm": 5.536256304518745, "learning_rate": 1.956202347964743e-05, "loss": 1.2446, "step": 3022 }, { "epoch": 0.12, "grad_norm": 5.428698878852939, "learning_rate": 1.956164153520946e-05, "loss": 1.416, "step": 3023 }, { "epoch": 0.12, "grad_norm": 6.041139765511129, "learning_rate": 1.9561259428035675e-05, "loss": 1.4111, "step": 3024 }, { "epoch": 0.12, "grad_norm": 5.461622632040706, "learning_rate": 1.9560877158132565e-05, "loss": 1.03, "step": 3025 }, { "epoch": 0.12, "grad_norm": 6.186863638675823, "learning_rate": 1.9560494725506644e-05, "loss": 1.4132, "step": 3026 }, { "epoch": 0.12, "grad_norm": 5.82533876966083, "learning_rate": 1.9560112130164412e-05, "loss": 1.4313, "step": 3027 }, { "epoch": 0.12, "grad_norm": 5.10960376436543, "learning_rate": 1.955972937211239e-05, "loss": 1.3614, "step": 3028 }, { "epoch": 0.12, "grad_norm": 8.41487829928482, "learning_rate": 1.9559346451357085e-05, "loss": 1.5139, "step": 3029 }, { "epoch": 0.12, "grad_norm": 4.3565909930691324, "learning_rate": 1.955896336790502e-05, "loss": 1.1268, "step": 3030 }, { "epoch": 0.12, "grad_norm": 6.005849207104483, "learning_rate": 1.9558580121762712e-05, "loss": 1.108, "step": 3031 }, { "epoch": 0.12, "grad_norm": 6.296166926016051, "learning_rate": 1.9558196712936682e-05, "loss": 1.3326, "step": 3032 }, { "epoch": 0.12, "grad_norm": 4.674354796676517, "learning_rate": 1.9557813141433462e-05, "loss": 1.2763, "step": 3033 }, { "epoch": 0.12, "grad_norm": 7.994637448520102, "learning_rate": 1.9557429407259573e-05, "loss": 1.7612, "step": 3034 }, { "epoch": 0.12, "grad_norm": 6.122383604544614, "learning_rate": 1.955704551042155e-05, "loss": 1.3702, "step": 3035 }, { "epoch": 0.12, "grad_norm": 6.4407046974262006, "learning_rate": 1.9556661450925923e-05, "loss": 1.727, "step": 3036 }, { "epoch": 0.12, "grad_norm": 5.7153045348909615, "learning_rate": 1.9556277228779237e-05, "loss": 1.3907, "step": 3037 }, { "epoch": 0.12, "grad_norm": 6.944929222233087, "learning_rate": 1.955589284398802e-05, "loss": 1.1955, "step": 3038 }, { "epoch": 0.12, "grad_norm": 5.311154124956176, "learning_rate": 1.955550829655882e-05, "loss": 1.4433, "step": 3039 }, { "epoch": 0.12, "grad_norm": 6.471146382141527, "learning_rate": 1.9555123586498187e-05, "loss": 1.4908, "step": 3040 }, { "epoch": 0.12, "grad_norm": 6.865120433965442, "learning_rate": 1.9554738713812657e-05, "loss": 1.0397, "step": 3041 }, { "epoch": 0.12, "grad_norm": 5.352302617216924, "learning_rate": 1.955435367850879e-05, "loss": 1.4875, "step": 3042 }, { "epoch": 0.12, "grad_norm": 9.67850082155871, "learning_rate": 1.9553968480593133e-05, "loss": 1.4699, "step": 3043 }, { "epoch": 0.12, "grad_norm": 5.225597444846306, "learning_rate": 1.955358312007225e-05, "loss": 1.2078, "step": 3044 }, { "epoch": 0.12, "grad_norm": 7.447491290475621, "learning_rate": 1.955319759695269e-05, "loss": 1.3521, "step": 3045 }, { "epoch": 0.12, "grad_norm": 7.273068525614477, "learning_rate": 1.955281191124102e-05, "loss": 1.3991, "step": 3046 }, { "epoch": 0.12, "grad_norm": 11.892519769589509, "learning_rate": 1.95524260629438e-05, "loss": 1.3323, "step": 3047 }, { "epoch": 0.12, "grad_norm": 6.6377192759838755, "learning_rate": 1.9552040052067604e-05, "loss": 1.4178, "step": 3048 }, { "epoch": 0.12, "grad_norm": 12.453432864443718, "learning_rate": 1.9551653878618998e-05, "loss": 1.5771, "step": 3049 }, { "epoch": 0.12, "grad_norm": 8.555419331136935, "learning_rate": 1.955126754260455e-05, "loss": 1.063, "step": 3050 }, { "epoch": 0.12, "grad_norm": 8.391439764954109, "learning_rate": 1.9550881044030844e-05, "loss": 1.2256, "step": 3051 }, { "epoch": 0.12, "grad_norm": 10.717759878989044, "learning_rate": 1.955049438290445e-05, "loss": 1.2581, "step": 3052 }, { "epoch": 0.12, "grad_norm": 7.567993637139406, "learning_rate": 1.955010755923196e-05, "loss": 1.6009, "step": 3053 }, { "epoch": 0.12, "grad_norm": 8.191616474726164, "learning_rate": 1.9549720573019945e-05, "loss": 1.343, "step": 3054 }, { "epoch": 0.12, "grad_norm": 6.851952047077054, "learning_rate": 1.9549333424274998e-05, "loss": 1.4578, "step": 3055 }, { "epoch": 0.12, "grad_norm": 6.236155484130217, "learning_rate": 1.9548946113003703e-05, "loss": 0.8406, "step": 3056 }, { "epoch": 0.12, "grad_norm": 8.605833266726817, "learning_rate": 1.954855863921266e-05, "loss": 1.7349, "step": 3057 }, { "epoch": 0.12, "grad_norm": 5.341293251682161, "learning_rate": 1.9548171002908458e-05, "loss": 1.2011, "step": 3058 }, { "epoch": 0.12, "grad_norm": 6.621043118896074, "learning_rate": 1.9547783204097694e-05, "loss": 1.0242, "step": 3059 }, { "epoch": 0.12, "grad_norm": 5.987528890735899, "learning_rate": 1.9547395242786973e-05, "loss": 1.2363, "step": 3060 }, { "epoch": 0.12, "grad_norm": 6.856406915139954, "learning_rate": 1.954700711898289e-05, "loss": 1.3745, "step": 3061 }, { "epoch": 0.12, "grad_norm": 11.038433718282434, "learning_rate": 1.9546618832692062e-05, "loss": 1.6385, "step": 3062 }, { "epoch": 0.12, "grad_norm": 5.881645646146623, "learning_rate": 1.954623038392109e-05, "loss": 1.4833, "step": 3063 }, { "epoch": 0.12, "grad_norm": 4.525638146966041, "learning_rate": 1.954584177267658e-05, "loss": 1.1032, "step": 3064 }, { "epoch": 0.12, "grad_norm": 5.575045041497217, "learning_rate": 1.954545299896516e-05, "loss": 1.178, "step": 3065 }, { "epoch": 0.12, "grad_norm": 5.807885969302533, "learning_rate": 1.9545064062793434e-05, "loss": 1.3116, "step": 3066 }, { "epoch": 0.12, "grad_norm": 6.393033748070535, "learning_rate": 1.954467496416803e-05, "loss": 1.3919, "step": 3067 }, { "epoch": 0.12, "grad_norm": 6.093672654520014, "learning_rate": 1.9544285703095565e-05, "loss": 1.4815, "step": 3068 }, { "epoch": 0.12, "grad_norm": 5.779054658204869, "learning_rate": 1.954389627958267e-05, "loss": 1.4736, "step": 3069 }, { "epoch": 0.12, "grad_norm": 7.359269409318602, "learning_rate": 1.9543506693635965e-05, "loss": 1.1374, "step": 3070 }, { "epoch": 0.12, "grad_norm": 5.058726471822921, "learning_rate": 1.9543116945262087e-05, "loss": 1.1055, "step": 3071 }, { "epoch": 0.12, "grad_norm": 5.465982245119378, "learning_rate": 1.9542727034467667e-05, "loss": 1.0212, "step": 3072 }, { "epoch": 0.12, "grad_norm": 8.707224513120302, "learning_rate": 1.9542336961259338e-05, "loss": 1.5051, "step": 3073 }, { "epoch": 0.12, "grad_norm": 6.97382912673923, "learning_rate": 1.9541946725643747e-05, "loss": 1.3319, "step": 3074 }, { "epoch": 0.12, "grad_norm": 4.948759052204712, "learning_rate": 1.9541556327627525e-05, "loss": 1.0344, "step": 3075 }, { "epoch": 0.12, "grad_norm": 5.407563038946613, "learning_rate": 1.9541165767217327e-05, "loss": 1.2038, "step": 3076 }, { "epoch": 0.12, "grad_norm": 6.394440742720365, "learning_rate": 1.9540775044419793e-05, "loss": 1.2591, "step": 3077 }, { "epoch": 0.12, "grad_norm": 5.8547254876401995, "learning_rate": 1.9540384159241577e-05, "loss": 1.1788, "step": 3078 }, { "epoch": 0.12, "grad_norm": 5.782167733510908, "learning_rate": 1.953999311168933e-05, "loss": 1.3366, "step": 3079 }, { "epoch": 0.12, "grad_norm": 6.721799313264839, "learning_rate": 1.9539601901769708e-05, "loss": 1.1487, "step": 3080 }, { "epoch": 0.12, "grad_norm": 5.662441523037463, "learning_rate": 1.9539210529489367e-05, "loss": 1.2138, "step": 3081 }, { "epoch": 0.12, "grad_norm": 6.754428460318666, "learning_rate": 1.953881899485497e-05, "loss": 1.4099, "step": 3082 }, { "epoch": 0.12, "grad_norm": 5.3788485642736, "learning_rate": 1.9538427297873183e-05, "loss": 1.4588, "step": 3083 }, { "epoch": 0.12, "grad_norm": 6.655618133586845, "learning_rate": 1.9538035438550665e-05, "loss": 1.1919, "step": 3084 }, { "epoch": 0.12, "grad_norm": 7.641410299590693, "learning_rate": 1.9537643416894097e-05, "loss": 1.2265, "step": 3085 }, { "epoch": 0.12, "grad_norm": 5.844856901940121, "learning_rate": 1.9537251232910143e-05, "loss": 1.4873, "step": 3086 }, { "epoch": 0.12, "grad_norm": 7.396194844009791, "learning_rate": 1.9536858886605477e-05, "loss": 1.0394, "step": 3087 }, { "epoch": 0.12, "grad_norm": 7.511212779095034, "learning_rate": 1.953646637798678e-05, "loss": 1.4258, "step": 3088 }, { "epoch": 0.12, "grad_norm": 6.168810476323793, "learning_rate": 1.9536073707060733e-05, "loss": 1.5882, "step": 3089 }, { "epoch": 0.12, "grad_norm": 6.322385518885021, "learning_rate": 1.9535680873834017e-05, "loss": 1.681, "step": 3090 }, { "epoch": 0.12, "grad_norm": 6.533059218061694, "learning_rate": 1.9535287878313315e-05, "loss": 1.3936, "step": 3091 }, { "epoch": 0.12, "grad_norm": 5.758483617366037, "learning_rate": 1.9534894720505326e-05, "loss": 1.0865, "step": 3092 }, { "epoch": 0.12, "grad_norm": 8.109722057695535, "learning_rate": 1.9534501400416727e-05, "loss": 1.2768, "step": 3093 }, { "epoch": 0.12, "grad_norm": 6.087739345191718, "learning_rate": 1.9534107918054223e-05, "loss": 1.4132, "step": 3094 }, { "epoch": 0.12, "grad_norm": 4.968139949887867, "learning_rate": 1.9533714273424506e-05, "loss": 1.4629, "step": 3095 }, { "epoch": 0.12, "grad_norm": 6.454359057454755, "learning_rate": 1.953332046653428e-05, "loss": 1.352, "step": 3096 }, { "epoch": 0.12, "grad_norm": 5.432247309200816, "learning_rate": 1.9532926497390244e-05, "loss": 1.2346, "step": 3097 }, { "epoch": 0.12, "grad_norm": 4.710200808342539, "learning_rate": 1.9532532365999102e-05, "loss": 1.3329, "step": 3098 }, { "epoch": 0.12, "grad_norm": 4.751932441236711, "learning_rate": 1.9532138072367567e-05, "loss": 1.2681, "step": 3099 }, { "epoch": 0.12, "grad_norm": 6.448049972556608, "learning_rate": 1.9531743616502345e-05, "loss": 1.6228, "step": 3100 }, { "epoch": 0.12, "grad_norm": 14.280045510637844, "learning_rate": 1.953134899841015e-05, "loss": 1.6846, "step": 3101 }, { "epoch": 0.12, "grad_norm": 4.6073916841500395, "learning_rate": 1.9530954218097703e-05, "loss": 1.2591, "step": 3102 }, { "epoch": 0.12, "grad_norm": 3.7326692307282205, "learning_rate": 1.9530559275571714e-05, "loss": 1.1636, "step": 3103 }, { "epoch": 0.13, "grad_norm": 6.246469503453881, "learning_rate": 1.9530164170838915e-05, "loss": 1.2267, "step": 3104 }, { "epoch": 0.13, "grad_norm": 6.696156586714013, "learning_rate": 1.9529768903906022e-05, "loss": 1.1701, "step": 3105 }, { "epoch": 0.13, "grad_norm": 6.759558738368552, "learning_rate": 1.952937347477977e-05, "loss": 1.2415, "step": 3106 }, { "epoch": 0.13, "grad_norm": 5.183835929593885, "learning_rate": 1.9528977883466883e-05, "loss": 1.3643, "step": 3107 }, { "epoch": 0.13, "grad_norm": 6.390243423847495, "learning_rate": 1.9528582129974098e-05, "loss": 1.3506, "step": 3108 }, { "epoch": 0.13, "grad_norm": 5.33701614284104, "learning_rate": 1.9528186214308144e-05, "loss": 1.152, "step": 3109 }, { "epoch": 0.13, "grad_norm": 8.417570859709311, "learning_rate": 1.9527790136475767e-05, "loss": 1.5009, "step": 3110 }, { "epoch": 0.13, "grad_norm": 5.139641464746353, "learning_rate": 1.9527393896483703e-05, "loss": 1.264, "step": 3111 }, { "epoch": 0.13, "grad_norm": 6.61139589048605, "learning_rate": 1.95269974943387e-05, "loss": 1.4734, "step": 3112 }, { "epoch": 0.13, "grad_norm": 4.23824752848678, "learning_rate": 1.9526600930047503e-05, "loss": 0.9956, "step": 3113 }, { "epoch": 0.13, "grad_norm": 6.807227945536994, "learning_rate": 1.952620420361686e-05, "loss": 1.3152, "step": 3114 }, { "epoch": 0.13, "grad_norm": 9.106900305471056, "learning_rate": 1.9525807315053523e-05, "loss": 1.2571, "step": 3115 }, { "epoch": 0.13, "grad_norm": 7.388020990542762, "learning_rate": 1.952541026436425e-05, "loss": 1.3577, "step": 3116 }, { "epoch": 0.13, "grad_norm": 5.213904734356749, "learning_rate": 1.9525013051555794e-05, "loss": 1.222, "step": 3117 }, { "epoch": 0.13, "grad_norm": 6.489748683967728, "learning_rate": 1.9524615676634923e-05, "loss": 1.2105, "step": 3118 }, { "epoch": 0.13, "grad_norm": 7.302755564249802, "learning_rate": 1.952421813960839e-05, "loss": 1.1385, "step": 3119 }, { "epoch": 0.13, "grad_norm": 11.1986965276448, "learning_rate": 1.952382044048297e-05, "loss": 1.6333, "step": 3120 }, { "epoch": 0.13, "grad_norm": 4.746355989314025, "learning_rate": 1.9523422579265426e-05, "loss": 1.1581, "step": 3121 }, { "epoch": 0.13, "grad_norm": 7.056781487093378, "learning_rate": 1.952302455596253e-05, "loss": 1.4657, "step": 3122 }, { "epoch": 0.13, "grad_norm": 7.094425361183786, "learning_rate": 1.9522626370581058e-05, "loss": 1.2324, "step": 3123 }, { "epoch": 0.13, "grad_norm": 9.347728731077913, "learning_rate": 1.9522228023127788e-05, "loss": 1.2746, "step": 3124 }, { "epoch": 0.13, "grad_norm": 6.4641902310283115, "learning_rate": 1.95218295136095e-05, "loss": 1.2687, "step": 3125 }, { "epoch": 0.13, "grad_norm": 7.461242472567538, "learning_rate": 1.952143084203297e-05, "loss": 1.6961, "step": 3126 }, { "epoch": 0.13, "grad_norm": 6.147368545218562, "learning_rate": 1.9521032008404992e-05, "loss": 1.008, "step": 3127 }, { "epoch": 0.13, "grad_norm": 7.293863915920318, "learning_rate": 1.952063301273235e-05, "loss": 1.3189, "step": 3128 }, { "epoch": 0.13, "grad_norm": 10.07060437331083, "learning_rate": 1.952023385502183e-05, "loss": 1.6048, "step": 3129 }, { "epoch": 0.13, "grad_norm": 5.447779137593787, "learning_rate": 1.9519834535280237e-05, "loss": 1.0686, "step": 3130 }, { "epoch": 0.13, "grad_norm": 6.293321291177359, "learning_rate": 1.9519435053514356e-05, "loss": 1.4016, "step": 3131 }, { "epoch": 0.13, "grad_norm": 6.160184887594016, "learning_rate": 1.9519035409730992e-05, "loss": 1.3989, "step": 3132 }, { "epoch": 0.13, "grad_norm": 7.029311753194242, "learning_rate": 1.9518635603936943e-05, "loss": 1.3441, "step": 3133 }, { "epoch": 0.13, "grad_norm": 7.200473158804808, "learning_rate": 1.951823563613902e-05, "loss": 1.2821, "step": 3134 }, { "epoch": 0.13, "grad_norm": 7.589873426352561, "learning_rate": 1.9517835506344025e-05, "loss": 1.5579, "step": 3135 }, { "epoch": 0.13, "grad_norm": 9.674041607977303, "learning_rate": 1.9517435214558768e-05, "loss": 1.1926, "step": 3136 }, { "epoch": 0.13, "grad_norm": 4.073125887395572, "learning_rate": 1.9517034760790064e-05, "loss": 1.1299, "step": 3137 }, { "epoch": 0.13, "grad_norm": 6.596435977833834, "learning_rate": 1.9516634145044728e-05, "loss": 1.3055, "step": 3138 }, { "epoch": 0.13, "grad_norm": 5.233413033341089, "learning_rate": 1.9516233367329578e-05, "loss": 1.303, "step": 3139 }, { "epoch": 0.13, "grad_norm": 5.823768848641338, "learning_rate": 1.9515832427651436e-05, "loss": 1.3427, "step": 3140 }, { "epoch": 0.13, "grad_norm": 6.870714170790582, "learning_rate": 1.9515431326017124e-05, "loss": 1.4737, "step": 3141 }, { "epoch": 0.13, "grad_norm": 6.839561530875974, "learning_rate": 1.951503006243347e-05, "loss": 1.1626, "step": 3142 }, { "epoch": 0.13, "grad_norm": 4.634065462722403, "learning_rate": 1.9514628636907303e-05, "loss": 1.1391, "step": 3143 }, { "epoch": 0.13, "grad_norm": 6.0998195375773525, "learning_rate": 1.9514227049445454e-05, "loss": 1.8265, "step": 3144 }, { "epoch": 0.13, "grad_norm": 8.525217785291463, "learning_rate": 1.951382530005476e-05, "loss": 1.4533, "step": 3145 }, { "epoch": 0.13, "grad_norm": 4.779742753850481, "learning_rate": 1.9513423388742063e-05, "loss": 1.0843, "step": 3146 }, { "epoch": 0.13, "grad_norm": 6.911795214129019, "learning_rate": 1.9513021315514188e-05, "loss": 1.8046, "step": 3147 }, { "epoch": 0.13, "grad_norm": 7.535562190103548, "learning_rate": 1.9512619080377994e-05, "loss": 1.1025, "step": 3148 }, { "epoch": 0.13, "grad_norm": 5.438452427184618, "learning_rate": 1.951221668334032e-05, "loss": 1.4489, "step": 3149 }, { "epoch": 0.13, "grad_norm": 5.826261881253335, "learning_rate": 1.951181412440802e-05, "loss": 1.3695, "step": 3150 }, { "epoch": 0.13, "grad_norm": 9.626958487669006, "learning_rate": 1.9511411403587937e-05, "loss": 1.1484, "step": 3151 }, { "epoch": 0.13, "grad_norm": 7.474240700334718, "learning_rate": 1.9511008520886928e-05, "loss": 1.6691, "step": 3152 }, { "epoch": 0.13, "grad_norm": 4.9715911382301385, "learning_rate": 1.9510605476311854e-05, "loss": 1.129, "step": 3153 }, { "epoch": 0.13, "grad_norm": 9.288235779622477, "learning_rate": 1.9510202269869573e-05, "loss": 1.7531, "step": 3154 }, { "epoch": 0.13, "grad_norm": 6.347282279406751, "learning_rate": 1.9509798901566943e-05, "loss": 1.352, "step": 3155 }, { "epoch": 0.13, "grad_norm": 6.290169032711555, "learning_rate": 1.950939537141083e-05, "loss": 0.9036, "step": 3156 }, { "epoch": 0.13, "grad_norm": 5.119428081977788, "learning_rate": 1.9508991679408113e-05, "loss": 1.1192, "step": 3157 }, { "epoch": 0.13, "grad_norm": 6.832228268612211, "learning_rate": 1.950858782556565e-05, "loss": 1.2977, "step": 3158 }, { "epoch": 0.13, "grad_norm": 5.2643133361941175, "learning_rate": 1.950818380989032e-05, "loss": 1.2638, "step": 3159 }, { "epoch": 0.13, "grad_norm": 7.394257114093725, "learning_rate": 1.9507779632388997e-05, "loss": 1.41, "step": 3160 }, { "epoch": 0.13, "grad_norm": 7.351244974840778, "learning_rate": 1.950737529306856e-05, "loss": 1.479, "step": 3161 }, { "epoch": 0.13, "grad_norm": 7.2873959834810025, "learning_rate": 1.950697079193589e-05, "loss": 1.3747, "step": 3162 }, { "epoch": 0.13, "grad_norm": 6.5091983518889664, "learning_rate": 1.9506566128997877e-05, "loss": 1.4221, "step": 3163 }, { "epoch": 0.13, "grad_norm": 6.114140882061283, "learning_rate": 1.9506161304261404e-05, "loss": 1.334, "step": 3164 }, { "epoch": 0.13, "grad_norm": 5.278889627160714, "learning_rate": 1.9505756317733356e-05, "loss": 1.2872, "step": 3165 }, { "epoch": 0.13, "grad_norm": 9.590371198434953, "learning_rate": 1.9505351169420637e-05, "loss": 1.4958, "step": 3166 }, { "epoch": 0.13, "grad_norm": 6.280514788857349, "learning_rate": 1.9504945859330134e-05, "loss": 1.2974, "step": 3167 }, { "epoch": 0.13, "grad_norm": 5.755227227450106, "learning_rate": 1.950454038746875e-05, "loss": 1.1727, "step": 3168 }, { "epoch": 0.13, "grad_norm": 8.470652611003924, "learning_rate": 1.950413475384338e-05, "loss": 1.2225, "step": 3169 }, { "epoch": 0.13, "grad_norm": 7.980494039062044, "learning_rate": 1.9503728958460932e-05, "loss": 1.4493, "step": 3170 }, { "epoch": 0.13, "grad_norm": 5.497028765723917, "learning_rate": 1.9503323001328313e-05, "loss": 1.4733, "step": 3171 }, { "epoch": 0.13, "grad_norm": 6.438736381098128, "learning_rate": 1.9502916882452433e-05, "loss": 1.4552, "step": 3172 }, { "epoch": 0.13, "grad_norm": 9.014814383555338, "learning_rate": 1.9502510601840198e-05, "loss": 1.5364, "step": 3173 }, { "epoch": 0.13, "grad_norm": 7.11365221813519, "learning_rate": 1.9502104159498532e-05, "loss": 1.4995, "step": 3174 }, { "epoch": 0.13, "grad_norm": 5.448499025644143, "learning_rate": 1.9501697555434342e-05, "loss": 1.4112, "step": 3175 }, { "epoch": 0.13, "grad_norm": 5.556659212265499, "learning_rate": 1.9501290789654558e-05, "loss": 1.4222, "step": 3176 }, { "epoch": 0.13, "grad_norm": 18.044358101204118, "learning_rate": 1.9500883862166097e-05, "loss": 1.36, "step": 3177 }, { "epoch": 0.13, "grad_norm": 5.152249167669299, "learning_rate": 1.9500476772975886e-05, "loss": 1.455, "step": 3178 }, { "epoch": 0.13, "grad_norm": 6.174292309946439, "learning_rate": 1.9500069522090853e-05, "loss": 1.4961, "step": 3179 }, { "epoch": 0.13, "grad_norm": 13.419515867315315, "learning_rate": 1.9499662109517933e-05, "loss": 1.5951, "step": 3180 }, { "epoch": 0.13, "grad_norm": 9.848041107085185, "learning_rate": 1.9499254535264055e-05, "loss": 1.1112, "step": 3181 }, { "epoch": 0.13, "grad_norm": 4.811772059718595, "learning_rate": 1.9498846799336162e-05, "loss": 1.2166, "step": 3182 }, { "epoch": 0.13, "grad_norm": 7.164532820496349, "learning_rate": 1.9498438901741186e-05, "loss": 1.3728, "step": 3183 }, { "epoch": 0.13, "grad_norm": 8.750138822888145, "learning_rate": 1.9498030842486072e-05, "loss": 1.3979, "step": 3184 }, { "epoch": 0.13, "grad_norm": 7.22778534124478, "learning_rate": 1.949762262157777e-05, "loss": 1.1116, "step": 3185 }, { "epoch": 0.13, "grad_norm": 6.297112288792508, "learning_rate": 1.949721423902322e-05, "loss": 1.2778, "step": 3186 }, { "epoch": 0.13, "grad_norm": 5.692693435108846, "learning_rate": 1.9496805694829376e-05, "loss": 1.4646, "step": 3187 }, { "epoch": 0.13, "grad_norm": 6.757566110643624, "learning_rate": 1.9496396989003195e-05, "loss": 1.7272, "step": 3188 }, { "epoch": 0.13, "grad_norm": 5.120495598479416, "learning_rate": 1.9495988121551628e-05, "loss": 1.417, "step": 3189 }, { "epoch": 0.13, "grad_norm": 5.268010284859559, "learning_rate": 1.9495579092481635e-05, "loss": 1.1349, "step": 3190 }, { "epoch": 0.13, "grad_norm": 5.966486651588832, "learning_rate": 1.9495169901800178e-05, "loss": 1.1294, "step": 3191 }, { "epoch": 0.13, "grad_norm": 7.198491498076987, "learning_rate": 1.949476054951422e-05, "loss": 1.6101, "step": 3192 }, { "epoch": 0.13, "grad_norm": 6.974929077504478, "learning_rate": 1.949435103563073e-05, "loss": 1.2375, "step": 3193 }, { "epoch": 0.13, "grad_norm": 4.851274058449346, "learning_rate": 1.9493941360156676e-05, "loss": 0.9031, "step": 3194 }, { "epoch": 0.13, "grad_norm": 9.085427871696805, "learning_rate": 1.949353152309903e-05, "loss": 1.3395, "step": 3195 }, { "epoch": 0.13, "grad_norm": 7.270046507465551, "learning_rate": 1.949312152446477e-05, "loss": 1.4827, "step": 3196 }, { "epoch": 0.13, "grad_norm": 5.79421732536088, "learning_rate": 1.949271136426088e-05, "loss": 1.2993, "step": 3197 }, { "epoch": 0.13, "grad_norm": 4.268235799217651, "learning_rate": 1.9492301042494324e-05, "loss": 1.2778, "step": 3198 }, { "epoch": 0.13, "grad_norm": 5.914332370010181, "learning_rate": 1.9491890559172098e-05, "loss": 1.4354, "step": 3199 }, { "epoch": 0.13, "grad_norm": 6.3276898572050975, "learning_rate": 1.9491479914301184e-05, "loss": 1.3066, "step": 3200 }, { "epoch": 0.13, "grad_norm": 6.109008374133051, "learning_rate": 1.9491069107888578e-05, "loss": 1.2693, "step": 3201 }, { "epoch": 0.13, "grad_norm": 8.047042053394044, "learning_rate": 1.9490658139941262e-05, "loss": 1.5876, "step": 3202 }, { "epoch": 0.13, "grad_norm": 6.365138874510876, "learning_rate": 1.9490247010466234e-05, "loss": 1.2326, "step": 3203 }, { "epoch": 0.13, "grad_norm": 5.532562104234933, "learning_rate": 1.9489835719470494e-05, "loss": 1.1141, "step": 3204 }, { "epoch": 0.13, "grad_norm": 5.788554536756972, "learning_rate": 1.9489424266961037e-05, "loss": 1.1927, "step": 3205 }, { "epoch": 0.13, "grad_norm": 6.611499604994943, "learning_rate": 1.9489012652944874e-05, "loss": 1.5178, "step": 3206 }, { "epoch": 0.13, "grad_norm": 6.438750031904858, "learning_rate": 1.9488600877429e-05, "loss": 1.1693, "step": 3207 }, { "epoch": 0.13, "grad_norm": 5.2529604867609, "learning_rate": 1.9488188940420432e-05, "loss": 1.3445, "step": 3208 }, { "epoch": 0.13, "grad_norm": 6.315975839745974, "learning_rate": 1.9487776841926176e-05, "loss": 1.436, "step": 3209 }, { "epoch": 0.13, "grad_norm": 4.012619624850473, "learning_rate": 1.948736458195325e-05, "loss": 1.2594, "step": 3210 }, { "epoch": 0.13, "grad_norm": 5.890717702991227, "learning_rate": 1.9486952160508665e-05, "loss": 1.5454, "step": 3211 }, { "epoch": 0.13, "grad_norm": 6.129146337493982, "learning_rate": 1.9486539577599445e-05, "loss": 1.2022, "step": 3212 }, { "epoch": 0.13, "grad_norm": 9.456061686636017, "learning_rate": 1.948612683323261e-05, "loss": 1.6361, "step": 3213 }, { "epoch": 0.13, "grad_norm": 4.835789233262676, "learning_rate": 1.9485713927415186e-05, "loss": 1.1138, "step": 3214 }, { "epoch": 0.13, "grad_norm": 5.136426438011736, "learning_rate": 1.9485300860154194e-05, "loss": 1.3093, "step": 3215 }, { "epoch": 0.13, "grad_norm": 8.327201137961394, "learning_rate": 1.9484887631456674e-05, "loss": 1.5607, "step": 3216 }, { "epoch": 0.13, "grad_norm": 6.837548562590902, "learning_rate": 1.9484474241329656e-05, "loss": 1.318, "step": 3217 }, { "epoch": 0.13, "grad_norm": 6.352309975911152, "learning_rate": 1.9484060689780172e-05, "loss": 1.466, "step": 3218 }, { "epoch": 0.13, "grad_norm": 9.735711007284067, "learning_rate": 1.9483646976815264e-05, "loss": 1.0882, "step": 3219 }, { "epoch": 0.13, "grad_norm": 6.256733010334577, "learning_rate": 1.9483233102441974e-05, "loss": 1.5666, "step": 3220 }, { "epoch": 0.13, "grad_norm": 8.028540282110542, "learning_rate": 1.9482819066667342e-05, "loss": 1.4483, "step": 3221 }, { "epoch": 0.13, "grad_norm": 7.504197065860445, "learning_rate": 1.948240486949842e-05, "loss": 1.4249, "step": 3222 }, { "epoch": 0.13, "grad_norm": 5.548819065160275, "learning_rate": 1.948199051094225e-05, "loss": 1.3833, "step": 3223 }, { "epoch": 0.13, "grad_norm": 6.2136892827810515, "learning_rate": 1.948157599100589e-05, "loss": 1.2258, "step": 3224 }, { "epoch": 0.13, "grad_norm": 4.4880970593812055, "learning_rate": 1.9481161309696393e-05, "loss": 1.2484, "step": 3225 }, { "epoch": 0.13, "grad_norm": 6.671116439321732, "learning_rate": 1.9480746467020825e-05, "loss": 1.4631, "step": 3226 }, { "epoch": 0.13, "grad_norm": 5.296735583885339, "learning_rate": 1.948033146298623e-05, "loss": 1.1563, "step": 3227 }, { "epoch": 0.13, "grad_norm": 6.016455086786798, "learning_rate": 1.9479916297599683e-05, "loss": 1.2724, "step": 3228 }, { "epoch": 0.13, "grad_norm": 5.741368219442508, "learning_rate": 1.947950097086825e-05, "loss": 1.3168, "step": 3229 }, { "epoch": 0.13, "grad_norm": 4.942241250653897, "learning_rate": 1.9479085482798995e-05, "loss": 1.2419, "step": 3230 }, { "epoch": 0.13, "grad_norm": 8.233880497656015, "learning_rate": 1.947866983339899e-05, "loss": 1.0405, "step": 3231 }, { "epoch": 0.13, "grad_norm": 8.686138213576816, "learning_rate": 1.9478254022675313e-05, "loss": 1.3642, "step": 3232 }, { "epoch": 0.13, "grad_norm": 5.198563422292533, "learning_rate": 1.9477838050635032e-05, "loss": 1.4815, "step": 3233 }, { "epoch": 0.13, "grad_norm": 4.8195837294042, "learning_rate": 1.947742191728524e-05, "loss": 1.2152, "step": 3234 }, { "epoch": 0.13, "grad_norm": 4.919356606267648, "learning_rate": 1.9477005622633012e-05, "loss": 1.2265, "step": 3235 }, { "epoch": 0.13, "grad_norm": 6.538992435059917, "learning_rate": 1.947658916668543e-05, "loss": 1.2607, "step": 3236 }, { "epoch": 0.13, "grad_norm": 5.242175342325214, "learning_rate": 1.9476172549449593e-05, "loss": 1.2295, "step": 3237 }, { "epoch": 0.13, "grad_norm": 7.36514967763002, "learning_rate": 1.9475755770932578e-05, "loss": 1.3858, "step": 3238 }, { "epoch": 0.13, "grad_norm": 5.423527561574612, "learning_rate": 1.9475338831141485e-05, "loss": 1.3123, "step": 3239 }, { "epoch": 0.13, "grad_norm": 5.638508019465149, "learning_rate": 1.9474921730083413e-05, "loss": 1.504, "step": 3240 }, { "epoch": 0.13, "grad_norm": 4.943571819457716, "learning_rate": 1.9474504467765457e-05, "loss": 1.2554, "step": 3241 }, { "epoch": 0.13, "grad_norm": 5.840141511832041, "learning_rate": 1.947408704419472e-05, "loss": 1.2764, "step": 3242 }, { "epoch": 0.13, "grad_norm": 4.842227644229233, "learning_rate": 1.9473669459378305e-05, "loss": 1.1892, "step": 3243 }, { "epoch": 0.13, "grad_norm": 5.231742095563332, "learning_rate": 1.9473251713323323e-05, "loss": 1.3548, "step": 3244 }, { "epoch": 0.13, "grad_norm": 4.965579620840349, "learning_rate": 1.9472833806036876e-05, "loss": 1.2348, "step": 3245 }, { "epoch": 0.13, "grad_norm": 5.291797988219052, "learning_rate": 1.9472415737526086e-05, "loss": 1.1578, "step": 3246 }, { "epoch": 0.13, "grad_norm": 9.05889135336164, "learning_rate": 1.9471997507798062e-05, "loss": 1.6284, "step": 3247 }, { "epoch": 0.13, "grad_norm": 5.679425184609992, "learning_rate": 1.9471579116859926e-05, "loss": 1.2238, "step": 3248 }, { "epoch": 0.13, "grad_norm": 4.108669539692443, "learning_rate": 1.9471160564718795e-05, "loss": 0.8654, "step": 3249 }, { "epoch": 0.13, "grad_norm": 5.76392612498795, "learning_rate": 1.9470741851381796e-05, "loss": 1.3839, "step": 3250 }, { "epoch": 0.13, "grad_norm": 6.253218852365156, "learning_rate": 1.9470322976856053e-05, "loss": 1.1897, "step": 3251 }, { "epoch": 0.13, "grad_norm": 7.079527753969774, "learning_rate": 1.94699039411487e-05, "loss": 1.2545, "step": 3252 }, { "epoch": 0.13, "grad_norm": 4.663969590548592, "learning_rate": 1.946948474426686e-05, "loss": 1.4431, "step": 3253 }, { "epoch": 0.13, "grad_norm": 4.484699512101987, "learning_rate": 1.9469065386217673e-05, "loss": 1.0928, "step": 3254 }, { "epoch": 0.13, "grad_norm": 8.717445961901015, "learning_rate": 1.9468645867008276e-05, "loss": 1.3254, "step": 3255 }, { "epoch": 0.13, "grad_norm": 5.071149164123177, "learning_rate": 1.9468226186645813e-05, "loss": 1.0902, "step": 3256 }, { "epoch": 0.13, "grad_norm": 6.218806457313732, "learning_rate": 1.946780634513742e-05, "loss": 1.6235, "step": 3257 }, { "epoch": 0.13, "grad_norm": 6.333969292158284, "learning_rate": 1.9467386342490245e-05, "loss": 1.1253, "step": 3258 }, { "epoch": 0.13, "grad_norm": 6.559886561860392, "learning_rate": 1.9466966178711438e-05, "loss": 1.2546, "step": 3259 }, { "epoch": 0.13, "grad_norm": 6.69814946934972, "learning_rate": 1.9466545853808147e-05, "loss": 1.5185, "step": 3260 }, { "epoch": 0.13, "grad_norm": 6.238771490499749, "learning_rate": 1.9466125367787528e-05, "loss": 1.408, "step": 3261 }, { "epoch": 0.13, "grad_norm": 6.628580859231505, "learning_rate": 1.9465704720656736e-05, "loss": 1.6864, "step": 3262 }, { "epoch": 0.13, "grad_norm": 8.751367572304234, "learning_rate": 1.9465283912422934e-05, "loss": 1.5183, "step": 3263 }, { "epoch": 0.13, "grad_norm": 10.538599781838982, "learning_rate": 1.946486294309328e-05, "loss": 1.5341, "step": 3264 }, { "epoch": 0.13, "grad_norm": 6.820825510610087, "learning_rate": 1.946444181267494e-05, "loss": 1.2936, "step": 3265 }, { "epoch": 0.13, "grad_norm": 5.928673362332088, "learning_rate": 1.9464020521175083e-05, "loss": 1.0408, "step": 3266 }, { "epoch": 0.13, "grad_norm": 6.6252149959206506, "learning_rate": 1.9463599068600875e-05, "loss": 1.2583, "step": 3267 }, { "epoch": 0.13, "grad_norm": 10.412580713080168, "learning_rate": 1.9463177454959493e-05, "loss": 1.3512, "step": 3268 }, { "epoch": 0.13, "grad_norm": 5.653030957375395, "learning_rate": 1.9462755680258113e-05, "loss": 1.1338, "step": 3269 }, { "epoch": 0.13, "grad_norm": 7.447617988641233, "learning_rate": 1.946233374450391e-05, "loss": 1.5323, "step": 3270 }, { "epoch": 0.13, "grad_norm": 8.282261222145705, "learning_rate": 1.9461911647704066e-05, "loss": 1.4106, "step": 3271 }, { "epoch": 0.13, "grad_norm": 6.960131756637283, "learning_rate": 1.946148938986577e-05, "loss": 1.3172, "step": 3272 }, { "epoch": 0.13, "grad_norm": 7.674550950269288, "learning_rate": 1.94610669709962e-05, "loss": 1.4782, "step": 3273 }, { "epoch": 0.13, "grad_norm": 6.506198869122454, "learning_rate": 1.946064439110255e-05, "loss": 1.4238, "step": 3274 }, { "epoch": 0.13, "grad_norm": 4.886270450800704, "learning_rate": 1.9460221650192016e-05, "loss": 1.1621, "step": 3275 }, { "epoch": 0.13, "grad_norm": 8.342368421347022, "learning_rate": 1.945979874827179e-05, "loss": 1.4895, "step": 3276 }, { "epoch": 0.13, "grad_norm": 5.974476170875982, "learning_rate": 1.9459375685349066e-05, "loss": 1.3778, "step": 3277 }, { "epoch": 0.13, "grad_norm": 7.723824414064462, "learning_rate": 1.9458952461431047e-05, "loss": 1.2274, "step": 3278 }, { "epoch": 0.13, "grad_norm": 5.9309333494473275, "learning_rate": 1.945852907652494e-05, "loss": 1.3131, "step": 3279 }, { "epoch": 0.13, "grad_norm": 6.174028374815243, "learning_rate": 1.9458105530637945e-05, "loss": 1.4126, "step": 3280 }, { "epoch": 0.13, "grad_norm": 7.42526151683423, "learning_rate": 1.945768182377727e-05, "loss": 1.391, "step": 3281 }, { "epoch": 0.13, "grad_norm": 5.7815508600708405, "learning_rate": 1.9457257955950136e-05, "loss": 1.1637, "step": 3282 }, { "epoch": 0.13, "grad_norm": 5.235192022263862, "learning_rate": 1.9456833927163745e-05, "loss": 1.3215, "step": 3283 }, { "epoch": 0.13, "grad_norm": 6.860765433504844, "learning_rate": 1.9456409737425322e-05, "loss": 1.5687, "step": 3284 }, { "epoch": 0.13, "grad_norm": 6.50325663611714, "learning_rate": 1.945598538674208e-05, "loss": 1.2808, "step": 3285 }, { "epoch": 0.13, "grad_norm": 4.042301135948131, "learning_rate": 1.9455560875121245e-05, "loss": 1.0842, "step": 3286 }, { "epoch": 0.13, "grad_norm": 6.23677810978767, "learning_rate": 1.9455136202570045e-05, "loss": 1.6341, "step": 3287 }, { "epoch": 0.13, "grad_norm": 5.433045526404774, "learning_rate": 1.9454711369095703e-05, "loss": 1.3169, "step": 3288 }, { "epoch": 0.13, "grad_norm": 10.775568158855489, "learning_rate": 1.9454286374705454e-05, "loss": 1.237, "step": 3289 }, { "epoch": 0.13, "grad_norm": 4.689083871178388, "learning_rate": 1.9453861219406524e-05, "loss": 0.9635, "step": 3290 }, { "epoch": 0.13, "grad_norm": 8.372991689863902, "learning_rate": 1.9453435903206158e-05, "loss": 1.5761, "step": 3291 }, { "epoch": 0.13, "grad_norm": 6.63275167794525, "learning_rate": 1.945301042611159e-05, "loss": 1.4305, "step": 3292 }, { "epoch": 0.13, "grad_norm": 5.930546492153795, "learning_rate": 1.9452584788130056e-05, "loss": 1.2588, "step": 3293 }, { "epoch": 0.13, "grad_norm": 5.366031774389702, "learning_rate": 1.9452158989268812e-05, "loss": 1.2981, "step": 3294 }, { "epoch": 0.13, "grad_norm": 13.620615471082825, "learning_rate": 1.94517330295351e-05, "loss": 1.5391, "step": 3295 }, { "epoch": 0.13, "grad_norm": 8.274268881662023, "learning_rate": 1.9451306908936166e-05, "loss": 1.2932, "step": 3296 }, { "epoch": 0.13, "grad_norm": 8.281238755349078, "learning_rate": 1.9450880627479262e-05, "loss": 1.383, "step": 3297 }, { "epoch": 0.13, "grad_norm": 5.673910053830081, "learning_rate": 1.945045418517165e-05, "loss": 1.2052, "step": 3298 }, { "epoch": 0.13, "grad_norm": 5.618628733128624, "learning_rate": 1.9450027582020582e-05, "loss": 1.4226, "step": 3299 }, { "epoch": 0.13, "grad_norm": 6.645187814093119, "learning_rate": 1.9449600818033322e-05, "loss": 1.5259, "step": 3300 }, { "epoch": 0.13, "grad_norm": 7.099596961234936, "learning_rate": 1.9449173893217134e-05, "loss": 1.3621, "step": 3301 }, { "epoch": 0.13, "grad_norm": 6.921301316880527, "learning_rate": 1.944874680757928e-05, "loss": 1.4232, "step": 3302 }, { "epoch": 0.13, "grad_norm": 7.654607927474763, "learning_rate": 1.9448319561127032e-05, "loss": 1.3699, "step": 3303 }, { "epoch": 0.13, "grad_norm": 5.439714898858324, "learning_rate": 1.944789215386766e-05, "loss": 1.2159, "step": 3304 }, { "epoch": 0.13, "grad_norm": 5.72204302312801, "learning_rate": 1.9447464585808438e-05, "loss": 1.0172, "step": 3305 }, { "epoch": 0.13, "grad_norm": 5.818956633735157, "learning_rate": 1.9447036856956643e-05, "loss": 1.3676, "step": 3306 }, { "epoch": 0.13, "grad_norm": 7.262279286287546, "learning_rate": 1.944660896731956e-05, "loss": 1.5104, "step": 3307 }, { "epoch": 0.13, "grad_norm": 5.573475144694797, "learning_rate": 1.9446180916904467e-05, "loss": 1.2324, "step": 3308 }, { "epoch": 0.13, "grad_norm": 5.285351516155106, "learning_rate": 1.9445752705718644e-05, "loss": 1.347, "step": 3309 }, { "epoch": 0.13, "grad_norm": 5.240872637600388, "learning_rate": 1.944532433376939e-05, "loss": 0.9876, "step": 3310 }, { "epoch": 0.13, "grad_norm": 5.328598112214239, "learning_rate": 1.944489580106399e-05, "loss": 1.2482, "step": 3311 }, { "epoch": 0.13, "grad_norm": 4.876037093380104, "learning_rate": 1.944446710760974e-05, "loss": 1.0243, "step": 3312 }, { "epoch": 0.13, "grad_norm": 9.68922648656827, "learning_rate": 1.9444038253413927e-05, "loss": 1.2498, "step": 3313 }, { "epoch": 0.13, "grad_norm": 4.386042769752434, "learning_rate": 1.944360923848386e-05, "loss": 1.1413, "step": 3314 }, { "epoch": 0.13, "grad_norm": 5.659303162081216, "learning_rate": 1.9443180062826842e-05, "loss": 1.0009, "step": 3315 }, { "epoch": 0.13, "grad_norm": 4.670410536949369, "learning_rate": 1.9442750726450166e-05, "loss": 1.2824, "step": 3316 }, { "epoch": 0.13, "grad_norm": 5.413963606263734, "learning_rate": 1.9442321229361153e-05, "loss": 1.4245, "step": 3317 }, { "epoch": 0.13, "grad_norm": 5.585528788406191, "learning_rate": 1.9441891571567102e-05, "loss": 0.8945, "step": 3318 }, { "epoch": 0.13, "grad_norm": 4.685591388886318, "learning_rate": 1.9441461753075333e-05, "loss": 1.3506, "step": 3319 }, { "epoch": 0.13, "grad_norm": 5.145652990902651, "learning_rate": 1.9441031773893154e-05, "loss": 1.0681, "step": 3320 }, { "epoch": 0.13, "grad_norm": 6.352750874389817, "learning_rate": 1.9440601634027892e-05, "loss": 1.2163, "step": 3321 }, { "epoch": 0.13, "grad_norm": 5.849806879905485, "learning_rate": 1.944017133348686e-05, "loss": 1.2149, "step": 3322 }, { "epoch": 0.13, "grad_norm": 7.628944191212095, "learning_rate": 1.9439740872277384e-05, "loss": 1.4039, "step": 3323 }, { "epoch": 0.13, "grad_norm": 5.410014045362807, "learning_rate": 1.9439310250406794e-05, "loss": 1.288, "step": 3324 }, { "epoch": 0.13, "grad_norm": 5.8274339350327224, "learning_rate": 1.9438879467882413e-05, "loss": 1.3408, "step": 3325 }, { "epoch": 0.13, "grad_norm": 5.580427472801265, "learning_rate": 1.9438448524711576e-05, "loss": 1.1246, "step": 3326 }, { "epoch": 0.13, "grad_norm": 7.773255568507889, "learning_rate": 1.943801742090162e-05, "loss": 1.6044, "step": 3327 }, { "epoch": 0.13, "grad_norm": 7.264925055453032, "learning_rate": 1.943758615645988e-05, "loss": 1.0845, "step": 3328 }, { "epoch": 0.13, "grad_norm": 5.1285509559694455, "learning_rate": 1.9437154731393694e-05, "loss": 1.1623, "step": 3329 }, { "epoch": 0.13, "grad_norm": 6.889338185632538, "learning_rate": 1.94367231457104e-05, "loss": 1.4098, "step": 3330 }, { "epoch": 0.13, "grad_norm": 4.47445467463117, "learning_rate": 1.943629139941736e-05, "loss": 1.1482, "step": 3331 }, { "epoch": 0.13, "grad_norm": 6.552692855026888, "learning_rate": 1.9435859492521906e-05, "loss": 1.3552, "step": 3332 }, { "epoch": 0.13, "grad_norm": 6.4598459268808694, "learning_rate": 1.9435427425031393e-05, "loss": 1.1868, "step": 3333 }, { "epoch": 0.13, "grad_norm": 10.120259090094493, "learning_rate": 1.943499519695318e-05, "loss": 1.4921, "step": 3334 }, { "epoch": 0.13, "grad_norm": 5.792501300172822, "learning_rate": 1.9434562808294622e-05, "loss": 1.061, "step": 3335 }, { "epoch": 0.13, "grad_norm": 4.598181886120825, "learning_rate": 1.943413025906307e-05, "loss": 1.0647, "step": 3336 }, { "epoch": 0.13, "grad_norm": 4.230043662129069, "learning_rate": 1.9433697549265896e-05, "loss": 1.0181, "step": 3337 }, { "epoch": 0.13, "grad_norm": 18.856936026810484, "learning_rate": 1.943326467891046e-05, "loss": 1.6702, "step": 3338 }, { "epoch": 0.13, "grad_norm": 5.03830172688048, "learning_rate": 1.9432831648004128e-05, "loss": 1.4255, "step": 3339 }, { "epoch": 0.13, "grad_norm": 4.63577456919776, "learning_rate": 1.943239845655427e-05, "loss": 1.2956, "step": 3340 }, { "epoch": 0.13, "grad_norm": 6.774999816236365, "learning_rate": 1.943196510456826e-05, "loss": 1.1748, "step": 3341 }, { "epoch": 0.13, "grad_norm": 7.843903043950551, "learning_rate": 1.943153159205348e-05, "loss": 1.1477, "step": 3342 }, { "epoch": 0.13, "grad_norm": 9.234248514806017, "learning_rate": 1.94310979190173e-05, "loss": 1.5955, "step": 3343 }, { "epoch": 0.13, "grad_norm": 6.338884870818518, "learning_rate": 1.94306640854671e-05, "loss": 0.9635, "step": 3344 }, { "epoch": 0.13, "grad_norm": 7.23531744425188, "learning_rate": 1.9430230091410265e-05, "loss": 1.223, "step": 3345 }, { "epoch": 0.13, "grad_norm": 7.563420286292663, "learning_rate": 1.942979593685419e-05, "loss": 1.5077, "step": 3346 }, { "epoch": 0.13, "grad_norm": 8.431782743218271, "learning_rate": 1.942936162180625e-05, "loss": 1.1606, "step": 3347 }, { "epoch": 0.13, "grad_norm": 6.290189750667179, "learning_rate": 1.942892714627385e-05, "loss": 1.2706, "step": 3348 }, { "epoch": 0.13, "grad_norm": 4.4201539775121725, "learning_rate": 1.9428492510264374e-05, "loss": 1.2016, "step": 3349 }, { "epoch": 0.13, "grad_norm": 9.988778742524293, "learning_rate": 1.9428057713785226e-05, "loss": 1.2799, "step": 3350 }, { "epoch": 0.13, "grad_norm": 8.041848148760502, "learning_rate": 1.9427622756843803e-05, "loss": 1.3313, "step": 3351 }, { "epoch": 0.14, "grad_norm": 7.683812301600324, "learning_rate": 1.9427187639447514e-05, "loss": 1.3284, "step": 3352 }, { "epoch": 0.14, "grad_norm": 6.037413054633752, "learning_rate": 1.942675236160376e-05, "loss": 1.3698, "step": 3353 }, { "epoch": 0.14, "grad_norm": 4.495613753991952, "learning_rate": 1.942631692331994e-05, "loss": 0.9804, "step": 3354 }, { "epoch": 0.14, "grad_norm": 5.543844139044633, "learning_rate": 1.942588132460348e-05, "loss": 1.4795, "step": 3355 }, { "epoch": 0.14, "grad_norm": 5.469374523114687, "learning_rate": 1.942544556546179e-05, "loss": 1.206, "step": 3356 }, { "epoch": 0.14, "grad_norm": 5.031339884769263, "learning_rate": 1.942500964590228e-05, "loss": 1.3744, "step": 3357 }, { "epoch": 0.14, "grad_norm": 10.632459982394275, "learning_rate": 1.9424573565932375e-05, "loss": 1.3592, "step": 3358 }, { "epoch": 0.14, "grad_norm": 10.152457338704897, "learning_rate": 1.9424137325559496e-05, "loss": 1.3744, "step": 3359 }, { "epoch": 0.14, "grad_norm": 8.817381467132538, "learning_rate": 1.9423700924791065e-05, "loss": 1.1671, "step": 3360 }, { "epoch": 0.14, "grad_norm": 5.856140417085936, "learning_rate": 1.9423264363634513e-05, "loss": 1.2425, "step": 3361 }, { "epoch": 0.14, "grad_norm": 6.307620813916284, "learning_rate": 1.9422827642097266e-05, "loss": 1.4451, "step": 3362 }, { "epoch": 0.14, "grad_norm": 12.016422738751547, "learning_rate": 1.9422390760186762e-05, "loss": 1.5191, "step": 3363 }, { "epoch": 0.14, "grad_norm": 10.259095271282117, "learning_rate": 1.9421953717910434e-05, "loss": 1.3417, "step": 3364 }, { "epoch": 0.14, "grad_norm": 6.196925698008163, "learning_rate": 1.9421516515275716e-05, "loss": 1.1784, "step": 3365 }, { "epoch": 0.14, "grad_norm": 8.072968875541047, "learning_rate": 1.9421079152290058e-05, "loss": 1.3248, "step": 3366 }, { "epoch": 0.14, "grad_norm": 6.10942870247672, "learning_rate": 1.9420641628960897e-05, "loss": 1.2596, "step": 3367 }, { "epoch": 0.14, "grad_norm": 5.577252628633583, "learning_rate": 1.942020394529568e-05, "loss": 1.2609, "step": 3368 }, { "epoch": 0.14, "grad_norm": 4.93289491121317, "learning_rate": 1.941976610130186e-05, "loss": 1.1995, "step": 3369 }, { "epoch": 0.14, "grad_norm": 5.244429828615214, "learning_rate": 1.9419328096986887e-05, "loss": 1.3102, "step": 3370 }, { "epoch": 0.14, "grad_norm": 6.1170779650590354, "learning_rate": 1.9418889932358213e-05, "loss": 1.3383, "step": 3371 }, { "epoch": 0.14, "grad_norm": 5.9837898830308545, "learning_rate": 1.94184516074233e-05, "loss": 1.2413, "step": 3372 }, { "epoch": 0.14, "grad_norm": 5.802853367765761, "learning_rate": 1.9418013122189603e-05, "loss": 1.4723, "step": 3373 }, { "epoch": 0.14, "grad_norm": 4.985108001041933, "learning_rate": 1.941757447666459e-05, "loss": 1.4141, "step": 3374 }, { "epoch": 0.14, "grad_norm": 7.824270072485116, "learning_rate": 1.9417135670855725e-05, "loss": 1.2629, "step": 3375 }, { "epoch": 0.14, "grad_norm": 4.991615788394417, "learning_rate": 1.9416696704770476e-05, "loss": 1.1989, "step": 3376 }, { "epoch": 0.14, "grad_norm": 6.155859940372491, "learning_rate": 1.941625757841631e-05, "loss": 1.2238, "step": 3377 }, { "epoch": 0.14, "grad_norm": 5.224106117372199, "learning_rate": 1.9415818291800706e-05, "loss": 1.4948, "step": 3378 }, { "epoch": 0.14, "grad_norm": 5.589497109648999, "learning_rate": 1.9415378844931144e-05, "loss": 1.5896, "step": 3379 }, { "epoch": 0.14, "grad_norm": 6.170033320692142, "learning_rate": 1.941493923781509e-05, "loss": 1.0294, "step": 3380 }, { "epoch": 0.14, "grad_norm": 6.051430615475649, "learning_rate": 1.9414499470460038e-05, "loss": 1.3544, "step": 3381 }, { "epoch": 0.14, "grad_norm": 7.496059669335801, "learning_rate": 1.941405954287347e-05, "loss": 1.2456, "step": 3382 }, { "epoch": 0.14, "grad_norm": 5.16012270156543, "learning_rate": 1.9413619455062873e-05, "loss": 1.316, "step": 3383 }, { "epoch": 0.14, "grad_norm": 5.670541146981436, "learning_rate": 1.941317920703574e-05, "loss": 1.1069, "step": 3384 }, { "epoch": 0.14, "grad_norm": 5.712483386603127, "learning_rate": 1.941273879879955e-05, "loss": 0.9328, "step": 3385 }, { "epoch": 0.14, "grad_norm": 3.9802664301070703, "learning_rate": 1.9412298230361816e-05, "loss": 1.0938, "step": 3386 }, { "epoch": 0.14, "grad_norm": 6.463753723647389, "learning_rate": 1.9411857501730028e-05, "loss": 1.3868, "step": 3387 }, { "epoch": 0.14, "grad_norm": 4.511544210395661, "learning_rate": 1.9411416612911688e-05, "loss": 1.3139, "step": 3388 }, { "epoch": 0.14, "grad_norm": 4.611414218736167, "learning_rate": 1.9410975563914298e-05, "loss": 1.4016, "step": 3389 }, { "epoch": 0.14, "grad_norm": 4.964293718582946, "learning_rate": 1.9410534354745367e-05, "loss": 1.2763, "step": 3390 }, { "epoch": 0.14, "grad_norm": 5.362813387979358, "learning_rate": 1.9410092985412405e-05, "loss": 1.3941, "step": 3391 }, { "epoch": 0.14, "grad_norm": 7.376554532814177, "learning_rate": 1.9409651455922925e-05, "loss": 1.5512, "step": 3392 }, { "epoch": 0.14, "grad_norm": 5.406336684032062, "learning_rate": 1.9409209766284438e-05, "loss": 1.1554, "step": 3393 }, { "epoch": 0.14, "grad_norm": 5.743753070043522, "learning_rate": 1.940876791650446e-05, "loss": 1.6023, "step": 3394 }, { "epoch": 0.14, "grad_norm": 5.383232165187367, "learning_rate": 1.9408325906590517e-05, "loss": 1.4496, "step": 3395 }, { "epoch": 0.14, "grad_norm": 4.906339305535815, "learning_rate": 1.9407883736550128e-05, "loss": 1.2932, "step": 3396 }, { "epoch": 0.14, "grad_norm": 7.025327306896144, "learning_rate": 1.9407441406390818e-05, "loss": 1.2405, "step": 3397 }, { "epoch": 0.14, "grad_norm": 8.769775916082093, "learning_rate": 1.940699891612012e-05, "loss": 1.4101, "step": 3398 }, { "epoch": 0.14, "grad_norm": 6.052871484635951, "learning_rate": 1.940655626574556e-05, "loss": 1.2884, "step": 3399 }, { "epoch": 0.14, "grad_norm": 6.414912173761975, "learning_rate": 1.9406113455274672e-05, "loss": 1.4323, "step": 3400 }, { "epoch": 0.14, "grad_norm": 5.83116374807513, "learning_rate": 1.9405670484714995e-05, "loss": 1.1161, "step": 3401 }, { "epoch": 0.14, "grad_norm": 7.821701934315629, "learning_rate": 1.940522735407407e-05, "loss": 1.4214, "step": 3402 }, { "epoch": 0.14, "grad_norm": 7.0175844553651725, "learning_rate": 1.940478406335943e-05, "loss": 1.5578, "step": 3403 }, { "epoch": 0.14, "grad_norm": 8.48628861303603, "learning_rate": 1.9404340612578628e-05, "loss": 1.5568, "step": 3404 }, { "epoch": 0.14, "grad_norm": 6.446606043686788, "learning_rate": 1.940389700173921e-05, "loss": 1.4454, "step": 3405 }, { "epoch": 0.14, "grad_norm": 6.819481838956032, "learning_rate": 1.9403453230848726e-05, "loss": 1.4707, "step": 3406 }, { "epoch": 0.14, "grad_norm": 4.7041510195737315, "learning_rate": 1.9403009299914727e-05, "loss": 1.2329, "step": 3407 }, { "epoch": 0.14, "grad_norm": 7.361500083965648, "learning_rate": 1.940256520894477e-05, "loss": 1.3406, "step": 3408 }, { "epoch": 0.14, "grad_norm": 5.1733373194692565, "learning_rate": 1.9402120957946413e-05, "loss": 1.5265, "step": 3409 }, { "epoch": 0.14, "grad_norm": 6.465194621615663, "learning_rate": 1.9401676546927218e-05, "loss": 1.0876, "step": 3410 }, { "epoch": 0.14, "grad_norm": 6.603399231957344, "learning_rate": 1.9401231975894744e-05, "loss": 1.4524, "step": 3411 }, { "epoch": 0.14, "grad_norm": 9.824476685524571, "learning_rate": 1.9400787244856564e-05, "loss": 1.715, "step": 3412 }, { "epoch": 0.14, "grad_norm": 6.589583707872581, "learning_rate": 1.9400342353820244e-05, "loss": 1.3078, "step": 3413 }, { "epoch": 0.14, "grad_norm": 5.374150355455869, "learning_rate": 1.9399897302793356e-05, "loss": 1.4007, "step": 3414 }, { "epoch": 0.14, "grad_norm": 7.3259788621077595, "learning_rate": 1.9399452091783475e-05, "loss": 1.7372, "step": 3415 }, { "epoch": 0.14, "grad_norm": 6.046465292883325, "learning_rate": 1.939900672079818e-05, "loss": 1.3025, "step": 3416 }, { "epoch": 0.14, "grad_norm": 6.202166410265881, "learning_rate": 1.9398561189845045e-05, "loss": 1.3197, "step": 3417 }, { "epoch": 0.14, "grad_norm": 5.0489215579169695, "learning_rate": 1.9398115498931657e-05, "loss": 1.099, "step": 3418 }, { "epoch": 0.14, "grad_norm": 7.529183044772131, "learning_rate": 1.9397669648065603e-05, "loss": 1.4582, "step": 3419 }, { "epoch": 0.14, "grad_norm": 6.70313803446688, "learning_rate": 1.939722363725447e-05, "loss": 1.2427, "step": 3420 }, { "epoch": 0.14, "grad_norm": 6.891997811350824, "learning_rate": 1.9396777466505846e-05, "loss": 1.4855, "step": 3421 }, { "epoch": 0.14, "grad_norm": 11.553798407386221, "learning_rate": 1.939633113582733e-05, "loss": 1.6285, "step": 3422 }, { "epoch": 0.14, "grad_norm": 6.431800125437125, "learning_rate": 1.9395884645226515e-05, "loss": 1.7717, "step": 3423 }, { "epoch": 0.14, "grad_norm": 5.758094550645932, "learning_rate": 1.9395437994711002e-05, "loss": 1.36, "step": 3424 }, { "epoch": 0.14, "grad_norm": 7.106706396827898, "learning_rate": 1.939499118428839e-05, "loss": 1.4688, "step": 3425 }, { "epoch": 0.14, "grad_norm": 7.354318890115715, "learning_rate": 1.9394544213966288e-05, "loss": 1.3066, "step": 3426 }, { "epoch": 0.14, "grad_norm": 6.88545503745137, "learning_rate": 1.9394097083752297e-05, "loss": 1.3422, "step": 3427 }, { "epoch": 0.14, "grad_norm": 6.164132069473866, "learning_rate": 1.939364979365403e-05, "loss": 1.2147, "step": 3428 }, { "epoch": 0.14, "grad_norm": 7.572123127985721, "learning_rate": 1.93932023436791e-05, "loss": 1.3116, "step": 3429 }, { "epoch": 0.14, "grad_norm": 8.96948694150734, "learning_rate": 1.9392754733835128e-05, "loss": 1.1239, "step": 3430 }, { "epoch": 0.14, "grad_norm": 9.96746578011833, "learning_rate": 1.939230696412972e-05, "loss": 1.2308, "step": 3431 }, { "epoch": 0.14, "grad_norm": 8.886858589409234, "learning_rate": 1.939185903457051e-05, "loss": 1.5237, "step": 3432 }, { "epoch": 0.14, "grad_norm": 10.14458108874245, "learning_rate": 1.939141094516511e-05, "loss": 1.4112, "step": 3433 }, { "epoch": 0.14, "grad_norm": 8.563895841498034, "learning_rate": 1.9390962695921152e-05, "loss": 1.2075, "step": 3434 }, { "epoch": 0.14, "grad_norm": 8.688612434360364, "learning_rate": 1.9390514286846266e-05, "loss": 1.2809, "step": 3435 }, { "epoch": 0.14, "grad_norm": 6.455404088915596, "learning_rate": 1.9390065717948084e-05, "loss": 1.124, "step": 3436 }, { "epoch": 0.14, "grad_norm": 5.377092463913481, "learning_rate": 1.9389616989234236e-05, "loss": 1.2928, "step": 3437 }, { "epoch": 0.14, "grad_norm": 11.3292860420178, "learning_rate": 1.9389168100712363e-05, "loss": 1.4396, "step": 3438 }, { "epoch": 0.14, "grad_norm": 9.865940936365773, "learning_rate": 1.9388719052390104e-05, "loss": 1.3292, "step": 3439 }, { "epoch": 0.14, "grad_norm": 6.844952206312766, "learning_rate": 1.9388269844275103e-05, "loss": 1.4415, "step": 3440 }, { "epoch": 0.14, "grad_norm": 8.39040531383545, "learning_rate": 1.9387820476375005e-05, "loss": 1.4864, "step": 3441 }, { "epoch": 0.14, "grad_norm": 6.2925562792158996, "learning_rate": 1.938737094869745e-05, "loss": 1.1135, "step": 3442 }, { "epoch": 0.14, "grad_norm": 7.245415351294139, "learning_rate": 1.9386921261250104e-05, "loss": 1.0453, "step": 3443 }, { "epoch": 0.14, "grad_norm": 9.707016036573508, "learning_rate": 1.938647141404061e-05, "loss": 1.6276, "step": 3444 }, { "epoch": 0.14, "grad_norm": 6.607878055502544, "learning_rate": 1.9386021407076626e-05, "loss": 1.31, "step": 3445 }, { "epoch": 0.14, "grad_norm": 7.433957679482305, "learning_rate": 1.9385571240365813e-05, "loss": 1.4047, "step": 3446 }, { "epoch": 0.14, "grad_norm": 13.558277596922672, "learning_rate": 1.938512091391583e-05, "loss": 1.5027, "step": 3447 }, { "epoch": 0.14, "grad_norm": 5.543241024622766, "learning_rate": 1.938467042773434e-05, "loss": 1.4807, "step": 3448 }, { "epoch": 0.14, "grad_norm": 6.155714350358287, "learning_rate": 1.938421978182902e-05, "loss": 1.1894, "step": 3449 }, { "epoch": 0.14, "grad_norm": 6.291657158779086, "learning_rate": 1.9383768976207526e-05, "loss": 1.3246, "step": 3450 }, { "epoch": 0.14, "grad_norm": 7.304015317762432, "learning_rate": 1.938331801087754e-05, "loss": 1.2248, "step": 3451 }, { "epoch": 0.14, "grad_norm": 5.762949836475672, "learning_rate": 1.9382866885846735e-05, "loss": 1.3467, "step": 3452 }, { "epoch": 0.14, "grad_norm": 5.608611672444151, "learning_rate": 1.9382415601122786e-05, "loss": 1.1973, "step": 3453 }, { "epoch": 0.14, "grad_norm": 7.543379816820437, "learning_rate": 1.938196415671338e-05, "loss": 1.3761, "step": 3454 }, { "epoch": 0.14, "grad_norm": 6.354132069285957, "learning_rate": 1.9381512552626192e-05, "loss": 1.3247, "step": 3455 }, { "epoch": 0.14, "grad_norm": 5.903248979810366, "learning_rate": 1.9381060788868916e-05, "loss": 1.2662, "step": 3456 }, { "epoch": 0.14, "grad_norm": 6.389771314102732, "learning_rate": 1.938060886544924e-05, "loss": 1.2107, "step": 3457 }, { "epoch": 0.14, "grad_norm": 7.015642641356586, "learning_rate": 1.938015678237485e-05, "loss": 1.35, "step": 3458 }, { "epoch": 0.14, "grad_norm": 4.581190872631078, "learning_rate": 1.9379704539653443e-05, "loss": 1.1677, "step": 3459 }, { "epoch": 0.14, "grad_norm": 6.1322004279129, "learning_rate": 1.9379252137292716e-05, "loss": 1.3736, "step": 3460 }, { "epoch": 0.14, "grad_norm": 5.1417265371124214, "learning_rate": 1.9378799575300372e-05, "loss": 1.1147, "step": 3461 }, { "epoch": 0.14, "grad_norm": 6.478441920222657, "learning_rate": 1.937834685368411e-05, "loss": 1.6412, "step": 3462 }, { "epoch": 0.14, "grad_norm": 8.861013409056604, "learning_rate": 1.9377893972451636e-05, "loss": 1.3654, "step": 3463 }, { "epoch": 0.14, "grad_norm": 5.762769559743608, "learning_rate": 1.9377440931610655e-05, "loss": 1.3497, "step": 3464 }, { "epoch": 0.14, "grad_norm": 4.806424827527615, "learning_rate": 1.937698773116888e-05, "loss": 1.1595, "step": 3465 }, { "epoch": 0.14, "grad_norm": 5.163343008450323, "learning_rate": 1.937653437113403e-05, "loss": 1.2692, "step": 3466 }, { "epoch": 0.14, "grad_norm": 4.523775991468714, "learning_rate": 1.9376080851513814e-05, "loss": 1.158, "step": 3467 }, { "epoch": 0.14, "grad_norm": 8.241072416361618, "learning_rate": 1.937562717231595e-05, "loss": 1.2557, "step": 3468 }, { "epoch": 0.14, "grad_norm": 6.948099192001277, "learning_rate": 1.9375173333548164e-05, "loss": 1.6766, "step": 3469 }, { "epoch": 0.14, "grad_norm": 7.716205195254176, "learning_rate": 1.9374719335218176e-05, "loss": 1.3781, "step": 3470 }, { "epoch": 0.14, "grad_norm": 5.894590509924929, "learning_rate": 1.9374265177333716e-05, "loss": 1.1931, "step": 3471 }, { "epoch": 0.14, "grad_norm": 7.358373304822158, "learning_rate": 1.9373810859902513e-05, "loss": 1.074, "step": 3472 }, { "epoch": 0.14, "grad_norm": 4.872093054416607, "learning_rate": 1.93733563829323e-05, "loss": 1.3643, "step": 3473 }, { "epoch": 0.14, "grad_norm": 5.124914847642334, "learning_rate": 1.937290174643081e-05, "loss": 1.3216, "step": 3474 }, { "epoch": 0.14, "grad_norm": 6.24917466785008, "learning_rate": 1.937244695040578e-05, "loss": 1.1541, "step": 3475 }, { "epoch": 0.14, "grad_norm": 5.574337029204829, "learning_rate": 1.9371991994864956e-05, "loss": 1.3603, "step": 3476 }, { "epoch": 0.14, "grad_norm": 6.5933238798800415, "learning_rate": 1.9371536879816074e-05, "loss": 1.4235, "step": 3477 }, { "epoch": 0.14, "grad_norm": 6.718265799348814, "learning_rate": 1.9371081605266884e-05, "loss": 1.4624, "step": 3478 }, { "epoch": 0.14, "grad_norm": 7.294645468376381, "learning_rate": 1.9370626171225134e-05, "loss": 1.5719, "step": 3479 }, { "epoch": 0.14, "grad_norm": 6.993040484422911, "learning_rate": 1.9370170577698575e-05, "loss": 1.7173, "step": 3480 }, { "epoch": 0.14, "grad_norm": 5.181489864014435, "learning_rate": 1.9369714824694964e-05, "loss": 1.093, "step": 3481 }, { "epoch": 0.14, "grad_norm": 4.965665987668762, "learning_rate": 1.9369258912222052e-05, "loss": 1.1253, "step": 3482 }, { "epoch": 0.14, "grad_norm": 8.12412523605497, "learning_rate": 1.9368802840287603e-05, "loss": 1.4009, "step": 3483 }, { "epoch": 0.14, "grad_norm": 4.742624776225516, "learning_rate": 1.9368346608899376e-05, "loss": 1.2876, "step": 3484 }, { "epoch": 0.14, "grad_norm": 5.992417496795538, "learning_rate": 1.9367890218065136e-05, "loss": 1.3265, "step": 3485 }, { "epoch": 0.14, "grad_norm": 6.37341153192314, "learning_rate": 1.9367433667792658e-05, "loss": 1.2273, "step": 3486 }, { "epoch": 0.14, "grad_norm": 6.34479826162872, "learning_rate": 1.9366976958089704e-05, "loss": 1.3512, "step": 3487 }, { "epoch": 0.14, "grad_norm": 4.382871018399976, "learning_rate": 1.9366520088964046e-05, "loss": 1.2288, "step": 3488 }, { "epoch": 0.14, "grad_norm": 7.462706475481963, "learning_rate": 1.9366063060423468e-05, "loss": 1.6517, "step": 3489 }, { "epoch": 0.14, "grad_norm": 5.1258245338650354, "learning_rate": 1.936560587247574e-05, "loss": 1.3657, "step": 3490 }, { "epoch": 0.14, "grad_norm": 4.75781319110719, "learning_rate": 1.9365148525128653e-05, "loss": 1.0261, "step": 3491 }, { "epoch": 0.14, "grad_norm": 5.461145532762875, "learning_rate": 1.936469101838998e-05, "loss": 1.1841, "step": 3492 }, { "epoch": 0.14, "grad_norm": 5.179499326549803, "learning_rate": 1.936423335226751e-05, "loss": 1.2588, "step": 3493 }, { "epoch": 0.14, "grad_norm": 6.5727749207936075, "learning_rate": 1.936377552676904e-05, "loss": 1.22, "step": 3494 }, { "epoch": 0.14, "grad_norm": 6.196754930823824, "learning_rate": 1.9363317541902353e-05, "loss": 1.0986, "step": 3495 }, { "epoch": 0.14, "grad_norm": 5.556516018727316, "learning_rate": 1.9362859397675252e-05, "loss": 1.4878, "step": 3496 }, { "epoch": 0.14, "grad_norm": 5.658884717243499, "learning_rate": 1.9362401094095527e-05, "loss": 1.323, "step": 3497 }, { "epoch": 0.14, "grad_norm": 4.3152435849005615, "learning_rate": 1.936194263117098e-05, "loss": 1.1911, "step": 3498 }, { "epoch": 0.14, "grad_norm": 5.4156630325329385, "learning_rate": 1.9361484008909418e-05, "loss": 1.1041, "step": 3499 }, { "epoch": 0.14, "grad_norm": 6.7797491483193, "learning_rate": 1.936102522731864e-05, "loss": 1.11, "step": 3500 }, { "epoch": 0.14, "grad_norm": 5.231591885572993, "learning_rate": 1.936056628640646e-05, "loss": 1.2652, "step": 3501 }, { "epoch": 0.14, "grad_norm": 5.874898910780507, "learning_rate": 1.936010718618069e-05, "loss": 1.2977, "step": 3502 }, { "epoch": 0.14, "grad_norm": 9.910529618788939, "learning_rate": 1.9359647926649137e-05, "loss": 1.2985, "step": 3503 }, { "epoch": 0.14, "grad_norm": 5.127822701745876, "learning_rate": 1.9359188507819623e-05, "loss": 1.5123, "step": 3504 }, { "epoch": 0.14, "grad_norm": 6.637024614745705, "learning_rate": 1.9358728929699966e-05, "loss": 1.1047, "step": 3505 }, { "epoch": 0.14, "grad_norm": 6.359722410178401, "learning_rate": 1.9358269192297986e-05, "loss": 1.5618, "step": 3506 }, { "epoch": 0.14, "grad_norm": 7.750770619213121, "learning_rate": 1.935780929562151e-05, "loss": 1.4019, "step": 3507 }, { "epoch": 0.14, "grad_norm": 5.58412025929698, "learning_rate": 1.935734923967836e-05, "loss": 1.7258, "step": 3508 }, { "epoch": 0.14, "grad_norm": 7.547147867446062, "learning_rate": 1.935688902447637e-05, "loss": 1.4428, "step": 3509 }, { "epoch": 0.14, "grad_norm": 4.778229935074341, "learning_rate": 1.9356428650023377e-05, "loss": 1.0937, "step": 3510 }, { "epoch": 0.14, "grad_norm": 4.60732683791496, "learning_rate": 1.935596811632721e-05, "loss": 1.3196, "step": 3511 }, { "epoch": 0.14, "grad_norm": 5.700535746591945, "learning_rate": 1.935550742339571e-05, "loss": 1.3589, "step": 3512 }, { "epoch": 0.14, "grad_norm": 5.131380948001451, "learning_rate": 1.9355046571236715e-05, "loss": 1.2749, "step": 3513 }, { "epoch": 0.14, "grad_norm": 4.621406242803609, "learning_rate": 1.9354585559858073e-05, "loss": 1.3357, "step": 3514 }, { "epoch": 0.14, "grad_norm": 5.661460594129713, "learning_rate": 1.9354124389267625e-05, "loss": 1.2304, "step": 3515 }, { "epoch": 0.14, "grad_norm": 5.801585180083996, "learning_rate": 1.9353663059473225e-05, "loss": 1.3014, "step": 3516 }, { "epoch": 0.14, "grad_norm": 5.983408719264548, "learning_rate": 1.9353201570482723e-05, "loss": 1.7416, "step": 3517 }, { "epoch": 0.14, "grad_norm": 4.272217801127376, "learning_rate": 1.9352739922303973e-05, "loss": 1.1886, "step": 3518 }, { "epoch": 0.14, "grad_norm": 4.731418781738641, "learning_rate": 1.935227811494483e-05, "loss": 1.3161, "step": 3519 }, { "epoch": 0.14, "grad_norm": 6.4041217862465585, "learning_rate": 1.935181614841316e-05, "loss": 1.4277, "step": 3520 }, { "epoch": 0.14, "grad_norm": 6.212395122218816, "learning_rate": 1.9351354022716817e-05, "loss": 1.3881, "step": 3521 }, { "epoch": 0.14, "grad_norm": 5.30944805217136, "learning_rate": 1.9350891737863672e-05, "loss": 1.2659, "step": 3522 }, { "epoch": 0.14, "grad_norm": 4.397898307814744, "learning_rate": 1.935042929386159e-05, "loss": 1.1778, "step": 3523 }, { "epoch": 0.14, "grad_norm": 7.146021414355779, "learning_rate": 1.9349966690718448e-05, "loss": 1.499, "step": 3524 }, { "epoch": 0.14, "grad_norm": 6.6445677890048405, "learning_rate": 1.9349503928442107e-05, "loss": 1.4357, "step": 3525 }, { "epoch": 0.14, "grad_norm": 8.113995305391045, "learning_rate": 1.9349041007040456e-05, "loss": 1.138, "step": 3526 }, { "epoch": 0.14, "grad_norm": 5.279183692764449, "learning_rate": 1.9348577926521365e-05, "loss": 1.2013, "step": 3527 }, { "epoch": 0.14, "grad_norm": 4.70110630698537, "learning_rate": 1.9348114686892722e-05, "loss": 1.2864, "step": 3528 }, { "epoch": 0.14, "grad_norm": 6.450872096008785, "learning_rate": 1.9347651288162405e-05, "loss": 1.5178, "step": 3529 }, { "epoch": 0.14, "grad_norm": 5.342921691811906, "learning_rate": 1.9347187730338307e-05, "loss": 1.1821, "step": 3530 }, { "epoch": 0.14, "grad_norm": 7.670805418502913, "learning_rate": 1.934672401342831e-05, "loss": 1.6262, "step": 3531 }, { "epoch": 0.14, "grad_norm": 6.476008241308268, "learning_rate": 1.9346260137440313e-05, "loss": 1.4287, "step": 3532 }, { "epoch": 0.14, "grad_norm": 5.818720125648486, "learning_rate": 1.9345796102382207e-05, "loss": 1.3461, "step": 3533 }, { "epoch": 0.14, "grad_norm": 6.046703201104345, "learning_rate": 1.9345331908261893e-05, "loss": 1.3369, "step": 3534 }, { "epoch": 0.14, "grad_norm": 9.615088791054546, "learning_rate": 1.934486755508727e-05, "loss": 1.492, "step": 3535 }, { "epoch": 0.14, "grad_norm": 6.301105297708707, "learning_rate": 1.9344403042866242e-05, "loss": 1.2015, "step": 3536 }, { "epoch": 0.14, "grad_norm": 5.463109159601858, "learning_rate": 1.9343938371606714e-05, "loss": 1.3505, "step": 3537 }, { "epoch": 0.14, "grad_norm": 7.186095209401906, "learning_rate": 1.934347354131659e-05, "loss": 0.9972, "step": 3538 }, { "epoch": 0.14, "grad_norm": 8.51826927075693, "learning_rate": 1.934300855200379e-05, "loss": 1.2761, "step": 3539 }, { "epoch": 0.14, "grad_norm": 4.595616800886077, "learning_rate": 1.9342543403676222e-05, "loss": 1.2862, "step": 3540 }, { "epoch": 0.14, "grad_norm": 6.0756309610333625, "learning_rate": 1.9342078096341804e-05, "loss": 1.157, "step": 3541 }, { "epoch": 0.14, "grad_norm": 5.905058340147622, "learning_rate": 1.9341612630008457e-05, "loss": 1.0459, "step": 3542 }, { "epoch": 0.14, "grad_norm": 6.046927815146896, "learning_rate": 1.9341147004684098e-05, "loss": 1.4215, "step": 3543 }, { "epoch": 0.14, "grad_norm": 7.396685621252678, "learning_rate": 1.934068122037666e-05, "loss": 1.5611, "step": 3544 }, { "epoch": 0.14, "grad_norm": 5.876620266252898, "learning_rate": 1.934021527709406e-05, "loss": 1.2384, "step": 3545 }, { "epoch": 0.14, "grad_norm": 5.736525717937083, "learning_rate": 1.933974917484424e-05, "loss": 1.2211, "step": 3546 }, { "epoch": 0.14, "grad_norm": 6.517526527577385, "learning_rate": 1.9339282913635125e-05, "loss": 1.2161, "step": 3547 }, { "epoch": 0.14, "grad_norm": 5.755477376298946, "learning_rate": 1.933881649347465e-05, "loss": 1.4884, "step": 3548 }, { "epoch": 0.14, "grad_norm": 5.0292536306221365, "learning_rate": 1.9338349914370764e-05, "loss": 1.1779, "step": 3549 }, { "epoch": 0.14, "grad_norm": 5.75411921483523, "learning_rate": 1.9337883176331392e-05, "loss": 1.2395, "step": 3550 }, { "epoch": 0.14, "grad_norm": 6.676796576915265, "learning_rate": 1.9337416279364486e-05, "loss": 1.4636, "step": 3551 }, { "epoch": 0.14, "grad_norm": 5.424827810644559, "learning_rate": 1.9336949223477993e-05, "loss": 1.5046, "step": 3552 }, { "epoch": 0.14, "grad_norm": 6.84517265145236, "learning_rate": 1.9336482008679863e-05, "loss": 1.4599, "step": 3553 }, { "epoch": 0.14, "grad_norm": 5.5351893978123945, "learning_rate": 1.9336014634978046e-05, "loss": 1.2653, "step": 3554 }, { "epoch": 0.14, "grad_norm": 5.351882071932221, "learning_rate": 1.9335547102380493e-05, "loss": 1.4666, "step": 3555 }, { "epoch": 0.14, "grad_norm": 6.23298100591561, "learning_rate": 1.933507941089517e-05, "loss": 1.3103, "step": 3556 }, { "epoch": 0.14, "grad_norm": 4.755909101930564, "learning_rate": 1.933461156053003e-05, "loss": 1.4024, "step": 3557 }, { "epoch": 0.14, "grad_norm": 5.131644604105007, "learning_rate": 1.9334143551293036e-05, "loss": 1.4308, "step": 3558 }, { "epoch": 0.14, "grad_norm": 5.73181816984935, "learning_rate": 1.9333675383192155e-05, "loss": 1.3873, "step": 3559 }, { "epoch": 0.14, "grad_norm": 4.127282418627163, "learning_rate": 1.9333207056235353e-05, "loss": 1.2409, "step": 3560 }, { "epoch": 0.14, "grad_norm": 4.385636718875025, "learning_rate": 1.933273857043061e-05, "loss": 1.1726, "step": 3561 }, { "epoch": 0.14, "grad_norm": 5.897729141243967, "learning_rate": 1.9332269925785886e-05, "loss": 1.2604, "step": 3562 }, { "epoch": 0.14, "grad_norm": 7.544996930364611, "learning_rate": 1.9331801122309165e-05, "loss": 1.419, "step": 3563 }, { "epoch": 0.14, "grad_norm": 4.316018173358168, "learning_rate": 1.933133216000842e-05, "loss": 1.3462, "step": 3564 }, { "epoch": 0.14, "grad_norm": 4.588162260642491, "learning_rate": 1.933086303889164e-05, "loss": 1.231, "step": 3565 }, { "epoch": 0.14, "grad_norm": 3.447751074374453, "learning_rate": 1.9330393758966808e-05, "loss": 1.186, "step": 3566 }, { "epoch": 0.14, "grad_norm": 3.2497593448365647, "learning_rate": 1.9329924320241905e-05, "loss": 1.1019, "step": 3567 }, { "epoch": 0.14, "grad_norm": 7.070092271897849, "learning_rate": 1.932945472272493e-05, "loss": 1.3323, "step": 3568 }, { "epoch": 0.14, "grad_norm": 6.954542640271877, "learning_rate": 1.932898496642386e-05, "loss": 1.3721, "step": 3569 }, { "epoch": 0.14, "grad_norm": 5.487115156244947, "learning_rate": 1.932851505134671e-05, "loss": 1.3476, "step": 3570 }, { "epoch": 0.14, "grad_norm": 4.843672548931465, "learning_rate": 1.9328044977501465e-05, "loss": 1.3099, "step": 3571 }, { "epoch": 0.14, "grad_norm": 5.081740237232703, "learning_rate": 1.9327574744896125e-05, "loss": 1.0944, "step": 3572 }, { "epoch": 0.14, "grad_norm": 6.368916255204438, "learning_rate": 1.9327104353538702e-05, "loss": 1.452, "step": 3573 }, { "epoch": 0.14, "grad_norm": 5.662836395515465, "learning_rate": 1.9326633803437197e-05, "loss": 1.1854, "step": 3574 }, { "epoch": 0.14, "grad_norm": 9.242383917318552, "learning_rate": 1.932616309459961e-05, "loss": 1.4741, "step": 3575 }, { "epoch": 0.14, "grad_norm": 5.690959504717035, "learning_rate": 1.9325692227033966e-05, "loss": 1.4331, "step": 3576 }, { "epoch": 0.14, "grad_norm": 5.080305621749548, "learning_rate": 1.9325221200748276e-05, "loss": 1.4988, "step": 3577 }, { "epoch": 0.14, "grad_norm": 6.598070086629607, "learning_rate": 1.9324750015750552e-05, "loss": 1.412, "step": 3578 }, { "epoch": 0.14, "grad_norm": 7.034487462878759, "learning_rate": 1.9324278672048812e-05, "loss": 1.5655, "step": 3579 }, { "epoch": 0.14, "grad_norm": 5.728796048200182, "learning_rate": 1.9323807169651086e-05, "loss": 1.1206, "step": 3580 }, { "epoch": 0.14, "grad_norm": 6.316272963710362, "learning_rate": 1.932333550856539e-05, "loss": 1.3065, "step": 3581 }, { "epoch": 0.14, "grad_norm": 8.933249928534773, "learning_rate": 1.932286368879976e-05, "loss": 1.468, "step": 3582 }, { "epoch": 0.14, "grad_norm": 5.402785082308136, "learning_rate": 1.9322391710362218e-05, "loss": 1.4435, "step": 3583 }, { "epoch": 0.14, "grad_norm": 5.816580075304339, "learning_rate": 1.9321919573260803e-05, "loss": 1.3101, "step": 3584 }, { "epoch": 0.14, "grad_norm": 6.5722148320596165, "learning_rate": 1.9321447277503548e-05, "loss": 1.3343, "step": 3585 }, { "epoch": 0.14, "grad_norm": 4.944792166548335, "learning_rate": 1.9320974823098496e-05, "loss": 1.133, "step": 3586 }, { "epoch": 0.14, "grad_norm": 4.5003254459739015, "learning_rate": 1.932050221005368e-05, "loss": 1.0491, "step": 3587 }, { "epoch": 0.14, "grad_norm": 8.075978641757509, "learning_rate": 1.932002943837715e-05, "loss": 1.5595, "step": 3588 }, { "epoch": 0.14, "grad_norm": 8.399880362003879, "learning_rate": 1.931955650807695e-05, "loss": 1.3211, "step": 3589 }, { "epoch": 0.14, "grad_norm": 5.10246550786571, "learning_rate": 1.9319083419161125e-05, "loss": 1.1488, "step": 3590 }, { "epoch": 0.14, "grad_norm": 8.288271731388031, "learning_rate": 1.9318610171637734e-05, "loss": 1.4547, "step": 3591 }, { "epoch": 0.14, "grad_norm": 5.923431648573905, "learning_rate": 1.931813676551483e-05, "loss": 1.3313, "step": 3592 }, { "epoch": 0.14, "grad_norm": 4.768131559573413, "learning_rate": 1.9317663200800468e-05, "loss": 1.2237, "step": 3593 }, { "epoch": 0.14, "grad_norm": 5.676869049064393, "learning_rate": 1.9317189477502703e-05, "loss": 1.3421, "step": 3594 }, { "epoch": 0.14, "grad_norm": 5.913947831849508, "learning_rate": 1.9316715595629613e-05, "loss": 1.486, "step": 3595 }, { "epoch": 0.14, "grad_norm": 7.120575406662452, "learning_rate": 1.9316241555189245e-05, "loss": 1.3117, "step": 3596 }, { "epoch": 0.14, "grad_norm": 4.852399156032039, "learning_rate": 1.931576735618968e-05, "loss": 1.3234, "step": 3597 }, { "epoch": 0.14, "grad_norm": 7.797687182123218, "learning_rate": 1.9315292998638983e-05, "loss": 1.2737, "step": 3598 }, { "epoch": 0.14, "grad_norm": 6.987113542103044, "learning_rate": 1.931481848254523e-05, "loss": 1.3531, "step": 3599 }, { "epoch": 0.15, "grad_norm": 5.824767118872224, "learning_rate": 1.9314343807916495e-05, "loss": 1.3492, "step": 3600 }, { "epoch": 0.15, "grad_norm": 4.596404174813881, "learning_rate": 1.9313868974760854e-05, "loss": 1.3098, "step": 3601 }, { "epoch": 0.15, "grad_norm": 8.472603541163481, "learning_rate": 1.9313393983086396e-05, "loss": 1.4864, "step": 3602 }, { "epoch": 0.15, "grad_norm": 6.130093005860818, "learning_rate": 1.93129188329012e-05, "loss": 1.2642, "step": 3603 }, { "epoch": 0.15, "grad_norm": 8.39032586262664, "learning_rate": 1.9312443524213356e-05, "loss": 1.2612, "step": 3604 }, { "epoch": 0.15, "grad_norm": 6.698058669876713, "learning_rate": 1.931196805703095e-05, "loss": 1.3444, "step": 3605 }, { "epoch": 0.15, "grad_norm": 4.715010709611209, "learning_rate": 1.9311492431362077e-05, "loss": 1.1977, "step": 3606 }, { "epoch": 0.15, "grad_norm": 5.020022402447842, "learning_rate": 1.9311016647214828e-05, "loss": 1.2581, "step": 3607 }, { "epoch": 0.15, "grad_norm": 4.5807724501818425, "learning_rate": 1.9310540704597307e-05, "loss": 1.209, "step": 3608 }, { "epoch": 0.15, "grad_norm": 6.668415692756213, "learning_rate": 1.931006460351761e-05, "loss": 1.3596, "step": 3609 }, { "epoch": 0.15, "grad_norm": 5.108981395578428, "learning_rate": 1.9309588343983844e-05, "loss": 0.9325, "step": 3610 }, { "epoch": 0.15, "grad_norm": 7.131992702399826, "learning_rate": 1.9309111926004107e-05, "loss": 1.27, "step": 3611 }, { "epoch": 0.15, "grad_norm": 6.042034294276209, "learning_rate": 1.9308635349586512e-05, "loss": 1.2546, "step": 3612 }, { "epoch": 0.15, "grad_norm": 7.312828712542513, "learning_rate": 1.9308158614739177e-05, "loss": 1.295, "step": 3613 }, { "epoch": 0.15, "grad_norm": 5.504771280480764, "learning_rate": 1.9307681721470202e-05, "loss": 1.3963, "step": 3614 }, { "epoch": 0.15, "grad_norm": 5.050241099585464, "learning_rate": 1.9307204669787716e-05, "loss": 1.0411, "step": 3615 }, { "epoch": 0.15, "grad_norm": 5.523750735929576, "learning_rate": 1.930672745969983e-05, "loss": 1.1821, "step": 3616 }, { "epoch": 0.15, "grad_norm": 6.872808469155872, "learning_rate": 1.9306250091214672e-05, "loss": 1.5609, "step": 3617 }, { "epoch": 0.15, "grad_norm": 6.717052631414772, "learning_rate": 1.930577256434036e-05, "loss": 1.6624, "step": 3618 }, { "epoch": 0.15, "grad_norm": 6.0116159017622985, "learning_rate": 1.930529487908503e-05, "loss": 1.2977, "step": 3619 }, { "epoch": 0.15, "grad_norm": 5.139016423607786, "learning_rate": 1.9304817035456804e-05, "loss": 1.3727, "step": 3620 }, { "epoch": 0.15, "grad_norm": 5.065429177253061, "learning_rate": 1.9304339033463816e-05, "loss": 1.009, "step": 3621 }, { "epoch": 0.15, "grad_norm": 4.579654315103523, "learning_rate": 1.9303860873114207e-05, "loss": 1.2083, "step": 3622 }, { "epoch": 0.15, "grad_norm": 7.322308122966746, "learning_rate": 1.930338255441611e-05, "loss": 1.0228, "step": 3623 }, { "epoch": 0.15, "grad_norm": 6.492213567756576, "learning_rate": 1.9302904077377668e-05, "loss": 1.2802, "step": 3624 }, { "epoch": 0.15, "grad_norm": 4.906902712563606, "learning_rate": 1.9302425442007025e-05, "loss": 1.1329, "step": 3625 }, { "epoch": 0.15, "grad_norm": 6.667151005763552, "learning_rate": 1.930194664831232e-05, "loss": 1.2075, "step": 3626 }, { "epoch": 0.15, "grad_norm": 7.431314148854089, "learning_rate": 1.9301467696301714e-05, "loss": 1.3872, "step": 3627 }, { "epoch": 0.15, "grad_norm": 7.2163390874354585, "learning_rate": 1.9300988585983354e-05, "loss": 1.2297, "step": 3628 }, { "epoch": 0.15, "grad_norm": 6.907424457661549, "learning_rate": 1.9300509317365387e-05, "loss": 1.2644, "step": 3629 }, { "epoch": 0.15, "grad_norm": 5.525480119631577, "learning_rate": 1.930002989045598e-05, "loss": 1.2747, "step": 3630 }, { "epoch": 0.15, "grad_norm": 6.510632692476909, "learning_rate": 1.9299550305263285e-05, "loss": 1.2827, "step": 3631 }, { "epoch": 0.15, "grad_norm": 6.043368806106176, "learning_rate": 1.929907056179547e-05, "loss": 1.3859, "step": 3632 }, { "epoch": 0.15, "grad_norm": 6.859155643966201, "learning_rate": 1.9298590660060698e-05, "loss": 0.9335, "step": 3633 }, { "epoch": 0.15, "grad_norm": 8.672651385402915, "learning_rate": 1.9298110600067134e-05, "loss": 1.2317, "step": 3634 }, { "epoch": 0.15, "grad_norm": 7.490775705702057, "learning_rate": 1.9297630381822956e-05, "loss": 1.526, "step": 3635 }, { "epoch": 0.15, "grad_norm": 5.323688208103202, "learning_rate": 1.9297150005336327e-05, "loss": 1.1783, "step": 3636 }, { "epoch": 0.15, "grad_norm": 10.851350916321758, "learning_rate": 1.9296669470615432e-05, "loss": 1.0798, "step": 3637 }, { "epoch": 0.15, "grad_norm": 4.977923548771917, "learning_rate": 1.9296188777668444e-05, "loss": 1.3619, "step": 3638 }, { "epoch": 0.15, "grad_norm": 5.804242503767528, "learning_rate": 1.9295707926503545e-05, "loss": 1.2245, "step": 3639 }, { "epoch": 0.15, "grad_norm": 5.668574121363205, "learning_rate": 1.929522691712892e-05, "loss": 1.366, "step": 3640 }, { "epoch": 0.15, "grad_norm": 5.984019835954693, "learning_rate": 1.929474574955276e-05, "loss": 0.9846, "step": 3641 }, { "epoch": 0.15, "grad_norm": 4.274181238217217, "learning_rate": 1.9294264423783246e-05, "loss": 1.1889, "step": 3642 }, { "epoch": 0.15, "grad_norm": 5.728230889564322, "learning_rate": 1.929378293982857e-05, "loss": 1.3088, "step": 3643 }, { "epoch": 0.15, "grad_norm": 4.970012673731528, "learning_rate": 1.9293301297696936e-05, "loss": 1.222, "step": 3644 }, { "epoch": 0.15, "grad_norm": 5.583279825897293, "learning_rate": 1.9292819497396532e-05, "loss": 1.3263, "step": 3645 }, { "epoch": 0.15, "grad_norm": 5.57970882903596, "learning_rate": 1.9292337538935568e-05, "loss": 1.3843, "step": 3646 }, { "epoch": 0.15, "grad_norm": 4.899798365587663, "learning_rate": 1.9291855422322233e-05, "loss": 1.261, "step": 3647 }, { "epoch": 0.15, "grad_norm": 6.2634117257346515, "learning_rate": 1.9291373147564743e-05, "loss": 1.4518, "step": 3648 }, { "epoch": 0.15, "grad_norm": 6.393498341951171, "learning_rate": 1.9290890714671304e-05, "loss": 1.128, "step": 3649 }, { "epoch": 0.15, "grad_norm": 5.716729883723744, "learning_rate": 1.9290408123650127e-05, "loss": 0.804, "step": 3650 }, { "epoch": 0.15, "grad_norm": 6.517149975457266, "learning_rate": 1.9289925374509422e-05, "loss": 1.1292, "step": 3651 }, { "epoch": 0.15, "grad_norm": 4.567846451666844, "learning_rate": 1.9289442467257408e-05, "loss": 1.3339, "step": 3652 }, { "epoch": 0.15, "grad_norm": 3.984416937200025, "learning_rate": 1.9288959401902306e-05, "loss": 1.0181, "step": 3653 }, { "epoch": 0.15, "grad_norm": 5.895444387772017, "learning_rate": 1.9288476178452333e-05, "loss": 1.3377, "step": 3654 }, { "epoch": 0.15, "grad_norm": 5.582332886043637, "learning_rate": 1.9287992796915716e-05, "loss": 1.337, "step": 3655 }, { "epoch": 0.15, "grad_norm": 6.772375719885593, "learning_rate": 1.9287509257300684e-05, "loss": 1.3948, "step": 3656 }, { "epoch": 0.15, "grad_norm": 6.39170452683877, "learning_rate": 1.9287025559615463e-05, "loss": 1.0504, "step": 3657 }, { "epoch": 0.15, "grad_norm": 7.346128561062332, "learning_rate": 1.9286541703868287e-05, "loss": 1.4128, "step": 3658 }, { "epoch": 0.15, "grad_norm": 4.711095436137454, "learning_rate": 1.928605769006739e-05, "loss": 1.3212, "step": 3659 }, { "epoch": 0.15, "grad_norm": 6.849773416393885, "learning_rate": 1.9285573518221008e-05, "loss": 1.0052, "step": 3660 }, { "epoch": 0.15, "grad_norm": 5.750127909569548, "learning_rate": 1.9285089188337385e-05, "loss": 1.4988, "step": 3661 }, { "epoch": 0.15, "grad_norm": 3.486853898468992, "learning_rate": 1.9284604700424764e-05, "loss": 1.1074, "step": 3662 }, { "epoch": 0.15, "grad_norm": 7.283655832184039, "learning_rate": 1.9284120054491393e-05, "loss": 1.3574, "step": 3663 }, { "epoch": 0.15, "grad_norm": 5.893076616364612, "learning_rate": 1.9283635250545514e-05, "loss": 1.3644, "step": 3664 }, { "epoch": 0.15, "grad_norm": 7.812246406475565, "learning_rate": 1.928315028859538e-05, "loss": 1.4042, "step": 3665 }, { "epoch": 0.15, "grad_norm": 7.352552300886805, "learning_rate": 1.928266516864925e-05, "loss": 1.3715, "step": 3666 }, { "epoch": 0.15, "grad_norm": 6.31070807562294, "learning_rate": 1.9282179890715377e-05, "loss": 1.4893, "step": 3667 }, { "epoch": 0.15, "grad_norm": 8.452256551316395, "learning_rate": 1.9281694454802017e-05, "loss": 1.244, "step": 3668 }, { "epoch": 0.15, "grad_norm": 6.17309529174972, "learning_rate": 1.9281208860917436e-05, "loss": 1.4553, "step": 3669 }, { "epoch": 0.15, "grad_norm": 4.735994118998276, "learning_rate": 1.9280723109069898e-05, "loss": 1.2584, "step": 3670 }, { "epoch": 0.15, "grad_norm": 6.648520444258801, "learning_rate": 1.9280237199267672e-05, "loss": 1.3496, "step": 3671 }, { "epoch": 0.15, "grad_norm": 5.56478061535713, "learning_rate": 1.9279751131519027e-05, "loss": 1.0981, "step": 3672 }, { "epoch": 0.15, "grad_norm": 5.2071165548835205, "learning_rate": 1.927926490583223e-05, "loss": 1.3553, "step": 3673 }, { "epoch": 0.15, "grad_norm": 6.079019620365373, "learning_rate": 1.9278778522215567e-05, "loss": 1.2601, "step": 3674 }, { "epoch": 0.15, "grad_norm": 5.122498246668452, "learning_rate": 1.9278291980677308e-05, "loss": 1.35, "step": 3675 }, { "epoch": 0.15, "grad_norm": 4.790148936939341, "learning_rate": 1.9277805281225734e-05, "loss": 1.3816, "step": 3676 }, { "epoch": 0.15, "grad_norm": 5.27965247935062, "learning_rate": 1.9277318423869133e-05, "loss": 1.2692, "step": 3677 }, { "epoch": 0.15, "grad_norm": 5.817572267617018, "learning_rate": 1.9276831408615788e-05, "loss": 1.0147, "step": 3678 }, { "epoch": 0.15, "grad_norm": 4.875090241137434, "learning_rate": 1.9276344235473988e-05, "loss": 1.0579, "step": 3679 }, { "epoch": 0.15, "grad_norm": 4.402732151671582, "learning_rate": 1.9275856904452026e-05, "loss": 1.1325, "step": 3680 }, { "epoch": 0.15, "grad_norm": 4.693677607393764, "learning_rate": 1.9275369415558196e-05, "loss": 1.3578, "step": 3681 }, { "epoch": 0.15, "grad_norm": 5.986115359918962, "learning_rate": 1.9274881768800793e-05, "loss": 1.3884, "step": 3682 }, { "epoch": 0.15, "grad_norm": 6.665038641187422, "learning_rate": 1.9274393964188118e-05, "loss": 0.9523, "step": 3683 }, { "epoch": 0.15, "grad_norm": 3.7636628425190217, "learning_rate": 1.9273906001728474e-05, "loss": 1.1186, "step": 3684 }, { "epoch": 0.15, "grad_norm": 6.026186379803525, "learning_rate": 1.9273417881430163e-05, "loss": 1.1833, "step": 3685 }, { "epoch": 0.15, "grad_norm": 7.033531731303629, "learning_rate": 1.9272929603301496e-05, "loss": 1.0802, "step": 3686 }, { "epoch": 0.15, "grad_norm": 6.328588956025378, "learning_rate": 1.927244116735078e-05, "loss": 1.165, "step": 3687 }, { "epoch": 0.15, "grad_norm": 6.018169671147362, "learning_rate": 1.927195257358633e-05, "loss": 1.4584, "step": 3688 }, { "epoch": 0.15, "grad_norm": 5.956268215344535, "learning_rate": 1.9271463822016465e-05, "loss": 1.339, "step": 3689 }, { "epoch": 0.15, "grad_norm": 5.075293777667218, "learning_rate": 1.9270974912649498e-05, "loss": 1.18, "step": 3690 }, { "epoch": 0.15, "grad_norm": 4.452895779661805, "learning_rate": 1.9270485845493753e-05, "loss": 1.2107, "step": 3691 }, { "epoch": 0.15, "grad_norm": 5.398338202271196, "learning_rate": 1.9269996620557553e-05, "loss": 1.3696, "step": 3692 }, { "epoch": 0.15, "grad_norm": 5.368699449515494, "learning_rate": 1.9269507237849225e-05, "loss": 1.0349, "step": 3693 }, { "epoch": 0.15, "grad_norm": 6.008223011938114, "learning_rate": 1.9269017697377094e-05, "loss": 1.3826, "step": 3694 }, { "epoch": 0.15, "grad_norm": 8.213925188757125, "learning_rate": 1.9268527999149498e-05, "loss": 1.525, "step": 3695 }, { "epoch": 0.15, "grad_norm": 8.000479089852423, "learning_rate": 1.9268038143174768e-05, "loss": 1.5339, "step": 3696 }, { "epoch": 0.15, "grad_norm": 4.936056037864134, "learning_rate": 1.926754812946124e-05, "loss": 1.2142, "step": 3697 }, { "epoch": 0.15, "grad_norm": 5.883685056970128, "learning_rate": 1.9267057958017258e-05, "loss": 1.2474, "step": 3698 }, { "epoch": 0.15, "grad_norm": 6.103146379228568, "learning_rate": 1.926656762885116e-05, "loss": 1.1131, "step": 3699 }, { "epoch": 0.15, "grad_norm": 6.432743471184644, "learning_rate": 1.9266077141971296e-05, "loss": 1.2827, "step": 3700 }, { "epoch": 0.15, "grad_norm": 4.9905415171978555, "learning_rate": 1.9265586497386013e-05, "loss": 1.2263, "step": 3701 }, { "epoch": 0.15, "grad_norm": 6.97507455174489, "learning_rate": 1.926509569510366e-05, "loss": 1.3248, "step": 3702 }, { "epoch": 0.15, "grad_norm": 6.9220059987667675, "learning_rate": 1.9264604735132586e-05, "loss": 1.4234, "step": 3703 }, { "epoch": 0.15, "grad_norm": 4.754350554195363, "learning_rate": 1.9264113617481153e-05, "loss": 1.1399, "step": 3704 }, { "epoch": 0.15, "grad_norm": 8.178642875162765, "learning_rate": 1.926362234215772e-05, "loss": 1.6125, "step": 3705 }, { "epoch": 0.15, "grad_norm": 4.6604005382348515, "learning_rate": 1.926313090917064e-05, "loss": 1.3311, "step": 3706 }, { "epoch": 0.15, "grad_norm": 6.070124717070623, "learning_rate": 1.9262639318528292e-05, "loss": 1.2972, "step": 3707 }, { "epoch": 0.15, "grad_norm": 5.731064718254311, "learning_rate": 1.9262147570239028e-05, "loss": 1.2734, "step": 3708 }, { "epoch": 0.15, "grad_norm": 6.429318271011033, "learning_rate": 1.9261655664311226e-05, "loss": 1.2221, "step": 3709 }, { "epoch": 0.15, "grad_norm": 5.302304958869578, "learning_rate": 1.9261163600753256e-05, "loss": 1.4537, "step": 3710 }, { "epoch": 0.15, "grad_norm": 8.533648539056662, "learning_rate": 1.926067137957349e-05, "loss": 1.2812, "step": 3711 }, { "epoch": 0.15, "grad_norm": 7.048886952960785, "learning_rate": 1.926017900078031e-05, "loss": 1.5396, "step": 3712 }, { "epoch": 0.15, "grad_norm": 6.265197260912028, "learning_rate": 1.9259686464382093e-05, "loss": 1.4943, "step": 3713 }, { "epoch": 0.15, "grad_norm": 5.8310756397495345, "learning_rate": 1.925919377038722e-05, "loss": 1.5629, "step": 3714 }, { "epoch": 0.15, "grad_norm": 5.251915872759406, "learning_rate": 1.9258700918804084e-05, "loss": 1.0858, "step": 3715 }, { "epoch": 0.15, "grad_norm": 6.005725986569699, "learning_rate": 1.9258207909641066e-05, "loss": 1.3501, "step": 3716 }, { "epoch": 0.15, "grad_norm": 5.557941995569214, "learning_rate": 1.925771474290656e-05, "loss": 1.3268, "step": 3717 }, { "epoch": 0.15, "grad_norm": 5.138057897210028, "learning_rate": 1.9257221418608954e-05, "loss": 1.2849, "step": 3718 }, { "epoch": 0.15, "grad_norm": 6.488108963349086, "learning_rate": 1.9256727936756652e-05, "loss": 1.5539, "step": 3719 }, { "epoch": 0.15, "grad_norm": 4.817781541738744, "learning_rate": 1.925623429735805e-05, "loss": 1.088, "step": 3720 }, { "epoch": 0.15, "grad_norm": 6.245826881472131, "learning_rate": 1.9255740500421548e-05, "loss": 1.4417, "step": 3721 }, { "epoch": 0.15, "grad_norm": 7.724485877740637, "learning_rate": 1.9255246545955554e-05, "loss": 1.4726, "step": 3722 }, { "epoch": 0.15, "grad_norm": 5.767269610676141, "learning_rate": 1.925475243396847e-05, "loss": 1.2265, "step": 3723 }, { "epoch": 0.15, "grad_norm": 7.67284850788565, "learning_rate": 1.9254258164468707e-05, "loss": 1.2075, "step": 3724 }, { "epoch": 0.15, "grad_norm": 4.996101563161321, "learning_rate": 1.9253763737464683e-05, "loss": 1.2114, "step": 3725 }, { "epoch": 0.15, "grad_norm": 5.241654358941049, "learning_rate": 1.9253269152964805e-05, "loss": 1.2924, "step": 3726 }, { "epoch": 0.15, "grad_norm": 6.0739761135009545, "learning_rate": 1.9252774410977494e-05, "loss": 1.19, "step": 3727 }, { "epoch": 0.15, "grad_norm": 4.627544700876754, "learning_rate": 1.925227951151117e-05, "loss": 1.114, "step": 3728 }, { "epoch": 0.15, "grad_norm": 5.6427805283444705, "learning_rate": 1.9251784454574262e-05, "loss": 1.144, "step": 3729 }, { "epoch": 0.15, "grad_norm": 4.9774769806845205, "learning_rate": 1.9251289240175182e-05, "loss": 1.3762, "step": 3730 }, { "epoch": 0.15, "grad_norm": 6.578277478599333, "learning_rate": 1.925079386832237e-05, "loss": 1.46, "step": 3731 }, { "epoch": 0.15, "grad_norm": 7.804664393037738, "learning_rate": 1.9250298339024255e-05, "loss": 1.3968, "step": 3732 }, { "epoch": 0.15, "grad_norm": 6.208521764488242, "learning_rate": 1.9249802652289267e-05, "loss": 1.3355, "step": 3733 }, { "epoch": 0.15, "grad_norm": 6.182292652665391, "learning_rate": 1.9249306808125846e-05, "loss": 1.3366, "step": 3734 }, { "epoch": 0.15, "grad_norm": 5.743112719592296, "learning_rate": 1.924881080654243e-05, "loss": 1.4569, "step": 3735 }, { "epoch": 0.15, "grad_norm": 6.450416471870751, "learning_rate": 1.9248314647547457e-05, "loss": 1.5415, "step": 3736 }, { "epoch": 0.15, "grad_norm": 6.440774616437502, "learning_rate": 1.924781833114938e-05, "loss": 1.4191, "step": 3737 }, { "epoch": 0.15, "grad_norm": 3.810002646614609, "learning_rate": 1.9247321857356634e-05, "loss": 1.1696, "step": 3738 }, { "epoch": 0.15, "grad_norm": 4.852139207604734, "learning_rate": 1.924682522617768e-05, "loss": 1.3536, "step": 3739 }, { "epoch": 0.15, "grad_norm": 6.032221007590808, "learning_rate": 1.924632843762097e-05, "loss": 1.5368, "step": 3740 }, { "epoch": 0.15, "grad_norm": 8.698590505776723, "learning_rate": 1.9245831491694955e-05, "loss": 1.3329, "step": 3741 }, { "epoch": 0.15, "grad_norm": 5.576343666943949, "learning_rate": 1.924533438840809e-05, "loss": 1.2937, "step": 3742 }, { "epoch": 0.15, "grad_norm": 6.188691956455258, "learning_rate": 1.924483712776884e-05, "loss": 1.3262, "step": 3743 }, { "epoch": 0.15, "grad_norm": 6.551430712996993, "learning_rate": 1.9244339709785666e-05, "loss": 1.1791, "step": 3744 }, { "epoch": 0.15, "grad_norm": 5.21766330659627, "learning_rate": 1.9243842134467035e-05, "loss": 1.378, "step": 3745 }, { "epoch": 0.15, "grad_norm": 6.1124838385395535, "learning_rate": 1.9243344401821415e-05, "loss": 1.3612, "step": 3746 }, { "epoch": 0.15, "grad_norm": 5.686074514033127, "learning_rate": 1.9242846511857277e-05, "loss": 1.2782, "step": 3747 }, { "epoch": 0.15, "grad_norm": 6.26241414437536, "learning_rate": 1.92423484645831e-05, "loss": 1.3612, "step": 3748 }, { "epoch": 0.15, "grad_norm": 5.519000917290419, "learning_rate": 1.9241850260007354e-05, "loss": 1.3866, "step": 3749 }, { "epoch": 0.15, "grad_norm": 7.375136986811197, "learning_rate": 1.9241351898138522e-05, "loss": 1.5753, "step": 3750 }, { "epoch": 0.15, "grad_norm": 4.863259096332787, "learning_rate": 1.9240853378985084e-05, "loss": 1.1668, "step": 3751 }, { "epoch": 0.15, "grad_norm": 4.786533408740255, "learning_rate": 1.9240354702555524e-05, "loss": 1.3227, "step": 3752 }, { "epoch": 0.15, "grad_norm": 4.820147376534494, "learning_rate": 1.923985586885833e-05, "loss": 1.1194, "step": 3753 }, { "epoch": 0.15, "grad_norm": 8.744365054160161, "learning_rate": 1.9239356877901993e-05, "loss": 1.4594, "step": 3754 }, { "epoch": 0.15, "grad_norm": 4.506276529928117, "learning_rate": 1.9238857729695007e-05, "loss": 1.3011, "step": 3755 }, { "epoch": 0.15, "grad_norm": 6.317328298623736, "learning_rate": 1.923835842424586e-05, "loss": 1.196, "step": 3756 }, { "epoch": 0.15, "grad_norm": 6.359263538222685, "learning_rate": 1.9237858961563063e-05, "loss": 1.5914, "step": 3757 }, { "epoch": 0.15, "grad_norm": 4.362459258567222, "learning_rate": 1.9237359341655108e-05, "loss": 1.171, "step": 3758 }, { "epoch": 0.15, "grad_norm": 5.270209784498621, "learning_rate": 1.923685956453049e-05, "loss": 1.2476, "step": 3759 }, { "epoch": 0.15, "grad_norm": 5.891320776726994, "learning_rate": 1.9236359630197733e-05, "loss": 1.5037, "step": 3760 }, { "epoch": 0.15, "grad_norm": 7.194767654159692, "learning_rate": 1.923585953866534e-05, "loss": 1.6077, "step": 3761 }, { "epoch": 0.15, "grad_norm": 5.279986118357161, "learning_rate": 1.9235359289941814e-05, "loss": 1.4265, "step": 3762 }, { "epoch": 0.15, "grad_norm": 4.878088776021628, "learning_rate": 1.9234858884035676e-05, "loss": 1.1897, "step": 3763 }, { "epoch": 0.15, "grad_norm": 5.550540300259057, "learning_rate": 1.923435832095544e-05, "loss": 1.084, "step": 3764 }, { "epoch": 0.15, "grad_norm": 5.752809656776331, "learning_rate": 1.9233857600709628e-05, "loss": 1.1469, "step": 3765 }, { "epoch": 0.15, "grad_norm": 7.669034517079199, "learning_rate": 1.923335672330676e-05, "loss": 1.5183, "step": 3766 }, { "epoch": 0.15, "grad_norm": 4.145025531869675, "learning_rate": 1.9232855688755365e-05, "loss": 1.1538, "step": 3767 }, { "epoch": 0.15, "grad_norm": 8.423097826020777, "learning_rate": 1.923235449706396e-05, "loss": 1.3414, "step": 3768 }, { "epoch": 0.15, "grad_norm": 6.9319014618644275, "learning_rate": 1.9231853148241092e-05, "loss": 1.0655, "step": 3769 }, { "epoch": 0.15, "grad_norm": 4.919737016052818, "learning_rate": 1.9231351642295278e-05, "loss": 1.3515, "step": 3770 }, { "epoch": 0.15, "grad_norm": 4.729249224771176, "learning_rate": 1.923084997923506e-05, "loss": 1.2049, "step": 3771 }, { "epoch": 0.15, "grad_norm": 7.034395189026535, "learning_rate": 1.923034815906898e-05, "loss": 1.4821, "step": 3772 }, { "epoch": 0.15, "grad_norm": 6.974134307215962, "learning_rate": 1.9229846181805565e-05, "loss": 1.2205, "step": 3773 }, { "epoch": 0.15, "grad_norm": 5.192448997433581, "learning_rate": 1.9229344047453374e-05, "loss": 1.5085, "step": 3774 }, { "epoch": 0.15, "grad_norm": 4.798058468258428, "learning_rate": 1.9228841756020948e-05, "loss": 1.376, "step": 3775 }, { "epoch": 0.15, "grad_norm": 5.387259076644892, "learning_rate": 1.922833930751683e-05, "loss": 1.3605, "step": 3776 }, { "epoch": 0.15, "grad_norm": 6.330209575658079, "learning_rate": 1.9227836701949577e-05, "loss": 1.3744, "step": 3777 }, { "epoch": 0.15, "grad_norm": 5.979954302492915, "learning_rate": 1.9227333939327746e-05, "loss": 1.2736, "step": 3778 }, { "epoch": 0.15, "grad_norm": 4.645129768018499, "learning_rate": 1.9226831019659887e-05, "loss": 1.2831, "step": 3779 }, { "epoch": 0.15, "grad_norm": 4.319076667147134, "learning_rate": 1.9226327942954566e-05, "loss": 1.1212, "step": 3780 }, { "epoch": 0.15, "grad_norm": 10.615559871893295, "learning_rate": 1.922582470922034e-05, "loss": 1.4999, "step": 3781 }, { "epoch": 0.15, "grad_norm": 6.9468578876842, "learning_rate": 1.922532131846578e-05, "loss": 1.308, "step": 3782 }, { "epoch": 0.15, "grad_norm": 5.688015286109089, "learning_rate": 1.9224817770699445e-05, "loss": 1.2843, "step": 3783 }, { "epoch": 0.15, "grad_norm": 5.641621215101204, "learning_rate": 1.922431406592991e-05, "loss": 0.9755, "step": 3784 }, { "epoch": 0.15, "grad_norm": 4.894752119471582, "learning_rate": 1.922381020416575e-05, "loss": 1.2113, "step": 3785 }, { "epoch": 0.15, "grad_norm": 6.097364666716114, "learning_rate": 1.922330618541554e-05, "loss": 1.2074, "step": 3786 }, { "epoch": 0.15, "grad_norm": 4.647640820032474, "learning_rate": 1.922280200968785e-05, "loss": 1.2878, "step": 3787 }, { "epoch": 0.15, "grad_norm": 4.3250466723271215, "learning_rate": 1.922229767699127e-05, "loss": 1.2344, "step": 3788 }, { "epoch": 0.15, "grad_norm": 5.123384543670834, "learning_rate": 1.9221793187334378e-05, "loss": 1.3558, "step": 3789 }, { "epoch": 0.15, "grad_norm": 5.512702869317206, "learning_rate": 1.9221288540725768e-05, "loss": 1.4625, "step": 3790 }, { "epoch": 0.15, "grad_norm": 5.1838517742396, "learning_rate": 1.922078373717402e-05, "loss": 1.1259, "step": 3791 }, { "epoch": 0.15, "grad_norm": 4.995466628782504, "learning_rate": 1.9220278776687735e-05, "loss": 1.5003, "step": 3792 }, { "epoch": 0.15, "grad_norm": 5.151449390600595, "learning_rate": 1.9219773659275495e-05, "loss": 1.4624, "step": 3793 }, { "epoch": 0.15, "grad_norm": 5.1720808761756, "learning_rate": 1.9219268384945907e-05, "loss": 1.3972, "step": 3794 }, { "epoch": 0.15, "grad_norm": 4.465595626878422, "learning_rate": 1.9218762953707568e-05, "loss": 1.1815, "step": 3795 }, { "epoch": 0.15, "grad_norm": 5.158897803475003, "learning_rate": 1.9218257365569077e-05, "loss": 1.1699, "step": 3796 }, { "epoch": 0.15, "grad_norm": 12.230357586269678, "learning_rate": 1.9217751620539044e-05, "loss": 1.8189, "step": 3797 }, { "epoch": 0.15, "grad_norm": 5.730949594137556, "learning_rate": 1.9217245718626076e-05, "loss": 0.9981, "step": 3798 }, { "epoch": 0.15, "grad_norm": 7.667613083579999, "learning_rate": 1.9216739659838776e-05, "loss": 1.353, "step": 3799 }, { "epoch": 0.15, "grad_norm": 6.340806530774783, "learning_rate": 1.9216233444185762e-05, "loss": 1.3024, "step": 3800 }, { "epoch": 0.15, "grad_norm": 8.163995474749633, "learning_rate": 1.9215727071675654e-05, "loss": 1.5443, "step": 3801 }, { "epoch": 0.15, "grad_norm": 6.665633643031014, "learning_rate": 1.9215220542317066e-05, "loss": 1.4613, "step": 3802 }, { "epoch": 0.15, "grad_norm": 7.173796924601962, "learning_rate": 1.9214713856118615e-05, "loss": 1.3302, "step": 3803 }, { "epoch": 0.15, "grad_norm": 6.605443035063351, "learning_rate": 1.9214207013088935e-05, "loss": 1.5524, "step": 3804 }, { "epoch": 0.15, "grad_norm": 4.658690125701912, "learning_rate": 1.9213700013236644e-05, "loss": 1.2231, "step": 3805 }, { "epoch": 0.15, "grad_norm": 4.993704515848969, "learning_rate": 1.9213192856570368e-05, "loss": 1.2258, "step": 3806 }, { "epoch": 0.15, "grad_norm": 7.760948090715163, "learning_rate": 1.921268554309875e-05, "loss": 1.5158, "step": 3807 }, { "epoch": 0.15, "grad_norm": 7.067065161736737, "learning_rate": 1.9212178072830416e-05, "loss": 1.4492, "step": 3808 }, { "epoch": 0.15, "grad_norm": 6.472669688632329, "learning_rate": 1.9211670445774e-05, "loss": 1.1818, "step": 3809 }, { "epoch": 0.15, "grad_norm": 6.714497295994995, "learning_rate": 1.9211162661938155e-05, "loss": 1.3543, "step": 3810 }, { "epoch": 0.15, "grad_norm": 6.577241083585308, "learning_rate": 1.921065472133151e-05, "loss": 1.3984, "step": 3811 }, { "epoch": 0.15, "grad_norm": 4.700557794574102, "learning_rate": 1.9210146623962718e-05, "loss": 1.3243, "step": 3812 }, { "epoch": 0.15, "grad_norm": 9.80797467472245, "learning_rate": 1.920963836984042e-05, "loss": 1.3319, "step": 3813 }, { "epoch": 0.15, "grad_norm": 10.749698605675931, "learning_rate": 1.920912995897327e-05, "loss": 1.3987, "step": 3814 }, { "epoch": 0.15, "grad_norm": 6.138633510085447, "learning_rate": 1.9208621391369922e-05, "loss": 1.0934, "step": 3815 }, { "epoch": 0.15, "grad_norm": 8.469142077486504, "learning_rate": 1.920811266703903e-05, "loss": 1.3873, "step": 3816 }, { "epoch": 0.15, "grad_norm": 6.022262116622826, "learning_rate": 1.920760378598925e-05, "loss": 1.1021, "step": 3817 }, { "epoch": 0.15, "grad_norm": 8.20752288032026, "learning_rate": 1.9207094748229252e-05, "loss": 0.9833, "step": 3818 }, { "epoch": 0.15, "grad_norm": 8.135915586436894, "learning_rate": 1.920658555376769e-05, "loss": 1.1488, "step": 3819 }, { "epoch": 0.15, "grad_norm": 7.535549625733738, "learning_rate": 1.9206076202613232e-05, "loss": 1.1257, "step": 3820 }, { "epoch": 0.15, "grad_norm": 7.640631376982391, "learning_rate": 1.920556669477455e-05, "loss": 1.0853, "step": 3821 }, { "epoch": 0.15, "grad_norm": 12.323387246951501, "learning_rate": 1.9205057030260316e-05, "loss": 1.5059, "step": 3822 }, { "epoch": 0.15, "grad_norm": 5.690342472497452, "learning_rate": 1.92045472090792e-05, "loss": 1.3229, "step": 3823 }, { "epoch": 0.15, "grad_norm": 10.130675119792466, "learning_rate": 1.9204037231239882e-05, "loss": 1.5292, "step": 3824 }, { "epoch": 0.15, "grad_norm": 9.19741804077045, "learning_rate": 1.9203527096751045e-05, "loss": 1.7438, "step": 3825 }, { "epoch": 0.15, "grad_norm": 8.157199695102383, "learning_rate": 1.9203016805621364e-05, "loss": 1.6524, "step": 3826 }, { "epoch": 0.15, "grad_norm": 6.404130745570322, "learning_rate": 1.920250635785953e-05, "loss": 1.551, "step": 3827 }, { "epoch": 0.15, "grad_norm": 10.738529910094059, "learning_rate": 1.9201995753474224e-05, "loss": 1.3145, "step": 3828 }, { "epoch": 0.15, "grad_norm": 5.772856275114295, "learning_rate": 1.920148499247414e-05, "loss": 1.3349, "step": 3829 }, { "epoch": 0.15, "grad_norm": 6.074887256067584, "learning_rate": 1.920097407486798e-05, "loss": 1.4214, "step": 3830 }, { "epoch": 0.15, "grad_norm": 4.697891003247016, "learning_rate": 1.9200463000664422e-05, "loss": 1.3174, "step": 3831 }, { "epoch": 0.15, "grad_norm": 5.128807077075823, "learning_rate": 1.9199951769872174e-05, "loss": 1.2457, "step": 3832 }, { "epoch": 0.15, "grad_norm": 7.792345818206707, "learning_rate": 1.919944038249994e-05, "loss": 1.6631, "step": 3833 }, { "epoch": 0.15, "grad_norm": 6.974756661409931, "learning_rate": 1.919892883855642e-05, "loss": 1.0845, "step": 3834 }, { "epoch": 0.15, "grad_norm": 7.436150015078166, "learning_rate": 1.919841713805032e-05, "loss": 1.3107, "step": 3835 }, { "epoch": 0.15, "grad_norm": 4.372083409615469, "learning_rate": 1.9197905280990348e-05, "loss": 1.1329, "step": 3836 }, { "epoch": 0.15, "grad_norm": 3.8916240096357244, "learning_rate": 1.919739326738522e-05, "loss": 1.1311, "step": 3837 }, { "epoch": 0.15, "grad_norm": 7.788597186741629, "learning_rate": 1.9196881097243642e-05, "loss": 1.1001, "step": 3838 }, { "epoch": 0.15, "grad_norm": 9.21082485911273, "learning_rate": 1.919636877057434e-05, "loss": 1.7171, "step": 3839 }, { "epoch": 0.15, "grad_norm": 5.436244014812157, "learning_rate": 1.9195856287386027e-05, "loss": 1.3476, "step": 3840 }, { "epoch": 0.15, "grad_norm": 7.0814037512354915, "learning_rate": 1.919534364768743e-05, "loss": 1.1749, "step": 3841 }, { "epoch": 0.15, "grad_norm": 5.1856133308859365, "learning_rate": 1.9194830851487273e-05, "loss": 1.182, "step": 3842 }, { "epoch": 0.15, "grad_norm": 4.65762750758182, "learning_rate": 1.919431789879428e-05, "loss": 1.1558, "step": 3843 }, { "epoch": 0.15, "grad_norm": 8.007061909464161, "learning_rate": 1.9193804789617188e-05, "loss": 1.5005, "step": 3844 }, { "epoch": 0.15, "grad_norm": 5.5035270428692735, "learning_rate": 1.9193291523964722e-05, "loss": 1.4588, "step": 3845 }, { "epoch": 0.15, "grad_norm": 8.086662395379156, "learning_rate": 1.9192778101845622e-05, "loss": 1.3039, "step": 3846 }, { "epoch": 0.15, "grad_norm": 5.910266420009931, "learning_rate": 1.9192264523268624e-05, "loss": 1.1328, "step": 3847 }, { "epoch": 0.15, "grad_norm": 4.698133319571874, "learning_rate": 1.9191750788242472e-05, "loss": 1.2088, "step": 3848 }, { "epoch": 0.16, "grad_norm": 7.33086963615655, "learning_rate": 1.919123689677591e-05, "loss": 1.7101, "step": 3849 }, { "epoch": 0.16, "grad_norm": 5.356822169575082, "learning_rate": 1.9190722848877683e-05, "loss": 1.3061, "step": 3850 }, { "epoch": 0.16, "grad_norm": 7.249389388081088, "learning_rate": 1.9190208644556538e-05, "loss": 1.3839, "step": 3851 }, { "epoch": 0.16, "grad_norm": 7.897460340172067, "learning_rate": 1.918969428382123e-05, "loss": 1.5199, "step": 3852 }, { "epoch": 0.16, "grad_norm": 6.98671194544257, "learning_rate": 1.9189179766680507e-05, "loss": 1.1599, "step": 3853 }, { "epoch": 0.16, "grad_norm": 4.850408992493337, "learning_rate": 1.9188665093143133e-05, "loss": 1.3345, "step": 3854 }, { "epoch": 0.16, "grad_norm": 6.391009796388568, "learning_rate": 1.9188150263217864e-05, "loss": 1.5095, "step": 3855 }, { "epoch": 0.16, "grad_norm": 5.838441234242977, "learning_rate": 1.9187635276913463e-05, "loss": 1.4144, "step": 3856 }, { "epoch": 0.16, "grad_norm": 6.034984575417448, "learning_rate": 1.9187120134238696e-05, "loss": 1.2125, "step": 3857 }, { "epoch": 0.16, "grad_norm": 4.822944790439971, "learning_rate": 1.9186604835202324e-05, "loss": 1.0103, "step": 3858 }, { "epoch": 0.16, "grad_norm": 6.889684936936214, "learning_rate": 1.918608937981313e-05, "loss": 1.3846, "step": 3859 }, { "epoch": 0.16, "grad_norm": 9.21692384744174, "learning_rate": 1.9185573768079876e-05, "loss": 1.2952, "step": 3860 }, { "epoch": 0.16, "grad_norm": 8.809230616007657, "learning_rate": 1.9185058000011342e-05, "loss": 1.401, "step": 3861 }, { "epoch": 0.16, "grad_norm": 4.576857951911166, "learning_rate": 1.9184542075616302e-05, "loss": 1.2086, "step": 3862 }, { "epoch": 0.16, "grad_norm": 6.734310243437038, "learning_rate": 1.9184025994903544e-05, "loss": 1.4372, "step": 3863 }, { "epoch": 0.16, "grad_norm": 7.837163982255326, "learning_rate": 1.9183509757881843e-05, "loss": 1.3568, "step": 3864 }, { "epoch": 0.16, "grad_norm": 8.12725320200854, "learning_rate": 1.9182993364559996e-05, "loss": 1.6745, "step": 3865 }, { "epoch": 0.16, "grad_norm": 4.633506535689906, "learning_rate": 1.918247681494678e-05, "loss": 0.9801, "step": 3866 }, { "epoch": 0.16, "grad_norm": 6.700271228741421, "learning_rate": 1.9181960109050993e-05, "loss": 1.3088, "step": 3867 }, { "epoch": 0.16, "grad_norm": 6.464754223041804, "learning_rate": 1.918144324688143e-05, "loss": 1.2367, "step": 3868 }, { "epoch": 0.16, "grad_norm": 4.913514429796562, "learning_rate": 1.9180926228446886e-05, "loss": 1.1111, "step": 3869 }, { "epoch": 0.16, "grad_norm": 6.537275438626429, "learning_rate": 1.9180409053756158e-05, "loss": 1.4125, "step": 3870 }, { "epoch": 0.16, "grad_norm": 6.738478179416889, "learning_rate": 1.9179891722818054e-05, "loss": 1.6166, "step": 3871 }, { "epoch": 0.16, "grad_norm": 6.827985215295418, "learning_rate": 1.9179374235641375e-05, "loss": 1.3614, "step": 3872 }, { "epoch": 0.16, "grad_norm": 9.622787491929182, "learning_rate": 1.9178856592234927e-05, "loss": 1.6012, "step": 3873 }, { "epoch": 0.16, "grad_norm": 6.242189412616971, "learning_rate": 1.9178338792607522e-05, "loss": 1.1982, "step": 3874 }, { "epoch": 0.16, "grad_norm": 6.4039197343953225, "learning_rate": 1.9177820836767974e-05, "loss": 1.4711, "step": 3875 }, { "epoch": 0.16, "grad_norm": 7.390833561885452, "learning_rate": 1.9177302724725095e-05, "loss": 1.4224, "step": 3876 }, { "epoch": 0.16, "grad_norm": 6.728029590502367, "learning_rate": 1.917678445648771e-05, "loss": 1.135, "step": 3877 }, { "epoch": 0.16, "grad_norm": 6.0386771633276926, "learning_rate": 1.917626603206463e-05, "loss": 1.267, "step": 3878 }, { "epoch": 0.16, "grad_norm": 6.722515838757173, "learning_rate": 1.9175747451464683e-05, "loss": 1.6525, "step": 3879 }, { "epoch": 0.16, "grad_norm": 7.161512264660469, "learning_rate": 1.91752287146967e-05, "loss": 1.4847, "step": 3880 }, { "epoch": 0.16, "grad_norm": 6.181849465314917, "learning_rate": 1.9174709821769504e-05, "loss": 1.3885, "step": 3881 }, { "epoch": 0.16, "grad_norm": 4.502587012621865, "learning_rate": 1.9174190772691924e-05, "loss": 1.3475, "step": 3882 }, { "epoch": 0.16, "grad_norm": 4.5929713414010696, "learning_rate": 1.9173671567472803e-05, "loss": 1.1267, "step": 3883 }, { "epoch": 0.16, "grad_norm": 5.532333076128531, "learning_rate": 1.9173152206120968e-05, "loss": 1.4223, "step": 3884 }, { "epoch": 0.16, "grad_norm": 5.403691751019014, "learning_rate": 1.9172632688645265e-05, "loss": 1.3165, "step": 3885 }, { "epoch": 0.16, "grad_norm": 4.477158275100342, "learning_rate": 1.917211301505453e-05, "loss": 1.1999, "step": 3886 }, { "epoch": 0.16, "grad_norm": 5.533871107985907, "learning_rate": 1.9171593185357614e-05, "loss": 1.1917, "step": 3887 }, { "epoch": 0.16, "grad_norm": 5.247619270820096, "learning_rate": 1.9171073199563364e-05, "loss": 1.4068, "step": 3888 }, { "epoch": 0.16, "grad_norm": 6.5892145983715045, "learning_rate": 1.9170553057680627e-05, "loss": 1.2391, "step": 3889 }, { "epoch": 0.16, "grad_norm": 5.587957018158503, "learning_rate": 1.9170032759718252e-05, "loss": 1.2336, "step": 3890 }, { "epoch": 0.16, "grad_norm": 7.026958200540149, "learning_rate": 1.9169512305685103e-05, "loss": 1.2288, "step": 3891 }, { "epoch": 0.16, "grad_norm": 6.988735901117943, "learning_rate": 1.916899169559003e-05, "loss": 1.3029, "step": 3892 }, { "epoch": 0.16, "grad_norm": 10.716828089818758, "learning_rate": 1.91684709294419e-05, "loss": 1.3467, "step": 3893 }, { "epoch": 0.16, "grad_norm": 5.202088366155682, "learning_rate": 1.9167950007249572e-05, "loss": 1.3521, "step": 3894 }, { "epoch": 0.16, "grad_norm": 6.609252817072948, "learning_rate": 1.9167428929021913e-05, "loss": 1.541, "step": 3895 }, { "epoch": 0.16, "grad_norm": 7.057070051912977, "learning_rate": 1.916690769476779e-05, "loss": 1.4667, "step": 3896 }, { "epoch": 0.16, "grad_norm": 6.507965385433437, "learning_rate": 1.916638630449608e-05, "loss": 1.5024, "step": 3897 }, { "epoch": 0.16, "grad_norm": 6.215247349603151, "learning_rate": 1.916586475821565e-05, "loss": 1.3783, "step": 3898 }, { "epoch": 0.16, "grad_norm": 7.033434260138074, "learning_rate": 1.916534305593538e-05, "loss": 1.4885, "step": 3899 }, { "epoch": 0.16, "grad_norm": 4.304444790246893, "learning_rate": 1.916482119766415e-05, "loss": 1.1599, "step": 3900 }, { "epoch": 0.16, "grad_norm": 6.072540441390557, "learning_rate": 1.9164299183410842e-05, "loss": 1.2316, "step": 3901 }, { "epoch": 0.16, "grad_norm": 5.536055589870417, "learning_rate": 1.9163777013184334e-05, "loss": 1.3214, "step": 3902 }, { "epoch": 0.16, "grad_norm": 8.061469530045128, "learning_rate": 1.9163254686993523e-05, "loss": 1.4698, "step": 3903 }, { "epoch": 0.16, "grad_norm": 6.9952480850266365, "learning_rate": 1.916273220484729e-05, "loss": 1.1851, "step": 3904 }, { "epoch": 0.16, "grad_norm": 7.005845166514685, "learning_rate": 1.9162209566754538e-05, "loss": 1.3878, "step": 3905 }, { "epoch": 0.16, "grad_norm": 6.524911528632204, "learning_rate": 1.916168677272415e-05, "loss": 1.2975, "step": 3906 }, { "epoch": 0.16, "grad_norm": 6.594218512839721, "learning_rate": 1.916116382276503e-05, "loss": 1.3037, "step": 3907 }, { "epoch": 0.16, "grad_norm": 6.496392316228006, "learning_rate": 1.9160640716886078e-05, "loss": 1.2733, "step": 3908 }, { "epoch": 0.16, "grad_norm": 5.016168457297015, "learning_rate": 1.9160117455096196e-05, "loss": 1.138, "step": 3909 }, { "epoch": 0.16, "grad_norm": 8.395068242671233, "learning_rate": 1.9159594037404293e-05, "loss": 1.3595, "step": 3910 }, { "epoch": 0.16, "grad_norm": 7.010449209608262, "learning_rate": 1.9159070463819272e-05, "loss": 1.2519, "step": 3911 }, { "epoch": 0.16, "grad_norm": 4.6229324037390045, "learning_rate": 1.9158546734350052e-05, "loss": 1.1025, "step": 3912 }, { "epoch": 0.16, "grad_norm": 5.6434831585598495, "learning_rate": 1.9158022849005537e-05, "loss": 1.4361, "step": 3913 }, { "epoch": 0.16, "grad_norm": 6.24781432772015, "learning_rate": 1.915749880779465e-05, "loss": 1.4862, "step": 3914 }, { "epoch": 0.16, "grad_norm": 8.355221530516276, "learning_rate": 1.9156974610726305e-05, "loss": 1.5043, "step": 3915 }, { "epoch": 0.16, "grad_norm": 6.013331507711504, "learning_rate": 1.9156450257809435e-05, "loss": 1.3182, "step": 3916 }, { "epoch": 0.16, "grad_norm": 7.915691744731051, "learning_rate": 1.915592574905295e-05, "loss": 1.3181, "step": 3917 }, { "epoch": 0.16, "grad_norm": 5.356850204145977, "learning_rate": 1.9155401084465782e-05, "loss": 1.0183, "step": 3918 }, { "epoch": 0.16, "grad_norm": 11.089190928528394, "learning_rate": 1.9154876264056863e-05, "loss": 1.2056, "step": 3919 }, { "epoch": 0.16, "grad_norm": 7.897977324407091, "learning_rate": 1.915435128783512e-05, "loss": 1.5153, "step": 3920 }, { "epoch": 0.16, "grad_norm": 7.6166049406825795, "learning_rate": 1.91538261558095e-05, "loss": 1.425, "step": 3921 }, { "epoch": 0.16, "grad_norm": 8.75718428060117, "learning_rate": 1.9153300867988927e-05, "loss": 1.0657, "step": 3922 }, { "epoch": 0.16, "grad_norm": 6.7666433423794246, "learning_rate": 1.9152775424382347e-05, "loss": 0.9287, "step": 3923 }, { "epoch": 0.16, "grad_norm": 6.5582159572859675, "learning_rate": 1.9152249824998703e-05, "loss": 1.6585, "step": 3924 }, { "epoch": 0.16, "grad_norm": 5.582317105012463, "learning_rate": 1.9151724069846937e-05, "loss": 1.1968, "step": 3925 }, { "epoch": 0.16, "grad_norm": 7.916294923686202, "learning_rate": 1.9151198158936007e-05, "loss": 1.1931, "step": 3926 }, { "epoch": 0.16, "grad_norm": 7.957959479994896, "learning_rate": 1.915067209227485e-05, "loss": 1.2911, "step": 3927 }, { "epoch": 0.16, "grad_norm": 8.510601133539533, "learning_rate": 1.915014586987243e-05, "loss": 1.1864, "step": 3928 }, { "epoch": 0.16, "grad_norm": 6.077513241994799, "learning_rate": 1.91496194917377e-05, "loss": 1.5012, "step": 3929 }, { "epoch": 0.16, "grad_norm": 6.338665777749406, "learning_rate": 1.9149092957879617e-05, "loss": 1.0849, "step": 3930 }, { "epoch": 0.16, "grad_norm": 8.979967275998465, "learning_rate": 1.9148566268307144e-05, "loss": 1.7117, "step": 3931 }, { "epoch": 0.16, "grad_norm": 7.402761380258971, "learning_rate": 1.9148039423029244e-05, "loss": 1.3435, "step": 3932 }, { "epoch": 0.16, "grad_norm": 6.397586508024166, "learning_rate": 1.9147512422054883e-05, "loss": 1.2432, "step": 3933 }, { "epoch": 0.16, "grad_norm": 11.651701699204425, "learning_rate": 1.9146985265393034e-05, "loss": 1.2454, "step": 3934 }, { "epoch": 0.16, "grad_norm": 5.218726204895244, "learning_rate": 1.914645795305267e-05, "loss": 1.1791, "step": 3935 }, { "epoch": 0.16, "grad_norm": 5.255495130330186, "learning_rate": 1.914593048504276e-05, "loss": 1.2026, "step": 3936 }, { "epoch": 0.16, "grad_norm": 5.677419869572245, "learning_rate": 1.914540286137228e-05, "loss": 1.2492, "step": 3937 }, { "epoch": 0.16, "grad_norm": 6.672548882527528, "learning_rate": 1.9144875082050217e-05, "loss": 1.4637, "step": 3938 }, { "epoch": 0.16, "grad_norm": 6.828162437648522, "learning_rate": 1.914434714708555e-05, "loss": 1.224, "step": 3939 }, { "epoch": 0.16, "grad_norm": 5.025841275952991, "learning_rate": 1.914381905648727e-05, "loss": 1.1636, "step": 3940 }, { "epoch": 0.16, "grad_norm": 6.0095438998974835, "learning_rate": 1.914329081026435e-05, "loss": 1.1343, "step": 3941 }, { "epoch": 0.16, "grad_norm": 5.408204680472971, "learning_rate": 1.9142762408425797e-05, "loss": 1.3587, "step": 3942 }, { "epoch": 0.16, "grad_norm": 4.882122571608768, "learning_rate": 1.9142233850980597e-05, "loss": 1.3185, "step": 3943 }, { "epoch": 0.16, "grad_norm": 4.389104872503573, "learning_rate": 1.9141705137937744e-05, "loss": 1.2037, "step": 3944 }, { "epoch": 0.16, "grad_norm": 7.811735139937446, "learning_rate": 1.914117626930624e-05, "loss": 1.3879, "step": 3945 }, { "epoch": 0.16, "grad_norm": 7.451260668935955, "learning_rate": 1.9140647245095085e-05, "loss": 1.3158, "step": 3946 }, { "epoch": 0.16, "grad_norm": 4.433150912453171, "learning_rate": 1.9140118065313283e-05, "loss": 1.3322, "step": 3947 }, { "epoch": 0.16, "grad_norm": 6.096560605172103, "learning_rate": 1.9139588729969842e-05, "loss": 1.4495, "step": 3948 }, { "epoch": 0.16, "grad_norm": 5.26722098626651, "learning_rate": 1.9139059239073768e-05, "loss": 1.3397, "step": 3949 }, { "epoch": 0.16, "grad_norm": 5.389180427294422, "learning_rate": 1.913852959263407e-05, "loss": 1.402, "step": 3950 }, { "epoch": 0.16, "grad_norm": 8.60419850154709, "learning_rate": 1.9137999790659773e-05, "loss": 1.434, "step": 3951 }, { "epoch": 0.16, "grad_norm": 5.10981211237569, "learning_rate": 1.9137469833159885e-05, "loss": 1.3826, "step": 3952 }, { "epoch": 0.16, "grad_norm": 5.031626752744284, "learning_rate": 1.9136939720143427e-05, "loss": 1.2693, "step": 3953 }, { "epoch": 0.16, "grad_norm": 6.289170127508096, "learning_rate": 1.9136409451619424e-05, "loss": 1.0715, "step": 3954 }, { "epoch": 0.16, "grad_norm": 10.743561585095742, "learning_rate": 1.91358790275969e-05, "loss": 1.3814, "step": 3955 }, { "epoch": 0.16, "grad_norm": 5.860988125599921, "learning_rate": 1.9135348448084882e-05, "loss": 1.3517, "step": 3956 }, { "epoch": 0.16, "grad_norm": 7.080799217369657, "learning_rate": 1.9134817713092398e-05, "loss": 1.2271, "step": 3957 }, { "epoch": 0.16, "grad_norm": 6.566551905934027, "learning_rate": 1.9134286822628487e-05, "loss": 1.3495, "step": 3958 }, { "epoch": 0.16, "grad_norm": 5.695460027798251, "learning_rate": 1.913375577670218e-05, "loss": 1.2411, "step": 3959 }, { "epoch": 0.16, "grad_norm": 6.51972245641186, "learning_rate": 1.9133224575322512e-05, "loss": 1.4759, "step": 3960 }, { "epoch": 0.16, "grad_norm": 5.697427461179988, "learning_rate": 1.9132693218498533e-05, "loss": 1.1836, "step": 3961 }, { "epoch": 0.16, "grad_norm": 10.240553494703757, "learning_rate": 1.9132161706239283e-05, "loss": 1.2489, "step": 3962 }, { "epoch": 0.16, "grad_norm": 8.0458551102618, "learning_rate": 1.9131630038553803e-05, "loss": 1.3072, "step": 3963 }, { "epoch": 0.16, "grad_norm": 5.276745761285537, "learning_rate": 1.9131098215451146e-05, "loss": 1.401, "step": 3964 }, { "epoch": 0.16, "grad_norm": 6.068671066868721, "learning_rate": 1.9130566236940363e-05, "loss": 1.3303, "step": 3965 }, { "epoch": 0.16, "grad_norm": 6.753395219577663, "learning_rate": 1.913003410303051e-05, "loss": 1.347, "step": 3966 }, { "epoch": 0.16, "grad_norm": 7.993137010254468, "learning_rate": 1.912950181373064e-05, "loss": 1.2497, "step": 3967 }, { "epoch": 0.16, "grad_norm": 9.633084078479786, "learning_rate": 1.9128969369049813e-05, "loss": 1.3824, "step": 3968 }, { "epoch": 0.16, "grad_norm": 5.235285590479765, "learning_rate": 1.9128436768997096e-05, "loss": 1.2654, "step": 3969 }, { "epoch": 0.16, "grad_norm": 7.33870591397474, "learning_rate": 1.9127904013581548e-05, "loss": 1.0856, "step": 3970 }, { "epoch": 0.16, "grad_norm": 10.910078706990259, "learning_rate": 1.912737110281224e-05, "loss": 1.2468, "step": 3971 }, { "epoch": 0.16, "grad_norm": 7.687189801457925, "learning_rate": 1.9126838036698236e-05, "loss": 1.4292, "step": 3972 }, { "epoch": 0.16, "grad_norm": 6.2162623685819955, "learning_rate": 1.9126304815248616e-05, "loss": 1.5462, "step": 3973 }, { "epoch": 0.16, "grad_norm": 6.938850309744121, "learning_rate": 1.9125771438472453e-05, "loss": 1.2516, "step": 3974 }, { "epoch": 0.16, "grad_norm": 5.351161935380967, "learning_rate": 1.912523790637882e-05, "loss": 1.3084, "step": 3975 }, { "epoch": 0.16, "grad_norm": 5.519061067936587, "learning_rate": 1.91247042189768e-05, "loss": 1.0883, "step": 3976 }, { "epoch": 0.16, "grad_norm": 6.858608754899617, "learning_rate": 1.9124170376275485e-05, "loss": 1.3124, "step": 3977 }, { "epoch": 0.16, "grad_norm": 6.839305567671578, "learning_rate": 1.9123636378283948e-05, "loss": 1.4764, "step": 3978 }, { "epoch": 0.16, "grad_norm": 5.880174468505113, "learning_rate": 1.9123102225011286e-05, "loss": 1.2123, "step": 3979 }, { "epoch": 0.16, "grad_norm": 5.3924268671000775, "learning_rate": 1.9122567916466584e-05, "loss": 1.3654, "step": 3980 }, { "epoch": 0.16, "grad_norm": 4.797929460736927, "learning_rate": 1.9122033452658938e-05, "loss": 1.2953, "step": 3981 }, { "epoch": 0.16, "grad_norm": 5.462425674597181, "learning_rate": 1.912149883359745e-05, "loss": 1.3913, "step": 3982 }, { "epoch": 0.16, "grad_norm": 4.7755814204741265, "learning_rate": 1.912096405929121e-05, "loss": 1.2286, "step": 3983 }, { "epoch": 0.16, "grad_norm": 7.042746204710508, "learning_rate": 1.9120429129749323e-05, "loss": 1.351, "step": 3984 }, { "epoch": 0.16, "grad_norm": 5.5600375864339, "learning_rate": 1.9119894044980895e-05, "loss": 1.2232, "step": 3985 }, { "epoch": 0.16, "grad_norm": 6.954628936058114, "learning_rate": 1.9119358804995035e-05, "loss": 1.2885, "step": 3986 }, { "epoch": 0.16, "grad_norm": 5.789663616420473, "learning_rate": 1.911882340980085e-05, "loss": 1.2644, "step": 3987 }, { "epoch": 0.16, "grad_norm": 5.624606870472025, "learning_rate": 1.911828785940745e-05, "loss": 1.0498, "step": 3988 }, { "epoch": 0.16, "grad_norm": 7.136777154300308, "learning_rate": 1.911775215382395e-05, "loss": 1.4429, "step": 3989 }, { "epoch": 0.16, "grad_norm": 7.789210439437904, "learning_rate": 1.9117216293059472e-05, "loss": 1.0914, "step": 3990 }, { "epoch": 0.16, "grad_norm": 4.820218220206617, "learning_rate": 1.911668027712313e-05, "loss": 1.1221, "step": 3991 }, { "epoch": 0.16, "grad_norm": 5.692253755503386, "learning_rate": 1.911614410602405e-05, "loss": 1.3284, "step": 3992 }, { "epoch": 0.16, "grad_norm": 5.939064276741031, "learning_rate": 1.9115607779771365e-05, "loss": 1.3062, "step": 3993 }, { "epoch": 0.16, "grad_norm": 5.565265641101854, "learning_rate": 1.911507129837419e-05, "loss": 1.3111, "step": 3994 }, { "epoch": 0.16, "grad_norm": 4.9710799451178005, "learning_rate": 1.9114534661841663e-05, "loss": 1.1121, "step": 3995 }, { "epoch": 0.16, "grad_norm": 7.541410297033705, "learning_rate": 1.9113997870182915e-05, "loss": 1.1924, "step": 3996 }, { "epoch": 0.16, "grad_norm": 4.484909188411632, "learning_rate": 1.9113460923407086e-05, "loss": 1.2646, "step": 3997 }, { "epoch": 0.16, "grad_norm": 5.1294841701586735, "learning_rate": 1.9112923821523307e-05, "loss": 1.155, "step": 3998 }, { "epoch": 0.16, "grad_norm": 5.060400873576505, "learning_rate": 1.911238656454073e-05, "loss": 1.0395, "step": 3999 }, { "epoch": 0.16, "grad_norm": 4.981002062322405, "learning_rate": 1.9111849152468488e-05, "loss": 1.2215, "step": 4000 }, { "epoch": 0.16, "grad_norm": 8.351345809523174, "learning_rate": 1.9111311585315733e-05, "loss": 1.6578, "step": 4001 }, { "epoch": 0.16, "grad_norm": 4.2902344118359, "learning_rate": 1.9110773863091614e-05, "loss": 1.0908, "step": 4002 }, { "epoch": 0.16, "grad_norm": 8.510523565078586, "learning_rate": 1.911023598580528e-05, "loss": 1.6234, "step": 4003 }, { "epoch": 0.16, "grad_norm": 4.94672909636757, "learning_rate": 1.9109697953465893e-05, "loss": 1.2226, "step": 4004 }, { "epoch": 0.16, "grad_norm": 7.13362832513854, "learning_rate": 1.9109159766082603e-05, "loss": 1.5722, "step": 4005 }, { "epoch": 0.16, "grad_norm": 5.538853039712078, "learning_rate": 1.9108621423664568e-05, "loss": 1.5411, "step": 4006 }, { "epoch": 0.16, "grad_norm": 4.465210824671681, "learning_rate": 1.9108082926220958e-05, "loss": 0.9195, "step": 4007 }, { "epoch": 0.16, "grad_norm": 6.356385050736317, "learning_rate": 1.9107544273760937e-05, "loss": 1.2664, "step": 4008 }, { "epoch": 0.16, "grad_norm": 4.916384097075544, "learning_rate": 1.9107005466293666e-05, "loss": 1.0919, "step": 4009 }, { "epoch": 0.16, "grad_norm": 5.507429889435454, "learning_rate": 1.9106466503828317e-05, "loss": 1.0175, "step": 4010 }, { "epoch": 0.16, "grad_norm": 6.342193800888606, "learning_rate": 1.910592738637407e-05, "loss": 1.0172, "step": 4011 }, { "epoch": 0.16, "grad_norm": 6.4553139610356585, "learning_rate": 1.9105388113940093e-05, "loss": 1.4864, "step": 4012 }, { "epoch": 0.16, "grad_norm": 6.208767929592601, "learning_rate": 1.9104848686535567e-05, "loss": 1.4763, "step": 4013 }, { "epoch": 0.16, "grad_norm": 6.041356404532198, "learning_rate": 1.9104309104169674e-05, "loss": 1.2814, "step": 4014 }, { "epoch": 0.16, "grad_norm": 6.531589273096803, "learning_rate": 1.9103769366851596e-05, "loss": 1.1889, "step": 4015 }, { "epoch": 0.16, "grad_norm": 4.602721974375811, "learning_rate": 1.910322947459052e-05, "loss": 0.8348, "step": 4016 }, { "epoch": 0.16, "grad_norm": 6.676311704820899, "learning_rate": 1.9102689427395633e-05, "loss": 1.5055, "step": 4017 }, { "epoch": 0.16, "grad_norm": 6.442611143275396, "learning_rate": 1.910214922527613e-05, "loss": 1.4062, "step": 4018 }, { "epoch": 0.16, "grad_norm": 7.998029811766504, "learning_rate": 1.91016088682412e-05, "loss": 1.2014, "step": 4019 }, { "epoch": 0.16, "grad_norm": 4.638442250038815, "learning_rate": 1.9101068356300044e-05, "loss": 1.2616, "step": 4020 }, { "epoch": 0.16, "grad_norm": 5.4308125467135975, "learning_rate": 1.910052768946186e-05, "loss": 1.3831, "step": 4021 }, { "epoch": 0.16, "grad_norm": 5.700519019361235, "learning_rate": 1.909998686773585e-05, "loss": 1.4969, "step": 4022 }, { "epoch": 0.16, "grad_norm": 7.849079879604387, "learning_rate": 1.909944589113122e-05, "loss": 1.4318, "step": 4023 }, { "epoch": 0.16, "grad_norm": 5.108428068547263, "learning_rate": 1.909890475965717e-05, "loss": 1.3843, "step": 4024 }, { "epoch": 0.16, "grad_norm": 4.595965653962027, "learning_rate": 1.9098363473322918e-05, "loss": 1.2431, "step": 4025 }, { "epoch": 0.16, "grad_norm": 6.028703473267938, "learning_rate": 1.9097822032137676e-05, "loss": 1.1114, "step": 4026 }, { "epoch": 0.16, "grad_norm": 5.4354098365999, "learning_rate": 1.9097280436110654e-05, "loss": 1.2692, "step": 4027 }, { "epoch": 0.16, "grad_norm": 5.537722732343423, "learning_rate": 1.9096738685251078e-05, "loss": 1.3151, "step": 4028 }, { "epoch": 0.16, "grad_norm": 6.999544876461522, "learning_rate": 1.909619677956816e-05, "loss": 1.4168, "step": 4029 }, { "epoch": 0.16, "grad_norm": 6.478221563824725, "learning_rate": 1.9095654719071123e-05, "loss": 1.3935, "step": 4030 }, { "epoch": 0.16, "grad_norm": 5.911313510294487, "learning_rate": 1.90951125037692e-05, "loss": 1.2622, "step": 4031 }, { "epoch": 0.16, "grad_norm": 4.996130345867584, "learning_rate": 1.9094570133671614e-05, "loss": 1.0175, "step": 4032 }, { "epoch": 0.16, "grad_norm": 4.479017377905313, "learning_rate": 1.9094027608787598e-05, "loss": 1.0729, "step": 4033 }, { "epoch": 0.16, "grad_norm": 5.261298019559854, "learning_rate": 1.9093484929126383e-05, "loss": 1.147, "step": 4034 }, { "epoch": 0.16, "grad_norm": 6.752432555672791, "learning_rate": 1.9092942094697213e-05, "loss": 1.3411, "step": 4035 }, { "epoch": 0.16, "grad_norm": 4.300407204066554, "learning_rate": 1.9092399105509315e-05, "loss": 1.3357, "step": 4036 }, { "epoch": 0.16, "grad_norm": 4.925035058879272, "learning_rate": 1.909185596157194e-05, "loss": 1.2917, "step": 4037 }, { "epoch": 0.16, "grad_norm": 3.5230609460436884, "learning_rate": 1.9091312662894325e-05, "loss": 1.0933, "step": 4038 }, { "epoch": 0.16, "grad_norm": 5.702579278503415, "learning_rate": 1.909076920948572e-05, "loss": 1.3678, "step": 4039 }, { "epoch": 0.16, "grad_norm": 5.982367437833771, "learning_rate": 1.909022560135538e-05, "loss": 1.1192, "step": 4040 }, { "epoch": 0.16, "grad_norm": 5.729083469654225, "learning_rate": 1.9089681838512547e-05, "loss": 1.0381, "step": 4041 }, { "epoch": 0.16, "grad_norm": 4.513821335628773, "learning_rate": 1.908913792096648e-05, "loss": 1.1477, "step": 4042 }, { "epoch": 0.16, "grad_norm": 5.773388251653469, "learning_rate": 1.908859384872644e-05, "loss": 1.1699, "step": 4043 }, { "epoch": 0.16, "grad_norm": 4.651860651937749, "learning_rate": 1.9088049621801684e-05, "loss": 1.0998, "step": 4044 }, { "epoch": 0.16, "grad_norm": 3.4349738027490617, "learning_rate": 1.9087505240201468e-05, "loss": 1.255, "step": 4045 }, { "epoch": 0.16, "grad_norm": 6.104153851918781, "learning_rate": 1.908696070393507e-05, "loss": 1.1829, "step": 4046 }, { "epoch": 0.16, "grad_norm": 5.635607179539315, "learning_rate": 1.9086416013011752e-05, "loss": 1.0639, "step": 4047 }, { "epoch": 0.16, "grad_norm": 6.561638287270044, "learning_rate": 1.9085871167440775e-05, "loss": 1.3473, "step": 4048 }, { "epoch": 0.16, "grad_norm": 6.543053350369894, "learning_rate": 1.908532616723143e-05, "loss": 1.3694, "step": 4049 }, { "epoch": 0.16, "grad_norm": 5.7217636391201685, "learning_rate": 1.9084781012392975e-05, "loss": 1.3103, "step": 4050 }, { "epoch": 0.16, "grad_norm": 5.852843198092934, "learning_rate": 1.9084235702934705e-05, "loss": 1.4944, "step": 4051 }, { "epoch": 0.16, "grad_norm": 5.541005270850799, "learning_rate": 1.9083690238865887e-05, "loss": 1.3616, "step": 4052 }, { "epoch": 0.16, "grad_norm": 6.197655348001722, "learning_rate": 1.9083144620195814e-05, "loss": 1.2814, "step": 4053 }, { "epoch": 0.16, "grad_norm": 7.229290844987736, "learning_rate": 1.9082598846933767e-05, "loss": 1.5011, "step": 4054 }, { "epoch": 0.16, "grad_norm": 4.242491562680106, "learning_rate": 1.9082052919089034e-05, "loss": 1.2355, "step": 4055 }, { "epoch": 0.16, "grad_norm": 5.826152213947616, "learning_rate": 1.9081506836670912e-05, "loss": 1.5492, "step": 4056 }, { "epoch": 0.16, "grad_norm": 6.967482843484629, "learning_rate": 1.908096059968869e-05, "loss": 1.2665, "step": 4057 }, { "epoch": 0.16, "grad_norm": 9.929183704047125, "learning_rate": 1.9080414208151668e-05, "loss": 1.1782, "step": 4058 }, { "epoch": 0.16, "grad_norm": 5.210670718484714, "learning_rate": 1.9079867662069144e-05, "loss": 1.2923, "step": 4059 }, { "epoch": 0.16, "grad_norm": 6.475009653548973, "learning_rate": 1.907932096145042e-05, "loss": 1.2918, "step": 4060 }, { "epoch": 0.16, "grad_norm": 8.032073573249129, "learning_rate": 1.90787741063048e-05, "loss": 1.1418, "step": 4061 }, { "epoch": 0.16, "grad_norm": 6.925142894109444, "learning_rate": 1.9078227096641594e-05, "loss": 1.2424, "step": 4062 }, { "epoch": 0.16, "grad_norm": 7.242719694152994, "learning_rate": 1.9077679932470113e-05, "loss": 1.2966, "step": 4063 }, { "epoch": 0.16, "grad_norm": 8.246429605300468, "learning_rate": 1.9077132613799662e-05, "loss": 1.1688, "step": 4064 }, { "epoch": 0.16, "grad_norm": 8.196638174202283, "learning_rate": 1.907658514063956e-05, "loss": 1.4808, "step": 4065 }, { "epoch": 0.16, "grad_norm": 4.277122466137503, "learning_rate": 1.907603751299913e-05, "loss": 1.2063, "step": 4066 }, { "epoch": 0.16, "grad_norm": 7.27510211772358, "learning_rate": 1.9075489730887685e-05, "loss": 1.1375, "step": 4067 }, { "epoch": 0.16, "grad_norm": 6.115132460349061, "learning_rate": 1.907494179431455e-05, "loss": 1.1361, "step": 4068 }, { "epoch": 0.16, "grad_norm": 12.622161903685695, "learning_rate": 1.9074393703289056e-05, "loss": 1.7304, "step": 4069 }, { "epoch": 0.16, "grad_norm": 5.297744031137932, "learning_rate": 1.9073845457820522e-05, "loss": 1.0489, "step": 4070 }, { "epoch": 0.16, "grad_norm": 7.848921161460434, "learning_rate": 1.907329705791829e-05, "loss": 1.1773, "step": 4071 }, { "epoch": 0.16, "grad_norm": 15.609906984655678, "learning_rate": 1.907274850359168e-05, "loss": 1.6249, "step": 4072 }, { "epoch": 0.16, "grad_norm": 6.632729223568407, "learning_rate": 1.9072199794850043e-05, "loss": 1.4483, "step": 4073 }, { "epoch": 0.16, "grad_norm": 5.551924924177208, "learning_rate": 1.907165093170271e-05, "loss": 1.3681, "step": 4074 }, { "epoch": 0.16, "grad_norm": 7.169654819379074, "learning_rate": 1.907110191415902e-05, "loss": 1.3127, "step": 4075 }, { "epoch": 0.16, "grad_norm": 13.456541155587372, "learning_rate": 1.907055274222832e-05, "loss": 1.4727, "step": 4076 }, { "epoch": 0.16, "grad_norm": 6.92732094597193, "learning_rate": 1.907000341591996e-05, "loss": 1.2995, "step": 4077 }, { "epoch": 0.16, "grad_norm": 6.701744017646784, "learning_rate": 1.9069453935243286e-05, "loss": 1.2846, "step": 4078 }, { "epoch": 0.16, "grad_norm": 11.537891497220597, "learning_rate": 1.9068904300207647e-05, "loss": 1.4928, "step": 4079 }, { "epoch": 0.16, "grad_norm": 7.388034364632162, "learning_rate": 1.9068354510822402e-05, "loss": 1.2935, "step": 4080 }, { "epoch": 0.16, "grad_norm": 5.881248442673398, "learning_rate": 1.906780456709691e-05, "loss": 1.1108, "step": 4081 }, { "epoch": 0.16, "grad_norm": 4.691952758896949, "learning_rate": 1.9067254469040527e-05, "loss": 1.1865, "step": 4082 }, { "epoch": 0.16, "grad_norm": 7.884892010250429, "learning_rate": 1.906670421666261e-05, "loss": 1.2785, "step": 4083 }, { "epoch": 0.16, "grad_norm": 6.8612112472085025, "learning_rate": 1.9066153809972537e-05, "loss": 1.2009, "step": 4084 }, { "epoch": 0.16, "grad_norm": 6.574998306287548, "learning_rate": 1.906560324897967e-05, "loss": 1.2508, "step": 4085 }, { "epoch": 0.16, "grad_norm": 4.876479189937202, "learning_rate": 1.9065052533693376e-05, "loss": 1.2509, "step": 4086 }, { "epoch": 0.16, "grad_norm": 4.962294925941931, "learning_rate": 1.9064501664123033e-05, "loss": 1.4141, "step": 4087 }, { "epoch": 0.16, "grad_norm": 9.760471564740774, "learning_rate": 1.9063950640278014e-05, "loss": 1.3964, "step": 4088 }, { "epoch": 0.16, "grad_norm": 6.203148923464315, "learning_rate": 1.9063399462167697e-05, "loss": 1.4512, "step": 4089 }, { "epoch": 0.16, "grad_norm": 6.299949418464628, "learning_rate": 1.9062848129801458e-05, "loss": 1.1381, "step": 4090 }, { "epoch": 0.16, "grad_norm": 6.119950786517242, "learning_rate": 1.9062296643188693e-05, "loss": 1.29, "step": 4091 }, { "epoch": 0.16, "grad_norm": 9.908816551484177, "learning_rate": 1.9061745002338777e-05, "loss": 1.4608, "step": 4092 }, { "epoch": 0.16, "grad_norm": 5.489005019678511, "learning_rate": 1.9061193207261102e-05, "loss": 1.1705, "step": 4093 }, { "epoch": 0.16, "grad_norm": 5.973546667733247, "learning_rate": 1.9060641257965064e-05, "loss": 1.5166, "step": 4094 }, { "epoch": 0.16, "grad_norm": 6.901476009621438, "learning_rate": 1.906008915446005e-05, "loss": 1.3733, "step": 4095 }, { "epoch": 0.16, "grad_norm": 9.315880935154338, "learning_rate": 1.905953689675546e-05, "loss": 0.9727, "step": 4096 }, { "epoch": 0.17, "grad_norm": 5.865692163863106, "learning_rate": 1.9058984484860695e-05, "loss": 1.2484, "step": 4097 }, { "epoch": 0.17, "grad_norm": 5.393028979758367, "learning_rate": 1.905843191878515e-05, "loss": 1.1992, "step": 4098 }, { "epoch": 0.17, "grad_norm": 5.726953082386982, "learning_rate": 1.9057879198538236e-05, "loss": 1.0842, "step": 4099 }, { "epoch": 0.17, "grad_norm": 6.109123650366585, "learning_rate": 1.905732632412936e-05, "loss": 1.2144, "step": 4100 }, { "epoch": 0.17, "grad_norm": 9.586129751350574, "learning_rate": 1.905677329556793e-05, "loss": 1.2536, "step": 4101 }, { "epoch": 0.17, "grad_norm": 6.870874733055597, "learning_rate": 1.9056220112863356e-05, "loss": 1.3207, "step": 4102 }, { "epoch": 0.17, "grad_norm": 6.667085183209596, "learning_rate": 1.905566677602506e-05, "loss": 1.3671, "step": 4103 }, { "epoch": 0.17, "grad_norm": 8.198058525992584, "learning_rate": 1.905511328506245e-05, "loss": 1.0882, "step": 4104 }, { "epoch": 0.17, "grad_norm": 5.889736921681723, "learning_rate": 1.9054559639984952e-05, "loss": 1.4496, "step": 4105 }, { "epoch": 0.17, "grad_norm": 6.173461754686899, "learning_rate": 1.905400584080199e-05, "loss": 0.8316, "step": 4106 }, { "epoch": 0.17, "grad_norm": 4.782626931693055, "learning_rate": 1.9053451887522988e-05, "loss": 1.1679, "step": 4107 }, { "epoch": 0.17, "grad_norm": 8.24056730017825, "learning_rate": 1.9052897780157372e-05, "loss": 1.6221, "step": 4108 }, { "epoch": 0.17, "grad_norm": 6.105805126529109, "learning_rate": 1.9052343518714572e-05, "loss": 1.1856, "step": 4109 }, { "epoch": 0.17, "grad_norm": 7.200808170312423, "learning_rate": 1.9051789103204025e-05, "loss": 1.0288, "step": 4110 }, { "epoch": 0.17, "grad_norm": 6.310740425012478, "learning_rate": 1.905123453363517e-05, "loss": 1.4674, "step": 4111 }, { "epoch": 0.17, "grad_norm": 4.401006686130842, "learning_rate": 1.9050679810017438e-05, "loss": 1.2391, "step": 4112 }, { "epoch": 0.17, "grad_norm": 5.4757669435233325, "learning_rate": 1.9050124932360273e-05, "loss": 1.4394, "step": 4113 }, { "epoch": 0.17, "grad_norm": 6.851951259336822, "learning_rate": 1.9049569900673118e-05, "loss": 1.1741, "step": 4114 }, { "epoch": 0.17, "grad_norm": 5.542666383593923, "learning_rate": 1.9049014714965422e-05, "loss": 1.069, "step": 4115 }, { "epoch": 0.17, "grad_norm": 4.420199798812453, "learning_rate": 1.9048459375246634e-05, "loss": 1.3526, "step": 4116 }, { "epoch": 0.17, "grad_norm": 4.554377579626602, "learning_rate": 1.90479038815262e-05, "loss": 1.1253, "step": 4117 }, { "epoch": 0.17, "grad_norm": 3.889652536431984, "learning_rate": 1.9047348233813583e-05, "loss": 1.129, "step": 4118 }, { "epoch": 0.17, "grad_norm": 7.502665099466654, "learning_rate": 1.9046792432118236e-05, "loss": 1.3682, "step": 4119 }, { "epoch": 0.17, "grad_norm": 5.620223849385193, "learning_rate": 1.9046236476449618e-05, "loss": 1.4377, "step": 4120 }, { "epoch": 0.17, "grad_norm": 7.402728737983948, "learning_rate": 1.904568036681719e-05, "loss": 1.4661, "step": 4121 }, { "epoch": 0.17, "grad_norm": 5.046263441294367, "learning_rate": 1.9045124103230417e-05, "loss": 1.0424, "step": 4122 }, { "epoch": 0.17, "grad_norm": 5.646090579412489, "learning_rate": 1.904456768569877e-05, "loss": 1.2779, "step": 4123 }, { "epoch": 0.17, "grad_norm": 6.707571479826666, "learning_rate": 1.9044011114231713e-05, "loss": 1.4202, "step": 4124 }, { "epoch": 0.17, "grad_norm": 5.0289642625186595, "learning_rate": 1.9043454388838724e-05, "loss": 1.1951, "step": 4125 }, { "epoch": 0.17, "grad_norm": 5.976845069796216, "learning_rate": 1.904289750952928e-05, "loss": 1.2837, "step": 4126 }, { "epoch": 0.17, "grad_norm": 5.767008015357354, "learning_rate": 1.9042340476312852e-05, "loss": 1.2335, "step": 4127 }, { "epoch": 0.17, "grad_norm": 5.452213153648672, "learning_rate": 1.9041783289198923e-05, "loss": 1.1895, "step": 4128 }, { "epoch": 0.17, "grad_norm": 5.14924579669854, "learning_rate": 1.9041225948196977e-05, "loss": 1.019, "step": 4129 }, { "epoch": 0.17, "grad_norm": 5.2480955391546855, "learning_rate": 1.9040668453316503e-05, "loss": 1.3784, "step": 4130 }, { "epoch": 0.17, "grad_norm": 5.460208038320138, "learning_rate": 1.9040110804566986e-05, "loss": 1.1634, "step": 4131 }, { "epoch": 0.17, "grad_norm": 6.332927046039073, "learning_rate": 1.9039553001957915e-05, "loss": 1.1034, "step": 4132 }, { "epoch": 0.17, "grad_norm": 7.196843591382779, "learning_rate": 1.9038995045498786e-05, "loss": 1.5509, "step": 4133 }, { "epoch": 0.17, "grad_norm": 7.087264325863853, "learning_rate": 1.9038436935199095e-05, "loss": 1.2561, "step": 4134 }, { "epoch": 0.17, "grad_norm": 6.144802973655612, "learning_rate": 1.9037878671068343e-05, "loss": 1.331, "step": 4135 }, { "epoch": 0.17, "grad_norm": 5.6076507022294555, "learning_rate": 1.9037320253116026e-05, "loss": 1.2651, "step": 4136 }, { "epoch": 0.17, "grad_norm": 6.738789407328971, "learning_rate": 1.9036761681351653e-05, "loss": 1.1613, "step": 4137 }, { "epoch": 0.17, "grad_norm": 5.157789660414331, "learning_rate": 1.9036202955784732e-05, "loss": 1.2435, "step": 4138 }, { "epoch": 0.17, "grad_norm": 6.660465149977502, "learning_rate": 1.9035644076424766e-05, "loss": 1.377, "step": 4139 }, { "epoch": 0.17, "grad_norm": 7.2396666040428705, "learning_rate": 1.9035085043281272e-05, "loss": 1.1554, "step": 4140 }, { "epoch": 0.17, "grad_norm": 4.69115708202659, "learning_rate": 1.903452585636376e-05, "loss": 1.2674, "step": 4141 }, { "epoch": 0.17, "grad_norm": 5.371007482887036, "learning_rate": 1.9033966515681754e-05, "loss": 1.4455, "step": 4142 }, { "epoch": 0.17, "grad_norm": 5.514316492059914, "learning_rate": 1.9033407021244768e-05, "loss": 1.3736, "step": 4143 }, { "epoch": 0.17, "grad_norm": 6.2104188642885525, "learning_rate": 1.9032847373062323e-05, "loss": 1.3408, "step": 4144 }, { "epoch": 0.17, "grad_norm": 6.111703791044323, "learning_rate": 1.9032287571143954e-05, "loss": 1.4128, "step": 4145 }, { "epoch": 0.17, "grad_norm": 6.246914915422701, "learning_rate": 1.9031727615499174e-05, "loss": 1.3446, "step": 4146 }, { "epoch": 0.17, "grad_norm": 5.48137919799662, "learning_rate": 1.903116750613753e-05, "loss": 1.1962, "step": 4147 }, { "epoch": 0.17, "grad_norm": 5.510867209155573, "learning_rate": 1.9030607243068538e-05, "loss": 1.2643, "step": 4148 }, { "epoch": 0.17, "grad_norm": 7.118363623608781, "learning_rate": 1.9030046826301746e-05, "loss": 1.3494, "step": 4149 }, { "epoch": 0.17, "grad_norm": 6.267618543365224, "learning_rate": 1.9029486255846688e-05, "loss": 1.6005, "step": 4150 }, { "epoch": 0.17, "grad_norm": 4.891015552381707, "learning_rate": 1.9028925531712903e-05, "loss": 1.3349, "step": 4151 }, { "epoch": 0.17, "grad_norm": 5.724143422510819, "learning_rate": 1.9028364653909937e-05, "loss": 1.1487, "step": 4152 }, { "epoch": 0.17, "grad_norm": 5.52367496957695, "learning_rate": 1.902780362244733e-05, "loss": 1.3595, "step": 4153 }, { "epoch": 0.17, "grad_norm": 4.588870029794739, "learning_rate": 1.902724243733464e-05, "loss": 1.3281, "step": 4154 }, { "epoch": 0.17, "grad_norm": 5.39501779418519, "learning_rate": 1.902668109858141e-05, "loss": 1.311, "step": 4155 }, { "epoch": 0.17, "grad_norm": 5.898455721056232, "learning_rate": 1.90261196061972e-05, "loss": 1.3004, "step": 4156 }, { "epoch": 0.17, "grad_norm": 7.357064018067873, "learning_rate": 1.902555796019156e-05, "loss": 1.4259, "step": 4157 }, { "epoch": 0.17, "grad_norm": 5.492053233673439, "learning_rate": 1.902499616057406e-05, "loss": 1.2429, "step": 4158 }, { "epoch": 0.17, "grad_norm": 8.22538259298864, "learning_rate": 1.9024434207354248e-05, "loss": 1.1377, "step": 4159 }, { "epoch": 0.17, "grad_norm": 6.7177887618253695, "learning_rate": 1.9023872100541692e-05, "loss": 1.4894, "step": 4160 }, { "epoch": 0.17, "grad_norm": 4.83359452997663, "learning_rate": 1.9023309840145966e-05, "loss": 1.1451, "step": 4161 }, { "epoch": 0.17, "grad_norm": 4.390113412692883, "learning_rate": 1.9022747426176633e-05, "loss": 0.9082, "step": 4162 }, { "epoch": 0.17, "grad_norm": 7.22015351576059, "learning_rate": 1.902218485864327e-05, "loss": 1.3903, "step": 4163 }, { "epoch": 0.17, "grad_norm": 6.791844895773235, "learning_rate": 1.9021622137555445e-05, "loss": 1.3022, "step": 4164 }, { "epoch": 0.17, "grad_norm": 6.757263776438203, "learning_rate": 1.9021059262922738e-05, "loss": 1.4691, "step": 4165 }, { "epoch": 0.17, "grad_norm": 5.61741147073761, "learning_rate": 1.9020496234754732e-05, "loss": 1.2111, "step": 4166 }, { "epoch": 0.17, "grad_norm": 8.718783430484432, "learning_rate": 1.9019933053061008e-05, "loss": 1.2667, "step": 4167 }, { "epoch": 0.17, "grad_norm": 7.276472599988779, "learning_rate": 1.9019369717851148e-05, "loss": 1.1607, "step": 4168 }, { "epoch": 0.17, "grad_norm": 5.700138567126512, "learning_rate": 1.9018806229134746e-05, "loss": 1.2683, "step": 4169 }, { "epoch": 0.17, "grad_norm": 7.97917893400863, "learning_rate": 1.9018242586921386e-05, "loss": 1.1441, "step": 4170 }, { "epoch": 0.17, "grad_norm": 6.674491098824714, "learning_rate": 1.9017678791220665e-05, "loss": 1.405, "step": 4171 }, { "epoch": 0.17, "grad_norm": 6.811535977505021, "learning_rate": 1.9017114842042174e-05, "loss": 1.2406, "step": 4172 }, { "epoch": 0.17, "grad_norm": 5.506173308628981, "learning_rate": 1.9016550739395517e-05, "loss": 1.3702, "step": 4173 }, { "epoch": 0.17, "grad_norm": 6.773617095044542, "learning_rate": 1.9015986483290295e-05, "loss": 1.4584, "step": 4174 }, { "epoch": 0.17, "grad_norm": 4.963510153289954, "learning_rate": 1.9015422073736106e-05, "loss": 1.253, "step": 4175 }, { "epoch": 0.17, "grad_norm": 6.223371264488661, "learning_rate": 1.901485751074256e-05, "loss": 1.3695, "step": 4176 }, { "epoch": 0.17, "grad_norm": 7.125777335669692, "learning_rate": 1.901429279431926e-05, "loss": 1.2943, "step": 4177 }, { "epoch": 0.17, "grad_norm": 5.303999755082373, "learning_rate": 1.9013727924475826e-05, "loss": 1.1347, "step": 4178 }, { "epoch": 0.17, "grad_norm": 4.968884518911134, "learning_rate": 1.9013162901221865e-05, "loss": 1.2601, "step": 4179 }, { "epoch": 0.17, "grad_norm": 6.5777634966437795, "learning_rate": 1.9012597724567003e-05, "loss": 1.291, "step": 4180 }, { "epoch": 0.17, "grad_norm": 7.410349299046547, "learning_rate": 1.9012032394520843e-05, "loss": 1.2015, "step": 4181 }, { "epoch": 0.17, "grad_norm": 4.577495444980912, "learning_rate": 1.9011466911093022e-05, "loss": 1.1122, "step": 4182 }, { "epoch": 0.17, "grad_norm": 7.409329341758364, "learning_rate": 1.9010901274293155e-05, "loss": 1.2332, "step": 4183 }, { "epoch": 0.17, "grad_norm": 5.5128201706210715, "learning_rate": 1.9010335484130875e-05, "loss": 0.8123, "step": 4184 }, { "epoch": 0.17, "grad_norm": 12.433498204616782, "learning_rate": 1.9009769540615805e-05, "loss": 1.1675, "step": 4185 }, { "epoch": 0.17, "grad_norm": 6.581343348196284, "learning_rate": 1.9009203443757585e-05, "loss": 1.1597, "step": 4186 }, { "epoch": 0.17, "grad_norm": 6.008334863854525, "learning_rate": 1.9008637193565844e-05, "loss": 1.268, "step": 4187 }, { "epoch": 0.17, "grad_norm": 8.666244154036399, "learning_rate": 1.9008070790050222e-05, "loss": 1.3138, "step": 4188 }, { "epoch": 0.17, "grad_norm": 7.513396578522295, "learning_rate": 1.9007504233220356e-05, "loss": 1.5344, "step": 4189 }, { "epoch": 0.17, "grad_norm": 8.689918154723225, "learning_rate": 1.900693752308589e-05, "loss": 1.1962, "step": 4190 }, { "epoch": 0.17, "grad_norm": 7.730261335277553, "learning_rate": 1.9006370659656475e-05, "loss": 0.9682, "step": 4191 }, { "epoch": 0.17, "grad_norm": 7.780503003484608, "learning_rate": 1.9005803642941753e-05, "loss": 1.3375, "step": 4192 }, { "epoch": 0.17, "grad_norm": 7.374370435815554, "learning_rate": 1.9005236472951374e-05, "loss": 1.2969, "step": 4193 }, { "epoch": 0.17, "grad_norm": 6.197691962394294, "learning_rate": 1.900466914969499e-05, "loss": 1.3945, "step": 4194 }, { "epoch": 0.17, "grad_norm": 5.963987768302976, "learning_rate": 1.900410167318226e-05, "loss": 1.1416, "step": 4195 }, { "epoch": 0.17, "grad_norm": 6.003354891023903, "learning_rate": 1.900353404342284e-05, "loss": 1.2174, "step": 4196 }, { "epoch": 0.17, "grad_norm": 8.304299760505899, "learning_rate": 1.9002966260426396e-05, "loss": 1.3525, "step": 4197 }, { "epoch": 0.17, "grad_norm": 4.729549876712869, "learning_rate": 1.9002398324202583e-05, "loss": 1.2705, "step": 4198 }, { "epoch": 0.17, "grad_norm": 5.768477788732904, "learning_rate": 1.9001830234761074e-05, "loss": 1.2811, "step": 4199 }, { "epoch": 0.17, "grad_norm": 5.340185716854088, "learning_rate": 1.9001261992111535e-05, "loss": 1.0455, "step": 4200 }, { "epoch": 0.17, "grad_norm": 6.127369634443601, "learning_rate": 1.9000693596263636e-05, "loss": 1.2094, "step": 4201 }, { "epoch": 0.17, "grad_norm": 10.838092044002746, "learning_rate": 1.9000125047227053e-05, "loss": 1.1596, "step": 4202 }, { "epoch": 0.17, "grad_norm": 4.931177384866107, "learning_rate": 1.8999556345011464e-05, "loss": 1.2924, "step": 4203 }, { "epoch": 0.17, "grad_norm": 6.225745293601149, "learning_rate": 1.8998987489626545e-05, "loss": 1.1341, "step": 4204 }, { "epoch": 0.17, "grad_norm": 7.735379037971675, "learning_rate": 1.8998418481081975e-05, "loss": 1.153, "step": 4205 }, { "epoch": 0.17, "grad_norm": 8.758671824270253, "learning_rate": 1.8997849319387448e-05, "loss": 1.3788, "step": 4206 }, { "epoch": 0.17, "grad_norm": 6.925834453560451, "learning_rate": 1.899728000455264e-05, "loss": 1.042, "step": 4207 }, { "epoch": 0.17, "grad_norm": 3.390627915233522, "learning_rate": 1.899671053658725e-05, "loss": 1.1889, "step": 4208 }, { "epoch": 0.17, "grad_norm": 8.5423754870824, "learning_rate": 1.8996140915500964e-05, "loss": 0.9527, "step": 4209 }, { "epoch": 0.17, "grad_norm": 10.117016306735069, "learning_rate": 1.899557114130348e-05, "loss": 1.2741, "step": 4210 }, { "epoch": 0.17, "grad_norm": 5.208528470147233, "learning_rate": 1.8995001214004487e-05, "loss": 1.2389, "step": 4211 }, { "epoch": 0.17, "grad_norm": 6.4466878052555225, "learning_rate": 1.8994431133613695e-05, "loss": 0.9779, "step": 4212 }, { "epoch": 0.17, "grad_norm": 9.537736222068103, "learning_rate": 1.8993860900140807e-05, "loss": 1.3798, "step": 4213 }, { "epoch": 0.17, "grad_norm": 7.16920820506357, "learning_rate": 1.899329051359552e-05, "loss": 1.2395, "step": 4214 }, { "epoch": 0.17, "grad_norm": 5.372975613187559, "learning_rate": 1.899271997398755e-05, "loss": 1.2014, "step": 4215 }, { "epoch": 0.17, "grad_norm": 7.656028995646227, "learning_rate": 1.8992149281326603e-05, "loss": 1.3257, "step": 4216 }, { "epoch": 0.17, "grad_norm": 6.5945922563721915, "learning_rate": 1.899157843562239e-05, "loss": 1.3995, "step": 4217 }, { "epoch": 0.17, "grad_norm": 6.903911745808679, "learning_rate": 1.8991007436884633e-05, "loss": 1.4616, "step": 4218 }, { "epoch": 0.17, "grad_norm": 5.415036367593793, "learning_rate": 1.8990436285123042e-05, "loss": 1.2587, "step": 4219 }, { "epoch": 0.17, "grad_norm": 6.2093674135774135, "learning_rate": 1.898986498034735e-05, "loss": 1.3395, "step": 4220 }, { "epoch": 0.17, "grad_norm": 6.123614662395101, "learning_rate": 1.8989293522567265e-05, "loss": 1.3079, "step": 4221 }, { "epoch": 0.17, "grad_norm": 7.206658805395513, "learning_rate": 1.8988721911792523e-05, "loss": 1.3163, "step": 4222 }, { "epoch": 0.17, "grad_norm": 5.888924777133074, "learning_rate": 1.8988150148032854e-05, "loss": 1.1933, "step": 4223 }, { "epoch": 0.17, "grad_norm": 6.965790676796993, "learning_rate": 1.898757823129798e-05, "loss": 1.1544, "step": 4224 }, { "epoch": 0.17, "grad_norm": 7.155195982004498, "learning_rate": 1.8987006161597643e-05, "loss": 1.1177, "step": 4225 }, { "epoch": 0.17, "grad_norm": 8.097808316632964, "learning_rate": 1.8986433938941577e-05, "loss": 1.5003, "step": 4226 }, { "epoch": 0.17, "grad_norm": 7.7449644979935615, "learning_rate": 1.898586156333952e-05, "loss": 1.2691, "step": 4227 }, { "epoch": 0.17, "grad_norm": 6.103774421891713, "learning_rate": 1.8985289034801212e-05, "loss": 1.243, "step": 4228 }, { "epoch": 0.17, "grad_norm": 6.498461143290428, "learning_rate": 1.8984716353336404e-05, "loss": 1.3659, "step": 4229 }, { "epoch": 0.17, "grad_norm": 6.294445868735889, "learning_rate": 1.8984143518954838e-05, "loss": 1.3825, "step": 4230 }, { "epoch": 0.17, "grad_norm": 7.757614800385568, "learning_rate": 1.898357053166626e-05, "loss": 1.4647, "step": 4231 }, { "epoch": 0.17, "grad_norm": 6.556306499899737, "learning_rate": 1.898299739148043e-05, "loss": 0.8725, "step": 4232 }, { "epoch": 0.17, "grad_norm": 7.793241043531141, "learning_rate": 1.89824240984071e-05, "loss": 1.1133, "step": 4233 }, { "epoch": 0.17, "grad_norm": 5.018227606322503, "learning_rate": 1.8981850652456024e-05, "loss": 1.0434, "step": 4234 }, { "epoch": 0.17, "grad_norm": 7.361342587091893, "learning_rate": 1.8981277053636963e-05, "loss": 1.4224, "step": 4235 }, { "epoch": 0.17, "grad_norm": 5.147018316009382, "learning_rate": 1.898070330195968e-05, "loss": 1.5018, "step": 4236 }, { "epoch": 0.17, "grad_norm": 7.098059028257036, "learning_rate": 1.898012939743394e-05, "loss": 1.6081, "step": 4237 }, { "epoch": 0.17, "grad_norm": 5.150083170476777, "learning_rate": 1.8979555340069512e-05, "loss": 1.152, "step": 4238 }, { "epoch": 0.17, "grad_norm": 5.995017937740366, "learning_rate": 1.8978981129876165e-05, "loss": 1.307, "step": 4239 }, { "epoch": 0.17, "grad_norm": 6.7810164915567235, "learning_rate": 1.897840676686367e-05, "loss": 1.1702, "step": 4240 }, { "epoch": 0.17, "grad_norm": 5.083915294583586, "learning_rate": 1.897783225104181e-05, "loss": 0.9839, "step": 4241 }, { "epoch": 0.17, "grad_norm": 5.6308581010287515, "learning_rate": 1.8977257582420354e-05, "loss": 1.4667, "step": 4242 }, { "epoch": 0.17, "grad_norm": 7.216692789242337, "learning_rate": 1.8976682761009086e-05, "loss": 0.992, "step": 4243 }, { "epoch": 0.17, "grad_norm": 6.344948975694708, "learning_rate": 1.897610778681779e-05, "loss": 1.454, "step": 4244 }, { "epoch": 0.17, "grad_norm": 4.594603256448575, "learning_rate": 1.897553265985625e-05, "loss": 1.2748, "step": 4245 }, { "epoch": 0.17, "grad_norm": 5.175387913595413, "learning_rate": 1.8974957380134258e-05, "loss": 1.1561, "step": 4246 }, { "epoch": 0.17, "grad_norm": 4.154112426744577, "learning_rate": 1.8974381947661604e-05, "loss": 1.0737, "step": 4247 }, { "epoch": 0.17, "grad_norm": 5.022794834860591, "learning_rate": 1.8973806362448076e-05, "loss": 1.1573, "step": 4248 }, { "epoch": 0.17, "grad_norm": 6.425517336601293, "learning_rate": 1.897323062450348e-05, "loss": 1.2308, "step": 4249 }, { "epoch": 0.17, "grad_norm": 4.99113089443778, "learning_rate": 1.8972654733837605e-05, "loss": 1.2826, "step": 4250 }, { "epoch": 0.17, "grad_norm": 4.732592895401215, "learning_rate": 1.897207869046026e-05, "loss": 1.1966, "step": 4251 }, { "epoch": 0.17, "grad_norm": 3.7832659368942547, "learning_rate": 1.8971502494381244e-05, "loss": 1.0807, "step": 4252 }, { "epoch": 0.17, "grad_norm": 3.661746696794355, "learning_rate": 1.8970926145610367e-05, "loss": 1.1946, "step": 4253 }, { "epoch": 0.17, "grad_norm": 9.005951955564505, "learning_rate": 1.8970349644157438e-05, "loss": 1.195, "step": 4254 }, { "epoch": 0.17, "grad_norm": 6.1558635044994485, "learning_rate": 1.8969772990032267e-05, "loss": 1.4774, "step": 4255 }, { "epoch": 0.17, "grad_norm": 8.09245462302887, "learning_rate": 1.896919618324467e-05, "loss": 1.4298, "step": 4256 }, { "epoch": 0.17, "grad_norm": 3.397221070844202, "learning_rate": 1.8968619223804464e-05, "loss": 1.0146, "step": 4257 }, { "epoch": 0.17, "grad_norm": 6.000394952102271, "learning_rate": 1.8968042111721464e-05, "loss": 1.5147, "step": 4258 }, { "epoch": 0.17, "grad_norm": 4.628687354439735, "learning_rate": 1.89674648470055e-05, "loss": 1.2967, "step": 4259 }, { "epoch": 0.17, "grad_norm": 7.47750713636404, "learning_rate": 1.896688742966639e-05, "loss": 1.3162, "step": 4260 }, { "epoch": 0.17, "grad_norm": 4.393961090797919, "learning_rate": 1.8966309859713972e-05, "loss": 1.2164, "step": 4261 }, { "epoch": 0.17, "grad_norm": 5.278199991008887, "learning_rate": 1.896573213715806e-05, "loss": 1.14, "step": 4262 }, { "epoch": 0.17, "grad_norm": 9.150660068598242, "learning_rate": 1.8965154262008504e-05, "loss": 1.5391, "step": 4263 }, { "epoch": 0.17, "grad_norm": 5.995955815178639, "learning_rate": 1.8964576234275123e-05, "loss": 1.4985, "step": 4264 }, { "epoch": 0.17, "grad_norm": 6.339509295519993, "learning_rate": 1.8963998053967767e-05, "loss": 1.2305, "step": 4265 }, { "epoch": 0.17, "grad_norm": 6.222936851594538, "learning_rate": 1.896341972109627e-05, "loss": 1.3357, "step": 4266 }, { "epoch": 0.17, "grad_norm": 8.773137705096902, "learning_rate": 1.8962841235670475e-05, "loss": 1.3041, "step": 4267 }, { "epoch": 0.17, "grad_norm": 6.740417396603743, "learning_rate": 1.8962262597700233e-05, "loss": 1.1314, "step": 4268 }, { "epoch": 0.17, "grad_norm": 4.373949504056772, "learning_rate": 1.8961683807195387e-05, "loss": 1.1384, "step": 4269 }, { "epoch": 0.17, "grad_norm": 3.7772118498666343, "learning_rate": 1.896110486416579e-05, "loss": 1.0885, "step": 4270 }, { "epoch": 0.17, "grad_norm": 5.665475800644459, "learning_rate": 1.8960525768621296e-05, "loss": 1.3285, "step": 4271 }, { "epoch": 0.17, "grad_norm": 5.549486977308858, "learning_rate": 1.8959946520571755e-05, "loss": 1.4301, "step": 4272 }, { "epoch": 0.17, "grad_norm": 5.838483660110163, "learning_rate": 1.8959367120027035e-05, "loss": 1.356, "step": 4273 }, { "epoch": 0.17, "grad_norm": 5.684825200050149, "learning_rate": 1.895878756699699e-05, "loss": 1.1585, "step": 4274 }, { "epoch": 0.17, "grad_norm": 6.263663518684972, "learning_rate": 1.8958207861491488e-05, "loss": 1.4996, "step": 4275 }, { "epoch": 0.17, "grad_norm": 5.551195626193033, "learning_rate": 1.8957628003520394e-05, "loss": 1.2472, "step": 4276 }, { "epoch": 0.17, "grad_norm": 5.697831817455648, "learning_rate": 1.8957047993093578e-05, "loss": 1.2591, "step": 4277 }, { "epoch": 0.17, "grad_norm": 5.077108015498099, "learning_rate": 1.895646783022091e-05, "loss": 1.1692, "step": 4278 }, { "epoch": 0.17, "grad_norm": 5.794474729322474, "learning_rate": 1.895588751491226e-05, "loss": 1.0932, "step": 4279 }, { "epoch": 0.17, "grad_norm": 4.106190316816176, "learning_rate": 1.8955307047177513e-05, "loss": 1.1443, "step": 4280 }, { "epoch": 0.17, "grad_norm": 7.73926457438282, "learning_rate": 1.8954726427026545e-05, "loss": 1.1674, "step": 4281 }, { "epoch": 0.17, "grad_norm": 5.990018960399158, "learning_rate": 1.8954145654469238e-05, "loss": 1.4399, "step": 4282 }, { "epoch": 0.17, "grad_norm": 4.793995788615614, "learning_rate": 1.8953564729515477e-05, "loss": 1.0895, "step": 4283 }, { "epoch": 0.17, "grad_norm": 6.321907130397331, "learning_rate": 1.8952983652175145e-05, "loss": 1.1901, "step": 4284 }, { "epoch": 0.17, "grad_norm": 5.096628903280536, "learning_rate": 1.895240242245814e-05, "loss": 1.4192, "step": 4285 }, { "epoch": 0.17, "grad_norm": 6.436370493843423, "learning_rate": 1.8951821040374345e-05, "loss": 1.1864, "step": 4286 }, { "epoch": 0.17, "grad_norm": 6.876592943922851, "learning_rate": 1.8951239505933663e-05, "loss": 1.3472, "step": 4287 }, { "epoch": 0.17, "grad_norm": 9.36632982465595, "learning_rate": 1.8950657819145987e-05, "loss": 1.3469, "step": 4288 }, { "epoch": 0.17, "grad_norm": 4.814468973651315, "learning_rate": 1.8950075980021215e-05, "loss": 1.2157, "step": 4289 }, { "epoch": 0.17, "grad_norm": 4.715411853835615, "learning_rate": 1.8949493988569255e-05, "loss": 1.034, "step": 4290 }, { "epoch": 0.17, "grad_norm": 4.947809600546948, "learning_rate": 1.894891184480001e-05, "loss": 1.3961, "step": 4291 }, { "epoch": 0.17, "grad_norm": 5.709186018797437, "learning_rate": 1.894832954872339e-05, "loss": 1.2778, "step": 4292 }, { "epoch": 0.17, "grad_norm": 5.162888226498755, "learning_rate": 1.8947747100349298e-05, "loss": 1.2603, "step": 4293 }, { "epoch": 0.17, "grad_norm": 5.629955481142523, "learning_rate": 1.8947164499687655e-05, "loss": 1.2178, "step": 4294 }, { "epoch": 0.17, "grad_norm": 6.0233820489206, "learning_rate": 1.8946581746748374e-05, "loss": 1.3454, "step": 4295 }, { "epoch": 0.17, "grad_norm": 4.927142539383322, "learning_rate": 1.8945998841541374e-05, "loss": 1.361, "step": 4296 }, { "epoch": 0.17, "grad_norm": 7.168033191393253, "learning_rate": 1.8945415784076574e-05, "loss": 1.3914, "step": 4297 }, { "epoch": 0.17, "grad_norm": 4.437293337136122, "learning_rate": 1.89448325743639e-05, "loss": 0.9796, "step": 4298 }, { "epoch": 0.17, "grad_norm": 6.824913662082511, "learning_rate": 1.8944249212413276e-05, "loss": 1.4317, "step": 4299 }, { "epoch": 0.17, "grad_norm": 6.028606151290398, "learning_rate": 1.894366569823463e-05, "loss": 1.336, "step": 4300 }, { "epoch": 0.17, "grad_norm": 6.328452649476066, "learning_rate": 1.8943082031837897e-05, "loss": 1.2715, "step": 4301 }, { "epoch": 0.17, "grad_norm": 6.794809047626491, "learning_rate": 1.8942498213233004e-05, "loss": 1.4298, "step": 4302 }, { "epoch": 0.17, "grad_norm": 7.135302713747916, "learning_rate": 1.8941914242429897e-05, "loss": 1.3603, "step": 4303 }, { "epoch": 0.17, "grad_norm": 6.365593516392254, "learning_rate": 1.8941330119438506e-05, "loss": 1.3723, "step": 4304 }, { "epoch": 0.17, "grad_norm": 5.132494157221942, "learning_rate": 1.894074584426878e-05, "loss": 1.2061, "step": 4305 }, { "epoch": 0.17, "grad_norm": 6.733069821896607, "learning_rate": 1.8940161416930656e-05, "loss": 1.3013, "step": 4306 }, { "epoch": 0.17, "grad_norm": 6.9387316338394776, "learning_rate": 1.8939576837434084e-05, "loss": 1.241, "step": 4307 }, { "epoch": 0.17, "grad_norm": 6.355142967288342, "learning_rate": 1.8938992105789012e-05, "loss": 1.153, "step": 4308 }, { "epoch": 0.17, "grad_norm": 5.696270706624069, "learning_rate": 1.8938407222005398e-05, "loss": 1.4796, "step": 4309 }, { "epoch": 0.17, "grad_norm": 7.613084490145604, "learning_rate": 1.893782218609319e-05, "loss": 1.1218, "step": 4310 }, { "epoch": 0.17, "grad_norm": 6.41515307852449, "learning_rate": 1.8937236998062344e-05, "loss": 1.0516, "step": 4311 }, { "epoch": 0.17, "grad_norm": 5.027167532917759, "learning_rate": 1.8936651657922827e-05, "loss": 1.2802, "step": 4312 }, { "epoch": 0.17, "grad_norm": 5.397578294957767, "learning_rate": 1.8936066165684598e-05, "loss": 1.1003, "step": 4313 }, { "epoch": 0.17, "grad_norm": 8.389898195355006, "learning_rate": 1.8935480521357616e-05, "loss": 1.3039, "step": 4314 }, { "epoch": 0.17, "grad_norm": 4.776881115905419, "learning_rate": 1.8934894724951855e-05, "loss": 1.4138, "step": 4315 }, { "epoch": 0.17, "grad_norm": 6.070534612381212, "learning_rate": 1.8934308776477283e-05, "loss": 1.2252, "step": 4316 }, { "epoch": 0.17, "grad_norm": 7.3668608436410175, "learning_rate": 1.8933722675943874e-05, "loss": 1.2269, "step": 4317 }, { "epoch": 0.17, "grad_norm": 7.138648330707551, "learning_rate": 1.8933136423361604e-05, "loss": 1.2831, "step": 4318 }, { "epoch": 0.17, "grad_norm": 5.907119050083833, "learning_rate": 1.8932550018740447e-05, "loss": 0.9935, "step": 4319 }, { "epoch": 0.17, "grad_norm": 5.649321260624856, "learning_rate": 1.8931963462090384e-05, "loss": 1.1005, "step": 4320 }, { "epoch": 0.17, "grad_norm": 6.5461662683998645, "learning_rate": 1.89313767534214e-05, "loss": 1.358, "step": 4321 }, { "epoch": 0.17, "grad_norm": 11.41675452869121, "learning_rate": 1.8930789892743485e-05, "loss": 1.2606, "step": 4322 }, { "epoch": 0.17, "grad_norm": 5.461829900212319, "learning_rate": 1.8930202880066618e-05, "loss": 0.9678, "step": 4323 }, { "epoch": 0.17, "grad_norm": 8.547301724679606, "learning_rate": 1.8929615715400792e-05, "loss": 1.3516, "step": 4324 }, { "epoch": 0.17, "grad_norm": 7.193958606566885, "learning_rate": 1.8929028398756008e-05, "loss": 1.3388, "step": 4325 }, { "epoch": 0.17, "grad_norm": 6.450831187389043, "learning_rate": 1.892844093014225e-05, "loss": 1.4321, "step": 4326 }, { "epoch": 0.17, "grad_norm": 5.4640168248911465, "learning_rate": 1.8927853309569526e-05, "loss": 1.3913, "step": 4327 }, { "epoch": 0.17, "grad_norm": 5.565248543857628, "learning_rate": 1.8927265537047832e-05, "loss": 1.3392, "step": 4328 }, { "epoch": 0.17, "grad_norm": 9.852347757732991, "learning_rate": 1.8926677612587177e-05, "loss": 1.2285, "step": 4329 }, { "epoch": 0.17, "grad_norm": 8.23178338552767, "learning_rate": 1.8926089536197563e-05, "loss": 1.226, "step": 4330 }, { "epoch": 0.17, "grad_norm": 5.697973490525278, "learning_rate": 1.8925501307889e-05, "loss": 1.349, "step": 4331 }, { "epoch": 0.17, "grad_norm": 10.859960368416996, "learning_rate": 1.8924912927671495e-05, "loss": 1.2498, "step": 4332 }, { "epoch": 0.17, "grad_norm": 7.595200223930764, "learning_rate": 1.8924324395555066e-05, "loss": 1.2582, "step": 4333 }, { "epoch": 0.17, "grad_norm": 7.144888384523135, "learning_rate": 1.8923735711549735e-05, "loss": 1.491, "step": 4334 }, { "epoch": 0.17, "grad_norm": 5.4057908836843165, "learning_rate": 1.892314687566551e-05, "loss": 1.2136, "step": 4335 }, { "epoch": 0.17, "grad_norm": 6.0447616586800015, "learning_rate": 1.8922557887912424e-05, "loss": 1.509, "step": 4336 }, { "epoch": 0.17, "grad_norm": 8.943509989891014, "learning_rate": 1.8921968748300495e-05, "loss": 1.5511, "step": 4337 }, { "epoch": 0.17, "grad_norm": 9.694518776077535, "learning_rate": 1.8921379456839747e-05, "loss": 1.1658, "step": 4338 }, { "epoch": 0.17, "grad_norm": 5.993000186310493, "learning_rate": 1.8920790013540218e-05, "loss": 1.3424, "step": 4339 }, { "epoch": 0.17, "grad_norm": 5.766819295212575, "learning_rate": 1.8920200418411933e-05, "loss": 1.0428, "step": 4340 }, { "epoch": 0.17, "grad_norm": 5.082405476770038, "learning_rate": 1.8919610671464933e-05, "loss": 1.0525, "step": 4341 }, { "epoch": 0.17, "grad_norm": 6.0136655816599385, "learning_rate": 1.891902077270925e-05, "loss": 1.3672, "step": 4342 }, { "epoch": 0.17, "grad_norm": 5.545127992661719, "learning_rate": 1.8918430722154924e-05, "loss": 1.3403, "step": 4343 }, { "epoch": 0.17, "grad_norm": 3.9411959338882334, "learning_rate": 1.8917840519812e-05, "loss": 1.2174, "step": 4344 }, { "epoch": 0.18, "grad_norm": 7.034176306693609, "learning_rate": 1.8917250165690523e-05, "loss": 1.5069, "step": 4345 }, { "epoch": 0.18, "grad_norm": 9.706656355508024, "learning_rate": 1.8916659659800535e-05, "loss": 1.0406, "step": 4346 }, { "epoch": 0.18, "grad_norm": 5.377725795847853, "learning_rate": 1.8916069002152096e-05, "loss": 1.4078, "step": 4347 }, { "epoch": 0.18, "grad_norm": 5.295479034387314, "learning_rate": 1.891547819275525e-05, "loss": 1.2969, "step": 4348 }, { "epoch": 0.18, "grad_norm": 6.218547221210901, "learning_rate": 1.891488723162006e-05, "loss": 1.2081, "step": 4349 }, { "epoch": 0.18, "grad_norm": 11.36522186300687, "learning_rate": 1.891429611875658e-05, "loss": 1.4318, "step": 4350 }, { "epoch": 0.18, "grad_norm": 4.436941180292118, "learning_rate": 1.8913704854174867e-05, "loss": 1.2503, "step": 4351 }, { "epoch": 0.18, "grad_norm": 9.136405922579668, "learning_rate": 1.8913113437884987e-05, "loss": 1.2039, "step": 4352 }, { "epoch": 0.18, "grad_norm": 6.8755568653850085, "learning_rate": 1.891252186989701e-05, "loss": 1.2686, "step": 4353 }, { "epoch": 0.18, "grad_norm": 7.400171292512557, "learning_rate": 1.8911930150221e-05, "loss": 1.3839, "step": 4354 }, { "epoch": 0.18, "grad_norm": 6.855400962344863, "learning_rate": 1.8911338278867028e-05, "loss": 1.2567, "step": 4355 }, { "epoch": 0.18, "grad_norm": 4.982819755432895, "learning_rate": 1.8910746255845168e-05, "loss": 1.2508, "step": 4356 }, { "epoch": 0.18, "grad_norm": 7.231276469993783, "learning_rate": 1.8910154081165494e-05, "loss": 1.5624, "step": 4357 }, { "epoch": 0.18, "grad_norm": 3.8790626975499602, "learning_rate": 1.890956175483809e-05, "loss": 1.2836, "step": 4358 }, { "epoch": 0.18, "grad_norm": 6.931027291510924, "learning_rate": 1.890896927687303e-05, "loss": 1.6742, "step": 4359 }, { "epoch": 0.18, "grad_norm": 6.6187023880244125, "learning_rate": 1.8908376647280404e-05, "loss": 1.0636, "step": 4360 }, { "epoch": 0.18, "grad_norm": 4.705412210480778, "learning_rate": 1.8907783866070296e-05, "loss": 1.2547, "step": 4361 }, { "epoch": 0.18, "grad_norm": 7.181676317917846, "learning_rate": 1.8907190933252794e-05, "loss": 1.1798, "step": 4362 }, { "epoch": 0.18, "grad_norm": 9.81482616461269, "learning_rate": 1.890659784883799e-05, "loss": 1.3226, "step": 4363 }, { "epoch": 0.18, "grad_norm": 5.953661017174649, "learning_rate": 1.8906004612835977e-05, "loss": 1.2867, "step": 4364 }, { "epoch": 0.18, "grad_norm": 6.144618242040318, "learning_rate": 1.8905411225256856e-05, "loss": 1.3516, "step": 4365 }, { "epoch": 0.18, "grad_norm": 5.022810711668597, "learning_rate": 1.890481768611072e-05, "loss": 1.205, "step": 4366 }, { "epoch": 0.18, "grad_norm": 11.071178090962952, "learning_rate": 1.890422399540768e-05, "loss": 1.3281, "step": 4367 }, { "epoch": 0.18, "grad_norm": 5.751429598600528, "learning_rate": 1.890363015315783e-05, "loss": 1.3129, "step": 4368 }, { "epoch": 0.18, "grad_norm": 6.597953196327177, "learning_rate": 1.8903036159371282e-05, "loss": 1.2595, "step": 4369 }, { "epoch": 0.18, "grad_norm": 6.617677540853075, "learning_rate": 1.8902442014058144e-05, "loss": 1.0626, "step": 4370 }, { "epoch": 0.18, "grad_norm": 10.074914047144023, "learning_rate": 1.890184771722853e-05, "loss": 1.2469, "step": 4371 }, { "epoch": 0.18, "grad_norm": 7.881060101272912, "learning_rate": 1.8901253268892557e-05, "loss": 1.2362, "step": 4372 }, { "epoch": 0.18, "grad_norm": 6.467233175883716, "learning_rate": 1.8900658669060336e-05, "loss": 1.4732, "step": 4373 }, { "epoch": 0.18, "grad_norm": 7.745584029051529, "learning_rate": 1.890006391774199e-05, "loss": 1.2648, "step": 4374 }, { "epoch": 0.18, "grad_norm": 6.582305153260924, "learning_rate": 1.8899469014947644e-05, "loss": 1.3636, "step": 4375 }, { "epoch": 0.18, "grad_norm": 5.689762419806045, "learning_rate": 1.889887396068742e-05, "loss": 1.4946, "step": 4376 }, { "epoch": 0.18, "grad_norm": 5.256219532959471, "learning_rate": 1.8898278754971445e-05, "loss": 1.3302, "step": 4377 }, { "epoch": 0.18, "grad_norm": 7.493968695076601, "learning_rate": 1.8897683397809854e-05, "loss": 1.2045, "step": 4378 }, { "epoch": 0.18, "grad_norm": 6.336877559634579, "learning_rate": 1.8897087889212772e-05, "loss": 1.3283, "step": 4379 }, { "epoch": 0.18, "grad_norm": 7.570182029669496, "learning_rate": 1.889649222919034e-05, "loss": 1.3018, "step": 4380 }, { "epoch": 0.18, "grad_norm": 6.045856246825819, "learning_rate": 1.8895896417752694e-05, "loss": 1.5388, "step": 4381 }, { "epoch": 0.18, "grad_norm": 4.911413200736854, "learning_rate": 1.8895300454909976e-05, "loss": 1.2405, "step": 4382 }, { "epoch": 0.18, "grad_norm": 5.085391023456443, "learning_rate": 1.889470434067233e-05, "loss": 1.3358, "step": 4383 }, { "epoch": 0.18, "grad_norm": 11.417037606037555, "learning_rate": 1.88941080750499e-05, "loss": 1.3578, "step": 4384 }, { "epoch": 0.18, "grad_norm": 5.712752240324182, "learning_rate": 1.889351165805283e-05, "loss": 0.9522, "step": 4385 }, { "epoch": 0.18, "grad_norm": 8.154277962239984, "learning_rate": 1.8892915089691276e-05, "loss": 1.1169, "step": 4386 }, { "epoch": 0.18, "grad_norm": 7.375079693148147, "learning_rate": 1.889231836997539e-05, "loss": 1.2072, "step": 4387 }, { "epoch": 0.18, "grad_norm": 8.085273106848625, "learning_rate": 1.889172149891533e-05, "loss": 1.5563, "step": 4388 }, { "epoch": 0.18, "grad_norm": 6.324312061107912, "learning_rate": 1.8891124476521253e-05, "loss": 1.1974, "step": 4389 }, { "epoch": 0.18, "grad_norm": 7.394357947145928, "learning_rate": 1.889052730280332e-05, "loss": 1.2813, "step": 4390 }, { "epoch": 0.18, "grad_norm": 9.690621147828054, "learning_rate": 1.888992997777169e-05, "loss": 1.0738, "step": 4391 }, { "epoch": 0.18, "grad_norm": 8.034193477990593, "learning_rate": 1.888933250143654e-05, "loss": 1.289, "step": 4392 }, { "epoch": 0.18, "grad_norm": 7.930537294931636, "learning_rate": 1.888873487380803e-05, "loss": 1.3259, "step": 4393 }, { "epoch": 0.18, "grad_norm": 6.787568030138284, "learning_rate": 1.888813709489633e-05, "loss": 1.4088, "step": 4394 }, { "epoch": 0.18, "grad_norm": 7.801297225045891, "learning_rate": 1.888753916471162e-05, "loss": 1.5406, "step": 4395 }, { "epoch": 0.18, "grad_norm": 6.150690603786375, "learning_rate": 1.888694108326408e-05, "loss": 0.9393, "step": 4396 }, { "epoch": 0.18, "grad_norm": 7.127693256107507, "learning_rate": 1.888634285056388e-05, "loss": 1.2916, "step": 4397 }, { "epoch": 0.18, "grad_norm": 6.639459294737565, "learning_rate": 1.8885744466621206e-05, "loss": 1.338, "step": 4398 }, { "epoch": 0.18, "grad_norm": 4.808000470405496, "learning_rate": 1.888514593144624e-05, "loss": 1.0546, "step": 4399 }, { "epoch": 0.18, "grad_norm": 5.008501925098046, "learning_rate": 1.888454724504917e-05, "loss": 1.1262, "step": 4400 }, { "epoch": 0.18, "grad_norm": 4.146473411509136, "learning_rate": 1.888394840744019e-05, "loss": 1.1292, "step": 4401 }, { "epoch": 0.18, "grad_norm": 8.8389111086878, "learning_rate": 1.8883349418629487e-05, "loss": 1.7098, "step": 4402 }, { "epoch": 0.18, "grad_norm": 5.79196616933261, "learning_rate": 1.8882750278627254e-05, "loss": 1.3485, "step": 4403 }, { "epoch": 0.18, "grad_norm": 6.733437592596033, "learning_rate": 1.888215098744369e-05, "loss": 1.6601, "step": 4404 }, { "epoch": 0.18, "grad_norm": 4.989369394058566, "learning_rate": 1.8881551545089e-05, "loss": 1.1521, "step": 4405 }, { "epoch": 0.18, "grad_norm": 4.549303542577163, "learning_rate": 1.8880951951573378e-05, "loss": 1.3346, "step": 4406 }, { "epoch": 0.18, "grad_norm": 5.332827649702478, "learning_rate": 1.8880352206907037e-05, "loss": 1.4142, "step": 4407 }, { "epoch": 0.18, "grad_norm": 6.1733283381223085, "learning_rate": 1.8879752311100176e-05, "loss": 1.3729, "step": 4408 }, { "epoch": 0.18, "grad_norm": 6.750456079645053, "learning_rate": 1.887915226416301e-05, "loss": 1.2957, "step": 4409 }, { "epoch": 0.18, "grad_norm": 4.614484725518983, "learning_rate": 1.8878552066105752e-05, "loss": 1.2285, "step": 4410 }, { "epoch": 0.18, "grad_norm": 4.194949779984246, "learning_rate": 1.8877951716938618e-05, "loss": 1.0329, "step": 4411 }, { "epoch": 0.18, "grad_norm": 5.6544631414064614, "learning_rate": 1.8877351216671817e-05, "loss": 1.3231, "step": 4412 }, { "epoch": 0.18, "grad_norm": 6.4397153094455915, "learning_rate": 1.887675056531558e-05, "loss": 1.2889, "step": 4413 }, { "epoch": 0.18, "grad_norm": 6.399622064545052, "learning_rate": 1.8876149762880123e-05, "loss": 0.9711, "step": 4414 }, { "epoch": 0.18, "grad_norm": 7.007096995843496, "learning_rate": 1.887554880937568e-05, "loss": 1.2303, "step": 4415 }, { "epoch": 0.18, "grad_norm": 6.859219769918822, "learning_rate": 1.887494770481247e-05, "loss": 1.2725, "step": 4416 }, { "epoch": 0.18, "grad_norm": 4.3695845494951095, "learning_rate": 1.8874346449200727e-05, "loss": 1.0699, "step": 4417 }, { "epoch": 0.18, "grad_norm": 6.642858324588418, "learning_rate": 1.8873745042550685e-05, "loss": 1.3867, "step": 4418 }, { "epoch": 0.18, "grad_norm": 7.288058024253876, "learning_rate": 1.8873143484872577e-05, "loss": 1.3379, "step": 4419 }, { "epoch": 0.18, "grad_norm": 5.345322234192439, "learning_rate": 1.8872541776176644e-05, "loss": 1.2178, "step": 4420 }, { "epoch": 0.18, "grad_norm": 4.664321215138521, "learning_rate": 1.8871939916473126e-05, "loss": 1.2527, "step": 4421 }, { "epoch": 0.18, "grad_norm": 5.94631377783516, "learning_rate": 1.8871337905772264e-05, "loss": 1.3158, "step": 4422 }, { "epoch": 0.18, "grad_norm": 4.996948982237108, "learning_rate": 1.8870735744084313e-05, "loss": 1.4475, "step": 4423 }, { "epoch": 0.18, "grad_norm": 5.251995088309851, "learning_rate": 1.8870133431419506e-05, "loss": 1.2211, "step": 4424 }, { "epoch": 0.18, "grad_norm": 4.400090132349033, "learning_rate": 1.886953096778811e-05, "loss": 1.1054, "step": 4425 }, { "epoch": 0.18, "grad_norm": 5.661036230029953, "learning_rate": 1.8868928353200367e-05, "loss": 1.0564, "step": 4426 }, { "epoch": 0.18, "grad_norm": 4.217034556981774, "learning_rate": 1.886832558766654e-05, "loss": 0.9428, "step": 4427 }, { "epoch": 0.18, "grad_norm": 3.796610734406143, "learning_rate": 1.886772267119689e-05, "loss": 1.1896, "step": 4428 }, { "epoch": 0.18, "grad_norm": 4.995910908699163, "learning_rate": 1.8867119603801673e-05, "loss": 1.5249, "step": 4429 }, { "epoch": 0.18, "grad_norm": 6.477827035439924, "learning_rate": 1.8866516385491152e-05, "loss": 1.4581, "step": 4430 }, { "epoch": 0.18, "grad_norm": 4.389102548868236, "learning_rate": 1.8865913016275596e-05, "loss": 1.1133, "step": 4431 }, { "epoch": 0.18, "grad_norm": 6.5281028767873845, "learning_rate": 1.8865309496165275e-05, "loss": 1.1693, "step": 4432 }, { "epoch": 0.18, "grad_norm": 6.215969664441569, "learning_rate": 1.886470582517046e-05, "loss": 0.9721, "step": 4433 }, { "epoch": 0.18, "grad_norm": 6.052442293107154, "learning_rate": 1.8864102003301426e-05, "loss": 1.2633, "step": 4434 }, { "epoch": 0.18, "grad_norm": 4.382132576376022, "learning_rate": 1.8863498030568448e-05, "loss": 1.3172, "step": 4435 }, { "epoch": 0.18, "grad_norm": 5.269198782687323, "learning_rate": 1.8862893906981805e-05, "loss": 1.4431, "step": 4436 }, { "epoch": 0.18, "grad_norm": 5.033302687913594, "learning_rate": 1.886228963255178e-05, "loss": 1.0094, "step": 4437 }, { "epoch": 0.18, "grad_norm": 5.0596317545262295, "learning_rate": 1.886168520728866e-05, "loss": 1.3735, "step": 4438 }, { "epoch": 0.18, "grad_norm": 5.914637482848103, "learning_rate": 1.886108063120273e-05, "loss": 1.3126, "step": 4439 }, { "epoch": 0.18, "grad_norm": 4.4171174897263095, "learning_rate": 1.886047590430428e-05, "loss": 1.2406, "step": 4440 }, { "epoch": 0.18, "grad_norm": 5.591025540082884, "learning_rate": 1.88598710266036e-05, "loss": 1.4657, "step": 4441 }, { "epoch": 0.18, "grad_norm": 6.631936261743646, "learning_rate": 1.8859265998110986e-05, "loss": 1.5003, "step": 4442 }, { "epoch": 0.18, "grad_norm": 7.117394961107376, "learning_rate": 1.885866081883674e-05, "loss": 1.315, "step": 4443 }, { "epoch": 0.18, "grad_norm": 4.73695124662593, "learning_rate": 1.8858055488791155e-05, "loss": 1.2914, "step": 4444 }, { "epoch": 0.18, "grad_norm": 7.653257776719424, "learning_rate": 1.885745000798454e-05, "loss": 1.0961, "step": 4445 }, { "epoch": 0.18, "grad_norm": 6.534803301492843, "learning_rate": 1.8856844376427194e-05, "loss": 1.1918, "step": 4446 }, { "epoch": 0.18, "grad_norm": 5.629857229163721, "learning_rate": 1.8856238594129426e-05, "loss": 1.1383, "step": 4447 }, { "epoch": 0.18, "grad_norm": 5.502217825642043, "learning_rate": 1.885563266110155e-05, "loss": 1.49, "step": 4448 }, { "epoch": 0.18, "grad_norm": 7.161735519050004, "learning_rate": 1.8855026577353872e-05, "loss": 1.2604, "step": 4449 }, { "epoch": 0.18, "grad_norm": 8.396950571496014, "learning_rate": 1.8854420342896716e-05, "loss": 1.3279, "step": 4450 }, { "epoch": 0.18, "grad_norm": 7.100674797603801, "learning_rate": 1.8853813957740392e-05, "loss": 1.2933, "step": 4451 }, { "epoch": 0.18, "grad_norm": 7.902191212169402, "learning_rate": 1.885320742189523e-05, "loss": 1.3177, "step": 4452 }, { "epoch": 0.18, "grad_norm": 7.666627676107658, "learning_rate": 1.885260073537154e-05, "loss": 1.2332, "step": 4453 }, { "epoch": 0.18, "grad_norm": 8.560875947090635, "learning_rate": 1.8851993898179655e-05, "loss": 1.2338, "step": 4454 }, { "epoch": 0.18, "grad_norm": 7.24279625717281, "learning_rate": 1.885138691032991e-05, "loss": 1.4482, "step": 4455 }, { "epoch": 0.18, "grad_norm": 5.561727693309757, "learning_rate": 1.8850779771832622e-05, "loss": 1.2817, "step": 4456 }, { "epoch": 0.18, "grad_norm": 5.979381781168331, "learning_rate": 1.885017248269813e-05, "loss": 1.6861, "step": 4457 }, { "epoch": 0.18, "grad_norm": 5.503194501713442, "learning_rate": 1.8849565042936775e-05, "loss": 1.2245, "step": 4458 }, { "epoch": 0.18, "grad_norm": 5.531002107179251, "learning_rate": 1.884895745255889e-05, "loss": 1.187, "step": 4459 }, { "epoch": 0.18, "grad_norm": 6.703881548803801, "learning_rate": 1.8848349711574813e-05, "loss": 1.1069, "step": 4460 }, { "epoch": 0.18, "grad_norm": 7.250100002370529, "learning_rate": 1.8847741819994895e-05, "loss": 1.4136, "step": 4461 }, { "epoch": 0.18, "grad_norm": 7.670462990005993, "learning_rate": 1.884713377782948e-05, "loss": 1.4004, "step": 4462 }, { "epoch": 0.18, "grad_norm": 6.875641512910392, "learning_rate": 1.884652558508891e-05, "loss": 1.1917, "step": 4463 }, { "epoch": 0.18, "grad_norm": 6.200519119639992, "learning_rate": 1.8845917241783547e-05, "loss": 1.2745, "step": 4464 }, { "epoch": 0.18, "grad_norm": 4.73768108435634, "learning_rate": 1.8845308747923735e-05, "loss": 0.9994, "step": 4465 }, { "epoch": 0.18, "grad_norm": 6.924002435505012, "learning_rate": 1.8844700103519836e-05, "loss": 1.5162, "step": 4466 }, { "epoch": 0.18, "grad_norm": 5.2661329633140275, "learning_rate": 1.884409130858221e-05, "loss": 1.2029, "step": 4467 }, { "epoch": 0.18, "grad_norm": 5.079354909110589, "learning_rate": 1.8843482363121213e-05, "loss": 1.4302, "step": 4468 }, { "epoch": 0.18, "grad_norm": 7.220705856247863, "learning_rate": 1.8842873267147214e-05, "loss": 1.3525, "step": 4469 }, { "epoch": 0.18, "grad_norm": 10.444105243229465, "learning_rate": 1.8842264020670578e-05, "loss": 1.2266, "step": 4470 }, { "epoch": 0.18, "grad_norm": 5.7769081668323645, "learning_rate": 1.8841654623701673e-05, "loss": 1.2906, "step": 4471 }, { "epoch": 0.18, "grad_norm": 6.053724802602392, "learning_rate": 1.8841045076250873e-05, "loss": 1.3564, "step": 4472 }, { "epoch": 0.18, "grad_norm": 5.6293095884343485, "learning_rate": 1.884043537832855e-05, "loss": 1.2075, "step": 4473 }, { "epoch": 0.18, "grad_norm": 10.198617731542617, "learning_rate": 1.8839825529945076e-05, "loss": 1.652, "step": 4474 }, { "epoch": 0.18, "grad_norm": 6.913149500051442, "learning_rate": 1.8839215531110842e-05, "loss": 1.3059, "step": 4475 }, { "epoch": 0.18, "grad_norm": 8.262451722333823, "learning_rate": 1.8838605381836225e-05, "loss": 1.4842, "step": 4476 }, { "epoch": 0.18, "grad_norm": 6.197614891059246, "learning_rate": 1.883799508213161e-05, "loss": 1.28, "step": 4477 }, { "epoch": 0.18, "grad_norm": 5.084304776563417, "learning_rate": 1.8837384632007376e-05, "loss": 1.0778, "step": 4478 }, { "epoch": 0.18, "grad_norm": 4.047925893665262, "learning_rate": 1.8836774031473923e-05, "loss": 1.2157, "step": 4479 }, { "epoch": 0.18, "grad_norm": 5.888862402044357, "learning_rate": 1.883616328054164e-05, "loss": 1.3635, "step": 4480 }, { "epoch": 0.18, "grad_norm": 9.256445800711791, "learning_rate": 1.883555237922092e-05, "loss": 1.3279, "step": 4481 }, { "epoch": 0.18, "grad_norm": 7.813584618718623, "learning_rate": 1.8834941327522162e-05, "loss": 1.2737, "step": 4482 }, { "epoch": 0.18, "grad_norm": 4.584289698864137, "learning_rate": 1.883433012545577e-05, "loss": 1.2498, "step": 4483 }, { "epoch": 0.18, "grad_norm": 7.414363332104274, "learning_rate": 1.8833718773032136e-05, "loss": 1.1305, "step": 4484 }, { "epoch": 0.18, "grad_norm": 6.0934537586672315, "learning_rate": 1.883310727026167e-05, "loss": 1.4291, "step": 4485 }, { "epoch": 0.18, "grad_norm": 7.885027733210101, "learning_rate": 1.8832495617154784e-05, "loss": 1.2216, "step": 4486 }, { "epoch": 0.18, "grad_norm": 4.977923895060429, "learning_rate": 1.8831883813721888e-05, "loss": 1.2047, "step": 4487 }, { "epoch": 0.18, "grad_norm": 7.341002217793443, "learning_rate": 1.8831271859973386e-05, "loss": 1.4237, "step": 4488 }, { "epoch": 0.18, "grad_norm": 7.299189659586374, "learning_rate": 1.88306597559197e-05, "loss": 1.4654, "step": 4489 }, { "epoch": 0.18, "grad_norm": 6.985043440378593, "learning_rate": 1.8830047501571247e-05, "loss": 1.3946, "step": 4490 }, { "epoch": 0.18, "grad_norm": 7.950037599983838, "learning_rate": 1.8829435096938447e-05, "loss": 1.4906, "step": 4491 }, { "epoch": 0.18, "grad_norm": 5.250181536800797, "learning_rate": 1.8828822542031723e-05, "loss": 1.2741, "step": 4492 }, { "epoch": 0.18, "grad_norm": 6.291412877206207, "learning_rate": 1.8828209836861496e-05, "loss": 1.5185, "step": 4493 }, { "epoch": 0.18, "grad_norm": 8.834209368600863, "learning_rate": 1.8827596981438202e-05, "loss": 1.35, "step": 4494 }, { "epoch": 0.18, "grad_norm": 5.145165080295693, "learning_rate": 1.882698397577227e-05, "loss": 1.0564, "step": 4495 }, { "epoch": 0.18, "grad_norm": 7.323295911055921, "learning_rate": 1.8826370819874128e-05, "loss": 0.9857, "step": 4496 }, { "epoch": 0.18, "grad_norm": 13.08787784288154, "learning_rate": 1.8825757513754215e-05, "loss": 1.5257, "step": 4497 }, { "epoch": 0.18, "grad_norm": 4.2022767310068145, "learning_rate": 1.882514405742297e-05, "loss": 0.9796, "step": 4498 }, { "epoch": 0.18, "grad_norm": 6.443515107702634, "learning_rate": 1.882453045089083e-05, "loss": 1.318, "step": 4499 }, { "epoch": 0.18, "grad_norm": 7.319359484340461, "learning_rate": 1.8823916694168242e-05, "loss": 1.1088, "step": 4500 }, { "epoch": 0.18, "grad_norm": 8.40264256631867, "learning_rate": 1.882330278726565e-05, "loss": 1.5257, "step": 4501 }, { "epoch": 0.18, "grad_norm": 6.078610948200963, "learning_rate": 1.8822688730193506e-05, "loss": 1.4138, "step": 4502 }, { "epoch": 0.18, "grad_norm": 4.329282975402542, "learning_rate": 1.8822074522962258e-05, "loss": 1.1031, "step": 4503 }, { "epoch": 0.18, "grad_norm": 8.121290600714152, "learning_rate": 1.8821460165582358e-05, "loss": 1.2793, "step": 4504 }, { "epoch": 0.18, "grad_norm": 8.63084566307773, "learning_rate": 1.8820845658064265e-05, "loss": 1.4674, "step": 4505 }, { "epoch": 0.18, "grad_norm": 4.54951902816772, "learning_rate": 1.8820231000418442e-05, "loss": 1.2521, "step": 4506 }, { "epoch": 0.18, "grad_norm": 4.486140188615365, "learning_rate": 1.8819616192655342e-05, "loss": 1.1938, "step": 4507 }, { "epoch": 0.18, "grad_norm": 7.652051170633008, "learning_rate": 1.881900123478543e-05, "loss": 1.3378, "step": 4508 }, { "epoch": 0.18, "grad_norm": 6.392778309910515, "learning_rate": 1.8818386126819173e-05, "loss": 1.239, "step": 4509 }, { "epoch": 0.18, "grad_norm": 8.444398620839957, "learning_rate": 1.8817770868767046e-05, "loss": 1.4517, "step": 4510 }, { "epoch": 0.18, "grad_norm": 4.79088154847275, "learning_rate": 1.8817155460639512e-05, "loss": 1.2352, "step": 4511 }, { "epoch": 0.18, "grad_norm": 5.183225135734056, "learning_rate": 1.881653990244705e-05, "loss": 1.0793, "step": 4512 }, { "epoch": 0.18, "grad_norm": 6.101924002078076, "learning_rate": 1.8815924194200138e-05, "loss": 1.08, "step": 4513 }, { "epoch": 0.18, "grad_norm": 4.164645497525183, "learning_rate": 1.881530833590925e-05, "loss": 1.1646, "step": 4514 }, { "epoch": 0.18, "grad_norm": 9.8367478485278, "learning_rate": 1.881469232758487e-05, "loss": 1.4225, "step": 4515 }, { "epoch": 0.18, "grad_norm": 4.336827465241128, "learning_rate": 1.8814076169237485e-05, "loss": 1.2651, "step": 4516 }, { "epoch": 0.18, "grad_norm": 8.18071839971492, "learning_rate": 1.8813459860877575e-05, "loss": 1.356, "step": 4517 }, { "epoch": 0.18, "grad_norm": 6.830981049738741, "learning_rate": 1.8812843402515638e-05, "loss": 1.3964, "step": 4518 }, { "epoch": 0.18, "grad_norm": 7.456459304913187, "learning_rate": 1.881222679416216e-05, "loss": 1.1463, "step": 4519 }, { "epoch": 0.18, "grad_norm": 5.229189759046091, "learning_rate": 1.8811610035827636e-05, "loss": 1.2574, "step": 4520 }, { "epoch": 0.18, "grad_norm": 7.781623820813202, "learning_rate": 1.8810993127522566e-05, "loss": 1.4418, "step": 4521 }, { "epoch": 0.18, "grad_norm": 6.15274658246238, "learning_rate": 1.8810376069257445e-05, "loss": 1.5474, "step": 4522 }, { "epoch": 0.18, "grad_norm": 4.396109744878242, "learning_rate": 1.880975886104278e-05, "loss": 1.0957, "step": 4523 }, { "epoch": 0.18, "grad_norm": 8.244068499328444, "learning_rate": 1.8809141502889073e-05, "loss": 1.4457, "step": 4524 }, { "epoch": 0.18, "grad_norm": 6.125810355867455, "learning_rate": 1.880852399480683e-05, "loss": 1.2754, "step": 4525 }, { "epoch": 0.18, "grad_norm": 8.157993597198173, "learning_rate": 1.8807906336806564e-05, "loss": 1.3595, "step": 4526 }, { "epoch": 0.18, "grad_norm": 3.8368633356268362, "learning_rate": 1.8807288528898786e-05, "loss": 1.2655, "step": 4527 }, { "epoch": 0.18, "grad_norm": 5.318300551986323, "learning_rate": 1.880667057109401e-05, "loss": 1.1486, "step": 4528 }, { "epoch": 0.18, "grad_norm": 5.092027253668999, "learning_rate": 1.8806052463402755e-05, "loss": 1.1191, "step": 4529 }, { "epoch": 0.18, "grad_norm": 8.022168399803412, "learning_rate": 1.8805434205835536e-05, "loss": 1.1899, "step": 4530 }, { "epoch": 0.18, "grad_norm": 5.558288559325908, "learning_rate": 1.8804815798402886e-05, "loss": 1.4704, "step": 4531 }, { "epoch": 0.18, "grad_norm": 5.490299621511517, "learning_rate": 1.880419724111532e-05, "loss": 0.9065, "step": 4532 }, { "epoch": 0.18, "grad_norm": 3.1961573242747745, "learning_rate": 1.8803578533983367e-05, "loss": 1.0668, "step": 4533 }, { "epoch": 0.18, "grad_norm": 5.157320858413727, "learning_rate": 1.8802959677017562e-05, "loss": 0.8709, "step": 4534 }, { "epoch": 0.18, "grad_norm": 8.830504418412042, "learning_rate": 1.8802340670228437e-05, "loss": 1.2743, "step": 4535 }, { "epoch": 0.18, "grad_norm": 4.858982838307921, "learning_rate": 1.8801721513626523e-05, "loss": 1.2311, "step": 4536 }, { "epoch": 0.18, "grad_norm": 9.406588774617404, "learning_rate": 1.880110220722236e-05, "loss": 1.319, "step": 4537 }, { "epoch": 0.18, "grad_norm": 4.00581185794077, "learning_rate": 1.880048275102649e-05, "loss": 1.1021, "step": 4538 }, { "epoch": 0.18, "grad_norm": 8.997605535728802, "learning_rate": 1.8799863145049454e-05, "loss": 1.4435, "step": 4539 }, { "epoch": 0.18, "grad_norm": 6.891767821457742, "learning_rate": 1.8799243389301796e-05, "loss": 1.2838, "step": 4540 }, { "epoch": 0.18, "grad_norm": 8.750310468618979, "learning_rate": 1.879862348379407e-05, "loss": 1.0947, "step": 4541 }, { "epoch": 0.18, "grad_norm": 3.4129831900261065, "learning_rate": 1.879800342853682e-05, "loss": 1.1225, "step": 4542 }, { "epoch": 0.18, "grad_norm": 4.4702797657156195, "learning_rate": 1.87973832235406e-05, "loss": 1.2621, "step": 4543 }, { "epoch": 0.18, "grad_norm": 4.623334244018804, "learning_rate": 1.8796762868815974e-05, "loss": 1.2064, "step": 4544 }, { "epoch": 0.18, "grad_norm": 5.718570690838693, "learning_rate": 1.879614236437349e-05, "loss": 1.0247, "step": 4545 }, { "epoch": 0.18, "grad_norm": 7.09042954782156, "learning_rate": 1.879552171022371e-05, "loss": 1.3439, "step": 4546 }, { "epoch": 0.18, "grad_norm": 6.225182128387661, "learning_rate": 1.8794900906377202e-05, "loss": 1.391, "step": 4547 }, { "epoch": 0.18, "grad_norm": 6.404872875788206, "learning_rate": 1.879427995284453e-05, "loss": 1.2436, "step": 4548 }, { "epoch": 0.18, "grad_norm": 7.30569607372079, "learning_rate": 1.879365884963626e-05, "loss": 1.4946, "step": 4549 }, { "epoch": 0.18, "grad_norm": 4.272256502943383, "learning_rate": 1.8793037596762967e-05, "loss": 0.8016, "step": 4550 }, { "epoch": 0.18, "grad_norm": 6.10225632387139, "learning_rate": 1.8792416194235223e-05, "loss": 1.438, "step": 4551 }, { "epoch": 0.18, "grad_norm": 4.320523792105241, "learning_rate": 1.8791794642063604e-05, "loss": 0.9384, "step": 4552 }, { "epoch": 0.18, "grad_norm": 4.731900497466726, "learning_rate": 1.8791172940258685e-05, "loss": 1.263, "step": 4553 }, { "epoch": 0.18, "grad_norm": 6.586534174244404, "learning_rate": 1.8790551088831054e-05, "loss": 1.3709, "step": 4554 }, { "epoch": 0.18, "grad_norm": 6.212164205088169, "learning_rate": 1.878992908779129e-05, "loss": 1.3088, "step": 4555 }, { "epoch": 0.18, "grad_norm": 5.881734470457721, "learning_rate": 1.8789306937149975e-05, "loss": 1.3407, "step": 4556 }, { "epoch": 0.18, "grad_norm": 4.445264846656285, "learning_rate": 1.8788684636917712e-05, "loss": 1.2234, "step": 4557 }, { "epoch": 0.18, "grad_norm": 5.121657642443756, "learning_rate": 1.8788062187105075e-05, "loss": 1.2993, "step": 4558 }, { "epoch": 0.18, "grad_norm": 6.077963657685995, "learning_rate": 1.8787439587722673e-05, "loss": 1.2244, "step": 4559 }, { "epoch": 0.18, "grad_norm": 4.505667260562164, "learning_rate": 1.8786816838781093e-05, "loss": 1.1706, "step": 4560 }, { "epoch": 0.18, "grad_norm": 5.266002524489676, "learning_rate": 1.8786193940290937e-05, "loss": 1.0087, "step": 4561 }, { "epoch": 0.18, "grad_norm": 7.4241248487741744, "learning_rate": 1.8785570892262806e-05, "loss": 1.0572, "step": 4562 }, { "epoch": 0.18, "grad_norm": 4.2249844914321715, "learning_rate": 1.87849476947073e-05, "loss": 0.9661, "step": 4563 }, { "epoch": 0.18, "grad_norm": 5.664364482199374, "learning_rate": 1.8784324347635035e-05, "loss": 1.3471, "step": 4564 }, { "epoch": 0.18, "grad_norm": 5.388554141565434, "learning_rate": 1.8783700851056616e-05, "loss": 1.3395, "step": 4565 }, { "epoch": 0.18, "grad_norm": 5.280304828072684, "learning_rate": 1.878307720498265e-05, "loss": 1.1092, "step": 4566 }, { "epoch": 0.18, "grad_norm": 5.205254374877958, "learning_rate": 1.8782453409423756e-05, "loss": 1.322, "step": 4567 }, { "epoch": 0.18, "grad_norm": 6.517086561841382, "learning_rate": 1.878182946439055e-05, "loss": 1.4162, "step": 4568 }, { "epoch": 0.18, "grad_norm": 5.494534003979354, "learning_rate": 1.8781205369893653e-05, "loss": 1.4238, "step": 4569 }, { "epoch": 0.18, "grad_norm": 6.547568806078674, "learning_rate": 1.8780581125943685e-05, "loss": 1.4186, "step": 4570 }, { "epoch": 0.18, "grad_norm": 4.531575482961443, "learning_rate": 1.8779956732551267e-05, "loss": 0.9405, "step": 4571 }, { "epoch": 0.18, "grad_norm": 8.936868487851518, "learning_rate": 1.877933218972703e-05, "loss": 1.22, "step": 4572 }, { "epoch": 0.18, "grad_norm": 6.547228826696501, "learning_rate": 1.8778707497481602e-05, "loss": 1.4736, "step": 4573 }, { "epoch": 0.18, "grad_norm": 6.908756792158426, "learning_rate": 1.877808265582562e-05, "loss": 1.1622, "step": 4574 }, { "epoch": 0.18, "grad_norm": 6.672945452763215, "learning_rate": 1.8777457664769712e-05, "loss": 1.1965, "step": 4575 }, { "epoch": 0.18, "grad_norm": 4.833688159377586, "learning_rate": 1.8776832524324518e-05, "loss": 1.3671, "step": 4576 }, { "epoch": 0.18, "grad_norm": 5.9739441911370035, "learning_rate": 1.8776207234500676e-05, "loss": 1.1654, "step": 4577 }, { "epoch": 0.18, "grad_norm": 7.127439467696708, "learning_rate": 1.8775581795308828e-05, "loss": 1.2655, "step": 4578 }, { "epoch": 0.18, "grad_norm": 5.388913525756902, "learning_rate": 1.8774956206759626e-05, "loss": 1.163, "step": 4579 }, { "epoch": 0.18, "grad_norm": 5.6864056583035705, "learning_rate": 1.8774330468863707e-05, "loss": 1.279, "step": 4580 }, { "epoch": 0.18, "grad_norm": 5.6234500912637495, "learning_rate": 1.8773704581631725e-05, "loss": 1.1634, "step": 4581 }, { "epoch": 0.18, "grad_norm": 6.410155215759155, "learning_rate": 1.8773078545074336e-05, "loss": 1.0441, "step": 4582 }, { "epoch": 0.18, "grad_norm": 5.8050926723433145, "learning_rate": 1.877245235920219e-05, "loss": 1.3381, "step": 4583 }, { "epoch": 0.18, "grad_norm": 6.124954067591887, "learning_rate": 1.8771826024025944e-05, "loss": 1.4467, "step": 4584 }, { "epoch": 0.18, "grad_norm": 6.757296912547764, "learning_rate": 1.8771199539556262e-05, "loss": 1.3221, "step": 4585 }, { "epoch": 0.18, "grad_norm": 6.23109552952346, "learning_rate": 1.8770572905803806e-05, "loss": 1.2868, "step": 4586 }, { "epoch": 0.18, "grad_norm": 4.892327266522782, "learning_rate": 1.8769946122779235e-05, "loss": 1.4066, "step": 4587 }, { "epoch": 0.18, "grad_norm": 6.6814255807880185, "learning_rate": 1.8769319190493226e-05, "loss": 1.4081, "step": 4588 }, { "epoch": 0.18, "grad_norm": 6.463828364007628, "learning_rate": 1.8768692108956443e-05, "loss": 1.4741, "step": 4589 }, { "epoch": 0.18, "grad_norm": 7.196970472837772, "learning_rate": 1.876806487817956e-05, "loss": 1.404, "step": 4590 }, { "epoch": 0.18, "grad_norm": 6.358945799541403, "learning_rate": 1.8767437498173257e-05, "loss": 1.3551, "step": 4591 }, { "epoch": 0.18, "grad_norm": 4.865053196910596, "learning_rate": 1.87668099689482e-05, "loss": 1.1321, "step": 4592 }, { "epoch": 0.19, "grad_norm": 5.6755994415718805, "learning_rate": 1.8766182290515083e-05, "loss": 1.0784, "step": 4593 }, { "epoch": 0.19, "grad_norm": 6.017823036368646, "learning_rate": 1.8765554462884576e-05, "loss": 1.0638, "step": 4594 }, { "epoch": 0.19, "grad_norm": 8.138934463331966, "learning_rate": 1.8764926486067375e-05, "loss": 1.3486, "step": 4595 }, { "epoch": 0.19, "grad_norm": 7.020186588290887, "learning_rate": 1.8764298360074167e-05, "loss": 1.0099, "step": 4596 }, { "epoch": 0.19, "grad_norm": 6.1393466002442025, "learning_rate": 1.876367008491563e-05, "loss": 1.1537, "step": 4597 }, { "epoch": 0.19, "grad_norm": 6.121708751944723, "learning_rate": 1.8763041660602478e-05, "loss": 1.231, "step": 4598 }, { "epoch": 0.19, "grad_norm": 4.147660527714543, "learning_rate": 1.876241308714539e-05, "loss": 1.2973, "step": 4599 }, { "epoch": 0.19, "grad_norm": 4.659432205867825, "learning_rate": 1.8761784364555066e-05, "loss": 1.0015, "step": 4600 }, { "epoch": 0.19, "grad_norm": 6.859273407726427, "learning_rate": 1.8761155492842213e-05, "loss": 1.4491, "step": 4601 }, { "epoch": 0.19, "grad_norm": 5.485255724409126, "learning_rate": 1.8760526472017528e-05, "loss": 1.1516, "step": 4602 }, { "epoch": 0.19, "grad_norm": 7.162433020104971, "learning_rate": 1.875989730209172e-05, "loss": 1.3195, "step": 4603 }, { "epoch": 0.19, "grad_norm": 5.129361773647051, "learning_rate": 1.8759267983075503e-05, "loss": 1.1531, "step": 4604 }, { "epoch": 0.19, "grad_norm": 7.167393974714415, "learning_rate": 1.8758638514979573e-05, "loss": 1.346, "step": 4605 }, { "epoch": 0.19, "grad_norm": 6.831129756704401, "learning_rate": 1.8758008897814657e-05, "loss": 1.3465, "step": 4606 }, { "epoch": 0.19, "grad_norm": 4.780306454822088, "learning_rate": 1.8757379131591468e-05, "loss": 1.1159, "step": 4607 }, { "epoch": 0.19, "grad_norm": 5.695916249574915, "learning_rate": 1.875674921632072e-05, "loss": 1.0702, "step": 4608 }, { "epoch": 0.19, "grad_norm": 6.491460442180391, "learning_rate": 1.8756119152013134e-05, "loss": 1.1601, "step": 4609 }, { "epoch": 0.19, "grad_norm": 7.9081847515599994, "learning_rate": 1.8755488938679433e-05, "loss": 1.187, "step": 4610 }, { "epoch": 0.19, "grad_norm": 7.6560089705652965, "learning_rate": 1.875485857633035e-05, "loss": 1.0686, "step": 4611 }, { "epoch": 0.19, "grad_norm": 7.424595659308349, "learning_rate": 1.875422806497661e-05, "loss": 1.391, "step": 4612 }, { "epoch": 0.19, "grad_norm": 7.618950523358616, "learning_rate": 1.875359740462894e-05, "loss": 1.3955, "step": 4613 }, { "epoch": 0.19, "grad_norm": 6.3840893587452685, "learning_rate": 1.8752966595298078e-05, "loss": 1.1914, "step": 4614 }, { "epoch": 0.19, "grad_norm": 6.219649099067193, "learning_rate": 1.8752335636994762e-05, "loss": 1.2548, "step": 4615 }, { "epoch": 0.19, "grad_norm": 7.6137060618783945, "learning_rate": 1.8751704529729726e-05, "loss": 1.5777, "step": 4616 }, { "epoch": 0.19, "grad_norm": 10.449309697393977, "learning_rate": 1.8751073273513715e-05, "loss": 1.5551, "step": 4617 }, { "epoch": 0.19, "grad_norm": 7.20166536894782, "learning_rate": 1.8750441868357464e-05, "loss": 1.3862, "step": 4618 }, { "epoch": 0.19, "grad_norm": 5.082021441184097, "learning_rate": 1.8749810314271733e-05, "loss": 1.1982, "step": 4619 }, { "epoch": 0.19, "grad_norm": 4.177859040929787, "learning_rate": 1.874917861126726e-05, "loss": 0.864, "step": 4620 }, { "epoch": 0.19, "grad_norm": 6.521633611874776, "learning_rate": 1.8748546759354803e-05, "loss": 1.349, "step": 4621 }, { "epoch": 0.19, "grad_norm": 9.414736798599506, "learning_rate": 1.874791475854511e-05, "loss": 1.5094, "step": 4622 }, { "epoch": 0.19, "grad_norm": 5.396283329276596, "learning_rate": 1.8747282608848946e-05, "loss": 1.2148, "step": 4623 }, { "epoch": 0.19, "grad_norm": 5.604300847576402, "learning_rate": 1.874665031027706e-05, "loss": 1.2823, "step": 4624 }, { "epoch": 0.19, "grad_norm": 7.7904999295724355, "learning_rate": 1.8746017862840218e-05, "loss": 1.1947, "step": 4625 }, { "epoch": 0.19, "grad_norm": 8.49728619137556, "learning_rate": 1.8745385266549184e-05, "loss": 1.3574, "step": 4626 }, { "epoch": 0.19, "grad_norm": 5.852817692011626, "learning_rate": 1.8744752521414725e-05, "loss": 1.0126, "step": 4627 }, { "epoch": 0.19, "grad_norm": 8.348966239069032, "learning_rate": 1.8744119627447612e-05, "loss": 1.3497, "step": 4628 }, { "epoch": 0.19, "grad_norm": 6.2150873360854, "learning_rate": 1.8743486584658613e-05, "loss": 1.0525, "step": 4629 }, { "epoch": 0.19, "grad_norm": 5.092850605431305, "learning_rate": 1.8742853393058505e-05, "loss": 1.3406, "step": 4630 }, { "epoch": 0.19, "grad_norm": 6.096191779352404, "learning_rate": 1.8742220052658063e-05, "loss": 1.3553, "step": 4631 }, { "epoch": 0.19, "grad_norm": 5.9512716695438055, "learning_rate": 1.8741586563468064e-05, "loss": 1.2562, "step": 4632 }, { "epoch": 0.19, "grad_norm": 4.169173002548652, "learning_rate": 1.8740952925499295e-05, "loss": 1.1415, "step": 4633 }, { "epoch": 0.19, "grad_norm": 6.41358987106939, "learning_rate": 1.8740319138762536e-05, "loss": 1.1705, "step": 4634 }, { "epoch": 0.19, "grad_norm": 10.78606065719576, "learning_rate": 1.8739685203268573e-05, "loss": 1.4717, "step": 4635 }, { "epoch": 0.19, "grad_norm": 6.469488269044575, "learning_rate": 1.87390511190282e-05, "loss": 1.1829, "step": 4636 }, { "epoch": 0.19, "grad_norm": 4.577792405729955, "learning_rate": 1.8738416886052208e-05, "loss": 1.165, "step": 4637 }, { "epoch": 0.19, "grad_norm": 6.252371663529462, "learning_rate": 1.873778250435139e-05, "loss": 1.1163, "step": 4638 }, { "epoch": 0.19, "grad_norm": 8.298618564353992, "learning_rate": 1.873714797393654e-05, "loss": 1.3149, "step": 4639 }, { "epoch": 0.19, "grad_norm": 7.281427872699856, "learning_rate": 1.8736513294818465e-05, "loss": 1.3689, "step": 4640 }, { "epoch": 0.19, "grad_norm": 5.013751609211, "learning_rate": 1.873587846700796e-05, "loss": 1.1238, "step": 4641 }, { "epoch": 0.19, "grad_norm": 6.1759704400751065, "learning_rate": 1.8735243490515832e-05, "loss": 1.2527, "step": 4642 }, { "epoch": 0.19, "grad_norm": 7.37545654314104, "learning_rate": 1.873460836535289e-05, "loss": 1.2973, "step": 4643 }, { "epoch": 0.19, "grad_norm": 6.911569698591918, "learning_rate": 1.8733973091529936e-05, "loss": 1.16, "step": 4644 }, { "epoch": 0.19, "grad_norm": 7.52670946247441, "learning_rate": 1.8733337669057796e-05, "loss": 1.1358, "step": 4645 }, { "epoch": 0.19, "grad_norm": 7.810140535345715, "learning_rate": 1.873270209794727e-05, "loss": 1.3117, "step": 4646 }, { "epoch": 0.19, "grad_norm": 8.22342388165926, "learning_rate": 1.8732066378209186e-05, "loss": 1.5277, "step": 4647 }, { "epoch": 0.19, "grad_norm": 6.573172407288157, "learning_rate": 1.873143050985436e-05, "loss": 1.0776, "step": 4648 }, { "epoch": 0.19, "grad_norm": 6.626195102224377, "learning_rate": 1.8730794492893612e-05, "loss": 1.2823, "step": 4649 }, { "epoch": 0.19, "grad_norm": 6.050868454277971, "learning_rate": 1.8730158327337767e-05, "loss": 1.1642, "step": 4650 }, { "epoch": 0.19, "grad_norm": 8.583836315220715, "learning_rate": 1.8729522013197655e-05, "loss": 1.2055, "step": 4651 }, { "epoch": 0.19, "grad_norm": 4.876269783971336, "learning_rate": 1.8728885550484105e-05, "loss": 1.2426, "step": 4652 }, { "epoch": 0.19, "grad_norm": 4.839442632105295, "learning_rate": 1.872824893920795e-05, "loss": 1.2379, "step": 4653 }, { "epoch": 0.19, "grad_norm": 5.869462641483248, "learning_rate": 1.8727612179380022e-05, "loss": 1.4272, "step": 4654 }, { "epoch": 0.19, "grad_norm": 6.142222768693301, "learning_rate": 1.8726975271011163e-05, "loss": 1.3829, "step": 4655 }, { "epoch": 0.19, "grad_norm": 6.477050488453169, "learning_rate": 1.8726338214112207e-05, "loss": 0.9291, "step": 4656 }, { "epoch": 0.19, "grad_norm": 8.760471681900068, "learning_rate": 1.8725701008694002e-05, "loss": 1.2014, "step": 4657 }, { "epoch": 0.19, "grad_norm": 8.23165387265446, "learning_rate": 1.872506365476739e-05, "loss": 1.3719, "step": 4658 }, { "epoch": 0.19, "grad_norm": 5.69488356473868, "learning_rate": 1.8724426152343224e-05, "loss": 1.3387, "step": 4659 }, { "epoch": 0.19, "grad_norm": 7.2598398998205225, "learning_rate": 1.8723788501432347e-05, "loss": 1.4809, "step": 4660 }, { "epoch": 0.19, "grad_norm": 6.510469888872794, "learning_rate": 1.872315070204561e-05, "loss": 1.2569, "step": 4661 }, { "epoch": 0.19, "grad_norm": 7.290037189522641, "learning_rate": 1.8722512754193872e-05, "loss": 1.3445, "step": 4662 }, { "epoch": 0.19, "grad_norm": 4.722829418661295, "learning_rate": 1.8721874657887996e-05, "loss": 1.2164, "step": 4663 }, { "epoch": 0.19, "grad_norm": 7.242417218377923, "learning_rate": 1.8721236413138837e-05, "loss": 1.3205, "step": 4664 }, { "epoch": 0.19, "grad_norm": 6.03173622775544, "learning_rate": 1.8720598019957254e-05, "loss": 0.998, "step": 4665 }, { "epoch": 0.19, "grad_norm": 4.1653729022379125, "learning_rate": 1.871995947835412e-05, "loss": 1.2475, "step": 4666 }, { "epoch": 0.19, "grad_norm": 7.8132025519639505, "learning_rate": 1.8719320788340296e-05, "loss": 1.3997, "step": 4667 }, { "epoch": 0.19, "grad_norm": 4.92939768599124, "learning_rate": 1.8718681949926656e-05, "loss": 1.0802, "step": 4668 }, { "epoch": 0.19, "grad_norm": 5.52746340990976, "learning_rate": 1.8718042963124072e-05, "loss": 1.0331, "step": 4669 }, { "epoch": 0.19, "grad_norm": 4.688522826851687, "learning_rate": 1.8717403827943416e-05, "loss": 1.3063, "step": 4670 }, { "epoch": 0.19, "grad_norm": 13.223806733064901, "learning_rate": 1.871676454439557e-05, "loss": 1.1858, "step": 4671 }, { "epoch": 0.19, "grad_norm": 7.469480515964721, "learning_rate": 1.871612511249142e-05, "loss": 1.1416, "step": 4672 }, { "epoch": 0.19, "grad_norm": 9.680627864934252, "learning_rate": 1.8715485532241836e-05, "loss": 1.1724, "step": 4673 }, { "epoch": 0.19, "grad_norm": 7.013016402554069, "learning_rate": 1.8714845803657714e-05, "loss": 1.2225, "step": 4674 }, { "epoch": 0.19, "grad_norm": 5.495111014401297, "learning_rate": 1.8714205926749933e-05, "loss": 0.8749, "step": 4675 }, { "epoch": 0.19, "grad_norm": 7.126560349128588, "learning_rate": 1.871356590152939e-05, "loss": 1.2205, "step": 4676 }, { "epoch": 0.19, "grad_norm": 10.173737337865287, "learning_rate": 1.8712925728006976e-05, "loss": 1.2147, "step": 4677 }, { "epoch": 0.19, "grad_norm": 6.973901495091993, "learning_rate": 1.8712285406193585e-05, "loss": 1.2448, "step": 4678 }, { "epoch": 0.19, "grad_norm": 5.260395959744758, "learning_rate": 1.871164493610012e-05, "loss": 1.4158, "step": 4679 }, { "epoch": 0.19, "grad_norm": 7.281084294582049, "learning_rate": 1.871100431773748e-05, "loss": 1.2511, "step": 4680 }, { "epoch": 0.19, "grad_norm": 7.145637835699388, "learning_rate": 1.8710363551116565e-05, "loss": 1.3473, "step": 4681 }, { "epoch": 0.19, "grad_norm": 5.264344050579813, "learning_rate": 1.8709722636248278e-05, "loss": 1.1152, "step": 4682 }, { "epoch": 0.19, "grad_norm": 5.750085556906818, "learning_rate": 1.8709081573143537e-05, "loss": 0.9851, "step": 4683 }, { "epoch": 0.19, "grad_norm": 6.074255591068009, "learning_rate": 1.8708440361813242e-05, "loss": 1.1979, "step": 4684 }, { "epoch": 0.19, "grad_norm": 7.601059231004394, "learning_rate": 1.8707799002268315e-05, "loss": 1.2024, "step": 4685 }, { "epoch": 0.19, "grad_norm": 4.548259819117604, "learning_rate": 1.8707157494519666e-05, "loss": 1.1473, "step": 4686 }, { "epoch": 0.19, "grad_norm": 4.329652485437876, "learning_rate": 1.8706515838578216e-05, "loss": 1.0674, "step": 4687 }, { "epoch": 0.19, "grad_norm": 5.190452401253011, "learning_rate": 1.8705874034454883e-05, "loss": 1.1267, "step": 4688 }, { "epoch": 0.19, "grad_norm": 8.027125936288616, "learning_rate": 1.8705232082160593e-05, "loss": 1.3743, "step": 4689 }, { "epoch": 0.19, "grad_norm": 7.522286997811931, "learning_rate": 1.8704589981706273e-05, "loss": 1.3945, "step": 4690 }, { "epoch": 0.19, "grad_norm": 5.531727844668713, "learning_rate": 1.8703947733102846e-05, "loss": 1.2108, "step": 4691 }, { "epoch": 0.19, "grad_norm": 9.358274275813516, "learning_rate": 1.8703305336361247e-05, "loss": 1.1512, "step": 4692 }, { "epoch": 0.19, "grad_norm": 10.219741413152358, "learning_rate": 1.8702662791492413e-05, "loss": 1.2593, "step": 4693 }, { "epoch": 0.19, "grad_norm": 9.649957748419576, "learning_rate": 1.8702020098507268e-05, "loss": 1.232, "step": 4694 }, { "epoch": 0.19, "grad_norm": 6.387843288331881, "learning_rate": 1.8701377257416762e-05, "loss": 1.0747, "step": 4695 }, { "epoch": 0.19, "grad_norm": 6.175779082559378, "learning_rate": 1.8700734268231833e-05, "loss": 1.239, "step": 4696 }, { "epoch": 0.19, "grad_norm": 6.973256239595401, "learning_rate": 1.870009113096342e-05, "loss": 1.1369, "step": 4697 }, { "epoch": 0.19, "grad_norm": 6.311416686397317, "learning_rate": 1.8699447845622473e-05, "loss": 1.2644, "step": 4698 }, { "epoch": 0.19, "grad_norm": 7.296643287538725, "learning_rate": 1.869880441221994e-05, "loss": 1.1792, "step": 4699 }, { "epoch": 0.19, "grad_norm": 5.731476684695116, "learning_rate": 1.8698160830766774e-05, "loss": 1.346, "step": 4700 }, { "epoch": 0.19, "grad_norm": 7.965188805574261, "learning_rate": 1.869751710127392e-05, "loss": 1.1328, "step": 4701 }, { "epoch": 0.19, "grad_norm": 7.404243091560286, "learning_rate": 1.8696873223752345e-05, "loss": 1.3173, "step": 4702 }, { "epoch": 0.19, "grad_norm": 7.2761638602828445, "learning_rate": 1.8696229198213003e-05, "loss": 1.3608, "step": 4703 }, { "epoch": 0.19, "grad_norm": 6.533222335586309, "learning_rate": 1.8695585024666853e-05, "loss": 1.4188, "step": 4704 }, { "epoch": 0.19, "grad_norm": 10.243686410348452, "learning_rate": 1.869494070312486e-05, "loss": 1.3473, "step": 4705 }, { "epoch": 0.19, "grad_norm": 6.041016589164226, "learning_rate": 1.8694296233597996e-05, "loss": 1.1876, "step": 4706 }, { "epoch": 0.19, "grad_norm": 6.428335480553839, "learning_rate": 1.869365161609722e-05, "loss": 1.5705, "step": 4707 }, { "epoch": 0.19, "grad_norm": 6.40341832280282, "learning_rate": 1.8693006850633507e-05, "loss": 0.8938, "step": 4708 }, { "epoch": 0.19, "grad_norm": 6.358502935952054, "learning_rate": 1.869236193721783e-05, "loss": 1.371, "step": 4709 }, { "epoch": 0.19, "grad_norm": 5.693763709805102, "learning_rate": 1.8691716875861168e-05, "loss": 1.3635, "step": 4710 }, { "epoch": 0.19, "grad_norm": 4.971331743006845, "learning_rate": 1.86910716665745e-05, "loss": 1.3244, "step": 4711 }, { "epoch": 0.19, "grad_norm": 5.055594484060847, "learning_rate": 1.8690426309368807e-05, "loss": 1.108, "step": 4712 }, { "epoch": 0.19, "grad_norm": 7.397840098317773, "learning_rate": 1.8689780804255063e-05, "loss": 1.0809, "step": 4713 }, { "epoch": 0.19, "grad_norm": 5.727465670946058, "learning_rate": 1.8689135151244268e-05, "loss": 1.3539, "step": 4714 }, { "epoch": 0.19, "grad_norm": 5.3017894307857425, "learning_rate": 1.86884893503474e-05, "loss": 1.08, "step": 4715 }, { "epoch": 0.19, "grad_norm": 7.776647696842687, "learning_rate": 1.8687843401575466e-05, "loss": 1.5007, "step": 4716 }, { "epoch": 0.19, "grad_norm": 7.270051005794018, "learning_rate": 1.868719730493944e-05, "loss": 1.49, "step": 4717 }, { "epoch": 0.19, "grad_norm": 7.215906620078886, "learning_rate": 1.8686551060450333e-05, "loss": 1.3041, "step": 4718 }, { "epoch": 0.19, "grad_norm": 5.0026977732707705, "learning_rate": 1.8685904668119136e-05, "loss": 1.2974, "step": 4719 }, { "epoch": 0.19, "grad_norm": 4.888073913380022, "learning_rate": 1.868525812795685e-05, "loss": 1.1872, "step": 4720 }, { "epoch": 0.19, "grad_norm": 7.679202962121157, "learning_rate": 1.8684611439974484e-05, "loss": 1.252, "step": 4721 }, { "epoch": 0.19, "grad_norm": 5.36601338831469, "learning_rate": 1.8683964604183042e-05, "loss": 1.1688, "step": 4722 }, { "epoch": 0.19, "grad_norm": 4.191039974000724, "learning_rate": 1.8683317620593537e-05, "loss": 1.0833, "step": 4723 }, { "epoch": 0.19, "grad_norm": 6.349593375540642, "learning_rate": 1.868267048921697e-05, "loss": 1.5754, "step": 4724 }, { "epoch": 0.19, "grad_norm": 9.798529148609207, "learning_rate": 1.8682023210064365e-05, "loss": 1.3122, "step": 4725 }, { "epoch": 0.19, "grad_norm": 6.650846067785629, "learning_rate": 1.8681375783146736e-05, "loss": 1.4649, "step": 4726 }, { "epoch": 0.19, "grad_norm": 6.532019510973847, "learning_rate": 1.86807282084751e-05, "loss": 1.2885, "step": 4727 }, { "epoch": 0.19, "grad_norm": 6.58891008154229, "learning_rate": 1.8680080486060478e-05, "loss": 1.5137, "step": 4728 }, { "epoch": 0.19, "grad_norm": 7.446722554253076, "learning_rate": 1.8679432615913896e-05, "loss": 1.3099, "step": 4729 }, { "epoch": 0.19, "grad_norm": 8.64313021717344, "learning_rate": 1.8678784598046377e-05, "loss": 1.4818, "step": 4730 }, { "epoch": 0.19, "grad_norm": 10.613170697404204, "learning_rate": 1.867813643246896e-05, "loss": 1.1787, "step": 4731 }, { "epoch": 0.19, "grad_norm": 7.296348684861599, "learning_rate": 1.8677488119192663e-05, "loss": 1.4224, "step": 4732 }, { "epoch": 0.19, "grad_norm": 9.47270479546295, "learning_rate": 1.8676839658228528e-05, "loss": 1.2251, "step": 4733 }, { "epoch": 0.19, "grad_norm": 6.4365747261220365, "learning_rate": 1.867619104958759e-05, "loss": 1.2529, "step": 4734 }, { "epoch": 0.19, "grad_norm": 9.415707330607988, "learning_rate": 1.867554229328089e-05, "loss": 1.3289, "step": 4735 }, { "epoch": 0.19, "grad_norm": 7.4073800727616375, "learning_rate": 1.8674893389319467e-05, "loss": 1.3336, "step": 4736 }, { "epoch": 0.19, "grad_norm": 5.657210902261495, "learning_rate": 1.867424433771437e-05, "loss": 1.3101, "step": 4737 }, { "epoch": 0.19, "grad_norm": 9.131738628406179, "learning_rate": 1.8673595138476636e-05, "loss": 1.2415, "step": 4738 }, { "epoch": 0.19, "grad_norm": 4.8460564948496945, "learning_rate": 1.8672945791617323e-05, "loss": 1.0397, "step": 4739 }, { "epoch": 0.19, "grad_norm": 6.670050507897262, "learning_rate": 1.8672296297147476e-05, "loss": 1.0905, "step": 4740 }, { "epoch": 0.19, "grad_norm": 5.850234097780103, "learning_rate": 1.8671646655078156e-05, "loss": 1.3881, "step": 4741 }, { "epoch": 0.19, "grad_norm": 5.128882029633549, "learning_rate": 1.8670996865420413e-05, "loss": 1.2139, "step": 4742 }, { "epoch": 0.19, "grad_norm": 8.093188129078133, "learning_rate": 1.8670346928185313e-05, "loss": 1.3208, "step": 4743 }, { "epoch": 0.19, "grad_norm": 6.2276203921334226, "learning_rate": 1.866969684338391e-05, "loss": 1.1496, "step": 4744 }, { "epoch": 0.19, "grad_norm": 8.605339787720142, "learning_rate": 1.8669046611027277e-05, "loss": 1.2104, "step": 4745 }, { "epoch": 0.19, "grad_norm": 6.536707901802901, "learning_rate": 1.8668396231126473e-05, "loss": 1.5519, "step": 4746 }, { "epoch": 0.19, "grad_norm": 9.814873219401962, "learning_rate": 1.866774570369257e-05, "loss": 1.4945, "step": 4747 }, { "epoch": 0.19, "grad_norm": 5.066076103827806, "learning_rate": 1.8667095028736642e-05, "loss": 0.982, "step": 4748 }, { "epoch": 0.19, "grad_norm": 6.250746764802006, "learning_rate": 1.866644420626976e-05, "loss": 1.1762, "step": 4749 }, { "epoch": 0.19, "grad_norm": 5.199243605856641, "learning_rate": 1.8665793236303006e-05, "loss": 1.4328, "step": 4750 }, { "epoch": 0.19, "grad_norm": 8.475364890457717, "learning_rate": 1.866514211884745e-05, "loss": 1.0401, "step": 4751 }, { "epoch": 0.19, "grad_norm": 16.1999236409403, "learning_rate": 1.8664490853914182e-05, "loss": 1.1663, "step": 4752 }, { "epoch": 0.19, "grad_norm": 7.018600678973663, "learning_rate": 1.8663839441514283e-05, "loss": 1.0869, "step": 4753 }, { "epoch": 0.19, "grad_norm": 5.6438041189981245, "learning_rate": 1.866318788165884e-05, "loss": 1.0815, "step": 4754 }, { "epoch": 0.19, "grad_norm": 6.579619104315967, "learning_rate": 1.8662536174358945e-05, "loss": 1.2675, "step": 4755 }, { "epoch": 0.19, "grad_norm": 5.082688931156475, "learning_rate": 1.8661884319625685e-05, "loss": 1.212, "step": 4756 }, { "epoch": 0.19, "grad_norm": 5.973808906955773, "learning_rate": 1.866123231747016e-05, "loss": 1.2132, "step": 4757 }, { "epoch": 0.19, "grad_norm": 5.0846188113759565, "learning_rate": 1.866058016790346e-05, "loss": 1.4228, "step": 4758 }, { "epoch": 0.19, "grad_norm": 5.823703889406248, "learning_rate": 1.8659927870936686e-05, "loss": 1.2, "step": 4759 }, { "epoch": 0.19, "grad_norm": 6.121018575627162, "learning_rate": 1.865927542658095e-05, "loss": 1.2867, "step": 4760 }, { "epoch": 0.19, "grad_norm": 4.892283770062672, "learning_rate": 1.865862283484734e-05, "loss": 1.2995, "step": 4761 }, { "epoch": 0.19, "grad_norm": 5.967658062571588, "learning_rate": 1.8657970095746972e-05, "loss": 1.2397, "step": 4762 }, { "epoch": 0.19, "grad_norm": 5.033827717678753, "learning_rate": 1.865731720929096e-05, "loss": 1.195, "step": 4763 }, { "epoch": 0.19, "grad_norm": 10.079527170649566, "learning_rate": 1.8656664175490406e-05, "loss": 1.0242, "step": 4764 }, { "epoch": 0.19, "grad_norm": 5.193608503157321, "learning_rate": 1.8656010994356432e-05, "loss": 1.2882, "step": 4765 }, { "epoch": 0.19, "grad_norm": 4.593566102195246, "learning_rate": 1.865535766590015e-05, "loss": 1.2099, "step": 4766 }, { "epoch": 0.19, "grad_norm": 5.40639838263613, "learning_rate": 1.865470419013268e-05, "loss": 1.1803, "step": 4767 }, { "epoch": 0.19, "grad_norm": 5.541172416750155, "learning_rate": 1.8654050567065148e-05, "loss": 1.3586, "step": 4768 }, { "epoch": 0.19, "grad_norm": 6.3537612936978425, "learning_rate": 1.8653396796708672e-05, "loss": 1.3302, "step": 4769 }, { "epoch": 0.19, "grad_norm": 6.376515296397973, "learning_rate": 1.8652742879074384e-05, "loss": 1.4052, "step": 4770 }, { "epoch": 0.19, "grad_norm": 6.835746066774642, "learning_rate": 1.8652088814173413e-05, "loss": 1.2548, "step": 4771 }, { "epoch": 0.19, "grad_norm": 4.773991329597739, "learning_rate": 1.8651434602016892e-05, "loss": 0.8248, "step": 4772 }, { "epoch": 0.19, "grad_norm": 7.663289873178941, "learning_rate": 1.865078024261595e-05, "loss": 1.3598, "step": 4773 }, { "epoch": 0.19, "grad_norm": 5.507649548223041, "learning_rate": 1.865012573598173e-05, "loss": 1.3281, "step": 4774 }, { "epoch": 0.19, "grad_norm": 6.913391200709237, "learning_rate": 1.8649471082125366e-05, "loss": 1.2519, "step": 4775 }, { "epoch": 0.19, "grad_norm": 9.923231398853915, "learning_rate": 1.8648816281058004e-05, "loss": 1.3954, "step": 4776 }, { "epoch": 0.19, "grad_norm": 6.1202108441893195, "learning_rate": 1.8648161332790787e-05, "loss": 1.0999, "step": 4777 }, { "epoch": 0.19, "grad_norm": 8.891505809344293, "learning_rate": 1.864750623733486e-05, "loss": 1.3611, "step": 4778 }, { "epoch": 0.19, "grad_norm": 5.676818872878414, "learning_rate": 1.864685099470138e-05, "loss": 1.1684, "step": 4779 }, { "epoch": 0.19, "grad_norm": 5.808030408013159, "learning_rate": 1.8646195604901492e-05, "loss": 1.2939, "step": 4780 }, { "epoch": 0.19, "grad_norm": 5.494419190993765, "learning_rate": 1.8645540067946353e-05, "loss": 1.0693, "step": 4781 }, { "epoch": 0.19, "grad_norm": 8.263464120169202, "learning_rate": 1.8644884383847117e-05, "loss": 1.3863, "step": 4782 }, { "epoch": 0.19, "grad_norm": 5.291149729998294, "learning_rate": 1.8644228552614944e-05, "loss": 1.0403, "step": 4783 }, { "epoch": 0.19, "grad_norm": 6.062613785360267, "learning_rate": 1.8643572574261002e-05, "loss": 1.2038, "step": 4784 }, { "epoch": 0.19, "grad_norm": 6.206786832677383, "learning_rate": 1.864291644879645e-05, "loss": 1.2475, "step": 4785 }, { "epoch": 0.19, "grad_norm": 5.783698476757297, "learning_rate": 1.8642260176232455e-05, "loss": 1.1877, "step": 4786 }, { "epoch": 0.19, "grad_norm": 8.331710295915455, "learning_rate": 1.8641603756580192e-05, "loss": 1.4113, "step": 4787 }, { "epoch": 0.19, "grad_norm": 5.069246630342143, "learning_rate": 1.8640947189850826e-05, "loss": 1.1039, "step": 4788 }, { "epoch": 0.19, "grad_norm": 7.554184198171185, "learning_rate": 1.8640290476055533e-05, "loss": 1.4079, "step": 4789 }, { "epoch": 0.19, "grad_norm": 5.3529131806517505, "learning_rate": 1.8639633615205495e-05, "loss": 0.9639, "step": 4790 }, { "epoch": 0.19, "grad_norm": 4.715335087719633, "learning_rate": 1.8638976607311886e-05, "loss": 1.196, "step": 4791 }, { "epoch": 0.19, "grad_norm": 17.87440928256434, "learning_rate": 1.863831945238589e-05, "loss": 1.5349, "step": 4792 }, { "epoch": 0.19, "grad_norm": 6.938206153473381, "learning_rate": 1.8637662150438695e-05, "loss": 1.5509, "step": 4793 }, { "epoch": 0.19, "grad_norm": 6.15524131754018, "learning_rate": 1.863700470148148e-05, "loss": 1.2832, "step": 4794 }, { "epoch": 0.19, "grad_norm": 5.148282200659404, "learning_rate": 1.8636347105525443e-05, "loss": 1.2743, "step": 4795 }, { "epoch": 0.19, "grad_norm": 6.864419631367674, "learning_rate": 1.863568936258177e-05, "loss": 1.2384, "step": 4796 }, { "epoch": 0.19, "grad_norm": 6.278371855908812, "learning_rate": 1.863503147266166e-05, "loss": 1.3792, "step": 4797 }, { "epoch": 0.19, "grad_norm": 12.49037575877421, "learning_rate": 1.8634373435776305e-05, "loss": 1.4447, "step": 4798 }, { "epoch": 0.19, "grad_norm": 4.904159189825487, "learning_rate": 1.863371525193691e-05, "loss": 1.4993, "step": 4799 }, { "epoch": 0.19, "grad_norm": 4.776027675632459, "learning_rate": 1.8633056921154672e-05, "loss": 1.1887, "step": 4800 }, { "epoch": 0.19, "grad_norm": 6.187091617505535, "learning_rate": 1.8632398443440802e-05, "loss": 1.0362, "step": 4801 }, { "epoch": 0.19, "grad_norm": 4.588420278848564, "learning_rate": 1.86317398188065e-05, "loss": 1.0866, "step": 4802 }, { "epoch": 0.19, "grad_norm": 4.6647837976013165, "learning_rate": 1.863108104726298e-05, "loss": 1.4587, "step": 4803 }, { "epoch": 0.19, "grad_norm": 7.492153134442642, "learning_rate": 1.8630422128821454e-05, "loss": 1.1454, "step": 4804 }, { "epoch": 0.19, "grad_norm": 5.802414360470013, "learning_rate": 1.8629763063493134e-05, "loss": 1.4279, "step": 4805 }, { "epoch": 0.19, "grad_norm": 5.9288284096786965, "learning_rate": 1.862910385128924e-05, "loss": 1.3112, "step": 4806 }, { "epoch": 0.19, "grad_norm": 4.704119453188851, "learning_rate": 1.8628444492220986e-05, "loss": 1.0877, "step": 4807 }, { "epoch": 0.19, "grad_norm": 5.995887664956757, "learning_rate": 1.86277849862996e-05, "loss": 1.2255, "step": 4808 }, { "epoch": 0.19, "grad_norm": 5.166099406459812, "learning_rate": 1.8627125333536308e-05, "loss": 1.1293, "step": 4809 }, { "epoch": 0.19, "grad_norm": 5.343571635816863, "learning_rate": 1.862646553394233e-05, "loss": 1.1286, "step": 4810 }, { "epoch": 0.19, "grad_norm": 4.985775864482364, "learning_rate": 1.8625805587528904e-05, "loss": 1.2365, "step": 4811 }, { "epoch": 0.19, "grad_norm": 4.689721228105725, "learning_rate": 1.862514549430725e-05, "loss": 1.1879, "step": 4812 }, { "epoch": 0.19, "grad_norm": 6.069639299293195, "learning_rate": 1.8624485254288615e-05, "loss": 1.3757, "step": 4813 }, { "epoch": 0.19, "grad_norm": 4.199580740781717, "learning_rate": 1.862382486748423e-05, "loss": 1.1734, "step": 4814 }, { "epoch": 0.19, "grad_norm": 5.120862349707271, "learning_rate": 1.8623164333905338e-05, "loss": 1.0758, "step": 4815 }, { "epoch": 0.19, "grad_norm": 5.15592037595946, "learning_rate": 1.8622503653563173e-05, "loss": 1.2398, "step": 4816 }, { "epoch": 0.19, "grad_norm": 7.881592619433803, "learning_rate": 1.862184282646899e-05, "loss": 1.4173, "step": 4817 }, { "epoch": 0.19, "grad_norm": 5.796853809083868, "learning_rate": 1.8621181852634026e-05, "loss": 1.1753, "step": 4818 }, { "epoch": 0.19, "grad_norm": 5.205943045916961, "learning_rate": 1.862052073206954e-05, "loss": 1.043, "step": 4819 }, { "epoch": 0.19, "grad_norm": 8.100304994679119, "learning_rate": 1.861985946478678e-05, "loss": 1.3407, "step": 4820 }, { "epoch": 0.19, "grad_norm": 4.755489154562663, "learning_rate": 1.8619198050796997e-05, "loss": 1.266, "step": 4821 }, { "epoch": 0.19, "grad_norm": 5.493744874470877, "learning_rate": 1.8618536490111454e-05, "loss": 1.4557, "step": 4822 }, { "epoch": 0.19, "grad_norm": 5.595080771834035, "learning_rate": 1.8617874782741407e-05, "loss": 1.0802, "step": 4823 }, { "epoch": 0.19, "grad_norm": 5.388153791572185, "learning_rate": 1.861721292869812e-05, "loss": 1.3174, "step": 4824 }, { "epoch": 0.19, "grad_norm": 8.434841545539097, "learning_rate": 1.8616550927992855e-05, "loss": 1.4697, "step": 4825 }, { "epoch": 0.19, "grad_norm": 4.248928766233886, "learning_rate": 1.861588878063688e-05, "loss": 1.2175, "step": 4826 }, { "epoch": 0.19, "grad_norm": 5.901263139385733, "learning_rate": 1.8615226486641464e-05, "loss": 1.2526, "step": 4827 }, { "epoch": 0.19, "grad_norm": 5.006239180874968, "learning_rate": 1.861456404601788e-05, "loss": 1.2278, "step": 4828 }, { "epoch": 0.19, "grad_norm": 8.847500450832106, "learning_rate": 1.861390145877741e-05, "loss": 1.1659, "step": 4829 }, { "epoch": 0.19, "grad_norm": 8.80584992547266, "learning_rate": 1.8613238724931315e-05, "loss": 1.4008, "step": 4830 }, { "epoch": 0.19, "grad_norm": 5.426613230165143, "learning_rate": 1.8612575844490886e-05, "loss": 0.9457, "step": 4831 }, { "epoch": 0.19, "grad_norm": 5.1236759603202, "learning_rate": 1.86119128174674e-05, "loss": 1.2515, "step": 4832 }, { "epoch": 0.19, "grad_norm": 3.7505908993379933, "learning_rate": 1.8611249643872145e-05, "loss": 1.1416, "step": 4833 }, { "epoch": 0.19, "grad_norm": 4.466442378087132, "learning_rate": 1.861058632371641e-05, "loss": 1.2084, "step": 4834 }, { "epoch": 0.19, "grad_norm": 8.154832628167044, "learning_rate": 1.8609922857011476e-05, "loss": 1.4096, "step": 4835 }, { "epoch": 0.19, "grad_norm": 6.002153744121289, "learning_rate": 1.860925924376864e-05, "loss": 1.2256, "step": 4836 }, { "epoch": 0.19, "grad_norm": 7.821181547160998, "learning_rate": 1.86085954839992e-05, "loss": 1.4562, "step": 4837 }, { "epoch": 0.19, "grad_norm": 6.722170778633219, "learning_rate": 1.8607931577714448e-05, "loss": 1.2687, "step": 4838 }, { "epoch": 0.19, "grad_norm": 4.759725447839159, "learning_rate": 1.8607267524925684e-05, "loss": 1.3116, "step": 4839 }, { "epoch": 0.19, "grad_norm": 7.5230557663144975, "learning_rate": 1.860660332564421e-05, "loss": 1.3178, "step": 4840 }, { "epoch": 0.19, "grad_norm": 5.757433690503789, "learning_rate": 1.8605938979881332e-05, "loss": 1.073, "step": 4841 }, { "epoch": 0.2, "grad_norm": 6.598707789687225, "learning_rate": 1.8605274487648355e-05, "loss": 1.5515, "step": 4842 }, { "epoch": 0.2, "grad_norm": 4.7984940617919145, "learning_rate": 1.8604609848956592e-05, "loss": 1.2201, "step": 4843 }, { "epoch": 0.2, "grad_norm": 7.960996932815514, "learning_rate": 1.860394506381735e-05, "loss": 1.3589, "step": 4844 }, { "epoch": 0.2, "grad_norm": 3.9556005595880905, "learning_rate": 1.860328013224195e-05, "loss": 1.2378, "step": 4845 }, { "epoch": 0.2, "grad_norm": 7.447257498384713, "learning_rate": 1.8602615054241698e-05, "loss": 1.1385, "step": 4846 }, { "epoch": 0.2, "grad_norm": 5.734442285170665, "learning_rate": 1.8601949829827926e-05, "loss": 1.1862, "step": 4847 }, { "epoch": 0.2, "grad_norm": 6.3463225657913815, "learning_rate": 1.860128445901195e-05, "loss": 1.4196, "step": 4848 }, { "epoch": 0.2, "grad_norm": 6.735295704596837, "learning_rate": 1.8600618941805087e-05, "loss": 1.5105, "step": 4849 }, { "epoch": 0.2, "grad_norm": 4.6766891958090975, "learning_rate": 1.8599953278218678e-05, "loss": 1.034, "step": 4850 }, { "epoch": 0.2, "grad_norm": 7.845800929192445, "learning_rate": 1.8599287468264043e-05, "loss": 1.0754, "step": 4851 }, { "epoch": 0.2, "grad_norm": 7.254493138577706, "learning_rate": 1.859862151195252e-05, "loss": 1.4979, "step": 4852 }, { "epoch": 0.2, "grad_norm": 5.083320978406206, "learning_rate": 1.8597955409295435e-05, "loss": 1.3297, "step": 4853 }, { "epoch": 0.2, "grad_norm": 4.47226686059179, "learning_rate": 1.8597289160304135e-05, "loss": 1.1258, "step": 4854 }, { "epoch": 0.2, "grad_norm": 5.0556403894023045, "learning_rate": 1.859662276498995e-05, "loss": 0.9799, "step": 4855 }, { "epoch": 0.2, "grad_norm": 6.466772433235342, "learning_rate": 1.8595956223364227e-05, "loss": 1.4965, "step": 4856 }, { "epoch": 0.2, "grad_norm": 6.161583908604117, "learning_rate": 1.8595289535438313e-05, "loss": 1.2016, "step": 4857 }, { "epoch": 0.2, "grad_norm": 4.874089943934934, "learning_rate": 1.8594622701223547e-05, "loss": 1.4652, "step": 4858 }, { "epoch": 0.2, "grad_norm": 6.879669373744303, "learning_rate": 1.8593955720731284e-05, "loss": 1.2818, "step": 4859 }, { "epoch": 0.2, "grad_norm": 8.590190458043105, "learning_rate": 1.859328859397287e-05, "loss": 0.9965, "step": 4860 }, { "epoch": 0.2, "grad_norm": 5.737772243317405, "learning_rate": 1.859262132095967e-05, "loss": 1.2179, "step": 4861 }, { "epoch": 0.2, "grad_norm": 4.742462165509475, "learning_rate": 1.8591953901703028e-05, "loss": 0.9621, "step": 4862 }, { "epoch": 0.2, "grad_norm": 7.346595527850519, "learning_rate": 1.859128633621431e-05, "loss": 1.1401, "step": 4863 }, { "epoch": 0.2, "grad_norm": 4.508671049098717, "learning_rate": 1.859061862450488e-05, "loss": 1.19, "step": 4864 }, { "epoch": 0.2, "grad_norm": 7.684982391315792, "learning_rate": 1.8589950766586098e-05, "loss": 0.9082, "step": 4865 }, { "epoch": 0.2, "grad_norm": 5.430807184769413, "learning_rate": 1.858928276246933e-05, "loss": 1.221, "step": 4866 }, { "epoch": 0.2, "grad_norm": 6.075470382755096, "learning_rate": 1.858861461216595e-05, "loss": 1.2625, "step": 4867 }, { "epoch": 0.2, "grad_norm": 5.807523917154261, "learning_rate": 1.8587946315687327e-05, "loss": 1.4001, "step": 4868 }, { "epoch": 0.2, "grad_norm": 5.329045590205233, "learning_rate": 1.8587277873044832e-05, "loss": 1.248, "step": 4869 }, { "epoch": 0.2, "grad_norm": 5.80783135384826, "learning_rate": 1.8586609284249845e-05, "loss": 1.1585, "step": 4870 }, { "epoch": 0.2, "grad_norm": 9.717831259757224, "learning_rate": 1.858594054931375e-05, "loss": 1.4091, "step": 4871 }, { "epoch": 0.2, "grad_norm": 4.02985272580209, "learning_rate": 1.8585271668247918e-05, "loss": 1.2069, "step": 4872 }, { "epoch": 0.2, "grad_norm": 6.720288107516998, "learning_rate": 1.8584602641063744e-05, "loss": 1.0181, "step": 4873 }, { "epoch": 0.2, "grad_norm": 9.399080979868415, "learning_rate": 1.8583933467772604e-05, "loss": 1.21, "step": 4874 }, { "epoch": 0.2, "grad_norm": 4.9770937799087465, "learning_rate": 1.858326414838589e-05, "loss": 1.1564, "step": 4875 }, { "epoch": 0.2, "grad_norm": 5.995687060218061, "learning_rate": 1.8582594682915002e-05, "loss": 1.0595, "step": 4876 }, { "epoch": 0.2, "grad_norm": 7.450483826108329, "learning_rate": 1.8581925071371326e-05, "loss": 1.1849, "step": 4877 }, { "epoch": 0.2, "grad_norm": 5.10970946674854, "learning_rate": 1.858125531376626e-05, "loss": 1.2444, "step": 4878 }, { "epoch": 0.2, "grad_norm": 4.9021065889576505, "learning_rate": 1.8580585410111203e-05, "loss": 0.9461, "step": 4879 }, { "epoch": 0.2, "grad_norm": 4.856278711207781, "learning_rate": 1.8579915360417556e-05, "loss": 0.9842, "step": 4880 }, { "epoch": 0.2, "grad_norm": 6.986797212823269, "learning_rate": 1.857924516469673e-05, "loss": 1.2877, "step": 4881 }, { "epoch": 0.2, "grad_norm": 7.588435111394594, "learning_rate": 1.857857482296012e-05, "loss": 1.3131, "step": 4882 }, { "epoch": 0.2, "grad_norm": 8.343615242976199, "learning_rate": 1.857790433521914e-05, "loss": 1.3931, "step": 4883 }, { "epoch": 0.2, "grad_norm": 5.502042508445991, "learning_rate": 1.8577233701485205e-05, "loss": 1.0414, "step": 4884 }, { "epoch": 0.2, "grad_norm": 10.223020501939153, "learning_rate": 1.8576562921769727e-05, "loss": 1.2803, "step": 4885 }, { "epoch": 0.2, "grad_norm": 7.481782721585964, "learning_rate": 1.857589199608412e-05, "loss": 1.287, "step": 4886 }, { "epoch": 0.2, "grad_norm": 7.030920683006389, "learning_rate": 1.8575220924439805e-05, "loss": 1.2145, "step": 4887 }, { "epoch": 0.2, "grad_norm": 6.16060356432405, "learning_rate": 1.85745497068482e-05, "loss": 1.3323, "step": 4888 }, { "epoch": 0.2, "grad_norm": 7.6002218269859725, "learning_rate": 1.8573878343320734e-05, "loss": 1.4679, "step": 4889 }, { "epoch": 0.2, "grad_norm": 9.651420465779763, "learning_rate": 1.8573206833868832e-05, "loss": 1.293, "step": 4890 }, { "epoch": 0.2, "grad_norm": 5.7809131202827295, "learning_rate": 1.857253517850392e-05, "loss": 1.379, "step": 4891 }, { "epoch": 0.2, "grad_norm": 7.971535161382314, "learning_rate": 1.8571863377237432e-05, "loss": 1.1716, "step": 4892 }, { "epoch": 0.2, "grad_norm": 8.482091171353579, "learning_rate": 1.85711914300808e-05, "loss": 1.3958, "step": 4893 }, { "epoch": 0.2, "grad_norm": 7.086356918943668, "learning_rate": 1.857051933704546e-05, "loss": 1.1768, "step": 4894 }, { "epoch": 0.2, "grad_norm": 8.2387450614894, "learning_rate": 1.8569847098142856e-05, "loss": 1.2017, "step": 4895 }, { "epoch": 0.2, "grad_norm": 9.591810515605802, "learning_rate": 1.8569174713384423e-05, "loss": 1.221, "step": 4896 }, { "epoch": 0.2, "grad_norm": 4.681883975492516, "learning_rate": 1.8568502182781608e-05, "loss": 1.3842, "step": 4897 }, { "epoch": 0.2, "grad_norm": 6.297753322349592, "learning_rate": 1.856782950634586e-05, "loss": 1.3155, "step": 4898 }, { "epoch": 0.2, "grad_norm": 9.169940383647992, "learning_rate": 1.8567156684088616e-05, "loss": 1.3053, "step": 4899 }, { "epoch": 0.2, "grad_norm": 9.091283678384803, "learning_rate": 1.856648371602134e-05, "loss": 1.4594, "step": 4900 }, { "epoch": 0.2, "grad_norm": 7.354255528209761, "learning_rate": 1.8565810602155483e-05, "loss": 1.3114, "step": 4901 }, { "epoch": 0.2, "grad_norm": 8.484746036595489, "learning_rate": 1.8565137342502496e-05, "loss": 1.5166, "step": 4902 }, { "epoch": 0.2, "grad_norm": 8.554593337249125, "learning_rate": 1.856446393707384e-05, "loss": 1.1602, "step": 4903 }, { "epoch": 0.2, "grad_norm": 5.634969094306412, "learning_rate": 1.8563790385880983e-05, "loss": 1.3846, "step": 4904 }, { "epoch": 0.2, "grad_norm": 7.395913559727462, "learning_rate": 1.8563116688935375e-05, "loss": 1.0856, "step": 4905 }, { "epoch": 0.2, "grad_norm": 8.828152307191678, "learning_rate": 1.8562442846248495e-05, "loss": 1.0387, "step": 4906 }, { "epoch": 0.2, "grad_norm": 7.470281914151468, "learning_rate": 1.8561768857831802e-05, "loss": 1.2334, "step": 4907 }, { "epoch": 0.2, "grad_norm": 4.438515434280546, "learning_rate": 1.8561094723696776e-05, "loss": 1.2373, "step": 4908 }, { "epoch": 0.2, "grad_norm": 9.346184439167995, "learning_rate": 1.8560420443854884e-05, "loss": 1.3353, "step": 4909 }, { "epoch": 0.2, "grad_norm": 5.2628086580650795, "learning_rate": 1.8559746018317603e-05, "loss": 1.2597, "step": 4910 }, { "epoch": 0.2, "grad_norm": 5.507550559505195, "learning_rate": 1.855907144709641e-05, "loss": 1.3671, "step": 4911 }, { "epoch": 0.2, "grad_norm": 3.9946184783446776, "learning_rate": 1.855839673020279e-05, "loss": 1.2893, "step": 4912 }, { "epoch": 0.2, "grad_norm": 5.553069372747117, "learning_rate": 1.855772186764823e-05, "loss": 0.8351, "step": 4913 }, { "epoch": 0.2, "grad_norm": 6.264288976564596, "learning_rate": 1.8557046859444204e-05, "loss": 1.1057, "step": 4914 }, { "epoch": 0.2, "grad_norm": 5.091333973259296, "learning_rate": 1.855637170560221e-05, "loss": 1.1846, "step": 4915 }, { "epoch": 0.2, "grad_norm": 5.2464132178785885, "learning_rate": 1.8555696406133736e-05, "loss": 1.0988, "step": 4916 }, { "epoch": 0.2, "grad_norm": 4.749325624671045, "learning_rate": 1.8555020961050273e-05, "loss": 1.3378, "step": 4917 }, { "epoch": 0.2, "grad_norm": 7.529543426667807, "learning_rate": 1.8554345370363323e-05, "loss": 1.2738, "step": 4918 }, { "epoch": 0.2, "grad_norm": 5.294208185826604, "learning_rate": 1.855366963408438e-05, "loss": 1.0611, "step": 4919 }, { "epoch": 0.2, "grad_norm": 6.427671902762321, "learning_rate": 1.855299375222494e-05, "loss": 1.2723, "step": 4920 }, { "epoch": 0.2, "grad_norm": 5.580751747904824, "learning_rate": 1.8552317724796513e-05, "loss": 0.952, "step": 4921 }, { "epoch": 0.2, "grad_norm": 7.049578447339299, "learning_rate": 1.8551641551810607e-05, "loss": 1.2982, "step": 4922 }, { "epoch": 0.2, "grad_norm": 7.6884778484588505, "learning_rate": 1.8550965233278725e-05, "loss": 1.2762, "step": 4923 }, { "epoch": 0.2, "grad_norm": 5.424164564327418, "learning_rate": 1.855028876921238e-05, "loss": 1.2837, "step": 4924 }, { "epoch": 0.2, "grad_norm": 8.111539229992815, "learning_rate": 1.854961215962308e-05, "loss": 1.4555, "step": 4925 }, { "epoch": 0.2, "grad_norm": 7.8098181740773205, "learning_rate": 1.854893540452235e-05, "loss": 1.2716, "step": 4926 }, { "epoch": 0.2, "grad_norm": 6.116227884846033, "learning_rate": 1.85482585039217e-05, "loss": 1.2532, "step": 4927 }, { "epoch": 0.2, "grad_norm": 5.016747759695288, "learning_rate": 1.8547581457832658e-05, "loss": 1.1145, "step": 4928 }, { "epoch": 0.2, "grad_norm": 6.796299870123977, "learning_rate": 1.854690426626674e-05, "loss": 1.1727, "step": 4929 }, { "epoch": 0.2, "grad_norm": 8.601038854342326, "learning_rate": 1.8546226929235475e-05, "loss": 1.5806, "step": 4930 }, { "epoch": 0.2, "grad_norm": 4.524929054731769, "learning_rate": 1.8545549446750392e-05, "loss": 1.1063, "step": 4931 }, { "epoch": 0.2, "grad_norm": 5.006937327041565, "learning_rate": 1.8544871818823016e-05, "loss": 0.9364, "step": 4932 }, { "epoch": 0.2, "grad_norm": 6.5852163600910325, "learning_rate": 1.8544194045464888e-05, "loss": 1.2487, "step": 4933 }, { "epoch": 0.2, "grad_norm": 5.713908266356004, "learning_rate": 1.8543516126687536e-05, "loss": 1.3832, "step": 4934 }, { "epoch": 0.2, "grad_norm": 7.419047460554571, "learning_rate": 1.8542838062502506e-05, "loss": 1.4038, "step": 4935 }, { "epoch": 0.2, "grad_norm": 5.381601208807444, "learning_rate": 1.854215985292133e-05, "loss": 1.3771, "step": 4936 }, { "epoch": 0.2, "grad_norm": 5.9280805930131075, "learning_rate": 1.8541481497955555e-05, "loss": 1.3752, "step": 4937 }, { "epoch": 0.2, "grad_norm": 6.927455138636782, "learning_rate": 1.8540802997616726e-05, "loss": 1.4683, "step": 4938 }, { "epoch": 0.2, "grad_norm": 9.83284083375383, "learning_rate": 1.8540124351916394e-05, "loss": 1.3374, "step": 4939 }, { "epoch": 0.2, "grad_norm": 3.8514139263067007, "learning_rate": 1.8539445560866106e-05, "loss": 1.1766, "step": 4940 }, { "epoch": 0.2, "grad_norm": 5.348588983151623, "learning_rate": 1.8538766624477412e-05, "loss": 1.0839, "step": 4941 }, { "epoch": 0.2, "grad_norm": 5.322861751543494, "learning_rate": 1.8538087542761873e-05, "loss": 1.3323, "step": 4942 }, { "epoch": 0.2, "grad_norm": 5.914716813090794, "learning_rate": 1.853740831573104e-05, "loss": 1.1698, "step": 4943 }, { "epoch": 0.2, "grad_norm": 5.231118437671922, "learning_rate": 1.8536728943396482e-05, "loss": 1.1192, "step": 4944 }, { "epoch": 0.2, "grad_norm": 5.7958137513895105, "learning_rate": 1.8536049425769755e-05, "loss": 1.2501, "step": 4945 }, { "epoch": 0.2, "grad_norm": 5.334595163464552, "learning_rate": 1.8535369762862427e-05, "loss": 1.1639, "step": 4946 }, { "epoch": 0.2, "grad_norm": 4.451249255573226, "learning_rate": 1.853468995468606e-05, "loss": 1.1592, "step": 4947 }, { "epoch": 0.2, "grad_norm": 4.7020273124957415, "learning_rate": 1.8534010001252233e-05, "loss": 1.1763, "step": 4948 }, { "epoch": 0.2, "grad_norm": 12.859141311793975, "learning_rate": 1.8533329902572515e-05, "loss": 1.1168, "step": 4949 }, { "epoch": 0.2, "grad_norm": 5.305421496794454, "learning_rate": 1.853264965865848e-05, "loss": 1.1514, "step": 4950 }, { "epoch": 0.2, "grad_norm": 4.929952330165524, "learning_rate": 1.8531969269521706e-05, "loss": 1.1762, "step": 4951 }, { "epoch": 0.2, "grad_norm": 7.086188822113843, "learning_rate": 1.853128873517377e-05, "loss": 1.0059, "step": 4952 }, { "epoch": 0.2, "grad_norm": 6.276791128863552, "learning_rate": 1.8530608055626263e-05, "loss": 1.1431, "step": 4953 }, { "epoch": 0.2, "grad_norm": 4.6961450974696115, "learning_rate": 1.8529927230890757e-05, "loss": 1.3135, "step": 4954 }, { "epoch": 0.2, "grad_norm": 5.971810219513697, "learning_rate": 1.8529246260978855e-05, "loss": 1.3306, "step": 4955 }, { "epoch": 0.2, "grad_norm": 7.813305709414453, "learning_rate": 1.852856514590213e-05, "loss": 1.1677, "step": 4956 }, { "epoch": 0.2, "grad_norm": 4.865239177873526, "learning_rate": 1.8527883885672188e-05, "loss": 1.2409, "step": 4957 }, { "epoch": 0.2, "grad_norm": 6.892825646524352, "learning_rate": 1.852720248030062e-05, "loss": 1.3161, "step": 4958 }, { "epoch": 0.2, "grad_norm": 6.176479377748006, "learning_rate": 1.852652092979902e-05, "loss": 1.2345, "step": 4959 }, { "epoch": 0.2, "grad_norm": 7.461495014834986, "learning_rate": 1.8525839234178987e-05, "loss": 1.371, "step": 4960 }, { "epoch": 0.2, "grad_norm": 4.180044270748957, "learning_rate": 1.8525157393452133e-05, "loss": 1.2472, "step": 4961 }, { "epoch": 0.2, "grad_norm": 5.8365271233441245, "learning_rate": 1.8524475407630052e-05, "loss": 1.2094, "step": 4962 }, { "epoch": 0.2, "grad_norm": 4.547418145836118, "learning_rate": 1.8523793276724354e-05, "loss": 1.1385, "step": 4963 }, { "epoch": 0.2, "grad_norm": 5.918281451372842, "learning_rate": 1.852311100074665e-05, "loss": 1.0587, "step": 4964 }, { "epoch": 0.2, "grad_norm": 5.975016069866408, "learning_rate": 1.852242857970855e-05, "loss": 0.9834, "step": 4965 }, { "epoch": 0.2, "grad_norm": 6.96099019548006, "learning_rate": 1.8521746013621674e-05, "loss": 1.1401, "step": 4966 }, { "epoch": 0.2, "grad_norm": 6.548623644563767, "learning_rate": 1.8521063302497634e-05, "loss": 0.8945, "step": 4967 }, { "epoch": 0.2, "grad_norm": 7.472231610639752, "learning_rate": 1.852038044634805e-05, "loss": 1.4251, "step": 4968 }, { "epoch": 0.2, "grad_norm": 4.154301469831157, "learning_rate": 1.8519697445184542e-05, "loss": 1.0775, "step": 4969 }, { "epoch": 0.2, "grad_norm": 5.4580070446352735, "learning_rate": 1.851901429901874e-05, "loss": 1.2789, "step": 4970 }, { "epoch": 0.2, "grad_norm": 4.944024577010546, "learning_rate": 1.8518331007862268e-05, "loss": 1.0977, "step": 4971 }, { "epoch": 0.2, "grad_norm": 5.764142644595848, "learning_rate": 1.8517647571726757e-05, "loss": 1.2337, "step": 4972 }, { "epoch": 0.2, "grad_norm": 11.64092749714835, "learning_rate": 1.8516963990623834e-05, "loss": 1.5389, "step": 4973 }, { "epoch": 0.2, "grad_norm": 6.774505039966708, "learning_rate": 1.851628026456514e-05, "loss": 1.2597, "step": 4974 }, { "epoch": 0.2, "grad_norm": 6.394937601767726, "learning_rate": 1.8515596393562304e-05, "loss": 1.3122, "step": 4975 }, { "epoch": 0.2, "grad_norm": 8.901728463538788, "learning_rate": 1.8514912377626974e-05, "loss": 1.5111, "step": 4976 }, { "epoch": 0.2, "grad_norm": 5.213782703032213, "learning_rate": 1.8514228216770784e-05, "loss": 1.1308, "step": 4977 }, { "epoch": 0.2, "grad_norm": 5.80899633128652, "learning_rate": 1.8513543911005384e-05, "loss": 1.2542, "step": 4978 }, { "epoch": 0.2, "grad_norm": 6.93427102443219, "learning_rate": 1.8512859460342415e-05, "loss": 1.1726, "step": 4979 }, { "epoch": 0.2, "grad_norm": 8.261431586668204, "learning_rate": 1.851217486479353e-05, "loss": 1.1313, "step": 4980 }, { "epoch": 0.2, "grad_norm": 8.16267404226706, "learning_rate": 1.8511490124370383e-05, "loss": 1.2813, "step": 4981 }, { "epoch": 0.2, "grad_norm": 6.771018765177063, "learning_rate": 1.851080523908462e-05, "loss": 1.474, "step": 4982 }, { "epoch": 0.2, "grad_norm": 5.36394717980409, "learning_rate": 1.851012020894791e-05, "loss": 1.4049, "step": 4983 }, { "epoch": 0.2, "grad_norm": 5.749617832085656, "learning_rate": 1.8509435033971898e-05, "loss": 1.2213, "step": 4984 }, { "epoch": 0.2, "grad_norm": 8.225439203105179, "learning_rate": 1.8508749714168252e-05, "loss": 1.2758, "step": 4985 }, { "epoch": 0.2, "grad_norm": 4.489371387065393, "learning_rate": 1.8508064249548637e-05, "loss": 1.1545, "step": 4986 }, { "epoch": 0.2, "grad_norm": 6.661627381145513, "learning_rate": 1.850737864012472e-05, "loss": 1.4009, "step": 4987 }, { "epoch": 0.2, "grad_norm": 4.135649099123507, "learning_rate": 1.8506692885908166e-05, "loss": 1.0118, "step": 4988 }, { "epoch": 0.2, "grad_norm": 4.848416411233792, "learning_rate": 1.850600698691065e-05, "loss": 1.1515, "step": 4989 }, { "epoch": 0.2, "grad_norm": 5.406624503002871, "learning_rate": 1.8505320943143843e-05, "loss": 1.265, "step": 4990 }, { "epoch": 0.2, "grad_norm": 7.008468638102408, "learning_rate": 1.850463475461942e-05, "loss": 1.0288, "step": 4991 }, { "epoch": 0.2, "grad_norm": 6.527073334637672, "learning_rate": 1.8503948421349064e-05, "loss": 1.2125, "step": 4992 }, { "epoch": 0.2, "grad_norm": 6.097280638018664, "learning_rate": 1.8503261943344455e-05, "loss": 1.5152, "step": 4993 }, { "epoch": 0.2, "grad_norm": 5.078347927440646, "learning_rate": 1.8502575320617274e-05, "loss": 1.1836, "step": 4994 }, { "epoch": 0.2, "grad_norm": 8.783532659060466, "learning_rate": 1.8501888553179207e-05, "loss": 1.4794, "step": 4995 }, { "epoch": 0.2, "grad_norm": 5.336082009639277, "learning_rate": 1.850120164104195e-05, "loss": 1.3371, "step": 4996 }, { "epoch": 0.2, "grad_norm": 5.681780212040139, "learning_rate": 1.8500514584217182e-05, "loss": 1.1234, "step": 4997 }, { "epoch": 0.2, "grad_norm": 7.023289979702203, "learning_rate": 1.8499827382716607e-05, "loss": 1.0907, "step": 4998 }, { "epoch": 0.2, "grad_norm": 5.288865424461899, "learning_rate": 1.8499140036551914e-05, "loss": 1.2467, "step": 4999 }, { "epoch": 0.2, "grad_norm": 4.959872191251502, "learning_rate": 1.8498452545734808e-05, "loss": 0.9999, "step": 5000 }, { "epoch": 0.2, "grad_norm": 5.772532747816903, "learning_rate": 1.913888626681689e-05, "loss": 1.5018, "step": 5001 }, { "epoch": 0.2, "step": 5001, "total_flos": 705784014913536.0, "train_loss": 0.0003003027410036181, "train_runtime": 45.3711, "train_samples_per_second": 180.555, "train_steps_per_second": 5.642 } ], "logging_steps": 1.0, "max_steps": 256, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "total_flos": 705784014913536.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }