{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 1124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017793594306049821, "grad_norm": 1.713125243161643, "learning_rate": 9.99998046979289e-06, "loss": 0.1091, "step": 1 }, { "epoch": 0.0035587188612099642, "grad_norm": 1.6302122706369235, "learning_rate": 9.999921879324127e-06, "loss": 0.105, "step": 2 }, { "epoch": 0.005338078291814947, "grad_norm": 1.809710022557897, "learning_rate": 9.999824229051425e-06, "loss": 0.1137, "step": 3 }, { "epoch": 0.0071174377224199285, "grad_norm": 2.202959405561451, "learning_rate": 9.999687519737639e-06, "loss": 0.1366, "step": 4 }, { "epoch": 0.008896797153024912, "grad_norm": 2.56865774769625, "learning_rate": 9.99951175245075e-06, "loss": 0.0998, "step": 5 }, { "epoch": 0.010676156583629894, "grad_norm": 2.1461290592083517, "learning_rate": 9.999296928563868e-06, "loss": 0.1289, "step": 6 }, { "epoch": 0.012455516014234875, "grad_norm": 2.5711621281004087, "learning_rate": 9.999043049755216e-06, "loss": 0.1266, "step": 7 }, { "epoch": 0.014234875444839857, "grad_norm": 1.7981967241630354, "learning_rate": 9.998750118008117e-06, "loss": 0.0809, "step": 8 }, { "epoch": 0.01601423487544484, "grad_norm": 1.973917848730113, "learning_rate": 9.998418135610974e-06, "loss": 0.0821, "step": 9 }, { "epoch": 0.017793594306049824, "grad_norm": 2.9625604235753817, "learning_rate": 9.998047105157265e-06, "loss": 0.1591, "step": 10 }, { "epoch": 0.019572953736654804, "grad_norm": 1.9602621665143425, "learning_rate": 9.997637029545509e-06, "loss": 0.0911, "step": 11 }, { "epoch": 0.021352313167259787, "grad_norm": 2.455283390741342, "learning_rate": 9.997187911979252e-06, "loss": 0.1191, "step": 12 }, { "epoch": 0.023131672597864767, "grad_norm": 2.9291596500061425, "learning_rate": 9.996699755967035e-06, "loss": 0.1527, "step": 13 }, { "epoch": 0.02491103202846975, "grad_norm": 2.3788876217664625, "learning_rate": 9.996172565322375e-06, "loss": 0.1147, "step": 14 }, { "epoch": 0.026690391459074734, "grad_norm": 1.9511478291744484, "learning_rate": 9.995606344163728e-06, "loss": 0.1019, "step": 15 }, { "epoch": 0.028469750889679714, "grad_norm": 1.8633939175740353, "learning_rate": 9.995001096914462e-06, "loss": 0.101, "step": 16 }, { "epoch": 0.030249110320284697, "grad_norm": 1.9196819989568994, "learning_rate": 9.994356828302818e-06, "loss": 0.121, "step": 17 }, { "epoch": 0.03202846975088968, "grad_norm": 1.5823196958090517, "learning_rate": 9.993673543361874e-06, "loss": 0.0909, "step": 18 }, { "epoch": 0.033807829181494664, "grad_norm": 2.327516890254515, "learning_rate": 9.992951247429512e-06, "loss": 0.149, "step": 19 }, { "epoch": 0.03558718861209965, "grad_norm": 1.8162688672610439, "learning_rate": 9.992189946148366e-06, "loss": 0.0911, "step": 20 }, { "epoch": 0.037366548042704624, "grad_norm": 1.4041247319897927, "learning_rate": 9.991389645465786e-06, "loss": 0.0739, "step": 21 }, { "epoch": 0.03914590747330961, "grad_norm": 1.8851290416161448, "learning_rate": 9.990550351633784e-06, "loss": 0.1262, "step": 22 }, { "epoch": 0.04092526690391459, "grad_norm": 1.63210744195948, "learning_rate": 9.989672071208993e-06, "loss": 0.1225, "step": 23 }, { "epoch": 0.042704626334519574, "grad_norm": 2.0050146620293154, "learning_rate": 9.988754811052616e-06, "loss": 0.1629, "step": 24 }, { "epoch": 0.04448398576512456, "grad_norm": 2.2192029017519266, "learning_rate": 9.987798578330365e-06, "loss": 0.1528, "step": 25 }, { "epoch": 0.046263345195729534, "grad_norm": 1.8738385921945637, "learning_rate": 9.986803380512406e-06, "loss": 0.1094, "step": 26 }, { "epoch": 0.04804270462633452, "grad_norm": 1.6732757112809902, "learning_rate": 9.98576922537331e-06, "loss": 0.1277, "step": 27 }, { "epoch": 0.0498220640569395, "grad_norm": 1.8916312858303854, "learning_rate": 9.984696120991979e-06, "loss": 0.1285, "step": 28 }, { "epoch": 0.051601423487544484, "grad_norm": 1.7052411499616327, "learning_rate": 9.983584075751598e-06, "loss": 0.1025, "step": 29 }, { "epoch": 0.05338078291814947, "grad_norm": 1.5828027832105331, "learning_rate": 9.982433098339553e-06, "loss": 0.1087, "step": 30 }, { "epoch": 0.05516014234875445, "grad_norm": 2.1560097406788143, "learning_rate": 9.981243197747375e-06, "loss": 0.1455, "step": 31 }, { "epoch": 0.05693950177935943, "grad_norm": 2.607390787096197, "learning_rate": 9.980014383270668e-06, "loss": 0.1533, "step": 32 }, { "epoch": 0.05871886120996441, "grad_norm": 1.9030143959070778, "learning_rate": 9.978746664509032e-06, "loss": 0.1253, "step": 33 }, { "epoch": 0.060498220640569395, "grad_norm": 1.7640275048348333, "learning_rate": 9.97744005136599e-06, "loss": 0.1047, "step": 34 }, { "epoch": 0.06227758007117438, "grad_norm": 1.781622258616041, "learning_rate": 9.976094554048912e-06, "loss": 0.1192, "step": 35 }, { "epoch": 0.06405693950177936, "grad_norm": 2.0819045291855685, "learning_rate": 9.974710183068935e-06, "loss": 0.1138, "step": 36 }, { "epoch": 0.06583629893238434, "grad_norm": 1.7462875295258162, "learning_rate": 9.97328694924088e-06, "loss": 0.1096, "step": 37 }, { "epoch": 0.06761565836298933, "grad_norm": 2.006516306117893, "learning_rate": 9.971824863683168e-06, "loss": 0.1356, "step": 38 }, { "epoch": 0.0693950177935943, "grad_norm": 2.0810563155345636, "learning_rate": 9.970323937817732e-06, "loss": 0.1126, "step": 39 }, { "epoch": 0.0711743772241993, "grad_norm": 1.8092099241779336, "learning_rate": 9.968784183369929e-06, "loss": 0.0951, "step": 40 }, { "epoch": 0.07295373665480427, "grad_norm": 1.9570918946441245, "learning_rate": 9.96720561236845e-06, "loss": 0.1246, "step": 41 }, { "epoch": 0.07473309608540925, "grad_norm": 2.4556981444616035, "learning_rate": 9.965588237145219e-06, "loss": 0.1515, "step": 42 }, { "epoch": 0.07651245551601424, "grad_norm": 1.8564819639482109, "learning_rate": 9.963932070335307e-06, "loss": 0.1102, "step": 43 }, { "epoch": 0.07829181494661921, "grad_norm": 2.1107286592911834, "learning_rate": 9.962237124876828e-06, "loss": 0.1218, "step": 44 }, { "epoch": 0.0800711743772242, "grad_norm": 1.850645723956103, "learning_rate": 9.960503414010833e-06, "loss": 0.1234, "step": 45 }, { "epoch": 0.08185053380782918, "grad_norm": 2.016274659018856, "learning_rate": 9.958730951281218e-06, "loss": 0.1202, "step": 46 }, { "epoch": 0.08362989323843416, "grad_norm": 2.008275852911467, "learning_rate": 9.956919750534607e-06, "loss": 0.1291, "step": 47 }, { "epoch": 0.08540925266903915, "grad_norm": 2.007279817801552, "learning_rate": 9.955069825920249e-06, "loss": 0.1349, "step": 48 }, { "epoch": 0.08718861209964412, "grad_norm": 1.7487451811256158, "learning_rate": 9.953181191889913e-06, "loss": 0.1361, "step": 49 }, { "epoch": 0.08896797153024912, "grad_norm": 1.517484712784115, "learning_rate": 9.95125386319776e-06, "loss": 0.0999, "step": 50 }, { "epoch": 0.09074733096085409, "grad_norm": 2.2499753814457515, "learning_rate": 9.949287854900243e-06, "loss": 0.1518, "step": 51 }, { "epoch": 0.09252669039145907, "grad_norm": 1.941191469639605, "learning_rate": 9.947283182355982e-06, "loss": 0.1227, "step": 52 }, { "epoch": 0.09430604982206406, "grad_norm": 1.8110278448455164, "learning_rate": 9.945239861225644e-06, "loss": 0.1217, "step": 53 }, { "epoch": 0.09608540925266904, "grad_norm": 1.6861188866251153, "learning_rate": 9.943157907471825e-06, "loss": 0.1088, "step": 54 }, { "epoch": 0.09786476868327403, "grad_norm": 1.5380016763892475, "learning_rate": 9.941037337358918e-06, "loss": 0.0897, "step": 55 }, { "epoch": 0.099644128113879, "grad_norm": 2.0206087949865372, "learning_rate": 9.938878167452991e-06, "loss": 0.1297, "step": 56 }, { "epoch": 0.10142348754448399, "grad_norm": 2.2039660026655317, "learning_rate": 9.936680414621663e-06, "loss": 0.1313, "step": 57 }, { "epoch": 0.10320284697508897, "grad_norm": 1.6159185919011219, "learning_rate": 9.934444096033958e-06, "loss": 0.1039, "step": 58 }, { "epoch": 0.10498220640569395, "grad_norm": 1.8014661668111245, "learning_rate": 9.932169229160183e-06, "loss": 0.1277, "step": 59 }, { "epoch": 0.10676156583629894, "grad_norm": 1.8882786008462558, "learning_rate": 9.929855831771787e-06, "loss": 0.1103, "step": 60 }, { "epoch": 0.10854092526690391, "grad_norm": 2.0518090104822924, "learning_rate": 9.927503921941218e-06, "loss": 0.1201, "step": 61 }, { "epoch": 0.1103202846975089, "grad_norm": 2.077298073157288, "learning_rate": 9.925113518041796e-06, "loss": 0.1434, "step": 62 }, { "epoch": 0.11209964412811388, "grad_norm": 2.232364693475648, "learning_rate": 9.922684638747551e-06, "loss": 0.1643, "step": 63 }, { "epoch": 0.11387900355871886, "grad_norm": 1.7031115509859254, "learning_rate": 9.920217303033091e-06, "loss": 0.11, "step": 64 }, { "epoch": 0.11565836298932385, "grad_norm": 1.986748690687823, "learning_rate": 9.917711530173444e-06, "loss": 0.118, "step": 65 }, { "epoch": 0.11743772241992882, "grad_norm": 2.149403485535034, "learning_rate": 9.91516733974392e-06, "loss": 0.1354, "step": 66 }, { "epoch": 0.11921708185053381, "grad_norm": 1.739273816537123, "learning_rate": 9.912584751619943e-06, "loss": 0.1299, "step": 67 }, { "epoch": 0.12099644128113879, "grad_norm": 1.5525247406222797, "learning_rate": 9.909963785976902e-06, "loss": 0.1059, "step": 68 }, { "epoch": 0.12277580071174377, "grad_norm": 1.5786375625670799, "learning_rate": 9.907304463290004e-06, "loss": 0.1115, "step": 69 }, { "epoch": 0.12455516014234876, "grad_norm": 1.5125376174848446, "learning_rate": 9.904606804334094e-06, "loss": 0.0935, "step": 70 }, { "epoch": 0.12633451957295375, "grad_norm": 1.6487502063928734, "learning_rate": 9.901870830183506e-06, "loss": 0.1154, "step": 71 }, { "epoch": 0.12811387900355872, "grad_norm": 1.6202582447353944, "learning_rate": 9.899096562211902e-06, "loss": 0.1133, "step": 72 }, { "epoch": 0.1298932384341637, "grad_norm": 2.0925196722374335, "learning_rate": 9.896284022092088e-06, "loss": 0.1596, "step": 73 }, { "epoch": 0.13167259786476868, "grad_norm": 2.1751876400604253, "learning_rate": 9.893433231795864e-06, "loss": 0.1687, "step": 74 }, { "epoch": 0.13345195729537365, "grad_norm": 2.254229114257366, "learning_rate": 9.890544213593838e-06, "loss": 0.1478, "step": 75 }, { "epoch": 0.13523131672597866, "grad_norm": 1.7064942356188921, "learning_rate": 9.887616990055262e-06, "loss": 0.1268, "step": 76 }, { "epoch": 0.13701067615658363, "grad_norm": 2.251141278398778, "learning_rate": 9.884651584047845e-06, "loss": 0.1368, "step": 77 }, { "epoch": 0.1387900355871886, "grad_norm": 1.9266767368483468, "learning_rate": 9.881648018737587e-06, "loss": 0.1437, "step": 78 }, { "epoch": 0.14056939501779359, "grad_norm": 1.7848749816394225, "learning_rate": 9.878606317588588e-06, "loss": 0.1048, "step": 79 }, { "epoch": 0.1423487544483986, "grad_norm": 2.2510801493075365, "learning_rate": 9.875526504362868e-06, "loss": 0.1569, "step": 80 }, { "epoch": 0.14412811387900357, "grad_norm": 2.2791687325251475, "learning_rate": 9.872408603120187e-06, "loss": 0.164, "step": 81 }, { "epoch": 0.14590747330960854, "grad_norm": 1.8163005919527195, "learning_rate": 9.869252638217846e-06, "loss": 0.1163, "step": 82 }, { "epoch": 0.14768683274021352, "grad_norm": 1.8087620419227883, "learning_rate": 9.866058634310503e-06, "loss": 0.1373, "step": 83 }, { "epoch": 0.1494661921708185, "grad_norm": 1.7281912802627122, "learning_rate": 9.862826616349981e-06, "loss": 0.1277, "step": 84 }, { "epoch": 0.1512455516014235, "grad_norm": 1.923607729784607, "learning_rate": 9.859556609585075e-06, "loss": 0.1269, "step": 85 }, { "epoch": 0.15302491103202848, "grad_norm": 1.831229049893517, "learning_rate": 9.856248639561346e-06, "loss": 0.1154, "step": 86 }, { "epoch": 0.15480427046263345, "grad_norm": 1.9074686739935474, "learning_rate": 9.85290273212093e-06, "loss": 0.11, "step": 87 }, { "epoch": 0.15658362989323843, "grad_norm": 1.8045565678093964, "learning_rate": 9.849518913402334e-06, "loss": 0.1214, "step": 88 }, { "epoch": 0.1583629893238434, "grad_norm": 1.8032943168404858, "learning_rate": 9.84609720984023e-06, "loss": 0.1226, "step": 89 }, { "epoch": 0.1601423487544484, "grad_norm": 1.8975578748071806, "learning_rate": 9.84263764816525e-06, "loss": 0.1166, "step": 90 }, { "epoch": 0.1619217081850534, "grad_norm": 1.8048566163814908, "learning_rate": 9.839140255403776e-06, "loss": 0.1155, "step": 91 }, { "epoch": 0.16370106761565836, "grad_norm": 1.893223930862753, "learning_rate": 9.83560505887773e-06, "loss": 0.1217, "step": 92 }, { "epoch": 0.16548042704626334, "grad_norm": 1.6588280399929534, "learning_rate": 9.83203208620436e-06, "loss": 0.1203, "step": 93 }, { "epoch": 0.16725978647686832, "grad_norm": 2.14320081947194, "learning_rate": 9.828421365296023e-06, "loss": 0.1286, "step": 94 }, { "epoch": 0.16903914590747332, "grad_norm": 2.025436310275238, "learning_rate": 9.824772924359974e-06, "loss": 0.1271, "step": 95 }, { "epoch": 0.1708185053380783, "grad_norm": 1.7723627067257361, "learning_rate": 9.821086791898133e-06, "loss": 0.1109, "step": 96 }, { "epoch": 0.17259786476868327, "grad_norm": 1.4989446449146937, "learning_rate": 9.817362996706872e-06, "loss": 0.1137, "step": 97 }, { "epoch": 0.17437722419928825, "grad_norm": 1.5044874236782473, "learning_rate": 9.81360156787679e-06, "loss": 0.0965, "step": 98 }, { "epoch": 0.17615658362989323, "grad_norm": 1.5268666027615458, "learning_rate": 9.809802534792477e-06, "loss": 0.1034, "step": 99 }, { "epoch": 0.17793594306049823, "grad_norm": 1.8910934973911067, "learning_rate": 9.805965927132294e-06, "loss": 0.1244, "step": 100 }, { "epoch": 0.1797153024911032, "grad_norm": 1.7285625650593492, "learning_rate": 9.802091774868143e-06, "loss": 0.1126, "step": 101 }, { "epoch": 0.18149466192170818, "grad_norm": 1.7589212356828077, "learning_rate": 9.798180108265218e-06, "loss": 0.1282, "step": 102 }, { "epoch": 0.18327402135231316, "grad_norm": 1.5602263250455246, "learning_rate": 9.794230957881785e-06, "loss": 0.0926, "step": 103 }, { "epoch": 0.18505338078291814, "grad_norm": 2.0016073161665893, "learning_rate": 9.79024435456893e-06, "loss": 0.122, "step": 104 }, { "epoch": 0.18683274021352314, "grad_norm": 1.7692510987473, "learning_rate": 9.786220329470334e-06, "loss": 0.1116, "step": 105 }, { "epoch": 0.18861209964412812, "grad_norm": 1.7462432785380835, "learning_rate": 9.782158914022011e-06, "loss": 0.1179, "step": 106 }, { "epoch": 0.1903914590747331, "grad_norm": 1.9729204164994196, "learning_rate": 9.778060139952075e-06, "loss": 0.1473, "step": 107 }, { "epoch": 0.19217081850533807, "grad_norm": 1.7216004683071324, "learning_rate": 9.773924039280488e-06, "loss": 0.112, "step": 108 }, { "epoch": 0.19395017793594305, "grad_norm": 1.2936195343213597, "learning_rate": 9.769750644318814e-06, "loss": 0.0826, "step": 109 }, { "epoch": 0.19572953736654805, "grad_norm": 2.091647632796318, "learning_rate": 9.765539987669956e-06, "loss": 0.1218, "step": 110 }, { "epoch": 0.19750889679715303, "grad_norm": 2.1594912942713913, "learning_rate": 9.761292102227917e-06, "loss": 0.1584, "step": 111 }, { "epoch": 0.199288256227758, "grad_norm": 2.1954918321202035, "learning_rate": 9.757007021177529e-06, "loss": 0.1585, "step": 112 }, { "epoch": 0.20106761565836298, "grad_norm": 1.423979053916681, "learning_rate": 9.752684777994197e-06, "loss": 0.1024, "step": 113 }, { "epoch": 0.20284697508896798, "grad_norm": 2.0610055996363172, "learning_rate": 9.748325406443647e-06, "loss": 0.1308, "step": 114 }, { "epoch": 0.20462633451957296, "grad_norm": 1.9436245682010498, "learning_rate": 9.743928940581646e-06, "loss": 0.1462, "step": 115 }, { "epoch": 0.20640569395017794, "grad_norm": 2.1693767257709387, "learning_rate": 9.739495414753754e-06, "loss": 0.148, "step": 116 }, { "epoch": 0.20818505338078291, "grad_norm": 1.6213065354324263, "learning_rate": 9.73502486359504e-06, "loss": 0.1148, "step": 117 }, { "epoch": 0.2099644128113879, "grad_norm": 2.3237060074988456, "learning_rate": 9.73051732202982e-06, "loss": 0.1366, "step": 118 }, { "epoch": 0.2117437722419929, "grad_norm": 1.7197723438397614, "learning_rate": 9.725972825271381e-06, "loss": 0.1187, "step": 119 }, { "epoch": 0.21352313167259787, "grad_norm": 1.74187600691338, "learning_rate": 9.721391408821713e-06, "loss": 0.1161, "step": 120 }, { "epoch": 0.21530249110320285, "grad_norm": 1.6654992677313791, "learning_rate": 9.716773108471213e-06, "loss": 0.1265, "step": 121 }, { "epoch": 0.21708185053380782, "grad_norm": 1.659156824791211, "learning_rate": 9.712117960298433e-06, "loss": 0.124, "step": 122 }, { "epoch": 0.2188612099644128, "grad_norm": 1.7685127112396284, "learning_rate": 9.707426000669773e-06, "loss": 0.115, "step": 123 }, { "epoch": 0.2206405693950178, "grad_norm": 1.8320851831926341, "learning_rate": 9.702697266239211e-06, "loss": 0.1225, "step": 124 }, { "epoch": 0.22241992882562278, "grad_norm": 2.0034924904385965, "learning_rate": 9.697931793948012e-06, "loss": 0.1419, "step": 125 }, { "epoch": 0.22419928825622776, "grad_norm": 1.3778277109253918, "learning_rate": 9.693129621024441e-06, "loss": 0.1123, "step": 126 }, { "epoch": 0.22597864768683273, "grad_norm": 1.572867202970663, "learning_rate": 9.68829078498347e-06, "loss": 0.1101, "step": 127 }, { "epoch": 0.2277580071174377, "grad_norm": 1.9911478485545366, "learning_rate": 9.683415323626487e-06, "loss": 0.1333, "step": 128 }, { "epoch": 0.22953736654804271, "grad_norm": 2.1526420394512735, "learning_rate": 9.678503275040997e-06, "loss": 0.1406, "step": 129 }, { "epoch": 0.2313167259786477, "grad_norm": 1.7509157608496215, "learning_rate": 9.673554677600336e-06, "loss": 0.1177, "step": 130 }, { "epoch": 0.23309608540925267, "grad_norm": 1.637913603488545, "learning_rate": 9.668569569963355e-06, "loss": 0.127, "step": 131 }, { "epoch": 0.23487544483985764, "grad_norm": 1.3008401042020348, "learning_rate": 9.663547991074129e-06, "loss": 0.0858, "step": 132 }, { "epoch": 0.23665480427046262, "grad_norm": 1.9909063025228493, "learning_rate": 9.658489980161643e-06, "loss": 0.1322, "step": 133 }, { "epoch": 0.23843416370106763, "grad_norm": 1.7908617586454427, "learning_rate": 9.653395576739504e-06, "loss": 0.1279, "step": 134 }, { "epoch": 0.2402135231316726, "grad_norm": 2.1475772083420788, "learning_rate": 9.648264820605611e-06, "loss": 0.1292, "step": 135 }, { "epoch": 0.24199288256227758, "grad_norm": 1.8450806226526224, "learning_rate": 9.643097751841854e-06, "loss": 0.1728, "step": 136 }, { "epoch": 0.24377224199288255, "grad_norm": 1.810413803319839, "learning_rate": 9.637894410813803e-06, "loss": 0.1364, "step": 137 }, { "epoch": 0.24555160142348753, "grad_norm": 2.196555501000083, "learning_rate": 9.632654838170393e-06, "loss": 0.1372, "step": 138 }, { "epoch": 0.24733096085409254, "grad_norm": 1.9545632962479074, "learning_rate": 9.627379074843595e-06, "loss": 0.1344, "step": 139 }, { "epoch": 0.2491103202846975, "grad_norm": 1.7909181406263304, "learning_rate": 9.622067162048111e-06, "loss": 0.1324, "step": 140 }, { "epoch": 0.2508896797153025, "grad_norm": 1.4775863824204931, "learning_rate": 9.616719141281044e-06, "loss": 0.1226, "step": 141 }, { "epoch": 0.2526690391459075, "grad_norm": 2.1594279710190247, "learning_rate": 9.611335054321576e-06, "loss": 0.1624, "step": 142 }, { "epoch": 0.25444839857651247, "grad_norm": 2.0008277111563317, "learning_rate": 9.605914943230637e-06, "loss": 0.1368, "step": 143 }, { "epoch": 0.25622775800711745, "grad_norm": 1.5584971412440352, "learning_rate": 9.600458850350588e-06, "loss": 0.1116, "step": 144 }, { "epoch": 0.2580071174377224, "grad_norm": 1.8962770343238244, "learning_rate": 9.594966818304875e-06, "loss": 0.1274, "step": 145 }, { "epoch": 0.2597864768683274, "grad_norm": 1.66993816626153, "learning_rate": 9.589438889997712e-06, "loss": 0.0981, "step": 146 }, { "epoch": 0.2615658362989324, "grad_norm": 1.5888037677062683, "learning_rate": 9.583875108613727e-06, "loss": 0.1001, "step": 147 }, { "epoch": 0.26334519572953735, "grad_norm": 1.7876632713109166, "learning_rate": 9.578275517617646e-06, "loss": 0.1265, "step": 148 }, { "epoch": 0.26512455516014233, "grad_norm": 1.8250752120968232, "learning_rate": 9.572640160753936e-06, "loss": 0.1286, "step": 149 }, { "epoch": 0.2669039145907473, "grad_norm": 1.8585340687105398, "learning_rate": 9.566969082046471e-06, "loss": 0.1291, "step": 150 }, { "epoch": 0.26868327402135234, "grad_norm": 1.392071488839272, "learning_rate": 9.561262325798188e-06, "loss": 0.0974, "step": 151 }, { "epoch": 0.2704626334519573, "grad_norm": 1.6352044589439714, "learning_rate": 9.555519936590739e-06, "loss": 0.1034, "step": 152 }, { "epoch": 0.2722419928825623, "grad_norm": 1.569559059368265, "learning_rate": 9.549741959284147e-06, "loss": 0.1023, "step": 153 }, { "epoch": 0.27402135231316727, "grad_norm": 1.7620804533134524, "learning_rate": 9.543928439016445e-06, "loss": 0.1206, "step": 154 }, { "epoch": 0.27580071174377224, "grad_norm": 1.653313741667142, "learning_rate": 9.538079421203339e-06, "loss": 0.1128, "step": 155 }, { "epoch": 0.2775800711743772, "grad_norm": 1.6113086994597163, "learning_rate": 9.532194951537838e-06, "loss": 0.1077, "step": 156 }, { "epoch": 0.2793594306049822, "grad_norm": 1.8876239574264841, "learning_rate": 9.52627507598991e-06, "loss": 0.1261, "step": 157 }, { "epoch": 0.28113879003558717, "grad_norm": 1.4699207754893826, "learning_rate": 9.52031984080611e-06, "loss": 0.0981, "step": 158 }, { "epoch": 0.28291814946619215, "grad_norm": 1.671507339053813, "learning_rate": 9.514329292509227e-06, "loss": 0.0999, "step": 159 }, { "epoch": 0.2846975088967972, "grad_norm": 1.6963881236457152, "learning_rate": 9.508303477897925e-06, "loss": 0.1122, "step": 160 }, { "epoch": 0.28647686832740216, "grad_norm": 1.9001224033992847, "learning_rate": 9.502242444046365e-06, "loss": 0.1194, "step": 161 }, { "epoch": 0.28825622775800713, "grad_norm": 1.8042859957541233, "learning_rate": 9.496146238303846e-06, "loss": 0.1338, "step": 162 }, { "epoch": 0.2900355871886121, "grad_norm": 1.8537826618863582, "learning_rate": 9.49001490829443e-06, "loss": 0.1257, "step": 163 }, { "epoch": 0.2918149466192171, "grad_norm": 2.2437080318564484, "learning_rate": 9.483848501916578e-06, "loss": 0.1646, "step": 164 }, { "epoch": 0.29359430604982206, "grad_norm": 1.8856490764519114, "learning_rate": 9.477647067342766e-06, "loss": 0.1305, "step": 165 }, { "epoch": 0.29537366548042704, "grad_norm": 1.871953057517294, "learning_rate": 9.471410653019115e-06, "loss": 0.1296, "step": 166 }, { "epoch": 0.297153024911032, "grad_norm": 1.8890262553041286, "learning_rate": 9.46513930766501e-06, "loss": 0.1461, "step": 167 }, { "epoch": 0.298932384341637, "grad_norm": 1.702604333086193, "learning_rate": 9.458833080272723e-06, "loss": 0.1193, "step": 168 }, { "epoch": 0.30071174377224197, "grad_norm": 2.1060131708011713, "learning_rate": 9.45249202010702e-06, "loss": 0.1247, "step": 169 }, { "epoch": 0.302491103202847, "grad_norm": 1.7844858825551164, "learning_rate": 9.446116176704791e-06, "loss": 0.1214, "step": 170 }, { "epoch": 0.304270462633452, "grad_norm": 1.809965711742156, "learning_rate": 9.439705599874653e-06, "loss": 0.1267, "step": 171 }, { "epoch": 0.30604982206405695, "grad_norm": 1.8383203200257763, "learning_rate": 9.433260339696564e-06, "loss": 0.1569, "step": 172 }, { "epoch": 0.30782918149466193, "grad_norm": 1.851579423022255, "learning_rate": 9.426780446521429e-06, "loss": 0.1427, "step": 173 }, { "epoch": 0.3096085409252669, "grad_norm": 1.7486749889832602, "learning_rate": 9.42026597097071e-06, "loss": 0.1375, "step": 174 }, { "epoch": 0.3113879003558719, "grad_norm": 1.5426125916987024, "learning_rate": 9.413716963936033e-06, "loss": 0.1067, "step": 175 }, { "epoch": 0.31316725978647686, "grad_norm": 1.7767166152296758, "learning_rate": 9.407133476578778e-06, "loss": 0.1304, "step": 176 }, { "epoch": 0.31494661921708184, "grad_norm": 2.0825001597829824, "learning_rate": 9.400515560329698e-06, "loss": 0.1614, "step": 177 }, { "epoch": 0.3167259786476868, "grad_norm": 1.936853644459691, "learning_rate": 9.393863266888501e-06, "loss": 0.1285, "step": 178 }, { "epoch": 0.3185053380782918, "grad_norm": 1.8691308371098896, "learning_rate": 9.387176648223457e-06, "loss": 0.1293, "step": 179 }, { "epoch": 0.3202846975088968, "grad_norm": 2.0845945821205594, "learning_rate": 9.38045575657098e-06, "loss": 0.1385, "step": 180 }, { "epoch": 0.3220640569395018, "grad_norm": 1.8922278351620145, "learning_rate": 9.37370064443524e-06, "loss": 0.1326, "step": 181 }, { "epoch": 0.3238434163701068, "grad_norm": 1.4484109298708199, "learning_rate": 9.366911364587726e-06, "loss": 0.1088, "step": 182 }, { "epoch": 0.32562277580071175, "grad_norm": 1.6068722883178463, "learning_rate": 9.360087970066854e-06, "loss": 0.1158, "step": 183 }, { "epoch": 0.3274021352313167, "grad_norm": 1.6483460514222545, "learning_rate": 9.353230514177553e-06, "loss": 0.1084, "step": 184 }, { "epoch": 0.3291814946619217, "grad_norm": 1.5900280334030124, "learning_rate": 9.346339050490832e-06, "loss": 0.1084, "step": 185 }, { "epoch": 0.3309608540925267, "grad_norm": 1.9398841007100078, "learning_rate": 9.33941363284338e-06, "loss": 0.1243, "step": 186 }, { "epoch": 0.33274021352313166, "grad_norm": 1.768852204916369, "learning_rate": 9.332454315337129e-06, "loss": 0.1211, "step": 187 }, { "epoch": 0.33451957295373663, "grad_norm": 1.8111870820232214, "learning_rate": 9.325461152338846e-06, "loss": 0.1186, "step": 188 }, { "epoch": 0.33629893238434166, "grad_norm": 1.8399631044999671, "learning_rate": 9.3184341984797e-06, "loss": 0.1254, "step": 189 }, { "epoch": 0.33807829181494664, "grad_norm": 1.8969509563880365, "learning_rate": 9.311373508654838e-06, "loss": 0.1282, "step": 190 }, { "epoch": 0.3398576512455516, "grad_norm": 2.063335135899934, "learning_rate": 9.30427913802295e-06, "loss": 0.1707, "step": 191 }, { "epoch": 0.3416370106761566, "grad_norm": 2.1666854721298567, "learning_rate": 9.297151142005852e-06, "loss": 0.148, "step": 192 }, { "epoch": 0.34341637010676157, "grad_norm": 1.650369001188152, "learning_rate": 9.289989576288035e-06, "loss": 0.1191, "step": 193 }, { "epoch": 0.34519572953736655, "grad_norm": 1.8993536752951423, "learning_rate": 9.282794496816244e-06, "loss": 0.1292, "step": 194 }, { "epoch": 0.3469750889679715, "grad_norm": 2.2609471080912584, "learning_rate": 9.27556595979904e-06, "loss": 0.1272, "step": 195 }, { "epoch": 0.3487544483985765, "grad_norm": 1.7117829576423016, "learning_rate": 9.26830402170635e-06, "loss": 0.1287, "step": 196 }, { "epoch": 0.3505338078291815, "grad_norm": 1.6290750835204475, "learning_rate": 9.261008739269035e-06, "loss": 0.1064, "step": 197 }, { "epoch": 0.35231316725978645, "grad_norm": 1.7563570331974911, "learning_rate": 9.253680169478448e-06, "loss": 0.1256, "step": 198 }, { "epoch": 0.3540925266903915, "grad_norm": 1.7616635408950143, "learning_rate": 9.246318369585983e-06, "loss": 0.1236, "step": 199 }, { "epoch": 0.35587188612099646, "grad_norm": 2.239621202888575, "learning_rate": 9.238923397102629e-06, "loss": 0.1506, "step": 200 }, { "epoch": 0.35587188612099646, "eval_loss": 0.13475362956523895, "eval_runtime": 1.9305, "eval_samples_per_second": 23.828, "eval_steps_per_second": 6.216, "step": 200 }, { "epoch": 0.35765124555160144, "grad_norm": 1.722022941441938, "learning_rate": 9.231495309798525e-06, "loss": 0.1094, "step": 201 }, { "epoch": 0.3594306049822064, "grad_norm": 1.8851030530255979, "learning_rate": 9.224034165702506e-06, "loss": 0.1297, "step": 202 }, { "epoch": 0.3612099644128114, "grad_norm": 1.312223289817984, "learning_rate": 9.216540023101646e-06, "loss": 0.0999, "step": 203 }, { "epoch": 0.36298932384341637, "grad_norm": 1.4473604929562314, "learning_rate": 9.209012940540806e-06, "loss": 0.1123, "step": 204 }, { "epoch": 0.36476868327402134, "grad_norm": 1.4571406246983327, "learning_rate": 9.20145297682218e-06, "loss": 0.1176, "step": 205 }, { "epoch": 0.3665480427046263, "grad_norm": 2.3012201580614335, "learning_rate": 9.193860191004833e-06, "loss": 0.1607, "step": 206 }, { "epoch": 0.3683274021352313, "grad_norm": 1.7914329813601564, "learning_rate": 9.186234642404234e-06, "loss": 0.1425, "step": 207 }, { "epoch": 0.3701067615658363, "grad_norm": 2.1066136590560833, "learning_rate": 9.178576390591803e-06, "loss": 0.1376, "step": 208 }, { "epoch": 0.3718861209964413, "grad_norm": 1.4796833660326827, "learning_rate": 9.170885495394435e-06, "loss": 0.1075, "step": 209 }, { "epoch": 0.3736654804270463, "grad_norm": 2.1451912781621703, "learning_rate": 9.16316201689404e-06, "loss": 0.1352, "step": 210 }, { "epoch": 0.37544483985765126, "grad_norm": 1.9950963378711903, "learning_rate": 9.155406015427076e-06, "loss": 0.136, "step": 211 }, { "epoch": 0.37722419928825623, "grad_norm": 2.014416096651257, "learning_rate": 9.147617551584066e-06, "loss": 0.125, "step": 212 }, { "epoch": 0.3790035587188612, "grad_norm": 1.6357866840912252, "learning_rate": 9.139796686209135e-06, "loss": 0.1385, "step": 213 }, { "epoch": 0.3807829181494662, "grad_norm": 1.5071862190810121, "learning_rate": 9.131943480399531e-06, "loss": 0.1035, "step": 214 }, { "epoch": 0.38256227758007116, "grad_norm": 1.5458548378488257, "learning_rate": 9.124057995505148e-06, "loss": 0.1179, "step": 215 }, { "epoch": 0.38434163701067614, "grad_norm": 2.437466435728138, "learning_rate": 9.11614029312805e-06, "loss": 0.179, "step": 216 }, { "epoch": 0.3861209964412811, "grad_norm": 2.4650250923800145, "learning_rate": 9.108190435121982e-06, "loss": 0.145, "step": 217 }, { "epoch": 0.3879003558718861, "grad_norm": 1.9026122449272178, "learning_rate": 9.100208483591892e-06, "loss": 0.1169, "step": 218 }, { "epoch": 0.3896797153024911, "grad_norm": 2.538495019627512, "learning_rate": 9.092194500893448e-06, "loss": 0.1915, "step": 219 }, { "epoch": 0.3914590747330961, "grad_norm": 1.8352767388204005, "learning_rate": 9.084148549632547e-06, "loss": 0.1093, "step": 220 }, { "epoch": 0.3932384341637011, "grad_norm": 1.7395462886077808, "learning_rate": 9.076070692664827e-06, "loss": 0.1323, "step": 221 }, { "epoch": 0.39501779359430605, "grad_norm": 1.9492465490294997, "learning_rate": 9.067960993095176e-06, "loss": 0.1371, "step": 222 }, { "epoch": 0.39679715302491103, "grad_norm": 1.9129506900488067, "learning_rate": 9.059819514277238e-06, "loss": 0.129, "step": 223 }, { "epoch": 0.398576512455516, "grad_norm": 1.5683853142980035, "learning_rate": 9.05164631981292e-06, "loss": 0.1138, "step": 224 }, { "epoch": 0.400355871886121, "grad_norm": 1.8327399507256446, "learning_rate": 9.043441473551893e-06, "loss": 0.1354, "step": 225 }, { "epoch": 0.40213523131672596, "grad_norm": 1.8097839744674817, "learning_rate": 9.035205039591099e-06, "loss": 0.1185, "step": 226 }, { "epoch": 0.40391459074733094, "grad_norm": 1.8950697374562147, "learning_rate": 9.02693708227424e-06, "loss": 0.1308, "step": 227 }, { "epoch": 0.40569395017793597, "grad_norm": 1.5329358084281786, "learning_rate": 9.018637666191284e-06, "loss": 0.1414, "step": 228 }, { "epoch": 0.40747330960854095, "grad_norm": 1.8451527794408986, "learning_rate": 9.010306856177958e-06, "loss": 0.1526, "step": 229 }, { "epoch": 0.4092526690391459, "grad_norm": 1.8746813968819265, "learning_rate": 9.001944717315236e-06, "loss": 0.1521, "step": 230 }, { "epoch": 0.4110320284697509, "grad_norm": 1.8654814285834336, "learning_rate": 8.993551314928846e-06, "loss": 0.1353, "step": 231 }, { "epoch": 0.4128113879003559, "grad_norm": 1.588468323963603, "learning_rate": 8.985126714588739e-06, "loss": 0.101, "step": 232 }, { "epoch": 0.41459074733096085, "grad_norm": 1.6435632650135055, "learning_rate": 8.976670982108591e-06, "loss": 0.1296, "step": 233 }, { "epoch": 0.41637010676156583, "grad_norm": 2.074509978038458, "learning_rate": 8.968184183545285e-06, "loss": 0.161, "step": 234 }, { "epoch": 0.4181494661921708, "grad_norm": 1.800921231336751, "learning_rate": 8.959666385198396e-06, "loss": 0.127, "step": 235 }, { "epoch": 0.4199288256227758, "grad_norm": 1.5481816813962979, "learning_rate": 8.951117653609666e-06, "loss": 0.1066, "step": 236 }, { "epoch": 0.42170818505338076, "grad_norm": 1.8174378992361993, "learning_rate": 8.9425380555625e-06, "loss": 0.1265, "step": 237 }, { "epoch": 0.4234875444839858, "grad_norm": 1.213418675768833, "learning_rate": 8.933927658081423e-06, "loss": 0.0879, "step": 238 }, { "epoch": 0.42526690391459077, "grad_norm": 1.7336578526752655, "learning_rate": 8.925286528431578e-06, "loss": 0.1282, "step": 239 }, { "epoch": 0.42704626334519574, "grad_norm": 1.7581123861292776, "learning_rate": 8.916614734118184e-06, "loss": 0.1074, "step": 240 }, { "epoch": 0.4288256227758007, "grad_norm": 1.4649653987511146, "learning_rate": 8.907912342886016e-06, "loss": 0.1049, "step": 241 }, { "epoch": 0.4306049822064057, "grad_norm": 1.8770703024666036, "learning_rate": 8.899179422718877e-06, "loss": 0.1299, "step": 242 }, { "epoch": 0.43238434163701067, "grad_norm": 1.7042374945879506, "learning_rate": 8.890416041839061e-06, "loss": 0.1205, "step": 243 }, { "epoch": 0.43416370106761565, "grad_norm": 1.541039271792107, "learning_rate": 8.881622268706825e-06, "loss": 0.1048, "step": 244 }, { "epoch": 0.4359430604982206, "grad_norm": 1.623341393251503, "learning_rate": 8.872798172019856e-06, "loss": 0.1039, "step": 245 }, { "epoch": 0.4377224199288256, "grad_norm": 1.8832552058089562, "learning_rate": 8.863943820712726e-06, "loss": 0.1323, "step": 246 }, { "epoch": 0.4395017793594306, "grad_norm": 1.8056962073705773, "learning_rate": 8.855059283956363e-06, "loss": 0.1565, "step": 247 }, { "epoch": 0.4412811387900356, "grad_norm": 2.064823148363225, "learning_rate": 8.8461446311575e-06, "loss": 0.1172, "step": 248 }, { "epoch": 0.4430604982206406, "grad_norm": 1.5565441163419895, "learning_rate": 8.837199931958147e-06, "loss": 0.0999, "step": 249 }, { "epoch": 0.44483985765124556, "grad_norm": 1.8960372885941401, "learning_rate": 8.828225256235035e-06, "loss": 0.1519, "step": 250 }, { "epoch": 0.44661921708185054, "grad_norm": 1.969709272424476, "learning_rate": 8.819220674099074e-06, "loss": 0.1401, "step": 251 }, { "epoch": 0.4483985765124555, "grad_norm": 1.6753926221638498, "learning_rate": 8.810186255894804e-06, "loss": 0.1131, "step": 252 }, { "epoch": 0.4501779359430605, "grad_norm": 1.790313522333547, "learning_rate": 8.801122072199848e-06, "loss": 0.1128, "step": 253 }, { "epoch": 0.45195729537366547, "grad_norm": 1.5256328049106158, "learning_rate": 8.792028193824364e-06, "loss": 0.0996, "step": 254 }, { "epoch": 0.45373665480427045, "grad_norm": 1.5352367306579815, "learning_rate": 8.782904691810478e-06, "loss": 0.0973, "step": 255 }, { "epoch": 0.4555160142348754, "grad_norm": 1.5787792820803526, "learning_rate": 8.77375163743175e-06, "loss": 0.107, "step": 256 }, { "epoch": 0.45729537366548045, "grad_norm": 1.6006892743569767, "learning_rate": 8.764569102192593e-06, "loss": 0.1243, "step": 257 }, { "epoch": 0.45907473309608543, "grad_norm": 1.486918683494662, "learning_rate": 8.755357157827735e-06, "loss": 0.0855, "step": 258 }, { "epoch": 0.4608540925266904, "grad_norm": 1.9745134700331965, "learning_rate": 8.746115876301651e-06, "loss": 0.1381, "step": 259 }, { "epoch": 0.4626334519572954, "grad_norm": 1.7190356287429387, "learning_rate": 8.736845329807994e-06, "loss": 0.1159, "step": 260 }, { "epoch": 0.46441281138790036, "grad_norm": 1.821394554170655, "learning_rate": 8.727545590769044e-06, "loss": 0.1225, "step": 261 }, { "epoch": 0.46619217081850534, "grad_norm": 1.563725148737111, "learning_rate": 8.718216731835131e-06, "loss": 0.1181, "step": 262 }, { "epoch": 0.4679715302491103, "grad_norm": 1.7469646688639315, "learning_rate": 8.708858825884075e-06, "loss": 0.1463, "step": 263 }, { "epoch": 0.4697508896797153, "grad_norm": 1.2802876928340927, "learning_rate": 8.699471946020612e-06, "loss": 0.1046, "step": 264 }, { "epoch": 0.47153024911032027, "grad_norm": 1.726895684120543, "learning_rate": 8.690056165575825e-06, "loss": 0.1203, "step": 265 }, { "epoch": 0.47330960854092524, "grad_norm": 1.6877644560065082, "learning_rate": 8.680611558106571e-06, "loss": 0.1204, "step": 266 }, { "epoch": 0.4750889679715303, "grad_norm": 1.6351820175443095, "learning_rate": 8.671138197394907e-06, "loss": 0.1112, "step": 267 }, { "epoch": 0.47686832740213525, "grad_norm": 1.391943383044968, "learning_rate": 8.661636157447511e-06, "loss": 0.11, "step": 268 }, { "epoch": 0.4786476868327402, "grad_norm": 2.241489032548888, "learning_rate": 8.652105512495106e-06, "loss": 0.1403, "step": 269 }, { "epoch": 0.4804270462633452, "grad_norm": 1.6491124607865364, "learning_rate": 8.64254633699188e-06, "loss": 0.1063, "step": 270 }, { "epoch": 0.4822064056939502, "grad_norm": 2.040840488179136, "learning_rate": 8.632958705614905e-06, "loss": 0.1401, "step": 271 }, { "epoch": 0.48398576512455516, "grad_norm": 1.598519190543473, "learning_rate": 8.623342693263549e-06, "loss": 0.1076, "step": 272 }, { "epoch": 0.48576512455516013, "grad_norm": 1.885951067799148, "learning_rate": 8.6136983750589e-06, "loss": 0.1299, "step": 273 }, { "epoch": 0.4875444839857651, "grad_norm": 1.9080426833799893, "learning_rate": 8.604025826343167e-06, "loss": 0.1321, "step": 274 }, { "epoch": 0.4893238434163701, "grad_norm": 2.167665140654122, "learning_rate": 8.594325122679107e-06, "loss": 0.1496, "step": 275 }, { "epoch": 0.49110320284697506, "grad_norm": 1.6546589653225403, "learning_rate": 8.584596339849419e-06, "loss": 0.1268, "step": 276 }, { "epoch": 0.4928825622775801, "grad_norm": 1.3032213408818085, "learning_rate": 8.574839553856157e-06, "loss": 0.0915, "step": 277 }, { "epoch": 0.49466192170818507, "grad_norm": 2.006909044611022, "learning_rate": 8.565054840920145e-06, "loss": 0.1563, "step": 278 }, { "epoch": 0.49644128113879005, "grad_norm": 1.4707698991652156, "learning_rate": 8.55524227748037e-06, "loss": 0.1003, "step": 279 }, { "epoch": 0.498220640569395, "grad_norm": 1.6284224803578924, "learning_rate": 8.545401940193392e-06, "loss": 0.1041, "step": 280 }, { "epoch": 0.5, "grad_norm": 1.7660225764099606, "learning_rate": 8.535533905932739e-06, "loss": 0.1181, "step": 281 }, { "epoch": 0.501779359430605, "grad_norm": 2.3295025011304404, "learning_rate": 8.525638251788312e-06, "loss": 0.1707, "step": 282 }, { "epoch": 0.50355871886121, "grad_norm": 1.8460447919155434, "learning_rate": 8.515715055065783e-06, "loss": 0.1392, "step": 283 }, { "epoch": 0.505338078291815, "grad_norm": 1.7877505252596202, "learning_rate": 8.505764393285985e-06, "loss": 0.1502, "step": 284 }, { "epoch": 0.5071174377224199, "grad_norm": 1.8027468359852232, "learning_rate": 8.495786344184314e-06, "loss": 0.1155, "step": 285 }, { "epoch": 0.5088967971530249, "grad_norm": 1.8283957307006067, "learning_rate": 8.485780985710113e-06, "loss": 0.1325, "step": 286 }, { "epoch": 0.5106761565836299, "grad_norm": 1.617620230902481, "learning_rate": 8.475748396026074e-06, "loss": 0.1143, "step": 287 }, { "epoch": 0.5124555160142349, "grad_norm": 1.7737423091093987, "learning_rate": 8.46568865350762e-06, "loss": 0.1288, "step": 288 }, { "epoch": 0.5142348754448398, "grad_norm": 2.16238464487307, "learning_rate": 8.45560183674229e-06, "loss": 0.1449, "step": 289 }, { "epoch": 0.5160142348754448, "grad_norm": 1.9859827985295293, "learning_rate": 8.445488024529133e-06, "loss": 0.1336, "step": 290 }, { "epoch": 0.5177935943060499, "grad_norm": 1.4852303020995183, "learning_rate": 8.435347295878087e-06, "loss": 0.0909, "step": 291 }, { "epoch": 0.5195729537366548, "grad_norm": 1.4904683254821536, "learning_rate": 8.425179730009368e-06, "loss": 0.0935, "step": 292 }, { "epoch": 0.5213523131672598, "grad_norm": 1.8017255674921966, "learning_rate": 8.41498540635284e-06, "loss": 0.1289, "step": 293 }, { "epoch": 0.5231316725978647, "grad_norm": 1.868536635319127, "learning_rate": 8.404764404547404e-06, "loss": 0.1225, "step": 294 }, { "epoch": 0.5249110320284698, "grad_norm": 1.925292982995918, "learning_rate": 8.394516804440374e-06, "loss": 0.1255, "step": 295 }, { "epoch": 0.5266903914590747, "grad_norm": 2.0014473918154194, "learning_rate": 8.384242686086848e-06, "loss": 0.1284, "step": 296 }, { "epoch": 0.5284697508896797, "grad_norm": 1.9207087771935707, "learning_rate": 8.373942129749094e-06, "loss": 0.1563, "step": 297 }, { "epoch": 0.5302491103202847, "grad_norm": 1.4839026560301476, "learning_rate": 8.363615215895908e-06, "loss": 0.113, "step": 298 }, { "epoch": 0.5320284697508897, "grad_norm": 2.01462825124946, "learning_rate": 8.353262025202e-06, "loss": 0.1341, "step": 299 }, { "epoch": 0.5338078291814946, "grad_norm": 1.6455438560729794, "learning_rate": 8.342882638547351e-06, "loss": 0.1044, "step": 300 }, { "epoch": 0.5355871886120996, "grad_norm": 1.7305094065473785, "learning_rate": 8.332477137016587e-06, "loss": 0.1188, "step": 301 }, { "epoch": 0.5373665480427047, "grad_norm": 1.542702868065484, "learning_rate": 8.322045601898354e-06, "loss": 0.1015, "step": 302 }, { "epoch": 0.5391459074733096, "grad_norm": 1.5192625113596616, "learning_rate": 8.311588114684665e-06, "loss": 0.1072, "step": 303 }, { "epoch": 0.5409252669039146, "grad_norm": 1.9929013945889749, "learning_rate": 8.301104757070276e-06, "loss": 0.1891, "step": 304 }, { "epoch": 0.5427046263345195, "grad_norm": 1.917073137249212, "learning_rate": 8.290595610952045e-06, "loss": 0.1301, "step": 305 }, { "epoch": 0.5444839857651246, "grad_norm": 2.105582590022374, "learning_rate": 8.280060758428294e-06, "loss": 0.151, "step": 306 }, { "epoch": 0.5462633451957295, "grad_norm": 1.4383491073063088, "learning_rate": 8.269500281798164e-06, "loss": 0.0958, "step": 307 }, { "epoch": 0.5480427046263345, "grad_norm": 1.71817539102519, "learning_rate": 8.258914263560971e-06, "loss": 0.1178, "step": 308 }, { "epoch": 0.5498220640569395, "grad_norm": 1.6918612604010685, "learning_rate": 8.248302786415567e-06, "loss": 0.125, "step": 309 }, { "epoch": 0.5516014234875445, "grad_norm": 1.974800223194389, "learning_rate": 8.237665933259693e-06, "loss": 0.1265, "step": 310 }, { "epoch": 0.5533807829181495, "grad_norm": 2.015176371319247, "learning_rate": 8.227003787189323e-06, "loss": 0.1352, "step": 311 }, { "epoch": 0.5551601423487544, "grad_norm": 1.5231634085500674, "learning_rate": 8.216316431498028e-06, "loss": 0.1125, "step": 312 }, { "epoch": 0.5569395017793595, "grad_norm": 1.686994913410353, "learning_rate": 8.205603949676317e-06, "loss": 0.1421, "step": 313 }, { "epoch": 0.5587188612099644, "grad_norm": 1.6346089206372874, "learning_rate": 8.194866425410984e-06, "loss": 0.1245, "step": 314 }, { "epoch": 0.5604982206405694, "grad_norm": 1.818534252631312, "learning_rate": 8.184103942584456e-06, "loss": 0.1229, "step": 315 }, { "epoch": 0.5622775800711743, "grad_norm": 1.5560887578457265, "learning_rate": 8.173316585274144e-06, "loss": 0.1099, "step": 316 }, { "epoch": 0.5640569395017794, "grad_norm": 1.3145157648633286, "learning_rate": 8.162504437751775e-06, "loss": 0.1061, "step": 317 }, { "epoch": 0.5658362989323843, "grad_norm": 1.5183108662488272, "learning_rate": 8.151667584482742e-06, "loss": 0.1129, "step": 318 }, { "epoch": 0.5676156583629893, "grad_norm": 1.932761087650258, "learning_rate": 8.140806110125442e-06, "loss": 0.1329, "step": 319 }, { "epoch": 0.5693950177935944, "grad_norm": 1.5180293915846248, "learning_rate": 8.129920099530608e-06, "loss": 0.1273, "step": 320 }, { "epoch": 0.5711743772241993, "grad_norm": 1.2786794962721737, "learning_rate": 8.119009637740663e-06, "loss": 0.0786, "step": 321 }, { "epoch": 0.5729537366548043, "grad_norm": 1.5302896723393862, "learning_rate": 8.108074809989032e-06, "loss": 0.1186, "step": 322 }, { "epoch": 0.5747330960854092, "grad_norm": 1.4684058764049013, "learning_rate": 8.097115701699498e-06, "loss": 0.0832, "step": 323 }, { "epoch": 0.5765124555160143, "grad_norm": 1.8338256122637822, "learning_rate": 8.086132398485525e-06, "loss": 0.1372, "step": 324 }, { "epoch": 0.5782918149466192, "grad_norm": 1.2459604312864998, "learning_rate": 8.075124986149583e-06, "loss": 0.0926, "step": 325 }, { "epoch": 0.5800711743772242, "grad_norm": 1.5228442272884104, "learning_rate": 8.064093550682494e-06, "loss": 0.1074, "step": 326 }, { "epoch": 0.5818505338078291, "grad_norm": 1.4784309568139622, "learning_rate": 8.053038178262742e-06, "loss": 0.088, "step": 327 }, { "epoch": 0.5836298932384342, "grad_norm": 1.9491523552105292, "learning_rate": 8.041958955255815e-06, "loss": 0.1309, "step": 328 }, { "epoch": 0.5854092526690391, "grad_norm": 1.7028424152601356, "learning_rate": 8.030855968213518e-06, "loss": 0.116, "step": 329 }, { "epoch": 0.5871886120996441, "grad_norm": 1.832716229289757, "learning_rate": 8.019729303873307e-06, "loss": 0.1222, "step": 330 }, { "epoch": 0.5889679715302492, "grad_norm": 1.4787109520975619, "learning_rate": 8.008579049157607e-06, "loss": 0.0942, "step": 331 }, { "epoch": 0.5907473309608541, "grad_norm": 1.3870666218321572, "learning_rate": 7.99740529117313e-06, "loss": 0.1034, "step": 332 }, { "epoch": 0.5925266903914591, "grad_norm": 1.8729180024029073, "learning_rate": 7.986208117210198e-06, "loss": 0.1215, "step": 333 }, { "epoch": 0.594306049822064, "grad_norm": 1.9494037432879063, "learning_rate": 7.974987614742066e-06, "loss": 0.1291, "step": 334 }, { "epoch": 0.5960854092526691, "grad_norm": 2.074788964054358, "learning_rate": 7.963743871424224e-06, "loss": 0.1354, "step": 335 }, { "epoch": 0.597864768683274, "grad_norm": 1.7358856083117655, "learning_rate": 7.952476975093729e-06, "loss": 0.1096, "step": 336 }, { "epoch": 0.599644128113879, "grad_norm": 2.18549630435684, "learning_rate": 7.941187013768508e-06, "loss": 0.1467, "step": 337 }, { "epoch": 0.6014234875444839, "grad_norm": 1.8549801001780495, "learning_rate": 7.929874075646673e-06, "loss": 0.1158, "step": 338 }, { "epoch": 0.603202846975089, "grad_norm": 2.281363378969548, "learning_rate": 7.918538249105835e-06, "loss": 0.1327, "step": 339 }, { "epoch": 0.604982206405694, "grad_norm": 1.9737692145790846, "learning_rate": 7.907179622702409e-06, "loss": 0.1371, "step": 340 }, { "epoch": 0.6067615658362989, "grad_norm": 1.541775896502389, "learning_rate": 7.895798285170927e-06, "loss": 0.1055, "step": 341 }, { "epoch": 0.608540925266904, "grad_norm": 1.5424414645659343, "learning_rate": 7.88439432542334e-06, "loss": 0.092, "step": 342 }, { "epoch": 0.6103202846975089, "grad_norm": 1.7641424692067884, "learning_rate": 7.872967832548327e-06, "loss": 0.1322, "step": 343 }, { "epoch": 0.6120996441281139, "grad_norm": 1.7068160648312833, "learning_rate": 7.861518895810597e-06, "loss": 0.1342, "step": 344 }, { "epoch": 0.6138790035587188, "grad_norm": 1.6198357548760545, "learning_rate": 7.850047604650188e-06, "loss": 0.1291, "step": 345 }, { "epoch": 0.6156583629893239, "grad_norm": 1.9424546799664637, "learning_rate": 7.838554048681783e-06, "loss": 0.1416, "step": 346 }, { "epoch": 0.6174377224199288, "grad_norm": 1.8134114400606622, "learning_rate": 7.827038317693988e-06, "loss": 0.1739, "step": 347 }, { "epoch": 0.6192170818505338, "grad_norm": 1.6179820531330302, "learning_rate": 7.815500501648654e-06, "loss": 0.1026, "step": 348 }, { "epoch": 0.6209964412811388, "grad_norm": 1.7337222934014782, "learning_rate": 7.80394069068015e-06, "loss": 0.125, "step": 349 }, { "epoch": 0.6227758007117438, "grad_norm": 1.667295539855105, "learning_rate": 7.79235897509468e-06, "loss": 0.1177, "step": 350 }, { "epoch": 0.6245551601423488, "grad_norm": 1.955546157211665, "learning_rate": 7.780755445369563e-06, "loss": 0.1282, "step": 351 }, { "epoch": 0.6263345195729537, "grad_norm": 1.9462021045750202, "learning_rate": 7.769130192152538e-06, "loss": 0.1451, "step": 352 }, { "epoch": 0.6281138790035588, "grad_norm": 1.800382141356206, "learning_rate": 7.757483306261042e-06, "loss": 0.1343, "step": 353 }, { "epoch": 0.6298932384341637, "grad_norm": 2.26985913926227, "learning_rate": 7.745814878681516e-06, "loss": 0.1489, "step": 354 }, { "epoch": 0.6316725978647687, "grad_norm": 1.799130259991671, "learning_rate": 7.734125000568684e-06, "loss": 0.124, "step": 355 }, { "epoch": 0.6334519572953736, "grad_norm": 1.7795729814205064, "learning_rate": 7.722413763244837e-06, "loss": 0.1153, "step": 356 }, { "epoch": 0.6352313167259787, "grad_norm": 1.3982291886331777, "learning_rate": 7.710681258199136e-06, "loss": 0.0891, "step": 357 }, { "epoch": 0.6370106761565836, "grad_norm": 1.8767939872776536, "learning_rate": 7.69892757708688e-06, "loss": 0.1137, "step": 358 }, { "epoch": 0.6387900355871886, "grad_norm": 1.7380204088883748, "learning_rate": 7.687152811728799e-06, "loss": 0.1225, "step": 359 }, { "epoch": 0.6405693950177936, "grad_norm": 1.8623478448351884, "learning_rate": 7.675357054110337e-06, "loss": 0.1296, "step": 360 }, { "epoch": 0.6423487544483986, "grad_norm": 1.6357070391019277, "learning_rate": 7.663540396380931e-06, "loss": 0.1163, "step": 361 }, { "epoch": 0.6441281138790036, "grad_norm": 1.613704121919402, "learning_rate": 7.651702930853287e-06, "loss": 0.1066, "step": 362 }, { "epoch": 0.6459074733096085, "grad_norm": 1.804431133157121, "learning_rate": 7.639844750002668e-06, "loss": 0.1176, "step": 363 }, { "epoch": 0.6476868327402135, "grad_norm": 1.575922045491728, "learning_rate": 7.627965946466167e-06, "loss": 0.1354, "step": 364 }, { "epoch": 0.6494661921708185, "grad_norm": 1.7194160828765315, "learning_rate": 7.616066613041977e-06, "loss": 0.1213, "step": 365 }, { "epoch": 0.6512455516014235, "grad_norm": 1.7672637643995626, "learning_rate": 7.6041468426886785e-06, "loss": 0.1265, "step": 366 }, { "epoch": 0.6530249110320284, "grad_norm": 1.2188856249208775, "learning_rate": 7.592206728524507e-06, "loss": 0.0782, "step": 367 }, { "epoch": 0.6548042704626335, "grad_norm": 1.6574272174437377, "learning_rate": 7.580246363826621e-06, "loss": 0.119, "step": 368 }, { "epoch": 0.6565836298932385, "grad_norm": 1.8707252165943433, "learning_rate": 7.568265842030381e-06, "loss": 0.1299, "step": 369 }, { "epoch": 0.6583629893238434, "grad_norm": 1.8017615193515297, "learning_rate": 7.556265256728618e-06, "loss": 0.133, "step": 370 }, { "epoch": 0.6601423487544484, "grad_norm": 1.5559132921238767, "learning_rate": 7.544244701670894e-06, "loss": 0.1121, "step": 371 }, { "epoch": 0.6619217081850534, "grad_norm": 1.6450753225373973, "learning_rate": 7.532204270762786e-06, "loss": 0.1179, "step": 372 }, { "epoch": 0.6637010676156584, "grad_norm": 1.4763378163352459, "learning_rate": 7.520144058065133e-06, "loss": 0.0935, "step": 373 }, { "epoch": 0.6654804270462633, "grad_norm": 1.5678797250102572, "learning_rate": 7.50806415779332e-06, "loss": 0.0986, "step": 374 }, { "epoch": 0.6672597864768683, "grad_norm": 1.8100227241796532, "learning_rate": 7.495964664316525e-06, "loss": 0.1228, "step": 375 }, { "epoch": 0.6690391459074733, "grad_norm": 1.803251843053676, "learning_rate": 7.4838456721569975e-06, "loss": 0.126, "step": 376 }, { "epoch": 0.6708185053380783, "grad_norm": 1.5683511146446225, "learning_rate": 7.471707275989304e-06, "loss": 0.1094, "step": 377 }, { "epoch": 0.6725978647686833, "grad_norm": 1.5339313950162716, "learning_rate": 7.459549570639602e-06, "loss": 0.1076, "step": 378 }, { "epoch": 0.6743772241992882, "grad_norm": 1.7413818738133184, "learning_rate": 7.447372651084896e-06, "loss": 0.1125, "step": 379 }, { "epoch": 0.6761565836298933, "grad_norm": 1.658372874930133, "learning_rate": 7.435176612452286e-06, "loss": 0.1035, "step": 380 }, { "epoch": 0.6779359430604982, "grad_norm": 1.6938524137294504, "learning_rate": 7.4229615500182396e-06, "loss": 0.1205, "step": 381 }, { "epoch": 0.6797153024911032, "grad_norm": 1.3779729634980662, "learning_rate": 7.4107275592078345e-06, "loss": 0.0919, "step": 382 }, { "epoch": 0.6814946619217082, "grad_norm": 1.5361671194524793, "learning_rate": 7.398474735594022e-06, "loss": 0.0935, "step": 383 }, { "epoch": 0.6832740213523132, "grad_norm": 1.7830401687214017, "learning_rate": 7.386203174896872e-06, "loss": 0.119, "step": 384 }, { "epoch": 0.6850533807829181, "grad_norm": 1.643113393999474, "learning_rate": 7.373912972982838e-06, "loss": 0.1127, "step": 385 }, { "epoch": 0.6868327402135231, "grad_norm": 1.6146921438882946, "learning_rate": 7.361604225863992e-06, "loss": 0.111, "step": 386 }, { "epoch": 0.6886120996441281, "grad_norm": 2.038729594815312, "learning_rate": 7.349277029697287e-06, "loss": 0.1229, "step": 387 }, { "epoch": 0.6903914590747331, "grad_norm": 1.8394395989354648, "learning_rate": 7.336931480783801e-06, "loss": 0.1135, "step": 388 }, { "epoch": 0.6921708185053381, "grad_norm": 2.0657047204677257, "learning_rate": 7.3245676755679854e-06, "loss": 0.1371, "step": 389 }, { "epoch": 0.693950177935943, "grad_norm": 1.8046034252430407, "learning_rate": 7.312185710636911e-06, "loss": 0.1303, "step": 390 }, { "epoch": 0.6957295373665481, "grad_norm": 1.605467186751234, "learning_rate": 7.299785682719512e-06, "loss": 0.0894, "step": 391 }, { "epoch": 0.697508896797153, "grad_norm": 1.7075429581097383, "learning_rate": 7.287367688685835e-06, "loss": 0.1116, "step": 392 }, { "epoch": 0.699288256227758, "grad_norm": 2.32565431520281, "learning_rate": 7.274931825546279e-06, "loss": 0.1418, "step": 393 }, { "epoch": 0.701067615658363, "grad_norm": 1.5879770982216497, "learning_rate": 7.262478190450834e-06, "loss": 0.1082, "step": 394 }, { "epoch": 0.702846975088968, "grad_norm": 1.9778471493382452, "learning_rate": 7.250006880688332e-06, "loss": 0.1339, "step": 395 }, { "epoch": 0.7046263345195729, "grad_norm": 1.8536563807341508, "learning_rate": 7.2375179936856775e-06, "loss": 0.1191, "step": 396 }, { "epoch": 0.7064056939501779, "grad_norm": 1.5238892089449574, "learning_rate": 7.22501162700709e-06, "loss": 0.1168, "step": 397 }, { "epoch": 0.708185053380783, "grad_norm": 1.5633979883321871, "learning_rate": 7.21248787835334e-06, "loss": 0.1034, "step": 398 }, { "epoch": 0.7099644128113879, "grad_norm": 1.4512875858323726, "learning_rate": 7.199946845560994e-06, "loss": 0.0983, "step": 399 }, { "epoch": 0.7117437722419929, "grad_norm": 1.9721738682199543, "learning_rate": 7.1873886266016365e-06, "loss": 0.1648, "step": 400 }, { "epoch": 0.7117437722419929, "eval_loss": 0.12959618866443634, "eval_runtime": 1.8938, "eval_samples_per_second": 24.29, "eval_steps_per_second": 6.337, "step": 400 }, { "epoch": 0.7135231316725978, "grad_norm": 1.650786272208085, "learning_rate": 7.174813319581115e-06, "loss": 0.1312, "step": 401 }, { "epoch": 0.7153024911032029, "grad_norm": 2.2854099811118678, "learning_rate": 7.162221022738768e-06, "loss": 0.1264, "step": 402 }, { "epoch": 0.7170818505338078, "grad_norm": 1.8413188198287198, "learning_rate": 7.149611834446664e-06, "loss": 0.1443, "step": 403 }, { "epoch": 0.7188612099644128, "grad_norm": 1.8316665572391215, "learning_rate": 7.136985853208824e-06, "loss": 0.1269, "step": 404 }, { "epoch": 0.7206405693950177, "grad_norm": 1.7696567334329205, "learning_rate": 7.124343177660462e-06, "loss": 0.1233, "step": 405 }, { "epoch": 0.7224199288256228, "grad_norm": 2.0410226214676617, "learning_rate": 7.111683906567206e-06, "loss": 0.1421, "step": 406 }, { "epoch": 0.7241992882562278, "grad_norm": 1.998853048286752, "learning_rate": 7.099008138824329e-06, "loss": 0.1165, "step": 407 }, { "epoch": 0.7259786476868327, "grad_norm": 1.4819468358435748, "learning_rate": 7.086315973455982e-06, "loss": 0.111, "step": 408 }, { "epoch": 0.7277580071174378, "grad_norm": 1.8351818346938322, "learning_rate": 7.0736075096144084e-06, "loss": 0.1334, "step": 409 }, { "epoch": 0.7295373665480427, "grad_norm": 1.4198255763046972, "learning_rate": 7.060882846579182e-06, "loss": 0.0995, "step": 410 }, { "epoch": 0.7313167259786477, "grad_norm": 1.600917979892676, "learning_rate": 7.048142083756427e-06, "loss": 0.1228, "step": 411 }, { "epoch": 0.7330960854092526, "grad_norm": 1.90864411785868, "learning_rate": 7.035385320678035e-06, "loss": 0.1269, "step": 412 }, { "epoch": 0.7348754448398577, "grad_norm": 1.6237496523624368, "learning_rate": 7.022612657000898e-06, "loss": 0.1178, "step": 413 }, { "epoch": 0.7366548042704626, "grad_norm": 1.6251548811313257, "learning_rate": 7.0098241925061215e-06, "loss": 0.1239, "step": 414 }, { "epoch": 0.7384341637010676, "grad_norm": 1.822242923118626, "learning_rate": 6.997020027098249e-06, "loss": 0.1188, "step": 415 }, { "epoch": 0.7402135231316725, "grad_norm": 1.7162826090987764, "learning_rate": 6.9842002608044844e-06, "loss": 0.1277, "step": 416 }, { "epoch": 0.7419928825622776, "grad_norm": 1.610396787883105, "learning_rate": 6.971364993773901e-06, "loss": 0.1116, "step": 417 }, { "epoch": 0.7437722419928826, "grad_norm": 1.7175436316671284, "learning_rate": 6.958514326276669e-06, "loss": 0.115, "step": 418 }, { "epoch": 0.7455516014234875, "grad_norm": 1.7443156287546546, "learning_rate": 6.945648358703269e-06, "loss": 0.1147, "step": 419 }, { "epoch": 0.7473309608540926, "grad_norm": 1.5876555242050159, "learning_rate": 6.932767191563703e-06, "loss": 0.1323, "step": 420 }, { "epoch": 0.7491103202846975, "grad_norm": 1.5695174495576405, "learning_rate": 6.919870925486718e-06, "loss": 0.0915, "step": 421 }, { "epoch": 0.7508896797153025, "grad_norm": 1.3651172743631523, "learning_rate": 6.906959661219011e-06, "loss": 0.0967, "step": 422 }, { "epoch": 0.7526690391459074, "grad_norm": 1.6444213992770682, "learning_rate": 6.8940334996244505e-06, "loss": 0.118, "step": 423 }, { "epoch": 0.7544483985765125, "grad_norm": 1.5930505187965764, "learning_rate": 6.881092541683279e-06, "loss": 0.1092, "step": 424 }, { "epoch": 0.7562277580071174, "grad_norm": 1.5479644418825749, "learning_rate": 6.8681368884913345e-06, "loss": 0.0997, "step": 425 }, { "epoch": 0.7580071174377224, "grad_norm": 1.3562907458800957, "learning_rate": 6.855166641259252e-06, "loss": 0.0983, "step": 426 }, { "epoch": 0.7597864768683275, "grad_norm": 1.4937038917842138, "learning_rate": 6.8421819013116766e-06, "loss": 0.1151, "step": 427 }, { "epoch": 0.7615658362989324, "grad_norm": 1.7104421483054004, "learning_rate": 6.829182770086474e-06, "loss": 0.1279, "step": 428 }, { "epoch": 0.7633451957295374, "grad_norm": 1.7244890467183474, "learning_rate": 6.816169349133934e-06, "loss": 0.1175, "step": 429 }, { "epoch": 0.7651245551601423, "grad_norm": 1.503448338372118, "learning_rate": 6.803141740115979e-06, "loss": 0.0904, "step": 430 }, { "epoch": 0.7669039145907474, "grad_norm": 2.3929780319512255, "learning_rate": 6.7901000448053676e-06, "loss": 0.1545, "step": 431 }, { "epoch": 0.7686832740213523, "grad_norm": 1.43897184818364, "learning_rate": 6.777044365084907e-06, "loss": 0.0883, "step": 432 }, { "epoch": 0.7704626334519573, "grad_norm": 1.4530826753752788, "learning_rate": 6.763974802946649e-06, "loss": 0.1057, "step": 433 }, { "epoch": 0.7722419928825622, "grad_norm": 1.7940307230352617, "learning_rate": 6.750891460491093e-06, "loss": 0.1408, "step": 434 }, { "epoch": 0.7740213523131673, "grad_norm": 1.9240804867547232, "learning_rate": 6.737794439926395e-06, "loss": 0.1542, "step": 435 }, { "epoch": 0.7758007117437722, "grad_norm": 1.6937236315495823, "learning_rate": 6.724683843567567e-06, "loss": 0.1263, "step": 436 }, { "epoch": 0.7775800711743772, "grad_norm": 1.7995741664274278, "learning_rate": 6.711559773835672e-06, "loss": 0.1212, "step": 437 }, { "epoch": 0.7793594306049823, "grad_norm": 1.984915526739746, "learning_rate": 6.69842233325703e-06, "loss": 0.1291, "step": 438 }, { "epoch": 0.7811387900355872, "grad_norm": 2.1011550852549568, "learning_rate": 6.685271624462416e-06, "loss": 0.1444, "step": 439 }, { "epoch": 0.7829181494661922, "grad_norm": 1.49523653959074, "learning_rate": 6.672107750186255e-06, "loss": 0.1088, "step": 440 }, { "epoch": 0.7846975088967971, "grad_norm": 1.1980117468822036, "learning_rate": 6.658930813265825e-06, "loss": 0.0905, "step": 441 }, { "epoch": 0.7864768683274022, "grad_norm": 1.5409102171679387, "learning_rate": 6.645740916640449e-06, "loss": 0.1086, "step": 442 }, { "epoch": 0.7882562277580071, "grad_norm": 1.336133139289894, "learning_rate": 6.63253816335069e-06, "loss": 0.091, "step": 443 }, { "epoch": 0.7900355871886121, "grad_norm": 1.5049065226881317, "learning_rate": 6.619322656537552e-06, "loss": 0.1115, "step": 444 }, { "epoch": 0.791814946619217, "grad_norm": 1.6763895935577287, "learning_rate": 6.606094499441671e-06, "loss": 0.1126, "step": 445 }, { "epoch": 0.7935943060498221, "grad_norm": 1.701011457351025, "learning_rate": 6.592853795402502e-06, "loss": 0.1165, "step": 446 }, { "epoch": 0.7953736654804271, "grad_norm": 1.7173326356205285, "learning_rate": 6.579600647857525e-06, "loss": 0.1154, "step": 447 }, { "epoch": 0.797153024911032, "grad_norm": 2.0689784607077684, "learning_rate": 6.566335160341425e-06, "loss": 0.1571, "step": 448 }, { "epoch": 0.798932384341637, "grad_norm": 1.4893826484487036, "learning_rate": 6.553057436485289e-06, "loss": 0.1119, "step": 449 }, { "epoch": 0.800711743772242, "grad_norm": 1.4973328833983912, "learning_rate": 6.539767580015799e-06, "loss": 0.1303, "step": 450 }, { "epoch": 0.802491103202847, "grad_norm": 2.121646356038464, "learning_rate": 6.52646569475441e-06, "loss": 0.1245, "step": 451 }, { "epoch": 0.8042704626334519, "grad_norm": 1.7539319055288605, "learning_rate": 6.513151884616556e-06, "loss": 0.1421, "step": 452 }, { "epoch": 0.806049822064057, "grad_norm": 1.8502044144055405, "learning_rate": 6.499826253610823e-06, "loss": 0.1223, "step": 453 }, { "epoch": 0.8078291814946619, "grad_norm": 1.5319280024206472, "learning_rate": 6.486488905838143e-06, "loss": 0.104, "step": 454 }, { "epoch": 0.8096085409252669, "grad_norm": 1.4797150090237958, "learning_rate": 6.473139945490984e-06, "loss": 0.0916, "step": 455 }, { "epoch": 0.8113879003558719, "grad_norm": 1.662988573862244, "learning_rate": 6.459779476852528e-06, "loss": 0.1137, "step": 456 }, { "epoch": 0.8131672597864769, "grad_norm": 1.639421479562668, "learning_rate": 6.446407604295863e-06, "loss": 0.1235, "step": 457 }, { "epoch": 0.8149466192170819, "grad_norm": 1.6822467563938035, "learning_rate": 6.433024432283169e-06, "loss": 0.0969, "step": 458 }, { "epoch": 0.8167259786476868, "grad_norm": 1.8318802686757292, "learning_rate": 6.41963006536489e-06, "loss": 0.1142, "step": 459 }, { "epoch": 0.8185053380782918, "grad_norm": 1.5892619636637637, "learning_rate": 6.4062246081789316e-06, "loss": 0.116, "step": 460 }, { "epoch": 0.8202846975088968, "grad_norm": 1.4199562438109783, "learning_rate": 6.392808165449836e-06, "loss": 0.0763, "step": 461 }, { "epoch": 0.8220640569395018, "grad_norm": 1.5896441433111637, "learning_rate": 6.379380841987965e-06, "loss": 0.1022, "step": 462 }, { "epoch": 0.8238434163701067, "grad_norm": 1.4918014202567753, "learning_rate": 6.365942742688684e-06, "loss": 0.0879, "step": 463 }, { "epoch": 0.8256227758007118, "grad_norm": 2.1081867590383774, "learning_rate": 6.352493972531535e-06, "loss": 0.1398, "step": 464 }, { "epoch": 0.8274021352313167, "grad_norm": 2.236482421245564, "learning_rate": 6.339034636579425e-06, "loss": 0.1402, "step": 465 }, { "epoch": 0.8291814946619217, "grad_norm": 1.7557563854463891, "learning_rate": 6.325564839977802e-06, "loss": 0.0986, "step": 466 }, { "epoch": 0.8309608540925267, "grad_norm": 1.5462814737486743, "learning_rate": 6.312084687953835e-06, "loss": 0.0995, "step": 467 }, { "epoch": 0.8327402135231317, "grad_norm": 2.0033068933866676, "learning_rate": 6.298594285815585e-06, "loss": 0.1415, "step": 468 }, { "epoch": 0.8345195729537367, "grad_norm": 1.9131492658503364, "learning_rate": 6.2850937389511936e-06, "loss": 0.1388, "step": 469 }, { "epoch": 0.8362989323843416, "grad_norm": 1.4654295413644622, "learning_rate": 6.271583152828049e-06, "loss": 0.087, "step": 470 }, { "epoch": 0.8380782918149466, "grad_norm": 1.5753128564647159, "learning_rate": 6.258062632991972e-06, "loss": 0.0877, "step": 471 }, { "epoch": 0.8398576512455516, "grad_norm": 1.6548802244524703, "learning_rate": 6.244532285066382e-06, "loss": 0.1163, "step": 472 }, { "epoch": 0.8416370106761566, "grad_norm": 1.530428330922915, "learning_rate": 6.2309922147514775e-06, "loss": 0.1005, "step": 473 }, { "epoch": 0.8434163701067615, "grad_norm": 1.490709317766687, "learning_rate": 6.2174425278234115e-06, "loss": 0.1281, "step": 474 }, { "epoch": 0.8451957295373665, "grad_norm": 1.7010977681402695, "learning_rate": 6.20388333013346e-06, "loss": 0.0892, "step": 475 }, { "epoch": 0.8469750889679716, "grad_norm": 1.6598493946668496, "learning_rate": 6.190314727607196e-06, "loss": 0.1296, "step": 476 }, { "epoch": 0.8487544483985765, "grad_norm": 1.6584750831655197, "learning_rate": 6.176736826243671e-06, "loss": 0.1138, "step": 477 }, { "epoch": 0.8505338078291815, "grad_norm": 1.6311767798481245, "learning_rate": 6.163149732114571e-06, "loss": 0.115, "step": 478 }, { "epoch": 0.8523131672597865, "grad_norm": 1.664554587358972, "learning_rate": 6.149553551363404e-06, "loss": 0.1021, "step": 479 }, { "epoch": 0.8540925266903915, "grad_norm": 1.789706761510048, "learning_rate": 6.1359483902046605e-06, "loss": 0.1334, "step": 480 }, { "epoch": 0.8558718861209964, "grad_norm": 1.6481955793107783, "learning_rate": 6.122334354922984e-06, "loss": 0.1095, "step": 481 }, { "epoch": 0.8576512455516014, "grad_norm": 1.6933883078833007, "learning_rate": 6.108711551872347e-06, "loss": 0.1197, "step": 482 }, { "epoch": 0.8594306049822064, "grad_norm": 1.9982362806330403, "learning_rate": 6.095080087475218e-06, "loss": 0.1191, "step": 483 }, { "epoch": 0.8612099644128114, "grad_norm": 1.6279749091088431, "learning_rate": 6.0814400682217236e-06, "loss": 0.1195, "step": 484 }, { "epoch": 0.8629893238434164, "grad_norm": 2.0702920307947217, "learning_rate": 6.067791600668823e-06, "loss": 0.0989, "step": 485 }, { "epoch": 0.8647686832740213, "grad_norm": 1.366280914619679, "learning_rate": 6.054134791439479e-06, "loss": 0.0811, "step": 486 }, { "epoch": 0.8665480427046264, "grad_norm": 1.620040269330004, "learning_rate": 6.040469747221815e-06, "loss": 0.0888, "step": 487 }, { "epoch": 0.8683274021352313, "grad_norm": 1.6550005458585226, "learning_rate": 6.026796574768288e-06, "loss": 0.0949, "step": 488 }, { "epoch": 0.8701067615658363, "grad_norm": 1.8368090577979796, "learning_rate": 6.013115380894854e-06, "loss": 0.1096, "step": 489 }, { "epoch": 0.8718861209964412, "grad_norm": 1.4081492155967914, "learning_rate": 5.999426272480133e-06, "loss": 0.0864, "step": 490 }, { "epoch": 0.8736654804270463, "grad_norm": 1.5439311233007154, "learning_rate": 5.985729356464575e-06, "loss": 0.1144, "step": 491 }, { "epoch": 0.8754448398576512, "grad_norm": 1.6390509124621109, "learning_rate": 5.972024739849622e-06, "loss": 0.1086, "step": 492 }, { "epoch": 0.8772241992882562, "grad_norm": 1.1804966375226666, "learning_rate": 5.958312529696874e-06, "loss": 0.0789, "step": 493 }, { "epoch": 0.8790035587188612, "grad_norm": 1.5359055122663885, "learning_rate": 5.944592833127253e-06, "loss": 0.1306, "step": 494 }, { "epoch": 0.8807829181494662, "grad_norm": 1.701310220891771, "learning_rate": 5.9308657573201645e-06, "loss": 0.1385, "step": 495 }, { "epoch": 0.8825622775800712, "grad_norm": 1.7232981407665273, "learning_rate": 5.917131409512663e-06, "loss": 0.1038, "step": 496 }, { "epoch": 0.8843416370106761, "grad_norm": 1.5134955064921805, "learning_rate": 5.903389896998611e-06, "loss": 0.112, "step": 497 }, { "epoch": 0.8861209964412812, "grad_norm": 1.7101215835872345, "learning_rate": 5.889641327127843e-06, "loss": 0.1076, "step": 498 }, { "epoch": 0.8879003558718861, "grad_norm": 1.5969144507758808, "learning_rate": 5.875885807305326e-06, "loss": 0.1127, "step": 499 }, { "epoch": 0.8896797153024911, "grad_norm": 1.5293635939910035, "learning_rate": 5.862123444990319e-06, "loss": 0.102, "step": 500 }, { "epoch": 0.891459074733096, "grad_norm": 1.9129248797631837, "learning_rate": 5.848354347695537e-06, "loss": 0.1657, "step": 501 }, { "epoch": 0.8932384341637011, "grad_norm": 1.779995432700007, "learning_rate": 5.83457862298631e-06, "loss": 0.1301, "step": 502 }, { "epoch": 0.895017793594306, "grad_norm": 1.9085016153186696, "learning_rate": 5.8207963784797396e-06, "loss": 0.1237, "step": 503 }, { "epoch": 0.896797153024911, "grad_norm": 1.6577275898251052, "learning_rate": 5.807007721843862e-06, "loss": 0.1224, "step": 504 }, { "epoch": 0.8985765124555161, "grad_norm": 1.9893076829870855, "learning_rate": 5.793212760796804e-06, "loss": 0.1478, "step": 505 }, { "epoch": 0.900355871886121, "grad_norm": 1.8021286071340172, "learning_rate": 5.779411603105947e-06, "loss": 0.1319, "step": 506 }, { "epoch": 0.902135231316726, "grad_norm": 1.6187095597458239, "learning_rate": 5.765604356587076e-06, "loss": 0.1136, "step": 507 }, { "epoch": 0.9039145907473309, "grad_norm": 1.9304740865856787, "learning_rate": 5.751791129103545e-06, "loss": 0.1244, "step": 508 }, { "epoch": 0.905693950177936, "grad_norm": 1.9072697594835601, "learning_rate": 5.737972028565431e-06, "loss": 0.1423, "step": 509 }, { "epoch": 0.9074733096085409, "grad_norm": 1.7973013201222665, "learning_rate": 5.7241471629286934e-06, "loss": 0.1263, "step": 510 }, { "epoch": 0.9092526690391459, "grad_norm": 2.0208042621710183, "learning_rate": 5.7103166401943276e-06, "loss": 0.1315, "step": 511 }, { "epoch": 0.9110320284697508, "grad_norm": 1.7789311400391496, "learning_rate": 5.696480568407523e-06, "loss": 0.112, "step": 512 }, { "epoch": 0.9128113879003559, "grad_norm": 1.9380071988357526, "learning_rate": 5.682639055656817e-06, "loss": 0.1407, "step": 513 }, { "epoch": 0.9145907473309609, "grad_norm": 1.7941071122408592, "learning_rate": 5.668792210073255e-06, "loss": 0.1419, "step": 514 }, { "epoch": 0.9163701067615658, "grad_norm": 1.782245032916222, "learning_rate": 5.654940139829544e-06, "loss": 0.1222, "step": 515 }, { "epoch": 0.9181494661921709, "grad_norm": 1.555276532221455, "learning_rate": 5.641082953139201e-06, "loss": 0.1048, "step": 516 }, { "epoch": 0.9199288256227758, "grad_norm": 1.186415095228362, "learning_rate": 5.6272207582557195e-06, "loss": 0.0795, "step": 517 }, { "epoch": 0.9217081850533808, "grad_norm": 1.5599270090425488, "learning_rate": 5.61335366347171e-06, "loss": 0.0968, "step": 518 }, { "epoch": 0.9234875444839857, "grad_norm": 1.734183135234115, "learning_rate": 5.599481777118071e-06, "loss": 0.125, "step": 519 }, { "epoch": 0.9252669039145908, "grad_norm": 1.826092599683182, "learning_rate": 5.585605207563124e-06, "loss": 0.0988, "step": 520 }, { "epoch": 0.9270462633451957, "grad_norm": 1.5856036507290956, "learning_rate": 5.571724063211782e-06, "loss": 0.095, "step": 521 }, { "epoch": 0.9288256227758007, "grad_norm": 1.5692105352719896, "learning_rate": 5.557838452504692e-06, "loss": 0.0892, "step": 522 }, { "epoch": 0.9306049822064056, "grad_norm": 1.9410504415194334, "learning_rate": 5.5439484839173996e-06, "loss": 0.1264, "step": 523 }, { "epoch": 0.9323843416370107, "grad_norm": 1.573827854269837, "learning_rate": 5.530054265959486e-06, "loss": 0.1106, "step": 524 }, { "epoch": 0.9341637010676157, "grad_norm": 1.7669043670382616, "learning_rate": 5.516155907173735e-06, "loss": 0.1306, "step": 525 }, { "epoch": 0.9359430604982206, "grad_norm": 2.0723297874693345, "learning_rate": 5.5022535161352764e-06, "loss": 0.1024, "step": 526 }, { "epoch": 0.9377224199288257, "grad_norm": 1.8477927933322909, "learning_rate": 5.488347201450741e-06, "loss": 0.1155, "step": 527 }, { "epoch": 0.9395017793594306, "grad_norm": 1.4335122848552868, "learning_rate": 5.47443707175741e-06, "loss": 0.1034, "step": 528 }, { "epoch": 0.9412811387900356, "grad_norm": 1.6848783449366886, "learning_rate": 5.46052323572237e-06, "loss": 0.1035, "step": 529 }, { "epoch": 0.9430604982206405, "grad_norm": 1.5026758325433949, "learning_rate": 5.446605802041662e-06, "loss": 0.0904, "step": 530 }, { "epoch": 0.9448398576512456, "grad_norm": 1.3646921844987656, "learning_rate": 5.432684879439428e-06, "loss": 0.093, "step": 531 }, { "epoch": 0.9466192170818505, "grad_norm": 1.3970690383199675, "learning_rate": 5.418760576667071e-06, "loss": 0.0787, "step": 532 }, { "epoch": 0.9483985765124555, "grad_norm": 1.7828424411384753, "learning_rate": 5.404833002502398e-06, "loss": 0.135, "step": 533 }, { "epoch": 0.9501779359430605, "grad_norm": 1.846733318748555, "learning_rate": 5.39090226574877e-06, "loss": 0.1151, "step": 534 }, { "epoch": 0.9519572953736655, "grad_norm": 2.081735210052978, "learning_rate": 5.376968475234258e-06, "loss": 0.1481, "step": 535 }, { "epoch": 0.9537366548042705, "grad_norm": 1.2654872607931265, "learning_rate": 5.363031739810787e-06, "loss": 0.0769, "step": 536 }, { "epoch": 0.9555160142348754, "grad_norm": 1.7040244513445832, "learning_rate": 5.349092168353291e-06, "loss": 0.125, "step": 537 }, { "epoch": 0.9572953736654805, "grad_norm": 2.2736701514547386, "learning_rate": 5.335149869758855e-06, "loss": 0.1011, "step": 538 }, { "epoch": 0.9590747330960854, "grad_norm": 1.8704870222474606, "learning_rate": 5.32120495294587e-06, "loss": 0.1243, "step": 539 }, { "epoch": 0.9608540925266904, "grad_norm": 1.5757509714900944, "learning_rate": 5.3072575268531835e-06, "loss": 0.1296, "step": 540 }, { "epoch": 0.9626334519572953, "grad_norm": 1.9597413650086275, "learning_rate": 5.293307700439242e-06, "loss": 0.1315, "step": 541 }, { "epoch": 0.9644128113879004, "grad_norm": 1.9584643962285975, "learning_rate": 5.2793555826812456e-06, "loss": 0.1242, "step": 542 }, { "epoch": 0.9661921708185054, "grad_norm": 1.6703671916927374, "learning_rate": 5.265401282574294e-06, "loss": 0.0931, "step": 543 }, { "epoch": 0.9679715302491103, "grad_norm": 1.7775044285702417, "learning_rate": 5.2514449091305375e-06, "loss": 0.1127, "step": 544 }, { "epoch": 0.9697508896797153, "grad_norm": 1.900198783977832, "learning_rate": 5.237486571378317e-06, "loss": 0.122, "step": 545 }, { "epoch": 0.9715302491103203, "grad_norm": 1.6252752512728061, "learning_rate": 5.22352637836133e-06, "loss": 0.1395, "step": 546 }, { "epoch": 0.9733096085409253, "grad_norm": 1.699565228191691, "learning_rate": 5.209564439137755e-06, "loss": 0.1106, "step": 547 }, { "epoch": 0.9750889679715302, "grad_norm": 1.7727298452584392, "learning_rate": 5.195600862779421e-06, "loss": 0.1455, "step": 548 }, { "epoch": 0.9768683274021353, "grad_norm": 1.1591710836347968, "learning_rate": 5.181635758370942e-06, "loss": 0.0672, "step": 549 }, { "epoch": 0.9786476868327402, "grad_norm": 1.8120422070928126, "learning_rate": 5.167669235008871e-06, "loss": 0.1278, "step": 550 }, { "epoch": 0.9804270462633452, "grad_norm": 2.091801828124558, "learning_rate": 5.153701401800845e-06, "loss": 0.1506, "step": 551 }, { "epoch": 0.9822064056939501, "grad_norm": 1.6050350521689105, "learning_rate": 5.139732367864736e-06, "loss": 0.1084, "step": 552 }, { "epoch": 0.9839857651245552, "grad_norm": 1.4294153326893941, "learning_rate": 5.1257622423277934e-06, "loss": 0.0976, "step": 553 }, { "epoch": 0.9857651245551602, "grad_norm": 1.587042660396233, "learning_rate": 5.111791134325793e-06, "loss": 0.1118, "step": 554 }, { "epoch": 0.9875444839857651, "grad_norm": 1.35922627926222, "learning_rate": 5.097819153002192e-06, "loss": 0.0976, "step": 555 }, { "epoch": 0.9893238434163701, "grad_norm": 1.3856363204998594, "learning_rate": 5.083846407507263e-06, "loss": 0.0921, "step": 556 }, { "epoch": 0.9911032028469751, "grad_norm": 1.9578614122627025, "learning_rate": 5.0698730069972535e-06, "loss": 0.138, "step": 557 }, { "epoch": 0.9928825622775801, "grad_norm": 1.4729540412888569, "learning_rate": 5.055899060633524e-06, "loss": 0.0993, "step": 558 }, { "epoch": 0.994661921708185, "grad_norm": 1.4805535422577267, "learning_rate": 5.041924677581702e-06, "loss": 0.1191, "step": 559 }, { "epoch": 0.99644128113879, "grad_norm": 1.5224552225206658, "learning_rate": 5.0279499670108245e-06, "loss": 0.1125, "step": 560 }, { "epoch": 0.998220640569395, "grad_norm": 1.5788176034634458, "learning_rate": 5.013975038092491e-06, "loss": 0.1103, "step": 561 }, { "epoch": 1.0, "grad_norm": 1.289171803628575, "learning_rate": 5e-06, "loss": 0.0774, "step": 562 }, { "epoch": 1.001779359430605, "grad_norm": 1.321547667551879, "learning_rate": 4.98602496190751e-06, "loss": 0.0648, "step": 563 }, { "epoch": 1.00355871886121, "grad_norm": 1.0341965320750677, "learning_rate": 4.9720500329891755e-06, "loss": 0.0473, "step": 564 }, { "epoch": 1.0053380782918149, "grad_norm": 1.3887310631222791, "learning_rate": 4.9580753224183005e-06, "loss": 0.066, "step": 565 }, { "epoch": 1.00711743772242, "grad_norm": 1.0190005857961302, "learning_rate": 4.944100939366478e-06, "loss": 0.0458, "step": 566 }, { "epoch": 1.008896797153025, "grad_norm": 0.8996172927014492, "learning_rate": 4.930126993002748e-06, "loss": 0.0442, "step": 567 }, { "epoch": 1.01067615658363, "grad_norm": 1.097170668718373, "learning_rate": 4.9161535924927375e-06, "loss": 0.0503, "step": 568 }, { "epoch": 1.0124555160142348, "grad_norm": 0.9670152388080503, "learning_rate": 4.90218084699781e-06, "loss": 0.0371, "step": 569 }, { "epoch": 1.0142348754448398, "grad_norm": 1.40278326941056, "learning_rate": 4.888208865674208e-06, "loss": 0.0642, "step": 570 }, { "epoch": 1.0160142348754448, "grad_norm": 1.1623749611062457, "learning_rate": 4.874237757672209e-06, "loss": 0.0518, "step": 571 }, { "epoch": 1.0177935943060499, "grad_norm": 1.6036458955615824, "learning_rate": 4.8602676321352646e-06, "loss": 0.0773, "step": 572 }, { "epoch": 1.019572953736655, "grad_norm": 1.365741105836663, "learning_rate": 4.846298598199155e-06, "loss": 0.0503, "step": 573 }, { "epoch": 1.0213523131672597, "grad_norm": 1.3357812339388766, "learning_rate": 4.832330764991131e-06, "loss": 0.0446, "step": 574 }, { "epoch": 1.0231316725978647, "grad_norm": 1.4334544851379145, "learning_rate": 4.81836424162906e-06, "loss": 0.0576, "step": 575 }, { "epoch": 1.0249110320284698, "grad_norm": 1.3829268803494221, "learning_rate": 4.80439913722058e-06, "loss": 0.0418, "step": 576 }, { "epoch": 1.0266903914590748, "grad_norm": 1.2483367813256785, "learning_rate": 4.790435560862247e-06, "loss": 0.0449, "step": 577 }, { "epoch": 1.0284697508896796, "grad_norm": 1.695500284079523, "learning_rate": 4.776473621638673e-06, "loss": 0.0435, "step": 578 }, { "epoch": 1.0302491103202847, "grad_norm": 1.8991173686037819, "learning_rate": 4.762513428621684e-06, "loss": 0.058, "step": 579 }, { "epoch": 1.0320284697508897, "grad_norm": 1.5521630263117991, "learning_rate": 4.748555090869464e-06, "loss": 0.0438, "step": 580 }, { "epoch": 1.0338078291814947, "grad_norm": 1.3477409399835327, "learning_rate": 4.734598717425706e-06, "loss": 0.0448, "step": 581 }, { "epoch": 1.0355871886120998, "grad_norm": 1.7106433757366346, "learning_rate": 4.720644417318755e-06, "loss": 0.0635, "step": 582 }, { "epoch": 1.0373665480427046, "grad_norm": 2.851090566805398, "learning_rate": 4.70669229956076e-06, "loss": 0.0709, "step": 583 }, { "epoch": 1.0391459074733096, "grad_norm": 1.5953113364569276, "learning_rate": 4.692742473146818e-06, "loss": 0.0477, "step": 584 }, { "epoch": 1.0409252669039146, "grad_norm": 1.4761057849508938, "learning_rate": 4.678795047054131e-06, "loss": 0.0522, "step": 585 }, { "epoch": 1.0427046263345197, "grad_norm": 1.6559294062055907, "learning_rate": 4.664850130241146e-06, "loss": 0.0413, "step": 586 }, { "epoch": 1.0444839857651245, "grad_norm": 1.4698096409217896, "learning_rate": 4.650907831646711e-06, "loss": 0.0437, "step": 587 }, { "epoch": 1.0462633451957295, "grad_norm": 1.6443599758137117, "learning_rate": 4.636968260189214e-06, "loss": 0.0701, "step": 588 }, { "epoch": 1.0480427046263345, "grad_norm": 1.5694553791769974, "learning_rate": 4.623031524765744e-06, "loss": 0.0429, "step": 589 }, { "epoch": 1.0498220640569396, "grad_norm": 1.4177615456583517, "learning_rate": 4.609097734251231e-06, "loss": 0.0431, "step": 590 }, { "epoch": 1.0516014234875444, "grad_norm": 1.8222158396214463, "learning_rate": 4.595166997497605e-06, "loss": 0.0469, "step": 591 }, { "epoch": 1.0533807829181494, "grad_norm": 1.595194163281614, "learning_rate": 4.58123942333293e-06, "loss": 0.0531, "step": 592 }, { "epoch": 1.0551601423487544, "grad_norm": 1.2829348617654672, "learning_rate": 4.567315120560573e-06, "loss": 0.0398, "step": 593 }, { "epoch": 1.0569395017793595, "grad_norm": 2.5773510148852994, "learning_rate": 4.553394197958339e-06, "loss": 0.054, "step": 594 }, { "epoch": 1.0587188612099645, "grad_norm": 1.3295098546028743, "learning_rate": 4.539476764277631e-06, "loss": 0.0375, "step": 595 }, { "epoch": 1.0604982206405693, "grad_norm": 1.1136857786265548, "learning_rate": 4.525562928242592e-06, "loss": 0.0314, "step": 596 }, { "epoch": 1.0622775800711743, "grad_norm": 1.8457059181272801, "learning_rate": 4.511652798549261e-06, "loss": 0.0571, "step": 597 }, { "epoch": 1.0640569395017794, "grad_norm": 1.754973123612851, "learning_rate": 4.497746483864725e-06, "loss": 0.0527, "step": 598 }, { "epoch": 1.0658362989323844, "grad_norm": 1.4884897747797308, "learning_rate": 4.483844092826267e-06, "loss": 0.034, "step": 599 }, { "epoch": 1.0676156583629894, "grad_norm": 1.1921017691419724, "learning_rate": 4.469945734040516e-06, "loss": 0.0422, "step": 600 }, { "epoch": 1.0676156583629894, "eval_loss": 0.14088106155395508, "eval_runtime": 1.9014, "eval_samples_per_second": 24.193, "eval_steps_per_second": 6.311, "step": 600 }, { "epoch": 1.0693950177935942, "grad_norm": 1.4520627009161615, "learning_rate": 4.456051516082603e-06, "loss": 0.0446, "step": 601 }, { "epoch": 1.0711743772241993, "grad_norm": 2.065566049470662, "learning_rate": 4.442161547495309e-06, "loss": 0.0636, "step": 602 }, { "epoch": 1.0729537366548043, "grad_norm": 1.8731193389363843, "learning_rate": 4.42827593678822e-06, "loss": 0.0541, "step": 603 }, { "epoch": 1.0747330960854093, "grad_norm": 1.5421526602172757, "learning_rate": 4.414394792436877e-06, "loss": 0.0469, "step": 604 }, { "epoch": 1.0765124555160142, "grad_norm": 1.5147767598172146, "learning_rate": 4.400518222881931e-06, "loss": 0.0584, "step": 605 }, { "epoch": 1.0782918149466192, "grad_norm": 1.4569848874928137, "learning_rate": 4.386646336528291e-06, "loss": 0.0471, "step": 606 }, { "epoch": 1.0800711743772242, "grad_norm": 1.2472822600183657, "learning_rate": 4.372779241744282e-06, "loss": 0.0369, "step": 607 }, { "epoch": 1.0818505338078293, "grad_norm": 1.4365896007681718, "learning_rate": 4.358917046860799e-06, "loss": 0.0467, "step": 608 }, { "epoch": 1.083629893238434, "grad_norm": 1.401699727521943, "learning_rate": 4.345059860170458e-06, "loss": 0.0454, "step": 609 }, { "epoch": 1.085409252669039, "grad_norm": 1.5809770179560945, "learning_rate": 4.331207789926746e-06, "loss": 0.0463, "step": 610 }, { "epoch": 1.0871886120996441, "grad_norm": 1.3652320905173538, "learning_rate": 4.317360944343184e-06, "loss": 0.0394, "step": 611 }, { "epoch": 1.0889679715302492, "grad_norm": 1.0954675718060185, "learning_rate": 4.303519431592479e-06, "loss": 0.0319, "step": 612 }, { "epoch": 1.0907473309608542, "grad_norm": 1.8123638898111263, "learning_rate": 4.289683359805673e-06, "loss": 0.0676, "step": 613 }, { "epoch": 1.092526690391459, "grad_norm": 1.5165089008513655, "learning_rate": 4.275852837071309e-06, "loss": 0.036, "step": 614 }, { "epoch": 1.094306049822064, "grad_norm": 1.349530319256878, "learning_rate": 4.26202797143457e-06, "loss": 0.0418, "step": 615 }, { "epoch": 1.096085409252669, "grad_norm": 1.3926375697865283, "learning_rate": 4.248208870896456e-06, "loss": 0.0398, "step": 616 }, { "epoch": 1.097864768683274, "grad_norm": 1.6140068888073313, "learning_rate": 4.234395643412925e-06, "loss": 0.0528, "step": 617 }, { "epoch": 1.099644128113879, "grad_norm": 1.4936700700182268, "learning_rate": 4.220588396894055e-06, "loss": 0.0449, "step": 618 }, { "epoch": 1.101423487544484, "grad_norm": 1.605327651042368, "learning_rate": 4.2067872392031965e-06, "loss": 0.046, "step": 619 }, { "epoch": 1.103202846975089, "grad_norm": 1.6306908072739312, "learning_rate": 4.192992278156141e-06, "loss": 0.0494, "step": 620 }, { "epoch": 1.104982206405694, "grad_norm": 1.5710135845017419, "learning_rate": 4.179203621520262e-06, "loss": 0.0519, "step": 621 }, { "epoch": 1.106761565836299, "grad_norm": 1.5232535291378393, "learning_rate": 4.165421377013691e-06, "loss": 0.0374, "step": 622 }, { "epoch": 1.1085409252669038, "grad_norm": 1.3194104666712834, "learning_rate": 4.151645652304465e-06, "loss": 0.0403, "step": 623 }, { "epoch": 1.1103202846975089, "grad_norm": 1.7240847723214994, "learning_rate": 4.137876555009684e-06, "loss": 0.0615, "step": 624 }, { "epoch": 1.112099644128114, "grad_norm": 1.7096274273606844, "learning_rate": 4.124114192694676e-06, "loss": 0.0474, "step": 625 }, { "epoch": 1.113879003558719, "grad_norm": 1.5625773874625453, "learning_rate": 4.110358672872158e-06, "loss": 0.0504, "step": 626 }, { "epoch": 1.1156583629893237, "grad_norm": 1.396076023321496, "learning_rate": 4.0966101030013915e-06, "loss": 0.0479, "step": 627 }, { "epoch": 1.1174377224199288, "grad_norm": 1.280866591548747, "learning_rate": 4.082868590487339e-06, "loss": 0.0433, "step": 628 }, { "epoch": 1.1192170818505338, "grad_norm": 0.9376285586247859, "learning_rate": 4.069134242679837e-06, "loss": 0.0285, "step": 629 }, { "epoch": 1.1209964412811388, "grad_norm": 1.9861569745921686, "learning_rate": 4.055407166872748e-06, "loss": 0.0504, "step": 630 }, { "epoch": 1.1227758007117439, "grad_norm": 1.2950339400408617, "learning_rate": 4.041687470303127e-06, "loss": 0.0498, "step": 631 }, { "epoch": 1.1245551601423487, "grad_norm": 1.2224045265981505, "learning_rate": 4.02797526015038e-06, "loss": 0.0488, "step": 632 }, { "epoch": 1.1263345195729537, "grad_norm": 1.415136249691723, "learning_rate": 4.014270643535427e-06, "loss": 0.0389, "step": 633 }, { "epoch": 1.1281138790035588, "grad_norm": 1.7473902552323264, "learning_rate": 4.000573727519868e-06, "loss": 0.0603, "step": 634 }, { "epoch": 1.1298932384341638, "grad_norm": 1.4250341245930906, "learning_rate": 3.9868846191051465e-06, "loss": 0.0458, "step": 635 }, { "epoch": 1.1316725978647686, "grad_norm": 1.5697987257601447, "learning_rate": 3.973203425231715e-06, "loss": 0.0537, "step": 636 }, { "epoch": 1.1334519572953736, "grad_norm": 1.3820206518714562, "learning_rate": 3.959530252778187e-06, "loss": 0.0559, "step": 637 }, { "epoch": 1.1352313167259787, "grad_norm": 1.890111172118723, "learning_rate": 3.945865208560522e-06, "loss": 0.0689, "step": 638 }, { "epoch": 1.1370106761565837, "grad_norm": 1.6690475978184052, "learning_rate": 3.932208399331177e-06, "loss": 0.0588, "step": 639 }, { "epoch": 1.1387900355871885, "grad_norm": 1.537753983006339, "learning_rate": 3.918559931778277e-06, "loss": 0.0543, "step": 640 }, { "epoch": 1.1405693950177935, "grad_norm": 1.2252836821683382, "learning_rate": 3.904919912524784e-06, "loss": 0.0334, "step": 641 }, { "epoch": 1.1423487544483986, "grad_norm": 1.3024566427865079, "learning_rate": 3.891288448127654e-06, "loss": 0.0477, "step": 642 }, { "epoch": 1.1441281138790036, "grad_norm": 1.6214233572235495, "learning_rate": 3.877665645077017e-06, "loss": 0.0481, "step": 643 }, { "epoch": 1.1459074733096086, "grad_norm": 1.5322825992787603, "learning_rate": 3.86405160979534e-06, "loss": 0.0531, "step": 644 }, { "epoch": 1.1476868327402134, "grad_norm": 1.4602298716456994, "learning_rate": 3.850446448636597e-06, "loss": 0.0475, "step": 645 }, { "epoch": 1.1494661921708185, "grad_norm": 0.9381534926052785, "learning_rate": 3.8368502678854296e-06, "loss": 0.0279, "step": 646 }, { "epoch": 1.1512455516014235, "grad_norm": 1.32324909822255, "learning_rate": 3.8232631737563306e-06, "loss": 0.0437, "step": 647 }, { "epoch": 1.1530249110320285, "grad_norm": 1.6095475209235117, "learning_rate": 3.809685272392804e-06, "loss": 0.0439, "step": 648 }, { "epoch": 1.1548042704626336, "grad_norm": 1.388044010745066, "learning_rate": 3.796116669866543e-06, "loss": 0.0463, "step": 649 }, { "epoch": 1.1565836298932384, "grad_norm": 1.4902338129431416, "learning_rate": 3.78255747217659e-06, "loss": 0.0469, "step": 650 }, { "epoch": 1.1583629893238434, "grad_norm": 1.2950571221168115, "learning_rate": 3.769007785248523e-06, "loss": 0.0398, "step": 651 }, { "epoch": 1.1601423487544484, "grad_norm": 1.404075114771851, "learning_rate": 3.7554677149336186e-06, "loss": 0.0456, "step": 652 }, { "epoch": 1.1619217081850535, "grad_norm": 1.4870402316668874, "learning_rate": 3.7419373670080284e-06, "loss": 0.046, "step": 653 }, { "epoch": 1.1637010676156583, "grad_norm": 1.8098481304060874, "learning_rate": 3.7284168471719527e-06, "loss": 0.0577, "step": 654 }, { "epoch": 1.1654804270462633, "grad_norm": 1.427817357295298, "learning_rate": 3.7149062610488085e-06, "loss": 0.0434, "step": 655 }, { "epoch": 1.1672597864768683, "grad_norm": 1.3560241301938276, "learning_rate": 3.701405714184416e-06, "loss": 0.0408, "step": 656 }, { "epoch": 1.1690391459074734, "grad_norm": 1.1882146638495006, "learning_rate": 3.687915312046166e-06, "loss": 0.0428, "step": 657 }, { "epoch": 1.1708185053380782, "grad_norm": 1.549165743586227, "learning_rate": 3.6744351600221994e-06, "loss": 0.0361, "step": 658 }, { "epoch": 1.1725978647686832, "grad_norm": 1.711958118064137, "learning_rate": 3.6609653634205773e-06, "loss": 0.0664, "step": 659 }, { "epoch": 1.1743772241992882, "grad_norm": 2.379429750352705, "learning_rate": 3.647506027468467e-06, "loss": 0.0529, "step": 660 }, { "epoch": 1.1761565836298933, "grad_norm": 1.3263718740247306, "learning_rate": 3.6340572573113176e-06, "loss": 0.0439, "step": 661 }, { "epoch": 1.1779359430604983, "grad_norm": 1.0138072445617423, "learning_rate": 3.6206191580120346e-06, "loss": 0.0346, "step": 662 }, { "epoch": 1.1797153024911031, "grad_norm": 1.187321952035311, "learning_rate": 3.6071918345501655e-06, "loss": 0.0343, "step": 663 }, { "epoch": 1.1814946619217082, "grad_norm": 1.5660777663999803, "learning_rate": 3.5937753918210705e-06, "loss": 0.0422, "step": 664 }, { "epoch": 1.1832740213523132, "grad_norm": 1.665330796338174, "learning_rate": 3.5803699346351117e-06, "loss": 0.042, "step": 665 }, { "epoch": 1.1850533807829182, "grad_norm": 1.280733162874042, "learning_rate": 3.566975567716833e-06, "loss": 0.0378, "step": 666 }, { "epoch": 1.1868327402135233, "grad_norm": 1.568241267913538, "learning_rate": 3.5535923957041374e-06, "loss": 0.0509, "step": 667 }, { "epoch": 1.188612099644128, "grad_norm": 1.2754536567847237, "learning_rate": 3.540220523147474e-06, "loss": 0.0416, "step": 668 }, { "epoch": 1.190391459074733, "grad_norm": 1.4117015620711075, "learning_rate": 3.5268600545090183e-06, "loss": 0.0413, "step": 669 }, { "epoch": 1.1921708185053381, "grad_norm": 1.4766697587328628, "learning_rate": 3.513511094161858e-06, "loss": 0.0431, "step": 670 }, { "epoch": 1.193950177935943, "grad_norm": 1.4252410807316729, "learning_rate": 3.5001737463891793e-06, "loss": 0.0356, "step": 671 }, { "epoch": 1.195729537366548, "grad_norm": 1.4613171450739462, "learning_rate": 3.4868481153834454e-06, "loss": 0.052, "step": 672 }, { "epoch": 1.197508896797153, "grad_norm": 1.3907823994123085, "learning_rate": 3.4735343052455905e-06, "loss": 0.036, "step": 673 }, { "epoch": 1.199288256227758, "grad_norm": 1.4184575748732666, "learning_rate": 3.4602324199842026e-06, "loss": 0.0369, "step": 674 }, { "epoch": 1.201067615658363, "grad_norm": 1.36001916499385, "learning_rate": 3.446942563514711e-06, "loss": 0.0447, "step": 675 }, { "epoch": 1.2028469750889679, "grad_norm": 1.5849240899708212, "learning_rate": 3.4336648396585777e-06, "loss": 0.04, "step": 676 }, { "epoch": 1.204626334519573, "grad_norm": 1.8093553492261332, "learning_rate": 3.4203993521424774e-06, "loss": 0.0494, "step": 677 }, { "epoch": 1.206405693950178, "grad_norm": 1.6175915096827707, "learning_rate": 3.407146204597499e-06, "loss": 0.0449, "step": 678 }, { "epoch": 1.208185053380783, "grad_norm": 1.4313997499435398, "learning_rate": 3.3939055005583305e-06, "loss": 0.0407, "step": 679 }, { "epoch": 1.209964412811388, "grad_norm": 1.8171800594362804, "learning_rate": 3.3806773434624475e-06, "loss": 0.0544, "step": 680 }, { "epoch": 1.2117437722419928, "grad_norm": 1.88415600137481, "learning_rate": 3.3674618366493117e-06, "loss": 0.0664, "step": 681 }, { "epoch": 1.2135231316725978, "grad_norm": 1.509965419009144, "learning_rate": 3.3542590833595533e-06, "loss": 0.0441, "step": 682 }, { "epoch": 1.2153024911032029, "grad_norm": 1.3740017262996316, "learning_rate": 3.341069186734176e-06, "loss": 0.0423, "step": 683 }, { "epoch": 1.217081850533808, "grad_norm": 1.1351552579207862, "learning_rate": 3.3278922498137455e-06, "loss": 0.0284, "step": 684 }, { "epoch": 1.2188612099644127, "grad_norm": 1.500731680019629, "learning_rate": 3.314728375537587e-06, "loss": 0.0412, "step": 685 }, { "epoch": 1.2206405693950177, "grad_norm": 1.3947257433457336, "learning_rate": 3.3015776667429724e-06, "loss": 0.0514, "step": 686 }, { "epoch": 1.2224199288256228, "grad_norm": 1.5673957109623669, "learning_rate": 3.2884402261643296e-06, "loss": 0.0435, "step": 687 }, { "epoch": 1.2241992882562278, "grad_norm": 1.646194599971042, "learning_rate": 3.2753161564324344e-06, "loss": 0.0498, "step": 688 }, { "epoch": 1.2259786476868326, "grad_norm": 1.7331102573507795, "learning_rate": 3.262205560073605e-06, "loss": 0.0406, "step": 689 }, { "epoch": 1.2277580071174377, "grad_norm": 1.3195416424228423, "learning_rate": 3.249108539508909e-06, "loss": 0.0233, "step": 690 }, { "epoch": 1.2295373665480427, "grad_norm": 1.645362672136927, "learning_rate": 3.2360251970533527e-06, "loss": 0.059, "step": 691 }, { "epoch": 1.2313167259786477, "grad_norm": 1.4355758651820663, "learning_rate": 3.2229556349150947e-06, "loss": 0.0469, "step": 692 }, { "epoch": 1.2330960854092528, "grad_norm": 1.2130450326062125, "learning_rate": 3.2098999551946337e-06, "loss": 0.0334, "step": 693 }, { "epoch": 1.2348754448398576, "grad_norm": 1.553424758251328, "learning_rate": 3.1968582598840234e-06, "loss": 0.0482, "step": 694 }, { "epoch": 1.2366548042704626, "grad_norm": 1.8096017032291503, "learning_rate": 3.183830650866068e-06, "loss": 0.0502, "step": 695 }, { "epoch": 1.2384341637010676, "grad_norm": 1.3611557146325757, "learning_rate": 3.1708172299135266e-06, "loss": 0.0431, "step": 696 }, { "epoch": 1.2402135231316727, "grad_norm": 2.1315854303653445, "learning_rate": 3.1578180986883234e-06, "loss": 0.0718, "step": 697 }, { "epoch": 1.2419928825622777, "grad_norm": 1.5995732681828845, "learning_rate": 3.1448333587407486e-06, "loss": 0.0397, "step": 698 }, { "epoch": 1.2437722419928825, "grad_norm": 1.9099934919827357, "learning_rate": 3.131863111508667e-06, "loss": 0.0481, "step": 699 }, { "epoch": 1.2455516014234875, "grad_norm": 1.7796727726121742, "learning_rate": 3.118907458316722e-06, "loss": 0.0458, "step": 700 }, { "epoch": 1.2473309608540926, "grad_norm": 1.5413449983837406, "learning_rate": 3.105966500375551e-06, "loss": 0.0304, "step": 701 }, { "epoch": 1.2491103202846976, "grad_norm": 1.5340534676383317, "learning_rate": 3.0930403387809892e-06, "loss": 0.0563, "step": 702 }, { "epoch": 1.2508896797153026, "grad_norm": 1.7857283319423334, "learning_rate": 3.080129074513285e-06, "loss": 0.0604, "step": 703 }, { "epoch": 1.2526690391459074, "grad_norm": 1.4470212376702312, "learning_rate": 3.067232808436299e-06, "loss": 0.0416, "step": 704 }, { "epoch": 1.2544483985765125, "grad_norm": 1.581230153366629, "learning_rate": 3.0543516412967327e-06, "loss": 0.0412, "step": 705 }, { "epoch": 1.2562277580071175, "grad_norm": 1.2528928269572963, "learning_rate": 3.041485673723331e-06, "loss": 0.0359, "step": 706 }, { "epoch": 1.2580071174377223, "grad_norm": 1.414387324032504, "learning_rate": 3.0286350062261017e-06, "loss": 0.0431, "step": 707 }, { "epoch": 1.2597864768683273, "grad_norm": 1.3825588629970618, "learning_rate": 3.0157997391955172e-06, "loss": 0.0427, "step": 708 }, { "epoch": 1.2615658362989324, "grad_norm": 1.5765392660994293, "learning_rate": 3.0029799729017518e-06, "loss": 0.0504, "step": 709 }, { "epoch": 1.2633451957295374, "grad_norm": 1.2090760592199181, "learning_rate": 2.9901758074938797e-06, "loss": 0.0355, "step": 710 }, { "epoch": 1.2651245551601424, "grad_norm": 1.437616375079449, "learning_rate": 2.977387342999103e-06, "loss": 0.0392, "step": 711 }, { "epoch": 1.2669039145907472, "grad_norm": 1.4974122591348888, "learning_rate": 2.964614679321966e-06, "loss": 0.0427, "step": 712 }, { "epoch": 1.2686832740213523, "grad_norm": 1.5168013370760074, "learning_rate": 2.951857916243574e-06, "loss": 0.0516, "step": 713 }, { "epoch": 1.2704626334519573, "grad_norm": 2.003028011281055, "learning_rate": 2.9391171534208185e-06, "loss": 0.0693, "step": 714 }, { "epoch": 1.2722419928825623, "grad_norm": 1.7991683612987106, "learning_rate": 2.9263924903855932e-06, "loss": 0.0406, "step": 715 }, { "epoch": 1.2740213523131674, "grad_norm": 1.3303723493564288, "learning_rate": 2.9136840265440213e-06, "loss": 0.0413, "step": 716 }, { "epoch": 1.2758007117437722, "grad_norm": 1.526024916039291, "learning_rate": 2.9009918611756732e-06, "loss": 0.0467, "step": 717 }, { "epoch": 1.2775800711743772, "grad_norm": 1.5867906173093136, "learning_rate": 2.8883160934327968e-06, "loss": 0.0416, "step": 718 }, { "epoch": 1.2793594306049823, "grad_norm": 1.9563296293164805, "learning_rate": 2.8756568223395396e-06, "loss": 0.0512, "step": 719 }, { "epoch": 1.281138790035587, "grad_norm": 1.8578308384644335, "learning_rate": 2.8630141467911777e-06, "loss": 0.051, "step": 720 }, { "epoch": 1.282918149466192, "grad_norm": 1.3561542441838423, "learning_rate": 2.8503881655533395e-06, "loss": 0.0354, "step": 721 }, { "epoch": 1.2846975088967971, "grad_norm": 1.315049816981894, "learning_rate": 2.837778977261235e-06, "loss": 0.0423, "step": 722 }, { "epoch": 1.2864768683274022, "grad_norm": 1.2376947248706438, "learning_rate": 2.8251866804188875e-06, "loss": 0.036, "step": 723 }, { "epoch": 1.2882562277580072, "grad_norm": 1.2549310049286888, "learning_rate": 2.812611373398365e-06, "loss": 0.0433, "step": 724 }, { "epoch": 1.290035587188612, "grad_norm": 1.451092755538499, "learning_rate": 2.8000531544390064e-06, "loss": 0.0373, "step": 725 }, { "epoch": 1.291814946619217, "grad_norm": 1.5962826162874273, "learning_rate": 2.7875121216466595e-06, "loss": 0.0501, "step": 726 }, { "epoch": 1.293594306049822, "grad_norm": 1.548895078757718, "learning_rate": 2.7749883729929105e-06, "loss": 0.0493, "step": 727 }, { "epoch": 1.295373665480427, "grad_norm": 1.17990257829568, "learning_rate": 2.762482006314324e-06, "loss": 0.0331, "step": 728 }, { "epoch": 1.2971530249110321, "grad_norm": 1.418124895350595, "learning_rate": 2.7499931193116692e-06, "loss": 0.0333, "step": 729 }, { "epoch": 1.298932384341637, "grad_norm": 1.5240534414358369, "learning_rate": 2.737521809549167e-06, "loss": 0.0483, "step": 730 }, { "epoch": 1.300711743772242, "grad_norm": 1.7782199464705766, "learning_rate": 2.725068174453722e-06, "loss": 0.0555, "step": 731 }, { "epoch": 1.302491103202847, "grad_norm": 1.536452691578479, "learning_rate": 2.712632311314165e-06, "loss": 0.0443, "step": 732 }, { "epoch": 1.304270462633452, "grad_norm": 1.7260553220223356, "learning_rate": 2.7002143172804875e-06, "loss": 0.0399, "step": 733 }, { "epoch": 1.306049822064057, "grad_norm": 1.6688231505615356, "learning_rate": 2.6878142893630904e-06, "loss": 0.0484, "step": 734 }, { "epoch": 1.3078291814946619, "grad_norm": 1.3634331026927937, "learning_rate": 2.6754323244320154e-06, "loss": 0.0506, "step": 735 }, { "epoch": 1.309608540925267, "grad_norm": 1.2266569925734165, "learning_rate": 2.6630685192161995e-06, "loss": 0.0386, "step": 736 }, { "epoch": 1.311387900355872, "grad_norm": 1.547088872548096, "learning_rate": 2.650722970302714e-06, "loss": 0.0373, "step": 737 }, { "epoch": 1.3131672597864767, "grad_norm": 1.7992473615890379, "learning_rate": 2.638395774136009e-06, "loss": 0.0536, "step": 738 }, { "epoch": 1.3149466192170818, "grad_norm": 1.5739426478457876, "learning_rate": 2.6260870270171645e-06, "loss": 0.0437, "step": 739 }, { "epoch": 1.3167259786476868, "grad_norm": 1.5956055301638785, "learning_rate": 2.613796825103129e-06, "loss": 0.0519, "step": 740 }, { "epoch": 1.3185053380782918, "grad_norm": 1.41793678602245, "learning_rate": 2.60152526440598e-06, "loss": 0.0391, "step": 741 }, { "epoch": 1.3202846975088969, "grad_norm": 1.220700573283321, "learning_rate": 2.5892724407921667e-06, "loss": 0.03, "step": 742 }, { "epoch": 1.3220640569395017, "grad_norm": 1.8057026056641117, "learning_rate": 2.577038449981763e-06, "loss": 0.0545, "step": 743 }, { "epoch": 1.3238434163701067, "grad_norm": 1.5437830629091855, "learning_rate": 2.564823387547716e-06, "loss": 0.0401, "step": 744 }, { "epoch": 1.3256227758007118, "grad_norm": 1.7115235307639178, "learning_rate": 2.552627348915106e-06, "loss": 0.0503, "step": 745 }, { "epoch": 1.3274021352313168, "grad_norm": 1.7122787035013294, "learning_rate": 2.5404504293603983e-06, "loss": 0.0586, "step": 746 }, { "epoch": 1.3291814946619218, "grad_norm": 1.874841166084849, "learning_rate": 2.528292724010697e-06, "loss": 0.0617, "step": 747 }, { "epoch": 1.3309608540925266, "grad_norm": 1.395531322757944, "learning_rate": 2.5161543278430055e-06, "loss": 0.0431, "step": 748 }, { "epoch": 1.3327402135231317, "grad_norm": 1.5827454142328405, "learning_rate": 2.5040353356834756e-06, "loss": 0.0452, "step": 749 }, { "epoch": 1.3345195729537367, "grad_norm": 1.6750835011708034, "learning_rate": 2.4919358422066816e-06, "loss": 0.0389, "step": 750 }, { "epoch": 1.3362989323843417, "grad_norm": 1.7128735446274321, "learning_rate": 2.4798559419348672e-06, "loss": 0.0421, "step": 751 }, { "epoch": 1.3380782918149468, "grad_norm": 1.2099184361355606, "learning_rate": 2.4677957292372166e-06, "loss": 0.0357, "step": 752 }, { "epoch": 1.3398576512455516, "grad_norm": 1.1929806757750958, "learning_rate": 2.455755298329107e-06, "loss": 0.0381, "step": 753 }, { "epoch": 1.3416370106761566, "grad_norm": 1.2743550325713942, "learning_rate": 2.4437347432713838e-06, "loss": 0.0365, "step": 754 }, { "epoch": 1.3434163701067616, "grad_norm": 1.6149355720674208, "learning_rate": 2.431734157969619e-06, "loss": 0.0438, "step": 755 }, { "epoch": 1.3451957295373664, "grad_norm": 1.7415000691153033, "learning_rate": 2.4197536361733792e-06, "loss": 0.0599, "step": 756 }, { "epoch": 1.3469750889679715, "grad_norm": 1.1977328149862356, "learning_rate": 2.407793271475495e-06, "loss": 0.0334, "step": 757 }, { "epoch": 1.3487544483985765, "grad_norm": 1.3909866005431668, "learning_rate": 2.3958531573113223e-06, "loss": 0.0528, "step": 758 }, { "epoch": 1.3505338078291815, "grad_norm": 1.4244544483594586, "learning_rate": 2.3839333869580243e-06, "loss": 0.0425, "step": 759 }, { "epoch": 1.3523131672597866, "grad_norm": 1.6973049135550342, "learning_rate": 2.372034053533835e-06, "loss": 0.0421, "step": 760 }, { "epoch": 1.3540925266903914, "grad_norm": 1.2874358350573702, "learning_rate": 2.360155249997334e-06, "loss": 0.0442, "step": 761 }, { "epoch": 1.3558718861209964, "grad_norm": 1.2434951030469052, "learning_rate": 2.348297069146715e-06, "loss": 0.0319, "step": 762 }, { "epoch": 1.3576512455516014, "grad_norm": 1.1665491752588282, "learning_rate": 2.3364596036190706e-06, "loss": 0.0359, "step": 763 }, { "epoch": 1.3594306049822065, "grad_norm": 1.461756045564582, "learning_rate": 2.3246429458896637e-06, "loss": 0.0459, "step": 764 }, { "epoch": 1.3612099644128115, "grad_norm": 1.6343361792210032, "learning_rate": 2.312847188271203e-06, "loss": 0.0524, "step": 765 }, { "epoch": 1.3629893238434163, "grad_norm": 1.3548274991759919, "learning_rate": 2.301072422913123e-06, "loss": 0.0321, "step": 766 }, { "epoch": 1.3647686832740213, "grad_norm": 1.4953383228875219, "learning_rate": 2.2893187418008666e-06, "loss": 0.0389, "step": 767 }, { "epoch": 1.3665480427046264, "grad_norm": 1.5567290941348062, "learning_rate": 2.2775862367551642e-06, "loss": 0.0447, "step": 768 }, { "epoch": 1.3683274021352312, "grad_norm": 1.5235519266079038, "learning_rate": 2.265874999431318e-06, "loss": 0.0428, "step": 769 }, { "epoch": 1.3701067615658362, "grad_norm": 1.4150161766049616, "learning_rate": 2.254185121318484e-06, "loss": 0.0329, "step": 770 }, { "epoch": 1.3718861209964412, "grad_norm": 1.4845114426548112, "learning_rate": 2.2425166937389596e-06, "loss": 0.0421, "step": 771 }, { "epoch": 1.3736654804270463, "grad_norm": 1.2681985823538768, "learning_rate": 2.2308698078474645e-06, "loss": 0.0418, "step": 772 }, { "epoch": 1.3754448398576513, "grad_norm": 1.3516636841672425, "learning_rate": 2.219244554630438e-06, "loss": 0.0455, "step": 773 }, { "epoch": 1.3772241992882561, "grad_norm": 1.3300362415228972, "learning_rate": 2.207641024905322e-06, "loss": 0.0386, "step": 774 }, { "epoch": 1.3790035587188612, "grad_norm": 1.6374240147172454, "learning_rate": 2.1960593093198508e-06, "loss": 0.0413, "step": 775 }, { "epoch": 1.3807829181494662, "grad_norm": 1.5008763316056877, "learning_rate": 2.184499498351347e-06, "loss": 0.042, "step": 776 }, { "epoch": 1.3825622775800712, "grad_norm": 1.3756810833501245, "learning_rate": 2.172961682306011e-06, "loss": 0.0454, "step": 777 }, { "epoch": 1.3843416370106763, "grad_norm": 1.7291217702441377, "learning_rate": 2.1614459513182173e-06, "loss": 0.0467, "step": 778 }, { "epoch": 1.386120996441281, "grad_norm": 1.4328660208411663, "learning_rate": 2.149952395349813e-06, "loss": 0.0353, "step": 779 }, { "epoch": 1.387900355871886, "grad_norm": 1.7034968289059418, "learning_rate": 2.1384811041894055e-06, "loss": 0.0402, "step": 780 }, { "epoch": 1.3896797153024911, "grad_norm": 1.5883094217308575, "learning_rate": 2.1270321674516736e-06, "loss": 0.0434, "step": 781 }, { "epoch": 1.3914590747330962, "grad_norm": 1.5960922211126674, "learning_rate": 2.1156056745766593e-06, "loss": 0.0443, "step": 782 }, { "epoch": 1.3932384341637012, "grad_norm": 1.492197946369974, "learning_rate": 2.104201714829074e-06, "loss": 0.045, "step": 783 }, { "epoch": 1.395017793594306, "grad_norm": 1.5295778778391644, "learning_rate": 2.0928203772975917e-06, "loss": 0.0459, "step": 784 }, { "epoch": 1.396797153024911, "grad_norm": 1.3529469158844694, "learning_rate": 2.081461750894166e-06, "loss": 0.0382, "step": 785 }, { "epoch": 1.398576512455516, "grad_norm": 1.6374373924539984, "learning_rate": 2.070125924353328e-06, "loss": 0.0413, "step": 786 }, { "epoch": 1.4003558718861209, "grad_norm": 1.2104674520430558, "learning_rate": 2.058812986231493e-06, "loss": 0.0402, "step": 787 }, { "epoch": 1.402135231316726, "grad_norm": 1.6742851795060303, "learning_rate": 2.0475230249062727e-06, "loss": 0.0696, "step": 788 }, { "epoch": 1.403914590747331, "grad_norm": 1.6002299302582397, "learning_rate": 2.0362561285757766e-06, "loss": 0.0407, "step": 789 }, { "epoch": 1.405693950177936, "grad_norm": 1.5373713699855578, "learning_rate": 2.0250123852579347e-06, "loss": 0.0432, "step": 790 }, { "epoch": 1.407473309608541, "grad_norm": 1.032138062275992, "learning_rate": 2.013791882789801e-06, "loss": 0.0243, "step": 791 }, { "epoch": 1.4092526690391458, "grad_norm": 1.274112710997587, "learning_rate": 2.0025947088268714e-06, "loss": 0.029, "step": 792 }, { "epoch": 1.4110320284697508, "grad_norm": 1.324044563801138, "learning_rate": 1.9914209508423943e-06, "loss": 0.039, "step": 793 }, { "epoch": 1.4128113879003559, "grad_norm": 1.2108667213796944, "learning_rate": 1.9802706961266936e-06, "loss": 0.0345, "step": 794 }, { "epoch": 1.414590747330961, "grad_norm": 1.801072446974817, "learning_rate": 1.969144031786483e-06, "loss": 0.0562, "step": 795 }, { "epoch": 1.416370106761566, "grad_norm": 1.477022032529501, "learning_rate": 1.958041044744186e-06, "loss": 0.0542, "step": 796 }, { "epoch": 1.4181494661921707, "grad_norm": 1.1339480073256372, "learning_rate": 1.94696182173726e-06, "loss": 0.0287, "step": 797 }, { "epoch": 1.4199288256227758, "grad_norm": 1.4188145835215062, "learning_rate": 1.9359064493175077e-06, "loss": 0.045, "step": 798 }, { "epoch": 1.4217081850533808, "grad_norm": 1.5010449192896431, "learning_rate": 1.9248750138504176e-06, "loss": 0.0438, "step": 799 }, { "epoch": 1.4234875444839858, "grad_norm": 1.6148218770935596, "learning_rate": 1.9138676015144765e-06, "loss": 0.0392, "step": 800 }, { "epoch": 1.4234875444839858, "eval_loss": 0.13553155958652496, "eval_runtime": 1.9004, "eval_samples_per_second": 24.206, "eval_steps_per_second": 6.315, "step": 800 }, { "epoch": 1.4252669039145909, "grad_norm": 1.6045264551089624, "learning_rate": 1.9028842983005036e-06, "loss": 0.046, "step": 801 }, { "epoch": 1.4270462633451957, "grad_norm": 1.5663550316366912, "learning_rate": 1.8919251900109697e-06, "loss": 0.05, "step": 802 }, { "epoch": 1.4288256227758007, "grad_norm": 1.3392919230009432, "learning_rate": 1.8809903622593395e-06, "loss": 0.0349, "step": 803 }, { "epoch": 1.4306049822064058, "grad_norm": 1.0197400883600565, "learning_rate": 1.870079900469392e-06, "loss": 0.0253, "step": 804 }, { "epoch": 1.4323843416370106, "grad_norm": 1.3083944846556463, "learning_rate": 1.8591938898745593e-06, "loss": 0.0359, "step": 805 }, { "epoch": 1.4341637010676156, "grad_norm": 1.6177889716123028, "learning_rate": 1.8483324155172594e-06, "loss": 0.0431, "step": 806 }, { "epoch": 1.4359430604982206, "grad_norm": 1.2746615320017745, "learning_rate": 1.837495562248226e-06, "loss": 0.0391, "step": 807 }, { "epoch": 1.4377224199288257, "grad_norm": 1.4729114621843828, "learning_rate": 1.8266834147258577e-06, "loss": 0.04, "step": 808 }, { "epoch": 1.4395017793594307, "grad_norm": 1.4598157425135223, "learning_rate": 1.8158960574155455e-06, "loss": 0.0376, "step": 809 }, { "epoch": 1.4412811387900355, "grad_norm": 1.720682633776246, "learning_rate": 1.8051335745890196e-06, "loss": 0.0441, "step": 810 }, { "epoch": 1.4430604982206405, "grad_norm": 1.4838556201963446, "learning_rate": 1.7943960503236856e-06, "loss": 0.0547, "step": 811 }, { "epoch": 1.4448398576512456, "grad_norm": 1.3314051570537497, "learning_rate": 1.7836835685019732e-06, "loss": 0.0395, "step": 812 }, { "epoch": 1.4466192170818506, "grad_norm": 2.6112852624779865, "learning_rate": 1.7729962128106787e-06, "loss": 0.0474, "step": 813 }, { "epoch": 1.4483985765124556, "grad_norm": 1.2722427860827188, "learning_rate": 1.7623340667403089e-06, "loss": 0.0349, "step": 814 }, { "epoch": 1.4501779359430604, "grad_norm": 1.9655154853037475, "learning_rate": 1.7516972135844352e-06, "loss": 0.066, "step": 815 }, { "epoch": 1.4519572953736655, "grad_norm": 1.6141416011186334, "learning_rate": 1.741085736439031e-06, "loss": 0.0404, "step": 816 }, { "epoch": 1.4537366548042705, "grad_norm": 1.5342062022939358, "learning_rate": 1.730499718201838e-06, "loss": 0.0467, "step": 817 }, { "epoch": 1.4555160142348753, "grad_norm": 1.4924465392368111, "learning_rate": 1.7199392415717064e-06, "loss": 0.0401, "step": 818 }, { "epoch": 1.4572953736654806, "grad_norm": 1.6401729800161722, "learning_rate": 1.7094043890479557e-06, "loss": 0.0474, "step": 819 }, { "epoch": 1.4590747330960854, "grad_norm": 1.285145366368902, "learning_rate": 1.698895242929725e-06, "loss": 0.0409, "step": 820 }, { "epoch": 1.4608540925266904, "grad_norm": 1.4781898012303594, "learning_rate": 1.6884118853153358e-06, "loss": 0.0327, "step": 821 }, { "epoch": 1.4626334519572954, "grad_norm": 1.5716182633825058, "learning_rate": 1.6779543981016478e-06, "loss": 0.0427, "step": 822 }, { "epoch": 1.4644128113879002, "grad_norm": 1.4583203708664216, "learning_rate": 1.6675228629834133e-06, "loss": 0.0395, "step": 823 }, { "epoch": 1.4661921708185053, "grad_norm": 1.3613801155561407, "learning_rate": 1.657117361452651e-06, "loss": 0.0412, "step": 824 }, { "epoch": 1.4679715302491103, "grad_norm": 1.3332552202413799, "learning_rate": 1.6467379747980011e-06, "loss": 0.0399, "step": 825 }, { "epoch": 1.4697508896797153, "grad_norm": 1.2983964530965444, "learning_rate": 1.6363847841040914e-06, "loss": 0.0345, "step": 826 }, { "epoch": 1.4715302491103204, "grad_norm": 1.4073663064364275, "learning_rate": 1.626057870250906e-06, "loss": 0.031, "step": 827 }, { "epoch": 1.4733096085409252, "grad_norm": 1.2993154536578022, "learning_rate": 1.6157573139131527e-06, "loss": 0.0295, "step": 828 }, { "epoch": 1.4750889679715302, "grad_norm": 1.2768133277723346, "learning_rate": 1.605483195559628e-06, "loss": 0.0441, "step": 829 }, { "epoch": 1.4768683274021353, "grad_norm": 1.5200745193794887, "learning_rate": 1.5952355954525966e-06, "loss": 0.0412, "step": 830 }, { "epoch": 1.4786476868327403, "grad_norm": 1.5704689191771652, "learning_rate": 1.5850145936471607e-06, "loss": 0.0382, "step": 831 }, { "epoch": 1.4804270462633453, "grad_norm": 1.628460117627646, "learning_rate": 1.5748202699906335e-06, "loss": 0.0475, "step": 832 }, { "epoch": 1.4822064056939501, "grad_norm": 0.9182008751415289, "learning_rate": 1.5646527041219128e-06, "loss": 0.0256, "step": 833 }, { "epoch": 1.4839857651245552, "grad_norm": 1.439259606190967, "learning_rate": 1.5545119754708682e-06, "loss": 0.0388, "step": 834 }, { "epoch": 1.4857651245551602, "grad_norm": 1.571297799533974, "learning_rate": 1.544398163257711e-06, "loss": 0.0423, "step": 835 }, { "epoch": 1.487544483985765, "grad_norm": 1.3212867468780074, "learning_rate": 1.5343113464923808e-06, "loss": 0.0397, "step": 836 }, { "epoch": 1.48932384341637, "grad_norm": 1.0278976202774528, "learning_rate": 1.524251603973927e-06, "loss": 0.0278, "step": 837 }, { "epoch": 1.491103202846975, "grad_norm": 1.2427517728362785, "learning_rate": 1.5142190142898883e-06, "loss": 0.0396, "step": 838 }, { "epoch": 1.49288256227758, "grad_norm": 1.4576960725750074, "learning_rate": 1.5042136558156883e-06, "loss": 0.0453, "step": 839 }, { "epoch": 1.4946619217081851, "grad_norm": 1.5951738011737673, "learning_rate": 1.4942356067140162e-06, "loss": 0.0475, "step": 840 }, { "epoch": 1.49644128113879, "grad_norm": 1.0994539356718618, "learning_rate": 1.4842849449342195e-06, "loss": 0.0354, "step": 841 }, { "epoch": 1.498220640569395, "grad_norm": 2.1389617046432385, "learning_rate": 1.4743617482116896e-06, "loss": 0.0436, "step": 842 }, { "epoch": 1.5, "grad_norm": 1.7674007226468915, "learning_rate": 1.4644660940672628e-06, "loss": 0.0347, "step": 843 }, { "epoch": 1.501779359430605, "grad_norm": 1.3372047597755599, "learning_rate": 1.454598059806609e-06, "loss": 0.0438, "step": 844 }, { "epoch": 1.50355871886121, "grad_norm": 1.8690478799924626, "learning_rate": 1.4447577225196296e-06, "loss": 0.048, "step": 845 }, { "epoch": 1.5053380782918149, "grad_norm": 1.5607126430748184, "learning_rate": 1.4349451590798564e-06, "loss": 0.0462, "step": 846 }, { "epoch": 1.50711743772242, "grad_norm": 1.3630895620251622, "learning_rate": 1.4251604461438444e-06, "loss": 0.0403, "step": 847 }, { "epoch": 1.508896797153025, "grad_norm": 1.1778397710106534, "learning_rate": 1.4154036601505834e-06, "loss": 0.0295, "step": 848 }, { "epoch": 1.5106761565836297, "grad_norm": 1.54919239226997, "learning_rate": 1.4056748773208933e-06, "loss": 0.0418, "step": 849 }, { "epoch": 1.512455516014235, "grad_norm": 1.4460624576031404, "learning_rate": 1.3959741736568339e-06, "loss": 0.0413, "step": 850 }, { "epoch": 1.5142348754448398, "grad_norm": 1.2016844961666362, "learning_rate": 1.3863016249411027e-06, "loss": 0.0368, "step": 851 }, { "epoch": 1.5160142348754448, "grad_norm": 1.3648649143091593, "learning_rate": 1.376657306736453e-06, "loss": 0.0406, "step": 852 }, { "epoch": 1.5177935943060499, "grad_norm": 1.569101439175862, "learning_rate": 1.3670412943850975e-06, "loss": 0.0508, "step": 853 }, { "epoch": 1.5195729537366547, "grad_norm": 1.0369296172387779, "learning_rate": 1.3574536630081208e-06, "loss": 0.0236, "step": 854 }, { "epoch": 1.52135231316726, "grad_norm": 1.3753090570781858, "learning_rate": 1.347894487504896e-06, "loss": 0.0393, "step": 855 }, { "epoch": 1.5231316725978647, "grad_norm": 1.4364217227404474, "learning_rate": 1.3383638425524909e-06, "loss": 0.0372, "step": 856 }, { "epoch": 1.5249110320284698, "grad_norm": 1.2082339988983573, "learning_rate": 1.3288618026050943e-06, "loss": 0.0359, "step": 857 }, { "epoch": 1.5266903914590748, "grad_norm": 1.7697427845149132, "learning_rate": 1.31938844189343e-06, "loss": 0.0411, "step": 858 }, { "epoch": 1.5284697508896796, "grad_norm": 1.5496943208028782, "learning_rate": 1.3099438344241777e-06, "loss": 0.0431, "step": 859 }, { "epoch": 1.5302491103202847, "grad_norm": 1.3876393495649573, "learning_rate": 1.3005280539793908e-06, "loss": 0.038, "step": 860 }, { "epoch": 1.5320284697508897, "grad_norm": 1.5283606760759147, "learning_rate": 1.2911411741159273e-06, "loss": 0.0341, "step": 861 }, { "epoch": 1.5338078291814945, "grad_norm": 1.6602618583439275, "learning_rate": 1.2817832681648712e-06, "loss": 0.0623, "step": 862 }, { "epoch": 1.5355871886120998, "grad_norm": 1.5411359409223566, "learning_rate": 1.2724544092309581e-06, "loss": 0.042, "step": 863 }, { "epoch": 1.5373665480427046, "grad_norm": 1.655744290136215, "learning_rate": 1.2631546701920073e-06, "loss": 0.0532, "step": 864 }, { "epoch": 1.5391459074733096, "grad_norm": 1.3563301578406313, "learning_rate": 1.2538841236983519e-06, "loss": 0.0399, "step": 865 }, { "epoch": 1.5409252669039146, "grad_norm": 1.39309908268714, "learning_rate": 1.244642842172266e-06, "loss": 0.0328, "step": 866 }, { "epoch": 1.5427046263345194, "grad_norm": 1.4010073149803954, "learning_rate": 1.2354308978074088e-06, "loss": 0.0407, "step": 867 }, { "epoch": 1.5444839857651247, "grad_norm": 1.349733650119393, "learning_rate": 1.2262483625682514e-06, "loss": 0.0426, "step": 868 }, { "epoch": 1.5462633451957295, "grad_norm": 1.5329016214906686, "learning_rate": 1.2170953081895214e-06, "loss": 0.0418, "step": 869 }, { "epoch": 1.5480427046263345, "grad_norm": 1.6663962662447735, "learning_rate": 1.2079718061756369e-06, "loss": 0.0359, "step": 870 }, { "epoch": 1.5498220640569396, "grad_norm": 1.3192206265069957, "learning_rate": 1.1988779278001517e-06, "loss": 0.0375, "step": 871 }, { "epoch": 1.5516014234875444, "grad_norm": 1.3072180362983923, "learning_rate": 1.1898137441051982e-06, "loss": 0.04, "step": 872 }, { "epoch": 1.5533807829181496, "grad_norm": 1.4269989776940648, "learning_rate": 1.1807793259009282e-06, "loss": 0.0453, "step": 873 }, { "epoch": 1.5551601423487544, "grad_norm": 1.2378646218489764, "learning_rate": 1.1717747437649657e-06, "loss": 0.0341, "step": 874 }, { "epoch": 1.5569395017793595, "grad_norm": 1.7443064937890536, "learning_rate": 1.1628000680418533e-06, "loss": 0.0459, "step": 875 }, { "epoch": 1.5587188612099645, "grad_norm": 1.6102197030681638, "learning_rate": 1.1538553688425002e-06, "loss": 0.0466, "step": 876 }, { "epoch": 1.5604982206405693, "grad_norm": 1.397834534714242, "learning_rate": 1.14494071604364e-06, "loss": 0.0381, "step": 877 }, { "epoch": 1.5622775800711743, "grad_norm": 1.4037906893710472, "learning_rate": 1.1360561792872754e-06, "loss": 0.0408, "step": 878 }, { "epoch": 1.5640569395017794, "grad_norm": 1.7743651590701754, "learning_rate": 1.127201827980145e-06, "loss": 0.0624, "step": 879 }, { "epoch": 1.5658362989323842, "grad_norm": 1.164655492700238, "learning_rate": 1.1183777312931748e-06, "loss": 0.0257, "step": 880 }, { "epoch": 1.5676156583629894, "grad_norm": 1.3555269499971025, "learning_rate": 1.1095839581609407e-06, "loss": 0.0432, "step": 881 }, { "epoch": 1.5693950177935942, "grad_norm": 1.7212104120336336, "learning_rate": 1.1008205772811248e-06, "loss": 0.0554, "step": 882 }, { "epoch": 1.5711743772241993, "grad_norm": 1.6447106126467486, "learning_rate": 1.0920876571139843e-06, "loss": 0.0473, "step": 883 }, { "epoch": 1.5729537366548043, "grad_norm": 1.3228366873077255, "learning_rate": 1.0833852658818167e-06, "loss": 0.0423, "step": 884 }, { "epoch": 1.5747330960854091, "grad_norm": 1.440481606522488, "learning_rate": 1.0747134715684221e-06, "loss": 0.0306, "step": 885 }, { "epoch": 1.5765124555160144, "grad_norm": 1.2889642040008586, "learning_rate": 1.0660723419185776e-06, "loss": 0.0354, "step": 886 }, { "epoch": 1.5782918149466192, "grad_norm": 1.5231752699982202, "learning_rate": 1.0574619444375017e-06, "loss": 0.0405, "step": 887 }, { "epoch": 1.5800711743772242, "grad_norm": 1.3381006034070078, "learning_rate": 1.0488823463903341e-06, "loss": 0.0355, "step": 888 }, { "epoch": 1.5818505338078293, "grad_norm": 1.470805921997831, "learning_rate": 1.0403336148016053e-06, "loss": 0.0484, "step": 889 }, { "epoch": 1.583629893238434, "grad_norm": 1.3067202932834707, "learning_rate": 1.0318158164547159e-06, "loss": 0.0389, "step": 890 }, { "epoch": 1.585409252669039, "grad_norm": 1.3902165526739316, "learning_rate": 1.0233290178914096e-06, "loss": 0.0343, "step": 891 }, { "epoch": 1.5871886120996441, "grad_norm": 1.0650989482283322, "learning_rate": 1.014873285411262e-06, "loss": 0.0305, "step": 892 }, { "epoch": 1.5889679715302492, "grad_norm": 1.5776228251020936, "learning_rate": 1.006448685071154e-06, "loss": 0.049, "step": 893 }, { "epoch": 1.5907473309608542, "grad_norm": 1.6847511370413242, "learning_rate": 9.980552826847635e-07, "loss": 0.0436, "step": 894 }, { "epoch": 1.592526690391459, "grad_norm": 1.5267170572384654, "learning_rate": 9.896931438220453e-07, "loss": 0.0453, "step": 895 }, { "epoch": 1.594306049822064, "grad_norm": 1.6853099289986873, "learning_rate": 9.813623338087181e-07, "loss": 0.0451, "step": 896 }, { "epoch": 1.596085409252669, "grad_norm": 1.8621945073557655, "learning_rate": 9.730629177257623e-07, "loss": 0.0532, "step": 897 }, { "epoch": 1.5978647686832739, "grad_norm": 1.1570214892546553, "learning_rate": 9.64794960408903e-07, "loss": 0.029, "step": 898 }, { "epoch": 1.5996441281138791, "grad_norm": 1.570142854946665, "learning_rate": 9.565585264481092e-07, "loss": 0.0467, "step": 899 }, { "epoch": 1.601423487544484, "grad_norm": 1.2747258900459593, "learning_rate": 9.483536801870835e-07, "loss": 0.0357, "step": 900 }, { "epoch": 1.603202846975089, "grad_norm": 1.768207167854998, "learning_rate": 9.401804857227648e-07, "loss": 0.0376, "step": 901 }, { "epoch": 1.604982206405694, "grad_norm": 1.3816828381796138, "learning_rate": 9.320390069048258e-07, "loss": 0.0419, "step": 902 }, { "epoch": 1.6067615658362988, "grad_norm": 1.3282762248575404, "learning_rate": 9.239293073351735e-07, "loss": 0.0368, "step": 903 }, { "epoch": 1.608540925266904, "grad_norm": 1.5077127417736853, "learning_rate": 9.158514503674543e-07, "loss": 0.0366, "step": 904 }, { "epoch": 1.6103202846975089, "grad_norm": 1.1844435993103961, "learning_rate": 9.078054991065532e-07, "loss": 0.0302, "step": 905 }, { "epoch": 1.612099644128114, "grad_norm": 1.5361179282683537, "learning_rate": 8.997915164081095e-07, "loss": 0.0516, "step": 906 }, { "epoch": 1.613879003558719, "grad_norm": 1.5967997549831903, "learning_rate": 8.918095648780195e-07, "loss": 0.0512, "step": 907 }, { "epoch": 1.6156583629893237, "grad_norm": 1.488194638702583, "learning_rate": 8.838597068719518e-07, "loss": 0.0357, "step": 908 }, { "epoch": 1.6174377224199288, "grad_norm": 1.751026076647405, "learning_rate": 8.75942004494853e-07, "loss": 0.0446, "step": 909 }, { "epoch": 1.6192170818505338, "grad_norm": 1.8228721992338053, "learning_rate": 8.680565196004704e-07, "loss": 0.0567, "step": 910 }, { "epoch": 1.6209964412811388, "grad_norm": 1.328560267532672, "learning_rate": 8.602033137908666e-07, "loss": 0.0361, "step": 911 }, { "epoch": 1.6227758007117439, "grad_norm": 1.3252873462126447, "learning_rate": 8.523824484159348e-07, "loss": 0.0302, "step": 912 }, { "epoch": 1.6245551601423487, "grad_norm": 1.3494694773324976, "learning_rate": 8.445939845729245e-07, "loss": 0.0342, "step": 913 }, { "epoch": 1.6263345195729537, "grad_norm": 1.1524568157816502, "learning_rate": 8.368379831059592e-07, "loss": 0.0378, "step": 914 }, { "epoch": 1.6281138790035588, "grad_norm": 1.2377124974985947, "learning_rate": 8.29114504605566e-07, "loss": 0.0381, "step": 915 }, { "epoch": 1.6298932384341636, "grad_norm": 1.2509629067651231, "learning_rate": 8.21423609408199e-07, "loss": 0.042, "step": 916 }, { "epoch": 1.6316725978647688, "grad_norm": 1.347754497325353, "learning_rate": 8.137653575957666e-07, "loss": 0.0325, "step": 917 }, { "epoch": 1.6334519572953736, "grad_norm": 1.3991680648985156, "learning_rate": 8.061398089951678e-07, "loss": 0.0474, "step": 918 }, { "epoch": 1.6352313167259787, "grad_norm": 1.3702972693459372, "learning_rate": 7.985470231778203e-07, "loss": 0.0375, "step": 919 }, { "epoch": 1.6370106761565837, "grad_norm": 1.4493468169237673, "learning_rate": 7.909870594591951e-07, "loss": 0.0364, "step": 920 }, { "epoch": 1.6387900355871885, "grad_norm": 1.6728531935037791, "learning_rate": 7.834599768983553e-07, "loss": 0.0459, "step": 921 }, { "epoch": 1.6405693950177938, "grad_norm": 1.2858460794736846, "learning_rate": 7.759658342974951e-07, "loss": 0.0301, "step": 922 }, { "epoch": 1.6423487544483986, "grad_norm": 1.43482025502732, "learning_rate": 7.685046902014747e-07, "loss": 0.0448, "step": 923 }, { "epoch": 1.6441281138790036, "grad_norm": 1.5661414126628719, "learning_rate": 7.61076602897371e-07, "loss": 0.0405, "step": 924 }, { "epoch": 1.6459074733096086, "grad_norm": 0.9422703119014241, "learning_rate": 7.536816304140177e-07, "loss": 0.0235, "step": 925 }, { "epoch": 1.6476868327402134, "grad_norm": 1.2618684227905703, "learning_rate": 7.46319830521553e-07, "loss": 0.0272, "step": 926 }, { "epoch": 1.6494661921708185, "grad_norm": 1.2743567720453082, "learning_rate": 7.389912607309662e-07, "loss": 0.0382, "step": 927 }, { "epoch": 1.6512455516014235, "grad_norm": 1.5516278790516005, "learning_rate": 7.316959782936516e-07, "loss": 0.0439, "step": 928 }, { "epoch": 1.6530249110320283, "grad_norm": 1.047201487462478, "learning_rate": 7.244340402009608e-07, "loss": 0.0279, "step": 929 }, { "epoch": 1.6548042704626336, "grad_norm": 1.9607395531967122, "learning_rate": 7.172055031837572e-07, "loss": 0.0558, "step": 930 }, { "epoch": 1.6565836298932384, "grad_norm": 1.2367918261541342, "learning_rate": 7.100104237119676e-07, "loss": 0.0358, "step": 931 }, { "epoch": 1.6583629893238434, "grad_norm": 1.2604371813311994, "learning_rate": 7.028488579941506e-07, "loss": 0.0435, "step": 932 }, { "epoch": 1.6601423487544484, "grad_norm": 1.1764718538407726, "learning_rate": 6.957208619770505e-07, "loss": 0.0314, "step": 933 }, { "epoch": 1.6619217081850532, "grad_norm": 1.917288342025507, "learning_rate": 6.886264913451635e-07, "loss": 0.0522, "step": 934 }, { "epoch": 1.6637010676156585, "grad_norm": 1.4495493967970827, "learning_rate": 6.815658015203014e-07, "loss": 0.0404, "step": 935 }, { "epoch": 1.6654804270462633, "grad_norm": 1.567577585358252, "learning_rate": 6.745388476611553e-07, "loss": 0.0494, "step": 936 }, { "epoch": 1.6672597864768683, "grad_norm": 1.3496135255298065, "learning_rate": 6.67545684662873e-07, "loss": 0.0392, "step": 937 }, { "epoch": 1.6690391459074734, "grad_norm": 1.2624866082491015, "learning_rate": 6.605863671566221e-07, "loss": 0.0397, "step": 938 }, { "epoch": 1.6708185053380782, "grad_norm": 1.4932679808167775, "learning_rate": 6.536609495091695e-07, "loss": 0.0357, "step": 939 }, { "epoch": 1.6725978647686834, "grad_norm": 2.0994499945272773, "learning_rate": 6.467694858224488e-07, "loss": 0.0605, "step": 940 }, { "epoch": 1.6743772241992882, "grad_norm": 1.2202425474352014, "learning_rate": 6.399120299331468e-07, "loss": 0.028, "step": 941 }, { "epoch": 1.6761565836298933, "grad_norm": 1.2547013509656346, "learning_rate": 6.330886354122768e-07, "loss": 0.0351, "step": 942 }, { "epoch": 1.6779359430604983, "grad_norm": 1.2243512640917917, "learning_rate": 6.262993555647617e-07, "loss": 0.0386, "step": 943 }, { "epoch": 1.6797153024911031, "grad_norm": 1.0270397840602432, "learning_rate": 6.1954424342902e-07, "loss": 0.0304, "step": 944 }, { "epoch": 1.6814946619217082, "grad_norm": 2.0627083576047265, "learning_rate": 6.128233517765448e-07, "loss": 0.0532, "step": 945 }, { "epoch": 1.6832740213523132, "grad_norm": 1.7232388309851174, "learning_rate": 6.061367331114992e-07, "loss": 0.041, "step": 946 }, { "epoch": 1.685053380782918, "grad_norm": 1.73200461230147, "learning_rate": 5.994844396703025e-07, "loss": 0.0543, "step": 947 }, { "epoch": 1.6868327402135233, "grad_norm": 1.4191648146634566, "learning_rate": 5.928665234212233e-07, "loss": 0.033, "step": 948 }, { "epoch": 1.688612099644128, "grad_norm": 1.769010986702388, "learning_rate": 5.862830360639698e-07, "loss": 0.0416, "step": 949 }, { "epoch": 1.690391459074733, "grad_norm": 1.2365859730297384, "learning_rate": 5.797340290292907e-07, "loss": 0.0327, "step": 950 }, { "epoch": 1.6921708185053381, "grad_norm": 1.5944536984646245, "learning_rate": 5.732195534785723e-07, "loss": 0.0425, "step": 951 }, { "epoch": 1.693950177935943, "grad_norm": 1.0978948243138542, "learning_rate": 5.667396603034369e-07, "loss": 0.0281, "step": 952 }, { "epoch": 1.6957295373665482, "grad_norm": 1.6489451622459412, "learning_rate": 5.602944001253486e-07, "loss": 0.0471, "step": 953 }, { "epoch": 1.697508896797153, "grad_norm": 1.5034551647306265, "learning_rate": 5.538838232952104e-07, "loss": 0.0443, "step": 954 }, { "epoch": 1.699288256227758, "grad_norm": 1.7753753659867093, "learning_rate": 5.475079798929816e-07, "loss": 0.0561, "step": 955 }, { "epoch": 1.701067615658363, "grad_norm": 1.4311690621571131, "learning_rate": 5.411669197272795e-07, "loss": 0.0356, "step": 956 }, { "epoch": 1.7028469750889679, "grad_norm": 1.2084248674878706, "learning_rate": 5.348606923349903e-07, "loss": 0.0314, "step": 957 }, { "epoch": 1.704626334519573, "grad_norm": 1.4653829807321628, "learning_rate": 5.285893469808855e-07, "loss": 0.0342, "step": 958 }, { "epoch": 1.706405693950178, "grad_norm": 1.4861717769330527, "learning_rate": 5.223529326572352e-07, "loss": 0.0396, "step": 959 }, { "epoch": 1.708185053380783, "grad_norm": 1.6382009093594376, "learning_rate": 5.161514980834232e-07, "loss": 0.053, "step": 960 }, { "epoch": 1.709964412811388, "grad_norm": 1.3917549305779011, "learning_rate": 5.099850917055709e-07, "loss": 0.0416, "step": 961 }, { "epoch": 1.7117437722419928, "grad_norm": 1.5941820214039433, "learning_rate": 5.038537616961559e-07, "loss": 0.0329, "step": 962 }, { "epoch": 1.7135231316725978, "grad_norm": 1.3505572045437793, "learning_rate": 4.977575559536358e-07, "loss": 0.0393, "step": 963 }, { "epoch": 1.7153024911032029, "grad_norm": 1.4906267614713944, "learning_rate": 4.916965221020753e-07, "loss": 0.0353, "step": 964 }, { "epoch": 1.7170818505338077, "grad_norm": 1.1664280571560683, "learning_rate": 4.856707074907729e-07, "loss": 0.0297, "step": 965 }, { "epoch": 1.718861209964413, "grad_norm": 1.5848671896442588, "learning_rate": 4.796801591938922e-07, "loss": 0.0416, "step": 966 }, { "epoch": 1.7206405693950177, "grad_norm": 1.6248722133837754, "learning_rate": 4.737249240100911e-07, "loss": 0.0424, "step": 967 }, { "epoch": 1.7224199288256228, "grad_norm": 1.5499587224919618, "learning_rate": 4.6780504846216155e-07, "loss": 0.0331, "step": 968 }, { "epoch": 1.7241992882562278, "grad_norm": 1.3626836640644866, "learning_rate": 4.619205787966613e-07, "loss": 0.0373, "step": 969 }, { "epoch": 1.7259786476868326, "grad_norm": 1.054552356505648, "learning_rate": 4.560715609835548e-07, "loss": 0.0313, "step": 970 }, { "epoch": 1.7277580071174379, "grad_norm": 1.32690819345873, "learning_rate": 4.5025804071585464e-07, "loss": 0.0363, "step": 971 }, { "epoch": 1.7295373665480427, "grad_norm": 1.8155269019307265, "learning_rate": 4.4448006340926163e-07, "loss": 0.0564, "step": 972 }, { "epoch": 1.7313167259786477, "grad_norm": 1.5408465686558235, "learning_rate": 4.3873767420181344e-07, "loss": 0.0362, "step": 973 }, { "epoch": 1.7330960854092528, "grad_norm": 1.6773239432537133, "learning_rate": 4.3303091795353024e-07, "loss": 0.0455, "step": 974 }, { "epoch": 1.7348754448398576, "grad_norm": 1.7322358515920886, "learning_rate": 4.2735983924606596e-07, "loss": 0.0408, "step": 975 }, { "epoch": 1.7366548042704626, "grad_norm": 1.4106575440753382, "learning_rate": 4.2172448238235464e-07, "loss": 0.0305, "step": 976 }, { "epoch": 1.7384341637010676, "grad_norm": 1.3733644626019057, "learning_rate": 4.161248913862731e-07, "loss": 0.0301, "step": 977 }, { "epoch": 1.7402135231316724, "grad_norm": 1.3994999537006558, "learning_rate": 4.1056111000228937e-07, "loss": 0.0361, "step": 978 }, { "epoch": 1.7419928825622777, "grad_norm": 1.8229650102075292, "learning_rate": 4.0503318169512417e-07, "loss": 0.041, "step": 979 }, { "epoch": 1.7437722419928825, "grad_norm": 1.1810685268091181, "learning_rate": 3.9954114964941336e-07, "loss": 0.0304, "step": 980 }, { "epoch": 1.7455516014234875, "grad_norm": 1.3189393973510206, "learning_rate": 3.9408505676936327e-07, "loss": 0.0338, "step": 981 }, { "epoch": 1.7473309608540926, "grad_norm": 1.6684414853575773, "learning_rate": 3.886649456784253e-07, "loss": 0.047, "step": 982 }, { "epoch": 1.7491103202846974, "grad_norm": 1.6906975429443252, "learning_rate": 3.8328085871895624e-07, "loss": 0.0558, "step": 983 }, { "epoch": 1.7508896797153026, "grad_norm": 1.3184724212662704, "learning_rate": 3.779328379518898e-07, "loss": 0.0345, "step": 984 }, { "epoch": 1.7526690391459074, "grad_norm": 1.4655669673179201, "learning_rate": 3.7262092515640556e-07, "loss": 0.0391, "step": 985 }, { "epoch": 1.7544483985765125, "grad_norm": 1.1582462516823915, "learning_rate": 3.673451618296081e-07, "loss": 0.0284, "step": 986 }, { "epoch": 1.7562277580071175, "grad_norm": 1.2884297196498398, "learning_rate": 3.621055891861963e-07, "loss": 0.0389, "step": 987 }, { "epoch": 1.7580071174377223, "grad_norm": 1.5602060082103053, "learning_rate": 3.56902248158148e-07, "loss": 0.0373, "step": 988 }, { "epoch": 1.7597864768683276, "grad_norm": 1.3896690008295718, "learning_rate": 3.517351793943913e-07, "loss": 0.0339, "step": 989 }, { "epoch": 1.7615658362989324, "grad_norm": 1.3806573572348462, "learning_rate": 3.4660442326049704e-07, "loss": 0.0296, "step": 990 }, { "epoch": 1.7633451957295374, "grad_norm": 1.2620526912546866, "learning_rate": 3.4151001983835696e-07, "loss": 0.0363, "step": 991 }, { "epoch": 1.7651245551601424, "grad_norm": 1.532310867131612, "learning_rate": 3.364520089258727e-07, "loss": 0.036, "step": 992 }, { "epoch": 1.7669039145907472, "grad_norm": 1.3687563826799665, "learning_rate": 3.314304300366461e-07, "loss": 0.0365, "step": 993 }, { "epoch": 1.7686832740213523, "grad_norm": 1.411437401308879, "learning_rate": 3.2644532239966444e-07, "loss": 0.0421, "step": 994 }, { "epoch": 1.7704626334519573, "grad_norm": 1.1120626082359164, "learning_rate": 3.2149672495900286e-07, "loss": 0.0305, "step": 995 }, { "epoch": 1.7722419928825621, "grad_norm": 1.3261442032732178, "learning_rate": 3.165846763735153e-07, "loss": 0.0404, "step": 996 }, { "epoch": 1.7740213523131674, "grad_norm": 1.3314927487277297, "learning_rate": 3.117092150165324e-07, "loss": 0.0395, "step": 997 }, { "epoch": 1.7758007117437722, "grad_norm": 1.354501013095216, "learning_rate": 3.068703789755606e-07, "loss": 0.0392, "step": 998 }, { "epoch": 1.7775800711743772, "grad_norm": 1.494535900064754, "learning_rate": 3.020682060519886e-07, "loss": 0.0388, "step": 999 }, { "epoch": 1.7793594306049823, "grad_norm": 1.5874497599489694, "learning_rate": 2.9730273376078923e-07, "loss": 0.0414, "step": 1000 }, { "epoch": 1.7793594306049823, "eval_loss": 0.13214601576328278, "eval_runtime": 1.901, "eval_samples_per_second": 24.198, "eval_steps_per_second": 6.312, "step": 1000 }, { "epoch": 1.781138790035587, "grad_norm": 2.3994380111667395, "learning_rate": 2.9257399933022737e-07, "loss": 0.0501, "step": 1001 }, { "epoch": 1.7829181494661923, "grad_norm": 1.2655566792616009, "learning_rate": 2.8788203970156805e-07, "loss": 0.0312, "step": 1002 }, { "epoch": 1.7846975088967971, "grad_norm": 1.3444020777696293, "learning_rate": 2.832268915287878e-07, "loss": 0.0437, "step": 1003 }, { "epoch": 1.7864768683274022, "grad_norm": 1.5542906543407722, "learning_rate": 2.7860859117828985e-07, "loss": 0.0428, "step": 1004 }, { "epoch": 1.7882562277580072, "grad_norm": 2.321897861204451, "learning_rate": 2.740271747286194e-07, "loss": 0.0747, "step": 1005 }, { "epoch": 1.790035587188612, "grad_norm": 1.330710360419337, "learning_rate": 2.6948267797018145e-07, "loss": 0.0356, "step": 1006 }, { "epoch": 1.791814946619217, "grad_norm": 1.3689667536089076, "learning_rate": 2.649751364049613e-07, "loss": 0.0329, "step": 1007 }, { "epoch": 1.793594306049822, "grad_norm": 1.382354181385714, "learning_rate": 2.6050458524624735e-07, "loss": 0.0349, "step": 1008 }, { "epoch": 1.795373665480427, "grad_norm": 1.2218244187209417, "learning_rate": 2.560710594183552e-07, "loss": 0.0298, "step": 1009 }, { "epoch": 1.7971530249110321, "grad_norm": 1.2892594694947963, "learning_rate": 2.5167459355635524e-07, "loss": 0.0407, "step": 1010 }, { "epoch": 1.798932384341637, "grad_norm": 1.6023679642626898, "learning_rate": 2.473152220058039e-07, "loss": 0.0387, "step": 1011 }, { "epoch": 1.800711743772242, "grad_norm": 1.665045377876774, "learning_rate": 2.429929788224722e-07, "loss": 0.0433, "step": 1012 }, { "epoch": 1.802491103202847, "grad_norm": 1.3909161579328713, "learning_rate": 2.38707897772083e-07, "loss": 0.0449, "step": 1013 }, { "epoch": 1.8042704626334518, "grad_norm": 1.5830892876212914, "learning_rate": 2.3446001233004333e-07, "loss": 0.0525, "step": 1014 }, { "epoch": 1.806049822064057, "grad_norm": 1.8525572439643299, "learning_rate": 2.3024935568118745e-07, "loss": 0.0409, "step": 1015 }, { "epoch": 1.8078291814946619, "grad_norm": 1.2135030517745455, "learning_rate": 2.2607596071951288e-07, "loss": 0.0292, "step": 1016 }, { "epoch": 1.809608540925267, "grad_norm": 1.1214741935267347, "learning_rate": 2.2193986004792667e-07, "loss": 0.032, "step": 1017 }, { "epoch": 1.811387900355872, "grad_norm": 1.068768510245524, "learning_rate": 2.1784108597799058e-07, "loss": 0.0209, "step": 1018 }, { "epoch": 1.8131672597864767, "grad_norm": 1.384011261046797, "learning_rate": 2.1377967052966685e-07, "loss": 0.0358, "step": 1019 }, { "epoch": 1.814946619217082, "grad_norm": 1.2099586837907217, "learning_rate": 2.0975564543107007e-07, "loss": 0.0331, "step": 1020 }, { "epoch": 1.8167259786476868, "grad_norm": 1.2940151547770433, "learning_rate": 2.057690421182168e-07, "loss": 0.0334, "step": 1021 }, { "epoch": 1.8185053380782918, "grad_norm": 1.3827063274317817, "learning_rate": 2.01819891734783e-07, "loss": 0.0397, "step": 1022 }, { "epoch": 1.8202846975088969, "grad_norm": 1.3215914412490635, "learning_rate": 1.979082251318576e-07, "loss": 0.0343, "step": 1023 }, { "epoch": 1.8220640569395017, "grad_norm": 1.2162108890219305, "learning_rate": 1.9403407286770592e-07, "loss": 0.0293, "step": 1024 }, { "epoch": 1.8238434163701067, "grad_norm": 1.193119010928115, "learning_rate": 1.9019746520752502e-07, "loss": 0.0382, "step": 1025 }, { "epoch": 1.8256227758007118, "grad_norm": 1.2590633332237346, "learning_rate": 1.8639843212321206e-07, "loss": 0.0379, "step": 1026 }, { "epoch": 1.8274021352313166, "grad_norm": 1.4827732789423547, "learning_rate": 1.826370032931285e-07, "loss": 0.0386, "step": 1027 }, { "epoch": 1.8291814946619218, "grad_norm": 1.4019662312978882, "learning_rate": 1.789132081018674e-07, "loss": 0.0344, "step": 1028 }, { "epoch": 1.8309608540925266, "grad_norm": 1.277130084295682, "learning_rate": 1.7522707564002706e-07, "loss": 0.0311, "step": 1029 }, { "epoch": 1.8327402135231317, "grad_norm": 1.1965932241938335, "learning_rate": 1.7157863470397718e-07, "loss": 0.0262, "step": 1030 }, { "epoch": 1.8345195729537367, "grad_norm": 1.5200736373961061, "learning_rate": 1.6796791379564138e-07, "loss": 0.0416, "step": 1031 }, { "epoch": 1.8362989323843415, "grad_norm": 1.2800099516950374, "learning_rate": 1.6439494112227173e-07, "loss": 0.0278, "step": 1032 }, { "epoch": 1.8380782918149468, "grad_norm": 1.5304018078565587, "learning_rate": 1.6085974459622567e-07, "loss": 0.0485, "step": 1033 }, { "epoch": 1.8398576512455516, "grad_norm": 1.071069909759288, "learning_rate": 1.573623518347517e-07, "loss": 0.0266, "step": 1034 }, { "epoch": 1.8416370106761566, "grad_norm": 1.493163724386636, "learning_rate": 1.5390279015977117e-07, "loss": 0.0453, "step": 1035 }, { "epoch": 1.8434163701067616, "grad_norm": 0.9977415418263439, "learning_rate": 1.5048108659766693e-07, "loss": 0.0246, "step": 1036 }, { "epoch": 1.8451957295373664, "grad_norm": 1.2758345593726452, "learning_rate": 1.470972678790711e-07, "loss": 0.0402, "step": 1037 }, { "epoch": 1.8469750889679717, "grad_norm": 1.321818491410688, "learning_rate": 1.437513604386559e-07, "loss": 0.0488, "step": 1038 }, { "epoch": 1.8487544483985765, "grad_norm": 1.3564010771277493, "learning_rate": 1.404433904149266e-07, "loss": 0.0319, "step": 1039 }, { "epoch": 1.8505338078291815, "grad_norm": 2.1447861358818656, "learning_rate": 1.3717338365001943e-07, "loss": 0.0353, "step": 1040 }, { "epoch": 1.8523131672597866, "grad_norm": 1.621539034569892, "learning_rate": 1.3394136568949834e-07, "loss": 0.0486, "step": 1041 }, { "epoch": 1.8540925266903914, "grad_norm": 1.7080237630248185, "learning_rate": 1.307473617821553e-07, "loss": 0.0437, "step": 1042 }, { "epoch": 1.8558718861209964, "grad_norm": 1.6764300595040629, "learning_rate": 1.275913968798137e-07, "loss": 0.0406, "step": 1043 }, { "epoch": 1.8576512455516014, "grad_norm": 1.5347915498756959, "learning_rate": 1.2447349563713186e-07, "loss": 0.0478, "step": 1044 }, { "epoch": 1.8594306049822062, "grad_norm": 1.0943355790240306, "learning_rate": 1.213936824114137e-07, "loss": 0.0298, "step": 1045 }, { "epoch": 1.8612099644128115, "grad_norm": 1.613150778817604, "learning_rate": 1.1835198126241509e-07, "loss": 0.0405, "step": 1046 }, { "epoch": 1.8629893238434163, "grad_norm": 1.5848325689123757, "learning_rate": 1.1534841595215617e-07, "loss": 0.0401, "step": 1047 }, { "epoch": 1.8647686832740213, "grad_norm": 1.0431301216835898, "learning_rate": 1.1238300994473983e-07, "loss": 0.023, "step": 1048 }, { "epoch": 1.8665480427046264, "grad_norm": 1.3741342743919496, "learning_rate": 1.0945578640616183e-07, "loss": 0.0453, "step": 1049 }, { "epoch": 1.8683274021352312, "grad_norm": 1.2455640295474606, "learning_rate": 1.0656676820413603e-07, "loss": 0.0241, "step": 1050 }, { "epoch": 1.8701067615658364, "grad_norm": 1.1284197355327303, "learning_rate": 1.0371597790791166e-07, "loss": 0.0297, "step": 1051 }, { "epoch": 1.8718861209964412, "grad_norm": 1.5640890914227485, "learning_rate": 1.0090343778809908e-07, "loss": 0.0455, "step": 1052 }, { "epoch": 1.8736654804270463, "grad_norm": 1.853926089016026, "learning_rate": 9.812916981649433e-08, "loss": 0.0522, "step": 1053 }, { "epoch": 1.8754448398576513, "grad_norm": 1.2087726984754221, "learning_rate": 9.539319566590766e-08, "loss": 0.0386, "step": 1054 }, { "epoch": 1.8772241992882561, "grad_norm": 1.3864583312632892, "learning_rate": 9.269553670999743e-08, "loss": 0.0414, "step": 1055 }, { "epoch": 1.8790035587188612, "grad_norm": 1.7413407084585752, "learning_rate": 9.003621402309815e-08, "loss": 0.0432, "step": 1056 }, { "epoch": 1.8807829181494662, "grad_norm": 1.7263371896066506, "learning_rate": 8.741524838005888e-08, "loss": 0.0462, "step": 1057 }, { "epoch": 1.8825622775800712, "grad_norm": 1.3344573548942262, "learning_rate": 8.483266025608061e-08, "loss": 0.0281, "step": 1058 }, { "epoch": 1.8843416370106763, "grad_norm": 1.2682211863586654, "learning_rate": 8.228846982655525e-08, "loss": 0.0336, "step": 1059 }, { "epoch": 1.886120996441281, "grad_norm": 1.3925211114661775, "learning_rate": 7.978269696691021e-08, "loss": 0.0443, "step": 1060 }, { "epoch": 1.887900355871886, "grad_norm": 1.4338224449866763, "learning_rate": 7.731536125244965e-08, "loss": 0.0389, "step": 1061 }, { "epoch": 1.8896797153024911, "grad_norm": 1.5427573154466507, "learning_rate": 7.488648195820513e-08, "loss": 0.0544, "step": 1062 }, { "epoch": 1.891459074733096, "grad_norm": 1.9957246447587085, "learning_rate": 7.249607805878245e-08, "loss": 0.0492, "step": 1063 }, { "epoch": 1.8932384341637012, "grad_norm": 1.5083653751763368, "learning_rate": 7.014416822821557e-08, "loss": 0.037, "step": 1064 }, { "epoch": 1.895017793594306, "grad_norm": 1.095822277468113, "learning_rate": 6.783077083981793e-08, "loss": 0.0235, "step": 1065 }, { "epoch": 1.896797153024911, "grad_norm": 1.4578622440819116, "learning_rate": 6.55559039660425e-08, "loss": 0.0352, "step": 1066 }, { "epoch": 1.898576512455516, "grad_norm": 1.6579309111167273, "learning_rate": 6.331958537833693e-08, "loss": 0.0428, "step": 1067 }, { "epoch": 1.9003558718861209, "grad_norm": 1.5358610493871263, "learning_rate": 6.112183254700866e-08, "loss": 0.0405, "step": 1068 }, { "epoch": 1.9021352313167261, "grad_norm": 1.260740212875412, "learning_rate": 5.8962662641083856e-08, "loss": 0.0292, "step": 1069 }, { "epoch": 1.903914590747331, "grad_norm": 1.4404732905947126, "learning_rate": 5.6842092528176516e-08, "loss": 0.039, "step": 1070 }, { "epoch": 1.905693950177936, "grad_norm": 1.3400934906594497, "learning_rate": 5.476013877435626e-08, "loss": 0.0332, "step": 1071 }, { "epoch": 1.907473309608541, "grad_norm": 1.558440412579011, "learning_rate": 5.271681764401848e-08, "loss": 0.0396, "step": 1072 }, { "epoch": 1.9092526690391458, "grad_norm": 1.0564459406252982, "learning_rate": 5.071214509975775e-08, "loss": 0.0298, "step": 1073 }, { "epoch": 1.9110320284697508, "grad_norm": 1.5703824080677984, "learning_rate": 4.8746136802240716e-08, "loss": 0.0388, "step": 1074 }, { "epoch": 1.9128113879003559, "grad_norm": 1.5587546168148023, "learning_rate": 4.6818808110087875e-08, "loss": 0.0445, "step": 1075 }, { "epoch": 1.914590747330961, "grad_norm": 1.7881392555936386, "learning_rate": 4.493017407975087e-08, "loss": 0.0562, "step": 1076 }, { "epoch": 1.916370106761566, "grad_norm": 1.2093607732055727, "learning_rate": 4.308024946539424e-08, "loss": 0.0259, "step": 1077 }, { "epoch": 1.9181494661921707, "grad_norm": 1.4963960688310853, "learning_rate": 4.1269048718783344e-08, "loss": 0.0353, "step": 1078 }, { "epoch": 1.9199288256227758, "grad_norm": 1.2644520299129427, "learning_rate": 3.9496585989167726e-08, "loss": 0.0297, "step": 1079 }, { "epoch": 1.9217081850533808, "grad_norm": 1.6218651221441036, "learning_rate": 3.776287512317345e-08, "loss": 0.0399, "step": 1080 }, { "epoch": 1.9234875444839856, "grad_norm": 1.3533436539813684, "learning_rate": 3.606792966469375e-08, "loss": 0.0333, "step": 1081 }, { "epoch": 1.9252669039145909, "grad_norm": 1.3436293271314022, "learning_rate": 3.4411762854782426e-08, "loss": 0.0339, "step": 1082 }, { "epoch": 1.9270462633451957, "grad_norm": 1.2424682477808022, "learning_rate": 3.279438763155174e-08, "loss": 0.0222, "step": 1083 }, { "epoch": 1.9288256227758007, "grad_norm": 1.4625796682975236, "learning_rate": 3.121581663007134e-08, "loss": 0.0441, "step": 1084 }, { "epoch": 1.9306049822064058, "grad_norm": 1.6645394110511582, "learning_rate": 2.967606218226837e-08, "loss": 0.0464, "step": 1085 }, { "epoch": 1.9323843416370106, "grad_norm": 1.106478723782067, "learning_rate": 2.8175136316832e-08, "loss": 0.027, "step": 1086 }, { "epoch": 1.9341637010676158, "grad_norm": 1.8103239757671263, "learning_rate": 2.6713050759120117e-08, "loss": 0.0592, "step": 1087 }, { "epoch": 1.9359430604982206, "grad_norm": 1.2901120841696607, "learning_rate": 2.528981693106558e-08, "loss": 0.03, "step": 1088 }, { "epoch": 1.9377224199288257, "grad_norm": 1.422552779123322, "learning_rate": 2.3905445951089013e-08, "loss": 0.0384, "step": 1089 }, { "epoch": 1.9395017793594307, "grad_norm": 1.8539314540512652, "learning_rate": 2.2559948634011673e-08, "loss": 0.0365, "step": 1090 }, { "epoch": 1.9412811387900355, "grad_norm": 1.5661200081043165, "learning_rate": 2.125333549096942e-08, "loss": 0.0381, "step": 1091 }, { "epoch": 1.9430604982206405, "grad_norm": 1.1789336601585891, "learning_rate": 1.9985616729332747e-08, "loss": 0.0343, "step": 1092 }, { "epoch": 1.9448398576512456, "grad_norm": 1.4173486867137657, "learning_rate": 1.8756802252625773e-08, "loss": 0.04, "step": 1093 }, { "epoch": 1.9466192170818504, "grad_norm": 1.2219710901824574, "learning_rate": 1.75669016604485e-08, "loss": 0.0337, "step": 1094 }, { "epoch": 1.9483985765124556, "grad_norm": 1.5078105550310874, "learning_rate": 1.6415924248403547e-08, "loss": 0.0292, "step": 1095 }, { "epoch": 1.9501779359430604, "grad_norm": 1.5769468954516352, "learning_rate": 1.5303879008021773e-08, "loss": 0.046, "step": 1096 }, { "epoch": 1.9519572953736655, "grad_norm": 1.5417189576459365, "learning_rate": 1.4230774626691756e-08, "loss": 0.0338, "step": 1097 }, { "epoch": 1.9537366548042705, "grad_norm": 1.4625492420378, "learning_rate": 1.3196619487594875e-08, "loss": 0.0419, "step": 1098 }, { "epoch": 1.9555160142348753, "grad_norm": 1.6253293099375465, "learning_rate": 1.2201421669636448e-08, "loss": 0.0449, "step": 1099 }, { "epoch": 1.9572953736654806, "grad_norm": 1.2845684493583356, "learning_rate": 1.1245188947384133e-08, "loss": 0.0321, "step": 1100 }, { "epoch": 1.9590747330960854, "grad_norm": 1.5211741158195164, "learning_rate": 1.0327928791006858e-08, "loss": 0.0394, "step": 1101 }, { "epoch": 1.9608540925266904, "grad_norm": 1.32502918966484, "learning_rate": 9.449648366217645e-09, "loss": 0.0388, "step": 1102 }, { "epoch": 1.9626334519572954, "grad_norm": 1.2306329274971302, "learning_rate": 8.61035453421588e-09, "loss": 0.0326, "step": 1103 }, { "epoch": 1.9644128113879002, "grad_norm": 1.259497000375209, "learning_rate": 7.81005385163458e-09, "loss": 0.0337, "step": 1104 }, { "epoch": 1.9661921708185055, "grad_norm": 1.0711614937711706, "learning_rate": 7.048752570488205e-09, "loss": 0.0349, "step": 1105 }, { "epoch": 1.9679715302491103, "grad_norm": 1.671578527538187, "learning_rate": 6.326456638125478e-09, "loss": 0.0491, "step": 1106 }, { "epoch": 1.9697508896797153, "grad_norm": 1.2763717849779994, "learning_rate": 5.643171697183314e-09, "loss": 0.0346, "step": 1107 }, { "epoch": 1.9715302491103204, "grad_norm": 1.3179124608704447, "learning_rate": 4.998903085539075e-09, "loss": 0.039, "step": 1108 }, { "epoch": 1.9733096085409252, "grad_norm": 1.3806194230293778, "learning_rate": 4.393655836272825e-09, "loss": 0.0332, "step": 1109 }, { "epoch": 1.9750889679715302, "grad_norm": 1.2918039555657261, "learning_rate": 3.8274346776262514e-09, "loss": 0.0331, "step": 1110 }, { "epoch": 1.9768683274021353, "grad_norm": 1.4506145536156116, "learning_rate": 3.300244032966582e-09, "loss": 0.0428, "step": 1111 }, { "epoch": 1.97864768683274, "grad_norm": 1.4951087403195147, "learning_rate": 2.8120880207493928e-09, "loss": 0.0332, "step": 1112 }, { "epoch": 1.9804270462633453, "grad_norm": 1.1081908867353776, "learning_rate": 2.362970454491409e-09, "loss": 0.0236, "step": 1113 }, { "epoch": 1.9822064056939501, "grad_norm": 1.5718787811958137, "learning_rate": 1.952894842735531e-09, "loss": 0.046, "step": 1114 }, { "epoch": 1.9839857651245552, "grad_norm": 1.048518531118998, "learning_rate": 1.5818643890258555e-09, "loss": 0.0284, "step": 1115 }, { "epoch": 1.9857651245551602, "grad_norm": 1.8576913911909543, "learning_rate": 1.2498819918843609e-09, "loss": 0.0395, "step": 1116 }, { "epoch": 1.987544483985765, "grad_norm": 1.7624658512036215, "learning_rate": 9.569502447837053e-10, "loss": 0.041, "step": 1117 }, { "epoch": 1.9893238434163703, "grad_norm": 1.4732334931781161, "learning_rate": 7.03071436131686e-10, "loss": 0.0331, "step": 1118 }, { "epoch": 1.991103202846975, "grad_norm": 1.3765988284402233, "learning_rate": 4.882475492506977e-10, "loss": 0.0368, "step": 1119 }, { "epoch": 1.99288256227758, "grad_norm": 1.4137741396402363, "learning_rate": 3.124802623627465e-10, "loss": 0.0376, "step": 1120 }, { "epoch": 1.9946619217081851, "grad_norm": 1.447737573132864, "learning_rate": 1.7577094857557097e-10, "loss": 0.0476, "step": 1121 }, { "epoch": 1.99644128113879, "grad_norm": 1.1791771499083685, "learning_rate": 7.812067587487093e-11, "loss": 0.0344, "step": 1122 }, { "epoch": 1.998220640569395, "grad_norm": 1.5023715918920821, "learning_rate": 1.9530207111539967e-11, "loss": 0.0332, "step": 1123 }, { "epoch": 2.0, "grad_norm": 1.1718081619906127, "learning_rate": 0.0, "loss": 0.0332, "step": 1124 }, { "epoch": 2.0, "step": 1124, "total_flos": 4998105219072.0, "train_loss": 0.08123247806327517, "train_runtime": 863.7561, "train_samples_per_second": 10.403, "train_steps_per_second": 1.301 } ], "logging_steps": 1, "max_steps": 1124, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4998105219072.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }