{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004347826086956522, "grad_norm": 30.10830424636028, "learning_rate": 9.999883393595949e-06, "loss": 1.1079, "step": 1 }, { "epoch": 0.008695652173913044, "grad_norm": 8.45234959602764, "learning_rate": 9.999533579822611e-06, "loss": 1.0206, "step": 2 }, { "epoch": 0.013043478260869565, "grad_norm": 12.590663103699724, "learning_rate": 9.9989505749962e-06, "loss": 0.8925, "step": 3 }, { "epoch": 0.017391304347826087, "grad_norm": 7.5496112069603, "learning_rate": 9.998134406309555e-06, "loss": 0.8343, "step": 4 }, { "epoch": 0.021739130434782608, "grad_norm": 4.862205369093291, "learning_rate": 9.99708511183087e-06, "loss": 0.6931, "step": 5 }, { "epoch": 0.02608695652173913, "grad_norm": 5.355958790833682, "learning_rate": 9.995802740501933e-06, "loss": 0.8249, "step": 6 }, { "epoch": 0.030434782608695653, "grad_norm": 4.813975580629782, "learning_rate": 9.994287352135826e-06, "loss": 0.8491, "step": 7 }, { "epoch": 0.034782608695652174, "grad_norm": 4.209698696676494, "learning_rate": 9.99253901741414e-06, "loss": 0.7867, "step": 8 }, { "epoch": 0.0391304347826087, "grad_norm": 5.3082388200901685, "learning_rate": 9.99055781788369e-06, "loss": 0.559, "step": 9 }, { "epoch": 0.043478260869565216, "grad_norm": 4.186725588547987, "learning_rate": 9.988343845952697e-06, "loss": 0.8831, "step": 10 }, { "epoch": 0.04782608695652174, "grad_norm": 3.6882197290844796, "learning_rate": 9.985897204886481e-06, "loss": 0.7416, "step": 11 }, { "epoch": 0.05217391304347826, "grad_norm": 4.265123735321938, "learning_rate": 9.983218008802648e-06, "loss": 0.7885, "step": 12 }, { "epoch": 0.05652173913043478, "grad_norm": 3.9942127849301583, "learning_rate": 9.98030638266577e-06, "loss": 0.8622, "step": 13 }, { "epoch": 0.06086956521739131, "grad_norm": 3.2098436387002125, "learning_rate": 9.977162462281544e-06, "loss": 0.7362, "step": 14 }, { "epoch": 0.06521739130434782, "grad_norm": 3.732980481277789, "learning_rate": 9.973786394290475e-06, "loss": 0.7865, "step": 15 }, { "epoch": 0.06956521739130435, "grad_norm": 4.634256740235812, "learning_rate": 9.970178336161018e-06, "loss": 0.7807, "step": 16 }, { "epoch": 0.07391304347826087, "grad_norm": 3.377746664235735, "learning_rate": 9.96633845618225e-06, "loss": 0.945, "step": 17 }, { "epoch": 0.0782608695652174, "grad_norm": 3.495299939802682, "learning_rate": 9.962266933456008e-06, "loss": 0.5839, "step": 18 }, { "epoch": 0.08260869565217391, "grad_norm": 3.7360952184020655, "learning_rate": 9.957963957888542e-06, "loss": 0.8982, "step": 19 }, { "epoch": 0.08695652173913043, "grad_norm": 4.253452751051838, "learning_rate": 9.953429730181653e-06, "loss": 0.7377, "step": 20 }, { "epoch": 0.09130434782608696, "grad_norm": 4.19176844357134, "learning_rate": 9.94866446182334e-06, "loss": 0.8799, "step": 21 }, { "epoch": 0.09565217391304348, "grad_norm": 7.890793002340963, "learning_rate": 9.943668375077926e-06, "loss": 0.9303, "step": 22 }, { "epoch": 0.1, "grad_norm": 4.112696318296595, "learning_rate": 9.938441702975689e-06, "loss": 0.9712, "step": 23 }, { "epoch": 0.10434782608695652, "grad_norm": 4.16656380286025, "learning_rate": 9.932984689302012e-06, "loss": 0.9881, "step": 24 }, { "epoch": 0.10869565217391304, "grad_norm": 5.254098871150094, "learning_rate": 9.927297588585984e-06, "loss": 0.7804, "step": 25 }, { "epoch": 0.11304347826086956, "grad_norm": 4.860475419154404, "learning_rate": 9.921380666088558e-06, "loss": 0.692, "step": 26 }, { "epoch": 0.11739130434782609, "grad_norm": 4.121016653034456, "learning_rate": 9.915234197790153e-06, "loss": 0.9255, "step": 27 }, { "epoch": 0.12173913043478261, "grad_norm": 3.5062273892646836, "learning_rate": 9.908858470377793e-06, "loss": 0.9835, "step": 28 }, { "epoch": 0.12608695652173912, "grad_norm": 3.9902741607607344, "learning_rate": 9.902253781231741e-06, "loss": 0.6559, "step": 29 }, { "epoch": 0.13043478260869565, "grad_norm": 3.72173213984111, "learning_rate": 9.895420438411616e-06, "loss": 0.8773, "step": 30 }, { "epoch": 0.13478260869565217, "grad_norm": 4.1059197008642485, "learning_rate": 9.88835876064203e-06, "loss": 0.763, "step": 31 }, { "epoch": 0.1391304347826087, "grad_norm": 3.448061821839538, "learning_rate": 9.881069077297724e-06, "loss": 0.8204, "step": 32 }, { "epoch": 0.14347826086956522, "grad_norm": 3.5353256397883577, "learning_rate": 9.873551728388203e-06, "loss": 0.7933, "step": 33 }, { "epoch": 0.14782608695652175, "grad_norm": 4.516883030605288, "learning_rate": 9.865807064541878e-06, "loss": 0.8324, "step": 34 }, { "epoch": 0.15217391304347827, "grad_norm": 4.017654364372926, "learning_rate": 9.857835446989708e-06, "loss": 1.1247, "step": 35 }, { "epoch": 0.1565217391304348, "grad_norm": 3.9881313932819276, "learning_rate": 9.849637247548356e-06, "loss": 0.5466, "step": 36 }, { "epoch": 0.1608695652173913, "grad_norm": 4.196181042954896, "learning_rate": 9.841212848602848e-06, "loss": 0.9061, "step": 37 }, { "epoch": 0.16521739130434782, "grad_norm": 3.971698021758679, "learning_rate": 9.832562643088724e-06, "loss": 0.8046, "step": 38 }, { "epoch": 0.16956521739130434, "grad_norm": 3.994700871237702, "learning_rate": 9.823687034473734e-06, "loss": 0.866, "step": 39 }, { "epoch": 0.17391304347826086, "grad_norm": 4.300878065609124, "learning_rate": 9.814586436738998e-06, "loss": 1.0917, "step": 40 }, { "epoch": 0.1782608695652174, "grad_norm": 4.920655306563475, "learning_rate": 9.805261274359705e-06, "loss": 0.8076, "step": 41 }, { "epoch": 0.1826086956521739, "grad_norm": 3.753606357325065, "learning_rate": 9.795711982285317e-06, "loss": 0.8976, "step": 42 }, { "epoch": 0.18695652173913044, "grad_norm": 3.2168659658690495, "learning_rate": 9.785939005919279e-06, "loss": 0.8099, "step": 43 }, { "epoch": 0.19130434782608696, "grad_norm": 3.0870127735418267, "learning_rate": 9.775942801098241e-06, "loss": 0.7528, "step": 44 }, { "epoch": 0.1956521739130435, "grad_norm": 4.360067131834863, "learning_rate": 9.765723834070805e-06, "loss": 0.8897, "step": 45 }, { "epoch": 0.2, "grad_norm": 2.9788089959406663, "learning_rate": 9.755282581475769e-06, "loss": 0.6911, "step": 46 }, { "epoch": 0.20434782608695654, "grad_norm": 4.9444885243270456, "learning_rate": 9.7446195303199e-06, "loss": 0.9875, "step": 47 }, { "epoch": 0.20869565217391303, "grad_norm": 3.0578360199041184, "learning_rate": 9.733735177955219e-06, "loss": 0.5668, "step": 48 }, { "epoch": 0.21304347826086956, "grad_norm": 3.977296603047938, "learning_rate": 9.722630032055804e-06, "loss": 0.8334, "step": 49 }, { "epoch": 0.21739130434782608, "grad_norm": 2.7972114537424146, "learning_rate": 9.711304610594104e-06, "loss": 0.8011, "step": 50 }, { "epoch": 0.2217391304347826, "grad_norm": 3.236782797956836, "learning_rate": 9.699759441816788e-06, "loss": 0.6684, "step": 51 }, { "epoch": 0.22608695652173913, "grad_norm": 4.416621854566764, "learning_rate": 9.687995064220102e-06, "loss": 0.8834, "step": 52 }, { "epoch": 0.23043478260869565, "grad_norm": 3.3671336274674117, "learning_rate": 9.676012026524755e-06, "loss": 0.6965, "step": 53 }, { "epoch": 0.23478260869565218, "grad_norm": 3.9913392830992267, "learning_rate": 9.66381088765032e-06, "loss": 0.938, "step": 54 }, { "epoch": 0.2391304347826087, "grad_norm": 3.760436305337915, "learning_rate": 9.651392216689167e-06, "loss": 1.0933, "step": 55 }, { "epoch": 0.24347826086956523, "grad_norm": 2.7129860608500365, "learning_rate": 9.638756592879923e-06, "loss": 0.6742, "step": 56 }, { "epoch": 0.24782608695652175, "grad_norm": 3.223448161038934, "learning_rate": 9.625904605580452e-06, "loss": 0.7107, "step": 57 }, { "epoch": 0.25217391304347825, "grad_norm": 3.5533175943979782, "learning_rate": 9.61283685424036e-06, "loss": 0.8229, "step": 58 }, { "epoch": 0.2565217391304348, "grad_norm": 4.469619534798299, "learning_rate": 9.599553948373047e-06, "loss": 0.6858, "step": 59 }, { "epoch": 0.2608695652173913, "grad_norm": 3.759555454725975, "learning_rate": 9.586056507527266e-06, "loss": 0.8061, "step": 60 }, { "epoch": 0.26521739130434785, "grad_norm": 4.237886610909077, "learning_rate": 9.572345161258235e-06, "loss": 0.8282, "step": 61 }, { "epoch": 0.26956521739130435, "grad_norm": 3.2748103648644546, "learning_rate": 9.558420549098269e-06, "loss": 0.8382, "step": 62 }, { "epoch": 0.27391304347826084, "grad_norm": 3.610728507102718, "learning_rate": 9.544283320526943e-06, "loss": 0.7024, "step": 63 }, { "epoch": 0.2782608695652174, "grad_norm": 3.0936408893059473, "learning_rate": 9.529934134940819e-06, "loss": 0.6702, "step": 64 }, { "epoch": 0.2826086956521739, "grad_norm": 3.168622390524342, "learning_rate": 9.515373661622665e-06, "loss": 0.8201, "step": 65 }, { "epoch": 0.28695652173913044, "grad_norm": 4.875869865721861, "learning_rate": 9.500602579710256e-06, "loss": 0.6873, "step": 66 }, { "epoch": 0.29130434782608694, "grad_norm": 3.194305690371054, "learning_rate": 9.48562157816469e-06, "loss": 0.667, "step": 67 }, { "epoch": 0.2956521739130435, "grad_norm": 3.3142405160528488, "learning_rate": 9.470431355738257e-06, "loss": 0.8051, "step": 68 }, { "epoch": 0.3, "grad_norm": 4.262816871896188, "learning_rate": 9.45503262094184e-06, "loss": 1.0722, "step": 69 }, { "epoch": 0.30434782608695654, "grad_norm": 3.599023595923798, "learning_rate": 9.439426092011877e-06, "loss": 0.7878, "step": 70 }, { "epoch": 0.30869565217391304, "grad_norm": 5.109580952764362, "learning_rate": 9.423612496876856e-06, "loss": 0.9254, "step": 71 }, { "epoch": 0.3130434782608696, "grad_norm": 3.948434430815186, "learning_rate": 9.407592573123359e-06, "loss": 0.6889, "step": 72 }, { "epoch": 0.3173913043478261, "grad_norm": 4.109705050607312, "learning_rate": 9.39136706796167e-06, "loss": 0.9096, "step": 73 }, { "epoch": 0.3217391304347826, "grad_norm": 4.685155702670585, "learning_rate": 9.374936738190913e-06, "loss": 1.0758, "step": 74 }, { "epoch": 0.32608695652173914, "grad_norm": 3.5583319543828535, "learning_rate": 9.358302350163758e-06, "loss": 0.9975, "step": 75 }, { "epoch": 0.33043478260869563, "grad_norm": 4.132644653851408, "learning_rate": 9.341464679750669e-06, "loss": 0.815, "step": 76 }, { "epoch": 0.3347826086956522, "grad_norm": 4.012241097059724, "learning_rate": 9.32442451230373e-06, "loss": 0.7434, "step": 77 }, { "epoch": 0.3391304347826087, "grad_norm": 4.056336217070605, "learning_rate": 9.307182642620001e-06, "loss": 0.9034, "step": 78 }, { "epoch": 0.34347826086956523, "grad_norm": 5.018176130315487, "learning_rate": 9.289739874904448e-06, "loss": 0.78, "step": 79 }, { "epoch": 0.34782608695652173, "grad_norm": 4.587214956739399, "learning_rate": 9.272097022732444e-06, "loss": 0.6927, "step": 80 }, { "epoch": 0.3521739130434783, "grad_norm": 3.3889605410886965, "learning_rate": 9.254254909011805e-06, "loss": 0.6882, "step": 81 }, { "epoch": 0.3565217391304348, "grad_norm": 4.148693119867694, "learning_rate": 9.236214365944418e-06, "loss": 0.7601, "step": 82 }, { "epoch": 0.36086956521739133, "grad_norm": 5.706866359933195, "learning_rate": 9.217976234987429e-06, "loss": 0.8002, "step": 83 }, { "epoch": 0.3652173913043478, "grad_norm": 3.761765555379755, "learning_rate": 9.199541366813984e-06, "loss": 0.9108, "step": 84 }, { "epoch": 0.3695652173913043, "grad_norm": 4.358469235825558, "learning_rate": 9.180910621273555e-06, "loss": 0.9495, "step": 85 }, { "epoch": 0.3739130434782609, "grad_norm": 5.19380750583511, "learning_rate": 9.16208486735184e-06, "loss": 0.796, "step": 86 }, { "epoch": 0.3782608695652174, "grad_norm": 4.40504584356501, "learning_rate": 9.14306498313023e-06, "loss": 0.9202, "step": 87 }, { "epoch": 0.3826086956521739, "grad_norm": 4.154237190106969, "learning_rate": 9.123851855744842e-06, "loss": 0.868, "step": 88 }, { "epoch": 0.3869565217391304, "grad_norm": 4.649679162473941, "learning_rate": 9.10444638134516e-06, "loss": 0.8633, "step": 89 }, { "epoch": 0.391304347826087, "grad_norm": 3.659547834041831, "learning_rate": 9.08484946505221e-06, "loss": 0.4639, "step": 90 }, { "epoch": 0.39565217391304347, "grad_norm": 3.772374253015877, "learning_rate": 9.065062020916376e-06, "loss": 1.0704, "step": 91 }, { "epoch": 0.4, "grad_norm": 4.044411509493095, "learning_rate": 9.045084971874738e-06, "loss": 0.7493, "step": 92 }, { "epoch": 0.4043478260869565, "grad_norm": 5.291875262805496, "learning_rate": 9.024919249708034e-06, "loss": 0.6936, "step": 93 }, { "epoch": 0.40869565217391307, "grad_norm": 4.964153371450278, "learning_rate": 9.004565794997209e-06, "loss": 1.2122, "step": 94 }, { "epoch": 0.41304347826086957, "grad_norm": 4.207843872473807, "learning_rate": 8.984025557079523e-06, "loss": 0.8132, "step": 95 }, { "epoch": 0.41739130434782606, "grad_norm": 6.132343241796095, "learning_rate": 8.963299494004292e-06, "loss": 0.8736, "step": 96 }, { "epoch": 0.4217391304347826, "grad_norm": 2.6269984574966365, "learning_rate": 8.942388572488188e-06, "loss": 0.8824, "step": 97 }, { "epoch": 0.4260869565217391, "grad_norm": 4.095187049220555, "learning_rate": 8.921293767870157e-06, "loss": 0.959, "step": 98 }, { "epoch": 0.43043478260869567, "grad_norm": 4.199828959982439, "learning_rate": 8.900016064065923e-06, "loss": 0.9751, "step": 99 }, { "epoch": 0.43478260869565216, "grad_norm": 3.3743148628694457, "learning_rate": 8.8785564535221e-06, "loss": 0.734, "step": 100 }, { "epoch": 0.4391304347826087, "grad_norm": 7.940326346556448, "learning_rate": 8.85691593716989e-06, "loss": 0.9944, "step": 101 }, { "epoch": 0.4434782608695652, "grad_norm": 3.7567313163102205, "learning_rate": 8.835095524378413e-06, "loss": 0.8144, "step": 102 }, { "epoch": 0.44782608695652176, "grad_norm": 5.966995376699713, "learning_rate": 8.81309623290762e-06, "loss": 1.1465, "step": 103 }, { "epoch": 0.45217391304347826, "grad_norm": 2.6433858040324907, "learning_rate": 8.790919088860815e-06, "loss": 0.5616, "step": 104 }, { "epoch": 0.45652173913043476, "grad_norm": 4.060660371560683, "learning_rate": 8.768565126636806e-06, "loss": 1.0732, "step": 105 }, { "epoch": 0.4608695652173913, "grad_norm": 4.243491466169052, "learning_rate": 8.746035388881655e-06, "loss": 0.8622, "step": 106 }, { "epoch": 0.4652173913043478, "grad_norm": 3.9070077494772115, "learning_rate": 8.723330926440045e-06, "loss": 0.9579, "step": 107 }, { "epoch": 0.46956521739130436, "grad_norm": 3.315573230842983, "learning_rate": 8.70045279830626e-06, "loss": 0.8125, "step": 108 }, { "epoch": 0.47391304347826085, "grad_norm": 3.418028185224645, "learning_rate": 8.677402071574806e-06, "loss": 0.5519, "step": 109 }, { "epoch": 0.4782608695652174, "grad_norm": 2.3583616451084706, "learning_rate": 8.65417982139062e-06, "loss": 0.5283, "step": 110 }, { "epoch": 0.4826086956521739, "grad_norm": 3.5720694264768937, "learning_rate": 8.630787130898943e-06, "loss": 0.652, "step": 111 }, { "epoch": 0.48695652173913045, "grad_norm": 3.5329821794242533, "learning_rate": 8.60722509119478e-06, "loss": 0.6831, "step": 112 }, { "epoch": 0.49130434782608695, "grad_norm": 3.375309035663863, "learning_rate": 8.583494801272018e-06, "loss": 0.8531, "step": 113 }, { "epoch": 0.4956521739130435, "grad_norm": 3.5989978695462663, "learning_rate": 8.559597367972168e-06, "loss": 0.6959, "step": 114 }, { "epoch": 0.5, "grad_norm": 2.5589265753781603, "learning_rate": 8.535533905932739e-06, "loss": 0.7375, "step": 115 }, { "epoch": 0.5043478260869565, "grad_norm": 4.346599181980232, "learning_rate": 8.511305537535238e-06, "loss": 0.8875, "step": 116 }, { "epoch": 0.508695652173913, "grad_norm": 3.387720172084152, "learning_rate": 8.48691339285283e-06, "loss": 0.593, "step": 117 }, { "epoch": 0.5130434782608696, "grad_norm": 5.311451544282254, "learning_rate": 8.462358609597629e-06, "loss": 0.7789, "step": 118 }, { "epoch": 0.5173913043478261, "grad_norm": 3.8052363309273147, "learning_rate": 8.437642333067626e-06, "loss": 0.9247, "step": 119 }, { "epoch": 0.5217391304347826, "grad_norm": 3.803654644029343, "learning_rate": 8.412765716093273e-06, "loss": 0.9483, "step": 120 }, { "epoch": 0.5260869565217391, "grad_norm": 3.120957568605628, "learning_rate": 8.387729918983706e-06, "loss": 0.7953, "step": 121 }, { "epoch": 0.5304347826086957, "grad_norm": 3.6215649638849317, "learning_rate": 8.362536109472637e-06, "loss": 0.9418, "step": 122 }, { "epoch": 0.5347826086956522, "grad_norm": 3.769862249632324, "learning_rate": 8.33718546266388e-06, "loss": 1.0663, "step": 123 }, { "epoch": 0.5391304347826087, "grad_norm": 3.339932901634682, "learning_rate": 8.31167916097654e-06, "loss": 0.7193, "step": 124 }, { "epoch": 0.5434782608695652, "grad_norm": 3.4380458029708603, "learning_rate": 8.286018394089864e-06, "loss": 0.8919, "step": 125 }, { "epoch": 0.5478260869565217, "grad_norm": 5.277459992809238, "learning_rate": 8.260204358887753e-06, "loss": 0.9262, "step": 126 }, { "epoch": 0.5521739130434783, "grad_norm": 4.2317096822642615, "learning_rate": 8.234238259402936e-06, "loss": 0.6884, "step": 127 }, { "epoch": 0.5565217391304348, "grad_norm": 4.526808713688422, "learning_rate": 8.208121306760806e-06, "loss": 0.976, "step": 128 }, { "epoch": 0.5608695652173913, "grad_norm": 5.4246343120836595, "learning_rate": 8.181854719122938e-06, "loss": 0.9472, "step": 129 }, { "epoch": 0.5652173913043478, "grad_norm": 5.84927247067023, "learning_rate": 8.155439721630265e-06, "loss": 1.205, "step": 130 }, { "epoch": 0.5695652173913044, "grad_norm": 4.573422770218524, "learning_rate": 8.128877546345932e-06, "loss": 0.7917, "step": 131 }, { "epoch": 0.5739130434782609, "grad_norm": 3.721444294410019, "learning_rate": 8.102169432197842e-06, "loss": 0.6568, "step": 132 }, { "epoch": 0.5782608695652174, "grad_norm": 4.9417471562139985, "learning_rate": 8.075316624920848e-06, "loss": 0.7244, "step": 133 }, { "epoch": 0.5826086956521739, "grad_norm": 3.5642397377694577, "learning_rate": 8.048320376998675e-06, "loss": 0.8697, "step": 134 }, { "epoch": 0.5869565217391305, "grad_norm": 3.8198832174635315, "learning_rate": 8.021181947605474e-06, "loss": 0.7617, "step": 135 }, { "epoch": 0.591304347826087, "grad_norm": 3.481415882832902, "learning_rate": 7.993902602547113e-06, "loss": 0.79, "step": 136 }, { "epoch": 0.5956521739130435, "grad_norm": 4.510299214820252, "learning_rate": 7.966483614202127e-06, "loss": 0.9127, "step": 137 }, { "epoch": 0.6, "grad_norm": 2.985106774813847, "learning_rate": 7.938926261462366e-06, "loss": 0.6109, "step": 138 }, { "epoch": 0.6043478260869565, "grad_norm": 3.4432849581794254, "learning_rate": 7.911231829673356e-06, "loss": 0.7885, "step": 139 }, { "epoch": 0.6086956521739131, "grad_norm": 4.31295058606684, "learning_rate": 7.883401610574338e-06, "loss": 0.6665, "step": 140 }, { "epoch": 0.6130434782608696, "grad_norm": 3.411489887606749, "learning_rate": 7.855436902238018e-06, "loss": 0.8652, "step": 141 }, { "epoch": 0.6173913043478261, "grad_norm": 2.6161499184487154, "learning_rate": 7.82733900901003e-06, "loss": 0.8522, "step": 142 }, { "epoch": 0.6217391304347826, "grad_norm": 3.7587252913188074, "learning_rate": 7.799109241448091e-06, "loss": 0.9895, "step": 143 }, { "epoch": 0.6260869565217392, "grad_norm": 4.171495681712704, "learning_rate": 7.770748916260875e-06, "loss": 1.1143, "step": 144 }, { "epoch": 0.6304347826086957, "grad_norm": 4.498205067110739, "learning_rate": 7.742259356246594e-06, "loss": 0.7515, "step": 145 }, { "epoch": 0.6347826086956522, "grad_norm": 7.042541090075897, "learning_rate": 7.71364189023131e-06, "loss": 1.019, "step": 146 }, { "epoch": 0.6391304347826087, "grad_norm": 3.4294733121091885, "learning_rate": 7.68489785300694e-06, "loss": 0.7827, "step": 147 }, { "epoch": 0.6434782608695652, "grad_norm": 5.107734671780975, "learning_rate": 7.656028585269017e-06, "loss": 0.881, "step": 148 }, { "epoch": 0.6478260869565218, "grad_norm": 3.793430012808144, "learning_rate": 7.627035433554138e-06, "loss": 0.83, "step": 149 }, { "epoch": 0.6521739130434783, "grad_norm": 2.93598624465324, "learning_rate": 7.597919750177168e-06, "loss": 0.731, "step": 150 }, { "epoch": 0.6565217391304348, "grad_norm": 4.596730503707614, "learning_rate": 7.5686828931681646e-06, "loss": 0.7652, "step": 151 }, { "epoch": 0.6608695652173913, "grad_norm": 3.6807761395861203, "learning_rate": 7.539326226209032e-06, "loss": 0.7775, "step": 152 }, { "epoch": 0.6652173913043479, "grad_norm": 3.262665001957235, "learning_rate": 7.509851118569915e-06, "loss": 0.8261, "step": 153 }, { "epoch": 0.6695652173913044, "grad_norm": 3.754006135747569, "learning_rate": 7.4802589450453415e-06, "loss": 0.9438, "step": 154 }, { "epoch": 0.6739130434782609, "grad_norm": 4.136880641675532, "learning_rate": 7.450551085890087e-06, "loss": 0.7168, "step": 155 }, { "epoch": 0.6782608695652174, "grad_norm": 4.537082958373421, "learning_rate": 7.420728926754803e-06, "loss": 0.7393, "step": 156 }, { "epoch": 0.6826086956521739, "grad_norm": 4.709451101592196, "learning_rate": 7.390793858621386e-06, "loss": 0.6084, "step": 157 }, { "epoch": 0.6869565217391305, "grad_norm": 3.478907976036108, "learning_rate": 7.360747277738094e-06, "loss": 0.9321, "step": 158 }, { "epoch": 0.691304347826087, "grad_norm": 3.4286921023964356, "learning_rate": 7.330590585554428e-06, "loss": 0.8209, "step": 159 }, { "epoch": 0.6956521739130435, "grad_norm": 4.006094985848452, "learning_rate": 7.300325188655762e-06, "loss": 0.7925, "step": 160 }, { "epoch": 0.7, "grad_norm": 3.7845797221282442, "learning_rate": 7.269952498697734e-06, "loss": 0.9974, "step": 161 }, { "epoch": 0.7043478260869566, "grad_norm": 3.554993958659687, "learning_rate": 7.2394739323404105e-06, "loss": 1.091, "step": 162 }, { "epoch": 0.7086956521739131, "grad_norm": 3.360485996421952, "learning_rate": 7.208890911182198e-06, "loss": 1.1279, "step": 163 }, { "epoch": 0.7130434782608696, "grad_norm": 4.068025031948494, "learning_rate": 7.178204861693546e-06, "loss": 0.8294, "step": 164 }, { "epoch": 0.717391304347826, "grad_norm": 5.238100324988785, "learning_rate": 7.147417215150411e-06, "loss": 1.0741, "step": 165 }, { "epoch": 0.7217391304347827, "grad_norm": 3.1428044117390157, "learning_rate": 7.116529407567489e-06, "loss": 0.614, "step": 166 }, { "epoch": 0.7260869565217392, "grad_norm": 4.913916286363875, "learning_rate": 7.085542879631253e-06, "loss": 0.7525, "step": 167 }, { "epoch": 0.7304347826086957, "grad_norm": 3.587542818686942, "learning_rate": 7.054459076632742e-06, "loss": 0.9964, "step": 168 }, { "epoch": 0.7347826086956522, "grad_norm": 4.446156336412497, "learning_rate": 7.0232794484001495e-06, "loss": 0.7518, "step": 169 }, { "epoch": 0.7391304347826086, "grad_norm": 4.1445810778077306, "learning_rate": 6.9920054492312086e-06, "loss": 0.7821, "step": 170 }, { "epoch": 0.7434782608695653, "grad_norm": 6.5457371594473885, "learning_rate": 6.960638537825352e-06, "loss": 1.1481, "step": 171 }, { "epoch": 0.7478260869565218, "grad_norm": 3.6216138215097557, "learning_rate": 6.9291801772156775e-06, "loss": 0.7234, "step": 172 }, { "epoch": 0.7521739130434782, "grad_norm": 3.4588765071154755, "learning_rate": 6.89763183470071e-06, "loss": 0.7835, "step": 173 }, { "epoch": 0.7565217391304347, "grad_norm": 3.4415204598446167, "learning_rate": 6.865994981775958e-06, "loss": 0.8836, "step": 174 }, { "epoch": 0.7608695652173914, "grad_norm": 4.164177481650729, "learning_rate": 6.834271094065284e-06, "loss": 0.9207, "step": 175 }, { "epoch": 0.7652173913043478, "grad_norm": 4.038475026990704, "learning_rate": 6.802461651252073e-06, "loss": 0.8593, "step": 176 }, { "epoch": 0.7695652173913043, "grad_norm": 3.468992427141025, "learning_rate": 6.770568137010226e-06, "loss": 0.9054, "step": 177 }, { "epoch": 0.7739130434782608, "grad_norm": 4.178768224138322, "learning_rate": 6.738592038934946e-06, "loss": 0.8688, "step": 178 }, { "epoch": 0.7782608695652173, "grad_norm": 4.945467366909143, "learning_rate": 6.706534848473353e-06, "loss": 0.8779, "step": 179 }, { "epoch": 0.782608695652174, "grad_norm": 4.150398477498481, "learning_rate": 6.674398060854931e-06, "loss": 0.8055, "step": 180 }, { "epoch": 0.7869565217391304, "grad_norm": 3.911559225842766, "learning_rate": 6.642183175021779e-06, "loss": 0.7101, "step": 181 }, { "epoch": 0.7913043478260869, "grad_norm": 3.227889892012402, "learning_rate": 6.609891693558692e-06, "loss": 0.7036, "step": 182 }, { "epoch": 0.7956521739130434, "grad_norm": 3.3323861592761306, "learning_rate": 6.5775251226230855e-06, "loss": 0.8071, "step": 183 }, { "epoch": 0.8, "grad_norm": 3.574513479071914, "learning_rate": 6.545084971874738e-06, "loss": 0.826, "step": 184 }, { "epoch": 0.8043478260869565, "grad_norm": 4.575750771124376, "learning_rate": 6.51257275440538e-06, "loss": 1.1722, "step": 185 }, { "epoch": 0.808695652173913, "grad_norm": 3.407884861003522, "learning_rate": 6.479989986668118e-06, "loss": 0.7167, "step": 186 }, { "epoch": 0.8130434782608695, "grad_norm": 4.119129382101472, "learning_rate": 6.447338188406705e-06, "loss": 0.9436, "step": 187 }, { "epoch": 0.8173913043478261, "grad_norm": 2.888021714493982, "learning_rate": 6.41461888258465e-06, "loss": 0.7366, "step": 188 }, { "epoch": 0.8217391304347826, "grad_norm": 3.853996538195132, "learning_rate": 6.3818335953141955e-06, "loss": 0.8666, "step": 189 }, { "epoch": 0.8260869565217391, "grad_norm": 3.7044180394270647, "learning_rate": 6.348983855785122e-06, "loss": 0.8697, "step": 190 }, { "epoch": 0.8304347826086956, "grad_norm": 4.285097504630087, "learning_rate": 6.31607119619343e-06, "loss": 0.6411, "step": 191 }, { "epoch": 0.8347826086956521, "grad_norm": 4.274714148494339, "learning_rate": 6.283097151669869e-06, "loss": 0.8285, "step": 192 }, { "epoch": 0.8391304347826087, "grad_norm": 4.403460887604329, "learning_rate": 6.250063260208345e-06, "loss": 0.7585, "step": 193 }, { "epoch": 0.8434782608695652, "grad_norm": 3.9513628543555877, "learning_rate": 6.216971062594179e-06, "loss": 0.8291, "step": 194 }, { "epoch": 0.8478260869565217, "grad_norm": 5.202803286054048, "learning_rate": 6.183822102332234e-06, "loss": 0.8107, "step": 195 }, { "epoch": 0.8521739130434782, "grad_norm": 3.4198047041657187, "learning_rate": 6.1506179255749335e-06, "loss": 0.9379, "step": 196 }, { "epoch": 0.8565217391304348, "grad_norm": 3.0362881676390265, "learning_rate": 6.1173600810501355e-06, "loss": 0.6765, "step": 197 }, { "epoch": 0.8608695652173913, "grad_norm": 3.154366355290239, "learning_rate": 6.084050119988905e-06, "loss": 0.8658, "step": 198 }, { "epoch": 0.8652173913043478, "grad_norm": 5.054661820553674, "learning_rate": 6.050689596053151e-06, "loss": 0.9007, "step": 199 }, { "epoch": 0.8695652173913043, "grad_norm": 2.738655053566571, "learning_rate": 6.0172800652631706e-06, "loss": 0.7356, "step": 200 }, { "epoch": 0.8695652173913043, "eval_loss": 0.825007438659668, "eval_runtime": 1.2203, "eval_samples_per_second": 15.571, "eval_steps_per_second": 4.098, "step": 200 }, { "epoch": 0.8739130434782608, "grad_norm": 3.5083994205280145, "learning_rate": 5.983823085925059e-06, "loss": 0.6462, "step": 201 }, { "epoch": 0.8782608695652174, "grad_norm": 3.824457588570473, "learning_rate": 5.950320218558037e-06, "loss": 0.6327, "step": 202 }, { "epoch": 0.8826086956521739, "grad_norm": 4.165824391035445, "learning_rate": 5.916773025821662e-06, "loss": 0.8199, "step": 203 }, { "epoch": 0.8869565217391304, "grad_norm": 3.3820570905655614, "learning_rate": 5.883183072442938e-06, "loss": 0.654, "step": 204 }, { "epoch": 0.8913043478260869, "grad_norm": 4.3684348308636425, "learning_rate": 5.849551925143334e-06, "loss": 0.8819, "step": 205 }, { "epoch": 0.8956521739130435, "grad_norm": 3.458483498590032, "learning_rate": 5.815881152565712e-06, "loss": 0.8065, "step": 206 }, { "epoch": 0.9, "grad_norm": 3.3805910702179647, "learning_rate": 5.782172325201155e-06, "loss": 1.0186, "step": 207 }, { "epoch": 0.9043478260869565, "grad_norm": 3.8013431956793924, "learning_rate": 5.7484270153157215e-06, "loss": 0.6512, "step": 208 }, { "epoch": 0.908695652173913, "grad_norm": 4.393091604713644, "learning_rate": 5.714646796877108e-06, "loss": 1.0551, "step": 209 }, { "epoch": 0.9130434782608695, "grad_norm": 3.0657867889766854, "learning_rate": 5.680833245481234e-06, "loss": 0.8123, "step": 210 }, { "epoch": 0.9173913043478261, "grad_norm": 4.968276591046584, "learning_rate": 5.646987938278753e-06, "loss": 0.7966, "step": 211 }, { "epoch": 0.9217391304347826, "grad_norm": 4.010276803924688, "learning_rate": 5.613112453901493e-06, "loss": 0.8161, "step": 212 }, { "epoch": 0.9260869565217391, "grad_norm": 3.404075079967763, "learning_rate": 5.579208372388822e-06, "loss": 0.8341, "step": 213 }, { "epoch": 0.9304347826086956, "grad_norm": 4.2830181812234684, "learning_rate": 5.5452772751139496e-06, "loss": 0.7814, "step": 214 }, { "epoch": 0.9347826086956522, "grad_norm": 3.3609431787385544, "learning_rate": 5.511320744710171e-06, "loss": 0.7195, "step": 215 }, { "epoch": 0.9391304347826087, "grad_norm": 4.496810291115776, "learning_rate": 5.477340364997051e-06, "loss": 0.9754, "step": 216 }, { "epoch": 0.9434782608695652, "grad_norm": 5.552749375587317, "learning_rate": 5.443337720906542e-06, "loss": 0.9599, "step": 217 }, { "epoch": 0.9478260869565217, "grad_norm": 2.923863761066833, "learning_rate": 5.409314398409067e-06, "loss": 0.7041, "step": 218 }, { "epoch": 0.9521739130434783, "grad_norm": 3.7061647650339355, "learning_rate": 5.375271984439541e-06, "loss": 0.6793, "step": 219 }, { "epoch": 0.9565217391304348, "grad_norm": 6.101090552344163, "learning_rate": 5.341212066823356e-06, "loss": 0.8546, "step": 220 }, { "epoch": 0.9608695652173913, "grad_norm": 3.96750395410377, "learning_rate": 5.307136234202318e-06, "loss": 0.6732, "step": 221 }, { "epoch": 0.9652173913043478, "grad_norm": 4.528462960181897, "learning_rate": 5.27304607596055e-06, "loss": 1.0779, "step": 222 }, { "epoch": 0.9695652173913043, "grad_norm": 3.7443235175746805, "learning_rate": 5.238943182150361e-06, "loss": 0.6516, "step": 223 }, { "epoch": 0.9739130434782609, "grad_norm": 5.394984365821971, "learning_rate": 5.204829143418072e-06, "loss": 0.709, "step": 224 }, { "epoch": 0.9782608695652174, "grad_norm": 3.678247988171025, "learning_rate": 5.17070555092984e-06, "loss": 0.753, "step": 225 }, { "epoch": 0.9826086956521739, "grad_norm": 4.6791048093273675, "learning_rate": 5.136573996297431e-06, "loss": 1.0238, "step": 226 }, { "epoch": 0.9869565217391304, "grad_norm": 5.143148587456582, "learning_rate": 5.102436071503983e-06, "loss": 0.9125, "step": 227 }, { "epoch": 0.991304347826087, "grad_norm": 2.2412926753259597, "learning_rate": 5.068293368829755e-06, "loss": 0.8642, "step": 228 }, { "epoch": 0.9956521739130435, "grad_norm": 4.942145964692695, "learning_rate": 5.034147480777867e-06, "loss": 0.8394, "step": 229 }, { "epoch": 1.0, "grad_norm": 2.9753321703674143, "learning_rate": 5e-06, "loss": 0.9067, "step": 230 }, { "epoch": 1.0043478260869565, "grad_norm": 4.2945289583949915, "learning_rate": 4.965852519222135e-06, "loss": 0.7034, "step": 231 }, { "epoch": 1.008695652173913, "grad_norm": 3.009740756628107, "learning_rate": 4.931706631170246e-06, "loss": 0.6254, "step": 232 }, { "epoch": 1.0130434782608695, "grad_norm": 3.656846296061854, "learning_rate": 4.89756392849602e-06, "loss": 0.4885, "step": 233 }, { "epoch": 1.017391304347826, "grad_norm": 4.485100058642041, "learning_rate": 4.863426003702572e-06, "loss": 0.7407, "step": 234 }, { "epoch": 1.0217391304347827, "grad_norm": 3.0577381177794174, "learning_rate": 4.829294449070161e-06, "loss": 0.6666, "step": 235 }, { "epoch": 1.0260869565217392, "grad_norm": 3.101255830757204, "learning_rate": 4.795170856581929e-06, "loss": 0.6291, "step": 236 }, { "epoch": 1.0304347826086957, "grad_norm": 2.4284538991912075, "learning_rate": 4.7610568178496405e-06, "loss": 0.5469, "step": 237 }, { "epoch": 1.0347826086956522, "grad_norm": 2.927431656629591, "learning_rate": 4.7269539240394505e-06, "loss": 0.4352, "step": 238 }, { "epoch": 1.0391304347826087, "grad_norm": 2.58940571881285, "learning_rate": 4.692863765797683e-06, "loss": 0.5481, "step": 239 }, { "epoch": 1.0434782608695652, "grad_norm": 2.781029232453596, "learning_rate": 4.6587879331766465e-06, "loss": 0.6503, "step": 240 }, { "epoch": 1.0478260869565217, "grad_norm": 2.6957972463902893, "learning_rate": 4.624728015560461e-06, "loss": 0.6356, "step": 241 }, { "epoch": 1.0521739130434782, "grad_norm": 4.28124421456519, "learning_rate": 4.5906856015909365e-06, "loss": 0.4898, "step": 242 }, { "epoch": 1.0565217391304347, "grad_norm": 2.9643501252854163, "learning_rate": 4.556662279093461e-06, "loss": 0.5339, "step": 243 }, { "epoch": 1.0608695652173914, "grad_norm": 3.8524672927440946, "learning_rate": 4.52265963500295e-06, "loss": 0.7113, "step": 244 }, { "epoch": 1.065217391304348, "grad_norm": 3.4202593539885635, "learning_rate": 4.488679255289829e-06, "loss": 0.6041, "step": 245 }, { "epoch": 1.0695652173913044, "grad_norm": 3.8134083559576277, "learning_rate": 4.454722724886051e-06, "loss": 0.4484, "step": 246 }, { "epoch": 1.0739130434782609, "grad_norm": 3.331059450346738, "learning_rate": 4.4207916276111795e-06, "loss": 0.4856, "step": 247 }, { "epoch": 1.0782608695652174, "grad_norm": 3.823406666773115, "learning_rate": 4.386887546098509e-06, "loss": 0.5426, "step": 248 }, { "epoch": 1.0826086956521739, "grad_norm": 4.6180569791107, "learning_rate": 4.353012061721249e-06, "loss": 0.4695, "step": 249 }, { "epoch": 1.0869565217391304, "grad_norm": 3.6666571715510474, "learning_rate": 4.319166754518768e-06, "loss": 0.3785, "step": 250 }, { "epoch": 1.0913043478260869, "grad_norm": 2.9634895325655766, "learning_rate": 4.285353203122894e-06, "loss": 0.3999, "step": 251 }, { "epoch": 1.0956521739130434, "grad_norm": 4.482488004555733, "learning_rate": 4.251572984684281e-06, "loss": 0.7579, "step": 252 }, { "epoch": 1.1, "grad_norm": 4.845918550477848, "learning_rate": 4.217827674798845e-06, "loss": 0.6067, "step": 253 }, { "epoch": 1.1043478260869566, "grad_norm": 3.6986408697041444, "learning_rate": 4.18411884743429e-06, "loss": 0.485, "step": 254 }, { "epoch": 1.108695652173913, "grad_norm": 3.509551929199139, "learning_rate": 4.150448074856667e-06, "loss": 0.5005, "step": 255 }, { "epoch": 1.1130434782608696, "grad_norm": 4.267669168225376, "learning_rate": 4.116816927557063e-06, "loss": 0.4883, "step": 256 }, { "epoch": 1.117391304347826, "grad_norm": 3.774832262855488, "learning_rate": 4.083226974178339e-06, "loss": 0.3788, "step": 257 }, { "epoch": 1.1217391304347826, "grad_norm": 3.5227515786871697, "learning_rate": 4.0496797814419655e-06, "loss": 0.4932, "step": 258 }, { "epoch": 1.126086956521739, "grad_norm": 4.360078037660404, "learning_rate": 4.016176914074944e-06, "loss": 0.5368, "step": 259 }, { "epoch": 1.1304347826086956, "grad_norm": 4.6746791628195545, "learning_rate": 3.982719934736832e-06, "loss": 0.4211, "step": 260 }, { "epoch": 1.134782608695652, "grad_norm": 2.7321352215986323, "learning_rate": 3.949310403946849e-06, "loss": 0.4899, "step": 261 }, { "epoch": 1.1391304347826088, "grad_norm": 3.5754406942915806, "learning_rate": 3.915949880011096e-06, "loss": 0.4823, "step": 262 }, { "epoch": 1.1434782608695653, "grad_norm": 3.6733528041970613, "learning_rate": 3.882639918949865e-06, "loss": 0.5596, "step": 263 }, { "epoch": 1.1478260869565218, "grad_norm": 2.9305256648457387, "learning_rate": 3.849382074425069e-06, "loss": 0.423, "step": 264 }, { "epoch": 1.1521739130434783, "grad_norm": 3.9637419933724143, "learning_rate": 3.816177897667767e-06, "loss": 0.8296, "step": 265 }, { "epoch": 1.1565217391304348, "grad_norm": 6.6811186783510506, "learning_rate": 3.7830289374058214e-06, "loss": 0.616, "step": 266 }, { "epoch": 1.1608695652173913, "grad_norm": 6.288600124931218, "learning_rate": 3.749936739791656e-06, "loss": 0.5357, "step": 267 }, { "epoch": 1.1652173913043478, "grad_norm": 3.8918231976142885, "learning_rate": 3.7169028483301333e-06, "loss": 0.4663, "step": 268 }, { "epoch": 1.1695652173913043, "grad_norm": 3.354158738300337, "learning_rate": 3.6839288038065736e-06, "loss": 0.3898, "step": 269 }, { "epoch": 1.1739130434782608, "grad_norm": 3.3417809268425582, "learning_rate": 3.6510161442148783e-06, "loss": 0.4019, "step": 270 }, { "epoch": 1.1782608695652175, "grad_norm": 5.117755201256695, "learning_rate": 3.6181664046858045e-06, "loss": 0.6163, "step": 271 }, { "epoch": 1.182608695652174, "grad_norm": 4.567775279483126, "learning_rate": 3.58538111741535e-06, "loss": 0.6357, "step": 272 }, { "epoch": 1.1869565217391305, "grad_norm": 5.6456639800107045, "learning_rate": 3.5526618115932974e-06, "loss": 0.5377, "step": 273 }, { "epoch": 1.191304347826087, "grad_norm": 3.7964931148403984, "learning_rate": 3.5200100133318836e-06, "loss": 0.7388, "step": 274 }, { "epoch": 1.1956521739130435, "grad_norm": 4.480294572542451, "learning_rate": 3.4874272455946217e-06, "loss": 0.3212, "step": 275 }, { "epoch": 1.2, "grad_norm": 3.467143996076006, "learning_rate": 3.4549150281252635e-06, "loss": 0.5172, "step": 276 }, { "epoch": 1.2043478260869565, "grad_norm": 2.7472548893281776, "learning_rate": 3.4224748773769166e-06, "loss": 0.4703, "step": 277 }, { "epoch": 1.208695652173913, "grad_norm": 2.930570037247241, "learning_rate": 3.39010830644131e-06, "loss": 0.5399, "step": 278 }, { "epoch": 1.2130434782608694, "grad_norm": 5.422324007097856, "learning_rate": 3.357816824978222e-06, "loss": 0.4727, "step": 279 }, { "epoch": 1.2173913043478262, "grad_norm": 4.756301690088914, "learning_rate": 3.3256019391450696e-06, "loss": 0.7769, "step": 280 }, { "epoch": 1.2217391304347827, "grad_norm": 3.6105329586336117, "learning_rate": 3.2934651515266485e-06, "loss": 0.5781, "step": 281 }, { "epoch": 1.2260869565217392, "grad_norm": 6.656045007547206, "learning_rate": 3.261407961065056e-06, "loss": 0.6225, "step": 282 }, { "epoch": 1.2304347826086957, "grad_norm": 4.715320102324048, "learning_rate": 3.2294318629897746e-06, "loss": 0.5312, "step": 283 }, { "epoch": 1.2347826086956522, "grad_norm": 3.707995507157655, "learning_rate": 3.197538348747927e-06, "loss": 0.5066, "step": 284 }, { "epoch": 1.2391304347826086, "grad_norm": 2.7635931646672565, "learning_rate": 3.1657289059347184e-06, "loss": 0.3998, "step": 285 }, { "epoch": 1.2434782608695651, "grad_norm": 5.451849629778433, "learning_rate": 3.1340050182240438e-06, "loss": 0.4784, "step": 286 }, { "epoch": 1.2478260869565219, "grad_norm": 4.782698874791613, "learning_rate": 3.1023681652992925e-06, "loss": 0.6345, "step": 287 }, { "epoch": 1.2521739130434781, "grad_norm": 3.672795410107192, "learning_rate": 3.070819822784323e-06, "loss": 0.4243, "step": 288 }, { "epoch": 1.2565217391304349, "grad_norm": 3.861532750319747, "learning_rate": 3.03936146217465e-06, "loss": 0.4092, "step": 289 }, { "epoch": 1.2608695652173914, "grad_norm": 6.800888319310977, "learning_rate": 3.007994550768793e-06, "loss": 0.4044, "step": 290 }, { "epoch": 1.2652173913043478, "grad_norm": 2.8381684901106143, "learning_rate": 2.976720551599852e-06, "loss": 0.4325, "step": 291 }, { "epoch": 1.2695652173913043, "grad_norm": 4.2780921182664144, "learning_rate": 2.9455409233672594e-06, "loss": 0.8048, "step": 292 }, { "epoch": 1.2739130434782608, "grad_norm": 4.836689064373329, "learning_rate": 2.914457120368748e-06, "loss": 0.7234, "step": 293 }, { "epoch": 1.2782608695652173, "grad_norm": 4.0103776843864125, "learning_rate": 2.883470592432512e-06, "loss": 0.4799, "step": 294 }, { "epoch": 1.2826086956521738, "grad_norm": 2.861102985436097, "learning_rate": 2.8525827848495912e-06, "loss": 0.4406, "step": 295 }, { "epoch": 1.2869565217391306, "grad_norm": 4.175986317455729, "learning_rate": 2.8217951383064546e-06, "loss": 0.5246, "step": 296 }, { "epoch": 1.2913043478260868, "grad_norm": 3.470122787658186, "learning_rate": 2.7911090888178033e-06, "loss": 0.5552, "step": 297 }, { "epoch": 1.2956521739130435, "grad_norm": 3.435883911757588, "learning_rate": 2.760526067659591e-06, "loss": 0.6207, "step": 298 }, { "epoch": 1.3, "grad_norm": 3.882897336083524, "learning_rate": 2.7300475013022666e-06, "loss": 0.5759, "step": 299 }, { "epoch": 1.3043478260869565, "grad_norm": 2.842616852615547, "learning_rate": 2.6996748113442397e-06, "loss": 0.3266, "step": 300 }, { "epoch": 1.308695652173913, "grad_norm": 4.601459226364831, "learning_rate": 2.669409414445574e-06, "loss": 0.5043, "step": 301 }, { "epoch": 1.3130434782608695, "grad_norm": 3.643401762012329, "learning_rate": 2.6392527222619078e-06, "loss": 0.4729, "step": 302 }, { "epoch": 1.317391304347826, "grad_norm": 4.175796804404032, "learning_rate": 2.6092061413786158e-06, "loss": 0.5379, "step": 303 }, { "epoch": 1.3217391304347825, "grad_norm": 4.787195403434198, "learning_rate": 2.5792710732452e-06, "loss": 0.6848, "step": 304 }, { "epoch": 1.3260869565217392, "grad_norm": 3.5409175757216054, "learning_rate": 2.5494489141099155e-06, "loss": 0.3645, "step": 305 }, { "epoch": 1.3304347826086955, "grad_norm": 3.861897686189232, "learning_rate": 2.5197410549546598e-06, "loss": 0.3829, "step": 306 }, { "epoch": 1.3347826086956522, "grad_norm": 5.3825995531189905, "learning_rate": 2.4901488814300855e-06, "loss": 0.5438, "step": 307 }, { "epoch": 1.3391304347826087, "grad_norm": 3.3702469438449247, "learning_rate": 2.4606737737909696e-06, "loss": 0.3286, "step": 308 }, { "epoch": 1.3434782608695652, "grad_norm": 4.690400044449517, "learning_rate": 2.431317106831836e-06, "loss": 0.7414, "step": 309 }, { "epoch": 1.3478260869565217, "grad_norm": 2.9599127269025005, "learning_rate": 2.4020802498228333e-06, "loss": 0.5214, "step": 310 }, { "epoch": 1.3521739130434782, "grad_norm": 4.746471132028925, "learning_rate": 2.3729645664458637e-06, "loss": 0.7109, "step": 311 }, { "epoch": 1.3565217391304347, "grad_norm": 5.25212834865194, "learning_rate": 2.3439714147309845e-06, "loss": 0.4919, "step": 312 }, { "epoch": 1.3608695652173912, "grad_norm": 3.125614769821592, "learning_rate": 2.315102146993061e-06, "loss": 0.3315, "step": 313 }, { "epoch": 1.365217391304348, "grad_norm": 4.108407402987031, "learning_rate": 2.286358109768693e-06, "loss": 0.4416, "step": 314 }, { "epoch": 1.3695652173913042, "grad_norm": 3.4235864126497555, "learning_rate": 2.2577406437534055e-06, "loss": 0.4012, "step": 315 }, { "epoch": 1.373913043478261, "grad_norm": 2.9536303450719505, "learning_rate": 2.229251083739127e-06, "loss": 0.4213, "step": 316 }, { "epoch": 1.3782608695652174, "grad_norm": 4.275562425786293, "learning_rate": 2.2008907585519094e-06, "loss": 0.6007, "step": 317 }, { "epoch": 1.382608695652174, "grad_norm": 4.439744549643693, "learning_rate": 2.172660990989971e-06, "loss": 0.5128, "step": 318 }, { "epoch": 1.3869565217391304, "grad_norm": 4.387149839982964, "learning_rate": 2.144563097761984e-06, "loss": 0.6044, "step": 319 }, { "epoch": 1.391304347826087, "grad_norm": 3.8269926419825393, "learning_rate": 2.1165983894256647e-06, "loss": 0.5243, "step": 320 }, { "epoch": 1.3956521739130434, "grad_norm": 4.263234062205227, "learning_rate": 2.0887681703266453e-06, "loss": 0.6501, "step": 321 }, { "epoch": 1.4, "grad_norm": 3.288715945278797, "learning_rate": 2.061073738537635e-06, "loss": 0.4023, "step": 322 }, { "epoch": 1.4043478260869566, "grad_norm": 4.840903830572076, "learning_rate": 2.0335163857978747e-06, "loss": 0.584, "step": 323 }, { "epoch": 1.4086956521739131, "grad_norm": 3.816755859554486, "learning_rate": 2.0060973974528873e-06, "loss": 0.4691, "step": 324 }, { "epoch": 1.4130434782608696, "grad_norm": 4.094050804066009, "learning_rate": 1.978818052394528e-06, "loss": 0.3754, "step": 325 }, { "epoch": 1.4173913043478261, "grad_norm": 4.305208159329463, "learning_rate": 1.9516796230013275e-06, "loss": 0.3779, "step": 326 }, { "epoch": 1.4217391304347826, "grad_norm": 6.126499754720256, "learning_rate": 1.9246833750791526e-06, "loss": 0.5946, "step": 327 }, { "epoch": 1.4260869565217391, "grad_norm": 4.986094326020495, "learning_rate": 1.8978305678021598e-06, "loss": 0.5896, "step": 328 }, { "epoch": 1.4304347826086956, "grad_norm": 5.166024500275062, "learning_rate": 1.8711224536540678e-06, "loss": 0.4248, "step": 329 }, { "epoch": 1.434782608695652, "grad_norm": 3.8712679668213323, "learning_rate": 1.8445602783697375e-06, "loss": 0.5666, "step": 330 }, { "epoch": 1.4391304347826086, "grad_norm": 4.205692307851411, "learning_rate": 1.8181452808770638e-06, "loss": 0.4982, "step": 331 }, { "epoch": 1.4434782608695653, "grad_norm": 3.8983034740726428, "learning_rate": 1.7918786932391945e-06, "loss": 0.5396, "step": 332 }, { "epoch": 1.4478260869565218, "grad_norm": 3.4651082117419802, "learning_rate": 1.765761740597065e-06, "loss": 0.679, "step": 333 }, { "epoch": 1.4521739130434783, "grad_norm": 3.8564393361166216, "learning_rate": 1.739795641112248e-06, "loss": 0.6588, "step": 334 }, { "epoch": 1.4565217391304348, "grad_norm": 4.272975876505804, "learning_rate": 1.7139816059101372e-06, "loss": 0.8358, "step": 335 }, { "epoch": 1.4608695652173913, "grad_norm": 4.076839893426693, "learning_rate": 1.688320839023463e-06, "loss": 0.5513, "step": 336 }, { "epoch": 1.4652173913043478, "grad_norm": 2.774567334417496, "learning_rate": 1.662814537336122e-06, "loss": 0.5271, "step": 337 }, { "epoch": 1.4695652173913043, "grad_norm": 3.667368206450109, "learning_rate": 1.6374638905273643e-06, "loss": 0.5086, "step": 338 }, { "epoch": 1.4739130434782608, "grad_norm": 2.6774469570055284, "learning_rate": 1.6122700810162967e-06, "loss": 0.4622, "step": 339 }, { "epoch": 1.4782608695652173, "grad_norm": 4.058760037761275, "learning_rate": 1.5872342839067305e-06, "loss": 0.3739, "step": 340 }, { "epoch": 1.482608695652174, "grad_norm": 4.232366716311447, "learning_rate": 1.5623576669323743e-06, "loss": 0.631, "step": 341 }, { "epoch": 1.4869565217391305, "grad_norm": 3.8466635247767553, "learning_rate": 1.5376413904023723e-06, "loss": 0.5104, "step": 342 }, { "epoch": 1.491304347826087, "grad_norm": 4.135942120347136, "learning_rate": 1.5130866071471717e-06, "loss": 0.5094, "step": 343 }, { "epoch": 1.4956521739130435, "grad_norm": 4.304273024625908, "learning_rate": 1.4886944624647647e-06, "loss": 0.3944, "step": 344 }, { "epoch": 1.5, "grad_norm": 4.785895968414715, "learning_rate": 1.4644660940672628e-06, "loss": 0.6328, "step": 345 }, { "epoch": 1.5043478260869565, "grad_norm": 4.304489013184394, "learning_rate": 1.4404026320278318e-06, "loss": 0.4224, "step": 346 }, { "epoch": 1.508695652173913, "grad_norm": 5.048498818452494, "learning_rate": 1.4165051987279832e-06, "loss": 0.6405, "step": 347 }, { "epoch": 1.5130434782608697, "grad_norm": 3.61479089476453, "learning_rate": 1.3927749088052218e-06, "loss": 0.56, "step": 348 }, { "epoch": 1.517391304347826, "grad_norm": 4.236357069281796, "learning_rate": 1.3692128691010592e-06, "loss": 0.4332, "step": 349 }, { "epoch": 1.5217391304347827, "grad_norm": 4.341493660417522, "learning_rate": 1.3458201786093795e-06, "loss": 0.5717, "step": 350 }, { "epoch": 1.526086956521739, "grad_norm": 4.9176115607656845, "learning_rate": 1.3225979284251955e-06, "loss": 0.3907, "step": 351 }, { "epoch": 1.5304347826086957, "grad_norm": 3.801480760249645, "learning_rate": 1.2995472016937405e-06, "loss": 0.5003, "step": 352 }, { "epoch": 1.5347826086956522, "grad_norm": 3.4307384301286956, "learning_rate": 1.2766690735599569e-06, "loss": 0.5317, "step": 353 }, { "epoch": 1.5391304347826087, "grad_norm": 4.016082443019338, "learning_rate": 1.2539646111183452e-06, "loss": 0.43, "step": 354 }, { "epoch": 1.5434782608695652, "grad_norm": 3.594983855812303, "learning_rate": 1.2314348733631958e-06, "loss": 0.5412, "step": 355 }, { "epoch": 1.5478260869565217, "grad_norm": 2.6282395580601547, "learning_rate": 1.209080911139187e-06, "loss": 0.3157, "step": 356 }, { "epoch": 1.5521739130434784, "grad_norm": 4.17706565626502, "learning_rate": 1.1869037670923817e-06, "loss": 0.4854, "step": 357 }, { "epoch": 1.5565217391304347, "grad_norm": 3.527144570708999, "learning_rate": 1.1649044756215872e-06, "loss": 0.6346, "step": 358 }, { "epoch": 1.5608695652173914, "grad_norm": 3.0628314553388827, "learning_rate": 1.1430840628301093e-06, "loss": 0.5448, "step": 359 }, { "epoch": 1.5652173913043477, "grad_norm": 2.848515880653068, "learning_rate": 1.1214435464779006e-06, "loss": 0.568, "step": 360 }, { "epoch": 1.5695652173913044, "grad_norm": 6.516959915724072, "learning_rate": 1.099983935934077e-06, "loss": 0.7196, "step": 361 }, { "epoch": 1.5739130434782609, "grad_norm": 5.057293110141789, "learning_rate": 1.0787062321298441e-06, "loss": 0.6018, "step": 362 }, { "epoch": 1.5782608695652174, "grad_norm": 4.77065142413127, "learning_rate": 1.0576114275118132e-06, "loss": 0.493, "step": 363 }, { "epoch": 1.5826086956521739, "grad_norm": 2.4354456562647635, "learning_rate": 1.0367005059957097e-06, "loss": 0.4647, "step": 364 }, { "epoch": 1.5869565217391304, "grad_norm": 4.274318416743583, "learning_rate": 1.0159744429204776e-06, "loss": 0.4877, "step": 365 }, { "epoch": 1.591304347826087, "grad_norm": 6.539038228109423, "learning_rate": 9.954342050027922e-07, "loss": 0.7812, "step": 366 }, { "epoch": 1.5956521739130434, "grad_norm": 4.146697517540566, "learning_rate": 9.75080750291965e-07, "loss": 0.6274, "step": 367 }, { "epoch": 1.6, "grad_norm": 2.372204077398317, "learning_rate": 9.549150281252633e-07, "loss": 0.4421, "step": 368 }, { "epoch": 1.6043478260869564, "grad_norm": 2.955199194745197, "learning_rate": 9.349379790836243e-07, "loss": 0.5028, "step": 369 }, { "epoch": 1.608695652173913, "grad_norm": 3.6177018572948736, "learning_rate": 9.151505349477901e-07, "loss": 0.5141, "step": 370 }, { "epoch": 1.6130434782608696, "grad_norm": 3.6516021998980017, "learning_rate": 8.955536186548425e-07, "loss": 0.6085, "step": 371 }, { "epoch": 1.617391304347826, "grad_norm": 3.500880225513966, "learning_rate": 8.761481442551573e-07, "loss": 0.5389, "step": 372 }, { "epoch": 1.6217391304347826, "grad_norm": 4.680508116117162, "learning_rate": 8.569350168697705e-07, "loss": 0.6038, "step": 373 }, { "epoch": 1.626086956521739, "grad_norm": 4.03657843570234, "learning_rate": 8.379151326481588e-07, "loss": 0.4799, "step": 374 }, { "epoch": 1.6304347826086958, "grad_norm": 3.651603518512998, "learning_rate": 8.19089378726447e-07, "loss": 0.6335, "step": 375 }, { "epoch": 1.634782608695652, "grad_norm": 3.181550181803468, "learning_rate": 8.004586331860176e-07, "loss": 0.3404, "step": 376 }, { "epoch": 1.6391304347826088, "grad_norm": 4.474431730080835, "learning_rate": 7.820237650125711e-07, "loss": 0.4203, "step": 377 }, { "epoch": 1.643478260869565, "grad_norm": 4.57860916947871, "learning_rate": 7.637856340555822e-07, "loss": 0.7531, "step": 378 }, { "epoch": 1.6478260869565218, "grad_norm": 3.5893337731483412, "learning_rate": 7.457450909881969e-07, "loss": 0.6053, "step": 379 }, { "epoch": 1.6521739130434783, "grad_norm": 3.281855093440728, "learning_rate": 7.279029772675572e-07, "loss": 0.46, "step": 380 }, { "epoch": 1.6565217391304348, "grad_norm": 3.5331855572127333, "learning_rate": 7.102601250955526e-07, "loss": 0.3617, "step": 381 }, { "epoch": 1.6608695652173913, "grad_norm": 3.32305374567492, "learning_rate": 6.928173573800007e-07, "loss": 0.739, "step": 382 }, { "epoch": 1.6652173913043478, "grad_norm": 2.998766048205789, "learning_rate": 6.755754876962711e-07, "loss": 0.463, "step": 383 }, { "epoch": 1.6695652173913045, "grad_norm": 3.144044172604275, "learning_rate": 6.585353202493322e-07, "loss": 0.5875, "step": 384 }, { "epoch": 1.6739130434782608, "grad_norm": 3.096077345682249, "learning_rate": 6.416976498362432e-07, "loss": 0.5305, "step": 385 }, { "epoch": 1.6782608695652175, "grad_norm": 2.738317055507431, "learning_rate": 6.250632618090868e-07, "loss": 0.4703, "step": 386 }, { "epoch": 1.6826086956521737, "grad_norm": 3.5071473316541826, "learning_rate": 6.086329320383311e-07, "loss": 0.4213, "step": 387 }, { "epoch": 1.6869565217391305, "grad_norm": 3.8877032575124137, "learning_rate": 5.924074268766422e-07, "loss": 0.4856, "step": 388 }, { "epoch": 1.691304347826087, "grad_norm": 4.828638835996497, "learning_rate": 5.763875031231464e-07, "loss": 0.5905, "step": 389 }, { "epoch": 1.6956521739130435, "grad_norm": 5.896054080665568, "learning_rate": 5.60573907988124e-07, "loss": 0.4024, "step": 390 }, { "epoch": 1.7, "grad_norm": 3.896327908000081, "learning_rate": 5.449673790581611e-07, "loss": 0.5591, "step": 391 }, { "epoch": 1.7043478260869565, "grad_norm": 3.3185586531295677, "learning_rate": 5.295686442617442e-07, "loss": 0.4434, "step": 392 }, { "epoch": 1.7086956521739132, "grad_norm": 3.263140832538685, "learning_rate": 5.143784218353104e-07, "loss": 0.7183, "step": 393 }, { "epoch": 1.7130434782608694, "grad_norm": 3.00513867387302, "learning_rate": 4.993974202897456e-07, "loss": 0.5925, "step": 394 }, { "epoch": 1.7173913043478262, "grad_norm": 4.514101571491169, "learning_rate": 4.846263383773364e-07, "loss": 0.4831, "step": 395 }, { "epoch": 1.7217391304347827, "grad_norm": 4.069820180851384, "learning_rate": 4.7006586505918273e-07, "loss": 0.4127, "step": 396 }, { "epoch": 1.7260869565217392, "grad_norm": 3.4177761306996666, "learning_rate": 4.557166794730572e-07, "loss": 0.5253, "step": 397 }, { "epoch": 1.7304347826086957, "grad_norm": 3.1509456616908365, "learning_rate": 4.4157945090173294e-07, "loss": 0.5074, "step": 398 }, { "epoch": 1.7347826086956522, "grad_norm": 3.696243545583048, "learning_rate": 4.276548387417656e-07, "loss": 0.5885, "step": 399 }, { "epoch": 1.7391304347826086, "grad_norm": 3.7886982318851805, "learning_rate": 4.139434924727359e-07, "loss": 0.2754, "step": 400 }, { "epoch": 1.7391304347826086, "eval_loss": 0.8716984987258911, "eval_runtime": 1.212, "eval_samples_per_second": 15.677, "eval_steps_per_second": 4.125, "step": 400 }, { "epoch": 1.7434782608695651, "grad_norm": 4.101705505067162, "learning_rate": 4.004460516269554e-07, "loss": 0.3822, "step": 401 }, { "epoch": 1.7478260869565219, "grad_norm": 4.276148559341361, "learning_rate": 3.8716314575964197e-07, "loss": 0.56, "step": 402 }, { "epoch": 1.7521739130434781, "grad_norm": 3.4463007331708395, "learning_rate": 3.740953944195497e-07, "loss": 0.4291, "step": 403 }, { "epoch": 1.7565217391304349, "grad_norm": 4.323773987111401, "learning_rate": 3.612434071200771e-07, "loss": 0.5948, "step": 404 }, { "epoch": 1.7608695652173914, "grad_norm": 3.8045725056072612, "learning_rate": 3.486077833108342e-07, "loss": 0.5438, "step": 405 }, { "epoch": 1.7652173913043478, "grad_norm": 3.751096313013388, "learning_rate": 3.361891123496824e-07, "loss": 0.4442, "step": 406 }, { "epoch": 1.7695652173913043, "grad_norm": 5.6873886034585865, "learning_rate": 3.2398797347524656e-07, "loss": 0.5722, "step": 407 }, { "epoch": 1.7739130434782608, "grad_norm": 3.4440987994083754, "learning_rate": 3.1200493577989875e-07, "loss": 0.5971, "step": 408 }, { "epoch": 1.7782608695652173, "grad_norm": 4.500827009009169, "learning_rate": 3.002405581832135e-07, "loss": 0.8116, "step": 409 }, { "epoch": 1.7826086956521738, "grad_norm": 4.466563287701003, "learning_rate": 2.88695389405898e-07, "loss": 0.6865, "step": 410 }, { "epoch": 1.7869565217391306, "grad_norm": 3.603233112278287, "learning_rate": 2.7736996794419767e-07, "loss": 0.5216, "step": 411 }, { "epoch": 1.7913043478260868, "grad_norm": 3.875200007299088, "learning_rate": 2.662648220447811e-07, "loss": 0.6106, "step": 412 }, { "epoch": 1.7956521739130435, "grad_norm": 3.534753252029232, "learning_rate": 2.5538046968010097e-07, "loss": 0.636, "step": 413 }, { "epoch": 1.8, "grad_norm": 4.461839191222703, "learning_rate": 2.447174185242324e-07, "loss": 0.6309, "step": 414 }, { "epoch": 1.8043478260869565, "grad_norm": 4.482635011621374, "learning_rate": 2.3427616592919587e-07, "loss": 0.2978, "step": 415 }, { "epoch": 1.808695652173913, "grad_norm": 5.623046102327689, "learning_rate": 2.240571989017598e-07, "loss": 0.3982, "step": 416 }, { "epoch": 1.8130434782608695, "grad_norm": 4.340291840632997, "learning_rate": 2.1406099408072256e-07, "loss": 0.4452, "step": 417 }, { "epoch": 1.8173913043478263, "grad_norm": 4.373122353194206, "learning_rate": 2.0428801771468388e-07, "loss": 0.5546, "step": 418 }, { "epoch": 1.8217391304347825, "grad_norm": 4.978141436723412, "learning_rate": 1.947387256402966e-07, "loss": 0.6283, "step": 419 }, { "epoch": 1.8260869565217392, "grad_norm": 4.9421439442580155, "learning_rate": 1.8541356326100436e-07, "loss": 0.5615, "step": 420 }, { "epoch": 1.8304347826086955, "grad_norm": 4.662787149917992, "learning_rate": 1.7631296552626687e-07, "loss": 0.6423, "step": 421 }, { "epoch": 1.8347826086956522, "grad_norm": 4.8067902104365645, "learning_rate": 1.6743735691127639e-07, "loss": 0.413, "step": 422 }, { "epoch": 1.8391304347826087, "grad_norm": 4.40926589583089, "learning_rate": 1.5878715139715395e-07, "loss": 0.5572, "step": 423 }, { "epoch": 1.8434782608695652, "grad_norm": 2.8704530992170514, "learning_rate": 1.5036275245164377e-07, "loss": 0.4556, "step": 424 }, { "epoch": 1.8478260869565217, "grad_norm": 4.55581396084849, "learning_rate": 1.4216455301029274e-07, "loss": 0.3541, "step": 425 }, { "epoch": 1.8521739130434782, "grad_norm": 14.46427054288811, "learning_rate": 1.341929354581234e-07, "loss": 0.4864, "step": 426 }, { "epoch": 1.856521739130435, "grad_norm": 3.2179410576367027, "learning_rate": 1.2644827161179763e-07, "loss": 0.4733, "step": 427 }, { "epoch": 1.8608695652173912, "grad_norm": 3.8030605461121274, "learning_rate": 1.1893092270227724e-07, "loss": 0.4404, "step": 428 }, { "epoch": 1.865217391304348, "grad_norm": 3.8617739599125946, "learning_rate": 1.1164123935797189e-07, "loss": 0.5796, "step": 429 }, { "epoch": 1.8695652173913042, "grad_norm": 3.7521959371096867, "learning_rate": 1.0457956158838545e-07, "loss": 0.5444, "step": 430 }, { "epoch": 1.873913043478261, "grad_norm": 4.360768628277166, "learning_rate": 9.774621876825985e-08, "loss": 0.7298, "step": 431 }, { "epoch": 1.8782608695652174, "grad_norm": 3.910205294075997, "learning_rate": 9.114152962220734e-08, "loss": 0.542, "step": 432 }, { "epoch": 1.882608695652174, "grad_norm": 4.005132857310367, "learning_rate": 8.476580220984854e-08, "loss": 0.6349, "step": 433 }, { "epoch": 1.8869565217391304, "grad_norm": 4.596239343877202, "learning_rate": 7.861933391144272e-08, "loss": 0.4515, "step": 434 }, { "epoch": 1.891304347826087, "grad_norm": 3.9563035871216643, "learning_rate": 7.270241141401568e-08, "loss": 0.6042, "step": 435 }, { "epoch": 1.8956521739130436, "grad_norm": 2.99941894532922, "learning_rate": 6.701531069799039e-08, "loss": 0.3209, "step": 436 }, { "epoch": 1.9, "grad_norm": 4.126028774169588, "learning_rate": 6.15582970243117e-08, "loss": 0.6095, "step": 437 }, { "epoch": 1.9043478260869566, "grad_norm": 4.219172610306852, "learning_rate": 5.633162492207633e-08, "loss": 0.5979, "step": 438 }, { "epoch": 1.908695652173913, "grad_norm": 4.866427440568141, "learning_rate": 5.133553817665948e-08, "loss": 0.662, "step": 439 }, { "epoch": 1.9130434782608696, "grad_norm": 4.6700212829119385, "learning_rate": 4.657026981834623e-08, "loss": 0.5815, "step": 440 }, { "epoch": 1.9173913043478261, "grad_norm": 3.0191187442564003, "learning_rate": 4.203604211145851e-08, "loss": 0.3505, "step": 441 }, { "epoch": 1.9217391304347826, "grad_norm": 3.3555580221359538, "learning_rate": 3.773306654399234e-08, "loss": 0.4539, "step": 442 }, { "epoch": 1.9260869565217391, "grad_norm": 3.932100191195551, "learning_rate": 3.366154381775011e-08, "loss": 0.5395, "step": 443 }, { "epoch": 1.9304347826086956, "grad_norm": 4.543654718724428, "learning_rate": 2.9821663838981994e-08, "loss": 0.5999, "step": 444 }, { "epoch": 1.9347826086956523, "grad_norm": 3.2027549612689263, "learning_rate": 2.6213605709525803e-08, "loss": 0.3673, "step": 445 }, { "epoch": 1.9391304347826086, "grad_norm": 3.1103705554436694, "learning_rate": 2.283753771845587e-08, "loss": 0.3699, "step": 446 }, { "epoch": 1.9434782608695653, "grad_norm": 3.596434865326805, "learning_rate": 1.969361733423103e-08, "loss": 0.6307, "step": 447 }, { "epoch": 1.9478260869565216, "grad_norm": 4.349343114323385, "learning_rate": 1.6781991197352133e-08, "loss": 0.5428, "step": 448 }, { "epoch": 1.9521739130434783, "grad_norm": 4.855414536760434, "learning_rate": 1.4102795113520307e-08, "loss": 0.4092, "step": 449 }, { "epoch": 1.9565217391304348, "grad_norm": 3.0119883785611585, "learning_rate": 1.1656154047303691e-08, "loss": 0.4938, "step": 450 }, { "epoch": 1.9608695652173913, "grad_norm": 4.642330127831433, "learning_rate": 9.442182116309872e-09, "loss": 0.7239, "step": 451 }, { "epoch": 1.9652173913043478, "grad_norm": 3.663654111517379, "learning_rate": 7.460982585860144e-09, "loss": 0.3265, "step": 452 }, { "epoch": 1.9695652173913043, "grad_norm": 2.870263572172439, "learning_rate": 5.712647864176135e-09, "loss": 0.5066, "step": 453 }, { "epoch": 1.973913043478261, "grad_norm": 3.8955974016594053, "learning_rate": 4.197259498067707e-09, "loss": 0.5468, "step": 454 }, { "epoch": 1.9782608695652173, "grad_norm": 3.973489744191462, "learning_rate": 2.9148881691298812e-09, "loss": 0.4732, "step": 455 }, { "epoch": 1.982608695652174, "grad_norm": 4.726977182661161, "learning_rate": 1.865593690446588e-09, "loss": 0.5222, "step": 456 }, { "epoch": 1.9869565217391303, "grad_norm": 3.1878654907075217, "learning_rate": 1.0494250038006747e-09, "loss": 0.4729, "step": 457 }, { "epoch": 1.991304347826087, "grad_norm": 5.2096767935404324, "learning_rate": 4.664201773896259e-10, "loss": 0.5507, "step": 458 }, { "epoch": 1.9956521739130435, "grad_norm": 4.661539977777431, "learning_rate": 1.1660640405308787e-10, "loss": 0.7135, "step": 459 }, { "epoch": 2.0, "grad_norm": 4.192820799122638, "learning_rate": 0.0, "loss": 0.4461, "step": 460 }, { "epoch": 2.0, "step": 460, "total_flos": 2556217393152.0, "train_loss": 0.679824001374452, "train_runtime": 538.1176, "train_samples_per_second": 6.839, "train_steps_per_second": 0.855 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2556217393152.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }