{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 11696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.3728413581848145, "learning_rate": 4.998290013679891e-05, "loss": 10.0117, "step": 4 }, { "epoch": 0.0, "grad_norm": 2.599367380142212, "learning_rate": 4.996580027359781e-05, "loss": 9.1745, "step": 8 }, { "epoch": 0.0, "grad_norm": 2.4951720237731934, "learning_rate": 4.994870041039672e-05, "loss": 8.7624, "step": 12 }, { "epoch": 0.0, "grad_norm": 2.4932913780212402, "learning_rate": 4.9931600547195625e-05, "loss": 8.5709, "step": 16 }, { "epoch": 0.0, "grad_norm": 2.3983781337738037, "learning_rate": 4.991450068399453e-05, "loss": 8.4411, "step": 20 }, { "epoch": 0.0, "grad_norm": 2.2340972423553467, "learning_rate": 4.989740082079344e-05, "loss": 8.1957, "step": 24 }, { "epoch": 0.0, "grad_norm": 2.1964590549468994, "learning_rate": 4.988030095759234e-05, "loss": 8.0598, "step": 28 }, { "epoch": 0.0, "grad_norm": 1.9686617851257324, "learning_rate": 4.986320109439125e-05, "loss": 7.8679, "step": 32 }, { "epoch": 0.0, "grad_norm": 1.8235087394714355, "learning_rate": 4.984610123119015e-05, "loss": 7.7621, "step": 36 }, { "epoch": 0.0, "grad_norm": 1.7598885297775269, "learning_rate": 4.9829001367989056e-05, "loss": 7.5417, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.94789719581604, "learning_rate": 4.981190150478797e-05, "loss": 7.3331, "step": 44 }, { "epoch": 0.0, "grad_norm": 1.9366331100463867, "learning_rate": 4.979480164158687e-05, "loss": 7.1039, "step": 48 }, { "epoch": 0.0, "grad_norm": 2.1228675842285156, "learning_rate": 4.977770177838578e-05, "loss": 6.9965, "step": 52 }, { "epoch": 0.0, "grad_norm": 1.9855167865753174, "learning_rate": 4.976060191518468e-05, "loss": 6.8865, "step": 56 }, { "epoch": 0.01, "grad_norm": 1.8035985231399536, "learning_rate": 4.9743502051983585e-05, "loss": 6.6322, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.8589977025985718, "learning_rate": 4.9726402188782486e-05, "loss": 6.5616, "step": 64 }, { "epoch": 0.01, "grad_norm": 1.745139241218567, "learning_rate": 4.97093023255814e-05, "loss": 6.3479, "step": 68 }, { "epoch": 0.01, "grad_norm": 1.266196608543396, "learning_rate": 4.969220246238031e-05, "loss": 6.2343, "step": 72 }, { "epoch": 0.01, "grad_norm": 2.005223035812378, "learning_rate": 4.967510259917921e-05, "loss": 6.1866, "step": 76 }, { "epoch": 0.01, "grad_norm": 1.5017377138137817, "learning_rate": 4.9658002735978115e-05, "loss": 5.972, "step": 80 }, { "epoch": 0.01, "grad_norm": 1.6136974096298218, "learning_rate": 4.9640902872777016e-05, "loss": 6.1335, "step": 84 }, { "epoch": 0.01, "grad_norm": 1.3865970373153687, "learning_rate": 4.962380300957592e-05, "loss": 5.944, "step": 88 }, { "epoch": 0.01, "grad_norm": 1.283933162689209, "learning_rate": 4.960670314637483e-05, "loss": 5.8713, "step": 92 }, { "epoch": 0.01, "grad_norm": 1.1588549613952637, "learning_rate": 4.958960328317374e-05, "loss": 5.7731, "step": 96 }, { "epoch": 0.01, "grad_norm": 1.3081687688827515, "learning_rate": 4.9572503419972645e-05, "loss": 5.7429, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.6212745904922485, "learning_rate": 4.9555403556771546e-05, "loss": 5.481, "step": 104 }, { "epoch": 0.01, "grad_norm": 1.709843397140503, "learning_rate": 4.953830369357045e-05, "loss": 5.4855, "step": 108 }, { "epoch": 0.01, "grad_norm": 1.2814812660217285, "learning_rate": 4.952120383036936e-05, "loss": 5.4367, "step": 112 }, { "epoch": 0.01, "grad_norm": 1.5539968013763428, "learning_rate": 4.950410396716826e-05, "loss": 5.2652, "step": 116 }, { "epoch": 0.01, "grad_norm": 1.500375509262085, "learning_rate": 4.948700410396717e-05, "loss": 5.5137, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.7938899993896484, "learning_rate": 4.9469904240766076e-05, "loss": 5.4663, "step": 124 }, { "epoch": 0.01, "grad_norm": 1.6169573068618774, "learning_rate": 4.945280437756498e-05, "loss": 5.3137, "step": 128 }, { "epoch": 0.01, "grad_norm": 1.1904114484786987, "learning_rate": 4.943570451436389e-05, "loss": 5.1871, "step": 132 }, { "epoch": 0.01, "grad_norm": 2.5564723014831543, "learning_rate": 4.941860465116279e-05, "loss": 5.4058, "step": 136 }, { "epoch": 0.01, "grad_norm": 1.6187268495559692, "learning_rate": 4.94015047879617e-05, "loss": 5.1919, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.2222367525100708, "learning_rate": 4.93844049247606e-05, "loss": 5.1876, "step": 144 }, { "epoch": 0.01, "grad_norm": 1.5898586511611938, "learning_rate": 4.936730506155951e-05, "loss": 5.1217, "step": 148 }, { "epoch": 0.01, "grad_norm": 1.446902871131897, "learning_rate": 4.935020519835842e-05, "loss": 5.0582, "step": 152 }, { "epoch": 0.01, "grad_norm": 1.862309217453003, "learning_rate": 4.933310533515732e-05, "loss": 4.9028, "step": 156 }, { "epoch": 0.01, "grad_norm": 1.5455704927444458, "learning_rate": 4.931600547195623e-05, "loss": 5.0352, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.772558569908142, "learning_rate": 4.929890560875513e-05, "loss": 4.9707, "step": 164 }, { "epoch": 0.01, "grad_norm": 1.8154480457305908, "learning_rate": 4.9281805745554036e-05, "loss": 5.0588, "step": 168 }, { "epoch": 0.01, "grad_norm": 1.4536504745483398, "learning_rate": 4.9264705882352944e-05, "loss": 4.9997, "step": 172 }, { "epoch": 0.02, "grad_norm": 1.6166527271270752, "learning_rate": 4.924760601915185e-05, "loss": 4.9472, "step": 176 }, { "epoch": 0.02, "grad_norm": 2.1260979175567627, "learning_rate": 4.923050615595076e-05, "loss": 4.944, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.4778213500976562, "learning_rate": 4.921340629274966e-05, "loss": 4.8657, "step": 184 }, { "epoch": 0.02, "grad_norm": 1.255488395690918, "learning_rate": 4.9196306429548566e-05, "loss": 4.7728, "step": 188 }, { "epoch": 0.02, "grad_norm": 1.5698314905166626, "learning_rate": 4.917920656634747e-05, "loss": 4.9433, "step": 192 }, { "epoch": 0.02, "grad_norm": 1.342402696609497, "learning_rate": 4.9162106703146374e-05, "loss": 4.6336, "step": 196 }, { "epoch": 0.02, "grad_norm": 1.4427978992462158, "learning_rate": 4.914500683994528e-05, "loss": 4.7112, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.6528425216674805, "learning_rate": 4.912790697674419e-05, "loss": 4.876, "step": 204 }, { "epoch": 0.02, "grad_norm": 1.706386923789978, "learning_rate": 4.9110807113543096e-05, "loss": 4.5971, "step": 208 }, { "epoch": 0.02, "grad_norm": 1.7773737907409668, "learning_rate": 4.9093707250342e-05, "loss": 4.7944, "step": 212 }, { "epoch": 0.02, "grad_norm": 1.7884222269058228, "learning_rate": 4.9076607387140904e-05, "loss": 4.5912, "step": 216 }, { "epoch": 0.02, "grad_norm": 1.5389587879180908, "learning_rate": 4.905950752393981e-05, "loss": 4.6656, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.7006316184997559, "learning_rate": 4.904240766073871e-05, "loss": 4.6589, "step": 224 }, { "epoch": 0.02, "grad_norm": 1.6314700841903687, "learning_rate": 4.9025307797537626e-05, "loss": 4.5036, "step": 228 }, { "epoch": 0.02, "grad_norm": 1.7512930631637573, "learning_rate": 4.900820793433653e-05, "loss": 4.5105, "step": 232 }, { "epoch": 0.02, "grad_norm": 1.9109033346176147, "learning_rate": 4.8991108071135434e-05, "loss": 4.4449, "step": 236 }, { "epoch": 0.02, "grad_norm": 1.547317385673523, "learning_rate": 4.897400820793434e-05, "loss": 4.4906, "step": 240 }, { "epoch": 0.02, "grad_norm": 1.5143060684204102, "learning_rate": 4.895690834473324e-05, "loss": 4.3545, "step": 244 }, { "epoch": 0.02, "grad_norm": 1.6648136377334595, "learning_rate": 4.893980848153215e-05, "loss": 4.4093, "step": 248 }, { "epoch": 0.02, "grad_norm": 1.5027951002120972, "learning_rate": 4.892270861833106e-05, "loss": 4.3444, "step": 252 }, { "epoch": 0.02, "grad_norm": 1.9429397583007812, "learning_rate": 4.8905608755129964e-05, "loss": 4.4937, "step": 256 }, { "epoch": 0.02, "grad_norm": 1.4009640216827393, "learning_rate": 4.888850889192887e-05, "loss": 4.3625, "step": 260 }, { "epoch": 0.02, "grad_norm": 1.7445279359817505, "learning_rate": 4.887140902872777e-05, "loss": 4.2759, "step": 264 }, { "epoch": 0.02, "grad_norm": 1.4381403923034668, "learning_rate": 4.885430916552668e-05, "loss": 4.574, "step": 268 }, { "epoch": 0.02, "grad_norm": 1.5582979917526245, "learning_rate": 4.883720930232558e-05, "loss": 4.3831, "step": 272 }, { "epoch": 0.02, "grad_norm": 1.461166501045227, "learning_rate": 4.882010943912449e-05, "loss": 4.5049, "step": 276 }, { "epoch": 0.02, "grad_norm": 1.756549596786499, "learning_rate": 4.8803009575923394e-05, "loss": 4.1334, "step": 280 }, { "epoch": 0.02, "grad_norm": 1.9006497859954834, "learning_rate": 4.87859097127223e-05, "loss": 4.3406, "step": 284 }, { "epoch": 0.02, "grad_norm": 1.4551759958267212, "learning_rate": 4.876880984952121e-05, "loss": 4.4688, "step": 288 }, { "epoch": 0.02, "grad_norm": 1.5335747003555298, "learning_rate": 4.875170998632011e-05, "loss": 4.2096, "step": 292 }, { "epoch": 0.03, "grad_norm": 1.4802557229995728, "learning_rate": 4.873461012311902e-05, "loss": 4.0918, "step": 296 }, { "epoch": 0.03, "grad_norm": 1.9993939399719238, "learning_rate": 4.8717510259917924e-05, "loss": 4.3636, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.706895351409912, "learning_rate": 4.8700410396716825e-05, "loss": 4.0891, "step": 304 }, { "epoch": 0.03, "grad_norm": 1.4630484580993652, "learning_rate": 4.868331053351574e-05, "loss": 4.3254, "step": 308 }, { "epoch": 0.03, "grad_norm": 1.4353913068771362, "learning_rate": 4.866621067031464e-05, "loss": 4.1935, "step": 312 }, { "epoch": 0.03, "grad_norm": 1.5042275190353394, "learning_rate": 4.864911080711355e-05, "loss": 4.0913, "step": 316 }, { "epoch": 0.03, "grad_norm": 1.792472004890442, "learning_rate": 4.863201094391245e-05, "loss": 4.2159, "step": 320 }, { "epoch": 0.03, "grad_norm": 1.4059948921203613, "learning_rate": 4.8614911080711355e-05, "loss": 4.2934, "step": 324 }, { "epoch": 0.03, "grad_norm": 1.8408161401748657, "learning_rate": 4.859781121751026e-05, "loss": 4.1482, "step": 328 }, { "epoch": 0.03, "grad_norm": 1.716046690940857, "learning_rate": 4.858071135430917e-05, "loss": 4.0273, "step": 332 }, { "epoch": 0.03, "grad_norm": 1.8415429592132568, "learning_rate": 4.856361149110808e-05, "loss": 4.0837, "step": 336 }, { "epoch": 0.03, "grad_norm": 1.4880731105804443, "learning_rate": 4.854651162790698e-05, "loss": 4.1957, "step": 340 }, { "epoch": 0.03, "grad_norm": 1.8150016069412231, "learning_rate": 4.8529411764705885e-05, "loss": 4.225, "step": 344 }, { "epoch": 0.03, "grad_norm": 1.7828190326690674, "learning_rate": 4.851231190150479e-05, "loss": 4.1053, "step": 348 }, { "epoch": 0.03, "grad_norm": 1.8549565076828003, "learning_rate": 4.849521203830369e-05, "loss": 3.8068, "step": 352 }, { "epoch": 0.03, "grad_norm": 1.4295508861541748, "learning_rate": 4.84781121751026e-05, "loss": 3.9358, "step": 356 }, { "epoch": 0.03, "grad_norm": 1.4277971982955933, "learning_rate": 4.846101231190151e-05, "loss": 4.1512, "step": 360 }, { "epoch": 0.03, "grad_norm": 1.4937466382980347, "learning_rate": 4.8443912448700415e-05, "loss": 4.0162, "step": 364 }, { "epoch": 0.03, "grad_norm": 1.3076913356781006, "learning_rate": 4.842681258549932e-05, "loss": 4.0336, "step": 368 }, { "epoch": 0.03, "grad_norm": 1.673946738243103, "learning_rate": 4.840971272229822e-05, "loss": 4.0915, "step": 372 }, { "epoch": 0.03, "grad_norm": 1.5978224277496338, "learning_rate": 4.839261285909713e-05, "loss": 4.137, "step": 376 }, { "epoch": 0.03, "grad_norm": 2.214573383331299, "learning_rate": 4.837551299589603e-05, "loss": 3.8231, "step": 380 }, { "epoch": 0.03, "grad_norm": 1.4263286590576172, "learning_rate": 4.835841313269494e-05, "loss": 3.9326, "step": 384 }, { "epoch": 0.03, "grad_norm": 1.694931149482727, "learning_rate": 4.8341313269493845e-05, "loss": 4.0673, "step": 388 }, { "epoch": 0.03, "grad_norm": 1.558498740196228, "learning_rate": 4.832421340629275e-05, "loss": 3.9454, "step": 392 }, { "epoch": 0.03, "grad_norm": 1.5231866836547852, "learning_rate": 4.830711354309166e-05, "loss": 4.0214, "step": 396 }, { "epoch": 0.03, "grad_norm": 1.6748324632644653, "learning_rate": 4.829001367989056e-05, "loss": 3.9983, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.3733506202697754, "learning_rate": 4.827291381668947e-05, "loss": 3.8526, "step": 404 }, { "epoch": 0.03, "grad_norm": 1.6695655584335327, "learning_rate": 4.8255813953488375e-05, "loss": 3.8948, "step": 408 }, { "epoch": 0.04, "grad_norm": 1.3915668725967407, "learning_rate": 4.8238714090287276e-05, "loss": 3.8511, "step": 412 }, { "epoch": 0.04, "grad_norm": 1.5736514329910278, "learning_rate": 4.822161422708619e-05, "loss": 3.9446, "step": 416 }, { "epoch": 0.04, "grad_norm": 1.562687635421753, "learning_rate": 4.820451436388509e-05, "loss": 3.6682, "step": 420 }, { "epoch": 0.04, "grad_norm": 1.4215232133865356, "learning_rate": 4.8187414500684e-05, "loss": 3.6618, "step": 424 }, { "epoch": 0.04, "grad_norm": 1.8750267028808594, "learning_rate": 4.8170314637482905e-05, "loss": 3.8246, "step": 428 }, { "epoch": 0.04, "grad_norm": 1.5843207836151123, "learning_rate": 4.8153214774281806e-05, "loss": 3.807, "step": 432 }, { "epoch": 0.04, "grad_norm": 1.6384755373001099, "learning_rate": 4.813611491108071e-05, "loss": 3.8329, "step": 436 }, { "epoch": 0.04, "grad_norm": 1.646612286567688, "learning_rate": 4.811901504787962e-05, "loss": 3.9736, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.592463493347168, "learning_rate": 4.810191518467853e-05, "loss": 3.6107, "step": 444 }, { "epoch": 0.04, "grad_norm": 1.6230803728103638, "learning_rate": 4.808481532147743e-05, "loss": 3.8141, "step": 448 }, { "epoch": 0.04, "grad_norm": 1.7098625898361206, "learning_rate": 4.8067715458276336e-05, "loss": 3.8751, "step": 452 }, { "epoch": 0.04, "grad_norm": 1.441146731376648, "learning_rate": 4.805061559507524e-05, "loss": 3.8461, "step": 456 }, { "epoch": 0.04, "grad_norm": 1.4387036561965942, "learning_rate": 4.8033515731874144e-05, "loss": 3.8432, "step": 460 }, { "epoch": 0.04, "grad_norm": 1.6620376110076904, "learning_rate": 4.801641586867305e-05, "loss": 3.7844, "step": 464 }, { "epoch": 0.04, "grad_norm": 1.5403114557266235, "learning_rate": 4.799931600547196e-05, "loss": 3.6556, "step": 468 }, { "epoch": 0.04, "grad_norm": 1.5642478466033936, "learning_rate": 4.7982216142270866e-05, "loss": 3.7526, "step": 472 }, { "epoch": 0.04, "grad_norm": 1.5027506351470947, "learning_rate": 4.796511627906977e-05, "loss": 3.7862, "step": 476 }, { "epoch": 0.04, "grad_norm": 1.5570485591888428, "learning_rate": 4.7948016415868674e-05, "loss": 3.7656, "step": 480 }, { "epoch": 0.04, "grad_norm": 2.2708029747009277, "learning_rate": 4.793091655266758e-05, "loss": 3.8742, "step": 484 }, { "epoch": 0.04, "grad_norm": 1.5446516275405884, "learning_rate": 4.791381668946648e-05, "loss": 3.6401, "step": 488 }, { "epoch": 0.04, "grad_norm": 1.5480107069015503, "learning_rate": 4.789671682626539e-05, "loss": 3.7441, "step": 492 }, { "epoch": 0.04, "grad_norm": 1.5571659803390503, "learning_rate": 4.78796169630643e-05, "loss": 3.6849, "step": 496 }, { "epoch": 0.04, "grad_norm": 1.9021155834197998, "learning_rate": 4.7862517099863204e-05, "loss": 3.7989, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.5292832851409912, "learning_rate": 4.784541723666211e-05, "loss": 3.8129, "step": 504 }, { "epoch": 0.04, "grad_norm": 1.5750986337661743, "learning_rate": 4.782831737346101e-05, "loss": 3.648, "step": 508 }, { "epoch": 0.04, "grad_norm": 1.6995611190795898, "learning_rate": 4.781121751025992e-05, "loss": 3.7686, "step": 512 }, { "epoch": 0.04, "grad_norm": 1.7573764324188232, "learning_rate": 4.7794117647058826e-05, "loss": 3.6636, "step": 516 }, { "epoch": 0.04, "grad_norm": 1.5131741762161255, "learning_rate": 4.7777017783857733e-05, "loss": 3.5793, "step": 520 }, { "epoch": 0.04, "grad_norm": 1.6838548183441162, "learning_rate": 4.775991792065664e-05, "loss": 3.6336, "step": 524 }, { "epoch": 0.05, "grad_norm": 1.9402954578399658, "learning_rate": 4.774281805745554e-05, "loss": 3.6733, "step": 528 }, { "epoch": 0.05, "grad_norm": 1.7752444744110107, "learning_rate": 4.772571819425445e-05, "loss": 3.8021, "step": 532 }, { "epoch": 0.05, "grad_norm": 1.4470579624176025, "learning_rate": 4.7708618331053356e-05, "loss": 3.6526, "step": 536 }, { "epoch": 0.05, "grad_norm": 1.5560115575790405, "learning_rate": 4.769151846785226e-05, "loss": 3.7236, "step": 540 }, { "epoch": 0.05, "grad_norm": 1.6849740743637085, "learning_rate": 4.7674418604651164e-05, "loss": 3.7586, "step": 544 }, { "epoch": 0.05, "grad_norm": 1.4917711019515991, "learning_rate": 4.765731874145007e-05, "loss": 3.7791, "step": 548 }, { "epoch": 0.05, "grad_norm": 1.791801929473877, "learning_rate": 4.764021887824898e-05, "loss": 3.5659, "step": 552 }, { "epoch": 0.05, "grad_norm": 1.531817078590393, "learning_rate": 4.7623119015047886e-05, "loss": 3.5811, "step": 556 }, { "epoch": 0.05, "grad_norm": 1.8725793361663818, "learning_rate": 4.7606019151846787e-05, "loss": 3.7337, "step": 560 }, { "epoch": 0.05, "grad_norm": 1.5851105451583862, "learning_rate": 4.7588919288645694e-05, "loss": 3.4899, "step": 564 }, { "epoch": 0.05, "grad_norm": 1.4937329292297363, "learning_rate": 4.7571819425444594e-05, "loss": 3.5653, "step": 568 }, { "epoch": 0.05, "grad_norm": 1.8102850914001465, "learning_rate": 4.75547195622435e-05, "loss": 3.517, "step": 572 }, { "epoch": 0.05, "grad_norm": 1.4572982788085938, "learning_rate": 4.753761969904241e-05, "loss": 3.723, "step": 576 }, { "epoch": 0.05, "grad_norm": 1.5815645456314087, "learning_rate": 4.7520519835841317e-05, "loss": 3.6615, "step": 580 }, { "epoch": 0.05, "grad_norm": 1.6406457424163818, "learning_rate": 4.7503419972640224e-05, "loss": 3.6796, "step": 584 }, { "epoch": 0.05, "grad_norm": 1.8512486219406128, "learning_rate": 4.7486320109439124e-05, "loss": 3.5586, "step": 588 }, { "epoch": 0.05, "grad_norm": 1.6738507747650146, "learning_rate": 4.746922024623803e-05, "loss": 3.4724, "step": 592 }, { "epoch": 0.05, "grad_norm": 1.766518235206604, "learning_rate": 4.745212038303693e-05, "loss": 3.7213, "step": 596 }, { "epoch": 0.05, "grad_norm": 1.681229591369629, "learning_rate": 4.7435020519835846e-05, "loss": 3.6538, "step": 600 }, { "epoch": 0.05, "grad_norm": 1.7900785207748413, "learning_rate": 4.7417920656634754e-05, "loss": 3.7952, "step": 604 }, { "epoch": 0.05, "grad_norm": 1.5610637664794922, "learning_rate": 4.7400820793433654e-05, "loss": 3.5255, "step": 608 }, { "epoch": 0.05, "grad_norm": 1.3365826606750488, "learning_rate": 4.738372093023256e-05, "loss": 3.5621, "step": 612 }, { "epoch": 0.05, "grad_norm": 1.8946609497070312, "learning_rate": 4.736662106703146e-05, "loss": 3.5664, "step": 616 }, { "epoch": 0.05, "grad_norm": 1.728330135345459, "learning_rate": 4.734952120383037e-05, "loss": 3.7577, "step": 620 }, { "epoch": 0.05, "grad_norm": 1.7583847045898438, "learning_rate": 4.733242134062928e-05, "loss": 3.5722, "step": 624 }, { "epoch": 0.05, "grad_norm": 1.9061710834503174, "learning_rate": 4.7315321477428184e-05, "loss": 3.3436, "step": 628 }, { "epoch": 0.05, "grad_norm": 1.7074024677276611, "learning_rate": 4.729822161422709e-05, "loss": 3.5426, "step": 632 }, { "epoch": 0.05, "grad_norm": 1.649793028831482, "learning_rate": 4.728112175102599e-05, "loss": 3.5826, "step": 636 }, { "epoch": 0.05, "grad_norm": 1.7260462045669556, "learning_rate": 4.72640218878249e-05, "loss": 3.5353, "step": 640 }, { "epoch": 0.06, "grad_norm": 1.745391607284546, "learning_rate": 4.724692202462381e-05, "loss": 3.3534, "step": 644 }, { "epoch": 0.06, "grad_norm": 1.374226450920105, "learning_rate": 4.722982216142271e-05, "loss": 3.386, "step": 648 }, { "epoch": 0.06, "grad_norm": 1.8732798099517822, "learning_rate": 4.7212722298221615e-05, "loss": 3.6073, "step": 652 }, { "epoch": 0.06, "grad_norm": 2.4075376987457275, "learning_rate": 4.719562243502052e-05, "loss": 3.5393, "step": 656 }, { "epoch": 0.06, "grad_norm": 1.54508638381958, "learning_rate": 4.717852257181943e-05, "loss": 3.475, "step": 660 }, { "epoch": 0.06, "grad_norm": 1.5737498998641968, "learning_rate": 4.716142270861834e-05, "loss": 3.5497, "step": 664 }, { "epoch": 0.06, "grad_norm": 1.8623074293136597, "learning_rate": 4.714432284541724e-05, "loss": 3.4603, "step": 668 }, { "epoch": 0.06, "grad_norm": 1.7199251651763916, "learning_rate": 4.7127222982216145e-05, "loss": 3.5801, "step": 672 }, { "epoch": 0.06, "grad_norm": 1.5866843461990356, "learning_rate": 4.7110123119015045e-05, "loss": 3.3454, "step": 676 }, { "epoch": 0.06, "grad_norm": 1.9907779693603516, "learning_rate": 4.709302325581396e-05, "loss": 3.6247, "step": 680 }, { "epoch": 0.06, "grad_norm": 1.3720687627792358, "learning_rate": 4.707592339261287e-05, "loss": 3.3614, "step": 684 }, { "epoch": 0.06, "grad_norm": 1.739660620689392, "learning_rate": 4.705882352941177e-05, "loss": 3.4211, "step": 688 }, { "epoch": 0.06, "grad_norm": 1.6425236463546753, "learning_rate": 4.7041723666210675e-05, "loss": 3.423, "step": 692 }, { "epoch": 0.06, "grad_norm": 1.5457091331481934, "learning_rate": 4.7024623803009575e-05, "loss": 3.3258, "step": 696 }, { "epoch": 0.06, "grad_norm": 1.5979949235916138, "learning_rate": 4.700752393980848e-05, "loss": 3.2838, "step": 700 }, { "epoch": 0.06, "grad_norm": 1.6761040687561035, "learning_rate": 4.699042407660739e-05, "loss": 3.334, "step": 704 }, { "epoch": 0.06, "grad_norm": 1.552573323249817, "learning_rate": 4.69733242134063e-05, "loss": 3.5355, "step": 708 }, { "epoch": 0.06, "grad_norm": 1.6743354797363281, "learning_rate": 4.6956224350205205e-05, "loss": 3.4076, "step": 712 }, { "epoch": 0.06, "grad_norm": 4.555662155151367, "learning_rate": 4.6939124487004105e-05, "loss": 3.7446, "step": 716 }, { "epoch": 0.06, "grad_norm": 1.5942860841751099, "learning_rate": 4.692202462380301e-05, "loss": 3.553, "step": 720 }, { "epoch": 0.06, "grad_norm": 1.9089189767837524, "learning_rate": 4.690492476060191e-05, "loss": 3.3584, "step": 724 }, { "epoch": 0.06, "grad_norm": 2.0768373012542725, "learning_rate": 4.688782489740082e-05, "loss": 3.4295, "step": 728 }, { "epoch": 0.06, "grad_norm": 3.3060362339019775, "learning_rate": 4.687072503419973e-05, "loss": 3.415, "step": 732 }, { "epoch": 0.06, "grad_norm": 1.9307421445846558, "learning_rate": 4.6853625170998635e-05, "loss": 3.4901, "step": 736 }, { "epoch": 0.06, "grad_norm": 1.7142446041107178, "learning_rate": 4.683652530779754e-05, "loss": 3.6096, "step": 740 }, { "epoch": 0.06, "grad_norm": 1.7756657600402832, "learning_rate": 4.681942544459644e-05, "loss": 3.4534, "step": 744 }, { "epoch": 0.06, "grad_norm": 1.5459442138671875, "learning_rate": 4.680232558139535e-05, "loss": 3.3813, "step": 748 }, { "epoch": 0.06, "grad_norm": 2.1961963176727295, "learning_rate": 4.678522571819426e-05, "loss": 3.3467, "step": 752 }, { "epoch": 0.06, "grad_norm": 1.7320613861083984, "learning_rate": 4.676812585499316e-05, "loss": 3.3927, "step": 756 }, { "epoch": 0.06, "grad_norm": 1.6255303621292114, "learning_rate": 4.6751025991792066e-05, "loss": 3.4644, "step": 760 }, { "epoch": 0.07, "grad_norm": 1.5142388343811035, "learning_rate": 4.673392612859097e-05, "loss": 3.4657, "step": 764 }, { "epoch": 0.07, "grad_norm": 1.5093744993209839, "learning_rate": 4.671682626538988e-05, "loss": 3.3144, "step": 768 }, { "epoch": 0.07, "grad_norm": 1.4730446338653564, "learning_rate": 4.669972640218879e-05, "loss": 3.3382, "step": 772 }, { "epoch": 0.07, "grad_norm": 1.7650116682052612, "learning_rate": 4.668262653898769e-05, "loss": 3.4519, "step": 776 }, { "epoch": 0.07, "grad_norm": 1.637071132659912, "learning_rate": 4.6665526675786596e-05, "loss": 3.3723, "step": 780 }, { "epoch": 0.07, "grad_norm": 1.4582788944244385, "learning_rate": 4.6648426812585496e-05, "loss": 3.4839, "step": 784 }, { "epoch": 0.07, "grad_norm": 1.4654922485351562, "learning_rate": 4.663132694938441e-05, "loss": 3.1257, "step": 788 }, { "epoch": 0.07, "grad_norm": 1.4668978452682495, "learning_rate": 4.661422708618332e-05, "loss": 3.2506, "step": 792 }, { "epoch": 0.07, "grad_norm": 1.5190536975860596, "learning_rate": 4.659712722298222e-05, "loss": 3.4055, "step": 796 }, { "epoch": 0.07, "grad_norm": 1.9057687520980835, "learning_rate": 4.6580027359781126e-05, "loss": 3.4804, "step": 800 }, { "epoch": 0.07, "grad_norm": 1.4573813676834106, "learning_rate": 4.6562927496580026e-05, "loss": 3.3792, "step": 804 }, { "epoch": 0.07, "grad_norm": 2.3757171630859375, "learning_rate": 4.6545827633378933e-05, "loss": 3.4649, "step": 808 }, { "epoch": 0.07, "grad_norm": 1.7024015188217163, "learning_rate": 4.652872777017784e-05, "loss": 3.3149, "step": 812 }, { "epoch": 0.07, "grad_norm": 1.5424153804779053, "learning_rate": 4.651162790697675e-05, "loss": 3.3, "step": 816 }, { "epoch": 0.07, "grad_norm": 1.4667370319366455, "learning_rate": 4.6494528043775655e-05, "loss": 3.2845, "step": 820 }, { "epoch": 0.07, "grad_norm": 1.5216193199157715, "learning_rate": 4.6477428180574556e-05, "loss": 3.3062, "step": 824 }, { "epoch": 0.07, "grad_norm": 1.5499827861785889, "learning_rate": 4.6460328317373463e-05, "loss": 3.263, "step": 828 }, { "epoch": 0.07, "grad_norm": 1.644148588180542, "learning_rate": 4.6443228454172364e-05, "loss": 3.4374, "step": 832 }, { "epoch": 0.07, "grad_norm": 1.6420782804489136, "learning_rate": 4.642612859097127e-05, "loss": 3.4049, "step": 836 }, { "epoch": 0.07, "grad_norm": 1.4325385093688965, "learning_rate": 4.640902872777018e-05, "loss": 3.3499, "step": 840 }, { "epoch": 0.07, "grad_norm": 1.6447879076004028, "learning_rate": 4.6391928864569086e-05, "loss": 3.4098, "step": 844 }, { "epoch": 0.07, "grad_norm": 1.4675137996673584, "learning_rate": 4.637482900136799e-05, "loss": 3.2778, "step": 848 }, { "epoch": 0.07, "grad_norm": 1.5847750902175903, "learning_rate": 4.6357729138166894e-05, "loss": 3.1695, "step": 852 }, { "epoch": 0.07, "grad_norm": 1.514772653579712, "learning_rate": 4.63406292749658e-05, "loss": 3.1556, "step": 856 }, { "epoch": 0.07, "grad_norm": 1.511367917060852, "learning_rate": 4.632352941176471e-05, "loss": 3.3307, "step": 860 }, { "epoch": 0.07, "grad_norm": 1.7640196084976196, "learning_rate": 4.630642954856361e-05, "loss": 3.2985, "step": 864 }, { "epoch": 0.07, "grad_norm": 1.7074265480041504, "learning_rate": 4.628932968536252e-05, "loss": 3.2598, "step": 868 }, { "epoch": 0.07, "grad_norm": 1.52577805519104, "learning_rate": 4.6272229822161424e-05, "loss": 3.0872, "step": 872 }, { "epoch": 0.07, "grad_norm": 1.6908501386642456, "learning_rate": 4.625512995896033e-05, "loss": 3.1931, "step": 876 }, { "epoch": 0.08, "grad_norm": 1.6583012342453003, "learning_rate": 4.623803009575924e-05, "loss": 3.1849, "step": 880 }, { "epoch": 0.08, "grad_norm": 1.4067027568817139, "learning_rate": 4.622093023255814e-05, "loss": 3.1912, "step": 884 }, { "epoch": 0.08, "grad_norm": 1.388487458229065, "learning_rate": 4.6203830369357046e-05, "loss": 3.2845, "step": 888 }, { "epoch": 0.08, "grad_norm": 1.7910948991775513, "learning_rate": 4.6186730506155954e-05, "loss": 3.1146, "step": 892 }, { "epoch": 0.08, "grad_norm": 1.4454721212387085, "learning_rate": 4.616963064295486e-05, "loss": 3.2114, "step": 896 }, { "epoch": 0.08, "grad_norm": 1.4809476137161255, "learning_rate": 4.615253077975377e-05, "loss": 3.3329, "step": 900 }, { "epoch": 0.08, "grad_norm": 1.6668082475662231, "learning_rate": 4.613543091655267e-05, "loss": 3.2517, "step": 904 }, { "epoch": 0.08, "grad_norm": 1.72063410282135, "learning_rate": 4.6118331053351576e-05, "loss": 3.0895, "step": 908 }, { "epoch": 0.08, "grad_norm": 1.5995526313781738, "learning_rate": 4.610123119015048e-05, "loss": 3.2703, "step": 912 }, { "epoch": 0.08, "grad_norm": 1.453747034072876, "learning_rate": 4.6084131326949384e-05, "loss": 3.3619, "step": 916 }, { "epoch": 0.08, "grad_norm": 1.7982878684997559, "learning_rate": 4.606703146374829e-05, "loss": 3.2657, "step": 920 }, { "epoch": 0.08, "grad_norm": 1.3937926292419434, "learning_rate": 4.60499316005472e-05, "loss": 3.1727, "step": 924 }, { "epoch": 0.08, "grad_norm": 1.5705368518829346, "learning_rate": 4.6032831737346106e-05, "loss": 3.0657, "step": 928 }, { "epoch": 0.08, "grad_norm": 1.6821500062942505, "learning_rate": 4.601573187414501e-05, "loss": 3.3191, "step": 932 }, { "epoch": 0.08, "grad_norm": 1.5288244485855103, "learning_rate": 4.5998632010943914e-05, "loss": 3.0876, "step": 936 }, { "epoch": 0.08, "grad_norm": 1.7748781442642212, "learning_rate": 4.598153214774282e-05, "loss": 3.18, "step": 940 }, { "epoch": 0.08, "grad_norm": 1.7024669647216797, "learning_rate": 4.596443228454172e-05, "loss": 3.3601, "step": 944 }, { "epoch": 0.08, "grad_norm": 1.7017521858215332, "learning_rate": 4.5947332421340636e-05, "loss": 2.9751, "step": 948 }, { "epoch": 0.08, "grad_norm": 1.7405025959014893, "learning_rate": 4.593023255813954e-05, "loss": 3.2073, "step": 952 }, { "epoch": 0.08, "grad_norm": 1.6610547304153442, "learning_rate": 4.5913132694938444e-05, "loss": 3.2363, "step": 956 }, { "epoch": 0.08, "grad_norm": 1.4785616397857666, "learning_rate": 4.5896032831737345e-05, "loss": 3.3254, "step": 960 }, { "epoch": 0.08, "grad_norm": 3.453533411026001, "learning_rate": 4.587893296853625e-05, "loss": 3.21, "step": 964 }, { "epoch": 0.08, "grad_norm": 1.4303959608078003, "learning_rate": 4.586183310533516e-05, "loss": 3.3226, "step": 968 }, { "epoch": 0.08, "grad_norm": 1.6272943019866943, "learning_rate": 4.584473324213407e-05, "loss": 3.5944, "step": 972 }, { "epoch": 0.08, "grad_norm": 1.823716402053833, "learning_rate": 4.5827633378932974e-05, "loss": 3.234, "step": 976 }, { "epoch": 0.08, "grad_norm": 1.7006231546401978, "learning_rate": 4.5810533515731875e-05, "loss": 3.0123, "step": 980 }, { "epoch": 0.08, "grad_norm": 1.3645117282867432, "learning_rate": 4.579343365253078e-05, "loss": 3.2243, "step": 984 }, { "epoch": 0.08, "grad_norm": 1.7724872827529907, "learning_rate": 4.577633378932969e-05, "loss": 3.1705, "step": 988 }, { "epoch": 0.08, "grad_norm": 1.6377599239349365, "learning_rate": 4.575923392612859e-05, "loss": 3.2647, "step": 992 }, { "epoch": 0.09, "grad_norm": 1.4828346967697144, "learning_rate": 4.57421340629275e-05, "loss": 3.0613, "step": 996 }, { "epoch": 0.09, "grad_norm": 1.515553593635559, "learning_rate": 4.5725034199726405e-05, "loss": 3.0368, "step": 1000 }, { "epoch": 0.09, "grad_norm": 1.498844861984253, "learning_rate": 4.570793433652531e-05, "loss": 3.102, "step": 1004 }, { "epoch": 0.09, "grad_norm": 1.4339860677719116, "learning_rate": 4.569083447332422e-05, "loss": 3.1874, "step": 1008 }, { "epoch": 0.09, "grad_norm": 1.5611178874969482, "learning_rate": 4.567373461012312e-05, "loss": 3.075, "step": 1012 }, { "epoch": 0.09, "grad_norm": 1.6089402437210083, "learning_rate": 4.565663474692203e-05, "loss": 3.1544, "step": 1016 }, { "epoch": 0.09, "grad_norm": 1.8874709606170654, "learning_rate": 4.563953488372093e-05, "loss": 3.2806, "step": 1020 }, { "epoch": 0.09, "grad_norm": 1.5624805688858032, "learning_rate": 4.5622435020519835e-05, "loss": 3.1442, "step": 1024 }, { "epoch": 0.09, "grad_norm": 1.968634009361267, "learning_rate": 4.560533515731875e-05, "loss": 3.2743, "step": 1028 }, { "epoch": 0.09, "grad_norm": 1.4064279794692993, "learning_rate": 4.558823529411765e-05, "loss": 3.0726, "step": 1032 }, { "epoch": 0.09, "grad_norm": 1.577048897743225, "learning_rate": 4.557113543091656e-05, "loss": 3.1129, "step": 1036 }, { "epoch": 0.09, "grad_norm": 1.548323392868042, "learning_rate": 4.555403556771546e-05, "loss": 3.144, "step": 1040 }, { "epoch": 0.09, "grad_norm": 1.5542961359024048, "learning_rate": 4.5536935704514365e-05, "loss": 3.182, "step": 1044 }, { "epoch": 0.09, "grad_norm": 1.5506356954574585, "learning_rate": 4.551983584131327e-05, "loss": 3.2186, "step": 1048 }, { "epoch": 0.09, "grad_norm": 1.6962274312973022, "learning_rate": 4.550273597811218e-05, "loss": 3.0385, "step": 1052 }, { "epoch": 0.09, "grad_norm": 1.4382833242416382, "learning_rate": 4.548563611491109e-05, "loss": 3.1268, "step": 1056 }, { "epoch": 0.09, "grad_norm": 2.269392728805542, "learning_rate": 4.546853625170999e-05, "loss": 3.2579, "step": 1060 }, { "epoch": 0.09, "grad_norm": 1.5414201021194458, "learning_rate": 4.5451436388508895e-05, "loss": 3.1501, "step": 1064 }, { "epoch": 0.09, "grad_norm": 1.936946153640747, "learning_rate": 4.54343365253078e-05, "loss": 3.1711, "step": 1068 }, { "epoch": 0.09, "grad_norm": 1.7430529594421387, "learning_rate": 4.54172366621067e-05, "loss": 3.1299, "step": 1072 }, { "epoch": 0.09, "grad_norm": 1.3500404357910156, "learning_rate": 4.540013679890561e-05, "loss": 3.0935, "step": 1076 }, { "epoch": 0.09, "grad_norm": 1.5768132209777832, "learning_rate": 4.538303693570452e-05, "loss": 3.1091, "step": 1080 }, { "epoch": 0.09, "grad_norm": 1.4829493761062622, "learning_rate": 4.5365937072503425e-05, "loss": 3.0862, "step": 1084 }, { "epoch": 0.09, "grad_norm": 1.5560483932495117, "learning_rate": 4.5348837209302326e-05, "loss": 3.1024, "step": 1088 }, { "epoch": 0.09, "grad_norm": 1.6295199394226074, "learning_rate": 4.533173734610123e-05, "loss": 2.9539, "step": 1092 }, { "epoch": 0.09, "grad_norm": 2.1724135875701904, "learning_rate": 4.531463748290014e-05, "loss": 3.1936, "step": 1096 }, { "epoch": 0.09, "grad_norm": 1.7400479316711426, "learning_rate": 4.529753761969904e-05, "loss": 3.1873, "step": 1100 }, { "epoch": 0.09, "grad_norm": 1.554962396621704, "learning_rate": 4.528043775649795e-05, "loss": 3.0643, "step": 1104 }, { "epoch": 0.09, "grad_norm": 1.3722119331359863, "learning_rate": 4.5263337893296855e-05, "loss": 3.0088, "step": 1108 }, { "epoch": 0.1, "grad_norm": 1.395577311515808, "learning_rate": 4.524623803009576e-05, "loss": 3.0584, "step": 1112 }, { "epoch": 0.1, "grad_norm": 1.509710431098938, "learning_rate": 4.522913816689467e-05, "loss": 3.2276, "step": 1116 }, { "epoch": 0.1, "grad_norm": 1.5213786363601685, "learning_rate": 4.521203830369357e-05, "loss": 2.9433, "step": 1120 }, { "epoch": 0.1, "grad_norm": 1.51616632938385, "learning_rate": 4.519493844049248e-05, "loss": 2.9902, "step": 1124 }, { "epoch": 0.1, "grad_norm": 1.5429805517196655, "learning_rate": 4.517783857729138e-05, "loss": 3.0884, "step": 1128 }, { "epoch": 0.1, "grad_norm": 1.4714343547821045, "learning_rate": 4.5160738714090286e-05, "loss": 3.0322, "step": 1132 }, { "epoch": 0.1, "grad_norm": 1.4637647867202759, "learning_rate": 4.51436388508892e-05, "loss": 3.0352, "step": 1136 }, { "epoch": 0.1, "grad_norm": 1.3424347639083862, "learning_rate": 4.51265389876881e-05, "loss": 3.0866, "step": 1140 }, { "epoch": 0.1, "grad_norm": 1.689968228340149, "learning_rate": 4.510943912448701e-05, "loss": 3.0002, "step": 1144 }, { "epoch": 0.1, "grad_norm": 1.9935293197631836, "learning_rate": 4.509233926128591e-05, "loss": 2.9518, "step": 1148 }, { "epoch": 0.1, "grad_norm": 1.5461673736572266, "learning_rate": 4.5075239398084816e-05, "loss": 3.0394, "step": 1152 }, { "epoch": 0.1, "grad_norm": 1.5787737369537354, "learning_rate": 4.505813953488372e-05, "loss": 3.0934, "step": 1156 }, { "epoch": 0.1, "grad_norm": 1.4527121782302856, "learning_rate": 4.504103967168263e-05, "loss": 2.9734, "step": 1160 }, { "epoch": 0.1, "grad_norm": 1.6236604452133179, "learning_rate": 4.502393980848154e-05, "loss": 3.021, "step": 1164 }, { "epoch": 0.1, "grad_norm": 1.5616329908370972, "learning_rate": 4.500683994528044e-05, "loss": 3.0602, "step": 1168 }, { "epoch": 0.1, "grad_norm": 1.6745967864990234, "learning_rate": 4.4989740082079346e-05, "loss": 3.2712, "step": 1172 }, { "epoch": 0.1, "grad_norm": 1.5813934803009033, "learning_rate": 4.497264021887825e-05, "loss": 3.2255, "step": 1176 }, { "epoch": 0.1, "grad_norm": 1.5584092140197754, "learning_rate": 4.4955540355677154e-05, "loss": 3.082, "step": 1180 }, { "epoch": 0.1, "grad_norm": 1.5441848039627075, "learning_rate": 4.493844049247606e-05, "loss": 2.978, "step": 1184 }, { "epoch": 0.1, "grad_norm": 1.6514015197753906, "learning_rate": 4.492134062927497e-05, "loss": 3.2604, "step": 1188 }, { "epoch": 0.1, "grad_norm": 1.4784066677093506, "learning_rate": 4.4904240766073876e-05, "loss": 3.0761, "step": 1192 }, { "epoch": 0.1, "grad_norm": 1.493974208831787, "learning_rate": 4.488714090287278e-05, "loss": 3.1323, "step": 1196 }, { "epoch": 0.1, "grad_norm": 1.4246760606765747, "learning_rate": 4.4870041039671684e-05, "loss": 2.7518, "step": 1200 }, { "epoch": 0.1, "grad_norm": 1.6689587831497192, "learning_rate": 4.485294117647059e-05, "loss": 3.2381, "step": 1204 }, { "epoch": 0.1, "grad_norm": 1.615942120552063, "learning_rate": 4.483584131326949e-05, "loss": 3.0886, "step": 1208 }, { "epoch": 0.1, "grad_norm": 1.3292858600616455, "learning_rate": 4.48187414500684e-05, "loss": 3.0968, "step": 1212 }, { "epoch": 0.1, "grad_norm": 1.7597836256027222, "learning_rate": 4.4801641586867306e-05, "loss": 3.0037, "step": 1216 }, { "epoch": 0.1, "grad_norm": 1.7249919176101685, "learning_rate": 4.4784541723666214e-05, "loss": 2.9228, "step": 1220 }, { "epoch": 0.1, "grad_norm": 1.630422830581665, "learning_rate": 4.476744186046512e-05, "loss": 3.0722, "step": 1224 }, { "epoch": 0.1, "grad_norm": 1.4283276796340942, "learning_rate": 4.475034199726402e-05, "loss": 2.8726, "step": 1228 }, { "epoch": 0.11, "grad_norm": 1.786851167678833, "learning_rate": 4.473324213406293e-05, "loss": 3.1425, "step": 1232 }, { "epoch": 0.11, "grad_norm": 1.669625997543335, "learning_rate": 4.471614227086183e-05, "loss": 3.013, "step": 1236 }, { "epoch": 0.11, "grad_norm": 1.4979221820831299, "learning_rate": 4.4699042407660744e-05, "loss": 3.106, "step": 1240 }, { "epoch": 0.11, "grad_norm": 1.67727792263031, "learning_rate": 4.468194254445965e-05, "loss": 2.9748, "step": 1244 }, { "epoch": 0.11, "grad_norm": 1.3726154565811157, "learning_rate": 4.466484268125855e-05, "loss": 3.0766, "step": 1248 }, { "epoch": 0.11, "grad_norm": 1.4735255241394043, "learning_rate": 4.464774281805746e-05, "loss": 2.9678, "step": 1252 }, { "epoch": 0.11, "grad_norm": 1.751224160194397, "learning_rate": 4.463064295485636e-05, "loss": 2.923, "step": 1256 }, { "epoch": 0.11, "grad_norm": 1.4682483673095703, "learning_rate": 4.461354309165527e-05, "loss": 2.9822, "step": 1260 }, { "epoch": 0.11, "grad_norm": 1.7135084867477417, "learning_rate": 4.4596443228454174e-05, "loss": 2.9087, "step": 1264 }, { "epoch": 0.11, "grad_norm": 1.6292390823364258, "learning_rate": 4.457934336525308e-05, "loss": 2.9948, "step": 1268 }, { "epoch": 0.11, "grad_norm": 1.3494658470153809, "learning_rate": 4.456224350205199e-05, "loss": 2.8939, "step": 1272 }, { "epoch": 0.11, "grad_norm": 1.584025263786316, "learning_rate": 4.454514363885089e-05, "loss": 2.9289, "step": 1276 }, { "epoch": 0.11, "grad_norm": 1.7876780033111572, "learning_rate": 4.45280437756498e-05, "loss": 3.0326, "step": 1280 }, { "epoch": 0.11, "grad_norm": 1.6452478170394897, "learning_rate": 4.4510943912448704e-05, "loss": 3.2085, "step": 1284 }, { "epoch": 0.11, "grad_norm": 1.733777642250061, "learning_rate": 4.4493844049247605e-05, "loss": 2.8625, "step": 1288 }, { "epoch": 0.11, "grad_norm": 1.5891138315200806, "learning_rate": 4.447674418604651e-05, "loss": 2.8191, "step": 1292 }, { "epoch": 0.11, "grad_norm": 1.5481853485107422, "learning_rate": 4.445964432284542e-05, "loss": 2.9586, "step": 1296 }, { "epoch": 0.11, "grad_norm": 1.4756824970245361, "learning_rate": 4.444254445964433e-05, "loss": 2.9225, "step": 1300 }, { "epoch": 0.11, "grad_norm": 1.4950698614120483, "learning_rate": 4.4425444596443234e-05, "loss": 2.8898, "step": 1304 }, { "epoch": 0.11, "grad_norm": 1.938930869102478, "learning_rate": 4.4408344733242135e-05, "loss": 3.0966, "step": 1308 }, { "epoch": 0.11, "grad_norm": 1.578041672706604, "learning_rate": 4.439124487004104e-05, "loss": 2.9547, "step": 1312 }, { "epoch": 0.11, "grad_norm": 1.5932279825210571, "learning_rate": 4.437414500683994e-05, "loss": 3.0289, "step": 1316 }, { "epoch": 0.11, "grad_norm": 1.5020122528076172, "learning_rate": 4.435704514363886e-05, "loss": 3.0404, "step": 1320 }, { "epoch": 0.11, "grad_norm": 1.6642231941223145, "learning_rate": 4.4339945280437764e-05, "loss": 3.0883, "step": 1324 }, { "epoch": 0.11, "grad_norm": 1.6281254291534424, "learning_rate": 4.4322845417236665e-05, "loss": 2.8135, "step": 1328 }, { "epoch": 0.11, "grad_norm": 1.4702434539794922, "learning_rate": 4.430574555403557e-05, "loss": 2.8406, "step": 1332 }, { "epoch": 0.11, "grad_norm": 2.073446750640869, "learning_rate": 4.428864569083447e-05, "loss": 3.0211, "step": 1336 }, { "epoch": 0.11, "grad_norm": 1.543094515800476, "learning_rate": 4.427154582763338e-05, "loss": 2.8459, "step": 1340 }, { "epoch": 0.11, "grad_norm": 1.5158188343048096, "learning_rate": 4.425444596443229e-05, "loss": 2.9384, "step": 1344 }, { "epoch": 0.12, "grad_norm": 1.6903549432754517, "learning_rate": 4.4237346101231194e-05, "loss": 3.0647, "step": 1348 }, { "epoch": 0.12, "grad_norm": 1.6733980178833008, "learning_rate": 4.42202462380301e-05, "loss": 2.8504, "step": 1352 }, { "epoch": 0.12, "grad_norm": 1.7396080493927002, "learning_rate": 4.4203146374829e-05, "loss": 3.1324, "step": 1356 }, { "epoch": 0.12, "grad_norm": 1.6228291988372803, "learning_rate": 4.418604651162791e-05, "loss": 2.9918, "step": 1360 }, { "epoch": 0.12, "grad_norm": 1.5741888284683228, "learning_rate": 4.416894664842681e-05, "loss": 3.0242, "step": 1364 }, { "epoch": 0.12, "grad_norm": 1.4788377285003662, "learning_rate": 4.415184678522572e-05, "loss": 2.7948, "step": 1368 }, { "epoch": 0.12, "grad_norm": 1.4203468561172485, "learning_rate": 4.4134746922024625e-05, "loss": 3.0096, "step": 1372 }, { "epoch": 0.12, "grad_norm": 1.4492088556289673, "learning_rate": 4.411764705882353e-05, "loss": 2.9375, "step": 1376 }, { "epoch": 0.12, "grad_norm": 1.4139641523361206, "learning_rate": 4.410054719562244e-05, "loss": 2.7041, "step": 1380 }, { "epoch": 0.12, "grad_norm": 1.5051127672195435, "learning_rate": 4.408344733242134e-05, "loss": 2.9992, "step": 1384 }, { "epoch": 0.12, "grad_norm": 1.6225379705429077, "learning_rate": 4.406634746922025e-05, "loss": 2.9139, "step": 1388 }, { "epoch": 0.12, "grad_norm": 2.4177663326263428, "learning_rate": 4.4049247606019155e-05, "loss": 3.1328, "step": 1392 }, { "epoch": 0.12, "grad_norm": 1.342938780784607, "learning_rate": 4.4032147742818055e-05, "loss": 2.7651, "step": 1396 }, { "epoch": 0.12, "grad_norm": 1.5941474437713623, "learning_rate": 4.401504787961697e-05, "loss": 3.0623, "step": 1400 }, { "epoch": 0.12, "grad_norm": 1.9736019372940063, "learning_rate": 4.399794801641587e-05, "loss": 2.9231, "step": 1404 }, { "epoch": 0.12, "grad_norm": 1.7911643981933594, "learning_rate": 4.398084815321478e-05, "loss": 2.9161, "step": 1408 }, { "epoch": 0.12, "grad_norm": 1.585992693901062, "learning_rate": 4.3963748290013685e-05, "loss": 3.0079, "step": 1412 }, { "epoch": 0.12, "grad_norm": 1.537345290184021, "learning_rate": 4.3946648426812585e-05, "loss": 2.9238, "step": 1416 }, { "epoch": 0.12, "grad_norm": 1.5920599699020386, "learning_rate": 4.392954856361149e-05, "loss": 2.8771, "step": 1420 }, { "epoch": 0.12, "grad_norm": 1.606899619102478, "learning_rate": 4.39124487004104e-05, "loss": 2.7501, "step": 1424 }, { "epoch": 0.12, "grad_norm": 1.5318756103515625, "learning_rate": 4.389534883720931e-05, "loss": 2.8197, "step": 1428 }, { "epoch": 0.12, "grad_norm": 1.987911581993103, "learning_rate": 4.3878248974008215e-05, "loss": 2.971, "step": 1432 }, { "epoch": 0.12, "grad_norm": 1.5010288953781128, "learning_rate": 4.3861149110807115e-05, "loss": 2.9632, "step": 1436 }, { "epoch": 0.12, "grad_norm": 1.488486647605896, "learning_rate": 4.384404924760602e-05, "loss": 3.0563, "step": 1440 }, { "epoch": 0.12, "grad_norm": 1.7354810237884521, "learning_rate": 4.382694938440492e-05, "loss": 2.866, "step": 1444 }, { "epoch": 0.12, "grad_norm": 1.509134292602539, "learning_rate": 4.380984952120383e-05, "loss": 2.8723, "step": 1448 }, { "epoch": 0.12, "grad_norm": 1.687049150466919, "learning_rate": 4.379274965800274e-05, "loss": 2.9496, "step": 1452 }, { "epoch": 0.12, "grad_norm": 1.5449968576431274, "learning_rate": 4.3775649794801645e-05, "loss": 2.884, "step": 1456 }, { "epoch": 0.12, "grad_norm": 1.7197214365005493, "learning_rate": 4.375854993160055e-05, "loss": 2.8686, "step": 1460 }, { "epoch": 0.13, "grad_norm": 1.3042534589767456, "learning_rate": 4.374145006839945e-05, "loss": 2.739, "step": 1464 }, { "epoch": 0.13, "grad_norm": 1.4646921157836914, "learning_rate": 4.372435020519836e-05, "loss": 2.8347, "step": 1468 }, { "epoch": 0.13, "grad_norm": 1.350310206413269, "learning_rate": 4.370725034199726e-05, "loss": 2.8797, "step": 1472 }, { "epoch": 0.13, "grad_norm": 1.5870814323425293, "learning_rate": 4.369015047879617e-05, "loss": 3.025, "step": 1476 }, { "epoch": 0.13, "grad_norm": 1.5572704076766968, "learning_rate": 4.3673050615595076e-05, "loss": 2.8424, "step": 1480 }, { "epoch": 0.13, "grad_norm": 1.4381687641143799, "learning_rate": 4.365595075239398e-05, "loss": 3.0395, "step": 1484 }, { "epoch": 0.13, "grad_norm": 1.7877554893493652, "learning_rate": 4.363885088919289e-05, "loss": 2.8723, "step": 1488 }, { "epoch": 0.13, "grad_norm": 1.7916138172149658, "learning_rate": 4.362175102599179e-05, "loss": 2.8751, "step": 1492 }, { "epoch": 0.13, "grad_norm": 1.8024954795837402, "learning_rate": 4.36046511627907e-05, "loss": 3.0156, "step": 1496 }, { "epoch": 0.13, "grad_norm": 1.4376300573349, "learning_rate": 4.3587551299589606e-05, "loss": 2.9152, "step": 1500 }, { "epoch": 0.13, "grad_norm": 1.5347250699996948, "learning_rate": 4.3570451436388506e-05, "loss": 2.8219, "step": 1504 }, { "epoch": 0.13, "grad_norm": 1.7805489301681519, "learning_rate": 4.355335157318742e-05, "loss": 2.9276, "step": 1508 }, { "epoch": 0.13, "grad_norm": 1.5822547674179077, "learning_rate": 4.353625170998632e-05, "loss": 2.9262, "step": 1512 }, { "epoch": 0.13, "grad_norm": 1.5745974779129028, "learning_rate": 4.351915184678523e-05, "loss": 2.8812, "step": 1516 }, { "epoch": 0.13, "grad_norm": 1.5027135610580444, "learning_rate": 4.3502051983584136e-05, "loss": 2.7748, "step": 1520 }, { "epoch": 0.13, "grad_norm": 1.5027369260787964, "learning_rate": 4.3484952120383036e-05, "loss": 2.8619, "step": 1524 }, { "epoch": 0.13, "grad_norm": 1.6836864948272705, "learning_rate": 4.3467852257181944e-05, "loss": 2.9688, "step": 1528 }, { "epoch": 0.13, "grad_norm": 1.682340145111084, "learning_rate": 4.345075239398085e-05, "loss": 2.9849, "step": 1532 }, { "epoch": 0.13, "grad_norm": 1.3924282789230347, "learning_rate": 4.343365253077976e-05, "loss": 2.8154, "step": 1536 }, { "epoch": 0.13, "grad_norm": 1.518629789352417, "learning_rate": 4.3416552667578666e-05, "loss": 2.8265, "step": 1540 }, { "epoch": 0.13, "grad_norm": 1.482877254486084, "learning_rate": 4.3399452804377566e-05, "loss": 2.8364, "step": 1544 }, { "epoch": 0.13, "grad_norm": 1.52615487575531, "learning_rate": 4.3382352941176474e-05, "loss": 2.9488, "step": 1548 }, { "epoch": 0.13, "grad_norm": 1.8858604431152344, "learning_rate": 4.3365253077975374e-05, "loss": 3.0754, "step": 1552 }, { "epoch": 0.13, "grad_norm": 1.3640977144241333, "learning_rate": 4.334815321477428e-05, "loss": 2.7192, "step": 1556 }, { "epoch": 0.13, "grad_norm": 1.6419317722320557, "learning_rate": 4.333105335157319e-05, "loss": 2.8016, "step": 1560 }, { "epoch": 0.13, "grad_norm": 1.4059332609176636, "learning_rate": 4.3313953488372096e-05, "loss": 2.8559, "step": 1564 }, { "epoch": 0.13, "grad_norm": 1.5837764739990234, "learning_rate": 4.3296853625171004e-05, "loss": 3.0297, "step": 1568 }, { "epoch": 0.13, "grad_norm": 1.8545840978622437, "learning_rate": 4.3279753761969904e-05, "loss": 2.9717, "step": 1572 }, { "epoch": 0.13, "grad_norm": 1.2715051174163818, "learning_rate": 4.326265389876881e-05, "loss": 2.7563, "step": 1576 }, { "epoch": 0.14, "grad_norm": 1.742698073387146, "learning_rate": 4.324555403556772e-05, "loss": 2.7826, "step": 1580 }, { "epoch": 0.14, "grad_norm": 1.7338377237319946, "learning_rate": 4.322845417236662e-05, "loss": 2.8006, "step": 1584 }, { "epoch": 0.14, "grad_norm": 1.5166951417922974, "learning_rate": 4.3211354309165533e-05, "loss": 3.0044, "step": 1588 }, { "epoch": 0.14, "grad_norm": 1.5267099142074585, "learning_rate": 4.3194254445964434e-05, "loss": 2.817, "step": 1592 }, { "epoch": 0.14, "grad_norm": 1.7478032112121582, "learning_rate": 4.317715458276334e-05, "loss": 3.0036, "step": 1596 }, { "epoch": 0.14, "grad_norm": 1.9281939268112183, "learning_rate": 4.316005471956224e-05, "loss": 2.9689, "step": 1600 }, { "epoch": 0.14, "grad_norm": 1.5747674703598022, "learning_rate": 4.314295485636115e-05, "loss": 2.7691, "step": 1604 }, { "epoch": 0.14, "grad_norm": 1.530669093132019, "learning_rate": 4.312585499316006e-05, "loss": 2.7754, "step": 1608 }, { "epoch": 0.14, "grad_norm": 1.4007419347763062, "learning_rate": 4.3108755129958964e-05, "loss": 2.6687, "step": 1612 }, { "epoch": 0.14, "grad_norm": 1.7224704027175903, "learning_rate": 4.309165526675787e-05, "loss": 3.0238, "step": 1616 }, { "epoch": 0.14, "grad_norm": 1.688553810119629, "learning_rate": 4.307455540355677e-05, "loss": 2.7985, "step": 1620 }, { "epoch": 0.14, "grad_norm": 1.3165500164031982, "learning_rate": 4.305745554035568e-05, "loss": 2.8546, "step": 1624 }, { "epoch": 0.14, "grad_norm": 1.4722650051116943, "learning_rate": 4.3040355677154587e-05, "loss": 2.8455, "step": 1628 }, { "epoch": 0.14, "grad_norm": 1.5830330848693848, "learning_rate": 4.302325581395349e-05, "loss": 2.709, "step": 1632 }, { "epoch": 0.14, "grad_norm": 1.5168038606643677, "learning_rate": 4.3006155950752394e-05, "loss": 2.8194, "step": 1636 }, { "epoch": 0.14, "grad_norm": 1.527031660079956, "learning_rate": 4.29890560875513e-05, "loss": 2.7123, "step": 1640 }, { "epoch": 0.14, "grad_norm": 1.492450475692749, "learning_rate": 4.297195622435021e-05, "loss": 2.9356, "step": 1644 }, { "epoch": 0.14, "grad_norm": 1.4466654062271118, "learning_rate": 4.2954856361149116e-05, "loss": 2.8938, "step": 1648 }, { "epoch": 0.14, "grad_norm": 1.6020383834838867, "learning_rate": 4.293775649794802e-05, "loss": 2.8547, "step": 1652 }, { "epoch": 0.14, "grad_norm": 1.5876679420471191, "learning_rate": 4.2920656634746924e-05, "loss": 2.7769, "step": 1656 }, { "epoch": 0.14, "grad_norm": 1.4787455797195435, "learning_rate": 4.2903556771545825e-05, "loss": 2.6641, "step": 1660 }, { "epoch": 0.14, "grad_norm": 1.6105437278747559, "learning_rate": 4.288645690834473e-05, "loss": 2.775, "step": 1664 }, { "epoch": 0.14, "grad_norm": 1.5835809707641602, "learning_rate": 4.2869357045143646e-05, "loss": 3.019, "step": 1668 }, { "epoch": 0.14, "grad_norm": 2.103267192840576, "learning_rate": 4.285225718194255e-05, "loss": 2.828, "step": 1672 }, { "epoch": 0.14, "grad_norm": 1.5641860961914062, "learning_rate": 4.2835157318741454e-05, "loss": 2.8392, "step": 1676 }, { "epoch": 0.14, "grad_norm": 1.475889801979065, "learning_rate": 4.2818057455540355e-05, "loss": 2.9558, "step": 1680 }, { "epoch": 0.14, "grad_norm": 1.3875279426574707, "learning_rate": 4.280095759233926e-05, "loss": 2.7178, "step": 1684 }, { "epoch": 0.14, "grad_norm": 1.6096158027648926, "learning_rate": 4.278385772913817e-05, "loss": 2.7963, "step": 1688 }, { "epoch": 0.14, "grad_norm": 1.4312164783477783, "learning_rate": 4.276675786593708e-05, "loss": 2.7887, "step": 1692 }, { "epoch": 0.15, "grad_norm": 2.0804941654205322, "learning_rate": 4.2749658002735984e-05, "loss": 2.9118, "step": 1696 }, { "epoch": 0.15, "grad_norm": 1.606184482574463, "learning_rate": 4.2732558139534885e-05, "loss": 2.7951, "step": 1700 }, { "epoch": 0.15, "grad_norm": 1.4609088897705078, "learning_rate": 4.271545827633379e-05, "loss": 2.8464, "step": 1704 }, { "epoch": 0.15, "grad_norm": 1.3830453157424927, "learning_rate": 4.26983584131327e-05, "loss": 2.9156, "step": 1708 }, { "epoch": 0.15, "grad_norm": 1.4420268535614014, "learning_rate": 4.26812585499316e-05, "loss": 2.5969, "step": 1712 }, { "epoch": 0.15, "grad_norm": 1.4451963901519775, "learning_rate": 4.266415868673051e-05, "loss": 2.9125, "step": 1716 }, { "epoch": 0.15, "grad_norm": 1.5134412050247192, "learning_rate": 4.2647058823529415e-05, "loss": 2.6605, "step": 1720 }, { "epoch": 0.15, "grad_norm": 1.561641812324524, "learning_rate": 4.262995896032832e-05, "loss": 2.8285, "step": 1724 }, { "epoch": 0.15, "grad_norm": 1.4210346937179565, "learning_rate": 4.261285909712722e-05, "loss": 2.7912, "step": 1728 }, { "epoch": 0.15, "grad_norm": 1.4329392910003662, "learning_rate": 4.259575923392613e-05, "loss": 2.8862, "step": 1732 }, { "epoch": 0.15, "grad_norm": 1.6114985942840576, "learning_rate": 4.257865937072504e-05, "loss": 2.7819, "step": 1736 }, { "epoch": 0.15, "grad_norm": 1.6259700059890747, "learning_rate": 4.256155950752394e-05, "loss": 2.9296, "step": 1740 }, { "epoch": 0.15, "grad_norm": 1.523130178451538, "learning_rate": 4.2544459644322845e-05, "loss": 2.7139, "step": 1744 }, { "epoch": 0.15, "grad_norm": 1.7980687618255615, "learning_rate": 4.252735978112175e-05, "loss": 2.7185, "step": 1748 }, { "epoch": 0.15, "grad_norm": 1.73753821849823, "learning_rate": 4.251025991792066e-05, "loss": 2.7322, "step": 1752 }, { "epoch": 0.15, "grad_norm": 1.7766021490097046, "learning_rate": 4.249316005471957e-05, "loss": 2.5836, "step": 1756 }, { "epoch": 0.15, "grad_norm": 1.4891935586929321, "learning_rate": 4.247606019151847e-05, "loss": 2.931, "step": 1760 }, { "epoch": 0.15, "grad_norm": 1.4233758449554443, "learning_rate": 4.2458960328317375e-05, "loss": 2.9084, "step": 1764 }, { "epoch": 0.15, "grad_norm": 1.5510305166244507, "learning_rate": 4.2441860465116276e-05, "loss": 2.8017, "step": 1768 }, { "epoch": 0.15, "grad_norm": 1.3143435716629028, "learning_rate": 4.242476060191519e-05, "loss": 2.7212, "step": 1772 }, { "epoch": 0.15, "grad_norm": 1.5766334533691406, "learning_rate": 4.24076607387141e-05, "loss": 2.7551, "step": 1776 }, { "epoch": 0.15, "grad_norm": 1.551622986793518, "learning_rate": 4.2390560875513e-05, "loss": 2.8498, "step": 1780 }, { "epoch": 0.15, "grad_norm": 2.2195143699645996, "learning_rate": 4.2373461012311905e-05, "loss": 2.811, "step": 1784 }, { "epoch": 0.15, "grad_norm": 1.530758023262024, "learning_rate": 4.2356361149110806e-05, "loss": 2.772, "step": 1788 }, { "epoch": 0.15, "grad_norm": 1.5073307752609253, "learning_rate": 4.233926128590971e-05, "loss": 2.7401, "step": 1792 }, { "epoch": 0.15, "grad_norm": 1.6950387954711914, "learning_rate": 4.232216142270862e-05, "loss": 2.7554, "step": 1796 }, { "epoch": 0.15, "grad_norm": 1.389538288116455, "learning_rate": 4.230506155950753e-05, "loss": 2.8187, "step": 1800 }, { "epoch": 0.15, "grad_norm": 1.592817783355713, "learning_rate": 4.2287961696306435e-05, "loss": 2.8686, "step": 1804 }, { "epoch": 0.15, "grad_norm": 1.6047254800796509, "learning_rate": 4.2270861833105336e-05, "loss": 2.8033, "step": 1808 }, { "epoch": 0.15, "grad_norm": 1.4730092287063599, "learning_rate": 4.225376196990424e-05, "loss": 2.6616, "step": 1812 }, { "epoch": 0.16, "grad_norm": 1.4784797430038452, "learning_rate": 4.223666210670315e-05, "loss": 2.8108, "step": 1816 }, { "epoch": 0.16, "grad_norm": 1.6881104707717896, "learning_rate": 4.221956224350205e-05, "loss": 2.8155, "step": 1820 }, { "epoch": 0.16, "grad_norm": 1.4548227787017822, "learning_rate": 4.220246238030096e-05, "loss": 2.7195, "step": 1824 }, { "epoch": 0.16, "grad_norm": 1.5578104257583618, "learning_rate": 4.2185362517099866e-05, "loss": 2.8058, "step": 1828 }, { "epoch": 0.16, "grad_norm": 1.6127967834472656, "learning_rate": 4.216826265389877e-05, "loss": 2.8783, "step": 1832 }, { "epoch": 0.16, "grad_norm": 1.883389949798584, "learning_rate": 4.215116279069768e-05, "loss": 2.6328, "step": 1836 }, { "epoch": 0.16, "grad_norm": 1.6216742992401123, "learning_rate": 4.213406292749658e-05, "loss": 2.8021, "step": 1840 }, { "epoch": 0.16, "grad_norm": 1.7198604345321655, "learning_rate": 4.211696306429549e-05, "loss": 2.6467, "step": 1844 }, { "epoch": 0.16, "grad_norm": 1.3806415796279907, "learning_rate": 4.209986320109439e-05, "loss": 2.8337, "step": 1848 }, { "epoch": 0.16, "grad_norm": 1.9981590509414673, "learning_rate": 4.2082763337893296e-05, "loss": 2.7637, "step": 1852 }, { "epoch": 0.16, "grad_norm": 1.4233486652374268, "learning_rate": 4.2065663474692204e-05, "loss": 2.6964, "step": 1856 }, { "epoch": 0.16, "grad_norm": 1.4599206447601318, "learning_rate": 4.204856361149111e-05, "loss": 2.8918, "step": 1860 }, { "epoch": 0.16, "grad_norm": 1.6856427192687988, "learning_rate": 4.203146374829002e-05, "loss": 2.7184, "step": 1864 }, { "epoch": 0.16, "grad_norm": 1.5229926109313965, "learning_rate": 4.201436388508892e-05, "loss": 2.7457, "step": 1868 }, { "epoch": 0.16, "grad_norm": 1.8297462463378906, "learning_rate": 4.1997264021887826e-05, "loss": 2.8097, "step": 1872 }, { "epoch": 0.16, "grad_norm": 1.4381957054138184, "learning_rate": 4.198016415868673e-05, "loss": 2.6845, "step": 1876 }, { "epoch": 0.16, "grad_norm": 1.8839136362075806, "learning_rate": 4.196306429548564e-05, "loss": 2.7721, "step": 1880 }, { "epoch": 0.16, "grad_norm": 1.6147316694259644, "learning_rate": 4.194596443228455e-05, "loss": 2.7529, "step": 1884 }, { "epoch": 0.16, "grad_norm": 1.4473137855529785, "learning_rate": 4.192886456908345e-05, "loss": 2.7582, "step": 1888 }, { "epoch": 0.16, "grad_norm": 1.6229139566421509, "learning_rate": 4.1911764705882356e-05, "loss": 2.7579, "step": 1892 }, { "epoch": 0.16, "grad_norm": 1.669582724571228, "learning_rate": 4.189466484268126e-05, "loss": 2.7924, "step": 1896 }, { "epoch": 0.16, "grad_norm": 1.4859235286712646, "learning_rate": 4.1877564979480164e-05, "loss": 2.6415, "step": 1900 }, { "epoch": 0.16, "grad_norm": 1.4661961793899536, "learning_rate": 4.186046511627907e-05, "loss": 2.5649, "step": 1904 }, { "epoch": 0.16, "grad_norm": 1.4079793691635132, "learning_rate": 4.184336525307798e-05, "loss": 2.6718, "step": 1908 }, { "epoch": 0.16, "grad_norm": 1.7637715339660645, "learning_rate": 4.1826265389876886e-05, "loss": 2.7626, "step": 1912 }, { "epoch": 0.16, "grad_norm": 1.4910571575164795, "learning_rate": 4.1809165526675787e-05, "loss": 2.683, "step": 1916 }, { "epoch": 0.16, "grad_norm": 1.709197759628296, "learning_rate": 4.1792065663474694e-05, "loss": 2.8062, "step": 1920 }, { "epoch": 0.16, "grad_norm": 1.8040099143981934, "learning_rate": 4.17749658002736e-05, "loss": 2.8158, "step": 1924 }, { "epoch": 0.16, "grad_norm": 1.4271622896194458, "learning_rate": 4.17578659370725e-05, "loss": 2.8655, "step": 1928 }, { "epoch": 0.17, "grad_norm": 1.6832592487335205, "learning_rate": 4.174076607387141e-05, "loss": 2.9054, "step": 1932 }, { "epoch": 0.17, "grad_norm": 1.674769639968872, "learning_rate": 4.1723666210670316e-05, "loss": 2.7739, "step": 1936 }, { "epoch": 0.17, "grad_norm": 1.3221181631088257, "learning_rate": 4.1706566347469224e-05, "loss": 2.7352, "step": 1940 }, { "epoch": 0.17, "grad_norm": 1.6557152271270752, "learning_rate": 4.168946648426813e-05, "loss": 2.7729, "step": 1944 }, { "epoch": 0.17, "grad_norm": 1.5999726057052612, "learning_rate": 4.167236662106703e-05, "loss": 2.8399, "step": 1948 }, { "epoch": 0.17, "grad_norm": 1.5852841138839722, "learning_rate": 4.165526675786594e-05, "loss": 2.8804, "step": 1952 }, { "epoch": 0.17, "grad_norm": 1.634738802909851, "learning_rate": 4.163816689466484e-05, "loss": 2.6103, "step": 1956 }, { "epoch": 0.17, "grad_norm": 1.5783636569976807, "learning_rate": 4.1621067031463754e-05, "loss": 2.4903, "step": 1960 }, { "epoch": 0.17, "grad_norm": 1.4169994592666626, "learning_rate": 4.160396716826266e-05, "loss": 2.6708, "step": 1964 }, { "epoch": 0.17, "grad_norm": 2.4021108150482178, "learning_rate": 4.158686730506156e-05, "loss": 2.6697, "step": 1968 }, { "epoch": 0.17, "grad_norm": 1.4128559827804565, "learning_rate": 4.156976744186047e-05, "loss": 2.7368, "step": 1972 }, { "epoch": 0.17, "grad_norm": 1.5087906122207642, "learning_rate": 4.155266757865937e-05, "loss": 2.7219, "step": 1976 }, { "epoch": 0.17, "grad_norm": 6.4628214836120605, "learning_rate": 4.153556771545828e-05, "loss": 2.7273, "step": 1980 }, { "epoch": 0.17, "grad_norm": 1.5917600393295288, "learning_rate": 4.1518467852257184e-05, "loss": 2.885, "step": 1984 }, { "epoch": 0.17, "grad_norm": 1.4118601083755493, "learning_rate": 4.150136798905609e-05, "loss": 2.7849, "step": 1988 }, { "epoch": 0.17, "grad_norm": 1.4783868789672852, "learning_rate": 4.1484268125855e-05, "loss": 2.8531, "step": 1992 }, { "epoch": 0.17, "grad_norm": 1.6047073602676392, "learning_rate": 4.14671682626539e-05, "loss": 2.784, "step": 1996 }, { "epoch": 0.17, "grad_norm": 1.565877079963684, "learning_rate": 4.145006839945281e-05, "loss": 2.6856, "step": 2000 }, { "epoch": 0.17, "grad_norm": 1.4095959663391113, "learning_rate": 4.143296853625171e-05, "loss": 2.7933, "step": 2004 }, { "epoch": 0.17, "grad_norm": 1.3851944208145142, "learning_rate": 4.1415868673050615e-05, "loss": 2.7286, "step": 2008 }, { "epoch": 0.17, "grad_norm": 1.4448835849761963, "learning_rate": 4.139876880984952e-05, "loss": 2.667, "step": 2012 }, { "epoch": 0.17, "grad_norm": 1.2041794061660767, "learning_rate": 4.138166894664843e-05, "loss": 2.553, "step": 2016 }, { "epoch": 0.17, "grad_norm": 1.5906238555908203, "learning_rate": 4.136456908344734e-05, "loss": 2.7194, "step": 2020 }, { "epoch": 0.17, "grad_norm": 1.493734359741211, "learning_rate": 4.134746922024624e-05, "loss": 2.6597, "step": 2024 }, { "epoch": 0.17, "grad_norm": 1.485025405883789, "learning_rate": 4.1330369357045145e-05, "loss": 2.7412, "step": 2028 }, { "epoch": 0.17, "grad_norm": 1.6668092012405396, "learning_rate": 4.131326949384405e-05, "loss": 2.7559, "step": 2032 }, { "epoch": 0.17, "grad_norm": 1.6972142457962036, "learning_rate": 4.129616963064295e-05, "loss": 2.6343, "step": 2036 }, { "epoch": 0.17, "grad_norm": 1.6248745918273926, "learning_rate": 4.127906976744187e-05, "loss": 2.749, "step": 2040 }, { "epoch": 0.17, "grad_norm": 1.486807942390442, "learning_rate": 4.126196990424077e-05, "loss": 2.5962, "step": 2044 }, { "epoch": 0.18, "grad_norm": 1.558280110359192, "learning_rate": 4.1244870041039675e-05, "loss": 2.7067, "step": 2048 }, { "epoch": 0.18, "grad_norm": 1.78428053855896, "learning_rate": 4.122777017783858e-05, "loss": 2.7901, "step": 2052 }, { "epoch": 0.18, "grad_norm": 1.5545462369918823, "learning_rate": 4.121067031463748e-05, "loss": 2.7868, "step": 2056 }, { "epoch": 0.18, "grad_norm": 1.5399055480957031, "learning_rate": 4.119357045143639e-05, "loss": 2.8405, "step": 2060 }, { "epoch": 0.18, "grad_norm": 1.535919189453125, "learning_rate": 4.11764705882353e-05, "loss": 2.6297, "step": 2064 }, { "epoch": 0.18, "grad_norm": 1.3103563785552979, "learning_rate": 4.1159370725034205e-05, "loss": 2.6334, "step": 2068 }, { "epoch": 0.18, "grad_norm": 1.4171690940856934, "learning_rate": 4.114227086183311e-05, "loss": 2.5261, "step": 2072 }, { "epoch": 0.18, "grad_norm": 1.5107210874557495, "learning_rate": 4.112517099863201e-05, "loss": 2.5669, "step": 2076 }, { "epoch": 0.18, "grad_norm": 1.4645127058029175, "learning_rate": 4.110807113543092e-05, "loss": 2.6166, "step": 2080 }, { "epoch": 0.18, "grad_norm": 1.8557233810424805, "learning_rate": 4.109097127222982e-05, "loss": 2.6651, "step": 2084 }, { "epoch": 0.18, "grad_norm": 1.4747710227966309, "learning_rate": 4.107387140902873e-05, "loss": 2.9066, "step": 2088 }, { "epoch": 0.18, "grad_norm": 1.4575109481811523, "learning_rate": 4.1056771545827635e-05, "loss": 2.7079, "step": 2092 }, { "epoch": 0.18, "grad_norm": 1.5356212854385376, "learning_rate": 4.103967168262654e-05, "loss": 2.711, "step": 2096 }, { "epoch": 0.18, "grad_norm": 1.4428212642669678, "learning_rate": 4.102257181942545e-05, "loss": 2.652, "step": 2100 }, { "epoch": 0.18, "grad_norm": 1.6778663396835327, "learning_rate": 4.100547195622435e-05, "loss": 2.7872, "step": 2104 }, { "epoch": 0.18, "grad_norm": 1.4642986059188843, "learning_rate": 4.098837209302326e-05, "loss": 2.587, "step": 2108 }, { "epoch": 0.18, "grad_norm": 1.5550886392593384, "learning_rate": 4.097127222982216e-05, "loss": 2.6646, "step": 2112 }, { "epoch": 0.18, "grad_norm": 1.6808562278747559, "learning_rate": 4.0954172366621066e-05, "loss": 2.588, "step": 2116 }, { "epoch": 0.18, "grad_norm": 1.7080578804016113, "learning_rate": 4.093707250341998e-05, "loss": 2.6144, "step": 2120 }, { "epoch": 0.18, "grad_norm": 1.8413456678390503, "learning_rate": 4.091997264021888e-05, "loss": 2.8098, "step": 2124 }, { "epoch": 0.18, "grad_norm": 1.4448609352111816, "learning_rate": 4.090287277701779e-05, "loss": 2.7619, "step": 2128 }, { "epoch": 0.18, "grad_norm": 1.5994160175323486, "learning_rate": 4.088577291381669e-05, "loss": 2.6444, "step": 2132 }, { "epoch": 0.18, "grad_norm": 1.5190199613571167, "learning_rate": 4.0868673050615596e-05, "loss": 2.5597, "step": 2136 }, { "epoch": 0.18, "grad_norm": 1.5521091222763062, "learning_rate": 4.08515731874145e-05, "loss": 2.6333, "step": 2140 }, { "epoch": 0.18, "grad_norm": 1.6080738306045532, "learning_rate": 4.083447332421341e-05, "loss": 2.5827, "step": 2144 }, { "epoch": 0.18, "grad_norm": 2.0923423767089844, "learning_rate": 4.081737346101232e-05, "loss": 2.6463, "step": 2148 }, { "epoch": 0.18, "grad_norm": 1.4243260622024536, "learning_rate": 4.080027359781122e-05, "loss": 2.6702, "step": 2152 }, { "epoch": 0.18, "grad_norm": 1.5463379621505737, "learning_rate": 4.0783173734610126e-05, "loss": 2.6284, "step": 2156 }, { "epoch": 0.18, "grad_norm": 1.4061999320983887, "learning_rate": 4.076607387140903e-05, "loss": 2.4983, "step": 2160 }, { "epoch": 0.19, "grad_norm": 1.3713377714157104, "learning_rate": 4.0748974008207933e-05, "loss": 2.5406, "step": 2164 }, { "epoch": 0.19, "grad_norm": 1.6995548009872437, "learning_rate": 4.073187414500684e-05, "loss": 2.5039, "step": 2168 }, { "epoch": 0.19, "grad_norm": 1.644362449645996, "learning_rate": 4.071477428180575e-05, "loss": 2.7398, "step": 2172 }, { "epoch": 0.19, "grad_norm": 1.5956717729568481, "learning_rate": 4.0697674418604655e-05, "loss": 2.7333, "step": 2176 }, { "epoch": 0.19, "grad_norm": 1.567484736442566, "learning_rate": 4.068057455540356e-05, "loss": 2.727, "step": 2180 }, { "epoch": 0.19, "grad_norm": 1.573633074760437, "learning_rate": 4.066347469220246e-05, "loss": 2.639, "step": 2184 }, { "epoch": 0.19, "grad_norm": 1.5799304246902466, "learning_rate": 4.064637482900137e-05, "loss": 2.735, "step": 2188 }, { "epoch": 0.19, "grad_norm": 1.5772294998168945, "learning_rate": 4.062927496580027e-05, "loss": 2.5753, "step": 2192 }, { "epoch": 0.19, "grad_norm": 1.4983608722686768, "learning_rate": 4.061217510259918e-05, "loss": 2.5989, "step": 2196 }, { "epoch": 0.19, "grad_norm": 1.625112533569336, "learning_rate": 4.0595075239398086e-05, "loss": 2.6539, "step": 2200 }, { "epoch": 0.19, "grad_norm": 1.4899359941482544, "learning_rate": 4.057797537619699e-05, "loss": 2.7391, "step": 2204 }, { "epoch": 0.19, "grad_norm": 1.4032224416732788, "learning_rate": 4.05608755129959e-05, "loss": 2.4294, "step": 2208 }, { "epoch": 0.19, "grad_norm": 2.379164218902588, "learning_rate": 4.05437756497948e-05, "loss": 2.8045, "step": 2212 }, { "epoch": 0.19, "grad_norm": 1.4953522682189941, "learning_rate": 4.052667578659371e-05, "loss": 2.6627, "step": 2216 }, { "epoch": 0.19, "grad_norm": 1.3828809261322021, "learning_rate": 4.0509575923392616e-05, "loss": 2.5951, "step": 2220 }, { "epoch": 0.19, "grad_norm": 1.5651103258132935, "learning_rate": 4.0492476060191516e-05, "loss": 2.7003, "step": 2224 }, { "epoch": 0.19, "grad_norm": 1.7900563478469849, "learning_rate": 4.047537619699043e-05, "loss": 2.5935, "step": 2228 }, { "epoch": 0.19, "grad_norm": 1.9265758991241455, "learning_rate": 4.045827633378933e-05, "loss": 2.6538, "step": 2232 }, { "epoch": 0.19, "grad_norm": 1.4881079196929932, "learning_rate": 4.044117647058824e-05, "loss": 2.4857, "step": 2236 }, { "epoch": 0.19, "grad_norm": 1.4665502309799194, "learning_rate": 4.042407660738714e-05, "loss": 2.5874, "step": 2240 }, { "epoch": 0.19, "grad_norm": 1.5124859809875488, "learning_rate": 4.0406976744186046e-05, "loss": 2.8067, "step": 2244 }, { "epoch": 0.19, "grad_norm": 1.4011058807373047, "learning_rate": 4.0389876880984954e-05, "loss": 2.5625, "step": 2248 }, { "epoch": 0.19, "grad_norm": 1.469796895980835, "learning_rate": 4.037277701778386e-05, "loss": 2.7695, "step": 2252 }, { "epoch": 0.19, "grad_norm": 1.610304355621338, "learning_rate": 4.035567715458277e-05, "loss": 2.5637, "step": 2256 }, { "epoch": 0.19, "grad_norm": 1.3841075897216797, "learning_rate": 4.033857729138167e-05, "loss": 2.7228, "step": 2260 }, { "epoch": 0.19, "grad_norm": 1.6841098070144653, "learning_rate": 4.0321477428180576e-05, "loss": 2.6203, "step": 2264 }, { "epoch": 0.19, "grad_norm": 1.5729596614837646, "learning_rate": 4.0304377564979484e-05, "loss": 2.5021, "step": 2268 }, { "epoch": 0.19, "grad_norm": 1.6014264822006226, "learning_rate": 4.0287277701778384e-05, "loss": 2.6825, "step": 2272 }, { "epoch": 0.19, "grad_norm": 1.3496001958847046, "learning_rate": 4.027017783857729e-05, "loss": 2.5271, "step": 2276 }, { "epoch": 0.19, "grad_norm": 1.5231562852859497, "learning_rate": 4.02530779753762e-05, "loss": 2.6128, "step": 2280 }, { "epoch": 0.2, "grad_norm": 1.7021198272705078, "learning_rate": 4.0235978112175106e-05, "loss": 2.7152, "step": 2284 }, { "epoch": 0.2, "grad_norm": 1.4971519708633423, "learning_rate": 4.0218878248974014e-05, "loss": 2.6637, "step": 2288 }, { "epoch": 0.2, "grad_norm": 1.6225758790969849, "learning_rate": 4.0201778385772914e-05, "loss": 2.6871, "step": 2292 }, { "epoch": 0.2, "grad_norm": 1.5330243110656738, "learning_rate": 4.018467852257182e-05, "loss": 2.5721, "step": 2296 }, { "epoch": 0.2, "grad_norm": 1.4374375343322754, "learning_rate": 4.016757865937072e-05, "loss": 2.6393, "step": 2300 }, { "epoch": 0.2, "grad_norm": 1.5932408571243286, "learning_rate": 4.015047879616963e-05, "loss": 2.5341, "step": 2304 }, { "epoch": 0.2, "grad_norm": 1.7230651378631592, "learning_rate": 4.0133378932968544e-05, "loss": 2.6912, "step": 2308 }, { "epoch": 0.2, "grad_norm": 1.5877001285552979, "learning_rate": 4.0116279069767444e-05, "loss": 2.6632, "step": 2312 }, { "epoch": 0.2, "grad_norm": 1.6193994283676147, "learning_rate": 4.009917920656635e-05, "loss": 2.3792, "step": 2316 }, { "epoch": 0.2, "grad_norm": 1.5001564025878906, "learning_rate": 4.008207934336525e-05, "loss": 2.538, "step": 2320 }, { "epoch": 0.2, "grad_norm": 1.5603808164596558, "learning_rate": 4.006497948016416e-05, "loss": 2.5206, "step": 2324 }, { "epoch": 0.2, "grad_norm": 1.4274383783340454, "learning_rate": 4.004787961696307e-05, "loss": 2.6341, "step": 2328 }, { "epoch": 0.2, "grad_norm": 1.5026893615722656, "learning_rate": 4.0030779753761974e-05, "loss": 2.5531, "step": 2332 }, { "epoch": 0.2, "grad_norm": 1.415287733078003, "learning_rate": 4.001367989056088e-05, "loss": 2.6344, "step": 2336 }, { "epoch": 0.2, "grad_norm": 1.61545991897583, "learning_rate": 3.999658002735978e-05, "loss": 2.5476, "step": 2340 }, { "epoch": 0.2, "grad_norm": 1.4016505479812622, "learning_rate": 3.997948016415869e-05, "loss": 2.562, "step": 2344 }, { "epoch": 0.2, "grad_norm": 1.665184497833252, "learning_rate": 3.99623803009576e-05, "loss": 2.6033, "step": 2348 }, { "epoch": 0.2, "grad_norm": 1.562911868095398, "learning_rate": 3.99452804377565e-05, "loss": 2.6955, "step": 2352 }, { "epoch": 0.2, "grad_norm": 1.477481484413147, "learning_rate": 3.9928180574555405e-05, "loss": 2.66, "step": 2356 }, { "epoch": 0.2, "grad_norm": 1.4621987342834473, "learning_rate": 3.991108071135431e-05, "loss": 2.4728, "step": 2360 }, { "epoch": 0.2, "grad_norm": 1.3315041065216064, "learning_rate": 3.989398084815322e-05, "loss": 2.5248, "step": 2364 }, { "epoch": 0.2, "grad_norm": 1.876422643661499, "learning_rate": 3.987688098495212e-05, "loss": 2.6436, "step": 2368 }, { "epoch": 0.2, "grad_norm": 1.47256600856781, "learning_rate": 3.985978112175103e-05, "loss": 2.5254, "step": 2372 }, { "epoch": 0.2, "grad_norm": 1.6386239528656006, "learning_rate": 3.9842681258549935e-05, "loss": 2.6176, "step": 2376 }, { "epoch": 0.2, "grad_norm": 1.3476667404174805, "learning_rate": 3.9825581395348835e-05, "loss": 2.4059, "step": 2380 }, { "epoch": 0.2, "grad_norm": 1.436629056930542, "learning_rate": 3.980848153214774e-05, "loss": 2.6304, "step": 2384 }, { "epoch": 0.2, "grad_norm": 1.3741427659988403, "learning_rate": 3.979138166894665e-05, "loss": 2.5209, "step": 2388 }, { "epoch": 0.2, "grad_norm": 1.5190575122833252, "learning_rate": 3.977428180574556e-05, "loss": 2.4463, "step": 2392 }, { "epoch": 0.2, "grad_norm": 1.389143943786621, "learning_rate": 3.9757181942544465e-05, "loss": 2.6484, "step": 2396 }, { "epoch": 0.21, "grad_norm": 1.3748509883880615, "learning_rate": 3.9740082079343365e-05, "loss": 2.4275, "step": 2400 }, { "epoch": 0.21, "grad_norm": 1.7997660636901855, "learning_rate": 3.972298221614227e-05, "loss": 2.6384, "step": 2404 }, { "epoch": 0.21, "grad_norm": 1.4528918266296387, "learning_rate": 3.970588235294117e-05, "loss": 2.6204, "step": 2408 }, { "epoch": 0.21, "grad_norm": 1.421633243560791, "learning_rate": 3.968878248974009e-05, "loss": 2.5918, "step": 2412 }, { "epoch": 0.21, "grad_norm": 1.4433393478393555, "learning_rate": 3.9671682626538994e-05, "loss": 2.6482, "step": 2416 }, { "epoch": 0.21, "grad_norm": 1.2582066059112549, "learning_rate": 3.9654582763337895e-05, "loss": 2.413, "step": 2420 }, { "epoch": 0.21, "grad_norm": 1.394639253616333, "learning_rate": 3.96374829001368e-05, "loss": 2.4778, "step": 2424 }, { "epoch": 0.21, "grad_norm": 1.424068570137024, "learning_rate": 3.96203830369357e-05, "loss": 2.6462, "step": 2428 }, { "epoch": 0.21, "grad_norm": 1.5364696979522705, "learning_rate": 3.960328317373461e-05, "loss": 2.5108, "step": 2432 }, { "epoch": 0.21, "grad_norm": 1.5929545164108276, "learning_rate": 3.958618331053352e-05, "loss": 2.5638, "step": 2436 }, { "epoch": 0.21, "grad_norm": 1.5119692087173462, "learning_rate": 3.9569083447332425e-05, "loss": 2.6631, "step": 2440 }, { "epoch": 0.21, "grad_norm": 1.573603630065918, "learning_rate": 3.955198358413133e-05, "loss": 2.7071, "step": 2444 }, { "epoch": 0.21, "grad_norm": 1.563513159751892, "learning_rate": 3.953488372093023e-05, "loss": 2.4158, "step": 2448 }, { "epoch": 0.21, "grad_norm": 1.572964072227478, "learning_rate": 3.951778385772914e-05, "loss": 2.6124, "step": 2452 }, { "epoch": 0.21, "grad_norm": 1.4729305505752563, "learning_rate": 3.950068399452805e-05, "loss": 2.4546, "step": 2456 }, { "epoch": 0.21, "grad_norm": 1.3578672409057617, "learning_rate": 3.948358413132695e-05, "loss": 2.5572, "step": 2460 }, { "epoch": 0.21, "grad_norm": 1.456809401512146, "learning_rate": 3.9466484268125855e-05, "loss": 2.6155, "step": 2464 }, { "epoch": 0.21, "grad_norm": 1.417240023612976, "learning_rate": 3.944938440492476e-05, "loss": 2.6249, "step": 2468 }, { "epoch": 0.21, "grad_norm": 1.6173032522201538, "learning_rate": 3.943228454172367e-05, "loss": 2.3838, "step": 2472 }, { "epoch": 0.21, "grad_norm": 1.5841619968414307, "learning_rate": 3.941518467852258e-05, "loss": 2.5878, "step": 2476 }, { "epoch": 0.21, "grad_norm": 1.526047945022583, "learning_rate": 3.939808481532148e-05, "loss": 2.6055, "step": 2480 }, { "epoch": 0.21, "grad_norm": 1.375135064125061, "learning_rate": 3.9380984952120385e-05, "loss": 2.4982, "step": 2484 }, { "epoch": 0.21, "grad_norm": 2.0735349655151367, "learning_rate": 3.9363885088919286e-05, "loss": 2.5964, "step": 2488 }, { "epoch": 0.21, "grad_norm": 1.4464576244354248, "learning_rate": 3.93467852257182e-05, "loss": 2.5043, "step": 2492 }, { "epoch": 0.21, "grad_norm": 1.4004943370819092, "learning_rate": 3.93296853625171e-05, "loss": 2.3337, "step": 2496 }, { "epoch": 0.21, "grad_norm": 1.5199224948883057, "learning_rate": 3.931258549931601e-05, "loss": 2.5759, "step": 2500 }, { "epoch": 0.21, "grad_norm": 2.1466448307037354, "learning_rate": 3.9295485636114915e-05, "loss": 2.6014, "step": 2504 }, { "epoch": 0.21, "grad_norm": 1.3880302906036377, "learning_rate": 3.9278385772913816e-05, "loss": 2.4023, "step": 2508 }, { "epoch": 0.21, "grad_norm": 1.501216173171997, "learning_rate": 3.926128590971272e-05, "loss": 2.6306, "step": 2512 }, { "epoch": 0.22, "grad_norm": 1.5279217958450317, "learning_rate": 3.924418604651163e-05, "loss": 2.5785, "step": 2516 }, { "epoch": 0.22, "grad_norm": 1.4741731882095337, "learning_rate": 3.922708618331054e-05, "loss": 2.5174, "step": 2520 }, { "epoch": 0.22, "grad_norm": 1.6046087741851807, "learning_rate": 3.9209986320109445e-05, "loss": 2.6823, "step": 2524 }, { "epoch": 0.22, "grad_norm": 1.4944065809249878, "learning_rate": 3.9192886456908346e-05, "loss": 2.7186, "step": 2528 }, { "epoch": 0.22, "grad_norm": 1.3260518312454224, "learning_rate": 3.917578659370725e-05, "loss": 2.3741, "step": 2532 }, { "epoch": 0.22, "grad_norm": 1.591991901397705, "learning_rate": 3.9158686730506154e-05, "loss": 2.5534, "step": 2536 }, { "epoch": 0.22, "grad_norm": 1.5477372407913208, "learning_rate": 3.914158686730506e-05, "loss": 2.568, "step": 2540 }, { "epoch": 0.22, "grad_norm": 1.5402345657348633, "learning_rate": 3.912448700410397e-05, "loss": 2.5795, "step": 2544 }, { "epoch": 0.22, "grad_norm": 2.2363271713256836, "learning_rate": 3.9107387140902876e-05, "loss": 2.4248, "step": 2548 }, { "epoch": 0.22, "grad_norm": 1.2395968437194824, "learning_rate": 3.909028727770178e-05, "loss": 2.6678, "step": 2552 }, { "epoch": 0.22, "grad_norm": 1.5662035942077637, "learning_rate": 3.9073187414500684e-05, "loss": 2.6065, "step": 2556 }, { "epoch": 0.22, "grad_norm": 1.3480955362319946, "learning_rate": 3.905608755129959e-05, "loss": 2.5286, "step": 2560 }, { "epoch": 0.22, "grad_norm": 1.5644841194152832, "learning_rate": 3.90389876880985e-05, "loss": 2.439, "step": 2564 }, { "epoch": 0.22, "grad_norm": 1.556207537651062, "learning_rate": 3.90218878248974e-05, "loss": 2.5694, "step": 2568 }, { "epoch": 0.22, "grad_norm": 1.4543012380599976, "learning_rate": 3.9004787961696306e-05, "loss": 2.5277, "step": 2572 }, { "epoch": 0.22, "grad_norm": 2.2177674770355225, "learning_rate": 3.8987688098495214e-05, "loss": 2.5114, "step": 2576 }, { "epoch": 0.22, "grad_norm": 1.5734175443649292, "learning_rate": 3.897058823529412e-05, "loss": 2.469, "step": 2580 }, { "epoch": 0.22, "grad_norm": 1.5571191310882568, "learning_rate": 3.895348837209303e-05, "loss": 2.5852, "step": 2584 }, { "epoch": 0.22, "grad_norm": 1.5742443799972534, "learning_rate": 3.893638850889193e-05, "loss": 2.429, "step": 2588 }, { "epoch": 0.22, "grad_norm": 1.4786051511764526, "learning_rate": 3.8919288645690836e-05, "loss": 2.6498, "step": 2592 }, { "epoch": 0.22, "grad_norm": 1.5065261125564575, "learning_rate": 3.890218878248974e-05, "loss": 2.5307, "step": 2596 }, { "epoch": 0.22, "grad_norm": 1.9272600412368774, "learning_rate": 3.888508891928865e-05, "loss": 2.5898, "step": 2600 }, { "epoch": 0.22, "grad_norm": 1.5606516599655151, "learning_rate": 3.886798905608756e-05, "loss": 2.4504, "step": 2604 }, { "epoch": 0.22, "grad_norm": 1.4898043870925903, "learning_rate": 3.885088919288646e-05, "loss": 2.4989, "step": 2608 }, { "epoch": 0.22, "grad_norm": 1.878469467163086, "learning_rate": 3.8833789329685366e-05, "loss": 2.6865, "step": 2612 }, { "epoch": 0.22, "grad_norm": 1.8781071901321411, "learning_rate": 3.881668946648427e-05, "loss": 2.5997, "step": 2616 }, { "epoch": 0.22, "grad_norm": 1.6376733779907227, "learning_rate": 3.8799589603283174e-05, "loss": 2.5791, "step": 2620 }, { "epoch": 0.22, "grad_norm": 1.4639188051223755, "learning_rate": 3.878248974008208e-05, "loss": 2.3323, "step": 2624 }, { "epoch": 0.22, "grad_norm": 1.5879504680633545, "learning_rate": 3.876538987688099e-05, "loss": 2.4866, "step": 2628 }, { "epoch": 0.23, "grad_norm": 1.5608062744140625, "learning_rate": 3.8748290013679896e-05, "loss": 2.3779, "step": 2632 }, { "epoch": 0.23, "grad_norm": 1.390249252319336, "learning_rate": 3.87311901504788e-05, "loss": 2.3288, "step": 2636 }, { "epoch": 0.23, "grad_norm": 1.357127070426941, "learning_rate": 3.8714090287277704e-05, "loss": 2.5016, "step": 2640 }, { "epoch": 0.23, "grad_norm": 1.4445414543151855, "learning_rate": 3.8696990424076605e-05, "loss": 2.5401, "step": 2644 }, { "epoch": 0.23, "grad_norm": 1.4636071920394897, "learning_rate": 3.867989056087551e-05, "loss": 2.3385, "step": 2648 }, { "epoch": 0.23, "grad_norm": 1.4558255672454834, "learning_rate": 3.866279069767442e-05, "loss": 2.3848, "step": 2652 }, { "epoch": 0.23, "grad_norm": 1.6711084842681885, "learning_rate": 3.864569083447333e-05, "loss": 2.4541, "step": 2656 }, { "epoch": 0.23, "grad_norm": 2.5824813842773438, "learning_rate": 3.8628590971272234e-05, "loss": 2.4533, "step": 2660 }, { "epoch": 0.23, "grad_norm": 1.5478894710540771, "learning_rate": 3.8611491108071135e-05, "loss": 2.4698, "step": 2664 }, { "epoch": 0.23, "grad_norm": 2.2620136737823486, "learning_rate": 3.859439124487004e-05, "loss": 2.746, "step": 2668 }, { "epoch": 0.23, "grad_norm": 2.4922306537628174, "learning_rate": 3.857729138166895e-05, "loss": 2.6415, "step": 2672 }, { "epoch": 0.23, "grad_norm": 1.476605772972107, "learning_rate": 3.856019151846785e-05, "loss": 2.4816, "step": 2676 }, { "epoch": 0.23, "grad_norm": 1.7977741956710815, "learning_rate": 3.8543091655266764e-05, "loss": 2.4403, "step": 2680 }, { "epoch": 0.23, "grad_norm": 1.3775659799575806, "learning_rate": 3.8525991792065665e-05, "loss": 2.425, "step": 2684 }, { "epoch": 0.23, "grad_norm": 1.645110845565796, "learning_rate": 3.850889192886457e-05, "loss": 2.4656, "step": 2688 }, { "epoch": 0.23, "grad_norm": 1.3301700353622437, "learning_rate": 3.849179206566348e-05, "loss": 2.4745, "step": 2692 }, { "epoch": 0.23, "grad_norm": 1.7708193063735962, "learning_rate": 3.847469220246238e-05, "loss": 2.6024, "step": 2696 }, { "epoch": 0.23, "grad_norm": 1.4959282875061035, "learning_rate": 3.845759233926129e-05, "loss": 2.5686, "step": 2700 }, { "epoch": 0.23, "grad_norm": 1.6098191738128662, "learning_rate": 3.8440492476060194e-05, "loss": 2.3471, "step": 2704 }, { "epoch": 0.23, "grad_norm": 1.3885496854782104, "learning_rate": 3.84233926128591e-05, "loss": 2.3912, "step": 2708 }, { "epoch": 0.23, "grad_norm": 1.4107521772384644, "learning_rate": 3.840629274965801e-05, "loss": 2.397, "step": 2712 }, { "epoch": 0.23, "grad_norm": 1.6801328659057617, "learning_rate": 3.838919288645691e-05, "loss": 2.5848, "step": 2716 }, { "epoch": 0.23, "grad_norm": 1.3853564262390137, "learning_rate": 3.837209302325582e-05, "loss": 2.2871, "step": 2720 }, { "epoch": 0.23, "grad_norm": 1.3792238235473633, "learning_rate": 3.835499316005472e-05, "loss": 2.45, "step": 2724 }, { "epoch": 0.23, "grad_norm": 1.6318219900131226, "learning_rate": 3.8337893296853625e-05, "loss": 2.6403, "step": 2728 }, { "epoch": 0.23, "grad_norm": 1.7002670764923096, "learning_rate": 3.832079343365253e-05, "loss": 2.5752, "step": 2732 }, { "epoch": 0.23, "grad_norm": 1.3887910842895508, "learning_rate": 3.830369357045144e-05, "loss": 2.4802, "step": 2736 }, { "epoch": 0.23, "grad_norm": 1.4336968660354614, "learning_rate": 3.828659370725035e-05, "loss": 2.4769, "step": 2740 }, { "epoch": 0.23, "grad_norm": 1.5607534646987915, "learning_rate": 3.826949384404925e-05, "loss": 2.4018, "step": 2744 }, { "epoch": 0.23, "grad_norm": 1.4709196090698242, "learning_rate": 3.8252393980848155e-05, "loss": 2.4358, "step": 2748 }, { "epoch": 0.24, "grad_norm": 1.3464138507843018, "learning_rate": 3.8235294117647055e-05, "loss": 2.5305, "step": 2752 }, { "epoch": 0.24, "grad_norm": 1.4054791927337646, "learning_rate": 3.821819425444596e-05, "loss": 2.4714, "step": 2756 }, { "epoch": 0.24, "grad_norm": 1.699159026145935, "learning_rate": 3.820109439124488e-05, "loss": 2.4277, "step": 2760 }, { "epoch": 0.24, "grad_norm": 1.3283201456069946, "learning_rate": 3.818399452804378e-05, "loss": 2.405, "step": 2764 }, { "epoch": 0.24, "grad_norm": 1.6334400177001953, "learning_rate": 3.8166894664842685e-05, "loss": 2.6551, "step": 2768 }, { "epoch": 0.24, "grad_norm": 1.4785411357879639, "learning_rate": 3.8149794801641585e-05, "loss": 2.5849, "step": 2772 }, { "epoch": 0.24, "grad_norm": 1.5953280925750732, "learning_rate": 3.813269493844049e-05, "loss": 2.4022, "step": 2776 }, { "epoch": 0.24, "grad_norm": 1.4671732187271118, "learning_rate": 3.81155950752394e-05, "loss": 2.3134, "step": 2780 }, { "epoch": 0.24, "grad_norm": 1.461626410484314, "learning_rate": 3.809849521203831e-05, "loss": 2.4263, "step": 2784 }, { "epoch": 0.24, "grad_norm": 1.5867888927459717, "learning_rate": 3.8081395348837215e-05, "loss": 2.4209, "step": 2788 }, { "epoch": 0.24, "grad_norm": 1.4046286344528198, "learning_rate": 3.8064295485636115e-05, "loss": 2.3673, "step": 2792 }, { "epoch": 0.24, "grad_norm": 1.6984418630599976, "learning_rate": 3.804719562243502e-05, "loss": 2.4858, "step": 2796 }, { "epoch": 0.24, "grad_norm": 1.6407222747802734, "learning_rate": 3.803009575923393e-05, "loss": 2.4795, "step": 2800 }, { "epoch": 0.24, "grad_norm": 1.4830973148345947, "learning_rate": 3.801299589603283e-05, "loss": 2.5164, "step": 2804 }, { "epoch": 0.24, "grad_norm": 1.2736021280288696, "learning_rate": 3.799589603283174e-05, "loss": 2.3541, "step": 2808 }, { "epoch": 0.24, "grad_norm": 1.564693570137024, "learning_rate": 3.7978796169630645e-05, "loss": 2.3652, "step": 2812 }, { "epoch": 0.24, "grad_norm": 1.5286824703216553, "learning_rate": 3.796169630642955e-05, "loss": 2.5063, "step": 2816 }, { "epoch": 0.24, "grad_norm": 1.466086745262146, "learning_rate": 3.794459644322846e-05, "loss": 2.5876, "step": 2820 }, { "epoch": 0.24, "grad_norm": 1.5650023221969604, "learning_rate": 3.792749658002736e-05, "loss": 2.4316, "step": 2824 }, { "epoch": 0.24, "grad_norm": 1.4438190460205078, "learning_rate": 3.791039671682627e-05, "loss": 2.3659, "step": 2828 }, { "epoch": 0.24, "grad_norm": 1.8183629512786865, "learning_rate": 3.789329685362517e-05, "loss": 2.4262, "step": 2832 }, { "epoch": 0.24, "grad_norm": 1.856266736984253, "learning_rate": 3.7876196990424076e-05, "loss": 2.457, "step": 2836 }, { "epoch": 0.24, "grad_norm": 1.6876139640808105, "learning_rate": 3.785909712722299e-05, "loss": 2.4234, "step": 2840 }, { "epoch": 0.24, "grad_norm": 1.3969238996505737, "learning_rate": 3.784199726402189e-05, "loss": 2.3044, "step": 2844 }, { "epoch": 0.24, "grad_norm": 1.3718289136886597, "learning_rate": 3.78248974008208e-05, "loss": 2.4889, "step": 2848 }, { "epoch": 0.24, "grad_norm": 1.5117738246917725, "learning_rate": 3.78077975376197e-05, "loss": 2.3139, "step": 2852 }, { "epoch": 0.24, "grad_norm": 1.564997673034668, "learning_rate": 3.7790697674418606e-05, "loss": 2.6801, "step": 2856 }, { "epoch": 0.24, "grad_norm": 1.5139191150665283, "learning_rate": 3.777359781121751e-05, "loss": 2.3891, "step": 2860 }, { "epoch": 0.24, "grad_norm": 1.5658632516860962, "learning_rate": 3.775649794801642e-05, "loss": 2.4947, "step": 2864 }, { "epoch": 0.25, "grad_norm": 1.5268386602401733, "learning_rate": 3.773939808481533e-05, "loss": 2.4591, "step": 2868 }, { "epoch": 0.25, "grad_norm": 1.977797508239746, "learning_rate": 3.772229822161423e-05, "loss": 2.5294, "step": 2872 }, { "epoch": 0.25, "grad_norm": 1.368944764137268, "learning_rate": 3.7705198358413136e-05, "loss": 2.2917, "step": 2876 }, { "epoch": 0.25, "grad_norm": 1.502429723739624, "learning_rate": 3.7688098495212036e-05, "loss": 2.5181, "step": 2880 }, { "epoch": 0.25, "grad_norm": 1.4561350345611572, "learning_rate": 3.7670998632010944e-05, "loss": 2.4059, "step": 2884 }, { "epoch": 0.25, "grad_norm": 1.9906591176986694, "learning_rate": 3.765389876880985e-05, "loss": 2.475, "step": 2888 }, { "epoch": 0.25, "grad_norm": 1.607998251914978, "learning_rate": 3.763679890560876e-05, "loss": 2.668, "step": 2892 }, { "epoch": 0.25, "grad_norm": 1.4631425142288208, "learning_rate": 3.7619699042407666e-05, "loss": 2.4278, "step": 2896 }, { "epoch": 0.25, "grad_norm": 1.3773366212844849, "learning_rate": 3.7602599179206566e-05, "loss": 2.5251, "step": 2900 }, { "epoch": 0.25, "grad_norm": 1.8001102209091187, "learning_rate": 3.7585499316005474e-05, "loss": 2.4981, "step": 2904 }, { "epoch": 0.25, "grad_norm": 1.4368932247161865, "learning_rate": 3.756839945280438e-05, "loss": 2.3902, "step": 2908 }, { "epoch": 0.25, "grad_norm": 1.5067082643508911, "learning_rate": 3.755129958960328e-05, "loss": 2.4835, "step": 2912 }, { "epoch": 0.25, "grad_norm": 1.6977261304855347, "learning_rate": 3.753419972640219e-05, "loss": 2.4424, "step": 2916 }, { "epoch": 0.25, "grad_norm": 1.5715231895446777, "learning_rate": 3.7517099863201096e-05, "loss": 2.376, "step": 2920 }, { "epoch": 0.25, "grad_norm": 1.8464969396591187, "learning_rate": 3.7500000000000003e-05, "loss": 2.3253, "step": 2924 }, { "epoch": 0.25, "grad_norm": 1.4582910537719727, "learning_rate": 3.748290013679891e-05, "loss": 2.2077, "step": 2928 }, { "epoch": 0.25, "grad_norm": 1.3920525312423706, "learning_rate": 3.746580027359781e-05, "loss": 2.2856, "step": 2932 }, { "epoch": 0.25, "grad_norm": 1.594838261604309, "learning_rate": 3.744870041039672e-05, "loss": 2.441, "step": 2936 }, { "epoch": 0.25, "grad_norm": 1.608803629875183, "learning_rate": 3.743160054719562e-05, "loss": 2.5194, "step": 2940 }, { "epoch": 0.25, "grad_norm": 1.492957592010498, "learning_rate": 3.741450068399453e-05, "loss": 2.3489, "step": 2944 }, { "epoch": 0.25, "grad_norm": 4.186474323272705, "learning_rate": 3.739740082079344e-05, "loss": 2.5155, "step": 2948 }, { "epoch": 0.25, "grad_norm": 1.3143714666366577, "learning_rate": 3.738030095759234e-05, "loss": 2.5857, "step": 2952 }, { "epoch": 0.25, "grad_norm": 2.287429094314575, "learning_rate": 3.736320109439125e-05, "loss": 2.4715, "step": 2956 }, { "epoch": 0.25, "grad_norm": 1.3473930358886719, "learning_rate": 3.734610123119015e-05, "loss": 2.4606, "step": 2960 }, { "epoch": 0.25, "grad_norm": 1.6107207536697388, "learning_rate": 3.7329001367989057e-05, "loss": 2.4097, "step": 2964 }, { "epoch": 0.25, "grad_norm": 1.959020733833313, "learning_rate": 3.7311901504787964e-05, "loss": 2.5748, "step": 2968 }, { "epoch": 0.25, "grad_norm": 1.483560562133789, "learning_rate": 3.729480164158687e-05, "loss": 2.4517, "step": 2972 }, { "epoch": 0.25, "grad_norm": 1.4312429428100586, "learning_rate": 3.727770177838578e-05, "loss": 2.6805, "step": 2976 }, { "epoch": 0.25, "grad_norm": 1.3787798881530762, "learning_rate": 3.726060191518468e-05, "loss": 2.3881, "step": 2980 }, { "epoch": 0.26, "grad_norm": 1.6351797580718994, "learning_rate": 3.7243502051983587e-05, "loss": 2.4977, "step": 2984 }, { "epoch": 0.26, "grad_norm": 1.615729808807373, "learning_rate": 3.722640218878249e-05, "loss": 2.4574, "step": 2988 }, { "epoch": 0.26, "grad_norm": 1.4360637664794922, "learning_rate": 3.7209302325581394e-05, "loss": 2.4282, "step": 2992 }, { "epoch": 0.26, "grad_norm": 1.5573183298110962, "learning_rate": 3.71922024623803e-05, "loss": 2.3941, "step": 2996 }, { "epoch": 0.26, "grad_norm": 1.398614764213562, "learning_rate": 3.717510259917921e-05, "loss": 2.4977, "step": 3000 }, { "epoch": 0.26, "grad_norm": 1.7220714092254639, "learning_rate": 3.7158002735978116e-05, "loss": 2.3985, "step": 3004 }, { "epoch": 0.26, "grad_norm": 1.470420479774475, "learning_rate": 3.714090287277702e-05, "loss": 2.3886, "step": 3008 }, { "epoch": 0.26, "grad_norm": 1.478071689605713, "learning_rate": 3.7123803009575924e-05, "loss": 2.3968, "step": 3012 }, { "epoch": 0.26, "grad_norm": 1.4263502359390259, "learning_rate": 3.710670314637483e-05, "loss": 2.5225, "step": 3016 }, { "epoch": 0.26, "grad_norm": 1.7249011993408203, "learning_rate": 3.708960328317373e-05, "loss": 2.538, "step": 3020 }, { "epoch": 0.26, "grad_norm": 1.5792475938796997, "learning_rate": 3.707250341997264e-05, "loss": 2.6076, "step": 3024 }, { "epoch": 0.26, "grad_norm": 1.558214783668518, "learning_rate": 3.705540355677155e-05, "loss": 2.4636, "step": 3028 }, { "epoch": 0.26, "grad_norm": 1.395617961883545, "learning_rate": 3.7038303693570454e-05, "loss": 2.4319, "step": 3032 }, { "epoch": 0.26, "grad_norm": 1.6385856866836548, "learning_rate": 3.702120383036936e-05, "loss": 2.7062, "step": 3036 }, { "epoch": 0.26, "grad_norm": 1.4634343385696411, "learning_rate": 3.700410396716826e-05, "loss": 2.5087, "step": 3040 }, { "epoch": 0.26, "grad_norm": 1.6941776275634766, "learning_rate": 3.698700410396717e-05, "loss": 2.3967, "step": 3044 }, { "epoch": 0.26, "grad_norm": 1.5948282480239868, "learning_rate": 3.696990424076607e-05, "loss": 2.371, "step": 3048 }, { "epoch": 0.26, "grad_norm": 1.6816450357437134, "learning_rate": 3.6952804377564984e-05, "loss": 2.3695, "step": 3052 }, { "epoch": 0.26, "grad_norm": 1.4531540870666504, "learning_rate": 3.693570451436389e-05, "loss": 2.375, "step": 3056 }, { "epoch": 0.26, "grad_norm": 1.337263584136963, "learning_rate": 3.691860465116279e-05, "loss": 2.3965, "step": 3060 }, { "epoch": 0.26, "grad_norm": 1.5273529291152954, "learning_rate": 3.69015047879617e-05, "loss": 2.3984, "step": 3064 }, { "epoch": 0.26, "grad_norm": 1.4230504035949707, "learning_rate": 3.68844049247606e-05, "loss": 2.4717, "step": 3068 }, { "epoch": 0.26, "grad_norm": 1.6386831998825073, "learning_rate": 3.686730506155951e-05, "loss": 2.5523, "step": 3072 }, { "epoch": 0.26, "grad_norm": 1.5074329376220703, "learning_rate": 3.6850205198358415e-05, "loss": 2.4326, "step": 3076 }, { "epoch": 0.26, "grad_norm": 1.5377895832061768, "learning_rate": 3.683310533515732e-05, "loss": 2.3838, "step": 3080 }, { "epoch": 0.26, "grad_norm": 1.5310460329055786, "learning_rate": 3.681600547195623e-05, "loss": 2.5792, "step": 3084 }, { "epoch": 0.26, "grad_norm": 1.486627221107483, "learning_rate": 3.679890560875513e-05, "loss": 2.3772, "step": 3088 }, { "epoch": 0.26, "grad_norm": 1.9239519834518433, "learning_rate": 3.678180574555404e-05, "loss": 2.4844, "step": 3092 }, { "epoch": 0.26, "grad_norm": 1.4352439641952515, "learning_rate": 3.6764705882352945e-05, "loss": 2.3233, "step": 3096 }, { "epoch": 0.27, "grad_norm": 1.540075421333313, "learning_rate": 3.6747606019151845e-05, "loss": 2.3849, "step": 3100 }, { "epoch": 0.27, "grad_norm": 1.5862653255462646, "learning_rate": 3.673050615595075e-05, "loss": 2.3689, "step": 3104 }, { "epoch": 0.27, "grad_norm": 1.4095063209533691, "learning_rate": 3.671340629274966e-05, "loss": 2.3854, "step": 3108 }, { "epoch": 0.27, "grad_norm": 1.3623970746994019, "learning_rate": 3.669630642954857e-05, "loss": 2.2868, "step": 3112 }, { "epoch": 0.27, "grad_norm": 1.4835405349731445, "learning_rate": 3.667920656634747e-05, "loss": 2.3524, "step": 3116 }, { "epoch": 0.27, "grad_norm": 1.790026068687439, "learning_rate": 3.6662106703146375e-05, "loss": 2.379, "step": 3120 }, { "epoch": 0.27, "grad_norm": 1.452793836593628, "learning_rate": 3.664500683994528e-05, "loss": 2.2826, "step": 3124 }, { "epoch": 0.27, "grad_norm": 1.438991665840149, "learning_rate": 3.662790697674418e-05, "loss": 2.4365, "step": 3128 }, { "epoch": 0.27, "grad_norm": 1.6045351028442383, "learning_rate": 3.66108071135431e-05, "loss": 2.2628, "step": 3132 }, { "epoch": 0.27, "grad_norm": 1.568385362625122, "learning_rate": 3.6593707250342e-05, "loss": 2.465, "step": 3136 }, { "epoch": 0.27, "grad_norm": 1.451302170753479, "learning_rate": 3.6576607387140905e-05, "loss": 2.3324, "step": 3140 }, { "epoch": 0.27, "grad_norm": 1.4595221281051636, "learning_rate": 3.655950752393981e-05, "loss": 2.4677, "step": 3144 }, { "epoch": 0.27, "grad_norm": 1.5326175689697266, "learning_rate": 3.654240766073871e-05, "loss": 2.458, "step": 3148 }, { "epoch": 0.27, "grad_norm": 1.651713490486145, "learning_rate": 3.652530779753762e-05, "loss": 2.3911, "step": 3152 }, { "epoch": 0.27, "grad_norm": 1.475189208984375, "learning_rate": 3.650820793433653e-05, "loss": 2.4598, "step": 3156 }, { "epoch": 0.27, "grad_norm": 1.7967069149017334, "learning_rate": 3.6491108071135435e-05, "loss": 2.41, "step": 3160 }, { "epoch": 0.27, "grad_norm": 1.380880355834961, "learning_rate": 3.647400820793434e-05, "loss": 2.3952, "step": 3164 }, { "epoch": 0.27, "grad_norm": 1.392062783241272, "learning_rate": 3.645690834473324e-05, "loss": 2.5062, "step": 3168 }, { "epoch": 0.27, "grad_norm": 1.5198041200637817, "learning_rate": 3.643980848153215e-05, "loss": 2.3908, "step": 3172 }, { "epoch": 0.27, "grad_norm": 1.3749966621398926, "learning_rate": 3.642270861833105e-05, "loss": 2.3101, "step": 3176 }, { "epoch": 0.27, "grad_norm": 1.559767246246338, "learning_rate": 3.640560875512996e-05, "loss": 2.3929, "step": 3180 }, { "epoch": 0.27, "grad_norm": 1.3676600456237793, "learning_rate": 3.6388508891928866e-05, "loss": 2.4963, "step": 3184 }, { "epoch": 0.27, "grad_norm": 1.4563826322555542, "learning_rate": 3.637140902872777e-05, "loss": 2.4377, "step": 3188 }, { "epoch": 0.27, "grad_norm": 1.5789074897766113, "learning_rate": 3.635430916552668e-05, "loss": 2.484, "step": 3192 }, { "epoch": 0.27, "grad_norm": 1.633427381515503, "learning_rate": 3.633720930232558e-05, "loss": 2.5337, "step": 3196 }, { "epoch": 0.27, "grad_norm": 1.6052011251449585, "learning_rate": 3.632010943912449e-05, "loss": 2.2981, "step": 3200 }, { "epoch": 0.27, "grad_norm": 1.4831585884094238, "learning_rate": 3.6303009575923396e-05, "loss": 2.2886, "step": 3204 }, { "epoch": 0.27, "grad_norm": 1.715384840965271, "learning_rate": 3.6285909712722296e-05, "loss": 2.2484, "step": 3208 }, { "epoch": 0.27, "grad_norm": 1.5094373226165771, "learning_rate": 3.626880984952121e-05, "loss": 2.3951, "step": 3212 }, { "epoch": 0.27, "grad_norm": 1.4048563241958618, "learning_rate": 3.625170998632011e-05, "loss": 2.3434, "step": 3216 }, { "epoch": 0.28, "grad_norm": 2.033588171005249, "learning_rate": 3.623461012311902e-05, "loss": 2.4892, "step": 3220 }, { "epoch": 0.28, "grad_norm": 1.5815871953964233, "learning_rate": 3.6217510259917926e-05, "loss": 2.4064, "step": 3224 }, { "epoch": 0.28, "grad_norm": 1.5218071937561035, "learning_rate": 3.6200410396716826e-05, "loss": 2.4017, "step": 3228 }, { "epoch": 0.28, "grad_norm": 1.5122883319854736, "learning_rate": 3.6183310533515733e-05, "loss": 2.2099, "step": 3232 }, { "epoch": 0.28, "grad_norm": 1.6052781343460083, "learning_rate": 3.616621067031464e-05, "loss": 2.4246, "step": 3236 }, { "epoch": 0.28, "grad_norm": 1.629193663597107, "learning_rate": 3.614911080711355e-05, "loss": 2.5361, "step": 3240 }, { "epoch": 0.28, "grad_norm": 1.478102445602417, "learning_rate": 3.613201094391245e-05, "loss": 2.2904, "step": 3244 }, { "epoch": 0.28, "grad_norm": 1.5051685571670532, "learning_rate": 3.6114911080711356e-05, "loss": 2.5879, "step": 3248 }, { "epoch": 0.28, "grad_norm": 1.39858078956604, "learning_rate": 3.609781121751026e-05, "loss": 2.3035, "step": 3252 }, { "epoch": 0.28, "grad_norm": 1.5903834104537964, "learning_rate": 3.6080711354309164e-05, "loss": 2.3173, "step": 3256 }, { "epoch": 0.28, "grad_norm": 1.5299763679504395, "learning_rate": 3.606361149110807e-05, "loss": 2.3344, "step": 3260 }, { "epoch": 0.28, "grad_norm": 1.5950665473937988, "learning_rate": 3.604651162790698e-05, "loss": 2.2693, "step": 3264 }, { "epoch": 0.28, "grad_norm": 1.4136861562728882, "learning_rate": 3.6029411764705886e-05, "loss": 2.3145, "step": 3268 }, { "epoch": 0.28, "grad_norm": 1.4014285802841187, "learning_rate": 3.601231190150479e-05, "loss": 2.4621, "step": 3272 }, { "epoch": 0.28, "grad_norm": 1.419997215270996, "learning_rate": 3.5995212038303694e-05, "loss": 2.3412, "step": 3276 }, { "epoch": 0.28, "grad_norm": 1.4487018585205078, "learning_rate": 3.59781121751026e-05, "loss": 2.3379, "step": 3280 }, { "epoch": 0.28, "grad_norm": 1.4295108318328857, "learning_rate": 3.59610123119015e-05, "loss": 2.3464, "step": 3284 }, { "epoch": 0.28, "grad_norm": 1.5380290746688843, "learning_rate": 3.594391244870041e-05, "loss": 2.1826, "step": 3288 }, { "epoch": 0.28, "grad_norm": 1.3480552434921265, "learning_rate": 3.5926812585499316e-05, "loss": 2.2881, "step": 3292 }, { "epoch": 0.28, "grad_norm": 1.5125305652618408, "learning_rate": 3.5909712722298224e-05, "loss": 2.4374, "step": 3296 }, { "epoch": 0.28, "grad_norm": 1.5424193143844604, "learning_rate": 3.589261285909713e-05, "loss": 2.3602, "step": 3300 }, { "epoch": 0.28, "grad_norm": 1.9561680555343628, "learning_rate": 3.587551299589603e-05, "loss": 2.598, "step": 3304 }, { "epoch": 0.28, "grad_norm": 1.4579107761383057, "learning_rate": 3.585841313269494e-05, "loss": 2.4397, "step": 3308 }, { "epoch": 0.28, "grad_norm": 1.3612507581710815, "learning_rate": 3.5841313269493846e-05, "loss": 2.3842, "step": 3312 }, { "epoch": 0.28, "grad_norm": 1.6156667470932007, "learning_rate": 3.582421340629275e-05, "loss": 2.4143, "step": 3316 }, { "epoch": 0.28, "grad_norm": 1.457672119140625, "learning_rate": 3.580711354309166e-05, "loss": 2.327, "step": 3320 }, { "epoch": 0.28, "grad_norm": 1.474558711051941, "learning_rate": 3.579001367989056e-05, "loss": 2.3957, "step": 3324 }, { "epoch": 0.28, "grad_norm": 1.4732122421264648, "learning_rate": 3.577291381668947e-05, "loss": 2.4016, "step": 3328 }, { "epoch": 0.28, "grad_norm": 1.4019269943237305, "learning_rate": 3.5755813953488376e-05, "loss": 2.4573, "step": 3332 }, { "epoch": 0.29, "grad_norm": 1.6624845266342163, "learning_rate": 3.573871409028728e-05, "loss": 2.2681, "step": 3336 }, { "epoch": 0.29, "grad_norm": 1.4161522388458252, "learning_rate": 3.5721614227086184e-05, "loss": 2.2106, "step": 3340 }, { "epoch": 0.29, "grad_norm": 1.3711514472961426, "learning_rate": 3.570451436388509e-05, "loss": 2.1563, "step": 3344 }, { "epoch": 0.29, "grad_norm": 1.575616478919983, "learning_rate": 3.5687414500684e-05, "loss": 2.3763, "step": 3348 }, { "epoch": 0.29, "grad_norm": 1.6165823936462402, "learning_rate": 3.5670314637482906e-05, "loss": 2.3312, "step": 3352 }, { "epoch": 0.29, "grad_norm": 1.3387306928634644, "learning_rate": 3.565321477428181e-05, "loss": 2.2245, "step": 3356 }, { "epoch": 0.29, "grad_norm": 1.541719675064087, "learning_rate": 3.5636114911080714e-05, "loss": 2.3347, "step": 3360 }, { "epoch": 0.29, "grad_norm": 1.4336274862289429, "learning_rate": 3.5619015047879615e-05, "loss": 2.3484, "step": 3364 }, { "epoch": 0.29, "grad_norm": 1.586674690246582, "learning_rate": 3.560191518467852e-05, "loss": 2.5278, "step": 3368 }, { "epoch": 0.29, "grad_norm": 1.429751992225647, "learning_rate": 3.558481532147743e-05, "loss": 2.2046, "step": 3372 }, { "epoch": 0.29, "grad_norm": 1.6239968538284302, "learning_rate": 3.556771545827634e-05, "loss": 2.4159, "step": 3376 }, { "epoch": 0.29, "grad_norm": 2.665915012359619, "learning_rate": 3.5550615595075244e-05, "loss": 2.3933, "step": 3380 }, { "epoch": 0.29, "grad_norm": 1.532535195350647, "learning_rate": 3.5533515731874145e-05, "loss": 2.514, "step": 3384 }, { "epoch": 0.29, "grad_norm": 1.8504074811935425, "learning_rate": 3.551641586867305e-05, "loss": 2.4337, "step": 3388 }, { "epoch": 0.29, "grad_norm": 1.6196644306182861, "learning_rate": 3.549931600547195e-05, "loss": 2.297, "step": 3392 }, { "epoch": 0.29, "grad_norm": 1.5914441347122192, "learning_rate": 3.548221614227086e-05, "loss": 2.3395, "step": 3396 }, { "epoch": 0.29, "grad_norm": 1.5233490467071533, "learning_rate": 3.5465116279069774e-05, "loss": 2.253, "step": 3400 }, { "epoch": 0.29, "grad_norm": 1.4033238887786865, "learning_rate": 3.5448016415868675e-05, "loss": 2.3872, "step": 3404 }, { "epoch": 0.29, "grad_norm": 1.485261082649231, "learning_rate": 3.543091655266758e-05, "loss": 2.367, "step": 3408 }, { "epoch": 0.29, "grad_norm": 1.3384130001068115, "learning_rate": 3.541381668946648e-05, "loss": 2.2609, "step": 3412 }, { "epoch": 0.29, "grad_norm": 1.3713083267211914, "learning_rate": 3.539671682626539e-05, "loss": 2.311, "step": 3416 }, { "epoch": 0.29, "grad_norm": 1.6065647602081299, "learning_rate": 3.53796169630643e-05, "loss": 2.1545, "step": 3420 }, { "epoch": 0.29, "grad_norm": 1.455736517906189, "learning_rate": 3.5362517099863205e-05, "loss": 2.3288, "step": 3424 }, { "epoch": 0.29, "grad_norm": 1.6390444040298462, "learning_rate": 3.534541723666211e-05, "loss": 2.3899, "step": 3428 }, { "epoch": 0.29, "grad_norm": 1.3747378587722778, "learning_rate": 3.532831737346101e-05, "loss": 2.3622, "step": 3432 }, { "epoch": 0.29, "grad_norm": 1.4323629140853882, "learning_rate": 3.531121751025992e-05, "loss": 2.2561, "step": 3436 }, { "epoch": 0.29, "grad_norm": 1.4392497539520264, "learning_rate": 3.529411764705883e-05, "loss": 2.4516, "step": 3440 }, { "epoch": 0.29, "grad_norm": 1.507541298866272, "learning_rate": 3.527701778385773e-05, "loss": 2.3012, "step": 3444 }, { "epoch": 0.29, "grad_norm": 1.6980098485946655, "learning_rate": 3.5259917920656635e-05, "loss": 2.3305, "step": 3448 }, { "epoch": 0.3, "grad_norm": 1.378029704093933, "learning_rate": 3.524281805745554e-05, "loss": 2.3159, "step": 3452 }, { "epoch": 0.3, "grad_norm": 1.4904416799545288, "learning_rate": 3.522571819425445e-05, "loss": 2.2613, "step": 3456 }, { "epoch": 0.3, "grad_norm": 1.5655815601348877, "learning_rate": 3.520861833105336e-05, "loss": 2.3091, "step": 3460 }, { "epoch": 0.3, "grad_norm": 1.4613752365112305, "learning_rate": 3.519151846785226e-05, "loss": 2.3852, "step": 3464 }, { "epoch": 0.3, "grad_norm": 1.735697627067566, "learning_rate": 3.5174418604651165e-05, "loss": 2.3746, "step": 3468 }, { "epoch": 0.3, "grad_norm": 1.56474769115448, "learning_rate": 3.5157318741450066e-05, "loss": 2.4085, "step": 3472 }, { "epoch": 0.3, "grad_norm": 1.6144722700119019, "learning_rate": 3.514021887824897e-05, "loss": 2.2048, "step": 3476 }, { "epoch": 0.3, "grad_norm": 1.4500874280929565, "learning_rate": 3.512311901504789e-05, "loss": 2.4197, "step": 3480 }, { "epoch": 0.3, "grad_norm": 1.5875989198684692, "learning_rate": 3.510601915184679e-05, "loss": 2.3395, "step": 3484 }, { "epoch": 0.3, "grad_norm": 1.5678739547729492, "learning_rate": 3.5088919288645695e-05, "loss": 2.4267, "step": 3488 }, { "epoch": 0.3, "grad_norm": 1.4578819274902344, "learning_rate": 3.5071819425444596e-05, "loss": 2.3653, "step": 3492 }, { "epoch": 0.3, "grad_norm": 1.8042303323745728, "learning_rate": 3.50547195622435e-05, "loss": 2.3785, "step": 3496 }, { "epoch": 0.3, "grad_norm": 1.4263440370559692, "learning_rate": 3.5037619699042403e-05, "loss": 2.3203, "step": 3500 }, { "epoch": 0.3, "grad_norm": 1.5346360206604004, "learning_rate": 3.502051983584132e-05, "loss": 2.3125, "step": 3504 }, { "epoch": 0.3, "grad_norm": 2.5747292041778564, "learning_rate": 3.5003419972640225e-05, "loss": 2.4275, "step": 3508 }, { "epoch": 0.3, "grad_norm": 1.7760605812072754, "learning_rate": 3.4986320109439126e-05, "loss": 2.6035, "step": 3512 }, { "epoch": 0.3, "grad_norm": 1.5713919401168823, "learning_rate": 3.496922024623803e-05, "loss": 2.3099, "step": 3516 }, { "epoch": 0.3, "grad_norm": 1.6528106927871704, "learning_rate": 3.4952120383036933e-05, "loss": 2.4168, "step": 3520 }, { "epoch": 0.3, "grad_norm": 1.4009822607040405, "learning_rate": 3.493502051983584e-05, "loss": 2.3955, "step": 3524 }, { "epoch": 0.3, "grad_norm": 1.7046074867248535, "learning_rate": 3.491792065663475e-05, "loss": 2.3926, "step": 3528 }, { "epoch": 0.3, "grad_norm": 1.3500627279281616, "learning_rate": 3.4900820793433655e-05, "loss": 2.3855, "step": 3532 }, { "epoch": 0.3, "grad_norm": 1.4953322410583496, "learning_rate": 3.488372093023256e-05, "loss": 2.3912, "step": 3536 }, { "epoch": 0.3, "grad_norm": 1.3733028173446655, "learning_rate": 3.486662106703146e-05, "loss": 2.2607, "step": 3540 }, { "epoch": 0.3, "grad_norm": 1.427222490310669, "learning_rate": 3.484952120383037e-05, "loss": 2.3234, "step": 3544 }, { "epoch": 0.3, "grad_norm": 1.4577361345291138, "learning_rate": 3.483242134062928e-05, "loss": 2.3509, "step": 3548 }, { "epoch": 0.3, "grad_norm": 1.3897165060043335, "learning_rate": 3.481532147742818e-05, "loss": 2.154, "step": 3552 }, { "epoch": 0.3, "grad_norm": 1.4338810443878174, "learning_rate": 3.4798221614227086e-05, "loss": 2.2335, "step": 3556 }, { "epoch": 0.3, "grad_norm": 1.6268779039382935, "learning_rate": 3.478112175102599e-05, "loss": 2.2263, "step": 3560 }, { "epoch": 0.3, "grad_norm": 1.4855129718780518, "learning_rate": 3.47640218878249e-05, "loss": 2.1839, "step": 3564 }, { "epoch": 0.31, "grad_norm": 1.525172233581543, "learning_rate": 3.474692202462381e-05, "loss": 2.3011, "step": 3568 }, { "epoch": 0.31, "grad_norm": 1.543357253074646, "learning_rate": 3.472982216142271e-05, "loss": 2.34, "step": 3572 }, { "epoch": 0.31, "grad_norm": 1.4775084257125854, "learning_rate": 3.4712722298221616e-05, "loss": 2.2593, "step": 3576 }, { "epoch": 0.31, "grad_norm": 1.3302327394485474, "learning_rate": 3.4695622435020516e-05, "loss": 2.1703, "step": 3580 }, { "epoch": 0.31, "grad_norm": 3.1734111309051514, "learning_rate": 3.467852257181943e-05, "loss": 2.3284, "step": 3584 }, { "epoch": 0.31, "grad_norm": 1.5368781089782715, "learning_rate": 3.466142270861834e-05, "loss": 2.3187, "step": 3588 }, { "epoch": 0.31, "grad_norm": 1.526089072227478, "learning_rate": 3.464432284541724e-05, "loss": 2.3247, "step": 3592 }, { "epoch": 0.31, "grad_norm": 1.460354208946228, "learning_rate": 3.4627222982216146e-05, "loss": 2.2871, "step": 3596 }, { "epoch": 0.31, "grad_norm": 1.5204929113388062, "learning_rate": 3.4610123119015046e-05, "loss": 2.2317, "step": 3600 }, { "epoch": 0.31, "grad_norm": 1.355778455734253, "learning_rate": 3.4593023255813954e-05, "loss": 2.2038, "step": 3604 }, { "epoch": 0.31, "grad_norm": 1.4250309467315674, "learning_rate": 3.457592339261286e-05, "loss": 2.288, "step": 3608 }, { "epoch": 0.31, "grad_norm": 1.529437780380249, "learning_rate": 3.455882352941177e-05, "loss": 2.2193, "step": 3612 }, { "epoch": 0.31, "grad_norm": 1.6923147439956665, "learning_rate": 3.4541723666210676e-05, "loss": 2.2283, "step": 3616 }, { "epoch": 0.31, "grad_norm": 1.5242345333099365, "learning_rate": 3.4524623803009576e-05, "loss": 2.3881, "step": 3620 }, { "epoch": 0.31, "grad_norm": 1.4843759536743164, "learning_rate": 3.4507523939808484e-05, "loss": 2.4458, "step": 3624 }, { "epoch": 0.31, "grad_norm": 1.361167550086975, "learning_rate": 3.4490424076607384e-05, "loss": 2.264, "step": 3628 }, { "epoch": 0.31, "grad_norm": 1.4210281372070312, "learning_rate": 3.447332421340629e-05, "loss": 2.2471, "step": 3632 }, { "epoch": 0.31, "grad_norm": 1.5141572952270508, "learning_rate": 3.44562243502052e-05, "loss": 2.3368, "step": 3636 }, { "epoch": 0.31, "grad_norm": 1.3176366090774536, "learning_rate": 3.4439124487004106e-05, "loss": 2.1529, "step": 3640 }, { "epoch": 0.31, "grad_norm": 1.4620164632797241, "learning_rate": 3.4422024623803014e-05, "loss": 2.3995, "step": 3644 }, { "epoch": 0.31, "grad_norm": 1.6192833185195923, "learning_rate": 3.4404924760601914e-05, "loss": 2.3621, "step": 3648 }, { "epoch": 0.31, "grad_norm": 1.459579348564148, "learning_rate": 3.438782489740082e-05, "loss": 2.3437, "step": 3652 }, { "epoch": 0.31, "grad_norm": 1.3151201009750366, "learning_rate": 3.437072503419973e-05, "loss": 2.318, "step": 3656 }, { "epoch": 0.31, "grad_norm": 1.4882017374038696, "learning_rate": 3.435362517099863e-05, "loss": 2.2267, "step": 3660 }, { "epoch": 0.31, "grad_norm": 1.3695846796035767, "learning_rate": 3.433652530779754e-05, "loss": 2.2106, "step": 3664 }, { "epoch": 0.31, "grad_norm": 1.8380380868911743, "learning_rate": 3.4319425444596444e-05, "loss": 2.3097, "step": 3668 }, { "epoch": 0.31, "grad_norm": 1.560056447982788, "learning_rate": 3.430232558139535e-05, "loss": 2.4494, "step": 3672 }, { "epoch": 0.31, "grad_norm": 1.4692696332931519, "learning_rate": 3.428522571819426e-05, "loss": 2.1774, "step": 3676 }, { "epoch": 0.31, "grad_norm": 1.585492730140686, "learning_rate": 3.426812585499316e-05, "loss": 2.2936, "step": 3680 }, { "epoch": 0.31, "grad_norm": 1.5503042936325073, "learning_rate": 3.425102599179207e-05, "loss": 2.2004, "step": 3684 }, { "epoch": 0.32, "grad_norm": 1.640663504600525, "learning_rate": 3.423392612859097e-05, "loss": 2.2647, "step": 3688 }, { "epoch": 0.32, "grad_norm": 1.4468867778778076, "learning_rate": 3.421682626538988e-05, "loss": 2.3467, "step": 3692 }, { "epoch": 0.32, "grad_norm": 1.5558197498321533, "learning_rate": 3.419972640218879e-05, "loss": 2.4436, "step": 3696 }, { "epoch": 0.32, "grad_norm": 1.489776372909546, "learning_rate": 3.418262653898769e-05, "loss": 2.3, "step": 3700 }, { "epoch": 0.32, "grad_norm": 1.4555796384811401, "learning_rate": 3.41655266757866e-05, "loss": 2.2145, "step": 3704 }, { "epoch": 0.32, "grad_norm": 1.3881696462631226, "learning_rate": 3.41484268125855e-05, "loss": 2.3039, "step": 3708 }, { "epoch": 0.32, "grad_norm": 1.412907361984253, "learning_rate": 3.4131326949384405e-05, "loss": 2.3594, "step": 3712 }, { "epoch": 0.32, "grad_norm": 1.746773362159729, "learning_rate": 3.411422708618331e-05, "loss": 2.2989, "step": 3716 }, { "epoch": 0.32, "grad_norm": 1.4776164293289185, "learning_rate": 3.409712722298222e-05, "loss": 2.3647, "step": 3720 }, { "epoch": 0.32, "grad_norm": 1.4442262649536133, "learning_rate": 3.408002735978113e-05, "loss": 2.1363, "step": 3724 }, { "epoch": 0.32, "grad_norm": 1.49587082862854, "learning_rate": 3.406292749658003e-05, "loss": 2.3467, "step": 3728 }, { "epoch": 0.32, "grad_norm": 1.8401976823806763, "learning_rate": 3.4045827633378935e-05, "loss": 2.4508, "step": 3732 }, { "epoch": 0.32, "grad_norm": 1.379357099533081, "learning_rate": 3.402872777017784e-05, "loss": 2.2629, "step": 3736 }, { "epoch": 0.32, "grad_norm": 1.590693712234497, "learning_rate": 3.401162790697674e-05, "loss": 2.3216, "step": 3740 }, { "epoch": 0.32, "grad_norm": 1.4456241130828857, "learning_rate": 3.399452804377565e-05, "loss": 2.3142, "step": 3744 }, { "epoch": 0.32, "grad_norm": 1.5368770360946655, "learning_rate": 3.397742818057456e-05, "loss": 2.2955, "step": 3748 }, { "epoch": 0.32, "grad_norm": 1.6656548976898193, "learning_rate": 3.3960328317373464e-05, "loss": 2.4015, "step": 3752 }, { "epoch": 0.32, "grad_norm": 1.519033432006836, "learning_rate": 3.3943228454172365e-05, "loss": 2.1557, "step": 3756 }, { "epoch": 0.32, "grad_norm": 1.56667959690094, "learning_rate": 3.392612859097127e-05, "loss": 2.5277, "step": 3760 }, { "epoch": 0.32, "grad_norm": 1.5404236316680908, "learning_rate": 3.390902872777018e-05, "loss": 2.2924, "step": 3764 }, { "epoch": 0.32, "grad_norm": 1.619696021080017, "learning_rate": 3.389192886456908e-05, "loss": 2.2094, "step": 3768 }, { "epoch": 0.32, "grad_norm": 1.3681530952453613, "learning_rate": 3.3874829001367994e-05, "loss": 2.231, "step": 3772 }, { "epoch": 0.32, "grad_norm": 1.7254544496536255, "learning_rate": 3.3857729138166895e-05, "loss": 2.2382, "step": 3776 }, { "epoch": 0.32, "grad_norm": 1.6230955123901367, "learning_rate": 3.38406292749658e-05, "loss": 2.1105, "step": 3780 }, { "epoch": 0.32, "grad_norm": 1.5223900079727173, "learning_rate": 3.382352941176471e-05, "loss": 2.1982, "step": 3784 }, { "epoch": 0.32, "grad_norm": 1.3313345909118652, "learning_rate": 3.380642954856361e-05, "loss": 2.1896, "step": 3788 }, { "epoch": 0.32, "grad_norm": 1.5461819171905518, "learning_rate": 3.378932968536252e-05, "loss": 2.2544, "step": 3792 }, { "epoch": 0.32, "grad_norm": 1.4953100681304932, "learning_rate": 3.3772229822161425e-05, "loss": 2.3158, "step": 3796 }, { "epoch": 0.32, "grad_norm": 1.4358595609664917, "learning_rate": 3.375512995896033e-05, "loss": 2.1203, "step": 3800 }, { "epoch": 0.33, "grad_norm": 1.6208328008651733, "learning_rate": 3.373803009575924e-05, "loss": 2.3804, "step": 3804 }, { "epoch": 0.33, "grad_norm": 1.58054518699646, "learning_rate": 3.372093023255814e-05, "loss": 2.2398, "step": 3808 }, { "epoch": 0.33, "grad_norm": 1.6291831731796265, "learning_rate": 3.370383036935705e-05, "loss": 2.2565, "step": 3812 }, { "epoch": 0.33, "grad_norm": 1.500659465789795, "learning_rate": 3.368673050615595e-05, "loss": 2.2771, "step": 3816 }, { "epoch": 0.33, "grad_norm": 1.3757811784744263, "learning_rate": 3.3669630642954855e-05, "loss": 2.2549, "step": 3820 }, { "epoch": 0.33, "grad_norm": 1.5131489038467407, "learning_rate": 3.365253077975376e-05, "loss": 2.1822, "step": 3824 }, { "epoch": 0.33, "grad_norm": 1.511177897453308, "learning_rate": 3.363543091655267e-05, "loss": 2.3104, "step": 3828 }, { "epoch": 0.33, "grad_norm": 1.4455082416534424, "learning_rate": 3.361833105335158e-05, "loss": 2.5066, "step": 3832 }, { "epoch": 0.33, "grad_norm": 1.3408520221710205, "learning_rate": 3.360123119015048e-05, "loss": 2.2685, "step": 3836 }, { "epoch": 0.33, "grad_norm": 1.3482859134674072, "learning_rate": 3.3584131326949385e-05, "loss": 2.4401, "step": 3840 }, { "epoch": 0.33, "grad_norm": 1.4088398218154907, "learning_rate": 3.356703146374829e-05, "loss": 2.3414, "step": 3844 }, { "epoch": 0.33, "grad_norm": 1.483805775642395, "learning_rate": 3.354993160054719e-05, "loss": 2.1907, "step": 3848 }, { "epoch": 0.33, "grad_norm": 1.4749683141708374, "learning_rate": 3.353283173734611e-05, "loss": 2.4228, "step": 3852 }, { "epoch": 0.33, "grad_norm": 1.5431963205337524, "learning_rate": 3.351573187414501e-05, "loss": 2.2988, "step": 3856 }, { "epoch": 0.33, "grad_norm": 1.7580955028533936, "learning_rate": 3.3498632010943915e-05, "loss": 2.2444, "step": 3860 }, { "epoch": 0.33, "grad_norm": 1.3347162008285522, "learning_rate": 3.348153214774282e-05, "loss": 2.3015, "step": 3864 }, { "epoch": 0.33, "grad_norm": 1.4706435203552246, "learning_rate": 3.346443228454172e-05, "loss": 2.3175, "step": 3868 }, { "epoch": 0.33, "grad_norm": 1.63969886302948, "learning_rate": 3.344733242134063e-05, "loss": 2.1465, "step": 3872 }, { "epoch": 0.33, "grad_norm": 2.056724786758423, "learning_rate": 3.343023255813954e-05, "loss": 2.3949, "step": 3876 }, { "epoch": 0.33, "grad_norm": 1.551066279411316, "learning_rate": 3.3413132694938445e-05, "loss": 2.2891, "step": 3880 }, { "epoch": 0.33, "grad_norm": 1.6610709428787231, "learning_rate": 3.3396032831737346e-05, "loss": 2.5973, "step": 3884 }, { "epoch": 0.33, "grad_norm": 1.5867125988006592, "learning_rate": 3.337893296853625e-05, "loss": 2.2614, "step": 3888 }, { "epoch": 0.33, "grad_norm": 1.8439687490463257, "learning_rate": 3.336183310533516e-05, "loss": 2.1923, "step": 3892 }, { "epoch": 0.33, "grad_norm": 1.4217692613601685, "learning_rate": 3.334473324213406e-05, "loss": 2.2826, "step": 3896 }, { "epoch": 0.33, "grad_norm": 1.2552785873413086, "learning_rate": 3.332763337893297e-05, "loss": 2.3539, "step": 3900 }, { "epoch": 0.33, "grad_norm": 1.5517688989639282, "learning_rate": 3.3310533515731876e-05, "loss": 2.2412, "step": 3904 }, { "epoch": 0.33, "grad_norm": 1.6065213680267334, "learning_rate": 3.329343365253078e-05, "loss": 2.2576, "step": 3908 }, { "epoch": 0.33, "grad_norm": 1.4800779819488525, "learning_rate": 3.327633378932969e-05, "loss": 2.324, "step": 3912 }, { "epoch": 0.33, "grad_norm": 1.3546322584152222, "learning_rate": 3.325923392612859e-05, "loss": 2.323, "step": 3916 }, { "epoch": 0.34, "grad_norm": 1.6678704023361206, "learning_rate": 3.32421340629275e-05, "loss": 2.3459, "step": 3920 }, { "epoch": 0.34, "grad_norm": 1.4288249015808105, "learning_rate": 3.32250341997264e-05, "loss": 2.2059, "step": 3924 }, { "epoch": 0.34, "grad_norm": 1.4324655532836914, "learning_rate": 3.3207934336525306e-05, "loss": 2.1479, "step": 3928 }, { "epoch": 0.34, "grad_norm": 1.5552873611450195, "learning_rate": 3.319083447332422e-05, "loss": 2.2897, "step": 3932 }, { "epoch": 0.34, "grad_norm": 1.4077160358428955, "learning_rate": 3.317373461012312e-05, "loss": 2.3246, "step": 3936 }, { "epoch": 0.34, "grad_norm": 1.4134106636047363, "learning_rate": 3.315663474692203e-05, "loss": 2.2485, "step": 3940 }, { "epoch": 0.34, "grad_norm": 1.670109510421753, "learning_rate": 3.313953488372093e-05, "loss": 2.2681, "step": 3944 }, { "epoch": 0.34, "grad_norm": 1.706309199333191, "learning_rate": 3.3122435020519836e-05, "loss": 2.2443, "step": 3948 }, { "epoch": 0.34, "grad_norm": 1.5278905630111694, "learning_rate": 3.3105335157318744e-05, "loss": 2.1895, "step": 3952 }, { "epoch": 0.34, "grad_norm": 1.3573322296142578, "learning_rate": 3.308823529411765e-05, "loss": 2.4002, "step": 3956 }, { "epoch": 0.34, "grad_norm": 1.383784532546997, "learning_rate": 3.307113543091656e-05, "loss": 2.3252, "step": 3960 }, { "epoch": 0.34, "grad_norm": 1.5072522163391113, "learning_rate": 3.305403556771546e-05, "loss": 2.3, "step": 3964 }, { "epoch": 0.34, "grad_norm": 1.5655957460403442, "learning_rate": 3.3036935704514366e-05, "loss": 2.1498, "step": 3968 }, { "epoch": 0.34, "grad_norm": 1.4027670621871948, "learning_rate": 3.3019835841313274e-05, "loss": 2.2169, "step": 3972 }, { "epoch": 0.34, "grad_norm": 1.5429667234420776, "learning_rate": 3.3002735978112174e-05, "loss": 2.3642, "step": 3976 }, { "epoch": 0.34, "grad_norm": 1.4422931671142578, "learning_rate": 3.298563611491108e-05, "loss": 2.1742, "step": 3980 }, { "epoch": 0.34, "grad_norm": 1.4613866806030273, "learning_rate": 3.296853625170999e-05, "loss": 2.4487, "step": 3984 }, { "epoch": 0.34, "grad_norm": 1.5176717042922974, "learning_rate": 3.2951436388508896e-05, "loss": 2.3083, "step": 3988 }, { "epoch": 0.34, "grad_norm": 1.7266799211502075, "learning_rate": 3.2934336525307803e-05, "loss": 2.2963, "step": 3992 }, { "epoch": 0.34, "grad_norm": 1.5054651498794556, "learning_rate": 3.2917236662106704e-05, "loss": 2.4407, "step": 3996 }, { "epoch": 0.34, "grad_norm": 1.4415006637573242, "learning_rate": 3.290013679890561e-05, "loss": 2.25, "step": 4000 }, { "epoch": 0.34, "grad_norm": 1.5419758558273315, "learning_rate": 3.288303693570451e-05, "loss": 2.3623, "step": 4004 }, { "epoch": 0.34, "grad_norm": 1.7471479177474976, "learning_rate": 3.286593707250342e-05, "loss": 2.4276, "step": 4008 }, { "epoch": 0.34, "grad_norm": 1.7161086797714233, "learning_rate": 3.284883720930233e-05, "loss": 2.3613, "step": 4012 }, { "epoch": 0.34, "grad_norm": 1.622846007347107, "learning_rate": 3.2831737346101234e-05, "loss": 2.2338, "step": 4016 }, { "epoch": 0.34, "grad_norm": 1.52423095703125, "learning_rate": 3.281463748290014e-05, "loss": 2.3147, "step": 4020 }, { "epoch": 0.34, "grad_norm": 1.4126418828964233, "learning_rate": 3.279753761969904e-05, "loss": 2.2087, "step": 4024 }, { "epoch": 0.34, "grad_norm": 1.316510796546936, "learning_rate": 3.278043775649795e-05, "loss": 2.2318, "step": 4028 }, { "epoch": 0.34, "grad_norm": 1.7000290155410767, "learning_rate": 3.276333789329685e-05, "loss": 2.187, "step": 4032 }, { "epoch": 0.35, "grad_norm": 1.4855263233184814, "learning_rate": 3.274623803009576e-05, "loss": 2.1994, "step": 4036 }, { "epoch": 0.35, "grad_norm": 1.4632859230041504, "learning_rate": 3.272913816689467e-05, "loss": 2.2279, "step": 4040 }, { "epoch": 0.35, "grad_norm": 1.456778645515442, "learning_rate": 3.271203830369357e-05, "loss": 2.1008, "step": 4044 }, { "epoch": 0.35, "grad_norm": 1.4377546310424805, "learning_rate": 3.269493844049248e-05, "loss": 2.2231, "step": 4048 }, { "epoch": 0.35, "grad_norm": 1.5278170108795166, "learning_rate": 3.267783857729138e-05, "loss": 2.1359, "step": 4052 }, { "epoch": 0.35, "grad_norm": 1.558070182800293, "learning_rate": 3.266073871409029e-05, "loss": 2.383, "step": 4056 }, { "epoch": 0.35, "grad_norm": 1.5845755338668823, "learning_rate": 3.2643638850889194e-05, "loss": 2.2107, "step": 4060 }, { "epoch": 0.35, "grad_norm": 1.6732598543167114, "learning_rate": 3.26265389876881e-05, "loss": 2.0189, "step": 4064 }, { "epoch": 0.35, "grad_norm": 1.5206589698791504, "learning_rate": 3.260943912448701e-05, "loss": 2.2918, "step": 4068 }, { "epoch": 0.35, "grad_norm": 1.6750423908233643, "learning_rate": 3.259233926128591e-05, "loss": 2.2082, "step": 4072 }, { "epoch": 0.35, "grad_norm": 1.6563453674316406, "learning_rate": 3.257523939808482e-05, "loss": 2.3358, "step": 4076 }, { "epoch": 0.35, "grad_norm": 1.575918197631836, "learning_rate": 3.2558139534883724e-05, "loss": 2.355, "step": 4080 }, { "epoch": 0.35, "grad_norm": 1.5032954216003418, "learning_rate": 3.2541039671682625e-05, "loss": 2.259, "step": 4084 }, { "epoch": 0.35, "grad_norm": 1.4477620124816895, "learning_rate": 3.252393980848153e-05, "loss": 2.2821, "step": 4088 }, { "epoch": 0.35, "grad_norm": 1.342366337776184, "learning_rate": 3.250683994528044e-05, "loss": 2.1086, "step": 4092 }, { "epoch": 0.35, "grad_norm": 1.4458723068237305, "learning_rate": 3.248974008207935e-05, "loss": 2.2197, "step": 4096 }, { "epoch": 0.35, "grad_norm": 1.4470369815826416, "learning_rate": 3.2472640218878254e-05, "loss": 2.3436, "step": 4100 }, { "epoch": 0.35, "grad_norm": 1.6926721334457397, "learning_rate": 3.2455540355677155e-05, "loss": 2.3198, "step": 4104 }, { "epoch": 0.35, "grad_norm": 1.483053207397461, "learning_rate": 3.243844049247606e-05, "loss": 2.2944, "step": 4108 }, { "epoch": 0.35, "grad_norm": 1.5896413326263428, "learning_rate": 3.242134062927496e-05, "loss": 2.1459, "step": 4112 }, { "epoch": 0.35, "grad_norm": 1.915610432624817, "learning_rate": 3.240424076607387e-05, "loss": 2.2768, "step": 4116 }, { "epoch": 0.35, "grad_norm": 1.4022173881530762, "learning_rate": 3.2387140902872784e-05, "loss": 2.1106, "step": 4120 }, { "epoch": 0.35, "grad_norm": 1.3702282905578613, "learning_rate": 3.2370041039671685e-05, "loss": 2.0183, "step": 4124 }, { "epoch": 0.35, "grad_norm": 1.3947548866271973, "learning_rate": 3.235294117647059e-05, "loss": 2.1075, "step": 4128 }, { "epoch": 0.35, "grad_norm": 1.5231854915618896, "learning_rate": 3.233584131326949e-05, "loss": 2.3221, "step": 4132 }, { "epoch": 0.35, "grad_norm": 1.5928566455841064, "learning_rate": 3.23187414500684e-05, "loss": 2.1517, "step": 4136 }, { "epoch": 0.35, "grad_norm": 1.518593192100525, "learning_rate": 3.23016415868673e-05, "loss": 2.4391, "step": 4140 }, { "epoch": 0.35, "grad_norm": 1.4657520055770874, "learning_rate": 3.2284541723666215e-05, "loss": 2.2852, "step": 4144 }, { "epoch": 0.35, "grad_norm": 1.4905215501785278, "learning_rate": 3.226744186046512e-05, "loss": 2.1462, "step": 4148 }, { "epoch": 0.35, "grad_norm": 1.6270649433135986, "learning_rate": 3.225034199726402e-05, "loss": 2.4138, "step": 4152 }, { "epoch": 0.36, "grad_norm": 1.7110861539840698, "learning_rate": 3.223324213406293e-05, "loss": 2.2067, "step": 4156 }, { "epoch": 0.36, "grad_norm": 1.771278977394104, "learning_rate": 3.221614227086183e-05, "loss": 2.3056, "step": 4160 }, { "epoch": 0.36, "grad_norm": 1.7753063440322876, "learning_rate": 3.219904240766074e-05, "loss": 2.1597, "step": 4164 }, { "epoch": 0.36, "grad_norm": 1.4673937559127808, "learning_rate": 3.2181942544459645e-05, "loss": 2.2006, "step": 4168 }, { "epoch": 0.36, "grad_norm": 1.4760159254074097, "learning_rate": 3.216484268125855e-05, "loss": 2.213, "step": 4172 }, { "epoch": 0.36, "grad_norm": 1.435507893562317, "learning_rate": 3.214774281805746e-05, "loss": 2.2281, "step": 4176 }, { "epoch": 0.36, "grad_norm": 1.3926265239715576, "learning_rate": 3.213064295485636e-05, "loss": 2.1146, "step": 4180 }, { "epoch": 0.36, "grad_norm": 1.4813460111618042, "learning_rate": 3.211354309165527e-05, "loss": 2.2133, "step": 4184 }, { "epoch": 0.36, "grad_norm": 1.576973795890808, "learning_rate": 3.2096443228454175e-05, "loss": 2.1075, "step": 4188 }, { "epoch": 0.36, "grad_norm": 2.532358169555664, "learning_rate": 3.2079343365253076e-05, "loss": 2.2588, "step": 4192 }, { "epoch": 0.36, "grad_norm": 1.4591597318649292, "learning_rate": 3.206224350205198e-05, "loss": 2.2587, "step": 4196 }, { "epoch": 0.36, "grad_norm": 1.5652563571929932, "learning_rate": 3.204514363885089e-05, "loss": 2.217, "step": 4200 }, { "epoch": 0.36, "grad_norm": 1.4030766487121582, "learning_rate": 3.20280437756498e-05, "loss": 2.2451, "step": 4204 }, { "epoch": 0.36, "grad_norm": 1.5012942552566528, "learning_rate": 3.2010943912448705e-05, "loss": 2.2296, "step": 4208 }, { "epoch": 0.36, "grad_norm": 1.3982198238372803, "learning_rate": 3.1993844049247606e-05, "loss": 2.3254, "step": 4212 }, { "epoch": 0.36, "grad_norm": 2.359818458557129, "learning_rate": 3.197674418604651e-05, "loss": 2.3709, "step": 4216 }, { "epoch": 0.36, "grad_norm": 1.687652349472046, "learning_rate": 3.1959644322845414e-05, "loss": 2.2442, "step": 4220 }, { "epoch": 0.36, "grad_norm": 1.577440857887268, "learning_rate": 3.194254445964433e-05, "loss": 2.1392, "step": 4224 }, { "epoch": 0.36, "grad_norm": 1.4704028367996216, "learning_rate": 3.1925444596443235e-05, "loss": 2.3586, "step": 4228 }, { "epoch": 0.36, "grad_norm": 1.5463335514068604, "learning_rate": 3.1908344733242136e-05, "loss": 2.3475, "step": 4232 }, { "epoch": 0.36, "grad_norm": 1.4893290996551514, "learning_rate": 3.189124487004104e-05, "loss": 2.1804, "step": 4236 }, { "epoch": 0.36, "grad_norm": 1.6646398305892944, "learning_rate": 3.1874145006839944e-05, "loss": 2.2972, "step": 4240 }, { "epoch": 0.36, "grad_norm": 1.4279797077178955, "learning_rate": 3.185704514363885e-05, "loss": 2.235, "step": 4244 }, { "epoch": 0.36, "grad_norm": 1.5145081281661987, "learning_rate": 3.183994528043776e-05, "loss": 2.231, "step": 4248 }, { "epoch": 0.36, "grad_norm": 1.4431096315383911, "learning_rate": 3.1822845417236666e-05, "loss": 2.1942, "step": 4252 }, { "epoch": 0.36, "grad_norm": 1.431936264038086, "learning_rate": 3.180574555403557e-05, "loss": 2.2726, "step": 4256 }, { "epoch": 0.36, "grad_norm": 1.4116493463516235, "learning_rate": 3.1788645690834474e-05, "loss": 2.346, "step": 4260 }, { "epoch": 0.36, "grad_norm": 1.480595588684082, "learning_rate": 3.177154582763338e-05, "loss": 2.1486, "step": 4264 }, { "epoch": 0.36, "grad_norm": 1.4615095853805542, "learning_rate": 3.175444596443228e-05, "loss": 2.2935, "step": 4268 }, { "epoch": 0.37, "grad_norm": 1.414937973022461, "learning_rate": 3.173734610123119e-05, "loss": 2.0618, "step": 4272 }, { "epoch": 0.37, "grad_norm": 2.260504961013794, "learning_rate": 3.1720246238030096e-05, "loss": 2.2604, "step": 4276 }, { "epoch": 0.37, "grad_norm": 1.573109745979309, "learning_rate": 3.1703146374829003e-05, "loss": 2.1983, "step": 4280 }, { "epoch": 0.37, "grad_norm": 1.4415810108184814, "learning_rate": 3.168604651162791e-05, "loss": 2.1983, "step": 4284 }, { "epoch": 0.37, "grad_norm": 1.4468250274658203, "learning_rate": 3.166894664842681e-05, "loss": 2.3544, "step": 4288 }, { "epoch": 0.37, "grad_norm": 1.4121309518814087, "learning_rate": 3.165184678522572e-05, "loss": 2.3186, "step": 4292 }, { "epoch": 0.37, "grad_norm": 1.3506333827972412, "learning_rate": 3.1634746922024626e-05, "loss": 2.2057, "step": 4296 }, { "epoch": 0.37, "grad_norm": 1.5312514305114746, "learning_rate": 3.161764705882353e-05, "loss": 2.1451, "step": 4300 }, { "epoch": 0.37, "grad_norm": 1.8453575372695923, "learning_rate": 3.160054719562244e-05, "loss": 2.2856, "step": 4304 }, { "epoch": 0.37, "grad_norm": 1.5432989597320557, "learning_rate": 3.158344733242134e-05, "loss": 2.2947, "step": 4308 }, { "epoch": 0.37, "grad_norm": 1.575966238975525, "learning_rate": 3.156634746922025e-05, "loss": 2.1444, "step": 4312 }, { "epoch": 0.37, "grad_norm": 1.6484041213989258, "learning_rate": 3.1549247606019156e-05, "loss": 2.1952, "step": 4316 }, { "epoch": 0.37, "grad_norm": 1.4162933826446533, "learning_rate": 3.1532147742818057e-05, "loss": 2.0885, "step": 4320 }, { "epoch": 0.37, "grad_norm": 1.570765495300293, "learning_rate": 3.1515047879616964e-05, "loss": 2.1415, "step": 4324 }, { "epoch": 0.37, "grad_norm": 1.603226661682129, "learning_rate": 3.149794801641587e-05, "loss": 2.2982, "step": 4328 }, { "epoch": 0.37, "grad_norm": 1.4410016536712646, "learning_rate": 3.148084815321478e-05, "loss": 2.2572, "step": 4332 }, { "epoch": 0.37, "grad_norm": 1.392094373703003, "learning_rate": 3.1463748290013686e-05, "loss": 2.4107, "step": 4336 }, { "epoch": 0.37, "grad_norm": 1.4003255367279053, "learning_rate": 3.1446648426812587e-05, "loss": 2.2181, "step": 4340 }, { "epoch": 0.37, "grad_norm": 3.496429204940796, "learning_rate": 3.1429548563611494e-05, "loss": 2.241, "step": 4344 }, { "epoch": 0.37, "grad_norm": 1.5706188678741455, "learning_rate": 3.1412448700410394e-05, "loss": 2.1635, "step": 4348 }, { "epoch": 0.37, "grad_norm": 1.4904406070709229, "learning_rate": 3.13953488372093e-05, "loss": 2.2688, "step": 4352 }, { "epoch": 0.37, "grad_norm": 1.4973410367965698, "learning_rate": 3.137824897400821e-05, "loss": 2.2812, "step": 4356 }, { "epoch": 0.37, "grad_norm": 1.6343064308166504, "learning_rate": 3.1361149110807116e-05, "loss": 2.3717, "step": 4360 }, { "epoch": 0.37, "grad_norm": 1.327519178390503, "learning_rate": 3.1344049247606024e-05, "loss": 2.1028, "step": 4364 }, { "epoch": 0.37, "grad_norm": 1.46675705909729, "learning_rate": 3.1326949384404924e-05, "loss": 2.2575, "step": 4368 }, { "epoch": 0.37, "grad_norm": 1.5625230073928833, "learning_rate": 3.130984952120383e-05, "loss": 2.1085, "step": 4372 }, { "epoch": 0.37, "grad_norm": 1.4606382846832275, "learning_rate": 3.129274965800274e-05, "loss": 2.1484, "step": 4376 }, { "epoch": 0.37, "grad_norm": 1.3543037176132202, "learning_rate": 3.127564979480164e-05, "loss": 2.3504, "step": 4380 }, { "epoch": 0.37, "grad_norm": 1.3623141050338745, "learning_rate": 3.125854993160055e-05, "loss": 2.1932, "step": 4384 }, { "epoch": 0.38, "grad_norm": 1.2342579364776611, "learning_rate": 3.1241450068399454e-05, "loss": 2.229, "step": 4388 }, { "epoch": 0.38, "grad_norm": 1.5705771446228027, "learning_rate": 3.122435020519836e-05, "loss": 2.3358, "step": 4392 }, { "epoch": 0.38, "grad_norm": 1.5220746994018555, "learning_rate": 3.120725034199726e-05, "loss": 2.0569, "step": 4396 }, { "epoch": 0.38, "grad_norm": 1.5559509992599487, "learning_rate": 3.119015047879617e-05, "loss": 2.348, "step": 4400 }, { "epoch": 0.38, "grad_norm": 1.4289395809173584, "learning_rate": 3.117305061559508e-05, "loss": 2.1036, "step": 4404 }, { "epoch": 0.38, "grad_norm": 1.69617760181427, "learning_rate": 3.115595075239398e-05, "loss": 2.1352, "step": 4408 }, { "epoch": 0.38, "grad_norm": 1.6432487964630127, "learning_rate": 3.113885088919289e-05, "loss": 2.2481, "step": 4412 }, { "epoch": 0.38, "grad_norm": 1.986351728439331, "learning_rate": 3.112175102599179e-05, "loss": 2.2431, "step": 4416 }, { "epoch": 0.38, "grad_norm": 1.541490912437439, "learning_rate": 3.11046511627907e-05, "loss": 2.1826, "step": 4420 }, { "epoch": 0.38, "grad_norm": 1.5522269010543823, "learning_rate": 3.108755129958961e-05, "loss": 2.1923, "step": 4424 }, { "epoch": 0.38, "grad_norm": 1.4916808605194092, "learning_rate": 3.107045143638851e-05, "loss": 2.1626, "step": 4428 }, { "epoch": 0.38, "grad_norm": 1.3527686595916748, "learning_rate": 3.1053351573187415e-05, "loss": 2.0595, "step": 4432 }, { "epoch": 0.38, "grad_norm": 1.568322777748108, "learning_rate": 3.103625170998632e-05, "loss": 2.2729, "step": 4436 }, { "epoch": 0.38, "grad_norm": 1.5389866828918457, "learning_rate": 3.101915184678523e-05, "loss": 2.0978, "step": 4440 }, { "epoch": 0.38, "grad_norm": 1.4940327405929565, "learning_rate": 3.100205198358414e-05, "loss": 2.2146, "step": 4444 }, { "epoch": 0.38, "grad_norm": 1.5102323293685913, "learning_rate": 3.098495212038304e-05, "loss": 2.1204, "step": 4448 }, { "epoch": 0.38, "grad_norm": 1.4519439935684204, "learning_rate": 3.0967852257181945e-05, "loss": 2.2592, "step": 4452 }, { "epoch": 0.38, "grad_norm": 1.3934937715530396, "learning_rate": 3.0950752393980845e-05, "loss": 2.1279, "step": 4456 }, { "epoch": 0.38, "grad_norm": 1.3405640125274658, "learning_rate": 3.093365253077975e-05, "loss": 2.1896, "step": 4460 }, { "epoch": 0.38, "grad_norm": 1.6696523427963257, "learning_rate": 3.091655266757866e-05, "loss": 2.0079, "step": 4464 }, { "epoch": 0.38, "grad_norm": 1.470809817314148, "learning_rate": 3.089945280437757e-05, "loss": 2.2609, "step": 4468 }, { "epoch": 0.38, "grad_norm": 1.54929518699646, "learning_rate": 3.0882352941176475e-05, "loss": 2.18, "step": 4472 }, { "epoch": 0.38, "grad_norm": 2.0508859157562256, "learning_rate": 3.0865253077975375e-05, "loss": 2.1995, "step": 4476 }, { "epoch": 0.38, "grad_norm": 1.4847677946090698, "learning_rate": 3.084815321477428e-05, "loss": 2.1576, "step": 4480 }, { "epoch": 0.38, "grad_norm": 1.490350604057312, "learning_rate": 3.083105335157319e-05, "loss": 2.188, "step": 4484 }, { "epoch": 0.38, "grad_norm": 1.5087822675704956, "learning_rate": 3.081395348837209e-05, "loss": 2.2151, "step": 4488 }, { "epoch": 0.38, "grad_norm": 1.8956283330917358, "learning_rate": 3.0796853625171005e-05, "loss": 2.1073, "step": 4492 }, { "epoch": 0.38, "grad_norm": 1.575896978378296, "learning_rate": 3.0779753761969905e-05, "loss": 2.2574, "step": 4496 }, { "epoch": 0.38, "grad_norm": 1.5014541149139404, "learning_rate": 3.076265389876881e-05, "loss": 2.1651, "step": 4500 }, { "epoch": 0.39, "grad_norm": 1.8757340908050537, "learning_rate": 3.074555403556772e-05, "loss": 2.0871, "step": 4504 }, { "epoch": 0.39, "grad_norm": 1.5806230306625366, "learning_rate": 3.072845417236662e-05, "loss": 2.1398, "step": 4508 }, { "epoch": 0.39, "grad_norm": 1.7795255184173584, "learning_rate": 3.071135430916553e-05, "loss": 2.2246, "step": 4512 }, { "epoch": 0.39, "grad_norm": 1.6598823070526123, "learning_rate": 3.0694254445964435e-05, "loss": 2.2928, "step": 4516 }, { "epoch": 0.39, "grad_norm": 1.5600835084915161, "learning_rate": 3.067715458276334e-05, "loss": 2.3343, "step": 4520 }, { "epoch": 0.39, "grad_norm": 1.5268632173538208, "learning_rate": 3.066005471956224e-05, "loss": 2.3551, "step": 4524 }, { "epoch": 0.39, "grad_norm": 1.8789829015731812, "learning_rate": 3.064295485636115e-05, "loss": 2.3127, "step": 4528 }, { "epoch": 0.39, "grad_norm": 1.607021450996399, "learning_rate": 3.062585499316006e-05, "loss": 2.142, "step": 4532 }, { "epoch": 0.39, "grad_norm": 1.4491835832595825, "learning_rate": 3.060875512995896e-05, "loss": 2.2, "step": 4536 }, { "epoch": 0.39, "grad_norm": 1.5633890628814697, "learning_rate": 3.0591655266757866e-05, "loss": 2.0443, "step": 4540 }, { "epoch": 0.39, "grad_norm": 1.6567158699035645, "learning_rate": 3.057455540355677e-05, "loss": 2.1271, "step": 4544 }, { "epoch": 0.39, "grad_norm": 1.5010050535202026, "learning_rate": 3.055745554035568e-05, "loss": 2.2272, "step": 4548 }, { "epoch": 0.39, "grad_norm": 1.5739331245422363, "learning_rate": 3.054035567715459e-05, "loss": 2.1923, "step": 4552 }, { "epoch": 0.39, "grad_norm": 1.4975517988204956, "learning_rate": 3.052325581395349e-05, "loss": 2.2189, "step": 4556 }, { "epoch": 0.39, "grad_norm": 1.571330189704895, "learning_rate": 3.0506155950752396e-05, "loss": 2.1658, "step": 4560 }, { "epoch": 0.39, "grad_norm": 1.296584129333496, "learning_rate": 3.04890560875513e-05, "loss": 2.1641, "step": 4564 }, { "epoch": 0.39, "grad_norm": 1.920922875404358, "learning_rate": 3.0471956224350207e-05, "loss": 2.3731, "step": 4568 }, { "epoch": 0.39, "grad_norm": 1.5119904279708862, "learning_rate": 3.0454856361149114e-05, "loss": 2.0966, "step": 4572 }, { "epoch": 0.39, "grad_norm": 1.3779191970825195, "learning_rate": 3.0437756497948018e-05, "loss": 2.0729, "step": 4576 }, { "epoch": 0.39, "grad_norm": 1.3981715440750122, "learning_rate": 3.0420656634746926e-05, "loss": 2.1438, "step": 4580 }, { "epoch": 0.39, "grad_norm": 1.4112988710403442, "learning_rate": 3.0403556771545826e-05, "loss": 2.3213, "step": 4584 }, { "epoch": 0.39, "grad_norm": 1.4584025144577026, "learning_rate": 3.0386456908344733e-05, "loss": 2.2532, "step": 4588 }, { "epoch": 0.39, "grad_norm": 1.500198245048523, "learning_rate": 3.0369357045143644e-05, "loss": 2.029, "step": 4592 }, { "epoch": 0.39, "grad_norm": 1.4857838153839111, "learning_rate": 3.0352257181942545e-05, "loss": 2.0186, "step": 4596 }, { "epoch": 0.39, "grad_norm": 1.396033525466919, "learning_rate": 3.0335157318741452e-05, "loss": 2.2293, "step": 4600 }, { "epoch": 0.39, "grad_norm": 1.5105597972869873, "learning_rate": 3.0318057455540356e-05, "loss": 2.1206, "step": 4604 }, { "epoch": 0.39, "grad_norm": 1.3721805810928345, "learning_rate": 3.0300957592339263e-05, "loss": 2.1646, "step": 4608 }, { "epoch": 0.39, "grad_norm": 1.4651081562042236, "learning_rate": 3.028385772913817e-05, "loss": 2.2184, "step": 4612 }, { "epoch": 0.39, "grad_norm": 1.468990445137024, "learning_rate": 3.0266757865937075e-05, "loss": 2.1083, "step": 4616 }, { "epoch": 0.4, "grad_norm": 2.047722578048706, "learning_rate": 3.0249658002735982e-05, "loss": 2.1685, "step": 4620 }, { "epoch": 0.4, "grad_norm": 1.5102801322937012, "learning_rate": 3.0232558139534883e-05, "loss": 2.2332, "step": 4624 }, { "epoch": 0.4, "grad_norm": 1.3657864332199097, "learning_rate": 3.021545827633379e-05, "loss": 2.3045, "step": 4628 }, { "epoch": 0.4, "grad_norm": 1.468764305114746, "learning_rate": 3.01983584131327e-05, "loss": 2.2094, "step": 4632 }, { "epoch": 0.4, "grad_norm": 1.451079249382019, "learning_rate": 3.01812585499316e-05, "loss": 2.0958, "step": 4636 }, { "epoch": 0.4, "grad_norm": 1.5094703435897827, "learning_rate": 3.016415868673051e-05, "loss": 2.186, "step": 4640 }, { "epoch": 0.4, "grad_norm": 1.5918653011322021, "learning_rate": 3.0147058823529413e-05, "loss": 2.2616, "step": 4644 }, { "epoch": 0.4, "grad_norm": 1.5116558074951172, "learning_rate": 3.012995896032832e-05, "loss": 2.2503, "step": 4648 }, { "epoch": 0.4, "grad_norm": 1.4294779300689697, "learning_rate": 3.011285909712722e-05, "loss": 2.3022, "step": 4652 }, { "epoch": 0.4, "grad_norm": 2.080925464630127, "learning_rate": 3.009575923392613e-05, "loss": 2.0627, "step": 4656 }, { "epoch": 0.4, "grad_norm": 1.4744940996170044, "learning_rate": 3.007865937072504e-05, "loss": 2.1063, "step": 4660 }, { "epoch": 0.4, "grad_norm": 1.6344597339630127, "learning_rate": 3.006155950752394e-05, "loss": 2.1501, "step": 4664 }, { "epoch": 0.4, "grad_norm": 1.5882827043533325, "learning_rate": 3.0044459644322846e-05, "loss": 2.1208, "step": 4668 }, { "epoch": 0.4, "grad_norm": 1.4376238584518433, "learning_rate": 3.002735978112175e-05, "loss": 2.0962, "step": 4672 }, { "epoch": 0.4, "grad_norm": 1.6142209768295288, "learning_rate": 3.0010259917920658e-05, "loss": 2.0721, "step": 4676 }, { "epoch": 0.4, "grad_norm": 1.4628876447677612, "learning_rate": 2.9993160054719565e-05, "loss": 1.959, "step": 4680 }, { "epoch": 0.4, "grad_norm": 1.3108237981796265, "learning_rate": 2.997606019151847e-05, "loss": 2.1634, "step": 4684 }, { "epoch": 0.4, "grad_norm": 1.4312976598739624, "learning_rate": 2.9958960328317376e-05, "loss": 2.1643, "step": 4688 }, { "epoch": 0.4, "grad_norm": 1.449655294418335, "learning_rate": 2.9941860465116277e-05, "loss": 2.2098, "step": 4692 }, { "epoch": 0.4, "grad_norm": 1.5844769477844238, "learning_rate": 2.9924760601915188e-05, "loss": 2.1497, "step": 4696 }, { "epoch": 0.4, "grad_norm": 1.4893046617507935, "learning_rate": 2.9907660738714095e-05, "loss": 2.2939, "step": 4700 }, { "epoch": 0.4, "grad_norm": 1.3878313302993774, "learning_rate": 2.9890560875512996e-05, "loss": 2.1361, "step": 4704 }, { "epoch": 0.4, "grad_norm": 1.9359514713287354, "learning_rate": 2.9873461012311903e-05, "loss": 2.178, "step": 4708 }, { "epoch": 0.4, "grad_norm": 1.581032633781433, "learning_rate": 2.9856361149110807e-05, "loss": 2.0514, "step": 4712 }, { "epoch": 0.4, "grad_norm": 1.3860806226730347, "learning_rate": 2.9839261285909714e-05, "loss": 2.2888, "step": 4716 }, { "epoch": 0.4, "grad_norm": 1.4699238538742065, "learning_rate": 2.982216142270862e-05, "loss": 2.167, "step": 4720 }, { "epoch": 0.4, "grad_norm": 1.459145188331604, "learning_rate": 2.9805061559507526e-05, "loss": 2.1719, "step": 4724 }, { "epoch": 0.4, "grad_norm": 1.7492071390151978, "learning_rate": 2.9787961696306433e-05, "loss": 2.314, "step": 4728 }, { "epoch": 0.4, "grad_norm": 1.9393258094787598, "learning_rate": 2.9770861833105333e-05, "loss": 2.3491, "step": 4732 }, { "epoch": 0.4, "grad_norm": 1.8979884386062622, "learning_rate": 2.9753761969904244e-05, "loss": 2.1962, "step": 4736 }, { "epoch": 0.41, "grad_norm": 1.5597001314163208, "learning_rate": 2.973666210670315e-05, "loss": 2.1819, "step": 4740 }, { "epoch": 0.41, "grad_norm": 1.6771557331085205, "learning_rate": 2.9719562243502052e-05, "loss": 2.1407, "step": 4744 }, { "epoch": 0.41, "grad_norm": 1.5106451511383057, "learning_rate": 2.970246238030096e-05, "loss": 2.1655, "step": 4748 }, { "epoch": 0.41, "grad_norm": 1.531887173652649, "learning_rate": 2.9685362517099863e-05, "loss": 2.1351, "step": 4752 }, { "epoch": 0.41, "grad_norm": 1.3597395420074463, "learning_rate": 2.966826265389877e-05, "loss": 2.0589, "step": 4756 }, { "epoch": 0.41, "grad_norm": 1.761173129081726, "learning_rate": 2.9651162790697678e-05, "loss": 2.0529, "step": 4760 }, { "epoch": 0.41, "grad_norm": 1.44886314868927, "learning_rate": 2.9634062927496582e-05, "loss": 1.9243, "step": 4764 }, { "epoch": 0.41, "grad_norm": 1.516022801399231, "learning_rate": 2.961696306429549e-05, "loss": 2.0999, "step": 4768 }, { "epoch": 0.41, "grad_norm": 1.3773995637893677, "learning_rate": 2.959986320109439e-05, "loss": 2.2363, "step": 4772 }, { "epoch": 0.41, "grad_norm": 1.5385714769363403, "learning_rate": 2.9582763337893297e-05, "loss": 2.1583, "step": 4776 }, { "epoch": 0.41, "grad_norm": 1.746711015701294, "learning_rate": 2.95656634746922e-05, "loss": 2.1572, "step": 4780 }, { "epoch": 0.41, "grad_norm": 1.6620087623596191, "learning_rate": 2.954856361149111e-05, "loss": 2.102, "step": 4784 }, { "epoch": 0.41, "grad_norm": 1.4622972011566162, "learning_rate": 2.9531463748290016e-05, "loss": 2.2325, "step": 4788 }, { "epoch": 0.41, "grad_norm": 1.3566945791244507, "learning_rate": 2.951436388508892e-05, "loss": 2.2317, "step": 4792 }, { "epoch": 0.41, "grad_norm": 1.4192919731140137, "learning_rate": 2.9497264021887827e-05, "loss": 2.1342, "step": 4796 }, { "epoch": 0.41, "grad_norm": 1.7992663383483887, "learning_rate": 2.9480164158686728e-05, "loss": 1.9621, "step": 4800 }, { "epoch": 0.41, "grad_norm": 1.591426968574524, "learning_rate": 2.946306429548564e-05, "loss": 2.0418, "step": 4804 }, { "epoch": 0.41, "grad_norm": 1.5884109735488892, "learning_rate": 2.9445964432284546e-05, "loss": 2.2499, "step": 4808 }, { "epoch": 0.41, "grad_norm": 1.4876173734664917, "learning_rate": 2.9428864569083446e-05, "loss": 2.172, "step": 4812 }, { "epoch": 0.41, "grad_norm": 1.9083555936813354, "learning_rate": 2.9411764705882354e-05, "loss": 2.1113, "step": 4816 }, { "epoch": 0.41, "grad_norm": 1.6316877603530884, "learning_rate": 2.9394664842681258e-05, "loss": 2.2434, "step": 4820 }, { "epoch": 0.41, "grad_norm": 1.578415036201477, "learning_rate": 2.9377564979480165e-05, "loss": 2.1634, "step": 4824 }, { "epoch": 0.41, "grad_norm": 1.7644866704940796, "learning_rate": 2.9360465116279072e-05, "loss": 2.0934, "step": 4828 }, { "epoch": 0.41, "grad_norm": 1.3768229484558105, "learning_rate": 2.9343365253077976e-05, "loss": 1.9734, "step": 4832 }, { "epoch": 0.41, "grad_norm": 1.6598727703094482, "learning_rate": 2.9326265389876884e-05, "loss": 2.2622, "step": 4836 }, { "epoch": 0.41, "grad_norm": 1.346545934677124, "learning_rate": 2.9309165526675784e-05, "loss": 2.1191, "step": 4840 }, { "epoch": 0.41, "grad_norm": 1.5258831977844238, "learning_rate": 2.9292065663474695e-05, "loss": 1.9744, "step": 4844 }, { "epoch": 0.41, "grad_norm": 1.573669195175171, "learning_rate": 2.9274965800273602e-05, "loss": 2.1529, "step": 4848 }, { "epoch": 0.41, "grad_norm": 1.3979613780975342, "learning_rate": 2.9257865937072503e-05, "loss": 2.1268, "step": 4852 }, { "epoch": 0.42, "grad_norm": 1.5907671451568604, "learning_rate": 2.924076607387141e-05, "loss": 2.0754, "step": 4856 }, { "epoch": 0.42, "grad_norm": 1.4485595226287842, "learning_rate": 2.9223666210670314e-05, "loss": 2.2218, "step": 4860 }, { "epoch": 0.42, "grad_norm": 1.4896790981292725, "learning_rate": 2.920656634746922e-05, "loss": 2.2253, "step": 4864 }, { "epoch": 0.42, "grad_norm": 1.5012835264205933, "learning_rate": 2.918946648426813e-05, "loss": 2.3698, "step": 4868 }, { "epoch": 0.42, "grad_norm": 1.4697033166885376, "learning_rate": 2.9172366621067033e-05, "loss": 2.2947, "step": 4872 }, { "epoch": 0.42, "grad_norm": 1.693452000617981, "learning_rate": 2.915526675786594e-05, "loss": 2.1249, "step": 4876 }, { "epoch": 0.42, "grad_norm": 1.5979125499725342, "learning_rate": 2.913816689466484e-05, "loss": 2.3482, "step": 4880 }, { "epoch": 0.42, "grad_norm": 1.587105631828308, "learning_rate": 2.912106703146375e-05, "loss": 2.1287, "step": 4884 }, { "epoch": 0.42, "grad_norm": 1.5790472030639648, "learning_rate": 2.910396716826266e-05, "loss": 2.1099, "step": 4888 }, { "epoch": 0.42, "grad_norm": 1.4962116479873657, "learning_rate": 2.908686730506156e-05, "loss": 2.1216, "step": 4892 }, { "epoch": 0.42, "grad_norm": 1.657333254814148, "learning_rate": 2.9069767441860467e-05, "loss": 2.1813, "step": 4896 }, { "epoch": 0.42, "grad_norm": 1.520858645439148, "learning_rate": 2.905266757865937e-05, "loss": 2.1585, "step": 4900 }, { "epoch": 0.42, "grad_norm": 1.4019360542297363, "learning_rate": 2.9035567715458278e-05, "loss": 2.1757, "step": 4904 }, { "epoch": 0.42, "grad_norm": 1.4157730340957642, "learning_rate": 2.9018467852257182e-05, "loss": 2.313, "step": 4908 }, { "epoch": 0.42, "grad_norm": 1.4938081502914429, "learning_rate": 2.900136798905609e-05, "loss": 2.2372, "step": 4912 }, { "epoch": 0.42, "grad_norm": 1.5117088556289673, "learning_rate": 2.8984268125854997e-05, "loss": 2.3339, "step": 4916 }, { "epoch": 0.42, "grad_norm": 1.4263554811477661, "learning_rate": 2.8967168262653897e-05, "loss": 2.1498, "step": 4920 }, { "epoch": 0.42, "grad_norm": 1.3601114749908447, "learning_rate": 2.8950068399452808e-05, "loss": 2.1967, "step": 4924 }, { "epoch": 0.42, "grad_norm": 1.5118712186813354, "learning_rate": 2.893296853625171e-05, "loss": 2.2998, "step": 4928 }, { "epoch": 0.42, "grad_norm": 1.63152015209198, "learning_rate": 2.8915868673050616e-05, "loss": 2.085, "step": 4932 }, { "epoch": 0.42, "grad_norm": 1.4992414712905884, "learning_rate": 2.8898768809849523e-05, "loss": 2.2337, "step": 4936 }, { "epoch": 0.42, "grad_norm": 1.4842474460601807, "learning_rate": 2.8881668946648427e-05, "loss": 2.2246, "step": 4940 }, { "epoch": 0.42, "grad_norm": 2.064748525619507, "learning_rate": 2.8864569083447335e-05, "loss": 2.2128, "step": 4944 }, { "epoch": 0.42, "grad_norm": 1.7368000745773315, "learning_rate": 2.884746922024624e-05, "loss": 2.1803, "step": 4948 }, { "epoch": 0.42, "grad_norm": 1.6576658487319946, "learning_rate": 2.8830369357045146e-05, "loss": 2.0681, "step": 4952 }, { "epoch": 0.42, "grad_norm": 1.886851191520691, "learning_rate": 2.8813269493844053e-05, "loss": 2.3038, "step": 4956 }, { "epoch": 0.42, "grad_norm": 1.6933567523956299, "learning_rate": 2.8796169630642954e-05, "loss": 2.1189, "step": 4960 }, { "epoch": 0.42, "grad_norm": 1.5234380960464478, "learning_rate": 2.8779069767441864e-05, "loss": 2.239, "step": 4964 }, { "epoch": 0.42, "grad_norm": 1.4364981651306152, "learning_rate": 2.8761969904240765e-05, "loss": 2.1525, "step": 4968 }, { "epoch": 0.43, "grad_norm": 1.5221827030181885, "learning_rate": 2.8744870041039672e-05, "loss": 2.0648, "step": 4972 }, { "epoch": 0.43, "grad_norm": 1.4764840602874756, "learning_rate": 2.872777017783858e-05, "loss": 2.1009, "step": 4976 }, { "epoch": 0.43, "grad_norm": 1.361648440361023, "learning_rate": 2.8710670314637484e-05, "loss": 2.0012, "step": 4980 }, { "epoch": 0.43, "grad_norm": 1.4987961053848267, "learning_rate": 2.869357045143639e-05, "loss": 2.3403, "step": 4984 }, { "epoch": 0.43, "grad_norm": 1.422187328338623, "learning_rate": 2.8676470588235295e-05, "loss": 2.1681, "step": 4988 }, { "epoch": 0.43, "grad_norm": 1.5344418287277222, "learning_rate": 2.8659370725034202e-05, "loss": 2.1772, "step": 4992 }, { "epoch": 0.43, "grad_norm": 1.5092509984970093, "learning_rate": 2.864227086183311e-05, "loss": 2.207, "step": 4996 }, { "epoch": 0.43, "grad_norm": 1.6343382596969604, "learning_rate": 2.862517099863201e-05, "loss": 2.1994, "step": 5000 }, { "epoch": 0.43, "grad_norm": 1.447045087814331, "learning_rate": 2.860807113543092e-05, "loss": 2.1439, "step": 5004 }, { "epoch": 0.43, "grad_norm": 1.4870860576629639, "learning_rate": 2.859097127222982e-05, "loss": 2.1185, "step": 5008 }, { "epoch": 0.43, "grad_norm": 1.538109302520752, "learning_rate": 2.857387140902873e-05, "loss": 2.1966, "step": 5012 }, { "epoch": 0.43, "grad_norm": 1.5985993146896362, "learning_rate": 2.8556771545827636e-05, "loss": 2.1939, "step": 5016 }, { "epoch": 0.43, "grad_norm": 1.562982439994812, "learning_rate": 2.853967168262654e-05, "loss": 2.1722, "step": 5020 }, { "epoch": 0.43, "grad_norm": 1.4697779417037964, "learning_rate": 2.8522571819425448e-05, "loss": 2.1089, "step": 5024 }, { "epoch": 0.43, "grad_norm": 1.3964099884033203, "learning_rate": 2.850547195622435e-05, "loss": 2.1349, "step": 5028 }, { "epoch": 0.43, "grad_norm": 1.446781039237976, "learning_rate": 2.848837209302326e-05, "loss": 2.0189, "step": 5032 }, { "epoch": 0.43, "grad_norm": 1.305580735206604, "learning_rate": 2.847127222982216e-05, "loss": 2.1448, "step": 5036 }, { "epoch": 0.43, "grad_norm": 1.5852243900299072, "learning_rate": 2.8454172366621067e-05, "loss": 2.0051, "step": 5040 }, { "epoch": 0.43, "grad_norm": 1.6528069972991943, "learning_rate": 2.8437072503419977e-05, "loss": 2.0865, "step": 5044 }, { "epoch": 0.43, "grad_norm": 1.5359212160110474, "learning_rate": 2.8419972640218878e-05, "loss": 2.301, "step": 5048 }, { "epoch": 0.43, "grad_norm": 1.2999087572097778, "learning_rate": 2.8402872777017785e-05, "loss": 2.074, "step": 5052 }, { "epoch": 0.43, "grad_norm": 1.6305381059646606, "learning_rate": 2.838577291381669e-05, "loss": 2.1127, "step": 5056 }, { "epoch": 0.43, "grad_norm": 1.4597679376602173, "learning_rate": 2.8368673050615597e-05, "loss": 2.0304, "step": 5060 }, { "epoch": 0.43, "grad_norm": 1.6588799953460693, "learning_rate": 2.8351573187414504e-05, "loss": 2.2835, "step": 5064 }, { "epoch": 0.43, "grad_norm": 1.5638916492462158, "learning_rate": 2.8334473324213408e-05, "loss": 2.0453, "step": 5068 }, { "epoch": 0.43, "grad_norm": 1.3972184658050537, "learning_rate": 2.8317373461012315e-05, "loss": 2.0929, "step": 5072 }, { "epoch": 0.43, "grad_norm": 1.448038935661316, "learning_rate": 2.8300273597811216e-05, "loss": 2.2133, "step": 5076 }, { "epoch": 0.43, "grad_norm": 1.492927074432373, "learning_rate": 2.8283173734610123e-05, "loss": 2.0079, "step": 5080 }, { "epoch": 0.43, "grad_norm": 1.555188775062561, "learning_rate": 2.8266073871409034e-05, "loss": 2.1773, "step": 5084 }, { "epoch": 0.44, "grad_norm": 1.4068621397018433, "learning_rate": 2.8248974008207935e-05, "loss": 2.288, "step": 5088 }, { "epoch": 0.44, "grad_norm": 1.592962622642517, "learning_rate": 2.8231874145006842e-05, "loss": 2.2457, "step": 5092 }, { "epoch": 0.44, "grad_norm": 1.6329854726791382, "learning_rate": 2.8214774281805746e-05, "loss": 2.1736, "step": 5096 }, { "epoch": 0.44, "grad_norm": 1.5315256118774414, "learning_rate": 2.8197674418604653e-05, "loss": 2.0541, "step": 5100 }, { "epoch": 0.44, "grad_norm": 1.5884466171264648, "learning_rate": 2.818057455540356e-05, "loss": 2.0702, "step": 5104 }, { "epoch": 0.44, "grad_norm": 1.4373297691345215, "learning_rate": 2.8163474692202464e-05, "loss": 2.0738, "step": 5108 }, { "epoch": 0.44, "grad_norm": 1.7089366912841797, "learning_rate": 2.8146374829001372e-05, "loss": 2.2945, "step": 5112 }, { "epoch": 0.44, "grad_norm": 1.4914695024490356, "learning_rate": 2.8129274965800272e-05, "loss": 2.0974, "step": 5116 }, { "epoch": 0.44, "grad_norm": 1.7952321767807007, "learning_rate": 2.811217510259918e-05, "loss": 2.0721, "step": 5120 }, { "epoch": 0.44, "grad_norm": 1.4391483068466187, "learning_rate": 2.8095075239398087e-05, "loss": 2.1753, "step": 5124 }, { "epoch": 0.44, "grad_norm": 1.6410449743270874, "learning_rate": 2.807797537619699e-05, "loss": 2.0375, "step": 5128 }, { "epoch": 0.44, "grad_norm": 1.5932730436325073, "learning_rate": 2.80608755129959e-05, "loss": 2.0907, "step": 5132 }, { "epoch": 0.44, "grad_norm": 1.5292744636535645, "learning_rate": 2.8043775649794802e-05, "loss": 2.266, "step": 5136 }, { "epoch": 0.44, "grad_norm": 1.472765326499939, "learning_rate": 2.802667578659371e-05, "loss": 2.075, "step": 5140 }, { "epoch": 0.44, "grad_norm": 1.5142451524734497, "learning_rate": 2.8009575923392617e-05, "loss": 2.0953, "step": 5144 }, { "epoch": 0.44, "grad_norm": 1.8267227411270142, "learning_rate": 2.7992476060191518e-05, "loss": 2.183, "step": 5148 }, { "epoch": 0.44, "grad_norm": 1.5693491697311401, "learning_rate": 2.797537619699043e-05, "loss": 2.1086, "step": 5152 }, { "epoch": 0.44, "grad_norm": 1.3335392475128174, "learning_rate": 2.795827633378933e-05, "loss": 2.0814, "step": 5156 }, { "epoch": 0.44, "grad_norm": 1.5873955488204956, "learning_rate": 2.7941176470588236e-05, "loss": 2.132, "step": 5160 }, { "epoch": 0.44, "grad_norm": 1.5618573427200317, "learning_rate": 2.792407660738714e-05, "loss": 2.1605, "step": 5164 }, { "epoch": 0.44, "grad_norm": 1.355206847190857, "learning_rate": 2.7906976744186048e-05, "loss": 2.0064, "step": 5168 }, { "epoch": 0.44, "grad_norm": 1.4535104036331177, "learning_rate": 2.7889876880984955e-05, "loss": 2.092, "step": 5172 }, { "epoch": 0.44, "grad_norm": 1.477333426475525, "learning_rate": 2.787277701778386e-05, "loss": 2.0415, "step": 5176 }, { "epoch": 0.44, "grad_norm": 1.4349262714385986, "learning_rate": 2.7855677154582766e-05, "loss": 2.1695, "step": 5180 }, { "epoch": 0.44, "grad_norm": 1.4431016445159912, "learning_rate": 2.7838577291381667e-05, "loss": 2.1246, "step": 5184 }, { "epoch": 0.44, "grad_norm": 1.419053554534912, "learning_rate": 2.7821477428180574e-05, "loss": 2.1279, "step": 5188 }, { "epoch": 0.44, "grad_norm": 1.5947861671447754, "learning_rate": 2.7804377564979485e-05, "loss": 2.1138, "step": 5192 }, { "epoch": 0.44, "grad_norm": 1.8315672874450684, "learning_rate": 2.7787277701778385e-05, "loss": 2.172, "step": 5196 }, { "epoch": 0.44, "grad_norm": 1.532003402709961, "learning_rate": 2.7770177838577293e-05, "loss": 2.0775, "step": 5200 }, { "epoch": 0.44, "grad_norm": 1.6215981245040894, "learning_rate": 2.7753077975376197e-05, "loss": 2.0522, "step": 5204 }, { "epoch": 0.45, "grad_norm": 1.8606538772583008, "learning_rate": 2.7735978112175104e-05, "loss": 2.0371, "step": 5208 }, { "epoch": 0.45, "grad_norm": 1.9666029214859009, "learning_rate": 2.771887824897401e-05, "loss": 2.256, "step": 5212 }, { "epoch": 0.45, "grad_norm": 1.5295203924179077, "learning_rate": 2.7701778385772915e-05, "loss": 2.0297, "step": 5216 }, { "epoch": 0.45, "grad_norm": 1.395086407661438, "learning_rate": 2.7684678522571823e-05, "loss": 2.1393, "step": 5220 }, { "epoch": 0.45, "grad_norm": 1.396425485610962, "learning_rate": 2.7667578659370723e-05, "loss": 1.9676, "step": 5224 }, { "epoch": 0.45, "grad_norm": 1.5650209188461304, "learning_rate": 2.765047879616963e-05, "loss": 2.1365, "step": 5228 }, { "epoch": 0.45, "grad_norm": 1.447646975517273, "learning_rate": 2.763337893296854e-05, "loss": 1.9595, "step": 5232 }, { "epoch": 0.45, "grad_norm": 1.438038945198059, "learning_rate": 2.7616279069767442e-05, "loss": 1.9664, "step": 5236 }, { "epoch": 0.45, "grad_norm": 1.4711318016052246, "learning_rate": 2.759917920656635e-05, "loss": 2.0887, "step": 5240 }, { "epoch": 0.45, "grad_norm": 1.4893648624420166, "learning_rate": 2.7582079343365253e-05, "loss": 1.9264, "step": 5244 }, { "epoch": 0.45, "grad_norm": 1.579157829284668, "learning_rate": 2.756497948016416e-05, "loss": 2.126, "step": 5248 }, { "epoch": 0.45, "grad_norm": 1.480891466140747, "learning_rate": 2.7547879616963068e-05, "loss": 2.2045, "step": 5252 }, { "epoch": 0.45, "grad_norm": 1.8084710836410522, "learning_rate": 2.7530779753761972e-05, "loss": 2.2102, "step": 5256 }, { "epoch": 0.45, "grad_norm": 1.4894654750823975, "learning_rate": 2.751367989056088e-05, "loss": 2.1216, "step": 5260 }, { "epoch": 0.45, "grad_norm": 1.478981852531433, "learning_rate": 2.749658002735978e-05, "loss": 2.0361, "step": 5264 }, { "epoch": 0.45, "grad_norm": 1.5740448236465454, "learning_rate": 2.7479480164158687e-05, "loss": 1.9947, "step": 5268 }, { "epoch": 0.45, "grad_norm": 1.4918889999389648, "learning_rate": 2.7462380300957598e-05, "loss": 2.068, "step": 5272 }, { "epoch": 0.45, "grad_norm": 1.586724877357483, "learning_rate": 2.74452804377565e-05, "loss": 1.9932, "step": 5276 }, { "epoch": 0.45, "grad_norm": 1.7443528175354004, "learning_rate": 2.7428180574555406e-05, "loss": 2.2906, "step": 5280 }, { "epoch": 0.45, "grad_norm": 1.630373239517212, "learning_rate": 2.741108071135431e-05, "loss": 2.161, "step": 5284 }, { "epoch": 0.45, "grad_norm": 1.3437303304672241, "learning_rate": 2.7393980848153217e-05, "loss": 1.9711, "step": 5288 }, { "epoch": 0.45, "grad_norm": 1.6629517078399658, "learning_rate": 2.7376880984952118e-05, "loss": 2.1511, "step": 5292 }, { "epoch": 0.45, "grad_norm": 1.507279872894287, "learning_rate": 2.735978112175103e-05, "loss": 2.0911, "step": 5296 }, { "epoch": 0.45, "grad_norm": 1.5633537769317627, "learning_rate": 2.7342681258549936e-05, "loss": 2.0376, "step": 5300 }, { "epoch": 0.45, "grad_norm": 1.524722933769226, "learning_rate": 2.7325581395348836e-05, "loss": 2.131, "step": 5304 }, { "epoch": 0.45, "grad_norm": 1.4950675964355469, "learning_rate": 2.7308481532147744e-05, "loss": 2.1594, "step": 5308 }, { "epoch": 0.45, "grad_norm": 1.6241108179092407, "learning_rate": 2.7291381668946648e-05, "loss": 2.1674, "step": 5312 }, { "epoch": 0.45, "grad_norm": 1.6131986379623413, "learning_rate": 2.7274281805745555e-05, "loss": 2.0742, "step": 5316 }, { "epoch": 0.45, "grad_norm": 1.5340795516967773, "learning_rate": 2.7257181942544462e-05, "loss": 2.0877, "step": 5320 }, { "epoch": 0.46, "grad_norm": 1.586997389793396, "learning_rate": 2.7240082079343366e-05, "loss": 2.2434, "step": 5324 }, { "epoch": 0.46, "grad_norm": 1.51520836353302, "learning_rate": 2.7222982216142274e-05, "loss": 2.0303, "step": 5328 }, { "epoch": 0.46, "grad_norm": 1.6436048746109009, "learning_rate": 2.7205882352941174e-05, "loss": 2.0539, "step": 5332 }, { "epoch": 0.46, "grad_norm": 1.3098795413970947, "learning_rate": 2.7188782489740085e-05, "loss": 2.2811, "step": 5336 }, { "epoch": 0.46, "grad_norm": 1.6447290182113647, "learning_rate": 2.7171682626538992e-05, "loss": 2.1018, "step": 5340 }, { "epoch": 0.46, "grad_norm": 1.8019658327102661, "learning_rate": 2.7154582763337893e-05, "loss": 2.0778, "step": 5344 }, { "epoch": 0.46, "grad_norm": 1.7310752868652344, "learning_rate": 2.71374829001368e-05, "loss": 2.1467, "step": 5348 }, { "epoch": 0.46, "grad_norm": 1.4999805688858032, "learning_rate": 2.7120383036935704e-05, "loss": 2.134, "step": 5352 }, { "epoch": 0.46, "grad_norm": 1.4711189270019531, "learning_rate": 2.710328317373461e-05, "loss": 2.1542, "step": 5356 }, { "epoch": 0.46, "grad_norm": 1.477725863456726, "learning_rate": 2.708618331053352e-05, "loss": 2.0225, "step": 5360 }, { "epoch": 0.46, "grad_norm": 1.6154271364212036, "learning_rate": 2.7069083447332423e-05, "loss": 2.1603, "step": 5364 }, { "epoch": 0.46, "grad_norm": 1.6522784233093262, "learning_rate": 2.705198358413133e-05, "loss": 2.2915, "step": 5368 }, { "epoch": 0.46, "grad_norm": 1.6237088441848755, "learning_rate": 2.703488372093023e-05, "loss": 1.9629, "step": 5372 }, { "epoch": 0.46, "grad_norm": 1.8103796243667603, "learning_rate": 2.701778385772914e-05, "loss": 2.0765, "step": 5376 }, { "epoch": 0.46, "grad_norm": 1.4163451194763184, "learning_rate": 2.700068399452805e-05, "loss": 2.0539, "step": 5380 }, { "epoch": 0.46, "grad_norm": 2.560591697692871, "learning_rate": 2.698358413132695e-05, "loss": 2.1445, "step": 5384 }, { "epoch": 0.46, "grad_norm": 1.5179965496063232, "learning_rate": 2.6966484268125857e-05, "loss": 2.1425, "step": 5388 }, { "epoch": 0.46, "grad_norm": 1.5749553442001343, "learning_rate": 2.694938440492476e-05, "loss": 1.8312, "step": 5392 }, { "epoch": 0.46, "grad_norm": 1.3497620820999146, "learning_rate": 2.6932284541723668e-05, "loss": 1.9427, "step": 5396 }, { "epoch": 0.46, "grad_norm": 1.3808335065841675, "learning_rate": 2.6915184678522575e-05, "loss": 2.2364, "step": 5400 }, { "epoch": 0.46, "grad_norm": 1.8348784446716309, "learning_rate": 2.689808481532148e-05, "loss": 1.9962, "step": 5404 }, { "epoch": 0.46, "grad_norm": 1.3197436332702637, "learning_rate": 2.6880984952120387e-05, "loss": 2.1261, "step": 5408 }, { "epoch": 0.46, "grad_norm": 1.622539758682251, "learning_rate": 2.6863885088919287e-05, "loss": 2.2002, "step": 5412 }, { "epoch": 0.46, "grad_norm": 1.4938942193984985, "learning_rate": 2.6846785225718198e-05, "loss": 2.0079, "step": 5416 }, { "epoch": 0.46, "grad_norm": 1.4376903772354126, "learning_rate": 2.68296853625171e-05, "loss": 1.9435, "step": 5420 }, { "epoch": 0.46, "grad_norm": 1.7015161514282227, "learning_rate": 2.6812585499316006e-05, "loss": 2.1806, "step": 5424 }, { "epoch": 0.46, "grad_norm": 1.7079006433486938, "learning_rate": 2.6795485636114913e-05, "loss": 2.1705, "step": 5428 }, { "epoch": 0.46, "grad_norm": 1.5330369472503662, "learning_rate": 2.6778385772913817e-05, "loss": 2.109, "step": 5432 }, { "epoch": 0.46, "grad_norm": 1.606583595275879, "learning_rate": 2.6761285909712724e-05, "loss": 2.1793, "step": 5436 }, { "epoch": 0.47, "grad_norm": 1.5606343746185303, "learning_rate": 2.674418604651163e-05, "loss": 2.1413, "step": 5440 }, { "epoch": 0.47, "grad_norm": 1.5075594186782837, "learning_rate": 2.6727086183310536e-05, "loss": 2.1066, "step": 5444 }, { "epoch": 0.47, "grad_norm": 1.3254252672195435, "learning_rate": 2.6709986320109443e-05, "loss": 1.9901, "step": 5448 }, { "epoch": 0.47, "grad_norm": 1.5555039644241333, "learning_rate": 2.6692886456908344e-05, "loss": 2.1421, "step": 5452 }, { "epoch": 0.47, "grad_norm": 1.5434138774871826, "learning_rate": 2.6675786593707254e-05, "loss": 2.1533, "step": 5456 }, { "epoch": 0.47, "grad_norm": 1.5345412492752075, "learning_rate": 2.6658686730506155e-05, "loss": 2.1013, "step": 5460 }, { "epoch": 0.47, "grad_norm": 1.5029314756393433, "learning_rate": 2.6641586867305062e-05, "loss": 2.0473, "step": 5464 }, { "epoch": 0.47, "grad_norm": 1.668525218963623, "learning_rate": 2.662448700410397e-05, "loss": 2.0636, "step": 5468 }, { "epoch": 0.47, "grad_norm": 1.5454894304275513, "learning_rate": 2.6607387140902874e-05, "loss": 2.1219, "step": 5472 }, { "epoch": 0.47, "grad_norm": 1.3577351570129395, "learning_rate": 2.659028727770178e-05, "loss": 1.9395, "step": 5476 }, { "epoch": 0.47, "grad_norm": 1.7034279108047485, "learning_rate": 2.6573187414500685e-05, "loss": 2.1734, "step": 5480 }, { "epoch": 0.47, "grad_norm": 1.519974708557129, "learning_rate": 2.6556087551299592e-05, "loss": 2.0007, "step": 5484 }, { "epoch": 0.47, "grad_norm": 1.6780128479003906, "learning_rate": 2.65389876880985e-05, "loss": 2.2113, "step": 5488 }, { "epoch": 0.47, "grad_norm": 1.5499041080474854, "learning_rate": 2.65218878248974e-05, "loss": 2.0985, "step": 5492 }, { "epoch": 0.47, "grad_norm": 1.5147311687469482, "learning_rate": 2.6504787961696307e-05, "loss": 2.0704, "step": 5496 }, { "epoch": 0.47, "grad_norm": 1.4439009428024292, "learning_rate": 2.648768809849521e-05, "loss": 1.9784, "step": 5500 }, { "epoch": 0.47, "grad_norm": 1.5315637588500977, "learning_rate": 2.647058823529412e-05, "loss": 2.0466, "step": 5504 }, { "epoch": 0.47, "grad_norm": 1.514778971672058, "learning_rate": 2.6453488372093026e-05, "loss": 2.1113, "step": 5508 }, { "epoch": 0.47, "grad_norm": 1.898563027381897, "learning_rate": 2.643638850889193e-05, "loss": 2.129, "step": 5512 }, { "epoch": 0.47, "grad_norm": 1.4862741231918335, "learning_rate": 2.6419288645690837e-05, "loss": 2.1319, "step": 5516 }, { "epoch": 0.47, "grad_norm": 1.5534049272537231, "learning_rate": 2.6402188782489738e-05, "loss": 2.1904, "step": 5520 }, { "epoch": 0.47, "grad_norm": 1.6791940927505493, "learning_rate": 2.638508891928865e-05, "loss": 2.0653, "step": 5524 }, { "epoch": 0.47, "grad_norm": 1.4288556575775146, "learning_rate": 2.6367989056087556e-05, "loss": 2.1007, "step": 5528 }, { "epoch": 0.47, "grad_norm": 1.4318090677261353, "learning_rate": 2.6350889192886457e-05, "loss": 2.0791, "step": 5532 }, { "epoch": 0.47, "grad_norm": 1.5377432107925415, "learning_rate": 2.6333789329685364e-05, "loss": 2.2141, "step": 5536 }, { "epoch": 0.47, "grad_norm": 1.6055461168289185, "learning_rate": 2.6316689466484268e-05, "loss": 2.0733, "step": 5540 }, { "epoch": 0.47, "grad_norm": 1.4851833581924438, "learning_rate": 2.6299589603283175e-05, "loss": 2.1332, "step": 5544 }, { "epoch": 0.47, "grad_norm": 1.4060307741165161, "learning_rate": 2.628248974008208e-05, "loss": 2.0943, "step": 5548 }, { "epoch": 0.47, "grad_norm": 1.5619680881500244, "learning_rate": 2.6265389876880987e-05, "loss": 2.1601, "step": 5552 }, { "epoch": 0.48, "grad_norm": 1.6302011013031006, "learning_rate": 2.6248290013679894e-05, "loss": 2.1288, "step": 5556 }, { "epoch": 0.48, "grad_norm": 1.514349102973938, "learning_rate": 2.6231190150478794e-05, "loss": 1.9701, "step": 5560 }, { "epoch": 0.48, "grad_norm": 1.5871026515960693, "learning_rate": 2.6214090287277705e-05, "loss": 2.0344, "step": 5564 }, { "epoch": 0.48, "grad_norm": 1.5523772239685059, "learning_rate": 2.6196990424076606e-05, "loss": 2.0973, "step": 5568 }, { "epoch": 0.48, "grad_norm": 1.4616386890411377, "learning_rate": 2.6179890560875513e-05, "loss": 2.1624, "step": 5572 }, { "epoch": 0.48, "grad_norm": 1.53379487991333, "learning_rate": 2.616279069767442e-05, "loss": 2.0887, "step": 5576 }, { "epoch": 0.48, "grad_norm": 1.6133915185928345, "learning_rate": 2.6145690834473324e-05, "loss": 2.0548, "step": 5580 }, { "epoch": 0.48, "grad_norm": 1.7219460010528564, "learning_rate": 2.6128590971272232e-05, "loss": 2.0882, "step": 5584 }, { "epoch": 0.48, "grad_norm": 1.4539477825164795, "learning_rate": 2.6111491108071136e-05, "loss": 1.9867, "step": 5588 }, { "epoch": 0.48, "grad_norm": 1.6804320812225342, "learning_rate": 2.6094391244870043e-05, "loss": 2.2284, "step": 5592 }, { "epoch": 0.48, "grad_norm": 1.6927332878112793, "learning_rate": 2.607729138166895e-05, "loss": 2.0798, "step": 5596 }, { "epoch": 0.48, "grad_norm": 1.5225088596343994, "learning_rate": 2.606019151846785e-05, "loss": 2.1072, "step": 5600 }, { "epoch": 0.48, "grad_norm": 1.4824042320251465, "learning_rate": 2.604309165526676e-05, "loss": 2.0789, "step": 5604 }, { "epoch": 0.48, "grad_norm": 1.3722364902496338, "learning_rate": 2.6025991792065662e-05, "loss": 2.1098, "step": 5608 }, { "epoch": 0.48, "grad_norm": 1.4595104455947876, "learning_rate": 2.600889192886457e-05, "loss": 2.119, "step": 5612 }, { "epoch": 0.48, "grad_norm": 1.418157696723938, "learning_rate": 2.5991792065663477e-05, "loss": 2.1059, "step": 5616 }, { "epoch": 0.48, "grad_norm": 1.4522316455841064, "learning_rate": 2.597469220246238e-05, "loss": 2.2825, "step": 5620 }, { "epoch": 0.48, "grad_norm": 1.685712456703186, "learning_rate": 2.5957592339261288e-05, "loss": 2.0914, "step": 5624 }, { "epoch": 0.48, "grad_norm": 1.5950579643249512, "learning_rate": 2.5940492476060192e-05, "loss": 2.0957, "step": 5628 }, { "epoch": 0.48, "grad_norm": 1.5742863416671753, "learning_rate": 2.59233926128591e-05, "loss": 2.2025, "step": 5632 }, { "epoch": 0.48, "grad_norm": 1.5395307540893555, "learning_rate": 2.5906292749658007e-05, "loss": 2.0688, "step": 5636 }, { "epoch": 0.48, "grad_norm": 1.5343267917633057, "learning_rate": 2.5889192886456907e-05, "loss": 2.1783, "step": 5640 }, { "epoch": 0.48, "grad_norm": 1.4409688711166382, "learning_rate": 2.5872093023255818e-05, "loss": 1.9016, "step": 5644 }, { "epoch": 0.48, "grad_norm": 1.5827747583389282, "learning_rate": 2.585499316005472e-05, "loss": 2.1501, "step": 5648 }, { "epoch": 0.48, "grad_norm": 1.6408090591430664, "learning_rate": 2.5837893296853626e-05, "loss": 2.0266, "step": 5652 }, { "epoch": 0.48, "grad_norm": 1.454546332359314, "learning_rate": 2.5820793433652533e-05, "loss": 1.9665, "step": 5656 }, { "epoch": 0.48, "grad_norm": 1.4659438133239746, "learning_rate": 2.5803693570451437e-05, "loss": 2.0251, "step": 5660 }, { "epoch": 0.48, "grad_norm": 1.6807434558868408, "learning_rate": 2.5786593707250345e-05, "loss": 2.0587, "step": 5664 }, { "epoch": 0.48, "grad_norm": 1.4312716722488403, "learning_rate": 2.576949384404925e-05, "loss": 1.9798, "step": 5668 }, { "epoch": 0.48, "grad_norm": 1.5319584608078003, "learning_rate": 2.5752393980848156e-05, "loss": 2.0179, "step": 5672 }, { "epoch": 0.49, "grad_norm": 1.5227807760238647, "learning_rate": 2.5735294117647057e-05, "loss": 2.079, "step": 5676 }, { "epoch": 0.49, "grad_norm": 1.5442856550216675, "learning_rate": 2.5718194254445964e-05, "loss": 2.0757, "step": 5680 }, { "epoch": 0.49, "grad_norm": 1.4400264024734497, "learning_rate": 2.5701094391244875e-05, "loss": 2.0712, "step": 5684 }, { "epoch": 0.49, "grad_norm": 1.5786032676696777, "learning_rate": 2.5683994528043775e-05, "loss": 2.1628, "step": 5688 }, { "epoch": 0.49, "grad_norm": 1.5079550743103027, "learning_rate": 2.5666894664842683e-05, "loss": 2.0104, "step": 5692 }, { "epoch": 0.49, "grad_norm": 1.6693617105484009, "learning_rate": 2.5649794801641587e-05, "loss": 1.9668, "step": 5696 }, { "epoch": 0.49, "grad_norm": 1.6841495037078857, "learning_rate": 2.5632694938440494e-05, "loss": 2.2648, "step": 5700 }, { "epoch": 0.49, "grad_norm": 1.4236379861831665, "learning_rate": 2.56155950752394e-05, "loss": 2.1775, "step": 5704 }, { "epoch": 0.49, "grad_norm": 1.4873919486999512, "learning_rate": 2.5598495212038305e-05, "loss": 1.8591, "step": 5708 }, { "epoch": 0.49, "grad_norm": 1.5848278999328613, "learning_rate": 2.5581395348837212e-05, "loss": 2.0736, "step": 5712 }, { "epoch": 0.49, "grad_norm": 1.9348700046539307, "learning_rate": 2.5564295485636113e-05, "loss": 2.1204, "step": 5716 }, { "epoch": 0.49, "grad_norm": 1.5655988454818726, "learning_rate": 2.554719562243502e-05, "loss": 2.0318, "step": 5720 }, { "epoch": 0.49, "grad_norm": 1.6480430364608765, "learning_rate": 2.553009575923393e-05, "loss": 2.0188, "step": 5724 }, { "epoch": 0.49, "grad_norm": 1.4328726530075073, "learning_rate": 2.5512995896032832e-05, "loss": 2.1973, "step": 5728 }, { "epoch": 0.49, "grad_norm": 1.6577439308166504, "learning_rate": 2.549589603283174e-05, "loss": 2.0672, "step": 5732 }, { "epoch": 0.49, "grad_norm": 1.3879199028015137, "learning_rate": 2.5478796169630643e-05, "loss": 1.9124, "step": 5736 }, { "epoch": 0.49, "grad_norm": 1.4111934900283813, "learning_rate": 2.546169630642955e-05, "loss": 1.9643, "step": 5740 }, { "epoch": 0.49, "grad_norm": 1.6698105335235596, "learning_rate": 2.5444596443228458e-05, "loss": 2.167, "step": 5744 }, { "epoch": 0.49, "grad_norm": 1.643890380859375, "learning_rate": 2.542749658002736e-05, "loss": 2.1449, "step": 5748 }, { "epoch": 0.49, "grad_norm": 1.4590214490890503, "learning_rate": 2.541039671682627e-05, "loss": 2.2577, "step": 5752 }, { "epoch": 0.49, "grad_norm": 1.5629549026489258, "learning_rate": 2.539329685362517e-05, "loss": 2.2183, "step": 5756 }, { "epoch": 0.49, "grad_norm": 1.4380923509597778, "learning_rate": 2.5376196990424077e-05, "loss": 1.9198, "step": 5760 }, { "epoch": 0.49, "grad_norm": 1.4581702947616577, "learning_rate": 2.5359097127222988e-05, "loss": 2.0363, "step": 5764 }, { "epoch": 0.49, "grad_norm": 1.3352808952331543, "learning_rate": 2.5341997264021888e-05, "loss": 2.1038, "step": 5768 }, { "epoch": 0.49, "grad_norm": 1.604493498802185, "learning_rate": 2.5324897400820796e-05, "loss": 2.1413, "step": 5772 }, { "epoch": 0.49, "grad_norm": 1.360095739364624, "learning_rate": 2.53077975376197e-05, "loss": 2.0967, "step": 5776 }, { "epoch": 0.49, "grad_norm": 1.400731086730957, "learning_rate": 2.5290697674418607e-05, "loss": 2.0638, "step": 5780 }, { "epoch": 0.49, "grad_norm": 1.611741304397583, "learning_rate": 2.5273597811217514e-05, "loss": 1.9994, "step": 5784 }, { "epoch": 0.49, "grad_norm": 1.614253044128418, "learning_rate": 2.5256497948016418e-05, "loss": 2.0905, "step": 5788 }, { "epoch": 0.5, "grad_norm": 1.5998769998550415, "learning_rate": 2.5239398084815325e-05, "loss": 2.2255, "step": 5792 }, { "epoch": 0.5, "grad_norm": 1.5840561389923096, "learning_rate": 2.5222298221614226e-05, "loss": 2.0949, "step": 5796 }, { "epoch": 0.5, "grad_norm": 1.5957179069519043, "learning_rate": 2.5205198358413133e-05, "loss": 1.9565, "step": 5800 }, { "epoch": 0.5, "grad_norm": 1.5299445390701294, "learning_rate": 2.5188098495212037e-05, "loss": 2.0896, "step": 5804 }, { "epoch": 0.5, "grad_norm": 1.5328844785690308, "learning_rate": 2.5170998632010945e-05, "loss": 2.0734, "step": 5808 }, { "epoch": 0.5, "grad_norm": 1.43681800365448, "learning_rate": 2.5153898768809852e-05, "loss": 1.9839, "step": 5812 }, { "epoch": 0.5, "grad_norm": 1.5096018314361572, "learning_rate": 2.5136798905608756e-05, "loss": 2.1289, "step": 5816 }, { "epoch": 0.5, "grad_norm": 1.5648272037506104, "learning_rate": 2.5119699042407663e-05, "loss": 1.9608, "step": 5820 }, { "epoch": 0.5, "grad_norm": 1.7235368490219116, "learning_rate": 2.5102599179206564e-05, "loss": 2.0752, "step": 5824 }, { "epoch": 0.5, "grad_norm": 1.6707707643508911, "learning_rate": 2.5085499316005475e-05, "loss": 2.2227, "step": 5828 }, { "epoch": 0.5, "grad_norm": 1.5506484508514404, "learning_rate": 2.5068399452804382e-05, "loss": 1.9812, "step": 5832 }, { "epoch": 0.5, "grad_norm": 1.50376296043396, "learning_rate": 2.5051299589603283e-05, "loss": 2.0457, "step": 5836 }, { "epoch": 0.5, "grad_norm": 1.5945008993148804, "learning_rate": 2.503419972640219e-05, "loss": 2.0204, "step": 5840 }, { "epoch": 0.5, "grad_norm": 1.6294224262237549, "learning_rate": 2.5017099863201094e-05, "loss": 2.0159, "step": 5844 }, { "epoch": 0.5, "grad_norm": 1.4806592464447021, "learning_rate": 2.5e-05, "loss": 2.0233, "step": 5848 }, { "epoch": 0.5, "grad_norm": 1.4078065156936646, "learning_rate": 2.4982900136798905e-05, "loss": 1.9797, "step": 5852 }, { "epoch": 0.5, "grad_norm": 1.5420806407928467, "learning_rate": 2.4965800273597812e-05, "loss": 2.0582, "step": 5856 }, { "epoch": 0.5, "grad_norm": 1.4052869081497192, "learning_rate": 2.494870041039672e-05, "loss": 2.0785, "step": 5860 }, { "epoch": 0.5, "grad_norm": 1.4887535572052002, "learning_rate": 2.4931600547195624e-05, "loss": 2.1067, "step": 5864 }, { "epoch": 0.5, "grad_norm": 1.723113775253296, "learning_rate": 2.4914500683994528e-05, "loss": 1.973, "step": 5868 }, { "epoch": 0.5, "grad_norm": 4.109470367431641, "learning_rate": 2.4897400820793435e-05, "loss": 2.2463, "step": 5872 }, { "epoch": 0.5, "grad_norm": 1.645058274269104, "learning_rate": 2.488030095759234e-05, "loss": 2.1643, "step": 5876 }, { "epoch": 0.5, "grad_norm": 1.4571024179458618, "learning_rate": 2.4863201094391243e-05, "loss": 2.0342, "step": 5880 }, { "epoch": 0.5, "grad_norm": 1.738869309425354, "learning_rate": 2.4846101231190154e-05, "loss": 2.106, "step": 5884 }, { "epoch": 0.5, "grad_norm": 1.58241868019104, "learning_rate": 2.4829001367989058e-05, "loss": 2.1393, "step": 5888 }, { "epoch": 0.5, "grad_norm": 1.5051425695419312, "learning_rate": 2.481190150478796e-05, "loss": 2.1032, "step": 5892 }, { "epoch": 0.5, "grad_norm": 1.5358223915100098, "learning_rate": 2.479480164158687e-05, "loss": 2.065, "step": 5896 }, { "epoch": 0.5, "grad_norm": 1.3522099256515503, "learning_rate": 2.4777701778385773e-05, "loss": 1.9355, "step": 5900 }, { "epoch": 0.5, "grad_norm": 1.4918060302734375, "learning_rate": 2.476060191518468e-05, "loss": 2.0173, "step": 5904 }, { "epoch": 0.51, "grad_norm": 1.5328975915908813, "learning_rate": 2.4743502051983584e-05, "loss": 2.1863, "step": 5908 }, { "epoch": 0.51, "grad_norm": 1.4947617053985596, "learning_rate": 2.472640218878249e-05, "loss": 2.1405, "step": 5912 }, { "epoch": 0.51, "grad_norm": 1.535042643547058, "learning_rate": 2.4709302325581396e-05, "loss": 1.9693, "step": 5916 }, { "epoch": 0.51, "grad_norm": 1.497016191482544, "learning_rate": 2.46922024623803e-05, "loss": 1.9618, "step": 5920 }, { "epoch": 0.51, "grad_norm": 1.490457534790039, "learning_rate": 2.467510259917921e-05, "loss": 2.0978, "step": 5924 }, { "epoch": 0.51, "grad_norm": 1.4083185195922852, "learning_rate": 2.4658002735978114e-05, "loss": 1.9533, "step": 5928 }, { "epoch": 0.51, "grad_norm": 1.638238549232483, "learning_rate": 2.4640902872777018e-05, "loss": 2.0625, "step": 5932 }, { "epoch": 0.51, "grad_norm": 1.6385629177093506, "learning_rate": 2.4623803009575925e-05, "loss": 1.9919, "step": 5936 }, { "epoch": 0.51, "grad_norm": 1.3852545022964478, "learning_rate": 2.460670314637483e-05, "loss": 2.0971, "step": 5940 }, { "epoch": 0.51, "grad_norm": 1.4530996084213257, "learning_rate": 2.4589603283173733e-05, "loss": 2.0959, "step": 5944 }, { "epoch": 0.51, "grad_norm": 1.573773980140686, "learning_rate": 2.457250341997264e-05, "loss": 2.1314, "step": 5948 }, { "epoch": 0.51, "grad_norm": 1.4911787509918213, "learning_rate": 2.4555403556771548e-05, "loss": 1.9789, "step": 5952 }, { "epoch": 0.51, "grad_norm": 1.5948853492736816, "learning_rate": 2.4538303693570452e-05, "loss": 2.1686, "step": 5956 }, { "epoch": 0.51, "grad_norm": 1.3944917917251587, "learning_rate": 2.4521203830369356e-05, "loss": 1.9249, "step": 5960 }, { "epoch": 0.51, "grad_norm": 1.5396506786346436, "learning_rate": 2.4504103967168263e-05, "loss": 2.0486, "step": 5964 }, { "epoch": 0.51, "grad_norm": 1.5431126356124878, "learning_rate": 2.448700410396717e-05, "loss": 1.9906, "step": 5968 }, { "epoch": 0.51, "grad_norm": 1.4996250867843628, "learning_rate": 2.4469904240766075e-05, "loss": 2.0529, "step": 5972 }, { "epoch": 0.51, "grad_norm": 1.653665542602539, "learning_rate": 2.4452804377564982e-05, "loss": 2.0667, "step": 5976 }, { "epoch": 0.51, "grad_norm": 1.543585181236267, "learning_rate": 2.4435704514363886e-05, "loss": 2.2203, "step": 5980 }, { "epoch": 0.51, "grad_norm": 1.4741888046264648, "learning_rate": 2.441860465116279e-05, "loss": 2.0958, "step": 5984 }, { "epoch": 0.51, "grad_norm": 1.7316616773605347, "learning_rate": 2.4401504787961697e-05, "loss": 2.0456, "step": 5988 }, { "epoch": 0.51, "grad_norm": 1.6189284324645996, "learning_rate": 2.4384404924760605e-05, "loss": 2.0137, "step": 5992 }, { "epoch": 0.51, "grad_norm": 1.3436368703842163, "learning_rate": 2.436730506155951e-05, "loss": 2.0219, "step": 5996 }, { "epoch": 0.51, "grad_norm": 1.45026433467865, "learning_rate": 2.4350205198358412e-05, "loss": 1.938, "step": 6000 }, { "epoch": 0.51, "grad_norm": 1.4618995189666748, "learning_rate": 2.433310533515732e-05, "loss": 2.1091, "step": 6004 }, { "epoch": 0.51, "grad_norm": 1.3523361682891846, "learning_rate": 2.4316005471956224e-05, "loss": 2.0435, "step": 6008 }, { "epoch": 0.51, "grad_norm": 1.414756178855896, "learning_rate": 2.429890560875513e-05, "loss": 2.0553, "step": 6012 }, { "epoch": 0.51, "grad_norm": 1.8548967838287354, "learning_rate": 2.428180574555404e-05, "loss": 2.1002, "step": 6016 }, { "epoch": 0.51, "grad_norm": 1.431153416633606, "learning_rate": 2.4264705882352942e-05, "loss": 1.9441, "step": 6020 }, { "epoch": 0.52, "grad_norm": 1.4049307107925415, "learning_rate": 2.4247606019151846e-05, "loss": 2.1948, "step": 6024 }, { "epoch": 0.52, "grad_norm": 1.4106427431106567, "learning_rate": 2.4230506155950754e-05, "loss": 1.9183, "step": 6028 }, { "epoch": 0.52, "grad_norm": 1.8905525207519531, "learning_rate": 2.421340629274966e-05, "loss": 1.961, "step": 6032 }, { "epoch": 0.52, "grad_norm": 1.5894415378570557, "learning_rate": 2.4196306429548565e-05, "loss": 2.1565, "step": 6036 }, { "epoch": 0.52, "grad_norm": 1.6033257246017456, "learning_rate": 2.417920656634747e-05, "loss": 1.8504, "step": 6040 }, { "epoch": 0.52, "grad_norm": 2.1501522064208984, "learning_rate": 2.4162106703146376e-05, "loss": 2.0988, "step": 6044 }, { "epoch": 0.52, "grad_norm": 1.688828945159912, "learning_rate": 2.414500683994528e-05, "loss": 2.0132, "step": 6048 }, { "epoch": 0.52, "grad_norm": 1.479693055152893, "learning_rate": 2.4127906976744188e-05, "loss": 2.0963, "step": 6052 }, { "epoch": 0.52, "grad_norm": 1.6395564079284668, "learning_rate": 2.4110807113543095e-05, "loss": 2.1367, "step": 6056 }, { "epoch": 0.52, "grad_norm": 1.4523251056671143, "learning_rate": 2.4093707250342e-05, "loss": 2.0349, "step": 6060 }, { "epoch": 0.52, "grad_norm": 1.580451488494873, "learning_rate": 2.4076607387140903e-05, "loss": 1.9535, "step": 6064 }, { "epoch": 0.52, "grad_norm": 1.5951658487319946, "learning_rate": 2.405950752393981e-05, "loss": 2.1379, "step": 6068 }, { "epoch": 0.52, "grad_norm": 1.5413308143615723, "learning_rate": 2.4042407660738714e-05, "loss": 2.0473, "step": 6072 }, { "epoch": 0.52, "grad_norm": 1.5242167711257935, "learning_rate": 2.402530779753762e-05, "loss": 2.026, "step": 6076 }, { "epoch": 0.52, "grad_norm": 1.484012484550476, "learning_rate": 2.4008207934336525e-05, "loss": 2.0974, "step": 6080 }, { "epoch": 0.52, "grad_norm": 1.6217378377914429, "learning_rate": 2.3991108071135433e-05, "loss": 2.0572, "step": 6084 }, { "epoch": 0.52, "grad_norm": 1.622316837310791, "learning_rate": 2.3974008207934337e-05, "loss": 2.111, "step": 6088 }, { "epoch": 0.52, "grad_norm": 1.4300661087036133, "learning_rate": 2.395690834473324e-05, "loss": 2.0311, "step": 6092 }, { "epoch": 0.52, "grad_norm": 1.4880815744400024, "learning_rate": 2.393980848153215e-05, "loss": 1.9217, "step": 6096 }, { "epoch": 0.52, "grad_norm": 1.5886203050613403, "learning_rate": 2.3922708618331055e-05, "loss": 2.1043, "step": 6100 }, { "epoch": 0.52, "grad_norm": 1.4203860759735107, "learning_rate": 2.390560875512996e-05, "loss": 2.1326, "step": 6104 }, { "epoch": 0.52, "grad_norm": 1.777569055557251, "learning_rate": 2.3888508891928867e-05, "loss": 2.0363, "step": 6108 }, { "epoch": 0.52, "grad_norm": 1.4837594032287598, "learning_rate": 2.387140902872777e-05, "loss": 1.8943, "step": 6112 }, { "epoch": 0.52, "grad_norm": 1.6649631261825562, "learning_rate": 2.3854309165526678e-05, "loss": 2.1792, "step": 6116 }, { "epoch": 0.52, "grad_norm": 1.4544529914855957, "learning_rate": 2.3837209302325582e-05, "loss": 1.9499, "step": 6120 }, { "epoch": 0.52, "grad_norm": 1.4706698656082153, "learning_rate": 2.382010943912449e-05, "loss": 1.9453, "step": 6124 }, { "epoch": 0.52, "grad_norm": 1.8051691055297852, "learning_rate": 2.3803009575923393e-05, "loss": 2.124, "step": 6128 }, { "epoch": 0.52, "grad_norm": 1.5130858421325684, "learning_rate": 2.3785909712722297e-05, "loss": 2.0967, "step": 6132 }, { "epoch": 0.52, "grad_norm": 1.6385159492492676, "learning_rate": 2.3768809849521205e-05, "loss": 2.155, "step": 6136 }, { "epoch": 0.52, "grad_norm": 1.579830527305603, "learning_rate": 2.3751709986320112e-05, "loss": 2.0291, "step": 6140 }, { "epoch": 0.53, "grad_norm": 1.6579816341400146, "learning_rate": 2.3734610123119016e-05, "loss": 1.9364, "step": 6144 }, { "epoch": 0.53, "grad_norm": 1.4621435403823853, "learning_rate": 2.3717510259917923e-05, "loss": 2.0041, "step": 6148 }, { "epoch": 0.53, "grad_norm": 1.4901877641677856, "learning_rate": 2.3700410396716827e-05, "loss": 2.0768, "step": 6152 }, { "epoch": 0.53, "grad_norm": 1.5023964643478394, "learning_rate": 2.368331053351573e-05, "loss": 1.8666, "step": 6156 }, { "epoch": 0.53, "grad_norm": 1.394523024559021, "learning_rate": 2.366621067031464e-05, "loss": 2.0315, "step": 6160 }, { "epoch": 0.53, "grad_norm": 1.3989911079406738, "learning_rate": 2.3649110807113546e-05, "loss": 2.0719, "step": 6164 }, { "epoch": 0.53, "grad_norm": 1.6071873903274536, "learning_rate": 2.363201094391245e-05, "loss": 2.07, "step": 6168 }, { "epoch": 0.53, "grad_norm": 1.4817439317703247, "learning_rate": 2.3614911080711354e-05, "loss": 2.0119, "step": 6172 }, { "epoch": 0.53, "grad_norm": 1.3561433553695679, "learning_rate": 2.359781121751026e-05, "loss": 1.9405, "step": 6176 }, { "epoch": 0.53, "grad_norm": 1.5416829586029053, "learning_rate": 2.358071135430917e-05, "loss": 2.0001, "step": 6180 }, { "epoch": 0.53, "grad_norm": 1.4573452472686768, "learning_rate": 2.3563611491108072e-05, "loss": 1.9938, "step": 6184 }, { "epoch": 0.53, "grad_norm": 1.6019344329833984, "learning_rate": 2.354651162790698e-05, "loss": 1.9446, "step": 6188 }, { "epoch": 0.53, "grad_norm": 1.7344564199447632, "learning_rate": 2.3529411764705884e-05, "loss": 1.862, "step": 6192 }, { "epoch": 0.53, "grad_norm": 1.6940727233886719, "learning_rate": 2.3512311901504788e-05, "loss": 1.9843, "step": 6196 }, { "epoch": 0.53, "grad_norm": 1.5194576978683472, "learning_rate": 2.3495212038303695e-05, "loss": 2.0475, "step": 6200 }, { "epoch": 0.53, "grad_norm": 1.7754817008972168, "learning_rate": 2.3478112175102602e-05, "loss": 2.0975, "step": 6204 }, { "epoch": 0.53, "grad_norm": 1.676499605178833, "learning_rate": 2.3461012311901506e-05, "loss": 2.1042, "step": 6208 }, { "epoch": 0.53, "grad_norm": 1.5539844036102295, "learning_rate": 2.344391244870041e-05, "loss": 2.0322, "step": 6212 }, { "epoch": 0.53, "grad_norm": 1.5558497905731201, "learning_rate": 2.3426812585499318e-05, "loss": 2.1731, "step": 6216 }, { "epoch": 0.53, "grad_norm": 1.4876151084899902, "learning_rate": 2.340971272229822e-05, "loss": 1.9864, "step": 6220 }, { "epoch": 0.53, "grad_norm": 1.5103055238723755, "learning_rate": 2.339261285909713e-05, "loss": 2.0973, "step": 6224 }, { "epoch": 0.53, "grad_norm": 1.617957353591919, "learning_rate": 2.3375512995896033e-05, "loss": 1.8867, "step": 6228 }, { "epoch": 0.53, "grad_norm": 1.417094111442566, "learning_rate": 2.335841313269494e-05, "loss": 2.0805, "step": 6232 }, { "epoch": 0.53, "grad_norm": 1.5233298540115356, "learning_rate": 2.3341313269493844e-05, "loss": 1.9551, "step": 6236 }, { "epoch": 0.53, "grad_norm": 1.5211933851242065, "learning_rate": 2.3324213406292748e-05, "loss": 2.0242, "step": 6240 }, { "epoch": 0.53, "grad_norm": 1.4824392795562744, "learning_rate": 2.330711354309166e-05, "loss": 1.9477, "step": 6244 }, { "epoch": 0.53, "grad_norm": 1.4873578548431396, "learning_rate": 2.3290013679890563e-05, "loss": 2.1701, "step": 6248 }, { "epoch": 0.53, "grad_norm": 1.5688248872756958, "learning_rate": 2.3272913816689467e-05, "loss": 2.0478, "step": 6252 }, { "epoch": 0.53, "grad_norm": 1.4556810855865479, "learning_rate": 2.3255813953488374e-05, "loss": 1.9112, "step": 6256 }, { "epoch": 0.54, "grad_norm": 1.8388420343399048, "learning_rate": 2.3238714090287278e-05, "loss": 2.1023, "step": 6260 }, { "epoch": 0.54, "grad_norm": 1.615323543548584, "learning_rate": 2.3221614227086182e-05, "loss": 2.0831, "step": 6264 }, { "epoch": 0.54, "grad_norm": 1.488853096961975, "learning_rate": 2.320451436388509e-05, "loss": 1.9403, "step": 6268 }, { "epoch": 0.54, "grad_norm": 1.5217219591140747, "learning_rate": 2.3187414500683997e-05, "loss": 2.1556, "step": 6272 }, { "epoch": 0.54, "grad_norm": 1.4916918277740479, "learning_rate": 2.31703146374829e-05, "loss": 2.0442, "step": 6276 }, { "epoch": 0.54, "grad_norm": 1.976369857788086, "learning_rate": 2.3153214774281805e-05, "loss": 2.0054, "step": 6280 }, { "epoch": 0.54, "grad_norm": 1.6137586832046509, "learning_rate": 2.3136114911080712e-05, "loss": 2.0149, "step": 6284 }, { "epoch": 0.54, "grad_norm": 1.3688173294067383, "learning_rate": 2.311901504787962e-05, "loss": 2.0518, "step": 6288 }, { "epoch": 0.54, "grad_norm": 1.489639163017273, "learning_rate": 2.3101915184678523e-05, "loss": 2.186, "step": 6292 }, { "epoch": 0.54, "grad_norm": 1.4436581134796143, "learning_rate": 2.308481532147743e-05, "loss": 1.9726, "step": 6296 }, { "epoch": 0.54, "grad_norm": 5.76146125793457, "learning_rate": 2.3067715458276335e-05, "loss": 1.9351, "step": 6300 }, { "epoch": 0.54, "grad_norm": 1.7009334564208984, "learning_rate": 2.305061559507524e-05, "loss": 1.9967, "step": 6304 }, { "epoch": 0.54, "grad_norm": 1.611024260520935, "learning_rate": 2.3033515731874146e-05, "loss": 2.0911, "step": 6308 }, { "epoch": 0.54, "grad_norm": 1.5418471097946167, "learning_rate": 2.3016415868673053e-05, "loss": 1.9721, "step": 6312 }, { "epoch": 0.54, "grad_norm": 1.5658395290374756, "learning_rate": 2.2999316005471957e-05, "loss": 2.0573, "step": 6316 }, { "epoch": 0.54, "grad_norm": 1.365360140800476, "learning_rate": 2.298221614227086e-05, "loss": 1.9245, "step": 6320 }, { "epoch": 0.54, "grad_norm": 1.7127231359481812, "learning_rate": 2.296511627906977e-05, "loss": 1.9568, "step": 6324 }, { "epoch": 0.54, "grad_norm": 1.572302222251892, "learning_rate": 2.2948016415868672e-05, "loss": 2.0137, "step": 6328 }, { "epoch": 0.54, "grad_norm": 1.7327531576156616, "learning_rate": 2.293091655266758e-05, "loss": 2.0781, "step": 6332 }, { "epoch": 0.54, "grad_norm": 1.5620208978652954, "learning_rate": 2.2913816689466487e-05, "loss": 2.0689, "step": 6336 }, { "epoch": 0.54, "grad_norm": 1.4865524768829346, "learning_rate": 2.289671682626539e-05, "loss": 1.9261, "step": 6340 }, { "epoch": 0.54, "grad_norm": 1.6262495517730713, "learning_rate": 2.2879616963064295e-05, "loss": 2.1261, "step": 6344 }, { "epoch": 0.54, "grad_norm": 1.536917805671692, "learning_rate": 2.2862517099863202e-05, "loss": 2.0757, "step": 6348 }, { "epoch": 0.54, "grad_norm": 1.4335952997207642, "learning_rate": 2.284541723666211e-05, "loss": 1.967, "step": 6352 }, { "epoch": 0.54, "grad_norm": 2.1377627849578857, "learning_rate": 2.2828317373461014e-05, "loss": 1.996, "step": 6356 }, { "epoch": 0.54, "grad_norm": 1.705106496810913, "learning_rate": 2.2811217510259918e-05, "loss": 2.0619, "step": 6360 }, { "epoch": 0.54, "grad_norm": 1.5044035911560059, "learning_rate": 2.2794117647058825e-05, "loss": 2.0941, "step": 6364 }, { "epoch": 0.54, "grad_norm": 1.485723614692688, "learning_rate": 2.277701778385773e-05, "loss": 2.0471, "step": 6368 }, { "epoch": 0.54, "grad_norm": 1.3270933628082275, "learning_rate": 2.2759917920656636e-05, "loss": 1.9035, "step": 6372 }, { "epoch": 0.55, "grad_norm": 1.8408259153366089, "learning_rate": 2.2742818057455544e-05, "loss": 2.1103, "step": 6376 }, { "epoch": 0.55, "grad_norm": 1.5372594594955444, "learning_rate": 2.2725718194254448e-05, "loss": 1.9872, "step": 6380 }, { "epoch": 0.55, "grad_norm": 1.5519907474517822, "learning_rate": 2.270861833105335e-05, "loss": 2.0445, "step": 6384 }, { "epoch": 0.55, "grad_norm": 1.5492377281188965, "learning_rate": 2.269151846785226e-05, "loss": 2.1288, "step": 6388 }, { "epoch": 0.55, "grad_norm": 1.5490655899047852, "learning_rate": 2.2674418604651163e-05, "loss": 2.0827, "step": 6392 }, { "epoch": 0.55, "grad_norm": 1.575005292892456, "learning_rate": 2.265731874145007e-05, "loss": 2.0462, "step": 6396 }, { "epoch": 0.55, "grad_norm": 1.5636953115463257, "learning_rate": 2.2640218878248974e-05, "loss": 2.0272, "step": 6400 }, { "epoch": 0.55, "grad_norm": 1.4085521697998047, "learning_rate": 2.262311901504788e-05, "loss": 1.9316, "step": 6404 }, { "epoch": 0.55, "grad_norm": 1.375346064567566, "learning_rate": 2.2606019151846785e-05, "loss": 2.1006, "step": 6408 }, { "epoch": 0.55, "grad_norm": 1.5177054405212402, "learning_rate": 2.258891928864569e-05, "loss": 2.104, "step": 6412 }, { "epoch": 0.55, "grad_norm": 1.590615153312683, "learning_rate": 2.25718194254446e-05, "loss": 2.054, "step": 6416 }, { "epoch": 0.55, "grad_norm": 1.6680008172988892, "learning_rate": 2.2554719562243504e-05, "loss": 2.0611, "step": 6420 }, { "epoch": 0.55, "grad_norm": 2.7814908027648926, "learning_rate": 2.2537619699042408e-05, "loss": 2.0015, "step": 6424 }, { "epoch": 0.55, "grad_norm": 1.419728398323059, "learning_rate": 2.2520519835841315e-05, "loss": 2.0902, "step": 6428 }, { "epoch": 0.55, "grad_norm": 1.4798617362976074, "learning_rate": 2.250341997264022e-05, "loss": 2.05, "step": 6432 }, { "epoch": 0.55, "grad_norm": 1.506953477859497, "learning_rate": 2.2486320109439127e-05, "loss": 2.0153, "step": 6436 }, { "epoch": 0.55, "grad_norm": 1.6882991790771484, "learning_rate": 2.246922024623803e-05, "loss": 1.9237, "step": 6440 }, { "epoch": 0.55, "grad_norm": 1.4662657976150513, "learning_rate": 2.2452120383036938e-05, "loss": 2.0524, "step": 6444 }, { "epoch": 0.55, "grad_norm": 1.5177686214447021, "learning_rate": 2.2435020519835842e-05, "loss": 2.0207, "step": 6448 }, { "epoch": 0.55, "grad_norm": 1.7615087032318115, "learning_rate": 2.2417920656634746e-05, "loss": 2.07, "step": 6452 }, { "epoch": 0.55, "grad_norm": 1.7587693929672241, "learning_rate": 2.2400820793433653e-05, "loss": 2.0928, "step": 6456 }, { "epoch": 0.55, "grad_norm": 1.6532100439071655, "learning_rate": 2.238372093023256e-05, "loss": 2.0425, "step": 6460 }, { "epoch": 0.55, "grad_norm": 1.4801387786865234, "learning_rate": 2.2366621067031464e-05, "loss": 2.1567, "step": 6464 }, { "epoch": 0.55, "grad_norm": 1.6623640060424805, "learning_rate": 2.2349521203830372e-05, "loss": 2.0281, "step": 6468 }, { "epoch": 0.55, "grad_norm": 1.5329927206039429, "learning_rate": 2.2332421340629276e-05, "loss": 1.9157, "step": 6472 }, { "epoch": 0.55, "grad_norm": 1.5152140855789185, "learning_rate": 2.231532147742818e-05, "loss": 2.0218, "step": 6476 }, { "epoch": 0.55, "grad_norm": 1.4606438875198364, "learning_rate": 2.2298221614227087e-05, "loss": 1.9652, "step": 6480 }, { "epoch": 0.55, "grad_norm": 1.4744431972503662, "learning_rate": 2.2281121751025994e-05, "loss": 1.9493, "step": 6484 }, { "epoch": 0.55, "grad_norm": 1.527673363685608, "learning_rate": 2.22640218878249e-05, "loss": 2.1294, "step": 6488 }, { "epoch": 0.56, "grad_norm": 1.5893338918685913, "learning_rate": 2.2246922024623802e-05, "loss": 2.0014, "step": 6492 }, { "epoch": 0.56, "grad_norm": 1.53531014919281, "learning_rate": 2.222982216142271e-05, "loss": 2.1177, "step": 6496 }, { "epoch": 0.56, "grad_norm": 1.4468746185302734, "learning_rate": 2.2212722298221617e-05, "loss": 1.9305, "step": 6500 }, { "epoch": 0.56, "grad_norm": 1.5749852657318115, "learning_rate": 2.219562243502052e-05, "loss": 2.0401, "step": 6504 }, { "epoch": 0.56, "grad_norm": 1.6165703535079956, "learning_rate": 2.217852257181943e-05, "loss": 2.1496, "step": 6508 }, { "epoch": 0.56, "grad_norm": 1.5743823051452637, "learning_rate": 2.2161422708618332e-05, "loss": 2.0051, "step": 6512 }, { "epoch": 0.56, "grad_norm": 1.5725358724594116, "learning_rate": 2.2144322845417236e-05, "loss": 2.1795, "step": 6516 }, { "epoch": 0.56, "grad_norm": 1.5228146314620972, "learning_rate": 2.2127222982216144e-05, "loss": 2.0832, "step": 6520 }, { "epoch": 0.56, "grad_norm": 1.584122657775879, "learning_rate": 2.211012311901505e-05, "loss": 1.9658, "step": 6524 }, { "epoch": 0.56, "grad_norm": 1.6192436218261719, "learning_rate": 2.2093023255813955e-05, "loss": 2.047, "step": 6528 }, { "epoch": 0.56, "grad_norm": 1.4840277433395386, "learning_rate": 2.207592339261286e-05, "loss": 2.0195, "step": 6532 }, { "epoch": 0.56, "grad_norm": 1.5676360130310059, "learning_rate": 2.2058823529411766e-05, "loss": 2.0233, "step": 6536 }, { "epoch": 0.56, "grad_norm": 1.5410796403884888, "learning_rate": 2.204172366621067e-05, "loss": 1.9596, "step": 6540 }, { "epoch": 0.56, "grad_norm": 1.6324695348739624, "learning_rate": 2.2024623803009577e-05, "loss": 1.9401, "step": 6544 }, { "epoch": 0.56, "grad_norm": 1.4897451400756836, "learning_rate": 2.2007523939808485e-05, "loss": 2.0069, "step": 6548 }, { "epoch": 0.56, "grad_norm": 1.5398979187011719, "learning_rate": 2.199042407660739e-05, "loss": 2.0448, "step": 6552 }, { "epoch": 0.56, "grad_norm": 1.3995730876922607, "learning_rate": 2.1973324213406293e-05, "loss": 1.9606, "step": 6556 }, { "epoch": 0.56, "grad_norm": 1.6123522520065308, "learning_rate": 2.19562243502052e-05, "loss": 1.9598, "step": 6560 }, { "epoch": 0.56, "grad_norm": 1.5927785634994507, "learning_rate": 2.1939124487004107e-05, "loss": 1.8914, "step": 6564 }, { "epoch": 0.56, "grad_norm": 1.3952325582504272, "learning_rate": 2.192202462380301e-05, "loss": 1.8675, "step": 6568 }, { "epoch": 0.56, "grad_norm": 1.780047059059143, "learning_rate": 2.1904924760601915e-05, "loss": 2.037, "step": 6572 }, { "epoch": 0.56, "grad_norm": 1.719232439994812, "learning_rate": 2.1887824897400823e-05, "loss": 1.9979, "step": 6576 }, { "epoch": 0.56, "grad_norm": 1.8734204769134521, "learning_rate": 2.1870725034199727e-05, "loss": 2.0388, "step": 6580 }, { "epoch": 0.56, "grad_norm": 1.5046261548995972, "learning_rate": 2.185362517099863e-05, "loss": 2.0383, "step": 6584 }, { "epoch": 0.56, "grad_norm": 1.5430729389190674, "learning_rate": 2.1836525307797538e-05, "loss": 2.1083, "step": 6588 }, { "epoch": 0.56, "grad_norm": 1.647208571434021, "learning_rate": 2.1819425444596445e-05, "loss": 2.0813, "step": 6592 }, { "epoch": 0.56, "grad_norm": 1.5349608659744263, "learning_rate": 2.180232558139535e-05, "loss": 1.9528, "step": 6596 }, { "epoch": 0.56, "grad_norm": 1.6204650402069092, "learning_rate": 2.1785225718194253e-05, "loss": 2.1957, "step": 6600 }, { "epoch": 0.56, "grad_norm": 1.6103018522262573, "learning_rate": 2.176812585499316e-05, "loss": 1.9506, "step": 6604 }, { "epoch": 0.56, "grad_norm": 2.1254968643188477, "learning_rate": 2.1751025991792068e-05, "loss": 2.1188, "step": 6608 }, { "epoch": 0.57, "grad_norm": 1.5318729877471924, "learning_rate": 2.1733926128590972e-05, "loss": 2.0858, "step": 6612 }, { "epoch": 0.57, "grad_norm": 1.4337228536605835, "learning_rate": 2.171682626538988e-05, "loss": 1.9164, "step": 6616 }, { "epoch": 0.57, "grad_norm": 1.4926187992095947, "learning_rate": 2.1699726402188783e-05, "loss": 1.9784, "step": 6620 }, { "epoch": 0.57, "grad_norm": 1.419592022895813, "learning_rate": 2.1682626538987687e-05, "loss": 1.9949, "step": 6624 }, { "epoch": 0.57, "grad_norm": 1.4496034383773804, "learning_rate": 2.1665526675786594e-05, "loss": 2.068, "step": 6628 }, { "epoch": 0.57, "grad_norm": 1.4800695180892944, "learning_rate": 2.1648426812585502e-05, "loss": 1.9682, "step": 6632 }, { "epoch": 0.57, "grad_norm": 1.7047966718673706, "learning_rate": 2.1631326949384406e-05, "loss": 2.0647, "step": 6636 }, { "epoch": 0.57, "grad_norm": 1.4624961614608765, "learning_rate": 2.161422708618331e-05, "loss": 1.9154, "step": 6640 }, { "epoch": 0.57, "grad_norm": 1.4518662691116333, "learning_rate": 2.1597127222982217e-05, "loss": 2.1044, "step": 6644 }, { "epoch": 0.57, "grad_norm": 1.488329529762268, "learning_rate": 2.158002735978112e-05, "loss": 2.0274, "step": 6648 }, { "epoch": 0.57, "grad_norm": 1.4353028535842896, "learning_rate": 2.156292749658003e-05, "loss": 1.9888, "step": 6652 }, { "epoch": 0.57, "grad_norm": 1.470017433166504, "learning_rate": 2.1545827633378936e-05, "loss": 1.9887, "step": 6656 }, { "epoch": 0.57, "grad_norm": 1.5084627866744995, "learning_rate": 2.152872777017784e-05, "loss": 1.9513, "step": 6660 }, { "epoch": 0.57, "grad_norm": 1.5296393632888794, "learning_rate": 2.1511627906976744e-05, "loss": 1.8803, "step": 6664 }, { "epoch": 0.57, "grad_norm": 1.597259283065796, "learning_rate": 2.149452804377565e-05, "loss": 1.9239, "step": 6668 }, { "epoch": 0.57, "grad_norm": 1.476731300354004, "learning_rate": 2.1477428180574558e-05, "loss": 2.0592, "step": 6672 }, { "epoch": 0.57, "grad_norm": 1.5087945461273193, "learning_rate": 2.1460328317373462e-05, "loss": 2.0161, "step": 6676 }, { "epoch": 0.57, "grad_norm": 1.6463593244552612, "learning_rate": 2.1443228454172366e-05, "loss": 2.0282, "step": 6680 }, { "epoch": 0.57, "grad_norm": 1.6463061571121216, "learning_rate": 2.1426128590971274e-05, "loss": 2.0664, "step": 6684 }, { "epoch": 0.57, "grad_norm": 1.5039496421813965, "learning_rate": 2.1409028727770177e-05, "loss": 2.0983, "step": 6688 }, { "epoch": 0.57, "grad_norm": 1.4754881858825684, "learning_rate": 2.1391928864569085e-05, "loss": 2.0178, "step": 6692 }, { "epoch": 0.57, "grad_norm": 1.7630116939544678, "learning_rate": 2.1374829001367992e-05, "loss": 2.0397, "step": 6696 }, { "epoch": 0.57, "grad_norm": 1.4631503820419312, "learning_rate": 2.1357729138166896e-05, "loss": 1.9825, "step": 6700 }, { "epoch": 0.57, "grad_norm": 1.5634981393814087, "learning_rate": 2.13406292749658e-05, "loss": 1.8953, "step": 6704 }, { "epoch": 0.57, "grad_norm": 1.5709517002105713, "learning_rate": 2.1323529411764707e-05, "loss": 1.9242, "step": 6708 }, { "epoch": 0.57, "grad_norm": 1.5792012214660645, "learning_rate": 2.130642954856361e-05, "loss": 2.2404, "step": 6712 }, { "epoch": 0.57, "grad_norm": 1.5689932107925415, "learning_rate": 2.128932968536252e-05, "loss": 2.1333, "step": 6716 }, { "epoch": 0.57, "grad_norm": 1.3986164331436157, "learning_rate": 2.1272229822161423e-05, "loss": 1.8481, "step": 6720 }, { "epoch": 0.57, "grad_norm": 1.62111496925354, "learning_rate": 2.125512995896033e-05, "loss": 2.057, "step": 6724 }, { "epoch": 0.58, "grad_norm": 1.6501810550689697, "learning_rate": 2.1238030095759234e-05, "loss": 2.0681, "step": 6728 }, { "epoch": 0.58, "grad_norm": 1.5174288749694824, "learning_rate": 2.1220930232558138e-05, "loss": 2.0297, "step": 6732 }, { "epoch": 0.58, "grad_norm": 1.5836937427520752, "learning_rate": 2.120383036935705e-05, "loss": 2.0201, "step": 6736 }, { "epoch": 0.58, "grad_norm": 1.6528754234313965, "learning_rate": 2.1186730506155953e-05, "loss": 2.0832, "step": 6740 }, { "epoch": 0.58, "grad_norm": 1.435103416442871, "learning_rate": 2.1169630642954857e-05, "loss": 1.9953, "step": 6744 }, { "epoch": 0.58, "grad_norm": 1.4534496068954468, "learning_rate": 2.1152530779753764e-05, "loss": 2.0343, "step": 6748 }, { "epoch": 0.58, "grad_norm": 1.5175713300704956, "learning_rate": 2.1135430916552668e-05, "loss": 1.9486, "step": 6752 }, { "epoch": 0.58, "grad_norm": 1.3781896829605103, "learning_rate": 2.1118331053351575e-05, "loss": 1.8642, "step": 6756 }, { "epoch": 0.58, "grad_norm": 1.5847692489624023, "learning_rate": 2.110123119015048e-05, "loss": 1.9693, "step": 6760 }, { "epoch": 0.58, "grad_norm": 1.9104549884796143, "learning_rate": 2.1084131326949386e-05, "loss": 2.0385, "step": 6764 }, { "epoch": 0.58, "grad_norm": 1.6398965120315552, "learning_rate": 2.106703146374829e-05, "loss": 2.0187, "step": 6768 }, { "epoch": 0.58, "grad_norm": 1.4324244260787964, "learning_rate": 2.1049931600547194e-05, "loss": 1.8963, "step": 6772 }, { "epoch": 0.58, "grad_norm": 1.57370126247406, "learning_rate": 2.1032831737346102e-05, "loss": 2.0443, "step": 6776 }, { "epoch": 0.58, "grad_norm": 1.39073646068573, "learning_rate": 2.101573187414501e-05, "loss": 1.9976, "step": 6780 }, { "epoch": 0.58, "grad_norm": 1.6951498985290527, "learning_rate": 2.0998632010943913e-05, "loss": 1.7654, "step": 6784 }, { "epoch": 0.58, "grad_norm": 1.5796420574188232, "learning_rate": 2.098153214774282e-05, "loss": 1.9724, "step": 6788 }, { "epoch": 0.58, "grad_norm": 1.536661148071289, "learning_rate": 2.0964432284541724e-05, "loss": 2.0424, "step": 6792 }, { "epoch": 0.58, "grad_norm": 1.6385177373886108, "learning_rate": 2.094733242134063e-05, "loss": 2.0934, "step": 6796 }, { "epoch": 0.58, "grad_norm": 1.5548115968704224, "learning_rate": 2.0930232558139536e-05, "loss": 1.8169, "step": 6800 }, { "epoch": 0.58, "grad_norm": 1.4583497047424316, "learning_rate": 2.0913132694938443e-05, "loss": 1.9631, "step": 6804 }, { "epoch": 0.58, "grad_norm": 1.6827950477600098, "learning_rate": 2.0896032831737347e-05, "loss": 1.8003, "step": 6808 }, { "epoch": 0.58, "grad_norm": 1.881792664527893, "learning_rate": 2.087893296853625e-05, "loss": 1.9482, "step": 6812 }, { "epoch": 0.58, "grad_norm": 1.5031013488769531, "learning_rate": 2.0861833105335158e-05, "loss": 1.888, "step": 6816 }, { "epoch": 0.58, "grad_norm": 1.7408212423324585, "learning_rate": 2.0844733242134066e-05, "loss": 2.1306, "step": 6820 }, { "epoch": 0.58, "grad_norm": 1.5273849964141846, "learning_rate": 2.082763337893297e-05, "loss": 1.9256, "step": 6824 }, { "epoch": 0.58, "grad_norm": 1.6354838609695435, "learning_rate": 2.0810533515731877e-05, "loss": 1.9018, "step": 6828 }, { "epoch": 0.58, "grad_norm": 1.5808496475219727, "learning_rate": 2.079343365253078e-05, "loss": 1.9239, "step": 6832 }, { "epoch": 0.58, "grad_norm": 1.5880577564239502, "learning_rate": 2.0776333789329685e-05, "loss": 1.9876, "step": 6836 }, { "epoch": 0.58, "grad_norm": 1.554107904434204, "learning_rate": 2.0759233926128592e-05, "loss": 2.1325, "step": 6840 }, { "epoch": 0.59, "grad_norm": 1.7010103464126587, "learning_rate": 2.07421340629275e-05, "loss": 2.1108, "step": 6844 }, { "epoch": 0.59, "grad_norm": 1.4548836946487427, "learning_rate": 2.0725034199726403e-05, "loss": 1.854, "step": 6848 }, { "epoch": 0.59, "grad_norm": 1.602513074874878, "learning_rate": 2.0707934336525307e-05, "loss": 1.9275, "step": 6852 }, { "epoch": 0.59, "grad_norm": 1.379732608795166, "learning_rate": 2.0690834473324215e-05, "loss": 1.8298, "step": 6856 }, { "epoch": 0.59, "grad_norm": 1.5978758335113525, "learning_rate": 2.067373461012312e-05, "loss": 2.0292, "step": 6860 }, { "epoch": 0.59, "grad_norm": 1.5919899940490723, "learning_rate": 2.0656634746922026e-05, "loss": 1.9795, "step": 6864 }, { "epoch": 0.59, "grad_norm": 1.4929299354553223, "learning_rate": 2.0639534883720933e-05, "loss": 2.0711, "step": 6868 }, { "epoch": 0.59, "grad_norm": 1.5734412670135498, "learning_rate": 2.0622435020519837e-05, "loss": 1.9286, "step": 6872 }, { "epoch": 0.59, "grad_norm": 1.3680405616760254, "learning_rate": 2.060533515731874e-05, "loss": 1.9525, "step": 6876 }, { "epoch": 0.59, "grad_norm": 1.4015724658966064, "learning_rate": 2.058823529411765e-05, "loss": 1.8651, "step": 6880 }, { "epoch": 0.59, "grad_norm": 1.4925768375396729, "learning_rate": 2.0571135430916556e-05, "loss": 1.8487, "step": 6884 }, { "epoch": 0.59, "grad_norm": 1.5962053537368774, "learning_rate": 2.055403556771546e-05, "loss": 2.0726, "step": 6888 }, { "epoch": 0.59, "grad_norm": 1.5657358169555664, "learning_rate": 2.0536935704514364e-05, "loss": 2.0781, "step": 6892 }, { "epoch": 0.59, "grad_norm": 1.4741593599319458, "learning_rate": 2.051983584131327e-05, "loss": 1.9721, "step": 6896 }, { "epoch": 0.59, "grad_norm": 1.5878019332885742, "learning_rate": 2.0502735978112175e-05, "loss": 1.9813, "step": 6900 }, { "epoch": 0.59, "grad_norm": 1.4257550239562988, "learning_rate": 2.048563611491108e-05, "loss": 1.9342, "step": 6904 }, { "epoch": 0.59, "grad_norm": 1.603327751159668, "learning_rate": 2.046853625170999e-05, "loss": 2.0587, "step": 6908 }, { "epoch": 0.59, "grad_norm": 1.4854941368103027, "learning_rate": 2.0451436388508894e-05, "loss": 1.8446, "step": 6912 }, { "epoch": 0.59, "grad_norm": 1.5847572088241577, "learning_rate": 2.0434336525307798e-05, "loss": 2.1653, "step": 6916 }, { "epoch": 0.59, "grad_norm": 1.638177514076233, "learning_rate": 2.0417236662106705e-05, "loss": 2.0012, "step": 6920 }, { "epoch": 0.59, "grad_norm": 1.627571702003479, "learning_rate": 2.040013679890561e-05, "loss": 1.9958, "step": 6924 }, { "epoch": 0.59, "grad_norm": 1.5089852809906006, "learning_rate": 2.0383036935704516e-05, "loss": 1.9816, "step": 6928 }, { "epoch": 0.59, "grad_norm": 1.6622666120529175, "learning_rate": 2.036593707250342e-05, "loss": 2.0482, "step": 6932 }, { "epoch": 0.59, "grad_norm": 1.5746067762374878, "learning_rate": 2.0348837209302328e-05, "loss": 2.0877, "step": 6936 }, { "epoch": 0.59, "grad_norm": 1.4563333988189697, "learning_rate": 2.033173734610123e-05, "loss": 2.0376, "step": 6940 }, { "epoch": 0.59, "grad_norm": 1.417210340499878, "learning_rate": 2.0314637482900136e-05, "loss": 1.8537, "step": 6944 }, { "epoch": 0.59, "grad_norm": 1.644472360610962, "learning_rate": 2.0297537619699043e-05, "loss": 2.0294, "step": 6948 }, { "epoch": 0.59, "grad_norm": 1.462584137916565, "learning_rate": 2.028043775649795e-05, "loss": 2.0171, "step": 6952 }, { "epoch": 0.59, "grad_norm": 1.6051154136657715, "learning_rate": 2.0263337893296854e-05, "loss": 2.0095, "step": 6956 }, { "epoch": 0.6, "grad_norm": 1.6152054071426392, "learning_rate": 2.0246238030095758e-05, "loss": 1.9219, "step": 6960 }, { "epoch": 0.6, "grad_norm": 1.3505607843399048, "learning_rate": 2.0229138166894666e-05, "loss": 2.0034, "step": 6964 }, { "epoch": 0.6, "grad_norm": 1.522525429725647, "learning_rate": 2.021203830369357e-05, "loss": 1.8819, "step": 6968 }, { "epoch": 0.6, "grad_norm": 1.6121808290481567, "learning_rate": 2.0194938440492477e-05, "loss": 2.2193, "step": 6972 }, { "epoch": 0.6, "grad_norm": 1.8233622312545776, "learning_rate": 2.0177838577291384e-05, "loss": 1.8937, "step": 6976 }, { "epoch": 0.6, "grad_norm": 1.6225485801696777, "learning_rate": 2.0160738714090288e-05, "loss": 1.9533, "step": 6980 }, { "epoch": 0.6, "grad_norm": 1.5252420902252197, "learning_rate": 2.0143638850889192e-05, "loss": 1.9956, "step": 6984 }, { "epoch": 0.6, "grad_norm": 1.5580767393112183, "learning_rate": 2.01265389876881e-05, "loss": 2.0131, "step": 6988 }, { "epoch": 0.6, "grad_norm": 1.5372722148895264, "learning_rate": 2.0109439124487007e-05, "loss": 1.9964, "step": 6992 }, { "epoch": 0.6, "grad_norm": 1.500069499015808, "learning_rate": 2.009233926128591e-05, "loss": 2.0251, "step": 6996 }, { "epoch": 0.6, "grad_norm": 1.6729307174682617, "learning_rate": 2.0075239398084815e-05, "loss": 1.9491, "step": 7000 }, { "epoch": 0.6, "grad_norm": 1.8070749044418335, "learning_rate": 2.0058139534883722e-05, "loss": 1.9621, "step": 7004 }, { "epoch": 0.6, "grad_norm": 1.5209249258041382, "learning_rate": 2.0041039671682626e-05, "loss": 1.8494, "step": 7008 }, { "epoch": 0.6, "grad_norm": 1.4102156162261963, "learning_rate": 2.0023939808481533e-05, "loss": 1.8024, "step": 7012 }, { "epoch": 0.6, "grad_norm": 1.6024938821792603, "learning_rate": 2.000683994528044e-05, "loss": 2.0584, "step": 7016 }, { "epoch": 0.6, "grad_norm": 1.5164461135864258, "learning_rate": 1.9989740082079345e-05, "loss": 1.9705, "step": 7020 }, { "epoch": 0.6, "grad_norm": 1.4683334827423096, "learning_rate": 1.997264021887825e-05, "loss": 1.8526, "step": 7024 }, { "epoch": 0.6, "grad_norm": 1.5955207347869873, "learning_rate": 1.9955540355677156e-05, "loss": 2.0018, "step": 7028 }, { "epoch": 0.6, "grad_norm": 1.6623783111572266, "learning_rate": 1.993844049247606e-05, "loss": 2.1312, "step": 7032 }, { "epoch": 0.6, "grad_norm": 1.660420298576355, "learning_rate": 1.9921340629274967e-05, "loss": 1.8685, "step": 7036 }, { "epoch": 0.6, "grad_norm": 1.7066700458526611, "learning_rate": 1.990424076607387e-05, "loss": 1.9847, "step": 7040 }, { "epoch": 0.6, "grad_norm": 1.5725295543670654, "learning_rate": 1.988714090287278e-05, "loss": 1.9324, "step": 7044 }, { "epoch": 0.6, "grad_norm": 1.5613770484924316, "learning_rate": 1.9870041039671683e-05, "loss": 2.0648, "step": 7048 }, { "epoch": 0.6, "grad_norm": 1.4569567441940308, "learning_rate": 1.9852941176470586e-05, "loss": 1.8675, "step": 7052 }, { "epoch": 0.6, "grad_norm": 1.610832691192627, "learning_rate": 1.9835841313269497e-05, "loss": 2.0513, "step": 7056 }, { "epoch": 0.6, "grad_norm": 1.7159889936447144, "learning_rate": 1.98187414500684e-05, "loss": 1.8884, "step": 7060 }, { "epoch": 0.6, "grad_norm": 1.6798243522644043, "learning_rate": 1.9801641586867305e-05, "loss": 1.9021, "step": 7064 }, { "epoch": 0.6, "grad_norm": 1.87516450881958, "learning_rate": 1.9784541723666212e-05, "loss": 2.1314, "step": 7068 }, { "epoch": 0.6, "grad_norm": 1.465100646018982, "learning_rate": 1.9767441860465116e-05, "loss": 1.9359, "step": 7072 }, { "epoch": 0.6, "grad_norm": 1.7637323141098022, "learning_rate": 1.9750341997264024e-05, "loss": 2.0628, "step": 7076 }, { "epoch": 0.61, "grad_norm": 1.4772495031356812, "learning_rate": 1.9733242134062928e-05, "loss": 2.0258, "step": 7080 }, { "epoch": 0.61, "grad_norm": 1.5895733833312988, "learning_rate": 1.9716142270861835e-05, "loss": 1.8629, "step": 7084 }, { "epoch": 0.61, "grad_norm": 1.5556437969207764, "learning_rate": 1.969904240766074e-05, "loss": 1.9945, "step": 7088 }, { "epoch": 0.61, "grad_norm": 1.502622365951538, "learning_rate": 1.9681942544459643e-05, "loss": 2.0398, "step": 7092 }, { "epoch": 0.61, "grad_norm": 1.5926116704940796, "learning_rate": 1.966484268125855e-05, "loss": 1.8625, "step": 7096 }, { "epoch": 0.61, "grad_norm": 1.5113784074783325, "learning_rate": 1.9647742818057458e-05, "loss": 2.0816, "step": 7100 }, { "epoch": 0.61, "grad_norm": 1.5072494745254517, "learning_rate": 1.963064295485636e-05, "loss": 1.9681, "step": 7104 }, { "epoch": 0.61, "grad_norm": 1.6104905605316162, "learning_rate": 1.961354309165527e-05, "loss": 2.1155, "step": 7108 }, { "epoch": 0.61, "grad_norm": 1.6391701698303223, "learning_rate": 1.9596443228454173e-05, "loss": 1.9331, "step": 7112 }, { "epoch": 0.61, "grad_norm": 1.5969183444976807, "learning_rate": 1.9579343365253077e-05, "loss": 1.9331, "step": 7116 }, { "epoch": 0.61, "grad_norm": 1.6461735963821411, "learning_rate": 1.9562243502051984e-05, "loss": 2.115, "step": 7120 }, { "epoch": 0.61, "grad_norm": 1.572561264038086, "learning_rate": 1.954514363885089e-05, "loss": 2.0272, "step": 7124 }, { "epoch": 0.61, "grad_norm": 1.5880368947982788, "learning_rate": 1.9528043775649796e-05, "loss": 2.0262, "step": 7128 }, { "epoch": 0.61, "grad_norm": 1.697801947593689, "learning_rate": 1.95109439124487e-05, "loss": 2.1504, "step": 7132 }, { "epoch": 0.61, "grad_norm": 1.5090667009353638, "learning_rate": 1.9493844049247607e-05, "loss": 2.0096, "step": 7136 }, { "epoch": 0.61, "grad_norm": 1.6617227792739868, "learning_rate": 1.9476744186046514e-05, "loss": 1.9429, "step": 7140 }, { "epoch": 0.61, "grad_norm": 1.489587426185608, "learning_rate": 1.9459644322845418e-05, "loss": 1.9735, "step": 7144 }, { "epoch": 0.61, "grad_norm": 1.4970664978027344, "learning_rate": 1.9442544459644325e-05, "loss": 2.0434, "step": 7148 }, { "epoch": 0.61, "grad_norm": 1.7819324731826782, "learning_rate": 1.942544459644323e-05, "loss": 2.0847, "step": 7152 }, { "epoch": 0.61, "grad_norm": 1.572629690170288, "learning_rate": 1.9408344733242133e-05, "loss": 2.0716, "step": 7156 }, { "epoch": 0.61, "grad_norm": 1.6527976989746094, "learning_rate": 1.939124487004104e-05, "loss": 1.9941, "step": 7160 }, { "epoch": 0.61, "grad_norm": 1.7278069257736206, "learning_rate": 1.9374145006839948e-05, "loss": 1.9261, "step": 7164 }, { "epoch": 0.61, "grad_norm": 1.5078670978546143, "learning_rate": 1.9357045143638852e-05, "loss": 1.9513, "step": 7168 }, { "epoch": 0.61, "grad_norm": 1.6161733865737915, "learning_rate": 1.9339945280437756e-05, "loss": 1.9143, "step": 7172 }, { "epoch": 0.61, "grad_norm": 1.7204173803329468, "learning_rate": 1.9322845417236663e-05, "loss": 1.8862, "step": 7176 }, { "epoch": 0.61, "grad_norm": 1.607851266860962, "learning_rate": 1.9305745554035567e-05, "loss": 1.98, "step": 7180 }, { "epoch": 0.61, "grad_norm": 1.5279322862625122, "learning_rate": 1.9288645690834475e-05, "loss": 1.8538, "step": 7184 }, { "epoch": 0.61, "grad_norm": 1.5147992372512817, "learning_rate": 1.9271545827633382e-05, "loss": 1.9975, "step": 7188 }, { "epoch": 0.61, "grad_norm": 1.6696971654891968, "learning_rate": 1.9254445964432286e-05, "loss": 1.8899, "step": 7192 }, { "epoch": 0.62, "grad_norm": 1.3951395750045776, "learning_rate": 1.923734610123119e-05, "loss": 1.9392, "step": 7196 }, { "epoch": 0.62, "grad_norm": 1.5910269021987915, "learning_rate": 1.9220246238030097e-05, "loss": 2.082, "step": 7200 }, { "epoch": 0.62, "grad_norm": 1.5352290868759155, "learning_rate": 1.9203146374829005e-05, "loss": 2.0371, "step": 7204 }, { "epoch": 0.62, "grad_norm": 1.521520972251892, "learning_rate": 1.918604651162791e-05, "loss": 1.9586, "step": 7208 }, { "epoch": 0.62, "grad_norm": 1.501772403717041, "learning_rate": 1.9168946648426812e-05, "loss": 1.982, "step": 7212 }, { "epoch": 0.62, "grad_norm": 1.548953890800476, "learning_rate": 1.915184678522572e-05, "loss": 2.0191, "step": 7216 }, { "epoch": 0.62, "grad_norm": 1.6001508235931396, "learning_rate": 1.9134746922024624e-05, "loss": 2.0142, "step": 7220 }, { "epoch": 0.62, "grad_norm": 1.6897025108337402, "learning_rate": 1.9117647058823528e-05, "loss": 1.9539, "step": 7224 }, { "epoch": 0.62, "grad_norm": 1.6484875679016113, "learning_rate": 1.910054719562244e-05, "loss": 1.9652, "step": 7228 }, { "epoch": 0.62, "grad_norm": 1.504267692565918, "learning_rate": 1.9083447332421342e-05, "loss": 1.8661, "step": 7232 }, { "epoch": 0.62, "grad_norm": 1.442720890045166, "learning_rate": 1.9066347469220246e-05, "loss": 1.849, "step": 7236 }, { "epoch": 0.62, "grad_norm": 1.3908931016921997, "learning_rate": 1.9049247606019154e-05, "loss": 2.0333, "step": 7240 }, { "epoch": 0.62, "grad_norm": 1.7459537982940674, "learning_rate": 1.9032147742818058e-05, "loss": 1.98, "step": 7244 }, { "epoch": 0.62, "grad_norm": 1.5512782335281372, "learning_rate": 1.9015047879616965e-05, "loss": 2.1063, "step": 7248 }, { "epoch": 0.62, "grad_norm": 1.5178790092468262, "learning_rate": 1.899794801641587e-05, "loss": 2.0102, "step": 7252 }, { "epoch": 0.62, "grad_norm": 1.7210770845413208, "learning_rate": 1.8980848153214776e-05, "loss": 1.9342, "step": 7256 }, { "epoch": 0.62, "grad_norm": 1.472294569015503, "learning_rate": 1.896374829001368e-05, "loss": 2.0667, "step": 7260 }, { "epoch": 0.62, "grad_norm": 1.5506442785263062, "learning_rate": 1.8946648426812584e-05, "loss": 2.2188, "step": 7264 }, { "epoch": 0.62, "grad_norm": 1.6018651723861694, "learning_rate": 1.8929548563611495e-05, "loss": 1.9151, "step": 7268 }, { "epoch": 0.62, "grad_norm": 1.6608229875564575, "learning_rate": 1.89124487004104e-05, "loss": 1.8974, "step": 7272 }, { "epoch": 0.62, "grad_norm": 1.580942153930664, "learning_rate": 1.8895348837209303e-05, "loss": 1.9818, "step": 7276 }, { "epoch": 0.62, "grad_norm": 1.5641908645629883, "learning_rate": 1.887824897400821e-05, "loss": 1.8878, "step": 7280 }, { "epoch": 0.62, "grad_norm": 1.6174907684326172, "learning_rate": 1.8861149110807114e-05, "loss": 2.0124, "step": 7284 }, { "epoch": 0.62, "grad_norm": 1.6148037910461426, "learning_rate": 1.8844049247606018e-05, "loss": 1.902, "step": 7288 }, { "epoch": 0.62, "grad_norm": 1.6356844902038574, "learning_rate": 1.8826949384404925e-05, "loss": 2.0167, "step": 7292 }, { "epoch": 0.62, "grad_norm": 1.441463589668274, "learning_rate": 1.8809849521203833e-05, "loss": 1.8633, "step": 7296 }, { "epoch": 0.62, "grad_norm": 1.5125188827514648, "learning_rate": 1.8792749658002737e-05, "loss": 1.7871, "step": 7300 }, { "epoch": 0.62, "grad_norm": 1.4316954612731934, "learning_rate": 1.877564979480164e-05, "loss": 2.0366, "step": 7304 }, { "epoch": 0.62, "grad_norm": 1.7826207876205444, "learning_rate": 1.8758549931600548e-05, "loss": 1.9861, "step": 7308 }, { "epoch": 0.63, "grad_norm": 1.7452281713485718, "learning_rate": 1.8741450068399455e-05, "loss": 2.1054, "step": 7312 }, { "epoch": 0.63, "grad_norm": 1.5087822675704956, "learning_rate": 1.872435020519836e-05, "loss": 1.9194, "step": 7316 }, { "epoch": 0.63, "grad_norm": 1.560201644897461, "learning_rate": 1.8707250341997263e-05, "loss": 1.983, "step": 7320 }, { "epoch": 0.63, "grad_norm": 1.5828901529312134, "learning_rate": 1.869015047879617e-05, "loss": 2.0051, "step": 7324 }, { "epoch": 0.63, "grad_norm": 1.5497353076934814, "learning_rate": 1.8673050615595075e-05, "loss": 2.0197, "step": 7328 }, { "epoch": 0.63, "grad_norm": 1.5281920433044434, "learning_rate": 1.8655950752393982e-05, "loss": 1.9502, "step": 7332 }, { "epoch": 0.63, "grad_norm": 1.6356514692306519, "learning_rate": 1.863885088919289e-05, "loss": 2.069, "step": 7336 }, { "epoch": 0.63, "grad_norm": 1.4687514305114746, "learning_rate": 1.8621751025991793e-05, "loss": 1.931, "step": 7340 }, { "epoch": 0.63, "grad_norm": 1.604845643043518, "learning_rate": 1.8604651162790697e-05, "loss": 2.0305, "step": 7344 }, { "epoch": 0.63, "grad_norm": 1.5669852495193481, "learning_rate": 1.8587551299589605e-05, "loss": 2.0094, "step": 7348 }, { "epoch": 0.63, "grad_norm": 1.586909532546997, "learning_rate": 1.857045143638851e-05, "loss": 2.0053, "step": 7352 }, { "epoch": 0.63, "grad_norm": 1.5457959175109863, "learning_rate": 1.8553351573187416e-05, "loss": 1.9575, "step": 7356 }, { "epoch": 0.63, "grad_norm": 1.4828368425369263, "learning_rate": 1.853625170998632e-05, "loss": 1.9047, "step": 7360 }, { "epoch": 0.63, "grad_norm": 1.5736109018325806, "learning_rate": 1.8519151846785227e-05, "loss": 1.93, "step": 7364 }, { "epoch": 0.63, "grad_norm": 1.7825120687484741, "learning_rate": 1.850205198358413e-05, "loss": 1.9667, "step": 7368 }, { "epoch": 0.63, "grad_norm": 1.529508113861084, "learning_rate": 1.8484952120383035e-05, "loss": 1.8791, "step": 7372 }, { "epoch": 0.63, "grad_norm": 1.5138580799102783, "learning_rate": 1.8467852257181946e-05, "loss": 1.932, "step": 7376 }, { "epoch": 0.63, "grad_norm": 1.619393229484558, "learning_rate": 1.845075239398085e-05, "loss": 1.9461, "step": 7380 }, { "epoch": 0.63, "grad_norm": 1.4028980731964111, "learning_rate": 1.8433652530779754e-05, "loss": 1.9428, "step": 7384 }, { "epoch": 0.63, "grad_norm": 1.631604552268982, "learning_rate": 1.841655266757866e-05, "loss": 1.9728, "step": 7388 }, { "epoch": 0.63, "grad_norm": 1.4649015665054321, "learning_rate": 1.8399452804377565e-05, "loss": 1.8762, "step": 7392 }, { "epoch": 0.63, "grad_norm": 1.48709237575531, "learning_rate": 1.8382352941176472e-05, "loss": 1.9312, "step": 7396 }, { "epoch": 0.63, "grad_norm": 1.6089003086090088, "learning_rate": 1.8365253077975376e-05, "loss": 1.8671, "step": 7400 }, { "epoch": 0.63, "grad_norm": 1.723889708518982, "learning_rate": 1.8348153214774284e-05, "loss": 1.9107, "step": 7404 }, { "epoch": 0.63, "grad_norm": 1.5013092756271362, "learning_rate": 1.8331053351573188e-05, "loss": 1.9322, "step": 7408 }, { "epoch": 0.63, "grad_norm": 1.8294460773468018, "learning_rate": 1.831395348837209e-05, "loss": 2.1004, "step": 7412 }, { "epoch": 0.63, "grad_norm": 1.4940433502197266, "learning_rate": 1.8296853625171e-05, "loss": 1.882, "step": 7416 }, { "epoch": 0.63, "grad_norm": 1.6243213415145874, "learning_rate": 1.8279753761969906e-05, "loss": 2.1548, "step": 7420 }, { "epoch": 0.63, "grad_norm": 1.934673547744751, "learning_rate": 1.826265389876881e-05, "loss": 1.9928, "step": 7424 }, { "epoch": 0.64, "grad_norm": 1.7047098875045776, "learning_rate": 1.8245554035567718e-05, "loss": 1.9881, "step": 7428 }, { "epoch": 0.64, "grad_norm": 1.5831687450408936, "learning_rate": 1.822845417236662e-05, "loss": 1.9889, "step": 7432 }, { "epoch": 0.64, "grad_norm": 1.6739925146102905, "learning_rate": 1.8211354309165525e-05, "loss": 1.9835, "step": 7436 }, { "epoch": 0.64, "grad_norm": 1.4848778247833252, "learning_rate": 1.8194254445964433e-05, "loss": 1.9711, "step": 7440 }, { "epoch": 0.64, "grad_norm": 1.499522089958191, "learning_rate": 1.817715458276334e-05, "loss": 1.9454, "step": 7444 }, { "epoch": 0.64, "grad_norm": 1.6421815156936646, "learning_rate": 1.8160054719562244e-05, "loss": 1.9402, "step": 7448 }, { "epoch": 0.64, "grad_norm": 1.7279685735702515, "learning_rate": 1.8142954856361148e-05, "loss": 1.7985, "step": 7452 }, { "epoch": 0.64, "grad_norm": 1.6793206930160522, "learning_rate": 1.8125854993160055e-05, "loss": 2.0187, "step": 7456 }, { "epoch": 0.64, "grad_norm": 1.4948447942733765, "learning_rate": 1.8108755129958963e-05, "loss": 1.9569, "step": 7460 }, { "epoch": 0.64, "grad_norm": 1.5078566074371338, "learning_rate": 1.8091655266757867e-05, "loss": 1.8296, "step": 7464 }, { "epoch": 0.64, "grad_norm": 1.7886958122253418, "learning_rate": 1.8074555403556774e-05, "loss": 2.0134, "step": 7468 }, { "epoch": 0.64, "grad_norm": 1.4695848226547241, "learning_rate": 1.8057455540355678e-05, "loss": 1.9271, "step": 7472 }, { "epoch": 0.64, "grad_norm": 1.4936541318893433, "learning_rate": 1.8040355677154582e-05, "loss": 1.9308, "step": 7476 }, { "epoch": 0.64, "grad_norm": 1.5521645545959473, "learning_rate": 1.802325581395349e-05, "loss": 1.8857, "step": 7480 }, { "epoch": 0.64, "grad_norm": 1.5674141645431519, "learning_rate": 1.8006155950752397e-05, "loss": 1.9064, "step": 7484 }, { "epoch": 0.64, "grad_norm": 1.6228454113006592, "learning_rate": 1.79890560875513e-05, "loss": 1.8739, "step": 7488 }, { "epoch": 0.64, "grad_norm": 1.4367411136627197, "learning_rate": 1.7971956224350205e-05, "loss": 1.9576, "step": 7492 }, { "epoch": 0.64, "grad_norm": 1.5149922370910645, "learning_rate": 1.7954856361149112e-05, "loss": 1.9571, "step": 7496 }, { "epoch": 0.64, "grad_norm": 1.6180402040481567, "learning_rate": 1.7937756497948016e-05, "loss": 1.8322, "step": 7500 }, { "epoch": 0.64, "grad_norm": 1.410872459411621, "learning_rate": 1.7920656634746923e-05, "loss": 1.8944, "step": 7504 }, { "epoch": 0.64, "grad_norm": 1.4997954368591309, "learning_rate": 1.790355677154583e-05, "loss": 2.014, "step": 7508 }, { "epoch": 0.64, "grad_norm": 1.4796793460845947, "learning_rate": 1.7886456908344735e-05, "loss": 1.908, "step": 7512 }, { "epoch": 0.64, "grad_norm": 1.6083449125289917, "learning_rate": 1.786935704514364e-05, "loss": 1.9159, "step": 7516 }, { "epoch": 0.64, "grad_norm": 1.6810011863708496, "learning_rate": 1.7852257181942546e-05, "loss": 1.9426, "step": 7520 }, { "epoch": 0.64, "grad_norm": 1.5123205184936523, "learning_rate": 1.7835157318741453e-05, "loss": 1.8627, "step": 7524 }, { "epoch": 0.64, "grad_norm": 1.4853988885879517, "learning_rate": 1.7818057455540357e-05, "loss": 1.9814, "step": 7528 }, { "epoch": 0.64, "grad_norm": 1.720802903175354, "learning_rate": 1.780095759233926e-05, "loss": 1.9168, "step": 7532 }, { "epoch": 0.64, "grad_norm": 1.6009372472763062, "learning_rate": 1.778385772913817e-05, "loss": 1.9071, "step": 7536 }, { "epoch": 0.64, "grad_norm": 2.1640713214874268, "learning_rate": 1.7766757865937072e-05, "loss": 1.9403, "step": 7540 }, { "epoch": 0.65, "grad_norm": 1.9134138822555542, "learning_rate": 1.7749658002735976e-05, "loss": 2.013, "step": 7544 }, { "epoch": 0.65, "grad_norm": 1.6833550930023193, "learning_rate": 1.7732558139534887e-05, "loss": 2.0307, "step": 7548 }, { "epoch": 0.65, "grad_norm": 1.643018364906311, "learning_rate": 1.771545827633379e-05, "loss": 1.8817, "step": 7552 }, { "epoch": 0.65, "grad_norm": 1.5238628387451172, "learning_rate": 1.7698358413132695e-05, "loss": 2.0012, "step": 7556 }, { "epoch": 0.65, "grad_norm": 1.612910270690918, "learning_rate": 1.7681258549931602e-05, "loss": 1.9054, "step": 7560 }, { "epoch": 0.65, "grad_norm": 1.4849001169204712, "learning_rate": 1.7664158686730506e-05, "loss": 1.8482, "step": 7564 }, { "epoch": 0.65, "grad_norm": 1.7270807027816772, "learning_rate": 1.7647058823529414e-05, "loss": 2.0145, "step": 7568 }, { "epoch": 0.65, "grad_norm": 1.676148533821106, "learning_rate": 1.7629958960328318e-05, "loss": 2.0066, "step": 7572 }, { "epoch": 0.65, "grad_norm": 1.3805028200149536, "learning_rate": 1.7612859097127225e-05, "loss": 1.813, "step": 7576 }, { "epoch": 0.65, "grad_norm": 1.528660774230957, "learning_rate": 1.759575923392613e-05, "loss": 1.9291, "step": 7580 }, { "epoch": 0.65, "grad_norm": 1.623838186264038, "learning_rate": 1.7578659370725033e-05, "loss": 1.9315, "step": 7584 }, { "epoch": 0.65, "grad_norm": 1.4332454204559326, "learning_rate": 1.7561559507523944e-05, "loss": 1.9888, "step": 7588 }, { "epoch": 0.65, "grad_norm": 1.5631705522537231, "learning_rate": 1.7544459644322848e-05, "loss": 2.1324, "step": 7592 }, { "epoch": 0.65, "grad_norm": 2.10516357421875, "learning_rate": 1.752735978112175e-05, "loss": 1.9798, "step": 7596 }, { "epoch": 0.65, "grad_norm": 1.6901365518569946, "learning_rate": 1.751025991792066e-05, "loss": 1.9679, "step": 7600 }, { "epoch": 0.65, "grad_norm": 1.9404395818710327, "learning_rate": 1.7493160054719563e-05, "loss": 2.0198, "step": 7604 }, { "epoch": 0.65, "grad_norm": 1.5954433679580688, "learning_rate": 1.7476060191518467e-05, "loss": 1.887, "step": 7608 }, { "epoch": 0.65, "grad_norm": 1.4495494365692139, "learning_rate": 1.7458960328317374e-05, "loss": 2.0086, "step": 7612 }, { "epoch": 0.65, "grad_norm": 1.513266682624817, "learning_rate": 1.744186046511628e-05, "loss": 1.9934, "step": 7616 }, { "epoch": 0.65, "grad_norm": 1.6928620338439941, "learning_rate": 1.7424760601915185e-05, "loss": 1.9103, "step": 7620 }, { "epoch": 0.65, "grad_norm": 1.7382714748382568, "learning_rate": 1.740766073871409e-05, "loss": 2.015, "step": 7624 }, { "epoch": 0.65, "grad_norm": 1.4485249519348145, "learning_rate": 1.7390560875512997e-05, "loss": 2.076, "step": 7628 }, { "epoch": 0.65, "grad_norm": 1.5897313356399536, "learning_rate": 1.7373461012311904e-05, "loss": 1.8503, "step": 7632 }, { "epoch": 0.65, "grad_norm": 1.6106144189834595, "learning_rate": 1.7356361149110808e-05, "loss": 1.7611, "step": 7636 }, { "epoch": 0.65, "grad_norm": 1.6251461505889893, "learning_rate": 1.7339261285909715e-05, "loss": 1.8426, "step": 7640 }, { "epoch": 0.65, "grad_norm": 1.6952441930770874, "learning_rate": 1.732216142270862e-05, "loss": 1.9492, "step": 7644 }, { "epoch": 0.65, "grad_norm": 1.6333529949188232, "learning_rate": 1.7305061559507523e-05, "loss": 1.9824, "step": 7648 }, { "epoch": 0.65, "grad_norm": 1.5729711055755615, "learning_rate": 1.728796169630643e-05, "loss": 1.9888, "step": 7652 }, { "epoch": 0.65, "grad_norm": 1.5947141647338867, "learning_rate": 1.7270861833105338e-05, "loss": 1.8368, "step": 7656 }, { "epoch": 0.65, "grad_norm": 1.5854895114898682, "learning_rate": 1.7253761969904242e-05, "loss": 1.9155, "step": 7660 }, { "epoch": 0.66, "grad_norm": 1.6690934896469116, "learning_rate": 1.7236662106703146e-05, "loss": 1.9139, "step": 7664 }, { "epoch": 0.66, "grad_norm": 1.5118485689163208, "learning_rate": 1.7219562243502053e-05, "loss": 1.8872, "step": 7668 }, { "epoch": 0.66, "grad_norm": 1.6435084342956543, "learning_rate": 1.7202462380300957e-05, "loss": 2.1278, "step": 7672 }, { "epoch": 0.66, "grad_norm": 1.6225100755691528, "learning_rate": 1.7185362517099864e-05, "loss": 1.9057, "step": 7676 }, { "epoch": 0.66, "grad_norm": 1.603485345840454, "learning_rate": 1.716826265389877e-05, "loss": 1.7631, "step": 7680 }, { "epoch": 0.66, "grad_norm": 1.588488221168518, "learning_rate": 1.7151162790697676e-05, "loss": 2.0738, "step": 7684 }, { "epoch": 0.66, "grad_norm": 1.5990886688232422, "learning_rate": 1.713406292749658e-05, "loss": 1.9249, "step": 7688 }, { "epoch": 0.66, "grad_norm": 1.5695997476577759, "learning_rate": 1.7116963064295484e-05, "loss": 1.8807, "step": 7692 }, { "epoch": 0.66, "grad_norm": 1.8457928895950317, "learning_rate": 1.7099863201094394e-05, "loss": 2.0667, "step": 7696 }, { "epoch": 0.66, "grad_norm": 1.7506129741668701, "learning_rate": 1.70827633378933e-05, "loss": 1.9462, "step": 7700 }, { "epoch": 0.66, "grad_norm": 1.5219964981079102, "learning_rate": 1.7065663474692202e-05, "loss": 1.8061, "step": 7704 }, { "epoch": 0.66, "grad_norm": 1.5406177043914795, "learning_rate": 1.704856361149111e-05, "loss": 1.9219, "step": 7708 }, { "epoch": 0.66, "grad_norm": 1.5064541101455688, "learning_rate": 1.7031463748290014e-05, "loss": 1.9149, "step": 7712 }, { "epoch": 0.66, "grad_norm": 1.7326499223709106, "learning_rate": 1.701436388508892e-05, "loss": 1.8593, "step": 7716 }, { "epoch": 0.66, "grad_norm": 1.6578881740570068, "learning_rate": 1.6997264021887825e-05, "loss": 2.0094, "step": 7720 }, { "epoch": 0.66, "grad_norm": 1.724426031112671, "learning_rate": 1.6980164158686732e-05, "loss": 2.0086, "step": 7724 }, { "epoch": 0.66, "grad_norm": 1.7272109985351562, "learning_rate": 1.6963064295485636e-05, "loss": 2.0135, "step": 7728 }, { "epoch": 0.66, "grad_norm": 1.8231357336044312, "learning_rate": 1.694596443228454e-05, "loss": 1.9537, "step": 7732 }, { "epoch": 0.66, "grad_norm": 1.6103249788284302, "learning_rate": 1.6928864569083448e-05, "loss": 2.0054, "step": 7736 }, { "epoch": 0.66, "grad_norm": 1.507234811782837, "learning_rate": 1.6911764705882355e-05, "loss": 2.0489, "step": 7740 }, { "epoch": 0.66, "grad_norm": 1.49418306350708, "learning_rate": 1.689466484268126e-05, "loss": 1.8426, "step": 7744 }, { "epoch": 0.66, "grad_norm": 1.828412652015686, "learning_rate": 1.6877564979480166e-05, "loss": 2.0645, "step": 7748 }, { "epoch": 0.66, "grad_norm": 1.5563526153564453, "learning_rate": 1.686046511627907e-05, "loss": 1.9228, "step": 7752 }, { "epoch": 0.66, "grad_norm": 1.680085301399231, "learning_rate": 1.6843365253077974e-05, "loss": 2.0503, "step": 7756 }, { "epoch": 0.66, "grad_norm": 1.8583743572235107, "learning_rate": 1.682626538987688e-05, "loss": 1.9134, "step": 7760 }, { "epoch": 0.66, "grad_norm": 1.5803444385528564, "learning_rate": 1.680916552667579e-05, "loss": 1.8787, "step": 7764 }, { "epoch": 0.66, "grad_norm": 1.4421805143356323, "learning_rate": 1.6792065663474693e-05, "loss": 1.7833, "step": 7768 }, { "epoch": 0.66, "grad_norm": 1.6548757553100586, "learning_rate": 1.6774965800273597e-05, "loss": 1.9397, "step": 7772 }, { "epoch": 0.66, "grad_norm": 1.6442538499832153, "learning_rate": 1.6757865937072504e-05, "loss": 2.1046, "step": 7776 }, { "epoch": 0.67, "grad_norm": 1.643537163734436, "learning_rate": 1.674076607387141e-05, "loss": 1.9088, "step": 7780 }, { "epoch": 0.67, "grad_norm": 1.5146218538284302, "learning_rate": 1.6723666210670315e-05, "loss": 1.9828, "step": 7784 }, { "epoch": 0.67, "grad_norm": 1.689788579940796, "learning_rate": 1.6706566347469223e-05, "loss": 1.9587, "step": 7788 }, { "epoch": 0.67, "grad_norm": 1.498406171798706, "learning_rate": 1.6689466484268127e-05, "loss": 1.8098, "step": 7792 }, { "epoch": 0.67, "grad_norm": 1.6559585332870483, "learning_rate": 1.667236662106703e-05, "loss": 2.0373, "step": 7796 }, { "epoch": 0.67, "grad_norm": 1.675614595413208, "learning_rate": 1.6655266757865938e-05, "loss": 1.9298, "step": 7800 }, { "epoch": 0.67, "grad_norm": 1.6751221418380737, "learning_rate": 1.6638166894664845e-05, "loss": 1.9871, "step": 7804 }, { "epoch": 0.67, "grad_norm": 1.4255478382110596, "learning_rate": 1.662106703146375e-05, "loss": 1.821, "step": 7808 }, { "epoch": 0.67, "grad_norm": 1.5127853155136108, "learning_rate": 1.6603967168262653e-05, "loss": 2.0976, "step": 7812 }, { "epoch": 0.67, "grad_norm": 1.5522574186325073, "learning_rate": 1.658686730506156e-05, "loss": 2.027, "step": 7816 }, { "epoch": 0.67, "grad_norm": 1.492023229598999, "learning_rate": 1.6569767441860464e-05, "loss": 1.9444, "step": 7820 }, { "epoch": 0.67, "grad_norm": 1.5379337072372437, "learning_rate": 1.6552667578659372e-05, "loss": 1.9637, "step": 7824 }, { "epoch": 0.67, "grad_norm": 1.615531086921692, "learning_rate": 1.653556771545828e-05, "loss": 1.9079, "step": 7828 }, { "epoch": 0.67, "grad_norm": 1.5299206972122192, "learning_rate": 1.6518467852257183e-05, "loss": 1.8934, "step": 7832 }, { "epoch": 0.67, "grad_norm": 1.5533653497695923, "learning_rate": 1.6501367989056087e-05, "loss": 1.8824, "step": 7836 }, { "epoch": 0.67, "grad_norm": 1.5589085817337036, "learning_rate": 1.6484268125854994e-05, "loss": 1.8289, "step": 7840 }, { "epoch": 0.67, "grad_norm": 1.5232523679733276, "learning_rate": 1.6467168262653902e-05, "loss": 1.9622, "step": 7844 }, { "epoch": 0.67, "grad_norm": 1.4827030897140503, "learning_rate": 1.6450068399452806e-05, "loss": 2.0214, "step": 7848 }, { "epoch": 0.67, "grad_norm": 1.5622025728225708, "learning_rate": 1.643296853625171e-05, "loss": 1.9528, "step": 7852 }, { "epoch": 0.67, "grad_norm": 1.578583836555481, "learning_rate": 1.6415868673050617e-05, "loss": 1.9292, "step": 7856 }, { "epoch": 0.67, "grad_norm": 1.5761486291885376, "learning_rate": 1.639876880984952e-05, "loss": 1.8721, "step": 7860 }, { "epoch": 0.67, "grad_norm": 1.434203028678894, "learning_rate": 1.6381668946648425e-05, "loss": 1.882, "step": 7864 }, { "epoch": 0.67, "grad_norm": 1.7951371669769287, "learning_rate": 1.6364569083447336e-05, "loss": 1.8724, "step": 7868 }, { "epoch": 0.67, "grad_norm": 1.5747816562652588, "learning_rate": 1.634746922024624e-05, "loss": 1.9838, "step": 7872 }, { "epoch": 0.67, "grad_norm": 1.6416630744934082, "learning_rate": 1.6330369357045144e-05, "loss": 2.0448, "step": 7876 }, { "epoch": 0.67, "grad_norm": 1.5873196125030518, "learning_rate": 1.631326949384405e-05, "loss": 1.8606, "step": 7880 }, { "epoch": 0.67, "grad_norm": 1.708465576171875, "learning_rate": 1.6296169630642955e-05, "loss": 2.0646, "step": 7884 }, { "epoch": 0.67, "grad_norm": 1.764533281326294, "learning_rate": 1.6279069767441862e-05, "loss": 1.9685, "step": 7888 }, { "epoch": 0.67, "grad_norm": 1.6329145431518555, "learning_rate": 1.6261969904240766e-05, "loss": 1.9665, "step": 7892 }, { "epoch": 0.68, "grad_norm": 1.5862176418304443, "learning_rate": 1.6244870041039673e-05, "loss": 1.9459, "step": 7896 }, { "epoch": 0.68, "grad_norm": 1.66335129737854, "learning_rate": 1.6227770177838577e-05, "loss": 1.9046, "step": 7900 }, { "epoch": 0.68, "grad_norm": 1.854282259941101, "learning_rate": 1.621067031463748e-05, "loss": 1.8697, "step": 7904 }, { "epoch": 0.68, "grad_norm": 1.5997343063354492, "learning_rate": 1.6193570451436392e-05, "loss": 1.8564, "step": 7908 }, { "epoch": 0.68, "grad_norm": 1.5500624179840088, "learning_rate": 1.6176470588235296e-05, "loss": 1.9197, "step": 7912 }, { "epoch": 0.68, "grad_norm": 1.7503950595855713, "learning_rate": 1.61593707250342e-05, "loss": 1.8817, "step": 7916 }, { "epoch": 0.68, "grad_norm": 1.5188136100769043, "learning_rate": 1.6142270861833107e-05, "loss": 1.9714, "step": 7920 }, { "epoch": 0.68, "grad_norm": 1.632093906402588, "learning_rate": 1.612517099863201e-05, "loss": 1.9189, "step": 7924 }, { "epoch": 0.68, "grad_norm": 2.079958915710449, "learning_rate": 1.6108071135430915e-05, "loss": 1.9399, "step": 7928 }, { "epoch": 0.68, "grad_norm": 1.5415632724761963, "learning_rate": 1.6090971272229823e-05, "loss": 1.977, "step": 7932 }, { "epoch": 0.68, "grad_norm": 1.6359429359436035, "learning_rate": 1.607387140902873e-05, "loss": 1.9225, "step": 7936 }, { "epoch": 0.68, "grad_norm": 1.6793197393417358, "learning_rate": 1.6056771545827634e-05, "loss": 1.891, "step": 7940 }, { "epoch": 0.68, "grad_norm": 1.5686537027359009, "learning_rate": 1.6039671682626538e-05, "loss": 1.8904, "step": 7944 }, { "epoch": 0.68, "grad_norm": 1.8018839359283447, "learning_rate": 1.6022571819425445e-05, "loss": 1.8233, "step": 7948 }, { "epoch": 0.68, "grad_norm": 1.547400951385498, "learning_rate": 1.6005471956224353e-05, "loss": 1.9372, "step": 7952 }, { "epoch": 0.68, "grad_norm": 1.5246180295944214, "learning_rate": 1.5988372093023257e-05, "loss": 1.9839, "step": 7956 }, { "epoch": 0.68, "grad_norm": 1.4722802639007568, "learning_rate": 1.5971272229822164e-05, "loss": 1.7585, "step": 7960 }, { "epoch": 0.68, "grad_norm": 1.5736894607543945, "learning_rate": 1.5954172366621068e-05, "loss": 1.9647, "step": 7964 }, { "epoch": 0.68, "grad_norm": 1.6675376892089844, "learning_rate": 1.5937072503419972e-05, "loss": 1.9783, "step": 7968 }, { "epoch": 0.68, "grad_norm": 1.5381276607513428, "learning_rate": 1.591997264021888e-05, "loss": 1.8318, "step": 7972 }, { "epoch": 0.68, "grad_norm": 1.672789454460144, "learning_rate": 1.5902872777017786e-05, "loss": 1.9432, "step": 7976 }, { "epoch": 0.68, "grad_norm": 1.5092620849609375, "learning_rate": 1.588577291381669e-05, "loss": 1.8645, "step": 7980 }, { "epoch": 0.68, "grad_norm": 1.7272820472717285, "learning_rate": 1.5868673050615594e-05, "loss": 2.0409, "step": 7984 }, { "epoch": 0.68, "grad_norm": 1.5391128063201904, "learning_rate": 1.5851573187414502e-05, "loss": 1.7384, "step": 7988 }, { "epoch": 0.68, "grad_norm": 1.4302643537521362, "learning_rate": 1.5834473324213406e-05, "loss": 1.891, "step": 7992 }, { "epoch": 0.68, "grad_norm": 1.5676170587539673, "learning_rate": 1.5817373461012313e-05, "loss": 1.7741, "step": 7996 }, { "epoch": 0.68, "grad_norm": 1.7254034280776978, "learning_rate": 1.580027359781122e-05, "loss": 1.9008, "step": 8000 }, { "epoch": 0.68, "grad_norm": 1.5930966138839722, "learning_rate": 1.5783173734610124e-05, "loss": 1.9161, "step": 8004 }, { "epoch": 0.68, "grad_norm": 2.011584758758545, "learning_rate": 1.5766073871409028e-05, "loss": 1.9344, "step": 8008 }, { "epoch": 0.69, "grad_norm": 1.5066322088241577, "learning_rate": 1.5748974008207936e-05, "loss": 1.8778, "step": 8012 }, { "epoch": 0.69, "grad_norm": 1.775755763053894, "learning_rate": 1.5731874145006843e-05, "loss": 1.8985, "step": 8016 }, { "epoch": 0.69, "grad_norm": 1.5539259910583496, "learning_rate": 1.5714774281805747e-05, "loss": 1.8531, "step": 8020 }, { "epoch": 0.69, "grad_norm": 1.7370346784591675, "learning_rate": 1.569767441860465e-05, "loss": 1.9098, "step": 8024 }, { "epoch": 0.69, "grad_norm": 1.4785212278366089, "learning_rate": 1.5680574555403558e-05, "loss": 2.0605, "step": 8028 }, { "epoch": 0.69, "grad_norm": 1.6083650588989258, "learning_rate": 1.5663474692202462e-05, "loss": 2.0839, "step": 8032 }, { "epoch": 0.69, "grad_norm": 1.6726301908493042, "learning_rate": 1.564637482900137e-05, "loss": 1.8263, "step": 8036 }, { "epoch": 0.69, "grad_norm": 4.202913761138916, "learning_rate": 1.5629274965800273e-05, "loss": 1.8298, "step": 8040 }, { "epoch": 0.69, "grad_norm": 1.6206969022750854, "learning_rate": 1.561217510259918e-05, "loss": 1.9383, "step": 8044 }, { "epoch": 0.69, "grad_norm": 1.6992504596710205, "learning_rate": 1.5595075239398085e-05, "loss": 1.9817, "step": 8048 }, { "epoch": 0.69, "grad_norm": 1.685895323753357, "learning_rate": 1.557797537619699e-05, "loss": 2.0378, "step": 8052 }, { "epoch": 0.69, "grad_norm": 1.6659513711929321, "learning_rate": 1.5560875512995896e-05, "loss": 1.923, "step": 8056 }, { "epoch": 0.69, "grad_norm": 1.6059319972991943, "learning_rate": 1.5543775649794803e-05, "loss": 1.9304, "step": 8060 }, { "epoch": 0.69, "grad_norm": 1.725585699081421, "learning_rate": 1.5526675786593707e-05, "loss": 2.0079, "step": 8064 }, { "epoch": 0.69, "grad_norm": 1.7117788791656494, "learning_rate": 1.5509575923392615e-05, "loss": 2.1715, "step": 8068 }, { "epoch": 0.69, "grad_norm": 1.5837129354476929, "learning_rate": 1.549247606019152e-05, "loss": 1.9936, "step": 8072 }, { "epoch": 0.69, "grad_norm": 1.9159271717071533, "learning_rate": 1.5475376196990423e-05, "loss": 1.9387, "step": 8076 }, { "epoch": 0.69, "grad_norm": 1.7202599048614502, "learning_rate": 1.545827633378933e-05, "loss": 1.9975, "step": 8080 }, { "epoch": 0.69, "grad_norm": 1.5703368186950684, "learning_rate": 1.5441176470588237e-05, "loss": 1.8077, "step": 8084 }, { "epoch": 0.69, "grad_norm": 1.520732045173645, "learning_rate": 1.542407660738714e-05, "loss": 2.0593, "step": 8088 }, { "epoch": 0.69, "grad_norm": 1.8129254579544067, "learning_rate": 1.5406976744186045e-05, "loss": 1.8897, "step": 8092 }, { "epoch": 0.69, "grad_norm": 1.4720978736877441, "learning_rate": 1.5389876880984953e-05, "loss": 1.9366, "step": 8096 }, { "epoch": 0.69, "grad_norm": 1.5979335308074951, "learning_rate": 1.537277701778386e-05, "loss": 1.843, "step": 8100 }, { "epoch": 0.69, "grad_norm": 1.4189525842666626, "learning_rate": 1.5355677154582764e-05, "loss": 1.7472, "step": 8104 }, { "epoch": 0.69, "grad_norm": 1.724085807800293, "learning_rate": 1.533857729138167e-05, "loss": 2.108, "step": 8108 }, { "epoch": 0.69, "grad_norm": 1.5960993766784668, "learning_rate": 1.5321477428180575e-05, "loss": 1.8707, "step": 8112 }, { "epoch": 0.69, "grad_norm": 1.624754786491394, "learning_rate": 1.530437756497948e-05, "loss": 1.9318, "step": 8116 }, { "epoch": 0.69, "grad_norm": 1.758505940437317, "learning_rate": 1.5287277701778386e-05, "loss": 1.8996, "step": 8120 }, { "epoch": 0.69, "grad_norm": 1.633908987045288, "learning_rate": 1.5270177838577294e-05, "loss": 1.8455, "step": 8124 }, { "epoch": 0.69, "grad_norm": 1.578237533569336, "learning_rate": 1.5253077975376198e-05, "loss": 2.071, "step": 8128 }, { "epoch": 0.7, "grad_norm": 1.5181132555007935, "learning_rate": 1.5235978112175103e-05, "loss": 1.766, "step": 8132 }, { "epoch": 0.7, "grad_norm": 1.6433082818984985, "learning_rate": 1.5218878248974009e-05, "loss": 2.0446, "step": 8136 }, { "epoch": 0.7, "grad_norm": 1.6200603246688843, "learning_rate": 1.5201778385772913e-05, "loss": 1.7629, "step": 8140 }, { "epoch": 0.7, "grad_norm": 1.7067992687225342, "learning_rate": 1.5184678522571822e-05, "loss": 1.8624, "step": 8144 }, { "epoch": 0.7, "grad_norm": 1.515853762626648, "learning_rate": 1.5167578659370726e-05, "loss": 1.8187, "step": 8148 }, { "epoch": 0.7, "grad_norm": 1.7134236097335815, "learning_rate": 1.5150478796169632e-05, "loss": 1.9738, "step": 8152 }, { "epoch": 0.7, "grad_norm": 1.5560367107391357, "learning_rate": 1.5133378932968537e-05, "loss": 1.9252, "step": 8156 }, { "epoch": 0.7, "grad_norm": 1.476830005645752, "learning_rate": 1.5116279069767441e-05, "loss": 1.7564, "step": 8160 }, { "epoch": 0.7, "grad_norm": 1.6756997108459473, "learning_rate": 1.509917920656635e-05, "loss": 1.8457, "step": 8164 }, { "epoch": 0.7, "grad_norm": 1.6191574335098267, "learning_rate": 1.5082079343365254e-05, "loss": 1.9078, "step": 8168 }, { "epoch": 0.7, "grad_norm": 1.7781707048416138, "learning_rate": 1.506497948016416e-05, "loss": 1.9178, "step": 8172 }, { "epoch": 0.7, "grad_norm": 1.361154556274414, "learning_rate": 1.5047879616963066e-05, "loss": 1.9068, "step": 8176 }, { "epoch": 0.7, "grad_norm": 1.6916831731796265, "learning_rate": 1.503077975376197e-05, "loss": 1.9089, "step": 8180 }, { "epoch": 0.7, "grad_norm": 1.682424783706665, "learning_rate": 1.5013679890560875e-05, "loss": 2.0016, "step": 8184 }, { "epoch": 0.7, "grad_norm": 1.4187464714050293, "learning_rate": 1.4996580027359783e-05, "loss": 1.811, "step": 8188 }, { "epoch": 0.7, "grad_norm": 1.5043885707855225, "learning_rate": 1.4979480164158688e-05, "loss": 1.8448, "step": 8192 }, { "epoch": 0.7, "grad_norm": 1.6151576042175293, "learning_rate": 1.4962380300957594e-05, "loss": 1.9776, "step": 8196 }, { "epoch": 0.7, "grad_norm": 1.5769206285476685, "learning_rate": 1.4945280437756498e-05, "loss": 1.937, "step": 8200 }, { "epoch": 0.7, "grad_norm": 1.5044043064117432, "learning_rate": 1.4928180574555403e-05, "loss": 1.852, "step": 8204 }, { "epoch": 0.7, "grad_norm": 1.6411508321762085, "learning_rate": 1.491108071135431e-05, "loss": 1.9599, "step": 8208 }, { "epoch": 0.7, "grad_norm": 1.6469234228134155, "learning_rate": 1.4893980848153216e-05, "loss": 1.9749, "step": 8212 }, { "epoch": 0.7, "grad_norm": 1.64458167552948, "learning_rate": 1.4876880984952122e-05, "loss": 1.728, "step": 8216 }, { "epoch": 0.7, "grad_norm": 1.5000782012939453, "learning_rate": 1.4859781121751026e-05, "loss": 1.8083, "step": 8220 }, { "epoch": 0.7, "grad_norm": 1.521461844444275, "learning_rate": 1.4842681258549932e-05, "loss": 1.9935, "step": 8224 }, { "epoch": 0.7, "grad_norm": 1.5517728328704834, "learning_rate": 1.4825581395348839e-05, "loss": 2.0673, "step": 8228 }, { "epoch": 0.7, "grad_norm": 1.566142201423645, "learning_rate": 1.4808481532147745e-05, "loss": 1.8942, "step": 8232 }, { "epoch": 0.7, "grad_norm": 1.5397378206253052, "learning_rate": 1.4791381668946649e-05, "loss": 1.963, "step": 8236 }, { "epoch": 0.7, "grad_norm": 1.3787508010864258, "learning_rate": 1.4774281805745554e-05, "loss": 1.9094, "step": 8240 }, { "epoch": 0.7, "grad_norm": 1.4250317811965942, "learning_rate": 1.475718194254446e-05, "loss": 1.8181, "step": 8244 }, { "epoch": 0.71, "grad_norm": 1.868123173713684, "learning_rate": 1.4740082079343364e-05, "loss": 1.975, "step": 8248 }, { "epoch": 0.71, "grad_norm": 1.6380820274353027, "learning_rate": 1.4722982216142273e-05, "loss": 1.8019, "step": 8252 }, { "epoch": 0.71, "grad_norm": 1.7641537189483643, "learning_rate": 1.4705882352941177e-05, "loss": 1.8762, "step": 8256 }, { "epoch": 0.71, "grad_norm": 1.704601526260376, "learning_rate": 1.4688782489740083e-05, "loss": 1.938, "step": 8260 }, { "epoch": 0.71, "grad_norm": 1.5377604961395264, "learning_rate": 1.4671682626538988e-05, "loss": 1.837, "step": 8264 }, { "epoch": 0.71, "grad_norm": 1.5826222896575928, "learning_rate": 1.4654582763337892e-05, "loss": 1.9301, "step": 8268 }, { "epoch": 0.71, "grad_norm": 1.691020131111145, "learning_rate": 1.4637482900136801e-05, "loss": 1.9719, "step": 8272 }, { "epoch": 0.71, "grad_norm": 1.5105916261672974, "learning_rate": 1.4620383036935705e-05, "loss": 1.9823, "step": 8276 }, { "epoch": 0.71, "grad_norm": 1.5825799703598022, "learning_rate": 1.460328317373461e-05, "loss": 1.8319, "step": 8280 }, { "epoch": 0.71, "grad_norm": 1.7014378309249878, "learning_rate": 1.4586183310533516e-05, "loss": 1.85, "step": 8284 }, { "epoch": 0.71, "grad_norm": 1.5736719369888306, "learning_rate": 1.456908344733242e-05, "loss": 1.7308, "step": 8288 }, { "epoch": 0.71, "grad_norm": 1.4500672817230225, "learning_rate": 1.455198358413133e-05, "loss": 1.8872, "step": 8292 }, { "epoch": 0.71, "grad_norm": 1.5889211893081665, "learning_rate": 1.4534883720930233e-05, "loss": 1.9407, "step": 8296 }, { "epoch": 0.71, "grad_norm": 1.7505712509155273, "learning_rate": 1.4517783857729139e-05, "loss": 1.9576, "step": 8300 }, { "epoch": 0.71, "grad_norm": 1.6860861778259277, "learning_rate": 1.4500683994528045e-05, "loss": 2.0681, "step": 8304 }, { "epoch": 0.71, "grad_norm": 1.6268845796585083, "learning_rate": 1.4483584131326949e-05, "loss": 2.0465, "step": 8308 }, { "epoch": 0.71, "grad_norm": 1.5005725622177124, "learning_rate": 1.4466484268125854e-05, "loss": 1.8097, "step": 8312 }, { "epoch": 0.71, "grad_norm": 1.6980187892913818, "learning_rate": 1.4449384404924762e-05, "loss": 1.8862, "step": 8316 }, { "epoch": 0.71, "grad_norm": 1.7522437572479248, "learning_rate": 1.4432284541723667e-05, "loss": 1.8532, "step": 8320 }, { "epoch": 0.71, "grad_norm": 1.590436577796936, "learning_rate": 1.4415184678522573e-05, "loss": 1.8276, "step": 8324 }, { "epoch": 0.71, "grad_norm": 1.487074613571167, "learning_rate": 1.4398084815321477e-05, "loss": 1.9804, "step": 8328 }, { "epoch": 0.71, "grad_norm": 1.9625794887542725, "learning_rate": 1.4380984952120383e-05, "loss": 1.9065, "step": 8332 }, { "epoch": 0.71, "grad_norm": 1.6127842664718628, "learning_rate": 1.436388508891929e-05, "loss": 1.8961, "step": 8336 }, { "epoch": 0.71, "grad_norm": 1.7317554950714111, "learning_rate": 1.4346785225718196e-05, "loss": 1.9341, "step": 8340 }, { "epoch": 0.71, "grad_norm": 1.5197057723999023, "learning_rate": 1.4329685362517101e-05, "loss": 1.9849, "step": 8344 }, { "epoch": 0.71, "grad_norm": 1.5553900003433228, "learning_rate": 1.4312585499316005e-05, "loss": 1.7892, "step": 8348 }, { "epoch": 0.71, "grad_norm": 1.52109956741333, "learning_rate": 1.429548563611491e-05, "loss": 1.8928, "step": 8352 }, { "epoch": 0.71, "grad_norm": 1.6558705568313599, "learning_rate": 1.4278385772913818e-05, "loss": 1.799, "step": 8356 }, { "epoch": 0.71, "grad_norm": 1.612926959991455, "learning_rate": 1.4261285909712724e-05, "loss": 2.0211, "step": 8360 }, { "epoch": 0.72, "grad_norm": 1.5769792795181274, "learning_rate": 1.424418604651163e-05, "loss": 1.9197, "step": 8364 }, { "epoch": 0.72, "grad_norm": 1.557539939880371, "learning_rate": 1.4227086183310533e-05, "loss": 1.973, "step": 8368 }, { "epoch": 0.72, "grad_norm": 1.6096733808517456, "learning_rate": 1.4209986320109439e-05, "loss": 1.9608, "step": 8372 }, { "epoch": 0.72, "grad_norm": 1.759639024734497, "learning_rate": 1.4192886456908345e-05, "loss": 2.0181, "step": 8376 }, { "epoch": 0.72, "grad_norm": 1.7144453525543213, "learning_rate": 1.4175786593707252e-05, "loss": 1.9385, "step": 8380 }, { "epoch": 0.72, "grad_norm": 1.6760494709014893, "learning_rate": 1.4158686730506158e-05, "loss": 2.1005, "step": 8384 }, { "epoch": 0.72, "grad_norm": 1.7116944789886475, "learning_rate": 1.4141586867305062e-05, "loss": 1.9031, "step": 8388 }, { "epoch": 0.72, "grad_norm": 1.5736141204833984, "learning_rate": 1.4124487004103967e-05, "loss": 2.0717, "step": 8392 }, { "epoch": 0.72, "grad_norm": 1.5009993314743042, "learning_rate": 1.4107387140902873e-05, "loss": 1.9062, "step": 8396 }, { "epoch": 0.72, "grad_norm": 1.734379529953003, "learning_rate": 1.409028727770178e-05, "loss": 1.9607, "step": 8400 }, { "epoch": 0.72, "grad_norm": 1.7106738090515137, "learning_rate": 1.4073187414500686e-05, "loss": 1.9597, "step": 8404 }, { "epoch": 0.72, "grad_norm": 1.503051996231079, "learning_rate": 1.405608755129959e-05, "loss": 1.8636, "step": 8408 }, { "epoch": 0.72, "grad_norm": 1.544773817062378, "learning_rate": 1.4038987688098496e-05, "loss": 1.903, "step": 8412 }, { "epoch": 0.72, "grad_norm": 1.6054282188415527, "learning_rate": 1.4021887824897401e-05, "loss": 2.01, "step": 8416 }, { "epoch": 0.72, "grad_norm": 1.5877810716629028, "learning_rate": 1.4004787961696309e-05, "loss": 1.8562, "step": 8420 }, { "epoch": 0.72, "grad_norm": 1.669568419456482, "learning_rate": 1.3987688098495214e-05, "loss": 1.8486, "step": 8424 }, { "epoch": 0.72, "grad_norm": 1.780972957611084, "learning_rate": 1.3970588235294118e-05, "loss": 1.9747, "step": 8428 }, { "epoch": 0.72, "grad_norm": 1.4789724349975586, "learning_rate": 1.3953488372093024e-05, "loss": 1.9874, "step": 8432 }, { "epoch": 0.72, "grad_norm": 1.5372753143310547, "learning_rate": 1.393638850889193e-05, "loss": 1.6921, "step": 8436 }, { "epoch": 0.72, "grad_norm": 1.5397859811782837, "learning_rate": 1.3919288645690833e-05, "loss": 1.9584, "step": 8440 }, { "epoch": 0.72, "grad_norm": 1.774103045463562, "learning_rate": 1.3902188782489742e-05, "loss": 1.9538, "step": 8444 }, { "epoch": 0.72, "grad_norm": 1.6088874340057373, "learning_rate": 1.3885088919288646e-05, "loss": 1.9065, "step": 8448 }, { "epoch": 0.72, "grad_norm": 1.5402274131774902, "learning_rate": 1.3867989056087552e-05, "loss": 1.8515, "step": 8452 }, { "epoch": 0.72, "grad_norm": 1.530392050743103, "learning_rate": 1.3850889192886458e-05, "loss": 1.6816, "step": 8456 }, { "epoch": 0.72, "grad_norm": 1.4472659826278687, "learning_rate": 1.3833789329685362e-05, "loss": 1.8966, "step": 8460 }, { "epoch": 0.72, "grad_norm": 1.587443470954895, "learning_rate": 1.381668946648427e-05, "loss": 1.7889, "step": 8464 }, { "epoch": 0.72, "grad_norm": 1.7226911783218384, "learning_rate": 1.3799589603283175e-05, "loss": 1.9593, "step": 8468 }, { "epoch": 0.72, "grad_norm": 1.587898850440979, "learning_rate": 1.378248974008208e-05, "loss": 1.8951, "step": 8472 }, { "epoch": 0.72, "grad_norm": 1.9148885011672974, "learning_rate": 1.3765389876880986e-05, "loss": 1.9351, "step": 8476 }, { "epoch": 0.73, "grad_norm": 1.8292063474655151, "learning_rate": 1.374829001367989e-05, "loss": 1.8462, "step": 8480 }, { "epoch": 0.73, "grad_norm": 1.73773992061615, "learning_rate": 1.3731190150478799e-05, "loss": 1.8633, "step": 8484 }, { "epoch": 0.73, "grad_norm": 1.4449963569641113, "learning_rate": 1.3714090287277703e-05, "loss": 1.924, "step": 8488 }, { "epoch": 0.73, "grad_norm": 1.5996627807617188, "learning_rate": 1.3696990424076609e-05, "loss": 1.8774, "step": 8492 }, { "epoch": 0.73, "grad_norm": 1.6882178783416748, "learning_rate": 1.3679890560875514e-05, "loss": 1.9221, "step": 8496 }, { "epoch": 0.73, "grad_norm": 1.6781047582626343, "learning_rate": 1.3662790697674418e-05, "loss": 1.7734, "step": 8500 }, { "epoch": 0.73, "grad_norm": 1.6286660432815552, "learning_rate": 1.3645690834473324e-05, "loss": 1.8426, "step": 8504 }, { "epoch": 0.73, "grad_norm": 1.4449224472045898, "learning_rate": 1.3628590971272231e-05, "loss": 1.8119, "step": 8508 }, { "epoch": 0.73, "grad_norm": 1.6157493591308594, "learning_rate": 1.3611491108071137e-05, "loss": 1.7127, "step": 8512 }, { "epoch": 0.73, "grad_norm": 1.8456206321716309, "learning_rate": 1.3594391244870042e-05, "loss": 1.8209, "step": 8516 }, { "epoch": 0.73, "grad_norm": 1.5280355215072632, "learning_rate": 1.3577291381668946e-05, "loss": 2.0005, "step": 8520 }, { "epoch": 0.73, "grad_norm": 1.8175022602081299, "learning_rate": 1.3560191518467852e-05, "loss": 1.9212, "step": 8524 }, { "epoch": 0.73, "grad_norm": 1.5971537828445435, "learning_rate": 1.354309165526676e-05, "loss": 1.8606, "step": 8528 }, { "epoch": 0.73, "grad_norm": 1.7127560377120972, "learning_rate": 1.3525991792065665e-05, "loss": 1.8265, "step": 8532 }, { "epoch": 0.73, "grad_norm": 1.723050832748413, "learning_rate": 1.350889192886457e-05, "loss": 1.9374, "step": 8536 }, { "epoch": 0.73, "grad_norm": 1.4820556640625, "learning_rate": 1.3491792065663475e-05, "loss": 1.8634, "step": 8540 }, { "epoch": 0.73, "grad_norm": 1.731012225151062, "learning_rate": 1.347469220246238e-05, "loss": 1.9204, "step": 8544 }, { "epoch": 0.73, "grad_norm": 1.586348295211792, "learning_rate": 1.3457592339261288e-05, "loss": 1.8462, "step": 8548 }, { "epoch": 0.73, "grad_norm": 1.7355362176895142, "learning_rate": 1.3440492476060193e-05, "loss": 2.0345, "step": 8552 }, { "epoch": 0.73, "grad_norm": 1.4833712577819824, "learning_rate": 1.3423392612859099e-05, "loss": 1.9132, "step": 8556 }, { "epoch": 0.73, "grad_norm": 1.999471664428711, "learning_rate": 1.3406292749658003e-05, "loss": 2.0444, "step": 8560 }, { "epoch": 0.73, "grad_norm": 1.6343189477920532, "learning_rate": 1.3389192886456909e-05, "loss": 1.8392, "step": 8564 }, { "epoch": 0.73, "grad_norm": 1.5069808959960938, "learning_rate": 1.3372093023255814e-05, "loss": 1.8399, "step": 8568 }, { "epoch": 0.73, "grad_norm": 1.521087884902954, "learning_rate": 1.3354993160054722e-05, "loss": 1.9186, "step": 8572 }, { "epoch": 0.73, "grad_norm": 1.4739211797714233, "learning_rate": 1.3337893296853627e-05, "loss": 1.9674, "step": 8576 }, { "epoch": 0.73, "grad_norm": 1.5574839115142822, "learning_rate": 1.3320793433652531e-05, "loss": 1.8879, "step": 8580 }, { "epoch": 0.73, "grad_norm": 1.7786009311676025, "learning_rate": 1.3303693570451437e-05, "loss": 1.9308, "step": 8584 }, { "epoch": 0.73, "grad_norm": 1.627382516860962, "learning_rate": 1.3286593707250342e-05, "loss": 1.8588, "step": 8588 }, { "epoch": 0.73, "grad_norm": 1.5507442951202393, "learning_rate": 1.326949384404925e-05, "loss": 2.0934, "step": 8592 }, { "epoch": 0.73, "grad_norm": 1.486575722694397, "learning_rate": 1.3252393980848154e-05, "loss": 2.1199, "step": 8596 }, { "epoch": 0.74, "grad_norm": 1.3391540050506592, "learning_rate": 1.323529411764706e-05, "loss": 1.9324, "step": 8600 }, { "epoch": 0.74, "grad_norm": 1.69169020652771, "learning_rate": 1.3218194254445965e-05, "loss": 1.906, "step": 8604 }, { "epoch": 0.74, "grad_norm": 1.7022899389266968, "learning_rate": 1.3201094391244869e-05, "loss": 1.9422, "step": 8608 }, { "epoch": 0.74, "grad_norm": 1.4805599451065063, "learning_rate": 1.3183994528043778e-05, "loss": 1.8661, "step": 8612 }, { "epoch": 0.74, "grad_norm": 1.5892281532287598, "learning_rate": 1.3166894664842682e-05, "loss": 1.8542, "step": 8616 }, { "epoch": 0.74, "grad_norm": 1.6527740955352783, "learning_rate": 1.3149794801641588e-05, "loss": 1.8107, "step": 8620 }, { "epoch": 0.74, "grad_norm": 1.5607056617736816, "learning_rate": 1.3132694938440493e-05, "loss": 1.8057, "step": 8624 }, { "epoch": 0.74, "grad_norm": 1.5519297122955322, "learning_rate": 1.3115595075239397e-05, "loss": 1.8115, "step": 8628 }, { "epoch": 0.74, "grad_norm": 1.5834949016571045, "learning_rate": 1.3098495212038303e-05, "loss": 1.8907, "step": 8632 }, { "epoch": 0.74, "grad_norm": 1.6823513507843018, "learning_rate": 1.308139534883721e-05, "loss": 1.7454, "step": 8636 }, { "epoch": 0.74, "grad_norm": 1.6757451295852661, "learning_rate": 1.3064295485636116e-05, "loss": 1.7607, "step": 8640 }, { "epoch": 0.74, "grad_norm": 1.6643555164337158, "learning_rate": 1.3047195622435022e-05, "loss": 1.9063, "step": 8644 }, { "epoch": 0.74, "grad_norm": 1.7445098161697388, "learning_rate": 1.3030095759233925e-05, "loss": 1.7924, "step": 8648 }, { "epoch": 0.74, "grad_norm": 1.6503303050994873, "learning_rate": 1.3012995896032831e-05, "loss": 2.072, "step": 8652 }, { "epoch": 0.74, "grad_norm": 1.5941767692565918, "learning_rate": 1.2995896032831738e-05, "loss": 2.0103, "step": 8656 }, { "epoch": 0.74, "grad_norm": 1.6478708982467651, "learning_rate": 1.2978796169630644e-05, "loss": 1.7316, "step": 8660 }, { "epoch": 0.74, "grad_norm": 1.5611132383346558, "learning_rate": 1.296169630642955e-05, "loss": 1.822, "step": 8664 }, { "epoch": 0.74, "grad_norm": 1.5451310873031616, "learning_rate": 1.2944596443228454e-05, "loss": 2.0152, "step": 8668 }, { "epoch": 0.74, "grad_norm": 1.7043241262435913, "learning_rate": 1.292749658002736e-05, "loss": 1.9034, "step": 8672 }, { "epoch": 0.74, "grad_norm": 1.7091752290725708, "learning_rate": 1.2910396716826267e-05, "loss": 1.8423, "step": 8676 }, { "epoch": 0.74, "grad_norm": 1.6127060651779175, "learning_rate": 1.2893296853625172e-05, "loss": 1.6937, "step": 8680 }, { "epoch": 0.74, "grad_norm": 1.7381840944290161, "learning_rate": 1.2876196990424078e-05, "loss": 1.9543, "step": 8684 }, { "epoch": 0.74, "grad_norm": 1.5597513914108276, "learning_rate": 1.2859097127222982e-05, "loss": 1.8262, "step": 8688 }, { "epoch": 0.74, "grad_norm": 1.6478689908981323, "learning_rate": 1.2841997264021888e-05, "loss": 2.0061, "step": 8692 }, { "epoch": 0.74, "grad_norm": 1.6866868734359741, "learning_rate": 1.2824897400820793e-05, "loss": 2.0661, "step": 8696 }, { "epoch": 0.74, "grad_norm": 1.6700700521469116, "learning_rate": 1.28077975376197e-05, "loss": 1.8317, "step": 8700 }, { "epoch": 0.74, "grad_norm": 1.6782305240631104, "learning_rate": 1.2790697674418606e-05, "loss": 1.7571, "step": 8704 }, { "epoch": 0.74, "grad_norm": 1.5588642358779907, "learning_rate": 1.277359781121751e-05, "loss": 1.8465, "step": 8708 }, { "epoch": 0.74, "grad_norm": 1.5376290082931519, "learning_rate": 1.2756497948016416e-05, "loss": 1.8062, "step": 8712 }, { "epoch": 0.75, "grad_norm": 1.6274547576904297, "learning_rate": 1.2739398084815322e-05, "loss": 1.8891, "step": 8716 }, { "epoch": 0.75, "grad_norm": 1.577243447303772, "learning_rate": 1.2722298221614229e-05, "loss": 2.0104, "step": 8720 }, { "epoch": 0.75, "grad_norm": 1.848371148109436, "learning_rate": 1.2705198358413134e-05, "loss": 1.7878, "step": 8724 }, { "epoch": 0.75, "grad_norm": 1.5511852502822876, "learning_rate": 1.2688098495212038e-05, "loss": 1.765, "step": 8728 }, { "epoch": 0.75, "grad_norm": 1.580466389656067, "learning_rate": 1.2670998632010944e-05, "loss": 1.9052, "step": 8732 }, { "epoch": 0.75, "grad_norm": 1.6786214113235474, "learning_rate": 1.265389876880985e-05, "loss": 2.0108, "step": 8736 }, { "epoch": 0.75, "grad_norm": 1.574545979499817, "learning_rate": 1.2636798905608757e-05, "loss": 1.9098, "step": 8740 }, { "epoch": 0.75, "grad_norm": 1.7882972955703735, "learning_rate": 1.2619699042407663e-05, "loss": 1.9742, "step": 8744 }, { "epoch": 0.75, "grad_norm": 1.6370630264282227, "learning_rate": 1.2602599179206567e-05, "loss": 1.8861, "step": 8748 }, { "epoch": 0.75, "grad_norm": 2.044381618499756, "learning_rate": 1.2585499316005472e-05, "loss": 1.7537, "step": 8752 }, { "epoch": 0.75, "grad_norm": 1.6843827962875366, "learning_rate": 1.2568399452804378e-05, "loss": 1.9561, "step": 8756 }, { "epoch": 0.75, "grad_norm": 1.605976939201355, "learning_rate": 1.2551299589603282e-05, "loss": 1.8912, "step": 8760 }, { "epoch": 0.75, "grad_norm": 1.6944398880004883, "learning_rate": 1.2534199726402191e-05, "loss": 1.988, "step": 8764 }, { "epoch": 0.75, "grad_norm": 1.5753448009490967, "learning_rate": 1.2517099863201095e-05, "loss": 1.9168, "step": 8768 }, { "epoch": 0.75, "grad_norm": 1.6419624090194702, "learning_rate": 1.25e-05, "loss": 1.9642, "step": 8772 }, { "epoch": 0.75, "grad_norm": 1.6650840044021606, "learning_rate": 1.2482900136798906e-05, "loss": 1.9391, "step": 8776 }, { "epoch": 0.75, "grad_norm": 1.5701370239257812, "learning_rate": 1.2465800273597812e-05, "loss": 1.7449, "step": 8780 }, { "epoch": 0.75, "grad_norm": 1.6118509769439697, "learning_rate": 1.2448700410396718e-05, "loss": 1.9024, "step": 8784 }, { "epoch": 0.75, "grad_norm": 1.4989128112792969, "learning_rate": 1.2431600547195622e-05, "loss": 1.8511, "step": 8788 }, { "epoch": 0.75, "grad_norm": 1.8597005605697632, "learning_rate": 1.2414500683994529e-05, "loss": 2.0558, "step": 8792 }, { "epoch": 0.75, "grad_norm": 1.6900529861450195, "learning_rate": 1.2397400820793434e-05, "loss": 1.9513, "step": 8796 }, { "epoch": 0.75, "grad_norm": 1.8187960386276245, "learning_rate": 1.238030095759234e-05, "loss": 2.057, "step": 8800 }, { "epoch": 0.75, "grad_norm": 1.5813332796096802, "learning_rate": 1.2363201094391246e-05, "loss": 1.8454, "step": 8804 }, { "epoch": 0.75, "grad_norm": 1.655596137046814, "learning_rate": 1.234610123119015e-05, "loss": 1.8531, "step": 8808 }, { "epoch": 0.75, "grad_norm": 1.681291937828064, "learning_rate": 1.2329001367989057e-05, "loss": 1.8698, "step": 8812 }, { "epoch": 0.75, "grad_norm": 1.6871446371078491, "learning_rate": 1.2311901504787963e-05, "loss": 1.7355, "step": 8816 }, { "epoch": 0.75, "grad_norm": 1.62662935256958, "learning_rate": 1.2294801641586867e-05, "loss": 2.067, "step": 8820 }, { "epoch": 0.75, "grad_norm": 1.6432703733444214, "learning_rate": 1.2277701778385774e-05, "loss": 1.8635, "step": 8824 }, { "epoch": 0.75, "grad_norm": 1.410051941871643, "learning_rate": 1.2260601915184678e-05, "loss": 1.7336, "step": 8828 }, { "epoch": 0.76, "grad_norm": 1.6481599807739258, "learning_rate": 1.2243502051983585e-05, "loss": 2.025, "step": 8832 }, { "epoch": 0.76, "grad_norm": 1.7318426370620728, "learning_rate": 1.2226402188782491e-05, "loss": 1.9353, "step": 8836 }, { "epoch": 0.76, "grad_norm": 1.6890826225280762, "learning_rate": 1.2209302325581395e-05, "loss": 1.8659, "step": 8840 }, { "epoch": 0.76, "grad_norm": 1.5665086507797241, "learning_rate": 1.2192202462380302e-05, "loss": 1.9448, "step": 8844 }, { "epoch": 0.76, "grad_norm": 1.7020827531814575, "learning_rate": 1.2175102599179206e-05, "loss": 2.0202, "step": 8848 }, { "epoch": 0.76, "grad_norm": 1.6117048263549805, "learning_rate": 1.2158002735978112e-05, "loss": 1.9785, "step": 8852 }, { "epoch": 0.76, "grad_norm": 1.5090599060058594, "learning_rate": 1.214090287277702e-05, "loss": 1.929, "step": 8856 }, { "epoch": 0.76, "grad_norm": 1.5808215141296387, "learning_rate": 1.2123803009575923e-05, "loss": 1.9534, "step": 8860 }, { "epoch": 0.76, "grad_norm": 1.5516841411590576, "learning_rate": 1.210670314637483e-05, "loss": 1.9717, "step": 8864 }, { "epoch": 0.76, "grad_norm": 1.79678475856781, "learning_rate": 1.2089603283173734e-05, "loss": 1.8398, "step": 8868 }, { "epoch": 0.76, "grad_norm": 1.5560812950134277, "learning_rate": 1.207250341997264e-05, "loss": 1.9108, "step": 8872 }, { "epoch": 0.76, "grad_norm": 1.660593032836914, "learning_rate": 1.2055403556771547e-05, "loss": 1.9138, "step": 8876 }, { "epoch": 0.76, "grad_norm": 1.6071677207946777, "learning_rate": 1.2038303693570451e-05, "loss": 1.7974, "step": 8880 }, { "epoch": 0.76, "grad_norm": 1.5675511360168457, "learning_rate": 1.2021203830369357e-05, "loss": 1.7748, "step": 8884 }, { "epoch": 0.76, "grad_norm": 1.5390551090240479, "learning_rate": 1.2004103967168263e-05, "loss": 1.8618, "step": 8888 }, { "epoch": 0.76, "grad_norm": 1.4867161512374878, "learning_rate": 1.1987004103967168e-05, "loss": 1.7825, "step": 8892 }, { "epoch": 0.76, "grad_norm": 1.6912176609039307, "learning_rate": 1.1969904240766076e-05, "loss": 1.9014, "step": 8896 }, { "epoch": 0.76, "grad_norm": 1.5211007595062256, "learning_rate": 1.195280437756498e-05, "loss": 1.8937, "step": 8900 }, { "epoch": 0.76, "grad_norm": 1.582693099975586, "learning_rate": 1.1935704514363885e-05, "loss": 1.8975, "step": 8904 }, { "epoch": 0.76, "grad_norm": 1.5599603652954102, "learning_rate": 1.1918604651162791e-05, "loss": 1.8864, "step": 8908 }, { "epoch": 0.76, "grad_norm": 1.5778361558914185, "learning_rate": 1.1901504787961697e-05, "loss": 2.0265, "step": 8912 }, { "epoch": 0.76, "grad_norm": 1.5679864883422852, "learning_rate": 1.1884404924760602e-05, "loss": 1.8832, "step": 8916 }, { "epoch": 0.76, "grad_norm": 1.741749882698059, "learning_rate": 1.1867305061559508e-05, "loss": 1.9069, "step": 8920 }, { "epoch": 0.76, "grad_norm": 1.6839423179626465, "learning_rate": 1.1850205198358414e-05, "loss": 1.8291, "step": 8924 }, { "epoch": 0.76, "grad_norm": 1.638995885848999, "learning_rate": 1.183310533515732e-05, "loss": 1.8998, "step": 8928 }, { "epoch": 0.76, "grad_norm": 1.6037232875823975, "learning_rate": 1.1816005471956225e-05, "loss": 1.8874, "step": 8932 }, { "epoch": 0.76, "grad_norm": 2.105273962020874, "learning_rate": 1.179890560875513e-05, "loss": 1.9178, "step": 8936 }, { "epoch": 0.76, "grad_norm": 1.7534767389297485, "learning_rate": 1.1781805745554036e-05, "loss": 1.893, "step": 8940 }, { "epoch": 0.76, "grad_norm": 1.6804143190383911, "learning_rate": 1.1764705882352942e-05, "loss": 1.9738, "step": 8944 }, { "epoch": 0.77, "grad_norm": 1.6752735376358032, "learning_rate": 1.1747606019151847e-05, "loss": 1.774, "step": 8948 }, { "epoch": 0.77, "grad_norm": 1.6710683107376099, "learning_rate": 1.1730506155950753e-05, "loss": 1.9189, "step": 8952 }, { "epoch": 0.77, "grad_norm": 1.5679121017456055, "learning_rate": 1.1713406292749659e-05, "loss": 1.7889, "step": 8956 }, { "epoch": 0.77, "grad_norm": 1.6607458591461182, "learning_rate": 1.1696306429548564e-05, "loss": 1.7928, "step": 8960 }, { "epoch": 0.77, "grad_norm": 1.458150029182434, "learning_rate": 1.167920656634747e-05, "loss": 1.8047, "step": 8964 }, { "epoch": 0.77, "grad_norm": 1.8078303337097168, "learning_rate": 1.1662106703146374e-05, "loss": 2.096, "step": 8968 }, { "epoch": 0.77, "grad_norm": 1.7263890504837036, "learning_rate": 1.1645006839945281e-05, "loss": 2.0121, "step": 8972 }, { "epoch": 0.77, "grad_norm": 1.5924429893493652, "learning_rate": 1.1627906976744187e-05, "loss": 1.9003, "step": 8976 }, { "epoch": 0.77, "grad_norm": 1.7311723232269287, "learning_rate": 1.1610807113543091e-05, "loss": 1.8617, "step": 8980 }, { "epoch": 0.77, "grad_norm": 1.6257747411727905, "learning_rate": 1.1593707250341998e-05, "loss": 1.7866, "step": 8984 }, { "epoch": 0.77, "grad_norm": 1.6690551042556763, "learning_rate": 1.1576607387140902e-05, "loss": 1.9114, "step": 8988 }, { "epoch": 0.77, "grad_norm": 1.6894851922988892, "learning_rate": 1.155950752393981e-05, "loss": 1.9979, "step": 8992 }, { "epoch": 0.77, "grad_norm": 1.7198370695114136, "learning_rate": 1.1542407660738715e-05, "loss": 2.0374, "step": 8996 }, { "epoch": 0.77, "grad_norm": 1.5919808149337769, "learning_rate": 1.152530779753762e-05, "loss": 1.7399, "step": 9000 }, { "epoch": 0.77, "grad_norm": 1.6823852062225342, "learning_rate": 1.1508207934336527e-05, "loss": 1.8478, "step": 9004 }, { "epoch": 0.77, "grad_norm": 1.7140239477157593, "learning_rate": 1.149110807113543e-05, "loss": 1.9611, "step": 9008 }, { "epoch": 0.77, "grad_norm": 1.5392545461654663, "learning_rate": 1.1474008207934336e-05, "loss": 1.7857, "step": 9012 }, { "epoch": 0.77, "grad_norm": 1.8829190731048584, "learning_rate": 1.1456908344733244e-05, "loss": 1.8474, "step": 9016 }, { "epoch": 0.77, "grad_norm": 1.5163602828979492, "learning_rate": 1.1439808481532147e-05, "loss": 1.9006, "step": 9020 }, { "epoch": 0.77, "grad_norm": 1.5375906229019165, "learning_rate": 1.1422708618331055e-05, "loss": 2.0124, "step": 9024 }, { "epoch": 0.77, "grad_norm": 1.5933159589767456, "learning_rate": 1.1405608755129959e-05, "loss": 1.8251, "step": 9028 }, { "epoch": 0.77, "grad_norm": 1.8997576236724854, "learning_rate": 1.1388508891928864e-05, "loss": 1.9688, "step": 9032 }, { "epoch": 0.77, "grad_norm": 1.5310240983963013, "learning_rate": 1.1371409028727772e-05, "loss": 1.9101, "step": 9036 }, { "epoch": 0.77, "grad_norm": 1.7153500318527222, "learning_rate": 1.1354309165526676e-05, "loss": 1.7864, "step": 9040 }, { "epoch": 0.77, "grad_norm": 1.7716693878173828, "learning_rate": 1.1337209302325581e-05, "loss": 1.9119, "step": 9044 }, { "epoch": 0.77, "grad_norm": 1.8320080041885376, "learning_rate": 1.1320109439124487e-05, "loss": 2.0897, "step": 9048 }, { "epoch": 0.77, "grad_norm": 1.6159136295318604, "learning_rate": 1.1303009575923393e-05, "loss": 2.0392, "step": 9052 }, { "epoch": 0.77, "grad_norm": 1.5778326988220215, "learning_rate": 1.12859097127223e-05, "loss": 1.9324, "step": 9056 }, { "epoch": 0.77, "grad_norm": 1.5392941236495972, "learning_rate": 1.1268809849521204e-05, "loss": 1.8133, "step": 9060 }, { "epoch": 0.77, "grad_norm": 1.6904296875, "learning_rate": 1.125170998632011e-05, "loss": 1.9265, "step": 9064 }, { "epoch": 0.78, "grad_norm": 1.7861131429672241, "learning_rate": 1.1234610123119015e-05, "loss": 1.7405, "step": 9068 }, { "epoch": 0.78, "grad_norm": 1.6475653648376465, "learning_rate": 1.1217510259917921e-05, "loss": 1.8417, "step": 9072 }, { "epoch": 0.78, "grad_norm": 1.4819046258926392, "learning_rate": 1.1200410396716827e-05, "loss": 1.7927, "step": 9076 }, { "epoch": 0.78, "grad_norm": 1.5303575992584229, "learning_rate": 1.1183310533515732e-05, "loss": 1.735, "step": 9080 }, { "epoch": 0.78, "grad_norm": 1.5807466506958008, "learning_rate": 1.1166210670314638e-05, "loss": 1.7733, "step": 9084 }, { "epoch": 0.78, "grad_norm": 1.6154811382293701, "learning_rate": 1.1149110807113544e-05, "loss": 1.985, "step": 9088 }, { "epoch": 0.78, "grad_norm": 1.588836908340454, "learning_rate": 1.113201094391245e-05, "loss": 1.8861, "step": 9092 }, { "epoch": 0.78, "grad_norm": 1.6571130752563477, "learning_rate": 1.1114911080711355e-05, "loss": 1.9013, "step": 9096 }, { "epoch": 0.78, "grad_norm": 1.7480363845825195, "learning_rate": 1.109781121751026e-05, "loss": 2.0298, "step": 9100 }, { "epoch": 0.78, "grad_norm": 1.53179931640625, "learning_rate": 1.1080711354309166e-05, "loss": 1.8883, "step": 9104 }, { "epoch": 0.78, "grad_norm": 1.7249871492385864, "learning_rate": 1.1063611491108072e-05, "loss": 1.888, "step": 9108 }, { "epoch": 0.78, "grad_norm": 1.771964192390442, "learning_rate": 1.1046511627906977e-05, "loss": 1.7898, "step": 9112 }, { "epoch": 0.78, "grad_norm": 1.8706704378128052, "learning_rate": 1.1029411764705883e-05, "loss": 1.8936, "step": 9116 }, { "epoch": 0.78, "grad_norm": 1.6252784729003906, "learning_rate": 1.1012311901504789e-05, "loss": 1.909, "step": 9120 }, { "epoch": 0.78, "grad_norm": 1.5766993761062622, "learning_rate": 1.0995212038303694e-05, "loss": 1.7883, "step": 9124 }, { "epoch": 0.78, "grad_norm": 1.5931488275527954, "learning_rate": 1.09781121751026e-05, "loss": 1.8009, "step": 9128 }, { "epoch": 0.78, "grad_norm": 1.652861475944519, "learning_rate": 1.0961012311901506e-05, "loss": 1.9441, "step": 9132 }, { "epoch": 0.78, "grad_norm": 1.604109764099121, "learning_rate": 1.0943912448700411e-05, "loss": 1.944, "step": 9136 }, { "epoch": 0.78, "grad_norm": 1.648501992225647, "learning_rate": 1.0926812585499315e-05, "loss": 1.7164, "step": 9140 }, { "epoch": 0.78, "grad_norm": 1.697082757949829, "learning_rate": 1.0909712722298223e-05, "loss": 1.9967, "step": 9144 }, { "epoch": 0.78, "grad_norm": 1.5939291715621948, "learning_rate": 1.0892612859097127e-05, "loss": 1.8479, "step": 9148 }, { "epoch": 0.78, "grad_norm": 1.4296135902404785, "learning_rate": 1.0875512995896034e-05, "loss": 1.7912, "step": 9152 }, { "epoch": 0.78, "grad_norm": 1.5867764949798584, "learning_rate": 1.085841313269494e-05, "loss": 2.0543, "step": 9156 }, { "epoch": 0.78, "grad_norm": 1.877381443977356, "learning_rate": 1.0841313269493844e-05, "loss": 1.9605, "step": 9160 }, { "epoch": 0.78, "grad_norm": 1.5704667568206787, "learning_rate": 1.0824213406292751e-05, "loss": 1.8716, "step": 9164 }, { "epoch": 0.78, "grad_norm": 1.7016427516937256, "learning_rate": 1.0807113543091655e-05, "loss": 1.8002, "step": 9168 }, { "epoch": 0.78, "grad_norm": 1.5268316268920898, "learning_rate": 1.079001367989056e-05, "loss": 1.8328, "step": 9172 }, { "epoch": 0.78, "grad_norm": 1.773216962814331, "learning_rate": 1.0772913816689468e-05, "loss": 2.0852, "step": 9176 }, { "epoch": 0.78, "grad_norm": 1.8278284072875977, "learning_rate": 1.0755813953488372e-05, "loss": 1.7678, "step": 9180 }, { "epoch": 0.79, "grad_norm": 1.9138497114181519, "learning_rate": 1.0738714090287279e-05, "loss": 1.8216, "step": 9184 }, { "epoch": 0.79, "grad_norm": 1.6149177551269531, "learning_rate": 1.0721614227086183e-05, "loss": 2.0147, "step": 9188 }, { "epoch": 0.79, "grad_norm": 1.5738332271575928, "learning_rate": 1.0704514363885089e-05, "loss": 1.9592, "step": 9192 }, { "epoch": 0.79, "grad_norm": 1.617006778717041, "learning_rate": 1.0687414500683996e-05, "loss": 1.9349, "step": 9196 }, { "epoch": 0.79, "grad_norm": 1.6684281826019287, "learning_rate": 1.06703146374829e-05, "loss": 1.9102, "step": 9200 }, { "epoch": 0.79, "grad_norm": 1.5881692171096802, "learning_rate": 1.0653214774281806e-05, "loss": 1.7868, "step": 9204 }, { "epoch": 0.79, "grad_norm": 1.6087418794631958, "learning_rate": 1.0636114911080711e-05, "loss": 1.8317, "step": 9208 }, { "epoch": 0.79, "grad_norm": 1.5474470853805542, "learning_rate": 1.0619015047879617e-05, "loss": 1.8203, "step": 9212 }, { "epoch": 0.79, "grad_norm": 1.6170072555541992, "learning_rate": 1.0601915184678524e-05, "loss": 1.9812, "step": 9216 }, { "epoch": 0.79, "grad_norm": 1.6150282621383667, "learning_rate": 1.0584815321477428e-05, "loss": 1.831, "step": 9220 }, { "epoch": 0.79, "grad_norm": 1.888481855392456, "learning_rate": 1.0567715458276334e-05, "loss": 1.9179, "step": 9224 }, { "epoch": 0.79, "grad_norm": 1.499454379081726, "learning_rate": 1.055061559507524e-05, "loss": 1.9117, "step": 9228 }, { "epoch": 0.79, "grad_norm": 1.8907907009124756, "learning_rate": 1.0533515731874145e-05, "loss": 1.9377, "step": 9232 }, { "epoch": 0.79, "grad_norm": 1.6337422132492065, "learning_rate": 1.0516415868673051e-05, "loss": 1.946, "step": 9236 }, { "epoch": 0.79, "grad_norm": 1.692440390586853, "learning_rate": 1.0499316005471957e-05, "loss": 2.0171, "step": 9240 }, { "epoch": 0.79, "grad_norm": 1.8192511796951294, "learning_rate": 1.0482216142270862e-05, "loss": 2.0665, "step": 9244 }, { "epoch": 0.79, "grad_norm": 1.7105309963226318, "learning_rate": 1.0465116279069768e-05, "loss": 1.8517, "step": 9248 }, { "epoch": 0.79, "grad_norm": 1.570191740989685, "learning_rate": 1.0448016415868673e-05, "loss": 1.879, "step": 9252 }, { "epoch": 0.79, "grad_norm": 1.5352414846420288, "learning_rate": 1.0430916552667579e-05, "loss": 1.9844, "step": 9256 }, { "epoch": 0.79, "grad_norm": 1.6077239513397217, "learning_rate": 1.0413816689466485e-05, "loss": 1.8066, "step": 9260 }, { "epoch": 0.79, "grad_norm": 1.6216827630996704, "learning_rate": 1.039671682626539e-05, "loss": 1.7569, "step": 9264 }, { "epoch": 0.79, "grad_norm": 1.5977295637130737, "learning_rate": 1.0379616963064296e-05, "loss": 1.841, "step": 9268 }, { "epoch": 0.79, "grad_norm": 1.7415016889572144, "learning_rate": 1.0362517099863202e-05, "loss": 2.0001, "step": 9272 }, { "epoch": 0.79, "grad_norm": 1.6122945547103882, "learning_rate": 1.0345417236662107e-05, "loss": 1.7887, "step": 9276 }, { "epoch": 0.79, "grad_norm": 1.7112343311309814, "learning_rate": 1.0328317373461013e-05, "loss": 1.9709, "step": 9280 }, { "epoch": 0.79, "grad_norm": 1.583895206451416, "learning_rate": 1.0311217510259919e-05, "loss": 2.0006, "step": 9284 }, { "epoch": 0.79, "grad_norm": 1.5309836864471436, "learning_rate": 1.0294117647058824e-05, "loss": 1.8978, "step": 9288 }, { "epoch": 0.79, "grad_norm": 1.6894787549972534, "learning_rate": 1.027701778385773e-05, "loss": 1.856, "step": 9292 }, { "epoch": 0.79, "grad_norm": 1.5949068069458008, "learning_rate": 1.0259917920656636e-05, "loss": 1.84, "step": 9296 }, { "epoch": 0.8, "grad_norm": 1.5734741687774658, "learning_rate": 1.024281805745554e-05, "loss": 1.9617, "step": 9300 }, { "epoch": 0.8, "grad_norm": 1.7839090824127197, "learning_rate": 1.0225718194254447e-05, "loss": 1.7957, "step": 9304 }, { "epoch": 0.8, "grad_norm": 1.9882410764694214, "learning_rate": 1.0208618331053353e-05, "loss": 2.0439, "step": 9308 }, { "epoch": 0.8, "grad_norm": 1.6469732522964478, "learning_rate": 1.0191518467852258e-05, "loss": 1.82, "step": 9312 }, { "epoch": 0.8, "grad_norm": 1.597993016242981, "learning_rate": 1.0174418604651164e-05, "loss": 2.0186, "step": 9316 }, { "epoch": 0.8, "grad_norm": 1.5456162691116333, "learning_rate": 1.0157318741450068e-05, "loss": 1.9057, "step": 9320 }, { "epoch": 0.8, "grad_norm": 1.589836597442627, "learning_rate": 1.0140218878248975e-05, "loss": 1.8665, "step": 9324 }, { "epoch": 0.8, "grad_norm": 1.5683108568191528, "learning_rate": 1.0123119015047879e-05, "loss": 1.7532, "step": 9328 }, { "epoch": 0.8, "grad_norm": 1.5445048809051514, "learning_rate": 1.0106019151846785e-05, "loss": 1.9741, "step": 9332 }, { "epoch": 0.8, "grad_norm": 1.6579010486602783, "learning_rate": 1.0088919288645692e-05, "loss": 1.8469, "step": 9336 }, { "epoch": 0.8, "grad_norm": 1.813675045967102, "learning_rate": 1.0071819425444596e-05, "loss": 1.9811, "step": 9340 }, { "epoch": 0.8, "grad_norm": 1.69435453414917, "learning_rate": 1.0054719562243503e-05, "loss": 1.6031, "step": 9344 }, { "epoch": 0.8, "grad_norm": 1.9789328575134277, "learning_rate": 1.0037619699042407e-05, "loss": 2.0603, "step": 9348 }, { "epoch": 0.8, "grad_norm": 1.7569694519042969, "learning_rate": 1.0020519835841313e-05, "loss": 1.8548, "step": 9352 }, { "epoch": 0.8, "grad_norm": 1.7058336734771729, "learning_rate": 1.000341997264022e-05, "loss": 1.8937, "step": 9356 }, { "epoch": 0.8, "grad_norm": 1.7999751567840576, "learning_rate": 9.986320109439124e-06, "loss": 2.0138, "step": 9360 }, { "epoch": 0.8, "grad_norm": 1.6166166067123413, "learning_rate": 9.96922024623803e-06, "loss": 1.8514, "step": 9364 }, { "epoch": 0.8, "grad_norm": 1.5393308401107788, "learning_rate": 9.952120383036936e-06, "loss": 1.8538, "step": 9368 }, { "epoch": 0.8, "grad_norm": 1.7060655355453491, "learning_rate": 9.935020519835841e-06, "loss": 1.8204, "step": 9372 }, { "epoch": 0.8, "grad_norm": 1.5741777420043945, "learning_rate": 9.917920656634749e-06, "loss": 1.8723, "step": 9376 }, { "epoch": 0.8, "grad_norm": 1.7236156463623047, "learning_rate": 9.900820793433653e-06, "loss": 2.0826, "step": 9380 }, { "epoch": 0.8, "grad_norm": 1.6071019172668457, "learning_rate": 9.883720930232558e-06, "loss": 1.8699, "step": 9384 }, { "epoch": 0.8, "grad_norm": 1.5609468221664429, "learning_rate": 9.866621067031464e-06, "loss": 1.7908, "step": 9388 }, { "epoch": 0.8, "grad_norm": 1.5828368663787842, "learning_rate": 9.84952120383037e-06, "loss": 1.9035, "step": 9392 }, { "epoch": 0.8, "grad_norm": 1.6804313659667969, "learning_rate": 9.832421340629275e-06, "loss": 1.873, "step": 9396 }, { "epoch": 0.8, "grad_norm": 1.7444369792938232, "learning_rate": 9.81532147742818e-06, "loss": 1.9714, "step": 9400 }, { "epoch": 0.8, "grad_norm": 1.478379487991333, "learning_rate": 9.798221614227086e-06, "loss": 1.825, "step": 9404 }, { "epoch": 0.8, "grad_norm": 1.8659464120864868, "learning_rate": 9.781121751025992e-06, "loss": 2.0924, "step": 9408 }, { "epoch": 0.8, "grad_norm": 1.6995452642440796, "learning_rate": 9.764021887824898e-06, "loss": 1.7909, "step": 9412 }, { "epoch": 0.81, "grad_norm": 1.7288949489593506, "learning_rate": 9.746922024623803e-06, "loss": 1.9683, "step": 9416 }, { "epoch": 0.81, "grad_norm": 1.728184461593628, "learning_rate": 9.729822161422709e-06, "loss": 1.867, "step": 9420 }, { "epoch": 0.81, "grad_norm": 1.6516950130462646, "learning_rate": 9.712722298221615e-06, "loss": 1.847, "step": 9424 }, { "epoch": 0.81, "grad_norm": 1.6084563732147217, "learning_rate": 9.69562243502052e-06, "loss": 1.8446, "step": 9428 }, { "epoch": 0.81, "grad_norm": 1.6345360279083252, "learning_rate": 9.678522571819426e-06, "loss": 1.8856, "step": 9432 }, { "epoch": 0.81, "grad_norm": 1.6293641328811646, "learning_rate": 9.661422708618332e-06, "loss": 1.8721, "step": 9436 }, { "epoch": 0.81, "grad_norm": 1.7901579141616821, "learning_rate": 9.644322845417237e-06, "loss": 1.8333, "step": 9440 }, { "epoch": 0.81, "grad_norm": 1.8444390296936035, "learning_rate": 9.627222982216143e-06, "loss": 1.8012, "step": 9444 }, { "epoch": 0.81, "grad_norm": 1.9100682735443115, "learning_rate": 9.610123119015049e-06, "loss": 1.8989, "step": 9448 }, { "epoch": 0.81, "grad_norm": 1.5829551219940186, "learning_rate": 9.593023255813954e-06, "loss": 1.692, "step": 9452 }, { "epoch": 0.81, "grad_norm": 1.7920702695846558, "learning_rate": 9.57592339261286e-06, "loss": 1.9008, "step": 9456 }, { "epoch": 0.81, "grad_norm": 1.8126853704452515, "learning_rate": 9.558823529411764e-06, "loss": 1.8993, "step": 9460 }, { "epoch": 0.81, "grad_norm": 1.6333680152893066, "learning_rate": 9.541723666210671e-06, "loss": 1.8486, "step": 9464 }, { "epoch": 0.81, "grad_norm": 1.6809686422348022, "learning_rate": 9.524623803009577e-06, "loss": 1.8182, "step": 9468 }, { "epoch": 0.81, "grad_norm": 1.5512781143188477, "learning_rate": 9.507523939808483e-06, "loss": 1.782, "step": 9472 }, { "epoch": 0.81, "grad_norm": 1.4863585233688354, "learning_rate": 9.490424076607388e-06, "loss": 1.9, "step": 9476 }, { "epoch": 0.81, "grad_norm": 1.8519527912139893, "learning_rate": 9.473324213406292e-06, "loss": 1.8792, "step": 9480 }, { "epoch": 0.81, "grad_norm": 1.68942391872406, "learning_rate": 9.4562243502052e-06, "loss": 1.81, "step": 9484 }, { "epoch": 0.81, "grad_norm": 1.6430591344833374, "learning_rate": 9.439124487004105e-06, "loss": 1.7057, "step": 9488 }, { "epoch": 0.81, "grad_norm": 1.7153677940368652, "learning_rate": 9.422024623803009e-06, "loss": 1.7675, "step": 9492 }, { "epoch": 0.81, "grad_norm": 2.2399628162384033, "learning_rate": 9.404924760601916e-06, "loss": 1.9985, "step": 9496 }, { "epoch": 0.81, "grad_norm": 1.6189675331115723, "learning_rate": 9.38782489740082e-06, "loss": 1.8922, "step": 9500 }, { "epoch": 0.81, "grad_norm": 1.6087392568588257, "learning_rate": 9.370725034199728e-06, "loss": 1.8978, "step": 9504 }, { "epoch": 0.81, "grad_norm": 1.7213088274002075, "learning_rate": 9.353625170998632e-06, "loss": 1.9047, "step": 9508 }, { "epoch": 0.81, "grad_norm": 1.6473731994628906, "learning_rate": 9.336525307797537e-06, "loss": 1.922, "step": 9512 }, { "epoch": 0.81, "grad_norm": 1.6384577751159668, "learning_rate": 9.319425444596445e-06, "loss": 1.9051, "step": 9516 }, { "epoch": 0.81, "grad_norm": 1.6715625524520874, "learning_rate": 9.302325581395349e-06, "loss": 1.7335, "step": 9520 }, { "epoch": 0.81, "grad_norm": 1.7468408346176147, "learning_rate": 9.285225718194254e-06, "loss": 1.8317, "step": 9524 }, { "epoch": 0.81, "grad_norm": 1.6021628379821777, "learning_rate": 9.26812585499316e-06, "loss": 1.7717, "step": 9528 }, { "epoch": 0.81, "grad_norm": 1.4728972911834717, "learning_rate": 9.251025991792066e-06, "loss": 1.8383, "step": 9532 }, { "epoch": 0.82, "grad_norm": 1.641257643699646, "learning_rate": 9.233926128590973e-06, "loss": 1.9203, "step": 9536 }, { "epoch": 0.82, "grad_norm": 1.5959044694900513, "learning_rate": 9.216826265389877e-06, "loss": 1.8186, "step": 9540 }, { "epoch": 0.82, "grad_norm": 1.8256573677062988, "learning_rate": 9.199726402188783e-06, "loss": 1.9279, "step": 9544 }, { "epoch": 0.82, "grad_norm": 1.6256192922592163, "learning_rate": 9.182626538987688e-06, "loss": 1.8773, "step": 9548 }, { "epoch": 0.82, "grad_norm": 1.7054941654205322, "learning_rate": 9.165526675786594e-06, "loss": 1.8826, "step": 9552 }, { "epoch": 0.82, "grad_norm": 1.7333673238754272, "learning_rate": 9.1484268125855e-06, "loss": 1.8855, "step": 9556 }, { "epoch": 0.82, "grad_norm": 1.9579733610153198, "learning_rate": 9.131326949384405e-06, "loss": 1.7222, "step": 9560 }, { "epoch": 0.82, "grad_norm": 1.6406586170196533, "learning_rate": 9.11422708618331e-06, "loss": 1.8063, "step": 9564 }, { "epoch": 0.82, "grad_norm": 1.8166844844818115, "learning_rate": 9.097127222982216e-06, "loss": 1.8802, "step": 9568 }, { "epoch": 0.82, "grad_norm": 1.815399169921875, "learning_rate": 9.080027359781122e-06, "loss": 1.9574, "step": 9572 }, { "epoch": 0.82, "grad_norm": 1.7717112302780151, "learning_rate": 9.062927496580028e-06, "loss": 1.8687, "step": 9576 }, { "epoch": 0.82, "grad_norm": 1.8283933401107788, "learning_rate": 9.045827633378933e-06, "loss": 1.7794, "step": 9580 }, { "epoch": 0.82, "grad_norm": 1.5563534498214722, "learning_rate": 9.028727770177839e-06, "loss": 1.9697, "step": 9584 }, { "epoch": 0.82, "grad_norm": 1.7888858318328857, "learning_rate": 9.011627906976745e-06, "loss": 1.8549, "step": 9588 }, { "epoch": 0.82, "grad_norm": 1.6961920261383057, "learning_rate": 8.99452804377565e-06, "loss": 1.8044, "step": 9592 }, { "epoch": 0.82, "grad_norm": 1.6775288581848145, "learning_rate": 8.977428180574556e-06, "loss": 1.7884, "step": 9596 }, { "epoch": 0.82, "grad_norm": 1.5376509428024292, "learning_rate": 8.960328317373462e-06, "loss": 1.9472, "step": 9600 }, { "epoch": 0.82, "grad_norm": 1.5770347118377686, "learning_rate": 8.943228454172367e-06, "loss": 1.7587, "step": 9604 }, { "epoch": 0.82, "grad_norm": 1.5645045042037964, "learning_rate": 8.926128590971273e-06, "loss": 1.9036, "step": 9608 }, { "epoch": 0.82, "grad_norm": 1.6209347248077393, "learning_rate": 8.909028727770179e-06, "loss": 1.9395, "step": 9612 }, { "epoch": 0.82, "grad_norm": 1.5157575607299805, "learning_rate": 8.891928864569084e-06, "loss": 1.9096, "step": 9616 }, { "epoch": 0.82, "grad_norm": 1.6890482902526855, "learning_rate": 8.874829001367988e-06, "loss": 1.8541, "step": 9620 }, { "epoch": 0.82, "grad_norm": 1.6792246103286743, "learning_rate": 8.857729138166896e-06, "loss": 1.9734, "step": 9624 }, { "epoch": 0.82, "grad_norm": 1.5776066780090332, "learning_rate": 8.840629274965801e-06, "loss": 1.9166, "step": 9628 }, { "epoch": 0.82, "grad_norm": 1.5487701892852783, "learning_rate": 8.823529411764707e-06, "loss": 1.7777, "step": 9632 }, { "epoch": 0.82, "grad_norm": 1.6531468629837036, "learning_rate": 8.806429548563612e-06, "loss": 1.6488, "step": 9636 }, { "epoch": 0.82, "grad_norm": 1.6902531385421753, "learning_rate": 8.789329685362516e-06, "loss": 1.8287, "step": 9640 }, { "epoch": 0.82, "grad_norm": 1.6954971551895142, "learning_rate": 8.772229822161424e-06, "loss": 1.896, "step": 9644 }, { "epoch": 0.82, "grad_norm": 1.5442079305648804, "learning_rate": 8.75512995896033e-06, "loss": 1.8335, "step": 9648 }, { "epoch": 0.83, "grad_norm": 1.739211916923523, "learning_rate": 8.738030095759233e-06, "loss": 1.8265, "step": 9652 }, { "epoch": 0.83, "grad_norm": 1.6437937021255493, "learning_rate": 8.72093023255814e-06, "loss": 1.7487, "step": 9656 }, { "epoch": 0.83, "grad_norm": 1.568172812461853, "learning_rate": 8.703830369357045e-06, "loss": 1.8618, "step": 9660 }, { "epoch": 0.83, "grad_norm": 1.84135103225708, "learning_rate": 8.686730506155952e-06, "loss": 1.8962, "step": 9664 }, { "epoch": 0.83, "grad_norm": 1.599940538406372, "learning_rate": 8.669630642954858e-06, "loss": 1.7502, "step": 9668 }, { "epoch": 0.83, "grad_norm": 1.9855149984359741, "learning_rate": 8.652530779753762e-06, "loss": 2.0228, "step": 9672 }, { "epoch": 0.83, "grad_norm": 1.641782283782959, "learning_rate": 8.635430916552669e-06, "loss": 2.0184, "step": 9676 }, { "epoch": 0.83, "grad_norm": 1.9988453388214111, "learning_rate": 8.618331053351573e-06, "loss": 2.0865, "step": 9680 }, { "epoch": 0.83, "grad_norm": 1.7934141159057617, "learning_rate": 8.601231190150479e-06, "loss": 1.8933, "step": 9684 }, { "epoch": 0.83, "grad_norm": 1.5691959857940674, "learning_rate": 8.584131326949384e-06, "loss": 1.8344, "step": 9688 }, { "epoch": 0.83, "grad_norm": 1.7705130577087402, "learning_rate": 8.56703146374829e-06, "loss": 1.646, "step": 9692 }, { "epoch": 0.83, "grad_norm": 1.6504237651824951, "learning_rate": 8.549931600547197e-06, "loss": 1.7268, "step": 9696 }, { "epoch": 0.83, "grad_norm": 1.4613388776779175, "learning_rate": 8.532831737346101e-06, "loss": 1.7871, "step": 9700 }, { "epoch": 0.83, "grad_norm": 1.7658060789108276, "learning_rate": 8.515731874145007e-06, "loss": 1.8519, "step": 9704 }, { "epoch": 0.83, "grad_norm": 1.84966242313385, "learning_rate": 8.498632010943912e-06, "loss": 1.8277, "step": 9708 }, { "epoch": 0.83, "grad_norm": 1.674241542816162, "learning_rate": 8.481532147742818e-06, "loss": 1.7632, "step": 9712 }, { "epoch": 0.83, "grad_norm": 1.7811789512634277, "learning_rate": 8.464432284541724e-06, "loss": 1.899, "step": 9716 }, { "epoch": 0.83, "grad_norm": 1.7027822732925415, "learning_rate": 8.44733242134063e-06, "loss": 1.9129, "step": 9720 }, { "epoch": 0.83, "grad_norm": 1.7912241220474243, "learning_rate": 8.430232558139535e-06, "loss": 1.8546, "step": 9724 }, { "epoch": 0.83, "grad_norm": 1.5889939069747925, "learning_rate": 8.41313269493844e-06, "loss": 2.0574, "step": 9728 }, { "epoch": 0.83, "grad_norm": 1.55410635471344, "learning_rate": 8.396032831737346e-06, "loss": 1.8547, "step": 9732 }, { "epoch": 0.83, "grad_norm": 1.6358247995376587, "learning_rate": 8.378932968536252e-06, "loss": 1.7924, "step": 9736 }, { "epoch": 0.83, "grad_norm": 1.698111891746521, "learning_rate": 8.361833105335158e-06, "loss": 1.8366, "step": 9740 }, { "epoch": 0.83, "grad_norm": 1.5512586832046509, "learning_rate": 8.344733242134063e-06, "loss": 1.7097, "step": 9744 }, { "epoch": 0.83, "grad_norm": 1.5253231525421143, "learning_rate": 8.327633378932969e-06, "loss": 1.9076, "step": 9748 }, { "epoch": 0.83, "grad_norm": 1.6231822967529297, "learning_rate": 8.310533515731875e-06, "loss": 1.9026, "step": 9752 }, { "epoch": 0.83, "grad_norm": 1.726797103881836, "learning_rate": 8.29343365253078e-06, "loss": 1.9518, "step": 9756 }, { "epoch": 0.83, "grad_norm": 1.7323880195617676, "learning_rate": 8.276333789329686e-06, "loss": 1.9352, "step": 9760 }, { "epoch": 0.83, "grad_norm": 1.7740181684494019, "learning_rate": 8.259233926128592e-06, "loss": 1.8697, "step": 9764 }, { "epoch": 0.84, "grad_norm": 1.6174265146255493, "learning_rate": 8.242134062927497e-06, "loss": 1.8365, "step": 9768 }, { "epoch": 0.84, "grad_norm": 1.7219985723495483, "learning_rate": 8.225034199726403e-06, "loss": 1.8343, "step": 9772 }, { "epoch": 0.84, "grad_norm": 1.618703007698059, "learning_rate": 8.207934336525308e-06, "loss": 1.914, "step": 9776 }, { "epoch": 0.84, "grad_norm": 1.5896012783050537, "learning_rate": 8.190834473324212e-06, "loss": 1.8311, "step": 9780 }, { "epoch": 0.84, "grad_norm": 1.614791989326477, "learning_rate": 8.17373461012312e-06, "loss": 1.8189, "step": 9784 }, { "epoch": 0.84, "grad_norm": 1.6827095746994019, "learning_rate": 8.156634746922025e-06, "loss": 1.7639, "step": 9788 }, { "epoch": 0.84, "grad_norm": 1.563234806060791, "learning_rate": 8.139534883720931e-06, "loss": 1.8255, "step": 9792 }, { "epoch": 0.84, "grad_norm": 1.871492624282837, "learning_rate": 8.122435020519837e-06, "loss": 1.7558, "step": 9796 }, { "epoch": 0.84, "grad_norm": 1.827107548713684, "learning_rate": 8.10533515731874e-06, "loss": 1.93, "step": 9800 }, { "epoch": 0.84, "grad_norm": 1.7132670879364014, "learning_rate": 8.088235294117648e-06, "loss": 1.9702, "step": 9804 }, { "epoch": 0.84, "grad_norm": 1.71363365650177, "learning_rate": 8.071135430916554e-06, "loss": 1.9638, "step": 9808 }, { "epoch": 0.84, "grad_norm": 1.6538909673690796, "learning_rate": 8.054035567715458e-06, "loss": 1.9329, "step": 9812 }, { "epoch": 0.84, "grad_norm": 1.4397317171096802, "learning_rate": 8.036935704514365e-06, "loss": 1.7031, "step": 9816 }, { "epoch": 0.84, "grad_norm": 1.7500611543655396, "learning_rate": 8.019835841313269e-06, "loss": 1.7927, "step": 9820 }, { "epoch": 0.84, "grad_norm": 1.7213584184646606, "learning_rate": 8.002735978112176e-06, "loss": 1.8121, "step": 9824 }, { "epoch": 0.84, "grad_norm": 1.8555186986923218, "learning_rate": 7.985636114911082e-06, "loss": 1.9388, "step": 9828 }, { "epoch": 0.84, "grad_norm": 1.5887397527694702, "learning_rate": 7.968536251709986e-06, "loss": 1.8344, "step": 9832 }, { "epoch": 0.84, "grad_norm": 1.5652546882629395, "learning_rate": 7.951436388508893e-06, "loss": 1.9696, "step": 9836 }, { "epoch": 0.84, "grad_norm": 1.5642329454421997, "learning_rate": 7.934336525307797e-06, "loss": 1.6744, "step": 9840 }, { "epoch": 0.84, "grad_norm": 1.631618618965149, "learning_rate": 7.917236662106703e-06, "loss": 1.8742, "step": 9844 }, { "epoch": 0.84, "grad_norm": 1.6112003326416016, "learning_rate": 7.90013679890561e-06, "loss": 1.9281, "step": 9848 }, { "epoch": 0.84, "grad_norm": 1.612426996231079, "learning_rate": 7.883036935704514e-06, "loss": 1.9576, "step": 9852 }, { "epoch": 0.84, "grad_norm": 1.7601630687713623, "learning_rate": 7.865937072503421e-06, "loss": 1.7478, "step": 9856 }, { "epoch": 0.84, "grad_norm": 1.782696008682251, "learning_rate": 7.848837209302325e-06, "loss": 1.8667, "step": 9860 }, { "epoch": 0.84, "grad_norm": 1.716011881828308, "learning_rate": 7.831737346101231e-06, "loss": 1.9523, "step": 9864 }, { "epoch": 0.84, "grad_norm": 1.5538578033447266, "learning_rate": 7.814637482900137e-06, "loss": 1.9909, "step": 9868 }, { "epoch": 0.84, "grad_norm": 1.6781466007232666, "learning_rate": 7.797537619699042e-06, "loss": 1.7684, "step": 9872 }, { "epoch": 0.84, "grad_norm": 1.6765227317810059, "learning_rate": 7.780437756497948e-06, "loss": 1.8663, "step": 9876 }, { "epoch": 0.84, "grad_norm": 1.5819069147109985, "learning_rate": 7.763337893296854e-06, "loss": 1.8592, "step": 9880 }, { "epoch": 0.85, "grad_norm": 1.6805880069732666, "learning_rate": 7.74623803009576e-06, "loss": 1.8612, "step": 9884 }, { "epoch": 0.85, "grad_norm": 1.6755802631378174, "learning_rate": 7.729138166894665e-06, "loss": 1.8602, "step": 9888 }, { "epoch": 0.85, "grad_norm": 1.7085371017456055, "learning_rate": 7.71203830369357e-06, "loss": 1.6679, "step": 9892 }, { "epoch": 0.85, "grad_norm": 1.5899639129638672, "learning_rate": 7.694938440492476e-06, "loss": 1.7626, "step": 9896 }, { "epoch": 0.85, "grad_norm": 1.740966796875, "learning_rate": 7.677838577291382e-06, "loss": 1.7984, "step": 9900 }, { "epoch": 0.85, "grad_norm": 1.8925389051437378, "learning_rate": 7.660738714090288e-06, "loss": 1.8242, "step": 9904 }, { "epoch": 0.85, "grad_norm": 1.7259442806243896, "learning_rate": 7.643638850889193e-06, "loss": 1.7816, "step": 9908 }, { "epoch": 0.85, "grad_norm": 1.5764368772506714, "learning_rate": 7.626538987688099e-06, "loss": 1.7718, "step": 9912 }, { "epoch": 0.85, "grad_norm": 1.7654069662094116, "learning_rate": 7.6094391244870045e-06, "loss": 1.7728, "step": 9916 }, { "epoch": 0.85, "grad_norm": 1.5519089698791504, "learning_rate": 7.592339261285911e-06, "loss": 1.9182, "step": 9920 }, { "epoch": 0.85, "grad_norm": 1.690704584121704, "learning_rate": 7.575239398084816e-06, "loss": 1.9885, "step": 9924 }, { "epoch": 0.85, "grad_norm": 1.5261144638061523, "learning_rate": 7.558139534883721e-06, "loss": 1.7169, "step": 9928 }, { "epoch": 0.85, "grad_norm": 1.5968029499053955, "learning_rate": 7.541039671682627e-06, "loss": 1.7766, "step": 9932 }, { "epoch": 0.85, "grad_norm": 1.5991588830947876, "learning_rate": 7.523939808481533e-06, "loss": 1.8359, "step": 9936 }, { "epoch": 0.85, "grad_norm": 1.591503620147705, "learning_rate": 7.506839945280438e-06, "loss": 1.7582, "step": 9940 }, { "epoch": 0.85, "grad_norm": 1.5666191577911377, "learning_rate": 7.489740082079344e-06, "loss": 1.8947, "step": 9944 }, { "epoch": 0.85, "grad_norm": 1.7336504459381104, "learning_rate": 7.472640218878249e-06, "loss": 1.8194, "step": 9948 }, { "epoch": 0.85, "grad_norm": 1.7414416074752808, "learning_rate": 7.455540355677155e-06, "loss": 1.8543, "step": 9952 }, { "epoch": 0.85, "grad_norm": 1.6664258241653442, "learning_rate": 7.438440492476061e-06, "loss": 1.786, "step": 9956 }, { "epoch": 0.85, "grad_norm": 1.594941258430481, "learning_rate": 7.421340629274966e-06, "loss": 1.773, "step": 9960 }, { "epoch": 0.85, "grad_norm": 1.71213698387146, "learning_rate": 7.404240766073872e-06, "loss": 1.9381, "step": 9964 }, { "epoch": 0.85, "grad_norm": 1.6945880651474, "learning_rate": 7.387140902872777e-06, "loss": 1.7854, "step": 9968 }, { "epoch": 0.85, "grad_norm": 1.71058189868927, "learning_rate": 7.370041039671682e-06, "loss": 1.8552, "step": 9972 }, { "epoch": 0.85, "grad_norm": 1.8152309656143188, "learning_rate": 7.3529411764705884e-06, "loss": 1.8759, "step": 9976 }, { "epoch": 0.85, "grad_norm": 1.6043202877044678, "learning_rate": 7.335841313269494e-06, "loss": 1.7945, "step": 9980 }, { "epoch": 0.85, "grad_norm": 1.7119457721710205, "learning_rate": 7.318741450068401e-06, "loss": 1.9138, "step": 9984 }, { "epoch": 0.85, "grad_norm": 1.9094306230545044, "learning_rate": 7.301641586867305e-06, "loss": 1.8234, "step": 9988 }, { "epoch": 0.85, "grad_norm": 1.6789647340774536, "learning_rate": 7.28454172366621e-06, "loss": 1.8883, "step": 9992 }, { "epoch": 0.85, "grad_norm": 1.6013123989105225, "learning_rate": 7.267441860465117e-06, "loss": 1.6778, "step": 9996 }, { "epoch": 0.85, "grad_norm": 1.7987964153289795, "learning_rate": 7.250341997264022e-06, "loss": 1.6727, "step": 10000 }, { "epoch": 0.86, "grad_norm": 1.6856164932250977, "learning_rate": 7.233242134062927e-06, "loss": 1.8289, "step": 10004 }, { "epoch": 0.86, "grad_norm": 1.7940067052841187, "learning_rate": 7.216142270861834e-06, "loss": 1.917, "step": 10008 }, { "epoch": 0.86, "grad_norm": 1.806894063949585, "learning_rate": 7.1990424076607384e-06, "loss": 1.9523, "step": 10012 }, { "epoch": 0.86, "grad_norm": 1.6045538187026978, "learning_rate": 7.181942544459645e-06, "loss": 1.8155, "step": 10016 }, { "epoch": 0.86, "grad_norm": 1.6840627193450928, "learning_rate": 7.164842681258551e-06, "loss": 1.87, "step": 10020 }, { "epoch": 0.86, "grad_norm": 1.5683917999267578, "learning_rate": 7.147742818057455e-06, "loss": 1.9501, "step": 10024 }, { "epoch": 0.86, "grad_norm": 1.6646571159362793, "learning_rate": 7.130642954856362e-06, "loss": 1.862, "step": 10028 }, { "epoch": 0.86, "grad_norm": 1.6929519176483154, "learning_rate": 7.113543091655267e-06, "loss": 1.9485, "step": 10032 }, { "epoch": 0.86, "grad_norm": 1.600963830947876, "learning_rate": 7.096443228454172e-06, "loss": 1.7745, "step": 10036 }, { "epoch": 0.86, "grad_norm": 1.6967799663543701, "learning_rate": 7.079343365253079e-06, "loss": 1.9167, "step": 10040 }, { "epoch": 0.86, "grad_norm": 1.6387646198272705, "learning_rate": 7.062243502051984e-06, "loss": 1.9051, "step": 10044 }, { "epoch": 0.86, "grad_norm": 1.5275384187698364, "learning_rate": 7.04514363885089e-06, "loss": 1.9997, "step": 10048 }, { "epoch": 0.86, "grad_norm": 1.677001953125, "learning_rate": 7.028043775649795e-06, "loss": 1.9359, "step": 10052 }, { "epoch": 0.86, "grad_norm": 1.5836267471313477, "learning_rate": 7.010943912448701e-06, "loss": 1.7359, "step": 10056 }, { "epoch": 0.86, "grad_norm": 1.7847533226013184, "learning_rate": 6.993844049247607e-06, "loss": 1.9568, "step": 10060 }, { "epoch": 0.86, "grad_norm": 1.6765036582946777, "learning_rate": 6.976744186046512e-06, "loss": 1.8862, "step": 10064 }, { "epoch": 0.86, "grad_norm": 1.8817429542541504, "learning_rate": 6.959644322845417e-06, "loss": 1.8935, "step": 10068 }, { "epoch": 0.86, "grad_norm": 1.765724778175354, "learning_rate": 6.942544459644323e-06, "loss": 1.7135, "step": 10072 }, { "epoch": 0.86, "grad_norm": 1.5911288261413574, "learning_rate": 6.925444596443229e-06, "loss": 1.7035, "step": 10076 }, { "epoch": 0.86, "grad_norm": 1.738471508026123, "learning_rate": 6.908344733242135e-06, "loss": 1.7544, "step": 10080 }, { "epoch": 0.86, "grad_norm": 1.719092845916748, "learning_rate": 6.89124487004104e-06, "loss": 1.8507, "step": 10084 }, { "epoch": 0.86, "grad_norm": 1.786017656326294, "learning_rate": 6.874145006839945e-06, "loss": 1.9009, "step": 10088 }, { "epoch": 0.86, "grad_norm": 2.0530200004577637, "learning_rate": 6.8570451436388514e-06, "loss": 1.8839, "step": 10092 }, { "epoch": 0.86, "grad_norm": 1.6444154977798462, "learning_rate": 6.839945280437757e-06, "loss": 1.8756, "step": 10096 }, { "epoch": 0.86, "grad_norm": 1.6261224746704102, "learning_rate": 6.822845417236662e-06, "loss": 1.6744, "step": 10100 }, { "epoch": 0.86, "grad_norm": 1.6390396356582642, "learning_rate": 6.805745554035568e-06, "loss": 1.8752, "step": 10104 }, { "epoch": 0.86, "grad_norm": 1.9215726852416992, "learning_rate": 6.788645690834473e-06, "loss": 1.9022, "step": 10108 }, { "epoch": 0.86, "grad_norm": 1.839656949043274, "learning_rate": 6.77154582763338e-06, "loss": 2.1024, "step": 10112 }, { "epoch": 0.86, "grad_norm": 1.6141258478164673, "learning_rate": 6.754445964432285e-06, "loss": 1.7609, "step": 10116 }, { "epoch": 0.87, "grad_norm": 1.6340337991714478, "learning_rate": 6.73734610123119e-06, "loss": 1.8939, "step": 10120 }, { "epoch": 0.87, "grad_norm": 1.529555082321167, "learning_rate": 6.720246238030097e-06, "loss": 1.7863, "step": 10124 }, { "epoch": 0.87, "grad_norm": 1.670279860496521, "learning_rate": 6.7031463748290014e-06, "loss": 1.8213, "step": 10128 }, { "epoch": 0.87, "grad_norm": 1.7106720209121704, "learning_rate": 6.686046511627907e-06, "loss": 1.6828, "step": 10132 }, { "epoch": 0.87, "grad_norm": 1.8096644878387451, "learning_rate": 6.668946648426814e-06, "loss": 1.8081, "step": 10136 }, { "epoch": 0.87, "grad_norm": 1.636301040649414, "learning_rate": 6.651846785225718e-06, "loss": 1.8518, "step": 10140 }, { "epoch": 0.87, "grad_norm": 1.5952695608139038, "learning_rate": 6.634746922024625e-06, "loss": 1.7897, "step": 10144 }, { "epoch": 0.87, "grad_norm": 1.5805269479751587, "learning_rate": 6.61764705882353e-06, "loss": 1.9542, "step": 10148 }, { "epoch": 0.87, "grad_norm": 1.8776960372924805, "learning_rate": 6.6005471956224345e-06, "loss": 1.8719, "step": 10152 }, { "epoch": 0.87, "grad_norm": 1.5122414827346802, "learning_rate": 6.583447332421341e-06, "loss": 1.7259, "step": 10156 }, { "epoch": 0.87, "grad_norm": 1.7734562158584595, "learning_rate": 6.566347469220247e-06, "loss": 1.8812, "step": 10160 }, { "epoch": 0.87, "grad_norm": 1.511674165725708, "learning_rate": 6.5492476060191514e-06, "loss": 1.841, "step": 10164 }, { "epoch": 0.87, "grad_norm": 1.6405954360961914, "learning_rate": 6.532147742818058e-06, "loss": 1.8166, "step": 10168 }, { "epoch": 0.87, "grad_norm": 1.822506070137024, "learning_rate": 6.515047879616963e-06, "loss": 1.8154, "step": 10172 }, { "epoch": 0.87, "grad_norm": 1.6633646488189697, "learning_rate": 6.497948016415869e-06, "loss": 1.8579, "step": 10176 }, { "epoch": 0.87, "grad_norm": 1.8290987014770508, "learning_rate": 6.480848153214775e-06, "loss": 1.8602, "step": 10180 }, { "epoch": 0.87, "grad_norm": 1.6676056385040283, "learning_rate": 6.46374829001368e-06, "loss": 1.7908, "step": 10184 }, { "epoch": 0.87, "grad_norm": 1.4981609582901, "learning_rate": 6.446648426812586e-06, "loss": 1.7605, "step": 10188 }, { "epoch": 0.87, "grad_norm": 1.7770746946334839, "learning_rate": 6.429548563611491e-06, "loss": 2.005, "step": 10192 }, { "epoch": 0.87, "grad_norm": 1.6636948585510254, "learning_rate": 6.412448700410397e-06, "loss": 1.9328, "step": 10196 }, { "epoch": 0.87, "grad_norm": 1.5431137084960938, "learning_rate": 6.395348837209303e-06, "loss": 1.7199, "step": 10200 }, { "epoch": 0.87, "grad_norm": 1.6235666275024414, "learning_rate": 6.378248974008208e-06, "loss": 1.8701, "step": 10204 }, { "epoch": 0.87, "grad_norm": 1.7597670555114746, "learning_rate": 6.3611491108071144e-06, "loss": 1.7161, "step": 10208 }, { "epoch": 0.87, "grad_norm": 1.8666725158691406, "learning_rate": 6.344049247606019e-06, "loss": 1.8283, "step": 10212 }, { "epoch": 0.87, "grad_norm": 1.8828492164611816, "learning_rate": 6.326949384404925e-06, "loss": 1.9689, "step": 10216 }, { "epoch": 0.87, "grad_norm": 1.539336919784546, "learning_rate": 6.309849521203831e-06, "loss": 1.731, "step": 10220 }, { "epoch": 0.87, "grad_norm": 1.6746304035186768, "learning_rate": 6.292749658002736e-06, "loss": 1.7627, "step": 10224 }, { "epoch": 0.87, "grad_norm": 1.6880711317062378, "learning_rate": 6.275649794801641e-06, "loss": 1.8081, "step": 10228 }, { "epoch": 0.87, "grad_norm": 1.6686969995498657, "learning_rate": 6.2585499316005475e-06, "loss": 1.7955, "step": 10232 }, { "epoch": 0.88, "grad_norm": 1.6514986753463745, "learning_rate": 6.241450068399453e-06, "loss": 1.7788, "step": 10236 }, { "epoch": 0.88, "grad_norm": 1.5582215785980225, "learning_rate": 6.224350205198359e-06, "loss": 1.7514, "step": 10240 }, { "epoch": 0.88, "grad_norm": 1.6330581903457642, "learning_rate": 6.2072503419972644e-06, "loss": 1.9055, "step": 10244 }, { "epoch": 0.88, "grad_norm": 1.6368670463562012, "learning_rate": 6.19015047879617e-06, "loss": 1.732, "step": 10248 }, { "epoch": 0.88, "grad_norm": 1.7110068798065186, "learning_rate": 6.173050615595075e-06, "loss": 1.9471, "step": 10252 }, { "epoch": 0.88, "grad_norm": 1.7540807723999023, "learning_rate": 6.155950752393981e-06, "loss": 1.8072, "step": 10256 }, { "epoch": 0.88, "grad_norm": 1.6526390314102173, "learning_rate": 6.138850889192887e-06, "loss": 1.7913, "step": 10260 }, { "epoch": 0.88, "grad_norm": 1.7240028381347656, "learning_rate": 6.121751025991793e-06, "loss": 1.8526, "step": 10264 }, { "epoch": 0.88, "grad_norm": 1.683828592300415, "learning_rate": 6.1046511627906975e-06, "loss": 1.7572, "step": 10268 }, { "epoch": 0.88, "grad_norm": 1.5554591417312622, "learning_rate": 6.087551299589603e-06, "loss": 1.7571, "step": 10272 }, { "epoch": 0.88, "grad_norm": 1.6821385622024536, "learning_rate": 6.07045143638851e-06, "loss": 1.7848, "step": 10276 }, { "epoch": 0.88, "grad_norm": 1.6544996500015259, "learning_rate": 6.053351573187415e-06, "loss": 1.8252, "step": 10280 }, { "epoch": 0.88, "grad_norm": 1.6138795614242554, "learning_rate": 6.03625170998632e-06, "loss": 1.979, "step": 10284 }, { "epoch": 0.88, "grad_norm": 2.004995346069336, "learning_rate": 6.019151846785226e-06, "loss": 1.7996, "step": 10288 }, { "epoch": 0.88, "grad_norm": 1.8927818536758423, "learning_rate": 6.002051983584131e-06, "loss": 1.8892, "step": 10292 }, { "epoch": 0.88, "grad_norm": 1.9253323078155518, "learning_rate": 5.984952120383038e-06, "loss": 1.9798, "step": 10296 }, { "epoch": 0.88, "grad_norm": 1.623855471611023, "learning_rate": 5.967852257181943e-06, "loss": 1.6606, "step": 10300 }, { "epoch": 0.88, "grad_norm": 1.5401018857955933, "learning_rate": 5.950752393980848e-06, "loss": 1.7557, "step": 10304 }, { "epoch": 0.88, "grad_norm": 1.643587350845337, "learning_rate": 5.933652530779754e-06, "loss": 1.9448, "step": 10308 }, { "epoch": 0.88, "grad_norm": 1.7403227090835571, "learning_rate": 5.91655266757866e-06, "loss": 1.9609, "step": 10312 }, { "epoch": 0.88, "grad_norm": 1.7716693878173828, "learning_rate": 5.899452804377565e-06, "loss": 1.8861, "step": 10316 }, { "epoch": 0.88, "grad_norm": 1.7579643726348877, "learning_rate": 5.882352941176471e-06, "loss": 1.8461, "step": 10320 }, { "epoch": 0.88, "grad_norm": 1.512486219406128, "learning_rate": 5.8652530779753766e-06, "loss": 1.8184, "step": 10324 }, { "epoch": 0.88, "grad_norm": 1.8807471990585327, "learning_rate": 5.848153214774282e-06, "loss": 1.7981, "step": 10328 }, { "epoch": 0.88, "grad_norm": 1.8313617706298828, "learning_rate": 5.831053351573187e-06, "loss": 1.7566, "step": 10332 }, { "epoch": 0.88, "grad_norm": 1.7667449712753296, "learning_rate": 5.8139534883720935e-06, "loss": 1.8996, "step": 10336 }, { "epoch": 0.88, "grad_norm": 1.5892364978790283, "learning_rate": 5.796853625170999e-06, "loss": 1.7042, "step": 10340 }, { "epoch": 0.88, "grad_norm": 1.9081833362579346, "learning_rate": 5.779753761969905e-06, "loss": 2.0525, "step": 10344 }, { "epoch": 0.88, "grad_norm": 1.891441822052002, "learning_rate": 5.76265389876881e-06, "loss": 1.7879, "step": 10348 }, { "epoch": 0.89, "grad_norm": 1.9260497093200684, "learning_rate": 5.745554035567715e-06, "loss": 1.8796, "step": 10352 }, { "epoch": 0.89, "grad_norm": 1.6760286092758179, "learning_rate": 5.728454172366622e-06, "loss": 1.9087, "step": 10356 }, { "epoch": 0.89, "grad_norm": 1.6360900402069092, "learning_rate": 5.711354309165527e-06, "loss": 1.7715, "step": 10360 }, { "epoch": 0.89, "grad_norm": 1.7029601335525513, "learning_rate": 5.694254445964432e-06, "loss": 1.7859, "step": 10364 }, { "epoch": 0.89, "grad_norm": 1.5900228023529053, "learning_rate": 5.677154582763338e-06, "loss": 1.8456, "step": 10368 }, { "epoch": 0.89, "grad_norm": 1.6234673261642456, "learning_rate": 5.6600547195622435e-06, "loss": 1.8007, "step": 10372 }, { "epoch": 0.89, "grad_norm": 1.5690399408340454, "learning_rate": 5.64295485636115e-06, "loss": 1.8154, "step": 10376 }, { "epoch": 0.89, "grad_norm": 1.622010588645935, "learning_rate": 5.625854993160055e-06, "loss": 1.9555, "step": 10380 }, { "epoch": 0.89, "grad_norm": 1.7594480514526367, "learning_rate": 5.6087551299589605e-06, "loss": 1.9146, "step": 10384 }, { "epoch": 0.89, "grad_norm": 1.644951581954956, "learning_rate": 5.591655266757866e-06, "loss": 1.9562, "step": 10388 }, { "epoch": 0.89, "grad_norm": 1.6730928421020508, "learning_rate": 5.574555403556772e-06, "loss": 1.927, "step": 10392 }, { "epoch": 0.89, "grad_norm": 1.749422311782837, "learning_rate": 5.557455540355677e-06, "loss": 1.8613, "step": 10396 }, { "epoch": 0.89, "grad_norm": 1.6241000890731812, "learning_rate": 5.540355677154583e-06, "loss": 1.6963, "step": 10400 }, { "epoch": 0.89, "grad_norm": 1.6478880643844604, "learning_rate": 5.523255813953489e-06, "loss": 1.7823, "step": 10404 }, { "epoch": 0.89, "grad_norm": 1.8106017112731934, "learning_rate": 5.506155950752394e-06, "loss": 1.8875, "step": 10408 }, { "epoch": 0.89, "grad_norm": 1.792172908782959, "learning_rate": 5.4890560875513e-06, "loss": 1.9241, "step": 10412 }, { "epoch": 0.89, "grad_norm": 1.5553025007247925, "learning_rate": 5.471956224350206e-06, "loss": 1.7159, "step": 10416 }, { "epoch": 0.89, "grad_norm": 1.5697897672653198, "learning_rate": 5.454856361149111e-06, "loss": 1.7839, "step": 10420 }, { "epoch": 0.89, "grad_norm": 1.7735848426818848, "learning_rate": 5.437756497948017e-06, "loss": 1.7892, "step": 10424 }, { "epoch": 0.89, "grad_norm": 1.8028829097747803, "learning_rate": 5.420656634746922e-06, "loss": 1.8241, "step": 10428 }, { "epoch": 0.89, "grad_norm": 1.7222471237182617, "learning_rate": 5.403556771545827e-06, "loss": 1.7322, "step": 10432 }, { "epoch": 0.89, "grad_norm": 1.5926076173782349, "learning_rate": 5.386456908344734e-06, "loss": 1.8576, "step": 10436 }, { "epoch": 0.89, "grad_norm": 1.642992377281189, "learning_rate": 5.3693570451436396e-06, "loss": 1.83, "step": 10440 }, { "epoch": 0.89, "grad_norm": 1.577621579170227, "learning_rate": 5.352257181942544e-06, "loss": 1.7914, "step": 10444 }, { "epoch": 0.89, "grad_norm": 1.5518428087234497, "learning_rate": 5.33515731874145e-06, "loss": 1.9518, "step": 10448 }, { "epoch": 0.89, "grad_norm": 1.6200405359268188, "learning_rate": 5.318057455540356e-06, "loss": 1.7868, "step": 10452 }, { "epoch": 0.89, "grad_norm": 1.5001662969589233, "learning_rate": 5.300957592339262e-06, "loss": 1.8093, "step": 10456 }, { "epoch": 0.89, "grad_norm": 1.6836313009262085, "learning_rate": 5.283857729138167e-06, "loss": 1.8562, "step": 10460 }, { "epoch": 0.89, "grad_norm": 1.7168313264846802, "learning_rate": 5.266757865937073e-06, "loss": 1.8735, "step": 10464 }, { "epoch": 0.9, "grad_norm": 1.7337522506713867, "learning_rate": 5.249658002735978e-06, "loss": 1.9054, "step": 10468 }, { "epoch": 0.9, "grad_norm": 1.4755985736846924, "learning_rate": 5.232558139534884e-06, "loss": 1.8456, "step": 10472 }, { "epoch": 0.9, "grad_norm": 1.7717063426971436, "learning_rate": 5.2154582763337896e-06, "loss": 1.8035, "step": 10476 }, { "epoch": 0.9, "grad_norm": 1.8099918365478516, "learning_rate": 5.198358413132695e-06, "loss": 1.9099, "step": 10480 }, { "epoch": 0.9, "grad_norm": 1.6142700910568237, "learning_rate": 5.181258549931601e-06, "loss": 1.7275, "step": 10484 }, { "epoch": 0.9, "grad_norm": 1.6501797437667847, "learning_rate": 5.1641586867305065e-06, "loss": 1.7888, "step": 10488 }, { "epoch": 0.9, "grad_norm": 1.6632269620895386, "learning_rate": 5.147058823529412e-06, "loss": 1.77, "step": 10492 }, { "epoch": 0.9, "grad_norm": 1.698384165763855, "learning_rate": 5.129958960328318e-06, "loss": 1.7493, "step": 10496 }, { "epoch": 0.9, "grad_norm": 1.6063936948776245, "learning_rate": 5.1128590971272235e-06, "loss": 1.7966, "step": 10500 }, { "epoch": 0.9, "grad_norm": 1.9107542037963867, "learning_rate": 5.095759233926129e-06, "loss": 1.7634, "step": 10504 }, { "epoch": 0.9, "grad_norm": 1.839381217956543, "learning_rate": 5.078659370725034e-06, "loss": 1.7991, "step": 10508 }, { "epoch": 0.9, "grad_norm": 1.7109549045562744, "learning_rate": 5.0615595075239396e-06, "loss": 1.7911, "step": 10512 }, { "epoch": 0.9, "grad_norm": 1.7641953229904175, "learning_rate": 5.044459644322846e-06, "loss": 1.9, "step": 10516 }, { "epoch": 0.9, "grad_norm": 1.981584072113037, "learning_rate": 5.027359781121752e-06, "loss": 1.8533, "step": 10520 }, { "epoch": 0.9, "grad_norm": 1.5387113094329834, "learning_rate": 5.0102599179206565e-06, "loss": 1.9877, "step": 10524 }, { "epoch": 0.9, "grad_norm": 1.630399465560913, "learning_rate": 4.993160054719562e-06, "loss": 1.9056, "step": 10528 }, { "epoch": 0.9, "grad_norm": 1.8931196928024292, "learning_rate": 4.976060191518468e-06, "loss": 1.7797, "step": 10532 }, { "epoch": 0.9, "grad_norm": 1.5811591148376465, "learning_rate": 4.958960328317374e-06, "loss": 2.0371, "step": 10536 }, { "epoch": 0.9, "grad_norm": 1.7181917428970337, "learning_rate": 4.941860465116279e-06, "loss": 1.7816, "step": 10540 }, { "epoch": 0.9, "grad_norm": 1.6405460834503174, "learning_rate": 4.924760601915185e-06, "loss": 1.7458, "step": 10544 }, { "epoch": 0.9, "grad_norm": 1.758260726928711, "learning_rate": 4.90766073871409e-06, "loss": 1.948, "step": 10548 }, { "epoch": 0.9, "grad_norm": 1.845942497253418, "learning_rate": 4.890560875512996e-06, "loss": 1.7851, "step": 10552 }, { "epoch": 0.9, "grad_norm": 1.79182767868042, "learning_rate": 4.873461012311902e-06, "loss": 1.8356, "step": 10556 }, { "epoch": 0.9, "grad_norm": 1.8030176162719727, "learning_rate": 4.856361149110807e-06, "loss": 1.9056, "step": 10560 }, { "epoch": 0.9, "grad_norm": 1.7074739933013916, "learning_rate": 4.839261285909713e-06, "loss": 1.8684, "step": 10564 }, { "epoch": 0.9, "grad_norm": 1.5636216402053833, "learning_rate": 4.822161422708619e-06, "loss": 1.7258, "step": 10568 }, { "epoch": 0.9, "grad_norm": 1.8994554281234741, "learning_rate": 4.805061559507524e-06, "loss": 1.8462, "step": 10572 }, { "epoch": 0.9, "grad_norm": 1.5819472074508667, "learning_rate": 4.78796169630643e-06, "loss": 1.7744, "step": 10576 }, { "epoch": 0.9, "grad_norm": 1.7214453220367432, "learning_rate": 4.770861833105336e-06, "loss": 1.8286, "step": 10580 }, { "epoch": 0.9, "grad_norm": 1.6858314275741577, "learning_rate": 4.753761969904241e-06, "loss": 1.9328, "step": 10584 }, { "epoch": 0.91, "grad_norm": 1.6816788911819458, "learning_rate": 4.736662106703146e-06, "loss": 1.9712, "step": 10588 }, { "epoch": 0.91, "grad_norm": 1.5183528661727905, "learning_rate": 4.7195622435020526e-06, "loss": 1.8719, "step": 10592 }, { "epoch": 0.91, "grad_norm": 1.7054622173309326, "learning_rate": 4.702462380300958e-06, "loss": 1.8244, "step": 10596 }, { "epoch": 0.91, "grad_norm": 1.5704643726348877, "learning_rate": 4.685362517099864e-06, "loss": 1.7187, "step": 10600 }, { "epoch": 0.91, "grad_norm": 1.616736888885498, "learning_rate": 4.668262653898769e-06, "loss": 1.9231, "step": 10604 }, { "epoch": 0.91, "grad_norm": 1.8305636644363403, "learning_rate": 4.651162790697674e-06, "loss": 2.0724, "step": 10608 }, { "epoch": 0.91, "grad_norm": 1.623712420463562, "learning_rate": 4.63406292749658e-06, "loss": 1.8901, "step": 10612 }, { "epoch": 0.91, "grad_norm": 1.7231340408325195, "learning_rate": 4.6169630642954865e-06, "loss": 1.7852, "step": 10616 }, { "epoch": 0.91, "grad_norm": 1.707742691040039, "learning_rate": 4.599863201094391e-06, "loss": 1.861, "step": 10620 }, { "epoch": 0.91, "grad_norm": 1.6897032260894775, "learning_rate": 4.582763337893297e-06, "loss": 1.7987, "step": 10624 }, { "epoch": 0.91, "grad_norm": 1.4974230527877808, "learning_rate": 4.5656634746922026e-06, "loss": 1.7377, "step": 10628 }, { "epoch": 0.91, "grad_norm": 1.7821507453918457, "learning_rate": 4.548563611491108e-06, "loss": 1.7803, "step": 10632 }, { "epoch": 0.91, "grad_norm": 1.7004159688949585, "learning_rate": 4.531463748290014e-06, "loss": 1.7409, "step": 10636 }, { "epoch": 0.91, "grad_norm": 1.620969295501709, "learning_rate": 4.5143638850889195e-06, "loss": 1.6708, "step": 10640 }, { "epoch": 0.91, "grad_norm": 1.6645839214324951, "learning_rate": 4.497264021887825e-06, "loss": 1.8713, "step": 10644 }, { "epoch": 0.91, "grad_norm": 1.6770342588424683, "learning_rate": 4.480164158686731e-06, "loss": 1.9282, "step": 10648 }, { "epoch": 0.91, "grad_norm": 1.7173173427581787, "learning_rate": 4.4630642954856365e-06, "loss": 1.8702, "step": 10652 }, { "epoch": 0.91, "grad_norm": 1.7240304946899414, "learning_rate": 4.445964432284542e-06, "loss": 1.8032, "step": 10656 }, { "epoch": 0.91, "grad_norm": 1.618517518043518, "learning_rate": 4.428864569083448e-06, "loss": 1.846, "step": 10660 }, { "epoch": 0.91, "grad_norm": 1.6240915060043335, "learning_rate": 4.411764705882353e-06, "loss": 1.8224, "step": 10664 }, { "epoch": 0.91, "grad_norm": 1.519727110862732, "learning_rate": 4.394664842681258e-06, "loss": 1.7819, "step": 10668 }, { "epoch": 0.91, "grad_norm": 2.0412402153015137, "learning_rate": 4.377564979480165e-06, "loss": 1.8308, "step": 10672 }, { "epoch": 0.91, "grad_norm": 1.5424792766571045, "learning_rate": 4.36046511627907e-06, "loss": 1.8432, "step": 10676 }, { "epoch": 0.91, "grad_norm": 1.9428133964538574, "learning_rate": 4.343365253077976e-06, "loss": 1.6436, "step": 10680 }, { "epoch": 0.91, "grad_norm": 1.6989853382110596, "learning_rate": 4.326265389876881e-06, "loss": 1.7265, "step": 10684 }, { "epoch": 0.91, "grad_norm": 1.5763792991638184, "learning_rate": 4.3091655266757865e-06, "loss": 1.8635, "step": 10688 }, { "epoch": 0.91, "grad_norm": 1.7506353855133057, "learning_rate": 4.292065663474692e-06, "loss": 1.83, "step": 10692 }, { "epoch": 0.91, "grad_norm": 1.6734927892684937, "learning_rate": 4.274965800273599e-06, "loss": 1.8733, "step": 10696 }, { "epoch": 0.91, "grad_norm": 1.7979787588119507, "learning_rate": 4.257865937072503e-06, "loss": 1.8805, "step": 10700 }, { "epoch": 0.92, "grad_norm": 1.643833875656128, "learning_rate": 4.240766073871409e-06, "loss": 1.7028, "step": 10704 }, { "epoch": 0.92, "grad_norm": 1.727108120918274, "learning_rate": 4.223666210670315e-06, "loss": 1.7918, "step": 10708 }, { "epoch": 0.92, "grad_norm": 1.6539056301116943, "learning_rate": 4.20656634746922e-06, "loss": 1.8238, "step": 10712 }, { "epoch": 0.92, "grad_norm": 1.7414677143096924, "learning_rate": 4.189466484268126e-06, "loss": 1.8796, "step": 10716 }, { "epoch": 0.92, "grad_norm": 1.6490898132324219, "learning_rate": 4.172366621067032e-06, "loss": 1.7907, "step": 10720 }, { "epoch": 0.92, "grad_norm": 1.7541985511779785, "learning_rate": 4.155266757865937e-06, "loss": 1.8669, "step": 10724 }, { "epoch": 0.92, "grad_norm": 1.6836141347885132, "learning_rate": 4.138166894664843e-06, "loss": 1.7802, "step": 10728 }, { "epoch": 0.92, "grad_norm": 2.008136034011841, "learning_rate": 4.121067031463749e-06, "loss": 1.9157, "step": 10732 }, { "epoch": 0.92, "grad_norm": 1.7152680158615112, "learning_rate": 4.103967168262654e-06, "loss": 1.8344, "step": 10736 }, { "epoch": 0.92, "grad_norm": 1.5120372772216797, "learning_rate": 4.08686730506156e-06, "loss": 1.7443, "step": 10740 }, { "epoch": 0.92, "grad_norm": 1.5764340162277222, "learning_rate": 4.0697674418604655e-06, "loss": 1.7508, "step": 10744 }, { "epoch": 0.92, "grad_norm": 1.6534470319747925, "learning_rate": 4.05266757865937e-06, "loss": 1.8064, "step": 10748 }, { "epoch": 0.92, "grad_norm": 1.8456621170043945, "learning_rate": 4.035567715458277e-06, "loss": 1.8589, "step": 10752 }, { "epoch": 0.92, "grad_norm": 1.6170421838760376, "learning_rate": 4.0184678522571825e-06, "loss": 1.6253, "step": 10756 }, { "epoch": 0.92, "grad_norm": 1.5028194189071655, "learning_rate": 4.001367989056088e-06, "loss": 1.7209, "step": 10760 }, { "epoch": 0.92, "grad_norm": 1.9968106746673584, "learning_rate": 3.984268125854993e-06, "loss": 1.9172, "step": 10764 }, { "epoch": 0.92, "grad_norm": 1.7104765176773071, "learning_rate": 3.967168262653899e-06, "loss": 1.7745, "step": 10768 }, { "epoch": 0.92, "grad_norm": 1.739295244216919, "learning_rate": 3.950068399452805e-06, "loss": 1.8637, "step": 10772 }, { "epoch": 0.92, "grad_norm": 1.587810754776001, "learning_rate": 3.932968536251711e-06, "loss": 1.6962, "step": 10776 }, { "epoch": 0.92, "grad_norm": 1.6938265562057495, "learning_rate": 3.9158686730506155e-06, "loss": 1.8125, "step": 10780 }, { "epoch": 0.92, "grad_norm": 1.7163448333740234, "learning_rate": 3.898768809849521e-06, "loss": 1.8493, "step": 10784 }, { "epoch": 0.92, "grad_norm": 1.6594278812408447, "learning_rate": 3.881668946648427e-06, "loss": 1.8301, "step": 10788 }, { "epoch": 0.92, "grad_norm": 1.661649227142334, "learning_rate": 3.8645690834473325e-06, "loss": 1.774, "step": 10792 }, { "epoch": 0.92, "grad_norm": 1.8990195989608765, "learning_rate": 3.847469220246238e-06, "loss": 1.8284, "step": 10796 }, { "epoch": 0.92, "grad_norm": 1.8179618120193481, "learning_rate": 3.830369357045144e-06, "loss": 1.7436, "step": 10800 }, { "epoch": 0.92, "grad_norm": 2.0388402938842773, "learning_rate": 3.8132694938440494e-06, "loss": 1.9214, "step": 10804 }, { "epoch": 0.92, "grad_norm": 1.578235387802124, "learning_rate": 3.7961696306429555e-06, "loss": 1.8137, "step": 10808 }, { "epoch": 0.92, "grad_norm": 1.9262685775756836, "learning_rate": 3.7790697674418603e-06, "loss": 1.8067, "step": 10812 }, { "epoch": 0.92, "grad_norm": 1.7524725198745728, "learning_rate": 3.7619699042407664e-06, "loss": 2.018, "step": 10816 }, { "epoch": 0.93, "grad_norm": 1.7184381484985352, "learning_rate": 3.744870041039672e-06, "loss": 1.8733, "step": 10820 }, { "epoch": 0.93, "grad_norm": 1.626380443572998, "learning_rate": 3.7277701778385777e-06, "loss": 1.8379, "step": 10824 }, { "epoch": 0.93, "grad_norm": 1.610595941543579, "learning_rate": 3.710670314637483e-06, "loss": 1.8246, "step": 10828 }, { "epoch": 0.93, "grad_norm": 1.6897306442260742, "learning_rate": 3.6935704514363886e-06, "loss": 1.7103, "step": 10832 }, { "epoch": 0.93, "grad_norm": 1.8274520635604858, "learning_rate": 3.6764705882352942e-06, "loss": 1.9273, "step": 10836 }, { "epoch": 0.93, "grad_norm": 1.8843746185302734, "learning_rate": 3.6593707250342003e-06, "loss": 1.792, "step": 10840 }, { "epoch": 0.93, "grad_norm": 3.664485216140747, "learning_rate": 3.642270861833105e-06, "loss": 2.0409, "step": 10844 }, { "epoch": 0.93, "grad_norm": 1.5859229564666748, "learning_rate": 3.625170998632011e-06, "loss": 1.9006, "step": 10848 }, { "epoch": 0.93, "grad_norm": 1.6038724184036255, "learning_rate": 3.608071135430917e-06, "loss": 1.8245, "step": 10852 }, { "epoch": 0.93, "grad_norm": 1.6321396827697754, "learning_rate": 3.5909712722298225e-06, "loss": 1.8706, "step": 10856 }, { "epoch": 0.93, "grad_norm": 1.7967422008514404, "learning_rate": 3.5738714090287277e-06, "loss": 1.86, "step": 10860 }, { "epoch": 0.93, "grad_norm": 1.5631977319717407, "learning_rate": 3.5567715458276333e-06, "loss": 1.7408, "step": 10864 }, { "epoch": 0.93, "grad_norm": 1.5402038097381592, "learning_rate": 3.5396716826265394e-06, "loss": 1.8824, "step": 10868 }, { "epoch": 0.93, "grad_norm": 1.5307432413101196, "learning_rate": 3.522571819425445e-06, "loss": 1.7778, "step": 10872 }, { "epoch": 0.93, "grad_norm": 1.6216157674789429, "learning_rate": 3.5054719562243503e-06, "loss": 1.6656, "step": 10876 }, { "epoch": 0.93, "grad_norm": 1.6624993085861206, "learning_rate": 3.488372093023256e-06, "loss": 1.9027, "step": 10880 }, { "epoch": 0.93, "grad_norm": 1.6463823318481445, "learning_rate": 3.4712722298221616e-06, "loss": 1.8464, "step": 10884 }, { "epoch": 0.93, "grad_norm": 1.56813645362854, "learning_rate": 3.4541723666210677e-06, "loss": 1.8271, "step": 10888 }, { "epoch": 0.93, "grad_norm": 1.8501081466674805, "learning_rate": 3.4370725034199725e-06, "loss": 1.9077, "step": 10892 }, { "epoch": 0.93, "grad_norm": 1.9633204936981201, "learning_rate": 3.4199726402188785e-06, "loss": 1.8458, "step": 10896 }, { "epoch": 0.93, "grad_norm": 1.8002575635910034, "learning_rate": 3.402872777017784e-06, "loss": 2.0061, "step": 10900 }, { "epoch": 0.93, "grad_norm": 1.6839830875396729, "learning_rate": 3.38577291381669e-06, "loss": 1.8378, "step": 10904 }, { "epoch": 0.93, "grad_norm": 1.6927244663238525, "learning_rate": 3.368673050615595e-06, "loss": 1.7045, "step": 10908 }, { "epoch": 0.93, "grad_norm": 1.5076004266738892, "learning_rate": 3.3515731874145007e-06, "loss": 1.9581, "step": 10912 }, { "epoch": 0.93, "grad_norm": 1.7239080667495728, "learning_rate": 3.334473324213407e-06, "loss": 1.9258, "step": 10916 }, { "epoch": 0.93, "grad_norm": 1.8428497314453125, "learning_rate": 3.3173734610123124e-06, "loss": 1.8136, "step": 10920 }, { "epoch": 0.93, "grad_norm": 1.6915169954299927, "learning_rate": 3.3002735978112172e-06, "loss": 1.9492, "step": 10924 }, { "epoch": 0.93, "grad_norm": 1.641465425491333, "learning_rate": 3.2831737346101233e-06, "loss": 1.7496, "step": 10928 }, { "epoch": 0.93, "grad_norm": 1.6878330707550049, "learning_rate": 3.266073871409029e-06, "loss": 1.85, "step": 10932 }, { "epoch": 0.94, "grad_norm": 1.8498550653457642, "learning_rate": 3.2489740082079346e-06, "loss": 1.9107, "step": 10936 }, { "epoch": 0.94, "grad_norm": 1.61210036277771, "learning_rate": 3.23187414500684e-06, "loss": 1.9388, "step": 10940 }, { "epoch": 0.94, "grad_norm": 1.551893949508667, "learning_rate": 3.2147742818057455e-06, "loss": 1.8542, "step": 10944 }, { "epoch": 0.94, "grad_norm": 1.587180733680725, "learning_rate": 3.1976744186046516e-06, "loss": 1.8516, "step": 10948 }, { "epoch": 0.94, "grad_norm": 1.42110276222229, "learning_rate": 3.1805745554035572e-06, "loss": 1.7036, "step": 10952 }, { "epoch": 0.94, "grad_norm": 1.7156518697738647, "learning_rate": 3.1634746922024624e-06, "loss": 1.8442, "step": 10956 }, { "epoch": 0.94, "grad_norm": 1.6081629991531372, "learning_rate": 3.146374829001368e-06, "loss": 1.9073, "step": 10960 }, { "epoch": 0.94, "grad_norm": 2.0794620513916016, "learning_rate": 3.1292749658002737e-06, "loss": 2.0049, "step": 10964 }, { "epoch": 0.94, "grad_norm": 1.6746197938919067, "learning_rate": 3.1121751025991794e-06, "loss": 1.7059, "step": 10968 }, { "epoch": 0.94, "grad_norm": 1.5624709129333496, "learning_rate": 3.095075239398085e-06, "loss": 1.8929, "step": 10972 }, { "epoch": 0.94, "grad_norm": 1.6810204982757568, "learning_rate": 3.0779753761969907e-06, "loss": 1.8587, "step": 10976 }, { "epoch": 0.94, "grad_norm": 1.586511492729187, "learning_rate": 3.0608755129958963e-06, "loss": 1.7846, "step": 10980 }, { "epoch": 0.94, "grad_norm": 1.6255276203155518, "learning_rate": 3.0437756497948016e-06, "loss": 1.8619, "step": 10984 }, { "epoch": 0.94, "grad_norm": 1.8912711143493652, "learning_rate": 3.0266757865937076e-06, "loss": 1.9116, "step": 10988 }, { "epoch": 0.94, "grad_norm": 1.4620989561080933, "learning_rate": 3.009575923392613e-06, "loss": 1.9151, "step": 10992 }, { "epoch": 0.94, "grad_norm": 1.5181527137756348, "learning_rate": 2.992476060191519e-06, "loss": 1.6923, "step": 10996 }, { "epoch": 0.94, "grad_norm": 1.7841485738754272, "learning_rate": 2.975376196990424e-06, "loss": 1.7321, "step": 11000 }, { "epoch": 0.94, "grad_norm": 1.8255186080932617, "learning_rate": 2.95827633378933e-06, "loss": 1.8564, "step": 11004 }, { "epoch": 0.94, "grad_norm": 1.6331710815429688, "learning_rate": 2.9411764705882355e-06, "loss": 1.9261, "step": 11008 }, { "epoch": 0.94, "grad_norm": 1.5975050926208496, "learning_rate": 2.924076607387141e-06, "loss": 1.7562, "step": 11012 }, { "epoch": 0.94, "grad_norm": 1.5330755710601807, "learning_rate": 2.9069767441860468e-06, "loss": 1.7575, "step": 11016 }, { "epoch": 0.94, "grad_norm": 1.7199543714523315, "learning_rate": 2.8898768809849524e-06, "loss": 1.8875, "step": 11020 }, { "epoch": 0.94, "grad_norm": 1.7265928983688354, "learning_rate": 2.8727770177838576e-06, "loss": 1.9619, "step": 11024 }, { "epoch": 0.94, "grad_norm": 1.955166220664978, "learning_rate": 2.8556771545827637e-06, "loss": 1.9266, "step": 11028 }, { "epoch": 0.94, "grad_norm": 1.708550214767456, "learning_rate": 2.838577291381669e-06, "loss": 1.8194, "step": 11032 }, { "epoch": 0.94, "grad_norm": 1.7049528360366821, "learning_rate": 2.821477428180575e-06, "loss": 1.728, "step": 11036 }, { "epoch": 0.94, "grad_norm": 1.8736356496810913, "learning_rate": 2.8043775649794802e-06, "loss": 1.7949, "step": 11040 }, { "epoch": 0.94, "grad_norm": 1.8903663158416748, "learning_rate": 2.787277701778386e-06, "loss": 1.7765, "step": 11044 }, { "epoch": 0.94, "grad_norm": 1.690417766571045, "learning_rate": 2.7701778385772915e-06, "loss": 1.8932, "step": 11048 }, { "epoch": 0.94, "grad_norm": 1.7920684814453125, "learning_rate": 2.753077975376197e-06, "loss": 1.7986, "step": 11052 }, { "epoch": 0.95, "grad_norm": 1.771796703338623, "learning_rate": 2.735978112175103e-06, "loss": 1.7103, "step": 11056 }, { "epoch": 0.95, "grad_norm": 1.51233971118927, "learning_rate": 2.7188782489740085e-06, "loss": 1.6045, "step": 11060 }, { "epoch": 0.95, "grad_norm": 1.6907927989959717, "learning_rate": 2.7017783857729137e-06, "loss": 1.9416, "step": 11064 }, { "epoch": 0.95, "grad_norm": 1.575386881828308, "learning_rate": 2.6846785225718198e-06, "loss": 1.7111, "step": 11068 }, { "epoch": 0.95, "grad_norm": 2.0558063983917236, "learning_rate": 2.667578659370725e-06, "loss": 1.7934, "step": 11072 }, { "epoch": 0.95, "grad_norm": 1.705366611480713, "learning_rate": 2.650478796169631e-06, "loss": 1.8026, "step": 11076 }, { "epoch": 0.95, "grad_norm": 1.625901222229004, "learning_rate": 2.6333789329685363e-06, "loss": 1.9377, "step": 11080 }, { "epoch": 0.95, "grad_norm": 1.7622089385986328, "learning_rate": 2.616279069767442e-06, "loss": 1.7597, "step": 11084 }, { "epoch": 0.95, "grad_norm": 1.841213583946228, "learning_rate": 2.5991792065663476e-06, "loss": 1.901, "step": 11088 }, { "epoch": 0.95, "grad_norm": 1.6358860731124878, "learning_rate": 2.5820793433652533e-06, "loss": 1.7777, "step": 11092 }, { "epoch": 0.95, "grad_norm": 1.6823545694351196, "learning_rate": 2.564979480164159e-06, "loss": 1.9779, "step": 11096 }, { "epoch": 0.95, "grad_norm": 1.8069067001342773, "learning_rate": 2.5478796169630646e-06, "loss": 1.9257, "step": 11100 }, { "epoch": 0.95, "grad_norm": 1.4878418445587158, "learning_rate": 2.5307797537619698e-06, "loss": 1.8447, "step": 11104 }, { "epoch": 0.95, "grad_norm": 1.6673344373703003, "learning_rate": 2.513679890560876e-06, "loss": 1.8752, "step": 11108 }, { "epoch": 0.95, "grad_norm": 1.6673535108566284, "learning_rate": 2.496580027359781e-06, "loss": 1.8024, "step": 11112 }, { "epoch": 0.95, "grad_norm": 1.7625112533569336, "learning_rate": 2.479480164158687e-06, "loss": 1.7451, "step": 11116 }, { "epoch": 0.95, "grad_norm": 1.7562973499298096, "learning_rate": 2.4623803009575924e-06, "loss": 1.8505, "step": 11120 }, { "epoch": 0.95, "grad_norm": 1.55601966381073, "learning_rate": 2.445280437756498e-06, "loss": 1.7542, "step": 11124 }, { "epoch": 0.95, "grad_norm": 1.5536571741104126, "learning_rate": 2.4281805745554037e-06, "loss": 1.7571, "step": 11128 }, { "epoch": 0.95, "grad_norm": 2.797058343887329, "learning_rate": 2.4110807113543093e-06, "loss": 1.8041, "step": 11132 }, { "epoch": 0.95, "grad_norm": 1.5327272415161133, "learning_rate": 2.393980848153215e-06, "loss": 1.9662, "step": 11136 }, { "epoch": 0.95, "grad_norm": 1.6519979238510132, "learning_rate": 2.3768809849521206e-06, "loss": 1.8325, "step": 11140 }, { "epoch": 0.95, "grad_norm": 1.6532912254333496, "learning_rate": 2.3597811217510263e-06, "loss": 1.7309, "step": 11144 }, { "epoch": 0.95, "grad_norm": 1.6513277292251587, "learning_rate": 2.342681258549932e-06, "loss": 1.8967, "step": 11148 }, { "epoch": 0.95, "grad_norm": 2.01287579536438, "learning_rate": 2.325581395348837e-06, "loss": 1.8535, "step": 11152 }, { "epoch": 0.95, "grad_norm": 1.7155896425247192, "learning_rate": 2.3084815321477432e-06, "loss": 1.9063, "step": 11156 }, { "epoch": 0.95, "grad_norm": 1.802552342414856, "learning_rate": 2.2913816689466485e-06, "loss": 1.7353, "step": 11160 }, { "epoch": 0.95, "grad_norm": 1.850523591041565, "learning_rate": 2.274281805745554e-06, "loss": 2.0235, "step": 11164 }, { "epoch": 0.95, "grad_norm": 1.6676249504089355, "learning_rate": 2.2571819425444598e-06, "loss": 1.8164, "step": 11168 }, { "epoch": 0.96, "grad_norm": 2.562305212020874, "learning_rate": 2.2400820793433654e-06, "loss": 1.8171, "step": 11172 }, { "epoch": 0.96, "grad_norm": 1.9502869844436646, "learning_rate": 2.222982216142271e-06, "loss": 1.8298, "step": 11176 }, { "epoch": 0.96, "grad_norm": 1.739216923713684, "learning_rate": 2.2058823529411767e-06, "loss": 1.7871, "step": 11180 }, { "epoch": 0.96, "grad_norm": 1.595462441444397, "learning_rate": 2.1887824897400824e-06, "loss": 1.7876, "step": 11184 }, { "epoch": 0.96, "grad_norm": 2.28205943107605, "learning_rate": 2.171682626538988e-06, "loss": 1.9108, "step": 11188 }, { "epoch": 0.96, "grad_norm": 1.8560374975204468, "learning_rate": 2.1545827633378932e-06, "loss": 1.7791, "step": 11192 }, { "epoch": 0.96, "grad_norm": 1.6630357503890991, "learning_rate": 2.1374829001367993e-06, "loss": 1.8153, "step": 11196 }, { "epoch": 0.96, "grad_norm": 1.6874091625213623, "learning_rate": 2.1203830369357045e-06, "loss": 1.813, "step": 11200 }, { "epoch": 0.96, "grad_norm": 1.5925251245498657, "learning_rate": 2.10328317373461e-06, "loss": 1.7251, "step": 11204 }, { "epoch": 0.96, "grad_norm": 1.7215338945388794, "learning_rate": 2.086183310533516e-06, "loss": 1.8046, "step": 11208 }, { "epoch": 0.96, "grad_norm": 1.6895273923873901, "learning_rate": 2.0690834473324215e-06, "loss": 1.8298, "step": 11212 }, { "epoch": 0.96, "grad_norm": 1.641157865524292, "learning_rate": 2.051983584131327e-06, "loss": 1.7497, "step": 11216 }, { "epoch": 0.96, "grad_norm": 1.756468653678894, "learning_rate": 2.0348837209302328e-06, "loss": 1.791, "step": 11220 }, { "epoch": 0.96, "grad_norm": 1.7440533638000488, "learning_rate": 2.0177838577291384e-06, "loss": 1.7596, "step": 11224 }, { "epoch": 0.96, "grad_norm": 1.4534015655517578, "learning_rate": 2.000683994528044e-06, "loss": 1.7385, "step": 11228 }, { "epoch": 0.96, "grad_norm": 1.6513500213623047, "learning_rate": 1.9835841313269493e-06, "loss": 2.1346, "step": 11232 }, { "epoch": 0.96, "grad_norm": 1.7908762693405151, "learning_rate": 1.9664842681258554e-06, "loss": 1.7386, "step": 11236 }, { "epoch": 0.96, "grad_norm": 1.5934255123138428, "learning_rate": 1.9493844049247606e-06, "loss": 1.8361, "step": 11240 }, { "epoch": 0.96, "grad_norm": 1.4890646934509277, "learning_rate": 1.9322845417236662e-06, "loss": 1.7226, "step": 11244 }, { "epoch": 0.96, "grad_norm": 1.6311677694320679, "learning_rate": 1.915184678522572e-06, "loss": 1.7406, "step": 11248 }, { "epoch": 0.96, "grad_norm": 1.662655234336853, "learning_rate": 1.8980848153214778e-06, "loss": 1.894, "step": 11252 }, { "epoch": 0.96, "grad_norm": 1.6911251544952393, "learning_rate": 1.8809849521203832e-06, "loss": 2.0025, "step": 11256 }, { "epoch": 0.96, "grad_norm": 1.5574413537979126, "learning_rate": 1.8638850889192888e-06, "loss": 1.7462, "step": 11260 }, { "epoch": 0.96, "grad_norm": 1.699474573135376, "learning_rate": 1.8467852257181943e-06, "loss": 1.7242, "step": 11264 }, { "epoch": 0.96, "grad_norm": 1.642033338546753, "learning_rate": 1.8296853625171001e-06, "loss": 1.7673, "step": 11268 }, { "epoch": 0.96, "grad_norm": 1.6996281147003174, "learning_rate": 1.8125854993160056e-06, "loss": 1.8045, "step": 11272 }, { "epoch": 0.96, "grad_norm": 1.6581562757492065, "learning_rate": 1.7954856361149112e-06, "loss": 1.9656, "step": 11276 }, { "epoch": 0.96, "grad_norm": 1.6569535732269287, "learning_rate": 1.7783857729138167e-06, "loss": 1.9959, "step": 11280 }, { "epoch": 0.96, "grad_norm": 1.7449274063110352, "learning_rate": 1.7612859097127225e-06, "loss": 1.8343, "step": 11284 }, { "epoch": 0.97, "grad_norm": 1.6935420036315918, "learning_rate": 1.744186046511628e-06, "loss": 1.8053, "step": 11288 }, { "epoch": 0.97, "grad_norm": 1.5281199216842651, "learning_rate": 1.7270861833105338e-06, "loss": 1.9919, "step": 11292 }, { "epoch": 0.97, "grad_norm": 1.633588433265686, "learning_rate": 1.7099863201094393e-06, "loss": 1.8699, "step": 11296 }, { "epoch": 0.97, "grad_norm": 1.7310950756072998, "learning_rate": 1.692886456908345e-06, "loss": 1.9074, "step": 11300 }, { "epoch": 0.97, "grad_norm": 1.5915794372558594, "learning_rate": 1.6757865937072504e-06, "loss": 1.7659, "step": 11304 }, { "epoch": 0.97, "grad_norm": 1.7064154148101807, "learning_rate": 1.6586867305061562e-06, "loss": 1.8796, "step": 11308 }, { "epoch": 0.97, "grad_norm": 1.7068238258361816, "learning_rate": 1.6415868673050617e-06, "loss": 1.824, "step": 11312 }, { "epoch": 0.97, "grad_norm": 1.686372995376587, "learning_rate": 1.6244870041039673e-06, "loss": 2.0173, "step": 11316 }, { "epoch": 0.97, "grad_norm": 1.5897817611694336, "learning_rate": 1.6073871409028727e-06, "loss": 1.8182, "step": 11320 }, { "epoch": 0.97, "grad_norm": 1.6900789737701416, "learning_rate": 1.5902872777017786e-06, "loss": 1.7615, "step": 11324 }, { "epoch": 0.97, "grad_norm": 1.7033038139343262, "learning_rate": 1.573187414500684e-06, "loss": 1.7663, "step": 11328 }, { "epoch": 0.97, "grad_norm": 1.661864161491394, "learning_rate": 1.5560875512995897e-06, "loss": 1.8007, "step": 11332 }, { "epoch": 0.97, "grad_norm": 1.663250207901001, "learning_rate": 1.5389876880984953e-06, "loss": 1.7852, "step": 11336 }, { "epoch": 0.97, "grad_norm": 1.7646623849868774, "learning_rate": 1.5218878248974008e-06, "loss": 1.7738, "step": 11340 }, { "epoch": 0.97, "grad_norm": 1.6536412239074707, "learning_rate": 1.5047879616963064e-06, "loss": 1.8398, "step": 11344 }, { "epoch": 0.97, "grad_norm": 1.726285457611084, "learning_rate": 1.487688098495212e-06, "loss": 1.7792, "step": 11348 }, { "epoch": 0.97, "grad_norm": 1.7666223049163818, "learning_rate": 1.4705882352941177e-06, "loss": 1.9069, "step": 11352 }, { "epoch": 0.97, "grad_norm": 1.7547391653060913, "learning_rate": 1.4534883720930234e-06, "loss": 1.8827, "step": 11356 }, { "epoch": 0.97, "grad_norm": 1.8074520826339722, "learning_rate": 1.4363885088919288e-06, "loss": 1.8349, "step": 11360 }, { "epoch": 0.97, "grad_norm": 1.5819740295410156, "learning_rate": 1.4192886456908345e-06, "loss": 2.0226, "step": 11364 }, { "epoch": 0.97, "grad_norm": 1.4653937816619873, "learning_rate": 1.4021887824897401e-06, "loss": 1.7796, "step": 11368 }, { "epoch": 0.97, "grad_norm": 1.7137483358383179, "learning_rate": 1.3850889192886458e-06, "loss": 1.9836, "step": 11372 }, { "epoch": 0.97, "grad_norm": 1.5901538133621216, "learning_rate": 1.3679890560875514e-06, "loss": 1.7209, "step": 11376 }, { "epoch": 0.97, "grad_norm": 1.5994963645935059, "learning_rate": 1.3508891928864569e-06, "loss": 1.8554, "step": 11380 }, { "epoch": 0.97, "grad_norm": 1.578391194343567, "learning_rate": 1.3337893296853625e-06, "loss": 1.8694, "step": 11384 }, { "epoch": 0.97, "grad_norm": 1.6012123823165894, "learning_rate": 1.3166894664842682e-06, "loss": 1.7565, "step": 11388 }, { "epoch": 0.97, "grad_norm": 1.6539853811264038, "learning_rate": 1.2995896032831738e-06, "loss": 1.8733, "step": 11392 }, { "epoch": 0.97, "grad_norm": 1.7771899700164795, "learning_rate": 1.2824897400820795e-06, "loss": 1.8847, "step": 11396 }, { "epoch": 0.97, "grad_norm": 1.6242918968200684, "learning_rate": 1.2653898768809849e-06, "loss": 1.7995, "step": 11400 }, { "epoch": 0.98, "grad_norm": 1.7309808731079102, "learning_rate": 1.2482900136798905e-06, "loss": 1.7911, "step": 11404 }, { "epoch": 0.98, "grad_norm": 1.64167320728302, "learning_rate": 1.2311901504787962e-06, "loss": 1.8132, "step": 11408 }, { "epoch": 0.98, "grad_norm": 1.7144200801849365, "learning_rate": 1.2140902872777018e-06, "loss": 1.8484, "step": 11412 }, { "epoch": 0.98, "grad_norm": 1.498760461807251, "learning_rate": 1.1969904240766075e-06, "loss": 1.7778, "step": 11416 }, { "epoch": 0.98, "grad_norm": 1.62421452999115, "learning_rate": 1.1798905608755131e-06, "loss": 1.8898, "step": 11420 }, { "epoch": 0.98, "grad_norm": 1.7838332653045654, "learning_rate": 1.1627906976744186e-06, "loss": 1.8246, "step": 11424 }, { "epoch": 0.98, "grad_norm": 1.7368948459625244, "learning_rate": 1.1456908344733242e-06, "loss": 1.9796, "step": 11428 }, { "epoch": 0.98, "grad_norm": 1.5505352020263672, "learning_rate": 1.1285909712722299e-06, "loss": 1.8111, "step": 11432 }, { "epoch": 0.98, "grad_norm": 1.7267060279846191, "learning_rate": 1.1114911080711355e-06, "loss": 1.7691, "step": 11436 }, { "epoch": 0.98, "grad_norm": 1.6978029012680054, "learning_rate": 1.0943912448700412e-06, "loss": 1.7508, "step": 11440 }, { "epoch": 0.98, "grad_norm": 1.665993571281433, "learning_rate": 1.0772913816689466e-06, "loss": 1.7835, "step": 11444 }, { "epoch": 0.98, "grad_norm": 1.5911554098129272, "learning_rate": 1.0601915184678523e-06, "loss": 1.8659, "step": 11448 }, { "epoch": 0.98, "grad_norm": 2.122206449508667, "learning_rate": 1.043091655266758e-06, "loss": 1.8262, "step": 11452 }, { "epoch": 0.98, "grad_norm": 1.5077828168869019, "learning_rate": 1.0259917920656636e-06, "loss": 1.8085, "step": 11456 }, { "epoch": 0.98, "grad_norm": 1.527305245399475, "learning_rate": 1.0088919288645692e-06, "loss": 1.7245, "step": 11460 }, { "epoch": 0.98, "grad_norm": 1.8870925903320312, "learning_rate": 9.917920656634746e-07, "loss": 1.891, "step": 11464 }, { "epoch": 0.98, "grad_norm": 1.66902756690979, "learning_rate": 9.746922024623803e-07, "loss": 1.8576, "step": 11468 }, { "epoch": 0.98, "grad_norm": 1.6984986066818237, "learning_rate": 9.57592339261286e-07, "loss": 1.711, "step": 11472 }, { "epoch": 0.98, "grad_norm": 1.8500134944915771, "learning_rate": 9.404924760601916e-07, "loss": 1.83, "step": 11476 }, { "epoch": 0.98, "grad_norm": 1.5974342823028564, "learning_rate": 9.233926128590971e-07, "loss": 1.7812, "step": 11480 }, { "epoch": 0.98, "grad_norm": 1.678200364112854, "learning_rate": 9.062927496580028e-07, "loss": 1.8443, "step": 11484 }, { "epoch": 0.98, "grad_norm": 1.766434907913208, "learning_rate": 8.891928864569083e-07, "loss": 1.9573, "step": 11488 }, { "epoch": 0.98, "grad_norm": 1.7245994806289673, "learning_rate": 8.72093023255814e-07, "loss": 1.8823, "step": 11492 }, { "epoch": 0.98, "grad_norm": 1.598664402961731, "learning_rate": 8.549931600547196e-07, "loss": 1.78, "step": 11496 }, { "epoch": 0.98, "grad_norm": 1.5603172779083252, "learning_rate": 8.378932968536252e-07, "loss": 1.7606, "step": 11500 }, { "epoch": 0.98, "grad_norm": 1.5893508195877075, "learning_rate": 8.207934336525308e-07, "loss": 1.7072, "step": 11504 }, { "epoch": 0.98, "grad_norm": 1.6806880235671997, "learning_rate": 8.036935704514364e-07, "loss": 1.9592, "step": 11508 }, { "epoch": 0.98, "grad_norm": 1.6872777938842773, "learning_rate": 7.86593707250342e-07, "loss": 1.8096, "step": 11512 }, { "epoch": 0.98, "grad_norm": 1.6055060625076294, "learning_rate": 7.694938440492477e-07, "loss": 1.991, "step": 11516 }, { "epoch": 0.98, "grad_norm": 1.5616644620895386, "learning_rate": 7.523939808481532e-07, "loss": 1.8996, "step": 11520 }, { "epoch": 0.99, "grad_norm": 1.7734068632125854, "learning_rate": 7.352941176470589e-07, "loss": 1.8998, "step": 11524 }, { "epoch": 0.99, "grad_norm": 1.7730764150619507, "learning_rate": 7.181942544459644e-07, "loss": 1.82, "step": 11528 }, { "epoch": 0.99, "grad_norm": 1.6176447868347168, "learning_rate": 7.010943912448701e-07, "loss": 1.6604, "step": 11532 }, { "epoch": 0.99, "grad_norm": 1.6673433780670166, "learning_rate": 6.839945280437757e-07, "loss": 1.8491, "step": 11536 }, { "epoch": 0.99, "grad_norm": 1.7543388605117798, "learning_rate": 6.668946648426813e-07, "loss": 1.9856, "step": 11540 }, { "epoch": 0.99, "grad_norm": 1.687723159790039, "learning_rate": 6.497948016415869e-07, "loss": 1.7666, "step": 11544 }, { "epoch": 0.99, "grad_norm": 1.6949374675750732, "learning_rate": 6.326949384404924e-07, "loss": 1.7367, "step": 11548 }, { "epoch": 0.99, "grad_norm": 1.88869047164917, "learning_rate": 6.155950752393981e-07, "loss": 1.8341, "step": 11552 }, { "epoch": 0.99, "grad_norm": 1.616167426109314, "learning_rate": 5.984952120383037e-07, "loss": 1.8967, "step": 11556 }, { "epoch": 0.99, "grad_norm": 1.6819902658462524, "learning_rate": 5.813953488372093e-07, "loss": 1.6976, "step": 11560 }, { "epoch": 0.99, "grad_norm": 1.628939151763916, "learning_rate": 5.642954856361149e-07, "loss": 1.7678, "step": 11564 }, { "epoch": 0.99, "grad_norm": 1.9879276752471924, "learning_rate": 5.471956224350206e-07, "loss": 1.8467, "step": 11568 }, { "epoch": 0.99, "grad_norm": 1.9371817111968994, "learning_rate": 5.300957592339261e-07, "loss": 1.8366, "step": 11572 }, { "epoch": 0.99, "grad_norm": 1.7080752849578857, "learning_rate": 5.129958960328318e-07, "loss": 1.8151, "step": 11576 }, { "epoch": 0.99, "grad_norm": 1.6665830612182617, "learning_rate": 4.958960328317373e-07, "loss": 1.7895, "step": 11580 }, { "epoch": 0.99, "grad_norm": 1.5489290952682495, "learning_rate": 4.78796169630643e-07, "loss": 1.7158, "step": 11584 }, { "epoch": 0.99, "grad_norm": 1.6452659368515015, "learning_rate": 4.6169630642954857e-07, "loss": 1.9806, "step": 11588 }, { "epoch": 0.99, "grad_norm": 1.5039621591567993, "learning_rate": 4.4459644322845417e-07, "loss": 1.6445, "step": 11592 }, { "epoch": 0.99, "grad_norm": 1.6051033735275269, "learning_rate": 4.274965800273598e-07, "loss": 1.812, "step": 11596 }, { "epoch": 0.99, "grad_norm": 1.5924128293991089, "learning_rate": 4.103967168262654e-07, "loss": 1.7894, "step": 11600 }, { "epoch": 0.99, "grad_norm": 1.752646565437317, "learning_rate": 3.93296853625171e-07, "loss": 1.7116, "step": 11604 }, { "epoch": 0.99, "grad_norm": 1.6699579954147339, "learning_rate": 3.761969904240766e-07, "loss": 1.7171, "step": 11608 }, { "epoch": 0.99, "grad_norm": 1.7405366897583008, "learning_rate": 3.590971272229822e-07, "loss": 1.9161, "step": 11612 }, { "epoch": 0.99, "grad_norm": 1.5944126844406128, "learning_rate": 3.4199726402188785e-07, "loss": 1.758, "step": 11616 }, { "epoch": 0.99, "grad_norm": 1.7876613140106201, "learning_rate": 3.2489740082079345e-07, "loss": 1.8338, "step": 11620 }, { "epoch": 0.99, "grad_norm": 1.640367031097412, "learning_rate": 3.0779753761969905e-07, "loss": 1.8632, "step": 11624 }, { "epoch": 0.99, "grad_norm": 1.646633267402649, "learning_rate": 2.9069767441860464e-07, "loss": 1.8923, "step": 11628 }, { "epoch": 0.99, "grad_norm": 1.551578402519226, "learning_rate": 2.735978112175103e-07, "loss": 1.8043, "step": 11632 }, { "epoch": 0.99, "grad_norm": 2.0039708614349365, "learning_rate": 2.564979480164159e-07, "loss": 1.8416, "step": 11636 }, { "epoch": 1.0, "grad_norm": 1.6551536321640015, "learning_rate": 2.393980848153215e-07, "loss": 1.8004, "step": 11640 }, { "epoch": 1.0, "grad_norm": 1.774258017539978, "learning_rate": 2.2229822161422708e-07, "loss": 1.6694, "step": 11644 }, { "epoch": 1.0, "grad_norm": 1.6978957653045654, "learning_rate": 2.051983584131327e-07, "loss": 1.9608, "step": 11648 }, { "epoch": 1.0, "grad_norm": 1.5244742631912231, "learning_rate": 1.880984952120383e-07, "loss": 1.749, "step": 11652 }, { "epoch": 1.0, "grad_norm": 1.7631537914276123, "learning_rate": 1.7099863201094393e-07, "loss": 1.9072, "step": 11656 }, { "epoch": 1.0, "grad_norm": 1.6298117637634277, "learning_rate": 1.5389876880984952e-07, "loss": 1.8008, "step": 11660 }, { "epoch": 1.0, "grad_norm": 1.6735055446624756, "learning_rate": 1.3679890560875515e-07, "loss": 1.7698, "step": 11664 }, { "epoch": 1.0, "grad_norm": 1.6959282159805298, "learning_rate": 1.1969904240766074e-07, "loss": 1.7243, "step": 11668 }, { "epoch": 1.0, "grad_norm": 1.6977593898773193, "learning_rate": 1.0259917920656635e-07, "loss": 1.7902, "step": 11672 }, { "epoch": 1.0, "grad_norm": 1.5963069200515747, "learning_rate": 8.549931600547196e-08, "loss": 1.6623, "step": 11676 }, { "epoch": 1.0, "grad_norm": 1.5912084579467773, "learning_rate": 6.839945280437757e-08, "loss": 1.8806, "step": 11680 }, { "epoch": 1.0, "grad_norm": 1.8129706382751465, "learning_rate": 5.129958960328318e-08, "loss": 1.9337, "step": 11684 }, { "epoch": 1.0, "grad_norm": 1.7241120338439941, "learning_rate": 3.419972640218879e-08, "loss": 1.9163, "step": 11688 }, { "epoch": 1.0, "grad_norm": 1.5432144403457642, "learning_rate": 1.7099863201094393e-08, "loss": 1.6597, "step": 11692 }, { "epoch": 1.0, "grad_norm": 1.5885206460952759, "learning_rate": 0.0, "loss": 1.9262, "step": 11696 }, { "epoch": 1.0, "step": 11696, "total_flos": 1.5848503128568627e+17, "train_loss": 2.3174790558589957, "train_runtime": 2998.5108, "train_samples_per_second": 31.202, "train_steps_per_second": 3.901 } ], "logging_steps": 4, "max_steps": 11696, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1170, "total_flos": 1.5848503128568627e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }