{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999459527627146, "eval_steps": 100, "global_step": 3469, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008647557965661989, "grad_norm": 10.80162525177002, "learning_rate": 1.7241379310344828e-07, "loss": 2.2563, "step": 3 }, { "epoch": 0.0017295115931323978, "grad_norm": 9.61453914642334, "learning_rate": 3.4482758620689656e-07, "loss": 2.0897, "step": 6 }, { "epoch": 0.0025942673896985967, "grad_norm": 9.470385551452637, "learning_rate": 5.172413793103449e-07, "loss": 1.9261, "step": 9 }, { "epoch": 0.0034590231862647956, "grad_norm": 9.873086929321289, "learning_rate": 6.896551724137931e-07, "loss": 2.0147, "step": 12 }, { "epoch": 0.004323778982830994, "grad_norm": 10.191353797912598, "learning_rate": 8.620689655172415e-07, "loss": 1.858, "step": 15 }, { "epoch": 0.005188534779397193, "grad_norm": 10.490758895874023, "learning_rate": 1.0344827586206898e-06, "loss": 2.2373, "step": 18 }, { "epoch": 0.006053290575963392, "grad_norm": 10.7379789352417, "learning_rate": 1.2068965517241381e-06, "loss": 2.2257, "step": 21 }, { "epoch": 0.006918046372529591, "grad_norm": 9.31098747253418, "learning_rate": 1.3793103448275862e-06, "loss": 1.9876, "step": 24 }, { "epoch": 0.00778280216909579, "grad_norm": 9.79057502746582, "learning_rate": 1.5517241379310346e-06, "loss": 1.9015, "step": 27 }, { "epoch": 0.008647557965661988, "grad_norm": 9.3207368850708, "learning_rate": 1.724137931034483e-06, "loss": 1.8092, "step": 30 }, { "epoch": 0.009512313762228188, "grad_norm": 8.36523151397705, "learning_rate": 1.896551724137931e-06, "loss": 1.7048, "step": 33 }, { "epoch": 0.010377069558794387, "grad_norm": 7.840041160583496, "learning_rate": 2.0689655172413796e-06, "loss": 1.4938, "step": 36 }, { "epoch": 0.011241825355360585, "grad_norm": 8.646334648132324, "learning_rate": 2.241379310344828e-06, "loss": 1.4769, "step": 39 }, { "epoch": 0.012106581151926784, "grad_norm": 8.217284202575684, "learning_rate": 2.4137931034482762e-06, "loss": 1.4442, "step": 42 }, { "epoch": 0.012971336948492982, "grad_norm": 8.16273307800293, "learning_rate": 2.5862068965517246e-06, "loss": 1.6091, "step": 45 }, { "epoch": 0.013836092745059182, "grad_norm": 7.606343746185303, "learning_rate": 2.7586206896551725e-06, "loss": 1.565, "step": 48 }, { "epoch": 0.01470084854162538, "grad_norm": 5.831145763397217, "learning_rate": 2.931034482758621e-06, "loss": 1.4604, "step": 51 }, { "epoch": 0.01556560433819158, "grad_norm": 5.626949310302734, "learning_rate": 3.103448275862069e-06, "loss": 1.2422, "step": 54 }, { "epoch": 0.01643036013475778, "grad_norm": 8.196585655212402, "learning_rate": 3.2758620689655175e-06, "loss": 1.178, "step": 57 }, { "epoch": 0.017295115931323976, "grad_norm": 3.9342713356018066, "learning_rate": 3.448275862068966e-06, "loss": 1.1149, "step": 60 }, { "epoch": 0.018159871727890176, "grad_norm": 5.660026550292969, "learning_rate": 3.620689655172414e-06, "loss": 1.2666, "step": 63 }, { "epoch": 0.019024627524456376, "grad_norm": 3.853914737701416, "learning_rate": 3.793103448275862e-06, "loss": 0.9817, "step": 66 }, { "epoch": 0.019889383321022573, "grad_norm": 4.146341323852539, "learning_rate": 3.96551724137931e-06, "loss": 1.0925, "step": 69 }, { "epoch": 0.020754139117588773, "grad_norm": 3.1577234268188477, "learning_rate": 4.137931034482759e-06, "loss": 0.8252, "step": 72 }, { "epoch": 0.02161889491415497, "grad_norm": 3.2642972469329834, "learning_rate": 4.310344827586207e-06, "loss": 0.7533, "step": 75 }, { "epoch": 0.02248365071072117, "grad_norm": 3.2303547859191895, "learning_rate": 4.482758620689656e-06, "loss": 0.7326, "step": 78 }, { "epoch": 0.02334840650728737, "grad_norm": 2.59647536277771, "learning_rate": 4.655172413793104e-06, "loss": 0.7174, "step": 81 }, { "epoch": 0.024213162303853567, "grad_norm": 2.8148062229156494, "learning_rate": 4.8275862068965525e-06, "loss": 0.7488, "step": 84 }, { "epoch": 0.025077918100419767, "grad_norm": 1.758013129234314, "learning_rate": 5e-06, "loss": 0.7332, "step": 87 }, { "epoch": 0.025942673896985964, "grad_norm": 2.0942482948303223, "learning_rate": 5.172413793103449e-06, "loss": 0.8884, "step": 90 }, { "epoch": 0.026807429693552164, "grad_norm": 0.4857475757598877, "learning_rate": 5.344827586206896e-06, "loss": 0.497, "step": 93 }, { "epoch": 0.027672185490118364, "grad_norm": 1.5475839376449585, "learning_rate": 5.517241379310345e-06, "loss": 0.669, "step": 96 }, { "epoch": 0.02853694128668456, "grad_norm": 1.0966241359710693, "learning_rate": 5.689655172413794e-06, "loss": 0.5887, "step": 99 }, { "epoch": 0.028825193218873297, "eval_loss": 0.6418702006340027, "eval_mse": 0.6418701782226562, "eval_runtime": 6.7842, "eval_samples_per_second": 147.401, "eval_steps_per_second": 18.425, "step": 100 }, { "epoch": 0.02940169708325076, "grad_norm": 0.6968008875846863, "learning_rate": 5.862068965517242e-06, "loss": 0.7585, "step": 102 }, { "epoch": 0.03026645287981696, "grad_norm": 1.3219984769821167, "learning_rate": 6.03448275862069e-06, "loss": 0.7782, "step": 105 }, { "epoch": 0.03113120867638316, "grad_norm": 1.0195666551589966, "learning_rate": 6.206896551724138e-06, "loss": 0.6689, "step": 108 }, { "epoch": 0.031995964472949355, "grad_norm": 1.4114892482757568, "learning_rate": 6.379310344827587e-06, "loss": 0.6057, "step": 111 }, { "epoch": 0.03286072026951556, "grad_norm": 1.173322319984436, "learning_rate": 6.551724137931035e-06, "loss": 0.7234, "step": 114 }, { "epoch": 0.033725476066081755, "grad_norm": 1.520411729812622, "learning_rate": 6.724137931034484e-06, "loss": 0.7145, "step": 117 }, { "epoch": 0.03459023186264795, "grad_norm": 2.086540937423706, "learning_rate": 6.896551724137932e-06, "loss": 0.6265, "step": 120 }, { "epoch": 0.035454987659214156, "grad_norm": 1.582171082496643, "learning_rate": 7.0689655172413796e-06, "loss": 0.6202, "step": 123 }, { "epoch": 0.03631974345578035, "grad_norm": 1.1396665573120117, "learning_rate": 7.241379310344828e-06, "loss": 0.5784, "step": 126 }, { "epoch": 0.03718449925234655, "grad_norm": 1.427815318107605, "learning_rate": 7.413793103448277e-06, "loss": 0.4815, "step": 129 }, { "epoch": 0.03804925504891275, "grad_norm": 4.718282699584961, "learning_rate": 7.586206896551724e-06, "loss": 0.6151, "step": 132 }, { "epoch": 0.03891401084547895, "grad_norm": 1.1084308624267578, "learning_rate": 7.758620689655173e-06, "loss": 0.548, "step": 135 }, { "epoch": 0.039778766642045146, "grad_norm": 2.411567211151123, "learning_rate": 7.93103448275862e-06, "loss": 0.4664, "step": 138 }, { "epoch": 0.04064352243861134, "grad_norm": 1.9377732276916504, "learning_rate": 8.103448275862069e-06, "loss": 0.4761, "step": 141 }, { "epoch": 0.04150827823517755, "grad_norm": 1.6360706090927124, "learning_rate": 8.275862068965518e-06, "loss": 0.5101, "step": 144 }, { "epoch": 0.04237303403174374, "grad_norm": 1.6306743621826172, "learning_rate": 8.448275862068966e-06, "loss": 0.4356, "step": 147 }, { "epoch": 0.04323778982830994, "grad_norm": 2.038235664367676, "learning_rate": 8.620689655172414e-06, "loss": 0.4647, "step": 150 }, { "epoch": 0.044102545624876144, "grad_norm": 2.968766450881958, "learning_rate": 8.793103448275862e-06, "loss": 0.5035, "step": 153 }, { "epoch": 0.04496730142144234, "grad_norm": 1.407654047012329, "learning_rate": 8.965517241379312e-06, "loss": 0.4639, "step": 156 }, { "epoch": 0.04583205721800854, "grad_norm": 2.141657590866089, "learning_rate": 9.13793103448276e-06, "loss": 0.4255, "step": 159 }, { "epoch": 0.04669681301457474, "grad_norm": 2.2702648639678955, "learning_rate": 9.310344827586207e-06, "loss": 0.49, "step": 162 }, { "epoch": 0.04756156881114094, "grad_norm": 1.1990652084350586, "learning_rate": 9.482758620689655e-06, "loss": 0.4145, "step": 165 }, { "epoch": 0.048426324607707134, "grad_norm": 2.7780394554138184, "learning_rate": 9.655172413793105e-06, "loss": 0.4001, "step": 168 }, { "epoch": 0.04929108040427334, "grad_norm": 1.6007318496704102, "learning_rate": 9.827586206896553e-06, "loss": 0.3719, "step": 171 }, { "epoch": 0.050155836200839535, "grad_norm": 1.820497751235962, "learning_rate": 1e-05, "loss": 0.4183, "step": 174 }, { "epoch": 0.05102059199740573, "grad_norm": 2.3770830631256104, "learning_rate": 9.990895295902884e-06, "loss": 0.4273, "step": 177 }, { "epoch": 0.05188534779397193, "grad_norm": 1.9047311544418335, "learning_rate": 9.981790591805767e-06, "loss": 0.4622, "step": 180 }, { "epoch": 0.05275010359053813, "grad_norm": 1.876373052597046, "learning_rate": 9.972685887708651e-06, "loss": 0.3132, "step": 183 }, { "epoch": 0.05361485938710433, "grad_norm": 1.6469354629516602, "learning_rate": 9.963581183611534e-06, "loss": 0.3647, "step": 186 }, { "epoch": 0.054479615183670525, "grad_norm": 1.3573085069656372, "learning_rate": 9.954476479514417e-06, "loss": 0.3267, "step": 189 }, { "epoch": 0.05534437098023673, "grad_norm": 6.931013107299805, "learning_rate": 9.9453717754173e-06, "loss": 0.3981, "step": 192 }, { "epoch": 0.056209126776802926, "grad_norm": 3.3258678913116455, "learning_rate": 9.936267071320182e-06, "loss": 0.3855, "step": 195 }, { "epoch": 0.05707388257336912, "grad_norm": 4.0184550285339355, "learning_rate": 9.927162367223067e-06, "loss": 0.371, "step": 198 }, { "epoch": 0.057650386437746594, "eval_loss": 0.34387460350990295, "eval_mse": 0.3438746197223663, "eval_runtime": 6.7538, "eval_samples_per_second": 148.065, "eval_steps_per_second": 18.508, "step": 200 }, { "epoch": 0.057938638369935326, "grad_norm": 1.7570185661315918, "learning_rate": 9.91805766312595e-06, "loss": 0.4019, "step": 201 }, { "epoch": 0.05880339416650152, "grad_norm": 3.925257444381714, "learning_rate": 9.908952959028833e-06, "loss": 0.3352, "step": 204 }, { "epoch": 0.05966814996306772, "grad_norm": 1.5225563049316406, "learning_rate": 9.899848254931715e-06, "loss": 0.3081, "step": 207 }, { "epoch": 0.06053290575963392, "grad_norm": 1.902543067932129, "learning_rate": 9.890743550834598e-06, "loss": 0.3908, "step": 210 }, { "epoch": 0.06139766155620012, "grad_norm": 1.2444404363632202, "learning_rate": 9.881638846737481e-06, "loss": 0.31, "step": 213 }, { "epoch": 0.06226241735276632, "grad_norm": 1.8852041959762573, "learning_rate": 9.872534142640366e-06, "loss": 0.297, "step": 216 }, { "epoch": 0.06312717314933251, "grad_norm": 5.955360412597656, "learning_rate": 9.863429438543249e-06, "loss": 0.3639, "step": 219 }, { "epoch": 0.06399192894589871, "grad_norm": 1.8612481355667114, "learning_rate": 9.854324734446131e-06, "loss": 0.3808, "step": 222 }, { "epoch": 0.06485668474246492, "grad_norm": 1.4975637197494507, "learning_rate": 9.845220030349014e-06, "loss": 0.3694, "step": 225 }, { "epoch": 0.06572144053903112, "grad_norm": 1.5896477699279785, "learning_rate": 9.836115326251897e-06, "loss": 0.3056, "step": 228 }, { "epoch": 0.06658619633559731, "grad_norm": 3.6155972480773926, "learning_rate": 9.827010622154782e-06, "loss": 0.3084, "step": 231 }, { "epoch": 0.06745095213216351, "grad_norm": 4.709763526916504, "learning_rate": 9.817905918057664e-06, "loss": 0.3949, "step": 234 }, { "epoch": 0.06831570792872971, "grad_norm": 3.7941088676452637, "learning_rate": 9.808801213960547e-06, "loss": 0.3213, "step": 237 }, { "epoch": 0.0691804637252959, "grad_norm": 2.548584461212158, "learning_rate": 9.79969650986343e-06, "loss": 0.3335, "step": 240 }, { "epoch": 0.0700452195218621, "grad_norm": 1.5992470979690552, "learning_rate": 9.790591805766313e-06, "loss": 0.2688, "step": 243 }, { "epoch": 0.07090997531842831, "grad_norm": 2.99310564994812, "learning_rate": 9.781487101669198e-06, "loss": 0.3488, "step": 246 }, { "epoch": 0.07177473111499451, "grad_norm": 2.987074613571167, "learning_rate": 9.77238239757208e-06, "loss": 0.2703, "step": 249 }, { "epoch": 0.0726394869115607, "grad_norm": 5.408164978027344, "learning_rate": 9.763277693474963e-06, "loss": 0.2854, "step": 252 }, { "epoch": 0.0735042427081269, "grad_norm": 1.6425719261169434, "learning_rate": 9.754172989377846e-06, "loss": 0.2824, "step": 255 }, { "epoch": 0.0743689985046931, "grad_norm": 1.8134018182754517, "learning_rate": 9.745068285280729e-06, "loss": 0.3286, "step": 258 }, { "epoch": 0.0752337543012593, "grad_norm": 5.619387626647949, "learning_rate": 9.735963581183613e-06, "loss": 0.3226, "step": 261 }, { "epoch": 0.0760985100978255, "grad_norm": 2.6264755725860596, "learning_rate": 9.726858877086496e-06, "loss": 0.3442, "step": 264 }, { "epoch": 0.0769632658943917, "grad_norm": 3.537142515182495, "learning_rate": 9.717754172989379e-06, "loss": 0.2687, "step": 267 }, { "epoch": 0.0778280216909579, "grad_norm": 2.1801705360412598, "learning_rate": 9.708649468892262e-06, "loss": 0.3705, "step": 270 }, { "epoch": 0.0786927774875241, "grad_norm": 6.869683742523193, "learning_rate": 9.699544764795145e-06, "loss": 0.3777, "step": 273 }, { "epoch": 0.07955753328409029, "grad_norm": 3.4050021171569824, "learning_rate": 9.690440060698028e-06, "loss": 0.3265, "step": 276 }, { "epoch": 0.08042228908065649, "grad_norm": 2.8107333183288574, "learning_rate": 9.681335356600912e-06, "loss": 0.3286, "step": 279 }, { "epoch": 0.08128704487722269, "grad_norm": 1.5242034196853638, "learning_rate": 9.672230652503795e-06, "loss": 0.2786, "step": 282 }, { "epoch": 0.0821518006737889, "grad_norm": 22.8824462890625, "learning_rate": 9.663125948406678e-06, "loss": 0.3706, "step": 285 }, { "epoch": 0.0830165564703551, "grad_norm": 4.904447555541992, "learning_rate": 9.65402124430956e-06, "loss": 0.298, "step": 288 }, { "epoch": 0.08388131226692129, "grad_norm": 4.405270576477051, "learning_rate": 9.644916540212444e-06, "loss": 0.3625, "step": 291 }, { "epoch": 0.08474606806348749, "grad_norm": 6.424873352050781, "learning_rate": 9.635811836115328e-06, "loss": 0.3246, "step": 294 }, { "epoch": 0.08561082386005368, "grad_norm": 1.9719147682189941, "learning_rate": 9.626707132018211e-06, "loss": 0.3014, "step": 297 }, { "epoch": 0.08647557965661988, "grad_norm": 5.9254021644592285, "learning_rate": 9.617602427921094e-06, "loss": 0.3607, "step": 300 }, { "epoch": 0.08647557965661988, "eval_loss": 0.28438636660575867, "eval_mse": 0.2843863361030817, "eval_runtime": 6.6145, "eval_samples_per_second": 151.183, "eval_steps_per_second": 18.898, "step": 300 }, { "epoch": 0.08734033545318609, "grad_norm": 3.4420008659362793, "learning_rate": 9.608497723823977e-06, "loss": 0.2631, "step": 303 }, { "epoch": 0.08820509124975229, "grad_norm": 2.4832332134246826, "learning_rate": 9.59939301972686e-06, "loss": 0.2985, "step": 306 }, { "epoch": 0.08906984704631848, "grad_norm": 3.3724935054779053, "learning_rate": 9.590288315629744e-06, "loss": 0.2703, "step": 309 }, { "epoch": 0.08993460284288468, "grad_norm": 8.569412231445312, "learning_rate": 9.581183611532627e-06, "loss": 0.3068, "step": 312 }, { "epoch": 0.09079935863945088, "grad_norm": 1.9817373752593994, "learning_rate": 9.57207890743551e-06, "loss": 0.3271, "step": 315 }, { "epoch": 0.09166411443601707, "grad_norm": 2.6405210494995117, "learning_rate": 9.562974203338393e-06, "loss": 0.2454, "step": 318 }, { "epoch": 0.09252887023258327, "grad_norm": 1.6005452871322632, "learning_rate": 9.553869499241275e-06, "loss": 0.2506, "step": 321 }, { "epoch": 0.09339362602914948, "grad_norm": 1.67816162109375, "learning_rate": 9.54476479514416e-06, "loss": 0.2503, "step": 324 }, { "epoch": 0.09425838182571568, "grad_norm": 1.4401655197143555, "learning_rate": 9.535660091047043e-06, "loss": 0.2322, "step": 327 }, { "epoch": 0.09512313762228188, "grad_norm": 2.919785261154175, "learning_rate": 9.526555386949926e-06, "loss": 0.2746, "step": 330 }, { "epoch": 0.09598789341884807, "grad_norm": 4.454317569732666, "learning_rate": 9.517450682852808e-06, "loss": 0.2907, "step": 333 }, { "epoch": 0.09685264921541427, "grad_norm": 4.585294246673584, "learning_rate": 9.508345978755691e-06, "loss": 0.2873, "step": 336 }, { "epoch": 0.09771740501198047, "grad_norm": 2.247422218322754, "learning_rate": 9.499241274658574e-06, "loss": 0.2798, "step": 339 }, { "epoch": 0.09858216080854668, "grad_norm": 3.6044836044311523, "learning_rate": 9.490136570561459e-06, "loss": 0.3284, "step": 342 }, { "epoch": 0.09944691660511287, "grad_norm": 4.609151363372803, "learning_rate": 9.481031866464341e-06, "loss": 0.2274, "step": 345 }, { "epoch": 0.10031167240167907, "grad_norm": 4.793229103088379, "learning_rate": 9.471927162367224e-06, "loss": 0.2734, "step": 348 }, { "epoch": 0.10117642819824527, "grad_norm": 4.507264614105225, "learning_rate": 9.462822458270107e-06, "loss": 0.3082, "step": 351 }, { "epoch": 0.10204118399481146, "grad_norm": 6.129451274871826, "learning_rate": 9.45371775417299e-06, "loss": 0.2617, "step": 354 }, { "epoch": 0.10290593979137766, "grad_norm": 2.631593704223633, "learning_rate": 9.444613050075875e-06, "loss": 0.2565, "step": 357 }, { "epoch": 0.10377069558794386, "grad_norm": 4.379823684692383, "learning_rate": 9.435508345978757e-06, "loss": 0.3235, "step": 360 }, { "epoch": 0.10463545138451007, "grad_norm": 2.077354907989502, "learning_rate": 9.42640364188164e-06, "loss": 0.2938, "step": 363 }, { "epoch": 0.10550020718107626, "grad_norm": 5.108116626739502, "learning_rate": 9.417298937784523e-06, "loss": 0.2698, "step": 366 }, { "epoch": 0.10636496297764246, "grad_norm": 7.464448928833008, "learning_rate": 9.408194233687406e-06, "loss": 0.2703, "step": 369 }, { "epoch": 0.10722971877420866, "grad_norm": 1.855411410331726, "learning_rate": 9.399089529590289e-06, "loss": 0.2958, "step": 372 }, { "epoch": 0.10809447457077485, "grad_norm": 5.284719944000244, "learning_rate": 9.389984825493173e-06, "loss": 0.2703, "step": 375 }, { "epoch": 0.10895923036734105, "grad_norm": 2.494473457336426, "learning_rate": 9.380880121396056e-06, "loss": 0.2698, "step": 378 }, { "epoch": 0.10982398616390726, "grad_norm": 2.0765345096588135, "learning_rate": 9.371775417298939e-06, "loss": 0.2726, "step": 381 }, { "epoch": 0.11068874196047346, "grad_norm": 1.893574595451355, "learning_rate": 9.362670713201822e-06, "loss": 0.2739, "step": 384 }, { "epoch": 0.11155349775703965, "grad_norm": 2.0016255378723145, "learning_rate": 9.353566009104705e-06, "loss": 0.2624, "step": 387 }, { "epoch": 0.11241825355360585, "grad_norm": 2.99924898147583, "learning_rate": 9.344461305007587e-06, "loss": 0.3098, "step": 390 }, { "epoch": 0.11328300935017205, "grad_norm": 1.666891098022461, "learning_rate": 9.335356600910472e-06, "loss": 0.2451, "step": 393 }, { "epoch": 0.11414776514673824, "grad_norm": 2.8993024826049805, "learning_rate": 9.326251896813355e-06, "loss": 0.271, "step": 396 }, { "epoch": 0.11501252094330444, "grad_norm": 5.040359973907471, "learning_rate": 9.317147192716238e-06, "loss": 0.2576, "step": 399 }, { "epoch": 0.11530077287549319, "eval_loss": 0.2588765025138855, "eval_mse": 0.25887649209996744, "eval_runtime": 6.6586, "eval_samples_per_second": 150.181, "eval_steps_per_second": 18.773, "step": 400 }, { "epoch": 0.11587727673987065, "grad_norm": 3.234560251235962, "learning_rate": 9.30804248861912e-06, "loss": 0.2702, "step": 402 }, { "epoch": 0.11674203253643685, "grad_norm": 2.8497729301452637, "learning_rate": 9.298937784522003e-06, "loss": 0.2371, "step": 405 }, { "epoch": 0.11760678833300305, "grad_norm": 1.3214294910430908, "learning_rate": 9.289833080424886e-06, "loss": 0.2262, "step": 408 }, { "epoch": 0.11847154412956924, "grad_norm": 3.5736958980560303, "learning_rate": 9.28072837632777e-06, "loss": 0.2619, "step": 411 }, { "epoch": 0.11933629992613544, "grad_norm": 3.478178024291992, "learning_rate": 9.271623672230654e-06, "loss": 0.3061, "step": 414 }, { "epoch": 0.12020105572270164, "grad_norm": 2.523387908935547, "learning_rate": 9.262518968133536e-06, "loss": 0.3232, "step": 417 }, { "epoch": 0.12106581151926785, "grad_norm": 2.1046786308288574, "learning_rate": 9.25341426403642e-06, "loss": 0.2815, "step": 420 }, { "epoch": 0.12193056731583404, "grad_norm": 4.513411045074463, "learning_rate": 9.244309559939302e-06, "loss": 0.3053, "step": 423 }, { "epoch": 0.12279532311240024, "grad_norm": 2.628030776977539, "learning_rate": 9.235204855842187e-06, "loss": 0.2714, "step": 426 }, { "epoch": 0.12366007890896644, "grad_norm": 6.008927345275879, "learning_rate": 9.22610015174507e-06, "loss": 0.3243, "step": 429 }, { "epoch": 0.12452483470553263, "grad_norm": 6.236274242401123, "learning_rate": 9.216995447647952e-06, "loss": 0.2916, "step": 432 }, { "epoch": 0.12538959050209883, "grad_norm": 2.30412220954895, "learning_rate": 9.207890743550835e-06, "loss": 0.2731, "step": 435 }, { "epoch": 0.12625434629866503, "grad_norm": 3.3161492347717285, "learning_rate": 9.198786039453718e-06, "loss": 0.2508, "step": 438 }, { "epoch": 0.12711910209523122, "grad_norm": 4.023074626922607, "learning_rate": 9.189681335356601e-06, "loss": 0.2973, "step": 441 }, { "epoch": 0.12798385789179742, "grad_norm": 2.486236333847046, "learning_rate": 9.180576631259485e-06, "loss": 0.235, "step": 444 }, { "epoch": 0.12884861368836362, "grad_norm": 4.445496082305908, "learning_rate": 9.171471927162368e-06, "loss": 0.2519, "step": 447 }, { "epoch": 0.12971336948492984, "grad_norm": 1.629809021949768, "learning_rate": 9.162367223065251e-06, "loss": 0.2664, "step": 450 }, { "epoch": 0.13057812528149604, "grad_norm": 3.0351674556732178, "learning_rate": 9.153262518968134e-06, "loss": 0.2433, "step": 453 }, { "epoch": 0.13144288107806223, "grad_norm": 1.9163283109664917, "learning_rate": 9.144157814871017e-06, "loss": 0.2997, "step": 456 }, { "epoch": 0.13230763687462843, "grad_norm": 3.978429079055786, "learning_rate": 9.1350531107739e-06, "loss": 0.2969, "step": 459 }, { "epoch": 0.13317239267119463, "grad_norm": 1.7428011894226074, "learning_rate": 9.125948406676784e-06, "loss": 0.2721, "step": 462 }, { "epoch": 0.13403714846776082, "grad_norm": 1.462106704711914, "learning_rate": 9.116843702579667e-06, "loss": 0.3179, "step": 465 }, { "epoch": 0.13490190426432702, "grad_norm": 4.226785659790039, "learning_rate": 9.10773899848255e-06, "loss": 0.262, "step": 468 }, { "epoch": 0.13576666006089322, "grad_norm": 3.227842330932617, "learning_rate": 9.098634294385433e-06, "loss": 0.2602, "step": 471 }, { "epoch": 0.13663141585745942, "grad_norm": 4.708644866943359, "learning_rate": 9.089529590288316e-06, "loss": 0.2502, "step": 474 }, { "epoch": 0.1374961716540256, "grad_norm": 3.478773832321167, "learning_rate": 9.080424886191198e-06, "loss": 0.2799, "step": 477 }, { "epoch": 0.1383609274505918, "grad_norm": 5.991547107696533, "learning_rate": 9.071320182094083e-06, "loss": 0.3103, "step": 480 }, { "epoch": 0.139225683247158, "grad_norm": 5.446369647979736, "learning_rate": 9.062215477996966e-06, "loss": 0.2834, "step": 483 }, { "epoch": 0.1400904390437242, "grad_norm": 4.723580360412598, "learning_rate": 9.053110773899849e-06, "loss": 0.244, "step": 486 }, { "epoch": 0.14095519484029043, "grad_norm": 2.518148183822632, "learning_rate": 9.044006069802731e-06, "loss": 0.3102, "step": 489 }, { "epoch": 0.14181995063685662, "grad_norm": 4.1146039962768555, "learning_rate": 9.034901365705614e-06, "loss": 0.2913, "step": 492 }, { "epoch": 0.14268470643342282, "grad_norm": 2.6333212852478027, "learning_rate": 9.025796661608497e-06, "loss": 0.2195, "step": 495 }, { "epoch": 0.14354946222998902, "grad_norm": 3.344228506088257, "learning_rate": 9.016691957511382e-06, "loss": 0.2822, "step": 498 }, { "epoch": 0.14412596609436648, "eval_loss": 0.2707275450229645, "eval_mse": 0.27072753977278263, "eval_runtime": 6.638, "eval_samples_per_second": 150.647, "eval_steps_per_second": 18.831, "step": 500 }, { "epoch": 0.1444142180265552, "grad_norm": 2.839517593383789, "learning_rate": 9.007587253414265e-06, "loss": 0.2469, "step": 501 }, { "epoch": 0.1452789738231214, "grad_norm": 3.71785569190979, "learning_rate": 8.998482549317147e-06, "loss": 0.2695, "step": 504 }, { "epoch": 0.1461437296196876, "grad_norm": 2.9626924991607666, "learning_rate": 8.98937784522003e-06, "loss": 0.233, "step": 507 }, { "epoch": 0.1470084854162538, "grad_norm": 2.0274481773376465, "learning_rate": 8.980273141122913e-06, "loss": 0.2854, "step": 510 }, { "epoch": 0.14787324121282, "grad_norm": 2.5144805908203125, "learning_rate": 8.971168437025798e-06, "loss": 0.2545, "step": 513 }, { "epoch": 0.1487379970093862, "grad_norm": 3.106039047241211, "learning_rate": 8.96206373292868e-06, "loss": 0.2305, "step": 516 }, { "epoch": 0.1496027528059524, "grad_norm": 3.3986117839813232, "learning_rate": 8.952959028831563e-06, "loss": 0.2564, "step": 519 }, { "epoch": 0.1504675086025186, "grad_norm": 2.877206325531006, "learning_rate": 8.943854324734446e-06, "loss": 0.2781, "step": 522 }, { "epoch": 0.1513322643990848, "grad_norm": 2.561119556427002, "learning_rate": 8.934749620637329e-06, "loss": 0.2666, "step": 525 }, { "epoch": 0.152197020195651, "grad_norm": 2.566633939743042, "learning_rate": 8.925644916540213e-06, "loss": 0.2791, "step": 528 }, { "epoch": 0.1530617759922172, "grad_norm": 2.1075491905212402, "learning_rate": 8.916540212443096e-06, "loss": 0.2276, "step": 531 }, { "epoch": 0.1539265317887834, "grad_norm": 3.239712715148926, "learning_rate": 8.90743550834598e-06, "loss": 0.2749, "step": 534 }, { "epoch": 0.1547912875853496, "grad_norm": 6.005987167358398, "learning_rate": 8.898330804248862e-06, "loss": 0.2395, "step": 537 }, { "epoch": 0.1556560433819158, "grad_norm": 1.6621935367584229, "learning_rate": 8.889226100151745e-06, "loss": 0.2616, "step": 540 }, { "epoch": 0.156520799178482, "grad_norm": 3.1361093521118164, "learning_rate": 8.880121396054628e-06, "loss": 0.2712, "step": 543 }, { "epoch": 0.1573855549750482, "grad_norm": 1.8823013305664062, "learning_rate": 8.871016691957512e-06, "loss": 0.2571, "step": 546 }, { "epoch": 0.1582503107716144, "grad_norm": 3.6857988834381104, "learning_rate": 8.861911987860395e-06, "loss": 0.2552, "step": 549 }, { "epoch": 0.15911506656818059, "grad_norm": 2.26597261428833, "learning_rate": 8.852807283763278e-06, "loss": 0.2239, "step": 552 }, { "epoch": 0.15997982236474678, "grad_norm": 5.791572570800781, "learning_rate": 8.84370257966616e-06, "loss": 0.2859, "step": 555 }, { "epoch": 0.16084457816131298, "grad_norm": 3.4344265460968018, "learning_rate": 8.834597875569044e-06, "loss": 0.3055, "step": 558 }, { "epoch": 0.16170933395787918, "grad_norm": 3.98288631439209, "learning_rate": 8.825493171471928e-06, "loss": 0.2698, "step": 561 }, { "epoch": 0.16257408975444537, "grad_norm": 7.429836273193359, "learning_rate": 8.816388467374811e-06, "loss": 0.3207, "step": 564 }, { "epoch": 0.1634388455510116, "grad_norm": 4.021480560302734, "learning_rate": 8.807283763277694e-06, "loss": 0.2477, "step": 567 }, { "epoch": 0.1643036013475778, "grad_norm": 2.619497537612915, "learning_rate": 8.798179059180577e-06, "loss": 0.2152, "step": 570 }, { "epoch": 0.165168357144144, "grad_norm": 2.073925018310547, "learning_rate": 8.78907435508346e-06, "loss": 0.2557, "step": 573 }, { "epoch": 0.1660331129407102, "grad_norm": 2.250293493270874, "learning_rate": 8.779969650986344e-06, "loss": 0.2294, "step": 576 }, { "epoch": 0.16689786873727638, "grad_norm": 2.9747421741485596, "learning_rate": 8.770864946889227e-06, "loss": 0.2461, "step": 579 }, { "epoch": 0.16776262453384258, "grad_norm": 1.8815991878509521, "learning_rate": 8.76176024279211e-06, "loss": 0.2698, "step": 582 }, { "epoch": 0.16862738033040878, "grad_norm": 2.1905224323272705, "learning_rate": 8.752655538694993e-06, "loss": 0.2631, "step": 585 }, { "epoch": 0.16949213612697497, "grad_norm": 3.6808903217315674, "learning_rate": 8.743550834597875e-06, "loss": 0.2341, "step": 588 }, { "epoch": 0.17035689192354117, "grad_norm": 2.905963897705078, "learning_rate": 8.73444613050076e-06, "loss": 0.2295, "step": 591 }, { "epoch": 0.17122164772010737, "grad_norm": 5.486540794372559, "learning_rate": 8.725341426403643e-06, "loss": 0.2816, "step": 594 }, { "epoch": 0.17208640351667356, "grad_norm": 1.8525919914245605, "learning_rate": 8.716236722306526e-06, "loss": 0.2197, "step": 597 }, { "epoch": 0.17295115931323976, "grad_norm": 5.908591270446777, "learning_rate": 8.707132018209408e-06, "loss": 0.2908, "step": 600 }, { "epoch": 0.17295115931323976, "eval_loss": 0.23815499246120453, "eval_mse": 0.23815500601008535, "eval_runtime": 6.6897, "eval_samples_per_second": 149.484, "eval_steps_per_second": 18.685, "step": 600 }, { "epoch": 0.17381591510980596, "grad_norm": 3.1334948539733887, "learning_rate": 8.698027314112291e-06, "loss": 0.253, "step": 603 }, { "epoch": 0.17468067090637218, "grad_norm": 2.314412832260132, "learning_rate": 8.688922610015174e-06, "loss": 0.2631, "step": 606 }, { "epoch": 0.17554542670293838, "grad_norm": 3.483959436416626, "learning_rate": 8.679817905918059e-06, "loss": 0.2452, "step": 609 }, { "epoch": 0.17641018249950458, "grad_norm": 2.2747623920440674, "learning_rate": 8.670713201820942e-06, "loss": 0.2462, "step": 612 }, { "epoch": 0.17727493829607077, "grad_norm": 5.517392635345459, "learning_rate": 8.661608497723824e-06, "loss": 0.2707, "step": 615 }, { "epoch": 0.17813969409263697, "grad_norm": 2.7062125205993652, "learning_rate": 8.652503793626707e-06, "loss": 0.2324, "step": 618 }, { "epoch": 0.17900444988920317, "grad_norm": 2.712933301925659, "learning_rate": 8.64339908952959e-06, "loss": 0.2914, "step": 621 }, { "epoch": 0.17986920568576936, "grad_norm": 3.0349957942962646, "learning_rate": 8.634294385432475e-06, "loss": 0.245, "step": 624 }, { "epoch": 0.18073396148233556, "grad_norm": 1.533530592918396, "learning_rate": 8.625189681335357e-06, "loss": 0.245, "step": 627 }, { "epoch": 0.18159871727890176, "grad_norm": 1.5070098638534546, "learning_rate": 8.61608497723824e-06, "loss": 0.252, "step": 630 }, { "epoch": 0.18246347307546795, "grad_norm": 1.803514003753662, "learning_rate": 8.606980273141123e-06, "loss": 0.2395, "step": 633 }, { "epoch": 0.18332822887203415, "grad_norm": 1.8047112226486206, "learning_rate": 8.597875569044006e-06, "loss": 0.2731, "step": 636 }, { "epoch": 0.18419298466860035, "grad_norm": 3.090371608734131, "learning_rate": 8.58877086494689e-06, "loss": 0.2314, "step": 639 }, { "epoch": 0.18505774046516654, "grad_norm": 2.2020678520202637, "learning_rate": 8.579666160849773e-06, "loss": 0.2828, "step": 642 }, { "epoch": 0.18592249626173277, "grad_norm": 1.8086636066436768, "learning_rate": 8.570561456752656e-06, "loss": 0.2736, "step": 645 }, { "epoch": 0.18678725205829896, "grad_norm": 1.9154764413833618, "learning_rate": 8.561456752655539e-06, "loss": 0.2422, "step": 648 }, { "epoch": 0.18765200785486516, "grad_norm": 2.6091620922088623, "learning_rate": 8.552352048558422e-06, "loss": 0.278, "step": 651 }, { "epoch": 0.18851676365143136, "grad_norm": 4.057301998138428, "learning_rate": 8.543247344461306e-06, "loss": 0.2268, "step": 654 }, { "epoch": 0.18938151944799755, "grad_norm": 1.6707180738449097, "learning_rate": 8.53414264036419e-06, "loss": 0.2657, "step": 657 }, { "epoch": 0.19024627524456375, "grad_norm": 4.327409744262695, "learning_rate": 8.525037936267072e-06, "loss": 0.2235, "step": 660 }, { "epoch": 0.19111103104112995, "grad_norm": 3.991241931915283, "learning_rate": 8.515933232169955e-06, "loss": 0.2429, "step": 663 }, { "epoch": 0.19197578683769614, "grad_norm": 1.742564082145691, "learning_rate": 8.506828528072838e-06, "loss": 0.2245, "step": 666 }, { "epoch": 0.19284054263426234, "grad_norm": 2.362626791000366, "learning_rate": 8.49772382397572e-06, "loss": 0.2689, "step": 669 }, { "epoch": 0.19370529843082854, "grad_norm": 1.4896934032440186, "learning_rate": 8.488619119878605e-06, "loss": 0.2669, "step": 672 }, { "epoch": 0.19457005422739473, "grad_norm": 1.8312315940856934, "learning_rate": 8.479514415781488e-06, "loss": 0.2255, "step": 675 }, { "epoch": 0.19543481002396093, "grad_norm": 3.9799551963806152, "learning_rate": 8.470409711684371e-06, "loss": 0.2152, "step": 678 }, { "epoch": 0.19629956582052713, "grad_norm": 1.9584782123565674, "learning_rate": 8.461305007587254e-06, "loss": 0.2721, "step": 681 }, { "epoch": 0.19716432161709335, "grad_norm": 3.361952781677246, "learning_rate": 8.452200303490137e-06, "loss": 0.265, "step": 684 }, { "epoch": 0.19802907741365955, "grad_norm": 2.125466823577881, "learning_rate": 8.443095599393021e-06, "loss": 0.2473, "step": 687 }, { "epoch": 0.19889383321022575, "grad_norm": 4.599373817443848, "learning_rate": 8.433990895295904e-06, "loss": 0.2201, "step": 690 }, { "epoch": 0.19975858900679194, "grad_norm": 2.139647960662842, "learning_rate": 8.424886191198787e-06, "loss": 0.2647, "step": 693 }, { "epoch": 0.20062334480335814, "grad_norm": 1.7437653541564941, "learning_rate": 8.41578148710167e-06, "loss": 0.2242, "step": 696 }, { "epoch": 0.20148810059992434, "grad_norm": 4.154083728790283, "learning_rate": 8.406676783004552e-06, "loss": 0.2258, "step": 699 }, { "epoch": 0.20177635253211307, "eval_loss": 0.2404525727033615, "eval_mse": 0.24045259381830691, "eval_runtime": 6.5158, "eval_samples_per_second": 153.474, "eval_steps_per_second": 19.184, "step": 700 }, { "epoch": 0.20235285639649053, "grad_norm": 1.7175828218460083, "learning_rate": 8.397572078907437e-06, "loss": 0.2734, "step": 702 }, { "epoch": 0.20321761219305673, "grad_norm": 1.8725277185440063, "learning_rate": 8.38846737481032e-06, "loss": 0.2309, "step": 705 }, { "epoch": 0.20408236798962293, "grad_norm": 1.7434577941894531, "learning_rate": 8.379362670713203e-06, "loss": 0.2395, "step": 708 }, { "epoch": 0.20494712378618912, "grad_norm": 2.8038480281829834, "learning_rate": 8.370257966616086e-06, "loss": 0.2787, "step": 711 }, { "epoch": 0.20581187958275532, "grad_norm": 1.701920509338379, "learning_rate": 8.361153262518968e-06, "loss": 0.2267, "step": 714 }, { "epoch": 0.20667663537932152, "grad_norm": 4.903564453125, "learning_rate": 8.352048558421853e-06, "loss": 0.2716, "step": 717 }, { "epoch": 0.2075413911758877, "grad_norm": 3.723651647567749, "learning_rate": 8.342943854324736e-06, "loss": 0.2674, "step": 720 }, { "epoch": 0.20840614697245394, "grad_norm": 1.7158082723617554, "learning_rate": 8.333839150227619e-06, "loss": 0.2296, "step": 723 }, { "epoch": 0.20927090276902013, "grad_norm": 2.1699960231781006, "learning_rate": 8.324734446130501e-06, "loss": 0.2843, "step": 726 }, { "epoch": 0.21013565856558633, "grad_norm": 4.244576454162598, "learning_rate": 8.315629742033384e-06, "loss": 0.2678, "step": 729 }, { "epoch": 0.21100041436215253, "grad_norm": 1.4597645998001099, "learning_rate": 8.306525037936269e-06, "loss": 0.248, "step": 732 }, { "epoch": 0.21186517015871872, "grad_norm": 2.9361813068389893, "learning_rate": 8.297420333839152e-06, "loss": 0.241, "step": 735 }, { "epoch": 0.21272992595528492, "grad_norm": 4.552188396453857, "learning_rate": 8.288315629742034e-06, "loss": 0.2935, "step": 738 }, { "epoch": 0.21359468175185112, "grad_norm": 4.201780796051025, "learning_rate": 8.279210925644917e-06, "loss": 0.2935, "step": 741 }, { "epoch": 0.21445943754841731, "grad_norm": 7.236446380615234, "learning_rate": 8.2701062215478e-06, "loss": 0.2584, "step": 744 }, { "epoch": 0.2153241933449835, "grad_norm": 1.7462209463119507, "learning_rate": 8.261001517450683e-06, "loss": 0.2457, "step": 747 }, { "epoch": 0.2161889491415497, "grad_norm": 3.9612677097320557, "learning_rate": 8.251896813353568e-06, "loss": 0.2472, "step": 750 }, { "epoch": 0.2170537049381159, "grad_norm": 3.581313371658325, "learning_rate": 8.24279210925645e-06, "loss": 0.2833, "step": 753 }, { "epoch": 0.2179184607346821, "grad_norm": 3.473001003265381, "learning_rate": 8.233687405159333e-06, "loss": 0.2474, "step": 756 }, { "epoch": 0.2187832165312483, "grad_norm": 5.877528667449951, "learning_rate": 8.224582701062216e-06, "loss": 0.2469, "step": 759 }, { "epoch": 0.21964797232781452, "grad_norm": 4.294084072113037, "learning_rate": 8.215477996965099e-06, "loss": 0.2569, "step": 762 }, { "epoch": 0.22051272812438072, "grad_norm": 1.9812819957733154, "learning_rate": 8.206373292867983e-06, "loss": 0.2592, "step": 765 }, { "epoch": 0.22137748392094692, "grad_norm": 1.6627389192581177, "learning_rate": 8.197268588770866e-06, "loss": 0.2597, "step": 768 }, { "epoch": 0.2222422397175131, "grad_norm": 2.557081699371338, "learning_rate": 8.188163884673749e-06, "loss": 0.2624, "step": 771 }, { "epoch": 0.2231069955140793, "grad_norm": 3.3301448822021484, "learning_rate": 8.179059180576632e-06, "loss": 0.2458, "step": 774 }, { "epoch": 0.2239717513106455, "grad_norm": 3.717036247253418, "learning_rate": 8.169954476479515e-06, "loss": 0.2403, "step": 777 }, { "epoch": 0.2248365071072117, "grad_norm": 1.9032166004180908, "learning_rate": 8.1608497723824e-06, "loss": 0.2194, "step": 780 }, { "epoch": 0.2257012629037779, "grad_norm": 11.293305397033691, "learning_rate": 8.151745068285282e-06, "loss": 0.276, "step": 783 }, { "epoch": 0.2265660187003441, "grad_norm": 2.2903361320495605, "learning_rate": 8.142640364188165e-06, "loss": 0.2089, "step": 786 }, { "epoch": 0.2274307744969103, "grad_norm": 1.6450647115707397, "learning_rate": 8.133535660091048e-06, "loss": 0.239, "step": 789 }, { "epoch": 0.2282955302934765, "grad_norm": 2.290724277496338, "learning_rate": 8.12443095599393e-06, "loss": 0.2509, "step": 792 }, { "epoch": 0.2291602860900427, "grad_norm": 2.6252450942993164, "learning_rate": 8.115326251896815e-06, "loss": 0.2578, "step": 795 }, { "epoch": 0.23002504188660888, "grad_norm": 4.217925071716309, "learning_rate": 8.106221547799698e-06, "loss": 0.2604, "step": 798 }, { "epoch": 0.23060154575098638, "eval_loss": 0.23178604245185852, "eval_mse": 0.2317860303344205, "eval_runtime": 6.6317, "eval_samples_per_second": 150.79, "eval_steps_per_second": 18.849, "step": 800 }, { "epoch": 0.2308897976831751, "grad_norm": 2.014478921890259, "learning_rate": 8.097116843702581e-06, "loss": 0.238, "step": 801 }, { "epoch": 0.2317545534797413, "grad_norm": 3.1536102294921875, "learning_rate": 8.088012139605464e-06, "loss": 0.2236, "step": 804 }, { "epoch": 0.2326193092763075, "grad_norm": 2.094320297241211, "learning_rate": 8.078907435508347e-06, "loss": 0.2536, "step": 807 }, { "epoch": 0.2334840650728737, "grad_norm": 1.7041524648666382, "learning_rate": 8.06980273141123e-06, "loss": 0.2243, "step": 810 }, { "epoch": 0.2343488208694399, "grad_norm": 4.586849689483643, "learning_rate": 8.060698027314114e-06, "loss": 0.2276, "step": 813 }, { "epoch": 0.2352135766660061, "grad_norm": 1.8360718488693237, "learning_rate": 8.051593323216997e-06, "loss": 0.2299, "step": 816 }, { "epoch": 0.2360783324625723, "grad_norm": 2.283907651901245, "learning_rate": 8.04248861911988e-06, "loss": 0.2996, "step": 819 }, { "epoch": 0.23694308825913848, "grad_norm": 2.762160301208496, "learning_rate": 8.033383915022763e-06, "loss": 0.2474, "step": 822 }, { "epoch": 0.23780784405570468, "grad_norm": 2.776780366897583, "learning_rate": 8.024279210925645e-06, "loss": 0.2394, "step": 825 }, { "epoch": 0.23867259985227088, "grad_norm": 1.8678719997406006, "learning_rate": 8.01517450682853e-06, "loss": 0.2732, "step": 828 }, { "epoch": 0.23953735564883707, "grad_norm": 1.6664822101593018, "learning_rate": 8.006069802731413e-06, "loss": 0.245, "step": 831 }, { "epoch": 0.24040211144540327, "grad_norm": 2.244666814804077, "learning_rate": 7.996965098634296e-06, "loss": 0.2587, "step": 834 }, { "epoch": 0.24126686724196947, "grad_norm": 1.8958755731582642, "learning_rate": 7.987860394537178e-06, "loss": 0.2632, "step": 837 }, { "epoch": 0.2421316230385357, "grad_norm": 1.7408092021942139, "learning_rate": 7.978755690440061e-06, "loss": 0.2475, "step": 840 }, { "epoch": 0.2429963788351019, "grad_norm": 4.280789375305176, "learning_rate": 7.969650986342944e-06, "loss": 0.2478, "step": 843 }, { "epoch": 0.2438611346316681, "grad_norm": 3.1445536613464355, "learning_rate": 7.960546282245829e-06, "loss": 0.2412, "step": 846 }, { "epoch": 0.24472589042823428, "grad_norm": 1.806154727935791, "learning_rate": 7.951441578148712e-06, "loss": 0.2772, "step": 849 }, { "epoch": 0.24559064622480048, "grad_norm": 2.123932123184204, "learning_rate": 7.942336874051594e-06, "loss": 0.2535, "step": 852 }, { "epoch": 0.24645540202136668, "grad_norm": 1.488558292388916, "learning_rate": 7.933232169954477e-06, "loss": 0.2342, "step": 855 }, { "epoch": 0.24732015781793287, "grad_norm": 3.2414135932922363, "learning_rate": 7.92412746585736e-06, "loss": 0.2536, "step": 858 }, { "epoch": 0.24818491361449907, "grad_norm": 1.952009916305542, "learning_rate": 7.915022761760245e-06, "loss": 0.2028, "step": 861 }, { "epoch": 0.24904966941106527, "grad_norm": 1.7001383304595947, "learning_rate": 7.905918057663127e-06, "loss": 0.2791, "step": 864 }, { "epoch": 0.24991442520763146, "grad_norm": 2.1715829372406006, "learning_rate": 7.89681335356601e-06, "loss": 0.2211, "step": 867 }, { "epoch": 0.25077918100419766, "grad_norm": 2.4868202209472656, "learning_rate": 7.887708649468893e-06, "loss": 0.2599, "step": 870 }, { "epoch": 0.25164393680076386, "grad_norm": 1.4740101099014282, "learning_rate": 7.878603945371776e-06, "loss": 0.2076, "step": 873 }, { "epoch": 0.25250869259733005, "grad_norm": 2.241642951965332, "learning_rate": 7.869499241274659e-06, "loss": 0.2413, "step": 876 }, { "epoch": 0.25337344839389625, "grad_norm": 4.3130784034729, "learning_rate": 7.860394537177543e-06, "loss": 0.2495, "step": 879 }, { "epoch": 0.25423820419046245, "grad_norm": 6.674787998199463, "learning_rate": 7.851289833080426e-06, "loss": 0.2396, "step": 882 }, { "epoch": 0.25510295998702864, "grad_norm": 2.463395118713379, "learning_rate": 7.842185128983309e-06, "loss": 0.2304, "step": 885 }, { "epoch": 0.25596771578359484, "grad_norm": 4.519803047180176, "learning_rate": 7.833080424886192e-06, "loss": 0.2603, "step": 888 }, { "epoch": 0.25683247158016104, "grad_norm": 1.3876014947891235, "learning_rate": 7.823975720789075e-06, "loss": 0.2282, "step": 891 }, { "epoch": 0.25769722737672723, "grad_norm": 2.0575900077819824, "learning_rate": 7.814871016691958e-06, "loss": 0.2286, "step": 894 }, { "epoch": 0.2585619831732935, "grad_norm": 4.538529872894287, "learning_rate": 7.805766312594842e-06, "loss": 0.2606, "step": 897 }, { "epoch": 0.2594267389698597, "grad_norm": 3.036686658859253, "learning_rate": 7.796661608497725e-06, "loss": 0.2961, "step": 900 }, { "epoch": 0.2594267389698597, "eval_loss": 0.2185884416103363, "eval_mse": 0.21858844196051358, "eval_runtime": 6.596, "eval_samples_per_second": 151.607, "eval_steps_per_second": 18.951, "step": 900 }, { "epoch": 0.2602914947664259, "grad_norm": 2.319840669631958, "learning_rate": 7.787556904400608e-06, "loss": 0.2493, "step": 903 }, { "epoch": 0.2611562505629921, "grad_norm": 4.723567485809326, "learning_rate": 7.77845220030349e-06, "loss": 0.2236, "step": 906 }, { "epoch": 0.2620210063595583, "grad_norm": 2.231250286102295, "learning_rate": 7.769347496206373e-06, "loss": 0.2766, "step": 909 }, { "epoch": 0.26288576215612447, "grad_norm": 2.3697762489318848, "learning_rate": 7.760242792109256e-06, "loss": 0.2319, "step": 912 }, { "epoch": 0.26375051795269067, "grad_norm": 2.0771355628967285, "learning_rate": 7.75113808801214e-06, "loss": 0.246, "step": 915 }, { "epoch": 0.26461527374925686, "grad_norm": 2.4788358211517334, "learning_rate": 7.742033383915024e-06, "loss": 0.2163, "step": 918 }, { "epoch": 0.26548002954582306, "grad_norm": 1.5741719007492065, "learning_rate": 7.732928679817907e-06, "loss": 0.2308, "step": 921 }, { "epoch": 0.26634478534238926, "grad_norm": 2.12677001953125, "learning_rate": 7.72382397572079e-06, "loss": 0.2304, "step": 924 }, { "epoch": 0.26720954113895545, "grad_norm": 1.9022995233535767, "learning_rate": 7.714719271623672e-06, "loss": 0.2454, "step": 927 }, { "epoch": 0.26807429693552165, "grad_norm": 3.3529253005981445, "learning_rate": 7.705614567526557e-06, "loss": 0.2137, "step": 930 }, { "epoch": 0.26893905273208785, "grad_norm": 1.935281753540039, "learning_rate": 7.69650986342944e-06, "loss": 0.2581, "step": 933 }, { "epoch": 0.26980380852865404, "grad_norm": 2.14315128326416, "learning_rate": 7.687405159332322e-06, "loss": 0.2317, "step": 936 }, { "epoch": 0.27066856432522024, "grad_norm": 2.028090238571167, "learning_rate": 7.678300455235205e-06, "loss": 0.2379, "step": 939 }, { "epoch": 0.27153332012178644, "grad_norm": 2.918959379196167, "learning_rate": 7.669195751138088e-06, "loss": 0.2708, "step": 942 }, { "epoch": 0.27239807591835263, "grad_norm": 4.042644500732422, "learning_rate": 7.660091047040971e-06, "loss": 0.3112, "step": 945 }, { "epoch": 0.27326283171491883, "grad_norm": 1.8345041275024414, "learning_rate": 7.650986342943855e-06, "loss": 0.267, "step": 948 }, { "epoch": 0.274127587511485, "grad_norm": 5.901050567626953, "learning_rate": 7.641881638846738e-06, "loss": 0.272, "step": 951 }, { "epoch": 0.2749923433080512, "grad_norm": 2.413675546646118, "learning_rate": 7.632776934749621e-06, "loss": 0.2906, "step": 954 }, { "epoch": 0.2758570991046174, "grad_norm": 2.699126958847046, "learning_rate": 7.623672230652505e-06, "loss": 0.2201, "step": 957 }, { "epoch": 0.2767218549011836, "grad_norm": 1.3186564445495605, "learning_rate": 7.614567526555388e-06, "loss": 0.227, "step": 960 }, { "epoch": 0.2775866106977498, "grad_norm": 7.576478958129883, "learning_rate": 7.6054628224582705e-06, "loss": 0.2569, "step": 963 }, { "epoch": 0.278451366494316, "grad_norm": 1.853145718574524, "learning_rate": 7.596358118361153e-06, "loss": 0.2586, "step": 966 }, { "epoch": 0.2793161222908822, "grad_norm": 3.2667322158813477, "learning_rate": 7.587253414264037e-06, "loss": 0.2483, "step": 969 }, { "epoch": 0.2801808780874484, "grad_norm": 2.7756762504577637, "learning_rate": 7.578148710166921e-06, "loss": 0.2724, "step": 972 }, { "epoch": 0.28104563388401466, "grad_norm": 2.138936758041382, "learning_rate": 7.569044006069804e-06, "loss": 0.2517, "step": 975 }, { "epoch": 0.28191038968058085, "grad_norm": 1.9090849161148071, "learning_rate": 7.5599393019726864e-06, "loss": 0.2608, "step": 978 }, { "epoch": 0.28277514547714705, "grad_norm": 2.6184728145599365, "learning_rate": 7.550834597875569e-06, "loss": 0.2728, "step": 981 }, { "epoch": 0.28363990127371325, "grad_norm": 1.5910395383834839, "learning_rate": 7.541729893778453e-06, "loss": 0.2144, "step": 984 }, { "epoch": 0.28450465707027944, "grad_norm": 2.3524558544158936, "learning_rate": 7.532625189681337e-06, "loss": 0.23, "step": 987 }, { "epoch": 0.28536941286684564, "grad_norm": 3.056361675262451, "learning_rate": 7.5235204855842195e-06, "loss": 0.2397, "step": 990 }, { "epoch": 0.28623416866341184, "grad_norm": 3.4158847332000732, "learning_rate": 7.514415781487102e-06, "loss": 0.2712, "step": 993 }, { "epoch": 0.28709892445997803, "grad_norm": 3.3620333671569824, "learning_rate": 7.505311077389985e-06, "loss": 0.2107, "step": 996 }, { "epoch": 0.28796368025654423, "grad_norm": 3.508890390396118, "learning_rate": 7.496206373292868e-06, "loss": 0.2453, "step": 999 }, { "epoch": 0.28825193218873296, "eval_loss": 0.21684841811656952, "eval_mse": 0.21684841979760677, "eval_runtime": 6.6234, "eval_samples_per_second": 150.98, "eval_steps_per_second": 18.872, "step": 1000 }, { "epoch": 0.2888284360531104, "grad_norm": 2.658740997314453, "learning_rate": 7.487101669195752e-06, "loss": 0.2319, "step": 1002 }, { "epoch": 0.2896931918496766, "grad_norm": 1.370540976524353, "learning_rate": 7.477996965098635e-06, "loss": 0.2443, "step": 1005 }, { "epoch": 0.2905579476462428, "grad_norm": 8.996585845947266, "learning_rate": 7.468892261001518e-06, "loss": 0.2859, "step": 1008 }, { "epoch": 0.291422703442809, "grad_norm": 3.880053758621216, "learning_rate": 7.459787556904401e-06, "loss": 0.2913, "step": 1011 }, { "epoch": 0.2922874592393752, "grad_norm": 4.735537052154541, "learning_rate": 7.450682852807284e-06, "loss": 0.258, "step": 1014 }, { "epoch": 0.2931522150359414, "grad_norm": 1.7119396924972534, "learning_rate": 7.441578148710168e-06, "loss": 0.2397, "step": 1017 }, { "epoch": 0.2940169708325076, "grad_norm": 3.4769861698150635, "learning_rate": 7.4324734446130505e-06, "loss": 0.209, "step": 1020 }, { "epoch": 0.2948817266290738, "grad_norm": 2.3741278648376465, "learning_rate": 7.423368740515934e-06, "loss": 0.2627, "step": 1023 }, { "epoch": 0.29574648242564, "grad_norm": 1.5303798913955688, "learning_rate": 7.414264036418817e-06, "loss": 0.2352, "step": 1026 }, { "epoch": 0.2966112382222062, "grad_norm": 1.63661789894104, "learning_rate": 7.4051593323217e-06, "loss": 0.2448, "step": 1029 }, { "epoch": 0.2974759940187724, "grad_norm": 1.531538724899292, "learning_rate": 7.3960546282245835e-06, "loss": 0.23, "step": 1032 }, { "epoch": 0.2983407498153386, "grad_norm": 1.3936281204223633, "learning_rate": 7.386949924127466e-06, "loss": 0.2367, "step": 1035 }, { "epoch": 0.2992055056119048, "grad_norm": 4.795119762420654, "learning_rate": 7.377845220030349e-06, "loss": 0.2621, "step": 1038 }, { "epoch": 0.300070261408471, "grad_norm": 3.725170135498047, "learning_rate": 7.368740515933233e-06, "loss": 0.2708, "step": 1041 }, { "epoch": 0.3009350172050372, "grad_norm": 1.5772522687911987, "learning_rate": 7.359635811836116e-06, "loss": 0.1897, "step": 1044 }, { "epoch": 0.3017997730016034, "grad_norm": 4.716231822967529, "learning_rate": 7.3505311077389994e-06, "loss": 0.2297, "step": 1047 }, { "epoch": 0.3026645287981696, "grad_norm": 3.9254777431488037, "learning_rate": 7.341426403641882e-06, "loss": 0.2285, "step": 1050 }, { "epoch": 0.3035292845947358, "grad_norm": 2.7238216400146484, "learning_rate": 7.332321699544765e-06, "loss": 0.2488, "step": 1053 }, { "epoch": 0.304394040391302, "grad_norm": 7.061728477478027, "learning_rate": 7.323216995447649e-06, "loss": 0.2329, "step": 1056 }, { "epoch": 0.3052587961878682, "grad_norm": 2.2323367595672607, "learning_rate": 7.314112291350532e-06, "loss": 0.2511, "step": 1059 }, { "epoch": 0.3061235519844344, "grad_norm": 3.432279586791992, "learning_rate": 7.305007587253415e-06, "loss": 0.251, "step": 1062 }, { "epoch": 0.3069883077810006, "grad_norm": 4.372462272644043, "learning_rate": 7.295902883156298e-06, "loss": 0.2306, "step": 1065 }, { "epoch": 0.3078530635775668, "grad_norm": 3.428677558898926, "learning_rate": 7.286798179059181e-06, "loss": 0.2389, "step": 1068 }, { "epoch": 0.308717819374133, "grad_norm": 1.7803095579147339, "learning_rate": 7.277693474962064e-06, "loss": 0.2374, "step": 1071 }, { "epoch": 0.3095825751706992, "grad_norm": 1.7453327178955078, "learning_rate": 7.2685887708649476e-06, "loss": 0.2175, "step": 1074 }, { "epoch": 0.3104473309672654, "grad_norm": 3.4764058589935303, "learning_rate": 7.25948406676783e-06, "loss": 0.2649, "step": 1077 }, { "epoch": 0.3113120867638316, "grad_norm": 2.5065643787384033, "learning_rate": 7.250379362670714e-06, "loss": 0.2436, "step": 1080 }, { "epoch": 0.3121768425603978, "grad_norm": 2.4735498428344727, "learning_rate": 7.241274658573597e-06, "loss": 0.2565, "step": 1083 }, { "epoch": 0.313041598356964, "grad_norm": 7.107683181762695, "learning_rate": 7.23216995447648e-06, "loss": 0.2691, "step": 1086 }, { "epoch": 0.3139063541535302, "grad_norm": 1.9517550468444824, "learning_rate": 7.223065250379363e-06, "loss": 0.2869, "step": 1089 }, { "epoch": 0.3147711099500964, "grad_norm": 3.1179702281951904, "learning_rate": 7.213960546282246e-06, "loss": 0.2434, "step": 1092 }, { "epoch": 0.3156358657466626, "grad_norm": 4.619999885559082, "learning_rate": 7.20485584218513e-06, "loss": 0.2469, "step": 1095 }, { "epoch": 0.3165006215432288, "grad_norm": 6.7724528312683105, "learning_rate": 7.195751138088013e-06, "loss": 0.278, "step": 1098 }, { "epoch": 0.31707712540760624, "eval_loss": 0.2247343808412552, "eval_mse": 0.22473438137583435, "eval_runtime": 6.5256, "eval_samples_per_second": 153.242, "eval_steps_per_second": 19.155, "step": 1100 }, { "epoch": 0.317365377339795, "grad_norm": 1.6179205179214478, "learning_rate": 7.186646433990896e-06, "loss": 0.252, "step": 1101 }, { "epoch": 0.31823013313636117, "grad_norm": 1.8803197145462036, "learning_rate": 7.1775417298937785e-06, "loss": 0.2227, "step": 1104 }, { "epoch": 0.31909488893292737, "grad_norm": 2.2427573204040527, "learning_rate": 7.168437025796661e-06, "loss": 0.2399, "step": 1107 }, { "epoch": 0.31995964472949356, "grad_norm": 1.907244086265564, "learning_rate": 7.159332321699546e-06, "loss": 0.2162, "step": 1110 }, { "epoch": 0.32082440052605976, "grad_norm": 3.7878000736236572, "learning_rate": 7.150227617602429e-06, "loss": 0.246, "step": 1113 }, { "epoch": 0.32168915632262596, "grad_norm": 1.9053196907043457, "learning_rate": 7.141122913505312e-06, "loss": 0.2684, "step": 1116 }, { "epoch": 0.32255391211919215, "grad_norm": 5.108983039855957, "learning_rate": 7.1320182094081944e-06, "loss": 0.2237, "step": 1119 }, { "epoch": 0.32341866791575835, "grad_norm": 2.2469422817230225, "learning_rate": 7.122913505311077e-06, "loss": 0.2462, "step": 1122 }, { "epoch": 0.32428342371232455, "grad_norm": 5.127351760864258, "learning_rate": 7.113808801213962e-06, "loss": 0.2543, "step": 1125 }, { "epoch": 0.32514817950889074, "grad_norm": 4.980170249938965, "learning_rate": 7.104704097116845e-06, "loss": 0.2775, "step": 1128 }, { "epoch": 0.326012935305457, "grad_norm": 3.701903820037842, "learning_rate": 7.0955993930197275e-06, "loss": 0.2548, "step": 1131 }, { "epoch": 0.3268776911020232, "grad_norm": 3.780144214630127, "learning_rate": 7.08649468892261e-06, "loss": 0.2687, "step": 1134 }, { "epoch": 0.3277424468985894, "grad_norm": 2.2161099910736084, "learning_rate": 7.077389984825493e-06, "loss": 0.2351, "step": 1137 }, { "epoch": 0.3286072026951556, "grad_norm": 4.7017998695373535, "learning_rate": 7.068285280728376e-06, "loss": 0.2431, "step": 1140 }, { "epoch": 0.3294719584917218, "grad_norm": 2.053750991821289, "learning_rate": 7.0591805766312606e-06, "loss": 0.254, "step": 1143 }, { "epoch": 0.330336714288288, "grad_norm": 2.8078341484069824, "learning_rate": 7.050075872534143e-06, "loss": 0.2646, "step": 1146 }, { "epoch": 0.3312014700848542, "grad_norm": 2.585087776184082, "learning_rate": 7.040971168437026e-06, "loss": 0.2536, "step": 1149 }, { "epoch": 0.3320662258814204, "grad_norm": 4.2963104248046875, "learning_rate": 7.031866464339909e-06, "loss": 0.2276, "step": 1152 }, { "epoch": 0.33293098167798657, "grad_norm": 4.205751419067383, "learning_rate": 7.022761760242792e-06, "loss": 0.2404, "step": 1155 }, { "epoch": 0.33379573747455277, "grad_norm": 3.460796356201172, "learning_rate": 7.0136570561456765e-06, "loss": 0.2473, "step": 1158 }, { "epoch": 0.33466049327111896, "grad_norm": 3.529181957244873, "learning_rate": 7.004552352048559e-06, "loss": 0.24, "step": 1161 }, { "epoch": 0.33552524906768516, "grad_norm": 3.203437328338623, "learning_rate": 6.995447647951442e-06, "loss": 0.2547, "step": 1164 }, { "epoch": 0.33639000486425136, "grad_norm": 4.1535491943359375, "learning_rate": 6.986342943854325e-06, "loss": 0.2532, "step": 1167 }, { "epoch": 0.33725476066081755, "grad_norm": 3.7933478355407715, "learning_rate": 6.977238239757208e-06, "loss": 0.2435, "step": 1170 }, { "epoch": 0.33811951645738375, "grad_norm": 2.6703147888183594, "learning_rate": 6.968133535660092e-06, "loss": 0.2293, "step": 1173 }, { "epoch": 0.33898427225394995, "grad_norm": 2.8900182247161865, "learning_rate": 6.959028831562975e-06, "loss": 0.2163, "step": 1176 }, { "epoch": 0.33984902805051614, "grad_norm": 3.9563350677490234, "learning_rate": 6.949924127465858e-06, "loss": 0.2682, "step": 1179 }, { "epoch": 0.34071378384708234, "grad_norm": 1.8461941480636597, "learning_rate": 6.940819423368741e-06, "loss": 0.2293, "step": 1182 }, { "epoch": 0.34157853964364854, "grad_norm": 3.313368082046509, "learning_rate": 6.931714719271624e-06, "loss": 0.2436, "step": 1185 }, { "epoch": 0.34244329544021473, "grad_norm": 1.7820873260498047, "learning_rate": 6.922610015174508e-06, "loss": 0.212, "step": 1188 }, { "epoch": 0.34330805123678093, "grad_norm": 1.995291829109192, "learning_rate": 6.913505311077391e-06, "loss": 0.2243, "step": 1191 }, { "epoch": 0.34417280703334713, "grad_norm": 2.928727626800537, "learning_rate": 6.904400606980274e-06, "loss": 0.2296, "step": 1194 }, { "epoch": 0.3450375628299133, "grad_norm": 2.5598068237304688, "learning_rate": 6.895295902883157e-06, "loss": 0.2135, "step": 1197 }, { "epoch": 0.3459023186264795, "grad_norm": 2.4700326919555664, "learning_rate": 6.88619119878604e-06, "loss": 0.2319, "step": 1200 }, { "epoch": 0.3459023186264795, "eval_loss": 0.21415139734745026, "eval_mse": 0.2141513990436215, "eval_runtime": 6.5018, "eval_samples_per_second": 153.803, "eval_steps_per_second": 19.225, "step": 1200 }, { "epoch": 0.3467670744230457, "grad_norm": 3.6825876235961914, "learning_rate": 6.8770864946889225e-06, "loss": 0.2409, "step": 1203 }, { "epoch": 0.3476318302196119, "grad_norm": 2.8780245780944824, "learning_rate": 6.867981790591807e-06, "loss": 0.2219, "step": 1206 }, { "epoch": 0.34849658601617817, "grad_norm": 2.641505479812622, "learning_rate": 6.85887708649469e-06, "loss": 0.2315, "step": 1209 }, { "epoch": 0.34936134181274436, "grad_norm": 1.660909652709961, "learning_rate": 6.849772382397573e-06, "loss": 0.224, "step": 1212 }, { "epoch": 0.35022609760931056, "grad_norm": 5.104085445404053, "learning_rate": 6.8406676783004556e-06, "loss": 0.2184, "step": 1215 }, { "epoch": 0.35109085340587676, "grad_norm": 1.870938777923584, "learning_rate": 6.831562974203338e-06, "loss": 0.234, "step": 1218 }, { "epoch": 0.35195560920244295, "grad_norm": 4.542322635650635, "learning_rate": 6.822458270106223e-06, "loss": 0.2247, "step": 1221 }, { "epoch": 0.35282036499900915, "grad_norm": 1.4236701726913452, "learning_rate": 6.813353566009106e-06, "loss": 0.2168, "step": 1224 }, { "epoch": 0.35368512079557535, "grad_norm": 2.0418410301208496, "learning_rate": 6.804248861911989e-06, "loss": 0.2425, "step": 1227 }, { "epoch": 0.35454987659214154, "grad_norm": 2.868399143218994, "learning_rate": 6.7951441578148715e-06, "loss": 0.2354, "step": 1230 }, { "epoch": 0.35541463238870774, "grad_norm": 2.453361749649048, "learning_rate": 6.786039453717754e-06, "loss": 0.2266, "step": 1233 }, { "epoch": 0.35627938818527394, "grad_norm": 2.0826542377471924, "learning_rate": 6.776934749620638e-06, "loss": 0.2194, "step": 1236 }, { "epoch": 0.35714414398184013, "grad_norm": 1.511440634727478, "learning_rate": 6.767830045523522e-06, "loss": 0.2264, "step": 1239 }, { "epoch": 0.35800889977840633, "grad_norm": 2.456897020339966, "learning_rate": 6.7587253414264045e-06, "loss": 0.2864, "step": 1242 }, { "epoch": 0.3588736555749725, "grad_norm": 2.873429298400879, "learning_rate": 6.749620637329287e-06, "loss": 0.2055, "step": 1245 }, { "epoch": 0.3597384113715387, "grad_norm": 1.7501113414764404, "learning_rate": 6.74051593323217e-06, "loss": 0.2205, "step": 1248 }, { "epoch": 0.3606031671681049, "grad_norm": 1.615105390548706, "learning_rate": 6.731411229135054e-06, "loss": 0.2337, "step": 1251 }, { "epoch": 0.3614679229646711, "grad_norm": 1.6872748136520386, "learning_rate": 6.722306525037937e-06, "loss": 0.2164, "step": 1254 }, { "epoch": 0.3623326787612373, "grad_norm": 3.1001312732696533, "learning_rate": 6.7132018209408204e-06, "loss": 0.2154, "step": 1257 }, { "epoch": 0.3631974345578035, "grad_norm": 2.1165292263031006, "learning_rate": 6.704097116843703e-06, "loss": 0.2512, "step": 1260 }, { "epoch": 0.3640621903543697, "grad_norm": 4.326318740844727, "learning_rate": 6.694992412746586e-06, "loss": 0.2471, "step": 1263 }, { "epoch": 0.3649269461509359, "grad_norm": 5.773290634155273, "learning_rate": 6.685887708649469e-06, "loss": 0.2407, "step": 1266 }, { "epoch": 0.3657917019475021, "grad_norm": 3.2723119258880615, "learning_rate": 6.676783004552353e-06, "loss": 0.3129, "step": 1269 }, { "epoch": 0.3666564577440683, "grad_norm": 2.927086114883423, "learning_rate": 6.6676783004552355e-06, "loss": 0.231, "step": 1272 }, { "epoch": 0.3675212135406345, "grad_norm": 1.7322252988815308, "learning_rate": 6.658573596358119e-06, "loss": 0.2504, "step": 1275 }, { "epoch": 0.3683859693372007, "grad_norm": 2.5904715061187744, "learning_rate": 6.649468892261002e-06, "loss": 0.2079, "step": 1278 }, { "epoch": 0.3692507251337669, "grad_norm": 2.6561062335968018, "learning_rate": 6.640364188163885e-06, "loss": 0.2423, "step": 1281 }, { "epoch": 0.3701154809303331, "grad_norm": 3.3299241065979004, "learning_rate": 6.6312594840667686e-06, "loss": 0.2386, "step": 1284 }, { "epoch": 0.37098023672689934, "grad_norm": 1.731477975845337, "learning_rate": 6.622154779969651e-06, "loss": 0.2017, "step": 1287 }, { "epoch": 0.37184499252346553, "grad_norm": 2.5077965259552, "learning_rate": 6.613050075872534e-06, "loss": 0.2446, "step": 1290 }, { "epoch": 0.37270974832003173, "grad_norm": 2.36556077003479, "learning_rate": 6.603945371775418e-06, "loss": 0.2544, "step": 1293 }, { "epoch": 0.3735745041165979, "grad_norm": 1.9789232015609741, "learning_rate": 6.594840667678301e-06, "loss": 0.2247, "step": 1296 }, { "epoch": 0.3744392599131641, "grad_norm": 1.7310489416122437, "learning_rate": 6.5857359635811845e-06, "loss": 0.1983, "step": 1299 }, { "epoch": 0.37472751184535286, "eval_loss": 0.2175411880016327, "eval_mse": 0.21754117820138344, "eval_runtime": 7.1272, "eval_samples_per_second": 140.308, "eval_steps_per_second": 17.538, "step": 1300 }, { "epoch": 0.3753040157097303, "grad_norm": 4.152148723602295, "learning_rate": 6.576631259484067e-06, "loss": 0.2899, "step": 1302 }, { "epoch": 0.3761687715062965, "grad_norm": 2.1581296920776367, "learning_rate": 6.56752655538695e-06, "loss": 0.2448, "step": 1305 }, { "epoch": 0.3770335273028627, "grad_norm": 2.8600330352783203, "learning_rate": 6.558421851289834e-06, "loss": 0.2084, "step": 1308 }, { "epoch": 0.3778982830994289, "grad_norm": 1.5723686218261719, "learning_rate": 6.549317147192717e-06, "loss": 0.2441, "step": 1311 }, { "epoch": 0.3787630388959951, "grad_norm": 3.4149153232574463, "learning_rate": 6.5402124430956e-06, "loss": 0.247, "step": 1314 }, { "epoch": 0.3796277946925613, "grad_norm": 16.35414695739746, "learning_rate": 6.531107738998483e-06, "loss": 0.2478, "step": 1317 }, { "epoch": 0.3804925504891275, "grad_norm": 6.227761268615723, "learning_rate": 6.522003034901366e-06, "loss": 0.2316, "step": 1320 }, { "epoch": 0.3813573062856937, "grad_norm": 2.2669637203216553, "learning_rate": 6.512898330804249e-06, "loss": 0.2836, "step": 1323 }, { "epoch": 0.3822220620822599, "grad_norm": 1.4385027885437012, "learning_rate": 6.503793626707133e-06, "loss": 0.2122, "step": 1326 }, { "epoch": 0.3830868178788261, "grad_norm": 2.3909130096435547, "learning_rate": 6.4946889226100154e-06, "loss": 0.2357, "step": 1329 }, { "epoch": 0.3839515736753923, "grad_norm": 1.7610464096069336, "learning_rate": 6.485584218512899e-06, "loss": 0.2746, "step": 1332 }, { "epoch": 0.3848163294719585, "grad_norm": 2.8983278274536133, "learning_rate": 6.476479514415782e-06, "loss": 0.2176, "step": 1335 }, { "epoch": 0.3856810852685247, "grad_norm": 1.7231597900390625, "learning_rate": 6.467374810318665e-06, "loss": 0.2065, "step": 1338 }, { "epoch": 0.3865458410650909, "grad_norm": 1.6913188695907593, "learning_rate": 6.458270106221548e-06, "loss": 0.2541, "step": 1341 }, { "epoch": 0.3874105968616571, "grad_norm": 2.1574645042419434, "learning_rate": 6.449165402124431e-06, "loss": 0.2393, "step": 1344 }, { "epoch": 0.38827535265822327, "grad_norm": 3.8315231800079346, "learning_rate": 6.440060698027315e-06, "loss": 0.2353, "step": 1347 }, { "epoch": 0.38914010845478947, "grad_norm": 2.655545711517334, "learning_rate": 6.430955993930198e-06, "loss": 0.2222, "step": 1350 }, { "epoch": 0.39000486425135567, "grad_norm": 2.0346477031707764, "learning_rate": 6.421851289833081e-06, "loss": 0.2289, "step": 1353 }, { "epoch": 0.39086962004792186, "grad_norm": 4.726449489593506, "learning_rate": 6.4127465857359636e-06, "loss": 0.2748, "step": 1356 }, { "epoch": 0.39173437584448806, "grad_norm": 3.0121731758117676, "learning_rate": 6.403641881638846e-06, "loss": 0.2244, "step": 1359 }, { "epoch": 0.39259913164105426, "grad_norm": 6.562178611755371, "learning_rate": 6.394537177541731e-06, "loss": 0.264, "step": 1362 }, { "epoch": 0.3934638874376205, "grad_norm": 3.1092755794525146, "learning_rate": 6.385432473444614e-06, "loss": 0.228, "step": 1365 }, { "epoch": 0.3943286432341867, "grad_norm": 4.431830406188965, "learning_rate": 6.376327769347497e-06, "loss": 0.2544, "step": 1368 }, { "epoch": 0.3951933990307529, "grad_norm": 2.545694351196289, "learning_rate": 6.3672230652503795e-06, "loss": 0.2618, "step": 1371 }, { "epoch": 0.3960581548273191, "grad_norm": 1.5483859777450562, "learning_rate": 6.358118361153262e-06, "loss": 0.282, "step": 1374 }, { "epoch": 0.3969229106238853, "grad_norm": 1.750784993171692, "learning_rate": 6.349013657056147e-06, "loss": 0.2447, "step": 1377 }, { "epoch": 0.3977876664204515, "grad_norm": 2.440020799636841, "learning_rate": 6.33990895295903e-06, "loss": 0.2472, "step": 1380 }, { "epoch": 0.3986524222170177, "grad_norm": 1.4584132432937622, "learning_rate": 6.3308042488619125e-06, "loss": 0.2469, "step": 1383 }, { "epoch": 0.3995171780135839, "grad_norm": 1.3845562934875488, "learning_rate": 6.321699544764795e-06, "loss": 0.2111, "step": 1386 }, { "epoch": 0.4003819338101501, "grad_norm": 1.736345648765564, "learning_rate": 6.312594840667678e-06, "loss": 0.2203, "step": 1389 }, { "epoch": 0.4012466896067163, "grad_norm": 2.5879809856414795, "learning_rate": 6.303490136570563e-06, "loss": 0.2776, "step": 1392 }, { "epoch": 0.4021114454032825, "grad_norm": 1.3224149942398071, "learning_rate": 6.294385432473446e-06, "loss": 0.2016, "step": 1395 }, { "epoch": 0.40297620119984867, "grad_norm": 2.7825825214385986, "learning_rate": 6.2852807283763284e-06, "loss": 0.2264, "step": 1398 }, { "epoch": 0.40355270506422614, "eval_loss": 0.23064111173152924, "eval_mse": 0.2306411211611703, "eval_runtime": 6.653, "eval_samples_per_second": 150.308, "eval_steps_per_second": 18.789, "step": 1400 }, { "epoch": 0.40384095699641487, "grad_norm": 3.059859275817871, "learning_rate": 6.276176024279211e-06, "loss": 0.1952, "step": 1401 }, { "epoch": 0.40470571279298106, "grad_norm": 2.1431009769439697, "learning_rate": 6.267071320182094e-06, "loss": 0.2681, "step": 1404 }, { "epoch": 0.40557046858954726, "grad_norm": 1.6716617345809937, "learning_rate": 6.257966616084977e-06, "loss": 0.221, "step": 1407 }, { "epoch": 0.40643522438611346, "grad_norm": 1.9646525382995605, "learning_rate": 6.2488619119878615e-06, "loss": 0.2218, "step": 1410 }, { "epoch": 0.40729998018267966, "grad_norm": 1.2912189960479736, "learning_rate": 6.239757207890744e-06, "loss": 0.2077, "step": 1413 }, { "epoch": 0.40816473597924585, "grad_norm": 2.4723434448242188, "learning_rate": 6.230652503793627e-06, "loss": 0.2629, "step": 1416 }, { "epoch": 0.40902949177581205, "grad_norm": 2.1053199768066406, "learning_rate": 6.22154779969651e-06, "loss": 0.2524, "step": 1419 }, { "epoch": 0.40989424757237825, "grad_norm": 2.039580821990967, "learning_rate": 6.212443095599393e-06, "loss": 0.2035, "step": 1422 }, { "epoch": 0.41075900336894444, "grad_norm": 1.499022364616394, "learning_rate": 6.203338391502277e-06, "loss": 0.2263, "step": 1425 }, { "epoch": 0.41162375916551064, "grad_norm": 2.090580701828003, "learning_rate": 6.19423368740516e-06, "loss": 0.2834, "step": 1428 }, { "epoch": 0.41248851496207684, "grad_norm": 2.0547232627868652, "learning_rate": 6.185128983308043e-06, "loss": 0.2725, "step": 1431 }, { "epoch": 0.41335327075864303, "grad_norm": 1.8254441022872925, "learning_rate": 6.176024279210926e-06, "loss": 0.2234, "step": 1434 }, { "epoch": 0.41421802655520923, "grad_norm": 1.860533595085144, "learning_rate": 6.166919575113809e-06, "loss": 0.2554, "step": 1437 }, { "epoch": 0.4150827823517754, "grad_norm": 3.225929021835327, "learning_rate": 6.157814871016693e-06, "loss": 0.2784, "step": 1440 }, { "epoch": 0.4159475381483417, "grad_norm": 2.2769436836242676, "learning_rate": 6.148710166919576e-06, "loss": 0.2261, "step": 1443 }, { "epoch": 0.4168122939449079, "grad_norm": 1.6467565298080444, "learning_rate": 6.139605462822459e-06, "loss": 0.2702, "step": 1446 }, { "epoch": 0.41767704974147407, "grad_norm": 3.0362329483032227, "learning_rate": 6.130500758725342e-06, "loss": 0.2348, "step": 1449 }, { "epoch": 0.41854180553804027, "grad_norm": 1.8852200508117676, "learning_rate": 6.121396054628225e-06, "loss": 0.2222, "step": 1452 }, { "epoch": 0.41940656133460646, "grad_norm": 2.119568109512329, "learning_rate": 6.112291350531108e-06, "loss": 0.226, "step": 1455 }, { "epoch": 0.42027131713117266, "grad_norm": 2.534950017929077, "learning_rate": 6.103186646433992e-06, "loss": 0.2353, "step": 1458 }, { "epoch": 0.42113607292773886, "grad_norm": 3.6363894939422607, "learning_rate": 6.094081942336875e-06, "loss": 0.2724, "step": 1461 }, { "epoch": 0.42200082872430505, "grad_norm": 1.8480486869812012, "learning_rate": 6.084977238239758e-06, "loss": 0.2417, "step": 1464 }, { "epoch": 0.42286558452087125, "grad_norm": 4.110941410064697, "learning_rate": 6.075872534142641e-06, "loss": 0.2061, "step": 1467 }, { "epoch": 0.42373034031743745, "grad_norm": 2.7998435497283936, "learning_rate": 6.0667678300455234e-06, "loss": 0.231, "step": 1470 }, { "epoch": 0.42459509611400365, "grad_norm": 1.7628705501556396, "learning_rate": 6.057663125948408e-06, "loss": 0.2193, "step": 1473 }, { "epoch": 0.42545985191056984, "grad_norm": 2.7976937294006348, "learning_rate": 6.048558421851291e-06, "loss": 0.2361, "step": 1476 }, { "epoch": 0.42632460770713604, "grad_norm": 2.4593019485473633, "learning_rate": 6.039453717754174e-06, "loss": 0.2494, "step": 1479 }, { "epoch": 0.42718936350370224, "grad_norm": 2.5946741104125977, "learning_rate": 6.0303490136570565e-06, "loss": 0.2279, "step": 1482 }, { "epoch": 0.42805411930026843, "grad_norm": 1.6827466487884521, "learning_rate": 6.021244309559939e-06, "loss": 0.2458, "step": 1485 }, { "epoch": 0.42891887509683463, "grad_norm": 4.625283241271973, "learning_rate": 6.012139605462823e-06, "loss": 0.2707, "step": 1488 }, { "epoch": 0.4297836308934008, "grad_norm": 2.733687400817871, "learning_rate": 6.003034901365707e-06, "loss": 0.2294, "step": 1491 }, { "epoch": 0.430648386689967, "grad_norm": 1.38575279712677, "learning_rate": 5.9939301972685896e-06, "loss": 0.1992, "step": 1494 }, { "epoch": 0.4315131424865332, "grad_norm": 1.9684631824493408, "learning_rate": 5.984825493171472e-06, "loss": 0.2118, "step": 1497 }, { "epoch": 0.4323778982830994, "grad_norm": 3.40984845161438, "learning_rate": 5.975720789074355e-06, "loss": 0.2175, "step": 1500 }, { "epoch": 0.4323778982830994, "eval_loss": 0.2375136762857437, "eval_mse": 0.23751368772797288, "eval_runtime": 6.7445, "eval_samples_per_second": 148.27, "eval_steps_per_second": 18.534, "step": 1500 }, { "epoch": 0.4332426540796656, "grad_norm": 2.3788678646087646, "learning_rate": 5.966616084977239e-06, "loss": 0.2311, "step": 1503 }, { "epoch": 0.4341074098762318, "grad_norm": 4.027227401733398, "learning_rate": 5.957511380880122e-06, "loss": 0.2456, "step": 1506 }, { "epoch": 0.434972165672798, "grad_norm": 5.0818586349487305, "learning_rate": 5.9484066767830055e-06, "loss": 0.2631, "step": 1509 }, { "epoch": 0.4358369214693642, "grad_norm": 4.373122215270996, "learning_rate": 5.939301972685888e-06, "loss": 0.243, "step": 1512 }, { "epoch": 0.4367016772659304, "grad_norm": 3.31792950630188, "learning_rate": 5.930197268588771e-06, "loss": 0.2278, "step": 1515 }, { "epoch": 0.4375664330624966, "grad_norm": 1.9427984952926636, "learning_rate": 5.921092564491655e-06, "loss": 0.2578, "step": 1518 }, { "epoch": 0.43843118885906285, "grad_norm": 2.5355935096740723, "learning_rate": 5.911987860394538e-06, "loss": 0.2125, "step": 1521 }, { "epoch": 0.43929594465562904, "grad_norm": 5.661628723144531, "learning_rate": 5.9028831562974205e-06, "loss": 0.2591, "step": 1524 }, { "epoch": 0.44016070045219524, "grad_norm": 2.133945941925049, "learning_rate": 5.893778452200304e-06, "loss": 0.2577, "step": 1527 }, { "epoch": 0.44102545624876144, "grad_norm": 2.8841874599456787, "learning_rate": 5.884673748103187e-06, "loss": 0.2344, "step": 1530 }, { "epoch": 0.44189021204532763, "grad_norm": 1.6562261581420898, "learning_rate": 5.87556904400607e-06, "loss": 0.2133, "step": 1533 }, { "epoch": 0.44275496784189383, "grad_norm": 3.133864164352417, "learning_rate": 5.866464339908954e-06, "loss": 0.2154, "step": 1536 }, { "epoch": 0.44361972363846003, "grad_norm": 1.9966986179351807, "learning_rate": 5.8573596358118364e-06, "loss": 0.1964, "step": 1539 }, { "epoch": 0.4444844794350262, "grad_norm": 1.9703294038772583, "learning_rate": 5.848254931714719e-06, "loss": 0.2238, "step": 1542 }, { "epoch": 0.4453492352315924, "grad_norm": 3.2984211444854736, "learning_rate": 5.839150227617603e-06, "loss": 0.2628, "step": 1545 }, { "epoch": 0.4462139910281586, "grad_norm": 3.6368560791015625, "learning_rate": 5.830045523520486e-06, "loss": 0.2567, "step": 1548 }, { "epoch": 0.4470787468247248, "grad_norm": 2.366480827331543, "learning_rate": 5.8209408194233695e-06, "loss": 0.2493, "step": 1551 }, { "epoch": 0.447943502621291, "grad_norm": 3.3239293098449707, "learning_rate": 5.811836115326252e-06, "loss": 0.2268, "step": 1554 }, { "epoch": 0.4488082584178572, "grad_norm": 4.618416786193848, "learning_rate": 5.802731411229135e-06, "loss": 0.2456, "step": 1557 }, { "epoch": 0.4496730142144234, "grad_norm": 2.826070785522461, "learning_rate": 5.793626707132019e-06, "loss": 0.2206, "step": 1560 }, { "epoch": 0.4505377700109896, "grad_norm": 4.012238025665283, "learning_rate": 5.784522003034902e-06, "loss": 0.2157, "step": 1563 }, { "epoch": 0.4514025258075558, "grad_norm": 2.0067975521087646, "learning_rate": 5.775417298937785e-06, "loss": 0.2489, "step": 1566 }, { "epoch": 0.452267281604122, "grad_norm": 1.500907301902771, "learning_rate": 5.766312594840668e-06, "loss": 0.2348, "step": 1569 }, { "epoch": 0.4531320374006882, "grad_norm": 1.8496533632278442, "learning_rate": 5.757207890743551e-06, "loss": 0.2019, "step": 1572 }, { "epoch": 0.4539967931972544, "grad_norm": 3.222740650177002, "learning_rate": 5.748103186646434e-06, "loss": 0.2587, "step": 1575 }, { "epoch": 0.4548615489938206, "grad_norm": 1.572925329208374, "learning_rate": 5.738998482549318e-06, "loss": 0.2265, "step": 1578 }, { "epoch": 0.4557263047903868, "grad_norm": 1.603624701499939, "learning_rate": 5.729893778452201e-06, "loss": 0.2433, "step": 1581 }, { "epoch": 0.456591060586953, "grad_norm": 3.0979552268981934, "learning_rate": 5.720789074355084e-06, "loss": 0.221, "step": 1584 }, { "epoch": 0.4574558163835192, "grad_norm": 1.8691914081573486, "learning_rate": 5.711684370257967e-06, "loss": 0.2083, "step": 1587 }, { "epoch": 0.4583205721800854, "grad_norm": 1.6643935441970825, "learning_rate": 5.70257966616085e-06, "loss": 0.2173, "step": 1590 }, { "epoch": 0.45918532797665157, "grad_norm": 3.626629590988159, "learning_rate": 5.693474962063733e-06, "loss": 0.2256, "step": 1593 }, { "epoch": 0.46005008377321777, "grad_norm": 1.6160587072372437, "learning_rate": 5.684370257966616e-06, "loss": 0.2197, "step": 1596 }, { "epoch": 0.460914839569784, "grad_norm": 3.206094264984131, "learning_rate": 5.6752655538695e-06, "loss": 0.2461, "step": 1599 }, { "epoch": 0.46120309150197275, "eval_loss": 0.24926815927028656, "eval_mse": 0.24926817585621028, "eval_runtime": 6.5749, "eval_samples_per_second": 152.094, "eval_steps_per_second": 19.012, "step": 1600 }, { "epoch": 0.4617795953663502, "grad_norm": 4.698122024536133, "learning_rate": 5.666160849772383e-06, "loss": 0.2546, "step": 1602 }, { "epoch": 0.4626443511629164, "grad_norm": 1.5181471109390259, "learning_rate": 5.657056145675266e-06, "loss": 0.2167, "step": 1605 }, { "epoch": 0.4635091069594826, "grad_norm": 7.337409496307373, "learning_rate": 5.647951441578149e-06, "loss": 0.2435, "step": 1608 }, { "epoch": 0.4643738627560488, "grad_norm": 1.6400500535964966, "learning_rate": 5.6388467374810314e-06, "loss": 0.2336, "step": 1611 }, { "epoch": 0.465238618552615, "grad_norm": 1.7364429235458374, "learning_rate": 5.629742033383916e-06, "loss": 0.2397, "step": 1614 }, { "epoch": 0.4661033743491812, "grad_norm": 2.7070679664611816, "learning_rate": 5.620637329286799e-06, "loss": 0.2389, "step": 1617 }, { "epoch": 0.4669681301457474, "grad_norm": 1.28640878200531, "learning_rate": 5.611532625189682e-06, "loss": 0.202, "step": 1620 }, { "epoch": 0.4678328859423136, "grad_norm": 2.6867973804473877, "learning_rate": 5.6024279210925645e-06, "loss": 0.2501, "step": 1623 }, { "epoch": 0.4686976417388798, "grad_norm": 2.1441292762756348, "learning_rate": 5.593323216995447e-06, "loss": 0.2364, "step": 1626 }, { "epoch": 0.469562397535446, "grad_norm": 1.6109544038772583, "learning_rate": 5.584218512898332e-06, "loss": 0.244, "step": 1629 }, { "epoch": 0.4704271533320122, "grad_norm": 2.0842268466949463, "learning_rate": 5.575113808801215e-06, "loss": 0.2434, "step": 1632 }, { "epoch": 0.4712919091285784, "grad_norm": 1.4527335166931152, "learning_rate": 5.5660091047040976e-06, "loss": 0.2113, "step": 1635 }, { "epoch": 0.4721566649251446, "grad_norm": 2.0434927940368652, "learning_rate": 5.55690440060698e-06, "loss": 0.2404, "step": 1638 }, { "epoch": 0.4730214207217108, "grad_norm": 3.0256736278533936, "learning_rate": 5.547799696509863e-06, "loss": 0.2652, "step": 1641 }, { "epoch": 0.47388617651827697, "grad_norm": 2.3856120109558105, "learning_rate": 5.538694992412748e-06, "loss": 0.2156, "step": 1644 }, { "epoch": 0.47475093231484317, "grad_norm": 2.0948779582977295, "learning_rate": 5.529590288315631e-06, "loss": 0.2245, "step": 1647 }, { "epoch": 0.47561568811140936, "grad_norm": 1.7975860834121704, "learning_rate": 5.5204855842185135e-06, "loss": 0.2542, "step": 1650 }, { "epoch": 0.47648044390797556, "grad_norm": 3.510812997817993, "learning_rate": 5.511380880121396e-06, "loss": 0.2288, "step": 1653 }, { "epoch": 0.47734519970454176, "grad_norm": 3.4858286380767822, "learning_rate": 5.502276176024279e-06, "loss": 0.2497, "step": 1656 }, { "epoch": 0.47820995550110795, "grad_norm": 2.605661630630493, "learning_rate": 5.493171471927162e-06, "loss": 0.2062, "step": 1659 }, { "epoch": 0.47907471129767415, "grad_norm": 2.162203788757324, "learning_rate": 5.4840667678300465e-06, "loss": 0.2167, "step": 1662 }, { "epoch": 0.47993946709424035, "grad_norm": 4.7574262619018555, "learning_rate": 5.474962063732929e-06, "loss": 0.2088, "step": 1665 }, { "epoch": 0.48080422289080654, "grad_norm": 2.1833505630493164, "learning_rate": 5.465857359635812e-06, "loss": 0.2622, "step": 1668 }, { "epoch": 0.48166897868737274, "grad_norm": 1.645045280456543, "learning_rate": 5.456752655538695e-06, "loss": 0.2164, "step": 1671 }, { "epoch": 0.48253373448393894, "grad_norm": 2.270944356918335, "learning_rate": 5.447647951441578e-06, "loss": 0.2441, "step": 1674 }, { "epoch": 0.4833984902805052, "grad_norm": 2.0133309364318848, "learning_rate": 5.4385432473444624e-06, "loss": 0.2032, "step": 1677 }, { "epoch": 0.4842632460770714, "grad_norm": 2.5585310459136963, "learning_rate": 5.429438543247345e-06, "loss": 0.2385, "step": 1680 }, { "epoch": 0.4851280018736376, "grad_norm": 1.7377713918685913, "learning_rate": 5.420333839150228e-06, "loss": 0.216, "step": 1683 }, { "epoch": 0.4859927576702038, "grad_norm": 1.8322491645812988, "learning_rate": 5.411229135053111e-06, "loss": 0.2102, "step": 1686 }, { "epoch": 0.48685751346677, "grad_norm": 1.8956815004348755, "learning_rate": 5.402124430955994e-06, "loss": 0.2561, "step": 1689 }, { "epoch": 0.4877222692633362, "grad_norm": 1.5535213947296143, "learning_rate": 5.393019726858878e-06, "loss": 0.2099, "step": 1692 }, { "epoch": 0.48858702505990237, "grad_norm": 2.924278974533081, "learning_rate": 5.383915022761761e-06, "loss": 0.2662, "step": 1695 }, { "epoch": 0.48945178085646857, "grad_norm": 2.1653637886047363, "learning_rate": 5.374810318664644e-06, "loss": 0.2419, "step": 1698 }, { "epoch": 0.49002828472084603, "eval_loss": 0.22344937920570374, "eval_mse": 0.22344938813522458, "eval_runtime": 6.5992, "eval_samples_per_second": 151.533, "eval_steps_per_second": 18.942, "step": 1700 }, { "epoch": 0.49031653665303476, "grad_norm": 2.360328197479248, "learning_rate": 5.365705614567527e-06, "loss": 0.2661, "step": 1701 }, { "epoch": 0.49118129244960096, "grad_norm": 2.328495502471924, "learning_rate": 5.35660091047041e-06, "loss": 0.2347, "step": 1704 }, { "epoch": 0.49204604824616716, "grad_norm": 1.6670514345169067, "learning_rate": 5.347496206373293e-06, "loss": 0.2269, "step": 1707 }, { "epoch": 0.49291080404273335, "grad_norm": 2.426805257797241, "learning_rate": 5.338391502276177e-06, "loss": 0.1946, "step": 1710 }, { "epoch": 0.49377555983929955, "grad_norm": 1.7583879232406616, "learning_rate": 5.32928679817906e-06, "loss": 0.2235, "step": 1713 }, { "epoch": 0.49464031563586575, "grad_norm": 1.7235326766967773, "learning_rate": 5.320182094081943e-06, "loss": 0.2395, "step": 1716 }, { "epoch": 0.49550507143243194, "grad_norm": 1.4216803312301636, "learning_rate": 5.311077389984826e-06, "loss": 0.2634, "step": 1719 }, { "epoch": 0.49636982722899814, "grad_norm": 1.2892686128616333, "learning_rate": 5.301972685887709e-06, "loss": 0.2083, "step": 1722 }, { "epoch": 0.49723458302556434, "grad_norm": 2.5210540294647217, "learning_rate": 5.292867981790593e-06, "loss": 0.2017, "step": 1725 }, { "epoch": 0.49809933882213053, "grad_norm": 5.790046691894531, "learning_rate": 5.283763277693476e-06, "loss": 0.2306, "step": 1728 }, { "epoch": 0.49896409461869673, "grad_norm": 1.4158023595809937, "learning_rate": 5.274658573596359e-06, "loss": 0.266, "step": 1731 }, { "epoch": 0.4998288504152629, "grad_norm": 3.0490024089813232, "learning_rate": 5.2655538694992415e-06, "loss": 0.2506, "step": 1734 }, { "epoch": 0.5006936062118291, "grad_norm": 2.33208966255188, "learning_rate": 5.256449165402124e-06, "loss": 0.2088, "step": 1737 }, { "epoch": 0.5015583620083953, "grad_norm": 1.8087997436523438, "learning_rate": 5.247344461305008e-06, "loss": 0.2095, "step": 1740 }, { "epoch": 0.5024231178049615, "grad_norm": 1.5517979860305786, "learning_rate": 5.238239757207892e-06, "loss": 0.2342, "step": 1743 }, { "epoch": 0.5032878736015277, "grad_norm": 1.841036319732666, "learning_rate": 5.229135053110775e-06, "loss": 0.2385, "step": 1746 }, { "epoch": 0.5041526293980939, "grad_norm": 1.8034095764160156, "learning_rate": 5.2200303490136574e-06, "loss": 0.2537, "step": 1749 }, { "epoch": 0.5050173851946601, "grad_norm": 3.617159366607666, "learning_rate": 5.21092564491654e-06, "loss": 0.2378, "step": 1752 }, { "epoch": 0.5058821409912263, "grad_norm": 2.903215169906616, "learning_rate": 5.201820940819424e-06, "loss": 0.2356, "step": 1755 }, { "epoch": 0.5067468967877925, "grad_norm": 2.084693193435669, "learning_rate": 5.192716236722307e-06, "loss": 0.2631, "step": 1758 }, { "epoch": 0.5076116525843587, "grad_norm": 1.8994488716125488, "learning_rate": 5.1836115326251905e-06, "loss": 0.2742, "step": 1761 }, { "epoch": 0.5084764083809249, "grad_norm": 2.651257276535034, "learning_rate": 5.174506828528073e-06, "loss": 0.2332, "step": 1764 }, { "epoch": 0.5093411641774911, "grad_norm": 4.182311534881592, "learning_rate": 5.165402124430956e-06, "loss": 0.2467, "step": 1767 }, { "epoch": 0.5102059199740573, "grad_norm": 27.990720748901367, "learning_rate": 5.15629742033384e-06, "loss": 0.2135, "step": 1770 }, { "epoch": 0.5110706757706235, "grad_norm": 1.942474126815796, "learning_rate": 5.147192716236723e-06, "loss": 0.2443, "step": 1773 }, { "epoch": 0.5119354315671897, "grad_norm": 2.768105983734131, "learning_rate": 5.1380880121396055e-06, "loss": 0.2566, "step": 1776 }, { "epoch": 0.5128001873637559, "grad_norm": 2.423797607421875, "learning_rate": 5.128983308042489e-06, "loss": 0.2667, "step": 1779 }, { "epoch": 0.5136649431603221, "grad_norm": 2.395047426223755, "learning_rate": 5.119878603945372e-06, "loss": 0.2495, "step": 1782 }, { "epoch": 0.5145296989568883, "grad_norm": 2.577787160873413, "learning_rate": 5.110773899848256e-06, "loss": 0.2454, "step": 1785 }, { "epoch": 0.5153944547534545, "grad_norm": 3.106776714324951, "learning_rate": 5.101669195751139e-06, "loss": 0.2162, "step": 1788 }, { "epoch": 0.5162592105500207, "grad_norm": 1.5912446975708008, "learning_rate": 5.0925644916540215e-06, "loss": 0.2201, "step": 1791 }, { "epoch": 0.517123966346587, "grad_norm": 2.427795171737671, "learning_rate": 5.083459787556905e-06, "loss": 0.236, "step": 1794 }, { "epoch": 0.5179887221431532, "grad_norm": 2.5363399982452393, "learning_rate": 5.074355083459788e-06, "loss": 0.2551, "step": 1797 }, { "epoch": 0.5188534779397194, "grad_norm": 1.8077950477600098, "learning_rate": 5.065250379362671e-06, "loss": 0.2411, "step": 1800 }, { "epoch": 0.5188534779397194, "eval_loss": 0.21374772489070892, "eval_mse": 0.21374773593991994, "eval_runtime": 6.6116, "eval_samples_per_second": 151.249, "eval_steps_per_second": 18.906, "step": 1800 }, { "epoch": 0.5197182337362856, "grad_norm": 5.243107795715332, "learning_rate": 5.0561456752655545e-06, "loss": 0.243, "step": 1803 }, { "epoch": 0.5205829895328518, "grad_norm": 2.7711331844329834, "learning_rate": 5.047040971168437e-06, "loss": 0.2037, "step": 1806 }, { "epoch": 0.521447745329418, "grad_norm": 2.8418238162994385, "learning_rate": 5.03793626707132e-06, "loss": 0.2331, "step": 1809 }, { "epoch": 0.5223125011259842, "grad_norm": 3.0842299461364746, "learning_rate": 5.028831562974204e-06, "loss": 0.2422, "step": 1812 }, { "epoch": 0.5231772569225503, "grad_norm": 2.5956835746765137, "learning_rate": 5.019726858877087e-06, "loss": 0.2057, "step": 1815 }, { "epoch": 0.5240420127191165, "grad_norm": 1.3121715784072876, "learning_rate": 5.0106221547799704e-06, "loss": 0.2387, "step": 1818 }, { "epoch": 0.5249067685156827, "grad_norm": 2.2341432571411133, "learning_rate": 5.001517450682853e-06, "loss": 0.2611, "step": 1821 }, { "epoch": 0.5257715243122489, "grad_norm": 2.0494587421417236, "learning_rate": 4.992412746585736e-06, "loss": 0.2238, "step": 1824 }, { "epoch": 0.5266362801088151, "grad_norm": 2.2597897052764893, "learning_rate": 4.983308042488619e-06, "loss": 0.2322, "step": 1827 }, { "epoch": 0.5275010359053813, "grad_norm": 3.5993051528930664, "learning_rate": 4.974203338391503e-06, "loss": 0.234, "step": 1830 }, { "epoch": 0.5283657917019475, "grad_norm": 3.769505262374878, "learning_rate": 4.9650986342943855e-06, "loss": 0.2454, "step": 1833 }, { "epoch": 0.5292305474985137, "grad_norm": 1.4726693630218506, "learning_rate": 4.955993930197269e-06, "loss": 0.2261, "step": 1836 }, { "epoch": 0.5300953032950799, "grad_norm": 2.2910659313201904, "learning_rate": 4.946889226100152e-06, "loss": 0.2551, "step": 1839 }, { "epoch": 0.5309600590916461, "grad_norm": 1.8401825428009033, "learning_rate": 4.937784522003035e-06, "loss": 0.1891, "step": 1842 }, { "epoch": 0.5318248148882123, "grad_norm": 2.0589776039123535, "learning_rate": 4.9286798179059185e-06, "loss": 0.2475, "step": 1845 }, { "epoch": 0.5326895706847785, "grad_norm": 1.8461434841156006, "learning_rate": 4.919575113808801e-06, "loss": 0.2516, "step": 1848 }, { "epoch": 0.5335543264813447, "grad_norm": 1.8950605392456055, "learning_rate": 4.910470409711684e-06, "loss": 0.246, "step": 1851 }, { "epoch": 0.5344190822779109, "grad_norm": 1.7544567584991455, "learning_rate": 4.901365705614568e-06, "loss": 0.2159, "step": 1854 }, { "epoch": 0.5352838380744771, "grad_norm": 2.04953932762146, "learning_rate": 4.892261001517451e-06, "loss": 0.2745, "step": 1857 }, { "epoch": 0.5361485938710433, "grad_norm": 2.173112154006958, "learning_rate": 4.8831562974203345e-06, "loss": 0.2222, "step": 1860 }, { "epoch": 0.5370133496676095, "grad_norm": 1.4713711738586426, "learning_rate": 4.874051593323217e-06, "loss": 0.2182, "step": 1863 }, { "epoch": 0.5378781054641757, "grad_norm": 2.421405792236328, "learning_rate": 4.8649468892261e-06, "loss": 0.2472, "step": 1866 }, { "epoch": 0.5387428612607419, "grad_norm": 2.6306304931640625, "learning_rate": 4.855842185128984e-06, "loss": 0.2148, "step": 1869 }, { "epoch": 0.5396076170573081, "grad_norm": 3.4957375526428223, "learning_rate": 4.846737481031867e-06, "loss": 0.2321, "step": 1872 }, { "epoch": 0.5404723728538743, "grad_norm": 4.008154392242432, "learning_rate": 4.8376327769347495e-06, "loss": 0.216, "step": 1875 }, { "epoch": 0.5413371286504405, "grad_norm": 2.417433977127075, "learning_rate": 4.828528072837633e-06, "loss": 0.2261, "step": 1878 }, { "epoch": 0.5422018844470067, "grad_norm": 1.7908028364181519, "learning_rate": 4.819423368740516e-06, "loss": 0.2445, "step": 1881 }, { "epoch": 0.5430666402435729, "grad_norm": 1.3756656646728516, "learning_rate": 4.8103186646434e-06, "loss": 0.201, "step": 1884 }, { "epoch": 0.5439313960401391, "grad_norm": 1.686787724494934, "learning_rate": 4.801213960546283e-06, "loss": 0.217, "step": 1887 }, { "epoch": 0.5447961518367053, "grad_norm": 3.6207942962646484, "learning_rate": 4.792109256449165e-06, "loss": 0.2746, "step": 1890 }, { "epoch": 0.5456609076332715, "grad_norm": 3.838956117630005, "learning_rate": 4.783004552352049e-06, "loss": 0.2313, "step": 1893 }, { "epoch": 0.5465256634298377, "grad_norm": 2.059926748275757, "learning_rate": 4.773899848254932e-06, "loss": 0.2302, "step": 1896 }, { "epoch": 0.5473904192264039, "grad_norm": 1.9524738788604736, "learning_rate": 4.764795144157816e-06, "loss": 0.2473, "step": 1899 }, { "epoch": 0.5476786711585926, "eval_loss": 0.21403329074382782, "eval_mse": 0.2140333094932139, "eval_runtime": 6.5394, "eval_samples_per_second": 152.918, "eval_steps_per_second": 19.115, "step": 1900 }, { "epoch": 0.54825517502297, "grad_norm": 1.8144432306289673, "learning_rate": 4.7556904400606985e-06, "loss": 0.2082, "step": 1902 }, { "epoch": 0.5491199308195363, "grad_norm": 2.4071717262268066, "learning_rate": 4.746585735963581e-06, "loss": 0.2364, "step": 1905 }, { "epoch": 0.5499846866161024, "grad_norm": 1.7162179946899414, "learning_rate": 4.737481031866465e-06, "loss": 0.2119, "step": 1908 }, { "epoch": 0.5508494424126686, "grad_norm": 2.368528366088867, "learning_rate": 4.728376327769348e-06, "loss": 0.2314, "step": 1911 }, { "epoch": 0.5517141982092348, "grad_norm": 3.422670602798462, "learning_rate": 4.719271623672231e-06, "loss": 0.2355, "step": 1914 }, { "epoch": 0.552578954005801, "grad_norm": 2.324976682662964, "learning_rate": 4.710166919575114e-06, "loss": 0.2483, "step": 1917 }, { "epoch": 0.5534437098023672, "grad_norm": 2.3686418533325195, "learning_rate": 4.701062215477997e-06, "loss": 0.204, "step": 1920 }, { "epoch": 0.5543084655989334, "grad_norm": 2.1361286640167236, "learning_rate": 4.691957511380881e-06, "loss": 0.195, "step": 1923 }, { "epoch": 0.5551732213954996, "grad_norm": 1.5527316331863403, "learning_rate": 4.682852807283764e-06, "loss": 0.2232, "step": 1926 }, { "epoch": 0.5560379771920658, "grad_norm": 3.792592763900757, "learning_rate": 4.673748103186647e-06, "loss": 0.2157, "step": 1929 }, { "epoch": 0.556902732988632, "grad_norm": 1.8878562450408936, "learning_rate": 4.66464339908953e-06, "loss": 0.21, "step": 1932 }, { "epoch": 0.5577674887851982, "grad_norm": 1.686164140701294, "learning_rate": 4.655538694992413e-06, "loss": 0.2177, "step": 1935 }, { "epoch": 0.5586322445817644, "grad_norm": 2.310054302215576, "learning_rate": 4.646433990895296e-06, "loss": 0.2039, "step": 1938 }, { "epoch": 0.5594970003783306, "grad_norm": 1.8163293600082397, "learning_rate": 4.63732928679818e-06, "loss": 0.2117, "step": 1941 }, { "epoch": 0.5603617561748968, "grad_norm": 1.608209490776062, "learning_rate": 4.6282245827010625e-06, "loss": 0.2361, "step": 1944 }, { "epoch": 0.561226511971463, "grad_norm": 1.4159106016159058, "learning_rate": 4.619119878603946e-06, "loss": 0.2042, "step": 1947 }, { "epoch": 0.5620912677680293, "grad_norm": 2.920888900756836, "learning_rate": 4.610015174506829e-06, "loss": 0.2715, "step": 1950 }, { "epoch": 0.5629560235645955, "grad_norm": 1.1805585622787476, "learning_rate": 4.600910470409712e-06, "loss": 0.2329, "step": 1953 }, { "epoch": 0.5638207793611617, "grad_norm": 3.1957271099090576, "learning_rate": 4.591805766312596e-06, "loss": 0.253, "step": 1956 }, { "epoch": 0.5646855351577279, "grad_norm": 2.3281495571136475, "learning_rate": 4.582701062215478e-06, "loss": 0.2508, "step": 1959 }, { "epoch": 0.5655502909542941, "grad_norm": 1.9826480150222778, "learning_rate": 4.573596358118362e-06, "loss": 0.2107, "step": 1962 }, { "epoch": 0.5664150467508603, "grad_norm": 5.98641300201416, "learning_rate": 4.564491654021245e-06, "loss": 0.2343, "step": 1965 }, { "epoch": 0.5672798025474265, "grad_norm": 4.082788944244385, "learning_rate": 4.555386949924128e-06, "loss": 0.2493, "step": 1968 }, { "epoch": 0.5681445583439927, "grad_norm": 1.6683293581008911, "learning_rate": 4.5462822458270115e-06, "loss": 0.1892, "step": 1971 }, { "epoch": 0.5690093141405589, "grad_norm": 4.348227024078369, "learning_rate": 4.537177541729894e-06, "loss": 0.2277, "step": 1974 }, { "epoch": 0.5698740699371251, "grad_norm": 1.2774113416671753, "learning_rate": 4.528072837632777e-06, "loss": 0.2077, "step": 1977 }, { "epoch": 0.5707388257336913, "grad_norm": 1.5915591716766357, "learning_rate": 4.518968133535661e-06, "loss": 0.2284, "step": 1980 }, { "epoch": 0.5716035815302575, "grad_norm": 1.3817569017410278, "learning_rate": 4.509863429438544e-06, "loss": 0.2159, "step": 1983 }, { "epoch": 0.5724683373268237, "grad_norm": 1.3606081008911133, "learning_rate": 4.500758725341427e-06, "loss": 0.2055, "step": 1986 }, { "epoch": 0.5733330931233899, "grad_norm": 2.6602344512939453, "learning_rate": 4.49165402124431e-06, "loss": 0.2611, "step": 1989 }, { "epoch": 0.5741978489199561, "grad_norm": 1.796570897102356, "learning_rate": 4.482549317147193e-06, "loss": 0.1925, "step": 1992 }, { "epoch": 0.5750626047165223, "grad_norm": 2.4218592643737793, "learning_rate": 4.473444613050077e-06, "loss": 0.1992, "step": 1995 }, { "epoch": 0.5759273605130885, "grad_norm": 1.9690345525741577, "learning_rate": 4.46433990895296e-06, "loss": 0.237, "step": 1998 }, { "epoch": 0.5765038643774659, "eval_loss": 0.2176571637392044, "eval_mse": 0.2176571699755732, "eval_runtime": 6.5589, "eval_samples_per_second": 152.465, "eval_steps_per_second": 19.058, "step": 2000 }, { "epoch": 0.5767921163096547, "grad_norm": 2.168555498123169, "learning_rate": 4.4552352048558425e-06, "loss": 0.2009, "step": 2001 }, { "epoch": 0.5776568721062209, "grad_norm": 3.1508069038391113, "learning_rate": 4.446130500758726e-06, "loss": 0.2316, "step": 2004 }, { "epoch": 0.578521627902787, "grad_norm": 1.686283826828003, "learning_rate": 4.437025796661609e-06, "loss": 0.2313, "step": 2007 }, { "epoch": 0.5793863836993532, "grad_norm": 1.3212209939956665, "learning_rate": 4.427921092564492e-06, "loss": 0.2076, "step": 2010 }, { "epoch": 0.5802511394959194, "grad_norm": 3.2884185314178467, "learning_rate": 4.4188163884673755e-06, "loss": 0.2345, "step": 2013 }, { "epoch": 0.5811158952924856, "grad_norm": 1.9892568588256836, "learning_rate": 4.409711684370258e-06, "loss": 0.2428, "step": 2016 }, { "epoch": 0.5819806510890518, "grad_norm": 1.5294564962387085, "learning_rate": 4.400606980273141e-06, "loss": 0.2252, "step": 2019 }, { "epoch": 0.582845406885618, "grad_norm": 2.2944376468658447, "learning_rate": 4.391502276176025e-06, "loss": 0.2429, "step": 2022 }, { "epoch": 0.5837101626821842, "grad_norm": 1.584145426750183, "learning_rate": 4.382397572078908e-06, "loss": 0.2548, "step": 2025 }, { "epoch": 0.5845749184787504, "grad_norm": 1.604771375656128, "learning_rate": 4.3732928679817906e-06, "loss": 0.2211, "step": 2028 }, { "epoch": 0.5854396742753166, "grad_norm": 1.7536587715148926, "learning_rate": 4.364188163884674e-06, "loss": 0.2165, "step": 2031 }, { "epoch": 0.5863044300718828, "grad_norm": 1.6281161308288574, "learning_rate": 4.355083459787557e-06, "loss": 0.2609, "step": 2034 }, { "epoch": 0.587169185868449, "grad_norm": 1.77180016040802, "learning_rate": 4.34597875569044e-06, "loss": 0.2333, "step": 2037 }, { "epoch": 0.5880339416650152, "grad_norm": 3.2408559322357178, "learning_rate": 4.336874051593324e-06, "loss": 0.2454, "step": 2040 }, { "epoch": 0.5888986974615814, "grad_norm": 3.4444427490234375, "learning_rate": 4.3277693474962065e-06, "loss": 0.2549, "step": 2043 }, { "epoch": 0.5897634532581476, "grad_norm": 4.399412155151367, "learning_rate": 4.31866464339909e-06, "loss": 0.2258, "step": 2046 }, { "epoch": 0.5906282090547138, "grad_norm": 1.646681785583496, "learning_rate": 4.309559939301973e-06, "loss": 0.1858, "step": 2049 }, { "epoch": 0.59149296485128, "grad_norm": 2.786576986312866, "learning_rate": 4.300455235204856e-06, "loss": 0.2085, "step": 2052 }, { "epoch": 0.5923577206478462, "grad_norm": 3.0838379859924316, "learning_rate": 4.2913505311077395e-06, "loss": 0.2428, "step": 2055 }, { "epoch": 0.5932224764444124, "grad_norm": 1.518548846244812, "learning_rate": 4.282245827010622e-06, "loss": 0.2159, "step": 2058 }, { "epoch": 0.5940872322409786, "grad_norm": 2.2088887691497803, "learning_rate": 4.273141122913505e-06, "loss": 0.2107, "step": 2061 }, { "epoch": 0.5949519880375448, "grad_norm": 4.359911918640137, "learning_rate": 4.264036418816389e-06, "loss": 0.2461, "step": 2064 }, { "epoch": 0.595816743834111, "grad_norm": 1.796669840812683, "learning_rate": 4.254931714719272e-06, "loss": 0.1836, "step": 2067 }, { "epoch": 0.5966814996306772, "grad_norm": 2.0835959911346436, "learning_rate": 4.245827010622155e-06, "loss": 0.2192, "step": 2070 }, { "epoch": 0.5975462554272434, "grad_norm": 2.772815227508545, "learning_rate": 4.236722306525038e-06, "loss": 0.1974, "step": 2073 }, { "epoch": 0.5984110112238096, "grad_norm": 1.8529541492462158, "learning_rate": 4.227617602427921e-06, "loss": 0.1946, "step": 2076 }, { "epoch": 0.5992757670203758, "grad_norm": 2.0415849685668945, "learning_rate": 4.218512898330804e-06, "loss": 0.2182, "step": 2079 }, { "epoch": 0.600140522816942, "grad_norm": 2.6295053958892822, "learning_rate": 4.209408194233688e-06, "loss": 0.2438, "step": 2082 }, { "epoch": 0.6010052786135082, "grad_norm": 1.9082621335983276, "learning_rate": 4.2003034901365705e-06, "loss": 0.1834, "step": 2085 }, { "epoch": 0.6018700344100744, "grad_norm": 1.692436933517456, "learning_rate": 4.191198786039454e-06, "loss": 0.2104, "step": 2088 }, { "epoch": 0.6027347902066406, "grad_norm": 2.0836997032165527, "learning_rate": 4.182094081942337e-06, "loss": 0.2323, "step": 2091 }, { "epoch": 0.6035995460032068, "grad_norm": 1.354612946510315, "learning_rate": 4.17298937784522e-06, "loss": 0.1975, "step": 2094 }, { "epoch": 0.604464301799773, "grad_norm": 3.683278799057007, "learning_rate": 4.1638846737481036e-06, "loss": 0.2256, "step": 2097 }, { "epoch": 0.6053290575963391, "grad_norm": 2.232513189315796, "learning_rate": 4.154779969650986e-06, "loss": 0.1972, "step": 2100 }, { "epoch": 0.6053290575963391, "eval_loss": 0.21862919628620148, "eval_mse": 0.218629194105044, "eval_runtime": 6.7266, "eval_samples_per_second": 148.664, "eval_steps_per_second": 18.583, "step": 2100 }, { "epoch": 0.6061938133929053, "grad_norm": 1.5396499633789062, "learning_rate": 4.145675265553869e-06, "loss": 0.2151, "step": 2103 }, { "epoch": 0.6070585691894717, "grad_norm": 1.482253909111023, "learning_rate": 4.136570561456753e-06, "loss": 0.2222, "step": 2106 }, { "epoch": 0.6079233249860378, "grad_norm": 4.880987644195557, "learning_rate": 4.127465857359636e-06, "loss": 0.2535, "step": 2109 }, { "epoch": 0.608788080782604, "grad_norm": 2.1557230949401855, "learning_rate": 4.1183611532625195e-06, "loss": 0.2213, "step": 2112 }, { "epoch": 0.6096528365791702, "grad_norm": 2.417856454849243, "learning_rate": 4.109256449165402e-06, "loss": 0.2481, "step": 2115 }, { "epoch": 0.6105175923757364, "grad_norm": 2.211514949798584, "learning_rate": 4.100151745068285e-06, "loss": 0.2369, "step": 2118 }, { "epoch": 0.6113823481723026, "grad_norm": 2.2844600677490234, "learning_rate": 4.091047040971169e-06, "loss": 0.2174, "step": 2121 }, { "epoch": 0.6122471039688688, "grad_norm": 2.7534289360046387, "learning_rate": 4.081942336874052e-06, "loss": 0.22, "step": 2124 }, { "epoch": 0.613111859765435, "grad_norm": 1.5547044277191162, "learning_rate": 4.072837632776935e-06, "loss": 0.227, "step": 2127 }, { "epoch": 0.6139766155620012, "grad_norm": 1.6965092420578003, "learning_rate": 4.063732928679818e-06, "loss": 0.219, "step": 2130 }, { "epoch": 0.6148413713585674, "grad_norm": 1.8000000715255737, "learning_rate": 4.054628224582701e-06, "loss": 0.2303, "step": 2133 }, { "epoch": 0.6157061271551336, "grad_norm": 1.5764316320419312, "learning_rate": 4.045523520485585e-06, "loss": 0.191, "step": 2136 }, { "epoch": 0.6165708829516998, "grad_norm": 2.0041658878326416, "learning_rate": 4.036418816388468e-06, "loss": 0.2025, "step": 2139 }, { "epoch": 0.617435638748266, "grad_norm": 1.9013463258743286, "learning_rate": 4.0273141122913504e-06, "loss": 0.225, "step": 2142 }, { "epoch": 0.6183003945448322, "grad_norm": 2.1815786361694336, "learning_rate": 4.018209408194234e-06, "loss": 0.2409, "step": 2145 }, { "epoch": 0.6191651503413984, "grad_norm": 1.6740418672561646, "learning_rate": 4.009104704097117e-06, "loss": 0.2143, "step": 2148 }, { "epoch": 0.6200299061379646, "grad_norm": 2.1056814193725586, "learning_rate": 4.000000000000001e-06, "loss": 0.2365, "step": 2151 }, { "epoch": 0.6208946619345308, "grad_norm": 2.629563808441162, "learning_rate": 3.9908952959028835e-06, "loss": 0.1932, "step": 2154 }, { "epoch": 0.621759417731097, "grad_norm": 1.7547650337219238, "learning_rate": 3.981790591805766e-06, "loss": 0.246, "step": 2157 }, { "epoch": 0.6226241735276632, "grad_norm": 2.2451794147491455, "learning_rate": 3.97268588770865e-06, "loss": 0.2405, "step": 2160 }, { "epoch": 0.6234889293242294, "grad_norm": 2.8820624351501465, "learning_rate": 3.963581183611533e-06, "loss": 0.253, "step": 2163 }, { "epoch": 0.6243536851207956, "grad_norm": 2.9832215309143066, "learning_rate": 3.9544764795144166e-06, "loss": 0.223, "step": 2166 }, { "epoch": 0.6252184409173618, "grad_norm": 2.911879539489746, "learning_rate": 3.945371775417299e-06, "loss": 0.2177, "step": 2169 }, { "epoch": 0.626083196713928, "grad_norm": 2.266767740249634, "learning_rate": 3.936267071320182e-06, "loss": 0.2288, "step": 2172 }, { "epoch": 0.6269479525104942, "grad_norm": 1.401633858680725, "learning_rate": 3.927162367223066e-06, "loss": 0.2556, "step": 2175 }, { "epoch": 0.6278127083070604, "grad_norm": 2.7354822158813477, "learning_rate": 3.918057663125949e-06, "loss": 0.2439, "step": 2178 }, { "epoch": 0.6286774641036266, "grad_norm": 1.6652506589889526, "learning_rate": 3.908952959028832e-06, "loss": 0.2041, "step": 2181 }, { "epoch": 0.6295422199001928, "grad_norm": 3.3072733879089355, "learning_rate": 3.899848254931715e-06, "loss": 0.2207, "step": 2184 }, { "epoch": 0.630406975696759, "grad_norm": 2.254608392715454, "learning_rate": 3.890743550834598e-06, "loss": 0.2352, "step": 2187 }, { "epoch": 0.6312717314933252, "grad_norm": 1.765981674194336, "learning_rate": 3.881638846737482e-06, "loss": 0.2923, "step": 2190 }, { "epoch": 0.6321364872898914, "grad_norm": 1.7792342901229858, "learning_rate": 3.872534142640365e-06, "loss": 0.2428, "step": 2193 }, { "epoch": 0.6330012430864576, "grad_norm": 5.084781646728516, "learning_rate": 3.8634294385432475e-06, "loss": 0.2616, "step": 2196 }, { "epoch": 0.6338659988830238, "grad_norm": 2.0845305919647217, "learning_rate": 3.854324734446131e-06, "loss": 0.2556, "step": 2199 }, { "epoch": 0.6341542508152125, "eval_loss": 0.2416454404592514, "eval_mse": 0.2416454482518602, "eval_runtime": 6.5442, "eval_samples_per_second": 152.808, "eval_steps_per_second": 19.101, "step": 2200 }, { "epoch": 0.63473075467959, "grad_norm": 3.8279173374176025, "learning_rate": 3.845220030349014e-06, "loss": 0.2245, "step": 2202 }, { "epoch": 0.6355955104761561, "grad_norm": 3.564417839050293, "learning_rate": 3.836115326251897e-06, "loss": 0.2233, "step": 2205 }, { "epoch": 0.6364602662727223, "grad_norm": 2.4322614669799805, "learning_rate": 3.827010622154781e-06, "loss": 0.2227, "step": 2208 }, { "epoch": 0.6373250220692885, "grad_norm": 1.6439640522003174, "learning_rate": 3.8179059180576634e-06, "loss": 0.2135, "step": 2211 }, { "epoch": 0.6381897778658547, "grad_norm": 2.605598211288452, "learning_rate": 3.8088012139605467e-06, "loss": 0.2203, "step": 2214 }, { "epoch": 0.6390545336624209, "grad_norm": 1.939488410949707, "learning_rate": 3.7996965098634296e-06, "loss": 0.2377, "step": 2217 }, { "epoch": 0.6399192894589871, "grad_norm": 1.851778507232666, "learning_rate": 3.790591805766313e-06, "loss": 0.2199, "step": 2220 }, { "epoch": 0.6407840452555533, "grad_norm": 2.077923059463501, "learning_rate": 3.781487101669196e-06, "loss": 0.223, "step": 2223 }, { "epoch": 0.6416488010521195, "grad_norm": 5.192010402679443, "learning_rate": 3.772382397572079e-06, "loss": 0.2412, "step": 2226 }, { "epoch": 0.6425135568486857, "grad_norm": 1.6057194471359253, "learning_rate": 3.7632776934749626e-06, "loss": 0.2354, "step": 2229 }, { "epoch": 0.6433783126452519, "grad_norm": 3.0130653381347656, "learning_rate": 3.7541729893778455e-06, "loss": 0.2307, "step": 2232 }, { "epoch": 0.6442430684418181, "grad_norm": 2.835080623626709, "learning_rate": 3.7450682852807287e-06, "loss": 0.1929, "step": 2235 }, { "epoch": 0.6451078242383843, "grad_norm": 1.800140619277954, "learning_rate": 3.735963581183612e-06, "loss": 0.2158, "step": 2238 }, { "epoch": 0.6459725800349505, "grad_norm": 1.8859021663665771, "learning_rate": 3.726858877086495e-06, "loss": 0.2469, "step": 2241 }, { "epoch": 0.6468373358315167, "grad_norm": 1.8524531126022339, "learning_rate": 3.717754172989378e-06, "loss": 0.2257, "step": 2244 }, { "epoch": 0.6477020916280829, "grad_norm": 2.759021520614624, "learning_rate": 3.7086494688922614e-06, "loss": 0.2187, "step": 2247 }, { "epoch": 0.6485668474246491, "grad_norm": 1.885272741317749, "learning_rate": 3.699544764795144e-06, "loss": 0.2416, "step": 2250 }, { "epoch": 0.6494316032212153, "grad_norm": 2.244595527648926, "learning_rate": 3.690440060698028e-06, "loss": 0.2002, "step": 2253 }, { "epoch": 0.6502963590177815, "grad_norm": 2.533815622329712, "learning_rate": 3.6813353566009107e-06, "loss": 0.2065, "step": 2256 }, { "epoch": 0.6511611148143477, "grad_norm": 1.883478045463562, "learning_rate": 3.6722306525037936e-06, "loss": 0.2014, "step": 2259 }, { "epoch": 0.652025870610914, "grad_norm": 4.37358283996582, "learning_rate": 3.6631259484066773e-06, "loss": 0.2347, "step": 2262 }, { "epoch": 0.6528906264074802, "grad_norm": 1.9453434944152832, "learning_rate": 3.65402124430956e-06, "loss": 0.2414, "step": 2265 }, { "epoch": 0.6537553822040464, "grad_norm": 1.5430552959442139, "learning_rate": 3.644916540212443e-06, "loss": 0.2012, "step": 2268 }, { "epoch": 0.6546201380006126, "grad_norm": 2.023857593536377, "learning_rate": 3.6358118361153266e-06, "loss": 0.216, "step": 2271 }, { "epoch": 0.6554848937971788, "grad_norm": 2.0380475521087646, "learning_rate": 3.6267071320182095e-06, "loss": 0.1851, "step": 2274 }, { "epoch": 0.656349649593745, "grad_norm": 1.9703435897827148, "learning_rate": 3.617602427921093e-06, "loss": 0.2152, "step": 2277 }, { "epoch": 0.6572144053903112, "grad_norm": 2.0153567790985107, "learning_rate": 3.608497723823976e-06, "loss": 0.2248, "step": 2280 }, { "epoch": 0.6580791611868774, "grad_norm": 1.6611104011535645, "learning_rate": 3.599393019726859e-06, "loss": 0.2416, "step": 2283 }, { "epoch": 0.6589439169834436, "grad_norm": 1.8866307735443115, "learning_rate": 3.5902883156297426e-06, "loss": 0.2319, "step": 2286 }, { "epoch": 0.6598086727800098, "grad_norm": 2.9478352069854736, "learning_rate": 3.5811836115326254e-06, "loss": 0.2614, "step": 2289 }, { "epoch": 0.660673428576576, "grad_norm": 1.578539252281189, "learning_rate": 3.572078907435509e-06, "loss": 0.2097, "step": 2292 }, { "epoch": 0.6615381843731422, "grad_norm": 3.096663236618042, "learning_rate": 3.562974203338392e-06, "loss": 0.1977, "step": 2295 }, { "epoch": 0.6624029401697084, "grad_norm": 1.81285560131073, "learning_rate": 3.5538694992412748e-06, "loss": 0.2273, "step": 2298 }, { "epoch": 0.6629794440340858, "eval_loss": 0.2196592092514038, "eval_mse": 0.2196592075770641, "eval_runtime": 6.5661, "eval_samples_per_second": 152.298, "eval_steps_per_second": 19.037, "step": 2300 }, { "epoch": 0.6632676959662746, "grad_norm": 1.6824992895126343, "learning_rate": 3.5447647951441585e-06, "loss": 0.1899, "step": 2301 }, { "epoch": 0.6641324517628407, "grad_norm": 2.2244341373443604, "learning_rate": 3.5356600910470413e-06, "loss": 0.2113, "step": 2304 }, { "epoch": 0.6649972075594069, "grad_norm": 2.0596795082092285, "learning_rate": 3.526555386949924e-06, "loss": 0.2264, "step": 2307 }, { "epoch": 0.6658619633559731, "grad_norm": 2.2340962886810303, "learning_rate": 3.517450682852808e-06, "loss": 0.2168, "step": 2310 }, { "epoch": 0.6667267191525393, "grad_norm": 1.9558560848236084, "learning_rate": 3.5083459787556907e-06, "loss": 0.2088, "step": 2313 }, { "epoch": 0.6675914749491055, "grad_norm": 1.9379972219467163, "learning_rate": 3.499241274658574e-06, "loss": 0.2284, "step": 2316 }, { "epoch": 0.6684562307456717, "grad_norm": 2.3833818435668945, "learning_rate": 3.490136570561457e-06, "loss": 0.2526, "step": 2319 }, { "epoch": 0.6693209865422379, "grad_norm": 2.1912760734558105, "learning_rate": 3.48103186646434e-06, "loss": 0.2579, "step": 2322 }, { "epoch": 0.6701857423388041, "grad_norm": 1.4502041339874268, "learning_rate": 3.4719271623672233e-06, "loss": 0.2118, "step": 2325 }, { "epoch": 0.6710504981353703, "grad_norm": 1.6936084032058716, "learning_rate": 3.4628224582701066e-06, "loss": 0.2417, "step": 2328 }, { "epoch": 0.6719152539319365, "grad_norm": 2.5630977153778076, "learning_rate": 3.45371775417299e-06, "loss": 0.255, "step": 2331 }, { "epoch": 0.6727800097285027, "grad_norm": 1.7571030855178833, "learning_rate": 3.4446130500758727e-06, "loss": 0.2122, "step": 2334 }, { "epoch": 0.6736447655250689, "grad_norm": 2.4220032691955566, "learning_rate": 3.435508345978756e-06, "loss": 0.2383, "step": 2337 }, { "epoch": 0.6745095213216351, "grad_norm": 2.3608193397521973, "learning_rate": 3.4264036418816392e-06, "loss": 0.241, "step": 2340 }, { "epoch": 0.6753742771182013, "grad_norm": 2.0964229106903076, "learning_rate": 3.417298937784522e-06, "loss": 0.2178, "step": 2343 }, { "epoch": 0.6762390329147675, "grad_norm": 3.038722038269043, "learning_rate": 3.4081942336874053e-06, "loss": 0.2276, "step": 2346 }, { "epoch": 0.6771037887113337, "grad_norm": 3.0551295280456543, "learning_rate": 3.3990895295902886e-06, "loss": 0.1959, "step": 2349 }, { "epoch": 0.6779685445078999, "grad_norm": 1.4905811548233032, "learning_rate": 3.3899848254931714e-06, "loss": 0.1863, "step": 2352 }, { "epoch": 0.6788333003044661, "grad_norm": 3.0637025833129883, "learning_rate": 3.380880121396055e-06, "loss": 0.2341, "step": 2355 }, { "epoch": 0.6796980561010323, "grad_norm": 2.9851162433624268, "learning_rate": 3.371775417298938e-06, "loss": 0.2586, "step": 2358 }, { "epoch": 0.6805628118975985, "grad_norm": 2.0650391578674316, "learning_rate": 3.3626707132018212e-06, "loss": 0.2181, "step": 2361 }, { "epoch": 0.6814275676941647, "grad_norm": 3.1792430877685547, "learning_rate": 3.3535660091047045e-06, "loss": 0.234, "step": 2364 }, { "epoch": 0.6822923234907309, "grad_norm": 2.171764612197876, "learning_rate": 3.3444613050075873e-06, "loss": 0.2232, "step": 2367 }, { "epoch": 0.6831570792872971, "grad_norm": 1.8832968473434448, "learning_rate": 3.3353566009104706e-06, "loss": 0.2386, "step": 2370 }, { "epoch": 0.6840218350838633, "grad_norm": 2.217407703399658, "learning_rate": 3.326251896813354e-06, "loss": 0.2575, "step": 2373 }, { "epoch": 0.6848865908804295, "grad_norm": 1.3866760730743408, "learning_rate": 3.3171471927162367e-06, "loss": 0.2321, "step": 2376 }, { "epoch": 0.6857513466769957, "grad_norm": 2.836749315261841, "learning_rate": 3.3080424886191204e-06, "loss": 0.2082, "step": 2379 }, { "epoch": 0.6866161024735619, "grad_norm": 4.798961162567139, "learning_rate": 3.2989377845220033e-06, "loss": 0.2397, "step": 2382 }, { "epoch": 0.6874808582701281, "grad_norm": 1.8099883794784546, "learning_rate": 3.289833080424886e-06, "loss": 0.2468, "step": 2385 }, { "epoch": 0.6883456140666943, "grad_norm": 3.0821380615234375, "learning_rate": 3.2807283763277698e-06, "loss": 0.2013, "step": 2388 }, { "epoch": 0.6892103698632605, "grad_norm": 1.6952015161514282, "learning_rate": 3.2716236722306526e-06, "loss": 0.2156, "step": 2391 }, { "epoch": 0.6900751256598266, "grad_norm": 2.4413681030273438, "learning_rate": 3.2625189681335363e-06, "loss": 0.2307, "step": 2394 }, { "epoch": 0.6909398814563928, "grad_norm": 1.9589879512786865, "learning_rate": 3.253414264036419e-06, "loss": 0.2948, "step": 2397 }, { "epoch": 0.691804637252959, "grad_norm": 2.4465548992156982, "learning_rate": 3.244309559939302e-06, "loss": 0.223, "step": 2400 }, { "epoch": 0.691804637252959, "eval_loss": 0.22531487047672272, "eval_mse": 0.2253148703626357, "eval_runtime": 6.6334, "eval_samples_per_second": 150.751, "eval_steps_per_second": 18.844, "step": 2400 }, { "epoch": 0.6926693930495252, "grad_norm": 2.3037302494049072, "learning_rate": 3.2352048558421857e-06, "loss": 0.2199, "step": 2403 }, { "epoch": 0.6935341488460914, "grad_norm": 1.6721099615097046, "learning_rate": 3.2261001517450685e-06, "loss": 0.2286, "step": 2406 }, { "epoch": 0.6943989046426576, "grad_norm": 3.3806381225585938, "learning_rate": 3.2169954476479514e-06, "loss": 0.1988, "step": 2409 }, { "epoch": 0.6952636604392238, "grad_norm": 2.1515021324157715, "learning_rate": 3.207890743550835e-06, "loss": 0.2412, "step": 2412 }, { "epoch": 0.69612841623579, "grad_norm": 3.980482816696167, "learning_rate": 3.198786039453718e-06, "loss": 0.2088, "step": 2415 }, { "epoch": 0.6969931720323563, "grad_norm": 2.2418131828308105, "learning_rate": 3.1896813353566016e-06, "loss": 0.2107, "step": 2418 }, { "epoch": 0.6978579278289225, "grad_norm": 1.819807767868042, "learning_rate": 3.1805766312594844e-06, "loss": 0.2435, "step": 2421 }, { "epoch": 0.6987226836254887, "grad_norm": 3.8227691650390625, "learning_rate": 3.1714719271623673e-06, "loss": 0.2278, "step": 2424 }, { "epoch": 0.6995874394220549, "grad_norm": 2.207240104675293, "learning_rate": 3.162367223065251e-06, "loss": 0.2362, "step": 2427 }, { "epoch": 0.7004521952186211, "grad_norm": 1.796724796295166, "learning_rate": 3.153262518968134e-06, "loss": 0.2383, "step": 2430 }, { "epoch": 0.7013169510151873, "grad_norm": 2.7628397941589355, "learning_rate": 3.1441578148710167e-06, "loss": 0.2326, "step": 2433 }, { "epoch": 0.7021817068117535, "grad_norm": 1.3642479181289673, "learning_rate": 3.1350531107739003e-06, "loss": 0.2059, "step": 2436 }, { "epoch": 0.7030464626083197, "grad_norm": 1.5554901361465454, "learning_rate": 3.125948406676783e-06, "loss": 0.2162, "step": 2439 }, { "epoch": 0.7039112184048859, "grad_norm": 2.5311179161071777, "learning_rate": 3.1168437025796665e-06, "loss": 0.237, "step": 2442 }, { "epoch": 0.7047759742014521, "grad_norm": 1.805991768836975, "learning_rate": 3.1077389984825497e-06, "loss": 0.2426, "step": 2445 }, { "epoch": 0.7056407299980183, "grad_norm": 2.029891014099121, "learning_rate": 3.0986342943854326e-06, "loss": 0.2139, "step": 2448 }, { "epoch": 0.7065054857945845, "grad_norm": 2.5495572090148926, "learning_rate": 3.089529590288316e-06, "loss": 0.2279, "step": 2451 }, { "epoch": 0.7073702415911507, "grad_norm": 1.5610768795013428, "learning_rate": 3.080424886191199e-06, "loss": 0.2175, "step": 2454 }, { "epoch": 0.7082349973877169, "grad_norm": 3.5896153450012207, "learning_rate": 3.0713201820940824e-06, "loss": 0.2459, "step": 2457 }, { "epoch": 0.7090997531842831, "grad_norm": 2.1120688915252686, "learning_rate": 3.062215477996965e-06, "loss": 0.2565, "step": 2460 }, { "epoch": 0.7099645089808493, "grad_norm": 2.7099833488464355, "learning_rate": 3.0531107738998485e-06, "loss": 0.2088, "step": 2463 }, { "epoch": 0.7108292647774155, "grad_norm": 1.7153905630111694, "learning_rate": 3.0440060698027317e-06, "loss": 0.2114, "step": 2466 }, { "epoch": 0.7116940205739817, "grad_norm": 1.9465105533599854, "learning_rate": 3.0349013657056146e-06, "loss": 0.1932, "step": 2469 }, { "epoch": 0.7125587763705479, "grad_norm": 1.6483453512191772, "learning_rate": 3.025796661608498e-06, "loss": 0.232, "step": 2472 }, { "epoch": 0.7134235321671141, "grad_norm": 2.3854711055755615, "learning_rate": 3.016691957511381e-06, "loss": 0.2397, "step": 2475 }, { "epoch": 0.7142882879636803, "grad_norm": 1.565765380859375, "learning_rate": 3.0075872534142644e-06, "loss": 0.2055, "step": 2478 }, { "epoch": 0.7151530437602465, "grad_norm": 1.5985909700393677, "learning_rate": 2.9984825493171476e-06, "loss": 0.2077, "step": 2481 }, { "epoch": 0.7160177995568127, "grad_norm": 2.9907102584838867, "learning_rate": 2.9893778452200305e-06, "loss": 0.2125, "step": 2484 }, { "epoch": 0.7168825553533789, "grad_norm": 3.0764994621276855, "learning_rate": 2.9802731411229137e-06, "loss": 0.2342, "step": 2487 }, { "epoch": 0.717747311149945, "grad_norm": 2.954237461090088, "learning_rate": 2.971168437025797e-06, "loss": 0.232, "step": 2490 }, { "epoch": 0.7186120669465113, "grad_norm": 1.5421547889709473, "learning_rate": 2.96206373292868e-06, "loss": 0.2019, "step": 2493 }, { "epoch": 0.7194768227430774, "grad_norm": 5.054042816162109, "learning_rate": 2.9529590288315635e-06, "loss": 0.2359, "step": 2496 }, { "epoch": 0.7203415785396436, "grad_norm": 1.5477067232131958, "learning_rate": 2.9438543247344464e-06, "loss": 0.2028, "step": 2499 }, { "epoch": 0.7206298304718324, "eval_loss": 0.22387926280498505, "eval_mse": 0.22387926151184365, "eval_runtime": 6.5037, "eval_samples_per_second": 153.759, "eval_steps_per_second": 19.22, "step": 2500 }, { "epoch": 0.7212063343362098, "grad_norm": 1.912636160850525, "learning_rate": 2.9347496206373292e-06, "loss": 0.2167, "step": 2502 }, { "epoch": 0.722071090132776, "grad_norm": 1.644394040107727, "learning_rate": 2.925644916540213e-06, "loss": 0.2068, "step": 2505 }, { "epoch": 0.7229358459293422, "grad_norm": 3.4328315258026123, "learning_rate": 2.9165402124430958e-06, "loss": 0.2417, "step": 2508 }, { "epoch": 0.7238006017259084, "grad_norm": 1.918043613433838, "learning_rate": 2.9074355083459786e-06, "loss": 0.2308, "step": 2511 }, { "epoch": 0.7246653575224746, "grad_norm": 1.94221031665802, "learning_rate": 2.8983308042488623e-06, "loss": 0.2957, "step": 2514 }, { "epoch": 0.7255301133190408, "grad_norm": 2.877037525177002, "learning_rate": 2.889226100151745e-06, "loss": 0.2133, "step": 2517 }, { "epoch": 0.726394869115607, "grad_norm": 2.2768120765686035, "learning_rate": 2.880121396054629e-06, "loss": 0.2251, "step": 2520 }, { "epoch": 0.7272596249121732, "grad_norm": 2.9239742755889893, "learning_rate": 2.8710166919575117e-06, "loss": 0.215, "step": 2523 }, { "epoch": 0.7281243807087394, "grad_norm": 1.5520339012145996, "learning_rate": 2.8619119878603945e-06, "loss": 0.2113, "step": 2526 }, { "epoch": 0.7289891365053056, "grad_norm": 3.458822011947632, "learning_rate": 2.852807283763278e-06, "loss": 0.2465, "step": 2529 }, { "epoch": 0.7298538923018718, "grad_norm": 1.606724500656128, "learning_rate": 2.843702579666161e-06, "loss": 0.188, "step": 2532 }, { "epoch": 0.730718648098438, "grad_norm": 3.552236318588257, "learning_rate": 2.834597875569044e-06, "loss": 0.2178, "step": 2535 }, { "epoch": 0.7315834038950042, "grad_norm": 2.940363883972168, "learning_rate": 2.8254931714719276e-06, "loss": 0.2149, "step": 2538 }, { "epoch": 0.7324481596915704, "grad_norm": 1.3787034749984741, "learning_rate": 2.8163884673748104e-06, "loss": 0.2155, "step": 2541 }, { "epoch": 0.7333129154881366, "grad_norm": 1.4637516736984253, "learning_rate": 2.807283763277694e-06, "loss": 0.1944, "step": 2544 }, { "epoch": 0.7341776712847028, "grad_norm": 1.5903189182281494, "learning_rate": 2.798179059180577e-06, "loss": 0.214, "step": 2547 }, { "epoch": 0.735042427081269, "grad_norm": 1.999664306640625, "learning_rate": 2.78907435508346e-06, "loss": 0.2277, "step": 2550 }, { "epoch": 0.7359071828778352, "grad_norm": 2.250450849533081, "learning_rate": 2.7799696509863435e-06, "loss": 0.1958, "step": 2553 }, { "epoch": 0.7367719386744014, "grad_norm": 1.8212629556655884, "learning_rate": 2.7708649468892263e-06, "loss": 0.2155, "step": 2556 }, { "epoch": 0.7376366944709676, "grad_norm": 1.4670761823654175, "learning_rate": 2.7617602427921096e-06, "loss": 0.2251, "step": 2559 }, { "epoch": 0.7385014502675338, "grad_norm": 2.9052860736846924, "learning_rate": 2.752655538694993e-06, "loss": 0.2279, "step": 2562 }, { "epoch": 0.7393662060641, "grad_norm": 1.647455096244812, "learning_rate": 2.7435508345978757e-06, "loss": 0.2011, "step": 2565 }, { "epoch": 0.7402309618606662, "grad_norm": 2.5667457580566406, "learning_rate": 2.734446130500759e-06, "loss": 0.2113, "step": 2568 }, { "epoch": 0.7410957176572324, "grad_norm": 2.021571159362793, "learning_rate": 2.7253414264036422e-06, "loss": 0.2085, "step": 2571 }, { "epoch": 0.7419604734537987, "grad_norm": 3.436924457550049, "learning_rate": 2.716236722306525e-06, "loss": 0.2658, "step": 2574 }, { "epoch": 0.7428252292503649, "grad_norm": 1.9480434656143188, "learning_rate": 2.7071320182094083e-06, "loss": 0.2595, "step": 2577 }, { "epoch": 0.7436899850469311, "grad_norm": 1.8556321859359741, "learning_rate": 2.6980273141122916e-06, "loss": 0.2308, "step": 2580 }, { "epoch": 0.7445547408434973, "grad_norm": 3.174111843109131, "learning_rate": 2.688922610015175e-06, "loss": 0.2172, "step": 2583 }, { "epoch": 0.7454194966400635, "grad_norm": 1.5629518032073975, "learning_rate": 2.6798179059180577e-06, "loss": 0.2417, "step": 2586 }, { "epoch": 0.7462842524366297, "grad_norm": 1.8133536577224731, "learning_rate": 2.670713201820941e-06, "loss": 0.2188, "step": 2589 }, { "epoch": 0.7471490082331959, "grad_norm": 1.5448634624481201, "learning_rate": 2.6616084977238242e-06, "loss": 0.2201, "step": 2592 }, { "epoch": 0.748013764029762, "grad_norm": 2.376194953918457, "learning_rate": 2.652503793626707e-06, "loss": 0.2146, "step": 2595 }, { "epoch": 0.7488785198263282, "grad_norm": 1.8988285064697266, "learning_rate": 2.6433990895295904e-06, "loss": 0.2322, "step": 2598 }, { "epoch": 0.7494550236907057, "eval_loss": 0.21797478199005127, "eval_mse": 0.21797478066571058, "eval_runtime": 6.533, "eval_samples_per_second": 153.07, "eval_steps_per_second": 19.134, "step": 2600 }, { "epoch": 0.7497432756228944, "grad_norm": 3.16768217086792, "learning_rate": 2.6342943854324736e-06, "loss": 0.2533, "step": 2601 }, { "epoch": 0.7506080314194606, "grad_norm": 1.7228988409042358, "learning_rate": 2.625189681335357e-06, "loss": 0.2406, "step": 2604 }, { "epoch": 0.7514727872160268, "grad_norm": 2.9629013538360596, "learning_rate": 2.61608497723824e-06, "loss": 0.2131, "step": 2607 }, { "epoch": 0.752337543012593, "grad_norm": 3.4181559085845947, "learning_rate": 2.606980273141123e-06, "loss": 0.2126, "step": 2610 }, { "epoch": 0.7532022988091592, "grad_norm": 2.142685890197754, "learning_rate": 2.5978755690440063e-06, "loss": 0.2345, "step": 2613 }, { "epoch": 0.7540670546057254, "grad_norm": 3.622145175933838, "learning_rate": 2.5887708649468895e-06, "loss": 0.2187, "step": 2616 }, { "epoch": 0.7549318104022916, "grad_norm": 1.5243175029754639, "learning_rate": 2.5796661608497724e-06, "loss": 0.1996, "step": 2619 }, { "epoch": 0.7557965661988578, "grad_norm": 2.7075355052948, "learning_rate": 2.570561456752656e-06, "loss": 0.2755, "step": 2622 }, { "epoch": 0.756661321995424, "grad_norm": 2.6778082847595215, "learning_rate": 2.561456752655539e-06, "loss": 0.1959, "step": 2625 }, { "epoch": 0.7575260777919902, "grad_norm": 3.8043668270111084, "learning_rate": 2.5523520485584217e-06, "loss": 0.2054, "step": 2628 }, { "epoch": 0.7583908335885564, "grad_norm": 3.628281354904175, "learning_rate": 2.5432473444613054e-06, "loss": 0.2095, "step": 2631 }, { "epoch": 0.7592555893851226, "grad_norm": 3.2685089111328125, "learning_rate": 2.5341426403641883e-06, "loss": 0.2167, "step": 2634 }, { "epoch": 0.7601203451816888, "grad_norm": 1.6048041582107544, "learning_rate": 2.525037936267071e-06, "loss": 0.2416, "step": 2637 }, { "epoch": 0.760985100978255, "grad_norm": 1.5175867080688477, "learning_rate": 2.515933232169955e-06, "loss": 0.2326, "step": 2640 }, { "epoch": 0.7618498567748212, "grad_norm": 1.9787158966064453, "learning_rate": 2.5068285280728376e-06, "loss": 0.2266, "step": 2643 }, { "epoch": 0.7627146125713874, "grad_norm": 4.297834873199463, "learning_rate": 2.497723823975721e-06, "loss": 0.2414, "step": 2646 }, { "epoch": 0.7635793683679536, "grad_norm": 1.822587251663208, "learning_rate": 2.488619119878604e-06, "loss": 0.2386, "step": 2649 }, { "epoch": 0.7644441241645198, "grad_norm": 4.966648101806641, "learning_rate": 2.4795144157814874e-06, "loss": 0.2268, "step": 2652 }, { "epoch": 0.765308879961086, "grad_norm": 2.8137335777282715, "learning_rate": 2.4704097116843703e-06, "loss": 0.2349, "step": 2655 }, { "epoch": 0.7661736357576522, "grad_norm": 2.5401577949523926, "learning_rate": 2.4613050075872536e-06, "loss": 0.1941, "step": 2658 }, { "epoch": 0.7670383915542184, "grad_norm": 1.550758957862854, "learning_rate": 2.452200303490137e-06, "loss": 0.2151, "step": 2661 }, { "epoch": 0.7679031473507846, "grad_norm": 6.889467239379883, "learning_rate": 2.44309559939302e-06, "loss": 0.2287, "step": 2664 }, { "epoch": 0.7687679031473508, "grad_norm": 1.4833314418792725, "learning_rate": 2.4339908952959034e-06, "loss": 0.2017, "step": 2667 }, { "epoch": 0.769632658943917, "grad_norm": 1.607751727104187, "learning_rate": 2.424886191198786e-06, "loss": 0.2309, "step": 2670 }, { "epoch": 0.7704974147404832, "grad_norm": 1.7369478940963745, "learning_rate": 2.4157814871016695e-06, "loss": 0.217, "step": 2673 }, { "epoch": 0.7713621705370494, "grad_norm": 1.7309290170669556, "learning_rate": 2.4066767830045527e-06, "loss": 0.2151, "step": 2676 }, { "epoch": 0.7722269263336156, "grad_norm": 2.047727108001709, "learning_rate": 2.397572078907436e-06, "loss": 0.2078, "step": 2679 }, { "epoch": 0.7730916821301818, "grad_norm": 2.2800142765045166, "learning_rate": 2.388467374810319e-06, "loss": 0.2372, "step": 2682 }, { "epoch": 0.773956437926748, "grad_norm": 3.920849323272705, "learning_rate": 2.379362670713202e-06, "loss": 0.2304, "step": 2685 }, { "epoch": 0.7748211937233141, "grad_norm": 3.4216678142547607, "learning_rate": 2.3702579666160854e-06, "loss": 0.2382, "step": 2688 }, { "epoch": 0.7756859495198803, "grad_norm": 1.5471861362457275, "learning_rate": 2.3611532625189686e-06, "loss": 0.2243, "step": 2691 }, { "epoch": 0.7765507053164465, "grad_norm": 1.7489866018295288, "learning_rate": 2.3520485584218515e-06, "loss": 0.2239, "step": 2694 }, { "epoch": 0.7774154611130127, "grad_norm": 4.3836822509765625, "learning_rate": 2.3429438543247347e-06, "loss": 0.205, "step": 2697 }, { "epoch": 0.7782802169095789, "grad_norm": 1.8707342147827148, "learning_rate": 2.333839150227618e-06, "loss": 0.1933, "step": 2700 }, { "epoch": 0.7782802169095789, "eval_loss": 0.2158193737268448, "eval_mse": 0.21581936262454837, "eval_runtime": 6.5614, "eval_samples_per_second": 152.407, "eval_steps_per_second": 19.051, "step": 2700 }, { "epoch": 0.7791449727061451, "grad_norm": 1.6554896831512451, "learning_rate": 2.324734446130501e-06, "loss": 0.2178, "step": 2703 }, { "epoch": 0.7800097285027113, "grad_norm": 3.131352424621582, "learning_rate": 2.315629742033384e-06, "loss": 0.2122, "step": 2706 }, { "epoch": 0.7808744842992775, "grad_norm": 2.62422776222229, "learning_rate": 2.3065250379362674e-06, "loss": 0.2085, "step": 2709 }, { "epoch": 0.7817392400958437, "grad_norm": 2.1258456707000732, "learning_rate": 2.2974203338391502e-06, "loss": 0.2203, "step": 2712 }, { "epoch": 0.7826039958924099, "grad_norm": 3.144688606262207, "learning_rate": 2.2883156297420335e-06, "loss": 0.2428, "step": 2715 }, { "epoch": 0.7834687516889761, "grad_norm": 4.5740180015563965, "learning_rate": 2.2792109256449168e-06, "loss": 0.2196, "step": 2718 }, { "epoch": 0.7843335074855423, "grad_norm": 1.7256083488464355, "learning_rate": 2.2701062215477996e-06, "loss": 0.2316, "step": 2721 }, { "epoch": 0.7851982632821085, "grad_norm": 2.0723230838775635, "learning_rate": 2.261001517450683e-06, "loss": 0.1958, "step": 2724 }, { "epoch": 0.7860630190786747, "grad_norm": 1.6268962621688843, "learning_rate": 2.251896813353566e-06, "loss": 0.2329, "step": 2727 }, { "epoch": 0.786927774875241, "grad_norm": 4.054417610168457, "learning_rate": 2.2427921092564494e-06, "loss": 0.2091, "step": 2730 }, { "epoch": 0.7877925306718072, "grad_norm": 1.7409260272979736, "learning_rate": 2.2336874051593322e-06, "loss": 0.2144, "step": 2733 }, { "epoch": 0.7886572864683734, "grad_norm": 2.77607798576355, "learning_rate": 2.2245827010622155e-06, "loss": 0.2199, "step": 2736 }, { "epoch": 0.7895220422649396, "grad_norm": 2.215284585952759, "learning_rate": 2.2154779969650988e-06, "loss": 0.2388, "step": 2739 }, { "epoch": 0.7903867980615058, "grad_norm": 1.7318382263183594, "learning_rate": 2.206373292867982e-06, "loss": 0.2115, "step": 2742 }, { "epoch": 0.791251553858072, "grad_norm": 1.5627691745758057, "learning_rate": 2.197268588770865e-06, "loss": 0.2054, "step": 2745 }, { "epoch": 0.7921163096546382, "grad_norm": 1.810509443283081, "learning_rate": 2.188163884673748e-06, "loss": 0.2086, "step": 2748 }, { "epoch": 0.7929810654512044, "grad_norm": 2.1531972885131836, "learning_rate": 2.1790591805766314e-06, "loss": 0.2252, "step": 2751 }, { "epoch": 0.7938458212477706, "grad_norm": 2.0212440490722656, "learning_rate": 2.1699544764795147e-06, "loss": 0.2355, "step": 2754 }, { "epoch": 0.7947105770443368, "grad_norm": 5.030855178833008, "learning_rate": 2.1608497723823975e-06, "loss": 0.2341, "step": 2757 }, { "epoch": 0.795575332840903, "grad_norm": 2.213249921798706, "learning_rate": 2.1517450682852808e-06, "loss": 0.2133, "step": 2760 }, { "epoch": 0.7964400886374692, "grad_norm": 1.8025689125061035, "learning_rate": 2.142640364188164e-06, "loss": 0.2251, "step": 2763 }, { "epoch": 0.7973048444340354, "grad_norm": 4.14149284362793, "learning_rate": 2.1335356600910473e-06, "loss": 0.253, "step": 2766 }, { "epoch": 0.7981696002306016, "grad_norm": 2.2051069736480713, "learning_rate": 2.12443095599393e-06, "loss": 0.2238, "step": 2769 }, { "epoch": 0.7990343560271678, "grad_norm": 2.249032497406006, "learning_rate": 2.1153262518968134e-06, "loss": 0.2282, "step": 2772 }, { "epoch": 0.799899111823734, "grad_norm": 1.5087867975234985, "learning_rate": 2.1062215477996967e-06, "loss": 0.1948, "step": 2775 }, { "epoch": 0.8007638676203002, "grad_norm": 1.9934585094451904, "learning_rate": 2.09711684370258e-06, "loss": 0.2061, "step": 2778 }, { "epoch": 0.8016286234168664, "grad_norm": 2.521526336669922, "learning_rate": 2.0880121396054632e-06, "loss": 0.2331, "step": 2781 }, { "epoch": 0.8024933792134326, "grad_norm": 4.441010475158691, "learning_rate": 2.078907435508346e-06, "loss": 0.2337, "step": 2784 }, { "epoch": 0.8033581350099988, "grad_norm": 1.9386543035507202, "learning_rate": 2.0698027314112293e-06, "loss": 0.2434, "step": 2787 }, { "epoch": 0.804222890806565, "grad_norm": 1.6140722036361694, "learning_rate": 2.0606980273141126e-06, "loss": 0.223, "step": 2790 }, { "epoch": 0.8050876466031311, "grad_norm": 3.248769998550415, "learning_rate": 2.051593323216996e-06, "loss": 0.2352, "step": 2793 }, { "epoch": 0.8059524023996973, "grad_norm": 2.259561061859131, "learning_rate": 2.0424886191198787e-06, "loss": 0.2428, "step": 2796 }, { "epoch": 0.8068171581962635, "grad_norm": 2.289113998413086, "learning_rate": 2.033383915022762e-06, "loss": 0.2085, "step": 2799 }, { "epoch": 0.8071054101284523, "eval_loss": 0.22976131737232208, "eval_mse": 0.22976131996285404, "eval_runtime": 6.5104, "eval_samples_per_second": 153.6, "eval_steps_per_second": 19.2, "step": 2800 }, { "epoch": 0.8076819139928297, "grad_norm": 1.4588847160339355, "learning_rate": 2.0242792109256452e-06, "loss": 0.2004, "step": 2802 }, { "epoch": 0.8085466697893959, "grad_norm": 3.1680517196655273, "learning_rate": 2.0151745068285285e-06, "loss": 0.2038, "step": 2805 }, { "epoch": 0.8094114255859621, "grad_norm": 2.708411693572998, "learning_rate": 2.0060698027314113e-06, "loss": 0.2289, "step": 2808 }, { "epoch": 0.8102761813825283, "grad_norm": 2.2479403018951416, "learning_rate": 1.9969650986342946e-06, "loss": 0.2465, "step": 2811 }, { "epoch": 0.8111409371790945, "grad_norm": 3.2582664489746094, "learning_rate": 1.987860394537178e-06, "loss": 0.2187, "step": 2814 }, { "epoch": 0.8120056929756607, "grad_norm": 2.5267367362976074, "learning_rate": 1.978755690440061e-06, "loss": 0.1955, "step": 2817 }, { "epoch": 0.8128704487722269, "grad_norm": 2.42645525932312, "learning_rate": 1.969650986342944e-06, "loss": 0.233, "step": 2820 }, { "epoch": 0.8137352045687931, "grad_norm": 1.6183414459228516, "learning_rate": 1.9605462822458273e-06, "loss": 0.2187, "step": 2823 }, { "epoch": 0.8145999603653593, "grad_norm": 2.7215240001678467, "learning_rate": 1.9514415781487105e-06, "loss": 0.2109, "step": 2826 }, { "epoch": 0.8154647161619255, "grad_norm": 2.152639389038086, "learning_rate": 1.9423368740515934e-06, "loss": 0.2056, "step": 2829 }, { "epoch": 0.8163294719584917, "grad_norm": 2.530045509338379, "learning_rate": 1.9332321699544766e-06, "loss": 0.2185, "step": 2832 }, { "epoch": 0.8171942277550579, "grad_norm": 1.8189818859100342, "learning_rate": 1.92412746585736e-06, "loss": 0.2502, "step": 2835 }, { "epoch": 0.8180589835516241, "grad_norm": 1.5473167896270752, "learning_rate": 1.9150227617602427e-06, "loss": 0.1923, "step": 2838 }, { "epoch": 0.8189237393481903, "grad_norm": 2.99245285987854, "learning_rate": 1.9059180576631262e-06, "loss": 0.209, "step": 2841 }, { "epoch": 0.8197884951447565, "grad_norm": 1.509892225265503, "learning_rate": 1.8968133535660093e-06, "loss": 0.2293, "step": 2844 }, { "epoch": 0.8206532509413227, "grad_norm": 3.5667645931243896, "learning_rate": 1.8877086494688923e-06, "loss": 0.1935, "step": 2847 }, { "epoch": 0.8215180067378889, "grad_norm": 3.1929867267608643, "learning_rate": 1.8786039453717756e-06, "loss": 0.201, "step": 2850 }, { "epoch": 0.8223827625344551, "grad_norm": 1.5706427097320557, "learning_rate": 1.8694992412746589e-06, "loss": 0.2315, "step": 2853 }, { "epoch": 0.8232475183310213, "grad_norm": 3.1730329990386963, "learning_rate": 1.860394537177542e-06, "loss": 0.2475, "step": 2856 }, { "epoch": 0.8241122741275875, "grad_norm": 1.3369548320770264, "learning_rate": 1.851289833080425e-06, "loss": 0.2186, "step": 2859 }, { "epoch": 0.8249770299241537, "grad_norm": 2.522751569747925, "learning_rate": 1.8421851289833082e-06, "loss": 0.2477, "step": 2862 }, { "epoch": 0.8258417857207199, "grad_norm": 1.991076111793518, "learning_rate": 1.8330804248861913e-06, "loss": 0.2009, "step": 2865 }, { "epoch": 0.8267065415172861, "grad_norm": 2.707282781600952, "learning_rate": 1.8239757207890745e-06, "loss": 0.2302, "step": 2868 }, { "epoch": 0.8275712973138523, "grad_norm": 1.578192949295044, "learning_rate": 1.8148710166919576e-06, "loss": 0.2021, "step": 2871 }, { "epoch": 0.8284360531104185, "grad_norm": 3.6003148555755615, "learning_rate": 1.8057663125948407e-06, "loss": 0.2409, "step": 2874 }, { "epoch": 0.8293008089069847, "grad_norm": 3.1442506313323975, "learning_rate": 1.796661608497724e-06, "loss": 0.2129, "step": 2877 }, { "epoch": 0.8301655647035509, "grad_norm": 1.509333610534668, "learning_rate": 1.7875569044006072e-06, "loss": 0.2202, "step": 2880 }, { "epoch": 0.831030320500117, "grad_norm": 1.9379024505615234, "learning_rate": 1.7784522003034905e-06, "loss": 0.2238, "step": 2883 }, { "epoch": 0.8318950762966834, "grad_norm": 1.3617918491363525, "learning_rate": 1.7693474962063733e-06, "loss": 0.1972, "step": 2886 }, { "epoch": 0.8327598320932496, "grad_norm": 1.4775515794754028, "learning_rate": 1.7602427921092566e-06, "loss": 0.1985, "step": 2889 }, { "epoch": 0.8336245878898157, "grad_norm": 1.4302202463150024, "learning_rate": 1.7511380880121398e-06, "loss": 0.2403, "step": 2892 }, { "epoch": 0.834489343686382, "grad_norm": 2.128401041030884, "learning_rate": 1.742033383915023e-06, "loss": 0.2204, "step": 2895 }, { "epoch": 0.8353540994829481, "grad_norm": 1.9766216278076172, "learning_rate": 1.732928679817906e-06, "loss": 0.2038, "step": 2898 }, { "epoch": 0.8359306033473256, "eval_loss": 0.21660968661308289, "eval_mse": 0.21660968138270925, "eval_runtime": 6.5731, "eval_samples_per_second": 152.135, "eval_steps_per_second": 19.017, "step": 2900 }, { "epoch": 0.8362188552795143, "grad_norm": 2.260625123977661, "learning_rate": 1.7238239757207892e-06, "loss": 0.2206, "step": 2901 }, { "epoch": 0.8370836110760805, "grad_norm": 2.605224609375, "learning_rate": 1.7147192716236725e-06, "loss": 0.2186, "step": 2904 }, { "epoch": 0.8379483668726467, "grad_norm": 2.7619729042053223, "learning_rate": 1.7056145675265557e-06, "loss": 0.2195, "step": 2907 }, { "epoch": 0.8388131226692129, "grad_norm": 1.9395873546600342, "learning_rate": 1.6965098634294386e-06, "loss": 0.2119, "step": 2910 }, { "epoch": 0.8396778784657791, "grad_norm": 1.727263331413269, "learning_rate": 1.6874051593323218e-06, "loss": 0.2125, "step": 2913 }, { "epoch": 0.8405426342623453, "grad_norm": 2.149775981903076, "learning_rate": 1.6783004552352051e-06, "loss": 0.2644, "step": 2916 }, { "epoch": 0.8414073900589115, "grad_norm": 2.6743104457855225, "learning_rate": 1.6691957511380882e-06, "loss": 0.2217, "step": 2919 }, { "epoch": 0.8422721458554777, "grad_norm": 2.795736074447632, "learning_rate": 1.6600910470409712e-06, "loss": 0.2115, "step": 2922 }, { "epoch": 0.8431369016520439, "grad_norm": 1.8719727993011475, "learning_rate": 1.6509863429438545e-06, "loss": 0.1768, "step": 2925 }, { "epoch": 0.8440016574486101, "grad_norm": 3.4366025924682617, "learning_rate": 1.6418816388467375e-06, "loss": 0.2357, "step": 2928 }, { "epoch": 0.8448664132451763, "grad_norm": 2.2458267211914062, "learning_rate": 1.6327769347496208e-06, "loss": 0.2147, "step": 2931 }, { "epoch": 0.8457311690417425, "grad_norm": 1.958115577697754, "learning_rate": 1.6236722306525039e-06, "loss": 0.2339, "step": 2934 }, { "epoch": 0.8465959248383087, "grad_norm": 1.6470586061477661, "learning_rate": 1.614567526555387e-06, "loss": 0.2259, "step": 2937 }, { "epoch": 0.8474606806348749, "grad_norm": 1.2936792373657227, "learning_rate": 1.6054628224582702e-06, "loss": 0.1981, "step": 2940 }, { "epoch": 0.8483254364314411, "grad_norm": 4.565530300140381, "learning_rate": 1.5963581183611534e-06, "loss": 0.1993, "step": 2943 }, { "epoch": 0.8491901922280073, "grad_norm": 2.8682401180267334, "learning_rate": 1.5872534142640367e-06, "loss": 0.2093, "step": 2946 }, { "epoch": 0.8500549480245735, "grad_norm": 1.7801469564437866, "learning_rate": 1.5781487101669196e-06, "loss": 0.2136, "step": 2949 }, { "epoch": 0.8509197038211397, "grad_norm": 2.372549057006836, "learning_rate": 1.5690440060698028e-06, "loss": 0.1845, "step": 2952 }, { "epoch": 0.8517844596177059, "grad_norm": 2.190469741821289, "learning_rate": 1.559939301972686e-06, "loss": 0.2137, "step": 2955 }, { "epoch": 0.8526492154142721, "grad_norm": 1.6399952173233032, "learning_rate": 1.5508345978755694e-06, "loss": 0.2206, "step": 2958 }, { "epoch": 0.8535139712108383, "grad_norm": 1.6555943489074707, "learning_rate": 1.5417298937784522e-06, "loss": 0.2403, "step": 2961 }, { "epoch": 0.8543787270074045, "grad_norm": 2.6609280109405518, "learning_rate": 1.5326251896813355e-06, "loss": 0.2176, "step": 2964 }, { "epoch": 0.8552434828039707, "grad_norm": 2.3398261070251465, "learning_rate": 1.5235204855842187e-06, "loss": 0.219, "step": 2967 }, { "epoch": 0.8561082386005369, "grad_norm": 1.9740712642669678, "learning_rate": 1.514415781487102e-06, "loss": 0.2232, "step": 2970 }, { "epoch": 0.8569729943971031, "grad_norm": 1.6300252676010132, "learning_rate": 1.5053110773899848e-06, "loss": 0.2094, "step": 2973 }, { "epoch": 0.8578377501936693, "grad_norm": 2.8211612701416016, "learning_rate": 1.496206373292868e-06, "loss": 0.1696, "step": 2976 }, { "epoch": 0.8587025059902355, "grad_norm": 2.621321439743042, "learning_rate": 1.4871016691957514e-06, "loss": 0.2357, "step": 2979 }, { "epoch": 0.8595672617868017, "grad_norm": 1.5020322799682617, "learning_rate": 1.4779969650986344e-06, "loss": 0.2456, "step": 2982 }, { "epoch": 0.8604320175833678, "grad_norm": 1.474507212638855, "learning_rate": 1.4688922610015175e-06, "loss": 0.2074, "step": 2985 }, { "epoch": 0.861296773379934, "grad_norm": 2.9856317043304443, "learning_rate": 1.4597875569044007e-06, "loss": 0.2241, "step": 2988 }, { "epoch": 0.8621615291765002, "grad_norm": 2.0011954307556152, "learning_rate": 1.4506828528072838e-06, "loss": 0.2026, "step": 2991 }, { "epoch": 0.8630262849730664, "grad_norm": 1.6045671701431274, "learning_rate": 1.441578148710167e-06, "loss": 0.186, "step": 2994 }, { "epoch": 0.8638910407696326, "grad_norm": 1.5708575248718262, "learning_rate": 1.4324734446130503e-06, "loss": 0.2048, "step": 2997 }, { "epoch": 0.8647557965661988, "grad_norm": 2.8704543113708496, "learning_rate": 1.4233687405159332e-06, "loss": 0.2158, "step": 3000 }, { "epoch": 0.8647557965661988, "eval_loss": 0.2083793729543686, "eval_mse": 0.20837937731285638, "eval_runtime": 6.6796, "eval_samples_per_second": 149.709, "eval_steps_per_second": 18.714, "step": 3000 }, { "epoch": 0.865620552362765, "grad_norm": 3.2445404529571533, "learning_rate": 1.4142640364188164e-06, "loss": 0.2144, "step": 3003 }, { "epoch": 0.8664853081593312, "grad_norm": 1.96418297290802, "learning_rate": 1.4051593323216997e-06, "loss": 0.2491, "step": 3006 }, { "epoch": 0.8673500639558974, "grad_norm": 1.8195468187332153, "learning_rate": 1.396054628224583e-06, "loss": 0.2065, "step": 3009 }, { "epoch": 0.8682148197524636, "grad_norm": 1.3888121843338013, "learning_rate": 1.3869499241274658e-06, "loss": 0.2049, "step": 3012 }, { "epoch": 0.8690795755490298, "grad_norm": 3.738133668899536, "learning_rate": 1.377845220030349e-06, "loss": 0.2522, "step": 3015 }, { "epoch": 0.869944331345596, "grad_norm": 6.097411632537842, "learning_rate": 1.3687405159332323e-06, "loss": 0.2532, "step": 3018 }, { "epoch": 0.8708090871421622, "grad_norm": 2.2873427867889404, "learning_rate": 1.3596358118361156e-06, "loss": 0.2382, "step": 3021 }, { "epoch": 0.8716738429387284, "grad_norm": 1.540143370628357, "learning_rate": 1.3505311077389985e-06, "loss": 0.1979, "step": 3024 }, { "epoch": 0.8725385987352946, "grad_norm": 1.6231845617294312, "learning_rate": 1.3414264036418817e-06, "loss": 0.195, "step": 3027 }, { "epoch": 0.8734033545318608, "grad_norm": 2.2970290184020996, "learning_rate": 1.332321699544765e-06, "loss": 0.2154, "step": 3030 }, { "epoch": 0.874268110328427, "grad_norm": 2.0112340450286865, "learning_rate": 1.3232169954476482e-06, "loss": 0.2355, "step": 3033 }, { "epoch": 0.8751328661249932, "grad_norm": 1.3887783288955688, "learning_rate": 1.314112291350531e-06, "loss": 0.201, "step": 3036 }, { "epoch": 0.8759976219215594, "grad_norm": 2.187082529067993, "learning_rate": 1.3050075872534144e-06, "loss": 0.1965, "step": 3039 }, { "epoch": 0.8768623777181257, "grad_norm": 1.300243616104126, "learning_rate": 1.2959028831562976e-06, "loss": 0.2097, "step": 3042 }, { "epoch": 0.8777271335146919, "grad_norm": 2.1217234134674072, "learning_rate": 1.2867981790591807e-06, "loss": 0.214, "step": 3045 }, { "epoch": 0.8785918893112581, "grad_norm": 1.8281973600387573, "learning_rate": 1.277693474962064e-06, "loss": 0.2082, "step": 3048 }, { "epoch": 0.8794566451078243, "grad_norm": 2.3602306842803955, "learning_rate": 1.268588770864947e-06, "loss": 0.2172, "step": 3051 }, { "epoch": 0.8803214009043905, "grad_norm": 1.903954267501831, "learning_rate": 1.25948406676783e-06, "loss": 0.2323, "step": 3054 }, { "epoch": 0.8811861567009567, "grad_norm": 3.514057159423828, "learning_rate": 1.2503793626707133e-06, "loss": 0.2323, "step": 3057 }, { "epoch": 0.8820509124975229, "grad_norm": 3.3089487552642822, "learning_rate": 1.2412746585735964e-06, "loss": 0.2506, "step": 3060 }, { "epoch": 0.8829156682940891, "grad_norm": 2.317981004714966, "learning_rate": 1.2321699544764796e-06, "loss": 0.2274, "step": 3063 }, { "epoch": 0.8837804240906553, "grad_norm": 2.7326478958129883, "learning_rate": 1.2230652503793627e-06, "loss": 0.2057, "step": 3066 }, { "epoch": 0.8846451798872215, "grad_norm": 4.13656759262085, "learning_rate": 1.213960546282246e-06, "loss": 0.2209, "step": 3069 }, { "epoch": 0.8855099356837877, "grad_norm": 2.0688633918762207, "learning_rate": 1.204855842185129e-06, "loss": 0.2608, "step": 3072 }, { "epoch": 0.8863746914803539, "grad_norm": 1.9340734481811523, "learning_rate": 1.1957511380880123e-06, "loss": 0.2187, "step": 3075 }, { "epoch": 0.8872394472769201, "grad_norm": 2.4431509971618652, "learning_rate": 1.1866464339908953e-06, "loss": 0.2578, "step": 3078 }, { "epoch": 0.8881042030734863, "grad_norm": 2.9879822731018066, "learning_rate": 1.1775417298937786e-06, "loss": 0.2614, "step": 3081 }, { "epoch": 0.8889689588700525, "grad_norm": 1.412150502204895, "learning_rate": 1.1684370257966617e-06, "loss": 0.1879, "step": 3084 }, { "epoch": 0.8898337146666186, "grad_norm": 2.1106693744659424, "learning_rate": 1.159332321699545e-06, "loss": 0.2436, "step": 3087 }, { "epoch": 0.8906984704631848, "grad_norm": 1.6913747787475586, "learning_rate": 1.150227617602428e-06, "loss": 0.2234, "step": 3090 }, { "epoch": 0.891563226259751, "grad_norm": 1.6746766567230225, "learning_rate": 1.1411229135053112e-06, "loss": 0.2381, "step": 3093 }, { "epoch": 0.8924279820563172, "grad_norm": 3.068824291229248, "learning_rate": 1.1320182094081943e-06, "loss": 0.223, "step": 3096 }, { "epoch": 0.8932927378528834, "grad_norm": 2.9033825397491455, "learning_rate": 1.1229135053110776e-06, "loss": 0.2197, "step": 3099 }, { "epoch": 0.8935809897850722, "eval_loss": 0.21448279917240143, "eval_mse": 0.21448280355427415, "eval_runtime": 6.6727, "eval_samples_per_second": 149.865, "eval_steps_per_second": 18.733, "step": 3100 }, { "epoch": 0.8941574936494496, "grad_norm": 1.8722037076950073, "learning_rate": 1.1138088012139606e-06, "loss": 0.1974, "step": 3102 }, { "epoch": 0.8950222494460158, "grad_norm": 1.319684624671936, "learning_rate": 1.1047040971168439e-06, "loss": 0.2027, "step": 3105 }, { "epoch": 0.895887005242582, "grad_norm": 3.9506242275238037, "learning_rate": 1.095599393019727e-06, "loss": 0.2124, "step": 3108 }, { "epoch": 0.8967517610391482, "grad_norm": 1.7725896835327148, "learning_rate": 1.08649468892261e-06, "loss": 0.2146, "step": 3111 }, { "epoch": 0.8976165168357144, "grad_norm": 1.9070608615875244, "learning_rate": 1.0773899848254933e-06, "loss": 0.2311, "step": 3114 }, { "epoch": 0.8984812726322806, "grad_norm": 2.3098270893096924, "learning_rate": 1.0682852807283763e-06, "loss": 0.1847, "step": 3117 }, { "epoch": 0.8993460284288468, "grad_norm": 2.392598867416382, "learning_rate": 1.0591805766312596e-06, "loss": 0.252, "step": 3120 }, { "epoch": 0.900210784225413, "grad_norm": 1.592748761177063, "learning_rate": 1.0500758725341426e-06, "loss": 0.2226, "step": 3123 }, { "epoch": 0.9010755400219792, "grad_norm": 2.1816020011901855, "learning_rate": 1.0409711684370259e-06, "loss": 0.2551, "step": 3126 }, { "epoch": 0.9019402958185454, "grad_norm": 2.04571270942688, "learning_rate": 1.031866464339909e-06, "loss": 0.2506, "step": 3129 }, { "epoch": 0.9028050516151116, "grad_norm": 3.148040771484375, "learning_rate": 1.0227617602427922e-06, "loss": 0.2056, "step": 3132 }, { "epoch": 0.9036698074116778, "grad_norm": 4.721248626708984, "learning_rate": 1.0136570561456753e-06, "loss": 0.2243, "step": 3135 }, { "epoch": 0.904534563208244, "grad_norm": 1.5059829950332642, "learning_rate": 1.0045523520485585e-06, "loss": 0.1884, "step": 3138 }, { "epoch": 0.9053993190048102, "grad_norm": 1.9442839622497559, "learning_rate": 9.954476479514416e-07, "loss": 0.1965, "step": 3141 }, { "epoch": 0.9062640748013764, "grad_norm": 2.1955509185791016, "learning_rate": 9.863429438543249e-07, "loss": 0.204, "step": 3144 }, { "epoch": 0.9071288305979426, "grad_norm": 1.5504348278045654, "learning_rate": 9.77238239757208e-07, "loss": 0.2248, "step": 3147 }, { "epoch": 0.9079935863945088, "grad_norm": 1.6235114336013794, "learning_rate": 9.681335356600912e-07, "loss": 0.2276, "step": 3150 }, { "epoch": 0.908858342191075, "grad_norm": 2.8215818405151367, "learning_rate": 9.590288315629742e-07, "loss": 0.2292, "step": 3153 }, { "epoch": 0.9097230979876412, "grad_norm": 1.7022403478622437, "learning_rate": 9.499241274658574e-07, "loss": 0.2383, "step": 3156 }, { "epoch": 0.9105878537842074, "grad_norm": 2.5963640213012695, "learning_rate": 9.408194233687407e-07, "loss": 0.2311, "step": 3159 }, { "epoch": 0.9114526095807736, "grad_norm": 1.830937147140503, "learning_rate": 9.317147192716237e-07, "loss": 0.2073, "step": 3162 }, { "epoch": 0.9123173653773398, "grad_norm": 1.8354508876800537, "learning_rate": 9.22610015174507e-07, "loss": 0.2143, "step": 3165 }, { "epoch": 0.913182121173906, "grad_norm": 1.5569084882736206, "learning_rate": 9.1350531107739e-07, "loss": 0.2102, "step": 3168 }, { "epoch": 0.9140468769704722, "grad_norm": 1.5762006044387817, "learning_rate": 9.044006069802733e-07, "loss": 0.2335, "step": 3171 }, { "epoch": 0.9149116327670384, "grad_norm": 1.4155080318450928, "learning_rate": 8.952959028831563e-07, "loss": 0.1993, "step": 3174 }, { "epoch": 0.9157763885636045, "grad_norm": 3.324310779571533, "learning_rate": 8.861911987860396e-07, "loss": 0.2209, "step": 3177 }, { "epoch": 0.9166411443601707, "grad_norm": 2.4857962131500244, "learning_rate": 8.770864946889227e-07, "loss": 0.2384, "step": 3180 }, { "epoch": 0.9175059001567369, "grad_norm": 1.998383641242981, "learning_rate": 8.679817905918058e-07, "loss": 0.2216, "step": 3183 }, { "epoch": 0.9183706559533031, "grad_norm": 2.2427866458892822, "learning_rate": 8.58877086494689e-07, "loss": 0.2328, "step": 3186 }, { "epoch": 0.9192354117498693, "grad_norm": 2.5861330032348633, "learning_rate": 8.497723823975721e-07, "loss": 0.2228, "step": 3189 }, { "epoch": 0.9201001675464355, "grad_norm": 1.680492639541626, "learning_rate": 8.406676783004553e-07, "loss": 0.2241, "step": 3192 }, { "epoch": 0.9209649233430017, "grad_norm": 2.616267204284668, "learning_rate": 8.315629742033385e-07, "loss": 0.2524, "step": 3195 }, { "epoch": 0.921829679139568, "grad_norm": 1.922303318977356, "learning_rate": 8.224582701062215e-07, "loss": 0.2397, "step": 3198 }, { "epoch": 0.9224061830039455, "eval_loss": 0.21633096039295197, "eval_mse": 0.2163309759118274, "eval_runtime": 6.5496, "eval_samples_per_second": 152.68, "eval_steps_per_second": 19.085, "step": 3200 }, { "epoch": 0.9226944349361342, "grad_norm": 4.39849853515625, "learning_rate": 8.133535660091048e-07, "loss": 0.2216, "step": 3201 }, { "epoch": 0.9235591907327004, "grad_norm": 2.2769124507904053, "learning_rate": 8.042488619119878e-07, "loss": 0.2182, "step": 3204 }, { "epoch": 0.9244239465292666, "grad_norm": 2.8028557300567627, "learning_rate": 7.951441578148711e-07, "loss": 0.229, "step": 3207 }, { "epoch": 0.9252887023258328, "grad_norm": 2.1702733039855957, "learning_rate": 7.860394537177542e-07, "loss": 0.2261, "step": 3210 }, { "epoch": 0.926153458122399, "grad_norm": 2.6439995765686035, "learning_rate": 7.769347496206374e-07, "loss": 0.2246, "step": 3213 }, { "epoch": 0.9270182139189652, "grad_norm": 1.576919436454773, "learning_rate": 7.678300455235206e-07, "loss": 0.2354, "step": 3216 }, { "epoch": 0.9278829697155314, "grad_norm": 1.6755398511886597, "learning_rate": 7.587253414264036e-07, "loss": 0.2158, "step": 3219 }, { "epoch": 0.9287477255120976, "grad_norm": 2.1890718936920166, "learning_rate": 7.496206373292869e-07, "loss": 0.2233, "step": 3222 }, { "epoch": 0.9296124813086638, "grad_norm": 1.7316986322402954, "learning_rate": 7.4051593323217e-07, "loss": 0.2296, "step": 3225 }, { "epoch": 0.93047723710523, "grad_norm": 1.7639137506484985, "learning_rate": 7.314112291350532e-07, "loss": 0.2274, "step": 3228 }, { "epoch": 0.9313419929017962, "grad_norm": 1.7912460565567017, "learning_rate": 7.223065250379363e-07, "loss": 0.1942, "step": 3231 }, { "epoch": 0.9322067486983624, "grad_norm": 1.9391751289367676, "learning_rate": 7.132018209408196e-07, "loss": 0.2326, "step": 3234 }, { "epoch": 0.9330715044949286, "grad_norm": 2.4329934120178223, "learning_rate": 7.040971168437026e-07, "loss": 0.2205, "step": 3237 }, { "epoch": 0.9339362602914948, "grad_norm": 1.5994445085525513, "learning_rate": 6.949924127465859e-07, "loss": 0.2149, "step": 3240 }, { "epoch": 0.934801016088061, "grad_norm": 2.992966651916504, "learning_rate": 6.858877086494689e-07, "loss": 0.2194, "step": 3243 }, { "epoch": 0.9356657718846272, "grad_norm": 1.8795028924942017, "learning_rate": 6.767830045523521e-07, "loss": 0.2158, "step": 3246 }, { "epoch": 0.9365305276811934, "grad_norm": 3.5229902267456055, "learning_rate": 6.676783004552352e-07, "loss": 0.2396, "step": 3249 }, { "epoch": 0.9373952834777596, "grad_norm": 1.6539433002471924, "learning_rate": 6.585735963581184e-07, "loss": 0.2077, "step": 3252 }, { "epoch": 0.9382600392743258, "grad_norm": 2.448824405670166, "learning_rate": 6.494688922610016e-07, "loss": 0.2271, "step": 3255 }, { "epoch": 0.939124795070892, "grad_norm": 1.3006185293197632, "learning_rate": 6.403641881638847e-07, "loss": 0.2432, "step": 3258 }, { "epoch": 0.9399895508674582, "grad_norm": 2.689985752105713, "learning_rate": 6.312594840667678e-07, "loss": 0.2152, "step": 3261 }, { "epoch": 0.9408543066640244, "grad_norm": 1.5370323657989502, "learning_rate": 6.22154779969651e-07, "loss": 0.197, "step": 3264 }, { "epoch": 0.9417190624605906, "grad_norm": 1.6089318990707397, "learning_rate": 6.130500758725342e-07, "loss": 0.2145, "step": 3267 }, { "epoch": 0.9425838182571568, "grad_norm": 2.1321656703948975, "learning_rate": 6.039453717754174e-07, "loss": 0.1989, "step": 3270 }, { "epoch": 0.943448574053723, "grad_norm": 1.990070104598999, "learning_rate": 5.948406676783005e-07, "loss": 0.1982, "step": 3273 }, { "epoch": 0.9443133298502892, "grad_norm": 3.087510585784912, "learning_rate": 5.857359635811837e-07, "loss": 0.206, "step": 3276 }, { "epoch": 0.9451780856468553, "grad_norm": 2.3679909706115723, "learning_rate": 5.766312594840668e-07, "loss": 0.2292, "step": 3279 }, { "epoch": 0.9460428414434215, "grad_norm": 1.546036958694458, "learning_rate": 5.675265553869499e-07, "loss": 0.2127, "step": 3282 }, { "epoch": 0.9469075972399877, "grad_norm": 1.9500880241394043, "learning_rate": 5.584218512898331e-07, "loss": 0.2157, "step": 3285 }, { "epoch": 0.9477723530365539, "grad_norm": 1.9931670427322388, "learning_rate": 5.493171471927162e-07, "loss": 0.2151, "step": 3288 }, { "epoch": 0.9486371088331201, "grad_norm": 1.7962517738342285, "learning_rate": 5.402124430955994e-07, "loss": 0.2035, "step": 3291 }, { "epoch": 0.9495018646296863, "grad_norm": 1.3835334777832031, "learning_rate": 5.311077389984825e-07, "loss": 0.2059, "step": 3294 }, { "epoch": 0.9503666204262525, "grad_norm": 3.157975912094116, "learning_rate": 5.220030349013658e-07, "loss": 0.2002, "step": 3297 }, { "epoch": 0.9512313762228187, "grad_norm": 2.0881123542785645, "learning_rate": 5.12898330804249e-07, "loss": 0.2307, "step": 3300 }, { "epoch": 0.9512313762228187, "eval_loss": 0.21600358188152313, "eval_mse": 0.21600358275626785, "eval_runtime": 6.4833, "eval_samples_per_second": 154.243, "eval_steps_per_second": 19.28, "step": 3300 }, { "epoch": 0.9520961320193849, "grad_norm": 2.1528842449188232, "learning_rate": 5.037936267071321e-07, "loss": 0.2657, "step": 3303 }, { "epoch": 0.9529608878159511, "grad_norm": 2.4048590660095215, "learning_rate": 4.946889226100153e-07, "loss": 0.2355, "step": 3306 }, { "epoch": 0.9538256436125173, "grad_norm": 1.866373896598816, "learning_rate": 4.855842185128983e-07, "loss": 0.1932, "step": 3309 }, { "epoch": 0.9546903994090835, "grad_norm": 1.543273687362671, "learning_rate": 4.7647951441578155e-07, "loss": 0.2088, "step": 3312 }, { "epoch": 0.9555551552056497, "grad_norm": 1.585208535194397, "learning_rate": 4.673748103186647e-07, "loss": 0.1945, "step": 3315 }, { "epoch": 0.9564199110022159, "grad_norm": 1.9510430097579956, "learning_rate": 4.582701062215478e-07, "loss": 0.1719, "step": 3318 }, { "epoch": 0.9572846667987821, "grad_norm": 1.36229407787323, "learning_rate": 4.49165402124431e-07, "loss": 0.1997, "step": 3321 }, { "epoch": 0.9581494225953483, "grad_norm": 2.563950777053833, "learning_rate": 4.4006069802731414e-07, "loss": 0.2385, "step": 3324 }, { "epoch": 0.9590141783919145, "grad_norm": 2.159186363220215, "learning_rate": 4.309559939301973e-07, "loss": 0.2319, "step": 3327 }, { "epoch": 0.9598789341884807, "grad_norm": 2.4233345985412598, "learning_rate": 4.2185128983308046e-07, "loss": 0.2117, "step": 3330 }, { "epoch": 0.9607436899850469, "grad_norm": 1.915822982788086, "learning_rate": 4.127465857359636e-07, "loss": 0.2244, "step": 3333 }, { "epoch": 0.9616084457816131, "grad_norm": 3.731882333755493, "learning_rate": 4.0364188163884673e-07, "loss": 0.2222, "step": 3336 }, { "epoch": 0.9624732015781793, "grad_norm": 1.4122893810272217, "learning_rate": 3.945371775417299e-07, "loss": 0.2052, "step": 3339 }, { "epoch": 0.9633379573747455, "grad_norm": 3.098508596420288, "learning_rate": 3.8543247344461305e-07, "loss": 0.212, "step": 3342 }, { "epoch": 0.9642027131713117, "grad_norm": 1.4726747274398804, "learning_rate": 3.763277693474962e-07, "loss": 0.2374, "step": 3345 }, { "epoch": 0.9650674689678779, "grad_norm": 1.8609498739242554, "learning_rate": 3.6722306525037937e-07, "loss": 0.2173, "step": 3348 }, { "epoch": 0.9659322247644441, "grad_norm": 1.8475359678268433, "learning_rate": 3.581183611532626e-07, "loss": 0.2032, "step": 3351 }, { "epoch": 0.9667969805610104, "grad_norm": 1.7029978036880493, "learning_rate": 3.4901365705614574e-07, "loss": 0.1713, "step": 3354 }, { "epoch": 0.9676617363575766, "grad_norm": 2.657390832901001, "learning_rate": 3.399089529590289e-07, "loss": 0.2216, "step": 3357 }, { "epoch": 0.9685264921541428, "grad_norm": 3.23854398727417, "learning_rate": 3.3080424886191206e-07, "loss": 0.1776, "step": 3360 }, { "epoch": 0.969391247950709, "grad_norm": 2.09384822845459, "learning_rate": 3.2169954476479517e-07, "loss": 0.1866, "step": 3363 }, { "epoch": 0.9702560037472752, "grad_norm": 1.8957816362380981, "learning_rate": 3.1259484066767833e-07, "loss": 0.2317, "step": 3366 }, { "epoch": 0.9711207595438414, "grad_norm": 1.5353327989578247, "learning_rate": 3.034901365705615e-07, "loss": 0.209, "step": 3369 }, { "epoch": 0.9719855153404076, "grad_norm": 1.776352882385254, "learning_rate": 2.9438543247344465e-07, "loss": 0.219, "step": 3372 }, { "epoch": 0.9728502711369738, "grad_norm": 3.282552480697632, "learning_rate": 2.852807283763278e-07, "loss": 0.224, "step": 3375 }, { "epoch": 0.97371502693354, "grad_norm": 1.8530248403549194, "learning_rate": 2.7617602427921097e-07, "loss": 0.2147, "step": 3378 }, { "epoch": 0.9745797827301061, "grad_norm": 2.0258686542510986, "learning_rate": 2.670713201820941e-07, "loss": 0.2081, "step": 3381 }, { "epoch": 0.9754445385266723, "grad_norm": 1.830100655555725, "learning_rate": 2.5796661608497724e-07, "loss": 0.2248, "step": 3384 }, { "epoch": 0.9763092943232385, "grad_norm": 3.846280813217163, "learning_rate": 2.488619119878604e-07, "loss": 0.2269, "step": 3387 }, { "epoch": 0.9771740501198047, "grad_norm": 2.3556368350982666, "learning_rate": 2.3975720789074356e-07, "loss": 0.2195, "step": 3390 }, { "epoch": 0.9780388059163709, "grad_norm": 1.8791780471801758, "learning_rate": 2.3065250379362674e-07, "loss": 0.2297, "step": 3393 }, { "epoch": 0.9789035617129371, "grad_norm": 1.6351841688156128, "learning_rate": 2.215477996965099e-07, "loss": 0.1925, "step": 3396 }, { "epoch": 0.9797683175095033, "grad_norm": 2.4028263092041016, "learning_rate": 2.1244309559939304e-07, "loss": 0.2099, "step": 3399 }, { "epoch": 0.9800565694416921, "eval_loss": 0.2100730687379837, "eval_mse": 0.21007308381050824, "eval_runtime": 6.746, "eval_samples_per_second": 148.235, "eval_steps_per_second": 18.529, "step": 3400 }, { "epoch": 0.9806330733060695, "grad_norm": 3.0006494522094727, "learning_rate": 2.033383915022762e-07, "loss": 0.2095, "step": 3402 }, { "epoch": 0.9814978291026357, "grad_norm": 2.7023770809173584, "learning_rate": 1.9423368740515936e-07, "loss": 0.2064, "step": 3405 }, { "epoch": 0.9823625848992019, "grad_norm": 3.8468315601348877, "learning_rate": 1.851289833080425e-07, "loss": 0.2412, "step": 3408 }, { "epoch": 0.9832273406957681, "grad_norm": 2.5841078758239746, "learning_rate": 1.7602427921092565e-07, "loss": 0.2056, "step": 3411 }, { "epoch": 0.9840920964923343, "grad_norm": 2.557180881500244, "learning_rate": 1.669195751138088e-07, "loss": 0.2449, "step": 3414 }, { "epoch": 0.9849568522889005, "grad_norm": 2.1071228981018066, "learning_rate": 1.5781487101669194e-07, "loss": 0.2664, "step": 3417 }, { "epoch": 0.9858216080854667, "grad_norm": 2.5804591178894043, "learning_rate": 1.4871016691957513e-07, "loss": 0.2252, "step": 3420 }, { "epoch": 0.9866863638820329, "grad_norm": 1.557554006576538, "learning_rate": 1.3960546282245826e-07, "loss": 0.1979, "step": 3423 }, { "epoch": 0.9875511196785991, "grad_norm": 2.0957911014556885, "learning_rate": 1.3050075872534145e-07, "loss": 0.21, "step": 3426 }, { "epoch": 0.9884158754751653, "grad_norm": 2.239748954772949, "learning_rate": 1.2139605462822459e-07, "loss": 0.1995, "step": 3429 }, { "epoch": 0.9892806312717315, "grad_norm": 2.46882963180542, "learning_rate": 1.1229135053110775e-07, "loss": 0.2319, "step": 3432 }, { "epoch": 0.9901453870682977, "grad_norm": 2.6145310401916504, "learning_rate": 1.031866464339909e-07, "loss": 0.2245, "step": 3435 }, { "epoch": 0.9910101428648639, "grad_norm": 2.32053279876709, "learning_rate": 9.408194233687405e-08, "loss": 0.2338, "step": 3438 }, { "epoch": 0.9918748986614301, "grad_norm": 1.4136054515838623, "learning_rate": 8.497723823975723e-08, "loss": 0.2173, "step": 3441 }, { "epoch": 0.9927396544579963, "grad_norm": 1.3618991374969482, "learning_rate": 7.587253414264037e-08, "loss": 0.1794, "step": 3444 }, { "epoch": 0.9936044102545625, "grad_norm": 2.8345561027526855, "learning_rate": 6.676783004552352e-08, "loss": 0.2182, "step": 3447 }, { "epoch": 0.9944691660511287, "grad_norm": 1.9578803777694702, "learning_rate": 5.7663125948406686e-08, "loss": 0.2333, "step": 3450 }, { "epoch": 0.9953339218476949, "grad_norm": 1.6162538528442383, "learning_rate": 4.855842185128984e-08, "loss": 0.2005, "step": 3453 }, { "epoch": 0.9961986776442611, "grad_norm": 3.075516939163208, "learning_rate": 3.9453717754172986e-08, "loss": 0.2249, "step": 3456 }, { "epoch": 0.9970634334408273, "grad_norm": 2.544214963912964, "learning_rate": 3.0349013657056146e-08, "loss": 0.2147, "step": 3459 }, { "epoch": 0.9979281892373935, "grad_norm": 1.9591963291168213, "learning_rate": 2.1244309559939306e-08, "loss": 0.2164, "step": 3462 }, { "epoch": 0.9987929450339597, "grad_norm": 1.6480742692947388, "learning_rate": 1.213960546282246e-08, "loss": 0.1889, "step": 3465 }, { "epoch": 0.9996577008305259, "grad_norm": 1.6390196084976196, "learning_rate": 3.034901365705615e-09, "loss": 0.2205, "step": 3468 }, { "epoch": 0.9999459527627146, "step": 3469, "total_flos": 1.1682867916662374e+17, "train_loss": 0.2771636627187094, "train_runtime": 4603.777, "train_samples_per_second": 96.454, "train_steps_per_second": 0.754 } ], "logging_steps": 3, "max_steps": 3469, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1682867916662374e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }