{ "best_metric": 0.5200754404067993, "best_model_checkpoint": "./models/T-lite-it_7B_lora_thinking/checkpoint-129408", "epoch": 2.8803418803418803, "eval_steps": 2696, "global_step": 129408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022257834757834758, "grad_norm": 0.28544881939888, "learning_rate": 0.00025, "loss": 0.9123, "step": 10 }, { "epoch": 0.00044515669515669517, "grad_norm": 0.3727859556674957, "learning_rate": 0.0003999999991305469, "loss": 0.8314, "step": 20 }, { "epoch": 0.0006677350427350427, "grad_norm": 0.28690457344055176, "learning_rate": 0.0003999999893491996, "loss": 0.6179, "step": 30 }, { "epoch": 0.0008903133903133903, "grad_norm": 0.3062066435813904, "learning_rate": 0.00039999996869968915, "loss": 0.6463, "step": 40 }, { "epoch": 0.001112891737891738, "grad_norm": 0.4153619110584259, "learning_rate": 0.0003999999371820167, "loss": 0.648, "step": 50 }, { "epoch": 0.0013354700854700855, "grad_norm": 0.3052282929420471, "learning_rate": 0.0003999998947961839, "loss": 0.7065, "step": 60 }, { "epoch": 0.001558048433048433, "grad_norm": 0.2599051296710968, "learning_rate": 0.00039999984154219303, "loss": 0.5866, "step": 70 }, { "epoch": 0.0017806267806267807, "grad_norm": 0.3215443193912506, "learning_rate": 0.0003999997774200471, "loss": 0.6875, "step": 80 }, { "epoch": 0.002003205128205128, "grad_norm": 0.2554830014705658, "learning_rate": 0.0003999997024297496, "loss": 0.6784, "step": 90 }, { "epoch": 0.002225783475783476, "grad_norm": 0.46018001437187195, "learning_rate": 0.0003999996165713045, "loss": 0.8871, "step": 100 }, { "epoch": 0.002448361823361823, "grad_norm": 0.42150408029556274, "learning_rate": 0.0003999995198447165, "loss": 0.575, "step": 110 }, { "epoch": 0.002670940170940171, "grad_norm": 0.38404232263565063, "learning_rate": 0.0003999994122499908, "loss": 0.5757, "step": 120 }, { "epoch": 0.0028935185185185184, "grad_norm": 0.3114868402481079, "learning_rate": 0.00039999929378713346, "loss": 0.572, "step": 130 }, { "epoch": 0.003116096866096866, "grad_norm": 0.25811660289764404, "learning_rate": 0.00039999916445615063, "loss": 0.8068, "step": 140 }, { "epoch": 0.0033386752136752135, "grad_norm": 0.3242238461971283, "learning_rate": 0.00039999902425704955, "loss": 0.7357, "step": 150 }, { "epoch": 0.0035612535612535613, "grad_norm": 0.24886858463287354, "learning_rate": 0.00039999887318983775, "loss": 0.5593, "step": 160 }, { "epoch": 0.0037838319088319087, "grad_norm": 0.45792001485824585, "learning_rate": 0.0003999987112545234, "loss": 0.6795, "step": 170 }, { "epoch": 0.004006410256410256, "grad_norm": 0.4137823283672333, "learning_rate": 0.00039999853845111544, "loss": 0.6331, "step": 180 }, { "epoch": 0.004228988603988604, "grad_norm": 0.3585183322429657, "learning_rate": 0.0003999983547796231, "loss": 0.5823, "step": 190 }, { "epoch": 0.004451566951566952, "grad_norm": 0.4130493998527527, "learning_rate": 0.00039999816024005645, "loss": 0.6371, "step": 200 }, { "epoch": 0.004674145299145299, "grad_norm": 0.5912800431251526, "learning_rate": 0.0003999979548324261, "loss": 0.797, "step": 210 }, { "epoch": 0.004896723646723646, "grad_norm": 0.32389238476753235, "learning_rate": 0.0003999977385567431, "loss": 0.6167, "step": 220 }, { "epoch": 0.005119301994301995, "grad_norm": 0.3468959331512451, "learning_rate": 0.0003999975114130193, "loss": 0.6044, "step": 230 }, { "epoch": 0.005341880341880342, "grad_norm": 0.4953303933143616, "learning_rate": 0.000399997273401267, "loss": 0.6288, "step": 240 }, { "epoch": 0.005564458689458689, "grad_norm": 0.3117259442806244, "learning_rate": 0.0003999970245214991, "loss": 0.5408, "step": 250 }, { "epoch": 0.005787037037037037, "grad_norm": 0.39363420009613037, "learning_rate": 0.0003999967647737292, "loss": 0.6291, "step": 260 }, { "epoch": 0.006009615384615385, "grad_norm": 0.44677507877349854, "learning_rate": 0.0003999964941579714, "loss": 0.6248, "step": 270 }, { "epoch": 0.006232193732193732, "grad_norm": 0.33922237157821655, "learning_rate": 0.00039999621267424037, "loss": 0.654, "step": 280 }, { "epoch": 0.00645477207977208, "grad_norm": 0.33551132678985596, "learning_rate": 0.00039999592032255134, "loss": 0.5206, "step": 290 }, { "epoch": 0.006677350427350427, "grad_norm": 0.3641911745071411, "learning_rate": 0.00039999561710292035, "loss": 0.656, "step": 300 }, { "epoch": 0.006899928774928775, "grad_norm": 0.5253650546073914, "learning_rate": 0.00039999530301536377, "loss": 0.7078, "step": 310 }, { "epoch": 0.007122507122507123, "grad_norm": 0.41651082038879395, "learning_rate": 0.00039999497805989873, "loss": 0.8011, "step": 320 }, { "epoch": 0.00734508547008547, "grad_norm": 0.20987141132354736, "learning_rate": 0.00039999464223654284, "loss": 0.724, "step": 330 }, { "epoch": 0.007567663817663817, "grad_norm": 0.5447407364845276, "learning_rate": 0.0003999942955453144, "loss": 0.6052, "step": 340 }, { "epoch": 0.007790242165242166, "grad_norm": 0.37690338492393494, "learning_rate": 0.00039999393798623216, "loss": 0.7012, "step": 350 }, { "epoch": 0.008012820512820512, "grad_norm": 0.23951241374015808, "learning_rate": 0.00039999356955931564, "loss": 0.7919, "step": 360 }, { "epoch": 0.008235398860398861, "grad_norm": 0.45175644755363464, "learning_rate": 0.0003999931902645848, "loss": 0.6225, "step": 370 }, { "epoch": 0.008457977207977209, "grad_norm": 0.4582245647907257, "learning_rate": 0.0003999928001020603, "loss": 0.7225, "step": 380 }, { "epoch": 0.008680555555555556, "grad_norm": 0.1780846118927002, "learning_rate": 0.0003999923990717633, "loss": 0.5654, "step": 390 }, { "epoch": 0.008903133903133903, "grad_norm": 0.5246115326881409, "learning_rate": 0.00039999198717371564, "loss": 0.6064, "step": 400 }, { "epoch": 0.00912571225071225, "grad_norm": 0.5305804014205933, "learning_rate": 0.0003999915644079397, "loss": 0.5286, "step": 410 }, { "epoch": 0.009348290598290598, "grad_norm": 0.45081108808517456, "learning_rate": 0.00039999113077445847, "loss": 0.7647, "step": 420 }, { "epoch": 0.009570868945868945, "grad_norm": 0.2678021788597107, "learning_rate": 0.0003999906862732954, "loss": 0.7105, "step": 430 }, { "epoch": 0.009793447293447293, "grad_norm": 0.4171365797519684, "learning_rate": 0.0003999902309044747, "loss": 0.5313, "step": 440 }, { "epoch": 0.010016025641025642, "grad_norm": 0.3753603994846344, "learning_rate": 0.0003999897646680212, "loss": 0.6744, "step": 450 }, { "epoch": 0.01023860398860399, "grad_norm": 0.533179759979248, "learning_rate": 0.0003999892875639601, "loss": 0.528, "step": 460 }, { "epoch": 0.010461182336182337, "grad_norm": 0.3925873041152954, "learning_rate": 0.0003999887995923174, "loss": 0.6925, "step": 470 }, { "epoch": 0.010683760683760684, "grad_norm": 0.3100382089614868, "learning_rate": 0.0003999883007531196, "loss": 0.58, "step": 480 }, { "epoch": 0.010906339031339031, "grad_norm": 0.3338608145713806, "learning_rate": 0.0003999877910463939, "loss": 0.533, "step": 490 }, { "epoch": 0.011128917378917379, "grad_norm": 0.2950505018234253, "learning_rate": 0.00039998727047216786, "loss": 0.6492, "step": 500 }, { "epoch": 0.011351495726495726, "grad_norm": 0.4604972004890442, "learning_rate": 0.0003999867390304698, "loss": 0.5994, "step": 510 }, { "epoch": 0.011574074074074073, "grad_norm": 0.46998804807662964, "learning_rate": 0.00039998619672132864, "loss": 0.6596, "step": 520 }, { "epoch": 0.01179665242165242, "grad_norm": 0.3429502248764038, "learning_rate": 0.0003999856435447739, "loss": 0.6498, "step": 530 }, { "epoch": 0.01201923076923077, "grad_norm": 0.45733514428138733, "learning_rate": 0.00039998507950083555, "loss": 0.7259, "step": 540 }, { "epoch": 0.012241809116809117, "grad_norm": 0.4332731068134308, "learning_rate": 0.0003999845045895442, "loss": 0.7316, "step": 550 }, { "epoch": 0.012464387464387465, "grad_norm": 0.32235032320022583, "learning_rate": 0.0003999839188109312, "loss": 0.6579, "step": 560 }, { "epoch": 0.012686965811965812, "grad_norm": 0.4269620180130005, "learning_rate": 0.00039998332216502835, "loss": 0.5983, "step": 570 }, { "epoch": 0.01290954415954416, "grad_norm": 0.3818014860153198, "learning_rate": 0.000399982714651868, "loss": 0.7129, "step": 580 }, { "epoch": 0.013132122507122507, "grad_norm": 0.31318408250808716, "learning_rate": 0.0003999820962714833, "loss": 0.6421, "step": 590 }, { "epoch": 0.013354700854700854, "grad_norm": 0.5945875644683838, "learning_rate": 0.00039998146702390776, "loss": 0.7313, "step": 600 }, { "epoch": 0.013577279202279201, "grad_norm": 0.3725283443927765, "learning_rate": 0.00039998082690917555, "loss": 0.6931, "step": 610 }, { "epoch": 0.01379985754985755, "grad_norm": 0.415237694978714, "learning_rate": 0.0003999801759273215, "loss": 0.5633, "step": 620 }, { "epoch": 0.014022435897435898, "grad_norm": 0.4613287150859833, "learning_rate": 0.00039997951407838106, "loss": 0.6357, "step": 630 }, { "epoch": 0.014245014245014245, "grad_norm": 0.41351374983787537, "learning_rate": 0.00039997884136238997, "loss": 0.6294, "step": 640 }, { "epoch": 0.014467592592592593, "grad_norm": 0.7268733382225037, "learning_rate": 0.00039997815777938504, "loss": 0.7595, "step": 650 }, { "epoch": 0.01469017094017094, "grad_norm": 0.355999618768692, "learning_rate": 0.0003999774633294033, "loss": 0.5714, "step": 660 }, { "epoch": 0.014912749287749287, "grad_norm": 0.8212071657180786, "learning_rate": 0.00039997675801248245, "loss": 0.6492, "step": 670 }, { "epoch": 0.015135327635327635, "grad_norm": 0.47924473881721497, "learning_rate": 0.00039997604182866084, "loss": 0.5294, "step": 680 }, { "epoch": 0.015357905982905982, "grad_norm": 0.4293022155761719, "learning_rate": 0.00039997531477797745, "loss": 0.5905, "step": 690 }, { "epoch": 0.015580484330484331, "grad_norm": 0.526131808757782, "learning_rate": 0.00039997457686047176, "loss": 0.6094, "step": 700 }, { "epoch": 0.01580306267806268, "grad_norm": 0.5501940250396729, "learning_rate": 0.0003999738280761839, "loss": 0.6805, "step": 710 }, { "epoch": 0.016025641025641024, "grad_norm": 0.7406270503997803, "learning_rate": 0.0003999730684251544, "loss": 0.6279, "step": 720 }, { "epoch": 0.016248219373219373, "grad_norm": 0.4423774778842926, "learning_rate": 0.0003999722979074247, "loss": 0.6017, "step": 730 }, { "epoch": 0.016470797720797722, "grad_norm": 0.5101915001869202, "learning_rate": 0.0003999715165230367, "loss": 0.6441, "step": 740 }, { "epoch": 0.016693376068376068, "grad_norm": 0.24923740327358246, "learning_rate": 0.0003999707242720327, "loss": 0.6781, "step": 750 }, { "epoch": 0.016915954415954417, "grad_norm": 0.6661295294761658, "learning_rate": 0.00039996992115445585, "loss": 0.5248, "step": 760 }, { "epoch": 0.017138532763532763, "grad_norm": 0.3601098954677582, "learning_rate": 0.00039996910717034976, "loss": 0.6236, "step": 770 }, { "epoch": 0.017361111111111112, "grad_norm": 0.43522101640701294, "learning_rate": 0.0003999682823197588, "loss": 0.6272, "step": 780 }, { "epoch": 0.017583689458689458, "grad_norm": 0.6393892168998718, "learning_rate": 0.0003999674466027276, "loss": 0.6327, "step": 790 }, { "epoch": 0.017806267806267807, "grad_norm": 0.5061392188072205, "learning_rate": 0.0003999666000193016, "loss": 0.611, "step": 800 }, { "epoch": 0.018028846153846152, "grad_norm": 0.3880053460597992, "learning_rate": 0.00039996574256952693, "loss": 0.6651, "step": 810 }, { "epoch": 0.0182514245014245, "grad_norm": 0.45905396342277527, "learning_rate": 0.00039996487425345006, "loss": 0.6382, "step": 820 }, { "epoch": 0.01847400284900285, "grad_norm": 0.5195857882499695, "learning_rate": 0.0003999639950711183, "loss": 0.682, "step": 830 }, { "epoch": 0.018696581196581196, "grad_norm": 0.4754457473754883, "learning_rate": 0.00039996310502257926, "loss": 0.6563, "step": 840 }, { "epoch": 0.018919159544159545, "grad_norm": 0.4215100407600403, "learning_rate": 0.0003999622041078815, "loss": 0.5943, "step": 850 }, { "epoch": 0.01914173789173789, "grad_norm": 0.6068809032440186, "learning_rate": 0.00039996129232707374, "loss": 0.7153, "step": 860 }, { "epoch": 0.01936431623931624, "grad_norm": 0.41532132029533386, "learning_rate": 0.00039996036968020576, "loss": 0.593, "step": 870 }, { "epoch": 0.019586894586894586, "grad_norm": 0.5850262641906738, "learning_rate": 0.00039995943616732754, "loss": 0.5694, "step": 880 }, { "epoch": 0.019809472934472935, "grad_norm": 0.4964942932128906, "learning_rate": 0.0003999584917884899, "loss": 0.6062, "step": 890 }, { "epoch": 0.020032051282051284, "grad_norm": 0.6453270316123962, "learning_rate": 0.0003999575365437441, "loss": 0.627, "step": 900 }, { "epoch": 0.02025462962962963, "grad_norm": 0.4876425266265869, "learning_rate": 0.00039995657043314205, "loss": 0.6849, "step": 910 }, { "epoch": 0.02047720797720798, "grad_norm": 0.5182598829269409, "learning_rate": 0.0003999555934567363, "loss": 0.7098, "step": 920 }, { "epoch": 0.020699786324786324, "grad_norm": 0.3465155065059662, "learning_rate": 0.0003999546056145799, "loss": 0.6095, "step": 930 }, { "epoch": 0.020922364672364673, "grad_norm": 0.6155537366867065, "learning_rate": 0.00039995360690672657, "loss": 0.5103, "step": 940 }, { "epoch": 0.02114494301994302, "grad_norm": 0.6999111771583557, "learning_rate": 0.0003999525973332306, "loss": 0.7678, "step": 950 }, { "epoch": 0.021367521367521368, "grad_norm": 0.2875516414642334, "learning_rate": 0.00039995157689414666, "loss": 0.6324, "step": 960 }, { "epoch": 0.021590099715099714, "grad_norm": 0.6682087182998657, "learning_rate": 0.00039995054558953047, "loss": 0.6292, "step": 970 }, { "epoch": 0.021812678062678063, "grad_norm": 0.63261479139328, "learning_rate": 0.00039994950341943784, "loss": 0.6469, "step": 980 }, { "epoch": 0.022035256410256412, "grad_norm": 0.5575029253959656, "learning_rate": 0.0003999484503839256, "loss": 0.5626, "step": 990 }, { "epoch": 0.022257834757834757, "grad_norm": 0.5150830149650574, "learning_rate": 0.00039994738648305086, "loss": 0.5915, "step": 1000 }, { "epoch": 0.022480413105413107, "grad_norm": 0.5310041904449463, "learning_rate": 0.0003999463117168714, "loss": 0.5757, "step": 1010 }, { "epoch": 0.022702991452991452, "grad_norm": 0.31499940156936646, "learning_rate": 0.0003999452260854457, "loss": 0.7014, "step": 1020 }, { "epoch": 0.0229255698005698, "grad_norm": 0.5305114984512329, "learning_rate": 0.0003999441295888328, "loss": 0.6615, "step": 1030 }, { "epoch": 0.023148148148148147, "grad_norm": 0.47061678767204285, "learning_rate": 0.0003999430222270921, "loss": 0.6299, "step": 1040 }, { "epoch": 0.023370726495726496, "grad_norm": 0.49521180987358093, "learning_rate": 0.000399941904000284, "loss": 0.6524, "step": 1050 }, { "epoch": 0.02359330484330484, "grad_norm": 0.5753868222236633, "learning_rate": 0.0003999407749084691, "loss": 0.666, "step": 1060 }, { "epoch": 0.02381588319088319, "grad_norm": 0.3605792224407196, "learning_rate": 0.0003999396349517088, "loss": 0.4916, "step": 1070 }, { "epoch": 0.02403846153846154, "grad_norm": 0.47851887345314026, "learning_rate": 0.0003999384841300651, "loss": 0.6972, "step": 1080 }, { "epoch": 0.024261039886039885, "grad_norm": 0.4846053719520569, "learning_rate": 0.00039993732244360047, "loss": 0.5415, "step": 1090 }, { "epoch": 0.024483618233618235, "grad_norm": 0.5065937638282776, "learning_rate": 0.0003999361498923781, "loss": 0.6008, "step": 1100 }, { "epoch": 0.02470619658119658, "grad_norm": 0.6912859678268433, "learning_rate": 0.00039993496647646164, "loss": 0.657, "step": 1110 }, { "epoch": 0.02492877492877493, "grad_norm": 0.48633942008018494, "learning_rate": 0.0003999337721959154, "loss": 0.7885, "step": 1120 }, { "epoch": 0.025151353276353275, "grad_norm": 0.5203970670700073, "learning_rate": 0.00039993256705080435, "loss": 0.524, "step": 1130 }, { "epoch": 0.025373931623931624, "grad_norm": 0.3965185284614563, "learning_rate": 0.0003999313510411939, "loss": 0.5524, "step": 1140 }, { "epoch": 0.025596509971509973, "grad_norm": 0.5027601718902588, "learning_rate": 0.00039993012416715014, "loss": 0.6776, "step": 1150 }, { "epoch": 0.02581908831908832, "grad_norm": 0.5505548119544983, "learning_rate": 0.00039992888642873984, "loss": 0.5468, "step": 1160 }, { "epoch": 0.026041666666666668, "grad_norm": 0.7258923649787903, "learning_rate": 0.0003999276378260302, "loss": 0.6938, "step": 1170 }, { "epoch": 0.026264245014245013, "grad_norm": 0.40920066833496094, "learning_rate": 0.00039992637835908895, "loss": 0.6654, "step": 1180 }, { "epoch": 0.026486823361823363, "grad_norm": 0.4127655327320099, "learning_rate": 0.00039992510802798465, "loss": 0.5783, "step": 1190 }, { "epoch": 0.026709401709401708, "grad_norm": 0.49130943417549133, "learning_rate": 0.0003999238268327863, "loss": 0.6508, "step": 1200 }, { "epoch": 0.026931980056980057, "grad_norm": 0.782516360282898, "learning_rate": 0.00039992253477356363, "loss": 0.6979, "step": 1210 }, { "epoch": 0.027154558404558403, "grad_norm": 0.4363095760345459, "learning_rate": 0.0003999212318503867, "loss": 0.5841, "step": 1220 }, { "epoch": 0.027377136752136752, "grad_norm": 0.4464375972747803, "learning_rate": 0.00039991991806332636, "loss": 0.6695, "step": 1230 }, { "epoch": 0.0275997150997151, "grad_norm": 0.6378709673881531, "learning_rate": 0.000399918593412454, "loss": 0.7352, "step": 1240 }, { "epoch": 0.027822293447293447, "grad_norm": 0.7491675615310669, "learning_rate": 0.00039991725789784166, "loss": 0.7471, "step": 1250 }, { "epoch": 0.028044871794871796, "grad_norm": 0.54839026927948, "learning_rate": 0.00039991591151956185, "loss": 0.6001, "step": 1260 }, { "epoch": 0.02826745014245014, "grad_norm": 0.5543126463890076, "learning_rate": 0.00039991455427768773, "loss": 0.5391, "step": 1270 }, { "epoch": 0.02849002849002849, "grad_norm": 0.41751816868782043, "learning_rate": 0.0003999131861722931, "loss": 0.7174, "step": 1280 }, { "epoch": 0.028712606837606836, "grad_norm": 0.5156871676445007, "learning_rate": 0.0003999118072034522, "loss": 0.6834, "step": 1290 }, { "epoch": 0.028935185185185185, "grad_norm": 0.7073959112167358, "learning_rate": 0.0003999104173712401, "loss": 0.5147, "step": 1300 }, { "epoch": 0.029157763532763534, "grad_norm": 0.8129472136497498, "learning_rate": 0.0003999090166757323, "loss": 0.6713, "step": 1310 }, { "epoch": 0.02938034188034188, "grad_norm": 0.7459490299224854, "learning_rate": 0.00039990760511700486, "loss": 0.7992, "step": 1320 }, { "epoch": 0.02960292022792023, "grad_norm": 0.5266855955123901, "learning_rate": 0.00039990618269513447, "loss": 0.7064, "step": 1330 }, { "epoch": 0.029825498575498575, "grad_norm": 0.8832513689994812, "learning_rate": 0.0003999047494101985, "loss": 0.526, "step": 1340 }, { "epoch": 0.030048076923076924, "grad_norm": 0.5843579769134521, "learning_rate": 0.0003999033052622748, "loss": 0.6702, "step": 1350 }, { "epoch": 0.03027065527065527, "grad_norm": 0.5423453450202942, "learning_rate": 0.0003999018502514418, "loss": 0.6079, "step": 1360 }, { "epoch": 0.03049323361823362, "grad_norm": 0.43239280581474304, "learning_rate": 0.0003999003843777786, "loss": 0.7386, "step": 1370 }, { "epoch": 0.030715811965811964, "grad_norm": 0.43782392144203186, "learning_rate": 0.00039989890764136494, "loss": 0.6689, "step": 1380 }, { "epoch": 0.030938390313390313, "grad_norm": 0.44703444838523865, "learning_rate": 0.000399897420042281, "loss": 0.7488, "step": 1390 }, { "epoch": 0.031160968660968662, "grad_norm": 0.6166260242462158, "learning_rate": 0.0003998959215806075, "loss": 0.607, "step": 1400 }, { "epoch": 0.03138354700854701, "grad_norm": 0.6796806454658508, "learning_rate": 0.0003998944122564261, "loss": 0.5879, "step": 1410 }, { "epoch": 0.03160612535612536, "grad_norm": 0.6388904452323914, "learning_rate": 0.00039989289206981857, "loss": 0.7348, "step": 1420 }, { "epoch": 0.031828703703703706, "grad_norm": 0.501086413860321, "learning_rate": 0.00039989136102086775, "loss": 0.7234, "step": 1430 }, { "epoch": 0.03205128205128205, "grad_norm": 0.7662519216537476, "learning_rate": 0.0003998898191096567, "loss": 0.7404, "step": 1440 }, { "epoch": 0.0322738603988604, "grad_norm": 0.6648521423339844, "learning_rate": 0.0003998882663362692, "loss": 0.5212, "step": 1450 }, { "epoch": 0.03249643874643875, "grad_norm": 0.3986724019050598, "learning_rate": 0.0003998867027007897, "loss": 0.5945, "step": 1460 }, { "epoch": 0.032719017094017096, "grad_norm": 0.5667985677719116, "learning_rate": 0.00039988512820330317, "loss": 0.4911, "step": 1470 }, { "epoch": 0.032941595441595445, "grad_norm": 0.3411963880062103, "learning_rate": 0.0003998835428438952, "loss": 0.5614, "step": 1480 }, { "epoch": 0.03316417378917379, "grad_norm": 0.8257679343223572, "learning_rate": 0.00039988194662265173, "loss": 0.5847, "step": 1490 }, { "epoch": 0.033386752136752136, "grad_norm": 0.6138916015625, "learning_rate": 0.00039988033953965976, "loss": 0.791, "step": 1500 }, { "epoch": 0.033609330484330485, "grad_norm": 0.7213571071624756, "learning_rate": 0.00039987872159500645, "loss": 0.6312, "step": 1510 }, { "epoch": 0.033831908831908834, "grad_norm": 0.6440274715423584, "learning_rate": 0.00039987709278877984, "loss": 0.5499, "step": 1520 }, { "epoch": 0.034054487179487176, "grad_norm": 0.4683380722999573, "learning_rate": 0.0003998754531210683, "loss": 0.7107, "step": 1530 }, { "epoch": 0.034277065527065526, "grad_norm": 0.44701075553894043, "learning_rate": 0.0003998738025919611, "loss": 0.6155, "step": 1540 }, { "epoch": 0.034499643874643875, "grad_norm": 0.5694136619567871, "learning_rate": 0.00039987214120154775, "loss": 0.6355, "step": 1550 }, { "epoch": 0.034722222222222224, "grad_norm": 0.4150397777557373, "learning_rate": 0.0003998704689499187, "loss": 0.5468, "step": 1560 }, { "epoch": 0.03494480056980057, "grad_norm": 0.45780056715011597, "learning_rate": 0.0003998687858371647, "loss": 0.5747, "step": 1570 }, { "epoch": 0.035167378917378915, "grad_norm": 0.5170963406562805, "learning_rate": 0.00039986709186337724, "loss": 0.4998, "step": 1580 }, { "epoch": 0.035389957264957264, "grad_norm": 0.7482688426971436, "learning_rate": 0.0003998653870286485, "loss": 0.8305, "step": 1590 }, { "epoch": 0.03561253561253561, "grad_norm": 0.585242509841919, "learning_rate": 0.00039986367133307087, "loss": 0.6235, "step": 1600 }, { "epoch": 0.03583511396011396, "grad_norm": 0.5389675498008728, "learning_rate": 0.0003998619447767378, "loss": 0.5928, "step": 1610 }, { "epoch": 0.036057692307692304, "grad_norm": 0.6164636611938477, "learning_rate": 0.0003998602073597431, "loss": 0.7214, "step": 1620 }, { "epoch": 0.036280270655270654, "grad_norm": 0.466166228055954, "learning_rate": 0.000399858459082181, "loss": 0.5967, "step": 1630 }, { "epoch": 0.036502849002849, "grad_norm": 0.6401296854019165, "learning_rate": 0.00039985669994414667, "loss": 0.691, "step": 1640 }, { "epoch": 0.03672542735042735, "grad_norm": 0.5248224139213562, "learning_rate": 0.00039985492994573565, "loss": 0.6946, "step": 1650 }, { "epoch": 0.0369480056980057, "grad_norm": 0.4127870500087738, "learning_rate": 0.0003998531490870441, "loss": 0.4733, "step": 1660 }, { "epoch": 0.03717058404558404, "grad_norm": 0.47520193457603455, "learning_rate": 0.0003998513573681689, "loss": 0.6858, "step": 1670 }, { "epoch": 0.03739316239316239, "grad_norm": 0.4704365134239197, "learning_rate": 0.0003998495547892072, "loss": 0.5678, "step": 1680 }, { "epoch": 0.03761574074074074, "grad_norm": 0.36997130513191223, "learning_rate": 0.0003998477413502572, "loss": 0.6287, "step": 1690 }, { "epoch": 0.03783831908831909, "grad_norm": 0.7199309468269348, "learning_rate": 0.0003998459170514173, "loss": 0.6482, "step": 1700 }, { "epoch": 0.03806089743589743, "grad_norm": 0.41155070066452026, "learning_rate": 0.0003998440818927867, "loss": 0.6537, "step": 1710 }, { "epoch": 0.03828347578347578, "grad_norm": 0.647976815700531, "learning_rate": 0.00039984223587446506, "loss": 0.622, "step": 1720 }, { "epoch": 0.03850605413105413, "grad_norm": 0.5233225226402283, "learning_rate": 0.0003998403789965528, "loss": 0.5125, "step": 1730 }, { "epoch": 0.03872863247863248, "grad_norm": 0.7623016834259033, "learning_rate": 0.0003998385112591506, "loss": 0.6881, "step": 1740 }, { "epoch": 0.03895121082621083, "grad_norm": 0.45694059133529663, "learning_rate": 0.0003998366326623602, "loss": 0.6947, "step": 1750 }, { "epoch": 0.03917378917378917, "grad_norm": 0.5509689450263977, "learning_rate": 0.0003998347432062835, "loss": 0.6702, "step": 1760 }, { "epoch": 0.03939636752136752, "grad_norm": 0.5311200022697449, "learning_rate": 0.00039983284289102334, "loss": 0.6444, "step": 1770 }, { "epoch": 0.03961894586894587, "grad_norm": 0.48745280504226685, "learning_rate": 0.0003998309317166829, "loss": 0.5315, "step": 1780 }, { "epoch": 0.03984152421652422, "grad_norm": 0.558219313621521, "learning_rate": 0.000399829009683366, "loss": 0.7597, "step": 1790 }, { "epoch": 0.04006410256410257, "grad_norm": 0.617832362651825, "learning_rate": 0.0003998270767911772, "loss": 0.6786, "step": 1800 }, { "epoch": 0.04028668091168091, "grad_norm": 0.48355668783187866, "learning_rate": 0.00039982513304022145, "loss": 0.6901, "step": 1810 }, { "epoch": 0.04050925925925926, "grad_norm": 0.4814782440662384, "learning_rate": 0.00039982317843060433, "loss": 0.6997, "step": 1820 }, { "epoch": 0.04073183760683761, "grad_norm": 0.5412116646766663, "learning_rate": 0.00039982121296243216, "loss": 0.6539, "step": 1830 }, { "epoch": 0.04095441595441596, "grad_norm": 0.9898232221603394, "learning_rate": 0.0003998192366358117, "loss": 0.6633, "step": 1840 }, { "epoch": 0.0411769943019943, "grad_norm": 0.5413815975189209, "learning_rate": 0.0003998172494508503, "loss": 0.6422, "step": 1850 }, { "epoch": 0.04139957264957265, "grad_norm": 0.6026504635810852, "learning_rate": 0.000399815251407656, "loss": 0.6522, "step": 1860 }, { "epoch": 0.041622150997151, "grad_norm": 0.490631103515625, "learning_rate": 0.00039981324250633733, "loss": 0.5702, "step": 1870 }, { "epoch": 0.041844729344729346, "grad_norm": 0.7394275665283203, "learning_rate": 0.0003998112227470036, "loss": 0.6785, "step": 1880 }, { "epoch": 0.042067307692307696, "grad_norm": 0.689113438129425, "learning_rate": 0.00039980919212976435, "loss": 0.6489, "step": 1890 }, { "epoch": 0.04228988603988604, "grad_norm": 0.5154189467430115, "learning_rate": 0.00039980715065473, "loss": 0.6737, "step": 1900 }, { "epoch": 0.04251246438746439, "grad_norm": 0.5986299514770508, "learning_rate": 0.00039980509832201165, "loss": 0.7127, "step": 1910 }, { "epoch": 0.042735042735042736, "grad_norm": 0.5043966770172119, "learning_rate": 0.00039980303513172057, "loss": 0.6557, "step": 1920 }, { "epoch": 0.042957621082621085, "grad_norm": 0.5125043988227844, "learning_rate": 0.000399800961083969, "loss": 0.6017, "step": 1930 }, { "epoch": 0.04318019943019943, "grad_norm": 0.4287809729576111, "learning_rate": 0.0003997988761788697, "loss": 0.6154, "step": 1940 }, { "epoch": 0.043402777777777776, "grad_norm": 0.4034154415130615, "learning_rate": 0.00039979678041653587, "loss": 0.5579, "step": 1950 }, { "epoch": 0.043625356125356125, "grad_norm": 0.6922469735145569, "learning_rate": 0.0003997946737970814, "loss": 0.7429, "step": 1960 }, { "epoch": 0.043847934472934474, "grad_norm": 0.670454204082489, "learning_rate": 0.00039979255632062086, "loss": 0.804, "step": 1970 }, { "epoch": 0.044070512820512824, "grad_norm": 0.4805451035499573, "learning_rate": 0.00039979042798726923, "loss": 0.5674, "step": 1980 }, { "epoch": 0.044293091168091166, "grad_norm": 0.3706532418727875, "learning_rate": 0.00039978828879714217, "loss": 0.7832, "step": 1990 }, { "epoch": 0.044515669515669515, "grad_norm": 0.5441576242446899, "learning_rate": 0.00039978613875035594, "loss": 0.6144, "step": 2000 }, { "epoch": 0.044738247863247864, "grad_norm": 0.5556431412696838, "learning_rate": 0.00039978397784702743, "loss": 0.623, "step": 2010 }, { "epoch": 0.04496082621082621, "grad_norm": 0.4374752342700958, "learning_rate": 0.00039978180608727396, "loss": 0.601, "step": 2020 }, { "epoch": 0.045183404558404555, "grad_norm": 0.5470070242881775, "learning_rate": 0.00039977962347121363, "loss": 0.6754, "step": 2030 }, { "epoch": 0.045405982905982904, "grad_norm": 0.5512667298316956, "learning_rate": 0.000399777429998965, "loss": 0.6927, "step": 2040 }, { "epoch": 0.04562856125356125, "grad_norm": 0.491028755903244, "learning_rate": 0.00039977522567064726, "loss": 0.5932, "step": 2050 }, { "epoch": 0.0458511396011396, "grad_norm": 0.8657106161117554, "learning_rate": 0.00039977301048638023, "loss": 0.7108, "step": 2060 }, { "epoch": 0.04607371794871795, "grad_norm": 0.5009307265281677, "learning_rate": 0.00039977078444628427, "loss": 0.4469, "step": 2070 }, { "epoch": 0.046296296296296294, "grad_norm": 0.5697231292724609, "learning_rate": 0.0003997685475504803, "loss": 0.731, "step": 2080 }, { "epoch": 0.04651887464387464, "grad_norm": 0.47939106822013855, "learning_rate": 0.00039976629979909, "loss": 0.66, "step": 2090 }, { "epoch": 0.04674145299145299, "grad_norm": 0.6738607287406921, "learning_rate": 0.0003997640411922354, "loss": 0.6655, "step": 2100 }, { "epoch": 0.04696403133903134, "grad_norm": 0.43843522667884827, "learning_rate": 0.00039976177173003924, "loss": 0.6354, "step": 2110 }, { "epoch": 0.04718660968660968, "grad_norm": 0.7421457171440125, "learning_rate": 0.0003997594914126249, "loss": 0.7271, "step": 2120 }, { "epoch": 0.04740918803418803, "grad_norm": 0.6225273609161377, "learning_rate": 0.0003997572002401163, "loss": 0.6037, "step": 2130 }, { "epoch": 0.04763176638176638, "grad_norm": 0.6136428117752075, "learning_rate": 0.00039975489821263783, "loss": 0.5636, "step": 2140 }, { "epoch": 0.04785434472934473, "grad_norm": 0.5567286610603333, "learning_rate": 0.0003997525853303147, "loss": 0.6963, "step": 2150 }, { "epoch": 0.04807692307692308, "grad_norm": 0.9734333753585815, "learning_rate": 0.00039975026159327253, "loss": 0.7507, "step": 2160 }, { "epoch": 0.04829950142450142, "grad_norm": 0.3505744934082031, "learning_rate": 0.00039974792700163766, "loss": 0.5056, "step": 2170 }, { "epoch": 0.04852207977207977, "grad_norm": 0.6284974813461304, "learning_rate": 0.0003997455815555369, "loss": 0.6345, "step": 2180 }, { "epoch": 0.04874465811965812, "grad_norm": 0.43681466579437256, "learning_rate": 0.00039974322525509776, "loss": 0.5093, "step": 2190 }, { "epoch": 0.04896723646723647, "grad_norm": 0.8628547787666321, "learning_rate": 0.0003997408581004482, "loss": 0.6074, "step": 2200 }, { "epoch": 0.04918981481481482, "grad_norm": 0.37960782647132874, "learning_rate": 0.0003997384800917169, "loss": 0.6353, "step": 2210 }, { "epoch": 0.04941239316239316, "grad_norm": 0.5450817942619324, "learning_rate": 0.0003997360912290331, "loss": 0.7205, "step": 2220 }, { "epoch": 0.04963497150997151, "grad_norm": 0.6776572465896606, "learning_rate": 0.00039973369151252654, "loss": 0.7616, "step": 2230 }, { "epoch": 0.04985754985754986, "grad_norm": 0.5885903239250183, "learning_rate": 0.0003997312809423277, "loss": 0.6358, "step": 2240 }, { "epoch": 0.05008012820512821, "grad_norm": 0.6875993013381958, "learning_rate": 0.00039972885951856756, "loss": 0.575, "step": 2250 }, { "epoch": 0.05030270655270655, "grad_norm": 0.7251737117767334, "learning_rate": 0.0003997264272413777, "loss": 0.7485, "step": 2260 }, { "epoch": 0.0505252849002849, "grad_norm": 0.6607152819633484, "learning_rate": 0.0003997239841108902, "loss": 0.72, "step": 2270 }, { "epoch": 0.05074786324786325, "grad_norm": 0.45974501967430115, "learning_rate": 0.000399721530127238, "loss": 0.7453, "step": 2280 }, { "epoch": 0.0509704415954416, "grad_norm": 0.46049365401268005, "learning_rate": 0.0003997190652905543, "loss": 0.7315, "step": 2290 }, { "epoch": 0.051193019943019946, "grad_norm": 0.5258949398994446, "learning_rate": 0.0003997165896009731, "loss": 0.7081, "step": 2300 }, { "epoch": 0.05141559829059829, "grad_norm": 0.4143979251384735, "learning_rate": 0.000399714103058629, "loss": 0.6437, "step": 2310 }, { "epoch": 0.05163817663817664, "grad_norm": 0.7126597166061401, "learning_rate": 0.00039971160566365695, "loss": 0.6343, "step": 2320 }, { "epoch": 0.05186075498575499, "grad_norm": 0.384854257106781, "learning_rate": 0.0003997090974161928, "loss": 0.5415, "step": 2330 }, { "epoch": 0.052083333333333336, "grad_norm": 0.45989155769348145, "learning_rate": 0.0003997065783163728, "loss": 0.6082, "step": 2340 }, { "epoch": 0.05230591168091168, "grad_norm": 0.5884913206100464, "learning_rate": 0.0003997040483643338, "loss": 0.6769, "step": 2350 }, { "epoch": 0.05252849002849003, "grad_norm": 0.5443273782730103, "learning_rate": 0.0003997015075602134, "loss": 0.5862, "step": 2360 }, { "epoch": 0.052751068376068376, "grad_norm": 0.6262222528457642, "learning_rate": 0.00039969895590414954, "loss": 0.5576, "step": 2370 }, { "epoch": 0.052973646723646725, "grad_norm": 0.37384849786758423, "learning_rate": 0.00039969639339628094, "loss": 0.6212, "step": 2380 }, { "epoch": 0.053196225071225074, "grad_norm": 1.0342721939086914, "learning_rate": 0.00039969382003674685, "loss": 0.7919, "step": 2390 }, { "epoch": 0.053418803418803416, "grad_norm": 0.49225690960884094, "learning_rate": 0.00039969123582568714, "loss": 0.6046, "step": 2400 }, { "epoch": 0.053641381766381765, "grad_norm": 0.6797170042991638, "learning_rate": 0.0003996886407632422, "loss": 0.7064, "step": 2410 }, { "epoch": 0.053863960113960115, "grad_norm": 0.3157079815864563, "learning_rate": 0.00039968603484955305, "loss": 0.5789, "step": 2420 }, { "epoch": 0.054086538461538464, "grad_norm": 0.6280565857887268, "learning_rate": 0.0003996834180847612, "loss": 0.6395, "step": 2430 }, { "epoch": 0.054309116809116806, "grad_norm": 0.714435338973999, "learning_rate": 0.00039968079046900906, "loss": 0.653, "step": 2440 }, { "epoch": 0.054531695156695155, "grad_norm": 0.8974350094795227, "learning_rate": 0.0003996781520024392, "loss": 0.7271, "step": 2450 }, { "epoch": 0.054754273504273504, "grad_norm": 0.7227585911750793, "learning_rate": 0.00039967550268519517, "loss": 0.7686, "step": 2460 }, { "epoch": 0.05497685185185185, "grad_norm": 0.8472549319267273, "learning_rate": 0.00039967284251742085, "loss": 0.7441, "step": 2470 }, { "epoch": 0.0551994301994302, "grad_norm": 0.8443915247917175, "learning_rate": 0.00039967017149926084, "loss": 0.5644, "step": 2480 }, { "epoch": 0.055422008547008544, "grad_norm": 0.5027754306793213, "learning_rate": 0.0003996674896308602, "loss": 0.6362, "step": 2490 }, { "epoch": 0.055644586894586893, "grad_norm": 0.4711776375770569, "learning_rate": 0.0003996647969123647, "loss": 0.6692, "step": 2500 }, { "epoch": 0.05586716524216524, "grad_norm": 0.5613887906074524, "learning_rate": 0.0003996620933439207, "loss": 0.6728, "step": 2510 }, { "epoch": 0.05608974358974359, "grad_norm": 0.4277225732803345, "learning_rate": 0.00039965937892567514, "loss": 0.6065, "step": 2520 }, { "epoch": 0.056312321937321934, "grad_norm": 0.5663828253746033, "learning_rate": 0.00039965665365777545, "loss": 0.5488, "step": 2530 }, { "epoch": 0.05653490028490028, "grad_norm": 0.49894288182258606, "learning_rate": 0.0003996539175403697, "loss": 0.5605, "step": 2540 }, { "epoch": 0.05675747863247863, "grad_norm": 0.6952654123306274, "learning_rate": 0.0003996511705736067, "loss": 0.5986, "step": 2550 }, { "epoch": 0.05698005698005698, "grad_norm": 0.49367037415504456, "learning_rate": 0.00039964841275763564, "loss": 0.6781, "step": 2560 }, { "epoch": 0.05720263532763533, "grad_norm": 0.554221510887146, "learning_rate": 0.0003996456440926064, "loss": 0.7196, "step": 2570 }, { "epoch": 0.05742521367521367, "grad_norm": 0.7069229483604431, "learning_rate": 0.00039964286457866937, "loss": 0.653, "step": 2580 }, { "epoch": 0.05764779202279202, "grad_norm": 0.4243583679199219, "learning_rate": 0.0003996400742159757, "loss": 0.6262, "step": 2590 }, { "epoch": 0.05787037037037037, "grad_norm": 0.5111228227615356, "learning_rate": 0.0003996372730046769, "loss": 0.6631, "step": 2600 }, { "epoch": 0.05809294871794872, "grad_norm": 0.5107941627502441, "learning_rate": 0.0003996344609449253, "loss": 0.8428, "step": 2610 }, { "epoch": 0.05831552706552707, "grad_norm": 0.7400026321411133, "learning_rate": 0.00039963163803687367, "loss": 0.6988, "step": 2620 }, { "epoch": 0.05853810541310541, "grad_norm": 0.67812180519104, "learning_rate": 0.0003996288042806754, "loss": 0.6731, "step": 2630 }, { "epoch": 0.05876068376068376, "grad_norm": 0.6009535789489746, "learning_rate": 0.00039962595967648446, "loss": 0.7021, "step": 2640 }, { "epoch": 0.05898326210826211, "grad_norm": 3.271467447280884, "learning_rate": 0.00039962310422445545, "loss": 0.755, "step": 2650 }, { "epoch": 0.05920584045584046, "grad_norm": 0.6115080118179321, "learning_rate": 0.00039962023792474355, "loss": 0.7505, "step": 2660 }, { "epoch": 0.0594284188034188, "grad_norm": 0.5685977339744568, "learning_rate": 0.00039961736077750456, "loss": 0.706, "step": 2670 }, { "epoch": 0.05965099715099715, "grad_norm": 0.6168237328529358, "learning_rate": 0.00039961447278289466, "loss": 0.6097, "step": 2680 }, { "epoch": 0.0598735754985755, "grad_norm": 0.4329089820384979, "learning_rate": 0.00039961157394107096, "loss": 0.6782, "step": 2690 }, { "epoch": 0.06000712250712251, "eval_loss": 0.6390817165374756, "eval_runtime": 337.4282, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 2696 }, { "epoch": 0.06009615384615385, "grad_norm": 0.5909023284912109, "learning_rate": 0.00039960866425219093, "loss": 0.6413, "step": 2700 }, { "epoch": 0.0603187321937322, "grad_norm": 0.4124837815761566, "learning_rate": 0.00039960574371641265, "loss": 0.6268, "step": 2710 }, { "epoch": 0.06054131054131054, "grad_norm": 0.8504196405410767, "learning_rate": 0.0003996028123338949, "loss": 0.6622, "step": 2720 }, { "epoch": 0.06076388888888889, "grad_norm": 0.4204083979129791, "learning_rate": 0.00039959987010479685, "loss": 0.554, "step": 2730 }, { "epoch": 0.06098646723646724, "grad_norm": 0.6747197508811951, "learning_rate": 0.0003995969170292785, "loss": 0.7341, "step": 2740 }, { "epoch": 0.061209045584045586, "grad_norm": 0.41700077056884766, "learning_rate": 0.00039959395310750027, "loss": 0.6148, "step": 2750 }, { "epoch": 0.06143162393162393, "grad_norm": 0.41387853026390076, "learning_rate": 0.00039959097833962325, "loss": 0.7127, "step": 2760 }, { "epoch": 0.06165420227920228, "grad_norm": 0.9684638381004333, "learning_rate": 0.000399587992725809, "loss": 0.6922, "step": 2770 }, { "epoch": 0.06187678062678063, "grad_norm": 0.5317614078521729, "learning_rate": 0.00039958499626622, "loss": 0.553, "step": 2780 }, { "epoch": 0.062099358974358976, "grad_norm": 0.5378207564353943, "learning_rate": 0.00039958198896101874, "loss": 0.5355, "step": 2790 }, { "epoch": 0.062321937321937325, "grad_norm": 0.5899708271026611, "learning_rate": 0.0003995789708103689, "loss": 0.5899, "step": 2800 }, { "epoch": 0.06254451566951567, "grad_norm": 0.5884451270103455, "learning_rate": 0.0003995759418144344, "loss": 0.7328, "step": 2810 }, { "epoch": 0.06276709401709402, "grad_norm": 0.6845195293426514, "learning_rate": 0.0003995729019733799, "loss": 0.6292, "step": 2820 }, { "epoch": 0.06298967236467236, "grad_norm": 0.623423159122467, "learning_rate": 0.0003995698512873704, "loss": 0.7828, "step": 2830 }, { "epoch": 0.06321225071225071, "grad_norm": 0.5040481090545654, "learning_rate": 0.0003995667897565719, "loss": 0.5945, "step": 2840 }, { "epoch": 0.06343482905982906, "grad_norm": 0.7745060324668884, "learning_rate": 0.0003995637173811506, "loss": 0.6323, "step": 2850 }, { "epoch": 0.06365740740740741, "grad_norm": 0.6161078810691833, "learning_rate": 0.0003995606341612736, "loss": 0.6442, "step": 2860 }, { "epoch": 0.06387998575498575, "grad_norm": 0.41616639494895935, "learning_rate": 0.0003995575400971083, "loss": 0.5293, "step": 2870 }, { "epoch": 0.0641025641025641, "grad_norm": 0.6070893406867981, "learning_rate": 0.00039955443518882296, "loss": 0.6674, "step": 2880 }, { "epoch": 0.06432514245014245, "grad_norm": 0.5885868668556213, "learning_rate": 0.00039955131943658623, "loss": 0.6609, "step": 2890 }, { "epoch": 0.0645477207977208, "grad_norm": 0.662845253944397, "learning_rate": 0.00039954819284056747, "loss": 0.5523, "step": 2900 }, { "epoch": 0.06477029914529915, "grad_norm": 0.6544046401977539, "learning_rate": 0.00039954505540093645, "loss": 0.6903, "step": 2910 }, { "epoch": 0.0649928774928775, "grad_norm": 0.4174458682537079, "learning_rate": 0.0003995419071178639, "loss": 0.6201, "step": 2920 }, { "epoch": 0.06521545584045584, "grad_norm": 0.6580033898353577, "learning_rate": 0.00039953874799152073, "loss": 0.6212, "step": 2930 }, { "epoch": 0.06543803418803419, "grad_norm": 0.6909990310668945, "learning_rate": 0.00039953557802207857, "loss": 0.7442, "step": 2940 }, { "epoch": 0.06566061253561253, "grad_norm": 0.7643675208091736, "learning_rate": 0.0003995323972097098, "loss": 0.7135, "step": 2950 }, { "epoch": 0.06588319088319089, "grad_norm": 0.5393293499946594, "learning_rate": 0.00039952920555458727, "loss": 0.6405, "step": 2960 }, { "epoch": 0.06610576923076923, "grad_norm": 0.9071075916290283, "learning_rate": 0.00039952600305688426, "loss": 0.6748, "step": 2970 }, { "epoch": 0.06632834757834757, "grad_norm": 0.3836714029312134, "learning_rate": 0.00039952278971677497, "loss": 0.6589, "step": 2980 }, { "epoch": 0.06655092592592593, "grad_norm": 0.5095319151878357, "learning_rate": 0.000399519565534434, "loss": 0.7009, "step": 2990 }, { "epoch": 0.06677350427350427, "grad_norm": 0.58671635389328, "learning_rate": 0.00039951633051003643, "loss": 0.6462, "step": 3000 }, { "epoch": 0.06699608262108261, "grad_norm": 0.5948638319969177, "learning_rate": 0.00039951308464375814, "loss": 0.6507, "step": 3010 }, { "epoch": 0.06721866096866097, "grad_norm": 0.8463034629821777, "learning_rate": 0.00039950982793577553, "loss": 0.6059, "step": 3020 }, { "epoch": 0.06744123931623931, "grad_norm": 0.4733259677886963, "learning_rate": 0.00039950656038626554, "loss": 0.68, "step": 3030 }, { "epoch": 0.06766381766381767, "grad_norm": 0.7535138726234436, "learning_rate": 0.0003995032819954057, "loss": 0.5747, "step": 3040 }, { "epoch": 0.06788639601139601, "grad_norm": 0.48301541805267334, "learning_rate": 0.00039949999276337427, "loss": 0.7331, "step": 3050 }, { "epoch": 0.06810897435897435, "grad_norm": 0.7251609563827515, "learning_rate": 0.0003994966926903498, "loss": 0.6975, "step": 3060 }, { "epoch": 0.06833155270655271, "grad_norm": 0.4815773665904999, "learning_rate": 0.00039949338177651183, "loss": 0.7074, "step": 3070 }, { "epoch": 0.06855413105413105, "grad_norm": 0.5007272362709045, "learning_rate": 0.0003994900600220401, "loss": 0.6067, "step": 3080 }, { "epoch": 0.06877670940170941, "grad_norm": 0.6377426385879517, "learning_rate": 0.0003994867274271153, "loss": 0.7899, "step": 3090 }, { "epoch": 0.06899928774928775, "grad_norm": 1.270365595817566, "learning_rate": 0.0003994833839919183, "loss": 0.7652, "step": 3100 }, { "epoch": 0.06922186609686609, "grad_norm": 0.5779575109481812, "learning_rate": 0.00039948002971663103, "loss": 0.6203, "step": 3110 }, { "epoch": 0.06944444444444445, "grad_norm": 0.5712846517562866, "learning_rate": 0.0003994766646014355, "loss": 0.7494, "step": 3120 }, { "epoch": 0.06966702279202279, "grad_norm": 0.5811907649040222, "learning_rate": 0.00039947328864651485, "loss": 0.6286, "step": 3130 }, { "epoch": 0.06988960113960115, "grad_norm": 0.7779800891876221, "learning_rate": 0.00039946990185205235, "loss": 0.6996, "step": 3140 }, { "epoch": 0.07011217948717949, "grad_norm": 0.5738331079483032, "learning_rate": 0.0003994665042182321, "loss": 0.6631, "step": 3150 }, { "epoch": 0.07033475783475783, "grad_norm": 1.0784659385681152, "learning_rate": 0.00039946309574523874, "loss": 0.6092, "step": 3160 }, { "epoch": 0.07055733618233619, "grad_norm": 1.0656852722167969, "learning_rate": 0.00039945967643325746, "loss": 0.6525, "step": 3170 }, { "epoch": 0.07077991452991453, "grad_norm": 0.6276305913925171, "learning_rate": 0.0003994562462824741, "loss": 0.5683, "step": 3180 }, { "epoch": 0.07100249287749288, "grad_norm": 0.6950351595878601, "learning_rate": 0.00039945280529307496, "loss": 0.5425, "step": 3190 }, { "epoch": 0.07122507122507123, "grad_norm": 0.6023120880126953, "learning_rate": 0.0003994493534652471, "loss": 0.614, "step": 3200 }, { "epoch": 0.07144764957264957, "grad_norm": 0.6610495448112488, "learning_rate": 0.00039944589079917814, "loss": 0.5214, "step": 3210 }, { "epoch": 0.07167022792022792, "grad_norm": 0.6308820843696594, "learning_rate": 0.0003994424172950562, "loss": 0.6614, "step": 3220 }, { "epoch": 0.07189280626780627, "grad_norm": 0.5825803875923157, "learning_rate": 0.00039943893295307, "loss": 0.6141, "step": 3230 }, { "epoch": 0.07211538461538461, "grad_norm": 0.7101708650588989, "learning_rate": 0.00039943543777340895, "loss": 0.6021, "step": 3240 }, { "epoch": 0.07233796296296297, "grad_norm": 0.3456685245037079, "learning_rate": 0.00039943193175626297, "loss": 0.6149, "step": 3250 }, { "epoch": 0.07256054131054131, "grad_norm": 0.8595088124275208, "learning_rate": 0.0003994284149018225, "loss": 0.6951, "step": 3260 }, { "epoch": 0.07278311965811966, "grad_norm": 0.5759161114692688, "learning_rate": 0.0003994248872102787, "loss": 0.5981, "step": 3270 }, { "epoch": 0.073005698005698, "grad_norm": 0.7946069836616516, "learning_rate": 0.00039942134868182333, "loss": 0.6799, "step": 3280 }, { "epoch": 0.07322827635327635, "grad_norm": 0.5611863136291504, "learning_rate": 0.00039941779931664857, "loss": 0.8381, "step": 3290 }, { "epoch": 0.0734508547008547, "grad_norm": 0.6077750325202942, "learning_rate": 0.0003994142391149474, "loss": 0.7151, "step": 3300 }, { "epoch": 0.07367343304843305, "grad_norm": 0.6016742587089539, "learning_rate": 0.0003994106680769131, "loss": 0.5861, "step": 3310 }, { "epoch": 0.0738960113960114, "grad_norm": 0.8998081684112549, "learning_rate": 0.00039940708620273994, "loss": 0.7188, "step": 3320 }, { "epoch": 0.07411858974358974, "grad_norm": 0.607742965221405, "learning_rate": 0.0003994034934926226, "loss": 0.5308, "step": 3330 }, { "epoch": 0.07434116809116809, "grad_norm": 0.3822357654571533, "learning_rate": 0.000399399889946756, "loss": 0.5146, "step": 3340 }, { "epoch": 0.07456374643874644, "grad_norm": 0.5996620655059814, "learning_rate": 0.0003993962755653362, "loss": 0.582, "step": 3350 }, { "epoch": 0.07478632478632478, "grad_norm": 0.6241055727005005, "learning_rate": 0.00039939265034855955, "loss": 0.6331, "step": 3360 }, { "epoch": 0.07500890313390314, "grad_norm": 0.41144707798957825, "learning_rate": 0.00039938901429662307, "loss": 0.5581, "step": 3370 }, { "epoch": 0.07523148148148148, "grad_norm": 0.7115545868873596, "learning_rate": 0.00039938536740972427, "loss": 0.7581, "step": 3380 }, { "epoch": 0.07545405982905982, "grad_norm": 0.4126880466938019, "learning_rate": 0.00039938170968806144, "loss": 0.6505, "step": 3390 }, { "epoch": 0.07567663817663818, "grad_norm": 0.6458644866943359, "learning_rate": 0.0003993780411318333, "loss": 0.7225, "step": 3400 }, { "epoch": 0.07589921652421652, "grad_norm": 0.7713171243667603, "learning_rate": 0.0003993743617412391, "loss": 0.7107, "step": 3410 }, { "epoch": 0.07612179487179487, "grad_norm": 0.5597316026687622, "learning_rate": 0.00039937067151647894, "loss": 0.7864, "step": 3420 }, { "epoch": 0.07634437321937322, "grad_norm": 0.5204119682312012, "learning_rate": 0.0003993669704577533, "loss": 0.6962, "step": 3430 }, { "epoch": 0.07656695156695156, "grad_norm": 0.5534259676933289, "learning_rate": 0.00039936325856526324, "loss": 0.7243, "step": 3440 }, { "epoch": 0.07678952991452992, "grad_norm": 0.6227967739105225, "learning_rate": 0.00039935953583921047, "loss": 0.6225, "step": 3450 }, { "epoch": 0.07701210826210826, "grad_norm": 0.5725488662719727, "learning_rate": 0.00039935580227979734, "loss": 0.6252, "step": 3460 }, { "epoch": 0.0772346866096866, "grad_norm": 0.6526398658752441, "learning_rate": 0.0003993520578872267, "loss": 0.6411, "step": 3470 }, { "epoch": 0.07745726495726496, "grad_norm": 0.9423260688781738, "learning_rate": 0.000399348302661702, "loss": 0.6321, "step": 3480 }, { "epoch": 0.0776798433048433, "grad_norm": 0.5385962724685669, "learning_rate": 0.0003993445366034275, "loss": 0.6431, "step": 3490 }, { "epoch": 0.07790242165242166, "grad_norm": 1.0496457815170288, "learning_rate": 0.00039934075971260753, "loss": 0.709, "step": 3500 }, { "epoch": 0.078125, "grad_norm": 0.9465022087097168, "learning_rate": 0.0003993369719894475, "loss": 0.5711, "step": 3510 }, { "epoch": 0.07834757834757834, "grad_norm": 0.6062886714935303, "learning_rate": 0.0003993331734341533, "loss": 0.6693, "step": 3520 }, { "epoch": 0.0785701566951567, "grad_norm": 0.6801285743713379, "learning_rate": 0.0003993293640469313, "loss": 0.5972, "step": 3530 }, { "epoch": 0.07879273504273504, "grad_norm": 0.5197761654853821, "learning_rate": 0.0003993255438279884, "loss": 0.5398, "step": 3540 }, { "epoch": 0.0790153133903134, "grad_norm": 0.6844536662101746, "learning_rate": 0.0003993217127775323, "loss": 0.7739, "step": 3550 }, { "epoch": 0.07923789173789174, "grad_norm": 0.5622774958610535, "learning_rate": 0.00039931787089577113, "loss": 0.5835, "step": 3560 }, { "epoch": 0.07946047008547008, "grad_norm": 0.6373755931854248, "learning_rate": 0.00039931401818291373, "loss": 0.588, "step": 3570 }, { "epoch": 0.07968304843304844, "grad_norm": 0.6422734260559082, "learning_rate": 0.0003993101546391694, "loss": 0.6587, "step": 3580 }, { "epoch": 0.07990562678062678, "grad_norm": 0.3758021593093872, "learning_rate": 0.0003993062802647481, "loss": 0.5819, "step": 3590 }, { "epoch": 0.08012820512820513, "grad_norm": 0.41243478655815125, "learning_rate": 0.00039930239505986035, "loss": 0.6124, "step": 3600 }, { "epoch": 0.08035078347578348, "grad_norm": 0.33799731731414795, "learning_rate": 0.0003992984990247173, "loss": 0.5755, "step": 3610 }, { "epoch": 0.08057336182336182, "grad_norm": 0.6254426836967468, "learning_rate": 0.0003992945921595307, "loss": 0.5911, "step": 3620 }, { "epoch": 0.08079594017094018, "grad_norm": 0.7614779472351074, "learning_rate": 0.0003992906744645128, "loss": 0.6772, "step": 3630 }, { "epoch": 0.08101851851851852, "grad_norm": 0.5825209617614746, "learning_rate": 0.0003992867459398765, "loss": 0.7136, "step": 3640 }, { "epoch": 0.08124109686609686, "grad_norm": 0.5053777098655701, "learning_rate": 0.0003992828065858352, "loss": 0.5367, "step": 3650 }, { "epoch": 0.08146367521367522, "grad_norm": 0.9008991718292236, "learning_rate": 0.00039927885640260317, "loss": 0.7203, "step": 3660 }, { "epoch": 0.08168625356125356, "grad_norm": 0.6658133268356323, "learning_rate": 0.0003992748953903949, "loss": 0.5676, "step": 3670 }, { "epoch": 0.08190883190883191, "grad_norm": 0.4479519724845886, "learning_rate": 0.0003992709235494257, "loss": 0.6296, "step": 3680 }, { "epoch": 0.08213141025641026, "grad_norm": 0.7825643420219421, "learning_rate": 0.0003992669408799113, "loss": 0.6228, "step": 3690 }, { "epoch": 0.0823539886039886, "grad_norm": 0.7348558902740479, "learning_rate": 0.0003992629473820683, "loss": 0.6685, "step": 3700 }, { "epoch": 0.08257656695156695, "grad_norm": 0.6211950182914734, "learning_rate": 0.0003992589430561136, "loss": 0.593, "step": 3710 }, { "epoch": 0.0827991452991453, "grad_norm": 0.49289315938949585, "learning_rate": 0.00039925492790226477, "loss": 0.7275, "step": 3720 }, { "epoch": 0.08302172364672365, "grad_norm": 0.513803243637085, "learning_rate": 0.00039925090192074005, "loss": 0.6036, "step": 3730 }, { "epoch": 0.083244301994302, "grad_norm": 0.5003082752227783, "learning_rate": 0.00039924686511175824, "loss": 0.6403, "step": 3740 }, { "epoch": 0.08346688034188034, "grad_norm": 0.7302523255348206, "learning_rate": 0.00039924281747553866, "loss": 0.6492, "step": 3750 }, { "epoch": 0.08368945868945869, "grad_norm": 0.6277854442596436, "learning_rate": 0.00039923875901230125, "loss": 0.5726, "step": 3760 }, { "epoch": 0.08391203703703703, "grad_norm": 0.9266096353530884, "learning_rate": 0.00039923468972226654, "loss": 0.8142, "step": 3770 }, { "epoch": 0.08413461538461539, "grad_norm": 0.5914018750190735, "learning_rate": 0.00039923060960565576, "loss": 0.628, "step": 3780 }, { "epoch": 0.08435719373219373, "grad_norm": 0.7605827450752258, "learning_rate": 0.0003992265186626905, "loss": 0.7179, "step": 3790 }, { "epoch": 0.08457977207977208, "grad_norm": 0.7958018779754639, "learning_rate": 0.0003992224168935932, "loss": 0.6565, "step": 3800 }, { "epoch": 0.08480235042735043, "grad_norm": 0.49368587136268616, "learning_rate": 0.0003992183042985866, "loss": 0.6812, "step": 3810 }, { "epoch": 0.08502492877492877, "grad_norm": 0.6145605444908142, "learning_rate": 0.00039921418087789426, "loss": 0.6702, "step": 3820 }, { "epoch": 0.08524750712250712, "grad_norm": 0.5280147790908813, "learning_rate": 0.0003992100466317403, "loss": 0.694, "step": 3830 }, { "epoch": 0.08547008547008547, "grad_norm": 0.771976113319397, "learning_rate": 0.0003992059015603493, "loss": 0.6219, "step": 3840 }, { "epoch": 0.08569266381766381, "grad_norm": 0.888991117477417, "learning_rate": 0.00039920174566394646, "loss": 0.6727, "step": 3850 }, { "epoch": 0.08591524216524217, "grad_norm": 0.8724148869514465, "learning_rate": 0.0003991975789427578, "loss": 0.6773, "step": 3860 }, { "epoch": 0.08613782051282051, "grad_norm": 0.5815834999084473, "learning_rate": 0.0003991934013970096, "loss": 0.6245, "step": 3870 }, { "epoch": 0.08636039886039885, "grad_norm": 0.8174399733543396, "learning_rate": 0.0003991892130269288, "loss": 0.6157, "step": 3880 }, { "epoch": 0.08658297720797721, "grad_norm": 0.7425127029418945, "learning_rate": 0.0003991850138327432, "loss": 0.7522, "step": 3890 }, { "epoch": 0.08680555555555555, "grad_norm": 0.46512556076049805, "learning_rate": 0.00039918080381468095, "loss": 0.5833, "step": 3900 }, { "epoch": 0.08702813390313391, "grad_norm": 0.6372483968734741, "learning_rate": 0.0003991765829729706, "loss": 0.5622, "step": 3910 }, { "epoch": 0.08725071225071225, "grad_norm": 0.627477765083313, "learning_rate": 0.00039917235130784175, "loss": 0.708, "step": 3920 }, { "epoch": 0.08747329059829059, "grad_norm": 0.7006207704544067, "learning_rate": 0.0003991681088195243, "loss": 0.7236, "step": 3930 }, { "epoch": 0.08769586894586895, "grad_norm": 0.5845404863357544, "learning_rate": 0.0003991638555082488, "loss": 0.6517, "step": 3940 }, { "epoch": 0.08791844729344729, "grad_norm": 0.4940961003303528, "learning_rate": 0.0003991595913742463, "loss": 0.68, "step": 3950 }, { "epoch": 0.08814102564102565, "grad_norm": 0.6172021627426147, "learning_rate": 0.00039915531641774855, "loss": 0.7886, "step": 3960 }, { "epoch": 0.08836360398860399, "grad_norm": 0.671653687953949, "learning_rate": 0.00039915103063898786, "loss": 0.7868, "step": 3970 }, { "epoch": 0.08858618233618233, "grad_norm": 0.6825588941574097, "learning_rate": 0.0003991467340381972, "loss": 0.5949, "step": 3980 }, { "epoch": 0.08880876068376069, "grad_norm": 0.6282208561897278, "learning_rate": 0.0003991424266156099, "loss": 0.6068, "step": 3990 }, { "epoch": 0.08903133903133903, "grad_norm": 0.8855395913124084, "learning_rate": 0.00039913810837146024, "loss": 0.6163, "step": 4000 }, { "epoch": 0.08925391737891739, "grad_norm": 0.566017210483551, "learning_rate": 0.0003991337793059827, "loss": 0.6877, "step": 4010 }, { "epoch": 0.08947649572649573, "grad_norm": 0.5654011368751526, "learning_rate": 0.0003991294394194125, "loss": 0.6206, "step": 4020 }, { "epoch": 0.08969907407407407, "grad_norm": 0.7026615738868713, "learning_rate": 0.0003991250887119856, "loss": 0.6946, "step": 4030 }, { "epoch": 0.08992165242165243, "grad_norm": 0.8429543375968933, "learning_rate": 0.0003991207271839383, "loss": 0.6198, "step": 4040 }, { "epoch": 0.09014423076923077, "grad_norm": 0.5337466597557068, "learning_rate": 0.0003991163548355078, "loss": 0.7019, "step": 4050 }, { "epoch": 0.09036680911680911, "grad_norm": 0.6086145639419556, "learning_rate": 0.00039911197166693144, "loss": 0.5869, "step": 4060 }, { "epoch": 0.09058938746438747, "grad_norm": 0.578947901725769, "learning_rate": 0.00039910757767844767, "loss": 0.6893, "step": 4070 }, { "epoch": 0.09081196581196581, "grad_norm": 0.6418620944023132, "learning_rate": 0.00039910317287029505, "loss": 0.695, "step": 4080 }, { "epoch": 0.09103454415954416, "grad_norm": 0.587532639503479, "learning_rate": 0.00039909875724271305, "loss": 0.5836, "step": 4090 }, { "epoch": 0.0912571225071225, "grad_norm": 0.46247434616088867, "learning_rate": 0.0003990943307959416, "loss": 0.5812, "step": 4100 }, { "epoch": 0.09147970085470085, "grad_norm": 0.5754249691963196, "learning_rate": 0.0003990898935302212, "loss": 0.6688, "step": 4110 }, { "epoch": 0.0917022792022792, "grad_norm": 0.6887472867965698, "learning_rate": 0.000399085445445793, "loss": 0.6571, "step": 4120 }, { "epoch": 0.09192485754985755, "grad_norm": 0.9536170959472656, "learning_rate": 0.00039908098654289876, "loss": 0.7645, "step": 4130 }, { "epoch": 0.0921474358974359, "grad_norm": 0.5921887159347534, "learning_rate": 0.0003990765168217807, "loss": 0.6758, "step": 4140 }, { "epoch": 0.09237001424501425, "grad_norm": 0.8859508633613586, "learning_rate": 0.0003990720362826817, "loss": 0.57, "step": 4150 }, { "epoch": 0.09259259259259259, "grad_norm": 0.76180499792099, "learning_rate": 0.00039906754492584535, "loss": 0.6223, "step": 4160 }, { "epoch": 0.09281517094017094, "grad_norm": 0.43025851249694824, "learning_rate": 0.0003990630427515156, "loss": 0.6777, "step": 4170 }, { "epoch": 0.09303774928774929, "grad_norm": 0.3659840524196625, "learning_rate": 0.00039905852975993724, "loss": 0.5963, "step": 4180 }, { "epoch": 0.09326032763532764, "grad_norm": 0.8982321619987488, "learning_rate": 0.0003990540059513554, "loss": 0.6307, "step": 4190 }, { "epoch": 0.09348290598290598, "grad_norm": 0.6464897990226746, "learning_rate": 0.0003990494713260158, "loss": 0.6593, "step": 4200 }, { "epoch": 0.09370548433048433, "grad_norm": 0.7186951637268066, "learning_rate": 0.00039904492588416506, "loss": 0.7252, "step": 4210 }, { "epoch": 0.09392806267806268, "grad_norm": 0.5287630558013916, "learning_rate": 0.00039904036962605006, "loss": 0.7238, "step": 4220 }, { "epoch": 0.09415064102564102, "grad_norm": 0.963729739189148, "learning_rate": 0.0003990358025519185, "loss": 0.7215, "step": 4230 }, { "epoch": 0.09437321937321937, "grad_norm": 0.355486124753952, "learning_rate": 0.0003990312246620184, "loss": 0.6176, "step": 4240 }, { "epoch": 0.09459579772079772, "grad_norm": 0.671535849571228, "learning_rate": 0.0003990266359565987, "loss": 0.6039, "step": 4250 }, { "epoch": 0.09481837606837606, "grad_norm": 0.7099301218986511, "learning_rate": 0.0003990220364359087, "loss": 0.6712, "step": 4260 }, { "epoch": 0.09504095441595442, "grad_norm": 0.5276811718940735, "learning_rate": 0.00039901742610019825, "loss": 0.6352, "step": 4270 }, { "epoch": 0.09526353276353276, "grad_norm": 0.547616720199585, "learning_rate": 0.00039901280494971796, "loss": 0.6072, "step": 4280 }, { "epoch": 0.0954861111111111, "grad_norm": 0.570436954498291, "learning_rate": 0.0003990081729847189, "loss": 0.678, "step": 4290 }, { "epoch": 0.09570868945868946, "grad_norm": 0.8578181862831116, "learning_rate": 0.0003990035302054528, "loss": 0.5163, "step": 4300 }, { "epoch": 0.0959312678062678, "grad_norm": 0.6730733513832092, "learning_rate": 0.00039899887661217203, "loss": 0.6553, "step": 4310 }, { "epoch": 0.09615384615384616, "grad_norm": 0.8366342186927795, "learning_rate": 0.0003989942122051293, "loss": 0.6638, "step": 4320 }, { "epoch": 0.0963764245014245, "grad_norm": 0.6078762412071228, "learning_rate": 0.00039898953698457826, "loss": 0.5934, "step": 4330 }, { "epoch": 0.09659900284900284, "grad_norm": 0.7725120782852173, "learning_rate": 0.0003989848509507728, "loss": 0.6387, "step": 4340 }, { "epoch": 0.0968215811965812, "grad_norm": 0.87664794921875, "learning_rate": 0.0003989801541039677, "loss": 0.675, "step": 4350 }, { "epoch": 0.09704415954415954, "grad_norm": 0.43659526109695435, "learning_rate": 0.00039897544644441814, "loss": 0.6767, "step": 4360 }, { "epoch": 0.0972667378917379, "grad_norm": 0.615048348903656, "learning_rate": 0.0003989707279723799, "loss": 0.8962, "step": 4370 }, { "epoch": 0.09748931623931624, "grad_norm": 0.6210547089576721, "learning_rate": 0.0003989659986881094, "loss": 0.553, "step": 4380 }, { "epoch": 0.09771189458689458, "grad_norm": 0.8046955466270447, "learning_rate": 0.0003989612585918637, "loss": 0.5356, "step": 4390 }, { "epoch": 0.09793447293447294, "grad_norm": 0.5782980918884277, "learning_rate": 0.0003989565076839003, "loss": 0.726, "step": 4400 }, { "epoch": 0.09815705128205128, "grad_norm": 0.9094864726066589, "learning_rate": 0.0003989517459644774, "loss": 0.6938, "step": 4410 }, { "epoch": 0.09837962962962964, "grad_norm": 0.49428123235702515, "learning_rate": 0.00039894697343385377, "loss": 0.5804, "step": 4420 }, { "epoch": 0.09860220797720798, "grad_norm": 0.6479130387306213, "learning_rate": 0.00039894219009228876, "loss": 0.6167, "step": 4430 }, { "epoch": 0.09882478632478632, "grad_norm": 0.784542977809906, "learning_rate": 0.0003989373959400422, "loss": 0.6455, "step": 4440 }, { "epoch": 0.09904736467236468, "grad_norm": 0.6367858052253723, "learning_rate": 0.00039893259097737474, "loss": 0.6466, "step": 4450 }, { "epoch": 0.09926994301994302, "grad_norm": 2.0138823986053467, "learning_rate": 0.00039892777520454746, "loss": 0.6594, "step": 4460 }, { "epoch": 0.09949252136752136, "grad_norm": 0.5950619578361511, "learning_rate": 0.00039892294862182195, "loss": 0.727, "step": 4470 }, { "epoch": 0.09971509971509972, "grad_norm": 0.7700321078300476, "learning_rate": 0.0003989181112294606, "loss": 0.6564, "step": 4480 }, { "epoch": 0.09993767806267806, "grad_norm": 0.7688900232315063, "learning_rate": 0.0003989132630277263, "loss": 0.5971, "step": 4490 }, { "epoch": 0.10016025641025642, "grad_norm": 0.8038826584815979, "learning_rate": 0.0003989084040168824, "loss": 0.5495, "step": 4500 }, { "epoch": 0.10038283475783476, "grad_norm": 0.7080345749855042, "learning_rate": 0.000398903534197193, "loss": 0.6569, "step": 4510 }, { "epoch": 0.1006054131054131, "grad_norm": 0.5524005889892578, "learning_rate": 0.0003988986535689227, "loss": 0.5265, "step": 4520 }, { "epoch": 0.10082799145299146, "grad_norm": 1.074703335762024, "learning_rate": 0.0003988937621323368, "loss": 0.5926, "step": 4530 }, { "epoch": 0.1010505698005698, "grad_norm": 0.5182725787162781, "learning_rate": 0.000398888859887701, "loss": 0.5199, "step": 4540 }, { "epoch": 0.10127314814814815, "grad_norm": 0.6152669787406921, "learning_rate": 0.00039888394683528167, "loss": 0.7744, "step": 4550 }, { "epoch": 0.1014957264957265, "grad_norm": 0.7104138731956482, "learning_rate": 0.00039887902297534595, "loss": 0.6647, "step": 4560 }, { "epoch": 0.10171830484330484, "grad_norm": 0.8174471855163574, "learning_rate": 0.0003988740883081613, "loss": 0.7876, "step": 4570 }, { "epoch": 0.1019408831908832, "grad_norm": 0.743220329284668, "learning_rate": 0.00039886914283399587, "loss": 0.8068, "step": 4580 }, { "epoch": 0.10216346153846154, "grad_norm": 0.3491692543029785, "learning_rate": 0.0003988641865531184, "loss": 0.5628, "step": 4590 }, { "epoch": 0.10238603988603989, "grad_norm": 0.9163690805435181, "learning_rate": 0.0003988592194657982, "loss": 0.6775, "step": 4600 }, { "epoch": 0.10260861823361823, "grad_norm": 0.27448055148124695, "learning_rate": 0.0003988542415723053, "loss": 0.5396, "step": 4610 }, { "epoch": 0.10283119658119658, "grad_norm": 0.5256446003913879, "learning_rate": 0.0003988492528729101, "loss": 0.5133, "step": 4620 }, { "epoch": 0.10305377492877493, "grad_norm": 0.5656632781028748, "learning_rate": 0.00039884425336788367, "loss": 0.5857, "step": 4630 }, { "epoch": 0.10327635327635327, "grad_norm": 0.6648633480072021, "learning_rate": 0.0003988392430574978, "loss": 0.6029, "step": 4640 }, { "epoch": 0.10349893162393162, "grad_norm": 0.49245837330818176, "learning_rate": 0.00039883422194202464, "loss": 0.6558, "step": 4650 }, { "epoch": 0.10372150997150997, "grad_norm": 0.6503485441207886, "learning_rate": 0.00039882919002173713, "loss": 0.6207, "step": 4660 }, { "epoch": 0.10394408831908832, "grad_norm": 0.5436996817588806, "learning_rate": 0.0003988241472969086, "loss": 0.7351, "step": 4670 }, { "epoch": 0.10416666666666667, "grad_norm": 0.8330632448196411, "learning_rate": 0.0003988190937678132, "loss": 0.6843, "step": 4680 }, { "epoch": 0.10438924501424501, "grad_norm": 0.7099012136459351, "learning_rate": 0.00039881402943472543, "loss": 0.624, "step": 4690 }, { "epoch": 0.10461182336182336, "grad_norm": 0.47862574458122253, "learning_rate": 0.0003988089542979206, "loss": 0.6694, "step": 4700 }, { "epoch": 0.10483440170940171, "grad_norm": 0.5711633563041687, "learning_rate": 0.0003988038683576744, "loss": 0.5787, "step": 4710 }, { "epoch": 0.10505698005698005, "grad_norm": 0.8468604683876038, "learning_rate": 0.0003987987716142632, "loss": 0.5867, "step": 4720 }, { "epoch": 0.10527955840455841, "grad_norm": 0.45642709732055664, "learning_rate": 0.0003987936640679641, "loss": 0.5533, "step": 4730 }, { "epoch": 0.10550213675213675, "grad_norm": 0.847876250743866, "learning_rate": 0.00039878854571905454, "loss": 0.732, "step": 4740 }, { "epoch": 0.1057247150997151, "grad_norm": 0.690815269947052, "learning_rate": 0.0003987834165678126, "loss": 0.6412, "step": 4750 }, { "epoch": 0.10594729344729345, "grad_norm": 0.6536726355552673, "learning_rate": 0.0003987782766145172, "loss": 0.5655, "step": 4760 }, { "epoch": 0.10616987179487179, "grad_norm": 0.660818874835968, "learning_rate": 0.00039877312585944743, "loss": 0.5416, "step": 4770 }, { "epoch": 0.10639245014245015, "grad_norm": 0.4305530786514282, "learning_rate": 0.0003987679643028832, "loss": 0.6333, "step": 4780 }, { "epoch": 0.10661502849002849, "grad_norm": 0.7705379128456116, "learning_rate": 0.00039876279194510524, "loss": 0.6503, "step": 4790 }, { "epoch": 0.10683760683760683, "grad_norm": 0.5306333303451538, "learning_rate": 0.00039875760878639436, "loss": 0.5507, "step": 4800 }, { "epoch": 0.10706018518518519, "grad_norm": 0.7447746396064758, "learning_rate": 0.0003987524148270323, "loss": 0.6574, "step": 4810 }, { "epoch": 0.10728276353276353, "grad_norm": 0.652462363243103, "learning_rate": 0.0003987472100673013, "loss": 0.6528, "step": 4820 }, { "epoch": 0.10750534188034189, "grad_norm": 0.6787484288215637, "learning_rate": 0.00039874199450748427, "loss": 0.5111, "step": 4830 }, { "epoch": 0.10772792022792023, "grad_norm": 0.3653728663921356, "learning_rate": 0.0003987367681478645, "loss": 0.6208, "step": 4840 }, { "epoch": 0.10795049857549857, "grad_norm": 0.5167881846427917, "learning_rate": 0.00039873153098872607, "loss": 0.725, "step": 4850 }, { "epoch": 0.10817307692307693, "grad_norm": 0.45200806856155396, "learning_rate": 0.00039872628303035357, "loss": 0.6057, "step": 4860 }, { "epoch": 0.10839565527065527, "grad_norm": 0.7915605306625366, "learning_rate": 0.00039872102427303214, "loss": 0.5791, "step": 4870 }, { "epoch": 0.10861823361823361, "grad_norm": 0.7434691190719604, "learning_rate": 0.0003987157547170476, "loss": 0.7281, "step": 4880 }, { "epoch": 0.10884081196581197, "grad_norm": 0.6410295963287354, "learning_rate": 0.00039871047436268627, "loss": 0.6529, "step": 4890 }, { "epoch": 0.10906339031339031, "grad_norm": 0.655968964099884, "learning_rate": 0.0003987051832102351, "loss": 0.6572, "step": 4900 }, { "epoch": 0.10928596866096867, "grad_norm": 0.5038496851921082, "learning_rate": 0.0003986998812599816, "loss": 0.5462, "step": 4910 }, { "epoch": 0.10950854700854701, "grad_norm": 0.7149390578269958, "learning_rate": 0.00039869456851221387, "loss": 0.6658, "step": 4920 }, { "epoch": 0.10973112535612535, "grad_norm": 0.5790989398956299, "learning_rate": 0.00039868924496722064, "loss": 0.6599, "step": 4930 }, { "epoch": 0.1099537037037037, "grad_norm": 0.6562433838844299, "learning_rate": 0.0003986839106252912, "loss": 0.6651, "step": 4940 }, { "epoch": 0.11017628205128205, "grad_norm": 0.6387081146240234, "learning_rate": 0.00039867856548671536, "loss": 0.7696, "step": 4950 }, { "epoch": 0.1103988603988604, "grad_norm": 0.7654621005058289, "learning_rate": 0.00039867320955178364, "loss": 0.7165, "step": 4960 }, { "epoch": 0.11062143874643875, "grad_norm": 0.49957525730133057, "learning_rate": 0.00039866784282078713, "loss": 0.6508, "step": 4970 }, { "epoch": 0.11084401709401709, "grad_norm": 0.6308586001396179, "learning_rate": 0.00039866246529401733, "loss": 0.5636, "step": 4980 }, { "epoch": 0.11106659544159544, "grad_norm": 0.6661836504936218, "learning_rate": 0.0003986570769717666, "loss": 0.5475, "step": 4990 }, { "epoch": 0.11128917378917379, "grad_norm": 0.5627828240394592, "learning_rate": 0.0003986516778543276, "loss": 0.6255, "step": 5000 }, { "epoch": 0.11151175213675214, "grad_norm": 0.7126817107200623, "learning_rate": 0.00039864626794199385, "loss": 0.5624, "step": 5010 }, { "epoch": 0.11173433048433049, "grad_norm": 0.7044990658760071, "learning_rate": 0.00039864084723505925, "loss": 0.6114, "step": 5020 }, { "epoch": 0.11195690883190883, "grad_norm": 0.7257621884346008, "learning_rate": 0.00039863541573381846, "loss": 0.6342, "step": 5030 }, { "epoch": 0.11217948717948718, "grad_norm": 0.6506055593490601, "learning_rate": 0.0003986299734385665, "loss": 0.6685, "step": 5040 }, { "epoch": 0.11240206552706553, "grad_norm": 0.7877110838890076, "learning_rate": 0.0003986245203495992, "loss": 0.5746, "step": 5050 }, { "epoch": 0.11262464387464387, "grad_norm": 0.6759909391403198, "learning_rate": 0.0003986190564672129, "loss": 0.6528, "step": 5060 }, { "epoch": 0.11284722222222222, "grad_norm": 0.5384511351585388, "learning_rate": 0.00039861358179170447, "loss": 0.5897, "step": 5070 }, { "epoch": 0.11306980056980057, "grad_norm": 0.6556115746498108, "learning_rate": 0.0003986080963233714, "loss": 0.636, "step": 5080 }, { "epoch": 0.11329237891737892, "grad_norm": 0.5106657147407532, "learning_rate": 0.00039860260006251174, "loss": 0.5044, "step": 5090 }, { "epoch": 0.11351495726495726, "grad_norm": 0.6946271657943726, "learning_rate": 0.0003985970930094242, "loss": 0.496, "step": 5100 }, { "epoch": 0.1137375356125356, "grad_norm": 0.7798992395401001, "learning_rate": 0.00039859157516440813, "loss": 0.6639, "step": 5110 }, { "epoch": 0.11396011396011396, "grad_norm": 0.5560083389282227, "learning_rate": 0.00039858604652776323, "loss": 0.644, "step": 5120 }, { "epoch": 0.1141826923076923, "grad_norm": 0.8663046360015869, "learning_rate": 0.00039858050709979, "loss": 0.6779, "step": 5130 }, { "epoch": 0.11440527065527066, "grad_norm": 0.6664817333221436, "learning_rate": 0.00039857495688078946, "loss": 0.6587, "step": 5140 }, { "epoch": 0.114627849002849, "grad_norm": 0.6127937436103821, "learning_rate": 0.00039856939587106324, "loss": 0.5973, "step": 5150 }, { "epoch": 0.11485042735042734, "grad_norm": 0.4794783294200897, "learning_rate": 0.00039856382407091345, "loss": 0.5779, "step": 5160 }, { "epoch": 0.1150730056980057, "grad_norm": 0.418834924697876, "learning_rate": 0.0003985582414806429, "loss": 0.5309, "step": 5170 }, { "epoch": 0.11529558404558404, "grad_norm": 0.6403540968894958, "learning_rate": 0.00039855264810055493, "loss": 0.6882, "step": 5180 }, { "epoch": 0.1155181623931624, "grad_norm": 0.9398502707481384, "learning_rate": 0.00039854704393095357, "loss": 0.5824, "step": 5190 }, { "epoch": 0.11574074074074074, "grad_norm": 0.6912689208984375, "learning_rate": 0.0003985414289721433, "loss": 0.6691, "step": 5200 }, { "epoch": 0.11596331908831908, "grad_norm": 0.33099091053009033, "learning_rate": 0.00039853580322442923, "loss": 0.5808, "step": 5210 }, { "epoch": 0.11618589743589744, "grad_norm": 0.48186391592025757, "learning_rate": 0.00039853016668811716, "loss": 0.8117, "step": 5220 }, { "epoch": 0.11640847578347578, "grad_norm": 0.46495237946510315, "learning_rate": 0.0003985245193635132, "loss": 0.5786, "step": 5230 }, { "epoch": 0.11663105413105414, "grad_norm": 0.7267534136772156, "learning_rate": 0.0003985188612509244, "loss": 0.5912, "step": 5240 }, { "epoch": 0.11685363247863248, "grad_norm": 0.4775184988975525, "learning_rate": 0.00039851319235065816, "loss": 0.6497, "step": 5250 }, { "epoch": 0.11707621082621082, "grad_norm": 0.9448670148849487, "learning_rate": 0.00039850751266302253, "loss": 0.6227, "step": 5260 }, { "epoch": 0.11729878917378918, "grad_norm": 0.785839855670929, "learning_rate": 0.00039850182218832615, "loss": 0.6165, "step": 5270 }, { "epoch": 0.11752136752136752, "grad_norm": 0.4065621495246887, "learning_rate": 0.00039849612092687824, "loss": 0.744, "step": 5280 }, { "epoch": 0.11774394586894586, "grad_norm": 0.9685630798339844, "learning_rate": 0.00039849040887898865, "loss": 0.5659, "step": 5290 }, { "epoch": 0.11796652421652422, "grad_norm": 0.5907897353172302, "learning_rate": 0.0003984846860449677, "loss": 0.6881, "step": 5300 }, { "epoch": 0.11818910256410256, "grad_norm": 0.727581799030304, "learning_rate": 0.0003984789524251265, "loss": 0.6058, "step": 5310 }, { "epoch": 0.11841168091168092, "grad_norm": 0.6126496195793152, "learning_rate": 0.00039847320801977647, "loss": 0.7483, "step": 5320 }, { "epoch": 0.11863425925925926, "grad_norm": 1.0143053531646729, "learning_rate": 0.0003984674528292299, "loss": 0.53, "step": 5330 }, { "epoch": 0.1188568376068376, "grad_norm": 0.6108867526054382, "learning_rate": 0.00039846168685379944, "loss": 0.6549, "step": 5340 }, { "epoch": 0.11907941595441596, "grad_norm": 0.5987547636032104, "learning_rate": 0.0003984559100937984, "loss": 0.5832, "step": 5350 }, { "epoch": 0.1193019943019943, "grad_norm": 0.7370131015777588, "learning_rate": 0.00039845012254954084, "loss": 0.5923, "step": 5360 }, { "epoch": 0.11952457264957266, "grad_norm": 0.6291268467903137, "learning_rate": 0.0003984443242213411, "loss": 0.6125, "step": 5370 }, { "epoch": 0.119747150997151, "grad_norm": 0.5933393239974976, "learning_rate": 0.0003984385151095143, "loss": 0.8159, "step": 5380 }, { "epoch": 0.11996972934472934, "grad_norm": 0.833824098110199, "learning_rate": 0.0003984326952143762, "loss": 0.6775, "step": 5390 }, { "epoch": 0.12001424501424501, "eval_loss": 0.6372599005699158, "eval_runtime": 337.3844, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 5392 }, { "epoch": 0.1201923076923077, "grad_norm": 0.7204414010047913, "learning_rate": 0.00039842686453624295, "loss": 0.5655, "step": 5400 }, { "epoch": 0.12041488603988604, "grad_norm": 0.7585781812667847, "learning_rate": 0.0003984210230754315, "loss": 0.6104, "step": 5410 }, { "epoch": 0.1206374643874644, "grad_norm": 0.5115826725959778, "learning_rate": 0.00039841517083225915, "loss": 0.663, "step": 5420 }, { "epoch": 0.12086004273504274, "grad_norm": 0.38228660821914673, "learning_rate": 0.0003984093078070441, "loss": 0.5706, "step": 5430 }, { "epoch": 0.12108262108262108, "grad_norm": 0.38406258821487427, "learning_rate": 0.00039840343400010476, "loss": 0.5985, "step": 5440 }, { "epoch": 0.12130519943019943, "grad_norm": 0.6619420647621155, "learning_rate": 0.0003983975494117604, "loss": 0.6852, "step": 5450 }, { "epoch": 0.12152777777777778, "grad_norm": 0.5927149653434753, "learning_rate": 0.00039839165404233077, "loss": 0.5684, "step": 5460 }, { "epoch": 0.12175035612535613, "grad_norm": 0.7376818656921387, "learning_rate": 0.00039838574789213626, "loss": 0.5955, "step": 5470 }, { "epoch": 0.12197293447293447, "grad_norm": 0.9106239080429077, "learning_rate": 0.00039837983096149783, "loss": 0.7286, "step": 5480 }, { "epoch": 0.12219551282051282, "grad_norm": 0.6335828304290771, "learning_rate": 0.00039837390325073694, "loss": 0.5584, "step": 5490 }, { "epoch": 0.12241809116809117, "grad_norm": 0.5210475921630859, "learning_rate": 0.0003983679647601758, "loss": 0.6321, "step": 5500 }, { "epoch": 0.12264066951566951, "grad_norm": 0.6025746464729309, "learning_rate": 0.00039836201549013704, "loss": 0.5565, "step": 5510 }, { "epoch": 0.12286324786324786, "grad_norm": 0.570562481880188, "learning_rate": 0.00039835605544094393, "loss": 0.7184, "step": 5520 }, { "epoch": 0.12308582621082621, "grad_norm": 0.4416629374027252, "learning_rate": 0.00039835008461292046, "loss": 0.6752, "step": 5530 }, { "epoch": 0.12330840455840456, "grad_norm": 0.7752945423126221, "learning_rate": 0.000398344103006391, "loss": 0.6763, "step": 5540 }, { "epoch": 0.12353098290598291, "grad_norm": 0.5153147578239441, "learning_rate": 0.0003983381106216805, "loss": 0.7373, "step": 5550 }, { "epoch": 0.12375356125356125, "grad_norm": 0.44418540596961975, "learning_rate": 0.00039833210745911484, "loss": 0.6644, "step": 5560 }, { "epoch": 0.1239761396011396, "grad_norm": 0.5131638646125793, "learning_rate": 0.00039832609351902006, "loss": 0.5635, "step": 5570 }, { "epoch": 0.12419871794871795, "grad_norm": 0.49686259031295776, "learning_rate": 0.000398320068801723, "loss": 0.6164, "step": 5580 }, { "epoch": 0.1244212962962963, "grad_norm": 0.9833499789237976, "learning_rate": 0.00039831403330755103, "loss": 0.5118, "step": 5590 }, { "epoch": 0.12464387464387465, "grad_norm": 0.7739784717559814, "learning_rate": 0.0003983079870368322, "loss": 0.7191, "step": 5600 }, { "epoch": 0.12486645299145299, "grad_norm": 0.5591177344322205, "learning_rate": 0.00039830192998989493, "loss": 0.5886, "step": 5610 }, { "epoch": 0.12508903133903135, "grad_norm": 0.7678048610687256, "learning_rate": 0.0003982958621670685, "loss": 0.7345, "step": 5620 }, { "epoch": 0.12531160968660968, "grad_norm": 0.6148435473442078, "learning_rate": 0.0003982897835686825, "loss": 0.803, "step": 5630 }, { "epoch": 0.12553418803418803, "grad_norm": 0.5740734338760376, "learning_rate": 0.00039828369419506746, "loss": 0.6542, "step": 5640 }, { "epoch": 0.1257567663817664, "grad_norm": 0.5886821746826172, "learning_rate": 0.000398277594046554, "loss": 0.6559, "step": 5650 }, { "epoch": 0.12597934472934472, "grad_norm": 0.5730891823768616, "learning_rate": 0.00039827148312347396, "loss": 0.8286, "step": 5660 }, { "epoch": 0.12620192307692307, "grad_norm": 0.5327824950218201, "learning_rate": 0.0003982653614261591, "loss": 0.5672, "step": 5670 }, { "epoch": 0.12642450142450143, "grad_norm": 0.6959152221679688, "learning_rate": 0.0003982592289549422, "loss": 0.4987, "step": 5680 }, { "epoch": 0.12664707977207978, "grad_norm": 0.6996831893920898, "learning_rate": 0.00039825308571015647, "loss": 0.6099, "step": 5690 }, { "epoch": 0.1268696581196581, "grad_norm": 0.6751108169555664, "learning_rate": 0.0003982469316921358, "loss": 0.749, "step": 5700 }, { "epoch": 0.12709223646723647, "grad_norm": 0.742348849773407, "learning_rate": 0.0003982407669012146, "loss": 0.6497, "step": 5710 }, { "epoch": 0.12731481481481483, "grad_norm": 0.5918314456939697, "learning_rate": 0.0003982345913377278, "loss": 0.5652, "step": 5720 }, { "epoch": 0.12753739316239315, "grad_norm": 0.5717595815658569, "learning_rate": 0.000398228405002011, "loss": 0.6369, "step": 5730 }, { "epoch": 0.1277599715099715, "grad_norm": 1.0393725633621216, "learning_rate": 0.0003982222078944005, "loss": 0.6695, "step": 5740 }, { "epoch": 0.12798254985754987, "grad_norm": 0.9899468421936035, "learning_rate": 0.00039821600001523283, "loss": 0.7561, "step": 5750 }, { "epoch": 0.1282051282051282, "grad_norm": 0.43348416686058044, "learning_rate": 0.0003982097813648455, "loss": 0.6485, "step": 5760 }, { "epoch": 0.12842770655270655, "grad_norm": 0.5063757300376892, "learning_rate": 0.00039820355194357637, "loss": 0.5846, "step": 5770 }, { "epoch": 0.1286502849002849, "grad_norm": 0.6151053309440613, "learning_rate": 0.00039819731175176403, "loss": 0.6263, "step": 5780 }, { "epoch": 0.12887286324786323, "grad_norm": 0.5029515027999878, "learning_rate": 0.00039819106078974747, "loss": 0.5724, "step": 5790 }, { "epoch": 0.1290954415954416, "grad_norm": 0.6326857805252075, "learning_rate": 0.00039818479905786636, "loss": 0.7097, "step": 5800 }, { "epoch": 0.12931801994301995, "grad_norm": 0.5461398959159851, "learning_rate": 0.00039817852655646115, "loss": 0.6849, "step": 5810 }, { "epoch": 0.1295405982905983, "grad_norm": 0.579310417175293, "learning_rate": 0.0003981722432858725, "loss": 0.6139, "step": 5820 }, { "epoch": 0.12976317663817663, "grad_norm": 0.8785983324050903, "learning_rate": 0.00039816594924644194, "loss": 0.6478, "step": 5830 }, { "epoch": 0.129985754985755, "grad_norm": 0.8515345454216003, "learning_rate": 0.00039815964443851143, "loss": 0.5224, "step": 5840 }, { "epoch": 0.13020833333333334, "grad_norm": 0.6978330612182617, "learning_rate": 0.00039815332886242367, "loss": 0.6621, "step": 5850 }, { "epoch": 0.13043091168091167, "grad_norm": 0.9195356965065002, "learning_rate": 0.0003981470025185218, "loss": 0.6799, "step": 5860 }, { "epoch": 0.13065349002849003, "grad_norm": 0.7251855134963989, "learning_rate": 0.0003981406654071496, "loss": 0.6915, "step": 5870 }, { "epoch": 0.13087606837606838, "grad_norm": 0.73192298412323, "learning_rate": 0.00039813431752865145, "loss": 0.625, "step": 5880 }, { "epoch": 0.1310986467236467, "grad_norm": 0.8073905110359192, "learning_rate": 0.00039812795888337225, "loss": 0.5896, "step": 5890 }, { "epoch": 0.13132122507122507, "grad_norm": 0.46170732378959656, "learning_rate": 0.00039812158947165755, "loss": 0.6374, "step": 5900 }, { "epoch": 0.13154380341880342, "grad_norm": 0.790496826171875, "learning_rate": 0.0003981152092938535, "loss": 0.6458, "step": 5910 }, { "epoch": 0.13176638176638178, "grad_norm": 0.9606544971466064, "learning_rate": 0.0003981088183503069, "loss": 0.824, "step": 5920 }, { "epoch": 0.1319889601139601, "grad_norm": 0.5605319738388062, "learning_rate": 0.0003981024166413648, "loss": 0.7279, "step": 5930 }, { "epoch": 0.13221153846153846, "grad_norm": 0.7565925717353821, "learning_rate": 0.00039809600416737523, "loss": 0.5449, "step": 5940 }, { "epoch": 0.13243411680911682, "grad_norm": 0.5933641791343689, "learning_rate": 0.00039808958092868663, "loss": 0.6217, "step": 5950 }, { "epoch": 0.13265669515669515, "grad_norm": 0.6032646894454956, "learning_rate": 0.00039808314692564806, "loss": 0.4533, "step": 5960 }, { "epoch": 0.1328792735042735, "grad_norm": 0.6120531558990479, "learning_rate": 0.00039807670215860917, "loss": 0.6702, "step": 5970 }, { "epoch": 0.13310185185185186, "grad_norm": 0.79580157995224, "learning_rate": 0.00039807024662792, "loss": 0.7245, "step": 5980 }, { "epoch": 0.1333244301994302, "grad_norm": 0.6910545229911804, "learning_rate": 0.00039806378033393157, "loss": 0.5987, "step": 5990 }, { "epoch": 0.13354700854700854, "grad_norm": 0.38598570227622986, "learning_rate": 0.0003980573032769952, "loss": 0.7247, "step": 6000 }, { "epoch": 0.1337695868945869, "grad_norm": 0.6384770274162292, "learning_rate": 0.0003980508154574628, "loss": 0.6279, "step": 6010 }, { "epoch": 0.13399216524216523, "grad_norm": 0.666942298412323, "learning_rate": 0.00039804431687568694, "loss": 0.6807, "step": 6020 }, { "epoch": 0.13421474358974358, "grad_norm": 0.4904678463935852, "learning_rate": 0.0003980378075320208, "loss": 0.6168, "step": 6030 }, { "epoch": 0.13443732193732194, "grad_norm": 0.6417827606201172, "learning_rate": 0.00039803128742681805, "loss": 0.5758, "step": 6040 }, { "epoch": 0.1346599002849003, "grad_norm": 0.7031834125518799, "learning_rate": 0.00039802475656043303, "loss": 0.7149, "step": 6050 }, { "epoch": 0.13488247863247863, "grad_norm": 0.6842778325080872, "learning_rate": 0.00039801821493322067, "loss": 0.6906, "step": 6060 }, { "epoch": 0.13510505698005698, "grad_norm": 0.8279538750648499, "learning_rate": 0.0003980116625455364, "loss": 0.6726, "step": 6070 }, { "epoch": 0.13532763532763534, "grad_norm": 0.5008348226547241, "learning_rate": 0.00039800509939773624, "loss": 0.5404, "step": 6080 }, { "epoch": 0.13555021367521367, "grad_norm": 0.831046462059021, "learning_rate": 0.00039799852549017686, "loss": 0.728, "step": 6090 }, { "epoch": 0.13577279202279202, "grad_norm": 0.5071601867675781, "learning_rate": 0.00039799194082321555, "loss": 0.5437, "step": 6100 }, { "epoch": 0.13599537037037038, "grad_norm": 0.522179365158081, "learning_rate": 0.00039798534539721013, "loss": 0.678, "step": 6110 }, { "epoch": 0.1362179487179487, "grad_norm": 0.5688888430595398, "learning_rate": 0.00039797873921251895, "loss": 0.4982, "step": 6120 }, { "epoch": 0.13644052706552706, "grad_norm": 0.4896734654903412, "learning_rate": 0.00039797212226950097, "loss": 0.6654, "step": 6130 }, { "epoch": 0.13666310541310542, "grad_norm": 0.6670453548431396, "learning_rate": 0.0003979654945685158, "loss": 0.7016, "step": 6140 }, { "epoch": 0.13688568376068377, "grad_norm": 0.4609311819076538, "learning_rate": 0.00039795885610992364, "loss": 0.6884, "step": 6150 }, { "epoch": 0.1371082621082621, "grad_norm": 0.6140702366828918, "learning_rate": 0.00039795220689408517, "loss": 0.7774, "step": 6160 }, { "epoch": 0.13733084045584046, "grad_norm": 0.613925576210022, "learning_rate": 0.00039794554692136174, "loss": 0.4291, "step": 6170 }, { "epoch": 0.13755341880341881, "grad_norm": 0.5260471105575562, "learning_rate": 0.00039793887619211525, "loss": 0.569, "step": 6180 }, { "epoch": 0.13777599715099714, "grad_norm": 0.6670699119567871, "learning_rate": 0.0003979321947067081, "loss": 0.5147, "step": 6190 }, { "epoch": 0.1379985754985755, "grad_norm": 0.6594804525375366, "learning_rate": 0.00039792550246550354, "loss": 0.5451, "step": 6200 }, { "epoch": 0.13822115384615385, "grad_norm": 0.8946701884269714, "learning_rate": 0.0003979187994688651, "loss": 0.7319, "step": 6210 }, { "epoch": 0.13844373219373218, "grad_norm": 0.5910085439682007, "learning_rate": 0.00039791208571715705, "loss": 0.6866, "step": 6220 }, { "epoch": 0.13866631054131054, "grad_norm": 0.6339130401611328, "learning_rate": 0.00039790536121074436, "loss": 0.622, "step": 6230 }, { "epoch": 0.1388888888888889, "grad_norm": 0.8916711807250977, "learning_rate": 0.0003978986259499922, "loss": 0.6254, "step": 6240 }, { "epoch": 0.13911146723646722, "grad_norm": 0.4093226492404938, "learning_rate": 0.0003978918799352668, "loss": 0.572, "step": 6250 }, { "epoch": 0.13933404558404558, "grad_norm": 0.6929894089698792, "learning_rate": 0.0003978851231669346, "loss": 0.5564, "step": 6260 }, { "epoch": 0.13955662393162394, "grad_norm": 0.6963203549385071, "learning_rate": 0.00039787835564536277, "loss": 0.6477, "step": 6270 }, { "epoch": 0.1397792022792023, "grad_norm": 0.5985345840454102, "learning_rate": 0.00039787157737091914, "loss": 0.6064, "step": 6280 }, { "epoch": 0.14000178062678062, "grad_norm": 0.5991149544715881, "learning_rate": 0.000397864788343972, "loss": 0.6235, "step": 6290 }, { "epoch": 0.14022435897435898, "grad_norm": 0.6351196765899658, "learning_rate": 0.00039785798856489026, "loss": 0.6082, "step": 6300 }, { "epoch": 0.14044693732193733, "grad_norm": 0.4941553473472595, "learning_rate": 0.0003978511780340435, "loss": 0.7004, "step": 6310 }, { "epoch": 0.14066951566951566, "grad_norm": 0.5571572780609131, "learning_rate": 0.0003978443567518017, "loss": 0.708, "step": 6320 }, { "epoch": 0.14089209401709402, "grad_norm": 0.6778060793876648, "learning_rate": 0.00039783752471853566, "loss": 0.6762, "step": 6330 }, { "epoch": 0.14111467236467237, "grad_norm": 0.7342366576194763, "learning_rate": 0.00039783068193461653, "loss": 0.6335, "step": 6340 }, { "epoch": 0.1413372507122507, "grad_norm": 0.7785418629646301, "learning_rate": 0.0003978238284004162, "loss": 0.6961, "step": 6350 }, { "epoch": 0.14155982905982906, "grad_norm": 0.5320806503295898, "learning_rate": 0.00039781696411630714, "loss": 0.5599, "step": 6360 }, { "epoch": 0.1417824074074074, "grad_norm": 0.6191650032997131, "learning_rate": 0.0003978100890826622, "loss": 0.694, "step": 6370 }, { "epoch": 0.14200498575498577, "grad_norm": 0.6701613068580627, "learning_rate": 0.00039780320329985515, "loss": 0.6344, "step": 6380 }, { "epoch": 0.1422275641025641, "grad_norm": 0.529349684715271, "learning_rate": 0.0003977963067682601, "loss": 0.586, "step": 6390 }, { "epoch": 0.14245014245014245, "grad_norm": 0.7802019119262695, "learning_rate": 0.00039778939948825184, "loss": 0.7415, "step": 6400 }, { "epoch": 0.1426727207977208, "grad_norm": 0.6360093355178833, "learning_rate": 0.00039778248146020564, "loss": 0.5822, "step": 6410 }, { "epoch": 0.14289529914529914, "grad_norm": 0.829375684261322, "learning_rate": 0.0003977755526844975, "loss": 0.6785, "step": 6420 }, { "epoch": 0.1431178774928775, "grad_norm": 0.664879560470581, "learning_rate": 0.00039776861316150394, "loss": 0.6984, "step": 6430 }, { "epoch": 0.14334045584045585, "grad_norm": 0.47659850120544434, "learning_rate": 0.000397761662891602, "loss": 0.5464, "step": 6440 }, { "epoch": 0.14356303418803418, "grad_norm": 0.7773748636245728, "learning_rate": 0.0003977547018751695, "loss": 0.6879, "step": 6450 }, { "epoch": 0.14378561253561253, "grad_norm": 0.6674002408981323, "learning_rate": 0.0003977477301125845, "loss": 0.6609, "step": 6460 }, { "epoch": 0.1440081908831909, "grad_norm": 0.7954365611076355, "learning_rate": 0.000397740747604226, "loss": 0.6521, "step": 6470 }, { "epoch": 0.14423076923076922, "grad_norm": 0.929842472076416, "learning_rate": 0.0003977337543504734, "loss": 0.6339, "step": 6480 }, { "epoch": 0.14445334757834757, "grad_norm": 0.5389747023582458, "learning_rate": 0.0003977267503517067, "loss": 0.6441, "step": 6490 }, { "epoch": 0.14467592592592593, "grad_norm": 0.5743294358253479, "learning_rate": 0.00039771973560830657, "loss": 0.656, "step": 6500 }, { "epoch": 0.14489850427350429, "grad_norm": 0.7883462905883789, "learning_rate": 0.00039771271012065416, "loss": 0.6832, "step": 6510 }, { "epoch": 0.14512108262108261, "grad_norm": 0.8884845972061157, "learning_rate": 0.0003977056738891311, "loss": 0.5839, "step": 6520 }, { "epoch": 0.14534366096866097, "grad_norm": 0.6745997667312622, "learning_rate": 0.00039769862691412, "loss": 0.7437, "step": 6530 }, { "epoch": 0.14556623931623933, "grad_norm": 0.4044835865497589, "learning_rate": 0.00039769156919600363, "loss": 0.6029, "step": 6540 }, { "epoch": 0.14578881766381765, "grad_norm": 0.6320715546607971, "learning_rate": 0.00039768450073516555, "loss": 0.645, "step": 6550 }, { "epoch": 0.146011396011396, "grad_norm": 0.52195805311203, "learning_rate": 0.00039767742153198985, "loss": 0.6134, "step": 6560 }, { "epoch": 0.14623397435897437, "grad_norm": 0.6805070638656616, "learning_rate": 0.00039767033158686125, "loss": 0.5271, "step": 6570 }, { "epoch": 0.1464565527065527, "grad_norm": 0.9781261086463928, "learning_rate": 0.00039766323090016496, "loss": 0.617, "step": 6580 }, { "epoch": 0.14667913105413105, "grad_norm": 0.7117429971694946, "learning_rate": 0.00039765611947228696, "loss": 0.6566, "step": 6590 }, { "epoch": 0.1469017094017094, "grad_norm": 0.5962918400764465, "learning_rate": 0.0003976489973036136, "loss": 0.7107, "step": 6600 }, { "epoch": 0.14712428774928774, "grad_norm": 0.6786366701126099, "learning_rate": 0.00039764186439453193, "loss": 0.5947, "step": 6610 }, { "epoch": 0.1473468660968661, "grad_norm": 0.5934000611305237, "learning_rate": 0.00039763472074542955, "loss": 0.6663, "step": 6620 }, { "epoch": 0.14756944444444445, "grad_norm": 0.5991849303245544, "learning_rate": 0.0003976275663566947, "loss": 0.5502, "step": 6630 }, { "epoch": 0.1477920227920228, "grad_norm": 0.6965344548225403, "learning_rate": 0.000397620401228716, "loss": 0.7819, "step": 6640 }, { "epoch": 0.14801460113960113, "grad_norm": 0.5270944237709045, "learning_rate": 0.00039761322536188297, "loss": 0.5685, "step": 6650 }, { "epoch": 0.1482371794871795, "grad_norm": 0.7651955485343933, "learning_rate": 0.0003976060387565855, "loss": 0.6355, "step": 6660 }, { "epoch": 0.14845975783475784, "grad_norm": 0.969373881816864, "learning_rate": 0.00039759884141321415, "loss": 0.7148, "step": 6670 }, { "epoch": 0.14868233618233617, "grad_norm": 0.5654862523078918, "learning_rate": 0.00039759163333215997, "loss": 0.6992, "step": 6680 }, { "epoch": 0.14890491452991453, "grad_norm": 0.4438091218471527, "learning_rate": 0.00039758441451381464, "loss": 0.5312, "step": 6690 }, { "epoch": 0.14912749287749288, "grad_norm": 0.4933086633682251, "learning_rate": 0.0003975771849585705, "loss": 0.6724, "step": 6700 }, { "epoch": 0.1493500712250712, "grad_norm": 0.5377151966094971, "learning_rate": 0.0003975699446668204, "loss": 0.51, "step": 6710 }, { "epoch": 0.14957264957264957, "grad_norm": 0.6050658822059631, "learning_rate": 0.00039756269363895775, "loss": 0.6447, "step": 6720 }, { "epoch": 0.14979522792022792, "grad_norm": 0.46284544467926025, "learning_rate": 0.00039755543187537667, "loss": 0.5348, "step": 6730 }, { "epoch": 0.15001780626780628, "grad_norm": 0.5097401142120361, "learning_rate": 0.0003975481593764716, "loss": 0.5738, "step": 6740 }, { "epoch": 0.1502403846153846, "grad_norm": 0.3497709631919861, "learning_rate": 0.00039754087614263787, "loss": 0.6685, "step": 6750 }, { "epoch": 0.15046296296296297, "grad_norm": 0.7363093495368958, "learning_rate": 0.00039753358217427124, "loss": 0.5343, "step": 6760 }, { "epoch": 0.15068554131054132, "grad_norm": 0.7607445120811462, "learning_rate": 0.000397526277471768, "loss": 0.6038, "step": 6770 }, { "epoch": 0.15090811965811965, "grad_norm": 0.6368987560272217, "learning_rate": 0.0003975189620355251, "loss": 0.5928, "step": 6780 }, { "epoch": 0.151130698005698, "grad_norm": 0.7133484482765198, "learning_rate": 0.00039751163586594017, "loss": 0.6399, "step": 6790 }, { "epoch": 0.15135327635327636, "grad_norm": 0.6672192215919495, "learning_rate": 0.0003975042989634113, "loss": 0.5147, "step": 6800 }, { "epoch": 0.1515758547008547, "grad_norm": 0.597719669342041, "learning_rate": 0.000397496951328337, "loss": 0.6102, "step": 6810 }, { "epoch": 0.15179843304843305, "grad_norm": 0.4774816334247589, "learning_rate": 0.00039748959296111684, "loss": 0.5469, "step": 6820 }, { "epoch": 0.1520210113960114, "grad_norm": 0.789928674697876, "learning_rate": 0.00039748222386215044, "loss": 0.574, "step": 6830 }, { "epoch": 0.15224358974358973, "grad_norm": 0.6176126003265381, "learning_rate": 0.0003974748440318384, "loss": 0.6669, "step": 6840 }, { "epoch": 0.15246616809116809, "grad_norm": 0.5427154302597046, "learning_rate": 0.0003974674534705816, "loss": 0.5147, "step": 6850 }, { "epoch": 0.15268874643874644, "grad_norm": 0.6813560724258423, "learning_rate": 0.00039746005217878173, "loss": 0.5202, "step": 6860 }, { "epoch": 0.1529113247863248, "grad_norm": 0.46288907527923584, "learning_rate": 0.00039745264015684096, "loss": 0.6455, "step": 6870 }, { "epoch": 0.15313390313390313, "grad_norm": 0.5715744495391846, "learning_rate": 0.00039744521740516214, "loss": 0.7415, "step": 6880 }, { "epoch": 0.15335648148148148, "grad_norm": 0.7895148396492004, "learning_rate": 0.0003974377839241486, "loss": 0.6318, "step": 6890 }, { "epoch": 0.15357905982905984, "grad_norm": 0.6893559098243713, "learning_rate": 0.00039743033971420414, "loss": 0.7765, "step": 6900 }, { "epoch": 0.15380163817663817, "grad_norm": 0.5792664289474487, "learning_rate": 0.0003974228847757335, "loss": 0.7086, "step": 6910 }, { "epoch": 0.15402421652421652, "grad_norm": 0.7981087565422058, "learning_rate": 0.0003974154191091416, "loss": 0.59, "step": 6920 }, { "epoch": 0.15424679487179488, "grad_norm": 0.5684230923652649, "learning_rate": 0.0003974079427148342, "loss": 0.5112, "step": 6930 }, { "epoch": 0.1544693732193732, "grad_norm": 0.6198676824569702, "learning_rate": 0.0003974004555932177, "loss": 0.599, "step": 6940 }, { "epoch": 0.15469195156695156, "grad_norm": 0.8237385749816895, "learning_rate": 0.00039739295774469875, "loss": 0.6301, "step": 6950 }, { "epoch": 0.15491452991452992, "grad_norm": 0.8568017482757568, "learning_rate": 0.00039738544916968494, "loss": 0.8313, "step": 6960 }, { "epoch": 0.15513710826210828, "grad_norm": 0.5193782448768616, "learning_rate": 0.0003973779298685842, "loss": 0.6072, "step": 6970 }, { "epoch": 0.1553596866096866, "grad_norm": 0.7648767828941345, "learning_rate": 0.0003973703998418052, "loss": 0.6464, "step": 6980 }, { "epoch": 0.15558226495726496, "grad_norm": 0.49430105090141296, "learning_rate": 0.0003973628590897571, "loss": 0.6868, "step": 6990 }, { "epoch": 0.15580484330484332, "grad_norm": 0.38553017377853394, "learning_rate": 0.0003973553076128496, "loss": 0.6342, "step": 7000 }, { "epoch": 0.15602742165242164, "grad_norm": 0.6819826364517212, "learning_rate": 0.00039734774541149315, "loss": 0.6189, "step": 7010 }, { "epoch": 0.15625, "grad_norm": 1.0453999042510986, "learning_rate": 0.0003973401724860987, "loss": 0.6177, "step": 7020 }, { "epoch": 0.15647257834757836, "grad_norm": 0.6561777591705322, "learning_rate": 0.0003973325888370777, "loss": 0.7275, "step": 7030 }, { "epoch": 0.15669515669515668, "grad_norm": 0.574906051158905, "learning_rate": 0.0003973249944648423, "loss": 0.5917, "step": 7040 }, { "epoch": 0.15691773504273504, "grad_norm": 0.961545467376709, "learning_rate": 0.0003973173893698051, "loss": 0.6971, "step": 7050 }, { "epoch": 0.1571403133903134, "grad_norm": 0.48292088508605957, "learning_rate": 0.00039730977355237953, "loss": 0.7646, "step": 7060 }, { "epoch": 0.15736289173789172, "grad_norm": 0.8326444029808044, "learning_rate": 0.00039730214701297925, "loss": 0.6865, "step": 7070 }, { "epoch": 0.15758547008547008, "grad_norm": 0.6857150197029114, "learning_rate": 0.0003972945097520188, "loss": 0.7559, "step": 7080 }, { "epoch": 0.15780804843304844, "grad_norm": 0.6443191766738892, "learning_rate": 0.0003972868617699132, "loss": 0.5099, "step": 7090 }, { "epoch": 0.1580306267806268, "grad_norm": 0.7539528012275696, "learning_rate": 0.000397279203067078, "loss": 0.7146, "step": 7100 }, { "epoch": 0.15825320512820512, "grad_norm": 0.548685610294342, "learning_rate": 0.00039727153364392943, "loss": 0.6138, "step": 7110 }, { "epoch": 0.15847578347578348, "grad_norm": 0.7192095518112183, "learning_rate": 0.0003972638535008842, "loss": 0.727, "step": 7120 }, { "epoch": 0.15869836182336183, "grad_norm": 0.7837377190589905, "learning_rate": 0.0003972561626383597, "loss": 0.6666, "step": 7130 }, { "epoch": 0.15892094017094016, "grad_norm": 0.6404113173484802, "learning_rate": 0.00039724846105677387, "loss": 0.823, "step": 7140 }, { "epoch": 0.15914351851851852, "grad_norm": 0.9070250988006592, "learning_rate": 0.0003972407487565452, "loss": 0.7189, "step": 7150 }, { "epoch": 0.15936609686609687, "grad_norm": 0.5076274871826172, "learning_rate": 0.0003972330257380927, "loss": 0.64, "step": 7160 }, { "epoch": 0.1595886752136752, "grad_norm": 0.628984272480011, "learning_rate": 0.00039722529200183614, "loss": 0.6955, "step": 7170 }, { "epoch": 0.15981125356125356, "grad_norm": 0.6998558640480042, "learning_rate": 0.0003972175475481958, "loss": 0.6029, "step": 7180 }, { "epoch": 0.16003383190883191, "grad_norm": 0.6283873319625854, "learning_rate": 0.0003972097923775924, "loss": 0.6558, "step": 7190 }, { "epoch": 0.16025641025641027, "grad_norm": 0.4948321282863617, "learning_rate": 0.0003972020264904475, "loss": 0.6605, "step": 7200 }, { "epoch": 0.1604789886039886, "grad_norm": 0.6738045811653137, "learning_rate": 0.00039719424988718307, "loss": 0.6408, "step": 7210 }, { "epoch": 0.16070156695156695, "grad_norm": 0.6499155163764954, "learning_rate": 0.00039718646256822163, "loss": 0.5679, "step": 7220 }, { "epoch": 0.1609241452991453, "grad_norm": 0.8065938949584961, "learning_rate": 0.0003971786645339864, "loss": 0.6205, "step": 7230 }, { "epoch": 0.16114672364672364, "grad_norm": 0.5457149744033813, "learning_rate": 0.00039717085578490114, "loss": 0.5933, "step": 7240 }, { "epoch": 0.161369301994302, "grad_norm": 0.6167128682136536, "learning_rate": 0.0003971630363213901, "loss": 0.6124, "step": 7250 }, { "epoch": 0.16159188034188035, "grad_norm": 1.0074851512908936, "learning_rate": 0.00039715520614387834, "loss": 0.5991, "step": 7260 }, { "epoch": 0.16181445868945868, "grad_norm": 0.575910747051239, "learning_rate": 0.0003971473652527912, "loss": 0.7633, "step": 7270 }, { "epoch": 0.16203703703703703, "grad_norm": 0.6274125576019287, "learning_rate": 0.00039713951364855486, "loss": 0.5563, "step": 7280 }, { "epoch": 0.1622596153846154, "grad_norm": 0.7009009122848511, "learning_rate": 0.000397131651331596, "loss": 0.6355, "step": 7290 }, { "epoch": 0.16248219373219372, "grad_norm": 0.532028317451477, "learning_rate": 0.00039712377830234183, "loss": 0.6231, "step": 7300 }, { "epoch": 0.16270477207977208, "grad_norm": 0.7437495589256287, "learning_rate": 0.0003971158945612201, "loss": 0.8354, "step": 7310 }, { "epoch": 0.16292735042735043, "grad_norm": 0.7639107704162598, "learning_rate": 0.00039710800010865936, "loss": 0.6896, "step": 7320 }, { "epoch": 0.1631499287749288, "grad_norm": 0.7520061731338501, "learning_rate": 0.0003971000949450885, "loss": 0.6361, "step": 7330 }, { "epoch": 0.16337250712250712, "grad_norm": 0.9273715615272522, "learning_rate": 0.00039709217907093715, "loss": 0.6059, "step": 7340 }, { "epoch": 0.16359508547008547, "grad_norm": 0.8598505854606628, "learning_rate": 0.00039708425248663546, "loss": 0.6117, "step": 7350 }, { "epoch": 0.16381766381766383, "grad_norm": 0.616790235042572, "learning_rate": 0.00039707631519261415, "loss": 0.6402, "step": 7360 }, { "epoch": 0.16404024216524216, "grad_norm": 0.5139318704605103, "learning_rate": 0.0003970683671893045, "loss": 0.5693, "step": 7370 }, { "epoch": 0.1642628205128205, "grad_norm": 0.563727617263794, "learning_rate": 0.0003970604084771385, "loss": 0.5927, "step": 7380 }, { "epoch": 0.16448539886039887, "grad_norm": 0.7112362384796143, "learning_rate": 0.0003970524390565485, "loss": 0.5696, "step": 7390 }, { "epoch": 0.1647079772079772, "grad_norm": 0.4131312668323517, "learning_rate": 0.0003970444589279677, "loss": 0.7855, "step": 7400 }, { "epoch": 0.16493055555555555, "grad_norm": 0.8097583055496216, "learning_rate": 0.0003970364680918297, "loss": 0.6333, "step": 7410 }, { "epoch": 0.1651531339031339, "grad_norm": 0.632159411907196, "learning_rate": 0.0003970284665485688, "loss": 0.5816, "step": 7420 }, { "epoch": 0.16537571225071226, "grad_norm": 0.5271141529083252, "learning_rate": 0.00039702045429861955, "loss": 0.6219, "step": 7430 }, { "epoch": 0.1655982905982906, "grad_norm": 0.36876511573791504, "learning_rate": 0.00039701243134241765, "loss": 0.6601, "step": 7440 }, { "epoch": 0.16582086894586895, "grad_norm": 0.755054771900177, "learning_rate": 0.0003970043976803989, "loss": 0.6177, "step": 7450 }, { "epoch": 0.1660434472934473, "grad_norm": 0.7070847153663635, "learning_rate": 0.00039699635331299994, "loss": 0.6584, "step": 7460 }, { "epoch": 0.16626602564102563, "grad_norm": 0.523898720741272, "learning_rate": 0.00039698829824065784, "loss": 0.6504, "step": 7470 }, { "epoch": 0.166488603988604, "grad_norm": 0.6656180620193481, "learning_rate": 0.00039698023246381036, "loss": 0.7437, "step": 7480 }, { "epoch": 0.16671118233618235, "grad_norm": 0.8116356730461121, "learning_rate": 0.0003969721559828958, "loss": 0.6025, "step": 7490 }, { "epoch": 0.16693376068376067, "grad_norm": 0.5214651823043823, "learning_rate": 0.00039696406879835306, "loss": 0.5921, "step": 7500 }, { "epoch": 0.16715633903133903, "grad_norm": 0.5218111276626587, "learning_rate": 0.00039695597091062154, "loss": 0.5214, "step": 7510 }, { "epoch": 0.16737891737891739, "grad_norm": 0.838103711605072, "learning_rate": 0.0003969478623201413, "loss": 0.6568, "step": 7520 }, { "epoch": 0.16760149572649571, "grad_norm": 0.6541166305541992, "learning_rate": 0.00039693974302735304, "loss": 0.7354, "step": 7530 }, { "epoch": 0.16782407407407407, "grad_norm": 0.6170065999031067, "learning_rate": 0.0003969316130326979, "loss": 0.6438, "step": 7540 }, { "epoch": 0.16804665242165243, "grad_norm": 0.7294967770576477, "learning_rate": 0.0003969234723366177, "loss": 0.687, "step": 7550 }, { "epoch": 0.16826923076923078, "grad_norm": 0.8346336483955383, "learning_rate": 0.00039691532093955484, "loss": 0.6582, "step": 7560 }, { "epoch": 0.1684918091168091, "grad_norm": 0.5497300028800964, "learning_rate": 0.00039690715884195223, "loss": 0.7647, "step": 7570 }, { "epoch": 0.16871438746438747, "grad_norm": 0.7623947858810425, "learning_rate": 0.0003968989860442534, "loss": 0.6478, "step": 7580 }, { "epoch": 0.16893696581196582, "grad_norm": 0.8158397078514099, "learning_rate": 0.0003968908025469024, "loss": 0.6687, "step": 7590 }, { "epoch": 0.16915954415954415, "grad_norm": 0.369629830121994, "learning_rate": 0.0003968826083503441, "loss": 0.6755, "step": 7600 }, { "epoch": 0.1693821225071225, "grad_norm": 0.5609103441238403, "learning_rate": 0.00039687440345502364, "loss": 0.5001, "step": 7610 }, { "epoch": 0.16960470085470086, "grad_norm": 0.5148101449012756, "learning_rate": 0.000396866187861387, "loss": 0.5391, "step": 7620 }, { "epoch": 0.1698272792022792, "grad_norm": 0.47444501519203186, "learning_rate": 0.0003968579615698805, "loss": 0.5694, "step": 7630 }, { "epoch": 0.17004985754985755, "grad_norm": 0.6512032151222229, "learning_rate": 0.0003968497245809512, "loss": 0.5598, "step": 7640 }, { "epoch": 0.1702724358974359, "grad_norm": 0.7191357016563416, "learning_rate": 0.0003968414768950467, "loss": 0.5568, "step": 7650 }, { "epoch": 0.17049501424501423, "grad_norm": 0.8455227017402649, "learning_rate": 0.00039683321851261526, "loss": 0.7559, "step": 7660 }, { "epoch": 0.1707175925925926, "grad_norm": 0.5608453154563904, "learning_rate": 0.00039682494943410555, "loss": 0.7258, "step": 7670 }, { "epoch": 0.17094017094017094, "grad_norm": 0.5924692153930664, "learning_rate": 0.000396816669659967, "loss": 0.5982, "step": 7680 }, { "epoch": 0.1711627492877493, "grad_norm": 0.5104554891586304, "learning_rate": 0.00039680837919064943, "loss": 0.646, "step": 7690 }, { "epoch": 0.17138532763532763, "grad_norm": 0.6683109998703003, "learning_rate": 0.0003968000780266035, "loss": 0.6309, "step": 7700 }, { "epoch": 0.17160790598290598, "grad_norm": 0.9296523928642273, "learning_rate": 0.0003967917661682802, "loss": 0.5818, "step": 7710 }, { "epoch": 0.17183048433048434, "grad_norm": 0.6392726302146912, "learning_rate": 0.00039678344361613113, "loss": 0.6847, "step": 7720 }, { "epoch": 0.17205306267806267, "grad_norm": 0.35833972692489624, "learning_rate": 0.0003967751103706088, "loss": 0.5908, "step": 7730 }, { "epoch": 0.17227564102564102, "grad_norm": 0.8805040121078491, "learning_rate": 0.0003967667664321658, "loss": 0.7011, "step": 7740 }, { "epoch": 0.17249821937321938, "grad_norm": 0.6751018166542053, "learning_rate": 0.00039675841180125557, "loss": 0.4629, "step": 7750 }, { "epoch": 0.1727207977207977, "grad_norm": 0.5945695638656616, "learning_rate": 0.00039675004647833227, "loss": 0.6547, "step": 7760 }, { "epoch": 0.17294337606837606, "grad_norm": 1.1874905824661255, "learning_rate": 0.00039674167046385033, "loss": 0.6882, "step": 7770 }, { "epoch": 0.17316595441595442, "grad_norm": 0.6665182709693909, "learning_rate": 0.000396733283758265, "loss": 0.4854, "step": 7780 }, { "epoch": 0.17338853276353278, "grad_norm": 0.45658349990844727, "learning_rate": 0.0003967248863620319, "loss": 0.6115, "step": 7790 }, { "epoch": 0.1736111111111111, "grad_norm": 0.5834914445877075, "learning_rate": 0.00039671647827560746, "loss": 0.536, "step": 7800 }, { "epoch": 0.17383368945868946, "grad_norm": 0.3922024369239807, "learning_rate": 0.0003967080594994486, "loss": 0.4614, "step": 7810 }, { "epoch": 0.17405626780626782, "grad_norm": 0.6845150589942932, "learning_rate": 0.00039669963003401273, "loss": 0.705, "step": 7820 }, { "epoch": 0.17427884615384615, "grad_norm": 0.7234890460968018, "learning_rate": 0.00039669118987975793, "loss": 0.6177, "step": 7830 }, { "epoch": 0.1745014245014245, "grad_norm": 0.9000455141067505, "learning_rate": 0.0003966827390371428, "loss": 0.689, "step": 7840 }, { "epoch": 0.17472400284900286, "grad_norm": 0.5849109292030334, "learning_rate": 0.00039667427750662674, "loss": 0.6172, "step": 7850 }, { "epoch": 0.17494658119658119, "grad_norm": 0.6037909388542175, "learning_rate": 0.00039666580528866934, "loss": 0.6353, "step": 7860 }, { "epoch": 0.17516915954415954, "grad_norm": 0.4271775782108307, "learning_rate": 0.0003966573223837311, "loss": 0.6561, "step": 7870 }, { "epoch": 0.1753917378917379, "grad_norm": 0.6348351836204529, "learning_rate": 0.00039664882879227297, "loss": 0.5983, "step": 7880 }, { "epoch": 0.17561431623931623, "grad_norm": 0.7074285745620728, "learning_rate": 0.0003966403245147565, "loss": 0.7175, "step": 7890 }, { "epoch": 0.17583689458689458, "grad_norm": 0.7125088572502136, "learning_rate": 0.00039663180955164387, "loss": 0.6112, "step": 7900 }, { "epoch": 0.17605947293447294, "grad_norm": 0.5250646471977234, "learning_rate": 0.00039662328390339767, "loss": 0.6212, "step": 7910 }, { "epoch": 0.1762820512820513, "grad_norm": 0.5152850151062012, "learning_rate": 0.0003966147475704813, "loss": 0.6286, "step": 7920 }, { "epoch": 0.17650462962962962, "grad_norm": 0.674994945526123, "learning_rate": 0.0003966062005533585, "loss": 0.5661, "step": 7930 }, { "epoch": 0.17672720797720798, "grad_norm": 0.8784959316253662, "learning_rate": 0.00039659764285249395, "loss": 0.6632, "step": 7940 }, { "epoch": 0.17694978632478633, "grad_norm": 0.6024604439735413, "learning_rate": 0.00039658907446835247, "loss": 0.5838, "step": 7950 }, { "epoch": 0.17717236467236466, "grad_norm": 0.6517950892448425, "learning_rate": 0.00039658049540139975, "loss": 0.6594, "step": 7960 }, { "epoch": 0.17739494301994302, "grad_norm": 0.9225330948829651, "learning_rate": 0.000396571905652102, "loss": 0.6816, "step": 7970 }, { "epoch": 0.17761752136752137, "grad_norm": 0.5469350218772888, "learning_rate": 0.00039656330522092596, "loss": 0.5391, "step": 7980 }, { "epoch": 0.1778400997150997, "grad_norm": 0.7431405782699585, "learning_rate": 0.000396554694108339, "loss": 0.7307, "step": 7990 }, { "epoch": 0.17806267806267806, "grad_norm": 0.643224835395813, "learning_rate": 0.00039654607231480904, "loss": 0.6326, "step": 8000 }, { "epoch": 0.17828525641025642, "grad_norm": 0.6889805197715759, "learning_rate": 0.0003965374398408047, "loss": 0.5998, "step": 8010 }, { "epoch": 0.17850783475783477, "grad_norm": 0.6159041523933411, "learning_rate": 0.00039652879668679487, "loss": 0.6374, "step": 8020 }, { "epoch": 0.1787304131054131, "grad_norm": 0.63005131483078, "learning_rate": 0.0003965201428532494, "loss": 0.6403, "step": 8030 }, { "epoch": 0.17895299145299146, "grad_norm": 0.4250248372554779, "learning_rate": 0.00039651147834063853, "loss": 0.5081, "step": 8040 }, { "epoch": 0.1791755698005698, "grad_norm": 0.757688045501709, "learning_rate": 0.00039650280314943294, "loss": 0.6617, "step": 8050 }, { "epoch": 0.17939814814814814, "grad_norm": 0.5579215884208679, "learning_rate": 0.00039649411728010425, "loss": 0.5452, "step": 8060 }, { "epoch": 0.1796207264957265, "grad_norm": 0.5789703726768494, "learning_rate": 0.00039648542073312436, "loss": 0.7948, "step": 8070 }, { "epoch": 0.17984330484330485, "grad_norm": 0.5581598281860352, "learning_rate": 0.0003964767135089658, "loss": 0.7904, "step": 8080 }, { "epoch": 0.18002136752136752, "eval_loss": 0.6378280520439148, "eval_runtime": 337.3975, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 8088 }, { "epoch": 0.18006588319088318, "grad_norm": 0.855660080909729, "learning_rate": 0.00039646799560810183, "loss": 0.7713, "step": 8090 }, { "epoch": 0.18028846153846154, "grad_norm": 1.1552504301071167, "learning_rate": 0.00039645926703100613, "loss": 0.6661, "step": 8100 }, { "epoch": 0.1805110398860399, "grad_norm": 0.6898704767227173, "learning_rate": 0.000396450527778153, "loss": 0.7366, "step": 8110 }, { "epoch": 0.18073361823361822, "grad_norm": 0.9909326434135437, "learning_rate": 0.0003964417778500175, "loss": 0.6548, "step": 8120 }, { "epoch": 0.18095619658119658, "grad_norm": 0.8328597545623779, "learning_rate": 0.0003964330172470748, "loss": 0.537, "step": 8130 }, { "epoch": 0.18117877492877493, "grad_norm": 0.696701169013977, "learning_rate": 0.00039642424596980126, "loss": 0.8127, "step": 8140 }, { "epoch": 0.1814013532763533, "grad_norm": 0.6511324644088745, "learning_rate": 0.00039641546401867337, "loss": 0.4931, "step": 8150 }, { "epoch": 0.18162393162393162, "grad_norm": 0.5209097266197205, "learning_rate": 0.00039640667139416837, "loss": 0.5291, "step": 8160 }, { "epoch": 0.18184650997150997, "grad_norm": 0.6797854900360107, "learning_rate": 0.00039639786809676406, "loss": 0.584, "step": 8170 }, { "epoch": 0.18206908831908833, "grad_norm": 0.5029028058052063, "learning_rate": 0.0003963890541269388, "loss": 0.6508, "step": 8180 }, { "epoch": 0.18229166666666666, "grad_norm": 0.6294994354248047, "learning_rate": 0.00039638022948517153, "loss": 0.5387, "step": 8190 }, { "epoch": 0.182514245014245, "grad_norm": 0.7849052548408508, "learning_rate": 0.0003963713941719419, "loss": 0.6858, "step": 8200 }, { "epoch": 0.18273682336182337, "grad_norm": 0.5021756291389465, "learning_rate": 0.0003963625481877299, "loss": 0.6363, "step": 8210 }, { "epoch": 0.1829594017094017, "grad_norm": 0.7355058193206787, "learning_rate": 0.00039635369153301635, "loss": 0.7881, "step": 8220 }, { "epoch": 0.18318198005698005, "grad_norm": 0.6309876441955566, "learning_rate": 0.0003963448242082824, "loss": 0.6424, "step": 8230 }, { "epoch": 0.1834045584045584, "grad_norm": 0.4791712164878845, "learning_rate": 0.0003963359462140099, "loss": 0.5954, "step": 8240 }, { "epoch": 0.18362713675213677, "grad_norm": 0.4504138231277466, "learning_rate": 0.0003963270575506815, "loss": 0.5041, "step": 8250 }, { "epoch": 0.1838497150997151, "grad_norm": 0.6858550310134888, "learning_rate": 0.00039631815821878, "loss": 0.6295, "step": 8260 }, { "epoch": 0.18407229344729345, "grad_norm": 0.5094566941261292, "learning_rate": 0.00039630924821878907, "loss": 0.6105, "step": 8270 }, { "epoch": 0.1842948717948718, "grad_norm": 0.827899158000946, "learning_rate": 0.0003963003275511929, "loss": 0.5796, "step": 8280 }, { "epoch": 0.18451745014245013, "grad_norm": 0.6932994723320007, "learning_rate": 0.00039629139621647625, "loss": 0.6434, "step": 8290 }, { "epoch": 0.1847400284900285, "grad_norm": 0.5510367155075073, "learning_rate": 0.00039628245421512436, "loss": 0.6752, "step": 8300 }, { "epoch": 0.18496260683760685, "grad_norm": 0.6791307330131531, "learning_rate": 0.0003962735015476233, "loss": 0.7159, "step": 8310 }, { "epoch": 0.18518518518518517, "grad_norm": 0.5468825101852417, "learning_rate": 0.00039626453821445945, "loss": 0.5856, "step": 8320 }, { "epoch": 0.18540776353276353, "grad_norm": 0.7370385527610779, "learning_rate": 0.00039625556421611993, "loss": 0.645, "step": 8330 }, { "epoch": 0.1856303418803419, "grad_norm": 0.5625198483467102, "learning_rate": 0.00039624657955309237, "loss": 0.5382, "step": 8340 }, { "epoch": 0.18585292022792022, "grad_norm": 0.7717461585998535, "learning_rate": 0.00039623758422586514, "loss": 0.5829, "step": 8350 }, { "epoch": 0.18607549857549857, "grad_norm": 0.6957966685295105, "learning_rate": 0.0003962285782349268, "loss": 0.6079, "step": 8360 }, { "epoch": 0.18629807692307693, "grad_norm": 0.5371886491775513, "learning_rate": 0.0003962195615807669, "loss": 0.6639, "step": 8370 }, { "epoch": 0.18652065527065528, "grad_norm": 0.575091540813446, "learning_rate": 0.0003962105342638754, "loss": 0.5967, "step": 8380 }, { "epoch": 0.1867432336182336, "grad_norm": 0.5292919278144836, "learning_rate": 0.00039620149628474284, "loss": 0.6593, "step": 8390 }, { "epoch": 0.18696581196581197, "grad_norm": 0.5691158771514893, "learning_rate": 0.0003961924476438604, "loss": 0.572, "step": 8400 }, { "epoch": 0.18718839031339032, "grad_norm": 0.5576357245445251, "learning_rate": 0.0003961833883417197, "loss": 0.5306, "step": 8410 }, { "epoch": 0.18741096866096865, "grad_norm": 0.7963212132453918, "learning_rate": 0.00039617431837881306, "loss": 0.6785, "step": 8420 }, { "epoch": 0.187633547008547, "grad_norm": 0.648432195186615, "learning_rate": 0.00039616523775563346, "loss": 0.5836, "step": 8430 }, { "epoch": 0.18785612535612536, "grad_norm": 0.7052241563796997, "learning_rate": 0.0003961561464726742, "loss": 0.6708, "step": 8440 }, { "epoch": 0.1880787037037037, "grad_norm": 0.6538907885551453, "learning_rate": 0.0003961470445304293, "loss": 0.5733, "step": 8450 }, { "epoch": 0.18830128205128205, "grad_norm": 0.5793173909187317, "learning_rate": 0.0003961379319293935, "loss": 0.6527, "step": 8460 }, { "epoch": 0.1885238603988604, "grad_norm": 0.9065718650817871, "learning_rate": 0.0003961288086700619, "loss": 0.6676, "step": 8470 }, { "epoch": 0.18874643874643873, "grad_norm": 0.6593164205551147, "learning_rate": 0.00039611967475293024, "loss": 0.7057, "step": 8480 }, { "epoch": 0.1889690170940171, "grad_norm": 0.6331801414489746, "learning_rate": 0.0003961105301784949, "loss": 0.668, "step": 8490 }, { "epoch": 0.18919159544159544, "grad_norm": 0.8595057725906372, "learning_rate": 0.0003961013749472529, "loss": 0.8011, "step": 8500 }, { "epoch": 0.1894141737891738, "grad_norm": 0.8133804798126221, "learning_rate": 0.00039609220905970153, "loss": 0.6667, "step": 8510 }, { "epoch": 0.18963675213675213, "grad_norm": 0.7005951404571533, "learning_rate": 0.00039608303251633905, "loss": 0.5827, "step": 8520 }, { "epoch": 0.18985933048433049, "grad_norm": 0.6705909371376038, "learning_rate": 0.00039607384531766405, "loss": 0.6357, "step": 8530 }, { "epoch": 0.19008190883190884, "grad_norm": 0.3930394649505615, "learning_rate": 0.00039606464746417576, "loss": 0.6741, "step": 8540 }, { "epoch": 0.19030448717948717, "grad_norm": 0.5143939852714539, "learning_rate": 0.00039605543895637405, "loss": 0.6344, "step": 8550 }, { "epoch": 0.19052706552706553, "grad_norm": 1.4165090322494507, "learning_rate": 0.0003960462197947593, "loss": 0.6876, "step": 8560 }, { "epoch": 0.19074964387464388, "grad_norm": 0.5349679589271545, "learning_rate": 0.00039603698997983243, "loss": 0.6644, "step": 8570 }, { "epoch": 0.1909722222222222, "grad_norm": 0.8081477284431458, "learning_rate": 0.0003960277495120951, "loss": 0.6176, "step": 8580 }, { "epoch": 0.19119480056980057, "grad_norm": 0.6244815587997437, "learning_rate": 0.00039601849839204935, "loss": 0.7712, "step": 8590 }, { "epoch": 0.19141737891737892, "grad_norm": 1.218777060508728, "learning_rate": 0.00039600923662019795, "loss": 0.692, "step": 8600 }, { "epoch": 0.19163995726495728, "grad_norm": 0.6282643675804138, "learning_rate": 0.0003959999641970441, "loss": 0.5798, "step": 8610 }, { "epoch": 0.1918625356125356, "grad_norm": 0.9417448043823242, "learning_rate": 0.00039599068112309183, "loss": 0.6914, "step": 8620 }, { "epoch": 0.19208511396011396, "grad_norm": 0.7199434041976929, "learning_rate": 0.0003959813873988455, "loss": 0.5549, "step": 8630 }, { "epoch": 0.19230769230769232, "grad_norm": 0.46725979447364807, "learning_rate": 0.0003959720830248101, "loss": 0.808, "step": 8640 }, { "epoch": 0.19253027065527065, "grad_norm": 0.6076691746711731, "learning_rate": 0.0003959627680014913, "loss": 0.5883, "step": 8650 }, { "epoch": 0.192752849002849, "grad_norm": 0.4056580662727356, "learning_rate": 0.0003959534423293953, "loss": 0.5899, "step": 8660 }, { "epoch": 0.19297542735042736, "grad_norm": 0.7676776647567749, "learning_rate": 0.0003959441060090288, "loss": 0.562, "step": 8670 }, { "epoch": 0.1931980056980057, "grad_norm": 0.43628600239753723, "learning_rate": 0.0003959347590408991, "loss": 0.6806, "step": 8680 }, { "epoch": 0.19342058404558404, "grad_norm": 0.8665661215782166, "learning_rate": 0.0003959254014255143, "loss": 0.679, "step": 8690 }, { "epoch": 0.1936431623931624, "grad_norm": 0.8895514011383057, "learning_rate": 0.0003959160331633827, "loss": 0.7006, "step": 8700 }, { "epoch": 0.19386574074074073, "grad_norm": 0.6683569550514221, "learning_rate": 0.0003959066542550135, "loss": 0.6949, "step": 8710 }, { "epoch": 0.19408831908831908, "grad_norm": 0.8206831812858582, "learning_rate": 0.0003958972647009164, "loss": 0.642, "step": 8720 }, { "epoch": 0.19431089743589744, "grad_norm": 0.5844119191169739, "learning_rate": 0.0003958878645016015, "loss": 0.574, "step": 8730 }, { "epoch": 0.1945334757834758, "grad_norm": 0.609693169593811, "learning_rate": 0.0003958784536575797, "loss": 0.6547, "step": 8740 }, { "epoch": 0.19475605413105412, "grad_norm": 0.4209142029285431, "learning_rate": 0.00039586903216936236, "loss": 0.6194, "step": 8750 }, { "epoch": 0.19497863247863248, "grad_norm": 0.7345901727676392, "learning_rate": 0.0003958596000374615, "loss": 0.676, "step": 8760 }, { "epoch": 0.19520121082621084, "grad_norm": 0.6274478435516357, "learning_rate": 0.00039585015726238963, "loss": 0.7814, "step": 8770 }, { "epoch": 0.19542378917378916, "grad_norm": 0.6954050064086914, "learning_rate": 0.0003958407038446598, "loss": 0.7683, "step": 8780 }, { "epoch": 0.19564636752136752, "grad_norm": 0.9183118939399719, "learning_rate": 0.0003958312397847859, "loss": 0.6729, "step": 8790 }, { "epoch": 0.19586894586894588, "grad_norm": 0.7694580554962158, "learning_rate": 0.0003958217650832821, "loss": 0.6632, "step": 8800 }, { "epoch": 0.1960915242165242, "grad_norm": 0.6874392032623291, "learning_rate": 0.0003958122797406633, "loss": 0.6299, "step": 8810 }, { "epoch": 0.19631410256410256, "grad_norm": 0.5900786519050598, "learning_rate": 0.00039580278375744485, "loss": 0.6253, "step": 8820 }, { "epoch": 0.19653668091168092, "grad_norm": 0.42310723662376404, "learning_rate": 0.00039579327713414286, "loss": 0.456, "step": 8830 }, { "epoch": 0.19675925925925927, "grad_norm": 0.7175334692001343, "learning_rate": 0.0003957837598712739, "loss": 0.4972, "step": 8840 }, { "epoch": 0.1969818376068376, "grad_norm": 0.6608636379241943, "learning_rate": 0.0003957742319693552, "loss": 0.6856, "step": 8850 }, { "epoch": 0.19720441595441596, "grad_norm": 0.7150349020957947, "learning_rate": 0.0003957646934289044, "loss": 0.7014, "step": 8860 }, { "epoch": 0.1974269943019943, "grad_norm": 0.8787618279457092, "learning_rate": 0.00039575514425043996, "loss": 0.7043, "step": 8870 }, { "epoch": 0.19764957264957264, "grad_norm": 0.8096165657043457, "learning_rate": 0.0003957455844344807, "loss": 0.7042, "step": 8880 }, { "epoch": 0.197872150997151, "grad_norm": 0.5648137927055359, "learning_rate": 0.00039573601398154617, "loss": 0.6565, "step": 8890 }, { "epoch": 0.19809472934472935, "grad_norm": 0.7525642514228821, "learning_rate": 0.00039572643289215636, "loss": 0.6618, "step": 8900 }, { "epoch": 0.19831730769230768, "grad_norm": 0.7236807346343994, "learning_rate": 0.00039571684116683194, "loss": 0.6229, "step": 8910 }, { "epoch": 0.19853988603988604, "grad_norm": 0.690436601638794, "learning_rate": 0.0003957072388060942, "loss": 0.5987, "step": 8920 }, { "epoch": 0.1987624643874644, "grad_norm": 0.38092443346977234, "learning_rate": 0.0003956976258104649, "loss": 0.5136, "step": 8930 }, { "epoch": 0.19898504273504272, "grad_norm": 0.6596299409866333, "learning_rate": 0.0003956880021804664, "loss": 0.7893, "step": 8940 }, { "epoch": 0.19920762108262108, "grad_norm": 0.5776147842407227, "learning_rate": 0.0003956783679166216, "loss": 0.5902, "step": 8950 }, { "epoch": 0.19943019943019943, "grad_norm": 0.7063926458358765, "learning_rate": 0.00039566872301945416, "loss": 0.6769, "step": 8960 }, { "epoch": 0.1996527777777778, "grad_norm": 0.7202835083007812, "learning_rate": 0.0003956590674894881, "loss": 0.5673, "step": 8970 }, { "epoch": 0.19987535612535612, "grad_norm": 0.721526563167572, "learning_rate": 0.00039564940132724816, "loss": 0.5911, "step": 8980 }, { "epoch": 0.20009793447293447, "grad_norm": 0.6836729049682617, "learning_rate": 0.00039563972453325954, "loss": 0.5803, "step": 8990 }, { "epoch": 0.20032051282051283, "grad_norm": 0.6123881936073303, "learning_rate": 0.0003956300371080482, "loss": 0.6177, "step": 9000 }, { "epoch": 0.20054309116809116, "grad_norm": 0.48289263248443604, "learning_rate": 0.0003956203390521405, "loss": 0.6179, "step": 9010 }, { "epoch": 0.20076566951566951, "grad_norm": 0.4708038568496704, "learning_rate": 0.0003956106303660634, "loss": 0.4972, "step": 9020 }, { "epoch": 0.20098824786324787, "grad_norm": 0.4728592336177826, "learning_rate": 0.00039560091105034445, "loss": 0.652, "step": 9030 }, { "epoch": 0.2012108262108262, "grad_norm": 1.943331241607666, "learning_rate": 0.000395591181105512, "loss": 0.7249, "step": 9040 }, { "epoch": 0.20143340455840456, "grad_norm": 0.9894505739212036, "learning_rate": 0.0003955814405320945, "loss": 0.7012, "step": 9050 }, { "epoch": 0.2016559829059829, "grad_norm": 0.6454615592956543, "learning_rate": 0.0003955716893306215, "loss": 0.6747, "step": 9060 }, { "epoch": 0.20187856125356127, "grad_norm": 0.5542591214179993, "learning_rate": 0.00039556192750162276, "loss": 0.6098, "step": 9070 }, { "epoch": 0.2021011396011396, "grad_norm": 0.5976565480232239, "learning_rate": 0.0003955521550456288, "loss": 0.7787, "step": 9080 }, { "epoch": 0.20232371794871795, "grad_norm": 1.246936559677124, "learning_rate": 0.0003955423719631707, "loss": 0.6445, "step": 9090 }, { "epoch": 0.2025462962962963, "grad_norm": 0.6775862574577332, "learning_rate": 0.0003955325782547799, "loss": 0.5769, "step": 9100 }, { "epoch": 0.20276887464387464, "grad_norm": 0.4151378273963928, "learning_rate": 0.0003955227739209889, "loss": 0.6429, "step": 9110 }, { "epoch": 0.202991452991453, "grad_norm": 0.6238030791282654, "learning_rate": 0.00039551295896233016, "loss": 0.6775, "step": 9120 }, { "epoch": 0.20321403133903135, "grad_norm": 0.6367015838623047, "learning_rate": 0.00039550313337933726, "loss": 0.5909, "step": 9130 }, { "epoch": 0.20343660968660968, "grad_norm": 0.605278491973877, "learning_rate": 0.000395493297172544, "loss": 0.6829, "step": 9140 }, { "epoch": 0.20365918803418803, "grad_norm": 0.5138265490531921, "learning_rate": 0.00039548345034248495, "loss": 0.56, "step": 9150 }, { "epoch": 0.2038817663817664, "grad_norm": 0.703345775604248, "learning_rate": 0.0003954735928896952, "loss": 0.6245, "step": 9160 }, { "epoch": 0.20410434472934472, "grad_norm": 0.5316170454025269, "learning_rate": 0.0003954637248147104, "loss": 0.7017, "step": 9170 }, { "epoch": 0.20432692307692307, "grad_norm": 0.8845970034599304, "learning_rate": 0.00039545384611806676, "loss": 0.6894, "step": 9180 }, { "epoch": 0.20454950142450143, "grad_norm": 0.34213122725486755, "learning_rate": 0.0003954439568003011, "loss": 0.659, "step": 9190 }, { "epoch": 0.20477207977207978, "grad_norm": 0.6910355091094971, "learning_rate": 0.0003954340568619508, "loss": 0.7405, "step": 9200 }, { "epoch": 0.2049946581196581, "grad_norm": 0.8337056040763855, "learning_rate": 0.0003954241463035539, "loss": 0.6928, "step": 9210 }, { "epoch": 0.20521723646723647, "grad_norm": 0.6273629069328308, "learning_rate": 0.0003954142251256489, "loss": 0.52, "step": 9220 }, { "epoch": 0.20543981481481483, "grad_norm": 0.4294908046722412, "learning_rate": 0.0003954042933287749, "loss": 0.6373, "step": 9230 }, { "epoch": 0.20566239316239315, "grad_norm": 0.6154212355613708, "learning_rate": 0.00039539435091347176, "loss": 0.827, "step": 9240 }, { "epoch": 0.2058849715099715, "grad_norm": 1.015303134918213, "learning_rate": 0.0003953843978802795, "loss": 0.6347, "step": 9250 }, { "epoch": 0.20610754985754987, "grad_norm": 0.9026708006858826, "learning_rate": 0.0003953744342297391, "loss": 0.6995, "step": 9260 }, { "epoch": 0.2063301282051282, "grad_norm": 0.3805409073829651, "learning_rate": 0.0003953644599623921, "loss": 0.5911, "step": 9270 }, { "epoch": 0.20655270655270655, "grad_norm": 0.8477494120597839, "learning_rate": 0.00039535447507878035, "loss": 0.5985, "step": 9280 }, { "epoch": 0.2067752849002849, "grad_norm": 0.5137654542922974, "learning_rate": 0.0003953444795794465, "loss": 0.7368, "step": 9290 }, { "epoch": 0.20699786324786323, "grad_norm": 0.8575098514556885, "learning_rate": 0.0003953344734649338, "loss": 0.591, "step": 9300 }, { "epoch": 0.2072204415954416, "grad_norm": 0.9351524114608765, "learning_rate": 0.00039532445673578587, "loss": 0.4845, "step": 9310 }, { "epoch": 0.20744301994301995, "grad_norm": 0.4439866542816162, "learning_rate": 0.000395314429392547, "loss": 0.6406, "step": 9320 }, { "epoch": 0.2076655982905983, "grad_norm": 0.8442595601081848, "learning_rate": 0.0003953043914357622, "loss": 0.7016, "step": 9330 }, { "epoch": 0.20788817663817663, "grad_norm": 0.6058248281478882, "learning_rate": 0.0003952943428659768, "loss": 0.5101, "step": 9340 }, { "epoch": 0.208110754985755, "grad_norm": 0.6372517347335815, "learning_rate": 0.00039528428368373696, "loss": 0.5624, "step": 9350 }, { "epoch": 0.20833333333333334, "grad_norm": 0.6820414662361145, "learning_rate": 0.0003952742138895894, "loss": 0.5839, "step": 9360 }, { "epoch": 0.20855591168091167, "grad_norm": 0.7529656291007996, "learning_rate": 0.0003952641334840811, "loss": 0.651, "step": 9370 }, { "epoch": 0.20877849002849003, "grad_norm": 0.943543553352356, "learning_rate": 0.0003952540424677599, "loss": 0.5813, "step": 9380 }, { "epoch": 0.20900106837606838, "grad_norm": 0.9541203379631042, "learning_rate": 0.00039524394084117427, "loss": 0.6527, "step": 9390 }, { "epoch": 0.2092236467236467, "grad_norm": 0.7158280611038208, "learning_rate": 0.000395233828604873, "loss": 0.712, "step": 9400 }, { "epoch": 0.20944622507122507, "grad_norm": 0.5238418579101562, "learning_rate": 0.0003952237057594057, "loss": 0.6042, "step": 9410 }, { "epoch": 0.20966880341880342, "grad_norm": 0.855786919593811, "learning_rate": 0.0003952135723053224, "loss": 0.6428, "step": 9420 }, { "epoch": 0.20989138176638178, "grad_norm": 0.6584997773170471, "learning_rate": 0.0003952034282431738, "loss": 0.6112, "step": 9430 }, { "epoch": 0.2101139601139601, "grad_norm": 1.005631446838379, "learning_rate": 0.000395193273573511, "loss": 0.7763, "step": 9440 }, { "epoch": 0.21033653846153846, "grad_norm": 0.8165929317474365, "learning_rate": 0.00039518310829688596, "loss": 0.6888, "step": 9450 }, { "epoch": 0.21055911680911682, "grad_norm": 0.44331878423690796, "learning_rate": 0.0003951729324138511, "loss": 0.6588, "step": 9460 }, { "epoch": 0.21078169515669515, "grad_norm": 0.7836384177207947, "learning_rate": 0.0003951627459249593, "loss": 0.5745, "step": 9470 }, { "epoch": 0.2110042735042735, "grad_norm": 0.7953740954399109, "learning_rate": 0.0003951525488307641, "loss": 0.5136, "step": 9480 }, { "epoch": 0.21122685185185186, "grad_norm": 0.5712646842002869, "learning_rate": 0.0003951423411318197, "loss": 0.7217, "step": 9490 }, { "epoch": 0.2114494301994302, "grad_norm": 0.667172908782959, "learning_rate": 0.00039513212282868063, "loss": 0.6767, "step": 9500 }, { "epoch": 0.21167200854700854, "grad_norm": 0.747340738773346, "learning_rate": 0.0003951218939219023, "loss": 0.7564, "step": 9510 }, { "epoch": 0.2118945868945869, "grad_norm": 1.0341793298721313, "learning_rate": 0.0003951116544120405, "loss": 0.7363, "step": 9520 }, { "epoch": 0.21211716524216523, "grad_norm": 0.5947072505950928, "learning_rate": 0.0003951014042996517, "loss": 0.7156, "step": 9530 }, { "epoch": 0.21233974358974358, "grad_norm": 0.535180926322937, "learning_rate": 0.0003950911435852929, "loss": 0.594, "step": 9540 }, { "epoch": 0.21256232193732194, "grad_norm": 0.9859617352485657, "learning_rate": 0.0003950808722695216, "loss": 0.638, "step": 9550 }, { "epoch": 0.2127849002849003, "grad_norm": 0.5788066983222961, "learning_rate": 0.00039507059035289604, "loss": 0.6786, "step": 9560 }, { "epoch": 0.21300747863247863, "grad_norm": 0.5944311618804932, "learning_rate": 0.0003950602978359749, "loss": 0.5964, "step": 9570 }, { "epoch": 0.21323005698005698, "grad_norm": 0.5792214274406433, "learning_rate": 0.00039504999471931746, "loss": 0.553, "step": 9580 }, { "epoch": 0.21345263532763534, "grad_norm": 0.6323608756065369, "learning_rate": 0.0003950396810034836, "loss": 0.5525, "step": 9590 }, { "epoch": 0.21367521367521367, "grad_norm": 0.7955515384674072, "learning_rate": 0.00039502935668903386, "loss": 0.6462, "step": 9600 }, { "epoch": 0.21389779202279202, "grad_norm": 0.7051757574081421, "learning_rate": 0.00039501902177652924, "loss": 0.5955, "step": 9610 }, { "epoch": 0.21412037037037038, "grad_norm": 0.458967387676239, "learning_rate": 0.0003950086762665313, "loss": 0.6787, "step": 9620 }, { "epoch": 0.2143429487179487, "grad_norm": 0.7646269798278809, "learning_rate": 0.00039499832015960225, "loss": 0.5951, "step": 9630 }, { "epoch": 0.21456552706552706, "grad_norm": 0.7068632245063782, "learning_rate": 0.00039498795345630487, "loss": 0.6497, "step": 9640 }, { "epoch": 0.21478810541310542, "grad_norm": 0.7031964659690857, "learning_rate": 0.00039497757615720243, "loss": 0.795, "step": 9650 }, { "epoch": 0.21501068376068377, "grad_norm": 0.7918108701705933, "learning_rate": 0.00039496718826285894, "loss": 0.6667, "step": 9660 }, { "epoch": 0.2152332621082621, "grad_norm": 0.4682900011539459, "learning_rate": 0.0003949567897738388, "loss": 0.6158, "step": 9670 }, { "epoch": 0.21545584045584046, "grad_norm": 0.47049540281295776, "learning_rate": 0.0003949463806907071, "loss": 0.6403, "step": 9680 }, { "epoch": 0.21567841880341881, "grad_norm": 0.6707554459571838, "learning_rate": 0.00039493596101402954, "loss": 0.6894, "step": 9690 }, { "epoch": 0.21590099715099714, "grad_norm": 0.6850556135177612, "learning_rate": 0.00039492553074437224, "loss": 0.5975, "step": 9700 }, { "epoch": 0.2161235754985755, "grad_norm": 0.5969346165657043, "learning_rate": 0.000394915089882302, "loss": 0.8084, "step": 9710 }, { "epoch": 0.21634615384615385, "grad_norm": 0.8701211214065552, "learning_rate": 0.0003949046384283862, "loss": 0.8172, "step": 9720 }, { "epoch": 0.21656873219373218, "grad_norm": 0.7247389554977417, "learning_rate": 0.0003948941763831928, "loss": 0.7495, "step": 9730 }, { "epoch": 0.21679131054131054, "grad_norm": 0.4859636723995209, "learning_rate": 0.0003948837037472903, "loss": 0.6212, "step": 9740 }, { "epoch": 0.2170138888888889, "grad_norm": 0.4931568205356598, "learning_rate": 0.00039487322052124787, "loss": 0.6867, "step": 9750 }, { "epoch": 0.21723646723646722, "grad_norm": 0.8555331826210022, "learning_rate": 0.00039486272670563507, "loss": 0.6811, "step": 9760 }, { "epoch": 0.21745904558404558, "grad_norm": 0.7685542702674866, "learning_rate": 0.00039485222230102216, "loss": 0.5231, "step": 9770 }, { "epoch": 0.21768162393162394, "grad_norm": 0.7093376517295837, "learning_rate": 0.00039484170730798, "loss": 0.7635, "step": 9780 }, { "epoch": 0.2179042022792023, "grad_norm": 0.5934985876083374, "learning_rate": 0.0003948311817270799, "loss": 0.6767, "step": 9790 }, { "epoch": 0.21812678062678062, "grad_norm": 0.638926088809967, "learning_rate": 0.000394820645558894, "loss": 0.7192, "step": 9800 }, { "epoch": 0.21834935897435898, "grad_norm": 0.5633494257926941, "learning_rate": 0.0003948100988039946, "loss": 0.6495, "step": 9810 }, { "epoch": 0.21857193732193733, "grad_norm": 1.0099177360534668, "learning_rate": 0.0003947995414629551, "loss": 0.5245, "step": 9820 }, { "epoch": 0.21879451566951566, "grad_norm": 0.7500694394111633, "learning_rate": 0.00039478897353634895, "loss": 0.8152, "step": 9830 }, { "epoch": 0.21901709401709402, "grad_norm": 0.6685538291931152, "learning_rate": 0.0003947783950247505, "loss": 0.682, "step": 9840 }, { "epoch": 0.21923967236467237, "grad_norm": 0.5551173090934753, "learning_rate": 0.00039476780592873463, "loss": 0.468, "step": 9850 }, { "epoch": 0.2194622507122507, "grad_norm": 0.5178750157356262, "learning_rate": 0.0003947572062488768, "loss": 0.621, "step": 9860 }, { "epoch": 0.21968482905982906, "grad_norm": 0.6670868396759033, "learning_rate": 0.0003947465959857529, "loss": 0.7889, "step": 9870 }, { "epoch": 0.2199074074074074, "grad_norm": 0.4501799941062927, "learning_rate": 0.0003947359751399395, "loss": 0.6168, "step": 9880 }, { "epoch": 0.22012998575498577, "grad_norm": 0.6546468138694763, "learning_rate": 0.00039472534371201383, "loss": 0.5885, "step": 9890 }, { "epoch": 0.2203525641025641, "grad_norm": 0.6405220031738281, "learning_rate": 0.0003947147017025536, "loss": 0.5922, "step": 9900 }, { "epoch": 0.22057514245014245, "grad_norm": 0.4797993302345276, "learning_rate": 0.00039470404911213705, "loss": 0.7047, "step": 9910 }, { "epoch": 0.2207977207977208, "grad_norm": 0.721960186958313, "learning_rate": 0.0003946933859413431, "loss": 0.6127, "step": 9920 }, { "epoch": 0.22102029914529914, "grad_norm": 0.4744751453399658, "learning_rate": 0.0003946827121907512, "loss": 0.5057, "step": 9930 }, { "epoch": 0.2212428774928775, "grad_norm": 0.6154679656028748, "learning_rate": 0.0003946720278609413, "loss": 0.6269, "step": 9940 }, { "epoch": 0.22146545584045585, "grad_norm": 0.7666019797325134, "learning_rate": 0.00039466133295249406, "loss": 0.6895, "step": 9950 }, { "epoch": 0.22168803418803418, "grad_norm": 0.6650170683860779, "learning_rate": 0.0003946506274659906, "loss": 0.7485, "step": 9960 }, { "epoch": 0.22191061253561253, "grad_norm": 0.8512768745422363, "learning_rate": 0.00039463991140201274, "loss": 0.6962, "step": 9970 }, { "epoch": 0.2221331908831909, "grad_norm": 0.7195132970809937, "learning_rate": 0.00039462918476114277, "loss": 0.6445, "step": 9980 }, { "epoch": 0.22235576923076922, "grad_norm": 0.6763859391212463, "learning_rate": 0.0003946184475439635, "loss": 0.7474, "step": 9990 }, { "epoch": 0.22257834757834757, "grad_norm": 0.6375609636306763, "learning_rate": 0.00039460769975105853, "loss": 0.575, "step": 10000 }, { "epoch": 0.22280092592592593, "grad_norm": 0.4558027684688568, "learning_rate": 0.0003945969413830118, "loss": 0.6552, "step": 10010 }, { "epoch": 0.22302350427350429, "grad_norm": 0.6802572011947632, "learning_rate": 0.000394586172440408, "loss": 0.6885, "step": 10020 }, { "epoch": 0.22324608262108261, "grad_norm": 0.77765291929245, "learning_rate": 0.00039457539292383223, "loss": 0.7531, "step": 10030 }, { "epoch": 0.22346866096866097, "grad_norm": 0.8640170693397522, "learning_rate": 0.0003945646028338704, "loss": 0.7519, "step": 10040 }, { "epoch": 0.22369123931623933, "grad_norm": 0.464138925075531, "learning_rate": 0.00039455380217110874, "loss": 0.7144, "step": 10050 }, { "epoch": 0.22391381766381765, "grad_norm": 0.5078659057617188, "learning_rate": 0.0003945429909361342, "loss": 0.602, "step": 10060 }, { "epoch": 0.224136396011396, "grad_norm": 0.6784718632698059, "learning_rate": 0.0003945321691295343, "loss": 0.5502, "step": 10070 }, { "epoch": 0.22435897435897437, "grad_norm": 0.6645569801330566, "learning_rate": 0.000394521336751897, "loss": 0.5954, "step": 10080 }, { "epoch": 0.2245815527065527, "grad_norm": 0.6368417143821716, "learning_rate": 0.0003945104938038111, "loss": 0.6147, "step": 10090 }, { "epoch": 0.22480413105413105, "grad_norm": 1.4571202993392944, "learning_rate": 0.0003944996402858657, "loss": 0.6064, "step": 10100 }, { "epoch": 0.2250267094017094, "grad_norm": 0.7869674563407898, "learning_rate": 0.0003944887761986506, "loss": 0.5489, "step": 10110 }, { "epoch": 0.22524928774928774, "grad_norm": 0.7581375241279602, "learning_rate": 0.0003944779015427562, "loss": 0.6868, "step": 10120 }, { "epoch": 0.2254718660968661, "grad_norm": 0.7421298623085022, "learning_rate": 0.0003944670163187735, "loss": 0.5624, "step": 10130 }, { "epoch": 0.22569444444444445, "grad_norm": 0.6129317283630371, "learning_rate": 0.0003944561205272939, "loss": 0.7149, "step": 10140 }, { "epoch": 0.2259170227920228, "grad_norm": 0.7636057138442993, "learning_rate": 0.00039444521416890944, "loss": 0.584, "step": 10150 }, { "epoch": 0.22613960113960113, "grad_norm": 0.7134559154510498, "learning_rate": 0.0003944342972442129, "loss": 0.7097, "step": 10160 }, { "epoch": 0.2263621794871795, "grad_norm": 0.6366649866104126, "learning_rate": 0.0003944233697537975, "loss": 0.5996, "step": 10170 }, { "epoch": 0.22658475783475784, "grad_norm": 0.6678193807601929, "learning_rate": 0.000394412431698257, "loss": 0.6114, "step": 10180 }, { "epoch": 0.22680733618233617, "grad_norm": 0.6763032674789429, "learning_rate": 0.0003944014830781858, "loss": 0.5864, "step": 10190 }, { "epoch": 0.22702991452991453, "grad_norm": 0.6134129166603088, "learning_rate": 0.0003943905238941789, "loss": 0.6545, "step": 10200 }, { "epoch": 0.22725249287749288, "grad_norm": 0.8235966563224792, "learning_rate": 0.0003943795541468318, "loss": 0.5798, "step": 10210 }, { "epoch": 0.2274750712250712, "grad_norm": 0.6266351342201233, "learning_rate": 0.0003943685738367406, "loss": 0.5682, "step": 10220 }, { "epoch": 0.22769764957264957, "grad_norm": 0.5528531074523926, "learning_rate": 0.00039435758296450194, "loss": 0.4977, "step": 10230 }, { "epoch": 0.22792022792022792, "grad_norm": 0.7468633055686951, "learning_rate": 0.00039434658153071313, "loss": 0.557, "step": 10240 }, { "epoch": 0.22814280626780628, "grad_norm": 0.4979506731033325, "learning_rate": 0.00039433556953597204, "loss": 0.582, "step": 10250 }, { "epoch": 0.2283653846153846, "grad_norm": 0.7964197397232056, "learning_rate": 0.000394324546980877, "loss": 0.7436, "step": 10260 }, { "epoch": 0.22858796296296297, "grad_norm": 0.8756190538406372, "learning_rate": 0.0003943135138660269, "loss": 0.7267, "step": 10270 }, { "epoch": 0.22881054131054132, "grad_norm": 0.6306865215301514, "learning_rate": 0.00039430247019202146, "loss": 0.5717, "step": 10280 }, { "epoch": 0.22903311965811965, "grad_norm": 0.8970215320587158, "learning_rate": 0.00039429141595946073, "loss": 0.6467, "step": 10290 }, { "epoch": 0.229255698005698, "grad_norm": 0.9413365721702576, "learning_rate": 0.0003942803511689453, "loss": 0.7759, "step": 10300 }, { "epoch": 0.22947827635327636, "grad_norm": 0.684522807598114, "learning_rate": 0.00039426927582107663, "loss": 0.8251, "step": 10310 }, { "epoch": 0.2297008547008547, "grad_norm": 0.7772836685180664, "learning_rate": 0.0003942581899164565, "loss": 0.5982, "step": 10320 }, { "epoch": 0.22992343304843305, "grad_norm": 0.8296146988868713, "learning_rate": 0.0003942470934556873, "loss": 0.6668, "step": 10330 }, { "epoch": 0.2301460113960114, "grad_norm": 0.6865838170051575, "learning_rate": 0.00039423598643937197, "loss": 0.5596, "step": 10340 }, { "epoch": 0.23036858974358973, "grad_norm": 0.8723232746124268, "learning_rate": 0.00039422486886811416, "loss": 0.6831, "step": 10350 }, { "epoch": 0.23059116809116809, "grad_norm": 0.7553172707557678, "learning_rate": 0.00039421374074251797, "loss": 0.6949, "step": 10360 }, { "epoch": 0.23081374643874644, "grad_norm": 0.7564347982406616, "learning_rate": 0.00039420260206318806, "loss": 0.7041, "step": 10370 }, { "epoch": 0.2310363247863248, "grad_norm": 0.6427896618843079, "learning_rate": 0.00039419145283072984, "loss": 0.5599, "step": 10380 }, { "epoch": 0.23125890313390313, "grad_norm": 0.7506496906280518, "learning_rate": 0.00039418029304574907, "loss": 0.5654, "step": 10390 }, { "epoch": 0.23148148148148148, "grad_norm": 0.7676074504852295, "learning_rate": 0.0003941691227088523, "loss": 0.7259, "step": 10400 }, { "epoch": 0.23170405982905984, "grad_norm": 1.0894349813461304, "learning_rate": 0.00039415794182064633, "loss": 0.7539, "step": 10410 }, { "epoch": 0.23192663817663817, "grad_norm": 0.4351309835910797, "learning_rate": 0.0003941467503817389, "loss": 0.6425, "step": 10420 }, { "epoch": 0.23214921652421652, "grad_norm": 0.5593180656433105, "learning_rate": 0.00039413554839273817, "loss": 0.6471, "step": 10430 }, { "epoch": 0.23237179487179488, "grad_norm": 0.41081494092941284, "learning_rate": 0.00039412433585425276, "loss": 0.6515, "step": 10440 }, { "epoch": 0.2325943732193732, "grad_norm": 0.4768132269382477, "learning_rate": 0.000394113112766892, "loss": 0.582, "step": 10450 }, { "epoch": 0.23281695156695156, "grad_norm": 0.6515503525733948, "learning_rate": 0.0003941018791312658, "loss": 0.6886, "step": 10460 }, { "epoch": 0.23303952991452992, "grad_norm": 0.4547460377216339, "learning_rate": 0.00039409063494798464, "loss": 0.6271, "step": 10470 }, { "epoch": 0.23326210826210828, "grad_norm": 0.7479694485664368, "learning_rate": 0.0003940793802176594, "loss": 0.5905, "step": 10480 }, { "epoch": 0.2334846866096866, "grad_norm": 0.5531708598136902, "learning_rate": 0.0003940681149409018, "loss": 0.5962, "step": 10490 }, { "epoch": 0.23370726495726496, "grad_norm": 0.8904018402099609, "learning_rate": 0.000394056839118324, "loss": 0.5928, "step": 10500 }, { "epoch": 0.23392984330484332, "grad_norm": 0.8517378568649292, "learning_rate": 0.0003940455527505387, "loss": 0.6815, "step": 10510 }, { "epoch": 0.23415242165242164, "grad_norm": 0.6255640983581543, "learning_rate": 0.0003940342558381591, "loss": 0.5768, "step": 10520 }, { "epoch": 0.234375, "grad_norm": 0.8824275135993958, "learning_rate": 0.00039402294838179937, "loss": 0.553, "step": 10530 }, { "epoch": 0.23459757834757836, "grad_norm": 0.8169212341308594, "learning_rate": 0.00039401163038207363, "loss": 0.5748, "step": 10540 }, { "epoch": 0.23482015669515668, "grad_norm": 0.8070210218429565, "learning_rate": 0.0003940003018395971, "loss": 0.721, "step": 10550 }, { "epoch": 0.23504273504273504, "grad_norm": 0.7101810574531555, "learning_rate": 0.0003939889627549854, "loss": 0.7535, "step": 10560 }, { "epoch": 0.2352653133903134, "grad_norm": 0.6106790900230408, "learning_rate": 0.00039397761312885465, "loss": 0.7275, "step": 10570 }, { "epoch": 0.23548789173789172, "grad_norm": 0.8647605180740356, "learning_rate": 0.0003939662529618216, "loss": 0.6286, "step": 10580 }, { "epoch": 0.23571047008547008, "grad_norm": 0.481120228767395, "learning_rate": 0.00039395488225450363, "loss": 0.6296, "step": 10590 }, { "epoch": 0.23593304843304844, "grad_norm": 0.8005710244178772, "learning_rate": 0.00039394350100751846, "loss": 0.734, "step": 10600 }, { "epoch": 0.2361556267806268, "grad_norm": 0.5818419456481934, "learning_rate": 0.00039393210922148477, "loss": 0.7026, "step": 10610 }, { "epoch": 0.23637820512820512, "grad_norm": 0.5003513693809509, "learning_rate": 0.0003939207068970214, "loss": 0.5966, "step": 10620 }, { "epoch": 0.23660078347578348, "grad_norm": 0.661308765411377, "learning_rate": 0.00039390929403474817, "loss": 0.5786, "step": 10630 }, { "epoch": 0.23682336182336183, "grad_norm": 0.9554111957550049, "learning_rate": 0.00039389787063528515, "loss": 0.5686, "step": 10640 }, { "epoch": 0.23704594017094016, "grad_norm": 0.6321176290512085, "learning_rate": 0.00039388643669925307, "loss": 0.5561, "step": 10650 }, { "epoch": 0.23726851851851852, "grad_norm": 0.4389246106147766, "learning_rate": 0.00039387499222727333, "loss": 0.654, "step": 10660 }, { "epoch": 0.23749109686609687, "grad_norm": 0.5420764088630676, "learning_rate": 0.0003938635372199678, "loss": 0.4846, "step": 10670 }, { "epoch": 0.2377136752136752, "grad_norm": 0.7103323340415955, "learning_rate": 0.0003938520716779589, "loss": 0.5769, "step": 10680 }, { "epoch": 0.23793625356125356, "grad_norm": 0.7802491188049316, "learning_rate": 0.00039384059560186975, "loss": 0.6831, "step": 10690 }, { "epoch": 0.23815883190883191, "grad_norm": 0.7568184733390808, "learning_rate": 0.000393829108992324, "loss": 0.7352, "step": 10700 }, { "epoch": 0.23838141025641027, "grad_norm": 0.5993239879608154, "learning_rate": 0.0003938176118499458, "loss": 0.684, "step": 10710 }, { "epoch": 0.2386039886039886, "grad_norm": 0.6817304491996765, "learning_rate": 0.0003938061041753598, "loss": 0.733, "step": 10720 }, { "epoch": 0.23882656695156695, "grad_norm": 0.7760605216026306, "learning_rate": 0.0003937945859691915, "loss": 0.6041, "step": 10730 }, { "epoch": 0.2390491452991453, "grad_norm": 0.9699644446372986, "learning_rate": 0.0003937830572320668, "loss": 0.6125, "step": 10740 }, { "epoch": 0.23927172364672364, "grad_norm": 0.808280348777771, "learning_rate": 0.00039377151796461213, "loss": 0.6503, "step": 10750 }, { "epoch": 0.239494301994302, "grad_norm": 0.5646578073501587, "learning_rate": 0.00039375996816745445, "loss": 0.5865, "step": 10760 }, { "epoch": 0.23971688034188035, "grad_norm": 0.811384916305542, "learning_rate": 0.0003937484078412215, "loss": 0.6442, "step": 10770 }, { "epoch": 0.23993945868945868, "grad_norm": 0.7848852872848511, "learning_rate": 0.0003937368369865415, "loss": 0.646, "step": 10780 }, { "epoch": 0.24002849002849003, "eval_loss": 0.6369755864143372, "eval_runtime": 337.4494, "eval_samples_per_second": 7.008, "eval_steps_per_second": 7.008, "step": 10784 }, { "epoch": 0.24016203703703703, "grad_norm": 0.6187881827354431, "learning_rate": 0.0003937252556040432, "loss": 0.6877, "step": 10790 }, { "epoch": 0.2403846153846154, "grad_norm": 0.6488537192344666, "learning_rate": 0.0003937136636943559, "loss": 0.7301, "step": 10800 }, { "epoch": 0.24060719373219372, "grad_norm": 0.8597158789634705, "learning_rate": 0.0003937020612581095, "loss": 0.6421, "step": 10810 }, { "epoch": 0.24082977207977208, "grad_norm": 0.6430831551551819, "learning_rate": 0.0003936904482959346, "loss": 0.6035, "step": 10820 }, { "epoch": 0.24105235042735043, "grad_norm": 0.690912127494812, "learning_rate": 0.00039367882480846204, "loss": 0.5751, "step": 10830 }, { "epoch": 0.2412749287749288, "grad_norm": 0.664210319519043, "learning_rate": 0.00039366719079632367, "loss": 0.5842, "step": 10840 }, { "epoch": 0.24149750712250712, "grad_norm": 0.6491913199424744, "learning_rate": 0.00039365554626015167, "loss": 0.7105, "step": 10850 }, { "epoch": 0.24172008547008547, "grad_norm": 0.7318790555000305, "learning_rate": 0.00039364389120057866, "loss": 0.5448, "step": 10860 }, { "epoch": 0.24194266381766383, "grad_norm": 0.780798614025116, "learning_rate": 0.0003936322256182381, "loss": 0.6862, "step": 10870 }, { "epoch": 0.24216524216524216, "grad_norm": 0.8476507067680359, "learning_rate": 0.0003936205495137639, "loss": 0.699, "step": 10880 }, { "epoch": 0.2423878205128205, "grad_norm": 0.6828676462173462, "learning_rate": 0.0003936088628877905, "loss": 0.6713, "step": 10890 }, { "epoch": 0.24261039886039887, "grad_norm": 0.8574651479721069, "learning_rate": 0.00039359716574095306, "loss": 0.6122, "step": 10900 }, { "epoch": 0.2428329772079772, "grad_norm": 0.7367123961448669, "learning_rate": 0.0003935854580738871, "loss": 0.6888, "step": 10910 }, { "epoch": 0.24305555555555555, "grad_norm": 0.40965545177459717, "learning_rate": 0.0003935737398872289, "loss": 0.5856, "step": 10920 }, { "epoch": 0.2432781339031339, "grad_norm": 0.5977027416229248, "learning_rate": 0.0003935620111816151, "loss": 0.6046, "step": 10930 }, { "epoch": 0.24350071225071226, "grad_norm": 0.8471280932426453, "learning_rate": 0.0003935502719576833, "loss": 0.7611, "step": 10940 }, { "epoch": 0.2437232905982906, "grad_norm": 0.8705294728279114, "learning_rate": 0.00039353852221607125, "loss": 0.5893, "step": 10950 }, { "epoch": 0.24394586894586895, "grad_norm": 0.43957415223121643, "learning_rate": 0.0003935267619574174, "loss": 0.6643, "step": 10960 }, { "epoch": 0.2441684472934473, "grad_norm": 0.7031153440475464, "learning_rate": 0.0003935149911823609, "loss": 0.7179, "step": 10970 }, { "epoch": 0.24439102564102563, "grad_norm": 0.7570757269859314, "learning_rate": 0.00039350320989154134, "loss": 0.5547, "step": 10980 }, { "epoch": 0.244613603988604, "grad_norm": 0.69059157371521, "learning_rate": 0.000393491418085599, "loss": 0.5241, "step": 10990 }, { "epoch": 0.24483618233618235, "grad_norm": 0.6872458457946777, "learning_rate": 0.00039347961576517455, "loss": 0.6495, "step": 11000 }, { "epoch": 0.24505876068376067, "grad_norm": 0.5548238158226013, "learning_rate": 0.00039346780293090947, "loss": 0.6809, "step": 11010 }, { "epoch": 0.24528133903133903, "grad_norm": 0.7153156399726868, "learning_rate": 0.0003934559795834455, "loss": 0.5655, "step": 11020 }, { "epoch": 0.24550391737891739, "grad_norm": 0.8266432285308838, "learning_rate": 0.0003934441457234253, "loss": 0.7458, "step": 11030 }, { "epoch": 0.24572649572649571, "grad_norm": 0.4854261875152588, "learning_rate": 0.0003934323013514918, "loss": 0.758, "step": 11040 }, { "epoch": 0.24594907407407407, "grad_norm": 0.7041235566139221, "learning_rate": 0.00039342044646828873, "loss": 0.6934, "step": 11050 }, { "epoch": 0.24617165242165243, "grad_norm": 0.4012024402618408, "learning_rate": 0.0003934085810744603, "loss": 0.5695, "step": 11060 }, { "epoch": 0.24639423076923078, "grad_norm": 0.6638675332069397, "learning_rate": 0.00039339670517065116, "loss": 0.7782, "step": 11070 }, { "epoch": 0.2466168091168091, "grad_norm": 0.7643360495567322, "learning_rate": 0.00039338481875750677, "loss": 0.582, "step": 11080 }, { "epoch": 0.24683938746438747, "grad_norm": 0.5708128213882446, "learning_rate": 0.0003933729218356731, "loss": 0.57, "step": 11090 }, { "epoch": 0.24706196581196582, "grad_norm": 0.5449902415275574, "learning_rate": 0.00039336101440579644, "loss": 0.6559, "step": 11100 }, { "epoch": 0.24728454415954415, "grad_norm": 0.8565914630889893, "learning_rate": 0.00039334909646852396, "loss": 0.5984, "step": 11110 }, { "epoch": 0.2475071225071225, "grad_norm": 0.7144999504089355, "learning_rate": 0.00039333716802450333, "loss": 0.816, "step": 11120 }, { "epoch": 0.24772970085470086, "grad_norm": 0.5774109363555908, "learning_rate": 0.00039332522907438276, "loss": 0.4983, "step": 11130 }, { "epoch": 0.2479522792022792, "grad_norm": 0.4489073157310486, "learning_rate": 0.00039331327961881097, "loss": 0.7048, "step": 11140 }, { "epoch": 0.24817485754985755, "grad_norm": 0.572086751461029, "learning_rate": 0.0003933013196584373, "loss": 0.6814, "step": 11150 }, { "epoch": 0.2483974358974359, "grad_norm": 0.9465423822402954, "learning_rate": 0.00039328934919391164, "loss": 0.6921, "step": 11160 }, { "epoch": 0.24862001424501423, "grad_norm": 0.7562800645828247, "learning_rate": 0.0003932773682258845, "loss": 0.5815, "step": 11170 }, { "epoch": 0.2488425925925926, "grad_norm": 0.7506551146507263, "learning_rate": 0.00039326537675500703, "loss": 0.567, "step": 11180 }, { "epoch": 0.24906517094017094, "grad_norm": 0.5940730571746826, "learning_rate": 0.0003932533747819306, "loss": 0.6497, "step": 11190 }, { "epoch": 0.2492877492877493, "grad_norm": 0.6203614473342896, "learning_rate": 0.00039324136230730776, "loss": 0.6522, "step": 11200 }, { "epoch": 0.24951032763532763, "grad_norm": 0.9097982048988342, "learning_rate": 0.00039322933933179106, "loss": 0.6082, "step": 11210 }, { "epoch": 0.24973290598290598, "grad_norm": 0.9215477705001831, "learning_rate": 0.00039321730585603387, "loss": 0.6086, "step": 11220 }, { "epoch": 0.24995548433048434, "grad_norm": 0.3814356327056885, "learning_rate": 0.0003932052618806901, "loss": 0.6829, "step": 11230 }, { "epoch": 0.2501780626780627, "grad_norm": 0.8164243698120117, "learning_rate": 0.00039319320740641416, "loss": 0.6757, "step": 11240 }, { "epoch": 0.250400641025641, "grad_norm": 0.6056912541389465, "learning_rate": 0.00039318114243386124, "loss": 0.5695, "step": 11250 }, { "epoch": 0.25062321937321935, "grad_norm": 0.7039971351623535, "learning_rate": 0.00039316906696368694, "loss": 0.6198, "step": 11260 }, { "epoch": 0.25084579772079774, "grad_norm": 0.7455176115036011, "learning_rate": 0.0003931569809965473, "loss": 0.5613, "step": 11270 }, { "epoch": 0.25106837606837606, "grad_norm": 0.5015754103660583, "learning_rate": 0.0003931448845330993, "loss": 0.61, "step": 11280 }, { "epoch": 0.2512909544159544, "grad_norm": 1.0012961626052856, "learning_rate": 0.00039313277757400006, "loss": 0.6641, "step": 11290 }, { "epoch": 0.2515135327635328, "grad_norm": 0.6197919845581055, "learning_rate": 0.00039312066011990763, "loss": 0.6363, "step": 11300 }, { "epoch": 0.2517361111111111, "grad_norm": 0.5845314264297485, "learning_rate": 0.00039310853217148047, "loss": 0.6148, "step": 11310 }, { "epoch": 0.25195868945868943, "grad_norm": 0.5891401171684265, "learning_rate": 0.00039309639372937756, "loss": 0.6119, "step": 11320 }, { "epoch": 0.2521812678062678, "grad_norm": 0.5159587264060974, "learning_rate": 0.00039308424479425846, "loss": 0.725, "step": 11330 }, { "epoch": 0.25240384615384615, "grad_norm": 0.6200169324874878, "learning_rate": 0.00039307208536678353, "loss": 0.5924, "step": 11340 }, { "epoch": 0.25262642450142453, "grad_norm": 0.8009082674980164, "learning_rate": 0.00039305991544761335, "loss": 0.6477, "step": 11350 }, { "epoch": 0.25284900284900286, "grad_norm": 0.5633246302604675, "learning_rate": 0.00039304773503740935, "loss": 0.6603, "step": 11360 }, { "epoch": 0.2530715811965812, "grad_norm": 0.7888376116752625, "learning_rate": 0.00039303554413683343, "loss": 0.6561, "step": 11370 }, { "epoch": 0.25329415954415957, "grad_norm": 0.8472663760185242, "learning_rate": 0.00039302334274654793, "loss": 0.5612, "step": 11380 }, { "epoch": 0.2535167378917379, "grad_norm": 0.7532154321670532, "learning_rate": 0.00039301113086721607, "loss": 0.534, "step": 11390 }, { "epoch": 0.2537393162393162, "grad_norm": 0.7994014024734497, "learning_rate": 0.0003929989084995013, "loss": 0.6819, "step": 11400 }, { "epoch": 0.2539618945868946, "grad_norm": 0.7608009576797485, "learning_rate": 0.0003929866756440679, "loss": 0.6416, "step": 11410 }, { "epoch": 0.25418447293447294, "grad_norm": 0.620042085647583, "learning_rate": 0.0003929744323015805, "loss": 0.6459, "step": 11420 }, { "epoch": 0.25440705128205127, "grad_norm": 0.8690176010131836, "learning_rate": 0.00039296217847270445, "loss": 0.5823, "step": 11430 }, { "epoch": 0.25462962962962965, "grad_norm": 1.0111145973205566, "learning_rate": 0.00039294991415810574, "loss": 0.6488, "step": 11440 }, { "epoch": 0.254852207977208, "grad_norm": 0.5568084120750427, "learning_rate": 0.0003929376393584506, "loss": 0.6298, "step": 11450 }, { "epoch": 0.2550747863247863, "grad_norm": 0.5776306986808777, "learning_rate": 0.0003929253540744063, "loss": 0.7216, "step": 11460 }, { "epoch": 0.2552973646723647, "grad_norm": 0.7077421545982361, "learning_rate": 0.00039291305830664033, "loss": 0.6168, "step": 11470 }, { "epoch": 0.255519943019943, "grad_norm": 0.6776533126831055, "learning_rate": 0.0003929007520558208, "loss": 0.598, "step": 11480 }, { "epoch": 0.25574252136752135, "grad_norm": 0.6140464544296265, "learning_rate": 0.0003928884353226165, "loss": 0.7212, "step": 11490 }, { "epoch": 0.25596509971509973, "grad_norm": 0.8022708296775818, "learning_rate": 0.00039287610810769674, "loss": 0.6462, "step": 11500 }, { "epoch": 0.25618767806267806, "grad_norm": 0.6690821647644043, "learning_rate": 0.00039286377041173134, "loss": 0.6317, "step": 11510 }, { "epoch": 0.2564102564102564, "grad_norm": 0.8081620931625366, "learning_rate": 0.0003928514222353908, "loss": 0.7468, "step": 11520 }, { "epoch": 0.25663283475783477, "grad_norm": 0.39370104670524597, "learning_rate": 0.0003928390635793461, "loss": 0.672, "step": 11530 }, { "epoch": 0.2568554131054131, "grad_norm": 0.7194493412971497, "learning_rate": 0.0003928266944442688, "loss": 0.7313, "step": 11540 }, { "epoch": 0.25707799145299143, "grad_norm": 0.8301408886909485, "learning_rate": 0.000392814314830831, "loss": 0.546, "step": 11550 }, { "epoch": 0.2573005698005698, "grad_norm": 0.5671295523643494, "learning_rate": 0.00039280192473970557, "loss": 0.7288, "step": 11560 }, { "epoch": 0.25752314814814814, "grad_norm": 0.942840576171875, "learning_rate": 0.00039278952417156574, "loss": 0.6642, "step": 11570 }, { "epoch": 0.25774572649572647, "grad_norm": 0.6493533253669739, "learning_rate": 0.0003927771131270853, "loss": 0.6634, "step": 11580 }, { "epoch": 0.25796830484330485, "grad_norm": 0.7302709221839905, "learning_rate": 0.0003927646916069387, "loss": 0.5299, "step": 11590 }, { "epoch": 0.2581908831908832, "grad_norm": 0.7260543704032898, "learning_rate": 0.000392752259611801, "loss": 0.8042, "step": 11600 }, { "epoch": 0.25841346153846156, "grad_norm": 0.6935957670211792, "learning_rate": 0.0003927398171423477, "loss": 0.5939, "step": 11610 }, { "epoch": 0.2586360398860399, "grad_norm": 1.0461655855178833, "learning_rate": 0.00039272736419925495, "loss": 0.6719, "step": 11620 }, { "epoch": 0.2588586182336182, "grad_norm": 0.7319514155387878, "learning_rate": 0.00039271490078319945, "loss": 0.6549, "step": 11630 }, { "epoch": 0.2590811965811966, "grad_norm": 0.5853979587554932, "learning_rate": 0.0003927024268948585, "loss": 0.6174, "step": 11640 }, { "epoch": 0.25930377492877493, "grad_norm": 0.4176386594772339, "learning_rate": 0.00039268994253490987, "loss": 0.5942, "step": 11650 }, { "epoch": 0.25952635327635326, "grad_norm": 0.6766052842140198, "learning_rate": 0.000392677447704032, "loss": 0.6723, "step": 11660 }, { "epoch": 0.25974893162393164, "grad_norm": 0.6937527060508728, "learning_rate": 0.000392664942402904, "loss": 0.7434, "step": 11670 }, { "epoch": 0.25997150997151, "grad_norm": 0.32973021268844604, "learning_rate": 0.0003926524266322052, "loss": 0.5382, "step": 11680 }, { "epoch": 0.2601940883190883, "grad_norm": 0.8505682349205017, "learning_rate": 0.0003926399003926159, "loss": 0.6913, "step": 11690 }, { "epoch": 0.2604166666666667, "grad_norm": 0.7429829835891724, "learning_rate": 0.00039262736368481663, "loss": 0.5692, "step": 11700 }, { "epoch": 0.260639245014245, "grad_norm": 0.8143709897994995, "learning_rate": 0.0003926148165094888, "loss": 0.6598, "step": 11710 }, { "epoch": 0.26086182336182334, "grad_norm": 0.7106712460517883, "learning_rate": 0.0003926022588673142, "loss": 0.6684, "step": 11720 }, { "epoch": 0.2610844017094017, "grad_norm": 1.1324150562286377, "learning_rate": 0.0003925896907589751, "loss": 0.647, "step": 11730 }, { "epoch": 0.26130698005698005, "grad_norm": 0.43803641200065613, "learning_rate": 0.0003925771121851545, "loss": 0.6768, "step": 11740 }, { "epoch": 0.2615295584045584, "grad_norm": 0.8513333201408386, "learning_rate": 0.00039256452314653605, "loss": 0.6525, "step": 11750 }, { "epoch": 0.26175213675213677, "grad_norm": 0.8602158427238464, "learning_rate": 0.0003925519236438038, "loss": 0.6689, "step": 11760 }, { "epoch": 0.2619747150997151, "grad_norm": 0.41731908917427063, "learning_rate": 0.0003925393136776423, "loss": 0.5316, "step": 11770 }, { "epoch": 0.2621972934472934, "grad_norm": 0.863296389579773, "learning_rate": 0.0003925266932487369, "loss": 0.7248, "step": 11780 }, { "epoch": 0.2624198717948718, "grad_norm": 0.6863886117935181, "learning_rate": 0.00039251406235777346, "loss": 0.5437, "step": 11790 }, { "epoch": 0.26264245014245013, "grad_norm": 0.6773598194122314, "learning_rate": 0.00039250142100543815, "loss": 0.6482, "step": 11800 }, { "epoch": 0.26286502849002846, "grad_norm": 0.6856159567832947, "learning_rate": 0.0003924887691924181, "loss": 0.7223, "step": 11810 }, { "epoch": 0.26308760683760685, "grad_norm": 0.770422637462616, "learning_rate": 0.00039247610691940074, "loss": 0.652, "step": 11820 }, { "epoch": 0.2633101851851852, "grad_norm": 0.5794972777366638, "learning_rate": 0.00039246343418707417, "loss": 0.6338, "step": 11830 }, { "epoch": 0.26353276353276356, "grad_norm": 0.6820332407951355, "learning_rate": 0.000392450750996127, "loss": 0.7889, "step": 11840 }, { "epoch": 0.2637553418803419, "grad_norm": 0.7752724885940552, "learning_rate": 0.0003924380573472485, "loss": 0.7358, "step": 11850 }, { "epoch": 0.2639779202279202, "grad_norm": 0.606903076171875, "learning_rate": 0.0003924253532411284, "loss": 0.6442, "step": 11860 }, { "epoch": 0.2642004985754986, "grad_norm": 0.4325622320175171, "learning_rate": 0.00039241263867845715, "loss": 0.6324, "step": 11870 }, { "epoch": 0.2644230769230769, "grad_norm": 0.7513414025306702, "learning_rate": 0.00039239991365992553, "loss": 0.5955, "step": 11880 }, { "epoch": 0.26464565527065526, "grad_norm": 0.9659456610679626, "learning_rate": 0.0003923871781862251, "loss": 0.8048, "step": 11890 }, { "epoch": 0.26486823361823364, "grad_norm": 0.733268141746521, "learning_rate": 0.00039237443225804795, "loss": 0.6318, "step": 11900 }, { "epoch": 0.26509081196581197, "grad_norm": 0.7908543348312378, "learning_rate": 0.0003923616758760867, "loss": 0.627, "step": 11910 }, { "epoch": 0.2653133903133903, "grad_norm": 0.7832129597663879, "learning_rate": 0.00039234890904103444, "loss": 0.7449, "step": 11920 }, { "epoch": 0.2655359686609687, "grad_norm": 0.6216659545898438, "learning_rate": 0.000392336131753585, "loss": 0.579, "step": 11930 }, { "epoch": 0.265758547008547, "grad_norm": 0.5633769631385803, "learning_rate": 0.0003923233440144327, "loss": 0.6811, "step": 11940 }, { "epoch": 0.26598112535612534, "grad_norm": 0.8467236757278442, "learning_rate": 0.00039231054582427243, "loss": 0.6787, "step": 11950 }, { "epoch": 0.2662037037037037, "grad_norm": 0.5862749218940735, "learning_rate": 0.00039229773718379974, "loss": 0.5676, "step": 11960 }, { "epoch": 0.26642628205128205, "grad_norm": 0.7872330546379089, "learning_rate": 0.00039228491809371055, "loss": 0.6813, "step": 11970 }, { "epoch": 0.2666488603988604, "grad_norm": 0.535454273223877, "learning_rate": 0.0003922720885547015, "loss": 0.5713, "step": 11980 }, { "epoch": 0.26687143874643876, "grad_norm": 0.6369079351425171, "learning_rate": 0.0003922592485674697, "loss": 0.6077, "step": 11990 }, { "epoch": 0.2670940170940171, "grad_norm": 0.4691181778907776, "learning_rate": 0.00039224639813271306, "loss": 0.5685, "step": 12000 }, { "epoch": 0.2673165954415954, "grad_norm": 0.8555895686149597, "learning_rate": 0.0003922335372511297, "loss": 0.6186, "step": 12010 }, { "epoch": 0.2675391737891738, "grad_norm": 0.7896236777305603, "learning_rate": 0.00039222066592341855, "loss": 0.6436, "step": 12020 }, { "epoch": 0.26776175213675213, "grad_norm": 0.651438295841217, "learning_rate": 0.000392207784150279, "loss": 0.6381, "step": 12030 }, { "epoch": 0.26798433048433046, "grad_norm": 0.7535800933837891, "learning_rate": 0.00039219489193241124, "loss": 0.6344, "step": 12040 }, { "epoch": 0.26820690883190884, "grad_norm": 0.7681666612625122, "learning_rate": 0.0003921819892705156, "loss": 0.6896, "step": 12050 }, { "epoch": 0.26842948717948717, "grad_norm": 0.8771412968635559, "learning_rate": 0.00039216907616529336, "loss": 0.4947, "step": 12060 }, { "epoch": 0.26865206552706555, "grad_norm": 0.7458842396736145, "learning_rate": 0.00039215615261744625, "loss": 0.5895, "step": 12070 }, { "epoch": 0.2688746438746439, "grad_norm": 0.7990144491195679, "learning_rate": 0.0003921432186276765, "loss": 0.5073, "step": 12080 }, { "epoch": 0.2690972222222222, "grad_norm": 0.8297378420829773, "learning_rate": 0.0003921302741966869, "loss": 0.7029, "step": 12090 }, { "epoch": 0.2693198005698006, "grad_norm": 0.7059113383293152, "learning_rate": 0.000392117319325181, "loss": 0.8696, "step": 12100 }, { "epoch": 0.2695423789173789, "grad_norm": 0.514255702495575, "learning_rate": 0.0003921043540138626, "loss": 0.6366, "step": 12110 }, { "epoch": 0.26976495726495725, "grad_norm": 0.6856957674026489, "learning_rate": 0.00039209137826343637, "loss": 0.5064, "step": 12120 }, { "epoch": 0.26998753561253563, "grad_norm": 0.6941282749176025, "learning_rate": 0.00039207839207460745, "loss": 0.522, "step": 12130 }, { "epoch": 0.27021011396011396, "grad_norm": 0.8638391494750977, "learning_rate": 0.0003920653954480813, "loss": 0.7316, "step": 12140 }, { "epoch": 0.2704326923076923, "grad_norm": 0.792180597782135, "learning_rate": 0.0003920523883845645, "loss": 0.6044, "step": 12150 }, { "epoch": 0.2706552706552707, "grad_norm": 0.9648000597953796, "learning_rate": 0.00039203937088476366, "loss": 0.6755, "step": 12160 }, { "epoch": 0.270877849002849, "grad_norm": 0.6021059155464172, "learning_rate": 0.00039202634294938614, "loss": 0.6504, "step": 12170 }, { "epoch": 0.27110042735042733, "grad_norm": 0.8093569278717041, "learning_rate": 0.00039201330457914, "loss": 0.8009, "step": 12180 }, { "epoch": 0.2713230056980057, "grad_norm": 0.5667636394500732, "learning_rate": 0.0003920002557747337, "loss": 0.5784, "step": 12190 }, { "epoch": 0.27154558404558404, "grad_norm": 0.672580361366272, "learning_rate": 0.00039198719653687624, "loss": 0.6791, "step": 12200 }, { "epoch": 0.27176816239316237, "grad_norm": 0.520078718662262, "learning_rate": 0.0003919741268662774, "loss": 0.5996, "step": 12210 }, { "epoch": 0.27199074074074076, "grad_norm": 0.5578593015670776, "learning_rate": 0.0003919610467636474, "loss": 0.6789, "step": 12220 }, { "epoch": 0.2722133190883191, "grad_norm": 1.0336942672729492, "learning_rate": 0.0003919479562296969, "loss": 0.7435, "step": 12230 }, { "epoch": 0.2724358974358974, "grad_norm": 0.7458341717720032, "learning_rate": 0.00039193485526513734, "loss": 0.623, "step": 12240 }, { "epoch": 0.2726584757834758, "grad_norm": 0.44324880838394165, "learning_rate": 0.0003919217438706807, "loss": 0.5499, "step": 12250 }, { "epoch": 0.2728810541310541, "grad_norm": 0.5924796462059021, "learning_rate": 0.00039190862204703926, "loss": 0.5998, "step": 12260 }, { "epoch": 0.27310363247863245, "grad_norm": 0.6144111752510071, "learning_rate": 0.00039189548979492626, "loss": 0.5662, "step": 12270 }, { "epoch": 0.27332621082621084, "grad_norm": 0.5802494287490845, "learning_rate": 0.0003918823471150552, "loss": 0.5966, "step": 12280 }, { "epoch": 0.27354878917378916, "grad_norm": 0.6217107772827148, "learning_rate": 0.0003918691940081404, "loss": 0.6996, "step": 12290 }, { "epoch": 0.27377136752136755, "grad_norm": 0.8809908032417297, "learning_rate": 0.0003918560304748965, "loss": 0.6406, "step": 12300 }, { "epoch": 0.2739939458689459, "grad_norm": 3.5845541954040527, "learning_rate": 0.0003918428565160388, "loss": 0.6467, "step": 12310 }, { "epoch": 0.2742165242165242, "grad_norm": 0.366034597158432, "learning_rate": 0.00039182967213228327, "loss": 0.5746, "step": 12320 }, { "epoch": 0.2744391025641026, "grad_norm": 0.6246192455291748, "learning_rate": 0.0003918164773243463, "loss": 0.6412, "step": 12330 }, { "epoch": 0.2746616809116809, "grad_norm": 1.0219444036483765, "learning_rate": 0.0003918032720929449, "loss": 0.5756, "step": 12340 }, { "epoch": 0.27488425925925924, "grad_norm": 0.9017455577850342, "learning_rate": 0.0003917900564387967, "loss": 0.66, "step": 12350 }, { "epoch": 0.27510683760683763, "grad_norm": 0.8596403002738953, "learning_rate": 0.00039177683036261985, "loss": 0.712, "step": 12360 }, { "epoch": 0.27532941595441596, "grad_norm": 0.7327362895011902, "learning_rate": 0.000391763593865133, "loss": 0.7019, "step": 12370 }, { "epoch": 0.2755519943019943, "grad_norm": 0.42158403992652893, "learning_rate": 0.0003917503469470555, "loss": 0.6587, "step": 12380 }, { "epoch": 0.27577457264957267, "grad_norm": 0.9592531323432922, "learning_rate": 0.00039173708960910716, "loss": 0.6499, "step": 12390 }, { "epoch": 0.275997150997151, "grad_norm": 1.0942060947418213, "learning_rate": 0.0003917238218520084, "loss": 0.692, "step": 12400 }, { "epoch": 0.2762197293447293, "grad_norm": 0.8639572858810425, "learning_rate": 0.0003917105436764803, "loss": 0.6718, "step": 12410 }, { "epoch": 0.2764423076923077, "grad_norm": 0.682167112827301, "learning_rate": 0.0003916972550832442, "loss": 0.6555, "step": 12420 }, { "epoch": 0.27666488603988604, "grad_norm": 0.6073299646377563, "learning_rate": 0.0003916839560730224, "loss": 0.5755, "step": 12430 }, { "epoch": 0.27688746438746437, "grad_norm": 0.8207989931106567, "learning_rate": 0.0003916706466465375, "loss": 0.6832, "step": 12440 }, { "epoch": 0.27711004273504275, "grad_norm": 0.5853878259658813, "learning_rate": 0.00039165732680451266, "loss": 0.6955, "step": 12450 }, { "epoch": 0.2773326210826211, "grad_norm": 0.6994444131851196, "learning_rate": 0.0003916439965476718, "loss": 0.591, "step": 12460 }, { "epoch": 0.2775551994301994, "grad_norm": 0.5923715233802795, "learning_rate": 0.0003916306558767394, "loss": 0.5649, "step": 12470 }, { "epoch": 0.2777777777777778, "grad_norm": 0.698859691619873, "learning_rate": 0.00039161730479244023, "loss": 0.7315, "step": 12480 }, { "epoch": 0.2780003561253561, "grad_norm": 0.554038405418396, "learning_rate": 0.0003916039432954998, "loss": 0.5093, "step": 12490 }, { "epoch": 0.27822293447293445, "grad_norm": 0.8297765851020813, "learning_rate": 0.00039159057138664425, "loss": 0.5514, "step": 12500 }, { "epoch": 0.27844551282051283, "grad_norm": 0.6924365758895874, "learning_rate": 0.00039157718906660026, "loss": 0.5219, "step": 12510 }, { "epoch": 0.27866809116809116, "grad_norm": 0.5998541116714478, "learning_rate": 0.00039156379633609493, "loss": 0.6241, "step": 12520 }, { "epoch": 0.27889066951566954, "grad_norm": 0.4820443391799927, "learning_rate": 0.00039155039319585614, "loss": 0.7489, "step": 12530 }, { "epoch": 0.27911324786324787, "grad_norm": 0.48114636540412903, "learning_rate": 0.0003915369796466121, "loss": 0.4851, "step": 12540 }, { "epoch": 0.2793358262108262, "grad_norm": 0.8057467937469482, "learning_rate": 0.00039152355568909187, "loss": 0.6562, "step": 12550 }, { "epoch": 0.2795584045584046, "grad_norm": 0.6877124905586243, "learning_rate": 0.0003915101213240248, "loss": 0.6969, "step": 12560 }, { "epoch": 0.2797809829059829, "grad_norm": 0.775690495967865, "learning_rate": 0.00039149667655214094, "loss": 0.642, "step": 12570 }, { "epoch": 0.28000356125356124, "grad_norm": 0.80083829164505, "learning_rate": 0.0003914832213741709, "loss": 0.6972, "step": 12580 }, { "epoch": 0.2802261396011396, "grad_norm": 1.017709493637085, "learning_rate": 0.00039146975579084584, "loss": 0.6516, "step": 12590 }, { "epoch": 0.28044871794871795, "grad_norm": 0.7244583964347839, "learning_rate": 0.0003914562798028976, "loss": 0.6617, "step": 12600 }, { "epoch": 0.2806712962962963, "grad_norm": 0.58905029296875, "learning_rate": 0.0003914427934110583, "loss": 0.6822, "step": 12610 }, { "epoch": 0.28089387464387466, "grad_norm": 0.7043224573135376, "learning_rate": 0.0003914292966160609, "loss": 0.5562, "step": 12620 }, { "epoch": 0.281116452991453, "grad_norm": 0.8032645583152771, "learning_rate": 0.00039141578941863885, "loss": 0.5138, "step": 12630 }, { "epoch": 0.2813390313390313, "grad_norm": 0.8150473833084106, "learning_rate": 0.0003914022718195261, "loss": 0.6488, "step": 12640 }, { "epoch": 0.2815616096866097, "grad_norm": 0.8619401454925537, "learning_rate": 0.00039138874381945715, "loss": 0.5323, "step": 12650 }, { "epoch": 0.28178418803418803, "grad_norm": 0.5432770848274231, "learning_rate": 0.00039137520541916726, "loss": 0.6153, "step": 12660 }, { "epoch": 0.28200676638176636, "grad_norm": 0.9435534477233887, "learning_rate": 0.00039136165661939195, "loss": 0.7069, "step": 12670 }, { "epoch": 0.28222934472934474, "grad_norm": 0.5666217803955078, "learning_rate": 0.0003913480974208676, "loss": 0.5967, "step": 12680 }, { "epoch": 0.2824519230769231, "grad_norm": 1.044399619102478, "learning_rate": 0.00039133452782433097, "loss": 0.6681, "step": 12690 }, { "epoch": 0.2826745014245014, "grad_norm": 0.5483806133270264, "learning_rate": 0.0003913209478305196, "loss": 0.6281, "step": 12700 }, { "epoch": 0.2828970797720798, "grad_norm": 0.8065449595451355, "learning_rate": 0.00039130735744017113, "loss": 0.5793, "step": 12710 }, { "epoch": 0.2831196581196581, "grad_norm": 0.5208731889724731, "learning_rate": 0.00039129375665402434, "loss": 0.7103, "step": 12720 }, { "epoch": 0.28334223646723644, "grad_norm": 0.821640133857727, "learning_rate": 0.0003912801454728181, "loss": 0.7682, "step": 12730 }, { "epoch": 0.2835648148148148, "grad_norm": 0.6433324813842773, "learning_rate": 0.0003912665238972923, "loss": 0.6647, "step": 12740 }, { "epoch": 0.28378739316239315, "grad_norm": 0.7271362543106079, "learning_rate": 0.000391252891928187, "loss": 0.5601, "step": 12750 }, { "epoch": 0.28400997150997154, "grad_norm": 0.5222648978233337, "learning_rate": 0.00039123924956624293, "loss": 0.6467, "step": 12760 }, { "epoch": 0.28423254985754987, "grad_norm": 0.7300889492034912, "learning_rate": 0.00039122559681220153, "loss": 0.5843, "step": 12770 }, { "epoch": 0.2844551282051282, "grad_norm": 0.6423002481460571, "learning_rate": 0.0003912119336668046, "loss": 0.6337, "step": 12780 }, { "epoch": 0.2846777065527066, "grad_norm": 0.568601667881012, "learning_rate": 0.00039119826013079466, "loss": 0.6153, "step": 12790 }, { "epoch": 0.2849002849002849, "grad_norm": 0.9605054259300232, "learning_rate": 0.0003911845762049148, "loss": 0.6467, "step": 12800 }, { "epoch": 0.28512286324786323, "grad_norm": 0.6418153643608093, "learning_rate": 0.0003911708818899086, "loss": 0.5672, "step": 12810 }, { "epoch": 0.2853454415954416, "grad_norm": 0.5108728408813477, "learning_rate": 0.00039115717718652006, "loss": 0.7055, "step": 12820 }, { "epoch": 0.28556801994301995, "grad_norm": 0.8929889798164368, "learning_rate": 0.0003911434620954941, "loss": 0.6672, "step": 12830 }, { "epoch": 0.2857905982905983, "grad_norm": 0.8994865417480469, "learning_rate": 0.00039112973661757584, "loss": 0.7828, "step": 12840 }, { "epoch": 0.28601317663817666, "grad_norm": 0.5077641010284424, "learning_rate": 0.00039111600075351135, "loss": 0.7327, "step": 12850 }, { "epoch": 0.286235754985755, "grad_norm": 0.6523764729499817, "learning_rate": 0.00039110225450404685, "loss": 0.7155, "step": 12860 }, { "epoch": 0.2864583333333333, "grad_norm": 0.5647468566894531, "learning_rate": 0.0003910884978699294, "loss": 0.5385, "step": 12870 }, { "epoch": 0.2866809116809117, "grad_norm": 0.6217672228813171, "learning_rate": 0.00039107473085190654, "loss": 0.4422, "step": 12880 }, { "epoch": 0.28690349002849, "grad_norm": 0.7185326814651489, "learning_rate": 0.00039106095345072645, "loss": 0.7382, "step": 12890 }, { "epoch": 0.28712606837606836, "grad_norm": 0.6368201971054077, "learning_rate": 0.0003910471656671376, "loss": 0.6403, "step": 12900 }, { "epoch": 0.28734864672364674, "grad_norm": 0.6725485324859619, "learning_rate": 0.00039103336750188954, "loss": 0.4992, "step": 12910 }, { "epoch": 0.28757122507122507, "grad_norm": 0.7353759407997131, "learning_rate": 0.0003910195589557318, "loss": 0.7156, "step": 12920 }, { "epoch": 0.2877938034188034, "grad_norm": 0.4989571273326874, "learning_rate": 0.0003910057400294149, "loss": 0.6063, "step": 12930 }, { "epoch": 0.2880163817663818, "grad_norm": 0.48165708780288696, "learning_rate": 0.00039099191072368964, "loss": 0.6651, "step": 12940 }, { "epoch": 0.2882389601139601, "grad_norm": 0.8041085004806519, "learning_rate": 0.0003909780710393077, "loss": 0.6656, "step": 12950 }, { "epoch": 0.28846153846153844, "grad_norm": 0.6328619122505188, "learning_rate": 0.00039096422097702096, "loss": 0.5947, "step": 12960 }, { "epoch": 0.2886841168091168, "grad_norm": 0.9309022426605225, "learning_rate": 0.00039095036053758215, "loss": 0.6813, "step": 12970 }, { "epoch": 0.28890669515669515, "grad_norm": 0.703187108039856, "learning_rate": 0.0003909364897217445, "loss": 0.5864, "step": 12980 }, { "epoch": 0.28912927350427353, "grad_norm": 0.8175052404403687, "learning_rate": 0.0003909226085302616, "loss": 0.6581, "step": 12990 }, { "epoch": 0.28935185185185186, "grad_norm": 1.1442813873291016, "learning_rate": 0.00039090871696388787, "loss": 0.7457, "step": 13000 }, { "epoch": 0.2895744301994302, "grad_norm": 0.6309537887573242, "learning_rate": 0.0003908948150233782, "loss": 0.7326, "step": 13010 }, { "epoch": 0.28979700854700857, "grad_norm": 0.7221547365188599, "learning_rate": 0.0003908809027094879, "loss": 0.5485, "step": 13020 }, { "epoch": 0.2900195868945869, "grad_norm": 0.5818193554878235, "learning_rate": 0.0003908669800229732, "loss": 0.6009, "step": 13030 }, { "epoch": 0.29024216524216523, "grad_norm": 0.5316341519355774, "learning_rate": 0.0003908530469645905, "loss": 0.5764, "step": 13040 }, { "epoch": 0.2904647435897436, "grad_norm": 0.6840851902961731, "learning_rate": 0.00039083910353509703, "loss": 0.6708, "step": 13050 }, { "epoch": 0.29068732193732194, "grad_norm": 0.9945989847183228, "learning_rate": 0.0003908251497352505, "loss": 0.6217, "step": 13060 }, { "epoch": 0.29090990028490027, "grad_norm": 0.6313449144363403, "learning_rate": 0.000390811185565809, "loss": 0.7558, "step": 13070 }, { "epoch": 0.29113247863247865, "grad_norm": 0.8493428230285645, "learning_rate": 0.0003907972110275315, "loss": 0.6076, "step": 13080 }, { "epoch": 0.291355056980057, "grad_norm": 0.5349470973014832, "learning_rate": 0.0003907832261211774, "loss": 0.7087, "step": 13090 }, { "epoch": 0.2915776353276353, "grad_norm": 0.5637439489364624, "learning_rate": 0.0003907692308475066, "loss": 0.6684, "step": 13100 }, { "epoch": 0.2918002136752137, "grad_norm": 0.7880241274833679, "learning_rate": 0.0003907552252072796, "loss": 0.7463, "step": 13110 }, { "epoch": 0.292022792022792, "grad_norm": 0.8356769680976868, "learning_rate": 0.00039074120920125756, "loss": 0.5763, "step": 13120 }, { "epoch": 0.29224537037037035, "grad_norm": 0.9101738333702087, "learning_rate": 0.000390727182830202, "loss": 0.6954, "step": 13130 }, { "epoch": 0.29246794871794873, "grad_norm": 0.49127423763275146, "learning_rate": 0.0003907131460948752, "loss": 0.6114, "step": 13140 }, { "epoch": 0.29269052706552706, "grad_norm": 0.4214502274990082, "learning_rate": 0.00039069909899604, "loss": 0.5386, "step": 13150 }, { "epoch": 0.2929131054131054, "grad_norm": 0.9073315262794495, "learning_rate": 0.0003906850415344595, "loss": 0.6927, "step": 13160 }, { "epoch": 0.2931356837606838, "grad_norm": 0.35063672065734863, "learning_rate": 0.0003906709737108978, "loss": 0.6315, "step": 13170 }, { "epoch": 0.2933582621082621, "grad_norm": 0.4961087107658386, "learning_rate": 0.0003906568955261193, "loss": 0.6075, "step": 13180 }, { "epoch": 0.29358084045584043, "grad_norm": 0.7128926515579224, "learning_rate": 0.00039064280698088903, "loss": 0.6542, "step": 13190 }, { "epoch": 0.2938034188034188, "grad_norm": 0.7185338735580444, "learning_rate": 0.0003906287080759726, "loss": 0.6805, "step": 13200 }, { "epoch": 0.29402599715099714, "grad_norm": 0.7085132002830505, "learning_rate": 0.00039061459881213606, "loss": 0.6612, "step": 13210 }, { "epoch": 0.29424857549857547, "grad_norm": 0.8242709040641785, "learning_rate": 0.00039060047919014623, "loss": 0.5647, "step": 13220 }, { "epoch": 0.29447115384615385, "grad_norm": 0.8153027296066284, "learning_rate": 0.0003905863492107702, "loss": 0.6909, "step": 13230 }, { "epoch": 0.2946937321937322, "grad_norm": 0.6219608783721924, "learning_rate": 0.00039057220887477615, "loss": 0.6192, "step": 13240 }, { "epoch": 0.29491631054131057, "grad_norm": 0.9387757182121277, "learning_rate": 0.00039055805818293205, "loss": 0.6474, "step": 13250 }, { "epoch": 0.2951388888888889, "grad_norm": 0.538312554359436, "learning_rate": 0.00039054389713600717, "loss": 0.5752, "step": 13260 }, { "epoch": 0.2953614672364672, "grad_norm": 0.6803849935531616, "learning_rate": 0.00039052972573477097, "loss": 0.4585, "step": 13270 }, { "epoch": 0.2955840455840456, "grad_norm": 0.5901275873184204, "learning_rate": 0.0003905155439799934, "loss": 0.6493, "step": 13280 }, { "epoch": 0.29580662393162394, "grad_norm": 0.49050992727279663, "learning_rate": 0.00039050135187244526, "loss": 0.6126, "step": 13290 }, { "epoch": 0.29602920227920226, "grad_norm": 0.4540439546108246, "learning_rate": 0.0003904871494128977, "loss": 0.4577, "step": 13300 }, { "epoch": 0.29625178062678065, "grad_norm": 0.7715014219284058, "learning_rate": 0.0003904729366021225, "loss": 0.5782, "step": 13310 }, { "epoch": 0.296474358974359, "grad_norm": 0.39345112442970276, "learning_rate": 0.000390458713440892, "loss": 0.5754, "step": 13320 }, { "epoch": 0.2966969373219373, "grad_norm": 0.7827655076980591, "learning_rate": 0.0003904444799299791, "loss": 0.7669, "step": 13330 }, { "epoch": 0.2969195156695157, "grad_norm": 0.7510057091712952, "learning_rate": 0.0003904302360701572, "loss": 0.4807, "step": 13340 }, { "epoch": 0.297142094017094, "grad_norm": 0.7219814658164978, "learning_rate": 0.0003904159818622005, "loss": 0.581, "step": 13350 }, { "epoch": 0.29736467236467234, "grad_norm": 0.6957118511199951, "learning_rate": 0.0003904017173068834, "loss": 0.5917, "step": 13360 }, { "epoch": 0.29758725071225073, "grad_norm": 0.4547037184238434, "learning_rate": 0.00039038744240498105, "loss": 0.5798, "step": 13370 }, { "epoch": 0.29780982905982906, "grad_norm": 0.6696369647979736, "learning_rate": 0.0003903731571572693, "loss": 0.6329, "step": 13380 }, { "epoch": 0.2980324074074074, "grad_norm": 0.6257852911949158, "learning_rate": 0.00039035886156452436, "loss": 0.6221, "step": 13390 }, { "epoch": 0.29825498575498577, "grad_norm": 0.8077847957611084, "learning_rate": 0.000390344555627523, "loss": 0.6277, "step": 13400 }, { "epoch": 0.2984775641025641, "grad_norm": 0.8844859600067139, "learning_rate": 0.0003903302393470426, "loss": 0.6486, "step": 13410 }, { "epoch": 0.2987001424501424, "grad_norm": 0.8643153309822083, "learning_rate": 0.0003903159127238613, "loss": 0.6019, "step": 13420 }, { "epoch": 0.2989227207977208, "grad_norm": 0.7332974672317505, "learning_rate": 0.0003903015757587574, "loss": 0.5546, "step": 13430 }, { "epoch": 0.29914529914529914, "grad_norm": 0.7666581869125366, "learning_rate": 0.0003902872284525102, "loss": 0.5489, "step": 13440 }, { "epoch": 0.29936787749287747, "grad_norm": 0.5811508893966675, "learning_rate": 0.0003902728708058991, "loss": 0.6611, "step": 13450 }, { "epoch": 0.29959045584045585, "grad_norm": 0.9579280018806458, "learning_rate": 0.00039025850281970454, "loss": 0.6002, "step": 13460 }, { "epoch": 0.2998130341880342, "grad_norm": 0.5538046956062317, "learning_rate": 0.00039024412449470717, "loss": 0.7622, "step": 13470 }, { "epoch": 0.30003561253561256, "grad_norm": 0.609813928604126, "learning_rate": 0.0003902297358316883, "loss": 0.6494, "step": 13480 }, { "epoch": 0.30003561253561256, "eval_loss": 0.6374564170837402, "eval_runtime": 337.424, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 13480 }, { "epoch": 0.3002581908831909, "grad_norm": 0.7406212687492371, "learning_rate": 0.00039021533683142984, "loss": 0.6261, "step": 13490 }, { "epoch": 0.3004807692307692, "grad_norm": 0.8207390308380127, "learning_rate": 0.00039020092749471433, "loss": 0.6493, "step": 13500 }, { "epoch": 0.3007033475783476, "grad_norm": 0.5859150886535645, "learning_rate": 0.00039018650782232466, "loss": 0.5998, "step": 13510 }, { "epoch": 0.30092592592592593, "grad_norm": 0.5716903805732727, "learning_rate": 0.0003901720778150445, "loss": 0.6489, "step": 13520 }, { "epoch": 0.30114850427350426, "grad_norm": 0.7157092094421387, "learning_rate": 0.0003901576374736579, "loss": 0.7296, "step": 13530 }, { "epoch": 0.30137108262108264, "grad_norm": 0.8354113101959229, "learning_rate": 0.0003901431867989496, "loss": 0.6373, "step": 13540 }, { "epoch": 0.30159366096866097, "grad_norm": 0.7195545434951782, "learning_rate": 0.00039012872579170495, "loss": 0.5963, "step": 13550 }, { "epoch": 0.3018162393162393, "grad_norm": 0.5032168030738831, "learning_rate": 0.00039011425445270966, "loss": 0.5855, "step": 13560 }, { "epoch": 0.3020388176638177, "grad_norm": 0.6268942356109619, "learning_rate": 0.0003900997727827501, "loss": 0.5637, "step": 13570 }, { "epoch": 0.302261396011396, "grad_norm": 0.7669327259063721, "learning_rate": 0.0003900852807826133, "loss": 0.5935, "step": 13580 }, { "epoch": 0.30248397435897434, "grad_norm": 0.6793097853660583, "learning_rate": 0.0003900707784530867, "loss": 0.696, "step": 13590 }, { "epoch": 0.3027065527065527, "grad_norm": 0.552746593952179, "learning_rate": 0.0003900562657949585, "loss": 0.6304, "step": 13600 }, { "epoch": 0.30292913105413105, "grad_norm": 0.6738058924674988, "learning_rate": 0.00039004174280901714, "loss": 0.6347, "step": 13610 }, { "epoch": 0.3031517094017094, "grad_norm": 0.8077411651611328, "learning_rate": 0.0003900272094960519, "loss": 0.7001, "step": 13620 }, { "epoch": 0.30337428774928776, "grad_norm": 0.806163489818573, "learning_rate": 0.00039001266585685253, "loss": 0.5889, "step": 13630 }, { "epoch": 0.3035968660968661, "grad_norm": 0.68730628490448, "learning_rate": 0.0003899981118922094, "loss": 0.6084, "step": 13640 }, { "epoch": 0.3038194444444444, "grad_norm": 0.7665461301803589, "learning_rate": 0.0003899835476029133, "loss": 0.5428, "step": 13650 }, { "epoch": 0.3040420227920228, "grad_norm": 0.9742176532745361, "learning_rate": 0.0003899689729897557, "loss": 0.6455, "step": 13660 }, { "epoch": 0.30426460113960113, "grad_norm": 0.6864128112792969, "learning_rate": 0.0003899543880535286, "loss": 0.6755, "step": 13670 }, { "epoch": 0.30448717948717946, "grad_norm": 0.8203412890434265, "learning_rate": 0.0003899397927950245, "loss": 0.6731, "step": 13680 }, { "epoch": 0.30470975783475784, "grad_norm": 0.6704636216163635, "learning_rate": 0.0003899251872150366, "loss": 0.5118, "step": 13690 }, { "epoch": 0.30493233618233617, "grad_norm": 0.6445683836936951, "learning_rate": 0.0003899105713143586, "loss": 0.5113, "step": 13700 }, { "epoch": 0.30515491452991456, "grad_norm": 0.7139549851417542, "learning_rate": 0.0003898959450937846, "loss": 0.5594, "step": 13710 }, { "epoch": 0.3053774928774929, "grad_norm": 0.9933525323867798, "learning_rate": 0.00038988130855410947, "loss": 0.6033, "step": 13720 }, { "epoch": 0.3056000712250712, "grad_norm": 0.6525797843933105, "learning_rate": 0.0003898666616961287, "loss": 0.6571, "step": 13730 }, { "epoch": 0.3058226495726496, "grad_norm": 0.6067349314689636, "learning_rate": 0.00038985200452063804, "loss": 0.7637, "step": 13740 }, { "epoch": 0.3060452279202279, "grad_norm": 0.7824680209159851, "learning_rate": 0.00038983733702843405, "loss": 0.5977, "step": 13750 }, { "epoch": 0.30626780626780625, "grad_norm": 0.3695496618747711, "learning_rate": 0.00038982265922031376, "loss": 0.4772, "step": 13760 }, { "epoch": 0.30649038461538464, "grad_norm": 0.5000861883163452, "learning_rate": 0.00038980797109707476, "loss": 0.6247, "step": 13770 }, { "epoch": 0.30671296296296297, "grad_norm": 0.9075541496276855, "learning_rate": 0.0003897932726595152, "loss": 0.6185, "step": 13780 }, { "epoch": 0.3069355413105413, "grad_norm": 0.8326929807662964, "learning_rate": 0.00038977856390843386, "loss": 0.6476, "step": 13790 }, { "epoch": 0.3071581196581197, "grad_norm": 0.47030866146087646, "learning_rate": 0.00038976384484462997, "loss": 0.6748, "step": 13800 }, { "epoch": 0.307380698005698, "grad_norm": 0.5289137363433838, "learning_rate": 0.0003897491154689034, "loss": 0.5827, "step": 13810 }, { "epoch": 0.30760327635327633, "grad_norm": 0.6581617593765259, "learning_rate": 0.00038973437578205465, "loss": 0.6515, "step": 13820 }, { "epoch": 0.3078258547008547, "grad_norm": 0.4768698513507843, "learning_rate": 0.0003897196257848845, "loss": 0.6406, "step": 13830 }, { "epoch": 0.30804843304843305, "grad_norm": 0.5576051473617554, "learning_rate": 0.0003897048654781946, "loss": 0.5606, "step": 13840 }, { "epoch": 0.3082710113960114, "grad_norm": 0.6507335901260376, "learning_rate": 0.00038969009486278705, "loss": 0.6384, "step": 13850 }, { "epoch": 0.30849358974358976, "grad_norm": 1.2971117496490479, "learning_rate": 0.0003896753139394644, "loss": 0.6221, "step": 13860 }, { "epoch": 0.3087161680911681, "grad_norm": 0.5396764874458313, "learning_rate": 0.00038966052270902993, "loss": 0.6386, "step": 13870 }, { "epoch": 0.3089387464387464, "grad_norm": 0.5200977325439453, "learning_rate": 0.0003896457211722874, "loss": 0.6257, "step": 13880 }, { "epoch": 0.3091613247863248, "grad_norm": 0.4826856851577759, "learning_rate": 0.00038963090933004114, "loss": 0.6103, "step": 13890 }, { "epoch": 0.3093839031339031, "grad_norm": 0.6851069927215576, "learning_rate": 0.000389616087183096, "loss": 0.5623, "step": 13900 }, { "epoch": 0.30960648148148145, "grad_norm": 0.5373729467391968, "learning_rate": 0.0003896012547322575, "loss": 0.6298, "step": 13910 }, { "epoch": 0.30982905982905984, "grad_norm": 0.4789588153362274, "learning_rate": 0.00038958641197833153, "loss": 0.4662, "step": 13920 }, { "epoch": 0.31005163817663817, "grad_norm": 0.6719768047332764, "learning_rate": 0.0003895715589221248, "loss": 0.7211, "step": 13930 }, { "epoch": 0.31027421652421655, "grad_norm": 0.9529721736907959, "learning_rate": 0.00038955669556444436, "loss": 0.655, "step": 13940 }, { "epoch": 0.3104967948717949, "grad_norm": 0.6303074359893799, "learning_rate": 0.00038954182190609784, "loss": 0.5846, "step": 13950 }, { "epoch": 0.3107193732193732, "grad_norm": 0.8553338050842285, "learning_rate": 0.0003895269379478936, "loss": 0.7021, "step": 13960 }, { "epoch": 0.3109419515669516, "grad_norm": 0.8133220672607422, "learning_rate": 0.00038951204369064047, "loss": 0.6132, "step": 13970 }, { "epoch": 0.3111645299145299, "grad_norm": 0.6394578218460083, "learning_rate": 0.00038949713913514767, "loss": 0.6557, "step": 13980 }, { "epoch": 0.31138710826210825, "grad_norm": 0.5929552912712097, "learning_rate": 0.0003894822242822252, "loss": 0.5875, "step": 13990 }, { "epoch": 0.31160968660968663, "grad_norm": 0.5577292442321777, "learning_rate": 0.0003894672991326835, "loss": 0.5826, "step": 14000 }, { "epoch": 0.31183226495726496, "grad_norm": 0.6399706602096558, "learning_rate": 0.00038945236368733376, "loss": 0.7175, "step": 14010 }, { "epoch": 0.3120548433048433, "grad_norm": 0.5782634019851685, "learning_rate": 0.0003894374179469874, "loss": 0.6909, "step": 14020 }, { "epoch": 0.31227742165242167, "grad_norm": 0.49260640144348145, "learning_rate": 0.00038942246191245674, "loss": 0.6483, "step": 14030 }, { "epoch": 0.3125, "grad_norm": 0.8788143396377563, "learning_rate": 0.0003894074955845544, "loss": 0.7238, "step": 14040 }, { "epoch": 0.31272257834757833, "grad_norm": 0.7116497755050659, "learning_rate": 0.0003893925189640936, "loss": 0.6048, "step": 14050 }, { "epoch": 0.3129451566951567, "grad_norm": 0.6584917306900024, "learning_rate": 0.00038937753205188844, "loss": 0.5464, "step": 14060 }, { "epoch": 0.31316773504273504, "grad_norm": 0.7588227391242981, "learning_rate": 0.00038936253484875304, "loss": 0.6912, "step": 14070 }, { "epoch": 0.31339031339031337, "grad_norm": 0.8676776885986328, "learning_rate": 0.00038934752735550254, "loss": 0.7545, "step": 14080 }, { "epoch": 0.31361289173789175, "grad_norm": 0.4870927631855011, "learning_rate": 0.0003893325095729524, "loss": 0.8075, "step": 14090 }, { "epoch": 0.3138354700854701, "grad_norm": 0.5967193841934204, "learning_rate": 0.0003893174815019186, "loss": 0.6192, "step": 14100 }, { "epoch": 0.3140580484330484, "grad_norm": 0.5258004665374756, "learning_rate": 0.00038930244314321793, "loss": 0.611, "step": 14110 }, { "epoch": 0.3142806267806268, "grad_norm": 0.7101635932922363, "learning_rate": 0.00038928739449766754, "loss": 0.7051, "step": 14120 }, { "epoch": 0.3145032051282051, "grad_norm": 0.6969490647315979, "learning_rate": 0.0003892723355660852, "loss": 0.5837, "step": 14130 }, { "epoch": 0.31472578347578345, "grad_norm": 0.8672996759414673, "learning_rate": 0.0003892572663492892, "loss": 0.7978, "step": 14140 }, { "epoch": 0.31494836182336183, "grad_norm": 0.6507262587547302, "learning_rate": 0.0003892421868480984, "loss": 0.4929, "step": 14150 }, { "epoch": 0.31517094017094016, "grad_norm": 0.6931272745132446, "learning_rate": 0.0003892270970633322, "loss": 0.7297, "step": 14160 }, { "epoch": 0.31539351851851855, "grad_norm": 0.6623298525810242, "learning_rate": 0.0003892119969958107, "loss": 0.5653, "step": 14170 }, { "epoch": 0.3156160968660969, "grad_norm": 0.7796519994735718, "learning_rate": 0.00038919688664635434, "loss": 0.5297, "step": 14180 }, { "epoch": 0.3158386752136752, "grad_norm": 0.8253905177116394, "learning_rate": 0.0003891817660157843, "loss": 0.6523, "step": 14190 }, { "epoch": 0.3160612535612536, "grad_norm": 0.7216755747795105, "learning_rate": 0.0003891666351049222, "loss": 0.6081, "step": 14200 }, { "epoch": 0.3162838319088319, "grad_norm": 0.788808286190033, "learning_rate": 0.00038915149391459034, "loss": 0.7106, "step": 14210 }, { "epoch": 0.31650641025641024, "grad_norm": 0.9707726240158081, "learning_rate": 0.0003891363424456114, "loss": 0.6154, "step": 14220 }, { "epoch": 0.3167289886039886, "grad_norm": 0.6529638767242432, "learning_rate": 0.0003891211806988088, "loss": 0.6526, "step": 14230 }, { "epoch": 0.31695156695156695, "grad_norm": 0.6194156408309937, "learning_rate": 0.0003891060086750064, "loss": 0.6328, "step": 14240 }, { "epoch": 0.3171741452991453, "grad_norm": 0.4574219882488251, "learning_rate": 0.0003890908263750287, "loss": 0.4735, "step": 14250 }, { "epoch": 0.31739672364672367, "grad_norm": 0.4415189325809479, "learning_rate": 0.0003890756337997007, "loss": 0.6351, "step": 14260 }, { "epoch": 0.317619301994302, "grad_norm": 0.8414084911346436, "learning_rate": 0.00038906043094984796, "loss": 0.7368, "step": 14270 }, { "epoch": 0.3178418803418803, "grad_norm": 0.8305835127830505, "learning_rate": 0.00038904521782629663, "loss": 0.5888, "step": 14280 }, { "epoch": 0.3180644586894587, "grad_norm": 0.43939080834388733, "learning_rate": 0.0003890299944298734, "loss": 0.6886, "step": 14290 }, { "epoch": 0.31828703703703703, "grad_norm": 0.6629655361175537, "learning_rate": 0.0003890147607614056, "loss": 0.6991, "step": 14300 }, { "epoch": 0.31850961538461536, "grad_norm": 0.549863874912262, "learning_rate": 0.00038899951682172084, "loss": 0.6437, "step": 14310 }, { "epoch": 0.31873219373219375, "grad_norm": 0.6703042387962341, "learning_rate": 0.0003889842626116477, "loss": 0.6647, "step": 14320 }, { "epoch": 0.3189547720797721, "grad_norm": 0.7996009588241577, "learning_rate": 0.000388968998132015, "loss": 0.478, "step": 14330 }, { "epoch": 0.3191773504273504, "grad_norm": 0.6925052404403687, "learning_rate": 0.0003889537233836523, "loss": 0.777, "step": 14340 }, { "epoch": 0.3193999287749288, "grad_norm": 0.7804181575775146, "learning_rate": 0.00038893843836738945, "loss": 0.6204, "step": 14350 }, { "epoch": 0.3196225071225071, "grad_norm": 0.677384078502655, "learning_rate": 0.0003889231430840573, "loss": 0.5158, "step": 14360 }, { "epoch": 0.31984508547008544, "grad_norm": 0.7306846976280212, "learning_rate": 0.00038890783753448683, "loss": 0.5729, "step": 14370 }, { "epoch": 0.32006766381766383, "grad_norm": 0.7445614337921143, "learning_rate": 0.0003888925217195099, "loss": 0.7354, "step": 14380 }, { "epoch": 0.32029024216524216, "grad_norm": 0.6261416077613831, "learning_rate": 0.0003888771956399586, "loss": 0.6795, "step": 14390 }, { "epoch": 0.32051282051282054, "grad_norm": 0.8659251928329468, "learning_rate": 0.00038886185929666594, "loss": 0.6217, "step": 14400 }, { "epoch": 0.32073539886039887, "grad_norm": 0.5013568997383118, "learning_rate": 0.00038884651269046526, "loss": 0.5297, "step": 14410 }, { "epoch": 0.3209579772079772, "grad_norm": 0.5654299855232239, "learning_rate": 0.00038883115582219046, "loss": 0.6419, "step": 14420 }, { "epoch": 0.3211805555555556, "grad_norm": 0.9625751376152039, "learning_rate": 0.000388815788692676, "loss": 0.497, "step": 14430 }, { "epoch": 0.3214031339031339, "grad_norm": 0.6212838292121887, "learning_rate": 0.00038880041130275706, "loss": 0.6888, "step": 14440 }, { "epoch": 0.32162571225071224, "grad_norm": 0.7968057990074158, "learning_rate": 0.00038878502365326923, "loss": 0.6761, "step": 14450 }, { "epoch": 0.3218482905982906, "grad_norm": 0.7091825008392334, "learning_rate": 0.0003887696257450486, "loss": 0.5637, "step": 14460 }, { "epoch": 0.32207086894586895, "grad_norm": 0.9699985384941101, "learning_rate": 0.00038875421757893203, "loss": 0.6297, "step": 14470 }, { "epoch": 0.3222934472934473, "grad_norm": 0.7332914471626282, "learning_rate": 0.0003887387991557568, "loss": 0.6152, "step": 14480 }, { "epoch": 0.32251602564102566, "grad_norm": 0.5954024791717529, "learning_rate": 0.0003887233704763606, "loss": 0.5747, "step": 14490 }, { "epoch": 0.322738603988604, "grad_norm": 0.906563937664032, "learning_rate": 0.00038870793154158206, "loss": 0.5849, "step": 14500 }, { "epoch": 0.3229611823361823, "grad_norm": 0.5665345788002014, "learning_rate": 0.00038869248235226, "loss": 0.5829, "step": 14510 }, { "epoch": 0.3231837606837607, "grad_norm": 0.722339928150177, "learning_rate": 0.00038867702290923395, "loss": 0.5569, "step": 14520 }, { "epoch": 0.32340633903133903, "grad_norm": 0.6280966997146606, "learning_rate": 0.00038866155321334406, "loss": 0.5243, "step": 14530 }, { "epoch": 0.32362891737891736, "grad_norm": 0.5326039791107178, "learning_rate": 0.00038864607326543086, "loss": 0.7024, "step": 14540 }, { "epoch": 0.32385149572649574, "grad_norm": 0.748881995677948, "learning_rate": 0.00038863058306633566, "loss": 0.6266, "step": 14550 }, { "epoch": 0.32407407407407407, "grad_norm": 0.7223565578460693, "learning_rate": 0.00038861508261690017, "loss": 0.6603, "step": 14560 }, { "epoch": 0.3242966524216524, "grad_norm": 0.7084729671478271, "learning_rate": 0.00038859957191796665, "loss": 0.7027, "step": 14570 }, { "epoch": 0.3245192307692308, "grad_norm": 0.6114988923072815, "learning_rate": 0.00038858405097037796, "loss": 0.5891, "step": 14580 }, { "epoch": 0.3247418091168091, "grad_norm": 0.9163961410522461, "learning_rate": 0.0003885685197749776, "loss": 0.6098, "step": 14590 }, { "epoch": 0.32496438746438744, "grad_norm": 1.123960256576538, "learning_rate": 0.00038855297833260955, "loss": 0.6258, "step": 14600 }, { "epoch": 0.3251869658119658, "grad_norm": 0.7056410908699036, "learning_rate": 0.00038853742664411825, "loss": 0.7209, "step": 14610 }, { "epoch": 0.32540954415954415, "grad_norm": 0.49438974261283875, "learning_rate": 0.00038852186471034886, "loss": 0.5346, "step": 14620 }, { "epoch": 0.32563212250712253, "grad_norm": 0.4864642918109894, "learning_rate": 0.000388506292532147, "loss": 0.6908, "step": 14630 }, { "epoch": 0.32585470085470086, "grad_norm": 0.7527804374694824, "learning_rate": 0.0003884907101103589, "loss": 0.5082, "step": 14640 }, { "epoch": 0.3260772792022792, "grad_norm": 0.8456656336784363, "learning_rate": 0.00038847511744583127, "loss": 0.7523, "step": 14650 }, { "epoch": 0.3262998575498576, "grad_norm": 0.5107564330101013, "learning_rate": 0.0003884595145394115, "loss": 0.47, "step": 14660 }, { "epoch": 0.3265224358974359, "grad_norm": 0.5012245774269104, "learning_rate": 0.0003884439013919474, "loss": 0.6565, "step": 14670 }, { "epoch": 0.32674501424501423, "grad_norm": 0.7485440373420715, "learning_rate": 0.00038842827800428747, "loss": 0.67, "step": 14680 }, { "epoch": 0.3269675925925926, "grad_norm": 0.7245430946350098, "learning_rate": 0.00038841264437728057, "loss": 0.6425, "step": 14690 }, { "epoch": 0.32719017094017094, "grad_norm": 0.4616965353488922, "learning_rate": 0.00038839700051177645, "loss": 0.6578, "step": 14700 }, { "epoch": 0.32741274928774927, "grad_norm": 0.8992801308631897, "learning_rate": 0.00038838134640862507, "loss": 0.6097, "step": 14710 }, { "epoch": 0.32763532763532766, "grad_norm": 0.7011298537254333, "learning_rate": 0.00038836568206867704, "loss": 0.6629, "step": 14720 }, { "epoch": 0.327857905982906, "grad_norm": 0.5743442177772522, "learning_rate": 0.0003883500074927837, "loss": 0.6509, "step": 14730 }, { "epoch": 0.3280804843304843, "grad_norm": 0.6825052499771118, "learning_rate": 0.0003883343226817967, "loss": 0.5321, "step": 14740 }, { "epoch": 0.3283030626780627, "grad_norm": 0.8856199979782104, "learning_rate": 0.00038831862763656855, "loss": 0.7376, "step": 14750 }, { "epoch": 0.328525641025641, "grad_norm": 0.6327480673789978, "learning_rate": 0.00038830292235795193, "loss": 0.6739, "step": 14760 }, { "epoch": 0.32874821937321935, "grad_norm": 0.6097829341888428, "learning_rate": 0.00038828720684680034, "loss": 0.6656, "step": 14770 }, { "epoch": 0.32897079772079774, "grad_norm": 0.686923623085022, "learning_rate": 0.00038827148110396785, "loss": 0.6197, "step": 14780 }, { "epoch": 0.32919337606837606, "grad_norm": 0.5756046175956726, "learning_rate": 0.00038825574513030886, "loss": 0.5645, "step": 14790 }, { "epoch": 0.3294159544159544, "grad_norm": 0.5095319747924805, "learning_rate": 0.00038823999892667865, "loss": 0.6822, "step": 14800 }, { "epoch": 0.3296385327635328, "grad_norm": 0.9366177916526794, "learning_rate": 0.0003882242424939327, "loss": 0.5301, "step": 14810 }, { "epoch": 0.3298611111111111, "grad_norm": 0.7820391654968262, "learning_rate": 0.00038820847583292744, "loss": 0.6568, "step": 14820 }, { "epoch": 0.33008368945868943, "grad_norm": 0.6345818638801575, "learning_rate": 0.0003881926989445195, "loss": 0.6711, "step": 14830 }, { "epoch": 0.3303062678062678, "grad_norm": 0.6514973044395447, "learning_rate": 0.00038817691182956623, "loss": 0.8656, "step": 14840 }, { "epoch": 0.33052884615384615, "grad_norm": 0.7673830389976501, "learning_rate": 0.0003881611144889255, "loss": 0.558, "step": 14850 }, { "epoch": 0.33075142450142453, "grad_norm": 0.5530751943588257, "learning_rate": 0.0003881453069234558, "loss": 0.6342, "step": 14860 }, { "epoch": 0.33097400284900286, "grad_norm": 0.7152727842330933, "learning_rate": 0.00038812948913401603, "loss": 0.5806, "step": 14870 }, { "epoch": 0.3311965811965812, "grad_norm": 0.7458562254905701, "learning_rate": 0.00038811366112146594, "loss": 0.6891, "step": 14880 }, { "epoch": 0.33141915954415957, "grad_norm": 0.6589481830596924, "learning_rate": 0.00038809782288666534, "loss": 0.5577, "step": 14890 }, { "epoch": 0.3316417378917379, "grad_norm": 0.6615011692047119, "learning_rate": 0.0003880819744304752, "loss": 0.7042, "step": 14900 }, { "epoch": 0.3318643162393162, "grad_norm": 0.4032817482948303, "learning_rate": 0.0003880661157537565, "loss": 0.5792, "step": 14910 }, { "epoch": 0.3320868945868946, "grad_norm": 0.9323031306266785, "learning_rate": 0.0003880502468573712, "loss": 0.7316, "step": 14920 }, { "epoch": 0.33230947293447294, "grad_norm": 0.5589344501495361, "learning_rate": 0.00038803436774218153, "loss": 0.6257, "step": 14930 }, { "epoch": 0.33253205128205127, "grad_norm": 0.5097547173500061, "learning_rate": 0.00038801847840905034, "loss": 0.5746, "step": 14940 }, { "epoch": 0.33275462962962965, "grad_norm": 0.6569631695747375, "learning_rate": 0.00038800257885884115, "loss": 0.607, "step": 14950 }, { "epoch": 0.332977207977208, "grad_norm": 0.6447013020515442, "learning_rate": 0.0003879866690924179, "loss": 0.7258, "step": 14960 }, { "epoch": 0.3331997863247863, "grad_norm": 0.9661784768104553, "learning_rate": 0.00038797074911064517, "loss": 0.62, "step": 14970 }, { "epoch": 0.3334223646723647, "grad_norm": 0.7204642295837402, "learning_rate": 0.000387954818914388, "loss": 0.7651, "step": 14980 }, { "epoch": 0.333644943019943, "grad_norm": 1.1423128843307495, "learning_rate": 0.0003879388785045122, "loss": 0.6571, "step": 14990 }, { "epoch": 0.33386752136752135, "grad_norm": 1.5268319845199585, "learning_rate": 0.0003879229278818838, "loss": 0.508, "step": 15000 }, { "epoch": 0.33409009971509973, "grad_norm": 0.7197328805923462, "learning_rate": 0.0003879069670473697, "loss": 0.618, "step": 15010 }, { "epoch": 0.33431267806267806, "grad_norm": 0.5955047607421875, "learning_rate": 0.00038789099600183716, "loss": 0.651, "step": 15020 }, { "epoch": 0.3345352564102564, "grad_norm": 0.7561123967170715, "learning_rate": 0.0003878750147461541, "loss": 0.5922, "step": 15030 }, { "epoch": 0.33475783475783477, "grad_norm": 0.5777478218078613, "learning_rate": 0.00038785902328118894, "loss": 0.7357, "step": 15040 }, { "epoch": 0.3349804131054131, "grad_norm": 1.1352577209472656, "learning_rate": 0.00038784302160781067, "loss": 0.6401, "step": 15050 }, { "epoch": 0.33520299145299143, "grad_norm": 0.7301923036575317, "learning_rate": 0.0003878270097268888, "loss": 0.7025, "step": 15060 }, { "epoch": 0.3354255698005698, "grad_norm": 1.0378355979919434, "learning_rate": 0.0003878109876392935, "loss": 0.5704, "step": 15070 }, { "epoch": 0.33564814814814814, "grad_norm": 0.5009276866912842, "learning_rate": 0.00038779495534589534, "loss": 0.5682, "step": 15080 }, { "epoch": 0.33587072649572647, "grad_norm": 0.636855959892273, "learning_rate": 0.0003877789128475656, "loss": 0.6932, "step": 15090 }, { "epoch": 0.33609330484330485, "grad_norm": 0.9682590365409851, "learning_rate": 0.000387762860145176, "loss": 0.6237, "step": 15100 }, { "epoch": 0.3363158831908832, "grad_norm": 1.0007511377334595, "learning_rate": 0.0003877467972395989, "loss": 0.5892, "step": 15110 }, { "epoch": 0.33653846153846156, "grad_norm": 0.6202926635742188, "learning_rate": 0.00038773072413170716, "loss": 0.5913, "step": 15120 }, { "epoch": 0.3367610398860399, "grad_norm": 0.56829434633255, "learning_rate": 0.0003877146408223741, "loss": 0.6632, "step": 15130 }, { "epoch": 0.3369836182336182, "grad_norm": 0.8759254813194275, "learning_rate": 0.00038769854731247384, "loss": 0.6321, "step": 15140 }, { "epoch": 0.3372061965811966, "grad_norm": 0.6968265175819397, "learning_rate": 0.00038768244360288086, "loss": 0.6905, "step": 15150 }, { "epoch": 0.33742877492877493, "grad_norm": 0.8931856751441956, "learning_rate": 0.00038766632969447024, "loss": 0.6507, "step": 15160 }, { "epoch": 0.33765135327635326, "grad_norm": 0.6409062743186951, "learning_rate": 0.0003876502055881177, "loss": 0.6041, "step": 15170 }, { "epoch": 0.33787393162393164, "grad_norm": 0.6757847666740417, "learning_rate": 0.0003876340712846992, "loss": 0.5633, "step": 15180 }, { "epoch": 0.33809650997151, "grad_norm": 0.6828376650810242, "learning_rate": 0.0003876179267850918, "loss": 0.6538, "step": 15190 }, { "epoch": 0.3383190883190883, "grad_norm": 1.2060045003890991, "learning_rate": 0.00038760177209017267, "loss": 0.6757, "step": 15200 }, { "epoch": 0.3385416666666667, "grad_norm": 0.654515266418457, "learning_rate": 0.0003875856072008196, "loss": 0.6166, "step": 15210 }, { "epoch": 0.338764245014245, "grad_norm": 0.6422186493873596, "learning_rate": 0.0003875694321179111, "loss": 0.7331, "step": 15220 }, { "epoch": 0.33898682336182334, "grad_norm": 0.8152149319648743, "learning_rate": 0.00038755324684232616, "loss": 0.6558, "step": 15230 }, { "epoch": 0.3392094017094017, "grad_norm": 0.8730194568634033, "learning_rate": 0.00038753705137494415, "loss": 0.6754, "step": 15240 }, { "epoch": 0.33943198005698005, "grad_norm": 0.6081664562225342, "learning_rate": 0.0003875208457166453, "loss": 0.659, "step": 15250 }, { "epoch": 0.3396545584045584, "grad_norm": 0.4139869213104248, "learning_rate": 0.0003875046298683102, "loss": 0.5921, "step": 15260 }, { "epoch": 0.33987713675213677, "grad_norm": 0.621178925037384, "learning_rate": 0.00038748840383082004, "loss": 0.522, "step": 15270 }, { "epoch": 0.3400997150997151, "grad_norm": 0.8623523712158203, "learning_rate": 0.00038747216760505644, "loss": 0.7395, "step": 15280 }, { "epoch": 0.3403222934472934, "grad_norm": 0.557036280632019, "learning_rate": 0.0003874559211919018, "loss": 0.608, "step": 15290 }, { "epoch": 0.3405448717948718, "grad_norm": 0.3644825220108032, "learning_rate": 0.00038743966459223894, "loss": 0.5917, "step": 15300 }, { "epoch": 0.34076745014245013, "grad_norm": 0.678953230381012, "learning_rate": 0.0003874233978069513, "loss": 0.6087, "step": 15310 }, { "epoch": 0.34099002849002846, "grad_norm": 0.5321779251098633, "learning_rate": 0.0003874071208369228, "loss": 0.569, "step": 15320 }, { "epoch": 0.34121260683760685, "grad_norm": 0.6926671862602234, "learning_rate": 0.0003873908336830379, "loss": 0.6858, "step": 15330 }, { "epoch": 0.3414351851851852, "grad_norm": 0.8424744606018066, "learning_rate": 0.0003873745363461817, "loss": 0.6813, "step": 15340 }, { "epoch": 0.34165776353276356, "grad_norm": 0.6209324598312378, "learning_rate": 0.0003873582288272398, "loss": 0.7667, "step": 15350 }, { "epoch": 0.3418803418803419, "grad_norm": 0.5841759443283081, "learning_rate": 0.0003873419111270984, "loss": 0.4966, "step": 15360 }, { "epoch": 0.3421029202279202, "grad_norm": 0.6942004561424255, "learning_rate": 0.0003873255832466442, "loss": 0.5913, "step": 15370 }, { "epoch": 0.3423254985754986, "grad_norm": 0.8919575810432434, "learning_rate": 0.00038730924518676435, "loss": 0.6681, "step": 15380 }, { "epoch": 0.3425480769230769, "grad_norm": 0.6405223608016968, "learning_rate": 0.0003872928969483469, "loss": 0.5544, "step": 15390 }, { "epoch": 0.34277065527065526, "grad_norm": 0.63236004114151, "learning_rate": 0.00038727653853228, "loss": 0.6711, "step": 15400 }, { "epoch": 0.34299323361823364, "grad_norm": 0.6226939558982849, "learning_rate": 0.00038726016993945276, "loss": 0.6142, "step": 15410 }, { "epoch": 0.34321581196581197, "grad_norm": 0.7648651599884033, "learning_rate": 0.00038724379117075457, "loss": 0.7771, "step": 15420 }, { "epoch": 0.3434383903133903, "grad_norm": 0.7330575585365295, "learning_rate": 0.00038722740222707546, "loss": 0.704, "step": 15430 }, { "epoch": 0.3436609686609687, "grad_norm": 0.6973746418952942, "learning_rate": 0.00038721100310930604, "loss": 0.7464, "step": 15440 }, { "epoch": 0.343883547008547, "grad_norm": 0.851994514465332, "learning_rate": 0.0003871945938183375, "loss": 0.5979, "step": 15450 }, { "epoch": 0.34410612535612534, "grad_norm": 0.5129497051239014, "learning_rate": 0.00038717817435506143, "loss": 0.5974, "step": 15460 }, { "epoch": 0.3443287037037037, "grad_norm": 0.4729333817958832, "learning_rate": 0.00038716174472037013, "loss": 0.6519, "step": 15470 }, { "epoch": 0.34455128205128205, "grad_norm": 0.6375283598899841, "learning_rate": 0.00038714530491515647, "loss": 0.6194, "step": 15480 }, { "epoch": 0.3447738603988604, "grad_norm": 0.753584623336792, "learning_rate": 0.0003871288549403137, "loss": 0.6828, "step": 15490 }, { "epoch": 0.34499643874643876, "grad_norm": 0.5335111618041992, "learning_rate": 0.0003871123947967357, "loss": 0.5159, "step": 15500 }, { "epoch": 0.3452190170940171, "grad_norm": 0.9678267240524292, "learning_rate": 0.0003870959244853171, "loss": 0.6309, "step": 15510 }, { "epoch": 0.3454415954415954, "grad_norm": 0.875391960144043, "learning_rate": 0.0003870794440069527, "loss": 0.5351, "step": 15520 }, { "epoch": 0.3456641737891738, "grad_norm": 0.7199859619140625, "learning_rate": 0.00038706295336253825, "loss": 0.6512, "step": 15530 }, { "epoch": 0.34588675213675213, "grad_norm": 0.48389071226119995, "learning_rate": 0.00038704645255296976, "loss": 0.7339, "step": 15540 }, { "epoch": 0.34610933048433046, "grad_norm": 0.5464368462562561, "learning_rate": 0.0003870299415791439, "loss": 0.5017, "step": 15550 }, { "epoch": 0.34633190883190884, "grad_norm": 1.0332673788070679, "learning_rate": 0.00038701342044195785, "loss": 0.6115, "step": 15560 }, { "epoch": 0.34655448717948717, "grad_norm": 0.6519765257835388, "learning_rate": 0.0003869968891423095, "loss": 0.6015, "step": 15570 }, { "epoch": 0.34677706552706555, "grad_norm": 0.49065589904785156, "learning_rate": 0.0003869803476810971, "loss": 0.6006, "step": 15580 }, { "epoch": 0.3469996438746439, "grad_norm": 0.7999041080474854, "learning_rate": 0.0003869637960592195, "loss": 0.7736, "step": 15590 }, { "epoch": 0.3472222222222222, "grad_norm": 0.7751318216323853, "learning_rate": 0.00038694723427757624, "loss": 0.5886, "step": 15600 }, { "epoch": 0.3474448005698006, "grad_norm": 0.7226809859275818, "learning_rate": 0.00038693066233706715, "loss": 0.5635, "step": 15610 }, { "epoch": 0.3476673789173789, "grad_norm": 0.5385544896125793, "learning_rate": 0.0003869140802385929, "loss": 0.6578, "step": 15620 }, { "epoch": 0.34788995726495725, "grad_norm": 0.739736795425415, "learning_rate": 0.0003868974879830545, "loss": 0.7629, "step": 15630 }, { "epoch": 0.34811253561253563, "grad_norm": 0.8486205339431763, "learning_rate": 0.00038688088557135364, "loss": 0.6528, "step": 15640 }, { "epoch": 0.34833511396011396, "grad_norm": 0.9158767461776733, "learning_rate": 0.00038686427300439237, "loss": 0.5216, "step": 15650 }, { "epoch": 0.3485576923076923, "grad_norm": 0.6496191620826721, "learning_rate": 0.0003868476502830736, "loss": 0.6958, "step": 15660 }, { "epoch": 0.3487802706552707, "grad_norm": 0.6712784171104431, "learning_rate": 0.0003868310174083005, "loss": 0.6468, "step": 15670 }, { "epoch": 0.349002849002849, "grad_norm": 0.4141985774040222, "learning_rate": 0.00038681437438097704, "loss": 0.6772, "step": 15680 }, { "epoch": 0.34922542735042733, "grad_norm": 0.8436933159828186, "learning_rate": 0.00038679772120200754, "loss": 0.6587, "step": 15690 }, { "epoch": 0.3494480056980057, "grad_norm": 0.6606414914131165, "learning_rate": 0.0003867810578722969, "loss": 0.6682, "step": 15700 }, { "epoch": 0.34967058404558404, "grad_norm": 0.4795196056365967, "learning_rate": 0.0003867643843927507, "loss": 0.6216, "step": 15710 }, { "epoch": 0.34989316239316237, "grad_norm": 0.5964969992637634, "learning_rate": 0.0003867477007642749, "loss": 0.6396, "step": 15720 }, { "epoch": 0.35011574074074076, "grad_norm": 0.9071320295333862, "learning_rate": 0.0003867310069877762, "loss": 0.6362, "step": 15730 }, { "epoch": 0.3503383190883191, "grad_norm": 0.9060355424880981, "learning_rate": 0.00038671430306416174, "loss": 0.5166, "step": 15740 }, { "epoch": 0.3505608974358974, "grad_norm": 0.7958703637123108, "learning_rate": 0.0003866975889943392, "loss": 0.5176, "step": 15750 }, { "epoch": 0.3507834757834758, "grad_norm": 0.5803673267364502, "learning_rate": 0.00038668086477921685, "loss": 0.5258, "step": 15760 }, { "epoch": 0.3510060541310541, "grad_norm": 0.8940984606742859, "learning_rate": 0.00038666413041970346, "loss": 0.601, "step": 15770 }, { "epoch": 0.35122863247863245, "grad_norm": 0.7736290693283081, "learning_rate": 0.00038664738591670837, "loss": 0.6799, "step": 15780 }, { "epoch": 0.35145121082621084, "grad_norm": 0.3179571330547333, "learning_rate": 0.0003866306312711416, "loss": 0.6422, "step": 15790 }, { "epoch": 0.35167378917378916, "grad_norm": 0.6942645311355591, "learning_rate": 0.0003866138664839135, "loss": 0.6703, "step": 15800 }, { "epoch": 0.35189636752136755, "grad_norm": 0.63601154088974, "learning_rate": 0.0003865970915559351, "loss": 0.6482, "step": 15810 }, { "epoch": 0.3521189458689459, "grad_norm": 0.981825590133667, "learning_rate": 0.000386580306488118, "loss": 0.499, "step": 15820 }, { "epoch": 0.3523415242165242, "grad_norm": 0.836377739906311, "learning_rate": 0.00038656351128137437, "loss": 0.5932, "step": 15830 }, { "epoch": 0.3525641025641026, "grad_norm": 0.9686137437820435, "learning_rate": 0.0003865467059366168, "loss": 0.7086, "step": 15840 }, { "epoch": 0.3527866809116809, "grad_norm": 0.937958836555481, "learning_rate": 0.00038652989045475847, "loss": 0.7537, "step": 15850 }, { "epoch": 0.35300925925925924, "grad_norm": 0.46349942684173584, "learning_rate": 0.00038651306483671326, "loss": 0.6199, "step": 15860 }, { "epoch": 0.35323183760683763, "grad_norm": 0.7645864486694336, "learning_rate": 0.0003864962290833953, "loss": 0.6083, "step": 15870 }, { "epoch": 0.35345441595441596, "grad_norm": 0.4245271384716034, "learning_rate": 0.00038647938319571963, "loss": 0.4999, "step": 15880 }, { "epoch": 0.3536769943019943, "grad_norm": 0.6851386427879333, "learning_rate": 0.0003864625271746017, "loss": 0.6637, "step": 15890 }, { "epoch": 0.35389957264957267, "grad_norm": 0.863572895526886, "learning_rate": 0.0003864456610209573, "loss": 0.6681, "step": 15900 }, { "epoch": 0.354122150997151, "grad_norm": 0.5621462464332581, "learning_rate": 0.0003864287847357031, "loss": 0.5403, "step": 15910 }, { "epoch": 0.3543447293447293, "grad_norm": 0.781972348690033, "learning_rate": 0.00038641189831975606, "loss": 0.6362, "step": 15920 }, { "epoch": 0.3545673076923077, "grad_norm": 0.9945448040962219, "learning_rate": 0.0003863950017740339, "loss": 0.6942, "step": 15930 }, { "epoch": 0.35478988603988604, "grad_norm": 0.8110677599906921, "learning_rate": 0.0003863780950994548, "loss": 0.5756, "step": 15940 }, { "epoch": 0.35501246438746437, "grad_norm": 0.9088088274002075, "learning_rate": 0.0003863611782969374, "loss": 0.5669, "step": 15950 }, { "epoch": 0.35523504273504275, "grad_norm": 0.9252178072929382, "learning_rate": 0.00038634425136740096, "loss": 0.6743, "step": 15960 }, { "epoch": 0.3554576210826211, "grad_norm": 0.5928763747215271, "learning_rate": 0.0003863273143117654, "loss": 0.6595, "step": 15970 }, { "epoch": 0.3556801994301994, "grad_norm": 0.5745807886123657, "learning_rate": 0.0003863103671309511, "loss": 0.5165, "step": 15980 }, { "epoch": 0.3559027777777778, "grad_norm": 0.6771084070205688, "learning_rate": 0.0003862934098258788, "loss": 0.66, "step": 15990 }, { "epoch": 0.3561253561253561, "grad_norm": 0.6055546402931213, "learning_rate": 0.00038627644239747023, "loss": 0.6146, "step": 16000 }, { "epoch": 0.35634793447293445, "grad_norm": 0.7007344961166382, "learning_rate": 0.0003862594648466472, "loss": 0.658, "step": 16010 }, { "epoch": 0.35657051282051283, "grad_norm": 0.6773852109909058, "learning_rate": 0.0003862424771743324, "loss": 0.716, "step": 16020 }, { "epoch": 0.35679309116809116, "grad_norm": 0.45812416076660156, "learning_rate": 0.0003862254793814489, "loss": 0.5845, "step": 16030 }, { "epoch": 0.35701566951566954, "grad_norm": 0.7533524632453918, "learning_rate": 0.0003862084714689204, "loss": 0.6995, "step": 16040 }, { "epoch": 0.35723824786324787, "grad_norm": 0.5962110757827759, "learning_rate": 0.0003861914534376712, "loss": 0.5721, "step": 16050 }, { "epoch": 0.3574608262108262, "grad_norm": 0.9739892482757568, "learning_rate": 0.00038617442528862596, "loss": 0.6319, "step": 16060 }, { "epoch": 0.3576834045584046, "grad_norm": 0.9599641561508179, "learning_rate": 0.00038615738702271003, "loss": 0.6161, "step": 16070 }, { "epoch": 0.3579059829059829, "grad_norm": 0.49321597814559937, "learning_rate": 0.0003861403386408493, "loss": 0.647, "step": 16080 }, { "epoch": 0.35812856125356124, "grad_norm": 0.5280551910400391, "learning_rate": 0.0003861232801439702, "loss": 0.8232, "step": 16090 }, { "epoch": 0.3583511396011396, "grad_norm": 0.6274698376655579, "learning_rate": 0.0003861062115329996, "loss": 0.8408, "step": 16100 }, { "epoch": 0.35857371794871795, "grad_norm": 0.4998033046722412, "learning_rate": 0.0003860891328088652, "loss": 0.5723, "step": 16110 }, { "epoch": 0.3587962962962963, "grad_norm": 0.791354775428772, "learning_rate": 0.00038607204397249497, "loss": 0.6366, "step": 16120 }, { "epoch": 0.35901887464387466, "grad_norm": 0.6643574237823486, "learning_rate": 0.0003860549450248175, "loss": 0.5278, "step": 16130 }, { "epoch": 0.359241452991453, "grad_norm": 0.7511951923370361, "learning_rate": 0.000386037835966762, "loss": 0.6013, "step": 16140 }, { "epoch": 0.3594640313390313, "grad_norm": 0.589197039604187, "learning_rate": 0.0003860207167992583, "loss": 0.6842, "step": 16150 }, { "epoch": 0.3596866096866097, "grad_norm": 0.5355151295661926, "learning_rate": 0.0003860035875232365, "loss": 0.5977, "step": 16160 }, { "epoch": 0.35990918803418803, "grad_norm": 0.5727527737617493, "learning_rate": 0.0003859864481396275, "loss": 0.5406, "step": 16170 }, { "epoch": 0.36004273504273504, "eval_loss": 0.6386870741844177, "eval_runtime": 337.1238, "eval_samples_per_second": 7.015, "eval_steps_per_second": 7.015, "step": 16176 }, { "epoch": 0.36013176638176636, "grad_norm": 0.5183542966842651, "learning_rate": 0.0003859692986493626, "loss": 0.4373, "step": 16180 }, { "epoch": 0.36035434472934474, "grad_norm": 0.5630384683609009, "learning_rate": 0.0003859521390533738, "loss": 0.4529, "step": 16190 }, { "epoch": 0.3605769230769231, "grad_norm": 0.8775503039360046, "learning_rate": 0.0003859349693525935, "loss": 0.6765, "step": 16200 }, { "epoch": 0.3607995014245014, "grad_norm": 0.80485999584198, "learning_rate": 0.0003859177895479549, "loss": 0.5268, "step": 16210 }, { "epoch": 0.3610220797720798, "grad_norm": 0.582216739654541, "learning_rate": 0.00038590059964039127, "loss": 0.6238, "step": 16220 }, { "epoch": 0.3612446581196581, "grad_norm": 0.7229297757148743, "learning_rate": 0.0003858833996308369, "loss": 0.5674, "step": 16230 }, { "epoch": 0.36146723646723644, "grad_norm": 0.5581183433532715, "learning_rate": 0.00038586618952022645, "loss": 0.72, "step": 16240 }, { "epoch": 0.3616898148148148, "grad_norm": 0.8541936278343201, "learning_rate": 0.0003858489693094951, "loss": 0.6385, "step": 16250 }, { "epoch": 0.36191239316239315, "grad_norm": 0.5558866262435913, "learning_rate": 0.0003858317389995786, "loss": 0.6244, "step": 16260 }, { "epoch": 0.36213497150997154, "grad_norm": 0.44892260432243347, "learning_rate": 0.0003858144985914133, "loss": 0.6247, "step": 16270 }, { "epoch": 0.36235754985754987, "grad_norm": 0.4403764009475708, "learning_rate": 0.00038579724808593597, "loss": 0.5703, "step": 16280 }, { "epoch": 0.3625801282051282, "grad_norm": 0.7626928091049194, "learning_rate": 0.0003857799874840842, "loss": 0.4869, "step": 16290 }, { "epoch": 0.3628027065527066, "grad_norm": 0.5833191871643066, "learning_rate": 0.0003857627167867957, "loss": 0.5903, "step": 16300 }, { "epoch": 0.3630252849002849, "grad_norm": 0.9135043025016785, "learning_rate": 0.00038574543599500914, "loss": 0.6778, "step": 16310 }, { "epoch": 0.36324786324786323, "grad_norm": 0.633033275604248, "learning_rate": 0.00038572814510966355, "loss": 0.659, "step": 16320 }, { "epoch": 0.3634704415954416, "grad_norm": 0.5691318511962891, "learning_rate": 0.00038571084413169845, "loss": 0.5765, "step": 16330 }, { "epoch": 0.36369301994301995, "grad_norm": 0.7053967118263245, "learning_rate": 0.0003856935330620541, "loss": 0.6832, "step": 16340 }, { "epoch": 0.3639155982905983, "grad_norm": 1.0150195360183716, "learning_rate": 0.0003856762119016711, "loss": 0.7993, "step": 16350 }, { "epoch": 0.36413817663817666, "grad_norm": 0.5497720837593079, "learning_rate": 0.00038565888065149084, "loss": 0.6516, "step": 16360 }, { "epoch": 0.364360754985755, "grad_norm": 0.974551260471344, "learning_rate": 0.00038564153931245493, "loss": 0.6516, "step": 16370 }, { "epoch": 0.3645833333333333, "grad_norm": 0.7135359644889832, "learning_rate": 0.00038562418788550587, "loss": 0.623, "step": 16380 }, { "epoch": 0.3648059116809117, "grad_norm": 0.7332806587219238, "learning_rate": 0.00038560682637158643, "loss": 0.6314, "step": 16390 }, { "epoch": 0.36502849002849, "grad_norm": 0.8631349802017212, "learning_rate": 0.0003855894547716401, "loss": 0.5178, "step": 16400 }, { "epoch": 0.36525106837606836, "grad_norm": 0.52054363489151, "learning_rate": 0.0003855720730866109, "loss": 0.4755, "step": 16410 }, { "epoch": 0.36547364672364674, "grad_norm": 0.552855908870697, "learning_rate": 0.0003855546813174433, "loss": 0.6811, "step": 16420 }, { "epoch": 0.36569622507122507, "grad_norm": 0.5592019557952881, "learning_rate": 0.00038553727946508246, "loss": 0.6854, "step": 16430 }, { "epoch": 0.3659188034188034, "grad_norm": 0.7347126007080078, "learning_rate": 0.0003855198675304739, "loss": 0.584, "step": 16440 }, { "epoch": 0.3661413817663818, "grad_norm": 0.7042830586433411, "learning_rate": 0.0003855024455145639, "loss": 0.6909, "step": 16450 }, { "epoch": 0.3663639601139601, "grad_norm": 0.6266321539878845, "learning_rate": 0.0003854850134182991, "loss": 0.7062, "step": 16460 }, { "epoch": 0.36658653846153844, "grad_norm": 0.9455707669258118, "learning_rate": 0.0003854675712426269, "loss": 0.5297, "step": 16470 }, { "epoch": 0.3668091168091168, "grad_norm": 0.6796744465827942, "learning_rate": 0.000385450118988495, "loss": 0.8696, "step": 16480 }, { "epoch": 0.36703169515669515, "grad_norm": 0.5304951071739197, "learning_rate": 0.0003854326566568519, "loss": 0.6025, "step": 16490 }, { "epoch": 0.36725427350427353, "grad_norm": 0.5772337913513184, "learning_rate": 0.00038541518424864635, "loss": 0.6433, "step": 16500 }, { "epoch": 0.36747685185185186, "grad_norm": 0.7764016389846802, "learning_rate": 0.0003853977017648279, "loss": 0.6938, "step": 16510 }, { "epoch": 0.3676994301994302, "grad_norm": 0.8687075972557068, "learning_rate": 0.00038538020920634664, "loss": 0.6411, "step": 16520 }, { "epoch": 0.36792200854700857, "grad_norm": 0.6664961576461792, "learning_rate": 0.00038536270657415296, "loss": 0.7044, "step": 16530 }, { "epoch": 0.3681445868945869, "grad_norm": 0.659508228302002, "learning_rate": 0.0003853451938691981, "loss": 0.5969, "step": 16540 }, { "epoch": 0.36836716524216523, "grad_norm": 0.6755292415618896, "learning_rate": 0.00038532767109243366, "loss": 0.7851, "step": 16550 }, { "epoch": 0.3685897435897436, "grad_norm": 0.5175288319587708, "learning_rate": 0.0003853101382448119, "loss": 0.6716, "step": 16560 }, { "epoch": 0.36881232193732194, "grad_norm": 0.8062911033630371, "learning_rate": 0.00038529259532728543, "loss": 0.5692, "step": 16570 }, { "epoch": 0.36903490028490027, "grad_norm": 0.9816310405731201, "learning_rate": 0.00038527504234080775, "loss": 0.7529, "step": 16580 }, { "epoch": 0.36925747863247865, "grad_norm": 0.5312542915344238, "learning_rate": 0.00038525747928633253, "loss": 0.5502, "step": 16590 }, { "epoch": 0.369480056980057, "grad_norm": 0.8313422203063965, "learning_rate": 0.0003852399061648143, "loss": 0.597, "step": 16600 }, { "epoch": 0.3697026353276353, "grad_norm": 0.5209640860557556, "learning_rate": 0.00038522232297720786, "loss": 0.5609, "step": 16610 }, { "epoch": 0.3699252136752137, "grad_norm": 0.6579368710517883, "learning_rate": 0.0003852047297244687, "loss": 0.6973, "step": 16620 }, { "epoch": 0.370147792022792, "grad_norm": 0.7361441850662231, "learning_rate": 0.00038518712640755304, "loss": 0.6594, "step": 16630 }, { "epoch": 0.37037037037037035, "grad_norm": 0.674850344657898, "learning_rate": 0.00038516951302741735, "loss": 0.6678, "step": 16640 }, { "epoch": 0.37059294871794873, "grad_norm": 0.552830159664154, "learning_rate": 0.0003851518895850186, "loss": 0.5816, "step": 16650 }, { "epoch": 0.37081552706552706, "grad_norm": 0.8027279376983643, "learning_rate": 0.00038513425608131466, "loss": 0.6624, "step": 16660 }, { "epoch": 0.3710381054131054, "grad_norm": 0.8535835146903992, "learning_rate": 0.0003851166125172637, "loss": 0.5926, "step": 16670 }, { "epoch": 0.3712606837606838, "grad_norm": 0.7689616084098816, "learning_rate": 0.00038509895889382443, "loss": 0.5824, "step": 16680 }, { "epoch": 0.3714832621082621, "grad_norm": 0.7670106887817383, "learning_rate": 0.00038508129521195623, "loss": 0.7243, "step": 16690 }, { "epoch": 0.37170584045584043, "grad_norm": 0.7338453531265259, "learning_rate": 0.00038506362147261897, "loss": 0.6473, "step": 16700 }, { "epoch": 0.3719284188034188, "grad_norm": 0.7035108804702759, "learning_rate": 0.000385045937676773, "loss": 0.5963, "step": 16710 }, { "epoch": 0.37215099715099714, "grad_norm": 0.5865857601165771, "learning_rate": 0.00038502824382537925, "loss": 0.618, "step": 16720 }, { "epoch": 0.37237357549857547, "grad_norm": 0.7001277208328247, "learning_rate": 0.00038501053991939926, "loss": 0.6251, "step": 16730 }, { "epoch": 0.37259615384615385, "grad_norm": 0.692415714263916, "learning_rate": 0.00038499282595979515, "loss": 0.5677, "step": 16740 }, { "epoch": 0.3728187321937322, "grad_norm": 0.6563718318939209, "learning_rate": 0.0003849751019475294, "loss": 0.6252, "step": 16750 }, { "epoch": 0.37304131054131057, "grad_norm": 0.7072324752807617, "learning_rate": 0.00038495736788356514, "loss": 0.6283, "step": 16760 }, { "epoch": 0.3732638888888889, "grad_norm": 0.5536751747131348, "learning_rate": 0.00038493962376886614, "loss": 0.6762, "step": 16770 }, { "epoch": 0.3734864672364672, "grad_norm": 0.707798421382904, "learning_rate": 0.00038492186960439656, "loss": 0.6905, "step": 16780 }, { "epoch": 0.3737090455840456, "grad_norm": 1.0452958345413208, "learning_rate": 0.0003849041053911212, "loss": 0.7101, "step": 16790 }, { "epoch": 0.37393162393162394, "grad_norm": 0.7269765138626099, "learning_rate": 0.0003848863311300054, "loss": 0.7, "step": 16800 }, { "epoch": 0.37415420227920226, "grad_norm": 0.8381827473640442, "learning_rate": 0.000384868546822015, "loss": 0.719, "step": 16810 }, { "epoch": 0.37437678062678065, "grad_norm": 0.590775191783905, "learning_rate": 0.0003848507524681164, "loss": 0.5596, "step": 16820 }, { "epoch": 0.374599358974359, "grad_norm": 0.7995975017547607, "learning_rate": 0.0003848329480692766, "loss": 0.6334, "step": 16830 }, { "epoch": 0.3748219373219373, "grad_norm": 0.8189190626144409, "learning_rate": 0.00038481513362646313, "loss": 0.5925, "step": 16840 }, { "epoch": 0.3750445156695157, "grad_norm": 0.6057611107826233, "learning_rate": 0.000384797309140644, "loss": 0.6012, "step": 16850 }, { "epoch": 0.375267094017094, "grad_norm": 0.7356190085411072, "learning_rate": 0.0003847794746127878, "loss": 0.6582, "step": 16860 }, { "epoch": 0.37548967236467234, "grad_norm": 0.8917863368988037, "learning_rate": 0.0003847616300438636, "loss": 0.6011, "step": 16870 }, { "epoch": 0.37571225071225073, "grad_norm": 0.6896722316741943, "learning_rate": 0.0003847437754348413, "loss": 0.6149, "step": 16880 }, { "epoch": 0.37593482905982906, "grad_norm": 0.5577037930488586, "learning_rate": 0.00038472591078669095, "loss": 0.669, "step": 16890 }, { "epoch": 0.3761574074074074, "grad_norm": 0.8162705898284912, "learning_rate": 0.00038470803610038336, "loss": 0.8111, "step": 16900 }, { "epoch": 0.37637998575498577, "grad_norm": 0.8367506861686707, "learning_rate": 0.0003846901513768899, "loss": 0.7359, "step": 16910 }, { "epoch": 0.3766025641025641, "grad_norm": 0.8381311893463135, "learning_rate": 0.0003846722566171824, "loss": 0.8893, "step": 16920 }, { "epoch": 0.3768251424501424, "grad_norm": 0.8151772618293762, "learning_rate": 0.00038465435182223335, "loss": 0.6444, "step": 16930 }, { "epoch": 0.3770477207977208, "grad_norm": 0.5317022204399109, "learning_rate": 0.00038463643699301566, "loss": 0.5619, "step": 16940 }, { "epoch": 0.37727029914529914, "grad_norm": 0.39144349098205566, "learning_rate": 0.00038461851213050276, "loss": 0.5957, "step": 16950 }, { "epoch": 0.37749287749287747, "grad_norm": 0.6841157078742981, "learning_rate": 0.0003846005772356688, "loss": 0.458, "step": 16960 }, { "epoch": 0.37771545584045585, "grad_norm": 1.2825673818588257, "learning_rate": 0.0003845826323094883, "loss": 0.6741, "step": 16970 }, { "epoch": 0.3779380341880342, "grad_norm": 0.7954182028770447, "learning_rate": 0.00038456467735293654, "loss": 0.751, "step": 16980 }, { "epoch": 0.37816061253561256, "grad_norm": 0.6667285561561584, "learning_rate": 0.00038454671236698917, "loss": 0.6689, "step": 16990 }, { "epoch": 0.3783831908831909, "grad_norm": 0.8058348894119263, "learning_rate": 0.00038452873735262224, "loss": 0.6378, "step": 17000 }, { "epoch": 0.3786057692307692, "grad_norm": 0.79270339012146, "learning_rate": 0.00038451075231081273, "loss": 0.5072, "step": 17010 }, { "epoch": 0.3788283475783476, "grad_norm": 0.931929886341095, "learning_rate": 0.0003844927572425379, "loss": 0.6612, "step": 17020 }, { "epoch": 0.37905092592592593, "grad_norm": 0.7652475833892822, "learning_rate": 0.0003844747521487756, "loss": 0.5512, "step": 17030 }, { "epoch": 0.37927350427350426, "grad_norm": 1.1219477653503418, "learning_rate": 0.00038445673703050426, "loss": 0.6397, "step": 17040 }, { "epoch": 0.37949608262108264, "grad_norm": 0.5693344473838806, "learning_rate": 0.0003844387118887028, "loss": 0.679, "step": 17050 }, { "epoch": 0.37971866096866097, "grad_norm": 0.4774753153324127, "learning_rate": 0.0003844206767243507, "loss": 0.4744, "step": 17060 }, { "epoch": 0.3799412393162393, "grad_norm": 0.6411837935447693, "learning_rate": 0.0003844026315384281, "loss": 0.7865, "step": 17070 }, { "epoch": 0.3801638176638177, "grad_norm": 0.768381655216217, "learning_rate": 0.00038438457633191555, "loss": 0.5511, "step": 17080 }, { "epoch": 0.380386396011396, "grad_norm": 0.7788591980934143, "learning_rate": 0.0003843665111057942, "loss": 0.6633, "step": 17090 }, { "epoch": 0.38060897435897434, "grad_norm": 0.5127415657043457, "learning_rate": 0.0003843484358610457, "loss": 0.5877, "step": 17100 }, { "epoch": 0.3808315527065527, "grad_norm": 0.8092899322509766, "learning_rate": 0.00038433035059865227, "loss": 0.5595, "step": 17110 }, { "epoch": 0.38105413105413105, "grad_norm": 0.716167688369751, "learning_rate": 0.00038431225531959667, "loss": 0.7413, "step": 17120 }, { "epoch": 0.3812767094017094, "grad_norm": 1.0542246103286743, "learning_rate": 0.00038429415002486225, "loss": 0.6816, "step": 17130 }, { "epoch": 0.38149928774928776, "grad_norm": 0.6427927017211914, "learning_rate": 0.0003842760347154328, "loss": 0.578, "step": 17140 }, { "epoch": 0.3817218660968661, "grad_norm": 0.5981531143188477, "learning_rate": 0.00038425790939229285, "loss": 0.4801, "step": 17150 }, { "epoch": 0.3819444444444444, "grad_norm": 0.9075816869735718, "learning_rate": 0.0003842397740564272, "loss": 0.6257, "step": 17160 }, { "epoch": 0.3821670227920228, "grad_norm": 0.9044390916824341, "learning_rate": 0.00038422162870882146, "loss": 0.6812, "step": 17170 }, { "epoch": 0.38238960113960113, "grad_norm": 0.7249968647956848, "learning_rate": 0.00038420347335046154, "loss": 0.6322, "step": 17180 }, { "epoch": 0.38261217948717946, "grad_norm": 0.58587646484375, "learning_rate": 0.00038418530798233413, "loss": 0.5745, "step": 17190 }, { "epoch": 0.38283475783475784, "grad_norm": 0.6931901574134827, "learning_rate": 0.0003841671326054263, "loss": 0.7244, "step": 17200 }, { "epoch": 0.38305733618233617, "grad_norm": 0.6910154223442078, "learning_rate": 0.0003841489472207257, "loss": 0.7252, "step": 17210 }, { "epoch": 0.38327991452991456, "grad_norm": 0.6198472380638123, "learning_rate": 0.0003841307518292205, "loss": 0.7574, "step": 17220 }, { "epoch": 0.3835024928774929, "grad_norm": 0.6532488465309143, "learning_rate": 0.0003841125464318996, "loss": 0.6653, "step": 17230 }, { "epoch": 0.3837250712250712, "grad_norm": 0.8199801445007324, "learning_rate": 0.00038409433102975225, "loss": 0.5744, "step": 17240 }, { "epoch": 0.3839476495726496, "grad_norm": 0.47872394323349, "learning_rate": 0.0003840761056237681, "loss": 0.6205, "step": 17250 }, { "epoch": 0.3841702279202279, "grad_norm": 0.6670446395874023, "learning_rate": 0.0003840578702149378, "loss": 0.6703, "step": 17260 }, { "epoch": 0.38439280626780625, "grad_norm": 0.6099238991737366, "learning_rate": 0.00038403962480425204, "loss": 0.6519, "step": 17270 }, { "epoch": 0.38461538461538464, "grad_norm": 0.862278163433075, "learning_rate": 0.0003840213693927025, "loss": 0.5906, "step": 17280 }, { "epoch": 0.38483796296296297, "grad_norm": 0.6451935172080994, "learning_rate": 0.00038400310398128105, "loss": 0.7502, "step": 17290 }, { "epoch": 0.3850605413105413, "grad_norm": 0.7635634541511536, "learning_rate": 0.00038398482857098036, "loss": 0.6136, "step": 17300 }, { "epoch": 0.3852831196581197, "grad_norm": 0.7795054912567139, "learning_rate": 0.0003839665431627934, "loss": 0.5953, "step": 17310 }, { "epoch": 0.385505698005698, "grad_norm": 0.7930306196212769, "learning_rate": 0.0003839482477577139, "loss": 0.7062, "step": 17320 }, { "epoch": 0.38572827635327633, "grad_norm": 0.5813744068145752, "learning_rate": 0.000383929942356736, "loss": 0.5969, "step": 17330 }, { "epoch": 0.3859508547008547, "grad_norm": 0.8584675192832947, "learning_rate": 0.00038391162696085456, "loss": 0.6734, "step": 17340 }, { "epoch": 0.38617343304843305, "grad_norm": 0.6163985729217529, "learning_rate": 0.00038389330157106473, "loss": 0.5826, "step": 17350 }, { "epoch": 0.3863960113960114, "grad_norm": 0.6514464020729065, "learning_rate": 0.00038387496618836226, "loss": 0.5307, "step": 17360 }, { "epoch": 0.38661858974358976, "grad_norm": 0.9078758358955383, "learning_rate": 0.00038385662081374364, "loss": 0.7306, "step": 17370 }, { "epoch": 0.3868411680911681, "grad_norm": 0.8621419668197632, "learning_rate": 0.0003838382654482058, "loss": 0.7171, "step": 17380 }, { "epoch": 0.3870637464387464, "grad_norm": 0.5458530187606812, "learning_rate": 0.00038381990009274603, "loss": 0.6731, "step": 17390 }, { "epoch": 0.3872863247863248, "grad_norm": 0.4690433442592621, "learning_rate": 0.00038380152474836246, "loss": 0.7042, "step": 17400 }, { "epoch": 0.3875089031339031, "grad_norm": 0.4992372393608093, "learning_rate": 0.0003837831394160535, "loss": 0.5502, "step": 17410 }, { "epoch": 0.38773148148148145, "grad_norm": 0.601311981678009, "learning_rate": 0.0003837647440968184, "loss": 0.583, "step": 17420 }, { "epoch": 0.38795405982905984, "grad_norm": 0.7417802214622498, "learning_rate": 0.00038374633879165664, "loss": 0.6168, "step": 17430 }, { "epoch": 0.38817663817663817, "grad_norm": 0.8696532845497131, "learning_rate": 0.00038372792350156834, "loss": 0.6895, "step": 17440 }, { "epoch": 0.38839921652421655, "grad_norm": 0.660205066204071, "learning_rate": 0.00038370949822755436, "loss": 0.7395, "step": 17450 }, { "epoch": 0.3886217948717949, "grad_norm": 0.506757915019989, "learning_rate": 0.0003836910629706158, "loss": 0.6585, "step": 17460 }, { "epoch": 0.3888443732193732, "grad_norm": 0.793761134147644, "learning_rate": 0.00038367261773175447, "loss": 0.5542, "step": 17470 }, { "epoch": 0.3890669515669516, "grad_norm": 0.7295845746994019, "learning_rate": 0.00038365416251197283, "loss": 0.577, "step": 17480 }, { "epoch": 0.3892895299145299, "grad_norm": 1.1868880987167358, "learning_rate": 0.0003836356973122735, "loss": 0.7305, "step": 17490 }, { "epoch": 0.38951210826210825, "grad_norm": 0.6900010704994202, "learning_rate": 0.0003836172221336602, "loss": 0.6991, "step": 17500 }, { "epoch": 0.38973468660968663, "grad_norm": 0.7464649677276611, "learning_rate": 0.0003835987369771367, "loss": 0.607, "step": 17510 }, { "epoch": 0.38995726495726496, "grad_norm": 0.6252322196960449, "learning_rate": 0.00038358024184370745, "loss": 0.563, "step": 17520 }, { "epoch": 0.3901798433048433, "grad_norm": 0.6333285570144653, "learning_rate": 0.0003835617367343776, "loss": 0.6089, "step": 17530 }, { "epoch": 0.39040242165242167, "grad_norm": 0.7202562093734741, "learning_rate": 0.0003835432216501528, "loss": 0.7287, "step": 17540 }, { "epoch": 0.390625, "grad_norm": 0.6413396000862122, "learning_rate": 0.000383524696592039, "loss": 0.6301, "step": 17550 }, { "epoch": 0.39084757834757833, "grad_norm": 0.521155834197998, "learning_rate": 0.0003835061615610429, "loss": 0.5874, "step": 17560 }, { "epoch": 0.3910701566951567, "grad_norm": 0.4937155544757843, "learning_rate": 0.0003834876165581719, "loss": 0.4613, "step": 17570 }, { "epoch": 0.39129273504273504, "grad_norm": 0.9195260405540466, "learning_rate": 0.0003834690615844335, "loss": 0.5578, "step": 17580 }, { "epoch": 0.39151531339031337, "grad_norm": 1.136011004447937, "learning_rate": 0.0003834504966408361, "loss": 0.6035, "step": 17590 }, { "epoch": 0.39173789173789175, "grad_norm": 0.5977762937545776, "learning_rate": 0.00038343192172838854, "loss": 0.6696, "step": 17600 }, { "epoch": 0.3919604700854701, "grad_norm": 0.5439203381538391, "learning_rate": 0.0003834133368481002, "loss": 0.6116, "step": 17610 }, { "epoch": 0.3921830484330484, "grad_norm": 0.40402865409851074, "learning_rate": 0.000383394742000981, "loss": 0.6585, "step": 17620 }, { "epoch": 0.3924056267806268, "grad_norm": 1.3911811113357544, "learning_rate": 0.00038337613718804136, "loss": 0.7167, "step": 17630 }, { "epoch": 0.3926282051282051, "grad_norm": 0.8521585464477539, "learning_rate": 0.00038335752241029235, "loss": 0.6441, "step": 17640 }, { "epoch": 0.39285078347578345, "grad_norm": 0.6077307462692261, "learning_rate": 0.0003833388976687454, "loss": 0.591, "step": 17650 }, { "epoch": 0.39307336182336183, "grad_norm": 0.7540651559829712, "learning_rate": 0.0003833202629644127, "loss": 0.6238, "step": 17660 }, { "epoch": 0.39329594017094016, "grad_norm": 1.261109709739685, "learning_rate": 0.0003833016182983069, "loss": 0.6148, "step": 17670 }, { "epoch": 0.39351851851851855, "grad_norm": 0.6413763761520386, "learning_rate": 0.00038328296367144097, "loss": 0.6449, "step": 17680 }, { "epoch": 0.3937410968660969, "grad_norm": 1.002094030380249, "learning_rate": 0.00038326429908482887, "loss": 0.5893, "step": 17690 }, { "epoch": 0.3939636752136752, "grad_norm": 0.6313454508781433, "learning_rate": 0.00038324562453948463, "loss": 0.7269, "step": 17700 }, { "epoch": 0.3941862535612536, "grad_norm": 0.49584224820137024, "learning_rate": 0.00038322694003642323, "loss": 0.5761, "step": 17710 }, { "epoch": 0.3944088319088319, "grad_norm": 0.8321060538291931, "learning_rate": 0.00038320824557665987, "loss": 0.5659, "step": 17720 }, { "epoch": 0.39463141025641024, "grad_norm": 0.5422239899635315, "learning_rate": 0.0003831895411612105, "loss": 0.5458, "step": 17730 }, { "epoch": 0.3948539886039886, "grad_norm": 0.5938933491706848, "learning_rate": 0.00038317082679109143, "loss": 0.6163, "step": 17740 }, { "epoch": 0.39507656695156695, "grad_norm": 0.8462273478507996, "learning_rate": 0.0003831521024673197, "loss": 0.6927, "step": 17750 }, { "epoch": 0.3952991452991453, "grad_norm": 0.7590085864067078, "learning_rate": 0.00038313336819091284, "loss": 0.7876, "step": 17760 }, { "epoch": 0.39552172364672367, "grad_norm": 0.6377508044242859, "learning_rate": 0.0003831146239628888, "loss": 0.6783, "step": 17770 }, { "epoch": 0.395744301994302, "grad_norm": 0.5582488775253296, "learning_rate": 0.00038309586978426617, "loss": 0.7001, "step": 17780 }, { "epoch": 0.3959668803418803, "grad_norm": 0.9236611127853394, "learning_rate": 0.00038307710565606414, "loss": 0.7399, "step": 17790 }, { "epoch": 0.3961894586894587, "grad_norm": 0.5856517553329468, "learning_rate": 0.0003830583315793023, "loss": 0.5419, "step": 17800 }, { "epoch": 0.39641203703703703, "grad_norm": 0.7868704199790955, "learning_rate": 0.0003830395475550008, "loss": 0.6998, "step": 17810 }, { "epoch": 0.39663461538461536, "grad_norm": 0.745160698890686, "learning_rate": 0.0003830207535841805, "loss": 0.6117, "step": 17820 }, { "epoch": 0.39685719373219375, "grad_norm": 0.966376006603241, "learning_rate": 0.00038300194966786263, "loss": 0.5653, "step": 17830 }, { "epoch": 0.3970797720797721, "grad_norm": 1.0567193031311035, "learning_rate": 0.000382983135807069, "loss": 0.6592, "step": 17840 }, { "epoch": 0.3973023504273504, "grad_norm": 0.6462680697441101, "learning_rate": 0.0003829643120028219, "loss": 0.6337, "step": 17850 }, { "epoch": 0.3975249287749288, "grad_norm": 0.7606768012046814, "learning_rate": 0.0003829454782561444, "loss": 0.6381, "step": 17860 }, { "epoch": 0.3977475071225071, "grad_norm": 0.697704553604126, "learning_rate": 0.0003829266345680598, "loss": 0.64, "step": 17870 }, { "epoch": 0.39797008547008544, "grad_norm": 0.7255660891532898, "learning_rate": 0.0003829077809395921, "loss": 0.7084, "step": 17880 }, { "epoch": 0.39819266381766383, "grad_norm": 0.7107759714126587, "learning_rate": 0.0003828889173717659, "loss": 0.6351, "step": 17890 }, { "epoch": 0.39841524216524216, "grad_norm": 0.635511577129364, "learning_rate": 0.0003828700438656062, "loss": 0.5843, "step": 17900 }, { "epoch": 0.39863782051282054, "grad_norm": 0.7312158942222595, "learning_rate": 0.0003828511604221386, "loss": 0.6226, "step": 17910 }, { "epoch": 0.39886039886039887, "grad_norm": 0.6904292702674866, "learning_rate": 0.0003828322670423893, "loss": 0.6916, "step": 17920 }, { "epoch": 0.3990829772079772, "grad_norm": 0.5614930987358093, "learning_rate": 0.0003828133637273848, "loss": 0.6149, "step": 17930 }, { "epoch": 0.3993055555555556, "grad_norm": 1.0300472974777222, "learning_rate": 0.00038279445047815255, "loss": 0.6267, "step": 17940 }, { "epoch": 0.3995281339031339, "grad_norm": 0.535834014415741, "learning_rate": 0.00038277552729572024, "loss": 0.6585, "step": 17950 }, { "epoch": 0.39975071225071224, "grad_norm": 0.4721270203590393, "learning_rate": 0.00038275659418111614, "loss": 0.5913, "step": 17960 }, { "epoch": 0.3999732905982906, "grad_norm": 0.9660285711288452, "learning_rate": 0.00038273765113536906, "loss": 0.652, "step": 17970 }, { "epoch": 0.40019586894586895, "grad_norm": 0.5322182178497314, "learning_rate": 0.0003827186981595085, "loss": 0.6456, "step": 17980 }, { "epoch": 0.4004184472934473, "grad_norm": 0.4535355567932129, "learning_rate": 0.0003826997352545642, "loss": 0.8067, "step": 17990 }, { "epoch": 0.40064102564102566, "grad_norm": 0.7890868186950684, "learning_rate": 0.00038268076242156684, "loss": 0.4926, "step": 18000 }, { "epoch": 0.400863603988604, "grad_norm": 0.8056808114051819, "learning_rate": 0.0003826617796615472, "loss": 0.628, "step": 18010 }, { "epoch": 0.4010861823361823, "grad_norm": 0.6489529609680176, "learning_rate": 0.00038264278697553697, "loss": 0.7349, "step": 18020 }, { "epoch": 0.4013087606837607, "grad_norm": 0.7820817232131958, "learning_rate": 0.00038262378436456815, "loss": 0.6355, "step": 18030 }, { "epoch": 0.40153133903133903, "grad_norm": 0.5969810485839844, "learning_rate": 0.0003826047718296734, "loss": 0.6453, "step": 18040 }, { "epoch": 0.40175391737891736, "grad_norm": 0.7493919134140015, "learning_rate": 0.0003825857493718858, "loss": 0.5864, "step": 18050 }, { "epoch": 0.40197649572649574, "grad_norm": 0.6539332270622253, "learning_rate": 0.0003825667169922392, "loss": 0.7754, "step": 18060 }, { "epoch": 0.40219907407407407, "grad_norm": 0.6059889793395996, "learning_rate": 0.0003825476746917677, "loss": 0.6061, "step": 18070 }, { "epoch": 0.4024216524216524, "grad_norm": 0.616847574710846, "learning_rate": 0.0003825286224715061, "loss": 0.5818, "step": 18080 }, { "epoch": 0.4026442307692308, "grad_norm": 0.728132963180542, "learning_rate": 0.0003825095603324898, "loss": 0.5179, "step": 18090 }, { "epoch": 0.4028668091168091, "grad_norm": 0.45363709330558777, "learning_rate": 0.0003824904882757545, "loss": 0.7275, "step": 18100 }, { "epoch": 0.40308938746438744, "grad_norm": 1.0843173265457153, "learning_rate": 0.0003824714063023367, "loss": 0.5816, "step": 18110 }, { "epoch": 0.4033119658119658, "grad_norm": 0.5212698578834534, "learning_rate": 0.00038245231441327333, "loss": 0.5885, "step": 18120 }, { "epoch": 0.40353454415954415, "grad_norm": 0.7329869866371155, "learning_rate": 0.00038243321260960186, "loss": 0.6696, "step": 18130 }, { "epoch": 0.40375712250712253, "grad_norm": 0.6578690409660339, "learning_rate": 0.00038241410089236014, "loss": 0.5821, "step": 18140 }, { "epoch": 0.40397970085470086, "grad_norm": 0.5347065329551697, "learning_rate": 0.00038239497926258697, "loss": 0.6141, "step": 18150 }, { "epoch": 0.4042022792022792, "grad_norm": 0.6627506017684937, "learning_rate": 0.00038237584772132126, "loss": 0.646, "step": 18160 }, { "epoch": 0.4044248575498576, "grad_norm": 0.6520497798919678, "learning_rate": 0.0003823567062696027, "loss": 0.5735, "step": 18170 }, { "epoch": 0.4046474358974359, "grad_norm": 1.2275110483169556, "learning_rate": 0.00038233755490847145, "loss": 0.5811, "step": 18180 }, { "epoch": 0.40487001424501423, "grad_norm": 0.7590609788894653, "learning_rate": 0.0003823183936389682, "loss": 0.594, "step": 18190 }, { "epoch": 0.4050925925925926, "grad_norm": 0.7115891575813293, "learning_rate": 0.00038229922246213417, "loss": 0.5999, "step": 18200 }, { "epoch": 0.40531517094017094, "grad_norm": 0.4889591634273529, "learning_rate": 0.00038228004137901114, "loss": 0.6459, "step": 18210 }, { "epoch": 0.40553774928774927, "grad_norm": 0.7974612712860107, "learning_rate": 0.0003822608503906414, "loss": 0.7665, "step": 18220 }, { "epoch": 0.40576032763532766, "grad_norm": 0.4833396077156067, "learning_rate": 0.0003822416494980679, "loss": 0.5332, "step": 18230 }, { "epoch": 0.405982905982906, "grad_norm": 0.7940080165863037, "learning_rate": 0.000382222438702334, "loss": 0.6702, "step": 18240 }, { "epoch": 0.4062054843304843, "grad_norm": 0.6132873892784119, "learning_rate": 0.00038220321800448356, "loss": 0.5453, "step": 18250 }, { "epoch": 0.4064280626780627, "grad_norm": 0.7880875468254089, "learning_rate": 0.00038218398740556115, "loss": 0.7812, "step": 18260 }, { "epoch": 0.406650641025641, "grad_norm": 0.6013901233673096, "learning_rate": 0.0003821647469066117, "loss": 0.705, "step": 18270 }, { "epoch": 0.40687321937321935, "grad_norm": 0.6151648759841919, "learning_rate": 0.0003821454965086807, "loss": 0.6537, "step": 18280 }, { "epoch": 0.40709579772079774, "grad_norm": 0.6463825702667236, "learning_rate": 0.0003821262362128144, "loss": 0.6388, "step": 18290 }, { "epoch": 0.40731837606837606, "grad_norm": 0.7021704316139221, "learning_rate": 0.0003821069660200593, "loss": 0.6958, "step": 18300 }, { "epoch": 0.4075409544159544, "grad_norm": 0.9794663786888123, "learning_rate": 0.0003820876859314626, "loss": 0.6377, "step": 18310 }, { "epoch": 0.4077635327635328, "grad_norm": 0.8478348255157471, "learning_rate": 0.00038206839594807197, "loss": 0.6647, "step": 18320 }, { "epoch": 0.4079861111111111, "grad_norm": 0.7850647568702698, "learning_rate": 0.00038204909607093563, "loss": 0.4885, "step": 18330 }, { "epoch": 0.40820868945868943, "grad_norm": 0.9271420836448669, "learning_rate": 0.00038202978630110245, "loss": 0.716, "step": 18340 }, { "epoch": 0.4084312678062678, "grad_norm": 0.8825610280036926, "learning_rate": 0.0003820104666396216, "loss": 0.5358, "step": 18350 }, { "epoch": 0.40865384615384615, "grad_norm": 0.5441417694091797, "learning_rate": 0.000381991137087543, "loss": 0.6127, "step": 18360 }, { "epoch": 0.40887642450142453, "grad_norm": 0.8587644696235657, "learning_rate": 0.00038197179764591703, "loss": 0.5525, "step": 18370 }, { "epoch": 0.40909900284900286, "grad_norm": 0.6348844170570374, "learning_rate": 0.0003819524483157946, "loss": 0.6251, "step": 18380 }, { "epoch": 0.4093215811965812, "grad_norm": 0.5927433967590332, "learning_rate": 0.0003819330890982272, "loss": 0.4646, "step": 18390 }, { "epoch": 0.40954415954415957, "grad_norm": 0.6170734763145447, "learning_rate": 0.0003819137199942668, "loss": 0.6167, "step": 18400 }, { "epoch": 0.4097667378917379, "grad_norm": 0.49462154507637024, "learning_rate": 0.00038189434100496594, "loss": 0.6901, "step": 18410 }, { "epoch": 0.4099893162393162, "grad_norm": 0.5856154561042786, "learning_rate": 0.0003818749521313777, "loss": 0.7309, "step": 18420 }, { "epoch": 0.4102118945868946, "grad_norm": 0.7598608732223511, "learning_rate": 0.0003818555533745556, "loss": 0.6551, "step": 18430 }, { "epoch": 0.41043447293447294, "grad_norm": 0.5119199156761169, "learning_rate": 0.00038183614473555387, "loss": 0.7321, "step": 18440 }, { "epoch": 0.41065705128205127, "grad_norm": 0.6292290687561035, "learning_rate": 0.0003818167262154272, "loss": 0.6233, "step": 18450 }, { "epoch": 0.41087962962962965, "grad_norm": 0.47254326939582825, "learning_rate": 0.0003817972978152308, "loss": 0.6084, "step": 18460 }, { "epoch": 0.411102207977208, "grad_norm": 0.7951480746269226, "learning_rate": 0.00038177785953602035, "loss": 0.6765, "step": 18470 }, { "epoch": 0.4113247863247863, "grad_norm": 0.7218686938285828, "learning_rate": 0.00038175841137885226, "loss": 0.5814, "step": 18480 }, { "epoch": 0.4115473646723647, "grad_norm": 0.6350971460342407, "learning_rate": 0.00038173895334478333, "loss": 0.557, "step": 18490 }, { "epoch": 0.411769943019943, "grad_norm": 0.43875062465667725, "learning_rate": 0.0003817194854348709, "loss": 0.4481, "step": 18500 }, { "epoch": 0.41199252136752135, "grad_norm": 0.49206212162971497, "learning_rate": 0.0003817000076501728, "loss": 0.5852, "step": 18510 }, { "epoch": 0.41221509971509973, "grad_norm": 0.4355153441429138, "learning_rate": 0.00038168051999174754, "loss": 0.5811, "step": 18520 }, { "epoch": 0.41243767806267806, "grad_norm": 0.5758615136146545, "learning_rate": 0.00038166102246065415, "loss": 0.7024, "step": 18530 }, { "epoch": 0.4126602564102564, "grad_norm": 0.4741193354129791, "learning_rate": 0.000381641515057952, "loss": 0.7598, "step": 18540 }, { "epoch": 0.41288283475783477, "grad_norm": 1.0277267694473267, "learning_rate": 0.00038162199778470134, "loss": 0.6103, "step": 18550 }, { "epoch": 0.4131054131054131, "grad_norm": 0.8170305490493774, "learning_rate": 0.00038160247064196256, "loss": 0.683, "step": 18560 }, { "epoch": 0.41332799145299143, "grad_norm": 0.5887982249259949, "learning_rate": 0.00038158293363079685, "loss": 0.7479, "step": 18570 }, { "epoch": 0.4135505698005698, "grad_norm": 0.7957956194877625, "learning_rate": 0.0003815633867522659, "loss": 0.6387, "step": 18580 }, { "epoch": 0.41377314814814814, "grad_norm": 0.5423303842544556, "learning_rate": 0.0003815438300074319, "loss": 0.5784, "step": 18590 }, { "epoch": 0.41399572649572647, "grad_norm": 0.6572567820549011, "learning_rate": 0.00038152426339735753, "loss": 0.5361, "step": 18600 }, { "epoch": 0.41421830484330485, "grad_norm": 1.020389199256897, "learning_rate": 0.0003815046869231061, "loss": 0.7019, "step": 18610 }, { "epoch": 0.4144408831908832, "grad_norm": 0.6280043721199036, "learning_rate": 0.0003814851005857413, "loss": 0.5532, "step": 18620 }, { "epoch": 0.41466346153846156, "grad_norm": 1.0602699518203735, "learning_rate": 0.0003814655043863277, "loss": 0.6681, "step": 18630 }, { "epoch": 0.4148860398860399, "grad_norm": 1.3655202388763428, "learning_rate": 0.00038144589832593003, "loss": 0.5886, "step": 18640 }, { "epoch": 0.4151086182336182, "grad_norm": 1.4455960988998413, "learning_rate": 0.0003814262824056137, "loss": 0.6894, "step": 18650 }, { "epoch": 0.4153311965811966, "grad_norm": 0.6867120862007141, "learning_rate": 0.00038140665662644456, "loss": 0.7084, "step": 18660 }, { "epoch": 0.41555377492877493, "grad_norm": 0.6166002154350281, "learning_rate": 0.00038138702098948924, "loss": 0.638, "step": 18670 }, { "epoch": 0.41577635327635326, "grad_norm": 0.7494068741798401, "learning_rate": 0.00038136737549581475, "loss": 0.6502, "step": 18680 }, { "epoch": 0.41599893162393164, "grad_norm": 0.9189199209213257, "learning_rate": 0.00038134772014648854, "loss": 0.6337, "step": 18690 }, { "epoch": 0.41622150997151, "grad_norm": 0.44938209652900696, "learning_rate": 0.0003813280549425788, "loss": 0.6499, "step": 18700 }, { "epoch": 0.4164440883190883, "grad_norm": 0.4579305648803711, "learning_rate": 0.0003813083798851541, "loss": 0.6309, "step": 18710 }, { "epoch": 0.4166666666666667, "grad_norm": 0.5728359818458557, "learning_rate": 0.0003812886949752837, "loss": 0.6228, "step": 18720 }, { "epoch": 0.416889245014245, "grad_norm": 0.652080237865448, "learning_rate": 0.00038126900021403707, "loss": 0.6655, "step": 18730 }, { "epoch": 0.41711182336182334, "grad_norm": 0.6501042246818542, "learning_rate": 0.00038124929560248466, "loss": 0.6089, "step": 18740 }, { "epoch": 0.4173344017094017, "grad_norm": 0.7336424589157104, "learning_rate": 0.00038122958114169707, "loss": 0.5065, "step": 18750 }, { "epoch": 0.41755698005698005, "grad_norm": 0.5579254031181335, "learning_rate": 0.0003812098568327458, "loss": 0.5542, "step": 18760 }, { "epoch": 0.4177795584045584, "grad_norm": 0.585763156414032, "learning_rate": 0.00038119012267670246, "loss": 0.7124, "step": 18770 }, { "epoch": 0.41800213675213677, "grad_norm": 0.8041399121284485, "learning_rate": 0.00038117037867463956, "loss": 0.5581, "step": 18780 }, { "epoch": 0.4182247150997151, "grad_norm": 0.6856745481491089, "learning_rate": 0.00038115062482763, "loss": 0.5774, "step": 18790 }, { "epoch": 0.4184472934472934, "grad_norm": 0.6076633930206299, "learning_rate": 0.0003811308611367471, "loss": 0.5197, "step": 18800 }, { "epoch": 0.4186698717948718, "grad_norm": 0.8954238891601562, "learning_rate": 0.000381111087603065, "loss": 0.6354, "step": 18810 }, { "epoch": 0.41889245014245013, "grad_norm": 0.6402801275253296, "learning_rate": 0.0003810913042276581, "loss": 0.5346, "step": 18820 }, { "epoch": 0.41911502849002846, "grad_norm": 0.8094208240509033, "learning_rate": 0.00038107151101160155, "loss": 0.6664, "step": 18830 }, { "epoch": 0.41933760683760685, "grad_norm": 0.8701035976409912, "learning_rate": 0.0003810517079559708, "loss": 0.5255, "step": 18840 }, { "epoch": 0.4195601851851852, "grad_norm": 0.5222055315971375, "learning_rate": 0.000381031895061842, "loss": 0.5832, "step": 18850 }, { "epoch": 0.41978276353276356, "grad_norm": 0.5464537739753723, "learning_rate": 0.00038101207233029184, "loss": 0.638, "step": 18860 }, { "epoch": 0.4200053418803419, "grad_norm": 0.666346549987793, "learning_rate": 0.0003809922397623975, "loss": 0.5112, "step": 18870 }, { "epoch": 0.4200498575498576, "eval_loss": 0.6332426071166992, "eval_runtime": 337.3455, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 18872 }, { "epoch": 0.4202279202279202, "grad_norm": 0.4587787091732025, "learning_rate": 0.00038097239735923675, "loss": 0.633, "step": 18880 }, { "epoch": 0.4204504985754986, "grad_norm": 0.9687144160270691, "learning_rate": 0.0003809525451218877, "loss": 0.8232, "step": 18890 }, { "epoch": 0.4206730769230769, "grad_norm": 1.0683296918869019, "learning_rate": 0.0003809326830514292, "loss": 0.5876, "step": 18900 }, { "epoch": 0.42089565527065526, "grad_norm": 0.6031848192214966, "learning_rate": 0.0003809128111489406, "loss": 0.6018, "step": 18910 }, { "epoch": 0.42111823361823364, "grad_norm": 0.961243212223053, "learning_rate": 0.0003808929294155018, "loss": 0.6699, "step": 18920 }, { "epoch": 0.42134081196581197, "grad_norm": 0.9062433838844299, "learning_rate": 0.00038087303785219306, "loss": 0.7075, "step": 18930 }, { "epoch": 0.4215633903133903, "grad_norm": 0.5354229807853699, "learning_rate": 0.0003808531364600954, "loss": 0.7262, "step": 18940 }, { "epoch": 0.4217859686609687, "grad_norm": 0.7969614863395691, "learning_rate": 0.00038083322524029025, "loss": 0.6425, "step": 18950 }, { "epoch": 0.422008547008547, "grad_norm": 0.6732149720191956, "learning_rate": 0.0003808133041938596, "loss": 0.6986, "step": 18960 }, { "epoch": 0.42223112535612534, "grad_norm": 0.6219040751457214, "learning_rate": 0.0003807933733218859, "loss": 0.6399, "step": 18970 }, { "epoch": 0.4224537037037037, "grad_norm": 0.44824638962745667, "learning_rate": 0.0003807734326254524, "loss": 0.5337, "step": 18980 }, { "epoch": 0.42267628205128205, "grad_norm": 0.5918938517570496, "learning_rate": 0.0003807534821056426, "loss": 0.5343, "step": 18990 }, { "epoch": 0.4228988603988604, "grad_norm": 0.41495221853256226, "learning_rate": 0.00038073352176354054, "loss": 0.6466, "step": 19000 }, { "epoch": 0.42312143874643876, "grad_norm": 0.643989086151123, "learning_rate": 0.00038071355160023096, "loss": 0.6864, "step": 19010 }, { "epoch": 0.4233440170940171, "grad_norm": 0.7840922474861145, "learning_rate": 0.00038069357161679907, "loss": 0.7135, "step": 19020 }, { "epoch": 0.4235665954415954, "grad_norm": 0.8468214273452759, "learning_rate": 0.00038067358181433054, "loss": 0.514, "step": 19030 }, { "epoch": 0.4237891737891738, "grad_norm": 0.7911776304244995, "learning_rate": 0.00038065358219391174, "loss": 0.6465, "step": 19040 }, { "epoch": 0.42401175213675213, "grad_norm": 0.5380447506904602, "learning_rate": 0.00038063357275662936, "loss": 0.6041, "step": 19050 }, { "epoch": 0.42423433048433046, "grad_norm": 0.5977974534034729, "learning_rate": 0.0003806135535035707, "loss": 0.6742, "step": 19060 }, { "epoch": 0.42445690883190884, "grad_norm": 0.636170506477356, "learning_rate": 0.00038059352443582374, "loss": 0.6234, "step": 19070 }, { "epoch": 0.42467948717948717, "grad_norm": 0.4726337492465973, "learning_rate": 0.0003805734855544768, "loss": 0.5494, "step": 19080 }, { "epoch": 0.42490206552706555, "grad_norm": 0.7931132912635803, "learning_rate": 0.0003805534368606189, "loss": 0.6485, "step": 19090 }, { "epoch": 0.4251246438746439, "grad_norm": 0.766657292842865, "learning_rate": 0.00038053337835533937, "loss": 0.5549, "step": 19100 }, { "epoch": 0.4253472222222222, "grad_norm": 0.796826183795929, "learning_rate": 0.0003805133100397283, "loss": 0.7259, "step": 19110 }, { "epoch": 0.4255698005698006, "grad_norm": 0.7064245343208313, "learning_rate": 0.0003804932319148761, "loss": 0.6653, "step": 19120 }, { "epoch": 0.4257923789173789, "grad_norm": 0.7597333788871765, "learning_rate": 0.000380473143981874, "loss": 0.6861, "step": 19130 }, { "epoch": 0.42601495726495725, "grad_norm": 0.7162373661994934, "learning_rate": 0.00038045304624181354, "loss": 0.8093, "step": 19140 }, { "epoch": 0.42623753561253563, "grad_norm": 0.4276905059814453, "learning_rate": 0.0003804329386957868, "loss": 0.5909, "step": 19150 }, { "epoch": 0.42646011396011396, "grad_norm": 1.56265389919281, "learning_rate": 0.0003804128213448864, "loss": 0.7254, "step": 19160 }, { "epoch": 0.4266826923076923, "grad_norm": 0.6898297071456909, "learning_rate": 0.00038039269419020566, "loss": 0.6241, "step": 19170 }, { "epoch": 0.4269052706552707, "grad_norm": 0.5293618440628052, "learning_rate": 0.00038037255723283824, "loss": 0.6753, "step": 19180 }, { "epoch": 0.427127849002849, "grad_norm": 0.7283686995506287, "learning_rate": 0.00038035241047387834, "loss": 0.6608, "step": 19190 }, { "epoch": 0.42735042735042733, "grad_norm": 0.6621398329734802, "learning_rate": 0.00038033225391442084, "loss": 0.6593, "step": 19200 }, { "epoch": 0.4275730056980057, "grad_norm": 0.6056594252586365, "learning_rate": 0.00038031208755556105, "loss": 0.5824, "step": 19210 }, { "epoch": 0.42779558404558404, "grad_norm": 0.818027138710022, "learning_rate": 0.0003802919113983948, "loss": 0.5348, "step": 19220 }, { "epoch": 0.42801816239316237, "grad_norm": 0.6298645734786987, "learning_rate": 0.0003802717254440185, "loss": 0.6155, "step": 19230 }, { "epoch": 0.42824074074074076, "grad_norm": 0.48016032576560974, "learning_rate": 0.00038025152969352907, "loss": 0.593, "step": 19240 }, { "epoch": 0.4284633190883191, "grad_norm": 0.3382346034049988, "learning_rate": 0.00038023132414802393, "loss": 0.4929, "step": 19250 }, { "epoch": 0.4286858974358974, "grad_norm": 0.8397310972213745, "learning_rate": 0.00038021110880860116, "loss": 0.5998, "step": 19260 }, { "epoch": 0.4289084757834758, "grad_norm": 0.813310980796814, "learning_rate": 0.0003801908836763591, "loss": 0.7211, "step": 19270 }, { "epoch": 0.4291310541310541, "grad_norm": 0.7902941107749939, "learning_rate": 0.000380170648752397, "loss": 0.6227, "step": 19280 }, { "epoch": 0.42935363247863245, "grad_norm": 0.5741865634918213, "learning_rate": 0.0003801504040378143, "loss": 0.4576, "step": 19290 }, { "epoch": 0.42957621082621084, "grad_norm": 0.4125952124595642, "learning_rate": 0.00038013014953371127, "loss": 0.4642, "step": 19300 }, { "epoch": 0.42979878917378916, "grad_norm": 0.8440093398094177, "learning_rate": 0.0003801098852411883, "loss": 0.8402, "step": 19310 }, { "epoch": 0.43002136752136755, "grad_norm": 0.7689343094825745, "learning_rate": 0.0003800896111613468, "loss": 0.6535, "step": 19320 }, { "epoch": 0.4302439458689459, "grad_norm": 0.5488240718841553, "learning_rate": 0.0003800693272952884, "loss": 0.6912, "step": 19330 }, { "epoch": 0.4304665242165242, "grad_norm": 0.6208325624465942, "learning_rate": 0.0003800490336441153, "loss": 0.6589, "step": 19340 }, { "epoch": 0.4306891025641026, "grad_norm": 0.5974116921424866, "learning_rate": 0.0003800287302089304, "loss": 0.5887, "step": 19350 }, { "epoch": 0.4309116809116809, "grad_norm": 0.7017622590065002, "learning_rate": 0.00038000841699083686, "loss": 0.5926, "step": 19360 }, { "epoch": 0.43113425925925924, "grad_norm": 0.7657302021980286, "learning_rate": 0.0003799880939909386, "loss": 0.6634, "step": 19370 }, { "epoch": 0.43135683760683763, "grad_norm": 0.7428944706916809, "learning_rate": 0.00037996776121034, "loss": 0.5295, "step": 19380 }, { "epoch": 0.43157941595441596, "grad_norm": 0.7173558473587036, "learning_rate": 0.00037994741865014585, "loss": 0.6091, "step": 19390 }, { "epoch": 0.4318019943019943, "grad_norm": 0.42774316668510437, "learning_rate": 0.00037992706631146165, "loss": 0.4986, "step": 19400 }, { "epoch": 0.43202457264957267, "grad_norm": 0.7183822989463806, "learning_rate": 0.00037990670419539346, "loss": 0.6067, "step": 19410 }, { "epoch": 0.432247150997151, "grad_norm": 0.8368187546730042, "learning_rate": 0.0003798863323030476, "loss": 0.8275, "step": 19420 }, { "epoch": 0.4324697293447293, "grad_norm": 1.3504329919815063, "learning_rate": 0.0003798659506355313, "loss": 0.6697, "step": 19430 }, { "epoch": 0.4326923076923077, "grad_norm": 0.4535563588142395, "learning_rate": 0.0003798455591939519, "loss": 0.5502, "step": 19440 }, { "epoch": 0.43291488603988604, "grad_norm": 0.5445149540901184, "learning_rate": 0.0003798251579794176, "loss": 0.5602, "step": 19450 }, { "epoch": 0.43313746438746437, "grad_norm": 0.43985071778297424, "learning_rate": 0.000379804746993037, "loss": 0.5629, "step": 19460 }, { "epoch": 0.43336004273504275, "grad_norm": 0.6011999845504761, "learning_rate": 0.0003797843262359193, "loss": 0.655, "step": 19470 }, { "epoch": 0.4335826210826211, "grad_norm": 0.5765155553817749, "learning_rate": 0.00037976389570917407, "loss": 0.6234, "step": 19480 }, { "epoch": 0.4338051994301994, "grad_norm": 0.4562526047229767, "learning_rate": 0.0003797434554139116, "loss": 0.64, "step": 19490 }, { "epoch": 0.4340277777777778, "grad_norm": 0.846602201461792, "learning_rate": 0.00037972300535124267, "loss": 0.7789, "step": 19500 }, { "epoch": 0.4342503561253561, "grad_norm": 0.47665026783943176, "learning_rate": 0.00037970254552227844, "loss": 0.5548, "step": 19510 }, { "epoch": 0.43447293447293445, "grad_norm": 0.7600558996200562, "learning_rate": 0.0003796820759281308, "loss": 0.7801, "step": 19520 }, { "epoch": 0.43469551282051283, "grad_norm": 0.6859473586082458, "learning_rate": 0.0003796615965699121, "loss": 0.5721, "step": 19530 }, { "epoch": 0.43491809116809116, "grad_norm": 0.8362597823143005, "learning_rate": 0.0003796411074487351, "loss": 0.6686, "step": 19540 }, { "epoch": 0.43514066951566954, "grad_norm": 0.47653642296791077, "learning_rate": 0.0003796206085657133, "loss": 0.7665, "step": 19550 }, { "epoch": 0.43536324786324787, "grad_norm": 0.7798507213592529, "learning_rate": 0.00037960009992196053, "loss": 0.6039, "step": 19560 }, { "epoch": 0.4355858262108262, "grad_norm": 0.8148535490036011, "learning_rate": 0.00037957958151859137, "loss": 0.7193, "step": 19570 }, { "epoch": 0.4358084045584046, "grad_norm": 0.842745304107666, "learning_rate": 0.0003795590533567207, "loss": 0.5693, "step": 19580 }, { "epoch": 0.4360309829059829, "grad_norm": 0.8762781023979187, "learning_rate": 0.0003795385154374641, "loss": 0.5878, "step": 19590 }, { "epoch": 0.43625356125356124, "grad_norm": 0.4569462239742279, "learning_rate": 0.0003795179677619376, "loss": 0.5811, "step": 19600 }, { "epoch": 0.4364761396011396, "grad_norm": 1.1307685375213623, "learning_rate": 0.0003794974103312577, "loss": 0.5775, "step": 19610 }, { "epoch": 0.43669871794871795, "grad_norm": 1.057785153388977, "learning_rate": 0.00037947684314654164, "loss": 0.7388, "step": 19620 }, { "epoch": 0.4369212962962963, "grad_norm": 0.6690455079078674, "learning_rate": 0.000379456266208907, "loss": 0.5863, "step": 19630 }, { "epoch": 0.43714387464387466, "grad_norm": 0.9382486939430237, "learning_rate": 0.00037943567951947196, "loss": 0.5357, "step": 19640 }, { "epoch": 0.437366452991453, "grad_norm": 0.6697343587875366, "learning_rate": 0.00037941508307935516, "loss": 0.5759, "step": 19650 }, { "epoch": 0.4375890313390313, "grad_norm": 0.6135135889053345, "learning_rate": 0.0003793944768896759, "loss": 0.6192, "step": 19660 }, { "epoch": 0.4378116096866097, "grad_norm": 0.7405604124069214, "learning_rate": 0.0003793738609515539, "loss": 0.6352, "step": 19670 }, { "epoch": 0.43803418803418803, "grad_norm": 0.7242349982261658, "learning_rate": 0.0003793532352661094, "loss": 0.6406, "step": 19680 }, { "epoch": 0.43825676638176636, "grad_norm": 0.5112965106964111, "learning_rate": 0.0003793325998344633, "loss": 0.5229, "step": 19690 }, { "epoch": 0.43847934472934474, "grad_norm": 0.573747992515564, "learning_rate": 0.000379311954657737, "loss": 0.6026, "step": 19700 }, { "epoch": 0.4387019230769231, "grad_norm": 0.6106606721878052, "learning_rate": 0.00037929129973705215, "loss": 0.5305, "step": 19710 }, { "epoch": 0.4389245014245014, "grad_norm": 0.6479077935218811, "learning_rate": 0.0003792706350735314, "loss": 0.5524, "step": 19720 }, { "epoch": 0.4391470797720798, "grad_norm": 0.43118560314178467, "learning_rate": 0.00037924996066829753, "loss": 0.5813, "step": 19730 }, { "epoch": 0.4393696581196581, "grad_norm": 1.0099085569381714, "learning_rate": 0.0003792292765224741, "loss": 0.6683, "step": 19740 }, { "epoch": 0.43959223646723644, "grad_norm": 0.7920699119567871, "learning_rate": 0.00037920858263718504, "loss": 0.6348, "step": 19750 }, { "epoch": 0.4398148148148148, "grad_norm": 0.7661017179489136, "learning_rate": 0.0003791878790135549, "loss": 0.5387, "step": 19760 }, { "epoch": 0.44003739316239315, "grad_norm": 0.7241034507751465, "learning_rate": 0.0003791671656527087, "loss": 0.5588, "step": 19770 }, { "epoch": 0.44025997150997154, "grad_norm": 0.8879605531692505, "learning_rate": 0.0003791464425557721, "loss": 0.7116, "step": 19780 }, { "epoch": 0.44048254985754987, "grad_norm": 0.7457120418548584, "learning_rate": 0.00037912570972387116, "loss": 0.6061, "step": 19790 }, { "epoch": 0.4407051282051282, "grad_norm": 0.49138614535331726, "learning_rate": 0.0003791049671581324, "loss": 0.4982, "step": 19800 }, { "epoch": 0.4409277065527066, "grad_norm": 0.669255256652832, "learning_rate": 0.0003790842148596832, "loss": 0.5541, "step": 19810 }, { "epoch": 0.4411502849002849, "grad_norm": 0.554853081703186, "learning_rate": 0.0003790634528296511, "loss": 0.6385, "step": 19820 }, { "epoch": 0.44137286324786323, "grad_norm": 0.49231138825416565, "learning_rate": 0.00037904268106916445, "loss": 0.5486, "step": 19830 }, { "epoch": 0.4415954415954416, "grad_norm": 0.8338886499404907, "learning_rate": 0.00037902189957935193, "loss": 0.6802, "step": 19840 }, { "epoch": 0.44181801994301995, "grad_norm": 0.6095072031021118, "learning_rate": 0.0003790011083613428, "loss": 0.5594, "step": 19850 }, { "epoch": 0.4420405982905983, "grad_norm": 0.4810936450958252, "learning_rate": 0.00037898030741626693, "loss": 0.6117, "step": 19860 }, { "epoch": 0.44226317663817666, "grad_norm": 0.42109841108322144, "learning_rate": 0.0003789594967452546, "loss": 0.5319, "step": 19870 }, { "epoch": 0.442485754985755, "grad_norm": 0.9655678272247314, "learning_rate": 0.00037893867634943674, "loss": 0.6568, "step": 19880 }, { "epoch": 0.4427083333333333, "grad_norm": 0.5279555916786194, "learning_rate": 0.0003789178462299447, "loss": 0.5831, "step": 19890 }, { "epoch": 0.4429309116809117, "grad_norm": 0.8120352625846863, "learning_rate": 0.0003788970063879105, "loss": 0.7757, "step": 19900 }, { "epoch": 0.44315349002849, "grad_norm": 0.9156867861747742, "learning_rate": 0.0003788761568244664, "loss": 0.6598, "step": 19910 }, { "epoch": 0.44337606837606836, "grad_norm": 0.3847975432872772, "learning_rate": 0.0003788552975407456, "loss": 0.5608, "step": 19920 }, { "epoch": 0.44359864672364674, "grad_norm": 0.8960539102554321, "learning_rate": 0.0003788344285378815, "loss": 0.677, "step": 19930 }, { "epoch": 0.44382122507122507, "grad_norm": 0.9364832043647766, "learning_rate": 0.0003788135498170081, "loss": 0.8056, "step": 19940 }, { "epoch": 0.4440438034188034, "grad_norm": 0.7676774859428406, "learning_rate": 0.00037879266137926003, "loss": 0.6854, "step": 19950 }, { "epoch": 0.4442663817663818, "grad_norm": 0.7598232626914978, "learning_rate": 0.00037877176322577243, "loss": 0.6803, "step": 19960 }, { "epoch": 0.4444889601139601, "grad_norm": 1.0017249584197998, "learning_rate": 0.00037875085535768086, "loss": 0.6141, "step": 19970 }, { "epoch": 0.44471153846153844, "grad_norm": 0.8074401021003723, "learning_rate": 0.00037872993777612147, "loss": 0.6689, "step": 19980 }, { "epoch": 0.4449341168091168, "grad_norm": 0.5603516697883606, "learning_rate": 0.0003787090104822309, "loss": 0.4914, "step": 19990 }, { "epoch": 0.44515669515669515, "grad_norm": 0.6445973515510559, "learning_rate": 0.0003786880734771464, "loss": 0.6741, "step": 20000 }, { "epoch": 0.44537927350427353, "grad_norm": 0.7388302087783813, "learning_rate": 0.00037866712676200574, "loss": 0.6186, "step": 20010 }, { "epoch": 0.44560185185185186, "grad_norm": 0.6099076867103577, "learning_rate": 0.00037864617033794715, "loss": 0.475, "step": 20020 }, { "epoch": 0.4458244301994302, "grad_norm": 0.7826319336891174, "learning_rate": 0.00037862520420610943, "loss": 0.6261, "step": 20030 }, { "epoch": 0.44604700854700857, "grad_norm": 0.5059393048286438, "learning_rate": 0.0003786042283676319, "loss": 0.677, "step": 20040 }, { "epoch": 0.4462695868945869, "grad_norm": 0.6359015703201294, "learning_rate": 0.00037858324282365435, "loss": 0.6867, "step": 20050 }, { "epoch": 0.44649216524216523, "grad_norm": 0.5743014812469482, "learning_rate": 0.00037856224757531717, "loss": 0.567, "step": 20060 }, { "epoch": 0.4467147435897436, "grad_norm": 0.5873332023620605, "learning_rate": 0.00037854124262376134, "loss": 0.5609, "step": 20070 }, { "epoch": 0.44693732193732194, "grad_norm": 0.6396929025650024, "learning_rate": 0.0003785202279701282, "loss": 0.5618, "step": 20080 }, { "epoch": 0.44715990028490027, "grad_norm": 0.8452794551849365, "learning_rate": 0.00037849920361555966, "loss": 0.5362, "step": 20090 }, { "epoch": 0.44738247863247865, "grad_norm": 1.1678051948547363, "learning_rate": 0.0003784781695611983, "loss": 0.6013, "step": 20100 }, { "epoch": 0.447605056980057, "grad_norm": 0.6501947045326233, "learning_rate": 0.0003784571258081871, "loss": 0.5056, "step": 20110 }, { "epoch": 0.4478276353276353, "grad_norm": 0.7473871111869812, "learning_rate": 0.00037843607235766967, "loss": 0.6463, "step": 20120 }, { "epoch": 0.4480502136752137, "grad_norm": 0.4127056300640106, "learning_rate": 0.00037841500921078996, "loss": 0.6582, "step": 20130 }, { "epoch": 0.448272792022792, "grad_norm": 0.7116015553474426, "learning_rate": 0.0003783939363686925, "loss": 0.6459, "step": 20140 }, { "epoch": 0.44849537037037035, "grad_norm": 0.6783828735351562, "learning_rate": 0.0003783728538325226, "loss": 0.6952, "step": 20150 }, { "epoch": 0.44871794871794873, "grad_norm": 0.46691304445266724, "learning_rate": 0.0003783517616034258, "loss": 0.7028, "step": 20160 }, { "epoch": 0.44894052706552706, "grad_norm": 0.6711305379867554, "learning_rate": 0.00037833065968254824, "loss": 0.5297, "step": 20170 }, { "epoch": 0.4491631054131054, "grad_norm": 0.856510579586029, "learning_rate": 0.00037830954807103665, "loss": 0.7389, "step": 20180 }, { "epoch": 0.4493856837606838, "grad_norm": 0.6568981409072876, "learning_rate": 0.0003782884267700382, "loss": 0.7781, "step": 20190 }, { "epoch": 0.4496082621082621, "grad_norm": 1.019824743270874, "learning_rate": 0.00037826729578070077, "loss": 0.585, "step": 20200 }, { "epoch": 0.44983084045584043, "grad_norm": 0.9498888254165649, "learning_rate": 0.0003782461551041725, "loss": 0.7491, "step": 20210 }, { "epoch": 0.4500534188034188, "grad_norm": 1.014466643333435, "learning_rate": 0.0003782250047416023, "loss": 0.588, "step": 20220 }, { "epoch": 0.45027599715099714, "grad_norm": 0.5617772340774536, "learning_rate": 0.00037820384469413937, "loss": 0.5928, "step": 20230 }, { "epoch": 0.45049857549857547, "grad_norm": 0.5865727066993713, "learning_rate": 0.0003781826749629336, "loss": 0.7415, "step": 20240 }, { "epoch": 0.45072115384615385, "grad_norm": 0.8064214587211609, "learning_rate": 0.0003781614955491355, "loss": 0.5553, "step": 20250 }, { "epoch": 0.4509437321937322, "grad_norm": 0.5541604161262512, "learning_rate": 0.00037814030645389585, "loss": 0.5791, "step": 20260 }, { "epoch": 0.45116631054131057, "grad_norm": 0.4761522710323334, "learning_rate": 0.00037811910767836606, "loss": 0.5813, "step": 20270 }, { "epoch": 0.4513888888888889, "grad_norm": 0.5106942057609558, "learning_rate": 0.0003780978992236982, "loss": 0.5916, "step": 20280 }, { "epoch": 0.4516114672364672, "grad_norm": 0.5374470949172974, "learning_rate": 0.0003780766810910447, "loss": 0.686, "step": 20290 }, { "epoch": 0.4518340455840456, "grad_norm": 0.6303367018699646, "learning_rate": 0.0003780554532815586, "loss": 0.6568, "step": 20300 }, { "epoch": 0.45205662393162394, "grad_norm": 0.7026957273483276, "learning_rate": 0.0003780342157963933, "loss": 0.6484, "step": 20310 }, { "epoch": 0.45227920227920226, "grad_norm": 0.6824941635131836, "learning_rate": 0.00037801296863670307, "loss": 0.5139, "step": 20320 }, { "epoch": 0.45250178062678065, "grad_norm": 0.8238844275474548, "learning_rate": 0.00037799171180364233, "loss": 0.7948, "step": 20330 }, { "epoch": 0.452724358974359, "grad_norm": 0.5989433526992798, "learning_rate": 0.0003779704452983663, "loss": 0.6106, "step": 20340 }, { "epoch": 0.4529469373219373, "grad_norm": 0.7117564678192139, "learning_rate": 0.00037794916912203054, "loss": 0.6437, "step": 20350 }, { "epoch": 0.4531695156695157, "grad_norm": 1.0147078037261963, "learning_rate": 0.00037792788327579134, "loss": 0.7429, "step": 20360 }, { "epoch": 0.453392094017094, "grad_norm": 0.8484946489334106, "learning_rate": 0.0003779065877608052, "loss": 0.6691, "step": 20370 }, { "epoch": 0.45361467236467234, "grad_norm": 0.8865606188774109, "learning_rate": 0.0003778852825782295, "loss": 0.6478, "step": 20380 }, { "epoch": 0.45383725071225073, "grad_norm": 0.6782509088516235, "learning_rate": 0.0003778639677292219, "loss": 0.5474, "step": 20390 }, { "epoch": 0.45405982905982906, "grad_norm": 0.82992023229599, "learning_rate": 0.00037784264321494065, "loss": 0.6493, "step": 20400 }, { "epoch": 0.4542824074074074, "grad_norm": 0.6378486752510071, "learning_rate": 0.00037782130903654465, "loss": 0.5903, "step": 20410 }, { "epoch": 0.45450498575498577, "grad_norm": 0.561734676361084, "learning_rate": 0.00037779996519519314, "loss": 1.501, "step": 20420 }, { "epoch": 0.4547275641025641, "grad_norm": 0.5857129693031311, "learning_rate": 0.0003777786116920459, "loss": 0.756, "step": 20430 }, { "epoch": 0.4549501424501424, "grad_norm": 0.8134432435035706, "learning_rate": 0.00037775724852826345, "loss": 0.6615, "step": 20440 }, { "epoch": 0.4551727207977208, "grad_norm": 0.6562219858169556, "learning_rate": 0.00037773587570500653, "loss": 0.6501, "step": 20450 }, { "epoch": 0.45539529914529914, "grad_norm": 0.7692068219184875, "learning_rate": 0.00037771449322343667, "loss": 0.6392, "step": 20460 }, { "epoch": 0.45561787749287747, "grad_norm": 0.7109014391899109, "learning_rate": 0.00037769310108471576, "loss": 0.6475, "step": 20470 }, { "epoch": 0.45584045584045585, "grad_norm": 0.961484432220459, "learning_rate": 0.0003776716992900062, "loss": 0.6645, "step": 20480 }, { "epoch": 0.4560630341880342, "grad_norm": 0.5773506164550781, "learning_rate": 0.0003776502878404712, "loss": 0.6764, "step": 20490 }, { "epoch": 0.45628561253561256, "grad_norm": 0.8350430727005005, "learning_rate": 0.00037762886673727394, "loss": 0.5623, "step": 20500 }, { "epoch": 0.4565081908831909, "grad_norm": 0.6372119188308716, "learning_rate": 0.00037760743598157877, "loss": 0.7378, "step": 20510 }, { "epoch": 0.4567307692307692, "grad_norm": 0.31066614389419556, "learning_rate": 0.0003775859955745501, "loss": 0.5992, "step": 20520 }, { "epoch": 0.4569533475783476, "grad_norm": 0.5898124575614929, "learning_rate": 0.00037756454551735307, "loss": 0.7565, "step": 20530 }, { "epoch": 0.45717592592592593, "grad_norm": 0.5064905881881714, "learning_rate": 0.00037754308581115325, "loss": 0.7678, "step": 20540 }, { "epoch": 0.45739850427350426, "grad_norm": 2.300577402114868, "learning_rate": 0.00037752161645711676, "loss": 0.6805, "step": 20550 }, { "epoch": 0.45762108262108264, "grad_norm": 1.0232763290405273, "learning_rate": 0.0003775001374564104, "loss": 0.6008, "step": 20560 }, { "epoch": 0.45784366096866097, "grad_norm": 0.8136579394340515, "learning_rate": 0.0003774786488102012, "loss": 0.7021, "step": 20570 }, { "epoch": 0.4580662393162393, "grad_norm": 0.9268751740455627, "learning_rate": 0.000377457150519657, "loss": 0.7578, "step": 20580 }, { "epoch": 0.4582888176638177, "grad_norm": 1.078420639038086, "learning_rate": 0.0003774356425859459, "loss": 0.5974, "step": 20590 }, { "epoch": 0.458511396011396, "grad_norm": 0.8404714465141296, "learning_rate": 0.00037741412501023677, "loss": 0.6292, "step": 20600 }, { "epoch": 0.45873397435897434, "grad_norm": 0.708867073059082, "learning_rate": 0.00037739259779369876, "loss": 0.7109, "step": 20610 }, { "epoch": 0.4589565527065527, "grad_norm": 0.5947948098182678, "learning_rate": 0.0003773710609375019, "loss": 0.6681, "step": 20620 }, { "epoch": 0.45917913105413105, "grad_norm": 0.6741048693656921, "learning_rate": 0.0003773495144428163, "loss": 0.6094, "step": 20630 }, { "epoch": 0.4594017094017094, "grad_norm": 1.0347967147827148, "learning_rate": 0.000377327958310813, "loss": 0.6386, "step": 20640 }, { "epoch": 0.45962428774928776, "grad_norm": 1.3338631391525269, "learning_rate": 0.00037730639254266314, "loss": 0.7083, "step": 20650 }, { "epoch": 0.4598468660968661, "grad_norm": 0.883495032787323, "learning_rate": 0.0003772848171395388, "loss": 0.7422, "step": 20660 }, { "epoch": 0.4600694444444444, "grad_norm": 0.734618604183197, "learning_rate": 0.0003772632321026124, "loss": 0.5737, "step": 20670 }, { "epoch": 0.4602920227920228, "grad_norm": 0.8266713619232178, "learning_rate": 0.0003772416374330568, "loss": 0.8087, "step": 20680 }, { "epoch": 0.46051460113960113, "grad_norm": 0.7156257033348083, "learning_rate": 0.00037722003313204555, "loss": 0.7197, "step": 20690 }, { "epoch": 0.46073717948717946, "grad_norm": 0.5872949957847595, "learning_rate": 0.00037719841920075265, "loss": 0.5454, "step": 20700 }, { "epoch": 0.46095975783475784, "grad_norm": 0.6012685894966125, "learning_rate": 0.0003771767956403526, "loss": 0.7516, "step": 20710 }, { "epoch": 0.46118233618233617, "grad_norm": 0.5311578512191772, "learning_rate": 0.00037715516245202037, "loss": 0.5725, "step": 20720 }, { "epoch": 0.46140491452991456, "grad_norm": 0.6649594306945801, "learning_rate": 0.0003771335196369316, "loss": 0.7018, "step": 20730 }, { "epoch": 0.4616274928774929, "grad_norm": 0.8509739637374878, "learning_rate": 0.0003771118671962624, "loss": 0.7174, "step": 20740 }, { "epoch": 0.4618500712250712, "grad_norm": 0.6986957788467407, "learning_rate": 0.00037709020513118933, "loss": 0.6416, "step": 20750 }, { "epoch": 0.4620726495726496, "grad_norm": 0.48745501041412354, "learning_rate": 0.00037706853344288957, "loss": 0.6395, "step": 20760 }, { "epoch": 0.4622952279202279, "grad_norm": 0.7349006533622742, "learning_rate": 0.0003770468521325407, "loss": 0.6701, "step": 20770 }, { "epoch": 0.46251780626780625, "grad_norm": 0.9265018105506897, "learning_rate": 0.0003770251612013209, "loss": 0.7558, "step": 20780 }, { "epoch": 0.46274038461538464, "grad_norm": 0.7990279197692871, "learning_rate": 0.00037700346065040903, "loss": 0.7709, "step": 20790 }, { "epoch": 0.46296296296296297, "grad_norm": 0.7279727458953857, "learning_rate": 0.0003769817504809842, "loss": 0.7482, "step": 20800 }, { "epoch": 0.4631855413105413, "grad_norm": 0.5113296508789062, "learning_rate": 0.0003769600306942261, "loss": 0.6341, "step": 20810 }, { "epoch": 0.4634081196581197, "grad_norm": 0.6591224074363708, "learning_rate": 0.0003769383012913151, "loss": 0.5927, "step": 20820 }, { "epoch": 0.463630698005698, "grad_norm": 0.9548632502555847, "learning_rate": 0.00037691656227343195, "loss": 0.6021, "step": 20830 }, { "epoch": 0.46385327635327633, "grad_norm": 0.8181374669075012, "learning_rate": 0.000376894813641758, "loss": 0.6352, "step": 20840 }, { "epoch": 0.4640758547008547, "grad_norm": 0.7255536317825317, "learning_rate": 0.00037687305539747497, "loss": 0.5023, "step": 20850 }, { "epoch": 0.46429843304843305, "grad_norm": 0.8241360783576965, "learning_rate": 0.00037685128754176545, "loss": 0.588, "step": 20860 }, { "epoch": 0.4645210113960114, "grad_norm": 0.7465112805366516, "learning_rate": 0.00037682951007581207, "loss": 0.698, "step": 20870 }, { "epoch": 0.46474358974358976, "grad_norm": 0.5946525931358337, "learning_rate": 0.00037680772300079844, "loss": 0.5726, "step": 20880 }, { "epoch": 0.4649661680911681, "grad_norm": 0.7747005820274353, "learning_rate": 0.00037678592631790837, "loss": 0.706, "step": 20890 }, { "epoch": 0.4651887464387464, "grad_norm": 1.039262056350708, "learning_rate": 0.00037676412002832633, "loss": 0.6516, "step": 20900 }, { "epoch": 0.4654113247863248, "grad_norm": 0.7734355330467224, "learning_rate": 0.0003767423041332373, "loss": 0.581, "step": 20910 }, { "epoch": 0.4656339031339031, "grad_norm": 0.8960153460502625, "learning_rate": 0.0003767204786338268, "loss": 0.6354, "step": 20920 }, { "epoch": 0.46585648148148145, "grad_norm": 0.6276922225952148, "learning_rate": 0.0003766986435312808, "loss": 0.6693, "step": 20930 }, { "epoch": 0.46607905982905984, "grad_norm": 0.7944068908691406, "learning_rate": 0.00037667679882678586, "loss": 0.6442, "step": 20940 }, { "epoch": 0.46630163817663817, "grad_norm": 0.7230113744735718, "learning_rate": 0.000376654944521529, "loss": 0.6328, "step": 20950 }, { "epoch": 0.46652421652421655, "grad_norm": 0.5475065112113953, "learning_rate": 0.0003766330806166979, "loss": 0.6933, "step": 20960 }, { "epoch": 0.4667467948717949, "grad_norm": 0.6445038318634033, "learning_rate": 0.00037661120711348056, "loss": 0.6437, "step": 20970 }, { "epoch": 0.4669693732193732, "grad_norm": 0.6793280839920044, "learning_rate": 0.0003765893240130657, "loss": 0.6109, "step": 20980 }, { "epoch": 0.4671919515669516, "grad_norm": 0.7854118347167969, "learning_rate": 0.00037656743131664236, "loss": 0.576, "step": 20990 }, { "epoch": 0.4674145299145299, "grad_norm": 0.6892343759536743, "learning_rate": 0.00037654552902540025, "loss": 0.7005, "step": 21000 }, { "epoch": 0.46763710826210825, "grad_norm": 0.7508336901664734, "learning_rate": 0.0003765236171405296, "loss": 0.5504, "step": 21010 }, { "epoch": 0.46785968660968663, "grad_norm": 0.4559895098209381, "learning_rate": 0.0003765016956632211, "loss": 0.4633, "step": 21020 }, { "epoch": 0.46808226495726496, "grad_norm": 0.7084289789199829, "learning_rate": 0.00037647976459466594, "loss": 0.7449, "step": 21030 }, { "epoch": 0.4683048433048433, "grad_norm": 0.720272958278656, "learning_rate": 0.0003764578239360559, "loss": 0.5805, "step": 21040 }, { "epoch": 0.46852742165242167, "grad_norm": 0.8112165331840515, "learning_rate": 0.00037643587368858323, "loss": 0.5505, "step": 21050 }, { "epoch": 0.46875, "grad_norm": 0.4571852385997772, "learning_rate": 0.00037641391385344076, "loss": 0.6441, "step": 21060 }, { "epoch": 0.46897257834757833, "grad_norm": 0.8824451565742493, "learning_rate": 0.0003763919444318218, "loss": 0.615, "step": 21070 }, { "epoch": 0.4691951566951567, "grad_norm": 1.367150068283081, "learning_rate": 0.0003763699654249202, "loss": 0.5973, "step": 21080 }, { "epoch": 0.46941773504273504, "grad_norm": 0.9916679263114929, "learning_rate": 0.0003763479768339303, "loss": 0.7755, "step": 21090 }, { "epoch": 0.46964031339031337, "grad_norm": 0.6311511397361755, "learning_rate": 0.0003763259786600469, "loss": 0.6311, "step": 21100 }, { "epoch": 0.46986289173789175, "grad_norm": 1.2279185056686401, "learning_rate": 0.0003763039709044655, "loss": 0.6385, "step": 21110 }, { "epoch": 0.4700854700854701, "grad_norm": 0.7499618530273438, "learning_rate": 0.00037628195356838204, "loss": 0.5764, "step": 21120 }, { "epoch": 0.4703080484330484, "grad_norm": 0.7156887054443359, "learning_rate": 0.0003762599266529929, "loss": 0.56, "step": 21130 }, { "epoch": 0.4705306267806268, "grad_norm": 0.7908844351768494, "learning_rate": 0.000376237890159495, "loss": 0.7162, "step": 21140 }, { "epoch": 0.4707532051282051, "grad_norm": 0.8906026482582092, "learning_rate": 0.00037621584408908596, "loss": 0.6661, "step": 21150 }, { "epoch": 0.47097578347578345, "grad_norm": 0.7481372356414795, "learning_rate": 0.0003761937884429636, "loss": 0.6073, "step": 21160 }, { "epoch": 0.47119836182336183, "grad_norm": 0.6544264554977417, "learning_rate": 0.0003761717232223266, "loss": 0.6206, "step": 21170 }, { "epoch": 0.47142094017094016, "grad_norm": 1.5991239547729492, "learning_rate": 0.0003761496484283739, "loss": 0.6163, "step": 21180 }, { "epoch": 0.47164351851851855, "grad_norm": 0.9703873991966248, "learning_rate": 0.00037612756406230514, "loss": 0.6135, "step": 21190 }, { "epoch": 0.4718660968660969, "grad_norm": 0.8297857046127319, "learning_rate": 0.0003761054701253204, "loss": 0.558, "step": 21200 }, { "epoch": 0.4720886752136752, "grad_norm": 0.471038281917572, "learning_rate": 0.00037608336661862016, "loss": 0.5461, "step": 21210 }, { "epoch": 0.4723112535612536, "grad_norm": 0.362981915473938, "learning_rate": 0.00037606125354340563, "loss": 0.543, "step": 21220 }, { "epoch": 0.4725338319088319, "grad_norm": 0.7584346532821655, "learning_rate": 0.0003760391309008785, "loss": 0.6135, "step": 21230 }, { "epoch": 0.47275641025641024, "grad_norm": 0.6571023464202881, "learning_rate": 0.0003760169986922409, "loss": 0.6293, "step": 21240 }, { "epoch": 0.4729789886039886, "grad_norm": 0.5312460660934448, "learning_rate": 0.00037599485691869544, "loss": 0.665, "step": 21250 }, { "epoch": 0.47320156695156695, "grad_norm": 0.767343282699585, "learning_rate": 0.00037597270558144545, "loss": 0.5404, "step": 21260 }, { "epoch": 0.4734241452991453, "grad_norm": 1.1337848901748657, "learning_rate": 0.00037595054468169455, "loss": 0.615, "step": 21270 }, { "epoch": 0.47364672364672367, "grad_norm": 0.6291069388389587, "learning_rate": 0.00037592837422064697, "loss": 0.6336, "step": 21280 }, { "epoch": 0.473869301994302, "grad_norm": 0.9171074032783508, "learning_rate": 0.0003759061941995075, "loss": 0.614, "step": 21290 }, { "epoch": 0.4740918803418803, "grad_norm": 0.6373035907745361, "learning_rate": 0.0003758840046194815, "loss": 0.678, "step": 21300 }, { "epoch": 0.4743144586894587, "grad_norm": 0.6766063570976257, "learning_rate": 0.00037586180548177466, "loss": 0.7444, "step": 21310 }, { "epoch": 0.47453703703703703, "grad_norm": 0.6747170090675354, "learning_rate": 0.00037583959678759335, "loss": 0.706, "step": 21320 }, { "epoch": 0.47475961538461536, "grad_norm": 0.6572831273078918, "learning_rate": 0.0003758173785381445, "loss": 0.6419, "step": 21330 }, { "epoch": 0.47498219373219375, "grad_norm": 0.773108720779419, "learning_rate": 0.0003757951507346352, "loss": 0.6609, "step": 21340 }, { "epoch": 0.4752047720797721, "grad_norm": 0.5571287274360657, "learning_rate": 0.0003757729133782736, "loss": 0.4853, "step": 21350 }, { "epoch": 0.4754273504273504, "grad_norm": 0.6754597425460815, "learning_rate": 0.000375750666470268, "loss": 0.6991, "step": 21360 }, { "epoch": 0.4756499287749288, "grad_norm": 0.7020809650421143, "learning_rate": 0.00037572841001182726, "loss": 0.8095, "step": 21370 }, { "epoch": 0.4758725071225071, "grad_norm": 0.7230724692344666, "learning_rate": 0.0003757061440041609, "loss": 0.7263, "step": 21380 }, { "epoch": 0.47609508547008544, "grad_norm": 0.7014229893684387, "learning_rate": 0.00037568386844847885, "loss": 0.6969, "step": 21390 }, { "epoch": 0.47631766381766383, "grad_norm": 0.5396464467048645, "learning_rate": 0.0003756615833459915, "loss": 0.6536, "step": 21400 }, { "epoch": 0.47654024216524216, "grad_norm": 0.8703533411026001, "learning_rate": 0.0003756392886979099, "loss": 0.656, "step": 21410 }, { "epoch": 0.47676282051282054, "grad_norm": 0.8160148859024048, "learning_rate": 0.00037561698450544565, "loss": 0.7003, "step": 21420 }, { "epoch": 0.47698539886039887, "grad_norm": 0.8830581307411194, "learning_rate": 0.0003755946707698106, "loss": 0.656, "step": 21430 }, { "epoch": 0.4772079772079772, "grad_norm": 0.7755318880081177, "learning_rate": 0.0003755723474922175, "loss": 0.5179, "step": 21440 }, { "epoch": 0.4774305555555556, "grad_norm": 0.8805164694786072, "learning_rate": 0.00037555001467387924, "loss": 0.6699, "step": 21450 }, { "epoch": 0.4776531339031339, "grad_norm": 0.4396213889122009, "learning_rate": 0.0003755276723160095, "loss": 0.4509, "step": 21460 }, { "epoch": 0.47787571225071224, "grad_norm": 0.5609117746353149, "learning_rate": 0.00037550532041982234, "loss": 0.5504, "step": 21470 }, { "epoch": 0.4780982905982906, "grad_norm": 0.6986370086669922, "learning_rate": 0.0003754829589865324, "loss": 0.5165, "step": 21480 }, { "epoch": 0.47832086894586895, "grad_norm": 0.6412321925163269, "learning_rate": 0.0003754605880173548, "loss": 0.6239, "step": 21490 }, { "epoch": 0.4785434472934473, "grad_norm": 0.6213876008987427, "learning_rate": 0.0003754382075135052, "loss": 0.4858, "step": 21500 }, { "epoch": 0.47876602564102566, "grad_norm": 0.400880366563797, "learning_rate": 0.0003754158174761998, "loss": 0.5655, "step": 21510 }, { "epoch": 0.478988603988604, "grad_norm": 0.707527756690979, "learning_rate": 0.0003753934179066552, "loss": 0.7367, "step": 21520 }, { "epoch": 0.4792111823361823, "grad_norm": 0.5736559629440308, "learning_rate": 0.00037537100880608883, "loss": 0.6368, "step": 21530 }, { "epoch": 0.4794337606837607, "grad_norm": 0.7527413368225098, "learning_rate": 0.00037534859017571815, "loss": 0.6444, "step": 21540 }, { "epoch": 0.47965633903133903, "grad_norm": 0.739428699016571, "learning_rate": 0.00037532616201676165, "loss": 0.6678, "step": 21550 }, { "epoch": 0.47987891737891736, "grad_norm": 0.4493646025657654, "learning_rate": 0.00037530372433043787, "loss": 0.5731, "step": 21560 }, { "epoch": 0.48005698005698005, "eval_loss": 0.6317010521888733, "eval_runtime": 337.4466, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 21568 }, { "epoch": 0.48010149572649574, "grad_norm": 0.8192841410636902, "learning_rate": 0.00037528127711796626, "loss": 0.627, "step": 21570 }, { "epoch": 0.48032407407407407, "grad_norm": 0.7585240006446838, "learning_rate": 0.00037525882038056654, "loss": 0.6677, "step": 21580 }, { "epoch": 0.4805466524216524, "grad_norm": 0.7832114696502686, "learning_rate": 0.00037523635411945905, "loss": 0.5631, "step": 21590 }, { "epoch": 0.4807692307692308, "grad_norm": 0.7623310685157776, "learning_rate": 0.0003752138783358646, "loss": 0.5473, "step": 21600 }, { "epoch": 0.4809918091168091, "grad_norm": 0.7951676249504089, "learning_rate": 0.0003751913930310046, "loss": 0.6501, "step": 21610 }, { "epoch": 0.48121438746438744, "grad_norm": 0.6903308033943176, "learning_rate": 0.00037516889820610086, "loss": 0.7276, "step": 21620 }, { "epoch": 0.4814369658119658, "grad_norm": 0.815481960773468, "learning_rate": 0.00037514639386237585, "loss": 0.6222, "step": 21630 }, { "epoch": 0.48165954415954415, "grad_norm": 0.6189022064208984, "learning_rate": 0.0003751238800010523, "loss": 0.6608, "step": 21640 }, { "epoch": 0.48188212250712253, "grad_norm": 0.8595947027206421, "learning_rate": 0.0003751013566233538, "loss": 0.6706, "step": 21650 }, { "epoch": 0.48210470085470086, "grad_norm": 0.47164031863212585, "learning_rate": 0.0003750788237305043, "loss": 0.6553, "step": 21660 }, { "epoch": 0.4823272792022792, "grad_norm": 0.5663688778877258, "learning_rate": 0.00037505628132372817, "loss": 0.4919, "step": 21670 }, { "epoch": 0.4825498575498576, "grad_norm": 0.840472936630249, "learning_rate": 0.00037503372940425036, "loss": 0.6202, "step": 21680 }, { "epoch": 0.4827724358974359, "grad_norm": 0.8134217262268066, "learning_rate": 0.00037501116797329637, "loss": 0.7261, "step": 21690 }, { "epoch": 0.48299501424501423, "grad_norm": 0.6884470582008362, "learning_rate": 0.0003749885970320923, "loss": 0.5365, "step": 21700 }, { "epoch": 0.4832175925925926, "grad_norm": 0.7177421450614929, "learning_rate": 0.00037496601658186464, "loss": 0.6229, "step": 21710 }, { "epoch": 0.48344017094017094, "grad_norm": 0.7939632534980774, "learning_rate": 0.0003749434266238404, "loss": 0.5793, "step": 21720 }, { "epoch": 0.48366274928774927, "grad_norm": 0.5567938089370728, "learning_rate": 0.00037492082715924707, "loss": 0.5883, "step": 21730 }, { "epoch": 0.48388532763532766, "grad_norm": 0.47863462567329407, "learning_rate": 0.0003748982181893129, "loss": 0.5821, "step": 21740 }, { "epoch": 0.484107905982906, "grad_norm": 0.6115350723266602, "learning_rate": 0.0003748755997152663, "loss": 0.5837, "step": 21750 }, { "epoch": 0.4843304843304843, "grad_norm": 0.915977418422699, "learning_rate": 0.0003748529717383365, "loss": 0.5941, "step": 21760 }, { "epoch": 0.4845530626780627, "grad_norm": 0.7119535803794861, "learning_rate": 0.000374830334259753, "loss": 0.5648, "step": 21770 }, { "epoch": 0.484775641025641, "grad_norm": 1.0399959087371826, "learning_rate": 0.00037480768728074605, "loss": 0.6491, "step": 21780 }, { "epoch": 0.48499821937321935, "grad_norm": 0.5946542620658875, "learning_rate": 0.00037478503080254626, "loss": 0.5447, "step": 21790 }, { "epoch": 0.48522079772079774, "grad_norm": 0.6002269387245178, "learning_rate": 0.00037476236482638487, "loss": 0.5436, "step": 21800 }, { "epoch": 0.48544337606837606, "grad_norm": 0.6677395701408386, "learning_rate": 0.0003747396893534934, "loss": 0.5574, "step": 21810 }, { "epoch": 0.4856659544159544, "grad_norm": 0.7505663633346558, "learning_rate": 0.00037471700438510426, "loss": 0.5188, "step": 21820 }, { "epoch": 0.4858885327635328, "grad_norm": 0.860384464263916, "learning_rate": 0.0003746943099224501, "loss": 1.6389, "step": 21830 }, { "epoch": 0.4861111111111111, "grad_norm": 0.6575486660003662, "learning_rate": 0.000374671605966764, "loss": 0.6875, "step": 21840 }, { "epoch": 0.48633368945868943, "grad_norm": 0.48352324962615967, "learning_rate": 0.00037464889251927994, "loss": 0.5916, "step": 21850 }, { "epoch": 0.4865562678062678, "grad_norm": 1.2798899412155151, "learning_rate": 0.00037462616958123206, "loss": 0.6379, "step": 21860 }, { "epoch": 0.48677884615384615, "grad_norm": 0.8500372767448425, "learning_rate": 0.0003746034371538551, "loss": 0.6326, "step": 21870 }, { "epoch": 0.48700142450142453, "grad_norm": 0.6251464486122131, "learning_rate": 0.0003745806952383845, "loss": 0.5766, "step": 21880 }, { "epoch": 0.48722400284900286, "grad_norm": 0.6444466710090637, "learning_rate": 0.00037455794383605605, "loss": 0.7005, "step": 21890 }, { "epoch": 0.4874465811965812, "grad_norm": 0.8004457950592041, "learning_rate": 0.00037453518294810594, "loss": 0.6974, "step": 21900 }, { "epoch": 0.48766915954415957, "grad_norm": 0.5639207363128662, "learning_rate": 0.00037451241257577115, "loss": 0.6421, "step": 21910 }, { "epoch": 0.4878917378917379, "grad_norm": 0.6112554669380188, "learning_rate": 0.00037448963272028896, "loss": 0.5886, "step": 21920 }, { "epoch": 0.4881143162393162, "grad_norm": 0.5895763635635376, "learning_rate": 0.0003744668433828974, "loss": 0.6137, "step": 21930 }, { "epoch": 0.4883368945868946, "grad_norm": 0.8514500260353088, "learning_rate": 0.0003744440445648346, "loss": 0.5455, "step": 21940 }, { "epoch": 0.48855947293447294, "grad_norm": 0.7488554120063782, "learning_rate": 0.0003744212362673396, "loss": 0.6121, "step": 21950 }, { "epoch": 0.48878205128205127, "grad_norm": 0.49900588393211365, "learning_rate": 0.000374398418491652, "loss": 0.6145, "step": 21960 }, { "epoch": 0.48900462962962965, "grad_norm": 0.7089706659317017, "learning_rate": 0.00037437559123901145, "loss": 0.5018, "step": 21970 }, { "epoch": 0.489227207977208, "grad_norm": 0.6289607882499695, "learning_rate": 0.00037435275451065854, "loss": 0.6118, "step": 21980 }, { "epoch": 0.4894497863247863, "grad_norm": 0.8315375447273254, "learning_rate": 0.00037432990830783423, "loss": 0.7574, "step": 21990 }, { "epoch": 0.4896723646723647, "grad_norm": 0.8315839171409607, "learning_rate": 0.00037430705263177995, "loss": 0.4651, "step": 22000 }, { "epoch": 0.489894943019943, "grad_norm": 0.6891462802886963, "learning_rate": 0.0003742841874837378, "loss": 0.6585, "step": 22010 }, { "epoch": 0.49011752136752135, "grad_norm": 1.2439353466033936, "learning_rate": 0.0003742613128649502, "loss": 0.7067, "step": 22020 }, { "epoch": 0.49034009971509973, "grad_norm": 0.7287011742591858, "learning_rate": 0.00037423842877666016, "loss": 0.6761, "step": 22030 }, { "epoch": 0.49056267806267806, "grad_norm": 0.6492077708244324, "learning_rate": 0.00037421553522011135, "loss": 0.5644, "step": 22040 }, { "epoch": 0.4907852564102564, "grad_norm": 0.6765930652618408, "learning_rate": 0.00037419263219654763, "loss": 0.5853, "step": 22050 }, { "epoch": 0.49100783475783477, "grad_norm": 0.5932199358940125, "learning_rate": 0.0003741697197072138, "loss": 0.5416, "step": 22060 }, { "epoch": 0.4912304131054131, "grad_norm": 0.9352262020111084, "learning_rate": 0.0003741467977533547, "loss": 0.771, "step": 22070 }, { "epoch": 0.49145299145299143, "grad_norm": 0.56147700548172, "learning_rate": 0.0003741238663362161, "loss": 0.6866, "step": 22080 }, { "epoch": 0.4916755698005698, "grad_norm": 0.7923839092254639, "learning_rate": 0.00037410092545704405, "loss": 0.746, "step": 22090 }, { "epoch": 0.49189814814814814, "grad_norm": 1.0098870992660522, "learning_rate": 0.00037407797511708517, "loss": 0.5563, "step": 22100 }, { "epoch": 0.49212072649572647, "grad_norm": 0.8181535005569458, "learning_rate": 0.00037405501531758665, "loss": 0.6467, "step": 22110 }, { "epoch": 0.49234330484330485, "grad_norm": 0.8707475066184998, "learning_rate": 0.0003740320460597961, "loss": 0.676, "step": 22120 }, { "epoch": 0.4925658831908832, "grad_norm": 0.7622846961021423, "learning_rate": 0.0003740090673449617, "loss": 0.5734, "step": 22130 }, { "epoch": 0.49278846153846156, "grad_norm": 0.6305636763572693, "learning_rate": 0.0003739860791743321, "loss": 0.5678, "step": 22140 }, { "epoch": 0.4930110398860399, "grad_norm": 0.5063610672950745, "learning_rate": 0.0003739630815491566, "loss": 0.419, "step": 22150 }, { "epoch": 0.4932336182336182, "grad_norm": 0.8419811725616455, "learning_rate": 0.0003739400744706848, "loss": 0.7332, "step": 22160 }, { "epoch": 0.4934561965811966, "grad_norm": 0.6084222197532654, "learning_rate": 0.0003739170579401669, "loss": 0.5931, "step": 22170 }, { "epoch": 0.49367877492877493, "grad_norm": 0.8849558234214783, "learning_rate": 0.00037389403195885374, "loss": 0.6365, "step": 22180 }, { "epoch": 0.49390135327635326, "grad_norm": 0.8881486058235168, "learning_rate": 0.00037387099652799657, "loss": 0.7062, "step": 22190 }, { "epoch": 0.49412393162393164, "grad_norm": 0.7176637649536133, "learning_rate": 0.0003738479516488471, "loss": 0.6893, "step": 22200 }, { "epoch": 0.49434650997151, "grad_norm": 0.6873642206192017, "learning_rate": 0.00037382489732265756, "loss": 0.7328, "step": 22210 }, { "epoch": 0.4945690883190883, "grad_norm": 0.5525507926940918, "learning_rate": 0.00037380183355068084, "loss": 0.5907, "step": 22220 }, { "epoch": 0.4947916666666667, "grad_norm": 0.7779015898704529, "learning_rate": 0.00037377876033417015, "loss": 0.6603, "step": 22230 }, { "epoch": 0.495014245014245, "grad_norm": 0.9474874138832092, "learning_rate": 0.0003737556776743794, "loss": 0.7767, "step": 22240 }, { "epoch": 0.49523682336182334, "grad_norm": 0.9156774878501892, "learning_rate": 0.0003737325855725629, "loss": 0.5673, "step": 22250 }, { "epoch": 0.4954594017094017, "grad_norm": 0.9191159605979919, "learning_rate": 0.00037370948402997545, "loss": 0.7124, "step": 22260 }, { "epoch": 0.49568198005698005, "grad_norm": 0.9254758358001709, "learning_rate": 0.00037368637304787246, "loss": 0.635, "step": 22270 }, { "epoch": 0.4959045584045584, "grad_norm": 0.8832774758338928, "learning_rate": 0.0003736632526275098, "loss": 0.5771, "step": 22280 }, { "epoch": 0.49612713675213677, "grad_norm": 0.8039012551307678, "learning_rate": 0.0003736401227701437, "loss": 0.657, "step": 22290 }, { "epoch": 0.4963497150997151, "grad_norm": 0.8206833600997925, "learning_rate": 0.00037361698347703127, "loss": 0.5793, "step": 22300 }, { "epoch": 0.4965722934472934, "grad_norm": 0.7622146010398865, "learning_rate": 0.0003735938347494298, "loss": 0.5886, "step": 22310 }, { "epoch": 0.4967948717948718, "grad_norm": 0.3747854232788086, "learning_rate": 0.0003735706765885973, "loss": 0.7687, "step": 22320 }, { "epoch": 0.49701745014245013, "grad_norm": 0.8601908087730408, "learning_rate": 0.00037354750899579214, "loss": 0.7588, "step": 22330 }, { "epoch": 0.49724002849002846, "grad_norm": 0.4530029594898224, "learning_rate": 0.00037352433197227315, "loss": 0.7051, "step": 22340 }, { "epoch": 0.49746260683760685, "grad_norm": 0.8396738171577454, "learning_rate": 0.00037350114551930005, "loss": 0.738, "step": 22350 }, { "epoch": 0.4976851851851852, "grad_norm": 0.6398061513900757, "learning_rate": 0.0003734779496381326, "loss": 0.6236, "step": 22360 }, { "epoch": 0.49790776353276356, "grad_norm": 0.7329898476600647, "learning_rate": 0.0003734547443300313, "loss": 0.7076, "step": 22370 }, { "epoch": 0.4981303418803419, "grad_norm": 0.8609546422958374, "learning_rate": 0.0003734315295962573, "loss": 0.6196, "step": 22380 }, { "epoch": 0.4983529202279202, "grad_norm": 0.746404230594635, "learning_rate": 0.00037340830543807196, "loss": 0.6858, "step": 22390 }, { "epoch": 0.4985754985754986, "grad_norm": 0.4643799066543579, "learning_rate": 0.0003733850718567373, "loss": 0.583, "step": 22400 }, { "epoch": 0.4987980769230769, "grad_norm": 0.7553014755249023, "learning_rate": 0.00037336182885351594, "loss": 0.661, "step": 22410 }, { "epoch": 0.49902065527065526, "grad_norm": 0.5482897758483887, "learning_rate": 0.0003733385764296709, "loss": 0.7295, "step": 22420 }, { "epoch": 0.49924323361823364, "grad_norm": 0.5720972418785095, "learning_rate": 0.0003733153145864657, "loss": 0.5406, "step": 22430 }, { "epoch": 0.49946581196581197, "grad_norm": 0.8809942007064819, "learning_rate": 0.0003732920433251644, "loss": 0.8006, "step": 22440 }, { "epoch": 0.4996883903133903, "grad_norm": 0.5236983895301819, "learning_rate": 0.00037326876264703163, "loss": 0.5946, "step": 22450 }, { "epoch": 0.4999109686609687, "grad_norm": 0.8043753504753113, "learning_rate": 0.0003732454725533324, "loss": 0.5971, "step": 22460 }, { "epoch": 0.5001335470085471, "grad_norm": 0.8286345601081848, "learning_rate": 0.00037322217304533244, "loss": 0.6566, "step": 22470 }, { "epoch": 0.5003561253561254, "grad_norm": 0.7317217588424683, "learning_rate": 0.00037319886412429777, "loss": 0.6861, "step": 22480 }, { "epoch": 0.5005787037037037, "grad_norm": 0.9986227750778198, "learning_rate": 0.000373175545791495, "loss": 0.8038, "step": 22490 }, { "epoch": 0.500801282051282, "grad_norm": 0.5715426802635193, "learning_rate": 0.00037315221804819134, "loss": 0.6179, "step": 22500 }, { "epoch": 0.5010238603988604, "grad_norm": 0.7587363719940186, "learning_rate": 0.0003731288808956544, "loss": 0.5958, "step": 22510 }, { "epoch": 0.5012464387464387, "grad_norm": 0.5153867602348328, "learning_rate": 0.0003731055343351523, "loss": 0.607, "step": 22520 }, { "epoch": 0.5014690170940171, "grad_norm": 0.71803879737854, "learning_rate": 0.0003730821783679538, "loss": 0.6498, "step": 22530 }, { "epoch": 0.5016915954415955, "grad_norm": 0.8072484731674194, "learning_rate": 0.000373058812995328, "loss": 0.6912, "step": 22540 }, { "epoch": 0.5019141737891738, "grad_norm": 0.9070923328399658, "learning_rate": 0.0003730354382185447, "loss": 0.6367, "step": 22550 }, { "epoch": 0.5021367521367521, "grad_norm": 0.6757694482803345, "learning_rate": 0.00037301205403887395, "loss": 0.6196, "step": 22560 }, { "epoch": 0.5023593304843305, "grad_norm": 0.6036407947540283, "learning_rate": 0.00037298866045758656, "loss": 0.6573, "step": 22570 }, { "epoch": 0.5025819088319088, "grad_norm": 0.8025512099266052, "learning_rate": 0.0003729652574759538, "loss": 0.6912, "step": 22580 }, { "epoch": 0.5028044871794872, "grad_norm": 0.9629495143890381, "learning_rate": 0.0003729418450952473, "loss": 0.6471, "step": 22590 }, { "epoch": 0.5030270655270656, "grad_norm": 0.7720807194709778, "learning_rate": 0.00037291842331673943, "loss": 0.6282, "step": 22600 }, { "epoch": 0.5032496438746439, "grad_norm": 1.0801931619644165, "learning_rate": 0.0003728949921417028, "loss": 0.8203, "step": 22610 }, { "epoch": 0.5034722222222222, "grad_norm": 0.6417792439460754, "learning_rate": 0.0003728715515714108, "loss": 0.6012, "step": 22620 }, { "epoch": 0.5036948005698005, "grad_norm": 0.8479206562042236, "learning_rate": 0.00037284810160713715, "loss": 0.5524, "step": 22630 }, { "epoch": 0.5039173789173789, "grad_norm": 1.038094401359558, "learning_rate": 0.00037282464225015617, "loss": 0.7643, "step": 22640 }, { "epoch": 0.5041399572649573, "grad_norm": 0.48345646262168884, "learning_rate": 0.0003728011735017427, "loss": 0.5751, "step": 22650 }, { "epoch": 0.5043625356125356, "grad_norm": 0.5323746204376221, "learning_rate": 0.0003727776953631719, "loss": 0.6605, "step": 22660 }, { "epoch": 0.504585113960114, "grad_norm": 0.6944118142127991, "learning_rate": 0.0003727542078357197, "loss": 0.6518, "step": 22670 }, { "epoch": 0.5048076923076923, "grad_norm": 0.6447984576225281, "learning_rate": 0.0003727307109206625, "loss": 0.5313, "step": 22680 }, { "epoch": 0.5050302706552706, "grad_norm": 0.7850675582885742, "learning_rate": 0.00037270720461927704, "loss": 0.6458, "step": 22690 }, { "epoch": 0.5052528490028491, "grad_norm": 0.8559784889221191, "learning_rate": 0.0003726836889328407, "loss": 0.6545, "step": 22700 }, { "epoch": 0.5054754273504274, "grad_norm": 0.54237961769104, "learning_rate": 0.00037266016386263123, "loss": 0.5886, "step": 22710 }, { "epoch": 0.5056980056980057, "grad_norm": 0.6112391948699951, "learning_rate": 0.00037263662940992725, "loss": 0.6634, "step": 22720 }, { "epoch": 0.505920584045584, "grad_norm": 0.7948022484779358, "learning_rate": 0.0003726130855760074, "loss": 0.7228, "step": 22730 }, { "epoch": 0.5061431623931624, "grad_norm": 0.5578857660293579, "learning_rate": 0.00037258953236215126, "loss": 0.5033, "step": 22740 }, { "epoch": 0.5063657407407407, "grad_norm": 0.6403529047966003, "learning_rate": 0.00037256596976963866, "loss": 0.6009, "step": 22750 }, { "epoch": 0.5065883190883191, "grad_norm": 0.7904924154281616, "learning_rate": 0.0003725423977997499, "loss": 0.5801, "step": 22760 }, { "epoch": 0.5068108974358975, "grad_norm": 0.5248593091964722, "learning_rate": 0.00037251881645376605, "loss": 0.5125, "step": 22770 }, { "epoch": 0.5070334757834758, "grad_norm": 0.6650285720825195, "learning_rate": 0.0003724952257329684, "loss": 0.6307, "step": 22780 }, { "epoch": 0.5072560541310541, "grad_norm": 0.8868667483329773, "learning_rate": 0.00037247162563863907, "loss": 0.7645, "step": 22790 }, { "epoch": 0.5074786324786325, "grad_norm": 0.8926972150802612, "learning_rate": 0.0003724480161720604, "loss": 0.6528, "step": 22800 }, { "epoch": 0.5077012108262108, "grad_norm": 0.6713107824325562, "learning_rate": 0.00037242439733451533, "loss": 0.5721, "step": 22810 }, { "epoch": 0.5079237891737892, "grad_norm": 0.9303070306777954, "learning_rate": 0.00037240076912728736, "loss": 0.664, "step": 22820 }, { "epoch": 0.5081463675213675, "grad_norm": 0.9031490683555603, "learning_rate": 0.0003723771315516605, "loss": 0.7006, "step": 22830 }, { "epoch": 0.5083689458689459, "grad_norm": 0.5676231980323792, "learning_rate": 0.00037235348460891915, "loss": 0.6106, "step": 22840 }, { "epoch": 0.5085915242165242, "grad_norm": 0.6862084865570068, "learning_rate": 0.00037232982830034836, "loss": 0.6386, "step": 22850 }, { "epoch": 0.5088141025641025, "grad_norm": 0.5832529664039612, "learning_rate": 0.00037230616262723366, "loss": 0.5575, "step": 22860 }, { "epoch": 0.5090366809116809, "grad_norm": 0.4869333803653717, "learning_rate": 0.00037228248759086095, "loss": 0.6385, "step": 22870 }, { "epoch": 0.5092592592592593, "grad_norm": 0.8570073246955872, "learning_rate": 0.0003722588031925169, "loss": 0.6212, "step": 22880 }, { "epoch": 0.5094818376068376, "grad_norm": 0.7280600666999817, "learning_rate": 0.0003722351094334884, "loss": 0.5418, "step": 22890 }, { "epoch": 0.509704415954416, "grad_norm": 0.8902294635772705, "learning_rate": 0.0003722114063150631, "loss": 0.5825, "step": 22900 }, { "epoch": 0.5099269943019943, "grad_norm": 0.9598889946937561, "learning_rate": 0.00037218769383852906, "loss": 0.528, "step": 22910 }, { "epoch": 0.5101495726495726, "grad_norm": 0.6155282258987427, "learning_rate": 0.00037216397200517465, "loss": 0.5101, "step": 22920 }, { "epoch": 0.5103721509971509, "grad_norm": 0.604421854019165, "learning_rate": 0.00037214024081628914, "loss": 0.558, "step": 22930 }, { "epoch": 0.5105947293447294, "grad_norm": 0.8476591110229492, "learning_rate": 0.000372116500273162, "loss": 0.6325, "step": 22940 }, { "epoch": 0.5108173076923077, "grad_norm": 0.6499559283256531, "learning_rate": 0.00037209275037708336, "loss": 0.7019, "step": 22950 }, { "epoch": 0.511039886039886, "grad_norm": 0.6757542490959167, "learning_rate": 0.0003720689911293437, "loss": 0.6507, "step": 22960 }, { "epoch": 0.5112624643874644, "grad_norm": 0.6575201153755188, "learning_rate": 0.0003720452225312343, "loss": 0.6491, "step": 22970 }, { "epoch": 0.5114850427350427, "grad_norm": 0.3498440682888031, "learning_rate": 0.00037202144458404665, "loss": 0.6026, "step": 22980 }, { "epoch": 0.5117076210826211, "grad_norm": 0.8307576775550842, "learning_rate": 0.0003719976572890729, "loss": 0.573, "step": 22990 }, { "epoch": 0.5119301994301995, "grad_norm": 0.8176911473274231, "learning_rate": 0.0003719738606476056, "loss": 0.719, "step": 23000 }, { "epoch": 0.5121527777777778, "grad_norm": 0.7571500539779663, "learning_rate": 0.00037195005466093795, "loss": 0.6963, "step": 23010 }, { "epoch": 0.5123753561253561, "grad_norm": 0.9647487998008728, "learning_rate": 0.0003719262393303635, "loss": 0.5809, "step": 23020 }, { "epoch": 0.5125979344729344, "grad_norm": 0.7572385668754578, "learning_rate": 0.0003719024146571765, "loss": 0.7812, "step": 23030 }, { "epoch": 0.5128205128205128, "grad_norm": 1.1750061511993408, "learning_rate": 0.0003718785806426716, "loss": 0.6111, "step": 23040 }, { "epoch": 0.5130430911680912, "grad_norm": 0.5060396790504456, "learning_rate": 0.00037185473728814386, "loss": 0.7578, "step": 23050 }, { "epoch": 0.5132656695156695, "grad_norm": 0.5580783486366272, "learning_rate": 0.00037183088459488906, "loss": 0.6576, "step": 23060 }, { "epoch": 0.5134882478632479, "grad_norm": 0.6911407709121704, "learning_rate": 0.0003718070225642033, "loss": 0.6099, "step": 23070 }, { "epoch": 0.5137108262108262, "grad_norm": 0.6477782726287842, "learning_rate": 0.00037178315119738327, "loss": 0.7283, "step": 23080 }, { "epoch": 0.5139334045584045, "grad_norm": 0.5770291090011597, "learning_rate": 0.00037175927049572623, "loss": 0.6806, "step": 23090 }, { "epoch": 0.5141559829059829, "grad_norm": 0.7728904485702515, "learning_rate": 0.00037173538046052977, "loss": 0.6663, "step": 23100 }, { "epoch": 0.5143785612535613, "grad_norm": 0.8347638249397278, "learning_rate": 0.0003717114810930922, "loss": 0.5948, "step": 23110 }, { "epoch": 0.5146011396011396, "grad_norm": 0.6052911281585693, "learning_rate": 0.0003716875723947121, "loss": 0.5974, "step": 23120 }, { "epoch": 0.514823717948718, "grad_norm": 0.9551176428794861, "learning_rate": 0.0003716636543666888, "loss": 0.6198, "step": 23130 }, { "epoch": 0.5150462962962963, "grad_norm": 0.4984239339828491, "learning_rate": 0.00037163972701032206, "loss": 0.5537, "step": 23140 }, { "epoch": 0.5152688746438746, "grad_norm": 0.5534871220588684, "learning_rate": 0.000371615790326912, "loss": 0.7402, "step": 23150 }, { "epoch": 0.5154914529914529, "grad_norm": 0.6838101744651794, "learning_rate": 0.00037159184431775937, "loss": 0.604, "step": 23160 }, { "epoch": 0.5157140313390314, "grad_norm": 0.5246807932853699, "learning_rate": 0.0003715678889841654, "loss": 0.5308, "step": 23170 }, { "epoch": 0.5159366096866097, "grad_norm": 1.1374447345733643, "learning_rate": 0.000371543924327432, "loss": 0.7181, "step": 23180 }, { "epoch": 0.516159188034188, "grad_norm": 0.8713901042938232, "learning_rate": 0.0003715199503488613, "loss": 0.5903, "step": 23190 }, { "epoch": 0.5163817663817664, "grad_norm": 0.5066902041435242, "learning_rate": 0.00037149596704975604, "loss": 0.5557, "step": 23200 }, { "epoch": 0.5166043447293447, "grad_norm": 0.7605969309806824, "learning_rate": 0.00037147197443141957, "loss": 0.56, "step": 23210 }, { "epoch": 0.5168269230769231, "grad_norm": 0.9316904544830322, "learning_rate": 0.0003714479724951556, "loss": 0.6541, "step": 23220 }, { "epoch": 0.5170495014245015, "grad_norm": 0.6615206599235535, "learning_rate": 0.00037142396124226847, "loss": 0.7339, "step": 23230 }, { "epoch": 0.5172720797720798, "grad_norm": 0.4588244557380676, "learning_rate": 0.000371399940674063, "loss": 0.6555, "step": 23240 }, { "epoch": 0.5174946581196581, "grad_norm": 0.8920488357543945, "learning_rate": 0.00037137591079184436, "loss": 0.6542, "step": 23250 }, { "epoch": 0.5177172364672364, "grad_norm": 0.6547673344612122, "learning_rate": 0.0003713518715969185, "loss": 0.6547, "step": 23260 }, { "epoch": 0.5179398148148148, "grad_norm": 0.8283072710037231, "learning_rate": 0.00037132782309059163, "loss": 0.7011, "step": 23270 }, { "epoch": 0.5181623931623932, "grad_norm": 0.7763789296150208, "learning_rate": 0.0003713037652741706, "loss": 0.688, "step": 23280 }, { "epoch": 0.5183849715099715, "grad_norm": 0.6761392951011658, "learning_rate": 0.0003712796981489627, "loss": 0.6504, "step": 23290 }, { "epoch": 0.5186075498575499, "grad_norm": 0.6442679166793823, "learning_rate": 0.0003712556217162758, "loss": 0.6057, "step": 23300 }, { "epoch": 0.5188301282051282, "grad_norm": 0.6514276266098022, "learning_rate": 0.00037123153597741823, "loss": 0.5104, "step": 23310 }, { "epoch": 0.5190527065527065, "grad_norm": 0.79853355884552, "learning_rate": 0.00037120744093369887, "loss": 0.6224, "step": 23320 }, { "epoch": 0.5192752849002849, "grad_norm": 0.630884051322937, "learning_rate": 0.00037118333658642694, "loss": 0.5658, "step": 23330 }, { "epoch": 0.5194978632478633, "grad_norm": 1.0390676259994507, "learning_rate": 0.00037115922293691245, "loss": 0.7549, "step": 23340 }, { "epoch": 0.5197204415954416, "grad_norm": 0.7197343111038208, "learning_rate": 0.00037113509998646554, "loss": 0.6401, "step": 23350 }, { "epoch": 0.51994301994302, "grad_norm": 0.7944161295890808, "learning_rate": 0.00037111096773639727, "loss": 0.5773, "step": 23360 }, { "epoch": 0.5201655982905983, "grad_norm": 0.5929605960845947, "learning_rate": 0.00037108682618801895, "loss": 0.5739, "step": 23370 }, { "epoch": 0.5203881766381766, "grad_norm": 0.679068922996521, "learning_rate": 0.0003710626753426424, "loss": 0.5944, "step": 23380 }, { "epoch": 0.5206107549857549, "grad_norm": 0.867856502532959, "learning_rate": 0.00037103851520158004, "loss": 0.7019, "step": 23390 }, { "epoch": 0.5208333333333334, "grad_norm": 0.4647440016269684, "learning_rate": 0.0003710143457661448, "loss": 0.5525, "step": 23400 }, { "epoch": 0.5210559116809117, "grad_norm": 0.7468436360359192, "learning_rate": 0.00037099016703764996, "loss": 0.5012, "step": 23410 }, { "epoch": 0.52127849002849, "grad_norm": 0.8206691145896912, "learning_rate": 0.00037096597901740947, "loss": 0.7897, "step": 23420 }, { "epoch": 0.5215010683760684, "grad_norm": 0.6338486671447754, "learning_rate": 0.00037094178170673765, "loss": 0.589, "step": 23430 }, { "epoch": 0.5217236467236467, "grad_norm": 0.6311833262443542, "learning_rate": 0.0003709175751069496, "loss": 0.5387, "step": 23440 }, { "epoch": 0.5219462250712251, "grad_norm": 0.8822476267814636, "learning_rate": 0.00037089335921936054, "loss": 0.6912, "step": 23450 }, { "epoch": 0.5221688034188035, "grad_norm": 0.5450563430786133, "learning_rate": 0.0003708691340452865, "loss": 0.5881, "step": 23460 }, { "epoch": 0.5223913817663818, "grad_norm": 0.845903754234314, "learning_rate": 0.00037084489958604373, "loss": 0.6549, "step": 23470 }, { "epoch": 0.5226139601139601, "grad_norm": 0.7649895548820496, "learning_rate": 0.00037082065584294934, "loss": 0.7001, "step": 23480 }, { "epoch": 0.5228365384615384, "grad_norm": 0.521933376789093, "learning_rate": 0.00037079640281732063, "loss": 0.7073, "step": 23490 }, { "epoch": 0.5230591168091168, "grad_norm": 0.6133053302764893, "learning_rate": 0.00037077214051047555, "loss": 0.5488, "step": 23500 }, { "epoch": 0.5232816951566952, "grad_norm": 0.422405481338501, "learning_rate": 0.0003707478689237326, "loss": 0.593, "step": 23510 }, { "epoch": 0.5235042735042735, "grad_norm": 0.8190181255340576, "learning_rate": 0.00037072358805841066, "loss": 0.673, "step": 23520 }, { "epoch": 0.5237268518518519, "grad_norm": 0.7577298879623413, "learning_rate": 0.0003706992979158292, "loss": 0.6556, "step": 23530 }, { "epoch": 0.5239494301994302, "grad_norm": 0.447704941034317, "learning_rate": 0.00037067499849730815, "loss": 0.7052, "step": 23540 }, { "epoch": 0.5241720085470085, "grad_norm": 0.5758539438247681, "learning_rate": 0.0003706506898041679, "loss": 0.5871, "step": 23550 }, { "epoch": 0.5243945868945868, "grad_norm": 0.669194221496582, "learning_rate": 0.0003706263718377295, "loss": 0.5948, "step": 23560 }, { "epoch": 0.5246171652421653, "grad_norm": 0.8515828847885132, "learning_rate": 0.00037060204459931435, "loss": 0.6965, "step": 23570 }, { "epoch": 0.5248397435897436, "grad_norm": 0.594035267829895, "learning_rate": 0.0003705777080902445, "loss": 0.6552, "step": 23580 }, { "epoch": 0.5250623219373219, "grad_norm": 0.4388732612133026, "learning_rate": 0.0003705533623118423, "loss": 0.6365, "step": 23590 }, { "epoch": 0.5252849002849003, "grad_norm": 0.8432769179344177, "learning_rate": 0.00037052900726543085, "loss": 0.4611, "step": 23600 }, { "epoch": 0.5255074786324786, "grad_norm": 0.6244149208068848, "learning_rate": 0.0003705046429523335, "loss": 0.6372, "step": 23610 }, { "epoch": 0.5257300569800569, "grad_norm": 0.4183826744556427, "learning_rate": 0.0003704802693738742, "loss": 0.6894, "step": 23620 }, { "epoch": 0.5259526353276354, "grad_norm": 0.8266147971153259, "learning_rate": 0.00037045588653137755, "loss": 0.7249, "step": 23630 }, { "epoch": 0.5261752136752137, "grad_norm": 0.669316291809082, "learning_rate": 0.00037043149442616847, "loss": 0.5349, "step": 23640 }, { "epoch": 0.526397792022792, "grad_norm": 0.7408722639083862, "learning_rate": 0.0003704070930595725, "loss": 0.7687, "step": 23650 }, { "epoch": 0.5266203703703703, "grad_norm": 0.9338363409042358, "learning_rate": 0.0003703826824329155, "loss": 0.6314, "step": 23660 }, { "epoch": 0.5268429487179487, "grad_norm": 1.0505681037902832, "learning_rate": 0.00037035826254752413, "loss": 0.6607, "step": 23670 }, { "epoch": 0.5270655270655271, "grad_norm": 1.1003633737564087, "learning_rate": 0.00037033383340472536, "loss": 0.7044, "step": 23680 }, { "epoch": 0.5272881054131054, "grad_norm": 0.5831704139709473, "learning_rate": 0.00037030939500584654, "loss": 0.7256, "step": 23690 }, { "epoch": 0.5275106837606838, "grad_norm": 0.8710972666740417, "learning_rate": 0.0003702849473522158, "loss": 0.5331, "step": 23700 }, { "epoch": 0.5277332621082621, "grad_norm": 0.6843211650848389, "learning_rate": 0.00037026049044516166, "loss": 0.6554, "step": 23710 }, { "epoch": 0.5279558404558404, "grad_norm": 0.5306840538978577, "learning_rate": 0.0003702360242860131, "loss": 0.5618, "step": 23720 }, { "epoch": 0.5281784188034188, "grad_norm": 0.6895719766616821, "learning_rate": 0.00037021154887609953, "loss": 0.5243, "step": 23730 }, { "epoch": 0.5284009971509972, "grad_norm": 0.41083499789237976, "learning_rate": 0.0003701870642167511, "loss": 0.7049, "step": 23740 }, { "epoch": 0.5286235754985755, "grad_norm": 0.9922495484352112, "learning_rate": 0.0003701625703092983, "loss": 0.641, "step": 23750 }, { "epoch": 0.5288461538461539, "grad_norm": 0.6313428282737732, "learning_rate": 0.00037013806715507214, "loss": 0.662, "step": 23760 }, { "epoch": 0.5290687321937322, "grad_norm": 0.5346630811691284, "learning_rate": 0.00037011355475540414, "loss": 0.7308, "step": 23770 }, { "epoch": 0.5292913105413105, "grad_norm": 0.7471534013748169, "learning_rate": 0.00037008903311162617, "loss": 0.57, "step": 23780 }, { "epoch": 0.5295138888888888, "grad_norm": 0.6286147832870483, "learning_rate": 0.0003700645022250711, "loss": 0.6702, "step": 23790 }, { "epoch": 0.5297364672364673, "grad_norm": 0.526298463344574, "learning_rate": 0.00037003996209707157, "loss": 0.6621, "step": 23800 }, { "epoch": 0.5299590455840456, "grad_norm": 0.579492449760437, "learning_rate": 0.00037001541272896143, "loss": 0.6442, "step": 23810 }, { "epoch": 0.5301816239316239, "grad_norm": 0.40422266721725464, "learning_rate": 0.00036999085412207455, "loss": 0.6136, "step": 23820 }, { "epoch": 0.5304042022792023, "grad_norm": 0.5754667520523071, "learning_rate": 0.0003699662862777455, "loss": 0.7358, "step": 23830 }, { "epoch": 0.5306267806267806, "grad_norm": 0.3586984872817993, "learning_rate": 0.00036994170919730926, "loss": 0.572, "step": 23840 }, { "epoch": 0.5308493589743589, "grad_norm": 0.5701540112495422, "learning_rate": 0.00036991712288210146, "loss": 0.5319, "step": 23850 }, { "epoch": 0.5310719373219374, "grad_norm": 0.6356937885284424, "learning_rate": 0.0003698925273334581, "loss": 0.5811, "step": 23860 }, { "epoch": 0.5312945156695157, "grad_norm": 0.7801693081855774, "learning_rate": 0.0003698679225527157, "loss": 0.5285, "step": 23870 }, { "epoch": 0.531517094017094, "grad_norm": 0.6584993600845337, "learning_rate": 0.0003698433085412114, "loss": 0.5872, "step": 23880 }, { "epoch": 0.5317396723646723, "grad_norm": 0.9852097630500793, "learning_rate": 0.00036981868530028267, "loss": 0.5429, "step": 23890 }, { "epoch": 0.5319622507122507, "grad_norm": 1.0301185846328735, "learning_rate": 0.00036979405283126747, "loss": 0.7417, "step": 23900 }, { "epoch": 0.5321848290598291, "grad_norm": 0.712128758430481, "learning_rate": 0.00036976941113550454, "loss": 0.5969, "step": 23910 }, { "epoch": 0.5324074074074074, "grad_norm": 0.6619728207588196, "learning_rate": 0.00036974476021433276, "loss": 0.4735, "step": 23920 }, { "epoch": 0.5326299857549858, "grad_norm": 0.6637291312217712, "learning_rate": 0.00036972010006909177, "loss": 0.5637, "step": 23930 }, { "epoch": 0.5328525641025641, "grad_norm": 0.670197606086731, "learning_rate": 0.00036969543070112154, "loss": 0.5575, "step": 23940 }, { "epoch": 0.5330751424501424, "grad_norm": 0.6946614384651184, "learning_rate": 0.00036967075211176285, "loss": 0.733, "step": 23950 }, { "epoch": 0.5332977207977208, "grad_norm": 0.9224454760551453, "learning_rate": 0.00036964606430235647, "loss": 0.5485, "step": 23960 }, { "epoch": 0.5335202991452992, "grad_norm": 0.6943668127059937, "learning_rate": 0.0003696213672742441, "loss": 0.7453, "step": 23970 }, { "epoch": 0.5337428774928775, "grad_norm": 0.7649568915367126, "learning_rate": 0.0003695966610287677, "loss": 0.7363, "step": 23980 }, { "epoch": 0.5339654558404558, "grad_norm": 0.8155977129936218, "learning_rate": 0.00036957194556727, "loss": 0.5108, "step": 23990 }, { "epoch": 0.5341880341880342, "grad_norm": 0.8972145915031433, "learning_rate": 0.00036954722089109395, "loss": 0.4441, "step": 24000 }, { "epoch": 0.5344106125356125, "grad_norm": 0.7684455513954163, "learning_rate": 0.00036952248700158305, "loss": 0.543, "step": 24010 }, { "epoch": 0.5346331908831908, "grad_norm": 0.8472932577133179, "learning_rate": 0.0003694977439000815, "loss": 0.7048, "step": 24020 }, { "epoch": 0.5348557692307693, "grad_norm": 0.8715865612030029, "learning_rate": 0.0003694729915879338, "loss": 0.6962, "step": 24030 }, { "epoch": 0.5350783475783476, "grad_norm": 0.8468021750450134, "learning_rate": 0.00036944823006648494, "loss": 0.6192, "step": 24040 }, { "epoch": 0.5353009259259259, "grad_norm": 0.5794544219970703, "learning_rate": 0.0003694234593370806, "loss": 0.6453, "step": 24050 }, { "epoch": 0.5355235042735043, "grad_norm": 1.058516263961792, "learning_rate": 0.00036939867940106677, "loss": 0.593, "step": 24060 }, { "epoch": 0.5357460826210826, "grad_norm": 0.6857909560203552, "learning_rate": 0.00036937389025979, "loss": 0.6452, "step": 24070 }, { "epoch": 0.5359686609686609, "grad_norm": 0.5412909388542175, "learning_rate": 0.00036934909191459734, "loss": 0.6902, "step": 24080 }, { "epoch": 0.5361912393162394, "grad_norm": 0.696171760559082, "learning_rate": 0.0003693242843668365, "loss": 0.7975, "step": 24090 }, { "epoch": 0.5364138176638177, "grad_norm": 0.5655274987220764, "learning_rate": 0.00036929946761785537, "loss": 0.507, "step": 24100 }, { "epoch": 0.536636396011396, "grad_norm": 0.8406558632850647, "learning_rate": 0.00036927464166900255, "loss": 0.7638, "step": 24110 }, { "epoch": 0.5368589743589743, "grad_norm": 0.831794023513794, "learning_rate": 0.00036924980652162714, "loss": 0.7466, "step": 24120 }, { "epoch": 0.5370815527065527, "grad_norm": 0.7433670163154602, "learning_rate": 0.0003692249621770787, "loss": 0.6373, "step": 24130 }, { "epoch": 0.5373041310541311, "grad_norm": 1.1349642276763916, "learning_rate": 0.0003692001086367073, "loss": 0.6649, "step": 24140 }, { "epoch": 0.5375267094017094, "grad_norm": 0.5549025535583496, "learning_rate": 0.0003691752459018634, "loss": 0.7769, "step": 24150 }, { "epoch": 0.5377492877492878, "grad_norm": 0.4678022861480713, "learning_rate": 0.00036915037397389824, "loss": 0.6035, "step": 24160 }, { "epoch": 0.5379718660968661, "grad_norm": 0.6058796644210815, "learning_rate": 0.0003691254928541633, "loss": 0.5491, "step": 24170 }, { "epoch": 0.5381944444444444, "grad_norm": 0.7192685008049011, "learning_rate": 0.00036910060254401054, "loss": 0.7346, "step": 24180 }, { "epoch": 0.5384170227920227, "grad_norm": 0.5943816304206848, "learning_rate": 0.00036907570304479264, "loss": 0.7317, "step": 24190 }, { "epoch": 0.5386396011396012, "grad_norm": 0.8074010014533997, "learning_rate": 0.00036905079435786264, "loss": 0.7167, "step": 24200 }, { "epoch": 0.5388621794871795, "grad_norm": 0.5450646877288818, "learning_rate": 0.0003690258764845741, "loss": 0.6761, "step": 24210 }, { "epoch": 0.5390847578347578, "grad_norm": 0.8199670314788818, "learning_rate": 0.00036900094942628105, "loss": 0.5708, "step": 24220 }, { "epoch": 0.5393073361823362, "grad_norm": 0.5967410206794739, "learning_rate": 0.000368976013184338, "loss": 0.4851, "step": 24230 }, { "epoch": 0.5395299145299145, "grad_norm": 0.8061283826828003, "learning_rate": 0.00036895106776010006, "loss": 0.5799, "step": 24240 }, { "epoch": 0.5397524928774928, "grad_norm": 0.6298791170120239, "learning_rate": 0.0003689261131549229, "loss": 0.6722, "step": 24250 }, { "epoch": 0.5399750712250713, "grad_norm": 0.8000302314758301, "learning_rate": 0.0003689011493701624, "loss": 0.5899, "step": 24260 }, { "epoch": 0.5400641025641025, "eval_loss": 0.6258811950683594, "eval_runtime": 337.2215, "eval_samples_per_second": 7.013, "eval_steps_per_second": 7.013, "step": 24264 }, { "epoch": 0.5401976495726496, "grad_norm": 0.8310365080833435, "learning_rate": 0.00036887617640717513, "loss": 0.6246, "step": 24270 }, { "epoch": 0.5404202279202279, "grad_norm": 0.7653577327728271, "learning_rate": 0.0003688511942673182, "loss": 0.5665, "step": 24280 }, { "epoch": 0.5406428062678063, "grad_norm": 0.6216724514961243, "learning_rate": 0.0003688262029519492, "loss": 0.5356, "step": 24290 }, { "epoch": 0.5408653846153846, "grad_norm": 0.5706295967102051, "learning_rate": 0.0003688012024624261, "loss": 0.7744, "step": 24300 }, { "epoch": 0.5410879629629629, "grad_norm": 0.7092143297195435, "learning_rate": 0.00036877619280010744, "loss": 0.6825, "step": 24310 }, { "epoch": 0.5413105413105413, "grad_norm": 0.8476496338844299, "learning_rate": 0.00036875117396635234, "loss": 0.768, "step": 24320 }, { "epoch": 0.5415331196581197, "grad_norm": 0.6610291004180908, "learning_rate": 0.0003687261459625203, "loss": 0.6644, "step": 24330 }, { "epoch": 0.541755698005698, "grad_norm": 0.4774007797241211, "learning_rate": 0.0003687011087899713, "loss": 0.6664, "step": 24340 }, { "epoch": 0.5419782763532763, "grad_norm": 1.256393313407898, "learning_rate": 0.00036867606245006597, "loss": 0.6652, "step": 24350 }, { "epoch": 0.5422008547008547, "grad_norm": 0.5507504343986511, "learning_rate": 0.00036865100694416535, "loss": 0.5793, "step": 24360 }, { "epoch": 0.5424234330484331, "grad_norm": 1.010922908782959, "learning_rate": 0.000368625942273631, "loss": 0.6169, "step": 24370 }, { "epoch": 0.5426460113960114, "grad_norm": 0.7326360940933228, "learning_rate": 0.0003686008684398248, "loss": 0.6562, "step": 24380 }, { "epoch": 0.5428685897435898, "grad_norm": 0.8364658951759338, "learning_rate": 0.0003685757854441095, "loss": 0.6444, "step": 24390 }, { "epoch": 0.5430911680911681, "grad_norm": 0.73721843957901, "learning_rate": 0.0003685506932878479, "loss": 0.74, "step": 24400 }, { "epoch": 0.5433137464387464, "grad_norm": 0.689838707447052, "learning_rate": 0.00036852559197240363, "loss": 0.6273, "step": 24410 }, { "epoch": 0.5435363247863247, "grad_norm": 0.5031551122665405, "learning_rate": 0.0003685004814991408, "loss": 0.5855, "step": 24420 }, { "epoch": 0.5437589031339032, "grad_norm": 0.8085066676139832, "learning_rate": 0.0003684753618694239, "loss": 0.5039, "step": 24430 }, { "epoch": 0.5439814814814815, "grad_norm": 0.5739592909812927, "learning_rate": 0.00036845023308461783, "loss": 0.6338, "step": 24440 }, { "epoch": 0.5442040598290598, "grad_norm": 0.7855496406555176, "learning_rate": 0.00036842509514608824, "loss": 0.489, "step": 24450 }, { "epoch": 0.5444266381766382, "grad_norm": 0.5251293182373047, "learning_rate": 0.00036839994805520107, "loss": 0.6098, "step": 24460 }, { "epoch": 0.5446492165242165, "grad_norm": 0.8609602451324463, "learning_rate": 0.00036837479181332295, "loss": 0.5285, "step": 24470 }, { "epoch": 0.5448717948717948, "grad_norm": 0.628017783164978, "learning_rate": 0.00036834962642182074, "loss": 0.6919, "step": 24480 }, { "epoch": 0.5450943732193733, "grad_norm": 0.5945170521736145, "learning_rate": 0.000368324451882062, "loss": 0.7378, "step": 24490 }, { "epoch": 0.5453169515669516, "grad_norm": 0.6073055267333984, "learning_rate": 0.00036829926819541476, "loss": 0.7293, "step": 24500 }, { "epoch": 0.5455395299145299, "grad_norm": 0.8239802122116089, "learning_rate": 0.00036827407536324747, "loss": 0.7061, "step": 24510 }, { "epoch": 0.5457621082621082, "grad_norm": 0.5099710822105408, "learning_rate": 0.00036824887338692924, "loss": 0.6795, "step": 24520 }, { "epoch": 0.5459846866096866, "grad_norm": 0.5891330242156982, "learning_rate": 0.00036822366226782943, "loss": 0.6871, "step": 24530 }, { "epoch": 0.5462072649572649, "grad_norm": 0.8029281497001648, "learning_rate": 0.00036819844200731814, "loss": 0.6513, "step": 24540 }, { "epoch": 0.5464298433048433, "grad_norm": 0.7131868600845337, "learning_rate": 0.0003681732126067659, "loss": 0.6778, "step": 24550 }, { "epoch": 0.5466524216524217, "grad_norm": 0.6184797883033752, "learning_rate": 0.0003681479740675435, "loss": 0.7158, "step": 24560 }, { "epoch": 0.546875, "grad_norm": 0.8021351099014282, "learning_rate": 0.0003681227263910225, "loss": 0.5596, "step": 24570 }, { "epoch": 0.5470975783475783, "grad_norm": 0.71075040102005, "learning_rate": 0.00036809746957857504, "loss": 0.6611, "step": 24580 }, { "epoch": 0.5473201566951567, "grad_norm": 0.813607931137085, "learning_rate": 0.0003680722036315734, "loss": 0.6434, "step": 24590 }, { "epoch": 0.5475427350427351, "grad_norm": 0.4937489330768585, "learning_rate": 0.00036804692855139064, "loss": 0.5076, "step": 24600 }, { "epoch": 0.5477653133903134, "grad_norm": 0.7622106075286865, "learning_rate": 0.00036802164433940025, "loss": 0.7269, "step": 24610 }, { "epoch": 0.5479878917378918, "grad_norm": 0.8251647353172302, "learning_rate": 0.00036799635099697605, "loss": 0.6925, "step": 24620 }, { "epoch": 0.5482104700854701, "grad_norm": 0.5652473568916321, "learning_rate": 0.00036797104852549274, "loss": 0.4639, "step": 24630 }, { "epoch": 0.5484330484330484, "grad_norm": 0.6232659816741943, "learning_rate": 0.00036794573692632503, "loss": 0.6967, "step": 24640 }, { "epoch": 0.5486556267806267, "grad_norm": 0.8803776502609253, "learning_rate": 0.0003679204162008485, "loss": 0.6616, "step": 24650 }, { "epoch": 0.5488782051282052, "grad_norm": 0.9684003591537476, "learning_rate": 0.0003678950863504392, "loss": 0.7329, "step": 24660 }, { "epoch": 0.5491007834757835, "grad_norm": 1.0482511520385742, "learning_rate": 0.00036786974737647337, "loss": 0.6262, "step": 24670 }, { "epoch": 0.5493233618233618, "grad_norm": 0.8103609681129456, "learning_rate": 0.0003678443992803281, "loss": 0.63, "step": 24680 }, { "epoch": 0.5495459401709402, "grad_norm": 0.7665788531303406, "learning_rate": 0.0003678190420633807, "loss": 0.6572, "step": 24690 }, { "epoch": 0.5497685185185185, "grad_norm": 0.6839772462844849, "learning_rate": 0.0003677936757270092, "loss": 0.589, "step": 24700 }, { "epoch": 0.5499910968660968, "grad_norm": 0.7093663215637207, "learning_rate": 0.00036776830027259204, "loss": 0.676, "step": 24710 }, { "epoch": 0.5502136752136753, "grad_norm": 0.8419938087463379, "learning_rate": 0.000367742915701508, "loss": 0.6811, "step": 24720 }, { "epoch": 0.5504362535612536, "grad_norm": 0.6843752861022949, "learning_rate": 0.0003677175220151367, "loss": 0.6703, "step": 24730 }, { "epoch": 0.5506588319088319, "grad_norm": 0.8185937404632568, "learning_rate": 0.0003676921192148579, "loss": 0.6178, "step": 24740 }, { "epoch": 0.5508814102564102, "grad_norm": 0.5972009301185608, "learning_rate": 0.0003676667073020521, "loss": 0.6306, "step": 24750 }, { "epoch": 0.5511039886039886, "grad_norm": 0.7188451290130615, "learning_rate": 0.00036764128627810017, "loss": 0.5118, "step": 24760 }, { "epoch": 0.5513265669515669, "grad_norm": 0.565358579158783, "learning_rate": 0.0003676158561443835, "loss": 0.6679, "step": 24770 }, { "epoch": 0.5515491452991453, "grad_norm": 0.6097283959388733, "learning_rate": 0.00036759041690228396, "loss": 0.6096, "step": 24780 }, { "epoch": 0.5517717236467237, "grad_norm": 0.641505777835846, "learning_rate": 0.00036756496855318396, "loss": 0.6809, "step": 24790 }, { "epoch": 0.551994301994302, "grad_norm": 0.8230586647987366, "learning_rate": 0.00036753951109846645, "loss": 0.5768, "step": 24800 }, { "epoch": 0.5522168803418803, "grad_norm": 1.1163147687911987, "learning_rate": 0.0003675140445395147, "loss": 0.637, "step": 24810 }, { "epoch": 0.5524394586894587, "grad_norm": 1.0802885293960571, "learning_rate": 0.0003674885688777127, "loss": 0.5016, "step": 24820 }, { "epoch": 0.5526620370370371, "grad_norm": 0.955658495426178, "learning_rate": 0.00036746308411444463, "loss": 0.609, "step": 24830 }, { "epoch": 0.5528846153846154, "grad_norm": 0.5523378849029541, "learning_rate": 0.0003674375902510956, "loss": 0.5569, "step": 24840 }, { "epoch": 0.5531071937321937, "grad_norm": 0.8032726645469666, "learning_rate": 0.0003674120872890508, "loss": 0.6498, "step": 24850 }, { "epoch": 0.5533297720797721, "grad_norm": 0.5347462296485901, "learning_rate": 0.0003673865752296961, "loss": 0.6242, "step": 24860 }, { "epoch": 0.5535523504273504, "grad_norm": 0.603898823261261, "learning_rate": 0.0003673610540744179, "loss": 0.5649, "step": 24870 }, { "epoch": 0.5537749287749287, "grad_norm": 0.3848460614681244, "learning_rate": 0.00036733552382460304, "loss": 0.5996, "step": 24880 }, { "epoch": 0.5539975071225072, "grad_norm": 0.7490546107292175, "learning_rate": 0.0003673099844816388, "loss": 0.6149, "step": 24890 }, { "epoch": 0.5542200854700855, "grad_norm": 0.630919873714447, "learning_rate": 0.0003672844360469131, "loss": 0.59, "step": 24900 }, { "epoch": 0.5544426638176638, "grad_norm": 1.0681458711624146, "learning_rate": 0.00036725887852181413, "loss": 0.7436, "step": 24910 }, { "epoch": 0.5546652421652422, "grad_norm": 0.5887709856033325, "learning_rate": 0.0003672333119077307, "loss": 0.604, "step": 24920 }, { "epoch": 0.5548878205128205, "grad_norm": 0.621523916721344, "learning_rate": 0.0003672077362060524, "loss": 0.6135, "step": 24930 }, { "epoch": 0.5551103988603988, "grad_norm": 0.5083357095718384, "learning_rate": 0.0003671821514181686, "loss": 0.6993, "step": 24940 }, { "epoch": 0.5553329772079773, "grad_norm": 0.7470302581787109, "learning_rate": 0.00036715655754547, "loss": 0.692, "step": 24950 }, { "epoch": 0.5555555555555556, "grad_norm": 0.5239513516426086, "learning_rate": 0.0003671309545893472, "loss": 0.6401, "step": 24960 }, { "epoch": 0.5557781339031339, "grad_norm": 0.6273417472839355, "learning_rate": 0.00036710534255119146, "loss": 0.5611, "step": 24970 }, { "epoch": 0.5560007122507122, "grad_norm": 0.4738331735134125, "learning_rate": 0.00036707972143239465, "loss": 0.6627, "step": 24980 }, { "epoch": 0.5562232905982906, "grad_norm": 0.5370237827301025, "learning_rate": 0.000367054091234349, "loss": 0.6241, "step": 24990 }, { "epoch": 0.5564458689458689, "grad_norm": 0.5510512590408325, "learning_rate": 0.0003670284519584472, "loss": 0.6591, "step": 25000 }, { "epoch": 0.5566684472934473, "grad_norm": 0.6586712002754211, "learning_rate": 0.0003670028036060826, "loss": 0.6104, "step": 25010 }, { "epoch": 0.5568910256410257, "grad_norm": 0.8063983917236328, "learning_rate": 0.000366977146178649, "loss": 0.6048, "step": 25020 }, { "epoch": 0.557113603988604, "grad_norm": 0.8083786368370056, "learning_rate": 0.0003669514796775406, "loss": 0.6517, "step": 25030 }, { "epoch": 0.5573361823361823, "grad_norm": 0.481600821018219, "learning_rate": 0.00036692580410415207, "loss": 0.6172, "step": 25040 }, { "epoch": 0.5575587606837606, "grad_norm": 0.7515348196029663, "learning_rate": 0.0003669001194598787, "loss": 0.6766, "step": 25050 }, { "epoch": 0.5577813390313391, "grad_norm": 0.5784083008766174, "learning_rate": 0.00036687442574611615, "loss": 0.5886, "step": 25060 }, { "epoch": 0.5580039173789174, "grad_norm": 0.8006730079650879, "learning_rate": 0.0003668487229642608, "loss": 0.6038, "step": 25070 }, { "epoch": 0.5582264957264957, "grad_norm": 0.6441351771354675, "learning_rate": 0.0003668230111157092, "loss": 0.5502, "step": 25080 }, { "epoch": 0.5584490740740741, "grad_norm": 0.8060956597328186, "learning_rate": 0.0003667972902018586, "loss": 0.6798, "step": 25090 }, { "epoch": 0.5586716524216524, "grad_norm": 0.5438023805618286, "learning_rate": 0.00036677156022410674, "loss": 0.7869, "step": 25100 }, { "epoch": 0.5588942307692307, "grad_norm": 0.6967006921768188, "learning_rate": 0.00036674582118385174, "loss": 0.7252, "step": 25110 }, { "epoch": 0.5591168091168092, "grad_norm": 1.0618207454681396, "learning_rate": 0.0003667200730824924, "loss": 0.7587, "step": 25120 }, { "epoch": 0.5593393874643875, "grad_norm": 0.8812199234962463, "learning_rate": 0.0003666943159214276, "loss": 0.6272, "step": 25130 }, { "epoch": 0.5595619658119658, "grad_norm": 0.7430441379547119, "learning_rate": 0.0003666685497020574, "loss": 0.6784, "step": 25140 }, { "epoch": 0.5597845441595442, "grad_norm": 0.6264715790748596, "learning_rate": 0.0003666427744257817, "loss": 0.5762, "step": 25150 }, { "epoch": 0.5600071225071225, "grad_norm": 0.7243641018867493, "learning_rate": 0.00036661699009400125, "loss": 0.564, "step": 25160 }, { "epoch": 0.5602297008547008, "grad_norm": 0.7546271681785583, "learning_rate": 0.0003665911967081171, "loss": 0.5746, "step": 25170 }, { "epoch": 0.5604522792022792, "grad_norm": 0.7522180080413818, "learning_rate": 0.00036656539426953104, "loss": 0.6288, "step": 25180 }, { "epoch": 0.5606748575498576, "grad_norm": 0.7609201073646545, "learning_rate": 0.000366539582779645, "loss": 0.7739, "step": 25190 }, { "epoch": 0.5608974358974359, "grad_norm": 0.804742157459259, "learning_rate": 0.0003665137622398617, "loss": 0.7322, "step": 25200 }, { "epoch": 0.5611200142450142, "grad_norm": 0.7445507049560547, "learning_rate": 0.0003664879326515843, "loss": 0.6174, "step": 25210 }, { "epoch": 0.5613425925925926, "grad_norm": 0.5064197182655334, "learning_rate": 0.0003664620940162164, "loss": 0.6026, "step": 25220 }, { "epoch": 0.5615651709401709, "grad_norm": 0.7717465758323669, "learning_rate": 0.0003664362463351619, "loss": 0.6095, "step": 25230 }, { "epoch": 0.5617877492877493, "grad_norm": 0.8355364799499512, "learning_rate": 0.0003664103896098256, "loss": 0.7885, "step": 25240 }, { "epoch": 0.5620103276353277, "grad_norm": 0.9792073369026184, "learning_rate": 0.0003663845238416125, "loss": 0.7913, "step": 25250 }, { "epoch": 0.562232905982906, "grad_norm": 0.5464093089103699, "learning_rate": 0.0003663586490319281, "loss": 0.6366, "step": 25260 }, { "epoch": 0.5624554843304843, "grad_norm": 1.1562082767486572, "learning_rate": 0.0003663327651821786, "loss": 0.6432, "step": 25270 }, { "epoch": 0.5626780626780626, "grad_norm": 0.7032479047775269, "learning_rate": 0.00036630687229377047, "loss": 0.6814, "step": 25280 }, { "epoch": 0.5629006410256411, "grad_norm": 0.855377733707428, "learning_rate": 0.0003662809703681107, "loss": 0.6815, "step": 25290 }, { "epoch": 0.5631232193732194, "grad_norm": 0.7194737792015076, "learning_rate": 0.00036625505940660687, "loss": 0.6352, "step": 25300 }, { "epoch": 0.5633457977207977, "grad_norm": 0.6877269744873047, "learning_rate": 0.00036622913941066707, "loss": 0.5418, "step": 25310 }, { "epoch": 0.5635683760683761, "grad_norm": 0.6020021438598633, "learning_rate": 0.0003662032103816998, "loss": 0.4835, "step": 25320 }, { "epoch": 0.5637909544159544, "grad_norm": 0.9626232385635376, "learning_rate": 0.00036617727232111393, "loss": 0.7111, "step": 25330 }, { "epoch": 0.5640135327635327, "grad_norm": 0.5444539785385132, "learning_rate": 0.000366151325230319, "loss": 0.5697, "step": 25340 }, { "epoch": 0.5642361111111112, "grad_norm": 0.8597228527069092, "learning_rate": 0.00036612536911072513, "loss": 0.7058, "step": 25350 }, { "epoch": 0.5644586894586895, "grad_norm": 0.46579617261886597, "learning_rate": 0.0003660994039637427, "loss": 0.5261, "step": 25360 }, { "epoch": 0.5646812678062678, "grad_norm": 0.7070087790489197, "learning_rate": 0.0003660734297907826, "loss": 0.664, "step": 25370 }, { "epoch": 0.5649038461538461, "grad_norm": 0.7118215560913086, "learning_rate": 0.0003660474465932565, "loss": 0.7071, "step": 25380 }, { "epoch": 0.5651264245014245, "grad_norm": 0.6779159307479858, "learning_rate": 0.00036602145437257614, "loss": 0.5657, "step": 25390 }, { "epoch": 0.5653490028490028, "grad_norm": 0.5989352464675903, "learning_rate": 0.00036599545313015404, "loss": 0.5557, "step": 25400 }, { "epoch": 0.5655715811965812, "grad_norm": 0.8546493053436279, "learning_rate": 0.0003659694428674032, "loss": 0.7181, "step": 25410 }, { "epoch": 0.5657941595441596, "grad_norm": 0.6739733219146729, "learning_rate": 0.00036594342358573683, "loss": 0.566, "step": 25420 }, { "epoch": 0.5660167378917379, "grad_norm": 0.9013862013816833, "learning_rate": 0.00036591739528656905, "loss": 0.7043, "step": 25430 }, { "epoch": 0.5662393162393162, "grad_norm": 0.7019580602645874, "learning_rate": 0.0003658913579713142, "loss": 0.6938, "step": 25440 }, { "epoch": 0.5664618945868946, "grad_norm": 0.6055698394775391, "learning_rate": 0.00036586531164138706, "loss": 0.6043, "step": 25450 }, { "epoch": 0.5666844729344729, "grad_norm": 0.5486964583396912, "learning_rate": 0.0003658392562982032, "loss": 0.6373, "step": 25460 }, { "epoch": 0.5669070512820513, "grad_norm": 0.9835435748100281, "learning_rate": 0.0003658131919431784, "loss": 0.6492, "step": 25470 }, { "epoch": 0.5671296296296297, "grad_norm": 1.1097290515899658, "learning_rate": 0.000365787118577729, "loss": 0.52, "step": 25480 }, { "epoch": 0.567352207977208, "grad_norm": 0.7063470482826233, "learning_rate": 0.0003657610362032718, "loss": 0.6542, "step": 25490 }, { "epoch": 0.5675747863247863, "grad_norm": 1.0969784259796143, "learning_rate": 0.00036573494482122423, "loss": 0.68, "step": 25500 }, { "epoch": 0.5677973646723646, "grad_norm": 0.7936034202575684, "learning_rate": 0.00036570884443300406, "loss": 0.5786, "step": 25510 }, { "epoch": 0.5680199430199431, "grad_norm": 0.7072309851646423, "learning_rate": 0.00036568273504002964, "loss": 0.6274, "step": 25520 }, { "epoch": 0.5682425213675214, "grad_norm": 1.030142903327942, "learning_rate": 0.0003656566166437198, "loss": 0.6594, "step": 25530 }, { "epoch": 0.5684650997150997, "grad_norm": 0.7467629909515381, "learning_rate": 0.00036563048924549376, "loss": 0.5441, "step": 25540 }, { "epoch": 0.5686876780626781, "grad_norm": 0.5796084403991699, "learning_rate": 0.0003656043528467714, "loss": 0.5869, "step": 25550 }, { "epoch": 0.5689102564102564, "grad_norm": 0.5709526538848877, "learning_rate": 0.00036557820744897285, "loss": 0.5471, "step": 25560 }, { "epoch": 0.5691328347578347, "grad_norm": 1.1228928565979004, "learning_rate": 0.000365552053053519, "loss": 0.7536, "step": 25570 }, { "epoch": 0.5693554131054132, "grad_norm": 0.6233734488487244, "learning_rate": 0.00036552588966183103, "loss": 0.638, "step": 25580 }, { "epoch": 0.5695779914529915, "grad_norm": 0.848446249961853, "learning_rate": 0.00036549971727533074, "loss": 0.5712, "step": 25590 }, { "epoch": 0.5698005698005698, "grad_norm": 0.7704927921295166, "learning_rate": 0.00036547353589544033, "loss": 0.5454, "step": 25600 }, { "epoch": 0.5700231481481481, "grad_norm": 0.5541343092918396, "learning_rate": 0.00036544734552358254, "loss": 0.5125, "step": 25610 }, { "epoch": 0.5702457264957265, "grad_norm": 0.6036107540130615, "learning_rate": 0.0003654211461611805, "loss": 0.6403, "step": 25620 }, { "epoch": 0.5704683048433048, "grad_norm": 0.996376097202301, "learning_rate": 0.000365394937809658, "loss": 0.608, "step": 25630 }, { "epoch": 0.5706908831908832, "grad_norm": 0.5010185837745667, "learning_rate": 0.0003653687204704391, "loss": 0.4907, "step": 25640 }, { "epoch": 0.5709134615384616, "grad_norm": 1.5402705669403076, "learning_rate": 0.0003653424941449487, "loss": 0.6147, "step": 25650 }, { "epoch": 0.5711360398860399, "grad_norm": 0.9346346259117126, "learning_rate": 0.0003653162588346117, "loss": 0.7224, "step": 25660 }, { "epoch": 0.5713586182336182, "grad_norm": 0.8029820919036865, "learning_rate": 0.00036529001454085387, "loss": 0.5599, "step": 25670 }, { "epoch": 0.5715811965811965, "grad_norm": 0.7116772532463074, "learning_rate": 0.00036526376126510136, "loss": 0.6636, "step": 25680 }, { "epoch": 0.5718037749287749, "grad_norm": 0.4518112242221832, "learning_rate": 0.0003652374990087807, "loss": 0.6618, "step": 25690 }, { "epoch": 0.5720263532763533, "grad_norm": 0.6178964376449585, "learning_rate": 0.0003652112277733192, "loss": 0.4945, "step": 25700 }, { "epoch": 0.5722489316239316, "grad_norm": 0.5029324889183044, "learning_rate": 0.0003651849475601443, "loss": 0.5303, "step": 25710 }, { "epoch": 0.57247150997151, "grad_norm": 0.6093102693557739, "learning_rate": 0.00036515865837068406, "loss": 0.5364, "step": 25720 }, { "epoch": 0.5726940883190883, "grad_norm": 0.6700037121772766, "learning_rate": 0.0003651323602063672, "loss": 0.6722, "step": 25730 }, { "epoch": 0.5729166666666666, "grad_norm": 0.5653918385505676, "learning_rate": 0.0003651060530686226, "loss": 0.5539, "step": 25740 }, { "epoch": 0.5731392450142451, "grad_norm": 1.0705516338348389, "learning_rate": 0.00036507973695888, "loss": 0.7888, "step": 25750 }, { "epoch": 0.5733618233618234, "grad_norm": 0.5369784235954285, "learning_rate": 0.0003650534118785693, "loss": 0.5768, "step": 25760 }, { "epoch": 0.5735844017094017, "grad_norm": 0.7054560780525208, "learning_rate": 0.0003650270778291211, "loss": 0.6524, "step": 25770 }, { "epoch": 0.57380698005698, "grad_norm": 0.5634106397628784, "learning_rate": 0.00036500073481196646, "loss": 0.6372, "step": 25780 }, { "epoch": 0.5740295584045584, "grad_norm": 0.6334834098815918, "learning_rate": 0.0003649743828285368, "loss": 0.6347, "step": 25790 }, { "epoch": 0.5742521367521367, "grad_norm": 0.8807213306427002, "learning_rate": 0.00036494802188026396, "loss": 0.6003, "step": 25800 }, { "epoch": 0.5744747150997151, "grad_norm": 0.8929731845855713, "learning_rate": 0.0003649216519685807, "loss": 0.7382, "step": 25810 }, { "epoch": 0.5746972934472935, "grad_norm": 0.43042662739753723, "learning_rate": 0.0003648952730949199, "loss": 0.6431, "step": 25820 }, { "epoch": 0.5749198717948718, "grad_norm": 0.6679039597511292, "learning_rate": 0.00036486888526071496, "loss": 0.7355, "step": 25830 }, { "epoch": 0.5751424501424501, "grad_norm": 0.8297902941703796, "learning_rate": 0.00036484248846739976, "loss": 0.7014, "step": 25840 }, { "epoch": 0.5753650284900285, "grad_norm": 0.4913148283958435, "learning_rate": 0.0003648160827164088, "loss": 0.5351, "step": 25850 }, { "epoch": 0.5755876068376068, "grad_norm": 0.7515653371810913, "learning_rate": 0.000364789668009177, "loss": 0.6329, "step": 25860 }, { "epoch": 0.5758101851851852, "grad_norm": 0.7551255822181702, "learning_rate": 0.0003647632443471398, "loss": 0.6201, "step": 25870 }, { "epoch": 0.5760327635327636, "grad_norm": 0.8179723024368286, "learning_rate": 0.00036473681173173294, "loss": 0.6514, "step": 25880 }, { "epoch": 0.5762553418803419, "grad_norm": 0.4706272482872009, "learning_rate": 0.00036471037016439287, "loss": 0.7006, "step": 25890 }, { "epoch": 0.5764779202279202, "grad_norm": 0.6636437773704529, "learning_rate": 0.0003646839196465565, "loss": 0.6523, "step": 25900 }, { "epoch": 0.5767004985754985, "grad_norm": 0.7169828414916992, "learning_rate": 0.0003646574601796611, "loss": 0.6445, "step": 25910 }, { "epoch": 0.5769230769230769, "grad_norm": 0.7686861753463745, "learning_rate": 0.00036463099176514447, "loss": 0.5783, "step": 25920 }, { "epoch": 0.5771456552706553, "grad_norm": 0.6391460299491882, "learning_rate": 0.000364604514404445, "loss": 0.5322, "step": 25930 }, { "epoch": 0.5773682336182336, "grad_norm": 0.79499751329422, "learning_rate": 0.0003645780280990015, "loss": 0.659, "step": 25940 }, { "epoch": 0.577590811965812, "grad_norm": 0.7389349341392517, "learning_rate": 0.0003645515328502532, "loss": 0.5739, "step": 25950 }, { "epoch": 0.5778133903133903, "grad_norm": 0.8088229894638062, "learning_rate": 0.0003645250286596399, "loss": 0.528, "step": 25960 }, { "epoch": 0.5780359686609686, "grad_norm": 0.4672555923461914, "learning_rate": 0.00036449851552860184, "loss": 0.6496, "step": 25970 }, { "epoch": 0.5782585470085471, "grad_norm": 0.7340976595878601, "learning_rate": 0.00036447199345857983, "loss": 0.7119, "step": 25980 }, { "epoch": 0.5784811253561254, "grad_norm": 0.5094794631004333, "learning_rate": 0.000364445462451015, "loss": 0.6694, "step": 25990 }, { "epoch": 0.5787037037037037, "grad_norm": 0.6030073165893555, "learning_rate": 0.00036441892250734914, "loss": 0.6226, "step": 26000 }, { "epoch": 0.578926282051282, "grad_norm": 0.7402897477149963, "learning_rate": 0.0003643923736290244, "loss": 0.6389, "step": 26010 }, { "epoch": 0.5791488603988604, "grad_norm": 0.48543375730514526, "learning_rate": 0.00036436581581748357, "loss": 0.5502, "step": 26020 }, { "epoch": 0.5793714387464387, "grad_norm": 0.7590353488922119, "learning_rate": 0.0003643392490741697, "loss": 0.715, "step": 26030 }, { "epoch": 0.5795940170940171, "grad_norm": 0.7811538577079773, "learning_rate": 0.0003643126734005265, "loss": 0.6448, "step": 26040 }, { "epoch": 0.5798165954415955, "grad_norm": 0.7644380927085876, "learning_rate": 0.00036428608879799816, "loss": 0.6684, "step": 26050 }, { "epoch": 0.5800391737891738, "grad_norm": 0.7908637523651123, "learning_rate": 0.0003642594952680292, "loss": 0.7409, "step": 26060 }, { "epoch": 0.5802617521367521, "grad_norm": 0.539625346660614, "learning_rate": 0.00036423289281206487, "loss": 0.6192, "step": 26070 }, { "epoch": 0.5804843304843305, "grad_norm": 1.0244312286376953, "learning_rate": 0.0003642062814315506, "loss": 0.7173, "step": 26080 }, { "epoch": 0.5807069088319088, "grad_norm": 0.5301309823989868, "learning_rate": 0.0003641796611279327, "loss": 0.5922, "step": 26090 }, { "epoch": 0.5809294871794872, "grad_norm": 0.769178569316864, "learning_rate": 0.00036415303190265747, "loss": 0.5554, "step": 26100 }, { "epoch": 0.5811520655270656, "grad_norm": 0.47896939516067505, "learning_rate": 0.0003641263937571722, "loss": 0.5712, "step": 26110 }, { "epoch": 0.5813746438746439, "grad_norm": 0.49155372381210327, "learning_rate": 0.0003640997466929243, "loss": 0.5569, "step": 26120 }, { "epoch": 0.5815972222222222, "grad_norm": 0.9530248641967773, "learning_rate": 0.00036407309071136184, "loss": 0.7337, "step": 26130 }, { "epoch": 0.5818198005698005, "grad_norm": 0.639849841594696, "learning_rate": 0.0003640464258139333, "loss": 0.7232, "step": 26140 }, { "epoch": 0.5820423789173789, "grad_norm": 0.5914270877838135, "learning_rate": 0.0003640197520020877, "loss": 0.6236, "step": 26150 }, { "epoch": 0.5822649572649573, "grad_norm": 1.2797378301620483, "learning_rate": 0.00036399306927727446, "loss": 0.6709, "step": 26160 }, { "epoch": 0.5824875356125356, "grad_norm": 0.7411321997642517, "learning_rate": 0.00036396637764094365, "loss": 0.6829, "step": 26170 }, { "epoch": 0.582710113960114, "grad_norm": 0.7942419648170471, "learning_rate": 0.0003639396770945456, "loss": 0.6094, "step": 26180 }, { "epoch": 0.5829326923076923, "grad_norm": 0.5377127528190613, "learning_rate": 0.0003639129676395313, "loss": 0.5491, "step": 26190 }, { "epoch": 0.5831552706552706, "grad_norm": 0.6508163213729858, "learning_rate": 0.00036388624927735224, "loss": 0.6355, "step": 26200 }, { "epoch": 0.5833778490028491, "grad_norm": 0.761579692363739, "learning_rate": 0.0003638595220094601, "loss": 0.509, "step": 26210 }, { "epoch": 0.5836004273504274, "grad_norm": 0.6185252666473389, "learning_rate": 0.00036383278583730747, "loss": 0.6034, "step": 26220 }, { "epoch": 0.5838230056980057, "grad_norm": 0.7071433067321777, "learning_rate": 0.0003638060407623471, "loss": 0.4562, "step": 26230 }, { "epoch": 0.584045584045584, "grad_norm": 0.4515877068042755, "learning_rate": 0.00036377928678603237, "loss": 0.5233, "step": 26240 }, { "epoch": 0.5842681623931624, "grad_norm": 0.5538159608840942, "learning_rate": 0.0003637525239098172, "loss": 0.4903, "step": 26250 }, { "epoch": 0.5844907407407407, "grad_norm": 0.6422973275184631, "learning_rate": 0.00036372575213515577, "loss": 0.6706, "step": 26260 }, { "epoch": 0.5847133190883191, "grad_norm": 0.6675984263420105, "learning_rate": 0.0003636989714635029, "loss": 0.6804, "step": 26270 }, { "epoch": 0.5849358974358975, "grad_norm": 0.6627989411354065, "learning_rate": 0.0003636721818963139, "loss": 0.6538, "step": 26280 }, { "epoch": 0.5851584757834758, "grad_norm": 0.5040997862815857, "learning_rate": 0.0003636453834350446, "loss": 0.5534, "step": 26290 }, { "epoch": 0.5853810541310541, "grad_norm": 0.8540357351303101, "learning_rate": 0.0003636185760811512, "loss": 0.701, "step": 26300 }, { "epoch": 0.5856036324786325, "grad_norm": 0.5554685592651367, "learning_rate": 0.0003635917598360904, "loss": 0.5512, "step": 26310 }, { "epoch": 0.5858262108262108, "grad_norm": 0.6635841131210327, "learning_rate": 0.0003635649347013195, "loss": 0.6747, "step": 26320 }, { "epoch": 0.5860487891737892, "grad_norm": 1.081513524055481, "learning_rate": 0.00036353810067829616, "loss": 0.5939, "step": 26330 }, { "epoch": 0.5862713675213675, "grad_norm": 0.5436891913414001, "learning_rate": 0.0003635112577684785, "loss": 0.5855, "step": 26340 }, { "epoch": 0.5864939458689459, "grad_norm": 0.586488664150238, "learning_rate": 0.00036348440597332523, "loss": 0.6242, "step": 26350 }, { "epoch": 0.5867165242165242, "grad_norm": 0.5410413146018982, "learning_rate": 0.00036345754529429553, "loss": 0.5378, "step": 26360 }, { "epoch": 0.5869391025641025, "grad_norm": 0.9179112315177917, "learning_rate": 0.000363430675732849, "loss": 0.6296, "step": 26370 }, { "epoch": 0.5871616809116809, "grad_norm": 0.6292691826820374, "learning_rate": 0.00036340379729044573, "loss": 0.6738, "step": 26380 }, { "epoch": 0.5873842592592593, "grad_norm": 0.47527605295181274, "learning_rate": 0.00036337690996854637, "loss": 0.6117, "step": 26390 }, { "epoch": 0.5876068376068376, "grad_norm": 0.779073178768158, "learning_rate": 0.00036335001376861203, "loss": 0.6211, "step": 26400 }, { "epoch": 0.587829415954416, "grad_norm": 0.765661358833313, "learning_rate": 0.0003633231086921042, "loss": 0.5958, "step": 26410 }, { "epoch": 0.5880519943019943, "grad_norm": 0.7019172310829163, "learning_rate": 0.00036329619474048485, "loss": 0.5939, "step": 26420 }, { "epoch": 0.5882745726495726, "grad_norm": 0.4380790591239929, "learning_rate": 0.00036326927191521663, "loss": 0.6325, "step": 26430 }, { "epoch": 0.5884971509971509, "grad_norm": 0.6067693829536438, "learning_rate": 0.0003632423402177626, "loss": 0.5798, "step": 26440 }, { "epoch": 0.5887197293447294, "grad_norm": 0.6987513899803162, "learning_rate": 0.00036321539964958606, "loss": 0.6136, "step": 26450 }, { "epoch": 0.5889423076923077, "grad_norm": 0.7827839255332947, "learning_rate": 0.0003631884502121511, "loss": 0.5435, "step": 26460 }, { "epoch": 0.589164886039886, "grad_norm": 0.6228511333465576, "learning_rate": 0.00036316149190692223, "loss": 0.5427, "step": 26470 }, { "epoch": 0.5893874643874644, "grad_norm": 0.7204742431640625, "learning_rate": 0.00036313452473536425, "loss": 0.6434, "step": 26480 }, { "epoch": 0.5896100427350427, "grad_norm": 0.6403375864028931, "learning_rate": 0.0003631075486989427, "loss": 0.6231, "step": 26490 }, { "epoch": 0.5898326210826211, "grad_norm": 0.46823328733444214, "learning_rate": 0.00036308056379912344, "loss": 0.5668, "step": 26500 }, { "epoch": 0.5900551994301995, "grad_norm": 0.5952900052070618, "learning_rate": 0.00036305357003737284, "loss": 0.5498, "step": 26510 }, { "epoch": 0.5902777777777778, "grad_norm": 0.8604204654693604, "learning_rate": 0.0003630265674151577, "loss": 0.6581, "step": 26520 }, { "epoch": 0.5905003561253561, "grad_norm": 1.0747565031051636, "learning_rate": 0.00036299955593394544, "loss": 0.6561, "step": 26530 }, { "epoch": 0.5907229344729344, "grad_norm": 0.7867896556854248, "learning_rate": 0.0003629725355952039, "loss": 0.668, "step": 26540 }, { "epoch": 0.5909455128205128, "grad_norm": 0.6911695599555969, "learning_rate": 0.0003629455064004014, "loss": 0.7318, "step": 26550 }, { "epoch": 0.5911680911680912, "grad_norm": 0.65760737657547, "learning_rate": 0.00036291846835100663, "loss": 0.709, "step": 26560 }, { "epoch": 0.5913906695156695, "grad_norm": 0.5564249157905579, "learning_rate": 0.0003628914214484889, "loss": 0.6305, "step": 26570 }, { "epoch": 0.5916132478632479, "grad_norm": 0.6941995620727539, "learning_rate": 0.00036286436569431805, "loss": 0.7899, "step": 26580 }, { "epoch": 0.5918358262108262, "grad_norm": 0.4514349699020386, "learning_rate": 0.0003628373010899642, "loss": 0.5176, "step": 26590 }, { "epoch": 0.5920584045584045, "grad_norm": 0.4993559420108795, "learning_rate": 0.0003628102276368981, "loss": 0.6228, "step": 26600 }, { "epoch": 0.5922809829059829, "grad_norm": 0.6619306802749634, "learning_rate": 0.00036278314533659095, "loss": 0.6596, "step": 26610 }, { "epoch": 0.5925035612535613, "grad_norm": 0.6388404965400696, "learning_rate": 0.0003627560541905144, "loss": 0.5418, "step": 26620 }, { "epoch": 0.5927261396011396, "grad_norm": 0.7671358585357666, "learning_rate": 0.00036272895420014066, "loss": 0.6312, "step": 26630 }, { "epoch": 0.592948717948718, "grad_norm": 0.4864520728588104, "learning_rate": 0.0003627018453669423, "loss": 0.528, "step": 26640 }, { "epoch": 0.5931712962962963, "grad_norm": 0.6411517262458801, "learning_rate": 0.0003626747276923925, "loss": 0.5324, "step": 26650 }, { "epoch": 0.5933938746438746, "grad_norm": 0.5414084792137146, "learning_rate": 0.00036264760117796484, "loss": 0.5629, "step": 26660 }, { "epoch": 0.5936164529914529, "grad_norm": 0.7658395171165466, "learning_rate": 0.00036262046582513337, "loss": 0.6198, "step": 26670 }, { "epoch": 0.5938390313390314, "grad_norm": 0.5669873356819153, "learning_rate": 0.00036259332163537266, "loss": 0.5935, "step": 26680 }, { "epoch": 0.5940616096866097, "grad_norm": 0.635143518447876, "learning_rate": 0.0003625661686101578, "loss": 0.5584, "step": 26690 }, { "epoch": 0.594284188034188, "grad_norm": 0.8045666217803955, "learning_rate": 0.0003625390067509641, "loss": 0.5575, "step": 26700 }, { "epoch": 0.5945067663817664, "grad_norm": 0.5953568816184998, "learning_rate": 0.0003625118360592678, "loss": 0.6082, "step": 26710 }, { "epoch": 0.5947293447293447, "grad_norm": 0.6741862893104553, "learning_rate": 0.0003624846565365453, "loss": 0.626, "step": 26720 }, { "epoch": 0.5949519230769231, "grad_norm": 0.4664577841758728, "learning_rate": 0.0003624574681842736, "loss": 0.6427, "step": 26730 }, { "epoch": 0.5951745014245015, "grad_norm": 0.9127620458602905, "learning_rate": 0.00036243027100393, "loss": 0.7419, "step": 26740 }, { "epoch": 0.5953970797720798, "grad_norm": 0.6853809952735901, "learning_rate": 0.00036240306499699256, "loss": 0.5637, "step": 26750 }, { "epoch": 0.5956196581196581, "grad_norm": 0.8793456554412842, "learning_rate": 0.00036237585016493955, "loss": 0.7329, "step": 26760 }, { "epoch": 0.5958422364672364, "grad_norm": 0.7338986396789551, "learning_rate": 0.00036234862650925, "loss": 0.7303, "step": 26770 }, { "epoch": 0.5960648148148148, "grad_norm": 1.601108431816101, "learning_rate": 0.00036232139403140313, "loss": 0.7491, "step": 26780 }, { "epoch": 0.5962873931623932, "grad_norm": 0.8078438639640808, "learning_rate": 0.0003622941527328788, "loss": 0.6711, "step": 26790 }, { "epoch": 0.5965099715099715, "grad_norm": 0.7182638049125671, "learning_rate": 0.00036226690261515734, "loss": 0.6057, "step": 26800 }, { "epoch": 0.5967325498575499, "grad_norm": 0.5949748158454895, "learning_rate": 0.0003622396436797196, "loss": 0.6201, "step": 26810 }, { "epoch": 0.5969551282051282, "grad_norm": 0.6344726085662842, "learning_rate": 0.0003622123759280468, "loss": 0.5434, "step": 26820 }, { "epoch": 0.5971777065527065, "grad_norm": 0.6737859845161438, "learning_rate": 0.00036218509936162077, "loss": 0.6778, "step": 26830 }, { "epoch": 0.5974002849002849, "grad_norm": 0.6358687281608582, "learning_rate": 0.0003621578139819236, "loss": 0.5214, "step": 26840 }, { "epoch": 0.5976228632478633, "grad_norm": 0.6937312483787537, "learning_rate": 0.00036213051979043807, "loss": 0.5482, "step": 26850 }, { "epoch": 0.5978454415954416, "grad_norm": 0.809160053730011, "learning_rate": 0.0003621032167886473, "loss": 0.6438, "step": 26860 }, { "epoch": 0.59806801994302, "grad_norm": 0.7212644815444946, "learning_rate": 0.0003620759049780351, "loss": 0.6243, "step": 26870 }, { "epoch": 0.5982905982905983, "grad_norm": 0.8717097640037537, "learning_rate": 0.0003620485843600856, "loss": 0.6122, "step": 26880 }, { "epoch": 0.5985131766381766, "grad_norm": 0.546472430229187, "learning_rate": 0.00036202125493628326, "loss": 0.6884, "step": 26890 }, { "epoch": 0.5987357549857549, "grad_norm": 0.8792861700057983, "learning_rate": 0.00036199391670811335, "loss": 0.7374, "step": 26900 }, { "epoch": 0.5989583333333334, "grad_norm": 0.6809139251708984, "learning_rate": 0.0003619665696770614, "loss": 0.6165, "step": 26910 }, { "epoch": 0.5991809116809117, "grad_norm": 0.7540663480758667, "learning_rate": 0.00036193921384461344, "loss": 0.605, "step": 26920 }, { "epoch": 0.59940349002849, "grad_norm": 0.6481123566627502, "learning_rate": 0.000361911849212256, "loss": 0.6943, "step": 26930 }, { "epoch": 0.5996260683760684, "grad_norm": 0.8331771492958069, "learning_rate": 0.00036188447578147615, "loss": 0.737, "step": 26940 }, { "epoch": 0.5998486467236467, "grad_norm": 0.6901444792747498, "learning_rate": 0.0003618570935537614, "loss": 0.6721, "step": 26950 }, { "epoch": 0.6000712250712251, "grad_norm": 0.763639509677887, "learning_rate": 0.00036182970253059965, "loss": 0.554, "step": 26960 }, { "epoch": 0.6000712250712251, "eval_loss": 0.6276374459266663, "eval_runtime": 337.4029, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 26960 }, { "epoch": 0.6002938034188035, "grad_norm": 0.6949964761734009, "learning_rate": 0.0003618023027134794, "loss": 0.6418, "step": 26970 }, { "epoch": 0.6005163817663818, "grad_norm": 0.8426802754402161, "learning_rate": 0.00036177489410388954, "loss": 0.5959, "step": 26980 }, { "epoch": 0.6007389601139601, "grad_norm": 0.709507167339325, "learning_rate": 0.0003617474767033195, "loss": 0.6305, "step": 26990 }, { "epoch": 0.6009615384615384, "grad_norm": 0.7175819873809814, "learning_rate": 0.00036172005051325916, "loss": 0.4853, "step": 27000 }, { "epoch": 0.6011841168091168, "grad_norm": 0.4408554434776306, "learning_rate": 0.00036169261553519887, "loss": 0.6845, "step": 27010 }, { "epoch": 0.6014066951566952, "grad_norm": 0.7172206044197083, "learning_rate": 0.00036166517177062957, "loss": 0.5734, "step": 27020 }, { "epoch": 0.6016292735042735, "grad_norm": 1.5862455368041992, "learning_rate": 0.00036163771922104236, "loss": 0.6894, "step": 27030 }, { "epoch": 0.6018518518518519, "grad_norm": 0.9772451519966125, "learning_rate": 0.0003616102578879293, "loss": 0.6468, "step": 27040 }, { "epoch": 0.6020744301994302, "grad_norm": 1.0505484342575073, "learning_rate": 0.00036158278777278244, "loss": 0.5143, "step": 27050 }, { "epoch": 0.6022970085470085, "grad_norm": 0.5887817740440369, "learning_rate": 0.0003615553088770946, "loss": 0.5918, "step": 27060 }, { "epoch": 0.6025195868945868, "grad_norm": 0.7041050791740417, "learning_rate": 0.0003615278212023591, "loss": 0.5378, "step": 27070 }, { "epoch": 0.6027421652421653, "grad_norm": 0.8429660201072693, "learning_rate": 0.00036150032475006945, "loss": 0.7211, "step": 27080 }, { "epoch": 0.6029647435897436, "grad_norm": 0.8379825353622437, "learning_rate": 0.00036147281952172, "loss": 0.671, "step": 27090 }, { "epoch": 0.6031873219373219, "grad_norm": 0.9291252493858337, "learning_rate": 0.0003614453055188054, "loss": 0.581, "step": 27100 }, { "epoch": 0.6034099002849003, "grad_norm": 0.8719322681427002, "learning_rate": 0.0003614177827428207, "loss": 0.6302, "step": 27110 }, { "epoch": 0.6036324786324786, "grad_norm": 0.6128899455070496, "learning_rate": 0.0003613902511952615, "loss": 0.7683, "step": 27120 }, { "epoch": 0.6038550569800569, "grad_norm": 0.9219420552253723, "learning_rate": 0.00036136271087762396, "loss": 0.5287, "step": 27130 }, { "epoch": 0.6040776353276354, "grad_norm": 0.6347882747650146, "learning_rate": 0.0003613351617914047, "loss": 0.6586, "step": 27140 }, { "epoch": 0.6043002136752137, "grad_norm": 0.7034119963645935, "learning_rate": 0.0003613076039381006, "loss": 0.5183, "step": 27150 }, { "epoch": 0.604522792022792, "grad_norm": 0.8866810202598572, "learning_rate": 0.00036128003731920915, "loss": 0.6867, "step": 27160 }, { "epoch": 0.6047453703703703, "grad_norm": 0.9018061757087708, "learning_rate": 0.0003612524619362286, "loss": 0.5517, "step": 27170 }, { "epoch": 0.6049679487179487, "grad_norm": 0.6314164400100708, "learning_rate": 0.00036122487779065716, "loss": 0.6991, "step": 27180 }, { "epoch": 0.6051905270655271, "grad_norm": 0.7263935804367065, "learning_rate": 0.00036119728488399395, "loss": 0.5825, "step": 27190 }, { "epoch": 0.6054131054131054, "grad_norm": 0.6667831540107727, "learning_rate": 0.00036116968321773824, "loss": 0.5671, "step": 27200 }, { "epoch": 0.6056356837606838, "grad_norm": 0.8208069801330566, "learning_rate": 0.0003611420727933901, "loss": 0.6483, "step": 27210 }, { "epoch": 0.6058582621082621, "grad_norm": 0.7313910126686096, "learning_rate": 0.00036111445361244974, "loss": 0.585, "step": 27220 }, { "epoch": 0.6060808404558404, "grad_norm": 0.6981318593025208, "learning_rate": 0.00036108682567641807, "loss": 0.6057, "step": 27230 }, { "epoch": 0.6063034188034188, "grad_norm": 0.40817198157310486, "learning_rate": 0.00036105918898679643, "loss": 0.5288, "step": 27240 }, { "epoch": 0.6065259971509972, "grad_norm": 1.2144339084625244, "learning_rate": 0.0003610315435450866, "loss": 0.8191, "step": 27250 }, { "epoch": 0.6067485754985755, "grad_norm": 0.712494432926178, "learning_rate": 0.0003610038893527909, "loss": 0.5641, "step": 27260 }, { "epoch": 0.6069711538461539, "grad_norm": 0.724338710308075, "learning_rate": 0.000360976226411412, "loss": 0.589, "step": 27270 }, { "epoch": 0.6071937321937322, "grad_norm": 0.6945069432258606, "learning_rate": 0.00036094855472245323, "loss": 0.6487, "step": 27280 }, { "epoch": 0.6074163105413105, "grad_norm": 0.6646556854248047, "learning_rate": 0.0003609208742874182, "loss": 0.6672, "step": 27290 }, { "epoch": 0.6076388888888888, "grad_norm": 0.6772031188011169, "learning_rate": 0.0003608931851078111, "loss": 0.5538, "step": 27300 }, { "epoch": 0.6078614672364673, "grad_norm": 0.4915063679218292, "learning_rate": 0.00036086548718513667, "loss": 0.7779, "step": 27310 }, { "epoch": 0.6080840455840456, "grad_norm": 0.669331431388855, "learning_rate": 0.0003608377805209, "loss": 0.6369, "step": 27320 }, { "epoch": 0.6083066239316239, "grad_norm": 0.5951343178749084, "learning_rate": 0.00036081006511660664, "loss": 0.6196, "step": 27330 }, { "epoch": 0.6085292022792023, "grad_norm": 0.6411831378936768, "learning_rate": 0.0003607823409737627, "loss": 0.4927, "step": 27340 }, { "epoch": 0.6087517806267806, "grad_norm": 0.7948203682899475, "learning_rate": 0.00036075460809387465, "loss": 0.772, "step": 27350 }, { "epoch": 0.6089743589743589, "grad_norm": 0.5307528972625732, "learning_rate": 0.00036072686647844966, "loss": 0.5684, "step": 27360 }, { "epoch": 0.6091969373219374, "grad_norm": 0.9788020253181458, "learning_rate": 0.0003606991161289952, "loss": 0.5741, "step": 27370 }, { "epoch": 0.6094195156695157, "grad_norm": 0.5573745369911194, "learning_rate": 0.0003606713570470192, "loss": 0.6287, "step": 27380 }, { "epoch": 0.609642094017094, "grad_norm": 0.655276894569397, "learning_rate": 0.00036064358923403007, "loss": 0.4993, "step": 27390 }, { "epoch": 0.6098646723646723, "grad_norm": 0.6795368194580078, "learning_rate": 0.00036061581269153684, "loss": 0.5322, "step": 27400 }, { "epoch": 0.6100872507122507, "grad_norm": 0.8472557067871094, "learning_rate": 0.00036058802742104884, "loss": 0.6817, "step": 27410 }, { "epoch": 0.6103098290598291, "grad_norm": 0.38561925292015076, "learning_rate": 0.00036056023342407597, "loss": 0.548, "step": 27420 }, { "epoch": 0.6105324074074074, "grad_norm": 0.5530402064323425, "learning_rate": 0.0003605324307021286, "loss": 0.599, "step": 27430 }, { "epoch": 0.6107549857549858, "grad_norm": 0.6635321974754333, "learning_rate": 0.00036050461925671756, "loss": 0.5592, "step": 27440 }, { "epoch": 0.6109775641025641, "grad_norm": 0.6213598847389221, "learning_rate": 0.000360476799089354, "loss": 0.6556, "step": 27450 }, { "epoch": 0.6112001424501424, "grad_norm": 0.8107698559761047, "learning_rate": 0.0003604489702015499, "loss": 0.6625, "step": 27460 }, { "epoch": 0.6114227207977208, "grad_norm": 0.8441813588142395, "learning_rate": 0.0003604211325948174, "loss": 0.659, "step": 27470 }, { "epoch": 0.6116452991452992, "grad_norm": 0.959496021270752, "learning_rate": 0.00036039328627066915, "loss": 0.7157, "step": 27480 }, { "epoch": 0.6118678774928775, "grad_norm": 1.1635382175445557, "learning_rate": 0.0003603654312306185, "loss": 0.6214, "step": 27490 }, { "epoch": 0.6120904558404558, "grad_norm": 0.6326111555099487, "learning_rate": 0.000360337567476179, "loss": 0.6334, "step": 27500 }, { "epoch": 0.6123130341880342, "grad_norm": 0.41505324840545654, "learning_rate": 0.00036030969500886487, "loss": 0.5372, "step": 27510 }, { "epoch": 0.6125356125356125, "grad_norm": 0.7522630095481873, "learning_rate": 0.00036028181383019063, "loss": 0.6667, "step": 27520 }, { "epoch": 0.6127581908831908, "grad_norm": 0.7173981070518494, "learning_rate": 0.00036025392394167143, "loss": 0.5472, "step": 27530 }, { "epoch": 0.6129807692307693, "grad_norm": 0.5531005859375, "learning_rate": 0.0003602260253448228, "loss": 0.5993, "step": 27540 }, { "epoch": 0.6132033475783476, "grad_norm": 0.9125895500183105, "learning_rate": 0.00036019811804116077, "loss": 0.5588, "step": 27550 }, { "epoch": 0.6134259259259259, "grad_norm": 0.6552771329879761, "learning_rate": 0.00036017020203220194, "loss": 0.4663, "step": 27560 }, { "epoch": 0.6136485042735043, "grad_norm": 0.9296733736991882, "learning_rate": 0.00036014227731946316, "loss": 0.6318, "step": 27570 }, { "epoch": 0.6138710826210826, "grad_norm": 0.82627272605896, "learning_rate": 0.0003601143439044619, "loss": 0.6888, "step": 27580 }, { "epoch": 0.6140936609686609, "grad_norm": 0.8942478895187378, "learning_rate": 0.0003600864017887162, "loss": 0.713, "step": 27590 }, { "epoch": 0.6143162393162394, "grad_norm": 1.0062716007232666, "learning_rate": 0.0003600584509737443, "loss": 0.5045, "step": 27600 }, { "epoch": 0.6145388176638177, "grad_norm": 0.904301643371582, "learning_rate": 0.00036003049146106516, "loss": 0.5635, "step": 27610 }, { "epoch": 0.614761396011396, "grad_norm": 0.5842049717903137, "learning_rate": 0.0003600025232521981, "loss": 0.7147, "step": 27620 }, { "epoch": 0.6149839743589743, "grad_norm": 0.3948265016078949, "learning_rate": 0.0003599745463486629, "loss": 0.6557, "step": 27630 }, { "epoch": 0.6152065527065527, "grad_norm": 0.9642991423606873, "learning_rate": 0.00035994656075198, "loss": 0.6969, "step": 27640 }, { "epoch": 0.6154291310541311, "grad_norm": 0.5985532999038696, "learning_rate": 0.00035991856646367, "loss": 0.5313, "step": 27650 }, { "epoch": 0.6156517094017094, "grad_norm": 0.510049045085907, "learning_rate": 0.00035989056348525414, "loss": 0.6087, "step": 27660 }, { "epoch": 0.6158742877492878, "grad_norm": 0.41505852341651917, "learning_rate": 0.00035986255181825425, "loss": 0.4565, "step": 27670 }, { "epoch": 0.6160968660968661, "grad_norm": 0.7733290791511536, "learning_rate": 0.00035983453146419233, "loss": 0.6904, "step": 27680 }, { "epoch": 0.6163194444444444, "grad_norm": 0.8328065872192383, "learning_rate": 0.0003598065024245912, "loss": 0.607, "step": 27690 }, { "epoch": 0.6165420227920227, "grad_norm": 0.5937551856040955, "learning_rate": 0.00035977846470097393, "loss": 0.6013, "step": 27700 }, { "epoch": 0.6167646011396012, "grad_norm": 0.4843963086605072, "learning_rate": 0.000359750418294864, "loss": 0.6265, "step": 27710 }, { "epoch": 0.6169871794871795, "grad_norm": 0.6373627185821533, "learning_rate": 0.00035972236320778555, "loss": 0.5394, "step": 27720 }, { "epoch": 0.6172097578347578, "grad_norm": 0.6972571611404419, "learning_rate": 0.0003596942994412632, "loss": 0.7018, "step": 27730 }, { "epoch": 0.6174323361823362, "grad_norm": 0.6809747219085693, "learning_rate": 0.00035966622699682186, "loss": 0.6933, "step": 27740 }, { "epoch": 0.6176549145299145, "grad_norm": 0.6674181818962097, "learning_rate": 0.0003596381458759871, "loss": 0.6342, "step": 27750 }, { "epoch": 0.6178774928774928, "grad_norm": 0.5982582569122314, "learning_rate": 0.0003596100560802847, "loss": 0.5072, "step": 27760 }, { "epoch": 0.6181000712250713, "grad_norm": 0.7734548449516296, "learning_rate": 0.00035958195761124126, "loss": 0.5844, "step": 27770 }, { "epoch": 0.6183226495726496, "grad_norm": 0.5733859539031982, "learning_rate": 0.00035955385047038355, "loss": 0.5021, "step": 27780 }, { "epoch": 0.6185452279202279, "grad_norm": 0.8854355216026306, "learning_rate": 0.000359525734659239, "loss": 0.7261, "step": 27790 }, { "epoch": 0.6187678062678063, "grad_norm": 0.534376859664917, "learning_rate": 0.00035949761017933546, "loss": 0.5296, "step": 27800 }, { "epoch": 0.6189903846153846, "grad_norm": 0.5967615246772766, "learning_rate": 0.00035946947703220124, "loss": 0.5746, "step": 27810 }, { "epoch": 0.6192129629629629, "grad_norm": 0.9212782382965088, "learning_rate": 0.0003594413352193651, "loss": 0.6819, "step": 27820 }, { "epoch": 0.6194355413105413, "grad_norm": 0.6893104910850525, "learning_rate": 0.0003594131847423562, "loss": 0.5115, "step": 27830 }, { "epoch": 0.6196581196581197, "grad_norm": 0.46085241436958313, "learning_rate": 0.0003593850256027044, "loss": 0.6953, "step": 27840 }, { "epoch": 0.619880698005698, "grad_norm": 0.5326361656188965, "learning_rate": 0.00035935685780193974, "loss": 0.5761, "step": 27850 }, { "epoch": 0.6201032763532763, "grad_norm": 0.8955104947090149, "learning_rate": 0.0003593286813415931, "loss": 0.6468, "step": 27860 }, { "epoch": 0.6203258547008547, "grad_norm": 0.6292840242385864, "learning_rate": 0.00035930049622319535, "loss": 0.5546, "step": 27870 }, { "epoch": 0.6205484330484331, "grad_norm": 0.48575925827026367, "learning_rate": 0.00035927230244827833, "loss": 0.7017, "step": 27880 }, { "epoch": 0.6207710113960114, "grad_norm": 0.7187725305557251, "learning_rate": 0.00035924410001837395, "loss": 0.6553, "step": 27890 }, { "epoch": 0.6209935897435898, "grad_norm": 0.6629842519760132, "learning_rate": 0.00035921588893501487, "loss": 0.5627, "step": 27900 }, { "epoch": 0.6212161680911681, "grad_norm": 0.8785918354988098, "learning_rate": 0.00035918766919973395, "loss": 0.6831, "step": 27910 }, { "epoch": 0.6214387464387464, "grad_norm": 0.847978949546814, "learning_rate": 0.0003591594408140649, "loss": 0.6169, "step": 27920 }, { "epoch": 0.6216613247863247, "grad_norm": 0.7552899122238159, "learning_rate": 0.0003591312037795414, "loss": 0.5807, "step": 27930 }, { "epoch": 0.6218839031339032, "grad_norm": 0.5452701449394226, "learning_rate": 0.0003591029580976981, "loss": 0.7221, "step": 27940 }, { "epoch": 0.6221064814814815, "grad_norm": 0.5792766213417053, "learning_rate": 0.0003590747037700698, "loss": 0.6582, "step": 27950 }, { "epoch": 0.6223290598290598, "grad_norm": 0.7792395353317261, "learning_rate": 0.0003590464407981919, "loss": 0.7478, "step": 27960 }, { "epoch": 0.6225516381766382, "grad_norm": 0.4148932993412018, "learning_rate": 0.00035901816918360014, "loss": 0.6012, "step": 27970 }, { "epoch": 0.6227742165242165, "grad_norm": 0.8461902737617493, "learning_rate": 0.00035898988892783096, "loss": 0.6691, "step": 27980 }, { "epoch": 0.6229967948717948, "grad_norm": 0.7921462059020996, "learning_rate": 0.00035896160003242103, "loss": 0.593, "step": 27990 }, { "epoch": 0.6232193732193733, "grad_norm": 0.6527276635169983, "learning_rate": 0.00035893330249890757, "loss": 0.6857, "step": 28000 }, { "epoch": 0.6234419515669516, "grad_norm": 0.35611993074417114, "learning_rate": 0.0003589049963288284, "loss": 0.7097, "step": 28010 }, { "epoch": 0.6236645299145299, "grad_norm": 0.6560295224189758, "learning_rate": 0.0003588766815237216, "loss": 0.5787, "step": 28020 }, { "epoch": 0.6238871082621082, "grad_norm": 0.669855535030365, "learning_rate": 0.00035884835808512594, "loss": 0.6569, "step": 28030 }, { "epoch": 0.6241096866096866, "grad_norm": 0.386393278837204, "learning_rate": 0.00035882002601458045, "loss": 0.6923, "step": 28040 }, { "epoch": 0.6243322649572649, "grad_norm": 0.5557284951210022, "learning_rate": 0.0003587916853136247, "loss": 0.606, "step": 28050 }, { "epoch": 0.6245548433048433, "grad_norm": 0.5414696931838989, "learning_rate": 0.00035876333598379873, "loss": 0.5686, "step": 28060 }, { "epoch": 0.6247774216524217, "grad_norm": 0.6101901531219482, "learning_rate": 0.00035873497802664316, "loss": 0.7799, "step": 28070 }, { "epoch": 0.625, "grad_norm": 0.7300838232040405, "learning_rate": 0.0003587066114436989, "loss": 0.6007, "step": 28080 }, { "epoch": 0.6252225783475783, "grad_norm": 0.42730575799942017, "learning_rate": 0.00035867823623650754, "loss": 0.5635, "step": 28090 }, { "epoch": 0.6254451566951567, "grad_norm": 0.7427262663841248, "learning_rate": 0.00035864985240661085, "loss": 0.7147, "step": 28100 }, { "epoch": 0.6256677350427351, "grad_norm": 0.6119892001152039, "learning_rate": 0.00035862145995555134, "loss": 0.6254, "step": 28110 }, { "epoch": 0.6258903133903134, "grad_norm": 0.6742544770240784, "learning_rate": 0.00035859305888487185, "loss": 0.6997, "step": 28120 }, { "epoch": 0.6261128917378918, "grad_norm": 0.7052304148674011, "learning_rate": 0.0003585646491961157, "loss": 0.5544, "step": 28130 }, { "epoch": 0.6263354700854701, "grad_norm": 0.5233950018882751, "learning_rate": 0.00035853623089082665, "loss": 0.5589, "step": 28140 }, { "epoch": 0.6265580484330484, "grad_norm": 0.6088318228721619, "learning_rate": 0.0003585078039705491, "loss": 0.8011, "step": 28150 }, { "epoch": 0.6267806267806267, "grad_norm": 0.8205341100692749, "learning_rate": 0.0003584793684368277, "loss": 0.675, "step": 28160 }, { "epoch": 0.6270032051282052, "grad_norm": 0.47185105085372925, "learning_rate": 0.0003584509242912076, "loss": 0.794, "step": 28170 }, { "epoch": 0.6272257834757835, "grad_norm": 0.48162466287612915, "learning_rate": 0.0003584224715352347, "loss": 0.8218, "step": 28180 }, { "epoch": 0.6274483618233618, "grad_norm": 0.6121693253517151, "learning_rate": 0.0003583940101704549, "loss": 0.6535, "step": 28190 }, { "epoch": 0.6276709401709402, "grad_norm": 0.8618494868278503, "learning_rate": 0.00035836554019841495, "loss": 0.6823, "step": 28200 }, { "epoch": 0.6278935185185185, "grad_norm": 0.9534609913825989, "learning_rate": 0.00035833706162066194, "loss": 0.6325, "step": 28210 }, { "epoch": 0.6281160968660968, "grad_norm": 0.6802915334701538, "learning_rate": 0.0003583085744387433, "loss": 0.6319, "step": 28220 }, { "epoch": 0.6283386752136753, "grad_norm": 0.7972056865692139, "learning_rate": 0.0003582800786542072, "loss": 0.5217, "step": 28230 }, { "epoch": 0.6285612535612536, "grad_norm": 0.8219149708747864, "learning_rate": 0.00035825157426860204, "loss": 0.7513, "step": 28240 }, { "epoch": 0.6287838319088319, "grad_norm": 0.813468337059021, "learning_rate": 0.0003582230612834768, "loss": 0.6351, "step": 28250 }, { "epoch": 0.6290064102564102, "grad_norm": 0.6350398659706116, "learning_rate": 0.0003581945397003809, "loss": 0.558, "step": 28260 }, { "epoch": 0.6292289886039886, "grad_norm": 0.8708319664001465, "learning_rate": 0.0003581660095208641, "loss": 0.5868, "step": 28270 }, { "epoch": 0.6294515669515669, "grad_norm": 0.7231774926185608, "learning_rate": 0.00035813747074647697, "loss": 0.6014, "step": 28280 }, { "epoch": 0.6296741452991453, "grad_norm": 0.9997678995132446, "learning_rate": 0.0003581089233787702, "loss": 0.6687, "step": 28290 }, { "epoch": 0.6298967236467237, "grad_norm": 0.5924893617630005, "learning_rate": 0.00035808036741929506, "loss": 0.6482, "step": 28300 }, { "epoch": 0.630119301994302, "grad_norm": 0.4346763789653778, "learning_rate": 0.0003580518028696034, "loss": 0.5784, "step": 28310 }, { "epoch": 0.6303418803418803, "grad_norm": 1.0329116582870483, "learning_rate": 0.00035802322973124733, "loss": 0.5514, "step": 28320 }, { "epoch": 0.6305644586894587, "grad_norm": 0.7910727262496948, "learning_rate": 0.0003579946480057796, "loss": 0.6541, "step": 28330 }, { "epoch": 0.6307870370370371, "grad_norm": 0.7650035619735718, "learning_rate": 0.00035796605769475336, "loss": 0.5975, "step": 28340 }, { "epoch": 0.6310096153846154, "grad_norm": 0.6159213185310364, "learning_rate": 0.00035793745879972224, "loss": 0.6394, "step": 28350 }, { "epoch": 0.6312321937321937, "grad_norm": 0.5805545449256897, "learning_rate": 0.0003579088513222403, "loss": 0.6183, "step": 28360 }, { "epoch": 0.6314547720797721, "grad_norm": 0.7439352869987488, "learning_rate": 0.00035788023526386214, "loss": 0.636, "step": 28370 }, { "epoch": 0.6316773504273504, "grad_norm": 0.7128954529762268, "learning_rate": 0.0003578516106261427, "loss": 0.7803, "step": 28380 }, { "epoch": 0.6318999287749287, "grad_norm": 0.7685819864273071, "learning_rate": 0.0003578229774106376, "loss": 0.5954, "step": 28390 }, { "epoch": 0.6321225071225072, "grad_norm": 0.5727129578590393, "learning_rate": 0.0003577943356189026, "loss": 0.5705, "step": 28400 }, { "epoch": 0.6323450854700855, "grad_norm": 0.6094703674316406, "learning_rate": 0.0003577656852524943, "loss": 0.6513, "step": 28410 }, { "epoch": 0.6325676638176638, "grad_norm": 0.5944792032241821, "learning_rate": 0.00035773702631296955, "loss": 0.5142, "step": 28420 }, { "epoch": 0.6327902421652422, "grad_norm": 0.4907169044017792, "learning_rate": 0.00035770835880188554, "loss": 0.6042, "step": 28430 }, { "epoch": 0.6330128205128205, "grad_norm": 0.8299042582511902, "learning_rate": 0.00035767968272080027, "loss": 0.6913, "step": 28440 }, { "epoch": 0.6332353988603988, "grad_norm": 0.5609634518623352, "learning_rate": 0.00035765099807127194, "loss": 0.6383, "step": 28450 }, { "epoch": 0.6334579772079773, "grad_norm": 0.713447093963623, "learning_rate": 0.00035762230485485933, "loss": 0.5828, "step": 28460 }, { "epoch": 0.6336805555555556, "grad_norm": 0.4866918623447418, "learning_rate": 0.0003575936030731216, "loss": 0.6244, "step": 28470 }, { "epoch": 0.6339031339031339, "grad_norm": 0.4822397828102112, "learning_rate": 0.00035756489272761855, "loss": 0.5602, "step": 28480 }, { "epoch": 0.6341257122507122, "grad_norm": 0.6143772602081299, "learning_rate": 0.0003575361738199102, "loss": 0.5137, "step": 28490 }, { "epoch": 0.6343482905982906, "grad_norm": 0.7375980019569397, "learning_rate": 0.0003575074463515572, "loss": 0.6496, "step": 28500 }, { "epoch": 0.6345708689458689, "grad_norm": 1.0098057985305786, "learning_rate": 0.0003574787103241206, "loss": 0.616, "step": 28510 }, { "epoch": 0.6347934472934473, "grad_norm": 0.6686311364173889, "learning_rate": 0.000357449965739162, "loss": 0.5475, "step": 28520 }, { "epoch": 0.6350160256410257, "grad_norm": 0.7210583090782166, "learning_rate": 0.0003574212125982434, "loss": 0.6409, "step": 28530 }, { "epoch": 0.635238603988604, "grad_norm": 0.8931546211242676, "learning_rate": 0.00035739245090292713, "loss": 0.7039, "step": 28540 }, { "epoch": 0.6354611823361823, "grad_norm": 0.4405869245529175, "learning_rate": 0.0003573636806547763, "loss": 0.5408, "step": 28550 }, { "epoch": 0.6356837606837606, "grad_norm": 0.7087313532829285, "learning_rate": 0.00035733490185535424, "loss": 0.7057, "step": 28560 }, { "epoch": 0.6359063390313391, "grad_norm": 0.6314172744750977, "learning_rate": 0.00035730611450622476, "loss": 0.5741, "step": 28570 }, { "epoch": 0.6361289173789174, "grad_norm": 0.6478539705276489, "learning_rate": 0.0003572773186089523, "loss": 0.7413, "step": 28580 }, { "epoch": 0.6363514957264957, "grad_norm": 0.47926536202430725, "learning_rate": 0.0003572485141651016, "loss": 0.5663, "step": 28590 }, { "epoch": 0.6365740740740741, "grad_norm": 0.8513566851615906, "learning_rate": 0.0003572197011762378, "loss": 0.6291, "step": 28600 }, { "epoch": 0.6367966524216524, "grad_norm": 0.7166478037834167, "learning_rate": 0.00035719087964392683, "loss": 0.7298, "step": 28610 }, { "epoch": 0.6370192307692307, "grad_norm": 0.6767480969429016, "learning_rate": 0.0003571620495697348, "loss": 0.647, "step": 28620 }, { "epoch": 0.6372418091168092, "grad_norm": 0.5176422595977783, "learning_rate": 0.0003571332109552283, "loss": 0.4404, "step": 28630 }, { "epoch": 0.6374643874643875, "grad_norm": 0.7984474897384644, "learning_rate": 0.00035710436380197445, "loss": 0.6743, "step": 28640 }, { "epoch": 0.6376869658119658, "grad_norm": 0.5979481339454651, "learning_rate": 0.00035707550811154095, "loss": 0.6531, "step": 28650 }, { "epoch": 0.6379095441595442, "grad_norm": 0.6247133016586304, "learning_rate": 0.00035704664388549567, "loss": 0.7876, "step": 28660 }, { "epoch": 0.6381321225071225, "grad_norm": 0.4504818916320801, "learning_rate": 0.0003570177711254072, "loss": 0.7235, "step": 28670 }, { "epoch": 0.6383547008547008, "grad_norm": 0.6310564279556274, "learning_rate": 0.00035698888983284454, "loss": 0.5558, "step": 28680 }, { "epoch": 0.6385772792022792, "grad_norm": 0.8027581572532654, "learning_rate": 0.00035696000000937707, "loss": 0.6422, "step": 28690 }, { "epoch": 0.6387998575498576, "grad_norm": 0.55892413854599, "learning_rate": 0.0003569311016565747, "loss": 0.5426, "step": 28700 }, { "epoch": 0.6390224358974359, "grad_norm": 0.5204585194587708, "learning_rate": 0.0003569021947760078, "loss": 0.4812, "step": 28710 }, { "epoch": 0.6392450142450142, "grad_norm": 0.6362113356590271, "learning_rate": 0.00035687327936924726, "loss": 0.5975, "step": 28720 }, { "epoch": 0.6394675925925926, "grad_norm": 0.45520856976509094, "learning_rate": 0.0003568443554378642, "loss": 0.4732, "step": 28730 }, { "epoch": 0.6396901709401709, "grad_norm": 0.6635448336601257, "learning_rate": 0.0003568154229834305, "loss": 0.542, "step": 28740 }, { "epoch": 0.6399127492877493, "grad_norm": 0.8992020487785339, "learning_rate": 0.0003567864820075183, "loss": 0.6056, "step": 28750 }, { "epoch": 0.6401353276353277, "grad_norm": 0.5336700081825256, "learning_rate": 0.00035675753251170045, "loss": 0.6984, "step": 28760 }, { "epoch": 0.640357905982906, "grad_norm": 0.5950776934623718, "learning_rate": 0.00035672857449754985, "loss": 0.7067, "step": 28770 }, { "epoch": 0.6405804843304843, "grad_norm": 0.4215049743652344, "learning_rate": 0.00035669960796664023, "loss": 0.6264, "step": 28780 }, { "epoch": 0.6408030626780626, "grad_norm": 1.0368537902832031, "learning_rate": 0.0003566706329205456, "loss": 0.4807, "step": 28790 }, { "epoch": 0.6410256410256411, "grad_norm": 0.517913281917572, "learning_rate": 0.00035664164936084053, "loss": 0.5554, "step": 28800 }, { "epoch": 0.6412482193732194, "grad_norm": 0.5046729445457458, "learning_rate": 0.0003566126572891, "loss": 0.5988, "step": 28810 }, { "epoch": 0.6414707977207977, "grad_norm": 0.7036557197570801, "learning_rate": 0.00035658365670689947, "loss": 0.5184, "step": 28820 }, { "epoch": 0.6416933760683761, "grad_norm": 0.4869050681591034, "learning_rate": 0.0003565546476158149, "loss": 0.6969, "step": 28830 }, { "epoch": 0.6419159544159544, "grad_norm": 1.1376734972000122, "learning_rate": 0.00035652563001742257, "loss": 0.5895, "step": 28840 }, { "epoch": 0.6421385327635327, "grad_norm": 0.6507123112678528, "learning_rate": 0.00035649660391329934, "loss": 0.7427, "step": 28850 }, { "epoch": 0.6423611111111112, "grad_norm": 0.49312824010849, "learning_rate": 0.00035646756930502257, "loss": 0.5665, "step": 28860 }, { "epoch": 0.6425836894586895, "grad_norm": 0.7473471760749817, "learning_rate": 0.00035643852619417004, "loss": 0.6141, "step": 28870 }, { "epoch": 0.6428062678062678, "grad_norm": 0.602100133895874, "learning_rate": 0.00035640947458231986, "loss": 0.6248, "step": 28880 }, { "epoch": 0.6430288461538461, "grad_norm": 0.4823180139064789, "learning_rate": 0.0003563804144710508, "loss": 0.5684, "step": 28890 }, { "epoch": 0.6432514245014245, "grad_norm": 0.6477437019348145, "learning_rate": 0.00035635134586194204, "loss": 0.6773, "step": 28900 }, { "epoch": 0.6434740028490028, "grad_norm": 0.6315403580665588, "learning_rate": 0.00035632226875657316, "loss": 0.6367, "step": 28910 }, { "epoch": 0.6436965811965812, "grad_norm": 0.7338327169418335, "learning_rate": 0.00035629318315652417, "loss": 0.5882, "step": 28920 }, { "epoch": 0.6439191595441596, "grad_norm": 0.4510704278945923, "learning_rate": 0.0003562640890633756, "loss": 0.6198, "step": 28930 }, { "epoch": 0.6441417378917379, "grad_norm": 0.6776688694953918, "learning_rate": 0.00035623498647870865, "loss": 0.6024, "step": 28940 }, { "epoch": 0.6443643162393162, "grad_norm": 0.4859476685523987, "learning_rate": 0.0003562058754041045, "loss": 0.5885, "step": 28950 }, { "epoch": 0.6445868945868946, "grad_norm": 0.8991909027099609, "learning_rate": 0.0003561767558411453, "loss": 0.6528, "step": 28960 }, { "epoch": 0.6448094729344729, "grad_norm": 0.6131160855293274, "learning_rate": 0.00035614762779141333, "loss": 0.7233, "step": 28970 }, { "epoch": 0.6450320512820513, "grad_norm": 0.723871648311615, "learning_rate": 0.0003561184912564914, "loss": 0.7137, "step": 28980 }, { "epoch": 0.6452546296296297, "grad_norm": 0.6209283471107483, "learning_rate": 0.0003560893462379629, "loss": 0.6371, "step": 28990 }, { "epoch": 0.645477207977208, "grad_norm": 0.7736254334449768, "learning_rate": 0.0003560601927374115, "loss": 0.711, "step": 29000 }, { "epoch": 0.6456997863247863, "grad_norm": 0.7672634720802307, "learning_rate": 0.0003560310307564215, "loss": 0.7021, "step": 29010 }, { "epoch": 0.6459223646723646, "grad_norm": 0.7883049249649048, "learning_rate": 0.0003560018602965775, "loss": 0.6126, "step": 29020 }, { "epoch": 0.6461449430199431, "grad_norm": 0.7914969325065613, "learning_rate": 0.00035597268135946475, "loss": 0.6034, "step": 29030 }, { "epoch": 0.6463675213675214, "grad_norm": 0.5914848446846008, "learning_rate": 0.00035594349394666884, "loss": 0.5519, "step": 29040 }, { "epoch": 0.6465900997150997, "grad_norm": 0.647603452205658, "learning_rate": 0.00035591429805977573, "loss": 0.7662, "step": 29050 }, { "epoch": 0.6468126780626781, "grad_norm": 0.5746428370475769, "learning_rate": 0.00035588509370037207, "loss": 0.6192, "step": 29060 }, { "epoch": 0.6470352564102564, "grad_norm": 0.7971509099006653, "learning_rate": 0.0003558558808700448, "loss": 0.727, "step": 29070 }, { "epoch": 0.6472578347578347, "grad_norm": 0.8022960424423218, "learning_rate": 0.0003558266595703814, "loss": 0.5038, "step": 29080 }, { "epoch": 0.6474804131054132, "grad_norm": 0.7282474040985107, "learning_rate": 0.00035579742980296967, "loss": 0.5816, "step": 29090 }, { "epoch": 0.6477029914529915, "grad_norm": 0.667869508266449, "learning_rate": 0.00035576819156939816, "loss": 0.683, "step": 29100 }, { "epoch": 0.6479255698005698, "grad_norm": 0.6066399812698364, "learning_rate": 0.00035573894487125554, "loss": 0.569, "step": 29110 }, { "epoch": 0.6481481481481481, "grad_norm": 0.776150643825531, "learning_rate": 0.0003557096897101312, "loss": 0.6937, "step": 29120 }, { "epoch": 0.6483707264957265, "grad_norm": 0.5233556628227234, "learning_rate": 0.0003556804260876148, "loss": 0.6897, "step": 29130 }, { "epoch": 0.6485933048433048, "grad_norm": 1.1723662614822388, "learning_rate": 0.00035565115400529665, "loss": 0.802, "step": 29140 }, { "epoch": 0.6488158831908832, "grad_norm": 1.0054035186767578, "learning_rate": 0.00035562187346476734, "loss": 0.5343, "step": 29150 }, { "epoch": 0.6490384615384616, "grad_norm": 1.1244323253631592, "learning_rate": 0.00035559258446761803, "loss": 0.571, "step": 29160 }, { "epoch": 0.6492610398860399, "grad_norm": 0.7842982411384583, "learning_rate": 0.0003555632870154403, "loss": 0.5818, "step": 29170 }, { "epoch": 0.6494836182336182, "grad_norm": 0.5447773337364197, "learning_rate": 0.00035553398110982625, "loss": 0.6002, "step": 29180 }, { "epoch": 0.6497061965811965, "grad_norm": 0.47989046573638916, "learning_rate": 0.00035550466675236835, "loss": 0.5581, "step": 29190 }, { "epoch": 0.6499287749287749, "grad_norm": 0.9793609976768494, "learning_rate": 0.0003554753439446595, "loss": 0.6088, "step": 29200 }, { "epoch": 0.6501513532763533, "grad_norm": 0.7243526577949524, "learning_rate": 0.0003554460126882932, "loss": 0.7478, "step": 29210 }, { "epoch": 0.6503739316239316, "grad_norm": 0.43227654695510864, "learning_rate": 0.00035541667298486326, "loss": 0.6304, "step": 29220 }, { "epoch": 0.65059650997151, "grad_norm": 0.8305972218513489, "learning_rate": 0.00035538732483596415, "loss": 0.5699, "step": 29230 }, { "epoch": 0.6508190883190883, "grad_norm": 0.6374602317810059, "learning_rate": 0.00035535796824319064, "loss": 0.7133, "step": 29240 }, { "epoch": 0.6510416666666666, "grad_norm": 0.3493801951408386, "learning_rate": 0.00035532860320813787, "loss": 0.6185, "step": 29250 }, { "epoch": 0.6512642450142451, "grad_norm": 0.5009222626686096, "learning_rate": 0.00035529922973240167, "loss": 0.6071, "step": 29260 }, { "epoch": 0.6514868233618234, "grad_norm": 0.9517810344696045, "learning_rate": 0.0003552698478175782, "loss": 0.6672, "step": 29270 }, { "epoch": 0.6517094017094017, "grad_norm": 0.8424570560455322, "learning_rate": 0.0003552404574652641, "loss": 0.6607, "step": 29280 }, { "epoch": 0.65193198005698, "grad_norm": 0.7108397483825684, "learning_rate": 0.00035521105867705646, "loss": 0.4565, "step": 29290 }, { "epoch": 0.6521545584045584, "grad_norm": 0.7616039514541626, "learning_rate": 0.0003551816514545528, "loss": 0.6568, "step": 29300 }, { "epoch": 0.6523771367521367, "grad_norm": 0.6160005331039429, "learning_rate": 0.0003551522357993512, "loss": 0.4912, "step": 29310 }, { "epoch": 0.6525997150997151, "grad_norm": 0.7282902002334595, "learning_rate": 0.00035512281171305, "loss": 0.5883, "step": 29320 }, { "epoch": 0.6528222934472935, "grad_norm": 1.0402792692184448, "learning_rate": 0.0003550933791972483, "loss": 0.5807, "step": 29330 }, { "epoch": 0.6530448717948718, "grad_norm": 0.607012152671814, "learning_rate": 0.00035506393825354547, "loss": 0.5477, "step": 29340 }, { "epoch": 0.6532674501424501, "grad_norm": 0.7881841063499451, "learning_rate": 0.0003550344888835412, "loss": 0.6495, "step": 29350 }, { "epoch": 0.6534900284900285, "grad_norm": 0.6767070889472961, "learning_rate": 0.0003550050310888359, "loss": 0.5842, "step": 29360 }, { "epoch": 0.6537126068376068, "grad_norm": 0.9461158514022827, "learning_rate": 0.00035497556487103037, "loss": 0.6715, "step": 29370 }, { "epoch": 0.6539351851851852, "grad_norm": 0.7762941718101501, "learning_rate": 0.0003549460902317257, "loss": 0.6558, "step": 29380 }, { "epoch": 0.6541577635327636, "grad_norm": 0.6619521379470825, "learning_rate": 0.0003549166071725237, "loss": 0.68, "step": 29390 }, { "epoch": 0.6543803418803419, "grad_norm": 0.8690246343612671, "learning_rate": 0.0003548871156950264, "loss": 0.6592, "step": 29400 }, { "epoch": 0.6546029202279202, "grad_norm": 0.9152123332023621, "learning_rate": 0.00035485761580083646, "loss": 0.6177, "step": 29410 }, { "epoch": 0.6548254985754985, "grad_norm": 0.32496699690818787, "learning_rate": 0.0003548281074915569, "loss": 0.607, "step": 29420 }, { "epoch": 0.6550480769230769, "grad_norm": 0.5853595733642578, "learning_rate": 0.00035479859076879123, "loss": 0.5845, "step": 29430 }, { "epoch": 0.6552706552706553, "grad_norm": 0.792079508304596, "learning_rate": 0.00035476906563414347, "loss": 0.6061, "step": 29440 }, { "epoch": 0.6554932336182336, "grad_norm": 0.6753571629524231, "learning_rate": 0.00035473953208921787, "loss": 0.6051, "step": 29450 }, { "epoch": 0.655715811965812, "grad_norm": 0.6931265592575073, "learning_rate": 0.00035470999013561947, "loss": 0.6763, "step": 29460 }, { "epoch": 0.6559383903133903, "grad_norm": 0.7315155267715454, "learning_rate": 0.0003546804397749536, "loss": 0.6601, "step": 29470 }, { "epoch": 0.6561609686609686, "grad_norm": 0.826026201248169, "learning_rate": 0.000354650881008826, "loss": 0.6453, "step": 29480 }, { "epoch": 0.6563835470085471, "grad_norm": 0.8723484873771667, "learning_rate": 0.0003546213138388429, "loss": 0.63, "step": 29490 }, { "epoch": 0.6566061253561254, "grad_norm": 0.9157624244689941, "learning_rate": 0.000354591738266611, "loss": 0.523, "step": 29500 }, { "epoch": 0.6568287037037037, "grad_norm": 0.7259867191314697, "learning_rate": 0.0003545621542937375, "loss": 0.6265, "step": 29510 }, { "epoch": 0.657051282051282, "grad_norm": 0.6993746161460876, "learning_rate": 0.00035453256192183, "loss": 0.4993, "step": 29520 }, { "epoch": 0.6572738603988604, "grad_norm": 0.7524179816246033, "learning_rate": 0.00035450296115249665, "loss": 0.5703, "step": 29530 }, { "epoch": 0.6574964387464387, "grad_norm": 0.7260290384292603, "learning_rate": 0.0003544733519873458, "loss": 0.6222, "step": 29540 }, { "epoch": 0.6577190170940171, "grad_norm": 0.4377947449684143, "learning_rate": 0.00035444373442798666, "loss": 0.5503, "step": 29550 }, { "epoch": 0.6579415954415955, "grad_norm": 0.5709770321846008, "learning_rate": 0.00035441410847602845, "loss": 0.6104, "step": 29560 }, { "epoch": 0.6581641737891738, "grad_norm": 0.6281164884567261, "learning_rate": 0.0003543844741330812, "loss": 0.6844, "step": 29570 }, { "epoch": 0.6583867521367521, "grad_norm": 0.7790220379829407, "learning_rate": 0.0003543548314007553, "loss": 0.5133, "step": 29580 }, { "epoch": 0.6586093304843305, "grad_norm": 0.5882272124290466, "learning_rate": 0.00035432518028066145, "loss": 0.5498, "step": 29590 }, { "epoch": 0.6588319088319088, "grad_norm": 0.733991265296936, "learning_rate": 0.00035429552077441103, "loss": 0.664, "step": 29600 }, { "epoch": 0.6590544871794872, "grad_norm": 0.7435101270675659, "learning_rate": 0.0003542658528836156, "loss": 0.782, "step": 29610 }, { "epoch": 0.6592770655270656, "grad_norm": 0.5844705104827881, "learning_rate": 0.0003542361766098875, "loss": 0.6496, "step": 29620 }, { "epoch": 0.6594996438746439, "grad_norm": 0.895987868309021, "learning_rate": 0.0003542064919548393, "loss": 0.6748, "step": 29630 }, { "epoch": 0.6597222222222222, "grad_norm": 0.5698703527450562, "learning_rate": 0.00035417679892008405, "loss": 0.5785, "step": 29640 }, { "epoch": 0.6599448005698005, "grad_norm": 0.761055588722229, "learning_rate": 0.0003541470975072354, "loss": 0.6263, "step": 29650 }, { "epoch": 0.6600783475783476, "eval_loss": 0.6208131909370422, "eval_runtime": 337.1211, "eval_samples_per_second": 7.015, "eval_steps_per_second": 7.015, "step": 29656 }, { "epoch": 0.6601673789173789, "grad_norm": 0.6658109426498413, "learning_rate": 0.0003541173877179072, "loss": 0.7333, "step": 29660 }, { "epoch": 0.6603899572649573, "grad_norm": 0.6604793667793274, "learning_rate": 0.000354087669553714, "loss": 0.4947, "step": 29670 }, { "epoch": 0.6606125356125356, "grad_norm": 0.742704451084137, "learning_rate": 0.00035405794301627077, "loss": 0.5558, "step": 29680 }, { "epoch": 0.660835113960114, "grad_norm": 0.7001523375511169, "learning_rate": 0.0003540282081071927, "loss": 0.7059, "step": 29690 }, { "epoch": 0.6610576923076923, "grad_norm": 1.10336172580719, "learning_rate": 0.0003539984648280958, "loss": 0.7525, "step": 29700 }, { "epoch": 0.6612802706552706, "grad_norm": 0.7263476252555847, "learning_rate": 0.00035396871318059615, "loss": 0.6185, "step": 29710 }, { "epoch": 0.6615028490028491, "grad_norm": 0.9817376732826233, "learning_rate": 0.0003539389531663107, "loss": 0.6282, "step": 29720 }, { "epoch": 0.6617254273504274, "grad_norm": 0.5954140424728394, "learning_rate": 0.0003539091847868564, "loss": 0.5171, "step": 29730 }, { "epoch": 0.6619480056980057, "grad_norm": 0.6389563679695129, "learning_rate": 0.00035387940804385107, "loss": 0.6703, "step": 29740 }, { "epoch": 0.662170584045584, "grad_norm": 0.5653455853462219, "learning_rate": 0.0003538496229389127, "loss": 0.5959, "step": 29750 }, { "epoch": 0.6623931623931624, "grad_norm": 0.5209896564483643, "learning_rate": 0.0003538198294736599, "loss": 0.5791, "step": 29760 }, { "epoch": 0.6626157407407407, "grad_norm": 0.8491448760032654, "learning_rate": 0.00035379002764971166, "loss": 0.5636, "step": 29770 }, { "epoch": 0.6628383190883191, "grad_norm": 0.48439210653305054, "learning_rate": 0.0003537602174686874, "loss": 0.5535, "step": 29780 }, { "epoch": 0.6630608974358975, "grad_norm": 0.5228064060211182, "learning_rate": 0.00035373039893220706, "loss": 0.5412, "step": 29790 }, { "epoch": 0.6632834757834758, "grad_norm": 0.5903054475784302, "learning_rate": 0.00035370057204189094, "loss": 0.5664, "step": 29800 }, { "epoch": 0.6635060541310541, "grad_norm": 0.7371571660041809, "learning_rate": 0.0003536707367993599, "loss": 0.7484, "step": 29810 }, { "epoch": 0.6637286324786325, "grad_norm": 0.7421772480010986, "learning_rate": 0.0003536408932062353, "loss": 0.6104, "step": 29820 }, { "epoch": 0.6639512108262108, "grad_norm": 0.7558051943778992, "learning_rate": 0.0003536110412641388, "loss": 0.491, "step": 29830 }, { "epoch": 0.6641737891737892, "grad_norm": 0.4967116117477417, "learning_rate": 0.0003535811809746925, "loss": 0.5694, "step": 29840 }, { "epoch": 0.6643963675213675, "grad_norm": 0.6911063194274902, "learning_rate": 0.0003535513123395191, "loss": 0.5885, "step": 29850 }, { "epoch": 0.6646189458689459, "grad_norm": 0.6311195492744446, "learning_rate": 0.00035352143536024165, "loss": 0.5241, "step": 29860 }, { "epoch": 0.6648415242165242, "grad_norm": 0.5731965899467468, "learning_rate": 0.00035349155003848383, "loss": 0.679, "step": 29870 }, { "epoch": 0.6650641025641025, "grad_norm": 0.8735531568527222, "learning_rate": 0.00035346165637586946, "loss": 0.7353, "step": 29880 }, { "epoch": 0.6652866809116809, "grad_norm": 0.663013756275177, "learning_rate": 0.00035343175437402307, "loss": 0.5838, "step": 29890 }, { "epoch": 0.6655092592592593, "grad_norm": 0.6095181107521057, "learning_rate": 0.0003534018440345696, "loss": 0.5743, "step": 29900 }, { "epoch": 0.6657318376068376, "grad_norm": 0.7778880596160889, "learning_rate": 0.00035337192535913426, "loss": 0.563, "step": 29910 }, { "epoch": 0.665954415954416, "grad_norm": 0.7310755252838135, "learning_rate": 0.00035334199834934294, "loss": 0.6542, "step": 29920 }, { "epoch": 0.6661769943019943, "grad_norm": 1.2421523332595825, "learning_rate": 0.0003533120630068219, "loss": 0.6713, "step": 29930 }, { "epoch": 0.6663995726495726, "grad_norm": 0.7105190753936768, "learning_rate": 0.0003532821193331979, "loss": 0.6277, "step": 29940 }, { "epoch": 0.6666221509971509, "grad_norm": 0.7447794675827026, "learning_rate": 0.000353252167330098, "loss": 0.4767, "step": 29950 }, { "epoch": 0.6668447293447294, "grad_norm": 0.8217668533325195, "learning_rate": 0.0003532222069991499, "loss": 0.6367, "step": 29960 }, { "epoch": 0.6670673076923077, "grad_norm": 0.7856162190437317, "learning_rate": 0.0003531922383419816, "loss": 0.7551, "step": 29970 }, { "epoch": 0.667289886039886, "grad_norm": 0.7897430062294006, "learning_rate": 0.00035316226136022173, "loss": 0.5759, "step": 29980 }, { "epoch": 0.6675124643874644, "grad_norm": 0.9113932251930237, "learning_rate": 0.00035313227605549913, "loss": 0.7267, "step": 29990 }, { "epoch": 0.6677350427350427, "grad_norm": 0.7457767724990845, "learning_rate": 0.0003531022824294433, "loss": 0.6099, "step": 30000 }, { "epoch": 0.6679576210826211, "grad_norm": 0.49353641271591187, "learning_rate": 0.0003530722804836842, "loss": 0.5984, "step": 30010 }, { "epoch": 0.6681801994301995, "grad_norm": 0.5130963325500488, "learning_rate": 0.00035304227021985195, "loss": 0.5404, "step": 30020 }, { "epoch": 0.6684027777777778, "grad_norm": 0.7923216819763184, "learning_rate": 0.0003530122516395775, "loss": 0.5814, "step": 30030 }, { "epoch": 0.6686253561253561, "grad_norm": 0.7568293809890747, "learning_rate": 0.000352982224744492, "loss": 0.5782, "step": 30040 }, { "epoch": 0.6688479344729344, "grad_norm": 0.6948940753936768, "learning_rate": 0.00035295218953622717, "loss": 0.5985, "step": 30050 }, { "epoch": 0.6690705128205128, "grad_norm": 0.6068517565727234, "learning_rate": 0.0003529221460164152, "loss": 0.5879, "step": 30060 }, { "epoch": 0.6692930911680912, "grad_norm": 0.7442594170570374, "learning_rate": 0.0003528920941866886, "loss": 0.6432, "step": 30070 }, { "epoch": 0.6695156695156695, "grad_norm": 0.5323207378387451, "learning_rate": 0.00035286203404868044, "loss": 0.4737, "step": 30080 }, { "epoch": 0.6697382478632479, "grad_norm": 0.9668570160865784, "learning_rate": 0.00035283196560402416, "loss": 0.7099, "step": 30090 }, { "epoch": 0.6699608262108262, "grad_norm": 0.5750164985656738, "learning_rate": 0.00035280188885435386, "loss": 0.6165, "step": 30100 }, { "epoch": 0.6701834045584045, "grad_norm": 0.46717461943626404, "learning_rate": 0.00035277180380130377, "loss": 0.4929, "step": 30110 }, { "epoch": 0.6704059829059829, "grad_norm": 0.7870973348617554, "learning_rate": 0.0003527417104465088, "loss": 0.6755, "step": 30120 }, { "epoch": 0.6706285612535613, "grad_norm": 0.6971989274024963, "learning_rate": 0.0003527116087916042, "loss": 0.6212, "step": 30130 }, { "epoch": 0.6708511396011396, "grad_norm": 0.689439594745636, "learning_rate": 0.0003526814988382258, "loss": 0.5999, "step": 30140 }, { "epoch": 0.671073717948718, "grad_norm": 0.48943307995796204, "learning_rate": 0.0003526513805880098, "loss": 0.5033, "step": 30150 }, { "epoch": 0.6712962962962963, "grad_norm": 0.8167397379875183, "learning_rate": 0.0003526212540425928, "loss": 0.6477, "step": 30160 }, { "epoch": 0.6715188746438746, "grad_norm": 0.8747900128364563, "learning_rate": 0.00035259111920361185, "loss": 0.6257, "step": 30170 }, { "epoch": 0.6717414529914529, "grad_norm": 0.701403796672821, "learning_rate": 0.0003525609760727046, "loss": 0.4335, "step": 30180 }, { "epoch": 0.6719640313390314, "grad_norm": 0.6762571930885315, "learning_rate": 0.00035253082465150907, "loss": 0.5561, "step": 30190 }, { "epoch": 0.6721866096866097, "grad_norm": 0.8309723734855652, "learning_rate": 0.00035250066494166364, "loss": 0.678, "step": 30200 }, { "epoch": 0.672409188034188, "grad_norm": 0.7671947479248047, "learning_rate": 0.0003524704969448072, "loss": 0.6803, "step": 30210 }, { "epoch": 0.6726317663817664, "grad_norm": 0.7105935215950012, "learning_rate": 0.00035244032066257915, "loss": 0.6202, "step": 30220 }, { "epoch": 0.6728543447293447, "grad_norm": 0.6643106937408447, "learning_rate": 0.0003524101360966193, "loss": 0.6799, "step": 30230 }, { "epoch": 0.6730769230769231, "grad_norm": 0.6214847564697266, "learning_rate": 0.00035237994324856784, "loss": 0.7154, "step": 30240 }, { "epoch": 0.6732995014245015, "grad_norm": 0.8612035512924194, "learning_rate": 0.00035234974212006555, "loss": 0.6859, "step": 30250 }, { "epoch": 0.6735220797720798, "grad_norm": 0.6152910590171814, "learning_rate": 0.00035231953271275355, "loss": 0.6501, "step": 30260 }, { "epoch": 0.6737446581196581, "grad_norm": 0.600477933883667, "learning_rate": 0.0003522893150282735, "loss": 0.446, "step": 30270 }, { "epoch": 0.6739672364672364, "grad_norm": 0.7982945442199707, "learning_rate": 0.0003522590890682673, "loss": 0.681, "step": 30280 }, { "epoch": 0.6741898148148148, "grad_norm": 0.7654717564582825, "learning_rate": 0.00035222885483437766, "loss": 0.6666, "step": 30290 }, { "epoch": 0.6744123931623932, "grad_norm": 1.2194135189056396, "learning_rate": 0.0003521986123282473, "loss": 0.7611, "step": 30300 }, { "epoch": 0.6746349715099715, "grad_norm": 0.6501625776290894, "learning_rate": 0.0003521683615515198, "loss": 0.566, "step": 30310 }, { "epoch": 0.6748575498575499, "grad_norm": 0.714274525642395, "learning_rate": 0.00035213810250583904, "loss": 0.6326, "step": 30320 }, { "epoch": 0.6750801282051282, "grad_norm": 0.7483553886413574, "learning_rate": 0.0003521078351928492, "loss": 0.7211, "step": 30330 }, { "epoch": 0.6753027065527065, "grad_norm": 1.305942177772522, "learning_rate": 0.00035207755961419506, "loss": 0.7585, "step": 30340 }, { "epoch": 0.6755252849002849, "grad_norm": 0.820911169052124, "learning_rate": 0.00035204727577152186, "loss": 0.6037, "step": 30350 }, { "epoch": 0.6757478632478633, "grad_norm": 1.0651403665542603, "learning_rate": 0.0003520169836664752, "loss": 0.5951, "step": 30360 }, { "epoch": 0.6759704415954416, "grad_norm": 0.9064886569976807, "learning_rate": 0.0003519866833007012, "loss": 0.6801, "step": 30370 }, { "epoch": 0.67619301994302, "grad_norm": 0.5420659184455872, "learning_rate": 0.0003519563746758464, "loss": 0.6233, "step": 30380 }, { "epoch": 0.6764155982905983, "grad_norm": 0.6689449548721313, "learning_rate": 0.0003519260577935578, "loss": 0.6992, "step": 30390 }, { "epoch": 0.6766381766381766, "grad_norm": 0.48055005073547363, "learning_rate": 0.0003518957326554829, "loss": 0.5653, "step": 30400 }, { "epoch": 0.6768607549857549, "grad_norm": 0.702401340007782, "learning_rate": 0.0003518653992632695, "loss": 0.5689, "step": 30410 }, { "epoch": 0.6770833333333334, "grad_norm": 0.7911334037780762, "learning_rate": 0.000351835057618566, "loss": 0.6179, "step": 30420 }, { "epoch": 0.6773059116809117, "grad_norm": 0.9225923418998718, "learning_rate": 0.0003518047077230211, "loss": 0.6146, "step": 30430 }, { "epoch": 0.67752849002849, "grad_norm": 0.5468271374702454, "learning_rate": 0.0003517743495782842, "loss": 0.6134, "step": 30440 }, { "epoch": 0.6777510683760684, "grad_norm": 0.7671268582344055, "learning_rate": 0.0003517439831860048, "loss": 0.5387, "step": 30450 }, { "epoch": 0.6779736467236467, "grad_norm": 0.6207623481750488, "learning_rate": 0.00035171360854783324, "loss": 0.5735, "step": 30460 }, { "epoch": 0.6781962250712251, "grad_norm": 0.5850103497505188, "learning_rate": 0.00035168322566541993, "loss": 0.5557, "step": 30470 }, { "epoch": 0.6784188034188035, "grad_norm": 0.5051694512367249, "learning_rate": 0.000351652834540416, "loss": 0.6332, "step": 30480 }, { "epoch": 0.6786413817663818, "grad_norm": 0.7209334373474121, "learning_rate": 0.0003516224351744729, "loss": 0.6913, "step": 30490 }, { "epoch": 0.6788639601139601, "grad_norm": 0.8140676617622375, "learning_rate": 0.0003515920275692425, "loss": 0.6377, "step": 30500 }, { "epoch": 0.6790865384615384, "grad_norm": 0.9659133553504944, "learning_rate": 0.0003515616117263772, "loss": 0.6876, "step": 30510 }, { "epoch": 0.6793091168091168, "grad_norm": 0.875493586063385, "learning_rate": 0.00035153118764752987, "loss": 0.5625, "step": 30520 }, { "epoch": 0.6795316951566952, "grad_norm": 0.4299869239330292, "learning_rate": 0.0003515007553343538, "loss": 0.4195, "step": 30530 }, { "epoch": 0.6797542735042735, "grad_norm": 0.8948302268981934, "learning_rate": 0.0003514703147885026, "loss": 0.6594, "step": 30540 }, { "epoch": 0.6799768518518519, "grad_norm": 0.5883477330207825, "learning_rate": 0.00035143986601163057, "loss": 0.6186, "step": 30550 }, { "epoch": 0.6801994301994302, "grad_norm": 0.8271781206130981, "learning_rate": 0.00035140940900539217, "loss": 0.56, "step": 30560 }, { "epoch": 0.6804220085470085, "grad_norm": 0.8564368486404419, "learning_rate": 0.00035137894377144257, "loss": 0.6942, "step": 30570 }, { "epoch": 0.6806445868945868, "grad_norm": 1.0115987062454224, "learning_rate": 0.0003513484703114372, "loss": 0.681, "step": 30580 }, { "epoch": 0.6808671652421653, "grad_norm": 0.6871652603149414, "learning_rate": 0.00035131798862703215, "loss": 0.5671, "step": 30590 }, { "epoch": 0.6810897435897436, "grad_norm": 0.7678980827331543, "learning_rate": 0.0003512874987198837, "loss": 0.8507, "step": 30600 }, { "epoch": 0.6813123219373219, "grad_norm": 0.8469058275222778, "learning_rate": 0.00035125700059164864, "loss": 0.6523, "step": 30610 }, { "epoch": 0.6815349002849003, "grad_norm": 0.5048143863677979, "learning_rate": 0.0003512264942439844, "loss": 0.6415, "step": 30620 }, { "epoch": 0.6817574786324786, "grad_norm": 0.6894258260726929, "learning_rate": 0.0003511959796785486, "loss": 0.6732, "step": 30630 }, { "epoch": 0.6819800569800569, "grad_norm": 0.5911911129951477, "learning_rate": 0.0003511654568969996, "loss": 0.6205, "step": 30640 }, { "epoch": 0.6822026353276354, "grad_norm": 0.519760012626648, "learning_rate": 0.0003511349259009958, "loss": 0.6151, "step": 30650 }, { "epoch": 0.6824252136752137, "grad_norm": 0.4933334290981293, "learning_rate": 0.00035110438669219647, "loss": 0.5582, "step": 30660 }, { "epoch": 0.682647792022792, "grad_norm": 0.7749009728431702, "learning_rate": 0.0003510738392722611, "loss": 0.6336, "step": 30670 }, { "epoch": 0.6828703703703703, "grad_norm": 0.9254588484764099, "learning_rate": 0.00035104328364284954, "loss": 0.7357, "step": 30680 }, { "epoch": 0.6830929487179487, "grad_norm": 0.8186060786247253, "learning_rate": 0.0003510127198056224, "loss": 0.635, "step": 30690 }, { "epoch": 0.6833155270655271, "grad_norm": 1.0061089992523193, "learning_rate": 0.0003509821477622404, "loss": 0.5805, "step": 30700 }, { "epoch": 0.6835381054131054, "grad_norm": 0.4683438241481781, "learning_rate": 0.00035095156751436483, "loss": 0.6817, "step": 30710 }, { "epoch": 0.6837606837606838, "grad_norm": 0.7145074605941772, "learning_rate": 0.0003509209790636576, "loss": 0.5704, "step": 30720 }, { "epoch": 0.6839832621082621, "grad_norm": 0.7668792605400085, "learning_rate": 0.0003508903824117807, "loss": 0.6027, "step": 30730 }, { "epoch": 0.6842058404558404, "grad_norm": 0.5148479342460632, "learning_rate": 0.00035085977756039695, "loss": 0.7058, "step": 30740 }, { "epoch": 0.6844284188034188, "grad_norm": 0.7288443446159363, "learning_rate": 0.0003508291645111695, "loss": 0.6624, "step": 30750 }, { "epoch": 0.6846509971509972, "grad_norm": 2.099850654602051, "learning_rate": 0.0003507985432657616, "loss": 0.7333, "step": 30760 }, { "epoch": 0.6848735754985755, "grad_norm": 0.8458130359649658, "learning_rate": 0.0003507679138258375, "loss": 0.5342, "step": 30770 }, { "epoch": 0.6850961538461539, "grad_norm": 0.6164962649345398, "learning_rate": 0.0003507372761930616, "loss": 0.7124, "step": 30780 }, { "epoch": 0.6853187321937322, "grad_norm": 0.706070601940155, "learning_rate": 0.0003507066303690986, "loss": 0.5651, "step": 30790 }, { "epoch": 0.6855413105413105, "grad_norm": 0.6817519664764404, "learning_rate": 0.000350675976355614, "loss": 0.6462, "step": 30800 }, { "epoch": 0.6857638888888888, "grad_norm": 0.8440865278244019, "learning_rate": 0.00035064531415427347, "loss": 0.4929, "step": 30810 }, { "epoch": 0.6859864672364673, "grad_norm": 0.5422286987304688, "learning_rate": 0.00035061464376674327, "loss": 0.563, "step": 30820 }, { "epoch": 0.6862090455840456, "grad_norm": 0.7093590497970581, "learning_rate": 0.0003505839651946899, "loss": 0.6696, "step": 30830 }, { "epoch": 0.6864316239316239, "grad_norm": 0.6214165091514587, "learning_rate": 0.00035055327843978076, "loss": 0.7322, "step": 30840 }, { "epoch": 0.6866542022792023, "grad_norm": 0.5425866842269897, "learning_rate": 0.0003505225835036831, "loss": 0.5924, "step": 30850 }, { "epoch": 0.6868767806267806, "grad_norm": 0.6415085196495056, "learning_rate": 0.0003504918803880651, "loss": 0.5754, "step": 30860 }, { "epoch": 0.6870993589743589, "grad_norm": 0.7075197696685791, "learning_rate": 0.0003504611690945951, "loss": 0.7709, "step": 30870 }, { "epoch": 0.6873219373219374, "grad_norm": 0.6782618165016174, "learning_rate": 0.00035043044962494203, "loss": 0.6097, "step": 30880 }, { "epoch": 0.6875445156695157, "grad_norm": 0.7449636459350586, "learning_rate": 0.0003503997219807751, "loss": 0.7023, "step": 30890 }, { "epoch": 0.687767094017094, "grad_norm": 0.5584663152694702, "learning_rate": 0.00035036898616376425, "loss": 0.6441, "step": 30900 }, { "epoch": 0.6879896723646723, "grad_norm": 0.7006946802139282, "learning_rate": 0.0003503382421755795, "loss": 0.6955, "step": 30910 }, { "epoch": 0.6882122507122507, "grad_norm": 0.7925690412521362, "learning_rate": 0.0003503074900178917, "loss": 0.5764, "step": 30920 }, { "epoch": 0.6884348290598291, "grad_norm": 0.520116925239563, "learning_rate": 0.0003502767296923718, "loss": 0.4167, "step": 30930 }, { "epoch": 0.6886574074074074, "grad_norm": 0.6459872126579285, "learning_rate": 0.0003502459612006913, "loss": 0.5849, "step": 30940 }, { "epoch": 0.6888799857549858, "grad_norm": 0.8839649558067322, "learning_rate": 0.00035021518454452237, "loss": 0.643, "step": 30950 }, { "epoch": 0.6891025641025641, "grad_norm": 0.7639024257659912, "learning_rate": 0.00035018439972553736, "loss": 0.6718, "step": 30960 }, { "epoch": 0.6893251424501424, "grad_norm": 0.8284814953804016, "learning_rate": 0.0003501536067454091, "loss": 0.6959, "step": 30970 }, { "epoch": 0.6895477207977208, "grad_norm": 0.6810000538825989, "learning_rate": 0.00035012280560581087, "loss": 0.6354, "step": 30980 }, { "epoch": 0.6897702991452992, "grad_norm": 0.6328868865966797, "learning_rate": 0.00035009199630841654, "loss": 0.6258, "step": 30990 }, { "epoch": 0.6899928774928775, "grad_norm": 0.5300467610359192, "learning_rate": 0.0003500611788549002, "loss": 0.5189, "step": 31000 }, { "epoch": 0.6902154558404558, "grad_norm": 0.7155830264091492, "learning_rate": 0.0003500303532469366, "loss": 0.6178, "step": 31010 }, { "epoch": 0.6904380341880342, "grad_norm": 0.7034480571746826, "learning_rate": 0.00034999951948620084, "loss": 0.551, "step": 31020 }, { "epoch": 0.6906606125356125, "grad_norm": 0.7898311018943787, "learning_rate": 0.00034996867757436834, "loss": 0.8097, "step": 31030 }, { "epoch": 0.6908831908831908, "grad_norm": 0.7837551236152649, "learning_rate": 0.0003499378275131151, "loss": 0.598, "step": 31040 }, { "epoch": 0.6911057692307693, "grad_norm": 0.6349511742591858, "learning_rate": 0.00034990696930411764, "loss": 0.6538, "step": 31050 }, { "epoch": 0.6913283475783476, "grad_norm": 0.8751807808876038, "learning_rate": 0.00034987610294905265, "loss": 0.691, "step": 31060 }, { "epoch": 0.6915509259259259, "grad_norm": 0.6790831089019775, "learning_rate": 0.0003498452284495976, "loss": 0.5019, "step": 31070 }, { "epoch": 0.6917735042735043, "grad_norm": 0.6484060883522034, "learning_rate": 0.0003498143458074302, "loss": 0.6183, "step": 31080 }, { "epoch": 0.6919960826210826, "grad_norm": 0.7466754913330078, "learning_rate": 0.0003497834550242285, "loss": 0.5818, "step": 31090 }, { "epoch": 0.6922186609686609, "grad_norm": 0.5304076671600342, "learning_rate": 0.0003497525561016713, "loss": 0.5433, "step": 31100 }, { "epoch": 0.6924412393162394, "grad_norm": 0.43836620450019836, "learning_rate": 0.00034972164904143767, "loss": 0.6111, "step": 31110 }, { "epoch": 0.6926638176638177, "grad_norm": 0.5781193375587463, "learning_rate": 0.0003496907338452069, "loss": 0.6533, "step": 31120 }, { "epoch": 0.692886396011396, "grad_norm": 0.4405815899372101, "learning_rate": 0.00034965981051465923, "loss": 0.6361, "step": 31130 }, { "epoch": 0.6931089743589743, "grad_norm": 0.7085813879966736, "learning_rate": 0.00034962887905147494, "loss": 0.6169, "step": 31140 }, { "epoch": 0.6933315527065527, "grad_norm": 0.412544846534729, "learning_rate": 0.00034959793945733484, "loss": 0.4898, "step": 31150 }, { "epoch": 0.6935541310541311, "grad_norm": 0.8504443764686584, "learning_rate": 0.00034956699173392024, "loss": 0.6557, "step": 31160 }, { "epoch": 0.6937767094017094, "grad_norm": 0.7342494130134583, "learning_rate": 0.0003495360358829129, "loss": 0.5863, "step": 31170 }, { "epoch": 0.6939992877492878, "grad_norm": 1.312693476676941, "learning_rate": 0.00034950507190599495, "loss": 0.6473, "step": 31180 }, { "epoch": 0.6942218660968661, "grad_norm": 0.7119789719581604, "learning_rate": 0.000349474099804849, "loss": 0.7246, "step": 31190 }, { "epoch": 0.6944444444444444, "grad_norm": 0.7937467694282532, "learning_rate": 0.0003494431195811581, "loss": 0.6255, "step": 31200 }, { "epoch": 0.6946670227920227, "grad_norm": 0.6429157257080078, "learning_rate": 0.00034941213123660574, "loss": 0.5958, "step": 31210 }, { "epoch": 0.6948896011396012, "grad_norm": 0.8229726552963257, "learning_rate": 0.0003493811347728758, "loss": 0.5507, "step": 31220 }, { "epoch": 0.6951121794871795, "grad_norm": 0.3695134222507477, "learning_rate": 0.00034935013019165277, "loss": 0.5767, "step": 31230 }, { "epoch": 0.6953347578347578, "grad_norm": 0.6000204086303711, "learning_rate": 0.00034931911749462135, "loss": 0.6586, "step": 31240 }, { "epoch": 0.6955573361823362, "grad_norm": 0.7966252565383911, "learning_rate": 0.0003492880966834669, "loss": 0.5889, "step": 31250 }, { "epoch": 0.6957799145299145, "grad_norm": 0.7294942140579224, "learning_rate": 0.00034925706775987503, "loss": 0.6751, "step": 31260 }, { "epoch": 0.6960024928774928, "grad_norm": 0.5740212798118591, "learning_rate": 0.0003492260307255319, "loss": 0.5789, "step": 31270 }, { "epoch": 0.6962250712250713, "grad_norm": 0.6787393689155579, "learning_rate": 0.00034919498558212415, "loss": 0.6466, "step": 31280 }, { "epoch": 0.6964476495726496, "grad_norm": 0.8797711730003357, "learning_rate": 0.0003491639323313387, "loss": 0.6238, "step": 31290 }, { "epoch": 0.6966702279202279, "grad_norm": 0.7877987623214722, "learning_rate": 0.0003491328709748631, "loss": 0.6028, "step": 31300 }, { "epoch": 0.6968928062678063, "grad_norm": 0.6889671683311462, "learning_rate": 0.0003491018015143852, "loss": 0.5021, "step": 31310 }, { "epoch": 0.6971153846153846, "grad_norm": 0.38437703251838684, "learning_rate": 0.0003490707239515933, "loss": 0.676, "step": 31320 }, { "epoch": 0.6973379629629629, "grad_norm": 0.6212413311004639, "learning_rate": 0.00034903963828817626, "loss": 0.6266, "step": 31330 }, { "epoch": 0.6975605413105413, "grad_norm": 0.6480898857116699, "learning_rate": 0.0003490085445258233, "loss": 0.5704, "step": 31340 }, { "epoch": 0.6977831196581197, "grad_norm": 0.658523678779602, "learning_rate": 0.000348977442666224, "loss": 0.7526, "step": 31350 }, { "epoch": 0.698005698005698, "grad_norm": 1.17215096950531, "learning_rate": 0.00034894633271106843, "loss": 0.6893, "step": 31360 }, { "epoch": 0.6982282763532763, "grad_norm": 0.8638959527015686, "learning_rate": 0.0003489152146620473, "loss": 0.7922, "step": 31370 }, { "epoch": 0.6984508547008547, "grad_norm": 0.823543131351471, "learning_rate": 0.00034888408852085155, "loss": 0.5136, "step": 31380 }, { "epoch": 0.6986734330484331, "grad_norm": 0.6390395760536194, "learning_rate": 0.00034885295428917245, "loss": 0.6816, "step": 31390 }, { "epoch": 0.6988960113960114, "grad_norm": 0.6792179346084595, "learning_rate": 0.000348821811968702, "loss": 0.4866, "step": 31400 }, { "epoch": 0.6991185897435898, "grad_norm": 0.5242462158203125, "learning_rate": 0.00034879066156113245, "loss": 0.5386, "step": 31410 }, { "epoch": 0.6993411680911681, "grad_norm": 0.6970105767250061, "learning_rate": 0.00034875950306815655, "loss": 0.5325, "step": 31420 }, { "epoch": 0.6995637464387464, "grad_norm": 0.7846478223800659, "learning_rate": 0.0003487283364914674, "loss": 0.6103, "step": 31430 }, { "epoch": 0.6997863247863247, "grad_norm": 0.747134804725647, "learning_rate": 0.0003486971618327588, "loss": 0.7325, "step": 31440 }, { "epoch": 0.7000089031339032, "grad_norm": 0.7611316442489624, "learning_rate": 0.0003486659790937246, "loss": 0.5463, "step": 31450 }, { "epoch": 0.7002314814814815, "grad_norm": 1.1192259788513184, "learning_rate": 0.0003486347882760595, "loss": 0.7405, "step": 31460 }, { "epoch": 0.7004540598290598, "grad_norm": 0.771552324295044, "learning_rate": 0.00034860358938145825, "loss": 0.5434, "step": 31470 }, { "epoch": 0.7006766381766382, "grad_norm": 0.6160570383071899, "learning_rate": 0.0003485723824116163, "loss": 0.6583, "step": 31480 }, { "epoch": 0.7008992165242165, "grad_norm": 0.7527022361755371, "learning_rate": 0.00034854116736822953, "loss": 0.6388, "step": 31490 }, { "epoch": 0.7011217948717948, "grad_norm": 0.870607852935791, "learning_rate": 0.00034850994425299404, "loss": 0.6683, "step": 31500 }, { "epoch": 0.7013443732193733, "grad_norm": 0.7308458685874939, "learning_rate": 0.00034847871306760664, "loss": 0.6992, "step": 31510 }, { "epoch": 0.7015669515669516, "grad_norm": 0.7584428787231445, "learning_rate": 0.0003484474738137644, "loss": 0.649, "step": 31520 }, { "epoch": 0.7017895299145299, "grad_norm": 0.7453662157058716, "learning_rate": 0.000348416226493165, "loss": 0.6592, "step": 31530 }, { "epoch": 0.7020121082621082, "grad_norm": 0.7075641751289368, "learning_rate": 0.00034838497110750623, "loss": 0.6829, "step": 31540 }, { "epoch": 0.7022346866096866, "grad_norm": 0.9151403307914734, "learning_rate": 0.0003483537076584867, "loss": 0.6963, "step": 31550 }, { "epoch": 0.7024572649572649, "grad_norm": 0.5822266340255737, "learning_rate": 0.0003483224361478053, "loss": 0.5676, "step": 31560 }, { "epoch": 0.7026798433048433, "grad_norm": 1.0856817960739136, "learning_rate": 0.00034829115657716126, "loss": 0.8445, "step": 31570 }, { "epoch": 0.7029024216524217, "grad_norm": 0.5421498417854309, "learning_rate": 0.00034825986894825435, "loss": 0.5714, "step": 31580 }, { "epoch": 0.703125, "grad_norm": 0.6657152771949768, "learning_rate": 0.0003482285732627848, "loss": 0.5931, "step": 31590 }, { "epoch": 0.7033475783475783, "grad_norm": 0.8101097345352173, "learning_rate": 0.00034819726952245325, "loss": 0.6062, "step": 31600 }, { "epoch": 0.7035701566951567, "grad_norm": 0.6771178841590881, "learning_rate": 0.00034816595772896075, "loss": 0.7197, "step": 31610 }, { "epoch": 0.7037927350427351, "grad_norm": 0.8368094563484192, "learning_rate": 0.0003481346378840088, "loss": 0.6603, "step": 31620 }, { "epoch": 0.7040153133903134, "grad_norm": 0.567988932132721, "learning_rate": 0.00034810330998929936, "loss": 0.6759, "step": 31630 }, { "epoch": 0.7042378917378918, "grad_norm": 0.5423449277877808, "learning_rate": 0.0003480719740465348, "loss": 0.5562, "step": 31640 }, { "epoch": 0.7044604700854701, "grad_norm": 0.704281210899353, "learning_rate": 0.000348040630057418, "loss": 0.5836, "step": 31650 }, { "epoch": 0.7046830484330484, "grad_norm": 0.6200312972068787, "learning_rate": 0.00034800927802365215, "loss": 0.4597, "step": 31660 }, { "epoch": 0.7049056267806267, "grad_norm": 0.5637904405593872, "learning_rate": 0.00034797791794694097, "loss": 0.7486, "step": 31670 }, { "epoch": 0.7051282051282052, "grad_norm": 0.8819078207015991, "learning_rate": 0.00034794654982898856, "loss": 0.6331, "step": 31680 }, { "epoch": 0.7053507834757835, "grad_norm": 0.8103722929954529, "learning_rate": 0.00034791517367149956, "loss": 0.6076, "step": 31690 }, { "epoch": 0.7055733618233618, "grad_norm": 0.5280768275260925, "learning_rate": 0.0003478837894761789, "loss": 0.5688, "step": 31700 }, { "epoch": 0.7057959401709402, "grad_norm": 0.8044383525848389, "learning_rate": 0.0003478523972447321, "loss": 0.7396, "step": 31710 }, { "epoch": 0.7060185185185185, "grad_norm": 0.7547085881233215, "learning_rate": 0.0003478209969788649, "loss": 0.5739, "step": 31720 }, { "epoch": 0.7062410968660968, "grad_norm": 0.7226533889770508, "learning_rate": 0.0003477895886802838, "loss": 0.5934, "step": 31730 }, { "epoch": 0.7064636752136753, "grad_norm": 0.5839481353759766, "learning_rate": 0.0003477581723506955, "loss": 0.5804, "step": 31740 }, { "epoch": 0.7066862535612536, "grad_norm": 0.6620025634765625, "learning_rate": 0.000347726747991807, "loss": 0.6022, "step": 31750 }, { "epoch": 0.7069088319088319, "grad_norm": 0.7869464159011841, "learning_rate": 0.0003476953156053262, "loss": 0.6546, "step": 31760 }, { "epoch": 0.7071314102564102, "grad_norm": 0.6986963748931885, "learning_rate": 0.0003476638751929611, "loss": 0.666, "step": 31770 }, { "epoch": 0.7073539886039886, "grad_norm": 0.8924002051353455, "learning_rate": 0.00034763242675642003, "loss": 0.6925, "step": 31780 }, { "epoch": 0.7075765669515669, "grad_norm": 0.5170907378196716, "learning_rate": 0.000347600970297412, "loss": 0.5942, "step": 31790 }, { "epoch": 0.7077991452991453, "grad_norm": 0.9141911268234253, "learning_rate": 0.0003475695058176465, "loss": 0.6946, "step": 31800 }, { "epoch": 0.7080217236467237, "grad_norm": 0.6999244689941406, "learning_rate": 0.0003475380333188332, "loss": 0.7155, "step": 31810 }, { "epoch": 0.708244301994302, "grad_norm": 0.8103520274162292, "learning_rate": 0.0003475065528026824, "loss": 0.5796, "step": 31820 }, { "epoch": 0.7084668803418803, "grad_norm": 0.6898837089538574, "learning_rate": 0.0003474750642709048, "loss": 0.5836, "step": 31830 }, { "epoch": 0.7086894586894587, "grad_norm": 0.7483912706375122, "learning_rate": 0.0003474435677252115, "loss": 0.5765, "step": 31840 }, { "epoch": 0.7089120370370371, "grad_norm": 0.9520956873893738, "learning_rate": 0.0003474120631673139, "loss": 0.6258, "step": 31850 }, { "epoch": 0.7091346153846154, "grad_norm": 0.5112124681472778, "learning_rate": 0.0003473805505989242, "loss": 0.6141, "step": 31860 }, { "epoch": 0.7093571937321937, "grad_norm": 0.7033752202987671, "learning_rate": 0.0003473490300217547, "loss": 0.5962, "step": 31870 }, { "epoch": 0.7095797720797721, "grad_norm": 0.8038191795349121, "learning_rate": 0.00034731750143751833, "loss": 0.5635, "step": 31880 }, { "epoch": 0.7098023504273504, "grad_norm": 0.7981531023979187, "learning_rate": 0.0003472859648479283, "loss": 0.6539, "step": 31890 }, { "epoch": 0.7100249287749287, "grad_norm": 0.8092349767684937, "learning_rate": 0.0003472544202546984, "loss": 0.5704, "step": 31900 }, { "epoch": 0.7102475071225072, "grad_norm": 0.7940104007720947, "learning_rate": 0.00034722286765954274, "loss": 0.6257, "step": 31910 }, { "epoch": 0.7104700854700855, "grad_norm": 0.9252044558525085, "learning_rate": 0.00034719130706417585, "loss": 0.6482, "step": 31920 }, { "epoch": 0.7106926638176638, "grad_norm": 1.02089524269104, "learning_rate": 0.00034715973847031294, "loss": 0.5812, "step": 31930 }, { "epoch": 0.7109152421652422, "grad_norm": 0.7068274617195129, "learning_rate": 0.0003471281618796693, "loss": 0.6511, "step": 31940 }, { "epoch": 0.7111378205128205, "grad_norm": 0.8781763315200806, "learning_rate": 0.0003470965772939609, "loss": 0.5816, "step": 31950 }, { "epoch": 0.7113603988603988, "grad_norm": 0.909304678440094, "learning_rate": 0.00034706498471490414, "loss": 0.6203, "step": 31960 }, { "epoch": 0.7115829772079773, "grad_norm": 0.7124899625778198, "learning_rate": 0.0003470333841442157, "loss": 1.022, "step": 31970 }, { "epoch": 0.7118055555555556, "grad_norm": 0.8078319430351257, "learning_rate": 0.00034700177558361273, "loss": 0.6832, "step": 31980 }, { "epoch": 0.7120281339031339, "grad_norm": 0.8073205351829529, "learning_rate": 0.00034697015903481304, "loss": 0.5991, "step": 31990 }, { "epoch": 0.7122507122507122, "grad_norm": 0.9433754682540894, "learning_rate": 0.0003469385344995345, "loss": 0.6174, "step": 32000 }, { "epoch": 0.7124732905982906, "grad_norm": 0.44269195199012756, "learning_rate": 0.0003469069019794958, "loss": 0.6549, "step": 32010 }, { "epoch": 0.7126958689458689, "grad_norm": 0.5322273373603821, "learning_rate": 0.0003468752614764156, "loss": 0.6386, "step": 32020 }, { "epoch": 0.7129184472934473, "grad_norm": 0.6944203972816467, "learning_rate": 0.00034684361299201365, "loss": 0.5912, "step": 32030 }, { "epoch": 0.7131410256410257, "grad_norm": 0.7633023858070374, "learning_rate": 0.00034681195652800945, "loss": 0.5741, "step": 32040 }, { "epoch": 0.713363603988604, "grad_norm": 0.8351055383682251, "learning_rate": 0.00034678029208612345, "loss": 0.5826, "step": 32050 }, { "epoch": 0.7135861823361823, "grad_norm": 0.5108731985092163, "learning_rate": 0.00034674861966807615, "loss": 0.5997, "step": 32060 }, { "epoch": 0.7138087606837606, "grad_norm": 0.48932331800460815, "learning_rate": 0.0003467169392755887, "loss": 0.5924, "step": 32070 }, { "epoch": 0.7140313390313391, "grad_norm": 0.3925066888332367, "learning_rate": 0.00034668525091038265, "loss": 0.6352, "step": 32080 }, { "epoch": 0.7142539173789174, "grad_norm": 0.9368833303451538, "learning_rate": 0.00034665355457418, "loss": 0.679, "step": 32090 }, { "epoch": 0.7144764957264957, "grad_norm": 0.8664660453796387, "learning_rate": 0.00034662185026870324, "loss": 0.6223, "step": 32100 }, { "epoch": 0.7146990740740741, "grad_norm": 0.8929124474525452, "learning_rate": 0.000346590137995675, "loss": 0.5348, "step": 32110 }, { "epoch": 0.7149216524216524, "grad_norm": 0.5446501970291138, "learning_rate": 0.0003465584177568187, "loss": 0.5645, "step": 32120 }, { "epoch": 0.7151442307692307, "grad_norm": 0.5276156663894653, "learning_rate": 0.0003465266895538579, "loss": 0.6256, "step": 32130 }, { "epoch": 0.7153668091168092, "grad_norm": 0.5158522129058838, "learning_rate": 0.000346494953388517, "loss": 0.6573, "step": 32140 }, { "epoch": 0.7155893874643875, "grad_norm": 0.8431310653686523, "learning_rate": 0.00034646320926252027, "loss": 0.8396, "step": 32150 }, { "epoch": 0.7158119658119658, "grad_norm": 0.6748244762420654, "learning_rate": 0.0003464314571775929, "loss": 0.5628, "step": 32160 }, { "epoch": 0.7160345441595442, "grad_norm": 0.668067216873169, "learning_rate": 0.0003463996971354603, "loss": 0.5305, "step": 32170 }, { "epoch": 0.7162571225071225, "grad_norm": 0.5041331648826599, "learning_rate": 0.0003463679291378483, "loss": 0.63, "step": 32180 }, { "epoch": 0.7164797008547008, "grad_norm": 0.749484658241272, "learning_rate": 0.0003463361531864831, "loss": 0.677, "step": 32190 }, { "epoch": 0.7167022792022792, "grad_norm": 0.6030726432800293, "learning_rate": 0.0003463043692830917, "loss": 0.5044, "step": 32200 }, { "epoch": 0.7169248575498576, "grad_norm": 0.4747789204120636, "learning_rate": 0.000346272577429401, "loss": 0.5998, "step": 32210 }, { "epoch": 0.7171474358974359, "grad_norm": 0.6367260217666626, "learning_rate": 0.0003462407776271388, "loss": 0.6071, "step": 32220 }, { "epoch": 0.7173700142450142, "grad_norm": 0.453498899936676, "learning_rate": 0.00034620896987803295, "loss": 0.7674, "step": 32230 }, { "epoch": 0.7175925925925926, "grad_norm": 0.8879516124725342, "learning_rate": 0.00034617715418381196, "loss": 0.5801, "step": 32240 }, { "epoch": 0.7178151709401709, "grad_norm": 0.5511301159858704, "learning_rate": 0.00034614533054620473, "loss": 0.5343, "step": 32250 }, { "epoch": 0.7180377492877493, "grad_norm": 0.45961880683898926, "learning_rate": 0.0003461134989669407, "loss": 0.5569, "step": 32260 }, { "epoch": 0.7182603276353277, "grad_norm": 0.8157079815864563, "learning_rate": 0.00034608165944774943, "loss": 0.6672, "step": 32270 }, { "epoch": 0.718482905982906, "grad_norm": 1.0624051094055176, "learning_rate": 0.0003460498119903613, "loss": 0.6932, "step": 32280 }, { "epoch": 0.7187054843304843, "grad_norm": 0.6351505517959595, "learning_rate": 0.0003460179565965067, "loss": 0.6836, "step": 32290 }, { "epoch": 0.7189280626780626, "grad_norm": 0.5927011370658875, "learning_rate": 0.0003459860932679169, "loss": 0.7204, "step": 32300 }, { "epoch": 0.7191506410256411, "grad_norm": 0.6216453909873962, "learning_rate": 0.00034595422200632325, "loss": 0.4918, "step": 32310 }, { "epoch": 0.7193732193732194, "grad_norm": 0.9162493944168091, "learning_rate": 0.00034592234281345766, "loss": 0.601, "step": 32320 }, { "epoch": 0.7195957977207977, "grad_norm": 0.6896253228187561, "learning_rate": 0.00034589045569105256, "loss": 0.706, "step": 32330 }, { "epoch": 0.7198183760683761, "grad_norm": 0.5422629714012146, "learning_rate": 0.00034585856064084066, "loss": 0.63, "step": 32340 }, { "epoch": 0.7200409544159544, "grad_norm": 0.6882944107055664, "learning_rate": 0.0003458266576645551, "loss": 0.5267, "step": 32350 }, { "epoch": 0.7200854700854701, "eval_loss": 0.6198766827583313, "eval_runtime": 337.4337, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 32352 }, { "epoch": 0.7202635327635327, "grad_norm": 0.8453060388565063, "learning_rate": 0.0003457947467639296, "loss": 0.6723, "step": 32360 }, { "epoch": 0.7204861111111112, "grad_norm": 0.6546816229820251, "learning_rate": 0.00034576282794069826, "loss": 0.607, "step": 32370 }, { "epoch": 0.7207086894586895, "grad_norm": 0.5675908327102661, "learning_rate": 0.0003457309011965955, "loss": 0.6337, "step": 32380 }, { "epoch": 0.7209312678062678, "grad_norm": 0.45033371448516846, "learning_rate": 0.00034569896653335625, "loss": 0.6044, "step": 32390 }, { "epoch": 0.7211538461538461, "grad_norm": 0.6744174361228943, "learning_rate": 0.00034566702395271597, "loss": 0.6373, "step": 32400 }, { "epoch": 0.7213764245014245, "grad_norm": 0.7250241041183472, "learning_rate": 0.0003456350734564103, "loss": 0.7083, "step": 32410 }, { "epoch": 0.7215990028490028, "grad_norm": 0.3396146297454834, "learning_rate": 0.0003456031150461755, "loss": 0.5993, "step": 32420 }, { "epoch": 0.7218215811965812, "grad_norm": 0.6883082985877991, "learning_rate": 0.00034557114872374824, "loss": 0.6734, "step": 32430 }, { "epoch": 0.7220441595441596, "grad_norm": 0.7887311577796936, "learning_rate": 0.00034553917449086556, "loss": 0.6377, "step": 32440 }, { "epoch": 0.7222667378917379, "grad_norm": 0.7964504957199097, "learning_rate": 0.00034550719234926504, "loss": 0.7179, "step": 32450 }, { "epoch": 0.7224893162393162, "grad_norm": 0.7036212086677551, "learning_rate": 0.00034547520230068454, "loss": 0.6612, "step": 32460 }, { "epoch": 0.7227118945868946, "grad_norm": 0.4557095170021057, "learning_rate": 0.00034544320434686253, "loss": 0.5499, "step": 32470 }, { "epoch": 0.7229344729344729, "grad_norm": 0.9200230240821838, "learning_rate": 0.00034541119848953764, "loss": 0.592, "step": 32480 }, { "epoch": 0.7231570512820513, "grad_norm": 0.48239457607269287, "learning_rate": 0.00034537918473044924, "loss": 0.6185, "step": 32490 }, { "epoch": 0.7233796296296297, "grad_norm": 0.6177913546562195, "learning_rate": 0.00034534716307133684, "loss": 0.7293, "step": 32500 }, { "epoch": 0.723602207977208, "grad_norm": 0.8865199685096741, "learning_rate": 0.0003453151335139407, "loss": 0.5699, "step": 32510 }, { "epoch": 0.7238247863247863, "grad_norm": 0.6912305951118469, "learning_rate": 0.0003452830960600012, "loss": 0.5005, "step": 32520 }, { "epoch": 0.7240473646723646, "grad_norm": 0.5994203090667725, "learning_rate": 0.00034525105071125933, "loss": 0.6104, "step": 32530 }, { "epoch": 0.7242699430199431, "grad_norm": 1.0352870225906372, "learning_rate": 0.0003452189974694565, "loss": 0.8157, "step": 32540 }, { "epoch": 0.7244925213675214, "grad_norm": 0.4674214720726013, "learning_rate": 0.0003451869363363344, "loss": 0.5084, "step": 32550 }, { "epoch": 0.7247150997150997, "grad_norm": 0.9042168259620667, "learning_rate": 0.0003451548673136354, "loss": 0.6597, "step": 32560 }, { "epoch": 0.7249376780626781, "grad_norm": 0.7562392950057983, "learning_rate": 0.000345122790403102, "loss": 0.6503, "step": 32570 }, { "epoch": 0.7251602564102564, "grad_norm": 0.5220441222190857, "learning_rate": 0.0003450907056064774, "loss": 0.6522, "step": 32580 }, { "epoch": 0.7253828347578347, "grad_norm": 0.32991522550582886, "learning_rate": 0.00034505861292550514, "loss": 0.5881, "step": 32590 }, { "epoch": 0.7256054131054132, "grad_norm": 0.7591485381126404, "learning_rate": 0.00034502651236192905, "loss": 0.6425, "step": 32600 }, { "epoch": 0.7258279914529915, "grad_norm": 0.6426007151603699, "learning_rate": 0.0003449944039174935, "loss": 0.626, "step": 32610 }, { "epoch": 0.7260505698005698, "grad_norm": 0.5632911920547485, "learning_rate": 0.0003449622875939435, "loss": 0.67, "step": 32620 }, { "epoch": 0.7262731481481481, "grad_norm": 0.6731725931167603, "learning_rate": 0.00034493016339302396, "loss": 0.5885, "step": 32630 }, { "epoch": 0.7264957264957265, "grad_norm": 0.6174749732017517, "learning_rate": 0.00034489803131648077, "loss": 0.6846, "step": 32640 }, { "epoch": 0.7267183048433048, "grad_norm": 1.0542770624160767, "learning_rate": 0.00034486589136605993, "loss": 0.6128, "step": 32650 }, { "epoch": 0.7269408831908832, "grad_norm": 0.5963648557662964, "learning_rate": 0.00034483374354350797, "loss": 0.6567, "step": 32660 }, { "epoch": 0.7271634615384616, "grad_norm": 0.912215530872345, "learning_rate": 0.0003448015878505718, "loss": 0.6237, "step": 32670 }, { "epoch": 0.7273860398860399, "grad_norm": 0.6201345324516296, "learning_rate": 0.00034476942428899877, "loss": 0.5933, "step": 32680 }, { "epoch": 0.7276086182336182, "grad_norm": 0.9106643199920654, "learning_rate": 0.0003447372528605368, "loss": 0.7458, "step": 32690 }, { "epoch": 0.7278311965811965, "grad_norm": 0.7907149195671082, "learning_rate": 0.00034470507356693396, "loss": 0.7266, "step": 32700 }, { "epoch": 0.7280537749287749, "grad_norm": 0.44493839144706726, "learning_rate": 0.00034467288640993896, "loss": 0.5099, "step": 32710 }, { "epoch": 0.7282763532763533, "grad_norm": 0.7286844849586487, "learning_rate": 0.0003446406913913009, "loss": 0.6397, "step": 32720 }, { "epoch": 0.7284989316239316, "grad_norm": 0.6643046736717224, "learning_rate": 0.00034460848851276924, "loss": 0.6728, "step": 32730 }, { "epoch": 0.72872150997151, "grad_norm": 0.8569263219833374, "learning_rate": 0.000344576277776094, "loss": 0.6978, "step": 32740 }, { "epoch": 0.7289440883190883, "grad_norm": 0.4604608118534088, "learning_rate": 0.0003445440591830254, "loss": 0.5785, "step": 32750 }, { "epoch": 0.7291666666666666, "grad_norm": 1.0402268171310425, "learning_rate": 0.0003445118327353143, "loss": 0.646, "step": 32760 }, { "epoch": 0.7293892450142451, "grad_norm": 0.6748735308647156, "learning_rate": 0.0003444795984347119, "loss": 0.5849, "step": 32770 }, { "epoch": 0.7296118233618234, "grad_norm": 1.337708592414856, "learning_rate": 0.0003444473562829699, "loss": 0.7221, "step": 32780 }, { "epoch": 0.7298344017094017, "grad_norm": 0.633197546005249, "learning_rate": 0.00034441510628184025, "loss": 0.6345, "step": 32790 }, { "epoch": 0.73005698005698, "grad_norm": 0.8810262084007263, "learning_rate": 0.0003443828484330755, "loss": 0.6605, "step": 32800 }, { "epoch": 0.7302795584045584, "grad_norm": 0.6289214491844177, "learning_rate": 0.0003443505827384286, "loss": 0.7498, "step": 32810 }, { "epoch": 0.7305021367521367, "grad_norm": 0.5705475807189941, "learning_rate": 0.00034431830919965284, "loss": 0.581, "step": 32820 }, { "epoch": 0.7307247150997151, "grad_norm": 0.7125277519226074, "learning_rate": 0.0003442860278185021, "loss": 0.5964, "step": 32830 }, { "epoch": 0.7309472934472935, "grad_norm": 0.6184514760971069, "learning_rate": 0.0003442537385967303, "loss": 0.4608, "step": 32840 }, { "epoch": 0.7311698717948718, "grad_norm": 0.48804301023483276, "learning_rate": 0.00034422144153609243, "loss": 0.5924, "step": 32850 }, { "epoch": 0.7313924501424501, "grad_norm": 0.661341667175293, "learning_rate": 0.00034418913663834333, "loss": 0.7042, "step": 32860 }, { "epoch": 0.7316150284900285, "grad_norm": 0.6228981614112854, "learning_rate": 0.00034415682390523844, "loss": 0.6062, "step": 32870 }, { "epoch": 0.7318376068376068, "grad_norm": 0.6047166585922241, "learning_rate": 0.0003441245033385338, "loss": 0.5581, "step": 32880 }, { "epoch": 0.7320601851851852, "grad_norm": 0.7112004160881042, "learning_rate": 0.0003440921749399856, "loss": 0.6883, "step": 32890 }, { "epoch": 0.7322827635327636, "grad_norm": 0.778866708278656, "learning_rate": 0.0003440598387113507, "loss": 0.6048, "step": 32900 }, { "epoch": 0.7325053418803419, "grad_norm": 0.6084232330322266, "learning_rate": 0.0003440274946543862, "loss": 0.5741, "step": 32910 }, { "epoch": 0.7327279202279202, "grad_norm": 0.6071760654449463, "learning_rate": 0.00034399514277084976, "loss": 0.5831, "step": 32920 }, { "epoch": 0.7329504985754985, "grad_norm": 0.622017502784729, "learning_rate": 0.00034396278306249935, "loss": 0.5463, "step": 32930 }, { "epoch": 0.7331730769230769, "grad_norm": 0.6587297320365906, "learning_rate": 0.00034393041553109347, "loss": 0.5529, "step": 32940 }, { "epoch": 0.7333956552706553, "grad_norm": 0.8048000335693359, "learning_rate": 0.00034389804017839103, "loss": 0.6364, "step": 32950 }, { "epoch": 0.7336182336182336, "grad_norm": 0.76850426197052, "learning_rate": 0.00034386565700615125, "loss": 0.5629, "step": 32960 }, { "epoch": 0.733840811965812, "grad_norm": 0.6999525427818298, "learning_rate": 0.00034383326601613386, "loss": 0.6677, "step": 32970 }, { "epoch": 0.7340633903133903, "grad_norm": 0.794244647026062, "learning_rate": 0.0003438008672100991, "loss": 0.7736, "step": 32980 }, { "epoch": 0.7342859686609686, "grad_norm": 0.7045572996139526, "learning_rate": 0.00034376846058980744, "loss": 0.6419, "step": 32990 }, { "epoch": 0.7345085470085471, "grad_norm": 0.6053374409675598, "learning_rate": 0.00034373604615702, "loss": 0.4803, "step": 33000 }, { "epoch": 0.7347311253561254, "grad_norm": 0.7537468671798706, "learning_rate": 0.0003437036239134981, "loss": 0.5578, "step": 33010 }, { "epoch": 0.7349537037037037, "grad_norm": 0.7278876304626465, "learning_rate": 0.0003436711938610037, "loss": 0.4338, "step": 33020 }, { "epoch": 0.735176282051282, "grad_norm": 0.6402742862701416, "learning_rate": 0.0003436387560012989, "loss": 0.59, "step": 33030 }, { "epoch": 0.7353988603988604, "grad_norm": 0.7100064158439636, "learning_rate": 0.0003436063103361466, "loss": 0.6198, "step": 33040 }, { "epoch": 0.7356214387464387, "grad_norm": 0.8709797859191895, "learning_rate": 0.0003435738568673098, "loss": 0.7059, "step": 33050 }, { "epoch": 0.7358440170940171, "grad_norm": 0.5005736351013184, "learning_rate": 0.0003435413955965521, "loss": 0.6524, "step": 33060 }, { "epoch": 0.7360665954415955, "grad_norm": 0.6272966265678406, "learning_rate": 0.00034350892652563737, "loss": 0.6263, "step": 33070 }, { "epoch": 0.7362891737891738, "grad_norm": 0.6624988317489624, "learning_rate": 0.00034347644965633024, "loss": 0.5786, "step": 33080 }, { "epoch": 0.7365117521367521, "grad_norm": 0.9508379697799683, "learning_rate": 0.00034344396499039523, "loss": 0.5954, "step": 33090 }, { "epoch": 0.7367343304843305, "grad_norm": 0.5892514586448669, "learning_rate": 0.0003434114725295978, "loss": 0.6951, "step": 33100 }, { "epoch": 0.7369569088319088, "grad_norm": 0.6346218585968018, "learning_rate": 0.0003433789722757036, "loss": 0.6452, "step": 33110 }, { "epoch": 0.7371794871794872, "grad_norm": 0.6012640595436096, "learning_rate": 0.0003433464642304786, "loss": 0.4765, "step": 33120 }, { "epoch": 0.7374020655270656, "grad_norm": 0.6348587870597839, "learning_rate": 0.00034331394839568944, "loss": 0.5871, "step": 33130 }, { "epoch": 0.7376246438746439, "grad_norm": 0.5616714954376221, "learning_rate": 0.000343281424773103, "loss": 0.6428, "step": 33140 }, { "epoch": 0.7378472222222222, "grad_norm": 0.7842782139778137, "learning_rate": 0.0003432488933644866, "loss": 0.5245, "step": 33150 }, { "epoch": 0.7380698005698005, "grad_norm": 0.5432232022285461, "learning_rate": 0.0003432163541716081, "loss": 0.6654, "step": 33160 }, { "epoch": 0.7382923789173789, "grad_norm": 0.5655632615089417, "learning_rate": 0.00034318380719623563, "loss": 0.6713, "step": 33170 }, { "epoch": 0.7385149572649573, "grad_norm": 0.5200754404067993, "learning_rate": 0.0003431512524401379, "loss": 0.6942, "step": 33180 }, { "epoch": 0.7387375356125356, "grad_norm": 0.7658675312995911, "learning_rate": 0.00034311868990508386, "loss": 0.5798, "step": 33190 }, { "epoch": 0.738960113960114, "grad_norm": 0.5904093980789185, "learning_rate": 0.0003430861195928431, "loss": 0.6543, "step": 33200 }, { "epoch": 0.7391826923076923, "grad_norm": 0.9393807649612427, "learning_rate": 0.00034305354150518554, "loss": 0.5894, "step": 33210 }, { "epoch": 0.7394052706552706, "grad_norm": 0.5630085468292236, "learning_rate": 0.0003430209556438813, "loss": 0.6652, "step": 33220 }, { "epoch": 0.7396278490028491, "grad_norm": 0.8873823881149292, "learning_rate": 0.0003429883620107013, "loss": 0.5247, "step": 33230 }, { "epoch": 0.7398504273504274, "grad_norm": 0.6598261594772339, "learning_rate": 0.00034295576060741666, "loss": 0.5125, "step": 33240 }, { "epoch": 0.7400730056980057, "grad_norm": 0.5915985107421875, "learning_rate": 0.000342923151435799, "loss": 0.637, "step": 33250 }, { "epoch": 0.740295584045584, "grad_norm": 0.8445550799369812, "learning_rate": 0.0003428905344976202, "loss": 0.6333, "step": 33260 }, { "epoch": 0.7405181623931624, "grad_norm": 0.9027766585350037, "learning_rate": 0.0003428579097946528, "loss": 0.6398, "step": 33270 }, { "epoch": 0.7407407407407407, "grad_norm": 0.6807312965393066, "learning_rate": 0.0003428252773286697, "loss": 0.6241, "step": 33280 }, { "epoch": 0.7409633190883191, "grad_norm": 0.6065702438354492, "learning_rate": 0.000342792637101444, "loss": 0.7469, "step": 33290 }, { "epoch": 0.7411858974358975, "grad_norm": 0.9258270859718323, "learning_rate": 0.00034275998911474957, "loss": 0.7389, "step": 33300 }, { "epoch": 0.7414084757834758, "grad_norm": 0.5104573965072632, "learning_rate": 0.0003427273333703605, "loss": 0.6547, "step": 33310 }, { "epoch": 0.7416310541310541, "grad_norm": 0.7797284126281738, "learning_rate": 0.0003426946698700512, "loss": 0.6665, "step": 33320 }, { "epoch": 0.7418536324786325, "grad_norm": 0.8530895709991455, "learning_rate": 0.00034266199861559675, "loss": 0.5328, "step": 33330 }, { "epoch": 0.7420762108262108, "grad_norm": 0.5849500298500061, "learning_rate": 0.00034262931960877256, "loss": 0.6509, "step": 33340 }, { "epoch": 0.7422987891737892, "grad_norm": 0.7217156291007996, "learning_rate": 0.00034259663285135435, "loss": 0.6171, "step": 33350 }, { "epoch": 0.7425213675213675, "grad_norm": 0.5825576186180115, "learning_rate": 0.0003425639383451184, "loss": 0.674, "step": 33360 }, { "epoch": 0.7427439458689459, "grad_norm": 0.41654032468795776, "learning_rate": 0.00034253123609184126, "loss": 0.5258, "step": 33370 }, { "epoch": 0.7429665242165242, "grad_norm": 0.7584710121154785, "learning_rate": 0.0003424985260933001, "loss": 0.4909, "step": 33380 }, { "epoch": 0.7431891025641025, "grad_norm": 0.7973439693450928, "learning_rate": 0.00034246580835127244, "loss": 0.6153, "step": 33390 }, { "epoch": 0.7434116809116809, "grad_norm": 0.6786189079284668, "learning_rate": 0.0003424330828675361, "loss": 0.6246, "step": 33400 }, { "epoch": 0.7436342592592593, "grad_norm": 0.40943869948387146, "learning_rate": 0.0003424003496438694, "loss": 0.4983, "step": 33410 }, { "epoch": 0.7438568376068376, "grad_norm": 0.5922877192497253, "learning_rate": 0.00034236760868205116, "loss": 0.6999, "step": 33420 }, { "epoch": 0.744079415954416, "grad_norm": 0.6780363917350769, "learning_rate": 0.00034233485998386046, "loss": 0.7115, "step": 33430 }, { "epoch": 0.7443019943019943, "grad_norm": 1.0446654558181763, "learning_rate": 0.000342302103551077, "loss": 0.5759, "step": 33440 }, { "epoch": 0.7445245726495726, "grad_norm": 0.6994029879570007, "learning_rate": 0.00034226933938548074, "loss": 0.5636, "step": 33450 }, { "epoch": 0.7447471509971509, "grad_norm": 0.6624497175216675, "learning_rate": 0.0003422365674888521, "loss": 0.6727, "step": 33460 }, { "epoch": 0.7449697293447294, "grad_norm": 0.6512168645858765, "learning_rate": 0.0003422037878629719, "loss": 0.593, "step": 33470 }, { "epoch": 0.7451923076923077, "grad_norm": 0.49083104729652405, "learning_rate": 0.00034217100050962153, "loss": 0.5841, "step": 33480 }, { "epoch": 0.745414886039886, "grad_norm": 0.8152703046798706, "learning_rate": 0.0003421382054305825, "loss": 0.6204, "step": 33490 }, { "epoch": 0.7456374643874644, "grad_norm": 0.7201706767082214, "learning_rate": 0.0003421054026276371, "loss": 0.6814, "step": 33500 }, { "epoch": 0.7458600427350427, "grad_norm": 0.5058859586715698, "learning_rate": 0.00034207259210256784, "loss": 0.5908, "step": 33510 }, { "epoch": 0.7460826210826211, "grad_norm": 0.5888089537620544, "learning_rate": 0.00034203977385715754, "loss": 0.5705, "step": 33520 }, { "epoch": 0.7463051994301995, "grad_norm": 0.3613194227218628, "learning_rate": 0.00034200694789318967, "loss": 0.6557, "step": 33530 }, { "epoch": 0.7465277777777778, "grad_norm": 0.7530273795127869, "learning_rate": 0.00034197411421244803, "loss": 0.6738, "step": 33540 }, { "epoch": 0.7467503561253561, "grad_norm": 0.84321129322052, "learning_rate": 0.00034194127281671677, "loss": 0.5973, "step": 33550 }, { "epoch": 0.7469729344729344, "grad_norm": 0.7950652837753296, "learning_rate": 0.00034190842370778054, "loss": 0.7292, "step": 33560 }, { "epoch": 0.7471955128205128, "grad_norm": 0.7758309841156006, "learning_rate": 0.00034187556688742443, "loss": 0.6571, "step": 33570 }, { "epoch": 0.7474180911680912, "grad_norm": 0.712580144405365, "learning_rate": 0.0003418427023574338, "loss": 0.5803, "step": 33580 }, { "epoch": 0.7476406695156695, "grad_norm": 0.523370087146759, "learning_rate": 0.00034180983011959464, "loss": 0.6803, "step": 33590 }, { "epoch": 0.7478632478632479, "grad_norm": 0.5187812447547913, "learning_rate": 0.00034177695017569324, "loss": 0.5895, "step": 33600 }, { "epoch": 0.7480858262108262, "grad_norm": 0.9683651328086853, "learning_rate": 0.0003417440625275163, "loss": 0.6858, "step": 33610 }, { "epoch": 0.7483084045584045, "grad_norm": 0.9042038917541504, "learning_rate": 0.00034171116717685094, "loss": 0.7055, "step": 33620 }, { "epoch": 0.7485309829059829, "grad_norm": 0.5068843960762024, "learning_rate": 0.00034167826412548477, "loss": 0.6619, "step": 33630 }, { "epoch": 0.7487535612535613, "grad_norm": 0.5242236852645874, "learning_rate": 0.00034164535337520574, "loss": 0.5987, "step": 33640 }, { "epoch": 0.7489761396011396, "grad_norm": 0.6024574041366577, "learning_rate": 0.00034161243492780225, "loss": 0.6789, "step": 33650 }, { "epoch": 0.749198717948718, "grad_norm": 0.587216854095459, "learning_rate": 0.00034157950878506313, "loss": 0.6347, "step": 33660 }, { "epoch": 0.7494212962962963, "grad_norm": 0.6755560636520386, "learning_rate": 0.0003415465749487776, "loss": 0.6212, "step": 33670 }, { "epoch": 0.7496438746438746, "grad_norm": 0.5546956062316895, "learning_rate": 0.00034151363342073524, "loss": 0.5529, "step": 33680 }, { "epoch": 0.7498664529914529, "grad_norm": 0.7829976081848145, "learning_rate": 0.0003414806842027263, "loss": 0.7853, "step": 33690 }, { "epoch": 0.7500890313390314, "grad_norm": 0.7056668996810913, "learning_rate": 0.00034144772729654107, "loss": 0.4261, "step": 33700 }, { "epoch": 0.7503116096866097, "grad_norm": 0.4800557792186737, "learning_rate": 0.00034141476270397057, "loss": 0.6852, "step": 33710 }, { "epoch": 0.750534188034188, "grad_norm": 0.5922639966011047, "learning_rate": 0.0003413817904268061, "loss": 0.5152, "step": 33720 }, { "epoch": 0.7507567663817664, "grad_norm": 0.6094176769256592, "learning_rate": 0.0003413488104668394, "loss": 0.6417, "step": 33730 }, { "epoch": 0.7509793447293447, "grad_norm": 0.5336151719093323, "learning_rate": 0.00034131582282586264, "loss": 0.472, "step": 33740 }, { "epoch": 0.7512019230769231, "grad_norm": 1.060437560081482, "learning_rate": 0.00034128282750566836, "loss": 0.6104, "step": 33750 }, { "epoch": 0.7514245014245015, "grad_norm": 0.8258956074714661, "learning_rate": 0.0003412498245080496, "loss": 0.8442, "step": 33760 }, { "epoch": 0.7516470797720798, "grad_norm": 0.6972953081130981, "learning_rate": 0.00034121681383479977, "loss": 0.6083, "step": 33770 }, { "epoch": 0.7518696581196581, "grad_norm": 0.588059663772583, "learning_rate": 0.0003411837954877126, "loss": 0.6383, "step": 33780 }, { "epoch": 0.7520922364672364, "grad_norm": 0.8543235659599304, "learning_rate": 0.00034115076946858246, "loss": 0.5739, "step": 33790 }, { "epoch": 0.7523148148148148, "grad_norm": 0.5297170877456665, "learning_rate": 0.00034111773577920394, "loss": 0.6443, "step": 33800 }, { "epoch": 0.7525373931623932, "grad_norm": 0.5189324021339417, "learning_rate": 0.0003410846944213721, "loss": 0.5406, "step": 33810 }, { "epoch": 0.7527599715099715, "grad_norm": 0.8608090877532959, "learning_rate": 0.00034105164539688246, "loss": 0.6248, "step": 33820 }, { "epoch": 0.7529825498575499, "grad_norm": 0.8554071187973022, "learning_rate": 0.000341018588707531, "loss": 0.6015, "step": 33830 }, { "epoch": 0.7532051282051282, "grad_norm": 0.5413314700126648, "learning_rate": 0.000340985524355114, "loss": 0.6217, "step": 33840 }, { "epoch": 0.7534277065527065, "grad_norm": 0.7716408371925354, "learning_rate": 0.0003409524523414281, "loss": 0.5588, "step": 33850 }, { "epoch": 0.7536502849002849, "grad_norm": 0.7285047173500061, "learning_rate": 0.0003409193726682706, "loss": 0.7738, "step": 33860 }, { "epoch": 0.7538728632478633, "grad_norm": 0.4201757311820984, "learning_rate": 0.000340886285337439, "loss": 0.5486, "step": 33870 }, { "epoch": 0.7540954415954416, "grad_norm": 0.5679952502250671, "learning_rate": 0.00034085319035073134, "loss": 0.5122, "step": 33880 }, { "epoch": 0.75431801994302, "grad_norm": 0.5826485753059387, "learning_rate": 0.00034082008770994606, "loss": 0.549, "step": 33890 }, { "epoch": 0.7545405982905983, "grad_norm": 0.3409044146537781, "learning_rate": 0.0003407869774168819, "loss": 0.6678, "step": 33900 }, { "epoch": 0.7547631766381766, "grad_norm": 0.5800386667251587, "learning_rate": 0.00034075385947333805, "loss": 0.6745, "step": 33910 }, { "epoch": 0.7549857549857549, "grad_norm": 0.5018765926361084, "learning_rate": 0.0003407207338811144, "loss": 0.4987, "step": 33920 }, { "epoch": 0.7552083333333334, "grad_norm": 0.7626890540122986, "learning_rate": 0.0003406876006420108, "loss": 0.5695, "step": 33930 }, { "epoch": 0.7554309116809117, "grad_norm": 0.6939761638641357, "learning_rate": 0.0003406544597578278, "loss": 0.718, "step": 33940 }, { "epoch": 0.75565349002849, "grad_norm": 0.5773590803146362, "learning_rate": 0.0003406213112303663, "loss": 0.7233, "step": 33950 }, { "epoch": 0.7558760683760684, "grad_norm": 0.6457138061523438, "learning_rate": 0.00034058815506142763, "loss": 0.5802, "step": 33960 }, { "epoch": 0.7560986467236467, "grad_norm": 0.643480122089386, "learning_rate": 0.0003405549912528135, "loss": 0.6462, "step": 33970 }, { "epoch": 0.7563212250712251, "grad_norm": 0.6513316631317139, "learning_rate": 0.00034052181980632617, "loss": 0.6008, "step": 33980 }, { "epoch": 0.7565438034188035, "grad_norm": 0.6498247385025024, "learning_rate": 0.00034048864072376805, "loss": 0.6596, "step": 33990 }, { "epoch": 0.7567663817663818, "grad_norm": 0.7045326232910156, "learning_rate": 0.0003404554540069422, "loss": 0.5607, "step": 34000 }, { "epoch": 0.7569889601139601, "grad_norm": 0.9087044596672058, "learning_rate": 0.000340422259657652, "loss": 0.7722, "step": 34010 }, { "epoch": 0.7572115384615384, "grad_norm": 0.6496840715408325, "learning_rate": 0.00034038905767770123, "loss": 0.5651, "step": 34020 }, { "epoch": 0.7574341168091168, "grad_norm": 0.6747355461120605, "learning_rate": 0.00034035584806889417, "loss": 0.5881, "step": 34030 }, { "epoch": 0.7576566951566952, "grad_norm": 1.1374226808547974, "learning_rate": 0.0003403226308330354, "loss": 0.725, "step": 34040 }, { "epoch": 0.7578792735042735, "grad_norm": 0.7352719902992249, "learning_rate": 0.00034028940597193003, "loss": 0.6371, "step": 34050 }, { "epoch": 0.7581018518518519, "grad_norm": 0.8843091726303101, "learning_rate": 0.0003402561734873834, "loss": 0.5925, "step": 34060 }, { "epoch": 0.7583244301994302, "grad_norm": 0.4928925335407257, "learning_rate": 0.0003402229333812016, "loss": 0.6538, "step": 34070 }, { "epoch": 0.7585470085470085, "grad_norm": 0.4970260262489319, "learning_rate": 0.00034018968565519074, "loss": 0.4335, "step": 34080 }, { "epoch": 0.7587695868945868, "grad_norm": 1.0510859489440918, "learning_rate": 0.0003401564303111576, "loss": 0.6611, "step": 34090 }, { "epoch": 0.7589921652421653, "grad_norm": 0.5787474513053894, "learning_rate": 0.00034012316735090934, "loss": 0.4912, "step": 34100 }, { "epoch": 0.7592147435897436, "grad_norm": 0.7155798673629761, "learning_rate": 0.0003400898967762535, "loss": 0.7107, "step": 34110 }, { "epoch": 0.7594373219373219, "grad_norm": 0.8900055885314941, "learning_rate": 0.0003400566185889979, "loss": 0.6932, "step": 34120 }, { "epoch": 0.7596599002849003, "grad_norm": 0.4244909882545471, "learning_rate": 0.00034002333279095105, "loss": 0.5289, "step": 34130 }, { "epoch": 0.7598824786324786, "grad_norm": 0.4296218454837799, "learning_rate": 0.0003399900393839216, "loss": 0.5492, "step": 34140 }, { "epoch": 0.7601050569800569, "grad_norm": 0.5768629312515259, "learning_rate": 0.0003399567383697188, "loss": 0.4549, "step": 34150 }, { "epoch": 0.7603276353276354, "grad_norm": 0.5289151668548584, "learning_rate": 0.0003399234297501523, "loss": 0.5806, "step": 34160 }, { "epoch": 0.7605502136752137, "grad_norm": 0.9661064147949219, "learning_rate": 0.0003398901135270321, "loss": 0.6065, "step": 34170 }, { "epoch": 0.760772792022792, "grad_norm": 0.6725865006446838, "learning_rate": 0.00033985678970216865, "loss": 0.6203, "step": 34180 }, { "epoch": 0.7609953703703703, "grad_norm": 0.7186846137046814, "learning_rate": 0.00033982345827737265, "loss": 0.6291, "step": 34190 }, { "epoch": 0.7612179487179487, "grad_norm": 0.6042883396148682, "learning_rate": 0.0003397901192544555, "loss": 0.5486, "step": 34200 }, { "epoch": 0.7614405270655271, "grad_norm": 0.41314706206321716, "learning_rate": 0.0003397567726352289, "loss": 0.5823, "step": 34210 }, { "epoch": 0.7616631054131054, "grad_norm": 0.6745661497116089, "learning_rate": 0.0003397234184215048, "loss": 0.6311, "step": 34220 }, { "epoch": 0.7618856837606838, "grad_norm": 0.6066137552261353, "learning_rate": 0.0003396900566150957, "loss": 0.7573, "step": 34230 }, { "epoch": 0.7621082621082621, "grad_norm": 0.6698915362358093, "learning_rate": 0.0003396566872178147, "loss": 0.5738, "step": 34240 }, { "epoch": 0.7623308404558404, "grad_norm": 0.6477178931236267, "learning_rate": 0.00033962331023147495, "loss": 0.6469, "step": 34250 }, { "epoch": 0.7625534188034188, "grad_norm": 0.7714323997497559, "learning_rate": 0.00033958992565789013, "loss": 0.5805, "step": 34260 }, { "epoch": 0.7627759971509972, "grad_norm": 0.6476292610168457, "learning_rate": 0.00033955653349887455, "loss": 0.665, "step": 34270 }, { "epoch": 0.7629985754985755, "grad_norm": 0.49801915884017944, "learning_rate": 0.0003395231337562427, "loss": 0.5172, "step": 34280 }, { "epoch": 0.7632211538461539, "grad_norm": 0.4742410182952881, "learning_rate": 0.0003394897264318095, "loss": 0.6301, "step": 34290 }, { "epoch": 0.7634437321937322, "grad_norm": 0.8749886751174927, "learning_rate": 0.0003394563115273904, "loss": 0.5594, "step": 34300 }, { "epoch": 0.7636663105413105, "grad_norm": 0.8473349809646606, "learning_rate": 0.00033942288904480124, "loss": 0.6868, "step": 34310 }, { "epoch": 0.7638888888888888, "grad_norm": 1.0486420392990112, "learning_rate": 0.00033938945898585805, "loss": 0.6768, "step": 34320 }, { "epoch": 0.7641114672364673, "grad_norm": 0.5907694697380066, "learning_rate": 0.00033935602135237757, "loss": 0.6682, "step": 34330 }, { "epoch": 0.7643340455840456, "grad_norm": 0.7957288026809692, "learning_rate": 0.00033932257614617686, "loss": 0.4985, "step": 34340 }, { "epoch": 0.7645566239316239, "grad_norm": 0.9030253291130066, "learning_rate": 0.00033928912336907325, "loss": 0.6389, "step": 34350 }, { "epoch": 0.7647792022792023, "grad_norm": 0.6879715323448181, "learning_rate": 0.00033925566302288465, "loss": 0.6378, "step": 34360 }, { "epoch": 0.7650017806267806, "grad_norm": 0.6054188013076782, "learning_rate": 0.00033922219510942934, "loss": 0.6265, "step": 34370 }, { "epoch": 0.7652243589743589, "grad_norm": 0.6758320331573486, "learning_rate": 0.000339188719630526, "loss": 0.6042, "step": 34380 }, { "epoch": 0.7654469373219374, "grad_norm": 0.717961311340332, "learning_rate": 0.00033915523658799366, "loss": 0.7417, "step": 34390 }, { "epoch": 0.7656695156695157, "grad_norm": 0.7856202721595764, "learning_rate": 0.00033912174598365187, "loss": 0.5175, "step": 34400 }, { "epoch": 0.765892094017094, "grad_norm": 0.7919657826423645, "learning_rate": 0.0003390882478193205, "loss": 0.6577, "step": 34410 }, { "epoch": 0.7661146723646723, "grad_norm": 0.642520546913147, "learning_rate": 0.0003390547420968198, "loss": 0.6411, "step": 34420 }, { "epoch": 0.7663372507122507, "grad_norm": 0.6515674591064453, "learning_rate": 0.0003390212288179707, "loss": 0.5741, "step": 34430 }, { "epoch": 0.7665598290598291, "grad_norm": 0.7228606939315796, "learning_rate": 0.0003389877079845942, "loss": 0.6719, "step": 34440 }, { "epoch": 0.7667824074074074, "grad_norm": 0.6175971031188965, "learning_rate": 0.00033895417959851177, "loss": 0.7009, "step": 34450 }, { "epoch": 0.7670049857549858, "grad_norm": 0.6188730597496033, "learning_rate": 0.00033892064366154555, "loss": 0.6157, "step": 34460 }, { "epoch": 0.7672275641025641, "grad_norm": 0.70756596326828, "learning_rate": 0.00033888710017551785, "loss": 0.6073, "step": 34470 }, { "epoch": 0.7674501424501424, "grad_norm": 0.6952245235443115, "learning_rate": 0.0003388535491422514, "loss": 0.658, "step": 34480 }, { "epoch": 0.7676727207977208, "grad_norm": 0.7117817401885986, "learning_rate": 0.0003388199905635694, "loss": 0.635, "step": 34490 }, { "epoch": 0.7678952991452992, "grad_norm": 0.8737725615501404, "learning_rate": 0.00033878642444129547, "loss": 0.6462, "step": 34500 }, { "epoch": 0.7681178774928775, "grad_norm": 0.8220909237861633, "learning_rate": 0.0003387528507772536, "loss": 0.7008, "step": 34510 }, { "epoch": 0.7683404558404558, "grad_norm": 0.728020191192627, "learning_rate": 0.0003387192695732683, "loss": 0.5707, "step": 34520 }, { "epoch": 0.7685630341880342, "grad_norm": 0.7700955271720886, "learning_rate": 0.00033868568083116426, "loss": 0.6196, "step": 34530 }, { "epoch": 0.7687856125356125, "grad_norm": 0.725366473197937, "learning_rate": 0.0003386520845527668, "loss": 0.5375, "step": 34540 }, { "epoch": 0.7690081908831908, "grad_norm": 0.48743993043899536, "learning_rate": 0.0003386184807399016, "loss": 0.4771, "step": 34550 }, { "epoch": 0.7692307692307693, "grad_norm": 0.777994692325592, "learning_rate": 0.00033858486939439465, "loss": 0.6777, "step": 34560 }, { "epoch": 0.7694533475783476, "grad_norm": 0.8114678263664246, "learning_rate": 0.00033855125051807246, "loss": 0.7312, "step": 34570 }, { "epoch": 0.7696759259259259, "grad_norm": 0.7239607572555542, "learning_rate": 0.0003385176241127619, "loss": 0.6857, "step": 34580 }, { "epoch": 0.7698985042735043, "grad_norm": 0.6901978850364685, "learning_rate": 0.00033848399018029024, "loss": 0.6023, "step": 34590 }, { "epoch": 0.7701210826210826, "grad_norm": 0.5981554388999939, "learning_rate": 0.00033845034872248515, "loss": 0.6188, "step": 34600 }, { "epoch": 0.7703436609686609, "grad_norm": 0.9385389685630798, "learning_rate": 0.0003384166997411748, "loss": 0.6454, "step": 34610 }, { "epoch": 0.7705662393162394, "grad_norm": 0.6972981095314026, "learning_rate": 0.0003383830432381877, "loss": 0.5368, "step": 34620 }, { "epoch": 0.7707888176638177, "grad_norm": 0.7719412446022034, "learning_rate": 0.0003383493792153527, "loss": 0.5493, "step": 34630 }, { "epoch": 0.771011396011396, "grad_norm": 0.6207097172737122, "learning_rate": 0.0003383157076744992, "loss": 0.4886, "step": 34640 }, { "epoch": 0.7712339743589743, "grad_norm": 0.7396118640899658, "learning_rate": 0.0003382820286174569, "loss": 0.5986, "step": 34650 }, { "epoch": 0.7714565527065527, "grad_norm": 0.40450310707092285, "learning_rate": 0.00033824834204605595, "loss": 0.6138, "step": 34660 }, { "epoch": 0.7716791310541311, "grad_norm": 0.5757637023925781, "learning_rate": 0.00033821464796212697, "loss": 0.5282, "step": 34670 }, { "epoch": 0.7719017094017094, "grad_norm": 0.6552518606185913, "learning_rate": 0.00033818094636750085, "loss": 0.5035, "step": 34680 }, { "epoch": 0.7721242877492878, "grad_norm": 0.6848192811012268, "learning_rate": 0.00033814723726400896, "loss": 0.6008, "step": 34690 }, { "epoch": 0.7723468660968661, "grad_norm": 0.5395011305809021, "learning_rate": 0.0003381135206534832, "loss": 0.7105, "step": 34700 }, { "epoch": 0.7725694444444444, "grad_norm": 0.5970385670661926, "learning_rate": 0.00033807979653775554, "loss": 0.592, "step": 34710 }, { "epoch": 0.7727920227920227, "grad_norm": 0.6868297457695007, "learning_rate": 0.00033804606491865877, "loss": 0.6598, "step": 34720 }, { "epoch": 0.7730146011396012, "grad_norm": 0.5928608775138855, "learning_rate": 0.0003380123257980257, "loss": 0.5612, "step": 34730 }, { "epoch": 0.7732371794871795, "grad_norm": 0.7232846021652222, "learning_rate": 0.00033797857917769003, "loss": 0.509, "step": 34740 }, { "epoch": 0.7734597578347578, "grad_norm": 0.662609875202179, "learning_rate": 0.0003379448250594853, "loss": 0.6415, "step": 34750 }, { "epoch": 0.7736823361823362, "grad_norm": 0.286639541387558, "learning_rate": 0.00033791106344524584, "loss": 0.6826, "step": 34760 }, { "epoch": 0.7739049145299145, "grad_norm": 1.0770177841186523, "learning_rate": 0.0003378772943368064, "loss": 0.5784, "step": 34770 }, { "epoch": 0.7741274928774928, "grad_norm": 0.6964942216873169, "learning_rate": 0.0003378435177360019, "loss": 0.4603, "step": 34780 }, { "epoch": 0.7743500712250713, "grad_norm": 0.40256690979003906, "learning_rate": 0.0003378097336446677, "loss": 0.5608, "step": 34790 }, { "epoch": 0.7745726495726496, "grad_norm": 0.5216047763824463, "learning_rate": 0.0003377759420646398, "loss": 0.4453, "step": 34800 }, { "epoch": 0.7747952279202279, "grad_norm": 0.7011343240737915, "learning_rate": 0.00033774214299775446, "loss": 0.6648, "step": 34810 }, { "epoch": 0.7750178062678063, "grad_norm": 0.6892892122268677, "learning_rate": 0.00033770833644584827, "loss": 0.6408, "step": 34820 }, { "epoch": 0.7752403846153846, "grad_norm": 1.0617207288742065, "learning_rate": 0.00033767452241075836, "loss": 0.5153, "step": 34830 }, { "epoch": 0.7754629629629629, "grad_norm": 0.6845688819885254, "learning_rate": 0.00033764070089432224, "loss": 0.718, "step": 34840 }, { "epoch": 0.7756855413105413, "grad_norm": 0.5099952816963196, "learning_rate": 0.00033760687189837767, "loss": 0.7615, "step": 34850 }, { "epoch": 0.7759081196581197, "grad_norm": 0.5010229349136353, "learning_rate": 0.00033757303542476314, "loss": 0.5986, "step": 34860 }, { "epoch": 0.776130698005698, "grad_norm": 0.50068199634552, "learning_rate": 0.00033753919147531714, "loss": 0.6493, "step": 34870 }, { "epoch": 0.7763532763532763, "grad_norm": 0.7622873187065125, "learning_rate": 0.00033750534005187895, "loss": 0.7228, "step": 34880 }, { "epoch": 0.7765758547008547, "grad_norm": 0.769127368927002, "learning_rate": 0.00033747148115628793, "loss": 0.5665, "step": 34890 }, { "epoch": 0.7767984330484331, "grad_norm": 0.8573402762413025, "learning_rate": 0.0003374376147903842, "loss": 0.6565, "step": 34900 }, { "epoch": 0.7770210113960114, "grad_norm": 0.5039611458778381, "learning_rate": 0.0003374037409560078, "loss": 0.7783, "step": 34910 }, { "epoch": 0.7772435897435898, "grad_norm": 0.8162313103675842, "learning_rate": 0.0003373698596549998, "loss": 0.6285, "step": 34920 }, { "epoch": 0.7774661680911681, "grad_norm": 0.6889898777008057, "learning_rate": 0.0003373359708892011, "loss": 0.6166, "step": 34930 }, { "epoch": 0.7776887464387464, "grad_norm": 0.4901689887046814, "learning_rate": 0.0003373020746604533, "loss": 0.4887, "step": 34940 }, { "epoch": 0.7779113247863247, "grad_norm": 0.9870404005050659, "learning_rate": 0.0003372681709705984, "loss": 0.5822, "step": 34950 }, { "epoch": 0.7781339031339032, "grad_norm": 0.8554285764694214, "learning_rate": 0.0003372342598214787, "loss": 0.6114, "step": 34960 }, { "epoch": 0.7783564814814815, "grad_norm": 0.6954330801963806, "learning_rate": 0.000337200341214937, "loss": 0.6105, "step": 34970 }, { "epoch": 0.7785790598290598, "grad_norm": 0.6738479137420654, "learning_rate": 0.0003371664151528164, "loss": 0.7448, "step": 34980 }, { "epoch": 0.7788016381766382, "grad_norm": 0.5652036070823669, "learning_rate": 0.00033713248163696054, "loss": 0.5914, "step": 34990 }, { "epoch": 0.7790242165242165, "grad_norm": 0.5839760899543762, "learning_rate": 0.00033709854066921337, "loss": 0.7341, "step": 35000 }, { "epoch": 0.7792467948717948, "grad_norm": 0.5647395849227905, "learning_rate": 0.0003370645922514192, "loss": 0.5992, "step": 35010 }, { "epoch": 0.7794693732193733, "grad_norm": 0.6457863450050354, "learning_rate": 0.0003370306363854229, "loss": 0.6987, "step": 35020 }, { "epoch": 0.7796919515669516, "grad_norm": 0.9118708968162537, "learning_rate": 0.0003369966730730697, "loss": 0.5721, "step": 35030 }, { "epoch": 0.7799145299145299, "grad_norm": 0.4689837694168091, "learning_rate": 0.00033696270231620514, "loss": 0.6103, "step": 35040 }, { "epoch": 0.7800925925925926, "eval_loss": 0.6148799061775208, "eval_runtime": 337.3683, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 35048 }, { "epoch": 0.7801371082621082, "grad_norm": 0.6478744149208069, "learning_rate": 0.00033692872411667516, "loss": 0.6644, "step": 35050 }, { "epoch": 0.7803596866096866, "grad_norm": 0.4604703187942505, "learning_rate": 0.0003368947384763263, "loss": 0.5932, "step": 35060 }, { "epoch": 0.7805822649572649, "grad_norm": 0.5048819780349731, "learning_rate": 0.00033686074539700516, "loss": 0.5681, "step": 35070 }, { "epoch": 0.7808048433048433, "grad_norm": 0.9202898740768433, "learning_rate": 0.0003368267448805591, "loss": 0.7438, "step": 35080 }, { "epoch": 0.7810274216524217, "grad_norm": 0.7058709859848022, "learning_rate": 0.00033679273692883575, "loss": 0.607, "step": 35090 }, { "epoch": 0.78125, "grad_norm": 0.4312692880630493, "learning_rate": 0.00033675872154368314, "loss": 0.5137, "step": 35100 }, { "epoch": 0.7814725783475783, "grad_norm": 0.9358759522438049, "learning_rate": 0.00033672469872694956, "loss": 0.6285, "step": 35110 }, { "epoch": 0.7816951566951567, "grad_norm": 0.5190538763999939, "learning_rate": 0.00033669066848048397, "loss": 0.4727, "step": 35120 }, { "epoch": 0.7819177350427351, "grad_norm": 0.6738810539245605, "learning_rate": 0.0003366566308061355, "loss": 0.7522, "step": 35130 }, { "epoch": 0.7821403133903134, "grad_norm": 0.903041660785675, "learning_rate": 0.0003366225857057539, "loss": 0.5667, "step": 35140 }, { "epoch": 0.7823628917378918, "grad_norm": 0.8216555714607239, "learning_rate": 0.00033658853318118916, "loss": 0.6111, "step": 35150 }, { "epoch": 0.7825854700854701, "grad_norm": 0.9077396392822266, "learning_rate": 0.0003365544732342917, "loss": 0.6179, "step": 35160 }, { "epoch": 0.7828080484330484, "grad_norm": 1.029372215270996, "learning_rate": 0.00033652040586691233, "loss": 0.6137, "step": 35170 }, { "epoch": 0.7830306267806267, "grad_norm": 0.569773256778717, "learning_rate": 0.0003364863310809024, "loss": 0.5054, "step": 35180 }, { "epoch": 0.7832532051282052, "grad_norm": 0.8230850696563721, "learning_rate": 0.00033645224887811343, "loss": 0.4942, "step": 35190 }, { "epoch": 0.7834757834757835, "grad_norm": 0.5945245623588562, "learning_rate": 0.0003364181592603976, "loss": 0.6899, "step": 35200 }, { "epoch": 0.7836983618233618, "grad_norm": 0.7154036164283752, "learning_rate": 0.00033638406222960733, "loss": 0.6614, "step": 35210 }, { "epoch": 0.7839209401709402, "grad_norm": 0.7123019695281982, "learning_rate": 0.00033634995778759544, "loss": 0.8057, "step": 35220 }, { "epoch": 0.7841435185185185, "grad_norm": 0.733191728591919, "learning_rate": 0.00033631584593621524, "loss": 0.6401, "step": 35230 }, { "epoch": 0.7843660968660968, "grad_norm": 0.5150040984153748, "learning_rate": 0.0003362817266773204, "loss": 0.5645, "step": 35240 }, { "epoch": 0.7845886752136753, "grad_norm": 0.8314915895462036, "learning_rate": 0.0003362476000127649, "loss": 0.6106, "step": 35250 }, { "epoch": 0.7848112535612536, "grad_norm": 0.42255085706710815, "learning_rate": 0.00033621346594440337, "loss": 0.5478, "step": 35260 }, { "epoch": 0.7850338319088319, "grad_norm": 0.6707674860954285, "learning_rate": 0.0003361793244740905, "loss": 0.7008, "step": 35270 }, { "epoch": 0.7852564102564102, "grad_norm": 1.109925627708435, "learning_rate": 0.0003361451756036817, "loss": 0.724, "step": 35280 }, { "epoch": 0.7854789886039886, "grad_norm": 0.9113699197769165, "learning_rate": 0.0003361110193350326, "loss": 0.6307, "step": 35290 }, { "epoch": 0.7857015669515669, "grad_norm": 0.8045711517333984, "learning_rate": 0.0003360768556699993, "loss": 0.5713, "step": 35300 }, { "epoch": 0.7859241452991453, "grad_norm": 0.5049736499786377, "learning_rate": 0.00033604268461043826, "loss": 0.6299, "step": 35310 }, { "epoch": 0.7861467236467237, "grad_norm": 0.6946551203727722, "learning_rate": 0.0003360085061582064, "loss": 0.6118, "step": 35320 }, { "epoch": 0.786369301994302, "grad_norm": 0.6586059927940369, "learning_rate": 0.00033597432031516085, "loss": 0.5925, "step": 35330 }, { "epoch": 0.7865918803418803, "grad_norm": 0.8724303245544434, "learning_rate": 0.00033594012708315955, "loss": 0.4647, "step": 35340 }, { "epoch": 0.7868144586894587, "grad_norm": 0.6088204383850098, "learning_rate": 0.0003359059264640604, "loss": 0.6173, "step": 35350 }, { "epoch": 0.7870370370370371, "grad_norm": 0.718809187412262, "learning_rate": 0.000335871718459722, "loss": 0.6609, "step": 35360 }, { "epoch": 0.7872596153846154, "grad_norm": 0.7139840722084045, "learning_rate": 0.0003358375030720031, "loss": 0.6287, "step": 35370 }, { "epoch": 0.7874821937321937, "grad_norm": 0.6466283202171326, "learning_rate": 0.0003358032803027632, "loss": 0.5697, "step": 35380 }, { "epoch": 0.7877047720797721, "grad_norm": 0.6111293435096741, "learning_rate": 0.0003357690501538618, "loss": 0.6616, "step": 35390 }, { "epoch": 0.7879273504273504, "grad_norm": 0.7320126891136169, "learning_rate": 0.000335734812627159, "loss": 0.7165, "step": 35400 }, { "epoch": 0.7881499287749287, "grad_norm": 1.3482513427734375, "learning_rate": 0.00033570056772451543, "loss": 0.4982, "step": 35410 }, { "epoch": 0.7883725071225072, "grad_norm": 0.4562806785106659, "learning_rate": 0.00033566631544779195, "loss": 0.6573, "step": 35420 }, { "epoch": 0.7885950854700855, "grad_norm": 0.663861870765686, "learning_rate": 0.00033563205579884985, "loss": 0.6171, "step": 35430 }, { "epoch": 0.7888176638176638, "grad_norm": 0.6606183648109436, "learning_rate": 0.00033559778877955077, "loss": 0.5925, "step": 35440 }, { "epoch": 0.7890402421652422, "grad_norm": 0.6257029175758362, "learning_rate": 0.0003355635143917568, "loss": 0.6735, "step": 35450 }, { "epoch": 0.7892628205128205, "grad_norm": 0.7059953808784485, "learning_rate": 0.0003355292326373305, "loss": 0.6859, "step": 35460 }, { "epoch": 0.7894853988603988, "grad_norm": 0.5329318642616272, "learning_rate": 0.00033549494351813475, "loss": 0.5843, "step": 35470 }, { "epoch": 0.7897079772079773, "grad_norm": 0.6622263193130493, "learning_rate": 0.00033546064703603287, "loss": 0.6743, "step": 35480 }, { "epoch": 0.7899305555555556, "grad_norm": 0.7779316306114197, "learning_rate": 0.00033542634319288855, "loss": 0.597, "step": 35490 }, { "epoch": 0.7901531339031339, "grad_norm": 0.49835607409477234, "learning_rate": 0.0003353920319905658, "loss": 0.7376, "step": 35500 }, { "epoch": 0.7903757122507122, "grad_norm": 0.7341712713241577, "learning_rate": 0.00033535771343092935, "loss": 0.5066, "step": 35510 }, { "epoch": 0.7905982905982906, "grad_norm": 0.8437432646751404, "learning_rate": 0.0003353233875158438, "loss": 0.702, "step": 35520 }, { "epoch": 0.7908208689458689, "grad_norm": 0.6561760306358337, "learning_rate": 0.00033528905424717463, "loss": 0.5098, "step": 35530 }, { "epoch": 0.7910434472934473, "grad_norm": 0.2741072177886963, "learning_rate": 0.0003352547136267875, "loss": 0.5497, "step": 35540 }, { "epoch": 0.7912660256410257, "grad_norm": 0.6874168515205383, "learning_rate": 0.00033522036565654845, "loss": 0.6892, "step": 35550 }, { "epoch": 0.791488603988604, "grad_norm": 0.8610624074935913, "learning_rate": 0.0003351860103383241, "loss": 0.5923, "step": 35560 }, { "epoch": 0.7917111823361823, "grad_norm": 0.5035836696624756, "learning_rate": 0.00033515164767398134, "loss": 0.6147, "step": 35570 }, { "epoch": 0.7919337606837606, "grad_norm": 0.730381429195404, "learning_rate": 0.0003351172776653873, "loss": 0.6442, "step": 35580 }, { "epoch": 0.7921563390313391, "grad_norm": 0.5736960768699646, "learning_rate": 0.00033508290031440983, "loss": 0.523, "step": 35590 }, { "epoch": 0.7923789173789174, "grad_norm": 0.8420835733413696, "learning_rate": 0.00033504851562291693, "loss": 0.7223, "step": 35600 }, { "epoch": 0.7926014957264957, "grad_norm": 1.1751604080200195, "learning_rate": 0.00033501412359277714, "loss": 0.6019, "step": 35610 }, { "epoch": 0.7928240740740741, "grad_norm": 0.6272011399269104, "learning_rate": 0.0003349797242258594, "loss": 0.6303, "step": 35620 }, { "epoch": 0.7930466524216524, "grad_norm": 0.7410932183265686, "learning_rate": 0.00033494531752403296, "loss": 0.5262, "step": 35630 }, { "epoch": 0.7932692307692307, "grad_norm": 0.9739224910736084, "learning_rate": 0.0003349109034891674, "loss": 0.7398, "step": 35640 }, { "epoch": 0.7934918091168092, "grad_norm": 1.0735089778900146, "learning_rate": 0.00033487648212313293, "loss": 0.7595, "step": 35650 }, { "epoch": 0.7937143874643875, "grad_norm": 0.7913787961006165, "learning_rate": 0.00033484205342780007, "loss": 0.6099, "step": 35660 }, { "epoch": 0.7939369658119658, "grad_norm": 0.9250198006629944, "learning_rate": 0.0003348076174050396, "loss": 0.6167, "step": 35670 }, { "epoch": 0.7941595441595442, "grad_norm": 0.5305607318878174, "learning_rate": 0.0003347731740567228, "loss": 0.5329, "step": 35680 }, { "epoch": 0.7943821225071225, "grad_norm": 0.7727921009063721, "learning_rate": 0.0003347387233847215, "loss": 0.5638, "step": 35690 }, { "epoch": 0.7946047008547008, "grad_norm": 0.7801950573921204, "learning_rate": 0.00033470426539090756, "loss": 0.6296, "step": 35700 }, { "epoch": 0.7948272792022792, "grad_norm": 0.5226908326148987, "learning_rate": 0.00033466980007715357, "loss": 0.6034, "step": 35710 }, { "epoch": 0.7950498575498576, "grad_norm": 0.7990700006484985, "learning_rate": 0.00033463532744533247, "loss": 0.6582, "step": 35720 }, { "epoch": 0.7952724358974359, "grad_norm": 0.6833288669586182, "learning_rate": 0.0003346008474973174, "loss": 0.553, "step": 35730 }, { "epoch": 0.7954950142450142, "grad_norm": 0.6693017482757568, "learning_rate": 0.0003345663602349821, "loss": 0.5633, "step": 35740 }, { "epoch": 0.7957175925925926, "grad_norm": 0.8235263228416443, "learning_rate": 0.00033453186566020064, "loss": 0.6119, "step": 35750 }, { "epoch": 0.7959401709401709, "grad_norm": 0.5683590769767761, "learning_rate": 0.0003344973637748475, "loss": 0.7155, "step": 35760 }, { "epoch": 0.7961627492877493, "grad_norm": 1.1679894924163818, "learning_rate": 0.0003344628545807974, "loss": 0.4866, "step": 35770 }, { "epoch": 0.7963853276353277, "grad_norm": 0.5607545375823975, "learning_rate": 0.0003344283380799258, "loss": 0.5868, "step": 35780 }, { "epoch": 0.796607905982906, "grad_norm": 0.7906357049942017, "learning_rate": 0.00033439381427410826, "loss": 0.6184, "step": 35790 }, { "epoch": 0.7968304843304843, "grad_norm": 0.5747066140174866, "learning_rate": 0.00033435928316522077, "loss": 0.5405, "step": 35800 }, { "epoch": 0.7970530626780626, "grad_norm": 0.5270879864692688, "learning_rate": 0.00033432474475513993, "loss": 0.5811, "step": 35810 }, { "epoch": 0.7972756410256411, "grad_norm": 0.7191861867904663, "learning_rate": 0.0003342901990457424, "loss": 0.6492, "step": 35820 }, { "epoch": 0.7974982193732194, "grad_norm": 0.9543971419334412, "learning_rate": 0.0003342556460389056, "loss": 0.6754, "step": 35830 }, { "epoch": 0.7977207977207977, "grad_norm": 0.5868616104125977, "learning_rate": 0.00033422108573650703, "loss": 0.6033, "step": 35840 }, { "epoch": 0.7979433760683761, "grad_norm": 0.5501631498336792, "learning_rate": 0.0003341865181404248, "loss": 0.7264, "step": 35850 }, { "epoch": 0.7981659544159544, "grad_norm": 0.5450555682182312, "learning_rate": 0.0003341519432525373, "loss": 0.5298, "step": 35860 }, { "epoch": 0.7983885327635327, "grad_norm": 0.7831109762191772, "learning_rate": 0.0003341173610747235, "loss": 0.83, "step": 35870 }, { "epoch": 0.7986111111111112, "grad_norm": 0.8205850720405579, "learning_rate": 0.0003340827716088624, "loss": 0.5863, "step": 35880 }, { "epoch": 0.7988336894586895, "grad_norm": 0.6535419225692749, "learning_rate": 0.0003340481748568337, "loss": 0.5711, "step": 35890 }, { "epoch": 0.7990562678062678, "grad_norm": 0.55476975440979, "learning_rate": 0.00033401357082051746, "loss": 0.6778, "step": 35900 }, { "epoch": 0.7992788461538461, "grad_norm": 0.7246994972229004, "learning_rate": 0.0003339789595017941, "loss": 0.7777, "step": 35910 }, { "epoch": 0.7995014245014245, "grad_norm": 0.5746579170227051, "learning_rate": 0.0003339443409025444, "loss": 0.6582, "step": 35920 }, { "epoch": 0.7997240028490028, "grad_norm": 0.5451569557189941, "learning_rate": 0.0003339097150246496, "loss": 0.6411, "step": 35930 }, { "epoch": 0.7999465811965812, "grad_norm": 0.6169950366020203, "learning_rate": 0.00033387508186999117, "loss": 0.6785, "step": 35940 }, { "epoch": 0.8001691595441596, "grad_norm": 0.5016130805015564, "learning_rate": 0.0003338404414404513, "loss": 0.6404, "step": 35950 }, { "epoch": 0.8003917378917379, "grad_norm": 0.432508647441864, "learning_rate": 0.0003338057937379122, "loss": 0.5448, "step": 35960 }, { "epoch": 0.8006143162393162, "grad_norm": 0.6061640381813049, "learning_rate": 0.00033377113876425677, "loss": 0.6483, "step": 35970 }, { "epoch": 0.8008368945868946, "grad_norm": 0.6601110696792603, "learning_rate": 0.0003337364765213681, "loss": 0.5811, "step": 35980 }, { "epoch": 0.8010594729344729, "grad_norm": 0.6183525919914246, "learning_rate": 0.0003337018070111299, "loss": 0.7447, "step": 35990 }, { "epoch": 0.8012820512820513, "grad_norm": 0.7363453507423401, "learning_rate": 0.00033366713023542596, "loss": 0.6509, "step": 36000 }, { "epoch": 0.8015046296296297, "grad_norm": 0.5352987051010132, "learning_rate": 0.00033363244619614074, "loss": 0.6707, "step": 36010 }, { "epoch": 0.801727207977208, "grad_norm": 0.6848888993263245, "learning_rate": 0.00033359775489515906, "loss": 0.685, "step": 36020 }, { "epoch": 0.8019497863247863, "grad_norm": 0.8171915411949158, "learning_rate": 0.000333563056334366, "loss": 0.5364, "step": 36030 }, { "epoch": 0.8021723646723646, "grad_norm": 0.6944659352302551, "learning_rate": 0.0003335283505156471, "loss": 0.6257, "step": 36040 }, { "epoch": 0.8023949430199431, "grad_norm": 0.744377851486206, "learning_rate": 0.00033349363744088835, "loss": 0.7668, "step": 36050 }, { "epoch": 0.8026175213675214, "grad_norm": 0.5722485184669495, "learning_rate": 0.00033345891711197595, "loss": 0.6662, "step": 36060 }, { "epoch": 0.8028400997150997, "grad_norm": 0.7393105626106262, "learning_rate": 0.0003334241895307969, "loss": 0.6016, "step": 36070 }, { "epoch": 0.8030626780626781, "grad_norm": 0.37272927165031433, "learning_rate": 0.0003333894546992381, "loss": 0.5908, "step": 36080 }, { "epoch": 0.8032852564102564, "grad_norm": 0.8427095413208008, "learning_rate": 0.0003333547126191871, "loss": 0.4997, "step": 36090 }, { "epoch": 0.8035078347578347, "grad_norm": 0.641094982624054, "learning_rate": 0.00033331996329253184, "loss": 0.7035, "step": 36100 }, { "epoch": 0.8037304131054132, "grad_norm": 0.6963313221931458, "learning_rate": 0.0003332852067211607, "loss": 0.7224, "step": 36110 }, { "epoch": 0.8039529914529915, "grad_norm": 0.5846840739250183, "learning_rate": 0.0003332504429069623, "loss": 0.592, "step": 36120 }, { "epoch": 0.8041755698005698, "grad_norm": 0.67938232421875, "learning_rate": 0.0003332156718518257, "loss": 0.7094, "step": 36130 }, { "epoch": 0.8043981481481481, "grad_norm": 0.8473862409591675, "learning_rate": 0.00033318089355764046, "loss": 0.6621, "step": 36140 }, { "epoch": 0.8046207264957265, "grad_norm": 0.665254533290863, "learning_rate": 0.00033314610802629644, "loss": 0.6013, "step": 36150 }, { "epoch": 0.8048433048433048, "grad_norm": 0.4536101818084717, "learning_rate": 0.0003331113152596839, "loss": 0.5538, "step": 36160 }, { "epoch": 0.8050658831908832, "grad_norm": 0.6429980993270874, "learning_rate": 0.00033307651525969355, "loss": 0.7499, "step": 36170 }, { "epoch": 0.8052884615384616, "grad_norm": 0.9357435703277588, "learning_rate": 0.0003330417080282164, "loss": 0.6593, "step": 36180 }, { "epoch": 0.8055110398860399, "grad_norm": 0.6779121160507202, "learning_rate": 0.0003330068935671439, "loss": 0.6892, "step": 36190 }, { "epoch": 0.8057336182336182, "grad_norm": 0.587925910949707, "learning_rate": 0.0003329720718783679, "loss": 0.525, "step": 36200 }, { "epoch": 0.8059561965811965, "grad_norm": 0.9141523241996765, "learning_rate": 0.00033293724296378077, "loss": 0.62, "step": 36210 }, { "epoch": 0.8061787749287749, "grad_norm": 0.7711586356163025, "learning_rate": 0.0003329024068252749, "loss": 0.6833, "step": 36220 }, { "epoch": 0.8064013532763533, "grad_norm": 0.722883939743042, "learning_rate": 0.00033286756346474354, "loss": 0.6203, "step": 36230 }, { "epoch": 0.8066239316239316, "grad_norm": 0.6778685450553894, "learning_rate": 0.0003328327128840799, "loss": 0.6965, "step": 36240 }, { "epoch": 0.80684650997151, "grad_norm": 0.47538506984710693, "learning_rate": 0.00033279785508517803, "loss": 0.6006, "step": 36250 }, { "epoch": 0.8070690883190883, "grad_norm": 0.7078222632408142, "learning_rate": 0.000332762990069932, "loss": 0.6305, "step": 36260 }, { "epoch": 0.8072916666666666, "grad_norm": 1.0178117752075195, "learning_rate": 0.00033272811784023623, "loss": 0.7016, "step": 36270 }, { "epoch": 0.8075142450142451, "grad_norm": 0.5266718864440918, "learning_rate": 0.0003326932383979861, "loss": 0.565, "step": 36280 }, { "epoch": 0.8077368233618234, "grad_norm": 0.4685933589935303, "learning_rate": 0.00033265835174507664, "loss": 0.4594, "step": 36290 }, { "epoch": 0.8079594017094017, "grad_norm": 0.7532642483711243, "learning_rate": 0.00033262345788340376, "loss": 0.5572, "step": 36300 }, { "epoch": 0.80818198005698, "grad_norm": 0.7520643472671509, "learning_rate": 0.0003325885568148636, "loss": 0.5862, "step": 36310 }, { "epoch": 0.8084045584045584, "grad_norm": 0.6257230043411255, "learning_rate": 0.00033255364854135275, "loss": 0.5583, "step": 36320 }, { "epoch": 0.8086271367521367, "grad_norm": 0.7662017941474915, "learning_rate": 0.00033251873306476814, "loss": 0.5729, "step": 36330 }, { "epoch": 0.8088497150997151, "grad_norm": 0.7460561990737915, "learning_rate": 0.0003324838103870071, "loss": 0.6806, "step": 36340 }, { "epoch": 0.8090722934472935, "grad_norm": 0.9843911528587341, "learning_rate": 0.0003324488805099673, "loss": 0.7306, "step": 36350 }, { "epoch": 0.8092948717948718, "grad_norm": 0.5303930640220642, "learning_rate": 0.0003324139434355469, "loss": 0.5246, "step": 36360 }, { "epoch": 0.8095174501424501, "grad_norm": 0.5533699989318848, "learning_rate": 0.0003323789991656444, "loss": 0.6466, "step": 36370 }, { "epoch": 0.8097400284900285, "grad_norm": 0.6484217643737793, "learning_rate": 0.0003323440477021587, "loss": 0.5728, "step": 36380 }, { "epoch": 0.8099626068376068, "grad_norm": 0.6987613439559937, "learning_rate": 0.0003323090890469892, "loss": 0.5621, "step": 36390 }, { "epoch": 0.8101851851851852, "grad_norm": 0.9706330299377441, "learning_rate": 0.00033227412320203546, "loss": 0.655, "step": 36400 }, { "epoch": 0.8104077635327636, "grad_norm": 0.5594670176506042, "learning_rate": 0.00033223915016919757, "loss": 0.7187, "step": 36410 }, { "epoch": 0.8106303418803419, "grad_norm": 0.5286457538604736, "learning_rate": 0.00033220416995037604, "loss": 0.5207, "step": 36420 }, { "epoch": 0.8108529202279202, "grad_norm": 0.49551501870155334, "learning_rate": 0.0003321691825474716, "loss": 0.4212, "step": 36430 }, { "epoch": 0.8110754985754985, "grad_norm": 0.6061522364616394, "learning_rate": 0.00033213418796238566, "loss": 0.5597, "step": 36440 }, { "epoch": 0.8112980769230769, "grad_norm": 0.9590118527412415, "learning_rate": 0.0003320991861970197, "loss": 0.6513, "step": 36450 }, { "epoch": 0.8115206552706553, "grad_norm": 0.7732698321342468, "learning_rate": 0.0003320641772532759, "loss": 0.6888, "step": 36460 }, { "epoch": 0.8117432336182336, "grad_norm": 0.7111770510673523, "learning_rate": 0.00033202916113305657, "loss": 0.6541, "step": 36470 }, { "epoch": 0.811965811965812, "grad_norm": 0.6621142625808716, "learning_rate": 0.0003319941378382645, "loss": 0.6328, "step": 36480 }, { "epoch": 0.8121883903133903, "grad_norm": 0.5446364283561707, "learning_rate": 0.00033195910737080295, "loss": 0.5501, "step": 36490 }, { "epoch": 0.8124109686609686, "grad_norm": 0.7416331768035889, "learning_rate": 0.00033192406973257555, "loss": 0.6579, "step": 36500 }, { "epoch": 0.8126335470085471, "grad_norm": 0.7212896943092346, "learning_rate": 0.0003318890249254861, "loss": 0.7042, "step": 36510 }, { "epoch": 0.8128561253561254, "grad_norm": 0.5195409655570984, "learning_rate": 0.0003318539729514391, "loss": 0.6703, "step": 36520 }, { "epoch": 0.8130787037037037, "grad_norm": 0.572192370891571, "learning_rate": 0.0003318189138123392, "loss": 0.5075, "step": 36530 }, { "epoch": 0.813301282051282, "grad_norm": 0.4907376766204834, "learning_rate": 0.0003317838475100918, "loss": 0.6511, "step": 36540 }, { "epoch": 0.8135238603988604, "grad_norm": 0.7664365768432617, "learning_rate": 0.0003317487740466021, "loss": 0.4856, "step": 36550 }, { "epoch": 0.8137464387464387, "grad_norm": 0.5438021421432495, "learning_rate": 0.00033171369342377616, "loss": 0.6068, "step": 36560 }, { "epoch": 0.8139690170940171, "grad_norm": 0.5583726763725281, "learning_rate": 0.00033167860564352027, "loss": 0.6175, "step": 36570 }, { "epoch": 0.8141915954415955, "grad_norm": 0.5016492009162903, "learning_rate": 0.00033164351070774124, "loss": 0.6092, "step": 36580 }, { "epoch": 0.8144141737891738, "grad_norm": 1.1724473237991333, "learning_rate": 0.0003316084086183461, "loss": 0.5914, "step": 36590 }, { "epoch": 0.8146367521367521, "grad_norm": 0.5008374452590942, "learning_rate": 0.00033157329937724217, "loss": 0.6082, "step": 36600 }, { "epoch": 0.8148593304843305, "grad_norm": 0.4436315596103668, "learning_rate": 0.0003315381829863375, "loss": 0.6483, "step": 36610 }, { "epoch": 0.8150819088319088, "grad_norm": 1.088879942893982, "learning_rate": 0.0003315030594475403, "loss": 0.7563, "step": 36620 }, { "epoch": 0.8153044871794872, "grad_norm": 0.5001720190048218, "learning_rate": 0.00033146792876275914, "loss": 0.5567, "step": 36630 }, { "epoch": 0.8155270655270656, "grad_norm": 0.7010923624038696, "learning_rate": 0.00033143279093390316, "loss": 0.6492, "step": 36640 }, { "epoch": 0.8157496438746439, "grad_norm": 0.7686439752578735, "learning_rate": 0.0003313976459628817, "loss": 0.5096, "step": 36650 }, { "epoch": 0.8159722222222222, "grad_norm": 1.0666576623916626, "learning_rate": 0.0003313624938516046, "loss": 0.5674, "step": 36660 }, { "epoch": 0.8161948005698005, "grad_norm": 0.8343584537506104, "learning_rate": 0.0003313273346019821, "loss": 0.6541, "step": 36670 }, { "epoch": 0.8164173789173789, "grad_norm": 0.48941463232040405, "learning_rate": 0.00033129216821592465, "loss": 0.5769, "step": 36680 }, { "epoch": 0.8166399572649573, "grad_norm": 0.5967426300048828, "learning_rate": 0.00033125699469534333, "loss": 0.6193, "step": 36690 }, { "epoch": 0.8168625356125356, "grad_norm": 0.6565297245979309, "learning_rate": 0.0003312218140421495, "loss": 0.5518, "step": 36700 }, { "epoch": 0.817085113960114, "grad_norm": 0.7968794107437134, "learning_rate": 0.0003311866262582548, "loss": 0.564, "step": 36710 }, { "epoch": 0.8173076923076923, "grad_norm": 0.8082167506217957, "learning_rate": 0.00033115143134557147, "loss": 0.5872, "step": 36720 }, { "epoch": 0.8175302706552706, "grad_norm": 0.5784871578216553, "learning_rate": 0.00033111622930601196, "loss": 0.595, "step": 36730 }, { "epoch": 0.8177528490028491, "grad_norm": 0.8299255967140198, "learning_rate": 0.0003310810201414893, "loss": 0.5521, "step": 36740 }, { "epoch": 0.8179754273504274, "grad_norm": 0.6350299715995789, "learning_rate": 0.0003310458038539166, "loss": 0.6748, "step": 36750 }, { "epoch": 0.8181980056980057, "grad_norm": 0.5850403308868408, "learning_rate": 0.0003310105804452077, "loss": 0.6192, "step": 36760 }, { "epoch": 0.818420584045584, "grad_norm": 0.6726582050323486, "learning_rate": 0.0003309753499172766, "loss": 0.6385, "step": 36770 }, { "epoch": 0.8186431623931624, "grad_norm": 0.49928078055381775, "learning_rate": 0.00033094011227203775, "loss": 0.5801, "step": 36780 }, { "epoch": 0.8188657407407407, "grad_norm": 0.5119436979293823, "learning_rate": 0.00033090486751140606, "loss": 0.7312, "step": 36790 }, { "epoch": 0.8190883190883191, "grad_norm": 0.48218220472335815, "learning_rate": 0.00033086961563729664, "loss": 0.4542, "step": 36800 }, { "epoch": 0.8193108974358975, "grad_norm": 0.7016624212265015, "learning_rate": 0.00033083435665162524, "loss": 0.6112, "step": 36810 }, { "epoch": 0.8195334757834758, "grad_norm": 0.5343238115310669, "learning_rate": 0.0003307990905563077, "loss": 0.4985, "step": 36820 }, { "epoch": 0.8197560541310541, "grad_norm": 1.0482207536697388, "learning_rate": 0.0003307638173532605, "loss": 0.4824, "step": 36830 }, { "epoch": 0.8199786324786325, "grad_norm": 0.6126405596733093, "learning_rate": 0.00033072853704440046, "loss": 0.5322, "step": 36840 }, { "epoch": 0.8202012108262108, "grad_norm": 0.8611001372337341, "learning_rate": 0.00033069324963164474, "loss": 0.7653, "step": 36850 }, { "epoch": 0.8204237891737892, "grad_norm": 0.5939532518386841, "learning_rate": 0.0003306579551169108, "loss": 0.6571, "step": 36860 }, { "epoch": 0.8206463675213675, "grad_norm": 1.0668377876281738, "learning_rate": 0.0003306226535021166, "loss": 0.7, "step": 36870 }, { "epoch": 0.8208689458689459, "grad_norm": 0.8537008762359619, "learning_rate": 0.0003305873447891806, "loss": 0.8041, "step": 36880 }, { "epoch": 0.8210915242165242, "grad_norm": 0.8350732922554016, "learning_rate": 0.0003305520289800212, "loss": 0.5502, "step": 36890 }, { "epoch": 0.8213141025641025, "grad_norm": 0.7095655202865601, "learning_rate": 0.0003305167060765578, "loss": 0.5797, "step": 36900 }, { "epoch": 0.8215366809116809, "grad_norm": 0.607930600643158, "learning_rate": 0.0003304813760807097, "loss": 0.6496, "step": 36910 }, { "epoch": 0.8217592592592593, "grad_norm": 0.7336176037788391, "learning_rate": 0.00033044603899439677, "loss": 0.5963, "step": 36920 }, { "epoch": 0.8219818376068376, "grad_norm": 0.4098057150840759, "learning_rate": 0.00033041069481953936, "loss": 0.5506, "step": 36930 }, { "epoch": 0.822204415954416, "grad_norm": 0.8852785229682922, "learning_rate": 0.0003303753435580579, "loss": 0.5608, "step": 36940 }, { "epoch": 0.8224269943019943, "grad_norm": 0.6913530826568604, "learning_rate": 0.00033033998521187375, "loss": 0.494, "step": 36950 }, { "epoch": 0.8226495726495726, "grad_norm": 0.8665242195129395, "learning_rate": 0.0003303046197829079, "loss": 0.5654, "step": 36960 }, { "epoch": 0.8228721509971509, "grad_norm": 0.41242697834968567, "learning_rate": 0.0003302692472730825, "loss": 0.5263, "step": 36970 }, { "epoch": 0.8230947293447294, "grad_norm": 0.4868178367614746, "learning_rate": 0.00033023386768431945, "loss": 0.7242, "step": 36980 }, { "epoch": 0.8233173076923077, "grad_norm": 0.8877532482147217, "learning_rate": 0.00033019848101854143, "loss": 0.6203, "step": 36990 }, { "epoch": 0.823539886039886, "grad_norm": 0.5433663725852966, "learning_rate": 0.00033016308727767143, "loss": 0.5508, "step": 37000 }, { "epoch": 0.8237624643874644, "grad_norm": 0.82059645652771, "learning_rate": 0.0003301276864636327, "loss": 0.6424, "step": 37010 }, { "epoch": 0.8239850427350427, "grad_norm": 0.4982110857963562, "learning_rate": 0.0003300922785783489, "loss": 0.592, "step": 37020 }, { "epoch": 0.8242076210826211, "grad_norm": 0.6042728424072266, "learning_rate": 0.0003300568636237442, "loss": 0.6052, "step": 37030 }, { "epoch": 0.8244301994301995, "grad_norm": 0.6098039746284485, "learning_rate": 0.0003300214416017431, "loss": 0.7938, "step": 37040 }, { "epoch": 0.8246527777777778, "grad_norm": 0.8635123372077942, "learning_rate": 0.00032998601251427043, "loss": 0.7422, "step": 37050 }, { "epoch": 0.8248753561253561, "grad_norm": 0.842786967754364, "learning_rate": 0.00032995057636325137, "loss": 0.5865, "step": 37060 }, { "epoch": 0.8250979344729344, "grad_norm": 0.703381359577179, "learning_rate": 0.00032991513315061165, "loss": 0.5967, "step": 37070 }, { "epoch": 0.8253205128205128, "grad_norm": 0.652272641658783, "learning_rate": 0.00032987968287827724, "loss": 0.5933, "step": 37080 }, { "epoch": 0.8255430911680912, "grad_norm": 0.467128187417984, "learning_rate": 0.00032984422554817447, "loss": 0.5171, "step": 37090 }, { "epoch": 0.8257656695156695, "grad_norm": 0.7063168287277222, "learning_rate": 0.0003298087611622303, "loss": 0.5875, "step": 37100 }, { "epoch": 0.8259882478632479, "grad_norm": 0.3760249614715576, "learning_rate": 0.0003297732897223717, "loss": 0.5645, "step": 37110 }, { "epoch": 0.8262108262108262, "grad_norm": 0.796840488910675, "learning_rate": 0.0003297378112305263, "loss": 0.6644, "step": 37120 }, { "epoch": 0.8264334045584045, "grad_norm": 0.6814299821853638, "learning_rate": 0.0003297023256886221, "loss": 0.5546, "step": 37130 }, { "epoch": 0.8266559829059829, "grad_norm": 0.7078778147697449, "learning_rate": 0.0003296668330985873, "loss": 0.582, "step": 37140 }, { "epoch": 0.8268785612535613, "grad_norm": 0.5444463491439819, "learning_rate": 0.00032963133346235066, "loss": 0.4931, "step": 37150 }, { "epoch": 0.8271011396011396, "grad_norm": 0.7480208873748779, "learning_rate": 0.0003295958267818412, "loss": 0.6201, "step": 37160 }, { "epoch": 0.827323717948718, "grad_norm": 0.6348940134048462, "learning_rate": 0.00032956031305898836, "loss": 0.5632, "step": 37170 }, { "epoch": 0.8275462962962963, "grad_norm": 0.43813246488571167, "learning_rate": 0.0003295247922957222, "loss": 0.6038, "step": 37180 }, { "epoch": 0.8277688746438746, "grad_norm": 0.6713902950286865, "learning_rate": 0.00032948926449397265, "loss": 0.5168, "step": 37190 }, { "epoch": 0.8279914529914529, "grad_norm": 0.7052330374717712, "learning_rate": 0.0003294537296556706, "loss": 0.5426, "step": 37200 }, { "epoch": 0.8282140313390314, "grad_norm": 0.6007211804389954, "learning_rate": 0.00032941818778274676, "loss": 0.5749, "step": 37210 }, { "epoch": 0.8284366096866097, "grad_norm": 0.5694396495819092, "learning_rate": 0.00032938263887713275, "loss": 0.4963, "step": 37220 }, { "epoch": 0.828659188034188, "grad_norm": 0.8777496218681335, "learning_rate": 0.0003293470829407602, "loss": 0.6211, "step": 37230 }, { "epoch": 0.8288817663817664, "grad_norm": 0.8059148788452148, "learning_rate": 0.00032931151997556124, "loss": 0.6515, "step": 37240 }, { "epoch": 0.8291043447293447, "grad_norm": 0.6097303628921509, "learning_rate": 0.00032927594998346854, "loss": 0.6599, "step": 37250 }, { "epoch": 0.8293269230769231, "grad_norm": 0.8829268217086792, "learning_rate": 0.00032924037296641476, "loss": 0.619, "step": 37260 }, { "epoch": 0.8295495014245015, "grad_norm": 0.7852076292037964, "learning_rate": 0.0003292047889263334, "loss": 0.6172, "step": 37270 }, { "epoch": 0.8297720797720798, "grad_norm": 0.6745901703834534, "learning_rate": 0.00032916919786515794, "loss": 0.5426, "step": 37280 }, { "epoch": 0.8299946581196581, "grad_norm": 0.5950226783752441, "learning_rate": 0.00032913359978482256, "loss": 0.6676, "step": 37290 }, { "epoch": 0.8302172364672364, "grad_norm": 0.5079329609870911, "learning_rate": 0.0003290979946872617, "loss": 0.5057, "step": 37300 }, { "epoch": 0.8304398148148148, "grad_norm": 0.5150577425956726, "learning_rate": 0.0003290623825744101, "loss": 0.4958, "step": 37310 }, { "epoch": 0.8306623931623932, "grad_norm": 0.6359812617301941, "learning_rate": 0.000329026763448203, "loss": 0.5448, "step": 37320 }, { "epoch": 0.8308849715099715, "grad_norm": 0.9012143611907959, "learning_rate": 0.0003289911373105759, "loss": 0.5756, "step": 37330 }, { "epoch": 0.8311075498575499, "grad_norm": 0.5244525074958801, "learning_rate": 0.00032895550416346485, "loss": 0.5128, "step": 37340 }, { "epoch": 0.8313301282051282, "grad_norm": 0.5555477142333984, "learning_rate": 0.00032891986400880607, "loss": 0.6683, "step": 37350 }, { "epoch": 0.8315527065527065, "grad_norm": 0.5338780879974365, "learning_rate": 0.0003288842168485364, "loss": 0.582, "step": 37360 }, { "epoch": 0.8317752849002849, "grad_norm": 0.5661174654960632, "learning_rate": 0.00032884856268459284, "loss": 0.5865, "step": 37370 }, { "epoch": 0.8319978632478633, "grad_norm": 0.7979121804237366, "learning_rate": 0.0003288129015189129, "loss": 0.5493, "step": 37380 }, { "epoch": 0.8322204415954416, "grad_norm": 0.6640493869781494, "learning_rate": 0.0003287772333534345, "loss": 0.5727, "step": 37390 }, { "epoch": 0.83244301994302, "grad_norm": 0.6453659534454346, "learning_rate": 0.0003287415581900958, "loss": 0.5334, "step": 37400 }, { "epoch": 0.8326655982905983, "grad_norm": 0.8953363299369812, "learning_rate": 0.0003287058760308354, "loss": 0.8172, "step": 37410 }, { "epoch": 0.8328881766381766, "grad_norm": 0.5227009654045105, "learning_rate": 0.0003286701868775923, "loss": 0.6285, "step": 37420 }, { "epoch": 0.8331107549857549, "grad_norm": 0.6829435229301453, "learning_rate": 0.000328634490732306, "loss": 0.4957, "step": 37430 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6302551627159119, "learning_rate": 0.0003285987875969161, "loss": 0.5354, "step": 37440 }, { "epoch": 0.8335559116809117, "grad_norm": 0.7687264680862427, "learning_rate": 0.00032856307747336277, "loss": 0.6332, "step": 37450 }, { "epoch": 0.83377849002849, "grad_norm": 0.7602887749671936, "learning_rate": 0.0003285273603635867, "loss": 0.5534, "step": 37460 }, { "epoch": 0.8340010683760684, "grad_norm": 0.7089099884033203, "learning_rate": 0.00032849163626952853, "loss": 0.4901, "step": 37470 }, { "epoch": 0.8342236467236467, "grad_norm": 0.5121763944625854, "learning_rate": 0.0003284559051931297, "loss": 0.5413, "step": 37480 }, { "epoch": 0.8344462250712251, "grad_norm": 0.38285666704177856, "learning_rate": 0.00032842016713633185, "loss": 0.5369, "step": 37490 }, { "epoch": 0.8346688034188035, "grad_norm": 0.7840296030044556, "learning_rate": 0.00032838442210107694, "loss": 0.4726, "step": 37500 }, { "epoch": 0.8348913817663818, "grad_norm": 0.8556367754936218, "learning_rate": 0.00032834867008930745, "loss": 0.5818, "step": 37510 }, { "epoch": 0.8351139601139601, "grad_norm": 0.5417022109031677, "learning_rate": 0.0003283129111029662, "loss": 0.5866, "step": 37520 }, { "epoch": 0.8353365384615384, "grad_norm": 0.7985814809799194, "learning_rate": 0.0003282771451439963, "loss": 0.7183, "step": 37530 }, { "epoch": 0.8355591168091168, "grad_norm": 0.6315109133720398, "learning_rate": 0.0003282413722143413, "loss": 0.6315, "step": 37540 }, { "epoch": 0.8357816951566952, "grad_norm": 0.45857003331184387, "learning_rate": 0.00032820559231594513, "loss": 0.5245, "step": 37550 }, { "epoch": 0.8360042735042735, "grad_norm": 0.6412796378135681, "learning_rate": 0.00032816980545075216, "loss": 0.6218, "step": 37560 }, { "epoch": 0.8362268518518519, "grad_norm": 0.5888071060180664, "learning_rate": 0.000328134011620707, "loss": 0.57, "step": 37570 }, { "epoch": 0.8364494301994302, "grad_norm": 0.5844222903251648, "learning_rate": 0.0003280982108277548, "loss": 0.6996, "step": 37580 }, { "epoch": 0.8366720085470085, "grad_norm": 0.6059329509735107, "learning_rate": 0.0003280624030738409, "loss": 0.5442, "step": 37590 }, { "epoch": 0.8368945868945868, "grad_norm": 0.38470956683158875, "learning_rate": 0.00032802658836091124, "loss": 0.5368, "step": 37600 }, { "epoch": 0.8371171652421653, "grad_norm": 0.8118173480033875, "learning_rate": 0.0003279907666909119, "loss": 0.6644, "step": 37610 }, { "epoch": 0.8373397435897436, "grad_norm": 0.5427059531211853, "learning_rate": 0.0003279549380657896, "loss": 0.5515, "step": 37620 }, { "epoch": 0.8375623219373219, "grad_norm": 0.7684349417686462, "learning_rate": 0.0003279191024874911, "loss": 0.7565, "step": 37630 }, { "epoch": 0.8377849002849003, "grad_norm": 0.8485226631164551, "learning_rate": 0.00032788325995796396, "loss": 0.6957, "step": 37640 }, { "epoch": 0.8380074786324786, "grad_norm": 1.1634833812713623, "learning_rate": 0.0003278474104791557, "loss": 0.6749, "step": 37650 }, { "epoch": 0.8382300569800569, "grad_norm": 0.7527285814285278, "learning_rate": 0.0003278115540530146, "loss": 0.5256, "step": 37660 }, { "epoch": 0.8384526353276354, "grad_norm": 0.46826934814453125, "learning_rate": 0.00032777569068148893, "loss": 0.6188, "step": 37670 }, { "epoch": 0.8386752136752137, "grad_norm": 0.6066411137580872, "learning_rate": 0.00032773982036652765, "loss": 0.6407, "step": 37680 }, { "epoch": 0.838897792022792, "grad_norm": 0.8002700805664062, "learning_rate": 0.00032770394311007993, "loss": 0.6213, "step": 37690 }, { "epoch": 0.8391203703703703, "grad_norm": 0.5692296028137207, "learning_rate": 0.0003276680589140955, "loss": 0.7038, "step": 37700 }, { "epoch": 0.8393429487179487, "grad_norm": 0.5736252665519714, "learning_rate": 0.0003276321677805241, "loss": 0.531, "step": 37710 }, { "epoch": 0.8395655270655271, "grad_norm": 0.6136668920516968, "learning_rate": 0.0003275962697113163, "loss": 0.5928, "step": 37720 }, { "epoch": 0.8397881054131054, "grad_norm": 0.8906228542327881, "learning_rate": 0.00032756036470842277, "loss": 0.6129, "step": 37730 }, { "epoch": 0.8400106837606838, "grad_norm": 0.7700905203819275, "learning_rate": 0.0003275244527737945, "loss": 0.5606, "step": 37740 }, { "epoch": 0.8400997150997151, "eval_loss": 0.6122514605522156, "eval_runtime": 337.0919, "eval_samples_per_second": 7.016, "eval_steps_per_second": 7.016, "step": 37744 }, { "epoch": 0.8402332621082621, "grad_norm": 0.7092208862304688, "learning_rate": 0.00032748853390938314, "loss": 0.573, "step": 37750 }, { "epoch": 0.8404558404558404, "grad_norm": 0.6493269801139832, "learning_rate": 0.00032745260811714046, "loss": 0.5398, "step": 37760 }, { "epoch": 0.8406784188034188, "grad_norm": 0.7034374475479126, "learning_rate": 0.00032741667539901875, "loss": 0.593, "step": 37770 }, { "epoch": 0.8409009971509972, "grad_norm": 0.7925833463668823, "learning_rate": 0.00032738073575697054, "loss": 0.5922, "step": 37780 }, { "epoch": 0.8411235754985755, "grad_norm": 0.7417179942131042, "learning_rate": 0.000327344789192949, "loss": 0.527, "step": 37790 }, { "epoch": 0.8413461538461539, "grad_norm": 0.5861806869506836, "learning_rate": 0.0003273088357089072, "loss": 0.6919, "step": 37800 }, { "epoch": 0.8415687321937322, "grad_norm": 0.7780978083610535, "learning_rate": 0.00032727287530679914, "loss": 0.6734, "step": 37810 }, { "epoch": 0.8417913105413105, "grad_norm": 0.6386851072311401, "learning_rate": 0.00032723690798857876, "loss": 0.6519, "step": 37820 }, { "epoch": 0.8420138888888888, "grad_norm": 0.7099935412406921, "learning_rate": 0.00032720093375620065, "loss": 0.5304, "step": 37830 }, { "epoch": 0.8422364672364673, "grad_norm": 0.6465808153152466, "learning_rate": 0.0003271649526116198, "loss": 0.6308, "step": 37840 }, { "epoch": 0.8424590455840456, "grad_norm": 0.4721263349056244, "learning_rate": 0.00032712896455679125, "loss": 0.6353, "step": 37850 }, { "epoch": 0.8426816239316239, "grad_norm": 0.46420302987098694, "learning_rate": 0.0003270929695936706, "loss": 0.6307, "step": 37860 }, { "epoch": 0.8429042022792023, "grad_norm": 0.8225671052932739, "learning_rate": 0.00032705696772421407, "loss": 0.6044, "step": 37870 }, { "epoch": 0.8431267806267806, "grad_norm": 0.6508604288101196, "learning_rate": 0.00032702095895037784, "loss": 0.6072, "step": 37880 }, { "epoch": 0.8433493589743589, "grad_norm": 0.47768697142601013, "learning_rate": 0.0003269849432741187, "loss": 0.5711, "step": 37890 }, { "epoch": 0.8435719373219374, "grad_norm": 0.7702714204788208, "learning_rate": 0.00032694892069739384, "loss": 0.739, "step": 37900 }, { "epoch": 0.8437945156695157, "grad_norm": 0.6471502780914307, "learning_rate": 0.0003269128912221607, "loss": 0.5683, "step": 37910 }, { "epoch": 0.844017094017094, "grad_norm": 0.5908994078636169, "learning_rate": 0.0003268768548503771, "loss": 0.64, "step": 37920 }, { "epoch": 0.8442396723646723, "grad_norm": 0.5731729865074158, "learning_rate": 0.00032684081158400135, "loss": 0.6329, "step": 37930 }, { "epoch": 0.8444622507122507, "grad_norm": 0.4691145718097687, "learning_rate": 0.0003268047614249921, "loss": 0.5713, "step": 37940 }, { "epoch": 0.8446848290598291, "grad_norm": 0.5721814036369324, "learning_rate": 0.0003267687043753083, "loss": 0.5048, "step": 37950 }, { "epoch": 0.8449074074074074, "grad_norm": 1.1615492105484009, "learning_rate": 0.0003267326404369093, "loss": 0.8903, "step": 37960 }, { "epoch": 0.8451299857549858, "grad_norm": 0.6510785818099976, "learning_rate": 0.0003266965696117549, "loss": 0.6599, "step": 37970 }, { "epoch": 0.8453525641025641, "grad_norm": 0.7093966007232666, "learning_rate": 0.0003266604919018052, "loss": 0.6341, "step": 37980 }, { "epoch": 0.8455751424501424, "grad_norm": 0.8076547980308533, "learning_rate": 0.0003266244073090206, "loss": 0.505, "step": 37990 }, { "epoch": 0.8457977207977208, "grad_norm": 1.5181430578231812, "learning_rate": 0.00032658831583536215, "loss": 0.6225, "step": 38000 }, { "epoch": 0.8460202991452992, "grad_norm": 1.261081576347351, "learning_rate": 0.00032655221748279097, "loss": 0.6192, "step": 38010 }, { "epoch": 0.8462428774928775, "grad_norm": 0.7477047443389893, "learning_rate": 0.00032651611225326864, "loss": 0.5815, "step": 38020 }, { "epoch": 0.8464654558404558, "grad_norm": 0.582606315612793, "learning_rate": 0.00032648000014875723, "loss": 0.5552, "step": 38030 }, { "epoch": 0.8466880341880342, "grad_norm": 0.5662571787834167, "learning_rate": 0.00032644388117121905, "loss": 0.6245, "step": 38040 }, { "epoch": 0.8469106125356125, "grad_norm": 0.9054650664329529, "learning_rate": 0.0003264077553226169, "loss": 0.6222, "step": 38050 }, { "epoch": 0.8471331908831908, "grad_norm": 1.1601990461349487, "learning_rate": 0.00032637162260491386, "loss": 0.6315, "step": 38060 }, { "epoch": 0.8473557692307693, "grad_norm": 0.6006155610084534, "learning_rate": 0.0003263354830200733, "loss": 0.7051, "step": 38070 }, { "epoch": 0.8475783475783476, "grad_norm": 0.7477725148200989, "learning_rate": 0.0003262993365700592, "loss": 0.5918, "step": 38080 }, { "epoch": 0.8478009259259259, "grad_norm": 0.5139760971069336, "learning_rate": 0.0003262631832568358, "loss": 0.6576, "step": 38090 }, { "epoch": 0.8480235042735043, "grad_norm": 0.5229126214981079, "learning_rate": 0.00032622702308236757, "loss": 0.4205, "step": 38100 }, { "epoch": 0.8482460826210826, "grad_norm": 0.6959442496299744, "learning_rate": 0.0003261908560486197, "loss": 0.6536, "step": 38110 }, { "epoch": 0.8484686609686609, "grad_norm": 0.5733725428581238, "learning_rate": 0.0003261546821575573, "loss": 0.5371, "step": 38120 }, { "epoch": 0.8486912393162394, "grad_norm": 0.4880182445049286, "learning_rate": 0.00032611850141114624, "loss": 0.4979, "step": 38130 }, { "epoch": 0.8489138176638177, "grad_norm": 0.7861484289169312, "learning_rate": 0.0003260823138113526, "loss": 0.5845, "step": 38140 }, { "epoch": 0.849136396011396, "grad_norm": 0.8785387277603149, "learning_rate": 0.0003260461193601428, "loss": 0.769, "step": 38150 }, { "epoch": 0.8493589743589743, "grad_norm": 0.9648477435112, "learning_rate": 0.0003260099180594836, "loss": 0.7094, "step": 38160 }, { "epoch": 0.8495815527065527, "grad_norm": 0.475644052028656, "learning_rate": 0.00032597370991134235, "loss": 0.5446, "step": 38170 }, { "epoch": 0.8498041310541311, "grad_norm": 0.6651561260223389, "learning_rate": 0.00032593749491768663, "loss": 0.5502, "step": 38180 }, { "epoch": 0.8500267094017094, "grad_norm": 0.6330234408378601, "learning_rate": 0.0003259012730804843, "loss": 0.6349, "step": 38190 }, { "epoch": 0.8502492877492878, "grad_norm": 0.7780224084854126, "learning_rate": 0.0003258650444017037, "loss": 0.7218, "step": 38200 }, { "epoch": 0.8504718660968661, "grad_norm": 0.6567363142967224, "learning_rate": 0.0003258288088833136, "loss": 0.5092, "step": 38210 }, { "epoch": 0.8506944444444444, "grad_norm": 0.7575187683105469, "learning_rate": 0.000325792566527283, "loss": 0.4875, "step": 38220 }, { "epoch": 0.8509170227920227, "grad_norm": 0.6900361776351929, "learning_rate": 0.00032575631733558133, "loss": 0.6574, "step": 38230 }, { "epoch": 0.8511396011396012, "grad_norm": 1.2023710012435913, "learning_rate": 0.00032572006131017844, "loss": 0.5447, "step": 38240 }, { "epoch": 0.8513621794871795, "grad_norm": 0.594343900680542, "learning_rate": 0.00032568379845304446, "loss": 0.4553, "step": 38250 }, { "epoch": 0.8515847578347578, "grad_norm": 0.7324010133743286, "learning_rate": 0.00032564752876615004, "loss": 0.5672, "step": 38260 }, { "epoch": 0.8518073361823362, "grad_norm": 0.6826527118682861, "learning_rate": 0.00032561125225146604, "loss": 0.6463, "step": 38270 }, { "epoch": 0.8520299145299145, "grad_norm": 0.8564441204071045, "learning_rate": 0.00032557496891096375, "loss": 0.5814, "step": 38280 }, { "epoch": 0.8522524928774928, "grad_norm": 0.6812259554862976, "learning_rate": 0.00032553867874661485, "loss": 0.6826, "step": 38290 }, { "epoch": 0.8524750712250713, "grad_norm": 0.7946615219116211, "learning_rate": 0.0003255023817603914, "loss": 0.6607, "step": 38300 }, { "epoch": 0.8526976495726496, "grad_norm": 0.5488380193710327, "learning_rate": 0.00032546607795426577, "loss": 0.5731, "step": 38310 }, { "epoch": 0.8529202279202279, "grad_norm": 0.7387109398841858, "learning_rate": 0.0003254297673302108, "loss": 0.6652, "step": 38320 }, { "epoch": 0.8531428062678063, "grad_norm": 1.0683709383010864, "learning_rate": 0.00032539344989019947, "loss": 0.5992, "step": 38330 }, { "epoch": 0.8533653846153846, "grad_norm": 0.5943742990493774, "learning_rate": 0.0003253571256362055, "loss": 0.6335, "step": 38340 }, { "epoch": 0.8535879629629629, "grad_norm": 0.7003427743911743, "learning_rate": 0.0003253207945702027, "loss": 0.5376, "step": 38350 }, { "epoch": 0.8538105413105413, "grad_norm": 0.6439740657806396, "learning_rate": 0.00032528445669416524, "loss": 0.6638, "step": 38360 }, { "epoch": 0.8540331196581197, "grad_norm": 0.45052602887153625, "learning_rate": 0.000325248112010068, "loss": 0.6284, "step": 38370 }, { "epoch": 0.854255698005698, "grad_norm": 0.8613705635070801, "learning_rate": 0.00032521176051988573, "loss": 0.5812, "step": 38380 }, { "epoch": 0.8544782763532763, "grad_norm": 0.9319939613342285, "learning_rate": 0.0003251754022255939, "loss": 0.6173, "step": 38390 }, { "epoch": 0.8547008547008547, "grad_norm": 0.8355406522750854, "learning_rate": 0.00032513903712916823, "loss": 0.6346, "step": 38400 }, { "epoch": 0.8549234330484331, "grad_norm": 0.6055408716201782, "learning_rate": 0.0003251026652325848, "loss": 0.5601, "step": 38410 }, { "epoch": 0.8551460113960114, "grad_norm": 0.6037227511405945, "learning_rate": 0.0003250662865378202, "loss": 0.5399, "step": 38420 }, { "epoch": 0.8553685897435898, "grad_norm": 0.7371780276298523, "learning_rate": 0.0003250299010468512, "loss": 0.5711, "step": 38430 }, { "epoch": 0.8555911680911681, "grad_norm": 0.4835084080696106, "learning_rate": 0.000324993508761655, "loss": 0.6015, "step": 38440 }, { "epoch": 0.8558137464387464, "grad_norm": 0.8498573303222656, "learning_rate": 0.0003249571096842092, "loss": 0.6905, "step": 38450 }, { "epoch": 0.8560363247863247, "grad_norm": 0.5593055486679077, "learning_rate": 0.00032492070381649177, "loss": 0.5372, "step": 38460 }, { "epoch": 0.8562589031339032, "grad_norm": 0.46001946926116943, "learning_rate": 0.0003248842911604811, "loss": 0.559, "step": 38470 }, { "epoch": 0.8564814814814815, "grad_norm": 0.7182387113571167, "learning_rate": 0.00032484787171815574, "loss": 0.7162, "step": 38480 }, { "epoch": 0.8567040598290598, "grad_norm": 0.8392935395240784, "learning_rate": 0.0003248114454914948, "loss": 0.7595, "step": 38490 }, { "epoch": 0.8569266381766382, "grad_norm": 0.8445121645927429, "learning_rate": 0.00032477501248247775, "loss": 0.5439, "step": 38500 }, { "epoch": 0.8571492165242165, "grad_norm": 0.8420943021774292, "learning_rate": 0.00032473857269308445, "loss": 0.6484, "step": 38510 }, { "epoch": 0.8573717948717948, "grad_norm": 0.5731983184814453, "learning_rate": 0.00032470212612529495, "loss": 0.6886, "step": 38520 }, { "epoch": 0.8575943732193733, "grad_norm": 0.7260280251502991, "learning_rate": 0.0003246656727810898, "loss": 0.5561, "step": 38530 }, { "epoch": 0.8578169515669516, "grad_norm": 0.8293136358261108, "learning_rate": 0.00032462921266245, "loss": 0.6339, "step": 38540 }, { "epoch": 0.8580395299145299, "grad_norm": 1.0204569101333618, "learning_rate": 0.0003245927457713567, "loss": 0.7484, "step": 38550 }, { "epoch": 0.8582621082621082, "grad_norm": 0.9848127961158752, "learning_rate": 0.0003245562721097916, "loss": 0.7431, "step": 38560 }, { "epoch": 0.8584846866096866, "grad_norm": 0.4912465810775757, "learning_rate": 0.00032451979167973674, "loss": 0.6141, "step": 38570 }, { "epoch": 0.8587072649572649, "grad_norm": 0.5672870874404907, "learning_rate": 0.00032448330448317444, "loss": 0.5866, "step": 38580 }, { "epoch": 0.8589298433048433, "grad_norm": 0.6375393867492676, "learning_rate": 0.0003244468105220875, "loss": 0.6703, "step": 38590 }, { "epoch": 0.8591524216524217, "grad_norm": 0.3784322142601013, "learning_rate": 0.00032441030979845893, "loss": 0.6274, "step": 38600 }, { "epoch": 0.859375, "grad_norm": 0.6850314736366272, "learning_rate": 0.0003243738023142723, "loss": 0.6696, "step": 38610 }, { "epoch": 0.8595975783475783, "grad_norm": 0.9629839062690735, "learning_rate": 0.00032433728807151153, "loss": 0.561, "step": 38620 }, { "epoch": 0.8598201566951567, "grad_norm": 0.885299563407898, "learning_rate": 0.00032430076707216064, "loss": 0.6774, "step": 38630 }, { "epoch": 0.8600427350427351, "grad_norm": 0.668006956577301, "learning_rate": 0.00032426423931820436, "loss": 0.5257, "step": 38640 }, { "epoch": 0.8602653133903134, "grad_norm": 0.9198821187019348, "learning_rate": 0.00032422770481162753, "loss": 0.6907, "step": 38650 }, { "epoch": 0.8604878917378918, "grad_norm": 0.6214486360549927, "learning_rate": 0.00032419116355441555, "loss": 0.7267, "step": 38660 }, { "epoch": 0.8607104700854701, "grad_norm": 0.7451410889625549, "learning_rate": 0.00032415461554855413, "loss": 0.5338, "step": 38670 }, { "epoch": 0.8609330484330484, "grad_norm": 0.43799296021461487, "learning_rate": 0.0003241180607960292, "loss": 0.6098, "step": 38680 }, { "epoch": 0.8611556267806267, "grad_norm": 0.693584680557251, "learning_rate": 0.00032408149929882726, "loss": 0.5971, "step": 38690 }, { "epoch": 0.8613782051282052, "grad_norm": 0.7107314467430115, "learning_rate": 0.00032404493105893503, "loss": 0.6508, "step": 38700 }, { "epoch": 0.8616007834757835, "grad_norm": 0.6193886399269104, "learning_rate": 0.00032400835607833975, "loss": 0.6652, "step": 38710 }, { "epoch": 0.8618233618233618, "grad_norm": 0.664205014705658, "learning_rate": 0.0003239717743590289, "loss": 0.6026, "step": 38720 }, { "epoch": 0.8620459401709402, "grad_norm": 0.7307686805725098, "learning_rate": 0.00032393518590299023, "loss": 0.6434, "step": 38730 }, { "epoch": 0.8622685185185185, "grad_norm": 0.815770149230957, "learning_rate": 0.0003238985907122122, "loss": 0.6246, "step": 38740 }, { "epoch": 0.8624910968660968, "grad_norm": 0.8578146696090698, "learning_rate": 0.0003238619887886833, "loss": 0.5791, "step": 38750 }, { "epoch": 0.8627136752136753, "grad_norm": 0.672818124294281, "learning_rate": 0.0003238253801343925, "loss": 0.9415, "step": 38760 }, { "epoch": 0.8629362535612536, "grad_norm": 0.6374601125717163, "learning_rate": 0.00032378876475132925, "loss": 0.6183, "step": 38770 }, { "epoch": 0.8631588319088319, "grad_norm": 0.4592694640159607, "learning_rate": 0.00032375214264148317, "loss": 0.5575, "step": 38780 }, { "epoch": 0.8633814102564102, "grad_norm": 0.5796618461608887, "learning_rate": 0.0003237155138068444, "loss": 0.5472, "step": 38790 }, { "epoch": 0.8636039886039886, "grad_norm": 0.5853553414344788, "learning_rate": 0.00032367887824940315, "loss": 0.5363, "step": 38800 }, { "epoch": 0.8638265669515669, "grad_norm": 0.7586563229560852, "learning_rate": 0.0003236422359711505, "loss": 0.652, "step": 38810 }, { "epoch": 0.8640491452991453, "grad_norm": 0.3684835135936737, "learning_rate": 0.00032360558697407755, "loss": 0.6623, "step": 38820 }, { "epoch": 0.8642717236467237, "grad_norm": 0.686464250087738, "learning_rate": 0.0003235689312601758, "loss": 0.5202, "step": 38830 }, { "epoch": 0.864494301994302, "grad_norm": 0.9118617177009583, "learning_rate": 0.00032353226883143716, "loss": 0.5926, "step": 38840 }, { "epoch": 0.8647168803418803, "grad_norm": 0.6207898259162903, "learning_rate": 0.00032349559968985396, "loss": 0.5538, "step": 38850 }, { "epoch": 0.8649394586894587, "grad_norm": 1.0657782554626465, "learning_rate": 0.0003234589238374187, "loss": 0.6224, "step": 38860 }, { "epoch": 0.8651620370370371, "grad_norm": 0.5780830979347229, "learning_rate": 0.0003234222412761245, "loss": 0.5131, "step": 38870 }, { "epoch": 0.8653846153846154, "grad_norm": 0.6197191476821899, "learning_rate": 0.00032338555200796466, "loss": 0.6776, "step": 38880 }, { "epoch": 0.8656071937321937, "grad_norm": 0.5804508924484253, "learning_rate": 0.00032334885603493293, "loss": 0.5172, "step": 38890 }, { "epoch": 0.8658297720797721, "grad_norm": 0.6176242232322693, "learning_rate": 0.00032331215335902337, "loss": 0.6256, "step": 38900 }, { "epoch": 0.8660523504273504, "grad_norm": 0.9177045226097107, "learning_rate": 0.0003232754439822304, "loss": 0.6757, "step": 38910 }, { "epoch": 0.8662749287749287, "grad_norm": 1.1486618518829346, "learning_rate": 0.00032323872790654894, "loss": 0.55, "step": 38920 }, { "epoch": 0.8664975071225072, "grad_norm": 0.6468950510025024, "learning_rate": 0.00032320200513397416, "loss": 0.6339, "step": 38930 }, { "epoch": 0.8667200854700855, "grad_norm": 0.7570953965187073, "learning_rate": 0.0003231652756665015, "loss": 0.7275, "step": 38940 }, { "epoch": 0.8669426638176638, "grad_norm": 0.7113020420074463, "learning_rate": 0.0003231285395061269, "loss": 0.5602, "step": 38950 }, { "epoch": 0.8671652421652422, "grad_norm": 0.4715264141559601, "learning_rate": 0.0003230917966548467, "loss": 0.6001, "step": 38960 }, { "epoch": 0.8673878205128205, "grad_norm": 0.8866527676582336, "learning_rate": 0.00032305504711465754, "loss": 0.6429, "step": 38970 }, { "epoch": 0.8676103988603988, "grad_norm": 0.7117496132850647, "learning_rate": 0.00032301829088755634, "loss": 0.5025, "step": 38980 }, { "epoch": 0.8678329772079773, "grad_norm": 0.7015672326087952, "learning_rate": 0.00032298152797554053, "loss": 0.6292, "step": 38990 }, { "epoch": 0.8680555555555556, "grad_norm": 0.7511372566223145, "learning_rate": 0.0003229447583806078, "loss": 0.6656, "step": 39000 }, { "epoch": 0.8682781339031339, "grad_norm": 0.5627785325050354, "learning_rate": 0.00032290798210475623, "loss": 0.5419, "step": 39010 }, { "epoch": 0.8685007122507122, "grad_norm": 0.5964632034301758, "learning_rate": 0.00032287119914998434, "loss": 0.6173, "step": 39020 }, { "epoch": 0.8687232905982906, "grad_norm": 0.6726840734481812, "learning_rate": 0.0003228344095182909, "loss": 0.4604, "step": 39030 }, { "epoch": 0.8689458689458689, "grad_norm": 0.6931193470954895, "learning_rate": 0.00032279761321167506, "loss": 0.5933, "step": 39040 }, { "epoch": 0.8691684472934473, "grad_norm": 0.8055073022842407, "learning_rate": 0.0003227608102321364, "loss": 0.581, "step": 39050 }, { "epoch": 0.8693910256410257, "grad_norm": 0.7156325578689575, "learning_rate": 0.0003227240005816748, "loss": 0.5492, "step": 39060 }, { "epoch": 0.869613603988604, "grad_norm": 0.8013405203819275, "learning_rate": 0.0003226871842622906, "loss": 0.5804, "step": 39070 }, { "epoch": 0.8698361823361823, "grad_norm": 0.61861252784729, "learning_rate": 0.0003226503612759843, "loss": 0.573, "step": 39080 }, { "epoch": 0.8700587606837606, "grad_norm": 0.7677802443504333, "learning_rate": 0.000322613531624757, "loss": 0.7337, "step": 39090 }, { "epoch": 0.8702813390313391, "grad_norm": 0.6915109753608704, "learning_rate": 0.00032257669531061, "loss": 0.5641, "step": 39100 }, { "epoch": 0.8705039173789174, "grad_norm": 0.8183631896972656, "learning_rate": 0.000322539852335545, "loss": 0.5222, "step": 39110 }, { "epoch": 0.8707264957264957, "grad_norm": 0.5204386711120605, "learning_rate": 0.00032250300270156415, "loss": 0.5386, "step": 39120 }, { "epoch": 0.8709490740740741, "grad_norm": 0.4840696156024933, "learning_rate": 0.0003224661464106698, "loss": 0.5482, "step": 39130 }, { "epoch": 0.8711716524216524, "grad_norm": 0.7944409847259521, "learning_rate": 0.0003224292834648649, "loss": 0.6042, "step": 39140 }, { "epoch": 0.8713942307692307, "grad_norm": 0.8521105647087097, "learning_rate": 0.00032239241386615246, "loss": 0.7336, "step": 39150 }, { "epoch": 0.8716168091168092, "grad_norm": 0.8539936542510986, "learning_rate": 0.00032235553761653606, "loss": 0.612, "step": 39160 }, { "epoch": 0.8718393874643875, "grad_norm": 0.5302663445472717, "learning_rate": 0.0003223186547180196, "loss": 0.6096, "step": 39170 }, { "epoch": 0.8720619658119658, "grad_norm": 0.8561046719551086, "learning_rate": 0.00032228176517260724, "loss": 0.5625, "step": 39180 }, { "epoch": 0.8722845441595442, "grad_norm": 0.7265895009040833, "learning_rate": 0.0003222448689823037, "loss": 0.6372, "step": 39190 }, { "epoch": 0.8725071225071225, "grad_norm": 0.6467223167419434, "learning_rate": 0.00032220796614911386, "loss": 0.6874, "step": 39200 }, { "epoch": 0.8727297008547008, "grad_norm": 0.8323819041252136, "learning_rate": 0.00032217105667504313, "loss": 0.6516, "step": 39210 }, { "epoch": 0.8729522792022792, "grad_norm": 0.42687639594078064, "learning_rate": 0.0003221341405620972, "loss": 0.551, "step": 39220 }, { "epoch": 0.8731748575498576, "grad_norm": 0.5661822557449341, "learning_rate": 0.000322097217812282, "loss": 0.5575, "step": 39230 }, { "epoch": 0.8733974358974359, "grad_norm": 0.9337760806083679, "learning_rate": 0.00032206028842760416, "loss": 0.6917, "step": 39240 }, { "epoch": 0.8736200142450142, "grad_norm": 0.6879515647888184, "learning_rate": 0.00032202335241007026, "loss": 0.5838, "step": 39250 }, { "epoch": 0.8738425925925926, "grad_norm": 0.8157190084457397, "learning_rate": 0.00032198640976168743, "loss": 0.5214, "step": 39260 }, { "epoch": 0.8740651709401709, "grad_norm": 0.7548959255218506, "learning_rate": 0.0003219494604844633, "loss": 0.6848, "step": 39270 }, { "epoch": 0.8742877492877493, "grad_norm": 0.6269037127494812, "learning_rate": 0.00032191250458040566, "loss": 0.5813, "step": 39280 }, { "epoch": 0.8745103276353277, "grad_norm": 0.9379816651344299, "learning_rate": 0.0003218755420515227, "loss": 0.6591, "step": 39290 }, { "epoch": 0.874732905982906, "grad_norm": 0.6595720052719116, "learning_rate": 0.00032183857289982303, "loss": 0.6301, "step": 39300 }, { "epoch": 0.8749554843304843, "grad_norm": 0.5300357341766357, "learning_rate": 0.00032180159712731556, "loss": 0.6853, "step": 39310 }, { "epoch": 0.8751780626780626, "grad_norm": 0.7907893061637878, "learning_rate": 0.0003217646147360096, "loss": 0.7163, "step": 39320 }, { "epoch": 0.8754006410256411, "grad_norm": 0.819496750831604, "learning_rate": 0.00032172762572791475, "loss": 0.6733, "step": 39330 }, { "epoch": 0.8756232193732194, "grad_norm": 0.7941359281539917, "learning_rate": 0.00032169063010504113, "loss": 0.6714, "step": 39340 }, { "epoch": 0.8758457977207977, "grad_norm": 1.1781067848205566, "learning_rate": 0.000321653627869399, "loss": 0.7206, "step": 39350 }, { "epoch": 0.8760683760683761, "grad_norm": 0.7022318243980408, "learning_rate": 0.00032161661902299914, "loss": 0.5669, "step": 39360 }, { "epoch": 0.8762909544159544, "grad_norm": 0.7623576521873474, "learning_rate": 0.0003215796035678527, "loss": 0.5496, "step": 39370 }, { "epoch": 0.8765135327635327, "grad_norm": 0.4989342987537384, "learning_rate": 0.00032154258150597105, "loss": 0.5955, "step": 39380 }, { "epoch": 0.8767361111111112, "grad_norm": 0.5586682558059692, "learning_rate": 0.00032150555283936595, "loss": 0.5446, "step": 39390 }, { "epoch": 0.8769586894586895, "grad_norm": 0.8281471133232117, "learning_rate": 0.0003214685175700497, "loss": 0.5533, "step": 39400 }, { "epoch": 0.8771812678062678, "grad_norm": 0.6243104338645935, "learning_rate": 0.0003214314757000347, "loss": 0.6527, "step": 39410 }, { "epoch": 0.8774038461538461, "grad_norm": 0.602491557598114, "learning_rate": 0.00032139442723133404, "loss": 0.6527, "step": 39420 }, { "epoch": 0.8776264245014245, "grad_norm": 0.8057866096496582, "learning_rate": 0.00032135737216596073, "loss": 0.6958, "step": 39430 }, { "epoch": 0.8778490028490028, "grad_norm": 0.6868727803230286, "learning_rate": 0.0003213203105059285, "loss": 0.5553, "step": 39440 }, { "epoch": 0.8780715811965812, "grad_norm": 0.4937323033809662, "learning_rate": 0.0003212832422532512, "loss": 0.7375, "step": 39450 }, { "epoch": 0.8782941595441596, "grad_norm": 0.7090859413146973, "learning_rate": 0.00032124616740994335, "loss": 0.6359, "step": 39460 }, { "epoch": 0.8785167378917379, "grad_norm": 0.7405864000320435, "learning_rate": 0.00032120908597801944, "loss": 0.6074, "step": 39470 }, { "epoch": 0.8787393162393162, "grad_norm": 0.5793715715408325, "learning_rate": 0.0003211719979594946, "loss": 0.5593, "step": 39480 }, { "epoch": 0.8789618945868946, "grad_norm": 0.9205363988876343, "learning_rate": 0.0003211349033563842, "loss": 0.7185, "step": 39490 }, { "epoch": 0.8791844729344729, "grad_norm": 0.8902425169944763, "learning_rate": 0.00032109780217070395, "loss": 0.5122, "step": 39500 }, { "epoch": 0.8794070512820513, "grad_norm": 0.5989090204238892, "learning_rate": 0.00032106069440447, "loss": 0.6292, "step": 39510 }, { "epoch": 0.8796296296296297, "grad_norm": 0.7132798433303833, "learning_rate": 0.00032102358005969877, "loss": 0.6775, "step": 39520 }, { "epoch": 0.879852207977208, "grad_norm": 0.6441278457641602, "learning_rate": 0.0003209864591384072, "loss": 0.6522, "step": 39530 }, { "epoch": 0.8800747863247863, "grad_norm": 0.7071154117584229, "learning_rate": 0.00032094933164261236, "loss": 0.8055, "step": 39540 }, { "epoch": 0.8802973646723646, "grad_norm": 0.5231700539588928, "learning_rate": 0.00032091219757433186, "loss": 0.6843, "step": 39550 }, { "epoch": 0.8805199430199431, "grad_norm": 0.9400531053543091, "learning_rate": 0.0003208750569355835, "loss": 0.659, "step": 39560 }, { "epoch": 0.8807425213675214, "grad_norm": 0.6579751372337341, "learning_rate": 0.00032083790972838565, "loss": 0.624, "step": 39570 }, { "epoch": 0.8809650997150997, "grad_norm": 0.683641791343689, "learning_rate": 0.00032080075595475685, "loss": 0.6428, "step": 39580 }, { "epoch": 0.8811876780626781, "grad_norm": 0.8134473562240601, "learning_rate": 0.00032076359561671606, "loss": 0.6543, "step": 39590 }, { "epoch": 0.8814102564102564, "grad_norm": 0.4869878888130188, "learning_rate": 0.00032072642871628265, "loss": 0.5401, "step": 39600 }, { "epoch": 0.8816328347578347, "grad_norm": 0.734983503818512, "learning_rate": 0.0003206892552554762, "loss": 0.6767, "step": 39610 }, { "epoch": 0.8818554131054132, "grad_norm": 1.119775652885437, "learning_rate": 0.00032065207523631695, "loss": 0.637, "step": 39620 }, { "epoch": 0.8820779914529915, "grad_norm": 0.5646172165870667, "learning_rate": 0.0003206148886608251, "loss": 0.6219, "step": 39630 }, { "epoch": 0.8823005698005698, "grad_norm": 0.5112728476524353, "learning_rate": 0.00032057769553102145, "loss": 0.6746, "step": 39640 }, { "epoch": 0.8825231481481481, "grad_norm": 0.5482468605041504, "learning_rate": 0.0003205404958489271, "loss": 0.8427, "step": 39650 }, { "epoch": 0.8827457264957265, "grad_norm": 0.6891186833381653, "learning_rate": 0.0003205032896165635, "loss": 0.6567, "step": 39660 }, { "epoch": 0.8829683048433048, "grad_norm": 1.1482949256896973, "learning_rate": 0.0003204660768359525, "loss": 0.6634, "step": 39670 }, { "epoch": 0.8831908831908832, "grad_norm": 0.5539254546165466, "learning_rate": 0.00032042885750911633, "loss": 0.5657, "step": 39680 }, { "epoch": 0.8834134615384616, "grad_norm": 1.1480712890625, "learning_rate": 0.00032039163163807746, "loss": 0.6426, "step": 39690 }, { "epoch": 0.8836360398860399, "grad_norm": 0.7224293351173401, "learning_rate": 0.0003203543992248587, "loss": 0.5478, "step": 39700 }, { "epoch": 0.8838586182336182, "grad_norm": 0.8542203903198242, "learning_rate": 0.0003203171602714834, "loss": 0.6527, "step": 39710 }, { "epoch": 0.8840811965811965, "grad_norm": 0.6108570694923401, "learning_rate": 0.0003202799147799751, "loss": 0.5726, "step": 39720 }, { "epoch": 0.8843037749287749, "grad_norm": 0.5748672485351562, "learning_rate": 0.00032024266275235776, "loss": 0.6126, "step": 39730 }, { "epoch": 0.8845263532763533, "grad_norm": 0.7551204562187195, "learning_rate": 0.0003202054041906557, "loss": 0.7203, "step": 39740 }, { "epoch": 0.8847489316239316, "grad_norm": 0.7964481115341187, "learning_rate": 0.00032016813909689363, "loss": 0.5755, "step": 39750 }, { "epoch": 0.88497150997151, "grad_norm": 0.5368321537971497, "learning_rate": 0.0003201308674730964, "loss": 0.5515, "step": 39760 }, { "epoch": 0.8851940883190883, "grad_norm": 0.5676383376121521, "learning_rate": 0.00032009358932128955, "loss": 0.5616, "step": 39770 }, { "epoch": 0.8854166666666666, "grad_norm": 0.7017415165901184, "learning_rate": 0.00032005630464349873, "loss": 0.5178, "step": 39780 }, { "epoch": 0.8856392450142451, "grad_norm": 0.7300966382026672, "learning_rate": 0.00032001901344175005, "loss": 0.5538, "step": 39790 }, { "epoch": 0.8858618233618234, "grad_norm": 0.6249212026596069, "learning_rate": 0.00031998171571806993, "loss": 0.5435, "step": 39800 }, { "epoch": 0.8860844017094017, "grad_norm": 0.4156573414802551, "learning_rate": 0.0003199444114744851, "loss": 0.5803, "step": 39810 }, { "epoch": 0.88630698005698, "grad_norm": 0.6538870930671692, "learning_rate": 0.0003199071007130228, "loss": 0.7323, "step": 39820 }, { "epoch": 0.8865295584045584, "grad_norm": 0.7236296534538269, "learning_rate": 0.0003198697834357105, "loss": 0.6266, "step": 39830 }, { "epoch": 0.8867521367521367, "grad_norm": 0.7877007126808167, "learning_rate": 0.000319832459644576, "loss": 0.6534, "step": 39840 }, { "epoch": 0.8869747150997151, "grad_norm": 0.7191532254219055, "learning_rate": 0.0003197951293416476, "loss": 0.8359, "step": 39850 }, { "epoch": 0.8871972934472935, "grad_norm": 0.5475645661354065, "learning_rate": 0.00031975779252895375, "loss": 0.7697, "step": 39860 }, { "epoch": 0.8874198717948718, "grad_norm": 1.168534755706787, "learning_rate": 0.0003197204492085234, "loss": 0.6695, "step": 39870 }, { "epoch": 0.8876424501424501, "grad_norm": 0.6283919215202332, "learning_rate": 0.0003196830993823859, "loss": 0.5903, "step": 39880 }, { "epoch": 0.8878650284900285, "grad_norm": 3.3726868629455566, "learning_rate": 0.00031964574305257083, "loss": 0.5589, "step": 39890 }, { "epoch": 0.8880876068376068, "grad_norm": 1.0309503078460693, "learning_rate": 0.00031960838022110805, "loss": 0.6842, "step": 39900 }, { "epoch": 0.8883101851851852, "grad_norm": 0.6248919367790222, "learning_rate": 0.00031957101089002797, "loss": 0.5577, "step": 39910 }, { "epoch": 0.8885327635327636, "grad_norm": 0.8905360102653503, "learning_rate": 0.0003195336350613613, "loss": 0.6414, "step": 39920 }, { "epoch": 0.8887553418803419, "grad_norm": 0.7523095607757568, "learning_rate": 0.00031949625273713906, "loss": 0.7783, "step": 39930 }, { "epoch": 0.8889779202279202, "grad_norm": 0.6982303261756897, "learning_rate": 0.00031945886391939257, "loss": 0.7337, "step": 39940 }, { "epoch": 0.8892004985754985, "grad_norm": 0.5420619249343872, "learning_rate": 0.00031942146861015374, "loss": 0.7451, "step": 39950 }, { "epoch": 0.8894230769230769, "grad_norm": 0.589128851890564, "learning_rate": 0.0003193840668114544, "loss": 0.4996, "step": 39960 }, { "epoch": 0.8896456552706553, "grad_norm": 0.8386995196342468, "learning_rate": 0.00031934665852532723, "loss": 0.6711, "step": 39970 }, { "epoch": 0.8898682336182336, "grad_norm": 0.9134801030158997, "learning_rate": 0.0003193092437538049, "loss": 0.5935, "step": 39980 }, { "epoch": 0.890090811965812, "grad_norm": 0.9789322018623352, "learning_rate": 0.00031927182249892063, "loss": 0.7247, "step": 39990 }, { "epoch": 0.8903133903133903, "grad_norm": 0.8390123844146729, "learning_rate": 0.0003192343947627078, "loss": 0.6485, "step": 40000 }, { "epoch": 0.8905359686609686, "grad_norm": 0.5787250399589539, "learning_rate": 0.0003191969605472004, "loss": 0.4625, "step": 40010 }, { "epoch": 0.8907585470085471, "grad_norm": 0.8755205273628235, "learning_rate": 0.0003191595198544326, "loss": 0.6651, "step": 40020 }, { "epoch": 0.8909811253561254, "grad_norm": 0.8312947750091553, "learning_rate": 0.000319122072686439, "loss": 0.6148, "step": 40030 }, { "epoch": 0.8912037037037037, "grad_norm": 0.7560413479804993, "learning_rate": 0.0003190846190452543, "loss": 0.5487, "step": 40040 }, { "epoch": 0.891426282051282, "grad_norm": 0.7159906029701233, "learning_rate": 0.000319047158932914, "loss": 0.525, "step": 40050 }, { "epoch": 0.8916488603988604, "grad_norm": 0.8458636403083801, "learning_rate": 0.00031900969235145366, "loss": 0.6415, "step": 40060 }, { "epoch": 0.8918714387464387, "grad_norm": 0.7349972128868103, "learning_rate": 0.0003189722193029091, "loss": 0.6364, "step": 40070 }, { "epoch": 0.8920940170940171, "grad_norm": 0.8532900810241699, "learning_rate": 0.0003189347397893169, "loss": 0.6439, "step": 40080 }, { "epoch": 0.8923165954415955, "grad_norm": 1.060137391090393, "learning_rate": 0.0003188972538127135, "loss": 0.7015, "step": 40090 }, { "epoch": 0.8925391737891738, "grad_norm": 0.6980665922164917, "learning_rate": 0.00031885976137513596, "loss": 0.7174, "step": 40100 }, { "epoch": 0.8927617521367521, "grad_norm": 0.6002524495124817, "learning_rate": 0.0003188222624786217, "loss": 0.6698, "step": 40110 }, { "epoch": 0.8929843304843305, "grad_norm": 0.45758989453315735, "learning_rate": 0.00031878475712520846, "loss": 0.545, "step": 40120 }, { "epoch": 0.8932069088319088, "grad_norm": 0.8262528777122498, "learning_rate": 0.0003187472453169343, "loss": 0.4717, "step": 40130 }, { "epoch": 0.8934294871794872, "grad_norm": 0.8281633853912354, "learning_rate": 0.00031870972705583755, "loss": 0.6197, "step": 40140 }, { "epoch": 0.8936520655270656, "grad_norm": 0.739718496799469, "learning_rate": 0.0003186722023439571, "loss": 0.5508, "step": 40150 }, { "epoch": 0.8938746438746439, "grad_norm": 0.6326023936271667, "learning_rate": 0.000318634671183332, "loss": 0.6486, "step": 40160 }, { "epoch": 0.8940972222222222, "grad_norm": 0.649603009223938, "learning_rate": 0.0003185971335760017, "loss": 0.6006, "step": 40170 }, { "epoch": 0.8943198005698005, "grad_norm": 0.5098690390586853, "learning_rate": 0.0003185595895240061, "loss": 0.6338, "step": 40180 }, { "epoch": 0.8945423789173789, "grad_norm": 0.7413763999938965, "learning_rate": 0.0003185220390293854, "loss": 0.5233, "step": 40190 }, { "epoch": 0.8947649572649573, "grad_norm": 0.5394598245620728, "learning_rate": 0.00031848448209418007, "loss": 0.5585, "step": 40200 }, { "epoch": 0.8949875356125356, "grad_norm": 0.8628697991371155, "learning_rate": 0.00031844691872043096, "loss": 0.7985, "step": 40210 }, { "epoch": 0.895210113960114, "grad_norm": 0.5882583260536194, "learning_rate": 0.0003184093489101793, "loss": 0.5264, "step": 40220 }, { "epoch": 0.8954326923076923, "grad_norm": 0.8291221857070923, "learning_rate": 0.0003183717726654667, "loss": 0.6469, "step": 40230 }, { "epoch": 0.8956552706552706, "grad_norm": 0.843272864818573, "learning_rate": 0.00031833418998833505, "loss": 0.7344, "step": 40240 }, { "epoch": 0.8958778490028491, "grad_norm": 0.49712222814559937, "learning_rate": 0.00031829660088082673, "loss": 0.593, "step": 40250 }, { "epoch": 0.8961004273504274, "grad_norm": 0.6992495656013489, "learning_rate": 0.0003182590053449842, "loss": 0.6134, "step": 40260 }, { "epoch": 0.8963230056980057, "grad_norm": 0.48579835891723633, "learning_rate": 0.0003182214033828505, "loss": 0.6549, "step": 40270 }, { "epoch": 0.896545584045584, "grad_norm": 0.558591365814209, "learning_rate": 0.000318183794996469, "loss": 0.6306, "step": 40280 }, { "epoch": 0.8967681623931624, "grad_norm": 0.7068283557891846, "learning_rate": 0.00031814618018788333, "loss": 0.6821, "step": 40290 }, { "epoch": 0.8969907407407407, "grad_norm": 0.7181311845779419, "learning_rate": 0.0003181085589591375, "loss": 0.6187, "step": 40300 }, { "epoch": 0.8972133190883191, "grad_norm": 0.6119157075881958, "learning_rate": 0.0003180709313122759, "loss": 0.6697, "step": 40310 }, { "epoch": 0.8974358974358975, "grad_norm": 0.5585033297538757, "learning_rate": 0.00031803329724934315, "loss": 0.6431, "step": 40320 }, { "epoch": 0.8976584757834758, "grad_norm": 0.6253518462181091, "learning_rate": 0.00031799565677238453, "loss": 0.5559, "step": 40330 }, { "epoch": 0.8978810541310541, "grad_norm": 0.6004060506820679, "learning_rate": 0.00031795800988344527, "loss": 0.6072, "step": 40340 }, { "epoch": 0.8981036324786325, "grad_norm": 0.6097738742828369, "learning_rate": 0.00031792035658457113, "loss": 0.5182, "step": 40350 }, { "epoch": 0.8983262108262108, "grad_norm": 1.233040690422058, "learning_rate": 0.00031788269687780835, "loss": 0.5693, "step": 40360 }, { "epoch": 0.8985487891737892, "grad_norm": 0.9896701574325562, "learning_rate": 0.0003178450307652033, "loss": 0.5541, "step": 40370 }, { "epoch": 0.8987713675213675, "grad_norm": 0.5857483744621277, "learning_rate": 0.00031780735824880283, "loss": 0.5526, "step": 40380 }, { "epoch": 0.8989939458689459, "grad_norm": 0.7917017936706543, "learning_rate": 0.00031776967933065404, "loss": 0.6533, "step": 40390 }, { "epoch": 0.8992165242165242, "grad_norm": 0.8823297023773193, "learning_rate": 0.0003177319940128045, "loss": 0.6673, "step": 40400 }, { "epoch": 0.8994391025641025, "grad_norm": 0.5655326843261719, "learning_rate": 0.000317694302297302, "loss": 0.584, "step": 40410 }, { "epoch": 0.8996616809116809, "grad_norm": 0.4943428933620453, "learning_rate": 0.0003176566041861947, "loss": 0.4593, "step": 40420 }, { "epoch": 0.8998842592592593, "grad_norm": 0.6836668252944946, "learning_rate": 0.0003176188996815313, "loss": 0.6873, "step": 40430 }, { "epoch": 0.9001068376068376, "grad_norm": 0.5697005987167358, "learning_rate": 0.00031758118878536055, "loss": 0.5643, "step": 40440 }, { "epoch": 0.9001068376068376, "eval_loss": 0.6068045496940613, "eval_runtime": 337.3079, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 40440 }, { "epoch": 0.900329415954416, "grad_norm": 0.9452328681945801, "learning_rate": 0.0003175434714997318, "loss": 0.6032, "step": 40450 }, { "epoch": 0.9005519943019943, "grad_norm": 0.3648838400840759, "learning_rate": 0.0003175057478266945, "loss": 0.7467, "step": 40460 }, { "epoch": 0.9007745726495726, "grad_norm": 0.5086342096328735, "learning_rate": 0.00031746801776829877, "loss": 0.4958, "step": 40470 }, { "epoch": 0.9009971509971509, "grad_norm": 0.4477662444114685, "learning_rate": 0.0003174302813265948, "loss": 0.6573, "step": 40480 }, { "epoch": 0.9012197293447294, "grad_norm": 1.764902949333191, "learning_rate": 0.00031739253850363307, "loss": 0.5559, "step": 40490 }, { "epoch": 0.9014423076923077, "grad_norm": 0.6134351491928101, "learning_rate": 0.0003173547893014648, "loss": 0.4968, "step": 40500 }, { "epoch": 0.901664886039886, "grad_norm": 0.51532381772995, "learning_rate": 0.00031731703372214114, "loss": 0.6746, "step": 40510 }, { "epoch": 0.9018874643874644, "grad_norm": 0.6712770462036133, "learning_rate": 0.0003172792717677139, "loss": 0.6102, "step": 40520 }, { "epoch": 0.9021100427350427, "grad_norm": 0.5384175777435303, "learning_rate": 0.000317241503440235, "loss": 0.5977, "step": 40530 }, { "epoch": 0.9023326210826211, "grad_norm": 0.7249717712402344, "learning_rate": 0.00031720372874175683, "loss": 0.5999, "step": 40540 }, { "epoch": 0.9025551994301995, "grad_norm": 0.47203508019447327, "learning_rate": 0.0003171659476743321, "loss": 0.6368, "step": 40550 }, { "epoch": 0.9027777777777778, "grad_norm": 0.6192301511764526, "learning_rate": 0.0003171281602400138, "loss": 0.521, "step": 40560 }, { "epoch": 0.9030003561253561, "grad_norm": 0.8027661442756653, "learning_rate": 0.00031709036644085543, "loss": 0.5966, "step": 40570 }, { "epoch": 0.9032229344729344, "grad_norm": 0.61912602186203, "learning_rate": 0.0003170525662789107, "loss": 0.6183, "step": 40580 }, { "epoch": 0.9034455128205128, "grad_norm": 0.953890860080719, "learning_rate": 0.0003170147597562337, "loss": 0.6791, "step": 40590 }, { "epoch": 0.9036680911680912, "grad_norm": 0.6121290326118469, "learning_rate": 0.0003169769468748788, "loss": 0.5699, "step": 40600 }, { "epoch": 0.9038906695156695, "grad_norm": 0.3996066451072693, "learning_rate": 0.00031693912763690096, "loss": 0.5991, "step": 40610 }, { "epoch": 0.9041132478632479, "grad_norm": 0.7684537768363953, "learning_rate": 0.0003169013020443551, "loss": 0.6052, "step": 40620 }, { "epoch": 0.9043358262108262, "grad_norm": 0.628394365310669, "learning_rate": 0.0003168634700992968, "loss": 0.5785, "step": 40630 }, { "epoch": 0.9045584045584045, "grad_norm": 0.8542444109916687, "learning_rate": 0.0003168256318037819, "loss": 0.6702, "step": 40640 }, { "epoch": 0.9047809829059829, "grad_norm": 0.7758163213729858, "learning_rate": 0.00031678778715986655, "loss": 0.6605, "step": 40650 }, { "epoch": 0.9050035612535613, "grad_norm": 0.6627446413040161, "learning_rate": 0.0003167499361696071, "loss": 0.5987, "step": 40660 }, { "epoch": 0.9052261396011396, "grad_norm": 0.4540365934371948, "learning_rate": 0.0003167120788350607, "loss": 0.5297, "step": 40670 }, { "epoch": 0.905448717948718, "grad_norm": 0.5813724398612976, "learning_rate": 0.00031667421515828433, "loss": 0.6154, "step": 40680 }, { "epoch": 0.9056712962962963, "grad_norm": 0.9838677048683167, "learning_rate": 0.0003166363451413356, "loss": 0.5669, "step": 40690 }, { "epoch": 0.9058938746438746, "grad_norm": 0.9327704310417175, "learning_rate": 0.00031659846878627235, "loss": 0.588, "step": 40700 }, { "epoch": 0.9061164529914529, "grad_norm": 0.5827953219413757, "learning_rate": 0.0003165605860951529, "loss": 0.5702, "step": 40710 }, { "epoch": 0.9063390313390314, "grad_norm": 0.8706266283988953, "learning_rate": 0.0003165226970700358, "loss": 0.7091, "step": 40720 }, { "epoch": 0.9065616096866097, "grad_norm": 0.6274487972259521, "learning_rate": 0.0003164848017129799, "loss": 0.6592, "step": 40730 }, { "epoch": 0.906784188034188, "grad_norm": 0.7041484117507935, "learning_rate": 0.00031644690002604454, "loss": 0.7755, "step": 40740 }, { "epoch": 0.9070067663817664, "grad_norm": 0.6966767311096191, "learning_rate": 0.0003164089920112893, "loss": 0.612, "step": 40750 }, { "epoch": 0.9072293447293447, "grad_norm": 0.5250440835952759, "learning_rate": 0.0003163710776707742, "loss": 0.4996, "step": 40760 }, { "epoch": 0.9074519230769231, "grad_norm": 0.777004599571228, "learning_rate": 0.00031633315700655936, "loss": 0.6096, "step": 40770 }, { "epoch": 0.9076745014245015, "grad_norm": 0.6018292307853699, "learning_rate": 0.00031629523002070563, "loss": 0.5804, "step": 40780 }, { "epoch": 0.9078970797720798, "grad_norm": 0.8258164525032043, "learning_rate": 0.00031625729671527384, "loss": 0.6846, "step": 40790 }, { "epoch": 0.9081196581196581, "grad_norm": 0.6959264278411865, "learning_rate": 0.0003162193570923254, "loss": 0.6592, "step": 40800 }, { "epoch": 0.9083422364672364, "grad_norm": 0.784415602684021, "learning_rate": 0.00031618141115392196, "loss": 0.7515, "step": 40810 }, { "epoch": 0.9085648148148148, "grad_norm": 1.0342206954956055, "learning_rate": 0.0003161434589021255, "loss": 0.6266, "step": 40820 }, { "epoch": 0.9087873931623932, "grad_norm": 0.8727086186408997, "learning_rate": 0.0003161055003389984, "loss": 0.5163, "step": 40830 }, { "epoch": 0.9090099715099715, "grad_norm": 0.5465406179428101, "learning_rate": 0.00031606753546660336, "loss": 0.5507, "step": 40840 }, { "epoch": 0.9092325498575499, "grad_norm": 0.5344588756561279, "learning_rate": 0.0003160295642870034, "loss": 0.5908, "step": 40850 }, { "epoch": 0.9094551282051282, "grad_norm": 0.5498618483543396, "learning_rate": 0.000315991586802262, "loss": 0.4823, "step": 40860 }, { "epoch": 0.9096777065527065, "grad_norm": 0.700057327747345, "learning_rate": 0.00031595360301444285, "loss": 0.6203, "step": 40870 }, { "epoch": 0.9099002849002849, "grad_norm": 0.6060965657234192, "learning_rate": 0.0003159156129256099, "loss": 0.6212, "step": 40880 }, { "epoch": 0.9101228632478633, "grad_norm": 0.9121140837669373, "learning_rate": 0.0003158776165378277, "loss": 0.73, "step": 40890 }, { "epoch": 0.9103454415954416, "grad_norm": 0.7383050322532654, "learning_rate": 0.0003158396138531609, "loss": 0.616, "step": 40900 }, { "epoch": 0.91056801994302, "grad_norm": 0.5925201773643494, "learning_rate": 0.00031580160487367474, "loss": 0.5811, "step": 40910 }, { "epoch": 0.9107905982905983, "grad_norm": 0.6484251022338867, "learning_rate": 0.00031576358960143445, "loss": 0.6064, "step": 40920 }, { "epoch": 0.9110131766381766, "grad_norm": 0.7028173804283142, "learning_rate": 0.00031572556803850603, "loss": 0.6034, "step": 40930 }, { "epoch": 0.9112357549857549, "grad_norm": 0.6228461265563965, "learning_rate": 0.0003156875401869555, "loss": 0.4492, "step": 40940 }, { "epoch": 0.9114583333333334, "grad_norm": 0.5296692252159119, "learning_rate": 0.0003156495060488493, "loss": 0.6993, "step": 40950 }, { "epoch": 0.9116809116809117, "grad_norm": 0.5507607460021973, "learning_rate": 0.0003156114656262543, "loss": 0.6036, "step": 40960 }, { "epoch": 0.91190349002849, "grad_norm": 0.6303926110267639, "learning_rate": 0.0003155734189212375, "loss": 0.5727, "step": 40970 }, { "epoch": 0.9121260683760684, "grad_norm": 0.7401798963546753, "learning_rate": 0.00031553536593586666, "loss": 0.4791, "step": 40980 }, { "epoch": 0.9123486467236467, "grad_norm": 0.8919644951820374, "learning_rate": 0.00031549730667220936, "loss": 0.7607, "step": 40990 }, { "epoch": 0.9125712250712251, "grad_norm": 0.4974423050880432, "learning_rate": 0.0003154592411323339, "loss": 0.6209, "step": 41000 }, { "epoch": 0.9127938034188035, "grad_norm": 0.9230018258094788, "learning_rate": 0.00031542116931830875, "loss": 0.5109, "step": 41010 }, { "epoch": 0.9130163817663818, "grad_norm": 0.6732121109962463, "learning_rate": 0.0003153830912322028, "loss": 0.6689, "step": 41020 }, { "epoch": 0.9132389601139601, "grad_norm": 0.5105556845664978, "learning_rate": 0.0003153450068760852, "loss": 0.5894, "step": 41030 }, { "epoch": 0.9134615384615384, "grad_norm": 0.6419169306755066, "learning_rate": 0.0003153069162520255, "loss": 0.4713, "step": 41040 }, { "epoch": 0.9136841168091168, "grad_norm": 0.8677101731300354, "learning_rate": 0.0003152688193620935, "loss": 0.7491, "step": 41050 }, { "epoch": 0.9139066951566952, "grad_norm": 0.7640895843505859, "learning_rate": 0.0003152307162083595, "loss": 0.6545, "step": 41060 }, { "epoch": 0.9141292735042735, "grad_norm": 0.37812137603759766, "learning_rate": 0.00031519260679289414, "loss": 0.5917, "step": 41070 }, { "epoch": 0.9143518518518519, "grad_norm": 0.74465411901474, "learning_rate": 0.00031515449111776825, "loss": 0.7212, "step": 41080 }, { "epoch": 0.9145744301994302, "grad_norm": 0.6022646427154541, "learning_rate": 0.0003151163691850529, "loss": 0.6398, "step": 41090 }, { "epoch": 0.9147970085470085, "grad_norm": 0.5874007344245911, "learning_rate": 0.00031507824099681993, "loss": 0.5747, "step": 41100 }, { "epoch": 0.9150195868945868, "grad_norm": 0.4806864857673645, "learning_rate": 0.0003150401065551411, "loss": 0.5708, "step": 41110 }, { "epoch": 0.9152421652421653, "grad_norm": 0.6752296686172485, "learning_rate": 0.0003150019658620887, "loss": 0.548, "step": 41120 }, { "epoch": 0.9154647435897436, "grad_norm": 0.5574817061424255, "learning_rate": 0.00031496381891973533, "loss": 0.6457, "step": 41130 }, { "epoch": 0.9156873219373219, "grad_norm": 0.7656465768814087, "learning_rate": 0.0003149256657301539, "loss": 0.5559, "step": 41140 }, { "epoch": 0.9159099002849003, "grad_norm": 0.5118249654769897, "learning_rate": 0.0003148875062954177, "loss": 0.7083, "step": 41150 }, { "epoch": 0.9161324786324786, "grad_norm": 0.7542393207550049, "learning_rate": 0.00031484934061760044, "loss": 0.5739, "step": 41160 }, { "epoch": 0.9163550569800569, "grad_norm": 0.5904927849769592, "learning_rate": 0.00031481116869877594, "loss": 0.5556, "step": 41170 }, { "epoch": 0.9165776353276354, "grad_norm": 0.46618613600730896, "learning_rate": 0.00031477299054101857, "loss": 0.5605, "step": 41180 }, { "epoch": 0.9168002136752137, "grad_norm": 0.5759552717208862, "learning_rate": 0.00031473480614640294, "loss": 0.6537, "step": 41190 }, { "epoch": 0.917022792022792, "grad_norm": 0.8385375738143921, "learning_rate": 0.00031469661551700395, "loss": 0.6678, "step": 41200 }, { "epoch": 0.9172453703703703, "grad_norm": 0.44897398352622986, "learning_rate": 0.00031465841865489704, "loss": 0.549, "step": 41210 }, { "epoch": 0.9174679487179487, "grad_norm": 0.679852306842804, "learning_rate": 0.0003146202155621578, "loss": 0.547, "step": 41220 }, { "epoch": 0.9176905270655271, "grad_norm": 0.7378935813903809, "learning_rate": 0.0003145820062408622, "loss": 0.7427, "step": 41230 }, { "epoch": 0.9179131054131054, "grad_norm": 0.7794041633605957, "learning_rate": 0.0003145437906930865, "loss": 0.6781, "step": 41240 }, { "epoch": 0.9181356837606838, "grad_norm": 0.5164105892181396, "learning_rate": 0.00031450556892090753, "loss": 0.5227, "step": 41250 }, { "epoch": 0.9183582621082621, "grad_norm": 0.5497879385948181, "learning_rate": 0.00031446734092640226, "loss": 0.6573, "step": 41260 }, { "epoch": 0.9185808404558404, "grad_norm": 0.5047428011894226, "learning_rate": 0.0003144291067116479, "loss": 0.6054, "step": 41270 }, { "epoch": 0.9188034188034188, "grad_norm": 0.6174345016479492, "learning_rate": 0.0003143908662787222, "loss": 0.6215, "step": 41280 }, { "epoch": 0.9190259971509972, "grad_norm": 0.6649008393287659, "learning_rate": 0.0003143526196297032, "loss": 0.5482, "step": 41290 }, { "epoch": 0.9192485754985755, "grad_norm": 0.7139372825622559, "learning_rate": 0.0003143143667666692, "loss": 0.5291, "step": 41300 }, { "epoch": 0.9194711538461539, "grad_norm": 0.661760151386261, "learning_rate": 0.0003142761076916989, "loss": 0.6053, "step": 41310 }, { "epoch": 0.9196937321937322, "grad_norm": 0.5792003870010376, "learning_rate": 0.0003142378424068715, "loss": 0.5836, "step": 41320 }, { "epoch": 0.9199163105413105, "grad_norm": 0.8285216093063354, "learning_rate": 0.0003141995709142662, "loss": 0.511, "step": 41330 }, { "epoch": 0.9201388888888888, "grad_norm": 0.7349622845649719, "learning_rate": 0.00031416129321596266, "loss": 0.735, "step": 41340 }, { "epoch": 0.9203614672364673, "grad_norm": 0.6661806106567383, "learning_rate": 0.00031412300931404094, "loss": 0.5323, "step": 41350 }, { "epoch": 0.9205840455840456, "grad_norm": 0.6408804655075073, "learning_rate": 0.0003140847192105815, "loss": 0.5903, "step": 41360 }, { "epoch": 0.9208066239316239, "grad_norm": 0.9972931742668152, "learning_rate": 0.00031404642290766506, "loss": 0.6596, "step": 41370 }, { "epoch": 0.9210292022792023, "grad_norm": 0.6347147822380066, "learning_rate": 0.00031400812040737266, "loss": 0.5938, "step": 41380 }, { "epoch": 0.9212517806267806, "grad_norm": 0.509615421295166, "learning_rate": 0.00031396981171178567, "loss": 0.7123, "step": 41390 }, { "epoch": 0.9214743589743589, "grad_norm": 0.42907026410102844, "learning_rate": 0.00031393149682298573, "loss": 0.4879, "step": 41400 }, { "epoch": 0.9216969373219374, "grad_norm": 0.5011876225471497, "learning_rate": 0.000313893175743055, "loss": 0.5529, "step": 41410 }, { "epoch": 0.9219195156695157, "grad_norm": 0.5521091222763062, "learning_rate": 0.0003138548484740759, "loss": 0.5901, "step": 41420 }, { "epoch": 0.922142094017094, "grad_norm": 0.6726789474487305, "learning_rate": 0.00031381651501813113, "loss": 0.568, "step": 41430 }, { "epoch": 0.9223646723646723, "grad_norm": 0.479596883058548, "learning_rate": 0.00031377817537730373, "loss": 0.6254, "step": 41440 }, { "epoch": 0.9225872507122507, "grad_norm": 0.8564170598983765, "learning_rate": 0.00031373982955367716, "loss": 0.4415, "step": 41450 }, { "epoch": 0.9228098290598291, "grad_norm": 0.8632370233535767, "learning_rate": 0.0003137014775493352, "loss": 0.4761, "step": 41460 }, { "epoch": 0.9230324074074074, "grad_norm": 0.674905002117157, "learning_rate": 0.0003136631193663618, "loss": 0.6691, "step": 41470 }, { "epoch": 0.9232549857549858, "grad_norm": 0.8387262225151062, "learning_rate": 0.0003136247550068414, "loss": 0.6602, "step": 41480 }, { "epoch": 0.9234775641025641, "grad_norm": 0.7182347774505615, "learning_rate": 0.00031358638447285885, "loss": 0.4812, "step": 41490 }, { "epoch": 0.9237001424501424, "grad_norm": 0.8412067294120789, "learning_rate": 0.0003135480077664992, "loss": 0.5495, "step": 41500 }, { "epoch": 0.9239227207977208, "grad_norm": 0.6454422473907471, "learning_rate": 0.00031350962488984783, "loss": 0.6778, "step": 41510 }, { "epoch": 0.9241452991452992, "grad_norm": 0.8959324955940247, "learning_rate": 0.0003134712358449905, "loss": 0.6016, "step": 41520 }, { "epoch": 0.9243678774928775, "grad_norm": 0.6741381287574768, "learning_rate": 0.0003134328406340134, "loss": 0.4758, "step": 41530 }, { "epoch": 0.9245904558404558, "grad_norm": 0.6390167474746704, "learning_rate": 0.0003133944392590028, "loss": 0.492, "step": 41540 }, { "epoch": 0.9248130341880342, "grad_norm": 0.6453841924667358, "learning_rate": 0.00031335603172204555, "loss": 0.5103, "step": 41550 }, { "epoch": 0.9250356125356125, "grad_norm": 0.6824930310249329, "learning_rate": 0.0003133176180252288, "loss": 0.5694, "step": 41560 }, { "epoch": 0.9252581908831908, "grad_norm": 0.8084796071052551, "learning_rate": 0.0003132791981706398, "loss": 0.5371, "step": 41570 }, { "epoch": 0.9254807692307693, "grad_norm": 0.6897899508476257, "learning_rate": 0.0003132407721603666, "loss": 0.6974, "step": 41580 }, { "epoch": 0.9257033475783476, "grad_norm": 0.8142766952514648, "learning_rate": 0.000313202339996497, "loss": 0.5623, "step": 41590 }, { "epoch": 0.9259259259259259, "grad_norm": 1.095940351486206, "learning_rate": 0.0003131639016811196, "loss": 0.7174, "step": 41600 }, { "epoch": 0.9261485042735043, "grad_norm": 1.131062388420105, "learning_rate": 0.00031312545721632316, "loss": 0.6568, "step": 41610 }, { "epoch": 0.9263710826210826, "grad_norm": 0.42202919721603394, "learning_rate": 0.00031308700660419677, "loss": 0.4689, "step": 41620 }, { "epoch": 0.9265936609686609, "grad_norm": 0.3474844992160797, "learning_rate": 0.0003130485498468299, "loss": 0.5296, "step": 41630 }, { "epoch": 0.9268162393162394, "grad_norm": 0.6784580945968628, "learning_rate": 0.0003130100869463122, "loss": 0.5368, "step": 41640 }, { "epoch": 0.9270388176638177, "grad_norm": 0.5489214062690735, "learning_rate": 0.00031297161790473394, "loss": 0.589, "step": 41650 }, { "epoch": 0.927261396011396, "grad_norm": 0.7951946258544922, "learning_rate": 0.0003129331427241855, "loss": 0.6161, "step": 41660 }, { "epoch": 0.9274839743589743, "grad_norm": 0.9211964011192322, "learning_rate": 0.00031289466140675757, "loss": 0.5625, "step": 41670 }, { "epoch": 0.9277065527065527, "grad_norm": 0.7942441701889038, "learning_rate": 0.0003128561739545413, "loss": 0.7409, "step": 41680 }, { "epoch": 0.9279291310541311, "grad_norm": 0.5457864999771118, "learning_rate": 0.0003128176803696282, "loss": 0.637, "step": 41690 }, { "epoch": 0.9281517094017094, "grad_norm": 0.7595346570014954, "learning_rate": 0.00031277918065410995, "loss": 0.4984, "step": 41700 }, { "epoch": 0.9283742877492878, "grad_norm": 0.6316278576850891, "learning_rate": 0.00031274067481007874, "loss": 0.6209, "step": 41710 }, { "epoch": 0.9285968660968661, "grad_norm": 0.5016762018203735, "learning_rate": 0.00031270216283962697, "loss": 0.5954, "step": 41720 }, { "epoch": 0.9288194444444444, "grad_norm": 0.6339188814163208, "learning_rate": 0.0003126636447448473, "loss": 0.6837, "step": 41730 }, { "epoch": 0.9290420227920227, "grad_norm": 0.7948164343833923, "learning_rate": 0.0003126251205278331, "loss": 0.6081, "step": 41740 }, { "epoch": 0.9292646011396012, "grad_norm": 0.6831077337265015, "learning_rate": 0.0003125865901906776, "loss": 0.6106, "step": 41750 }, { "epoch": 0.9294871794871795, "grad_norm": 0.7580824494361877, "learning_rate": 0.00031254805373547457, "loss": 0.5183, "step": 41760 }, { "epoch": 0.9297097578347578, "grad_norm": 0.6469782590866089, "learning_rate": 0.0003125095111643182, "loss": 0.602, "step": 41770 }, { "epoch": 0.9299323361823362, "grad_norm": 0.6362723112106323, "learning_rate": 0.0003124709624793029, "loss": 0.673, "step": 41780 }, { "epoch": 0.9301549145299145, "grad_norm": 0.6553264856338501, "learning_rate": 0.0003124324076825234, "loss": 0.5923, "step": 41790 }, { "epoch": 0.9303774928774928, "grad_norm": 0.6719655394554138, "learning_rate": 0.0003123938467760748, "loss": 0.5773, "step": 41800 }, { "epoch": 0.9306000712250713, "grad_norm": 0.46676400303840637, "learning_rate": 0.0003123552797620526, "loss": 0.6049, "step": 41810 }, { "epoch": 0.9308226495726496, "grad_norm": 0.8152366876602173, "learning_rate": 0.0003123167066425525, "loss": 0.5892, "step": 41820 }, { "epoch": 0.9310452279202279, "grad_norm": 0.7665014266967773, "learning_rate": 0.00031227812741967066, "loss": 0.5525, "step": 41830 }, { "epoch": 0.9312678062678063, "grad_norm": 0.9132956862449646, "learning_rate": 0.00031223954209550343, "loss": 0.5866, "step": 41840 }, { "epoch": 0.9314903846153846, "grad_norm": 0.5664402842521667, "learning_rate": 0.0003122009506721476, "loss": 0.5722, "step": 41850 }, { "epoch": 0.9317129629629629, "grad_norm": 0.6909135580062866, "learning_rate": 0.00031216235315170026, "loss": 0.5854, "step": 41860 }, { "epoch": 0.9319355413105413, "grad_norm": 0.6950509548187256, "learning_rate": 0.00031212374953625883, "loss": 0.6248, "step": 41870 }, { "epoch": 0.9321581196581197, "grad_norm": 0.800899863243103, "learning_rate": 0.0003120851398279211, "loss": 0.6557, "step": 41880 }, { "epoch": 0.932380698005698, "grad_norm": 0.8420230150222778, "learning_rate": 0.00031204652402878506, "loss": 0.6, "step": 41890 }, { "epoch": 0.9326032763532763, "grad_norm": 0.6417230367660522, "learning_rate": 0.0003120079021409492, "loss": 0.7307, "step": 41900 }, { "epoch": 0.9328258547008547, "grad_norm": 0.8272371292114258, "learning_rate": 0.0003119692741665122, "loss": 0.7223, "step": 41910 }, { "epoch": 0.9330484330484331, "grad_norm": 0.7413893342018127, "learning_rate": 0.0003119306401075732, "loss": 0.6794, "step": 41920 }, { "epoch": 0.9332710113960114, "grad_norm": 0.51690673828125, "learning_rate": 0.0003118919999662316, "loss": 0.5408, "step": 41930 }, { "epoch": 0.9334935897435898, "grad_norm": 0.5625960230827332, "learning_rate": 0.00031185335374458713, "loss": 0.6769, "step": 41940 }, { "epoch": 0.9337161680911681, "grad_norm": 0.8159250020980835, "learning_rate": 0.00031181470144473985, "loss": 0.6423, "step": 41950 }, { "epoch": 0.9339387464387464, "grad_norm": 0.6201074123382568, "learning_rate": 0.0003117760430687901, "loss": 0.4813, "step": 41960 }, { "epoch": 0.9341613247863247, "grad_norm": 0.6656062006950378, "learning_rate": 0.00031173737861883873, "loss": 0.5664, "step": 41970 }, { "epoch": 0.9343839031339032, "grad_norm": 0.6236233115196228, "learning_rate": 0.0003116987080969867, "loss": 0.5341, "step": 41980 }, { "epoch": 0.9346064814814815, "grad_norm": 0.5731833577156067, "learning_rate": 0.0003116600315053355, "loss": 0.566, "step": 41990 }, { "epoch": 0.9348290598290598, "grad_norm": 0.724296510219574, "learning_rate": 0.00031162134884598676, "loss": 0.6318, "step": 42000 }, { "epoch": 0.9350516381766382, "grad_norm": 0.8767474889755249, "learning_rate": 0.0003115826601210425, "loss": 0.5942, "step": 42010 }, { "epoch": 0.9352742165242165, "grad_norm": 0.6816518306732178, "learning_rate": 0.00031154396533260515, "loss": 0.5605, "step": 42020 }, { "epoch": 0.9354967948717948, "grad_norm": 0.8070791959762573, "learning_rate": 0.0003115052644827774, "loss": 0.5516, "step": 42030 }, { "epoch": 0.9357193732193733, "grad_norm": 0.6746892333030701, "learning_rate": 0.0003114665575736623, "loss": 0.5388, "step": 42040 }, { "epoch": 0.9359419515669516, "grad_norm": 0.6569458842277527, "learning_rate": 0.00031142784460736324, "loss": 0.6843, "step": 42050 }, { "epoch": 0.9361645299145299, "grad_norm": 0.8369061350822449, "learning_rate": 0.0003113891255859839, "loss": 0.6018, "step": 42060 }, { "epoch": 0.9363871082621082, "grad_norm": 0.692882776260376, "learning_rate": 0.00031135040051162826, "loss": 0.6985, "step": 42070 }, { "epoch": 0.9366096866096866, "grad_norm": 0.5424764752388, "learning_rate": 0.0003113116693864007, "loss": 0.4902, "step": 42080 }, { "epoch": 0.9368322649572649, "grad_norm": 0.5952123999595642, "learning_rate": 0.00031127293221240587, "loss": 0.6561, "step": 42090 }, { "epoch": 0.9370548433048433, "grad_norm": 0.6257585287094116, "learning_rate": 0.0003112341889917488, "loss": 0.637, "step": 42100 }, { "epoch": 0.9372774216524217, "grad_norm": 0.6995611786842346, "learning_rate": 0.0003111954397265349, "loss": 0.5221, "step": 42110 }, { "epoch": 0.9375, "grad_norm": 0.62469482421875, "learning_rate": 0.00031115668441886976, "loss": 0.6586, "step": 42120 }, { "epoch": 0.9377225783475783, "grad_norm": 0.7522687911987305, "learning_rate": 0.0003111179230708594, "loss": 0.6675, "step": 42130 }, { "epoch": 0.9379451566951567, "grad_norm": 0.44003018736839294, "learning_rate": 0.00031107915568461014, "loss": 0.6966, "step": 42140 }, { "epoch": 0.9381677350427351, "grad_norm": 0.4076796770095825, "learning_rate": 0.0003110403822622286, "loss": 0.6358, "step": 42150 }, { "epoch": 0.9383903133903134, "grad_norm": 0.7261608839035034, "learning_rate": 0.0003110016028058218, "loss": 0.6083, "step": 42160 }, { "epoch": 0.9386128917378918, "grad_norm": 0.5442433953285217, "learning_rate": 0.000310962817317497, "loss": 0.5734, "step": 42170 }, { "epoch": 0.9388354700854701, "grad_norm": 0.5819172263145447, "learning_rate": 0.0003109240257993619, "loss": 0.5775, "step": 42180 }, { "epoch": 0.9390580484330484, "grad_norm": 0.9003576636314392, "learning_rate": 0.00031088522825352443, "loss": 0.5348, "step": 42190 }, { "epoch": 0.9392806267806267, "grad_norm": 0.6950390934944153, "learning_rate": 0.00031084642468209286, "loss": 0.5162, "step": 42200 }, { "epoch": 0.9395032051282052, "grad_norm": 0.4344139099121094, "learning_rate": 0.0003108076150871759, "loss": 0.5719, "step": 42210 }, { "epoch": 0.9397257834757835, "grad_norm": 1.0126549005508423, "learning_rate": 0.00031076879947088235, "loss": 0.6178, "step": 42220 }, { "epoch": 0.9399483618233618, "grad_norm": 0.7256695628166199, "learning_rate": 0.0003107299778353216, "loss": 0.5811, "step": 42230 }, { "epoch": 0.9401709401709402, "grad_norm": 0.9113925099372864, "learning_rate": 0.00031069115018260315, "loss": 0.574, "step": 42240 }, { "epoch": 0.9403935185185185, "grad_norm": 0.5826119780540466, "learning_rate": 0.0003106523165148371, "loss": 0.5027, "step": 42250 }, { "epoch": 0.9406160968660968, "grad_norm": 0.8413066864013672, "learning_rate": 0.0003106134768341335, "loss": 0.6589, "step": 42260 }, { "epoch": 0.9408386752136753, "grad_norm": 0.5362156629562378, "learning_rate": 0.000310574631142603, "loss": 0.4235, "step": 42270 }, { "epoch": 0.9410612535612536, "grad_norm": 0.7126069068908691, "learning_rate": 0.00031053577944235654, "loss": 0.5982, "step": 42280 }, { "epoch": 0.9412838319088319, "grad_norm": 0.9489959478378296, "learning_rate": 0.0003104969217355053, "loss": 0.5615, "step": 42290 }, { "epoch": 0.9415064102564102, "grad_norm": 0.8428272008895874, "learning_rate": 0.00031045805802416094, "loss": 0.6999, "step": 42300 }, { "epoch": 0.9417289886039886, "grad_norm": 0.39634135365486145, "learning_rate": 0.0003104191883104353, "loss": 0.8428, "step": 42310 }, { "epoch": 0.9419515669515669, "grad_norm": 0.7322662472724915, "learning_rate": 0.00031038031259644056, "loss": 0.6709, "step": 42320 }, { "epoch": 0.9421741452991453, "grad_norm": 0.875149130821228, "learning_rate": 0.00031034143088428924, "loss": 0.678, "step": 42330 }, { "epoch": 0.9423967236467237, "grad_norm": 0.7805944681167603, "learning_rate": 0.00031030254317609426, "loss": 0.6248, "step": 42340 }, { "epoch": 0.942619301994302, "grad_norm": 0.6153649091720581, "learning_rate": 0.0003102636494739687, "loss": 0.7148, "step": 42350 }, { "epoch": 0.9428418803418803, "grad_norm": 0.4850127100944519, "learning_rate": 0.00031022474978002626, "loss": 0.5177, "step": 42360 }, { "epoch": 0.9430644586894587, "grad_norm": 0.5072647929191589, "learning_rate": 0.00031018584409638067, "loss": 0.5377, "step": 42370 }, { "epoch": 0.9432870370370371, "grad_norm": 0.5038172006607056, "learning_rate": 0.0003101469324251461, "loss": 0.5688, "step": 42380 }, { "epoch": 0.9435096153846154, "grad_norm": 0.736919641494751, "learning_rate": 0.00031010801476843704, "loss": 0.7235, "step": 42390 }, { "epoch": 0.9437321937321937, "grad_norm": 0.6360201239585876, "learning_rate": 0.0003100690911283683, "loss": 0.5508, "step": 42400 }, { "epoch": 0.9439547720797721, "grad_norm": 0.4708540439605713, "learning_rate": 0.0003100301615070551, "loss": 0.5771, "step": 42410 }, { "epoch": 0.9441773504273504, "grad_norm": 0.4812738001346588, "learning_rate": 0.0003099912259066128, "loss": 0.5551, "step": 42420 }, { "epoch": 0.9443999287749287, "grad_norm": 0.5777409076690674, "learning_rate": 0.0003099522843291572, "loss": 0.6947, "step": 42430 }, { "epoch": 0.9446225071225072, "grad_norm": 0.6029115319252014, "learning_rate": 0.00030991333677680456, "loss": 0.6122, "step": 42440 }, { "epoch": 0.9448450854700855, "grad_norm": 0.6521156430244446, "learning_rate": 0.00030987438325167114, "loss": 0.576, "step": 42450 }, { "epoch": 0.9450676638176638, "grad_norm": 0.7701613306999207, "learning_rate": 0.0003098354237558739, "loss": 0.513, "step": 42460 }, { "epoch": 0.9452902421652422, "grad_norm": 0.5667462944984436, "learning_rate": 0.00030979645829152966, "loss": 0.6486, "step": 42470 }, { "epoch": 0.9455128205128205, "grad_norm": 0.8962845206260681, "learning_rate": 0.0003097574868607561, "loss": 0.5852, "step": 42480 }, { "epoch": 0.9457353988603988, "grad_norm": 0.5495222210884094, "learning_rate": 0.00030971850946567083, "loss": 0.5829, "step": 42490 }, { "epoch": 0.9459579772079773, "grad_norm": 0.6287023425102234, "learning_rate": 0.0003096795261083919, "loss": 0.6074, "step": 42500 }, { "epoch": 0.9461805555555556, "grad_norm": 0.60505211353302, "learning_rate": 0.00030964053679103775, "loss": 0.5515, "step": 42510 }, { "epoch": 0.9464031339031339, "grad_norm": 0.6437878608703613, "learning_rate": 0.0003096015415157271, "loss": 0.588, "step": 42520 }, { "epoch": 0.9466257122507122, "grad_norm": 0.6274718642234802, "learning_rate": 0.000309562540284579, "loss": 0.5576, "step": 42530 }, { "epoch": 0.9468482905982906, "grad_norm": 0.6374098062515259, "learning_rate": 0.0003095235330997127, "loss": 0.5445, "step": 42540 }, { "epoch": 0.9470708689458689, "grad_norm": 0.8956291675567627, "learning_rate": 0.000309484519963248, "loss": 0.5241, "step": 42550 }, { "epoch": 0.9472934472934473, "grad_norm": 0.8904242515563965, "learning_rate": 0.0003094455008773048, "loss": 0.7352, "step": 42560 }, { "epoch": 0.9475160256410257, "grad_norm": 0.9804019331932068, "learning_rate": 0.0003094064758440035, "loss": 0.639, "step": 42570 }, { "epoch": 0.947738603988604, "grad_norm": 0.5462363958358765, "learning_rate": 0.0003093674448654648, "loss": 0.6243, "step": 42580 }, { "epoch": 0.9479611823361823, "grad_norm": 0.641595721244812, "learning_rate": 0.00030932840794380953, "loss": 0.6183, "step": 42590 }, { "epoch": 0.9481837606837606, "grad_norm": 0.4465916156768799, "learning_rate": 0.00030928936508115907, "loss": 0.5259, "step": 42600 }, { "epoch": 0.9484063390313391, "grad_norm": 0.7366055846214294, "learning_rate": 0.0003092503162796351, "loss": 0.7196, "step": 42610 }, { "epoch": 0.9486289173789174, "grad_norm": 0.5737682580947876, "learning_rate": 0.0003092112615413595, "loss": 0.5638, "step": 42620 }, { "epoch": 0.9488514957264957, "grad_norm": 0.6066577434539795, "learning_rate": 0.0003091722008684545, "loss": 0.6529, "step": 42630 }, { "epoch": 0.9490740740740741, "grad_norm": 0.8688057661056519, "learning_rate": 0.0003091331342630428, "loss": 0.6387, "step": 42640 }, { "epoch": 0.9492966524216524, "grad_norm": 0.749236524105072, "learning_rate": 0.00030909406172724717, "loss": 0.481, "step": 42650 }, { "epoch": 0.9495192307692307, "grad_norm": 0.4578602910041809, "learning_rate": 0.00030905498326319093, "loss": 0.6296, "step": 42660 }, { "epoch": 0.9497418091168092, "grad_norm": 0.9280728697776794, "learning_rate": 0.0003090158988729977, "loss": 0.5446, "step": 42670 }, { "epoch": 0.9499643874643875, "grad_norm": 0.7775986790657043, "learning_rate": 0.0003089768085587912, "loss": 0.591, "step": 42680 }, { "epoch": 0.9501869658119658, "grad_norm": 0.8049306869506836, "learning_rate": 0.00030893771232269574, "loss": 0.6957, "step": 42690 }, { "epoch": 0.9504095441595442, "grad_norm": 0.6778254508972168, "learning_rate": 0.0003088986101668358, "loss": 0.6916, "step": 42700 }, { "epoch": 0.9506321225071225, "grad_norm": 0.5649783611297607, "learning_rate": 0.00030885950209333623, "loss": 0.577, "step": 42710 }, { "epoch": 0.9508547008547008, "grad_norm": 0.4995388984680176, "learning_rate": 0.00030882038810432224, "loss": 0.5385, "step": 42720 }, { "epoch": 0.9510772792022792, "grad_norm": 0.492776095867157, "learning_rate": 0.0003087812682019192, "loss": 0.6314, "step": 42730 }, { "epoch": 0.9512998575498576, "grad_norm": 0.4935969412326813, "learning_rate": 0.0003087421423882531, "loss": 0.6476, "step": 42740 }, { "epoch": 0.9515224358974359, "grad_norm": 0.6174660921096802, "learning_rate": 0.0003087030106654499, "loss": 0.5734, "step": 42750 }, { "epoch": 0.9517450142450142, "grad_norm": 0.799747884273529, "learning_rate": 0.0003086638730356361, "loss": 0.5207, "step": 42760 }, { "epoch": 0.9519675925925926, "grad_norm": 0.5621200203895569, "learning_rate": 0.0003086247295009385, "loss": 0.7379, "step": 42770 }, { "epoch": 0.9521901709401709, "grad_norm": 0.867217481136322, "learning_rate": 0.00030858558006348417, "loss": 0.5859, "step": 42780 }, { "epoch": 0.9524127492877493, "grad_norm": 0.6303733587265015, "learning_rate": 0.0003085464247254005, "loss": 0.5096, "step": 42790 }, { "epoch": 0.9526353276353277, "grad_norm": 0.7135583758354187, "learning_rate": 0.0003085072634888153, "loss": 0.5051, "step": 42800 }, { "epoch": 0.952857905982906, "grad_norm": 0.7252805233001709, "learning_rate": 0.00030846809635585656, "loss": 0.6157, "step": 42810 }, { "epoch": 0.9530804843304843, "grad_norm": 0.672892689704895, "learning_rate": 0.00030842892332865265, "loss": 0.6354, "step": 42820 }, { "epoch": 0.9533030626780626, "grad_norm": 0.5883125066757202, "learning_rate": 0.0003083897444093323, "loss": 0.4532, "step": 42830 }, { "epoch": 0.9535256410256411, "grad_norm": 0.5267878770828247, "learning_rate": 0.00030835055960002456, "loss": 0.4659, "step": 42840 }, { "epoch": 0.9537482193732194, "grad_norm": 0.9347203373908997, "learning_rate": 0.00030831136890285864, "loss": 0.5361, "step": 42850 }, { "epoch": 0.9539707977207977, "grad_norm": 0.3487144410610199, "learning_rate": 0.0003082721723199643, "loss": 0.5186, "step": 42860 }, { "epoch": 0.9541933760683761, "grad_norm": 0.7461722493171692, "learning_rate": 0.00030823296985347143, "loss": 0.5533, "step": 42870 }, { "epoch": 0.9544159544159544, "grad_norm": 0.5731413960456848, "learning_rate": 0.0003081937615055104, "loss": 0.4873, "step": 42880 }, { "epoch": 0.9546385327635327, "grad_norm": 0.7203046083450317, "learning_rate": 0.0003081545472782118, "loss": 0.604, "step": 42890 }, { "epoch": 0.9548611111111112, "grad_norm": 0.802855372428894, "learning_rate": 0.00030811532717370656, "loss": 0.5076, "step": 42900 }, { "epoch": 0.9550836894586895, "grad_norm": 0.5875809192657471, "learning_rate": 0.00030807610119412595, "loss": 0.559, "step": 42910 }, { "epoch": 0.9553062678062678, "grad_norm": 0.5268616676330566, "learning_rate": 0.00030803686934160145, "loss": 0.502, "step": 42920 }, { "epoch": 0.9555288461538461, "grad_norm": 0.9944384694099426, "learning_rate": 0.00030799763161826514, "loss": 0.7, "step": 42930 }, { "epoch": 0.9557514245014245, "grad_norm": 1.0348243713378906, "learning_rate": 0.00030795838802624904, "loss": 0.6141, "step": 42940 }, { "epoch": 0.9559740028490028, "grad_norm": 0.5023366808891296, "learning_rate": 0.00030791913856768573, "loss": 0.541, "step": 42950 }, { "epoch": 0.9561965811965812, "grad_norm": 0.9290866851806641, "learning_rate": 0.0003078798832447082, "loss": 0.7567, "step": 42960 }, { "epoch": 0.9564191595441596, "grad_norm": 0.5600704550743103, "learning_rate": 0.00030784062205944945, "loss": 0.5905, "step": 42970 }, { "epoch": 0.9566417378917379, "grad_norm": 0.559613823890686, "learning_rate": 0.000307801355014043, "loss": 0.5237, "step": 42980 }, { "epoch": 0.9568643162393162, "grad_norm": 0.5399856567382812, "learning_rate": 0.0003077620821106226, "loss": 0.6838, "step": 42990 }, { "epoch": 0.9570868945868946, "grad_norm": 0.42649152874946594, "learning_rate": 0.0003077228033513226, "loss": 0.6487, "step": 43000 }, { "epoch": 0.9573094729344729, "grad_norm": 0.8213827610015869, "learning_rate": 0.00030768351873827716, "loss": 0.5688, "step": 43010 }, { "epoch": 0.9575320512820513, "grad_norm": 0.6761860847473145, "learning_rate": 0.0003076442282736212, "loss": 0.6261, "step": 43020 }, { "epoch": 0.9577546296296297, "grad_norm": 0.6132062673568726, "learning_rate": 0.0003076049319594898, "loss": 0.6003, "step": 43030 }, { "epoch": 0.957977207977208, "grad_norm": 0.3853282034397125, "learning_rate": 0.0003075656297980182, "loss": 0.5995, "step": 43040 }, { "epoch": 0.9581997863247863, "grad_norm": 0.4675573408603668, "learning_rate": 0.00030752632179134224, "loss": 0.6665, "step": 43050 }, { "epoch": 0.9584223646723646, "grad_norm": 0.5293225646018982, "learning_rate": 0.00030748700794159796, "loss": 0.5195, "step": 43060 }, { "epoch": 0.9586449430199431, "grad_norm": 0.7288942337036133, "learning_rate": 0.00030744768825092167, "loss": 0.6372, "step": 43070 }, { "epoch": 0.9588675213675214, "grad_norm": 0.8237767815589905, "learning_rate": 0.00030740836272145005, "loss": 0.6325, "step": 43080 }, { "epoch": 0.9590900997150997, "grad_norm": 0.7287095785140991, "learning_rate": 0.00030736903135532, "loss": 0.5934, "step": 43090 }, { "epoch": 0.9593126780626781, "grad_norm": 0.8140029311180115, "learning_rate": 0.00030732969415466903, "loss": 0.5926, "step": 43100 }, { "epoch": 0.9595352564102564, "grad_norm": 0.6723431944847107, "learning_rate": 0.00030729035112163453, "loss": 0.5835, "step": 43110 }, { "epoch": 0.9597578347578347, "grad_norm": 0.5774185657501221, "learning_rate": 0.0003072510022583545, "loss": 0.6066, "step": 43120 }, { "epoch": 0.9599804131054132, "grad_norm": 0.8784375190734863, "learning_rate": 0.0003072116475669672, "loss": 0.665, "step": 43130 }, { "epoch": 0.9601139601139601, "eval_loss": 0.6042813658714294, "eval_runtime": 337.231, "eval_samples_per_second": 7.013, "eval_steps_per_second": 7.013, "step": 43136 }, { "epoch": 0.9602029914529915, "grad_norm": 0.7646319270133972, "learning_rate": 0.0003071722870496113, "loss": 0.6678, "step": 43140 }, { "epoch": 0.9604255698005698, "grad_norm": 0.5818307399749756, "learning_rate": 0.0003071329207084255, "loss": 0.5272, "step": 43150 }, { "epoch": 0.9606481481481481, "grad_norm": 0.65185546875, "learning_rate": 0.0003070935485455491, "loss": 0.6083, "step": 43160 }, { "epoch": 0.9608707264957265, "grad_norm": 0.8646022081375122, "learning_rate": 0.00030705417056312164, "loss": 0.66, "step": 43170 }, { "epoch": 0.9610933048433048, "grad_norm": 0.4757365882396698, "learning_rate": 0.0003070147867632829, "loss": 0.5159, "step": 43180 }, { "epoch": 0.9613158831908832, "grad_norm": 0.6112939715385437, "learning_rate": 0.00030697539714817304, "loss": 0.5295, "step": 43190 }, { "epoch": 0.9615384615384616, "grad_norm": 0.8913406133651733, "learning_rate": 0.00030693600171993247, "loss": 0.5462, "step": 43200 }, { "epoch": 0.9617610398860399, "grad_norm": 0.8653389811515808, "learning_rate": 0.0003068966004807021, "loss": 0.6887, "step": 43210 }, { "epoch": 0.9619836182336182, "grad_norm": 0.8796609044075012, "learning_rate": 0.00030685719343262284, "loss": 0.8114, "step": 43220 }, { "epoch": 0.9622061965811965, "grad_norm": 0.5240429639816284, "learning_rate": 0.0003068177805778364, "loss": 0.5522, "step": 43230 }, { "epoch": 0.9624287749287749, "grad_norm": 0.6439476609230042, "learning_rate": 0.0003067783619184842, "loss": 0.605, "step": 43240 }, { "epoch": 0.9626513532763533, "grad_norm": 0.851402223110199, "learning_rate": 0.0003067389374567084, "loss": 0.5514, "step": 43250 }, { "epoch": 0.9628739316239316, "grad_norm": 0.8971279263496399, "learning_rate": 0.0003066995071946513, "loss": 0.6248, "step": 43260 }, { "epoch": 0.96309650997151, "grad_norm": 0.44522354006767273, "learning_rate": 0.0003066600711344557, "loss": 0.5892, "step": 43270 }, { "epoch": 0.9633190883190883, "grad_norm": 0.6671565771102905, "learning_rate": 0.0003066206292782645, "loss": 0.7284, "step": 43280 }, { "epoch": 0.9635416666666666, "grad_norm": 0.8489352464675903, "learning_rate": 0.000306581181628221, "loss": 0.4295, "step": 43290 }, { "epoch": 0.9637642450142451, "grad_norm": 0.7675427794456482, "learning_rate": 0.00030654172818646893, "loss": 0.6216, "step": 43300 }, { "epoch": 0.9639868233618234, "grad_norm": 0.9285682439804077, "learning_rate": 0.0003065022689551521, "loss": 0.7118, "step": 43310 }, { "epoch": 0.9642094017094017, "grad_norm": 0.5413016676902771, "learning_rate": 0.00030646280393641475, "loss": 0.5437, "step": 43320 }, { "epoch": 0.96443198005698, "grad_norm": 0.7694899439811707, "learning_rate": 0.00030642333313240144, "loss": 0.732, "step": 43330 }, { "epoch": 0.9646545584045584, "grad_norm": 0.6586807370185852, "learning_rate": 0.00030638385654525716, "loss": 0.5091, "step": 43340 }, { "epoch": 0.9648771367521367, "grad_norm": 0.4724593162536621, "learning_rate": 0.000306344374177127, "loss": 0.5117, "step": 43350 }, { "epoch": 0.9650997150997151, "grad_norm": 0.8600608110427856, "learning_rate": 0.00030630488603015655, "loss": 0.546, "step": 43360 }, { "epoch": 0.9653222934472935, "grad_norm": 0.6995387673377991, "learning_rate": 0.0003062653921064915, "loss": 0.5045, "step": 43370 }, { "epoch": 0.9655448717948718, "grad_norm": 0.8419876098632812, "learning_rate": 0.00030622589240827806, "loss": 0.6705, "step": 43380 }, { "epoch": 0.9657674501424501, "grad_norm": 0.5719366669654846, "learning_rate": 0.00030618638693766264, "loss": 0.6209, "step": 43390 }, { "epoch": 0.9659900284900285, "grad_norm": 0.5143960118293762, "learning_rate": 0.000306146875696792, "loss": 0.5871, "step": 43400 }, { "epoch": 0.9662126068376068, "grad_norm": 0.8171896934509277, "learning_rate": 0.00030610735868781335, "loss": 0.656, "step": 43410 }, { "epoch": 0.9664351851851852, "grad_norm": 0.9602444767951965, "learning_rate": 0.00030606783591287384, "loss": 0.6494, "step": 43420 }, { "epoch": 0.9666577635327636, "grad_norm": 0.8292070031166077, "learning_rate": 0.0003060283073741214, "loss": 0.5705, "step": 43430 }, { "epoch": 0.9668803418803419, "grad_norm": 0.5162893533706665, "learning_rate": 0.0003059887730737039, "loss": 0.5762, "step": 43440 }, { "epoch": 0.9671029202279202, "grad_norm": 1.0457990169525146, "learning_rate": 0.00030594923301376976, "loss": 0.6512, "step": 43450 }, { "epoch": 0.9673254985754985, "grad_norm": 0.6627582311630249, "learning_rate": 0.00030590968719646747, "loss": 0.5665, "step": 43460 }, { "epoch": 0.9675480769230769, "grad_norm": 0.6730947494506836, "learning_rate": 0.0003058701356239461, "loss": 0.5233, "step": 43470 }, { "epoch": 0.9677706552706553, "grad_norm": 0.7079799175262451, "learning_rate": 0.0003058305782983549, "loss": 0.6293, "step": 43480 }, { "epoch": 0.9679932336182336, "grad_norm": 0.6541440486907959, "learning_rate": 0.0003057910152218435, "loss": 0.5677, "step": 43490 }, { "epoch": 0.968215811965812, "grad_norm": 0.5727503299713135, "learning_rate": 0.00030575144639656164, "loss": 0.6793, "step": 43500 }, { "epoch": 0.9684383903133903, "grad_norm": 0.6658546924591064, "learning_rate": 0.0003057118718246597, "loss": 0.6498, "step": 43510 }, { "epoch": 0.9686609686609686, "grad_norm": 1.0690386295318604, "learning_rate": 0.000305672291508288, "loss": 0.577, "step": 43520 }, { "epoch": 0.9688835470085471, "grad_norm": 0.6312462091445923, "learning_rate": 0.0003056327054495975, "loss": 0.5497, "step": 43530 }, { "epoch": 0.9691061253561254, "grad_norm": 0.49718987941741943, "learning_rate": 0.0003055931136507393, "loss": 0.5781, "step": 43540 }, { "epoch": 0.9693287037037037, "grad_norm": 0.6868771910667419, "learning_rate": 0.00030555351611386494, "loss": 0.62, "step": 43550 }, { "epoch": 0.969551282051282, "grad_norm": 0.5947691798210144, "learning_rate": 0.0003055139128411261, "loss": 0.4976, "step": 43560 }, { "epoch": 0.9697738603988604, "grad_norm": 0.6794607043266296, "learning_rate": 0.00030547430383467473, "loss": 0.509, "step": 43570 }, { "epoch": 0.9699964387464387, "grad_norm": 0.5065454840660095, "learning_rate": 0.00030543468909666346, "loss": 0.6121, "step": 43580 }, { "epoch": 0.9702190170940171, "grad_norm": 0.7035951018333435, "learning_rate": 0.00030539506862924484, "loss": 0.7243, "step": 43590 }, { "epoch": 0.9704415954415955, "grad_norm": 0.6408650279045105, "learning_rate": 0.0003053554424345719, "loss": 0.5817, "step": 43600 }, { "epoch": 0.9706641737891738, "grad_norm": 0.5592657327651978, "learning_rate": 0.00030531581051479803, "loss": 0.623, "step": 43610 }, { "epoch": 0.9708867521367521, "grad_norm": 0.6538555026054382, "learning_rate": 0.00030527617287207673, "loss": 0.4772, "step": 43620 }, { "epoch": 0.9711093304843305, "grad_norm": 0.8865237236022949, "learning_rate": 0.00030523652950856207, "loss": 0.5248, "step": 43630 }, { "epoch": 0.9713319088319088, "grad_norm": 0.7268497943878174, "learning_rate": 0.0003051968804264082, "loss": 0.6957, "step": 43640 }, { "epoch": 0.9715544871794872, "grad_norm": 0.9204932451248169, "learning_rate": 0.00030515722562776974, "loss": 0.6097, "step": 43650 }, { "epoch": 0.9717770655270656, "grad_norm": 0.7186421155929565, "learning_rate": 0.00030511756511480157, "loss": 0.5993, "step": 43660 }, { "epoch": 0.9719996438746439, "grad_norm": 0.8013362288475037, "learning_rate": 0.0003050778988896588, "loss": 0.678, "step": 43670 }, { "epoch": 0.9722222222222222, "grad_norm": 0.5864419937133789, "learning_rate": 0.000305038226954497, "loss": 0.662, "step": 43680 }, { "epoch": 0.9724448005698005, "grad_norm": 0.8709241151809692, "learning_rate": 0.000304998549311472, "loss": 0.524, "step": 43690 }, { "epoch": 0.9726673789173789, "grad_norm": 0.5410850644111633, "learning_rate": 0.0003049588659627399, "loss": 0.531, "step": 43700 }, { "epoch": 0.9728899572649573, "grad_norm": 0.5670583844184875, "learning_rate": 0.00030491917691045705, "loss": 0.6241, "step": 43710 }, { "epoch": 0.9731125356125356, "grad_norm": 0.7049233913421631, "learning_rate": 0.00030487948215678025, "loss": 0.6616, "step": 43720 }, { "epoch": 0.973335113960114, "grad_norm": 0.46305641531944275, "learning_rate": 0.0003048397817038665, "loss": 0.4417, "step": 43730 }, { "epoch": 0.9735576923076923, "grad_norm": 0.8418055772781372, "learning_rate": 0.0003048000755538732, "loss": 0.6689, "step": 43740 }, { "epoch": 0.9737802706552706, "grad_norm": 0.6632723212242126, "learning_rate": 0.000304760363708958, "loss": 0.4771, "step": 43750 }, { "epoch": 0.9740028490028491, "grad_norm": 0.7734985947608948, "learning_rate": 0.0003047206461712789, "loss": 0.5589, "step": 43760 }, { "epoch": 0.9742254273504274, "grad_norm": 0.774679958820343, "learning_rate": 0.0003046809229429942, "loss": 0.5736, "step": 43770 }, { "epoch": 0.9744480056980057, "grad_norm": 0.6929972171783447, "learning_rate": 0.00030464119402626237, "loss": 0.6126, "step": 43780 }, { "epoch": 0.974670584045584, "grad_norm": 0.4365656077861786, "learning_rate": 0.0003046014594232424, "loss": 0.6184, "step": 43790 }, { "epoch": 0.9748931623931624, "grad_norm": 0.7632807493209839, "learning_rate": 0.0003045617191360935, "loss": 0.5464, "step": 43800 }, { "epoch": 0.9751157407407407, "grad_norm": 0.7853899598121643, "learning_rate": 0.00030452197316697526, "loss": 0.505, "step": 43810 }, { "epoch": 0.9753383190883191, "grad_norm": 0.511212944984436, "learning_rate": 0.00030448222151804736, "loss": 0.597, "step": 43820 }, { "epoch": 0.9755608974358975, "grad_norm": 0.5872905850410461, "learning_rate": 0.00030444246419147, "loss": 0.551, "step": 43830 }, { "epoch": 0.9757834757834758, "grad_norm": 0.7174262404441833, "learning_rate": 0.0003044027011894037, "loss": 0.5746, "step": 43840 }, { "epoch": 0.9760060541310541, "grad_norm": 0.6769015192985535, "learning_rate": 0.0003043629325140091, "loss": 0.5102, "step": 43850 }, { "epoch": 0.9762286324786325, "grad_norm": 0.4921936094760895, "learning_rate": 0.0003043231581674473, "loss": 0.5115, "step": 43860 }, { "epoch": 0.9764512108262108, "grad_norm": 0.9814968705177307, "learning_rate": 0.00030428337815187974, "loss": 0.7997, "step": 43870 }, { "epoch": 0.9766737891737892, "grad_norm": 0.6479461789131165, "learning_rate": 0.00030424359246946796, "loss": 0.6602, "step": 43880 }, { "epoch": 0.9768963675213675, "grad_norm": 1.096750020980835, "learning_rate": 0.0003042038011223741, "loss": 0.6912, "step": 43890 }, { "epoch": 0.9771189458689459, "grad_norm": 0.8794252872467041, "learning_rate": 0.0003041640041127603, "loss": 0.6069, "step": 43900 }, { "epoch": 0.9773415242165242, "grad_norm": 0.9189031720161438, "learning_rate": 0.0003041242014427894, "loss": 0.6475, "step": 43910 }, { "epoch": 0.9775641025641025, "grad_norm": 0.6648414134979248, "learning_rate": 0.000304084393114624, "loss": 0.5758, "step": 43920 }, { "epoch": 0.9777866809116809, "grad_norm": 0.694111704826355, "learning_rate": 0.0003040445791304275, "loss": 0.5805, "step": 43930 }, { "epoch": 0.9780092592592593, "grad_norm": 0.4170455038547516, "learning_rate": 0.00030400475949236345, "loss": 0.6003, "step": 43940 }, { "epoch": 0.9782318376068376, "grad_norm": 0.6975066065788269, "learning_rate": 0.0003039649342025956, "loss": 0.5608, "step": 43950 }, { "epoch": 0.978454415954416, "grad_norm": 0.5400005578994751, "learning_rate": 0.0003039251032632881, "loss": 0.6245, "step": 43960 }, { "epoch": 0.9786769943019943, "grad_norm": 0.43616047501564026, "learning_rate": 0.0003038852666766054, "loss": 0.5996, "step": 43970 }, { "epoch": 0.9788995726495726, "grad_norm": 0.4482074975967407, "learning_rate": 0.0003038454244447123, "loss": 0.4164, "step": 43980 }, { "epoch": 0.9791221509971509, "grad_norm": 0.8714448809623718, "learning_rate": 0.00030380557656977384, "loss": 0.5505, "step": 43990 }, { "epoch": 0.9793447293447294, "grad_norm": 0.9814329743385315, "learning_rate": 0.00030376572305395525, "loss": 0.6242, "step": 44000 }, { "epoch": 0.9795673076923077, "grad_norm": 0.5086197257041931, "learning_rate": 0.00030372586389942245, "loss": 0.5395, "step": 44010 }, { "epoch": 0.979789886039886, "grad_norm": 0.49613261222839355, "learning_rate": 0.00030368599910834124, "loss": 0.6055, "step": 44020 }, { "epoch": 0.9800124643874644, "grad_norm": 0.8347887396812439, "learning_rate": 0.000303646128682878, "loss": 0.6295, "step": 44030 }, { "epoch": 0.9802350427350427, "grad_norm": 0.4856536090373993, "learning_rate": 0.00030360625262519924, "loss": 0.6006, "step": 44040 }, { "epoch": 0.9804576210826211, "grad_norm": 0.7615097761154175, "learning_rate": 0.0003035663709374719, "loss": 0.5862, "step": 44050 }, { "epoch": 0.9806801994301995, "grad_norm": 0.6671962738037109, "learning_rate": 0.0003035264836218632, "loss": 0.6552, "step": 44060 }, { "epoch": 0.9809027777777778, "grad_norm": 0.597777783870697, "learning_rate": 0.0003034865906805406, "loss": 0.584, "step": 44070 }, { "epoch": 0.9811253561253561, "grad_norm": 0.6560484766960144, "learning_rate": 0.000303446692115672, "loss": 0.6308, "step": 44080 }, { "epoch": 0.9813479344729344, "grad_norm": 0.7486174702644348, "learning_rate": 0.00030340678792942536, "loss": 0.5648, "step": 44090 }, { "epoch": 0.9815705128205128, "grad_norm": 0.6524199843406677, "learning_rate": 0.0003033668781239693, "loss": 0.6903, "step": 44100 }, { "epoch": 0.9817930911680912, "grad_norm": 0.9349985122680664, "learning_rate": 0.00030332696270147244, "loss": 0.6595, "step": 44110 }, { "epoch": 0.9820156695156695, "grad_norm": 0.4954359233379364, "learning_rate": 0.0003032870416641038, "loss": 0.5674, "step": 44120 }, { "epoch": 0.9822382478632479, "grad_norm": 0.6777122020721436, "learning_rate": 0.00030324711501403284, "loss": 0.5881, "step": 44130 }, { "epoch": 0.9824608262108262, "grad_norm": 0.5254316926002502, "learning_rate": 0.0003032071827534291, "loss": 0.5998, "step": 44140 }, { "epoch": 0.9826834045584045, "grad_norm": 1.055670976638794, "learning_rate": 0.0003031672448844625, "loss": 0.4804, "step": 44150 }, { "epoch": 0.9829059829059829, "grad_norm": 0.6267298460006714, "learning_rate": 0.0003031273014093035, "loss": 0.5552, "step": 44160 }, { "epoch": 0.9831285612535613, "grad_norm": 0.7355030179023743, "learning_rate": 0.00030308735233012233, "loss": 0.7159, "step": 44170 }, { "epoch": 0.9833511396011396, "grad_norm": 0.7559900283813477, "learning_rate": 0.0003030473976490901, "loss": 0.5603, "step": 44180 }, { "epoch": 0.983573717948718, "grad_norm": 1.1423794031143188, "learning_rate": 0.000303007437368378, "loss": 0.5983, "step": 44190 }, { "epoch": 0.9837962962962963, "grad_norm": 0.7574682831764221, "learning_rate": 0.00030296747149015737, "loss": 0.5575, "step": 44200 }, { "epoch": 0.9840188746438746, "grad_norm": 0.4764862060546875, "learning_rate": 0.0003029275000166001, "loss": 0.5668, "step": 44210 }, { "epoch": 0.9842414529914529, "grad_norm": 0.6267021894454956, "learning_rate": 0.0003028875229498782, "loss": 0.6212, "step": 44220 }, { "epoch": 0.9844640313390314, "grad_norm": 0.5620948076248169, "learning_rate": 0.00030284754029216406, "loss": 0.6254, "step": 44230 }, { "epoch": 0.9846866096866097, "grad_norm": 0.4687175750732422, "learning_rate": 0.00030280755204563034, "loss": 0.5893, "step": 44240 }, { "epoch": 0.984909188034188, "grad_norm": 0.9068381786346436, "learning_rate": 0.0003027675582124501, "loss": 0.6576, "step": 44250 }, { "epoch": 0.9851317663817664, "grad_norm": 0.5325086712837219, "learning_rate": 0.00030272755879479665, "loss": 0.587, "step": 44260 }, { "epoch": 0.9853543447293447, "grad_norm": 0.9383319616317749, "learning_rate": 0.0003026875537948436, "loss": 0.6018, "step": 44270 }, { "epoch": 0.9855769230769231, "grad_norm": 0.8008456230163574, "learning_rate": 0.0003026475432147647, "loss": 0.5127, "step": 44280 }, { "epoch": 0.9857995014245015, "grad_norm": 0.6528541445732117, "learning_rate": 0.00030260752705673434, "loss": 0.6322, "step": 44290 }, { "epoch": 0.9860220797720798, "grad_norm": 0.6493551135063171, "learning_rate": 0.00030256750532292695, "loss": 0.5318, "step": 44300 }, { "epoch": 0.9862446581196581, "grad_norm": 0.7947010397911072, "learning_rate": 0.00030252747801551733, "loss": 0.6309, "step": 44310 }, { "epoch": 0.9864672364672364, "grad_norm": 0.6355711817741394, "learning_rate": 0.00030248744513668065, "loss": 0.659, "step": 44320 }, { "epoch": 0.9866898148148148, "grad_norm": 0.7108096480369568, "learning_rate": 0.0003024474066885923, "loss": 0.6309, "step": 44330 }, { "epoch": 0.9869123931623932, "grad_norm": 0.6758264899253845, "learning_rate": 0.000302407362673428, "loss": 0.6833, "step": 44340 }, { "epoch": 0.9871349715099715, "grad_norm": 0.6141546964645386, "learning_rate": 0.0003023673130933638, "loss": 0.5538, "step": 44350 }, { "epoch": 0.9873575498575499, "grad_norm": 0.5645638704299927, "learning_rate": 0.000302327257950576, "loss": 0.6953, "step": 44360 }, { "epoch": 0.9875801282051282, "grad_norm": 0.6685665845870972, "learning_rate": 0.00030228719724724126, "loss": 0.5587, "step": 44370 }, { "epoch": 0.9878027065527065, "grad_norm": 0.8498179316520691, "learning_rate": 0.0003022471309855364, "loss": 0.6689, "step": 44380 }, { "epoch": 0.9880252849002849, "grad_norm": 0.6236562132835388, "learning_rate": 0.00030220705916763875, "loss": 0.5571, "step": 44390 }, { "epoch": 0.9882478632478633, "grad_norm": 0.7307772636413574, "learning_rate": 0.00030216698179572586, "loss": 0.5927, "step": 44400 }, { "epoch": 0.9884704415954416, "grad_norm": 0.6887956261634827, "learning_rate": 0.0003021268988719756, "loss": 0.6221, "step": 44410 }, { "epoch": 0.98869301994302, "grad_norm": 0.5145651698112488, "learning_rate": 0.000302086810398566, "loss": 0.7497, "step": 44420 }, { "epoch": 0.9889155982905983, "grad_norm": 0.6188973784446716, "learning_rate": 0.0003020467163776755, "loss": 0.6187, "step": 44430 }, { "epoch": 0.9891381766381766, "grad_norm": 0.8421676158905029, "learning_rate": 0.000302006616811483, "loss": 0.8401, "step": 44440 }, { "epoch": 0.9893607549857549, "grad_norm": 0.5950721502304077, "learning_rate": 0.0003019665117021673, "loss": 0.5747, "step": 44450 }, { "epoch": 0.9895833333333334, "grad_norm": 0.46402913331985474, "learning_rate": 0.000301926401051908, "loss": 0.6544, "step": 44460 }, { "epoch": 0.9898059116809117, "grad_norm": 0.8954247236251831, "learning_rate": 0.0003018862848628846, "loss": 0.5758, "step": 44470 }, { "epoch": 0.99002849002849, "grad_norm": 0.6421555876731873, "learning_rate": 0.00030184616313727705, "loss": 0.5715, "step": 44480 }, { "epoch": 0.9902510683760684, "grad_norm": 0.6220232248306274, "learning_rate": 0.0003018060358772656, "loss": 0.4822, "step": 44490 }, { "epoch": 0.9904736467236467, "grad_norm": 0.6873606443405151, "learning_rate": 0.0003017659030850308, "loss": 0.6926, "step": 44500 }, { "epoch": 0.9906962250712251, "grad_norm": 0.6596918702125549, "learning_rate": 0.00030172576476275364, "loss": 0.7499, "step": 44510 }, { "epoch": 0.9909188034188035, "grad_norm": 0.8093087673187256, "learning_rate": 0.00030168562091261505, "loss": 0.6116, "step": 44520 }, { "epoch": 0.9911413817663818, "grad_norm": 0.5271719694137573, "learning_rate": 0.00030164547153679655, "loss": 0.6351, "step": 44530 }, { "epoch": 0.9913639601139601, "grad_norm": 0.7143903374671936, "learning_rate": 0.00030160531663748005, "loss": 0.7094, "step": 44540 }, { "epoch": 0.9915865384615384, "grad_norm": 0.8920359015464783, "learning_rate": 0.0003015651562168474, "loss": 0.5879, "step": 44550 }, { "epoch": 0.9918091168091168, "grad_norm": 0.6196558475494385, "learning_rate": 0.00030152499027708094, "loss": 0.6776, "step": 44560 }, { "epoch": 0.9920316951566952, "grad_norm": 0.7099842429161072, "learning_rate": 0.0003014848188203634, "loss": 0.6097, "step": 44570 }, { "epoch": 0.9922542735042735, "grad_norm": 0.5325936079025269, "learning_rate": 0.00030144464184887786, "loss": 0.4899, "step": 44580 }, { "epoch": 0.9924768518518519, "grad_norm": 0.5483936071395874, "learning_rate": 0.0003014044593648073, "loss": 0.4846, "step": 44590 }, { "epoch": 0.9926994301994302, "grad_norm": 0.6429783701896667, "learning_rate": 0.0003013642713703354, "loss": 0.6581, "step": 44600 }, { "epoch": 0.9929220085470085, "grad_norm": 0.7688935399055481, "learning_rate": 0.0003013240778676462, "loss": 0.5981, "step": 44610 }, { "epoch": 0.9931445868945868, "grad_norm": 0.7688998579978943, "learning_rate": 0.0003012838788589234, "loss": 0.676, "step": 44620 }, { "epoch": 0.9933671652421653, "grad_norm": 0.6492190361022949, "learning_rate": 0.00030124367434635185, "loss": 0.6548, "step": 44630 }, { "epoch": 0.9935897435897436, "grad_norm": 0.9331862330436707, "learning_rate": 0.00030120346433211616, "loss": 0.7739, "step": 44640 }, { "epoch": 0.9938123219373219, "grad_norm": 0.4795180559158325, "learning_rate": 0.0003011632488184014, "loss": 0.6069, "step": 44650 }, { "epoch": 0.9940349002849003, "grad_norm": 0.6427105069160461, "learning_rate": 0.00030112302780739276, "loss": 0.6617, "step": 44660 }, { "epoch": 0.9942574786324786, "grad_norm": 0.8687929511070251, "learning_rate": 0.00030108280130127613, "loss": 0.5745, "step": 44670 }, { "epoch": 0.9944800569800569, "grad_norm": 0.3637758791446686, "learning_rate": 0.00030104256930223725, "loss": 0.5226, "step": 44680 }, { "epoch": 0.9947026353276354, "grad_norm": 0.614262580871582, "learning_rate": 0.00030100233181246246, "loss": 0.6666, "step": 44690 }, { "epoch": 0.9949252136752137, "grad_norm": 0.735403299331665, "learning_rate": 0.0003009620888341383, "loss": 0.4987, "step": 44700 }, { "epoch": 0.995147792022792, "grad_norm": 0.7281267642974854, "learning_rate": 0.0003009218403694516, "loss": 0.6631, "step": 44710 }, { "epoch": 0.9953703703703703, "grad_norm": 0.6882529258728027, "learning_rate": 0.0003008815864205894, "loss": 0.612, "step": 44720 }, { "epoch": 0.9955929487179487, "grad_norm": 0.7215903997421265, "learning_rate": 0.00030084132698973915, "loss": 0.6723, "step": 44730 }, { "epoch": 0.9958155270655271, "grad_norm": 1.017327070236206, "learning_rate": 0.0003008010620790888, "loss": 0.6662, "step": 44740 }, { "epoch": 0.9960381054131054, "grad_norm": 0.5069785714149475, "learning_rate": 0.00030076079169082614, "loss": 0.478, "step": 44750 }, { "epoch": 0.9962606837606838, "grad_norm": 0.7547106742858887, "learning_rate": 0.0003007205158271396, "loss": 0.7862, "step": 44760 }, { "epoch": 0.9964832621082621, "grad_norm": 0.5410762429237366, "learning_rate": 0.00030068023449021777, "loss": 0.5717, "step": 44770 }, { "epoch": 0.9967058404558404, "grad_norm": 0.6520301699638367, "learning_rate": 0.0003006399476822495, "loss": 0.6349, "step": 44780 }, { "epoch": 0.9969284188034188, "grad_norm": 0.4946806728839874, "learning_rate": 0.0003005996554054242, "loss": 0.5792, "step": 44790 }, { "epoch": 0.9971509971509972, "grad_norm": 0.6016153693199158, "learning_rate": 0.00030055935766193115, "loss": 0.5773, "step": 44800 }, { "epoch": 0.9973735754985755, "grad_norm": 0.5813798904418945, "learning_rate": 0.00030051905445396045, "loss": 0.5932, "step": 44810 }, { "epoch": 0.9975961538461539, "grad_norm": 0.52809739112854, "learning_rate": 0.00030047874578370196, "loss": 0.6489, "step": 44820 }, { "epoch": 0.9978187321937322, "grad_norm": 0.7697903513908386, "learning_rate": 0.0003004384316533461, "loss": 0.604, "step": 44830 }, { "epoch": 0.9980413105413105, "grad_norm": 0.750226616859436, "learning_rate": 0.0003003981120650837, "loss": 0.6535, "step": 44840 }, { "epoch": 0.9982638888888888, "grad_norm": 0.8747888207435608, "learning_rate": 0.0003003577870211057, "loss": 0.5326, "step": 44850 }, { "epoch": 0.9984864672364673, "grad_norm": 0.7685528993606567, "learning_rate": 0.00030031745652360335, "loss": 0.6057, "step": 44860 }, { "epoch": 0.9987090455840456, "grad_norm": 0.690782904624939, "learning_rate": 0.0003002771205747684, "loss": 0.7289, "step": 44870 }, { "epoch": 0.9989316239316239, "grad_norm": 0.6289418339729309, "learning_rate": 0.00030023677917679253, "loss": 0.6548, "step": 44880 }, { "epoch": 0.9991542022792023, "grad_norm": 0.6266171336174011, "learning_rate": 0.00030019643233186803, "loss": 0.6579, "step": 44890 }, { "epoch": 0.9993767806267806, "grad_norm": 0.602969229221344, "learning_rate": 0.00030015608004218744, "loss": 0.4845, "step": 44900 }, { "epoch": 0.9995993589743589, "grad_norm": 0.6873510479927063, "learning_rate": 0.0003001157223099434, "loss": 0.5004, "step": 44910 }, { "epoch": 0.9998219373219374, "grad_norm": 0.5698683261871338, "learning_rate": 0.00030007535913732905, "loss": 0.5475, "step": 44920 }, { "epoch": 1.0000445156695157, "grad_norm": 0.7336957454681396, "learning_rate": 0.00030003499052653777, "loss": 0.6564, "step": 44930 }, { "epoch": 1.0002670940170941, "grad_norm": 0.809277355670929, "learning_rate": 0.0002999946164797633, "loss": 0.6366, "step": 44940 }, { "epoch": 1.0004896723646723, "grad_norm": 0.5942495465278625, "learning_rate": 0.0002999542369991994, "loss": 0.5138, "step": 44950 }, { "epoch": 1.0007122507122508, "grad_norm": 0.5038458704948425, "learning_rate": 0.0002999138520870405, "loss": 0.5748, "step": 44960 }, { "epoch": 1.000934829059829, "grad_norm": 0.8980821371078491, "learning_rate": 0.00029987346174548097, "loss": 0.4932, "step": 44970 }, { "epoch": 1.0011574074074074, "grad_norm": 0.6963857412338257, "learning_rate": 0.00029983306597671584, "loss": 0.5204, "step": 44980 }, { "epoch": 1.0013799857549857, "grad_norm": 0.8751989603042603, "learning_rate": 0.00029979266478294024, "loss": 0.5569, "step": 44990 }, { "epoch": 1.001602564102564, "grad_norm": 0.9675778150558472, "learning_rate": 0.00029975225816634954, "loss": 0.6096, "step": 45000 }, { "epoch": 1.0018251424501425, "grad_norm": 0.7282714247703552, "learning_rate": 0.0002997118461291394, "loss": 0.5134, "step": 45010 }, { "epoch": 1.0020477207977208, "grad_norm": 0.6374226212501526, "learning_rate": 0.0002996714286735059, "loss": 0.5018, "step": 45020 }, { "epoch": 1.0022702991452992, "grad_norm": 0.50523841381073, "learning_rate": 0.0002996310058016454, "loss": 0.7654, "step": 45030 }, { "epoch": 1.0024928774928774, "grad_norm": 0.5092727541923523, "learning_rate": 0.0002995905775157545, "loss": 0.4078, "step": 45040 }, { "epoch": 1.0027154558404558, "grad_norm": 0.6609806418418884, "learning_rate": 0.0002995501438180301, "loss": 0.4932, "step": 45050 }, { "epoch": 1.0029380341880343, "grad_norm": 0.5364968776702881, "learning_rate": 0.00029950970471066937, "loss": 0.5699, "step": 45060 }, { "epoch": 1.0031606125356125, "grad_norm": 0.6287388801574707, "learning_rate": 0.0002994692601958698, "loss": 0.6062, "step": 45070 }, { "epoch": 1.003383190883191, "grad_norm": 0.5691414475440979, "learning_rate": 0.00029942881027582925, "loss": 0.5667, "step": 45080 }, { "epoch": 1.0036057692307692, "grad_norm": 0.7531828284263611, "learning_rate": 0.0002993883549527457, "loss": 0.6336, "step": 45090 }, { "epoch": 1.0038283475783476, "grad_norm": 0.5090972781181335, "learning_rate": 0.0002993478942288176, "loss": 0.5269, "step": 45100 }, { "epoch": 1.0040509259259258, "grad_norm": 0.7268334031105042, "learning_rate": 0.0002993074281062436, "loss": 0.5477, "step": 45110 }, { "epoch": 1.0042735042735043, "grad_norm": 0.8607544898986816, "learning_rate": 0.0002992669565872227, "loss": 0.4817, "step": 45120 }, { "epoch": 1.0044960826210827, "grad_norm": 0.5529241561889648, "learning_rate": 0.0002992264796739541, "loss": 0.4315, "step": 45130 }, { "epoch": 1.004718660968661, "grad_norm": 0.5200088620185852, "learning_rate": 0.0002991859973686373, "loss": 0.5856, "step": 45140 }, { "epoch": 1.0049412393162394, "grad_norm": 0.7229819893836975, "learning_rate": 0.0002991455096734723, "loss": 0.537, "step": 45150 }, { "epoch": 1.0051638176638176, "grad_norm": 0.8149036765098572, "learning_rate": 0.00029910501659065905, "loss": 0.63, "step": 45160 }, { "epoch": 1.005386396011396, "grad_norm": 0.5828270316123962, "learning_rate": 0.0002990645181223981, "loss": 0.5111, "step": 45170 }, { "epoch": 1.0056089743589745, "grad_norm": 0.7001296877861023, "learning_rate": 0.00029902401427089014, "loss": 0.6598, "step": 45180 }, { "epoch": 1.0058315527065527, "grad_norm": 0.6515721082687378, "learning_rate": 0.0002989835050383362, "loss": 0.6275, "step": 45190 }, { "epoch": 1.006054131054131, "grad_norm": 0.6579293012619019, "learning_rate": 0.0002989429904269375, "loss": 0.5268, "step": 45200 }, { "epoch": 1.0062767094017093, "grad_norm": 0.9173814654350281, "learning_rate": 0.0002989024704388958, "loss": 0.5915, "step": 45210 }, { "epoch": 1.0064992877492878, "grad_norm": 0.6272097229957581, "learning_rate": 0.00029886194507641284, "loss": 0.4741, "step": 45220 }, { "epoch": 1.0067218660968662, "grad_norm": 0.694480836391449, "learning_rate": 0.00029882141434169084, "loss": 0.5071, "step": 45230 }, { "epoch": 1.0069444444444444, "grad_norm": 0.664422869682312, "learning_rate": 0.0002987808782369323, "loss": 0.6094, "step": 45240 }, { "epoch": 1.0071670227920229, "grad_norm": 0.5120536684989929, "learning_rate": 0.00029874033676433997, "loss": 0.5058, "step": 45250 }, { "epoch": 1.007389601139601, "grad_norm": 0.5487117171287537, "learning_rate": 0.0002986997899261169, "loss": 0.6141, "step": 45260 }, { "epoch": 1.0076121794871795, "grad_norm": 0.5553765296936035, "learning_rate": 0.00029865923772446647, "loss": 0.6366, "step": 45270 }, { "epoch": 1.0078347578347577, "grad_norm": 0.5263356566429138, "learning_rate": 0.0002986186801615923, "loss": 0.4694, "step": 45280 }, { "epoch": 1.0080573361823362, "grad_norm": 0.8697277903556824, "learning_rate": 0.00029857811723969823, "loss": 0.6524, "step": 45290 }, { "epoch": 1.0082799145299146, "grad_norm": 0.6866238713264465, "learning_rate": 0.0002985375489609886, "loss": 0.555, "step": 45300 }, { "epoch": 1.0085024928774928, "grad_norm": 0.623012900352478, "learning_rate": 0.000298496975327668, "loss": 0.5244, "step": 45310 }, { "epoch": 1.0087250712250713, "grad_norm": 0.42710331082344055, "learning_rate": 0.0002984563963419411, "loss": 0.5599, "step": 45320 }, { "epoch": 1.0089476495726495, "grad_norm": 0.8865259289741516, "learning_rate": 0.00029841581200601295, "loss": 0.551, "step": 45330 }, { "epoch": 1.009170227920228, "grad_norm": 0.49819451570510864, "learning_rate": 0.000298375222322089, "loss": 0.6152, "step": 45340 }, { "epoch": 1.0093928062678064, "grad_norm": 0.5800816416740417, "learning_rate": 0.000298334627292375, "loss": 0.6104, "step": 45350 }, { "epoch": 1.0096153846153846, "grad_norm": 0.7009190320968628, "learning_rate": 0.00029829402691907683, "loss": 0.625, "step": 45360 }, { "epoch": 1.009837962962963, "grad_norm": 0.3883427083492279, "learning_rate": 0.00029825342120440074, "loss": 0.5493, "step": 45370 }, { "epoch": 1.0100605413105412, "grad_norm": 0.7414738535881042, "learning_rate": 0.0002982128101505533, "loss": 0.6544, "step": 45380 }, { "epoch": 1.0102831196581197, "grad_norm": 0.6679602265357971, "learning_rate": 0.0002981721937597414, "loss": 0.5879, "step": 45390 }, { "epoch": 1.010505698005698, "grad_norm": 0.7506459355354309, "learning_rate": 0.00029813157203417213, "loss": 0.4979, "step": 45400 }, { "epoch": 1.0107282763532763, "grad_norm": 0.723677933216095, "learning_rate": 0.0002980909449760529, "loss": 0.5513, "step": 45410 }, { "epoch": 1.0109508547008548, "grad_norm": 0.7994402647018433, "learning_rate": 0.0002980503125875914, "loss": 0.4788, "step": 45420 }, { "epoch": 1.011173433048433, "grad_norm": 0.7365484833717346, "learning_rate": 0.00029800967487099563, "loss": 0.6212, "step": 45430 }, { "epoch": 1.0113960113960114, "grad_norm": 0.3517953157424927, "learning_rate": 0.00029796903182847386, "loss": 0.5521, "step": 45440 }, { "epoch": 1.0116185897435896, "grad_norm": 0.6479682326316833, "learning_rate": 0.00029792838346223477, "loss": 0.6885, "step": 45450 }, { "epoch": 1.011841168091168, "grad_norm": 0.5270677208900452, "learning_rate": 0.00029788772977448725, "loss": 0.5095, "step": 45460 }, { "epoch": 1.0120637464387465, "grad_norm": 0.7885897159576416, "learning_rate": 0.00029784707076744015, "loss": 0.532, "step": 45470 }, { "epoch": 1.0122863247863247, "grad_norm": 0.7115561366081238, "learning_rate": 0.00029780640644330324, "loss": 0.428, "step": 45480 }, { "epoch": 1.0125089031339032, "grad_norm": 0.8711773157119751, "learning_rate": 0.0002977657368042861, "loss": 0.5233, "step": 45490 }, { "epoch": 1.0127314814814814, "grad_norm": 0.480892151594162, "learning_rate": 0.0002977250618525988, "loss": 0.5043, "step": 45500 }, { "epoch": 1.0129540598290598, "grad_norm": 0.513482391834259, "learning_rate": 0.0002976843815904516, "loss": 0.3921, "step": 45510 }, { "epoch": 1.0131766381766383, "grad_norm": 0.6706327199935913, "learning_rate": 0.0002976436960200552, "loss": 0.5297, "step": 45520 }, { "epoch": 1.0133992165242165, "grad_norm": 0.7570107579231262, "learning_rate": 0.00029760300514362046, "loss": 0.6574, "step": 45530 }, { "epoch": 1.013621794871795, "grad_norm": 0.6689755320549011, "learning_rate": 0.00029756230896335845, "loss": 0.564, "step": 45540 }, { "epoch": 1.0138443732193732, "grad_norm": 0.35196173191070557, "learning_rate": 0.0002975216074814807, "loss": 0.5871, "step": 45550 }, { "epoch": 1.0140669515669516, "grad_norm": 0.5093061923980713, "learning_rate": 0.00029748090070019897, "loss": 0.6584, "step": 45560 }, { "epoch": 1.0142895299145298, "grad_norm": 0.7802603244781494, "learning_rate": 0.0002974401886217253, "loss": 0.4538, "step": 45570 }, { "epoch": 1.0145121082621082, "grad_norm": 1.1860054731369019, "learning_rate": 0.00029739947124827196, "loss": 0.5224, "step": 45580 }, { "epoch": 1.0147346866096867, "grad_norm": 0.5448476672172546, "learning_rate": 0.0002973587485820517, "loss": 0.6472, "step": 45590 }, { "epoch": 1.014957264957265, "grad_norm": 0.5475285053253174, "learning_rate": 0.00029731802062527734, "loss": 0.5009, "step": 45600 }, { "epoch": 1.0151798433048433, "grad_norm": 0.6277813911437988, "learning_rate": 0.000297277287380162, "loss": 0.536, "step": 45610 }, { "epoch": 1.0154024216524216, "grad_norm": 0.9393863081932068, "learning_rate": 0.00029723654884891926, "loss": 0.4785, "step": 45620 }, { "epoch": 1.015625, "grad_norm": 0.6652750968933105, "learning_rate": 0.0002971958050337629, "loss": 0.5289, "step": 45630 }, { "epoch": 1.0158475783475784, "grad_norm": 0.6442055106163025, "learning_rate": 0.00029715505593690686, "loss": 0.654, "step": 45640 }, { "epoch": 1.0160701566951567, "grad_norm": 1.0009881258010864, "learning_rate": 0.00029711430156056554, "loss": 0.6465, "step": 45650 }, { "epoch": 1.016292735042735, "grad_norm": 0.5782968401908875, "learning_rate": 0.00029707354190695363, "loss": 0.6587, "step": 45660 }, { "epoch": 1.0165153133903133, "grad_norm": 0.8827396631240845, "learning_rate": 0.000297032776978286, "loss": 0.6876, "step": 45670 }, { "epoch": 1.0167378917378918, "grad_norm": 0.5242670774459839, "learning_rate": 0.0002969920067767778, "loss": 0.4636, "step": 45680 }, { "epoch": 1.0169604700854702, "grad_norm": 0.5782427191734314, "learning_rate": 0.0002969512313046445, "loss": 0.6092, "step": 45690 }, { "epoch": 1.0171830484330484, "grad_norm": 0.5887977480888367, "learning_rate": 0.000296910450564102, "loss": 0.4911, "step": 45700 }, { "epoch": 1.0174056267806268, "grad_norm": 0.49322015047073364, "learning_rate": 0.00029686966455736623, "loss": 0.468, "step": 45710 }, { "epoch": 1.017628205128205, "grad_norm": 0.7434693574905396, "learning_rate": 0.0002968288732866536, "loss": 0.5743, "step": 45720 }, { "epoch": 1.0178507834757835, "grad_norm": 0.4505147337913513, "learning_rate": 0.00029678807675418073, "loss": 0.5177, "step": 45730 }, { "epoch": 1.0180733618233617, "grad_norm": 0.593360185623169, "learning_rate": 0.00029674727496216457, "loss": 0.6319, "step": 45740 }, { "epoch": 1.0182959401709402, "grad_norm": 0.8309880495071411, "learning_rate": 0.00029670646791282225, "loss": 0.6012, "step": 45750 }, { "epoch": 1.0185185185185186, "grad_norm": 0.5800014734268188, "learning_rate": 0.0002966656556083713, "loss": 0.4977, "step": 45760 }, { "epoch": 1.0187410968660968, "grad_norm": 0.6049439311027527, "learning_rate": 0.00029662483805102945, "loss": 0.5165, "step": 45770 }, { "epoch": 1.0189636752136753, "grad_norm": 0.730975866317749, "learning_rate": 0.0002965840152430149, "loss": 0.5126, "step": 45780 }, { "epoch": 1.0191862535612535, "grad_norm": 0.712568461894989, "learning_rate": 0.00029654318718654586, "loss": 0.4807, "step": 45790 }, { "epoch": 1.019408831908832, "grad_norm": 0.6383031606674194, "learning_rate": 0.00029650235388384093, "loss": 0.5993, "step": 45800 }, { "epoch": 1.0196314102564104, "grad_norm": 1.111724615097046, "learning_rate": 0.00029646151533711915, "loss": 0.5663, "step": 45810 }, { "epoch": 1.0198539886039886, "grad_norm": 0.7301346063613892, "learning_rate": 0.00029642067154859965, "loss": 0.646, "step": 45820 }, { "epoch": 1.020076566951567, "grad_norm": 0.9122412800788879, "learning_rate": 0.00029637982252050184, "loss": 0.5205, "step": 45830 }, { "epoch": 1.0201210826210827, "eval_loss": 0.5998291373252869, "eval_runtime": 337.5403, "eval_samples_per_second": 7.007, "eval_steps_per_second": 7.007, "step": 45832 }, { "epoch": 1.0202991452991452, "grad_norm": 0.8955168128013611, "learning_rate": 0.0002963389682550457, "loss": 0.4934, "step": 45840 }, { "epoch": 1.0205217236467237, "grad_norm": 0.7003114223480225, "learning_rate": 0.0002962981087544511, "loss": 0.5329, "step": 45850 }, { "epoch": 1.020744301994302, "grad_norm": 0.9676564931869507, "learning_rate": 0.00029625724402093846, "loss": 0.6922, "step": 45860 }, { "epoch": 1.0209668803418803, "grad_norm": 0.31678351759910583, "learning_rate": 0.0002962163740567283, "loss": 0.5997, "step": 45870 }, { "epoch": 1.0211894586894588, "grad_norm": 0.7018308639526367, "learning_rate": 0.00029617549886404174, "loss": 0.6506, "step": 45880 }, { "epoch": 1.021412037037037, "grad_norm": 0.6368674039840698, "learning_rate": 0.00029613461844509975, "loss": 0.5323, "step": 45890 }, { "epoch": 1.0216346153846154, "grad_norm": 0.5889302492141724, "learning_rate": 0.0002960937328021239, "loss": 0.6061, "step": 45900 }, { "epoch": 1.0218571937321936, "grad_norm": 0.6018082499504089, "learning_rate": 0.000296052841937336, "loss": 0.6059, "step": 45910 }, { "epoch": 1.022079772079772, "grad_norm": 0.8754140138626099, "learning_rate": 0.000296011945852958, "loss": 0.5061, "step": 45920 }, { "epoch": 1.0223023504273505, "grad_norm": 0.373577743768692, "learning_rate": 0.00029597104455121224, "loss": 0.5363, "step": 45930 }, { "epoch": 1.0225249287749287, "grad_norm": 0.582492470741272, "learning_rate": 0.0002959301380343214, "loss": 0.6262, "step": 45940 }, { "epoch": 1.0227475071225072, "grad_norm": 0.529481053352356, "learning_rate": 0.00029588922630450825, "loss": 0.5704, "step": 45950 }, { "epoch": 1.0229700854700854, "grad_norm": 0.6251696944236755, "learning_rate": 0.0002958483093639961, "loss": 0.5239, "step": 45960 }, { "epoch": 1.0231926638176638, "grad_norm": 0.8448797464370728, "learning_rate": 0.0002958073872150083, "loss": 0.498, "step": 45970 }, { "epoch": 1.0234152421652423, "grad_norm": 0.8178675174713135, "learning_rate": 0.00029576645985976874, "loss": 0.6725, "step": 45980 }, { "epoch": 1.0236378205128205, "grad_norm": 0.7064274549484253, "learning_rate": 0.0002957255273005013, "loss": 0.5908, "step": 45990 }, { "epoch": 1.023860398860399, "grad_norm": 0.5764282941818237, "learning_rate": 0.00029568458953943035, "loss": 0.4453, "step": 46000 }, { "epoch": 1.0240829772079771, "grad_norm": 0.6136001348495483, "learning_rate": 0.0002956436465787805, "loss": 0.6059, "step": 46010 }, { "epoch": 1.0243055555555556, "grad_norm": 0.6569823622703552, "learning_rate": 0.00029560269842077657, "loss": 0.4823, "step": 46020 }, { "epoch": 1.0245281339031338, "grad_norm": 0.6871667504310608, "learning_rate": 0.0002955617450676437, "loss": 0.5419, "step": 46030 }, { "epoch": 1.0247507122507122, "grad_norm": 0.5179684162139893, "learning_rate": 0.0002955207865216074, "loss": 0.6193, "step": 46040 }, { "epoch": 1.0249732905982907, "grad_norm": 0.5849087834358215, "learning_rate": 0.0002954798227848934, "loss": 0.78, "step": 46050 }, { "epoch": 1.025195868945869, "grad_norm": 0.58812016248703, "learning_rate": 0.0002954388538597277, "loss": 0.6257, "step": 46060 }, { "epoch": 1.0254184472934473, "grad_norm": 0.3955261707305908, "learning_rate": 0.00029539787974833644, "loss": 0.5079, "step": 46070 }, { "epoch": 1.0256410256410255, "grad_norm": 0.5466234087944031, "learning_rate": 0.0002953569004529464, "loss": 0.4818, "step": 46080 }, { "epoch": 1.025863603988604, "grad_norm": 1.1418683528900146, "learning_rate": 0.00029531591597578425, "loss": 0.6846, "step": 46090 }, { "epoch": 1.0260861823361824, "grad_norm": 0.7915250062942505, "learning_rate": 0.0002952749263190772, "loss": 0.6076, "step": 46100 }, { "epoch": 1.0263087606837606, "grad_norm": 0.5871915817260742, "learning_rate": 0.0002952339314850527, "loss": 0.6356, "step": 46110 }, { "epoch": 1.026531339031339, "grad_norm": 0.6075290441513062, "learning_rate": 0.00029519293147593843, "loss": 0.4626, "step": 46120 }, { "epoch": 1.0267539173789173, "grad_norm": 0.3844590485095978, "learning_rate": 0.0002951519262939623, "loss": 0.5026, "step": 46130 }, { "epoch": 1.0269764957264957, "grad_norm": 0.6965892910957336, "learning_rate": 0.00029511091594135256, "loss": 0.4676, "step": 46140 }, { "epoch": 1.0271990740740742, "grad_norm": 0.8521528244018555, "learning_rate": 0.00029506990042033786, "loss": 0.4739, "step": 46150 }, { "epoch": 1.0274216524216524, "grad_norm": 0.7694055438041687, "learning_rate": 0.00029502887973314687, "loss": 0.6516, "step": 46160 }, { "epoch": 1.0276442307692308, "grad_norm": 0.5086039900779724, "learning_rate": 0.0002949878538820088, "loss": 0.5088, "step": 46170 }, { "epoch": 1.027866809116809, "grad_norm": 0.4185430705547333, "learning_rate": 0.000294946822869153, "loss": 0.4825, "step": 46180 }, { "epoch": 1.0280893874643875, "grad_norm": 0.7606822848320007, "learning_rate": 0.0002949057866968091, "loss": 0.698, "step": 46190 }, { "epoch": 1.0283119658119657, "grad_norm": 0.6091985702514648, "learning_rate": 0.00029486474536720707, "loss": 0.4992, "step": 46200 }, { "epoch": 1.0285345441595442, "grad_norm": 0.6582888960838318, "learning_rate": 0.000294823698882577, "loss": 0.522, "step": 46210 }, { "epoch": 1.0287571225071226, "grad_norm": 0.5502168536186218, "learning_rate": 0.00029478264724514967, "loss": 0.4766, "step": 46220 }, { "epoch": 1.0289797008547008, "grad_norm": 0.487687885761261, "learning_rate": 0.0002947415904571556, "loss": 0.4861, "step": 46230 }, { "epoch": 1.0292022792022792, "grad_norm": 1.1762452125549316, "learning_rate": 0.000294700528520826, "loss": 0.6959, "step": 46240 }, { "epoch": 1.0294248575498575, "grad_norm": 0.814188539981842, "learning_rate": 0.0002946594614383922, "loss": 0.6722, "step": 46250 }, { "epoch": 1.029647435897436, "grad_norm": 0.46749377250671387, "learning_rate": 0.0002946183892120857, "loss": 0.5451, "step": 46260 }, { "epoch": 1.0298700142450143, "grad_norm": 0.6072514653205872, "learning_rate": 0.0002945773118441385, "loss": 0.5755, "step": 46270 }, { "epoch": 1.0300925925925926, "grad_norm": 0.48334813117980957, "learning_rate": 0.00029453622933678266, "loss": 0.7137, "step": 46280 }, { "epoch": 1.030315170940171, "grad_norm": 0.8070351481437683, "learning_rate": 0.0002944951416922509, "loss": 0.61, "step": 46290 }, { "epoch": 1.0305377492877492, "grad_norm": 0.5267581343650818, "learning_rate": 0.0002944540489127756, "loss": 0.5429, "step": 46300 }, { "epoch": 1.0307603276353277, "grad_norm": 0.6215224266052246, "learning_rate": 0.0002944129510005901, "loss": 0.5384, "step": 46310 }, { "epoch": 1.0309829059829059, "grad_norm": 0.8042832612991333, "learning_rate": 0.0002943718479579275, "loss": 0.5225, "step": 46320 }, { "epoch": 1.0312054843304843, "grad_norm": 0.6923868656158447, "learning_rate": 0.0002943307397870214, "loss": 0.5047, "step": 46330 }, { "epoch": 1.0314280626780628, "grad_norm": 0.7418728470802307, "learning_rate": 0.0002942896264901057, "loss": 0.6181, "step": 46340 }, { "epoch": 1.031650641025641, "grad_norm": 0.5733642578125, "learning_rate": 0.00029424850806941444, "loss": 0.5159, "step": 46350 }, { "epoch": 1.0318732193732194, "grad_norm": 0.42896318435668945, "learning_rate": 0.00029420738452718223, "loss": 0.5517, "step": 46360 }, { "epoch": 1.0320957977207976, "grad_norm": 0.8593009114265442, "learning_rate": 0.0002941662558656435, "loss": 0.5949, "step": 46370 }, { "epoch": 1.032318376068376, "grad_norm": 0.6495790481567383, "learning_rate": 0.00029412512208703347, "loss": 0.4398, "step": 46380 }, { "epoch": 1.0325409544159545, "grad_norm": 0.6790677905082703, "learning_rate": 0.0002940839831935871, "loss": 0.4867, "step": 46390 }, { "epoch": 1.0327635327635327, "grad_norm": 0.7355635166168213, "learning_rate": 0.0002940428391875402, "loss": 0.4541, "step": 46400 }, { "epoch": 1.0329861111111112, "grad_norm": 0.5827793478965759, "learning_rate": 0.00029400169007112834, "loss": 0.5366, "step": 46410 }, { "epoch": 1.0332086894586894, "grad_norm": 0.47678840160369873, "learning_rate": 0.0002939605358465877, "loss": 0.4735, "step": 46420 }, { "epoch": 1.0334312678062678, "grad_norm": 0.6106163263320923, "learning_rate": 0.00029391937651615464, "loss": 0.4913, "step": 46430 }, { "epoch": 1.0336538461538463, "grad_norm": 0.6181490421295166, "learning_rate": 0.00029387821208206574, "loss": 0.6202, "step": 46440 }, { "epoch": 1.0338764245014245, "grad_norm": 0.7003015875816345, "learning_rate": 0.000293837042546558, "loss": 0.5188, "step": 46450 }, { "epoch": 1.034099002849003, "grad_norm": 0.7870360612869263, "learning_rate": 0.00029379586791186853, "loss": 0.6886, "step": 46460 }, { "epoch": 1.0343215811965811, "grad_norm": 0.9469454288482666, "learning_rate": 0.0002937546881802348, "loss": 0.5678, "step": 46470 }, { "epoch": 1.0345441595441596, "grad_norm": 0.5496917963027954, "learning_rate": 0.00029371350335389456, "loss": 0.5574, "step": 46480 }, { "epoch": 1.0347667378917378, "grad_norm": 0.6661463379859924, "learning_rate": 0.00029367231343508586, "loss": 0.6327, "step": 46490 }, { "epoch": 1.0349893162393162, "grad_norm": 0.7526752948760986, "learning_rate": 0.00029363111842604694, "loss": 0.5669, "step": 46500 }, { "epoch": 1.0352118945868947, "grad_norm": 0.6870206594467163, "learning_rate": 0.0002935899183290165, "loss": 0.5794, "step": 46510 }, { "epoch": 1.0354344729344729, "grad_norm": 0.8028680682182312, "learning_rate": 0.0002935487131462331, "loss": 0.6761, "step": 46520 }, { "epoch": 1.0356570512820513, "grad_norm": 0.7931338548660278, "learning_rate": 0.00029350750287993613, "loss": 0.6181, "step": 46530 }, { "epoch": 1.0358796296296295, "grad_norm": 0.3808395564556122, "learning_rate": 0.00029346628753236493, "loss": 0.6137, "step": 46540 }, { "epoch": 1.036102207977208, "grad_norm": 0.8837310671806335, "learning_rate": 0.00029342506710575904, "loss": 0.5217, "step": 46550 }, { "epoch": 1.0363247863247864, "grad_norm": 0.8158642649650574, "learning_rate": 0.00029338384160235864, "loss": 0.6392, "step": 46560 }, { "epoch": 1.0365473646723646, "grad_norm": 0.7373179197311401, "learning_rate": 0.0002933426110244038, "loss": 0.6444, "step": 46570 }, { "epoch": 1.036769943019943, "grad_norm": 0.46640434861183167, "learning_rate": 0.00029330137537413514, "loss": 0.5789, "step": 46580 }, { "epoch": 1.0369925213675213, "grad_norm": 0.6358410716056824, "learning_rate": 0.0002932601346537932, "loss": 0.5697, "step": 46590 }, { "epoch": 1.0372150997150997, "grad_norm": 0.7933520078659058, "learning_rate": 0.00029321888886561933, "loss": 0.4974, "step": 46600 }, { "epoch": 1.0374376780626782, "grad_norm": 0.5213042497634888, "learning_rate": 0.00029317763801185465, "loss": 0.6363, "step": 46610 }, { "epoch": 1.0376602564102564, "grad_norm": 0.5368013978004456, "learning_rate": 0.0002931363820947409, "loss": 0.5479, "step": 46620 }, { "epoch": 1.0378828347578348, "grad_norm": 0.7525256872177124, "learning_rate": 0.0002930951211165198, "loss": 0.6095, "step": 46630 }, { "epoch": 1.038105413105413, "grad_norm": 0.8799393177032471, "learning_rate": 0.00029305385507943373, "loss": 0.5226, "step": 46640 }, { "epoch": 1.0383279914529915, "grad_norm": 0.597439706325531, "learning_rate": 0.000293012583985725, "loss": 0.6052, "step": 46650 }, { "epoch": 1.0385505698005697, "grad_norm": 0.7384129762649536, "learning_rate": 0.00029297130783763624, "loss": 0.5461, "step": 46660 }, { "epoch": 1.0387731481481481, "grad_norm": 0.7148140072822571, "learning_rate": 0.00029293002663741054, "loss": 0.4575, "step": 46670 }, { "epoch": 1.0389957264957266, "grad_norm": 0.8531787991523743, "learning_rate": 0.00029288874038729107, "loss": 0.6255, "step": 46680 }, { "epoch": 1.0392183048433048, "grad_norm": 0.6198742985725403, "learning_rate": 0.0002928474490895214, "loss": 0.4236, "step": 46690 }, { "epoch": 1.0394408831908832, "grad_norm": 0.6757825016975403, "learning_rate": 0.0002928061527463454, "loss": 0.6227, "step": 46700 }, { "epoch": 1.0396634615384615, "grad_norm": 0.7546913623809814, "learning_rate": 0.00029276485136000706, "loss": 0.5614, "step": 46710 }, { "epoch": 1.03988603988604, "grad_norm": 0.7523819804191589, "learning_rate": 0.0002927235449327508, "loss": 0.4941, "step": 46720 }, { "epoch": 1.0401086182336183, "grad_norm": 0.9511393308639526, "learning_rate": 0.0002926822334668211, "loss": 0.5301, "step": 46730 }, { "epoch": 1.0403311965811965, "grad_norm": 0.8329669833183289, "learning_rate": 0.00029264091696446306, "loss": 0.6019, "step": 46740 }, { "epoch": 1.040553774928775, "grad_norm": 0.6324247717857361, "learning_rate": 0.0002925995954279217, "loss": 0.5551, "step": 46750 }, { "epoch": 1.0407763532763532, "grad_norm": 0.6515164375305176, "learning_rate": 0.0002925582688594425, "loss": 0.5437, "step": 46760 }, { "epoch": 1.0409989316239316, "grad_norm": 0.4561392664909363, "learning_rate": 0.0002925169372612713, "loss": 0.4715, "step": 46770 }, { "epoch": 1.04122150997151, "grad_norm": 0.6248541474342346, "learning_rate": 0.0002924756006356539, "loss": 0.6171, "step": 46780 }, { "epoch": 1.0414440883190883, "grad_norm": 0.9043360352516174, "learning_rate": 0.0002924342589848367, "loss": 0.5258, "step": 46790 }, { "epoch": 1.0416666666666667, "grad_norm": 0.8318607807159424, "learning_rate": 0.0002923929123110661, "loss": 0.5527, "step": 46800 }, { "epoch": 1.041889245014245, "grad_norm": 0.764336347579956, "learning_rate": 0.0002923515606165891, "loss": 0.6689, "step": 46810 }, { "epoch": 1.0421118233618234, "grad_norm": 0.5335542559623718, "learning_rate": 0.0002923102039036527, "loss": 0.424, "step": 46820 }, { "epoch": 1.0423344017094016, "grad_norm": 0.6081898808479309, "learning_rate": 0.0002922688421745042, "loss": 0.6146, "step": 46830 }, { "epoch": 1.04255698005698, "grad_norm": 0.5974134802818298, "learning_rate": 0.00029222747543139135, "loss": 0.5533, "step": 46840 }, { "epoch": 1.0427795584045585, "grad_norm": 0.7155280709266663, "learning_rate": 0.0002921861036765619, "loss": 0.6214, "step": 46850 }, { "epoch": 1.0430021367521367, "grad_norm": 0.5848572254180908, "learning_rate": 0.0002921447269122642, "loss": 0.6203, "step": 46860 }, { "epoch": 1.0432247150997151, "grad_norm": 0.5690578818321228, "learning_rate": 0.00029210334514074654, "loss": 0.4135, "step": 46870 }, { "epoch": 1.0434472934472934, "grad_norm": 0.8502477407455444, "learning_rate": 0.00029206195836425767, "loss": 0.6178, "step": 46880 }, { "epoch": 1.0436698717948718, "grad_norm": 0.7801943421363831, "learning_rate": 0.0002920205665850467, "loss": 0.6523, "step": 46890 }, { "epoch": 1.0438924501424502, "grad_norm": 0.48374879360198975, "learning_rate": 0.00029197916980536274, "loss": 0.4545, "step": 46900 }, { "epoch": 1.0441150284900285, "grad_norm": 0.4702428877353668, "learning_rate": 0.00029193776802745547, "loss": 0.4453, "step": 46910 }, { "epoch": 1.044337606837607, "grad_norm": 0.6450238227844238, "learning_rate": 0.0002918963612535746, "loss": 0.5475, "step": 46920 }, { "epoch": 1.0445601851851851, "grad_norm": 0.596511721611023, "learning_rate": 0.00029185494948597024, "loss": 0.6097, "step": 46930 }, { "epoch": 1.0447827635327636, "grad_norm": 0.6973013281822205, "learning_rate": 0.0002918135327268927, "loss": 0.6392, "step": 46940 }, { "epoch": 1.0450053418803418, "grad_norm": 0.6602265238761902, "learning_rate": 0.0002917721109785926, "loss": 0.5507, "step": 46950 }, { "epoch": 1.0452279202279202, "grad_norm": 0.6385603547096252, "learning_rate": 0.00029173068424332094, "loss": 0.623, "step": 46960 }, { "epoch": 1.0454504985754987, "grad_norm": 0.49321964383125305, "learning_rate": 0.0002916892525233288, "loss": 0.5145, "step": 46970 }, { "epoch": 1.0456730769230769, "grad_norm": 0.6037874817848206, "learning_rate": 0.00029164781582086753, "loss": 0.5974, "step": 46980 }, { "epoch": 1.0458956552706553, "grad_norm": 0.5289360880851746, "learning_rate": 0.000291606374138189, "loss": 0.6204, "step": 46990 }, { "epoch": 1.0461182336182335, "grad_norm": 1.165456771850586, "learning_rate": 0.0002915649274775451, "loss": 0.5946, "step": 47000 }, { "epoch": 1.046340811965812, "grad_norm": 0.9170607924461365, "learning_rate": 0.000291523475841188, "loss": 0.678, "step": 47010 }, { "epoch": 1.0465633903133904, "grad_norm": 0.6722274422645569, "learning_rate": 0.0002914820192313704, "loss": 0.6116, "step": 47020 }, { "epoch": 1.0467859686609686, "grad_norm": 0.738879919052124, "learning_rate": 0.0002914405576503449, "loss": 0.4964, "step": 47030 }, { "epoch": 1.047008547008547, "grad_norm": 0.9060617685317993, "learning_rate": 0.0002913990911003647, "loss": 0.5929, "step": 47040 }, { "epoch": 1.0472311253561253, "grad_norm": 0.45788079500198364, "learning_rate": 0.00029135761958368303, "loss": 0.5966, "step": 47050 }, { "epoch": 1.0474537037037037, "grad_norm": 1.0104951858520508, "learning_rate": 0.00029131614310255353, "loss": 0.5632, "step": 47060 }, { "epoch": 1.047676282051282, "grad_norm": 0.4078928530216217, "learning_rate": 0.0002912746616592301, "loss": 0.5451, "step": 47070 }, { "epoch": 1.0478988603988604, "grad_norm": 0.43406420946121216, "learning_rate": 0.0002912331752559668, "loss": 0.4626, "step": 47080 }, { "epoch": 1.0481214387464388, "grad_norm": 0.7026646137237549, "learning_rate": 0.00029119168389501803, "loss": 0.6345, "step": 47090 }, { "epoch": 1.048344017094017, "grad_norm": 0.4128137528896332, "learning_rate": 0.0002911501875786386, "loss": 0.5166, "step": 47100 }, { "epoch": 1.0485665954415955, "grad_norm": 0.6436150670051575, "learning_rate": 0.00029110868630908334, "loss": 0.5672, "step": 47110 }, { "epoch": 1.0487891737891737, "grad_norm": 0.7415764927864075, "learning_rate": 0.00029106718008860743, "loss": 0.5835, "step": 47120 }, { "epoch": 1.0490117521367521, "grad_norm": 0.6365389227867126, "learning_rate": 0.00029102566891946635, "loss": 0.6038, "step": 47130 }, { "epoch": 1.0492343304843306, "grad_norm": 0.8669440746307373, "learning_rate": 0.000290984152803916, "loss": 0.6079, "step": 47140 }, { "epoch": 1.0494569088319088, "grad_norm": 0.6483818888664246, "learning_rate": 0.0002909426317442123, "loss": 0.5265, "step": 47150 }, { "epoch": 1.0496794871794872, "grad_norm": 0.6815274953842163, "learning_rate": 0.00029090110574261154, "loss": 0.6666, "step": 47160 }, { "epoch": 1.0499020655270654, "grad_norm": 0.9355290532112122, "learning_rate": 0.0002908595748013702, "loss": 0.6024, "step": 47170 }, { "epoch": 1.0501246438746439, "grad_norm": 0.883792519569397, "learning_rate": 0.00029081803892274527, "loss": 0.5186, "step": 47180 }, { "epoch": 1.0503472222222223, "grad_norm": 0.6974046230316162, "learning_rate": 0.00029077649810899374, "loss": 0.5896, "step": 47190 }, { "epoch": 1.0505698005698005, "grad_norm": 0.6341229677200317, "learning_rate": 0.000290734952362373, "loss": 0.4812, "step": 47200 }, { "epoch": 1.050792378917379, "grad_norm": 0.8992486596107483, "learning_rate": 0.0002906934016851406, "loss": 0.6442, "step": 47210 }, { "epoch": 1.0510149572649572, "grad_norm": 0.526816725730896, "learning_rate": 0.0002906518460795546, "loss": 0.5308, "step": 47220 }, { "epoch": 1.0512375356125356, "grad_norm": 0.8405412435531616, "learning_rate": 0.00029061028554787306, "loss": 0.5937, "step": 47230 }, { "epoch": 1.0514601139601139, "grad_norm": 0.7325668334960938, "learning_rate": 0.0002905687200923544, "loss": 0.6085, "step": 47240 }, { "epoch": 1.0516826923076923, "grad_norm": 0.8678027987480164, "learning_rate": 0.00029052714971525734, "loss": 0.5914, "step": 47250 }, { "epoch": 1.0519052706552707, "grad_norm": 0.6910134553909302, "learning_rate": 0.00029048557441884083, "loss": 0.601, "step": 47260 }, { "epoch": 1.052127849002849, "grad_norm": 0.8204662203788757, "learning_rate": 0.0002904439942053641, "loss": 0.6606, "step": 47270 }, { "epoch": 1.0523504273504274, "grad_norm": 0.5311092734336853, "learning_rate": 0.0002904024090770868, "loss": 0.5108, "step": 47280 }, { "epoch": 1.0525730056980056, "grad_norm": 0.7201509475708008, "learning_rate": 0.0002903608190362685, "loss": 0.5197, "step": 47290 }, { "epoch": 1.052795584045584, "grad_norm": 0.7765344381332397, "learning_rate": 0.0002903192240851694, "loss": 0.5792, "step": 47300 }, { "epoch": 1.0530181623931625, "grad_norm": 0.6695293188095093, "learning_rate": 0.0002902776242260497, "loss": 0.5312, "step": 47310 }, { "epoch": 1.0532407407407407, "grad_norm": 0.703255295753479, "learning_rate": 0.00029023601946116996, "loss": 0.6347, "step": 47320 }, { "epoch": 1.0534633190883191, "grad_norm": 0.7623192071914673, "learning_rate": 0.0002901944097927911, "loss": 0.6112, "step": 47330 }, { "epoch": 1.0536858974358974, "grad_norm": 0.920707106590271, "learning_rate": 0.00029015279522317405, "loss": 0.5685, "step": 47340 }, { "epoch": 1.0539084757834758, "grad_norm": 0.744606077671051, "learning_rate": 0.00029011117575458045, "loss": 0.4672, "step": 47350 }, { "epoch": 1.0541310541310542, "grad_norm": 0.7063766121864319, "learning_rate": 0.0002900695513892717, "loss": 0.5412, "step": 47360 }, { "epoch": 1.0543536324786325, "grad_norm": 0.6504448056221008, "learning_rate": 0.00029002792212950984, "loss": 0.6068, "step": 47370 }, { "epoch": 1.054576210826211, "grad_norm": 0.7272416353225708, "learning_rate": 0.000289986287977557, "loss": 0.5946, "step": 47380 }, { "epoch": 1.054798789173789, "grad_norm": 0.4593257009983063, "learning_rate": 0.00028994464893567553, "loss": 0.4715, "step": 47390 }, { "epoch": 1.0550213675213675, "grad_norm": 1.023835301399231, "learning_rate": 0.0002899030050061283, "loss": 0.6391, "step": 47400 }, { "epoch": 1.0552439458689458, "grad_norm": 0.6272522211074829, "learning_rate": 0.0002898613561911781, "loss": 0.5719, "step": 47410 }, { "epoch": 1.0554665242165242, "grad_norm": 0.784984290599823, "learning_rate": 0.0002898197024930883, "loss": 0.5492, "step": 47420 }, { "epoch": 1.0556891025641026, "grad_norm": 0.8112901449203491, "learning_rate": 0.0002897780439141223, "loss": 0.5022, "step": 47430 }, { "epoch": 1.0559116809116809, "grad_norm": 0.8352168202400208, "learning_rate": 0.00028973638045654395, "loss": 0.6067, "step": 47440 }, { "epoch": 1.0561342592592593, "grad_norm": 0.8020703792572021, "learning_rate": 0.0002896947121226172, "loss": 0.5609, "step": 47450 }, { "epoch": 1.0563568376068375, "grad_norm": 0.6858905553817749, "learning_rate": 0.00028965303891460636, "loss": 0.5885, "step": 47460 }, { "epoch": 1.056579415954416, "grad_norm": 0.8974461555480957, "learning_rate": 0.0002896113608347759, "loss": 0.5669, "step": 47470 }, { "epoch": 1.0568019943019944, "grad_norm": 1.0188039541244507, "learning_rate": 0.0002895696778853908, "loss": 0.5348, "step": 47480 }, { "epoch": 1.0570245726495726, "grad_norm": 0.798266589641571, "learning_rate": 0.0002895279900687161, "loss": 0.6305, "step": 47490 }, { "epoch": 1.057247150997151, "grad_norm": 0.7277517318725586, "learning_rate": 0.0002894862973870172, "loss": 0.5724, "step": 47500 }, { "epoch": 1.0574697293447293, "grad_norm": 0.862324059009552, "learning_rate": 0.00028944459984255955, "loss": 0.561, "step": 47510 }, { "epoch": 1.0576923076923077, "grad_norm": 0.45276638865470886, "learning_rate": 0.00028940289743760916, "loss": 0.5345, "step": 47520 }, { "epoch": 1.0579148860398861, "grad_norm": 0.6970518231391907, "learning_rate": 0.0002893611901744321, "loss": 0.4926, "step": 47530 }, { "epoch": 1.0581374643874644, "grad_norm": 1.012525200843811, "learning_rate": 0.0002893194780552948, "loss": 0.5492, "step": 47540 }, { "epoch": 1.0583600427350428, "grad_norm": 0.7472673654556274, "learning_rate": 0.000289277761082464, "loss": 0.5119, "step": 47550 }, { "epoch": 1.058582621082621, "grad_norm": 0.8029112815856934, "learning_rate": 0.00028923603925820656, "loss": 0.5215, "step": 47560 }, { "epoch": 1.0588051994301995, "grad_norm": 0.5627382397651672, "learning_rate": 0.00028919431258478965, "loss": 0.5293, "step": 47570 }, { "epoch": 1.0590277777777777, "grad_norm": 0.8303996324539185, "learning_rate": 0.0002891525810644808, "loss": 0.5951, "step": 47580 }, { "epoch": 1.0592503561253561, "grad_norm": 0.46132200956344604, "learning_rate": 0.0002891108446995477, "loss": 0.5242, "step": 47590 }, { "epoch": 1.0594729344729346, "grad_norm": 0.5943209528923035, "learning_rate": 0.0002890691034922584, "loss": 0.5589, "step": 47600 }, { "epoch": 1.0596955128205128, "grad_norm": 0.5154911875724792, "learning_rate": 0.00028902735744488106, "loss": 0.623, "step": 47610 }, { "epoch": 1.0599180911680912, "grad_norm": 0.5615541338920593, "learning_rate": 0.0002889856065596842, "loss": 0.6751, "step": 47620 }, { "epoch": 1.0601406695156694, "grad_norm": 1.0154298543930054, "learning_rate": 0.00028894385083893674, "loss": 0.7437, "step": 47630 }, { "epoch": 1.0603632478632479, "grad_norm": 1.1956875324249268, "learning_rate": 0.0002889020902849075, "loss": 0.6716, "step": 47640 }, { "epoch": 1.0605858262108263, "grad_norm": 0.6510722637176514, "learning_rate": 0.00028886032489986596, "loss": 0.5675, "step": 47650 }, { "epoch": 1.0608084045584045, "grad_norm": 0.8667111992835999, "learning_rate": 0.0002888185546860816, "loss": 0.6497, "step": 47660 }, { "epoch": 1.061030982905983, "grad_norm": 0.794508159160614, "learning_rate": 0.0002887767796458243, "loss": 0.6908, "step": 47670 }, { "epoch": 1.0612535612535612, "grad_norm": 0.8232386112213135, "learning_rate": 0.000288734999781364, "loss": 0.619, "step": 47680 }, { "epoch": 1.0614761396011396, "grad_norm": 0.3844764530658722, "learning_rate": 0.0002886932150949713, "loss": 0.6195, "step": 47690 }, { "epoch": 1.061698717948718, "grad_norm": 0.6595999002456665, "learning_rate": 0.0002886514255889167, "loss": 0.5477, "step": 47700 }, { "epoch": 1.0619212962962963, "grad_norm": 0.6769583225250244, "learning_rate": 0.00028860963126547094, "loss": 0.4758, "step": 47710 }, { "epoch": 1.0621438746438747, "grad_norm": 0.5123394727706909, "learning_rate": 0.00028856783212690535, "loss": 0.5444, "step": 47720 }, { "epoch": 1.062366452991453, "grad_norm": 0.6297634840011597, "learning_rate": 0.00028852602817549123, "loss": 0.5524, "step": 47730 }, { "epoch": 1.0625890313390314, "grad_norm": 0.8499884009361267, "learning_rate": 0.0002884842194135003, "loss": 0.5867, "step": 47740 }, { "epoch": 1.0628116096866096, "grad_norm": 0.6831232309341431, "learning_rate": 0.00028844240584320445, "loss": 0.7312, "step": 47750 }, { "epoch": 1.063034188034188, "grad_norm": 0.534511387348175, "learning_rate": 0.00028840058746687584, "loss": 0.5455, "step": 47760 }, { "epoch": 1.0632567663817665, "grad_norm": 0.6291297078132629, "learning_rate": 0.000288358764286787, "loss": 0.5431, "step": 47770 }, { "epoch": 1.0634793447293447, "grad_norm": 0.3100387454032898, "learning_rate": 0.0002883169363052105, "loss": 0.6269, "step": 47780 }, { "epoch": 1.0637019230769231, "grad_norm": 0.43622884154319763, "learning_rate": 0.0002882751035244194, "loss": 0.5198, "step": 47790 }, { "epoch": 1.0639245014245013, "grad_norm": 0.6858329772949219, "learning_rate": 0.00028823326594668697, "loss": 0.6213, "step": 47800 }, { "epoch": 1.0641470797720798, "grad_norm": 0.8117880821228027, "learning_rate": 0.0002881914235742865, "loss": 0.5976, "step": 47810 }, { "epoch": 1.064369658119658, "grad_norm": 0.7009490132331848, "learning_rate": 0.000288149576409492, "loss": 0.5409, "step": 47820 }, { "epoch": 1.0645922364672364, "grad_norm": 0.5884186029434204, "learning_rate": 0.0002881077244545773, "loss": 0.6015, "step": 47830 }, { "epoch": 1.0648148148148149, "grad_norm": 0.8653995394706726, "learning_rate": 0.0002880658677118168, "loss": 0.5313, "step": 47840 }, { "epoch": 1.065037393162393, "grad_norm": 0.6076104044914246, "learning_rate": 0.0002880240061834849, "loss": 0.5347, "step": 47850 }, { "epoch": 1.0652599715099715, "grad_norm": 0.4640044569969177, "learning_rate": 0.0002879821398718564, "loss": 0.4839, "step": 47860 }, { "epoch": 1.0654825498575498, "grad_norm": 0.9877360463142395, "learning_rate": 0.0002879402687792064, "loss": 0.5968, "step": 47870 }, { "epoch": 1.0657051282051282, "grad_norm": 0.857296347618103, "learning_rate": 0.00028789839290781026, "loss": 0.6268, "step": 47880 }, { "epoch": 1.0659277065527066, "grad_norm": 0.6341042518615723, "learning_rate": 0.00028785651225994346, "loss": 0.5405, "step": 47890 }, { "epoch": 1.0661502849002849, "grad_norm": 0.7061439156532288, "learning_rate": 0.00028781462683788185, "loss": 0.5172, "step": 47900 }, { "epoch": 1.0663728632478633, "grad_norm": 0.7179070115089417, "learning_rate": 0.0002877727366439016, "loss": 0.6257, "step": 47910 }, { "epoch": 1.0665954415954415, "grad_norm": 0.6872360110282898, "learning_rate": 0.0002877308416802789, "loss": 0.5932, "step": 47920 }, { "epoch": 1.06681801994302, "grad_norm": 0.5205060839653015, "learning_rate": 0.00028768894194929046, "loss": 0.5841, "step": 47930 }, { "epoch": 1.0670405982905984, "grad_norm": 0.6895760893821716, "learning_rate": 0.0002876470374532132, "loss": 0.6624, "step": 47940 }, { "epoch": 1.0672631766381766, "grad_norm": 0.7013905048370361, "learning_rate": 0.0002876051281943241, "loss": 0.5467, "step": 47950 }, { "epoch": 1.067485754985755, "grad_norm": 0.5206983089447021, "learning_rate": 0.00028756321417490064, "loss": 0.599, "step": 47960 }, { "epoch": 1.0677083333333333, "grad_norm": 0.6484621167182922, "learning_rate": 0.0002875212953972204, "loss": 0.5841, "step": 47970 }, { "epoch": 1.0679309116809117, "grad_norm": 0.47715893387794495, "learning_rate": 0.00028747937186356136, "loss": 0.6958, "step": 47980 }, { "epoch": 1.06815349002849, "grad_norm": 0.343030720949173, "learning_rate": 0.00028743744357620163, "loss": 0.5377, "step": 47990 }, { "epoch": 1.0683760683760684, "grad_norm": 0.7047831416130066, "learning_rate": 0.0002873955105374196, "loss": 0.6889, "step": 48000 }, { "epoch": 1.0685986467236468, "grad_norm": 0.8576478958129883, "learning_rate": 0.00028735357274949406, "loss": 0.5977, "step": 48010 }, { "epoch": 1.068821225071225, "grad_norm": 0.8345176577568054, "learning_rate": 0.0002873116302147039, "loss": 0.6413, "step": 48020 }, { "epoch": 1.0690438034188035, "grad_norm": 0.9953126311302185, "learning_rate": 0.0002872696829353282, "loss": 0.4842, "step": 48030 }, { "epoch": 1.0692663817663817, "grad_norm": 0.7177563905715942, "learning_rate": 0.0002872277309136464, "loss": 0.6846, "step": 48040 }, { "epoch": 1.06948896011396, "grad_norm": 0.37509727478027344, "learning_rate": 0.00028718577415193843, "loss": 0.5025, "step": 48050 }, { "epoch": 1.0697115384615385, "grad_norm": 0.6414170861244202, "learning_rate": 0.0002871438126524841, "loss": 0.5031, "step": 48060 }, { "epoch": 1.0699341168091168, "grad_norm": 0.6359707713127136, "learning_rate": 0.0002871018464175636, "loss": 0.5773, "step": 48070 }, { "epoch": 1.0701566951566952, "grad_norm": 0.6460133194923401, "learning_rate": 0.0002870598754494575, "loss": 0.6274, "step": 48080 }, { "epoch": 1.0703792735042734, "grad_norm": 0.5895810127258301, "learning_rate": 0.0002870178997504465, "loss": 0.5491, "step": 48090 }, { "epoch": 1.0706018518518519, "grad_norm": 0.7487587332725525, "learning_rate": 0.0002869759193228116, "loss": 0.5297, "step": 48100 }, { "epoch": 1.0708244301994303, "grad_norm": 1.058447003364563, "learning_rate": 0.000286933934168834, "loss": 0.7013, "step": 48110 }, { "epoch": 1.0710470085470085, "grad_norm": 0.6013885140419006, "learning_rate": 0.0002868919442907953, "loss": 0.6068, "step": 48120 }, { "epoch": 1.071269586894587, "grad_norm": 0.5251742601394653, "learning_rate": 0.00028684994969097716, "loss": 0.7068, "step": 48130 }, { "epoch": 1.0714921652421652, "grad_norm": 0.5119776725769043, "learning_rate": 0.00028680795037166166, "loss": 0.6008, "step": 48140 }, { "epoch": 1.0717147435897436, "grad_norm": 1.0093798637390137, "learning_rate": 0.0002867659463351311, "loss": 0.6564, "step": 48150 }, { "epoch": 1.0719373219373218, "grad_norm": 0.8163377046585083, "learning_rate": 0.000286723937583668, "loss": 0.6341, "step": 48160 }, { "epoch": 1.0721599002849003, "grad_norm": 0.7382004857063293, "learning_rate": 0.00028668192411955513, "loss": 0.5709, "step": 48170 }, { "epoch": 1.0723824786324787, "grad_norm": 0.5771430730819702, "learning_rate": 0.0002866399059450755, "loss": 0.5574, "step": 48180 }, { "epoch": 1.072605056980057, "grad_norm": 0.5379883646965027, "learning_rate": 0.00028659788306251247, "loss": 0.5961, "step": 48190 }, { "epoch": 1.0728276353276354, "grad_norm": 0.404498815536499, "learning_rate": 0.0002865558554741496, "loss": 0.6153, "step": 48200 }, { "epoch": 1.0730502136752136, "grad_norm": 0.7866203188896179, "learning_rate": 0.0002865138231822706, "loss": 0.6833, "step": 48210 }, { "epoch": 1.073272792022792, "grad_norm": 0.8402442932128906, "learning_rate": 0.0002864717861891598, "loss": 0.6441, "step": 48220 }, { "epoch": 1.0734953703703705, "grad_norm": 0.4625440239906311, "learning_rate": 0.0002864297444971013, "loss": 0.6017, "step": 48230 }, { "epoch": 1.0737179487179487, "grad_norm": 0.6925306916236877, "learning_rate": 0.0002863876981083796, "loss": 0.5939, "step": 48240 }, { "epoch": 1.0739405270655271, "grad_norm": 0.7206820249557495, "learning_rate": 0.00028634564702527973, "loss": 0.4961, "step": 48250 }, { "epoch": 1.0741631054131053, "grad_norm": 0.9899659752845764, "learning_rate": 0.00028630359125008677, "loss": 0.6196, "step": 48260 }, { "epoch": 1.0743856837606838, "grad_norm": 0.505004346370697, "learning_rate": 0.00028626153078508597, "loss": 0.5804, "step": 48270 }, { "epoch": 1.0746082621082622, "grad_norm": 0.46326836943626404, "learning_rate": 0.000286219465632563, "loss": 0.6611, "step": 48280 }, { "epoch": 1.0748308404558404, "grad_norm": 0.6721147298812866, "learning_rate": 0.0002861773957948036, "loss": 0.5854, "step": 48290 }, { "epoch": 1.0750534188034189, "grad_norm": 0.8311693072319031, "learning_rate": 0.0002861353212740941, "loss": 0.5729, "step": 48300 }, { "epoch": 1.075275997150997, "grad_norm": 0.47469931840896606, "learning_rate": 0.0002860932420727206, "loss": 0.5694, "step": 48310 }, { "epoch": 1.0754985754985755, "grad_norm": 0.44514262676239014, "learning_rate": 0.0002860511581929699, "loss": 0.671, "step": 48320 }, { "epoch": 1.0757211538461537, "grad_norm": 0.6041156649589539, "learning_rate": 0.00028600906963712885, "loss": 0.6614, "step": 48330 }, { "epoch": 1.0759437321937322, "grad_norm": 0.5938594341278076, "learning_rate": 0.00028596697640748445, "loss": 0.4862, "step": 48340 }, { "epoch": 1.0761663105413106, "grad_norm": 0.8382576704025269, "learning_rate": 0.0002859248785063243, "loss": 0.5732, "step": 48350 }, { "epoch": 1.0763888888888888, "grad_norm": 0.6398228406906128, "learning_rate": 0.0002858827759359358, "loss": 0.519, "step": 48360 }, { "epoch": 1.0766114672364673, "grad_norm": 1.0226026773452759, "learning_rate": 0.00028584066869860705, "loss": 0.4869, "step": 48370 }, { "epoch": 1.0768340455840455, "grad_norm": 0.42375320196151733, "learning_rate": 0.00028579855679662603, "loss": 0.4687, "step": 48380 }, { "epoch": 1.077056623931624, "grad_norm": 0.6117098331451416, "learning_rate": 0.00028575644023228115, "loss": 0.5195, "step": 48390 }, { "epoch": 1.0772792022792024, "grad_norm": 0.6376365423202515, "learning_rate": 0.0002857143190078612, "loss": 0.4911, "step": 48400 }, { "epoch": 1.0775017806267806, "grad_norm": 0.5959184169769287, "learning_rate": 0.00028567219312565495, "loss": 0.6362, "step": 48410 }, { "epoch": 1.077724358974359, "grad_norm": 0.5974500775337219, "learning_rate": 0.0002856300625879516, "loss": 0.6091, "step": 48420 }, { "epoch": 1.0779469373219372, "grad_norm": 0.685723602771759, "learning_rate": 0.00028558792739704047, "loss": 0.5275, "step": 48430 }, { "epoch": 1.0781695156695157, "grad_norm": 0.7660711407661438, "learning_rate": 0.00028554578755521137, "loss": 0.6141, "step": 48440 }, { "epoch": 1.0783920940170941, "grad_norm": 0.6586264967918396, "learning_rate": 0.00028550364306475416, "loss": 0.5741, "step": 48450 }, { "epoch": 1.0786146723646723, "grad_norm": 0.6372463703155518, "learning_rate": 0.0002854614939279589, "loss": 0.5087, "step": 48460 }, { "epoch": 1.0788372507122508, "grad_norm": 0.6863496899604797, "learning_rate": 0.00028541934014711623, "loss": 0.6393, "step": 48470 }, { "epoch": 1.079059829059829, "grad_norm": 0.3809172511100769, "learning_rate": 0.00028537718172451664, "loss": 0.5958, "step": 48480 }, { "epoch": 1.0792824074074074, "grad_norm": 0.7188693284988403, "learning_rate": 0.00028533501866245104, "loss": 0.4992, "step": 48490 }, { "epoch": 1.0795049857549857, "grad_norm": 0.5671699643135071, "learning_rate": 0.00028529285096321074, "loss": 0.5711, "step": 48500 }, { "epoch": 1.079727564102564, "grad_norm": 0.669846773147583, "learning_rate": 0.0002852506786290871, "loss": 0.5551, "step": 48510 }, { "epoch": 1.0799501424501425, "grad_norm": 0.6436952352523804, "learning_rate": 0.0002852085016623717, "loss": 0.5843, "step": 48520 }, { "epoch": 1.080128205128205, "eval_loss": 0.597112238407135, "eval_runtime": 337.36, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 48528 }, { "epoch": 1.0801727207977208, "grad_norm": 0.5778088569641113, "learning_rate": 0.0002851663200653566, "loss": 0.5245, "step": 48530 }, { "epoch": 1.0803952991452992, "grad_norm": 0.6053064465522766, "learning_rate": 0.000285124133840334, "loss": 0.5655, "step": 48540 }, { "epoch": 1.0806178774928774, "grad_norm": 0.7350863814353943, "learning_rate": 0.0002850819429895963, "loss": 0.5324, "step": 48550 }, { "epoch": 1.0808404558404558, "grad_norm": 0.568662703037262, "learning_rate": 0.0002850397475154361, "loss": 0.586, "step": 48560 }, { "epoch": 1.0810630341880343, "grad_norm": 0.5690873861312866, "learning_rate": 0.00028499754742014637, "loss": 0.4823, "step": 48570 }, { "epoch": 1.0812856125356125, "grad_norm": 0.7732914686203003, "learning_rate": 0.00028495534270602045, "loss": 0.5866, "step": 48580 }, { "epoch": 1.081508190883191, "grad_norm": 0.856427788734436, "learning_rate": 0.00028491313337535154, "loss": 0.6257, "step": 48590 }, { "epoch": 1.0817307692307692, "grad_norm": 0.6734537482261658, "learning_rate": 0.0002848709194304335, "loss": 0.5378, "step": 48600 }, { "epoch": 1.0819533475783476, "grad_norm": 0.6803973317146301, "learning_rate": 0.00028482870087356024, "loss": 0.4872, "step": 48610 }, { "epoch": 1.082175925925926, "grad_norm": 0.6282123923301697, "learning_rate": 0.00028478647770702593, "loss": 0.5887, "step": 48620 }, { "epoch": 1.0823985042735043, "grad_norm": 0.4611707627773285, "learning_rate": 0.000284744249933125, "loss": 0.5363, "step": 48630 }, { "epoch": 1.0826210826210827, "grad_norm": 0.7379218339920044, "learning_rate": 0.0002847020175541521, "loss": 0.6323, "step": 48640 }, { "epoch": 1.082843660968661, "grad_norm": 0.745467483997345, "learning_rate": 0.00028465978057240233, "loss": 0.5655, "step": 48650 }, { "epoch": 1.0830662393162394, "grad_norm": 1.1102386713027954, "learning_rate": 0.0002846175389901707, "loss": 0.6453, "step": 48660 }, { "epoch": 1.0832888176638176, "grad_norm": 0.4855097532272339, "learning_rate": 0.0002845752928097527, "loss": 0.4949, "step": 48670 }, { "epoch": 1.083511396011396, "grad_norm": 0.6703089475631714, "learning_rate": 0.00028453304203344417, "loss": 0.6062, "step": 48680 }, { "epoch": 1.0837339743589745, "grad_norm": 0.6551437377929688, "learning_rate": 0.0002844907866635409, "loss": 0.5481, "step": 48690 }, { "epoch": 1.0839565527065527, "grad_norm": 0.49450287222862244, "learning_rate": 0.00028444852670233905, "loss": 0.712, "step": 48700 }, { "epoch": 1.084179131054131, "grad_norm": 0.9869162440299988, "learning_rate": 0.00028440626215213514, "loss": 0.6178, "step": 48710 }, { "epoch": 1.0844017094017093, "grad_norm": 0.7060110569000244, "learning_rate": 0.0002843639930152259, "loss": 0.5479, "step": 48720 }, { "epoch": 1.0846242877492878, "grad_norm": 0.6050060391426086, "learning_rate": 0.00028432171929390816, "loss": 0.5437, "step": 48730 }, { "epoch": 1.084846866096866, "grad_norm": 0.5769473314285278, "learning_rate": 0.0002842794409904792, "loss": 0.5704, "step": 48740 }, { "epoch": 1.0850694444444444, "grad_norm": 0.6124515533447266, "learning_rate": 0.00028423715810723646, "loss": 0.4613, "step": 48750 }, { "epoch": 1.0852920227920229, "grad_norm": 0.7483047842979431, "learning_rate": 0.00028419487064647753, "loss": 0.5731, "step": 48760 }, { "epoch": 1.085514601139601, "grad_norm": 0.6910437345504761, "learning_rate": 0.0002841525786105003, "loss": 0.5796, "step": 48770 }, { "epoch": 1.0857371794871795, "grad_norm": 0.9437000751495361, "learning_rate": 0.00028411028200160324, "loss": 0.6652, "step": 48780 }, { "epoch": 1.0859597578347577, "grad_norm": 0.8151460886001587, "learning_rate": 0.0002840679808220845, "loss": 0.649, "step": 48790 }, { "epoch": 1.0861823361823362, "grad_norm": 0.6039183735847473, "learning_rate": 0.0002840256750742429, "loss": 0.509, "step": 48800 }, { "epoch": 1.0864049145299146, "grad_norm": 0.7109172344207764, "learning_rate": 0.00028398336476037736, "loss": 0.4449, "step": 48810 }, { "epoch": 1.0866274928774928, "grad_norm": 0.5566388964653015, "learning_rate": 0.000283941049882787, "loss": 0.5998, "step": 48820 }, { "epoch": 1.0868500712250713, "grad_norm": 0.5644212365150452, "learning_rate": 0.00028389873044377126, "loss": 0.5577, "step": 48830 }, { "epoch": 1.0870726495726495, "grad_norm": 0.5297816395759583, "learning_rate": 0.0002838564064456298, "loss": 0.4875, "step": 48840 }, { "epoch": 1.087295227920228, "grad_norm": 0.792813241481781, "learning_rate": 0.0002838140778906626, "loss": 0.6341, "step": 48850 }, { "epoch": 1.0875178062678064, "grad_norm": 0.5224297642707825, "learning_rate": 0.0002837717447811698, "loss": 0.5436, "step": 48860 }, { "epoch": 1.0877403846153846, "grad_norm": 0.7507627606391907, "learning_rate": 0.0002837294071194518, "loss": 0.531, "step": 48870 }, { "epoch": 1.087962962962963, "grad_norm": 0.8729502558708191, "learning_rate": 0.0002836870649078092, "loss": 0.7076, "step": 48880 }, { "epoch": 1.0881855413105412, "grad_norm": 0.6775315403938293, "learning_rate": 0.00028364471814854307, "loss": 0.5732, "step": 48890 }, { "epoch": 1.0884081196581197, "grad_norm": 0.43340983986854553, "learning_rate": 0.00028360236684395445, "loss": 0.6266, "step": 48900 }, { "epoch": 1.088630698005698, "grad_norm": 0.7650466561317444, "learning_rate": 0.00028356001099634476, "loss": 0.594, "step": 48910 }, { "epoch": 1.0888532763532763, "grad_norm": 0.7358598709106445, "learning_rate": 0.00028351765060801576, "loss": 0.5908, "step": 48920 }, { "epoch": 1.0890758547008548, "grad_norm": 0.688077986240387, "learning_rate": 0.00028347528568126916, "loss": 0.6863, "step": 48930 }, { "epoch": 1.089298433048433, "grad_norm": 0.714925229549408, "learning_rate": 0.00028343291621840726, "loss": 0.6128, "step": 48940 }, { "epoch": 1.0895210113960114, "grad_norm": 0.7040088176727295, "learning_rate": 0.00028339054222173237, "loss": 0.5468, "step": 48950 }, { "epoch": 1.0897435897435896, "grad_norm": 0.5987093448638916, "learning_rate": 0.00028334816369354716, "loss": 0.704, "step": 48960 }, { "epoch": 1.089966168091168, "grad_norm": 0.6418634653091431, "learning_rate": 0.00028330578063615443, "loss": 0.4999, "step": 48970 }, { "epoch": 1.0901887464387465, "grad_norm": 0.6263144612312317, "learning_rate": 0.0002832633930518574, "loss": 0.5361, "step": 48980 }, { "epoch": 1.0904113247863247, "grad_norm": 0.6335980296134949, "learning_rate": 0.00028322100094295953, "loss": 0.6292, "step": 48990 }, { "epoch": 1.0906339031339032, "grad_norm": 0.8092513084411621, "learning_rate": 0.0002831786043117643, "loss": 0.6042, "step": 49000 }, { "epoch": 1.0908564814814814, "grad_norm": 0.5754400491714478, "learning_rate": 0.00028313620316057557, "loss": 0.4429, "step": 49010 }, { "epoch": 1.0910790598290598, "grad_norm": 0.5414375066757202, "learning_rate": 0.0002830937974916975, "loss": 0.5623, "step": 49020 }, { "epoch": 1.0913016381766383, "grad_norm": 0.699462890625, "learning_rate": 0.00028305138730743453, "loss": 0.605, "step": 49030 }, { "epoch": 1.0915242165242165, "grad_norm": 0.5149260759353638, "learning_rate": 0.0002830089726100911, "loss": 0.4677, "step": 49040 }, { "epoch": 1.091746794871795, "grad_norm": 0.36808982491493225, "learning_rate": 0.00028296655340197214, "loss": 0.559, "step": 49050 }, { "epoch": 1.0919693732193732, "grad_norm": 0.4984835386276245, "learning_rate": 0.00028292412968538287, "loss": 0.6101, "step": 49060 }, { "epoch": 1.0921919515669516, "grad_norm": 0.5984585881233215, "learning_rate": 0.0002828817014626284, "loss": 0.5243, "step": 49070 }, { "epoch": 1.0924145299145298, "grad_norm": 0.469533771276474, "learning_rate": 0.0002828392687360144, "loss": 0.5437, "step": 49080 }, { "epoch": 1.0926371082621082, "grad_norm": 0.5558397173881531, "learning_rate": 0.00028279683150784677, "loss": 0.5478, "step": 49090 }, { "epoch": 1.0928596866096867, "grad_norm": 0.6726979613304138, "learning_rate": 0.00028275438978043153, "loss": 0.4915, "step": 49100 }, { "epoch": 1.093082264957265, "grad_norm": 0.8622997403144836, "learning_rate": 0.000282711943556075, "loss": 0.5943, "step": 49110 }, { "epoch": 1.0933048433048433, "grad_norm": 0.43874213099479675, "learning_rate": 0.00028266949283708374, "loss": 0.5687, "step": 49120 }, { "epoch": 1.0935274216524216, "grad_norm": 0.6451436281204224, "learning_rate": 0.0002826270376257646, "loss": 0.6372, "step": 49130 }, { "epoch": 1.09375, "grad_norm": 0.7769515514373779, "learning_rate": 0.0002825845779244246, "loss": 0.6425, "step": 49140 }, { "epoch": 1.0939725783475784, "grad_norm": 0.4474250078201294, "learning_rate": 0.000282542113735371, "loss": 0.6699, "step": 49150 }, { "epoch": 1.0941951566951567, "grad_norm": 0.9048413634300232, "learning_rate": 0.00028249964506091134, "loss": 0.7362, "step": 49160 }, { "epoch": 1.094417735042735, "grad_norm": 0.9284833669662476, "learning_rate": 0.0002824571719033535, "loss": 0.5964, "step": 49170 }, { "epoch": 1.0946403133903133, "grad_norm": 0.45629414916038513, "learning_rate": 0.00028241469426500533, "loss": 0.502, "step": 49180 }, { "epoch": 1.0948628917378918, "grad_norm": 0.6365179419517517, "learning_rate": 0.00028237221214817525, "loss": 0.5897, "step": 49190 }, { "epoch": 1.0950854700854702, "grad_norm": 0.7972654104232788, "learning_rate": 0.00028232972555517177, "loss": 0.5398, "step": 49200 }, { "epoch": 1.0953080484330484, "grad_norm": 0.6874790787696838, "learning_rate": 0.0002822872344883036, "loss": 0.6816, "step": 49210 }, { "epoch": 1.0955306267806268, "grad_norm": 0.6386187672615051, "learning_rate": 0.0002822447389498797, "loss": 0.5956, "step": 49220 }, { "epoch": 1.095753205128205, "grad_norm": 0.6826030611991882, "learning_rate": 0.00028220223894220934, "loss": 0.5611, "step": 49230 }, { "epoch": 1.0959757834757835, "grad_norm": 0.8523663282394409, "learning_rate": 0.0002821597344676021, "loss": 0.5323, "step": 49240 }, { "epoch": 1.0961983618233617, "grad_norm": 0.8153828382492065, "learning_rate": 0.0002821172255283676, "loss": 0.6641, "step": 49250 }, { "epoch": 1.0964209401709402, "grad_norm": 0.638305127620697, "learning_rate": 0.00028207471212681585, "loss": 0.6519, "step": 49260 }, { "epoch": 1.0966435185185186, "grad_norm": 0.4358106553554535, "learning_rate": 0.000282032194265257, "loss": 0.5503, "step": 49270 }, { "epoch": 1.0968660968660968, "grad_norm": 0.6441114544868469, "learning_rate": 0.0002819896719460016, "loss": 0.4837, "step": 49280 }, { "epoch": 1.0970886752136753, "grad_norm": 0.43398475646972656, "learning_rate": 0.00028194714517136034, "loss": 0.4673, "step": 49290 }, { "epoch": 1.0973112535612535, "grad_norm": 0.6699478626251221, "learning_rate": 0.00028190461394364405, "loss": 0.611, "step": 49300 }, { "epoch": 1.097533831908832, "grad_norm": 0.519451379776001, "learning_rate": 0.00028186207826516406, "loss": 0.5189, "step": 49310 }, { "epoch": 1.0977564102564104, "grad_norm": 0.5254443883895874, "learning_rate": 0.0002818195381382316, "loss": 0.5701, "step": 49320 }, { "epoch": 1.0979789886039886, "grad_norm": 0.6886885762214661, "learning_rate": 0.0002817769935651586, "loss": 0.4703, "step": 49330 }, { "epoch": 1.098201566951567, "grad_norm": 0.7456334233283997, "learning_rate": 0.0002817344445482568, "loss": 0.6408, "step": 49340 }, { "epoch": 1.0984241452991452, "grad_norm": 0.7715759873390198, "learning_rate": 0.00028169189108983835, "loss": 0.53, "step": 49350 }, { "epoch": 1.0986467236467237, "grad_norm": 0.647322952747345, "learning_rate": 0.0002816493331922156, "loss": 0.5447, "step": 49360 }, { "epoch": 1.098869301994302, "grad_norm": 0.6035053730010986, "learning_rate": 0.0002816067708577013, "loss": 0.5208, "step": 49370 }, { "epoch": 1.0990918803418803, "grad_norm": 0.6071327328681946, "learning_rate": 0.0002815642040886083, "loss": 0.5828, "step": 49380 }, { "epoch": 1.0993144586894588, "grad_norm": 1.0424257516860962, "learning_rate": 0.0002815216328872496, "loss": 0.5788, "step": 49390 }, { "epoch": 1.099537037037037, "grad_norm": 0.7601616978645325, "learning_rate": 0.0002814790572559387, "loss": 0.5718, "step": 49400 }, { "epoch": 1.0997596153846154, "grad_norm": 0.7542137503623962, "learning_rate": 0.0002814364771969891, "loss": 0.5238, "step": 49410 }, { "epoch": 1.0999821937321936, "grad_norm": 0.7142971754074097, "learning_rate": 0.0002813938927127147, "loss": 0.5532, "step": 49420 }, { "epoch": 1.100204772079772, "grad_norm": 0.8098645806312561, "learning_rate": 0.00028135130380542943, "loss": 0.5718, "step": 49430 }, { "epoch": 1.1004273504273505, "grad_norm": 0.7008213400840759, "learning_rate": 0.0002813087104774478, "loss": 0.6006, "step": 49440 }, { "epoch": 1.1006499287749287, "grad_norm": 0.7402480244636536, "learning_rate": 0.0002812661127310843, "loss": 0.5456, "step": 49450 }, { "epoch": 1.1008725071225072, "grad_norm": 0.36383122205734253, "learning_rate": 0.0002812235105686537, "loss": 0.5996, "step": 49460 }, { "epoch": 1.1010950854700854, "grad_norm": 0.5641146302223206, "learning_rate": 0.00028118090399247096, "loss": 0.5792, "step": 49470 }, { "epoch": 1.1013176638176638, "grad_norm": 0.8477437496185303, "learning_rate": 0.0002811382930048515, "loss": 0.5853, "step": 49480 }, { "epoch": 1.1015402421652423, "grad_norm": 0.6013820171356201, "learning_rate": 0.0002810956776081108, "loss": 0.5816, "step": 49490 }, { "epoch": 1.1017628205128205, "grad_norm": 1.032570242881775, "learning_rate": 0.00028105305780456454, "loss": 0.5702, "step": 49500 }, { "epoch": 1.101985398860399, "grad_norm": 0.7165135145187378, "learning_rate": 0.00028101043359652874, "loss": 0.6409, "step": 49510 }, { "epoch": 1.1022079772079771, "grad_norm": 0.7688490152359009, "learning_rate": 0.0002809678049863197, "loss": 0.589, "step": 49520 }, { "epoch": 1.1024305555555556, "grad_norm": 0.5828442573547363, "learning_rate": 0.00028092517197625394, "loss": 0.4621, "step": 49530 }, { "epoch": 1.102653133903134, "grad_norm": 0.4801251292228699, "learning_rate": 0.00028088253456864796, "loss": 0.6135, "step": 49540 }, { "epoch": 1.1028757122507122, "grad_norm": 0.5696801543235779, "learning_rate": 0.00028083989276581886, "loss": 0.6186, "step": 49550 }, { "epoch": 1.1030982905982907, "grad_norm": 0.6546761989593506, "learning_rate": 0.00028079724657008385, "loss": 0.592, "step": 49560 }, { "epoch": 1.103320868945869, "grad_norm": 0.6570590734481812, "learning_rate": 0.0002807545959837603, "loss": 0.6159, "step": 49570 }, { "epoch": 1.1035434472934473, "grad_norm": 0.5510005354881287, "learning_rate": 0.0002807119410091659, "loss": 0.4846, "step": 49580 }, { "epoch": 1.1037660256410255, "grad_norm": 0.841663122177124, "learning_rate": 0.00028066928164861854, "loss": 0.618, "step": 49590 }, { "epoch": 1.103988603988604, "grad_norm": 0.5082978010177612, "learning_rate": 0.0002806266179044364, "loss": 0.6095, "step": 49600 }, { "epoch": 1.1042111823361824, "grad_norm": 0.6725553274154663, "learning_rate": 0.0002805839497789378, "loss": 0.5768, "step": 49610 }, { "epoch": 1.1044337606837606, "grad_norm": 0.5792170166969299, "learning_rate": 0.0002805412772744414, "loss": 0.6151, "step": 49620 }, { "epoch": 1.104656339031339, "grad_norm": 0.6467113494873047, "learning_rate": 0.0002804986003932661, "loss": 0.5936, "step": 49630 }, { "epoch": 1.1048789173789173, "grad_norm": 0.5447518229484558, "learning_rate": 0.0002804559191377309, "loss": 0.4788, "step": 49640 }, { "epoch": 1.1051014957264957, "grad_norm": 0.6909335851669312, "learning_rate": 0.00028041323351015525, "loss": 0.5433, "step": 49650 }, { "epoch": 1.105324074074074, "grad_norm": 0.6738542318344116, "learning_rate": 0.00028037054351285864, "loss": 0.4256, "step": 49660 }, { "epoch": 1.1055466524216524, "grad_norm": 0.4187549352645874, "learning_rate": 0.0002803278491481609, "loss": 0.5291, "step": 49670 }, { "epoch": 1.1057692307692308, "grad_norm": 0.5423128604888916, "learning_rate": 0.00028028515041838203, "loss": 0.5666, "step": 49680 }, { "epoch": 1.105991809116809, "grad_norm": 0.41569629311561584, "learning_rate": 0.00028024244732584243, "loss": 0.625, "step": 49690 }, { "epoch": 1.1062143874643875, "grad_norm": 3.5168192386627197, "learning_rate": 0.00028019973987286254, "loss": 0.8, "step": 49700 }, { "epoch": 1.1064369658119657, "grad_norm": 0.8853825926780701, "learning_rate": 0.00028015702806176315, "loss": 0.4617, "step": 49710 }, { "epoch": 1.1066595441595442, "grad_norm": 0.7445874214172363, "learning_rate": 0.00028011431189486517, "loss": 0.5553, "step": 49720 }, { "epoch": 1.1068821225071226, "grad_norm": 1.18168044090271, "learning_rate": 0.00028007159137448997, "loss": 0.582, "step": 49730 }, { "epoch": 1.1071047008547008, "grad_norm": 0.5334780216217041, "learning_rate": 0.0002800288665029589, "loss": 0.5465, "step": 49740 }, { "epoch": 1.1073272792022792, "grad_norm": 0.9836109280586243, "learning_rate": 0.00027998613728259374, "loss": 0.6869, "step": 49750 }, { "epoch": 1.1075498575498575, "grad_norm": 0.9165136218070984, "learning_rate": 0.00027994340371571635, "loss": 0.6017, "step": 49760 }, { "epoch": 1.107772435897436, "grad_norm": 0.8517610430717468, "learning_rate": 0.00027990066580464896, "loss": 0.5321, "step": 49770 }, { "epoch": 1.1079950142450143, "grad_norm": 0.6246629953384399, "learning_rate": 0.00027985792355171406, "loss": 0.5605, "step": 49780 }, { "epoch": 1.1082175925925926, "grad_norm": 0.9364470839500427, "learning_rate": 0.0002798151769592342, "loss": 0.6157, "step": 49790 }, { "epoch": 1.108440170940171, "grad_norm": 0.7243967652320862, "learning_rate": 0.0002797724260295322, "loss": 0.6616, "step": 49800 }, { "epoch": 1.1086627492877492, "grad_norm": 0.9982474446296692, "learning_rate": 0.00027972967076493133, "loss": 0.5218, "step": 49810 }, { "epoch": 1.1088853276353277, "grad_norm": 0.7943207621574402, "learning_rate": 0.0002796869111677548, "loss": 0.6087, "step": 49820 }, { "epoch": 1.1091079059829059, "grad_norm": 1.0676311254501343, "learning_rate": 0.0002796441472403264, "loss": 0.6007, "step": 49830 }, { "epoch": 1.1093304843304843, "grad_norm": 0.6319277882575989, "learning_rate": 0.0002796013789849698, "loss": 0.6302, "step": 49840 }, { "epoch": 1.1095530626780628, "grad_norm": 0.6356170773506165, "learning_rate": 0.0002795586064040091, "loss": 0.583, "step": 49850 }, { "epoch": 1.109775641025641, "grad_norm": 0.652543306350708, "learning_rate": 0.00027951582949976855, "loss": 0.5332, "step": 49860 }, { "epoch": 1.1099982193732194, "grad_norm": 1.1238377094268799, "learning_rate": 0.00027947304827457273, "loss": 0.6134, "step": 49870 }, { "epoch": 1.1102207977207976, "grad_norm": 0.459655225276947, "learning_rate": 0.0002794302627307465, "loss": 0.497, "step": 49880 }, { "epoch": 1.110443376068376, "grad_norm": 0.7160477042198181, "learning_rate": 0.0002793874728706147, "loss": 0.5633, "step": 49890 }, { "epoch": 1.1106659544159545, "grad_norm": 0.5809594988822937, "learning_rate": 0.0002793446786965026, "loss": 0.5299, "step": 49900 }, { "epoch": 1.1108885327635327, "grad_norm": 0.36635035276412964, "learning_rate": 0.0002793018802107358, "loss": 0.4768, "step": 49910 }, { "epoch": 1.1111111111111112, "grad_norm": 0.7522535920143127, "learning_rate": 0.0002792590774156399, "loss": 0.5788, "step": 49920 }, { "epoch": 1.1113336894586894, "grad_norm": 0.687225878238678, "learning_rate": 0.0002792162703135408, "loss": 0.6284, "step": 49930 }, { "epoch": 1.1115562678062678, "grad_norm": 0.766548216342926, "learning_rate": 0.0002791734589067647, "loss": 0.5067, "step": 49940 }, { "epoch": 1.1117788461538463, "grad_norm": 0.6063271164894104, "learning_rate": 0.00027913064319763805, "loss": 0.6553, "step": 49950 }, { "epoch": 1.1120014245014245, "grad_norm": 0.5717689990997314, "learning_rate": 0.0002790878231884875, "loss": 0.4677, "step": 49960 }, { "epoch": 1.112224002849003, "grad_norm": 0.8794634342193604, "learning_rate": 0.00027904499888163983, "loss": 0.5599, "step": 49970 }, { "epoch": 1.1124465811965811, "grad_norm": 0.7475899457931519, "learning_rate": 0.0002790021702794223, "loss": 0.5733, "step": 49980 }, { "epoch": 1.1126691595441596, "grad_norm": 0.4899514615535736, "learning_rate": 0.0002789593373841621, "loss": 0.4454, "step": 49990 }, { "epoch": 1.1128917378917378, "grad_norm": 0.5497984290122986, "learning_rate": 0.0002789165001981869, "loss": 0.5753, "step": 50000 }, { "epoch": 1.1131143162393162, "grad_norm": 0.5439674854278564, "learning_rate": 0.00027887365872382447, "loss": 0.5884, "step": 50010 }, { "epoch": 1.1133368945868947, "grad_norm": 0.4920559227466583, "learning_rate": 0.00027883081296340285, "loss": 0.4844, "step": 50020 }, { "epoch": 1.1135594729344729, "grad_norm": 0.6336165070533752, "learning_rate": 0.0002787879629192503, "loss": 0.5899, "step": 50030 }, { "epoch": 1.1137820512820513, "grad_norm": 0.42336025834083557, "learning_rate": 0.00027874510859369534, "loss": 0.4811, "step": 50040 }, { "epoch": 1.1140046296296295, "grad_norm": 0.649666965007782, "learning_rate": 0.0002787022499890668, "loss": 0.5941, "step": 50050 }, { "epoch": 1.114227207977208, "grad_norm": 0.6693488955497742, "learning_rate": 0.0002786593871076935, "loss": 0.5914, "step": 50060 }, { "epoch": 1.1144497863247864, "grad_norm": 0.8287727236747742, "learning_rate": 0.0002786165199519047, "loss": 0.5427, "step": 50070 }, { "epoch": 1.1146723646723646, "grad_norm": 0.9317525029182434, "learning_rate": 0.0002785736485240299, "loss": 0.4981, "step": 50080 }, { "epoch": 1.114894943019943, "grad_norm": 0.4562414884567261, "learning_rate": 0.00027853077282639867, "loss": 0.536, "step": 50090 }, { "epoch": 1.1151175213675213, "grad_norm": 0.7502874135971069, "learning_rate": 0.0002784878928613409, "loss": 0.6141, "step": 50100 }, { "epoch": 1.1153400997150997, "grad_norm": 0.6522567868232727, "learning_rate": 0.00027844500863118685, "loss": 0.528, "step": 50110 }, { "epoch": 1.1155626780626782, "grad_norm": 0.6541271209716797, "learning_rate": 0.0002784021201382669, "loss": 0.4591, "step": 50120 }, { "epoch": 1.1157852564102564, "grad_norm": 0.48647746443748474, "learning_rate": 0.0002783592273849114, "loss": 0.5641, "step": 50130 }, { "epoch": 1.1160078347578348, "grad_norm": 0.5013317465782166, "learning_rate": 0.00027831633037345144, "loss": 0.5116, "step": 50140 }, { "epoch": 1.116230413105413, "grad_norm": 0.8025067448616028, "learning_rate": 0.0002782734291062179, "loss": 0.6091, "step": 50150 }, { "epoch": 1.1164529914529915, "grad_norm": 0.8498494625091553, "learning_rate": 0.0002782305235855422, "loss": 0.5829, "step": 50160 }, { "epoch": 1.1166755698005697, "grad_norm": 0.6301447749137878, "learning_rate": 0.00027818761381375573, "loss": 0.5449, "step": 50170 }, { "epoch": 1.1168981481481481, "grad_norm": 0.3991430103778839, "learning_rate": 0.00027814469979319043, "loss": 0.6215, "step": 50180 }, { "epoch": 1.1171207264957266, "grad_norm": 0.8412891626358032, "learning_rate": 0.00027810178152617814, "loss": 0.4809, "step": 50190 }, { "epoch": 1.1173433048433048, "grad_norm": 0.7133082747459412, "learning_rate": 0.00027805885901505107, "loss": 0.4103, "step": 50200 }, { "epoch": 1.1175658831908832, "grad_norm": 0.8361467719078064, "learning_rate": 0.0002780159322621417, "loss": 0.5977, "step": 50210 }, { "epoch": 1.1177884615384615, "grad_norm": 0.7250277996063232, "learning_rate": 0.0002779730012697827, "loss": 0.6668, "step": 50220 }, { "epoch": 1.11801103988604, "grad_norm": 0.5908347368240356, "learning_rate": 0.00027793006604030703, "loss": 0.5778, "step": 50230 }, { "epoch": 1.1182336182336183, "grad_norm": 0.8042272925376892, "learning_rate": 0.0002778871265760477, "loss": 0.6422, "step": 50240 }, { "epoch": 1.1184561965811965, "grad_norm": 0.867863655090332, "learning_rate": 0.0002778441828793382, "loss": 0.6648, "step": 50250 }, { "epoch": 1.118678774928775, "grad_norm": 0.3790102005004883, "learning_rate": 0.0002778012349525121, "loss": 0.6024, "step": 50260 }, { "epoch": 1.1189013532763532, "grad_norm": 0.6354236006736755, "learning_rate": 0.00027775828279790317, "loss": 0.5493, "step": 50270 }, { "epoch": 1.1191239316239316, "grad_norm": 0.9611608386039734, "learning_rate": 0.00027771532641784544, "loss": 0.7362, "step": 50280 }, { "epoch": 1.11934650997151, "grad_norm": 0.5919707417488098, "learning_rate": 0.00027767236581467333, "loss": 0.5445, "step": 50290 }, { "epoch": 1.1195690883190883, "grad_norm": 0.6382510662078857, "learning_rate": 0.00027762940099072125, "loss": 0.6089, "step": 50300 }, { "epoch": 1.1197916666666667, "grad_norm": 0.6243929862976074, "learning_rate": 0.0002775864319483239, "loss": 0.6145, "step": 50310 }, { "epoch": 1.120014245014245, "grad_norm": 0.5520569086074829, "learning_rate": 0.0002775434586898165, "loss": 0.4577, "step": 50320 }, { "epoch": 1.1202368233618234, "grad_norm": 0.5697688460350037, "learning_rate": 0.00027750048121753394, "loss": 0.4462, "step": 50330 }, { "epoch": 1.1204594017094016, "grad_norm": 0.7084352374076843, "learning_rate": 0.0002774574995338118, "loss": 0.5839, "step": 50340 }, { "epoch": 1.12068198005698, "grad_norm": 0.45827311277389526, "learning_rate": 0.0002774145136409858, "loss": 0.6047, "step": 50350 }, { "epoch": 1.1209045584045585, "grad_norm": 0.7183460593223572, "learning_rate": 0.00027737152354139166, "loss": 0.5396, "step": 50360 }, { "epoch": 1.1211271367521367, "grad_norm": 0.6099705696105957, "learning_rate": 0.0002773285292373656, "loss": 0.6081, "step": 50370 }, { "epoch": 1.1213497150997151, "grad_norm": 0.8301957845687866, "learning_rate": 0.00027728553073124405, "loss": 0.5672, "step": 50380 }, { "epoch": 1.1215722934472934, "grad_norm": 0.6909571886062622, "learning_rate": 0.00027724252802536337, "loss": 0.4832, "step": 50390 }, { "epoch": 1.1217948717948718, "grad_norm": 0.5145627856254578, "learning_rate": 0.00027719952112206054, "loss": 0.5603, "step": 50400 }, { "epoch": 1.1220174501424502, "grad_norm": 0.7964401841163635, "learning_rate": 0.0002771565100236726, "loss": 0.6, "step": 50410 }, { "epoch": 1.1222400284900285, "grad_norm": 0.4715334475040436, "learning_rate": 0.00027711349473253657, "loss": 0.4057, "step": 50420 }, { "epoch": 1.122462606837607, "grad_norm": 0.6473780870437622, "learning_rate": 0.0002770704752509903, "loss": 0.5549, "step": 50430 }, { "epoch": 1.1226851851851851, "grad_norm": 0.5076059699058533, "learning_rate": 0.00027702745158137115, "loss": 0.6064, "step": 50440 }, { "epoch": 1.1229077635327636, "grad_norm": 0.8316875100135803, "learning_rate": 0.00027698442372601736, "loss": 0.5, "step": 50450 }, { "epoch": 1.123130341880342, "grad_norm": 0.718682587146759, "learning_rate": 0.0002769413916872669, "loss": 0.6023, "step": 50460 }, { "epoch": 1.1233529202279202, "grad_norm": 0.7767266631126404, "learning_rate": 0.00027689835546745823, "loss": 0.5666, "step": 50470 }, { "epoch": 1.1235754985754987, "grad_norm": 0.5583837628364563, "learning_rate": 0.00027685531506892993, "loss": 0.57, "step": 50480 }, { "epoch": 1.1237980769230769, "grad_norm": 0.6625356674194336, "learning_rate": 0.00027681227049402093, "loss": 0.4804, "step": 50490 }, { "epoch": 1.1240206552706553, "grad_norm": 0.42987099289894104, "learning_rate": 0.0002767692217450703, "loss": 0.4838, "step": 50500 }, { "epoch": 1.1242432336182335, "grad_norm": 0.4361467957496643, "learning_rate": 0.00027672616882441726, "loss": 0.6805, "step": 50510 }, { "epoch": 1.124465811965812, "grad_norm": 0.7139217257499695, "learning_rate": 0.00027668311173440147, "loss": 0.5663, "step": 50520 }, { "epoch": 1.1246883903133904, "grad_norm": 0.7087649703025818, "learning_rate": 0.0002766400504773625, "loss": 0.5915, "step": 50530 }, { "epoch": 1.1249109686609686, "grad_norm": 0.6644691824913025, "learning_rate": 0.00027659698505564056, "loss": 0.5914, "step": 50540 }, { "epoch": 1.125133547008547, "grad_norm": 0.6349228620529175, "learning_rate": 0.0002765539154715757, "loss": 0.4877, "step": 50550 }, { "epoch": 1.1253561253561253, "grad_norm": 0.710770308971405, "learning_rate": 0.0002765108417275084, "loss": 0.4708, "step": 50560 }, { "epoch": 1.1255787037037037, "grad_norm": 0.699897825717926, "learning_rate": 0.00027646776382577934, "loss": 0.7282, "step": 50570 }, { "epoch": 1.125801282051282, "grad_norm": 0.6822852492332458, "learning_rate": 0.0002764246817687294, "loss": 0.6721, "step": 50580 }, { "epoch": 1.1260238603988604, "grad_norm": 0.6469874978065491, "learning_rate": 0.00027638159555869966, "loss": 0.6107, "step": 50590 }, { "epoch": 1.1262464387464388, "grad_norm": 0.5671929121017456, "learning_rate": 0.00027633850519803146, "loss": 0.5588, "step": 50600 }, { "epoch": 1.126469017094017, "grad_norm": 0.6890249848365784, "learning_rate": 0.00027629541068906644, "loss": 0.5867, "step": 50610 }, { "epoch": 1.1266915954415955, "grad_norm": 0.6389486789703369, "learning_rate": 0.0002762523120341463, "loss": 0.5792, "step": 50620 }, { "epoch": 1.126914173789174, "grad_norm": 1.1844886541366577, "learning_rate": 0.0002762092092356131, "loss": 0.6745, "step": 50630 }, { "epoch": 1.1271367521367521, "grad_norm": 0.6294887065887451, "learning_rate": 0.0002761661022958092, "loss": 0.6385, "step": 50640 }, { "epoch": 1.1273593304843306, "grad_norm": 0.6752757430076599, "learning_rate": 0.00027612299121707685, "loss": 0.5475, "step": 50650 }, { "epoch": 1.1275819088319088, "grad_norm": 1.0090526342391968, "learning_rate": 0.0002760798760017589, "loss": 0.5784, "step": 50660 }, { "epoch": 1.1278044871794872, "grad_norm": 0.7660955786705017, "learning_rate": 0.0002760367566521981, "loss": 0.5528, "step": 50670 }, { "epoch": 1.1280270655270654, "grad_norm": 0.6590836644172668, "learning_rate": 0.0002759936331707378, "loss": 0.692, "step": 50680 }, { "epoch": 1.1282496438746439, "grad_norm": 0.8564108610153198, "learning_rate": 0.0002759505055597212, "loss": 0.5678, "step": 50690 }, { "epoch": 1.1284722222222223, "grad_norm": 0.587363600730896, "learning_rate": 0.000275907373821492, "loss": 0.5068, "step": 50700 }, { "epoch": 1.1286948005698005, "grad_norm": 0.3763004541397095, "learning_rate": 0.00027586423795839394, "loss": 0.5296, "step": 50710 }, { "epoch": 1.128917378917379, "grad_norm": 0.8793259859085083, "learning_rate": 0.0002758210979727711, "loss": 0.6651, "step": 50720 }, { "epoch": 1.1291399572649572, "grad_norm": 0.8572914600372314, "learning_rate": 0.0002757779538669677, "loss": 0.5349, "step": 50730 }, { "epoch": 1.1293625356125356, "grad_norm": 0.5334573984146118, "learning_rate": 0.0002757348056433282, "loss": 0.4848, "step": 50740 }, { "epoch": 1.1295851139601139, "grad_norm": 0.5879189968109131, "learning_rate": 0.0002756916533041975, "loss": 0.5391, "step": 50750 }, { "epoch": 1.1298076923076923, "grad_norm": 0.7693625688552856, "learning_rate": 0.0002756484968519203, "loss": 0.6844, "step": 50760 }, { "epoch": 1.1300302706552707, "grad_norm": 0.9467472434043884, "learning_rate": 0.0002756053362888419, "loss": 0.655, "step": 50770 }, { "epoch": 1.130252849002849, "grad_norm": 0.684518575668335, "learning_rate": 0.00027556217161730763, "loss": 0.5942, "step": 50780 }, { "epoch": 1.1304754273504274, "grad_norm": 1.1190756559371948, "learning_rate": 0.0002755190028396631, "loss": 0.5974, "step": 50790 }, { "epoch": 1.1306980056980056, "grad_norm": 0.7289356589317322, "learning_rate": 0.00027547582995825405, "loss": 0.7014, "step": 50800 }, { "epoch": 1.130920584045584, "grad_norm": 0.8716086149215698, "learning_rate": 0.00027543265297542665, "loss": 0.6442, "step": 50810 }, { "epoch": 1.1311431623931625, "grad_norm": 0.5718209743499756, "learning_rate": 0.0002753894718935272, "loss": 0.5779, "step": 50820 }, { "epoch": 1.1313657407407407, "grad_norm": 0.6775786876678467, "learning_rate": 0.0002753462867149021, "loss": 0.6441, "step": 50830 }, { "epoch": 1.1315883190883191, "grad_norm": 0.35854214429855347, "learning_rate": 0.00027530309744189805, "loss": 0.5515, "step": 50840 }, { "epoch": 1.1318108974358974, "grad_norm": 0.3995623290538788, "learning_rate": 0.00027525990407686207, "loss": 0.5819, "step": 50850 }, { "epoch": 1.1320334757834758, "grad_norm": 0.6870381832122803, "learning_rate": 0.0002752167066221413, "loss": 0.6606, "step": 50860 }, { "epoch": 1.1322560541310542, "grad_norm": 0.9938977360725403, "learning_rate": 0.00027517350508008315, "loss": 0.6918, "step": 50870 }, { "epoch": 1.1324786324786325, "grad_norm": 0.5590777397155762, "learning_rate": 0.0002751302994530351, "loss": 0.6583, "step": 50880 }, { "epoch": 1.132701210826211, "grad_norm": 0.6947663426399231, "learning_rate": 0.0002750870897433451, "loss": 0.4942, "step": 50890 }, { "epoch": 1.132923789173789, "grad_norm": 0.45011070370674133, "learning_rate": 0.0002750438759533612, "loss": 0.4522, "step": 50900 }, { "epoch": 1.1331463675213675, "grad_norm": 2.3322110176086426, "learning_rate": 0.0002750006580854317, "loss": 0.4981, "step": 50910 }, { "epoch": 1.1333689458689458, "grad_norm": 0.560584545135498, "learning_rate": 0.00027495743614190497, "loss": 0.6019, "step": 50920 }, { "epoch": 1.1335915242165242, "grad_norm": 0.6565274596214294, "learning_rate": 0.0002749142101251299, "loss": 0.7166, "step": 50930 }, { "epoch": 1.1338141025641026, "grad_norm": 0.7385024428367615, "learning_rate": 0.00027487098003745514, "loss": 0.5216, "step": 50940 }, { "epoch": 1.1340366809116809, "grad_norm": 0.5504783391952515, "learning_rate": 0.00027482774588123016, "loss": 0.5478, "step": 50950 }, { "epoch": 1.1342592592592593, "grad_norm": 0.6359137892723083, "learning_rate": 0.00027478450765880424, "loss": 0.6607, "step": 50960 }, { "epoch": 1.1344818376068375, "grad_norm": 0.6465204954147339, "learning_rate": 0.0002747412653725269, "loss": 0.6833, "step": 50970 }, { "epoch": 1.134704415954416, "grad_norm": 0.6576072573661804, "learning_rate": 0.000274698019024748, "loss": 0.5211, "step": 50980 }, { "epoch": 1.1349269943019944, "grad_norm": 0.6555946469306946, "learning_rate": 0.0002746547686178176, "loss": 0.5938, "step": 50990 }, { "epoch": 1.1351495726495726, "grad_norm": 0.5931433439254761, "learning_rate": 0.00027461151415408597, "loss": 0.4851, "step": 51000 }, { "epoch": 1.135372150997151, "grad_norm": 0.48684531450271606, "learning_rate": 0.00027456825563590355, "loss": 0.4995, "step": 51010 }, { "epoch": 1.1355947293447293, "grad_norm": 0.813859760761261, "learning_rate": 0.00027452499306562106, "loss": 0.5633, "step": 51020 }, { "epoch": 1.1358173076923077, "grad_norm": 0.42580199241638184, "learning_rate": 0.00027448172644558953, "loss": 0.483, "step": 51030 }, { "epoch": 1.1360398860398861, "grad_norm": 0.6813814043998718, "learning_rate": 0.00027443845577816, "loss": 0.4505, "step": 51040 }, { "epoch": 1.1362624643874644, "grad_norm": 1.0832009315490723, "learning_rate": 0.0002743951810656838, "loss": 0.5776, "step": 51050 }, { "epoch": 1.1364850427350428, "grad_norm": 0.6177014112472534, "learning_rate": 0.0002743519023105125, "loss": 0.6503, "step": 51060 }, { "epoch": 1.136707621082621, "grad_norm": 0.8180685043334961, "learning_rate": 0.0002743086195149981, "loss": 0.6229, "step": 51070 }, { "epoch": 1.1369301994301995, "grad_norm": 0.8289804458618164, "learning_rate": 0.00027426533268149237, "loss": 0.6538, "step": 51080 }, { "epoch": 1.1371527777777777, "grad_norm": 0.7466900944709778, "learning_rate": 0.00027422204181234766, "loss": 0.6753, "step": 51090 }, { "epoch": 1.1373753561253561, "grad_norm": 0.7393838763237, "learning_rate": 0.00027417874690991654, "loss": 0.5579, "step": 51100 }, { "epoch": 1.1375979344729346, "grad_norm": 0.4931185841560364, "learning_rate": 0.00027413544797655153, "loss": 0.5762, "step": 51110 }, { "epoch": 1.1378205128205128, "grad_norm": 0.6497383117675781, "learning_rate": 0.0002740921450146056, "loss": 0.3733, "step": 51120 }, { "epoch": 1.1380430911680912, "grad_norm": 0.9239633679389954, "learning_rate": 0.0002740488380264318, "loss": 0.5591, "step": 51130 }, { "epoch": 1.1382656695156694, "grad_norm": 0.6807590126991272, "learning_rate": 0.0002740055270143836, "loss": 0.5626, "step": 51140 }, { "epoch": 1.1384882478632479, "grad_norm": 0.8736845254898071, "learning_rate": 0.0002739622119808144, "loss": 0.6351, "step": 51150 }, { "epoch": 1.138710826210826, "grad_norm": 0.49122872948646545, "learning_rate": 0.0002739188929280781, "loss": 0.5252, "step": 51160 }, { "epoch": 1.1389334045584045, "grad_norm": 0.5275121331214905, "learning_rate": 0.00027387556985852867, "loss": 0.5161, "step": 51170 }, { "epoch": 1.139155982905983, "grad_norm": 0.8667572140693665, "learning_rate": 0.00027383224277452027, "loss": 0.617, "step": 51180 }, { "epoch": 1.1393785612535612, "grad_norm": 0.885334849357605, "learning_rate": 0.0002737889116784073, "loss": 0.5347, "step": 51190 }, { "epoch": 1.1396011396011396, "grad_norm": 0.5012974739074707, "learning_rate": 0.0002737455765725445, "loss": 0.6395, "step": 51200 }, { "epoch": 1.139823717948718, "grad_norm": 1.1351536512374878, "learning_rate": 0.00027370223745928673, "loss": 0.5427, "step": 51210 }, { "epoch": 1.1400462962962963, "grad_norm": 0.4724816381931305, "learning_rate": 0.000273658894340989, "loss": 0.5503, "step": 51220 }, { "epoch": 1.1401353276353277, "eval_loss": 0.5929179787635803, "eval_runtime": 337.1971, "eval_samples_per_second": 7.014, "eval_steps_per_second": 7.014, "step": 51224 }, { "epoch": 1.1402688746438747, "grad_norm": 0.9337814450263977, "learning_rate": 0.0002736155472200067, "loss": 0.7102, "step": 51230 }, { "epoch": 1.140491452991453, "grad_norm": 0.6762569546699524, "learning_rate": 0.0002735721960986953, "loss": 0.5636, "step": 51240 }, { "epoch": 1.1407140313390314, "grad_norm": 0.6992948651313782, "learning_rate": 0.0002735288409794105, "loss": 0.623, "step": 51250 }, { "epoch": 1.1409366096866096, "grad_norm": 0.6342442631721497, "learning_rate": 0.00027348548186450827, "loss": 0.5408, "step": 51260 }, { "epoch": 1.141159188034188, "grad_norm": 0.5864853262901306, "learning_rate": 0.0002734421187563448, "loss": 0.5168, "step": 51270 }, { "epoch": 1.1413817663817665, "grad_norm": 0.5342767834663391, "learning_rate": 0.00027339875165727657, "loss": 0.6507, "step": 51280 }, { "epoch": 1.1416043447293447, "grad_norm": 0.7018928527832031, "learning_rate": 0.00027335538056966, "loss": 0.7081, "step": 51290 }, { "epoch": 1.1418269230769231, "grad_norm": 0.7439033389091492, "learning_rate": 0.000273312005495852, "loss": 0.5094, "step": 51300 }, { "epoch": 1.1420495014245013, "grad_norm": 0.7723569273948669, "learning_rate": 0.00027326862643820964, "loss": 0.5982, "step": 51310 }, { "epoch": 1.1422720797720798, "grad_norm": 0.4509701132774353, "learning_rate": 0.00027322524339909015, "loss": 0.4447, "step": 51320 }, { "epoch": 1.142494658119658, "grad_norm": 0.5816810131072998, "learning_rate": 0.00027318185638085094, "loss": 0.5588, "step": 51330 }, { "epoch": 1.1427172364672364, "grad_norm": 0.7561889290809631, "learning_rate": 0.00027313846538584973, "loss": 0.6732, "step": 51340 }, { "epoch": 1.1429398148148149, "grad_norm": 0.8186838030815125, "learning_rate": 0.0002730950704164445, "loss": 0.4714, "step": 51350 }, { "epoch": 1.143162393162393, "grad_norm": 1.4625139236450195, "learning_rate": 0.00027305167147499324, "loss": 0.576, "step": 51360 }, { "epoch": 1.1433849715099715, "grad_norm": 0.896793007850647, "learning_rate": 0.0002730082685638544, "loss": 0.5008, "step": 51370 }, { "epoch": 1.14360754985755, "grad_norm": 0.9344685077667236, "learning_rate": 0.00027296486168538645, "loss": 0.4954, "step": 51380 }, { "epoch": 1.1438301282051282, "grad_norm": 0.6186417937278748, "learning_rate": 0.00027292145084194827, "loss": 0.5856, "step": 51390 }, { "epoch": 1.1440527065527066, "grad_norm": 1.0679600238800049, "learning_rate": 0.00027287803603589866, "loss": 0.5139, "step": 51400 }, { "epoch": 1.1442752849002849, "grad_norm": 0.4196697771549225, "learning_rate": 0.0002728346172695969, "loss": 0.443, "step": 51410 }, { "epoch": 1.1444978632478633, "grad_norm": 0.5441080927848816, "learning_rate": 0.00027279119454540245, "loss": 0.4916, "step": 51420 }, { "epoch": 1.1447204415954415, "grad_norm": 0.919732928276062, "learning_rate": 0.0002727477678656749, "loss": 0.5201, "step": 51430 }, { "epoch": 1.14494301994302, "grad_norm": 0.7137250900268555, "learning_rate": 0.00027270433723277406, "loss": 0.6276, "step": 51440 }, { "epoch": 1.1451655982905984, "grad_norm": 0.535349428653717, "learning_rate": 0.0002726609026490601, "loss": 0.5404, "step": 51450 }, { "epoch": 1.1453881766381766, "grad_norm": 0.6664863228797913, "learning_rate": 0.00027261746411689315, "loss": 0.4525, "step": 51460 }, { "epoch": 1.145610754985755, "grad_norm": 0.5625571012496948, "learning_rate": 0.00027257402163863374, "loss": 0.5836, "step": 51470 }, { "epoch": 1.1458333333333333, "grad_norm": 0.6733038425445557, "learning_rate": 0.0002725305752166426, "loss": 0.564, "step": 51480 }, { "epoch": 1.1460559116809117, "grad_norm": 0.7813240885734558, "learning_rate": 0.0002724871248532806, "loss": 0.4939, "step": 51490 }, { "epoch": 1.14627849002849, "grad_norm": 0.7392660975456238, "learning_rate": 0.00027244367055090894, "loss": 0.7043, "step": 51500 }, { "epoch": 1.1465010683760684, "grad_norm": 0.4674152731895447, "learning_rate": 0.00027240021231188883, "loss": 0.5184, "step": 51510 }, { "epoch": 1.1467236467236468, "grad_norm": 0.656161367893219, "learning_rate": 0.000272356750138582, "loss": 0.6081, "step": 51520 }, { "epoch": 1.146946225071225, "grad_norm": 0.6295078992843628, "learning_rate": 0.0002723132840333501, "loss": 0.5184, "step": 51530 }, { "epoch": 1.1471688034188035, "grad_norm": 0.769020676612854, "learning_rate": 0.00027226981399855514, "loss": 0.6159, "step": 51540 }, { "epoch": 1.147391381766382, "grad_norm": 0.6695685386657715, "learning_rate": 0.0002722263400365594, "loss": 0.6638, "step": 51550 }, { "epoch": 1.14761396011396, "grad_norm": 0.61558997631073, "learning_rate": 0.00027218286214972514, "loss": 0.5738, "step": 51560 }, { "epoch": 1.1478365384615385, "grad_norm": 0.8256414532661438, "learning_rate": 0.0002721393803404151, "loss": 0.6167, "step": 51570 }, { "epoch": 1.1480591168091168, "grad_norm": 0.8803865909576416, "learning_rate": 0.00027209589461099203, "loss": 0.6634, "step": 51580 }, { "epoch": 1.1482816951566952, "grad_norm": 0.41930705308914185, "learning_rate": 0.0002720524049638191, "loss": 0.6836, "step": 51590 }, { "epoch": 1.1485042735042734, "grad_norm": 1.0584372282028198, "learning_rate": 0.0002720089114012594, "loss": 0.4381, "step": 51600 }, { "epoch": 1.1487268518518519, "grad_norm": 0.5026808381080627, "learning_rate": 0.0002719654139256765, "loss": 0.6022, "step": 51610 }, { "epoch": 1.1489494301994303, "grad_norm": 1.0847352743148804, "learning_rate": 0.00027192191253943415, "loss": 0.6017, "step": 51620 }, { "epoch": 1.1491720085470085, "grad_norm": 0.6440492272377014, "learning_rate": 0.0002718784072448963, "loss": 0.5381, "step": 51630 }, { "epoch": 1.149394586894587, "grad_norm": 0.7534992694854736, "learning_rate": 0.0002718348980444268, "loss": 0.603, "step": 51640 }, { "epoch": 1.1496171652421652, "grad_norm": 0.55170738697052, "learning_rate": 0.0002717913849403901, "loss": 0.7346, "step": 51650 }, { "epoch": 1.1498397435897436, "grad_norm": 0.5600918531417847, "learning_rate": 0.0002717478679351509, "loss": 0.6366, "step": 51660 }, { "epoch": 1.1500623219373218, "grad_norm": 0.6963495016098022, "learning_rate": 0.0002717043470310738, "loss": 0.586, "step": 51670 }, { "epoch": 1.1502849002849003, "grad_norm": 0.6831533908843994, "learning_rate": 0.00027166082223052375, "loss": 0.6305, "step": 51680 }, { "epoch": 1.1505074786324787, "grad_norm": 0.7113856673240662, "learning_rate": 0.00027161729353586595, "loss": 0.5272, "step": 51690 }, { "epoch": 1.150730056980057, "grad_norm": 0.7831088900566101, "learning_rate": 0.0002715737609494658, "loss": 0.6085, "step": 51700 }, { "epoch": 1.1509526353276354, "grad_norm": 0.401477575302124, "learning_rate": 0.0002715302244736889, "loss": 0.5864, "step": 51710 }, { "epoch": 1.1511752136752136, "grad_norm": 0.814154863357544, "learning_rate": 0.00027148668411090105, "loss": 0.5873, "step": 51720 }, { "epoch": 1.151397792022792, "grad_norm": 0.5306657552719116, "learning_rate": 0.00027144313986346826, "loss": 0.5309, "step": 51730 }, { "epoch": 1.1516203703703705, "grad_norm": 0.45474058389663696, "learning_rate": 0.0002713995917337567, "loss": 0.5582, "step": 51740 }, { "epoch": 1.1518429487179487, "grad_norm": 0.7120450139045715, "learning_rate": 0.0002713560397241329, "loss": 0.6194, "step": 51750 }, { "epoch": 1.1520655270655271, "grad_norm": 0.650835394859314, "learning_rate": 0.00027131248383696356, "loss": 0.6751, "step": 51760 }, { "epoch": 1.1522881054131053, "grad_norm": 0.8832502961158752, "learning_rate": 0.00027126892407461546, "loss": 0.5607, "step": 51770 }, { "epoch": 1.1525106837606838, "grad_norm": 0.9212602376937866, "learning_rate": 0.0002712253604394556, "loss": 0.5538, "step": 51780 }, { "epoch": 1.1527332621082622, "grad_norm": 0.6670851707458496, "learning_rate": 0.0002711817929338514, "loss": 0.5079, "step": 51790 }, { "epoch": 1.1529558404558404, "grad_norm": 0.4576922059059143, "learning_rate": 0.0002711382215601703, "loss": 0.6051, "step": 51800 }, { "epoch": 1.1531784188034189, "grad_norm": 0.4248937666416168, "learning_rate": 0.00027109464632078, "loss": 0.5919, "step": 51810 }, { "epoch": 1.153400997150997, "grad_norm": 0.46840816736221313, "learning_rate": 0.00027105106721804837, "loss": 0.5576, "step": 51820 }, { "epoch": 1.1536235754985755, "grad_norm": 0.5896355509757996, "learning_rate": 0.00027100748425434363, "loss": 0.574, "step": 51830 }, { "epoch": 1.1538461538461537, "grad_norm": 0.8185621500015259, "learning_rate": 0.0002709638974320341, "loss": 0.5869, "step": 51840 }, { "epoch": 1.1540687321937322, "grad_norm": 0.5478002429008484, "learning_rate": 0.00027092030675348824, "loss": 0.4995, "step": 51850 }, { "epoch": 1.1542913105413106, "grad_norm": 0.7509670257568359, "learning_rate": 0.00027087671222107484, "loss": 0.6434, "step": 51860 }, { "epoch": 1.1545138888888888, "grad_norm": 0.5304856300354004, "learning_rate": 0.0002708331138371629, "loss": 0.5819, "step": 51870 }, { "epoch": 1.1547364672364673, "grad_norm": 0.39571112394332886, "learning_rate": 0.00027078951160412155, "loss": 0.6543, "step": 51880 }, { "epoch": 1.1549590455840455, "grad_norm": 0.6003801226615906, "learning_rate": 0.00027074590552432026, "loss": 0.603, "step": 51890 }, { "epoch": 1.155181623931624, "grad_norm": 0.5318456292152405, "learning_rate": 0.00027070229560012847, "loss": 0.4693, "step": 51900 }, { "epoch": 1.1554042022792024, "grad_norm": 0.4450821578502655, "learning_rate": 0.0002706586818339161, "loss": 0.4332, "step": 51910 }, { "epoch": 1.1556267806267806, "grad_norm": 0.5957626104354858, "learning_rate": 0.0002706150642280531, "loss": 0.5313, "step": 51920 }, { "epoch": 1.155849358974359, "grad_norm": 0.7766496539115906, "learning_rate": 0.00027057144278490964, "loss": 0.5441, "step": 51930 }, { "epoch": 1.1560719373219372, "grad_norm": 0.48109349608421326, "learning_rate": 0.0002705278175068563, "loss": 0.5063, "step": 51940 }, { "epoch": 1.1562945156695157, "grad_norm": 0.7788119316101074, "learning_rate": 0.0002704841883962636, "loss": 0.5365, "step": 51950 }, { "epoch": 1.1565170940170941, "grad_norm": 0.7502115368843079, "learning_rate": 0.0002704405554555024, "loss": 0.6355, "step": 51960 }, { "epoch": 1.1567396723646723, "grad_norm": 0.6445801854133606, "learning_rate": 0.00027039691868694374, "loss": 0.5152, "step": 51970 }, { "epoch": 1.1569622507122508, "grad_norm": 0.651919424533844, "learning_rate": 0.00027035327809295885, "loss": 0.559, "step": 51980 }, { "epoch": 1.157184829059829, "grad_norm": 0.5348412990570068, "learning_rate": 0.00027030963367591924, "loss": 0.6066, "step": 51990 }, { "epoch": 1.1574074074074074, "grad_norm": 0.5168819427490234, "learning_rate": 0.00027026598543819665, "loss": 0.6061, "step": 52000 }, { "epoch": 1.1576299857549857, "grad_norm": 0.48285502195358276, "learning_rate": 0.00027022233338216283, "loss": 0.5346, "step": 52010 }, { "epoch": 1.157852564102564, "grad_norm": 0.6535779237747192, "learning_rate": 0.00027017867751019, "loss": 0.5861, "step": 52020 }, { "epoch": 1.1580751424501425, "grad_norm": 0.50959312915802, "learning_rate": 0.0002701350178246503, "loss": 0.5394, "step": 52030 }, { "epoch": 1.1582977207977208, "grad_norm": 0.7481958270072937, "learning_rate": 0.0002700913543279163, "loss": 0.5979, "step": 52040 }, { "epoch": 1.1585202991452992, "grad_norm": 0.5922252535820007, "learning_rate": 0.0002700476870223608, "loss": 0.5435, "step": 52050 }, { "epoch": 1.1587428774928774, "grad_norm": 0.6566937565803528, "learning_rate": 0.00027000401591035665, "loss": 0.5208, "step": 52060 }, { "epoch": 1.1589654558404558, "grad_norm": 0.6020127534866333, "learning_rate": 0.0002699603409942769, "loss": 0.5358, "step": 52070 }, { "epoch": 1.159188034188034, "grad_norm": 0.5534032583236694, "learning_rate": 0.00026991666227649497, "loss": 0.5399, "step": 52080 }, { "epoch": 1.1594106125356125, "grad_norm": 0.6564294695854187, "learning_rate": 0.0002698729797593844, "loss": 0.6216, "step": 52090 }, { "epoch": 1.159633190883191, "grad_norm": 0.5825326442718506, "learning_rate": 0.0002698292934453189, "loss": 0.5339, "step": 52100 }, { "epoch": 1.1598557692307692, "grad_norm": 0.48360762000083923, "learning_rate": 0.0002697856033366724, "loss": 0.5682, "step": 52110 }, { "epoch": 1.1600783475783476, "grad_norm": 0.7247682213783264, "learning_rate": 0.0002697419094358192, "loss": 0.537, "step": 52120 }, { "epoch": 1.160300925925926, "grad_norm": 0.7545642256736755, "learning_rate": 0.0002696982117451334, "loss": 0.5773, "step": 52130 }, { "epoch": 1.1605235042735043, "grad_norm": 0.6881244778633118, "learning_rate": 0.0002696545102669897, "loss": 0.5605, "step": 52140 }, { "epoch": 1.1607460826210827, "grad_norm": 0.463986873626709, "learning_rate": 0.000269610805003763, "loss": 0.5707, "step": 52150 }, { "epoch": 1.160968660968661, "grad_norm": 0.41395312547683716, "learning_rate": 0.0002695670959578282, "loss": 0.6493, "step": 52160 }, { "epoch": 1.1611912393162394, "grad_norm": 0.9375655055046082, "learning_rate": 0.00026952338313156036, "loss": 0.5997, "step": 52170 }, { "epoch": 1.1614138176638176, "grad_norm": 0.6170309782028198, "learning_rate": 0.00026947966652733494, "loss": 0.5741, "step": 52180 }, { "epoch": 1.161636396011396, "grad_norm": 0.6665390729904175, "learning_rate": 0.0002694359461475277, "loss": 0.4955, "step": 52190 }, { "epoch": 1.1618589743589745, "grad_norm": 0.6789956092834473, "learning_rate": 0.0002693922219945142, "loss": 0.6042, "step": 52200 }, { "epoch": 1.1620815527065527, "grad_norm": 0.38877052068710327, "learning_rate": 0.00026934849407067054, "loss": 0.5404, "step": 52210 }, { "epoch": 1.162304131054131, "grad_norm": 0.8757748007774353, "learning_rate": 0.000269304762378373, "loss": 0.5848, "step": 52220 }, { "epoch": 1.1625267094017093, "grad_norm": 0.8811602592468262, "learning_rate": 0.0002692610269199979, "loss": 0.5052, "step": 52230 }, { "epoch": 1.1627492877492878, "grad_norm": 0.5613075494766235, "learning_rate": 0.0002692172876979219, "loss": 0.5791, "step": 52240 }, { "epoch": 1.162971866096866, "grad_norm": 0.41427573561668396, "learning_rate": 0.00026917354471452185, "loss": 0.6177, "step": 52250 }, { "epoch": 1.1631944444444444, "grad_norm": 0.8255056738853455, "learning_rate": 0.0002691297979721747, "loss": 0.5372, "step": 52260 }, { "epoch": 1.1634170227920229, "grad_norm": 0.5964106917381287, "learning_rate": 0.0002690860474732578, "loss": 0.459, "step": 52270 }, { "epoch": 1.163639601139601, "grad_norm": 0.875540018081665, "learning_rate": 0.0002690422932201485, "loss": 0.5354, "step": 52280 }, { "epoch": 1.1638621794871795, "grad_norm": 0.5432913899421692, "learning_rate": 0.0002689985352152244, "loss": 0.6149, "step": 52290 }, { "epoch": 1.164084757834758, "grad_norm": 0.732105553150177, "learning_rate": 0.0002689547734608635, "loss": 0.52, "step": 52300 }, { "epoch": 1.1643073361823362, "grad_norm": 0.7695441246032715, "learning_rate": 0.00026891100795944375, "loss": 0.6225, "step": 52310 }, { "epoch": 1.1645299145299146, "grad_norm": 0.9869396686553955, "learning_rate": 0.00026886723871334336, "loss": 0.5307, "step": 52320 }, { "epoch": 1.1647524928774928, "grad_norm": 0.6698643565177917, "learning_rate": 0.0002688234657249409, "loss": 0.6412, "step": 52330 }, { "epoch": 1.1649750712250713, "grad_norm": 0.8857219219207764, "learning_rate": 0.0002687796889966149, "loss": 0.5894, "step": 52340 }, { "epoch": 1.1651976495726495, "grad_norm": 0.6016620397567749, "learning_rate": 0.0002687359085307444, "loss": 0.5249, "step": 52350 }, { "epoch": 1.165420227920228, "grad_norm": 0.6211386322975159, "learning_rate": 0.00026869212432970827, "loss": 0.5685, "step": 52360 }, { "epoch": 1.1656428062678064, "grad_norm": 0.622130811214447, "learning_rate": 0.00026864833639588594, "loss": 0.6297, "step": 52370 }, { "epoch": 1.1658653846153846, "grad_norm": 0.4021855890750885, "learning_rate": 0.0002686045447316567, "loss": 0.6885, "step": 52380 }, { "epoch": 1.166087962962963, "grad_norm": 0.6166818737983704, "learning_rate": 0.0002685607493394004, "loss": 0.6306, "step": 52390 }, { "epoch": 1.1663105413105412, "grad_norm": 0.588339626789093, "learning_rate": 0.0002685169502214969, "loss": 0.6566, "step": 52400 }, { "epoch": 1.1665331196581197, "grad_norm": 0.7152255177497864, "learning_rate": 0.0002684731473803262, "loss": 0.5807, "step": 52410 }, { "epoch": 1.166755698005698, "grad_norm": 0.8684418201446533, "learning_rate": 0.0002684293408182686, "loss": 0.6132, "step": 52420 }, { "epoch": 1.1669782763532763, "grad_norm": 0.6165472865104675, "learning_rate": 0.0002683855305377046, "loss": 0.5071, "step": 52430 }, { "epoch": 1.1672008547008548, "grad_norm": 0.7422646880149841, "learning_rate": 0.0002683417165410149, "loss": 0.6087, "step": 52440 }, { "epoch": 1.167423433048433, "grad_norm": 0.5423184037208557, "learning_rate": 0.0002682978988305804, "loss": 0.5059, "step": 52450 }, { "epoch": 1.1676460113960114, "grad_norm": 0.8247236013412476, "learning_rate": 0.0002682540774087821, "loss": 0.5573, "step": 52460 }, { "epoch": 1.1678685897435896, "grad_norm": 0.5281010866165161, "learning_rate": 0.00026821025227800145, "loss": 0.4842, "step": 52470 }, { "epoch": 1.168091168091168, "grad_norm": 0.5985293388366699, "learning_rate": 0.00026816642344061983, "loss": 0.5485, "step": 52480 }, { "epoch": 1.1683137464387465, "grad_norm": 0.5074992775917053, "learning_rate": 0.0002681225908990189, "loss": 0.5867, "step": 52490 }, { "epoch": 1.1685363247863247, "grad_norm": 0.598908543586731, "learning_rate": 0.00026807875465558064, "loss": 0.4858, "step": 52500 }, { "epoch": 1.1687589031339032, "grad_norm": 0.7070103883743286, "learning_rate": 0.00026803491471268716, "loss": 0.5633, "step": 52510 }, { "epoch": 1.1689814814814814, "grad_norm": 0.8883180618286133, "learning_rate": 0.00026799107107272066, "loss": 0.5992, "step": 52520 }, { "epoch": 1.1692040598290598, "grad_norm": 0.456495076417923, "learning_rate": 0.00026794722373806365, "loss": 0.4145, "step": 52530 }, { "epoch": 1.1694266381766383, "grad_norm": 0.3848685026168823, "learning_rate": 0.000267903372711099, "loss": 0.6011, "step": 52540 }, { "epoch": 1.1696492165242165, "grad_norm": 0.5944089889526367, "learning_rate": 0.0002678595179942095, "loss": 0.5212, "step": 52550 }, { "epoch": 1.169871794871795, "grad_norm": 0.5271813273429871, "learning_rate": 0.00026781565958977816, "loss": 0.6392, "step": 52560 }, { "epoch": 1.1700943732193732, "grad_norm": 0.6273509860038757, "learning_rate": 0.0002677717975001883, "loss": 0.5713, "step": 52570 }, { "epoch": 1.1703169515669516, "grad_norm": 0.6436539888381958, "learning_rate": 0.00026772793172782363, "loss": 0.6242, "step": 52580 }, { "epoch": 1.1705395299145298, "grad_norm": 0.49243152141571045, "learning_rate": 0.0002676840622750676, "loss": 0.5868, "step": 52590 }, { "epoch": 1.1707621082621082, "grad_norm": 0.6814731955528259, "learning_rate": 0.00026764018914430426, "loss": 0.4408, "step": 52600 }, { "epoch": 1.1709846866096867, "grad_norm": 0.49323052167892456, "learning_rate": 0.00026759631233791767, "loss": 0.4908, "step": 52610 }, { "epoch": 1.171207264957265, "grad_norm": 0.48324576020240784, "learning_rate": 0.00026755243185829213, "loss": 0.5272, "step": 52620 }, { "epoch": 1.1714298433048433, "grad_norm": 0.5585512518882751, "learning_rate": 0.0002675085477078121, "loss": 0.5014, "step": 52630 }, { "epoch": 1.1716524216524216, "grad_norm": 0.5609182119369507, "learning_rate": 0.0002674646598888624, "loss": 0.5145, "step": 52640 }, { "epoch": 1.171875, "grad_norm": 0.6564444899559021, "learning_rate": 0.0002674207684038278, "loss": 0.6005, "step": 52650 }, { "epoch": 1.1720975783475784, "grad_norm": 0.6718125343322754, "learning_rate": 0.00026737687325509345, "loss": 0.6004, "step": 52660 }, { "epoch": 1.1723201566951567, "grad_norm": 0.6030959486961365, "learning_rate": 0.0002673329744450447, "loss": 0.5524, "step": 52670 }, { "epoch": 1.172542735042735, "grad_norm": 0.48877060413360596, "learning_rate": 0.00026728907197606696, "loss": 0.4131, "step": 52680 }, { "epoch": 1.1727653133903133, "grad_norm": 0.5057005286216736, "learning_rate": 0.00026724516585054596, "loss": 0.694, "step": 52690 }, { "epoch": 1.1729878917378918, "grad_norm": 1.0375540256500244, "learning_rate": 0.0002672012560708676, "loss": 0.486, "step": 52700 }, { "epoch": 1.1732104700854702, "grad_norm": 0.7084336876869202, "learning_rate": 0.00026715734263941794, "loss": 0.5738, "step": 52710 }, { "epoch": 1.1734330484330484, "grad_norm": 0.6579006314277649, "learning_rate": 0.0002671134255585834, "loss": 0.7216, "step": 52720 }, { "epoch": 1.1736556267806268, "grad_norm": 0.7561776638031006, "learning_rate": 0.0002670695048307502, "loss": 0.6214, "step": 52730 }, { "epoch": 1.173878205128205, "grad_norm": 0.7474552392959595, "learning_rate": 0.0002670255804583054, "loss": 0.6917, "step": 52740 }, { "epoch": 1.1741007834757835, "grad_norm": 0.49505066871643066, "learning_rate": 0.00026698165244363564, "loss": 0.6039, "step": 52750 }, { "epoch": 1.1743233618233617, "grad_norm": 1.0127018690109253, "learning_rate": 0.00026693772078912795, "loss": 0.5748, "step": 52760 }, { "epoch": 1.1745459401709402, "grad_norm": 0.4878261685371399, "learning_rate": 0.0002668937854971698, "loss": 0.4545, "step": 52770 }, { "epoch": 1.1747685185185186, "grad_norm": 0.7447136044502258, "learning_rate": 0.0002668498465701485, "loss": 0.7469, "step": 52780 }, { "epoch": 1.1749910968660968, "grad_norm": 0.581995964050293, "learning_rate": 0.00026680590401045195, "loss": 0.6097, "step": 52790 }, { "epoch": 1.1752136752136753, "grad_norm": 0.5570381283760071, "learning_rate": 0.00026676195782046776, "loss": 0.4755, "step": 52800 }, { "epoch": 1.1754362535612535, "grad_norm": 0.49623483419418335, "learning_rate": 0.0002667180080025842, "loss": 0.5373, "step": 52810 }, { "epoch": 1.175658831908832, "grad_norm": 0.5563613176345825, "learning_rate": 0.00026667405455918947, "loss": 0.5642, "step": 52820 }, { "epoch": 1.1758814102564104, "grad_norm": 0.6269398927688599, "learning_rate": 0.000266630097492672, "loss": 0.6702, "step": 52830 }, { "epoch": 1.1761039886039886, "grad_norm": 0.6564904451370239, "learning_rate": 0.0002665861368054205, "loss": 0.6917, "step": 52840 }, { "epoch": 1.176326566951567, "grad_norm": 0.9609503746032715, "learning_rate": 0.00026654217249982376, "loss": 0.6088, "step": 52850 }, { "epoch": 1.1765491452991452, "grad_norm": 0.6753114461898804, "learning_rate": 0.00026649820457827093, "loss": 0.5698, "step": 52860 }, { "epoch": 1.1767717236467237, "grad_norm": 0.6223951578140259, "learning_rate": 0.0002664542330431513, "loss": 0.4725, "step": 52870 }, { "epoch": 1.176994301994302, "grad_norm": 0.9456238150596619, "learning_rate": 0.0002664102578968541, "loss": 0.6092, "step": 52880 }, { "epoch": 1.1772168803418803, "grad_norm": 0.5020238757133484, "learning_rate": 0.0002663662791417693, "loss": 0.5609, "step": 52890 }, { "epoch": 1.1774394586894588, "grad_norm": 0.5456209778785706, "learning_rate": 0.0002663222967802864, "loss": 0.6475, "step": 52900 }, { "epoch": 1.177662037037037, "grad_norm": 0.8311290144920349, "learning_rate": 0.00026627831081479567, "loss": 0.6265, "step": 52910 }, { "epoch": 1.1778846153846154, "grad_norm": 0.6691579818725586, "learning_rate": 0.00026623432124768726, "loss": 0.6148, "step": 52920 }, { "epoch": 1.1781071937321936, "grad_norm": 0.7260137796401978, "learning_rate": 0.0002661903280813516, "loss": 0.5983, "step": 52930 }, { "epoch": 1.178329772079772, "grad_norm": 0.7961229085922241, "learning_rate": 0.00026614633131817936, "loss": 0.5465, "step": 52940 }, { "epoch": 1.1785523504273505, "grad_norm": 0.7031990885734558, "learning_rate": 0.00026610233096056136, "loss": 0.7622, "step": 52950 }, { "epoch": 1.1787749287749287, "grad_norm": 0.6887387037277222, "learning_rate": 0.00026605832701088853, "loss": 0.582, "step": 52960 }, { "epoch": 1.1789975071225072, "grad_norm": 0.9699671864509583, "learning_rate": 0.0002660143194715521, "loss": 0.5528, "step": 52970 }, { "epoch": 1.1792200854700854, "grad_norm": 0.6452189683914185, "learning_rate": 0.0002659703083449435, "loss": 0.5524, "step": 52980 }, { "epoch": 1.1794426638176638, "grad_norm": 0.6887800097465515, "learning_rate": 0.00026592629363345445, "loss": 0.6631, "step": 52990 }, { "epoch": 1.179665242165242, "grad_norm": 0.5450928211212158, "learning_rate": 0.00026588227533947653, "loss": 0.5136, "step": 53000 }, { "epoch": 1.1798878205128205, "grad_norm": 0.6100490689277649, "learning_rate": 0.0002658382534654019, "loss": 0.5468, "step": 53010 }, { "epoch": 1.180110398860399, "grad_norm": 0.3202427327632904, "learning_rate": 0.0002657942280136226, "loss": 0.4887, "step": 53020 }, { "epoch": 1.1803329772079771, "grad_norm": 0.5512992739677429, "learning_rate": 0.00026575019898653117, "loss": 0.673, "step": 53030 }, { "epoch": 1.1805555555555556, "grad_norm": 1.174062728881836, "learning_rate": 0.00026570616638652006, "loss": 0.5801, "step": 53040 }, { "epoch": 1.180778133903134, "grad_norm": 0.7160769701004028, "learning_rate": 0.0002656621302159821, "loss": 0.5151, "step": 53050 }, { "epoch": 1.1810007122507122, "grad_norm": 0.6652625203132629, "learning_rate": 0.0002656180904773102, "loss": 0.6656, "step": 53060 }, { "epoch": 1.1812232905982907, "grad_norm": 0.3985412120819092, "learning_rate": 0.00026557404717289756, "loss": 0.4716, "step": 53070 }, { "epoch": 1.181445868945869, "grad_norm": 0.6421456336975098, "learning_rate": 0.0002655300003051375, "loss": 0.597, "step": 53080 }, { "epoch": 1.1816684472934473, "grad_norm": 0.4710211455821991, "learning_rate": 0.00026548594987642365, "loss": 0.4481, "step": 53090 }, { "epoch": 1.1818910256410255, "grad_norm": 0.4872575104236603, "learning_rate": 0.00026544189588914964, "loss": 0.4522, "step": 53100 }, { "epoch": 1.182113603988604, "grad_norm": 0.7041962146759033, "learning_rate": 0.0002653978383457094, "loss": 0.5575, "step": 53110 }, { "epoch": 1.1823361823361824, "grad_norm": 0.5078232288360596, "learning_rate": 0.00026535377724849703, "loss": 0.6371, "step": 53120 }, { "epoch": 1.1825587606837606, "grad_norm": 0.8470007181167603, "learning_rate": 0.00026530971259990696, "loss": 0.4924, "step": 53130 }, { "epoch": 1.182781339031339, "grad_norm": 0.8424126505851746, "learning_rate": 0.0002652656444023338, "loss": 0.6354, "step": 53140 }, { "epoch": 1.1830039173789173, "grad_norm": 0.6890965104103088, "learning_rate": 0.0002652215726581719, "loss": 0.6349, "step": 53150 }, { "epoch": 1.1832264957264957, "grad_norm": 0.6198452115058899, "learning_rate": 0.00026517749736981635, "loss": 0.4863, "step": 53160 }, { "epoch": 1.183449074074074, "grad_norm": 0.7459619641304016, "learning_rate": 0.0002651334185396623, "loss": 0.5476, "step": 53170 }, { "epoch": 1.1836716524216524, "grad_norm": 0.7808142304420471, "learning_rate": 0.000265089336170105, "loss": 0.6134, "step": 53180 }, { "epoch": 1.1838942307692308, "grad_norm": 0.6265953779220581, "learning_rate": 0.0002650452502635398, "loss": 0.5628, "step": 53190 }, { "epoch": 1.184116809116809, "grad_norm": 0.5564361810684204, "learning_rate": 0.0002650011608223625, "loss": 0.5641, "step": 53200 }, { "epoch": 1.1843393874643875, "grad_norm": 0.6715850234031677, "learning_rate": 0.0002649570678489689, "loss": 0.5415, "step": 53210 }, { "epoch": 1.184561965811966, "grad_norm": 0.7137435078620911, "learning_rate": 0.00026491297134575504, "loss": 0.6647, "step": 53220 }, { "epoch": 1.1847845441595442, "grad_norm": 0.6090908646583557, "learning_rate": 0.0002648688713151172, "loss": 0.5755, "step": 53230 }, { "epoch": 1.1850071225071226, "grad_norm": 0.7091788649559021, "learning_rate": 0.0002648247677594518, "loss": 0.5249, "step": 53240 }, { "epoch": 1.1852297008547008, "grad_norm": 0.5731387734413147, "learning_rate": 0.0002647806606811554, "loss": 0.5721, "step": 53250 }, { "epoch": 1.1854522792022792, "grad_norm": 0.851579487323761, "learning_rate": 0.00026473655008262486, "loss": 0.6269, "step": 53260 }, { "epoch": 1.1856748575498575, "grad_norm": 0.6304162740707397, "learning_rate": 0.0002646924359662573, "loss": 0.6449, "step": 53270 }, { "epoch": 1.185897435897436, "grad_norm": 0.8386474847793579, "learning_rate": 0.00026464831833444976, "loss": 0.5611, "step": 53280 }, { "epoch": 1.1861200142450143, "grad_norm": 0.6858943700790405, "learning_rate": 0.00026460419718959965, "loss": 0.6902, "step": 53290 }, { "epoch": 1.1863425925925926, "grad_norm": 0.6749021410942078, "learning_rate": 0.0002645600725341046, "loss": 0.5678, "step": 53300 }, { "epoch": 1.186565170940171, "grad_norm": 0.6653118133544922, "learning_rate": 0.00026451594437036234, "loss": 0.6369, "step": 53310 }, { "epoch": 1.1867877492877492, "grad_norm": 0.4555003345012665, "learning_rate": 0.00026447181270077084, "loss": 0.5047, "step": 53320 }, { "epoch": 1.1870103276353277, "grad_norm": 0.44015443325042725, "learning_rate": 0.0002644276775277283, "loss": 0.5344, "step": 53330 }, { "epoch": 1.1872329059829059, "grad_norm": 0.4752315878868103, "learning_rate": 0.00026438353885363297, "loss": 0.5221, "step": 53340 }, { "epoch": 1.1874554843304843, "grad_norm": 0.8851911425590515, "learning_rate": 0.00026433939668088344, "loss": 0.6015, "step": 53350 }, { "epoch": 1.1876780626780628, "grad_norm": 0.4986988604068756, "learning_rate": 0.0002642952510118785, "loss": 0.6087, "step": 53360 }, { "epoch": 1.187900641025641, "grad_norm": 1.0384236574172974, "learning_rate": 0.00026425110184901687, "loss": 0.4508, "step": 53370 }, { "epoch": 1.1881232193732194, "grad_norm": 0.7926008105278015, "learning_rate": 0.00026420694919469784, "loss": 0.6285, "step": 53380 }, { "epoch": 1.1883457977207976, "grad_norm": 0.9093513488769531, "learning_rate": 0.0002641627930513206, "loss": 0.532, "step": 53390 }, { "epoch": 1.188568376068376, "grad_norm": 0.739948570728302, "learning_rate": 0.0002641186334212847, "loss": 0.576, "step": 53400 }, { "epoch": 1.1887909544159545, "grad_norm": 0.4455528259277344, "learning_rate": 0.00026407447030698974, "loss": 0.437, "step": 53410 }, { "epoch": 1.1890135327635327, "grad_norm": 0.6131460070610046, "learning_rate": 0.00026403030371083557, "loss": 0.5818, "step": 53420 }, { "epoch": 1.1892361111111112, "grad_norm": 0.6285380721092224, "learning_rate": 0.0002639861336352223, "loss": 0.4712, "step": 53430 }, { "epoch": 1.1894586894586894, "grad_norm": 0.8274964094161987, "learning_rate": 0.00026394196008255015, "loss": 0.5606, "step": 53440 }, { "epoch": 1.1896812678062678, "grad_norm": 0.6058911681175232, "learning_rate": 0.0002638977830552196, "loss": 0.5672, "step": 53450 }, { "epoch": 1.1899038461538463, "grad_norm": 0.6424249410629272, "learning_rate": 0.0002638536025556312, "loss": 0.5608, "step": 53460 }, { "epoch": 1.1901264245014245, "grad_norm": 0.5659881234169006, "learning_rate": 0.0002638094185861857, "loss": 0.4935, "step": 53470 }, { "epoch": 1.190349002849003, "grad_norm": 0.5742921829223633, "learning_rate": 0.0002637652311492842, "loss": 0.624, "step": 53480 }, { "epoch": 1.1905715811965811, "grad_norm": 0.6720833778381348, "learning_rate": 0.00026372104024732784, "loss": 0.5983, "step": 53490 }, { "epoch": 1.1907941595441596, "grad_norm": 0.9703565835952759, "learning_rate": 0.00026367684588271794, "loss": 0.6631, "step": 53500 }, { "epoch": 1.1910167378917378, "grad_norm": 0.7967060208320618, "learning_rate": 0.00026363264805785616, "loss": 0.5349, "step": 53510 }, { "epoch": 1.1912393162393162, "grad_norm": 0.5237561464309692, "learning_rate": 0.0002635884467751442, "loss": 0.5134, "step": 53520 }, { "epoch": 1.1914618945868947, "grad_norm": 0.5853220820426941, "learning_rate": 0.000263544242036984, "loss": 0.5488, "step": 53530 }, { "epoch": 1.1916844729344729, "grad_norm": 0.6570531129837036, "learning_rate": 0.0002635000338457776, "loss": 0.5671, "step": 53540 }, { "epoch": 1.1919070512820513, "grad_norm": 0.6014338135719299, "learning_rate": 0.00026345582220392734, "loss": 0.4712, "step": 53550 }, { "epoch": 1.1921296296296295, "grad_norm": 0.7948473691940308, "learning_rate": 0.0002634116071138359, "loss": 0.5277, "step": 53560 }, { "epoch": 1.192352207977208, "grad_norm": 0.5840214490890503, "learning_rate": 0.0002633673885779057, "loss": 0.5787, "step": 53570 }, { "epoch": 1.1925747863247864, "grad_norm": 0.5520692467689514, "learning_rate": 0.00026332316659853975, "loss": 0.5515, "step": 53580 }, { "epoch": 1.1927973646723646, "grad_norm": 0.49704843759536743, "learning_rate": 0.00026327894117814116, "loss": 0.5606, "step": 53590 }, { "epoch": 1.193019943019943, "grad_norm": 0.7613427639007568, "learning_rate": 0.00026323471231911303, "loss": 0.5976, "step": 53600 }, { "epoch": 1.1932425213675213, "grad_norm": 0.615630567073822, "learning_rate": 0.0002631904800238589, "loss": 0.5749, "step": 53610 }, { "epoch": 1.1934650997150997, "grad_norm": 0.38871222734451294, "learning_rate": 0.0002631462442947823, "loss": 0.5895, "step": 53620 }, { "epoch": 1.1936876780626782, "grad_norm": 0.47710564732551575, "learning_rate": 0.0002631020051342872, "loss": 0.4715, "step": 53630 }, { "epoch": 1.1939102564102564, "grad_norm": 0.6385695934295654, "learning_rate": 0.00026305776254477735, "loss": 0.6086, "step": 53640 }, { "epoch": 1.1941328347578348, "grad_norm": 0.49752333760261536, "learning_rate": 0.0002630135165286571, "loss": 0.4322, "step": 53650 }, { "epoch": 1.194355413105413, "grad_norm": 0.6448046565055847, "learning_rate": 0.00026296926708833083, "loss": 0.5492, "step": 53660 }, { "epoch": 1.1945779914529915, "grad_norm": 0.6780838966369629, "learning_rate": 0.00026292501422620307, "loss": 0.5434, "step": 53670 }, { "epoch": 1.1948005698005697, "grad_norm": 0.6584859490394592, "learning_rate": 0.00026288075794467843, "loss": 0.5916, "step": 53680 }, { "epoch": 1.1950231481481481, "grad_norm": 0.6336795091629028, "learning_rate": 0.00026283649824616195, "loss": 0.5295, "step": 53690 }, { "epoch": 1.1952457264957266, "grad_norm": 0.5201059579849243, "learning_rate": 0.0002627922351330588, "loss": 0.5759, "step": 53700 }, { "epoch": 1.1954683048433048, "grad_norm": 0.4014904499053955, "learning_rate": 0.0002627479686077741, "loss": 0.5278, "step": 53710 }, { "epoch": 1.1956908831908832, "grad_norm": 0.532713770866394, "learning_rate": 0.00026270369867271336, "loss": 0.4821, "step": 53720 }, { "epoch": 1.1959134615384615, "grad_norm": 0.46067413687705994, "learning_rate": 0.0002626594253302824, "loss": 0.5367, "step": 53730 }, { "epoch": 1.19613603988604, "grad_norm": 0.7721562385559082, "learning_rate": 0.000262615148582887, "loss": 0.4979, "step": 53740 }, { "epoch": 1.196358618233618, "grad_norm": 0.6185513734817505, "learning_rate": 0.0002625708684329331, "loss": 0.5486, "step": 53750 }, { "epoch": 1.1965811965811965, "grad_norm": 0.6002604365348816, "learning_rate": 0.00026252658488282697, "loss": 0.4288, "step": 53760 }, { "epoch": 1.196803774928775, "grad_norm": 0.5647562742233276, "learning_rate": 0.00026248229793497506, "loss": 0.4885, "step": 53770 }, { "epoch": 1.1970263532763532, "grad_norm": 0.7083515524864197, "learning_rate": 0.00026243800759178396, "loss": 0.7101, "step": 53780 }, { "epoch": 1.1972489316239316, "grad_norm": 0.7820031642913818, "learning_rate": 0.00026239371385566044, "loss": 0.6792, "step": 53790 }, { "epoch": 1.19747150997151, "grad_norm": 0.4229520857334137, "learning_rate": 0.00026234941672901137, "loss": 0.5298, "step": 53800 }, { "epoch": 1.1976940883190883, "grad_norm": 0.8767134547233582, "learning_rate": 0.00026230511621424396, "loss": 0.6235, "step": 53810 }, { "epoch": 1.1979166666666667, "grad_norm": 0.6357962489128113, "learning_rate": 0.0002622608123137655, "loss": 0.5968, "step": 53820 }, { "epoch": 1.198139245014245, "grad_norm": 0.5747750997543335, "learning_rate": 0.00026221650502998356, "loss": 0.5996, "step": 53830 }, { "epoch": 1.1983618233618234, "grad_norm": 0.6612414717674255, "learning_rate": 0.0002621721943653058, "loss": 0.6281, "step": 53840 }, { "epoch": 1.1985844017094016, "grad_norm": 0.4362661838531494, "learning_rate": 0.0002621278803221401, "loss": 0.6812, "step": 53850 }, { "epoch": 1.19880698005698, "grad_norm": 0.4877786934375763, "learning_rate": 0.0002620835629028946, "loss": 0.5982, "step": 53860 }, { "epoch": 1.1990295584045585, "grad_norm": 0.6957717537879944, "learning_rate": 0.00026203924210997735, "loss": 0.5806, "step": 53870 }, { "epoch": 1.1992521367521367, "grad_norm": 0.5992857813835144, "learning_rate": 0.00026199491794579694, "loss": 0.5286, "step": 53880 }, { "epoch": 1.1994747150997151, "grad_norm": 0.6934967637062073, "learning_rate": 0.0002619505904127619, "loss": 0.7629, "step": 53890 }, { "epoch": 1.1996972934472934, "grad_norm": 0.5664094090461731, "learning_rate": 0.0002619062595132811, "loss": 0.7004, "step": 53900 }, { "epoch": 1.1999198717948718, "grad_norm": 1.015615701675415, "learning_rate": 0.00026186192524976353, "loss": 0.5354, "step": 53910 }, { "epoch": 1.20014245014245, "grad_norm": 0.5994182825088501, "learning_rate": 0.00026181758762461825, "loss": 0.549, "step": 53920 }, { "epoch": 1.20014245014245, "eval_loss": 0.5869694352149963, "eval_runtime": 337.2585, "eval_samples_per_second": 7.012, "eval_steps_per_second": 7.012, "step": 53920 }, { "epoch": 1.2003650284900285, "grad_norm": 0.9583232402801514, "learning_rate": 0.0002617732466402547, "loss": 0.6322, "step": 53930 }, { "epoch": 1.200587606837607, "grad_norm": 0.7566698789596558, "learning_rate": 0.00026172890229908226, "loss": 0.7383, "step": 53940 }, { "epoch": 1.2008101851851851, "grad_norm": 0.767450213432312, "learning_rate": 0.0002616845546035108, "loss": 0.5751, "step": 53950 }, { "epoch": 1.2010327635327636, "grad_norm": 0.5808753967285156, "learning_rate": 0.00026164020355595014, "loss": 0.5769, "step": 53960 }, { "epoch": 1.201255341880342, "grad_norm": 0.5383173823356628, "learning_rate": 0.0002615958491588103, "loss": 0.4942, "step": 53970 }, { "epoch": 1.2014779202279202, "grad_norm": 1.001544713973999, "learning_rate": 0.0002615514914145017, "loss": 0.6011, "step": 53980 }, { "epoch": 1.2017004985754987, "grad_norm": 0.48997971415519714, "learning_rate": 0.0002615071303254346, "loss": 0.496, "step": 53990 }, { "epoch": 1.2019230769230769, "grad_norm": 0.7488561868667603, "learning_rate": 0.00026146276589401966, "loss": 0.5785, "step": 54000 }, { "epoch": 1.2021456552706553, "grad_norm": 0.6256828904151917, "learning_rate": 0.0002614183981226678, "loss": 0.5895, "step": 54010 }, { "epoch": 1.2023682336182335, "grad_norm": 0.5626097917556763, "learning_rate": 0.00026137402701378984, "loss": 0.4916, "step": 54020 }, { "epoch": 1.202590811965812, "grad_norm": 0.5621308088302612, "learning_rate": 0.000261329652569797, "loss": 0.5951, "step": 54030 }, { "epoch": 1.2028133903133904, "grad_norm": 0.6779301166534424, "learning_rate": 0.00026128527479310064, "loss": 0.5456, "step": 54040 }, { "epoch": 1.2030359686609686, "grad_norm": 0.816089391708374, "learning_rate": 0.0002612408936861123, "loss": 0.482, "step": 54050 }, { "epoch": 1.203258547008547, "grad_norm": 0.6551616191864014, "learning_rate": 0.00026119650925124366, "loss": 0.4211, "step": 54060 }, { "epoch": 1.2034811253561253, "grad_norm": 0.6255223751068115, "learning_rate": 0.0002611521214909066, "loss": 0.5114, "step": 54070 }, { "epoch": 1.2037037037037037, "grad_norm": 0.514076292514801, "learning_rate": 0.0002611077304075132, "loss": 0.4465, "step": 54080 }, { "epoch": 1.203926282051282, "grad_norm": 0.4324864149093628, "learning_rate": 0.00026106333600347566, "loss": 0.4174, "step": 54090 }, { "epoch": 1.2041488603988604, "grad_norm": 0.6271386742591858, "learning_rate": 0.0002610189382812065, "loss": 0.628, "step": 54100 }, { "epoch": 1.2043714387464388, "grad_norm": 0.6508593559265137, "learning_rate": 0.0002609745372431183, "loss": 0.4914, "step": 54110 }, { "epoch": 1.204594017094017, "grad_norm": 0.5108374953269958, "learning_rate": 0.00026093013289162385, "loss": 0.563, "step": 54120 }, { "epoch": 1.2048165954415955, "grad_norm": 0.8042571544647217, "learning_rate": 0.00026088572522913606, "loss": 0.5608, "step": 54130 }, { "epoch": 1.205039173789174, "grad_norm": 0.7624759674072266, "learning_rate": 0.000260841314258068, "loss": 0.456, "step": 54140 }, { "epoch": 1.2052617521367521, "grad_norm": 0.789258599281311, "learning_rate": 0.0002607968999808333, "loss": 0.4817, "step": 54150 }, { "epoch": 1.2054843304843306, "grad_norm": 0.7475525140762329, "learning_rate": 0.0002607524823998452, "loss": 0.5729, "step": 54160 }, { "epoch": 1.2057069088319088, "grad_norm": 0.5522698163986206, "learning_rate": 0.0002607080615175175, "loss": 0.4242, "step": 54170 }, { "epoch": 1.2059294871794872, "grad_norm": 0.9012867212295532, "learning_rate": 0.00026066363733626396, "loss": 0.5816, "step": 54180 }, { "epoch": 1.2061520655270654, "grad_norm": 0.6278597712516785, "learning_rate": 0.0002606192098584988, "loss": 0.5937, "step": 54190 }, { "epoch": 1.2063746438746439, "grad_norm": 0.674605131149292, "learning_rate": 0.00026057477908663615, "loss": 0.4871, "step": 54200 }, { "epoch": 1.2065972222222223, "grad_norm": 0.5200433135032654, "learning_rate": 0.00026053034502309037, "loss": 0.5309, "step": 54210 }, { "epoch": 1.2068198005698005, "grad_norm": 0.6211512088775635, "learning_rate": 0.0002604859076702761, "loss": 0.7805, "step": 54220 }, { "epoch": 1.207042378917379, "grad_norm": 0.5976756811141968, "learning_rate": 0.0002604414670306081, "loss": 0.4788, "step": 54230 }, { "epoch": 1.2072649572649572, "grad_norm": 0.7606221437454224, "learning_rate": 0.0002603970231065013, "loss": 0.4546, "step": 54240 }, { "epoch": 1.2074875356125356, "grad_norm": 0.5437316298484802, "learning_rate": 0.00026035257590037084, "loss": 0.5922, "step": 54250 }, { "epoch": 1.2077101139601139, "grad_norm": 0.7171007990837097, "learning_rate": 0.000260308125414632, "loss": 0.59, "step": 54260 }, { "epoch": 1.2079326923076923, "grad_norm": 0.7423374652862549, "learning_rate": 0.00026026367165170024, "loss": 0.5753, "step": 54270 }, { "epoch": 1.2081552706552707, "grad_norm": 0.42859748005867004, "learning_rate": 0.0002602192146139912, "loss": 0.5436, "step": 54280 }, { "epoch": 1.208377849002849, "grad_norm": 0.5075267553329468, "learning_rate": 0.0002601747543039207, "loss": 0.513, "step": 54290 }, { "epoch": 1.2086004273504274, "grad_norm": 0.6833608746528625, "learning_rate": 0.0002601302907239049, "loss": 0.5712, "step": 54300 }, { "epoch": 1.2088230056980056, "grad_norm": 0.8390812873840332, "learning_rate": 0.0002600858238763598, "loss": 0.5476, "step": 54310 }, { "epoch": 1.209045584045584, "grad_norm": 0.7078325748443604, "learning_rate": 0.0002600413537637019, "loss": 0.6077, "step": 54320 }, { "epoch": 1.2092681623931625, "grad_norm": 0.8721582889556885, "learning_rate": 0.0002599968803883477, "loss": 0.6386, "step": 54330 }, { "epoch": 1.2094907407407407, "grad_norm": 0.6040824055671692, "learning_rate": 0.0002599524037527138, "loss": 0.6223, "step": 54340 }, { "epoch": 1.2097133190883191, "grad_norm": 0.8039819002151489, "learning_rate": 0.00025990792385921724, "loss": 0.5922, "step": 54350 }, { "epoch": 1.2099358974358974, "grad_norm": 0.8088663816452026, "learning_rate": 0.00025986344071027507, "loss": 0.707, "step": 54360 }, { "epoch": 1.2101584757834758, "grad_norm": 0.38190963864326477, "learning_rate": 0.00025981895430830456, "loss": 0.5355, "step": 54370 }, { "epoch": 1.2103810541310542, "grad_norm": 0.5602192282676697, "learning_rate": 0.00025977446465572313, "loss": 0.6918, "step": 54380 }, { "epoch": 1.2106036324786325, "grad_norm": 0.3187742531299591, "learning_rate": 0.00025972997175494826, "loss": 0.3899, "step": 54390 }, { "epoch": 1.210826210826211, "grad_norm": 0.46724361181259155, "learning_rate": 0.0002596854756083979, "loss": 0.5282, "step": 54400 }, { "epoch": 1.211048789173789, "grad_norm": 0.44511914253234863, "learning_rate": 0.0002596409762184899, "loss": 0.4879, "step": 54410 }, { "epoch": 1.2112713675213675, "grad_norm": 0.5991200804710388, "learning_rate": 0.00025959647358764237, "loss": 0.5285, "step": 54420 }, { "epoch": 1.2114939458689458, "grad_norm": 0.5048407316207886, "learning_rate": 0.00025955196771827374, "loss": 0.5108, "step": 54430 }, { "epoch": 1.2117165242165242, "grad_norm": 0.4918557405471802, "learning_rate": 0.00025950745861280243, "loss": 0.4733, "step": 54440 }, { "epoch": 1.2119391025641026, "grad_norm": 0.56037837266922, "learning_rate": 0.00025946294627364713, "loss": 0.5976, "step": 54450 }, { "epoch": 1.2121616809116809, "grad_norm": 0.3522513508796692, "learning_rate": 0.0002594184307032266, "loss": 0.5696, "step": 54460 }, { "epoch": 1.2123842592592593, "grad_norm": 0.6364935040473938, "learning_rate": 0.0002593739119039599, "loss": 0.651, "step": 54470 }, { "epoch": 1.2126068376068375, "grad_norm": 0.6097647547721863, "learning_rate": 0.00025932938987826626, "loss": 0.6558, "step": 54480 }, { "epoch": 1.212829415954416, "grad_norm": 0.5620438456535339, "learning_rate": 0.000259284864628565, "loss": 0.5837, "step": 54490 }, { "epoch": 1.2130519943019944, "grad_norm": 0.7545850276947021, "learning_rate": 0.00025924033615727567, "loss": 0.6373, "step": 54500 }, { "epoch": 1.2132745726495726, "grad_norm": 0.49159228801727295, "learning_rate": 0.000259195804466818, "loss": 0.4563, "step": 54510 }, { "epoch": 1.213497150997151, "grad_norm": 0.5741070508956909, "learning_rate": 0.0002591512695596118, "loss": 0.537, "step": 54520 }, { "epoch": 1.2137197293447293, "grad_norm": 0.7356338500976562, "learning_rate": 0.0002591067314380772, "loss": 0.5082, "step": 54530 }, { "epoch": 1.2139423076923077, "grad_norm": 0.49523648619651794, "learning_rate": 0.00025906219010463446, "loss": 0.6259, "step": 54540 }, { "epoch": 1.2141648860398861, "grad_norm": 0.5031092762947083, "learning_rate": 0.00025901764556170387, "loss": 0.5766, "step": 54550 }, { "epoch": 1.2143874643874644, "grad_norm": 0.5970885753631592, "learning_rate": 0.0002589730978117062, "loss": 0.6695, "step": 54560 }, { "epoch": 1.2146100427350428, "grad_norm": 0.8675116300582886, "learning_rate": 0.0002589285468570621, "loss": 0.5188, "step": 54570 }, { "epoch": 1.214832621082621, "grad_norm": 0.6658830046653748, "learning_rate": 0.0002588839927001925, "loss": 0.6013, "step": 54580 }, { "epoch": 1.2150551994301995, "grad_norm": 0.37236538529396057, "learning_rate": 0.0002588394353435185, "loss": 0.4736, "step": 54590 }, { "epoch": 1.2152777777777777, "grad_norm": 0.6953976154327393, "learning_rate": 0.0002587948747894615, "loss": 0.4716, "step": 54600 }, { "epoch": 1.2155003561253561, "grad_norm": 0.5283172726631165, "learning_rate": 0.00025875031104044283, "loss": 0.5972, "step": 54610 }, { "epoch": 1.2157229344729346, "grad_norm": 0.9671580195426941, "learning_rate": 0.00025870574409888415, "loss": 0.5456, "step": 54620 }, { "epoch": 1.2159455128205128, "grad_norm": 0.4424217641353607, "learning_rate": 0.00025866117396720727, "loss": 0.6644, "step": 54630 }, { "epoch": 1.2161680911680912, "grad_norm": 0.6921446919441223, "learning_rate": 0.0002586166006478342, "loss": 0.7807, "step": 54640 }, { "epoch": 1.2163906695156694, "grad_norm": 0.6515398025512695, "learning_rate": 0.00025857202414318706, "loss": 0.6287, "step": 54650 }, { "epoch": 1.2166132478632479, "grad_norm": 0.6656025052070618, "learning_rate": 0.0002585274444556882, "loss": 0.5519, "step": 54660 }, { "epoch": 1.216835826210826, "grad_norm": 0.5132244825363159, "learning_rate": 0.00025848286158776005, "loss": 0.5149, "step": 54670 }, { "epoch": 1.2170584045584045, "grad_norm": 0.755617618560791, "learning_rate": 0.00025843827554182535, "loss": 0.5941, "step": 54680 }, { "epoch": 1.217280982905983, "grad_norm": 0.4537512958049774, "learning_rate": 0.0002583936863203069, "loss": 0.5314, "step": 54690 }, { "epoch": 1.2175035612535612, "grad_norm": 0.5429509282112122, "learning_rate": 0.00025834909392562775, "loss": 0.5425, "step": 54700 }, { "epoch": 1.2177261396011396, "grad_norm": 0.766391396522522, "learning_rate": 0.0002583044983602111, "loss": 0.671, "step": 54710 }, { "epoch": 1.217948717948718, "grad_norm": 0.7556502819061279, "learning_rate": 0.00025825989962648024, "loss": 0.5542, "step": 54720 }, { "epoch": 1.2181712962962963, "grad_norm": 0.5049962401390076, "learning_rate": 0.00025821529772685874, "loss": 0.5729, "step": 54730 }, { "epoch": 1.2183938746438747, "grad_norm": 0.6555672287940979, "learning_rate": 0.00025817069266377026, "loss": 0.6085, "step": 54740 }, { "epoch": 1.218616452991453, "grad_norm": 0.6055775880813599, "learning_rate": 0.00025812608443963884, "loss": 0.6407, "step": 54750 }, { "epoch": 1.2188390313390314, "grad_norm": 0.7035840749740601, "learning_rate": 0.0002580814730568883, "loss": 0.586, "step": 54760 }, { "epoch": 1.2190616096866096, "grad_norm": 0.39872151613235474, "learning_rate": 0.000258036858517943, "loss": 0.555, "step": 54770 }, { "epoch": 1.219284188034188, "grad_norm": 0.5271748900413513, "learning_rate": 0.0002579922408252273, "loss": 0.5704, "step": 54780 }, { "epoch": 1.2195067663817665, "grad_norm": 0.5622698664665222, "learning_rate": 0.00025794761998116576, "loss": 0.6895, "step": 54790 }, { "epoch": 1.2197293447293447, "grad_norm": 0.5202801823616028, "learning_rate": 0.0002579029959881831, "loss": 0.4322, "step": 54800 }, { "epoch": 1.2199519230769231, "grad_norm": 0.7955455183982849, "learning_rate": 0.00025785836884870426, "loss": 0.5425, "step": 54810 }, { "epoch": 1.2201745014245013, "grad_norm": 0.801661491394043, "learning_rate": 0.00025781373856515426, "loss": 0.5512, "step": 54820 }, { "epoch": 1.2203970797720798, "grad_norm": 0.4858386516571045, "learning_rate": 0.0002577691051399584, "loss": 0.6204, "step": 54830 }, { "epoch": 1.220619658119658, "grad_norm": 0.7022749185562134, "learning_rate": 0.0002577244685755421, "loss": 0.5774, "step": 54840 }, { "epoch": 1.2208422364672364, "grad_norm": 0.6417854428291321, "learning_rate": 0.00025767982887433085, "loss": 0.4721, "step": 54850 }, { "epoch": 1.2210648148148149, "grad_norm": 0.7101449966430664, "learning_rate": 0.00025763518603875063, "loss": 0.5465, "step": 54860 }, { "epoch": 1.221287393162393, "grad_norm": 0.6686637997627258, "learning_rate": 0.00025759054007122703, "loss": 0.4875, "step": 54870 }, { "epoch": 1.2215099715099715, "grad_norm": 0.6626191139221191, "learning_rate": 0.00025754589097418644, "loss": 0.4817, "step": 54880 }, { "epoch": 1.22173254985755, "grad_norm": 0.8132066130638123, "learning_rate": 0.00025750123875005503, "loss": 0.5493, "step": 54890 }, { "epoch": 1.2219551282051282, "grad_norm": 0.5762627720832825, "learning_rate": 0.0002574565834012592, "loss": 0.6158, "step": 54900 }, { "epoch": 1.2221777065527066, "grad_norm": 0.4756952226161957, "learning_rate": 0.0002574119249302256, "loss": 0.5642, "step": 54910 }, { "epoch": 1.2224002849002849, "grad_norm": 0.6637765765190125, "learning_rate": 0.00025736726333938095, "loss": 0.5216, "step": 54920 }, { "epoch": 1.2226228632478633, "grad_norm": 0.7495138049125671, "learning_rate": 0.0002573225986311523, "loss": 0.6827, "step": 54930 }, { "epoch": 1.2228454415954415, "grad_norm": 0.7420310974121094, "learning_rate": 0.00025727793080796677, "loss": 0.683, "step": 54940 }, { "epoch": 1.22306801994302, "grad_norm": 0.5983907580375671, "learning_rate": 0.0002572332598722515, "loss": 0.5965, "step": 54950 }, { "epoch": 1.2232905982905984, "grad_norm": 0.8083733320236206, "learning_rate": 0.00025718858582643407, "loss": 0.5241, "step": 54960 }, { "epoch": 1.2235131766381766, "grad_norm": 0.630792498588562, "learning_rate": 0.0002571439086729421, "loss": 0.4769, "step": 54970 }, { "epoch": 1.223735754985755, "grad_norm": 0.4133424758911133, "learning_rate": 0.00025709922841420324, "loss": 0.6182, "step": 54980 }, { "epoch": 1.2239583333333333, "grad_norm": 0.6753107905387878, "learning_rate": 0.00025705454505264565, "loss": 0.488, "step": 54990 }, { "epoch": 1.2241809116809117, "grad_norm": 0.6498923897743225, "learning_rate": 0.0002570098585906974, "loss": 0.533, "step": 55000 }, { "epoch": 1.22440349002849, "grad_norm": 0.7844968438148499, "learning_rate": 0.0002569651690307867, "loss": 0.4815, "step": 55010 }, { "epoch": 1.2246260683760684, "grad_norm": 0.644562304019928, "learning_rate": 0.0002569204763753421, "loss": 0.5618, "step": 55020 }, { "epoch": 1.2248486467236468, "grad_norm": 0.342278391122818, "learning_rate": 0.00025687578062679226, "loss": 0.6154, "step": 55030 }, { "epoch": 1.225071225071225, "grad_norm": 0.3919949233531952, "learning_rate": 0.00025683108178756593, "loss": 0.5703, "step": 55040 }, { "epoch": 1.2252938034188035, "grad_norm": 0.655899167060852, "learning_rate": 0.00025678637986009206, "loss": 0.6265, "step": 55050 }, { "epoch": 1.225516381766382, "grad_norm": 0.5508276224136353, "learning_rate": 0.0002567416748467998, "loss": 0.51, "step": 55060 }, { "epoch": 1.22573896011396, "grad_norm": 0.5614111423492432, "learning_rate": 0.00025669696675011854, "loss": 0.692, "step": 55070 }, { "epoch": 1.2259615384615385, "grad_norm": 0.5933703184127808, "learning_rate": 0.00025665225557247763, "loss": 0.696, "step": 55080 }, { "epoch": 1.2261841168091168, "grad_norm": 0.9442933797836304, "learning_rate": 0.0002566075413163068, "loss": 0.4774, "step": 55090 }, { "epoch": 1.2264066951566952, "grad_norm": 0.45799508690834045, "learning_rate": 0.00025656282398403584, "loss": 0.5187, "step": 55100 }, { "epoch": 1.2266292735042734, "grad_norm": 1.2259610891342163, "learning_rate": 0.00025651810357809474, "loss": 0.5338, "step": 55110 }, { "epoch": 1.2268518518518519, "grad_norm": 0.6217261552810669, "learning_rate": 0.0002564733801009136, "loss": 0.569, "step": 55120 }, { "epoch": 1.2270744301994303, "grad_norm": 0.5604465007781982, "learning_rate": 0.00025642865355492275, "loss": 0.5324, "step": 55130 }, { "epoch": 1.2272970085470085, "grad_norm": 0.9130906462669373, "learning_rate": 0.0002563839239425527, "loss": 0.5891, "step": 55140 }, { "epoch": 1.227519586894587, "grad_norm": 0.42064061760902405, "learning_rate": 0.00025633919126623404, "loss": 0.6374, "step": 55150 }, { "epoch": 1.2277421652421652, "grad_norm": 0.6115346550941467, "learning_rate": 0.00025629445552839756, "loss": 0.5738, "step": 55160 }, { "epoch": 1.2279647435897436, "grad_norm": 0.7261988520622253, "learning_rate": 0.00025624971673147436, "loss": 0.5057, "step": 55170 }, { "epoch": 1.2281873219373218, "grad_norm": 0.6409322023391724, "learning_rate": 0.0002562049748778955, "loss": 0.6314, "step": 55180 }, { "epoch": 1.2284099002849003, "grad_norm": 0.7145477533340454, "learning_rate": 0.00025616022997009225, "loss": 0.6691, "step": 55190 }, { "epoch": 1.2286324786324787, "grad_norm": 0.8280013203620911, "learning_rate": 0.0002561154820104961, "loss": 0.5243, "step": 55200 }, { "epoch": 1.228855056980057, "grad_norm": 0.7874851822853088, "learning_rate": 0.0002560707310015388, "loss": 0.4837, "step": 55210 }, { "epoch": 1.2290776353276354, "grad_norm": 0.6673729419708252, "learning_rate": 0.00025602597694565204, "loss": 0.5322, "step": 55220 }, { "epoch": 1.2293002136752136, "grad_norm": 0.7367050647735596, "learning_rate": 0.0002559812198452678, "loss": 0.5932, "step": 55230 }, { "epoch": 1.229522792022792, "grad_norm": 0.7605116367340088, "learning_rate": 0.0002559364597028183, "loss": 0.6529, "step": 55240 }, { "epoch": 1.2297453703703705, "grad_norm": 0.7067446708679199, "learning_rate": 0.0002558916965207358, "loss": 0.5559, "step": 55250 }, { "epoch": 1.2299679487179487, "grad_norm": 0.7943964004516602, "learning_rate": 0.0002558469303014527, "loss": 0.6749, "step": 55260 }, { "epoch": 1.2301905270655271, "grad_norm": 0.7154000997543335, "learning_rate": 0.00025580216104740167, "loss": 0.5852, "step": 55270 }, { "epoch": 1.2304131054131053, "grad_norm": 0.7004664540290833, "learning_rate": 0.00025575738876101563, "loss": 0.6099, "step": 55280 }, { "epoch": 1.2306356837606838, "grad_norm": 0.37125012278556824, "learning_rate": 0.0002557126134447273, "loss": 0.5734, "step": 55290 }, { "epoch": 1.2308582621082622, "grad_norm": 0.6853071451187134, "learning_rate": 0.0002556678351009701, "loss": 0.5636, "step": 55300 }, { "epoch": 1.2310808404558404, "grad_norm": 0.5351976156234741, "learning_rate": 0.00025562305373217703, "loss": 0.5404, "step": 55310 }, { "epoch": 1.2313034188034189, "grad_norm": 0.6240566372871399, "learning_rate": 0.00025557826934078184, "loss": 0.6675, "step": 55320 }, { "epoch": 1.231525997150997, "grad_norm": 0.9049735069274902, "learning_rate": 0.00025553348192921784, "loss": 0.5755, "step": 55330 }, { "epoch": 1.2317485754985755, "grad_norm": 0.49391135573387146, "learning_rate": 0.000255488691499919, "loss": 0.5853, "step": 55340 }, { "epoch": 1.2319711538461537, "grad_norm": 0.6163642406463623, "learning_rate": 0.0002554438980553193, "loss": 0.5491, "step": 55350 }, { "epoch": 1.2321937321937322, "grad_norm": 0.7874521017074585, "learning_rate": 0.00025539910159785276, "loss": 0.5642, "step": 55360 }, { "epoch": 1.2324163105413106, "grad_norm": 0.7445282936096191, "learning_rate": 0.00025535430212995366, "loss": 0.5149, "step": 55370 }, { "epoch": 1.2326388888888888, "grad_norm": 0.7496173977851868, "learning_rate": 0.0002553094996540565, "loss": 0.6407, "step": 55380 }, { "epoch": 1.2328614672364673, "grad_norm": 0.7357646226882935, "learning_rate": 0.00025526469417259587, "loss": 0.5723, "step": 55390 }, { "epoch": 1.2330840455840455, "grad_norm": 0.6955065727233887, "learning_rate": 0.0002552198856880065, "loss": 0.5503, "step": 55400 }, { "epoch": 1.233306623931624, "grad_norm": 0.8007845878601074, "learning_rate": 0.0002551750742027233, "loss": 0.673, "step": 55410 }, { "epoch": 1.2335292022792024, "grad_norm": 0.5288559794425964, "learning_rate": 0.00025513025971918144, "loss": 0.4695, "step": 55420 }, { "epoch": 1.2337517806267806, "grad_norm": 0.4855462610721588, "learning_rate": 0.00025508544223981617, "loss": 0.4069, "step": 55430 }, { "epoch": 1.233974358974359, "grad_norm": 0.5603774189949036, "learning_rate": 0.0002550406217670628, "loss": 0.5772, "step": 55440 }, { "epoch": 1.2341969373219372, "grad_norm": 0.4866039454936981, "learning_rate": 0.000254995798303357, "loss": 0.586, "step": 55450 }, { "epoch": 1.2344195156695157, "grad_norm": 0.5705690383911133, "learning_rate": 0.0002549509718511345, "loss": 0.406, "step": 55460 }, { "epoch": 1.2346420940170941, "grad_norm": 1.1998240947723389, "learning_rate": 0.0002549061424128312, "loss": 0.5793, "step": 55470 }, { "epoch": 1.2348646723646723, "grad_norm": 0.8668375015258789, "learning_rate": 0.0002548613099908832, "loss": 0.5561, "step": 55480 }, { "epoch": 1.2350872507122508, "grad_norm": 0.5967520475387573, "learning_rate": 0.0002548164745877267, "loss": 0.5796, "step": 55490 }, { "epoch": 1.235309829059829, "grad_norm": 0.41339319944381714, "learning_rate": 0.00025477163620579816, "loss": 0.5141, "step": 55500 }, { "epoch": 1.2355324074074074, "grad_norm": 0.545566737651825, "learning_rate": 0.00025472679484753397, "loss": 0.5999, "step": 55510 }, { "epoch": 1.2357549857549857, "grad_norm": 0.5253483653068542, "learning_rate": 0.00025468195051537093, "loss": 0.6773, "step": 55520 }, { "epoch": 1.235977564102564, "grad_norm": 0.7213420271873474, "learning_rate": 0.000254637103211746, "loss": 0.5317, "step": 55530 }, { "epoch": 1.2362001424501425, "grad_norm": 0.7994476556777954, "learning_rate": 0.0002545922529390961, "loss": 0.4462, "step": 55540 }, { "epoch": 1.2364227207977208, "grad_norm": 0.4448365867137909, "learning_rate": 0.0002545473996998585, "loss": 0.5264, "step": 55550 }, { "epoch": 1.2366452991452992, "grad_norm": 0.7122238874435425, "learning_rate": 0.00025450254349647063, "loss": 0.5363, "step": 55560 }, { "epoch": 1.2368678774928774, "grad_norm": 0.5884889364242554, "learning_rate": 0.0002544576843313698, "loss": 0.4136, "step": 55570 }, { "epoch": 1.2370904558404558, "grad_norm": 0.6523982286453247, "learning_rate": 0.0002544128222069939, "loss": 0.7017, "step": 55580 }, { "epoch": 1.237313034188034, "grad_norm": 0.5850688219070435, "learning_rate": 0.0002543679571257807, "loss": 0.5455, "step": 55590 }, { "epoch": 1.2375356125356125, "grad_norm": 0.6737033724784851, "learning_rate": 0.00025432308909016817, "loss": 0.4931, "step": 55600 }, { "epoch": 1.237758190883191, "grad_norm": 0.4806579053401947, "learning_rate": 0.00025427821810259456, "loss": 0.4767, "step": 55610 }, { "epoch": 1.2379807692307692, "grad_norm": 0.7363060116767883, "learning_rate": 0.00025423334416549805, "loss": 0.5505, "step": 55620 }, { "epoch": 1.2382033475783476, "grad_norm": 0.662604570388794, "learning_rate": 0.00025418846728131735, "loss": 0.5909, "step": 55630 }, { "epoch": 1.238425925925926, "grad_norm": 0.5632050633430481, "learning_rate": 0.00025414358745249086, "loss": 0.5021, "step": 55640 }, { "epoch": 1.2386485042735043, "grad_norm": 0.5719519853591919, "learning_rate": 0.0002540987046814575, "loss": 0.4946, "step": 55650 }, { "epoch": 1.2388710826210827, "grad_norm": 0.6867715716362, "learning_rate": 0.00025405381897065633, "loss": 0.5261, "step": 55660 }, { "epoch": 1.239093660968661, "grad_norm": 0.7669585943222046, "learning_rate": 0.00025400893032252633, "loss": 0.5767, "step": 55670 }, { "epoch": 1.2393162393162394, "grad_norm": 0.48740777373313904, "learning_rate": 0.00025396403873950685, "loss": 0.6365, "step": 55680 }, { "epoch": 1.2395388176638176, "grad_norm": 0.5656322240829468, "learning_rate": 0.0002539191442240373, "loss": 0.5406, "step": 55690 }, { "epoch": 1.239761396011396, "grad_norm": 0.4301076829433441, "learning_rate": 0.0002538742467785574, "loss": 0.6131, "step": 55700 }, { "epoch": 1.2399839743589745, "grad_norm": 0.5691587924957275, "learning_rate": 0.0002538293464055068, "loss": 0.6222, "step": 55710 }, { "epoch": 1.2402065527065527, "grad_norm": 0.9512937664985657, "learning_rate": 0.00025378444310732536, "loss": 0.7261, "step": 55720 }, { "epoch": 1.240429131054131, "grad_norm": 0.8524298071861267, "learning_rate": 0.0002537395368864534, "loss": 0.5653, "step": 55730 }, { "epoch": 1.2406517094017093, "grad_norm": 0.6196438670158386, "learning_rate": 0.00025369462774533087, "loss": 0.629, "step": 55740 }, { "epoch": 1.2408742877492878, "grad_norm": 0.7953673601150513, "learning_rate": 0.0002536497156863983, "loss": 0.5801, "step": 55750 }, { "epoch": 1.241096866096866, "grad_norm": 0.6761536002159119, "learning_rate": 0.0002536048007120964, "loss": 0.5326, "step": 55760 }, { "epoch": 1.2413194444444444, "grad_norm": 1.063515067100525, "learning_rate": 0.00025355988282486566, "loss": 0.6407, "step": 55770 }, { "epoch": 1.2415420227920229, "grad_norm": 0.559059739112854, "learning_rate": 0.0002535149620271471, "loss": 0.5126, "step": 55780 }, { "epoch": 1.241764601139601, "grad_norm": 0.4550701081752777, "learning_rate": 0.0002534700383213816, "loss": 0.5939, "step": 55790 }, { "epoch": 1.2419871794871795, "grad_norm": 0.49273550510406494, "learning_rate": 0.0002534251117100105, "loss": 0.5026, "step": 55800 }, { "epoch": 1.242209757834758, "grad_norm": 0.8309057354927063, "learning_rate": 0.0002533801821954751, "loss": 0.4389, "step": 55810 }, { "epoch": 1.2424323361823362, "grad_norm": 0.8344371914863586, "learning_rate": 0.00025333524978021684, "loss": 0.6484, "step": 55820 }, { "epoch": 1.2426549145299146, "grad_norm": 0.5584569573402405, "learning_rate": 0.0002532903144666775, "loss": 0.6408, "step": 55830 }, { "epoch": 1.2428774928774928, "grad_norm": 0.4493742287158966, "learning_rate": 0.0002532453762572989, "loss": 0.6098, "step": 55840 }, { "epoch": 1.2431000712250713, "grad_norm": 0.623287558555603, "learning_rate": 0.00025320043515452285, "loss": 0.5406, "step": 55850 }, { "epoch": 1.2433226495726495, "grad_norm": 0.4659002423286438, "learning_rate": 0.00025315549116079164, "loss": 0.5666, "step": 55860 }, { "epoch": 1.243545227920228, "grad_norm": 1.6551035642623901, "learning_rate": 0.0002531105442785476, "loss": 0.7074, "step": 55870 }, { "epoch": 1.2437678062678064, "grad_norm": 0.6472983360290527, "learning_rate": 0.000253065594510233, "loss": 0.6331, "step": 55880 }, { "epoch": 1.2439903846153846, "grad_norm": 0.5087982416152954, "learning_rate": 0.00025302064185829065, "loss": 0.453, "step": 55890 }, { "epoch": 1.244212962962963, "grad_norm": 0.49544757604599, "learning_rate": 0.00025297568632516316, "loss": 0.456, "step": 55900 }, { "epoch": 1.2444355413105412, "grad_norm": 0.45202043652534485, "learning_rate": 0.0002529307279132935, "loss": 0.6218, "step": 55910 }, { "epoch": 1.2446581196581197, "grad_norm": 0.6582351922988892, "learning_rate": 0.00025288576662512477, "loss": 0.6431, "step": 55920 }, { "epoch": 1.244880698005698, "grad_norm": 0.5657109618186951, "learning_rate": 0.0002528408024631002, "loss": 0.511, "step": 55930 }, { "epoch": 1.2451032763532763, "grad_norm": 0.7638427019119263, "learning_rate": 0.00025279583542966316, "loss": 0.5974, "step": 55940 }, { "epoch": 1.2453258547008548, "grad_norm": 0.6323970556259155, "learning_rate": 0.00025275086552725717, "loss": 0.6025, "step": 55950 }, { "epoch": 1.245548433048433, "grad_norm": 0.5526149868965149, "learning_rate": 0.000252705892758326, "loss": 0.5352, "step": 55960 }, { "epoch": 1.2457710113960114, "grad_norm": 1.1778260469436646, "learning_rate": 0.00025266091712531345, "loss": 0.6064, "step": 55970 }, { "epoch": 1.2459935897435896, "grad_norm": 0.9252521991729736, "learning_rate": 0.00025261593863066357, "loss": 0.7074, "step": 55980 }, { "epoch": 1.246216168091168, "grad_norm": 0.5749571919441223, "learning_rate": 0.0002525709572768205, "loss": 0.5105, "step": 55990 }, { "epoch": 1.2464387464387465, "grad_norm": 0.6217884421348572, "learning_rate": 0.0002525259730662286, "loss": 0.5243, "step": 56000 }, { "epoch": 1.2466613247863247, "grad_norm": 0.6942879557609558, "learning_rate": 0.00025248098600133225, "loss": 0.6208, "step": 56010 }, { "epoch": 1.2468839031339032, "grad_norm": 0.597964882850647, "learning_rate": 0.0002524359960845763, "loss": 0.5515, "step": 56020 }, { "epoch": 1.2471064814814814, "grad_norm": 0.5471338033676147, "learning_rate": 0.00025239100331840526, "loss": 0.6837, "step": 56030 }, { "epoch": 1.2473290598290598, "grad_norm": 0.6313568353652954, "learning_rate": 0.00025234600770526424, "loss": 0.5684, "step": 56040 }, { "epoch": 1.2475516381766383, "grad_norm": 0.5859391689300537, "learning_rate": 0.00025230100924759837, "loss": 0.429, "step": 56050 }, { "epoch": 1.2477742165242165, "grad_norm": 0.5821884870529175, "learning_rate": 0.00025225600794785274, "loss": 0.4953, "step": 56060 }, { "epoch": 1.247996794871795, "grad_norm": 0.7767961621284485, "learning_rate": 0.00025221100380847287, "loss": 0.5567, "step": 56070 }, { "epoch": 1.2482193732193732, "grad_norm": 0.4788326025009155, "learning_rate": 0.00025216599683190445, "loss": 0.6074, "step": 56080 }, { "epoch": 1.2484419515669516, "grad_norm": 0.7091581225395203, "learning_rate": 0.00025212098702059296, "loss": 0.5637, "step": 56090 }, { "epoch": 1.2486645299145298, "grad_norm": 0.723284125328064, "learning_rate": 0.00025207597437698436, "loss": 0.4697, "step": 56100 }, { "epoch": 1.2488871082621082, "grad_norm": 0.6637664437294006, "learning_rate": 0.00025203095890352466, "loss": 0.5175, "step": 56110 }, { "epoch": 1.2491096866096867, "grad_norm": 0.5903345942497253, "learning_rate": 0.00025198594060266014, "loss": 0.6325, "step": 56120 }, { "epoch": 1.249332264957265, "grad_norm": 0.8338324427604675, "learning_rate": 0.00025194091947683693, "loss": 0.6398, "step": 56130 }, { "epoch": 1.2495548433048433, "grad_norm": 0.6326521039009094, "learning_rate": 0.0002518958955285017, "loss": 0.3617, "step": 56140 }, { "epoch": 1.2497774216524216, "grad_norm": 0.615234375, "learning_rate": 0.00025185086876010104, "loss": 0.7264, "step": 56150 }, { "epoch": 1.25, "grad_norm": 0.8148561716079712, "learning_rate": 0.00025180583917408175, "loss": 0.6266, "step": 56160 }, { "epoch": 1.2502225783475782, "grad_norm": 0.5812483429908752, "learning_rate": 0.0002517608067728907, "loss": 0.49, "step": 56170 }, { "epoch": 1.2504451566951567, "grad_norm": 0.3880385458469391, "learning_rate": 0.00025171577155897503, "loss": 0.5487, "step": 56180 }, { "epoch": 1.250667735042735, "grad_norm": 0.5918259024620056, "learning_rate": 0.0002516707335347821, "loss": 0.4947, "step": 56190 }, { "epoch": 1.2508903133903133, "grad_norm": 0.6482241749763489, "learning_rate": 0.0002516256927027591, "loss": 0.6409, "step": 56200 }, { "epoch": 1.2511128917378918, "grad_norm": 0.6706924438476562, "learning_rate": 0.0002515806490653537, "loss": 0.6098, "step": 56210 }, { "epoch": 1.2513354700854702, "grad_norm": 0.6281888484954834, "learning_rate": 0.00025153560262501363, "loss": 0.5878, "step": 56220 }, { "epoch": 1.2515580484330484, "grad_norm": 0.8230198621749878, "learning_rate": 0.0002514905533841867, "loss": 0.5487, "step": 56230 }, { "epoch": 1.2517806267806268, "grad_norm": 0.6760601997375488, "learning_rate": 0.000251445501345321, "loss": 0.6127, "step": 56240 }, { "epoch": 1.252003205128205, "grad_norm": 0.6492635607719421, "learning_rate": 0.00025140044651086456, "loss": 0.5875, "step": 56250 }, { "epoch": 1.2522257834757835, "grad_norm": 0.6327205300331116, "learning_rate": 0.00025135538888326585, "loss": 0.5046, "step": 56260 }, { "epoch": 1.2524483618233617, "grad_norm": 0.7743632793426514, "learning_rate": 0.00025131032846497324, "loss": 0.5061, "step": 56270 }, { "epoch": 1.2526709401709402, "grad_norm": 0.5469549298286438, "learning_rate": 0.0002512652652584354, "loss": 0.5637, "step": 56280 }, { "epoch": 1.2528935185185186, "grad_norm": 0.584494948387146, "learning_rate": 0.00025122019926610104, "loss": 0.4669, "step": 56290 }, { "epoch": 1.2531160968660968, "grad_norm": 0.614250659942627, "learning_rate": 0.00025117513049041916, "loss": 0.7461, "step": 56300 }, { "epoch": 1.2533386752136753, "grad_norm": 0.6992598176002502, "learning_rate": 0.0002511300589338388, "loss": 0.6399, "step": 56310 }, { "epoch": 1.2535612535612537, "grad_norm": 0.7041775584220886, "learning_rate": 0.0002510849845988091, "loss": 0.5378, "step": 56320 }, { "epoch": 1.253783831908832, "grad_norm": 0.7931853532791138, "learning_rate": 0.00025103990748777963, "loss": 0.5429, "step": 56330 }, { "epoch": 1.2540064102564101, "grad_norm": 0.7894284725189209, "learning_rate": 0.0002509948276031997, "loss": 0.494, "step": 56340 }, { "epoch": 1.2542289886039886, "grad_norm": 0.4571925103664398, "learning_rate": 0.00025094974494751913, "loss": 0.5855, "step": 56350 }, { "epoch": 1.254451566951567, "grad_norm": 0.6734161376953125, "learning_rate": 0.0002509046595231877, "loss": 0.6186, "step": 56360 }, { "epoch": 1.2546741452991452, "grad_norm": 0.543872594833374, "learning_rate": 0.0002508595713326555, "loss": 0.492, "step": 56370 }, { "epoch": 1.2548967236467237, "grad_norm": 0.680564820766449, "learning_rate": 0.0002508144803783724, "loss": 0.5019, "step": 56380 }, { "epoch": 1.255119301994302, "grad_norm": 0.5233339071273804, "learning_rate": 0.00025076938666278894, "loss": 0.4705, "step": 56390 }, { "epoch": 1.2553418803418803, "grad_norm": 0.6287051439285278, "learning_rate": 0.00025072429018835546, "loss": 0.5671, "step": 56400 }, { "epoch": 1.2555644586894588, "grad_norm": 0.6765820980072021, "learning_rate": 0.00025067919095752244, "loss": 0.619, "step": 56410 }, { "epoch": 1.255787037037037, "grad_norm": 0.4503667950630188, "learning_rate": 0.00025063408897274075, "loss": 0.4486, "step": 56420 }, { "epoch": 1.2560096153846154, "grad_norm": 0.7574901580810547, "learning_rate": 0.00025058898423646115, "loss": 0.5588, "step": 56430 }, { "epoch": 1.2562321937321936, "grad_norm": 0.855404257774353, "learning_rate": 0.00025054387675113484, "loss": 0.4947, "step": 56440 }, { "epoch": 1.256454772079772, "grad_norm": 0.6277042627334595, "learning_rate": 0.00025049876651921283, "loss": 0.6169, "step": 56450 }, { "epoch": 1.2566773504273505, "grad_norm": 0.7483381628990173, "learning_rate": 0.00025045365354314656, "loss": 0.7175, "step": 56460 }, { "epoch": 1.2568999287749287, "grad_norm": 0.5631914138793945, "learning_rate": 0.00025040853782538734, "loss": 0.5895, "step": 56470 }, { "epoch": 1.2571225071225072, "grad_norm": 0.5679247975349426, "learning_rate": 0.00025036341936838705, "loss": 0.4367, "step": 56480 }, { "epoch": 1.2573450854700854, "grad_norm": 0.5473216772079468, "learning_rate": 0.00025031829817459723, "loss": 0.5431, "step": 56490 }, { "epoch": 1.2575676638176638, "grad_norm": 0.4608478546142578, "learning_rate": 0.0002502731742464699, "loss": 0.5678, "step": 56500 }, { "epoch": 1.257790242165242, "grad_norm": 0.726207971572876, "learning_rate": 0.00025022804758645714, "loss": 0.6393, "step": 56510 }, { "epoch": 1.2580128205128205, "grad_norm": 1.0481374263763428, "learning_rate": 0.00025018291819701115, "loss": 0.5899, "step": 56520 }, { "epoch": 1.258235398860399, "grad_norm": 0.6271913647651672, "learning_rate": 0.0002501377860805843, "loss": 0.5223, "step": 56530 }, { "epoch": 1.2584579772079771, "grad_norm": 0.626106321811676, "learning_rate": 0.00025009265123962916, "loss": 0.5491, "step": 56540 }, { "epoch": 1.2586805555555556, "grad_norm": 0.48963356018066406, "learning_rate": 0.0002500475136765983, "loss": 0.5274, "step": 56550 }, { "epoch": 1.258903133903134, "grad_norm": 0.4977332055568695, "learning_rate": 0.0002500023733939446, "loss": 0.4912, "step": 56560 }, { "epoch": 1.2591257122507122, "grad_norm": 0.7764100432395935, "learning_rate": 0.0002499572303941209, "loss": 0.7082, "step": 56570 }, { "epoch": 1.2593482905982907, "grad_norm": 0.6956349015235901, "learning_rate": 0.00024991208467958054, "loss": 0.6231, "step": 56580 }, { "epoch": 1.259570868945869, "grad_norm": 0.7979551553726196, "learning_rate": 0.00024986693625277654, "loss": 0.5122, "step": 56590 }, { "epoch": 1.2597934472934473, "grad_norm": 0.47603729367256165, "learning_rate": 0.0002498217851161624, "loss": 0.436, "step": 56600 }, { "epoch": 1.2600160256410255, "grad_norm": 0.7334918975830078, "learning_rate": 0.00024977663127219175, "loss": 0.6345, "step": 56610 }, { "epoch": 1.2601495726495726, "eval_loss": 0.5836193561553955, "eval_runtime": 337.4574, "eval_samples_per_second": 7.008, "eval_steps_per_second": 7.008, "step": 56616 }, { "epoch": 1.260238603988604, "grad_norm": 0.6111765503883362, "learning_rate": 0.0002497314747233182, "loss": 0.6062, "step": 56620 }, { "epoch": 1.2604611823361824, "grad_norm": 0.4151182174682617, "learning_rate": 0.0002496863154719955, "loss": 0.4476, "step": 56630 }, { "epoch": 1.2606837606837606, "grad_norm": 0.7085245251655579, "learning_rate": 0.0002496411535206778, "loss": 0.5815, "step": 56640 }, { "epoch": 1.260906339031339, "grad_norm": 0.5205219388008118, "learning_rate": 0.00024959598887181925, "loss": 0.5473, "step": 56650 }, { "epoch": 1.2611289173789173, "grad_norm": 0.7376794815063477, "learning_rate": 0.000249550821527874, "loss": 0.5397, "step": 56660 }, { "epoch": 1.2613514957264957, "grad_norm": 0.4363914132118225, "learning_rate": 0.00024950565149129653, "loss": 0.5169, "step": 56670 }, { "epoch": 1.261574074074074, "grad_norm": 0.6825656890869141, "learning_rate": 0.0002494604787645415, "loss": 0.5345, "step": 56680 }, { "epoch": 1.2617966524216524, "grad_norm": 0.6744056344032288, "learning_rate": 0.00024941530335006345, "loss": 0.682, "step": 56690 }, { "epoch": 1.2620192307692308, "grad_norm": 0.6776096820831299, "learning_rate": 0.00024937012525031745, "loss": 0.5671, "step": 56700 }, { "epoch": 1.262241809116809, "grad_norm": 0.5862076282501221, "learning_rate": 0.0002493249444677584, "loss": 0.5864, "step": 56710 }, { "epoch": 1.2624643874643875, "grad_norm": 0.6812041401863098, "learning_rate": 0.0002492797610048415, "loss": 0.6633, "step": 56720 }, { "epoch": 1.262686965811966, "grad_norm": 0.610002875328064, "learning_rate": 0.000249234574864022, "loss": 0.5861, "step": 56730 }, { "epoch": 1.2629095441595442, "grad_norm": 0.4847986102104187, "learning_rate": 0.0002491893860477554, "loss": 0.5637, "step": 56740 }, { "epoch": 1.2631321225071226, "grad_norm": 0.641619861125946, "learning_rate": 0.0002491441945584974, "loss": 0.6276, "step": 56750 }, { "epoch": 1.2633547008547008, "grad_norm": 0.6451306343078613, "learning_rate": 0.00024909900039870355, "loss": 0.6559, "step": 56760 }, { "epoch": 1.2635772792022792, "grad_norm": 0.8267781138420105, "learning_rate": 0.00024905380357082983, "loss": 0.5444, "step": 56770 }, { "epoch": 1.2637998575498575, "grad_norm": 0.6346902847290039, "learning_rate": 0.00024900860407733226, "loss": 0.5551, "step": 56780 }, { "epoch": 1.264022435897436, "grad_norm": 0.8560341596603394, "learning_rate": 0.00024896340192066704, "loss": 0.4665, "step": 56790 }, { "epoch": 1.2642450142450143, "grad_norm": 0.7849281430244446, "learning_rate": 0.0002489181971032905, "loss": 0.5678, "step": 56800 }, { "epoch": 1.2644675925925926, "grad_norm": 0.5926291942596436, "learning_rate": 0.00024887298962765903, "loss": 0.5712, "step": 56810 }, { "epoch": 1.264690170940171, "grad_norm": 0.5794327855110168, "learning_rate": 0.0002488277794962293, "loss": 0.567, "step": 56820 }, { "epoch": 1.2649127492877492, "grad_norm": 0.5844407081604004, "learning_rate": 0.0002487825667114581, "loss": 0.4536, "step": 56830 }, { "epoch": 1.2651353276353277, "grad_norm": 0.7397210597991943, "learning_rate": 0.00024873735127580224, "loss": 0.5212, "step": 56840 }, { "epoch": 1.2653579059829059, "grad_norm": 0.7658064365386963, "learning_rate": 0.0002486921331917189, "loss": 0.681, "step": 56850 }, { "epoch": 1.2655804843304843, "grad_norm": 0.6665608286857605, "learning_rate": 0.0002486469124616651, "loss": 0.5455, "step": 56860 }, { "epoch": 1.2658030626780628, "grad_norm": 1.3184243440628052, "learning_rate": 0.00024860168908809826, "loss": 0.6189, "step": 56870 }, { "epoch": 1.266025641025641, "grad_norm": 0.4860929548740387, "learning_rate": 0.00024855646307347587, "loss": 0.4443, "step": 56880 }, { "epoch": 1.2662482193732194, "grad_norm": 0.6908466815948486, "learning_rate": 0.0002485112344202555, "loss": 0.7412, "step": 56890 }, { "epoch": 1.2664707977207978, "grad_norm": 0.4728986620903015, "learning_rate": 0.000248466003130895, "loss": 0.6416, "step": 56900 }, { "epoch": 1.266693376068376, "grad_norm": 0.5925902128219604, "learning_rate": 0.00024842076920785215, "loss": 0.5215, "step": 56910 }, { "epoch": 1.2669159544159543, "grad_norm": 0.5110843777656555, "learning_rate": 0.0002483755326535851, "loss": 0.6172, "step": 56920 }, { "epoch": 1.2671385327635327, "grad_norm": 0.795608401298523, "learning_rate": 0.000248330293470552, "loss": 0.5711, "step": 56930 }, { "epoch": 1.2673611111111112, "grad_norm": 0.9751588702201843, "learning_rate": 0.00024828505166121117, "loss": 0.5147, "step": 56940 }, { "epoch": 1.2675836894586894, "grad_norm": 0.6915881037712097, "learning_rate": 0.0002482398072280211, "loss": 0.6626, "step": 56950 }, { "epoch": 1.2678062678062678, "grad_norm": 0.5895326137542725, "learning_rate": 0.00024819456017344043, "loss": 0.5313, "step": 56960 }, { "epoch": 1.2680288461538463, "grad_norm": 0.6742547750473022, "learning_rate": 0.00024814931049992793, "loss": 0.6308, "step": 56970 }, { "epoch": 1.2682514245014245, "grad_norm": 0.976432204246521, "learning_rate": 0.0002481040582099424, "loss": 0.5129, "step": 56980 }, { "epoch": 1.268474002849003, "grad_norm": 0.6051344275474548, "learning_rate": 0.000248058803305943, "loss": 0.4686, "step": 56990 }, { "epoch": 1.2686965811965811, "grad_norm": 0.47400906682014465, "learning_rate": 0.00024801354579038896, "loss": 0.6147, "step": 57000 }, { "epoch": 1.2689191595441596, "grad_norm": 0.7733394503593445, "learning_rate": 0.0002479682856657395, "loss": 0.534, "step": 57010 }, { "epoch": 1.2691417378917378, "grad_norm": 0.7045361995697021, "learning_rate": 0.0002479230229344541, "loss": 0.6043, "step": 57020 }, { "epoch": 1.2693643162393162, "grad_norm": 0.6193640232086182, "learning_rate": 0.0002478777575989924, "loss": 0.6197, "step": 57030 }, { "epoch": 1.2695868945868947, "grad_norm": 0.5314254760742188, "learning_rate": 0.00024783248966181416, "loss": 0.4338, "step": 57040 }, { "epoch": 1.2698094729344729, "grad_norm": 0.5635586977005005, "learning_rate": 0.00024778721912537926, "loss": 0.5019, "step": 57050 }, { "epoch": 1.2700320512820513, "grad_norm": 0.573419988155365, "learning_rate": 0.0002477419459921478, "loss": 0.5775, "step": 57060 }, { "epoch": 1.2702546296296298, "grad_norm": 0.6648149490356445, "learning_rate": 0.00024769667026457994, "loss": 0.6239, "step": 57070 }, { "epoch": 1.270477207977208, "grad_norm": 0.6661327481269836, "learning_rate": 0.0002476513919451359, "loss": 0.5564, "step": 57080 }, { "epoch": 1.2706997863247862, "grad_norm": 0.5547678470611572, "learning_rate": 0.0002476061110362762, "loss": 0.6744, "step": 57090 }, { "epoch": 1.2709223646723646, "grad_norm": 0.5434178709983826, "learning_rate": 0.0002475608275404615, "loss": 0.4616, "step": 57100 }, { "epoch": 1.271144943019943, "grad_norm": 0.45356640219688416, "learning_rate": 0.0002475155414601525, "loss": 0.5319, "step": 57110 }, { "epoch": 1.2713675213675213, "grad_norm": 0.7899428606033325, "learning_rate": 0.00024747025279781004, "loss": 0.5353, "step": 57120 }, { "epoch": 1.2715900997150997, "grad_norm": 0.6958609223365784, "learning_rate": 0.0002474249615558951, "loss": 0.5495, "step": 57130 }, { "epoch": 1.2718126780626782, "grad_norm": 0.579924464225769, "learning_rate": 0.00024737966773686915, "loss": 0.5014, "step": 57140 }, { "epoch": 1.2720352564102564, "grad_norm": 0.7924838066101074, "learning_rate": 0.0002473343713431931, "loss": 0.6105, "step": 57150 }, { "epoch": 1.2722578347578348, "grad_norm": 0.9225060343742371, "learning_rate": 0.0002472890723773286, "loss": 0.6499, "step": 57160 }, { "epoch": 1.272480413105413, "grad_norm": 0.8588963747024536, "learning_rate": 0.00024724377084173725, "loss": 0.5493, "step": 57170 }, { "epoch": 1.2727029914529915, "grad_norm": 0.6969565749168396, "learning_rate": 0.00024719846673888063, "loss": 0.5307, "step": 57180 }, { "epoch": 1.2729255698005697, "grad_norm": 0.5771629810333252, "learning_rate": 0.0002471531600712207, "loss": 0.5566, "step": 57190 }, { "epoch": 1.2731481481481481, "grad_norm": 0.6116702556610107, "learning_rate": 0.0002471078508412195, "loss": 0.5853, "step": 57200 }, { "epoch": 1.2733707264957266, "grad_norm": 0.9617108106613159, "learning_rate": 0.00024706253905133914, "loss": 0.6215, "step": 57210 }, { "epoch": 1.2735933048433048, "grad_norm": 0.4409613013267517, "learning_rate": 0.0002470172247040418, "loss": 0.6077, "step": 57220 }, { "epoch": 1.2738158831908832, "grad_norm": 0.6302945613861084, "learning_rate": 0.00024697190780179003, "loss": 0.6608, "step": 57230 }, { "epoch": 1.2740384615384617, "grad_norm": 0.5273117423057556, "learning_rate": 0.00024692658834704633, "loss": 0.5804, "step": 57240 }, { "epoch": 1.27426103988604, "grad_norm": 0.5631963610649109, "learning_rate": 0.0002468812663422734, "loss": 0.6048, "step": 57250 }, { "epoch": 1.274483618233618, "grad_norm": 0.5829416513442993, "learning_rate": 0.00024683594178993406, "loss": 0.605, "step": 57260 }, { "epoch": 1.2747061965811965, "grad_norm": 0.7434841394424438, "learning_rate": 0.00024679061469249134, "loss": 0.5767, "step": 57270 }, { "epoch": 1.274928774928775, "grad_norm": 0.6747451424598694, "learning_rate": 0.0002467452850524083, "loss": 0.6182, "step": 57280 }, { "epoch": 1.2751513532763532, "grad_norm": 0.6527425050735474, "learning_rate": 0.0002466999528721482, "loss": 0.6533, "step": 57290 }, { "epoch": 1.2753739316239316, "grad_norm": 0.577849805355072, "learning_rate": 0.0002466546181541744, "loss": 0.7178, "step": 57300 }, { "epoch": 1.27559650997151, "grad_norm": 0.5812302231788635, "learning_rate": 0.0002466092809009505, "loss": 0.6465, "step": 57310 }, { "epoch": 1.2758190883190883, "grad_norm": 0.6347759366035461, "learning_rate": 0.0002465639411149401, "loss": 0.6204, "step": 57320 }, { "epoch": 1.2760416666666667, "grad_norm": 0.49303147196769714, "learning_rate": 0.000246518598798607, "loss": 0.4739, "step": 57330 }, { "epoch": 1.276264245014245, "grad_norm": 0.6746029853820801, "learning_rate": 0.0002464732539544152, "loss": 0.5648, "step": 57340 }, { "epoch": 1.2764868233618234, "grad_norm": 0.8490906953811646, "learning_rate": 0.0002464279065848287, "loss": 0.5943, "step": 57350 }, { "epoch": 1.2767094017094016, "grad_norm": 1.0166454315185547, "learning_rate": 0.0002463825566923118, "loss": 0.564, "step": 57360 }, { "epoch": 1.27693198005698, "grad_norm": 0.8642008304595947, "learning_rate": 0.00024633720427932876, "loss": 0.6205, "step": 57370 }, { "epoch": 1.2771545584045585, "grad_norm": 0.7879709005355835, "learning_rate": 0.0002462918493483441, "loss": 0.5458, "step": 57380 }, { "epoch": 1.2773771367521367, "grad_norm": 1.0417356491088867, "learning_rate": 0.00024624649190182243, "loss": 0.6538, "step": 57390 }, { "epoch": 1.2775997150997151, "grad_norm": 0.517301619052887, "learning_rate": 0.0002462011319422286, "loss": 0.4757, "step": 57400 }, { "epoch": 1.2778222934472934, "grad_norm": 0.7360623478889465, "learning_rate": 0.0002461557694720274, "loss": 0.4645, "step": 57410 }, { "epoch": 1.2780448717948718, "grad_norm": 0.5814298391342163, "learning_rate": 0.0002461104044936839, "loss": 0.5377, "step": 57420 }, { "epoch": 1.27826745014245, "grad_norm": 0.6260385513305664, "learning_rate": 0.0002460650370096633, "loss": 0.4785, "step": 57430 }, { "epoch": 1.2784900284900285, "grad_norm": 0.5605562925338745, "learning_rate": 0.0002460196670224308, "loss": 0.7021, "step": 57440 }, { "epoch": 1.278712606837607, "grad_norm": 0.9051653742790222, "learning_rate": 0.000245974294534452, "loss": 0.509, "step": 57450 }, { "epoch": 1.2789351851851851, "grad_norm": 0.5881842970848083, "learning_rate": 0.0002459289195481924, "loss": 0.5765, "step": 57460 }, { "epoch": 1.2791577635327636, "grad_norm": 0.5452075004577637, "learning_rate": 0.0002458835420661177, "loss": 0.7318, "step": 57470 }, { "epoch": 1.279380341880342, "grad_norm": 0.3864049017429352, "learning_rate": 0.0002458381620906937, "loss": 0.4592, "step": 57480 }, { "epoch": 1.2796029202279202, "grad_norm": 0.6464360952377319, "learning_rate": 0.0002457927796243865, "loss": 0.6142, "step": 57490 }, { "epoch": 1.2798254985754987, "grad_norm": 0.680077850818634, "learning_rate": 0.0002457473946696621, "loss": 0.6157, "step": 57500 }, { "epoch": 1.2800480769230769, "grad_norm": 0.6645841002464294, "learning_rate": 0.0002457020072289869, "loss": 0.631, "step": 57510 }, { "epoch": 1.2802706552706553, "grad_norm": 1.004813313484192, "learning_rate": 0.00024565661730482723, "loss": 0.4855, "step": 57520 }, { "epoch": 1.2804932336182335, "grad_norm": 0.743014931678772, "learning_rate": 0.0002456112248996496, "loss": 0.4654, "step": 57530 }, { "epoch": 1.280715811965812, "grad_norm": 0.5669029951095581, "learning_rate": 0.00024556583001592063, "loss": 0.548, "step": 57540 }, { "epoch": 1.2809383903133904, "grad_norm": 0.425339937210083, "learning_rate": 0.0002455204326561071, "loss": 0.4507, "step": 57550 }, { "epoch": 1.2811609686609686, "grad_norm": 0.49417296051979065, "learning_rate": 0.0002454750328226761, "loss": 0.5089, "step": 57560 }, { "epoch": 1.281383547008547, "grad_norm": 0.5269678831100464, "learning_rate": 0.0002454296305180945, "loss": 0.5332, "step": 57570 }, { "epoch": 1.2816061253561253, "grad_norm": 0.44602254033088684, "learning_rate": 0.00024538422574482964, "loss": 0.4913, "step": 57580 }, { "epoch": 1.2818287037037037, "grad_norm": 0.4153461754322052, "learning_rate": 0.00024533881850534884, "loss": 0.6558, "step": 57590 }, { "epoch": 1.282051282051282, "grad_norm": 0.9092263579368591, "learning_rate": 0.0002452934088021195, "loss": 0.5675, "step": 57600 }, { "epoch": 1.2822738603988604, "grad_norm": 0.450931191444397, "learning_rate": 0.0002452479966376092, "loss": 0.595, "step": 57610 }, { "epoch": 1.2824964387464388, "grad_norm": 0.7668671011924744, "learning_rate": 0.0002452025820142857, "loss": 0.6489, "step": 57620 }, { "epoch": 1.282719017094017, "grad_norm": 0.6127539873123169, "learning_rate": 0.000245157164934617, "loss": 0.5202, "step": 57630 }, { "epoch": 1.2829415954415955, "grad_norm": 0.7740516662597656, "learning_rate": 0.0002451117454010709, "loss": 0.535, "step": 57640 }, { "epoch": 1.283164173789174, "grad_norm": 0.7097222208976746, "learning_rate": 0.0002450663234161156, "loss": 0.6147, "step": 57650 }, { "epoch": 1.2833867521367521, "grad_norm": 0.5709938406944275, "learning_rate": 0.0002450208989822195, "loss": 0.5674, "step": 57660 }, { "epoch": 1.2836093304843303, "grad_norm": 0.668928325176239, "learning_rate": 0.00024497547210185086, "loss": 0.522, "step": 57670 }, { "epoch": 1.2838319088319088, "grad_norm": 0.590245246887207, "learning_rate": 0.0002449300427774782, "loss": 0.5693, "step": 57680 }, { "epoch": 1.2840544871794872, "grad_norm": 0.7981787919998169, "learning_rate": 0.00024488461101157023, "loss": 0.556, "step": 57690 }, { "epoch": 1.2842770655270654, "grad_norm": 0.6494070887565613, "learning_rate": 0.0002448391768065958, "loss": 0.4989, "step": 57700 }, { "epoch": 1.2844996438746439, "grad_norm": 0.9221472144126892, "learning_rate": 0.00024479374016502377, "loss": 0.5186, "step": 57710 }, { "epoch": 1.2847222222222223, "grad_norm": 0.5488528609275818, "learning_rate": 0.0002447483010893232, "loss": 0.6221, "step": 57720 }, { "epoch": 1.2849448005698005, "grad_norm": 0.5177143216133118, "learning_rate": 0.0002447028595819634, "loss": 0.4592, "step": 57730 }, { "epoch": 1.285167378917379, "grad_norm": 0.5516415238380432, "learning_rate": 0.0002446574156454136, "loss": 0.5079, "step": 57740 }, { "epoch": 1.2853899572649572, "grad_norm": 0.4461617171764374, "learning_rate": 0.0002446119692821432, "loss": 0.4729, "step": 57750 }, { "epoch": 1.2856125356125356, "grad_norm": 0.6107337474822998, "learning_rate": 0.00024456652049462195, "loss": 0.5218, "step": 57760 }, { "epoch": 1.2858351139601139, "grad_norm": 0.6461668610572815, "learning_rate": 0.00024452106928531944, "loss": 0.5012, "step": 57770 }, { "epoch": 1.2860576923076923, "grad_norm": 0.7621982097625732, "learning_rate": 0.0002444756156567056, "loss": 0.6168, "step": 57780 }, { "epoch": 1.2862802706552707, "grad_norm": 0.4369259178638458, "learning_rate": 0.0002444301596112504, "loss": 0.5539, "step": 57790 }, { "epoch": 1.286502849002849, "grad_norm": 0.6202419996261597, "learning_rate": 0.00024438470115142386, "loss": 0.5763, "step": 57800 }, { "epoch": 1.2867254273504274, "grad_norm": 0.4778003394603729, "learning_rate": 0.00024433924027969647, "loss": 0.5908, "step": 57810 }, { "epoch": 1.2869480056980058, "grad_norm": 0.9239721298217773, "learning_rate": 0.00024429377699853835, "loss": 0.5821, "step": 57820 }, { "epoch": 1.287170584045584, "grad_norm": 0.7348893284797668, "learning_rate": 0.0002442483113104202, "loss": 0.53, "step": 57830 }, { "epoch": 1.2873931623931623, "grad_norm": 0.4809786379337311, "learning_rate": 0.0002442028432178126, "loss": 0.6144, "step": 57840 }, { "epoch": 1.2876157407407407, "grad_norm": 0.493743896484375, "learning_rate": 0.00024415737272318625, "loss": 0.5849, "step": 57850 }, { "epoch": 1.2878383190883191, "grad_norm": 0.818612277507782, "learning_rate": 0.00024411189982901217, "loss": 0.6053, "step": 57860 }, { "epoch": 1.2880608974358974, "grad_norm": 0.6896690726280212, "learning_rate": 0.00024406642453776129, "loss": 0.5601, "step": 57870 }, { "epoch": 1.2882834757834758, "grad_norm": 0.5468393564224243, "learning_rate": 0.0002440209468519049, "loss": 0.4614, "step": 57880 }, { "epoch": 1.2885060541310542, "grad_norm": 0.5655200481414795, "learning_rate": 0.00024397546677391415, "loss": 0.5631, "step": 57890 }, { "epoch": 1.2887286324786325, "grad_norm": 0.3707646429538727, "learning_rate": 0.00024392998430626056, "loss": 0.5412, "step": 57900 }, { "epoch": 1.288951210826211, "grad_norm": 0.6031479835510254, "learning_rate": 0.0002438844994514157, "loss": 0.6383, "step": 57910 }, { "epoch": 1.289173789173789, "grad_norm": 0.6139857172966003, "learning_rate": 0.00024383901221185114, "loss": 0.6776, "step": 57920 }, { "epoch": 1.2893963675213675, "grad_norm": 0.7917313575744629, "learning_rate": 0.00024379352259003883, "loss": 0.6386, "step": 57930 }, { "epoch": 1.2896189458689458, "grad_norm": 0.5444529056549072, "learning_rate": 0.00024374803058845062, "loss": 0.6352, "step": 57940 }, { "epoch": 1.2898415242165242, "grad_norm": 0.5975114107131958, "learning_rate": 0.00024370253620955863, "loss": 0.6732, "step": 57950 }, { "epoch": 1.2900641025641026, "grad_norm": 0.6343320608139038, "learning_rate": 0.00024365703945583502, "loss": 0.6364, "step": 57960 }, { "epoch": 1.2902866809116809, "grad_norm": 0.7224991321563721, "learning_rate": 0.00024361154032975218, "loss": 0.6188, "step": 57970 }, { "epoch": 1.2905092592592593, "grad_norm": 0.6302068829536438, "learning_rate": 0.0002435660388337825, "loss": 0.6218, "step": 57980 }, { "epoch": 1.2907318376068377, "grad_norm": 0.971505880355835, "learning_rate": 0.00024352053497039865, "loss": 0.5681, "step": 57990 }, { "epoch": 1.290954415954416, "grad_norm": 1.0870121717453003, "learning_rate": 0.00024347502874207328, "loss": 0.4242, "step": 58000 }, { "epoch": 1.2911769943019942, "grad_norm": 0.7964351177215576, "learning_rate": 0.00024342952015127926, "loss": 0.6279, "step": 58010 }, { "epoch": 1.2913995726495726, "grad_norm": 0.5268858075141907, "learning_rate": 0.00024338400920048955, "loss": 0.6166, "step": 58020 }, { "epoch": 1.291622150997151, "grad_norm": 0.6330119967460632, "learning_rate": 0.00024333849589217726, "loss": 0.5551, "step": 58030 }, { "epoch": 1.2918447293447293, "grad_norm": 0.6858647465705872, "learning_rate": 0.0002432929802288156, "loss": 0.5315, "step": 58040 }, { "epoch": 1.2920673076923077, "grad_norm": 0.347758412361145, "learning_rate": 0.000243247462212878, "loss": 0.4285, "step": 58050 }, { "epoch": 1.2922898860398861, "grad_norm": 0.531583845615387, "learning_rate": 0.00024320194184683795, "loss": 0.7146, "step": 58060 }, { "epoch": 1.2925124643874644, "grad_norm": 0.613280177116394, "learning_rate": 0.00024315641913316891, "loss": 0.6505, "step": 58070 }, { "epoch": 1.2927350427350428, "grad_norm": 0.7552340030670166, "learning_rate": 0.00024311089407434477, "loss": 0.6316, "step": 58080 }, { "epoch": 1.292957621082621, "grad_norm": 0.6747114658355713, "learning_rate": 0.00024306536667283938, "loss": 0.7003, "step": 58090 }, { "epoch": 1.2931801994301995, "grad_norm": 0.5359660387039185, "learning_rate": 0.00024301983693112664, "loss": 0.5668, "step": 58100 }, { "epoch": 1.2934027777777777, "grad_norm": 0.969078540802002, "learning_rate": 0.00024297430485168079, "loss": 0.5855, "step": 58110 }, { "epoch": 1.2936253561253561, "grad_norm": 0.7394294738769531, "learning_rate": 0.00024292877043697605, "loss": 0.5572, "step": 58120 }, { "epoch": 1.2938479344729346, "grad_norm": 0.7185589671134949, "learning_rate": 0.00024288323368948676, "loss": 0.5608, "step": 58130 }, { "epoch": 1.2940705128205128, "grad_norm": 0.7642703056335449, "learning_rate": 0.00024283769461168743, "loss": 0.7009, "step": 58140 }, { "epoch": 1.2942930911680912, "grad_norm": 0.7779717445373535, "learning_rate": 0.00024279215320605272, "loss": 0.516, "step": 58150 }, { "epoch": 1.2945156695156697, "grad_norm": 0.6956257224082947, "learning_rate": 0.0002427466094750574, "loss": 0.5231, "step": 58160 }, { "epoch": 1.2947382478632479, "grad_norm": 0.796648383140564, "learning_rate": 0.00024270106342117628, "loss": 0.6464, "step": 58170 }, { "epoch": 1.294960826210826, "grad_norm": 0.6371486186981201, "learning_rate": 0.00024265551504688441, "loss": 0.6439, "step": 58180 }, { "epoch": 1.2951834045584045, "grad_norm": 1.0725200176239014, "learning_rate": 0.00024260996435465697, "loss": 0.5708, "step": 58190 }, { "epoch": 1.295405982905983, "grad_norm": 0.5749133229255676, "learning_rate": 0.0002425644113469692, "loss": 0.4988, "step": 58200 }, { "epoch": 1.2956285612535612, "grad_norm": 0.4231486916542053, "learning_rate": 0.00024251885602629645, "loss": 0.5386, "step": 58210 }, { "epoch": 1.2958511396011396, "grad_norm": 0.5720852017402649, "learning_rate": 0.00024247329839511425, "loss": 0.546, "step": 58220 }, { "epoch": 1.296073717948718, "grad_norm": 0.6299185752868652, "learning_rate": 0.00024242773845589827, "loss": 0.5484, "step": 58230 }, { "epoch": 1.2962962962962963, "grad_norm": 0.5682166814804077, "learning_rate": 0.0002423821762111242, "loss": 0.5864, "step": 58240 }, { "epoch": 1.2965188746438747, "grad_norm": 0.4346299469470978, "learning_rate": 0.00024233661166326803, "loss": 0.5586, "step": 58250 }, { "epoch": 1.296741452991453, "grad_norm": 0.9597857594490051, "learning_rate": 0.00024229104481480568, "loss": 0.5722, "step": 58260 }, { "epoch": 1.2969640313390314, "grad_norm": 0.6724293828010559, "learning_rate": 0.0002422454756682134, "loss": 0.6278, "step": 58270 }, { "epoch": 1.2971866096866096, "grad_norm": 0.4890615940093994, "learning_rate": 0.0002421999042259673, "loss": 0.5214, "step": 58280 }, { "epoch": 1.297409188034188, "grad_norm": 0.7504422664642334, "learning_rate": 0.0002421543304905439, "loss": 0.6461, "step": 58290 }, { "epoch": 1.2976317663817665, "grad_norm": 0.7557802200317383, "learning_rate": 0.0002421087544644197, "loss": 0.6423, "step": 58300 }, { "epoch": 1.2978543447293447, "grad_norm": 0.947760820388794, "learning_rate": 0.00024206317615007127, "loss": 0.62, "step": 58310 }, { "epoch": 1.2980769230769231, "grad_norm": 0.6679830551147461, "learning_rate": 0.00024201759554997546, "loss": 0.5337, "step": 58320 }, { "epoch": 1.2982995014245013, "grad_norm": 0.4446559250354767, "learning_rate": 0.0002419720126666091, "loss": 0.4686, "step": 58330 }, { "epoch": 1.2985220797720798, "grad_norm": 0.7188494801521301, "learning_rate": 0.00024192642750244919, "loss": 0.5269, "step": 58340 }, { "epoch": 1.298744658119658, "grad_norm": 0.42798712849617004, "learning_rate": 0.00024188084005997285, "loss": 0.6084, "step": 58350 }, { "epoch": 1.2989672364672364, "grad_norm": 0.5221139788627625, "learning_rate": 0.0002418352503416574, "loss": 0.586, "step": 58360 }, { "epoch": 1.2991898148148149, "grad_norm": 0.8559644222259521, "learning_rate": 0.00024178965834998023, "loss": 0.6131, "step": 58370 }, { "epoch": 1.299412393162393, "grad_norm": 0.7762161493301392, "learning_rate": 0.00024174406408741876, "loss": 0.5871, "step": 58380 }, { "epoch": 1.2996349715099715, "grad_norm": 0.7057475447654724, "learning_rate": 0.00024169846755645074, "loss": 0.5825, "step": 58390 }, { "epoch": 1.29985754985755, "grad_norm": 0.4852276146411896, "learning_rate": 0.00024165286875955385, "loss": 0.5527, "step": 58400 }, { "epoch": 1.3000801282051282, "grad_norm": 0.6271269917488098, "learning_rate": 0.00024160726769920598, "loss": 0.5368, "step": 58410 }, { "epoch": 1.3003027065527066, "grad_norm": 0.6222318410873413, "learning_rate": 0.00024156166437788504, "loss": 0.6229, "step": 58420 }, { "epoch": 1.3005252849002849, "grad_norm": 0.9286940097808838, "learning_rate": 0.0002415160587980693, "loss": 0.5029, "step": 58430 }, { "epoch": 1.3007478632478633, "grad_norm": 0.5464807152748108, "learning_rate": 0.00024147045096223693, "loss": 0.6235, "step": 58440 }, { "epoch": 1.3009704415954415, "grad_norm": 0.5680151581764221, "learning_rate": 0.00024142484087286633, "loss": 0.5727, "step": 58450 }, { "epoch": 1.30119301994302, "grad_norm": 0.6306223273277283, "learning_rate": 0.00024137922853243588, "loss": 0.778, "step": 58460 }, { "epoch": 1.3014155982905984, "grad_norm": 0.4028373658657074, "learning_rate": 0.0002413336139434244, "loss": 0.5189, "step": 58470 }, { "epoch": 1.3016381766381766, "grad_norm": 0.473341703414917, "learning_rate": 0.0002412879971083104, "loss": 0.6445, "step": 58480 }, { "epoch": 1.301860754985755, "grad_norm": 0.5544946193695068, "learning_rate": 0.00024124237802957286, "loss": 0.5764, "step": 58490 }, { "epoch": 1.3020833333333333, "grad_norm": 0.7774984836578369, "learning_rate": 0.0002411967567096907, "loss": 0.7581, "step": 58500 }, { "epoch": 1.3023059116809117, "grad_norm": 0.664369523525238, "learning_rate": 0.00024115113315114313, "loss": 0.5788, "step": 58510 }, { "epoch": 1.30252849002849, "grad_norm": 0.831189751625061, "learning_rate": 0.00024110550735640928, "loss": 0.6589, "step": 58520 }, { "epoch": 1.3027510683760684, "grad_norm": 0.6030679941177368, "learning_rate": 0.0002410598793279685, "loss": 0.6406, "step": 58530 }, { "epoch": 1.3029736467236468, "grad_norm": 0.534193217754364, "learning_rate": 0.00024101424906830028, "loss": 0.5129, "step": 58540 }, { "epoch": 1.303196225071225, "grad_norm": 0.5773665904998779, "learning_rate": 0.00024096861657988417, "loss": 0.6065, "step": 58550 }, { "epoch": 1.3034188034188035, "grad_norm": 0.8891463875770569, "learning_rate": 0.00024092298186519987, "loss": 0.7145, "step": 58560 }, { "epoch": 1.303641381766382, "grad_norm": 0.5164126753807068, "learning_rate": 0.00024087734492672725, "loss": 0.5036, "step": 58570 }, { "epoch": 1.30386396011396, "grad_norm": 0.5912543535232544, "learning_rate": 0.00024083170576694635, "loss": 0.6331, "step": 58580 }, { "epoch": 1.3040865384615383, "grad_norm": 0.5806360244750977, "learning_rate": 0.00024078606438833697, "loss": 0.6346, "step": 58590 }, { "epoch": 1.3043091168091168, "grad_norm": 0.6180190443992615, "learning_rate": 0.00024074042079337953, "loss": 0.518, "step": 58600 }, { "epoch": 1.3045316951566952, "grad_norm": 1.0360801219940186, "learning_rate": 0.0002406947749845543, "loss": 0.593, "step": 58610 }, { "epoch": 1.3047542735042734, "grad_norm": 0.7276124358177185, "learning_rate": 0.0002406491269643416, "loss": 0.6122, "step": 58620 }, { "epoch": 1.3049768518518519, "grad_norm": 0.6221876740455627, "learning_rate": 0.00024060347673522214, "loss": 0.636, "step": 58630 }, { "epoch": 1.3051994301994303, "grad_norm": 0.631127119064331, "learning_rate": 0.00024055782429967644, "loss": 0.6477, "step": 58640 }, { "epoch": 1.3054220085470085, "grad_norm": 0.6096075177192688, "learning_rate": 0.00024051216966018552, "loss": 0.6325, "step": 58650 }, { "epoch": 1.305644586894587, "grad_norm": 1.1462111473083496, "learning_rate": 0.00024046651281923, "loss": 0.6728, "step": 58660 }, { "epoch": 1.3058671652421652, "grad_norm": 0.4736732244491577, "learning_rate": 0.00024042085377929103, "loss": 0.6686, "step": 58670 }, { "epoch": 1.3060897435897436, "grad_norm": 0.6927259564399719, "learning_rate": 0.0002403751925428499, "loss": 0.6348, "step": 58680 }, { "epoch": 1.3063123219373218, "grad_norm": 0.4512447416782379, "learning_rate": 0.00024032952911238767, "loss": 0.5676, "step": 58690 }, { "epoch": 1.3065349002849003, "grad_norm": 0.45592087507247925, "learning_rate": 0.00024028386349038576, "loss": 0.5839, "step": 58700 }, { "epoch": 1.3067574786324787, "grad_norm": 0.6498264670372009, "learning_rate": 0.00024023819567932586, "loss": 0.5339, "step": 58710 }, { "epoch": 1.306980056980057, "grad_norm": 0.7101892232894897, "learning_rate": 0.0002401925256816894, "loss": 0.5566, "step": 58720 }, { "epoch": 1.3072026353276354, "grad_norm": 0.6671558618545532, "learning_rate": 0.0002401468534999582, "loss": 0.5722, "step": 58730 }, { "epoch": 1.3074252136752138, "grad_norm": 0.7375342845916748, "learning_rate": 0.00024010117913661407, "loss": 0.5704, "step": 58740 }, { "epoch": 1.307647792022792, "grad_norm": 0.7588819861412048, "learning_rate": 0.0002400555025941391, "loss": 0.587, "step": 58750 }, { "epoch": 1.3078703703703702, "grad_norm": 0.6468998789787292, "learning_rate": 0.0002400098238750153, "loss": 0.6558, "step": 58760 }, { "epoch": 1.3080929487179487, "grad_norm": 0.5806763768196106, "learning_rate": 0.00023996414298172488, "loss": 0.5347, "step": 58770 }, { "epoch": 1.3083155270655271, "grad_norm": 0.5419186949729919, "learning_rate": 0.0002399184599167503, "loss": 0.5315, "step": 58780 }, { "epoch": 1.3085381054131053, "grad_norm": 0.5879216194152832, "learning_rate": 0.00023987277468257386, "loss": 0.5976, "step": 58790 }, { "epoch": 1.3087606837606838, "grad_norm": 0.684470534324646, "learning_rate": 0.00023982708728167822, "loss": 0.5041, "step": 58800 }, { "epoch": 1.3089832621082622, "grad_norm": 0.5002015829086304, "learning_rate": 0.00023978139771654603, "loss": 0.4835, "step": 58810 }, { "epoch": 1.3092058404558404, "grad_norm": 0.7689092755317688, "learning_rate": 0.00023973570598966019, "loss": 0.5983, "step": 58820 }, { "epoch": 1.3094284188034189, "grad_norm": 0.586552083492279, "learning_rate": 0.0002396900121035035, "loss": 0.4554, "step": 58830 }, { "epoch": 1.309650997150997, "grad_norm": 0.9228227734565735, "learning_rate": 0.00023964431606055908, "loss": 0.6011, "step": 58840 }, { "epoch": 1.3098735754985755, "grad_norm": 0.6102882623672485, "learning_rate": 0.00023959861786331007, "loss": 0.5829, "step": 58850 }, { "epoch": 1.3100961538461537, "grad_norm": 0.6513160467147827, "learning_rate": 0.0002395529175142398, "loss": 0.4476, "step": 58860 }, { "epoch": 1.3103187321937322, "grad_norm": 0.5500876307487488, "learning_rate": 0.0002395072150158315, "loss": 0.4558, "step": 58870 }, { "epoch": 1.3105413105413106, "grad_norm": 0.49348214268684387, "learning_rate": 0.00023946151037056886, "loss": 0.5777, "step": 58880 }, { "epoch": 1.3107638888888888, "grad_norm": 0.5179466009140015, "learning_rate": 0.00023941580358093547, "loss": 0.6253, "step": 58890 }, { "epoch": 1.3109864672364673, "grad_norm": 0.8439163565635681, "learning_rate": 0.00023937009464941497, "loss": 0.6493, "step": 58900 }, { "epoch": 1.3112090455840457, "grad_norm": 0.5786725878715515, "learning_rate": 0.00023932438357849133, "loss": 0.599, "step": 58910 }, { "epoch": 1.311431623931624, "grad_norm": 0.5733489990234375, "learning_rate": 0.00023927867037064853, "loss": 0.5755, "step": 58920 }, { "epoch": 1.3116542022792022, "grad_norm": 0.5293898582458496, "learning_rate": 0.0002392329550283706, "loss": 0.6261, "step": 58930 }, { "epoch": 1.3118767806267806, "grad_norm": 1.0148966312408447, "learning_rate": 0.00023918723755414178, "loss": 0.714, "step": 58940 }, { "epoch": 1.312099358974359, "grad_norm": 0.5190929174423218, "learning_rate": 0.00023914151795044637, "loss": 0.4459, "step": 58950 }, { "epoch": 1.3123219373219372, "grad_norm": 0.5621257424354553, "learning_rate": 0.00023909579621976884, "loss": 0.4767, "step": 58960 }, { "epoch": 1.3125445156695157, "grad_norm": 0.4501630365848541, "learning_rate": 0.00023905007236459374, "loss": 0.533, "step": 58970 }, { "epoch": 1.3127670940170941, "grad_norm": 0.3523252606391907, "learning_rate": 0.00023900434638740578, "loss": 0.6054, "step": 58980 }, { "epoch": 1.3129896723646723, "grad_norm": 0.4954468309879303, "learning_rate": 0.00023895861829068964, "loss": 0.7315, "step": 58990 }, { "epoch": 1.3132122507122508, "grad_norm": 0.5253806710243225, "learning_rate": 0.00023891288807693039, "loss": 0.5097, "step": 59000 }, { "epoch": 1.313434829059829, "grad_norm": 0.5932738780975342, "learning_rate": 0.0002388671557486128, "loss": 0.6076, "step": 59010 }, { "epoch": 1.3136574074074074, "grad_norm": 0.8328933715820312, "learning_rate": 0.00023882142130822223, "loss": 0.5887, "step": 59020 }, { "epoch": 1.3138799857549857, "grad_norm": 0.5181304812431335, "learning_rate": 0.00023877568475824386, "loss": 0.4473, "step": 59030 }, { "epoch": 1.314102564102564, "grad_norm": 0.5174534916877747, "learning_rate": 0.00023872994610116304, "loss": 0.5158, "step": 59040 }, { "epoch": 1.3143251424501425, "grad_norm": 0.8329073190689087, "learning_rate": 0.0002386842053394652, "loss": 0.7148, "step": 59050 }, { "epoch": 1.3145477207977208, "grad_norm": 0.5756538510322571, "learning_rate": 0.00023863846247563602, "loss": 0.6326, "step": 59060 }, { "epoch": 1.3147702991452992, "grad_norm": 0.6975627541542053, "learning_rate": 0.00023859271751216113, "loss": 0.6248, "step": 59070 }, { "epoch": 1.3149928774928774, "grad_norm": 0.7237173318862915, "learning_rate": 0.00023854697045152637, "loss": 0.5916, "step": 59080 }, { "epoch": 1.3152154558404558, "grad_norm": 0.6929823160171509, "learning_rate": 0.00023850122129621766, "loss": 0.7398, "step": 59090 }, { "epoch": 1.315438034188034, "grad_norm": 0.6520585417747498, "learning_rate": 0.0002384554700487211, "loss": 0.5406, "step": 59100 }, { "epoch": 1.3156606125356125, "grad_norm": 0.7275229692459106, "learning_rate": 0.00023840971671152287, "loss": 0.6248, "step": 59110 }, { "epoch": 1.315883190883191, "grad_norm": 0.7695404291152954, "learning_rate": 0.00023836396128710914, "loss": 0.5023, "step": 59120 }, { "epoch": 1.3161057692307692, "grad_norm": 0.5422924160957336, "learning_rate": 0.00023831820377796627, "loss": 0.6049, "step": 59130 }, { "epoch": 1.3163283475783476, "grad_norm": 0.6437564492225647, "learning_rate": 0.00023827244418658095, "loss": 0.5623, "step": 59140 }, { "epoch": 1.316550925925926, "grad_norm": 0.7772666215896606, "learning_rate": 0.00023822668251543964, "loss": 0.5435, "step": 59150 }, { "epoch": 1.3167735042735043, "grad_norm": 0.578720211982727, "learning_rate": 0.00023818091876702906, "loss": 0.6183, "step": 59160 }, { "epoch": 1.3169960826210827, "grad_norm": 0.5212342739105225, "learning_rate": 0.00023813515294383622, "loss": 0.5001, "step": 59170 }, { "epoch": 1.317218660968661, "grad_norm": 0.3705865144729614, "learning_rate": 0.0002380893850483479, "loss": 0.5063, "step": 59180 }, { "epoch": 1.3174412393162394, "grad_norm": 0.7557964324951172, "learning_rate": 0.00023804361508305113, "loss": 0.4601, "step": 59190 }, { "epoch": 1.3176638176638176, "grad_norm": 0.8965262174606323, "learning_rate": 0.00023799784305043322, "loss": 0.5239, "step": 59200 }, { "epoch": 1.317886396011396, "grad_norm": 0.38725170493125916, "learning_rate": 0.00023795206895298144, "loss": 0.4289, "step": 59210 }, { "epoch": 1.3181089743589745, "grad_norm": 0.4773240089416504, "learning_rate": 0.00023790629279318317, "loss": 0.5446, "step": 59220 }, { "epoch": 1.3183315527065527, "grad_norm": 0.8197348713874817, "learning_rate": 0.00023786051457352585, "loss": 0.5693, "step": 59230 }, { "epoch": 1.318554131054131, "grad_norm": 0.5566990375518799, "learning_rate": 0.0002378147342964973, "loss": 0.5183, "step": 59240 }, { "epoch": 1.3187767094017093, "grad_norm": 0.4379849433898926, "learning_rate": 0.000237768951964585, "loss": 0.538, "step": 59250 }, { "epoch": 1.3189992877492878, "grad_norm": 0.4216741919517517, "learning_rate": 0.000237723167580277, "loss": 0.522, "step": 59260 }, { "epoch": 1.319221866096866, "grad_norm": 0.4712885022163391, "learning_rate": 0.00023767738114606119, "loss": 0.4766, "step": 59270 }, { "epoch": 1.3194444444444444, "grad_norm": 0.7440497279167175, "learning_rate": 0.00023763159266442565, "loss": 0.459, "step": 59280 }, { "epoch": 1.3196670227920229, "grad_norm": 0.5086557865142822, "learning_rate": 0.00023758580213785853, "loss": 0.6002, "step": 59290 }, { "epoch": 1.319889601139601, "grad_norm": 1.0026373863220215, "learning_rate": 0.00023754000956884816, "loss": 0.5271, "step": 59300 }, { "epoch": 1.3201121794871795, "grad_norm": 0.7639952301979065, "learning_rate": 0.00023749421495988294, "loss": 0.6444, "step": 59310 }, { "epoch": 1.3201566951566952, "eval_loss": 0.5806767344474792, "eval_runtime": 337.354, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 59312 }, { "epoch": 1.320334757834758, "grad_norm": 0.5289011001586914, "learning_rate": 0.00023744841831345142, "loss": 0.6351, "step": 59320 }, { "epoch": 1.3205573361823362, "grad_norm": 0.6011822819709778, "learning_rate": 0.0002374026196320421, "loss": 0.6502, "step": 59330 }, { "epoch": 1.3207799145299146, "grad_norm": 0.522434413433075, "learning_rate": 0.00023735681891814386, "loss": 0.6585, "step": 59340 }, { "epoch": 1.3210024928774928, "grad_norm": 0.5660699009895325, "learning_rate": 0.00023731101617424557, "loss": 0.5665, "step": 59350 }, { "epoch": 1.3212250712250713, "grad_norm": 0.7258313298225403, "learning_rate": 0.00023726521140283603, "loss": 0.5767, "step": 59360 }, { "epoch": 1.3214476495726495, "grad_norm": 0.6533500552177429, "learning_rate": 0.00023721940460640442, "loss": 0.5371, "step": 59370 }, { "epoch": 1.321670227920228, "grad_norm": 0.6535837650299072, "learning_rate": 0.00023717359578743986, "loss": 0.4816, "step": 59380 }, { "epoch": 1.3218928062678064, "grad_norm": 0.812309980392456, "learning_rate": 0.00023712778494843173, "loss": 0.576, "step": 59390 }, { "epoch": 1.3221153846153846, "grad_norm": 0.8531997203826904, "learning_rate": 0.00023708197209186934, "loss": 0.524, "step": 59400 }, { "epoch": 1.322337962962963, "grad_norm": 0.6680968999862671, "learning_rate": 0.00023703615722024224, "loss": 0.4451, "step": 59410 }, { "epoch": 1.3225605413105412, "grad_norm": 0.5203328728675842, "learning_rate": 0.00023699034033604002, "loss": 0.5163, "step": 59420 }, { "epoch": 1.3227831196581197, "grad_norm": 0.5531857013702393, "learning_rate": 0.0002369445214417525, "loss": 0.5387, "step": 59430 }, { "epoch": 1.323005698005698, "grad_norm": 0.693932056427002, "learning_rate": 0.00023689870053986934, "loss": 0.665, "step": 59440 }, { "epoch": 1.3232282763532763, "grad_norm": 0.6742056012153625, "learning_rate": 0.0002368528776328806, "loss": 0.7007, "step": 59450 }, { "epoch": 1.3234508547008548, "grad_norm": 0.6122947335243225, "learning_rate": 0.00023680705272327636, "loss": 0.6116, "step": 59460 }, { "epoch": 1.323673433048433, "grad_norm": 0.44931745529174805, "learning_rate": 0.00023676122581354673, "loss": 0.5818, "step": 59470 }, { "epoch": 1.3238960113960114, "grad_norm": 0.5619750022888184, "learning_rate": 0.000236715396906182, "loss": 0.554, "step": 59480 }, { "epoch": 1.3241185897435899, "grad_norm": 0.5768644213676453, "learning_rate": 0.00023666956600367254, "loss": 0.6054, "step": 59490 }, { "epoch": 1.324341168091168, "grad_norm": 0.9065821170806885, "learning_rate": 0.00023662373310850886, "loss": 0.593, "step": 59500 }, { "epoch": 1.3245637464387463, "grad_norm": 0.5836758613586426, "learning_rate": 0.00023657789822318154, "loss": 0.562, "step": 59510 }, { "epoch": 1.3247863247863247, "grad_norm": 0.5203041434288025, "learning_rate": 0.00023653206135018122, "loss": 0.6223, "step": 59520 }, { "epoch": 1.3250089031339032, "grad_norm": 0.8059108853340149, "learning_rate": 0.00023648622249199886, "loss": 0.5299, "step": 59530 }, { "epoch": 1.3252314814814814, "grad_norm": 0.6576545834541321, "learning_rate": 0.0002364403816511253, "loss": 0.6453, "step": 59540 }, { "epoch": 1.3254540598290598, "grad_norm": 0.5824242830276489, "learning_rate": 0.00023639453883005147, "loss": 0.529, "step": 59550 }, { "epoch": 1.3256766381766383, "grad_norm": 0.5908051133155823, "learning_rate": 0.00023634869403126873, "loss": 0.666, "step": 59560 }, { "epoch": 1.3258992165242165, "grad_norm": 0.5934712886810303, "learning_rate": 0.00023630284725726814, "loss": 0.4812, "step": 59570 }, { "epoch": 1.326121794871795, "grad_norm": 0.5465096831321716, "learning_rate": 0.00023625699851054113, "loss": 0.4924, "step": 59580 }, { "epoch": 1.3263443732193732, "grad_norm": 0.4866415560245514, "learning_rate": 0.0002362111477935791, "loss": 0.6447, "step": 59590 }, { "epoch": 1.3265669515669516, "grad_norm": 0.8629594445228577, "learning_rate": 0.0002361652951088737, "loss": 0.6519, "step": 59600 }, { "epoch": 1.3267895299145298, "grad_norm": 0.3857871890068054, "learning_rate": 0.0002361194404589165, "loss": 0.6385, "step": 59610 }, { "epoch": 1.3270121082621082, "grad_norm": 1.1052052974700928, "learning_rate": 0.00023607358384619942, "loss": 0.5939, "step": 59620 }, { "epoch": 1.3272346866096867, "grad_norm": 0.7581235766410828, "learning_rate": 0.0002360277252732143, "loss": 0.5635, "step": 59630 }, { "epoch": 1.327457264957265, "grad_norm": 0.7944244146347046, "learning_rate": 0.00023598186474245298, "loss": 0.4476, "step": 59640 }, { "epoch": 1.3276798433048433, "grad_norm": 0.6725006699562073, "learning_rate": 0.00023593600225640767, "loss": 0.5455, "step": 59650 }, { "epoch": 1.3279024216524218, "grad_norm": 0.5654692649841309, "learning_rate": 0.00023589013781757064, "loss": 0.5602, "step": 59660 }, { "epoch": 1.328125, "grad_norm": 0.44069766998291016, "learning_rate": 0.00023584427142843412, "loss": 0.5114, "step": 59670 }, { "epoch": 1.3283475783475782, "grad_norm": 0.6530085206031799, "learning_rate": 0.0002357984030914905, "loss": 0.6105, "step": 59680 }, { "epoch": 1.3285701566951567, "grad_norm": 0.603279709815979, "learning_rate": 0.0002357525328092324, "loss": 0.606, "step": 59690 }, { "epoch": 1.328792735042735, "grad_norm": 0.9058358669281006, "learning_rate": 0.00023570666058415248, "loss": 0.6441, "step": 59700 }, { "epoch": 1.3290153133903133, "grad_norm": 1.116974115371704, "learning_rate": 0.00023566078641874328, "loss": 0.5387, "step": 59710 }, { "epoch": 1.3292378917378918, "grad_norm": 0.5993869304656982, "learning_rate": 0.00023561491031549774, "loss": 0.7069, "step": 59720 }, { "epoch": 1.3294604700854702, "grad_norm": 0.516359269618988, "learning_rate": 0.00023556903227690885, "loss": 0.5338, "step": 59730 }, { "epoch": 1.3296830484330484, "grad_norm": 0.8609277009963989, "learning_rate": 0.00023552315230546963, "loss": 0.6723, "step": 59740 }, { "epoch": 1.3299056267806268, "grad_norm": 0.872204601764679, "learning_rate": 0.00023547727040367327, "loss": 0.5943, "step": 59750 }, { "epoch": 1.330128205128205, "grad_norm": 0.5097432732582092, "learning_rate": 0.00023543138657401298, "loss": 0.4863, "step": 59760 }, { "epoch": 1.3303507834757835, "grad_norm": 0.6608929634094238, "learning_rate": 0.0002353855008189821, "loss": 0.4568, "step": 59770 }, { "epoch": 1.3305733618233617, "grad_norm": 0.675301194190979, "learning_rate": 0.0002353396131410742, "loss": 0.7222, "step": 59780 }, { "epoch": 1.3307959401709402, "grad_norm": 0.6722831130027771, "learning_rate": 0.0002352937235427827, "loss": 0.5946, "step": 59790 }, { "epoch": 1.3310185185185186, "grad_norm": 0.3495960533618927, "learning_rate": 0.00023524783202660143, "loss": 0.5669, "step": 59800 }, { "epoch": 1.3312410968660968, "grad_norm": 0.7395991683006287, "learning_rate": 0.00023520193859502412, "loss": 0.606, "step": 59810 }, { "epoch": 1.3314636752136753, "grad_norm": 0.7243375182151794, "learning_rate": 0.0002351560432505446, "loss": 0.5941, "step": 59820 }, { "epoch": 1.3316862535612537, "grad_norm": 0.6544011235237122, "learning_rate": 0.00023511014599565696, "loss": 0.4566, "step": 59830 }, { "epoch": 1.331908831908832, "grad_norm": 0.8000948429107666, "learning_rate": 0.00023506424683285526, "loss": 0.5545, "step": 59840 }, { "epoch": 1.3321314102564101, "grad_norm": 0.6928564310073853, "learning_rate": 0.00023501834576463365, "loss": 0.4613, "step": 59850 }, { "epoch": 1.3323539886039886, "grad_norm": 0.5772952437400818, "learning_rate": 0.00023497244279348643, "loss": 0.6453, "step": 59860 }, { "epoch": 1.332576566951567, "grad_norm": 0.6352185606956482, "learning_rate": 0.00023492653792190802, "loss": 0.6916, "step": 59870 }, { "epoch": 1.3327991452991452, "grad_norm": 0.4369979798793793, "learning_rate": 0.00023488063115239305, "loss": 0.6148, "step": 59880 }, { "epoch": 1.3330217236467237, "grad_norm": 0.700515866279602, "learning_rate": 0.00023483472248743596, "loss": 0.6073, "step": 59890 }, { "epoch": 1.333244301994302, "grad_norm": 0.9979990124702454, "learning_rate": 0.00023478881192953157, "loss": 0.5441, "step": 59900 }, { "epoch": 1.3334668803418803, "grad_norm": 0.7997949123382568, "learning_rate": 0.00023474289948117468, "loss": 0.6842, "step": 59910 }, { "epoch": 1.3336894586894588, "grad_norm": 0.6371301412582397, "learning_rate": 0.00023469698514486012, "loss": 0.4973, "step": 59920 }, { "epoch": 1.333912037037037, "grad_norm": 0.6061527729034424, "learning_rate": 0.00023465106892308298, "loss": 0.4863, "step": 59930 }, { "epoch": 1.3341346153846154, "grad_norm": 0.7907821536064148, "learning_rate": 0.0002346051508183384, "loss": 0.6388, "step": 59940 }, { "epoch": 1.3343571937321936, "grad_norm": 0.8174198865890503, "learning_rate": 0.00023455923083312165, "loss": 0.5345, "step": 59950 }, { "epoch": 1.334579772079772, "grad_norm": 0.6560558080673218, "learning_rate": 0.00023451330896992798, "loss": 0.6792, "step": 59960 }, { "epoch": 1.3348023504273505, "grad_norm": 0.5488092303276062, "learning_rate": 0.0002344673852312528, "loss": 0.4952, "step": 59970 }, { "epoch": 1.3350249287749287, "grad_norm": 0.8640246987342834, "learning_rate": 0.00023442145961959177, "loss": 0.6287, "step": 59980 }, { "epoch": 1.3352475071225072, "grad_norm": 0.5481551885604858, "learning_rate": 0.00023437553213744039, "loss": 0.5579, "step": 59990 }, { "epoch": 1.3354700854700854, "grad_norm": 0.860477864742279, "learning_rate": 0.00023432960278729444, "loss": 0.5365, "step": 60000 }, { "epoch": 1.3356926638176638, "grad_norm": 0.5612112879753113, "learning_rate": 0.00023428367157164983, "loss": 0.5128, "step": 60010 }, { "epoch": 1.335915242165242, "grad_norm": 0.5579436421394348, "learning_rate": 0.0002342377384930024, "loss": 0.6091, "step": 60020 }, { "epoch": 1.3361378205128205, "grad_norm": 0.5270305275917053, "learning_rate": 0.00023419180355384827, "loss": 0.4779, "step": 60030 }, { "epoch": 1.336360398860399, "grad_norm": 0.6184577345848083, "learning_rate": 0.00023414586675668346, "loss": 0.6175, "step": 60040 }, { "epoch": 1.3365829772079771, "grad_norm": 0.6760562658309937, "learning_rate": 0.00023409992810400439, "loss": 0.479, "step": 60050 }, { "epoch": 1.3368055555555556, "grad_norm": 0.6230688691139221, "learning_rate": 0.00023405398759830727, "loss": 0.5463, "step": 60060 }, { "epoch": 1.337028133903134, "grad_norm": 0.355333149433136, "learning_rate": 0.00023400804524208852, "loss": 0.5802, "step": 60070 }, { "epoch": 1.3372507122507122, "grad_norm": 0.70462965965271, "learning_rate": 0.00023396210103784486, "loss": 0.5679, "step": 60080 }, { "epoch": 1.3374732905982907, "grad_norm": 0.7561984658241272, "learning_rate": 0.00023391615498807283, "loss": 0.5643, "step": 60090 }, { "epoch": 1.337695868945869, "grad_norm": 0.6452364921569824, "learning_rate": 0.0002338702070952691, "loss": 0.4912, "step": 60100 }, { "epoch": 1.3379184472934473, "grad_norm": 0.759984016418457, "learning_rate": 0.0002338242573619306, "loss": 0.5302, "step": 60110 }, { "epoch": 1.3381410256410255, "grad_norm": 0.5478008389472961, "learning_rate": 0.0002337783057905543, "loss": 0.4984, "step": 60120 }, { "epoch": 1.338363603988604, "grad_norm": 0.3967496156692505, "learning_rate": 0.00023373235238363717, "loss": 0.5099, "step": 60130 }, { "epoch": 1.3385861823361824, "grad_norm": 0.3813558518886566, "learning_rate": 0.0002336863971436764, "loss": 0.523, "step": 60140 }, { "epoch": 1.3388087606837606, "grad_norm": 0.6732510328292847, "learning_rate": 0.00023364044007316933, "loss": 0.5673, "step": 60150 }, { "epoch": 1.339031339031339, "grad_norm": 0.5705891847610474, "learning_rate": 0.00023359448117461312, "loss": 0.412, "step": 60160 }, { "epoch": 1.3392539173789173, "grad_norm": 0.44214022159576416, "learning_rate": 0.0002335485204505053, "loss": 0.6245, "step": 60170 }, { "epoch": 1.3394764957264957, "grad_norm": 0.5217635631561279, "learning_rate": 0.0002335025579033434, "loss": 0.4878, "step": 60180 }, { "epoch": 1.339699074074074, "grad_norm": 0.5117211937904358, "learning_rate": 0.00023345659353562513, "loss": 0.616, "step": 60190 }, { "epoch": 1.3399216524216524, "grad_norm": 0.399731308221817, "learning_rate": 0.00023341062734984815, "loss": 0.4766, "step": 60200 }, { "epoch": 1.3401442307692308, "grad_norm": 0.6643040776252747, "learning_rate": 0.0002333646593485103, "loss": 0.4042, "step": 60210 }, { "epoch": 1.340366809116809, "grad_norm": 0.693706214427948, "learning_rate": 0.0002333186895341096, "loss": 0.6591, "step": 60220 }, { "epoch": 1.3405893874643875, "grad_norm": 0.797396719455719, "learning_rate": 0.00023327271790914408, "loss": 0.4986, "step": 60230 }, { "epoch": 1.340811965811966, "grad_norm": 0.7917518615722656, "learning_rate": 0.00023322674447611173, "loss": 0.5472, "step": 60240 }, { "epoch": 1.3410345441595442, "grad_norm": 0.6472679972648621, "learning_rate": 0.0002331807692375109, "loss": 0.5395, "step": 60250 }, { "epoch": 1.3412571225071226, "grad_norm": 0.5727419257164001, "learning_rate": 0.0002331347921958399, "loss": 0.5142, "step": 60260 }, { "epoch": 1.3414797008547008, "grad_norm": 0.4394914209842682, "learning_rate": 0.0002330888133535972, "loss": 0.6813, "step": 60270 }, { "epoch": 1.3417022792022792, "grad_norm": 0.733902096748352, "learning_rate": 0.0002330428327132813, "loss": 0.6836, "step": 60280 }, { "epoch": 1.3419248575498575, "grad_norm": 0.714946985244751, "learning_rate": 0.0002329968502773908, "loss": 0.4508, "step": 60290 }, { "epoch": 1.342147435897436, "grad_norm": 0.5767934322357178, "learning_rate": 0.0002329508660484245, "loss": 0.541, "step": 60300 }, { "epoch": 1.3423700142450143, "grad_norm": 0.5443108081817627, "learning_rate": 0.00023290488002888107, "loss": 0.5748, "step": 60310 }, { "epoch": 1.3425925925925926, "grad_norm": 0.5115030407905579, "learning_rate": 0.00023285889222125956, "loss": 0.4549, "step": 60320 }, { "epoch": 1.342815170940171, "grad_norm": 0.6017990112304688, "learning_rate": 0.00023281290262805896, "loss": 0.4661, "step": 60330 }, { "epoch": 1.3430377492877492, "grad_norm": 0.438231498003006, "learning_rate": 0.00023276691125177835, "loss": 0.6032, "step": 60340 }, { "epoch": 1.3432603276353277, "grad_norm": 0.5477312207221985, "learning_rate": 0.000232720918094917, "loss": 0.6482, "step": 60350 }, { "epoch": 1.3434829059829059, "grad_norm": 0.8656781911849976, "learning_rate": 0.00023267492315997413, "loss": 0.5728, "step": 60360 }, { "epoch": 1.3437054843304843, "grad_norm": 0.6571064591407776, "learning_rate": 0.00023262892644944922, "loss": 0.4504, "step": 60370 }, { "epoch": 1.3439280626780628, "grad_norm": 0.45420676469802856, "learning_rate": 0.0002325829279658417, "loss": 0.4575, "step": 60380 }, { "epoch": 1.344150641025641, "grad_norm": 0.668060302734375, "learning_rate": 0.0002325369277116512, "loss": 0.7203, "step": 60390 }, { "epoch": 1.3443732193732194, "grad_norm": 0.6548107862472534, "learning_rate": 0.00023249092568937744, "loss": 0.5799, "step": 60400 }, { "epoch": 1.3445957977207978, "grad_norm": 0.8012334704399109, "learning_rate": 0.00023244492190152016, "loss": 0.5629, "step": 60410 }, { "epoch": 1.344818376068376, "grad_norm": 0.560871958732605, "learning_rate": 0.0002323989163505793, "loss": 0.5206, "step": 60420 }, { "epoch": 1.3450409544159543, "grad_norm": 0.5777971744537354, "learning_rate": 0.00023235290903905474, "loss": 0.5262, "step": 60430 }, { "epoch": 1.3452635327635327, "grad_norm": 0.6702395081520081, "learning_rate": 0.0002323068999694467, "loss": 0.5214, "step": 60440 }, { "epoch": 1.3454861111111112, "grad_norm": 0.5977016091346741, "learning_rate": 0.00023226088914425516, "loss": 0.4321, "step": 60450 }, { "epoch": 1.3457086894586894, "grad_norm": 0.44499361515045166, "learning_rate": 0.00023221487656598056, "loss": 0.5829, "step": 60460 }, { "epoch": 1.3459312678062678, "grad_norm": 0.46239128708839417, "learning_rate": 0.0002321688622371232, "loss": 0.5084, "step": 60470 }, { "epoch": 1.3461538461538463, "grad_norm": 0.4279974102973938, "learning_rate": 0.00023212284616018356, "loss": 0.577, "step": 60480 }, { "epoch": 1.3463764245014245, "grad_norm": 0.4787989854812622, "learning_rate": 0.0002320768283376621, "loss": 0.5562, "step": 60490 }, { "epoch": 1.346599002849003, "grad_norm": 0.7557083368301392, "learning_rate": 0.0002320308087720595, "loss": 0.5946, "step": 60500 }, { "epoch": 1.3468215811965811, "grad_norm": 0.7249836921691895, "learning_rate": 0.00023198478746587665, "loss": 0.5909, "step": 60510 }, { "epoch": 1.3470441595441596, "grad_norm": 0.7555255889892578, "learning_rate": 0.00023193876442161417, "loss": 0.5493, "step": 60520 }, { "epoch": 1.3472667378917378, "grad_norm": 0.653106153011322, "learning_rate": 0.0002318927396417731, "loss": 0.5377, "step": 60530 }, { "epoch": 1.3474893162393162, "grad_norm": 0.5108314156532288, "learning_rate": 0.00023184671312885446, "loss": 0.589, "step": 60540 }, { "epoch": 1.3477118945868947, "grad_norm": 0.4078660011291504, "learning_rate": 0.00023180068488535941, "loss": 0.5749, "step": 60550 }, { "epoch": 1.3479344729344729, "grad_norm": 0.8522250652313232, "learning_rate": 0.00023175465491378906, "loss": 0.6098, "step": 60560 }, { "epoch": 1.3481570512820513, "grad_norm": 0.7734964489936829, "learning_rate": 0.00023170862321664472, "loss": 0.5097, "step": 60570 }, { "epoch": 1.3483796296296298, "grad_norm": 0.47563299536705017, "learning_rate": 0.00023166258979642792, "loss": 0.5825, "step": 60580 }, { "epoch": 1.348602207977208, "grad_norm": 0.7040695548057556, "learning_rate": 0.00023161655465564, "loss": 0.4996, "step": 60590 }, { "epoch": 1.3488247863247862, "grad_norm": 0.5625837445259094, "learning_rate": 0.00023157051779678262, "loss": 0.4862, "step": 60600 }, { "epoch": 1.3490473646723646, "grad_norm": 0.6915884017944336, "learning_rate": 0.0002315244792223575, "loss": 0.6076, "step": 60610 }, { "epoch": 1.349269943019943, "grad_norm": 0.6123034954071045, "learning_rate": 0.0002314784389348664, "loss": 0.5694, "step": 60620 }, { "epoch": 1.3494925213675213, "grad_norm": 0.6105090379714966, "learning_rate": 0.00023143239693681111, "loss": 0.5683, "step": 60630 }, { "epoch": 1.3497150997150997, "grad_norm": 0.6517614126205444, "learning_rate": 0.00023138635323069365, "loss": 0.5279, "step": 60640 }, { "epoch": 1.3499376780626782, "grad_norm": 0.6277623176574707, "learning_rate": 0.0002313403078190161, "loss": 0.69, "step": 60650 }, { "epoch": 1.3501602564102564, "grad_norm": 0.49262914061546326, "learning_rate": 0.00023129426070428045, "loss": 0.6217, "step": 60660 }, { "epoch": 1.3503828347578348, "grad_norm": 0.46751052141189575, "learning_rate": 0.0002312482118889891, "loss": 0.5562, "step": 60670 }, { "epoch": 1.350605413105413, "grad_norm": 0.584173321723938, "learning_rate": 0.00023120216137564441, "loss": 0.4987, "step": 60680 }, { "epoch": 1.3508279914529915, "grad_norm": 0.6504818797111511, "learning_rate": 0.00023115610916674871, "loss": 0.6683, "step": 60690 }, { "epoch": 1.3510505698005697, "grad_norm": 0.9467840790748596, "learning_rate": 0.00023111005526480448, "loss": 0.5891, "step": 60700 }, { "epoch": 1.3512731481481481, "grad_norm": 0.3871930241584778, "learning_rate": 0.00023106399967231443, "loss": 0.5134, "step": 60710 }, { "epoch": 1.3514957264957266, "grad_norm": 0.6421234011650085, "learning_rate": 0.00023101794239178118, "loss": 0.4355, "step": 60720 }, { "epoch": 1.3517183048433048, "grad_norm": 0.5072511434555054, "learning_rate": 0.00023097188342570751, "loss": 0.5835, "step": 60730 }, { "epoch": 1.3519408831908832, "grad_norm": 0.7537820339202881, "learning_rate": 0.00023092582277659638, "loss": 0.4876, "step": 60740 }, { "epoch": 1.3521634615384617, "grad_norm": 0.6739486455917358, "learning_rate": 0.00023087976044695077, "loss": 0.5496, "step": 60750 }, { "epoch": 1.35238603988604, "grad_norm": 0.6261805891990662, "learning_rate": 0.00023083369643927366, "loss": 0.4884, "step": 60760 }, { "epoch": 1.352608618233618, "grad_norm": 0.5273316502571106, "learning_rate": 0.0002307876307560682, "loss": 0.521, "step": 60770 }, { "epoch": 1.3528311965811965, "grad_norm": 0.9039011597633362, "learning_rate": 0.00023074156339983773, "loss": 0.625, "step": 60780 }, { "epoch": 1.353053774928775, "grad_norm": 0.5701965689659119, "learning_rate": 0.00023069549437308552, "loss": 0.5523, "step": 60790 }, { "epoch": 1.3532763532763532, "grad_norm": 0.6095125675201416, "learning_rate": 0.00023064942367831499, "loss": 0.469, "step": 60800 }, { "epoch": 1.3534989316239316, "grad_norm": 0.7715137600898743, "learning_rate": 0.00023060335131802978, "loss": 0.5077, "step": 60810 }, { "epoch": 1.35372150997151, "grad_norm": 0.45640823245048523, "learning_rate": 0.0002305572772947333, "loss": 0.5276, "step": 60820 }, { "epoch": 1.3539440883190883, "grad_norm": 0.6939868927001953, "learning_rate": 0.00023051120161092942, "loss": 0.5504, "step": 60830 }, { "epoch": 1.3541666666666667, "grad_norm": 0.5779340267181396, "learning_rate": 0.00023046512426912186, "loss": 0.5491, "step": 60840 }, { "epoch": 1.354389245014245, "grad_norm": 0.5798224210739136, "learning_rate": 0.00023041904527181447, "loss": 0.5363, "step": 60850 }, { "epoch": 1.3546118233618234, "grad_norm": 0.6343097686767578, "learning_rate": 0.00023037296462151129, "loss": 0.5538, "step": 60860 }, { "epoch": 1.3548344017094016, "grad_norm": 0.9751542210578918, "learning_rate": 0.00023032688232071632, "loss": 0.5797, "step": 60870 }, { "epoch": 1.35505698005698, "grad_norm": 0.7180834412574768, "learning_rate": 0.00023028079837193377, "loss": 0.4524, "step": 60880 }, { "epoch": 1.3552795584045585, "grad_norm": 0.5285040140151978, "learning_rate": 0.00023023471277766784, "loss": 0.6168, "step": 60890 }, { "epoch": 1.3555021367521367, "grad_norm": 0.8148588538169861, "learning_rate": 0.00023018862554042286, "loss": 0.4125, "step": 60900 }, { "epoch": 1.3557247150997151, "grad_norm": 0.7640627026557922, "learning_rate": 0.00023014253666270326, "loss": 0.5679, "step": 60910 }, { "epoch": 1.3559472934472934, "grad_norm": 0.6860986351966858, "learning_rate": 0.0002300964461470135, "loss": 0.5544, "step": 60920 }, { "epoch": 1.3561698717948718, "grad_norm": 0.5359602570533752, "learning_rate": 0.00023005035399585827, "loss": 0.6402, "step": 60930 }, { "epoch": 1.35639245014245, "grad_norm": 0.7734375596046448, "learning_rate": 0.00023000426021174225, "loss": 0.5711, "step": 60940 }, { "epoch": 1.3566150284900285, "grad_norm": 0.4474536180496216, "learning_rate": 0.00022995816479717007, "loss": 0.3943, "step": 60950 }, { "epoch": 1.356837606837607, "grad_norm": 0.6209190487861633, "learning_rate": 0.00022991206775464673, "loss": 0.6773, "step": 60960 }, { "epoch": 1.3570601851851851, "grad_norm": 0.43275436758995056, "learning_rate": 0.00022986596908667717, "loss": 0.4396, "step": 60970 }, { "epoch": 1.3572827635327636, "grad_norm": 0.6500193476676941, "learning_rate": 0.00022981986879576637, "loss": 0.5547, "step": 60980 }, { "epoch": 1.357505341880342, "grad_norm": 0.5044712424278259, "learning_rate": 0.00022977376688441945, "loss": 0.6024, "step": 60990 }, { "epoch": 1.3577279202279202, "grad_norm": 0.6947741508483887, "learning_rate": 0.00022972766335514172, "loss": 0.5264, "step": 61000 }, { "epoch": 1.3579504985754987, "grad_norm": 0.6112157106399536, "learning_rate": 0.00022968155821043843, "loss": 0.4964, "step": 61010 }, { "epoch": 1.3581730769230769, "grad_norm": 0.80865079164505, "learning_rate": 0.00022963545145281492, "loss": 0.5743, "step": 61020 }, { "epoch": 1.3583956552706553, "grad_norm": 0.4842277467250824, "learning_rate": 0.00022958934308477672, "loss": 0.6893, "step": 61030 }, { "epoch": 1.3586182336182335, "grad_norm": 0.872617244720459, "learning_rate": 0.00022954323310882946, "loss": 0.6816, "step": 61040 }, { "epoch": 1.358840811965812, "grad_norm": 0.5222064852714539, "learning_rate": 0.00022949712152747865, "loss": 0.4895, "step": 61050 }, { "epoch": 1.3590633903133904, "grad_norm": 0.5121984481811523, "learning_rate": 0.00022945100834323014, "loss": 0.4499, "step": 61060 }, { "epoch": 1.3592859686609686, "grad_norm": 1.0106990337371826, "learning_rate": 0.00022940489355858978, "loss": 0.6911, "step": 61070 }, { "epoch": 1.359508547008547, "grad_norm": 0.6854296922683716, "learning_rate": 0.00022935877717606337, "loss": 0.7157, "step": 61080 }, { "epoch": 1.3597311253561253, "grad_norm": 0.5751320719718933, "learning_rate": 0.00022931265919815696, "loss": 0.5062, "step": 61090 }, { "epoch": 1.3599537037037037, "grad_norm": 0.47325485944747925, "learning_rate": 0.0002292665396273767, "loss": 0.5205, "step": 61100 }, { "epoch": 1.360176282051282, "grad_norm": 0.6046233177185059, "learning_rate": 0.0002292204184662287, "loss": 0.4618, "step": 61110 }, { "epoch": 1.3603988603988604, "grad_norm": 0.7212419509887695, "learning_rate": 0.0002291742957172192, "loss": 0.5979, "step": 61120 }, { "epoch": 1.3606214387464388, "grad_norm": 0.4322417378425598, "learning_rate": 0.00022912817138285462, "loss": 0.5809, "step": 61130 }, { "epoch": 1.360844017094017, "grad_norm": 0.7356921434402466, "learning_rate": 0.00022908204546564145, "loss": 0.529, "step": 61140 }, { "epoch": 1.3610665954415955, "grad_norm": 0.3717854619026184, "learning_rate": 0.000229035917968086, "loss": 0.5253, "step": 61150 }, { "epoch": 1.361289173789174, "grad_norm": 0.8095477223396301, "learning_rate": 0.000228989788892695, "loss": 0.5405, "step": 61160 }, { "epoch": 1.3615117521367521, "grad_norm": 0.4735598564147949, "learning_rate": 0.0002289436582419752, "loss": 0.607, "step": 61170 }, { "epoch": 1.3617343304843303, "grad_norm": 0.6034855246543884, "learning_rate": 0.0002288975260184333, "loss": 0.5844, "step": 61180 }, { "epoch": 1.3619569088319088, "grad_norm": 0.5404762625694275, "learning_rate": 0.00022885139222457616, "loss": 0.4322, "step": 61190 }, { "epoch": 1.3621794871794872, "grad_norm": 1.5063923597335815, "learning_rate": 0.00022880525686291075, "loss": 0.7102, "step": 61200 }, { "epoch": 1.3624020655270654, "grad_norm": 0.6246905326843262, "learning_rate": 0.00022875911993594413, "loss": 0.5407, "step": 61210 }, { "epoch": 1.3626246438746439, "grad_norm": 0.5119190216064453, "learning_rate": 0.00022871298144618339, "loss": 0.5707, "step": 61220 }, { "epoch": 1.3628472222222223, "grad_norm": 0.5602666735649109, "learning_rate": 0.0002286668413961357, "loss": 0.6487, "step": 61230 }, { "epoch": 1.3630698005698005, "grad_norm": 0.6652705073356628, "learning_rate": 0.00022862069978830837, "loss": 0.5519, "step": 61240 }, { "epoch": 1.363292378917379, "grad_norm": 0.6826812028884888, "learning_rate": 0.00022857455662520884, "loss": 0.5748, "step": 61250 }, { "epoch": 1.3635149572649572, "grad_norm": 0.46549534797668457, "learning_rate": 0.00022852841190934445, "loss": 0.5655, "step": 61260 }, { "epoch": 1.3637375356125356, "grad_norm": 0.5537489056587219, "learning_rate": 0.00022848226564322284, "loss": 0.6324, "step": 61270 }, { "epoch": 1.3639601139601139, "grad_norm": 0.46651938557624817, "learning_rate": 0.0002284361178293516, "loss": 0.6068, "step": 61280 }, { "epoch": 1.3641826923076923, "grad_norm": 0.5096479058265686, "learning_rate": 0.00022838996847023842, "loss": 0.4882, "step": 61290 }, { "epoch": 1.3644052706552707, "grad_norm": 0.5014933943748474, "learning_rate": 0.0002283438175683911, "loss": 0.5067, "step": 61300 }, { "epoch": 1.364627849002849, "grad_norm": 0.7643182873725891, "learning_rate": 0.00022829766512631755, "loss": 0.6686, "step": 61310 }, { "epoch": 1.3648504273504274, "grad_norm": 0.5372745394706726, "learning_rate": 0.00022825151114652572, "loss": 0.5306, "step": 61320 }, { "epoch": 1.3650730056980058, "grad_norm": 0.5938313603401184, "learning_rate": 0.00022820535563152362, "loss": 0.4763, "step": 61330 }, { "epoch": 1.365295584045584, "grad_norm": 0.5216083526611328, "learning_rate": 0.00022815919858381944, "loss": 0.5552, "step": 61340 }, { "epoch": 1.3655181623931623, "grad_norm": 0.4963095486164093, "learning_rate": 0.00022811304000592135, "loss": 0.4957, "step": 61350 }, { "epoch": 1.3657407407407407, "grad_norm": 0.7983525991439819, "learning_rate": 0.00022806687990033764, "loss": 0.531, "step": 61360 }, { "epoch": 1.3659633190883191, "grad_norm": 1.3070868253707886, "learning_rate": 0.00022802071826957669, "loss": 0.6501, "step": 61370 }, { "epoch": 1.3661858974358974, "grad_norm": 0.77958083152771, "learning_rate": 0.00022797455511614702, "loss": 0.7044, "step": 61380 }, { "epoch": 1.3664084757834758, "grad_norm": 0.6154484152793884, "learning_rate": 0.00022792839044255705, "loss": 0.5782, "step": 61390 }, { "epoch": 1.3666310541310542, "grad_norm": 0.5589473247528076, "learning_rate": 0.00022788222425131554, "loss": 0.4325, "step": 61400 }, { "epoch": 1.3668536324786325, "grad_norm": 0.42728063464164734, "learning_rate": 0.00022783605654493107, "loss": 0.5843, "step": 61410 }, { "epoch": 1.367076210826211, "grad_norm": 0.8472455739974976, "learning_rate": 0.00022778988732591259, "loss": 0.6619, "step": 61420 }, { "epoch": 1.367298789173789, "grad_norm": 0.6462082862854004, "learning_rate": 0.0002277437165967688, "loss": 0.6922, "step": 61430 }, { "epoch": 1.3675213675213675, "grad_norm": 0.49318283796310425, "learning_rate": 0.00022769754436000877, "loss": 0.5898, "step": 61440 }, { "epoch": 1.3677439458689458, "grad_norm": 0.6925389766693115, "learning_rate": 0.00022765137061814153, "loss": 0.5382, "step": 61450 }, { "epoch": 1.3679665242165242, "grad_norm": 0.6416910886764526, "learning_rate": 0.00022760519537367614, "loss": 0.5307, "step": 61460 }, { "epoch": 1.3681891025641026, "grad_norm": 0.47730687260627747, "learning_rate": 0.00022755901862912188, "loss": 0.5994, "step": 61470 }, { "epoch": 1.3684116809116809, "grad_norm": 0.554335355758667, "learning_rate": 0.0002275128403869879, "loss": 0.6278, "step": 61480 }, { "epoch": 1.3686342592592593, "grad_norm": 0.548578143119812, "learning_rate": 0.00022746666064978377, "loss": 0.5798, "step": 61490 }, { "epoch": 1.3688568376068377, "grad_norm": 0.6649680733680725, "learning_rate": 0.00022742047942001873, "loss": 0.5958, "step": 61500 }, { "epoch": 1.369079415954416, "grad_norm": 0.5594968199729919, "learning_rate": 0.00022737429670020238, "loss": 0.5051, "step": 61510 }, { "epoch": 1.3693019943019942, "grad_norm": 0.5725864768028259, "learning_rate": 0.00022732811249284436, "loss": 0.4258, "step": 61520 }, { "epoch": 1.3695245726495726, "grad_norm": 0.4459512233734131, "learning_rate": 0.00022728192680045438, "loss": 0.5485, "step": 61530 }, { "epoch": 1.369747150997151, "grad_norm": 0.6141966581344604, "learning_rate": 0.0002272357396255421, "loss": 0.5744, "step": 61540 }, { "epoch": 1.3699697293447293, "grad_norm": 0.5769743919372559, "learning_rate": 0.00022718955097061745, "loss": 0.576, "step": 61550 }, { "epoch": 1.3701923076923077, "grad_norm": 0.4499601721763611, "learning_rate": 0.00022714336083819037, "loss": 0.5567, "step": 61560 }, { "epoch": 1.3704148860398861, "grad_norm": 0.7231540679931641, "learning_rate": 0.0002270971692307708, "loss": 0.552, "step": 61570 }, { "epoch": 1.3706374643874644, "grad_norm": 0.36756712198257446, "learning_rate": 0.00022705097615086887, "loss": 0.5, "step": 61580 }, { "epoch": 1.3708600427350428, "grad_norm": 0.6529523730278015, "learning_rate": 0.0002270047816009948, "loss": 0.5632, "step": 61590 }, { "epoch": 1.371082621082621, "grad_norm": 0.8186941742897034, "learning_rate": 0.00022695858558365882, "loss": 0.518, "step": 61600 }, { "epoch": 1.3713051994301995, "grad_norm": 0.4617755115032196, "learning_rate": 0.00022691238810137115, "loss": 0.5366, "step": 61610 }, { "epoch": 1.3715277777777777, "grad_norm": 0.4704979956150055, "learning_rate": 0.00022686618915664227, "loss": 0.5903, "step": 61620 }, { "epoch": 1.3717503561253561, "grad_norm": 0.5129684209823608, "learning_rate": 0.00022681998875198275, "loss": 0.5632, "step": 61630 }, { "epoch": 1.3719729344729346, "grad_norm": 0.9282556176185608, "learning_rate": 0.00022677378688990306, "loss": 0.6957, "step": 61640 }, { "epoch": 1.3721955128205128, "grad_norm": 0.4365776479244232, "learning_rate": 0.00022672758357291381, "loss": 0.5194, "step": 61650 }, { "epoch": 1.3724180911680912, "grad_norm": 0.5178688168525696, "learning_rate": 0.00022668137880352585, "loss": 0.6173, "step": 61660 }, { "epoch": 1.3726406695156697, "grad_norm": 0.7604503631591797, "learning_rate": 0.00022663517258424994, "loss": 0.601, "step": 61670 }, { "epoch": 1.3728632478632479, "grad_norm": 0.7271844148635864, "learning_rate": 0.00022658896491759692, "loss": 0.5686, "step": 61680 }, { "epoch": 1.373085826210826, "grad_norm": 0.5016389489173889, "learning_rate": 0.00022654275580607776, "loss": 0.5662, "step": 61690 }, { "epoch": 1.3733084045584045, "grad_norm": 0.8639649152755737, "learning_rate": 0.00022649654525220357, "loss": 0.4636, "step": 61700 }, { "epoch": 1.373530982905983, "grad_norm": 0.9058513045310974, "learning_rate": 0.00022645033325848534, "loss": 0.6237, "step": 61710 }, { "epoch": 1.3737535612535612, "grad_norm": 0.7580538988113403, "learning_rate": 0.0002264041198274344, "loss": 0.5563, "step": 61720 }, { "epoch": 1.3739761396011396, "grad_norm": 0.4570119380950928, "learning_rate": 0.00022635790496156201, "loss": 0.6289, "step": 61730 }, { "epoch": 1.374198717948718, "grad_norm": 0.8195516467094421, "learning_rate": 0.00022631168866337945, "loss": 0.6815, "step": 61740 }, { "epoch": 1.3744212962962963, "grad_norm": 0.6854251027107239, "learning_rate": 0.00022626547093539817, "loss": 0.5385, "step": 61750 }, { "epoch": 1.3746438746438747, "grad_norm": 0.6686112880706787, "learning_rate": 0.00022621925178012967, "loss": 0.549, "step": 61760 }, { "epoch": 1.374866452991453, "grad_norm": 0.46365267038345337, "learning_rate": 0.00022617303120008565, "loss": 0.5998, "step": 61770 }, { "epoch": 1.3750890313390314, "grad_norm": 0.5610596537590027, "learning_rate": 0.00022612680919777766, "loss": 0.473, "step": 61780 }, { "epoch": 1.3753116096866096, "grad_norm": 0.6108236908912659, "learning_rate": 0.00022608058577571743, "loss": 0.4008, "step": 61790 }, { "epoch": 1.375534188034188, "grad_norm": 0.6078330874443054, "learning_rate": 0.00022603436093641683, "loss": 0.6347, "step": 61800 }, { "epoch": 1.3757567663817665, "grad_norm": 0.6715290546417236, "learning_rate": 0.00022598813468238782, "loss": 0.4849, "step": 61810 }, { "epoch": 1.3759793447293447, "grad_norm": 0.6614882946014404, "learning_rate": 0.0002259419070161422, "loss": 0.5161, "step": 61820 }, { "epoch": 1.3762019230769231, "grad_norm": 0.3726741373538971, "learning_rate": 0.00022589567794019212, "loss": 0.5588, "step": 61830 }, { "epoch": 1.3764245014245013, "grad_norm": 0.7765806913375854, "learning_rate": 0.00022584944745704974, "loss": 0.5807, "step": 61840 }, { "epoch": 1.3766470797720798, "grad_norm": 0.8264938592910767, "learning_rate": 0.00022580321556922722, "loss": 0.624, "step": 61850 }, { "epoch": 1.376869658119658, "grad_norm": 0.41498658061027527, "learning_rate": 0.00022575698227923687, "loss": 0.5096, "step": 61860 }, { "epoch": 1.3770922364672364, "grad_norm": 0.555334210395813, "learning_rate": 0.00022571074758959093, "loss": 0.5419, "step": 61870 }, { "epoch": 1.3773148148148149, "grad_norm": 0.6658076643943787, "learning_rate": 0.00022566451150280204, "loss": 0.5898, "step": 61880 }, { "epoch": 1.377537393162393, "grad_norm": 0.7786030769348145, "learning_rate": 0.00022561827402138248, "loss": 0.6097, "step": 61890 }, { "epoch": 1.3777599715099715, "grad_norm": 0.5594373345375061, "learning_rate": 0.00022557203514784498, "loss": 0.5618, "step": 61900 }, { "epoch": 1.37798254985755, "grad_norm": 0.4955253601074219, "learning_rate": 0.0002255257948847022, "loss": 0.43, "step": 61910 }, { "epoch": 1.3782051282051282, "grad_norm": 0.5867207646369934, "learning_rate": 0.00022547955323446673, "loss": 0.6679, "step": 61920 }, { "epoch": 1.3784277065527066, "grad_norm": 0.7317691445350647, "learning_rate": 0.00022543331019965154, "loss": 0.4994, "step": 61930 }, { "epoch": 1.3786502849002849, "grad_norm": 0.5208408832550049, "learning_rate": 0.0002253870657827694, "loss": 0.6063, "step": 61940 }, { "epoch": 1.3788728632478633, "grad_norm": 0.7552675008773804, "learning_rate": 0.00022534081998633343, "loss": 0.6639, "step": 61950 }, { "epoch": 1.3790954415954415, "grad_norm": 0.5934411287307739, "learning_rate": 0.00022529457281285646, "loss": 0.6692, "step": 61960 }, { "epoch": 1.37931801994302, "grad_norm": 0.7339460253715515, "learning_rate": 0.00022524832426485173, "loss": 0.6032, "step": 61970 }, { "epoch": 1.3795405982905984, "grad_norm": 0.7074295282363892, "learning_rate": 0.00022520207434483238, "loss": 0.5184, "step": 61980 }, { "epoch": 1.3797631766381766, "grad_norm": 0.42233505845069885, "learning_rate": 0.00022515582305531173, "loss": 0.5433, "step": 61990 }, { "epoch": 1.379985754985755, "grad_norm": 0.46188631653785706, "learning_rate": 0.000225109570398803, "loss": 0.5613, "step": 62000 }, { "epoch": 1.3801638176638176, "eval_loss": 0.576337993144989, "eval_runtime": 337.4259, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 62008 }, { "epoch": 1.3802083333333333, "grad_norm": 0.5289804339408875, "learning_rate": 0.00022506331637781965, "loss": 0.6027, "step": 62010 }, { "epoch": 1.3804309116809117, "grad_norm": 0.3944786787033081, "learning_rate": 0.00022501706099487524, "loss": 0.5538, "step": 62020 }, { "epoch": 1.38065349002849, "grad_norm": 0.6000606417655945, "learning_rate": 0.00022497080425248317, "loss": 0.7033, "step": 62030 }, { "epoch": 1.3808760683760684, "grad_norm": 0.5767252445220947, "learning_rate": 0.0002249245461531572, "loss": 0.5677, "step": 62040 }, { "epoch": 1.3810986467236468, "grad_norm": 0.5308343768119812, "learning_rate": 0.00022487828669941104, "loss": 0.582, "step": 62050 }, { "epoch": 1.381321225071225, "grad_norm": 0.540984034538269, "learning_rate": 0.00022483202589375836, "loss": 0.5718, "step": 62060 }, { "epoch": 1.3815438034188035, "grad_norm": 0.6492820382118225, "learning_rate": 0.00022478576373871306, "loss": 0.5682, "step": 62070 }, { "epoch": 1.381766381766382, "grad_norm": 0.5033608078956604, "learning_rate": 0.00022473950023678903, "loss": 0.5774, "step": 62080 }, { "epoch": 1.38198896011396, "grad_norm": 0.7015234231948853, "learning_rate": 0.00022469323539050037, "loss": 0.6103, "step": 62090 }, { "epoch": 1.3822115384615383, "grad_norm": 0.605975866317749, "learning_rate": 0.00022464696920236103, "loss": 0.502, "step": 62100 }, { "epoch": 1.3824341168091168, "grad_norm": 0.39481988549232483, "learning_rate": 0.00022460070167488523, "loss": 0.4986, "step": 62110 }, { "epoch": 1.3826566951566952, "grad_norm": 0.6863367557525635, "learning_rate": 0.00022455443281058722, "loss": 0.6544, "step": 62120 }, { "epoch": 1.3828792735042734, "grad_norm": 0.6480594873428345, "learning_rate": 0.0002245081626119812, "loss": 0.6128, "step": 62130 }, { "epoch": 1.3831018518518519, "grad_norm": 0.5644036531448364, "learning_rate": 0.00022446189108158148, "loss": 0.4863, "step": 62140 }, { "epoch": 1.3833244301994303, "grad_norm": 0.7015679478645325, "learning_rate": 0.00022441561822190265, "loss": 0.5432, "step": 62150 }, { "epoch": 1.3835470085470085, "grad_norm": 0.559040904045105, "learning_rate": 0.00022436934403545914, "loss": 0.4822, "step": 62160 }, { "epoch": 1.383769586894587, "grad_norm": 0.5581338405609131, "learning_rate": 0.00022432306852476547, "loss": 0.602, "step": 62170 }, { "epoch": 1.3839921652421652, "grad_norm": 0.4883200228214264, "learning_rate": 0.00022427679169233637, "loss": 0.5778, "step": 62180 }, { "epoch": 1.3842147435897436, "grad_norm": 0.912431001663208, "learning_rate": 0.00022423051354068658, "loss": 0.5926, "step": 62190 }, { "epoch": 1.3844373219373218, "grad_norm": 0.54873126745224, "learning_rate": 0.00022418423407233085, "loss": 0.4685, "step": 62200 }, { "epoch": 1.3846599002849003, "grad_norm": 0.4148489832878113, "learning_rate": 0.000224137953289784, "loss": 0.5606, "step": 62210 }, { "epoch": 1.3848824786324787, "grad_norm": 0.6492806673049927, "learning_rate": 0.000224091671195561, "loss": 0.4948, "step": 62220 }, { "epoch": 1.385105056980057, "grad_norm": 0.782181978225708, "learning_rate": 0.00022404538779217687, "loss": 0.5751, "step": 62230 }, { "epoch": 1.3853276353276354, "grad_norm": 0.47252678871154785, "learning_rate": 0.00022399910308214672, "loss": 0.4825, "step": 62240 }, { "epoch": 1.3855502136752138, "grad_norm": 0.6602449417114258, "learning_rate": 0.00022395281706798562, "loss": 0.6602, "step": 62250 }, { "epoch": 1.385772792022792, "grad_norm": 0.538071870803833, "learning_rate": 0.00022390652975220885, "loss": 0.4552, "step": 62260 }, { "epoch": 1.3859953703703702, "grad_norm": 0.7834280729293823, "learning_rate": 0.00022386024113733172, "loss": 0.5724, "step": 62270 }, { "epoch": 1.3862179487179487, "grad_norm": 0.7608660459518433, "learning_rate": 0.00022381395122586945, "loss": 0.6567, "step": 62280 }, { "epoch": 1.3864405270655271, "grad_norm": 0.7100602388381958, "learning_rate": 0.00022376766002033763, "loss": 0.626, "step": 62290 }, { "epoch": 1.3866631054131053, "grad_norm": 0.4596807062625885, "learning_rate": 0.00022372136752325176, "loss": 0.6701, "step": 62300 }, { "epoch": 1.3868856837606838, "grad_norm": 0.5603671669960022, "learning_rate": 0.00022367507373712727, "loss": 0.5665, "step": 62310 }, { "epoch": 1.3871082621082622, "grad_norm": 0.5945459008216858, "learning_rate": 0.00022362877866448, "loss": 0.5837, "step": 62320 }, { "epoch": 1.3873308404558404, "grad_norm": 0.688590943813324, "learning_rate": 0.00022358248230782546, "loss": 0.5475, "step": 62330 }, { "epoch": 1.3875534188034189, "grad_norm": 0.41545212268829346, "learning_rate": 0.00022353618466967957, "loss": 0.559, "step": 62340 }, { "epoch": 1.387775997150997, "grad_norm": 0.4486536979675293, "learning_rate": 0.0002234898857525581, "loss": 0.4332, "step": 62350 }, { "epoch": 1.3879985754985755, "grad_norm": 0.4931320250034332, "learning_rate": 0.00022344358555897702, "loss": 0.5444, "step": 62360 }, { "epoch": 1.3882211538461537, "grad_norm": 0.6781936287879944, "learning_rate": 0.00022339728409145236, "loss": 0.4987, "step": 62370 }, { "epoch": 1.3884437321937322, "grad_norm": 0.5794190764427185, "learning_rate": 0.00022335098135250006, "loss": 0.5242, "step": 62380 }, { "epoch": 1.3886663105413106, "grad_norm": 0.6140381097793579, "learning_rate": 0.00022330467734463637, "loss": 0.5791, "step": 62390 }, { "epoch": 1.3888888888888888, "grad_norm": 0.8430309891700745, "learning_rate": 0.0002232583720703774, "loss": 0.5393, "step": 62400 }, { "epoch": 1.3891114672364673, "grad_norm": 0.3692317605018616, "learning_rate": 0.00022321206553223947, "loss": 0.3972, "step": 62410 }, { "epoch": 1.3893340455840457, "grad_norm": 0.6154002547264099, "learning_rate": 0.00022316575773273888, "loss": 0.5825, "step": 62420 }, { "epoch": 1.389556623931624, "grad_norm": 0.62913978099823, "learning_rate": 0.00022311944867439208, "loss": 0.596, "step": 62430 }, { "epoch": 1.3897792022792022, "grad_norm": 0.8898218870162964, "learning_rate": 0.00022307313835971551, "loss": 0.6229, "step": 62440 }, { "epoch": 1.3900017806267806, "grad_norm": 0.7014033794403076, "learning_rate": 0.0002230268267912257, "loss": 0.6822, "step": 62450 }, { "epoch": 1.390224358974359, "grad_norm": 0.7933834195137024, "learning_rate": 0.00022298051397143928, "loss": 0.6245, "step": 62460 }, { "epoch": 1.3904469373219372, "grad_norm": 0.7982556223869324, "learning_rate": 0.0002229341999028729, "loss": 0.6596, "step": 62470 }, { "epoch": 1.3906695156695157, "grad_norm": 0.7251663208007812, "learning_rate": 0.00022288788458804334, "loss": 0.6613, "step": 62480 }, { "epoch": 1.3908920940170941, "grad_norm": 0.6037178635597229, "learning_rate": 0.00022284156802946737, "loss": 0.6531, "step": 62490 }, { "epoch": 1.3911146723646723, "grad_norm": 0.6063000559806824, "learning_rate": 0.00022279525022966189, "loss": 0.4803, "step": 62500 }, { "epoch": 1.3913372507122508, "grad_norm": 0.5215537548065186, "learning_rate": 0.00022274893119114388, "loss": 0.5963, "step": 62510 }, { "epoch": 1.391559829059829, "grad_norm": 0.7405494451522827, "learning_rate": 0.00022270261091643034, "loss": 0.5951, "step": 62520 }, { "epoch": 1.3917824074074074, "grad_norm": 0.7568892240524292, "learning_rate": 0.0002226562894080383, "loss": 0.4655, "step": 62530 }, { "epoch": 1.3920049857549857, "grad_norm": 0.8880526423454285, "learning_rate": 0.00022260996666848497, "loss": 0.7519, "step": 62540 }, { "epoch": 1.392227564102564, "grad_norm": 0.4250199794769287, "learning_rate": 0.00022256364270028752, "loss": 0.5116, "step": 62550 }, { "epoch": 1.3924501424501425, "grad_norm": 0.38995271921157837, "learning_rate": 0.00022251731750596326, "loss": 0.5977, "step": 62560 }, { "epoch": 1.3926727207977208, "grad_norm": 0.7418897151947021, "learning_rate": 0.00022247099108802952, "loss": 0.468, "step": 62570 }, { "epoch": 1.3928952991452992, "grad_norm": 0.5622426867485046, "learning_rate": 0.00022242466344900383, "loss": 0.4863, "step": 62580 }, { "epoch": 1.3931178774928774, "grad_norm": 0.3932307958602905, "learning_rate": 0.00022237833459140346, "loss": 0.6142, "step": 62590 }, { "epoch": 1.3933404558404558, "grad_norm": 0.6206909418106079, "learning_rate": 0.00022233200451774607, "loss": 0.5501, "step": 62600 }, { "epoch": 1.393563034188034, "grad_norm": 0.6179700493812561, "learning_rate": 0.00022228567323054934, "loss": 0.4821, "step": 62610 }, { "epoch": 1.3937856125356125, "grad_norm": 0.5563085675239563, "learning_rate": 0.0002222393407323308, "loss": 0.5044, "step": 62620 }, { "epoch": 1.394008190883191, "grad_norm": 0.6906719207763672, "learning_rate": 0.00022219300702560832, "loss": 0.4655, "step": 62630 }, { "epoch": 1.3942307692307692, "grad_norm": 0.7658056020736694, "learning_rate": 0.00022214667211289965, "loss": 0.6384, "step": 62640 }, { "epoch": 1.3944533475783476, "grad_norm": 0.5997521877288818, "learning_rate": 0.00022210033599672277, "loss": 0.5264, "step": 62650 }, { "epoch": 1.394675925925926, "grad_norm": 0.6423810124397278, "learning_rate": 0.00022205399867959545, "loss": 0.6993, "step": 62660 }, { "epoch": 1.3948985042735043, "grad_norm": 0.8208138346672058, "learning_rate": 0.00022200766016403577, "loss": 0.5452, "step": 62670 }, { "epoch": 1.3951210826210827, "grad_norm": 0.5556749105453491, "learning_rate": 0.0002219613204525619, "loss": 0.5979, "step": 62680 }, { "epoch": 1.395343660968661, "grad_norm": 0.5077260732650757, "learning_rate": 0.0002219149795476919, "loss": 0.4665, "step": 62690 }, { "epoch": 1.3955662393162394, "grad_norm": 0.6669285893440247, "learning_rate": 0.00022186863745194392, "loss": 0.6145, "step": 62700 }, { "epoch": 1.3957888176638176, "grad_norm": 0.7747873663902283, "learning_rate": 0.0002218222941678363, "loss": 0.4928, "step": 62710 }, { "epoch": 1.396011396011396, "grad_norm": 0.642152726650238, "learning_rate": 0.00022177594969788732, "loss": 0.5777, "step": 62720 }, { "epoch": 1.3962339743589745, "grad_norm": 0.5894016027450562, "learning_rate": 0.00022172960404461542, "loss": 0.6064, "step": 62730 }, { "epoch": 1.3964565527065527, "grad_norm": 0.6865715980529785, "learning_rate": 0.00022168325721053908, "loss": 0.5555, "step": 62740 }, { "epoch": 1.396679131054131, "grad_norm": 0.6753658652305603, "learning_rate": 0.00022163690919817678, "loss": 0.5101, "step": 62750 }, { "epoch": 1.3969017094017093, "grad_norm": 0.4573817551136017, "learning_rate": 0.0002215905600100471, "loss": 0.4022, "step": 62760 }, { "epoch": 1.3971242877492878, "grad_norm": 0.5543593764305115, "learning_rate": 0.0002215442096486687, "loss": 0.5375, "step": 62770 }, { "epoch": 1.397346866096866, "grad_norm": 0.6529911160469055, "learning_rate": 0.00022149785811656036, "loss": 0.5514, "step": 62780 }, { "epoch": 1.3975694444444444, "grad_norm": 0.5409330725669861, "learning_rate": 0.0002214515054162408, "loss": 0.5565, "step": 62790 }, { "epoch": 1.3977920227920229, "grad_norm": 0.4280526638031006, "learning_rate": 0.00022140515155022884, "loss": 0.5782, "step": 62800 }, { "epoch": 1.398014601139601, "grad_norm": 0.6719635725021362, "learning_rate": 0.00022135879652104344, "loss": 0.5799, "step": 62810 }, { "epoch": 1.3982371794871795, "grad_norm": 0.4967477023601532, "learning_rate": 0.00022131244033120359, "loss": 0.5666, "step": 62820 }, { "epoch": 1.398459757834758, "grad_norm": 0.6547058820724487, "learning_rate": 0.00022126608298322822, "loss": 0.6124, "step": 62830 }, { "epoch": 1.3986823361823362, "grad_norm": 0.8326950073242188, "learning_rate": 0.00022121972447963653, "loss": 0.6077, "step": 62840 }, { "epoch": 1.3989049145299146, "grad_norm": 0.8579636812210083, "learning_rate": 0.00022117336482294767, "loss": 0.6663, "step": 62850 }, { "epoch": 1.3991274928774928, "grad_norm": 0.6645912528038025, "learning_rate": 0.0002211270040156808, "loss": 0.5493, "step": 62860 }, { "epoch": 1.3993500712250713, "grad_norm": 0.5138288736343384, "learning_rate": 0.0002210806420603552, "loss": 0.52, "step": 62870 }, { "epoch": 1.3995726495726495, "grad_norm": 0.6342707276344299, "learning_rate": 0.00022103427895949027, "loss": 0.6662, "step": 62880 }, { "epoch": 1.399795227920228, "grad_norm": 0.6028152108192444, "learning_rate": 0.00022098791471560544, "loss": 0.4837, "step": 62890 }, { "epoch": 1.4000178062678064, "grad_norm": 0.4323852062225342, "learning_rate": 0.00022094154933122008, "loss": 0.4636, "step": 62900 }, { "epoch": 1.4002403846153846, "grad_norm": 0.6630343198776245, "learning_rate": 0.00022089518280885386, "loss": 0.6073, "step": 62910 }, { "epoch": 1.400462962962963, "grad_norm": 0.5092897415161133, "learning_rate": 0.00022084881515102627, "loss": 0.5255, "step": 62920 }, { "epoch": 1.4006855413105412, "grad_norm": 0.5960636138916016, "learning_rate": 0.00022080244636025703, "loss": 0.591, "step": 62930 }, { "epoch": 1.4009081196581197, "grad_norm": 0.46302658319473267, "learning_rate": 0.00022075607643906576, "loss": 0.5215, "step": 62940 }, { "epoch": 1.401130698005698, "grad_norm": 0.6542785167694092, "learning_rate": 0.00022070970538997232, "loss": 0.639, "step": 62950 }, { "epoch": 1.4013532763532763, "grad_norm": 0.4922322928905487, "learning_rate": 0.00022066333321549655, "loss": 0.6914, "step": 62960 }, { "epoch": 1.4015758547008548, "grad_norm": 0.6241610050201416, "learning_rate": 0.0002206169599181583, "loss": 0.574, "step": 62970 }, { "epoch": 1.401798433048433, "grad_norm": 0.7919242978096008, "learning_rate": 0.00022057058550047765, "loss": 0.5021, "step": 62980 }, { "epoch": 1.4020210113960114, "grad_norm": 0.4039008617401123, "learning_rate": 0.00022052420996497446, "loss": 0.5266, "step": 62990 }, { "epoch": 1.4022435897435899, "grad_norm": 0.7754238247871399, "learning_rate": 0.000220477833314169, "loss": 0.526, "step": 63000 }, { "epoch": 1.402466168091168, "grad_norm": 0.9635992646217346, "learning_rate": 0.00022043145555058122, "loss": 0.5949, "step": 63010 }, { "epoch": 1.4026887464387463, "grad_norm": 0.6123163104057312, "learning_rate": 0.00022038507667673142, "loss": 0.6486, "step": 63020 }, { "epoch": 1.4029113247863247, "grad_norm": 0.6298154592514038, "learning_rate": 0.00022033869669513996, "loss": 0.5848, "step": 63030 }, { "epoch": 1.4031339031339032, "grad_norm": 0.6101586222648621, "learning_rate": 0.00022029231560832701, "loss": 0.4707, "step": 63040 }, { "epoch": 1.4033564814814814, "grad_norm": 0.6360435485839844, "learning_rate": 0.000220245933418813, "loss": 0.4835, "step": 63050 }, { "epoch": 1.4035790598290598, "grad_norm": 0.4704650044441223, "learning_rate": 0.0002201995501291184, "loss": 0.4642, "step": 63060 }, { "epoch": 1.4038016381766383, "grad_norm": 0.383497953414917, "learning_rate": 0.00022015316574176374, "loss": 0.5225, "step": 63070 }, { "epoch": 1.4040242165242165, "grad_norm": 0.6637696623802185, "learning_rate": 0.00022010678025926952, "loss": 0.4968, "step": 63080 }, { "epoch": 1.404246794871795, "grad_norm": 0.5835416913032532, "learning_rate": 0.0002200603936841564, "loss": 0.5077, "step": 63090 }, { "epoch": 1.4044693732193732, "grad_norm": 0.3635801076889038, "learning_rate": 0.00022001400601894512, "loss": 0.6125, "step": 63100 }, { "epoch": 1.4046919515669516, "grad_norm": 0.8239157795906067, "learning_rate": 0.00021996761726615632, "loss": 0.7048, "step": 63110 }, { "epoch": 1.4049145299145298, "grad_norm": 0.3798117935657501, "learning_rate": 0.00021992122742831084, "loss": 0.5571, "step": 63120 }, { "epoch": 1.4051371082621082, "grad_norm": 0.634911298751831, "learning_rate": 0.00021987483650792955, "loss": 0.5474, "step": 63130 }, { "epoch": 1.4053596866096867, "grad_norm": 0.764754056930542, "learning_rate": 0.00021982844450753347, "loss": 0.6282, "step": 63140 }, { "epoch": 1.405582264957265, "grad_norm": 0.8541821241378784, "learning_rate": 0.00021978205142964336, "loss": 0.5572, "step": 63150 }, { "epoch": 1.4058048433048433, "grad_norm": 0.4713304042816162, "learning_rate": 0.0002197356572767804, "loss": 0.5569, "step": 63160 }, { "epoch": 1.4060274216524218, "grad_norm": 0.6356391906738281, "learning_rate": 0.00021968926205146575, "loss": 0.5644, "step": 63170 }, { "epoch": 1.40625, "grad_norm": 0.48272061347961426, "learning_rate": 0.00021964286575622044, "loss": 0.6363, "step": 63180 }, { "epoch": 1.4064725783475782, "grad_norm": 0.9424344897270203, "learning_rate": 0.0002195964683935657, "loss": 0.7512, "step": 63190 }, { "epoch": 1.4066951566951567, "grad_norm": 0.7565159797668457, "learning_rate": 0.0002195500699660228, "loss": 0.6125, "step": 63200 }, { "epoch": 1.406917735042735, "grad_norm": 0.8030708432197571, "learning_rate": 0.00021950367047611318, "loss": 0.6668, "step": 63210 }, { "epoch": 1.4071403133903133, "grad_norm": 0.46443840861320496, "learning_rate": 0.0002194572699263581, "loss": 0.5308, "step": 63220 }, { "epoch": 1.4073628917378918, "grad_norm": 0.5370081067085266, "learning_rate": 0.000219410868319279, "loss": 0.5857, "step": 63230 }, { "epoch": 1.4075854700854702, "grad_norm": 1.0536185503005981, "learning_rate": 0.00021936446565739748, "loss": 0.5349, "step": 63240 }, { "epoch": 1.4078080484330484, "grad_norm": 0.4503636956214905, "learning_rate": 0.00021931806194323508, "loss": 0.5168, "step": 63250 }, { "epoch": 1.4080306267806268, "grad_norm": 0.5147433280944824, "learning_rate": 0.0002192716571793133, "loss": 0.6778, "step": 63260 }, { "epoch": 1.408253205128205, "grad_norm": 0.5514103174209595, "learning_rate": 0.00021922525136815391, "loss": 0.5253, "step": 63270 }, { "epoch": 1.4084757834757835, "grad_norm": 0.6138553023338318, "learning_rate": 0.00021917884451227865, "loss": 0.5666, "step": 63280 }, { "epoch": 1.4086983618233617, "grad_norm": 0.6634931564331055, "learning_rate": 0.00021913243661420923, "loss": 0.6384, "step": 63290 }, { "epoch": 1.4089209401709402, "grad_norm": 0.8572415709495544, "learning_rate": 0.00021908602767646755, "loss": 0.5278, "step": 63300 }, { "epoch": 1.4091435185185186, "grad_norm": 0.7475055456161499, "learning_rate": 0.00021903961770157557, "loss": 0.4888, "step": 63310 }, { "epoch": 1.4093660968660968, "grad_norm": 0.634645402431488, "learning_rate": 0.0002189932066920551, "loss": 0.675, "step": 63320 }, { "epoch": 1.4095886752136753, "grad_norm": 0.6861944794654846, "learning_rate": 0.00021894679465042828, "loss": 0.6083, "step": 63330 }, { "epoch": 1.4098112535612537, "grad_norm": 0.5424638986587524, "learning_rate": 0.00021890038157921707, "loss": 0.5013, "step": 63340 }, { "epoch": 1.410033831908832, "grad_norm": 0.7189966440200806, "learning_rate": 0.00021885396748094372, "loss": 0.6209, "step": 63350 }, { "epoch": 1.4102564102564101, "grad_norm": 0.5611239075660706, "learning_rate": 0.00021880755235813033, "loss": 0.5145, "step": 63360 }, { "epoch": 1.4104789886039886, "grad_norm": 0.6758479475975037, "learning_rate": 0.00021876113621329912, "loss": 0.7104, "step": 63370 }, { "epoch": 1.410701566951567, "grad_norm": 0.5141634345054626, "learning_rate": 0.00021871471904897242, "loss": 0.4923, "step": 63380 }, { "epoch": 1.4109241452991452, "grad_norm": 0.5466700792312622, "learning_rate": 0.00021866830086767261, "loss": 0.6644, "step": 63390 }, { "epoch": 1.4111467236467237, "grad_norm": 0.4376499056816101, "learning_rate": 0.00021862188167192197, "loss": 0.55, "step": 63400 }, { "epoch": 1.411369301994302, "grad_norm": 0.5034909248352051, "learning_rate": 0.00021857546146424305, "loss": 0.5723, "step": 63410 }, { "epoch": 1.4115918803418803, "grad_norm": 0.656143307685852, "learning_rate": 0.00021852904024715838, "loss": 0.4818, "step": 63420 }, { "epoch": 1.4118144586894588, "grad_norm": 0.5316798090934753, "learning_rate": 0.00021848261802319047, "loss": 0.632, "step": 63430 }, { "epoch": 1.412037037037037, "grad_norm": 0.800273060798645, "learning_rate": 0.000218436194794862, "loss": 0.6068, "step": 63440 }, { "epoch": 1.4122596153846154, "grad_norm": 0.6114327907562256, "learning_rate": 0.00021838977056469556, "loss": 0.5579, "step": 63450 }, { "epoch": 1.4124821937321936, "grad_norm": 0.5441319346427917, "learning_rate": 0.00021834334533521398, "loss": 0.5263, "step": 63460 }, { "epoch": 1.412704772079772, "grad_norm": 0.738415002822876, "learning_rate": 0.00021829691910893998, "loss": 0.4855, "step": 63470 }, { "epoch": 1.4129273504273505, "grad_norm": 0.49261611700057983, "learning_rate": 0.00021825049188839643, "loss": 0.5376, "step": 63480 }, { "epoch": 1.4131499287749287, "grad_norm": 0.5762203931808472, "learning_rate": 0.0002182040636761062, "loss": 0.527, "step": 63490 }, { "epoch": 1.4133725071225072, "grad_norm": 0.46112143993377686, "learning_rate": 0.00021815763447459227, "loss": 0.5495, "step": 63500 }, { "epoch": 1.4135950854700854, "grad_norm": 0.5711562037467957, "learning_rate": 0.00021811120428637758, "loss": 0.5324, "step": 63510 }, { "epoch": 1.4138176638176638, "grad_norm": 0.7803882956504822, "learning_rate": 0.00021806477311398523, "loss": 0.5632, "step": 63520 }, { "epoch": 1.414040242165242, "grad_norm": 0.7463172674179077, "learning_rate": 0.00021801834095993834, "loss": 0.5532, "step": 63530 }, { "epoch": 1.4142628205128205, "grad_norm": 0.7782133221626282, "learning_rate": 0.00021797190782676005, "loss": 0.4673, "step": 63540 }, { "epoch": 1.414485398860399, "grad_norm": 0.4096747934818268, "learning_rate": 0.00021792547371697355, "loss": 0.6458, "step": 63550 }, { "epoch": 1.4147079772079771, "grad_norm": 0.7637385129928589, "learning_rate": 0.00021787903863310218, "loss": 0.5388, "step": 63560 }, { "epoch": 1.4149305555555556, "grad_norm": 0.7087598443031311, "learning_rate": 0.00021783260257766918, "loss": 0.5184, "step": 63570 }, { "epoch": 1.415153133903134, "grad_norm": 0.6480220556259155, "learning_rate": 0.00021778616555319795, "loss": 0.5198, "step": 63580 }, { "epoch": 1.4153757122507122, "grad_norm": 0.8158738017082214, "learning_rate": 0.0002177397275622119, "loss": 0.5879, "step": 63590 }, { "epoch": 1.4155982905982907, "grad_norm": 0.5854315757751465, "learning_rate": 0.0002176932886072346, "loss": 0.6099, "step": 63600 }, { "epoch": 1.415820868945869, "grad_norm": 0.6004104018211365, "learning_rate": 0.00021764684869078944, "loss": 0.5783, "step": 63610 }, { "epoch": 1.4160434472934473, "grad_norm": 0.5777664184570312, "learning_rate": 0.0002176004078154001, "loss": 0.604, "step": 63620 }, { "epoch": 1.4162660256410255, "grad_norm": 0.40372851490974426, "learning_rate": 0.00021755396598359026, "loss": 0.5008, "step": 63630 }, { "epoch": 1.416488603988604, "grad_norm": 0.8670433163642883, "learning_rate": 0.0002175075231978835, "loss": 0.6425, "step": 63640 }, { "epoch": 1.4167111823361824, "grad_norm": 0.9667596817016602, "learning_rate": 0.00021746107946080353, "loss": 0.5284, "step": 63650 }, { "epoch": 1.4169337606837606, "grad_norm": 0.5009735822677612, "learning_rate": 0.00021741463477487427, "loss": 0.494, "step": 63660 }, { "epoch": 1.417156339031339, "grad_norm": 0.6341816186904907, "learning_rate": 0.00021736818914261955, "loss": 0.5357, "step": 63670 }, { "epoch": 1.4173789173789173, "grad_norm": 0.6542185544967651, "learning_rate": 0.0002173217425665631, "loss": 0.6142, "step": 63680 }, { "epoch": 1.4176014957264957, "grad_norm": 0.4323403835296631, "learning_rate": 0.00021727529504922904, "loss": 0.568, "step": 63690 }, { "epoch": 1.417824074074074, "grad_norm": 0.7435312867164612, "learning_rate": 0.00021722884659314132, "loss": 0.4743, "step": 63700 }, { "epoch": 1.4180466524216524, "grad_norm": 0.6980502605438232, "learning_rate": 0.00021718239720082397, "loss": 0.5454, "step": 63710 }, { "epoch": 1.4182692307692308, "grad_norm": 0.9360513687133789, "learning_rate": 0.000217135946874801, "loss": 0.4378, "step": 63720 }, { "epoch": 1.418491809116809, "grad_norm": 0.6106828451156616, "learning_rate": 0.0002170894956175967, "loss": 0.518, "step": 63730 }, { "epoch": 1.4187143874643875, "grad_norm": 0.5087167024612427, "learning_rate": 0.0002170430434317353, "loss": 0.499, "step": 63740 }, { "epoch": 1.418936965811966, "grad_norm": 0.7681663632392883, "learning_rate": 0.00021699659031974088, "loss": 0.5969, "step": 63750 }, { "epoch": 1.4191595441595442, "grad_norm": 0.5107528567314148, "learning_rate": 0.00021695013628413788, "loss": 0.446, "step": 63760 }, { "epoch": 1.4193821225071226, "grad_norm": 0.6044454574584961, "learning_rate": 0.00021690368132745055, "loss": 0.5466, "step": 63770 }, { "epoch": 1.4196047008547008, "grad_norm": 0.5367859601974487, "learning_rate": 0.00021685722545220336, "loss": 0.6068, "step": 63780 }, { "epoch": 1.4198272792022792, "grad_norm": 0.6570670008659363, "learning_rate": 0.00021681076866092073, "loss": 0.6939, "step": 63790 }, { "epoch": 1.4200498575498575, "grad_norm": 0.5233334302902222, "learning_rate": 0.00021676431095612718, "loss": 0.5027, "step": 63800 }, { "epoch": 1.420272435897436, "grad_norm": 0.48035338521003723, "learning_rate": 0.00021671785234034726, "loss": 0.524, "step": 63810 }, { "epoch": 1.4204950142450143, "grad_norm": 0.7729887962341309, "learning_rate": 0.0002166713928161055, "loss": 0.6281, "step": 63820 }, { "epoch": 1.4207175925925926, "grad_norm": 0.9266574382781982, "learning_rate": 0.00021662493238592666, "loss": 0.5339, "step": 63830 }, { "epoch": 1.420940170940171, "grad_norm": 0.6719033122062683, "learning_rate": 0.0002165784710523354, "loss": 0.4944, "step": 63840 }, { "epoch": 1.4211627492877492, "grad_norm": 0.6520929336547852, "learning_rate": 0.0002165320088178564, "loss": 0.4898, "step": 63850 }, { "epoch": 1.4213853276353277, "grad_norm": 0.4981181025505066, "learning_rate": 0.00021648554568501455, "loss": 0.5676, "step": 63860 }, { "epoch": 1.4216079059829059, "grad_norm": 0.746509313583374, "learning_rate": 0.0002164390816563346, "loss": 0.4484, "step": 63870 }, { "epoch": 1.4218304843304843, "grad_norm": 0.6327930092811584, "learning_rate": 0.00021639261673434156, "loss": 0.5454, "step": 63880 }, { "epoch": 1.4220530626780628, "grad_norm": 0.7335301041603088, "learning_rate": 0.00021634615092156026, "loss": 0.5584, "step": 63890 }, { "epoch": 1.422275641025641, "grad_norm": 0.4814402163028717, "learning_rate": 0.0002162996842205158, "loss": 0.6066, "step": 63900 }, { "epoch": 1.4224982193732194, "grad_norm": 0.666235089302063, "learning_rate": 0.00021625321663373313, "loss": 0.5322, "step": 63910 }, { "epoch": 1.4227207977207978, "grad_norm": 0.8331716656684875, "learning_rate": 0.00021620674816373733, "loss": 0.4986, "step": 63920 }, { "epoch": 1.422943376068376, "grad_norm": 0.6989117860794067, "learning_rate": 0.00021616027881305353, "loss": 0.5071, "step": 63930 }, { "epoch": 1.4231659544159543, "grad_norm": 0.6009159088134766, "learning_rate": 0.00021611380858420698, "loss": 0.5468, "step": 63940 }, { "epoch": 1.4233885327635327, "grad_norm": 0.5715070366859436, "learning_rate": 0.0002160673374797229, "loss": 0.6371, "step": 63950 }, { "epoch": 1.4236111111111112, "grad_norm": 0.6078493595123291, "learning_rate": 0.00021602086550212657, "loss": 0.6225, "step": 63960 }, { "epoch": 1.4238336894586894, "grad_norm": 0.8261004090309143, "learning_rate": 0.00021597439265394326, "loss": 0.7131, "step": 63970 }, { "epoch": 1.4240562678062678, "grad_norm": 0.8117848634719849, "learning_rate": 0.0002159279189376984, "loss": 0.5462, "step": 63980 }, { "epoch": 1.4242788461538463, "grad_norm": 0.42046865820884705, "learning_rate": 0.0002158814443559174, "loss": 0.4954, "step": 63990 }, { "epoch": 1.4245014245014245, "grad_norm": 0.8559585809707642, "learning_rate": 0.00021583496891112566, "loss": 0.6157, "step": 64000 }, { "epoch": 1.424724002849003, "grad_norm": 0.9661999344825745, "learning_rate": 0.00021578849260584876, "loss": 0.5031, "step": 64010 }, { "epoch": 1.4249465811965811, "grad_norm": 0.6524626612663269, "learning_rate": 0.00021574201544261225, "loss": 0.5737, "step": 64020 }, { "epoch": 1.4251691595441596, "grad_norm": 0.7119362354278564, "learning_rate": 0.00021569553742394176, "loss": 0.6444, "step": 64030 }, { "epoch": 1.4253917378917378, "grad_norm": 0.5532019138336182, "learning_rate": 0.0002156490585523629, "loss": 0.4423, "step": 64040 }, { "epoch": 1.4256143162393162, "grad_norm": 0.5825714468955994, "learning_rate": 0.00021560257883040138, "loss": 0.5145, "step": 64050 }, { "epoch": 1.4258368945868947, "grad_norm": 0.5801465511322021, "learning_rate": 0.00021555609826058296, "loss": 0.5956, "step": 64060 }, { "epoch": 1.4260594729344729, "grad_norm": 0.5406079888343811, "learning_rate": 0.00021550961684543341, "loss": 0.5333, "step": 64070 }, { "epoch": 1.4262820512820513, "grad_norm": 0.637581467628479, "learning_rate": 0.0002154631345874786, "loss": 0.6044, "step": 64080 }, { "epoch": 1.4265046296296298, "grad_norm": 0.9882199764251709, "learning_rate": 0.00021541665148924443, "loss": 0.5219, "step": 64090 }, { "epoch": 1.426727207977208, "grad_norm": 0.6166510581970215, "learning_rate": 0.00021537016755325677, "loss": 0.5737, "step": 64100 }, { "epoch": 1.4269497863247862, "grad_norm": 0.6122153401374817, "learning_rate": 0.0002153236827820416, "loss": 0.4894, "step": 64110 }, { "epoch": 1.4271723646723646, "grad_norm": 0.4888789653778076, "learning_rate": 0.00021527719717812498, "loss": 0.5273, "step": 64120 }, { "epoch": 1.427394943019943, "grad_norm": 0.8074785470962524, "learning_rate": 0.00021523071074403295, "loss": 0.4874, "step": 64130 }, { "epoch": 1.4276175213675213, "grad_norm": 0.7119250893592834, "learning_rate": 0.0002151842234822916, "loss": 0.5333, "step": 64140 }, { "epoch": 1.4278400997150997, "grad_norm": 0.6475407481193542, "learning_rate": 0.00021513773539542715, "loss": 0.5837, "step": 64150 }, { "epoch": 1.4280626780626782, "grad_norm": 0.5739694237709045, "learning_rate": 0.00021509124648596582, "loss": 0.5146, "step": 64160 }, { "epoch": 1.4282852564102564, "grad_norm": 0.6470806002616882, "learning_rate": 0.00021504475675643376, "loss": 0.5036, "step": 64170 }, { "epoch": 1.4285078347578348, "grad_norm": 0.5994382500648499, "learning_rate": 0.00021499826620935726, "loss": 0.4994, "step": 64180 }, { "epoch": 1.428730413105413, "grad_norm": 0.8757331371307373, "learning_rate": 0.00021495177484726278, "loss": 0.5777, "step": 64190 }, { "epoch": 1.4289529914529915, "grad_norm": 0.5289104580879211, "learning_rate": 0.00021490528267267654, "loss": 0.535, "step": 64200 }, { "epoch": 1.4291755698005697, "grad_norm": 0.5589162111282349, "learning_rate": 0.00021485878968812504, "loss": 0.5024, "step": 64210 }, { "epoch": 1.4293981481481481, "grad_norm": 0.8635842204093933, "learning_rate": 0.0002148122958961348, "loss": 0.5991, "step": 64220 }, { "epoch": 1.4296207264957266, "grad_norm": 0.7705470323562622, "learning_rate": 0.00021476580129923227, "loss": 0.5253, "step": 64230 }, { "epoch": 1.4298433048433048, "grad_norm": 0.5482203960418701, "learning_rate": 0.00021471930589994393, "loss": 0.7469, "step": 64240 }, { "epoch": 1.4300658831908832, "grad_norm": 0.7359560132026672, "learning_rate": 0.0002146728097007965, "loss": 0.579, "step": 64250 }, { "epoch": 1.4302884615384617, "grad_norm": 0.8372468948364258, "learning_rate": 0.0002146263127043166, "loss": 0.684, "step": 64260 }, { "epoch": 1.43051103988604, "grad_norm": 0.7516504526138306, "learning_rate": 0.00021457981491303086, "loss": 0.6032, "step": 64270 }, { "epoch": 1.430733618233618, "grad_norm": 0.4100610613822937, "learning_rate": 0.00021453331632946605, "loss": 0.537, "step": 64280 }, { "epoch": 1.4309561965811965, "grad_norm": 0.7946826815605164, "learning_rate": 0.000214486816956149, "loss": 0.5378, "step": 64290 }, { "epoch": 1.431178774928775, "grad_norm": 0.6719437837600708, "learning_rate": 0.00021444031679560644, "loss": 0.6651, "step": 64300 }, { "epoch": 1.4314013532763532, "grad_norm": 0.6615647673606873, "learning_rate": 0.00021439381585036516, "loss": 0.5943, "step": 64310 }, { "epoch": 1.4316239316239316, "grad_norm": 0.5145480632781982, "learning_rate": 0.00021434731412295216, "loss": 0.5319, "step": 64320 }, { "epoch": 1.43184650997151, "grad_norm": 0.7475807070732117, "learning_rate": 0.00021430081161589442, "loss": 0.5558, "step": 64330 }, { "epoch": 1.4320690883190883, "grad_norm": 0.4000377058982849, "learning_rate": 0.00021425430833171887, "loss": 0.5037, "step": 64340 }, { "epoch": 1.4322916666666667, "grad_norm": 0.45284637808799744, "learning_rate": 0.0002142078042729525, "loss": 0.5588, "step": 64350 }, { "epoch": 1.432514245014245, "grad_norm": 0.5943143367767334, "learning_rate": 0.00021416129944212245, "loss": 0.5597, "step": 64360 }, { "epoch": 1.4327368233618234, "grad_norm": 0.9331151843070984, "learning_rate": 0.00021411479384175576, "loss": 0.6329, "step": 64370 }, { "epoch": 1.4329594017094016, "grad_norm": 0.5388138294219971, "learning_rate": 0.00021406828747437963, "loss": 0.5553, "step": 64380 }, { "epoch": 1.43318198005698, "grad_norm": 0.38518887758255005, "learning_rate": 0.0002140217803425212, "loss": 0.4621, "step": 64390 }, { "epoch": 1.4334045584045585, "grad_norm": 0.5624729990959167, "learning_rate": 0.0002139752724487078, "loss": 0.4825, "step": 64400 }, { "epoch": 1.4336271367521367, "grad_norm": 0.7365434169769287, "learning_rate": 0.00021392876379546665, "loss": 0.6077, "step": 64410 }, { "epoch": 1.4338497150997151, "grad_norm": 0.584722101688385, "learning_rate": 0.00021388225438532507, "loss": 0.5207, "step": 64420 }, { "epoch": 1.4340722934472934, "grad_norm": 0.6751091480255127, "learning_rate": 0.00021383574422081041, "loss": 0.4445, "step": 64430 }, { "epoch": 1.4342948717948718, "grad_norm": 0.9433859586715698, "learning_rate": 0.00021378923330445012, "loss": 0.6379, "step": 64440 }, { "epoch": 1.43451745014245, "grad_norm": 0.5465646386146545, "learning_rate": 0.00021374272163877155, "loss": 0.6188, "step": 64450 }, { "epoch": 1.4347400284900285, "grad_norm": 0.8962814807891846, "learning_rate": 0.00021369620922630228, "loss": 0.5778, "step": 64460 }, { "epoch": 1.434962606837607, "grad_norm": 0.7114683389663696, "learning_rate": 0.0002136496960695698, "loss": 0.6164, "step": 64470 }, { "epoch": 1.4351851851851851, "grad_norm": 0.5674235820770264, "learning_rate": 0.0002136031821711016, "loss": 0.419, "step": 64480 }, { "epoch": 1.4354077635327636, "grad_norm": 0.5753316879272461, "learning_rate": 0.00021355666753342537, "loss": 0.646, "step": 64490 }, { "epoch": 1.435630341880342, "grad_norm": 0.697969913482666, "learning_rate": 0.00021351015215906875, "loss": 0.5691, "step": 64500 }, { "epoch": 1.4358529202279202, "grad_norm": 0.5356051921844482, "learning_rate": 0.0002134636360505594, "loss": 0.517, "step": 64510 }, { "epoch": 1.4360754985754987, "grad_norm": 0.5579541921615601, "learning_rate": 0.000213417119210425, "loss": 0.5676, "step": 64520 }, { "epoch": 1.4362980769230769, "grad_norm": 0.7190403342247009, "learning_rate": 0.00021337060164119338, "loss": 0.5499, "step": 64530 }, { "epoch": 1.4365206552706553, "grad_norm": 0.40730300545692444, "learning_rate": 0.00021332408334539236, "loss": 0.671, "step": 64540 }, { "epoch": 1.4367432336182335, "grad_norm": 0.5748006105422974, "learning_rate": 0.00021327756432554975, "loss": 0.5321, "step": 64550 }, { "epoch": 1.436965811965812, "grad_norm": 0.7539768218994141, "learning_rate": 0.00021323104458419336, "loss": 0.4917, "step": 64560 }, { "epoch": 1.4371883903133904, "grad_norm": 0.6702855229377747, "learning_rate": 0.00021318452412385117, "loss": 0.7121, "step": 64570 }, { "epoch": 1.4374109686609686, "grad_norm": 0.6595850586891174, "learning_rate": 0.0002131380029470512, "loss": 0.5186, "step": 64580 }, { "epoch": 1.437633547008547, "grad_norm": 0.7162553668022156, "learning_rate": 0.00021309148105632137, "loss": 0.4905, "step": 64590 }, { "epoch": 1.4378561253561253, "grad_norm": 0.6966633796691895, "learning_rate": 0.00021304495845418973, "loss": 0.4671, "step": 64600 }, { "epoch": 1.4380787037037037, "grad_norm": 0.9159937500953674, "learning_rate": 0.0002129984351431844, "loss": 0.6021, "step": 64610 }, { "epoch": 1.438301282051282, "grad_norm": 0.6505913734436035, "learning_rate": 0.0002129519111258335, "loss": 0.5875, "step": 64620 }, { "epoch": 1.4385238603988604, "grad_norm": 0.5346196889877319, "learning_rate": 0.00021290538640466507, "loss": 0.518, "step": 64630 }, { "epoch": 1.4387464387464388, "grad_norm": 0.4172317683696747, "learning_rate": 0.00021285886098220736, "loss": 0.4858, "step": 64640 }, { "epoch": 1.438969017094017, "grad_norm": 0.6790648698806763, "learning_rate": 0.00021281233486098868, "loss": 0.5761, "step": 64650 }, { "epoch": 1.4391915954415955, "grad_norm": 0.44441673159599304, "learning_rate": 0.0002127658080435372, "loss": 0.4783, "step": 64660 }, { "epoch": 1.439414173789174, "grad_norm": 0.4197510778903961, "learning_rate": 0.00021271928053238125, "loss": 0.503, "step": 64670 }, { "epoch": 1.4396367521367521, "grad_norm": 0.3227827250957489, "learning_rate": 0.00021267275233004926, "loss": 0.4116, "step": 64680 }, { "epoch": 1.4398593304843303, "grad_norm": 0.8242856860160828, "learning_rate": 0.0002126262234390695, "loss": 0.4626, "step": 64690 }, { "epoch": 1.4400819088319088, "grad_norm": 0.5800617933273315, "learning_rate": 0.00021257969386197042, "loss": 0.5454, "step": 64700 }, { "epoch": 1.4401709401709402, "eval_loss": 0.5727022886276245, "eval_runtime": 337.62, "eval_samples_per_second": 7.005, "eval_steps_per_second": 7.005, "step": 64704 }, { "epoch": 1.4403044871794872, "grad_norm": 0.45175546407699585, "learning_rate": 0.00021253316360128038, "loss": 0.4923, "step": 64710 }, { "epoch": 1.4405270655270654, "grad_norm": 0.6314474940299988, "learning_rate": 0.0002124866326595281, "loss": 0.432, "step": 64720 }, { "epoch": 1.4407496438746439, "grad_norm": 0.5468671321868896, "learning_rate": 0.00021244010103924193, "loss": 0.5427, "step": 64730 }, { "epoch": 1.4409722222222223, "grad_norm": 0.5443201065063477, "learning_rate": 0.00021239356874295045, "loss": 0.5005, "step": 64740 }, { "epoch": 1.4411948005698005, "grad_norm": 0.6474123001098633, "learning_rate": 0.00021234703577318237, "loss": 0.4838, "step": 64750 }, { "epoch": 1.441417378917379, "grad_norm": 0.7085283994674683, "learning_rate": 0.00021230050213246626, "loss": 0.5677, "step": 64760 }, { "epoch": 1.4416399572649572, "grad_norm": 0.5790860652923584, "learning_rate": 0.0002122539678233307, "loss": 0.6691, "step": 64770 }, { "epoch": 1.4418625356125356, "grad_norm": 0.6387883424758911, "learning_rate": 0.00021220743284830457, "loss": 0.4925, "step": 64780 }, { "epoch": 1.4420851139601139, "grad_norm": 0.6226698160171509, "learning_rate": 0.00021216089720991655, "loss": 0.5552, "step": 64790 }, { "epoch": 1.4423076923076923, "grad_norm": 0.5773744583129883, "learning_rate": 0.00021211436091069538, "loss": 0.4695, "step": 64800 }, { "epoch": 1.4425302706552707, "grad_norm": 0.6607078313827515, "learning_rate": 0.00021206782395316996, "loss": 0.6169, "step": 64810 }, { "epoch": 1.442752849002849, "grad_norm": 0.6033298373222351, "learning_rate": 0.0002120212863398691, "loss": 0.5271, "step": 64820 }, { "epoch": 1.4429754273504274, "grad_norm": 0.5513878464698792, "learning_rate": 0.0002119747480733217, "loss": 0.64, "step": 64830 }, { "epoch": 1.4431980056980058, "grad_norm": 0.44069719314575195, "learning_rate": 0.00021192820915605666, "loss": 0.4799, "step": 64840 }, { "epoch": 1.443420584045584, "grad_norm": 0.5613614320755005, "learning_rate": 0.00021188166959060296, "loss": 0.5006, "step": 64850 }, { "epoch": 1.4436431623931623, "grad_norm": 0.6709674596786499, "learning_rate": 0.00021183512937948966, "loss": 0.6678, "step": 64860 }, { "epoch": 1.4438657407407407, "grad_norm": 0.4399740695953369, "learning_rate": 0.0002117885885252457, "loss": 0.4794, "step": 64870 }, { "epoch": 1.4440883190883191, "grad_norm": 0.513027548789978, "learning_rate": 0.00021174204703040022, "loss": 0.5169, "step": 64880 }, { "epoch": 1.4443108974358974, "grad_norm": 0.6490597128868103, "learning_rate": 0.00021169550489748225, "loss": 0.504, "step": 64890 }, { "epoch": 1.4445334757834758, "grad_norm": 0.7682234048843384, "learning_rate": 0.000211648962129021, "loss": 0.6058, "step": 64900 }, { "epoch": 1.4447560541310542, "grad_norm": 0.6964247822761536, "learning_rate": 0.0002116024187275456, "loss": 0.6491, "step": 64910 }, { "epoch": 1.4449786324786325, "grad_norm": 0.5793104767799377, "learning_rate": 0.0002115558746955853, "loss": 0.4567, "step": 64920 }, { "epoch": 1.445201210826211, "grad_norm": 0.4298732876777649, "learning_rate": 0.00021150933003566928, "loss": 0.5615, "step": 64930 }, { "epoch": 1.445423789173789, "grad_norm": 0.6879767179489136, "learning_rate": 0.0002114627847503268, "loss": 0.5527, "step": 64940 }, { "epoch": 1.4456463675213675, "grad_norm": 0.7321903705596924, "learning_rate": 0.00021141623884208733, "loss": 0.5424, "step": 64950 }, { "epoch": 1.4458689458689458, "grad_norm": 0.7019228935241699, "learning_rate": 0.00021136969231347997, "loss": 0.5813, "step": 64960 }, { "epoch": 1.4460915242165242, "grad_norm": 0.5599492192268372, "learning_rate": 0.00021132314516703434, "loss": 0.5678, "step": 64970 }, { "epoch": 1.4463141025641026, "grad_norm": 0.3333553373813629, "learning_rate": 0.00021127659740527964, "loss": 0.4524, "step": 64980 }, { "epoch": 1.4465366809116809, "grad_norm": 0.777064323425293, "learning_rate": 0.00021123004903074541, "loss": 0.5751, "step": 64990 }, { "epoch": 1.4467592592592593, "grad_norm": 0.5544342994689941, "learning_rate": 0.00021118350004596117, "loss": 0.6019, "step": 65000 }, { "epoch": 1.4469818376068377, "grad_norm": 0.6325475573539734, "learning_rate": 0.0002111369504534564, "loss": 0.5359, "step": 65010 }, { "epoch": 1.447204415954416, "grad_norm": 0.5768696069717407, "learning_rate": 0.00021109040025576054, "loss": 0.5335, "step": 65020 }, { "epoch": 1.4474269943019942, "grad_norm": 0.7404335737228394, "learning_rate": 0.00021104384945540327, "loss": 0.5483, "step": 65030 }, { "epoch": 1.4476495726495726, "grad_norm": 0.654624879360199, "learning_rate": 0.00021099729805491423, "loss": 0.6153, "step": 65040 }, { "epoch": 1.447872150997151, "grad_norm": 0.5879744291305542, "learning_rate": 0.00021095074605682296, "loss": 0.5576, "step": 65050 }, { "epoch": 1.4480947293447293, "grad_norm": 0.5990705490112305, "learning_rate": 0.00021090419346365922, "loss": 0.5858, "step": 65060 }, { "epoch": 1.4483173076923077, "grad_norm": 0.7336555123329163, "learning_rate": 0.0002108576402779527, "loss": 0.4919, "step": 65070 }, { "epoch": 1.4485398860398861, "grad_norm": 0.6426530480384827, "learning_rate": 0.00021081108650223312, "loss": 0.6436, "step": 65080 }, { "epoch": 1.4487624643874644, "grad_norm": 0.5368412137031555, "learning_rate": 0.0002107645321390302, "loss": 0.6246, "step": 65090 }, { "epoch": 1.4489850427350428, "grad_norm": 0.9056705832481384, "learning_rate": 0.0002107179771908738, "loss": 0.615, "step": 65100 }, { "epoch": 1.449207621082621, "grad_norm": 0.9572508931159973, "learning_rate": 0.00021067142166029376, "loss": 0.5894, "step": 65110 }, { "epoch": 1.4494301994301995, "grad_norm": 0.607247531414032, "learning_rate": 0.00021062486554981988, "loss": 0.6655, "step": 65120 }, { "epoch": 1.4496527777777777, "grad_norm": 0.7506188750267029, "learning_rate": 0.00021057830886198216, "loss": 0.5114, "step": 65130 }, { "epoch": 1.4498753561253561, "grad_norm": 0.7134078145027161, "learning_rate": 0.00021053175159931056, "loss": 0.636, "step": 65140 }, { "epoch": 1.4500979344729346, "grad_norm": 0.7259635925292969, "learning_rate": 0.00021048519376433485, "loss": 0.5056, "step": 65150 }, { "epoch": 1.4503205128205128, "grad_norm": 0.8849378824234009, "learning_rate": 0.0002104386353595851, "loss": 0.748, "step": 65160 }, { "epoch": 1.4505430911680912, "grad_norm": 0.45202553272247314, "learning_rate": 0.00021039207638759138, "loss": 0.627, "step": 65170 }, { "epoch": 1.4507656695156697, "grad_norm": 0.6308692097663879, "learning_rate": 0.0002103455168508838, "loss": 0.6249, "step": 65180 }, { "epoch": 1.4509882478632479, "grad_norm": 0.748424232006073, "learning_rate": 0.00021029895675199226, "loss": 0.572, "step": 65190 }, { "epoch": 1.451210826210826, "grad_norm": 0.7193168997764587, "learning_rate": 0.00021025239609344701, "loss": 0.4869, "step": 65200 }, { "epoch": 1.4514334045584045, "grad_norm": 0.47657930850982666, "learning_rate": 0.00021020583487777828, "loss": 0.6517, "step": 65210 }, { "epoch": 1.451655982905983, "grad_norm": 0.42479145526885986, "learning_rate": 0.00021015927310751598, "loss": 0.4743, "step": 65220 }, { "epoch": 1.4518785612535612, "grad_norm": 0.8069111108779907, "learning_rate": 0.00021011271078519054, "loss": 0.6877, "step": 65230 }, { "epoch": 1.4521011396011396, "grad_norm": 0.31496503949165344, "learning_rate": 0.00021006614791333205, "loss": 0.5257, "step": 65240 }, { "epoch": 1.452323717948718, "grad_norm": 0.30991029739379883, "learning_rate": 0.00021001958449447087, "loss": 0.5881, "step": 65250 }, { "epoch": 1.4525462962962963, "grad_norm": 0.5215082168579102, "learning_rate": 0.0002099730205311373, "loss": 0.4965, "step": 65260 }, { "epoch": 1.4527688746438747, "grad_norm": 0.6242929100990295, "learning_rate": 0.00020992645602586164, "loss": 0.5573, "step": 65270 }, { "epoch": 1.452991452991453, "grad_norm": 0.5348063707351685, "learning_rate": 0.0002098798909811742, "loss": 0.5565, "step": 65280 }, { "epoch": 1.4532140313390314, "grad_norm": 0.527005136013031, "learning_rate": 0.00020983332539960538, "loss": 0.6289, "step": 65290 }, { "epoch": 1.4534366096866096, "grad_norm": 0.792716920375824, "learning_rate": 0.0002097867592836856, "loss": 0.5747, "step": 65300 }, { "epoch": 1.453659188034188, "grad_norm": 0.6454300880432129, "learning_rate": 0.00020974019263594534, "loss": 0.6055, "step": 65310 }, { "epoch": 1.4538817663817665, "grad_norm": 0.4974537491798401, "learning_rate": 0.00020969362545891507, "loss": 0.4444, "step": 65320 }, { "epoch": 1.4541043447293447, "grad_norm": 0.46109578013420105, "learning_rate": 0.00020964705775512518, "loss": 0.504, "step": 65330 }, { "epoch": 1.4543269230769231, "grad_norm": 0.7986196279525757, "learning_rate": 0.00020960048952710632, "loss": 0.5724, "step": 65340 }, { "epoch": 1.4545495014245013, "grad_norm": 0.43146848678588867, "learning_rate": 0.00020955392077738903, "loss": 0.5882, "step": 65350 }, { "epoch": 1.4547720797720798, "grad_norm": 0.8425827026367188, "learning_rate": 0.0002095073515085038, "loss": 0.5565, "step": 65360 }, { "epoch": 1.454994658119658, "grad_norm": 0.6973526477813721, "learning_rate": 0.00020946078172298137, "loss": 0.4806, "step": 65370 }, { "epoch": 1.4552172364672364, "grad_norm": 0.4596269726753235, "learning_rate": 0.00020941421142335224, "loss": 0.5385, "step": 65380 }, { "epoch": 1.4554398148148149, "grad_norm": 0.7151857018470764, "learning_rate": 0.0002093676406121472, "loss": 0.565, "step": 65390 }, { "epoch": 1.455662393162393, "grad_norm": 0.5673050880432129, "learning_rate": 0.00020932106929189695, "loss": 0.5585, "step": 65400 }, { "epoch": 1.4558849715099715, "grad_norm": 0.683448314666748, "learning_rate": 0.0002092744974651321, "loss": 0.6157, "step": 65410 }, { "epoch": 1.45610754985755, "grad_norm": 0.5907111167907715, "learning_rate": 0.00020922792513438347, "loss": 0.5675, "step": 65420 }, { "epoch": 1.4563301282051282, "grad_norm": 0.5666101574897766, "learning_rate": 0.0002091813523021818, "loss": 0.5541, "step": 65430 }, { "epoch": 1.4565527065527066, "grad_norm": 0.891807496547699, "learning_rate": 0.00020913477897105797, "loss": 0.5664, "step": 65440 }, { "epoch": 1.4567752849002849, "grad_norm": 0.691290020942688, "learning_rate": 0.00020908820514354274, "loss": 0.5416, "step": 65450 }, { "epoch": 1.4569978632478633, "grad_norm": 0.5921575427055359, "learning_rate": 0.00020904163082216708, "loss": 0.5611, "step": 65460 }, { "epoch": 1.4572204415954415, "grad_norm": 0.6684668660163879, "learning_rate": 0.00020899505600946173, "loss": 0.5772, "step": 65470 }, { "epoch": 1.45744301994302, "grad_norm": 0.6148405075073242, "learning_rate": 0.0002089484807079577, "loss": 0.4741, "step": 65480 }, { "epoch": 1.4576655982905984, "grad_norm": 0.6442460417747498, "learning_rate": 0.00020890190492018596, "loss": 0.4478, "step": 65490 }, { "epoch": 1.4578881766381766, "grad_norm": 0.6702595949172974, "learning_rate": 0.00020885532864867732, "loss": 0.553, "step": 65500 }, { "epoch": 1.458110754985755, "grad_norm": 0.8547864556312561, "learning_rate": 0.0002088087518959629, "loss": 0.5483, "step": 65510 }, { "epoch": 1.4583333333333333, "grad_norm": 0.7362828850746155, "learning_rate": 0.0002087621746645737, "loss": 0.639, "step": 65520 }, { "epoch": 1.4585559116809117, "grad_norm": 0.914097785949707, "learning_rate": 0.00020871559695704073, "loss": 0.567, "step": 65530 }, { "epoch": 1.45877849002849, "grad_norm": 0.5775737166404724, "learning_rate": 0.00020866901877589515, "loss": 0.5771, "step": 65540 }, { "epoch": 1.4590010683760684, "grad_norm": 0.8055770397186279, "learning_rate": 0.00020862244012366792, "loss": 0.5812, "step": 65550 }, { "epoch": 1.4592236467236468, "grad_norm": 0.6688644289970398, "learning_rate": 0.00020857586100289034, "loss": 0.6124, "step": 65560 }, { "epoch": 1.459446225071225, "grad_norm": 0.6874992847442627, "learning_rate": 0.00020852928141609333, "loss": 0.6507, "step": 65570 }, { "epoch": 1.4596688034188035, "grad_norm": 0.398538202047348, "learning_rate": 0.00020848270136580822, "loss": 0.611, "step": 65580 }, { "epoch": 1.459891381766382, "grad_norm": 0.6153919100761414, "learning_rate": 0.0002084361208545662, "loss": 0.6638, "step": 65590 }, { "epoch": 1.46011396011396, "grad_norm": 0.6108056902885437, "learning_rate": 0.00020838953988489852, "loss": 0.645, "step": 65600 }, { "epoch": 1.4603365384615383, "grad_norm": 0.5398427248001099, "learning_rate": 0.0002083429584593363, "loss": 0.619, "step": 65610 }, { "epoch": 1.4605591168091168, "grad_norm": 0.5409917235374451, "learning_rate": 0.00020829637658041086, "loss": 0.5931, "step": 65620 }, { "epoch": 1.4607816951566952, "grad_norm": 0.9313738346099854, "learning_rate": 0.0002082497942506536, "loss": 0.6415, "step": 65630 }, { "epoch": 1.4610042735042734, "grad_norm": 0.6145248413085938, "learning_rate": 0.0002082032114725957, "loss": 0.6641, "step": 65640 }, { "epoch": 1.4612268518518519, "grad_norm": 0.6647670865058899, "learning_rate": 0.00020815662824876858, "loss": 0.6287, "step": 65650 }, { "epoch": 1.4614494301994303, "grad_norm": 0.7241197228431702, "learning_rate": 0.00020811004458170366, "loss": 0.5647, "step": 65660 }, { "epoch": 1.4616720085470085, "grad_norm": 0.7002893686294556, "learning_rate": 0.00020806346047393226, "loss": 0.6097, "step": 65670 }, { "epoch": 1.461894586894587, "grad_norm": 0.40175196528434753, "learning_rate": 0.00020801687592798582, "loss": 0.5823, "step": 65680 }, { "epoch": 1.4621171652421652, "grad_norm": 0.5566397905349731, "learning_rate": 0.00020797029094639572, "loss": 0.5513, "step": 65690 }, { "epoch": 1.4623397435897436, "grad_norm": 0.5421175956726074, "learning_rate": 0.00020792370553169355, "loss": 0.6408, "step": 65700 }, { "epoch": 1.4625623219373218, "grad_norm": 0.4728619158267975, "learning_rate": 0.00020787711968641071, "loss": 0.5313, "step": 65710 }, { "epoch": 1.4627849002849003, "grad_norm": 0.5809181332588196, "learning_rate": 0.0002078305334130787, "loss": 0.4613, "step": 65720 }, { "epoch": 1.4630074786324787, "grad_norm": 0.5622936487197876, "learning_rate": 0.00020778394671422916, "loss": 0.4403, "step": 65730 }, { "epoch": 1.463230056980057, "grad_norm": 0.5469698905944824, "learning_rate": 0.0002077373595923936, "loss": 0.6044, "step": 65740 }, { "epoch": 1.4634526353276354, "grad_norm": 0.8442160487174988, "learning_rate": 0.00020769077205010352, "loss": 0.6237, "step": 65750 }, { "epoch": 1.4636752136752138, "grad_norm": 0.5504755973815918, "learning_rate": 0.00020764418408989062, "loss": 0.5859, "step": 65760 }, { "epoch": 1.463897792022792, "grad_norm": 0.7845527529716492, "learning_rate": 0.00020759759571428648, "loss": 0.6043, "step": 65770 }, { "epoch": 1.4641203703703702, "grad_norm": 0.5308002829551697, "learning_rate": 0.00020755100692582275, "loss": 0.4304, "step": 65780 }, { "epoch": 1.4643429487179487, "grad_norm": 1.0177899599075317, "learning_rate": 0.00020750441772703114, "loss": 0.5702, "step": 65790 }, { "epoch": 1.4645655270655271, "grad_norm": 0.8551054000854492, "learning_rate": 0.0002074578281204434, "loss": 0.6045, "step": 65800 }, { "epoch": 1.4647881054131053, "grad_norm": 0.6332492828369141, "learning_rate": 0.00020741123810859112, "loss": 0.4905, "step": 65810 }, { "epoch": 1.4650106837606838, "grad_norm": 0.6236631274223328, "learning_rate": 0.0002073646476940061, "loss": 0.6526, "step": 65820 }, { "epoch": 1.4652332621082622, "grad_norm": 0.5618539452552795, "learning_rate": 0.00020731805687922004, "loss": 0.5197, "step": 65830 }, { "epoch": 1.4654558404558404, "grad_norm": 0.5351765751838684, "learning_rate": 0.00020727146566676486, "loss": 0.5043, "step": 65840 }, { "epoch": 1.4656784188034189, "grad_norm": 0.5465881824493408, "learning_rate": 0.00020722487405917223, "loss": 0.5711, "step": 65850 }, { "epoch": 1.465900997150997, "grad_norm": 0.7434455752372742, "learning_rate": 0.00020717828205897405, "loss": 0.5675, "step": 65860 }, { "epoch": 1.4661235754985755, "grad_norm": 0.7556038498878479, "learning_rate": 0.00020713168966870216, "loss": 0.5711, "step": 65870 }, { "epoch": 1.4663461538461537, "grad_norm": 0.6555251479148865, "learning_rate": 0.0002070850968908884, "loss": 0.5165, "step": 65880 }, { "epoch": 1.4665687321937322, "grad_norm": 0.9948902130126953, "learning_rate": 0.00020703850372806465, "loss": 0.4885, "step": 65890 }, { "epoch": 1.4667913105413106, "grad_norm": 0.7312471270561218, "learning_rate": 0.00020699191018276288, "loss": 0.522, "step": 65900 }, { "epoch": 1.4670138888888888, "grad_norm": 0.567846417427063, "learning_rate": 0.00020694531625751496, "loss": 0.5453, "step": 65910 }, { "epoch": 1.4672364672364673, "grad_norm": 0.5828446745872498, "learning_rate": 0.00020689872195485287, "loss": 0.5965, "step": 65920 }, { "epoch": 1.4674590455840457, "grad_norm": 0.3739456534385681, "learning_rate": 0.00020685212727730864, "loss": 0.6246, "step": 65930 }, { "epoch": 1.467681623931624, "grad_norm": 0.6678730845451355, "learning_rate": 0.00020680553222741414, "loss": 0.5216, "step": 65940 }, { "epoch": 1.4679042022792022, "grad_norm": 0.6459123492240906, "learning_rate": 0.0002067589368077015, "loss": 0.5315, "step": 65950 }, { "epoch": 1.4681267806267806, "grad_norm": 0.6650933027267456, "learning_rate": 0.00020671234102070263, "loss": 0.6411, "step": 65960 }, { "epoch": 1.468349358974359, "grad_norm": 0.6498469114303589, "learning_rate": 0.0002066657448689497, "loss": 0.5779, "step": 65970 }, { "epoch": 1.4685719373219372, "grad_norm": 0.5796330571174622, "learning_rate": 0.00020661914835497474, "loss": 0.5096, "step": 65980 }, { "epoch": 1.4687945156695157, "grad_norm": 0.5412297248840332, "learning_rate": 0.00020657255148130984, "loss": 0.5169, "step": 65990 }, { "epoch": 1.4690170940170941, "grad_norm": 0.4486381411552429, "learning_rate": 0.00020652595425048705, "loss": 0.5125, "step": 66000 }, { "epoch": 1.4692396723646723, "grad_norm": 0.6411442756652832, "learning_rate": 0.00020647935666503862, "loss": 0.5713, "step": 66010 }, { "epoch": 1.4694622507122508, "grad_norm": 0.5224676132202148, "learning_rate": 0.00020643275872749665, "loss": 0.7264, "step": 66020 }, { "epoch": 1.469684829059829, "grad_norm": 0.4064271152019501, "learning_rate": 0.00020638616044039328, "loss": 0.5996, "step": 66030 }, { "epoch": 1.4699074074074074, "grad_norm": 0.7597877383232117, "learning_rate": 0.00020633956180626074, "loss": 0.508, "step": 66040 }, { "epoch": 1.4701299857549857, "grad_norm": 0.6439722180366516, "learning_rate": 0.00020629296282763125, "loss": 0.4433, "step": 66050 }, { "epoch": 1.470352564102564, "grad_norm": 0.7077510952949524, "learning_rate": 0.000206246363507037, "loss": 0.6232, "step": 66060 }, { "epoch": 1.4705751424501425, "grad_norm": 0.6614882349967957, "learning_rate": 0.0002061997638470102, "loss": 0.5851, "step": 66070 }, { "epoch": 1.4707977207977208, "grad_norm": 0.5982300043106079, "learning_rate": 0.00020615316385008315, "loss": 0.4403, "step": 66080 }, { "epoch": 1.4710202991452992, "grad_norm": 0.6720724701881409, "learning_rate": 0.0002061065635187882, "loss": 0.5336, "step": 66090 }, { "epoch": 1.4712428774928774, "grad_norm": 0.6834327578544617, "learning_rate": 0.0002060599628556575, "loss": 0.5241, "step": 66100 }, { "epoch": 1.4714654558404558, "grad_norm": 0.5241697430610657, "learning_rate": 0.00020601336186322353, "loss": 0.576, "step": 66110 }, { "epoch": 1.471688034188034, "grad_norm": 0.7107514142990112, "learning_rate": 0.00020596676054401858, "loss": 0.5049, "step": 66120 }, { "epoch": 1.4719106125356125, "grad_norm": 0.4992298185825348, "learning_rate": 0.00020592015890057494, "loss": 0.5305, "step": 66130 }, { "epoch": 1.472133190883191, "grad_norm": 0.6178436875343323, "learning_rate": 0.000205873556935425, "loss": 0.5717, "step": 66140 }, { "epoch": 1.4723557692307692, "grad_norm": 0.442743718624115, "learning_rate": 0.0002058269546511012, "loss": 0.5183, "step": 66150 }, { "epoch": 1.4725783475783476, "grad_norm": 0.5500785708427429, "learning_rate": 0.0002057803520501359, "loss": 0.493, "step": 66160 }, { "epoch": 1.472800925925926, "grad_norm": 0.6199305653572083, "learning_rate": 0.00020573374913506148, "loss": 0.5522, "step": 66170 }, { "epoch": 1.4730235042735043, "grad_norm": 0.6486396193504333, "learning_rate": 0.00020568714590841046, "loss": 0.6705, "step": 66180 }, { "epoch": 1.4732460826210827, "grad_norm": 0.7136431932449341, "learning_rate": 0.00020564054237271536, "loss": 0.6631, "step": 66190 }, { "epoch": 1.473468660968661, "grad_norm": 0.5296502113342285, "learning_rate": 0.00020559393853050853, "loss": 0.5054, "step": 66200 }, { "epoch": 1.4736912393162394, "grad_norm": 0.5589050054550171, "learning_rate": 0.00020554733438432247, "loss": 0.5524, "step": 66210 }, { "epoch": 1.4739138176638176, "grad_norm": 0.5227344632148743, "learning_rate": 0.00020550072993668974, "loss": 0.5718, "step": 66220 }, { "epoch": 1.474136396011396, "grad_norm": 0.6966056227684021, "learning_rate": 0.00020545412519014285, "loss": 0.5455, "step": 66230 }, { "epoch": 1.4743589743589745, "grad_norm": 0.5115660429000854, "learning_rate": 0.00020540752014721432, "loss": 0.5812, "step": 66240 }, { "epoch": 1.4745815527065527, "grad_norm": 0.5743473768234253, "learning_rate": 0.00020536091481043668, "loss": 0.5892, "step": 66250 }, { "epoch": 1.474804131054131, "grad_norm": 0.4098198413848877, "learning_rate": 0.00020531430918234258, "loss": 0.5955, "step": 66260 }, { "epoch": 1.4750267094017093, "grad_norm": 0.6191383600234985, "learning_rate": 0.00020526770326546463, "loss": 0.4784, "step": 66270 }, { "epoch": 1.4752492877492878, "grad_norm": 0.6457961201667786, "learning_rate": 0.00020522109706233525, "loss": 0.5828, "step": 66280 }, { "epoch": 1.475471866096866, "grad_norm": 0.628578782081604, "learning_rate": 0.00020517449057548724, "loss": 0.6064, "step": 66290 }, { "epoch": 1.4756944444444444, "grad_norm": 0.6337209343910217, "learning_rate": 0.0002051278838074532, "loss": 0.5404, "step": 66300 }, { "epoch": 1.4759170227920229, "grad_norm": 0.5599583387374878, "learning_rate": 0.00020508127676076572, "loss": 0.5639, "step": 66310 }, { "epoch": 1.476139601139601, "grad_norm": 0.8558855652809143, "learning_rate": 0.00020503466943795756, "loss": 0.5392, "step": 66320 }, { "epoch": 1.4763621794871795, "grad_norm": 0.5551844239234924, "learning_rate": 0.00020498806184156125, "loss": 0.5564, "step": 66330 }, { "epoch": 1.476584757834758, "grad_norm": 0.9600586295127869, "learning_rate": 0.00020494145397410965, "loss": 0.5996, "step": 66340 }, { "epoch": 1.4768073361823362, "grad_norm": 0.4699052572250366, "learning_rate": 0.00020489484583813535, "loss": 0.5395, "step": 66350 }, { "epoch": 1.4770299145299146, "grad_norm": 0.6838102340698242, "learning_rate": 0.00020484823743617114, "loss": 0.6533, "step": 66360 }, { "epoch": 1.4772524928774928, "grad_norm": 0.639786958694458, "learning_rate": 0.00020480162877074975, "loss": 0.6041, "step": 66370 }, { "epoch": 1.4774750712250713, "grad_norm": 0.7160543203353882, "learning_rate": 0.00020475501984440388, "loss": 0.5567, "step": 66380 }, { "epoch": 1.4776976495726495, "grad_norm": 0.8186416029930115, "learning_rate": 0.0002047084106596664, "loss": 0.6609, "step": 66390 }, { "epoch": 1.477920227920228, "grad_norm": 0.8617456555366516, "learning_rate": 0.00020466180121906998, "loss": 0.717, "step": 66400 }, { "epoch": 1.4781428062678064, "grad_norm": 0.4262973964214325, "learning_rate": 0.00020461519152514753, "loss": 0.6177, "step": 66410 }, { "epoch": 1.4783653846153846, "grad_norm": 0.5646436810493469, "learning_rate": 0.00020456858158043168, "loss": 0.5103, "step": 66420 }, { "epoch": 1.478587962962963, "grad_norm": 0.5096763372421265, "learning_rate": 0.0002045219713874554, "loss": 0.485, "step": 66430 }, { "epoch": 1.4788105413105412, "grad_norm": 0.6648346185684204, "learning_rate": 0.00020447536094875157, "loss": 0.5736, "step": 66440 }, { "epoch": 1.4790331196581197, "grad_norm": 0.6746906042098999, "learning_rate": 0.00020442875026685297, "loss": 0.6185, "step": 66450 }, { "epoch": 1.479255698005698, "grad_norm": 0.6889836192131042, "learning_rate": 0.00020438213934429237, "loss": 0.4814, "step": 66460 }, { "epoch": 1.4794782763532763, "grad_norm": 0.579818844795227, "learning_rate": 0.00020433552818360275, "loss": 0.6022, "step": 66470 }, { "epoch": 1.4797008547008548, "grad_norm": 0.6798656582832336, "learning_rate": 0.00020428891678731702, "loss": 0.6192, "step": 66480 }, { "epoch": 1.479923433048433, "grad_norm": 0.49344855546951294, "learning_rate": 0.000204242305157968, "loss": 0.495, "step": 66490 }, { "epoch": 1.4801460113960114, "grad_norm": 0.393621027469635, "learning_rate": 0.00020419569329808862, "loss": 0.545, "step": 66500 }, { "epoch": 1.4803685897435899, "grad_norm": 0.4905228316783905, "learning_rate": 0.0002041490812102119, "loss": 0.577, "step": 66510 }, { "epoch": 1.480591168091168, "grad_norm": 0.5233932137489319, "learning_rate": 0.00020410246889687072, "loss": 0.4568, "step": 66520 }, { "epoch": 1.4808137464387463, "grad_norm": 0.47157812118530273, "learning_rate": 0.00020405585636059796, "loss": 0.6753, "step": 66530 }, { "epoch": 1.4810363247863247, "grad_norm": 0.35142749547958374, "learning_rate": 0.00020400924360392667, "loss": 0.4946, "step": 66540 }, { "epoch": 1.4812589031339032, "grad_norm": 0.8298178911209106, "learning_rate": 0.0002039626306293898, "loss": 0.5958, "step": 66550 }, { "epoch": 1.4814814814814814, "grad_norm": 0.7113330960273743, "learning_rate": 0.00020391601743952032, "loss": 0.5403, "step": 66560 }, { "epoch": 1.4817040598290598, "grad_norm": 0.4129575788974762, "learning_rate": 0.00020386940403685125, "loss": 0.5357, "step": 66570 }, { "epoch": 1.4819266381766383, "grad_norm": 0.5855956673622131, "learning_rate": 0.00020382279042391565, "loss": 0.4916, "step": 66580 }, { "epoch": 1.4821492165242165, "grad_norm": 0.5066674947738647, "learning_rate": 0.00020377617660324648, "loss": 0.5156, "step": 66590 }, { "epoch": 1.482371794871795, "grad_norm": 0.7026668190956116, "learning_rate": 0.0002037295625773767, "loss": 0.5411, "step": 66600 }, { "epoch": 1.4825943732193732, "grad_norm": 0.8947693109512329, "learning_rate": 0.0002036829483488395, "loss": 0.4803, "step": 66610 }, { "epoch": 1.4828169515669516, "grad_norm": 0.5915529131889343, "learning_rate": 0.00020363633392016784, "loss": 0.5067, "step": 66620 }, { "epoch": 1.4830395299145298, "grad_norm": 0.6306509375572205, "learning_rate": 0.00020358971929389482, "loss": 0.4563, "step": 66630 }, { "epoch": 1.4832621082621082, "grad_norm": 0.8169883489608765, "learning_rate": 0.00020354310447255353, "loss": 0.6565, "step": 66640 }, { "epoch": 1.4834846866096867, "grad_norm": 1.0991393327713013, "learning_rate": 0.00020349648945867715, "loss": 0.5178, "step": 66650 }, { "epoch": 1.483707264957265, "grad_norm": 0.5982011556625366, "learning_rate": 0.00020344987425479852, "loss": 0.5572, "step": 66660 }, { "epoch": 1.4839298433048433, "grad_norm": 0.46377143263816833, "learning_rate": 0.00020340325886345092, "loss": 0.5503, "step": 66670 }, { "epoch": 1.4841524216524218, "grad_norm": 0.6298932433128357, "learning_rate": 0.00020335664328716745, "loss": 0.4792, "step": 66680 }, { "epoch": 1.484375, "grad_norm": 0.6804326176643372, "learning_rate": 0.0002033100275284813, "loss": 0.4894, "step": 66690 }, { "epoch": 1.4845975783475782, "grad_norm": 0.5818316340446472, "learning_rate": 0.00020326341158992547, "loss": 0.6249, "step": 66700 }, { "epoch": 1.4848201566951567, "grad_norm": 0.5855556726455688, "learning_rate": 0.0002032167954740332, "loss": 0.5472, "step": 66710 }, { "epoch": 1.485042735042735, "grad_norm": 0.497353196144104, "learning_rate": 0.0002031701791833377, "loss": 0.5875, "step": 66720 }, { "epoch": 1.4852653133903133, "grad_norm": 0.48648321628570557, "learning_rate": 0.000203123562720372, "loss": 0.4869, "step": 66730 }, { "epoch": 1.4854878917378918, "grad_norm": 0.6546781063079834, "learning_rate": 0.0002030769460876693, "loss": 0.5434, "step": 66740 }, { "epoch": 1.4857104700854702, "grad_norm": 1.0312706232070923, "learning_rate": 0.0002030303292877629, "loss": 0.6661, "step": 66750 }, { "epoch": 1.4859330484330484, "grad_norm": 0.4677811861038208, "learning_rate": 0.00020298371232318596, "loss": 0.4725, "step": 66760 }, { "epoch": 1.4861556267806268, "grad_norm": 0.6403644680976868, "learning_rate": 0.00020293709519647157, "loss": 0.6621, "step": 66770 }, { "epoch": 1.486378205128205, "grad_norm": 0.4164356291294098, "learning_rate": 0.00020289047791015308, "loss": 0.5814, "step": 66780 }, { "epoch": 1.4866007834757835, "grad_norm": 0.5813632607460022, "learning_rate": 0.00020284386046676365, "loss": 0.6091, "step": 66790 }, { "epoch": 1.4868233618233617, "grad_norm": 0.5498270988464355, "learning_rate": 0.0002027972428688365, "loss": 0.6177, "step": 66800 }, { "epoch": 1.4870459401709402, "grad_norm": 0.47491297125816345, "learning_rate": 0.00020275062511890485, "loss": 0.5591, "step": 66810 }, { "epoch": 1.4872685185185186, "grad_norm": 0.5644556879997253, "learning_rate": 0.00020270400721950202, "loss": 0.5741, "step": 66820 }, { "epoch": 1.4874910968660968, "grad_norm": 0.6705946326255798, "learning_rate": 0.00020265738917316117, "loss": 0.5214, "step": 66830 }, { "epoch": 1.4877136752136753, "grad_norm": 0.771763801574707, "learning_rate": 0.00020261077098241565, "loss": 0.5585, "step": 66840 }, { "epoch": 1.4879362535612537, "grad_norm": 0.7709307074546814, "learning_rate": 0.00020256415264979872, "loss": 0.5727, "step": 66850 }, { "epoch": 1.488158831908832, "grad_norm": 0.5889680981636047, "learning_rate": 0.00020251753417784368, "loss": 0.5227, "step": 66860 }, { "epoch": 1.4883814102564101, "grad_norm": 0.5113076567649841, "learning_rate": 0.0002024709155690837, "loss": 0.5422, "step": 66870 }, { "epoch": 1.4886039886039886, "grad_norm": 0.6153723001480103, "learning_rate": 0.00020242429682605214, "loss": 0.7747, "step": 66880 }, { "epoch": 1.488826566951567, "grad_norm": 0.3839668333530426, "learning_rate": 0.0002023776779512823, "loss": 0.6726, "step": 66890 }, { "epoch": 1.4890491452991452, "grad_norm": 0.6124092936515808, "learning_rate": 0.00020233105894730752, "loss": 0.5958, "step": 66900 }, { "epoch": 1.4892717236467237, "grad_norm": 0.7292911410331726, "learning_rate": 0.0002022844398166611, "loss": 0.4976, "step": 66910 }, { "epoch": 1.489494301994302, "grad_norm": 0.6575756669044495, "learning_rate": 0.00020223782056187634, "loss": 0.7109, "step": 66920 }, { "epoch": 1.4897168803418803, "grad_norm": 0.680202066898346, "learning_rate": 0.0002021912011854866, "loss": 0.6018, "step": 66930 }, { "epoch": 1.4899394586894588, "grad_norm": 0.6214728951454163, "learning_rate": 0.00020214458169002514, "loss": 0.6302, "step": 66940 }, { "epoch": 1.490162037037037, "grad_norm": 0.6200801730155945, "learning_rate": 0.00020209796207802536, "loss": 0.5556, "step": 66950 }, { "epoch": 1.4903846153846154, "grad_norm": 0.6757798790931702, "learning_rate": 0.00020205134235202064, "loss": 0.5413, "step": 66960 }, { "epoch": 1.4906071937321936, "grad_norm": 0.5411709547042847, "learning_rate": 0.00020200472251454427, "loss": 0.5122, "step": 66970 }, { "epoch": 1.490829772079772, "grad_norm": 0.6982552409172058, "learning_rate": 0.00020195810256812968, "loss": 0.5668, "step": 66980 }, { "epoch": 1.4910523504273505, "grad_norm": 0.6113097667694092, "learning_rate": 0.00020191148251531016, "loss": 0.5333, "step": 66990 }, { "epoch": 1.4912749287749287, "grad_norm": 0.5838876366615295, "learning_rate": 0.00020186486235861914, "loss": 0.4169, "step": 67000 }, { "epoch": 1.4914975071225072, "grad_norm": 0.48443251848220825, "learning_rate": 0.00020181824210058994, "loss": 0.5466, "step": 67010 }, { "epoch": 1.4917200854700854, "grad_norm": 0.5497033596038818, "learning_rate": 0.00020177162174375596, "loss": 0.5244, "step": 67020 }, { "epoch": 1.4919426638176638, "grad_norm": 0.6074718832969666, "learning_rate": 0.00020172500129065065, "loss": 0.5937, "step": 67030 }, { "epoch": 1.492165242165242, "grad_norm": 0.36284077167510986, "learning_rate": 0.00020167838074380736, "loss": 0.5153, "step": 67040 }, { "epoch": 1.4923878205128205, "grad_norm": 0.6307108998298645, "learning_rate": 0.00020163176010575947, "loss": 0.6369, "step": 67050 }, { "epoch": 1.492610398860399, "grad_norm": 0.8147817254066467, "learning_rate": 0.00020158513937904035, "loss": 0.5483, "step": 67060 }, { "epoch": 1.4928329772079771, "grad_norm": 0.4598945677280426, "learning_rate": 0.00020153851856618356, "loss": 0.567, "step": 67070 }, { "epoch": 1.4930555555555556, "grad_norm": 0.5549848079681396, "learning_rate": 0.00020149189766972234, "loss": 0.6136, "step": 67080 }, { "epoch": 1.493278133903134, "grad_norm": 0.8051429986953735, "learning_rate": 0.00020144527669219015, "loss": 0.6486, "step": 67090 }, { "epoch": 1.4935007122507122, "grad_norm": 0.6233551502227783, "learning_rate": 0.00020139865563612052, "loss": 0.4419, "step": 67100 }, { "epoch": 1.4937232905982907, "grad_norm": 0.513957679271698, "learning_rate": 0.0002013520345040468, "loss": 0.6169, "step": 67110 }, { "epoch": 1.493945868945869, "grad_norm": 0.5634671449661255, "learning_rate": 0.0002013054132985024, "loss": 0.5219, "step": 67120 }, { "epoch": 1.4941684472934473, "grad_norm": 0.7170486450195312, "learning_rate": 0.00020125879202202073, "loss": 0.5853, "step": 67130 }, { "epoch": 1.4943910256410255, "grad_norm": 0.7605602145195007, "learning_rate": 0.0002012121706771353, "loss": 0.5669, "step": 67140 }, { "epoch": 1.494613603988604, "grad_norm": 0.39489638805389404, "learning_rate": 0.0002011655492663795, "loss": 0.5545, "step": 67150 }, { "epoch": 1.4948361823361824, "grad_norm": 0.759689211845398, "learning_rate": 0.00020111892779228679, "loss": 0.6603, "step": 67160 }, { "epoch": 1.4950587606837606, "grad_norm": 0.5082457661628723, "learning_rate": 0.0002010723062573907, "loss": 0.5883, "step": 67170 }, { "epoch": 1.495281339031339, "grad_norm": 0.8190949559211731, "learning_rate": 0.0002010256846642246, "loss": 0.6132, "step": 67180 }, { "epoch": 1.4955039173789173, "grad_norm": 0.600661039352417, "learning_rate": 0.00020097906301532188, "loss": 0.3774, "step": 67190 }, { "epoch": 1.4957264957264957, "grad_norm": 0.44674062728881836, "learning_rate": 0.00020093244131321608, "loss": 0.4768, "step": 67200 }, { "epoch": 1.495949074074074, "grad_norm": 0.5484546422958374, "learning_rate": 0.00020088581956044074, "loss": 0.5206, "step": 67210 }, { "epoch": 1.4961716524216524, "grad_norm": 0.6583910584449768, "learning_rate": 0.0002008391977595292, "loss": 0.5034, "step": 67220 }, { "epoch": 1.4963942307692308, "grad_norm": 0.6517849564552307, "learning_rate": 0.00020079257591301493, "loss": 0.6159, "step": 67230 }, { "epoch": 1.496616809116809, "grad_norm": 0.5481684803962708, "learning_rate": 0.00020074595402343147, "loss": 0.65, "step": 67240 }, { "epoch": 1.4968393874643875, "grad_norm": 0.6224780082702637, "learning_rate": 0.00020069933209331228, "loss": 0.4955, "step": 67250 }, { "epoch": 1.497061965811966, "grad_norm": 0.956333339214325, "learning_rate": 0.00020065271012519075, "loss": 0.524, "step": 67260 }, { "epoch": 1.4972845441595442, "grad_norm": 0.7989146113395691, "learning_rate": 0.00020060608812160044, "loss": 0.5822, "step": 67270 }, { "epoch": 1.4975071225071226, "grad_norm": 0.8631839156150818, "learning_rate": 0.0002005594660850749, "loss": 0.5416, "step": 67280 }, { "epoch": 1.4977297008547008, "grad_norm": 0.46611177921295166, "learning_rate": 0.00020051284401814736, "loss": 0.5857, "step": 67290 }, { "epoch": 1.4979522792022792, "grad_norm": 0.5473828315734863, "learning_rate": 0.00020046622192335152, "loss": 0.4891, "step": 67300 }, { "epoch": 1.4981748575498575, "grad_norm": 0.6410860419273376, "learning_rate": 0.00020041959980322084, "loss": 0.6214, "step": 67310 }, { "epoch": 1.498397435897436, "grad_norm": 0.5481650829315186, "learning_rate": 0.00020037297766028878, "loss": 0.6417, "step": 67320 }, { "epoch": 1.4986200142450143, "grad_norm": 0.9253783822059631, "learning_rate": 0.0002003263554970887, "loss": 0.4641, "step": 67330 }, { "epoch": 1.4988425925925926, "grad_norm": 0.5475835800170898, "learning_rate": 0.00020027973331615426, "loss": 0.5779, "step": 67340 }, { "epoch": 1.499065170940171, "grad_norm": 0.6916240453720093, "learning_rate": 0.0002002331111200189, "loss": 0.4992, "step": 67350 }, { "epoch": 1.4992877492877492, "grad_norm": 0.5551814436912537, "learning_rate": 0.00020018648891121602, "loss": 0.5891, "step": 67360 }, { "epoch": 1.4995103276353277, "grad_norm": 0.5963566303253174, "learning_rate": 0.00020013986669227925, "loss": 0.4562, "step": 67370 }, { "epoch": 1.4997329059829059, "grad_norm": 0.7776322960853577, "learning_rate": 0.000200093244465742, "loss": 0.7268, "step": 67380 }, { "epoch": 1.4999554843304843, "grad_norm": 0.4523514211177826, "learning_rate": 0.00020004662223413778, "loss": 0.4972, "step": 67390 }, { "epoch": 1.5001780626780628, "grad_norm": 0.4121207296848297, "learning_rate": 0.0002, "loss": 0.5202, "step": 67400 }, { "epoch": 1.5001780626780628, "eval_loss": 0.5678955912590027, "eval_runtime": 337.2407, "eval_samples_per_second": 7.013, "eval_steps_per_second": 7.013, "step": 67400 }, { "epoch": 1.500400641025641, "grad_norm": 0.5370911955833435, "learning_rate": 0.00019995337776586227, "loss": 0.4783, "step": 67410 }, { "epoch": 1.5006232193732194, "grad_norm": 0.3378683924674988, "learning_rate": 0.00019990675553425806, "loss": 0.5036, "step": 67420 }, { "epoch": 1.5008457977207978, "grad_norm": 0.48726844787597656, "learning_rate": 0.00019986013330772077, "loss": 0.4933, "step": 67430 }, { "epoch": 1.501068376068376, "grad_norm": 0.5741036534309387, "learning_rate": 0.00019981351108878397, "loss": 0.6323, "step": 67440 }, { "epoch": 1.5012909544159543, "grad_norm": 0.4806444048881531, "learning_rate": 0.00019976688887998116, "loss": 0.5906, "step": 67450 }, { "epoch": 1.5015135327635327, "grad_norm": 0.5801340937614441, "learning_rate": 0.00019972026668384576, "loss": 0.5764, "step": 67460 }, { "epoch": 1.5017361111111112, "grad_norm": 0.4316343367099762, "learning_rate": 0.00019967364450291136, "loss": 0.5098, "step": 67470 }, { "epoch": 1.5019586894586894, "grad_norm": 0.37861862778663635, "learning_rate": 0.00019962702233971134, "loss": 0.6163, "step": 67480 }, { "epoch": 1.5021812678062678, "grad_norm": 0.6555972695350647, "learning_rate": 0.0001995804001967792, "loss": 0.516, "step": 67490 }, { "epoch": 1.5024038461538463, "grad_norm": 0.44705936312675476, "learning_rate": 0.00019953377807664852, "loss": 0.5441, "step": 67500 }, { "epoch": 1.5026264245014245, "grad_norm": 0.6999156475067139, "learning_rate": 0.0001994871559818527, "loss": 0.6709, "step": 67510 }, { "epoch": 1.5028490028490027, "grad_norm": 0.5605092644691467, "learning_rate": 0.00019944053391492519, "loss": 0.5358, "step": 67520 }, { "epoch": 1.5030715811965814, "grad_norm": 0.7177448272705078, "learning_rate": 0.0001993939118783996, "loss": 0.517, "step": 67530 }, { "epoch": 1.5032941595441596, "grad_norm": 0.5700393319129944, "learning_rate": 0.00019934728987480927, "loss": 0.6628, "step": 67540 }, { "epoch": 1.5035167378917378, "grad_norm": 0.5855696797370911, "learning_rate": 0.00019930066790668777, "loss": 0.5235, "step": 67550 }, { "epoch": 1.5037393162393162, "grad_norm": 0.53740394115448, "learning_rate": 0.00019925404597656855, "loss": 0.5231, "step": 67560 }, { "epoch": 1.5039618945868947, "grad_norm": 0.5468155145645142, "learning_rate": 0.00019920742408698508, "loss": 0.5379, "step": 67570 }, { "epoch": 1.5041844729344729, "grad_norm": 0.6854047179222107, "learning_rate": 0.00019916080224047082, "loss": 0.5855, "step": 67580 }, { "epoch": 1.5044070512820513, "grad_norm": 0.6865229606628418, "learning_rate": 0.00019911418043955928, "loss": 0.5998, "step": 67590 }, { "epoch": 1.5046296296296298, "grad_norm": 0.6859182715415955, "learning_rate": 0.00019906755868678394, "loss": 0.6969, "step": 67600 }, { "epoch": 1.504852207977208, "grad_norm": 0.6807510852813721, "learning_rate": 0.00019902093698467822, "loss": 0.5432, "step": 67610 }, { "epoch": 1.5050747863247862, "grad_norm": 0.6587150692939758, "learning_rate": 0.0001989743153357755, "loss": 0.4533, "step": 67620 }, { "epoch": 1.5052973646723646, "grad_norm": 0.4862242639064789, "learning_rate": 0.00019892769374260937, "loss": 0.626, "step": 67630 }, { "epoch": 1.505519943019943, "grad_norm": 0.4418836236000061, "learning_rate": 0.00019888107220771323, "loss": 0.447, "step": 67640 }, { "epoch": 1.5057425213675213, "grad_norm": 0.675957977771759, "learning_rate": 0.00019883445073362054, "loss": 0.5451, "step": 67650 }, { "epoch": 1.5059650997150997, "grad_norm": 0.4350026249885559, "learning_rate": 0.00019878782932286474, "loss": 0.6169, "step": 67660 }, { "epoch": 1.5061876780626782, "grad_norm": 0.5768257975578308, "learning_rate": 0.00019874120797797935, "loss": 0.5835, "step": 67670 }, { "epoch": 1.5064102564102564, "grad_norm": 0.8592005968093872, "learning_rate": 0.00019869458670149768, "loss": 0.5191, "step": 67680 }, { "epoch": 1.5066328347578346, "grad_norm": 0.3954167068004608, "learning_rate": 0.00019864796549595324, "loss": 0.5395, "step": 67690 }, { "epoch": 1.5068554131054133, "grad_norm": 0.5921634435653687, "learning_rate": 0.00019860134436387953, "loss": 0.4337, "step": 67700 }, { "epoch": 1.5070779914529915, "grad_norm": 0.7694410681724548, "learning_rate": 0.00019855472330780982, "loss": 0.5281, "step": 67710 }, { "epoch": 1.5073005698005697, "grad_norm": 0.8094078898429871, "learning_rate": 0.00019850810233027768, "loss": 0.6113, "step": 67720 }, { "epoch": 1.5075231481481481, "grad_norm": 1.537431240081787, "learning_rate": 0.00019846148143381654, "loss": 0.5638, "step": 67730 }, { "epoch": 1.5077457264957266, "grad_norm": 0.5468326807022095, "learning_rate": 0.0001984148606209597, "loss": 0.6547, "step": 67740 }, { "epoch": 1.5079683048433048, "grad_norm": 0.5707191824913025, "learning_rate": 0.00019836823989424063, "loss": 0.6599, "step": 67750 }, { "epoch": 1.5081908831908832, "grad_norm": 0.7331326603889465, "learning_rate": 0.0001983216192561927, "loss": 0.4641, "step": 67760 }, { "epoch": 1.5084134615384617, "grad_norm": 0.5310831665992737, "learning_rate": 0.00019827499870934937, "loss": 0.5002, "step": 67770 }, { "epoch": 1.50863603988604, "grad_norm": 0.500921368598938, "learning_rate": 0.00019822837825624406, "loss": 0.5765, "step": 67780 }, { "epoch": 1.508858618233618, "grad_norm": 0.5608747005462646, "learning_rate": 0.0001981817578994101, "loss": 0.455, "step": 67790 }, { "epoch": 1.5090811965811965, "grad_norm": 0.5071516036987305, "learning_rate": 0.0001981351376413809, "loss": 0.4695, "step": 67800 }, { "epoch": 1.509303774928775, "grad_norm": 0.7218843698501587, "learning_rate": 0.00019808851748468988, "loss": 0.6235, "step": 67810 }, { "epoch": 1.5095263532763532, "grad_norm": 0.6392359733581543, "learning_rate": 0.00019804189743187036, "loss": 0.5179, "step": 67820 }, { "epoch": 1.5097489316239316, "grad_norm": 0.6713154911994934, "learning_rate": 0.00019799527748545572, "loss": 0.5209, "step": 67830 }, { "epoch": 1.50997150997151, "grad_norm": 0.7304174304008484, "learning_rate": 0.00019794865764797938, "loss": 0.464, "step": 67840 }, { "epoch": 1.5101940883190883, "grad_norm": 0.5477138161659241, "learning_rate": 0.00019790203792197463, "loss": 0.5502, "step": 67850 }, { "epoch": 1.5104166666666665, "grad_norm": 0.43742215633392334, "learning_rate": 0.00019785541830997494, "loss": 0.6281, "step": 67860 }, { "epoch": 1.5106392450142452, "grad_norm": 0.5618810057640076, "learning_rate": 0.0001978087988145135, "loss": 0.5577, "step": 67870 }, { "epoch": 1.5108618233618234, "grad_norm": 0.47301146388053894, "learning_rate": 0.00019776217943812376, "loss": 0.5327, "step": 67880 }, { "epoch": 1.5110844017094016, "grad_norm": 0.4628704786300659, "learning_rate": 0.00019771556018333898, "loss": 0.6488, "step": 67890 }, { "epoch": 1.51130698005698, "grad_norm": 0.49233415722846985, "learning_rate": 0.00019766894105269252, "loss": 0.4458, "step": 67900 }, { "epoch": 1.5115295584045585, "grad_norm": 0.4551265239715576, "learning_rate": 0.00019762232204871772, "loss": 0.5321, "step": 67910 }, { "epoch": 1.5117521367521367, "grad_norm": 1.0391403436660767, "learning_rate": 0.00019757570317394793, "loss": 0.5385, "step": 67920 }, { "epoch": 1.5119747150997151, "grad_norm": 0.7271586656570435, "learning_rate": 0.00019752908443091636, "loss": 0.5177, "step": 67930 }, { "epoch": 1.5121972934472936, "grad_norm": 0.6301023960113525, "learning_rate": 0.00019748246582215636, "loss": 0.6633, "step": 67940 }, { "epoch": 1.5124198717948718, "grad_norm": 0.5653015375137329, "learning_rate": 0.0001974358473502013, "loss": 0.4245, "step": 67950 }, { "epoch": 1.51264245014245, "grad_norm": 0.9529626965522766, "learning_rate": 0.00019738922901758435, "loss": 0.4955, "step": 67960 }, { "epoch": 1.5128650284900285, "grad_norm": 0.5109310746192932, "learning_rate": 0.0001973426108268388, "loss": 0.5948, "step": 67970 }, { "epoch": 1.513087606837607, "grad_norm": 0.5099339485168457, "learning_rate": 0.00019729599278049803, "loss": 0.3919, "step": 67980 }, { "epoch": 1.5133101851851851, "grad_norm": 0.6454190015792847, "learning_rate": 0.00019724937488109517, "loss": 0.5759, "step": 67990 }, { "epoch": 1.5135327635327636, "grad_norm": 0.6826174259185791, "learning_rate": 0.00019720275713116362, "loss": 0.6977, "step": 68000 }, { "epoch": 1.513755341880342, "grad_norm": 0.3470633924007416, "learning_rate": 0.00019715613953323643, "loss": 0.5286, "step": 68010 }, { "epoch": 1.5139779202279202, "grad_norm": 0.4481032192707062, "learning_rate": 0.00019710952208984702, "loss": 0.545, "step": 68020 }, { "epoch": 1.5142004985754984, "grad_norm": 0.6486870050430298, "learning_rate": 0.00019706290480352848, "loss": 0.4998, "step": 68030 }, { "epoch": 1.5144230769230769, "grad_norm": 0.7330490350723267, "learning_rate": 0.0001970162876768141, "loss": 0.6018, "step": 68040 }, { "epoch": 1.5146456552706553, "grad_norm": 0.5175281763076782, "learning_rate": 0.00019696967071223712, "loss": 0.5314, "step": 68050 }, { "epoch": 1.5148682336182335, "grad_norm": 0.6107418537139893, "learning_rate": 0.0001969230539123307, "loss": 0.5814, "step": 68060 }, { "epoch": 1.515090811965812, "grad_norm": 0.5667102336883545, "learning_rate": 0.00019687643727962802, "loss": 0.5239, "step": 68070 }, { "epoch": 1.5153133903133904, "grad_norm": 0.6517727971076965, "learning_rate": 0.00019682982081666234, "loss": 0.5387, "step": 68080 }, { "epoch": 1.5155359686609686, "grad_norm": 0.44955015182495117, "learning_rate": 0.00019678320452596682, "loss": 0.5079, "step": 68090 }, { "epoch": 1.515758547008547, "grad_norm": 0.379574179649353, "learning_rate": 0.00019673658841007455, "loss": 0.5597, "step": 68100 }, { "epoch": 1.5159811253561255, "grad_norm": 0.7952883839607239, "learning_rate": 0.00019668997247151873, "loss": 0.5436, "step": 68110 }, { "epoch": 1.5162037037037037, "grad_norm": 0.5392372012138367, "learning_rate": 0.00019664335671283254, "loss": 0.5144, "step": 68120 }, { "epoch": 1.516426282051282, "grad_norm": 0.3944644033908844, "learning_rate": 0.00019659674113654916, "loss": 0.477, "step": 68130 }, { "epoch": 1.5166488603988604, "grad_norm": 0.44683077931404114, "learning_rate": 0.00019655012574520158, "loss": 0.4392, "step": 68140 }, { "epoch": 1.5168714387464388, "grad_norm": 0.5390403866767883, "learning_rate": 0.00019650351054132298, "loss": 0.4811, "step": 68150 }, { "epoch": 1.517094017094017, "grad_norm": 0.7489467859268188, "learning_rate": 0.00019645689552744651, "loss": 0.6996, "step": 68160 }, { "epoch": 1.5173165954415955, "grad_norm": 0.5423558950424194, "learning_rate": 0.00019641028070610522, "loss": 0.4971, "step": 68170 }, { "epoch": 1.517539173789174, "grad_norm": 0.7267695069313049, "learning_rate": 0.00019636366607983218, "loss": 0.7174, "step": 68180 }, { "epoch": 1.5177617521367521, "grad_norm": 0.6742967963218689, "learning_rate": 0.00019631705165116056, "loss": 0.5926, "step": 68190 }, { "epoch": 1.5179843304843303, "grad_norm": 0.6137351989746094, "learning_rate": 0.0001962704374226233, "loss": 0.5389, "step": 68200 }, { "epoch": 1.5182069088319088, "grad_norm": 0.5949881076812744, "learning_rate": 0.0001962238233967536, "loss": 0.502, "step": 68210 }, { "epoch": 1.5184294871794872, "grad_norm": 0.5518471002578735, "learning_rate": 0.00019617720957608437, "loss": 0.4403, "step": 68220 }, { "epoch": 1.5186520655270654, "grad_norm": 0.6174115538597107, "learning_rate": 0.00019613059596314877, "loss": 0.6888, "step": 68230 }, { "epoch": 1.5188746438746439, "grad_norm": 0.4383307695388794, "learning_rate": 0.00019608398256047967, "loss": 0.4862, "step": 68240 }, { "epoch": 1.5190972222222223, "grad_norm": 0.598969578742981, "learning_rate": 0.0001960373693706102, "loss": 0.5032, "step": 68250 }, { "epoch": 1.5193198005698005, "grad_norm": 0.4754716455936432, "learning_rate": 0.00019599075639607338, "loss": 0.5774, "step": 68260 }, { "epoch": 1.5195423789173788, "grad_norm": 0.6664845943450928, "learning_rate": 0.0001959441436394021, "loss": 0.7143, "step": 68270 }, { "epoch": 1.5197649572649574, "grad_norm": 0.6848062872886658, "learning_rate": 0.00019589753110312936, "loss": 0.4915, "step": 68280 }, { "epoch": 1.5199875356125356, "grad_norm": 0.7403508424758911, "learning_rate": 0.0001958509187897881, "loss": 0.4748, "step": 68290 }, { "epoch": 1.5202101139601139, "grad_norm": 0.5345255732536316, "learning_rate": 0.00019580430670191142, "loss": 0.4903, "step": 68300 }, { "epoch": 1.5204326923076923, "grad_norm": 0.5372764468193054, "learning_rate": 0.00019575769484203205, "loss": 0.6183, "step": 68310 }, { "epoch": 1.5206552706552707, "grad_norm": 0.6304016709327698, "learning_rate": 0.000195711083212683, "loss": 0.5801, "step": 68320 }, { "epoch": 1.520877849002849, "grad_norm": 0.8080713748931885, "learning_rate": 0.0001956644718163973, "loss": 0.5597, "step": 68330 }, { "epoch": 1.5211004273504274, "grad_norm": 0.40685272216796875, "learning_rate": 0.00019561786065570765, "loss": 0.5366, "step": 68340 }, { "epoch": 1.5213230056980058, "grad_norm": 0.5551041960716248, "learning_rate": 0.0001955712497331471, "loss": 0.5362, "step": 68350 }, { "epoch": 1.521545584045584, "grad_norm": 0.659697949886322, "learning_rate": 0.0001955246390512484, "loss": 0.5348, "step": 68360 }, { "epoch": 1.5217681623931623, "grad_norm": 0.5986943244934082, "learning_rate": 0.00019547802861254456, "loss": 0.5273, "step": 68370 }, { "epoch": 1.5219907407407407, "grad_norm": 0.4209531247615814, "learning_rate": 0.0001954314184195683, "loss": 0.5386, "step": 68380 }, { "epoch": 1.5222133190883191, "grad_norm": 0.6699190139770508, "learning_rate": 0.00019538480847485257, "loss": 0.6869, "step": 68390 }, { "epoch": 1.5224358974358974, "grad_norm": 0.7325060963630676, "learning_rate": 0.00019533819878093006, "loss": 0.4593, "step": 68400 }, { "epoch": 1.5226584757834758, "grad_norm": 0.5187638401985168, "learning_rate": 0.0001952915893403337, "loss": 0.4833, "step": 68410 }, { "epoch": 1.5228810541310542, "grad_norm": 0.640946090221405, "learning_rate": 0.00019524498015559616, "loss": 0.5063, "step": 68420 }, { "epoch": 1.5231036324786325, "grad_norm": 0.6378102898597717, "learning_rate": 0.0001951983712292503, "loss": 0.5305, "step": 68430 }, { "epoch": 1.5233262108262107, "grad_norm": 0.8082734942436218, "learning_rate": 0.0001951517625638289, "loss": 0.5905, "step": 68440 }, { "epoch": 1.5235487891737893, "grad_norm": 0.6392168998718262, "learning_rate": 0.0001951051541618647, "loss": 0.4997, "step": 68450 }, { "epoch": 1.5237713675213675, "grad_norm": 0.5380043983459473, "learning_rate": 0.0001950585460258904, "loss": 0.4804, "step": 68460 }, { "epoch": 1.5239939458689458, "grad_norm": 0.7026430368423462, "learning_rate": 0.0001950119381584388, "loss": 0.5851, "step": 68470 }, { "epoch": 1.5242165242165242, "grad_norm": 0.4975331723690033, "learning_rate": 0.0001949653305620425, "loss": 0.5668, "step": 68480 }, { "epoch": 1.5244391025641026, "grad_norm": 0.5272009372711182, "learning_rate": 0.00019491872323923427, "loss": 0.5599, "step": 68490 }, { "epoch": 1.5246616809116809, "grad_norm": 0.47747164964675903, "learning_rate": 0.00019487211619254684, "loss": 0.4989, "step": 68500 }, { "epoch": 1.5248842592592593, "grad_norm": 0.8575394153594971, "learning_rate": 0.00019482550942451275, "loss": 0.4727, "step": 68510 }, { "epoch": 1.5251068376068377, "grad_norm": 0.37729412317276, "learning_rate": 0.00019477890293766482, "loss": 0.5011, "step": 68520 }, { "epoch": 1.525329415954416, "grad_norm": 0.4902661144733429, "learning_rate": 0.00019473229673453547, "loss": 0.4458, "step": 68530 }, { "epoch": 1.5255519943019942, "grad_norm": 0.45972684025764465, "learning_rate": 0.00019468569081765744, "loss": 0.603, "step": 68540 }, { "epoch": 1.5257745726495726, "grad_norm": 0.5463181138038635, "learning_rate": 0.00019463908518956336, "loss": 0.56, "step": 68550 }, { "epoch": 1.525997150997151, "grad_norm": 0.4450381100177765, "learning_rate": 0.00019459247985278576, "loss": 0.67, "step": 68560 }, { "epoch": 1.5262197293447293, "grad_norm": 0.6549626588821411, "learning_rate": 0.0001945458748098572, "loss": 0.6309, "step": 68570 }, { "epoch": 1.5264423076923077, "grad_norm": 0.6377149820327759, "learning_rate": 0.00019449927006331033, "loss": 0.431, "step": 68580 }, { "epoch": 1.5266648860398861, "grad_norm": 0.38826367259025574, "learning_rate": 0.00019445266561567755, "loss": 0.5232, "step": 68590 }, { "epoch": 1.5268874643874644, "grad_norm": 0.5607532262802124, "learning_rate": 0.0001944060614694915, "loss": 0.663, "step": 68600 }, { "epoch": 1.5271100427350426, "grad_norm": 0.3576250374317169, "learning_rate": 0.00019435945762728469, "loss": 0.5527, "step": 68610 }, { "epoch": 1.5273326210826212, "grad_norm": 0.8259841203689575, "learning_rate": 0.00019431285409158953, "loss": 0.4382, "step": 68620 }, { "epoch": 1.5275551994301995, "grad_norm": 0.5837790966033936, "learning_rate": 0.0001942662508649385, "loss": 0.4992, "step": 68630 }, { "epoch": 1.5277777777777777, "grad_norm": 0.6744682192802429, "learning_rate": 0.00019421964794986415, "loss": 0.5334, "step": 68640 }, { "epoch": 1.5280003561253561, "grad_norm": 0.599739134311676, "learning_rate": 0.00019417304534889888, "loss": 0.5101, "step": 68650 }, { "epoch": 1.5282229344729346, "grad_norm": 0.42003190517425537, "learning_rate": 0.0001941264430645751, "loss": 0.4881, "step": 68660 }, { "epoch": 1.5284455128205128, "grad_norm": 0.5991514325141907, "learning_rate": 0.00019407984109942513, "loss": 0.6502, "step": 68670 }, { "epoch": 1.5286680911680912, "grad_norm": 0.7176218628883362, "learning_rate": 0.0001940332394559815, "loss": 0.5538, "step": 68680 }, { "epoch": 1.5288906695156697, "grad_norm": 0.46449580788612366, "learning_rate": 0.00019398663813677652, "loss": 0.55, "step": 68690 }, { "epoch": 1.5291132478632479, "grad_norm": 0.5694019794464111, "learning_rate": 0.0001939400371443425, "loss": 0.5211, "step": 68700 }, { "epoch": 1.529335826210826, "grad_norm": 0.7176836729049683, "learning_rate": 0.00019389343648121185, "loss": 0.5964, "step": 68710 }, { "epoch": 1.5295584045584045, "grad_norm": 0.5984424948692322, "learning_rate": 0.0001938468361499169, "loss": 0.4925, "step": 68720 }, { "epoch": 1.529780982905983, "grad_norm": 0.6754959225654602, "learning_rate": 0.00019380023615298984, "loss": 0.5136, "step": 68730 }, { "epoch": 1.5300035612535612, "grad_norm": 0.38003575801849365, "learning_rate": 0.00019375363649296306, "loss": 0.5454, "step": 68740 }, { "epoch": 1.5302261396011396, "grad_norm": 0.5140901803970337, "learning_rate": 0.0001937070371723688, "loss": 0.6238, "step": 68750 }, { "epoch": 1.530448717948718, "grad_norm": 0.6815766096115112, "learning_rate": 0.00019366043819373928, "loss": 0.5283, "step": 68760 }, { "epoch": 1.5306712962962963, "grad_norm": 0.8449596166610718, "learning_rate": 0.0001936138395596067, "loss": 0.6162, "step": 68770 }, { "epoch": 1.5308938746438745, "grad_norm": 0.4473537504673004, "learning_rate": 0.0001935672412725034, "loss": 0.5368, "step": 68780 }, { "epoch": 1.531116452991453, "grad_norm": 0.6608182191848755, "learning_rate": 0.0001935206433349614, "loss": 0.6517, "step": 68790 }, { "epoch": 1.5313390313390314, "grad_norm": 1.1109671592712402, "learning_rate": 0.000193474045749513, "loss": 0.5815, "step": 68800 }, { "epoch": 1.5315616096866096, "grad_norm": 0.6018622517585754, "learning_rate": 0.00019342744851869024, "loss": 0.4961, "step": 68810 }, { "epoch": 1.531784188034188, "grad_norm": 0.4928274154663086, "learning_rate": 0.0001933808516450253, "loss": 0.515, "step": 68820 }, { "epoch": 1.5320067663817665, "grad_norm": 0.6179516911506653, "learning_rate": 0.00019333425513105038, "loss": 0.6854, "step": 68830 }, { "epoch": 1.5322293447293447, "grad_norm": 0.7085897922515869, "learning_rate": 0.00019328765897929742, "loss": 0.6374, "step": 68840 }, { "epoch": 1.5324519230769231, "grad_norm": 0.874418318271637, "learning_rate": 0.00019324106319229856, "loss": 0.569, "step": 68850 }, { "epoch": 1.5326745014245016, "grad_norm": 0.7772431969642639, "learning_rate": 0.00019319446777258593, "loss": 0.4234, "step": 68860 }, { "epoch": 1.5328970797720798, "grad_norm": 0.5707485675811768, "learning_rate": 0.0001931478727226914, "loss": 0.5065, "step": 68870 }, { "epoch": 1.533119658119658, "grad_norm": 0.4485999643802643, "learning_rate": 0.0001931012780451471, "loss": 0.5181, "step": 68880 }, { "epoch": 1.5333422364672364, "grad_norm": 0.5521811842918396, "learning_rate": 0.00019305468374248506, "loss": 0.6346, "step": 68890 }, { "epoch": 1.5335648148148149, "grad_norm": 0.7295104265213013, "learning_rate": 0.00019300808981723714, "loss": 0.6556, "step": 68900 }, { "epoch": 1.533787393162393, "grad_norm": 0.5098261833190918, "learning_rate": 0.00019296149627193542, "loss": 0.6277, "step": 68910 }, { "epoch": 1.5340099715099715, "grad_norm": 0.4031944274902344, "learning_rate": 0.0001929149031091117, "loss": 0.5626, "step": 68920 }, { "epoch": 1.53423254985755, "grad_norm": 0.7615691423416138, "learning_rate": 0.00019286831033129791, "loss": 0.5249, "step": 68930 }, { "epoch": 1.5344551282051282, "grad_norm": 0.8737319707870483, "learning_rate": 0.00019282171794102602, "loss": 0.5877, "step": 68940 }, { "epoch": 1.5346777065527064, "grad_norm": 0.5554376244544983, "learning_rate": 0.00019277512594082782, "loss": 0.3718, "step": 68950 }, { "epoch": 1.5349002849002849, "grad_norm": 0.3122378885746002, "learning_rate": 0.00019272853433323519, "loss": 0.4532, "step": 68960 }, { "epoch": 1.5351228632478633, "grad_norm": 0.4163728952407837, "learning_rate": 0.00019268194312077998, "loss": 0.5449, "step": 68970 }, { "epoch": 1.5353454415954415, "grad_norm": 0.7278553247451782, "learning_rate": 0.00019263535230599398, "loss": 0.5452, "step": 68980 }, { "epoch": 1.53556801994302, "grad_norm": 0.5134122371673584, "learning_rate": 0.0001925887618914089, "loss": 0.6467, "step": 68990 }, { "epoch": 1.5357905982905984, "grad_norm": 0.5455583333969116, "learning_rate": 0.00019254217187955665, "loss": 0.5355, "step": 69000 }, { "epoch": 1.5360131766381766, "grad_norm": 0.47929325699806213, "learning_rate": 0.00019249558227296885, "loss": 0.5137, "step": 69010 }, { "epoch": 1.5362357549857548, "grad_norm": 0.7038000822067261, "learning_rate": 0.00019244899307417724, "loss": 0.5802, "step": 69020 }, { "epoch": 1.5364583333333335, "grad_norm": 0.5524876117706299, "learning_rate": 0.00019240240428571354, "loss": 0.6186, "step": 69030 }, { "epoch": 1.5366809116809117, "grad_norm": 0.6715249419212341, "learning_rate": 0.0001923558159101095, "loss": 0.4595, "step": 69040 }, { "epoch": 1.53690349002849, "grad_norm": 0.6825011372566223, "learning_rate": 0.00019230922794989655, "loss": 0.528, "step": 69050 }, { "epoch": 1.5371260683760684, "grad_norm": 0.8443838953971863, "learning_rate": 0.00019226264040760649, "loss": 0.5717, "step": 69060 }, { "epoch": 1.5373486467236468, "grad_norm": 0.4754297137260437, "learning_rate": 0.0001922160532857709, "loss": 0.6914, "step": 69070 }, { "epoch": 1.537571225071225, "grad_norm": 0.7522182464599609, "learning_rate": 0.00019216946658692132, "loss": 0.6039, "step": 69080 }, { "epoch": 1.5377938034188035, "grad_norm": 0.6504005789756775, "learning_rate": 0.00019212288031358933, "loss": 0.5819, "step": 69090 }, { "epoch": 1.538016381766382, "grad_norm": 0.479922890663147, "learning_rate": 0.0001920762944683065, "loss": 0.6508, "step": 69100 }, { "epoch": 1.53823896011396, "grad_norm": 0.6442951560020447, "learning_rate": 0.00019202970905360432, "loss": 0.5327, "step": 69110 }, { "epoch": 1.5384615384615383, "grad_norm": 0.4072447121143341, "learning_rate": 0.00019198312407201425, "loss": 0.4809, "step": 69120 }, { "epoch": 1.5386841168091168, "grad_norm": 0.5883573293685913, "learning_rate": 0.00019193653952606776, "loss": 0.5166, "step": 69130 }, { "epoch": 1.5389066951566952, "grad_norm": 0.4536867141723633, "learning_rate": 0.00019188995541829636, "loss": 0.4637, "step": 69140 }, { "epoch": 1.5391292735042734, "grad_norm": 0.5501317381858826, "learning_rate": 0.00019184337175123141, "loss": 0.5643, "step": 69150 }, { "epoch": 1.5393518518518519, "grad_norm": 0.5903235673904419, "learning_rate": 0.0001917967885274043, "loss": 0.5332, "step": 69160 }, { "epoch": 1.5395744301994303, "grad_norm": 0.6028198599815369, "learning_rate": 0.00019175020574934646, "loss": 0.502, "step": 69170 }, { "epoch": 1.5397970085470085, "grad_norm": 0.39726942777633667, "learning_rate": 0.00019170362341958922, "loss": 0.5645, "step": 69180 }, { "epoch": 1.5400195868945867, "grad_norm": 0.5767681002616882, "learning_rate": 0.0001916570415406638, "loss": 0.5459, "step": 69190 }, { "epoch": 1.5402421652421654, "grad_norm": 0.48167499899864197, "learning_rate": 0.00019161046011510158, "loss": 0.4562, "step": 69200 }, { "epoch": 1.5404647435897436, "grad_norm": 0.5863054394721985, "learning_rate": 0.00019156387914543382, "loss": 0.6115, "step": 69210 }, { "epoch": 1.5406873219373218, "grad_norm": 0.5131021738052368, "learning_rate": 0.0001915172986341918, "loss": 0.5833, "step": 69220 }, { "epoch": 1.5409099002849003, "grad_norm": 0.4436676502227783, "learning_rate": 0.00019147071858390671, "loss": 0.4423, "step": 69230 }, { "epoch": 1.5411324786324787, "grad_norm": 0.7246342897415161, "learning_rate": 0.00019142413899710974, "loss": 0.5924, "step": 69240 }, { "epoch": 1.541355056980057, "grad_norm": 0.8733262419700623, "learning_rate": 0.0001913775598763321, "loss": 0.5369, "step": 69250 }, { "epoch": 1.5415776353276354, "grad_norm": 0.5569106340408325, "learning_rate": 0.0001913309812241049, "loss": 0.5274, "step": 69260 }, { "epoch": 1.5418002136752138, "grad_norm": 0.6295933127403259, "learning_rate": 0.00019128440304295926, "loss": 0.5813, "step": 69270 }, { "epoch": 1.542022792022792, "grad_norm": 0.9999169707298279, "learning_rate": 0.00019123782533542633, "loss": 0.5836, "step": 69280 }, { "epoch": 1.5422453703703702, "grad_norm": 0.44312646985054016, "learning_rate": 0.00019119124810403713, "loss": 0.6223, "step": 69290 }, { "epoch": 1.5424679487179487, "grad_norm": 0.5661088228225708, "learning_rate": 0.00019114467135132268, "loss": 0.4703, "step": 69300 }, { "epoch": 1.5426905270655271, "grad_norm": 0.6291427612304688, "learning_rate": 0.00019109809507981414, "loss": 0.6484, "step": 69310 }, { "epoch": 1.5429131054131053, "grad_norm": 0.624117910861969, "learning_rate": 0.00019105151929204236, "loss": 0.4465, "step": 69320 }, { "epoch": 1.5431356837606838, "grad_norm": 0.5193196535110474, "learning_rate": 0.00019100494399053832, "loss": 0.6065, "step": 69330 }, { "epoch": 1.5433582621082622, "grad_norm": 0.5729727149009705, "learning_rate": 0.000190958369177833, "loss": 0.6657, "step": 69340 }, { "epoch": 1.5435808404558404, "grad_norm": 0.7318823933601379, "learning_rate": 0.00019091179485645728, "loss": 0.5558, "step": 69350 }, { "epoch": 1.5438034188034186, "grad_norm": 0.6170826554298401, "learning_rate": 0.00019086522102894208, "loss": 0.5517, "step": 69360 }, { "epoch": 1.5440259971509973, "grad_norm": 0.5891059637069702, "learning_rate": 0.00019081864769781822, "loss": 0.5764, "step": 69370 }, { "epoch": 1.5442485754985755, "grad_norm": 0.7347457408905029, "learning_rate": 0.00019077207486561658, "loss": 0.522, "step": 69380 }, { "epoch": 1.5444711538461537, "grad_norm": 0.6764406561851501, "learning_rate": 0.00019072550253486798, "loss": 0.6101, "step": 69390 }, { "epoch": 1.5446937321937322, "grad_norm": 0.34939131140708923, "learning_rate": 0.00019067893070810312, "loss": 0.5842, "step": 69400 }, { "epoch": 1.5449163105413106, "grad_norm": 0.5678504705429077, "learning_rate": 0.0001906323593878528, "loss": 0.6085, "step": 69410 }, { "epoch": 1.5451388888888888, "grad_norm": 0.4918629229068756, "learning_rate": 0.00019058578857664778, "loss": 0.6456, "step": 69420 }, { "epoch": 1.5453614672364673, "grad_norm": 0.636618435382843, "learning_rate": 0.00019053921827701865, "loss": 0.5643, "step": 69430 }, { "epoch": 1.5455840455840457, "grad_norm": 0.4326227605342865, "learning_rate": 0.00019049264849149627, "loss": 0.5054, "step": 69440 }, { "epoch": 1.545806623931624, "grad_norm": 0.6106733083724976, "learning_rate": 0.00019044607922261104, "loss": 0.5299, "step": 69450 }, { "epoch": 1.5460292022792022, "grad_norm": 0.5556911826133728, "learning_rate": 0.00019039951047289375, "loss": 0.5381, "step": 69460 }, { "epoch": 1.5462517806267806, "grad_norm": 0.3829481303691864, "learning_rate": 0.00019035294224487487, "loss": 0.4977, "step": 69470 }, { "epoch": 1.546474358974359, "grad_norm": 0.35006266832351685, "learning_rate": 0.000190306374541085, "loss": 0.5947, "step": 69480 }, { "epoch": 1.5466969373219372, "grad_norm": 0.6910932660102844, "learning_rate": 0.0001902598073640547, "loss": 0.5648, "step": 69490 }, { "epoch": 1.5469195156695157, "grad_norm": 0.5231965184211731, "learning_rate": 0.00019021324071631442, "loss": 0.5198, "step": 69500 }, { "epoch": 1.5471420940170941, "grad_norm": 0.5858936309814453, "learning_rate": 0.00019016667460039466, "loss": 0.6829, "step": 69510 }, { "epoch": 1.5473646723646723, "grad_norm": 0.5553900003433228, "learning_rate": 0.00019012010901882584, "loss": 0.5341, "step": 69520 }, { "epoch": 1.5475872507122506, "grad_norm": 0.7034667134284973, "learning_rate": 0.0001900735439741384, "loss": 0.5864, "step": 69530 }, { "epoch": 1.5478098290598292, "grad_norm": 0.6792713403701782, "learning_rate": 0.00019002697946886272, "loss": 0.6046, "step": 69540 }, { "epoch": 1.5480324074074074, "grad_norm": 0.8406374454498291, "learning_rate": 0.0001899804155055291, "loss": 0.5032, "step": 69550 }, { "epoch": 1.5482549857549857, "grad_norm": 0.5789881944656372, "learning_rate": 0.00018993385208666797, "loss": 0.5651, "step": 69560 }, { "epoch": 1.548477564102564, "grad_norm": 0.732040286064148, "learning_rate": 0.0001898872892148096, "loss": 0.5107, "step": 69570 }, { "epoch": 1.5487001424501425, "grad_norm": 0.7222851514816284, "learning_rate": 0.0001898407268924841, "loss": 0.5173, "step": 69580 }, { "epoch": 1.5489227207977208, "grad_norm": 1.1726871728897095, "learning_rate": 0.00018979416512222182, "loss": 0.5663, "step": 69590 }, { "epoch": 1.5491452991452992, "grad_norm": 0.6467028260231018, "learning_rate": 0.000189747603906553, "loss": 0.5539, "step": 69600 }, { "epoch": 1.5493678774928776, "grad_norm": 0.4506545662879944, "learning_rate": 0.00018970104324800776, "loss": 0.6448, "step": 69610 }, { "epoch": 1.5495904558404558, "grad_norm": 0.4832093417644501, "learning_rate": 0.00018965448314911627, "loss": 0.5298, "step": 69620 }, { "epoch": 1.549813034188034, "grad_norm": 0.6158397197723389, "learning_rate": 0.00018960792361240867, "loss": 0.5429, "step": 69630 }, { "epoch": 1.5500356125356125, "grad_norm": 0.4832749664783478, "learning_rate": 0.00018956136464041493, "loss": 0.5784, "step": 69640 }, { "epoch": 1.550258190883191, "grad_norm": 0.7306379079818726, "learning_rate": 0.00018951480623566523, "loss": 0.6367, "step": 69650 }, { "epoch": 1.5504807692307692, "grad_norm": 0.563369631767273, "learning_rate": 0.0001894682484006895, "loss": 0.4721, "step": 69660 }, { "epoch": 1.5507033475783476, "grad_norm": 0.4590865671634674, "learning_rate": 0.00018942169113801783, "loss": 0.5756, "step": 69670 }, { "epoch": 1.550925925925926, "grad_norm": 0.8247950673103333, "learning_rate": 0.00018937513445018008, "loss": 0.5036, "step": 69680 }, { "epoch": 1.5511485042735043, "grad_norm": 0.4580720365047455, "learning_rate": 0.00018932857833970626, "loss": 0.5799, "step": 69690 }, { "epoch": 1.5513710826210825, "grad_norm": 0.6762135028839111, "learning_rate": 0.0001892820228091263, "loss": 0.489, "step": 69700 }, { "epoch": 1.551593660968661, "grad_norm": 0.6307932734489441, "learning_rate": 0.0001892354678609699, "loss": 0.6051, "step": 69710 }, { "epoch": 1.5518162393162394, "grad_norm": 0.6217736601829529, "learning_rate": 0.000189188913497767, "loss": 0.5964, "step": 69720 }, { "epoch": 1.5520388176638176, "grad_norm": 0.47461748123168945, "learning_rate": 0.00018914235972204737, "loss": 0.6002, "step": 69730 }, { "epoch": 1.552261396011396, "grad_norm": 0.5183067917823792, "learning_rate": 0.00018909580653634085, "loss": 0.5517, "step": 69740 }, { "epoch": 1.5524839743589745, "grad_norm": 0.5647053122520447, "learning_rate": 0.00018904925394317709, "loss": 0.4627, "step": 69750 }, { "epoch": 1.5527065527065527, "grad_norm": 0.5931477546691895, "learning_rate": 0.00018900270194508581, "loss": 0.5838, "step": 69760 }, { "epoch": 1.552929131054131, "grad_norm": 0.74869304895401, "learning_rate": 0.00018895615054459678, "loss": 0.572, "step": 69770 }, { "epoch": 1.5531517094017095, "grad_norm": 0.47858738899230957, "learning_rate": 0.0001889095997442395, "loss": 0.5936, "step": 69780 }, { "epoch": 1.5533742877492878, "grad_norm": 0.5192180871963501, "learning_rate": 0.00018886304954654365, "loss": 0.5538, "step": 69790 }, { "epoch": 1.553596866096866, "grad_norm": 0.5217307806015015, "learning_rate": 0.00018881649995403888, "loss": 0.5716, "step": 69800 }, { "epoch": 1.5538194444444444, "grad_norm": 0.4526553452014923, "learning_rate": 0.0001887699509692546, "loss": 0.5187, "step": 69810 }, { "epoch": 1.5540420227920229, "grad_norm": 0.4228460192680359, "learning_rate": 0.00018872340259472035, "loss": 0.6612, "step": 69820 }, { "epoch": 1.554264601139601, "grad_norm": 0.46403852105140686, "learning_rate": 0.0001886768548329658, "loss": 0.5794, "step": 69830 }, { "epoch": 1.5544871794871795, "grad_norm": 0.6431500911712646, "learning_rate": 0.00018863030768652005, "loss": 0.5492, "step": 69840 }, { "epoch": 1.554709757834758, "grad_norm": 0.5244579911231995, "learning_rate": 0.00018858376115791277, "loss": 0.5452, "step": 69850 }, { "epoch": 1.5549323361823362, "grad_norm": 0.5913880467414856, "learning_rate": 0.00018853721524967322, "loss": 0.5058, "step": 69860 }, { "epoch": 1.5551549145299144, "grad_norm": 0.7027416825294495, "learning_rate": 0.0001884906699643308, "loss": 0.5089, "step": 69870 }, { "epoch": 1.5553774928774928, "grad_norm": 0.7109727263450623, "learning_rate": 0.00018844412530441478, "loss": 0.533, "step": 69880 }, { "epoch": 1.5556000712250713, "grad_norm": 0.5859348177909851, "learning_rate": 0.00018839758127245444, "loss": 0.5419, "step": 69890 }, { "epoch": 1.5558226495726495, "grad_norm": 0.7551231980323792, "learning_rate": 0.00018835103787097902, "loss": 0.6146, "step": 69900 }, { "epoch": 1.556045227920228, "grad_norm": 0.548856258392334, "learning_rate": 0.00018830449510251777, "loss": 0.6349, "step": 69910 }, { "epoch": 1.5562678062678064, "grad_norm": 0.5823180079460144, "learning_rate": 0.00018825795296959982, "loss": 0.6391, "step": 69920 }, { "epoch": 1.5564903846153846, "grad_norm": 0.6126546859741211, "learning_rate": 0.00018821141147475428, "loss": 0.433, "step": 69930 }, { "epoch": 1.5567129629629628, "grad_norm": 0.38551953434944153, "learning_rate": 0.0001881648706205104, "loss": 0.5385, "step": 69940 }, { "epoch": 1.5569355413105415, "grad_norm": 0.48852142691612244, "learning_rate": 0.00018811833040939703, "loss": 0.5742, "step": 69950 }, { "epoch": 1.5571581196581197, "grad_norm": 0.6741325855255127, "learning_rate": 0.00018807179084394344, "loss": 0.5228, "step": 69960 }, { "epoch": 1.557380698005698, "grad_norm": 0.4975391924381256, "learning_rate": 0.0001880252519266784, "loss": 0.5911, "step": 69970 }, { "epoch": 1.5576032763532763, "grad_norm": 0.868884265422821, "learning_rate": 0.00018797871366013096, "loss": 0.6094, "step": 69980 }, { "epoch": 1.5578258547008548, "grad_norm": 0.5068603157997131, "learning_rate": 0.0001879321760468301, "loss": 0.5399, "step": 69990 }, { "epoch": 1.558048433048433, "grad_norm": 0.4866209924221039, "learning_rate": 0.00018788563908930466, "loss": 0.656, "step": 70000 }, { "epoch": 1.5582710113960114, "grad_norm": 0.5103034377098083, "learning_rate": 0.0001878391027900835, "loss": 0.5125, "step": 70010 }, { "epoch": 1.5584935897435899, "grad_norm": 0.5046936869621277, "learning_rate": 0.00018779256715169547, "loss": 0.6247, "step": 70020 }, { "epoch": 1.558716168091168, "grad_norm": 0.9371962547302246, "learning_rate": 0.00018774603217666932, "loss": 0.6255, "step": 70030 }, { "epoch": 1.5589387464387463, "grad_norm": 0.5806150436401367, "learning_rate": 0.00018769949786753381, "loss": 0.5952, "step": 70040 }, { "epoch": 1.5591613247863247, "grad_norm": 0.5372239947319031, "learning_rate": 0.00018765296422681765, "loss": 0.5557, "step": 70050 }, { "epoch": 1.5593839031339032, "grad_norm": 0.42751508951187134, "learning_rate": 0.00018760643125704954, "loss": 0.6372, "step": 70060 }, { "epoch": 1.5596064814814814, "grad_norm": 0.3532927632331848, "learning_rate": 0.00018755989896075809, "loss": 0.5215, "step": 70070 }, { "epoch": 1.5598290598290598, "grad_norm": 0.6899681687355042, "learning_rate": 0.00018751336734047194, "loss": 0.5297, "step": 70080 }, { "epoch": 1.5600516381766383, "grad_norm": 0.4909971356391907, "learning_rate": 0.00018746683639871964, "loss": 0.5418, "step": 70090 }, { "epoch": 1.5601851851851851, "eval_loss": 0.564594566822052, "eval_runtime": 337.0867, "eval_samples_per_second": 7.016, "eval_steps_per_second": 7.016, "step": 70096 }, { "epoch": 1.5602742165242165, "grad_norm": 0.5869930982589722, "learning_rate": 0.0001874203061380297, "loss": 0.6224, "step": 70100 }, { "epoch": 1.5604967948717947, "grad_norm": 0.653296709060669, "learning_rate": 0.0001873737765609306, "loss": 0.5368, "step": 70110 }, { "epoch": 1.5607193732193734, "grad_norm": 0.6351401209831238, "learning_rate": 0.0001873272476699508, "loss": 0.6583, "step": 70120 }, { "epoch": 1.5609419515669516, "grad_norm": 0.6833685040473938, "learning_rate": 0.0001872807194676188, "loss": 0.502, "step": 70130 }, { "epoch": 1.5611645299145298, "grad_norm": 0.5794316530227661, "learning_rate": 0.00018723419195646284, "loss": 0.5486, "step": 70140 }, { "epoch": 1.5613871082621082, "grad_norm": 0.6334303021430969, "learning_rate": 0.00018718766513901134, "loss": 0.5893, "step": 70150 }, { "epoch": 1.5616096866096867, "grad_norm": 0.3508698642253876, "learning_rate": 0.00018714113901779266, "loss": 0.4961, "step": 70160 }, { "epoch": 1.561832264957265, "grad_norm": 0.44076502323150635, "learning_rate": 0.000187094613595335, "loss": 0.5079, "step": 70170 }, { "epoch": 1.5620548433048433, "grad_norm": 0.5710964202880859, "learning_rate": 0.00018704808887416656, "loss": 0.6312, "step": 70180 }, { "epoch": 1.5622774216524218, "grad_norm": 0.34635406732559204, "learning_rate": 0.00018700156485681563, "loss": 0.5937, "step": 70190 }, { "epoch": 1.5625, "grad_norm": 0.46365317702293396, "learning_rate": 0.00018695504154581026, "loss": 0.4601, "step": 70200 }, { "epoch": 1.5627225783475782, "grad_norm": 0.7181133031845093, "learning_rate": 0.00018690851894367864, "loss": 0.5682, "step": 70210 }, { "epoch": 1.5629451566951567, "grad_norm": 1.1888110637664795, "learning_rate": 0.00018686199705294883, "loss": 0.5758, "step": 70220 }, { "epoch": 1.563167735042735, "grad_norm": 0.5635021924972534, "learning_rate": 0.00018681547587614888, "loss": 0.4943, "step": 70230 }, { "epoch": 1.5633903133903133, "grad_norm": 0.7435877323150635, "learning_rate": 0.00018676895541580674, "loss": 0.4574, "step": 70240 }, { "epoch": 1.5636128917378918, "grad_norm": 0.5918731093406677, "learning_rate": 0.00018672243567445035, "loss": 0.5346, "step": 70250 }, { "epoch": 1.5638354700854702, "grad_norm": 0.5541383624076843, "learning_rate": 0.00018667591665460769, "loss": 0.5451, "step": 70260 }, { "epoch": 1.5640580484330484, "grad_norm": 0.8829107880592346, "learning_rate": 0.00018662939835880667, "loss": 0.6155, "step": 70270 }, { "epoch": 1.5642806267806266, "grad_norm": 0.5331526398658752, "learning_rate": 0.00018658288078957503, "loss": 0.492, "step": 70280 }, { "epoch": 1.5645032051282053, "grad_norm": 0.47868871688842773, "learning_rate": 0.00018653636394944066, "loss": 0.5991, "step": 70290 }, { "epoch": 1.5647257834757835, "grad_norm": 0.918210506439209, "learning_rate": 0.0001864898478409313, "loss": 0.5491, "step": 70300 }, { "epoch": 1.5649483618233617, "grad_norm": 0.48229533433914185, "learning_rate": 0.00018644333246657467, "loss": 0.4871, "step": 70310 }, { "epoch": 1.5651709401709402, "grad_norm": 0.6398565769195557, "learning_rate": 0.00018639681782889843, "loss": 0.7005, "step": 70320 }, { "epoch": 1.5653935185185186, "grad_norm": 0.43786656856536865, "learning_rate": 0.00018635030393043028, "loss": 0.6143, "step": 70330 }, { "epoch": 1.5656160968660968, "grad_norm": 0.5263013243675232, "learning_rate": 0.00018630379077369774, "loss": 0.5106, "step": 70340 }, { "epoch": 1.5658386752136753, "grad_norm": 0.6509763598442078, "learning_rate": 0.00018625727836122844, "loss": 0.6045, "step": 70350 }, { "epoch": 1.5660612535612537, "grad_norm": 0.45926612615585327, "learning_rate": 0.00018621076669554995, "loss": 0.5444, "step": 70360 }, { "epoch": 1.566283831908832, "grad_norm": 0.8301941752433777, "learning_rate": 0.0001861642557791896, "loss": 0.6015, "step": 70370 }, { "epoch": 1.5665064102564101, "grad_norm": 0.5289263129234314, "learning_rate": 0.00018611774561467498, "loss": 0.5817, "step": 70380 }, { "epoch": 1.5667289886039886, "grad_norm": 0.49125346541404724, "learning_rate": 0.0001860712362045334, "loss": 0.5645, "step": 70390 }, { "epoch": 1.566951566951567, "grad_norm": 0.6305233240127563, "learning_rate": 0.0001860247275512922, "loss": 0.6307, "step": 70400 }, { "epoch": 1.5671741452991452, "grad_norm": 0.7199904322624207, "learning_rate": 0.00018597821965747884, "loss": 0.4794, "step": 70410 }, { "epoch": 1.5673967236467237, "grad_norm": 0.45911091566085815, "learning_rate": 0.00018593171252562042, "loss": 0.6006, "step": 70420 }, { "epoch": 1.567619301994302, "grad_norm": 0.6156550049781799, "learning_rate": 0.00018588520615824428, "loss": 0.5031, "step": 70430 }, { "epoch": 1.5678418803418803, "grad_norm": 0.8003026247024536, "learning_rate": 0.00018583870055787762, "loss": 0.4507, "step": 70440 }, { "epoch": 1.5680644586894585, "grad_norm": 0.6622931361198425, "learning_rate": 0.00018579219572704752, "loss": 0.472, "step": 70450 }, { "epoch": 1.5682870370370372, "grad_norm": 0.588738739490509, "learning_rate": 0.00018574569166828115, "loss": 0.489, "step": 70460 }, { "epoch": 1.5685096153846154, "grad_norm": 0.652209997177124, "learning_rate": 0.0001856991883841056, "loss": 0.6046, "step": 70470 }, { "epoch": 1.5687321937321936, "grad_norm": 0.5961380004882812, "learning_rate": 0.00018565268587704783, "loss": 0.5215, "step": 70480 }, { "epoch": 1.568954772079772, "grad_norm": 0.48777759075164795, "learning_rate": 0.0001856061841496349, "loss": 0.5625, "step": 70490 }, { "epoch": 1.5691773504273505, "grad_norm": 0.4242284297943115, "learning_rate": 0.00018555968320439368, "loss": 0.4548, "step": 70500 }, { "epoch": 1.5693999287749287, "grad_norm": 0.49864986538887024, "learning_rate": 0.00018551318304385107, "loss": 0.5304, "step": 70510 }, { "epoch": 1.5696225071225072, "grad_norm": 0.8613521456718445, "learning_rate": 0.00018546668367053397, "loss": 0.6085, "step": 70520 }, { "epoch": 1.5698450854700856, "grad_norm": 0.49887996912002563, "learning_rate": 0.00018542018508696916, "loss": 0.6019, "step": 70530 }, { "epoch": 1.5700676638176638, "grad_norm": 0.48798030614852905, "learning_rate": 0.00018537368729568343, "loss": 0.5913, "step": 70540 }, { "epoch": 1.570290242165242, "grad_norm": 0.3880743384361267, "learning_rate": 0.00018532719029920353, "loss": 0.4696, "step": 70550 }, { "epoch": 1.5705128205128205, "grad_norm": 0.5356256365776062, "learning_rate": 0.0001852806941000561, "loss": 0.5902, "step": 70560 }, { "epoch": 1.570735398860399, "grad_norm": 0.8185734748840332, "learning_rate": 0.0001852341987007678, "loss": 0.5621, "step": 70570 }, { "epoch": 1.5709579772079771, "grad_norm": 0.8998014330863953, "learning_rate": 0.00018518770410386526, "loss": 0.4523, "step": 70580 }, { "epoch": 1.5711805555555556, "grad_norm": 0.6867089867591858, "learning_rate": 0.00018514121031187498, "loss": 0.6557, "step": 70590 }, { "epoch": 1.571403133903134, "grad_norm": 0.6047399640083313, "learning_rate": 0.00018509471732732348, "loss": 0.5615, "step": 70600 }, { "epoch": 1.5716257122507122, "grad_norm": 0.5918436050415039, "learning_rate": 0.00018504822515273727, "loss": 0.6941, "step": 70610 }, { "epoch": 1.5718482905982905, "grad_norm": 0.673238217830658, "learning_rate": 0.00018500173379064282, "loss": 0.5057, "step": 70620 }, { "epoch": 1.572070868945869, "grad_norm": 0.3770233988761902, "learning_rate": 0.00018495524324356631, "loss": 0.4909, "step": 70630 }, { "epoch": 1.5722934472934473, "grad_norm": 0.6354002356529236, "learning_rate": 0.00018490875351403425, "loss": 0.5185, "step": 70640 }, { "epoch": 1.5725160256410255, "grad_norm": 0.3865412175655365, "learning_rate": 0.00018486226460457286, "loss": 0.4186, "step": 70650 }, { "epoch": 1.572738603988604, "grad_norm": 0.7606604099273682, "learning_rate": 0.00018481577651770844, "loss": 0.522, "step": 70660 }, { "epoch": 1.5729611823361824, "grad_norm": 0.45895153284072876, "learning_rate": 0.0001847692892559671, "loss": 0.5963, "step": 70670 }, { "epoch": 1.5731837606837606, "grad_norm": 0.4174489378929138, "learning_rate": 0.00018472280282187504, "loss": 0.4792, "step": 70680 }, { "epoch": 1.573406339031339, "grad_norm": 1.0084757804870605, "learning_rate": 0.00018467631721795846, "loss": 0.619, "step": 70690 }, { "epoch": 1.5736289173789175, "grad_norm": 0.5571333169937134, "learning_rate": 0.00018462983244674328, "loss": 0.4807, "step": 70700 }, { "epoch": 1.5738514957264957, "grad_norm": 0.8347536325454712, "learning_rate": 0.0001845833485107556, "loss": 0.5284, "step": 70710 }, { "epoch": 1.574074074074074, "grad_norm": 0.9292858839035034, "learning_rate": 0.0001845368654125214, "loss": 0.5157, "step": 70720 }, { "epoch": 1.5742966524216524, "grad_norm": 0.639552652835846, "learning_rate": 0.0001844903831545666, "loss": 0.567, "step": 70730 }, { "epoch": 1.5745192307692308, "grad_norm": 0.7539328336715698, "learning_rate": 0.00018444390173941703, "loss": 0.5415, "step": 70740 }, { "epoch": 1.574741809116809, "grad_norm": 0.6498465538024902, "learning_rate": 0.00018439742116959867, "loss": 0.5269, "step": 70750 }, { "epoch": 1.5749643874643875, "grad_norm": 0.511473536491394, "learning_rate": 0.0001843509414476372, "loss": 0.5868, "step": 70760 }, { "epoch": 1.575186965811966, "grad_norm": 0.4643702805042267, "learning_rate": 0.0001843044625760583, "loss": 0.5276, "step": 70770 }, { "epoch": 1.5754095441595442, "grad_norm": 0.6674240231513977, "learning_rate": 0.00018425798455738777, "loss": 0.4897, "step": 70780 }, { "epoch": 1.5756321225071224, "grad_norm": 0.827182948589325, "learning_rate": 0.00018421150739415131, "loss": 0.5322, "step": 70790 }, { "epoch": 1.5758547008547008, "grad_norm": 0.6789349317550659, "learning_rate": 0.0001841650310888744, "loss": 0.5365, "step": 70800 }, { "epoch": 1.5760772792022792, "grad_norm": 0.4275471270084381, "learning_rate": 0.00018411855564408268, "loss": 0.5983, "step": 70810 }, { "epoch": 1.5762998575498575, "grad_norm": 0.5810036659240723, "learning_rate": 0.00018407208106230161, "loss": 0.5801, "step": 70820 }, { "epoch": 1.576522435897436, "grad_norm": 0.589556872844696, "learning_rate": 0.0001840256073460568, "loss": 0.6111, "step": 70830 }, { "epoch": 1.5767450142450143, "grad_norm": 0.8313420414924622, "learning_rate": 0.00018397913449787345, "loss": 0.5872, "step": 70840 }, { "epoch": 1.5769675925925926, "grad_norm": 0.4779322147369385, "learning_rate": 0.00018393266252027707, "loss": 0.6764, "step": 70850 }, { "epoch": 1.5771901709401708, "grad_norm": 0.6745999455451965, "learning_rate": 0.00018388619141579301, "loss": 0.4457, "step": 70860 }, { "epoch": 1.5774127492877494, "grad_norm": 0.5358150005340576, "learning_rate": 0.0001838397211869465, "loss": 0.5312, "step": 70870 }, { "epoch": 1.5776353276353277, "grad_norm": 0.5253196954727173, "learning_rate": 0.0001837932518362628, "loss": 0.6305, "step": 70880 }, { "epoch": 1.5778579059829059, "grad_norm": 0.592937707901001, "learning_rate": 0.000183746783366267, "loss": 0.4918, "step": 70890 }, { "epoch": 1.5780804843304843, "grad_norm": 0.5754040479660034, "learning_rate": 0.00018370031577948431, "loss": 0.478, "step": 70900 }, { "epoch": 1.5783030626780628, "grad_norm": 0.8871728181838989, "learning_rate": 0.00018365384907843978, "loss": 0.5147, "step": 70910 }, { "epoch": 1.578525641025641, "grad_norm": 0.7244358062744141, "learning_rate": 0.0001836073832656585, "loss": 0.5109, "step": 70920 }, { "epoch": 1.5787482193732194, "grad_norm": 0.6952163577079773, "learning_rate": 0.00018356091834366545, "loss": 0.5046, "step": 70930 }, { "epoch": 1.5789707977207978, "grad_norm": 0.5666680335998535, "learning_rate": 0.0001835144543149855, "loss": 0.6313, "step": 70940 }, { "epoch": 1.579193376068376, "grad_norm": 0.6511092782020569, "learning_rate": 0.00018346799118214361, "loss": 0.5232, "step": 70950 }, { "epoch": 1.5794159544159543, "grad_norm": 0.4347042739391327, "learning_rate": 0.00018342152894766463, "loss": 0.4945, "step": 70960 }, { "epoch": 1.5796385327635327, "grad_norm": 0.8085306286811829, "learning_rate": 0.00018337506761407338, "loss": 0.5116, "step": 70970 }, { "epoch": 1.5798611111111112, "grad_norm": 0.8530517816543579, "learning_rate": 0.0001833286071838945, "loss": 0.7389, "step": 70980 }, { "epoch": 1.5800836894586894, "grad_norm": 0.6170468926429749, "learning_rate": 0.00018328214765965276, "loss": 0.4751, "step": 70990 }, { "epoch": 1.5803062678062678, "grad_norm": 0.8308414220809937, "learning_rate": 0.00018323568904387284, "loss": 0.6135, "step": 71000 }, { "epoch": 1.5805288461538463, "grad_norm": 0.5918954610824585, "learning_rate": 0.00018318923133907937, "loss": 0.5327, "step": 71010 }, { "epoch": 1.5807514245014245, "grad_norm": 0.7379618287086487, "learning_rate": 0.0001831427745477967, "loss": 0.4964, "step": 71020 }, { "epoch": 1.5809740028490027, "grad_norm": 0.6630150675773621, "learning_rate": 0.0001830963186725495, "loss": 0.5815, "step": 71030 }, { "epoch": 1.5811965811965814, "grad_norm": 0.5696947574615479, "learning_rate": 0.00018304986371586225, "loss": 0.4795, "step": 71040 }, { "epoch": 1.5814191595441596, "grad_norm": 0.6283588409423828, "learning_rate": 0.00018300340968025917, "loss": 0.4752, "step": 71050 }, { "epoch": 1.5816417378917378, "grad_norm": 0.7472113370895386, "learning_rate": 0.00018295695656826477, "loss": 0.4727, "step": 71060 }, { "epoch": 1.5818643162393162, "grad_norm": 0.5877878665924072, "learning_rate": 0.00018291050438240332, "loss": 0.5458, "step": 71070 }, { "epoch": 1.5820868945868947, "grad_norm": 0.9098490476608276, "learning_rate": 0.000182864053125199, "loss": 0.6001, "step": 71080 }, { "epoch": 1.5823094729344729, "grad_norm": 0.5156393647193909, "learning_rate": 0.0001828176027991761, "loss": 0.6244, "step": 71090 }, { "epoch": 1.5825320512820513, "grad_norm": 0.6438578963279724, "learning_rate": 0.00018277115340685875, "loss": 0.5452, "step": 71100 }, { "epoch": 1.5827546296296298, "grad_norm": 0.6225441098213196, "learning_rate": 0.00018272470495077098, "loss": 0.5292, "step": 71110 }, { "epoch": 1.582977207977208, "grad_norm": 0.5010538101196289, "learning_rate": 0.0001826782574334369, "loss": 0.6081, "step": 71120 }, { "epoch": 1.5831997863247862, "grad_norm": 0.4037385582923889, "learning_rate": 0.00018263181085738047, "loss": 0.5426, "step": 71130 }, { "epoch": 1.5834223646723646, "grad_norm": 0.6718563437461853, "learning_rate": 0.00018258536522512578, "loss": 0.6698, "step": 71140 }, { "epoch": 1.583644943019943, "grad_norm": 0.5091724395751953, "learning_rate": 0.0001825389205391965, "loss": 0.6267, "step": 71150 }, { "epoch": 1.5838675213675213, "grad_norm": 0.5664081573486328, "learning_rate": 0.0001824924768021166, "loss": 0.6177, "step": 71160 }, { "epoch": 1.5840900997150997, "grad_norm": 1.1884257793426514, "learning_rate": 0.00018244603401640981, "loss": 0.4706, "step": 71170 }, { "epoch": 1.5843126780626782, "grad_norm": 0.4916132986545563, "learning_rate": 0.0001823995921845999, "loss": 0.7023, "step": 71180 }, { "epoch": 1.5845352564102564, "grad_norm": 0.45757535099983215, "learning_rate": 0.00018235315130921058, "loss": 0.545, "step": 71190 }, { "epoch": 1.5847578347578346, "grad_norm": 0.5892700552940369, "learning_rate": 0.00018230671139276544, "loss": 0.585, "step": 71200 }, { "epoch": 1.5849804131054133, "grad_norm": 0.5792036652565002, "learning_rate": 0.00018226027243778813, "loss": 0.5685, "step": 71210 }, { "epoch": 1.5852029914529915, "grad_norm": 0.6208937168121338, "learning_rate": 0.0001822138344468021, "loss": 0.6743, "step": 71220 }, { "epoch": 1.5854255698005697, "grad_norm": 0.4228981137275696, "learning_rate": 0.00018216739742233086, "loss": 0.4699, "step": 71230 }, { "epoch": 1.5856481481481481, "grad_norm": 0.3311798870563507, "learning_rate": 0.0001821209613668979, "loss": 0.5585, "step": 71240 }, { "epoch": 1.5858707264957266, "grad_norm": 0.617067813873291, "learning_rate": 0.00018207452628302647, "loss": 0.5973, "step": 71250 }, { "epoch": 1.5860933048433048, "grad_norm": 0.8205387592315674, "learning_rate": 0.00018202809217323997, "loss": 0.4705, "step": 71260 }, { "epoch": 1.5863158831908832, "grad_norm": 0.6504186391830444, "learning_rate": 0.00018198165904006165, "loss": 0.5848, "step": 71270 }, { "epoch": 1.5865384615384617, "grad_norm": 0.5408328771591187, "learning_rate": 0.00018193522688601482, "loss": 0.4718, "step": 71280 }, { "epoch": 1.58676103988604, "grad_norm": 0.5376555919647217, "learning_rate": 0.0001818887957136225, "loss": 0.6386, "step": 71290 }, { "epoch": 1.586983618233618, "grad_norm": 0.5056243538856506, "learning_rate": 0.0001818423655254078, "loss": 0.5108, "step": 71300 }, { "epoch": 1.5872061965811965, "grad_norm": 1.2390053272247314, "learning_rate": 0.00018179593632389383, "loss": 0.6817, "step": 71310 }, { "epoch": 1.587428774928775, "grad_norm": 0.614452600479126, "learning_rate": 0.00018174950811160364, "loss": 0.4678, "step": 71320 }, { "epoch": 1.5876513532763532, "grad_norm": 0.682161271572113, "learning_rate": 0.00018170308089106006, "loss": 0.5338, "step": 71330 }, { "epoch": 1.5878739316239316, "grad_norm": 0.5767924785614014, "learning_rate": 0.00018165665466478604, "loss": 0.6269, "step": 71340 }, { "epoch": 1.58809650997151, "grad_norm": 0.4488545060157776, "learning_rate": 0.00018161022943530446, "loss": 0.6213, "step": 71350 }, { "epoch": 1.5883190883190883, "grad_norm": 0.7338999509811401, "learning_rate": 0.00018156380520513804, "loss": 0.4092, "step": 71360 }, { "epoch": 1.5885416666666665, "grad_norm": 0.536692202091217, "learning_rate": 0.00018151738197680952, "loss": 0.4023, "step": 71370 }, { "epoch": 1.5887642450142452, "grad_norm": 0.6144670248031616, "learning_rate": 0.00018147095975284166, "loss": 0.6507, "step": 71380 }, { "epoch": 1.5889868233618234, "grad_norm": 0.8293641209602356, "learning_rate": 0.00018142453853575697, "loss": 0.4912, "step": 71390 }, { "epoch": 1.5892094017094016, "grad_norm": 0.41700026392936707, "learning_rate": 0.00018137811832807802, "loss": 0.5167, "step": 71400 }, { "epoch": 1.58943198005698, "grad_norm": 0.6886595487594604, "learning_rate": 0.0001813316991323275, "loss": 0.4783, "step": 71410 }, { "epoch": 1.5896545584045585, "grad_norm": 0.5606061816215515, "learning_rate": 0.00018128528095102763, "loss": 0.535, "step": 71420 }, { "epoch": 1.5898771367521367, "grad_norm": 0.47022682428359985, "learning_rate": 0.00018123886378670095, "loss": 0.4686, "step": 71430 }, { "epoch": 1.5900997150997151, "grad_norm": 0.7232200503349304, "learning_rate": 0.00018119244764186972, "loss": 0.6632, "step": 71440 }, { "epoch": 1.5903222934472936, "grad_norm": 0.6137561798095703, "learning_rate": 0.0001811460325190563, "loss": 0.4522, "step": 71450 }, { "epoch": 1.5905448717948718, "grad_norm": 0.586788535118103, "learning_rate": 0.00018109961842078295, "loss": 0.4802, "step": 71460 }, { "epoch": 1.59076745014245, "grad_norm": 0.7412348985671997, "learning_rate": 0.00018105320534957177, "loss": 0.5579, "step": 71470 }, { "epoch": 1.5909900284900285, "grad_norm": 0.5436372756958008, "learning_rate": 0.0001810067933079449, "loss": 0.6088, "step": 71480 }, { "epoch": 1.591212606837607, "grad_norm": 0.3423854410648346, "learning_rate": 0.0001809603822984245, "loss": 0.3988, "step": 71490 }, { "epoch": 1.5914351851851851, "grad_norm": 0.4644746780395508, "learning_rate": 0.00018091397232353245, "loss": 0.5718, "step": 71500 }, { "epoch": 1.5916577635327636, "grad_norm": 0.6078989505767822, "learning_rate": 0.00018086756338579077, "loss": 0.4827, "step": 71510 }, { "epoch": 1.591880341880342, "grad_norm": 0.419260174036026, "learning_rate": 0.0001808211554877214, "loss": 0.4581, "step": 71520 }, { "epoch": 1.5921029202279202, "grad_norm": 0.4638487994670868, "learning_rate": 0.0001807747486318461, "loss": 0.5443, "step": 71530 }, { "epoch": 1.5923254985754984, "grad_norm": 0.49142155051231384, "learning_rate": 0.0001807283428206868, "loss": 0.4928, "step": 71540 }, { "epoch": 1.5925480769230769, "grad_norm": 0.7733557224273682, "learning_rate": 0.00018068193805676502, "loss": 0.6753, "step": 71550 }, { "epoch": 1.5927706552706553, "grad_norm": 0.6128353476524353, "learning_rate": 0.00018063553434260254, "loss": 0.7242, "step": 71560 }, { "epoch": 1.5929932336182335, "grad_norm": 0.9399389028549194, "learning_rate": 0.00018058913168072106, "loss": 0.5613, "step": 71570 }, { "epoch": 1.593215811965812, "grad_norm": 0.47446173429489136, "learning_rate": 0.00018054273007364196, "loss": 0.5745, "step": 71580 }, { "epoch": 1.5934383903133904, "grad_norm": 0.6724503636360168, "learning_rate": 0.00018049632952388684, "loss": 0.6099, "step": 71590 }, { "epoch": 1.5936609686609686, "grad_norm": 0.5252869725227356, "learning_rate": 0.00018044993003397722, "loss": 0.5341, "step": 71600 }, { "epoch": 1.593883547008547, "grad_norm": 0.6296842098236084, "learning_rate": 0.00018040353160643432, "loss": 0.4828, "step": 71610 }, { "epoch": 1.5941061253561255, "grad_norm": 0.8079186081886292, "learning_rate": 0.0001803571342437796, "loss": 0.6052, "step": 71620 }, { "epoch": 1.5943287037037037, "grad_norm": 0.4828781485557556, "learning_rate": 0.0001803107379485343, "loss": 0.424, "step": 71630 }, { "epoch": 1.594551282051282, "grad_norm": 0.5179343819618225, "learning_rate": 0.00018026434272321956, "loss": 0.4261, "step": 71640 }, { "epoch": 1.5947738603988604, "grad_norm": 0.6875476837158203, "learning_rate": 0.00018021794857035663, "loss": 0.51, "step": 71650 }, { "epoch": 1.5949964387464388, "grad_norm": 0.7260758876800537, "learning_rate": 0.0001801715554924666, "loss": 0.5178, "step": 71660 }, { "epoch": 1.595219017094017, "grad_norm": 0.5954149961471558, "learning_rate": 0.00018012516349207047, "loss": 0.437, "step": 71670 }, { "epoch": 1.5954415954415955, "grad_norm": 0.6847278475761414, "learning_rate": 0.00018007877257168923, "loss": 0.5978, "step": 71680 }, { "epoch": 1.595664173789174, "grad_norm": 0.5501822233200073, "learning_rate": 0.00018003238273384375, "loss": 0.3647, "step": 71690 }, { "epoch": 1.5958867521367521, "grad_norm": 0.4372483789920807, "learning_rate": 0.00017998599398105495, "loss": 0.4032, "step": 71700 }, { "epoch": 1.5961093304843303, "grad_norm": 0.4264819920063019, "learning_rate": 0.00017993960631584367, "loss": 0.4546, "step": 71710 }, { "epoch": 1.5963319088319088, "grad_norm": 0.4155157506465912, "learning_rate": 0.00017989321974073053, "loss": 0.638, "step": 71720 }, { "epoch": 1.5965544871794872, "grad_norm": 0.45495685935020447, "learning_rate": 0.00017984683425823633, "loss": 0.5307, "step": 71730 }, { "epoch": 1.5967770655270654, "grad_norm": 0.6772878766059875, "learning_rate": 0.00017980044987088166, "loss": 0.6329, "step": 71740 }, { "epoch": 1.5969996438746439, "grad_norm": 0.5602226257324219, "learning_rate": 0.00017975406658118706, "loss": 0.63, "step": 71750 }, { "epoch": 1.5972222222222223, "grad_norm": 0.5901429653167725, "learning_rate": 0.00017970768439167303, "loss": 0.5602, "step": 71760 }, { "epoch": 1.5974448005698005, "grad_norm": 0.5109543800354004, "learning_rate": 0.0001796613033048601, "loss": 0.5425, "step": 71770 }, { "epoch": 1.5976673789173788, "grad_norm": 0.7157694101333618, "learning_rate": 0.00017961492332326857, "loss": 0.5909, "step": 71780 }, { "epoch": 1.5978899572649574, "grad_norm": 0.5277893543243408, "learning_rate": 0.00017956854444941878, "loss": 0.542, "step": 71790 }, { "epoch": 1.5981125356125356, "grad_norm": 0.5802013278007507, "learning_rate": 0.0001795221666858311, "loss": 0.556, "step": 71800 }, { "epoch": 1.5983351139601139, "grad_norm": 0.5859088897705078, "learning_rate": 0.00017947579003502556, "loss": 0.6438, "step": 71810 }, { "epoch": 1.5985576923076923, "grad_norm": 0.6160123944282532, "learning_rate": 0.00017942941449952245, "loss": 0.6975, "step": 71820 }, { "epoch": 1.5987802706552707, "grad_norm": 0.6131763458251953, "learning_rate": 0.00017938304008184174, "loss": 0.5347, "step": 71830 }, { "epoch": 1.599002849002849, "grad_norm": 0.5505862236022949, "learning_rate": 0.0001793366667845035, "loss": 0.6364, "step": 71840 }, { "epoch": 1.5992254273504274, "grad_norm": 0.5568742752075195, "learning_rate": 0.00017929029461002772, "loss": 0.5858, "step": 71850 }, { "epoch": 1.5994480056980058, "grad_norm": 0.6193007230758667, "learning_rate": 0.0001792439235609343, "loss": 0.4796, "step": 71860 }, { "epoch": 1.599670584045584, "grad_norm": 0.5258501172065735, "learning_rate": 0.00017919755363974304, "loss": 0.5053, "step": 71870 }, { "epoch": 1.5998931623931623, "grad_norm": 0.6486078500747681, "learning_rate": 0.00017915118484897378, "loss": 0.5535, "step": 71880 }, { "epoch": 1.6001157407407407, "grad_norm": 0.8163578510284424, "learning_rate": 0.00017910481719114616, "loss": 0.5581, "step": 71890 }, { "epoch": 1.6003383190883191, "grad_norm": 0.42560261487960815, "learning_rate": 0.0001790584506687799, "loss": 0.3899, "step": 71900 }, { "epoch": 1.6005608974358974, "grad_norm": 0.571181058883667, "learning_rate": 0.0001790120852843946, "loss": 0.5681, "step": 71910 }, { "epoch": 1.6007834757834758, "grad_norm": 0.5772104859352112, "learning_rate": 0.00017896572104050972, "loss": 0.5197, "step": 71920 }, { "epoch": 1.6010060541310542, "grad_norm": 0.4699803292751312, "learning_rate": 0.00017891935793964487, "loss": 0.475, "step": 71930 }, { "epoch": 1.6012286324786325, "grad_norm": 0.6732143759727478, "learning_rate": 0.0001788729959843193, "loss": 0.4731, "step": 71940 }, { "epoch": 1.6014512108262107, "grad_norm": 0.6661916971206665, "learning_rate": 0.00017882663517705246, "loss": 0.4658, "step": 71950 }, { "epoch": 1.6016737891737893, "grad_norm": 0.6001116037368774, "learning_rate": 0.00017878027552036354, "loss": 0.5499, "step": 71960 }, { "epoch": 1.6018963675213675, "grad_norm": 0.6547945141792297, "learning_rate": 0.00017873391701677183, "loss": 0.5831, "step": 71970 }, { "epoch": 1.6021189458689458, "grad_norm": 0.7915700674057007, "learning_rate": 0.00017868755966879646, "loss": 0.604, "step": 71980 }, { "epoch": 1.6023415242165242, "grad_norm": 0.28278616070747375, "learning_rate": 0.00017864120347895658, "loss": 0.4522, "step": 71990 }, { "epoch": 1.6025641025641026, "grad_norm": 0.4669821262359619, "learning_rate": 0.00017859484844977118, "loss": 0.5869, "step": 72000 }, { "epoch": 1.6027866809116809, "grad_norm": 0.907658040523529, "learning_rate": 0.00017854849458375923, "loss": 0.5388, "step": 72010 }, { "epoch": 1.6030092592592593, "grad_norm": 0.5259823203086853, "learning_rate": 0.00017850214188343968, "loss": 0.6287, "step": 72020 }, { "epoch": 1.6032318376068377, "grad_norm": 0.5709260702133179, "learning_rate": 0.0001784557903513313, "loss": 0.5868, "step": 72030 }, { "epoch": 1.603454415954416, "grad_norm": 0.5346744060516357, "learning_rate": 0.0001784094399899529, "loss": 0.5212, "step": 72040 }, { "epoch": 1.6036769943019942, "grad_norm": 0.5017918348312378, "learning_rate": 0.00017836309080182327, "loss": 0.5202, "step": 72050 }, { "epoch": 1.6038995726495726, "grad_norm": 0.520685076713562, "learning_rate": 0.00017831674278946102, "loss": 0.5132, "step": 72060 }, { "epoch": 1.604122150997151, "grad_norm": 0.6095485687255859, "learning_rate": 0.00017827039595538465, "loss": 0.6865, "step": 72070 }, { "epoch": 1.6043447293447293, "grad_norm": 0.47989124059677124, "learning_rate": 0.00017822405030211273, "loss": 0.4015, "step": 72080 }, { "epoch": 1.6045673076923077, "grad_norm": 0.5909743309020996, "learning_rate": 0.0001781777058321638, "loss": 0.4716, "step": 72090 }, { "epoch": 1.6047898860398861, "grad_norm": 0.5345268249511719, "learning_rate": 0.00017813136254805616, "loss": 0.5581, "step": 72100 }, { "epoch": 1.6050124643874644, "grad_norm": 0.5118623375892639, "learning_rate": 0.00017808502045230817, "loss": 0.5295, "step": 72110 }, { "epoch": 1.6052350427350426, "grad_norm": 0.5770447850227356, "learning_rate": 0.00017803867954743811, "loss": 0.4923, "step": 72120 }, { "epoch": 1.6054576210826212, "grad_norm": 0.49001508951187134, "learning_rate": 0.00017799233983596425, "loss": 0.5184, "step": 72130 }, { "epoch": 1.6056801994301995, "grad_norm": 0.6325069665908813, "learning_rate": 0.00017794600132040457, "loss": 0.5174, "step": 72140 }, { "epoch": 1.6059027777777777, "grad_norm": 0.36604610085487366, "learning_rate": 0.00017789966400327727, "loss": 0.5228, "step": 72150 }, { "epoch": 1.6061253561253561, "grad_norm": 0.5231015682220459, "learning_rate": 0.00017785332788710037, "loss": 0.5275, "step": 72160 }, { "epoch": 1.6063479344729346, "grad_norm": 0.8676664233207703, "learning_rate": 0.0001778069929743917, "loss": 0.6281, "step": 72170 }, { "epoch": 1.6065705128205128, "grad_norm": 0.4106631875038147, "learning_rate": 0.0001777606592676692, "loss": 0.4852, "step": 72180 }, { "epoch": 1.6067930911680912, "grad_norm": 0.4075709581375122, "learning_rate": 0.00017771432676945076, "loss": 0.4958, "step": 72190 }, { "epoch": 1.6070156695156697, "grad_norm": 0.6719409823417664, "learning_rate": 0.000177667995482254, "loss": 0.5834, "step": 72200 }, { "epoch": 1.6072382478632479, "grad_norm": 0.6902719736099243, "learning_rate": 0.00017762166540859664, "loss": 0.4113, "step": 72210 }, { "epoch": 1.607460826210826, "grad_norm": 0.5260211229324341, "learning_rate": 0.00017757533655099627, "loss": 0.5081, "step": 72220 }, { "epoch": 1.6076834045584045, "grad_norm": 0.4960169792175293, "learning_rate": 0.00017752900891197053, "loss": 0.6683, "step": 72230 }, { "epoch": 1.607905982905983, "grad_norm": 0.5500181913375854, "learning_rate": 0.00017748268249403678, "loss": 0.5539, "step": 72240 }, { "epoch": 1.6081285612535612, "grad_norm": 0.5022486448287964, "learning_rate": 0.0001774363572997125, "loss": 0.5495, "step": 72250 }, { "epoch": 1.6083511396011396, "grad_norm": 0.4881563186645508, "learning_rate": 0.00017739003333151505, "loss": 0.5351, "step": 72260 }, { "epoch": 1.608573717948718, "grad_norm": 0.5786612033843994, "learning_rate": 0.00017734371059196176, "loss": 0.5741, "step": 72270 }, { "epoch": 1.6087962962962963, "grad_norm": 0.5145733952522278, "learning_rate": 0.0001772973890835697, "loss": 0.5077, "step": 72280 }, { "epoch": 1.6090188746438745, "grad_norm": 0.532596230506897, "learning_rate": 0.00017725106880885612, "loss": 0.5594, "step": 72290 }, { "epoch": 1.609241452991453, "grad_norm": 0.4536045491695404, "learning_rate": 0.0001772047497703381, "loss": 0.5199, "step": 72300 }, { "epoch": 1.6094640313390314, "grad_norm": 0.7692566514015198, "learning_rate": 0.00017715843197053263, "loss": 0.4523, "step": 72310 }, { "epoch": 1.6096866096866096, "grad_norm": 0.7244535684585571, "learning_rate": 0.00017711211541195676, "loss": 0.5721, "step": 72320 }, { "epoch": 1.609909188034188, "grad_norm": 0.477071076631546, "learning_rate": 0.00017706580009712716, "loss": 0.5792, "step": 72330 }, { "epoch": 1.6101317663817665, "grad_norm": 1.0131349563598633, "learning_rate": 0.00017701948602856082, "loss": 0.5588, "step": 72340 }, { "epoch": 1.6103543447293447, "grad_norm": 0.5438540577888489, "learning_rate": 0.00017697317320877436, "loss": 0.4861, "step": 72350 }, { "epoch": 1.6105769230769231, "grad_norm": 0.4957207143306732, "learning_rate": 0.00017692686164028453, "loss": 0.4832, "step": 72360 }, { "epoch": 1.6107995014245016, "grad_norm": 0.5664975643157959, "learning_rate": 0.00017688055132560797, "loss": 0.5985, "step": 72370 }, { "epoch": 1.6110220797720798, "grad_norm": 0.5937734246253967, "learning_rate": 0.00017683424226726114, "loss": 0.5806, "step": 72380 }, { "epoch": 1.611244658119658, "grad_norm": 0.48959553241729736, "learning_rate": 0.00017678793446776055, "loss": 0.5822, "step": 72390 }, { "epoch": 1.6114672364672364, "grad_norm": 0.7238348126411438, "learning_rate": 0.00017674162792962264, "loss": 0.5435, "step": 72400 }, { "epoch": 1.6116898148148149, "grad_norm": 0.5640783309936523, "learning_rate": 0.00017669532265536365, "loss": 0.5744, "step": 72410 }, { "epoch": 1.611912393162393, "grad_norm": 0.44172927737236023, "learning_rate": 0.0001766490186474999, "loss": 0.6866, "step": 72420 }, { "epoch": 1.6121349715099715, "grad_norm": 0.8031690120697021, "learning_rate": 0.00017660271590854765, "loss": 0.6749, "step": 72430 }, { "epoch": 1.61235754985755, "grad_norm": 0.6939573287963867, "learning_rate": 0.00017655641444102294, "loss": 0.5717, "step": 72440 }, { "epoch": 1.6125801282051282, "grad_norm": 0.3727976679801941, "learning_rate": 0.00017651011424744189, "loss": 0.4701, "step": 72450 }, { "epoch": 1.6128027065527064, "grad_norm": 0.43301764130592346, "learning_rate": 0.0001764638153303205, "loss": 0.5043, "step": 72460 }, { "epoch": 1.6130252849002849, "grad_norm": 0.5547754168510437, "learning_rate": 0.00017641751769217459, "loss": 0.5573, "step": 72470 }, { "epoch": 1.6132478632478633, "grad_norm": 0.5771608352661133, "learning_rate": 0.0001763712213355201, "loss": 0.4549, "step": 72480 }, { "epoch": 1.6134704415954415, "grad_norm": 0.7360827922821045, "learning_rate": 0.00017632492626287275, "loss": 0.5214, "step": 72490 }, { "epoch": 1.61369301994302, "grad_norm": 0.5349162220954895, "learning_rate": 0.00017627863247674828, "loss": 0.4989, "step": 72500 }, { "epoch": 1.6139155982905984, "grad_norm": 0.3698183000087738, "learning_rate": 0.00017623233997966239, "loss": 0.4294, "step": 72510 }, { "epoch": 1.6141381766381766, "grad_norm": 0.4332517683506012, "learning_rate": 0.00017618604877413056, "loss": 0.55, "step": 72520 }, { "epoch": 1.6143607549857548, "grad_norm": 0.4140152037143707, "learning_rate": 0.00017613975886266833, "loss": 0.5283, "step": 72530 }, { "epoch": 1.6145833333333335, "grad_norm": 0.7178450226783752, "learning_rate": 0.0001760934702477912, "loss": 0.6523, "step": 72540 }, { "epoch": 1.6148059116809117, "grad_norm": 0.5415472984313965, "learning_rate": 0.0001760471829320144, "loss": 0.5014, "step": 72550 }, { "epoch": 1.61502849002849, "grad_norm": 0.8831973075866699, "learning_rate": 0.0001760008969178533, "loss": 0.7235, "step": 72560 }, { "epoch": 1.6152510683760684, "grad_norm": 0.6190544962882996, "learning_rate": 0.0001759546122078231, "loss": 0.595, "step": 72570 }, { "epoch": 1.6154736467236468, "grad_norm": 0.5446276068687439, "learning_rate": 0.000175908328804439, "loss": 0.5364, "step": 72580 }, { "epoch": 1.615696225071225, "grad_norm": 0.500360369682312, "learning_rate": 0.00017586204671021612, "loss": 0.6368, "step": 72590 }, { "epoch": 1.6159188034188035, "grad_norm": 0.6088921427726746, "learning_rate": 0.00017581576592766925, "loss": 0.4953, "step": 72600 }, { "epoch": 1.616141381766382, "grad_norm": 0.7649800777435303, "learning_rate": 0.00017576948645931347, "loss": 0.6165, "step": 72610 }, { "epoch": 1.61636396011396, "grad_norm": 0.7882862091064453, "learning_rate": 0.00017572320830766368, "loss": 0.4718, "step": 72620 }, { "epoch": 1.6165865384615383, "grad_norm": 0.7119544744491577, "learning_rate": 0.00017567693147523455, "loss": 0.5845, "step": 72630 }, { "epoch": 1.6168091168091168, "grad_norm": 0.5975489020347595, "learning_rate": 0.0001756306559645409, "loss": 0.5229, "step": 72640 }, { "epoch": 1.6170316951566952, "grad_norm": 0.389389306306839, "learning_rate": 0.0001755843817780974, "loss": 0.4661, "step": 72650 }, { "epoch": 1.6172542735042734, "grad_norm": 0.4852727949619293, "learning_rate": 0.00017553810891841854, "loss": 0.5355, "step": 72660 }, { "epoch": 1.6174768518518519, "grad_norm": 0.48424190282821655, "learning_rate": 0.00017549183738801887, "loss": 0.522, "step": 72670 }, { "epoch": 1.6176994301994303, "grad_norm": 0.6597685217857361, "learning_rate": 0.00017544556718941285, "loss": 0.5024, "step": 72680 }, { "epoch": 1.6179220085470085, "grad_norm": 0.724755048751831, "learning_rate": 0.00017539929832511477, "loss": 0.5394, "step": 72690 }, { "epoch": 1.6181445868945867, "grad_norm": 0.6331667900085449, "learning_rate": 0.00017535303079763893, "loss": 0.5192, "step": 72700 }, { "epoch": 1.6183671652421654, "grad_norm": 0.4962537884712219, "learning_rate": 0.00017530676460949968, "loss": 0.5735, "step": 72710 }, { "epoch": 1.6185897435897436, "grad_norm": 0.3787330090999603, "learning_rate": 0.000175260499763211, "loss": 0.4621, "step": 72720 }, { "epoch": 1.6188123219373218, "grad_norm": 0.543692946434021, "learning_rate": 0.00017521423626128704, "loss": 0.6384, "step": 72730 }, { "epoch": 1.6190349002849003, "grad_norm": 0.3744504749774933, "learning_rate": 0.00017516797410624174, "loss": 0.6521, "step": 72740 }, { "epoch": 1.6192574786324787, "grad_norm": 0.5380321145057678, "learning_rate": 0.00017512171330058904, "loss": 0.5083, "step": 72750 }, { "epoch": 1.619480056980057, "grad_norm": 0.6366248726844788, "learning_rate": 0.00017507545384684285, "loss": 0.7177, "step": 72760 }, { "epoch": 1.6197026353276354, "grad_norm": 0.3844091296195984, "learning_rate": 0.00017502919574751685, "loss": 0.4953, "step": 72770 }, { "epoch": 1.6199252136752138, "grad_norm": 0.645712673664093, "learning_rate": 0.0001749829390051248, "loss": 0.5604, "step": 72780 }, { "epoch": 1.620147792022792, "grad_norm": 0.4898901581764221, "learning_rate": 0.0001749366836221804, "loss": 0.745, "step": 72790 }, { "epoch": 1.6201923076923077, "eval_loss": 0.5608183145523071, "eval_runtime": 338.6161, "eval_samples_per_second": 6.984, "eval_steps_per_second": 6.984, "step": 72792 }, { "epoch": 1.6203703703703702, "grad_norm": 0.5302802324295044, "learning_rate": 0.00017489042960119707, "loss": 0.5637, "step": 72800 }, { "epoch": 1.6205929487179487, "grad_norm": 0.45127832889556885, "learning_rate": 0.00017484417694468832, "loss": 0.5059, "step": 72810 }, { "epoch": 1.6208155270655271, "grad_norm": 0.46210169792175293, "learning_rate": 0.00017479792565516764, "loss": 0.4899, "step": 72820 }, { "epoch": 1.6210381054131053, "grad_norm": 0.6143471002578735, "learning_rate": 0.0001747516757351483, "loss": 0.4319, "step": 72830 }, { "epoch": 1.6212606837606838, "grad_norm": 0.6548733115196228, "learning_rate": 0.00017470542718714353, "loss": 0.4886, "step": 72840 }, { "epoch": 1.6214832621082622, "grad_norm": 0.6141097545623779, "learning_rate": 0.00017465918001366668, "loss": 0.5904, "step": 72850 }, { "epoch": 1.6217058404558404, "grad_norm": 0.6009771823883057, "learning_rate": 0.0001746129342172306, "loss": 0.6371, "step": 72860 }, { "epoch": 1.6219284188034186, "grad_norm": 0.45862460136413574, "learning_rate": 0.00017456668980034854, "loss": 0.5505, "step": 72870 }, { "epoch": 1.6221509971509973, "grad_norm": 0.550284743309021, "learning_rate": 0.00017452044676553332, "loss": 0.4793, "step": 72880 }, { "epoch": 1.6223735754985755, "grad_norm": 0.35887646675109863, "learning_rate": 0.00017447420511529789, "loss": 0.4587, "step": 72890 }, { "epoch": 1.6225961538461537, "grad_norm": 0.5029733180999756, "learning_rate": 0.0001744279648521551, "loss": 0.447, "step": 72900 }, { "epoch": 1.6228187321937322, "grad_norm": 0.621500551700592, "learning_rate": 0.00017438172597861754, "loss": 0.4205, "step": 72910 }, { "epoch": 1.6230413105413106, "grad_norm": 0.8010604381561279, "learning_rate": 0.00017433548849719803, "loss": 0.7382, "step": 72920 }, { "epoch": 1.6232638888888888, "grad_norm": 0.5936803817749023, "learning_rate": 0.0001742892524104091, "loss": 0.4996, "step": 72930 }, { "epoch": 1.6234864672364673, "grad_norm": 0.589506208896637, "learning_rate": 0.0001742430177207632, "loss": 0.5262, "step": 72940 }, { "epoch": 1.6237090455840457, "grad_norm": 0.6230021119117737, "learning_rate": 0.00017419678443077278, "loss": 0.5251, "step": 72950 }, { "epoch": 1.623931623931624, "grad_norm": 0.5615085363388062, "learning_rate": 0.00017415055254295028, "loss": 0.4211, "step": 72960 }, { "epoch": 1.6241542022792022, "grad_norm": 0.7193329334259033, "learning_rate": 0.00017410432205980787, "loss": 0.4324, "step": 72970 }, { "epoch": 1.6243767806267806, "grad_norm": 0.5282472968101501, "learning_rate": 0.00017405809298385788, "loss": 0.4229, "step": 72980 }, { "epoch": 1.624599358974359, "grad_norm": 0.2699781358242035, "learning_rate": 0.00017401186531761228, "loss": 0.5352, "step": 72990 }, { "epoch": 1.6248219373219372, "grad_norm": 0.6161572933197021, "learning_rate": 0.00017396563906358319, "loss": 0.5319, "step": 73000 }, { "epoch": 1.6250445156695157, "grad_norm": 0.4191792607307434, "learning_rate": 0.00017391941422428264, "loss": 0.5876, "step": 73010 }, { "epoch": 1.6252670940170941, "grad_norm": 0.5169584155082703, "learning_rate": 0.00017387319080222242, "loss": 0.606, "step": 73020 }, { "epoch": 1.6254896723646723, "grad_norm": 0.510918915271759, "learning_rate": 0.00017382696879991437, "loss": 0.5738, "step": 73030 }, { "epoch": 1.6257122507122506, "grad_norm": 0.49762460589408875, "learning_rate": 0.00017378074821987035, "loss": 0.5594, "step": 73040 }, { "epoch": 1.6259348290598292, "grad_norm": 0.5228525400161743, "learning_rate": 0.00017373452906460188, "loss": 0.4566, "step": 73050 }, { "epoch": 1.6261574074074074, "grad_norm": 0.46729880571365356, "learning_rate": 0.00017368831133662057, "loss": 0.4876, "step": 73060 }, { "epoch": 1.6263799857549857, "grad_norm": 0.7242257595062256, "learning_rate": 0.00017364209503843806, "loss": 0.6487, "step": 73070 }, { "epoch": 1.626602564102564, "grad_norm": 0.4142407774925232, "learning_rate": 0.0001735958801725656, "loss": 0.4958, "step": 73080 }, { "epoch": 1.6268251424501425, "grad_norm": 0.48071005940437317, "learning_rate": 0.00017354966674151462, "loss": 0.4204, "step": 73090 }, { "epoch": 1.6270477207977208, "grad_norm": 0.44896841049194336, "learning_rate": 0.00017350345474779648, "loss": 0.4794, "step": 73100 }, { "epoch": 1.6272702991452992, "grad_norm": 0.5316339731216431, "learning_rate": 0.0001734572441939223, "loss": 0.6107, "step": 73110 }, { "epoch": 1.6274928774928776, "grad_norm": 0.9510208964347839, "learning_rate": 0.00017341103508240318, "loss": 0.6287, "step": 73120 }, { "epoch": 1.6277154558404558, "grad_norm": 0.5629388093948364, "learning_rate": 0.00017336482741575013, "loss": 0.5422, "step": 73130 }, { "epoch": 1.627938034188034, "grad_norm": 0.377602219581604, "learning_rate": 0.00017331862119647417, "loss": 0.631, "step": 73140 }, { "epoch": 1.6281606125356125, "grad_norm": 0.4106764495372772, "learning_rate": 0.00017327241642708623, "loss": 0.4257, "step": 73150 }, { "epoch": 1.628383190883191, "grad_norm": 0.3863488435745239, "learning_rate": 0.00017322621311009702, "loss": 0.4273, "step": 73160 }, { "epoch": 1.6286057692307692, "grad_norm": 0.6114441156387329, "learning_rate": 0.0001731800112480173, "loss": 0.4727, "step": 73170 }, { "epoch": 1.6288283475783476, "grad_norm": 0.8081455230712891, "learning_rate": 0.00017313381084335775, "loss": 0.6697, "step": 73180 }, { "epoch": 1.629050925925926, "grad_norm": 0.5485579967498779, "learning_rate": 0.0001730876118986289, "loss": 0.486, "step": 73190 }, { "epoch": 1.6292735042735043, "grad_norm": 0.6874799132347107, "learning_rate": 0.00017304141441634122, "loss": 0.4315, "step": 73200 }, { "epoch": 1.6294960826210825, "grad_norm": 0.5704666376113892, "learning_rate": 0.0001729952183990052, "loss": 0.4986, "step": 73210 }, { "epoch": 1.629718660968661, "grad_norm": 0.4740930199623108, "learning_rate": 0.0001729490238491311, "loss": 0.4556, "step": 73220 }, { "epoch": 1.6299412393162394, "grad_norm": 0.5930688381195068, "learning_rate": 0.00017290283076922919, "loss": 0.6498, "step": 73230 }, { "epoch": 1.6301638176638176, "grad_norm": 0.5056358575820923, "learning_rate": 0.0001728566391618097, "loss": 0.5727, "step": 73240 }, { "epoch": 1.630386396011396, "grad_norm": 0.5368411540985107, "learning_rate": 0.00017281044902938263, "loss": 0.3919, "step": 73250 }, { "epoch": 1.6306089743589745, "grad_norm": 0.7721600532531738, "learning_rate": 0.00017276426037445797, "loss": 0.4666, "step": 73260 }, { "epoch": 1.6308315527065527, "grad_norm": 0.7881020307540894, "learning_rate": 0.0001727180731995457, "loss": 0.548, "step": 73270 }, { "epoch": 1.631054131054131, "grad_norm": 0.5035044550895691, "learning_rate": 0.00017267188750715566, "loss": 0.5716, "step": 73280 }, { "epoch": 1.6312767094017095, "grad_norm": 0.6801565289497375, "learning_rate": 0.0001726257032997977, "loss": 0.4976, "step": 73290 }, { "epoch": 1.6314992877492878, "grad_norm": 0.41549575328826904, "learning_rate": 0.00017257952057998135, "loss": 0.5138, "step": 73300 }, { "epoch": 1.631721866096866, "grad_norm": 0.6814451217651367, "learning_rate": 0.0001725333393502163, "loss": 0.5514, "step": 73310 }, { "epoch": 1.6319444444444444, "grad_norm": 0.6054021120071411, "learning_rate": 0.00017248715961301213, "loss": 0.5926, "step": 73320 }, { "epoch": 1.6321670227920229, "grad_norm": 0.788329005241394, "learning_rate": 0.0001724409813708782, "loss": 0.5818, "step": 73330 }, { "epoch": 1.632389601139601, "grad_norm": 0.7401924729347229, "learning_rate": 0.00017239480462632388, "loss": 0.5906, "step": 73340 }, { "epoch": 1.6326121794871795, "grad_norm": 0.8028993606567383, "learning_rate": 0.00017234862938185852, "loss": 0.5838, "step": 73350 }, { "epoch": 1.632834757834758, "grad_norm": 0.5766822099685669, "learning_rate": 0.00017230245563999122, "loss": 0.5865, "step": 73360 }, { "epoch": 1.6330573361823362, "grad_norm": 0.7944523096084595, "learning_rate": 0.00017225628340323127, "loss": 0.636, "step": 73370 }, { "epoch": 1.6332799145299144, "grad_norm": 0.5362275242805481, "learning_rate": 0.00017221011267408751, "loss": 0.6232, "step": 73380 }, { "epoch": 1.6335024928774928, "grad_norm": 0.4361020028591156, "learning_rate": 0.000172163943455069, "loss": 0.4872, "step": 73390 }, { "epoch": 1.6337250712250713, "grad_norm": 0.6074864268302917, "learning_rate": 0.00017211777574868456, "loss": 0.485, "step": 73400 }, { "epoch": 1.6339476495726495, "grad_norm": 0.7449727654457092, "learning_rate": 0.00017207160955744303, "loss": 0.4808, "step": 73410 }, { "epoch": 1.634170227920228, "grad_norm": 0.830277681350708, "learning_rate": 0.00017202544488385306, "loss": 0.564, "step": 73420 }, { "epoch": 1.6343928062678064, "grad_norm": 0.505328893661499, "learning_rate": 0.00017197928173042336, "loss": 0.4169, "step": 73430 }, { "epoch": 1.6346153846153846, "grad_norm": 0.4163178503513336, "learning_rate": 0.0001719331200996624, "loss": 0.3699, "step": 73440 }, { "epoch": 1.6348379629629628, "grad_norm": 0.6493053436279297, "learning_rate": 0.00017188695999407867, "loss": 0.4961, "step": 73450 }, { "epoch": 1.6350605413105415, "grad_norm": 0.3108616769313812, "learning_rate": 0.0001718408014161806, "loss": 0.5145, "step": 73460 }, { "epoch": 1.6352831196581197, "grad_norm": 0.4791993498802185, "learning_rate": 0.0001717946443684764, "loss": 0.5821, "step": 73470 }, { "epoch": 1.635505698005698, "grad_norm": 0.48567846417427063, "learning_rate": 0.00017174848885347427, "loss": 0.4676, "step": 73480 }, { "epoch": 1.6357282763532763, "grad_norm": 0.5976247191429138, "learning_rate": 0.00017170233487368247, "loss": 0.5768, "step": 73490 }, { "epoch": 1.6359508547008548, "grad_norm": 0.534724771976471, "learning_rate": 0.0001716561824316089, "loss": 0.5824, "step": 73500 }, { "epoch": 1.636173433048433, "grad_norm": 0.593966543674469, "learning_rate": 0.00017161003152976165, "loss": 0.5155, "step": 73510 }, { "epoch": 1.6363960113960114, "grad_norm": 0.675961971282959, "learning_rate": 0.00017156388217064847, "loss": 0.512, "step": 73520 }, { "epoch": 1.6366185897435899, "grad_norm": 0.67283034324646, "learning_rate": 0.00017151773435677726, "loss": 0.6103, "step": 73530 }, { "epoch": 1.636841168091168, "grad_norm": 0.6470919251441956, "learning_rate": 0.0001714715880906556, "loss": 0.5908, "step": 73540 }, { "epoch": 1.6370637464387463, "grad_norm": 0.39839234948158264, "learning_rate": 0.00017142544337479123, "loss": 0.5409, "step": 73550 }, { "epoch": 1.6372863247863247, "grad_norm": 0.6064070463180542, "learning_rate": 0.00017137930021169165, "loss": 0.4618, "step": 73560 }, { "epoch": 1.6375089031339032, "grad_norm": 0.46118393540382385, "learning_rate": 0.00017133315860386436, "loss": 0.5061, "step": 73570 }, { "epoch": 1.6377314814814814, "grad_norm": 0.5544940829277039, "learning_rate": 0.00017128701855381666, "loss": 0.5178, "step": 73580 }, { "epoch": 1.6379540598290598, "grad_norm": 0.5723961591720581, "learning_rate": 0.00017124088006405588, "loss": 0.6169, "step": 73590 }, { "epoch": 1.6381766381766383, "grad_norm": 0.6090016961097717, "learning_rate": 0.00017119474313708927, "loss": 0.4677, "step": 73600 }, { "epoch": 1.6383992165242165, "grad_norm": 0.6423790454864502, "learning_rate": 0.00017114860777542386, "loss": 0.6065, "step": 73610 }, { "epoch": 1.6386217948717947, "grad_norm": 0.6712021231651306, "learning_rate": 0.0001711024739815667, "loss": 0.5973, "step": 73620 }, { "epoch": 1.6388443732193734, "grad_norm": 0.4111859202384949, "learning_rate": 0.00017105634175802482, "loss": 0.5508, "step": 73630 }, { "epoch": 1.6390669515669516, "grad_norm": 0.571266233921051, "learning_rate": 0.00017101021110730506, "loss": 0.5742, "step": 73640 }, { "epoch": 1.6392895299145298, "grad_norm": 0.658608615398407, "learning_rate": 0.00017096408203191407, "loss": 0.5593, "step": 73650 }, { "epoch": 1.6395121082621082, "grad_norm": 0.6612945199012756, "learning_rate": 0.00017091795453435865, "loss": 0.6158, "step": 73660 }, { "epoch": 1.6397346866096867, "grad_norm": 0.504089891910553, "learning_rate": 0.00017087182861714542, "loss": 0.4897, "step": 73670 }, { "epoch": 1.639957264957265, "grad_norm": 0.63788902759552, "learning_rate": 0.00017082570428278082, "loss": 0.4936, "step": 73680 }, { "epoch": 1.6401798433048433, "grad_norm": 0.7329126596450806, "learning_rate": 0.00017077958153377133, "loss": 0.608, "step": 73690 }, { "epoch": 1.6404024216524218, "grad_norm": 0.5291252732276917, "learning_rate": 0.00017073346037262336, "loss": 0.4443, "step": 73700 }, { "epoch": 1.640625, "grad_norm": 0.4839900732040405, "learning_rate": 0.00017068734080184306, "loss": 0.5896, "step": 73710 }, { "epoch": 1.6408475783475782, "grad_norm": 0.6721231937408447, "learning_rate": 0.00017064122282393665, "loss": 0.6759, "step": 73720 }, { "epoch": 1.6410701566951567, "grad_norm": 0.7454251646995544, "learning_rate": 0.00017059510644141027, "loss": 0.56, "step": 73730 }, { "epoch": 1.641292735042735, "grad_norm": 0.5054280757904053, "learning_rate": 0.00017054899165676988, "loss": 0.4465, "step": 73740 }, { "epoch": 1.6415153133903133, "grad_norm": 0.6631110906600952, "learning_rate": 0.00017050287847252134, "loss": 0.5212, "step": 73750 }, { "epoch": 1.6417378917378918, "grad_norm": 0.5210461020469666, "learning_rate": 0.00017045676689117056, "loss": 0.5512, "step": 73760 }, { "epoch": 1.6419604700854702, "grad_norm": 0.504281759262085, "learning_rate": 0.0001704106569152233, "loss": 0.5389, "step": 73770 }, { "epoch": 1.6421830484330484, "grad_norm": 0.8250054121017456, "learning_rate": 0.00017036454854718516, "loss": 0.5495, "step": 73780 }, { "epoch": 1.6424056267806266, "grad_norm": 0.4285098910331726, "learning_rate": 0.00017031844178956167, "loss": 0.5417, "step": 73790 }, { "epoch": 1.6426282051282053, "grad_norm": 0.6083645820617676, "learning_rate": 0.00017027233664485832, "loss": 0.5558, "step": 73800 }, { "epoch": 1.6428507834757835, "grad_norm": 0.4949245750904083, "learning_rate": 0.00017022623311558062, "loss": 0.4727, "step": 73810 }, { "epoch": 1.6430733618233617, "grad_norm": 0.9863992929458618, "learning_rate": 0.0001701801312042337, "loss": 0.6826, "step": 73820 }, { "epoch": 1.6432959401709402, "grad_norm": 0.45118269324302673, "learning_rate": 0.0001701340309133229, "loss": 0.5129, "step": 73830 }, { "epoch": 1.6435185185185186, "grad_norm": 0.5702125430107117, "learning_rate": 0.00017008793224535334, "loss": 0.5916, "step": 73840 }, { "epoch": 1.6437410968660968, "grad_norm": 0.4908457100391388, "learning_rate": 0.00017004183520282995, "loss": 0.5519, "step": 73850 }, { "epoch": 1.6439636752136753, "grad_norm": 0.6474966406822205, "learning_rate": 0.0001699957397882578, "loss": 0.5739, "step": 73860 }, { "epoch": 1.6441862535612537, "grad_norm": 0.5858306288719177, "learning_rate": 0.0001699496460041417, "loss": 0.5725, "step": 73870 }, { "epoch": 1.644408831908832, "grad_norm": 0.655928373336792, "learning_rate": 0.00016990355385298648, "loss": 0.6545, "step": 73880 }, { "epoch": 1.6446314102564101, "grad_norm": 0.4970945417881012, "learning_rate": 0.00016985746333729674, "loss": 0.3836, "step": 73890 }, { "epoch": 1.6448539886039886, "grad_norm": 0.5481718182563782, "learning_rate": 0.0001698113744595772, "loss": 0.4997, "step": 73900 }, { "epoch": 1.645076566951567, "grad_norm": 0.7615852355957031, "learning_rate": 0.0001697652872223322, "loss": 0.6299, "step": 73910 }, { "epoch": 1.6452991452991452, "grad_norm": 0.6412912011146545, "learning_rate": 0.0001697192016280663, "loss": 0.7039, "step": 73920 }, { "epoch": 1.6455217236467237, "grad_norm": 0.4065113663673401, "learning_rate": 0.0001696731176792837, "loss": 0.4645, "step": 73930 }, { "epoch": 1.645744301994302, "grad_norm": 0.6548492908477783, "learning_rate": 0.00016962703537848873, "loss": 0.5546, "step": 73940 }, { "epoch": 1.6459668803418803, "grad_norm": 0.7567201852798462, "learning_rate": 0.00016958095472818557, "loss": 0.6382, "step": 73950 }, { "epoch": 1.6461894586894585, "grad_norm": 0.9115142226219177, "learning_rate": 0.00016953487573087821, "loss": 0.555, "step": 73960 }, { "epoch": 1.6464120370370372, "grad_norm": 0.5288968682289124, "learning_rate": 0.00016948879838907062, "loss": 0.52, "step": 73970 }, { "epoch": 1.6466346153846154, "grad_norm": 0.3623640835285187, "learning_rate": 0.00016944272270526673, "loss": 0.4835, "step": 73980 }, { "epoch": 1.6468571937321936, "grad_norm": 0.5390410423278809, "learning_rate": 0.0001693966486819703, "loss": 0.6114, "step": 73990 }, { "epoch": 1.647079772079772, "grad_norm": 0.5235438942909241, "learning_rate": 0.00016935057632168498, "loss": 0.5105, "step": 74000 }, { "epoch": 1.6473023504273505, "grad_norm": 0.36156705021858215, "learning_rate": 0.0001693045056269145, "loss": 0.5099, "step": 74010 }, { "epoch": 1.6475249287749287, "grad_norm": 0.601723313331604, "learning_rate": 0.00016925843660016229, "loss": 0.5058, "step": 74020 }, { "epoch": 1.6477475071225072, "grad_norm": 0.5614992380142212, "learning_rate": 0.0001692123692439319, "loss": 0.6299, "step": 74030 }, { "epoch": 1.6479700854700856, "grad_norm": 0.42595812678337097, "learning_rate": 0.00016916630356072644, "loss": 0.5607, "step": 74040 }, { "epoch": 1.6481926638176638, "grad_norm": 0.5589383840560913, "learning_rate": 0.0001691202395530493, "loss": 0.542, "step": 74050 }, { "epoch": 1.648415242165242, "grad_norm": 0.7026439905166626, "learning_rate": 0.00016907417722340366, "loss": 0.535, "step": 74060 }, { "epoch": 1.6486378205128205, "grad_norm": 0.38493791222572327, "learning_rate": 0.0001690281165742925, "loss": 0.4701, "step": 74070 }, { "epoch": 1.648860398860399, "grad_norm": 0.5316752791404724, "learning_rate": 0.00016898205760821887, "loss": 0.5313, "step": 74080 }, { "epoch": 1.6490829772079771, "grad_norm": 0.6451575756072998, "learning_rate": 0.00016893600032768564, "loss": 0.5871, "step": 74090 }, { "epoch": 1.6493055555555556, "grad_norm": 0.3266368806362152, "learning_rate": 0.00016888994473519554, "loss": 0.6027, "step": 74100 }, { "epoch": 1.649528133903134, "grad_norm": 0.9386067986488342, "learning_rate": 0.0001688438908332513, "loss": 0.5026, "step": 74110 }, { "epoch": 1.6497507122507122, "grad_norm": 0.418160617351532, "learning_rate": 0.0001687978386243556, "loss": 0.6328, "step": 74120 }, { "epoch": 1.6499732905982905, "grad_norm": 0.5406116843223572, "learning_rate": 0.00016875178811101086, "loss": 0.5237, "step": 74130 }, { "epoch": 1.650195868945869, "grad_norm": 0.5164955258369446, "learning_rate": 0.00016870573929571954, "loss": 0.5915, "step": 74140 }, { "epoch": 1.6504184472934473, "grad_norm": 0.401613712310791, "learning_rate": 0.00016865969218098398, "loss": 0.6145, "step": 74150 }, { "epoch": 1.6506410256410255, "grad_norm": 0.5824975371360779, "learning_rate": 0.00016861364676930643, "loss": 0.5881, "step": 74160 }, { "epoch": 1.650863603988604, "grad_norm": 0.596259593963623, "learning_rate": 0.00016856760306318896, "loss": 0.6278, "step": 74170 }, { "epoch": 1.6510861823361824, "grad_norm": 0.698535144329071, "learning_rate": 0.00016852156106513368, "loss": 0.5502, "step": 74180 }, { "epoch": 1.6513087606837606, "grad_norm": 0.6913437843322754, "learning_rate": 0.00016847552077764252, "loss": 0.5445, "step": 74190 }, { "epoch": 1.651531339031339, "grad_norm": 0.3660818040370941, "learning_rate": 0.0001684294822032174, "loss": 0.5501, "step": 74200 }, { "epoch": 1.6517539173789175, "grad_norm": 0.6315298676490784, "learning_rate": 0.00016838344534436003, "loss": 0.5986, "step": 74210 }, { "epoch": 1.6519764957264957, "grad_norm": 0.3987181782722473, "learning_rate": 0.00016833741020357213, "loss": 0.5797, "step": 74220 }, { "epoch": 1.652199074074074, "grad_norm": 0.49659889936447144, "learning_rate": 0.00016829137678335532, "loss": 0.4949, "step": 74230 }, { "epoch": 1.6524216524216524, "grad_norm": 0.5157960057258606, "learning_rate": 0.00016824534508621102, "loss": 0.6625, "step": 74240 }, { "epoch": 1.6526442307692308, "grad_norm": 0.5533856153488159, "learning_rate": 0.00016819931511464063, "loss": 0.5254, "step": 74250 }, { "epoch": 1.652866809116809, "grad_norm": 0.4741371273994446, "learning_rate": 0.00016815328687114556, "loss": 0.5132, "step": 74260 }, { "epoch": 1.6530893874643875, "grad_norm": 0.4546084403991699, "learning_rate": 0.00016810726035822692, "loss": 0.4191, "step": 74270 }, { "epoch": 1.653311965811966, "grad_norm": 0.5832876563072205, "learning_rate": 0.00016806123557838582, "loss": 0.5897, "step": 74280 }, { "epoch": 1.6535345441595442, "grad_norm": 0.3638170063495636, "learning_rate": 0.00016801521253412345, "loss": 0.4935, "step": 74290 }, { "epoch": 1.6537571225071224, "grad_norm": 0.5617278218269348, "learning_rate": 0.0001679691912279405, "loss": 0.6729, "step": 74300 }, { "epoch": 1.6539797008547008, "grad_norm": 0.5840321779251099, "learning_rate": 0.000167923171662338, "loss": 0.6152, "step": 74310 }, { "epoch": 1.6542022792022792, "grad_norm": 0.7067362070083618, "learning_rate": 0.00016787715383981652, "loss": 0.5307, "step": 74320 }, { "epoch": 1.6544248575498575, "grad_norm": 0.7421824336051941, "learning_rate": 0.00016783113776287683, "loss": 0.6672, "step": 74330 }, { "epoch": 1.654647435897436, "grad_norm": 0.7069873213768005, "learning_rate": 0.00016778512343401948, "loss": 0.553, "step": 74340 }, { "epoch": 1.6548700142450143, "grad_norm": 0.5692541003227234, "learning_rate": 0.00016773911085574486, "loss": 0.6681, "step": 74350 }, { "epoch": 1.6550925925925926, "grad_norm": 0.5314786434173584, "learning_rate": 0.00016769310003055338, "loss": 0.4982, "step": 74360 }, { "epoch": 1.6553151709401708, "grad_norm": 0.5425453186035156, "learning_rate": 0.00016764709096094528, "loss": 0.5796, "step": 74370 }, { "epoch": 1.6555377492877494, "grad_norm": 0.6942946314811707, "learning_rate": 0.00016760108364942075, "loss": 0.5619, "step": 74380 }, { "epoch": 1.6557603276353277, "grad_norm": 0.48159825801849365, "learning_rate": 0.00016755507809847986, "loss": 0.4793, "step": 74390 }, { "epoch": 1.6559829059829059, "grad_norm": 0.4759584963321686, "learning_rate": 0.00016750907431062258, "loss": 0.4454, "step": 74400 }, { "epoch": 1.6562054843304843, "grad_norm": 0.6936882138252258, "learning_rate": 0.0001674630722883488, "loss": 0.5632, "step": 74410 }, { "epoch": 1.6564280626780628, "grad_norm": 0.812346875667572, "learning_rate": 0.0001674170720341584, "loss": 0.5834, "step": 74420 }, { "epoch": 1.656650641025641, "grad_norm": 0.8212725520133972, "learning_rate": 0.00016737107355055088, "loss": 0.5352, "step": 74430 }, { "epoch": 1.6568732193732194, "grad_norm": 0.7028321623802185, "learning_rate": 0.00016732507684002595, "loss": 0.4907, "step": 74440 }, { "epoch": 1.6570957977207978, "grad_norm": 0.4733540415763855, "learning_rate": 0.0001672790819050831, "loss": 0.4795, "step": 74450 }, { "epoch": 1.657318376068376, "grad_norm": 0.6220640540122986, "learning_rate": 0.0001672330887482217, "loss": 0.5511, "step": 74460 }, { "epoch": 1.6575409544159543, "grad_norm": 0.5480875372886658, "learning_rate": 0.00016718709737194106, "loss": 0.4772, "step": 74470 }, { "epoch": 1.6577635327635327, "grad_norm": 0.6275351643562317, "learning_rate": 0.0001671411077787405, "loss": 0.445, "step": 74480 }, { "epoch": 1.6579861111111112, "grad_norm": 0.5668492317199707, "learning_rate": 0.00016709511997111898, "loss": 0.5913, "step": 74490 }, { "epoch": 1.6582086894586894, "grad_norm": 0.9715355634689331, "learning_rate": 0.00016704913395157559, "loss": 0.5269, "step": 74500 }, { "epoch": 1.6584312678062678, "grad_norm": 0.36021795868873596, "learning_rate": 0.00016700314972260924, "loss": 0.4903, "step": 74510 }, { "epoch": 1.6586538461538463, "grad_norm": 0.6244007349014282, "learning_rate": 0.00016695716728671873, "loss": 0.5046, "step": 74520 }, { "epoch": 1.6588764245014245, "grad_norm": 0.5061987638473511, "learning_rate": 0.0001669111866464028, "loss": 0.5313, "step": 74530 }, { "epoch": 1.6590990028490027, "grad_norm": 0.6868669390678406, "learning_rate": 0.00016686520780416012, "loss": 0.679, "step": 74540 }, { "epoch": 1.6593215811965814, "grad_norm": 0.5997483730316162, "learning_rate": 0.00016681923076248913, "loss": 0.5782, "step": 74550 }, { "epoch": 1.6595441595441596, "grad_norm": 0.3696061968803406, "learning_rate": 0.0001667732555238884, "loss": 0.4457, "step": 74560 }, { "epoch": 1.6597667378917378, "grad_norm": 0.4042704403400421, "learning_rate": 0.00016672728209085605, "loss": 0.4871, "step": 74570 }, { "epoch": 1.6599893162393162, "grad_norm": 0.5580457448959351, "learning_rate": 0.00016668131046589045, "loss": 0.4977, "step": 74580 }, { "epoch": 1.6602118945868947, "grad_norm": 0.617868185043335, "learning_rate": 0.00016663534065148973, "loss": 0.6508, "step": 74590 }, { "epoch": 1.6604344729344729, "grad_norm": 0.458039790391922, "learning_rate": 0.00016658937265015192, "loss": 0.5306, "step": 74600 }, { "epoch": 1.6606570512820513, "grad_norm": 0.8516250848770142, "learning_rate": 0.0001665434064643749, "loss": 0.5773, "step": 74610 }, { "epoch": 1.6608796296296298, "grad_norm": 0.6809437870979309, "learning_rate": 0.00016649744209665663, "loss": 0.5101, "step": 74620 }, { "epoch": 1.661102207977208, "grad_norm": 0.4385990798473358, "learning_rate": 0.00016645147954949473, "loss": 0.5177, "step": 74630 }, { "epoch": 1.6613247863247862, "grad_norm": 0.38819900155067444, "learning_rate": 0.00016640551882538693, "loss": 0.4473, "step": 74640 }, { "epoch": 1.6615473646723646, "grad_norm": 0.5712143778800964, "learning_rate": 0.00016635955992683074, "loss": 0.4045, "step": 74650 }, { "epoch": 1.661769943019943, "grad_norm": 0.4798211455345154, "learning_rate": 0.00016631360285632358, "loss": 0.5643, "step": 74660 }, { "epoch": 1.6619925213675213, "grad_norm": 0.5165737271308899, "learning_rate": 0.0001662676476163628, "loss": 0.6715, "step": 74670 }, { "epoch": 1.6622150997150997, "grad_norm": 0.6299379467964172, "learning_rate": 0.00016622169420944574, "loss": 0.5062, "step": 74680 }, { "epoch": 1.6624376780626782, "grad_norm": 0.5905649662017822, "learning_rate": 0.0001661757426380695, "loss": 0.5199, "step": 74690 }, { "epoch": 1.6626602564102564, "grad_norm": 0.5792425274848938, "learning_rate": 0.000166129792904731, "loss": 0.6112, "step": 74700 }, { "epoch": 1.6628828347578346, "grad_norm": 0.6446602940559387, "learning_rate": 0.00016608384501192727, "loss": 0.5768, "step": 74710 }, { "epoch": 1.6631054131054133, "grad_norm": 0.4840184450149536, "learning_rate": 0.00016603789896215516, "loss": 0.4455, "step": 74720 }, { "epoch": 1.6633279914529915, "grad_norm": 0.7535210251808167, "learning_rate": 0.0001659919547579115, "loss": 0.6043, "step": 74730 }, { "epoch": 1.6635505698005697, "grad_norm": 0.552081823348999, "learning_rate": 0.00016594601240169278, "loss": 0.447, "step": 74740 }, { "epoch": 1.6637731481481481, "grad_norm": 0.5401560664176941, "learning_rate": 0.00016590007189599566, "loss": 0.5122, "step": 74750 }, { "epoch": 1.6639957264957266, "grad_norm": 0.6353166699409485, "learning_rate": 0.00016585413324331658, "loss": 0.4692, "step": 74760 }, { "epoch": 1.6642183048433048, "grad_norm": 0.6277305483818054, "learning_rate": 0.0001658081964461518, "loss": 0.4863, "step": 74770 }, { "epoch": 1.6644408831908832, "grad_norm": 0.4086846113204956, "learning_rate": 0.00016576226150699763, "loss": 0.4235, "step": 74780 }, { "epoch": 1.6646634615384617, "grad_norm": 0.574285626411438, "learning_rate": 0.00016571632842835024, "loss": 0.6429, "step": 74790 }, { "epoch": 1.66488603988604, "grad_norm": 0.39552202820777893, "learning_rate": 0.00016567039721270558, "loss": 0.4631, "step": 74800 }, { "epoch": 1.665108618233618, "grad_norm": 0.5707929730415344, "learning_rate": 0.0001656244678625596, "loss": 0.5358, "step": 74810 }, { "epoch": 1.6653311965811965, "grad_norm": 0.5050995945930481, "learning_rate": 0.00016557854038040833, "loss": 0.4214, "step": 74820 }, { "epoch": 1.665553774928775, "grad_norm": 0.49576276540756226, "learning_rate": 0.00016553261476874728, "loss": 0.6185, "step": 74830 }, { "epoch": 1.6657763532763532, "grad_norm": 0.5018727779388428, "learning_rate": 0.0001654866910300721, "loss": 0.4222, "step": 74840 }, { "epoch": 1.6659989316239316, "grad_norm": 0.8848727941513062, "learning_rate": 0.0001654407691668784, "loss": 0.6535, "step": 74850 }, { "epoch": 1.66622150997151, "grad_norm": 0.5684048533439636, "learning_rate": 0.00016539484918166167, "loss": 0.6015, "step": 74860 }, { "epoch": 1.6664440883190883, "grad_norm": 0.7550521492958069, "learning_rate": 0.00016534893107691707, "loss": 0.607, "step": 74870 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5899578928947449, "learning_rate": 0.00016530301485513996, "loss": 0.5685, "step": 74880 }, { "epoch": 1.6668892450142452, "grad_norm": 0.7316328883171082, "learning_rate": 0.0001652571005188254, "loss": 0.5741, "step": 74890 }, { "epoch": 1.6671118233618234, "grad_norm": 0.5651301741600037, "learning_rate": 0.0001652111880704685, "loss": 0.5463, "step": 74900 }, { "epoch": 1.6673344017094016, "grad_norm": 0.5351511240005493, "learning_rate": 0.00016516527751256406, "loss": 0.4856, "step": 74910 }, { "epoch": 1.66755698005698, "grad_norm": 0.5877324938774109, "learning_rate": 0.00016511936884760697, "loss": 0.5068, "step": 74920 }, { "epoch": 1.6677795584045585, "grad_norm": 0.40048742294311523, "learning_rate": 0.00016507346207809195, "loss": 0.5428, "step": 74930 }, { "epoch": 1.6680021367521367, "grad_norm": 0.34698426723480225, "learning_rate": 0.00016502755720651359, "loss": 0.5051, "step": 74940 }, { "epoch": 1.6682247150997151, "grad_norm": 0.6042134761810303, "learning_rate": 0.00016498165423536645, "loss": 0.5602, "step": 74950 }, { "epoch": 1.6684472934472936, "grad_norm": 0.4832810163497925, "learning_rate": 0.00016493575316714484, "loss": 0.4747, "step": 74960 }, { "epoch": 1.6686698717948718, "grad_norm": 0.5508270263671875, "learning_rate": 0.00016488985400434314, "loss": 0.5238, "step": 74970 }, { "epoch": 1.66889245014245, "grad_norm": 0.4627018868923187, "learning_rate": 0.00016484395674945545, "loss": 0.6073, "step": 74980 }, { "epoch": 1.6691150284900285, "grad_norm": 0.6229496002197266, "learning_rate": 0.00016479806140497593, "loss": 0.6461, "step": 74990 }, { "epoch": 1.669337606837607, "grad_norm": 0.9775029420852661, "learning_rate": 0.00016475216797339864, "loss": 0.6489, "step": 75000 }, { "epoch": 1.6695601851851851, "grad_norm": 0.38404151797294617, "learning_rate": 0.0001647062764572173, "loss": 0.5041, "step": 75010 }, { "epoch": 1.6697827635327636, "grad_norm": 0.5900865793228149, "learning_rate": 0.00016466038685892587, "loss": 0.5822, "step": 75020 }, { "epoch": 1.670005341880342, "grad_norm": 0.5986582636833191, "learning_rate": 0.0001646144991810179, "loss": 0.4817, "step": 75030 }, { "epoch": 1.6702279202279202, "grad_norm": 0.6265101432800293, "learning_rate": 0.0001645686134259871, "loss": 0.588, "step": 75040 }, { "epoch": 1.6704504985754984, "grad_norm": 0.46558502316474915, "learning_rate": 0.00016452272959632675, "loss": 0.4984, "step": 75050 }, { "epoch": 1.6706730769230769, "grad_norm": 0.33297625184059143, "learning_rate": 0.00016447684769453034, "loss": 0.5982, "step": 75060 }, { "epoch": 1.6708956552706553, "grad_norm": 0.6559587121009827, "learning_rate": 0.00016443096772309114, "loss": 0.5793, "step": 75070 }, { "epoch": 1.6711182336182335, "grad_norm": 0.43765076994895935, "learning_rate": 0.00016438508968450233, "loss": 0.464, "step": 75080 }, { "epoch": 1.671340811965812, "grad_norm": 0.46449562907218933, "learning_rate": 0.00016433921358125682, "loss": 0.5096, "step": 75090 }, { "epoch": 1.6715633903133904, "grad_norm": 0.4868076741695404, "learning_rate": 0.00016429333941584765, "loss": 0.4879, "step": 75100 }, { "epoch": 1.6717859686609686, "grad_norm": 0.5010735392570496, "learning_rate": 0.00016424746719076764, "loss": 0.4999, "step": 75110 }, { "epoch": 1.672008547008547, "grad_norm": 0.5864977240562439, "learning_rate": 0.0001642015969085095, "loss": 0.6716, "step": 75120 }, { "epoch": 1.6722311253561255, "grad_norm": 0.4027913808822632, "learning_rate": 0.00016415572857156593, "loss": 0.5468, "step": 75130 }, { "epoch": 1.6724537037037037, "grad_norm": 0.789313793182373, "learning_rate": 0.00016410986218242944, "loss": 0.5943, "step": 75140 }, { "epoch": 1.672676282051282, "grad_norm": 0.6468246579170227, "learning_rate": 0.00016406399774359235, "loss": 0.5485, "step": 75150 }, { "epoch": 1.6728988603988604, "grad_norm": 0.46155524253845215, "learning_rate": 0.00016401813525754707, "loss": 0.4953, "step": 75160 }, { "epoch": 1.6731214387464388, "grad_norm": 0.907846212387085, "learning_rate": 0.00016397227472678578, "loss": 0.5333, "step": 75170 }, { "epoch": 1.673344017094017, "grad_norm": 0.44893550872802734, "learning_rate": 0.0001639264161538006, "loss": 0.5691, "step": 75180 }, { "epoch": 1.6735665954415955, "grad_norm": 0.6367380619049072, "learning_rate": 0.00016388055954108345, "loss": 0.6064, "step": 75190 }, { "epoch": 1.673789173789174, "grad_norm": 0.6882296204566956, "learning_rate": 0.00016383470489112628, "loss": 0.6168, "step": 75200 }, { "epoch": 1.6740117521367521, "grad_norm": 0.6169039607048035, "learning_rate": 0.00016378885220642093, "loss": 0.5464, "step": 75210 }, { "epoch": 1.6742343304843303, "grad_norm": 0.36328089237213135, "learning_rate": 0.00016374300148945897, "loss": 0.6335, "step": 75220 }, { "epoch": 1.6744569088319088, "grad_norm": 0.5436282753944397, "learning_rate": 0.00016369715274273193, "loss": 0.5793, "step": 75230 }, { "epoch": 1.6746794871794872, "grad_norm": 0.43563759326934814, "learning_rate": 0.00016365130596873132, "loss": 0.4787, "step": 75240 }, { "epoch": 1.6749020655270654, "grad_norm": 0.603461742401123, "learning_rate": 0.00016360546116994855, "loss": 0.6215, "step": 75250 }, { "epoch": 1.6751246438746439, "grad_norm": 0.5234988927841187, "learning_rate": 0.00016355961834887479, "loss": 0.5816, "step": 75260 }, { "epoch": 1.6753472222222223, "grad_norm": 0.536532461643219, "learning_rate": 0.00016351377750800116, "loss": 0.5119, "step": 75270 }, { "epoch": 1.6755698005698005, "grad_norm": 0.5434236526489258, "learning_rate": 0.0001634679386498188, "loss": 0.6079, "step": 75280 }, { "epoch": 1.6757923789173788, "grad_norm": 0.48780888319015503, "learning_rate": 0.0001634221017768185, "loss": 0.5606, "step": 75290 }, { "epoch": 1.6760149572649574, "grad_norm": 0.6546382308006287, "learning_rate": 0.00016337626689149118, "loss": 0.6589, "step": 75300 }, { "epoch": 1.6762375356125356, "grad_norm": 0.37422406673431396, "learning_rate": 0.0001633304339963275, "loss": 0.4647, "step": 75310 }, { "epoch": 1.6764601139601139, "grad_norm": 0.7309848666191101, "learning_rate": 0.000163284603093818, "loss": 0.5384, "step": 75320 }, { "epoch": 1.6766826923076923, "grad_norm": 0.4979402422904968, "learning_rate": 0.00016323877418645327, "loss": 0.5439, "step": 75330 }, { "epoch": 1.6769052706552707, "grad_norm": 0.48228442668914795, "learning_rate": 0.0001631929472767237, "loss": 0.642, "step": 75340 }, { "epoch": 1.677127849002849, "grad_norm": 0.7119601368904114, "learning_rate": 0.00016314712236711944, "loss": 0.5524, "step": 75350 }, { "epoch": 1.6773504273504274, "grad_norm": 0.8681598901748657, "learning_rate": 0.00016310129946013073, "loss": 0.7036, "step": 75360 }, { "epoch": 1.6775730056980058, "grad_norm": 0.754706859588623, "learning_rate": 0.0001630554785582476, "loss": 0.5865, "step": 75370 }, { "epoch": 1.677795584045584, "grad_norm": 0.41640523076057434, "learning_rate": 0.00016300965966396, "loss": 0.8015, "step": 75380 }, { "epoch": 1.6780181623931623, "grad_norm": 0.6275506615638733, "learning_rate": 0.00016296384277975784, "loss": 0.5445, "step": 75390 }, { "epoch": 1.6782407407407407, "grad_norm": 0.5284397602081299, "learning_rate": 0.0001629180279081307, "loss": 0.6416, "step": 75400 }, { "epoch": 1.6784633190883191, "grad_norm": 0.42982757091522217, "learning_rate": 0.00016287221505156832, "loss": 0.6049, "step": 75410 }, { "epoch": 1.6786858974358974, "grad_norm": 0.4871768355369568, "learning_rate": 0.00016282640421256018, "loss": 0.4916, "step": 75420 }, { "epoch": 1.6789084757834758, "grad_norm": 0.6214765310287476, "learning_rate": 0.00016278059539359563, "loss": 0.4929, "step": 75430 }, { "epoch": 1.6791310541310542, "grad_norm": 0.7499983310699463, "learning_rate": 0.000162734788597164, "loss": 0.5846, "step": 75440 }, { "epoch": 1.6793536324786325, "grad_norm": 0.6454204320907593, "learning_rate": 0.0001626889838257545, "loss": 0.532, "step": 75450 }, { "epoch": 1.6795762108262107, "grad_norm": 0.7106652855873108, "learning_rate": 0.00016264318108185616, "loss": 0.476, "step": 75460 }, { "epoch": 1.6797987891737893, "grad_norm": 0.6261777281761169, "learning_rate": 0.00016259738036795797, "loss": 0.6637, "step": 75470 }, { "epoch": 1.6800213675213675, "grad_norm": 0.4820646643638611, "learning_rate": 0.00016255158168654868, "loss": 0.5294, "step": 75480 }, { "epoch": 1.6801994301994303, "eval_loss": 0.5559062361717224, "eval_runtime": 337.5129, "eval_samples_per_second": 7.007, "eval_steps_per_second": 7.007, "step": 75488 }, { "epoch": 1.6802439458689458, "grad_norm": 0.5208863615989685, "learning_rate": 0.00016250578504011713, "loss": 0.4434, "step": 75490 }, { "epoch": 1.6804665242165242, "grad_norm": 0.3657657504081726, "learning_rate": 0.0001624599904311519, "loss": 0.4016, "step": 75500 }, { "epoch": 1.6806891025641026, "grad_norm": 0.837756335735321, "learning_rate": 0.00016241419786214154, "loss": 0.5181, "step": 75510 }, { "epoch": 1.6809116809116809, "grad_norm": 0.5618343353271484, "learning_rate": 0.00016236840733557442, "loss": 0.6338, "step": 75520 }, { "epoch": 1.6811342592592593, "grad_norm": 0.44379135966300964, "learning_rate": 0.00016232261885393886, "loss": 0.573, "step": 75530 }, { "epoch": 1.6813568376068377, "grad_norm": 0.4937328100204468, "learning_rate": 0.000162276832419723, "loss": 0.4523, "step": 75540 }, { "epoch": 1.681579415954416, "grad_norm": 0.6280141472816467, "learning_rate": 0.00016223104803541501, "loss": 0.5038, "step": 75550 }, { "epoch": 1.6818019943019942, "grad_norm": 0.43941378593444824, "learning_rate": 0.00016218526570350279, "loss": 0.638, "step": 75560 }, { "epoch": 1.6820245726495726, "grad_norm": 0.5391653776168823, "learning_rate": 0.00016213948542647414, "loss": 0.5817, "step": 75570 }, { "epoch": 1.682247150997151, "grad_norm": 0.3680667579174042, "learning_rate": 0.00016209370720681685, "loss": 0.5353, "step": 75580 }, { "epoch": 1.6824697293447293, "grad_norm": 0.386249303817749, "learning_rate": 0.00016204793104701858, "loss": 0.4681, "step": 75590 }, { "epoch": 1.6826923076923077, "grad_norm": 0.4776840806007385, "learning_rate": 0.0001620021569495668, "loss": 0.5674, "step": 75600 }, { "epoch": 1.6829148860398861, "grad_norm": 0.6500211358070374, "learning_rate": 0.00016195638491694892, "loss": 0.4516, "step": 75610 }, { "epoch": 1.6831374643874644, "grad_norm": 0.4960470199584961, "learning_rate": 0.0001619106149516522, "loss": 0.5763, "step": 75620 }, { "epoch": 1.6833600427350426, "grad_norm": 0.4113209545612335, "learning_rate": 0.00016186484705616385, "loss": 0.4562, "step": 75630 }, { "epoch": 1.6835826210826212, "grad_norm": 0.5733873844146729, "learning_rate": 0.00016181908123297096, "loss": 0.5481, "step": 75640 }, { "epoch": 1.6838051994301995, "grad_norm": 0.592670738697052, "learning_rate": 0.0001617733174845604, "loss": 0.5587, "step": 75650 }, { "epoch": 1.6840277777777777, "grad_norm": 0.5716107487678528, "learning_rate": 0.00016172755581341907, "loss": 0.5366, "step": 75660 }, { "epoch": 1.6842503561253561, "grad_norm": 0.635908305644989, "learning_rate": 0.00016168179622203375, "loss": 0.4558, "step": 75670 }, { "epoch": 1.6844729344729346, "grad_norm": 0.29668745398521423, "learning_rate": 0.00016163603871289093, "loss": 0.5495, "step": 75680 }, { "epoch": 1.6846955128205128, "grad_norm": 0.34001338481903076, "learning_rate": 0.00016159028328847715, "loss": 0.4745, "step": 75690 }, { "epoch": 1.6849180911680912, "grad_norm": 1.0909297466278076, "learning_rate": 0.00016154452995127892, "loss": 0.6351, "step": 75700 }, { "epoch": 1.6851406695156697, "grad_norm": 0.8510547280311584, "learning_rate": 0.00016149877870378233, "loss": 0.578, "step": 75710 }, { "epoch": 1.6853632478632479, "grad_norm": 0.5867220163345337, "learning_rate": 0.00016145302954847363, "loss": 0.5132, "step": 75720 }, { "epoch": 1.685585826210826, "grad_norm": 0.5249475836753845, "learning_rate": 0.0001614072824878389, "loss": 0.6319, "step": 75730 }, { "epoch": 1.6858084045584045, "grad_norm": 0.6304720640182495, "learning_rate": 0.00016136153752436405, "loss": 0.5996, "step": 75740 }, { "epoch": 1.686030982905983, "grad_norm": 0.4601406753063202, "learning_rate": 0.00016131579466053487, "loss": 0.4823, "step": 75750 }, { "epoch": 1.6862535612535612, "grad_norm": 0.3753646910190582, "learning_rate": 0.00016127005389883703, "loss": 0.6404, "step": 75760 }, { "epoch": 1.6864761396011396, "grad_norm": 0.5258237719535828, "learning_rate": 0.00016122431524175616, "loss": 0.6586, "step": 75770 }, { "epoch": 1.686698717948718, "grad_norm": 0.4938367009162903, "learning_rate": 0.00016117857869177781, "loss": 0.5113, "step": 75780 }, { "epoch": 1.6869212962962963, "grad_norm": 0.674801230430603, "learning_rate": 0.00016113284425138723, "loss": 0.4846, "step": 75790 }, { "epoch": 1.6871438746438745, "grad_norm": 0.606334924697876, "learning_rate": 0.0001610871119230697, "loss": 0.4629, "step": 75800 }, { "epoch": 1.687366452991453, "grad_norm": 1.3602806329727173, "learning_rate": 0.00016104138170931038, "loss": 0.5608, "step": 75810 }, { "epoch": 1.6875890313390314, "grad_norm": 0.6526398062705994, "learning_rate": 0.00016099565361259426, "loss": 0.7189, "step": 75820 }, { "epoch": 1.6878116096866096, "grad_norm": 0.7846774458885193, "learning_rate": 0.00016094992763540625, "loss": 0.6107, "step": 75830 }, { "epoch": 1.688034188034188, "grad_norm": 0.6672149896621704, "learning_rate": 0.00016090420378023118, "loss": 0.4595, "step": 75840 }, { "epoch": 1.6882567663817665, "grad_norm": 1.1179325580596924, "learning_rate": 0.00016085848204955365, "loss": 0.4434, "step": 75850 }, { "epoch": 1.6884793447293447, "grad_norm": 0.6847007870674133, "learning_rate": 0.00016081276244585824, "loss": 0.6622, "step": 75860 }, { "epoch": 1.6887019230769231, "grad_norm": 0.596674382686615, "learning_rate": 0.00016076704497162948, "loss": 0.4638, "step": 75870 }, { "epoch": 1.6889245014245016, "grad_norm": 0.7307976484298706, "learning_rate": 0.00016072132962935155, "loss": 0.5899, "step": 75880 }, { "epoch": 1.6891470797720798, "grad_norm": 0.5637958645820618, "learning_rate": 0.00016067561642150871, "loss": 0.6062, "step": 75890 }, { "epoch": 1.689369658119658, "grad_norm": 0.528197169303894, "learning_rate": 0.00016062990535058505, "loss": 0.5993, "step": 75900 }, { "epoch": 1.6895922364672364, "grad_norm": 0.5327113270759583, "learning_rate": 0.0001605841964190646, "loss": 0.5893, "step": 75910 }, { "epoch": 1.6898148148148149, "grad_norm": 0.5732520818710327, "learning_rate": 0.00016053848962943118, "loss": 0.556, "step": 75920 }, { "epoch": 1.690037393162393, "grad_norm": 0.40062829852104187, "learning_rate": 0.00016049278498416852, "loss": 0.5818, "step": 75930 }, { "epoch": 1.6902599715099715, "grad_norm": 0.4862927198410034, "learning_rate": 0.00016044708248576028, "loss": 0.5753, "step": 75940 }, { "epoch": 1.69048254985755, "grad_norm": 0.5803338289260864, "learning_rate": 0.00016040138213668995, "loss": 0.6473, "step": 75950 }, { "epoch": 1.6907051282051282, "grad_norm": 0.47554415464401245, "learning_rate": 0.00016035568393944094, "loss": 0.4034, "step": 75960 }, { "epoch": 1.6909277065527064, "grad_norm": 0.5601959824562073, "learning_rate": 0.00016030998789649649, "loss": 0.6176, "step": 75970 }, { "epoch": 1.6911502849002849, "grad_norm": 0.5545170903205872, "learning_rate": 0.00016026429401033983, "loss": 0.4686, "step": 75980 }, { "epoch": 1.6913728632478633, "grad_norm": 0.45020928978919983, "learning_rate": 0.00016021860228345396, "loss": 0.4481, "step": 75990 }, { "epoch": 1.6915954415954415, "grad_norm": 0.35779350996017456, "learning_rate": 0.00016017291271832183, "loss": 0.4946, "step": 76000 }, { "epoch": 1.69181801994302, "grad_norm": 0.7461297512054443, "learning_rate": 0.0001601272253174262, "loss": 0.5929, "step": 76010 }, { "epoch": 1.6920405982905984, "grad_norm": 0.39829355478286743, "learning_rate": 0.00016008154008324976, "loss": 0.5274, "step": 76020 }, { "epoch": 1.6922631766381766, "grad_norm": 0.4320659935474396, "learning_rate": 0.00016003585701827516, "loss": 0.594, "step": 76030 }, { "epoch": 1.6924857549857548, "grad_norm": 0.6035652756690979, "learning_rate": 0.00015999017612498476, "loss": 0.4655, "step": 76040 }, { "epoch": 1.6927083333333335, "grad_norm": 0.4487076997756958, "learning_rate": 0.00015994449740586094, "loss": 0.5315, "step": 76050 }, { "epoch": 1.6929309116809117, "grad_norm": 0.5057926774024963, "learning_rate": 0.00015989882086338598, "loss": 0.5373, "step": 76060 }, { "epoch": 1.69315349002849, "grad_norm": 0.5225903987884521, "learning_rate": 0.00015985314650004186, "loss": 0.4748, "step": 76070 }, { "epoch": 1.6933760683760684, "grad_norm": 0.7291542291641235, "learning_rate": 0.00015980747431831063, "loss": 0.5516, "step": 76080 }, { "epoch": 1.6935986467236468, "grad_norm": 0.642415463924408, "learning_rate": 0.0001597618043206742, "loss": 0.442, "step": 76090 }, { "epoch": 1.693821225071225, "grad_norm": 0.45231255888938904, "learning_rate": 0.0001597161365096142, "loss": 0.5974, "step": 76100 }, { "epoch": 1.6940438034188035, "grad_norm": 0.7588939070701599, "learning_rate": 0.00015967047088761234, "loss": 0.6961, "step": 76110 }, { "epoch": 1.694266381766382, "grad_norm": 0.6613054871559143, "learning_rate": 0.00015962480745715016, "loss": 0.6047, "step": 76120 }, { "epoch": 1.69448896011396, "grad_norm": 0.7823227643966675, "learning_rate": 0.00015957914622070901, "loss": 0.5886, "step": 76130 }, { "epoch": 1.6947115384615383, "grad_norm": 0.7325944304466248, "learning_rate": 0.00015953348718077007, "loss": 0.6105, "step": 76140 }, { "epoch": 1.6949341168091168, "grad_norm": 0.5065510869026184, "learning_rate": 0.00015948783033981458, "loss": 0.4539, "step": 76150 }, { "epoch": 1.6951566951566952, "grad_norm": 0.5566450953483582, "learning_rate": 0.00015944217570032358, "loss": 0.6209, "step": 76160 }, { "epoch": 1.6953792735042734, "grad_norm": 0.5529851317405701, "learning_rate": 0.00015939652326477793, "loss": 0.5442, "step": 76170 }, { "epoch": 1.6956018518518519, "grad_norm": 0.6417269706726074, "learning_rate": 0.0001593508730356584, "loss": 0.5299, "step": 76180 }, { "epoch": 1.6958244301994303, "grad_norm": 0.5870069265365601, "learning_rate": 0.00015930522501544575, "loss": 0.5943, "step": 76190 }, { "epoch": 1.6960470085470085, "grad_norm": 0.8581626415252686, "learning_rate": 0.00015925957920662052, "loss": 0.5854, "step": 76200 }, { "epoch": 1.6962695868945867, "grad_norm": 0.5439938306808472, "learning_rate": 0.00015921393561166308, "loss": 0.4965, "step": 76210 }, { "epoch": 1.6964921652421654, "grad_norm": 0.6037572026252747, "learning_rate": 0.00015916829423305372, "loss": 0.4593, "step": 76220 }, { "epoch": 1.6967147435897436, "grad_norm": 0.40113234519958496, "learning_rate": 0.00015912265507327277, "loss": 0.3858, "step": 76230 }, { "epoch": 1.6969373219373218, "grad_norm": 0.29176080226898193, "learning_rate": 0.00015907701813480015, "loss": 0.4355, "step": 76240 }, { "epoch": 1.6971599002849003, "grad_norm": 0.5197509527206421, "learning_rate": 0.00015903138342011585, "loss": 0.5374, "step": 76250 }, { "epoch": 1.6973824786324787, "grad_norm": 0.6022807955741882, "learning_rate": 0.0001589857509316998, "loss": 0.6799, "step": 76260 }, { "epoch": 1.697605056980057, "grad_norm": 0.781623125076294, "learning_rate": 0.00015894012067203158, "loss": 0.514, "step": 76270 }, { "epoch": 1.6978276353276354, "grad_norm": 0.4291946291923523, "learning_rate": 0.00015889449264359077, "loss": 0.6303, "step": 76280 }, { "epoch": 1.6980502136752138, "grad_norm": 0.49008262157440186, "learning_rate": 0.0001588488668488569, "loss": 0.5911, "step": 76290 }, { "epoch": 1.698272792022792, "grad_norm": 0.6763221621513367, "learning_rate": 0.00015880324329030934, "loss": 0.5334, "step": 76300 }, { "epoch": 1.6984953703703702, "grad_norm": 0.4447299838066101, "learning_rate": 0.0001587576219704272, "loss": 0.4779, "step": 76310 }, { "epoch": 1.6987179487179487, "grad_norm": 0.8476657271385193, "learning_rate": 0.00015871200289168966, "loss": 0.491, "step": 76320 }, { "epoch": 1.6989405270655271, "grad_norm": 0.5236039161682129, "learning_rate": 0.00015866638605657567, "loss": 0.5248, "step": 76330 }, { "epoch": 1.6991631054131053, "grad_norm": 0.9949755072593689, "learning_rate": 0.00015862077146756414, "loss": 0.5585, "step": 76340 }, { "epoch": 1.6993856837606838, "grad_norm": 0.5810121297836304, "learning_rate": 0.00015857515912713372, "loss": 0.4416, "step": 76350 }, { "epoch": 1.6996082621082622, "grad_norm": 0.47213947772979736, "learning_rate": 0.0001585295490377631, "loss": 0.5593, "step": 76360 }, { "epoch": 1.6998308404558404, "grad_norm": 0.5209132432937622, "learning_rate": 0.00015848394120193072, "loss": 0.5851, "step": 76370 }, { "epoch": 1.7000534188034186, "grad_norm": 0.42960062623023987, "learning_rate": 0.00015843833562211496, "loss": 0.4306, "step": 76380 }, { "epoch": 1.7002759971509973, "grad_norm": 0.7299292683601379, "learning_rate": 0.00015839273230079415, "loss": 0.4564, "step": 76390 }, { "epoch": 1.7004985754985755, "grad_norm": 0.5951300263404846, "learning_rate": 0.00015834713124044622, "loss": 0.6025, "step": 76400 }, { "epoch": 1.7007211538461537, "grad_norm": 0.7156624794006348, "learning_rate": 0.00015830153244354933, "loss": 0.5785, "step": 76410 }, { "epoch": 1.7009437321937322, "grad_norm": 0.3453843295574188, "learning_rate": 0.00015825593591258126, "loss": 0.5587, "step": 76420 }, { "epoch": 1.7011663105413106, "grad_norm": 0.5409964323043823, "learning_rate": 0.0001582103416500198, "loss": 0.4189, "step": 76430 }, { "epoch": 1.7013888888888888, "grad_norm": 0.484016478061676, "learning_rate": 0.00015816474965834264, "loss": 0.5967, "step": 76440 }, { "epoch": 1.7016114672364673, "grad_norm": 0.6141249537467957, "learning_rate": 0.0001581191599400272, "loss": 0.5393, "step": 76450 }, { "epoch": 1.7018340455840457, "grad_norm": 0.36540478467941284, "learning_rate": 0.0001580735724975509, "loss": 0.5607, "step": 76460 }, { "epoch": 1.702056623931624, "grad_norm": 0.7449799180030823, "learning_rate": 0.00015802798733339094, "loss": 0.5529, "step": 76470 }, { "epoch": 1.7022792022792022, "grad_norm": 0.609114944934845, "learning_rate": 0.00015798240445002458, "loss": 0.5248, "step": 76480 }, { "epoch": 1.7025017806267806, "grad_norm": 0.5341273546218872, "learning_rate": 0.00015793682384992872, "loss": 0.4829, "step": 76490 }, { "epoch": 1.702724358974359, "grad_norm": 0.546101450920105, "learning_rate": 0.0001578912455355803, "loss": 0.563, "step": 76500 }, { "epoch": 1.7029469373219372, "grad_norm": 0.4574396014213562, "learning_rate": 0.00015784566950945608, "loss": 0.4483, "step": 76510 }, { "epoch": 1.7031695156695157, "grad_norm": 0.3588087260723114, "learning_rate": 0.00015780009577403276, "loss": 0.5059, "step": 76520 }, { "epoch": 1.7033920940170941, "grad_norm": 0.6983396410942078, "learning_rate": 0.0001577545243317867, "loss": 0.5204, "step": 76530 }, { "epoch": 1.7036146723646723, "grad_norm": 0.6309730410575867, "learning_rate": 0.00015770895518519434, "loss": 0.5282, "step": 76540 }, { "epoch": 1.7038372507122506, "grad_norm": 0.6660280227661133, "learning_rate": 0.00015766338833673205, "loss": 0.4507, "step": 76550 }, { "epoch": 1.7040598290598292, "grad_norm": 0.7245699167251587, "learning_rate": 0.00015761782378887585, "loss": 0.3934, "step": 76560 }, { "epoch": 1.7042824074074074, "grad_norm": 0.48217499256134033, "learning_rate": 0.00015757226154410175, "loss": 0.6225, "step": 76570 }, { "epoch": 1.7045049857549857, "grad_norm": 0.5108203887939453, "learning_rate": 0.0001575267016048858, "loss": 0.6156, "step": 76580 }, { "epoch": 1.704727564102564, "grad_norm": 0.4934636354446411, "learning_rate": 0.0001574811439737036, "loss": 0.4839, "step": 76590 }, { "epoch": 1.7049501424501425, "grad_norm": 0.4846111834049225, "learning_rate": 0.00015743558865303082, "loss": 0.5209, "step": 76600 }, { "epoch": 1.7051727207977208, "grad_norm": 0.8412664532661438, "learning_rate": 0.00015739003564534305, "loss": 0.6843, "step": 76610 }, { "epoch": 1.7053952991452992, "grad_norm": 0.5684376955032349, "learning_rate": 0.00015734448495311558, "loss": 0.4732, "step": 76620 }, { "epoch": 1.7056178774928776, "grad_norm": 2.491905689239502, "learning_rate": 0.00015729893657882374, "loss": 0.4859, "step": 76630 }, { "epoch": 1.7058404558404558, "grad_norm": 0.6867818236351013, "learning_rate": 0.00015725339052494262, "loss": 0.653, "step": 76640 }, { "epoch": 1.706063034188034, "grad_norm": 0.5544010996818542, "learning_rate": 0.00015720784679394733, "loss": 0.5295, "step": 76650 }, { "epoch": 1.7062856125356125, "grad_norm": 0.4254001975059509, "learning_rate": 0.00015716230538831264, "loss": 0.6488, "step": 76660 }, { "epoch": 1.706508190883191, "grad_norm": 0.7363816499710083, "learning_rate": 0.00015711676631051331, "loss": 0.5307, "step": 76670 }, { "epoch": 1.7067307692307692, "grad_norm": 0.7330361008644104, "learning_rate": 0.000157071229563024, "loss": 0.5654, "step": 76680 }, { "epoch": 1.7069533475783476, "grad_norm": 0.5299323201179504, "learning_rate": 0.00015702569514831926, "loss": 0.5194, "step": 76690 }, { "epoch": 1.707175925925926, "grad_norm": 0.5326554179191589, "learning_rate": 0.00015698016306887338, "loss": 0.6502, "step": 76700 }, { "epoch": 1.7073985042735043, "grad_norm": 0.5645008087158203, "learning_rate": 0.00015693463332716067, "loss": 0.5256, "step": 76710 }, { "epoch": 1.7076210826210825, "grad_norm": 0.574365496635437, "learning_rate": 0.0001568891059256553, "loss": 0.4167, "step": 76720 }, { "epoch": 1.707843660968661, "grad_norm": 0.759273886680603, "learning_rate": 0.0001568435808668311, "loss": 0.57, "step": 76730 }, { "epoch": 1.7080662393162394, "grad_norm": 0.6679409146308899, "learning_rate": 0.00015679805815316212, "loss": 0.5119, "step": 76740 }, { "epoch": 1.7082888176638176, "grad_norm": 0.4140261113643646, "learning_rate": 0.000156752537787122, "loss": 0.5053, "step": 76750 }, { "epoch": 1.708511396011396, "grad_norm": 0.6662968397140503, "learning_rate": 0.00015670701977118438, "loss": 0.4888, "step": 76760 }, { "epoch": 1.7087339743589745, "grad_norm": 0.47943583130836487, "learning_rate": 0.00015666150410782276, "loss": 0.4513, "step": 76770 }, { "epoch": 1.7089565527065527, "grad_norm": 0.6128782629966736, "learning_rate": 0.00015661599079951045, "loss": 0.5287, "step": 76780 }, { "epoch": 1.709179131054131, "grad_norm": 0.6315615773200989, "learning_rate": 0.00015657047984872082, "loss": 0.4315, "step": 76790 }, { "epoch": 1.7094017094017095, "grad_norm": 0.5509540438652039, "learning_rate": 0.0001565249712579268, "loss": 0.5383, "step": 76800 }, { "epoch": 1.7096242877492878, "grad_norm": 0.4812536835670471, "learning_rate": 0.00015647946502960142, "loss": 0.5343, "step": 76810 }, { "epoch": 1.709846866096866, "grad_norm": 0.6319760680198669, "learning_rate": 0.0001564339611662175, "loss": 0.4889, "step": 76820 }, { "epoch": 1.7100694444444444, "grad_norm": 0.7118447422981262, "learning_rate": 0.0001563884596702479, "loss": 0.5909, "step": 76830 }, { "epoch": 1.7102920227920229, "grad_norm": 0.6964597702026367, "learning_rate": 0.00015634296054416503, "loss": 0.4947, "step": 76840 }, { "epoch": 1.710514601139601, "grad_norm": 0.5916907787322998, "learning_rate": 0.0001562974637904414, "loss": 0.5048, "step": 76850 }, { "epoch": 1.7107371794871795, "grad_norm": 0.5401000380516052, "learning_rate": 0.00015625196941154943, "loss": 0.6055, "step": 76860 }, { "epoch": 1.710959757834758, "grad_norm": 0.6075836420059204, "learning_rate": 0.0001562064774099612, "loss": 0.5337, "step": 76870 }, { "epoch": 1.7111823361823362, "grad_norm": 0.6079210638999939, "learning_rate": 0.00015616098778814885, "loss": 0.5166, "step": 76880 }, { "epoch": 1.7114049145299144, "grad_norm": 0.48606622219085693, "learning_rate": 0.00015611550054858437, "loss": 0.5265, "step": 76890 }, { "epoch": 1.7116274928774928, "grad_norm": 0.6630349159240723, "learning_rate": 0.00015607001569373945, "loss": 0.5691, "step": 76900 }, { "epoch": 1.7118500712250713, "grad_norm": 0.5666025280952454, "learning_rate": 0.00015602453322608584, "loss": 0.5278, "step": 76910 }, { "epoch": 1.7120726495726495, "grad_norm": 0.6412245035171509, "learning_rate": 0.00015597905314809518, "loss": 0.5876, "step": 76920 }, { "epoch": 1.712295227920228, "grad_norm": 0.5317904353141785, "learning_rate": 0.00015593357546223873, "loss": 0.5432, "step": 76930 }, { "epoch": 1.7125178062678064, "grad_norm": 0.5867559909820557, "learning_rate": 0.00015588810017098793, "loss": 0.5655, "step": 76940 }, { "epoch": 1.7127403846153846, "grad_norm": 0.5823548436164856, "learning_rate": 0.00015584262727681377, "loss": 0.4988, "step": 76950 }, { "epoch": 1.7129629629629628, "grad_norm": 0.8102551102638245, "learning_rate": 0.00015579715678218744, "loss": 0.6317, "step": 76960 }, { "epoch": 1.7131855413105415, "grad_norm": 0.5633259415626526, "learning_rate": 0.00015575168868957984, "loss": 0.6443, "step": 76970 }, { "epoch": 1.7134081196581197, "grad_norm": 0.4515515863895416, "learning_rate": 0.00015570622300146165, "loss": 0.4327, "step": 76980 }, { "epoch": 1.713630698005698, "grad_norm": 0.5940386652946472, "learning_rate": 0.00015566075972030355, "loss": 0.5207, "step": 76990 }, { "epoch": 1.7138532763532763, "grad_norm": 0.8212336301803589, "learning_rate": 0.00015561529884857613, "loss": 0.5504, "step": 77000 }, { "epoch": 1.7140758547008548, "grad_norm": 0.6525058150291443, "learning_rate": 0.00015556984038874965, "loss": 0.4645, "step": 77010 }, { "epoch": 1.714298433048433, "grad_norm": 0.8459281325340271, "learning_rate": 0.00015552438434329445, "loss": 0.6111, "step": 77020 }, { "epoch": 1.7145210113960114, "grad_norm": 0.7160471677780151, "learning_rate": 0.0001554789307146806, "loss": 0.527, "step": 77030 }, { "epoch": 1.7147435897435899, "grad_norm": 0.557977020740509, "learning_rate": 0.0001554334795053781, "loss": 0.6238, "step": 77040 }, { "epoch": 1.714966168091168, "grad_norm": 0.45475080609321594, "learning_rate": 0.00015538803071785687, "loss": 0.5713, "step": 77050 }, { "epoch": 1.7151887464387463, "grad_norm": 0.795819103717804, "learning_rate": 0.00015534258435458652, "loss": 0.5447, "step": 77060 }, { "epoch": 1.7154113247863247, "grad_norm": 0.5533003211021423, "learning_rate": 0.00015529714041803664, "loss": 0.5369, "step": 77070 }, { "epoch": 1.7156339031339032, "grad_norm": 0.7124785780906677, "learning_rate": 0.0001552516989106768, "loss": 0.5782, "step": 77080 }, { "epoch": 1.7158564814814814, "grad_norm": 0.4880826771259308, "learning_rate": 0.00015520625983497628, "loss": 0.3975, "step": 77090 }, { "epoch": 1.7160790598290598, "grad_norm": 0.6201895475387573, "learning_rate": 0.0001551608231934042, "loss": 0.5766, "step": 77100 }, { "epoch": 1.7163016381766383, "grad_norm": 0.6869056820869446, "learning_rate": 0.0001551153889884298, "loss": 0.4937, "step": 77110 }, { "epoch": 1.7165242165242165, "grad_norm": 0.6806768774986267, "learning_rate": 0.00015506995722252184, "loss": 0.5843, "step": 77120 }, { "epoch": 1.7167467948717947, "grad_norm": 0.47910743951797485, "learning_rate": 0.00015502452789814918, "loss": 0.4628, "step": 77130 }, { "epoch": 1.7169693732193734, "grad_norm": 0.6489134430885315, "learning_rate": 0.00015497910101778056, "loss": 0.5903, "step": 77140 }, { "epoch": 1.7171919515669516, "grad_norm": 0.654880702495575, "learning_rate": 0.00015493367658388438, "loss": 0.6535, "step": 77150 }, { "epoch": 1.7174145299145298, "grad_norm": 0.4657239317893982, "learning_rate": 0.0001548882545989291, "loss": 0.536, "step": 77160 }, { "epoch": 1.7176371082621082, "grad_norm": 0.4619198739528656, "learning_rate": 0.00015484283506538303, "loss": 0.6001, "step": 77170 }, { "epoch": 1.7178596866096867, "grad_norm": 0.8681321144104004, "learning_rate": 0.0001547974179857143, "loss": 0.5092, "step": 77180 }, { "epoch": 1.718082264957265, "grad_norm": 0.408426433801651, "learning_rate": 0.00015475200336239088, "loss": 0.6088, "step": 77190 }, { "epoch": 1.7183048433048433, "grad_norm": 0.3688236474990845, "learning_rate": 0.0001547065911978806, "loss": 0.5477, "step": 77200 }, { "epoch": 1.7185274216524218, "grad_norm": 0.5153622627258301, "learning_rate": 0.0001546611814946512, "loss": 0.4702, "step": 77210 }, { "epoch": 1.71875, "grad_norm": 0.40111953020095825, "learning_rate": 0.00015461577425517038, "loss": 0.4189, "step": 77220 }, { "epoch": 1.7189725783475782, "grad_norm": 0.7101040482521057, "learning_rate": 0.0001545703694819055, "loss": 0.5583, "step": 77230 }, { "epoch": 1.7191951566951567, "grad_norm": 0.6449742317199707, "learning_rate": 0.00015452496717732392, "loss": 0.5873, "step": 77240 }, { "epoch": 1.719417735042735, "grad_norm": 0.4655434489250183, "learning_rate": 0.0001544795673438929, "loss": 0.5781, "step": 77250 }, { "epoch": 1.7196403133903133, "grad_norm": 0.32710781693458557, "learning_rate": 0.00015443416998407942, "loss": 0.4204, "step": 77260 }, { "epoch": 1.7198628917378918, "grad_norm": 0.6005839705467224, "learning_rate": 0.00015438877510035043, "loss": 0.5271, "step": 77270 }, { "epoch": 1.7200854700854702, "grad_norm": 0.4636354148387909, "learning_rate": 0.0001543433826951728, "loss": 0.596, "step": 77280 }, { "epoch": 1.7203080484330484, "grad_norm": 0.47903260588645935, "learning_rate": 0.0001542979927710131, "loss": 0.6041, "step": 77290 }, { "epoch": 1.7205306267806266, "grad_norm": 0.5806395411491394, "learning_rate": 0.00015425260533033784, "loss": 0.5057, "step": 77300 }, { "epoch": 1.7207532051282053, "grad_norm": 0.5809629559516907, "learning_rate": 0.00015420722037561357, "loss": 0.5285, "step": 77310 }, { "epoch": 1.7209757834757835, "grad_norm": 0.5405491590499878, "learning_rate": 0.00015416183790930633, "loss": 0.493, "step": 77320 }, { "epoch": 1.7211983618233617, "grad_norm": 0.6415594816207886, "learning_rate": 0.00015411645793388242, "loss": 0.501, "step": 77330 }, { "epoch": 1.7214209401709402, "grad_norm": 0.6232026815414429, "learning_rate": 0.0001540710804518077, "loss": 0.5539, "step": 77340 }, { "epoch": 1.7216435185185186, "grad_norm": 0.417968213558197, "learning_rate": 0.00015402570546554803, "loss": 0.4794, "step": 77350 }, { "epoch": 1.7218660968660968, "grad_norm": 0.6860714554786682, "learning_rate": 0.00015398033297756925, "loss": 0.6022, "step": 77360 }, { "epoch": 1.7220886752136753, "grad_norm": 0.6967795491218567, "learning_rate": 0.00015393496299033677, "loss": 0.674, "step": 77370 }, { "epoch": 1.7223112535612537, "grad_norm": 1.5055320262908936, "learning_rate": 0.00015388959550631612, "loss": 0.5298, "step": 77380 }, { "epoch": 1.722533831908832, "grad_norm": 0.6755040884017944, "learning_rate": 0.00015384423052797265, "loss": 0.504, "step": 77390 }, { "epoch": 1.7227564102564101, "grad_norm": 0.45495426654815674, "learning_rate": 0.00015379886805777144, "loss": 0.5517, "step": 77400 }, { "epoch": 1.7229789886039886, "grad_norm": 0.7500666975975037, "learning_rate": 0.00015375350809817754, "loss": 0.515, "step": 77410 }, { "epoch": 1.723201566951567, "grad_norm": 0.6109678745269775, "learning_rate": 0.00015370815065165593, "loss": 0.4122, "step": 77420 }, { "epoch": 1.7234241452991452, "grad_norm": 0.7749100923538208, "learning_rate": 0.00015366279572067126, "loss": 0.6301, "step": 77430 }, { "epoch": 1.7236467236467237, "grad_norm": 0.6647700667381287, "learning_rate": 0.0001536174433076883, "loss": 0.6777, "step": 77440 }, { "epoch": 1.723869301994302, "grad_norm": 0.4303279519081116, "learning_rate": 0.00015357209341517134, "loss": 0.6068, "step": 77450 }, { "epoch": 1.7240918803418803, "grad_norm": 0.34944385290145874, "learning_rate": 0.00015352674604558487, "loss": 0.5226, "step": 77460 }, { "epoch": 1.7243144586894585, "grad_norm": 0.5302987098693848, "learning_rate": 0.00015348140120139306, "loss": 0.5667, "step": 77470 }, { "epoch": 1.7245370370370372, "grad_norm": 0.5831298828125, "learning_rate": 0.00015343605888505995, "loss": 0.4849, "step": 77480 }, { "epoch": 1.7247596153846154, "grad_norm": 0.8023634552955627, "learning_rate": 0.00015339071909904953, "loss": 0.5295, "step": 77490 }, { "epoch": 1.7249821937321936, "grad_norm": 0.6617007255554199, "learning_rate": 0.00015334538184582565, "loss": 0.7097, "step": 77500 }, { "epoch": 1.725204772079772, "grad_norm": 0.43473944067955017, "learning_rate": 0.00015330004712785185, "loss": 0.5037, "step": 77510 }, { "epoch": 1.7254273504273505, "grad_norm": 0.3872452676296234, "learning_rate": 0.0001532547149475917, "loss": 0.4662, "step": 77520 }, { "epoch": 1.7256499287749287, "grad_norm": 0.4788249731063843, "learning_rate": 0.0001532093853075087, "loss": 0.5742, "step": 77530 }, { "epoch": 1.7258725071225072, "grad_norm": 0.6486528515815735, "learning_rate": 0.00015316405821006593, "loss": 0.6181, "step": 77540 }, { "epoch": 1.7260950854700856, "grad_norm": 0.5705780386924744, "learning_rate": 0.0001531187336577266, "loss": 0.5193, "step": 77550 }, { "epoch": 1.7263176638176638, "grad_norm": 0.7377183437347412, "learning_rate": 0.0001530734116529537, "loss": 0.5357, "step": 77560 }, { "epoch": 1.726540242165242, "grad_norm": 0.4787918031215668, "learning_rate": 0.00015302809219821007, "loss": 0.5632, "step": 77570 }, { "epoch": 1.7267628205128205, "grad_norm": 0.5801129937171936, "learning_rate": 0.00015298277529595826, "loss": 0.4936, "step": 77580 }, { "epoch": 1.726985398860399, "grad_norm": 0.5748696327209473, "learning_rate": 0.00015293746094866096, "loss": 0.6502, "step": 77590 }, { "epoch": 1.7272079772079771, "grad_norm": 0.6041720509529114, "learning_rate": 0.00015289214915878055, "loss": 0.5518, "step": 77600 }, { "epoch": 1.7274305555555556, "grad_norm": 0.55303955078125, "learning_rate": 0.00015284683992877933, "loss": 0.5525, "step": 77610 }, { "epoch": 1.727653133903134, "grad_norm": 0.36641934514045715, "learning_rate": 0.00015280153326111941, "loss": 0.5056, "step": 77620 }, { "epoch": 1.7278757122507122, "grad_norm": 0.6180765628814697, "learning_rate": 0.0001527562291582628, "loss": 0.6146, "step": 77630 }, { "epoch": 1.7280982905982905, "grad_norm": 0.8334943652153015, "learning_rate": 0.00015271092762267143, "loss": 0.5016, "step": 77640 }, { "epoch": 1.728320868945869, "grad_norm": 0.3000129163265228, "learning_rate": 0.0001526656286568069, "loss": 0.4684, "step": 77650 }, { "epoch": 1.7285434472934473, "grad_norm": 0.6617578864097595, "learning_rate": 0.0001526203322631309, "loss": 0.4794, "step": 77660 }, { "epoch": 1.7287660256410255, "grad_norm": 0.6590235829353333, "learning_rate": 0.00015257503844410487, "loss": 0.4775, "step": 77670 }, { "epoch": 1.728988603988604, "grad_norm": 0.6377565264701843, "learning_rate": 0.00015252974720219, "loss": 0.6553, "step": 77680 }, { "epoch": 1.7292111823361824, "grad_norm": 0.6191003322601318, "learning_rate": 0.00015248445853984754, "loss": 0.4988, "step": 77690 }, { "epoch": 1.7294337606837606, "grad_norm": 0.408372700214386, "learning_rate": 0.00015243917245953857, "loss": 0.4995, "step": 77700 }, { "epoch": 1.729656339031339, "grad_norm": 0.5942658185958862, "learning_rate": 0.00015239388896372388, "loss": 0.6144, "step": 77710 }, { "epoch": 1.7298789173789175, "grad_norm": 0.6212291717529297, "learning_rate": 0.00015234860805486423, "loss": 0.6215, "step": 77720 }, { "epoch": 1.7301014957264957, "grad_norm": 0.5465625524520874, "learning_rate": 0.00015230332973542016, "loss": 0.5146, "step": 77730 }, { "epoch": 1.730324074074074, "grad_norm": 0.4212600290775299, "learning_rate": 0.00015225805400785226, "loss": 0.5602, "step": 77740 }, { "epoch": 1.7305466524216524, "grad_norm": 0.7564339637756348, "learning_rate": 0.00015221278087462076, "loss": 0.5131, "step": 77750 }, { "epoch": 1.7307692307692308, "grad_norm": 0.6511266231536865, "learning_rate": 0.0001521675103381859, "loss": 0.5001, "step": 77760 }, { "epoch": 1.730991809116809, "grad_norm": 0.534263014793396, "learning_rate": 0.00015212224240100764, "loss": 0.4023, "step": 77770 }, { "epoch": 1.7312143874643875, "grad_norm": 0.4892014265060425, "learning_rate": 0.00015207697706554597, "loss": 0.5147, "step": 77780 }, { "epoch": 1.731436965811966, "grad_norm": 0.437839537858963, "learning_rate": 0.00015203171433426056, "loss": 0.4898, "step": 77790 }, { "epoch": 1.7316595441595442, "grad_norm": 0.33123520016670227, "learning_rate": 0.00015198645420961106, "loss": 0.4448, "step": 77800 }, { "epoch": 1.7318821225071224, "grad_norm": 0.4915980398654938, "learning_rate": 0.00015194119669405698, "loss": 0.5647, "step": 77810 }, { "epoch": 1.7321047008547008, "grad_norm": 0.46801048517227173, "learning_rate": 0.0001518959417900576, "loss": 0.4096, "step": 77820 }, { "epoch": 1.7323272792022792, "grad_norm": 0.6970308423042297, "learning_rate": 0.0001518506895000721, "loss": 0.66, "step": 77830 }, { "epoch": 1.7325498575498575, "grad_norm": 0.3197179138660431, "learning_rate": 0.00015180543982655964, "loss": 0.4739, "step": 77840 }, { "epoch": 1.732772435897436, "grad_norm": 0.4276377856731415, "learning_rate": 0.000151760192771979, "loss": 0.4646, "step": 77850 }, { "epoch": 1.7329950142450143, "grad_norm": 0.5552869439125061, "learning_rate": 0.0001517149483387889, "loss": 0.531, "step": 77860 }, { "epoch": 1.7332175925925926, "grad_norm": 0.6214910745620728, "learning_rate": 0.00015166970652944807, "loss": 0.5939, "step": 77870 }, { "epoch": 1.7334401709401708, "grad_norm": 0.6200249791145325, "learning_rate": 0.00015162446734641498, "loss": 0.6064, "step": 77880 }, { "epoch": 1.7336627492877494, "grad_norm": 0.5559808611869812, "learning_rate": 0.0001515792307921479, "loss": 0.4792, "step": 77890 }, { "epoch": 1.7338853276353277, "grad_norm": 0.8645526766777039, "learning_rate": 0.00015153399686910506, "loss": 0.4616, "step": 77900 }, { "epoch": 1.7341079059829059, "grad_norm": 0.6816758513450623, "learning_rate": 0.0001514887655797445, "loss": 0.5304, "step": 77910 }, { "epoch": 1.7343304843304843, "grad_norm": 0.5790347456932068, "learning_rate": 0.00015144353692652415, "loss": 0.5979, "step": 77920 }, { "epoch": 1.7345530626780628, "grad_norm": 0.59865802526474, "learning_rate": 0.00015139831091190176, "loss": 0.6378, "step": 77930 }, { "epoch": 1.734775641025641, "grad_norm": 0.45446011424064636, "learning_rate": 0.0001513530875383349, "loss": 0.5979, "step": 77940 }, { "epoch": 1.7349982193732194, "grad_norm": 0.4167000949382782, "learning_rate": 0.00015130786680828115, "loss": 0.4618, "step": 77950 }, { "epoch": 1.7352207977207978, "grad_norm": 0.3784964084625244, "learning_rate": 0.00015126264872419772, "loss": 0.478, "step": 77960 }, { "epoch": 1.735443376068376, "grad_norm": 0.4573151469230652, "learning_rate": 0.00015121743328854197, "loss": 0.4937, "step": 77970 }, { "epoch": 1.7356659544159543, "grad_norm": 0.4052749574184418, "learning_rate": 0.00015117222050377074, "loss": 0.646, "step": 77980 }, { "epoch": 1.7358885327635327, "grad_norm": 0.48372966051101685, "learning_rate": 0.00015112701037234104, "loss": 0.5491, "step": 77990 }, { "epoch": 1.7361111111111112, "grad_norm": 0.7810246348381042, "learning_rate": 0.00015108180289670958, "loss": 0.451, "step": 78000 }, { "epoch": 1.7363336894586894, "grad_norm": 0.6991286873817444, "learning_rate": 0.000151036598079333, "loss": 0.479, "step": 78010 }, { "epoch": 1.7365562678062678, "grad_norm": 0.47509098052978516, "learning_rate": 0.0001509913959226678, "loss": 0.5591, "step": 78020 }, { "epoch": 1.7367788461538463, "grad_norm": 0.5491872429847717, "learning_rate": 0.00015094619642917024, "loss": 0.4718, "step": 78030 }, { "epoch": 1.7370014245014245, "grad_norm": 0.504374623298645, "learning_rate": 0.0001509009996012965, "loss": 0.4848, "step": 78040 }, { "epoch": 1.7372240028490027, "grad_norm": 0.7922311425209045, "learning_rate": 0.0001508558054415027, "loss": 0.6024, "step": 78050 }, { "epoch": 1.7374465811965814, "grad_norm": 0.6239007711410522, "learning_rate": 0.0001508106139522446, "loss": 0.6778, "step": 78060 }, { "epoch": 1.7376691595441596, "grad_norm": 0.7354421615600586, "learning_rate": 0.000150765425135978, "loss": 0.6422, "step": 78070 }, { "epoch": 1.7378917378917378, "grad_norm": 0.8253692388534546, "learning_rate": 0.00015072023899515854, "loss": 0.4777, "step": 78080 }, { "epoch": 1.7381143162393162, "grad_norm": 0.5854160189628601, "learning_rate": 0.00015067505553224164, "loss": 0.4572, "step": 78090 }, { "epoch": 1.7383368945868947, "grad_norm": 0.43329954147338867, "learning_rate": 0.00015062987474968265, "loss": 0.4732, "step": 78100 }, { "epoch": 1.7385594729344729, "grad_norm": 0.5088170766830444, "learning_rate": 0.0001505846966499366, "loss": 0.4901, "step": 78110 }, { "epoch": 1.7387820512820513, "grad_norm": 0.5127506852149963, "learning_rate": 0.0001505395212354586, "loss": 0.4453, "step": 78120 }, { "epoch": 1.7390046296296298, "grad_norm": 0.6326592564582825, "learning_rate": 0.00015049434850870354, "loss": 0.5379, "step": 78130 }, { "epoch": 1.739227207977208, "grad_norm": 0.616950273513794, "learning_rate": 0.00015044917847212608, "loss": 0.5545, "step": 78140 }, { "epoch": 1.7394497863247862, "grad_norm": 0.5827544331550598, "learning_rate": 0.00015040401112818082, "loss": 0.5862, "step": 78150 }, { "epoch": 1.7396723646723646, "grad_norm": 0.5349555611610413, "learning_rate": 0.00015035884647932222, "loss": 0.4486, "step": 78160 }, { "epoch": 1.739894943019943, "grad_norm": 0.5661860704421997, "learning_rate": 0.0001503136845280045, "loss": 0.4761, "step": 78170 }, { "epoch": 1.7401175213675213, "grad_norm": 0.4856835901737213, "learning_rate": 0.00015026852527668186, "loss": 0.407, "step": 78180 }, { "epoch": 1.7402065527065527, "eval_loss": 0.5531619787216187, "eval_runtime": 337.3034, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 78184 }, { "epoch": 1.7403400997150997, "grad_norm": 0.5013774633407593, "learning_rate": 0.0001502233687278083, "loss": 0.5465, "step": 78190 }, { "epoch": 1.7405626780626782, "grad_norm": 0.6066476702690125, "learning_rate": 0.00015017821488383758, "loss": 0.5875, "step": 78200 }, { "epoch": 1.7407852564102564, "grad_norm": 0.4210889935493469, "learning_rate": 0.00015013306374722348, "loss": 0.3845, "step": 78210 }, { "epoch": 1.7410078347578346, "grad_norm": 0.6543701887130737, "learning_rate": 0.00015008791532041953, "loss": 0.5046, "step": 78220 }, { "epoch": 1.7412304131054133, "grad_norm": 0.6810021996498108, "learning_rate": 0.00015004276960587913, "loss": 0.447, "step": 78230 }, { "epoch": 1.7414529914529915, "grad_norm": 0.5493144392967224, "learning_rate": 0.00014999762660605553, "loss": 0.6239, "step": 78240 }, { "epoch": 1.7416755698005697, "grad_norm": 0.4917827546596527, "learning_rate": 0.0001499524863234018, "loss": 0.6435, "step": 78250 }, { "epoch": 1.7418981481481481, "grad_norm": 0.592804491519928, "learning_rate": 0.0001499073487603709, "loss": 0.4306, "step": 78260 }, { "epoch": 1.7421207264957266, "grad_norm": 0.43577784299850464, "learning_rate": 0.00014986221391941575, "loss": 0.504, "step": 78270 }, { "epoch": 1.7423433048433048, "grad_norm": 0.2997153401374817, "learning_rate": 0.00014981708180298887, "loss": 0.5224, "step": 78280 }, { "epoch": 1.7425658831908832, "grad_norm": 0.3560454845428467, "learning_rate": 0.0001497719524135429, "loss": 0.5944, "step": 78290 }, { "epoch": 1.7427884615384617, "grad_norm": 0.6725266575813293, "learning_rate": 0.00014972682575353015, "loss": 0.5343, "step": 78300 }, { "epoch": 1.74301103988604, "grad_norm": 0.5585955381393433, "learning_rate": 0.0001496817018254028, "loss": 0.6153, "step": 78310 }, { "epoch": 1.743233618233618, "grad_norm": 0.5645360350608826, "learning_rate": 0.000149636580631613, "loss": 0.4829, "step": 78320 }, { "epoch": 1.7434561965811965, "grad_norm": 0.6441385746002197, "learning_rate": 0.00014959146217461265, "loss": 0.5087, "step": 78330 }, { "epoch": 1.743678774928775, "grad_norm": 0.5112342834472656, "learning_rate": 0.0001495463464568535, "loss": 0.3337, "step": 78340 }, { "epoch": 1.7439013532763532, "grad_norm": 0.834675669670105, "learning_rate": 0.00014950123348078716, "loss": 0.5751, "step": 78350 }, { "epoch": 1.7441239316239316, "grad_norm": 0.6172758936882019, "learning_rate": 0.00014945612324886523, "loss": 0.6955, "step": 78360 }, { "epoch": 1.74434650997151, "grad_norm": 0.5868250727653503, "learning_rate": 0.00014941101576353884, "loss": 0.4277, "step": 78370 }, { "epoch": 1.7445690883190883, "grad_norm": 0.7029470205307007, "learning_rate": 0.00014936591102725932, "loss": 0.5665, "step": 78380 }, { "epoch": 1.7447916666666665, "grad_norm": 0.31776389479637146, "learning_rate": 0.0001493208090424776, "loss": 0.496, "step": 78390 }, { "epoch": 1.7450142450142452, "grad_norm": 0.584300696849823, "learning_rate": 0.00014927570981164464, "loss": 0.5566, "step": 78400 }, { "epoch": 1.7452368233618234, "grad_norm": 0.5410353541374207, "learning_rate": 0.0001492306133372111, "loss": 0.5756, "step": 78410 }, { "epoch": 1.7454594017094016, "grad_norm": 0.5734127759933472, "learning_rate": 0.0001491855196216276, "loss": 0.6147, "step": 78420 }, { "epoch": 1.74568198005698, "grad_norm": 0.6105397343635559, "learning_rate": 0.00014914042866734457, "loss": 0.5736, "step": 78430 }, { "epoch": 1.7459045584045585, "grad_norm": 0.5667615532875061, "learning_rate": 0.0001490953404768123, "loss": 0.5383, "step": 78440 }, { "epoch": 1.7461271367521367, "grad_norm": 0.7542938590049744, "learning_rate": 0.00014905025505248086, "loss": 0.5947, "step": 78450 }, { "epoch": 1.7463497150997151, "grad_norm": 0.39676132798194885, "learning_rate": 0.0001490051723968003, "loss": 0.4752, "step": 78460 }, { "epoch": 1.7465722934472936, "grad_norm": 0.6052358150482178, "learning_rate": 0.00014896009251222044, "loss": 0.5905, "step": 78470 }, { "epoch": 1.7467948717948718, "grad_norm": 0.36085426807403564, "learning_rate": 0.0001489150154011909, "loss": 0.589, "step": 78480 }, { "epoch": 1.74701745014245, "grad_norm": 0.4671715497970581, "learning_rate": 0.00014886994106616132, "loss": 0.5841, "step": 78490 }, { "epoch": 1.7472400284900285, "grad_norm": 0.569623589515686, "learning_rate": 0.0001488248695095809, "loss": 0.584, "step": 78500 }, { "epoch": 1.747462606837607, "grad_norm": 0.5761276483535767, "learning_rate": 0.00014877980073389898, "loss": 0.5222, "step": 78510 }, { "epoch": 1.7476851851851851, "grad_norm": 0.779426097869873, "learning_rate": 0.00014873473474156468, "loss": 0.5492, "step": 78520 }, { "epoch": 1.7479077635327636, "grad_norm": 0.7712996006011963, "learning_rate": 0.0001486896715350268, "loss": 0.4714, "step": 78530 }, { "epoch": 1.748130341880342, "grad_norm": 0.6011484265327454, "learning_rate": 0.00014864461111673417, "loss": 0.4811, "step": 78540 }, { "epoch": 1.7483529202279202, "grad_norm": 0.7905614376068115, "learning_rate": 0.00014859955348913548, "loss": 0.5346, "step": 78550 }, { "epoch": 1.7485754985754984, "grad_norm": 0.5179427862167358, "learning_rate": 0.00014855449865467906, "loss": 0.5522, "step": 78560 }, { "epoch": 1.7487980769230769, "grad_norm": 0.9913924336433411, "learning_rate": 0.00014850944661581333, "loss": 0.6609, "step": 78570 }, { "epoch": 1.7490206552706553, "grad_norm": 0.7358751893043518, "learning_rate": 0.0001484643973749864, "loss": 0.5785, "step": 78580 }, { "epoch": 1.7492432336182335, "grad_norm": 0.46974995732307434, "learning_rate": 0.0001484193509346463, "loss": 0.432, "step": 78590 }, { "epoch": 1.749465811965812, "grad_norm": 0.665675163269043, "learning_rate": 0.00014837430729724093, "loss": 0.5735, "step": 78600 }, { "epoch": 1.7496883903133904, "grad_norm": 0.44517970085144043, "learning_rate": 0.00014832926646521797, "loss": 0.541, "step": 78610 }, { "epoch": 1.7499109686609686, "grad_norm": 0.3566705584526062, "learning_rate": 0.00014828422844102499, "loss": 0.6136, "step": 78620 }, { "epoch": 1.750133547008547, "grad_norm": 0.7967696785926819, "learning_rate": 0.00014823919322710935, "loss": 0.6009, "step": 78630 }, { "epoch": 1.7503561253561255, "grad_norm": 0.687789797782898, "learning_rate": 0.0001481941608259183, "loss": 0.4748, "step": 78640 }, { "epoch": 1.7505787037037037, "grad_norm": 0.5504183173179626, "learning_rate": 0.00014814913123989895, "loss": 0.424, "step": 78650 }, { "epoch": 1.750801282051282, "grad_norm": 0.617791473865509, "learning_rate": 0.00014810410447149832, "loss": 0.579, "step": 78660 }, { "epoch": 1.7510238603988604, "grad_norm": 0.6422916054725647, "learning_rate": 0.00014805908052316306, "loss": 0.5115, "step": 78670 }, { "epoch": 1.7512464387464388, "grad_norm": 0.5082630515098572, "learning_rate": 0.00014801405939733993, "loss": 0.6194, "step": 78680 }, { "epoch": 1.751469017094017, "grad_norm": 0.5182098150253296, "learning_rate": 0.00014796904109647536, "loss": 0.5075, "step": 78690 }, { "epoch": 1.7516915954415955, "grad_norm": 0.5061643719673157, "learning_rate": 0.0001479240256230157, "loss": 0.6505, "step": 78700 }, { "epoch": 1.751914173789174, "grad_norm": 0.5079662203788757, "learning_rate": 0.00014787901297940708, "loss": 0.4824, "step": 78710 }, { "epoch": 1.7521367521367521, "grad_norm": 0.43398627638816833, "learning_rate": 0.00014783400316809565, "loss": 0.5377, "step": 78720 }, { "epoch": 1.7523593304843303, "grad_norm": 0.5941019058227539, "learning_rate": 0.00014778899619152707, "loss": 0.5567, "step": 78730 }, { "epoch": 1.7525819088319088, "grad_norm": 0.6169543266296387, "learning_rate": 0.00014774399205214723, "loss": 0.5531, "step": 78740 }, { "epoch": 1.7528044871794872, "grad_norm": 0.5469086766242981, "learning_rate": 0.00014769899075240176, "loss": 0.627, "step": 78750 }, { "epoch": 1.7530270655270654, "grad_norm": 0.5565235614776611, "learning_rate": 0.00014765399229473583, "loss": 0.488, "step": 78760 }, { "epoch": 1.7532496438746439, "grad_norm": 0.4693734049797058, "learning_rate": 0.00014760899668159481, "loss": 0.5234, "step": 78770 }, { "epoch": 1.7534722222222223, "grad_norm": 0.7753030061721802, "learning_rate": 0.00014756400391542382, "loss": 0.6111, "step": 78780 }, { "epoch": 1.7536948005698005, "grad_norm": 0.5935645699501038, "learning_rate": 0.00014751901399866774, "loss": 0.5248, "step": 78790 }, { "epoch": 1.7539173789173788, "grad_norm": 0.6888484358787537, "learning_rate": 0.0001474740269337715, "loss": 0.4863, "step": 78800 }, { "epoch": 1.7541399572649574, "grad_norm": 0.701227605342865, "learning_rate": 0.00014742904272317954, "loss": 0.6862, "step": 78810 }, { "epoch": 1.7543625356125356, "grad_norm": 0.3563346564769745, "learning_rate": 0.00014738406136933648, "loss": 0.5245, "step": 78820 }, { "epoch": 1.7545851139601139, "grad_norm": 0.42991700768470764, "learning_rate": 0.0001473390828746866, "loss": 0.4954, "step": 78830 }, { "epoch": 1.7548076923076923, "grad_norm": 0.6053158640861511, "learning_rate": 0.00014729410724167403, "loss": 0.4704, "step": 78840 }, { "epoch": 1.7550302706552707, "grad_norm": 0.4515514671802521, "learning_rate": 0.00014724913447274282, "loss": 0.5426, "step": 78850 }, { "epoch": 1.755252849002849, "grad_norm": 0.4333280622959137, "learning_rate": 0.0001472041645703369, "loss": 0.526, "step": 78860 }, { "epoch": 1.7554754273504274, "grad_norm": 0.8473355174064636, "learning_rate": 0.00014715919753689982, "loss": 0.6062, "step": 78870 }, { "epoch": 1.7556980056980058, "grad_norm": 0.5016404986381531, "learning_rate": 0.0001471142333748753, "loss": 0.5792, "step": 78880 }, { "epoch": 1.755920584045584, "grad_norm": 0.668297290802002, "learning_rate": 0.00014706927208670654, "loss": 0.5197, "step": 78890 }, { "epoch": 1.7561431623931623, "grad_norm": 0.6492266654968262, "learning_rate": 0.00014702431367483694, "loss": 0.6069, "step": 78900 }, { "epoch": 1.7563657407407407, "grad_norm": 0.5919086933135986, "learning_rate": 0.00014697935814170942, "loss": 0.47, "step": 78910 }, { "epoch": 1.7565883190883191, "grad_norm": 0.7258105874061584, "learning_rate": 0.000146934405489767, "loss": 0.6531, "step": 78920 }, { "epoch": 1.7568108974358974, "grad_norm": 0.39832577109336853, "learning_rate": 0.00014688945572145245, "loss": 0.5133, "step": 78930 }, { "epoch": 1.7570334757834758, "grad_norm": 0.6426448225975037, "learning_rate": 0.00014684450883920838, "loss": 0.5626, "step": 78940 }, { "epoch": 1.7572560541310542, "grad_norm": 0.6589502096176147, "learning_rate": 0.00014679956484547714, "loss": 0.5481, "step": 78950 }, { "epoch": 1.7574786324786325, "grad_norm": 0.7137744426727295, "learning_rate": 0.00014675462374270115, "loss": 0.5171, "step": 78960 }, { "epoch": 1.7577012108262107, "grad_norm": 0.4286314845085144, "learning_rate": 0.00014670968553332249, "loss": 0.4934, "step": 78970 }, { "epoch": 1.7579237891737893, "grad_norm": 0.41813647747039795, "learning_rate": 0.00014666475021978315, "loss": 0.5078, "step": 78980 }, { "epoch": 1.7581463675213675, "grad_norm": 0.5532824397087097, "learning_rate": 0.0001466198178045249, "loss": 0.5654, "step": 78990 }, { "epoch": 1.7583689458689458, "grad_norm": 0.6196248531341553, "learning_rate": 0.0001465748882899895, "loss": 0.6976, "step": 79000 }, { "epoch": 1.7585915242165242, "grad_norm": 0.5199751257896423, "learning_rate": 0.0001465299616786184, "loss": 0.5639, "step": 79010 }, { "epoch": 1.7588141025641026, "grad_norm": 0.7880491018295288, "learning_rate": 0.00014648503797285302, "loss": 0.639, "step": 79020 }, { "epoch": 1.7590366809116809, "grad_norm": 0.45788395404815674, "learning_rate": 0.00014644011717513438, "loss": 0.4218, "step": 79030 }, { "epoch": 1.7592592592592593, "grad_norm": 0.4737023413181305, "learning_rate": 0.00014639519928790368, "loss": 0.6488, "step": 79040 }, { "epoch": 1.7594818376068377, "grad_norm": 0.5662586688995361, "learning_rate": 0.0001463502843136017, "loss": 0.5559, "step": 79050 }, { "epoch": 1.759704415954416, "grad_norm": 0.996168315410614, "learning_rate": 0.00014630537225466918, "loss": 0.4669, "step": 79060 }, { "epoch": 1.7599269943019942, "grad_norm": 0.5135176181793213, "learning_rate": 0.0001462604631135467, "loss": 0.5703, "step": 79070 }, { "epoch": 1.7601495726495726, "grad_norm": 0.3148840069770813, "learning_rate": 0.00014621555689267468, "loss": 0.5434, "step": 79080 }, { "epoch": 1.760372150997151, "grad_norm": 0.5665394067764282, "learning_rate": 0.00014617065359449327, "loss": 0.5154, "step": 79090 }, { "epoch": 1.7605947293447293, "grad_norm": 0.4433667063713074, "learning_rate": 0.00014612575322144262, "loss": 0.5047, "step": 79100 }, { "epoch": 1.7608173076923077, "grad_norm": 0.692866325378418, "learning_rate": 0.0001460808557759627, "loss": 0.625, "step": 79110 }, { "epoch": 1.7610398860398861, "grad_norm": 0.6210705041885376, "learning_rate": 0.00014603596126049314, "loss": 0.5122, "step": 79120 }, { "epoch": 1.7612624643874644, "grad_norm": 0.484747052192688, "learning_rate": 0.00014599106967747364, "loss": 0.4343, "step": 79130 }, { "epoch": 1.7614850427350426, "grad_norm": 0.46711021661758423, "learning_rate": 0.0001459461810293437, "loss": 0.4887, "step": 79140 }, { "epoch": 1.7617076210826212, "grad_norm": 0.6077610850334167, "learning_rate": 0.00014590129531854255, "loss": 0.6496, "step": 79150 }, { "epoch": 1.7619301994301995, "grad_norm": 0.35435062646865845, "learning_rate": 0.0001458564125475092, "loss": 0.4541, "step": 79160 }, { "epoch": 1.7621527777777777, "grad_norm": 0.8237841129302979, "learning_rate": 0.00014581153271868276, "loss": 0.5728, "step": 79170 }, { "epoch": 1.7623753561253561, "grad_norm": 0.5024449229240417, "learning_rate": 0.000145766655834502, "loss": 0.5759, "step": 79180 }, { "epoch": 1.7625979344729346, "grad_norm": 0.4281274974346161, "learning_rate": 0.00014572178189740554, "loss": 0.5587, "step": 79190 }, { "epoch": 1.7628205128205128, "grad_norm": 0.9404083490371704, "learning_rate": 0.00014567691090983185, "loss": 0.5896, "step": 79200 }, { "epoch": 1.7630430911680912, "grad_norm": 0.5195593237876892, "learning_rate": 0.00014563204287421937, "loss": 0.5516, "step": 79210 }, { "epoch": 1.7632656695156697, "grad_norm": 0.6367482542991638, "learning_rate": 0.00014558717779300612, "loss": 0.5328, "step": 79220 }, { "epoch": 1.7634882478632479, "grad_norm": 0.7262487411499023, "learning_rate": 0.0001455423156686302, "loss": 0.548, "step": 79230 }, { "epoch": 1.763710826210826, "grad_norm": 0.7013747096061707, "learning_rate": 0.00014549745650352942, "loss": 0.5405, "step": 79240 }, { "epoch": 1.7639334045584045, "grad_norm": 0.808320164680481, "learning_rate": 0.00014545260030014148, "loss": 0.6022, "step": 79250 }, { "epoch": 1.764155982905983, "grad_norm": 0.5514160394668579, "learning_rate": 0.00014540774706090387, "loss": 0.7491, "step": 79260 }, { "epoch": 1.7643785612535612, "grad_norm": 0.40290629863739014, "learning_rate": 0.00014536289678825402, "loss": 0.513, "step": 79270 }, { "epoch": 1.7646011396011396, "grad_norm": 0.656762421131134, "learning_rate": 0.00014531804948462912, "loss": 0.6651, "step": 79280 }, { "epoch": 1.764823717948718, "grad_norm": 0.9429783225059509, "learning_rate": 0.00014527320515246613, "loss": 0.6955, "step": 79290 }, { "epoch": 1.7650462962962963, "grad_norm": 0.5018649697303772, "learning_rate": 0.00014522836379420197, "loss": 0.4788, "step": 79300 }, { "epoch": 1.7652688746438745, "grad_norm": 0.6236805319786072, "learning_rate": 0.00014518352541227332, "loss": 0.5814, "step": 79310 }, { "epoch": 1.765491452991453, "grad_norm": 0.40159523487091064, "learning_rate": 0.00014513869000911685, "loss": 0.5379, "step": 79320 }, { "epoch": 1.7657140313390314, "grad_norm": 0.4981936514377594, "learning_rate": 0.00014509385758716881, "loss": 0.4977, "step": 79330 }, { "epoch": 1.7659366096866096, "grad_norm": 0.4534938931465149, "learning_rate": 0.00014504902814886552, "loss": 0.419, "step": 79340 }, { "epoch": 1.766159188034188, "grad_norm": 0.5050660967826843, "learning_rate": 0.00014500420169664304, "loss": 0.492, "step": 79350 }, { "epoch": 1.7663817663817665, "grad_norm": 0.7194937467575073, "learning_rate": 0.00014495937823293727, "loss": 0.5557, "step": 79360 }, { "epoch": 1.7666043447293447, "grad_norm": 0.7035837769508362, "learning_rate": 0.0001449145577601839, "loss": 0.5342, "step": 79370 }, { "epoch": 1.7668269230769231, "grad_norm": 0.7265353202819824, "learning_rate": 0.00014486974028081855, "loss": 0.554, "step": 79380 }, { "epoch": 1.7670495014245016, "grad_norm": 0.6127051711082458, "learning_rate": 0.0001448249257972767, "loss": 0.6079, "step": 79390 }, { "epoch": 1.7672720797720798, "grad_norm": 0.5281509160995483, "learning_rate": 0.00014478011431199353, "loss": 0.5042, "step": 79400 }, { "epoch": 1.767494658119658, "grad_norm": 0.4901590943336487, "learning_rate": 0.0001447353058274042, "loss": 0.5553, "step": 79410 }, { "epoch": 1.7677172364672364, "grad_norm": 0.3638477921485901, "learning_rate": 0.00014469050034594352, "loss": 0.5171, "step": 79420 }, { "epoch": 1.7679398148148149, "grad_norm": 0.7034321427345276, "learning_rate": 0.0001446456978700464, "loss": 0.5711, "step": 79430 }, { "epoch": 1.768162393162393, "grad_norm": 0.7615830302238464, "learning_rate": 0.0001446008984021473, "loss": 0.5898, "step": 79440 }, { "epoch": 1.7683849715099715, "grad_norm": 0.43075403571128845, "learning_rate": 0.00014455610194468075, "loss": 0.5373, "step": 79450 }, { "epoch": 1.76860754985755, "grad_norm": 0.552230954170227, "learning_rate": 0.00014451130850008103, "loss": 0.6413, "step": 79460 }, { "epoch": 1.7688301282051282, "grad_norm": 0.41900429129600525, "learning_rate": 0.00014446651807078223, "loss": 0.6078, "step": 79470 }, { "epoch": 1.7690527065527064, "grad_norm": 0.5278525948524475, "learning_rate": 0.00014442173065921823, "loss": 0.4369, "step": 79480 }, { "epoch": 1.7692752849002849, "grad_norm": 0.47469159960746765, "learning_rate": 0.00014437694626782298, "loss": 0.5996, "step": 79490 }, { "epoch": 1.7694978632478633, "grad_norm": 0.6462084054946899, "learning_rate": 0.00014433216489902998, "loss": 0.5545, "step": 79500 }, { "epoch": 1.7697204415954415, "grad_norm": 0.5856205224990845, "learning_rate": 0.00014428738655527265, "loss": 0.5303, "step": 79510 }, { "epoch": 1.76994301994302, "grad_norm": 0.5695719122886658, "learning_rate": 0.00014424261123898444, "loss": 0.5023, "step": 79520 }, { "epoch": 1.7701655982905984, "grad_norm": 0.651113748550415, "learning_rate": 0.0001441978389525983, "loss": 0.5535, "step": 79530 }, { "epoch": 1.7703881766381766, "grad_norm": 0.6896957755088806, "learning_rate": 0.0001441530696985474, "loss": 0.537, "step": 79540 }, { "epoch": 1.7706107549857548, "grad_norm": 0.34784117341041565, "learning_rate": 0.00014410830347926428, "loss": 0.4425, "step": 79550 }, { "epoch": 1.7708333333333335, "grad_norm": 0.4808754026889801, "learning_rate": 0.00014406354029718172, "loss": 0.4823, "step": 79560 }, { "epoch": 1.7710559116809117, "grad_norm": 0.49746209383010864, "learning_rate": 0.00014401878015473223, "loss": 0.4782, "step": 79570 }, { "epoch": 1.77127849002849, "grad_norm": 0.45266780257225037, "learning_rate": 0.00014397402305434798, "loss": 0.4927, "step": 79580 }, { "epoch": 1.7715010683760684, "grad_norm": 0.5584794878959656, "learning_rate": 0.00014392926899846123, "loss": 0.5003, "step": 79590 }, { "epoch": 1.7717236467236468, "grad_norm": 0.5668531060218811, "learning_rate": 0.0001438845179895039, "loss": 0.6955, "step": 79600 }, { "epoch": 1.771946225071225, "grad_norm": 0.9424641728401184, "learning_rate": 0.0001438397700299078, "loss": 0.4932, "step": 79610 }, { "epoch": 1.7721688034188035, "grad_norm": 0.526470422744751, "learning_rate": 0.00014379502512210454, "loss": 0.4848, "step": 79620 }, { "epoch": 1.772391381766382, "grad_norm": 0.6422226428985596, "learning_rate": 0.00014375028326852566, "loss": 0.6079, "step": 79630 }, { "epoch": 1.77261396011396, "grad_norm": 0.6118940711021423, "learning_rate": 0.00014370554447160243, "loss": 0.6929, "step": 79640 }, { "epoch": 1.7728365384615383, "grad_norm": 0.4747575521469116, "learning_rate": 0.000143660808733766, "loss": 0.4412, "step": 79650 }, { "epoch": 1.7730591168091168, "grad_norm": 0.5872943997383118, "learning_rate": 0.00014361607605744734, "loss": 0.5547, "step": 79660 }, { "epoch": 1.7732816951566952, "grad_norm": 0.5565270781517029, "learning_rate": 0.0001435713464450773, "loss": 0.5267, "step": 79670 }, { "epoch": 1.7735042735042734, "grad_norm": 0.7619370222091675, "learning_rate": 0.00014352661989908646, "loss": 0.5872, "step": 79680 }, { "epoch": 1.7737268518518519, "grad_norm": 0.8954969048500061, "learning_rate": 0.00014348189642190534, "loss": 0.5826, "step": 79690 }, { "epoch": 1.7739494301994303, "grad_norm": 0.538362979888916, "learning_rate": 0.00014343717601596418, "loss": 0.5234, "step": 79700 }, { "epoch": 1.7741720085470085, "grad_norm": 0.7379159331321716, "learning_rate": 0.00014339245868369324, "loss": 0.5972, "step": 79710 }, { "epoch": 1.7743945868945867, "grad_norm": 0.5484915971755981, "learning_rate": 0.00014334774442752241, "loss": 0.5816, "step": 79720 }, { "epoch": 1.7746171652421654, "grad_norm": 0.5934858322143555, "learning_rate": 0.0001433030332498815, "loss": 0.6715, "step": 79730 }, { "epoch": 1.7748397435897436, "grad_norm": 0.6535338163375854, "learning_rate": 0.00014325832515320024, "loss": 0.4886, "step": 79740 }, { "epoch": 1.7750623219373218, "grad_norm": 0.3943345844745636, "learning_rate": 0.000143213620139908, "loss": 0.5029, "step": 79750 }, { "epoch": 1.7752849002849003, "grad_norm": 0.3244159519672394, "learning_rate": 0.0001431689182124341, "loss": 0.4258, "step": 79760 }, { "epoch": 1.7755074786324787, "grad_norm": 0.6063029766082764, "learning_rate": 0.00014312421937320776, "loss": 0.4813, "step": 79770 }, { "epoch": 1.775730056980057, "grad_norm": 0.5160526633262634, "learning_rate": 0.0001430795236246579, "loss": 0.5442, "step": 79780 }, { "epoch": 1.7759526353276354, "grad_norm": 0.4923551082611084, "learning_rate": 0.00014303483096921328, "loss": 0.6922, "step": 79790 }, { "epoch": 1.7761752136752138, "grad_norm": 0.42703601717948914, "learning_rate": 0.00014299014140930268, "loss": 0.5636, "step": 79800 }, { "epoch": 1.776397792022792, "grad_norm": 0.6603934168815613, "learning_rate": 0.00014294545494735437, "loss": 0.5109, "step": 79810 }, { "epoch": 1.7766203703703702, "grad_norm": 0.4960465431213379, "learning_rate": 0.0001429007715857968, "loss": 0.6324, "step": 79820 }, { "epoch": 1.7768429487179487, "grad_norm": 0.4624791741371155, "learning_rate": 0.00014285609132705802, "loss": 0.5882, "step": 79830 }, { "epoch": 1.7770655270655271, "grad_norm": 0.38224509358406067, "learning_rate": 0.00014281141417356598, "loss": 0.5201, "step": 79840 }, { "epoch": 1.7772881054131053, "grad_norm": 0.571273684501648, "learning_rate": 0.00014276674012774857, "loss": 0.4605, "step": 79850 }, { "epoch": 1.7775106837606838, "grad_norm": 0.45023903250694275, "learning_rate": 0.00014272206919203334, "loss": 0.4811, "step": 79860 }, { "epoch": 1.7777332621082622, "grad_norm": 0.43059608340263367, "learning_rate": 0.0001426774013688477, "loss": 0.4993, "step": 79870 }, { "epoch": 1.7779558404558404, "grad_norm": 0.844609797000885, "learning_rate": 0.00014263273666061907, "loss": 0.5858, "step": 79880 }, { "epoch": 1.7781784188034186, "grad_norm": 0.5263960361480713, "learning_rate": 0.00014258807506977445, "loss": 0.5184, "step": 79890 }, { "epoch": 1.7784009971509973, "grad_norm": 0.6724256873130798, "learning_rate": 0.00014254341659874085, "loss": 0.5282, "step": 79900 }, { "epoch": 1.7786235754985755, "grad_norm": 0.709722638130188, "learning_rate": 0.00014249876124994504, "loss": 0.5087, "step": 79910 }, { "epoch": 1.7788461538461537, "grad_norm": 0.599476158618927, "learning_rate": 0.00014245410902581358, "loss": 0.6163, "step": 79920 }, { "epoch": 1.7790687321937322, "grad_norm": 0.6736618876457214, "learning_rate": 0.00014240945992877304, "loss": 0.4809, "step": 79930 }, { "epoch": 1.7792913105413106, "grad_norm": 0.7935197353363037, "learning_rate": 0.0001423648139612495, "loss": 0.5017, "step": 79940 }, { "epoch": 1.7795138888888888, "grad_norm": 0.5399793982505798, "learning_rate": 0.00014232017112566914, "loss": 0.5457, "step": 79950 }, { "epoch": 1.7797364672364673, "grad_norm": 0.7083317041397095, "learning_rate": 0.00014227553142445796, "loss": 0.65, "step": 79960 }, { "epoch": 1.7799590455840457, "grad_norm": 0.6122748255729675, "learning_rate": 0.00014223089486004162, "loss": 0.5145, "step": 79970 }, { "epoch": 1.780181623931624, "grad_norm": 0.4464356601238251, "learning_rate": 0.00014218626143484573, "loss": 0.596, "step": 79980 }, { "epoch": 1.7804042022792022, "grad_norm": 0.5562244057655334, "learning_rate": 0.00014214163115129578, "loss": 0.5743, "step": 79990 }, { "epoch": 1.7806267806267806, "grad_norm": 0.6051118969917297, "learning_rate": 0.0001420970040118169, "loss": 0.4303, "step": 80000 }, { "epoch": 1.780849358974359, "grad_norm": 0.47445961833000183, "learning_rate": 0.00014205238001883426, "loss": 0.4666, "step": 80010 }, { "epoch": 1.7810719373219372, "grad_norm": 0.6427887678146362, "learning_rate": 0.00014200775917477273, "loss": 0.5215, "step": 80020 }, { "epoch": 1.7812945156695157, "grad_norm": 0.6066806316375732, "learning_rate": 0.00014196314148205702, "loss": 0.5826, "step": 80030 }, { "epoch": 1.7815170940170941, "grad_norm": 0.6234745979309082, "learning_rate": 0.0001419185269431117, "loss": 0.4631, "step": 80040 }, { "epoch": 1.7817396723646723, "grad_norm": 0.6324895024299622, "learning_rate": 0.00014187391556036124, "loss": 0.618, "step": 80050 }, { "epoch": 1.7819622507122506, "grad_norm": 0.6726653575897217, "learning_rate": 0.0001418293073362297, "loss": 0.5663, "step": 80060 }, { "epoch": 1.7821848290598292, "grad_norm": 0.5067920088768005, "learning_rate": 0.00014178470227314133, "loss": 0.5297, "step": 80070 }, { "epoch": 1.7824074074074074, "grad_norm": 0.6199058890342712, "learning_rate": 0.00014174010037351983, "loss": 0.569, "step": 80080 }, { "epoch": 1.7826299857549857, "grad_norm": 0.4189685881137848, "learning_rate": 0.00014169550163978896, "loss": 0.377, "step": 80090 }, { "epoch": 1.782852564102564, "grad_norm": 0.3900449275970459, "learning_rate": 0.0001416509060743723, "loss": 0.5089, "step": 80100 }, { "epoch": 1.7830751424501425, "grad_norm": 0.6637765765190125, "learning_rate": 0.00014160631367969313, "loss": 0.5911, "step": 80110 }, { "epoch": 1.7832977207977208, "grad_norm": 0.9584378004074097, "learning_rate": 0.00014156172445817467, "loss": 0.4981, "step": 80120 }, { "epoch": 1.7835202991452992, "grad_norm": 0.7218378186225891, "learning_rate": 0.00014151713841224, "loss": 0.5753, "step": 80130 }, { "epoch": 1.7837428774928776, "grad_norm": 0.5870119333267212, "learning_rate": 0.00014147255554431185, "loss": 0.5396, "step": 80140 }, { "epoch": 1.7839654558404558, "grad_norm": 0.4775555431842804, "learning_rate": 0.00014142797585681293, "loss": 0.5523, "step": 80150 }, { "epoch": 1.784188034188034, "grad_norm": 0.4715177118778229, "learning_rate": 0.00014138339935216584, "loss": 0.4691, "step": 80160 }, { "epoch": 1.7844106125356125, "grad_norm": 0.4988359808921814, "learning_rate": 0.00014133882603279273, "loss": 0.451, "step": 80170 }, { "epoch": 1.784633190883191, "grad_norm": 0.5942032933235168, "learning_rate": 0.00014129425590111584, "loss": 0.625, "step": 80180 }, { "epoch": 1.7848557692307692, "grad_norm": 0.40297701954841614, "learning_rate": 0.00014124968895955719, "loss": 0.5377, "step": 80190 }, { "epoch": 1.7850783475783476, "grad_norm": 0.5702888369560242, "learning_rate": 0.0001412051252105386, "loss": 0.5224, "step": 80200 }, { "epoch": 1.785300925925926, "grad_norm": 0.5079876780509949, "learning_rate": 0.00014116056465648152, "loss": 0.5006, "step": 80210 }, { "epoch": 1.7855235042735043, "grad_norm": 0.8400496244430542, "learning_rate": 0.00014111600729980756, "loss": 0.5151, "step": 80220 }, { "epoch": 1.7857460826210825, "grad_norm": 0.7045161724090576, "learning_rate": 0.00014107145314293796, "loss": 0.4752, "step": 80230 }, { "epoch": 1.785968660968661, "grad_norm": 0.58415687084198, "learning_rate": 0.00014102690218829387, "loss": 0.4254, "step": 80240 }, { "epoch": 1.7861912393162394, "grad_norm": 0.7725820541381836, "learning_rate": 0.00014098235443829615, "loss": 0.5807, "step": 80250 }, { "epoch": 1.7864138176638176, "grad_norm": 0.5273394584655762, "learning_rate": 0.0001409378098953656, "loss": 0.6497, "step": 80260 }, { "epoch": 1.786636396011396, "grad_norm": 0.32381510734558105, "learning_rate": 0.00014089326856192287, "loss": 0.5428, "step": 80270 }, { "epoch": 1.7868589743589745, "grad_norm": 0.45702850818634033, "learning_rate": 0.00014084873044038825, "loss": 0.4426, "step": 80280 }, { "epoch": 1.7870815527065527, "grad_norm": 0.5660591125488281, "learning_rate": 0.00014080419553318206, "loss": 0.4756, "step": 80290 }, { "epoch": 1.787304131054131, "grad_norm": 0.4793355166912079, "learning_rate": 0.00014075966384272437, "loss": 0.5081, "step": 80300 }, { "epoch": 1.7875267094017095, "grad_norm": 0.47678154706954956, "learning_rate": 0.000140715135371435, "loss": 0.5122, "step": 80310 }, { "epoch": 1.7877492877492878, "grad_norm": 0.6590827703475952, "learning_rate": 0.0001406706101217337, "loss": 0.4629, "step": 80320 }, { "epoch": 1.787971866096866, "grad_norm": 0.528451144695282, "learning_rate": 0.00014062608809604013, "loss": 0.5777, "step": 80330 }, { "epoch": 1.7881944444444444, "grad_norm": 0.41150301694869995, "learning_rate": 0.0001405815692967735, "loss": 0.5946, "step": 80340 }, { "epoch": 1.7884170227920229, "grad_norm": 0.46680572628974915, "learning_rate": 0.00014053705372635297, "loss": 0.4817, "step": 80350 }, { "epoch": 1.788639601139601, "grad_norm": 0.4094049334526062, "learning_rate": 0.00014049254138719764, "loss": 0.4549, "step": 80360 }, { "epoch": 1.7888621794871795, "grad_norm": 0.5676484107971191, "learning_rate": 0.00014044803228172628, "loss": 0.6292, "step": 80370 }, { "epoch": 1.789084757834758, "grad_norm": 0.42871958017349243, "learning_rate": 0.00014040352641235768, "loss": 0.4641, "step": 80380 }, { "epoch": 1.7893073361823362, "grad_norm": 0.4868099093437195, "learning_rate": 0.00014035902378151018, "loss": 0.5832, "step": 80390 }, { "epoch": 1.7895299145299144, "grad_norm": 0.6399804353713989, "learning_rate": 0.00014031452439160216, "loss": 0.6641, "step": 80400 }, { "epoch": 1.7897524928774928, "grad_norm": 0.36850976943969727, "learning_rate": 0.0001402700282450518, "loss": 0.5838, "step": 80410 }, { "epoch": 1.7899750712250713, "grad_norm": 0.42611685395240784, "learning_rate": 0.00014022553534427697, "loss": 0.4689, "step": 80420 }, { "epoch": 1.7901976495726495, "grad_norm": 0.5654397010803223, "learning_rate": 0.00014018104569169543, "loss": 0.45, "step": 80430 }, { "epoch": 1.790420227920228, "grad_norm": 0.5701908469200134, "learning_rate": 0.00014013655928972493, "loss": 0.5506, "step": 80440 }, { "epoch": 1.7906428062678064, "grad_norm": 0.28367915749549866, "learning_rate": 0.00014009207614078272, "loss": 0.592, "step": 80450 }, { "epoch": 1.7908653846153846, "grad_norm": 0.5797436237335205, "learning_rate": 0.00014004759624728624, "loss": 0.511, "step": 80460 }, { "epoch": 1.7910879629629628, "grad_norm": 0.4902498722076416, "learning_rate": 0.00014000311961165243, "loss": 0.4024, "step": 80470 }, { "epoch": 1.7913105413105415, "grad_norm": 0.5887957215309143, "learning_rate": 0.0001399586462362982, "loss": 0.6255, "step": 80480 }, { "epoch": 1.7915331196581197, "grad_norm": 0.4187609851360321, "learning_rate": 0.00013991417612364024, "loss": 0.425, "step": 80490 }, { "epoch": 1.791755698005698, "grad_norm": 0.9703378677368164, "learning_rate": 0.00013986970927609514, "loss": 0.6751, "step": 80500 }, { "epoch": 1.7919782763532763, "grad_norm": 0.46224939823150635, "learning_rate": 0.0001398252456960793, "loss": 0.496, "step": 80510 }, { "epoch": 1.7922008547008548, "grad_norm": 0.6721695065498352, "learning_rate": 0.00013978078538600888, "loss": 0.5485, "step": 80520 }, { "epoch": 1.792423433048433, "grad_norm": 0.5210494995117188, "learning_rate": 0.00013973632834829984, "loss": 0.613, "step": 80530 }, { "epoch": 1.7926460113960114, "grad_norm": 0.6934546828269958, "learning_rate": 0.00013969187458536805, "loss": 0.6063, "step": 80540 }, { "epoch": 1.7928685897435899, "grad_norm": 0.7616796493530273, "learning_rate": 0.0001396474240996292, "loss": 0.6277, "step": 80550 }, { "epoch": 1.793091168091168, "grad_norm": 0.46376466751098633, "learning_rate": 0.0001396029768934987, "loss": 0.46, "step": 80560 }, { "epoch": 1.7933137464387463, "grad_norm": 0.5162578821182251, "learning_rate": 0.0001395585329693919, "loss": 0.5241, "step": 80570 }, { "epoch": 1.7935363247863247, "grad_norm": 0.9000981450080872, "learning_rate": 0.00013951409232972392, "loss": 0.5347, "step": 80580 }, { "epoch": 1.7937589031339032, "grad_norm": 0.6056053042411804, "learning_rate": 0.0001394696549769097, "loss": 0.4556, "step": 80590 }, { "epoch": 1.7939814814814814, "grad_norm": 0.6429079174995422, "learning_rate": 0.00013942522091336392, "loss": 0.5873, "step": 80600 }, { "epoch": 1.7942040598290598, "grad_norm": 0.6571381092071533, "learning_rate": 0.00013938079014150123, "loss": 0.4782, "step": 80610 }, { "epoch": 1.7944266381766383, "grad_norm": 0.3865581154823303, "learning_rate": 0.00013933636266373606, "loss": 0.5661, "step": 80620 }, { "epoch": 1.7946492165242165, "grad_norm": 0.4678865373134613, "learning_rate": 0.00013929193848248255, "loss": 0.4541, "step": 80630 }, { "epoch": 1.7948717948717947, "grad_norm": 0.4663577973842621, "learning_rate": 0.00013924751760015482, "loss": 0.5383, "step": 80640 }, { "epoch": 1.7950943732193734, "grad_norm": 0.5332295894622803, "learning_rate": 0.0001392031000191668, "loss": 0.5486, "step": 80650 }, { "epoch": 1.7953169515669516, "grad_norm": 0.5024805665016174, "learning_rate": 0.00013915868574193197, "loss": 0.5637, "step": 80660 }, { "epoch": 1.7955395299145298, "grad_norm": 0.5011512637138367, "learning_rate": 0.00013911427477086402, "loss": 0.5621, "step": 80670 }, { "epoch": 1.7957621082621082, "grad_norm": 0.50482577085495, "learning_rate": 0.0001390698671083762, "loss": 0.5181, "step": 80680 }, { "epoch": 1.7959846866096867, "grad_norm": 0.4566592276096344, "learning_rate": 0.00013902546275688173, "loss": 0.5264, "step": 80690 }, { "epoch": 1.796207264957265, "grad_norm": 0.6772483587265015, "learning_rate": 0.00013898106171879348, "loss": 0.5465, "step": 80700 }, { "epoch": 1.7964298433048433, "grad_norm": 0.6919073462486267, "learning_rate": 0.0001389366639965243, "loss": 0.5014, "step": 80710 }, { "epoch": 1.7966524216524218, "grad_norm": 1.044856071472168, "learning_rate": 0.0001388922695924869, "loss": 0.5896, "step": 80720 }, { "epoch": 1.796875, "grad_norm": 0.691770613193512, "learning_rate": 0.0001388478785090935, "loss": 0.4374, "step": 80730 }, { "epoch": 1.7970975783475782, "grad_norm": 0.6516551375389099, "learning_rate": 0.00013880349074875642, "loss": 0.5147, "step": 80740 }, { "epoch": 1.7973201566951567, "grad_norm": 0.5089389681816101, "learning_rate": 0.00013875910631388775, "loss": 0.4736, "step": 80750 }, { "epoch": 1.797542735042735, "grad_norm": 0.5509934425354004, "learning_rate": 0.00013871472520689943, "loss": 0.3939, "step": 80760 }, { "epoch": 1.7977653133903133, "grad_norm": 0.7316688299179077, "learning_rate": 0.00013867034743020304, "loss": 0.5217, "step": 80770 }, { "epoch": 1.7979878917378918, "grad_norm": 0.44678255915641785, "learning_rate": 0.00013862597298621023, "loss": 0.445, "step": 80780 }, { "epoch": 1.7982104700854702, "grad_norm": 0.6435457468032837, "learning_rate": 0.0001385816018773323, "loss": 0.5519, "step": 80790 }, { "epoch": 1.7984330484330484, "grad_norm": 0.6044391989707947, "learning_rate": 0.00013853723410598033, "loss": 0.6251, "step": 80800 }, { "epoch": 1.7986556267806266, "grad_norm": 0.6454141139984131, "learning_rate": 0.00013849286967456542, "loss": 0.6475, "step": 80810 }, { "epoch": 1.7988782051282053, "grad_norm": 0.596731960773468, "learning_rate": 0.00013844850858549837, "loss": 0.5181, "step": 80820 }, { "epoch": 1.7991007834757835, "grad_norm": 0.5489130616188049, "learning_rate": 0.0001384041508411897, "loss": 0.6074, "step": 80830 }, { "epoch": 1.7993233618233617, "grad_norm": 0.9071128964424133, "learning_rate": 0.00013835979644404988, "loss": 0.4769, "step": 80840 }, { "epoch": 1.7995459401709402, "grad_norm": 0.5657101273536682, "learning_rate": 0.0001383154453964893, "loss": 0.4991, "step": 80850 }, { "epoch": 1.7997685185185186, "grad_norm": 0.6993838548660278, "learning_rate": 0.0001382710977009178, "loss": 0.5707, "step": 80860 }, { "epoch": 1.7999910968660968, "grad_norm": 0.5687394142150879, "learning_rate": 0.0001382267533597454, "loss": 0.4844, "step": 80870 }, { "epoch": 1.8002136752136753, "grad_norm": 0.4542628228664398, "learning_rate": 0.00013818241237538182, "loss": 0.5811, "step": 80880 }, { "epoch": 1.8002136752136753, "eval_loss": 0.5486223101615906, "eval_runtime": 337.5274, "eval_samples_per_second": 7.007, "eval_steps_per_second": 7.007, "step": 80880 }, { "epoch": 1.8004362535612537, "grad_norm": 0.5703278183937073, "learning_rate": 0.0001381380747502365, "loss": 0.457, "step": 80890 }, { "epoch": 1.800658831908832, "grad_norm": 0.3732733428478241, "learning_rate": 0.00013809374048671892, "loss": 0.5358, "step": 80900 }, { "epoch": 1.8008814102564101, "grad_norm": 0.49211806058883667, "learning_rate": 0.0001380494095872381, "loss": 0.5763, "step": 80910 }, { "epoch": 1.8011039886039886, "grad_norm": 0.6078943014144897, "learning_rate": 0.0001380050820542031, "loss": 0.6499, "step": 80920 }, { "epoch": 1.801326566951567, "grad_norm": 0.3904814124107361, "learning_rate": 0.0001379607578900227, "loss": 0.5125, "step": 80930 }, { "epoch": 1.8015491452991452, "grad_norm": 0.43924203515052795, "learning_rate": 0.0001379164370971055, "loss": 0.5847, "step": 80940 }, { "epoch": 1.8017717236467237, "grad_norm": 0.5108456015586853, "learning_rate": 0.0001378721196778599, "loss": 0.5818, "step": 80950 }, { "epoch": 1.801994301994302, "grad_norm": 0.5348002314567566, "learning_rate": 0.00013782780563469422, "loss": 0.5172, "step": 80960 }, { "epoch": 1.8022168803418803, "grad_norm": 0.6139683127403259, "learning_rate": 0.00013778349497001646, "loss": 0.4641, "step": 80970 }, { "epoch": 1.8024394586894585, "grad_norm": 0.5597288608551025, "learning_rate": 0.00013773918768623456, "loss": 0.4787, "step": 80980 }, { "epoch": 1.8026620370370372, "grad_norm": 0.6026448607444763, "learning_rate": 0.0001376948837857561, "loss": 0.5414, "step": 80990 }, { "epoch": 1.8028846153846154, "grad_norm": 0.9139476418495178, "learning_rate": 0.00013765058327098873, "loss": 0.6162, "step": 81000 }, { "epoch": 1.8031071937321936, "grad_norm": 0.35381126403808594, "learning_rate": 0.0001376062861443397, "loss": 0.4206, "step": 81010 }, { "epoch": 1.803329772079772, "grad_norm": 0.5708917379379272, "learning_rate": 0.0001375619924082161, "loss": 0.5939, "step": 81020 }, { "epoch": 1.8035523504273505, "grad_norm": 0.7910793423652649, "learning_rate": 0.00013751770206502494, "loss": 0.396, "step": 81030 }, { "epoch": 1.8037749287749287, "grad_norm": 0.4806177020072937, "learning_rate": 0.00013747341511717305, "loss": 0.5387, "step": 81040 }, { "epoch": 1.8039975071225072, "grad_norm": 0.5476149320602417, "learning_rate": 0.00013742913156706695, "loss": 0.5224, "step": 81050 }, { "epoch": 1.8042200854700856, "grad_norm": 0.6252416372299194, "learning_rate": 0.00013738485141711303, "loss": 0.4524, "step": 81060 }, { "epoch": 1.8044426638176638, "grad_norm": 0.6658093333244324, "learning_rate": 0.0001373405746697176, "loss": 0.531, "step": 81070 }, { "epoch": 1.804665242165242, "grad_norm": 0.5023934245109558, "learning_rate": 0.0001372963013272866, "loss": 0.5644, "step": 81080 }, { "epoch": 1.8048878205128205, "grad_norm": 0.6625211238861084, "learning_rate": 0.00013725203139222593, "loss": 0.6344, "step": 81090 }, { "epoch": 1.805110398860399, "grad_norm": 0.3682522475719452, "learning_rate": 0.0001372077648669413, "loss": 0.5456, "step": 81100 }, { "epoch": 1.8053329772079771, "grad_norm": 0.5537610650062561, "learning_rate": 0.00013716350175383806, "loss": 0.5544, "step": 81110 }, { "epoch": 1.8055555555555556, "grad_norm": 0.6162594556808472, "learning_rate": 0.00013711924205532164, "loss": 0.5939, "step": 81120 }, { "epoch": 1.805778133903134, "grad_norm": 0.49779462814331055, "learning_rate": 0.000137074985773797, "loss": 0.6856, "step": 81130 }, { "epoch": 1.8060007122507122, "grad_norm": 0.8165244460105896, "learning_rate": 0.00013703073291166916, "loss": 0.5924, "step": 81140 }, { "epoch": 1.8062232905982905, "grad_norm": 0.6566153764724731, "learning_rate": 0.0001369864834713429, "loss": 0.5529, "step": 81150 }, { "epoch": 1.806445868945869, "grad_norm": 0.6264129281044006, "learning_rate": 0.00013694223745522267, "loss": 0.5535, "step": 81160 }, { "epoch": 1.8066684472934473, "grad_norm": 0.44103971123695374, "learning_rate": 0.0001368979948657129, "loss": 0.4672, "step": 81170 }, { "epoch": 1.8068910256410255, "grad_norm": 0.4332234263420105, "learning_rate": 0.00013685375570521774, "loss": 0.5054, "step": 81180 }, { "epoch": 1.807113603988604, "grad_norm": 0.3906761407852173, "learning_rate": 0.00013680951997614116, "loss": 0.4347, "step": 81190 }, { "epoch": 1.8073361823361824, "grad_norm": 0.4635500907897949, "learning_rate": 0.00013676528768088702, "loss": 0.522, "step": 81200 }, { "epoch": 1.8075587606837606, "grad_norm": 0.4586428701877594, "learning_rate": 0.00013672105882185892, "loss": 0.4188, "step": 81210 }, { "epoch": 1.807781339031339, "grad_norm": 0.8698287010192871, "learning_rate": 0.00013667683340146025, "loss": 0.554, "step": 81220 }, { "epoch": 1.8080039173789175, "grad_norm": 0.6498838663101196, "learning_rate": 0.0001366326114220943, "loss": 0.6529, "step": 81230 }, { "epoch": 1.8082264957264957, "grad_norm": 0.5619851350784302, "learning_rate": 0.00013658839288616415, "loss": 0.5616, "step": 81240 }, { "epoch": 1.808449074074074, "grad_norm": 0.5245595574378967, "learning_rate": 0.00013654417779607268, "loss": 0.4498, "step": 81250 }, { "epoch": 1.8086716524216524, "grad_norm": 0.5840174555778503, "learning_rate": 0.0001364999661542225, "loss": 0.4625, "step": 81260 }, { "epoch": 1.8088942307692308, "grad_norm": 0.5450571179389954, "learning_rate": 0.00013645575796301612, "loss": 0.5568, "step": 81270 }, { "epoch": 1.809116809116809, "grad_norm": 0.46242818236351013, "learning_rate": 0.00013641155322485586, "loss": 0.583, "step": 81280 }, { "epoch": 1.8093393874643875, "grad_norm": 0.6096475720405579, "learning_rate": 0.0001363673519421439, "loss": 0.5224, "step": 81290 }, { "epoch": 1.809561965811966, "grad_norm": 0.6651652455329895, "learning_rate": 0.00013632315411728208, "loss": 0.5991, "step": 81300 }, { "epoch": 1.8097845441595442, "grad_norm": 0.45310983061790466, "learning_rate": 0.0001362789597526722, "loss": 0.4822, "step": 81310 }, { "epoch": 1.8100071225071224, "grad_norm": 0.5300917625427246, "learning_rate": 0.00013623476885071586, "loss": 0.47, "step": 81320 }, { "epoch": 1.8102297008547008, "grad_norm": 0.6232476830482483, "learning_rate": 0.00013619058141381435, "loss": 0.5301, "step": 81330 }, { "epoch": 1.8104522792022792, "grad_norm": 0.8016089200973511, "learning_rate": 0.00013614639744436883, "loss": 0.6121, "step": 81340 }, { "epoch": 1.8106748575498575, "grad_norm": 0.526396632194519, "learning_rate": 0.00013610221694478042, "loss": 0.5645, "step": 81350 }, { "epoch": 1.810897435897436, "grad_norm": 0.43031802773475647, "learning_rate": 0.00013605803991744982, "loss": 0.6062, "step": 81360 }, { "epoch": 1.8111200142450143, "grad_norm": 0.6098934412002563, "learning_rate": 0.00013601386636477768, "loss": 0.6972, "step": 81370 }, { "epoch": 1.8113425925925926, "grad_norm": 0.6424472332000732, "learning_rate": 0.00013596969628916448, "loss": 0.6723, "step": 81380 }, { "epoch": 1.8115651709401708, "grad_norm": 0.4741729497909546, "learning_rate": 0.00013592552969301033, "loss": 0.5176, "step": 81390 }, { "epoch": 1.8117877492877494, "grad_norm": 0.4829888641834259, "learning_rate": 0.0001358813665787154, "loss": 0.5935, "step": 81400 }, { "epoch": 1.8120103276353277, "grad_norm": 0.5409865379333496, "learning_rate": 0.00013583720694867942, "loss": 0.5528, "step": 81410 }, { "epoch": 1.8122329059829059, "grad_norm": 0.6868401169776917, "learning_rate": 0.0001357930508053022, "loss": 0.5798, "step": 81420 }, { "epoch": 1.8124554843304843, "grad_norm": 0.6660972833633423, "learning_rate": 0.00013574889815098318, "loss": 0.6272, "step": 81430 }, { "epoch": 1.8126780626780628, "grad_norm": 0.5210102200508118, "learning_rate": 0.00013570474898812158, "loss": 0.5527, "step": 81440 }, { "epoch": 1.812900641025641, "grad_norm": 0.6258218884468079, "learning_rate": 0.00013566060331911657, "loss": 0.6019, "step": 81450 }, { "epoch": 1.8131232193732194, "grad_norm": 0.5742608904838562, "learning_rate": 0.00013561646114636705, "loss": 0.5425, "step": 81460 }, { "epoch": 1.8133457977207978, "grad_norm": 0.848927915096283, "learning_rate": 0.00013557232247227174, "loss": 0.582, "step": 81470 }, { "epoch": 1.813568376068376, "grad_norm": 0.4526084363460541, "learning_rate": 0.00013552818729922915, "loss": 0.4612, "step": 81480 }, { "epoch": 1.8137909544159543, "grad_norm": 0.8034663796424866, "learning_rate": 0.00013548405562963768, "loss": 0.6144, "step": 81490 }, { "epoch": 1.8140135327635327, "grad_norm": 0.3783120810985565, "learning_rate": 0.00013543992746589543, "loss": 0.4725, "step": 81500 }, { "epoch": 1.8142361111111112, "grad_norm": 0.6843919157981873, "learning_rate": 0.00013539580281040042, "loss": 0.533, "step": 81510 }, { "epoch": 1.8144586894586894, "grad_norm": 0.5809860825538635, "learning_rate": 0.0001353516816655503, "loss": 0.4795, "step": 81520 }, { "epoch": 1.8146812678062678, "grad_norm": 0.5024617314338684, "learning_rate": 0.00013530756403374274, "loss": 0.5093, "step": 81530 }, { "epoch": 1.8149038461538463, "grad_norm": 0.7193806171417236, "learning_rate": 0.00013526344991737513, "loss": 0.6808, "step": 81540 }, { "epoch": 1.8151264245014245, "grad_norm": 0.6356869339942932, "learning_rate": 0.00013521933931884462, "loss": 0.5459, "step": 81550 }, { "epoch": 1.8153490028490027, "grad_norm": 0.4601297676563263, "learning_rate": 0.00013517523224054824, "loss": 0.6181, "step": 81560 }, { "epoch": 1.8155715811965814, "grad_norm": 0.42826223373413086, "learning_rate": 0.00013513112868488285, "loss": 0.5506, "step": 81570 }, { "epoch": 1.8157941595441596, "grad_norm": 0.5442543625831604, "learning_rate": 0.00013508702865424498, "loss": 0.5424, "step": 81580 }, { "epoch": 1.8160167378917378, "grad_norm": 0.772955060005188, "learning_rate": 0.00013504293215103113, "loss": 0.6139, "step": 81590 }, { "epoch": 1.8162393162393162, "grad_norm": 0.5679219961166382, "learning_rate": 0.00013499883917763756, "loss": 0.5283, "step": 81600 }, { "epoch": 1.8164618945868947, "grad_norm": 0.5960325002670288, "learning_rate": 0.00013495474973646023, "loss": 0.6502, "step": 81610 }, { "epoch": 1.8166844729344729, "grad_norm": 0.376598060131073, "learning_rate": 0.00013491066382989505, "loss": 0.5098, "step": 81620 }, { "epoch": 1.8169070512820513, "grad_norm": 0.4028322398662567, "learning_rate": 0.00013486658146033773, "loss": 0.5193, "step": 81630 }, { "epoch": 1.8171296296296298, "grad_norm": 0.4078958034515381, "learning_rate": 0.00013482250263018372, "loss": 0.5282, "step": 81640 }, { "epoch": 1.817352207977208, "grad_norm": 0.5620450973510742, "learning_rate": 0.00013477842734182821, "loss": 0.6942, "step": 81650 }, { "epoch": 1.8175747863247862, "grad_norm": 0.6660060882568359, "learning_rate": 0.00013473435559766634, "loss": 0.4954, "step": 81660 }, { "epoch": 1.8177973646723646, "grad_norm": 0.7021716833114624, "learning_rate": 0.00013469028740009306, "loss": 0.634, "step": 81670 }, { "epoch": 1.818019943019943, "grad_norm": 0.45811787247657776, "learning_rate": 0.000134646222751503, "loss": 0.5591, "step": 81680 }, { "epoch": 1.8182425213675213, "grad_norm": 0.8208920955657959, "learning_rate": 0.00013460216165429067, "loss": 0.5028, "step": 81690 }, { "epoch": 1.8184650997150997, "grad_norm": 0.4805906414985657, "learning_rate": 0.00013455810411085043, "loss": 0.5842, "step": 81700 }, { "epoch": 1.8186876780626782, "grad_norm": 0.3821577727794647, "learning_rate": 0.00013451405012357643, "loss": 0.6637, "step": 81710 }, { "epoch": 1.8189102564102564, "grad_norm": 0.64554363489151, "learning_rate": 0.0001344699996948625, "loss": 0.4815, "step": 81720 }, { "epoch": 1.8191328347578346, "grad_norm": 0.7134438157081604, "learning_rate": 0.00013442595282710243, "loss": 0.4481, "step": 81730 }, { "epoch": 1.8193554131054133, "grad_norm": 0.6993748545646667, "learning_rate": 0.00013438190952268984, "loss": 0.6328, "step": 81740 }, { "epoch": 1.8195779914529915, "grad_norm": 0.6261551380157471, "learning_rate": 0.00013433786978401792, "loss": 0.5424, "step": 81750 }, { "epoch": 1.8198005698005697, "grad_norm": 0.5403935313224792, "learning_rate": 0.00013429383361347993, "loss": 0.4991, "step": 81760 }, { "epoch": 1.8200231481481481, "grad_norm": 0.45395776629447937, "learning_rate": 0.0001342498010134689, "loss": 0.4634, "step": 81770 }, { "epoch": 1.8202457264957266, "grad_norm": 0.5860028266906738, "learning_rate": 0.00013420577198637745, "loss": 0.5032, "step": 81780 }, { "epoch": 1.8204683048433048, "grad_norm": 0.6436797976493835, "learning_rate": 0.00013416174653459818, "loss": 0.5357, "step": 81790 }, { "epoch": 1.8206908831908832, "grad_norm": 0.7727674245834351, "learning_rate": 0.0001341177246605235, "loss": 0.5628, "step": 81800 }, { "epoch": 1.8209134615384617, "grad_norm": 0.852611780166626, "learning_rate": 0.00013407370636654565, "loss": 0.5068, "step": 81810 }, { "epoch": 1.82113603988604, "grad_norm": 0.5791258215904236, "learning_rate": 0.0001340296916550565, "loss": 0.4513, "step": 81820 }, { "epoch": 1.821358618233618, "grad_norm": 0.6760326027870178, "learning_rate": 0.00013398568052844792, "loss": 0.5396, "step": 81830 }, { "epoch": 1.8215811965811965, "grad_norm": 0.7038044333457947, "learning_rate": 0.0001339416729891115, "loss": 0.6021, "step": 81840 }, { "epoch": 1.821803774928775, "grad_norm": 0.35636159777641296, "learning_rate": 0.00013389766903943871, "loss": 0.4685, "step": 81850 }, { "epoch": 1.8220263532763532, "grad_norm": 0.47771090269088745, "learning_rate": 0.00013385366868182063, "loss": 0.687, "step": 81860 }, { "epoch": 1.8222489316239316, "grad_norm": 0.6764662265777588, "learning_rate": 0.00013380967191864836, "loss": 0.5052, "step": 81870 }, { "epoch": 1.82247150997151, "grad_norm": 0.42575374245643616, "learning_rate": 0.00013376567875231273, "loss": 0.4958, "step": 81880 }, { "epoch": 1.8226940883190883, "grad_norm": 0.423395037651062, "learning_rate": 0.00013372168918520432, "loss": 0.4892, "step": 81890 }, { "epoch": 1.8229166666666665, "grad_norm": 0.3987009823322296, "learning_rate": 0.00013367770321971365, "loss": 0.5223, "step": 81900 }, { "epoch": 1.8231392450142452, "grad_norm": 0.736409604549408, "learning_rate": 0.00013363372085823077, "loss": 0.473, "step": 81910 }, { "epoch": 1.8233618233618234, "grad_norm": 0.522287905216217, "learning_rate": 0.00013358974210314593, "loss": 0.5072, "step": 81920 }, { "epoch": 1.8235844017094016, "grad_norm": 0.5287613272666931, "learning_rate": 0.00013354576695684877, "loss": 0.4859, "step": 81930 }, { "epoch": 1.82380698005698, "grad_norm": 0.9009698033332825, "learning_rate": 0.00013350179542172906, "loss": 0.5714, "step": 81940 }, { "epoch": 1.8240295584045585, "grad_norm": 0.5715592503547668, "learning_rate": 0.00013345782750017628, "loss": 0.5878, "step": 81950 }, { "epoch": 1.8242521367521367, "grad_norm": 0.6704865097999573, "learning_rate": 0.00013341386319457957, "loss": 0.6115, "step": 81960 }, { "epoch": 1.8244747150997151, "grad_norm": 0.4145774841308594, "learning_rate": 0.00013336990250732806, "loss": 0.429, "step": 81970 }, { "epoch": 1.8246972934472936, "grad_norm": 0.5823487043380737, "learning_rate": 0.00013332594544081058, "loss": 0.5456, "step": 81980 }, { "epoch": 1.8249198717948718, "grad_norm": 0.6845299005508423, "learning_rate": 0.00013328199199741584, "loss": 0.4696, "step": 81990 }, { "epoch": 1.82514245014245, "grad_norm": 0.4744720160961151, "learning_rate": 0.00013323804217953223, "loss": 0.5328, "step": 82000 }, { "epoch": 1.8253650284900285, "grad_norm": 0.4606737494468689, "learning_rate": 0.0001331940959895481, "loss": 0.3736, "step": 82010 }, { "epoch": 1.825587606837607, "grad_norm": 0.28650131821632385, "learning_rate": 0.00013315015342985146, "loss": 0.5077, "step": 82020 }, { "epoch": 1.8258101851851851, "grad_norm": 0.37976205348968506, "learning_rate": 0.0001331062145028303, "loss": 0.5773, "step": 82030 }, { "epoch": 1.8260327635327636, "grad_norm": 0.3755210041999817, "learning_rate": 0.00013306227921087212, "loss": 0.509, "step": 82040 }, { "epoch": 1.826255341880342, "grad_norm": 0.4783906638622284, "learning_rate": 0.00013301834755636446, "loss": 0.4079, "step": 82050 }, { "epoch": 1.8264779202279202, "grad_norm": 0.7430179119110107, "learning_rate": 0.00013297441954169468, "loss": 0.6565, "step": 82060 }, { "epoch": 1.8267004985754984, "grad_norm": 0.43394824862480164, "learning_rate": 0.00013293049516924978, "loss": 0.6075, "step": 82070 }, { "epoch": 1.8269230769230769, "grad_norm": 0.6090715527534485, "learning_rate": 0.0001328865744414167, "loss": 0.4517, "step": 82080 }, { "epoch": 1.8271456552706553, "grad_norm": 0.7434033751487732, "learning_rate": 0.0001328426573605821, "loss": 0.6067, "step": 82090 }, { "epoch": 1.8273682336182335, "grad_norm": 0.5030858516693115, "learning_rate": 0.00013279874392913246, "loss": 0.4758, "step": 82100 }, { "epoch": 1.827590811965812, "grad_norm": 0.47948139905929565, "learning_rate": 0.00013275483414945408, "loss": 0.566, "step": 82110 }, { "epoch": 1.8278133903133904, "grad_norm": 0.6807217001914978, "learning_rate": 0.0001327109280239331, "loss": 0.517, "step": 82120 }, { "epoch": 1.8280359686609686, "grad_norm": 0.6474286317825317, "learning_rate": 0.00013266702555495536, "loss": 0.5654, "step": 82130 }, { "epoch": 1.828258547008547, "grad_norm": 0.6399319767951965, "learning_rate": 0.00013262312674490657, "loss": 0.3978, "step": 82140 }, { "epoch": 1.8284811253561255, "grad_norm": 0.5145894289016724, "learning_rate": 0.0001325792315961722, "loss": 0.5827, "step": 82150 }, { "epoch": 1.8287037037037037, "grad_norm": 0.8448268175125122, "learning_rate": 0.00013253534011113766, "loss": 0.6026, "step": 82160 }, { "epoch": 1.828926282051282, "grad_norm": 0.6144095659255981, "learning_rate": 0.00013249145229218793, "loss": 0.4726, "step": 82170 }, { "epoch": 1.8291488603988604, "grad_norm": 0.3744725286960602, "learning_rate": 0.00013244756814170794, "loss": 0.4769, "step": 82180 }, { "epoch": 1.8293714387464388, "grad_norm": 0.4054800570011139, "learning_rate": 0.00013240368766208235, "loss": 0.4378, "step": 82190 }, { "epoch": 1.829594017094017, "grad_norm": 0.5873180627822876, "learning_rate": 0.0001323598108556958, "loss": 0.624, "step": 82200 }, { "epoch": 1.8298165954415955, "grad_norm": 0.3750841021537781, "learning_rate": 0.00013231593772493244, "loss": 0.5533, "step": 82210 }, { "epoch": 1.830039173789174, "grad_norm": 0.7284649014472961, "learning_rate": 0.00013227206827217642, "loss": 0.6229, "step": 82220 }, { "epoch": 1.8302617521367521, "grad_norm": 0.43646135926246643, "learning_rate": 0.0001322282024998117, "loss": 0.4732, "step": 82230 }, { "epoch": 1.8304843304843303, "grad_norm": 0.5836132168769836, "learning_rate": 0.0001321843404102219, "loss": 0.5659, "step": 82240 }, { "epoch": 1.8307069088319088, "grad_norm": 0.5374044179916382, "learning_rate": 0.00013214048200579058, "loss": 0.4631, "step": 82250 }, { "epoch": 1.8309294871794872, "grad_norm": 0.84559166431427, "learning_rate": 0.00013209662728890103, "loss": 0.6409, "step": 82260 }, { "epoch": 1.8311520655270654, "grad_norm": 0.5360352993011475, "learning_rate": 0.0001320527762619363, "loss": 0.6815, "step": 82270 }, { "epoch": 1.8313746438746439, "grad_norm": 0.5297994613647461, "learning_rate": 0.00013200892892727936, "loss": 0.5423, "step": 82280 }, { "epoch": 1.8315972222222223, "grad_norm": 0.6134427189826965, "learning_rate": 0.00013196508528731286, "loss": 0.5763, "step": 82290 }, { "epoch": 1.8318198005698005, "grad_norm": 0.5154188871383667, "learning_rate": 0.0001319212453444194, "loss": 0.5902, "step": 82300 }, { "epoch": 1.8320423789173788, "grad_norm": 0.47101709246635437, "learning_rate": 0.00013187740910098117, "loss": 0.5594, "step": 82310 }, { "epoch": 1.8322649572649574, "grad_norm": 0.6749807000160217, "learning_rate": 0.00013183357655938025, "loss": 0.5638, "step": 82320 }, { "epoch": 1.8324875356125356, "grad_norm": 0.6740586757659912, "learning_rate": 0.00013178974772199862, "loss": 0.5373, "step": 82330 }, { "epoch": 1.8327101139601139, "grad_norm": 0.47263529896736145, "learning_rate": 0.00013174592259121792, "loss": 0.6147, "step": 82340 }, { "epoch": 1.8329326923076923, "grad_norm": 0.5166828632354736, "learning_rate": 0.00013170210116941966, "loss": 0.5655, "step": 82350 }, { "epoch": 1.8331552706552707, "grad_norm": 0.6054937243461609, "learning_rate": 0.0001316582834589851, "loss": 0.5334, "step": 82360 }, { "epoch": 1.833377849002849, "grad_norm": 0.7472933530807495, "learning_rate": 0.00013161446946229543, "loss": 0.4991, "step": 82370 }, { "epoch": 1.8336004273504274, "grad_norm": 0.5342981219291687, "learning_rate": 0.00013157065918173143, "loss": 0.4818, "step": 82380 }, { "epoch": 1.8338230056980058, "grad_norm": 0.42199358344078064, "learning_rate": 0.00013152685261967384, "loss": 0.4548, "step": 82390 }, { "epoch": 1.834045584045584, "grad_norm": 0.9192740321159363, "learning_rate": 0.00013148304977850315, "loss": 0.4891, "step": 82400 }, { "epoch": 1.8342681623931623, "grad_norm": 0.6426318287849426, "learning_rate": 0.0001314392506605996, "loss": 0.4976, "step": 82410 }, { "epoch": 1.8344907407407407, "grad_norm": 0.8605575561523438, "learning_rate": 0.00013139545526834329, "loss": 0.4526, "step": 82420 }, { "epoch": 1.8347133190883191, "grad_norm": 0.3284328579902649, "learning_rate": 0.0001313516636041142, "loss": 0.481, "step": 82430 }, { "epoch": 1.8349358974358974, "grad_norm": 0.8184496760368347, "learning_rate": 0.0001313078756702918, "loss": 0.5942, "step": 82440 }, { "epoch": 1.8351584757834758, "grad_norm": 0.725690484046936, "learning_rate": 0.00013126409146925572, "loss": 0.5737, "step": 82450 }, { "epoch": 1.8353810541310542, "grad_norm": 0.9030028581619263, "learning_rate": 0.00013122031100338513, "loss": 0.5774, "step": 82460 }, { "epoch": 1.8356036324786325, "grad_norm": 0.635688304901123, "learning_rate": 0.0001311765342750592, "loss": 0.5737, "step": 82470 }, { "epoch": 1.8358262108262107, "grad_norm": 0.5312469601631165, "learning_rate": 0.0001311327612866567, "loss": 0.5476, "step": 82480 }, { "epoch": 1.8360487891737893, "grad_norm": 0.6173189878463745, "learning_rate": 0.00013108899204055633, "loss": 0.5478, "step": 82490 }, { "epoch": 1.8362713675213675, "grad_norm": 0.44116851687431335, "learning_rate": 0.00013104522653913656, "loss": 0.4731, "step": 82500 }, { "epoch": 1.8364939458689458, "grad_norm": 0.6289439797401428, "learning_rate": 0.0001310014647847756, "loss": 0.5242, "step": 82510 }, { "epoch": 1.8367165242165242, "grad_norm": 0.7131869792938232, "learning_rate": 0.00013095770677985156, "loss": 0.5566, "step": 82520 }, { "epoch": 1.8369391025641026, "grad_norm": 0.8671508431434631, "learning_rate": 0.00013091395252674227, "loss": 0.6793, "step": 82530 }, { "epoch": 1.8371616809116809, "grad_norm": 0.45645245909690857, "learning_rate": 0.00013087020202782532, "loss": 0.5947, "step": 82540 }, { "epoch": 1.8373842592592593, "grad_norm": 0.6090142726898193, "learning_rate": 0.0001308264552854782, "loss": 0.485, "step": 82550 }, { "epoch": 1.8376068376068377, "grad_norm": 0.666921079158783, "learning_rate": 0.00013078271230207818, "loss": 0.5573, "step": 82560 }, { "epoch": 1.837829415954416, "grad_norm": 0.5578299164772034, "learning_rate": 0.00013073897308000216, "loss": 0.4586, "step": 82570 }, { "epoch": 1.8380519943019942, "grad_norm": 0.48516398668289185, "learning_rate": 0.00013069523762162705, "loss": 0.5203, "step": 82580 }, { "epoch": 1.8382745726495726, "grad_norm": 0.47454801201820374, "learning_rate": 0.00013065150592932954, "loss": 0.6178, "step": 82590 }, { "epoch": 1.838497150997151, "grad_norm": 0.5279563069343567, "learning_rate": 0.00013060777800548586, "loss": 0.4358, "step": 82600 }, { "epoch": 1.8387197293447293, "grad_norm": 0.554388165473938, "learning_rate": 0.0001305640538524724, "loss": 0.471, "step": 82610 }, { "epoch": 1.8389423076923077, "grad_norm": 0.7549745440483093, "learning_rate": 0.00013052033347266508, "loss": 0.4871, "step": 82620 }, { "epoch": 1.8391648860398861, "grad_norm": 0.5078638195991516, "learning_rate": 0.0001304766168684397, "loss": 0.552, "step": 82630 }, { "epoch": 1.8393874643874644, "grad_norm": 0.6136375665664673, "learning_rate": 0.00013043290404217187, "loss": 0.4959, "step": 82640 }, { "epoch": 1.8396100427350426, "grad_norm": 0.8161970376968384, "learning_rate": 0.00013038919499623703, "loss": 0.625, "step": 82650 }, { "epoch": 1.8398326210826212, "grad_norm": 0.5410851240158081, "learning_rate": 0.00013034548973301028, "loss": 0.6344, "step": 82660 }, { "epoch": 1.8400551994301995, "grad_norm": 0.6233614087104797, "learning_rate": 0.00013030178825486664, "loss": 0.5575, "step": 82670 }, { "epoch": 1.8402777777777777, "grad_norm": 0.5429202318191528, "learning_rate": 0.0001302580905641809, "loss": 0.6325, "step": 82680 }, { "epoch": 1.8405003561253561, "grad_norm": 0.594391405582428, "learning_rate": 0.00013021439666332762, "loss": 0.5302, "step": 82690 }, { "epoch": 1.8407229344729346, "grad_norm": 0.5456381440162659, "learning_rate": 0.0001301707065546812, "loss": 0.6353, "step": 82700 }, { "epoch": 1.8409455128205128, "grad_norm": 0.524573802947998, "learning_rate": 0.00013012702024061564, "loss": 0.633, "step": 82710 }, { "epoch": 1.8411680911680912, "grad_norm": 0.3950157165527344, "learning_rate": 0.00013008333772350508, "loss": 0.4898, "step": 82720 }, { "epoch": 1.8413906695156697, "grad_norm": 0.6695838570594788, "learning_rate": 0.00013003965900572318, "loss": 0.5071, "step": 82730 }, { "epoch": 1.8416132478632479, "grad_norm": 0.701891303062439, "learning_rate": 0.00012999598408964342, "loss": 0.5664, "step": 82740 }, { "epoch": 1.841835826210826, "grad_norm": 0.5936694741249084, "learning_rate": 0.0001299523129776392, "loss": 0.5612, "step": 82750 }, { "epoch": 1.8420584045584045, "grad_norm": 0.4507628381252289, "learning_rate": 0.00012990864567208373, "loss": 0.5959, "step": 82760 }, { "epoch": 1.842280982905983, "grad_norm": 0.6318559050559998, "learning_rate": 0.00012986498217534975, "loss": 0.4675, "step": 82770 }, { "epoch": 1.8425035612535612, "grad_norm": 0.45048779249191284, "learning_rate": 0.00012982132248981005, "loss": 0.4773, "step": 82780 }, { "epoch": 1.8427261396011396, "grad_norm": 0.41977986693382263, "learning_rate": 0.00012977766661783718, "loss": 0.4448, "step": 82790 }, { "epoch": 1.842948717948718, "grad_norm": 0.42034912109375, "learning_rate": 0.00012973401456180337, "loss": 0.3574, "step": 82800 }, { "epoch": 1.8431712962962963, "grad_norm": 0.5503208041191101, "learning_rate": 0.0001296903663240807, "loss": 0.6054, "step": 82810 }, { "epoch": 1.8433938746438745, "grad_norm": 0.5614815354347229, "learning_rate": 0.0001296467219070412, "loss": 0.5596, "step": 82820 }, { "epoch": 1.843616452991453, "grad_norm": 0.5360153913497925, "learning_rate": 0.00012960308131305633, "loss": 0.5477, "step": 82830 }, { "epoch": 1.8438390313390314, "grad_norm": 0.5727353096008301, "learning_rate": 0.0001295594445444977, "loss": 0.4641, "step": 82840 }, { "epoch": 1.8440616096866096, "grad_norm": 0.48841726779937744, "learning_rate": 0.00012951581160373644, "loss": 0.5742, "step": 82850 }, { "epoch": 1.844284188034188, "grad_norm": 0.6567955017089844, "learning_rate": 0.00012947218249314372, "loss": 0.6564, "step": 82860 }, { "epoch": 1.8445067663817665, "grad_norm": 0.3863896429538727, "learning_rate": 0.00012942855721509037, "loss": 0.3935, "step": 82870 }, { "epoch": 1.8447293447293447, "grad_norm": 0.4754053056240082, "learning_rate": 0.00012938493577194696, "loss": 0.5936, "step": 82880 }, { "epoch": 1.8449519230769231, "grad_norm": 0.6173115968704224, "learning_rate": 0.00012934131816608394, "loss": 0.5695, "step": 82890 }, { "epoch": 1.8451745014245016, "grad_norm": 0.45684781670570374, "learning_rate": 0.00012929770439987155, "loss": 0.4782, "step": 82900 }, { "epoch": 1.8453970797720798, "grad_norm": 0.5181066393852234, "learning_rate": 0.0001292540944756798, "loss": 0.4371, "step": 82910 }, { "epoch": 1.845619658119658, "grad_norm": 0.6220055818557739, "learning_rate": 0.00012921048839587842, "loss": 0.7226, "step": 82920 }, { "epoch": 1.8458422364672364, "grad_norm": 0.7300344109535217, "learning_rate": 0.0001291668861628371, "loss": 0.5493, "step": 82930 }, { "epoch": 1.8460648148148149, "grad_norm": 0.3150061070919037, "learning_rate": 0.00012912328777892515, "loss": 0.486, "step": 82940 }, { "epoch": 1.846287393162393, "grad_norm": 0.5863127708435059, "learning_rate": 0.00012907969324651183, "loss": 0.5338, "step": 82950 }, { "epoch": 1.8465099715099715, "grad_norm": 0.7054229974746704, "learning_rate": 0.00012903610256796596, "loss": 0.4996, "step": 82960 }, { "epoch": 1.84673254985755, "grad_norm": 0.6429014205932617, "learning_rate": 0.00012899251574565642, "loss": 0.7624, "step": 82970 }, { "epoch": 1.8469551282051282, "grad_norm": 0.5570936799049377, "learning_rate": 0.00012894893278195165, "loss": 0.5169, "step": 82980 }, { "epoch": 1.8471777065527064, "grad_norm": 0.5311518907546997, "learning_rate": 0.00012890535367922007, "loss": 0.4844, "step": 82990 }, { "epoch": 1.8474002849002849, "grad_norm": 0.4749816954135895, "learning_rate": 0.00012886177843982974, "loss": 0.575, "step": 83000 }, { "epoch": 1.8476228632478633, "grad_norm": 0.4664008617401123, "learning_rate": 0.00012881820706614865, "loss": 0.4627, "step": 83010 }, { "epoch": 1.8478454415954415, "grad_norm": 0.5003736615180969, "learning_rate": 0.00012877463956054443, "loss": 0.6056, "step": 83020 }, { "epoch": 1.84806801994302, "grad_norm": 0.45148807764053345, "learning_rate": 0.0001287310759253846, "loss": 0.4996, "step": 83030 }, { "epoch": 1.8482905982905984, "grad_norm": 0.39340224862098694, "learning_rate": 0.0001286875161630365, "loss": 0.592, "step": 83040 }, { "epoch": 1.8485131766381766, "grad_norm": 0.6800395250320435, "learning_rate": 0.00012864396027586708, "loss": 0.4621, "step": 83050 }, { "epoch": 1.8487357549857548, "grad_norm": 0.4289543628692627, "learning_rate": 0.00012860040826624328, "loss": 0.5439, "step": 83060 }, { "epoch": 1.8489583333333335, "grad_norm": 0.4997618496417999, "learning_rate": 0.00012855686013653182, "loss": 0.5645, "step": 83070 }, { "epoch": 1.8491809116809117, "grad_norm": 0.423065721988678, "learning_rate": 0.00012851331588909908, "loss": 0.5694, "step": 83080 }, { "epoch": 1.84940349002849, "grad_norm": 0.5687001347541809, "learning_rate": 0.00012846977552631117, "loss": 0.495, "step": 83090 }, { "epoch": 1.8496260683760684, "grad_norm": 0.6185249090194702, "learning_rate": 0.00012842623905053423, "loss": 0.5242, "step": 83100 }, { "epoch": 1.8498486467236468, "grad_norm": 0.5709801316261292, "learning_rate": 0.00012838270646413412, "loss": 0.471, "step": 83110 }, { "epoch": 1.850071225071225, "grad_norm": 0.6331045031547546, "learning_rate": 0.00012833917776947632, "loss": 0.5447, "step": 83120 }, { "epoch": 1.8502938034188035, "grad_norm": 0.5902771949768066, "learning_rate": 0.00012829565296892625, "loss": 0.4946, "step": 83130 }, { "epoch": 1.850516381766382, "grad_norm": 0.5342350602149963, "learning_rate": 0.0001282521320648491, "loss": 0.6436, "step": 83140 }, { "epoch": 1.85073896011396, "grad_norm": 0.49461767077445984, "learning_rate": 0.0001282086150596099, "loss": 0.5505, "step": 83150 }, { "epoch": 1.8509615384615383, "grad_norm": 0.6072582602500916, "learning_rate": 0.00012816510195557326, "loss": 0.5282, "step": 83160 }, { "epoch": 1.8511841168091168, "grad_norm": 0.6011079549789429, "learning_rate": 0.0001281215927551038, "loss": 0.4349, "step": 83170 }, { "epoch": 1.8514066951566952, "grad_norm": 0.6158570051193237, "learning_rate": 0.00012807808746056584, "loss": 0.4527, "step": 83180 }, { "epoch": 1.8516292735042734, "grad_norm": 0.4972705543041229, "learning_rate": 0.00012803458607432347, "loss": 0.503, "step": 83190 }, { "epoch": 1.8518518518518519, "grad_norm": 0.7067302465438843, "learning_rate": 0.0001279910885987406, "loss": 0.5721, "step": 83200 }, { "epoch": 1.8520744301994303, "grad_norm": 0.6685699224472046, "learning_rate": 0.00012794759503618103, "loss": 0.5895, "step": 83210 }, { "epoch": 1.8522970085470085, "grad_norm": 0.4621349275112152, "learning_rate": 0.00012790410538900807, "loss": 0.4735, "step": 83220 }, { "epoch": 1.8525195868945867, "grad_norm": 0.6665029525756836, "learning_rate": 0.00012786061965958497, "loss": 0.5579, "step": 83230 }, { "epoch": 1.8527421652421654, "grad_norm": 0.9082955121994019, "learning_rate": 0.0001278171378502749, "loss": 0.5431, "step": 83240 }, { "epoch": 1.8529647435897436, "grad_norm": 0.7993310689926147, "learning_rate": 0.0001277736599634407, "loss": 0.5488, "step": 83250 }, { "epoch": 1.8531873219373218, "grad_norm": 0.7538060545921326, "learning_rate": 0.00012773018600144485, "loss": 0.511, "step": 83260 }, { "epoch": 1.8534099002849003, "grad_norm": 0.6340315341949463, "learning_rate": 0.00012768671596664988, "loss": 0.5297, "step": 83270 }, { "epoch": 1.8536324786324787, "grad_norm": 0.7901144027709961, "learning_rate": 0.000127643249861418, "loss": 0.514, "step": 83280 }, { "epoch": 1.853855056980057, "grad_norm": 0.43684589862823486, "learning_rate": 0.00012759978768811116, "loss": 0.5556, "step": 83290 }, { "epoch": 1.8540776353276354, "grad_norm": 0.6661888360977173, "learning_rate": 0.0001275563294490911, "loss": 0.5632, "step": 83300 }, { "epoch": 1.8543002136752138, "grad_norm": 0.5754364728927612, "learning_rate": 0.0001275128751467194, "loss": 0.5782, "step": 83310 }, { "epoch": 1.854522792022792, "grad_norm": 0.6119531989097595, "learning_rate": 0.00012746942478335745, "loss": 0.5541, "step": 83320 }, { "epoch": 1.8547453703703702, "grad_norm": 0.38876596093177795, "learning_rate": 0.00012742597836136628, "loss": 0.5523, "step": 83330 }, { "epoch": 1.8549679487179487, "grad_norm": 0.4897285997867584, "learning_rate": 0.00012738253588310685, "loss": 0.5975, "step": 83340 }, { "epoch": 1.8551905270655271, "grad_norm": 0.6104738116264343, "learning_rate": 0.00012733909735094, "loss": 0.5249, "step": 83350 }, { "epoch": 1.8554131054131053, "grad_norm": 0.5610323548316956, "learning_rate": 0.00012729566276722598, "loss": 0.6574, "step": 83360 }, { "epoch": 1.8556356837606838, "grad_norm": 0.24581007659435272, "learning_rate": 0.00012725223213432514, "loss": 0.5365, "step": 83370 }, { "epoch": 1.8558582621082622, "grad_norm": 0.5664445161819458, "learning_rate": 0.0001272088054545976, "loss": 0.4695, "step": 83380 }, { "epoch": 1.8560808404558404, "grad_norm": 0.35660311579704285, "learning_rate": 0.00012716538273040314, "loss": 0.4945, "step": 83390 }, { "epoch": 1.8563034188034186, "grad_norm": 0.7497749924659729, "learning_rate": 0.0001271219639641014, "loss": 0.6068, "step": 83400 }, { "epoch": 1.8565259971509973, "grad_norm": 0.4711231291294098, "learning_rate": 0.0001270785491580518, "loss": 0.5364, "step": 83410 }, { "epoch": 1.8567485754985755, "grad_norm": 0.5040110349655151, "learning_rate": 0.00012703513831461357, "loss": 0.5249, "step": 83420 }, { "epoch": 1.8569711538461537, "grad_norm": 0.5002251267433167, "learning_rate": 0.0001269917314361456, "loss": 0.4847, "step": 83430 }, { "epoch": 1.8571937321937322, "grad_norm": 0.466820627450943, "learning_rate": 0.00012694832852500672, "loss": 0.5891, "step": 83440 }, { "epoch": 1.8574163105413106, "grad_norm": 0.7309194207191467, "learning_rate": 0.0001269049295835555, "loss": 0.7185, "step": 83450 }, { "epoch": 1.8576388888888888, "grad_norm": 0.42788100242614746, "learning_rate": 0.00012686153461415026, "loss": 0.6125, "step": 83460 }, { "epoch": 1.8578614672364673, "grad_norm": 0.5784748196601868, "learning_rate": 0.00012681814361914908, "loss": 0.5353, "step": 83470 }, { "epoch": 1.8580840455840457, "grad_norm": 0.82913738489151, "learning_rate": 0.00012677475660090995, "loss": 0.5588, "step": 83480 }, { "epoch": 1.858306623931624, "grad_norm": 0.7082958221435547, "learning_rate": 0.0001267313735617904, "loss": 0.6553, "step": 83490 }, { "epoch": 1.8585292022792022, "grad_norm": 0.4604193866252899, "learning_rate": 0.00012668799450414806, "loss": 0.4829, "step": 83500 }, { "epoch": 1.8587517806267806, "grad_norm": 0.4414191246032715, "learning_rate": 0.00012664461943034006, "loss": 0.5907, "step": 83510 }, { "epoch": 1.858974358974359, "grad_norm": 0.44887977838516235, "learning_rate": 0.00012660124834272348, "loss": 0.5824, "step": 83520 }, { "epoch": 1.8591969373219372, "grad_norm": 0.3785727322101593, "learning_rate": 0.00012655788124365518, "loss": 0.5708, "step": 83530 }, { "epoch": 1.8594195156695157, "grad_norm": 0.5096050500869751, "learning_rate": 0.00012651451813549172, "loss": 0.5093, "step": 83540 }, { "epoch": 1.8596420940170941, "grad_norm": 0.4787435829639435, "learning_rate": 0.00012647115902058952, "loss": 0.4097, "step": 83550 }, { "epoch": 1.8598646723646723, "grad_norm": 0.5042093992233276, "learning_rate": 0.00012642780390130476, "loss": 0.6072, "step": 83560 }, { "epoch": 1.8600872507122506, "grad_norm": 0.6546964049339294, "learning_rate": 0.00012638445277999332, "loss": 0.5195, "step": 83570 }, { "epoch": 1.8602207977207978, "eval_loss": 0.5436797142028809, "eval_runtime": 337.4885, "eval_samples_per_second": 7.008, "eval_steps_per_second": 7.008, "step": 83576 }, { "epoch": 1.8603098290598292, "grad_norm": 0.5635590553283691, "learning_rate": 0.00012634110565901095, "loss": 0.6364, "step": 83580 }, { "epoch": 1.8605324074074074, "grad_norm": 0.5586998462677002, "learning_rate": 0.00012629776254071326, "loss": 0.5486, "step": 83590 }, { "epoch": 1.8607549857549857, "grad_norm": 0.5139333009719849, "learning_rate": 0.0001262544234274555, "loss": 0.5651, "step": 83600 }, { "epoch": 1.860977564102564, "grad_norm": 0.49953770637512207, "learning_rate": 0.00012621108832159276, "loss": 0.5109, "step": 83610 }, { "epoch": 1.8612001424501425, "grad_norm": 0.5323540568351746, "learning_rate": 0.00012616775722547986, "loss": 0.5728, "step": 83620 }, { "epoch": 1.8614227207977208, "grad_norm": 0.571060061454773, "learning_rate": 0.0001261244301414714, "loss": 0.4647, "step": 83630 }, { "epoch": 1.8616452991452992, "grad_norm": 0.585278332233429, "learning_rate": 0.00012608110707192197, "loss": 0.5556, "step": 83640 }, { "epoch": 1.8618678774928776, "grad_norm": 0.35864341259002686, "learning_rate": 0.00012603778801918564, "loss": 0.4576, "step": 83650 }, { "epoch": 1.8620904558404558, "grad_norm": 0.6041143536567688, "learning_rate": 0.00012599447298561648, "loss": 0.6435, "step": 83660 }, { "epoch": 1.862313034188034, "grad_norm": 0.5151081681251526, "learning_rate": 0.00012595116197356825, "loss": 0.4269, "step": 83670 }, { "epoch": 1.8625356125356125, "grad_norm": 0.515720784664154, "learning_rate": 0.00012590785498539447, "loss": 0.5999, "step": 83680 }, { "epoch": 1.862758190883191, "grad_norm": 0.4865381717681885, "learning_rate": 0.00012586455202344852, "loss": 0.4351, "step": 83690 }, { "epoch": 1.8629807692307692, "grad_norm": 0.7341486811637878, "learning_rate": 0.00012582125309008353, "loss": 0.5515, "step": 83700 }, { "epoch": 1.8632033475783476, "grad_norm": 0.6000564098358154, "learning_rate": 0.00012577795818765234, "loss": 0.5063, "step": 83710 }, { "epoch": 1.863425925925926, "grad_norm": 0.7252387404441833, "learning_rate": 0.00012573466731850765, "loss": 0.5685, "step": 83720 }, { "epoch": 1.8636485042735043, "grad_norm": 0.7386507987976074, "learning_rate": 0.00012569138048500194, "loss": 0.6376, "step": 83730 }, { "epoch": 1.8638710826210825, "grad_norm": 0.5711615681648254, "learning_rate": 0.00012564809768948754, "loss": 0.6018, "step": 83740 }, { "epoch": 1.864093660968661, "grad_norm": 0.4323315918445587, "learning_rate": 0.00012560481893431633, "loss": 0.4106, "step": 83750 }, { "epoch": 1.8643162393162394, "grad_norm": 0.5830208659172058, "learning_rate": 0.00012556154422184012, "loss": 0.5466, "step": 83760 }, { "epoch": 1.8645388176638176, "grad_norm": 0.5899856090545654, "learning_rate": 0.00012551827355441051, "loss": 0.5519, "step": 83770 }, { "epoch": 1.864761396011396, "grad_norm": 0.4912465512752533, "learning_rate": 0.00012547500693437893, "loss": 0.5965, "step": 83780 }, { "epoch": 1.8649839743589745, "grad_norm": 0.7850707173347473, "learning_rate": 0.00012543174436409647, "loss": 0.5518, "step": 83790 }, { "epoch": 1.8652065527065527, "grad_norm": 1.1132444143295288, "learning_rate": 0.0001253884858459141, "loss": 0.4395, "step": 83800 }, { "epoch": 1.865429131054131, "grad_norm": 0.5983325242996216, "learning_rate": 0.00012534523138218247, "loss": 0.5475, "step": 83810 }, { "epoch": 1.8656517094017095, "grad_norm": 0.5957872271537781, "learning_rate": 0.00012530198097525206, "loss": 0.5988, "step": 83820 }, { "epoch": 1.8658742877492878, "grad_norm": 0.4412519335746765, "learning_rate": 0.00012525873462747316, "loss": 0.4865, "step": 83830 }, { "epoch": 1.866096866096866, "grad_norm": 0.6750221252441406, "learning_rate": 0.00012521549234119586, "loss": 0.5586, "step": 83840 }, { "epoch": 1.8663194444444444, "grad_norm": 0.5277356505393982, "learning_rate": 0.00012517225411876984, "loss": 0.5821, "step": 83850 }, { "epoch": 1.8665420227920229, "grad_norm": 0.5814523100852966, "learning_rate": 0.00012512901996254483, "loss": 0.5374, "step": 83860 }, { "epoch": 1.866764601139601, "grad_norm": 0.4394824802875519, "learning_rate": 0.00012508578987487025, "loss": 0.6425, "step": 83870 }, { "epoch": 1.8669871794871795, "grad_norm": 0.49788931012153625, "learning_rate": 0.00012504256385809508, "loss": 0.5151, "step": 83880 }, { "epoch": 1.867209757834758, "grad_norm": 0.5545089840888977, "learning_rate": 0.0001249993419145684, "loss": 0.5008, "step": 83890 }, { "epoch": 1.8674323361823362, "grad_norm": 0.5493986010551453, "learning_rate": 0.0001249561240466388, "loss": 0.5409, "step": 83900 }, { "epoch": 1.8676549145299144, "grad_norm": 0.5672900080680847, "learning_rate": 0.0001249129102566549, "loss": 0.4545, "step": 83910 }, { "epoch": 1.8678774928774928, "grad_norm": 0.41749370098114014, "learning_rate": 0.00012486970054696497, "loss": 0.5166, "step": 83920 }, { "epoch": 1.8681000712250713, "grad_norm": 0.5397555828094482, "learning_rate": 0.00012482649491991693, "loss": 0.5845, "step": 83930 }, { "epoch": 1.8683226495726495, "grad_norm": 0.48144885897636414, "learning_rate": 0.0001247832933778587, "loss": 0.5117, "step": 83940 }, { "epoch": 1.868545227920228, "grad_norm": 0.6149954795837402, "learning_rate": 0.00012474009592313798, "loss": 0.6119, "step": 83950 }, { "epoch": 1.8687678062678064, "grad_norm": 0.6500161290168762, "learning_rate": 0.000124696902558102, "loss": 0.409, "step": 83960 }, { "epoch": 1.8689903846153846, "grad_norm": 0.3311348855495453, "learning_rate": 0.00012465371328509797, "loss": 0.458, "step": 83970 }, { "epoch": 1.8692129629629628, "grad_norm": 0.41198965907096863, "learning_rate": 0.00012461052810647285, "loss": 0.4788, "step": 83980 }, { "epoch": 1.8694355413105415, "grad_norm": 0.5640448927879333, "learning_rate": 0.00012456734702457335, "loss": 0.5883, "step": 83990 }, { "epoch": 1.8696581196581197, "grad_norm": 0.4277702569961548, "learning_rate": 0.00012452417004174603, "loss": 0.6195, "step": 84000 }, { "epoch": 1.869880698005698, "grad_norm": 0.37998971343040466, "learning_rate": 0.00012448099716033702, "loss": 0.4813, "step": 84010 }, { "epoch": 1.8701032763532763, "grad_norm": 0.8592267632484436, "learning_rate": 0.00012443782838269244, "loss": 0.5471, "step": 84020 }, { "epoch": 1.8703258547008548, "grad_norm": 0.6263825297355652, "learning_rate": 0.00012439466371115817, "loss": 0.3867, "step": 84030 }, { "epoch": 1.870548433048433, "grad_norm": 0.7152578234672546, "learning_rate": 0.00012435150314807973, "loss": 0.5774, "step": 84040 }, { "epoch": 1.8707710113960114, "grad_norm": 0.7144100666046143, "learning_rate": 0.00012430834669580252, "loss": 0.6186, "step": 84050 }, { "epoch": 1.8709935897435899, "grad_norm": 0.4694978892803192, "learning_rate": 0.0001242651943566718, "loss": 0.4516, "step": 84060 }, { "epoch": 1.871216168091168, "grad_norm": 0.34822237491607666, "learning_rate": 0.00012422204613303235, "loss": 0.5519, "step": 84070 }, { "epoch": 1.8714387464387463, "grad_norm": 0.6391999125480652, "learning_rate": 0.00012417890202722891, "loss": 0.5739, "step": 84080 }, { "epoch": 1.8716613247863247, "grad_norm": 1.0606963634490967, "learning_rate": 0.0001241357620416061, "loss": 0.468, "step": 84090 }, { "epoch": 1.8718839031339032, "grad_norm": 0.5437816977500916, "learning_rate": 0.00012409262617850804, "loss": 0.6617, "step": 84100 }, { "epoch": 1.8721064814814814, "grad_norm": 0.5535063147544861, "learning_rate": 0.0001240494944402788, "loss": 0.5088, "step": 84110 }, { "epoch": 1.8723290598290598, "grad_norm": 0.6068494915962219, "learning_rate": 0.00012400636682926224, "loss": 0.6898, "step": 84120 }, { "epoch": 1.8725516381766383, "grad_norm": 0.3118143081665039, "learning_rate": 0.00012396324334780192, "loss": 0.5705, "step": 84130 }, { "epoch": 1.8727742165242165, "grad_norm": 0.7818388342857361, "learning_rate": 0.00012392012399824122, "loss": 0.477, "step": 84140 }, { "epoch": 1.8729967948717947, "grad_norm": 0.4516705572605133, "learning_rate": 0.00012387700878292322, "loss": 0.5883, "step": 84150 }, { "epoch": 1.8732193732193734, "grad_norm": 0.7244430184364319, "learning_rate": 0.00012383389770419085, "loss": 0.5847, "step": 84160 }, { "epoch": 1.8734419515669516, "grad_norm": 0.24914613366127014, "learning_rate": 0.0001237907907643869, "loss": 0.4618, "step": 84170 }, { "epoch": 1.8736645299145298, "grad_norm": 0.646794855594635, "learning_rate": 0.00012374768796585372, "loss": 0.5783, "step": 84180 }, { "epoch": 1.8738871082621082, "grad_norm": 0.8252887725830078, "learning_rate": 0.0001237045893109336, "loss": 0.5806, "step": 84190 }, { "epoch": 1.8741096866096867, "grad_norm": 0.5684128999710083, "learning_rate": 0.00012366149480196858, "loss": 0.5365, "step": 84200 }, { "epoch": 1.874332264957265, "grad_norm": 0.46364259719848633, "learning_rate": 0.00012361840444130042, "loss": 0.4656, "step": 84210 }, { "epoch": 1.8745548433048433, "grad_norm": 0.45392075181007385, "learning_rate": 0.00012357531823127066, "loss": 0.5358, "step": 84220 }, { "epoch": 1.8747774216524218, "grad_norm": 0.7082318067550659, "learning_rate": 0.00012353223617422073, "loss": 0.5213, "step": 84230 }, { "epoch": 1.875, "grad_norm": 0.5095435976982117, "learning_rate": 0.00012348915827249167, "loss": 0.5009, "step": 84240 }, { "epoch": 1.8752225783475782, "grad_norm": 0.50666344165802, "learning_rate": 0.0001234460845284243, "loss": 0.4844, "step": 84250 }, { "epoch": 1.8754451566951567, "grad_norm": 0.5035162568092346, "learning_rate": 0.00012340301494435954, "loss": 0.5847, "step": 84260 }, { "epoch": 1.875667735042735, "grad_norm": 0.7025172710418701, "learning_rate": 0.00012335994952263757, "loss": 0.5392, "step": 84270 }, { "epoch": 1.8758903133903133, "grad_norm": 0.5457714200019836, "learning_rate": 0.00012331688826559863, "loss": 0.617, "step": 84280 }, { "epoch": 1.8761128917378918, "grad_norm": 0.6207228899002075, "learning_rate": 0.00012327383117558278, "loss": 0.4245, "step": 84290 }, { "epoch": 1.8763354700854702, "grad_norm": 0.7247689962387085, "learning_rate": 0.00012323077825492974, "loss": 0.64, "step": 84300 }, { "epoch": 1.8765580484330484, "grad_norm": 0.6851694583892822, "learning_rate": 0.00012318772950597911, "loss": 0.4342, "step": 84310 }, { "epoch": 1.8767806267806266, "grad_norm": 0.5423067212104797, "learning_rate": 0.0001231446849310701, "loss": 0.4896, "step": 84320 }, { "epoch": 1.8770032051282053, "grad_norm": 0.6308934092521667, "learning_rate": 0.00012310164453254184, "loss": 0.6464, "step": 84330 }, { "epoch": 1.8772257834757835, "grad_norm": 0.8702993392944336, "learning_rate": 0.0001230586083127332, "loss": 0.5344, "step": 84340 }, { "epoch": 1.8774483618233617, "grad_norm": 0.5256815552711487, "learning_rate": 0.0001230155762739827, "loss": 0.4322, "step": 84350 }, { "epoch": 1.8776709401709402, "grad_norm": 0.656808614730835, "learning_rate": 0.00012297254841862882, "loss": 0.5554, "step": 84360 }, { "epoch": 1.8778935185185186, "grad_norm": 0.38146379590034485, "learning_rate": 0.0001229295247490098, "loss": 0.4928, "step": 84370 }, { "epoch": 1.8781160968660968, "grad_norm": 0.6034631729125977, "learning_rate": 0.0001228865052674634, "loss": 0.5746, "step": 84380 }, { "epoch": 1.8783386752136753, "grad_norm": 0.5881955623626709, "learning_rate": 0.00012284348997632747, "loss": 0.4958, "step": 84390 }, { "epoch": 1.8785612535612537, "grad_norm": 0.5065589547157288, "learning_rate": 0.0001228004788779395, "loss": 0.5454, "step": 84400 }, { "epoch": 1.878783831908832, "grad_norm": 0.6091739535331726, "learning_rate": 0.0001227574719746367, "loss": 0.657, "step": 84410 }, { "epoch": 1.8790064102564101, "grad_norm": 0.3164239823818207, "learning_rate": 0.00012271446926875607, "loss": 0.4448, "step": 84420 }, { "epoch": 1.8792289886039886, "grad_norm": 0.6602497100830078, "learning_rate": 0.00012267147076263445, "loss": 0.5011, "step": 84430 }, { "epoch": 1.879451566951567, "grad_norm": 0.4034685790538788, "learning_rate": 0.00012262847645860839, "loss": 0.5238, "step": 84440 }, { "epoch": 1.8796741452991452, "grad_norm": 0.5005523562431335, "learning_rate": 0.0001225854863590143, "loss": 0.5152, "step": 84450 }, { "epoch": 1.8798967236467237, "grad_norm": 0.7055954933166504, "learning_rate": 0.00012254250046618822, "loss": 0.5278, "step": 84460 }, { "epoch": 1.880119301994302, "grad_norm": 0.5619649887084961, "learning_rate": 0.00012249951878246608, "loss": 0.4446, "step": 84470 }, { "epoch": 1.8803418803418803, "grad_norm": 0.657782793045044, "learning_rate": 0.00012245654131018356, "loss": 0.4602, "step": 84480 }, { "epoch": 1.8805644586894585, "grad_norm": 0.799344539642334, "learning_rate": 0.00012241356805167602, "loss": 0.4666, "step": 84490 }, { "epoch": 1.8807870370370372, "grad_norm": 0.5603383779525757, "learning_rate": 0.00012237059900927877, "loss": 0.4702, "step": 84500 }, { "epoch": 1.8810096153846154, "grad_norm": 0.7844343781471252, "learning_rate": 0.00012232763418532669, "loss": 0.5809, "step": 84510 }, { "epoch": 1.8812321937321936, "grad_norm": 0.5265217423439026, "learning_rate": 0.00012228467358215453, "loss": 0.5684, "step": 84520 }, { "epoch": 1.881454772079772, "grad_norm": 0.578589916229248, "learning_rate": 0.00012224171720209693, "loss": 0.4439, "step": 84530 }, { "epoch": 1.8816773504273505, "grad_norm": 0.4853005111217499, "learning_rate": 0.00012219876504748796, "loss": 0.5476, "step": 84540 }, { "epoch": 1.8818999287749287, "grad_norm": 0.444774329662323, "learning_rate": 0.00012215581712066186, "loss": 0.4716, "step": 84550 }, { "epoch": 1.8821225071225072, "grad_norm": 0.31034624576568604, "learning_rate": 0.00012211287342395234, "loss": 0.5194, "step": 84560 }, { "epoch": 1.8823450854700856, "grad_norm": 0.799504816532135, "learning_rate": 0.00012206993395969304, "loss": 0.4748, "step": 84570 }, { "epoch": 1.8825676638176638, "grad_norm": 0.45250698924064636, "learning_rate": 0.0001220269987302173, "loss": 0.6653, "step": 84580 }, { "epoch": 1.882790242165242, "grad_norm": 0.5559919476509094, "learning_rate": 0.00012198406773785835, "loss": 0.5582, "step": 84590 }, { "epoch": 1.8830128205128205, "grad_norm": 0.5852392315864563, "learning_rate": 0.00012194114098494898, "loss": 0.5398, "step": 84600 }, { "epoch": 1.883235398860399, "grad_norm": 1.573107123374939, "learning_rate": 0.0001218982184738219, "loss": 0.5588, "step": 84610 }, { "epoch": 1.8834579772079771, "grad_norm": 0.5588731169700623, "learning_rate": 0.0001218553002068096, "loss": 0.4883, "step": 84620 }, { "epoch": 1.8836805555555556, "grad_norm": 0.5660615563392639, "learning_rate": 0.00012181238618624422, "loss": 0.4707, "step": 84630 }, { "epoch": 1.883903133903134, "grad_norm": 0.6259063482284546, "learning_rate": 0.00012176947641445782, "loss": 0.4833, "step": 84640 }, { "epoch": 1.8841257122507122, "grad_norm": 0.735929548740387, "learning_rate": 0.00012172657089378212, "loss": 0.4643, "step": 84650 }, { "epoch": 1.8843482905982905, "grad_norm": 0.4203285574913025, "learning_rate": 0.00012168366962654866, "loss": 0.4736, "step": 84660 }, { "epoch": 1.884570868945869, "grad_norm": 0.4741728603839874, "learning_rate": 0.00012164077261508864, "loss": 0.4674, "step": 84670 }, { "epoch": 1.8847934472934473, "grad_norm": 0.3773883283138275, "learning_rate": 0.00012159787986173322, "loss": 0.4748, "step": 84680 }, { "epoch": 1.8850160256410255, "grad_norm": 0.4112374484539032, "learning_rate": 0.00012155499136881318, "loss": 0.4899, "step": 84690 }, { "epoch": 1.885238603988604, "grad_norm": 0.44202110171318054, "learning_rate": 0.00012151210713865912, "loss": 0.5883, "step": 84700 }, { "epoch": 1.8854611823361824, "grad_norm": 0.3438667356967926, "learning_rate": 0.00012146922717360138, "loss": 0.4871, "step": 84710 }, { "epoch": 1.8856837606837606, "grad_norm": 0.5432357788085938, "learning_rate": 0.0001214263514759702, "loss": 0.4654, "step": 84720 }, { "epoch": 1.885906339031339, "grad_norm": 0.5579224824905396, "learning_rate": 0.00012138348004809535, "loss": 0.5805, "step": 84730 }, { "epoch": 1.8861289173789175, "grad_norm": 0.5524855852127075, "learning_rate": 0.00012134061289230654, "loss": 0.4803, "step": 84740 }, { "epoch": 1.8863514957264957, "grad_norm": 0.5937925577163696, "learning_rate": 0.00012129775001093322, "loss": 0.4385, "step": 84750 }, { "epoch": 1.886574074074074, "grad_norm": 0.8141183257102966, "learning_rate": 0.00012125489140630466, "loss": 0.4975, "step": 84760 }, { "epoch": 1.8867966524216524, "grad_norm": 0.3882113993167877, "learning_rate": 0.00012121203708074969, "loss": 0.5738, "step": 84770 }, { "epoch": 1.8870192307692308, "grad_norm": 0.5651386380195618, "learning_rate": 0.00012116918703659714, "loss": 0.6125, "step": 84780 }, { "epoch": 1.887241809116809, "grad_norm": 0.5414236187934875, "learning_rate": 0.00012112634127617556, "loss": 0.5745, "step": 84790 }, { "epoch": 1.8874643874643875, "grad_norm": 0.6082583069801331, "learning_rate": 0.00012108349980181317, "loss": 0.5157, "step": 84800 }, { "epoch": 1.887686965811966, "grad_norm": 0.6844538450241089, "learning_rate": 0.00012104066261583792, "loss": 0.5765, "step": 84810 }, { "epoch": 1.8879095441595442, "grad_norm": 0.7233855128288269, "learning_rate": 0.00012099782972057773, "loss": 0.623, "step": 84820 }, { "epoch": 1.8881321225071224, "grad_norm": 0.44369903206825256, "learning_rate": 0.00012095500111836018, "loss": 0.4765, "step": 84830 }, { "epoch": 1.8883547008547008, "grad_norm": 0.5439565181732178, "learning_rate": 0.00012091217681151254, "loss": 0.4019, "step": 84840 }, { "epoch": 1.8885772792022792, "grad_norm": 0.6787068247795105, "learning_rate": 0.00012086935680236196, "loss": 0.6234, "step": 84850 }, { "epoch": 1.8887998575498575, "grad_norm": 0.47564569115638733, "learning_rate": 0.00012082654109323535, "loss": 0.504, "step": 84860 }, { "epoch": 1.889022435897436, "grad_norm": 0.39963382482528687, "learning_rate": 0.00012078372968645926, "loss": 0.389, "step": 84870 }, { "epoch": 1.8892450142450143, "grad_norm": 0.4792673885822296, "learning_rate": 0.00012074092258436014, "loss": 0.5531, "step": 84880 }, { "epoch": 1.8894675925925926, "grad_norm": 0.49218714237213135, "learning_rate": 0.0001206981197892642, "loss": 0.5529, "step": 84890 }, { "epoch": 1.8896901709401708, "grad_norm": 0.7303302884101868, "learning_rate": 0.00012065532130349737, "loss": 0.4026, "step": 84900 }, { "epoch": 1.8899127492877494, "grad_norm": 0.6758183240890503, "learning_rate": 0.00012061252712938528, "loss": 0.5874, "step": 84910 }, { "epoch": 1.8901353276353277, "grad_norm": 0.7041637897491455, "learning_rate": 0.00012056973726925359, "loss": 0.5742, "step": 84920 }, { "epoch": 1.8903579059829059, "grad_norm": 0.7809052467346191, "learning_rate": 0.00012052695172542727, "loss": 0.5525, "step": 84930 }, { "epoch": 1.8905804843304843, "grad_norm": 0.37585023045539856, "learning_rate": 0.00012048417050023151, "loss": 0.5036, "step": 84940 }, { "epoch": 1.8908030626780628, "grad_norm": 0.5783548355102539, "learning_rate": 0.00012044139359599099, "loss": 0.5024, "step": 84950 }, { "epoch": 1.891025641025641, "grad_norm": 0.5071015954017639, "learning_rate": 0.00012039862101503026, "loss": 0.4789, "step": 84960 }, { "epoch": 1.8912482193732194, "grad_norm": 0.5445882678031921, "learning_rate": 0.00012035585275967368, "loss": 0.4893, "step": 84970 }, { "epoch": 1.8914707977207978, "grad_norm": 0.3778285086154938, "learning_rate": 0.00012031308883224519, "loss": 0.4128, "step": 84980 }, { "epoch": 1.891693376068376, "grad_norm": 0.5169997811317444, "learning_rate": 0.00012027032923506872, "loss": 0.512, "step": 84990 }, { "epoch": 1.8919159544159543, "grad_norm": 0.5017030835151672, "learning_rate": 0.00012022757397046786, "loss": 0.4416, "step": 85000 }, { "epoch": 1.8921385327635327, "grad_norm": 1.1345303058624268, "learning_rate": 0.00012018482304076588, "loss": 0.5782, "step": 85010 }, { "epoch": 1.8923611111111112, "grad_norm": 0.4666755497455597, "learning_rate": 0.00012014207644828596, "loss": 0.6162, "step": 85020 }, { "epoch": 1.8925836894586894, "grad_norm": 0.7333738803863525, "learning_rate": 0.00012009933419535104, "loss": 0.5684, "step": 85030 }, { "epoch": 1.8928062678062678, "grad_norm": 0.5773189067840576, "learning_rate": 0.00012005659628428367, "loss": 0.4725, "step": 85040 }, { "epoch": 1.8930288461538463, "grad_norm": 0.6351726651191711, "learning_rate": 0.00012001386271740637, "loss": 0.6178, "step": 85050 }, { "epoch": 1.8932514245014245, "grad_norm": 0.49017852544784546, "learning_rate": 0.00011997113349704116, "loss": 0.5083, "step": 85060 }, { "epoch": 1.8934740028490027, "grad_norm": 0.5200414061546326, "learning_rate": 0.00011992840862551009, "loss": 0.513, "step": 85070 }, { "epoch": 1.8936965811965814, "grad_norm": 0.6539701223373413, "learning_rate": 0.00011988568810513488, "loss": 0.4864, "step": 85080 }, { "epoch": 1.8939191595441596, "grad_norm": 0.568430483341217, "learning_rate": 0.00011984297193823692, "loss": 0.5922, "step": 85090 }, { "epoch": 1.8941417378917378, "grad_norm": 0.7377212047576904, "learning_rate": 0.00011980026012713748, "loss": 0.5317, "step": 85100 }, { "epoch": 1.8943643162393162, "grad_norm": 0.6099929213523865, "learning_rate": 0.0001197575526741576, "loss": 0.6454, "step": 85110 }, { "epoch": 1.8945868945868947, "grad_norm": 0.5552646517753601, "learning_rate": 0.00011971484958161796, "loss": 0.4626, "step": 85120 }, { "epoch": 1.8948094729344729, "grad_norm": 0.6018966436386108, "learning_rate": 0.00011967215085183912, "loss": 0.501, "step": 85130 }, { "epoch": 1.8950320512820513, "grad_norm": 0.5927259922027588, "learning_rate": 0.00011962945648714141, "loss": 0.5232, "step": 85140 }, { "epoch": 1.8952546296296298, "grad_norm": 0.5345644950866699, "learning_rate": 0.00011958676648984477, "loss": 0.6754, "step": 85150 }, { "epoch": 1.895477207977208, "grad_norm": 0.47902974486351013, "learning_rate": 0.00011954408086226908, "loss": 0.4121, "step": 85160 }, { "epoch": 1.8956997863247862, "grad_norm": 0.6269406676292419, "learning_rate": 0.00011950139960673393, "loss": 0.5845, "step": 85170 }, { "epoch": 1.8959223646723646, "grad_norm": 0.6758217215538025, "learning_rate": 0.00011945872272555862, "loss": 0.7172, "step": 85180 }, { "epoch": 1.896144943019943, "grad_norm": 0.6879309415817261, "learning_rate": 0.00011941605022106228, "loss": 0.6026, "step": 85190 }, { "epoch": 1.8963675213675213, "grad_norm": 0.587652325630188, "learning_rate": 0.00011937338209556368, "loss": 0.5271, "step": 85200 }, { "epoch": 1.8965900997150997, "grad_norm": 0.4308949410915375, "learning_rate": 0.00011933071835138152, "loss": 0.6274, "step": 85210 }, { "epoch": 1.8968126780626782, "grad_norm": 0.4711028039455414, "learning_rate": 0.00011928805899083418, "loss": 0.5202, "step": 85220 }, { "epoch": 1.8970352564102564, "grad_norm": 0.5150978565216064, "learning_rate": 0.00011924540401623976, "loss": 0.5259, "step": 85230 }, { "epoch": 1.8972578347578346, "grad_norm": 0.5412598252296448, "learning_rate": 0.00011920275342991618, "loss": 0.553, "step": 85240 }, { "epoch": 1.8974804131054133, "grad_norm": 0.5050225257873535, "learning_rate": 0.00011916010723418116, "loss": 0.4893, "step": 85250 }, { "epoch": 1.8977029914529915, "grad_norm": 0.587312638759613, "learning_rate": 0.00011911746543135209, "loss": 0.676, "step": 85260 }, { "epoch": 1.8979255698005697, "grad_norm": 0.5557898879051208, "learning_rate": 0.00011907482802374615, "loss": 0.5312, "step": 85270 }, { "epoch": 1.8981481481481481, "grad_norm": 0.42806851863861084, "learning_rate": 0.0001190321950136803, "loss": 0.4447, "step": 85280 }, { "epoch": 1.8983707264957266, "grad_norm": 0.5302470922470093, "learning_rate": 0.00011898956640347125, "loss": 0.4668, "step": 85290 }, { "epoch": 1.8985933048433048, "grad_norm": 0.4515651762485504, "learning_rate": 0.00011894694219543549, "loss": 0.5503, "step": 85300 }, { "epoch": 1.8988158831908832, "grad_norm": 0.5524975061416626, "learning_rate": 0.0001189043223918893, "loss": 0.4934, "step": 85310 }, { "epoch": 1.8990384615384617, "grad_norm": 0.6137921214103699, "learning_rate": 0.00011886170699514855, "loss": 0.5975, "step": 85320 }, { "epoch": 1.89926103988604, "grad_norm": 0.8360822796821594, "learning_rate": 0.00011881909600752909, "loss": 0.6624, "step": 85330 }, { "epoch": 1.899483618233618, "grad_norm": 0.5582287907600403, "learning_rate": 0.0001187764894313464, "loss": 0.553, "step": 85340 }, { "epoch": 1.8997061965811965, "grad_norm": 0.6215211153030396, "learning_rate": 0.00011873388726891575, "loss": 0.5912, "step": 85350 }, { "epoch": 1.899928774928775, "grad_norm": 0.4308690130710602, "learning_rate": 0.00011869128952255226, "loss": 0.532, "step": 85360 }, { "epoch": 1.9001513532763532, "grad_norm": 0.6136817932128906, "learning_rate": 0.00011864869619457057, "loss": 0.7096, "step": 85370 }, { "epoch": 1.9003739316239316, "grad_norm": 0.33455830812454224, "learning_rate": 0.00011860610728728536, "loss": 0.7005, "step": 85380 }, { "epoch": 1.90059650997151, "grad_norm": 0.776676595211029, "learning_rate": 0.00011856352280301095, "loss": 0.6113, "step": 85390 }, { "epoch": 1.9008190883190883, "grad_norm": 0.570492684841156, "learning_rate": 0.00011852094274406133, "loss": 0.4769, "step": 85400 }, { "epoch": 1.9010416666666665, "grad_norm": 0.3798914849758148, "learning_rate": 0.00011847836711275038, "loss": 0.4396, "step": 85410 }, { "epoch": 1.9012642450142452, "grad_norm": 0.43583646416664124, "learning_rate": 0.00011843579591139175, "loss": 0.63, "step": 85420 }, { "epoch": 1.9014868233618234, "grad_norm": 0.632577657699585, "learning_rate": 0.00011839322914229869, "loss": 0.6159, "step": 85430 }, { "epoch": 1.9017094017094016, "grad_norm": 0.5504708886146545, "learning_rate": 0.00011835066680778446, "loss": 0.4517, "step": 85440 }, { "epoch": 1.90193198005698, "grad_norm": 0.6885342597961426, "learning_rate": 0.00011830810891016173, "loss": 0.5899, "step": 85450 }, { "epoch": 1.9021545584045585, "grad_norm": 0.6813510656356812, "learning_rate": 0.00011826555545174324, "loss": 0.4313, "step": 85460 }, { "epoch": 1.9023771367521367, "grad_norm": 0.8150388598442078, "learning_rate": 0.00011822300643484145, "loss": 0.5397, "step": 85470 }, { "epoch": 1.9025997150997151, "grad_norm": 0.5913096070289612, "learning_rate": 0.00011818046186176839, "loss": 0.5695, "step": 85480 }, { "epoch": 1.9028222934472936, "grad_norm": 0.5552115440368652, "learning_rate": 0.000118137921734836, "loss": 0.4947, "step": 85490 }, { "epoch": 1.9030448717948718, "grad_norm": 0.5598598122596741, "learning_rate": 0.00011809538605635599, "loss": 0.5299, "step": 85500 }, { "epoch": 1.90326745014245, "grad_norm": 0.5184574127197266, "learning_rate": 0.00011805285482863972, "loss": 0.4939, "step": 85510 }, { "epoch": 1.9034900284900285, "grad_norm": 0.4912760853767395, "learning_rate": 0.00011801032805399841, "loss": 0.5775, "step": 85520 }, { "epoch": 1.903712606837607, "grad_norm": 0.6037229299545288, "learning_rate": 0.00011796780573474304, "loss": 0.5351, "step": 85530 }, { "epoch": 1.9039351851851851, "grad_norm": 0.6881440281867981, "learning_rate": 0.0001179252878731842, "loss": 0.504, "step": 85540 }, { "epoch": 1.9041577635327636, "grad_norm": 0.6429431438446045, "learning_rate": 0.00011788277447163244, "loss": 0.6366, "step": 85550 }, { "epoch": 1.904380341880342, "grad_norm": 0.5920882225036621, "learning_rate": 0.00011784026553239793, "loss": 0.5859, "step": 85560 }, { "epoch": 1.9046029202279202, "grad_norm": 0.7298809885978699, "learning_rate": 0.00011779776105779064, "loss": 0.4179, "step": 85570 }, { "epoch": 1.9048254985754984, "grad_norm": 0.5162444114685059, "learning_rate": 0.00011775526105012038, "loss": 0.6221, "step": 85580 }, { "epoch": 1.9050480769230769, "grad_norm": 0.5679960250854492, "learning_rate": 0.00011771276551169647, "loss": 0.5587, "step": 85590 }, { "epoch": 1.9052706552706553, "grad_norm": 0.663221001625061, "learning_rate": 0.00011767027444482828, "loss": 0.5354, "step": 85600 }, { "epoch": 1.9054932336182335, "grad_norm": 0.44698238372802734, "learning_rate": 0.0001176277878518248, "loss": 0.4357, "step": 85610 }, { "epoch": 1.905715811965812, "grad_norm": 0.6691514253616333, "learning_rate": 0.00011758530573499471, "loss": 0.6166, "step": 85620 }, { "epoch": 1.9059383903133904, "grad_norm": 0.4185422956943512, "learning_rate": 0.00011754282809664659, "loss": 0.4938, "step": 85630 }, { "epoch": 1.9061609686609686, "grad_norm": 0.5682917237281799, "learning_rate": 0.00011750035493908874, "loss": 0.5895, "step": 85640 }, { "epoch": 1.906383547008547, "grad_norm": 0.5465940833091736, "learning_rate": 0.00011745788626462908, "loss": 0.568, "step": 85650 }, { "epoch": 1.9066061253561255, "grad_norm": 0.477728933095932, "learning_rate": 0.00011741542207557546, "loss": 0.5208, "step": 85660 }, { "epoch": 1.9068287037037037, "grad_norm": 0.5027092099189758, "learning_rate": 0.00011737296237423545, "loss": 0.5392, "step": 85670 }, { "epoch": 1.907051282051282, "grad_norm": 0.47351041436195374, "learning_rate": 0.00011733050716291627, "loss": 0.5045, "step": 85680 }, { "epoch": 1.9072738603988604, "grad_norm": 0.43656596541404724, "learning_rate": 0.00011728805644392498, "loss": 0.3866, "step": 85690 }, { "epoch": 1.9074964387464388, "grad_norm": 0.492007315158844, "learning_rate": 0.00011724561021956849, "loss": 0.4327, "step": 85700 }, { "epoch": 1.907719017094017, "grad_norm": 0.6703928112983704, "learning_rate": 0.00011720316849215332, "loss": 0.507, "step": 85710 }, { "epoch": 1.9079415954415955, "grad_norm": 0.49135464429855347, "learning_rate": 0.00011716073126398565, "loss": 0.5191, "step": 85720 }, { "epoch": 1.908164173789174, "grad_norm": 0.46850427985191345, "learning_rate": 0.00011711829853737169, "loss": 0.5541, "step": 85730 }, { "epoch": 1.9083867521367521, "grad_norm": 0.4342600107192993, "learning_rate": 0.00011707587031461722, "loss": 0.5583, "step": 85740 }, { "epoch": 1.9086093304843303, "grad_norm": 0.5486937165260315, "learning_rate": 0.00011703344659802789, "loss": 0.4439, "step": 85750 }, { "epoch": 1.9088319088319088, "grad_norm": 0.5380405187606812, "learning_rate": 0.00011699102738990895, "loss": 0.5932, "step": 85760 }, { "epoch": 1.9090544871794872, "grad_norm": 0.386771023273468, "learning_rate": 0.00011694861269256554, "loss": 0.5885, "step": 85770 }, { "epoch": 1.9092770655270654, "grad_norm": 0.5973242521286011, "learning_rate": 0.00011690620250830253, "loss": 0.4062, "step": 85780 }, { "epoch": 1.9094996438746439, "grad_norm": 0.48295095562934875, "learning_rate": 0.00011686379683942448, "loss": 0.5558, "step": 85790 }, { "epoch": 1.9097222222222223, "grad_norm": 0.46864092350006104, "learning_rate": 0.00011682139568823576, "loss": 0.5994, "step": 85800 }, { "epoch": 1.9099448005698005, "grad_norm": 0.5002042055130005, "learning_rate": 0.00011677899905704053, "loss": 0.5547, "step": 85810 }, { "epoch": 1.9101673789173788, "grad_norm": 0.4876922369003296, "learning_rate": 0.00011673660694814259, "loss": 0.4462, "step": 85820 }, { "epoch": 1.9103899572649574, "grad_norm": 0.4872403144836426, "learning_rate": 0.00011669421936384559, "loss": 0.6234, "step": 85830 }, { "epoch": 1.9106125356125356, "grad_norm": 0.5643612742424011, "learning_rate": 0.00011665183630645298, "loss": 0.5341, "step": 85840 }, { "epoch": 1.9108351139601139, "grad_norm": 0.5881261825561523, "learning_rate": 0.00011660945777826775, "loss": 0.5138, "step": 85850 }, { "epoch": 1.9110576923076923, "grad_norm": 0.4867708683013916, "learning_rate": 0.00011656708378159283, "loss": 0.5568, "step": 85860 }, { "epoch": 1.9112802706552707, "grad_norm": 0.6191570162773132, "learning_rate": 0.00011652471431873086, "loss": 0.5563, "step": 85870 }, { "epoch": 1.911502849002849, "grad_norm": 0.6064071655273438, "learning_rate": 0.00011648234939198428, "loss": 0.5772, "step": 85880 }, { "epoch": 1.9117254273504274, "grad_norm": 0.4651621878147125, "learning_rate": 0.0001164399890036552, "loss": 0.5513, "step": 85890 }, { "epoch": 1.9119480056980058, "grad_norm": 0.3724319040775299, "learning_rate": 0.0001163976331560456, "loss": 0.4405, "step": 85900 }, { "epoch": 1.912170584045584, "grad_norm": 0.4384273886680603, "learning_rate": 0.00011635528185145696, "loss": 0.5506, "step": 85910 }, { "epoch": 1.9123931623931623, "grad_norm": 0.7029179334640503, "learning_rate": 0.0001163129350921908, "loss": 0.5189, "step": 85920 }, { "epoch": 1.9126157407407407, "grad_norm": 0.625603437423706, "learning_rate": 0.00011627059288054827, "loss": 0.5317, "step": 85930 }, { "epoch": 1.9128383190883191, "grad_norm": 0.5059728026390076, "learning_rate": 0.00011622825521883025, "loss": 0.4053, "step": 85940 }, { "epoch": 1.9130608974358974, "grad_norm": 0.5113756656646729, "learning_rate": 0.00011618592210933741, "loss": 0.5821, "step": 85950 }, { "epoch": 1.9132834757834758, "grad_norm": 0.648999810218811, "learning_rate": 0.00011614359355437026, "loss": 0.6489, "step": 85960 }, { "epoch": 1.9135060541310542, "grad_norm": 0.5037275552749634, "learning_rate": 0.00011610126955622882, "loss": 0.5174, "step": 85970 }, { "epoch": 1.9137286324786325, "grad_norm": 0.5843381881713867, "learning_rate": 0.00011605895011721313, "loss": 0.5892, "step": 85980 }, { "epoch": 1.9139512108262107, "grad_norm": 0.43946772813796997, "learning_rate": 0.00011601663523962274, "loss": 0.5989, "step": 85990 }, { "epoch": 1.9141737891737893, "grad_norm": 0.7192827463150024, "learning_rate": 0.00011597432492575713, "loss": 0.609, "step": 86000 }, { "epoch": 1.9143963675213675, "grad_norm": 0.583531379699707, "learning_rate": 0.00011593201917791552, "loss": 0.537, "step": 86010 }, { "epoch": 1.9146189458689458, "grad_norm": 0.5226663947105408, "learning_rate": 0.00011588971799839678, "loss": 0.4969, "step": 86020 }, { "epoch": 1.9148415242165242, "grad_norm": 0.808573305606842, "learning_rate": 0.00011584742138949965, "loss": 0.6544, "step": 86030 }, { "epoch": 1.9150641025641026, "grad_norm": 0.6683142185211182, "learning_rate": 0.0001158051293535226, "loss": 0.6896, "step": 86040 }, { "epoch": 1.9152866809116809, "grad_norm": 0.47932371497154236, "learning_rate": 0.00011576284189276365, "loss": 0.4986, "step": 86050 }, { "epoch": 1.9155092592592593, "grad_norm": 0.6403408646583557, "learning_rate": 0.00011572055900952085, "loss": 0.5601, "step": 86060 }, { "epoch": 1.9157318376068377, "grad_norm": 0.46542832255363464, "learning_rate": 0.00011567828070609183, "loss": 0.5125, "step": 86070 }, { "epoch": 1.915954415954416, "grad_norm": 1.7472851276397705, "learning_rate": 0.00011563600698477413, "loss": 0.5651, "step": 86080 }, { "epoch": 1.9161769943019942, "grad_norm": 0.4838975965976715, "learning_rate": 0.00011559373784786483, "loss": 0.5217, "step": 86090 }, { "epoch": 1.9163995726495726, "grad_norm": 0.612494170665741, "learning_rate": 0.00011555147329766098, "loss": 0.5574, "step": 86100 }, { "epoch": 1.916622150997151, "grad_norm": 0.2956370711326599, "learning_rate": 0.00011550921333645917, "loss": 0.5312, "step": 86110 }, { "epoch": 1.9168447293447293, "grad_norm": 0.4158254861831665, "learning_rate": 0.00011546695796655593, "loss": 0.5194, "step": 86120 }, { "epoch": 1.9170673076923077, "grad_norm": 0.9217372536659241, "learning_rate": 0.00011542470719024732, "loss": 0.5323, "step": 86130 }, { "epoch": 1.9172898860398861, "grad_norm": 0.5834040641784668, "learning_rate": 0.00011538246100982935, "loss": 0.4904, "step": 86140 }, { "epoch": 1.9175124643874644, "grad_norm": 0.6396893858909607, "learning_rate": 0.00011534021942759775, "loss": 0.4483, "step": 86150 }, { "epoch": 1.9177350427350426, "grad_norm": 0.41121169924736023, "learning_rate": 0.00011529798244584789, "loss": 0.4859, "step": 86160 }, { "epoch": 1.9179576210826212, "grad_norm": 0.6772044897079468, "learning_rate": 0.00011525575006687502, "loss": 0.5019, "step": 86170 }, { "epoch": 1.9181801994301995, "grad_norm": 0.5235520601272583, "learning_rate": 0.00011521352229297412, "loss": 0.6181, "step": 86180 }, { "epoch": 1.9184027777777777, "grad_norm": 0.6111951470375061, "learning_rate": 0.0001151712991264398, "loss": 0.6011, "step": 86190 }, { "epoch": 1.9186253561253561, "grad_norm": 0.5527158379554749, "learning_rate": 0.00011512908056956651, "loss": 0.5046, "step": 86200 }, { "epoch": 1.9188479344729346, "grad_norm": 0.4959039092063904, "learning_rate": 0.00011508686662464844, "loss": 0.5589, "step": 86210 }, { "epoch": 1.9190705128205128, "grad_norm": 0.3603544533252716, "learning_rate": 0.00011504465729397957, "loss": 0.4758, "step": 86220 }, { "epoch": 1.9192930911680912, "grad_norm": 0.594570517539978, "learning_rate": 0.00011500245257985365, "loss": 0.5857, "step": 86230 }, { "epoch": 1.9195156695156697, "grad_norm": 0.4193931221961975, "learning_rate": 0.00011496025248456397, "loss": 0.4426, "step": 86240 }, { "epoch": 1.9197382478632479, "grad_norm": 0.47187313437461853, "learning_rate": 0.00011491805701040376, "loss": 0.4891, "step": 86250 }, { "epoch": 1.919960826210826, "grad_norm": 0.5686632990837097, "learning_rate": 0.00011487586615966607, "loss": 0.6884, "step": 86260 }, { "epoch": 1.9201834045584045, "grad_norm": 0.43781980872154236, "learning_rate": 0.00011483367993464345, "loss": 0.5434, "step": 86270 }, { "epoch": 1.9202279202279202, "eval_loss": 0.5408384203910828, "eval_runtime": 337.9859, "eval_samples_per_second": 6.997, "eval_steps_per_second": 6.997, "step": 86272 }, { "epoch": 1.920405982905983, "grad_norm": 0.4699023962020874, "learning_rate": 0.00011479149833762832, "loss": 0.5495, "step": 86280 }, { "epoch": 1.9206285612535612, "grad_norm": 0.7036943435668945, "learning_rate": 0.00011474932137091299, "loss": 0.5594, "step": 86290 }, { "epoch": 1.9208511396011396, "grad_norm": 0.4120835065841675, "learning_rate": 0.0001147071490367893, "loss": 0.5549, "step": 86300 }, { "epoch": 1.921073717948718, "grad_norm": 0.4701841175556183, "learning_rate": 0.00011466498133754895, "loss": 0.4743, "step": 86310 }, { "epoch": 1.9212962962962963, "grad_norm": 0.5886813998222351, "learning_rate": 0.00011462281827548347, "loss": 0.5863, "step": 86320 }, { "epoch": 1.9215188746438745, "grad_norm": 0.63768470287323, "learning_rate": 0.00011458065985288382, "loss": 0.5573, "step": 86330 }, { "epoch": 1.921741452991453, "grad_norm": 0.42090120911598206, "learning_rate": 0.00011453850607204106, "loss": 0.4823, "step": 86340 }, { "epoch": 1.9219640313390314, "grad_norm": 0.4522937834262848, "learning_rate": 0.00011449635693524587, "loss": 0.4412, "step": 86350 }, { "epoch": 1.9221866096866096, "grad_norm": 0.7093581557273865, "learning_rate": 0.00011445421244478869, "loss": 0.5428, "step": 86360 }, { "epoch": 1.922409188034188, "grad_norm": 0.6658710241317749, "learning_rate": 0.00011441207260295956, "loss": 0.5897, "step": 86370 }, { "epoch": 1.9226317663817665, "grad_norm": 0.438462495803833, "learning_rate": 0.00011436993741204847, "loss": 0.4477, "step": 86380 }, { "epoch": 1.9228543447293447, "grad_norm": 0.5765256881713867, "learning_rate": 0.0001143278068743451, "loss": 0.5906, "step": 86390 }, { "epoch": 1.9230769230769231, "grad_norm": 0.6065409183502197, "learning_rate": 0.0001142856809921389, "loss": 0.6645, "step": 86400 }, { "epoch": 1.9232995014245016, "grad_norm": 0.3438239097595215, "learning_rate": 0.0001142435597677189, "loss": 0.5343, "step": 86410 }, { "epoch": 1.9235220797720798, "grad_norm": 0.5237026810646057, "learning_rate": 0.00011420144320337405, "loss": 0.5707, "step": 86420 }, { "epoch": 1.923744658119658, "grad_norm": 0.7502513527870178, "learning_rate": 0.00011415933130139302, "loss": 0.6223, "step": 86430 }, { "epoch": 1.9239672364672364, "grad_norm": 0.5259876251220703, "learning_rate": 0.0001141172240640642, "loss": 0.5175, "step": 86440 }, { "epoch": 1.9241898148148149, "grad_norm": 0.5037872195243835, "learning_rate": 0.00011407512149367572, "loss": 0.4996, "step": 86450 }, { "epoch": 1.924412393162393, "grad_norm": 0.5434580445289612, "learning_rate": 0.00011403302359251558, "loss": 0.5908, "step": 86460 }, { "epoch": 1.9246349715099715, "grad_norm": 0.38471388816833496, "learning_rate": 0.00011399093036287123, "loss": 0.5842, "step": 86470 }, { "epoch": 1.92485754985755, "grad_norm": 0.6415163278579712, "learning_rate": 0.00011394884180703012, "loss": 0.6516, "step": 86480 }, { "epoch": 1.9250801282051282, "grad_norm": 0.5737249255180359, "learning_rate": 0.00011390675792727947, "loss": 0.6385, "step": 86490 }, { "epoch": 1.9253027065527064, "grad_norm": 0.5640256404876709, "learning_rate": 0.00011386467872590601, "loss": 0.5657, "step": 86500 }, { "epoch": 1.9255252849002849, "grad_norm": 0.6238548159599304, "learning_rate": 0.00011382260420519641, "loss": 0.589, "step": 86510 }, { "epoch": 1.9257478632478633, "grad_norm": 0.5096440315246582, "learning_rate": 0.00011378053436743706, "loss": 0.4942, "step": 86520 }, { "epoch": 1.9259704415954415, "grad_norm": 0.49294623732566833, "learning_rate": 0.00011373846921491402, "loss": 0.4893, "step": 86530 }, { "epoch": 1.92619301994302, "grad_norm": 0.5232724547386169, "learning_rate": 0.0001136964087499133, "loss": 0.6122, "step": 86540 }, { "epoch": 1.9264155982905984, "grad_norm": 0.7509873509407043, "learning_rate": 0.00011365435297472027, "loss": 0.5612, "step": 86550 }, { "epoch": 1.9266381766381766, "grad_norm": 0.4268711805343628, "learning_rate": 0.00011361230189162042, "loss": 0.521, "step": 86560 }, { "epoch": 1.9268607549857548, "grad_norm": 0.5106337666511536, "learning_rate": 0.0001135702555028988, "loss": 0.5348, "step": 86570 }, { "epoch": 1.9270833333333335, "grad_norm": 0.7293399572372437, "learning_rate": 0.00011352821381084022, "loss": 0.6527, "step": 86580 }, { "epoch": 1.9273059116809117, "grad_norm": 0.6081629395484924, "learning_rate": 0.00011348617681772931, "loss": 0.5959, "step": 86590 }, { "epoch": 1.92752849002849, "grad_norm": 0.3281137943267822, "learning_rate": 0.00011344414452585044, "loss": 0.4751, "step": 86600 }, { "epoch": 1.9277510683760684, "grad_norm": 0.6834414005279541, "learning_rate": 0.00011340211693748755, "loss": 0.478, "step": 86610 }, { "epoch": 1.9279736467236468, "grad_norm": 0.44042229652404785, "learning_rate": 0.00011336009405492452, "loss": 0.5395, "step": 86620 }, { "epoch": 1.928196225071225, "grad_norm": 0.6206148862838745, "learning_rate": 0.00011331807588044498, "loss": 0.5037, "step": 86630 }, { "epoch": 1.9284188034188035, "grad_norm": 0.4447237551212311, "learning_rate": 0.00011327606241633209, "loss": 0.5585, "step": 86640 }, { "epoch": 1.928641381766382, "grad_norm": 0.60000079870224, "learning_rate": 0.00011323405366486892, "loss": 0.5136, "step": 86650 }, { "epoch": 1.92886396011396, "grad_norm": 0.2710607945919037, "learning_rate": 0.00011319204962833834, "loss": 0.5419, "step": 86660 }, { "epoch": 1.9290865384615383, "grad_norm": 0.5812567472457886, "learning_rate": 0.00011315005030902292, "loss": 0.5482, "step": 86670 }, { "epoch": 1.9293091168091168, "grad_norm": 0.34236037731170654, "learning_rate": 0.0001131080557092048, "loss": 0.4468, "step": 86680 }, { "epoch": 1.9295316951566952, "grad_norm": 0.7552074790000916, "learning_rate": 0.00011306606583116606, "loss": 0.5755, "step": 86690 }, { "epoch": 1.9297542735042734, "grad_norm": 0.4339049458503723, "learning_rate": 0.00011302408067718846, "loss": 0.5557, "step": 86700 }, { "epoch": 1.9299768518518519, "grad_norm": 0.529327392578125, "learning_rate": 0.00011298210024955351, "loss": 0.7157, "step": 86710 }, { "epoch": 1.9301994301994303, "grad_norm": 0.5876535773277283, "learning_rate": 0.00011294012455054249, "loss": 0.5565, "step": 86720 }, { "epoch": 1.9304220085470085, "grad_norm": 0.7795631289482117, "learning_rate": 0.00011289815358243636, "loss": 0.5131, "step": 86730 }, { "epoch": 1.9306445868945867, "grad_norm": 0.5589414834976196, "learning_rate": 0.00011285618734751595, "loss": 0.5326, "step": 86740 }, { "epoch": 1.9308671652421654, "grad_norm": 0.46770164370536804, "learning_rate": 0.00011281422584806156, "loss": 0.5333, "step": 86750 }, { "epoch": 1.9310897435897436, "grad_norm": 0.6444997787475586, "learning_rate": 0.00011277226908635363, "loss": 0.5487, "step": 86760 }, { "epoch": 1.9313123219373218, "grad_norm": 0.7315465211868286, "learning_rate": 0.0001127303170646719, "loss": 0.5377, "step": 86770 }, { "epoch": 1.9315349002849003, "grad_norm": 0.4663032293319702, "learning_rate": 0.0001126883697852962, "loss": 0.5601, "step": 86780 }, { "epoch": 1.9317574786324787, "grad_norm": 0.8682087063789368, "learning_rate": 0.00011264642725050594, "loss": 0.4918, "step": 86790 }, { "epoch": 1.931980056980057, "grad_norm": 0.5688363313674927, "learning_rate": 0.00011260448946258035, "loss": 0.6255, "step": 86800 }, { "epoch": 1.9322026353276354, "grad_norm": 0.6298112273216248, "learning_rate": 0.00011256255642379843, "loss": 0.4573, "step": 86810 }, { "epoch": 1.9324252136752138, "grad_norm": 0.3815062940120697, "learning_rate": 0.00011252062813643868, "loss": 0.6459, "step": 86820 }, { "epoch": 1.932647792022792, "grad_norm": 0.5226765275001526, "learning_rate": 0.00011247870460277962, "loss": 0.6222, "step": 86830 }, { "epoch": 1.9328703703703702, "grad_norm": 0.6144941449165344, "learning_rate": 0.0001124367858250994, "loss": 0.4883, "step": 86840 }, { "epoch": 1.9330929487179487, "grad_norm": 0.7446057200431824, "learning_rate": 0.00011239487180567594, "loss": 0.5385, "step": 86850 }, { "epoch": 1.9333155270655271, "grad_norm": 0.5342898964881897, "learning_rate": 0.00011235296254678682, "loss": 0.5993, "step": 86860 }, { "epoch": 1.9335381054131053, "grad_norm": 0.5611191987991333, "learning_rate": 0.00011231105805070949, "loss": 0.5582, "step": 86870 }, { "epoch": 1.9337606837606838, "grad_norm": 0.6036495566368103, "learning_rate": 0.00011226915831972114, "loss": 0.6238, "step": 86880 }, { "epoch": 1.9339832621082622, "grad_norm": 0.6017242670059204, "learning_rate": 0.00011222726335609852, "loss": 0.5185, "step": 86890 }, { "epoch": 1.9342058404558404, "grad_norm": 0.3665631115436554, "learning_rate": 0.00011218537316211821, "loss": 0.5793, "step": 86900 }, { "epoch": 1.9344284188034186, "grad_norm": 0.6043411493301392, "learning_rate": 0.00011214348774005661, "loss": 0.5855, "step": 86910 }, { "epoch": 1.9346509971509973, "grad_norm": 0.5133441090583801, "learning_rate": 0.0001121016070921898, "loss": 0.4867, "step": 86920 }, { "epoch": 1.9348735754985755, "grad_norm": 0.5243979692459106, "learning_rate": 0.00011205973122079361, "loss": 0.5063, "step": 86930 }, { "epoch": 1.9350961538461537, "grad_norm": 0.5311282277107239, "learning_rate": 0.00011201786012814363, "loss": 0.6174, "step": 86940 }, { "epoch": 1.9353187321937322, "grad_norm": 0.6318812370300293, "learning_rate": 0.00011197599381651525, "loss": 0.5249, "step": 86950 }, { "epoch": 1.9355413105413106, "grad_norm": 1.000100016593933, "learning_rate": 0.0001119341322881833, "loss": 0.5078, "step": 86960 }, { "epoch": 1.9357638888888888, "grad_norm": 0.5029283165931702, "learning_rate": 0.00011189227554542272, "loss": 0.4465, "step": 86970 }, { "epoch": 1.9359864672364673, "grad_norm": 0.5905060768127441, "learning_rate": 0.00011185042359050801, "loss": 0.5005, "step": 86980 }, { "epoch": 1.9362090455840457, "grad_norm": 0.6078103184700012, "learning_rate": 0.00011180857642571347, "loss": 0.5083, "step": 86990 }, { "epoch": 1.936431623931624, "grad_norm": 0.48058727383613586, "learning_rate": 0.00011176673405331306, "loss": 0.6146, "step": 87000 }, { "epoch": 1.9366542022792022, "grad_norm": 0.5735967755317688, "learning_rate": 0.00011172489647558055, "loss": 0.6516, "step": 87010 }, { "epoch": 1.9368767806267806, "grad_norm": 0.3764326572418213, "learning_rate": 0.00011168306369478954, "loss": 0.6557, "step": 87020 }, { "epoch": 1.937099358974359, "grad_norm": 0.46224406361579895, "learning_rate": 0.00011164123571321312, "loss": 0.4073, "step": 87030 }, { "epoch": 1.9373219373219372, "grad_norm": 0.6236437559127808, "learning_rate": 0.0001115994125331242, "loss": 0.5778, "step": 87040 }, { "epoch": 1.9375445156695157, "grad_norm": 0.4578424394130707, "learning_rate": 0.00011155759415679558, "loss": 0.5797, "step": 87050 }, { "epoch": 1.9377670940170941, "grad_norm": 0.5358936786651611, "learning_rate": 0.00011151578058649971, "loss": 0.5618, "step": 87060 }, { "epoch": 1.9379896723646723, "grad_norm": 0.7817856073379517, "learning_rate": 0.00011147397182450876, "loss": 0.5458, "step": 87070 }, { "epoch": 1.9382122507122506, "grad_norm": 0.6386322975158691, "learning_rate": 0.00011143216787309466, "loss": 0.4807, "step": 87080 }, { "epoch": 1.9384348290598292, "grad_norm": 0.4296022951602936, "learning_rate": 0.0001113903687345291, "loss": 0.5916, "step": 87090 }, { "epoch": 1.9386574074074074, "grad_norm": 0.4393704831600189, "learning_rate": 0.00011134857441108337, "loss": 0.5215, "step": 87100 }, { "epoch": 1.9388799857549857, "grad_norm": 0.47139036655426025, "learning_rate": 0.0001113067849050287, "loss": 0.5059, "step": 87110 }, { "epoch": 1.939102564102564, "grad_norm": 0.5626164674758911, "learning_rate": 0.00011126500021863595, "loss": 0.4885, "step": 87120 }, { "epoch": 1.9393251424501425, "grad_norm": 1.0058131217956543, "learning_rate": 0.00011122322035417574, "loss": 0.5409, "step": 87130 }, { "epoch": 1.9395477207977208, "grad_norm": 0.5677262544631958, "learning_rate": 0.00011118144531391838, "loss": 0.5311, "step": 87140 }, { "epoch": 1.9397702991452992, "grad_norm": 0.5968604683876038, "learning_rate": 0.00011113967510013407, "loss": 0.4733, "step": 87150 }, { "epoch": 1.9399928774928776, "grad_norm": 0.5895420908927917, "learning_rate": 0.0001110979097150925, "loss": 0.5589, "step": 87160 }, { "epoch": 1.9402154558404558, "grad_norm": 0.41556987166404724, "learning_rate": 0.00011105614916106337, "loss": 0.4725, "step": 87170 }, { "epoch": 1.940438034188034, "grad_norm": 0.6243895888328552, "learning_rate": 0.00011101439344031584, "loss": 0.5698, "step": 87180 }, { "epoch": 1.9406606125356125, "grad_norm": 0.452260285615921, "learning_rate": 0.00011097264255511901, "loss": 0.4175, "step": 87190 }, { "epoch": 1.940883190883191, "grad_norm": 0.4803001284599304, "learning_rate": 0.00011093089650774167, "loss": 0.4819, "step": 87200 }, { "epoch": 1.9411057692307692, "grad_norm": 0.5832453370094299, "learning_rate": 0.0001108891553004523, "loss": 0.6637, "step": 87210 }, { "epoch": 1.9413283475783476, "grad_norm": 0.5619366765022278, "learning_rate": 0.0001108474189355192, "loss": 0.4702, "step": 87220 }, { "epoch": 1.941550925925926, "grad_norm": 0.4576612114906311, "learning_rate": 0.00011080568741521043, "loss": 0.5133, "step": 87230 }, { "epoch": 1.9417735042735043, "grad_norm": 0.6994546055793762, "learning_rate": 0.00011076396074179352, "loss": 0.5462, "step": 87240 }, { "epoch": 1.9419960826210825, "grad_norm": 0.6044816374778748, "learning_rate": 0.00011072223891753605, "loss": 0.5178, "step": 87250 }, { "epoch": 1.942218660968661, "grad_norm": 0.5070165395736694, "learning_rate": 0.00011068052194470519, "loss": 0.5199, "step": 87260 }, { "epoch": 1.9424412393162394, "grad_norm": 0.7343992590904236, "learning_rate": 0.00011063880982556791, "loss": 0.4915, "step": 87270 }, { "epoch": 1.9426638176638176, "grad_norm": 0.4487825632095337, "learning_rate": 0.00011059710256239091, "loss": 0.5069, "step": 87280 }, { "epoch": 1.942886396011396, "grad_norm": 0.39010611176490784, "learning_rate": 0.00011055540015744048, "loss": 0.5441, "step": 87290 }, { "epoch": 1.9431089743589745, "grad_norm": 0.39558878540992737, "learning_rate": 0.00011051370261298287, "loss": 0.5061, "step": 87300 }, { "epoch": 1.9433315527065527, "grad_norm": 0.6780951619148254, "learning_rate": 0.00011047200993128393, "loss": 0.6518, "step": 87310 }, { "epoch": 1.943554131054131, "grad_norm": 0.4943266212940216, "learning_rate": 0.0001104303221146092, "loss": 0.573, "step": 87320 }, { "epoch": 1.9437767094017095, "grad_norm": 0.35894832015037537, "learning_rate": 0.00011038863916522411, "loss": 0.466, "step": 87330 }, { "epoch": 1.9439992877492878, "grad_norm": 1.0896111726760864, "learning_rate": 0.00011034696108539373, "loss": 0.5727, "step": 87340 }, { "epoch": 1.944221866096866, "grad_norm": 0.4004582166671753, "learning_rate": 0.00011030528787738286, "loss": 0.4915, "step": 87350 }, { "epoch": 1.9444444444444444, "grad_norm": 0.5520352125167847, "learning_rate": 0.00011026361954345607, "loss": 0.4131, "step": 87360 }, { "epoch": 1.9446670227920229, "grad_norm": 0.4198590815067291, "learning_rate": 0.00011022195608587776, "loss": 0.4725, "step": 87370 }, { "epoch": 1.944889601139601, "grad_norm": 0.49883604049682617, "learning_rate": 0.00011018029750691174, "loss": 0.4958, "step": 87380 }, { "epoch": 1.9451121794871795, "grad_norm": 0.470523864030838, "learning_rate": 0.00011013864380882191, "loss": 0.5394, "step": 87390 }, { "epoch": 1.945334757834758, "grad_norm": 0.31142184138298035, "learning_rate": 0.00011009699499387172, "loss": 0.5702, "step": 87400 }, { "epoch": 1.9455573361823362, "grad_norm": 0.753667950630188, "learning_rate": 0.00011005535106432452, "loss": 0.4705, "step": 87410 }, { "epoch": 1.9457799145299144, "grad_norm": 0.6320822238922119, "learning_rate": 0.00011001371202244309, "loss": 0.5632, "step": 87420 }, { "epoch": 1.9460024928774928, "grad_norm": 0.7997458577156067, "learning_rate": 0.0001099720778704902, "loss": 0.6188, "step": 87430 }, { "epoch": 1.9462250712250713, "grad_norm": 0.5000240206718445, "learning_rate": 0.0001099304486107283, "loss": 0.4939, "step": 87440 }, { "epoch": 1.9464476495726495, "grad_norm": 0.6512289047241211, "learning_rate": 0.00010988882424541964, "loss": 0.5788, "step": 87450 }, { "epoch": 1.946670227920228, "grad_norm": 0.4772588014602661, "learning_rate": 0.00010984720477682597, "loss": 0.6477, "step": 87460 }, { "epoch": 1.9468928062678064, "grad_norm": 0.7150284051895142, "learning_rate": 0.000109805590207209, "loss": 0.4929, "step": 87470 }, { "epoch": 1.9471153846153846, "grad_norm": 0.4428633451461792, "learning_rate": 0.00010976398053883008, "loss": 0.5268, "step": 87480 }, { "epoch": 1.9473379629629628, "grad_norm": 0.5598447322845459, "learning_rate": 0.00010972237577395033, "loss": 0.614, "step": 87490 }, { "epoch": 1.9475605413105415, "grad_norm": 0.67003333568573, "learning_rate": 0.00010968077591483059, "loss": 0.5683, "step": 87500 }, { "epoch": 1.9477831196581197, "grad_norm": 0.7352870106697083, "learning_rate": 0.00010963918096373152, "loss": 0.5054, "step": 87510 }, { "epoch": 1.948005698005698, "grad_norm": 0.4697332978248596, "learning_rate": 0.00010959759092291322, "loss": 0.6146, "step": 87520 }, { "epoch": 1.9482282763532763, "grad_norm": 0.544786810874939, "learning_rate": 0.00010955600579463583, "loss": 0.5645, "step": 87530 }, { "epoch": 1.9484508547008548, "grad_norm": 0.5202828049659729, "learning_rate": 0.00010951442558115923, "loss": 0.573, "step": 87540 }, { "epoch": 1.948673433048433, "grad_norm": 0.4407440423965454, "learning_rate": 0.00010947285028474275, "loss": 0.4154, "step": 87550 }, { "epoch": 1.9488960113960114, "grad_norm": 0.4419044554233551, "learning_rate": 0.00010943127990764566, "loss": 0.5709, "step": 87560 }, { "epoch": 1.9491185897435899, "grad_norm": 0.5056807398796082, "learning_rate": 0.00010938971445212698, "loss": 0.5888, "step": 87570 }, { "epoch": 1.949341168091168, "grad_norm": 0.5175442695617676, "learning_rate": 0.00010934815392044542, "loss": 0.5497, "step": 87580 }, { "epoch": 1.9495637464387463, "grad_norm": 0.7415947318077087, "learning_rate": 0.00010930659831485944, "loss": 0.5651, "step": 87590 }, { "epoch": 1.9497863247863247, "grad_norm": 0.5029433965682983, "learning_rate": 0.00010926504763762706, "loss": 0.4877, "step": 87600 }, { "epoch": 1.9500089031339032, "grad_norm": 0.47681859135627747, "learning_rate": 0.0001092235018910063, "loss": 0.5207, "step": 87610 }, { "epoch": 1.9502314814814814, "grad_norm": 0.3940894305706024, "learning_rate": 0.00010918196107725474, "loss": 0.4589, "step": 87620 }, { "epoch": 1.9504540598290598, "grad_norm": 0.4383629262447357, "learning_rate": 0.00010914042519862975, "loss": 0.5397, "step": 87630 }, { "epoch": 1.9506766381766383, "grad_norm": 0.4269826412200928, "learning_rate": 0.00010909889425738847, "loss": 0.4881, "step": 87640 }, { "epoch": 1.9508992165242165, "grad_norm": 0.725731611251831, "learning_rate": 0.00010905736825578774, "loss": 0.485, "step": 87650 }, { "epoch": 1.9511217948717947, "grad_norm": 0.5675638914108276, "learning_rate": 0.000109015847196084, "loss": 0.4996, "step": 87660 }, { "epoch": 1.9513443732193734, "grad_norm": 0.5646101236343384, "learning_rate": 0.00010897433108053361, "loss": 0.6168, "step": 87670 }, { "epoch": 1.9515669515669516, "grad_norm": 0.6364313364028931, "learning_rate": 0.00010893281991139268, "loss": 0.6142, "step": 87680 }, { "epoch": 1.9517895299145298, "grad_norm": 0.5038259625434875, "learning_rate": 0.00010889131369091676, "loss": 0.492, "step": 87690 }, { "epoch": 1.9520121082621082, "grad_norm": 0.5601425766944885, "learning_rate": 0.00010884981242136145, "loss": 0.6074, "step": 87700 }, { "epoch": 1.9522346866096867, "grad_norm": 0.9033480882644653, "learning_rate": 0.00010880831610498194, "loss": 0.5971, "step": 87710 }, { "epoch": 1.952457264957265, "grad_norm": 0.6076274514198303, "learning_rate": 0.00010876682474403321, "loss": 0.5046, "step": 87720 }, { "epoch": 1.9526798433048433, "grad_norm": 0.7191054821014404, "learning_rate": 0.00010872533834076999, "loss": 0.6105, "step": 87730 }, { "epoch": 1.9529024216524218, "grad_norm": 0.3403964042663574, "learning_rate": 0.00010868385689744652, "loss": 0.4262, "step": 87740 }, { "epoch": 1.953125, "grad_norm": 0.29592037200927734, "learning_rate": 0.000108642380416317, "loss": 0.6061, "step": 87750 }, { "epoch": 1.9533475783475782, "grad_norm": 0.5361388921737671, "learning_rate": 0.00010860090889963532, "loss": 0.6893, "step": 87760 }, { "epoch": 1.9535701566951567, "grad_norm": 0.48884057998657227, "learning_rate": 0.00010855944234965512, "loss": 0.4991, "step": 87770 }, { "epoch": 1.953792735042735, "grad_norm": 0.47964853048324585, "learning_rate": 0.00010851798076862962, "loss": 0.6482, "step": 87780 }, { "epoch": 1.9540153133903133, "grad_norm": 0.8488168716430664, "learning_rate": 0.00010847652415881204, "loss": 0.5503, "step": 87790 }, { "epoch": 1.9542378917378918, "grad_norm": 0.5509916543960571, "learning_rate": 0.00010843507252245499, "loss": 0.5735, "step": 87800 }, { "epoch": 1.9544604700854702, "grad_norm": 0.5244113802909851, "learning_rate": 0.00010839362586181114, "loss": 0.5345, "step": 87810 }, { "epoch": 1.9546830484330484, "grad_norm": 0.5117759108543396, "learning_rate": 0.00010835218417913255, "loss": 0.5599, "step": 87820 }, { "epoch": 1.9549056267806266, "grad_norm": 0.6230669021606445, "learning_rate": 0.00010831074747667131, "loss": 0.4652, "step": 87830 }, { "epoch": 1.9551282051282053, "grad_norm": 0.6364635229110718, "learning_rate": 0.00010826931575667911, "loss": 0.4689, "step": 87840 }, { "epoch": 1.9553507834757835, "grad_norm": 0.4438205659389496, "learning_rate": 0.00010822788902140741, "loss": 0.5304, "step": 87850 }, { "epoch": 1.9555733618233617, "grad_norm": 0.48501142859458923, "learning_rate": 0.0001081864672731073, "loss": 0.4529, "step": 87860 }, { "epoch": 1.9557959401709402, "grad_norm": 0.4966143071651459, "learning_rate": 0.00010814505051402984, "loss": 0.429, "step": 87870 }, { "epoch": 1.9560185185185186, "grad_norm": 0.39442554116249084, "learning_rate": 0.00010810363874642544, "loss": 0.6, "step": 87880 }, { "epoch": 1.9562410968660968, "grad_norm": 0.7312954664230347, "learning_rate": 0.00010806223197254455, "loss": 0.6283, "step": 87890 }, { "epoch": 1.9564636752136753, "grad_norm": 0.6002002358436584, "learning_rate": 0.00010802083019463722, "loss": 0.5609, "step": 87900 }, { "epoch": 1.9566862535612537, "grad_norm": 0.528228223323822, "learning_rate": 0.00010797943341495329, "loss": 0.5249, "step": 87910 }, { "epoch": 1.956908831908832, "grad_norm": 0.7415391802787781, "learning_rate": 0.00010793804163574228, "loss": 0.8109, "step": 87920 }, { "epoch": 1.9571314102564101, "grad_norm": 0.4688614010810852, "learning_rate": 0.00010789665485925353, "loss": 0.4535, "step": 87930 }, { "epoch": 1.9573539886039886, "grad_norm": 0.5116744041442871, "learning_rate": 0.00010785527308773593, "loss": 0.6447, "step": 87940 }, { "epoch": 1.957576566951567, "grad_norm": 0.487576961517334, "learning_rate": 0.00010781389632343817, "loss": 0.5942, "step": 87950 }, { "epoch": 1.9577991452991452, "grad_norm": 0.5948076248168945, "learning_rate": 0.00010777252456860876, "loss": 0.501, "step": 87960 }, { "epoch": 1.9580217236467237, "grad_norm": 0.5517193078994751, "learning_rate": 0.00010773115782549585, "loss": 0.5566, "step": 87970 }, { "epoch": 1.958244301994302, "grad_norm": 0.5932909250259399, "learning_rate": 0.00010768979609634735, "loss": 0.669, "step": 87980 }, { "epoch": 1.9584668803418803, "grad_norm": 0.9186346530914307, "learning_rate": 0.00010764843938341092, "loss": 0.6424, "step": 87990 }, { "epoch": 1.9586894586894585, "grad_norm": 0.5425516366958618, "learning_rate": 0.00010760708768893386, "loss": 0.4626, "step": 88000 }, { "epoch": 1.9589120370370372, "grad_norm": 0.3957112431526184, "learning_rate": 0.00010756574101516342, "loss": 0.5944, "step": 88010 }, { "epoch": 1.9591346153846154, "grad_norm": 0.37178757786750793, "learning_rate": 0.00010752439936434617, "loss": 0.4518, "step": 88020 }, { "epoch": 1.9593571937321936, "grad_norm": 0.48003068566322327, "learning_rate": 0.00010748306273872876, "loss": 0.5355, "step": 88030 }, { "epoch": 1.959579772079772, "grad_norm": 0.510742723941803, "learning_rate": 0.00010744173114055747, "loss": 0.4429, "step": 88040 }, { "epoch": 1.9598023504273505, "grad_norm": 0.4262735843658447, "learning_rate": 0.0001074004045720783, "loss": 0.4797, "step": 88050 }, { "epoch": 1.9600249287749287, "grad_norm": 0.5816901326179504, "learning_rate": 0.00010735908303553693, "loss": 0.5103, "step": 88060 }, { "epoch": 1.9602475071225072, "grad_norm": 0.5432152152061462, "learning_rate": 0.0001073177665331789, "loss": 0.4934, "step": 88070 }, { "epoch": 1.9604700854700856, "grad_norm": 0.43575814366340637, "learning_rate": 0.00010727645506724933, "loss": 0.4969, "step": 88080 }, { "epoch": 1.9606926638176638, "grad_norm": 0.441064715385437, "learning_rate": 0.000107235148639993, "loss": 0.5214, "step": 88090 }, { "epoch": 1.960915242165242, "grad_norm": 0.5713571310043335, "learning_rate": 0.00010719384725365465, "loss": 0.4997, "step": 88100 }, { "epoch": 1.9611378205128205, "grad_norm": 0.5397986769676208, "learning_rate": 0.00010715255091047862, "loss": 0.5765, "step": 88110 }, { "epoch": 1.961360398860399, "grad_norm": 0.5189772844314575, "learning_rate": 0.00010711125961270896, "loss": 0.4562, "step": 88120 }, { "epoch": 1.9615829772079771, "grad_norm": 0.554828405380249, "learning_rate": 0.0001070699733625895, "loss": 0.5483, "step": 88130 }, { "epoch": 1.9618055555555556, "grad_norm": 0.28737306594848633, "learning_rate": 0.00010702869216236378, "loss": 0.5179, "step": 88140 }, { "epoch": 1.962028133903134, "grad_norm": 0.5746546387672424, "learning_rate": 0.00010698741601427511, "loss": 0.505, "step": 88150 }, { "epoch": 1.9622507122507122, "grad_norm": 0.5059083104133606, "learning_rate": 0.0001069461449205663, "loss": 0.4812, "step": 88160 }, { "epoch": 1.9624732905982905, "grad_norm": 0.6395819783210754, "learning_rate": 0.00010690487888348016, "loss": 0.5105, "step": 88170 }, { "epoch": 1.962695868945869, "grad_norm": 0.7052072286605835, "learning_rate": 0.00010686361790525911, "loss": 0.5763, "step": 88180 }, { "epoch": 1.9629184472934473, "grad_norm": 0.42889729142189026, "learning_rate": 0.00010682236198814533, "loss": 0.4853, "step": 88190 }, { "epoch": 1.9631410256410255, "grad_norm": 0.875728189945221, "learning_rate": 0.00010678111113438074, "loss": 0.6747, "step": 88200 }, { "epoch": 1.963363603988604, "grad_norm": 0.6272012591362, "learning_rate": 0.00010673986534620681, "loss": 0.6072, "step": 88210 }, { "epoch": 1.9635861823361824, "grad_norm": 0.46567538380622864, "learning_rate": 0.00010669862462586502, "loss": 0.4489, "step": 88220 }, { "epoch": 1.9638087606837606, "grad_norm": 0.6730668544769287, "learning_rate": 0.00010665738897559627, "loss": 0.495, "step": 88230 }, { "epoch": 1.964031339031339, "grad_norm": 0.6411313414573669, "learning_rate": 0.00010661615839764141, "loss": 0.5778, "step": 88240 }, { "epoch": 1.9642539173789175, "grad_norm": 0.4434345066547394, "learning_rate": 0.00010657493289424097, "loss": 0.4711, "step": 88250 }, { "epoch": 1.9644764957264957, "grad_norm": 0.49573907256126404, "learning_rate": 0.00010653371246763515, "loss": 0.5912, "step": 88260 }, { "epoch": 1.964699074074074, "grad_norm": 0.3320474624633789, "learning_rate": 0.0001064924971200639, "loss": 0.5829, "step": 88270 }, { "epoch": 1.9649216524216524, "grad_norm": 0.6052396893501282, "learning_rate": 0.00010645128685376699, "loss": 0.5362, "step": 88280 }, { "epoch": 1.9651442307692308, "grad_norm": 0.3489750623703003, "learning_rate": 0.00010641008167098365, "loss": 0.6776, "step": 88290 }, { "epoch": 1.965366809116809, "grad_norm": 0.5899769067764282, "learning_rate": 0.0001063688815739531, "loss": 0.5392, "step": 88300 }, { "epoch": 1.9655893874643875, "grad_norm": 0.33934059739112854, "learning_rate": 0.00010632768656491416, "loss": 0.3809, "step": 88310 }, { "epoch": 1.965811965811966, "grad_norm": 0.5670376420021057, "learning_rate": 0.00010628649664610542, "loss": 0.5409, "step": 88320 }, { "epoch": 1.9660345441595442, "grad_norm": 0.7151758670806885, "learning_rate": 0.00010624531181976526, "loss": 0.4336, "step": 88330 }, { "epoch": 1.9662571225071224, "grad_norm": 0.43221569061279297, "learning_rate": 0.00010620413208813152, "loss": 0.472, "step": 88340 }, { "epoch": 1.9664797008547008, "grad_norm": 0.7412872314453125, "learning_rate": 0.00010616295745344203, "loss": 0.6344, "step": 88350 }, { "epoch": 1.9667022792022792, "grad_norm": 0.42692792415618896, "learning_rate": 0.00010612178791793432, "loss": 0.4771, "step": 88360 }, { "epoch": 1.9669248575498575, "grad_norm": 0.6319110989570618, "learning_rate": 0.00010608062348384544, "loss": 0.6329, "step": 88370 }, { "epoch": 1.967147435897436, "grad_norm": 0.4742584526538849, "learning_rate": 0.00010603946415341237, "loss": 0.5043, "step": 88380 }, { "epoch": 1.9673700142450143, "grad_norm": 0.43028610944747925, "learning_rate": 0.00010599830992887172, "loss": 0.4571, "step": 88390 }, { "epoch": 1.9675925925925926, "grad_norm": 0.5484131574630737, "learning_rate": 0.00010595716081245988, "loss": 0.4252, "step": 88400 }, { "epoch": 1.9678151709401708, "grad_norm": 0.6796470284461975, "learning_rate": 0.00010591601680641288, "loss": 0.4696, "step": 88410 }, { "epoch": 1.9680377492877494, "grad_norm": 0.48466822504997253, "learning_rate": 0.00010587487791296666, "loss": 0.4373, "step": 88420 }, { "epoch": 1.9682603276353277, "grad_norm": 0.6868494749069214, "learning_rate": 0.00010583374413435651, "loss": 0.4667, "step": 88430 }, { "epoch": 1.9684829059829059, "grad_norm": 0.699004590511322, "learning_rate": 0.00010579261547281785, "loss": 0.5441, "step": 88440 }, { "epoch": 1.9687054843304843, "grad_norm": 0.6828630566596985, "learning_rate": 0.00010575149193058554, "loss": 0.4365, "step": 88450 }, { "epoch": 1.9689280626780628, "grad_norm": 0.45245376229286194, "learning_rate": 0.00010571037350989442, "loss": 0.4757, "step": 88460 }, { "epoch": 1.969150641025641, "grad_norm": 0.5244765877723694, "learning_rate": 0.0001056692602129787, "loss": 0.4994, "step": 88470 }, { "epoch": 1.9693732193732194, "grad_norm": 0.5580021739006042, "learning_rate": 0.00010562815204207259, "loss": 0.6172, "step": 88480 }, { "epoch": 1.9695957977207978, "grad_norm": 0.7489878535270691, "learning_rate": 0.00010558704899940996, "loss": 0.6022, "step": 88490 }, { "epoch": 1.969818376068376, "grad_norm": 0.5939428806304932, "learning_rate": 0.00010554595108722445, "loss": 0.5763, "step": 88500 }, { "epoch": 1.9700409544159543, "grad_norm": 0.5634039640426636, "learning_rate": 0.00010550485830774923, "loss": 0.4778, "step": 88510 }, { "epoch": 1.9702635327635327, "grad_norm": 0.5487877130508423, "learning_rate": 0.00010546377066321735, "loss": 0.5419, "step": 88520 }, { "epoch": 1.9704861111111112, "grad_norm": 0.37066948413848877, "learning_rate": 0.00010542268815586158, "loss": 0.4689, "step": 88530 }, { "epoch": 1.9707086894586894, "grad_norm": 0.473908007144928, "learning_rate": 0.00010538161078791433, "loss": 0.4853, "step": 88540 }, { "epoch": 1.9709312678062678, "grad_norm": 0.5134755969047546, "learning_rate": 0.00010534053856160784, "loss": 0.5325, "step": 88550 }, { "epoch": 1.9711538461538463, "grad_norm": 0.40971675515174866, "learning_rate": 0.00010529947147917403, "loss": 0.5162, "step": 88560 }, { "epoch": 1.9713764245014245, "grad_norm": 0.6359806060791016, "learning_rate": 0.00010525840954284438, "loss": 0.7041, "step": 88570 }, { "epoch": 1.9715990028490027, "grad_norm": 0.5774009227752686, "learning_rate": 0.00010521735275485035, "loss": 0.5622, "step": 88580 }, { "epoch": 1.9718215811965814, "grad_norm": 0.758904218673706, "learning_rate": 0.00010517630111742303, "loss": 0.6136, "step": 88590 }, { "epoch": 1.9720441595441596, "grad_norm": 0.40916797518730164, "learning_rate": 0.00010513525463279306, "loss": 0.5237, "step": 88600 }, { "epoch": 1.9722667378917378, "grad_norm": 0.4158216118812561, "learning_rate": 0.00010509421330319098, "loss": 0.4971, "step": 88610 }, { "epoch": 1.9724893162393162, "grad_norm": 0.7104225158691406, "learning_rate": 0.00010505317713084708, "loss": 0.5799, "step": 88620 }, { "epoch": 1.9727118945868947, "grad_norm": 0.5312607288360596, "learning_rate": 0.00010501214611799125, "loss": 0.5532, "step": 88630 }, { "epoch": 1.9729344729344729, "grad_norm": 0.6019620895385742, "learning_rate": 0.00010497112026685321, "loss": 0.5284, "step": 88640 }, { "epoch": 1.9731570512820513, "grad_norm": 0.6432709693908691, "learning_rate": 0.00010493009957966224, "loss": 0.6361, "step": 88650 }, { "epoch": 1.9733796296296298, "grad_norm": 0.6770408153533936, "learning_rate": 0.00010488908405864749, "loss": 0.5335, "step": 88660 }, { "epoch": 1.973602207977208, "grad_norm": 0.681075930595398, "learning_rate": 0.00010484807370603777, "loss": 0.6111, "step": 88670 }, { "epoch": 1.9738247863247862, "grad_norm": 0.5270264744758606, "learning_rate": 0.00010480706852406159, "loss": 0.5412, "step": 88680 }, { "epoch": 1.9740473646723646, "grad_norm": 0.6909594535827637, "learning_rate": 0.00010476606851494728, "loss": 0.4828, "step": 88690 }, { "epoch": 1.974269943019943, "grad_norm": 0.4906465411186218, "learning_rate": 0.00010472507368092284, "loss": 0.54, "step": 88700 }, { "epoch": 1.9744925213675213, "grad_norm": 0.7198073267936707, "learning_rate": 0.00010468408402421578, "loss": 0.6627, "step": 88710 }, { "epoch": 1.9747150997150997, "grad_norm": 0.8544343709945679, "learning_rate": 0.00010464309954705371, "loss": 0.6367, "step": 88720 }, { "epoch": 1.9749376780626782, "grad_norm": 1.1126538515090942, "learning_rate": 0.00010460212025166363, "loss": 0.45, "step": 88730 }, { "epoch": 1.9751602564102564, "grad_norm": 0.503483772277832, "learning_rate": 0.00010456114614027241, "loss": 0.5136, "step": 88740 }, { "epoch": 1.9753828347578346, "grad_norm": 0.5431207418441772, "learning_rate": 0.00010452017721510663, "loss": 0.6539, "step": 88750 }, { "epoch": 1.9756054131054133, "grad_norm": 0.724375307559967, "learning_rate": 0.0001044792134783926, "loss": 0.6294, "step": 88760 }, { "epoch": 1.9758279914529915, "grad_norm": 0.7335283160209656, "learning_rate": 0.00010443825493235628, "loss": 0.4855, "step": 88770 }, { "epoch": 1.9760505698005697, "grad_norm": 0.5082575678825378, "learning_rate": 0.00010439730157922355, "loss": 0.5777, "step": 88780 }, { "epoch": 1.9762731481481481, "grad_norm": 0.34826645255088806, "learning_rate": 0.00010435635342121959, "loss": 0.5078, "step": 88790 }, { "epoch": 1.9764957264957266, "grad_norm": 0.5735419392585754, "learning_rate": 0.0001043154104605697, "loss": 0.6271, "step": 88800 }, { "epoch": 1.9767183048433048, "grad_norm": 0.8078730702400208, "learning_rate": 0.00010427447269949872, "loss": 0.6738, "step": 88810 }, { "epoch": 1.9769408831908832, "grad_norm": 0.3965419828891754, "learning_rate": 0.00010423354014023128, "loss": 0.5354, "step": 88820 }, { "epoch": 1.9771634615384617, "grad_norm": 0.37384480237960815, "learning_rate": 0.00010419261278499166, "loss": 0.3936, "step": 88830 }, { "epoch": 1.97738603988604, "grad_norm": 0.7163903713226318, "learning_rate": 0.00010415169063600397, "loss": 0.4801, "step": 88840 }, { "epoch": 1.977608618233618, "grad_norm": 0.6502336859703064, "learning_rate": 0.00010411077369549178, "loss": 0.4773, "step": 88850 }, { "epoch": 1.9778311965811965, "grad_norm": 0.7422441840171814, "learning_rate": 0.00010406986196567873, "loss": 0.6388, "step": 88860 }, { "epoch": 1.978053774928775, "grad_norm": 0.5089988112449646, "learning_rate": 0.00010402895544878785, "loss": 0.5556, "step": 88870 }, { "epoch": 1.9782763532763532, "grad_norm": 0.4812432527542114, "learning_rate": 0.0001039880541470421, "loss": 0.3783, "step": 88880 }, { "epoch": 1.9784989316239316, "grad_norm": 0.6941017508506775, "learning_rate": 0.00010394715806266409, "loss": 0.5007, "step": 88890 }, { "epoch": 1.97872150997151, "grad_norm": 0.44974082708358765, "learning_rate": 0.0001039062671978761, "loss": 0.4807, "step": 88900 }, { "epoch": 1.9789440883190883, "grad_norm": 0.43339797854423523, "learning_rate": 0.00010386538155490026, "loss": 0.5258, "step": 88910 }, { "epoch": 1.9791666666666665, "grad_norm": 0.4669586420059204, "learning_rate": 0.00010382450113595835, "loss": 0.4959, "step": 88920 }, { "epoch": 1.9793892450142452, "grad_norm": 0.4113246500492096, "learning_rate": 0.00010378362594327171, "loss": 0.5621, "step": 88930 }, { "epoch": 1.9796118233618234, "grad_norm": 0.5443630814552307, "learning_rate": 0.00010374275597906157, "loss": 0.5251, "step": 88940 }, { "epoch": 1.9798344017094016, "grad_norm": 0.5284280180931091, "learning_rate": 0.00010370189124554892, "loss": 0.6476, "step": 88950 }, { "epoch": 1.98005698005698, "grad_norm": 0.584843635559082, "learning_rate": 0.00010366103174495432, "loss": 0.5103, "step": 88960 }, { "epoch": 1.9802350427350426, "eval_loss": 0.5369717478752136, "eval_runtime": 337.2246, "eval_samples_per_second": 7.013, "eval_steps_per_second": 7.013, "step": 88968 }, { "epoch": 1.9802795584045585, "grad_norm": 0.5102009773254395, "learning_rate": 0.0001036201774794981, "loss": 0.5013, "step": 88970 }, { "epoch": 1.9805021367521367, "grad_norm": 0.5352452397346497, "learning_rate": 0.00010357932845140044, "loss": 0.5947, "step": 88980 }, { "epoch": 1.9807247150997151, "grad_norm": 0.42273956537246704, "learning_rate": 0.0001035384846628809, "loss": 0.4462, "step": 88990 }, { "epoch": 1.9809472934472936, "grad_norm": 0.5700570940971375, "learning_rate": 0.00010349764611615915, "loss": 0.4705, "step": 89000 }, { "epoch": 1.9811698717948718, "grad_norm": 0.487447053194046, "learning_rate": 0.00010345681281345425, "loss": 0.4994, "step": 89010 }, { "epoch": 1.98139245014245, "grad_norm": 0.7617775797843933, "learning_rate": 0.00010341598475698517, "loss": 0.5634, "step": 89020 }, { "epoch": 1.9816150284900285, "grad_norm": 0.528607964515686, "learning_rate": 0.00010337516194897054, "loss": 0.5352, "step": 89030 }, { "epoch": 1.981837606837607, "grad_norm": 0.40189066529273987, "learning_rate": 0.00010333434439162872, "loss": 0.6469, "step": 89040 }, { "epoch": 1.9820601851851851, "grad_norm": 0.4828200340270996, "learning_rate": 0.00010329353208717776, "loss": 0.4271, "step": 89050 }, { "epoch": 1.9822827635327636, "grad_norm": 0.6710944771766663, "learning_rate": 0.0001032527250378355, "loss": 0.5488, "step": 89060 }, { "epoch": 1.982505341880342, "grad_norm": 0.6126538515090942, "learning_rate": 0.0001032119232458193, "loss": 0.4838, "step": 89070 }, { "epoch": 1.9827279202279202, "grad_norm": 0.5800119042396545, "learning_rate": 0.00010317112671334643, "loss": 0.5686, "step": 89080 }, { "epoch": 1.9829504985754984, "grad_norm": 0.6056652665138245, "learning_rate": 0.00010313033544263378, "loss": 0.4074, "step": 89090 }, { "epoch": 1.9831730769230769, "grad_norm": 0.7546206712722778, "learning_rate": 0.00010308954943589801, "loss": 0.5134, "step": 89100 }, { "epoch": 1.9833956552706553, "grad_norm": 0.5002375245094299, "learning_rate": 0.00010304876869535545, "loss": 0.6756, "step": 89110 }, { "epoch": 1.9836182336182335, "grad_norm": 0.4622044563293457, "learning_rate": 0.00010300799322322228, "loss": 0.5782, "step": 89120 }, { "epoch": 1.983840811965812, "grad_norm": 0.5083198547363281, "learning_rate": 0.00010296722302171411, "loss": 0.5272, "step": 89130 }, { "epoch": 1.9840633903133904, "grad_norm": 0.47521403431892395, "learning_rate": 0.00010292645809304644, "loss": 0.4775, "step": 89140 }, { "epoch": 1.9842859686609686, "grad_norm": 0.48532363772392273, "learning_rate": 0.00010288569843943449, "loss": 0.4386, "step": 89150 }, { "epoch": 1.984508547008547, "grad_norm": 0.5855087041854858, "learning_rate": 0.0001028449440630932, "loss": 0.5472, "step": 89160 }, { "epoch": 1.9847311253561255, "grad_norm": 0.5583044290542603, "learning_rate": 0.00010280419496623714, "loss": 0.633, "step": 89170 }, { "epoch": 1.9849537037037037, "grad_norm": 0.5322542190551758, "learning_rate": 0.00010276345115108075, "loss": 0.5051, "step": 89180 }, { "epoch": 1.985176282051282, "grad_norm": 0.43916016817092896, "learning_rate": 0.00010272271261983799, "loss": 0.4588, "step": 89190 }, { "epoch": 1.9853988603988604, "grad_norm": 0.7113112807273865, "learning_rate": 0.00010268197937472275, "loss": 0.5232, "step": 89200 }, { "epoch": 1.9856214387464388, "grad_norm": 0.6803197860717773, "learning_rate": 0.00010264125141794836, "loss": 0.5176, "step": 89210 }, { "epoch": 1.985844017094017, "grad_norm": 1.2475115060806274, "learning_rate": 0.00010260052875172803, "loss": 0.4694, "step": 89220 }, { "epoch": 1.9860665954415955, "grad_norm": 0.585325300693512, "learning_rate": 0.00010255981137827473, "loss": 0.4763, "step": 89230 }, { "epoch": 1.986289173789174, "grad_norm": 0.5921732783317566, "learning_rate": 0.00010251909929980103, "loss": 0.5698, "step": 89240 }, { "epoch": 1.9865117521367521, "grad_norm": 0.3192692995071411, "learning_rate": 0.00010247839251851936, "loss": 0.636, "step": 89250 }, { "epoch": 1.9867343304843303, "grad_norm": 0.445182204246521, "learning_rate": 0.00010243769103664163, "loss": 0.3865, "step": 89260 }, { "epoch": 1.9869569088319088, "grad_norm": 0.5367228388786316, "learning_rate": 0.00010239699485637966, "loss": 0.4922, "step": 89270 }, { "epoch": 1.9871794871794872, "grad_norm": 0.4065212905406952, "learning_rate": 0.00010235630397994485, "loss": 0.5156, "step": 89280 }, { "epoch": 1.9874020655270654, "grad_norm": 0.538700520992279, "learning_rate": 0.0001023156184095484, "loss": 0.5277, "step": 89290 }, { "epoch": 1.9876246438746439, "grad_norm": 0.6323667764663696, "learning_rate": 0.00010227493814740124, "loss": 0.6574, "step": 89300 }, { "epoch": 1.9878472222222223, "grad_norm": 0.29169631004333496, "learning_rate": 0.00010223426319571392, "loss": 0.4746, "step": 89310 }, { "epoch": 1.9880698005698005, "grad_norm": 0.47286543250083923, "learning_rate": 0.00010219359355669677, "loss": 0.4648, "step": 89320 }, { "epoch": 1.9882923789173788, "grad_norm": 0.5874037146568298, "learning_rate": 0.00010215292923255982, "loss": 0.5053, "step": 89330 }, { "epoch": 1.9885149572649574, "grad_norm": 0.49212488532066345, "learning_rate": 0.00010211227022551288, "loss": 0.575, "step": 89340 }, { "epoch": 1.9887375356125356, "grad_norm": 0.6886469125747681, "learning_rate": 0.00010207161653776522, "loss": 0.3993, "step": 89350 }, { "epoch": 1.9889601139601139, "grad_norm": 0.6136534810066223, "learning_rate": 0.00010203096817152609, "loss": 0.5185, "step": 89360 }, { "epoch": 1.9891826923076923, "grad_norm": 0.6381192803382874, "learning_rate": 0.00010199032512900437, "loss": 0.5533, "step": 89370 }, { "epoch": 1.9894052706552707, "grad_norm": 0.6178556680679321, "learning_rate": 0.00010194968741240866, "loss": 0.5602, "step": 89380 }, { "epoch": 1.989627849002849, "grad_norm": 0.46962249279022217, "learning_rate": 0.00010190905502394717, "loss": 0.5227, "step": 89390 }, { "epoch": 1.9898504273504274, "grad_norm": 0.4782501459121704, "learning_rate": 0.00010186842796582789, "loss": 0.4977, "step": 89400 }, { "epoch": 1.9900730056980058, "grad_norm": 0.8166126608848572, "learning_rate": 0.00010182780624025868, "loss": 0.6993, "step": 89410 }, { "epoch": 1.990295584045584, "grad_norm": 0.55714350938797, "learning_rate": 0.00010178718984944673, "loss": 0.5452, "step": 89420 }, { "epoch": 1.9905181623931623, "grad_norm": 0.6192950010299683, "learning_rate": 0.0001017465787955993, "loss": 0.5868, "step": 89430 }, { "epoch": 1.9907407407407407, "grad_norm": 0.5710587501525879, "learning_rate": 0.00010170597308092324, "loss": 0.5548, "step": 89440 }, { "epoch": 1.9909633190883191, "grad_norm": 0.43953341245651245, "learning_rate": 0.00010166537270762504, "loss": 0.4858, "step": 89450 }, { "epoch": 1.9911858974358974, "grad_norm": 0.43213924765586853, "learning_rate": 0.00010162477767791099, "loss": 0.4924, "step": 89460 }, { "epoch": 1.9914084757834758, "grad_norm": 0.5746148228645325, "learning_rate": 0.00010158418799398706, "loss": 0.578, "step": 89470 }, { "epoch": 1.9916310541310542, "grad_norm": 0.6764656901359558, "learning_rate": 0.00010154360365805899, "loss": 0.5371, "step": 89480 }, { "epoch": 1.9918536324786325, "grad_norm": 0.5575287342071533, "learning_rate": 0.00010150302467233204, "loss": 0.6483, "step": 89490 }, { "epoch": 1.9920762108262107, "grad_norm": 0.5395053625106812, "learning_rate": 0.00010146245103901135, "loss": 0.5266, "step": 89500 }, { "epoch": 1.9922987891737893, "grad_norm": 0.4512389004230499, "learning_rate": 0.00010142188276030182, "loss": 0.4788, "step": 89510 }, { "epoch": 1.9925213675213675, "grad_norm": 0.5930687189102173, "learning_rate": 0.00010138131983840779, "loss": 0.4624, "step": 89520 }, { "epoch": 1.9927439458689458, "grad_norm": 0.32276391983032227, "learning_rate": 0.00010134076227553358, "loss": 0.4986, "step": 89530 }, { "epoch": 1.9929665242165242, "grad_norm": 0.4842802882194519, "learning_rate": 0.00010130021007388313, "loss": 0.5604, "step": 89540 }, { "epoch": 1.9931891025641026, "grad_norm": 0.7211653590202332, "learning_rate": 0.00010125966323566012, "loss": 0.5766, "step": 89550 }, { "epoch": 1.9934116809116809, "grad_norm": 0.38424408435821533, "learning_rate": 0.00010121912176306776, "loss": 0.5793, "step": 89560 }, { "epoch": 1.9936342592592593, "grad_norm": 0.5912850499153137, "learning_rate": 0.00010117858565830922, "loss": 0.5374, "step": 89570 }, { "epoch": 1.9938568376068377, "grad_norm": 0.4847007691860199, "learning_rate": 0.00010113805492358721, "loss": 0.4387, "step": 89580 }, { "epoch": 1.994079415954416, "grad_norm": 0.4593251645565033, "learning_rate": 0.00010109752956110423, "loss": 0.447, "step": 89590 }, { "epoch": 1.9943019943019942, "grad_norm": 0.7256568074226379, "learning_rate": 0.00010105700957306248, "loss": 0.5952, "step": 89600 }, { "epoch": 1.9945245726495726, "grad_norm": 0.4649495780467987, "learning_rate": 0.00010101649496166379, "loss": 0.5566, "step": 89610 }, { "epoch": 1.994747150997151, "grad_norm": 0.4171789586544037, "learning_rate": 0.00010097598572910988, "loss": 0.5237, "step": 89620 }, { "epoch": 1.9949697293447293, "grad_norm": 0.4397051930427551, "learning_rate": 0.00010093548187760192, "loss": 0.6384, "step": 89630 }, { "epoch": 1.9951923076923077, "grad_norm": 0.5089512467384338, "learning_rate": 0.00010089498340934102, "loss": 0.4769, "step": 89640 }, { "epoch": 1.9954148860398861, "grad_norm": 0.6652963757514954, "learning_rate": 0.0001008544903265278, "loss": 0.6014, "step": 89650 }, { "epoch": 1.9956374643874644, "grad_norm": 0.4736979007720947, "learning_rate": 0.00010081400263136274, "loss": 0.4361, "step": 89660 }, { "epoch": 1.9958600427350426, "grad_norm": 0.7009586095809937, "learning_rate": 0.00010077352032604597, "loss": 0.6858, "step": 89670 }, { "epoch": 1.9960826210826212, "grad_norm": 0.7269137501716614, "learning_rate": 0.00010073304341277733, "loss": 0.5995, "step": 89680 }, { "epoch": 1.9963051994301995, "grad_norm": 0.6158400774002075, "learning_rate": 0.00010069257189375645, "loss": 0.6985, "step": 89690 }, { "epoch": 1.9965277777777777, "grad_norm": 0.9363667964935303, "learning_rate": 0.00010065210577118245, "loss": 0.4506, "step": 89700 }, { "epoch": 1.9967503561253561, "grad_norm": 0.6668571829795837, "learning_rate": 0.00010061164504725433, "loss": 0.5016, "step": 89710 }, { "epoch": 1.9969729344729346, "grad_norm": 0.5492924451828003, "learning_rate": 0.00010057118972417079, "loss": 0.4525, "step": 89720 }, { "epoch": 1.9971955128205128, "grad_norm": 0.6436484456062317, "learning_rate": 0.00010053073980413019, "loss": 0.6489, "step": 89730 }, { "epoch": 1.9974180911680912, "grad_norm": 0.6494245529174805, "learning_rate": 0.00010049029528933065, "loss": 0.6466, "step": 89740 }, { "epoch": 1.9976406695156697, "grad_norm": 0.33054423332214355, "learning_rate": 0.00010044985618196987, "loss": 0.5338, "step": 89750 }, { "epoch": 1.9978632478632479, "grad_norm": 0.4462552070617676, "learning_rate": 0.00010040942248424553, "loss": 0.4926, "step": 89760 }, { "epoch": 1.998085826210826, "grad_norm": 0.5570842623710632, "learning_rate": 0.00010036899419835468, "loss": 0.4973, "step": 89770 }, { "epoch": 1.9983084045584045, "grad_norm": 0.5469715595245361, "learning_rate": 0.00010032857132649418, "loss": 0.4614, "step": 89780 }, { "epoch": 1.998530982905983, "grad_norm": 0.6691291928291321, "learning_rate": 0.0001002881538708607, "loss": 0.5004, "step": 89790 }, { "epoch": 1.9987535612535612, "grad_norm": 0.6604383587837219, "learning_rate": 0.00010024774183365056, "loss": 0.4791, "step": 89800 }, { "epoch": 1.9989761396011396, "grad_norm": 0.6781284809112549, "learning_rate": 0.00010020733521705978, "loss": 0.5646, "step": 89810 }, { "epoch": 1.999198717948718, "grad_norm": 0.5180469751358032, "learning_rate": 0.00010016693402328412, "loss": 0.5442, "step": 89820 }, { "epoch": 1.9994212962962963, "grad_norm": 0.647405743598938, "learning_rate": 0.00010012653825451908, "loss": 0.508, "step": 89830 }, { "epoch": 1.9996438746438745, "grad_norm": 0.5377041101455688, "learning_rate": 0.00010008614791295961, "loss": 0.5258, "step": 89840 }, { "epoch": 1.999866452991453, "grad_norm": 0.6276906132698059, "learning_rate": 0.00010004576300080067, "loss": 0.4638, "step": 89850 }, { "epoch": 2.0000890313390314, "grad_norm": 0.3543466031551361, "learning_rate": 0.00010000538352023676, "loss": 0.4689, "step": 89860 }, { "epoch": 2.0003116096866096, "grad_norm": 0.43617504835128784, "learning_rate": 9.996500947346221e-05, "loss": 0.3632, "step": 89870 }, { "epoch": 2.0005341880341883, "grad_norm": 0.45962151885032654, "learning_rate": 9.992464086267093e-05, "loss": 0.4641, "step": 89880 }, { "epoch": 2.0007567663817665, "grad_norm": 0.42497971653938293, "learning_rate": 9.98842776900566e-05, "loss": 0.5192, "step": 89890 }, { "epoch": 2.0009793447293447, "grad_norm": 0.777499794960022, "learning_rate": 9.984391995781262e-05, "loss": 0.436, "step": 89900 }, { "epoch": 2.001201923076923, "grad_norm": 0.5374550819396973, "learning_rate": 9.980356766813206e-05, "loss": 0.4228, "step": 89910 }, { "epoch": 2.0014245014245016, "grad_norm": 0.6309812664985657, "learning_rate": 9.976322082320755e-05, "loss": 0.4356, "step": 89920 }, { "epoch": 2.00164707977208, "grad_norm": 0.6194542646408081, "learning_rate": 9.972287942523168e-05, "loss": 0.5146, "step": 89930 }, { "epoch": 2.001869658119658, "grad_norm": 0.5618504881858826, "learning_rate": 9.968254347639666e-05, "loss": 0.3839, "step": 89940 }, { "epoch": 2.0020922364672367, "grad_norm": 0.41237515211105347, "learning_rate": 9.964221297889433e-05, "loss": 0.3925, "step": 89950 }, { "epoch": 2.002314814814815, "grad_norm": 0.5048966407775879, "learning_rate": 9.960188793491632e-05, "loss": 0.5813, "step": 89960 }, { "epoch": 2.002537393162393, "grad_norm": 0.5869396328926086, "learning_rate": 9.956156834665398e-05, "loss": 0.5925, "step": 89970 }, { "epoch": 2.0027599715099713, "grad_norm": 0.598250150680542, "learning_rate": 9.952125421629814e-05, "loss": 0.4339, "step": 89980 }, { "epoch": 2.00298254985755, "grad_norm": 0.7474868893623352, "learning_rate": 9.948094554603962e-05, "loss": 0.4814, "step": 89990 }, { "epoch": 2.003205128205128, "grad_norm": 0.7503690719604492, "learning_rate": 9.94406423380688e-05, "loss": 0.4807, "step": 90000 }, { "epoch": 2.0034277065527064, "grad_norm": 0.47324520349502563, "learning_rate": 9.94003445945758e-05, "loss": 0.4463, "step": 90010 }, { "epoch": 2.003650284900285, "grad_norm": 0.586833119392395, "learning_rate": 9.936005231775046e-05, "loss": 0.5546, "step": 90020 }, { "epoch": 2.0038728632478633, "grad_norm": 0.5987151265144348, "learning_rate": 9.93197655097823e-05, "loss": 0.473, "step": 90030 }, { "epoch": 2.0040954415954415, "grad_norm": 0.4147493243217468, "learning_rate": 9.927948417286044e-05, "loss": 0.3984, "step": 90040 }, { "epoch": 2.0043180199430197, "grad_norm": 0.34556564688682556, "learning_rate": 9.923920830917395e-05, "loss": 0.3464, "step": 90050 }, { "epoch": 2.0045405982905984, "grad_norm": 0.4374103248119354, "learning_rate": 9.919893792091129e-05, "loss": 0.4862, "step": 90060 }, { "epoch": 2.0047631766381766, "grad_norm": 0.5401944518089294, "learning_rate": 9.915867301026083e-05, "loss": 0.4721, "step": 90070 }, { "epoch": 2.004985754985755, "grad_norm": 0.367075651884079, "learning_rate": 9.911841357941068e-05, "loss": 0.6188, "step": 90080 }, { "epoch": 2.0052083333333335, "grad_norm": 0.6527010798454285, "learning_rate": 9.907815963054847e-05, "loss": 0.5411, "step": 90090 }, { "epoch": 2.0054309116809117, "grad_norm": 0.45079076290130615, "learning_rate": 9.903791116586172e-05, "loss": 0.4643, "step": 90100 }, { "epoch": 2.00565349002849, "grad_norm": 0.31943023204803467, "learning_rate": 9.89976681875376e-05, "loss": 0.3938, "step": 90110 }, { "epoch": 2.0058760683760686, "grad_norm": 0.7696496844291687, "learning_rate": 9.89574306977628e-05, "loss": 0.5147, "step": 90120 }, { "epoch": 2.006098646723647, "grad_norm": 0.5576694011688232, "learning_rate": 9.891719869872391e-05, "loss": 0.4113, "step": 90130 }, { "epoch": 2.006321225071225, "grad_norm": 0.5383765697479248, "learning_rate": 9.887697219260723e-05, "loss": 0.4871, "step": 90140 }, { "epoch": 2.0065438034188032, "grad_norm": 0.38253116607666016, "learning_rate": 9.883675118159863e-05, "loss": 0.4429, "step": 90150 }, { "epoch": 2.006766381766382, "grad_norm": 0.6916284561157227, "learning_rate": 9.879653566788381e-05, "loss": 0.5345, "step": 90160 }, { "epoch": 2.00698896011396, "grad_norm": 0.5268100500106812, "learning_rate": 9.875632565364816e-05, "loss": 0.3958, "step": 90170 }, { "epoch": 2.0072115384615383, "grad_norm": 0.4431670606136322, "learning_rate": 9.871612114107658e-05, "loss": 0.4366, "step": 90180 }, { "epoch": 2.007434116809117, "grad_norm": 0.4209796190261841, "learning_rate": 9.867592213235397e-05, "loss": 0.3836, "step": 90190 }, { "epoch": 2.007656695156695, "grad_norm": 0.6848621964454651, "learning_rate": 9.863572862966461e-05, "loss": 0.5611, "step": 90200 }, { "epoch": 2.0078792735042734, "grad_norm": 0.4434153735637665, "learning_rate": 9.859554063519276e-05, "loss": 0.4577, "step": 90210 }, { "epoch": 2.0081018518518516, "grad_norm": 0.7151669263839722, "learning_rate": 9.855535815112222e-05, "loss": 0.5231, "step": 90220 }, { "epoch": 2.0083244301994303, "grad_norm": 0.5325934290885925, "learning_rate": 9.851518117963659e-05, "loss": 0.491, "step": 90230 }, { "epoch": 2.0085470085470085, "grad_norm": 0.4765765070915222, "learning_rate": 9.847500972291908e-05, "loss": 0.4732, "step": 90240 }, { "epoch": 2.0087695868945867, "grad_norm": 0.6172223091125488, "learning_rate": 9.843484378315274e-05, "loss": 0.3766, "step": 90250 }, { "epoch": 2.0089921652421654, "grad_norm": 0.7189575433731079, "learning_rate": 9.839468336252002e-05, "loss": 0.458, "step": 90260 }, { "epoch": 2.0092147435897436, "grad_norm": 0.7340860366821289, "learning_rate": 9.835452846320343e-05, "loss": 0.4761, "step": 90270 }, { "epoch": 2.009437321937322, "grad_norm": 0.7318997979164124, "learning_rate": 9.831437908738494e-05, "loss": 0.5299, "step": 90280 }, { "epoch": 2.0096599002849005, "grad_norm": 0.4002963602542877, "learning_rate": 9.827423523724636e-05, "loss": 0.395, "step": 90290 }, { "epoch": 2.0098824786324787, "grad_norm": 0.5723457932472229, "learning_rate": 9.823409691496918e-05, "loss": 0.53, "step": 90300 }, { "epoch": 2.010105056980057, "grad_norm": 0.646827220916748, "learning_rate": 9.819396412273441e-05, "loss": 0.5054, "step": 90310 }, { "epoch": 2.010327635327635, "grad_norm": 0.5630742311477661, "learning_rate": 9.815383686272297e-05, "loss": 0.5638, "step": 90320 }, { "epoch": 2.010550213675214, "grad_norm": 0.6631711721420288, "learning_rate": 9.811371513711549e-05, "loss": 0.3844, "step": 90330 }, { "epoch": 2.010772792022792, "grad_norm": 0.796441376209259, "learning_rate": 9.807359894809205e-05, "loss": 0.3774, "step": 90340 }, { "epoch": 2.0109953703703702, "grad_norm": 0.4723573625087738, "learning_rate": 9.80334882978327e-05, "loss": 0.4761, "step": 90350 }, { "epoch": 2.011217948717949, "grad_norm": 0.7912504076957703, "learning_rate": 9.799338318851706e-05, "loss": 0.4198, "step": 90360 }, { "epoch": 2.011440527065527, "grad_norm": 0.5627785325050354, "learning_rate": 9.795328362232448e-05, "loss": 0.4511, "step": 90370 }, { "epoch": 2.0116631054131053, "grad_norm": 0.7288662195205688, "learning_rate": 9.791318960143401e-05, "loss": 0.487, "step": 90380 }, { "epoch": 2.0118856837606836, "grad_norm": 0.761756181716919, "learning_rate": 9.787310112802448e-05, "loss": 0.4703, "step": 90390 }, { "epoch": 2.012108262108262, "grad_norm": 0.7722057700157166, "learning_rate": 9.783301820427416e-05, "loss": 0.4858, "step": 90400 }, { "epoch": 2.0123308404558404, "grad_norm": 0.4718744456768036, "learning_rate": 9.779294083236124e-05, "loss": 0.4952, "step": 90410 }, { "epoch": 2.0125534188034186, "grad_norm": 0.6757660508155823, "learning_rate": 9.775286901446362e-05, "loss": 0.4728, "step": 90420 }, { "epoch": 2.0127759971509973, "grad_norm": 0.5467312932014465, "learning_rate": 9.771280275275885e-05, "loss": 0.4071, "step": 90430 }, { "epoch": 2.0129985754985755, "grad_norm": 0.37198352813720703, "learning_rate": 9.767274204942405e-05, "loss": 0.4475, "step": 90440 }, { "epoch": 2.0132211538461537, "grad_norm": 0.8231156468391418, "learning_rate": 9.763268690663623e-05, "loss": 0.4464, "step": 90450 }, { "epoch": 2.0134437321937324, "grad_norm": 0.4113079309463501, "learning_rate": 9.759263732657198e-05, "loss": 0.4318, "step": 90460 }, { "epoch": 2.0136663105413106, "grad_norm": 0.4778950810432434, "learning_rate": 9.755259331140774e-05, "loss": 0.4658, "step": 90470 }, { "epoch": 2.013888888888889, "grad_norm": 0.6860994100570679, "learning_rate": 9.751255486331938e-05, "loss": 0.4329, "step": 90480 }, { "epoch": 2.014111467236467, "grad_norm": 0.43397676944732666, "learning_rate": 9.747252198448267e-05, "loss": 0.4092, "step": 90490 }, { "epoch": 2.0143340455840457, "grad_norm": 0.7068895101547241, "learning_rate": 9.743249467707307e-05, "loss": 0.4854, "step": 90500 }, { "epoch": 2.014556623931624, "grad_norm": 0.4531174302101135, "learning_rate": 9.739247294326565e-05, "loss": 0.4063, "step": 90510 }, { "epoch": 2.014779202279202, "grad_norm": 0.5841737985610962, "learning_rate": 9.735245678523527e-05, "loss": 0.5277, "step": 90520 }, { "epoch": 2.015001780626781, "grad_norm": 0.5976734161376953, "learning_rate": 9.731244620515649e-05, "loss": 0.5151, "step": 90530 }, { "epoch": 2.015224358974359, "grad_norm": 0.6347478628158569, "learning_rate": 9.727244120520338e-05, "loss": 0.3879, "step": 90540 }, { "epoch": 2.0154469373219372, "grad_norm": 0.4365602433681488, "learning_rate": 9.723244178754988e-05, "loss": 0.5014, "step": 90550 }, { "epoch": 2.0156695156695155, "grad_norm": 0.5678282380104065, "learning_rate": 9.719244795436972e-05, "loss": 0.4745, "step": 90560 }, { "epoch": 2.015892094017094, "grad_norm": 0.6716097593307495, "learning_rate": 9.715245970783604e-05, "loss": 0.4533, "step": 90570 }, { "epoch": 2.0161146723646723, "grad_norm": 0.5791259407997131, "learning_rate": 9.711247705012187e-05, "loss": 0.4712, "step": 90580 }, { "epoch": 2.0163372507122506, "grad_norm": 0.6677980422973633, "learning_rate": 9.707249998339993e-05, "loss": 0.4188, "step": 90590 }, { "epoch": 2.0165598290598292, "grad_norm": 0.580368161201477, "learning_rate": 9.703252850984261e-05, "loss": 0.4026, "step": 90600 }, { "epoch": 2.0167824074074074, "grad_norm": 0.5054930448532104, "learning_rate": 9.699256263162205e-05, "loss": 0.472, "step": 90610 }, { "epoch": 2.0170049857549857, "grad_norm": 0.5461148023605347, "learning_rate": 9.695260235090988e-05, "loss": 0.5155, "step": 90620 }, { "epoch": 2.0172275641025643, "grad_norm": 0.6772049069404602, "learning_rate": 9.691264766987769e-05, "loss": 0.6472, "step": 90630 }, { "epoch": 2.0174501424501425, "grad_norm": 0.4362126588821411, "learning_rate": 9.687269859069659e-05, "loss": 0.3806, "step": 90640 }, { "epoch": 2.0176727207977208, "grad_norm": 0.42470234632492065, "learning_rate": 9.683275511553746e-05, "loss": 0.3881, "step": 90650 }, { "epoch": 2.017895299145299, "grad_norm": 0.5822004079818726, "learning_rate": 9.67928172465709e-05, "loss": 0.5111, "step": 90660 }, { "epoch": 2.0181178774928776, "grad_norm": 0.708109438419342, "learning_rate": 9.67528849859672e-05, "loss": 0.4816, "step": 90670 }, { "epoch": 2.018340455840456, "grad_norm": 0.740749716758728, "learning_rate": 9.67129583358962e-05, "loss": 0.4824, "step": 90680 }, { "epoch": 2.018563034188034, "grad_norm": 0.5704371333122253, "learning_rate": 9.667303729852763e-05, "loss": 0.4345, "step": 90690 }, { "epoch": 2.0187856125356127, "grad_norm": 0.534059464931488, "learning_rate": 9.663312187603077e-05, "loss": 0.5181, "step": 90700 }, { "epoch": 2.019008190883191, "grad_norm": 0.408011257648468, "learning_rate": 9.659321207057466e-05, "loss": 0.4758, "step": 90710 }, { "epoch": 2.019230769230769, "grad_norm": 0.4999423325061798, "learning_rate": 9.655330788432808e-05, "loss": 0.4698, "step": 90720 }, { "epoch": 2.0194533475783474, "grad_norm": 0.5350780487060547, "learning_rate": 9.651340931945942e-05, "loss": 0.4726, "step": 90730 }, { "epoch": 2.019675925925926, "grad_norm": 0.4886220693588257, "learning_rate": 9.647351637813682e-05, "loss": 0.526, "step": 90740 }, { "epoch": 2.0198985042735043, "grad_norm": 0.7873054146766663, "learning_rate": 9.643362906252816e-05, "loss": 0.37, "step": 90750 }, { "epoch": 2.0201210826210825, "grad_norm": 0.60884690284729, "learning_rate": 9.63937473748008e-05, "loss": 0.5565, "step": 90760 }, { "epoch": 2.020343660968661, "grad_norm": 0.4547918438911438, "learning_rate": 9.635387131712204e-05, "loss": 0.4058, "step": 90770 }, { "epoch": 2.0205662393162394, "grad_norm": 0.410516619682312, "learning_rate": 9.631400089165876e-05, "loss": 0.3991, "step": 90780 }, { "epoch": 2.0207888176638176, "grad_norm": 0.5903146862983704, "learning_rate": 9.627413610057754e-05, "loss": 0.4019, "step": 90790 }, { "epoch": 2.021011396011396, "grad_norm": 0.7246853113174438, "learning_rate": 9.623427694604467e-05, "loss": 0.5818, "step": 90800 }, { "epoch": 2.0212339743589745, "grad_norm": 0.5898091197013855, "learning_rate": 9.619442343022625e-05, "loss": 0.4375, "step": 90810 }, { "epoch": 2.0214565527065527, "grad_norm": 0.5030173063278198, "learning_rate": 9.615457555528782e-05, "loss": 0.4817, "step": 90820 }, { "epoch": 2.021679131054131, "grad_norm": 0.656521737575531, "learning_rate": 9.611473332339469e-05, "loss": 0.4512, "step": 90830 }, { "epoch": 2.0219017094017095, "grad_norm": 0.5611187815666199, "learning_rate": 9.607489673671199e-05, "loss": 0.5406, "step": 90840 }, { "epoch": 2.0221242877492878, "grad_norm": 0.5025830864906311, "learning_rate": 9.603506579740447e-05, "loss": 0.4433, "step": 90850 }, { "epoch": 2.022346866096866, "grad_norm": 0.5679742693901062, "learning_rate": 9.59952405076366e-05, "loss": 0.5052, "step": 90860 }, { "epoch": 2.0225694444444446, "grad_norm": 0.5760874152183533, "learning_rate": 9.59554208695725e-05, "loss": 0.4054, "step": 90870 }, { "epoch": 2.022792022792023, "grad_norm": 0.8978238701820374, "learning_rate": 9.591560688537609e-05, "loss": 0.5044, "step": 90880 }, { "epoch": 2.023014601139601, "grad_norm": 0.4812617301940918, "learning_rate": 9.587579855721071e-05, "loss": 0.38, "step": 90890 }, { "epoch": 2.0232371794871793, "grad_norm": 0.41074270009994507, "learning_rate": 9.58359958872397e-05, "loss": 0.4779, "step": 90900 }, { "epoch": 2.023459757834758, "grad_norm": 0.4377320408821106, "learning_rate": 9.579619887762594e-05, "loss": 0.4976, "step": 90910 }, { "epoch": 2.023682336182336, "grad_norm": 0.610565185546875, "learning_rate": 9.575640753053204e-05, "loss": 0.4633, "step": 90920 }, { "epoch": 2.0239049145299144, "grad_norm": 0.49709609150886536, "learning_rate": 9.571662184812027e-05, "loss": 0.4137, "step": 90930 }, { "epoch": 2.024127492877493, "grad_norm": 0.6810767650604248, "learning_rate": 9.567684183255265e-05, "loss": 0.408, "step": 90940 }, { "epoch": 2.0243500712250713, "grad_norm": 0.6302564144134521, "learning_rate": 9.563706748599095e-05, "loss": 0.5975, "step": 90950 }, { "epoch": 2.0245726495726495, "grad_norm": 0.8101886510848999, "learning_rate": 9.55972988105964e-05, "loss": 0.539, "step": 90960 }, { "epoch": 2.0247952279202277, "grad_norm": 0.5000439882278442, "learning_rate": 9.555753580853007e-05, "loss": 0.4115, "step": 90970 }, { "epoch": 2.0250178062678064, "grad_norm": 0.5363843441009521, "learning_rate": 9.551777848195269e-05, "loss": 0.4885, "step": 90980 }, { "epoch": 2.0252403846153846, "grad_norm": 0.5441843271255493, "learning_rate": 9.54780268330248e-05, "loss": 0.4076, "step": 90990 }, { "epoch": 2.025462962962963, "grad_norm": 0.6781536936759949, "learning_rate": 9.543828086390649e-05, "loss": 0.3442, "step": 91000 }, { "epoch": 2.0256855413105415, "grad_norm": 0.5577031970024109, "learning_rate": 9.539854057675759e-05, "loss": 0.4969, "step": 91010 }, { "epoch": 2.0259081196581197, "grad_norm": 0.5493881702423096, "learning_rate": 9.535880597373769e-05, "loss": 0.3874, "step": 91020 }, { "epoch": 2.026130698005698, "grad_norm": 0.5387133955955505, "learning_rate": 9.531907705700589e-05, "loss": 0.4539, "step": 91030 }, { "epoch": 2.0263532763532766, "grad_norm": 0.5434364080429077, "learning_rate": 9.52793538287211e-05, "loss": 0.4671, "step": 91040 }, { "epoch": 2.0265758547008548, "grad_norm": 0.595073401927948, "learning_rate": 9.5239636291042e-05, "loss": 0.5306, "step": 91050 }, { "epoch": 2.026798433048433, "grad_norm": 0.7041422128677368, "learning_rate": 9.51999244461268e-05, "loss": 0.5565, "step": 91060 }, { "epoch": 2.027021011396011, "grad_norm": 0.48130425810813904, "learning_rate": 9.516021829613348e-05, "loss": 0.4901, "step": 91070 }, { "epoch": 2.02724358974359, "grad_norm": 0.7272732257843018, "learning_rate": 9.512051784321976e-05, "loss": 0.4513, "step": 91080 }, { "epoch": 2.027466168091168, "grad_norm": 0.4732091724872589, "learning_rate": 9.5080823089543e-05, "loss": 0.4689, "step": 91090 }, { "epoch": 2.0276887464387463, "grad_norm": 0.41249731183052063, "learning_rate": 9.504113403726023e-05, "loss": 0.425, "step": 91100 }, { "epoch": 2.027911324786325, "grad_norm": 0.6525123119354248, "learning_rate": 9.500145068852806e-05, "loss": 0.4401, "step": 91110 }, { "epoch": 2.028133903133903, "grad_norm": 0.5969793200492859, "learning_rate": 9.496177304550305e-05, "loss": 0.3355, "step": 91120 }, { "epoch": 2.0283564814814814, "grad_norm": 0.8126296401023865, "learning_rate": 9.492210111034126e-05, "loss": 0.5219, "step": 91130 }, { "epoch": 2.0285790598290596, "grad_norm": 0.4047560691833496, "learning_rate": 9.48824348851985e-05, "loss": 0.4808, "step": 91140 }, { "epoch": 2.0288016381766383, "grad_norm": 0.3511456251144409, "learning_rate": 9.48427743722303e-05, "loss": 0.4441, "step": 91150 }, { "epoch": 2.0290242165242165, "grad_norm": 0.7069734334945679, "learning_rate": 9.480311957359192e-05, "loss": 0.4695, "step": 91160 }, { "epoch": 2.0292467948717947, "grad_norm": 0.5533146262168884, "learning_rate": 9.476347049143803e-05, "loss": 0.4474, "step": 91170 }, { "epoch": 2.0294693732193734, "grad_norm": 0.6713184118270874, "learning_rate": 9.472382712792332e-05, "loss": 0.3889, "step": 91180 }, { "epoch": 2.0296919515669516, "grad_norm": 0.6932535171508789, "learning_rate": 9.468418948520204e-05, "loss": 0.4224, "step": 91190 }, { "epoch": 2.02991452991453, "grad_norm": 0.6654514074325562, "learning_rate": 9.46445575654281e-05, "loss": 0.4261, "step": 91200 }, { "epoch": 2.0301371082621085, "grad_norm": 0.5747096538543701, "learning_rate": 9.460493137075514e-05, "loss": 0.572, "step": 91210 }, { "epoch": 2.0303596866096867, "grad_norm": 0.5004822015762329, "learning_rate": 9.456531090333658e-05, "loss": 0.4607, "step": 91220 }, { "epoch": 2.030582264957265, "grad_norm": 0.7012773156166077, "learning_rate": 9.452569616532528e-05, "loss": 0.4961, "step": 91230 }, { "epoch": 2.030804843304843, "grad_norm": 0.5176010131835938, "learning_rate": 9.448608715887403e-05, "loss": 0.5134, "step": 91240 }, { "epoch": 2.031027421652422, "grad_norm": 0.907871663570404, "learning_rate": 9.444648388613515e-05, "loss": 0.4613, "step": 91250 }, { "epoch": 2.03125, "grad_norm": 0.36404740810394287, "learning_rate": 9.440688634926071e-05, "loss": 0.3384, "step": 91260 }, { "epoch": 2.031472578347578, "grad_norm": 0.6608119606971741, "learning_rate": 9.436729455040254e-05, "loss": 0.3983, "step": 91270 }, { "epoch": 2.031695156695157, "grad_norm": 0.8917686939239502, "learning_rate": 9.432770849171204e-05, "loss": 0.4951, "step": 91280 }, { "epoch": 2.031917735042735, "grad_norm": 0.5580182075500488, "learning_rate": 9.428812817534037e-05, "loss": 0.4805, "step": 91290 }, { "epoch": 2.0321403133903133, "grad_norm": 0.6238675713539124, "learning_rate": 9.424855360343842e-05, "loss": 0.5015, "step": 91300 }, { "epoch": 2.0323628917378915, "grad_norm": 0.45403924584388733, "learning_rate": 9.420898477815658e-05, "loss": 0.4426, "step": 91310 }, { "epoch": 2.03258547008547, "grad_norm": 0.5086132287979126, "learning_rate": 9.41694217016451e-05, "loss": 0.3683, "step": 91320 }, { "epoch": 2.0328080484330484, "grad_norm": 0.6602432131767273, "learning_rate": 9.412986437605391e-05, "loss": 0.5435, "step": 91330 }, { "epoch": 2.0330306267806266, "grad_norm": 0.4932442903518677, "learning_rate": 9.409031280353254e-05, "loss": 0.4734, "step": 91340 }, { "epoch": 2.0332532051282053, "grad_norm": 0.4811466634273529, "learning_rate": 9.405076698623034e-05, "loss": 0.3959, "step": 91350 }, { "epoch": 2.0334757834757835, "grad_norm": 0.6618918180465698, "learning_rate": 9.401122692629613e-05, "loss": 0.4753, "step": 91360 }, { "epoch": 2.0336983618233617, "grad_norm": 0.5234821438789368, "learning_rate": 9.397169262587862e-05, "loss": 0.5293, "step": 91370 }, { "epoch": 2.0339209401709404, "grad_norm": 0.5152151584625244, "learning_rate": 9.393216408712619e-05, "loss": 0.4318, "step": 91380 }, { "epoch": 2.0341435185185186, "grad_norm": 0.3490557074546814, "learning_rate": 9.389264131218673e-05, "loss": 0.4209, "step": 91390 }, { "epoch": 2.034366096866097, "grad_norm": 0.5929777026176453, "learning_rate": 9.385312430320801e-05, "loss": 0.5068, "step": 91400 }, { "epoch": 2.034588675213675, "grad_norm": 0.630289614200592, "learning_rate": 9.38136130623374e-05, "loss": 0.5485, "step": 91410 }, { "epoch": 2.0348112535612537, "grad_norm": 0.572494626045227, "learning_rate": 9.377410759172198e-05, "loss": 0.4738, "step": 91420 }, { "epoch": 2.035033831908832, "grad_norm": 0.5333491563796997, "learning_rate": 9.373460789350854e-05, "loss": 0.5702, "step": 91430 }, { "epoch": 2.03525641025641, "grad_norm": 0.6336487531661987, "learning_rate": 9.369511396984356e-05, "loss": 0.5368, "step": 91440 }, { "epoch": 2.035478988603989, "grad_norm": 0.5200332999229431, "learning_rate": 9.365562582287304e-05, "loss": 0.5022, "step": 91450 }, { "epoch": 2.035701566951567, "grad_norm": 0.31436365842819214, "learning_rate": 9.361614345474286e-05, "loss": 0.4842, "step": 91460 }, { "epoch": 2.0359241452991452, "grad_norm": 0.5940852165222168, "learning_rate": 9.357666686759854e-05, "loss": 0.4173, "step": 91470 }, { "epoch": 2.0361467236467234, "grad_norm": 0.43001100420951843, "learning_rate": 9.353719606358533e-05, "loss": 0.3914, "step": 91480 }, { "epoch": 2.036369301994302, "grad_norm": 0.4594919681549072, "learning_rate": 9.349773104484798e-05, "loss": 0.4438, "step": 91490 }, { "epoch": 2.0365918803418803, "grad_norm": 0.38712412118911743, "learning_rate": 9.34582718135311e-05, "loss": 0.4537, "step": 91500 }, { "epoch": 2.0368144586894585, "grad_norm": 0.553755521774292, "learning_rate": 9.341881837177897e-05, "loss": 0.4922, "step": 91510 }, { "epoch": 2.037037037037037, "grad_norm": 0.39810413122177124, "learning_rate": 9.337937072173557e-05, "loss": 0.451, "step": 91520 }, { "epoch": 2.0372596153846154, "grad_norm": 0.5983201861381531, "learning_rate": 9.333992886554437e-05, "loss": 0.5033, "step": 91530 }, { "epoch": 2.0374821937321936, "grad_norm": 0.45720791816711426, "learning_rate": 9.330049280534874e-05, "loss": 0.5107, "step": 91540 }, { "epoch": 2.0377047720797723, "grad_norm": 0.48426130414009094, "learning_rate": 9.326106254329167e-05, "loss": 0.5196, "step": 91550 }, { "epoch": 2.0379273504273505, "grad_norm": 0.679417073726654, "learning_rate": 9.322163808151587e-05, "loss": 0.503, "step": 91560 }, { "epoch": 2.0381499287749287, "grad_norm": 0.6564821600914001, "learning_rate": 9.318221942216366e-05, "loss": 0.4255, "step": 91570 }, { "epoch": 2.038372507122507, "grad_norm": 0.6791837215423584, "learning_rate": 9.314280656737717e-05, "loss": 0.3442, "step": 91580 }, { "epoch": 2.0385950854700856, "grad_norm": 0.7403519153594971, "learning_rate": 9.310339951929797e-05, "loss": 0.3808, "step": 91590 }, { "epoch": 2.038817663817664, "grad_norm": 0.7340866923332214, "learning_rate": 9.306399828006754e-05, "loss": 0.5083, "step": 91600 }, { "epoch": 2.039040242165242, "grad_norm": 0.6276527643203735, "learning_rate": 9.302460285182707e-05, "loss": 0.5224, "step": 91610 }, { "epoch": 2.0392628205128207, "grad_norm": 0.6082279086112976, "learning_rate": 9.298521323671719e-05, "loss": 0.5278, "step": 91620 }, { "epoch": 2.039485398860399, "grad_norm": 0.5417894124984741, "learning_rate": 9.294582943687842e-05, "loss": 0.4473, "step": 91630 }, { "epoch": 2.039707977207977, "grad_norm": 0.5575587749481201, "learning_rate": 9.29064514544509e-05, "loss": 0.5124, "step": 91640 }, { "epoch": 2.0399305555555554, "grad_norm": 0.49067622423171997, "learning_rate": 9.28670792915745e-05, "loss": 0.44, "step": 91650 }, { "epoch": 2.040153133903134, "grad_norm": 0.6434421539306641, "learning_rate": 9.282771295038877e-05, "loss": 0.4494, "step": 91660 }, { "epoch": 2.0402421652421654, "eval_loss": 0.5388351082801819, "eval_runtime": 337.1739, "eval_samples_per_second": 7.014, "eval_steps_per_second": 7.014, "step": 91664 }, { "epoch": 2.0403757122507122, "grad_norm": 0.6126685738563538, "learning_rate": 9.278835243303281e-05, "loss": 0.4354, "step": 91670 }, { "epoch": 2.0405982905982905, "grad_norm": 0.6492798924446106, "learning_rate": 9.274899774164552e-05, "loss": 0.4865, "step": 91680 }, { "epoch": 2.040820868945869, "grad_norm": 0.4895659387111664, "learning_rate": 9.27096488783655e-05, "loss": 0.4222, "step": 91690 }, { "epoch": 2.0410434472934473, "grad_norm": 0.7228186130523682, "learning_rate": 9.267030584533099e-05, "loss": 0.5693, "step": 91700 }, { "epoch": 2.0412660256410255, "grad_norm": 0.3963758647441864, "learning_rate": 9.263096864467993e-05, "loss": 0.4665, "step": 91710 }, { "epoch": 2.041488603988604, "grad_norm": 0.4847806394100189, "learning_rate": 9.259163727855001e-05, "loss": 0.472, "step": 91720 }, { "epoch": 2.0417111823361824, "grad_norm": 0.5810156464576721, "learning_rate": 9.255231174907835e-05, "loss": 0.4409, "step": 91730 }, { "epoch": 2.0419337606837606, "grad_norm": 0.4100762903690338, "learning_rate": 9.251299205840214e-05, "loss": 0.5712, "step": 91740 }, { "epoch": 2.042156339031339, "grad_norm": 0.6675921082496643, "learning_rate": 9.247367820865782e-05, "loss": 0.4256, "step": 91750 }, { "epoch": 2.0423789173789175, "grad_norm": 0.4220803678035736, "learning_rate": 9.243437020198189e-05, "loss": 0.5776, "step": 91760 }, { "epoch": 2.0426014957264957, "grad_norm": 0.40601903200149536, "learning_rate": 9.239506804051032e-05, "loss": 0.4718, "step": 91770 }, { "epoch": 2.042824074074074, "grad_norm": 0.719412088394165, "learning_rate": 9.235577172637884e-05, "loss": 0.6789, "step": 91780 }, { "epoch": 2.0430466524216526, "grad_norm": 0.4384337365627289, "learning_rate": 9.231648126172286e-05, "loss": 0.4875, "step": 91790 }, { "epoch": 2.043269230769231, "grad_norm": 0.5705899000167847, "learning_rate": 9.227719664867748e-05, "loss": 0.4742, "step": 91800 }, { "epoch": 2.043491809116809, "grad_norm": 0.46044304966926575, "learning_rate": 9.223791788937738e-05, "loss": 0.3934, "step": 91810 }, { "epoch": 2.0437143874643873, "grad_norm": 0.4466346204280853, "learning_rate": 9.219864498595705e-05, "loss": 0.4922, "step": 91820 }, { "epoch": 2.043936965811966, "grad_norm": 0.3737199306488037, "learning_rate": 9.215937794055058e-05, "loss": 0.4456, "step": 91830 }, { "epoch": 2.044159544159544, "grad_norm": 0.6909509897232056, "learning_rate": 9.21201167552918e-05, "loss": 0.4351, "step": 91840 }, { "epoch": 2.0443821225071224, "grad_norm": 0.42793193459510803, "learning_rate": 9.208086143231418e-05, "loss": 0.3917, "step": 91850 }, { "epoch": 2.044604700854701, "grad_norm": 0.5402208566665649, "learning_rate": 9.204161197375098e-05, "loss": 0.3512, "step": 91860 }, { "epoch": 2.0448272792022792, "grad_norm": 0.7035687565803528, "learning_rate": 9.200236838173497e-05, "loss": 0.4324, "step": 91870 }, { "epoch": 2.0450498575498575, "grad_norm": 0.6524356007575989, "learning_rate": 9.196313065839861e-05, "loss": 0.4678, "step": 91880 }, { "epoch": 2.0452724358974357, "grad_norm": 0.573111891746521, "learning_rate": 9.192389880587415e-05, "loss": 0.4819, "step": 91890 }, { "epoch": 2.0454950142450143, "grad_norm": 0.4412767291069031, "learning_rate": 9.188467282629352e-05, "loss": 0.392, "step": 91900 }, { "epoch": 2.0457175925925926, "grad_norm": 0.2940179705619812, "learning_rate": 9.184545272178827e-05, "loss": 0.5119, "step": 91910 }, { "epoch": 2.0459401709401708, "grad_norm": 1.0994153022766113, "learning_rate": 9.180623849448964e-05, "loss": 0.4068, "step": 91920 }, { "epoch": 2.0461627492877494, "grad_norm": 0.6845361590385437, "learning_rate": 9.176703014652862e-05, "loss": 0.4443, "step": 91930 }, { "epoch": 2.0463853276353277, "grad_norm": 0.5533000230789185, "learning_rate": 9.172782768003582e-05, "loss": 0.6033, "step": 91940 }, { "epoch": 2.046607905982906, "grad_norm": 0.4258192479610443, "learning_rate": 9.168863109714143e-05, "loss": 0.3596, "step": 91950 }, { "epoch": 2.0468304843304845, "grad_norm": 0.8160050511360168, "learning_rate": 9.164944039997551e-05, "loss": 0.5137, "step": 91960 }, { "epoch": 2.0470530626780628, "grad_norm": 0.45495203137397766, "learning_rate": 9.161025559066769e-05, "loss": 0.512, "step": 91970 }, { "epoch": 2.047275641025641, "grad_norm": 0.5231293439865112, "learning_rate": 9.157107667134733e-05, "loss": 0.4806, "step": 91980 }, { "epoch": 2.047498219373219, "grad_norm": 0.7062931060791016, "learning_rate": 9.153190364414341e-05, "loss": 0.4642, "step": 91990 }, { "epoch": 2.047720797720798, "grad_norm": 0.7344730496406555, "learning_rate": 9.149273651118473e-05, "loss": 0.5388, "step": 92000 }, { "epoch": 2.047943376068376, "grad_norm": 0.5181725025177002, "learning_rate": 9.145357527459957e-05, "loss": 0.4403, "step": 92010 }, { "epoch": 2.0481659544159543, "grad_norm": 0.5806347727775574, "learning_rate": 9.141441993651592e-05, "loss": 0.437, "step": 92020 }, { "epoch": 2.048388532763533, "grad_norm": 0.9183672070503235, "learning_rate": 9.137527049906156e-05, "loss": 0.5308, "step": 92030 }, { "epoch": 2.048611111111111, "grad_norm": 0.6128705143928528, "learning_rate": 9.133612696436396e-05, "loss": 0.4582, "step": 92040 }, { "epoch": 2.0488336894586894, "grad_norm": 0.5914685726165771, "learning_rate": 9.129698933455016e-05, "loss": 0.3601, "step": 92050 }, { "epoch": 2.0490562678062676, "grad_norm": 0.5341922640800476, "learning_rate": 9.125785761174694e-05, "loss": 0.5504, "step": 92060 }, { "epoch": 2.0492788461538463, "grad_norm": 0.6216781139373779, "learning_rate": 9.121873179808076e-05, "loss": 0.4186, "step": 92070 }, { "epoch": 2.0495014245014245, "grad_norm": 0.4968711733818054, "learning_rate": 9.117961189567783e-05, "loss": 0.3992, "step": 92080 }, { "epoch": 2.0497240028490027, "grad_norm": 0.634192943572998, "learning_rate": 9.114049790666379e-05, "loss": 0.4502, "step": 92090 }, { "epoch": 2.0499465811965814, "grad_norm": 0.6684777736663818, "learning_rate": 9.110138983316422e-05, "loss": 0.5756, "step": 92100 }, { "epoch": 2.0501691595441596, "grad_norm": 0.29836568236351013, "learning_rate": 9.106228767730426e-05, "loss": 0.4609, "step": 92110 }, { "epoch": 2.050391737891738, "grad_norm": 0.7051525115966797, "learning_rate": 9.102319144120879e-05, "loss": 0.383, "step": 92120 }, { "epoch": 2.0506143162393164, "grad_norm": 0.7818118333816528, "learning_rate": 9.09841011270023e-05, "loss": 0.5358, "step": 92130 }, { "epoch": 2.0508368945868947, "grad_norm": 0.6384372711181641, "learning_rate": 9.094501673680909e-05, "loss": 0.3883, "step": 92140 }, { "epoch": 2.051059472934473, "grad_norm": 0.4258003830909729, "learning_rate": 9.090593827275291e-05, "loss": 0.4494, "step": 92150 }, { "epoch": 2.051282051282051, "grad_norm": 0.5207109451293945, "learning_rate": 9.086686573695731e-05, "loss": 0.4022, "step": 92160 }, { "epoch": 2.0515046296296298, "grad_norm": 0.3599660098552704, "learning_rate": 9.082779913154555e-05, "loss": 0.3917, "step": 92170 }, { "epoch": 2.051727207977208, "grad_norm": 0.5726381540298462, "learning_rate": 9.078873845864055e-05, "loss": 0.4969, "step": 92180 }, { "epoch": 2.051949786324786, "grad_norm": 0.5060173273086548, "learning_rate": 9.074968372036492e-05, "loss": 0.5765, "step": 92190 }, { "epoch": 2.052172364672365, "grad_norm": 0.6169702410697937, "learning_rate": 9.071063491884094e-05, "loss": 0.4987, "step": 92200 }, { "epoch": 2.052394943019943, "grad_norm": 0.5102764368057251, "learning_rate": 9.067159205619049e-05, "loss": 0.3631, "step": 92210 }, { "epoch": 2.0526175213675213, "grad_norm": 0.4469665288925171, "learning_rate": 9.06325551345353e-05, "loss": 0.457, "step": 92220 }, { "epoch": 2.0528400997150995, "grad_norm": 0.5031200051307678, "learning_rate": 9.059352415599654e-05, "loss": 0.4391, "step": 92230 }, { "epoch": 2.053062678062678, "grad_norm": 0.5435706377029419, "learning_rate": 9.055449912269523e-05, "loss": 0.3567, "step": 92240 }, { "epoch": 2.0532852564102564, "grad_norm": 0.5851424932479858, "learning_rate": 9.051548003675203e-05, "loss": 0.3947, "step": 92250 }, { "epoch": 2.0535078347578346, "grad_norm": 0.5932340621948242, "learning_rate": 9.04764669002873e-05, "loss": 0.4843, "step": 92260 }, { "epoch": 2.0537304131054133, "grad_norm": 0.6475874781608582, "learning_rate": 9.043745971542107e-05, "loss": 0.3962, "step": 92270 }, { "epoch": 2.0539529914529915, "grad_norm": 0.5898043513298035, "learning_rate": 9.039845848427291e-05, "loss": 0.3851, "step": 92280 }, { "epoch": 2.0541755698005697, "grad_norm": 0.6884317398071289, "learning_rate": 9.03594632089623e-05, "loss": 0.4134, "step": 92290 }, { "epoch": 2.0543981481481484, "grad_norm": 0.4127469062805176, "learning_rate": 9.032047389160814e-05, "loss": 0.4321, "step": 92300 }, { "epoch": 2.0546207264957266, "grad_norm": 0.5552111268043518, "learning_rate": 9.028149053432923e-05, "loss": 0.5608, "step": 92310 }, { "epoch": 2.054843304843305, "grad_norm": 0.6372430920600891, "learning_rate": 9.024251313924394e-05, "loss": 0.56, "step": 92320 }, { "epoch": 2.055065883190883, "grad_norm": 0.4178203046321869, "learning_rate": 9.020354170847033e-05, "loss": 0.4222, "step": 92330 }, { "epoch": 2.0552884615384617, "grad_norm": 0.6235283613204956, "learning_rate": 9.016457624412616e-05, "loss": 0.542, "step": 92340 }, { "epoch": 2.05551103988604, "grad_norm": 0.5176951885223389, "learning_rate": 9.012561674832882e-05, "loss": 0.5165, "step": 92350 }, { "epoch": 2.055733618233618, "grad_norm": 0.7084895968437195, "learning_rate": 9.00866632231955e-05, "loss": 0.5354, "step": 92360 }, { "epoch": 2.0559561965811968, "grad_norm": 0.5494484305381775, "learning_rate": 9.00477156708428e-05, "loss": 0.4844, "step": 92370 }, { "epoch": 2.056178774928775, "grad_norm": 0.6145066618919373, "learning_rate": 9.000877409338723e-05, "loss": 0.4319, "step": 92380 }, { "epoch": 2.056401353276353, "grad_norm": 0.5414249897003174, "learning_rate": 8.996983849294494e-05, "loss": 0.4177, "step": 92390 }, { "epoch": 2.0566239316239314, "grad_norm": 0.5711957812309265, "learning_rate": 8.993090887163176e-05, "loss": 0.4718, "step": 92400 }, { "epoch": 2.05684650997151, "grad_norm": 0.5843347907066345, "learning_rate": 8.989198523156301e-05, "loss": 0.5651, "step": 92410 }, { "epoch": 2.0570690883190883, "grad_norm": 0.5972548723220825, "learning_rate": 8.985306757485394e-05, "loss": 0.4103, "step": 92420 }, { "epoch": 2.0572916666666665, "grad_norm": 0.6618757247924805, "learning_rate": 8.981415590361943e-05, "loss": 0.5238, "step": 92430 }, { "epoch": 2.057514245014245, "grad_norm": 0.577944278717041, "learning_rate": 8.977525021997381e-05, "loss": 0.3938, "step": 92440 }, { "epoch": 2.0577368233618234, "grad_norm": 0.56364905834198, "learning_rate": 8.973635052603133e-05, "loss": 0.4253, "step": 92450 }, { "epoch": 2.0579594017094016, "grad_norm": 0.6828592419624329, "learning_rate": 8.969745682390583e-05, "loss": 0.3796, "step": 92460 }, { "epoch": 2.05818198005698, "grad_norm": 0.5902596712112427, "learning_rate": 8.96585691157108e-05, "loss": 0.4654, "step": 92470 }, { "epoch": 2.0584045584045585, "grad_norm": 0.7088204026222229, "learning_rate": 8.961968740355949e-05, "loss": 0.3455, "step": 92480 }, { "epoch": 2.0586271367521367, "grad_norm": 0.44851240515708923, "learning_rate": 8.958081168956478e-05, "loss": 0.5068, "step": 92490 }, { "epoch": 2.058849715099715, "grad_norm": 0.6315305829048157, "learning_rate": 8.954194197583908e-05, "loss": 0.4933, "step": 92500 }, { "epoch": 2.0590722934472936, "grad_norm": 0.5136936902999878, "learning_rate": 8.950307826449468e-05, "loss": 0.4816, "step": 92510 }, { "epoch": 2.059294871794872, "grad_norm": 0.42075827717781067, "learning_rate": 8.946422055764347e-05, "loss": 0.3074, "step": 92520 }, { "epoch": 2.05951745014245, "grad_norm": 0.64700847864151, "learning_rate": 8.942536885739708e-05, "loss": 0.498, "step": 92530 }, { "epoch": 2.0597400284900287, "grad_norm": 0.4763801097869873, "learning_rate": 8.93865231658666e-05, "loss": 0.4241, "step": 92540 }, { "epoch": 2.059962606837607, "grad_norm": 0.5297418236732483, "learning_rate": 8.934768348516298e-05, "loss": 0.5568, "step": 92550 }, { "epoch": 2.060185185185185, "grad_norm": 0.4956209063529968, "learning_rate": 8.930884981739684e-05, "loss": 0.4939, "step": 92560 }, { "epoch": 2.0604077635327633, "grad_norm": 0.5499415397644043, "learning_rate": 8.927002216467848e-05, "loss": 0.5963, "step": 92570 }, { "epoch": 2.060630341880342, "grad_norm": 0.696087658405304, "learning_rate": 8.923120052911771e-05, "loss": 0.5168, "step": 92580 }, { "epoch": 2.06085292022792, "grad_norm": 0.5364202857017517, "learning_rate": 8.919238491282416e-05, "loss": 0.4864, "step": 92590 }, { "epoch": 2.0610754985754984, "grad_norm": 0.49946531653404236, "learning_rate": 8.915357531790713e-05, "loss": 0.541, "step": 92600 }, { "epoch": 2.061298076923077, "grad_norm": 0.7604655623435974, "learning_rate": 8.911477174647558e-05, "loss": 0.3397, "step": 92610 }, { "epoch": 2.0615206552706553, "grad_norm": 0.49020248651504517, "learning_rate": 8.907597420063808e-05, "loss": 0.449, "step": 92620 }, { "epoch": 2.0617432336182335, "grad_norm": 0.7224427461624146, "learning_rate": 8.903718268250304e-05, "loss": 0.4455, "step": 92630 }, { "epoch": 2.0619658119658117, "grad_norm": 0.5862017273902893, "learning_rate": 8.899839719417827e-05, "loss": 0.3779, "step": 92640 }, { "epoch": 2.0621883903133904, "grad_norm": 0.42739906907081604, "learning_rate": 8.895961773777144e-05, "loss": 0.4847, "step": 92650 }, { "epoch": 2.0624109686609686, "grad_norm": 0.4330075979232788, "learning_rate": 8.892084431538996e-05, "loss": 0.4962, "step": 92660 }, { "epoch": 2.062633547008547, "grad_norm": 0.6505460143089294, "learning_rate": 8.888207692914065e-05, "loss": 0.5212, "step": 92670 }, { "epoch": 2.0628561253561255, "grad_norm": 0.5942981243133545, "learning_rate": 8.884331558113028e-05, "loss": 0.4443, "step": 92680 }, { "epoch": 2.0630787037037037, "grad_norm": 0.4336341321468353, "learning_rate": 8.880456027346511e-05, "loss": 0.3539, "step": 92690 }, { "epoch": 2.063301282051282, "grad_norm": 0.38467174768447876, "learning_rate": 8.876581100825119e-05, "loss": 0.4438, "step": 92700 }, { "epoch": 2.0635238603988606, "grad_norm": 0.4787411093711853, "learning_rate": 8.872706778759422e-05, "loss": 0.4363, "step": 92710 }, { "epoch": 2.063746438746439, "grad_norm": 0.3388282358646393, "learning_rate": 8.86883306135994e-05, "loss": 0.3916, "step": 92720 }, { "epoch": 2.063969017094017, "grad_norm": 0.43503648042678833, "learning_rate": 8.864959948837181e-05, "loss": 0.4058, "step": 92730 }, { "epoch": 2.0641915954415953, "grad_norm": 0.5398057103157043, "learning_rate": 8.861087441401616e-05, "loss": 0.491, "step": 92740 }, { "epoch": 2.064414173789174, "grad_norm": 0.42972442507743835, "learning_rate": 8.857215539263677e-05, "loss": 0.3559, "step": 92750 }, { "epoch": 2.064636752136752, "grad_norm": 0.6799263954162598, "learning_rate": 8.853344242633767e-05, "loss": 0.4094, "step": 92760 }, { "epoch": 2.0648593304843303, "grad_norm": 0.4752724766731262, "learning_rate": 8.849473551722265e-05, "loss": 0.4443, "step": 92770 }, { "epoch": 2.065081908831909, "grad_norm": 0.5786859393119812, "learning_rate": 8.84560346673949e-05, "loss": 0.4233, "step": 92780 }, { "epoch": 2.0653044871794872, "grad_norm": 0.5894894003868103, "learning_rate": 8.841733987895761e-05, "loss": 0.4082, "step": 92790 }, { "epoch": 2.0655270655270654, "grad_norm": 0.584894061088562, "learning_rate": 8.837865115401336e-05, "loss": 0.5259, "step": 92800 }, { "epoch": 2.0657496438746437, "grad_norm": 0.5054792761802673, "learning_rate": 8.833996849466458e-05, "loss": 0.5733, "step": 92810 }, { "epoch": 2.0659722222222223, "grad_norm": 0.7498967051506042, "learning_rate": 8.830129190301331e-05, "loss": 0.4991, "step": 92820 }, { "epoch": 2.0661948005698005, "grad_norm": 0.492639422416687, "learning_rate": 8.826262138116128e-05, "loss": 0.5089, "step": 92830 }, { "epoch": 2.0664173789173788, "grad_norm": 0.5578546524047852, "learning_rate": 8.822395693120989e-05, "loss": 0.4406, "step": 92840 }, { "epoch": 2.0666399572649574, "grad_norm": 0.767248809337616, "learning_rate": 8.818529855526024e-05, "loss": 0.4079, "step": 92850 }, { "epoch": 2.0668625356125356, "grad_norm": 0.6224132776260376, "learning_rate": 8.814664625541293e-05, "loss": 0.4362, "step": 92860 }, { "epoch": 2.067085113960114, "grad_norm": 0.5436907410621643, "learning_rate": 8.810800003376843e-05, "loss": 0.4614, "step": 92870 }, { "epoch": 2.0673076923076925, "grad_norm": 0.6860635280609131, "learning_rate": 8.806935989242681e-05, "loss": 0.5936, "step": 92880 }, { "epoch": 2.0675302706552707, "grad_norm": 0.7654950022697449, "learning_rate": 8.803072583348782e-05, "loss": 0.5193, "step": 92890 }, { "epoch": 2.067752849002849, "grad_norm": 0.4014761447906494, "learning_rate": 8.799209785905083e-05, "loss": 0.5032, "step": 92900 }, { "epoch": 2.067975427350427, "grad_norm": 0.799392819404602, "learning_rate": 8.795347597121501e-05, "loss": 0.4786, "step": 92910 }, { "epoch": 2.068198005698006, "grad_norm": 0.5604988932609558, "learning_rate": 8.791486017207898e-05, "loss": 0.4582, "step": 92920 }, { "epoch": 2.068420584045584, "grad_norm": 0.7473601698875427, "learning_rate": 8.787625046374126e-05, "loss": 0.4344, "step": 92930 }, { "epoch": 2.0686431623931623, "grad_norm": 0.5701202750205994, "learning_rate": 8.783764684829981e-05, "loss": 0.4618, "step": 92940 }, { "epoch": 2.068865740740741, "grad_norm": 0.5223069787025452, "learning_rate": 8.779904932785246e-05, "loss": 0.5245, "step": 92950 }, { "epoch": 2.069088319088319, "grad_norm": 0.4085731506347656, "learning_rate": 8.77604579044966e-05, "loss": 0.4249, "step": 92960 }, { "epoch": 2.0693108974358974, "grad_norm": 0.4149361550807953, "learning_rate": 8.772187258032936e-05, "loss": 0.4093, "step": 92970 }, { "epoch": 2.0695334757834756, "grad_norm": 0.7283080220222473, "learning_rate": 8.768329335744747e-05, "loss": 0.531, "step": 92980 }, { "epoch": 2.0697560541310542, "grad_norm": 0.44233766198158264, "learning_rate": 8.764472023794742e-05, "loss": 0.4348, "step": 92990 }, { "epoch": 2.0699786324786325, "grad_norm": 0.42627280950546265, "learning_rate": 8.760615322392521e-05, "loss": 0.5617, "step": 93000 }, { "epoch": 2.0702012108262107, "grad_norm": 0.534911036491394, "learning_rate": 8.756759231747664e-05, "loss": 0.5039, "step": 93010 }, { "epoch": 2.0704237891737893, "grad_norm": 0.29487407207489014, "learning_rate": 8.752903752069712e-05, "loss": 0.4299, "step": 93020 }, { "epoch": 2.0706463675213675, "grad_norm": 0.2687482237815857, "learning_rate": 8.74904888356818e-05, "loss": 0.4787, "step": 93030 }, { "epoch": 2.0708689458689458, "grad_norm": 0.5471023917198181, "learning_rate": 8.745194626452542e-05, "loss": 0.4017, "step": 93040 }, { "epoch": 2.0710915242165244, "grad_norm": 0.6511717438697815, "learning_rate": 8.741340980932246e-05, "loss": 0.5364, "step": 93050 }, { "epoch": 2.0713141025641026, "grad_norm": 0.36137768626213074, "learning_rate": 8.737487947216693e-05, "loss": 0.453, "step": 93060 }, { "epoch": 2.071536680911681, "grad_norm": 0.5700806379318237, "learning_rate": 8.733635525515273e-05, "loss": 0.4989, "step": 93070 }, { "epoch": 2.071759259259259, "grad_norm": 0.823119580745697, "learning_rate": 8.729783716037312e-05, "loss": 0.4331, "step": 93080 }, { "epoch": 2.0719818376068377, "grad_norm": 0.4080774188041687, "learning_rate": 8.725932518992132e-05, "loss": 0.3784, "step": 93090 }, { "epoch": 2.072204415954416, "grad_norm": 0.38573476672172546, "learning_rate": 8.722081934589008e-05, "loss": 0.3619, "step": 93100 }, { "epoch": 2.072426994301994, "grad_norm": 0.4866933524608612, "learning_rate": 8.718231963037185e-05, "loss": 0.5084, "step": 93110 }, { "epoch": 2.072649572649573, "grad_norm": 0.6777507066726685, "learning_rate": 8.71438260454587e-05, "loss": 0.4189, "step": 93120 }, { "epoch": 2.072872150997151, "grad_norm": 0.5231661796569824, "learning_rate": 8.710533859324253e-05, "loss": 0.4679, "step": 93130 }, { "epoch": 2.0730947293447293, "grad_norm": 0.5415060520172119, "learning_rate": 8.706685727581458e-05, "loss": 0.4766, "step": 93140 }, { "epoch": 2.0733173076923075, "grad_norm": 0.3524067997932434, "learning_rate": 8.702838209526609e-05, "loss": 0.4928, "step": 93150 }, { "epoch": 2.073539886039886, "grad_norm": 0.2378246933221817, "learning_rate": 8.698991305368778e-05, "loss": 0.3772, "step": 93160 }, { "epoch": 2.0737624643874644, "grad_norm": 0.6241315603256226, "learning_rate": 8.69514501531701e-05, "loss": 0.4476, "step": 93170 }, { "epoch": 2.0739850427350426, "grad_norm": 0.32090499997138977, "learning_rate": 8.691299339580318e-05, "loss": 0.4351, "step": 93180 }, { "epoch": 2.0742076210826212, "grad_norm": 0.696263313293457, "learning_rate": 8.687454278367686e-05, "loss": 0.5288, "step": 93190 }, { "epoch": 2.0744301994301995, "grad_norm": 0.7352275252342224, "learning_rate": 8.68360983188804e-05, "loss": 0.4412, "step": 93200 }, { "epoch": 2.0746527777777777, "grad_norm": 0.3914749324321747, "learning_rate": 8.679766000350309e-05, "loss": 0.4483, "step": 93210 }, { "epoch": 2.0748753561253563, "grad_norm": 0.4394104480743408, "learning_rate": 8.67592278396335e-05, "loss": 0.4899, "step": 93220 }, { "epoch": 2.0750979344729346, "grad_norm": 0.3059239089488983, "learning_rate": 8.67208018293602e-05, "loss": 0.4111, "step": 93230 }, { "epoch": 2.0753205128205128, "grad_norm": 0.5648074150085449, "learning_rate": 8.668238197477128e-05, "loss": 0.5929, "step": 93240 }, { "epoch": 2.075543091168091, "grad_norm": 0.5991377830505371, "learning_rate": 8.664396827795444e-05, "loss": 0.4856, "step": 93250 }, { "epoch": 2.0757656695156697, "grad_norm": 0.5345048308372498, "learning_rate": 8.660556074099722e-05, "loss": 0.4525, "step": 93260 }, { "epoch": 2.075988247863248, "grad_norm": 0.3742232322692871, "learning_rate": 8.656715936598669e-05, "loss": 0.4495, "step": 93270 }, { "epoch": 2.076210826210826, "grad_norm": 0.5081925988197327, "learning_rate": 8.652876415500953e-05, "loss": 0.4513, "step": 93280 }, { "epoch": 2.0764334045584047, "grad_norm": 0.3067689538002014, "learning_rate": 8.649037511015219e-05, "loss": 0.4551, "step": 93290 }, { "epoch": 2.076655982905983, "grad_norm": 0.3174913227558136, "learning_rate": 8.64519922335008e-05, "loss": 0.473, "step": 93300 }, { "epoch": 2.076878561253561, "grad_norm": 0.3483714163303375, "learning_rate": 8.641361552714111e-05, "loss": 0.4364, "step": 93310 }, { "epoch": 2.0771011396011394, "grad_norm": 0.42381465435028076, "learning_rate": 8.637524499315864e-05, "loss": 0.3395, "step": 93320 }, { "epoch": 2.077323717948718, "grad_norm": 0.4798165261745453, "learning_rate": 8.633688063363824e-05, "loss": 0.4197, "step": 93330 }, { "epoch": 2.0775462962962963, "grad_norm": 0.6045119166374207, "learning_rate": 8.629852245066485e-05, "loss": 0.4756, "step": 93340 }, { "epoch": 2.0777688746438745, "grad_norm": 0.5622422099113464, "learning_rate": 8.626017044632289e-05, "loss": 0.4412, "step": 93350 }, { "epoch": 2.077991452991453, "grad_norm": 0.5713974833488464, "learning_rate": 8.62218246226963e-05, "loss": 0.3372, "step": 93360 }, { "epoch": 2.0782140313390314, "grad_norm": 0.32925736904144287, "learning_rate": 8.61834849818689e-05, "loss": 0.4533, "step": 93370 }, { "epoch": 2.0784366096866096, "grad_norm": 0.5514950752258301, "learning_rate": 8.61451515259241e-05, "loss": 0.3857, "step": 93380 }, { "epoch": 2.0786591880341883, "grad_norm": 0.5712468028068542, "learning_rate": 8.610682425694498e-05, "loss": 0.457, "step": 93390 }, { "epoch": 2.0788817663817665, "grad_norm": 0.5968846082687378, "learning_rate": 8.606850317701427e-05, "loss": 0.4946, "step": 93400 }, { "epoch": 2.0791043447293447, "grad_norm": 0.31459808349609375, "learning_rate": 8.603018828821443e-05, "loss": 0.4765, "step": 93410 }, { "epoch": 2.079326923076923, "grad_norm": 0.6099420189857483, "learning_rate": 8.599187959262738e-05, "loss": 0.5371, "step": 93420 }, { "epoch": 2.0795495014245016, "grad_norm": 0.38431429862976074, "learning_rate": 8.595357709233493e-05, "loss": 0.4074, "step": 93430 }, { "epoch": 2.07977207977208, "grad_norm": 0.48925134539604187, "learning_rate": 8.591528078941846e-05, "loss": 0.4671, "step": 93440 }, { "epoch": 2.079994658119658, "grad_norm": 0.4454767405986786, "learning_rate": 8.587699068595912e-05, "loss": 0.4699, "step": 93450 }, { "epoch": 2.0802172364672367, "grad_norm": 0.5479843616485596, "learning_rate": 8.583870678403745e-05, "loss": 0.4729, "step": 93460 }, { "epoch": 2.080439814814815, "grad_norm": 0.5257598757743835, "learning_rate": 8.580042908573389e-05, "loss": 0.3654, "step": 93470 }, { "epoch": 2.080662393162393, "grad_norm": 0.7161563634872437, "learning_rate": 8.57621575931286e-05, "loss": 0.4546, "step": 93480 }, { "epoch": 2.0808849715099713, "grad_norm": 0.6287294626235962, "learning_rate": 8.572389230830109e-05, "loss": 0.4629, "step": 93490 }, { "epoch": 2.08110754985755, "grad_norm": 0.6539040207862854, "learning_rate": 8.568563323333083e-05, "loss": 0.5137, "step": 93500 }, { "epoch": 2.081330128205128, "grad_norm": 0.6592426896095276, "learning_rate": 8.564738037029685e-05, "loss": 0.4433, "step": 93510 }, { "epoch": 2.0815527065527064, "grad_norm": 0.5749611258506775, "learning_rate": 8.560913372127784e-05, "loss": 0.385, "step": 93520 }, { "epoch": 2.081775284900285, "grad_norm": 0.8078531622886658, "learning_rate": 8.557089328835212e-05, "loss": 0.5508, "step": 93530 }, { "epoch": 2.0819978632478633, "grad_norm": 0.5725900530815125, "learning_rate": 8.553265907359777e-05, "loss": 0.4822, "step": 93540 }, { "epoch": 2.0822204415954415, "grad_norm": 0.6212359070777893, "learning_rate": 8.54944310790925e-05, "loss": 0.3839, "step": 93550 }, { "epoch": 2.08244301994302, "grad_norm": 0.7050454020500183, "learning_rate": 8.545620930691349e-05, "loss": 0.5479, "step": 93560 }, { "epoch": 2.0826655982905984, "grad_norm": 0.5940646529197693, "learning_rate": 8.541799375913783e-05, "loss": 0.492, "step": 93570 }, { "epoch": 2.0828881766381766, "grad_norm": 0.5924594402313232, "learning_rate": 8.537978443784229e-05, "loss": 0.494, "step": 93580 }, { "epoch": 2.083110754985755, "grad_norm": 0.748340904712677, "learning_rate": 8.534158134510302e-05, "loss": 0.5395, "step": 93590 }, { "epoch": 2.0833333333333335, "grad_norm": 0.7004445195198059, "learning_rate": 8.530338448299607e-05, "loss": 0.5362, "step": 93600 }, { "epoch": 2.0835559116809117, "grad_norm": 0.6023527979850769, "learning_rate": 8.52651938535971e-05, "loss": 0.4703, "step": 93610 }, { "epoch": 2.08377849002849, "grad_norm": 0.4019067585468292, "learning_rate": 8.522700945898152e-05, "loss": 0.4851, "step": 93620 }, { "epoch": 2.0840010683760686, "grad_norm": 0.4958648085594177, "learning_rate": 8.51888313012241e-05, "loss": 0.4182, "step": 93630 }, { "epoch": 2.084223646723647, "grad_norm": 0.7258164286613464, "learning_rate": 8.515065938239959e-05, "loss": 0.4581, "step": 93640 }, { "epoch": 2.084446225071225, "grad_norm": 0.48704367876052856, "learning_rate": 8.511249370458227e-05, "loss": 0.4772, "step": 93650 }, { "epoch": 2.0846688034188032, "grad_norm": 0.6313557028770447, "learning_rate": 8.50743342698461e-05, "loss": 0.4946, "step": 93660 }, { "epoch": 2.084891381766382, "grad_norm": 0.5559951066970825, "learning_rate": 8.503618108026468e-05, "loss": 0.5083, "step": 93670 }, { "epoch": 2.08511396011396, "grad_norm": 0.6134077310562134, "learning_rate": 8.49980341379113e-05, "loss": 0.3626, "step": 93680 }, { "epoch": 2.0853365384615383, "grad_norm": 0.558841347694397, "learning_rate": 8.495989344485895e-05, "loss": 0.4417, "step": 93690 }, { "epoch": 2.085559116809117, "grad_norm": 0.6272107362747192, "learning_rate": 8.492175900318011e-05, "loss": 0.5813, "step": 93700 }, { "epoch": 2.085781695156695, "grad_norm": 0.6320427060127258, "learning_rate": 8.488363081494715e-05, "loss": 0.4615, "step": 93710 }, { "epoch": 2.0860042735042734, "grad_norm": 0.5066922903060913, "learning_rate": 8.484550888223186e-05, "loss": 0.5195, "step": 93720 }, { "epoch": 2.0862268518518516, "grad_norm": 0.6472083330154419, "learning_rate": 8.480739320710592e-05, "loss": 0.5324, "step": 93730 }, { "epoch": 2.0864494301994303, "grad_norm": 0.5297220945358276, "learning_rate": 8.476928379164048e-05, "loss": 0.4975, "step": 93740 }, { "epoch": 2.0866720085470085, "grad_norm": 0.5604262948036194, "learning_rate": 8.473118063790653e-05, "loss": 0.4071, "step": 93750 }, { "epoch": 2.0868945868945867, "grad_norm": 0.5125129222869873, "learning_rate": 8.469308374797464e-05, "loss": 0.4617, "step": 93760 }, { "epoch": 2.0871171652421654, "grad_norm": 0.3355007767677307, "learning_rate": 8.465499312391491e-05, "loss": 0.4668, "step": 93770 }, { "epoch": 2.0873397435897436, "grad_norm": 0.6570688486099243, "learning_rate": 8.461690876779729e-05, "loss": 0.3485, "step": 93780 }, { "epoch": 2.087562321937322, "grad_norm": 0.44765180349349976, "learning_rate": 8.457883068169128e-05, "loss": 0.4467, "step": 93790 }, { "epoch": 2.0877849002849005, "grad_norm": 0.4833996295928955, "learning_rate": 8.454075886766612e-05, "loss": 0.3855, "step": 93800 }, { "epoch": 2.0880074786324787, "grad_norm": 0.8714666962623596, "learning_rate": 8.450269332779065e-05, "loss": 0.4644, "step": 93810 }, { "epoch": 2.088230056980057, "grad_norm": 0.5705962777137756, "learning_rate": 8.446463406413335e-05, "loss": 0.4754, "step": 93820 }, { "epoch": 2.088452635327635, "grad_norm": 0.6698535680770874, "learning_rate": 8.44265810787625e-05, "loss": 0.4284, "step": 93830 }, { "epoch": 2.088675213675214, "grad_norm": 0.3349551856517792, "learning_rate": 8.438853437374583e-05, "loss": 0.4488, "step": 93840 }, { "epoch": 2.088897792022792, "grad_norm": 0.44186100363731384, "learning_rate": 8.43504939511508e-05, "loss": 0.3784, "step": 93850 }, { "epoch": 2.0891203703703702, "grad_norm": 0.8291239142417908, "learning_rate": 8.431245981304459e-05, "loss": 0.4066, "step": 93860 }, { "epoch": 2.089342948717949, "grad_norm": 0.5814853310585022, "learning_rate": 8.4274431961494e-05, "loss": 0.4305, "step": 93870 }, { "epoch": 2.089565527065527, "grad_norm": 0.670498251914978, "learning_rate": 8.423641039856555e-05, "loss": 0.3973, "step": 93880 }, { "epoch": 2.0897881054131053, "grad_norm": 0.833396852016449, "learning_rate": 8.419839512632532e-05, "loss": 0.3523, "step": 93890 }, { "epoch": 2.0900106837606836, "grad_norm": 0.70555180311203, "learning_rate": 8.416038614683916e-05, "loss": 0.4674, "step": 93900 }, { "epoch": 2.090233262108262, "grad_norm": 0.8045915961265564, "learning_rate": 8.412238346217238e-05, "loss": 0.4466, "step": 93910 }, { "epoch": 2.0904558404558404, "grad_norm": 0.593986988067627, "learning_rate": 8.408438707439015e-05, "loss": 0.5011, "step": 93920 }, { "epoch": 2.0906784188034186, "grad_norm": 0.5474157929420471, "learning_rate": 8.404639698555721e-05, "loss": 0.5583, "step": 93930 }, { "epoch": 2.0909009971509973, "grad_norm": 0.7892537117004395, "learning_rate": 8.400841319773797e-05, "loss": 0.5914, "step": 93940 }, { "epoch": 2.0911235754985755, "grad_norm": 0.5392142534255981, "learning_rate": 8.397043571299654e-05, "loss": 0.5828, "step": 93950 }, { "epoch": 2.0913461538461537, "grad_norm": 0.7543501257896423, "learning_rate": 8.393246453339661e-05, "loss": 0.401, "step": 93960 }, { "epoch": 2.0915687321937324, "grad_norm": 0.7711642384529114, "learning_rate": 8.389449966100164e-05, "loss": 0.5597, "step": 93970 }, { "epoch": 2.0917913105413106, "grad_norm": 0.570942759513855, "learning_rate": 8.385654109787461e-05, "loss": 0.396, "step": 93980 }, { "epoch": 2.092013888888889, "grad_norm": 0.42397984862327576, "learning_rate": 8.381858884607816e-05, "loss": 0.4265, "step": 93990 }, { "epoch": 2.092236467236467, "grad_norm": 0.4155064821243286, "learning_rate": 8.378064290767469e-05, "loss": 0.4785, "step": 94000 }, { "epoch": 2.0924590455840457, "grad_norm": 0.53016597032547, "learning_rate": 8.374270328472622e-05, "loss": 0.4579, "step": 94010 }, { "epoch": 2.092681623931624, "grad_norm": 0.46546798944473267, "learning_rate": 8.370476997929442e-05, "loss": 0.395, "step": 94020 }, { "epoch": 2.092904202279202, "grad_norm": 1.021172285079956, "learning_rate": 8.366684299344063e-05, "loss": 0.4434, "step": 94030 }, { "epoch": 2.093126780626781, "grad_norm": 0.331666499376297, "learning_rate": 8.36289223292259e-05, "loss": 0.4571, "step": 94040 }, { "epoch": 2.093349358974359, "grad_norm": 0.3523954451084137, "learning_rate": 8.359100798871073e-05, "loss": 0.4707, "step": 94050 }, { "epoch": 2.0935719373219372, "grad_norm": 0.6340324282646179, "learning_rate": 8.355309997395548e-05, "loss": 0.464, "step": 94060 }, { "epoch": 2.0937945156695155, "grad_norm": 0.6942493915557861, "learning_rate": 8.35151982870201e-05, "loss": 0.5229, "step": 94070 }, { "epoch": 2.094017094017094, "grad_norm": 0.5725482106208801, "learning_rate": 8.347730292996421e-05, "loss": 0.3971, "step": 94080 }, { "epoch": 2.0942396723646723, "grad_norm": 0.4753534495830536, "learning_rate": 8.343941390484707e-05, "loss": 0.4699, "step": 94090 }, { "epoch": 2.0944622507122506, "grad_norm": 0.7122089862823486, "learning_rate": 8.340153121372767e-05, "loss": 0.4823, "step": 94100 }, { "epoch": 2.0946848290598292, "grad_norm": 0.8469312191009521, "learning_rate": 8.336365485866444e-05, "loss": 0.4615, "step": 94110 }, { "epoch": 2.0949074074074074, "grad_norm": 0.6163572669029236, "learning_rate": 8.332578484171575e-05, "loss": 0.4688, "step": 94120 }, { "epoch": 2.0951299857549857, "grad_norm": 0.5812152028083801, "learning_rate": 8.328792116493937e-05, "loss": 0.5371, "step": 94130 }, { "epoch": 2.095352564102564, "grad_norm": 0.46097496151924133, "learning_rate": 8.325006383039291e-05, "loss": 0.4029, "step": 94140 }, { "epoch": 2.0955751424501425, "grad_norm": 0.8459033966064453, "learning_rate": 8.321221284013354e-05, "loss": 0.4604, "step": 94150 }, { "epoch": 2.0957977207977208, "grad_norm": 0.5037481784820557, "learning_rate": 8.317436819621813e-05, "loss": 0.4163, "step": 94160 }, { "epoch": 2.096020299145299, "grad_norm": 0.4708966314792633, "learning_rate": 8.31365299007032e-05, "loss": 0.458, "step": 94170 }, { "epoch": 2.0962428774928776, "grad_norm": 0.457185834646225, "learning_rate": 8.309869795564495e-05, "loss": 0.5437, "step": 94180 }, { "epoch": 2.096465455840456, "grad_norm": 0.5290883183479309, "learning_rate": 8.306087236309912e-05, "loss": 0.5182, "step": 94190 }, { "epoch": 2.096688034188034, "grad_norm": 0.4177439212799072, "learning_rate": 8.30230531251212e-05, "loss": 0.3714, "step": 94200 }, { "epoch": 2.0969106125356127, "grad_norm": 0.5624401569366455, "learning_rate": 8.298524024376632e-05, "loss": 0.4405, "step": 94210 }, { "epoch": 2.097133190883191, "grad_norm": 0.330450177192688, "learning_rate": 8.294743372108928e-05, "loss": 0.4666, "step": 94220 }, { "epoch": 2.097355769230769, "grad_norm": 0.5528355240821838, "learning_rate": 8.290963355914453e-05, "loss": 0.5391, "step": 94230 }, { "epoch": 2.0975783475783474, "grad_norm": 0.47880885004997253, "learning_rate": 8.287183975998623e-05, "loss": 0.4698, "step": 94240 }, { "epoch": 2.097800925925926, "grad_norm": 0.7645977139472961, "learning_rate": 8.283405232566794e-05, "loss": 0.4942, "step": 94250 }, { "epoch": 2.0980235042735043, "grad_norm": 0.5995956063270569, "learning_rate": 8.279627125824326e-05, "loss": 0.4539, "step": 94260 }, { "epoch": 2.0982460826210825, "grad_norm": 0.48524004220962524, "learning_rate": 8.275849655976506e-05, "loss": 0.4036, "step": 94270 }, { "epoch": 2.098468660968661, "grad_norm": 0.5934380292892456, "learning_rate": 8.272072823228614e-05, "loss": 0.482, "step": 94280 }, { "epoch": 2.0986912393162394, "grad_norm": 0.7825711369514465, "learning_rate": 8.268296627785885e-05, "loss": 0.5104, "step": 94290 }, { "epoch": 2.0989138176638176, "grad_norm": 0.6093398332595825, "learning_rate": 8.264521069853523e-05, "loss": 0.3482, "step": 94300 }, { "epoch": 2.099136396011396, "grad_norm": 0.5677360892295837, "learning_rate": 8.260746149636691e-05, "loss": 0.4758, "step": 94310 }, { "epoch": 2.0993589743589745, "grad_norm": 0.4341944456100464, "learning_rate": 8.256971867340532e-05, "loss": 0.451, "step": 94320 }, { "epoch": 2.0995815527065527, "grad_norm": 0.7116082906723022, "learning_rate": 8.253198223170129e-05, "loss": 0.4402, "step": 94330 }, { "epoch": 2.099804131054131, "grad_norm": 0.40924692153930664, "learning_rate": 8.249425217330548e-05, "loss": 0.4372, "step": 94340 }, { "epoch": 2.1000267094017095, "grad_norm": 0.473922997713089, "learning_rate": 8.245652850026823e-05, "loss": 0.4199, "step": 94350 }, { "epoch": 2.1002492877492878, "grad_norm": 0.5568352341651917, "learning_rate": 8.241881121463943e-05, "loss": 0.409, "step": 94360 }, { "epoch": 2.1002492877492878, "eval_loss": 0.5367586016654968, "eval_runtime": 337.1532, "eval_samples_per_second": 7.015, "eval_steps_per_second": 7.015, "step": 94360 }, { "epoch": 2.100471866096866, "grad_norm": 0.5693296790122986, "learning_rate": 8.238110031846878e-05, "loss": 0.5006, "step": 94370 }, { "epoch": 2.1006944444444446, "grad_norm": 0.40285006165504456, "learning_rate": 8.234339581380532e-05, "loss": 0.3592, "step": 94380 }, { "epoch": 2.100917022792023, "grad_norm": 0.406491219997406, "learning_rate": 8.230569770269807e-05, "loss": 0.4356, "step": 94390 }, { "epoch": 2.101139601139601, "grad_norm": 0.4366461932659149, "learning_rate": 8.226800598719562e-05, "loss": 0.499, "step": 94400 }, { "epoch": 2.1013621794871793, "grad_norm": 0.5887917280197144, "learning_rate": 8.223032066934603e-05, "loss": 0.4306, "step": 94410 }, { "epoch": 2.101584757834758, "grad_norm": 0.5526008605957031, "learning_rate": 8.219264175119723e-05, "loss": 0.5057, "step": 94420 }, { "epoch": 2.101807336182336, "grad_norm": 0.43040841817855835, "learning_rate": 8.215496923479672e-05, "loss": 0.4997, "step": 94430 }, { "epoch": 2.1020299145299144, "grad_norm": 0.5127760171890259, "learning_rate": 8.211730312219165e-05, "loss": 0.5375, "step": 94440 }, { "epoch": 2.102252492877493, "grad_norm": 0.5958731770515442, "learning_rate": 8.207964341542884e-05, "loss": 0.3318, "step": 94450 }, { "epoch": 2.1024750712250713, "grad_norm": 0.6060073375701904, "learning_rate": 8.204199011655481e-05, "loss": 0.4845, "step": 94460 }, { "epoch": 2.1026976495726495, "grad_norm": 0.523822546005249, "learning_rate": 8.200434322761551e-05, "loss": 0.4344, "step": 94470 }, { "epoch": 2.1029202279202277, "grad_norm": 0.5422360897064209, "learning_rate": 8.196670275065683e-05, "loss": 0.3397, "step": 94480 }, { "epoch": 2.1031428062678064, "grad_norm": 0.4780917465686798, "learning_rate": 8.192906868772414e-05, "loss": 0.4053, "step": 94490 }, { "epoch": 2.1033653846153846, "grad_norm": 0.7546813488006592, "learning_rate": 8.189144104086257e-05, "loss": 0.4096, "step": 94500 }, { "epoch": 2.103587962962963, "grad_norm": 0.4494799077510834, "learning_rate": 8.185381981211673e-05, "loss": 0.4379, "step": 94510 }, { "epoch": 2.1038105413105415, "grad_norm": 0.5556170344352722, "learning_rate": 8.181620500353103e-05, "loss": 0.5039, "step": 94520 }, { "epoch": 2.1040331196581197, "grad_norm": 0.6032299399375916, "learning_rate": 8.177859661714949e-05, "loss": 0.4897, "step": 94530 }, { "epoch": 2.104255698005698, "grad_norm": 0.6110396981239319, "learning_rate": 8.174099465501588e-05, "loss": 0.5793, "step": 94540 }, { "epoch": 2.1044782763532766, "grad_norm": 0.8312065601348877, "learning_rate": 8.170339911917335e-05, "loss": 0.4654, "step": 94550 }, { "epoch": 2.1047008547008548, "grad_norm": 0.8147444725036621, "learning_rate": 8.166581001166496e-05, "loss": 0.4232, "step": 94560 }, { "epoch": 2.104923433048433, "grad_norm": 0.4960596263408661, "learning_rate": 8.16282273345333e-05, "loss": 0.4324, "step": 94570 }, { "epoch": 2.105146011396011, "grad_norm": 0.5012384653091431, "learning_rate": 8.159065108982071e-05, "loss": 0.4532, "step": 94580 }, { "epoch": 2.10536858974359, "grad_norm": 0.7035501599311829, "learning_rate": 8.155308127956905e-05, "loss": 0.5045, "step": 94590 }, { "epoch": 2.105591168091168, "grad_norm": 0.6068798899650574, "learning_rate": 8.151551790581999e-05, "loss": 0.5179, "step": 94600 }, { "epoch": 2.1058137464387463, "grad_norm": 0.6684674024581909, "learning_rate": 8.147796097061463e-05, "loss": 0.3775, "step": 94610 }, { "epoch": 2.106036324786325, "grad_norm": 0.6053679585456848, "learning_rate": 8.144041047599389e-05, "loss": 0.4893, "step": 94620 }, { "epoch": 2.106258903133903, "grad_norm": 0.602454662322998, "learning_rate": 8.140286642399837e-05, "loss": 0.3968, "step": 94630 }, { "epoch": 2.1064814814814814, "grad_norm": 0.54142165184021, "learning_rate": 8.13653288166681e-05, "loss": 0.5053, "step": 94640 }, { "epoch": 2.1067040598290596, "grad_norm": 0.4809923470020294, "learning_rate": 8.1327797656043e-05, "loss": 0.4167, "step": 94650 }, { "epoch": 2.1069266381766383, "grad_norm": 0.7400349378585815, "learning_rate": 8.12902729441625e-05, "loss": 0.4062, "step": 94660 }, { "epoch": 2.1071492165242165, "grad_norm": 0.5602110028266907, "learning_rate": 8.125275468306574e-05, "loss": 0.4817, "step": 94670 }, { "epoch": 2.1073717948717947, "grad_norm": 0.5356965661048889, "learning_rate": 8.121524287479161e-05, "loss": 0.4635, "step": 94680 }, { "epoch": 2.1075943732193734, "grad_norm": 0.42339104413986206, "learning_rate": 8.117773752137833e-05, "loss": 0.3797, "step": 94690 }, { "epoch": 2.1078169515669516, "grad_norm": 0.6352277994155884, "learning_rate": 8.114023862486406e-05, "loss": 0.4707, "step": 94700 }, { "epoch": 2.10803952991453, "grad_norm": 0.6159664392471313, "learning_rate": 8.110274618728654e-05, "loss": 0.4815, "step": 94710 }, { "epoch": 2.1082621082621085, "grad_norm": 0.5292482376098633, "learning_rate": 8.106526021068313e-05, "loss": 0.5653, "step": 94720 }, { "epoch": 2.1084846866096867, "grad_norm": 0.6296613216400146, "learning_rate": 8.102778069709083e-05, "loss": 0.4936, "step": 94730 }, { "epoch": 2.108707264957265, "grad_norm": 0.4425913691520691, "learning_rate": 8.09903076485464e-05, "loss": 0.5033, "step": 94740 }, { "epoch": 2.108929843304843, "grad_norm": 0.41408321261405945, "learning_rate": 8.0952841067086e-05, "loss": 0.4678, "step": 94750 }, { "epoch": 2.109152421652422, "grad_norm": 0.7556164860725403, "learning_rate": 8.091538095474576e-05, "loss": 0.4469, "step": 94760 }, { "epoch": 2.109375, "grad_norm": 0.6101809740066528, "learning_rate": 8.087792731356112e-05, "loss": 0.5185, "step": 94770 }, { "epoch": 2.109597578347578, "grad_norm": 0.5169164538383484, "learning_rate": 8.084048014556745e-05, "loss": 0.3463, "step": 94780 }, { "epoch": 2.109820156695157, "grad_norm": 0.6174333691596985, "learning_rate": 8.080303945279961e-05, "loss": 0.5064, "step": 94790 }, { "epoch": 2.110042735042735, "grad_norm": 0.5151993036270142, "learning_rate": 8.07656052372922e-05, "loss": 0.4483, "step": 94800 }, { "epoch": 2.1102653133903133, "grad_norm": 0.5747796893119812, "learning_rate": 8.072817750107942e-05, "loss": 0.4774, "step": 94810 }, { "epoch": 2.1104878917378915, "grad_norm": 0.3656141459941864, "learning_rate": 8.069075624619516e-05, "loss": 0.4848, "step": 94820 }, { "epoch": 2.11071047008547, "grad_norm": 0.4049152731895447, "learning_rate": 8.065334147467283e-05, "loss": 0.5153, "step": 94830 }, { "epoch": 2.1109330484330484, "grad_norm": 0.5611526370048523, "learning_rate": 8.061593318854562e-05, "loss": 0.3793, "step": 94840 }, { "epoch": 2.1111556267806266, "grad_norm": 0.4716905951499939, "learning_rate": 8.057853138984632e-05, "loss": 0.4409, "step": 94850 }, { "epoch": 2.1113782051282053, "grad_norm": 0.42111822962760925, "learning_rate": 8.054113608060738e-05, "loss": 0.3701, "step": 94860 }, { "epoch": 2.1116007834757835, "grad_norm": 0.6273981928825378, "learning_rate": 8.050374726286092e-05, "loss": 0.4781, "step": 94870 }, { "epoch": 2.1118233618233617, "grad_norm": 0.49173009395599365, "learning_rate": 8.046636493863873e-05, "loss": 0.5226, "step": 94880 }, { "epoch": 2.1120459401709404, "grad_norm": 0.49693116545677185, "learning_rate": 8.042898910997212e-05, "loss": 0.4516, "step": 94890 }, { "epoch": 2.1122685185185186, "grad_norm": 0.43563681840896606, "learning_rate": 8.039161977889205e-05, "loss": 0.465, "step": 94900 }, { "epoch": 2.112491096866097, "grad_norm": 0.879296064376831, "learning_rate": 8.035425694742929e-05, "loss": 0.4238, "step": 94910 }, { "epoch": 2.112713675213675, "grad_norm": 0.763816773891449, "learning_rate": 8.031690061761414e-05, "loss": 0.4125, "step": 94920 }, { "epoch": 2.1129362535612537, "grad_norm": 0.5743877291679382, "learning_rate": 8.02795507914766e-05, "loss": 0.4042, "step": 94930 }, { "epoch": 2.113158831908832, "grad_norm": 0.4411097764968872, "learning_rate": 8.024220747104627e-05, "loss": 0.4687, "step": 94940 }, { "epoch": 2.11338141025641, "grad_norm": 0.6350961923599243, "learning_rate": 8.020487065835243e-05, "loss": 0.4661, "step": 94950 }, { "epoch": 2.113603988603989, "grad_norm": 0.5903646349906921, "learning_rate": 8.016754035542404e-05, "loss": 0.4633, "step": 94960 }, { "epoch": 2.113826566951567, "grad_norm": 0.6953848600387573, "learning_rate": 8.013021656428954e-05, "loss": 0.5241, "step": 94970 }, { "epoch": 2.1140491452991452, "grad_norm": 0.41217243671417236, "learning_rate": 8.00928992869772e-05, "loss": 0.4664, "step": 94980 }, { "epoch": 2.1142717236467234, "grad_norm": 0.555772602558136, "learning_rate": 8.00555885255149e-05, "loss": 0.4651, "step": 94990 }, { "epoch": 2.114494301994302, "grad_norm": 0.5230336785316467, "learning_rate": 8.00182842819301e-05, "loss": 0.5052, "step": 95000 }, { "epoch": 2.1147168803418803, "grad_norm": 0.7198736667633057, "learning_rate": 7.998098655824995e-05, "loss": 0.516, "step": 95010 }, { "epoch": 2.1149394586894585, "grad_norm": 0.45915162563323975, "learning_rate": 7.99436953565013e-05, "loss": 0.4227, "step": 95020 }, { "epoch": 2.115162037037037, "grad_norm": 0.8346054553985596, "learning_rate": 7.990641067871054e-05, "loss": 0.4145, "step": 95030 }, { "epoch": 2.1153846153846154, "grad_norm": 0.5007335543632507, "learning_rate": 7.986913252690367e-05, "loss": 0.3814, "step": 95040 }, { "epoch": 2.1156071937321936, "grad_norm": 0.6236112117767334, "learning_rate": 7.983186090310648e-05, "loss": 0.474, "step": 95050 }, { "epoch": 2.1158297720797723, "grad_norm": 0.7768984436988831, "learning_rate": 7.979459580934434e-05, "loss": 0.4494, "step": 95060 }, { "epoch": 2.1160523504273505, "grad_norm": 0.5049422979354858, "learning_rate": 7.975733724764225e-05, "loss": 0.384, "step": 95070 }, { "epoch": 2.1162749287749287, "grad_norm": 0.45497995615005493, "learning_rate": 7.972008522002491e-05, "loss": 0.352, "step": 95080 }, { "epoch": 2.116497507122507, "grad_norm": 0.3762676417827606, "learning_rate": 7.968283972851669e-05, "loss": 0.5393, "step": 95090 }, { "epoch": 2.1167200854700856, "grad_norm": 0.4134499430656433, "learning_rate": 7.964560077514136e-05, "loss": 0.5039, "step": 95100 }, { "epoch": 2.116942663817664, "grad_norm": 0.7410092353820801, "learning_rate": 7.960836836192263e-05, "loss": 0.5567, "step": 95110 }, { "epoch": 2.117165242165242, "grad_norm": 0.637007474899292, "learning_rate": 7.957114249088369e-05, "loss": 0.4832, "step": 95120 }, { "epoch": 2.1173878205128207, "grad_norm": 0.7132471203804016, "learning_rate": 7.953392316404748e-05, "loss": 0.4884, "step": 95130 }, { "epoch": 2.117610398860399, "grad_norm": 0.9175398945808411, "learning_rate": 7.94967103834365e-05, "loss": 0.5452, "step": 95140 }, { "epoch": 2.117832977207977, "grad_norm": 0.4358776807785034, "learning_rate": 7.945950415107299e-05, "loss": 0.4458, "step": 95150 }, { "epoch": 2.1180555555555554, "grad_norm": 0.8178651928901672, "learning_rate": 7.942230446897862e-05, "loss": 0.5678, "step": 95160 }, { "epoch": 2.118278133903134, "grad_norm": 0.4699116349220276, "learning_rate": 7.938511133917503e-05, "loss": 0.3165, "step": 95170 }, { "epoch": 2.1185007122507122, "grad_norm": 0.4907972812652588, "learning_rate": 7.934792476368316e-05, "loss": 0.4241, "step": 95180 }, { "epoch": 2.1187232905982905, "grad_norm": 0.5656859278678894, "learning_rate": 7.931074474452381e-05, "loss": 0.4015, "step": 95190 }, { "epoch": 2.118945868945869, "grad_norm": 0.4246733784675598, "learning_rate": 7.927357128371739e-05, "loss": 0.4926, "step": 95200 }, { "epoch": 2.1191684472934473, "grad_norm": 0.8523581027984619, "learning_rate": 7.923640438328396e-05, "loss": 0.5256, "step": 95210 }, { "epoch": 2.1193910256410255, "grad_norm": 0.37937796115875244, "learning_rate": 7.919924404524317e-05, "loss": 0.4751, "step": 95220 }, { "epoch": 2.119613603988604, "grad_norm": 0.4993674159049988, "learning_rate": 7.916209027161441e-05, "loss": 0.4355, "step": 95230 }, { "epoch": 2.1198361823361824, "grad_norm": 0.7296259999275208, "learning_rate": 7.912494306441654e-05, "loss": 0.5167, "step": 95240 }, { "epoch": 2.1200587606837606, "grad_norm": 0.3281956911087036, "learning_rate": 7.908780242566817e-05, "loss": 0.5247, "step": 95250 }, { "epoch": 2.120281339031339, "grad_norm": 0.38215774297714233, "learning_rate": 7.905066835738763e-05, "loss": 0.349, "step": 95260 }, { "epoch": 2.1205039173789175, "grad_norm": 0.6754162907600403, "learning_rate": 7.90135408615928e-05, "loss": 0.4428, "step": 95270 }, { "epoch": 2.1207264957264957, "grad_norm": 0.6413685083389282, "learning_rate": 7.897641994030127e-05, "loss": 0.4805, "step": 95280 }, { "epoch": 2.120949074074074, "grad_norm": 0.633334755897522, "learning_rate": 7.893930559553007e-05, "loss": 0.4691, "step": 95290 }, { "epoch": 2.1211716524216526, "grad_norm": 0.5734654664993286, "learning_rate": 7.890219782929611e-05, "loss": 0.4752, "step": 95300 }, { "epoch": 2.121394230769231, "grad_norm": 0.6622852683067322, "learning_rate": 7.886509664361592e-05, "loss": 0.4182, "step": 95310 }, { "epoch": 2.121616809116809, "grad_norm": 0.6082544922828674, "learning_rate": 7.882800204050549e-05, "loss": 0.5433, "step": 95320 }, { "epoch": 2.1218393874643873, "grad_norm": 0.5464411377906799, "learning_rate": 7.879091402198062e-05, "loss": 0.4334, "step": 95330 }, { "epoch": 2.122061965811966, "grad_norm": 0.47847822308540344, "learning_rate": 7.875383259005671e-05, "loss": 0.4479, "step": 95340 }, { "epoch": 2.122284544159544, "grad_norm": 0.49448779225349426, "learning_rate": 7.871675774674878e-05, "loss": 0.4607, "step": 95350 }, { "epoch": 2.1225071225071224, "grad_norm": 0.4804612398147583, "learning_rate": 7.867968949407153e-05, "loss": 0.4253, "step": 95360 }, { "epoch": 2.122729700854701, "grad_norm": 0.599602997303009, "learning_rate": 7.864262783403935e-05, "loss": 0.3489, "step": 95370 }, { "epoch": 2.1229522792022792, "grad_norm": 0.5012718439102173, "learning_rate": 7.860557276866603e-05, "loss": 0.6108, "step": 95380 }, { "epoch": 2.1231748575498575, "grad_norm": 0.6502254605293274, "learning_rate": 7.856852429996528e-05, "loss": 0.5054, "step": 95390 }, { "epoch": 2.123397435897436, "grad_norm": 0.5006417632102966, "learning_rate": 7.853148242995031e-05, "loss": 0.333, "step": 95400 }, { "epoch": 2.1236200142450143, "grad_norm": 0.8475795388221741, "learning_rate": 7.849444716063405e-05, "loss": 0.5125, "step": 95410 }, { "epoch": 2.1238425925925926, "grad_norm": 0.6902666091918945, "learning_rate": 7.845741849402906e-05, "loss": 0.4753, "step": 95420 }, { "epoch": 2.1240651709401708, "grad_norm": 0.6792203783988953, "learning_rate": 7.842039643214737e-05, "loss": 0.4453, "step": 95430 }, { "epoch": 2.1242877492877494, "grad_norm": 0.6841697096824646, "learning_rate": 7.838338097700088e-05, "loss": 0.4297, "step": 95440 }, { "epoch": 2.1245103276353277, "grad_norm": 0.47156375646591187, "learning_rate": 7.83463721306011e-05, "loss": 0.5173, "step": 95450 }, { "epoch": 2.124732905982906, "grad_norm": 0.6936251521110535, "learning_rate": 7.830936989495897e-05, "loss": 0.4983, "step": 95460 }, { "epoch": 2.1249554843304845, "grad_norm": 0.3907637596130371, "learning_rate": 7.827237427208529e-05, "loss": 0.5749, "step": 95470 }, { "epoch": 2.1251780626780628, "grad_norm": 0.6941818594932556, "learning_rate": 7.823538526399045e-05, "loss": 0.4437, "step": 95480 }, { "epoch": 2.125400641025641, "grad_norm": 0.49002397060394287, "learning_rate": 7.819840287268444e-05, "loss": 0.3759, "step": 95490 }, { "epoch": 2.125623219373219, "grad_norm": 0.8776218295097351, "learning_rate": 7.816142710017697e-05, "loss": 0.4317, "step": 95500 }, { "epoch": 2.125845797720798, "grad_norm": 0.6318543553352356, "learning_rate": 7.812445794847734e-05, "loss": 0.4536, "step": 95510 }, { "epoch": 2.126068376068376, "grad_norm": 0.39666521549224854, "learning_rate": 7.808749541959437e-05, "loss": 0.5282, "step": 95520 }, { "epoch": 2.1262909544159543, "grad_norm": 0.6775786280632019, "learning_rate": 7.80505395155367e-05, "loss": 0.638, "step": 95530 }, { "epoch": 2.126513532763533, "grad_norm": 0.3907429575920105, "learning_rate": 7.801359023831254e-05, "loss": 0.4733, "step": 95540 }, { "epoch": 2.126736111111111, "grad_norm": 0.4724726974964142, "learning_rate": 7.797664758992984e-05, "loss": 0.455, "step": 95550 }, { "epoch": 2.1269586894586894, "grad_norm": 0.5766648054122925, "learning_rate": 7.79397115723959e-05, "loss": 0.3844, "step": 95560 }, { "epoch": 2.127181267806268, "grad_norm": 0.5338031053543091, "learning_rate": 7.790278218771798e-05, "loss": 0.4238, "step": 95570 }, { "epoch": 2.1274038461538463, "grad_norm": 0.5047205090522766, "learning_rate": 7.786585943790283e-05, "loss": 0.4362, "step": 95580 }, { "epoch": 2.1276264245014245, "grad_norm": 0.3229919970035553, "learning_rate": 7.782894332495691e-05, "loss": 0.4147, "step": 95590 }, { "epoch": 2.1278490028490027, "grad_norm": 0.49135375022888184, "learning_rate": 7.779203385088618e-05, "loss": 0.4977, "step": 95600 }, { "epoch": 2.1280715811965814, "grad_norm": 0.5660061836242676, "learning_rate": 7.775513101769636e-05, "loss": 0.4634, "step": 95610 }, { "epoch": 2.1282941595441596, "grad_norm": 0.5993034243583679, "learning_rate": 7.77182348273928e-05, "loss": 0.4497, "step": 95620 }, { "epoch": 2.128516737891738, "grad_norm": 0.5378540754318237, "learning_rate": 7.768134528198046e-05, "loss": 0.4574, "step": 95630 }, { "epoch": 2.128739316239316, "grad_norm": 0.4644266366958618, "learning_rate": 7.764446238346395e-05, "loss": 0.5494, "step": 95640 }, { "epoch": 2.1289618945868947, "grad_norm": 0.7540342211723328, "learning_rate": 7.76075861338476e-05, "loss": 0.4136, "step": 95650 }, { "epoch": 2.129184472934473, "grad_norm": 0.6110064387321472, "learning_rate": 7.757071653513512e-05, "loss": 0.3748, "step": 95660 }, { "epoch": 2.129407051282051, "grad_norm": 0.4860828220844269, "learning_rate": 7.753385358933016e-05, "loss": 0.4584, "step": 95670 }, { "epoch": 2.1296296296296298, "grad_norm": 0.6992922425270081, "learning_rate": 7.749699729843591e-05, "loss": 0.4407, "step": 95680 }, { "epoch": 2.129852207977208, "grad_norm": 0.46820303797721863, "learning_rate": 7.746014766445504e-05, "loss": 0.4352, "step": 95690 }, { "epoch": 2.130074786324786, "grad_norm": 0.4686327874660492, "learning_rate": 7.742330468939006e-05, "loss": 0.4255, "step": 95700 }, { "epoch": 2.130297364672365, "grad_norm": 0.7035104036331177, "learning_rate": 7.738646837524306e-05, "loss": 0.5257, "step": 95710 }, { "epoch": 2.130519943019943, "grad_norm": 0.6425663232803345, "learning_rate": 7.734963872401573e-05, "loss": 0.5195, "step": 95720 }, { "epoch": 2.1307425213675213, "grad_norm": 0.6731189489364624, "learning_rate": 7.73128157377095e-05, "loss": 0.4616, "step": 95730 }, { "epoch": 2.1309650997150995, "grad_norm": 0.5955937504768372, "learning_rate": 7.727599941832526e-05, "loss": 0.5135, "step": 95740 }, { "epoch": 2.131187678062678, "grad_norm": 0.4566531479358673, "learning_rate": 7.723918976786366e-05, "loss": 0.3921, "step": 95750 }, { "epoch": 2.1314102564102564, "grad_norm": 0.5514445900917053, "learning_rate": 7.720238678832498e-05, "loss": 0.5411, "step": 95760 }, { "epoch": 2.1316328347578346, "grad_norm": 0.4605773687362671, "learning_rate": 7.716559048170913e-05, "loss": 0.383, "step": 95770 }, { "epoch": 2.1318554131054133, "grad_norm": 0.7322866320610046, "learning_rate": 7.712880085001565e-05, "loss": 0.5983, "step": 95780 }, { "epoch": 2.1320779914529915, "grad_norm": 0.36871546506881714, "learning_rate": 7.709201789524381e-05, "loss": 0.3566, "step": 95790 }, { "epoch": 2.1323005698005697, "grad_norm": 0.5514602661132812, "learning_rate": 7.705524161939223e-05, "loss": 0.469, "step": 95800 }, { "epoch": 2.132523148148148, "grad_norm": 0.7098495960235596, "learning_rate": 7.701847202445956e-05, "loss": 0.486, "step": 95810 }, { "epoch": 2.1327457264957266, "grad_norm": 0.5476725101470947, "learning_rate": 7.698170911244373e-05, "loss": 0.3667, "step": 95820 }, { "epoch": 2.132968304843305, "grad_norm": 0.7701019048690796, "learning_rate": 7.694495288534252e-05, "loss": 0.472, "step": 95830 }, { "epoch": 2.133190883190883, "grad_norm": 0.586618185043335, "learning_rate": 7.690820334515331e-05, "loss": 0.3609, "step": 95840 }, { "epoch": 2.1334134615384617, "grad_norm": 0.35569578409194946, "learning_rate": 7.68714604938731e-05, "loss": 0.3996, "step": 95850 }, { "epoch": 2.13363603988604, "grad_norm": 0.7099726796150208, "learning_rate": 7.683472433349854e-05, "loss": 0.4586, "step": 95860 }, { "epoch": 2.133858618233618, "grad_norm": 0.6935610175132751, "learning_rate": 7.679799486602595e-05, "loss": 0.424, "step": 95870 }, { "epoch": 2.1340811965811968, "grad_norm": 0.5956912040710449, "learning_rate": 7.67612720934511e-05, "loss": 0.4726, "step": 95880 }, { "epoch": 2.134303774928775, "grad_norm": 0.6841393113136292, "learning_rate": 7.67245560177696e-05, "loss": 0.5007, "step": 95890 }, { "epoch": 2.134526353276353, "grad_norm": 0.6714563965797424, "learning_rate": 7.668784664097668e-05, "loss": 0.4712, "step": 95900 }, { "epoch": 2.1347489316239314, "grad_norm": 0.4370259642601013, "learning_rate": 7.665114396506709e-05, "loss": 0.5509, "step": 95910 }, { "epoch": 2.13497150997151, "grad_norm": 0.7829974889755249, "learning_rate": 7.661444799203532e-05, "loss": 0.426, "step": 95920 }, { "epoch": 2.1351940883190883, "grad_norm": 0.5304891467094421, "learning_rate": 7.657775872387554e-05, "loss": 0.5888, "step": 95930 }, { "epoch": 2.1354166666666665, "grad_norm": 0.4999597370624542, "learning_rate": 7.654107616258137e-05, "loss": 0.4326, "step": 95940 }, { "epoch": 2.135639245014245, "grad_norm": 0.5106223821640015, "learning_rate": 7.650440031014611e-05, "loss": 0.4832, "step": 95950 }, { "epoch": 2.1358618233618234, "grad_norm": 0.5175594687461853, "learning_rate": 7.646773116856287e-05, "loss": 0.3942, "step": 95960 }, { "epoch": 2.1360844017094016, "grad_norm": 0.6161689162254333, "learning_rate": 7.643106873982422e-05, "loss": 0.5379, "step": 95970 }, { "epoch": 2.13630698005698, "grad_norm": 0.6075182557106018, "learning_rate": 7.639441302592248e-05, "loss": 0.4709, "step": 95980 }, { "epoch": 2.1365295584045585, "grad_norm": 0.9312088489532471, "learning_rate": 7.635776402884949e-05, "loss": 0.5752, "step": 95990 }, { "epoch": 2.1367521367521367, "grad_norm": 0.5864628553390503, "learning_rate": 7.632112175059684e-05, "loss": 0.5621, "step": 96000 }, { "epoch": 2.136974715099715, "grad_norm": 0.46275755763053894, "learning_rate": 7.628448619315575e-05, "loss": 0.3262, "step": 96010 }, { "epoch": 2.1371972934472936, "grad_norm": 0.5860275030136108, "learning_rate": 7.62478573585169e-05, "loss": 0.498, "step": 96020 }, { "epoch": 2.137419871794872, "grad_norm": 0.3560199737548828, "learning_rate": 7.621123524867077e-05, "loss": 0.4276, "step": 96030 }, { "epoch": 2.13764245014245, "grad_norm": 0.4286309778690338, "learning_rate": 7.617461986560746e-05, "loss": 0.4103, "step": 96040 }, { "epoch": 2.1378650284900287, "grad_norm": 0.28722521662712097, "learning_rate": 7.613801121131667e-05, "loss": 0.3054, "step": 96050 }, { "epoch": 2.138087606837607, "grad_norm": 0.766991376876831, "learning_rate": 7.610140928778777e-05, "loss": 0.5683, "step": 96060 }, { "epoch": 2.138310185185185, "grad_norm": 0.7361512184143066, "learning_rate": 7.606481409700976e-05, "loss": 0.5906, "step": 96070 }, { "epoch": 2.1385327635327633, "grad_norm": 0.46420538425445557, "learning_rate": 7.602822564097122e-05, "loss": 0.4366, "step": 96080 }, { "epoch": 2.138755341880342, "grad_norm": 0.8343654870986938, "learning_rate": 7.599164392166033e-05, "loss": 0.4457, "step": 96090 }, { "epoch": 2.13897792022792, "grad_norm": 0.6298092603683472, "learning_rate": 7.595506894106503e-05, "loss": 0.4655, "step": 96100 }, { "epoch": 2.1392004985754984, "grad_norm": 0.6869631409645081, "learning_rate": 7.591850070117281e-05, "loss": 0.3374, "step": 96110 }, { "epoch": 2.139423076923077, "grad_norm": 0.6529879570007324, "learning_rate": 7.588193920397084e-05, "loss": 0.3987, "step": 96120 }, { "epoch": 2.1396456552706553, "grad_norm": 0.4423435628414154, "learning_rate": 7.584538445144591e-05, "loss": 0.401, "step": 96130 }, { "epoch": 2.1398682336182335, "grad_norm": 0.6857985258102417, "learning_rate": 7.580883644558443e-05, "loss": 0.5029, "step": 96140 }, { "epoch": 2.1400908119658117, "grad_norm": 0.4324056804180145, "learning_rate": 7.577229518837252e-05, "loss": 0.4247, "step": 96150 }, { "epoch": 2.1403133903133904, "grad_norm": 0.5784803628921509, "learning_rate": 7.57357606817957e-05, "loss": 0.474, "step": 96160 }, { "epoch": 2.1405359686609686, "grad_norm": 0.5325353741645813, "learning_rate": 7.569923292783938e-05, "loss": 0.4022, "step": 96170 }, { "epoch": 2.140758547008547, "grad_norm": 0.35216307640075684, "learning_rate": 7.56627119284885e-05, "loss": 0.4482, "step": 96180 }, { "epoch": 2.1409811253561255, "grad_norm": 0.8801172375679016, "learning_rate": 7.562619768572765e-05, "loss": 0.5553, "step": 96190 }, { "epoch": 2.1412037037037037, "grad_norm": 0.6519230008125305, "learning_rate": 7.55896902015411e-05, "loss": 0.502, "step": 96200 }, { "epoch": 2.141426282051282, "grad_norm": 0.5615265965461731, "learning_rate": 7.555318947791257e-05, "loss": 0.4937, "step": 96210 }, { "epoch": 2.1416488603988606, "grad_norm": 0.49167194962501526, "learning_rate": 7.551669551682565e-05, "loss": 0.523, "step": 96220 }, { "epoch": 2.141871438746439, "grad_norm": 0.48790618777275085, "learning_rate": 7.548020832026335e-05, "loss": 0.4501, "step": 96230 }, { "epoch": 2.142094017094017, "grad_norm": 0.8500549793243408, "learning_rate": 7.544372789020844e-05, "loss": 0.4977, "step": 96240 }, { "epoch": 2.1423165954415953, "grad_norm": 0.715935230255127, "learning_rate": 7.540725422864334e-05, "loss": 0.4873, "step": 96250 }, { "epoch": 2.142539173789174, "grad_norm": 0.6317983865737915, "learning_rate": 7.537078733755005e-05, "loss": 0.4425, "step": 96260 }, { "epoch": 2.142761752136752, "grad_norm": 1.0182199478149414, "learning_rate": 7.53343272189102e-05, "loss": 0.5664, "step": 96270 }, { "epoch": 2.1429843304843303, "grad_norm": 0.7626780271530151, "learning_rate": 7.529787387470506e-05, "loss": 0.5206, "step": 96280 }, { "epoch": 2.143206908831909, "grad_norm": 0.45802199840545654, "learning_rate": 7.526142730691561e-05, "loss": 0.4275, "step": 96290 }, { "epoch": 2.1434294871794872, "grad_norm": 0.5165693759918213, "learning_rate": 7.522498751752225e-05, "loss": 0.5341, "step": 96300 }, { "epoch": 2.1436520655270654, "grad_norm": 0.5957320928573608, "learning_rate": 7.518855450850519e-05, "loss": 0.3631, "step": 96310 }, { "epoch": 2.1438746438746437, "grad_norm": 0.5179235339164734, "learning_rate": 7.515212828184428e-05, "loss": 0.449, "step": 96320 }, { "epoch": 2.1440972222222223, "grad_norm": 0.4287464916706085, "learning_rate": 7.5115708839519e-05, "loss": 0.4, "step": 96330 }, { "epoch": 2.1443198005698005, "grad_norm": 0.45676085352897644, "learning_rate": 7.507929618350824e-05, "loss": 0.4563, "step": 96340 }, { "epoch": 2.1445423789173788, "grad_norm": 0.6015214323997498, "learning_rate": 7.504289031579081e-05, "loss": 0.4153, "step": 96350 }, { "epoch": 2.1447649572649574, "grad_norm": 0.6132090091705322, "learning_rate": 7.500649123834507e-05, "loss": 0.4989, "step": 96360 }, { "epoch": 2.1449875356125356, "grad_norm": 0.4717426598072052, "learning_rate": 7.497009895314887e-05, "loss": 0.5924, "step": 96370 }, { "epoch": 2.145210113960114, "grad_norm": 0.48287132382392883, "learning_rate": 7.493371346217983e-05, "loss": 0.39, "step": 96380 }, { "epoch": 2.1454326923076925, "grad_norm": 0.589698076248169, "learning_rate": 7.489733476741519e-05, "loss": 0.4849, "step": 96390 }, { "epoch": 2.1456552706552707, "grad_norm": 0.4884866178035736, "learning_rate": 7.48609628708318e-05, "loss": 0.4765, "step": 96400 }, { "epoch": 2.145877849002849, "grad_norm": 0.5568512082099915, "learning_rate": 7.482459777440612e-05, "loss": 0.4794, "step": 96410 }, { "epoch": 2.146100427350427, "grad_norm": 0.4387529194355011, "learning_rate": 7.478823948011429e-05, "loss": 0.481, "step": 96420 }, { "epoch": 2.146323005698006, "grad_norm": 0.5052660703659058, "learning_rate": 7.475188798993206e-05, "loss": 0.4859, "step": 96430 }, { "epoch": 2.146545584045584, "grad_norm": 0.5194045305252075, "learning_rate": 7.471554330583475e-05, "loss": 0.3896, "step": 96440 }, { "epoch": 2.1467681623931623, "grad_norm": 0.7493390440940857, "learning_rate": 7.467920542979734e-05, "loss": 0.5528, "step": 96450 }, { "epoch": 2.146990740740741, "grad_norm": 0.48206827044487, "learning_rate": 7.464287436379451e-05, "loss": 0.5136, "step": 96460 }, { "epoch": 2.147213319088319, "grad_norm": 0.5970621705055237, "learning_rate": 7.460655010980058e-05, "loss": 0.4574, "step": 96470 }, { "epoch": 2.1474358974358974, "grad_norm": 0.4395906627178192, "learning_rate": 7.45702326697893e-05, "loss": 0.5257, "step": 96480 }, { "epoch": 2.1476584757834756, "grad_norm": 0.6027551293373108, "learning_rate": 7.453392204573426e-05, "loss": 0.527, "step": 96490 }, { "epoch": 2.1478810541310542, "grad_norm": 0.4774160087108612, "learning_rate": 7.449761823960868e-05, "loss": 0.4066, "step": 96500 }, { "epoch": 2.1481036324786325, "grad_norm": 0.8844955563545227, "learning_rate": 7.446132125338519e-05, "loss": 0.536, "step": 96510 }, { "epoch": 2.1483262108262107, "grad_norm": 0.5076330304145813, "learning_rate": 7.442503108903629e-05, "loss": 0.5438, "step": 96520 }, { "epoch": 2.1485487891737893, "grad_norm": 0.6229117512702942, "learning_rate": 7.438874774853397e-05, "loss": 0.5332, "step": 96530 }, { "epoch": 2.1487713675213675, "grad_norm": 0.4757481515407562, "learning_rate": 7.435247123384996e-05, "loss": 0.4961, "step": 96540 }, { "epoch": 2.1489939458689458, "grad_norm": 0.5127527117729187, "learning_rate": 7.431620154695551e-05, "loss": 0.4622, "step": 96550 }, { "epoch": 2.1492165242165244, "grad_norm": 0.8079750537872314, "learning_rate": 7.427993868982155e-05, "loss": 0.4647, "step": 96560 }, { "epoch": 2.1494391025641026, "grad_norm": 0.656417965888977, "learning_rate": 7.424368266441873e-05, "loss": 0.4997, "step": 96570 }, { "epoch": 2.149661680911681, "grad_norm": 0.46112340688705444, "learning_rate": 7.420743347271703e-05, "loss": 0.5201, "step": 96580 }, { "epoch": 2.149884259259259, "grad_norm": 0.7423391938209534, "learning_rate": 7.417119111668642e-05, "loss": 0.5169, "step": 96590 }, { "epoch": 2.1501068376068377, "grad_norm": 0.4680321514606476, "learning_rate": 7.413495559829635e-05, "loss": 0.3816, "step": 96600 }, { "epoch": 2.150329415954416, "grad_norm": 0.6164966225624084, "learning_rate": 7.409872691951573e-05, "loss": 0.4939, "step": 96610 }, { "epoch": 2.150551994301994, "grad_norm": 0.7925980687141418, "learning_rate": 7.40625050823134e-05, "loss": 0.4332, "step": 96620 }, { "epoch": 2.150774572649573, "grad_norm": 0.48925745487213135, "learning_rate": 7.402629008865763e-05, "loss": 0.4911, "step": 96630 }, { "epoch": 2.150997150997151, "grad_norm": 0.45277827978134155, "learning_rate": 7.399008194051644e-05, "loss": 0.5383, "step": 96640 }, { "epoch": 2.1512197293447293, "grad_norm": 0.4843733310699463, "learning_rate": 7.395388063985729e-05, "loss": 0.3939, "step": 96650 }, { "epoch": 2.1514423076923075, "grad_norm": 0.7864764928817749, "learning_rate": 7.391768618864745e-05, "loss": 0.4114, "step": 96660 }, { "epoch": 2.151664886039886, "grad_norm": 0.6666766405105591, "learning_rate": 7.388149858885378e-05, "loss": 0.4779, "step": 96670 }, { "epoch": 2.1518874643874644, "grad_norm": 0.6841953992843628, "learning_rate": 7.384531784244271e-05, "loss": 0.4747, "step": 96680 }, { "epoch": 2.1521100427350426, "grad_norm": 0.7607446312904358, "learning_rate": 7.380914395138033e-05, "loss": 0.4095, "step": 96690 }, { "epoch": 2.1523326210826212, "grad_norm": 0.6345654129981995, "learning_rate": 7.377297691763239e-05, "loss": 0.5492, "step": 96700 }, { "epoch": 2.1525551994301995, "grad_norm": 0.5190750956535339, "learning_rate": 7.373681674316426e-05, "loss": 0.457, "step": 96710 }, { "epoch": 2.1527777777777777, "grad_norm": 0.3715447783470154, "learning_rate": 7.370066342994081e-05, "loss": 0.4415, "step": 96720 }, { "epoch": 2.1530003561253563, "grad_norm": 0.715945303440094, "learning_rate": 7.36645169799268e-05, "loss": 0.4283, "step": 96730 }, { "epoch": 2.1532229344729346, "grad_norm": 0.6834224462509155, "learning_rate": 7.362837739508629e-05, "loss": 0.551, "step": 96740 }, { "epoch": 2.1534455128205128, "grad_norm": 0.46780624985694885, "learning_rate": 7.359224467738317e-05, "loss": 0.383, "step": 96750 }, { "epoch": 2.153668091168091, "grad_norm": 0.6412452459335327, "learning_rate": 7.355611882878097e-05, "loss": 0.4973, "step": 96760 }, { "epoch": 2.1538906695156697, "grad_norm": 0.44822946190834045, "learning_rate": 7.35199998512428e-05, "loss": 0.4009, "step": 96770 }, { "epoch": 2.154113247863248, "grad_norm": 0.4693109095096588, "learning_rate": 7.348388774673143e-05, "loss": 0.4893, "step": 96780 }, { "epoch": 2.154335826210826, "grad_norm": 0.8119006156921387, "learning_rate": 7.344778251720911e-05, "loss": 0.4104, "step": 96790 }, { "epoch": 2.1545584045584047, "grad_norm": 0.5076255798339844, "learning_rate": 7.341168416463789e-05, "loss": 0.5757, "step": 96800 }, { "epoch": 2.154780982905983, "grad_norm": 0.5202703475952148, "learning_rate": 7.337559269097938e-05, "loss": 0.5638, "step": 96810 }, { "epoch": 2.155003561253561, "grad_norm": 0.5872973203659058, "learning_rate": 7.333950809819484e-05, "loss": 0.4893, "step": 96820 }, { "epoch": 2.1552261396011394, "grad_norm": 0.4134224057197571, "learning_rate": 7.33034303882451e-05, "loss": 0.5044, "step": 96830 }, { "epoch": 2.155448717948718, "grad_norm": 0.48488810658454895, "learning_rate": 7.326735956309074e-05, "loss": 0.4659, "step": 96840 }, { "epoch": 2.1556712962962963, "grad_norm": 0.5602866411209106, "learning_rate": 7.323129562469174e-05, "loss": 0.4519, "step": 96850 }, { "epoch": 2.1558938746438745, "grad_norm": 0.3875146508216858, "learning_rate": 7.319523857500798e-05, "loss": 0.4158, "step": 96860 }, { "epoch": 2.156116452991453, "grad_norm": 0.5519022345542908, "learning_rate": 7.315918841599869e-05, "loss": 0.4951, "step": 96870 }, { "epoch": 2.1563390313390314, "grad_norm": 0.7909629344940186, "learning_rate": 7.312314514962295e-05, "loss": 0.5099, "step": 96880 }, { "epoch": 2.1565616096866096, "grad_norm": 0.334187388420105, "learning_rate": 7.308710877783937e-05, "loss": 0.4609, "step": 96890 }, { "epoch": 2.1567841880341883, "grad_norm": 0.3953896462917328, "learning_rate": 7.305107930260619e-05, "loss": 0.4981, "step": 96900 }, { "epoch": 2.1570067663817665, "grad_norm": 0.5780625939369202, "learning_rate": 7.30150567258813e-05, "loss": 0.3871, "step": 96910 }, { "epoch": 2.1572293447293447, "grad_norm": 0.7589643597602844, "learning_rate": 7.297904104962223e-05, "loss": 0.4304, "step": 96920 }, { "epoch": 2.157451923076923, "grad_norm": 0.584696888923645, "learning_rate": 7.2943032275786e-05, "loss": 0.4656, "step": 96930 }, { "epoch": 2.1576745014245016, "grad_norm": 0.6244634985923767, "learning_rate": 7.29070304063294e-05, "loss": 0.4773, "step": 96940 }, { "epoch": 2.15789707977208, "grad_norm": 0.6585376858711243, "learning_rate": 7.287103544320881e-05, "loss": 0.4357, "step": 96950 }, { "epoch": 2.158119658119658, "grad_norm": 0.605294942855835, "learning_rate": 7.283504738838022e-05, "loss": 0.5408, "step": 96960 }, { "epoch": 2.1583422364672367, "grad_norm": 0.5022411346435547, "learning_rate": 7.279906624379928e-05, "loss": 0.5125, "step": 96970 }, { "epoch": 2.158564814814815, "grad_norm": 0.6028233766555786, "learning_rate": 7.276309201142129e-05, "loss": 0.5237, "step": 96980 }, { "epoch": 2.158787393162393, "grad_norm": 0.5513371229171753, "learning_rate": 7.272712469320094e-05, "loss": 0.4935, "step": 96990 }, { "epoch": 2.1590099715099713, "grad_norm": 0.922234833240509, "learning_rate": 7.269116429109291e-05, "loss": 0.4227, "step": 97000 }, { "epoch": 2.15923254985755, "grad_norm": 0.6376217007637024, "learning_rate": 7.265521080705115e-05, "loss": 0.4728, "step": 97010 }, { "epoch": 2.159455128205128, "grad_norm": 0.5466942191123962, "learning_rate": 7.261926424302949e-05, "loss": 0.4958, "step": 97020 }, { "epoch": 2.1596777065527064, "grad_norm": 0.4967202842235565, "learning_rate": 7.25833246009813e-05, "loss": 0.4079, "step": 97030 }, { "epoch": 2.159900284900285, "grad_norm": 0.48792919516563416, "learning_rate": 7.254739188285955e-05, "loss": 0.419, "step": 97040 }, { "epoch": 2.1601228632478633, "grad_norm": 0.6444426774978638, "learning_rate": 7.251146609061685e-05, "loss": 0.355, "step": 97050 }, { "epoch": 2.16025641025641, "eval_loss": 0.5352727174758911, "eval_runtime": 337.3252, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 97056 }, { "epoch": 2.1603454415954415, "grad_norm": 0.597774088382721, "learning_rate": 7.247554722620552e-05, "loss": 0.5716, "step": 97060 }, { "epoch": 2.16056801994302, "grad_norm": 0.6890891790390015, "learning_rate": 7.243963529157731e-05, "loss": 0.5159, "step": 97070 }, { "epoch": 2.1607905982905984, "grad_norm": 0.5812705755233765, "learning_rate": 7.240373028868372e-05, "loss": 0.3916, "step": 97080 }, { "epoch": 2.1610131766381766, "grad_norm": 0.6706557869911194, "learning_rate": 7.236783221947589e-05, "loss": 0.5548, "step": 97090 }, { "epoch": 2.161235754985755, "grad_norm": 0.5046707391738892, "learning_rate": 7.233194108590455e-05, "loss": 0.3913, "step": 97100 }, { "epoch": 2.1614583333333335, "grad_norm": 0.40967047214508057, "learning_rate": 7.229605688992002e-05, "loss": 0.4302, "step": 97110 }, { "epoch": 2.1616809116809117, "grad_norm": 0.5742045640945435, "learning_rate": 7.22601796334724e-05, "loss": 0.4413, "step": 97120 }, { "epoch": 2.16190349002849, "grad_norm": 0.6036415100097656, "learning_rate": 7.222430931851109e-05, "loss": 0.4734, "step": 97130 }, { "epoch": 2.1621260683760686, "grad_norm": 0.7149750590324402, "learning_rate": 7.218844594698552e-05, "loss": 0.5379, "step": 97140 }, { "epoch": 2.162348646723647, "grad_norm": 0.48467162251472473, "learning_rate": 7.215258952084434e-05, "loss": 0.4919, "step": 97150 }, { "epoch": 2.162571225071225, "grad_norm": 0.588701605796814, "learning_rate": 7.21167400420361e-05, "loss": 0.3474, "step": 97160 }, { "epoch": 2.1627938034188032, "grad_norm": 0.5003872513771057, "learning_rate": 7.208089751250891e-05, "loss": 0.5029, "step": 97170 }, { "epoch": 2.163016381766382, "grad_norm": 0.8031962513923645, "learning_rate": 7.204506193421045e-05, "loss": 0.5116, "step": 97180 }, { "epoch": 2.16323896011396, "grad_norm": 0.5941150188446045, "learning_rate": 7.200923330908811e-05, "loss": 0.5218, "step": 97190 }, { "epoch": 2.1634615384615383, "grad_norm": 0.6272960901260376, "learning_rate": 7.197341163908883e-05, "loss": 0.3902, "step": 97200 }, { "epoch": 2.163684116809117, "grad_norm": 0.6789999604225159, "learning_rate": 7.193759692615914e-05, "loss": 0.5537, "step": 97210 }, { "epoch": 2.163906695156695, "grad_norm": 0.5254083871841431, "learning_rate": 7.190178917224525e-05, "loss": 0.4646, "step": 97220 }, { "epoch": 2.1641292735042734, "grad_norm": 0.5182301998138428, "learning_rate": 7.186598837929302e-05, "loss": 0.484, "step": 97230 }, { "epoch": 2.164351851851852, "grad_norm": 0.7297257781028748, "learning_rate": 7.183019454924784e-05, "loss": 0.5208, "step": 97240 }, { "epoch": 2.1645744301994303, "grad_norm": 0.40275275707244873, "learning_rate": 7.179440768405492e-05, "loss": 0.487, "step": 97250 }, { "epoch": 2.1647970085470085, "grad_norm": 0.6008617281913757, "learning_rate": 7.175862778565876e-05, "loss": 0.4279, "step": 97260 }, { "epoch": 2.1650195868945867, "grad_norm": 0.6788070201873779, "learning_rate": 7.172285485600374e-05, "loss": 0.4198, "step": 97270 }, { "epoch": 2.1652421652421654, "grad_norm": 0.4225766062736511, "learning_rate": 7.168708889703388e-05, "loss": 0.5602, "step": 97280 }, { "epoch": 2.1654647435897436, "grad_norm": 0.7581738233566284, "learning_rate": 7.165132991069256e-05, "loss": 0.4849, "step": 97290 }, { "epoch": 2.165687321937322, "grad_norm": 0.4646041691303253, "learning_rate": 7.161557789892308e-05, "loss": 0.5735, "step": 97300 }, { "epoch": 2.1659099002849005, "grad_norm": 0.47280997037887573, "learning_rate": 7.157983286366816e-05, "loss": 0.3521, "step": 97310 }, { "epoch": 2.1661324786324787, "grad_norm": 0.3988167345523834, "learning_rate": 7.154409480687027e-05, "loss": 0.4472, "step": 97320 }, { "epoch": 2.166355056980057, "grad_norm": 0.4632010757923126, "learning_rate": 7.150836373047145e-05, "loss": 0.4917, "step": 97330 }, { "epoch": 2.166577635327635, "grad_norm": 0.5823772549629211, "learning_rate": 7.147263963641337e-05, "loss": 0.6083, "step": 97340 }, { "epoch": 2.166800213675214, "grad_norm": 0.7309155464172363, "learning_rate": 7.14369225266372e-05, "loss": 0.4442, "step": 97350 }, { "epoch": 2.167022792022792, "grad_norm": 0.5929965972900391, "learning_rate": 7.140121240308393e-05, "loss": 0.4356, "step": 97360 }, { "epoch": 2.1672453703703702, "grad_norm": 0.8357145190238953, "learning_rate": 7.136550926769403e-05, "loss": 0.3463, "step": 97370 }, { "epoch": 2.167467948717949, "grad_norm": 0.86272794008255, "learning_rate": 7.132981312240774e-05, "loss": 0.4568, "step": 97380 }, { "epoch": 2.167690527065527, "grad_norm": 0.656522274017334, "learning_rate": 7.129412396916469e-05, "loss": 0.4935, "step": 97390 }, { "epoch": 2.1679131054131053, "grad_norm": 0.3058460056781769, "learning_rate": 7.125844180990427e-05, "loss": 0.4591, "step": 97400 }, { "epoch": 2.168135683760684, "grad_norm": 0.6548540592193604, "learning_rate": 7.122276664656553e-05, "loss": 0.4842, "step": 97410 }, { "epoch": 2.168358262108262, "grad_norm": 0.5646089315414429, "learning_rate": 7.118709848108716e-05, "loss": 0.448, "step": 97420 }, { "epoch": 2.1685808404558404, "grad_norm": 0.5614109039306641, "learning_rate": 7.11514373154072e-05, "loss": 0.486, "step": 97430 }, { "epoch": 2.1688034188034186, "grad_norm": 0.6399925351142883, "learning_rate": 7.111578315146365e-05, "loss": 0.4793, "step": 97440 }, { "epoch": 2.1690259971509973, "grad_norm": 0.6552419662475586, "learning_rate": 7.108013599119394e-05, "loss": 0.4544, "step": 97450 }, { "epoch": 2.1692485754985755, "grad_norm": 0.5547710061073303, "learning_rate": 7.104449583653518e-05, "loss": 0.4983, "step": 97460 }, { "epoch": 2.1694711538461537, "grad_norm": 0.6505656242370605, "learning_rate": 7.100886268942411e-05, "loss": 0.4198, "step": 97470 }, { "epoch": 2.169693732193732, "grad_norm": 0.761031985282898, "learning_rate": 7.097323655179708e-05, "loss": 0.5211, "step": 97480 }, { "epoch": 2.1699163105413106, "grad_norm": 0.5829175710678101, "learning_rate": 7.093761742558993e-05, "loss": 0.5456, "step": 97490 }, { "epoch": 2.170138888888889, "grad_norm": 0.6106030344963074, "learning_rate": 7.090200531273832e-05, "loss": 0.4444, "step": 97500 }, { "epoch": 2.170361467236467, "grad_norm": 0.5242830514907837, "learning_rate": 7.086640021517741e-05, "loss": 0.4114, "step": 97510 }, { "epoch": 2.1705840455840457, "grad_norm": 0.4403768479824066, "learning_rate": 7.08308021348421e-05, "loss": 0.4941, "step": 97520 }, { "epoch": 2.170806623931624, "grad_norm": 0.6011183261871338, "learning_rate": 7.079521107366669e-05, "loss": 0.36, "step": 97530 }, { "epoch": 2.171029202279202, "grad_norm": 0.6127752661705017, "learning_rate": 7.075962703358527e-05, "loss": 0.4552, "step": 97540 }, { "epoch": 2.171251780626781, "grad_norm": 0.5007062554359436, "learning_rate": 7.072405001653153e-05, "loss": 0.4492, "step": 97550 }, { "epoch": 2.171474358974359, "grad_norm": 0.5939716696739197, "learning_rate": 7.06884800244388e-05, "loss": 0.5014, "step": 97560 }, { "epoch": 2.1716969373219372, "grad_norm": 0.6019341349601746, "learning_rate": 7.065291705923984e-05, "loss": 0.4862, "step": 97570 }, { "epoch": 2.1719195156695155, "grad_norm": 0.5671701431274414, "learning_rate": 7.061736112286728e-05, "loss": 0.5061, "step": 97580 }, { "epoch": 2.172142094017094, "grad_norm": 0.670473039150238, "learning_rate": 7.058181221725322e-05, "loss": 0.4381, "step": 97590 }, { "epoch": 2.1723646723646723, "grad_norm": 0.33992794156074524, "learning_rate": 7.054627034432944e-05, "loss": 0.3826, "step": 97600 }, { "epoch": 2.1725872507122506, "grad_norm": 0.4417734444141388, "learning_rate": 7.051073550602731e-05, "loss": 0.4528, "step": 97610 }, { "epoch": 2.1728098290598292, "grad_norm": 0.40786826610565186, "learning_rate": 7.047520770427787e-05, "loss": 0.4451, "step": 97620 }, { "epoch": 2.1730324074074074, "grad_norm": 0.3992997407913208, "learning_rate": 7.043968694101162e-05, "loss": 0.5139, "step": 97630 }, { "epoch": 2.1732549857549857, "grad_norm": 0.45102357864379883, "learning_rate": 7.040417321815884e-05, "loss": 0.5253, "step": 97640 }, { "epoch": 2.173477564102564, "grad_norm": 0.7903671264648438, "learning_rate": 7.036866653764944e-05, "loss": 0.4218, "step": 97650 }, { "epoch": 2.1737001424501425, "grad_norm": 0.5459246039390564, "learning_rate": 7.033316690141278e-05, "loss": 0.4656, "step": 97660 }, { "epoch": 2.1739227207977208, "grad_norm": 0.47902384400367737, "learning_rate": 7.029767431137794e-05, "loss": 0.5076, "step": 97670 }, { "epoch": 2.174145299145299, "grad_norm": 0.7973486185073853, "learning_rate": 7.02621887694737e-05, "loss": 0.4914, "step": 97680 }, { "epoch": 2.1743678774928776, "grad_norm": 0.6195580959320068, "learning_rate": 7.022671027762837e-05, "loss": 0.4633, "step": 97690 }, { "epoch": 2.174590455840456, "grad_norm": 0.8144098520278931, "learning_rate": 7.019123883776979e-05, "loss": 0.4831, "step": 97700 }, { "epoch": 2.174813034188034, "grad_norm": 0.47490057349205017, "learning_rate": 7.015577445182555e-05, "loss": 0.3452, "step": 97710 }, { "epoch": 2.1750356125356127, "grad_norm": 0.6486740112304688, "learning_rate": 7.012031712172283e-05, "loss": 0.4742, "step": 97720 }, { "epoch": 2.175258190883191, "grad_norm": 0.31936442852020264, "learning_rate": 7.008486684938837e-05, "loss": 0.467, "step": 97730 }, { "epoch": 2.175480769230769, "grad_norm": 0.559054970741272, "learning_rate": 7.004942363674864e-05, "loss": 0.5024, "step": 97740 }, { "epoch": 2.1757033475783474, "grad_norm": 0.48571863770484924, "learning_rate": 7.001398748572958e-05, "loss": 0.5012, "step": 97750 }, { "epoch": 2.175925925925926, "grad_norm": 0.38457971811294556, "learning_rate": 6.997855839825695e-05, "loss": 0.4385, "step": 97760 }, { "epoch": 2.1761485042735043, "grad_norm": 0.526547372341156, "learning_rate": 6.99431363762558e-05, "loss": 0.5034, "step": 97770 }, { "epoch": 2.1763710826210825, "grad_norm": 0.4569105803966522, "learning_rate": 6.990772142165118e-05, "loss": 0.4015, "step": 97780 }, { "epoch": 2.176593660968661, "grad_norm": 0.7775120139122009, "learning_rate": 6.987231353636741e-05, "loss": 0.4182, "step": 97790 }, { "epoch": 2.1768162393162394, "grad_norm": 0.7812454104423523, "learning_rate": 6.983691272232861e-05, "loss": 0.3933, "step": 97800 }, { "epoch": 2.1770388176638176, "grad_norm": 0.7553454041481018, "learning_rate": 6.980151898145858e-05, "loss": 0.4667, "step": 97810 }, { "epoch": 2.177261396011396, "grad_norm": 0.45227760076522827, "learning_rate": 6.976613231568057e-05, "loss": 0.4502, "step": 97820 }, { "epoch": 2.1774839743589745, "grad_norm": 0.9420391917228699, "learning_rate": 6.97307527269176e-05, "loss": 0.5162, "step": 97830 }, { "epoch": 2.1777065527065527, "grad_norm": 0.39840957522392273, "learning_rate": 6.969538021709212e-05, "loss": 0.4783, "step": 97840 }, { "epoch": 2.177929131054131, "grad_norm": 0.6985434889793396, "learning_rate": 6.966001478812636e-05, "loss": 0.4801, "step": 97850 }, { "epoch": 2.1781517094017095, "grad_norm": 0.6731042265892029, "learning_rate": 6.962465644194207e-05, "loss": 0.5688, "step": 97860 }, { "epoch": 2.1783742877492878, "grad_norm": 0.608867347240448, "learning_rate": 6.95893051804607e-05, "loss": 0.4455, "step": 97870 }, { "epoch": 2.178596866096866, "grad_norm": 0.4440484046936035, "learning_rate": 6.955396100560325e-05, "loss": 0.4191, "step": 97880 }, { "epoch": 2.1788194444444446, "grad_norm": 0.5435939431190491, "learning_rate": 6.951862391929033e-05, "loss": 0.4844, "step": 97890 }, { "epoch": 2.179042022792023, "grad_norm": 0.7728957533836365, "learning_rate": 6.948329392344228e-05, "loss": 0.5298, "step": 97900 }, { "epoch": 2.179264601139601, "grad_norm": 0.5529747605323792, "learning_rate": 6.944797101997889e-05, "loss": 0.5869, "step": 97910 }, { "epoch": 2.1794871794871793, "grad_norm": 0.5707988739013672, "learning_rate": 6.941265521081954e-05, "loss": 0.4759, "step": 97920 }, { "epoch": 2.179709757834758, "grad_norm": 0.4855974614620209, "learning_rate": 6.937734649788343e-05, "loss": 0.4444, "step": 97930 }, { "epoch": 2.179932336182336, "grad_norm": 0.46950584650039673, "learning_rate": 6.934204488308924e-05, "loss": 0.4483, "step": 97940 }, { "epoch": 2.1801549145299144, "grad_norm": 0.7412520051002502, "learning_rate": 6.930675036835528e-05, "loss": 0.5823, "step": 97950 }, { "epoch": 2.180377492877493, "grad_norm": 0.4875216484069824, "learning_rate": 6.927146295559952e-05, "loss": 0.4769, "step": 97960 }, { "epoch": 2.1806000712250713, "grad_norm": 0.5370801687240601, "learning_rate": 6.923618264673953e-05, "loss": 0.5906, "step": 97970 }, { "epoch": 2.1808226495726495, "grad_norm": 0.42827099561691284, "learning_rate": 6.920090944369235e-05, "loss": 0.5098, "step": 97980 }, { "epoch": 2.1810452279202277, "grad_norm": 0.4902758002281189, "learning_rate": 6.916564334837485e-05, "loss": 0.4485, "step": 97990 }, { "epoch": 2.1812678062678064, "grad_norm": 0.3472527861595154, "learning_rate": 6.913038436270338e-05, "loss": 0.3637, "step": 98000 }, { "epoch": 2.1814903846153846, "grad_norm": 0.45957064628601074, "learning_rate": 6.909513248859396e-05, "loss": 0.4666, "step": 98010 }, { "epoch": 2.181712962962963, "grad_norm": 0.7909951210021973, "learning_rate": 6.905988772796222e-05, "loss": 0.4711, "step": 98020 }, { "epoch": 2.1819355413105415, "grad_norm": 0.5959575772285461, "learning_rate": 6.902465008272337e-05, "loss": 0.4394, "step": 98030 }, { "epoch": 2.1821581196581197, "grad_norm": 0.5832464098930359, "learning_rate": 6.89894195547923e-05, "loss": 0.4472, "step": 98040 }, { "epoch": 2.182380698005698, "grad_norm": 0.5050378441810608, "learning_rate": 6.895419614608346e-05, "loss": 0.4717, "step": 98050 }, { "epoch": 2.1826032763532766, "grad_norm": 0.3996773958206177, "learning_rate": 6.891897985851077e-05, "loss": 0.4428, "step": 98060 }, { "epoch": 2.1828258547008548, "grad_norm": 0.8722583055496216, "learning_rate": 6.888377069398804e-05, "loss": 0.5293, "step": 98070 }, { "epoch": 2.183048433048433, "grad_norm": 0.47751516103744507, "learning_rate": 6.884856865442855e-05, "loss": 0.4147, "step": 98080 }, { "epoch": 2.183271011396011, "grad_norm": 0.5492531061172485, "learning_rate": 6.881337374174521e-05, "loss": 0.4396, "step": 98090 }, { "epoch": 2.18349358974359, "grad_norm": 0.4488508403301239, "learning_rate": 6.877818595785053e-05, "loss": 0.5729, "step": 98100 }, { "epoch": 2.183716168091168, "grad_norm": 0.5802498459815979, "learning_rate": 6.874300530465671e-05, "loss": 0.468, "step": 98110 }, { "epoch": 2.1839387464387463, "grad_norm": 0.6385955214500427, "learning_rate": 6.870783178407538e-05, "loss": 0.5316, "step": 98120 }, { "epoch": 2.184161324786325, "grad_norm": 0.6732540726661682, "learning_rate": 6.867266539801796e-05, "loss": 0.5995, "step": 98130 }, { "epoch": 2.184383903133903, "grad_norm": 0.8565722703933716, "learning_rate": 6.863750614839537e-05, "loss": 0.4553, "step": 98140 }, { "epoch": 2.1846064814814814, "grad_norm": 0.8222180604934692, "learning_rate": 6.860235403711827e-05, "loss": 0.5389, "step": 98150 }, { "epoch": 2.1848290598290596, "grad_norm": 0.47665002942085266, "learning_rate": 6.856720906609681e-05, "loss": 0.5462, "step": 98160 }, { "epoch": 2.1850516381766383, "grad_norm": 0.5633779168128967, "learning_rate": 6.853207123724085e-05, "loss": 0.4064, "step": 98170 }, { "epoch": 2.1852742165242165, "grad_norm": 0.7037586569786072, "learning_rate": 6.849694055245974e-05, "loss": 0.4568, "step": 98180 }, { "epoch": 2.1854967948717947, "grad_norm": 0.6584839224815369, "learning_rate": 6.846181701366257e-05, "loss": 0.4715, "step": 98190 }, { "epoch": 2.1857193732193734, "grad_norm": 0.4015384912490845, "learning_rate": 6.842670062275789e-05, "loss": 0.4596, "step": 98200 }, { "epoch": 2.1859419515669516, "grad_norm": 0.5888437628746033, "learning_rate": 6.8391591381654e-05, "loss": 0.4623, "step": 98210 }, { "epoch": 2.18616452991453, "grad_norm": 0.5876869559288025, "learning_rate": 6.835648929225879e-05, "loss": 0.6175, "step": 98220 }, { "epoch": 2.1863871082621085, "grad_norm": 0.43403884768486023, "learning_rate": 6.832139435647971e-05, "loss": 0.4205, "step": 98230 }, { "epoch": 2.1866096866096867, "grad_norm": 0.6193419694900513, "learning_rate": 6.828630657622386e-05, "loss": 0.4591, "step": 98240 }, { "epoch": 2.186832264957265, "grad_norm": 0.6113179922103882, "learning_rate": 6.8251225953398e-05, "loss": 0.5874, "step": 98250 }, { "epoch": 2.187054843304843, "grad_norm": 0.33731210231781006, "learning_rate": 6.821615248990831e-05, "loss": 0.4017, "step": 98260 }, { "epoch": 2.187277421652422, "grad_norm": 0.556659460067749, "learning_rate": 6.818108618766077e-05, "loss": 0.5342, "step": 98270 }, { "epoch": 2.1875, "grad_norm": 0.5533732771873474, "learning_rate": 6.814602704856092e-05, "loss": 0.4999, "step": 98280 }, { "epoch": 2.187722578347578, "grad_norm": 0.6558281779289246, "learning_rate": 6.811097507451391e-05, "loss": 0.4326, "step": 98290 }, { "epoch": 2.187945156695157, "grad_norm": 0.44423213601112366, "learning_rate": 6.807593026742456e-05, "loss": 0.4674, "step": 98300 }, { "epoch": 2.188167735042735, "grad_norm": 0.5235666036605835, "learning_rate": 6.804089262919706e-05, "loss": 0.5688, "step": 98310 }, { "epoch": 2.1883903133903133, "grad_norm": 0.5956048965454102, "learning_rate": 6.80058621617355e-05, "loss": 0.555, "step": 98320 }, { "epoch": 2.1886128917378915, "grad_norm": 0.46237239241600037, "learning_rate": 6.797083886694353e-05, "loss": 0.5437, "step": 98330 }, { "epoch": 2.18883547008547, "grad_norm": 0.4554048180580139, "learning_rate": 6.793582274672416e-05, "loss": 0.5219, "step": 98340 }, { "epoch": 2.1890580484330484, "grad_norm": 0.6391506195068359, "learning_rate": 6.790081380298032e-05, "loss": 0.4697, "step": 98350 }, { "epoch": 2.1892806267806266, "grad_norm": 0.4395703971385956, "learning_rate": 6.78658120376144e-05, "loss": 0.4557, "step": 98360 }, { "epoch": 2.1895032051282053, "grad_norm": 0.6101352572441101, "learning_rate": 6.783081745252839e-05, "loss": 0.5763, "step": 98370 }, { "epoch": 2.1897257834757835, "grad_norm": 0.37331920862197876, "learning_rate": 6.7795830049624e-05, "loss": 0.4456, "step": 98380 }, { "epoch": 2.1899483618233617, "grad_norm": 0.6069123148918152, "learning_rate": 6.776084983080247e-05, "loss": 0.4583, "step": 98390 }, { "epoch": 2.1901709401709404, "grad_norm": 0.6946260333061218, "learning_rate": 6.772587679796456e-05, "loss": 0.4695, "step": 98400 }, { "epoch": 2.1903935185185186, "grad_norm": 0.44884660840034485, "learning_rate": 6.769091095301079e-05, "loss": 0.4295, "step": 98410 }, { "epoch": 2.190616096866097, "grad_norm": 0.6085039973258972, "learning_rate": 6.765595229784123e-05, "loss": 0.4646, "step": 98420 }, { "epoch": 2.190838675213675, "grad_norm": 0.300814688205719, "learning_rate": 6.762100083435562e-05, "loss": 0.4507, "step": 98430 }, { "epoch": 2.1910612535612537, "grad_norm": 0.4522894024848938, "learning_rate": 6.758605656445315e-05, "loss": 0.4733, "step": 98440 }, { "epoch": 2.191283831908832, "grad_norm": 0.6713006496429443, "learning_rate": 6.755111949003277e-05, "loss": 0.5671, "step": 98450 }, { "epoch": 2.19150641025641, "grad_norm": 0.6300404667854309, "learning_rate": 6.751618961299296e-05, "loss": 0.5089, "step": 98460 }, { "epoch": 2.191728988603989, "grad_norm": 0.613237738609314, "learning_rate": 6.748126693523193e-05, "loss": 0.4503, "step": 98470 }, { "epoch": 2.191951566951567, "grad_norm": 0.6103219985961914, "learning_rate": 6.74463514586473e-05, "loss": 0.4693, "step": 98480 }, { "epoch": 2.1921741452991452, "grad_norm": 0.7985731363296509, "learning_rate": 6.741144318513641e-05, "loss": 0.4554, "step": 98490 }, { "epoch": 2.1923967236467234, "grad_norm": 0.6526599526405334, "learning_rate": 6.737654211659627e-05, "loss": 0.5476, "step": 98500 }, { "epoch": 2.192619301994302, "grad_norm": 0.6740187406539917, "learning_rate": 6.734164825492339e-05, "loss": 0.487, "step": 98510 }, { "epoch": 2.1928418803418803, "grad_norm": 0.703872561454773, "learning_rate": 6.730676160201394e-05, "loss": 0.4868, "step": 98520 }, { "epoch": 2.1930644586894585, "grad_norm": 0.702970027923584, "learning_rate": 6.727188215976376e-05, "loss": 0.4743, "step": 98530 }, { "epoch": 2.193287037037037, "grad_norm": 0.7146590948104858, "learning_rate": 6.72370099300681e-05, "loss": 0.3917, "step": 98540 }, { "epoch": 2.1935096153846154, "grad_norm": 0.4664157032966614, "learning_rate": 6.7202144914822e-05, "loss": 0.4909, "step": 98550 }, { "epoch": 2.1937321937321936, "grad_norm": 0.451846718788147, "learning_rate": 6.716728711592013e-05, "loss": 0.4031, "step": 98560 }, { "epoch": 2.1939547720797723, "grad_norm": 0.9357752799987793, "learning_rate": 6.713243653525653e-05, "loss": 0.4001, "step": 98570 }, { "epoch": 2.1941773504273505, "grad_norm": 0.5053947567939758, "learning_rate": 6.709759317472513e-05, "loss": 0.4501, "step": 98580 }, { "epoch": 2.1943999287749287, "grad_norm": 0.5573664903640747, "learning_rate": 6.706275703621932e-05, "loss": 0.4847, "step": 98590 }, { "epoch": 2.194622507122507, "grad_norm": 0.508842408657074, "learning_rate": 6.70279281216321e-05, "loss": 0.4954, "step": 98600 }, { "epoch": 2.1948450854700856, "grad_norm": 0.5140504240989685, "learning_rate": 6.69931064328562e-05, "loss": 0.4875, "step": 98610 }, { "epoch": 2.195067663817664, "grad_norm": 0.7873440384864807, "learning_rate": 6.69582919717837e-05, "loss": 0.4963, "step": 98620 }, { "epoch": 2.195290242165242, "grad_norm": 0.8808897137641907, "learning_rate": 6.692348474030652e-05, "loss": 0.4906, "step": 98630 }, { "epoch": 2.1955128205128207, "grad_norm": 0.5072993040084839, "learning_rate": 6.688868474031614e-05, "loss": 0.3668, "step": 98640 }, { "epoch": 2.195735398860399, "grad_norm": 0.5444533228874207, "learning_rate": 6.68538919737036e-05, "loss": 0.4285, "step": 98650 }, { "epoch": 2.195957977207977, "grad_norm": 0.4028169810771942, "learning_rate": 6.681910644235956e-05, "loss": 0.5166, "step": 98660 }, { "epoch": 2.1961805555555554, "grad_norm": 0.655707061290741, "learning_rate": 6.678432814817437e-05, "loss": 0.5236, "step": 98670 }, { "epoch": 2.196403133903134, "grad_norm": 0.45885083079338074, "learning_rate": 6.674955709303778e-05, "loss": 0.3951, "step": 98680 }, { "epoch": 2.1966257122507122, "grad_norm": 0.7110733985900879, "learning_rate": 6.671479327883934e-05, "loss": 0.4361, "step": 98690 }, { "epoch": 2.1968482905982905, "grad_norm": 0.5791854858398438, "learning_rate": 6.668003670746823e-05, "loss": 0.4113, "step": 98700 }, { "epoch": 2.197070868945869, "grad_norm": 0.6926530599594116, "learning_rate": 6.664528738081298e-05, "loss": 0.4975, "step": 98710 }, { "epoch": 2.1972934472934473, "grad_norm": 0.48590734601020813, "learning_rate": 6.661054530076198e-05, "loss": 0.5481, "step": 98720 }, { "epoch": 2.1975160256410255, "grad_norm": 0.49652108550071716, "learning_rate": 6.657581046920316e-05, "loss": 0.485, "step": 98730 }, { "epoch": 2.197738603988604, "grad_norm": 0.8256610035896301, "learning_rate": 6.654108288802401e-05, "loss": 0.4494, "step": 98740 }, { "epoch": 2.1979611823361824, "grad_norm": 0.3270527422428131, "learning_rate": 6.650636255911175e-05, "loss": 0.4132, "step": 98750 }, { "epoch": 2.1981837606837606, "grad_norm": 0.5124560594558716, "learning_rate": 6.647164948435296e-05, "loss": 0.3695, "step": 98760 }, { "epoch": 2.198406339031339, "grad_norm": 0.5017485022544861, "learning_rate": 6.643694366563405e-05, "loss": 0.3798, "step": 98770 }, { "epoch": 2.1986289173789175, "grad_norm": 0.7371763586997986, "learning_rate": 6.640224510484097e-05, "loss": 0.4621, "step": 98780 }, { "epoch": 2.1988514957264957, "grad_norm": 0.4819795489311218, "learning_rate": 6.636755380385924e-05, "loss": 0.5866, "step": 98790 }, { "epoch": 2.199074074074074, "grad_norm": 0.3381766080856323, "learning_rate": 6.633286976457404e-05, "loss": 0.4564, "step": 98800 }, { "epoch": 2.1992966524216526, "grad_norm": 0.44364458322525024, "learning_rate": 6.629819298887019e-05, "loss": 0.4222, "step": 98810 }, { "epoch": 2.199519230769231, "grad_norm": 0.5250563621520996, "learning_rate": 6.626352347863191e-05, "loss": 0.4457, "step": 98820 }, { "epoch": 2.199741809116809, "grad_norm": 1.1085268259048462, "learning_rate": 6.622886123574333e-05, "loss": 0.3828, "step": 98830 }, { "epoch": 2.1999643874643873, "grad_norm": 0.5071104764938354, "learning_rate": 6.619420626208788e-05, "loss": 0.4507, "step": 98840 }, { "epoch": 2.200186965811966, "grad_norm": 0.5599742531776428, "learning_rate": 6.615955855954878e-05, "loss": 0.5491, "step": 98850 }, { "epoch": 2.200409544159544, "grad_norm": 0.5447997450828552, "learning_rate": 6.612491813000883e-05, "loss": 0.4793, "step": 98860 }, { "epoch": 2.2006321225071224, "grad_norm": 0.5703315734863281, "learning_rate": 6.609028497535043e-05, "loss": 0.3342, "step": 98870 }, { "epoch": 2.200854700854701, "grad_norm": 0.5179494619369507, "learning_rate": 6.605565909745559e-05, "loss": 0.402, "step": 98880 }, { "epoch": 2.2010772792022792, "grad_norm": 0.6648085713386536, "learning_rate": 6.602104049820594e-05, "loss": 0.5129, "step": 98890 }, { "epoch": 2.2012998575498575, "grad_norm": 0.4397805333137512, "learning_rate": 6.598642917948255e-05, "loss": 0.3512, "step": 98900 }, { "epoch": 2.201522435897436, "grad_norm": 0.6573788523674011, "learning_rate": 6.595182514316631e-05, "loss": 0.5547, "step": 98910 }, { "epoch": 2.2017450142450143, "grad_norm": 0.3666999042034149, "learning_rate": 6.591722839113765e-05, "loss": 0.4774, "step": 98920 }, { "epoch": 2.2019675925925926, "grad_norm": 0.5320939421653748, "learning_rate": 6.588263892527655e-05, "loss": 0.4257, "step": 98930 }, { "epoch": 2.2021901709401708, "grad_norm": 0.6304528117179871, "learning_rate": 6.584805674746264e-05, "loss": 0.464, "step": 98940 }, { "epoch": 2.2024127492877494, "grad_norm": 0.6009019017219543, "learning_rate": 6.581348185957523e-05, "loss": 0.4468, "step": 98950 }, { "epoch": 2.2026353276353277, "grad_norm": 0.4979470372200012, "learning_rate": 6.577891426349306e-05, "loss": 0.3993, "step": 98960 }, { "epoch": 2.202857905982906, "grad_norm": 0.5306423902511597, "learning_rate": 6.574435396109448e-05, "loss": 0.512, "step": 98970 }, { "epoch": 2.2030804843304845, "grad_norm": 0.4831714928150177, "learning_rate": 6.570980095425763e-05, "loss": 0.4364, "step": 98980 }, { "epoch": 2.2033030626780628, "grad_norm": 0.5222032070159912, "learning_rate": 6.567525524486013e-05, "loss": 0.4322, "step": 98990 }, { "epoch": 2.203525641025641, "grad_norm": 0.742304265499115, "learning_rate": 6.564071683477924e-05, "loss": 0.4608, "step": 99000 }, { "epoch": 2.203748219373219, "grad_norm": 0.4952321946620941, "learning_rate": 6.560618572589177e-05, "loss": 0.3382, "step": 99010 }, { "epoch": 2.203970797720798, "grad_norm": 0.4795549213886261, "learning_rate": 6.557166192007418e-05, "loss": 0.5629, "step": 99020 }, { "epoch": 2.204193376068376, "grad_norm": 0.6933590173721313, "learning_rate": 6.553714541920259e-05, "loss": 0.4518, "step": 99030 }, { "epoch": 2.2044159544159543, "grad_norm": 0.4522525370121002, "learning_rate": 6.550263622515256e-05, "loss": 0.4033, "step": 99040 }, { "epoch": 2.204638532763533, "grad_norm": 0.8026001453399658, "learning_rate": 6.546813433979937e-05, "loss": 0.4414, "step": 99050 }, { "epoch": 2.204861111111111, "grad_norm": 0.5256423950195312, "learning_rate": 6.543363976501788e-05, "loss": 0.381, "step": 99060 }, { "epoch": 2.2050836894586894, "grad_norm": 0.5207923054695129, "learning_rate": 6.539915250268258e-05, "loss": 0.3722, "step": 99070 }, { "epoch": 2.205306267806268, "grad_norm": 0.5042715072631836, "learning_rate": 6.536467255466752e-05, "loss": 0.4009, "step": 99080 }, { "epoch": 2.2055288461538463, "grad_norm": 0.698853611946106, "learning_rate": 6.533019992284644e-05, "loss": 0.3718, "step": 99090 }, { "epoch": 2.2057514245014245, "grad_norm": 0.4758215546607971, "learning_rate": 6.529573460909253e-05, "loss": 0.357, "step": 99100 }, { "epoch": 2.2059740028490027, "grad_norm": 0.7511695027351379, "learning_rate": 6.526127661527861e-05, "loss": 0.6324, "step": 99110 }, { "epoch": 2.2061965811965814, "grad_norm": 0.9775429964065552, "learning_rate": 6.522682594327722e-05, "loss": 0.5158, "step": 99120 }, { "epoch": 2.2064191595441596, "grad_norm": 0.5471348166465759, "learning_rate": 6.519238259496046e-05, "loss": 0.5492, "step": 99130 }, { "epoch": 2.206641737891738, "grad_norm": 0.5604672431945801, "learning_rate": 6.515794657219996e-05, "loss": 0.3754, "step": 99140 }, { "epoch": 2.206864316239316, "grad_norm": 0.7262865304946899, "learning_rate": 6.512351787686706e-05, "loss": 0.439, "step": 99150 }, { "epoch": 2.2070868945868947, "grad_norm": 0.599807858467102, "learning_rate": 6.50890965108326e-05, "loss": 0.487, "step": 99160 }, { "epoch": 2.207309472934473, "grad_norm": 0.7784044742584229, "learning_rate": 6.505468247596713e-05, "loss": 0.4654, "step": 99170 }, { "epoch": 2.207532051282051, "grad_norm": 0.5841115117073059, "learning_rate": 6.502027577414062e-05, "loss": 0.4926, "step": 99180 }, { "epoch": 2.2077546296296298, "grad_norm": 0.38955599069595337, "learning_rate": 6.498587640722285e-05, "loss": 0.4819, "step": 99190 }, { "epoch": 2.207977207977208, "grad_norm": 0.7164641618728638, "learning_rate": 6.495148437708308e-05, "loss": 0.4765, "step": 99200 }, { "epoch": 2.208199786324786, "grad_norm": 0.44146817922592163, "learning_rate": 6.491709968559019e-05, "loss": 0.4453, "step": 99210 }, { "epoch": 2.208422364672365, "grad_norm": 0.5845023393630981, "learning_rate": 6.488272233461274e-05, "loss": 0.5473, "step": 99220 }, { "epoch": 2.208644943019943, "grad_norm": 0.40102317929267883, "learning_rate": 6.484835232601873e-05, "loss": 0.4975, "step": 99230 }, { "epoch": 2.2088675213675213, "grad_norm": 0.5128397345542908, "learning_rate": 6.481398966167595e-05, "loss": 0.4143, "step": 99240 }, { "epoch": 2.2090900997150995, "grad_norm": 0.6717884540557861, "learning_rate": 6.477963434345158e-05, "loss": 0.4679, "step": 99250 }, { "epoch": 2.209312678062678, "grad_norm": 0.46634578704833984, "learning_rate": 6.474528637321258e-05, "loss": 0.4547, "step": 99260 }, { "epoch": 2.2095352564102564, "grad_norm": 0.6119767427444458, "learning_rate": 6.471094575282544e-05, "loss": 0.4605, "step": 99270 }, { "epoch": 2.2097578347578346, "grad_norm": 0.538398027420044, "learning_rate": 6.467661248415624e-05, "loss": 0.4892, "step": 99280 }, { "epoch": 2.2099804131054133, "grad_norm": 0.5721553564071655, "learning_rate": 6.464228656907071e-05, "loss": 0.4595, "step": 99290 }, { "epoch": 2.2102029914529915, "grad_norm": 0.43349581956863403, "learning_rate": 6.460796800943423e-05, "loss": 0.423, "step": 99300 }, { "epoch": 2.2104255698005697, "grad_norm": 0.7425587773323059, "learning_rate": 6.457365680711151e-05, "loss": 0.4235, "step": 99310 }, { "epoch": 2.210648148148148, "grad_norm": 0.4731992781162262, "learning_rate": 6.453935296396716e-05, "loss": 0.5229, "step": 99320 }, { "epoch": 2.2108707264957266, "grad_norm": 0.6548227667808533, "learning_rate": 6.450505648186524e-05, "loss": 0.4649, "step": 99330 }, { "epoch": 2.211093304843305, "grad_norm": 0.8874356746673584, "learning_rate": 6.44707673626695e-05, "loss": 0.5212, "step": 99340 }, { "epoch": 2.211315883190883, "grad_norm": 0.5008230805397034, "learning_rate": 6.443648560824326e-05, "loss": 0.4533, "step": 99350 }, { "epoch": 2.2115384615384617, "grad_norm": 0.4697886109352112, "learning_rate": 6.440221122044932e-05, "loss": 0.4913, "step": 99360 }, { "epoch": 2.21176103988604, "grad_norm": 0.49064961075782776, "learning_rate": 6.43679442011502e-05, "loss": 0.4923, "step": 99370 }, { "epoch": 2.211983618233618, "grad_norm": 0.4335954487323761, "learning_rate": 6.433368455220811e-05, "loss": 0.4019, "step": 99380 }, { "epoch": 2.2122061965811968, "grad_norm": 0.6782045364379883, "learning_rate": 6.42994322754846e-05, "loss": 0.4978, "step": 99390 }, { "epoch": 2.212428774928775, "grad_norm": 0.6394982933998108, "learning_rate": 6.426518737284102e-05, "loss": 0.4363, "step": 99400 }, { "epoch": 2.212651353276353, "grad_norm": 0.620266854763031, "learning_rate": 6.42309498461383e-05, "loss": 0.4837, "step": 99410 }, { "epoch": 2.2128739316239314, "grad_norm": 0.43421876430511475, "learning_rate": 6.419671969723686e-05, "loss": 0.5502, "step": 99420 }, { "epoch": 2.21309650997151, "grad_norm": 0.6228753328323364, "learning_rate": 6.416249692799689e-05, "loss": 0.4654, "step": 99430 }, { "epoch": 2.2133190883190883, "grad_norm": 0.5116588473320007, "learning_rate": 6.41282815402781e-05, "loss": 0.4352, "step": 99440 }, { "epoch": 2.2135416666666665, "grad_norm": 0.34148287773132324, "learning_rate": 6.409407353593964e-05, "loss": 0.4115, "step": 99450 }, { "epoch": 2.213764245014245, "grad_norm": 0.524929404258728, "learning_rate": 6.405987291684049e-05, "loss": 0.548, "step": 99460 }, { "epoch": 2.2139868233618234, "grad_norm": 0.5598315000534058, "learning_rate": 6.402567968483913e-05, "loss": 0.4296, "step": 99470 }, { "epoch": 2.2142094017094016, "grad_norm": 0.4203566908836365, "learning_rate": 6.39914938417937e-05, "loss": 0.3476, "step": 99480 }, { "epoch": 2.21443198005698, "grad_norm": 0.6650808453559875, "learning_rate": 6.39573153895618e-05, "loss": 0.5173, "step": 99490 }, { "epoch": 2.2146545584045585, "grad_norm": 0.5748631954193115, "learning_rate": 6.392314433000073e-05, "loss": 0.445, "step": 99500 }, { "epoch": 2.2148771367521367, "grad_norm": 0.6141366958618164, "learning_rate": 6.38889806649674e-05, "loss": 0.4875, "step": 99510 }, { "epoch": 2.215099715099715, "grad_norm": 0.6306594014167786, "learning_rate": 6.385482439631836e-05, "loss": 0.3793, "step": 99520 }, { "epoch": 2.2153222934472936, "grad_norm": 0.9447283148765564, "learning_rate": 6.382067552590954e-05, "loss": 0.5865, "step": 99530 }, { "epoch": 2.215544871794872, "grad_norm": 0.770289957523346, "learning_rate": 6.378653405559669e-05, "loss": 0.4947, "step": 99540 }, { "epoch": 2.21576745014245, "grad_norm": 0.5178115367889404, "learning_rate": 6.37523999872351e-05, "loss": 0.4488, "step": 99550 }, { "epoch": 2.2159900284900287, "grad_norm": 0.9453518986701965, "learning_rate": 6.371827332267964e-05, "loss": 0.5901, "step": 99560 }, { "epoch": 2.216212606837607, "grad_norm": 0.7093714475631714, "learning_rate": 6.368415406378476e-05, "loss": 0.4739, "step": 99570 }, { "epoch": 2.216435185185185, "grad_norm": 0.5546488165855408, "learning_rate": 6.365004221240461e-05, "loss": 0.511, "step": 99580 }, { "epoch": 2.2166577635327633, "grad_norm": 0.3860114514827728, "learning_rate": 6.361593777039272e-05, "loss": 0.461, "step": 99590 }, { "epoch": 2.216880341880342, "grad_norm": 0.4976522922515869, "learning_rate": 6.358184073960241e-05, "loss": 0.4717, "step": 99600 }, { "epoch": 2.21710292022792, "grad_norm": 0.7040444612503052, "learning_rate": 6.354775112188662e-05, "loss": 0.3994, "step": 99610 }, { "epoch": 2.2173254985754984, "grad_norm": 0.4434584379196167, "learning_rate": 6.351366891909768e-05, "loss": 0.4361, "step": 99620 }, { "epoch": 2.217548076923077, "grad_norm": 0.6409199833869934, "learning_rate": 6.347959413308773e-05, "loss": 0.4323, "step": 99630 }, { "epoch": 2.2177706552706553, "grad_norm": 0.7286927103996277, "learning_rate": 6.344552676570836e-05, "loss": 0.4481, "step": 99640 }, { "epoch": 2.2179932336182335, "grad_norm": 0.8713966608047485, "learning_rate": 6.341146681881087e-05, "loss": 0.4698, "step": 99650 }, { "epoch": 2.2182158119658117, "grad_norm": 0.7125070095062256, "learning_rate": 6.337741429424615e-05, "loss": 0.6052, "step": 99660 }, { "epoch": 2.2184383903133904, "grad_norm": 0.6974532604217529, "learning_rate": 6.334336919386452e-05, "loss": 0.3992, "step": 99670 }, { "epoch": 2.2186609686609686, "grad_norm": 0.709204912185669, "learning_rate": 6.330933151951608e-05, "loss": 0.5381, "step": 99680 }, { "epoch": 2.218883547008547, "grad_norm": 0.6567760705947876, "learning_rate": 6.327530127305046e-05, "loss": 0.3979, "step": 99690 }, { "epoch": 2.2191061253561255, "grad_norm": 0.5433497428894043, "learning_rate": 6.324127845631688e-05, "loss": 0.5281, "step": 99700 }, { "epoch": 2.2193287037037037, "grad_norm": 0.6927756071090698, "learning_rate": 6.320726307116422e-05, "loss": 0.5071, "step": 99710 }, { "epoch": 2.219551282051282, "grad_norm": 0.43783435225486755, "learning_rate": 6.317325511944093e-05, "loss": 0.4411, "step": 99720 }, { "epoch": 2.2197738603988606, "grad_norm": 0.6120383739471436, "learning_rate": 6.313925460299488e-05, "loss": 0.3843, "step": 99730 }, { "epoch": 2.219996438746439, "grad_norm": 0.6174899935722351, "learning_rate": 6.310526152367377e-05, "loss": 0.4308, "step": 99740 }, { "epoch": 2.220219017094017, "grad_norm": 0.6677566170692444, "learning_rate": 6.307127588332491e-05, "loss": 0.4542, "step": 99750 }, { "epoch": 2.220263532763533, "eval_loss": 0.5339695811271667, "eval_runtime": 337.4487, "eval_samples_per_second": 7.008, "eval_steps_per_second": 7.008, "step": 99752 }, { "epoch": 2.2204415954415953, "grad_norm": 0.6040011644363403, "learning_rate": 6.303729768379493e-05, "loss": 0.5667, "step": 99760 }, { "epoch": 2.220664173789174, "grad_norm": 0.40889132022857666, "learning_rate": 6.300332692693032e-05, "loss": 0.4262, "step": 99770 }, { "epoch": 2.220886752136752, "grad_norm": 0.6798517107963562, "learning_rate": 6.296936361457709e-05, "loss": 0.4203, "step": 99780 }, { "epoch": 2.2211093304843303, "grad_norm": 0.3974006474018097, "learning_rate": 6.29354077485808e-05, "loss": 0.3396, "step": 99790 }, { "epoch": 2.221331908831909, "grad_norm": 0.5051971077919006, "learning_rate": 6.290145933078673e-05, "loss": 0.4209, "step": 99800 }, { "epoch": 2.2215544871794872, "grad_norm": 0.5820755958557129, "learning_rate": 6.286751836303952e-05, "loss": 0.4572, "step": 99810 }, { "epoch": 2.2217770655270654, "grad_norm": 0.5005538463592529, "learning_rate": 6.283358484718365e-05, "loss": 0.4265, "step": 99820 }, { "epoch": 2.2219996438746437, "grad_norm": 0.4250526428222656, "learning_rate": 6.279965878506305e-05, "loss": 0.4732, "step": 99830 }, { "epoch": 2.2222222222222223, "grad_norm": 0.6704878807067871, "learning_rate": 6.27657401785213e-05, "loss": 0.4835, "step": 99840 }, { "epoch": 2.2224448005698005, "grad_norm": 0.7315400838851929, "learning_rate": 6.27318290294016e-05, "loss": 0.4405, "step": 99850 }, { "epoch": 2.2226673789173788, "grad_norm": 0.6555359363555908, "learning_rate": 6.269792533954673e-05, "loss": 0.3954, "step": 99860 }, { "epoch": 2.2228899572649574, "grad_norm": 0.6594572067260742, "learning_rate": 6.266402911079894e-05, "loss": 0.4352, "step": 99870 }, { "epoch": 2.2231125356125356, "grad_norm": 0.5686958432197571, "learning_rate": 6.26301403450003e-05, "loss": 0.4538, "step": 99880 }, { "epoch": 2.223335113960114, "grad_norm": 0.5742239356040955, "learning_rate": 6.259625904399223e-05, "loss": 0.4345, "step": 99890 }, { "epoch": 2.2235576923076925, "grad_norm": 0.5327342748641968, "learning_rate": 6.25623852096159e-05, "loss": 0.557, "step": 99900 }, { "epoch": 2.2237802706552707, "grad_norm": 0.6104030013084412, "learning_rate": 6.252851884371209e-05, "loss": 0.4583, "step": 99910 }, { "epoch": 2.224002849002849, "grad_norm": 0.6607154011726379, "learning_rate": 6.249465994812111e-05, "loss": 0.6308, "step": 99920 }, { "epoch": 2.224225427350427, "grad_norm": 0.5360001921653748, "learning_rate": 6.246080852468288e-05, "loss": 0.4952, "step": 99930 }, { "epoch": 2.224448005698006, "grad_norm": 0.7452919483184814, "learning_rate": 6.242696457523696e-05, "loss": 0.3986, "step": 99940 }, { "epoch": 2.224670584045584, "grad_norm": 0.3966030776500702, "learning_rate": 6.239312810162234e-05, "loss": 0.4379, "step": 99950 }, { "epoch": 2.2248931623931623, "grad_norm": 0.6134777069091797, "learning_rate": 6.235929910567781e-05, "loss": 0.4995, "step": 99960 }, { "epoch": 2.225115740740741, "grad_norm": 0.5449444055557251, "learning_rate": 6.232547758924163e-05, "loss": 0.4427, "step": 99970 }, { "epoch": 2.225338319088319, "grad_norm": 0.8455405831336975, "learning_rate": 6.22916635541517e-05, "loss": 0.4743, "step": 99980 }, { "epoch": 2.2255608974358974, "grad_norm": 0.809389591217041, "learning_rate": 6.22578570022455e-05, "loss": 0.4629, "step": 99990 }, { "epoch": 2.2257834757834756, "grad_norm": 0.720911979675293, "learning_rate": 6.222405793536021e-05, "loss": 0.5837, "step": 100000 }, { "epoch": 2.2260060541310542, "grad_norm": 0.6490511298179626, "learning_rate": 6.219026635533232e-05, "loss": 0.4231, "step": 100010 }, { "epoch": 2.2262286324786325, "grad_norm": 0.6139844059944153, "learning_rate": 6.215648226399822e-05, "loss": 0.3518, "step": 100020 }, { "epoch": 2.2264512108262107, "grad_norm": 0.4793091118335724, "learning_rate": 6.212270566319368e-05, "loss": 0.4306, "step": 100030 }, { "epoch": 2.2266737891737893, "grad_norm": 0.554528534412384, "learning_rate": 6.208893655475417e-05, "loss": 0.4464, "step": 100040 }, { "epoch": 2.2268963675213675, "grad_norm": 0.4235374331474304, "learning_rate": 6.205517494051474e-05, "loss": 0.4374, "step": 100050 }, { "epoch": 2.2271189458689458, "grad_norm": 0.6234670877456665, "learning_rate": 6.202142082231002e-05, "loss": 0.5593, "step": 100060 }, { "epoch": 2.2273415242165244, "grad_norm": 0.31259241700172424, "learning_rate": 6.198767420197427e-05, "loss": 0.4652, "step": 100070 }, { "epoch": 2.2275641025641026, "grad_norm": 0.3480241000652313, "learning_rate": 6.195393508134136e-05, "loss": 0.4259, "step": 100080 }, { "epoch": 2.227786680911681, "grad_norm": 0.5502161979675293, "learning_rate": 6.192020346224455e-05, "loss": 0.3988, "step": 100090 }, { "epoch": 2.228009259259259, "grad_norm": 0.5438657402992249, "learning_rate": 6.18864793465169e-05, "loss": 0.3868, "step": 100100 }, { "epoch": 2.2282318376068377, "grad_norm": 0.5434377789497375, "learning_rate": 6.185276273599106e-05, "loss": 0.5102, "step": 100110 }, { "epoch": 2.228454415954416, "grad_norm": 0.5145140886306763, "learning_rate": 6.181905363249916e-05, "loss": 0.3758, "step": 100120 }, { "epoch": 2.228676994301994, "grad_norm": 0.4315755069255829, "learning_rate": 6.1785352037873e-05, "loss": 0.4181, "step": 100130 }, { "epoch": 2.228899572649573, "grad_norm": 0.699902355670929, "learning_rate": 6.175165795394407e-05, "loss": 0.4775, "step": 100140 }, { "epoch": 2.229122150997151, "grad_norm": 0.6445436477661133, "learning_rate": 6.171797138254312e-05, "loss": 0.4147, "step": 100150 }, { "epoch": 2.2293447293447293, "grad_norm": 0.6389631032943726, "learning_rate": 6.168429232550088e-05, "loss": 0.3922, "step": 100160 }, { "epoch": 2.2295673076923075, "grad_norm": 0.6590464115142822, "learning_rate": 6.165062078464735e-05, "loss": 0.5344, "step": 100170 }, { "epoch": 2.229789886039886, "grad_norm": 0.5974990129470825, "learning_rate": 6.161695676181236e-05, "loss": 0.4084, "step": 100180 }, { "epoch": 2.2300124643874644, "grad_norm": 0.48121482133865356, "learning_rate": 6.158330025882522e-05, "loss": 0.3951, "step": 100190 }, { "epoch": 2.2302350427350426, "grad_norm": 0.6151335835456848, "learning_rate": 6.154965127751486e-05, "loss": 0.4861, "step": 100200 }, { "epoch": 2.2304576210826212, "grad_norm": 0.6503200531005859, "learning_rate": 6.15160098197098e-05, "loss": 0.4966, "step": 100210 }, { "epoch": 2.2306801994301995, "grad_norm": 0.555203914642334, "learning_rate": 6.148237588723817e-05, "loss": 0.4536, "step": 100220 }, { "epoch": 2.2309027777777777, "grad_norm": 0.7991825938224792, "learning_rate": 6.144874948192758e-05, "loss": 0.5368, "step": 100230 }, { "epoch": 2.2311253561253563, "grad_norm": 0.4899354875087738, "learning_rate": 6.141513060560537e-05, "loss": 0.4644, "step": 100240 }, { "epoch": 2.2313479344729346, "grad_norm": 0.5462999939918518, "learning_rate": 6.138151926009843e-05, "loss": 0.3995, "step": 100250 }, { "epoch": 2.2315705128205128, "grad_norm": 0.4971529245376587, "learning_rate": 6.134791544723319e-05, "loss": 0.4257, "step": 100260 }, { "epoch": 2.231793091168091, "grad_norm": 0.66850745677948, "learning_rate": 6.131431916883579e-05, "loss": 0.4657, "step": 100270 }, { "epoch": 2.2320156695156697, "grad_norm": 0.5258366465568542, "learning_rate": 6.128073042673176e-05, "loss": 0.4896, "step": 100280 }, { "epoch": 2.232238247863248, "grad_norm": 0.7665773630142212, "learning_rate": 6.124714922274648e-05, "loss": 0.496, "step": 100290 }, { "epoch": 2.232460826210826, "grad_norm": 0.4078090786933899, "learning_rate": 6.121357555870462e-05, "loss": 0.4215, "step": 100300 }, { "epoch": 2.2326834045584047, "grad_norm": 0.5825769305229187, "learning_rate": 6.11800094364307e-05, "loss": 0.4048, "step": 100310 }, { "epoch": 2.232905982905983, "grad_norm": 0.3658164441585541, "learning_rate": 6.114645085774868e-05, "loss": 0.4594, "step": 100320 }, { "epoch": 2.233128561253561, "grad_norm": 0.6267021894454956, "learning_rate": 6.111289982448221e-05, "loss": 0.4313, "step": 100330 }, { "epoch": 2.2333511396011394, "grad_norm": 0.7192836999893188, "learning_rate": 6.107935633845445e-05, "loss": 0.5962, "step": 100340 }, { "epoch": 2.233573717948718, "grad_norm": 0.5349550247192383, "learning_rate": 6.104582040148821e-05, "loss": 0.4404, "step": 100350 }, { "epoch": 2.2337962962962963, "grad_norm": 0.5640553832054138, "learning_rate": 6.10122920154059e-05, "loss": 0.5369, "step": 100360 }, { "epoch": 2.2340188746438745, "grad_norm": 0.45412135124206543, "learning_rate": 6.0978771182029346e-05, "loss": 0.5089, "step": 100370 }, { "epoch": 2.234241452991453, "grad_norm": 0.500693142414093, "learning_rate": 6.0945257903180196e-05, "loss": 0.4994, "step": 100380 }, { "epoch": 2.2344640313390314, "grad_norm": 0.3699854910373688, "learning_rate": 6.091175218067955e-05, "loss": 0.3459, "step": 100390 }, { "epoch": 2.2346866096866096, "grad_norm": 0.40656983852386475, "learning_rate": 6.0878254016348214e-05, "loss": 0.512, "step": 100400 }, { "epoch": 2.2349091880341883, "grad_norm": 0.769273579120636, "learning_rate": 6.08447634120064e-05, "loss": 0.4603, "step": 100410 }, { "epoch": 2.2351317663817665, "grad_norm": 0.7436431646347046, "learning_rate": 6.0811280369474054e-05, "loss": 0.4733, "step": 100420 }, { "epoch": 2.2353543447293447, "grad_norm": 0.697196900844574, "learning_rate": 6.077780489057072e-05, "loss": 0.57, "step": 100430 }, { "epoch": 2.235576923076923, "grad_norm": 0.48001793026924133, "learning_rate": 6.0744336977115414e-05, "loss": 0.5416, "step": 100440 }, { "epoch": 2.2357995014245016, "grad_norm": 0.7711690664291382, "learning_rate": 6.0710876630926805e-05, "loss": 0.4299, "step": 100450 }, { "epoch": 2.23602207977208, "grad_norm": 0.48359090089797974, "learning_rate": 6.06774238538232e-05, "loss": 0.4307, "step": 100460 }, { "epoch": 2.236244658119658, "grad_norm": 0.7829921245574951, "learning_rate": 6.0643978647622435e-05, "loss": 0.4167, "step": 100470 }, { "epoch": 2.2364672364672367, "grad_norm": 0.5604079365730286, "learning_rate": 6.0610541014141944e-05, "loss": 0.4173, "step": 100480 }, { "epoch": 2.236689814814815, "grad_norm": 0.6749580502510071, "learning_rate": 6.057711095519878e-05, "loss": 0.5747, "step": 100490 }, { "epoch": 2.236912393162393, "grad_norm": 0.7519851326942444, "learning_rate": 6.0543688472609604e-05, "loss": 0.4357, "step": 100500 }, { "epoch": 2.2371349715099713, "grad_norm": 0.6080546975135803, "learning_rate": 6.05102735681905e-05, "loss": 0.5401, "step": 100510 }, { "epoch": 2.23735754985755, "grad_norm": 0.5083986520767212, "learning_rate": 6.04768662437573e-05, "loss": 0.4284, "step": 100520 }, { "epoch": 2.237580128205128, "grad_norm": 0.6632825136184692, "learning_rate": 6.0443466501125486e-05, "loss": 0.5337, "step": 100530 }, { "epoch": 2.2378027065527064, "grad_norm": 0.6212065815925598, "learning_rate": 6.04100743421099e-05, "loss": 0.4282, "step": 100540 }, { "epoch": 2.238025284900285, "grad_norm": 0.404258131980896, "learning_rate": 6.037668976852513e-05, "loss": 0.394, "step": 100550 }, { "epoch": 2.2382478632478633, "grad_norm": 0.559452474117279, "learning_rate": 6.0343312782185346e-05, "loss": 0.5065, "step": 100560 }, { "epoch": 2.2384704415954415, "grad_norm": 0.44808855652809143, "learning_rate": 6.030994338490432e-05, "loss": 0.4189, "step": 100570 }, { "epoch": 2.23869301994302, "grad_norm": 0.48122096061706543, "learning_rate": 6.027658157849529e-05, "loss": 0.4731, "step": 100580 }, { "epoch": 2.2389155982905984, "grad_norm": 0.5979316234588623, "learning_rate": 6.024322736477117e-05, "loss": 0.4881, "step": 100590 }, { "epoch": 2.2391381766381766, "grad_norm": 0.40405070781707764, "learning_rate": 6.020988074554452e-05, "loss": 0.3934, "step": 100600 }, { "epoch": 2.239360754985755, "grad_norm": 0.6947756409645081, "learning_rate": 6.017654172262737e-05, "loss": 0.4821, "step": 100610 }, { "epoch": 2.2395833333333335, "grad_norm": 0.7525035738945007, "learning_rate": 6.0143210297831385e-05, "loss": 0.4503, "step": 100620 }, { "epoch": 2.2398059116809117, "grad_norm": 0.4716576933860779, "learning_rate": 6.010988647296787e-05, "loss": 0.5023, "step": 100630 }, { "epoch": 2.24002849002849, "grad_norm": 0.7538976669311523, "learning_rate": 6.007657024984772e-05, "loss": 0.4782, "step": 100640 }, { "epoch": 2.2402510683760686, "grad_norm": 0.5350600481033325, "learning_rate": 6.00432616302812e-05, "loss": 0.4736, "step": 100650 }, { "epoch": 2.240473646723647, "grad_norm": 0.894558846950531, "learning_rate": 6.0009960616078484e-05, "loss": 0.3729, "step": 100660 }, { "epoch": 2.240696225071225, "grad_norm": 0.5190034508705139, "learning_rate": 5.997666720904907e-05, "loss": 0.4033, "step": 100670 }, { "epoch": 2.2409188034188032, "grad_norm": 0.5922912955284119, "learning_rate": 5.994338141100215e-05, "loss": 0.3694, "step": 100680 }, { "epoch": 2.241141381766382, "grad_norm": 0.42299923300743103, "learning_rate": 5.9910103223746574e-05, "loss": 0.5054, "step": 100690 }, { "epoch": 2.24136396011396, "grad_norm": 0.7082996368408203, "learning_rate": 5.9876832649090655e-05, "loss": 0.5104, "step": 100700 }, { "epoch": 2.2415865384615383, "grad_norm": 0.6098785996437073, "learning_rate": 5.9843569688842415e-05, "loss": 0.4679, "step": 100710 }, { "epoch": 2.241809116809117, "grad_norm": 0.6080827116966248, "learning_rate": 5.981031434480928e-05, "loss": 0.3405, "step": 100720 }, { "epoch": 2.242031695156695, "grad_norm": 0.55560702085495, "learning_rate": 5.977706661879843e-05, "loss": 0.4762, "step": 100730 }, { "epoch": 2.2422542735042734, "grad_norm": 0.36331528425216675, "learning_rate": 5.974382651261659e-05, "loss": 0.4664, "step": 100740 }, { "epoch": 2.242476851851852, "grad_norm": 0.5189181566238403, "learning_rate": 5.971059402807e-05, "loss": 0.3889, "step": 100750 }, { "epoch": 2.2426994301994303, "grad_norm": 0.5288671851158142, "learning_rate": 5.96773691669646e-05, "loss": 0.5082, "step": 100760 }, { "epoch": 2.2429220085470085, "grad_norm": 0.4157465100288391, "learning_rate": 5.964415193110584e-05, "loss": 0.5014, "step": 100770 }, { "epoch": 2.2431445868945867, "grad_norm": 0.5020760893821716, "learning_rate": 5.9610942322298805e-05, "loss": 0.4667, "step": 100780 }, { "epoch": 2.2433671652421654, "grad_norm": 0.4918922185897827, "learning_rate": 5.9577740342348044e-05, "loss": 0.5327, "step": 100790 }, { "epoch": 2.2435897435897436, "grad_norm": 0.5720158815383911, "learning_rate": 5.954454599305788e-05, "loss": 0.5067, "step": 100800 }, { "epoch": 2.243812321937322, "grad_norm": 0.6430893540382385, "learning_rate": 5.9511359276232015e-05, "loss": 0.5036, "step": 100810 }, { "epoch": 2.2440349002849005, "grad_norm": 0.8744356036186218, "learning_rate": 5.94781801936739e-05, "loss": 0.542, "step": 100820 }, { "epoch": 2.2442574786324787, "grad_norm": 0.4596138894557953, "learning_rate": 5.9445008747186505e-05, "loss": 0.4761, "step": 100830 }, { "epoch": 2.244480056980057, "grad_norm": 0.5335553884506226, "learning_rate": 5.9411844938572394e-05, "loss": 0.518, "step": 100840 }, { "epoch": 2.244702635327635, "grad_norm": 0.6477447748184204, "learning_rate": 5.9378688769633796e-05, "loss": 0.5032, "step": 100850 }, { "epoch": 2.244925213675214, "grad_norm": 0.8028891682624817, "learning_rate": 5.9345540242172295e-05, "loss": 0.5746, "step": 100860 }, { "epoch": 2.245147792022792, "grad_norm": 0.5045342445373535, "learning_rate": 5.931239935798927e-05, "loss": 0.3358, "step": 100870 }, { "epoch": 2.2453703703703702, "grad_norm": 0.6941115260124207, "learning_rate": 5.9279266118885655e-05, "loss": 0.3895, "step": 100880 }, { "epoch": 2.245592948717949, "grad_norm": 0.38937705755233765, "learning_rate": 5.924614052666191e-05, "loss": 0.411, "step": 100890 }, { "epoch": 2.245815527065527, "grad_norm": 0.641523540019989, "learning_rate": 5.921302258311812e-05, "loss": 0.5199, "step": 100900 }, { "epoch": 2.2460381054131053, "grad_norm": 0.3802975118160248, "learning_rate": 5.917991229005393e-05, "loss": 0.3925, "step": 100910 }, { "epoch": 2.246260683760684, "grad_norm": 0.4550374150276184, "learning_rate": 5.914680964926866e-05, "loss": 0.4571, "step": 100920 }, { "epoch": 2.246483262108262, "grad_norm": 0.8346225619316101, "learning_rate": 5.911371466256106e-05, "loss": 0.4954, "step": 100930 }, { "epoch": 2.2467058404558404, "grad_norm": 0.4897273778915405, "learning_rate": 5.9080627331729455e-05, "loss": 0.4012, "step": 100940 }, { "epoch": 2.2469284188034186, "grad_norm": 0.7520624995231628, "learning_rate": 5.904754765857195e-05, "loss": 0.5254, "step": 100950 }, { "epoch": 2.2471509971509973, "grad_norm": 0.49854952096939087, "learning_rate": 5.901447564488609e-05, "loss": 0.5773, "step": 100960 }, { "epoch": 2.2473735754985755, "grad_norm": 0.7178290486335754, "learning_rate": 5.898141129246903e-05, "loss": 0.4814, "step": 100970 }, { "epoch": 2.2475961538461537, "grad_norm": 0.5157769918441772, "learning_rate": 5.894835460311752e-05, "loss": 0.5249, "step": 100980 }, { "epoch": 2.247818732193732, "grad_norm": 0.5982409119606018, "learning_rate": 5.891530557862797e-05, "loss": 0.5009, "step": 100990 }, { "epoch": 2.2480413105413106, "grad_norm": 0.700705349445343, "learning_rate": 5.8882264220796145e-05, "loss": 0.583, "step": 101000 }, { "epoch": 2.248263888888889, "grad_norm": 0.7363760471343994, "learning_rate": 5.8849230531417596e-05, "loss": 0.3753, "step": 101010 }, { "epoch": 2.248486467236467, "grad_norm": 0.7411749362945557, "learning_rate": 5.881620451228742e-05, "loss": 0.3801, "step": 101020 }, { "epoch": 2.2487090455840457, "grad_norm": 0.5226131081581116, "learning_rate": 5.878318616520026e-05, "loss": 0.4723, "step": 101030 }, { "epoch": 2.248931623931624, "grad_norm": 0.5838454961776733, "learning_rate": 5.875017549195039e-05, "loss": 0.6124, "step": 101040 }, { "epoch": 2.249154202279202, "grad_norm": 0.3981051743030548, "learning_rate": 5.8717172494331665e-05, "loss": 0.4393, "step": 101050 }, { "epoch": 2.249376780626781, "grad_norm": 0.6105630993843079, "learning_rate": 5.868417717413737e-05, "loss": 0.4758, "step": 101060 }, { "epoch": 2.249599358974359, "grad_norm": 0.5280075669288635, "learning_rate": 5.865118953316064e-05, "loss": 0.378, "step": 101070 }, { "epoch": 2.2498219373219372, "grad_norm": 0.7146009206771851, "learning_rate": 5.861820957319395e-05, "loss": 0.5024, "step": 101080 }, { "epoch": 2.250044515669516, "grad_norm": 0.48275527358055115, "learning_rate": 5.8585237296029474e-05, "loss": 0.4036, "step": 101090 }, { "epoch": 2.250267094017094, "grad_norm": 0.37731584906578064, "learning_rate": 5.8552272703458974e-05, "loss": 0.4652, "step": 101100 }, { "epoch": 2.2504896723646723, "grad_norm": 0.594689667224884, "learning_rate": 5.851931579727377e-05, "loss": 0.5159, "step": 101110 }, { "epoch": 2.2507122507122506, "grad_norm": 0.4745054543018341, "learning_rate": 5.848636657926476e-05, "loss": 0.4407, "step": 101120 }, { "epoch": 2.2509348290598292, "grad_norm": 0.39887046813964844, "learning_rate": 5.845342505122249e-05, "loss": 0.399, "step": 101130 }, { "epoch": 2.2511574074074074, "grad_norm": 0.349498450756073, "learning_rate": 5.842049121493694e-05, "loss": 0.4507, "step": 101140 }, { "epoch": 2.2513799857549857, "grad_norm": 0.7075125575065613, "learning_rate": 5.838756507219778e-05, "loss": 0.4531, "step": 101150 }, { "epoch": 2.251602564102564, "grad_norm": 0.6591486930847168, "learning_rate": 5.835464662479428e-05, "loss": 0.3924, "step": 101160 }, { "epoch": 2.2518251424501425, "grad_norm": 0.5846258401870728, "learning_rate": 5.832173587451524e-05, "loss": 0.4983, "step": 101170 }, { "epoch": 2.2520477207977208, "grad_norm": 0.7267735600471497, "learning_rate": 5.8288832823149055e-05, "loss": 0.4958, "step": 101180 }, { "epoch": 2.252270299145299, "grad_norm": 0.7835147976875305, "learning_rate": 5.825593747248375e-05, "loss": 0.5087, "step": 101190 }, { "epoch": 2.2524928774928776, "grad_norm": 0.6003299951553345, "learning_rate": 5.82230498243068e-05, "loss": 0.4145, "step": 101200 }, { "epoch": 2.252715455840456, "grad_norm": 0.623741626739502, "learning_rate": 5.819016988040542e-05, "loss": 0.4989, "step": 101210 }, { "epoch": 2.252938034188034, "grad_norm": 0.5692561864852905, "learning_rate": 5.815729764256625e-05, "loss": 0.365, "step": 101220 }, { "epoch": 2.2531606125356127, "grad_norm": 0.5674599409103394, "learning_rate": 5.812443311257565e-05, "loss": 0.4341, "step": 101230 }, { "epoch": 2.253383190883191, "grad_norm": 0.36465272307395935, "learning_rate": 5.809157629221951e-05, "loss": 0.3755, "step": 101240 }, { "epoch": 2.253605769230769, "grad_norm": 0.4215715825557709, "learning_rate": 5.805872718328327e-05, "loss": 0.4074, "step": 101250 }, { "epoch": 2.253828347578348, "grad_norm": 0.7996568083763123, "learning_rate": 5.802588578755199e-05, "loss": 0.4353, "step": 101260 }, { "epoch": 2.254050925925926, "grad_norm": 0.6421694755554199, "learning_rate": 5.799305210681036e-05, "loss": 0.4425, "step": 101270 }, { "epoch": 2.2542735042735043, "grad_norm": 0.4854976534843445, "learning_rate": 5.796022614284249e-05, "loss": 0.481, "step": 101280 }, { "epoch": 2.2544960826210825, "grad_norm": 0.6315191984176636, "learning_rate": 5.79274078974322e-05, "loss": 0.5784, "step": 101290 }, { "epoch": 2.254718660968661, "grad_norm": 0.5861208438873291, "learning_rate": 5.789459737236287e-05, "loss": 0.4458, "step": 101300 }, { "epoch": 2.2549412393162394, "grad_norm": 0.6290010213851929, "learning_rate": 5.786179456941745e-05, "loss": 0.4419, "step": 101310 }, { "epoch": 2.2551638176638176, "grad_norm": 0.6168012022972107, "learning_rate": 5.7828999490378546e-05, "loss": 0.4462, "step": 101320 }, { "epoch": 2.255386396011396, "grad_norm": 0.5474677085876465, "learning_rate": 5.7796212137028125e-05, "loss": 0.4423, "step": 101330 }, { "epoch": 2.2556089743589745, "grad_norm": 0.36434105038642883, "learning_rate": 5.776343251114795e-05, "loss": 0.4075, "step": 101340 }, { "epoch": 2.2558315527065527, "grad_norm": 0.8574975728988647, "learning_rate": 5.773066061451935e-05, "loss": 0.469, "step": 101350 }, { "epoch": 2.256054131054131, "grad_norm": 0.5685760974884033, "learning_rate": 5.769789644892305e-05, "loss": 0.4623, "step": 101360 }, { "epoch": 2.2562767094017095, "grad_norm": 0.840263843536377, "learning_rate": 5.766514001613956e-05, "loss": 0.4403, "step": 101370 }, { "epoch": 2.2564992877492878, "grad_norm": 0.47229859232902527, "learning_rate": 5.763239131794891e-05, "loss": 0.4218, "step": 101380 }, { "epoch": 2.256721866096866, "grad_norm": 0.5112053751945496, "learning_rate": 5.759965035613062e-05, "loss": 0.4215, "step": 101390 }, { "epoch": 2.2569444444444446, "grad_norm": 0.2744973599910736, "learning_rate": 5.756691713246394e-05, "loss": 0.3427, "step": 101400 }, { "epoch": 2.257167022792023, "grad_norm": 0.46965116262435913, "learning_rate": 5.753419164872762e-05, "loss": 0.3961, "step": 101410 }, { "epoch": 2.257389601139601, "grad_norm": 0.5640278458595276, "learning_rate": 5.750147390669989e-05, "loss": 0.539, "step": 101420 }, { "epoch": 2.2576121794871793, "grad_norm": 0.5815955996513367, "learning_rate": 5.746876390815874e-05, "loss": 0.4581, "step": 101430 }, { "epoch": 2.257834757834758, "grad_norm": 0.47417697310447693, "learning_rate": 5.743606165488162e-05, "loss": 0.4741, "step": 101440 }, { "epoch": 2.258057336182336, "grad_norm": 0.5229243040084839, "learning_rate": 5.740336714864571e-05, "loss": 0.4679, "step": 101450 }, { "epoch": 2.2582799145299144, "grad_norm": 0.5353955030441284, "learning_rate": 5.7370680391227484e-05, "loss": 0.4983, "step": 101460 }, { "epoch": 2.258502492877493, "grad_norm": 0.562786340713501, "learning_rate": 5.733800138440324e-05, "loss": 0.5244, "step": 101470 }, { "epoch": 2.2587250712250713, "grad_norm": 0.9685201644897461, "learning_rate": 5.730533012994881e-05, "loss": 0.4722, "step": 101480 }, { "epoch": 2.2589476495726495, "grad_norm": 0.5229083299636841, "learning_rate": 5.7272666629639615e-05, "loss": 0.5052, "step": 101490 }, { "epoch": 2.2591702279202277, "grad_norm": 0.9138157367706299, "learning_rate": 5.7240010885250486e-05, "loss": 0.5017, "step": 101500 }, { "epoch": 2.2593928062678064, "grad_norm": 0.6066059470176697, "learning_rate": 5.7207362898556017e-05, "loss": 0.4791, "step": 101510 }, { "epoch": 2.2596153846153846, "grad_norm": 0.5644805431365967, "learning_rate": 5.717472267133037e-05, "loss": 0.452, "step": 101520 }, { "epoch": 2.259837962962963, "grad_norm": 0.555303692817688, "learning_rate": 5.7142090205347196e-05, "loss": 0.4632, "step": 101530 }, { "epoch": 2.2600605413105415, "grad_norm": 0.5508922338485718, "learning_rate": 5.710946550237981e-05, "loss": 0.513, "step": 101540 }, { "epoch": 2.2602831196581197, "grad_norm": 0.47135332226753235, "learning_rate": 5.7076848564201104e-05, "loss": 0.4393, "step": 101550 }, { "epoch": 2.260505698005698, "grad_norm": 0.5292717814445496, "learning_rate": 5.7044239392583364e-05, "loss": 0.5473, "step": 101560 }, { "epoch": 2.2607282763532766, "grad_norm": 0.5483565926551819, "learning_rate": 5.701163798929871e-05, "loss": 0.479, "step": 101570 }, { "epoch": 2.2609508547008548, "grad_norm": 0.4322088360786438, "learning_rate": 5.697904435611876e-05, "loss": 0.456, "step": 101580 }, { "epoch": 2.261173433048433, "grad_norm": 0.7418818473815918, "learning_rate": 5.694645849481455e-05, "loss": 0.4398, "step": 101590 }, { "epoch": 2.261396011396011, "grad_norm": 0.3156370520591736, "learning_rate": 5.69138804071569e-05, "loss": 0.3302, "step": 101600 }, { "epoch": 2.26161858974359, "grad_norm": 0.5656064748764038, "learning_rate": 5.688131009491613e-05, "loss": 0.511, "step": 101610 }, { "epoch": 2.261841168091168, "grad_norm": 0.5305771827697754, "learning_rate": 5.684874755986211e-05, "loss": 0.4182, "step": 101620 }, { "epoch": 2.2620637464387463, "grad_norm": 0.637701690196991, "learning_rate": 5.681619280376442e-05, "loss": 0.3758, "step": 101630 }, { "epoch": 2.262286324786325, "grad_norm": 0.4770202338695526, "learning_rate": 5.6783645828391974e-05, "loss": 0.5329, "step": 101640 }, { "epoch": 2.262508903133903, "grad_norm": 0.6516206860542297, "learning_rate": 5.675110663551344e-05, "loss": 0.5447, "step": 101650 }, { "epoch": 2.2627314814814814, "grad_norm": 0.6704887747764587, "learning_rate": 5.6718575226897054e-05, "loss": 0.5387, "step": 101660 }, { "epoch": 2.2629540598290596, "grad_norm": 0.5100089907646179, "learning_rate": 5.668605160431057e-05, "loss": 0.4309, "step": 101670 }, { "epoch": 2.2631766381766383, "grad_norm": 0.485149085521698, "learning_rate": 5.6653535769521374e-05, "loss": 0.472, "step": 101680 }, { "epoch": 2.2633992165242165, "grad_norm": 0.6081596612930298, "learning_rate": 5.662102772429645e-05, "loss": 0.4186, "step": 101690 }, { "epoch": 2.2636217948717947, "grad_norm": 0.42370980978012085, "learning_rate": 5.6588527470402196e-05, "loss": 0.5188, "step": 101700 }, { "epoch": 2.2638443732193734, "grad_norm": 0.6564342379570007, "learning_rate": 5.65560350096048e-05, "loss": 0.525, "step": 101710 }, { "epoch": 2.2640669515669516, "grad_norm": 0.7005487680435181, "learning_rate": 5.6523550343669875e-05, "loss": 0.4555, "step": 101720 }, { "epoch": 2.26428952991453, "grad_norm": 0.6535967588424683, "learning_rate": 5.6491073474362647e-05, "loss": 0.4316, "step": 101730 }, { "epoch": 2.2645121082621085, "grad_norm": 0.4445608854293823, "learning_rate": 5.6458604403447965e-05, "loss": 0.5378, "step": 101740 }, { "epoch": 2.2647346866096867, "grad_norm": 0.389424592256546, "learning_rate": 5.6426143132690235e-05, "loss": 0.434, "step": 101750 }, { "epoch": 2.264957264957265, "grad_norm": 0.500011146068573, "learning_rate": 5.639368966385343e-05, "loss": 0.3236, "step": 101760 }, { "epoch": 2.265179843304843, "grad_norm": 0.49770283699035645, "learning_rate": 5.636124399870115e-05, "loss": 0.4463, "step": 101770 }, { "epoch": 2.265402421652422, "grad_norm": 0.6304476261138916, "learning_rate": 5.6328806138996404e-05, "loss": 0.5451, "step": 101780 }, { "epoch": 2.265625, "grad_norm": 0.44962552189826965, "learning_rate": 5.629637608650193e-05, "loss": 0.4311, "step": 101790 }, { "epoch": 2.265847578347578, "grad_norm": 0.5515990257263184, "learning_rate": 5.6263953842980044e-05, "loss": 0.4567, "step": 101800 }, { "epoch": 2.266070156695157, "grad_norm": 0.3929078280925751, "learning_rate": 5.623153941019255e-05, "loss": 0.3712, "step": 101810 }, { "epoch": 2.266292735042735, "grad_norm": 0.4479203522205353, "learning_rate": 5.6199132789900896e-05, "loss": 0.4379, "step": 101820 }, { "epoch": 2.2665153133903133, "grad_norm": 0.6935744881629944, "learning_rate": 5.616673398386618e-05, "loss": 0.4509, "step": 101830 }, { "epoch": 2.2667378917378915, "grad_norm": 0.7883388996124268, "learning_rate": 5.613434299384885e-05, "loss": 0.4422, "step": 101840 }, { "epoch": 2.26696047008547, "grad_norm": 0.6511839628219604, "learning_rate": 5.6101959821609064e-05, "loss": 0.4903, "step": 101850 }, { "epoch": 2.2671830484330484, "grad_norm": 0.720378041267395, "learning_rate": 5.6069584468906576e-05, "loss": 0.5358, "step": 101860 }, { "epoch": 2.2674056267806266, "grad_norm": 0.3852686285972595, "learning_rate": 5.6037216937500684e-05, "loss": 0.42, "step": 101870 }, { "epoch": 2.2676282051282053, "grad_norm": 0.5996120572090149, "learning_rate": 5.6004857229150275e-05, "loss": 0.4837, "step": 101880 }, { "epoch": 2.2678507834757835, "grad_norm": 0.5900279879570007, "learning_rate": 5.5972505345613824e-05, "loss": 0.4205, "step": 101890 }, { "epoch": 2.2680733618233617, "grad_norm": 0.5508142709732056, "learning_rate": 5.5940161288649384e-05, "loss": 0.4303, "step": 101900 }, { "epoch": 2.2682959401709404, "grad_norm": 0.3930082321166992, "learning_rate": 5.590782506001444e-05, "loss": 0.381, "step": 101910 }, { "epoch": 2.2685185185185186, "grad_norm": 0.3945516347885132, "learning_rate": 5.5875496661466256e-05, "loss": 0.3843, "step": 101920 }, { "epoch": 2.268741096866097, "grad_norm": 0.6308930516242981, "learning_rate": 5.5843176094761576e-05, "loss": 0.4208, "step": 101930 }, { "epoch": 2.268963675213675, "grad_norm": 0.5742242336273193, "learning_rate": 5.5810863361656705e-05, "loss": 0.4758, "step": 101940 }, { "epoch": 2.2691862535612537, "grad_norm": 0.47501349449157715, "learning_rate": 5.577855846390756e-05, "loss": 0.4667, "step": 101950 }, { "epoch": 2.269408831908832, "grad_norm": 0.7356695532798767, "learning_rate": 5.574626140326962e-05, "loss": 0.5717, "step": 101960 }, { "epoch": 2.26963141025641, "grad_norm": 0.5006420016288757, "learning_rate": 5.5713972181497984e-05, "loss": 0.4647, "step": 101970 }, { "epoch": 2.269853988603989, "grad_norm": 0.7696124911308289, "learning_rate": 5.568169080034722e-05, "loss": 0.3968, "step": 101980 }, { "epoch": 2.270076566951567, "grad_norm": 0.5565013885498047, "learning_rate": 5.564941726157147e-05, "loss": 0.4999, "step": 101990 }, { "epoch": 2.2702991452991452, "grad_norm": 0.5363160967826843, "learning_rate": 5.561715156692455e-05, "loss": 0.4392, "step": 102000 }, { "epoch": 2.2705217236467234, "grad_norm": 0.6066251993179321, "learning_rate": 5.55848937181598e-05, "loss": 0.4533, "step": 102010 }, { "epoch": 2.270744301994302, "grad_norm": 0.7414187788963318, "learning_rate": 5.555264371703017e-05, "loss": 0.5076, "step": 102020 }, { "epoch": 2.2709668803418803, "grad_norm": 0.7134029865264893, "learning_rate": 5.5520401565288106e-05, "loss": 0.5363, "step": 102030 }, { "epoch": 2.2711894586894585, "grad_norm": 0.4264451265335083, "learning_rate": 5.548816726468577e-05, "loss": 0.4643, "step": 102040 }, { "epoch": 2.271412037037037, "grad_norm": 0.49752625823020935, "learning_rate": 5.545594081697467e-05, "loss": 0.4397, "step": 102050 }, { "epoch": 2.2716346153846154, "grad_norm": 0.46924108266830444, "learning_rate": 5.542372222390608e-05, "loss": 0.4594, "step": 102060 }, { "epoch": 2.2718571937321936, "grad_norm": 0.6802897453308105, "learning_rate": 5.539151148723076e-05, "loss": 0.5427, "step": 102070 }, { "epoch": 2.2720797720797723, "grad_norm": 0.6026861071586609, "learning_rate": 5.535930860869911e-05, "loss": 0.3957, "step": 102080 }, { "epoch": 2.2723023504273505, "grad_norm": 0.5114626288414001, "learning_rate": 5.5327113590061044e-05, "loss": 0.4284, "step": 102090 }, { "epoch": 2.2725249287749287, "grad_norm": 0.5237139463424683, "learning_rate": 5.529492643306604e-05, "loss": 0.525, "step": 102100 }, { "epoch": 2.272747507122507, "grad_norm": 0.4586896598339081, "learning_rate": 5.526274713946325e-05, "loss": 0.4378, "step": 102110 }, { "epoch": 2.2729700854700856, "grad_norm": 0.4087303578853607, "learning_rate": 5.5230575711001276e-05, "loss": 0.5392, "step": 102120 }, { "epoch": 2.273192663817664, "grad_norm": 0.6352576017379761, "learning_rate": 5.519841214942829e-05, "loss": 0.4601, "step": 102130 }, { "epoch": 2.273415242165242, "grad_norm": 0.640564501285553, "learning_rate": 5.51662564564921e-05, "loss": 0.4784, "step": 102140 }, { "epoch": 2.2736378205128207, "grad_norm": 0.572647750377655, "learning_rate": 5.5134108633940105e-05, "loss": 0.5141, "step": 102150 }, { "epoch": 2.273860398860399, "grad_norm": 0.5198943614959717, "learning_rate": 5.510196868351927e-05, "loss": 0.5727, "step": 102160 }, { "epoch": 2.274082977207977, "grad_norm": 0.3667494058609009, "learning_rate": 5.506983660697604e-05, "loss": 0.4296, "step": 102170 }, { "epoch": 2.2743055555555554, "grad_norm": 0.43145737051963806, "learning_rate": 5.503771240605659e-05, "loss": 0.563, "step": 102180 }, { "epoch": 2.274528133903134, "grad_norm": 0.5121556520462036, "learning_rate": 5.500559608250648e-05, "loss": 0.389, "step": 102190 }, { "epoch": 2.2747507122507122, "grad_norm": 0.3813532590866089, "learning_rate": 5.497348763807097e-05, "loss": 0.47, "step": 102200 }, { "epoch": 2.2749732905982905, "grad_norm": 0.36606770753860474, "learning_rate": 5.4941387074494874e-05, "loss": 0.4546, "step": 102210 }, { "epoch": 2.275195868945869, "grad_norm": 0.5075564980506897, "learning_rate": 5.490929439352257e-05, "loss": 0.4803, "step": 102220 }, { "epoch": 2.2754184472934473, "grad_norm": 0.4560779929161072, "learning_rate": 5.4877209596897946e-05, "loss": 0.4122, "step": 102230 }, { "epoch": 2.2756410256410255, "grad_norm": 0.5539279580116272, "learning_rate": 5.484513268636464e-05, "loss": 0.4025, "step": 102240 }, { "epoch": 2.275863603988604, "grad_norm": 0.47847601771354675, "learning_rate": 5.4813063663665585e-05, "loss": 0.4715, "step": 102250 }, { "epoch": 2.2760861823361824, "grad_norm": 0.6673201322555542, "learning_rate": 5.478100253054357e-05, "loss": 0.3853, "step": 102260 }, { "epoch": 2.2763087606837606, "grad_norm": 0.7573196291923523, "learning_rate": 5.4748949288740705e-05, "loss": 0.4458, "step": 102270 }, { "epoch": 2.276531339031339, "grad_norm": 0.5507382750511169, "learning_rate": 5.4716903939998844e-05, "loss": 0.4435, "step": 102280 }, { "epoch": 2.2767539173789175, "grad_norm": 0.635073184967041, "learning_rate": 5.468486648605935e-05, "loss": 0.4689, "step": 102290 }, { "epoch": 2.2769764957264957, "grad_norm": 0.5143459439277649, "learning_rate": 5.4652836928663164e-05, "loss": 0.3888, "step": 102300 }, { "epoch": 2.277199074074074, "grad_norm": 0.8898342847824097, "learning_rate": 5.4620815269550805e-05, "loss": 0.5532, "step": 102310 }, { "epoch": 2.277421652421652, "grad_norm": 0.6367766857147217, "learning_rate": 5.458880151046244e-05, "loss": 0.4739, "step": 102320 }, { "epoch": 2.277644230769231, "grad_norm": 0.5471299886703491, "learning_rate": 5.4556795653137563e-05, "loss": 0.4544, "step": 102330 }, { "epoch": 2.277866809116809, "grad_norm": 0.483954519033432, "learning_rate": 5.4524797699315485e-05, "loss": 0.4725, "step": 102340 }, { "epoch": 2.2780893874643873, "grad_norm": 0.5129729509353638, "learning_rate": 5.4492807650734986e-05, "loss": 0.3671, "step": 102350 }, { "epoch": 2.278311965811966, "grad_norm": 0.4544709622859955, "learning_rate": 5.446082550913443e-05, "loss": 0.3694, "step": 102360 }, { "epoch": 2.278534544159544, "grad_norm": 0.6704840660095215, "learning_rate": 5.4428851276251836e-05, "loss": 0.4251, "step": 102370 }, { "epoch": 2.2787571225071224, "grad_norm": 0.8961855173110962, "learning_rate": 5.4396884953824554e-05, "loss": 0.3873, "step": 102380 }, { "epoch": 2.278979700854701, "grad_norm": 0.5246909260749817, "learning_rate": 5.436492654358975e-05, "loss": 0.4699, "step": 102390 }, { "epoch": 2.2792022792022792, "grad_norm": 0.5263209342956543, "learning_rate": 5.4332976047284114e-05, "loss": 0.4693, "step": 102400 }, { "epoch": 2.2794248575498575, "grad_norm": 0.6012300848960876, "learning_rate": 5.430103346664377e-05, "loss": 0.4863, "step": 102410 }, { "epoch": 2.279647435897436, "grad_norm": 0.7874319553375244, "learning_rate": 5.4269098803404516e-05, "loss": 0.4381, "step": 102420 }, { "epoch": 2.2798700142450143, "grad_norm": 0.42439478635787964, "learning_rate": 5.4237172059301745e-05, "loss": 0.6481, "step": 102430 }, { "epoch": 2.2800925925925926, "grad_norm": 0.3294517695903778, "learning_rate": 5.420525323607037e-05, "loss": 0.3813, "step": 102440 }, { "epoch": 2.2802706552706553, "eval_loss": 0.5308219790458679, "eval_runtime": 337.354, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 102448 }, { "epoch": 2.2803151709401708, "grad_norm": 0.35001301765441895, "learning_rate": 5.417334233544489e-05, "loss": 0.5311, "step": 102450 }, { "epoch": 2.2805377492877494, "grad_norm": 0.4265924394130707, "learning_rate": 5.414143935915943e-05, "loss": 0.3942, "step": 102460 }, { "epoch": 2.2807603276353277, "grad_norm": 0.4556063413619995, "learning_rate": 5.410954430894748e-05, "loss": 0.4001, "step": 102470 }, { "epoch": 2.280982905982906, "grad_norm": 0.5078181028366089, "learning_rate": 5.407765718654234e-05, "loss": 0.4825, "step": 102480 }, { "epoch": 2.281205484330484, "grad_norm": 0.5195839405059814, "learning_rate": 5.404577799367676e-05, "loss": 0.3958, "step": 102490 }, { "epoch": 2.2814280626780628, "grad_norm": 0.714531660079956, "learning_rate": 5.4013906732083154e-05, "loss": 0.5397, "step": 102500 }, { "epoch": 2.281650641025641, "grad_norm": 0.40454044938087463, "learning_rate": 5.398204340349331e-05, "loss": 0.5091, "step": 102510 }, { "epoch": 2.281873219373219, "grad_norm": 0.6262097954750061, "learning_rate": 5.395018800963876e-05, "loss": 0.6028, "step": 102520 }, { "epoch": 2.282095797720798, "grad_norm": 0.44660210609436035, "learning_rate": 5.391834055225056e-05, "loss": 0.5524, "step": 102530 }, { "epoch": 2.282318376068376, "grad_norm": 0.5187098383903503, "learning_rate": 5.3886501033059367e-05, "loss": 0.5016, "step": 102540 }, { "epoch": 2.2825409544159543, "grad_norm": 0.5763646364212036, "learning_rate": 5.385466945379527e-05, "loss": 0.3882, "step": 102550 }, { "epoch": 2.282763532763533, "grad_norm": 0.49286168813705444, "learning_rate": 5.3822845816188085e-05, "loss": 0.3818, "step": 102560 }, { "epoch": 2.282986111111111, "grad_norm": 0.6143706440925598, "learning_rate": 5.379103012196711e-05, "loss": 0.5703, "step": 102570 }, { "epoch": 2.2832086894586894, "grad_norm": 0.410530149936676, "learning_rate": 5.375922237286126e-05, "loss": 0.3684, "step": 102580 }, { "epoch": 2.283431267806268, "grad_norm": 0.8411930203437805, "learning_rate": 5.372742257059897e-05, "loss": 0.5101, "step": 102590 }, { "epoch": 2.2836538461538463, "grad_norm": 0.5050935745239258, "learning_rate": 5.3695630716908354e-05, "loss": 0.4048, "step": 102600 }, { "epoch": 2.2838764245014245, "grad_norm": 0.4293699860572815, "learning_rate": 5.366384681351688e-05, "loss": 0.4361, "step": 102610 }, { "epoch": 2.2840990028490027, "grad_norm": 0.622787594795227, "learning_rate": 5.363207086215176e-05, "loss": 0.4265, "step": 102620 }, { "epoch": 2.2843215811965814, "grad_norm": 0.6479224562644958, "learning_rate": 5.3600302864539785e-05, "loss": 0.4123, "step": 102630 }, { "epoch": 2.2845441595441596, "grad_norm": 0.502160906791687, "learning_rate": 5.3568542822407154e-05, "loss": 0.4438, "step": 102640 }, { "epoch": 2.284766737891738, "grad_norm": 0.45600494742393494, "learning_rate": 5.353679073747977e-05, "loss": 0.4465, "step": 102650 }, { "epoch": 2.284989316239316, "grad_norm": 0.6276834011077881, "learning_rate": 5.3505046611483076e-05, "loss": 0.5092, "step": 102660 }, { "epoch": 2.2852118945868947, "grad_norm": 0.7201434373855591, "learning_rate": 5.3473310446142075e-05, "loss": 0.4117, "step": 102670 }, { "epoch": 2.285434472934473, "grad_norm": 0.4016876518726349, "learning_rate": 5.344158224318141e-05, "loss": 0.4158, "step": 102680 }, { "epoch": 2.285657051282051, "grad_norm": 0.5486331582069397, "learning_rate": 5.340986200432507e-05, "loss": 0.3995, "step": 102690 }, { "epoch": 2.2858796296296298, "grad_norm": 0.6210474371910095, "learning_rate": 5.337814973129684e-05, "loss": 0.4866, "step": 102700 }, { "epoch": 2.286102207977208, "grad_norm": 0.6425034999847412, "learning_rate": 5.334644542581999e-05, "loss": 0.4377, "step": 102710 }, { "epoch": 2.286324786324786, "grad_norm": 0.5458213686943054, "learning_rate": 5.331474908961733e-05, "loss": 0.5359, "step": 102720 }, { "epoch": 2.286547364672365, "grad_norm": 0.5335869193077087, "learning_rate": 5.328306072441132e-05, "loss": 0.4587, "step": 102730 }, { "epoch": 2.286769943019943, "grad_norm": 0.6593013405799866, "learning_rate": 5.3251380331923936e-05, "loss": 0.4461, "step": 102740 }, { "epoch": 2.2869925213675213, "grad_norm": 0.48755788803100586, "learning_rate": 5.321970791387663e-05, "loss": 0.5187, "step": 102750 }, { "epoch": 2.2872150997151, "grad_norm": 0.4677642583847046, "learning_rate": 5.31880434719906e-05, "loss": 0.4189, "step": 102760 }, { "epoch": 2.287437678062678, "grad_norm": 0.5250211954116821, "learning_rate": 5.315638700798642e-05, "loss": 0.4374, "step": 102770 }, { "epoch": 2.2876602564102564, "grad_norm": 0.5852629542350769, "learning_rate": 5.312473852358437e-05, "loss": 0.525, "step": 102780 }, { "epoch": 2.2878828347578346, "grad_norm": 0.6496941447257996, "learning_rate": 5.3093098020504285e-05, "loss": 0.4563, "step": 102790 }, { "epoch": 2.2881054131054133, "grad_norm": 0.4138205051422119, "learning_rate": 5.306146550046551e-05, "loss": 0.4192, "step": 102800 }, { "epoch": 2.2883279914529915, "grad_norm": 0.5685064196586609, "learning_rate": 5.3029840965187e-05, "loss": 0.5226, "step": 102810 }, { "epoch": 2.2885505698005697, "grad_norm": 0.6383584141731262, "learning_rate": 5.2998224416387284e-05, "loss": 0.4088, "step": 102820 }, { "epoch": 2.288773148148148, "grad_norm": 0.5327423214912415, "learning_rate": 5.296661585578435e-05, "loss": 0.4424, "step": 102830 }, { "epoch": 2.2889957264957266, "grad_norm": 0.5009591579437256, "learning_rate": 5.293501528509588e-05, "loss": 0.5031, "step": 102840 }, { "epoch": 2.289218304843305, "grad_norm": 0.6047536134719849, "learning_rate": 5.2903422706039074e-05, "loss": 0.4622, "step": 102850 }, { "epoch": 2.289440883190883, "grad_norm": 0.39370644092559814, "learning_rate": 5.2871838120330695e-05, "loss": 0.338, "step": 102860 }, { "epoch": 2.2896634615384617, "grad_norm": 0.7097088694572449, "learning_rate": 5.284026152968706e-05, "loss": 0.4214, "step": 102870 }, { "epoch": 2.28988603988604, "grad_norm": 0.47650113701820374, "learning_rate": 5.280869293582418e-05, "loss": 0.3967, "step": 102880 }, { "epoch": 2.290108618233618, "grad_norm": 0.6755890846252441, "learning_rate": 5.27771323404574e-05, "loss": 0.5204, "step": 102890 }, { "epoch": 2.2903311965811968, "grad_norm": 0.4635171890258789, "learning_rate": 5.2745579745301696e-05, "loss": 0.4416, "step": 102900 }, { "epoch": 2.290553774928775, "grad_norm": 0.7415624856948853, "learning_rate": 5.271403515207176e-05, "loss": 0.4874, "step": 102910 }, { "epoch": 2.290776353276353, "grad_norm": 0.5703538060188293, "learning_rate": 5.268249856248173e-05, "loss": 0.5932, "step": 102920 }, { "epoch": 2.290998931623932, "grad_norm": 0.4767812192440033, "learning_rate": 5.26509699782453e-05, "loss": 0.4217, "step": 102930 }, { "epoch": 2.29122150997151, "grad_norm": 0.6134464144706726, "learning_rate": 5.261944940107581e-05, "loss": 0.5368, "step": 102940 }, { "epoch": 2.2914440883190883, "grad_norm": 0.3436715006828308, "learning_rate": 5.258793683268608e-05, "loss": 0.4835, "step": 102950 }, { "epoch": 2.2916666666666665, "grad_norm": 0.6039507985115051, "learning_rate": 5.255643227478861e-05, "loss": 0.3613, "step": 102960 }, { "epoch": 2.291889245014245, "grad_norm": 0.7695244550704956, "learning_rate": 5.252493572909525e-05, "loss": 0.5058, "step": 102970 }, { "epoch": 2.2921118233618234, "grad_norm": 0.6251684427261353, "learning_rate": 5.2493447197317616e-05, "loss": 0.5454, "step": 102980 }, { "epoch": 2.2923344017094016, "grad_norm": 0.4886912405490875, "learning_rate": 5.246196668116681e-05, "loss": 0.4633, "step": 102990 }, { "epoch": 2.29255698005698, "grad_norm": 0.7781798243522644, "learning_rate": 5.2430494182353504e-05, "loss": 0.5038, "step": 103000 }, { "epoch": 2.2927795584045585, "grad_norm": 0.5966687798500061, "learning_rate": 5.239902970258797e-05, "loss": 0.5312, "step": 103010 }, { "epoch": 2.2930021367521367, "grad_norm": 0.5091105699539185, "learning_rate": 5.2367573243580034e-05, "loss": 0.5261, "step": 103020 }, { "epoch": 2.293224715099715, "grad_norm": 0.6099951267242432, "learning_rate": 5.233612480703905e-05, "loss": 0.4781, "step": 103030 }, { "epoch": 2.2934472934472936, "grad_norm": 0.5651898384094238, "learning_rate": 5.230468439467384e-05, "loss": 0.4752, "step": 103040 }, { "epoch": 2.293669871794872, "grad_norm": 0.6548570990562439, "learning_rate": 5.2273252008193e-05, "loss": 0.483, "step": 103050 }, { "epoch": 2.29389245014245, "grad_norm": 0.48320871591567993, "learning_rate": 5.2241827649304584e-05, "loss": 0.4787, "step": 103060 }, { "epoch": 2.2941150284900287, "grad_norm": 0.5880222320556641, "learning_rate": 5.22104113197162e-05, "loss": 0.3746, "step": 103070 }, { "epoch": 2.294337606837607, "grad_norm": 0.41989943385124207, "learning_rate": 5.2179003021135076e-05, "loss": 0.3545, "step": 103080 }, { "epoch": 2.294560185185185, "grad_norm": 0.580086350440979, "learning_rate": 5.214760275526793e-05, "loss": 0.4115, "step": 103090 }, { "epoch": 2.294782763532764, "grad_norm": 0.4067051410675049, "learning_rate": 5.211621052382114e-05, "loss": 0.4841, "step": 103100 }, { "epoch": 2.295005341880342, "grad_norm": 0.5266998410224915, "learning_rate": 5.208482632850047e-05, "loss": 0.4854, "step": 103110 }, { "epoch": 2.29522792022792, "grad_norm": 0.4169633984565735, "learning_rate": 5.205345017101144e-05, "loss": 0.4697, "step": 103120 }, { "epoch": 2.2954504985754984, "grad_norm": 0.4864277243614197, "learning_rate": 5.2022082053059054e-05, "loss": 0.5506, "step": 103130 }, { "epoch": 2.295673076923077, "grad_norm": 0.5991712808609009, "learning_rate": 5.199072197634784e-05, "loss": 0.5265, "step": 103140 }, { "epoch": 2.2958956552706553, "grad_norm": 0.5243403315544128, "learning_rate": 5.195936994258197e-05, "loss": 0.4581, "step": 103150 }, { "epoch": 2.2961182336182335, "grad_norm": 0.5383456945419312, "learning_rate": 5.1928025953465195e-05, "loss": 0.4343, "step": 103160 }, { "epoch": 2.2963408119658117, "grad_norm": 0.5923618078231812, "learning_rate": 5.1896690010700696e-05, "loss": 0.5134, "step": 103170 }, { "epoch": 2.2965633903133904, "grad_norm": 0.9396385550498962, "learning_rate": 5.1865362115991265e-05, "loss": 0.5086, "step": 103180 }, { "epoch": 2.2967859686609686, "grad_norm": 0.6137779355049133, "learning_rate": 5.1834042271039295e-05, "loss": 0.4525, "step": 103190 }, { "epoch": 2.297008547008547, "grad_norm": 0.4180096387863159, "learning_rate": 5.18027304775468e-05, "loss": 0.4028, "step": 103200 }, { "epoch": 2.2972311253561255, "grad_norm": 0.49826472997665405, "learning_rate": 5.177142673721522e-05, "loss": 0.5231, "step": 103210 }, { "epoch": 2.2974537037037037, "grad_norm": 0.4427054822444916, "learning_rate": 5.1740131051745664e-05, "loss": 0.4184, "step": 103220 }, { "epoch": 2.297676282051282, "grad_norm": 0.38813918828964233, "learning_rate": 5.170884342283877e-05, "loss": 0.4, "step": 103230 }, { "epoch": 2.2978988603988606, "grad_norm": 0.4896891117095947, "learning_rate": 5.167756385219478e-05, "loss": 0.4271, "step": 103240 }, { "epoch": 2.298121438746439, "grad_norm": 0.32303759455680847, "learning_rate": 5.16462923415133e-05, "loss": 0.4146, "step": 103250 }, { "epoch": 2.298344017094017, "grad_norm": 0.6285502910614014, "learning_rate": 5.161502889249379e-05, "loss": 0.4145, "step": 103260 }, { "epoch": 2.2985665954415953, "grad_norm": 0.6991376280784607, "learning_rate": 5.158377350683505e-05, "loss": 0.4111, "step": 103270 }, { "epoch": 2.298789173789174, "grad_norm": 0.4952690601348877, "learning_rate": 5.1552526186235565e-05, "loss": 0.392, "step": 103280 }, { "epoch": 2.299011752136752, "grad_norm": 0.5192189812660217, "learning_rate": 5.1521286932393396e-05, "loss": 0.4112, "step": 103290 }, { "epoch": 2.2992343304843303, "grad_norm": 0.42775392532348633, "learning_rate": 5.149005574700598e-05, "loss": 0.4729, "step": 103300 }, { "epoch": 2.299456908831909, "grad_norm": 0.6301408410072327, "learning_rate": 5.145883263177058e-05, "loss": 0.4336, "step": 103310 }, { "epoch": 2.2996794871794872, "grad_norm": 0.6173430681228638, "learning_rate": 5.142761758838375e-05, "loss": 0.4586, "step": 103320 }, { "epoch": 2.2999020655270654, "grad_norm": 0.8824524283409119, "learning_rate": 5.1396410618541814e-05, "loss": 0.4052, "step": 103330 }, { "epoch": 2.3001246438746437, "grad_norm": 0.461791068315506, "learning_rate": 5.136521172394055e-05, "loss": 0.4431, "step": 103340 }, { "epoch": 2.3003472222222223, "grad_norm": 0.4454747140407562, "learning_rate": 5.1334020906275395e-05, "loss": 0.4699, "step": 103350 }, { "epoch": 2.3005698005698005, "grad_norm": 0.3216058313846588, "learning_rate": 5.130283816724124e-05, "loss": 0.4434, "step": 103360 }, { "epoch": 2.3007923789173788, "grad_norm": 0.5915253758430481, "learning_rate": 5.1271663508532565e-05, "loss": 0.4242, "step": 103370 }, { "epoch": 2.3010149572649574, "grad_norm": 0.6322838068008423, "learning_rate": 5.124049693184354e-05, "loss": 0.5511, "step": 103380 }, { "epoch": 2.3012375356125356, "grad_norm": 0.46077054738998413, "learning_rate": 5.120933843886761e-05, "loss": 0.5324, "step": 103390 }, { "epoch": 2.301460113960114, "grad_norm": 0.8137968182563782, "learning_rate": 5.1178188031298035e-05, "loss": 0.4624, "step": 103400 }, { "epoch": 2.3016826923076925, "grad_norm": 0.9656932950019836, "learning_rate": 5.1147045710827576e-05, "loss": 0.4386, "step": 103410 }, { "epoch": 2.3019052706552707, "grad_norm": 0.449002206325531, "learning_rate": 5.111591147914856e-05, "loss": 0.4588, "step": 103420 }, { "epoch": 2.302127849002849, "grad_norm": 0.753855288028717, "learning_rate": 5.108478533795271e-05, "loss": 0.4022, "step": 103430 }, { "epoch": 2.302350427350427, "grad_norm": 0.6333743333816528, "learning_rate": 5.105366728893157e-05, "loss": 0.3967, "step": 103440 }, { "epoch": 2.302573005698006, "grad_norm": 1.017401099205017, "learning_rate": 5.102255733377612e-05, "loss": 0.4129, "step": 103450 }, { "epoch": 2.302795584045584, "grad_norm": 0.49452051520347595, "learning_rate": 5.099145547417681e-05, "loss": 0.4, "step": 103460 }, { "epoch": 2.3030181623931623, "grad_norm": 0.5165116786956787, "learning_rate": 5.096036171182379e-05, "loss": 0.5483, "step": 103470 }, { "epoch": 2.303240740740741, "grad_norm": 0.5332070589065552, "learning_rate": 5.0929276048406735e-05, "loss": 0.5176, "step": 103480 }, { "epoch": 2.303463319088319, "grad_norm": 0.5469018816947937, "learning_rate": 5.089819848561483e-05, "loss": 0.4091, "step": 103490 }, { "epoch": 2.3036858974358974, "grad_norm": 0.6199618577957153, "learning_rate": 5.086712902513691e-05, "loss": 0.4496, "step": 103500 }, { "epoch": 2.3039084757834756, "grad_norm": 0.43971481919288635, "learning_rate": 5.083606766866127e-05, "loss": 0.4389, "step": 103510 }, { "epoch": 2.3041310541310542, "grad_norm": 0.5679343342781067, "learning_rate": 5.080501441787588e-05, "loss": 0.3612, "step": 103520 }, { "epoch": 2.3043536324786325, "grad_norm": 0.4358706772327423, "learning_rate": 5.077396927446809e-05, "loss": 0.3968, "step": 103530 }, { "epoch": 2.3045762108262107, "grad_norm": 0.36143189668655396, "learning_rate": 5.074293224012498e-05, "loss": 0.4191, "step": 103540 }, { "epoch": 2.3047987891737893, "grad_norm": 0.5568950772285461, "learning_rate": 5.071190331653317e-05, "loss": 0.3489, "step": 103550 }, { "epoch": 2.3050213675213675, "grad_norm": 0.7864968180656433, "learning_rate": 5.068088250537868e-05, "loss": 0.5666, "step": 103560 }, { "epoch": 2.3052439458689458, "grad_norm": 0.5771081447601318, "learning_rate": 5.064986980834727e-05, "loss": 0.4724, "step": 103570 }, { "epoch": 2.3054665242165244, "grad_norm": 0.849843442440033, "learning_rate": 5.061886522712422e-05, "loss": 0.4717, "step": 103580 }, { "epoch": 2.3056891025641026, "grad_norm": 0.6978979110717773, "learning_rate": 5.058786876339436e-05, "loss": 0.4199, "step": 103590 }, { "epoch": 2.305911680911681, "grad_norm": 0.5732024908065796, "learning_rate": 5.055688041884199e-05, "loss": 0.4802, "step": 103600 }, { "epoch": 2.306134259259259, "grad_norm": 0.7335711717605591, "learning_rate": 5.052590019515107e-05, "loss": 0.3348, "step": 103610 }, { "epoch": 2.3063568376068377, "grad_norm": 0.4181780219078064, "learning_rate": 5.049492809400509e-05, "loss": 0.4457, "step": 103620 }, { "epoch": 2.306579415954416, "grad_norm": 0.43094080686569214, "learning_rate": 5.046396411708709e-05, "loss": 0.5484, "step": 103630 }, { "epoch": 2.306801994301994, "grad_norm": 0.4213450849056244, "learning_rate": 5.043300826607973e-05, "loss": 0.4916, "step": 103640 }, { "epoch": 2.307024572649573, "grad_norm": 0.4814653694629669, "learning_rate": 5.0402060542665183e-05, "loss": 0.4567, "step": 103650 }, { "epoch": 2.307247150997151, "grad_norm": 0.41216182708740234, "learning_rate": 5.0371120948525076e-05, "loss": 0.6228, "step": 103660 }, { "epoch": 2.3074697293447293, "grad_norm": 0.46462947130203247, "learning_rate": 5.034018948534076e-05, "loss": 0.3674, "step": 103670 }, { "epoch": 2.3076923076923075, "grad_norm": 0.42045047879219055, "learning_rate": 5.03092661547931e-05, "loss": 0.3813, "step": 103680 }, { "epoch": 2.307914886039886, "grad_norm": 0.5972529649734497, "learning_rate": 5.027835095856241e-05, "loss": 0.4336, "step": 103690 }, { "epoch": 2.3081374643874644, "grad_norm": 0.6638756990432739, "learning_rate": 5.0247443898328714e-05, "loss": 0.4934, "step": 103700 }, { "epoch": 2.3083600427350426, "grad_norm": 0.5444926619529724, "learning_rate": 5.021654497577151e-05, "loss": 0.5162, "step": 103710 }, { "epoch": 2.3085826210826212, "grad_norm": 0.6065942049026489, "learning_rate": 5.018565419256984e-05, "loss": 0.525, "step": 103720 }, { "epoch": 2.3088051994301995, "grad_norm": 0.7407015562057495, "learning_rate": 5.0154771550402447e-05, "loss": 0.4399, "step": 103730 }, { "epoch": 2.3090277777777777, "grad_norm": 0.8703110814094543, "learning_rate": 5.012389705094738e-05, "loss": 0.3646, "step": 103740 }, { "epoch": 2.3092503561253563, "grad_norm": 0.6362120509147644, "learning_rate": 5.009303069588242e-05, "loss": 0.4145, "step": 103750 }, { "epoch": 2.3094729344729346, "grad_norm": 0.7072296142578125, "learning_rate": 5.006217248688492e-05, "loss": 0.4931, "step": 103760 }, { "epoch": 2.3096955128205128, "grad_norm": 0.6555598378181458, "learning_rate": 5.003132242563169e-05, "loss": 0.4361, "step": 103770 }, { "epoch": 2.309918091168091, "grad_norm": 0.6338368058204651, "learning_rate": 5.0000480513799176e-05, "loss": 0.411, "step": 103780 }, { "epoch": 2.3101406695156697, "grad_norm": 0.45361435413360596, "learning_rate": 4.996964675306341e-05, "loss": 0.4087, "step": 103790 }, { "epoch": 2.310363247863248, "grad_norm": 0.39669114351272583, "learning_rate": 4.99388211450998e-05, "loss": 0.4447, "step": 103800 }, { "epoch": 2.310585826210826, "grad_norm": 0.4574336111545563, "learning_rate": 4.990800369158355e-05, "loss": 0.3971, "step": 103810 }, { "epoch": 2.3108084045584047, "grad_norm": 0.5125351548194885, "learning_rate": 4.9877194394189185e-05, "loss": 0.429, "step": 103820 }, { "epoch": 2.311030982905983, "grad_norm": 0.5527442693710327, "learning_rate": 4.9846393254591e-05, "loss": 0.4648, "step": 103830 }, { "epoch": 2.311253561253561, "grad_norm": 0.7029348611831665, "learning_rate": 4.98156002744627e-05, "loss": 0.5263, "step": 103840 }, { "epoch": 2.3114761396011394, "grad_norm": 0.52207350730896, "learning_rate": 4.978481545547764e-05, "loss": 0.4652, "step": 103850 }, { "epoch": 2.311698717948718, "grad_norm": 0.609255850315094, "learning_rate": 4.975403879930867e-05, "loss": 0.4534, "step": 103860 }, { "epoch": 2.3119212962962963, "grad_norm": 0.6262114644050598, "learning_rate": 4.972327030762829e-05, "loss": 0.4914, "step": 103870 }, { "epoch": 2.3121438746438745, "grad_norm": 0.5444429516792297, "learning_rate": 4.9692509982108373e-05, "loss": 0.4856, "step": 103880 }, { "epoch": 2.312366452991453, "grad_norm": 0.5377378463745117, "learning_rate": 4.966175782442051e-05, "loss": 0.4425, "step": 103890 }, { "epoch": 2.3125890313390314, "grad_norm": 0.5324745774269104, "learning_rate": 4.9631013836235786e-05, "loss": 0.3724, "step": 103900 }, { "epoch": 2.3128116096866096, "grad_norm": 1.1432093381881714, "learning_rate": 4.960027801922486e-05, "loss": 0.4219, "step": 103910 }, { "epoch": 2.3130341880341883, "grad_norm": 0.48718583583831787, "learning_rate": 4.9569550375057994e-05, "loss": 0.4795, "step": 103920 }, { "epoch": 2.3132567663817665, "grad_norm": 0.3967041075229645, "learning_rate": 4.953883090540492e-05, "loss": 0.4148, "step": 103930 }, { "epoch": 2.3134793447293447, "grad_norm": 0.6293598413467407, "learning_rate": 4.9508119611934914e-05, "loss": 0.4807, "step": 103940 }, { "epoch": 2.313701923076923, "grad_norm": 0.7447176575660706, "learning_rate": 4.947741649631694e-05, "loss": 0.5025, "step": 103950 }, { "epoch": 2.3139245014245016, "grad_norm": 0.618916928768158, "learning_rate": 4.9446721560219324e-05, "loss": 0.5215, "step": 103960 }, { "epoch": 2.31414707977208, "grad_norm": 0.7033261060714722, "learning_rate": 4.94160348053101e-05, "loss": 0.3964, "step": 103970 }, { "epoch": 2.314369658119658, "grad_norm": 0.4891183376312256, "learning_rate": 4.938535623325682e-05, "loss": 0.4563, "step": 103980 }, { "epoch": 2.314592236467236, "grad_norm": 0.5945625305175781, "learning_rate": 4.935468584572658e-05, "loss": 0.4716, "step": 103990 }, { "epoch": 2.314814814814815, "grad_norm": 0.6280008554458618, "learning_rate": 4.932402364438604e-05, "loss": 0.5475, "step": 104000 }, { "epoch": 2.315037393162393, "grad_norm": 0.535823404788971, "learning_rate": 4.929336963090145e-05, "loss": 0.4413, "step": 104010 }, { "epoch": 2.3152599715099713, "grad_norm": 0.42797577381134033, "learning_rate": 4.926272380693848e-05, "loss": 0.4313, "step": 104020 }, { "epoch": 2.31548254985755, "grad_norm": 0.6048837900161743, "learning_rate": 4.9232086174162504e-05, "loss": 0.4155, "step": 104030 }, { "epoch": 2.315705128205128, "grad_norm": 0.49043428897857666, "learning_rate": 4.9201456734238394e-05, "loss": 0.5275, "step": 104040 }, { "epoch": 2.3159277065527064, "grad_norm": 0.6840739846229553, "learning_rate": 4.917083548883055e-05, "loss": 0.3677, "step": 104050 }, { "epoch": 2.316150284900285, "grad_norm": 0.4900062084197998, "learning_rate": 4.914022243960299e-05, "loss": 0.4514, "step": 104060 }, { "epoch": 2.3163728632478633, "grad_norm": 0.45657631754875183, "learning_rate": 4.91096175882193e-05, "loss": 0.547, "step": 104070 }, { "epoch": 2.3165954415954415, "grad_norm": 0.6857179403305054, "learning_rate": 4.9079020936342465e-05, "loss": 0.5541, "step": 104080 }, { "epoch": 2.31681801994302, "grad_norm": 0.6056687235832214, "learning_rate": 4.9048432485635245e-05, "loss": 0.367, "step": 104090 }, { "epoch": 2.3170405982905984, "grad_norm": 0.5658937096595764, "learning_rate": 4.90178522377597e-05, "loss": 0.429, "step": 104100 }, { "epoch": 2.3172631766381766, "grad_norm": 0.3987172842025757, "learning_rate": 4.898728019437768e-05, "loss": 0.4943, "step": 104110 }, { "epoch": 2.317485754985755, "grad_norm": 0.5849761962890625, "learning_rate": 4.895671635715047e-05, "loss": 0.3971, "step": 104120 }, { "epoch": 2.3177083333333335, "grad_norm": 0.6185342073440552, "learning_rate": 4.8926160727738944e-05, "loss": 0.4785, "step": 104130 }, { "epoch": 2.3179309116809117, "grad_norm": 0.6695090532302856, "learning_rate": 4.889561330780352e-05, "loss": 0.4147, "step": 104140 }, { "epoch": 2.31815349002849, "grad_norm": 0.5411170721054077, "learning_rate": 4.886507409900425e-05, "loss": 0.5187, "step": 104150 }, { "epoch": 2.318376068376068, "grad_norm": 0.7303962111473083, "learning_rate": 4.8834543103000486e-05, "loss": 0.3542, "step": 104160 }, { "epoch": 2.318598646723647, "grad_norm": 0.42483285069465637, "learning_rate": 4.880402032145141e-05, "loss": 0.575, "step": 104170 }, { "epoch": 2.318821225071225, "grad_norm": 0.7072681188583374, "learning_rate": 4.877350575601565e-05, "loss": 0.4142, "step": 104180 }, { "epoch": 2.3190438034188032, "grad_norm": 0.6763079166412354, "learning_rate": 4.874299940835139e-05, "loss": 0.4266, "step": 104190 }, { "epoch": 2.319266381766382, "grad_norm": 0.5402014851570129, "learning_rate": 4.871250128011635e-05, "loss": 0.4712, "step": 104200 }, { "epoch": 2.31948896011396, "grad_norm": 0.4686580002307892, "learning_rate": 4.868201137296791e-05, "loss": 0.396, "step": 104210 }, { "epoch": 2.3197115384615383, "grad_norm": 0.43557852506637573, "learning_rate": 4.865152968856279e-05, "loss": 0.4539, "step": 104220 }, { "epoch": 2.319934116809117, "grad_norm": 0.535490870475769, "learning_rate": 4.86210562285575e-05, "loss": 0.3973, "step": 104230 }, { "epoch": 2.320156695156695, "grad_norm": 0.27691832184791565, "learning_rate": 4.8590590994607874e-05, "loss": 0.4522, "step": 104240 }, { "epoch": 2.3203792735042734, "grad_norm": 0.7170953154563904, "learning_rate": 4.8560133988369495e-05, "loss": 0.4457, "step": 104250 }, { "epoch": 2.320601851851852, "grad_norm": 0.32499974966049194, "learning_rate": 4.852968521149741e-05, "loss": 0.4466, "step": 104260 }, { "epoch": 2.3208244301994303, "grad_norm": 0.3497593104839325, "learning_rate": 4.849924466564624e-05, "loss": 0.3996, "step": 104270 }, { "epoch": 2.3210470085470085, "grad_norm": 0.5175875425338745, "learning_rate": 4.846881235247011e-05, "loss": 0.419, "step": 104280 }, { "epoch": 2.3212695868945867, "grad_norm": 0.5474359393119812, "learning_rate": 4.8438388273622834e-05, "loss": 0.4922, "step": 104290 }, { "epoch": 2.3214921652421654, "grad_norm": 0.4498869478702545, "learning_rate": 4.840797243075757e-05, "loss": 0.3817, "step": 104300 }, { "epoch": 2.3217147435897436, "grad_norm": 0.9331063628196716, "learning_rate": 4.837756482552718e-05, "loss": 0.4962, "step": 104310 }, { "epoch": 2.321937321937322, "grad_norm": 0.5544161200523376, "learning_rate": 4.834716545958402e-05, "loss": 0.5001, "step": 104320 }, { "epoch": 2.3221599002849, "grad_norm": 0.7270580530166626, "learning_rate": 4.831677433458006e-05, "loss": 0.5085, "step": 104330 }, { "epoch": 2.3223824786324787, "grad_norm": 0.6028932929039001, "learning_rate": 4.82863914521668e-05, "loss": 0.4596, "step": 104340 }, { "epoch": 2.322605056980057, "grad_norm": 0.6653586626052856, "learning_rate": 4.825601681399518e-05, "loss": 0.4395, "step": 104350 }, { "epoch": 2.322827635327635, "grad_norm": 0.4627334177494049, "learning_rate": 4.822565042171583e-05, "loss": 0.4488, "step": 104360 }, { "epoch": 2.323050213675214, "grad_norm": 0.502573549747467, "learning_rate": 4.819529227697894e-05, "loss": 0.3895, "step": 104370 }, { "epoch": 2.323272792022792, "grad_norm": 0.5004000663757324, "learning_rate": 4.816494238143407e-05, "loss": 0.4488, "step": 104380 }, { "epoch": 2.3234953703703702, "grad_norm": 0.5961329936981201, "learning_rate": 4.813460073673055e-05, "loss": 0.3853, "step": 104390 }, { "epoch": 2.323717948717949, "grad_norm": 0.8065715432167053, "learning_rate": 4.810426734451714e-05, "loss": 0.3902, "step": 104400 }, { "epoch": 2.323940527065527, "grad_norm": 0.6661033034324646, "learning_rate": 4.807394220644219e-05, "loss": 0.4262, "step": 104410 }, { "epoch": 2.3241631054131053, "grad_norm": 0.6344094276428223, "learning_rate": 4.804362532415358e-05, "loss": 0.359, "step": 104420 }, { "epoch": 2.324385683760684, "grad_norm": 0.4737999141216278, "learning_rate": 4.801331669929885e-05, "loss": 0.5379, "step": 104430 }, { "epoch": 2.324608262108262, "grad_norm": 0.8009842038154602, "learning_rate": 4.798301633352484e-05, "loss": 0.5161, "step": 104440 }, { "epoch": 2.3248308404558404, "grad_norm": 0.5520380139350891, "learning_rate": 4.7952724228478164e-05, "loss": 0.4088, "step": 104450 }, { "epoch": 2.3250534188034186, "grad_norm": 0.4492236375808716, "learning_rate": 4.792244038580494e-05, "loss": 0.5594, "step": 104460 }, { "epoch": 2.3252759971509973, "grad_norm": 0.6150158047676086, "learning_rate": 4.789216480715084e-05, "loss": 0.3366, "step": 104470 }, { "epoch": 2.3254985754985755, "grad_norm": 0.5869104862213135, "learning_rate": 4.786189749416099e-05, "loss": 0.5082, "step": 104480 }, { "epoch": 2.3257211538461537, "grad_norm": 0.5086728930473328, "learning_rate": 4.7831638448480176e-05, "loss": 0.4535, "step": 104490 }, { "epoch": 2.325943732193732, "grad_norm": 1.0462831258773804, "learning_rate": 4.780138767175273e-05, "loss": 0.5004, "step": 104500 }, { "epoch": 2.3261663105413106, "grad_norm": 0.4210734963417053, "learning_rate": 4.777114516562244e-05, "loss": 0.4724, "step": 104510 }, { "epoch": 2.326388888888889, "grad_norm": 0.5561769008636475, "learning_rate": 4.774091093173274e-05, "loss": 0.499, "step": 104520 }, { "epoch": 2.326611467236467, "grad_norm": 0.4487607777118683, "learning_rate": 4.771068497172657e-05, "loss": 0.4394, "step": 104530 }, { "epoch": 2.3268340455840457, "grad_norm": 1.0502511262893677, "learning_rate": 4.7680467287246465e-05, "loss": 0.388, "step": 104540 }, { "epoch": 2.327056623931624, "grad_norm": 0.5806379318237305, "learning_rate": 4.765025787993445e-05, "loss": 0.5453, "step": 104550 }, { "epoch": 2.327279202279202, "grad_norm": 0.5412917137145996, "learning_rate": 4.7620056751432155e-05, "loss": 0.4893, "step": 104560 }, { "epoch": 2.327501780626781, "grad_norm": 0.48680606484413147, "learning_rate": 4.758986390338076e-05, "loss": 0.497, "step": 104570 }, { "epoch": 2.327724358974359, "grad_norm": 0.6453859806060791, "learning_rate": 4.7559679337420894e-05, "loss": 0.5302, "step": 104580 }, { "epoch": 2.3279469373219372, "grad_norm": 0.5900819301605225, "learning_rate": 4.752950305519282e-05, "loss": 0.4153, "step": 104590 }, { "epoch": 2.328169515669516, "grad_norm": 0.7351835370063782, "learning_rate": 4.7499335058336437e-05, "loss": 0.4622, "step": 104600 }, { "epoch": 2.328392094017094, "grad_norm": 0.5503653287887573, "learning_rate": 4.746917534849098e-05, "loss": 0.4657, "step": 104610 }, { "epoch": 2.3286146723646723, "grad_norm": 0.4437830150127411, "learning_rate": 4.74390239272954e-05, "loss": 0.4675, "step": 104620 }, { "epoch": 2.3288372507122506, "grad_norm": 0.7353461384773254, "learning_rate": 4.740888079638815e-05, "loss": 0.3839, "step": 104630 }, { "epoch": 2.3290598290598292, "grad_norm": 0.8295828700065613, "learning_rate": 4.737874595740728e-05, "loss": 0.4533, "step": 104640 }, { "epoch": 2.3292824074074074, "grad_norm": 0.6678076386451721, "learning_rate": 4.734861941199025e-05, "loss": 0.4824, "step": 104650 }, { "epoch": 2.3295049857549857, "grad_norm": 0.5148504972457886, "learning_rate": 4.7318501161774206e-05, "loss": 0.4704, "step": 104660 }, { "epoch": 2.329727564102564, "grad_norm": 0.5634397268295288, "learning_rate": 4.728839120839581e-05, "loss": 0.475, "step": 104670 }, { "epoch": 2.3299501424501425, "grad_norm": 0.5880442261695862, "learning_rate": 4.725828955349123e-05, "loss": 0.428, "step": 104680 }, { "epoch": 2.3301727207977208, "grad_norm": 0.5846168994903564, "learning_rate": 4.722819619869625e-05, "loss": 0.3902, "step": 104690 }, { "epoch": 2.330395299145299, "grad_norm": 0.7889565229415894, "learning_rate": 4.7198111145646165e-05, "loss": 0.5206, "step": 104700 }, { "epoch": 2.3306178774928776, "grad_norm": 0.5713897943496704, "learning_rate": 4.7168034395975834e-05, "loss": 0.5043, "step": 104710 }, { "epoch": 2.330840455840456, "grad_norm": 0.49295225739479065, "learning_rate": 4.713796595131961e-05, "loss": 0.3995, "step": 104720 }, { "epoch": 2.331063034188034, "grad_norm": 0.7045540809631348, "learning_rate": 4.710790581331148e-05, "loss": 0.4757, "step": 104730 }, { "epoch": 2.3312856125356127, "grad_norm": 0.5422503352165222, "learning_rate": 4.707785398358486e-05, "loss": 0.3957, "step": 104740 }, { "epoch": 2.331508190883191, "grad_norm": 0.6096838116645813, "learning_rate": 4.704781046377285e-05, "loss": 0.4897, "step": 104750 }, { "epoch": 2.331730769230769, "grad_norm": 0.6194683313369751, "learning_rate": 4.701777525550803e-05, "loss": 0.5267, "step": 104760 }, { "epoch": 2.331953347578348, "grad_norm": 0.6207007169723511, "learning_rate": 4.698774836042254e-05, "loss": 0.4713, "step": 104770 }, { "epoch": 2.332175925925926, "grad_norm": 0.6871147751808167, "learning_rate": 4.695772978014812e-05, "loss": 0.4667, "step": 104780 }, { "epoch": 2.3323985042735043, "grad_norm": 0.5978109240531921, "learning_rate": 4.692771951631589e-05, "loss": 0.4706, "step": 104790 }, { "epoch": 2.3326210826210825, "grad_norm": 0.7385008335113525, "learning_rate": 4.689771757055672e-05, "loss": 0.5148, "step": 104800 }, { "epoch": 2.332843660968661, "grad_norm": 0.5949559211730957, "learning_rate": 4.6867723944500874e-05, "loss": 0.4718, "step": 104810 }, { "epoch": 2.3330662393162394, "grad_norm": 0.529515266418457, "learning_rate": 4.6837738639778294e-05, "loss": 0.3999, "step": 104820 }, { "epoch": 2.3332888176638176, "grad_norm": 0.654148519039154, "learning_rate": 4.680776165801837e-05, "loss": 0.4769, "step": 104830 }, { "epoch": 2.333511396011396, "grad_norm": 0.6815111637115479, "learning_rate": 4.677779300085008e-05, "loss": 0.4645, "step": 104840 }, { "epoch": 2.3337339743589745, "grad_norm": 0.5101546049118042, "learning_rate": 4.6747832669902035e-05, "loss": 0.4376, "step": 104850 }, { "epoch": 2.3339565527065527, "grad_norm": 0.4611969292163849, "learning_rate": 4.6717880666802206e-05, "loss": 0.386, "step": 104860 }, { "epoch": 2.334179131054131, "grad_norm": 0.6382802128791809, "learning_rate": 4.668793699317815e-05, "loss": 0.55, "step": 104870 }, { "epoch": 2.3344017094017095, "grad_norm": 0.7732513546943665, "learning_rate": 4.665800165065712e-05, "loss": 0.5326, "step": 104880 }, { "epoch": 2.3346242877492878, "grad_norm": 0.5758798122406006, "learning_rate": 4.66280746408658e-05, "loss": 0.4083, "step": 104890 }, { "epoch": 2.334846866096866, "grad_norm": 0.5310105681419373, "learning_rate": 4.659815596543049e-05, "loss": 0.3584, "step": 104900 }, { "epoch": 2.3350694444444446, "grad_norm": 0.657589316368103, "learning_rate": 4.656824562597695e-05, "loss": 0.5431, "step": 104910 }, { "epoch": 2.335292022792023, "grad_norm": 0.5576604008674622, "learning_rate": 4.653834362413059e-05, "loss": 0.5104, "step": 104920 }, { "epoch": 2.335514601139601, "grad_norm": 0.32664191722869873, "learning_rate": 4.6508449961516224e-05, "loss": 0.3872, "step": 104930 }, { "epoch": 2.3357371794871793, "grad_norm": 0.6069530248641968, "learning_rate": 4.647856463975835e-05, "loss": 0.4422, "step": 104940 }, { "epoch": 2.335959757834758, "grad_norm": 0.43368053436279297, "learning_rate": 4.644868766048094e-05, "loss": 0.5391, "step": 104950 }, { "epoch": 2.336182336182336, "grad_norm": 0.7271152138710022, "learning_rate": 4.641881902530754e-05, "loss": 0.4033, "step": 104960 }, { "epoch": 2.3364049145299144, "grad_norm": 0.587011456489563, "learning_rate": 4.6388958735861246e-05, "loss": 0.4844, "step": 104970 }, { "epoch": 2.336627492877493, "grad_norm": 0.8586376905441284, "learning_rate": 4.6359106793764676e-05, "loss": 0.366, "step": 104980 }, { "epoch": 2.3368500712250713, "grad_norm": 0.7252295613288879, "learning_rate": 4.6329263200640105e-05, "loss": 0.4419, "step": 104990 }, { "epoch": 2.3370726495726495, "grad_norm": 0.5614856481552124, "learning_rate": 4.6299427958109155e-05, "loss": 0.5309, "step": 105000 }, { "epoch": 2.3372952279202277, "grad_norm": 0.5738231539726257, "learning_rate": 4.626960106779306e-05, "loss": 0.4925, "step": 105010 }, { "epoch": 2.3375178062678064, "grad_norm": 0.5994122624397278, "learning_rate": 4.623978253131267e-05, "loss": 0.4677, "step": 105020 }, { "epoch": 2.3377403846153846, "grad_norm": 0.813988208770752, "learning_rate": 4.620997235028841e-05, "loss": 0.4449, "step": 105030 }, { "epoch": 2.337962962962963, "grad_norm": 0.4093601107597351, "learning_rate": 4.6180170526340114e-05, "loss": 0.4821, "step": 105040 }, { "epoch": 2.3381855413105415, "grad_norm": 0.6398217082023621, "learning_rate": 4.615037706108731e-05, "loss": 0.4613, "step": 105050 }, { "epoch": 2.3384081196581197, "grad_norm": 0.669044554233551, "learning_rate": 4.6120591956148994e-05, "loss": 0.4632, "step": 105060 }, { "epoch": 2.338630698005698, "grad_norm": 0.587679922580719, "learning_rate": 4.609081521314365e-05, "loss": 0.3624, "step": 105070 }, { "epoch": 2.3388532763532766, "grad_norm": 0.6691391468048096, "learning_rate": 4.606104683368937e-05, "loss": 0.4374, "step": 105080 }, { "epoch": 2.3390758547008548, "grad_norm": 0.5915713906288147, "learning_rate": 4.603128681940385e-05, "loss": 0.3768, "step": 105090 }, { "epoch": 2.339298433048433, "grad_norm": 0.5158896446228027, "learning_rate": 4.6001535171904245e-05, "loss": 0.5167, "step": 105100 }, { "epoch": 2.339521011396011, "grad_norm": 0.5442484617233276, "learning_rate": 4.5971791892807293e-05, "loss": 0.4577, "step": 105110 }, { "epoch": 2.33974358974359, "grad_norm": 0.3447693884372711, "learning_rate": 4.594205698372931e-05, "loss": 0.4921, "step": 105120 }, { "epoch": 2.339966168091168, "grad_norm": 0.6204404234886169, "learning_rate": 4.5912330446286e-05, "loss": 0.4168, "step": 105130 }, { "epoch": 2.3401887464387463, "grad_norm": 0.5360138416290283, "learning_rate": 4.588261228209287e-05, "loss": 0.6229, "step": 105140 }, { "epoch": 2.3402777777777777, "eval_loss": 0.5290852189064026, "eval_runtime": 337.4554, "eval_samples_per_second": 7.008, "eval_steps_per_second": 7.008, "step": 105144 }, { "epoch": 2.340411324786325, "grad_norm": 0.3526182770729065, "learning_rate": 4.5852902492764705e-05, "loss": 0.3732, "step": 105150 }, { "epoch": 2.340633903133903, "grad_norm": 0.5401740074157715, "learning_rate": 4.5823201079916e-05, "loss": 0.5155, "step": 105160 }, { "epoch": 2.3408564814814814, "grad_norm": 0.8644507527351379, "learning_rate": 4.579350804516076e-05, "loss": 0.5004, "step": 105170 }, { "epoch": 2.3410790598290596, "grad_norm": 0.6037582159042358, "learning_rate": 4.576382339011254e-05, "loss": 0.5717, "step": 105180 }, { "epoch": 2.3413016381766383, "grad_norm": 0.6918118000030518, "learning_rate": 4.5734147116384395e-05, "loss": 0.5744, "step": 105190 }, { "epoch": 2.3415242165242165, "grad_norm": 0.5624716877937317, "learning_rate": 4.570447922558907e-05, "loss": 0.5157, "step": 105200 }, { "epoch": 2.3417467948717947, "grad_norm": 0.6044760346412659, "learning_rate": 4.567481971933858e-05, "loss": 0.5491, "step": 105210 }, { "epoch": 2.3419693732193734, "grad_norm": 0.6322526931762695, "learning_rate": 4.564516859924475e-05, "loss": 0.4647, "step": 105220 }, { "epoch": 2.3421919515669516, "grad_norm": 0.7589077353477478, "learning_rate": 4.561552586691879e-05, "loss": 0.5316, "step": 105230 }, { "epoch": 2.34241452991453, "grad_norm": 0.4485898017883301, "learning_rate": 4.558589152397155e-05, "loss": 0.4465, "step": 105240 }, { "epoch": 2.3426371082621085, "grad_norm": 0.49135929346084595, "learning_rate": 4.555626557201338e-05, "loss": 0.4638, "step": 105250 }, { "epoch": 2.3428596866096867, "grad_norm": 0.5671507716178894, "learning_rate": 4.552664801265421e-05, "loss": 0.3968, "step": 105260 }, { "epoch": 2.343082264957265, "grad_norm": 0.6243501305580139, "learning_rate": 4.54970388475034e-05, "loss": 0.578, "step": 105270 }, { "epoch": 2.343304843304843, "grad_norm": 0.43660518527030945, "learning_rate": 4.546743807817004e-05, "loss": 0.51, "step": 105280 }, { "epoch": 2.343527421652422, "grad_norm": 0.7414266467094421, "learning_rate": 4.5437845706262546e-05, "loss": 0.5355, "step": 105290 }, { "epoch": 2.34375, "grad_norm": 0.478127658367157, "learning_rate": 4.5408261733389054e-05, "loss": 0.4781, "step": 105300 }, { "epoch": 2.343972578347578, "grad_norm": 0.7516529560089111, "learning_rate": 4.537868616115717e-05, "loss": 0.525, "step": 105310 }, { "epoch": 2.344195156695157, "grad_norm": 0.5840153098106384, "learning_rate": 4.534911899117405e-05, "loss": 0.4286, "step": 105320 }, { "epoch": 2.344417735042735, "grad_norm": 0.44853419065475464, "learning_rate": 4.53195602250464e-05, "loss": 0.5242, "step": 105330 }, { "epoch": 2.3446403133903133, "grad_norm": 0.4717266857624054, "learning_rate": 4.529000986438055e-05, "loss": 0.3967, "step": 105340 }, { "epoch": 2.3448628917378915, "grad_norm": 0.641845166683197, "learning_rate": 4.526046791078216e-05, "loss": 0.5133, "step": 105350 }, { "epoch": 2.34508547008547, "grad_norm": 0.6157569885253906, "learning_rate": 4.523093436585659e-05, "loss": 0.448, "step": 105360 }, { "epoch": 2.3453080484330484, "grad_norm": 0.57972252368927, "learning_rate": 4.520140923120877e-05, "loss": 0.4681, "step": 105370 }, { "epoch": 2.3455306267806266, "grad_norm": 0.6386918425559998, "learning_rate": 4.517189250844309e-05, "loss": 0.4064, "step": 105380 }, { "epoch": 2.3457532051282053, "grad_norm": 0.34322062134742737, "learning_rate": 4.514238419916359e-05, "loss": 0.3958, "step": 105390 }, { "epoch": 2.3459757834757835, "grad_norm": 0.3749435842037201, "learning_rate": 4.5112884304973626e-05, "loss": 0.4942, "step": 105400 }, { "epoch": 2.3461983618233617, "grad_norm": 0.7757494449615479, "learning_rate": 4.508339282747633e-05, "loss": 0.6077, "step": 105410 }, { "epoch": 2.3464209401709404, "grad_norm": 0.6120235919952393, "learning_rate": 4.505390976827437e-05, "loss": 0.37, "step": 105420 }, { "epoch": 2.3466435185185186, "grad_norm": 0.3525809049606323, "learning_rate": 4.502443512896972e-05, "loss": 0.4643, "step": 105430 }, { "epoch": 2.346866096866097, "grad_norm": 0.5600558519363403, "learning_rate": 4.499496891116413e-05, "loss": 0.5139, "step": 105440 }, { "epoch": 2.347088675213675, "grad_norm": 0.7498864531517029, "learning_rate": 4.4965511116458836e-05, "loss": 0.4153, "step": 105450 }, { "epoch": 2.3473112535612537, "grad_norm": 0.5569818615913391, "learning_rate": 4.493606174645457e-05, "loss": 0.4323, "step": 105460 }, { "epoch": 2.347533831908832, "grad_norm": 0.72532057762146, "learning_rate": 4.490662080275165e-05, "loss": 0.4834, "step": 105470 }, { "epoch": 2.34775641025641, "grad_norm": 0.7324677109718323, "learning_rate": 4.487718828695e-05, "loss": 0.5242, "step": 105480 }, { "epoch": 2.347978988603989, "grad_norm": 0.34302493929862976, "learning_rate": 4.484776420064885e-05, "loss": 0.3734, "step": 105490 }, { "epoch": 2.348201566951567, "grad_norm": 1.0544486045837402, "learning_rate": 4.481834854544722e-05, "loss": 0.4965, "step": 105500 }, { "epoch": 2.3484241452991452, "grad_norm": 0.44091662764549255, "learning_rate": 4.4788941322943555e-05, "loss": 0.4322, "step": 105510 }, { "epoch": 2.3486467236467234, "grad_norm": 0.4288344383239746, "learning_rate": 4.475954253473596e-05, "loss": 0.405, "step": 105520 }, { "epoch": 2.348869301994302, "grad_norm": 0.47448357939720154, "learning_rate": 4.473015218242182e-05, "loss": 0.4113, "step": 105530 }, { "epoch": 2.3490918803418803, "grad_norm": 0.5844582319259644, "learning_rate": 4.470077026759834e-05, "loss": 0.4803, "step": 105540 }, { "epoch": 2.3493144586894585, "grad_norm": 0.5927007794380188, "learning_rate": 4.4671396791862144e-05, "loss": 0.4546, "step": 105550 }, { "epoch": 2.349537037037037, "grad_norm": 0.5077242851257324, "learning_rate": 4.464203175680943e-05, "loss": 0.4778, "step": 105560 }, { "epoch": 2.3497596153846154, "grad_norm": 0.5459374785423279, "learning_rate": 4.4612675164035864e-05, "loss": 0.5227, "step": 105570 }, { "epoch": 2.3499821937321936, "grad_norm": 0.9779196381568909, "learning_rate": 4.458332701513672e-05, "loss": 0.4266, "step": 105580 }, { "epoch": 2.3502047720797723, "grad_norm": 0.4522073566913605, "learning_rate": 4.4553987311706836e-05, "loss": 0.3806, "step": 105590 }, { "epoch": 2.3504273504273505, "grad_norm": 0.44010409712791443, "learning_rate": 4.452465605534053e-05, "loss": 0.434, "step": 105600 }, { "epoch": 2.3506499287749287, "grad_norm": 0.44877657294273376, "learning_rate": 4.4495333247631686e-05, "loss": 0.4163, "step": 105610 }, { "epoch": 2.350872507122507, "grad_norm": 0.5671922564506531, "learning_rate": 4.4466018890173785e-05, "loss": 0.4689, "step": 105620 }, { "epoch": 2.3510950854700856, "grad_norm": 0.7625622153282166, "learning_rate": 4.443671298455969e-05, "loss": 0.3857, "step": 105630 }, { "epoch": 2.351317663817664, "grad_norm": 0.4385626018047333, "learning_rate": 4.440741553238197e-05, "loss": 0.3915, "step": 105640 }, { "epoch": 2.351540242165242, "grad_norm": 0.795430600643158, "learning_rate": 4.4378126535232725e-05, "loss": 0.4764, "step": 105650 }, { "epoch": 2.3517628205128207, "grad_norm": 0.5816218852996826, "learning_rate": 4.434884599470341e-05, "loss": 0.5346, "step": 105660 }, { "epoch": 2.351985398860399, "grad_norm": 0.8353447914123535, "learning_rate": 4.431957391238521e-05, "loss": 0.4902, "step": 105670 }, { "epoch": 2.352207977207977, "grad_norm": 0.6605375409126282, "learning_rate": 4.429031028986885e-05, "loss": 0.4659, "step": 105680 }, { "epoch": 2.3524305555555554, "grad_norm": 0.5393868684768677, "learning_rate": 4.426105512874448e-05, "loss": 0.47, "step": 105690 }, { "epoch": 2.352653133903134, "grad_norm": 0.40643393993377686, "learning_rate": 4.4231808430601925e-05, "loss": 0.4405, "step": 105700 }, { "epoch": 2.3528757122507122, "grad_norm": 0.7156741619110107, "learning_rate": 4.420257019703036e-05, "loss": 0.471, "step": 105710 }, { "epoch": 2.3530982905982905, "grad_norm": 0.42996159195899963, "learning_rate": 4.417334042961867e-05, "loss": 0.3584, "step": 105720 }, { "epoch": 2.353320868945869, "grad_norm": 0.6254279613494873, "learning_rate": 4.414411912995522e-05, "loss": 0.4682, "step": 105730 }, { "epoch": 2.3535434472934473, "grad_norm": 0.4967123866081238, "learning_rate": 4.4114906299627934e-05, "loss": 0.427, "step": 105740 }, { "epoch": 2.3537660256410255, "grad_norm": 0.47954392433166504, "learning_rate": 4.408570194022426e-05, "loss": 0.4278, "step": 105750 }, { "epoch": 2.353988603988604, "grad_norm": 0.6673099994659424, "learning_rate": 4.4056506053331224e-05, "loss": 0.4725, "step": 105760 }, { "epoch": 2.3542111823361824, "grad_norm": 0.7563617825508118, "learning_rate": 4.4027318640535267e-05, "loss": 0.4144, "step": 105770 }, { "epoch": 2.3544337606837606, "grad_norm": 0.6754289269447327, "learning_rate": 4.3998139703422544e-05, "loss": 0.5307, "step": 105780 }, { "epoch": 2.354656339031339, "grad_norm": 0.6166078448295593, "learning_rate": 4.396896924357858e-05, "loss": 0.4877, "step": 105790 }, { "epoch": 2.3548789173789175, "grad_norm": 0.6370611190795898, "learning_rate": 4.393980726258855e-05, "loss": 0.5551, "step": 105800 }, { "epoch": 2.3551014957264957, "grad_norm": 0.7064977288246155, "learning_rate": 4.391065376203716e-05, "loss": 0.5666, "step": 105810 }, { "epoch": 2.355324074074074, "grad_norm": 0.5430741906166077, "learning_rate": 4.3881508743508606e-05, "loss": 0.4114, "step": 105820 }, { "epoch": 2.355546652421652, "grad_norm": 0.6265538334846497, "learning_rate": 4.3852372208586665e-05, "loss": 0.574, "step": 105830 }, { "epoch": 2.355769230769231, "grad_norm": 0.31288737058639526, "learning_rate": 4.3823244158854725e-05, "loss": 0.5192, "step": 105840 }, { "epoch": 2.355991809116809, "grad_norm": 0.8239595293998718, "learning_rate": 4.379412459589549e-05, "loss": 0.5916, "step": 105850 }, { "epoch": 2.3562143874643873, "grad_norm": 0.4982580840587616, "learning_rate": 4.3765013521291385e-05, "loss": 0.4481, "step": 105860 }, { "epoch": 2.356436965811966, "grad_norm": 0.43735408782958984, "learning_rate": 4.373591093662437e-05, "loss": 0.5103, "step": 105870 }, { "epoch": 2.356659544159544, "grad_norm": 0.774772047996521, "learning_rate": 4.370681684347586e-05, "loss": 0.4853, "step": 105880 }, { "epoch": 2.3568821225071224, "grad_norm": 0.342276006937027, "learning_rate": 4.367773124342689e-05, "loss": 0.4671, "step": 105890 }, { "epoch": 2.357104700854701, "grad_norm": 0.7277559638023376, "learning_rate": 4.364865413805801e-05, "loss": 0.4475, "step": 105900 }, { "epoch": 2.3573272792022792, "grad_norm": 0.6659511923789978, "learning_rate": 4.361958552894927e-05, "loss": 0.4472, "step": 105910 }, { "epoch": 2.3575498575498575, "grad_norm": 0.6491039395332336, "learning_rate": 4.3590525417680204e-05, "loss": 0.3717, "step": 105920 }, { "epoch": 2.357772435897436, "grad_norm": 0.4370926320552826, "learning_rate": 4.3561473805830045e-05, "loss": 0.3737, "step": 105930 }, { "epoch": 2.3579950142450143, "grad_norm": 0.7258709073066711, "learning_rate": 4.3532430694977454e-05, "loss": 0.4774, "step": 105940 }, { "epoch": 2.3582175925925926, "grad_norm": 0.607122540473938, "learning_rate": 4.350339608670066e-05, "loss": 0.4137, "step": 105950 }, { "epoch": 2.3584401709401708, "grad_norm": 0.6639084815979004, "learning_rate": 4.347436998257746e-05, "loss": 0.5365, "step": 105960 }, { "epoch": 2.3586627492877494, "grad_norm": 0.5172412991523743, "learning_rate": 4.344535238418512e-05, "loss": 0.3785, "step": 105970 }, { "epoch": 2.3588853276353277, "grad_norm": 0.5365101099014282, "learning_rate": 4.3416343293100556e-05, "loss": 0.5023, "step": 105980 }, { "epoch": 2.359107905982906, "grad_norm": 0.5570228099822998, "learning_rate": 4.338734271090001e-05, "loss": 0.4382, "step": 105990 }, { "epoch": 2.359330484330484, "grad_norm": 0.4990769624710083, "learning_rate": 4.335835063915949e-05, "loss": 0.3926, "step": 106000 }, { "epoch": 2.3595530626780628, "grad_norm": 0.5243441462516785, "learning_rate": 4.332936707945443e-05, "loss": 0.5187, "step": 106010 }, { "epoch": 2.359775641025641, "grad_norm": 0.4713497459888458, "learning_rate": 4.3300392033359804e-05, "loss": 0.5061, "step": 106020 }, { "epoch": 2.359998219373219, "grad_norm": 0.6179198026657104, "learning_rate": 4.327142550245018e-05, "loss": 0.5219, "step": 106030 }, { "epoch": 2.360220797720798, "grad_norm": 0.5947954058647156, "learning_rate": 4.3242467488299635e-05, "loss": 0.4931, "step": 106040 }, { "epoch": 2.360443376068376, "grad_norm": 0.4120878279209137, "learning_rate": 4.321351799248172e-05, "loss": 0.5027, "step": 106050 }, { "epoch": 2.3606659544159543, "grad_norm": 0.5536032319068909, "learning_rate": 4.318457701656955e-05, "loss": 0.5718, "step": 106060 }, { "epoch": 2.360888532763533, "grad_norm": 0.7351379990577698, "learning_rate": 4.315564456213585e-05, "loss": 0.3884, "step": 106070 }, { "epoch": 2.361111111111111, "grad_norm": 0.6175481677055359, "learning_rate": 4.3126720630752804e-05, "loss": 0.4066, "step": 106080 }, { "epoch": 2.3613336894586894, "grad_norm": 0.5883055329322815, "learning_rate": 4.3097805223992204e-05, "loss": 0.3736, "step": 106090 }, { "epoch": 2.361556267806268, "grad_norm": 0.5379427075386047, "learning_rate": 4.306889834342529e-05, "loss": 0.5544, "step": 106100 }, { "epoch": 2.3617788461538463, "grad_norm": 0.461297869682312, "learning_rate": 4.303999999062298e-05, "loss": 0.4831, "step": 106110 }, { "epoch": 2.3620014245014245, "grad_norm": 0.7035804986953735, "learning_rate": 4.301111016715551e-05, "loss": 0.4963, "step": 106120 }, { "epoch": 2.3622240028490027, "grad_norm": 0.6022542715072632, "learning_rate": 4.2982228874592824e-05, "loss": 0.5043, "step": 106130 }, { "epoch": 2.3624465811965814, "grad_norm": 0.6967162489891052, "learning_rate": 4.295335611450435e-05, "loss": 0.4758, "step": 106140 }, { "epoch": 2.3626691595441596, "grad_norm": 0.9202632308006287, "learning_rate": 4.2924491888459087e-05, "loss": 0.4852, "step": 106150 }, { "epoch": 2.362891737891738, "grad_norm": 0.605948269367218, "learning_rate": 4.2895636198025524e-05, "loss": 0.4538, "step": 106160 }, { "epoch": 2.363114316239316, "grad_norm": 0.43158113956451416, "learning_rate": 4.286678904477175e-05, "loss": 0.4009, "step": 106170 }, { "epoch": 2.3633368945868947, "grad_norm": 0.37621960043907166, "learning_rate": 4.283795043026524e-05, "loss": 0.301, "step": 106180 }, { "epoch": 2.363559472934473, "grad_norm": 0.5921512246131897, "learning_rate": 4.280912035607321e-05, "loss": 0.4489, "step": 106190 }, { "epoch": 2.363782051282051, "grad_norm": 0.7292236685752869, "learning_rate": 4.2780298823762224e-05, "loss": 0.5285, "step": 106200 }, { "epoch": 2.3640046296296298, "grad_norm": 0.35154908895492554, "learning_rate": 4.275148583489847e-05, "loss": 0.3637, "step": 106210 }, { "epoch": 2.364227207977208, "grad_norm": 0.6260533928871155, "learning_rate": 4.2722681391047734e-05, "loss": 0.4117, "step": 106220 }, { "epoch": 2.364449786324786, "grad_norm": 0.5895659923553467, "learning_rate": 4.269388549377524e-05, "loss": 0.522, "step": 106230 }, { "epoch": 2.364672364672365, "grad_norm": 0.7136719822883606, "learning_rate": 4.266509814464581e-05, "loss": 0.4461, "step": 106240 }, { "epoch": 2.364894943019943, "grad_norm": 0.3097321093082428, "learning_rate": 4.2636319345223764e-05, "loss": 0.4203, "step": 106250 }, { "epoch": 2.3651175213675213, "grad_norm": 0.8221575617790222, "learning_rate": 4.260754909707292e-05, "loss": 0.4898, "step": 106260 }, { "epoch": 2.3653400997151, "grad_norm": 0.613925576210022, "learning_rate": 4.257878740175669e-05, "loss": 0.4007, "step": 106270 }, { "epoch": 2.365562678062678, "grad_norm": 0.5688052773475647, "learning_rate": 4.2550034260838033e-05, "loss": 0.4512, "step": 106280 }, { "epoch": 2.3657852564102564, "grad_norm": 0.7149258852005005, "learning_rate": 4.252128967587941e-05, "loss": 0.4367, "step": 106290 }, { "epoch": 2.3660078347578346, "grad_norm": 0.5755921602249146, "learning_rate": 4.24925536484428e-05, "loss": 0.4022, "step": 106300 }, { "epoch": 2.3662304131054133, "grad_norm": 0.9494009613990784, "learning_rate": 4.246382618008984e-05, "loss": 0.4637, "step": 106310 }, { "epoch": 2.3664529914529915, "grad_norm": 0.6266781091690063, "learning_rate": 4.243510727238147e-05, "loss": 0.4328, "step": 106320 }, { "epoch": 2.3666755698005697, "grad_norm": 0.9528439044952393, "learning_rate": 4.2406396926878423e-05, "loss": 0.5745, "step": 106330 }, { "epoch": 2.366898148148148, "grad_norm": 0.5351330637931824, "learning_rate": 4.2377695145140714e-05, "loss": 0.5367, "step": 106340 }, { "epoch": 2.3671207264957266, "grad_norm": 0.8411067128181458, "learning_rate": 4.23490019287281e-05, "loss": 0.4613, "step": 106350 }, { "epoch": 2.367343304843305, "grad_norm": 0.5001861453056335, "learning_rate": 4.232031727919978e-05, "loss": 0.5383, "step": 106360 }, { "epoch": 2.367565883190883, "grad_norm": 0.6162530779838562, "learning_rate": 4.2291641198114487e-05, "loss": 0.454, "step": 106370 }, { "epoch": 2.3677884615384617, "grad_norm": 0.7169169187545776, "learning_rate": 4.2262973687030536e-05, "loss": 0.4225, "step": 106380 }, { "epoch": 2.36801103988604, "grad_norm": 0.23822492361068726, "learning_rate": 4.2234314747505764e-05, "loss": 0.3515, "step": 106390 }, { "epoch": 2.368233618233618, "grad_norm": 0.5095281004905701, "learning_rate": 4.220566438109743e-05, "loss": 0.5625, "step": 106400 }, { "epoch": 2.3684561965811968, "grad_norm": 0.513222336769104, "learning_rate": 4.217702258936247e-05, "loss": 0.5068, "step": 106410 }, { "epoch": 2.368678774928775, "grad_norm": 0.4777398705482483, "learning_rate": 4.21483893738573e-05, "loss": 0.4588, "step": 106420 }, { "epoch": 2.368901353276353, "grad_norm": 0.765155553817749, "learning_rate": 4.211976473613788e-05, "loss": 0.528, "step": 106430 }, { "epoch": 2.369123931623932, "grad_norm": 0.6391275525093079, "learning_rate": 4.209114867775974e-05, "loss": 0.3648, "step": 106440 }, { "epoch": 2.36934650997151, "grad_norm": 0.44867634773254395, "learning_rate": 4.2062541200277794e-05, "loss": 0.5285, "step": 106450 }, { "epoch": 2.3695690883190883, "grad_norm": 0.6665791869163513, "learning_rate": 4.2033942305246665e-05, "loss": 0.4858, "step": 106460 }, { "epoch": 2.3697916666666665, "grad_norm": 0.4603506922721863, "learning_rate": 4.200535199422049e-05, "loss": 0.4666, "step": 106470 }, { "epoch": 2.370014245014245, "grad_norm": 0.5786306858062744, "learning_rate": 4.197677026875275e-05, "loss": 0.5452, "step": 106480 }, { "epoch": 2.3702368233618234, "grad_norm": 0.4147084355354309, "learning_rate": 4.194819713039668e-05, "loss": 0.467, "step": 106490 }, { "epoch": 2.3704594017094016, "grad_norm": 0.5698099136352539, "learning_rate": 4.191963258070497e-05, "loss": 0.3849, "step": 106500 }, { "epoch": 2.37068198005698, "grad_norm": 0.4112623333930969, "learning_rate": 4.1891076621229844e-05, "loss": 0.4808, "step": 106510 }, { "epoch": 2.3709045584045585, "grad_norm": 0.45197534561157227, "learning_rate": 4.1862529253523053e-05, "loss": 0.5485, "step": 106520 }, { "epoch": 2.3711271367521367, "grad_norm": 0.419298380613327, "learning_rate": 4.183399047913592e-05, "loss": 0.502, "step": 106530 }, { "epoch": 2.371349715099715, "grad_norm": 0.5625624656677246, "learning_rate": 4.1805460299619184e-05, "loss": 0.4655, "step": 106540 }, { "epoch": 2.3715722934472936, "grad_norm": 0.6050288677215576, "learning_rate": 4.1776938716523216e-05, "loss": 0.5006, "step": 106550 }, { "epoch": 2.371794871794872, "grad_norm": 0.4963371157646179, "learning_rate": 4.1748425731397944e-05, "loss": 0.4433, "step": 106560 }, { "epoch": 2.37201745014245, "grad_norm": 0.7071536183357239, "learning_rate": 4.171992134579281e-05, "loss": 0.4348, "step": 106570 }, { "epoch": 2.3722400284900287, "grad_norm": 0.5232623815536499, "learning_rate": 4.169142556125669e-05, "loss": 0.4173, "step": 106580 }, { "epoch": 2.372462606837607, "grad_norm": 0.4696999490261078, "learning_rate": 4.166293837933808e-05, "loss": 0.4365, "step": 106590 }, { "epoch": 2.372685185185185, "grad_norm": 0.7387834787368774, "learning_rate": 4.1634459801585046e-05, "loss": 0.478, "step": 106600 }, { "epoch": 2.372907763532764, "grad_norm": 0.600752055644989, "learning_rate": 4.1605989829545137e-05, "loss": 0.4636, "step": 106610 }, { "epoch": 2.373130341880342, "grad_norm": 0.5822334885597229, "learning_rate": 4.157752846476537e-05, "loss": 0.4002, "step": 106620 }, { "epoch": 2.37335292022792, "grad_norm": 0.557744562625885, "learning_rate": 4.154907570879238e-05, "loss": 0.4866, "step": 106630 }, { "epoch": 2.3735754985754984, "grad_norm": 0.6427473425865173, "learning_rate": 4.152063156317236e-05, "loss": 0.4592, "step": 106640 }, { "epoch": 2.373798076923077, "grad_norm": 0.4670778214931488, "learning_rate": 4.1492196029450934e-05, "loss": 0.4491, "step": 106650 }, { "epoch": 2.3740206552706553, "grad_norm": 0.5701555013656616, "learning_rate": 4.1463769109173354e-05, "loss": 0.5698, "step": 106660 }, { "epoch": 2.3742432336182335, "grad_norm": 0.34977465867996216, "learning_rate": 4.143535080388439e-05, "loss": 0.4207, "step": 106670 }, { "epoch": 2.3744658119658117, "grad_norm": 0.3792286813259125, "learning_rate": 4.1406941115128193e-05, "loss": 0.4499, "step": 106680 }, { "epoch": 2.3746883903133904, "grad_norm": 0.36511972546577454, "learning_rate": 4.137854004444868e-05, "loss": 0.4308, "step": 106690 }, { "epoch": 2.3749109686609686, "grad_norm": 0.67777019739151, "learning_rate": 4.13501475933892e-05, "loss": 0.3832, "step": 106700 }, { "epoch": 2.375133547008547, "grad_norm": 0.44513630867004395, "learning_rate": 4.132176376349251e-05, "loss": 0.5664, "step": 106710 }, { "epoch": 2.3753561253561255, "grad_norm": 0.5617483258247375, "learning_rate": 4.129338855630109e-05, "loss": 0.4931, "step": 106720 }, { "epoch": 2.3755787037037037, "grad_norm": 0.4769580066204071, "learning_rate": 4.126502197335684e-05, "loss": 0.4935, "step": 106730 }, { "epoch": 2.375801282051282, "grad_norm": 0.8353335857391357, "learning_rate": 4.123666401620127e-05, "loss": 0.3768, "step": 106740 }, { "epoch": 2.3760238603988606, "grad_norm": 0.6219033002853394, "learning_rate": 4.120831468637538e-05, "loss": 0.4713, "step": 106750 }, { "epoch": 2.376246438746439, "grad_norm": 0.5097732543945312, "learning_rate": 4.117997398541962e-05, "loss": 0.4277, "step": 106760 }, { "epoch": 2.376469017094017, "grad_norm": 0.5334221720695496, "learning_rate": 4.1151641914874086e-05, "loss": 0.4739, "step": 106770 }, { "epoch": 2.3766915954415953, "grad_norm": 0.5444838404655457, "learning_rate": 4.1123318476278375e-05, "loss": 0.4664, "step": 106780 }, { "epoch": 2.376914173789174, "grad_norm": 0.6150575280189514, "learning_rate": 4.109500367117158e-05, "loss": 0.4434, "step": 106790 }, { "epoch": 2.377136752136752, "grad_norm": 0.44876211881637573, "learning_rate": 4.10666975010924e-05, "loss": 0.4829, "step": 106800 }, { "epoch": 2.3773593304843303, "grad_norm": 0.614982008934021, "learning_rate": 4.103839996757903e-05, "loss": 0.4204, "step": 106810 }, { "epoch": 2.377581908831909, "grad_norm": 0.5357826948165894, "learning_rate": 4.1010111072169076e-05, "loss": 0.4094, "step": 106820 }, { "epoch": 2.3778044871794872, "grad_norm": 0.5355835556983948, "learning_rate": 4.09818308163999e-05, "loss": 0.4143, "step": 106830 }, { "epoch": 2.3780270655270654, "grad_norm": 0.37973734736442566, "learning_rate": 4.095355920180817e-05, "loss": 0.4967, "step": 106840 }, { "epoch": 2.3782496438746437, "grad_norm": 0.46821489930152893, "learning_rate": 4.092529622993022e-05, "loss": 0.5378, "step": 106850 }, { "epoch": 2.3784722222222223, "grad_norm": 0.7494333386421204, "learning_rate": 4.0897041902301905e-05, "loss": 0.5393, "step": 106860 }, { "epoch": 2.3786948005698005, "grad_norm": 0.9595987796783447, "learning_rate": 4.086879622045858e-05, "loss": 0.438, "step": 106870 }, { "epoch": 2.3789173789173788, "grad_norm": 0.5590757131576538, "learning_rate": 4.084055918593515e-05, "loss": 0.424, "step": 106880 }, { "epoch": 2.3791399572649574, "grad_norm": 0.7187809944152832, "learning_rate": 4.0812330800266074e-05, "loss": 0.5109, "step": 106890 }, { "epoch": 2.3793625356125356, "grad_norm": 0.325730562210083, "learning_rate": 4.0784111064985186e-05, "loss": 0.3554, "step": 106900 }, { "epoch": 2.379585113960114, "grad_norm": 0.5590636134147644, "learning_rate": 4.075589998162608e-05, "loss": 0.5206, "step": 106910 }, { "epoch": 2.3798076923076925, "grad_norm": 0.49287861585617065, "learning_rate": 4.07276975517217e-05, "loss": 0.5341, "step": 106920 }, { "epoch": 2.3800302706552707, "grad_norm": 0.5416540503501892, "learning_rate": 4.0699503776804626e-05, "loss": 0.535, "step": 106930 }, { "epoch": 2.380252849002849, "grad_norm": 0.8340360522270203, "learning_rate": 4.0671318658406944e-05, "loss": 0.5092, "step": 106940 }, { "epoch": 2.380475427350427, "grad_norm": 0.7989697456359863, "learning_rate": 4.064314219806027e-05, "loss": 0.4929, "step": 106950 }, { "epoch": 2.380698005698006, "grad_norm": 0.6200940012931824, "learning_rate": 4.0614974397295666e-05, "loss": 0.3981, "step": 106960 }, { "epoch": 2.380920584045584, "grad_norm": 0.4049985408782959, "learning_rate": 4.058681525764389e-05, "loss": 0.5013, "step": 106970 }, { "epoch": 2.3811431623931623, "grad_norm": 0.6641939878463745, "learning_rate": 4.0558664780635014e-05, "loss": 0.4869, "step": 106980 }, { "epoch": 2.381365740740741, "grad_norm": 0.5992461442947388, "learning_rate": 4.053052296779882e-05, "loss": 0.4807, "step": 106990 }, { "epoch": 2.381588319088319, "grad_norm": 0.5013190507888794, "learning_rate": 4.0502389820664544e-05, "loss": 0.3836, "step": 107000 }, { "epoch": 2.3818108974358974, "grad_norm": 0.46175822615623474, "learning_rate": 4.0474265340761e-05, "loss": 0.4792, "step": 107010 }, { "epoch": 2.3820334757834756, "grad_norm": 0.5924819111824036, "learning_rate": 4.044614952961645e-05, "loss": 0.3602, "step": 107020 }, { "epoch": 2.3822560541310542, "grad_norm": 0.6522736549377441, "learning_rate": 4.0418042388758815e-05, "loss": 0.4622, "step": 107030 }, { "epoch": 2.3824786324786325, "grad_norm": 0.7011052370071411, "learning_rate": 4.0389943919715335e-05, "loss": 0.4267, "step": 107040 }, { "epoch": 2.3827012108262107, "grad_norm": 0.6333714723587036, "learning_rate": 4.036185412401297e-05, "loss": 0.4716, "step": 107050 }, { "epoch": 2.3829237891737893, "grad_norm": 0.469301700592041, "learning_rate": 4.033377300317813e-05, "loss": 0.3148, "step": 107060 }, { "epoch": 2.3831463675213675, "grad_norm": 0.7111797332763672, "learning_rate": 4.030570055873679e-05, "loss": 0.51, "step": 107070 }, { "epoch": 2.3833689458689458, "grad_norm": 0.7554060220718384, "learning_rate": 4.027763679221441e-05, "loss": 0.5962, "step": 107080 }, { "epoch": 2.3835915242165244, "grad_norm": 0.5037725567817688, "learning_rate": 4.024958170513604e-05, "loss": 0.4771, "step": 107090 }, { "epoch": 2.3838141025641026, "grad_norm": 0.4343712031841278, "learning_rate": 4.0221535299026195e-05, "loss": 0.4824, "step": 107100 }, { "epoch": 2.384036680911681, "grad_norm": 0.4945704936981201, "learning_rate": 4.019349757540887e-05, "loss": 0.4191, "step": 107110 }, { "epoch": 2.384259259259259, "grad_norm": 0.5049513578414917, "learning_rate": 4.016546853580769e-05, "loss": 0.5334, "step": 107120 }, { "epoch": 2.3844818376068377, "grad_norm": 0.5844504833221436, "learning_rate": 4.0137448181745804e-05, "loss": 0.5095, "step": 107130 }, { "epoch": 2.384704415954416, "grad_norm": 0.7790089845657349, "learning_rate": 4.010943651474586e-05, "loss": 0.5077, "step": 107140 }, { "epoch": 2.384926994301994, "grad_norm": 0.3328200876712799, "learning_rate": 4.008143353633003e-05, "loss": 0.4946, "step": 107150 }, { "epoch": 2.385149572649573, "grad_norm": 0.6973555088043213, "learning_rate": 4.005343924802001e-05, "loss": 0.4411, "step": 107160 }, { "epoch": 2.385372150997151, "grad_norm": 0.5896797180175781, "learning_rate": 4.0025453651337094e-05, "loss": 0.3967, "step": 107170 }, { "epoch": 2.3855947293447293, "grad_norm": 0.6309689283370972, "learning_rate": 3.9997476747801945e-05, "loss": 0.5664, "step": 107180 }, { "epoch": 2.3858173076923075, "grad_norm": 0.6303597688674927, "learning_rate": 3.996950853893488e-05, "loss": 0.448, "step": 107190 }, { "epoch": 2.386039886039886, "grad_norm": 0.5085480809211731, "learning_rate": 3.994154902625573e-05, "loss": 0.5309, "step": 107200 }, { "epoch": 2.3862624643874644, "grad_norm": 0.5023293495178223, "learning_rate": 3.991359821128384e-05, "loss": 0.3437, "step": 107210 }, { "epoch": 2.3864850427350426, "grad_norm": 0.4966486394405365, "learning_rate": 3.9885656095538137e-05, "loss": 0.4056, "step": 107220 }, { "epoch": 2.3867076210826212, "grad_norm": 0.7862597703933716, "learning_rate": 3.985772268053689e-05, "loss": 0.4386, "step": 107230 }, { "epoch": 2.3869301994301995, "grad_norm": 0.4529586136341095, "learning_rate": 3.9829797967798156e-05, "loss": 0.418, "step": 107240 }, { "epoch": 2.3871527777777777, "grad_norm": 0.4913751780986786, "learning_rate": 3.9801881958839274e-05, "loss": 0.3807, "step": 107250 }, { "epoch": 2.3873753561253563, "grad_norm": 0.6710889935493469, "learning_rate": 3.977397465517725e-05, "loss": 0.4312, "step": 107260 }, { "epoch": 2.3875979344729346, "grad_norm": 0.5019149780273438, "learning_rate": 3.974607605832863e-05, "loss": 0.4696, "step": 107270 }, { "epoch": 2.3878205128205128, "grad_norm": 0.6020123958587646, "learning_rate": 3.97181861698094e-05, "loss": 0.4775, "step": 107280 }, { "epoch": 2.388043091168091, "grad_norm": 0.5125402808189392, "learning_rate": 3.969030499113517e-05, "loss": 0.461, "step": 107290 }, { "epoch": 2.3882656695156697, "grad_norm": 0.562674343585968, "learning_rate": 3.9662432523821e-05, "loss": 0.3317, "step": 107300 }, { "epoch": 2.388488247863248, "grad_norm": 0.4142382740974426, "learning_rate": 3.963456876938154e-05, "loss": 0.5279, "step": 107310 }, { "epoch": 2.388710826210826, "grad_norm": 0.5948408246040344, "learning_rate": 3.9606713729330865e-05, "loss": 0.4231, "step": 107320 }, { "epoch": 2.3889334045584047, "grad_norm": 0.4571043848991394, "learning_rate": 3.957886740518266e-05, "loss": 0.56, "step": 107330 }, { "epoch": 2.389155982905983, "grad_norm": 0.641856849193573, "learning_rate": 3.955102979845013e-05, "loss": 0.4425, "step": 107340 }, { "epoch": 2.389378561253561, "grad_norm": 0.6879642009735107, "learning_rate": 3.9523200910645984e-05, "loss": 0.4254, "step": 107350 }, { "epoch": 2.3896011396011394, "grad_norm": 0.6293129920959473, "learning_rate": 3.949538074328254e-05, "loss": 0.4756, "step": 107360 }, { "epoch": 2.389823717948718, "grad_norm": 0.43237632513046265, "learning_rate": 3.946756929787143e-05, "loss": 0.4297, "step": 107370 }, { "epoch": 2.3900462962962963, "grad_norm": 0.6030610203742981, "learning_rate": 3.9439766575924076e-05, "loss": 0.4089, "step": 107380 }, { "epoch": 2.3902688746438745, "grad_norm": 0.48837748169898987, "learning_rate": 3.941197257895122e-05, "loss": 0.5213, "step": 107390 }, { "epoch": 2.390491452991453, "grad_norm": 0.8221680521965027, "learning_rate": 3.938418730846321e-05, "loss": 0.4774, "step": 107400 }, { "epoch": 2.3907140313390314, "grad_norm": 0.4908098876476288, "learning_rate": 3.9356410765969965e-05, "loss": 0.4163, "step": 107410 }, { "epoch": 2.3909366096866096, "grad_norm": 0.7985237836837769, "learning_rate": 3.932864295298084e-05, "loss": 0.5171, "step": 107420 }, { "epoch": 2.3911591880341883, "grad_norm": 0.537944495677948, "learning_rate": 3.9300883871004815e-05, "loss": 0.3917, "step": 107430 }, { "epoch": 2.3913817663817665, "grad_norm": 0.571877121925354, "learning_rate": 3.927313352155031e-05, "loss": 0.4366, "step": 107440 }, { "epoch": 2.3916043447293447, "grad_norm": 0.6063857078552246, "learning_rate": 3.924539190612537e-05, "loss": 0.562, "step": 107450 }, { "epoch": 2.391826923076923, "grad_norm": 0.6366663575172424, "learning_rate": 3.921765902623735e-05, "loss": 0.5283, "step": 107460 }, { "epoch": 2.3920495014245016, "grad_norm": 0.6072849631309509, "learning_rate": 3.91899348833934e-05, "loss": 0.4876, "step": 107470 }, { "epoch": 2.39227207977208, "grad_norm": 0.7319457530975342, "learning_rate": 3.916221947909999e-05, "loss": 0.5053, "step": 107480 }, { "epoch": 2.392494658119658, "grad_norm": 0.5945011973381042, "learning_rate": 3.9134512814863336e-05, "loss": 0.4319, "step": 107490 }, { "epoch": 2.392717236467236, "grad_norm": 0.47057005763053894, "learning_rate": 3.910681489218888e-05, "loss": 0.3741, "step": 107500 }, { "epoch": 2.392939814814815, "grad_norm": 0.3048020601272583, "learning_rate": 3.907912571258181e-05, "loss": 0.4148, "step": 107510 }, { "epoch": 2.393162393162393, "grad_norm": 0.3933880925178528, "learning_rate": 3.9051445277546825e-05, "loss": 0.48, "step": 107520 }, { "epoch": 2.3933849715099713, "grad_norm": 0.6014164090156555, "learning_rate": 3.902377358858802e-05, "loss": 0.3565, "step": 107530 }, { "epoch": 2.39360754985755, "grad_norm": 0.5383347272872925, "learning_rate": 3.899611064720916e-05, "loss": 0.6023, "step": 107540 }, { "epoch": 2.393830128205128, "grad_norm": 0.6487884521484375, "learning_rate": 3.896845645491343e-05, "loss": 0.5128, "step": 107550 }, { "epoch": 2.3940527065527064, "grad_norm": 0.5151840448379517, "learning_rate": 3.894081101320359e-05, "loss": 0.6098, "step": 107560 }, { "epoch": 2.394275284900285, "grad_norm": 0.5944096446037292, "learning_rate": 3.891317432358195e-05, "loss": 0.4749, "step": 107570 }, { "epoch": 2.3944978632478633, "grad_norm": 0.5349414944648743, "learning_rate": 3.888554638755029e-05, "loss": 0.4657, "step": 107580 }, { "epoch": 2.3947204415954415, "grad_norm": 0.6621114611625671, "learning_rate": 3.885792720660999e-05, "loss": 0.388, "step": 107590 }, { "epoch": 2.39494301994302, "grad_norm": 0.4630861282348633, "learning_rate": 3.8830316782261765e-05, "loss": 0.5104, "step": 107600 }, { "epoch": 2.3951655982905984, "grad_norm": 0.3979974091053009, "learning_rate": 3.880271511600608e-05, "loss": 0.3779, "step": 107610 }, { "epoch": 2.3953881766381766, "grad_norm": 0.5173378586769104, "learning_rate": 3.877512220934287e-05, "loss": 0.444, "step": 107620 }, { "epoch": 2.395610754985755, "grad_norm": 0.4683818817138672, "learning_rate": 3.874753806377147e-05, "loss": 0.5606, "step": 107630 }, { "epoch": 2.3958333333333335, "grad_norm": 0.4673824906349182, "learning_rate": 3.871996268079083e-05, "loss": 0.3646, "step": 107640 }, { "epoch": 2.3960559116809117, "grad_norm": 0.5951061248779297, "learning_rate": 3.869239606189947e-05, "loss": 0.5753, "step": 107650 }, { "epoch": 2.39627849002849, "grad_norm": 1.1411864757537842, "learning_rate": 3.866483820859541e-05, "loss": 0.4492, "step": 107660 }, { "epoch": 2.396501068376068, "grad_norm": 0.7248709797859192, "learning_rate": 3.8637289122376054e-05, "loss": 0.4289, "step": 107670 }, { "epoch": 2.396723646723647, "grad_norm": 0.5003977417945862, "learning_rate": 3.860974880473851e-05, "loss": 0.3993, "step": 107680 }, { "epoch": 2.396946225071225, "grad_norm": 0.7477527260780334, "learning_rate": 3.858221725717932e-05, "loss": 0.426, "step": 107690 }, { "epoch": 2.3971688034188032, "grad_norm": 0.5571163892745972, "learning_rate": 3.855469448119462e-05, "loss": 0.3665, "step": 107700 }, { "epoch": 2.397391381766382, "grad_norm": 0.4843011498451233, "learning_rate": 3.852718047827997e-05, "loss": 0.4136, "step": 107710 }, { "epoch": 2.39761396011396, "grad_norm": 0.8067624568939209, "learning_rate": 3.84996752499305e-05, "loss": 0.5301, "step": 107720 }, { "epoch": 2.3978365384615383, "grad_norm": 0.5896735191345215, "learning_rate": 3.847217879764098e-05, "loss": 0.5473, "step": 107730 }, { "epoch": 2.398059116809117, "grad_norm": 0.4599475562572479, "learning_rate": 3.8444691122905406e-05, "loss": 0.4437, "step": 107740 }, { "epoch": 2.398281695156695, "grad_norm": 0.6119899749755859, "learning_rate": 3.841721222721766e-05, "loss": 0.4372, "step": 107750 }, { "epoch": 2.3985042735042734, "grad_norm": 0.507407546043396, "learning_rate": 3.8389742112070805e-05, "loss": 0.4957, "step": 107760 }, { "epoch": 2.398726851851852, "grad_norm": 0.5270352959632874, "learning_rate": 3.836228077895765e-05, "loss": 0.4259, "step": 107770 }, { "epoch": 2.3989494301994303, "grad_norm": 0.6012681126594543, "learning_rate": 3.833482822937051e-05, "loss": 0.5108, "step": 107780 }, { "epoch": 2.3991720085470085, "grad_norm": 0.6297083497047424, "learning_rate": 3.830738446480113e-05, "loss": 0.4953, "step": 107790 }, { "epoch": 2.3993945868945867, "grad_norm": 0.4202558994293213, "learning_rate": 3.827994948674092e-05, "loss": 0.3991, "step": 107800 }, { "epoch": 2.3996171652421654, "grad_norm": 0.43911489844322205, "learning_rate": 3.8252523296680564e-05, "loss": 0.4951, "step": 107810 }, { "epoch": 2.3998397435897436, "grad_norm": 0.5340352058410645, "learning_rate": 3.8225105896110525e-05, "loss": 0.4703, "step": 107820 }, { "epoch": 2.400062321937322, "grad_norm": 0.8210797905921936, "learning_rate": 3.819769728652065e-05, "loss": 0.466, "step": 107830 }, { "epoch": 2.4002849002849, "grad_norm": 0.7431703805923462, "learning_rate": 3.817029746940037e-05, "loss": 0.5111, "step": 107840 }, { "epoch": 2.4002849002849, "eval_loss": 0.5277159810066223, "eval_runtime": 337.619, "eval_samples_per_second": 7.005, "eval_steps_per_second": 7.005, "step": 107840 }, { "epoch": 2.4005074786324787, "grad_norm": 0.6892507672309875, "learning_rate": 3.8142906446238614e-05, "loss": 0.4169, "step": 107850 }, { "epoch": 2.400730056980057, "grad_norm": 0.5865117311477661, "learning_rate": 3.8115524218523865e-05, "loss": 0.4058, "step": 107860 }, { "epoch": 2.400952635327635, "grad_norm": 0.41831985116004944, "learning_rate": 3.808815078774402e-05, "loss": 0.4328, "step": 107870 }, { "epoch": 2.401175213675214, "grad_norm": 0.36568325757980347, "learning_rate": 3.8060786155386644e-05, "loss": 0.5102, "step": 107880 }, { "epoch": 2.401397792022792, "grad_norm": 0.7594850063323975, "learning_rate": 3.8033430322938666e-05, "loss": 0.4744, "step": 107890 }, { "epoch": 2.4016203703703702, "grad_norm": 0.5580376386642456, "learning_rate": 3.800608329188668e-05, "loss": 0.578, "step": 107900 }, { "epoch": 2.401842948717949, "grad_norm": 0.7795330882072449, "learning_rate": 3.7978745063716745e-05, "loss": 0.3751, "step": 107910 }, { "epoch": 2.402065527065527, "grad_norm": 0.6691529750823975, "learning_rate": 3.7951415639914443e-05, "loss": 0.5008, "step": 107920 }, { "epoch": 2.4022881054131053, "grad_norm": 0.6297852993011475, "learning_rate": 3.792409502196488e-05, "loss": 0.4381, "step": 107930 }, { "epoch": 2.402510683760684, "grad_norm": 0.46467840671539307, "learning_rate": 3.7896783211352704e-05, "loss": 0.5051, "step": 107940 }, { "epoch": 2.402733262108262, "grad_norm": 0.4547869563102722, "learning_rate": 3.7869480209562e-05, "loss": 0.4391, "step": 107950 }, { "epoch": 2.4029558404558404, "grad_norm": 0.7416006326675415, "learning_rate": 3.784218601807645e-05, "loss": 0.5118, "step": 107960 }, { "epoch": 2.4031784188034186, "grad_norm": 0.4923880696296692, "learning_rate": 3.781490063837927e-05, "loss": 0.4884, "step": 107970 }, { "epoch": 2.4034009971509973, "grad_norm": 0.38714689016342163, "learning_rate": 3.7787624071953175e-05, "loss": 0.3763, "step": 107980 }, { "epoch": 2.4036235754985755, "grad_norm": 0.5982674956321716, "learning_rate": 3.776035632028037e-05, "loss": 0.3924, "step": 107990 }, { "epoch": 2.4038461538461537, "grad_norm": 0.4063017964363098, "learning_rate": 3.7733097384842655e-05, "loss": 0.3274, "step": 108000 }, { "epoch": 2.404068732193732, "grad_norm": 0.34237897396087646, "learning_rate": 3.770584726712123e-05, "loss": 0.3434, "step": 108010 }, { "epoch": 2.4042913105413106, "grad_norm": 1.1153937578201294, "learning_rate": 3.767860596859696e-05, "loss": 0.4604, "step": 108020 }, { "epoch": 2.404513888888889, "grad_norm": 0.7363869547843933, "learning_rate": 3.7651373490750096e-05, "loss": 0.4872, "step": 108030 }, { "epoch": 2.404736467236467, "grad_norm": 0.404489666223526, "learning_rate": 3.762414983506049e-05, "loss": 0.4847, "step": 108040 }, { "epoch": 2.4049590455840457, "grad_norm": 0.38278475403785706, "learning_rate": 3.75969350030075e-05, "loss": 0.4051, "step": 108050 }, { "epoch": 2.405181623931624, "grad_norm": 0.5841982364654541, "learning_rate": 3.756972899607003e-05, "loss": 0.4173, "step": 108060 }, { "epoch": 2.405404202279202, "grad_norm": 0.40750545263290405, "learning_rate": 3.754253181572645e-05, "loss": 0.4454, "step": 108070 }, { "epoch": 2.405626780626781, "grad_norm": 0.6064510345458984, "learning_rate": 3.7515343463454735e-05, "loss": 0.5399, "step": 108080 }, { "epoch": 2.405849358974359, "grad_norm": 0.3883700966835022, "learning_rate": 3.748816394073222e-05, "loss": 0.4375, "step": 108090 }, { "epoch": 2.4060719373219372, "grad_norm": 0.36801886558532715, "learning_rate": 3.746099324903591e-05, "loss": 0.3955, "step": 108100 }, { "epoch": 2.406294515669516, "grad_norm": 0.743415355682373, "learning_rate": 3.743383138984229e-05, "loss": 0.5687, "step": 108110 }, { "epoch": 2.406517094017094, "grad_norm": 0.5999566316604614, "learning_rate": 3.740667836462737e-05, "loss": 0.5204, "step": 108120 }, { "epoch": 2.4067396723646723, "grad_norm": 0.5922669172286987, "learning_rate": 3.7379534174866635e-05, "loss": 0.3788, "step": 108130 }, { "epoch": 2.4069622507122506, "grad_norm": 0.6521261930465698, "learning_rate": 3.735239882203518e-05, "loss": 0.415, "step": 108140 }, { "epoch": 2.4071848290598292, "grad_norm": 0.6018521189689636, "learning_rate": 3.732527230760749e-05, "loss": 0.3698, "step": 108150 }, { "epoch": 2.4074074074074074, "grad_norm": 0.7880030274391174, "learning_rate": 3.729815463305772e-05, "loss": 0.4128, "step": 108160 }, { "epoch": 2.4076299857549857, "grad_norm": 0.6043182015419006, "learning_rate": 3.7271045799859384e-05, "loss": 0.4286, "step": 108170 }, { "epoch": 2.407852564102564, "grad_norm": 0.6066815853118896, "learning_rate": 3.7243945809485624e-05, "loss": 0.3493, "step": 108180 }, { "epoch": 2.4080751424501425, "grad_norm": 0.6289936304092407, "learning_rate": 3.721685466340909e-05, "loss": 0.542, "step": 108190 }, { "epoch": 2.4082977207977208, "grad_norm": 0.6707183122634888, "learning_rate": 3.718977236310195e-05, "loss": 0.4285, "step": 108200 }, { "epoch": 2.408520299145299, "grad_norm": 0.5168840289115906, "learning_rate": 3.716269891003583e-05, "loss": 0.4894, "step": 108210 }, { "epoch": 2.4087428774928776, "grad_norm": 0.522630512714386, "learning_rate": 3.713563430568203e-05, "loss": 0.395, "step": 108220 }, { "epoch": 2.408965455840456, "grad_norm": 0.5039717555046082, "learning_rate": 3.710857855151113e-05, "loss": 0.5266, "step": 108230 }, { "epoch": 2.409188034188034, "grad_norm": 0.5010952353477478, "learning_rate": 3.708153164899342e-05, "loss": 0.473, "step": 108240 }, { "epoch": 2.4094106125356127, "grad_norm": 0.7485960125923157, "learning_rate": 3.705449359959865e-05, "loss": 0.5431, "step": 108250 }, { "epoch": 2.409633190883191, "grad_norm": 0.6157149076461792, "learning_rate": 3.7027464404796096e-05, "loss": 0.4947, "step": 108260 }, { "epoch": 2.409855769230769, "grad_norm": 0.7727802991867065, "learning_rate": 3.700044406605458e-05, "loss": 0.4065, "step": 108270 }, { "epoch": 2.410078347578348, "grad_norm": 0.48511838912963867, "learning_rate": 3.6973432584842337e-05, "loss": 0.3984, "step": 108280 }, { "epoch": 2.410300925925926, "grad_norm": 0.7944969534873962, "learning_rate": 3.6946429962627224e-05, "loss": 0.5269, "step": 108290 }, { "epoch": 2.4105235042735043, "grad_norm": 0.6611111760139465, "learning_rate": 3.691943620087663e-05, "loss": 0.4587, "step": 108300 }, { "epoch": 2.4107460826210825, "grad_norm": 0.5456627607345581, "learning_rate": 3.689245130105734e-05, "loss": 0.3479, "step": 108310 }, { "epoch": 2.410968660968661, "grad_norm": 0.5392328500747681, "learning_rate": 3.686547526463575e-05, "loss": 0.4299, "step": 108320 }, { "epoch": 2.4111912393162394, "grad_norm": 0.3954230546951294, "learning_rate": 3.6838508093077806e-05, "loss": 0.4817, "step": 108330 }, { "epoch": 2.4114138176638176, "grad_norm": 0.5684217810630798, "learning_rate": 3.6811549787848884e-05, "loss": 0.3802, "step": 108340 }, { "epoch": 2.411636396011396, "grad_norm": 0.6959905624389648, "learning_rate": 3.678460035041395e-05, "loss": 0.5219, "step": 108350 }, { "epoch": 2.4118589743589745, "grad_norm": 0.5525913238525391, "learning_rate": 3.6757659782237505e-05, "loss": 0.4986, "step": 108360 }, { "epoch": 2.4120815527065527, "grad_norm": 0.643782913684845, "learning_rate": 3.67307280847834e-05, "loss": 0.419, "step": 108370 }, { "epoch": 2.412304131054131, "grad_norm": 0.4591728448867798, "learning_rate": 3.67038052595152e-05, "loss": 0.3671, "step": 108380 }, { "epoch": 2.4125267094017095, "grad_norm": 0.45649808645248413, "learning_rate": 3.667689130789589e-05, "loss": 0.496, "step": 108390 }, { "epoch": 2.4127492877492878, "grad_norm": 0.49152353405952454, "learning_rate": 3.664998623138807e-05, "loss": 0.4469, "step": 108400 }, { "epoch": 2.412971866096866, "grad_norm": 0.6421986222267151, "learning_rate": 3.662309003145366e-05, "loss": 0.5647, "step": 108410 }, { "epoch": 2.4131944444444446, "grad_norm": 0.5374763607978821, "learning_rate": 3.659620270955428e-05, "loss": 0.3204, "step": 108420 }, { "epoch": 2.413417022792023, "grad_norm": 0.5448712110519409, "learning_rate": 3.656932426715103e-05, "loss": 0.5383, "step": 108430 }, { "epoch": 2.413639601139601, "grad_norm": 0.5975387692451477, "learning_rate": 3.654245470570454e-05, "loss": 0.3897, "step": 108440 }, { "epoch": 2.4138621794871793, "grad_norm": 0.3369348347187042, "learning_rate": 3.651559402667481e-05, "loss": 0.5226, "step": 108450 }, { "epoch": 2.414084757834758, "grad_norm": 0.6690844297409058, "learning_rate": 3.6488742231521545e-05, "loss": 0.5524, "step": 108460 }, { "epoch": 2.414307336182336, "grad_norm": 0.6922747492790222, "learning_rate": 3.6461899321703894e-05, "loss": 0.3865, "step": 108470 }, { "epoch": 2.4145299145299144, "grad_norm": 0.4084574580192566, "learning_rate": 3.6435065298680504e-05, "loss": 0.4244, "step": 108480 }, { "epoch": 2.414752492877493, "grad_norm": 0.37243160605430603, "learning_rate": 3.640824016390956e-05, "loss": 0.4692, "step": 108490 }, { "epoch": 2.4149750712250713, "grad_norm": 0.781517505645752, "learning_rate": 3.6381423918848825e-05, "loss": 0.5295, "step": 108500 }, { "epoch": 2.4151976495726495, "grad_norm": 0.5566474795341492, "learning_rate": 3.63546165649554e-05, "loss": 0.475, "step": 108510 }, { "epoch": 2.4154202279202277, "grad_norm": 0.45444151759147644, "learning_rate": 3.6327818103686086e-05, "loss": 0.4576, "step": 108520 }, { "epoch": 2.4156428062678064, "grad_norm": 0.5889346599578857, "learning_rate": 3.63010285364971e-05, "loss": 0.3677, "step": 108530 }, { "epoch": 2.4158653846153846, "grad_norm": 0.4560697078704834, "learning_rate": 3.627424786484432e-05, "loss": 0.3781, "step": 108540 }, { "epoch": 2.416087962962963, "grad_norm": 0.7297571897506714, "learning_rate": 3.624747609018289e-05, "loss": 0.4507, "step": 108550 }, { "epoch": 2.4163105413105415, "grad_norm": 0.6629948616027832, "learning_rate": 3.622071321396763e-05, "loss": 0.4697, "step": 108560 }, { "epoch": 2.4165331196581197, "grad_norm": 0.45788052678108215, "learning_rate": 3.619395923765292e-05, "loss": 0.3583, "step": 108570 }, { "epoch": 2.416755698005698, "grad_norm": 0.3835255801677704, "learning_rate": 3.61672141626926e-05, "loss": 0.473, "step": 108580 }, { "epoch": 2.4169782763532766, "grad_norm": 0.5667798519134521, "learning_rate": 3.614047799053995e-05, "loss": 0.3157, "step": 108590 }, { "epoch": 2.4172008547008548, "grad_norm": 0.37759262323379517, "learning_rate": 3.611375072264784e-05, "loss": 0.3444, "step": 108600 }, { "epoch": 2.417423433048433, "grad_norm": 0.5107159614562988, "learning_rate": 3.6087032360468684e-05, "loss": 0.4706, "step": 108610 }, { "epoch": 2.417646011396011, "grad_norm": 0.5266587734222412, "learning_rate": 3.606032290545438e-05, "loss": 0.4191, "step": 108620 }, { "epoch": 2.41786858974359, "grad_norm": 0.6173784732818604, "learning_rate": 3.603362235905634e-05, "loss": 0.4651, "step": 108630 }, { "epoch": 2.418091168091168, "grad_norm": 0.6883417367935181, "learning_rate": 3.600693072272554e-05, "loss": 0.3607, "step": 108640 }, { "epoch": 2.4183137464387463, "grad_norm": 0.5021166205406189, "learning_rate": 3.598024799791233e-05, "loss": 0.4198, "step": 108650 }, { "epoch": 2.418536324786325, "grad_norm": 0.5681913495063782, "learning_rate": 3.595357418606671e-05, "loss": 0.473, "step": 108660 }, { "epoch": 2.418758903133903, "grad_norm": 0.53594571352005, "learning_rate": 3.592690928863822e-05, "loss": 0.4205, "step": 108670 }, { "epoch": 2.4189814814814814, "grad_norm": 0.8201348185539246, "learning_rate": 3.590025330707574e-05, "loss": 0.3999, "step": 108680 }, { "epoch": 2.4192040598290596, "grad_norm": 0.5095071196556091, "learning_rate": 3.587360624282783e-05, "loss": 0.4599, "step": 108690 }, { "epoch": 2.4194266381766383, "grad_norm": 0.745123565196991, "learning_rate": 3.5846968097342534e-05, "loss": 0.5154, "step": 108700 }, { "epoch": 2.4196492165242165, "grad_norm": 0.4678294062614441, "learning_rate": 3.5820338872067417e-05, "loss": 0.3351, "step": 108710 }, { "epoch": 2.4198717948717947, "grad_norm": 0.7734829783439636, "learning_rate": 3.579371856844942e-05, "loss": 0.4579, "step": 108720 }, { "epoch": 2.4200943732193734, "grad_norm": 0.5143256783485413, "learning_rate": 3.576710718793519e-05, "loss": 0.4507, "step": 108730 }, { "epoch": 2.4203169515669516, "grad_norm": 0.39224982261657715, "learning_rate": 3.574050473197081e-05, "loss": 0.3424, "step": 108740 }, { "epoch": 2.42053952991453, "grad_norm": 0.6342445611953735, "learning_rate": 3.571391120200187e-05, "loss": 0.4639, "step": 108750 }, { "epoch": 2.4207621082621085, "grad_norm": 0.5755688548088074, "learning_rate": 3.568732659947349e-05, "loss": 0.3544, "step": 108760 }, { "epoch": 2.4209846866096867, "grad_norm": 0.5118335485458374, "learning_rate": 3.56607509258303e-05, "loss": 0.4368, "step": 108770 }, { "epoch": 2.421207264957265, "grad_norm": 0.6136965155601501, "learning_rate": 3.563418418251647e-05, "loss": 0.5165, "step": 108780 }, { "epoch": 2.421429843304843, "grad_norm": 0.6362696290016174, "learning_rate": 3.560762637097559e-05, "loss": 0.4246, "step": 108790 }, { "epoch": 2.421652421652422, "grad_norm": 0.6194811463356018, "learning_rate": 3.558107749265092e-05, "loss": 0.4304, "step": 108800 }, { "epoch": 2.421875, "grad_norm": 0.6822486519813538, "learning_rate": 3.555453754898506e-05, "loss": 0.5694, "step": 108810 }, { "epoch": 2.422097578347578, "grad_norm": 0.6611093878746033, "learning_rate": 3.5528006541420233e-05, "loss": 0.4672, "step": 108820 }, { "epoch": 2.422320156695157, "grad_norm": 0.531535804271698, "learning_rate": 3.5501484471398164e-05, "loss": 0.5314, "step": 108830 }, { "epoch": 2.422542735042735, "grad_norm": 0.7303971648216248, "learning_rate": 3.547497134036011e-05, "loss": 0.473, "step": 108840 }, { "epoch": 2.4227653133903133, "grad_norm": 0.6678654551506042, "learning_rate": 3.5448467149746854e-05, "loss": 0.4744, "step": 108850 }, { "epoch": 2.4229878917378915, "grad_norm": 0.6118271350860596, "learning_rate": 3.542197190099854e-05, "loss": 0.4503, "step": 108860 }, { "epoch": 2.42321047008547, "grad_norm": 0.6433374881744385, "learning_rate": 3.5395485595555014e-05, "loss": 0.5033, "step": 108870 }, { "epoch": 2.4234330484330484, "grad_norm": 0.802843451499939, "learning_rate": 3.536900823485554e-05, "loss": 0.5305, "step": 108880 }, { "epoch": 2.4236556267806266, "grad_norm": 0.6826706528663635, "learning_rate": 3.534253982033895e-05, "loss": 0.5862, "step": 108890 }, { "epoch": 2.4238782051282053, "grad_norm": 0.6310650706291199, "learning_rate": 3.5316080353443516e-05, "loss": 0.6473, "step": 108900 }, { "epoch": 2.4241007834757835, "grad_norm": 0.7067840099334717, "learning_rate": 3.528962983560711e-05, "loss": 0.5158, "step": 108910 }, { "epoch": 2.4243233618233617, "grad_norm": 0.642220139503479, "learning_rate": 3.526318826826711e-05, "loss": 0.5143, "step": 108920 }, { "epoch": 2.4245459401709404, "grad_norm": 0.5776522755622864, "learning_rate": 3.523675565286031e-05, "loss": 0.4242, "step": 108930 }, { "epoch": 2.4247685185185186, "grad_norm": 0.6654059886932373, "learning_rate": 3.521033199082304e-05, "loss": 0.5211, "step": 108940 }, { "epoch": 2.424991096866097, "grad_norm": 0.7176336050033569, "learning_rate": 3.5183917283591225e-05, "loss": 0.4434, "step": 108950 }, { "epoch": 2.425213675213675, "grad_norm": 0.5302131772041321, "learning_rate": 3.515751153260027e-05, "loss": 0.3175, "step": 108960 }, { "epoch": 2.4254362535612537, "grad_norm": 0.6184340119361877, "learning_rate": 3.5131114739285096e-05, "loss": 0.442, "step": 108970 }, { "epoch": 2.425658831908832, "grad_norm": 0.6138894557952881, "learning_rate": 3.510472690508011e-05, "loss": 0.5474, "step": 108980 }, { "epoch": 2.42588141025641, "grad_norm": 0.6392364501953125, "learning_rate": 3.5078348031419316e-05, "loss": 0.421, "step": 108990 }, { "epoch": 2.426103988603989, "grad_norm": 0.5919948816299438, "learning_rate": 3.505197811973604e-05, "loss": 0.3428, "step": 109000 }, { "epoch": 2.426326566951567, "grad_norm": 0.6280859708786011, "learning_rate": 3.502561717146331e-05, "loss": 0.4399, "step": 109010 }, { "epoch": 2.4265491452991452, "grad_norm": 0.610748291015625, "learning_rate": 3.499926518803358e-05, "loss": 0.4614, "step": 109020 }, { "epoch": 2.4267717236467234, "grad_norm": 0.48234373331069946, "learning_rate": 3.497292217087889e-05, "loss": 0.4259, "step": 109030 }, { "epoch": 2.426994301994302, "grad_norm": 0.7132697105407715, "learning_rate": 3.494658812143068e-05, "loss": 0.4475, "step": 109040 }, { "epoch": 2.4272168803418803, "grad_norm": 0.5572307109832764, "learning_rate": 3.492026304111999e-05, "loss": 0.5108, "step": 109050 }, { "epoch": 2.4274394586894585, "grad_norm": 0.7285090684890747, "learning_rate": 3.48939469313774e-05, "loss": 0.5296, "step": 109060 }, { "epoch": 2.427662037037037, "grad_norm": 0.5368955731391907, "learning_rate": 3.48676397936329e-05, "loss": 0.4226, "step": 109070 }, { "epoch": 2.4278846153846154, "grad_norm": 0.5352765321731567, "learning_rate": 3.484134162931598e-05, "loss": 0.3651, "step": 109080 }, { "epoch": 2.4281071937321936, "grad_norm": 0.675270676612854, "learning_rate": 3.4815052439855766e-05, "loss": 0.4504, "step": 109090 }, { "epoch": 2.4283297720797723, "grad_norm": 0.7123696208000183, "learning_rate": 3.478877222668084e-05, "loss": 0.5784, "step": 109100 }, { "epoch": 2.4285523504273505, "grad_norm": 0.4823485314846039, "learning_rate": 3.476250099121927e-05, "loss": 0.3552, "step": 109110 }, { "epoch": 2.4287749287749287, "grad_norm": 0.6309967637062073, "learning_rate": 3.4736238734898665e-05, "loss": 0.4613, "step": 109120 }, { "epoch": 2.428997507122507, "grad_norm": 0.6412575840950012, "learning_rate": 3.4709985459146186e-05, "loss": 0.3747, "step": 109130 }, { "epoch": 2.4292200854700856, "grad_norm": 0.6632343530654907, "learning_rate": 3.4683741165388374e-05, "loss": 0.4115, "step": 109140 }, { "epoch": 2.429442663817664, "grad_norm": 0.5699517130851746, "learning_rate": 3.4657505855051386e-05, "loss": 0.4816, "step": 109150 }, { "epoch": 2.429665242165242, "grad_norm": 0.7767205238342285, "learning_rate": 3.463127952956089e-05, "loss": 0.432, "step": 109160 }, { "epoch": 2.4298878205128207, "grad_norm": 0.7094178795814514, "learning_rate": 3.460506219034203e-05, "loss": 0.4014, "step": 109170 }, { "epoch": 2.430110398860399, "grad_norm": 0.5864173769950867, "learning_rate": 3.457885383881949e-05, "loss": 0.4635, "step": 109180 }, { "epoch": 2.430332977207977, "grad_norm": 0.5907089114189148, "learning_rate": 3.4552654476417536e-05, "loss": 0.5184, "step": 109190 }, { "epoch": 2.4305555555555554, "grad_norm": 0.677842915058136, "learning_rate": 3.452646410455969e-05, "loss": 0.4403, "step": 109200 }, { "epoch": 2.430778133903134, "grad_norm": 0.3627990782260895, "learning_rate": 3.450028272466932e-05, "loss": 0.6292, "step": 109210 }, { "epoch": 2.4310007122507122, "grad_norm": 0.6533386707305908, "learning_rate": 3.447411033816901e-05, "loss": 0.459, "step": 109220 }, { "epoch": 2.4312232905982905, "grad_norm": 0.5333442687988281, "learning_rate": 3.444794694648106e-05, "loss": 0.4785, "step": 109230 }, { "epoch": 2.431445868945869, "grad_norm": 1.0702968835830688, "learning_rate": 3.44217925510272e-05, "loss": 0.4997, "step": 109240 }, { "epoch": 2.4316684472934473, "grad_norm": 0.6150128841400146, "learning_rate": 3.439564715322867e-05, "loss": 0.5079, "step": 109250 }, { "epoch": 2.4318910256410255, "grad_norm": 0.8526206612586975, "learning_rate": 3.436951075450625e-05, "loss": 0.5689, "step": 109260 }, { "epoch": 2.432113603988604, "grad_norm": 0.2673870921134949, "learning_rate": 3.4343383356280246e-05, "loss": 0.4312, "step": 109270 }, { "epoch": 2.4323361823361824, "grad_norm": 0.4728868305683136, "learning_rate": 3.431726495997036e-05, "loss": 0.4545, "step": 109280 }, { "epoch": 2.4325587606837606, "grad_norm": 0.5523426532745361, "learning_rate": 3.429115556699594e-05, "loss": 0.5015, "step": 109290 }, { "epoch": 2.432781339031339, "grad_norm": 0.4013436734676361, "learning_rate": 3.4265055178775785e-05, "loss": 0.4636, "step": 109300 }, { "epoch": 2.4330039173789175, "grad_norm": 0.41342243552207947, "learning_rate": 3.42389637967282e-05, "loss": 0.4356, "step": 109310 }, { "epoch": 2.4332264957264957, "grad_norm": 0.5798662900924683, "learning_rate": 3.421288142227106e-05, "loss": 0.4689, "step": 109320 }, { "epoch": 2.433449074074074, "grad_norm": 0.6059615015983582, "learning_rate": 3.418680805682162e-05, "loss": 0.4677, "step": 109330 }, { "epoch": 2.433671652421652, "grad_norm": 0.2866314649581909, "learning_rate": 3.416074370179678e-05, "loss": 0.4174, "step": 109340 }, { "epoch": 2.433894230769231, "grad_norm": 0.3867923319339752, "learning_rate": 3.413468835861293e-05, "loss": 0.3449, "step": 109350 }, { "epoch": 2.434116809116809, "grad_norm": 0.5961967706680298, "learning_rate": 3.4108642028685864e-05, "loss": 0.4518, "step": 109360 }, { "epoch": 2.4343393874643873, "grad_norm": 0.6358778476715088, "learning_rate": 3.4082604713430985e-05, "loss": 0.4875, "step": 109370 }, { "epoch": 2.434561965811966, "grad_norm": 0.7603932619094849, "learning_rate": 3.4056576414263184e-05, "loss": 0.4939, "step": 109380 }, { "epoch": 2.434784544159544, "grad_norm": 0.9518799781799316, "learning_rate": 3.4030557132596884e-05, "loss": 0.5118, "step": 109390 }, { "epoch": 2.4350071225071224, "grad_norm": 0.8648528456687927, "learning_rate": 3.400454686984595e-05, "loss": 0.4375, "step": 109400 }, { "epoch": 2.435229700854701, "grad_norm": 0.46407267451286316, "learning_rate": 3.397854562742391e-05, "loss": 0.4906, "step": 109410 }, { "epoch": 2.4354522792022792, "grad_norm": 0.6812346577644348, "learning_rate": 3.395255340674355e-05, "loss": 0.3656, "step": 109420 }, { "epoch": 2.4356748575498575, "grad_norm": 0.6520447731018066, "learning_rate": 3.392657020921737e-05, "loss": 0.5176, "step": 109430 }, { "epoch": 2.435897435897436, "grad_norm": 0.6260280013084412, "learning_rate": 3.390059603625733e-05, "loss": 0.4418, "step": 109440 }, { "epoch": 2.4361200142450143, "grad_norm": 0.4960933327674866, "learning_rate": 3.387463088927492e-05, "loss": 0.4674, "step": 109450 }, { "epoch": 2.4363425925925926, "grad_norm": 0.5193760395050049, "learning_rate": 3.384867476968101e-05, "loss": 0.4425, "step": 109460 }, { "epoch": 2.4365651709401708, "grad_norm": 0.5269632935523987, "learning_rate": 3.3822727678886124e-05, "loss": 0.4172, "step": 109470 }, { "epoch": 2.4367877492877494, "grad_norm": 0.847545325756073, "learning_rate": 3.379678961830026e-05, "loss": 0.5111, "step": 109480 }, { "epoch": 2.4370103276353277, "grad_norm": 0.7173821926116943, "learning_rate": 3.377086058933297e-05, "loss": 0.4615, "step": 109490 }, { "epoch": 2.437232905982906, "grad_norm": 0.4638817608356476, "learning_rate": 3.3744940593393125e-05, "loss": 0.4021, "step": 109500 }, { "epoch": 2.437455484330484, "grad_norm": 0.48426464200019836, "learning_rate": 3.371902963188933e-05, "loss": 0.4959, "step": 109510 }, { "epoch": 2.4376780626780628, "grad_norm": 0.4158429205417633, "learning_rate": 3.369312770622959e-05, "loss": 0.5318, "step": 109520 }, { "epoch": 2.437900641025641, "grad_norm": 0.6626630425453186, "learning_rate": 3.366723481782141e-05, "loss": 0.4656, "step": 109530 }, { "epoch": 2.438123219373219, "grad_norm": 0.39689040184020996, "learning_rate": 3.3641350968071885e-05, "loss": 0.5204, "step": 109540 }, { "epoch": 2.438345797720798, "grad_norm": 0.7641111016273499, "learning_rate": 3.361547615838758e-05, "loss": 0.5227, "step": 109550 }, { "epoch": 2.438568376068376, "grad_norm": 0.48284536600112915, "learning_rate": 3.358961039017445e-05, "loss": 0.4339, "step": 109560 }, { "epoch": 2.4387909544159543, "grad_norm": 0.7062486410140991, "learning_rate": 3.356375366483813e-05, "loss": 0.5128, "step": 109570 }, { "epoch": 2.439013532763533, "grad_norm": 0.5849929451942444, "learning_rate": 3.353790598378368e-05, "loss": 0.4973, "step": 109580 }, { "epoch": 2.439236111111111, "grad_norm": 0.5276649594306946, "learning_rate": 3.3512067348415744e-05, "loss": 0.3899, "step": 109590 }, { "epoch": 2.4394586894586894, "grad_norm": 0.4740630090236664, "learning_rate": 3.34862377601383e-05, "loss": 0.3603, "step": 109600 }, { "epoch": 2.439681267806268, "grad_norm": 0.6667687892913818, "learning_rate": 3.346041722035502e-05, "loss": 0.4622, "step": 109610 }, { "epoch": 2.4399038461538463, "grad_norm": 0.6276548504829407, "learning_rate": 3.343460573046902e-05, "loss": 0.4644, "step": 109620 }, { "epoch": 2.4401264245014245, "grad_norm": 0.5454540848731995, "learning_rate": 3.340880329188294e-05, "loss": 0.5998, "step": 109630 }, { "epoch": 2.4403490028490027, "grad_norm": 0.6678075790405273, "learning_rate": 3.338300990599881e-05, "loss": 0.4705, "step": 109640 }, { "epoch": 2.4405715811965814, "grad_norm": 0.7437434792518616, "learning_rate": 3.335722557421832e-05, "loss": 0.5217, "step": 109650 }, { "epoch": 2.4407941595441596, "grad_norm": 0.7812333703041077, "learning_rate": 3.333145029794262e-05, "loss": 0.4913, "step": 109660 }, { "epoch": 2.441016737891738, "grad_norm": 0.9800385236740112, "learning_rate": 3.330568407857235e-05, "loss": 0.4852, "step": 109670 }, { "epoch": 2.441239316239316, "grad_norm": 0.47481921315193176, "learning_rate": 3.327992691750768e-05, "loss": 0.4849, "step": 109680 }, { "epoch": 2.4414618945868947, "grad_norm": 0.6780035495758057, "learning_rate": 3.3254178816148294e-05, "loss": 0.5307, "step": 109690 }, { "epoch": 2.441684472934473, "grad_norm": 0.5398222804069519, "learning_rate": 3.3228439775893295e-05, "loss": 0.3707, "step": 109700 }, { "epoch": 2.441907051282051, "grad_norm": 0.38698190450668335, "learning_rate": 3.320270979814142e-05, "loss": 0.3428, "step": 109710 }, { "epoch": 2.4421296296296298, "grad_norm": 0.5703821778297424, "learning_rate": 3.317698888429086e-05, "loss": 0.6002, "step": 109720 }, { "epoch": 2.442352207977208, "grad_norm": 0.4589943587779999, "learning_rate": 3.315127703573926e-05, "loss": 0.4608, "step": 109730 }, { "epoch": 2.442574786324786, "grad_norm": 0.5299825668334961, "learning_rate": 3.312557425388385e-05, "loss": 0.5301, "step": 109740 }, { "epoch": 2.442797364672365, "grad_norm": 0.5066646933555603, "learning_rate": 3.309988054012134e-05, "loss": 0.4337, "step": 109750 }, { "epoch": 2.443019943019943, "grad_norm": 0.4790757894515991, "learning_rate": 3.307419589584797e-05, "loss": 0.413, "step": 109760 }, { "epoch": 2.4432425213675213, "grad_norm": 0.4142204225063324, "learning_rate": 3.304852032245949e-05, "loss": 0.4809, "step": 109770 }, { "epoch": 2.4434650997151, "grad_norm": 0.6871556639671326, "learning_rate": 3.302285382135104e-05, "loss": 0.5838, "step": 109780 }, { "epoch": 2.443687678062678, "grad_norm": 0.4145238697528839, "learning_rate": 3.299719639391739e-05, "loss": 0.4666, "step": 109790 }, { "epoch": 2.4439102564102564, "grad_norm": 0.4546247124671936, "learning_rate": 3.2971548041552826e-05, "loss": 0.4192, "step": 109800 }, { "epoch": 2.4441328347578346, "grad_norm": 0.6559073328971863, "learning_rate": 3.2945908765651066e-05, "loss": 0.4374, "step": 109810 }, { "epoch": 2.4443554131054133, "grad_norm": 0.6892669200897217, "learning_rate": 3.292027856760538e-05, "loss": 0.3472, "step": 109820 }, { "epoch": 2.4445779914529915, "grad_norm": 0.6675540208816528, "learning_rate": 3.289465744880858e-05, "loss": 0.5037, "step": 109830 }, { "epoch": 2.4448005698005697, "grad_norm": 0.4565548896789551, "learning_rate": 3.286904541065285e-05, "loss": 0.4619, "step": 109840 }, { "epoch": 2.445023148148148, "grad_norm": 0.7293078303337097, "learning_rate": 3.284344245453006e-05, "loss": 0.4634, "step": 109850 }, { "epoch": 2.4452457264957266, "grad_norm": 0.47439637780189514, "learning_rate": 3.281784858183139e-05, "loss": 0.3893, "step": 109860 }, { "epoch": 2.445468304843305, "grad_norm": 0.553246796131134, "learning_rate": 3.2792263793947705e-05, "loss": 0.4897, "step": 109870 }, { "epoch": 2.445690883190883, "grad_norm": 0.7342193722724915, "learning_rate": 3.2766688092269284e-05, "loss": 0.4816, "step": 109880 }, { "epoch": 2.4459134615384617, "grad_norm": 0.5039429068565369, "learning_rate": 3.274112147818593e-05, "loss": 0.4815, "step": 109890 }, { "epoch": 2.44613603988604, "grad_norm": 0.5331164598464966, "learning_rate": 3.271556395308695e-05, "loss": 0.4651, "step": 109900 }, { "epoch": 2.446358618233618, "grad_norm": 0.5317120552062988, "learning_rate": 3.269001551836124e-05, "loss": 0.5472, "step": 109910 }, { "epoch": 2.4465811965811968, "grad_norm": 0.603212833404541, "learning_rate": 3.266447617539698e-05, "loss": 0.4639, "step": 109920 }, { "epoch": 2.446803774928775, "grad_norm": 0.5109623074531555, "learning_rate": 3.26389459255821e-05, "loss": 0.4415, "step": 109930 }, { "epoch": 2.447026353276353, "grad_norm": 0.7490630149841309, "learning_rate": 3.261342477030389e-05, "loss": 0.4875, "step": 109940 }, { "epoch": 2.447248931623932, "grad_norm": 0.528417706489563, "learning_rate": 3.258791271094921e-05, "loss": 0.4238, "step": 109950 }, { "epoch": 2.44747150997151, "grad_norm": 0.31074804067611694, "learning_rate": 3.256240974890441e-05, "loss": 0.3852, "step": 109960 }, { "epoch": 2.4476940883190883, "grad_norm": 0.7868363261222839, "learning_rate": 3.2536915885555367e-05, "loss": 0.3732, "step": 109970 }, { "epoch": 2.4479166666666665, "grad_norm": 0.4590838849544525, "learning_rate": 3.251143112228743e-05, "loss": 0.4183, "step": 109980 }, { "epoch": 2.448139245014245, "grad_norm": 0.7592793703079224, "learning_rate": 3.248595546048536e-05, "loss": 0.4788, "step": 109990 }, { "epoch": 2.4483618233618234, "grad_norm": 0.6533322930335999, "learning_rate": 3.2460488901533635e-05, "loss": 0.4313, "step": 110000 }, { "epoch": 2.4485844017094016, "grad_norm": 0.8436979055404663, "learning_rate": 3.2435031446816075e-05, "loss": 0.5353, "step": 110010 }, { "epoch": 2.44880698005698, "grad_norm": 0.4479171335697174, "learning_rate": 3.240958309771609e-05, "loss": 0.4168, "step": 110020 }, { "epoch": 2.4490295584045585, "grad_norm": 0.9244013428688049, "learning_rate": 3.238414385561655e-05, "loss": 0.4576, "step": 110030 }, { "epoch": 2.4492521367521367, "grad_norm": 0.49031445384025574, "learning_rate": 3.235871372189985e-05, "loss": 0.4534, "step": 110040 }, { "epoch": 2.449474715099715, "grad_norm": 0.66366046667099, "learning_rate": 3.2333292697947934e-05, "loss": 0.4447, "step": 110050 }, { "epoch": 2.4496972934472936, "grad_norm": 0.44934597611427307, "learning_rate": 3.230788078514211e-05, "loss": 0.3764, "step": 110060 }, { "epoch": 2.449919871794872, "grad_norm": 0.25253328680992126, "learning_rate": 3.2282477984863326e-05, "loss": 0.4856, "step": 110070 }, { "epoch": 2.45014245014245, "grad_norm": 0.6734674572944641, "learning_rate": 3.225708429849197e-05, "loss": 0.4353, "step": 110080 }, { "epoch": 2.4503650284900287, "grad_norm": 0.5335591435432434, "learning_rate": 3.2231699727407984e-05, "loss": 0.4386, "step": 110090 }, { "epoch": 2.450587606837607, "grad_norm": 1.137387990951538, "learning_rate": 3.220632427299077e-05, "loss": 0.4026, "step": 110100 }, { "epoch": 2.450810185185185, "grad_norm": 0.748821496963501, "learning_rate": 3.2180957936619324e-05, "loss": 0.474, "step": 110110 }, { "epoch": 2.451032763532764, "grad_norm": 0.47783905267715454, "learning_rate": 3.215560071967198e-05, "loss": 0.453, "step": 110120 }, { "epoch": 2.451255341880342, "grad_norm": 0.4711649417877197, "learning_rate": 3.213025262352667e-05, "loss": 0.5357, "step": 110130 }, { "epoch": 2.45147792022792, "grad_norm": 0.3978164494037628, "learning_rate": 3.210491364956085e-05, "loss": 0.4032, "step": 110140 }, { "epoch": 2.4517004985754984, "grad_norm": 0.7222580909729004, "learning_rate": 3.207958379915148e-05, "loss": 0.5119, "step": 110150 }, { "epoch": 2.451923076923077, "grad_norm": 0.4609293043613434, "learning_rate": 3.205426307367498e-05, "loss": 0.316, "step": 110160 }, { "epoch": 2.4521456552706553, "grad_norm": 0.8023900985717773, "learning_rate": 3.202895147450731e-05, "loss": 0.4822, "step": 110170 }, { "epoch": 2.4523682336182335, "grad_norm": 0.47974106669425964, "learning_rate": 3.200364900302393e-05, "loss": 0.4279, "step": 110180 }, { "epoch": 2.4525908119658117, "grad_norm": 0.4707964062690735, "learning_rate": 3.197835566059983e-05, "loss": 0.367, "step": 110190 }, { "epoch": 2.4528133903133904, "grad_norm": 0.6110070943832397, "learning_rate": 3.1953071448609396e-05, "loss": 0.4845, "step": 110200 }, { "epoch": 2.4530359686609686, "grad_norm": 0.7285255789756775, "learning_rate": 3.192779636842662e-05, "loss": 0.508, "step": 110210 }, { "epoch": 2.453258547008547, "grad_norm": 0.630529522895813, "learning_rate": 3.190253042142499e-05, "loss": 0.4566, "step": 110220 }, { "epoch": 2.4534811253561255, "grad_norm": 0.5275912880897522, "learning_rate": 3.1877273608977455e-05, "loss": 0.4482, "step": 110230 }, { "epoch": 2.4537037037037037, "grad_norm": 0.6072720885276794, "learning_rate": 3.185202593245655e-05, "loss": 0.5253, "step": 110240 }, { "epoch": 2.453926282051282, "grad_norm": 0.5082536935806274, "learning_rate": 3.182678739323417e-05, "loss": 0.5118, "step": 110250 }, { "epoch": 2.4541488603988606, "grad_norm": 0.5880118608474731, "learning_rate": 3.1801557992681875e-05, "loss": 0.4145, "step": 110260 }, { "epoch": 2.454371438746439, "grad_norm": 0.32024118304252625, "learning_rate": 3.177633773217057e-05, "loss": 0.4723, "step": 110270 }, { "epoch": 2.454594017094017, "grad_norm": 0.5531744360923767, "learning_rate": 3.1751126613070805e-05, "loss": 0.4664, "step": 110280 }, { "epoch": 2.4548165954415953, "grad_norm": 0.49054989218711853, "learning_rate": 3.1725924636752525e-05, "loss": 0.5336, "step": 110290 }, { "epoch": 2.455039173789174, "grad_norm": 0.5179944634437561, "learning_rate": 3.1700731804585285e-05, "loss": 0.4942, "step": 110300 }, { "epoch": 2.455261752136752, "grad_norm": 0.7548027634620667, "learning_rate": 3.1675548117938025e-05, "loss": 0.4363, "step": 110310 }, { "epoch": 2.4554843304843303, "grad_norm": 0.5132513046264648, "learning_rate": 3.165037357817928e-05, "loss": 0.4183, "step": 110320 }, { "epoch": 2.455706908831909, "grad_norm": 0.6227317452430725, "learning_rate": 3.1625208186677115e-05, "loss": 0.4933, "step": 110330 }, { "epoch": 2.4559294871794872, "grad_norm": 0.5882665514945984, "learning_rate": 3.1600051944798935e-05, "loss": 0.3687, "step": 110340 }, { "epoch": 2.4561520655270654, "grad_norm": 0.707750678062439, "learning_rate": 3.157490485391177e-05, "loss": 0.4263, "step": 110350 }, { "epoch": 2.4563746438746437, "grad_norm": 0.4505460560321808, "learning_rate": 3.1549766915382165e-05, "loss": 0.4801, "step": 110360 }, { "epoch": 2.4565972222222223, "grad_norm": 0.5214482545852661, "learning_rate": 3.152463813057618e-05, "loss": 0.434, "step": 110370 }, { "epoch": 2.4568198005698005, "grad_norm": 0.4563765823841095, "learning_rate": 3.1499518500859216e-05, "loss": 0.3846, "step": 110380 }, { "epoch": 2.4570423789173788, "grad_norm": 0.3778112530708313, "learning_rate": 3.147440802759636e-05, "loss": 0.3905, "step": 110390 }, { "epoch": 2.4572649572649574, "grad_norm": 0.6129999756813049, "learning_rate": 3.144930671215218e-05, "loss": 0.4428, "step": 110400 }, { "epoch": 2.4574875356125356, "grad_norm": 0.5692477822303772, "learning_rate": 3.142421455589062e-05, "loss": 0.4818, "step": 110410 }, { "epoch": 2.457710113960114, "grad_norm": 0.5394112467765808, "learning_rate": 3.1399131560175245e-05, "loss": 0.4291, "step": 110420 }, { "epoch": 2.4579326923076925, "grad_norm": 0.6020023822784424, "learning_rate": 3.1374057726369076e-05, "loss": 0.4352, "step": 110430 }, { "epoch": 2.4581552706552707, "grad_norm": 0.4298074245452881, "learning_rate": 3.134899305583465e-05, "loss": 0.4341, "step": 110440 }, { "epoch": 2.458377849002849, "grad_norm": 0.42165303230285645, "learning_rate": 3.1323937549934015e-05, "loss": 0.4388, "step": 110450 }, { "epoch": 2.458600427350427, "grad_norm": 1.0087652206420898, "learning_rate": 3.129889121002873e-05, "loss": 0.4093, "step": 110460 }, { "epoch": 2.458823005698006, "grad_norm": 0.40058034658432007, "learning_rate": 3.127385403747976e-05, "loss": 0.4373, "step": 110470 }, { "epoch": 2.459045584045584, "grad_norm": 0.3793736696243286, "learning_rate": 3.1248826033647695e-05, "loss": 0.4836, "step": 110480 }, { "epoch": 2.4592681623931623, "grad_norm": 0.4240846633911133, "learning_rate": 3.1223807199892576e-05, "loss": 0.4503, "step": 110490 }, { "epoch": 2.459490740740741, "grad_norm": 0.5919007658958435, "learning_rate": 3.1198797537573975e-05, "loss": 0.4513, "step": 110500 }, { "epoch": 2.459713319088319, "grad_norm": 0.5737066268920898, "learning_rate": 3.117379704805086e-05, "loss": 0.4486, "step": 110510 }, { "epoch": 2.4599358974358974, "grad_norm": 0.5145678520202637, "learning_rate": 3.114880573268182e-05, "loss": 0.4535, "step": 110520 }, { "epoch": 2.4601584757834756, "grad_norm": 0.5854604840278625, "learning_rate": 3.1123823592824887e-05, "loss": 0.4105, "step": 110530 }, { "epoch": 2.460292022792023, "eval_loss": 0.5268440842628479, "eval_runtime": 337.3162, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 110536 }, { "epoch": 2.4603810541310542, "grad_norm": 0.5479249954223633, "learning_rate": 3.1098850629837705e-05, "loss": 0.5216, "step": 110540 }, { "epoch": 2.4606036324786325, "grad_norm": 0.2902457118034363, "learning_rate": 3.107388684507717e-05, "loss": 0.4611, "step": 110550 }, { "epoch": 2.4608262108262107, "grad_norm": 0.4755493700504303, "learning_rate": 3.104893223989995e-05, "loss": 0.3868, "step": 110560 }, { "epoch": 2.4610487891737893, "grad_norm": 0.5796645879745483, "learning_rate": 3.102398681566203e-05, "loss": 0.4204, "step": 110570 }, { "epoch": 2.4612713675213675, "grad_norm": 1.0918166637420654, "learning_rate": 3.099905057371901e-05, "loss": 0.6028, "step": 110580 }, { "epoch": 2.4614939458689458, "grad_norm": 0.4401234984397888, "learning_rate": 3.097412351542595e-05, "loss": 0.4773, "step": 110590 }, { "epoch": 2.4617165242165244, "grad_norm": 0.5752748847007751, "learning_rate": 3.094920564213741e-05, "loss": 0.544, "step": 110600 }, { "epoch": 2.4619391025641026, "grad_norm": 0.5700196027755737, "learning_rate": 3.09242969552074e-05, "loss": 0.4934, "step": 110610 }, { "epoch": 2.462161680911681, "grad_norm": 0.6183973550796509, "learning_rate": 3.089939745598949e-05, "loss": 0.4989, "step": 110620 }, { "epoch": 2.462384259259259, "grad_norm": 0.616034746170044, "learning_rate": 3.087450714583675e-05, "loss": 0.4945, "step": 110630 }, { "epoch": 2.4626068376068377, "grad_norm": 0.5810309052467346, "learning_rate": 3.0849626026101796e-05, "loss": 0.3826, "step": 110640 }, { "epoch": 2.462829415954416, "grad_norm": 0.6257836222648621, "learning_rate": 3.082475409813659e-05, "loss": 0.4761, "step": 110650 }, { "epoch": 2.463051994301994, "grad_norm": 0.47654014825820923, "learning_rate": 3.0799891363292755e-05, "loss": 0.486, "step": 110660 }, { "epoch": 2.463274572649573, "grad_norm": 0.37999096512794495, "learning_rate": 3.0775037822921325e-05, "loss": 0.3583, "step": 110670 }, { "epoch": 2.463497150997151, "grad_norm": 0.6416193246841431, "learning_rate": 3.075019347837291e-05, "loss": 0.5002, "step": 110680 }, { "epoch": 2.4637197293447293, "grad_norm": 0.6239182353019714, "learning_rate": 3.07253583309975e-05, "loss": 0.4129, "step": 110690 }, { "epoch": 2.4639423076923075, "grad_norm": 0.7033472061157227, "learning_rate": 3.07005323821447e-05, "loss": 0.4989, "step": 110700 }, { "epoch": 2.464164886039886, "grad_norm": 0.42417532205581665, "learning_rate": 3.067571563316356e-05, "loss": 0.4806, "step": 110710 }, { "epoch": 2.4643874643874644, "grad_norm": 0.6155065298080444, "learning_rate": 3.065090808540265e-05, "loss": 0.4852, "step": 110720 }, { "epoch": 2.4646100427350426, "grad_norm": 0.6713215112686157, "learning_rate": 3.062610974021001e-05, "loss": 0.4321, "step": 110730 }, { "epoch": 2.4648326210826212, "grad_norm": 0.6614099740982056, "learning_rate": 3.06013205989333e-05, "loss": 0.4752, "step": 110740 }, { "epoch": 2.4650551994301995, "grad_norm": 0.7085887789726257, "learning_rate": 3.057654066291944e-05, "loss": 0.5011, "step": 110750 }, { "epoch": 2.4652777777777777, "grad_norm": 0.5104491114616394, "learning_rate": 3.055176993351505e-05, "loss": 0.3953, "step": 110760 }, { "epoch": 2.4655003561253563, "grad_norm": 0.7498946785926819, "learning_rate": 3.052700841206626e-05, "loss": 0.4204, "step": 110770 }, { "epoch": 2.4657229344729346, "grad_norm": 0.49787577986717224, "learning_rate": 3.0502256099918524e-05, "loss": 0.4701, "step": 110780 }, { "epoch": 2.4659455128205128, "grad_norm": 0.3691440224647522, "learning_rate": 3.0477512998416946e-05, "loss": 0.4112, "step": 110790 }, { "epoch": 2.466168091168091, "grad_norm": 0.4830651879310608, "learning_rate": 3.0452779108906072e-05, "loss": 0.4191, "step": 110800 }, { "epoch": 2.4663906695156697, "grad_norm": 0.5980969667434692, "learning_rate": 3.0428054432730002e-05, "loss": 0.4378, "step": 110810 }, { "epoch": 2.466613247863248, "grad_norm": 0.7120261788368225, "learning_rate": 3.040333897123231e-05, "loss": 0.5123, "step": 110820 }, { "epoch": 2.466835826210826, "grad_norm": 0.5967547297477722, "learning_rate": 3.037863272575596e-05, "loss": 0.4578, "step": 110830 }, { "epoch": 2.4670584045584047, "grad_norm": 0.5113248229026794, "learning_rate": 3.03539356976436e-05, "loss": 0.5512, "step": 110840 }, { "epoch": 2.467280982905983, "grad_norm": 0.48821139335632324, "learning_rate": 3.032924788823721e-05, "loss": 0.5002, "step": 110850 }, { "epoch": 2.467503561253561, "grad_norm": 0.6478132009506226, "learning_rate": 3.0304569298878414e-05, "loss": 0.4861, "step": 110860 }, { "epoch": 2.4677261396011394, "grad_norm": 0.6745758056640625, "learning_rate": 3.027989993090823e-05, "loss": 0.5007, "step": 110870 }, { "epoch": 2.467948717948718, "grad_norm": 0.6008769273757935, "learning_rate": 3.025523978566729e-05, "loss": 0.4701, "step": 110880 }, { "epoch": 2.4681712962962963, "grad_norm": 0.6006442308425903, "learning_rate": 3.0230588864495523e-05, "loss": 0.5536, "step": 110890 }, { "epoch": 2.4683938746438745, "grad_norm": 0.6266177892684937, "learning_rate": 3.0205947168732575e-05, "loss": 0.4486, "step": 110900 }, { "epoch": 2.468616452991453, "grad_norm": 0.46426495909690857, "learning_rate": 3.018131469971741e-05, "loss": 0.3734, "step": 110910 }, { "epoch": 2.4688390313390314, "grad_norm": 0.6160069704055786, "learning_rate": 3.0156691458788634e-05, "loss": 0.5496, "step": 110920 }, { "epoch": 2.4690616096866096, "grad_norm": 0.433323472738266, "learning_rate": 3.013207744728428e-05, "loss": 0.4332, "step": 110930 }, { "epoch": 2.4692841880341883, "grad_norm": 0.2941664755344391, "learning_rate": 3.01074726665419e-05, "loss": 0.4586, "step": 110940 }, { "epoch": 2.4695067663817665, "grad_norm": 0.8551943898200989, "learning_rate": 3.0082877117898523e-05, "loss": 0.4004, "step": 110950 }, { "epoch": 2.4697293447293447, "grad_norm": 0.7417336702346802, "learning_rate": 3.0058290802690758e-05, "loss": 0.5409, "step": 110960 }, { "epoch": 2.469951923076923, "grad_norm": 0.47685909271240234, "learning_rate": 3.0033713722254564e-05, "loss": 0.3813, "step": 110970 }, { "epoch": 2.4701745014245016, "grad_norm": 0.3092232942581177, "learning_rate": 3.0009145877925472e-05, "loss": 0.4626, "step": 110980 }, { "epoch": 2.47039707977208, "grad_norm": 0.46632498502731323, "learning_rate": 2.998458727103859e-05, "loss": 0.4482, "step": 110990 }, { "epoch": 2.470619658119658, "grad_norm": 0.5601755976676941, "learning_rate": 2.9960037902928383e-05, "loss": 0.449, "step": 111000 }, { "epoch": 2.470842236467236, "grad_norm": 0.3991512656211853, "learning_rate": 2.9935497774928946e-05, "loss": 0.3801, "step": 111010 }, { "epoch": 2.471064814814815, "grad_norm": 0.49260395765304565, "learning_rate": 2.9910966888373802e-05, "loss": 0.4444, "step": 111020 }, { "epoch": 2.471287393162393, "grad_norm": 0.746168851852417, "learning_rate": 2.988644524459594e-05, "loss": 0.4011, "step": 111030 }, { "epoch": 2.4715099715099713, "grad_norm": 0.5269076228141785, "learning_rate": 2.9861932844927932e-05, "loss": 0.3908, "step": 111040 }, { "epoch": 2.47173254985755, "grad_norm": 0.4912148118019104, "learning_rate": 2.9837429690701734e-05, "loss": 0.4341, "step": 111050 }, { "epoch": 2.471955128205128, "grad_norm": 0.4004683196544647, "learning_rate": 2.9812935783248906e-05, "loss": 0.3901, "step": 111060 }, { "epoch": 2.4721777065527064, "grad_norm": 0.5287625193595886, "learning_rate": 2.9788451123900473e-05, "loss": 0.4416, "step": 111070 }, { "epoch": 2.472400284900285, "grad_norm": 0.7422000169754028, "learning_rate": 2.9763975713986948e-05, "loss": 0.3797, "step": 111080 }, { "epoch": 2.4726228632478633, "grad_norm": 0.8129696249961853, "learning_rate": 2.973950955483835e-05, "loss": 0.5297, "step": 111090 }, { "epoch": 2.4728454415954415, "grad_norm": 0.4405764639377594, "learning_rate": 2.9715052647784226e-05, "loss": 0.4761, "step": 111100 }, { "epoch": 2.47306801994302, "grad_norm": 0.8199268579483032, "learning_rate": 2.969060499415348e-05, "loss": 0.5153, "step": 111110 }, { "epoch": 2.4732905982905984, "grad_norm": 0.6683918237686157, "learning_rate": 2.9666166595274702e-05, "loss": 0.3938, "step": 111120 }, { "epoch": 2.4735131766381766, "grad_norm": 0.5031790733337402, "learning_rate": 2.9641737452475872e-05, "loss": 0.3214, "step": 111130 }, { "epoch": 2.473735754985755, "grad_norm": 0.5017289519309998, "learning_rate": 2.961731756708448e-05, "loss": 0.4305, "step": 111140 }, { "epoch": 2.4739583333333335, "grad_norm": 0.7404310703277588, "learning_rate": 2.9592906940427534e-05, "loss": 0.4257, "step": 111150 }, { "epoch": 2.4741809116809117, "grad_norm": 0.6419523358345032, "learning_rate": 2.9568505573831574e-05, "loss": 0.5095, "step": 111160 }, { "epoch": 2.47440349002849, "grad_norm": 0.380824476480484, "learning_rate": 2.9544113468622492e-05, "loss": 0.3854, "step": 111170 }, { "epoch": 2.474626068376068, "grad_norm": 0.6494007706642151, "learning_rate": 2.9519730626125874e-05, "loss": 0.5319, "step": 111180 }, { "epoch": 2.474848646723647, "grad_norm": 0.46472489833831787, "learning_rate": 2.9495357047666618e-05, "loss": 0.4741, "step": 111190 }, { "epoch": 2.475071225071225, "grad_norm": 0.7664408087730408, "learning_rate": 2.9470992734569236e-05, "loss": 0.4839, "step": 111200 }, { "epoch": 2.4752938034188032, "grad_norm": 0.5787988305091858, "learning_rate": 2.944663768815772e-05, "loss": 0.4816, "step": 111210 }, { "epoch": 2.475516381766382, "grad_norm": 0.5572113394737244, "learning_rate": 2.9422291909755517e-05, "loss": 0.5709, "step": 111220 }, { "epoch": 2.47573896011396, "grad_norm": 0.5527510046958923, "learning_rate": 2.9397955400685618e-05, "loss": 0.4282, "step": 111230 }, { "epoch": 2.4759615384615383, "grad_norm": 0.43369776010513306, "learning_rate": 2.937362816227054e-05, "loss": 0.4293, "step": 111240 }, { "epoch": 2.476184116809117, "grad_norm": 0.5547199845314026, "learning_rate": 2.9349310195832135e-05, "loss": 0.3511, "step": 111250 }, { "epoch": 2.476406695156695, "grad_norm": 0.32809948921203613, "learning_rate": 2.9325001502691907e-05, "loss": 0.5122, "step": 111260 }, { "epoch": 2.4766292735042734, "grad_norm": 0.6356037855148315, "learning_rate": 2.930070208417084e-05, "loss": 0.516, "step": 111270 }, { "epoch": 2.476851851851852, "grad_norm": 0.548595666885376, "learning_rate": 2.9276411941589342e-05, "loss": 0.448, "step": 111280 }, { "epoch": 2.4770744301994303, "grad_norm": 0.5238490700721741, "learning_rate": 2.925213107626743e-05, "loss": 0.4345, "step": 111290 }, { "epoch": 2.4772970085470085, "grad_norm": 0.3655892014503479, "learning_rate": 2.9227859489524467e-05, "loss": 0.4731, "step": 111300 }, { "epoch": 2.4775195868945867, "grad_norm": 0.7614578604698181, "learning_rate": 2.9203597182679444e-05, "loss": 0.4322, "step": 111310 }, { "epoch": 2.4777421652421654, "grad_norm": 0.8040751814842224, "learning_rate": 2.9179344157050724e-05, "loss": 0.4649, "step": 111320 }, { "epoch": 2.4779647435897436, "grad_norm": 0.7341752052307129, "learning_rate": 2.9155100413956306e-05, "loss": 0.4542, "step": 111330 }, { "epoch": 2.478187321937322, "grad_norm": 0.5599677562713623, "learning_rate": 2.913086595471357e-05, "loss": 0.4664, "step": 111340 }, { "epoch": 2.4784099002849, "grad_norm": 0.551198422908783, "learning_rate": 2.9106640780639472e-05, "loss": 0.5055, "step": 111350 }, { "epoch": 2.4786324786324787, "grad_norm": 0.6239669322967529, "learning_rate": 2.9082424893050398e-05, "loss": 0.5534, "step": 111360 }, { "epoch": 2.478855056980057, "grad_norm": 0.5712863802909851, "learning_rate": 2.9058218293262297e-05, "loss": 0.4345, "step": 111370 }, { "epoch": 2.479077635327635, "grad_norm": 0.5392316579818726, "learning_rate": 2.903402098259058e-05, "loss": 0.3854, "step": 111380 }, { "epoch": 2.479300213675214, "grad_norm": 0.6309967637062073, "learning_rate": 2.9009832962350092e-05, "loss": 0.4446, "step": 111390 }, { "epoch": 2.479522792022792, "grad_norm": 0.667599081993103, "learning_rate": 2.8985654233855243e-05, "loss": 0.4203, "step": 111400 }, { "epoch": 2.4797453703703702, "grad_norm": 0.7004653215408325, "learning_rate": 2.8961484798419934e-05, "loss": 0.4952, "step": 111410 }, { "epoch": 2.479967948717949, "grad_norm": 0.4802926480770111, "learning_rate": 2.8937324657357632e-05, "loss": 0.5486, "step": 111420 }, { "epoch": 2.480190527065527, "grad_norm": 0.5662277936935425, "learning_rate": 2.8913173811981086e-05, "loss": 0.5311, "step": 111430 }, { "epoch": 2.4804131054131053, "grad_norm": 0.794284999370575, "learning_rate": 2.8889032263602733e-05, "loss": 0.4408, "step": 111440 }, { "epoch": 2.480635683760684, "grad_norm": 0.5913161635398865, "learning_rate": 2.88649000135345e-05, "loss": 0.5315, "step": 111450 }, { "epoch": 2.480858262108262, "grad_norm": 0.4098511040210724, "learning_rate": 2.8840777063087655e-05, "loss": 0.3523, "step": 111460 }, { "epoch": 2.4810808404558404, "grad_norm": 0.6115807890892029, "learning_rate": 2.8816663413573096e-05, "loss": 0.3661, "step": 111470 }, { "epoch": 2.4813034188034186, "grad_norm": 0.8771860599517822, "learning_rate": 2.8792559066301183e-05, "loss": 0.4146, "step": 111480 }, { "epoch": 2.4815259971509973, "grad_norm": 0.7560129761695862, "learning_rate": 2.8768464022581755e-05, "loss": 0.5119, "step": 111490 }, { "epoch": 2.4817485754985755, "grad_norm": 0.590459406375885, "learning_rate": 2.8744378283724184e-05, "loss": 0.4552, "step": 111500 }, { "epoch": 2.4819711538461537, "grad_norm": 0.6007856726646423, "learning_rate": 2.872030185103729e-05, "loss": 0.4496, "step": 111510 }, { "epoch": 2.482193732193732, "grad_norm": 0.7245687246322632, "learning_rate": 2.8696234725829452e-05, "loss": 0.4736, "step": 111520 }, { "epoch": 2.4824163105413106, "grad_norm": 0.5625166893005371, "learning_rate": 2.867217690940842e-05, "loss": 0.4426, "step": 111530 }, { "epoch": 2.482638888888889, "grad_norm": 0.4895360469818115, "learning_rate": 2.864812840308153e-05, "loss": 0.4357, "step": 111540 }, { "epoch": 2.482861467236467, "grad_norm": 0.502686619758606, "learning_rate": 2.862408920815567e-05, "loss": 0.3162, "step": 111550 }, { "epoch": 2.4830840455840457, "grad_norm": 0.5399205684661865, "learning_rate": 2.8600059325937057e-05, "loss": 0.4838, "step": 111560 }, { "epoch": 2.483306623931624, "grad_norm": 0.9145728349685669, "learning_rate": 2.8576038757731537e-05, "loss": 0.523, "step": 111570 }, { "epoch": 2.483529202279202, "grad_norm": 0.6872720122337341, "learning_rate": 2.8552027504844404e-05, "loss": 0.3491, "step": 111580 }, { "epoch": 2.483751780626781, "grad_norm": 0.5079686045646667, "learning_rate": 2.8528025568580495e-05, "loss": 0.3177, "step": 111590 }, { "epoch": 2.483974358974359, "grad_norm": 0.5998859405517578, "learning_rate": 2.8504032950243998e-05, "loss": 0.4102, "step": 111600 }, { "epoch": 2.4841969373219372, "grad_norm": 0.7380645275115967, "learning_rate": 2.8480049651138752e-05, "loss": 0.4272, "step": 111610 }, { "epoch": 2.484419515669516, "grad_norm": 0.4750724732875824, "learning_rate": 2.8456075672568028e-05, "loss": 0.4503, "step": 111620 }, { "epoch": 2.484642094017094, "grad_norm": 0.7769150733947754, "learning_rate": 2.843211101583456e-05, "loss": 0.533, "step": 111630 }, { "epoch": 2.4848646723646723, "grad_norm": 0.36780810356140137, "learning_rate": 2.840815568224067e-05, "loss": 0.3738, "step": 111640 }, { "epoch": 2.4850872507122506, "grad_norm": 0.7026530504226685, "learning_rate": 2.8384209673088036e-05, "loss": 0.4269, "step": 111650 }, { "epoch": 2.4853098290598292, "grad_norm": 0.7291935086250305, "learning_rate": 2.8360272989678005e-05, "loss": 0.5512, "step": 111660 }, { "epoch": 2.4855324074074074, "grad_norm": 0.6416599750518799, "learning_rate": 2.8336345633311178e-05, "loss": 0.4808, "step": 111670 }, { "epoch": 2.4857549857549857, "grad_norm": 0.607215940952301, "learning_rate": 2.831242760528794e-05, "loss": 0.3958, "step": 111680 }, { "epoch": 2.485977564102564, "grad_norm": 0.557759165763855, "learning_rate": 2.8288518906907868e-05, "loss": 0.4252, "step": 111690 }, { "epoch": 2.4862001424501425, "grad_norm": 0.5209227204322815, "learning_rate": 2.8264619539470262e-05, "loss": 0.4442, "step": 111700 }, { "epoch": 2.4864227207977208, "grad_norm": 0.6094182133674622, "learning_rate": 2.824072950427381e-05, "loss": 0.4224, "step": 111710 }, { "epoch": 2.486645299145299, "grad_norm": 0.6645049452781677, "learning_rate": 2.8216848802616723e-05, "loss": 0.4077, "step": 111720 }, { "epoch": 2.4868678774928776, "grad_norm": 0.40797844529151917, "learning_rate": 2.819297743579674e-05, "loss": 0.4602, "step": 111730 }, { "epoch": 2.487090455840456, "grad_norm": 0.42032089829444885, "learning_rate": 2.816911540511098e-05, "loss": 0.4549, "step": 111740 }, { "epoch": 2.487313034188034, "grad_norm": 0.49241432547569275, "learning_rate": 2.8145262711856158e-05, "loss": 0.4144, "step": 111750 }, { "epoch": 2.4875356125356127, "grad_norm": 0.42551571130752563, "learning_rate": 2.812141935732844e-05, "loss": 0.5501, "step": 111760 }, { "epoch": 2.487758190883191, "grad_norm": 0.617946207523346, "learning_rate": 2.8097585342823496e-05, "loss": 0.4737, "step": 111770 }, { "epoch": 2.487980769230769, "grad_norm": 0.48028650879859924, "learning_rate": 2.8073760669636495e-05, "loss": 0.4125, "step": 111780 }, { "epoch": 2.488203347578348, "grad_norm": 0.6532488465309143, "learning_rate": 2.804994533906209e-05, "loss": 0.5161, "step": 111790 }, { "epoch": 2.488425925925926, "grad_norm": 0.8077260255813599, "learning_rate": 2.8026139352394464e-05, "loss": 0.4786, "step": 111800 }, { "epoch": 2.4886485042735043, "grad_norm": 0.628367006778717, "learning_rate": 2.8002342710927166e-05, "loss": 0.4582, "step": 111810 }, { "epoch": 2.4888710826210825, "grad_norm": 0.6152153015136719, "learning_rate": 2.79785554159534e-05, "loss": 0.5696, "step": 111820 }, { "epoch": 2.489093660968661, "grad_norm": 0.6126071214675903, "learning_rate": 2.7954777468765735e-05, "loss": 0.415, "step": 111830 }, { "epoch": 2.4893162393162394, "grad_norm": 0.5737091302871704, "learning_rate": 2.7931008870656272e-05, "loss": 0.412, "step": 111840 }, { "epoch": 2.4895388176638176, "grad_norm": 0.7487857341766357, "learning_rate": 2.7907249622916686e-05, "loss": 0.5441, "step": 111850 }, { "epoch": 2.489761396011396, "grad_norm": 0.4528280794620514, "learning_rate": 2.7883499726838015e-05, "loss": 0.4189, "step": 111860 }, { "epoch": 2.4899839743589745, "grad_norm": 0.5734400749206543, "learning_rate": 2.785975918371091e-05, "loss": 0.3899, "step": 111870 }, { "epoch": 2.4902065527065527, "grad_norm": 0.4233279526233673, "learning_rate": 2.7836027994825387e-05, "loss": 0.3746, "step": 111880 }, { "epoch": 2.490429131054131, "grad_norm": 0.511591911315918, "learning_rate": 2.7812306161471013e-05, "loss": 0.4438, "step": 111890 }, { "epoch": 2.4906517094017095, "grad_norm": 0.7605999708175659, "learning_rate": 2.7788593684936914e-05, "loss": 0.4961, "step": 111900 }, { "epoch": 2.4908742877492878, "grad_norm": 0.5704527497291565, "learning_rate": 2.776489056651159e-05, "loss": 0.3872, "step": 111910 }, { "epoch": 2.491096866096866, "grad_norm": 0.5394191145896912, "learning_rate": 2.7741196807483126e-05, "loss": 0.3728, "step": 111920 }, { "epoch": 2.4913194444444446, "grad_norm": 0.3802001476287842, "learning_rate": 2.7717512409139044e-05, "loss": 0.3585, "step": 111930 }, { "epoch": 2.491542022792023, "grad_norm": 1.008232831954956, "learning_rate": 2.7693837372766407e-05, "loss": 0.5734, "step": 111940 }, { "epoch": 2.491764601139601, "grad_norm": 0.5891938805580139, "learning_rate": 2.7670171699651714e-05, "loss": 0.5298, "step": 111950 }, { "epoch": 2.4919871794871793, "grad_norm": 0.5905712842941284, "learning_rate": 2.7646515391080917e-05, "loss": 0.3535, "step": 111960 }, { "epoch": 2.492209757834758, "grad_norm": 0.5723505616188049, "learning_rate": 2.762286844833957e-05, "loss": 0.4352, "step": 111970 }, { "epoch": 2.492432336182336, "grad_norm": 0.5312645435333252, "learning_rate": 2.7599230872712656e-05, "loss": 0.4558, "step": 111980 }, { "epoch": 2.4926549145299144, "grad_norm": 0.7636615037918091, "learning_rate": 2.757560266548469e-05, "loss": 0.4947, "step": 111990 }, { "epoch": 2.492877492877493, "grad_norm": 0.5262453556060791, "learning_rate": 2.7551983827939622e-05, "loss": 0.5204, "step": 112000 }, { "epoch": 2.4931000712250713, "grad_norm": 0.8180909156799316, "learning_rate": 2.7528374361360953e-05, "loss": 0.5831, "step": 112010 }, { "epoch": 2.4933226495726495, "grad_norm": 0.5332491993904114, "learning_rate": 2.7504774267031596e-05, "loss": 0.4673, "step": 112020 }, { "epoch": 2.4935452279202277, "grad_norm": 0.5418760776519775, "learning_rate": 2.748118354623399e-05, "loss": 0.3911, "step": 112030 }, { "epoch": 2.4937678062678064, "grad_norm": 0.7037185430526733, "learning_rate": 2.7457602200250134e-05, "loss": 0.4271, "step": 112040 }, { "epoch": 2.4939903846153846, "grad_norm": 0.42940235137939453, "learning_rate": 2.7434030230361395e-05, "loss": 0.3757, "step": 112050 }, { "epoch": 2.494212962962963, "grad_norm": 0.7504114508628845, "learning_rate": 2.7410467637848736e-05, "loss": 0.4133, "step": 112060 }, { "epoch": 2.4944355413105415, "grad_norm": 0.29727378487586975, "learning_rate": 2.7386914423992593e-05, "loss": 0.4416, "step": 112070 }, { "epoch": 2.4946581196581197, "grad_norm": 0.5232381224632263, "learning_rate": 2.7363370590072768e-05, "loss": 0.4077, "step": 112080 }, { "epoch": 2.494880698005698, "grad_norm": 0.46083950996398926, "learning_rate": 2.7339836137368768e-05, "loss": 0.3682, "step": 112090 }, { "epoch": 2.4951032763532766, "grad_norm": 0.4653097987174988, "learning_rate": 2.7316311067159394e-05, "loss": 0.4621, "step": 112100 }, { "epoch": 2.4953258547008548, "grad_norm": 0.6140937209129333, "learning_rate": 2.7292795380723024e-05, "loss": 0.5033, "step": 112110 }, { "epoch": 2.495548433048433, "grad_norm": 0.9223384857177734, "learning_rate": 2.7269289079337544e-05, "loss": 0.476, "step": 112120 }, { "epoch": 2.495771011396011, "grad_norm": 0.7549685835838318, "learning_rate": 2.7245792164280293e-05, "loss": 0.5107, "step": 112130 }, { "epoch": 2.49599358974359, "grad_norm": 0.5670859813690186, "learning_rate": 2.722230463682811e-05, "loss": 0.4197, "step": 112140 }, { "epoch": 2.496216168091168, "grad_norm": 0.6729944944381714, "learning_rate": 2.7198826498257403e-05, "loss": 0.4879, "step": 112150 }, { "epoch": 2.4964387464387463, "grad_norm": 0.5611489415168762, "learning_rate": 2.7175357749843855e-05, "loss": 0.4378, "step": 112160 }, { "epoch": 2.496661324786325, "grad_norm": 0.5496370792388916, "learning_rate": 2.7151898392862874e-05, "loss": 0.4153, "step": 112170 }, { "epoch": 2.496883903133903, "grad_norm": 0.6167081594467163, "learning_rate": 2.7128448428589216e-05, "loss": 0.3725, "step": 112180 }, { "epoch": 2.4971064814814814, "grad_norm": 0.6268320679664612, "learning_rate": 2.7105007858297193e-05, "loss": 0.4281, "step": 112190 }, { "epoch": 2.4973290598290596, "grad_norm": 0.6511839628219604, "learning_rate": 2.708157668326059e-05, "loss": 0.4173, "step": 112200 }, { "epoch": 2.4975516381766383, "grad_norm": 0.5909841656684875, "learning_rate": 2.70581549047527e-05, "loss": 0.5347, "step": 112210 }, { "epoch": 2.4977742165242165, "grad_norm": 0.4672543406486511, "learning_rate": 2.7034742524046232e-05, "loss": 0.3949, "step": 112220 }, { "epoch": 2.4979967948717947, "grad_norm": 0.6269485354423523, "learning_rate": 2.7011339542413462e-05, "loss": 0.3727, "step": 112230 }, { "epoch": 2.4982193732193734, "grad_norm": 0.7204062342643738, "learning_rate": 2.6987945961126082e-05, "loss": 0.336, "step": 112240 }, { "epoch": 2.4984419515669516, "grad_norm": 0.4992566406726837, "learning_rate": 2.6964561781455368e-05, "loss": 0.6063, "step": 112250 }, { "epoch": 2.49866452991453, "grad_norm": 0.7157484292984009, "learning_rate": 2.6941187004672007e-05, "loss": 0.3249, "step": 112260 }, { "epoch": 2.4988871082621085, "grad_norm": 0.912991464138031, "learning_rate": 2.6917821632046213e-05, "loss": 0.4232, "step": 112270 }, { "epoch": 2.4991096866096867, "grad_norm": 0.6335121393203735, "learning_rate": 2.689446566484768e-05, "loss": 0.4938, "step": 112280 }, { "epoch": 2.499332264957265, "grad_norm": 0.5526174902915955, "learning_rate": 2.6871119104345653e-05, "loss": 0.4716, "step": 112290 }, { "epoch": 2.499554843304843, "grad_norm": 0.41126495599746704, "learning_rate": 2.6847781951808682e-05, "loss": 0.4587, "step": 112300 }, { "epoch": 2.499777421652422, "grad_norm": 0.38075271248817444, "learning_rate": 2.682445420850501e-05, "loss": 0.4874, "step": 112310 }, { "epoch": 2.5, "grad_norm": 0.48302462697029114, "learning_rate": 2.6801135875702254e-05, "loss": 0.3682, "step": 112320 }, { "epoch": 2.500222578347578, "grad_norm": 0.46705126762390137, "learning_rate": 2.6777826954667552e-05, "loss": 0.4931, "step": 112330 }, { "epoch": 2.5004451566951564, "grad_norm": 0.9624470472335815, "learning_rate": 2.67545274466676e-05, "loss": 0.5135, "step": 112340 }, { "epoch": 2.500667735042735, "grad_norm": 0.4059145748615265, "learning_rate": 2.6731237352968408e-05, "loss": 0.4634, "step": 112350 }, { "epoch": 2.5008903133903133, "grad_norm": 0.6012235283851624, "learning_rate": 2.670795667483561e-05, "loss": 0.5085, "step": 112360 }, { "epoch": 2.5011128917378915, "grad_norm": 0.6443610191345215, "learning_rate": 2.668468541353435e-05, "loss": 0.4311, "step": 112370 }, { "epoch": 2.50133547008547, "grad_norm": 0.4933191239833832, "learning_rate": 2.6661423570329125e-05, "loss": 0.5441, "step": 112380 }, { "epoch": 2.5015580484330484, "grad_norm": 0.43826714158058167, "learning_rate": 2.6638171146484058e-05, "loss": 0.3718, "step": 112390 }, { "epoch": 2.5017806267806266, "grad_norm": 0.7164596319198608, "learning_rate": 2.6614928143262695e-05, "loss": 0.5705, "step": 112400 }, { "epoch": 2.5020032051282053, "grad_norm": 0.2963273227214813, "learning_rate": 2.6591694561928073e-05, "loss": 0.3864, "step": 112410 }, { "epoch": 2.5022257834757835, "grad_norm": 0.7462092041969299, "learning_rate": 2.6568470403742706e-05, "loss": 0.4764, "step": 112420 }, { "epoch": 2.5024483618233617, "grad_norm": 0.5694229006767273, "learning_rate": 2.6545255669968704e-05, "loss": 0.4772, "step": 112430 }, { "epoch": 2.5026709401709404, "grad_norm": 0.5272653102874756, "learning_rate": 2.6522050361867435e-05, "loss": 0.4651, "step": 112440 }, { "epoch": 2.5028935185185186, "grad_norm": 0.44499674439430237, "learning_rate": 2.64988544807e-05, "loss": 0.5382, "step": 112450 }, { "epoch": 2.503116096866097, "grad_norm": 1.0100992918014526, "learning_rate": 2.6475668027726807e-05, "loss": 0.4666, "step": 112460 }, { "epoch": 2.5033386752136755, "grad_norm": 0.39378368854522705, "learning_rate": 2.6452491004207948e-05, "loss": 0.5063, "step": 112470 }, { "epoch": 2.5035612535612537, "grad_norm": 0.6302194595336914, "learning_rate": 2.642932341140274e-05, "loss": 0.3958, "step": 112480 }, { "epoch": 2.503783831908832, "grad_norm": 0.4820767641067505, "learning_rate": 2.640616525057018e-05, "loss": 0.4808, "step": 112490 }, { "epoch": 2.50400641025641, "grad_norm": 0.6411905288696289, "learning_rate": 2.6383016522968728e-05, "loss": 0.4497, "step": 112500 }, { "epoch": 2.5042289886039883, "grad_norm": 0.6101178526878357, "learning_rate": 2.6359877229856334e-05, "loss": 0.4695, "step": 112510 }, { "epoch": 2.504451566951567, "grad_norm": 0.4686170518398285, "learning_rate": 2.63367473724903e-05, "loss": 0.3935, "step": 112520 }, { "epoch": 2.5046741452991452, "grad_norm": 0.3824845850467682, "learning_rate": 2.631362695212758e-05, "loss": 0.4467, "step": 112530 }, { "epoch": 2.5048967236467234, "grad_norm": 0.5891847014427185, "learning_rate": 2.6290515970024567e-05, "loss": 0.4516, "step": 112540 }, { "epoch": 2.505119301994302, "grad_norm": 0.6469375491142273, "learning_rate": 2.6267414427437122e-05, "loss": 0.4168, "step": 112550 }, { "epoch": 2.5053418803418803, "grad_norm": 0.5130420923233032, "learning_rate": 2.6244322325620596e-05, "loss": 0.4113, "step": 112560 }, { "epoch": 2.5055644586894585, "grad_norm": 0.6089246273040771, "learning_rate": 2.6221239665829878e-05, "loss": 0.4957, "step": 112570 }, { "epoch": 2.505787037037037, "grad_norm": 0.39957547187805176, "learning_rate": 2.6198166449319228e-05, "loss": 0.4694, "step": 112580 }, { "epoch": 2.5060096153846154, "grad_norm": 0.8839907646179199, "learning_rate": 2.6175102677342488e-05, "loss": 0.5822, "step": 112590 }, { "epoch": 2.5062321937321936, "grad_norm": 0.5108381509780884, "learning_rate": 2.6152048351153013e-05, "loss": 0.5334, "step": 112600 }, { "epoch": 2.5064547720797723, "grad_norm": 0.7718234062194824, "learning_rate": 2.612900347200351e-05, "loss": 0.5677, "step": 112610 }, { "epoch": 2.5066773504273505, "grad_norm": 0.6024258732795715, "learning_rate": 2.610596804114629e-05, "loss": 0.519, "step": 112620 }, { "epoch": 2.5068999287749287, "grad_norm": 0.6917393803596497, "learning_rate": 2.6082942059833105e-05, "loss": 0.5002, "step": 112630 }, { "epoch": 2.5071225071225074, "grad_norm": 0.6841616034507751, "learning_rate": 2.6059925529315242e-05, "loss": 0.4769, "step": 112640 }, { "epoch": 2.5073450854700856, "grad_norm": 0.5699818730354309, "learning_rate": 2.603691845084346e-05, "loss": 0.434, "step": 112650 }, { "epoch": 2.507567663817664, "grad_norm": 0.5113489031791687, "learning_rate": 2.6013920825667913e-05, "loss": 0.4695, "step": 112660 }, { "epoch": 2.507790242165242, "grad_norm": 0.553261935710907, "learning_rate": 2.5990932655038313e-05, "loss": 0.4824, "step": 112670 }, { "epoch": 2.5080128205128203, "grad_norm": 0.5432833433151245, "learning_rate": 2.5967953940203902e-05, "loss": 0.4808, "step": 112680 }, { "epoch": 2.508235398860399, "grad_norm": 0.8696192502975464, "learning_rate": 2.594498468241333e-05, "loss": 0.5923, "step": 112690 }, { "epoch": 2.508457977207977, "grad_norm": 0.4584144353866577, "learning_rate": 2.5922024882914797e-05, "loss": 0.4772, "step": 112700 }, { "epoch": 2.5086805555555554, "grad_norm": 0.653127908706665, "learning_rate": 2.589907454295597e-05, "loss": 0.4796, "step": 112710 }, { "epoch": 2.508903133903134, "grad_norm": 0.6973347067832947, "learning_rate": 2.587613366378392e-05, "loss": 0.4021, "step": 112720 }, { "epoch": 2.5091257122507122, "grad_norm": 0.6502978205680847, "learning_rate": 2.585320224664536e-05, "loss": 0.4891, "step": 112730 }, { "epoch": 2.5093482905982905, "grad_norm": 0.4977133870124817, "learning_rate": 2.583028029278629e-05, "loss": 0.4121, "step": 112740 }, { "epoch": 2.509570868945869, "grad_norm": 0.5418941378593445, "learning_rate": 2.5807367803452387e-05, "loss": 0.415, "step": 112750 }, { "epoch": 2.5097934472934473, "grad_norm": 0.5099831819534302, "learning_rate": 2.578446477988872e-05, "loss": 0.3829, "step": 112760 }, { "epoch": 2.5100160256410255, "grad_norm": 0.7447317838668823, "learning_rate": 2.5761571223339842e-05, "loss": 0.6159, "step": 112770 }, { "epoch": 2.510238603988604, "grad_norm": 0.6790410876274109, "learning_rate": 2.573868713504983e-05, "loss": 0.4888, "step": 112780 }, { "epoch": 2.5104611823361824, "grad_norm": 0.5578905344009399, "learning_rate": 2.571581251626225e-05, "loss": 0.4079, "step": 112790 }, { "epoch": 2.5106837606837606, "grad_norm": 0.7169860005378723, "learning_rate": 2.5692947368220057e-05, "loss": 0.4423, "step": 112800 }, { "epoch": 2.5109063390313393, "grad_norm": 0.6103445887565613, "learning_rate": 2.567009169216581e-05, "loss": 0.4139, "step": 112810 }, { "epoch": 2.5111289173789175, "grad_norm": 0.5489679574966431, "learning_rate": 2.5647245489341475e-05, "loss": 0.4583, "step": 112820 }, { "epoch": 2.5113514957264957, "grad_norm": 0.6748605370521545, "learning_rate": 2.5624408760988572e-05, "loss": 0.5856, "step": 112830 }, { "epoch": 2.511574074074074, "grad_norm": 0.6140767931938171, "learning_rate": 2.5601581508348037e-05, "loss": 0.4323, "step": 112840 }, { "epoch": 2.511796652421652, "grad_norm": 0.5786221623420715, "learning_rate": 2.557876373266037e-05, "loss": 0.4812, "step": 112850 }, { "epoch": 2.512019230769231, "grad_norm": 0.7067697644233704, "learning_rate": 2.5555955435165424e-05, "loss": 0.5013, "step": 112860 }, { "epoch": 2.512241809116809, "grad_norm": 0.4107857644557953, "learning_rate": 2.5533156617102717e-05, "loss": 0.4277, "step": 112870 }, { "epoch": 2.5124643874643873, "grad_norm": 0.38731175661087036, "learning_rate": 2.5510367279711057e-05, "loss": 0.5341, "step": 112880 }, { "epoch": 2.512686965811966, "grad_norm": 0.7229180335998535, "learning_rate": 2.5487587424228897e-05, "loss": 0.4351, "step": 112890 }, { "epoch": 2.512909544159544, "grad_norm": 0.4220271408557892, "learning_rate": 2.5464817051894097e-05, "loss": 0.5107, "step": 112900 }, { "epoch": 2.5131321225071224, "grad_norm": 0.7265833020210266, "learning_rate": 2.5442056163943994e-05, "loss": 0.5244, "step": 112910 }, { "epoch": 2.513354700854701, "grad_norm": 0.5902624726295471, "learning_rate": 2.5419304761615492e-05, "loss": 0.4583, "step": 112920 }, { "epoch": 2.5135772792022792, "grad_norm": 0.3740479350090027, "learning_rate": 2.539656284614491e-05, "loss": 0.4094, "step": 112930 }, { "epoch": 2.5137998575498575, "grad_norm": 0.7416092157363892, "learning_rate": 2.5373830418767996e-05, "loss": 0.5095, "step": 112940 }, { "epoch": 2.514022435897436, "grad_norm": 0.7496693134307861, "learning_rate": 2.5351107480720093e-05, "loss": 0.428, "step": 112950 }, { "epoch": 2.5142450142450143, "grad_norm": 0.5957769155502319, "learning_rate": 2.532839403323599e-05, "loss": 0.3798, "step": 112960 }, { "epoch": 2.5144675925925926, "grad_norm": 0.48463383316993713, "learning_rate": 2.530569007754995e-05, "loss": 0.4725, "step": 112970 }, { "epoch": 2.5146901709401708, "grad_norm": 0.6827375292778015, "learning_rate": 2.5282995614895733e-05, "loss": 0.367, "step": 112980 }, { "epoch": 2.5149127492877494, "grad_norm": 0.5751871466636658, "learning_rate": 2.526031064650658e-05, "loss": 0.3899, "step": 112990 }, { "epoch": 2.5151353276353277, "grad_norm": 0.6092050671577454, "learning_rate": 2.5237635173615214e-05, "loss": 0.4741, "step": 113000 }, { "epoch": 2.515357905982906, "grad_norm": 0.6803910732269287, "learning_rate": 2.5214969197453765e-05, "loss": 0.5417, "step": 113010 }, { "epoch": 2.515580484330484, "grad_norm": 0.5000934600830078, "learning_rate": 2.5192312719253997e-05, "loss": 0.4847, "step": 113020 }, { "epoch": 2.5158030626780628, "grad_norm": 0.47814443707466125, "learning_rate": 2.5169665740247038e-05, "loss": 0.3029, "step": 113030 }, { "epoch": 2.516025641025641, "grad_norm": 0.5441074371337891, "learning_rate": 2.5147028261663573e-05, "loss": 0.4252, "step": 113040 }, { "epoch": 2.516248219373219, "grad_norm": 0.5150730609893799, "learning_rate": 2.512440028473373e-05, "loss": 0.4169, "step": 113050 }, { "epoch": 2.516470797720798, "grad_norm": 0.45782437920570374, "learning_rate": 2.5101781810687186e-05, "loss": 0.4274, "step": 113060 }, { "epoch": 2.516693376068376, "grad_norm": 0.6092635989189148, "learning_rate": 2.507917284075294e-05, "loss": 0.4242, "step": 113070 }, { "epoch": 2.5169159544159543, "grad_norm": 0.6076841354370117, "learning_rate": 2.5056573376159654e-05, "loss": 0.4427, "step": 113080 }, { "epoch": 2.517138532763533, "grad_norm": 0.46662288904190063, "learning_rate": 2.5033983418135386e-05, "loss": 0.3541, "step": 113090 }, { "epoch": 2.517361111111111, "grad_norm": 0.624786913394928, "learning_rate": 2.501140296790767e-05, "loss": 0.4764, "step": 113100 }, { "epoch": 2.5175836894586894, "grad_norm": 0.31760746240615845, "learning_rate": 2.498883202670359e-05, "loss": 0.4431, "step": 113110 }, { "epoch": 2.517806267806268, "grad_norm": 0.4739936888217926, "learning_rate": 2.4966270595749652e-05, "loss": 0.4813, "step": 113120 }, { "epoch": 2.5180288461538463, "grad_norm": 0.6213451623916626, "learning_rate": 2.49437186762719e-05, "loss": 0.3937, "step": 113130 }, { "epoch": 2.5182514245014245, "grad_norm": 0.5369846820831299, "learning_rate": 2.4921176269495772e-05, "loss": 0.5293, "step": 113140 }, { "epoch": 2.5184740028490027, "grad_norm": 0.30878716707229614, "learning_rate": 2.4898643376646204e-05, "loss": 0.3833, "step": 113150 }, { "epoch": 2.5186965811965814, "grad_norm": 0.6332671046257019, "learning_rate": 2.4876119998947723e-05, "loss": 0.4891, "step": 113160 }, { "epoch": 2.5189191595441596, "grad_norm": 0.47033417224884033, "learning_rate": 2.4853606137624218e-05, "loss": 0.3578, "step": 113170 }, { "epoch": 2.519141737891738, "grad_norm": 0.6995198130607605, "learning_rate": 2.483110179389916e-05, "loss": 0.4035, "step": 113180 }, { "epoch": 2.519364316239316, "grad_norm": 0.5590694546699524, "learning_rate": 2.480860696899543e-05, "loss": 0.5847, "step": 113190 }, { "epoch": 2.5195868945868947, "grad_norm": 0.7594995498657227, "learning_rate": 2.478612166413543e-05, "loss": 0.5278, "step": 113200 }, { "epoch": 2.519809472934473, "grad_norm": 0.6225067973136902, "learning_rate": 2.476364588054101e-05, "loss": 0.3405, "step": 113210 }, { "epoch": 2.520032051282051, "grad_norm": 0.7573350071907043, "learning_rate": 2.4741179619433496e-05, "loss": 0.4253, "step": 113220 }, { "epoch": 2.5202546296296298, "grad_norm": 0.6507413387298584, "learning_rate": 2.4718722882033763e-05, "loss": 0.4175, "step": 113230 }, { "epoch": 2.5202991452991452, "eval_loss": 0.5251317620277405, "eval_runtime": 337.3492, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "step": 113232 }, { "epoch": 2.520477207977208, "grad_norm": 0.6276082992553711, "learning_rate": 2.469627566956214e-05, "loss": 0.4185, "step": 113240 }, { "epoch": 2.520699786324786, "grad_norm": 0.5301204323768616, "learning_rate": 2.4673837983238392e-05, "loss": 0.3982, "step": 113250 }, { "epoch": 2.520922364672365, "grad_norm": 0.5993652939796448, "learning_rate": 2.465140982428187e-05, "loss": 0.5212, "step": 113260 }, { "epoch": 2.521144943019943, "grad_norm": 0.5609367489814758, "learning_rate": 2.462899119391122e-05, "loss": 0.4105, "step": 113270 }, { "epoch": 2.5213675213675213, "grad_norm": 0.5710123777389526, "learning_rate": 2.460658209334481e-05, "loss": 0.3595, "step": 113280 }, { "epoch": 2.5215900997151, "grad_norm": 0.4957076609134674, "learning_rate": 2.458418252380028e-05, "loss": 0.3283, "step": 113290 }, { "epoch": 2.521812678062678, "grad_norm": 0.5248811841011047, "learning_rate": 2.456179248649486e-05, "loss": 0.3924, "step": 113300 }, { "epoch": 2.5220352564102564, "grad_norm": 0.4309212267398834, "learning_rate": 2.4539411982645268e-05, "loss": 0.32, "step": 113310 }, { "epoch": 2.5222578347578346, "grad_norm": 0.5939930081367493, "learning_rate": 2.4517041013467656e-05, "loss": 0.4909, "step": 113320 }, { "epoch": 2.5224804131054133, "grad_norm": 0.725871741771698, "learning_rate": 2.449467958017768e-05, "loss": 0.4288, "step": 113330 }, { "epoch": 2.5227029914529915, "grad_norm": 0.49962177872657776, "learning_rate": 2.447232768399057e-05, "loss": 0.3743, "step": 113340 }, { "epoch": 2.5229255698005697, "grad_norm": 0.618010938167572, "learning_rate": 2.4449985326120794e-05, "loss": 0.5669, "step": 113350 }, { "epoch": 2.523148148148148, "grad_norm": 0.7896900773048401, "learning_rate": 2.442765250778254e-05, "loss": 0.5632, "step": 113360 }, { "epoch": 2.5233707264957266, "grad_norm": 0.5548889636993408, "learning_rate": 2.4405329230189366e-05, "loss": 0.4774, "step": 113370 }, { "epoch": 2.523593304843305, "grad_norm": 0.43557047843933105, "learning_rate": 2.4383015494554374e-05, "loss": 0.3379, "step": 113380 }, { "epoch": 2.523815883190883, "grad_norm": 0.47517427802085876, "learning_rate": 2.4360711302090122e-05, "loss": 0.4197, "step": 113390 }, { "epoch": 2.5240384615384617, "grad_norm": 0.5873776078224182, "learning_rate": 2.4338416654008556e-05, "loss": 0.4637, "step": 113400 }, { "epoch": 2.52426103988604, "grad_norm": 0.574032723903656, "learning_rate": 2.4316131551521215e-05, "loss": 0.4954, "step": 113410 }, { "epoch": 2.524483618233618, "grad_norm": 0.32711052894592285, "learning_rate": 2.4293855995839175e-05, "loss": 0.3539, "step": 113420 }, { "epoch": 2.5247061965811968, "grad_norm": 0.5761905908584595, "learning_rate": 2.4271589988172783e-05, "loss": 0.4722, "step": 113430 }, { "epoch": 2.524928774928775, "grad_norm": 0.4207303524017334, "learning_rate": 2.4249333529732066e-05, "loss": 0.4692, "step": 113440 }, { "epoch": 2.525151353276353, "grad_norm": 0.7163510322570801, "learning_rate": 2.4227086621726414e-05, "loss": 0.496, "step": 113450 }, { "epoch": 2.525373931623932, "grad_norm": 0.6225937604904175, "learning_rate": 2.4204849265364794e-05, "loss": 0.5276, "step": 113460 }, { "epoch": 2.52559650997151, "grad_norm": 0.4533245265483856, "learning_rate": 2.418262146185557e-05, "loss": 0.4885, "step": 113470 }, { "epoch": 2.5258190883190883, "grad_norm": 0.5328688621520996, "learning_rate": 2.416040321240667e-05, "loss": 0.5348, "step": 113480 }, { "epoch": 2.5260416666666665, "grad_norm": 0.4904916286468506, "learning_rate": 2.4138194518225344e-05, "loss": 0.4447, "step": 113490 }, { "epoch": 2.526264245014245, "grad_norm": 0.6268056631088257, "learning_rate": 2.411599538051852e-05, "loss": 0.5405, "step": 113500 }, { "epoch": 2.5264868233618234, "grad_norm": 0.5416222810745239, "learning_rate": 2.40938058004925e-05, "loss": 0.4533, "step": 113510 }, { "epoch": 2.5267094017094016, "grad_norm": 0.505547285079956, "learning_rate": 2.407162577935309e-05, "loss": 0.3999, "step": 113520 }, { "epoch": 2.52693198005698, "grad_norm": 0.6490024924278259, "learning_rate": 2.4049455318305536e-05, "loss": 0.4791, "step": 113530 }, { "epoch": 2.5271545584045585, "grad_norm": 0.7141279578208923, "learning_rate": 2.4027294418554602e-05, "loss": 0.5274, "step": 113540 }, { "epoch": 2.5273771367521367, "grad_norm": 0.6870497465133667, "learning_rate": 2.400514308130457e-05, "loss": 0.4472, "step": 113550 }, { "epoch": 2.527599715099715, "grad_norm": 0.738166332244873, "learning_rate": 2.398300130775917e-05, "loss": 0.4365, "step": 113560 }, { "epoch": 2.5278222934472936, "grad_norm": 0.334552526473999, "learning_rate": 2.3960869099121542e-05, "loss": 0.3378, "step": 113570 }, { "epoch": 2.528044871794872, "grad_norm": 0.6833900213241577, "learning_rate": 2.3938746456594375e-05, "loss": 0.2975, "step": 113580 }, { "epoch": 2.52826745014245, "grad_norm": 0.8802908658981323, "learning_rate": 2.3916633381379862e-05, "loss": 0.5202, "step": 113590 }, { "epoch": 2.5284900284900287, "grad_norm": 0.641450047492981, "learning_rate": 2.389452987467966e-05, "loss": 0.4123, "step": 113600 }, { "epoch": 2.528712606837607, "grad_norm": 0.7287760972976685, "learning_rate": 2.387243593769486e-05, "loss": 0.4331, "step": 113610 }, { "epoch": 2.528935185185185, "grad_norm": 0.7095321416854858, "learning_rate": 2.3850351571626118e-05, "loss": 0.4529, "step": 113620 }, { "epoch": 2.529157763532764, "grad_norm": 0.43919217586517334, "learning_rate": 2.3828276777673432e-05, "loss": 0.5158, "step": 113630 }, { "epoch": 2.529380341880342, "grad_norm": 0.4373423755168915, "learning_rate": 2.3806211557036394e-05, "loss": 0.5182, "step": 113640 }, { "epoch": 2.52960292022792, "grad_norm": 0.5198220610618591, "learning_rate": 2.3784155910914118e-05, "loss": 0.4346, "step": 113650 }, { "epoch": 2.5298254985754984, "grad_norm": 0.5108519792556763, "learning_rate": 2.376210984050502e-05, "loss": 0.4769, "step": 113660 }, { "epoch": 2.5300480769230766, "grad_norm": 0.5933377146720886, "learning_rate": 2.374007334700714e-05, "loss": 0.4075, "step": 113670 }, { "epoch": 2.5302706552706553, "grad_norm": 0.5652526021003723, "learning_rate": 2.371804643161797e-05, "loss": 0.4435, "step": 113680 }, { "epoch": 2.5304932336182335, "grad_norm": 0.3907783627510071, "learning_rate": 2.369602909553448e-05, "loss": 0.4308, "step": 113690 }, { "epoch": 2.5307158119658117, "grad_norm": 0.49446383118629456, "learning_rate": 2.3674021339953134e-05, "loss": 0.3531, "step": 113700 }, { "epoch": 2.5309383903133904, "grad_norm": 0.7018763422966003, "learning_rate": 2.365202316606978e-05, "loss": 0.4816, "step": 113710 }, { "epoch": 2.5311609686609686, "grad_norm": 0.9009945392608643, "learning_rate": 2.3630034575079842e-05, "loss": 0.4595, "step": 113720 }, { "epoch": 2.531383547008547, "grad_norm": 0.716633141040802, "learning_rate": 2.36080555681782e-05, "loss": 0.4499, "step": 113730 }, { "epoch": 2.5316061253561255, "grad_norm": 0.6000513434410095, "learning_rate": 2.3586086146559237e-05, "loss": 0.4465, "step": 113740 }, { "epoch": 2.5318287037037037, "grad_norm": 0.5333988666534424, "learning_rate": 2.3564126311416757e-05, "loss": 0.4665, "step": 113750 }, { "epoch": 2.532051282051282, "grad_norm": 0.7265472412109375, "learning_rate": 2.3542176063944154e-05, "loss": 0.4582, "step": 113760 }, { "epoch": 2.5322738603988606, "grad_norm": 0.3761736750602722, "learning_rate": 2.35202354053341e-05, "loss": 0.4536, "step": 113770 }, { "epoch": 2.532496438746439, "grad_norm": 0.49985337257385254, "learning_rate": 2.3498304336778974e-05, "loss": 0.3787, "step": 113780 }, { "epoch": 2.532719017094017, "grad_norm": 0.4030935764312744, "learning_rate": 2.3476382859470445e-05, "loss": 0.3677, "step": 113790 }, { "epoch": 2.5329415954415957, "grad_norm": 0.5702828764915466, "learning_rate": 2.3454470974599763e-05, "loss": 0.4384, "step": 113800 }, { "epoch": 2.533164173789174, "grad_norm": 0.6399329900741577, "learning_rate": 2.343256868335768e-05, "loss": 0.4785, "step": 113810 }, { "epoch": 2.533386752136752, "grad_norm": 0.6407581567764282, "learning_rate": 2.3410675986934338e-05, "loss": 0.4976, "step": 113820 }, { "epoch": 2.5336093304843303, "grad_norm": 0.6700713634490967, "learning_rate": 2.3388792886519428e-05, "loss": 0.4077, "step": 113830 }, { "epoch": 2.5338319088319086, "grad_norm": 0.37148866057395935, "learning_rate": 2.336691938330213e-05, "loss": 0.4419, "step": 113840 }, { "epoch": 2.5340544871794872, "grad_norm": 0.5929746627807617, "learning_rate": 2.334505547847101e-05, "loss": 0.4789, "step": 113850 }, { "epoch": 2.5342770655270654, "grad_norm": 0.4155455529689789, "learning_rate": 2.3323201173214182e-05, "loss": 0.4486, "step": 113860 }, { "epoch": 2.5344996438746437, "grad_norm": 0.6532012224197388, "learning_rate": 2.3301356468719227e-05, "loss": 0.5254, "step": 113870 }, { "epoch": 2.5347222222222223, "grad_norm": 0.30867519974708557, "learning_rate": 2.327952136617324e-05, "loss": 0.4076, "step": 113880 }, { "epoch": 2.5349448005698005, "grad_norm": 0.5477968454360962, "learning_rate": 2.325769586676272e-05, "loss": 0.3542, "step": 113890 }, { "epoch": 2.5351673789173788, "grad_norm": 0.502474844455719, "learning_rate": 2.323587997167371e-05, "loss": 0.386, "step": 113900 }, { "epoch": 2.5353899572649574, "grad_norm": 0.5856671333312988, "learning_rate": 2.3214073682091676e-05, "loss": 0.431, "step": 113910 }, { "epoch": 2.5356125356125356, "grad_norm": 0.7741441130638123, "learning_rate": 2.3192276999201633e-05, "loss": 0.502, "step": 113920 }, { "epoch": 2.535835113960114, "grad_norm": 0.46092262864112854, "learning_rate": 2.3170489924187956e-05, "loss": 0.4478, "step": 113930 }, { "epoch": 2.5360576923076925, "grad_norm": 0.5415853261947632, "learning_rate": 2.3148712458234623e-05, "loss": 0.5132, "step": 113940 }, { "epoch": 2.5362802706552707, "grad_norm": 0.6181982159614563, "learning_rate": 2.3126944602525026e-05, "loss": 0.3757, "step": 113950 }, { "epoch": 2.536502849002849, "grad_norm": 0.5846037864685059, "learning_rate": 2.310518635824206e-05, "loss": 0.5341, "step": 113960 }, { "epoch": 2.5367254273504276, "grad_norm": 0.8839473128318787, "learning_rate": 2.3083437726568092e-05, "loss": 0.5566, "step": 113970 }, { "epoch": 2.536948005698006, "grad_norm": 0.7175643444061279, "learning_rate": 2.3061698708684953e-05, "loss": 0.4288, "step": 113980 }, { "epoch": 2.537170584045584, "grad_norm": 0.6663437485694885, "learning_rate": 2.3039969305773945e-05, "loss": 0.4593, "step": 113990 }, { "epoch": 2.5373931623931623, "grad_norm": 0.5830166339874268, "learning_rate": 2.3018249519015854e-05, "loss": 0.5306, "step": 114000 }, { "epoch": 2.5376157407407405, "grad_norm": 0.4129011929035187, "learning_rate": 2.2996539349590985e-05, "loss": 0.3761, "step": 114010 }, { "epoch": 2.537838319088319, "grad_norm": 0.42609769105911255, "learning_rate": 2.2974838798679055e-05, "loss": 0.4356, "step": 114020 }, { "epoch": 2.5380608974358974, "grad_norm": 0.3722520172595978, "learning_rate": 2.2953147867459324e-05, "loss": 0.4635, "step": 114030 }, { "epoch": 2.5382834757834756, "grad_norm": 0.5488028526306152, "learning_rate": 2.2931466557110492e-05, "loss": 0.3463, "step": 114040 }, { "epoch": 2.5385060541310542, "grad_norm": 0.4920119345188141, "learning_rate": 2.290979486881073e-05, "loss": 0.5244, "step": 114050 }, { "epoch": 2.5387286324786325, "grad_norm": 0.39283880591392517, "learning_rate": 2.288813280373765e-05, "loss": 0.4253, "step": 114060 }, { "epoch": 2.5389512108262107, "grad_norm": 0.3846190571784973, "learning_rate": 2.2866480363068422e-05, "loss": 0.4048, "step": 114070 }, { "epoch": 2.5391737891737893, "grad_norm": 0.4565149247646332, "learning_rate": 2.2844837547979657e-05, "loss": 0.3872, "step": 114080 }, { "epoch": 2.5393963675213675, "grad_norm": 0.3994213044643402, "learning_rate": 2.2823204359647445e-05, "loss": 0.4686, "step": 114090 }, { "epoch": 2.5396189458689458, "grad_norm": 0.523123025894165, "learning_rate": 2.2801580799247367e-05, "loss": 0.4059, "step": 114100 }, { "epoch": 2.5398415242165244, "grad_norm": 0.5251439213752747, "learning_rate": 2.2779966867954426e-05, "loss": 0.3668, "step": 114110 }, { "epoch": 2.5400641025641026, "grad_norm": 0.42427927255630493, "learning_rate": 2.2758362566943236e-05, "loss": 0.3829, "step": 114120 }, { "epoch": 2.540286680911681, "grad_norm": 0.5992891788482666, "learning_rate": 2.2736767897387655e-05, "loss": 0.5138, "step": 114130 }, { "epoch": 2.5405092592592595, "grad_norm": 0.9765251278877258, "learning_rate": 2.2715182860461214e-05, "loss": 0.503, "step": 114140 }, { "epoch": 2.5407318376068377, "grad_norm": 0.6045100092887878, "learning_rate": 2.2693607457336885e-05, "loss": 0.4657, "step": 114150 }, { "epoch": 2.540954415954416, "grad_norm": 0.5176226496696472, "learning_rate": 2.2672041689187085e-05, "loss": 0.5246, "step": 114160 }, { "epoch": 2.541176994301994, "grad_norm": 0.5346884727478027, "learning_rate": 2.26504855571837e-05, "loss": 0.4133, "step": 114170 }, { "epoch": 2.5413995726495724, "grad_norm": 0.42462992668151855, "learning_rate": 2.2628939062498146e-05, "loss": 0.416, "step": 114180 }, { "epoch": 2.541622150997151, "grad_norm": 0.4547101557254791, "learning_rate": 2.2607402206301243e-05, "loss": 0.4994, "step": 114190 }, { "epoch": 2.5418447293447293, "grad_norm": 0.6860995292663574, "learning_rate": 2.25858749897633e-05, "loss": 0.3547, "step": 114200 }, { "epoch": 2.5420673076923075, "grad_norm": 0.29589366912841797, "learning_rate": 2.256435741405414e-05, "loss": 0.4633, "step": 114210 }, { "epoch": 2.542289886039886, "grad_norm": 0.8256801962852478, "learning_rate": 2.254284948034304e-05, "loss": 0.3776, "step": 114220 }, { "epoch": 2.5425124643874644, "grad_norm": 0.5243028998374939, "learning_rate": 2.252135118979879e-05, "loss": 0.3772, "step": 114230 }, { "epoch": 2.5427350427350426, "grad_norm": 0.6656879186630249, "learning_rate": 2.24998625435896e-05, "loss": 0.5634, "step": 114240 }, { "epoch": 2.5429576210826212, "grad_norm": 0.5538183450698853, "learning_rate": 2.2478383542883208e-05, "loss": 0.47, "step": 114250 }, { "epoch": 2.5431801994301995, "grad_norm": 0.4275704324245453, "learning_rate": 2.245691418884679e-05, "loss": 0.4122, "step": 114260 }, { "epoch": 2.5434027777777777, "grad_norm": 0.4692855179309845, "learning_rate": 2.2435454482646966e-05, "loss": 0.4622, "step": 114270 }, { "epoch": 2.5436253561253563, "grad_norm": 0.4590589702129364, "learning_rate": 2.2414004425449918e-05, "loss": 0.3253, "step": 114280 }, { "epoch": 2.5438479344729346, "grad_norm": 0.8088108897209167, "learning_rate": 2.2392564018421247e-05, "loss": 0.4271, "step": 114290 }, { "epoch": 2.5440705128205128, "grad_norm": 0.3927932381629944, "learning_rate": 2.237113326272604e-05, "loss": 0.514, "step": 114300 }, { "epoch": 2.5442930911680914, "grad_norm": 0.6496009230613708, "learning_rate": 2.23497121595289e-05, "loss": 0.6287, "step": 114310 }, { "epoch": 2.5445156695156697, "grad_norm": 0.6508887410163879, "learning_rate": 2.2328300709993788e-05, "loss": 0.425, "step": 114320 }, { "epoch": 2.544738247863248, "grad_norm": 0.8003069162368774, "learning_rate": 2.2306898915284324e-05, "loss": 0.4814, "step": 114330 }, { "epoch": 2.544960826210826, "grad_norm": 0.6616761088371277, "learning_rate": 2.2285506776563382e-05, "loss": 0.6248, "step": 114340 }, { "epoch": 2.5451834045584043, "grad_norm": 0.6189643144607544, "learning_rate": 2.2264124294993493e-05, "loss": 0.4891, "step": 114350 }, { "epoch": 2.545405982905983, "grad_norm": 0.4615582227706909, "learning_rate": 2.22427514717366e-05, "loss": 0.4754, "step": 114360 }, { "epoch": 2.545628561253561, "grad_norm": 0.44101470708847046, "learning_rate": 2.2221388307954106e-05, "loss": 0.4633, "step": 114370 }, { "epoch": 2.5458511396011394, "grad_norm": 0.4261623024940491, "learning_rate": 2.2200034804806902e-05, "loss": 0.36, "step": 114380 }, { "epoch": 2.546073717948718, "grad_norm": 0.910829484462738, "learning_rate": 2.217869096345535e-05, "loss": 0.4595, "step": 114390 }, { "epoch": 2.5462962962962963, "grad_norm": 0.6673019528388977, "learning_rate": 2.2157356785059347e-05, "loss": 0.4857, "step": 114400 }, { "epoch": 2.5465188746438745, "grad_norm": 0.7158443331718445, "learning_rate": 2.213603227077814e-05, "loss": 0.6056, "step": 114410 }, { "epoch": 2.546741452991453, "grad_norm": 0.661539614200592, "learning_rate": 2.2114717421770535e-05, "loss": 0.5789, "step": 114420 }, { "epoch": 2.5469640313390314, "grad_norm": 0.5644783973693848, "learning_rate": 2.209341223919481e-05, "loss": 0.4612, "step": 114430 }, { "epoch": 2.5471866096866096, "grad_norm": 0.4332530200481415, "learning_rate": 2.2072116724208747e-05, "loss": 0.3601, "step": 114440 }, { "epoch": 2.5474091880341883, "grad_norm": 0.47315365076065063, "learning_rate": 2.2050830877969485e-05, "loss": 0.4737, "step": 114450 }, { "epoch": 2.5476317663817665, "grad_norm": 0.6797741651535034, "learning_rate": 2.2029554701633725e-05, "loss": 0.5661, "step": 114460 }, { "epoch": 2.5478543447293447, "grad_norm": 0.6491184830665588, "learning_rate": 2.200828819635772e-05, "loss": 0.4086, "step": 114470 }, { "epoch": 2.5480769230769234, "grad_norm": 0.5176466703414917, "learning_rate": 2.1987031363297005e-05, "loss": 0.406, "step": 114480 }, { "epoch": 2.5482995014245016, "grad_norm": 0.6204827427864075, "learning_rate": 2.1965784203606732e-05, "loss": 0.4653, "step": 114490 }, { "epoch": 2.54852207977208, "grad_norm": 0.6510611772537231, "learning_rate": 2.1944546718441483e-05, "loss": 0.511, "step": 114500 }, { "epoch": 2.548744658119658, "grad_norm": 0.7228637337684631, "learning_rate": 2.1923318908955338e-05, "loss": 0.4948, "step": 114510 }, { "epoch": 2.548967236467236, "grad_norm": 0.4623255729675293, "learning_rate": 2.1902100776301815e-05, "loss": 0.4478, "step": 114520 }, { "epoch": 2.549189814814815, "grad_norm": 0.5250282883644104, "learning_rate": 2.188089232163393e-05, "loss": 0.3574, "step": 114530 }, { "epoch": 2.549412393162393, "grad_norm": 0.5935671925544739, "learning_rate": 2.185969354610422e-05, "loss": 0.4522, "step": 114540 }, { "epoch": 2.5496349715099713, "grad_norm": 0.4981185495853424, "learning_rate": 2.183850445086455e-05, "loss": 0.3517, "step": 114550 }, { "epoch": 2.54985754985755, "grad_norm": 0.5429505109786987, "learning_rate": 2.1817325037066393e-05, "loss": 0.4767, "step": 114560 }, { "epoch": 2.550080128205128, "grad_norm": 0.6192172765731812, "learning_rate": 2.179615530586072e-05, "loss": 0.5638, "step": 114570 }, { "epoch": 2.5503027065527064, "grad_norm": 0.4617951512336731, "learning_rate": 2.177499525839779e-05, "loss": 0.4171, "step": 114580 }, { "epoch": 2.550525284900285, "grad_norm": 0.5843008756637573, "learning_rate": 2.1753844895827546e-05, "loss": 0.5141, "step": 114590 }, { "epoch": 2.5507478632478633, "grad_norm": 0.41912171244621277, "learning_rate": 2.173270421929927e-05, "loss": 0.4071, "step": 114600 }, { "epoch": 2.5509704415954415, "grad_norm": 0.5310803055763245, "learning_rate": 2.1711573229961822e-05, "loss": 0.4288, "step": 114610 }, { "epoch": 2.55119301994302, "grad_norm": 0.6280931234359741, "learning_rate": 2.1690451928963396e-05, "loss": 0.5028, "step": 114620 }, { "epoch": 2.5514155982905984, "grad_norm": 0.42037948966026306, "learning_rate": 2.1669340317451803e-05, "loss": 0.4915, "step": 114630 }, { "epoch": 2.5516381766381766, "grad_norm": 0.7464925646781921, "learning_rate": 2.1648238396574237e-05, "loss": 0.4069, "step": 114640 }, { "epoch": 2.551860754985755, "grad_norm": 0.679133951663971, "learning_rate": 2.1627146167477385e-05, "loss": 0.4133, "step": 114650 }, { "epoch": 2.5520833333333335, "grad_norm": 0.5979849100112915, "learning_rate": 2.1606063631307437e-05, "loss": 0.4781, "step": 114660 }, { "epoch": 2.5523059116809117, "grad_norm": 0.5203874111175537, "learning_rate": 2.15849907892101e-05, "loss": 0.3303, "step": 114670 }, { "epoch": 2.55252849002849, "grad_norm": 0.43817025423049927, "learning_rate": 2.1563927642330352e-05, "loss": 0.4806, "step": 114680 }, { "epoch": 2.552751068376068, "grad_norm": 0.4426831603050232, "learning_rate": 2.1542874191812866e-05, "loss": 0.5032, "step": 114690 }, { "epoch": 2.552973646723647, "grad_norm": 0.42767632007598877, "learning_rate": 2.1521830438801715e-05, "loss": 0.44, "step": 114700 }, { "epoch": 2.553196225071225, "grad_norm": 0.3472227454185486, "learning_rate": 2.150079638444038e-05, "loss": 0.4662, "step": 114710 }, { "epoch": 2.5534188034188032, "grad_norm": 0.464819073677063, "learning_rate": 2.147977202987188e-05, "loss": 0.4489, "step": 114720 }, { "epoch": 2.553641381766382, "grad_norm": 0.6429574489593506, "learning_rate": 2.1458757376238724e-05, "loss": 0.42, "step": 114730 }, { "epoch": 2.55386396011396, "grad_norm": 0.598283588886261, "learning_rate": 2.1437752424682843e-05, "loss": 0.5584, "step": 114740 }, { "epoch": 2.5540865384615383, "grad_norm": 0.6327041983604431, "learning_rate": 2.1416757176345724e-05, "loss": 0.353, "step": 114750 }, { "epoch": 2.554309116809117, "grad_norm": 0.5017842650413513, "learning_rate": 2.1395771632368168e-05, "loss": 0.4499, "step": 114760 }, { "epoch": 2.554531695156695, "grad_norm": 0.8473617434501648, "learning_rate": 2.1374795793890612e-05, "loss": 0.5589, "step": 114770 }, { "epoch": 2.5547542735042734, "grad_norm": 0.3383786678314209, "learning_rate": 2.135382966205286e-05, "loss": 0.5012, "step": 114780 }, { "epoch": 2.554976851851852, "grad_norm": 0.48342782258987427, "learning_rate": 2.1332873237994245e-05, "loss": 0.3938, "step": 114790 }, { "epoch": 2.5551994301994303, "grad_norm": 0.4264233410358429, "learning_rate": 2.1311926522853587e-05, "loss": 0.519, "step": 114800 }, { "epoch": 2.5554220085470085, "grad_norm": 0.41103747487068176, "learning_rate": 2.1290989517769133e-05, "loss": 0.3774, "step": 114810 }, { "epoch": 2.5556445868945867, "grad_norm": 0.5277079343795776, "learning_rate": 2.1270062223878595e-05, "loss": 0.4255, "step": 114820 }, { "epoch": 2.5558671652421654, "grad_norm": 0.894993245601654, "learning_rate": 2.124914464231922e-05, "loss": 0.4757, "step": 114830 }, { "epoch": 2.5560897435897436, "grad_norm": 0.5365956425666809, "learning_rate": 2.1228236774227605e-05, "loss": 0.497, "step": 114840 }, { "epoch": 2.556312321937322, "grad_norm": 0.5834557414054871, "learning_rate": 2.120733862073998e-05, "loss": 0.4723, "step": 114850 }, { "epoch": 2.5565349002849, "grad_norm": 0.7872776985168457, "learning_rate": 2.1186450182991925e-05, "loss": 0.5038, "step": 114860 }, { "epoch": 2.5567574786324787, "grad_norm": 0.5317719578742981, "learning_rate": 2.116557146211855e-05, "loss": 0.4716, "step": 114870 }, { "epoch": 2.556980056980057, "grad_norm": 0.6921097636222839, "learning_rate": 2.1144702459254416e-05, "loss": 0.5058, "step": 114880 }, { "epoch": 2.557202635327635, "grad_norm": 0.5789677500724792, "learning_rate": 2.112384317553362e-05, "loss": 0.3581, "step": 114890 }, { "epoch": 2.557425213675214, "grad_norm": 0.7021106481552124, "learning_rate": 2.1102993612089584e-05, "loss": 0.3649, "step": 114900 }, { "epoch": 2.557647792022792, "grad_norm": 0.5997939705848694, "learning_rate": 2.1082153770055312e-05, "loss": 0.4336, "step": 114910 }, { "epoch": 2.5578703703703702, "grad_norm": 0.663595974445343, "learning_rate": 2.106132365056328e-05, "loss": 0.4295, "step": 114920 }, { "epoch": 2.558092948717949, "grad_norm": 0.7464686036109924, "learning_rate": 2.1040503254745404e-05, "loss": 0.4655, "step": 114930 }, { "epoch": 2.558315527065527, "grad_norm": 0.5191732048988342, "learning_rate": 2.1019692583733087e-05, "loss": 0.4615, "step": 114940 }, { "epoch": 2.5585381054131053, "grad_norm": 0.5464897155761719, "learning_rate": 2.0998891638657247e-05, "loss": 0.3702, "step": 114950 }, { "epoch": 2.558760683760684, "grad_norm": 0.4492281973361969, "learning_rate": 2.0978100420648117e-05, "loss": 0.4609, "step": 114960 }, { "epoch": 2.558983262108262, "grad_norm": 0.47617268562316895, "learning_rate": 2.095731893083561e-05, "loss": 0.3809, "step": 114970 }, { "epoch": 2.5592058404558404, "grad_norm": 0.5587918758392334, "learning_rate": 2.0936547170348917e-05, "loss": 0.4479, "step": 114980 }, { "epoch": 2.5594284188034186, "grad_norm": 0.5930055379867554, "learning_rate": 2.0915785140316845e-05, "loss": 0.5025, "step": 114990 }, { "epoch": 2.5596509971509973, "grad_norm": 0.5976328253746033, "learning_rate": 2.08950328418676e-05, "loss": 0.4871, "step": 115000 }, { "epoch": 2.5598735754985755, "grad_norm": 0.8843815326690674, "learning_rate": 2.0874290276128906e-05, "loss": 0.5638, "step": 115010 }, { "epoch": 2.5600961538461537, "grad_norm": 0.5686395764350891, "learning_rate": 2.0853557444227922e-05, "loss": 0.4578, "step": 115020 }, { "epoch": 2.560318732193732, "grad_norm": 0.4919164180755615, "learning_rate": 2.083283434729131e-05, "loss": 0.3756, "step": 115030 }, { "epoch": 2.5605413105413106, "grad_norm": 0.5874354839324951, "learning_rate": 2.0812120986445116e-05, "loss": 0.4839, "step": 115040 }, { "epoch": 2.560763888888889, "grad_norm": 0.6515677571296692, "learning_rate": 2.079141736281498e-05, "loss": 0.4587, "step": 115050 }, { "epoch": 2.560986467236467, "grad_norm": 0.5897663831710815, "learning_rate": 2.077072347752591e-05, "loss": 0.387, "step": 115060 }, { "epoch": 2.5612090455840457, "grad_norm": 0.4548929035663605, "learning_rate": 2.0750039331702452e-05, "loss": 0.4837, "step": 115070 }, { "epoch": 2.561431623931624, "grad_norm": 0.7838677763938904, "learning_rate": 2.0729364926468598e-05, "loss": 0.4296, "step": 115080 }, { "epoch": 2.561654202279202, "grad_norm": 0.34927403926849365, "learning_rate": 2.0708700262947843e-05, "loss": 0.4602, "step": 115090 }, { "epoch": 2.561876780626781, "grad_norm": 0.5421298742294312, "learning_rate": 2.0688045342263075e-05, "loss": 0.437, "step": 115100 }, { "epoch": 2.562099358974359, "grad_norm": 0.6211506724357605, "learning_rate": 2.066740016553672e-05, "loss": 0.4838, "step": 115110 }, { "epoch": 2.5623219373219372, "grad_norm": 0.5304540395736694, "learning_rate": 2.0646764733890645e-05, "loss": 0.4963, "step": 115120 }, { "epoch": 2.562544515669516, "grad_norm": 0.705946683883667, "learning_rate": 2.062613904844617e-05, "loss": 0.4476, "step": 115130 }, { "epoch": 2.562767094017094, "grad_norm": 0.5351716876029968, "learning_rate": 2.0605523110324155e-05, "loss": 0.4003, "step": 115140 }, { "epoch": 2.5629896723646723, "grad_norm": 0.47889044880867004, "learning_rate": 2.058491692064488e-05, "loss": 0.5015, "step": 115150 }, { "epoch": 2.5632122507122506, "grad_norm": 0.7898067235946655, "learning_rate": 2.0564320480528076e-05, "loss": 0.4364, "step": 115160 }, { "epoch": 2.5634348290598292, "grad_norm": 0.4348805248737335, "learning_rate": 2.0543733791093046e-05, "loss": 0.4275, "step": 115170 }, { "epoch": 2.5636574074074074, "grad_norm": 0.3964775800704956, "learning_rate": 2.052315685345838e-05, "loss": 0.4295, "step": 115180 }, { "epoch": 2.5638799857549857, "grad_norm": 0.6023658514022827, "learning_rate": 2.05025896687423e-05, "loss": 0.525, "step": 115190 }, { "epoch": 2.564102564102564, "grad_norm": 0.7158157825469971, "learning_rate": 2.0482032238062464e-05, "loss": 0.469, "step": 115200 }, { "epoch": 2.5643251424501425, "grad_norm": 0.5601397752761841, "learning_rate": 2.046148456253594e-05, "loss": 0.5173, "step": 115210 }, { "epoch": 2.5645477207977208, "grad_norm": 0.36717069149017334, "learning_rate": 2.0440946643279313e-05, "loss": 0.3301, "step": 115220 }, { "epoch": 2.564770299145299, "grad_norm": 0.35347598791122437, "learning_rate": 2.0420418481408676e-05, "loss": 0.3723, "step": 115230 }, { "epoch": 2.5649928774928776, "grad_norm": 0.30606773495674133, "learning_rate": 2.039990007803949e-05, "loss": 0.4165, "step": 115240 }, { "epoch": 2.565215455840456, "grad_norm": 0.5062880516052246, "learning_rate": 2.0379391434286778e-05, "loss": 0.4515, "step": 115250 }, { "epoch": 2.565438034188034, "grad_norm": 0.6843504309654236, "learning_rate": 2.0358892551264953e-05, "loss": 0.3867, "step": 115260 }, { "epoch": 2.5656606125356127, "grad_norm": 0.5426856875419617, "learning_rate": 2.0338403430087972e-05, "loss": 0.3925, "step": 115270 }, { "epoch": 2.565883190883191, "grad_norm": 0.5436319708824158, "learning_rate": 2.031792407186921e-05, "loss": 0.5195, "step": 115280 }, { "epoch": 2.566105769230769, "grad_norm": 0.38191789388656616, "learning_rate": 2.029745447772158e-05, "loss": 0.4129, "step": 115290 }, { "epoch": 2.566328347578348, "grad_norm": 0.48173078894615173, "learning_rate": 2.0276994648757364e-05, "loss": 0.3255, "step": 115300 }, { "epoch": 2.566550925925926, "grad_norm": 0.6359217166900635, "learning_rate": 2.025654458608841e-05, "loss": 0.5472, "step": 115310 }, { "epoch": 2.5667735042735043, "grad_norm": 0.5844587087631226, "learning_rate": 2.0236104290825962e-05, "loss": 0.4527, "step": 115320 }, { "epoch": 2.5669960826210825, "grad_norm": 0.5271782875061035, "learning_rate": 2.0215673764080734e-05, "loss": 0.4227, "step": 115330 }, { "epoch": 2.5672186609686607, "grad_norm": 0.7398836016654968, "learning_rate": 2.019525300696301e-05, "loss": 0.5061, "step": 115340 }, { "epoch": 2.5674412393162394, "grad_norm": 0.44776272773742676, "learning_rate": 2.0174842020582396e-05, "loss": 0.4443, "step": 115350 }, { "epoch": 2.5676638176638176, "grad_norm": 0.6261894702911377, "learning_rate": 2.0154440806048137e-05, "loss": 0.4071, "step": 115360 }, { "epoch": 2.567886396011396, "grad_norm": 0.3250281810760498, "learning_rate": 2.0134049364468765e-05, "loss": 0.376, "step": 115370 }, { "epoch": 2.5681089743589745, "grad_norm": 0.46543797850608826, "learning_rate": 2.0113667696952377e-05, "loss": 0.4741, "step": 115380 }, { "epoch": 2.5683315527065527, "grad_norm": 0.4046928286552429, "learning_rate": 2.0093295804606594e-05, "loss": 0.4669, "step": 115390 }, { "epoch": 2.568554131054131, "grad_norm": 0.6142890453338623, "learning_rate": 2.0072933688538354e-05, "loss": 0.3958, "step": 115400 }, { "epoch": 2.5687767094017095, "grad_norm": 0.478249728679657, "learning_rate": 2.0052581349854192e-05, "loss": 0.558, "step": 115410 }, { "epoch": 2.5689992877492878, "grad_norm": 0.5002285242080688, "learning_rate": 2.0032238789660074e-05, "loss": 0.3963, "step": 115420 }, { "epoch": 2.569221866096866, "grad_norm": 0.5920494794845581, "learning_rate": 2.001190600906142e-05, "loss": 0.3864, "step": 115430 }, { "epoch": 2.5694444444444446, "grad_norm": 0.5024237036705017, "learning_rate": 1.9991583009163152e-05, "loss": 0.5187, "step": 115440 }, { "epoch": 2.569667022792023, "grad_norm": 0.46773669123649597, "learning_rate": 1.9971269791069648e-05, "loss": 0.3918, "step": 115450 }, { "epoch": 2.569889601139601, "grad_norm": 0.5565283298492432, "learning_rate": 1.9950966355884694e-05, "loss": 0.4201, "step": 115460 }, { "epoch": 2.5701121794871797, "grad_norm": 0.5695311427116394, "learning_rate": 1.9930672704711628e-05, "loss": 0.4914, "step": 115470 }, { "epoch": 2.570334757834758, "grad_norm": 0.7093086242675781, "learning_rate": 1.9910388838653215e-05, "loss": 0.495, "step": 115480 }, { "epoch": 2.570557336182336, "grad_norm": 0.5606511831283569, "learning_rate": 1.9890114758811728e-05, "loss": 0.3263, "step": 115490 }, { "epoch": 2.5707799145299144, "grad_norm": 0.5101292729377747, "learning_rate": 1.986985046628882e-05, "loss": 0.4229, "step": 115500 }, { "epoch": 2.5710024928774926, "grad_norm": 0.7715269327163696, "learning_rate": 1.9849595962185698e-05, "loss": 0.4277, "step": 115510 }, { "epoch": 2.5712250712250713, "grad_norm": 0.469260573387146, "learning_rate": 1.9829351247603058e-05, "loss": 0.4764, "step": 115520 }, { "epoch": 2.5714476495726495, "grad_norm": 0.5756816267967224, "learning_rate": 1.9809116323640908e-05, "loss": 0.3906, "step": 115530 }, { "epoch": 2.5716702279202277, "grad_norm": 0.5862997174263, "learning_rate": 1.9788891191398907e-05, "loss": 0.5252, "step": 115540 }, { "epoch": 2.5718928062678064, "grad_norm": 0.6412315964698792, "learning_rate": 1.9768675851976083e-05, "loss": 0.5215, "step": 115550 }, { "epoch": 2.5721153846153846, "grad_norm": 0.6483317017555237, "learning_rate": 1.9748470306470934e-05, "loss": 0.4839, "step": 115560 }, { "epoch": 2.572337962962963, "grad_norm": 0.4820649325847626, "learning_rate": 1.9728274555981498e-05, "loss": 0.4263, "step": 115570 }, { "epoch": 2.5725605413105415, "grad_norm": 0.557349681854248, "learning_rate": 1.970808860160518e-05, "loss": 0.4429, "step": 115580 }, { "epoch": 2.5727831196581197, "grad_norm": 0.5963757038116455, "learning_rate": 1.968791244443897e-05, "loss": 0.5557, "step": 115590 }, { "epoch": 2.573005698005698, "grad_norm": 0.6934390664100647, "learning_rate": 1.9667746085579175e-05, "loss": 0.58, "step": 115600 }, { "epoch": 2.5732282763532766, "grad_norm": 0.8542941808700562, "learning_rate": 1.964758952612167e-05, "loss": 0.4716, "step": 115610 }, { "epoch": 2.5734508547008548, "grad_norm": 0.42587122321128845, "learning_rate": 1.9627442767161843e-05, "loss": 0.45, "step": 115620 }, { "epoch": 2.573673433048433, "grad_norm": 0.704937219619751, "learning_rate": 1.9607305809794395e-05, "loss": 0.5379, "step": 115630 }, { "epoch": 2.5738960113960117, "grad_norm": 0.6337405443191528, "learning_rate": 1.9587178655113636e-05, "loss": 0.4186, "step": 115640 }, { "epoch": 2.57411858974359, "grad_norm": 0.6829156279563904, "learning_rate": 1.9567061304213263e-05, "loss": 0.4903, "step": 115650 }, { "epoch": 2.574341168091168, "grad_norm": 0.597281277179718, "learning_rate": 1.9546953758186536e-05, "loss": 0.3835, "step": 115660 }, { "epoch": 2.5745637464387463, "grad_norm": 0.32157281041145325, "learning_rate": 1.9526856018126028e-05, "loss": 0.3429, "step": 115670 }, { "epoch": 2.5747863247863245, "grad_norm": 0.49477922916412354, "learning_rate": 1.9506768085123904e-05, "loss": 0.5195, "step": 115680 }, { "epoch": 2.575008903133903, "grad_norm": 0.6303307414054871, "learning_rate": 1.9486689960271763e-05, "loss": 0.46, "step": 115690 }, { "epoch": 2.5752314814814814, "grad_norm": 0.59251868724823, "learning_rate": 1.9466621644660664e-05, "loss": 0.5057, "step": 115700 }, { "epoch": 2.5754540598290596, "grad_norm": 0.38228195905685425, "learning_rate": 1.9446563139381135e-05, "loss": 0.4508, "step": 115710 }, { "epoch": 2.5756766381766383, "grad_norm": 0.7206634879112244, "learning_rate": 1.9426514445523168e-05, "loss": 0.5761, "step": 115720 }, { "epoch": 2.5758992165242165, "grad_norm": 0.5861138701438904, "learning_rate": 1.9406475564176276e-05, "loss": 0.4115, "step": 115730 }, { "epoch": 2.5761217948717947, "grad_norm": 0.6063085198402405, "learning_rate": 1.938644649642931e-05, "loss": 0.4398, "step": 115740 }, { "epoch": 2.5763443732193734, "grad_norm": 0.3896350860595703, "learning_rate": 1.936642724337072e-05, "loss": 0.4346, "step": 115750 }, { "epoch": 2.5765669515669516, "grad_norm": 0.3690728545188904, "learning_rate": 1.9346417806088325e-05, "loss": 0.421, "step": 115760 }, { "epoch": 2.57678952991453, "grad_norm": 0.3766835331916809, "learning_rate": 1.932641818566947e-05, "loss": 0.3846, "step": 115770 }, { "epoch": 2.5770121082621085, "grad_norm": 0.6418868899345398, "learning_rate": 1.930642838320096e-05, "loss": 0.5609, "step": 115780 }, { "epoch": 2.5772346866096867, "grad_norm": 0.4302945137023926, "learning_rate": 1.928644839976905e-05, "loss": 0.4284, "step": 115790 }, { "epoch": 2.577457264957265, "grad_norm": 0.4001218378543854, "learning_rate": 1.9266478236459506e-05, "loss": 0.4796, "step": 115800 }, { "epoch": 2.5776798433048436, "grad_norm": 0.4011891782283783, "learning_rate": 1.9246517894357474e-05, "loss": 0.5099, "step": 115810 }, { "epoch": 2.577902421652422, "grad_norm": 0.6663089394569397, "learning_rate": 1.922656737454762e-05, "loss": 0.4387, "step": 115820 }, { "epoch": 2.578125, "grad_norm": 0.534116804599762, "learning_rate": 1.920662667811408e-05, "loss": 0.406, "step": 115830 }, { "epoch": 2.578347578347578, "grad_norm": 0.40283769369125366, "learning_rate": 1.9186695806140433e-05, "loss": 0.474, "step": 115840 }, { "epoch": 2.5785701566951564, "grad_norm": 0.8041699528694153, "learning_rate": 1.9166774759709783e-05, "loss": 0.4056, "step": 115850 }, { "epoch": 2.578792735042735, "grad_norm": 0.6072338819503784, "learning_rate": 1.9146863539904625e-05, "loss": 0.4541, "step": 115860 }, { "epoch": 2.5790153133903133, "grad_norm": 0.41169029474258423, "learning_rate": 1.9126962147806983e-05, "loss": 0.3805, "step": 115870 }, { "epoch": 2.5792378917378915, "grad_norm": 0.5600550770759583, "learning_rate": 1.9107070584498297e-05, "loss": 0.4154, "step": 115880 }, { "epoch": 2.57946047008547, "grad_norm": 0.4978649914264679, "learning_rate": 1.9087188851059423e-05, "loss": 0.4275, "step": 115890 }, { "epoch": 2.5796830484330484, "grad_norm": 0.42050987482070923, "learning_rate": 1.9067316948570825e-05, "loss": 0.3358, "step": 115900 }, { "epoch": 2.5799056267806266, "grad_norm": 0.46916189789772034, "learning_rate": 1.904745487811235e-05, "loss": 0.3698, "step": 115910 }, { "epoch": 2.5801282051282053, "grad_norm": 0.4058810770511627, "learning_rate": 1.902760264076331e-05, "loss": 0.4384, "step": 115920 }, { "epoch": 2.580306267806268, "eval_loss": 0.5236759781837463, "eval_runtime": 337.2759, "eval_samples_per_second": 7.012, "eval_steps_per_second": 7.012, "step": 115928 }, { "epoch": 2.5803507834757835, "grad_norm": 0.7314088940620422, "learning_rate": 1.900776023760249e-05, "loss": 0.4296, "step": 115930 }, { "epoch": 2.5805733618233617, "grad_norm": 0.8279690742492676, "learning_rate": 1.898792766970816e-05, "loss": 0.3954, "step": 115940 }, { "epoch": 2.5807959401709404, "grad_norm": 0.5086784958839417, "learning_rate": 1.8968104938158015e-05, "loss": 0.4982, "step": 115950 }, { "epoch": 2.5810185185185186, "grad_norm": 0.5648263692855835, "learning_rate": 1.8948292044029238e-05, "loss": 0.4381, "step": 115960 }, { "epoch": 2.581241096866097, "grad_norm": 0.7056523561477661, "learning_rate": 1.8928488988398495e-05, "loss": 0.4564, "step": 115970 }, { "epoch": 2.5814636752136755, "grad_norm": 0.46383073925971985, "learning_rate": 1.8908695772341887e-05, "loss": 0.5456, "step": 115980 }, { "epoch": 2.5816862535612537, "grad_norm": 0.5759848356246948, "learning_rate": 1.8888912396935e-05, "loss": 0.4889, "step": 115990 }, { "epoch": 2.581908831908832, "grad_norm": 0.5033251047134399, "learning_rate": 1.886913886325288e-05, "loss": 0.5239, "step": 116000 }, { "epoch": 2.58213141025641, "grad_norm": 0.5119370818138123, "learning_rate": 1.8849375172370064e-05, "loss": 0.3714, "step": 116010 }, { "epoch": 2.5823539886039883, "grad_norm": 0.44290006160736084, "learning_rate": 1.88296213253605e-05, "loss": 0.516, "step": 116020 }, { "epoch": 2.582576566951567, "grad_norm": 0.4895089566707611, "learning_rate": 1.8809877323297576e-05, "loss": 0.4879, "step": 116030 }, { "epoch": 2.5827991452991452, "grad_norm": 0.7203282713890076, "learning_rate": 1.879014316725427e-05, "loss": 0.6404, "step": 116040 }, { "epoch": 2.5830217236467234, "grad_norm": 0.3906666934490204, "learning_rate": 1.877041885830293e-05, "loss": 0.47, "step": 116050 }, { "epoch": 2.583244301994302, "grad_norm": 0.6866021156311035, "learning_rate": 1.875070439751536e-05, "loss": 0.5185, "step": 116060 }, { "epoch": 2.5834668803418803, "grad_norm": 0.7713746428489685, "learning_rate": 1.8730999785962934e-05, "loss": 0.4818, "step": 116070 }, { "epoch": 2.5836894586894585, "grad_norm": 0.5962246060371399, "learning_rate": 1.8711305024716386e-05, "loss": 0.4263, "step": 116080 }, { "epoch": 2.583912037037037, "grad_norm": 0.6574836373329163, "learning_rate": 1.8691620114845888e-05, "loss": 0.3811, "step": 116090 }, { "epoch": 2.5841346153846154, "grad_norm": 0.6990258097648621, "learning_rate": 1.8671945057421203e-05, "loss": 0.4852, "step": 116100 }, { "epoch": 2.5843571937321936, "grad_norm": 0.4760742485523224, "learning_rate": 1.8652279853511435e-05, "loss": 0.4472, "step": 116110 }, { "epoch": 2.5845797720797723, "grad_norm": 0.7832134366035461, "learning_rate": 1.863262450418526e-05, "loss": 0.4873, "step": 116120 }, { "epoch": 2.5848023504273505, "grad_norm": 0.45979663729667664, "learning_rate": 1.8612979010510732e-05, "loss": 0.3146, "step": 116130 }, { "epoch": 2.5850249287749287, "grad_norm": 0.49060124158859253, "learning_rate": 1.8593343373555426e-05, "loss": 0.3877, "step": 116140 }, { "epoch": 2.5852475071225074, "grad_norm": 0.680504560470581, "learning_rate": 1.85737175943864e-05, "loss": 0.4624, "step": 116150 }, { "epoch": 2.5854700854700856, "grad_norm": 0.5428845286369324, "learning_rate": 1.8554101674070058e-05, "loss": 0.5332, "step": 116160 }, { "epoch": 2.585692663817664, "grad_norm": 0.5724543929100037, "learning_rate": 1.853449561367233e-05, "loss": 0.4961, "step": 116170 }, { "epoch": 2.585915242165242, "grad_norm": 0.6779829263687134, "learning_rate": 1.851489941425868e-05, "loss": 0.4369, "step": 116180 }, { "epoch": 2.5861378205128203, "grad_norm": 0.4919106066226959, "learning_rate": 1.849531307689394e-05, "loss": 0.5544, "step": 116190 }, { "epoch": 2.586360398860399, "grad_norm": 0.7075668573379517, "learning_rate": 1.84757366026425e-05, "loss": 0.5148, "step": 116200 }, { "epoch": 2.586582977207977, "grad_norm": 0.4392474293708801, "learning_rate": 1.845616999256814e-05, "loss": 0.4088, "step": 116210 }, { "epoch": 2.5868055555555554, "grad_norm": 0.5524398684501648, "learning_rate": 1.8436613247734136e-05, "loss": 0.3931, "step": 116220 }, { "epoch": 2.587028133903134, "grad_norm": 0.6485047340393066, "learning_rate": 1.8417066369203173e-05, "loss": 0.5104, "step": 116230 }, { "epoch": 2.5872507122507122, "grad_norm": 0.39343759417533875, "learning_rate": 1.839752935803749e-05, "loss": 0.4568, "step": 116240 }, { "epoch": 2.5874732905982905, "grad_norm": 0.5676214694976807, "learning_rate": 1.8378002215298707e-05, "loss": 0.3959, "step": 116250 }, { "epoch": 2.587695868945869, "grad_norm": 0.5716341733932495, "learning_rate": 1.835848494204797e-05, "loss": 0.4073, "step": 116260 }, { "epoch": 2.5879184472934473, "grad_norm": 0.4870680868625641, "learning_rate": 1.833897753934588e-05, "loss": 0.4599, "step": 116270 }, { "epoch": 2.5881410256410255, "grad_norm": 0.5869102478027344, "learning_rate": 1.8319480008252478e-05, "loss": 0.4154, "step": 116280 }, { "epoch": 2.588363603988604, "grad_norm": 0.5533974170684814, "learning_rate": 1.8299992349827246e-05, "loss": 0.441, "step": 116290 }, { "epoch": 2.5885861823361824, "grad_norm": 0.735202431678772, "learning_rate": 1.8280514565129182e-05, "loss": 0.3809, "step": 116300 }, { "epoch": 2.5888087606837606, "grad_norm": 0.6630352735519409, "learning_rate": 1.8261046655216708e-05, "loss": 0.3626, "step": 116310 }, { "epoch": 2.5890313390313393, "grad_norm": 0.549351692199707, "learning_rate": 1.824158862114773e-05, "loss": 0.5333, "step": 116320 }, { "epoch": 2.5892539173789175, "grad_norm": 0.6176263093948364, "learning_rate": 1.822214046397963e-05, "loss": 0.4987, "step": 116330 }, { "epoch": 2.5894764957264957, "grad_norm": 0.48720836639404297, "learning_rate": 1.8202702184769226e-05, "loss": 0.4306, "step": 116340 }, { "epoch": 2.589699074074074, "grad_norm": 0.7484619617462158, "learning_rate": 1.8183273784572808e-05, "loss": 0.4462, "step": 116350 }, { "epoch": 2.589921652421652, "grad_norm": 0.5729781985282898, "learning_rate": 1.816385526444615e-05, "loss": 0.4253, "step": 116360 }, { "epoch": 2.590144230769231, "grad_norm": 0.5132777094841003, "learning_rate": 1.8144446625444457e-05, "loss": 0.3839, "step": 116370 }, { "epoch": 2.590366809116809, "grad_norm": 0.5946417450904846, "learning_rate": 1.812504786862237e-05, "loss": 0.5092, "step": 116380 }, { "epoch": 2.5905893874643873, "grad_norm": 0.6767193078994751, "learning_rate": 1.8105658995034093e-05, "loss": 0.5002, "step": 116390 }, { "epoch": 2.590811965811966, "grad_norm": 0.7960422039031982, "learning_rate": 1.8086280005733202e-05, "loss": 0.4476, "step": 116400 }, { "epoch": 2.591034544159544, "grad_norm": 0.5478686094284058, "learning_rate": 1.8066910901772836e-05, "loss": 0.4791, "step": 116410 }, { "epoch": 2.5912571225071224, "grad_norm": 0.5975253582000732, "learning_rate": 1.80475516842054e-05, "loss": 0.4985, "step": 116420 }, { "epoch": 2.591479700854701, "grad_norm": 0.592153787612915, "learning_rate": 1.802820235408298e-05, "loss": 0.5786, "step": 116430 }, { "epoch": 2.5917022792022792, "grad_norm": 0.28754061460494995, "learning_rate": 1.8008862912457046e-05, "loss": 0.3352, "step": 116440 }, { "epoch": 2.5919248575498575, "grad_norm": 0.4851224422454834, "learning_rate": 1.7989533360378453e-05, "loss": 0.443, "step": 116450 }, { "epoch": 2.592147435897436, "grad_norm": 0.5925357341766357, "learning_rate": 1.797021369889762e-05, "loss": 0.5076, "step": 116460 }, { "epoch": 2.5923700142450143, "grad_norm": 0.5017402172088623, "learning_rate": 1.7950903929064378e-05, "loss": 0.4064, "step": 116470 }, { "epoch": 2.5925925925925926, "grad_norm": 0.5168089270591736, "learning_rate": 1.7931604051928063e-05, "loss": 0.4107, "step": 116480 }, { "epoch": 2.5928151709401708, "grad_norm": 0.6962186098098755, "learning_rate": 1.7912314068537416e-05, "loss": 0.4251, "step": 116490 }, { "epoch": 2.5930377492877494, "grad_norm": 0.3982653021812439, "learning_rate": 1.789303397994073e-05, "loss": 0.4421, "step": 116500 }, { "epoch": 2.5932603276353277, "grad_norm": 0.4287591874599457, "learning_rate": 1.7873763787185614e-05, "loss": 0.4046, "step": 116510 }, { "epoch": 2.593482905982906, "grad_norm": 0.4040102958679199, "learning_rate": 1.7854503491319298e-05, "loss": 0.5503, "step": 116520 }, { "epoch": 2.593705484330484, "grad_norm": 0.46851760149002075, "learning_rate": 1.7835253093388337e-05, "loss": 0.4146, "step": 116530 }, { "epoch": 2.5939280626780628, "grad_norm": 0.5532066822052002, "learning_rate": 1.7816012594438903e-05, "loss": 0.4898, "step": 116540 }, { "epoch": 2.594150641025641, "grad_norm": 0.4478902518749237, "learning_rate": 1.7796781995516464e-05, "loss": 0.5938, "step": 116550 }, { "epoch": 2.594373219373219, "grad_norm": 0.6147230267524719, "learning_rate": 1.7777561297666033e-05, "loss": 0.5559, "step": 116560 }, { "epoch": 2.594595797720798, "grad_norm": 0.398787260055542, "learning_rate": 1.7758350501932086e-05, "loss": 0.424, "step": 116570 }, { "epoch": 2.594818376068376, "grad_norm": 0.6829454898834229, "learning_rate": 1.7739149609358607e-05, "loss": 0.487, "step": 116580 }, { "epoch": 2.5950409544159543, "grad_norm": 0.849534809589386, "learning_rate": 1.771995862098892e-05, "loss": 0.5313, "step": 116590 }, { "epoch": 2.595263532763533, "grad_norm": 0.4778141677379608, "learning_rate": 1.7700777537865897e-05, "loss": 0.3923, "step": 116600 }, { "epoch": 2.595486111111111, "grad_norm": 0.5346719026565552, "learning_rate": 1.7681606361031866e-05, "loss": 0.4585, "step": 116610 }, { "epoch": 2.5957086894586894, "grad_norm": 0.42185789346694946, "learning_rate": 1.766244509152859e-05, "loss": 0.4035, "step": 116620 }, { "epoch": 2.595931267806268, "grad_norm": 0.6329679489135742, "learning_rate": 1.76432937303973e-05, "loss": 0.4382, "step": 116630 }, { "epoch": 2.5961538461538463, "grad_norm": 0.740755558013916, "learning_rate": 1.7624152278678775e-05, "loss": 0.5053, "step": 116640 }, { "epoch": 2.5963764245014245, "grad_norm": 0.4598102569580078, "learning_rate": 1.7605020737413057e-05, "loss": 0.3666, "step": 116650 }, { "epoch": 2.5965990028490027, "grad_norm": 0.370423287153244, "learning_rate": 1.7585899107639837e-05, "loss": 0.4285, "step": 116660 }, { "epoch": 2.5968215811965814, "grad_norm": 0.5081289410591125, "learning_rate": 1.756678739039823e-05, "loss": 0.5261, "step": 116670 }, { "epoch": 2.5970441595441596, "grad_norm": 0.550074577331543, "learning_rate": 1.7547685586726726e-05, "loss": 0.3716, "step": 116680 }, { "epoch": 2.597266737891738, "grad_norm": 0.7760112881660461, "learning_rate": 1.752859369766331e-05, "loss": 0.4597, "step": 116690 }, { "epoch": 2.597489316239316, "grad_norm": 0.5838767886161804, "learning_rate": 1.750951172424551e-05, "loss": 0.4031, "step": 116700 }, { "epoch": 2.5977118945868947, "grad_norm": 0.5418928861618042, "learning_rate": 1.749043966751025e-05, "loss": 0.4342, "step": 116710 }, { "epoch": 2.597934472934473, "grad_norm": 0.6233574748039246, "learning_rate": 1.7471377528493926e-05, "loss": 0.4812, "step": 116720 }, { "epoch": 2.598157051282051, "grad_norm": 0.7421544790267944, "learning_rate": 1.7452325308232332e-05, "loss": 0.4897, "step": 116730 }, { "epoch": 2.5983796296296298, "grad_norm": 0.4924778938293457, "learning_rate": 1.7433283007760836e-05, "loss": 0.2949, "step": 116740 }, { "epoch": 2.598602207977208, "grad_norm": 0.6824201345443726, "learning_rate": 1.7414250628114192e-05, "loss": 0.5032, "step": 116750 }, { "epoch": 2.598824786324786, "grad_norm": 0.49224621057510376, "learning_rate": 1.739522817032664e-05, "loss": 0.4255, "step": 116760 }, { "epoch": 2.599047364672365, "grad_norm": 0.4551924467086792, "learning_rate": 1.737621563543188e-05, "loss": 0.4551, "step": 116770 }, { "epoch": 2.599269943019943, "grad_norm": 0.6858125925064087, "learning_rate": 1.7357213024463093e-05, "loss": 0.5896, "step": 116780 }, { "epoch": 2.5994925213675213, "grad_norm": 0.6564284563064575, "learning_rate": 1.7338220338452825e-05, "loss": 0.4219, "step": 116790 }, { "epoch": 2.5997150997151, "grad_norm": 0.41564351320266724, "learning_rate": 1.7319237578433256e-05, "loss": 0.4131, "step": 116800 }, { "epoch": 2.599937678062678, "grad_norm": 0.559563398361206, "learning_rate": 1.7300264745435824e-05, "loss": 0.3525, "step": 116810 }, { "epoch": 2.6001602564102564, "grad_norm": 0.565727949142456, "learning_rate": 1.7281301840491572e-05, "loss": 0.4309, "step": 116820 }, { "epoch": 2.6003828347578346, "grad_norm": 0.7862516045570374, "learning_rate": 1.7262348864630938e-05, "loss": 0.5738, "step": 116830 }, { "epoch": 2.6006054131054133, "grad_norm": 0.583437442779541, "learning_rate": 1.7243405818883883e-05, "loss": 0.5902, "step": 116840 }, { "epoch": 2.6008279914529915, "grad_norm": 0.5324410796165466, "learning_rate": 1.7224472704279758e-05, "loss": 0.3872, "step": 116850 }, { "epoch": 2.6010505698005697, "grad_norm": 0.5166898965835571, "learning_rate": 1.720554952184745e-05, "loss": 0.4868, "step": 116860 }, { "epoch": 2.601273148148148, "grad_norm": 0.4185858964920044, "learning_rate": 1.7186636272615187e-05, "loss": 0.4315, "step": 116870 }, { "epoch": 2.6014957264957266, "grad_norm": 0.8711520433425903, "learning_rate": 1.7167732957610784e-05, "loss": 0.4937, "step": 116880 }, { "epoch": 2.601718304843305, "grad_norm": 0.6560893058776855, "learning_rate": 1.714883957786142e-05, "loss": 0.4683, "step": 116890 }, { "epoch": 2.601940883190883, "grad_norm": 0.38894492387771606, "learning_rate": 1.7129956134393832e-05, "loss": 0.3793, "step": 116900 }, { "epoch": 2.6021634615384617, "grad_norm": 0.49460241198539734, "learning_rate": 1.711108262823411e-05, "loss": 0.5123, "step": 116910 }, { "epoch": 2.60238603988604, "grad_norm": 0.6922730207443237, "learning_rate": 1.709221906040792e-05, "loss": 0.4265, "step": 116920 }, { "epoch": 2.602608618233618, "grad_norm": 0.5051377415657043, "learning_rate": 1.707336543194027e-05, "loss": 0.4481, "step": 116930 }, { "epoch": 2.6028311965811968, "grad_norm": 0.7844445109367371, "learning_rate": 1.7054521743855666e-05, "loss": 0.4749, "step": 116940 }, { "epoch": 2.603053774928775, "grad_norm": 0.6359633803367615, "learning_rate": 1.7035687997178117e-05, "loss": 0.5296, "step": 116950 }, { "epoch": 2.603276353276353, "grad_norm": 0.62961745262146, "learning_rate": 1.701686419293107e-05, "loss": 0.5121, "step": 116960 }, { "epoch": 2.603498931623932, "grad_norm": 0.7465632557868958, "learning_rate": 1.6998050332137416e-05, "loss": 0.469, "step": 116970 }, { "epoch": 2.60372150997151, "grad_norm": 0.4284519851207733, "learning_rate": 1.6979246415819517e-05, "loss": 0.4509, "step": 116980 }, { "epoch": 2.6039440883190883, "grad_norm": 0.6444926261901855, "learning_rate": 1.6960452444999198e-05, "loss": 0.421, "step": 116990 }, { "epoch": 2.6041666666666665, "grad_norm": 0.54566490650177, "learning_rate": 1.6941668420697775e-05, "loss": 0.3855, "step": 117000 }, { "epoch": 2.604389245014245, "grad_norm": 0.4093930721282959, "learning_rate": 1.6922894343935903e-05, "loss": 0.4589, "step": 117010 }, { "epoch": 2.6046118233618234, "grad_norm": 0.5715624690055847, "learning_rate": 1.6904130215733825e-05, "loss": 0.3965, "step": 117020 }, { "epoch": 2.6048344017094016, "grad_norm": 0.5708182454109192, "learning_rate": 1.68853760371112e-05, "loss": 0.4052, "step": 117030 }, { "epoch": 2.60505698005698, "grad_norm": 0.528289794921875, "learning_rate": 1.6866631809087162e-05, "loss": 0.4247, "step": 117040 }, { "epoch": 2.6052795584045585, "grad_norm": 0.4165605902671814, "learning_rate": 1.6847897532680257e-05, "loss": 0.3851, "step": 117050 }, { "epoch": 2.6055021367521367, "grad_norm": 0.76469886302948, "learning_rate": 1.682917320890858e-05, "loss": 0.4445, "step": 117060 }, { "epoch": 2.605724715099715, "grad_norm": 0.6084073781967163, "learning_rate": 1.6810458838789578e-05, "loss": 0.5383, "step": 117070 }, { "epoch": 2.6059472934472936, "grad_norm": 0.5931957960128784, "learning_rate": 1.6791754423340177e-05, "loss": 0.482, "step": 117080 }, { "epoch": 2.606169871794872, "grad_norm": 0.5516034960746765, "learning_rate": 1.677305996357681e-05, "loss": 0.4666, "step": 117090 }, { "epoch": 2.60639245014245, "grad_norm": 0.39802029728889465, "learning_rate": 1.675437546051537e-05, "loss": 0.3665, "step": 117100 }, { "epoch": 2.6066150284900287, "grad_norm": 0.7053261399269104, "learning_rate": 1.673570091517118e-05, "loss": 0.4161, "step": 117110 }, { "epoch": 2.606837606837607, "grad_norm": 0.5836366415023804, "learning_rate": 1.671703632855901e-05, "loss": 0.4877, "step": 117120 }, { "epoch": 2.607060185185185, "grad_norm": 0.7553135752677917, "learning_rate": 1.6698381701693156e-05, "loss": 0.4377, "step": 117130 }, { "epoch": 2.607282763532764, "grad_norm": 0.5414380431175232, "learning_rate": 1.6679737035587317e-05, "loss": 0.4461, "step": 117140 }, { "epoch": 2.607505341880342, "grad_norm": 0.5662949681282043, "learning_rate": 1.666110233125462e-05, "loss": 0.406, "step": 117150 }, { "epoch": 2.60772792022792, "grad_norm": 0.7300053834915161, "learning_rate": 1.6642477589707695e-05, "loss": 0.4483, "step": 117160 }, { "epoch": 2.6079504985754984, "grad_norm": 0.5964715480804443, "learning_rate": 1.6623862811958646e-05, "loss": 0.4592, "step": 117170 }, { "epoch": 2.6081730769230766, "grad_norm": 0.6311967968940735, "learning_rate": 1.660525799901902e-05, "loss": 0.4315, "step": 117180 }, { "epoch": 2.6083956552706553, "grad_norm": 0.2812231183052063, "learning_rate": 1.6586663151899784e-05, "loss": 0.4067, "step": 117190 }, { "epoch": 2.6086182336182335, "grad_norm": 0.5866037011146545, "learning_rate": 1.6568078271611487e-05, "loss": 0.5272, "step": 117200 }, { "epoch": 2.6088408119658117, "grad_norm": 0.540464460849762, "learning_rate": 1.6549503359163965e-05, "loss": 0.4435, "step": 117210 }, { "epoch": 2.6090633903133904, "grad_norm": 0.811968982219696, "learning_rate": 1.6530938415566566e-05, "loss": 0.5487, "step": 117220 }, { "epoch": 2.6092859686609686, "grad_norm": 0.766233503818512, "learning_rate": 1.6512383441828196e-05, "loss": 0.6048, "step": 117230 }, { "epoch": 2.609508547008547, "grad_norm": 0.9112457633018494, "learning_rate": 1.6493838438957087e-05, "loss": 0.5794, "step": 117240 }, { "epoch": 2.6097311253561255, "grad_norm": 0.7300744652748108, "learning_rate": 1.647530340796104e-05, "loss": 0.4959, "step": 117250 }, { "epoch": 2.6099537037037037, "grad_norm": 0.8856310248374939, "learning_rate": 1.6456778349847245e-05, "loss": 0.5404, "step": 117260 }, { "epoch": 2.610176282051282, "grad_norm": 0.4345667362213135, "learning_rate": 1.643826326562241e-05, "loss": 0.4244, "step": 117270 }, { "epoch": 2.6103988603988606, "grad_norm": 0.5131345391273499, "learning_rate": 1.6419758156292575e-05, "loss": 0.4662, "step": 117280 }, { "epoch": 2.610621438746439, "grad_norm": 0.5059243440628052, "learning_rate": 1.6401263022863378e-05, "loss": 0.3407, "step": 117290 }, { "epoch": 2.610844017094017, "grad_norm": 0.5901126265525818, "learning_rate": 1.638277786633984e-05, "loss": 0.5093, "step": 117300 }, { "epoch": 2.6110665954415957, "grad_norm": 0.4596213400363922, "learning_rate": 1.6364302687726464e-05, "loss": 0.3714, "step": 117310 }, { "epoch": 2.611289173789174, "grad_norm": 0.6311203241348267, "learning_rate": 1.6345837488027228e-05, "loss": 0.4221, "step": 117320 }, { "epoch": 2.611511752136752, "grad_norm": 0.5315381288528442, "learning_rate": 1.632738226824555e-05, "loss": 0.405, "step": 117330 }, { "epoch": 2.6117343304843303, "grad_norm": 0.5763476490974426, "learning_rate": 1.6308937029384254e-05, "loss": 0.3797, "step": 117340 }, { "epoch": 2.6119569088319086, "grad_norm": 0.5535160899162292, "learning_rate": 1.6290501772445732e-05, "loss": 0.4283, "step": 117350 }, { "epoch": 2.6121794871794872, "grad_norm": 0.7270311117172241, "learning_rate": 1.6272076498431676e-05, "loss": 0.4913, "step": 117360 }, { "epoch": 2.6124020655270654, "grad_norm": 0.7906320095062256, "learning_rate": 1.6253661208343417e-05, "loss": 0.4636, "step": 117370 }, { "epoch": 2.6126246438746437, "grad_norm": 0.7179734110832214, "learning_rate": 1.6235255903181623e-05, "loss": 0.4199, "step": 117380 }, { "epoch": 2.6128472222222223, "grad_norm": 0.6319173574447632, "learning_rate": 1.621686058394647e-05, "loss": 0.6118, "step": 117390 }, { "epoch": 2.6130698005698005, "grad_norm": 0.5031944513320923, "learning_rate": 1.6198475251637557e-05, "loss": 0.5795, "step": 117400 }, { "epoch": 2.6132923789173788, "grad_norm": 0.7253941893577576, "learning_rate": 1.6180099907254e-05, "loss": 0.345, "step": 117410 }, { "epoch": 2.6135149572649574, "grad_norm": 0.9181031584739685, "learning_rate": 1.6161734551794238e-05, "loss": 0.4828, "step": 117420 }, { "epoch": 2.6137375356125356, "grad_norm": 0.4423508644104004, "learning_rate": 1.6143379186256346e-05, "loss": 0.5258, "step": 117430 }, { "epoch": 2.613960113960114, "grad_norm": 0.43034684658050537, "learning_rate": 1.6125033811637723e-05, "loss": 0.4249, "step": 117440 }, { "epoch": 2.6141826923076925, "grad_norm": 0.7726715803146362, "learning_rate": 1.6106698428935307e-05, "loss": 0.4994, "step": 117450 }, { "epoch": 2.6144052706552707, "grad_norm": 0.42630285024642944, "learning_rate": 1.6088373039145478e-05, "loss": 0.3627, "step": 117460 }, { "epoch": 2.614627849002849, "grad_norm": 0.576209306716919, "learning_rate": 1.607005764326397e-05, "loss": 0.3649, "step": 117470 }, { "epoch": 2.6148504273504276, "grad_norm": 0.4724714457988739, "learning_rate": 1.60517522422861e-05, "loss": 0.5291, "step": 117480 }, { "epoch": 2.615073005698006, "grad_norm": 0.5985315442085266, "learning_rate": 1.6033456837206628e-05, "loss": 0.4899, "step": 117490 }, { "epoch": 2.615295584045584, "grad_norm": 0.47378864884376526, "learning_rate": 1.6015171429019694e-05, "loss": 0.5148, "step": 117500 }, { "epoch": 2.6155181623931623, "grad_norm": 0.6309303641319275, "learning_rate": 1.5996896018718966e-05, "loss": 0.4023, "step": 117510 }, { "epoch": 2.6157407407407405, "grad_norm": 0.6247247457504272, "learning_rate": 1.597863060729752e-05, "loss": 0.3988, "step": 117520 }, { "epoch": 2.615963319088319, "grad_norm": 0.5093616247177124, "learning_rate": 1.5960375195747958e-05, "loss": 0.5054, "step": 117530 }, { "epoch": 2.6161858974358974, "grad_norm": 0.6017331480979919, "learning_rate": 1.5942129785062242e-05, "loss": 0.4704, "step": 117540 }, { "epoch": 2.6164084757834756, "grad_norm": 0.4164072573184967, "learning_rate": 1.592389437623192e-05, "loss": 0.5016, "step": 117550 }, { "epoch": 2.6166310541310542, "grad_norm": 0.5958421230316162, "learning_rate": 1.5905668970247833e-05, "loss": 0.5233, "step": 117560 }, { "epoch": 2.6168536324786325, "grad_norm": 0.742854118347168, "learning_rate": 1.5887453568100398e-05, "loss": 0.3935, "step": 117570 }, { "epoch": 2.6170762108262107, "grad_norm": 0.42327070236206055, "learning_rate": 1.586924817077946e-05, "loss": 0.4621, "step": 117580 }, { "epoch": 2.6172987891737893, "grad_norm": 0.7418227195739746, "learning_rate": 1.5851052779274343e-05, "loss": 0.434, "step": 117590 }, { "epoch": 2.6175213675213675, "grad_norm": 0.4669528305530548, "learning_rate": 1.5832867394573746e-05, "loss": 0.4765, "step": 117600 }, { "epoch": 2.6177439458689458, "grad_norm": 0.741690993309021, "learning_rate": 1.58146920176659e-05, "loss": 0.4918, "step": 117610 }, { "epoch": 2.6179665242165244, "grad_norm": 0.5707095861434937, "learning_rate": 1.5796526649538455e-05, "loss": 0.4863, "step": 117620 }, { "epoch": 2.6181891025641026, "grad_norm": 0.6767379641532898, "learning_rate": 1.5778371291178606e-05, "loss": 0.558, "step": 117630 }, { "epoch": 2.618411680911681, "grad_norm": 0.5096617341041565, "learning_rate": 1.5760225943572826e-05, "loss": 0.4515, "step": 117640 }, { "epoch": 2.6186342592592595, "grad_norm": 0.60428386926651, "learning_rate": 1.5742090607707195e-05, "loss": 0.5742, "step": 117650 }, { "epoch": 2.6188568376068377, "grad_norm": 0.2778734862804413, "learning_rate": 1.5723965284567188e-05, "loss": 0.4612, "step": 117660 }, { "epoch": 2.619079415954416, "grad_norm": 0.4761873483657837, "learning_rate": 1.5705849975137775e-05, "loss": 0.5468, "step": 117670 }, { "epoch": 2.619301994301994, "grad_norm": 0.5155042409896851, "learning_rate": 1.5687744680403348e-05, "loss": 0.4406, "step": 117680 }, { "epoch": 2.6195245726495724, "grad_norm": 0.5542302131652832, "learning_rate": 1.5669649401347786e-05, "loss": 0.4618, "step": 117690 }, { "epoch": 2.619747150997151, "grad_norm": 0.7806945443153381, "learning_rate": 1.5651564138954345e-05, "loss": 0.4865, "step": 117700 }, { "epoch": 2.6199697293447293, "grad_norm": 0.5195888876914978, "learning_rate": 1.5633488894205817e-05, "loss": 0.4072, "step": 117710 }, { "epoch": 2.6201923076923075, "grad_norm": 0.5302631258964539, "learning_rate": 1.5615423668084483e-05, "loss": 0.4955, "step": 117720 }, { "epoch": 2.620414886039886, "grad_norm": 0.4892368018627167, "learning_rate": 1.5597368461571916e-05, "loss": 0.4808, "step": 117730 }, { "epoch": 2.6206374643874644, "grad_norm": 0.5570464134216309, "learning_rate": 1.5579323275649327e-05, "loss": 0.547, "step": 117740 }, { "epoch": 2.6208600427350426, "grad_norm": 0.5267153978347778, "learning_rate": 1.5561288111297266e-05, "loss": 0.5734, "step": 117750 }, { "epoch": 2.6210826210826212, "grad_norm": 0.41973552107810974, "learning_rate": 1.55432629694958e-05, "loss": 0.354, "step": 117760 }, { "epoch": 2.6213051994301995, "grad_norm": 0.5145451426506042, "learning_rate": 1.5525247851224466e-05, "loss": 0.463, "step": 117770 }, { "epoch": 2.6215277777777777, "grad_norm": 0.5300337672233582, "learning_rate": 1.550724275746216e-05, "loss": 0.4699, "step": 117780 }, { "epoch": 2.6217503561253563, "grad_norm": 0.442150741815567, "learning_rate": 1.5489247689187293e-05, "loss": 0.4428, "step": 117790 }, { "epoch": 2.6219729344729346, "grad_norm": 0.4735947251319885, "learning_rate": 1.5471262647377773e-05, "loss": 0.5018, "step": 117800 }, { "epoch": 2.6221955128205128, "grad_norm": 0.34947624802589417, "learning_rate": 1.5453287633010884e-05, "loss": 0.3744, "step": 117810 }, { "epoch": 2.6224180911680914, "grad_norm": 0.3847612738609314, "learning_rate": 1.5435322647063445e-05, "loss": 0.5047, "step": 117820 }, { "epoch": 2.6226406695156697, "grad_norm": 0.5073989033699036, "learning_rate": 1.5417367690511676e-05, "loss": 0.4404, "step": 117830 }, { "epoch": 2.622863247863248, "grad_norm": 0.6535903215408325, "learning_rate": 1.539942276433124e-05, "loss": 0.3612, "step": 117840 }, { "epoch": 2.623085826210826, "grad_norm": 0.6124248504638672, "learning_rate": 1.5381487869497314e-05, "loss": 0.4105, "step": 117850 }, { "epoch": 2.6233084045584043, "grad_norm": 0.623004674911499, "learning_rate": 1.5363563006984426e-05, "loss": 0.4834, "step": 117860 }, { "epoch": 2.623530982905983, "grad_norm": 0.44189321994781494, "learning_rate": 1.5345648177766692e-05, "loss": 0.5217, "step": 117870 }, { "epoch": 2.623753561253561, "grad_norm": 0.6079039573669434, "learning_rate": 1.5327743382817594e-05, "loss": 0.4193, "step": 117880 }, { "epoch": 2.6239761396011394, "grad_norm": 0.8342279195785522, "learning_rate": 1.5309848623110113e-05, "loss": 0.5486, "step": 117890 }, { "epoch": 2.624198717948718, "grad_norm": 0.5069679021835327, "learning_rate": 1.5291963899616645e-05, "loss": 0.3736, "step": 117900 }, { "epoch": 2.6244212962962963, "grad_norm": 0.5442230701446533, "learning_rate": 1.5274089213309107e-05, "loss": 0.451, "step": 117910 }, { "epoch": 2.6246438746438745, "grad_norm": 0.4876982569694519, "learning_rate": 1.5256224565158738e-05, "loss": 0.4097, "step": 117920 }, { "epoch": 2.624866452991453, "grad_norm": 0.47937461733818054, "learning_rate": 1.5238369956136368e-05, "loss": 0.5374, "step": 117930 }, { "epoch": 2.6250890313390314, "grad_norm": 0.6652742028236389, "learning_rate": 1.5220525387212236e-05, "loss": 0.5051, "step": 117940 }, { "epoch": 2.6253116096866096, "grad_norm": 0.5420684218406677, "learning_rate": 1.5202690859356017e-05, "loss": 0.4197, "step": 117950 }, { "epoch": 2.6255341880341883, "grad_norm": 0.5405448079109192, "learning_rate": 1.5184866373536866e-05, "loss": 0.5455, "step": 117960 }, { "epoch": 2.6257567663817665, "grad_norm": 0.6384806632995605, "learning_rate": 1.5167051930723386e-05, "loss": 0.4563, "step": 117970 }, { "epoch": 2.6259793447293447, "grad_norm": 0.5483444929122925, "learning_rate": 1.5149247531883603e-05, "loss": 0.4079, "step": 117980 }, { "epoch": 2.6262019230769234, "grad_norm": 0.47904133796691895, "learning_rate": 1.5131453177985055e-05, "loss": 0.4591, "step": 117990 }, { "epoch": 2.6264245014245016, "grad_norm": 0.5649526715278625, "learning_rate": 1.5113668869994657e-05, "loss": 0.457, "step": 118000 }, { "epoch": 2.62664707977208, "grad_norm": 0.5389604568481445, "learning_rate": 1.5095894608878835e-05, "loss": 0.4295, "step": 118010 }, { "epoch": 2.626869658119658, "grad_norm": 0.511400043964386, "learning_rate": 1.5078130395603485e-05, "loss": 0.4049, "step": 118020 }, { "epoch": 2.627092236467236, "grad_norm": 0.7150872945785522, "learning_rate": 1.5060376231133899e-05, "loss": 0.4275, "step": 118030 }, { "epoch": 2.627314814814815, "grad_norm": 0.4138595461845398, "learning_rate": 1.5042632116434885e-05, "loss": 0.4917, "step": 118040 }, { "epoch": 2.627537393162393, "grad_norm": 0.40053409337997437, "learning_rate": 1.5024898052470671e-05, "loss": 0.5104, "step": 118050 }, { "epoch": 2.6277599715099713, "grad_norm": 0.5890064239501953, "learning_rate": 1.5007174040204908e-05, "loss": 0.4719, "step": 118060 }, { "epoch": 2.62798254985755, "grad_norm": 0.6815602779388428, "learning_rate": 1.4989460080600736e-05, "loss": 0.3667, "step": 118070 }, { "epoch": 2.628205128205128, "grad_norm": 0.6920440196990967, "learning_rate": 1.4971756174620766e-05, "loss": 0.4127, "step": 118080 }, { "epoch": 2.6284277065527064, "grad_norm": 0.6863391995429993, "learning_rate": 1.4954062323227025e-05, "loss": 0.455, "step": 118090 }, { "epoch": 2.628650284900285, "grad_norm": 0.5050503611564636, "learning_rate": 1.4936378527381034e-05, "loss": 0.4967, "step": 118100 }, { "epoch": 2.6288728632478633, "grad_norm": 0.6623749136924744, "learning_rate": 1.491870478804378e-05, "loss": 0.4824, "step": 118110 }, { "epoch": 2.6290954415954415, "grad_norm": 0.49631795287132263, "learning_rate": 1.4901041106175606e-05, "loss": 0.3832, "step": 118120 }, { "epoch": 2.62931801994302, "grad_norm": 0.46611085534095764, "learning_rate": 1.4883387482736343e-05, "loss": 0.471, "step": 118130 }, { "epoch": 2.6295405982905984, "grad_norm": 0.37215563654899597, "learning_rate": 1.4865743918685382e-05, "loss": 0.5166, "step": 118140 }, { "epoch": 2.6297631766381766, "grad_norm": 0.6707950234413147, "learning_rate": 1.4848110414981419e-05, "loss": 0.5848, "step": 118150 }, { "epoch": 2.629985754985755, "grad_norm": 0.6057896614074707, "learning_rate": 1.4830486972582735e-05, "loss": 0.4721, "step": 118160 }, { "epoch": 2.6302083333333335, "grad_norm": 0.5633417963981628, "learning_rate": 1.4812873592446962e-05, "loss": 0.4018, "step": 118170 }, { "epoch": 2.6304309116809117, "grad_norm": 0.5686827301979065, "learning_rate": 1.4795270275531225e-05, "loss": 0.4613, "step": 118180 }, { "epoch": 2.63065349002849, "grad_norm": 0.5282633304595947, "learning_rate": 1.477767702279218e-05, "loss": 0.3315, "step": 118190 }, { "epoch": 2.630876068376068, "grad_norm": 0.4773818850517273, "learning_rate": 1.476009383518575e-05, "loss": 0.4479, "step": 118200 }, { "epoch": 2.631098646723647, "grad_norm": 0.6412972807884216, "learning_rate": 1.4742520713667462e-05, "loss": 0.5118, "step": 118210 }, { "epoch": 2.631321225071225, "grad_norm": 0.6847419142723083, "learning_rate": 1.4724957659192262e-05, "loss": 0.5176, "step": 118220 }, { "epoch": 2.6315438034188032, "grad_norm": 0.41816556453704834, "learning_rate": 1.470740467271452e-05, "loss": 0.4863, "step": 118230 }, { "epoch": 2.631766381766382, "grad_norm": 0.8198537230491638, "learning_rate": 1.468986175518814e-05, "loss": 0.4454, "step": 118240 }, { "epoch": 2.63198896011396, "grad_norm": 0.41886764764785767, "learning_rate": 1.4672328907566357e-05, "loss": 0.3753, "step": 118250 }, { "epoch": 2.6322115384615383, "grad_norm": 0.6245961785316467, "learning_rate": 1.4654806130801945e-05, "loss": 0.4448, "step": 118260 }, { "epoch": 2.632434116809117, "grad_norm": 0.539581835269928, "learning_rate": 1.4637293425847077e-05, "loss": 0.5647, "step": 118270 }, { "epoch": 2.632656695156695, "grad_norm": 0.5460532307624817, "learning_rate": 1.4619790793653432e-05, "loss": 0.3981, "step": 118280 }, { "epoch": 2.6328792735042734, "grad_norm": 0.6665621399879456, "learning_rate": 1.4602298235172118e-05, "loss": 0.4673, "step": 118290 }, { "epoch": 2.633101851851852, "grad_norm": 0.5957037210464478, "learning_rate": 1.4584815751353687e-05, "loss": 0.3986, "step": 118300 }, { "epoch": 2.6333244301994303, "grad_norm": 0.6215893626213074, "learning_rate": 1.4567343343148154e-05, "loss": 0.5036, "step": 118310 }, { "epoch": 2.6335470085470085, "grad_norm": 0.518753707408905, "learning_rate": 1.4549881011504985e-05, "loss": 0.3934, "step": 118320 }, { "epoch": 2.6337695868945867, "grad_norm": 0.6107826232910156, "learning_rate": 1.4532428757373129e-05, "loss": 0.4698, "step": 118330 }, { "epoch": 2.6339921652421654, "grad_norm": 0.6774536967277527, "learning_rate": 1.4514986581700895e-05, "loss": 0.4642, "step": 118340 }, { "epoch": 2.6342147435897436, "grad_norm": 0.4905300438404083, "learning_rate": 1.4497554485436148e-05, "loss": 0.3734, "step": 118350 }, { "epoch": 2.634437321937322, "grad_norm": 0.36812686920166016, "learning_rate": 1.448013246952613e-05, "loss": 0.3994, "step": 118360 }, { "epoch": 2.6346599002849, "grad_norm": 0.502136766910553, "learning_rate": 1.4462720534917596e-05, "loss": 0.4834, "step": 118370 }, { "epoch": 2.6348824786324787, "grad_norm": 0.646876335144043, "learning_rate": 1.4445318682556741e-05, "loss": 0.4909, "step": 118380 }, { "epoch": 2.635105056980057, "grad_norm": 0.42079877853393555, "learning_rate": 1.4427926913389145e-05, "loss": 0.5332, "step": 118390 }, { "epoch": 2.635327635327635, "grad_norm": 0.43850240111351013, "learning_rate": 1.4410545228359962e-05, "loss": 0.5189, "step": 118400 }, { "epoch": 2.635550213675214, "grad_norm": 0.4544200003147125, "learning_rate": 1.4393173628413636e-05, "loss": 0.3596, "step": 118410 }, { "epoch": 2.635772792022792, "grad_norm": 0.48858514428138733, "learning_rate": 1.4375812114494192e-05, "loss": 0.424, "step": 118420 }, { "epoch": 2.6359953703703702, "grad_norm": 0.6129411458969116, "learning_rate": 1.4358460687545094e-05, "loss": 0.4825, "step": 118430 }, { "epoch": 2.636217948717949, "grad_norm": 0.5403198599815369, "learning_rate": 1.4341119348509191e-05, "loss": 0.5829, "step": 118440 }, { "epoch": 2.636440527065527, "grad_norm": 0.6376327276229858, "learning_rate": 1.4323788098328882e-05, "loss": 0.479, "step": 118450 }, { "epoch": 2.6366631054131053, "grad_norm": 0.5461758971214294, "learning_rate": 1.4306466937945906e-05, "loss": 0.5078, "step": 118460 }, { "epoch": 2.636885683760684, "grad_norm": 0.6305150389671326, "learning_rate": 1.4289155868301574e-05, "loss": 0.3263, "step": 118470 }, { "epoch": 2.637108262108262, "grad_norm": 0.3602633476257324, "learning_rate": 1.4271854890336511e-05, "loss": 0.3087, "step": 118480 }, { "epoch": 2.6373308404558404, "grad_norm": 0.4451531767845154, "learning_rate": 1.42545640049909e-05, "loss": 0.4953, "step": 118490 }, { "epoch": 2.6375534188034186, "grad_norm": 0.4219418168067932, "learning_rate": 1.4237283213204322e-05, "loss": 0.3477, "step": 118500 }, { "epoch": 2.6377759971509973, "grad_norm": 0.6085394024848938, "learning_rate": 1.4220012515915893e-05, "loss": 0.4057, "step": 118510 }, { "epoch": 2.6379985754985755, "grad_norm": 0.42439699172973633, "learning_rate": 1.4202751914064038e-05, "loss": 0.4828, "step": 118520 }, { "epoch": 2.6382211538461537, "grad_norm": 0.6015419363975525, "learning_rate": 1.4185501408586743e-05, "loss": 0.4617, "step": 118530 }, { "epoch": 2.638443732193732, "grad_norm": 0.5368427038192749, "learning_rate": 1.4168261000421434e-05, "loss": 0.4131, "step": 118540 }, { "epoch": 2.6386663105413106, "grad_norm": 0.6470973491668701, "learning_rate": 1.415103069050494e-05, "loss": 0.4663, "step": 118550 }, { "epoch": 2.638888888888889, "grad_norm": 0.4165962040424347, "learning_rate": 1.4133810479773579e-05, "loss": 0.519, "step": 118560 }, { "epoch": 2.639111467236467, "grad_norm": 0.6669519543647766, "learning_rate": 1.4116600369163113e-05, "loss": 0.4566, "step": 118570 }, { "epoch": 2.6393340455840457, "grad_norm": 0.5125649571418762, "learning_rate": 1.409940035960875e-05, "loss": 0.3817, "step": 118580 }, { "epoch": 2.639556623931624, "grad_norm": 0.6859048008918762, "learning_rate": 1.408221045204514e-05, "loss": 0.4928, "step": 118590 }, { "epoch": 2.639779202279202, "grad_norm": 0.6684611439704895, "learning_rate": 1.406503064740643e-05, "loss": 0.5164, "step": 118600 }, { "epoch": 2.640001780626781, "grad_norm": 0.9643305540084839, "learning_rate": 1.40478609466262e-05, "loss": 0.5385, "step": 118610 }, { "epoch": 2.640224358974359, "grad_norm": 0.710292398929596, "learning_rate": 1.403070135063742e-05, "loss": 0.4999, "step": 118620 }, { "epoch": 2.6403133903133904, "eval_loss": 0.5221165418624878, "eval_runtime": 337.3697, "eval_samples_per_second": 7.01, "eval_steps_per_second": 7.01, "step": 118624 }, { "epoch": 2.6404469373219372, "grad_norm": 0.5253585577011108, "learning_rate": 1.4013551860372542e-05, "loss": 0.453, "step": 118630 }, { "epoch": 2.640669515669516, "grad_norm": 0.6748680472373962, "learning_rate": 1.3996412476763555e-05, "loss": 0.523, "step": 118640 }, { "epoch": 2.640892094017094, "grad_norm": 0.4046613872051239, "learning_rate": 1.3979283200741755e-05, "loss": 0.4159, "step": 118650 }, { "epoch": 2.6411146723646723, "grad_norm": 0.5051271319389343, "learning_rate": 1.396216403323798e-05, "loss": 0.5347, "step": 118660 }, { "epoch": 2.6413372507122506, "grad_norm": 0.564983606338501, "learning_rate": 1.3945054975182504e-05, "loss": 0.4794, "step": 118670 }, { "epoch": 2.6415598290598292, "grad_norm": 0.3454379141330719, "learning_rate": 1.3927956027505095e-05, "loss": 0.4743, "step": 118680 }, { "epoch": 2.6417824074074074, "grad_norm": 0.4999922215938568, "learning_rate": 1.3910867191134857e-05, "loss": 0.4711, "step": 118690 }, { "epoch": 2.6420049857549857, "grad_norm": 0.7926509380340576, "learning_rate": 1.3893788467000424e-05, "loss": 0.4454, "step": 118700 }, { "epoch": 2.642227564102564, "grad_norm": 0.6050231456756592, "learning_rate": 1.3876719856029875e-05, "loss": 0.4505, "step": 118710 }, { "epoch": 2.6424501424501425, "grad_norm": 0.4802277088165283, "learning_rate": 1.3859661359150756e-05, "loss": 0.3975, "step": 118720 }, { "epoch": 2.6426727207977208, "grad_norm": 0.6203573942184448, "learning_rate": 1.3842612977289992e-05, "loss": 0.5449, "step": 118730 }, { "epoch": 2.642895299145299, "grad_norm": 0.4395126700401306, "learning_rate": 1.3825574711374067e-05, "loss": 0.339, "step": 118740 }, { "epoch": 2.6431178774928776, "grad_norm": 0.1692124754190445, "learning_rate": 1.3808546562328839e-05, "loss": 0.3725, "step": 118750 }, { "epoch": 2.643340455840456, "grad_norm": 0.5445780158042908, "learning_rate": 1.3791528531079568e-05, "loss": 0.4446, "step": 118760 }, { "epoch": 2.643563034188034, "grad_norm": 0.6528578400611877, "learning_rate": 1.3774520618551112e-05, "loss": 0.4721, "step": 118770 }, { "epoch": 2.6437856125356127, "grad_norm": 0.6830431222915649, "learning_rate": 1.3757522825667646e-05, "loss": 0.3605, "step": 118780 }, { "epoch": 2.644008190883191, "grad_norm": 0.5826247930526733, "learning_rate": 1.3740535153352829e-05, "loss": 0.5418, "step": 118790 }, { "epoch": 2.644230769230769, "grad_norm": 0.5273993611335754, "learning_rate": 1.3723557602529836e-05, "loss": 0.4902, "step": 118800 }, { "epoch": 2.644453347578348, "grad_norm": 0.3481096029281616, "learning_rate": 1.3706590174121193e-05, "loss": 0.4143, "step": 118810 }, { "epoch": 2.644675925925926, "grad_norm": 0.5116106271743774, "learning_rate": 1.3689632869048985e-05, "loss": 0.4321, "step": 118820 }, { "epoch": 2.6448985042735043, "grad_norm": 0.3914758563041687, "learning_rate": 1.367268568823461e-05, "loss": 0.4034, "step": 118830 }, { "epoch": 2.6451210826210825, "grad_norm": 0.6352466940879822, "learning_rate": 1.3655748632599042e-05, "loss": 0.4712, "step": 118840 }, { "epoch": 2.6453436609686607, "grad_norm": 0.4531133472919464, "learning_rate": 1.3638821703062632e-05, "loss": 0.3649, "step": 118850 }, { "epoch": 2.6455662393162394, "grad_norm": 0.8250698447227478, "learning_rate": 1.3621904900545224e-05, "loss": 0.4037, "step": 118860 }, { "epoch": 2.6457888176638176, "grad_norm": 0.6969398856163025, "learning_rate": 1.3604998225966082e-05, "loss": 0.3893, "step": 118870 }, { "epoch": 2.646011396011396, "grad_norm": 0.45444926619529724, "learning_rate": 1.3588101680243936e-05, "loss": 0.4794, "step": 118880 }, { "epoch": 2.6462339743589745, "grad_norm": 0.3588857650756836, "learning_rate": 1.3571215264296944e-05, "loss": 0.4206, "step": 118890 }, { "epoch": 2.6464565527065527, "grad_norm": 0.5009655356407166, "learning_rate": 1.3554338979042746e-05, "loss": 0.5148, "step": 118900 }, { "epoch": 2.646679131054131, "grad_norm": 0.27271145582199097, "learning_rate": 1.3537472825398368e-05, "loss": 0.4099, "step": 118910 }, { "epoch": 2.6469017094017095, "grad_norm": 0.6450316309928894, "learning_rate": 1.3520616804280383e-05, "loss": 0.3745, "step": 118920 }, { "epoch": 2.6471242877492878, "grad_norm": 0.47894465923309326, "learning_rate": 1.3503770916604707e-05, "loss": 0.4188, "step": 118930 }, { "epoch": 2.647346866096866, "grad_norm": 0.47971856594085693, "learning_rate": 1.3486935163286807e-05, "loss": 0.4623, "step": 118940 }, { "epoch": 2.6475694444444446, "grad_norm": 0.5302867293357849, "learning_rate": 1.3470109545241549e-05, "loss": 0.538, "step": 118950 }, { "epoch": 2.647792022792023, "grad_norm": 0.8558449745178223, "learning_rate": 1.3453294063383247e-05, "loss": 0.4335, "step": 118960 }, { "epoch": 2.648014601139601, "grad_norm": 0.7814370393753052, "learning_rate": 1.3436488718625661e-05, "loss": 0.4907, "step": 118970 }, { "epoch": 2.6482371794871797, "grad_norm": 0.35361289978027344, "learning_rate": 1.341969351188197e-05, "loss": 0.4447, "step": 118980 }, { "epoch": 2.648459757834758, "grad_norm": 0.47035202383995056, "learning_rate": 1.340290844406491e-05, "loss": 0.369, "step": 118990 }, { "epoch": 2.648682336182336, "grad_norm": 0.5264750719070435, "learning_rate": 1.338613351608653e-05, "loss": 0.4621, "step": 119000 }, { "epoch": 2.6489049145299144, "grad_norm": 0.656773030757904, "learning_rate": 1.3369368728858434e-05, "loss": 0.4295, "step": 119010 }, { "epoch": 2.6491274928774926, "grad_norm": 0.6832476258277893, "learning_rate": 1.335261408329167e-05, "loss": 0.4045, "step": 119020 }, { "epoch": 2.6493500712250713, "grad_norm": 0.5290317535400391, "learning_rate": 1.3335869580296601e-05, "loss": 0.4378, "step": 119030 }, { "epoch": 2.6495726495726495, "grad_norm": 0.3806043565273285, "learning_rate": 1.331913522078323e-05, "loss": 0.36, "step": 119040 }, { "epoch": 2.6497952279202277, "grad_norm": 0.3884319067001343, "learning_rate": 1.3302411005660853e-05, "loss": 0.4233, "step": 119050 }, { "epoch": 2.6500178062678064, "grad_norm": 0.38250046968460083, "learning_rate": 1.3285696935838276e-05, "loss": 0.3661, "step": 119060 }, { "epoch": 2.6502403846153846, "grad_norm": 0.5914880633354187, "learning_rate": 1.3268993012223795e-05, "loss": 0.3904, "step": 119070 }, { "epoch": 2.650462962962963, "grad_norm": 0.5131521821022034, "learning_rate": 1.3252299235725108e-05, "loss": 0.4551, "step": 119080 }, { "epoch": 2.6506855413105415, "grad_norm": 0.5576712489128113, "learning_rate": 1.3235615607249352e-05, "loss": 0.364, "step": 119090 }, { "epoch": 2.6509081196581197, "grad_norm": 0.7232601046562195, "learning_rate": 1.321894212770316e-05, "loss": 0.4554, "step": 119100 }, { "epoch": 2.651130698005698, "grad_norm": 0.31656399369239807, "learning_rate": 1.3202278797992518e-05, "loss": 0.472, "step": 119110 }, { "epoch": 2.6513532763532766, "grad_norm": 0.574393093585968, "learning_rate": 1.318562561902299e-05, "loss": 0.4564, "step": 119120 }, { "epoch": 2.6515758547008548, "grad_norm": 0.48993220925331116, "learning_rate": 1.3168982591699496e-05, "loss": 0.4432, "step": 119130 }, { "epoch": 2.651798433048433, "grad_norm": 0.6290052533149719, "learning_rate": 1.3152349716926427e-05, "loss": 0.556, "step": 119140 }, { "epoch": 2.6520210113960117, "grad_norm": 0.5945664048194885, "learning_rate": 1.3135726995607634e-05, "loss": 0.5677, "step": 119150 }, { "epoch": 2.65224358974359, "grad_norm": 0.5955460071563721, "learning_rate": 1.311911442864644e-05, "loss": 0.4777, "step": 119160 }, { "epoch": 2.652466168091168, "grad_norm": 0.4934992790222168, "learning_rate": 1.3102512016945522e-05, "loss": 0.466, "step": 119170 }, { "epoch": 2.6526887464387463, "grad_norm": 0.48935940861701965, "learning_rate": 1.3085919761407139e-05, "loss": 0.4081, "step": 119180 }, { "epoch": 2.6529113247863245, "grad_norm": 0.35348719358444214, "learning_rate": 1.3069337662932857e-05, "loss": 0.4904, "step": 119190 }, { "epoch": 2.653133903133903, "grad_norm": 0.6942500472068787, "learning_rate": 1.3052765722423798e-05, "loss": 0.3155, "step": 119200 }, { "epoch": 2.6533564814814814, "grad_norm": 0.6411790251731873, "learning_rate": 1.303620394078049e-05, "loss": 0.4522, "step": 119210 }, { "epoch": 2.6535790598290596, "grad_norm": 0.560257613658905, "learning_rate": 1.301965231890292e-05, "loss": 0.498, "step": 119220 }, { "epoch": 2.6538016381766383, "grad_norm": 0.5010212659835815, "learning_rate": 1.3003110857690504e-05, "loss": 0.3595, "step": 119230 }, { "epoch": 2.6540242165242165, "grad_norm": 0.6719934940338135, "learning_rate": 1.2986579558042166e-05, "loss": 0.466, "step": 119240 }, { "epoch": 2.6542467948717947, "grad_norm": 0.7256160974502563, "learning_rate": 1.2970058420856168e-05, "loss": 0.4716, "step": 119250 }, { "epoch": 2.6544693732193734, "grad_norm": 0.6219938397407532, "learning_rate": 1.29535474470303e-05, "loss": 0.4953, "step": 119260 }, { "epoch": 2.6546919515669516, "grad_norm": 0.7596551775932312, "learning_rate": 1.293704663746178e-05, "loss": 0.3799, "step": 119270 }, { "epoch": 2.65491452991453, "grad_norm": 0.5391525030136108, "learning_rate": 1.2920555993047268e-05, "loss": 0.4662, "step": 119280 }, { "epoch": 2.6551371082621085, "grad_norm": 0.6525141596794128, "learning_rate": 1.2904075514682956e-05, "loss": 0.3562, "step": 119290 }, { "epoch": 2.6553596866096867, "grad_norm": 0.46071699261665344, "learning_rate": 1.2887605203264286e-05, "loss": 0.4846, "step": 119300 }, { "epoch": 2.655582264957265, "grad_norm": 0.5057551860809326, "learning_rate": 1.287114505968634e-05, "loss": 0.3729, "step": 119310 }, { "epoch": 2.6558048433048436, "grad_norm": 0.43811896443367004, "learning_rate": 1.2854695084843604e-05, "loss": 0.3254, "step": 119320 }, { "epoch": 2.656027421652422, "grad_norm": 0.5691989064216614, "learning_rate": 1.2838255279629897e-05, "loss": 0.4391, "step": 119330 }, { "epoch": 2.65625, "grad_norm": 0.41873759031295776, "learning_rate": 1.2821825644938613e-05, "loss": 0.4538, "step": 119340 }, { "epoch": 2.656472578347578, "grad_norm": 0.6020520925521851, "learning_rate": 1.2805406181662549e-05, "loss": 0.4993, "step": 119350 }, { "epoch": 2.6566951566951564, "grad_norm": 0.7073390483856201, "learning_rate": 1.2788996890693972e-05, "loss": 0.46, "step": 119360 }, { "epoch": 2.656917735042735, "grad_norm": 0.4874557554721832, "learning_rate": 1.2772597772924566e-05, "loss": 0.3843, "step": 119370 }, { "epoch": 2.6571403133903133, "grad_norm": 0.5705388784408569, "learning_rate": 1.2756208829245486e-05, "loss": 0.5698, "step": 119380 }, { "epoch": 2.6573628917378915, "grad_norm": 0.6027542352676392, "learning_rate": 1.2739830060547287e-05, "loss": 0.4355, "step": 119390 }, { "epoch": 2.65758547008547, "grad_norm": 0.4651222825050354, "learning_rate": 1.272346146772001e-05, "loss": 0.4487, "step": 119400 }, { "epoch": 2.6578080484330484, "grad_norm": 0.5996605157852173, "learning_rate": 1.2707103051653147e-05, "loss": 0.4809, "step": 119410 }, { "epoch": 2.6580306267806266, "grad_norm": 0.5189327001571655, "learning_rate": 1.2690754813235628e-05, "loss": 0.4816, "step": 119420 }, { "epoch": 2.6582532051282053, "grad_norm": 0.5221971869468689, "learning_rate": 1.2674416753355878e-05, "loss": 0.5121, "step": 119430 }, { "epoch": 2.6584757834757835, "grad_norm": 0.6243450045585632, "learning_rate": 1.265808887290163e-05, "loss": 0.5636, "step": 119440 }, { "epoch": 2.6586983618233617, "grad_norm": 0.48516786098480225, "learning_rate": 1.2641771172760197e-05, "loss": 0.2967, "step": 119450 }, { "epoch": 2.6589209401709404, "grad_norm": 0.5596334338188171, "learning_rate": 1.2625463653818315e-05, "loss": 0.5162, "step": 119460 }, { "epoch": 2.6591435185185186, "grad_norm": 0.8100598454475403, "learning_rate": 1.2609166316962117e-05, "loss": 0.4156, "step": 119470 }, { "epoch": 2.659366096866097, "grad_norm": 0.349343478679657, "learning_rate": 1.259287916307723e-05, "loss": 0.4248, "step": 119480 }, { "epoch": 2.6595886752136755, "grad_norm": 0.39619576930999756, "learning_rate": 1.2576602193048703e-05, "loss": 0.5515, "step": 119490 }, { "epoch": 2.6598112535612537, "grad_norm": 0.4198961555957794, "learning_rate": 1.2560335407761047e-05, "loss": 0.4431, "step": 119500 }, { "epoch": 2.660033831908832, "grad_norm": 0.5467174053192139, "learning_rate": 1.2544078808098203e-05, "loss": 0.3629, "step": 119510 }, { "epoch": 2.66025641025641, "grad_norm": 0.6161094307899475, "learning_rate": 1.2527832394943596e-05, "loss": 0.5464, "step": 119520 }, { "epoch": 2.6604789886039883, "grad_norm": 0.5048872232437134, "learning_rate": 1.2511596169180028e-05, "loss": 0.4542, "step": 119530 }, { "epoch": 2.660701566951567, "grad_norm": 0.48899000883102417, "learning_rate": 1.249537013168982e-05, "loss": 0.4037, "step": 119540 }, { "epoch": 2.6609241452991452, "grad_norm": 0.722078263759613, "learning_rate": 1.2479154283354688e-05, "loss": 0.4348, "step": 119550 }, { "epoch": 2.6611467236467234, "grad_norm": 0.44458404183387756, "learning_rate": 1.246294862505586e-05, "loss": 0.3578, "step": 119560 }, { "epoch": 2.661369301994302, "grad_norm": 0.6896154284477234, "learning_rate": 1.2446753157673896e-05, "loss": 0.4801, "step": 119570 }, { "epoch": 2.6615918803418803, "grad_norm": 0.4716028571128845, "learning_rate": 1.2430567882088895e-05, "loss": 0.6153, "step": 119580 }, { "epoch": 2.6618144586894585, "grad_norm": 0.8424140810966492, "learning_rate": 1.2414392799180396e-05, "loss": 0.5301, "step": 119590 }, { "epoch": 2.662037037037037, "grad_norm": 0.49317121505737305, "learning_rate": 1.2398227909827387e-05, "loss": 0.4794, "step": 119600 }, { "epoch": 2.6622596153846154, "grad_norm": 0.3669206500053406, "learning_rate": 1.2382073214908207e-05, "loss": 0.5194, "step": 119610 }, { "epoch": 2.6624821937321936, "grad_norm": 0.6917450428009033, "learning_rate": 1.2365928715300779e-05, "loss": 0.4911, "step": 119620 }, { "epoch": 2.6627047720797723, "grad_norm": 0.6095952987670898, "learning_rate": 1.2349794411882376e-05, "loss": 0.4598, "step": 119630 }, { "epoch": 2.6629273504273505, "grad_norm": 0.6148474812507629, "learning_rate": 1.2333670305529788e-05, "loss": 0.5021, "step": 119640 }, { "epoch": 2.6631499287749287, "grad_norm": 0.679924488067627, "learning_rate": 1.2317556397119156e-05, "loss": 0.3886, "step": 119650 }, { "epoch": 2.6633725071225074, "grad_norm": 0.3880091905593872, "learning_rate": 1.2301452687526182e-05, "loss": 0.4774, "step": 119660 }, { "epoch": 2.6635950854700856, "grad_norm": 0.6816239953041077, "learning_rate": 1.2285359177625922e-05, "loss": 0.5377, "step": 119670 }, { "epoch": 2.663817663817664, "grad_norm": 0.44697943329811096, "learning_rate": 1.2269275868292896e-05, "loss": 0.4146, "step": 119680 }, { "epoch": 2.664040242165242, "grad_norm": 0.3887006938457489, "learning_rate": 1.2253202760401138e-05, "loss": 0.4018, "step": 119690 }, { "epoch": 2.6642628205128203, "grad_norm": 0.8247577548027039, "learning_rate": 1.2237139854823997e-05, "loss": 0.4313, "step": 119700 }, { "epoch": 2.664485398860399, "grad_norm": 0.6009088158607483, "learning_rate": 1.2221087152434418e-05, "loss": 0.3529, "step": 119710 }, { "epoch": 2.664707977207977, "grad_norm": 0.7377880215644836, "learning_rate": 1.2205044654104657e-05, "loss": 0.4198, "step": 119720 }, { "epoch": 2.6649305555555554, "grad_norm": 0.6384598016738892, "learning_rate": 1.2189012360706508e-05, "loss": 0.4098, "step": 119730 }, { "epoch": 2.665153133903134, "grad_norm": 0.6163082122802734, "learning_rate": 1.2172990273111206e-05, "loss": 0.4649, "step": 119740 }, { "epoch": 2.6653757122507122, "grad_norm": 0.7366446256637573, "learning_rate": 1.2156978392189367e-05, "loss": 0.4807, "step": 119750 }, { "epoch": 2.6655982905982905, "grad_norm": 0.6339846253395081, "learning_rate": 1.214097671881107e-05, "loss": 0.4824, "step": 119760 }, { "epoch": 2.665820868945869, "grad_norm": 0.48082682490348816, "learning_rate": 1.212498525384591e-05, "loss": 0.3937, "step": 119770 }, { "epoch": 2.6660434472934473, "grad_norm": 0.8619961738586426, "learning_rate": 1.2109003998162838e-05, "loss": 0.5142, "step": 119780 }, { "epoch": 2.6662660256410255, "grad_norm": 0.48731106519699097, "learning_rate": 1.2093032952630312e-05, "loss": 0.3976, "step": 119790 }, { "epoch": 2.666488603988604, "grad_norm": 0.5623659491539001, "learning_rate": 1.2077072118116217e-05, "loss": 0.4175, "step": 119800 }, { "epoch": 2.6667111823361824, "grad_norm": 0.592086672782898, "learning_rate": 1.206112149548786e-05, "loss": 0.5092, "step": 119810 }, { "epoch": 2.6669337606837606, "grad_norm": 0.46243971586227417, "learning_rate": 1.2045181085612011e-05, "loss": 0.4151, "step": 119820 }, { "epoch": 2.6671563390313393, "grad_norm": 0.44234499335289, "learning_rate": 1.2029250889354893e-05, "loss": 0.3803, "step": 119830 }, { "epoch": 2.6673789173789175, "grad_norm": 0.34458523988723755, "learning_rate": 1.2013330907582143e-05, "loss": 0.3126, "step": 119840 }, { "epoch": 2.6676014957264957, "grad_norm": 0.4783085286617279, "learning_rate": 1.1997421141158893e-05, "loss": 0.4291, "step": 119850 }, { "epoch": 2.667824074074074, "grad_norm": 0.623274564743042, "learning_rate": 1.1981521590949674e-05, "loss": 0.4811, "step": 119860 }, { "epoch": 2.668046652421652, "grad_norm": 0.7949987053871155, "learning_rate": 1.1965632257818527e-05, "loss": 0.566, "step": 119870 }, { "epoch": 2.668269230769231, "grad_norm": 0.5925506949424744, "learning_rate": 1.1949753142628827e-05, "loss": 0.4659, "step": 119880 }, { "epoch": 2.668491809116809, "grad_norm": 0.4603303074836731, "learning_rate": 1.1933884246243464e-05, "loss": 0.5669, "step": 119890 }, { "epoch": 2.6687143874643873, "grad_norm": 0.5920960307121277, "learning_rate": 1.1918025569524815e-05, "loss": 0.4726, "step": 119900 }, { "epoch": 2.668936965811966, "grad_norm": 0.3981236219406128, "learning_rate": 1.1902177113334634e-05, "loss": 0.3575, "step": 119910 }, { "epoch": 2.669159544159544, "grad_norm": 0.6927867531776428, "learning_rate": 1.188633887853412e-05, "loss": 0.5089, "step": 119920 }, { "epoch": 2.6693821225071224, "grad_norm": 0.43674182891845703, "learning_rate": 1.1870510865983942e-05, "loss": 0.3711, "step": 119930 }, { "epoch": 2.669604700854701, "grad_norm": 0.6672431826591492, "learning_rate": 1.1854693076544254e-05, "loss": 0.5577, "step": 119940 }, { "epoch": 2.6698272792022792, "grad_norm": 0.5630993247032166, "learning_rate": 1.1838885511074549e-05, "loss": 0.4317, "step": 119950 }, { "epoch": 2.6700498575498575, "grad_norm": 0.5109251141548157, "learning_rate": 1.1823088170433828e-05, "loss": 0.5359, "step": 119960 }, { "epoch": 2.670272435897436, "grad_norm": 0.6072834730148315, "learning_rate": 1.1807301055480557e-05, "loss": 0.4943, "step": 119970 }, { "epoch": 2.6704950142450143, "grad_norm": 0.6109597682952881, "learning_rate": 1.1791524167072588e-05, "loss": 0.3994, "step": 119980 }, { "epoch": 2.6707175925925926, "grad_norm": 0.7994768023490906, "learning_rate": 1.1775757506067275e-05, "loss": 0.4038, "step": 119990 }, { "epoch": 2.6709401709401708, "grad_norm": 0.47031915187835693, "learning_rate": 1.1760001073321381e-05, "loss": 0.3227, "step": 120000 }, { "epoch": 2.6711627492877494, "grad_norm": 0.8149853348731995, "learning_rate": 1.1744254869691173e-05, "loss": 0.464, "step": 120010 }, { "epoch": 2.6713853276353277, "grad_norm": 1.0185492038726807, "learning_rate": 1.1728518896032215e-05, "loss": 0.4302, "step": 120020 }, { "epoch": 2.671607905982906, "grad_norm": 0.38112345337867737, "learning_rate": 1.1712793153199686e-05, "loss": 0.412, "step": 120030 }, { "epoch": 2.671830484330484, "grad_norm": 0.7294852137565613, "learning_rate": 1.1697077642048127e-05, "loss": 0.5178, "step": 120040 }, { "epoch": 2.6720530626780628, "grad_norm": 0.7971472144126892, "learning_rate": 1.1681372363431498e-05, "loss": 0.5041, "step": 120050 }, { "epoch": 2.672275641025641, "grad_norm": 0.5222724080085754, "learning_rate": 1.1665677318203273e-05, "loss": 0.4679, "step": 120060 }, { "epoch": 2.672498219373219, "grad_norm": 0.3848299980163574, "learning_rate": 1.16499925072163e-05, "loss": 0.5168, "step": 120070 }, { "epoch": 2.672720797720798, "grad_norm": 0.573528528213501, "learning_rate": 1.1634317931322969e-05, "loss": 0.4596, "step": 120080 }, { "epoch": 2.672943376068376, "grad_norm": 0.7404229044914246, "learning_rate": 1.1618653591375016e-05, "loss": 0.5278, "step": 120090 }, { "epoch": 2.6731659544159543, "grad_norm": 0.7535010576248169, "learning_rate": 1.1602999488223609e-05, "loss": 0.5029, "step": 120100 }, { "epoch": 2.673388532763533, "grad_norm": 0.529496967792511, "learning_rate": 1.1587355622719421e-05, "loss": 0.4024, "step": 120110 }, { "epoch": 2.673611111111111, "grad_norm": 0.44174572825431824, "learning_rate": 1.1571721995712592e-05, "loss": 0.3879, "step": 120120 }, { "epoch": 2.6738336894586894, "grad_norm": 0.7702245116233826, "learning_rate": 1.1556098608052624e-05, "loss": 0.4502, "step": 120130 }, { "epoch": 2.674056267806268, "grad_norm": 0.5648883581161499, "learning_rate": 1.1540485460588546e-05, "loss": 0.485, "step": 120140 }, { "epoch": 2.6742788461538463, "grad_norm": 0.4519728422164917, "learning_rate": 1.152488255416877e-05, "loss": 0.4139, "step": 120150 }, { "epoch": 2.6745014245014245, "grad_norm": 0.43161413073539734, "learning_rate": 1.1509289889641173e-05, "loss": 0.3445, "step": 120160 }, { "epoch": 2.6747240028490027, "grad_norm": 0.5353063941001892, "learning_rate": 1.1493707467853053e-05, "loss": 0.5018, "step": 120170 }, { "epoch": 2.6749465811965814, "grad_norm": 0.46572011709213257, "learning_rate": 1.1478135289651182e-05, "loss": 0.4157, "step": 120180 }, { "epoch": 2.6751691595441596, "grad_norm": 0.45290809869766235, "learning_rate": 1.1462573355881767e-05, "loss": 0.4243, "step": 120190 }, { "epoch": 2.675391737891738, "grad_norm": 0.5023192763328552, "learning_rate": 1.1447021667390468e-05, "loss": 0.4561, "step": 120200 }, { "epoch": 2.675614316239316, "grad_norm": 0.4624986946582794, "learning_rate": 1.1431480225022406e-05, "loss": 0.3579, "step": 120210 }, { "epoch": 2.6758368945868947, "grad_norm": 0.5679937601089478, "learning_rate": 1.1415949029622042e-05, "loss": 0.4312, "step": 120220 }, { "epoch": 2.676059472934473, "grad_norm": 0.475293904542923, "learning_rate": 1.140042808203341e-05, "loss": 0.4307, "step": 120230 }, { "epoch": 2.676282051282051, "grad_norm": 0.641948938369751, "learning_rate": 1.1384917383099902e-05, "loss": 0.4208, "step": 120240 }, { "epoch": 2.6765046296296298, "grad_norm": 0.5572640299797058, "learning_rate": 1.1369416933664379e-05, "loss": 0.5328, "step": 120250 }, { "epoch": 2.676727207977208, "grad_norm": 0.48038625717163086, "learning_rate": 1.1353926734569143e-05, "loss": 0.4274, "step": 120260 }, { "epoch": 2.676949786324786, "grad_norm": 0.6205059885978699, "learning_rate": 1.1338446786655987e-05, "loss": 0.4661, "step": 120270 }, { "epoch": 2.677172364672365, "grad_norm": 0.4956248700618744, "learning_rate": 1.1322977090766062e-05, "loss": 0.4248, "step": 120280 }, { "epoch": 2.677394943019943, "grad_norm": 0.7365691661834717, "learning_rate": 1.1307517647740052e-05, "loss": 0.4274, "step": 120290 }, { "epoch": 2.6776175213675213, "grad_norm": 0.520764946937561, "learning_rate": 1.1292068458417993e-05, "loss": 0.3888, "step": 120300 }, { "epoch": 2.6778400997151, "grad_norm": 0.4983401298522949, "learning_rate": 1.1276629523639392e-05, "loss": 0.4346, "step": 120310 }, { "epoch": 2.678062678062678, "grad_norm": 0.51678466796875, "learning_rate": 1.1261200844243247e-05, "loss": 0.4706, "step": 120320 }, { "epoch": 2.6782852564102564, "grad_norm": 0.6123687028884888, "learning_rate": 1.1245782421067951e-05, "loss": 0.3839, "step": 120330 }, { "epoch": 2.6785078347578346, "grad_norm": 0.6919697523117065, "learning_rate": 1.1230374254951415e-05, "loss": 0.5442, "step": 120340 }, { "epoch": 2.6787304131054133, "grad_norm": 0.46390897035598755, "learning_rate": 1.121497634673081e-05, "loss": 0.5094, "step": 120350 }, { "epoch": 2.6789529914529915, "grad_norm": 0.5452059507369995, "learning_rate": 1.1199588697242957e-05, "loss": 0.4072, "step": 120360 }, { "epoch": 2.6791755698005697, "grad_norm": 0.4743301570415497, "learning_rate": 1.1184211307324055e-05, "loss": 0.3753, "step": 120370 }, { "epoch": 2.679398148148148, "grad_norm": 0.6292110085487366, "learning_rate": 1.1168844177809635e-05, "loss": 0.5162, "step": 120380 }, { "epoch": 2.6796207264957266, "grad_norm": 0.5056225061416626, "learning_rate": 1.1153487309534804e-05, "loss": 0.4024, "step": 120390 }, { "epoch": 2.679843304843305, "grad_norm": 0.5967031121253967, "learning_rate": 1.1138140703334078e-05, "loss": 0.4496, "step": 120400 }, { "epoch": 2.680065883190883, "grad_norm": 0.7600603699684143, "learning_rate": 1.1122804360041406e-05, "loss": 0.4953, "step": 120410 }, { "epoch": 2.6802884615384617, "grad_norm": 0.5037348866462708, "learning_rate": 1.1107478280490147e-05, "loss": 0.4603, "step": 120420 }, { "epoch": 2.68051103988604, "grad_norm": 0.46349087357521057, "learning_rate": 1.109216246551319e-05, "loss": 0.434, "step": 120430 }, { "epoch": 2.680733618233618, "grad_norm": 0.38922572135925293, "learning_rate": 1.1076856915942757e-05, "loss": 0.3545, "step": 120440 }, { "epoch": 2.6809561965811968, "grad_norm": 0.6287493705749512, "learning_rate": 1.1061561632610562e-05, "loss": 0.4892, "step": 120450 }, { "epoch": 2.681178774928775, "grad_norm": 0.5363261699676514, "learning_rate": 1.1046276616347784e-05, "loss": 0.4329, "step": 120460 }, { "epoch": 2.681401353276353, "grad_norm": 0.7095743417739868, "learning_rate": 1.1031001867985003e-05, "loss": 0.5604, "step": 120470 }, { "epoch": 2.681623931623932, "grad_norm": 0.5759169459342957, "learning_rate": 1.1015737388352333e-05, "loss": 0.4218, "step": 120480 }, { "epoch": 2.68184650997151, "grad_norm": 0.5130805969238281, "learning_rate": 1.1000483178279152e-05, "loss": 0.4957, "step": 120490 }, { "epoch": 2.6820690883190883, "grad_norm": 0.5419637560844421, "learning_rate": 1.0985239238594447e-05, "loss": 0.3262, "step": 120500 }, { "epoch": 2.6822916666666665, "grad_norm": 0.45975545048713684, "learning_rate": 1.0970005570126618e-05, "loss": 0.4493, "step": 120510 }, { "epoch": 2.682514245014245, "grad_norm": 0.5595857501029968, "learning_rate": 1.0954782173703404e-05, "loss": 0.5724, "step": 120520 }, { "epoch": 2.6827368233618234, "grad_norm": 0.5037173628807068, "learning_rate": 1.0939569050152055e-05, "loss": 0.4328, "step": 120530 }, { "epoch": 2.6829594017094016, "grad_norm": 0.6380975246429443, "learning_rate": 1.092436620029933e-05, "loss": 0.4615, "step": 120540 }, { "epoch": 2.68318198005698, "grad_norm": 0.6233899593353271, "learning_rate": 1.0909173624971325e-05, "loss": 0.4491, "step": 120550 }, { "epoch": 2.6834045584045585, "grad_norm": 0.6080841422080994, "learning_rate": 1.0893991324993602e-05, "loss": 0.5008, "step": 120560 }, { "epoch": 2.6836271367521367, "grad_norm": 0.5344551801681519, "learning_rate": 1.0878819301191235e-05, "loss": 0.4807, "step": 120570 }, { "epoch": 2.683849715099715, "grad_norm": 0.39450156688690186, "learning_rate": 1.0863657554388629e-05, "loss": 0.3421, "step": 120580 }, { "epoch": 2.6840722934472936, "grad_norm": 0.615376353263855, "learning_rate": 1.0848506085409704e-05, "loss": 0.5475, "step": 120590 }, { "epoch": 2.684294871794872, "grad_norm": 0.4130651354789734, "learning_rate": 1.0833364895077801e-05, "loss": 0.3752, "step": 120600 }, { "epoch": 2.68451745014245, "grad_norm": 0.6055234670639038, "learning_rate": 1.0818233984215753e-05, "loss": 0.5713, "step": 120610 }, { "epoch": 2.6847400284900287, "grad_norm": 0.41650620102882385, "learning_rate": 1.08031133536457e-05, "loss": 0.3, "step": 120620 }, { "epoch": 2.684962606837607, "grad_norm": 0.5312291979789734, "learning_rate": 1.078800300418934e-05, "loss": 0.4901, "step": 120630 }, { "epoch": 2.685185185185185, "grad_norm": 0.5251030325889587, "learning_rate": 1.0772902936667817e-05, "loss": 0.311, "step": 120640 }, { "epoch": 2.685407763532764, "grad_norm": 0.37884873151779175, "learning_rate": 1.0757813151901652e-05, "loss": 0.4589, "step": 120650 }, { "epoch": 2.685630341880342, "grad_norm": 0.64932781457901, "learning_rate": 1.0742733650710834e-05, "loss": 0.4904, "step": 120660 }, { "epoch": 2.68585292022792, "grad_norm": 0.5982466340065002, "learning_rate": 1.0727664433914818e-05, "loss": 0.4441, "step": 120670 }, { "epoch": 2.6860754985754984, "grad_norm": 0.5319939255714417, "learning_rate": 1.071260550233244e-05, "loss": 0.3659, "step": 120680 }, { "epoch": 2.6862980769230766, "grad_norm": 0.5466464757919312, "learning_rate": 1.0697556856782043e-05, "loss": 0.5018, "step": 120690 }, { "epoch": 2.6865206552706553, "grad_norm": 0.5915857553482056, "learning_rate": 1.0682518498081373e-05, "loss": 0.4001, "step": 120700 }, { "epoch": 2.6867432336182335, "grad_norm": 0.630190372467041, "learning_rate": 1.0667490427047666e-05, "loss": 0.4807, "step": 120710 }, { "epoch": 2.6869658119658117, "grad_norm": 0.5770119428634644, "learning_rate": 1.0652472644497492e-05, "loss": 0.4412, "step": 120720 }, { "epoch": 2.6871883903133904, "grad_norm": 0.4673517942428589, "learning_rate": 1.063746515124695e-05, "loss": 0.5254, "step": 120730 }, { "epoch": 2.6874109686609686, "grad_norm": 0.6391720771789551, "learning_rate": 1.0622467948111613e-05, "loss": 0.5048, "step": 120740 }, { "epoch": 2.687633547008547, "grad_norm": 0.49109727144241333, "learning_rate": 1.0607481035906387e-05, "loss": 0.4711, "step": 120750 }, { "epoch": 2.6878561253561255, "grad_norm": 0.714725136756897, "learning_rate": 1.0592504415445659e-05, "loss": 0.4323, "step": 120760 }, { "epoch": 2.6880787037037037, "grad_norm": 0.4830571413040161, "learning_rate": 1.0577538087543293e-05, "loss": 0.4638, "step": 120770 }, { "epoch": 2.688301282051282, "grad_norm": 0.4310127794742584, "learning_rate": 1.0562582053012592e-05, "loss": 0.3241, "step": 120780 }, { "epoch": 2.6885238603988606, "grad_norm": 0.557829737663269, "learning_rate": 1.0547636312666287e-05, "loss": 0.5086, "step": 120790 }, { "epoch": 2.688746438746439, "grad_norm": 0.4715104401111603, "learning_rate": 1.0532700867316503e-05, "loss": 0.4818, "step": 120800 }, { "epoch": 2.688969017094017, "grad_norm": 0.6122713685035706, "learning_rate": 1.0517775717774836e-05, "loss": 0.4238, "step": 120810 }, { "epoch": 2.6891915954415957, "grad_norm": 0.3812810778617859, "learning_rate": 1.050286086485237e-05, "loss": 0.4984, "step": 120820 }, { "epoch": 2.689414173789174, "grad_norm": 0.5915753841400146, "learning_rate": 1.048795630935957e-05, "loss": 0.4732, "step": 120830 }, { "epoch": 2.689636752136752, "grad_norm": 0.49963366985321045, "learning_rate": 1.0473062052106364e-05, "loss": 0.5449, "step": 120840 }, { "epoch": 2.6898593304843303, "grad_norm": 0.7948408126831055, "learning_rate": 1.045817809390215e-05, "loss": 0.4657, "step": 120850 }, { "epoch": 2.6900819088319086, "grad_norm": 0.587867021560669, "learning_rate": 1.0443304435555656e-05, "loss": 0.5226, "step": 120860 }, { "epoch": 2.6903044871794872, "grad_norm": 0.3600319027900696, "learning_rate": 1.0428441077875239e-05, "loss": 0.483, "step": 120870 }, { "epoch": 2.6905270655270654, "grad_norm": 0.3752707540988922, "learning_rate": 1.0413588021668475e-05, "loss": 0.3287, "step": 120880 }, { "epoch": 2.6907496438746437, "grad_norm": 0.5882196426391602, "learning_rate": 1.0398745267742538e-05, "loss": 0.4309, "step": 120890 }, { "epoch": 2.6909722222222223, "grad_norm": 0.49909356236457825, "learning_rate": 1.0383912816904007e-05, "loss": 0.3353, "step": 120900 }, { "epoch": 2.6911948005698005, "grad_norm": 0.6589367389678955, "learning_rate": 1.0369090669958881e-05, "loss": 0.5086, "step": 120910 }, { "epoch": 2.6914173789173788, "grad_norm": 0.4251459538936615, "learning_rate": 1.0354278827712605e-05, "loss": 0.4085, "step": 120920 }, { "epoch": 2.6916399572649574, "grad_norm": 0.6321707367897034, "learning_rate": 1.0339477290970112e-05, "loss": 0.4632, "step": 120930 }, { "epoch": 2.6918625356125356, "grad_norm": 0.8624523878097534, "learning_rate": 1.0324686060535649e-05, "loss": 0.4292, "step": 120940 }, { "epoch": 2.692085113960114, "grad_norm": 0.547972559928894, "learning_rate": 1.0309905137213017e-05, "loss": 0.3422, "step": 120950 }, { "epoch": 2.6923076923076925, "grad_norm": 0.889786958694458, "learning_rate": 1.0295134521805417e-05, "loss": 0.4889, "step": 120960 }, { "epoch": 2.6925302706552707, "grad_norm": 0.5785326361656189, "learning_rate": 1.0280374215115518e-05, "loss": 0.4494, "step": 120970 }, { "epoch": 2.692752849002849, "grad_norm": 0.5202192664146423, "learning_rate": 1.0265624217945392e-05, "loss": 0.457, "step": 120980 }, { "epoch": 2.6929754273504276, "grad_norm": 0.4780712425708771, "learning_rate": 1.0250884531096593e-05, "loss": 0.4192, "step": 120990 }, { "epoch": 2.693198005698006, "grad_norm": 0.9215281009674072, "learning_rate": 1.0236155155370087e-05, "loss": 0.5269, "step": 121000 }, { "epoch": 2.693420584045584, "grad_norm": 0.5440164804458618, "learning_rate": 1.0221436091566205e-05, "loss": 0.422, "step": 121010 }, { "epoch": 2.6936431623931623, "grad_norm": 0.5080778002738953, "learning_rate": 1.0206727340484845e-05, "loss": 0.5341, "step": 121020 }, { "epoch": 2.6938657407407405, "grad_norm": 0.45328041911125183, "learning_rate": 1.019202890292532e-05, "loss": 0.4234, "step": 121030 }, { "epoch": 2.694088319088319, "grad_norm": 0.7312338948249817, "learning_rate": 1.0177340779686306e-05, "loss": 0.4812, "step": 121040 }, { "epoch": 2.6943108974358974, "grad_norm": 0.6298159956932068, "learning_rate": 1.0162662971565984e-05, "loss": 0.538, "step": 121050 }, { "epoch": 2.6945334757834756, "grad_norm": 0.520063579082489, "learning_rate": 1.0147995479361982e-05, "loss": 0.5705, "step": 121060 }, { "epoch": 2.6947560541310542, "grad_norm": 0.42589840292930603, "learning_rate": 1.0133338303871354e-05, "loss": 0.4456, "step": 121070 }, { "epoch": 2.6949786324786325, "grad_norm": 0.38978275656700134, "learning_rate": 1.0118691445890504e-05, "loss": 0.4925, "step": 121080 }, { "epoch": 2.6952012108262107, "grad_norm": 0.31580010056495667, "learning_rate": 1.0104054906215422e-05, "loss": 0.3951, "step": 121090 }, { "epoch": 2.6954237891737893, "grad_norm": 0.6156284213066101, "learning_rate": 1.0089428685641467e-05, "loss": 0.413, "step": 121100 }, { "epoch": 2.6956463675213675, "grad_norm": 0.5855969786643982, "learning_rate": 1.0074812784963406e-05, "loss": 0.5138, "step": 121110 }, { "epoch": 2.6958689458689458, "grad_norm": 0.6864350438117981, "learning_rate": 1.0060207204975492e-05, "loss": 0.5307, "step": 121120 }, { "epoch": 2.6960915242165244, "grad_norm": 0.795219898223877, "learning_rate": 1.0045611946471467e-05, "loss": 0.4583, "step": 121130 }, { "epoch": 2.6963141025641026, "grad_norm": 0.6658293008804321, "learning_rate": 1.0031027010244365e-05, "loss": 0.4645, "step": 121140 }, { "epoch": 2.696536680911681, "grad_norm": 0.5443158745765686, "learning_rate": 1.001645239708675e-05, "loss": 0.4279, "step": 121150 }, { "epoch": 2.6967592592592595, "grad_norm": 0.5901051163673401, "learning_rate": 1.0001888107790635e-05, "loss": 0.4095, "step": 121160 }, { "epoch": 2.6969818376068377, "grad_norm": 0.4878370463848114, "learning_rate": 9.987334143147475e-06, "loss": 0.5041, "step": 121170 }, { "epoch": 2.697204415954416, "grad_norm": 0.47627171874046326, "learning_rate": 9.972790503948127e-06, "loss": 0.4282, "step": 121180 }, { "epoch": 2.697426994301994, "grad_norm": 0.6253020167350769, "learning_rate": 9.958257190982889e-06, "loss": 0.4362, "step": 121190 }, { "epoch": 2.6976495726495724, "grad_norm": 0.6432648301124573, "learning_rate": 9.943734205041554e-06, "loss": 0.6053, "step": 121200 }, { "epoch": 2.697872150997151, "grad_norm": 0.5386398434638977, "learning_rate": 9.929221546913293e-06, "loss": 0.4857, "step": 121210 }, { "epoch": 2.6980947293447293, "grad_norm": 0.5033283829689026, "learning_rate": 9.914719217386714e-06, "loss": 0.3696, "step": 121220 }, { "epoch": 2.6983173076923075, "grad_norm": 0.39687106013298035, "learning_rate": 9.900227217249925e-06, "loss": 0.3999, "step": 121230 }, { "epoch": 2.698539886039886, "grad_norm": 0.6018307209014893, "learning_rate": 9.885745547290382e-06, "loss": 0.501, "step": 121240 }, { "epoch": 2.6987624643874644, "grad_norm": 0.6582804918289185, "learning_rate": 9.871274208295079e-06, "loss": 0.4494, "step": 121250 }, { "epoch": 2.6989850427350426, "grad_norm": 0.6227666735649109, "learning_rate": 9.856813201050408e-06, "loss": 0.4289, "step": 121260 }, { "epoch": 2.6992076210826212, "grad_norm": 0.5360127687454224, "learning_rate": 9.84236252634212e-06, "loss": 0.3291, "step": 121270 }, { "epoch": 2.6994301994301995, "grad_norm": 0.6764222383499146, "learning_rate": 9.827922184955563e-06, "loss": 0.5577, "step": 121280 }, { "epoch": 2.6996527777777777, "grad_norm": 0.5015178918838501, "learning_rate": 9.813492177675376e-06, "loss": 0.5306, "step": 121290 }, { "epoch": 2.6998753561253563, "grad_norm": 0.4428309202194214, "learning_rate": 9.799072505285711e-06, "loss": 0.4774, "step": 121300 }, { "epoch": 2.7000979344729346, "grad_norm": 0.59559565782547, "learning_rate": 9.784663168570163e-06, "loss": 0.4743, "step": 121310 }, { "epoch": 2.7003205128205128, "grad_norm": 0.6105912923812866, "learning_rate": 9.770264168311705e-06, "loss": 0.4392, "step": 121320 }, { "epoch": 2.7003205128205128, "eval_loss": 0.5214730501174927, "eval_runtime": 337.0066, "eval_samples_per_second": 7.018, "eval_steps_per_second": 7.018, "step": 121320 }, { "epoch": 2.7005430911680914, "grad_norm": 0.5224626660346985, "learning_rate": 9.755875505292843e-06, "loss": 0.4719, "step": 121330 }, { "epoch": 2.7007656695156697, "grad_norm": 0.546825110912323, "learning_rate": 9.741497180295445e-06, "loss": 0.4225, "step": 121340 }, { "epoch": 2.700988247863248, "grad_norm": 0.7341123819351196, "learning_rate": 9.727129194100881e-06, "loss": 0.4578, "step": 121350 }, { "epoch": 2.701210826210826, "grad_norm": 0.4602246880531311, "learning_rate": 9.71277154748984e-06, "loss": 0.365, "step": 121360 }, { "epoch": 2.7014334045584043, "grad_norm": 0.5582806468009949, "learning_rate": 9.698424241242587e-06, "loss": 0.5116, "step": 121370 }, { "epoch": 2.701655982905983, "grad_norm": 0.6341031789779663, "learning_rate": 9.68408727613872e-06, "loss": 0.4161, "step": 121380 }, { "epoch": 2.701878561253561, "grad_norm": 0.6534328460693359, "learning_rate": 9.669760652957393e-06, "loss": 0.4649, "step": 121390 }, { "epoch": 2.7021011396011394, "grad_norm": 0.4892594814300537, "learning_rate": 9.655444372477073e-06, "loss": 0.5891, "step": 121400 }, { "epoch": 2.702323717948718, "grad_norm": 0.6224949359893799, "learning_rate": 9.641138435475693e-06, "loss": 0.4574, "step": 121410 }, { "epoch": 2.7025462962962963, "grad_norm": 0.2790619432926178, "learning_rate": 9.626842842730744e-06, "loss": 0.3769, "step": 121420 }, { "epoch": 2.7027688746438745, "grad_norm": 0.36917656660079956, "learning_rate": 9.612557595018955e-06, "loss": 0.4212, "step": 121430 }, { "epoch": 2.702991452991453, "grad_norm": 0.4125138819217682, "learning_rate": 9.598282693116668e-06, "loss": 0.3707, "step": 121440 }, { "epoch": 2.7032140313390314, "grad_norm": 0.5624163746833801, "learning_rate": 9.584018137799544e-06, "loss": 0.4378, "step": 121450 }, { "epoch": 2.7034366096866096, "grad_norm": 0.6187195181846619, "learning_rate": 9.569763929842767e-06, "loss": 0.2982, "step": 121460 }, { "epoch": 2.7036591880341883, "grad_norm": 0.6008877158164978, "learning_rate": 9.555520070020919e-06, "loss": 0.4625, "step": 121470 }, { "epoch": 2.7038817663817665, "grad_norm": 0.5377522110939026, "learning_rate": 9.54128655910802e-06, "loss": 0.4653, "step": 121480 }, { "epoch": 2.7041043447293447, "grad_norm": 0.5690780282020569, "learning_rate": 9.527063397877523e-06, "loss": 0.388, "step": 121490 }, { "epoch": 2.7043269230769234, "grad_norm": 0.4713515043258667, "learning_rate": 9.512850587102317e-06, "loss": 0.4918, "step": 121500 }, { "epoch": 2.7045495014245016, "grad_norm": 0.5271637439727783, "learning_rate": 9.498648127554765e-06, "loss": 0.3648, "step": 121510 }, { "epoch": 2.70477207977208, "grad_norm": 0.604483962059021, "learning_rate": 9.484456020006627e-06, "loss": 0.5279, "step": 121520 }, { "epoch": 2.704994658119658, "grad_norm": 0.7051653861999512, "learning_rate": 9.470274265229106e-06, "loss": 0.4251, "step": 121530 }, { "epoch": 2.705217236467236, "grad_norm": 0.3371043801307678, "learning_rate": 9.456102863992855e-06, "loss": 0.428, "step": 121540 }, { "epoch": 2.705439814814815, "grad_norm": 0.3929579257965088, "learning_rate": 9.441941817067944e-06, "loss": 0.3709, "step": 121550 }, { "epoch": 2.705662393162393, "grad_norm": 0.5030765533447266, "learning_rate": 9.427791125223962e-06, "loss": 0.4252, "step": 121560 }, { "epoch": 2.7058849715099713, "grad_norm": 0.5232594609260559, "learning_rate": 9.413650789229778e-06, "loss": 0.4406, "step": 121570 }, { "epoch": 2.70610754985755, "grad_norm": 0.4874250888824463, "learning_rate": 9.399520809853824e-06, "loss": 0.5244, "step": 121580 }, { "epoch": 2.706330128205128, "grad_norm": 0.7548015117645264, "learning_rate": 9.385401187863952e-06, "loss": 0.4662, "step": 121590 }, { "epoch": 2.7065527065527064, "grad_norm": 0.6102350950241089, "learning_rate": 9.371291924027437e-06, "loss": 0.4242, "step": 121600 }, { "epoch": 2.706775284900285, "grad_norm": 0.3742469847202301, "learning_rate": 9.357193019110956e-06, "loss": 0.3777, "step": 121610 }, { "epoch": 2.7069978632478633, "grad_norm": 0.44158703088760376, "learning_rate": 9.343104473880715e-06, "loss": 0.3878, "step": 121620 }, { "epoch": 2.7072204415954415, "grad_norm": 0.5411059260368347, "learning_rate": 9.329026289102216e-06, "loss": 0.5364, "step": 121630 }, { "epoch": 2.70744301994302, "grad_norm": 0.6011901497840881, "learning_rate": 9.314958465540514e-06, "loss": 0.4326, "step": 121640 }, { "epoch": 2.7076655982905984, "grad_norm": 0.814118504524231, "learning_rate": 9.300901003960083e-06, "loss": 0.4772, "step": 121650 }, { "epoch": 2.7078881766381766, "grad_norm": 0.3137904703617096, "learning_rate": 9.286853905124825e-06, "loss": 0.4548, "step": 121660 }, { "epoch": 2.708110754985755, "grad_norm": 0.3962802588939667, "learning_rate": 9.272817169798043e-06, "loss": 0.4031, "step": 121670 }, { "epoch": 2.7083333333333335, "grad_norm": 0.6601922512054443, "learning_rate": 9.2587907987425e-06, "loss": 0.4737, "step": 121680 }, { "epoch": 2.7085559116809117, "grad_norm": 0.536861777305603, "learning_rate": 9.244774792720413e-06, "loss": 0.5034, "step": 121690 }, { "epoch": 2.70877849002849, "grad_norm": 0.6117525100708008, "learning_rate": 9.230769152493435e-06, "loss": 0.4652, "step": 121700 }, { "epoch": 2.709001068376068, "grad_norm": 0.5301530361175537, "learning_rate": 9.216773878822626e-06, "loss": 0.4876, "step": 121710 }, { "epoch": 2.709223646723647, "grad_norm": 0.6408038139343262, "learning_rate": 9.202788972468491e-06, "loss": 0.5645, "step": 121720 }, { "epoch": 2.709446225071225, "grad_norm": 0.7939885258674622, "learning_rate": 9.188814434191017e-06, "loss": 0.5038, "step": 121730 }, { "epoch": 2.7096688034188032, "grad_norm": 0.5108567476272583, "learning_rate": 9.174850264749557e-06, "loss": 0.4415, "step": 121740 }, { "epoch": 2.709891381766382, "grad_norm": 0.6715599894523621, "learning_rate": 9.160896464902945e-06, "loss": 0.4724, "step": 121750 }, { "epoch": 2.71011396011396, "grad_norm": 0.4106365740299225, "learning_rate": 9.14695303540949e-06, "loss": 0.2988, "step": 121760 }, { "epoch": 2.7103365384615383, "grad_norm": 0.7415056824684143, "learning_rate": 9.133019977026825e-06, "loss": 0.5563, "step": 121770 }, { "epoch": 2.710559116809117, "grad_norm": 0.4635258615016937, "learning_rate": 9.11909729051208e-06, "loss": 0.4499, "step": 121780 }, { "epoch": 2.710781695156695, "grad_norm": 0.4406909942626953, "learning_rate": 9.105184976621895e-06, "loss": 0.4234, "step": 121790 }, { "epoch": 2.7110042735042734, "grad_norm": 0.42780137062072754, "learning_rate": 9.091283036112197e-06, "loss": 0.4446, "step": 121800 }, { "epoch": 2.711226851851852, "grad_norm": 0.7454397678375244, "learning_rate": 9.077391469738471e-06, "loss": 0.4549, "step": 121810 }, { "epoch": 2.7114494301994303, "grad_norm": 0.39332252740859985, "learning_rate": 9.063510278255583e-06, "loss": 0.4143, "step": 121820 }, { "epoch": 2.7116720085470085, "grad_norm": 0.49680182337760925, "learning_rate": 9.049639462417858e-06, "loss": 0.398, "step": 121830 }, { "epoch": 2.7118945868945867, "grad_norm": 0.3992108404636383, "learning_rate": 9.035779022979074e-06, "loss": 0.4048, "step": 121840 }, { "epoch": 2.7121171652421654, "grad_norm": 0.5084406733512878, "learning_rate": 9.02192896069236e-06, "loss": 0.4337, "step": 121850 }, { "epoch": 2.7123397435897436, "grad_norm": 0.43044665455818176, "learning_rate": 9.008089276310361e-06, "loss": 0.4056, "step": 121860 }, { "epoch": 2.712562321937322, "grad_norm": 0.5675276517868042, "learning_rate": 8.99425997058514e-06, "loss": 0.4977, "step": 121870 }, { "epoch": 2.7127849002849, "grad_norm": 0.44733113050460815, "learning_rate": 8.980441044268207e-06, "loss": 0.4105, "step": 121880 }, { "epoch": 2.7130074786324787, "grad_norm": 0.38512033224105835, "learning_rate": 8.966632498110494e-06, "loss": 0.3537, "step": 121890 }, { "epoch": 2.713230056980057, "grad_norm": 0.48589053750038147, "learning_rate": 8.952834332862381e-06, "loss": 0.4738, "step": 121900 }, { "epoch": 2.713452635327635, "grad_norm": 0.3985794186592102, "learning_rate": 8.9390465492736e-06, "loss": 0.3923, "step": 121910 }, { "epoch": 2.713675213675214, "grad_norm": 0.7381534576416016, "learning_rate": 8.925269148093485e-06, "loss": 0.5137, "step": 121920 }, { "epoch": 2.713897792022792, "grad_norm": 0.7137097120285034, "learning_rate": 8.911502130070637e-06, "loss": 0.5386, "step": 121930 }, { "epoch": 2.7141203703703702, "grad_norm": 0.6199268698692322, "learning_rate": 8.89774549595319e-06, "loss": 0.5787, "step": 121940 }, { "epoch": 2.714342948717949, "grad_norm": 0.6272393465042114, "learning_rate": 8.883999246488706e-06, "loss": 0.4451, "step": 121950 }, { "epoch": 2.714565527065527, "grad_norm": 0.38012218475341797, "learning_rate": 8.870263382424138e-06, "loss": 0.4138, "step": 121960 }, { "epoch": 2.7147881054131053, "grad_norm": 0.4783298969268799, "learning_rate": 8.856537904505935e-06, "loss": 0.5385, "step": 121970 }, { "epoch": 2.715010683760684, "grad_norm": 0.8279988765716553, "learning_rate": 8.842822813479968e-06, "loss": 0.4848, "step": 121980 }, { "epoch": 2.715233262108262, "grad_norm": 0.5451609492301941, "learning_rate": 8.829118110091484e-06, "loss": 0.4799, "step": 121990 }, { "epoch": 2.7154558404558404, "grad_norm": 0.37781020998954773, "learning_rate": 8.815423795085197e-06, "loss": 0.4188, "step": 122000 }, { "epoch": 2.7156784188034186, "grad_norm": 0.46710386872291565, "learning_rate": 8.801739869205316e-06, "loss": 0.5301, "step": 122010 }, { "epoch": 2.7159009971509973, "grad_norm": 0.5526740550994873, "learning_rate": 8.788066333195399e-06, "loss": 0.446, "step": 122020 }, { "epoch": 2.7161235754985755, "grad_norm": 0.5683900713920593, "learning_rate": 8.774403187798497e-06, "loss": 0.3673, "step": 122030 }, { "epoch": 2.7163461538461537, "grad_norm": 0.6068999767303467, "learning_rate": 8.76075043375708e-06, "loss": 0.4733, "step": 122040 }, { "epoch": 2.716568732193732, "grad_norm": 0.6357635855674744, "learning_rate": 8.747108071813025e-06, "loss": 0.4225, "step": 122050 }, { "epoch": 2.7167913105413106, "grad_norm": 0.4891846179962158, "learning_rate": 8.733476102707717e-06, "loss": 0.5168, "step": 122060 }, { "epoch": 2.717013888888889, "grad_norm": 0.6161572933197021, "learning_rate": 8.71985452718187e-06, "loss": 0.355, "step": 122070 }, { "epoch": 2.717236467236467, "grad_norm": 0.5957919359207153, "learning_rate": 8.706243345975695e-06, "loss": 0.4717, "step": 122080 }, { "epoch": 2.7174590455840457, "grad_norm": 0.6445571184158325, "learning_rate": 8.69264255982889e-06, "loss": 0.5461, "step": 122090 }, { "epoch": 2.717681623931624, "grad_norm": 0.35034939646720886, "learning_rate": 8.679052169480485e-06, "loss": 0.431, "step": 122100 }, { "epoch": 2.717904202279202, "grad_norm": 0.4366070330142975, "learning_rate": 8.665472175668997e-06, "loss": 0.4499, "step": 122110 }, { "epoch": 2.718126780626781, "grad_norm": 0.5147125124931335, "learning_rate": 8.651902579132421e-06, "loss": 0.4197, "step": 122120 }, { "epoch": 2.718349358974359, "grad_norm": 0.6924682855606079, "learning_rate": 8.63834338060807e-06, "loss": 0.4195, "step": 122130 }, { "epoch": 2.7185719373219372, "grad_norm": 0.304049015045166, "learning_rate": 8.624794580832807e-06, "loss": 0.4128, "step": 122140 }, { "epoch": 2.718794515669516, "grad_norm": 0.5142202377319336, "learning_rate": 8.61125618054286e-06, "loss": 0.4986, "step": 122150 }, { "epoch": 2.719017094017094, "grad_norm": 0.4628162384033203, "learning_rate": 8.597728180473951e-06, "loss": 0.4266, "step": 122160 }, { "epoch": 2.7192396723646723, "grad_norm": 0.6526134014129639, "learning_rate": 8.584210581361162e-06, "loss": 0.4388, "step": 122170 }, { "epoch": 2.7194622507122506, "grad_norm": 0.650456428527832, "learning_rate": 8.570703383939105e-06, "loss": 0.4422, "step": 122180 }, { "epoch": 2.7196848290598292, "grad_norm": 0.6294580698013306, "learning_rate": 8.557206588941701e-06, "loss": 0.4221, "step": 122190 }, { "epoch": 2.7199074074074074, "grad_norm": 0.3640579283237457, "learning_rate": 8.543720197102457e-06, "loss": 0.4043, "step": 122200 }, { "epoch": 2.7201299857549857, "grad_norm": 0.754707396030426, "learning_rate": 8.53024420915416e-06, "loss": 0.4449, "step": 122210 }, { "epoch": 2.720352564102564, "grad_norm": 0.6340663433074951, "learning_rate": 8.516778625829135e-06, "loss": 0.4252, "step": 122220 }, { "epoch": 2.7205751424501425, "grad_norm": 0.610907793045044, "learning_rate": 8.503323447859113e-06, "loss": 0.4931, "step": 122230 }, { "epoch": 2.7207977207977208, "grad_norm": 0.7051807641983032, "learning_rate": 8.48987867597526e-06, "loss": 0.5408, "step": 122240 }, { "epoch": 2.721020299145299, "grad_norm": 0.5701954364776611, "learning_rate": 8.47644431090817e-06, "loss": 0.4381, "step": 122250 }, { "epoch": 2.7212428774928776, "grad_norm": 0.4436449110507965, "learning_rate": 8.463020353387929e-06, "loss": 0.4001, "step": 122260 }, { "epoch": 2.721465455840456, "grad_norm": 0.5930556058883667, "learning_rate": 8.449606804143928e-06, "loss": 0.4776, "step": 122270 }, { "epoch": 2.721688034188034, "grad_norm": 0.611627995967865, "learning_rate": 8.436203663905095e-06, "loss": 0.4434, "step": 122280 }, { "epoch": 2.7219106125356127, "grad_norm": 0.5522210001945496, "learning_rate": 8.422810933399783e-06, "loss": 0.5635, "step": 122290 }, { "epoch": 2.722133190883191, "grad_norm": 0.5032176375389099, "learning_rate": 8.409428613355764e-06, "loss": 0.4678, "step": 122300 }, { "epoch": 2.722355769230769, "grad_norm": 0.47020357847213745, "learning_rate": 8.396056704500254e-06, "loss": 0.4779, "step": 122310 }, { "epoch": 2.722578347578348, "grad_norm": 0.6725762486457825, "learning_rate": 8.382695207559854e-06, "loss": 0.4695, "step": 122320 }, { "epoch": 2.722800925925926, "grad_norm": 0.7783358693122864, "learning_rate": 8.36934412326067e-06, "loss": 0.4296, "step": 122330 }, { "epoch": 2.7230235042735043, "grad_norm": 0.519523024559021, "learning_rate": 8.356003452328209e-06, "loss": 0.5414, "step": 122340 }, { "epoch": 2.7232460826210825, "grad_norm": 0.5169113278388977, "learning_rate": 8.342673195487383e-06, "loss": 0.4473, "step": 122350 }, { "epoch": 2.7234686609686607, "grad_norm": 0.8073785305023193, "learning_rate": 8.32935335346261e-06, "loss": 0.4466, "step": 122360 }, { "epoch": 2.7236912393162394, "grad_norm": 0.6161701083183289, "learning_rate": 8.316043926977667e-06, "loss": 0.4347, "step": 122370 }, { "epoch": 2.7239138176638176, "grad_norm": 0.5894777774810791, "learning_rate": 8.302744916755822e-06, "loss": 0.4344, "step": 122380 }, { "epoch": 2.724136396011396, "grad_norm": 0.5426009297370911, "learning_rate": 8.289456323519762e-06, "loss": 0.4598, "step": 122390 }, { "epoch": 2.7243589743589745, "grad_norm": 0.5296372771263123, "learning_rate": 8.276178147991598e-06, "loss": 0.4751, "step": 122400 }, { "epoch": 2.7245815527065527, "grad_norm": 0.643884003162384, "learning_rate": 8.262910390892863e-06, "loss": 0.4724, "step": 122410 }, { "epoch": 2.724804131054131, "grad_norm": 0.42206260561943054, "learning_rate": 8.249653052944517e-06, "loss": 0.4427, "step": 122420 }, { "epoch": 2.7250267094017095, "grad_norm": 0.5802809000015259, "learning_rate": 8.236406134867003e-06, "loss": 0.3634, "step": 122430 }, { "epoch": 2.7252492877492878, "grad_norm": 0.45060160756111145, "learning_rate": 8.22316963738019e-06, "loss": 0.4823, "step": 122440 }, { "epoch": 2.725471866096866, "grad_norm": 0.9987406134605408, "learning_rate": 8.209943561203326e-06, "loss": 0.491, "step": 122450 }, { "epoch": 2.7256944444444446, "grad_norm": 0.5410594344139099, "learning_rate": 8.196727907055124e-06, "loss": 0.5319, "step": 122460 }, { "epoch": 2.725917022792023, "grad_norm": 0.7260173559188843, "learning_rate": 8.183522675653764e-06, "loss": 0.5818, "step": 122470 }, { "epoch": 2.726139601139601, "grad_norm": 0.5966166853904724, "learning_rate": 8.170327867716788e-06, "loss": 0.4377, "step": 122480 }, { "epoch": 2.7263621794871797, "grad_norm": 0.5492916107177734, "learning_rate": 8.157143483961239e-06, "loss": 0.4088, "step": 122490 }, { "epoch": 2.726584757834758, "grad_norm": 0.4295927584171295, "learning_rate": 8.143969525103544e-06, "loss": 0.47, "step": 122500 }, { "epoch": 2.726807336182336, "grad_norm": 0.37031441926956177, "learning_rate": 8.130805991859625e-06, "loss": 0.4653, "step": 122510 }, { "epoch": 2.7270299145299144, "grad_norm": 0.7110081315040588, "learning_rate": 8.11765288494477e-06, "loss": 0.4348, "step": 122520 }, { "epoch": 2.7272524928774926, "grad_norm": 0.6578905582427979, "learning_rate": 8.104510205073745e-06, "loss": 0.4397, "step": 122530 }, { "epoch": 2.7274750712250713, "grad_norm": 0.6855700016021729, "learning_rate": 8.091377952960755e-06, "loss": 0.4916, "step": 122540 }, { "epoch": 2.7276976495726495, "grad_norm": 0.3897780179977417, "learning_rate": 8.078256129319383e-06, "loss": 0.4268, "step": 122550 }, { "epoch": 2.7279202279202277, "grad_norm": 0.42931514978408813, "learning_rate": 8.065144734862661e-06, "loss": 0.3575, "step": 122560 }, { "epoch": 2.7281428062678064, "grad_norm": 0.6458748579025269, "learning_rate": 8.05204377030313e-06, "loss": 0.4615, "step": 122570 }, { "epoch": 2.7283653846153846, "grad_norm": 0.8001543879508972, "learning_rate": 8.038953236352664e-06, "loss": 0.565, "step": 122580 }, { "epoch": 2.728587962962963, "grad_norm": 0.3280816972255707, "learning_rate": 8.025873133722606e-06, "loss": 0.4612, "step": 122590 }, { "epoch": 2.7288105413105415, "grad_norm": 0.4333963394165039, "learning_rate": 8.012803463123764e-06, "loss": 0.4722, "step": 122600 }, { "epoch": 2.7290331196581197, "grad_norm": 0.5036003589630127, "learning_rate": 7.999744225266392e-06, "loss": 0.3942, "step": 122610 }, { "epoch": 2.729255698005698, "grad_norm": 0.7043850421905518, "learning_rate": 7.986695420860057e-06, "loss": 0.4804, "step": 122620 }, { "epoch": 2.7294782763532766, "grad_norm": 0.5668345093727112, "learning_rate": 7.973657050613881e-06, "loss": 0.4459, "step": 122630 }, { "epoch": 2.7297008547008548, "grad_norm": 0.5528721213340759, "learning_rate": 7.960629115236384e-06, "loss": 0.4946, "step": 122640 }, { "epoch": 2.729923433048433, "grad_norm": 0.8780606985092163, "learning_rate": 7.947611615435513e-06, "loss": 0.4543, "step": 122650 }, { "epoch": 2.7301460113960117, "grad_norm": 0.3899797201156616, "learning_rate": 7.934604551918657e-06, "loss": 0.4836, "step": 122660 }, { "epoch": 2.73036858974359, "grad_norm": 0.5518772602081299, "learning_rate": 7.921607925392605e-06, "loss": 0.4694, "step": 122670 }, { "epoch": 2.730591168091168, "grad_norm": 0.6687607169151306, "learning_rate": 7.908621736563659e-06, "loss": 0.4681, "step": 122680 }, { "epoch": 2.7308137464387463, "grad_norm": 0.4973803162574768, "learning_rate": 7.895645986137434e-06, "loss": 0.5181, "step": 122690 }, { "epoch": 2.7310363247863245, "grad_norm": 0.5222212076187134, "learning_rate": 7.882680674819054e-06, "loss": 0.4345, "step": 122700 }, { "epoch": 2.731258903133903, "grad_norm": 1.0800514221191406, "learning_rate": 7.86972580331311e-06, "loss": 0.5682, "step": 122710 }, { "epoch": 2.7314814814814814, "grad_norm": 0.4344554543495178, "learning_rate": 7.856781372323551e-06, "loss": 0.391, "step": 122720 }, { "epoch": 2.7317040598290596, "grad_norm": 0.8073518872261047, "learning_rate": 7.84384738255377e-06, "loss": 0.45, "step": 122730 }, { "epoch": 2.7319266381766383, "grad_norm": 0.5675332546234131, "learning_rate": 7.830923834706627e-06, "loss": 0.4147, "step": 122740 }, { "epoch": 2.7321492165242165, "grad_norm": 0.4464460611343384, "learning_rate": 7.818010729484426e-06, "loss": 0.5413, "step": 122750 }, { "epoch": 2.7323717948717947, "grad_norm": 0.5830329060554504, "learning_rate": 7.805108067588829e-06, "loss": 0.3051, "step": 122760 }, { "epoch": 2.7325943732193734, "grad_norm": 0.6713405251502991, "learning_rate": 7.792215849720985e-06, "loss": 0.5106, "step": 122770 }, { "epoch": 2.7328169515669516, "grad_norm": 0.6539282202720642, "learning_rate": 7.77933407658149e-06, "loss": 0.5021, "step": 122780 }, { "epoch": 2.73303952991453, "grad_norm": 0.5176177620887756, "learning_rate": 7.76646274887034e-06, "loss": 0.3564, "step": 122790 }, { "epoch": 2.7332621082621085, "grad_norm": 0.6762197613716125, "learning_rate": 7.753601867286975e-06, "loss": 0.3903, "step": 122800 }, { "epoch": 2.7334846866096867, "grad_norm": 0.6328567862510681, "learning_rate": 7.74075143253028e-06, "loss": 0.4979, "step": 122810 }, { "epoch": 2.733707264957265, "grad_norm": 0.5372480154037476, "learning_rate": 7.727911445298542e-06, "loss": 0.4814, "step": 122820 }, { "epoch": 2.7339298433048436, "grad_norm": 0.5866361856460571, "learning_rate": 7.71508190628949e-06, "loss": 0.4676, "step": 122830 }, { "epoch": 2.734152421652422, "grad_norm": 0.5748997330665588, "learning_rate": 7.702262816200323e-06, "loss": 0.4107, "step": 122840 }, { "epoch": 2.734375, "grad_norm": 0.7153517603874207, "learning_rate": 7.689454175727573e-06, "loss": 0.455, "step": 122850 }, { "epoch": 2.734597578347578, "grad_norm": 0.9207581281661987, "learning_rate": 7.676655985567326e-06, "loss": 0.4165, "step": 122860 }, { "epoch": 2.7348201566951564, "grad_norm": 0.3741627037525177, "learning_rate": 7.663868246415051e-06, "loss": 0.4885, "step": 122870 }, { "epoch": 2.735042735042735, "grad_norm": 0.5004806518554688, "learning_rate": 7.651090958965612e-06, "loss": 0.4983, "step": 122880 }, { "epoch": 2.7352653133903133, "grad_norm": 0.7157542109489441, "learning_rate": 7.638324123913387e-06, "loss": 0.5098, "step": 122890 }, { "epoch": 2.7354878917378915, "grad_norm": 0.5806912779808044, "learning_rate": 7.625567741952067e-06, "loss": 0.3866, "step": 122900 }, { "epoch": 2.73571047008547, "grad_norm": 0.7059860229492188, "learning_rate": 7.612821813774895e-06, "loss": 0.444, "step": 122910 }, { "epoch": 2.7359330484330484, "grad_norm": 0.6227195858955383, "learning_rate": 7.600086340074475e-06, "loss": 0.4417, "step": 122920 }, { "epoch": 2.7361556267806266, "grad_norm": 0.5309995412826538, "learning_rate": 7.587361321542874e-06, "loss": 0.4916, "step": 122930 }, { "epoch": 2.7363782051282053, "grad_norm": 0.3757227063179016, "learning_rate": 7.574646758871562e-06, "loss": 0.3326, "step": 122940 }, { "epoch": 2.7366007834757835, "grad_norm": 0.5011971592903137, "learning_rate": 7.561942652751475e-06, "loss": 0.4943, "step": 122950 }, { "epoch": 2.7368233618233617, "grad_norm": 0.5790525674819946, "learning_rate": 7.549249003872993e-06, "loss": 0.4616, "step": 122960 }, { "epoch": 2.7370459401709404, "grad_norm": 0.502332329750061, "learning_rate": 7.536565812925877e-06, "loss": 0.5342, "step": 122970 }, { "epoch": 2.7372685185185186, "grad_norm": 0.5204288363456726, "learning_rate": 7.523893080599287e-06, "loss": 0.3748, "step": 122980 }, { "epoch": 2.737491096866097, "grad_norm": 0.717951238155365, "learning_rate": 7.511230807581937e-06, "loss": 0.4289, "step": 122990 }, { "epoch": 2.7377136752136755, "grad_norm": 0.5042845606803894, "learning_rate": 7.498578994561878e-06, "loss": 0.4215, "step": 123000 }, { "epoch": 2.7379362535612537, "grad_norm": 0.7990171313285828, "learning_rate": 7.485937642226604e-06, "loss": 0.5998, "step": 123010 }, { "epoch": 2.738158831908832, "grad_norm": 1.1951872110366821, "learning_rate": 7.473306751263098e-06, "loss": 0.5114, "step": 123020 }, { "epoch": 2.73838141025641, "grad_norm": 0.42899465560913086, "learning_rate": 7.460686322357724e-06, "loss": 0.3831, "step": 123030 }, { "epoch": 2.7386039886039883, "grad_norm": 0.6046960353851318, "learning_rate": 7.448076356196265e-06, "loss": 0.4278, "step": 123040 }, { "epoch": 2.738826566951567, "grad_norm": 0.42047208547592163, "learning_rate": 7.435476853463974e-06, "loss": 0.4256, "step": 123050 }, { "epoch": 2.7390491452991452, "grad_norm": 0.46061971783638, "learning_rate": 7.422887814845481e-06, "loss": 0.4766, "step": 123060 }, { "epoch": 2.7392717236467234, "grad_norm": 0.6004669070243835, "learning_rate": 7.4103092410249485e-06, "loss": 0.4347, "step": 123070 }, { "epoch": 2.739494301994302, "grad_norm": 0.6010010242462158, "learning_rate": 7.397741132685854e-06, "loss": 0.5443, "step": 123080 }, { "epoch": 2.7397168803418803, "grad_norm": 0.4315003454685211, "learning_rate": 7.385183490511183e-06, "loss": 0.4179, "step": 123090 }, { "epoch": 2.7399394586894585, "grad_norm": 0.5793805122375488, "learning_rate": 7.3726363151833454e-06, "loss": 0.5219, "step": 123100 }, { "epoch": 2.740162037037037, "grad_norm": 0.5742940902709961, "learning_rate": 7.360099607384152e-06, "loss": 0.3331, "step": 123110 }, { "epoch": 2.7403846153846154, "grad_norm": 0.4294794201850891, "learning_rate": 7.347573367794814e-06, "loss": 0.4009, "step": 123120 }, { "epoch": 2.7406071937321936, "grad_norm": 0.3603975772857666, "learning_rate": 7.3350575970960515e-06, "loss": 0.3148, "step": 123130 }, { "epoch": 2.7408297720797723, "grad_norm": 0.6385037899017334, "learning_rate": 7.322552295967966e-06, "loss": 0.5181, "step": 123140 }, { "epoch": 2.7410523504273505, "grad_norm": 0.44872191548347473, "learning_rate": 7.310057465090148e-06, "loss": 0.3141, "step": 123150 }, { "epoch": 2.7412749287749287, "grad_norm": 0.6203681230545044, "learning_rate": 7.297573105141542e-06, "loss": 0.4231, "step": 123160 }, { "epoch": 2.7414975071225074, "grad_norm": 0.4686487019062042, "learning_rate": 7.285099216800584e-06, "loss": 0.4527, "step": 123170 }, { "epoch": 2.7417200854700856, "grad_norm": 0.5996116995811462, "learning_rate": 7.2726358007450865e-06, "loss": 0.4712, "step": 123180 }, { "epoch": 2.741942663817664, "grad_norm": 0.512990415096283, "learning_rate": 7.260182857652331e-06, "loss": 0.5061, "step": 123190 }, { "epoch": 2.742165242165242, "grad_norm": 0.454429566860199, "learning_rate": 7.24774038819902e-06, "loss": 0.4519, "step": 123200 }, { "epoch": 2.7423878205128203, "grad_norm": 0.5068633556365967, "learning_rate": 7.235308393061302e-06, "loss": 0.5578, "step": 123210 }, { "epoch": 2.742610398860399, "grad_norm": 0.41929909586906433, "learning_rate": 7.222886872914703e-06, "loss": 0.4626, "step": 123220 }, { "epoch": 2.742832977207977, "grad_norm": 0.778678297996521, "learning_rate": 7.210475828434304e-06, "loss": 0.4833, "step": 123230 }, { "epoch": 2.7430555555555554, "grad_norm": 0.5140986442565918, "learning_rate": 7.198075260294413e-06, "loss": 0.4896, "step": 123240 }, { "epoch": 2.743278133903134, "grad_norm": 0.6002315282821655, "learning_rate": 7.185685169168999e-06, "loss": 0.4238, "step": 123250 }, { "epoch": 2.7435007122507122, "grad_norm": 0.7444051504135132, "learning_rate": 7.1733055557312574e-06, "loss": 0.3522, "step": 123260 }, { "epoch": 2.7437232905982905, "grad_norm": 0.5440506339073181, "learning_rate": 7.160936420653963e-06, "loss": 0.3381, "step": 123270 }, { "epoch": 2.743945868945869, "grad_norm": 0.6836532354354858, "learning_rate": 7.1485777646092435e-06, "loss": 0.5332, "step": 123280 }, { "epoch": 2.7441684472934473, "grad_norm": 0.4120360314846039, "learning_rate": 7.136229588268673e-06, "loss": 0.5638, "step": 123290 }, { "epoch": 2.7443910256410255, "grad_norm": 0.5837830305099487, "learning_rate": 7.1238918923032915e-06, "loss": 0.5332, "step": 123300 }, { "epoch": 2.744613603988604, "grad_norm": 0.5079546570777893, "learning_rate": 7.11156467738352e-06, "loss": 0.494, "step": 123310 }, { "epoch": 2.7448361823361824, "grad_norm": 0.46274396777153015, "learning_rate": 7.099247944179221e-06, "loss": 0.4824, "step": 123320 }, { "epoch": 2.7450587606837606, "grad_norm": 0.48522692918777466, "learning_rate": 7.0869416933597055e-06, "loss": 0.4459, "step": 123330 }, { "epoch": 2.7452813390313393, "grad_norm": 0.5520329475402832, "learning_rate": 7.074645925593704e-06, "loss": 0.4302, "step": 123340 }, { "epoch": 2.7455039173789175, "grad_norm": 0.7036489248275757, "learning_rate": 7.0623606415493705e-06, "loss": 0.4475, "step": 123350 }, { "epoch": 2.7457264957264957, "grad_norm": 0.38540059328079224, "learning_rate": 7.050085841894349e-06, "loss": 0.3854, "step": 123360 }, { "epoch": 2.745949074074074, "grad_norm": 0.49837902188301086, "learning_rate": 7.0378215272955735e-06, "loss": 0.3951, "step": 123370 }, { "epoch": 2.746171652421652, "grad_norm": 0.7058712840080261, "learning_rate": 7.025567698419555e-06, "loss": 0.4343, "step": 123380 }, { "epoch": 2.746394230769231, "grad_norm": 0.5129693746566772, "learning_rate": 7.013324355932182e-06, "loss": 0.4658, "step": 123390 }, { "epoch": 2.746616809116809, "grad_norm": 0.4032246470451355, "learning_rate": 7.001091500498724e-06, "loss": 0.5098, "step": 123400 }, { "epoch": 2.7468393874643873, "grad_norm": 0.5415821075439453, "learning_rate": 6.98886913278396e-06, "loss": 0.4953, "step": 123410 }, { "epoch": 2.747061965811966, "grad_norm": 0.6676926612854004, "learning_rate": 6.976657253452046e-06, "loss": 0.5028, "step": 123420 }, { "epoch": 2.747284544159544, "grad_norm": 0.49392446875572205, "learning_rate": 6.964455863166586e-06, "loss": 0.3837, "step": 123430 }, { "epoch": 2.7475071225071224, "grad_norm": 0.7210075855255127, "learning_rate": 6.952264962590649e-06, "loss": 0.4951, "step": 123440 }, { "epoch": 2.747729700854701, "grad_norm": 0.7871028780937195, "learning_rate": 6.940084552386661e-06, "loss": 0.5332, "step": 123450 }, { "epoch": 2.7479522792022792, "grad_norm": 0.7726924419403076, "learning_rate": 6.9279146332165146e-06, "loss": 0.4536, "step": 123460 }, { "epoch": 2.7481748575498575, "grad_norm": 0.8336293697357178, "learning_rate": 6.915755205741548e-06, "loss": 0.4544, "step": 123470 }, { "epoch": 2.748397435897436, "grad_norm": 0.6057497262954712, "learning_rate": 6.903606270622498e-06, "loss": 0.6092, "step": 123480 }, { "epoch": 2.7486200142450143, "grad_norm": 0.47200432419776917, "learning_rate": 6.8914678285195935e-06, "loss": 0.4223, "step": 123490 }, { "epoch": 2.7488425925925926, "grad_norm": 0.46911531686782837, "learning_rate": 6.8793398800923725e-06, "loss": 0.4531, "step": 123500 }, { "epoch": 2.7490651709401708, "grad_norm": 0.639143705368042, "learning_rate": 6.8672224259999304e-06, "loss": 0.333, "step": 123510 }, { "epoch": 2.7492877492877494, "grad_norm": 0.5659869909286499, "learning_rate": 6.855115466900741e-06, "loss": 0.4955, "step": 123520 }, { "epoch": 2.7495103276353277, "grad_norm": 0.6819111704826355, "learning_rate": 6.8430190034527e-06, "loss": 0.5726, "step": 123530 }, { "epoch": 2.749732905982906, "grad_norm": 0.5177549123764038, "learning_rate": 6.830933036313103e-06, "loss": 0.4823, "step": 123540 }, { "epoch": 2.749955484330484, "grad_norm": 0.511566698551178, "learning_rate": 6.818857566138759e-06, "loss": 0.4393, "step": 123550 }, { "epoch": 2.7501780626780628, "grad_norm": 0.7504387497901917, "learning_rate": 6.806792593585831e-06, "loss": 0.4776, "step": 123560 }, { "epoch": 2.750400641025641, "grad_norm": 0.6001155972480774, "learning_rate": 6.794738119309952e-06, "loss": 0.4133, "step": 123570 }, { "epoch": 2.750623219373219, "grad_norm": 0.5214382410049438, "learning_rate": 6.782694143966173e-06, "loss": 0.3677, "step": 123580 }, { "epoch": 2.750845797720798, "grad_norm": 0.32441210746765137, "learning_rate": 6.770660668208973e-06, "loss": 0.3922, "step": 123590 }, { "epoch": 2.751068376068376, "grad_norm": 0.7337201237678528, "learning_rate": 6.758637692692249e-06, "loss": 0.4378, "step": 123600 }, { "epoch": 2.7512909544159543, "grad_norm": 0.5562883615493774, "learning_rate": 6.746625218069347e-06, "loss": 0.4645, "step": 123610 }, { "epoch": 2.751513532763533, "grad_norm": 0.5827807784080505, "learning_rate": 6.7346232449930545e-06, "loss": 0.3289, "step": 123620 }, { "epoch": 2.751736111111111, "grad_norm": 0.44625237584114075, "learning_rate": 6.722631774115518e-06, "loss": 0.4546, "step": 123630 }, { "epoch": 2.7519586894586894, "grad_norm": 0.5511494874954224, "learning_rate": 6.710650806088415e-06, "loss": 0.4813, "step": 123640 }, { "epoch": 2.752181267806268, "grad_norm": 0.5771028995513916, "learning_rate": 6.698680341562757e-06, "loss": 0.452, "step": 123650 }, { "epoch": 2.7524038461538463, "grad_norm": 0.4411100149154663, "learning_rate": 6.686720381189071e-06, "loss": 0.461, "step": 123660 }, { "epoch": 2.7526264245014245, "grad_norm": 0.6261259913444519, "learning_rate": 6.67477092561728e-06, "loss": 0.4468, "step": 123670 }, { "epoch": 2.7528490028490027, "grad_norm": 0.5436714291572571, "learning_rate": 6.662831975496664e-06, "loss": 0.5056, "step": 123680 }, { "epoch": 2.7530715811965814, "grad_norm": 0.6230029463768005, "learning_rate": 6.650903531476038e-06, "loss": 0.4329, "step": 123690 }, { "epoch": 2.7532941595441596, "grad_norm": 0.46451959013938904, "learning_rate": 6.638985594203595e-06, "loss": 0.4803, "step": 123700 }, { "epoch": 2.753516737891738, "grad_norm": 0.33018794655799866, "learning_rate": 6.627078164326972e-06, "loss": 0.3603, "step": 123710 }, { "epoch": 2.753739316239316, "grad_norm": 0.6304956078529358, "learning_rate": 6.615181242493207e-06, "loss": 0.4246, "step": 123720 }, { "epoch": 2.7539618945868947, "grad_norm": 0.32348451018333435, "learning_rate": 6.603294829348849e-06, "loss": 0.4164, "step": 123730 }, { "epoch": 2.754184472934473, "grad_norm": 0.49687933921813965, "learning_rate": 6.591418925539739e-06, "loss": 0.4356, "step": 123740 }, { "epoch": 2.754407051282051, "grad_norm": 0.5507271885871887, "learning_rate": 6.5795535317112686e-06, "loss": 0.3441, "step": 123750 }, { "epoch": 2.7546296296296298, "grad_norm": 0.313600093126297, "learning_rate": 6.567698648508214e-06, "loss": 0.4408, "step": 123760 }, { "epoch": 2.754852207977208, "grad_norm": 0.7065009474754333, "learning_rate": 6.5558542765747465e-06, "loss": 0.5691, "step": 123770 }, { "epoch": 2.755074786324786, "grad_norm": 0.8352875113487244, "learning_rate": 6.544020416554508e-06, "loss": 0.4224, "step": 123780 }, { "epoch": 2.755297364672365, "grad_norm": 0.5489425659179688, "learning_rate": 6.532197069090584e-06, "loss": 0.5168, "step": 123790 }, { "epoch": 2.755519943019943, "grad_norm": 0.8018131256103516, "learning_rate": 6.520384234825438e-06, "loss": 0.4949, "step": 123800 }, { "epoch": 2.7557425213675213, "grad_norm": 0.7913144826889038, "learning_rate": 6.5085819144010244e-06, "loss": 0.424, "step": 123810 }, { "epoch": 2.7559650997151, "grad_norm": 0.5006592869758606, "learning_rate": 6.49679010845865e-06, "loss": 0.4368, "step": 123820 }, { "epoch": 2.756187678062678, "grad_norm": 0.5908501744270325, "learning_rate": 6.485008817639116e-06, "loss": 0.4645, "step": 123830 }, { "epoch": 2.7564102564102564, "grad_norm": 0.6493932008743286, "learning_rate": 6.473238042582619e-06, "loss": 0.3913, "step": 123840 }, { "epoch": 2.7566328347578346, "grad_norm": 0.6723030805587769, "learning_rate": 6.461477783928804e-06, "loss": 0.4737, "step": 123850 }, { "epoch": 2.7568554131054133, "grad_norm": 0.3196379244327545, "learning_rate": 6.449728042316716e-06, "loss": 0.4074, "step": 123860 }, { "epoch": 2.7570779914529915, "grad_norm": 0.42770472168922424, "learning_rate": 6.437988818384865e-06, "loss": 0.4899, "step": 123870 }, { "epoch": 2.7573005698005697, "grad_norm": 0.8020942807197571, "learning_rate": 6.426260112771165e-06, "loss": 0.5244, "step": 123880 }, { "epoch": 2.757523148148148, "grad_norm": 0.6081737279891968, "learning_rate": 6.414541926112949e-06, "loss": 0.3525, "step": 123890 }, { "epoch": 2.7577457264957266, "grad_norm": 0.5558830499649048, "learning_rate": 6.4028342590469966e-06, "loss": 0.4875, "step": 123900 }, { "epoch": 2.757968304843305, "grad_norm": 0.7631189823150635, "learning_rate": 6.391137112209511e-06, "loss": 0.4206, "step": 123910 }, { "epoch": 2.758190883190883, "grad_norm": 0.5545490980148315, "learning_rate": 6.379450486236138e-06, "loss": 0.4263, "step": 123920 }, { "epoch": 2.7584134615384617, "grad_norm": 0.39973217248916626, "learning_rate": 6.367774381761926e-06, "loss": 0.4443, "step": 123930 }, { "epoch": 2.75863603988604, "grad_norm": 0.34016963839530945, "learning_rate": 6.356108799421368e-06, "loss": 0.4435, "step": 123940 }, { "epoch": 2.758858618233618, "grad_norm": 0.5182162523269653, "learning_rate": 6.344453739848399e-06, "loss": 0.381, "step": 123950 }, { "epoch": 2.7590811965811968, "grad_norm": 0.5975072979927063, "learning_rate": 6.332809203676315e-06, "loss": 0.4656, "step": 123960 }, { "epoch": 2.759303774928775, "grad_norm": 0.5642514228820801, "learning_rate": 6.321175191537942e-06, "loss": 0.4357, "step": 123970 }, { "epoch": 2.759526353276353, "grad_norm": 0.4777040183544159, "learning_rate": 6.309551704065464e-06, "loss": 0.4087, "step": 123980 }, { "epoch": 2.759748931623932, "grad_norm": 0.5907963514328003, "learning_rate": 6.297938741890486e-06, "loss": 0.4484, "step": 123990 }, { "epoch": 2.75997150997151, "grad_norm": 0.6896931529045105, "learning_rate": 6.286336305644125e-06, "loss": 0.4042, "step": 124000 }, { "epoch": 2.7601940883190883, "grad_norm": 0.37854743003845215, "learning_rate": 6.2747443959568106e-06, "loss": 0.3652, "step": 124010 }, { "epoch": 2.760327635327635, "eval_loss": 0.5210950970649719, "eval_runtime": 337.1981, "eval_samples_per_second": 7.014, "eval_steps_per_second": 7.014, "step": 124016 }, { "epoch": 2.7604166666666665, "grad_norm": 0.5546755194664001, "learning_rate": 6.2631630134585066e-06, "loss": 0.4984, "step": 124020 }, { "epoch": 2.760639245014245, "grad_norm": 0.3803851902484894, "learning_rate": 6.251592158778485e-06, "loss": 0.3478, "step": 124030 }, { "epoch": 2.7608618233618234, "grad_norm": 0.8168506622314453, "learning_rate": 6.240031832545579e-06, "loss": 0.5383, "step": 124040 }, { "epoch": 2.7610844017094016, "grad_norm": 0.4964335560798645, "learning_rate": 6.22848203538795e-06, "loss": 0.4466, "step": 124050 }, { "epoch": 2.76130698005698, "grad_norm": 0.4928797781467438, "learning_rate": 6.216942767933232e-06, "loss": 0.4293, "step": 124060 }, { "epoch": 2.7615295584045585, "grad_norm": 0.5578243136405945, "learning_rate": 6.205414030808476e-06, "loss": 0.4597, "step": 124070 }, { "epoch": 2.7617521367521367, "grad_norm": 0.9077683687210083, "learning_rate": 6.1938958246402055e-06, "loss": 0.4468, "step": 124080 }, { "epoch": 2.761974715099715, "grad_norm": 0.40201684832572937, "learning_rate": 6.182388150054252e-06, "loss": 0.4142, "step": 124090 }, { "epoch": 2.7621972934472936, "grad_norm": 0.38440248370170593, "learning_rate": 6.170891007676005e-06, "loss": 0.4585, "step": 124100 }, { "epoch": 2.762419871794872, "grad_norm": 0.4817364811897278, "learning_rate": 6.15940439813023e-06, "loss": 0.4741, "step": 124110 }, { "epoch": 2.76264245014245, "grad_norm": 0.5801695585250854, "learning_rate": 6.147928322041096e-06, "loss": 0.4417, "step": 124120 }, { "epoch": 2.7628650284900287, "grad_norm": 0.5268610119819641, "learning_rate": 6.136462780032215e-06, "loss": 0.5868, "step": 124130 }, { "epoch": 2.763087606837607, "grad_norm": 0.3276066482067108, "learning_rate": 6.125007772726665e-06, "loss": 0.4841, "step": 124140 }, { "epoch": 2.763310185185185, "grad_norm": 0.8135481476783752, "learning_rate": 6.113563300746928e-06, "loss": 0.5494, "step": 124150 }, { "epoch": 2.763532763532764, "grad_norm": 0.6948876976966858, "learning_rate": 6.102129364714881e-06, "loss": 0.384, "step": 124160 }, { "epoch": 2.763755341880342, "grad_norm": 0.7384427189826965, "learning_rate": 6.090705965251831e-06, "loss": 0.4289, "step": 124170 }, { "epoch": 2.76397792022792, "grad_norm": 0.6335643529891968, "learning_rate": 6.079293102978567e-06, "loss": 0.4731, "step": 124180 }, { "epoch": 2.7642004985754984, "grad_norm": 0.566062867641449, "learning_rate": 6.067890778515261e-06, "loss": 0.5385, "step": 124190 }, { "epoch": 2.7644230769230766, "grad_norm": 0.39977380633354187, "learning_rate": 6.056498992481552e-06, "loss": 0.4217, "step": 124200 }, { "epoch": 2.7646456552706553, "grad_norm": 0.48577919602394104, "learning_rate": 6.045117745496431e-06, "loss": 0.4004, "step": 124210 }, { "epoch": 2.7648682336182335, "grad_norm": 0.6411281824111938, "learning_rate": 6.033747038178428e-06, "loss": 0.3765, "step": 124220 }, { "epoch": 2.7650908119658117, "grad_norm": 0.8137752413749695, "learning_rate": 6.022386871145358e-06, "loss": 0.5462, "step": 124230 }, { "epoch": 2.7653133903133904, "grad_norm": 0.6225287914276123, "learning_rate": 6.011037245014594e-06, "loss": 0.461, "step": 124240 }, { "epoch": 2.7655359686609686, "grad_norm": 0.3489503860473633, "learning_rate": 5.999698160402889e-06, "loss": 0.349, "step": 124250 }, { "epoch": 2.765758547008547, "grad_norm": 0.5511321425437927, "learning_rate": 5.988369617926371e-06, "loss": 0.4547, "step": 124260 }, { "epoch": 2.7659811253561255, "grad_norm": 0.5289965867996216, "learning_rate": 5.977051618200702e-06, "loss": 0.4483, "step": 124270 }, { "epoch": 2.7662037037037037, "grad_norm": 0.686833381652832, "learning_rate": 5.965744161840881e-06, "loss": 0.4185, "step": 124280 }, { "epoch": 2.766426282051282, "grad_norm": 0.5750662088394165, "learning_rate": 5.95444724946137e-06, "loss": 0.4877, "step": 124290 }, { "epoch": 2.7666488603988606, "grad_norm": 0.5627291798591614, "learning_rate": 5.943160881676036e-06, "loss": 0.4081, "step": 124300 }, { "epoch": 2.766871438746439, "grad_norm": 0.8569827079772949, "learning_rate": 5.9318850590982076e-06, "loss": 0.4886, "step": 124310 }, { "epoch": 2.767094017094017, "grad_norm": 0.6511522531509399, "learning_rate": 5.9206197823406196e-06, "loss": 0.6063, "step": 124320 }, { "epoch": 2.7673165954415957, "grad_norm": 0.6511116027832031, "learning_rate": 5.909365052015403e-06, "loss": 0.3868, "step": 124330 }, { "epoch": 2.767539173789174, "grad_norm": 0.5759599804878235, "learning_rate": 5.8981208687342024e-06, "loss": 0.5286, "step": 124340 }, { "epoch": 2.767761752136752, "grad_norm": 0.4088830351829529, "learning_rate": 5.886887233108018e-06, "loss": 0.5101, "step": 124350 }, { "epoch": 2.7679843304843303, "grad_norm": 0.39041629433631897, "learning_rate": 5.875664145747295e-06, "loss": 0.3891, "step": 124360 }, { "epoch": 2.7682069088319086, "grad_norm": 0.5441334843635559, "learning_rate": 5.864451607261901e-06, "loss": 0.4568, "step": 124370 }, { "epoch": 2.7684294871794872, "grad_norm": 0.5711949467658997, "learning_rate": 5.853249618261103e-06, "loss": 0.4501, "step": 124380 }, { "epoch": 2.7686520655270654, "grad_norm": 0.7167291045188904, "learning_rate": 5.842058179353682e-06, "loss": 0.476, "step": 124390 }, { "epoch": 2.7688746438746437, "grad_norm": 0.373648077249527, "learning_rate": 5.830877291147752e-06, "loss": 0.3384, "step": 124400 }, { "epoch": 2.7690972222222223, "grad_norm": 0.7929762005805969, "learning_rate": 5.819706954250936e-06, "loss": 0.3829, "step": 124410 }, { "epoch": 2.7693198005698005, "grad_norm": 0.5502915978431702, "learning_rate": 5.808547169270173e-06, "loss": 0.4779, "step": 124420 }, { "epoch": 2.7695423789173788, "grad_norm": 0.43421173095703125, "learning_rate": 5.797397936811933e-06, "loss": 0.3928, "step": 124430 }, { "epoch": 2.7697649572649574, "grad_norm": 0.49342089891433716, "learning_rate": 5.786259257482085e-06, "loss": 0.3114, "step": 124440 }, { "epoch": 2.7699875356125356, "grad_norm": 0.793108344078064, "learning_rate": 5.775131131885903e-06, "loss": 0.538, "step": 124450 }, { "epoch": 2.770210113960114, "grad_norm": 0.6837170124053955, "learning_rate": 5.764013560628079e-06, "loss": 0.6071, "step": 124460 }, { "epoch": 2.7704326923076925, "grad_norm": 0.7336123585700989, "learning_rate": 5.752906544312753e-06, "loss": 0.4091, "step": 124470 }, { "epoch": 2.7706552706552707, "grad_norm": 0.567870557308197, "learning_rate": 5.741810083543531e-06, "loss": 0.4292, "step": 124480 }, { "epoch": 2.770877849002849, "grad_norm": 0.667264997959137, "learning_rate": 5.730724178923353e-06, "loss": 0.5032, "step": 124490 }, { "epoch": 2.7711004273504276, "grad_norm": 0.5979461073875427, "learning_rate": 5.719648831054691e-06, "loss": 0.3892, "step": 124500 }, { "epoch": 2.771323005698006, "grad_norm": 0.7902460694313049, "learning_rate": 5.708584040539333e-06, "loss": 0.5792, "step": 124510 }, { "epoch": 2.771545584045584, "grad_norm": 0.5519484877586365, "learning_rate": 5.697529807978574e-06, "loss": 0.4082, "step": 124520 }, { "epoch": 2.7717681623931623, "grad_norm": 0.4593891203403473, "learning_rate": 5.686486133973112e-06, "loss": 0.4129, "step": 124530 }, { "epoch": 2.7719907407407405, "grad_norm": 0.5422836542129517, "learning_rate": 5.675453019123089e-06, "loss": 0.555, "step": 124540 }, { "epoch": 2.772213319088319, "grad_norm": 0.7721577286720276, "learning_rate": 5.664430464028004e-06, "loss": 0.5353, "step": 124550 }, { "epoch": 2.7724358974358974, "grad_norm": 0.42286717891693115, "learning_rate": 5.653418469286864e-06, "loss": 0.3813, "step": 124560 }, { "epoch": 2.7726584757834756, "grad_norm": 0.7902956604957581, "learning_rate": 5.64241703549806e-06, "loss": 0.4462, "step": 124570 }, { "epoch": 2.7728810541310542, "grad_norm": 0.4630812108516693, "learning_rate": 5.6314261632594455e-06, "loss": 0.4752, "step": 124580 }, { "epoch": 2.7731036324786325, "grad_norm": 0.702799379825592, "learning_rate": 5.620445853168232e-06, "loss": 0.4306, "step": 124590 }, { "epoch": 2.7733262108262107, "grad_norm": 0.7227345705032349, "learning_rate": 5.609476105821099e-06, "loss": 0.3994, "step": 124600 }, { "epoch": 2.7735487891737893, "grad_norm": 0.485037237405777, "learning_rate": 5.59851692181419e-06, "loss": 0.4176, "step": 124610 }, { "epoch": 2.7737713675213675, "grad_norm": 0.5751772522926331, "learning_rate": 5.587568301743007e-06, "loss": 0.3802, "step": 124620 }, { "epoch": 2.7739939458689458, "grad_norm": 0.5980727076530457, "learning_rate": 5.576630246202519e-06, "loss": 0.4938, "step": 124630 }, { "epoch": 2.7742165242165244, "grad_norm": 0.7590274214744568, "learning_rate": 5.565702755787116e-06, "loss": 0.4358, "step": 124640 }, { "epoch": 2.7744391025641026, "grad_norm": 0.48052945733070374, "learning_rate": 5.554785831090592e-06, "loss": 0.5187, "step": 124650 }, { "epoch": 2.774661680911681, "grad_norm": 0.44331279397010803, "learning_rate": 5.543879472706181e-06, "loss": 0.387, "step": 124660 }, { "epoch": 2.7748842592592595, "grad_norm": 0.47672250866889954, "learning_rate": 5.532983681226567e-06, "loss": 0.4659, "step": 124670 }, { "epoch": 2.7751068376068377, "grad_norm": 0.5024214386940002, "learning_rate": 5.522098457243807e-06, "loss": 0.4339, "step": 124680 }, { "epoch": 2.775329415954416, "grad_norm": 0.406444251537323, "learning_rate": 5.511223801349408e-06, "loss": 0.3837, "step": 124690 }, { "epoch": 2.775551994301994, "grad_norm": 0.5859958529472351, "learning_rate": 5.50035971413434e-06, "loss": 0.4784, "step": 124700 }, { "epoch": 2.7757745726495724, "grad_norm": 0.5342255234718323, "learning_rate": 5.489506196188931e-06, "loss": 0.4102, "step": 124710 }, { "epoch": 2.775997150997151, "grad_norm": 0.516022801399231, "learning_rate": 5.478663248103022e-06, "loss": 0.5015, "step": 124720 }, { "epoch": 2.7762197293447293, "grad_norm": 0.6034132242202759, "learning_rate": 5.467830870465762e-06, "loss": 0.5297, "step": 124730 }, { "epoch": 2.7764423076923075, "grad_norm": 0.4431053698062897, "learning_rate": 5.457009063865837e-06, "loss": 0.425, "step": 124740 }, { "epoch": 2.776664886039886, "grad_norm": 0.6430148482322693, "learning_rate": 5.446197828891309e-06, "loss": 0.4861, "step": 124750 }, { "epoch": 2.7768874643874644, "grad_norm": 0.4936456084251404, "learning_rate": 5.435397166129641e-06, "loss": 0.4151, "step": 124760 }, { "epoch": 2.7771100427350426, "grad_norm": 0.5719223022460938, "learning_rate": 5.4246070761677646e-06, "loss": 0.5108, "step": 124770 }, { "epoch": 2.7773326210826212, "grad_norm": 0.6354074478149414, "learning_rate": 5.413827559592055e-06, "loss": 0.4843, "step": 124780 }, { "epoch": 2.7775551994301995, "grad_norm": 0.6604524850845337, "learning_rate": 5.403058616988243e-06, "loss": 0.4567, "step": 124790 }, { "epoch": 2.7777777777777777, "grad_norm": 0.666455864906311, "learning_rate": 5.392300248941551e-06, "loss": 0.5346, "step": 124800 }, { "epoch": 2.7780003561253563, "grad_norm": 0.45406487584114075, "learning_rate": 5.3815524560365314e-06, "loss": 0.4984, "step": 124810 }, { "epoch": 2.7782229344729346, "grad_norm": 0.5581161379814148, "learning_rate": 5.370815238857296e-06, "loss": 0.5025, "step": 124820 }, { "epoch": 2.7784455128205128, "grad_norm": 0.48750677704811096, "learning_rate": 5.3600885979872895e-06, "loss": 0.4852, "step": 124830 }, { "epoch": 2.7786680911680914, "grad_norm": 0.5511132478713989, "learning_rate": 5.3493725340094e-06, "loss": 0.5341, "step": 124840 }, { "epoch": 2.7788906695156697, "grad_norm": 0.6001371741294861, "learning_rate": 5.338667047505963e-06, "loss": 0.5021, "step": 124850 }, { "epoch": 2.779113247863248, "grad_norm": 0.6603468656539917, "learning_rate": 5.327972139058712e-06, "loss": 0.4132, "step": 124860 }, { "epoch": 2.779335826210826, "grad_norm": 0.5646976232528687, "learning_rate": 5.317287809248828e-06, "loss": 0.481, "step": 124870 }, { "epoch": 2.7795584045584043, "grad_norm": 0.7938178777694702, "learning_rate": 5.306614058656889e-06, "loss": 0.5433, "step": 124880 }, { "epoch": 2.779780982905983, "grad_norm": 0.6014885306358337, "learning_rate": 5.295950887862922e-06, "loss": 0.4261, "step": 124890 }, { "epoch": 2.780003561253561, "grad_norm": 1.25307035446167, "learning_rate": 5.285298297446395e-06, "loss": 0.482, "step": 124900 }, { "epoch": 2.7802261396011394, "grad_norm": 0.49934521317481995, "learning_rate": 5.274656287986135e-06, "loss": 0.4825, "step": 124910 }, { "epoch": 2.780448717948718, "grad_norm": 0.7475011944770813, "learning_rate": 5.264024860060501e-06, "loss": 0.4176, "step": 124920 }, { "epoch": 2.7806712962962963, "grad_norm": 0.4853910207748413, "learning_rate": 5.253404014247143e-06, "loss": 0.3782, "step": 124930 }, { "epoch": 2.7808938746438745, "grad_norm": 0.3415036201477051, "learning_rate": 5.242793751123265e-06, "loss": 0.4971, "step": 124940 }, { "epoch": 2.781116452991453, "grad_norm": 0.4938315451145172, "learning_rate": 5.232194071265384e-06, "loss": 0.3815, "step": 124950 }, { "epoch": 2.7813390313390314, "grad_norm": 0.6822963356971741, "learning_rate": 5.2216049752495275e-06, "loss": 0.465, "step": 124960 }, { "epoch": 2.7815616096866096, "grad_norm": 0.44059741497039795, "learning_rate": 5.211026463651103e-06, "loss": 0.4366, "step": 124970 }, { "epoch": 2.7817841880341883, "grad_norm": 0.5339191555976868, "learning_rate": 5.200458537044961e-06, "loss": 0.4865, "step": 124980 }, { "epoch": 2.7820067663817665, "grad_norm": 0.5560976266860962, "learning_rate": 5.189901196005376e-06, "loss": 0.4412, "step": 124990 }, { "epoch": 2.7822293447293447, "grad_norm": 0.725497841835022, "learning_rate": 5.179354441106066e-06, "loss": 0.3774, "step": 125000 }, { "epoch": 2.7824519230769234, "grad_norm": 0.5456880927085876, "learning_rate": 5.168818272920084e-06, "loss": 0.4396, "step": 125010 }, { "epoch": 2.7826745014245016, "grad_norm": 0.6306502819061279, "learning_rate": 5.1582926920200395e-06, "loss": 0.5471, "step": 125020 }, { "epoch": 2.78289707977208, "grad_norm": 0.5285366773605347, "learning_rate": 5.1477776989778515e-06, "loss": 0.4517, "step": 125030 }, { "epoch": 2.783119658119658, "grad_norm": 0.5569385886192322, "learning_rate": 5.1372732943649524e-06, "loss": 0.4586, "step": 125040 }, { "epoch": 2.783342236467236, "grad_norm": 0.4127381145954132, "learning_rate": 5.126779478752131e-06, "loss": 0.503, "step": 125050 }, { "epoch": 2.783564814814815, "grad_norm": 0.6115913987159729, "learning_rate": 5.116296252709685e-06, "loss": 0.4978, "step": 125060 }, { "epoch": 2.783787393162393, "grad_norm": 0.6389346122741699, "learning_rate": 5.105823616807226e-06, "loss": 0.5112, "step": 125070 }, { "epoch": 2.7840099715099713, "grad_norm": 0.7028676271438599, "learning_rate": 5.095361571613833e-06, "loss": 0.5135, "step": 125080 }, { "epoch": 2.78423254985755, "grad_norm": 0.6609174609184265, "learning_rate": 5.0849101176980496e-06, "loss": 0.5216, "step": 125090 }, { "epoch": 2.784455128205128, "grad_norm": 0.49764448404312134, "learning_rate": 5.074469255627823e-06, "loss": 0.4357, "step": 125100 }, { "epoch": 2.7846777065527064, "grad_norm": 0.4654216766357422, "learning_rate": 5.064038985970499e-06, "loss": 0.3966, "step": 125110 }, { "epoch": 2.784900284900285, "grad_norm": 0.7057489156723022, "learning_rate": 5.0536193092928894e-06, "loss": 0.4451, "step": 125120 }, { "epoch": 2.7851228632478633, "grad_norm": 0.5709713101387024, "learning_rate": 5.043210226161211e-06, "loss": 0.4062, "step": 125130 }, { "epoch": 2.7853454415954415, "grad_norm": 0.535394549369812, "learning_rate": 5.032811737141074e-06, "loss": 0.4252, "step": 125140 }, { "epoch": 2.78556801994302, "grad_norm": 0.5492165684700012, "learning_rate": 5.022423842797563e-06, "loss": 0.3341, "step": 125150 }, { "epoch": 2.7857905982905984, "grad_norm": 0.3868101239204407, "learning_rate": 5.012046543695137e-06, "loss": 0.3926, "step": 125160 }, { "epoch": 2.7860131766381766, "grad_norm": 0.5796651840209961, "learning_rate": 5.001679840397744e-06, "loss": 0.4226, "step": 125170 }, { "epoch": 2.786235754985755, "grad_norm": 0.4309646785259247, "learning_rate": 4.991323733468689e-06, "loss": 0.4942, "step": 125180 }, { "epoch": 2.7864583333333335, "grad_norm": 0.430446594953537, "learning_rate": 4.980978223470745e-06, "loss": 0.3336, "step": 125190 }, { "epoch": 2.7866809116809117, "grad_norm": 0.3631274104118347, "learning_rate": 4.9706433109661276e-06, "loss": 0.3256, "step": 125200 }, { "epoch": 2.78690349002849, "grad_norm": 0.3939351737499237, "learning_rate": 4.960318996516411e-06, "loss": 0.4333, "step": 125210 }, { "epoch": 2.787126068376068, "grad_norm": 0.6610859036445618, "learning_rate": 4.9500052806825905e-06, "loss": 0.3629, "step": 125220 }, { "epoch": 2.787348646723647, "grad_norm": 0.8630814552307129, "learning_rate": 4.939702164025173e-06, "loss": 0.3412, "step": 125230 }, { "epoch": 2.787571225071225, "grad_norm": 0.7305048704147339, "learning_rate": 4.929409647103999e-06, "loss": 0.5311, "step": 125240 }, { "epoch": 2.7877938034188032, "grad_norm": 0.5739668607711792, "learning_rate": 4.919127730478423e-06, "loss": 0.4532, "step": 125250 }, { "epoch": 2.788016381766382, "grad_norm": 0.7734736204147339, "learning_rate": 4.90885641470713e-06, "loss": 0.5096, "step": 125260 }, { "epoch": 2.78823896011396, "grad_norm": 0.4135821759700775, "learning_rate": 4.898595700348297e-06, "loss": 0.4665, "step": 125270 }, { "epoch": 2.7884615384615383, "grad_norm": 0.6077858209609985, "learning_rate": 4.8883455879595195e-06, "loss": 0.3647, "step": 125280 }, { "epoch": 2.788684116809117, "grad_norm": 0.7480467557907104, "learning_rate": 4.8781060780977325e-06, "loss": 0.478, "step": 125290 }, { "epoch": 2.788906695156695, "grad_norm": 0.49442699551582336, "learning_rate": 4.867877171319402e-06, "loss": 0.473, "step": 125300 }, { "epoch": 2.7891292735042734, "grad_norm": 0.38600459694862366, "learning_rate": 4.85765886818037e-06, "loss": 0.365, "step": 125310 }, { "epoch": 2.789351851851852, "grad_norm": 0.4062456786632538, "learning_rate": 4.847451169235906e-06, "loss": 0.4105, "step": 125320 }, { "epoch": 2.7895744301994303, "grad_norm": 0.49349337816238403, "learning_rate": 4.83725407504072e-06, "loss": 0.5138, "step": 125330 }, { "epoch": 2.7897970085470085, "grad_norm": 0.7541506886482239, "learning_rate": 4.827067586148903e-06, "loss": 0.4975, "step": 125340 }, { "epoch": 2.7900195868945867, "grad_norm": 0.7903211116790771, "learning_rate": 4.816891703114013e-06, "loss": 0.4628, "step": 125350 }, { "epoch": 2.7902421652421654, "grad_norm": 0.7652298808097839, "learning_rate": 4.806726426489006e-06, "loss": 0.5695, "step": 125360 }, { "epoch": 2.7904647435897436, "grad_norm": 0.7200684547424316, "learning_rate": 4.7965717568262844e-06, "loss": 0.4031, "step": 125370 }, { "epoch": 2.790687321937322, "grad_norm": 0.41406622529029846, "learning_rate": 4.786427694677653e-06, "loss": 0.5313, "step": 125380 }, { "epoch": 2.7909099002849, "grad_norm": 0.546970009803772, "learning_rate": 4.7762942405943365e-06, "loss": 0.4762, "step": 125390 }, { "epoch": 2.7911324786324787, "grad_norm": 0.5090673565864563, "learning_rate": 4.766171395127006e-06, "loss": 0.4604, "step": 125400 }, { "epoch": 2.791355056980057, "grad_norm": 0.5817758440971375, "learning_rate": 4.756059158825754e-06, "loss": 0.4538, "step": 125410 }, { "epoch": 2.791577635327635, "grad_norm": 0.4301309287548065, "learning_rate": 4.74595753224012e-06, "loss": 0.4941, "step": 125420 }, { "epoch": 2.791800213675214, "grad_norm": 0.555213987827301, "learning_rate": 4.7358665159189516e-06, "loss": 0.447, "step": 125430 }, { "epoch": 2.792022792022792, "grad_norm": 0.5680949687957764, "learning_rate": 4.725786110410657e-06, "loss": 0.5646, "step": 125440 }, { "epoch": 2.7922453703703702, "grad_norm": 0.568378746509552, "learning_rate": 4.715716316262997e-06, "loss": 0.4267, "step": 125450 }, { "epoch": 2.792467948717949, "grad_norm": 0.6673989295959473, "learning_rate": 4.7056571340232225e-06, "loss": 0.4868, "step": 125460 }, { "epoch": 2.792690527065527, "grad_norm": 0.5990675687789917, "learning_rate": 4.6956085642378745e-06, "loss": 0.4915, "step": 125470 }, { "epoch": 2.7929131054131053, "grad_norm": 0.4197758734226227, "learning_rate": 4.685570607453027e-06, "loss": 0.448, "step": 125480 }, { "epoch": 2.793135683760684, "grad_norm": 0.6081216335296631, "learning_rate": 4.675543264214222e-06, "loss": 0.486, "step": 125490 }, { "epoch": 2.793358262108262, "grad_norm": 0.5012257099151611, "learning_rate": 4.665526535066245e-06, "loss": 0.4603, "step": 125500 }, { "epoch": 2.7935808404558404, "grad_norm": 0.5224705934524536, "learning_rate": 4.655520420553483e-06, "loss": 0.3978, "step": 125510 }, { "epoch": 2.7938034188034186, "grad_norm": 0.5236812233924866, "learning_rate": 4.645524921219635e-06, "loss": 0.4805, "step": 125520 }, { "epoch": 2.7940259971509973, "grad_norm": 0.6798905730247498, "learning_rate": 4.635540037607911e-06, "loss": 0.5028, "step": 125530 }, { "epoch": 2.7942485754985755, "grad_norm": 0.3394434452056885, "learning_rate": 4.6255657702608535e-06, "loss": 0.4286, "step": 125540 }, { "epoch": 2.7944711538461537, "grad_norm": 0.42189839482307434, "learning_rate": 4.615602119720519e-06, "loss": 0.4352, "step": 125550 }, { "epoch": 2.794693732193732, "grad_norm": 0.5317621827125549, "learning_rate": 4.605649086528319e-06, "loss": 0.448, "step": 125560 }, { "epoch": 2.7949163105413106, "grad_norm": 0.7038161158561707, "learning_rate": 4.595706671225086e-06, "loss": 0.5898, "step": 125570 }, { "epoch": 2.795138888888889, "grad_norm": 0.44456297159194946, "learning_rate": 4.5857748743511234e-06, "loss": 0.403, "step": 125580 }, { "epoch": 2.795361467236467, "grad_norm": 0.6659712791442871, "learning_rate": 4.575853696446131e-06, "loss": 0.3923, "step": 125590 }, { "epoch": 2.7955840455840457, "grad_norm": 0.4893523156642914, "learning_rate": 4.565943138049233e-06, "loss": 0.3691, "step": 125600 }, { "epoch": 2.795806623931624, "grad_norm": 0.6819437146186829, "learning_rate": 4.556043199698956e-06, "loss": 0.5384, "step": 125610 }, { "epoch": 2.796029202279202, "grad_norm": 0.3867335319519043, "learning_rate": 4.54615388193329e-06, "loss": 0.423, "step": 125620 }, { "epoch": 2.796251780626781, "grad_norm": 0.614936888217926, "learning_rate": 4.5362751852896516e-06, "loss": 0.4349, "step": 125630 }, { "epoch": 2.796474358974359, "grad_norm": 0.513956606388092, "learning_rate": 4.526407110304831e-06, "loss": 0.3862, "step": 125640 }, { "epoch": 2.7966969373219372, "grad_norm": 0.5959755778312683, "learning_rate": 4.516549657515046e-06, "loss": 0.4267, "step": 125650 }, { "epoch": 2.796919515669516, "grad_norm": 0.6045657992362976, "learning_rate": 4.506702827456e-06, "loss": 0.3683, "step": 125660 }, { "epoch": 2.797142094017094, "grad_norm": 0.5449283719062805, "learning_rate": 4.496866620662754e-06, "loss": 0.4474, "step": 125670 }, { "epoch": 2.7973646723646723, "grad_norm": 0.5704584717750549, "learning_rate": 4.487041037669814e-06, "loss": 0.3552, "step": 125680 }, { "epoch": 2.7975872507122506, "grad_norm": 0.8659281730651855, "learning_rate": 4.477226079011154e-06, "loss": 0.4936, "step": 125690 }, { "epoch": 2.7978098290598292, "grad_norm": 0.4428424537181854, "learning_rate": 4.467421745220058e-06, "loss": 0.422, "step": 125700 }, { "epoch": 2.7980324074074074, "grad_norm": 0.660078763961792, "learning_rate": 4.457628036829342e-06, "loss": 0.5, "step": 125710 }, { "epoch": 2.7982549857549857, "grad_norm": 0.46205437183380127, "learning_rate": 4.447844954371206e-06, "loss": 0.468, "step": 125720 }, { "epoch": 2.798477564102564, "grad_norm": 0.49835318326950073, "learning_rate": 4.438072498377244e-06, "loss": 0.3658, "step": 125730 }, { "epoch": 2.7987001424501425, "grad_norm": 0.7425667643547058, "learning_rate": 4.428310669378543e-06, "loss": 0.5988, "step": 125740 }, { "epoch": 2.7989227207977208, "grad_norm": 0.3822995126247406, "learning_rate": 4.418559467905503e-06, "loss": 0.5071, "step": 125750 }, { "epoch": 2.799145299145299, "grad_norm": 0.5722664594650269, "learning_rate": 4.408818894488076e-06, "loss": 0.4471, "step": 125760 }, { "epoch": 2.7993678774928776, "grad_norm": 0.29368162155151367, "learning_rate": 4.399088949655572e-06, "loss": 0.401, "step": 125770 }, { "epoch": 2.799590455840456, "grad_norm": 0.33703193068504333, "learning_rate": 4.38936963393668e-06, "loss": 0.5182, "step": 125780 }, { "epoch": 2.799813034188034, "grad_norm": 0.43243011832237244, "learning_rate": 4.379660947859554e-06, "loss": 0.4736, "step": 125790 }, { "epoch": 2.8000356125356127, "grad_norm": 0.5151998400688171, "learning_rate": 4.3699628919518175e-06, "loss": 0.4869, "step": 125800 }, { "epoch": 2.800258190883191, "grad_norm": 0.6595456600189209, "learning_rate": 4.360275466740449e-06, "loss": 0.4484, "step": 125810 }, { "epoch": 2.800480769230769, "grad_norm": 0.796154797077179, "learning_rate": 4.350598672751849e-06, "loss": 0.4647, "step": 125820 }, { "epoch": 2.800703347578348, "grad_norm": 0.4498961567878723, "learning_rate": 4.340932510511908e-06, "loss": 0.4975, "step": 125830 }, { "epoch": 2.800925925925926, "grad_norm": 0.46631789207458496, "learning_rate": 4.331276980545873e-06, "loss": 0.4019, "step": 125840 }, { "epoch": 2.8011485042735043, "grad_norm": 0.7194572687149048, "learning_rate": 4.321632083378413e-06, "loss": 0.4679, "step": 125850 }, { "epoch": 2.8013710826210825, "grad_norm": 0.6665747761726379, "learning_rate": 4.311997819533664e-06, "loss": 0.4112, "step": 125860 }, { "epoch": 2.8015936609686607, "grad_norm": 0.6622679233551025, "learning_rate": 4.302374189535141e-06, "loss": 0.53, "step": 125870 }, { "epoch": 2.8018162393162394, "grad_norm": 0.38641971349716187, "learning_rate": 4.2927611939058034e-06, "loss": 0.4984, "step": 125880 }, { "epoch": 2.8020388176638176, "grad_norm": 0.5284295082092285, "learning_rate": 4.283158833168055e-06, "loss": 0.3875, "step": 125890 }, { "epoch": 2.802261396011396, "grad_norm": 0.7484142184257507, "learning_rate": 4.273567107843657e-06, "loss": 0.5029, "step": 125900 }, { "epoch": 2.8024839743589745, "grad_norm": 0.5397800207138062, "learning_rate": 4.26398601845388e-06, "loss": 0.3654, "step": 125910 }, { "epoch": 2.8027065527065527, "grad_norm": 0.5725739002227783, "learning_rate": 4.25441556551931e-06, "loss": 0.3719, "step": 125920 }, { "epoch": 2.802929131054131, "grad_norm": 0.5289227962493896, "learning_rate": 4.244855749560062e-06, "loss": 0.4642, "step": 125930 }, { "epoch": 2.8031517094017095, "grad_norm": 0.7622392177581787, "learning_rate": 4.235306571095588e-06, "loss": 0.4956, "step": 125940 }, { "epoch": 2.8033742877492878, "grad_norm": 0.48069143295288086, "learning_rate": 4.225768030644828e-06, "loss": 0.3907, "step": 125950 }, { "epoch": 2.803596866096866, "grad_norm": 0.5886992812156677, "learning_rate": 4.2162401287261015e-06, "loss": 0.4595, "step": 125960 }, { "epoch": 2.8038194444444446, "grad_norm": 0.5862254500389099, "learning_rate": 4.206722865857171e-06, "loss": 0.4601, "step": 125970 }, { "epoch": 2.804042022792023, "grad_norm": 0.37320825457572937, "learning_rate": 4.19721624255518e-06, "loss": 0.3773, "step": 125980 }, { "epoch": 2.804264601139601, "grad_norm": 0.7246841192245483, "learning_rate": 4.187720259336781e-06, "loss": 0.4905, "step": 125990 }, { "epoch": 2.8044871794871797, "grad_norm": 0.588402509689331, "learning_rate": 4.178234916717938e-06, "loss": 0.5129, "step": 126000 }, { "epoch": 2.804709757834758, "grad_norm": 0.6401274800300598, "learning_rate": 4.168760215214129e-06, "loss": 0.5112, "step": 126010 }, { "epoch": 2.804932336182336, "grad_norm": 0.683003306388855, "learning_rate": 4.159296155340187e-06, "loss": 0.4992, "step": 126020 }, { "epoch": 2.8051549145299144, "grad_norm": 0.5846920609474182, "learning_rate": 4.149842737610432e-06, "loss": 0.5349, "step": 126030 }, { "epoch": 2.8053774928774926, "grad_norm": 0.45215484499931335, "learning_rate": 4.140399962538543e-06, "loss": 0.4911, "step": 126040 }, { "epoch": 2.8056000712250713, "grad_norm": 0.6347735524177551, "learning_rate": 4.130967830637666e-06, "loss": 0.5719, "step": 126050 }, { "epoch": 2.8058226495726495, "grad_norm": 0.7858456969261169, "learning_rate": 4.121546342420346e-06, "loss": 0.4583, "step": 126060 }, { "epoch": 2.8060452279202277, "grad_norm": 0.5836461186408997, "learning_rate": 4.112135498398528e-06, "loss": 0.4243, "step": 126070 }, { "epoch": 2.8062678062678064, "grad_norm": 0.47294196486473083, "learning_rate": 4.102735299083649e-06, "loss": 0.4377, "step": 126080 }, { "epoch": 2.8064903846153846, "grad_norm": 0.5034077763557434, "learning_rate": 4.093345744986499e-06, "loss": 0.4291, "step": 126090 }, { "epoch": 2.806712962962963, "grad_norm": 0.38051632046699524, "learning_rate": 4.083966836617315e-06, "loss": 0.4075, "step": 126100 }, { "epoch": 2.8069355413105415, "grad_norm": 0.7089518308639526, "learning_rate": 4.074598574485778e-06, "loss": 0.4402, "step": 126110 }, { "epoch": 2.8071581196581197, "grad_norm": 0.5025211572647095, "learning_rate": 4.0652409591009245e-06, "loss": 0.4942, "step": 126120 }, { "epoch": 2.807380698005698, "grad_norm": 0.43359848856925964, "learning_rate": 4.0558939909712825e-06, "loss": 0.5245, "step": 126130 }, { "epoch": 2.8076032763532766, "grad_norm": 0.43336302042007446, "learning_rate": 4.0465576706047785e-06, "loss": 0.4345, "step": 126140 }, { "epoch": 2.8078258547008548, "grad_norm": 0.6243006587028503, "learning_rate": 4.037231998508717e-06, "loss": 0.5161, "step": 126150 }, { "epoch": 2.808048433048433, "grad_norm": 0.5328023433685303, "learning_rate": 4.027916975189916e-06, "loss": 0.4368, "step": 126160 }, { "epoch": 2.8082710113960117, "grad_norm": 0.6565059423446655, "learning_rate": 4.018612601154525e-06, "loss": 0.4876, "step": 126170 }, { "epoch": 2.80849358974359, "grad_norm": 0.4544715881347656, "learning_rate": 4.0093188769081635e-06, "loss": 0.4173, "step": 126180 }, { "epoch": 2.808716168091168, "grad_norm": 0.38822659850120544, "learning_rate": 4.000035802955871e-06, "loss": 0.5016, "step": 126190 }, { "epoch": 2.8089387464387463, "grad_norm": 0.5639353394508362, "learning_rate": 3.990763379802087e-06, "loss": 0.4352, "step": 126200 }, { "epoch": 2.8091613247863245, "grad_norm": 0.38232603669166565, "learning_rate": 3.981501607950655e-06, "loss": 0.4434, "step": 126210 }, { "epoch": 2.809383903133903, "grad_norm": 0.6549859642982483, "learning_rate": 3.972250487904905e-06, "loss": 0.4758, "step": 126220 }, { "epoch": 2.8096064814814814, "grad_norm": 0.5064232349395752, "learning_rate": 3.963010020167546e-06, "loss": 0.4107, "step": 126230 }, { "epoch": 2.8098290598290596, "grad_norm": 0.514802873134613, "learning_rate": 3.953780205240709e-06, "loss": 0.4204, "step": 126240 }, { "epoch": 2.8100516381766383, "grad_norm": 0.48119041323661804, "learning_rate": 3.9445610436259494e-06, "loss": 0.3383, "step": 126250 }, { "epoch": 2.8102742165242165, "grad_norm": 0.5888558030128479, "learning_rate": 3.9353525358242434e-06, "loss": 0.4728, "step": 126260 }, { "epoch": 2.8104967948717947, "grad_norm": 0.4223555624485016, "learning_rate": 3.926154682335992e-06, "loss": 0.4459, "step": 126270 }, { "epoch": 2.8107193732193734, "grad_norm": 0.5040433406829834, "learning_rate": 3.916967483660994e-06, "loss": 0.5122, "step": 126280 }, { "epoch": 2.8109419515669516, "grad_norm": 0.643981397151947, "learning_rate": 3.907790940298495e-06, "loss": 0.3608, "step": 126290 }, { "epoch": 2.81116452991453, "grad_norm": 0.5450314283370972, "learning_rate": 3.898625052747185e-06, "loss": 0.3875, "step": 126300 }, { "epoch": 2.8113871082621085, "grad_norm": 0.9616496562957764, "learning_rate": 3.88946982150511e-06, "loss": 0.4265, "step": 126310 }, { "epoch": 2.8116096866096867, "grad_norm": 0.6152632832527161, "learning_rate": 3.8803252470697825e-06, "loss": 0.4989, "step": 126320 }, { "epoch": 2.811832264957265, "grad_norm": 0.8327240347862244, "learning_rate": 3.8711913299381615e-06, "loss": 0.4178, "step": 126330 }, { "epoch": 2.8120548433048436, "grad_norm": 0.4893125593662262, "learning_rate": 3.862068070606539e-06, "loss": 0.4356, "step": 126340 }, { "epoch": 2.812277421652422, "grad_norm": 0.3900088667869568, "learning_rate": 3.8529554695706956e-06, "loss": 0.4796, "step": 126350 }, { "epoch": 2.8125, "grad_norm": 0.4981417655944824, "learning_rate": 3.843853527325836e-06, "loss": 0.4961, "step": 126360 }, { "epoch": 2.812722578347578, "grad_norm": 0.5793061256408691, "learning_rate": 3.834762244366563e-06, "loss": 0.407, "step": 126370 }, { "epoch": 2.8129451566951564, "grad_norm": 0.5003716349601746, "learning_rate": 3.825681621186905e-06, "loss": 0.5477, "step": 126380 }, { "epoch": 2.813167735042735, "grad_norm": 0.6106581091880798, "learning_rate": 3.816611658280289e-06, "loss": 0.4214, "step": 126390 }, { "epoch": 2.8133903133903133, "grad_norm": 0.526982843875885, "learning_rate": 3.80755235613961e-06, "loss": 0.5292, "step": 126400 }, { "epoch": 2.8136128917378915, "grad_norm": 0.5547360777854919, "learning_rate": 3.7985037152571403e-06, "loss": 0.453, "step": 126410 }, { "epoch": 2.81383547008547, "grad_norm": 0.6486905217170715, "learning_rate": 3.789465736124598e-06, "loss": 0.4772, "step": 126420 }, { "epoch": 2.8140580484330484, "grad_norm": 0.5603926181793213, "learning_rate": 3.780438419233101e-06, "loss": 0.3599, "step": 126430 }, { "epoch": 2.8142806267806266, "grad_norm": 0.5745129585266113, "learning_rate": 3.7714217650732354e-06, "loss": 0.4661, "step": 126440 }, { "epoch": 2.8145032051282053, "grad_norm": 0.5068896412849426, "learning_rate": 3.7624157741349197e-06, "loss": 0.4888, "step": 126450 }, { "epoch": 2.8147257834757835, "grad_norm": 0.5847540497779846, "learning_rate": 3.7534204469076074e-06, "loss": 0.5063, "step": 126460 }, { "epoch": 2.8149483618233617, "grad_norm": 0.550115704536438, "learning_rate": 3.7444357838800846e-06, "loss": 0.4651, "step": 126470 }, { "epoch": 2.8151709401709404, "grad_norm": 0.8214982748031616, "learning_rate": 3.7354617855405614e-06, "loss": 0.4216, "step": 126480 }, { "epoch": 2.8153935185185186, "grad_norm": 0.6252423524856567, "learning_rate": 3.7264984523767142e-06, "loss": 0.5309, "step": 126490 }, { "epoch": 2.815616096866097, "grad_norm": 0.5870510935783386, "learning_rate": 3.7175457848756424e-06, "loss": 0.4906, "step": 126500 }, { "epoch": 2.8158386752136755, "grad_norm": 0.5593652129173279, "learning_rate": 3.7086037835238007e-06, "loss": 0.4567, "step": 126510 }, { "epoch": 2.8160612535612537, "grad_norm": 0.3342607915401459, "learning_rate": 3.6996724488071342e-06, "loss": 0.5018, "step": 126520 }, { "epoch": 2.816283831908832, "grad_norm": 0.510809600353241, "learning_rate": 3.6907517812109436e-06, "loss": 0.3862, "step": 126530 }, { "epoch": 2.81650641025641, "grad_norm": 0.6610252261161804, "learning_rate": 3.681841781220019e-06, "loss": 0.4706, "step": 126540 }, { "epoch": 2.8167289886039883, "grad_norm": 0.6215569376945496, "learning_rate": 3.672942449318528e-06, "loss": 0.4252, "step": 126550 }, { "epoch": 2.816951566951567, "grad_norm": 0.5946906208992004, "learning_rate": 3.6640537859900627e-06, "loss": 0.4338, "step": 126560 }, { "epoch": 2.8171741452991452, "grad_norm": 0.6003249883651733, "learning_rate": 3.6551757917176357e-06, "loss": 0.3925, "step": 126570 }, { "epoch": 2.8173967236467234, "grad_norm": 0.6325357556343079, "learning_rate": 3.6463084669837057e-06, "loss": 0.4601, "step": 126580 }, { "epoch": 2.817619301994302, "grad_norm": 0.6173017621040344, "learning_rate": 3.6374518122701095e-06, "loss": 0.4204, "step": 126590 }, { "epoch": 2.8178418803418803, "grad_norm": 0.509817898273468, "learning_rate": 3.628605828058129e-06, "loss": 0.4388, "step": 126600 }, { "epoch": 2.8180644586894585, "grad_norm": 0.6608301997184753, "learning_rate": 3.619770514828469e-06, "loss": 0.5231, "step": 126610 }, { "epoch": 2.818287037037037, "grad_norm": 0.471277117729187, "learning_rate": 3.610945873061233e-06, "loss": 0.406, "step": 126620 }, { "epoch": 2.8185096153846154, "grad_norm": 0.47932931780815125, "learning_rate": 3.6021319032359724e-06, "loss": 0.3938, "step": 126630 }, { "epoch": 2.8187321937321936, "grad_norm": 0.5314711928367615, "learning_rate": 3.5933286058316808e-06, "loss": 0.3801, "step": 126640 }, { "epoch": 2.8189547720797723, "grad_norm": 0.42228272557258606, "learning_rate": 3.5845359813266643e-06, "loss": 0.4342, "step": 126650 }, { "epoch": 2.8191773504273505, "grad_norm": 0.6404412388801575, "learning_rate": 3.5757540301987415e-06, "loss": 0.4596, "step": 126660 }, { "epoch": 2.8193999287749287, "grad_norm": 0.6327511072158813, "learning_rate": 3.566982752925152e-06, "loss": 0.5203, "step": 126670 }, { "epoch": 2.8196225071225074, "grad_norm": 0.5190104246139526, "learning_rate": 3.5582221499825598e-06, "loss": 0.5352, "step": 126680 }, { "epoch": 2.8198450854700856, "grad_norm": 0.5785208344459534, "learning_rate": 3.5494722218469614e-06, "loss": 0.4493, "step": 126690 }, { "epoch": 2.820067663817664, "grad_norm": 0.447017639875412, "learning_rate": 3.5407329689938653e-06, "loss": 0.3606, "step": 126700 }, { "epoch": 2.820290242165242, "grad_norm": 0.6020628809928894, "learning_rate": 3.5320043918981804e-06, "loss": 0.3456, "step": 126710 }, { "epoch": 2.820334757834758, "eval_loss": 0.5204442739486694, "eval_runtime": 337.1596, "eval_samples_per_second": 7.014, "eval_steps_per_second": 7.014, "step": 126712 }, { "epoch": 2.8205128205128203, "grad_norm": 0.4100450575351715, "learning_rate": 3.5232864910341943e-06, "loss": 0.4373, "step": 126720 }, { "epoch": 2.820735398860399, "grad_norm": 0.6168124079704285, "learning_rate": 3.514579266875684e-06, "loss": 0.4102, "step": 126730 }, { "epoch": 2.820957977207977, "grad_norm": 0.5441197752952576, "learning_rate": 3.5058827198957812e-06, "loss": 0.5041, "step": 126740 }, { "epoch": 2.8211805555555554, "grad_norm": 0.4099632799625397, "learning_rate": 3.497196850567064e-06, "loss": 0.4067, "step": 126750 }, { "epoch": 2.821403133903134, "grad_norm": 0.5914316177368164, "learning_rate": 3.4885216593615323e-06, "loss": 0.5025, "step": 126760 }, { "epoch": 2.8216257122507122, "grad_norm": 0.6284675598144531, "learning_rate": 3.4798571467506314e-06, "loss": 0.4211, "step": 126770 }, { "epoch": 2.8218482905982905, "grad_norm": 0.47363045811653137, "learning_rate": 3.4712033132051403e-06, "loss": 0.5034, "step": 126780 }, { "epoch": 2.822070868945869, "grad_norm": 0.4073488712310791, "learning_rate": 3.4625601591953716e-06, "loss": 0.452, "step": 126790 }, { "epoch": 2.8222934472934473, "grad_norm": 0.45261600613594055, "learning_rate": 3.4539276851909496e-06, "loss": 0.4788, "step": 126800 }, { "epoch": 2.8225160256410255, "grad_norm": 0.563221275806427, "learning_rate": 3.4453058916610103e-06, "loss": 0.4369, "step": 126810 }, { "epoch": 2.822738603988604, "grad_norm": 0.6849488019943237, "learning_rate": 3.436694779074068e-06, "loss": 0.5502, "step": 126820 }, { "epoch": 2.8229611823361824, "grad_norm": 0.5914343595504761, "learning_rate": 3.4280943478980364e-06, "loss": 0.4897, "step": 126830 }, { "epoch": 2.8231837606837606, "grad_norm": 0.45933276414871216, "learning_rate": 3.419504598600276e-06, "loss": 0.4959, "step": 126840 }, { "epoch": 2.8234063390313393, "grad_norm": 0.4319513440132141, "learning_rate": 3.4109255316475463e-06, "loss": 0.4694, "step": 126850 }, { "epoch": 2.8236289173789175, "grad_norm": 0.4329933822154999, "learning_rate": 3.4023571475060746e-06, "loss": 0.3935, "step": 126860 }, { "epoch": 2.8238514957264957, "grad_norm": 0.6808799505233765, "learning_rate": 3.3937994466414656e-06, "loss": 0.5708, "step": 126870 }, { "epoch": 2.824074074074074, "grad_norm": 0.36101189255714417, "learning_rate": 3.385252429518726e-06, "loss": 0.4141, "step": 126880 }, { "epoch": 2.824296652421652, "grad_norm": 0.6104711890220642, "learning_rate": 3.3767160966023503e-06, "loss": 0.5179, "step": 126890 }, { "epoch": 2.824519230769231, "grad_norm": 0.6297375559806824, "learning_rate": 3.36819044835619e-06, "loss": 0.5327, "step": 126900 }, { "epoch": 2.824741809116809, "grad_norm": 0.7188384532928467, "learning_rate": 3.3596754852435187e-06, "loss": 0.4808, "step": 126910 }, { "epoch": 2.8249643874643873, "grad_norm": 0.429747998714447, "learning_rate": 3.351171207727055e-06, "loss": 0.4285, "step": 126920 }, { "epoch": 2.825186965811966, "grad_norm": 0.5166102647781372, "learning_rate": 3.342677616268919e-06, "loss": 0.4007, "step": 126930 }, { "epoch": 2.825409544159544, "grad_norm": 0.5530314445495605, "learning_rate": 3.334194711330696e-06, "loss": 0.4761, "step": 126940 }, { "epoch": 2.8256321225071224, "grad_norm": 0.5777900815010071, "learning_rate": 3.325722493373307e-06, "loss": 0.4773, "step": 126950 }, { "epoch": 2.825854700854701, "grad_norm": 0.4906090497970581, "learning_rate": 3.3172609628571826e-06, "loss": 0.4418, "step": 126960 }, { "epoch": 2.8260772792022792, "grad_norm": 0.5480250716209412, "learning_rate": 3.308810120242112e-06, "loss": 0.483, "step": 126970 }, { "epoch": 2.8262998575498575, "grad_norm": 0.5232762098312378, "learning_rate": 3.3003699659873046e-06, "loss": 0.4827, "step": 126980 }, { "epoch": 2.826522435897436, "grad_norm": 0.5735209584236145, "learning_rate": 3.291940500551416e-06, "loss": 0.4374, "step": 126990 }, { "epoch": 2.8267450142450143, "grad_norm": 0.5852814316749573, "learning_rate": 3.2835217243925244e-06, "loss": 0.4975, "step": 127000 }, { "epoch": 2.8269675925925926, "grad_norm": 0.7762939929962158, "learning_rate": 3.275113637968086e-06, "loss": 0.4723, "step": 127010 }, { "epoch": 2.8271901709401708, "grad_norm": 0.6684213280677795, "learning_rate": 3.2667162417350238e-06, "loss": 0.5022, "step": 127020 }, { "epoch": 2.8274127492877494, "grad_norm": 0.6481077075004578, "learning_rate": 3.2583295361496845e-06, "loss": 0.442, "step": 127030 }, { "epoch": 2.8276353276353277, "grad_norm": 0.6304945945739746, "learning_rate": 3.24995352166777e-06, "loss": 0.5479, "step": 127040 }, { "epoch": 2.827857905982906, "grad_norm": 0.45239803194999695, "learning_rate": 3.2415881987444274e-06, "loss": 0.5192, "step": 127050 }, { "epoch": 2.828080484330484, "grad_norm": 0.5424669981002808, "learning_rate": 3.2332335678342484e-06, "loss": 0.4948, "step": 127060 }, { "epoch": 2.8283030626780628, "grad_norm": 0.5759827494621277, "learning_rate": 3.2248896293912477e-06, "loss": 0.4737, "step": 127070 }, { "epoch": 2.828525641025641, "grad_norm": 0.6620804667472839, "learning_rate": 3.2165563838688408e-06, "loss": 0.556, "step": 127080 }, { "epoch": 2.828748219373219, "grad_norm": 0.5302000641822815, "learning_rate": 3.2082338317198425e-06, "loss": 0.5109, "step": 127090 }, { "epoch": 2.828970797720798, "grad_norm": 0.6622860431671143, "learning_rate": 3.199921973396536e-06, "loss": 0.4877, "step": 127100 }, { "epoch": 2.829193376068376, "grad_norm": 0.707613468170166, "learning_rate": 3.1916208093505595e-06, "loss": 0.4645, "step": 127110 }, { "epoch": 2.8294159544159543, "grad_norm": 0.3570280075073242, "learning_rate": 3.1833303400330415e-06, "loss": 0.4153, "step": 127120 }, { "epoch": 2.829638532763533, "grad_norm": 0.5294520258903503, "learning_rate": 3.1750505658944664e-06, "loss": 0.4884, "step": 127130 }, { "epoch": 2.829861111111111, "grad_norm": 0.5556965470314026, "learning_rate": 3.1667814873847624e-06, "loss": 0.4469, "step": 127140 }, { "epoch": 2.8300836894586894, "grad_norm": 0.5460103750228882, "learning_rate": 3.1585231049532817e-06, "loss": 0.3631, "step": 127150 }, { "epoch": 2.830306267806268, "grad_norm": 0.6050337553024292, "learning_rate": 3.1502754190488205e-06, "loss": 0.4802, "step": 127160 }, { "epoch": 2.8305288461538463, "grad_norm": 0.4571405351161957, "learning_rate": 3.1420384301195316e-06, "loss": 0.4772, "step": 127170 }, { "epoch": 2.8307514245014245, "grad_norm": 0.4969148635864258, "learning_rate": 3.133812138613057e-06, "loss": 0.3877, "step": 127180 }, { "epoch": 2.8309740028490027, "grad_norm": 0.5661829710006714, "learning_rate": 3.1255965449763503e-06, "loss": 0.5466, "step": 127190 }, { "epoch": 2.8311965811965814, "grad_norm": 0.3782590329647064, "learning_rate": 3.117391649655921e-06, "loss": 0.3612, "step": 127200 }, { "epoch": 2.8314191595441596, "grad_norm": 0.42014560103416443, "learning_rate": 3.10919745309759e-06, "loss": 0.4288, "step": 127210 }, { "epoch": 2.831641737891738, "grad_norm": 0.5029391050338745, "learning_rate": 3.1010139557466455e-06, "loss": 0.4194, "step": 127220 }, { "epoch": 2.831864316239316, "grad_norm": 0.4752350151538849, "learning_rate": 3.0928411580477988e-06, "loss": 0.5007, "step": 127230 }, { "epoch": 2.8320868945868947, "grad_norm": 0.5849156975746155, "learning_rate": 3.0846790604451837e-06, "loss": 0.4155, "step": 127240 }, { "epoch": 2.832309472934473, "grad_norm": 0.6458626389503479, "learning_rate": 3.0765276633822894e-06, "loss": 0.492, "step": 127250 }, { "epoch": 2.832532051282051, "grad_norm": 0.41135790944099426, "learning_rate": 3.0683869673020947e-06, "loss": 0.3612, "step": 127260 }, { "epoch": 2.8327546296296298, "grad_norm": 0.5405125617980957, "learning_rate": 3.0602569726469578e-06, "loss": 0.4437, "step": 127270 }, { "epoch": 2.832977207977208, "grad_norm": 0.8160549402236938, "learning_rate": 3.05213767985868e-06, "loss": 0.5167, "step": 127280 }, { "epoch": 2.833199786324786, "grad_norm": 0.6462587118148804, "learning_rate": 3.044029089378486e-06, "loss": 0.4281, "step": 127290 }, { "epoch": 2.833422364672365, "grad_norm": 0.5424714088439941, "learning_rate": 3.03593120164698e-06, "loss": 0.477, "step": 127300 }, { "epoch": 2.833644943019943, "grad_norm": 0.6870628595352173, "learning_rate": 3.0278440171042087e-06, "loss": 0.5667, "step": 127310 }, { "epoch": 2.8338675213675213, "grad_norm": 0.3487043082714081, "learning_rate": 3.019767536189666e-06, "loss": 0.3682, "step": 127320 }, { "epoch": 2.8340900997151, "grad_norm": 0.6313628554344177, "learning_rate": 3.011701759342178e-06, "loss": 0.4717, "step": 127330 }, { "epoch": 2.834312678062678, "grad_norm": 0.49724081158638, "learning_rate": 3.003646687000106e-06, "loss": 0.426, "step": 127340 }, { "epoch": 2.8345352564102564, "grad_norm": 0.5283458232879639, "learning_rate": 2.995602319601121e-06, "loss": 0.4111, "step": 127350 }, { "epoch": 2.8347578347578346, "grad_norm": 0.5319889783859253, "learning_rate": 2.987568657582385e-06, "loss": 0.6391, "step": 127360 }, { "epoch": 2.8349804131054133, "grad_norm": 0.7493406534194946, "learning_rate": 2.9795457013804593e-06, "loss": 0.4456, "step": 127370 }, { "epoch": 2.8352029914529915, "grad_norm": 0.5402820706367493, "learning_rate": 2.9715334514313076e-06, "loss": 0.5811, "step": 127380 }, { "epoch": 2.8354255698005697, "grad_norm": 0.366567462682724, "learning_rate": 2.963531908170314e-06, "loss": 0.4182, "step": 127390 }, { "epoch": 2.835648148148148, "grad_norm": 0.4801349341869354, "learning_rate": 2.9555410720323086e-06, "loss": 0.5394, "step": 127400 }, { "epoch": 2.8358707264957266, "grad_norm": 0.5683795213699341, "learning_rate": 2.9475609434514993e-06, "loss": 0.4981, "step": 127410 }, { "epoch": 2.836093304843305, "grad_norm": 0.67473304271698, "learning_rate": 2.9395915228615402e-06, "loss": 0.5979, "step": 127420 }, { "epoch": 2.836315883190883, "grad_norm": 0.4936923682689667, "learning_rate": 2.931632810695506e-06, "loss": 0.4675, "step": 127430 }, { "epoch": 2.8365384615384617, "grad_norm": 0.5283862948417664, "learning_rate": 2.9236848073858736e-06, "loss": 0.4716, "step": 127440 }, { "epoch": 2.83676103988604, "grad_norm": 0.4947654604911804, "learning_rate": 2.9157475133645416e-06, "loss": 0.4225, "step": 127450 }, { "epoch": 2.836983618233618, "grad_norm": 0.7152960896492004, "learning_rate": 2.9078209290628324e-06, "loss": 0.532, "step": 127460 }, { "epoch": 2.8372061965811968, "grad_norm": 0.5929638147354126, "learning_rate": 2.899905054911489e-06, "loss": 0.4627, "step": 127470 }, { "epoch": 2.837428774928775, "grad_norm": 0.7341625094413757, "learning_rate": 2.8919998913406353e-06, "loss": 0.5209, "step": 127480 }, { "epoch": 2.837651353276353, "grad_norm": 0.5574802160263062, "learning_rate": 2.884105438779883e-06, "loss": 0.4576, "step": 127490 }, { "epoch": 2.837873931623932, "grad_norm": 0.6063029766082764, "learning_rate": 2.876221697658199e-06, "loss": 0.4474, "step": 127500 }, { "epoch": 2.83809650997151, "grad_norm": 0.7553101778030396, "learning_rate": 2.8683486684040195e-06, "loss": 0.6091, "step": 127510 }, { "epoch": 2.8383190883190883, "grad_norm": 0.6518235206604004, "learning_rate": 2.8604863514451354e-06, "loss": 0.5291, "step": 127520 }, { "epoch": 2.8385416666666665, "grad_norm": 0.7313382625579834, "learning_rate": 2.8526347472088266e-06, "loss": 0.518, "step": 127530 }, { "epoch": 2.838764245014245, "grad_norm": 0.5088934302330017, "learning_rate": 2.844793856121708e-06, "loss": 0.3735, "step": 127540 }, { "epoch": 2.8389868233618234, "grad_norm": 0.9321578741073608, "learning_rate": 2.836963678609905e-06, "loss": 0.5138, "step": 127550 }, { "epoch": 2.8392094017094016, "grad_norm": 0.5136719346046448, "learning_rate": 2.829144215098922e-06, "loss": 0.4957, "step": 127560 }, { "epoch": 2.83943198005698, "grad_norm": 0.6601418256759644, "learning_rate": 2.8213354660136416e-06, "loss": 0.524, "step": 127570 }, { "epoch": 2.8396545584045585, "grad_norm": 0.5681608319282532, "learning_rate": 2.8135374317783904e-06, "loss": 0.4735, "step": 127580 }, { "epoch": 2.8398771367521367, "grad_norm": 0.7215602993965149, "learning_rate": 2.8057501128169626e-06, "loss": 0.4704, "step": 127590 }, { "epoch": 2.840099715099715, "grad_norm": 0.4726629853248596, "learning_rate": 2.797973509552487e-06, "loss": 0.4404, "step": 127600 }, { "epoch": 2.8403222934472936, "grad_norm": 0.367449551820755, "learning_rate": 2.790207622407581e-06, "loss": 0.4292, "step": 127610 }, { "epoch": 2.840544871794872, "grad_norm": 0.5491681694984436, "learning_rate": 2.7824524518042405e-06, "loss": 0.4613, "step": 127620 }, { "epoch": 2.84076745014245, "grad_norm": 0.5599938631057739, "learning_rate": 2.7747079981638614e-06, "loss": 0.5249, "step": 127630 }, { "epoch": 2.8409900284900287, "grad_norm": 0.36625319719314575, "learning_rate": 2.766974261907307e-06, "loss": 0.4111, "step": 127640 }, { "epoch": 2.841212606837607, "grad_norm": 0.4233880043029785, "learning_rate": 2.7592512434548413e-06, "loss": 0.4154, "step": 127650 }, { "epoch": 2.841435185185185, "grad_norm": 0.34068143367767334, "learning_rate": 2.7515389432261508e-06, "loss": 0.4196, "step": 127660 }, { "epoch": 2.841657763532764, "grad_norm": 0.5412957668304443, "learning_rate": 2.7438373616403e-06, "loss": 0.436, "step": 127670 }, { "epoch": 2.841880341880342, "grad_norm": 0.6452593207359314, "learning_rate": 2.7361464991157993e-06, "loss": 0.4758, "step": 127680 }, { "epoch": 2.84210292022792, "grad_norm": 0.4374260902404785, "learning_rate": 2.7284663560706025e-06, "loss": 0.4615, "step": 127690 }, { "epoch": 2.8423254985754984, "grad_norm": 0.43188244104385376, "learning_rate": 2.7207969329220205e-06, "loss": 0.4227, "step": 127700 }, { "epoch": 2.8425480769230766, "grad_norm": 0.468960165977478, "learning_rate": 2.7131382300868314e-06, "loss": 0.3827, "step": 127710 }, { "epoch": 2.8427706552706553, "grad_norm": 0.6046063899993896, "learning_rate": 2.7054902479812128e-06, "loss": 0.389, "step": 127720 }, { "epoch": 2.8429932336182335, "grad_norm": 0.7549774646759033, "learning_rate": 2.697852987020788e-06, "loss": 0.5852, "step": 127730 }, { "epoch": 2.8432158119658117, "grad_norm": 0.5519178509712219, "learning_rate": 2.6902264476205363e-06, "loss": 0.5385, "step": 127740 }, { "epoch": 2.8434383903133904, "grad_norm": 0.7807755470275879, "learning_rate": 2.6826106301949037e-06, "loss": 0.5021, "step": 127750 }, { "epoch": 2.8436609686609686, "grad_norm": 0.48430293798446655, "learning_rate": 2.6750055351577595e-06, "loss": 0.4454, "step": 127760 }, { "epoch": 2.843883547008547, "grad_norm": 0.4661320447921753, "learning_rate": 2.667411162922329e-06, "loss": 0.3889, "step": 127770 }, { "epoch": 2.8441061253561255, "grad_norm": 0.5276303291320801, "learning_rate": 2.6598275139013253e-06, "loss": 0.4635, "step": 127780 }, { "epoch": 2.8443287037037037, "grad_norm": 0.6416008472442627, "learning_rate": 2.6522545885068417e-06, "loss": 0.455, "step": 127790 }, { "epoch": 2.844551282051282, "grad_norm": 0.44830918312072754, "learning_rate": 2.644692387150416e-06, "loss": 0.3487, "step": 127800 }, { "epoch": 2.8447738603988606, "grad_norm": 0.4769445061683655, "learning_rate": 2.6371409102429634e-06, "loss": 0.3851, "step": 127810 }, { "epoch": 2.844996438746439, "grad_norm": 0.5918802618980408, "learning_rate": 2.629600158194845e-06, "loss": 0.4871, "step": 127820 }, { "epoch": 2.845219017094017, "grad_norm": 0.36495670676231384, "learning_rate": 2.622070131415821e-06, "loss": 0.4913, "step": 127830 }, { "epoch": 2.8454415954415957, "grad_norm": 0.4469436705112457, "learning_rate": 2.614550830315099e-06, "loss": 0.5122, "step": 127840 }, { "epoch": 2.845664173789174, "grad_norm": 0.39218777418136597, "learning_rate": 2.607042255301262e-06, "loss": 0.3885, "step": 127850 }, { "epoch": 2.845886752136752, "grad_norm": 0.513641357421875, "learning_rate": 2.5995444067823393e-06, "loss": 0.4105, "step": 127860 }, { "epoch": 2.8461093304843303, "grad_norm": 0.4854564070701599, "learning_rate": 2.592057285165761e-06, "loss": 0.5354, "step": 127870 }, { "epoch": 2.8463319088319086, "grad_norm": 0.5514839887619019, "learning_rate": 2.5845808908584236e-06, "loss": 0.4986, "step": 127880 }, { "epoch": 2.8465544871794872, "grad_norm": 0.6030427813529968, "learning_rate": 2.5771152242665575e-06, "loss": 0.5883, "step": 127890 }, { "epoch": 2.8467770655270654, "grad_norm": 0.47126203775405884, "learning_rate": 2.5696602857958607e-06, "loss": 0.4896, "step": 127900 }, { "epoch": 2.8469996438746437, "grad_norm": 0.6155270338058472, "learning_rate": 2.5622160758514534e-06, "loss": 0.5152, "step": 127910 }, { "epoch": 2.8472222222222223, "grad_norm": 0.9987238645553589, "learning_rate": 2.554782594837857e-06, "loss": 0.4822, "step": 127920 }, { "epoch": 2.8474448005698005, "grad_norm": 0.6122363805770874, "learning_rate": 2.547359843159014e-06, "loss": 0.5391, "step": 127930 }, { "epoch": 2.8476673789173788, "grad_norm": 0.47502201795578003, "learning_rate": 2.5399478212182916e-06, "loss": 0.3807, "step": 127940 }, { "epoch": 2.8478899572649574, "grad_norm": 0.510317325592041, "learning_rate": 2.532546529418456e-06, "loss": 0.577, "step": 127950 }, { "epoch": 2.8481125356125356, "grad_norm": 0.6082240343093872, "learning_rate": 2.5251559681616744e-06, "loss": 0.4693, "step": 127960 }, { "epoch": 2.848335113960114, "grad_norm": 0.463861346244812, "learning_rate": 2.5177761378495812e-06, "loss": 0.4554, "step": 127970 }, { "epoch": 2.8485576923076925, "grad_norm": 0.5722596645355225, "learning_rate": 2.510407038883189e-06, "loss": 0.4323, "step": 127980 }, { "epoch": 2.8487802706552707, "grad_norm": 0.47360795736312866, "learning_rate": 2.5030486716629554e-06, "loss": 0.4379, "step": 127990 }, { "epoch": 2.849002849002849, "grad_norm": 0.8248348236083984, "learning_rate": 2.495701036588738e-06, "loss": 0.5192, "step": 128000 }, { "epoch": 2.8492254273504276, "grad_norm": 0.43759846687316895, "learning_rate": 2.4883641340598174e-06, "loss": 0.3721, "step": 128010 }, { "epoch": 2.849448005698006, "grad_norm": 0.3280053436756134, "learning_rate": 2.481037964474897e-06, "loss": 0.4763, "step": 128020 }, { "epoch": 2.849670584045584, "grad_norm": 0.5017495155334473, "learning_rate": 2.4737225282320363e-06, "loss": 0.4446, "step": 128030 }, { "epoch": 2.8498931623931623, "grad_norm": 0.6453086137771606, "learning_rate": 2.466417825728828e-06, "loss": 0.349, "step": 128040 }, { "epoch": 2.8501157407407405, "grad_norm": 0.6192348599433899, "learning_rate": 2.4591238573621556e-06, "loss": 0.439, "step": 128050 }, { "epoch": 2.850338319088319, "grad_norm": 0.4385824203491211, "learning_rate": 2.451840623528412e-06, "loss": 0.4657, "step": 128060 }, { "epoch": 2.8505608974358974, "grad_norm": 0.5602866411209106, "learning_rate": 2.44456812462337e-06, "loss": 0.4046, "step": 128070 }, { "epoch": 2.8507834757834756, "grad_norm": 0.37369683384895325, "learning_rate": 2.4373063610422462e-06, "loss": 0.5432, "step": 128080 }, { "epoch": 2.8510060541310542, "grad_norm": 0.5913586020469666, "learning_rate": 2.430055333179615e-06, "loss": 0.4079, "step": 128090 }, { "epoch": 2.8512286324786325, "grad_norm": 0.6015235185623169, "learning_rate": 2.4228150414295157e-06, "loss": 0.4556, "step": 128100 }, { "epoch": 2.8514512108262107, "grad_norm": 0.6328086256980896, "learning_rate": 2.4155854861853675e-06, "loss": 0.4006, "step": 128110 }, { "epoch": 2.8516737891737893, "grad_norm": 0.594316840171814, "learning_rate": 2.4083666678400784e-06, "loss": 0.3517, "step": 128120 }, { "epoch": 2.8518963675213675, "grad_norm": 0.4189058244228363, "learning_rate": 2.4011585867858898e-06, "loss": 0.4068, "step": 128130 }, { "epoch": 2.8521189458689458, "grad_norm": 0.6700271964073181, "learning_rate": 2.393961243414511e-06, "loss": 0.4682, "step": 128140 }, { "epoch": 2.8523415242165244, "grad_norm": 0.6584945321083069, "learning_rate": 2.3867746381170285e-06, "loss": 0.5559, "step": 128150 }, { "epoch": 2.8525641025641026, "grad_norm": 1.0810476541519165, "learning_rate": 2.3795987712840194e-06, "loss": 0.5629, "step": 128160 }, { "epoch": 2.852786680911681, "grad_norm": 0.5784086585044861, "learning_rate": 2.3724336433053716e-06, "loss": 0.3555, "step": 128170 }, { "epoch": 2.8530092592592595, "grad_norm": 0.4716869592666626, "learning_rate": 2.3652792545704627e-06, "loss": 0.4536, "step": 128180 }, { "epoch": 2.8532318376068377, "grad_norm": 0.5608788728713989, "learning_rate": 2.35813560546807e-06, "loss": 0.3744, "step": 128190 }, { "epoch": 2.853454415954416, "grad_norm": 0.5514051914215088, "learning_rate": 2.3510026963863953e-06, "loss": 0.3658, "step": 128200 }, { "epoch": 2.853676994301994, "grad_norm": 0.5831300020217896, "learning_rate": 2.343880527713038e-06, "loss": 0.4468, "step": 128210 }, { "epoch": 2.8538995726495724, "grad_norm": 0.47793155908584595, "learning_rate": 2.3367690998350233e-06, "loss": 0.4621, "step": 128220 }, { "epoch": 2.854122150997151, "grad_norm": 0.5891512036323547, "learning_rate": 2.329668413138797e-06, "loss": 0.4112, "step": 128230 }, { "epoch": 2.8543447293447293, "grad_norm": 0.6231299042701721, "learning_rate": 2.3225784680101837e-06, "loss": 0.4276, "step": 128240 }, { "epoch": 2.8545673076923075, "grad_norm": 0.4069092571735382, "learning_rate": 2.3154992648344977e-06, "loss": 0.4711, "step": 128250 }, { "epoch": 2.854789886039886, "grad_norm": 0.3998798727989197, "learning_rate": 2.3084308039964087e-06, "loss": 0.3865, "step": 128260 }, { "epoch": 2.8550124643874644, "grad_norm": 0.5938044786453247, "learning_rate": 2.301373085880032e-06, "loss": 0.3994, "step": 128270 }, { "epoch": 2.8552350427350426, "grad_norm": 0.41773974895477295, "learning_rate": 2.294326110868883e-06, "loss": 0.484, "step": 128280 }, { "epoch": 2.8554576210826212, "grad_norm": 0.6640601754188538, "learning_rate": 2.287289879345922e-06, "loss": 0.4097, "step": 128290 }, { "epoch": 2.8556801994301995, "grad_norm": 0.4293261766433716, "learning_rate": 2.280264391693465e-06, "loss": 0.6003, "step": 128300 }, { "epoch": 2.8559027777777777, "grad_norm": 0.5650845170021057, "learning_rate": 2.2732496482932964e-06, "loss": 0.4486, "step": 128310 }, { "epoch": 2.8561253561253563, "grad_norm": 0.5928626656532288, "learning_rate": 2.2662456495265994e-06, "loss": 0.5542, "step": 128320 }, { "epoch": 2.8563479344729346, "grad_norm": 0.40868303179740906, "learning_rate": 2.259252395774003e-06, "loss": 0.3978, "step": 128330 }, { "epoch": 2.8565705128205128, "grad_norm": 0.5804070830345154, "learning_rate": 2.2522698874155147e-06, "loss": 0.4422, "step": 128340 }, { "epoch": 2.8567930911680914, "grad_norm": 0.669746458530426, "learning_rate": 2.245298124830564e-06, "loss": 0.4555, "step": 128350 }, { "epoch": 2.8570156695156697, "grad_norm": 0.3729129135608673, "learning_rate": 2.238337108398003e-06, "loss": 0.4551, "step": 128360 }, { "epoch": 2.857238247863248, "grad_norm": 0.8856843709945679, "learning_rate": 2.231386838496086e-06, "loss": 0.4375, "step": 128370 }, { "epoch": 2.857460826210826, "grad_norm": 0.4789460301399231, "learning_rate": 2.224447315502509e-06, "loss": 0.4658, "step": 128380 }, { "epoch": 2.8576834045584043, "grad_norm": 0.6349992156028748, "learning_rate": 2.2175185397943945e-06, "loss": 0.4891, "step": 128390 }, { "epoch": 2.857905982905983, "grad_norm": 0.6243805885314941, "learning_rate": 2.2106005117482176e-06, "loss": 0.4175, "step": 128400 }, { "epoch": 2.858128561253561, "grad_norm": 0.7456267476081848, "learning_rate": 2.203693231739923e-06, "loss": 0.4172, "step": 128410 }, { "epoch": 2.8583511396011394, "grad_norm": 0.47280463576316833, "learning_rate": 2.1967967001448543e-06, "loss": 0.4273, "step": 128420 }, { "epoch": 2.858573717948718, "grad_norm": 0.7351090312004089, "learning_rate": 2.1899109173378006e-06, "loss": 0.5241, "step": 128430 }, { "epoch": 2.8587962962962963, "grad_norm": 0.7132675051689148, "learning_rate": 2.18303588369293e-06, "loss": 0.4239, "step": 128440 }, { "epoch": 2.8590188746438745, "grad_norm": 0.3661953806877136, "learning_rate": 2.1761715995838094e-06, "loss": 0.3363, "step": 128450 }, { "epoch": 2.859241452991453, "grad_norm": 0.5877260565757751, "learning_rate": 2.169318065383474e-06, "loss": 0.3942, "step": 128460 }, { "epoch": 2.8594640313390314, "grad_norm": 0.4182318150997162, "learning_rate": 2.162475281464338e-06, "loss": 0.3919, "step": 128470 }, { "epoch": 2.8596866096866096, "grad_norm": 0.5911064743995667, "learning_rate": 2.15564324819828e-06, "loss": 0.451, "step": 128480 }, { "epoch": 2.8599091880341883, "grad_norm": 0.3642343282699585, "learning_rate": 2.148821965956516e-06, "loss": 0.4713, "step": 128490 }, { "epoch": 2.8601317663817665, "grad_norm": 0.6984941959381104, "learning_rate": 2.142011435109725e-06, "loss": 0.5226, "step": 128500 }, { "epoch": 2.8603543447293447, "grad_norm": 0.578916609287262, "learning_rate": 2.1352116560280354e-06, "loss": 0.3807, "step": 128510 }, { "epoch": 2.8605769230769234, "grad_norm": 0.7987368702888489, "learning_rate": 2.128422629080884e-06, "loss": 0.4601, "step": 128520 }, { "epoch": 2.8607995014245016, "grad_norm": 0.6165527701377869, "learning_rate": 2.1216443546372643e-06, "loss": 0.525, "step": 128530 }, { "epoch": 2.86102207977208, "grad_norm": 0.5393047332763672, "learning_rate": 2.11487683306546e-06, "loss": 0.4502, "step": 128540 }, { "epoch": 2.861244658119658, "grad_norm": 0.658702552318573, "learning_rate": 2.108120064733243e-06, "loss": 0.2996, "step": 128550 }, { "epoch": 2.861467236467236, "grad_norm": 0.45965611934661865, "learning_rate": 2.1013740500078092e-06, "loss": 0.42, "step": 128560 }, { "epoch": 2.861689814814815, "grad_norm": 0.4500766694545746, "learning_rate": 2.0946387892557097e-06, "loss": 0.4174, "step": 128570 }, { "epoch": 2.861912393162393, "grad_norm": 0.5137393474578857, "learning_rate": 2.08791428284294e-06, "loss": 0.4264, "step": 128580 }, { "epoch": 2.8621349715099713, "grad_norm": 0.5398975610733032, "learning_rate": 2.0812005311349192e-06, "loss": 0.5062, "step": 128590 }, { "epoch": 2.86235754985755, "grad_norm": 0.5769351124763489, "learning_rate": 2.074497534496489e-06, "loss": 0.5821, "step": 128600 }, { "epoch": 2.862580128205128, "grad_norm": 0.42253240942955017, "learning_rate": 2.0678052932919133e-06, "loss": 0.4055, "step": 128610 }, { "epoch": 2.8628027065527064, "grad_norm": 0.523838460445404, "learning_rate": 2.061123807884813e-06, "loss": 0.486, "step": 128620 }, { "epoch": 2.863025284900285, "grad_norm": 0.8152051568031311, "learning_rate": 2.0544530786382963e-06, "loss": 0.4849, "step": 128630 }, { "epoch": 2.8632478632478633, "grad_norm": 0.4388711750507355, "learning_rate": 2.0477931059148303e-06, "loss": 0.4611, "step": 128640 }, { "epoch": 2.8634704415954415, "grad_norm": 0.6085971593856812, "learning_rate": 2.041143890076369e-06, "loss": 0.4708, "step": 128650 }, { "epoch": 2.86369301994302, "grad_norm": 0.6009733080863953, "learning_rate": 2.0345054314841794e-06, "loss": 0.5275, "step": 128660 }, { "epoch": 2.8639155982905984, "grad_norm": 0.4018401503562927, "learning_rate": 2.027877730499039e-06, "loss": 0.4101, "step": 128670 }, { "epoch": 2.8641381766381766, "grad_norm": 0.46677204966545105, "learning_rate": 2.021260787481083e-06, "loss": 0.3558, "step": 128680 }, { "epoch": 2.864360754985755, "grad_norm": 0.4653564691543579, "learning_rate": 2.0146546027898674e-06, "loss": 0.4384, "step": 128690 }, { "epoch": 2.8645833333333335, "grad_norm": 0.4220982491970062, "learning_rate": 2.008059176784438e-06, "loss": 0.4263, "step": 128700 }, { "epoch": 2.8648059116809117, "grad_norm": 0.6132118105888367, "learning_rate": 2.0014745098231314e-06, "loss": 0.5962, "step": 128710 }, { "epoch": 2.86502849002849, "grad_norm": 0.7703640460968018, "learning_rate": 1.994900602263794e-06, "loss": 0.4711, "step": 128720 }, { "epoch": 2.865251068376068, "grad_norm": 0.6671891808509827, "learning_rate": 1.9883374544636512e-06, "loss": 0.4536, "step": 128730 }, { "epoch": 2.865473646723647, "grad_norm": 0.46717581152915955, "learning_rate": 1.9817850667793737e-06, "loss": 0.427, "step": 128740 }, { "epoch": 2.865696225071225, "grad_norm": 0.4125966727733612, "learning_rate": 1.975243439566987e-06, "loss": 0.4134, "step": 128750 }, { "epoch": 2.8659188034188032, "grad_norm": 0.400468111038208, "learning_rate": 1.9687125731819635e-06, "loss": 0.3723, "step": 128760 }, { "epoch": 2.866141381766382, "grad_norm": 0.48118314146995544, "learning_rate": 1.9621924679792403e-06, "loss": 0.4294, "step": 128770 }, { "epoch": 2.86636396011396, "grad_norm": 0.5905781388282776, "learning_rate": 1.955683124313068e-06, "loss": 0.4468, "step": 128780 }, { "epoch": 2.8665865384615383, "grad_norm": 0.7723633646965027, "learning_rate": 1.94918454253723e-06, "loss": 0.3429, "step": 128790 }, { "epoch": 2.866809116809117, "grad_norm": 0.43844008445739746, "learning_rate": 1.9426967230048443e-06, "loss": 0.3715, "step": 128800 }, { "epoch": 2.867031695156695, "grad_norm": 0.6145934462547302, "learning_rate": 1.9362196660684286e-06, "loss": 0.4389, "step": 128810 }, { "epoch": 2.8672542735042734, "grad_norm": 0.7906620502471924, "learning_rate": 1.92975337207999e-06, "loss": 0.43, "step": 128820 }, { "epoch": 2.867476851851852, "grad_norm": 0.6965433359146118, "learning_rate": 1.923297841390892e-06, "loss": 0.3551, "step": 128830 }, { "epoch": 2.8676994301994303, "grad_norm": 0.5444207191467285, "learning_rate": 1.916853074351943e-06, "loss": 0.474, "step": 128840 }, { "epoch": 2.8679220085470085, "grad_norm": 0.5031320452690125, "learning_rate": 1.910419071313374e-06, "loss": 0.4394, "step": 128850 }, { "epoch": 2.8681445868945867, "grad_norm": 0.5183652639389038, "learning_rate": 1.9039958326247942e-06, "loss": 0.4802, "step": 128860 }, { "epoch": 2.8683671652421654, "grad_norm": 0.48607540130615234, "learning_rate": 1.8975833586352576e-06, "loss": 0.4347, "step": 128870 }, { "epoch": 2.8685897435897436, "grad_norm": 0.5282498598098755, "learning_rate": 1.8911816496931968e-06, "loss": 0.4464, "step": 128880 }, { "epoch": 2.868812321937322, "grad_norm": 0.4897420108318329, "learning_rate": 1.8847907061464887e-06, "loss": 0.4407, "step": 128890 }, { "epoch": 2.8690349002849, "grad_norm": 0.4289381206035614, "learning_rate": 1.8784105283424558e-06, "loss": 0.4751, "step": 128900 }, { "epoch": 2.8692574786324787, "grad_norm": 0.5549202561378479, "learning_rate": 1.8720411166277985e-06, "loss": 0.3789, "step": 128910 }, { "epoch": 2.869480056980057, "grad_norm": 0.39817145466804504, "learning_rate": 1.8656824713485954e-06, "loss": 0.4434, "step": 128920 }, { "epoch": 2.869702635327635, "grad_norm": 0.5653124451637268, "learning_rate": 1.8593345928504368e-06, "loss": 0.514, "step": 128930 }, { "epoch": 2.869925213675214, "grad_norm": 0.4783931076526642, "learning_rate": 1.8529974814782248e-06, "loss": 0.5078, "step": 128940 }, { "epoch": 2.870147792022792, "grad_norm": 0.6050539016723633, "learning_rate": 1.8466711375763278e-06, "loss": 0.4924, "step": 128950 }, { "epoch": 2.8703703703703702, "grad_norm": 0.5471101403236389, "learning_rate": 1.8403555614885604e-06, "loss": 0.4408, "step": 128960 }, { "epoch": 2.870592948717949, "grad_norm": 0.5318120121955872, "learning_rate": 1.8340507535580697e-06, "loss": 0.3713, "step": 128970 }, { "epoch": 2.870815527065527, "grad_norm": 0.29087573289871216, "learning_rate": 1.8277567141275153e-06, "loss": 0.3768, "step": 128980 }, { "epoch": 2.8710381054131053, "grad_norm": 0.6670485138893127, "learning_rate": 1.8214734435388681e-06, "loss": 0.6016, "step": 128990 }, { "epoch": 2.871260683760684, "grad_norm": 0.6318653225898743, "learning_rate": 1.8152009421336102e-06, "loss": 0.3722, "step": 129000 }, { "epoch": 2.871483262108262, "grad_norm": 0.506161630153656, "learning_rate": 1.8089392102525805e-06, "loss": 0.3665, "step": 129010 }, { "epoch": 2.8717058404558404, "grad_norm": 0.5993919968605042, "learning_rate": 1.8026882482360175e-06, "loss": 0.4653, "step": 129020 }, { "epoch": 2.8719284188034186, "grad_norm": 0.5019202828407288, "learning_rate": 1.7964480564236276e-06, "loss": 0.396, "step": 129030 }, { "epoch": 2.8721509971509973, "grad_norm": 0.36323311924934387, "learning_rate": 1.790218635154517e-06, "loss": 0.3592, "step": 129040 }, { "epoch": 2.8723735754985755, "grad_norm": 0.6010159850120544, "learning_rate": 1.7839999847671928e-06, "loss": 0.4068, "step": 129050 }, { "epoch": 2.8725961538461537, "grad_norm": 0.3936340808868408, "learning_rate": 1.7777921055995627e-06, "loss": 0.4292, "step": 129060 }, { "epoch": 2.872818732193732, "grad_norm": 0.5482344031333923, "learning_rate": 1.7715949979890012e-06, "loss": 0.4296, "step": 129070 }, { "epoch": 2.8730413105413106, "grad_norm": 0.4646384119987488, "learning_rate": 1.7654086622722166e-06, "loss": 0.4541, "step": 129080 }, { "epoch": 2.873263888888889, "grad_norm": 0.6732831001281738, "learning_rate": 1.759233098785429e-06, "loss": 0.4736, "step": 129090 }, { "epoch": 2.873486467236467, "grad_norm": 0.5085716843605042, "learning_rate": 1.7530683078641918e-06, "loss": 0.3397, "step": 129100 }, { "epoch": 2.8737090455840457, "grad_norm": 0.5749651789665222, "learning_rate": 1.7469142898435042e-06, "loss": 0.4264, "step": 129110 }, { "epoch": 2.873931623931624, "grad_norm": 0.6635963916778564, "learning_rate": 1.7407710450578096e-06, "loss": 0.3785, "step": 129120 }, { "epoch": 2.874154202279202, "grad_norm": 0.6351351141929626, "learning_rate": 1.7346385738409298e-06, "loss": 0.5607, "step": 129130 }, { "epoch": 2.874376780626781, "grad_norm": 0.42307665944099426, "learning_rate": 1.7285168765260874e-06, "loss": 0.5037, "step": 129140 }, { "epoch": 2.874599358974359, "grad_norm": 0.3375342786312103, "learning_rate": 1.7224059534459492e-06, "loss": 0.4869, "step": 129150 }, { "epoch": 2.8748219373219372, "grad_norm": 0.8892328143119812, "learning_rate": 1.7163058049325831e-06, "loss": 0.5797, "step": 129160 }, { "epoch": 2.875044515669516, "grad_norm": 0.5242050886154175, "learning_rate": 1.7102164313174795e-06, "loss": 0.4744, "step": 129170 }, { "epoch": 2.875267094017094, "grad_norm": 0.5739059448242188, "learning_rate": 1.7041378329315515e-06, "loss": 0.3742, "step": 129180 }, { "epoch": 2.8754896723646723, "grad_norm": 0.36049407720565796, "learning_rate": 1.6980700101051127e-06, "loss": 0.4889, "step": 129190 }, { "epoch": 2.8757122507122506, "grad_norm": 0.519873321056366, "learning_rate": 1.6920129631678772e-06, "loss": 0.4065, "step": 129200 }, { "epoch": 2.8759348290598292, "grad_norm": 0.3915044665336609, "learning_rate": 1.6859666924490036e-06, "loss": 0.4575, "step": 129210 }, { "epoch": 2.8761574074074074, "grad_norm": 0.6183444857597351, "learning_rate": 1.6799311982770517e-06, "loss": 0.4922, "step": 129220 }, { "epoch": 2.8763799857549857, "grad_norm": 0.4814912676811218, "learning_rate": 1.6739064809799809e-06, "loss": 0.2975, "step": 129230 }, { "epoch": 2.876602564102564, "grad_norm": 0.5634822249412537, "learning_rate": 1.6678925408851742e-06, "loss": 0.4692, "step": 129240 }, { "epoch": 2.8768251424501425, "grad_norm": 0.5901898741722107, "learning_rate": 1.661889378319481e-06, "loss": 0.4541, "step": 129250 }, { "epoch": 2.8770477207977208, "grad_norm": 0.5291321873664856, "learning_rate": 1.655896993609063e-06, "loss": 0.3769, "step": 129260 }, { "epoch": 2.877270299145299, "grad_norm": 0.5238466858863831, "learning_rate": 1.6499153870795924e-06, "loss": 0.4004, "step": 129270 }, { "epoch": 2.8774928774928776, "grad_norm": 0.5467638969421387, "learning_rate": 1.6439445590560986e-06, "loss": 0.4994, "step": 129280 }, { "epoch": 2.877715455840456, "grad_norm": 0.5712918043136597, "learning_rate": 1.637984509863033e-06, "loss": 0.4272, "step": 129290 }, { "epoch": 2.877938034188034, "grad_norm": 0.5815404057502747, "learning_rate": 1.6320352398242478e-06, "loss": 0.5296, "step": 129300 }, { "epoch": 2.8781606125356127, "grad_norm": 0.47931987047195435, "learning_rate": 1.6260967492630841e-06, "loss": 0.5084, "step": 129310 }, { "epoch": 2.878383190883191, "grad_norm": 0.46624454855918884, "learning_rate": 1.6201690385022171e-06, "loss": 0.5071, "step": 129320 }, { "epoch": 2.878605769230769, "grad_norm": 0.4582967460155487, "learning_rate": 1.6142521078637673e-06, "loss": 0.4559, "step": 129330 }, { "epoch": 2.878828347578348, "grad_norm": 0.45148831605911255, "learning_rate": 1.608345957669255e-06, "loss": 0.4768, "step": 129340 }, { "epoch": 2.879050925925926, "grad_norm": 0.4900384545326233, "learning_rate": 1.6024505882396678e-06, "loss": 0.4518, "step": 129350 }, { "epoch": 2.8792735042735043, "grad_norm": 0.5455189943313599, "learning_rate": 1.5965659998953052e-06, "loss": 0.4209, "step": 129360 }, { "epoch": 2.8794960826210825, "grad_norm": 0.4357905685901642, "learning_rate": 1.590692192955956e-06, "loss": 0.3717, "step": 129370 }, { "epoch": 2.8797186609686607, "grad_norm": 0.5603689551353455, "learning_rate": 1.5848291677408312e-06, "loss": 0.4896, "step": 129380 }, { "epoch": 2.8799412393162394, "grad_norm": 0.7707281708717346, "learning_rate": 1.5789769245685204e-06, "loss": 0.5265, "step": 129390 }, { "epoch": 2.8801638176638176, "grad_norm": 0.5575656294822693, "learning_rate": 1.5731354637570361e-06, "loss": 0.408, "step": 129400 }, { "epoch": 2.8803418803418803, "eval_loss": 0.5200754404067993, "eval_runtime": 337.416, "eval_samples_per_second": 7.009, "eval_steps_per_second": 7.009, "step": 129408 } ], "logging_steps": 10, "max_steps": 134784, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2696, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.400469039544369e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }