diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.135280553420446, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0030745580322828594, + "grad_norm": 7.8743672370910645, + "learning_rate": 4e-05, + "loss": 4.1619, + "step": 1 + }, + { + "epoch": 0.006149116064565719, + "grad_norm": 5.61863374710083, + "learning_rate": 8e-05, + "loss": 4.2003, + "step": 2 + }, + { + "epoch": 0.009223674096848577, + "grad_norm": 5.158657550811768, + "learning_rate": 0.00012, + "loss": 4.8946, + "step": 3 + }, + { + "epoch": 0.012298232129131437, + "grad_norm": 6.152472019195557, + "learning_rate": 0.00016, + "loss": 4.5709, + "step": 4 + }, + { + "epoch": 0.015372790161414296, + "grad_norm": 3.8159048557281494, + "learning_rate": 0.0002, + "loss": 3.525, + "step": 5 + }, + { + "epoch": 0.018447348193697154, + "grad_norm": 2.894798517227173, + "learning_rate": 0.00019993322203672788, + "loss": 2.8446, + "step": 6 + }, + { + "epoch": 0.021521906225980016, + "grad_norm": 2.2424588203430176, + "learning_rate": 0.00019986644407345576, + "loss": 2.3555, + "step": 7 + }, + { + "epoch": 0.024596464258262875, + "grad_norm": 2.5049643516540527, + "learning_rate": 0.00019979966611018366, + "loss": 2.1607, + "step": 8 + }, + { + "epoch": 0.027671022290545733, + "grad_norm": 2.0380496978759766, + "learning_rate": 0.00019973288814691153, + "loss": 2.1771, + "step": 9 + }, + { + "epoch": 0.030745580322828592, + "grad_norm": 2.0299413204193115, + "learning_rate": 0.0001996661101836394, + "loss": 1.5033, + "step": 10 + }, + { + "epoch": 0.033820138355111454, + "grad_norm": 2.054259777069092, + "learning_rate": 0.00019959933222036728, + "loss": 1.4109, + "step": 11 + }, + { + "epoch": 0.03689469638739431, + "grad_norm": 1.8728002309799194, + "learning_rate": 0.00019953255425709515, + "loss": 1.4278, + "step": 12 + }, + { + "epoch": 0.03996925441967717, + "grad_norm": 1.6537948846817017, + "learning_rate": 0.00019946577629382305, + "loss": 1.1342, + "step": 13 + }, + { + "epoch": 0.04304381245196003, + "grad_norm": 1.2836942672729492, + "learning_rate": 0.00019939899833055092, + "loss": 1.3855, + "step": 14 + }, + { + "epoch": 0.04611837048424289, + "grad_norm": 1.4211474657058716, + "learning_rate": 0.00019933222036727882, + "loss": 1.2734, + "step": 15 + }, + { + "epoch": 0.04919292851652575, + "grad_norm": 1.298756718635559, + "learning_rate": 0.0001992654424040067, + "loss": 1.0105, + "step": 16 + }, + { + "epoch": 0.05226748654880861, + "grad_norm": 1.2545682191848755, + "learning_rate": 0.00019919866444073457, + "loss": 0.8059, + "step": 17 + }, + { + "epoch": 0.05534204458109147, + "grad_norm": 1.1537564992904663, + "learning_rate": 0.00019913188647746244, + "loss": 1.0606, + "step": 18 + }, + { + "epoch": 0.05841660261337433, + "grad_norm": 1.1393249034881592, + "learning_rate": 0.00019906510851419034, + "loss": 0.8851, + "step": 19 + }, + { + "epoch": 0.061491160645657184, + "grad_norm": 1.2342195510864258, + "learning_rate": 0.00019899833055091822, + "loss": 0.782, + "step": 20 + }, + { + "epoch": 0.06456571867794005, + "grad_norm": 1.1908934116363525, + "learning_rate": 0.0001989315525876461, + "loss": 0.8126, + "step": 21 + }, + { + "epoch": 0.06764027671022291, + "grad_norm": 1.1466214656829834, + "learning_rate": 0.00019886477462437396, + "loss": 1.0469, + "step": 22 + }, + { + "epoch": 0.07071483474250577, + "grad_norm": 1.1647766828536987, + "learning_rate": 0.00019879799666110183, + "loss": 0.7116, + "step": 23 + }, + { + "epoch": 0.07378939277478862, + "grad_norm": 1.043047308921814, + "learning_rate": 0.00019873121869782974, + "loss": 0.7711, + "step": 24 + }, + { + "epoch": 0.07686395080707148, + "grad_norm": 1.2585339546203613, + "learning_rate": 0.0001986644407345576, + "loss": 0.8884, + "step": 25 + }, + { + "epoch": 0.07993850883935434, + "grad_norm": 1.3209491968154907, + "learning_rate": 0.00019859766277128548, + "loss": 0.899, + "step": 26 + }, + { + "epoch": 0.0830130668716372, + "grad_norm": 1.4513576030731201, + "learning_rate": 0.00019853088480801335, + "loss": 0.7261, + "step": 27 + }, + { + "epoch": 0.08608762490392007, + "grad_norm": 1.3187739849090576, + "learning_rate": 0.00019846410684474123, + "loss": 0.951, + "step": 28 + }, + { + "epoch": 0.08916218293620293, + "grad_norm": 1.2414864301681519, + "learning_rate": 0.00019839732888146913, + "loss": 0.8317, + "step": 29 + }, + { + "epoch": 0.09223674096848578, + "grad_norm": 1.1460022926330566, + "learning_rate": 0.000198330550918197, + "loss": 0.7259, + "step": 30 + }, + { + "epoch": 0.09531129900076864, + "grad_norm": 1.5636142492294312, + "learning_rate": 0.00019826377295492487, + "loss": 1.0959, + "step": 31 + }, + { + "epoch": 0.0983858570330515, + "grad_norm": 1.3006511926651, + "learning_rate": 0.00019819699499165277, + "loss": 0.8215, + "step": 32 + }, + { + "epoch": 0.10146041506533436, + "grad_norm": 1.0390676259994507, + "learning_rate": 0.00019813021702838065, + "loss": 0.6979, + "step": 33 + }, + { + "epoch": 0.10453497309761722, + "grad_norm": 1.1039427518844604, + "learning_rate": 0.00019806343906510852, + "loss": 0.8445, + "step": 34 + }, + { + "epoch": 0.10760953112990007, + "grad_norm": 1.3381197452545166, + "learning_rate": 0.00019799666110183642, + "loss": 0.862, + "step": 35 + }, + { + "epoch": 0.11068408916218293, + "grad_norm": 1.2397987842559814, + "learning_rate": 0.0001979298831385643, + "loss": 0.9947, + "step": 36 + }, + { + "epoch": 0.1137586471944658, + "grad_norm": 1.143774151802063, + "learning_rate": 0.00019786310517529217, + "loss": 0.7655, + "step": 37 + }, + { + "epoch": 0.11683320522674866, + "grad_norm": 0.9365643858909607, + "learning_rate": 0.00019779632721202004, + "loss": 0.669, + "step": 38 + }, + { + "epoch": 0.11990776325903152, + "grad_norm": 0.9902568459510803, + "learning_rate": 0.00019772954924874791, + "loss": 0.828, + "step": 39 + }, + { + "epoch": 0.12298232129131437, + "grad_norm": 1.1478556394577026, + "learning_rate": 0.00019766277128547581, + "loss": 0.8117, + "step": 40 + }, + { + "epoch": 0.12605687932359724, + "grad_norm": 1.234010100364685, + "learning_rate": 0.0001975959933222037, + "loss": 0.6825, + "step": 41 + }, + { + "epoch": 0.1291314373558801, + "grad_norm": 0.9467914700508118, + "learning_rate": 0.00019752921535893156, + "loss": 0.7473, + "step": 42 + }, + { + "epoch": 0.13220599538816294, + "grad_norm": 0.8851337432861328, + "learning_rate": 0.00019746243739565943, + "loss": 0.6269, + "step": 43 + }, + { + "epoch": 0.13528055342044581, + "grad_norm": 0.9889845848083496, + "learning_rate": 0.0001973956594323873, + "loss": 0.8628, + "step": 44 + }, + { + "epoch": 0.13835511145272866, + "grad_norm": 0.838996946811676, + "learning_rate": 0.0001973288814691152, + "loss": 0.5659, + "step": 45 + }, + { + "epoch": 0.14142966948501154, + "grad_norm": 0.9662989974021912, + "learning_rate": 0.00019726210350584308, + "loss": 0.7361, + "step": 46 + }, + { + "epoch": 0.1445042275172944, + "grad_norm": 0.9126756191253662, + "learning_rate": 0.00019719532554257095, + "loss": 0.6841, + "step": 47 + }, + { + "epoch": 0.14757878554957723, + "grad_norm": 1.0940887928009033, + "learning_rate": 0.00019712854757929883, + "loss": 0.7206, + "step": 48 + }, + { + "epoch": 0.1506533435818601, + "grad_norm": 0.9076804518699646, + "learning_rate": 0.00019706176961602673, + "loss": 0.6463, + "step": 49 + }, + { + "epoch": 0.15372790161414296, + "grad_norm": 1.1357953548431396, + "learning_rate": 0.0001969949916527546, + "loss": 0.7941, + "step": 50 + }, + { + "epoch": 0.15680245964642583, + "grad_norm": 0.9527478814125061, + "learning_rate": 0.0001969282136894825, + "loss": 0.5493, + "step": 51 + }, + { + "epoch": 0.15987701767870868, + "grad_norm": 1.0596438646316528, + "learning_rate": 0.00019686143572621037, + "loss": 0.8444, + "step": 52 + }, + { + "epoch": 0.16295157571099156, + "grad_norm": 0.8877243995666504, + "learning_rate": 0.00019679465776293825, + "loss": 0.683, + "step": 53 + }, + { + "epoch": 0.1660261337432744, + "grad_norm": 0.959449052810669, + "learning_rate": 0.00019672787979966612, + "loss": 0.6365, + "step": 54 + }, + { + "epoch": 0.16910069177555725, + "grad_norm": 1.0784345865249634, + "learning_rate": 0.000196661101836394, + "loss": 0.9189, + "step": 55 + }, + { + "epoch": 0.17217524980784013, + "grad_norm": 0.7773799896240234, + "learning_rate": 0.0001965943238731219, + "loss": 0.5239, + "step": 56 + }, + { + "epoch": 0.17524980784012298, + "grad_norm": 0.8316354751586914, + "learning_rate": 0.00019652754590984977, + "loss": 0.5098, + "step": 57 + }, + { + "epoch": 0.17832436587240585, + "grad_norm": 0.9249610304832458, + "learning_rate": 0.00019646076794657764, + "loss": 0.7623, + "step": 58 + }, + { + "epoch": 0.1813989239046887, + "grad_norm": 0.9492266178131104, + "learning_rate": 0.0001963939899833055, + "loss": 0.7771, + "step": 59 + }, + { + "epoch": 0.18447348193697155, + "grad_norm": 0.9219992160797119, + "learning_rate": 0.00019632721202003339, + "loss": 0.7204, + "step": 60 + }, + { + "epoch": 0.18754803996925443, + "grad_norm": 1.1604337692260742, + "learning_rate": 0.00019626043405676129, + "loss": 1.3049, + "step": 61 + }, + { + "epoch": 0.19062259800153727, + "grad_norm": 0.8619215488433838, + "learning_rate": 0.00019619365609348916, + "loss": 0.7158, + "step": 62 + }, + { + "epoch": 0.19369715603382015, + "grad_norm": 0.9047840237617493, + "learning_rate": 0.00019612687813021703, + "loss": 0.7197, + "step": 63 + }, + { + "epoch": 0.196771714066103, + "grad_norm": 0.9470083713531494, + "learning_rate": 0.0001960601001669449, + "loss": 0.7085, + "step": 64 + }, + { + "epoch": 0.19984627209838585, + "grad_norm": 0.9106523394584656, + "learning_rate": 0.00019599332220367278, + "loss": 0.7993, + "step": 65 + }, + { + "epoch": 0.20292083013066872, + "grad_norm": 0.7691475749015808, + "learning_rate": 0.00019592654424040068, + "loss": 0.6885, + "step": 66 + }, + { + "epoch": 0.20599538816295157, + "grad_norm": 1.3003575801849365, + "learning_rate": 0.00019585976627712855, + "loss": 0.8159, + "step": 67 + }, + { + "epoch": 0.20906994619523445, + "grad_norm": 0.7156600952148438, + "learning_rate": 0.00019579298831385645, + "loss": 0.5073, + "step": 68 + }, + { + "epoch": 0.2121445042275173, + "grad_norm": 0.8237838745117188, + "learning_rate": 0.00019572621035058433, + "loss": 0.7599, + "step": 69 + }, + { + "epoch": 0.21521906225980014, + "grad_norm": 0.961710512638092, + "learning_rate": 0.0001956594323873122, + "loss": 0.7606, + "step": 70 + }, + { + "epoch": 0.21829362029208302, + "grad_norm": 1.1123751401901245, + "learning_rate": 0.00019559265442404007, + "loss": 0.6975, + "step": 71 + }, + { + "epoch": 0.22136817832436587, + "grad_norm": 0.8143901824951172, + "learning_rate": 0.00019552587646076797, + "loss": 0.6289, + "step": 72 + }, + { + "epoch": 0.22444273635664874, + "grad_norm": 0.845358669757843, + "learning_rate": 0.00019545909849749584, + "loss": 0.6792, + "step": 73 + }, + { + "epoch": 0.2275172943889316, + "grad_norm": 0.6951713562011719, + "learning_rate": 0.00019539232053422372, + "loss": 0.5856, + "step": 74 + }, + { + "epoch": 0.23059185242121444, + "grad_norm": 0.7871154546737671, + "learning_rate": 0.0001953255425709516, + "loss": 0.5165, + "step": 75 + }, + { + "epoch": 0.23366641045349731, + "grad_norm": 0.8228313326835632, + "learning_rate": 0.00019525876460767946, + "loss": 0.5862, + "step": 76 + }, + { + "epoch": 0.23674096848578016, + "grad_norm": 0.8904668688774109, + "learning_rate": 0.00019519198664440736, + "loss": 0.7879, + "step": 77 + }, + { + "epoch": 0.23981552651806304, + "grad_norm": 1.0688227415084839, + "learning_rate": 0.00019512520868113524, + "loss": 0.8699, + "step": 78 + }, + { + "epoch": 0.24289008455034589, + "grad_norm": 0.9055834412574768, + "learning_rate": 0.0001950584307178631, + "loss": 0.63, + "step": 79 + }, + { + "epoch": 0.24596464258262873, + "grad_norm": 0.8711212873458862, + "learning_rate": 0.00019499165275459098, + "loss": 0.7236, + "step": 80 + }, + { + "epoch": 0.2490392006149116, + "grad_norm": 0.8715277910232544, + "learning_rate": 0.00019492487479131886, + "loss": 0.5725, + "step": 81 + }, + { + "epoch": 0.2521137586471945, + "grad_norm": 0.7342225313186646, + "learning_rate": 0.00019485809682804673, + "loss": 0.6088, + "step": 82 + }, + { + "epoch": 0.25518831667947733, + "grad_norm": 1.0883733034133911, + "learning_rate": 0.00019479131886477463, + "loss": 0.9584, + "step": 83 + }, + { + "epoch": 0.2582628747117602, + "grad_norm": 1.0370501279830933, + "learning_rate": 0.0001947245409015025, + "loss": 0.9084, + "step": 84 + }, + { + "epoch": 0.26133743274404303, + "grad_norm": 0.7242286205291748, + "learning_rate": 0.0001946577629382304, + "loss": 0.5581, + "step": 85 + }, + { + "epoch": 0.2644119907763259, + "grad_norm": 1.0035842657089233, + "learning_rate": 0.00019459098497495828, + "loss": 0.6489, + "step": 86 + }, + { + "epoch": 0.2674865488086088, + "grad_norm": 1.13217294216156, + "learning_rate": 0.00019452420701168615, + "loss": 0.8034, + "step": 87 + }, + { + "epoch": 0.27056110684089163, + "grad_norm": 1.004482388496399, + "learning_rate": 0.00019445742904841405, + "loss": 0.7356, + "step": 88 + }, + { + "epoch": 0.2736356648731745, + "grad_norm": 0.8573530316352844, + "learning_rate": 0.00019439065108514192, + "loss": 0.753, + "step": 89 + }, + { + "epoch": 0.2767102229054573, + "grad_norm": 0.9892554879188538, + "learning_rate": 0.0001943238731218698, + "loss": 0.6642, + "step": 90 + }, + { + "epoch": 0.2797847809377402, + "grad_norm": 0.7686218619346619, + "learning_rate": 0.00019425709515859767, + "loss": 0.7824, + "step": 91 + }, + { + "epoch": 0.2828593389700231, + "grad_norm": 0.7348684668540955, + "learning_rate": 0.00019419031719532554, + "loss": 0.7147, + "step": 92 + }, + { + "epoch": 0.2859338970023059, + "grad_norm": 0.6922216415405273, + "learning_rate": 0.00019412353923205344, + "loss": 0.4838, + "step": 93 + }, + { + "epoch": 0.2890084550345888, + "grad_norm": 0.8074454665184021, + "learning_rate": 0.00019405676126878132, + "loss": 0.78, + "step": 94 + }, + { + "epoch": 0.2920830130668716, + "grad_norm": 1.0575733184814453, + "learning_rate": 0.0001939899833055092, + "loss": 0.9982, + "step": 95 + }, + { + "epoch": 0.29515757109915447, + "grad_norm": 0.8220807313919067, + "learning_rate": 0.00019392320534223706, + "loss": 0.6747, + "step": 96 + }, + { + "epoch": 0.2982321291314374, + "grad_norm": 0.9394708275794983, + "learning_rate": 0.00019385642737896494, + "loss": 0.8472, + "step": 97 + }, + { + "epoch": 0.3013066871637202, + "grad_norm": 0.8578686118125916, + "learning_rate": 0.0001937896494156928, + "loss": 0.7043, + "step": 98 + }, + { + "epoch": 0.30438124519600307, + "grad_norm": 0.8185007572174072, + "learning_rate": 0.0001937228714524207, + "loss": 0.6039, + "step": 99 + }, + { + "epoch": 0.3074558032282859, + "grad_norm": 0.829156219959259, + "learning_rate": 0.00019365609348914858, + "loss": 0.6431, + "step": 100 + }, + { + "epoch": 0.31053036126056877, + "grad_norm": 0.7808781266212463, + "learning_rate": 0.00019358931552587646, + "loss": 0.6859, + "step": 101 + }, + { + "epoch": 0.31360491929285167, + "grad_norm": 0.9246749877929688, + "learning_rate": 0.00019352253756260436, + "loss": 0.7724, + "step": 102 + }, + { + "epoch": 0.3166794773251345, + "grad_norm": 0.8568185567855835, + "learning_rate": 0.00019345575959933223, + "loss": 0.6667, + "step": 103 + }, + { + "epoch": 0.31975403535741737, + "grad_norm": 0.862723171710968, + "learning_rate": 0.00019338898163606013, + "loss": 0.8031, + "step": 104 + }, + { + "epoch": 0.3228285933897002, + "grad_norm": 0.7219960689544678, + "learning_rate": 0.000193322203672788, + "loss": 0.6426, + "step": 105 + }, + { + "epoch": 0.3259031514219831, + "grad_norm": 0.7314926385879517, + "learning_rate": 0.00019325542570951588, + "loss": 0.6216, + "step": 106 + }, + { + "epoch": 0.32897770945426597, + "grad_norm": 0.8021729588508606, + "learning_rate": 0.00019318864774624375, + "loss": 0.794, + "step": 107 + }, + { + "epoch": 0.3320522674865488, + "grad_norm": 0.8143153786659241, + "learning_rate": 0.00019312186978297162, + "loss": 0.619, + "step": 108 + }, + { + "epoch": 0.33512682551883166, + "grad_norm": 0.7071205377578735, + "learning_rate": 0.00019305509181969952, + "loss": 0.607, + "step": 109 + }, + { + "epoch": 0.3382013835511145, + "grad_norm": 0.6996274590492249, + "learning_rate": 0.0001929883138564274, + "loss": 0.7112, + "step": 110 + }, + { + "epoch": 0.3412759415833974, + "grad_norm": 0.794760525226593, + "learning_rate": 0.00019292153589315527, + "loss": 0.6147, + "step": 111 + }, + { + "epoch": 0.34435049961568026, + "grad_norm": 0.7364431619644165, + "learning_rate": 0.00019285475792988314, + "loss": 0.5951, + "step": 112 + }, + { + "epoch": 0.3474250576479631, + "grad_norm": 1.001115083694458, + "learning_rate": 0.00019278797996661101, + "loss": 0.7773, + "step": 113 + }, + { + "epoch": 0.35049961568024596, + "grad_norm": 0.69508296251297, + "learning_rate": 0.0001927212020033389, + "loss": 0.7077, + "step": 114 + }, + { + "epoch": 0.3535741737125288, + "grad_norm": 0.7061307430267334, + "learning_rate": 0.0001926544240400668, + "loss": 0.5519, + "step": 115 + }, + { + "epoch": 0.3566487317448117, + "grad_norm": 0.7255828976631165, + "learning_rate": 0.00019258764607679466, + "loss": 0.6196, + "step": 116 + }, + { + "epoch": 0.35972328977709456, + "grad_norm": 0.8059960007667542, + "learning_rate": 0.00019252086811352253, + "loss": 0.6625, + "step": 117 + }, + { + "epoch": 0.3627978478093774, + "grad_norm": 0.7943634986877441, + "learning_rate": 0.0001924540901502504, + "loss": 0.821, + "step": 118 + }, + { + "epoch": 0.36587240584166025, + "grad_norm": 0.8535416126251221, + "learning_rate": 0.0001923873121869783, + "loss": 0.779, + "step": 119 + }, + { + "epoch": 0.3689469638739431, + "grad_norm": 0.745639979839325, + "learning_rate": 0.00019232053422370618, + "loss": 0.6222, + "step": 120 + }, + { + "epoch": 0.372021521906226, + "grad_norm": 0.8718635439872742, + "learning_rate": 0.00019225375626043408, + "loss": 0.6487, + "step": 121 + }, + { + "epoch": 0.37509607993850885, + "grad_norm": 0.6557499170303345, + "learning_rate": 0.00019218697829716195, + "loss": 0.4837, + "step": 122 + }, + { + "epoch": 0.3781706379707917, + "grad_norm": 0.7555654644966125, + "learning_rate": 0.00019212020033388983, + "loss": 0.7067, + "step": 123 + }, + { + "epoch": 0.38124519600307455, + "grad_norm": 0.8583431839942932, + "learning_rate": 0.0001920534223706177, + "loss": 0.7727, + "step": 124 + }, + { + "epoch": 0.3843197540353574, + "grad_norm": 0.8364957571029663, + "learning_rate": 0.0001919866444073456, + "loss": 0.6632, + "step": 125 + }, + { + "epoch": 0.3873943120676403, + "grad_norm": 0.8850215077400208, + "learning_rate": 0.00019191986644407347, + "loss": 0.6054, + "step": 126 + }, + { + "epoch": 0.39046887009992315, + "grad_norm": 0.765125036239624, + "learning_rate": 0.00019185308848080135, + "loss": 0.514, + "step": 127 + }, + { + "epoch": 0.393543428132206, + "grad_norm": 0.9039108753204346, + "learning_rate": 0.00019178631051752922, + "loss": 0.7446, + "step": 128 + }, + { + "epoch": 0.39661798616448884, + "grad_norm": 0.80910724401474, + "learning_rate": 0.0001917195325542571, + "loss": 0.7129, + "step": 129 + }, + { + "epoch": 0.3996925441967717, + "grad_norm": 0.7383053302764893, + "learning_rate": 0.00019165275459098497, + "loss": 0.6525, + "step": 130 + }, + { + "epoch": 0.4027671022290546, + "grad_norm": 0.67941814661026, + "learning_rate": 0.00019158597662771287, + "loss": 0.4873, + "step": 131 + }, + { + "epoch": 0.40584166026133744, + "grad_norm": 0.5803771018981934, + "learning_rate": 0.00019151919866444074, + "loss": 0.5808, + "step": 132 + }, + { + "epoch": 0.4089162182936203, + "grad_norm": 0.7955583930015564, + "learning_rate": 0.0001914524207011686, + "loss": 0.568, + "step": 133 + }, + { + "epoch": 0.41199077632590314, + "grad_norm": 0.768507182598114, + "learning_rate": 0.0001913856427378965, + "loss": 0.7452, + "step": 134 + }, + { + "epoch": 0.415065334358186, + "grad_norm": 0.7801786065101624, + "learning_rate": 0.0001913188647746244, + "loss": 0.6177, + "step": 135 + }, + { + "epoch": 0.4181398923904689, + "grad_norm": 0.7118993401527405, + "learning_rate": 0.00019125208681135226, + "loss": 0.6292, + "step": 136 + }, + { + "epoch": 0.42121445042275174, + "grad_norm": 0.764198362827301, + "learning_rate": 0.00019118530884808016, + "loss": 0.6508, + "step": 137 + }, + { + "epoch": 0.4242890084550346, + "grad_norm": 0.8192620873451233, + "learning_rate": 0.00019111853088480803, + "loss": 0.7133, + "step": 138 + }, + { + "epoch": 0.42736356648731744, + "grad_norm": 0.8090092539787292, + "learning_rate": 0.0001910517529215359, + "loss": 0.6533, + "step": 139 + }, + { + "epoch": 0.4304381245196003, + "grad_norm": 0.6094421148300171, + "learning_rate": 0.00019098497495826378, + "loss": 0.6756, + "step": 140 + }, + { + "epoch": 0.4335126825518832, + "grad_norm": 0.673056423664093, + "learning_rate": 0.00019091819699499168, + "loss": 0.6727, + "step": 141 + }, + { + "epoch": 0.43658724058416604, + "grad_norm": 0.6354759335517883, + "learning_rate": 0.00019085141903171955, + "loss": 0.6474, + "step": 142 + }, + { + "epoch": 0.4396617986164489, + "grad_norm": 0.9268069863319397, + "learning_rate": 0.00019078464106844743, + "loss": 1.086, + "step": 143 + }, + { + "epoch": 0.44273635664873173, + "grad_norm": 0.649411141872406, + "learning_rate": 0.0001907178631051753, + "loss": 0.6204, + "step": 144 + }, + { + "epoch": 0.4458109146810146, + "grad_norm": 0.7348757982254028, + "learning_rate": 0.00019065108514190317, + "loss": 0.7464, + "step": 145 + }, + { + "epoch": 0.4488854727132975, + "grad_norm": 0.6263845562934875, + "learning_rate": 0.00019058430717863107, + "loss": 0.7217, + "step": 146 + }, + { + "epoch": 0.45196003074558033, + "grad_norm": 0.7039823532104492, + "learning_rate": 0.00019051752921535895, + "loss": 0.792, + "step": 147 + }, + { + "epoch": 0.4550345887778632, + "grad_norm": 0.6015087366104126, + "learning_rate": 0.00019045075125208682, + "loss": 0.6796, + "step": 148 + }, + { + "epoch": 0.45810914681014603, + "grad_norm": 0.6295155882835388, + "learning_rate": 0.0001903839732888147, + "loss": 0.5762, + "step": 149 + }, + { + "epoch": 0.4611837048424289, + "grad_norm": 0.6592227816581726, + "learning_rate": 0.00019031719532554257, + "loss": 0.662, + "step": 150 + }, + { + "epoch": 0.4642582628747118, + "grad_norm": 0.7038462162017822, + "learning_rate": 0.00019025041736227044, + "loss": 0.5505, + "step": 151 + }, + { + "epoch": 0.46733282090699463, + "grad_norm": 0.7902334332466125, + "learning_rate": 0.00019018363939899834, + "loss": 0.7457, + "step": 152 + }, + { + "epoch": 0.4704073789392775, + "grad_norm": 0.673903226852417, + "learning_rate": 0.0001901168614357262, + "loss": 0.6595, + "step": 153 + }, + { + "epoch": 0.4734819369715603, + "grad_norm": 0.7488313913345337, + "learning_rate": 0.0001900500834724541, + "loss": 0.8645, + "step": 154 + }, + { + "epoch": 0.4765564950038432, + "grad_norm": 0.9577059149742126, + "learning_rate": 0.00018998330550918199, + "loss": 0.9233, + "step": 155 + }, + { + "epoch": 0.4796310530361261, + "grad_norm": 0.6935007572174072, + "learning_rate": 0.00018991652754590986, + "loss": 0.654, + "step": 156 + }, + { + "epoch": 0.4827056110684089, + "grad_norm": 0.7638063430786133, + "learning_rate": 0.00018984974958263776, + "loss": 0.7632, + "step": 157 + }, + { + "epoch": 0.48578016910069177, + "grad_norm": 0.6244141459465027, + "learning_rate": 0.00018978297161936563, + "loss": 0.476, + "step": 158 + }, + { + "epoch": 0.4888547271329746, + "grad_norm": 0.9423524141311646, + "learning_rate": 0.0001897161936560935, + "loss": 0.7204, + "step": 159 + }, + { + "epoch": 0.49192928516525747, + "grad_norm": 0.8479251861572266, + "learning_rate": 0.00018964941569282138, + "loss": 0.7546, + "step": 160 + }, + { + "epoch": 0.49500384319754037, + "grad_norm": 0.7143809795379639, + "learning_rate": 0.00018958263772954925, + "loss": 0.5741, + "step": 161 + }, + { + "epoch": 0.4980784012298232, + "grad_norm": 0.7384529709815979, + "learning_rate": 0.00018951585976627715, + "loss": 0.5389, + "step": 162 + }, + { + "epoch": 0.5011529592621061, + "grad_norm": 0.8297166228294373, + "learning_rate": 0.00018944908180300502, + "loss": 0.7686, + "step": 163 + }, + { + "epoch": 0.504227517294389, + "grad_norm": 0.9101626873016357, + "learning_rate": 0.0001893823038397329, + "loss": 0.709, + "step": 164 + }, + { + "epoch": 0.5073020753266718, + "grad_norm": 0.8472141027450562, + "learning_rate": 0.00018931552587646077, + "loss": 0.7436, + "step": 165 + }, + { + "epoch": 0.5103766333589547, + "grad_norm": 0.7950085401535034, + "learning_rate": 0.00018924874791318864, + "loss": 0.6462, + "step": 166 + }, + { + "epoch": 0.5134511913912375, + "grad_norm": 0.8818950057029724, + "learning_rate": 0.00018918196994991652, + "loss": 0.799, + "step": 167 + }, + { + "epoch": 0.5165257494235204, + "grad_norm": 0.816806435585022, + "learning_rate": 0.00018911519198664442, + "loss": 0.5552, + "step": 168 + }, + { + "epoch": 0.5196003074558032, + "grad_norm": 0.6618863940238953, + "learning_rate": 0.0001890484140233723, + "loss": 0.4716, + "step": 169 + }, + { + "epoch": 0.5226748654880861, + "grad_norm": 0.6072298288345337, + "learning_rate": 0.00018898163606010016, + "loss": 0.5535, + "step": 170 + }, + { + "epoch": 0.5257494235203689, + "grad_norm": 0.7458838820457458, + "learning_rate": 0.00018891485809682806, + "loss": 0.8668, + "step": 171 + }, + { + "epoch": 0.5288239815526518, + "grad_norm": 0.6389868855476379, + "learning_rate": 0.00018884808013355594, + "loss": 0.5767, + "step": 172 + }, + { + "epoch": 0.5318985395849347, + "grad_norm": 0.6578021049499512, + "learning_rate": 0.00018878130217028384, + "loss": 0.7959, + "step": 173 + }, + { + "epoch": 0.5349730976172176, + "grad_norm": 1.0363503694534302, + "learning_rate": 0.0001887145242070117, + "loss": 0.6947, + "step": 174 + }, + { + "epoch": 0.5380476556495004, + "grad_norm": 0.7046053409576416, + "learning_rate": 0.00018864774624373958, + "loss": 0.6739, + "step": 175 + }, + { + "epoch": 0.5411222136817833, + "grad_norm": 0.8335860967636108, + "learning_rate": 0.00018858096828046746, + "loss": 0.7296, + "step": 176 + }, + { + "epoch": 0.5441967717140661, + "grad_norm": 0.6775506734848022, + "learning_rate": 0.00018851419031719533, + "loss": 0.5817, + "step": 177 + }, + { + "epoch": 0.547271329746349, + "grad_norm": 0.7883867621421814, + "learning_rate": 0.00018844741235392323, + "loss": 0.7067, + "step": 178 + }, + { + "epoch": 0.5503458877786318, + "grad_norm": 0.7405235767364502, + "learning_rate": 0.0001883806343906511, + "loss": 0.8347, + "step": 179 + }, + { + "epoch": 0.5534204458109147, + "grad_norm": 0.7003398537635803, + "learning_rate": 0.00018831385642737898, + "loss": 0.6322, + "step": 180 + }, + { + "epoch": 0.5564950038431975, + "grad_norm": 0.7515572309494019, + "learning_rate": 0.00018824707846410685, + "loss": 0.6944, + "step": 181 + }, + { + "epoch": 0.5595695618754803, + "grad_norm": 0.6841930150985718, + "learning_rate": 0.00018818030050083472, + "loss": 0.5833, + "step": 182 + }, + { + "epoch": 0.5626441199077633, + "grad_norm": 0.6888793706893921, + "learning_rate": 0.0001881135225375626, + "loss": 0.6509, + "step": 183 + }, + { + "epoch": 0.5657186779400462, + "grad_norm": 0.6468893885612488, + "learning_rate": 0.0001880467445742905, + "loss": 0.5695, + "step": 184 + }, + { + "epoch": 0.568793235972329, + "grad_norm": 0.7017901539802551, + "learning_rate": 0.00018797996661101837, + "loss": 0.5681, + "step": 185 + }, + { + "epoch": 0.5718677940046119, + "grad_norm": 0.7171371579170227, + "learning_rate": 0.00018791318864774624, + "loss": 0.6043, + "step": 186 + }, + { + "epoch": 0.5749423520368947, + "grad_norm": 0.77923583984375, + "learning_rate": 0.00018784641068447412, + "loss": 0.6694, + "step": 187 + }, + { + "epoch": 0.5780169100691775, + "grad_norm": 0.7366213202476501, + "learning_rate": 0.00018777963272120202, + "loss": 0.6366, + "step": 188 + }, + { + "epoch": 0.5810914681014604, + "grad_norm": 0.6756160259246826, + "learning_rate": 0.0001877128547579299, + "loss": 0.6678, + "step": 189 + }, + { + "epoch": 0.5841660261337432, + "grad_norm": 0.6736173629760742, + "learning_rate": 0.0001876460767946578, + "loss": 0.6418, + "step": 190 + }, + { + "epoch": 0.5872405841660261, + "grad_norm": 0.7356569170951843, + "learning_rate": 0.00018757929883138566, + "loss": 0.6235, + "step": 191 + }, + { + "epoch": 0.5903151421983089, + "grad_norm": 0.8169667720794678, + "learning_rate": 0.00018751252086811354, + "loss": 0.768, + "step": 192 + }, + { + "epoch": 0.5933897002305919, + "grad_norm": 1.0212959051132202, + "learning_rate": 0.0001874457429048414, + "loss": 0.7575, + "step": 193 + }, + { + "epoch": 0.5964642582628747, + "grad_norm": 0.7294356822967529, + "learning_rate": 0.0001873789649415693, + "loss": 0.5606, + "step": 194 + }, + { + "epoch": 0.5995388162951576, + "grad_norm": 0.8933930397033691, + "learning_rate": 0.00018731218697829718, + "loss": 0.7284, + "step": 195 + }, + { + "epoch": 0.6026133743274404, + "grad_norm": 0.640938937664032, + "learning_rate": 0.00018724540901502506, + "loss": 0.4718, + "step": 196 + }, + { + "epoch": 0.6056879323597233, + "grad_norm": 1.032175064086914, + "learning_rate": 0.00018717863105175293, + "loss": 0.7647, + "step": 197 + }, + { + "epoch": 0.6087624903920061, + "grad_norm": 0.7845223546028137, + "learning_rate": 0.0001871118530884808, + "loss": 0.657, + "step": 198 + }, + { + "epoch": 0.611837048424289, + "grad_norm": 0.7698432803153992, + "learning_rate": 0.00018704507512520868, + "loss": 0.6231, + "step": 199 + }, + { + "epoch": 0.6149116064565718, + "grad_norm": 0.8457287549972534, + "learning_rate": 0.00018697829716193658, + "loss": 0.5908, + "step": 200 + }, + { + "epoch": 0.6179861644888547, + "grad_norm": 0.9682031869888306, + "learning_rate": 0.00018691151919866445, + "loss": 0.7658, + "step": 201 + }, + { + "epoch": 0.6210607225211375, + "grad_norm": 0.7560285925865173, + "learning_rate": 0.00018684474123539232, + "loss": 0.5672, + "step": 202 + }, + { + "epoch": 0.6241352805534205, + "grad_norm": 0.749602198600769, + "learning_rate": 0.0001867779632721202, + "loss": 0.5424, + "step": 203 + }, + { + "epoch": 0.6272098385857033, + "grad_norm": 0.6830094456672668, + "learning_rate": 0.00018671118530884807, + "loss": 0.6763, + "step": 204 + }, + { + "epoch": 0.6302843966179862, + "grad_norm": 0.696247935295105, + "learning_rate": 0.00018664440734557597, + "loss": 0.5836, + "step": 205 + }, + { + "epoch": 0.633358954650269, + "grad_norm": 0.7082201242446899, + "learning_rate": 0.00018657762938230384, + "loss": 0.6022, + "step": 206 + }, + { + "epoch": 0.6364335126825519, + "grad_norm": 0.7224099636077881, + "learning_rate": 0.00018651085141903174, + "loss": 0.7518, + "step": 207 + }, + { + "epoch": 0.6395080707148347, + "grad_norm": 0.6942217946052551, + "learning_rate": 0.00018644407345575962, + "loss": 0.7052, + "step": 208 + }, + { + "epoch": 0.6425826287471176, + "grad_norm": 0.6529689431190491, + "learning_rate": 0.0001863772954924875, + "loss": 0.5383, + "step": 209 + }, + { + "epoch": 0.6456571867794004, + "grad_norm": 0.6160123944282532, + "learning_rate": 0.0001863105175292154, + "loss": 0.5955, + "step": 210 + }, + { + "epoch": 0.6487317448116833, + "grad_norm": 0.6024816036224365, + "learning_rate": 0.00018624373956594326, + "loss": 0.6733, + "step": 211 + }, + { + "epoch": 0.6518063028439662, + "grad_norm": 0.5778756737709045, + "learning_rate": 0.00018617696160267113, + "loss": 0.5349, + "step": 212 + }, + { + "epoch": 0.6548808608762491, + "grad_norm": 0.7351570725440979, + "learning_rate": 0.000186110183639399, + "loss": 0.795, + "step": 213 + }, + { + "epoch": 0.6579554189085319, + "grad_norm": 0.8623232245445251, + "learning_rate": 0.00018604340567612688, + "loss": 0.7451, + "step": 214 + }, + { + "epoch": 0.6610299769408148, + "grad_norm": 0.7850607633590698, + "learning_rate": 0.00018597662771285475, + "loss": 0.6888, + "step": 215 + }, + { + "epoch": 0.6641045349730976, + "grad_norm": 0.687150239944458, + "learning_rate": 0.00018590984974958265, + "loss": 0.5033, + "step": 216 + }, + { + "epoch": 0.6671790930053805, + "grad_norm": 0.532691478729248, + "learning_rate": 0.00018584307178631053, + "loss": 0.4734, + "step": 217 + }, + { + "epoch": 0.6702536510376633, + "grad_norm": 0.7870986461639404, + "learning_rate": 0.0001857762938230384, + "loss": 0.7304, + "step": 218 + }, + { + "epoch": 0.6733282090699462, + "grad_norm": 0.7504063248634338, + "learning_rate": 0.00018570951585976627, + "loss": 0.7476, + "step": 219 + }, + { + "epoch": 0.676402767102229, + "grad_norm": 0.7235811948776245, + "learning_rate": 0.00018564273789649415, + "loss": 0.5458, + "step": 220 + }, + { + "epoch": 0.6794773251345119, + "grad_norm": 0.8325716853141785, + "learning_rate": 0.00018557595993322205, + "loss": 0.6991, + "step": 221 + }, + { + "epoch": 0.6825518831667948, + "grad_norm": 0.7696716785430908, + "learning_rate": 0.00018550918196994992, + "loss": 0.6374, + "step": 222 + }, + { + "epoch": 0.6856264411990777, + "grad_norm": 0.9007569551467896, + "learning_rate": 0.0001854424040066778, + "loss": 0.7021, + "step": 223 + }, + { + "epoch": 0.6887009992313605, + "grad_norm": 0.8389153480529785, + "learning_rate": 0.0001853756260434057, + "loss": 0.7095, + "step": 224 + }, + { + "epoch": 0.6917755572636434, + "grad_norm": 0.8680058121681213, + "learning_rate": 0.00018530884808013357, + "loss": 0.6494, + "step": 225 + }, + { + "epoch": 0.6948501152959262, + "grad_norm": 0.5919209718704224, + "learning_rate": 0.00018524207011686147, + "loss": 0.4893, + "step": 226 + }, + { + "epoch": 0.6979246733282091, + "grad_norm": 0.6116464138031006, + "learning_rate": 0.00018517529215358934, + "loss": 0.6041, + "step": 227 + }, + { + "epoch": 0.7009992313604919, + "grad_norm": 0.6643829941749573, + "learning_rate": 0.00018510851419031721, + "loss": 0.6262, + "step": 228 + }, + { + "epoch": 0.7040737893927748, + "grad_norm": 0.8140367269515991, + "learning_rate": 0.0001850417362270451, + "loss": 0.6593, + "step": 229 + }, + { + "epoch": 0.7071483474250576, + "grad_norm": 0.7204163670539856, + "learning_rate": 0.00018497495826377296, + "loss": 0.6194, + "step": 230 + }, + { + "epoch": 0.7102229054573405, + "grad_norm": 0.6929581165313721, + "learning_rate": 0.00018490818030050083, + "loss": 0.638, + "step": 231 + }, + { + "epoch": 0.7132974634896234, + "grad_norm": 0.8570030331611633, + "learning_rate": 0.00018484140233722873, + "loss": 0.7651, + "step": 232 + }, + { + "epoch": 0.7163720215219063, + "grad_norm": 0.8367635011672974, + "learning_rate": 0.0001847746243739566, + "loss": 0.7164, + "step": 233 + }, + { + "epoch": 0.7194465795541891, + "grad_norm": 0.730556845664978, + "learning_rate": 0.00018470784641068448, + "loss": 0.7644, + "step": 234 + }, + { + "epoch": 0.722521137586472, + "grad_norm": 0.6781991720199585, + "learning_rate": 0.00018464106844741235, + "loss": 0.739, + "step": 235 + }, + { + "epoch": 0.7255956956187548, + "grad_norm": 0.6006051301956177, + "learning_rate": 0.00018457429048414023, + "loss": 0.6119, + "step": 236 + }, + { + "epoch": 0.7286702536510377, + "grad_norm": 0.7293769717216492, + "learning_rate": 0.00018450751252086813, + "loss": 0.7567, + "step": 237 + }, + { + "epoch": 0.7317448116833205, + "grad_norm": 0.8237872123718262, + "learning_rate": 0.000184440734557596, + "loss": 0.8348, + "step": 238 + }, + { + "epoch": 0.7348193697156034, + "grad_norm": 0.7130082845687866, + "learning_rate": 0.00018437395659432387, + "loss": 0.7532, + "step": 239 + }, + { + "epoch": 0.7378939277478862, + "grad_norm": 0.6330310702323914, + "learning_rate": 0.00018430717863105175, + "loss": 0.5578, + "step": 240 + }, + { + "epoch": 0.740968485780169, + "grad_norm": 0.5756601095199585, + "learning_rate": 0.00018424040066777965, + "loss": 0.6896, + "step": 241 + }, + { + "epoch": 0.744043043812452, + "grad_norm": 0.5361329913139343, + "learning_rate": 0.00018417362270450752, + "loss": 0.5699, + "step": 242 + }, + { + "epoch": 0.7471176018447349, + "grad_norm": 0.8006643056869507, + "learning_rate": 0.00018410684474123542, + "loss": 0.7754, + "step": 243 + }, + { + "epoch": 0.7501921598770177, + "grad_norm": 0.5526012778282166, + "learning_rate": 0.0001840400667779633, + "loss": 0.652, + "step": 244 + }, + { + "epoch": 0.7532667179093006, + "grad_norm": 0.6159217357635498, + "learning_rate": 0.00018397328881469117, + "loss": 0.6075, + "step": 245 + }, + { + "epoch": 0.7563412759415834, + "grad_norm": 0.7183135151863098, + "learning_rate": 0.00018390651085141904, + "loss": 0.7171, + "step": 246 + }, + { + "epoch": 0.7594158339738662, + "grad_norm": 0.5863479375839233, + "learning_rate": 0.0001838397328881469, + "loss": 0.6779, + "step": 247 + }, + { + "epoch": 0.7624903920061491, + "grad_norm": 0.6453308463096619, + "learning_rate": 0.0001837729549248748, + "loss": 0.6641, + "step": 248 + }, + { + "epoch": 0.765564950038432, + "grad_norm": 0.6052024364471436, + "learning_rate": 0.00018370617696160269, + "loss": 0.5686, + "step": 249 + }, + { + "epoch": 0.7686395080707148, + "grad_norm": 0.578968346118927, + "learning_rate": 0.00018363939899833056, + "loss": 0.6197, + "step": 250 + }, + { + "epoch": 0.7717140661029976, + "grad_norm": 0.6560538411140442, + "learning_rate": 0.00018357262103505843, + "loss": 0.5807, + "step": 251 + }, + { + "epoch": 0.7747886241352806, + "grad_norm": 0.5958215594291687, + "learning_rate": 0.0001835058430717863, + "loss": 0.5999, + "step": 252 + }, + { + "epoch": 0.7778631821675634, + "grad_norm": 0.5787363052368164, + "learning_rate": 0.0001834390651085142, + "loss": 0.4809, + "step": 253 + }, + { + "epoch": 0.7809377401998463, + "grad_norm": 0.5788077712059021, + "learning_rate": 0.00018337228714524208, + "loss": 0.5878, + "step": 254 + }, + { + "epoch": 0.7840122982321291, + "grad_norm": 0.6090837121009827, + "learning_rate": 0.00018330550918196995, + "loss": 0.5489, + "step": 255 + }, + { + "epoch": 0.787086856264412, + "grad_norm": 0.7720903754234314, + "learning_rate": 0.00018323873121869782, + "loss": 0.7388, + "step": 256 + }, + { + "epoch": 0.7901614142966948, + "grad_norm": 0.8125558495521545, + "learning_rate": 0.0001831719532554257, + "loss": 0.7156, + "step": 257 + }, + { + "epoch": 0.7932359723289777, + "grad_norm": 0.9323811531066895, + "learning_rate": 0.0001831051752921536, + "loss": 0.7382, + "step": 258 + }, + { + "epoch": 0.7963105303612605, + "grad_norm": 1.0001492500305176, + "learning_rate": 0.00018303839732888147, + "loss": 0.8477, + "step": 259 + }, + { + "epoch": 0.7993850883935434, + "grad_norm": 0.5271466374397278, + "learning_rate": 0.00018297161936560937, + "loss": 0.6826, + "step": 260 + }, + { + "epoch": 0.8024596464258262, + "grad_norm": 0.6705284118652344, + "learning_rate": 0.00018290484140233724, + "loss": 0.6206, + "step": 261 + }, + { + "epoch": 0.8055342044581092, + "grad_norm": 0.6997379064559937, + "learning_rate": 0.00018283806343906512, + "loss": 0.692, + "step": 262 + }, + { + "epoch": 0.808608762490392, + "grad_norm": 0.6880616545677185, + "learning_rate": 0.000182771285475793, + "loss": 0.4271, + "step": 263 + }, + { + "epoch": 0.8116833205226749, + "grad_norm": 0.6490948796272278, + "learning_rate": 0.0001827045075125209, + "loss": 0.5254, + "step": 264 + }, + { + "epoch": 0.8147578785549577, + "grad_norm": 0.6712121963500977, + "learning_rate": 0.00018263772954924876, + "loss": 0.6695, + "step": 265 + }, + { + "epoch": 0.8178324365872406, + "grad_norm": 0.6833428740501404, + "learning_rate": 0.00018257095158597664, + "loss": 0.6575, + "step": 266 + }, + { + "epoch": 0.8209069946195234, + "grad_norm": 0.567756712436676, + "learning_rate": 0.0001825041736227045, + "loss": 0.5706, + "step": 267 + }, + { + "epoch": 0.8239815526518063, + "grad_norm": 0.6579324007034302, + "learning_rate": 0.00018243739565943238, + "loss": 0.6428, + "step": 268 + }, + { + "epoch": 0.8270561106840891, + "grad_norm": 0.6071141958236694, + "learning_rate": 0.00018237061769616028, + "loss": 0.687, + "step": 269 + }, + { + "epoch": 0.830130668716372, + "grad_norm": 0.7748661041259766, + "learning_rate": 0.00018230383973288816, + "loss": 0.5839, + "step": 270 + }, + { + "epoch": 0.8332052267486548, + "grad_norm": 0.8571879267692566, + "learning_rate": 0.00018223706176961603, + "loss": 0.8454, + "step": 271 + }, + { + "epoch": 0.8362797847809378, + "grad_norm": 0.7605094313621521, + "learning_rate": 0.0001821702838063439, + "loss": 0.6888, + "step": 272 + }, + { + "epoch": 0.8393543428132206, + "grad_norm": 0.7963970303535461, + "learning_rate": 0.00018210350584307178, + "loss": 0.6651, + "step": 273 + }, + { + "epoch": 0.8424289008455035, + "grad_norm": 0.8123186826705933, + "learning_rate": 0.00018203672787979968, + "loss": 0.6378, + "step": 274 + }, + { + "epoch": 0.8455034588777863, + "grad_norm": 0.6652207374572754, + "learning_rate": 0.00018196994991652755, + "loss": 0.6736, + "step": 275 + }, + { + "epoch": 0.8485780169100692, + "grad_norm": 0.8622680306434631, + "learning_rate": 0.00018190317195325542, + "loss": 0.7403, + "step": 276 + }, + { + "epoch": 0.851652574942352, + "grad_norm": 0.7115731239318848, + "learning_rate": 0.00018183639398998332, + "loss": 0.6155, + "step": 277 + }, + { + "epoch": 0.8547271329746349, + "grad_norm": 0.8267805576324463, + "learning_rate": 0.0001817696160267112, + "loss": 0.587, + "step": 278 + }, + { + "epoch": 0.8578016910069177, + "grad_norm": 0.6537695527076721, + "learning_rate": 0.0001817028380634391, + "loss": 0.6371, + "step": 279 + }, + { + "epoch": 0.8608762490392006, + "grad_norm": 0.76251620054245, + "learning_rate": 0.00018163606010016697, + "loss": 0.7431, + "step": 280 + }, + { + "epoch": 0.8639508070714835, + "grad_norm": 0.7395420670509338, + "learning_rate": 0.00018156928213689484, + "loss": 0.687, + "step": 281 + }, + { + "epoch": 0.8670253651037664, + "grad_norm": 0.6425495147705078, + "learning_rate": 0.00018150250417362272, + "loss": 0.5745, + "step": 282 + }, + { + "epoch": 0.8700999231360492, + "grad_norm": 0.868341326713562, + "learning_rate": 0.0001814357262103506, + "loss": 0.8135, + "step": 283 + }, + { + "epoch": 0.8731744811683321, + "grad_norm": 0.8760446906089783, + "learning_rate": 0.00018136894824707846, + "loss": 0.9087, + "step": 284 + }, + { + "epoch": 0.8762490392006149, + "grad_norm": 0.5800943970680237, + "learning_rate": 0.00018130217028380636, + "loss": 0.5347, + "step": 285 + }, + { + "epoch": 0.8793235972328978, + "grad_norm": 0.7919514179229736, + "learning_rate": 0.00018123539232053424, + "loss": 0.5879, + "step": 286 + }, + { + "epoch": 0.8823981552651806, + "grad_norm": 0.5620681643486023, + "learning_rate": 0.0001811686143572621, + "loss": 0.51, + "step": 287 + }, + { + "epoch": 0.8854727132974635, + "grad_norm": 0.7460557818412781, + "learning_rate": 0.00018110183639398998, + "loss": 0.8177, + "step": 288 + }, + { + "epoch": 0.8885472713297463, + "grad_norm": 0.774587094783783, + "learning_rate": 0.00018103505843071786, + "loss": 0.6761, + "step": 289 + }, + { + "epoch": 0.8916218293620292, + "grad_norm": 0.6145612597465515, + "learning_rate": 0.00018096828046744576, + "loss": 0.7094, + "step": 290 + }, + { + "epoch": 0.8946963873943121, + "grad_norm": 0.6812341809272766, + "learning_rate": 0.00018090150250417363, + "loss": 0.8529, + "step": 291 + }, + { + "epoch": 0.897770945426595, + "grad_norm": 0.7788804769515991, + "learning_rate": 0.0001808347245409015, + "loss": 0.6635, + "step": 292 + }, + { + "epoch": 0.9008455034588778, + "grad_norm": 0.6078181862831116, + "learning_rate": 0.00018076794657762938, + "loss": 0.6468, + "step": 293 + }, + { + "epoch": 0.9039200614911607, + "grad_norm": 0.6376216411590576, + "learning_rate": 0.00018070116861435728, + "loss": 0.6725, + "step": 294 + }, + { + "epoch": 0.9069946195234435, + "grad_norm": 0.5974690914154053, + "learning_rate": 0.00018063439065108515, + "loss": 0.5874, + "step": 295 + }, + { + "epoch": 0.9100691775557264, + "grad_norm": 0.6442948579788208, + "learning_rate": 0.00018056761268781305, + "loss": 0.626, + "step": 296 + }, + { + "epoch": 0.9131437355880092, + "grad_norm": 0.7131801247596741, + "learning_rate": 0.00018050083472454092, + "loss": 0.6812, + "step": 297 + }, + { + "epoch": 0.9162182936202921, + "grad_norm": 0.823663592338562, + "learning_rate": 0.0001804340567612688, + "loss": 0.9266, + "step": 298 + }, + { + "epoch": 0.9192928516525749, + "grad_norm": 0.7136701345443726, + "learning_rate": 0.00018036727879799667, + "loss": 0.6225, + "step": 299 + }, + { + "epoch": 0.9223674096848578, + "grad_norm": 0.6703259348869324, + "learning_rate": 0.00018030050083472454, + "loss": 0.6608, + "step": 300 + }, + { + "epoch": 0.9254419677171407, + "grad_norm": 0.6696874499320984, + "learning_rate": 0.00018023372287145244, + "loss": 0.5609, + "step": 301 + }, + { + "epoch": 0.9285165257494236, + "grad_norm": 0.6228551268577576, + "learning_rate": 0.00018016694490818031, + "loss": 0.7327, + "step": 302 + }, + { + "epoch": 0.9315910837817064, + "grad_norm": 0.6737201809883118, + "learning_rate": 0.0001801001669449082, + "loss": 0.6051, + "step": 303 + }, + { + "epoch": 0.9346656418139893, + "grad_norm": 0.7718166708946228, + "learning_rate": 0.00018003338898163606, + "loss": 0.6961, + "step": 304 + }, + { + "epoch": 0.9377401998462721, + "grad_norm": 0.9040055871009827, + "learning_rate": 0.00017996661101836393, + "loss": 0.6448, + "step": 305 + }, + { + "epoch": 0.940814757878555, + "grad_norm": 0.7209524512290955, + "learning_rate": 0.00017989983305509183, + "loss": 0.695, + "step": 306 + }, + { + "epoch": 0.9438893159108378, + "grad_norm": 0.6280409693717957, + "learning_rate": 0.0001798330550918197, + "loss": 0.677, + "step": 307 + }, + { + "epoch": 0.9469638739431206, + "grad_norm": 0.715514063835144, + "learning_rate": 0.00017976627712854758, + "loss": 0.6076, + "step": 308 + }, + { + "epoch": 0.9500384319754035, + "grad_norm": 0.6662933230400085, + "learning_rate": 0.00017969949916527545, + "loss": 0.6453, + "step": 309 + }, + { + "epoch": 0.9531129900076863, + "grad_norm": 0.6966415047645569, + "learning_rate": 0.00017963272120200333, + "loss": 0.7091, + "step": 310 + }, + { + "epoch": 0.9561875480399693, + "grad_norm": 0.7018651366233826, + "learning_rate": 0.00017956594323873123, + "loss": 0.6113, + "step": 311 + }, + { + "epoch": 0.9592621060722522, + "grad_norm": 0.5975345373153687, + "learning_rate": 0.0001794991652754591, + "loss": 0.4381, + "step": 312 + }, + { + "epoch": 0.962336664104535, + "grad_norm": 0.7371988296508789, + "learning_rate": 0.000179432387312187, + "loss": 0.7471, + "step": 313 + }, + { + "epoch": 0.9654112221368178, + "grad_norm": 0.5989629030227661, + "learning_rate": 0.00017936560934891487, + "loss": 0.5947, + "step": 314 + }, + { + "epoch": 0.9684857801691007, + "grad_norm": 0.5772401094436646, + "learning_rate": 0.00017929883138564275, + "loss": 0.5862, + "step": 315 + }, + { + "epoch": 0.9715603382013835, + "grad_norm": 0.7896726727485657, + "learning_rate": 0.00017923205342237062, + "loss": 0.7367, + "step": 316 + }, + { + "epoch": 0.9746348962336664, + "grad_norm": 0.9095852375030518, + "learning_rate": 0.00017916527545909852, + "loss": 0.7899, + "step": 317 + }, + { + "epoch": 0.9777094542659492, + "grad_norm": 0.5150197744369507, + "learning_rate": 0.0001790984974958264, + "loss": 0.4869, + "step": 318 + }, + { + "epoch": 0.9807840122982321, + "grad_norm": 0.6638761162757874, + "learning_rate": 0.00017903171953255427, + "loss": 0.6129, + "step": 319 + }, + { + "epoch": 0.9838585703305149, + "grad_norm": 0.738000750541687, + "learning_rate": 0.00017896494156928214, + "loss": 0.5616, + "step": 320 + }, + { + "epoch": 0.9869331283627979, + "grad_norm": 0.6779305934906006, + "learning_rate": 0.00017889816360601, + "loss": 0.6168, + "step": 321 + }, + { + "epoch": 0.9900076863950807, + "grad_norm": 0.5411549806594849, + "learning_rate": 0.0001788313856427379, + "loss": 0.5456, + "step": 322 + }, + { + "epoch": 0.9930822444273636, + "grad_norm": 0.6001323461532593, + "learning_rate": 0.0001787646076794658, + "loss": 0.7528, + "step": 323 + }, + { + "epoch": 0.9961568024596464, + "grad_norm": 0.6542277932167053, + "learning_rate": 0.00017869782971619366, + "loss": 0.5923, + "step": 324 + }, + { + "epoch": 0.9992313604919293, + "grad_norm": 0.6943919658660889, + "learning_rate": 0.00017863105175292153, + "loss": 1.0933, + "step": 325 + }, + { + "epoch": 1.0, + "grad_norm": 1.3673266172409058, + "learning_rate": 0.0001785642737896494, + "loss": 0.675, + "step": 326 + }, + { + "epoch": 1.0030745580322828, + "grad_norm": 0.5977247953414917, + "learning_rate": 0.0001784974958263773, + "loss": 0.6236, + "step": 327 + }, + { + "epoch": 1.0061491160645657, + "grad_norm": 0.5625783205032349, + "learning_rate": 0.00017843071786310518, + "loss": 0.5051, + "step": 328 + }, + { + "epoch": 1.0092236740968485, + "grad_norm": 0.5822674036026001, + "learning_rate": 0.00017836393989983305, + "loss": 0.4488, + "step": 329 + }, + { + "epoch": 1.0122982321291314, + "grad_norm": 0.5701442360877991, + "learning_rate": 0.00017829716193656095, + "loss": 0.5875, + "step": 330 + }, + { + "epoch": 1.0153727901614142, + "grad_norm": 0.5955713391304016, + "learning_rate": 0.00017823038397328883, + "loss": 0.4612, + "step": 331 + }, + { + "epoch": 1.018447348193697, + "grad_norm": 0.5202100276947021, + "learning_rate": 0.0001781636060100167, + "loss": 0.4925, + "step": 332 + }, + { + "epoch": 1.02152190622598, + "grad_norm": 0.6523457765579224, + "learning_rate": 0.0001780968280467446, + "loss": 0.5336, + "step": 333 + }, + { + "epoch": 1.0245964642582628, + "grad_norm": 0.653768002986908, + "learning_rate": 0.00017803005008347247, + "loss": 0.5789, + "step": 334 + }, + { + "epoch": 1.0276710222905456, + "grad_norm": 0.680659294128418, + "learning_rate": 0.00017796327212020035, + "loss": 0.5427, + "step": 335 + }, + { + "epoch": 1.0307455803228285, + "grad_norm": 0.6698821783065796, + "learning_rate": 0.00017789649415692822, + "loss": 0.5877, + "step": 336 + }, + { + "epoch": 1.0338201383551116, + "grad_norm": 0.628028392791748, + "learning_rate": 0.0001778297161936561, + "loss": 0.4996, + "step": 337 + }, + { + "epoch": 1.0368946963873944, + "grad_norm": 0.5866581797599792, + "learning_rate": 0.000177762938230384, + "loss": 0.6283, + "step": 338 + }, + { + "epoch": 1.0399692544196772, + "grad_norm": 0.646752119064331, + "learning_rate": 0.00017769616026711187, + "loss": 0.4585, + "step": 339 + }, + { + "epoch": 1.04304381245196, + "grad_norm": 0.678822934627533, + "learning_rate": 0.00017762938230383974, + "loss": 0.4741, + "step": 340 + }, + { + "epoch": 1.046118370484243, + "grad_norm": 0.57511967420578, + "learning_rate": 0.0001775626043405676, + "loss": 0.576, + "step": 341 + }, + { + "epoch": 1.0491929285165258, + "grad_norm": 0.7732008099555969, + "learning_rate": 0.00017749582637729548, + "loss": 0.6847, + "step": 342 + }, + { + "epoch": 1.0522674865488086, + "grad_norm": 0.47226476669311523, + "learning_rate": 0.00017742904841402339, + "loss": 0.289, + "step": 343 + }, + { + "epoch": 1.0553420445810915, + "grad_norm": 0.7770098447799683, + "learning_rate": 0.00017736227045075126, + "loss": 0.5968, + "step": 344 + }, + { + "epoch": 1.0584166026133743, + "grad_norm": 0.8492668867111206, + "learning_rate": 0.00017729549248747913, + "loss": 0.6167, + "step": 345 + }, + { + "epoch": 1.0614911606456572, + "grad_norm": 0.876089870929718, + "learning_rate": 0.000177228714524207, + "loss": 0.5435, + "step": 346 + }, + { + "epoch": 1.06456571867794, + "grad_norm": 0.7883753776550293, + "learning_rate": 0.0001771619365609349, + "loss": 0.5498, + "step": 347 + }, + { + "epoch": 1.0676402767102229, + "grad_norm": 0.785437822341919, + "learning_rate": 0.00017709515859766278, + "loss": 0.5835, + "step": 348 + }, + { + "epoch": 1.0707148347425057, + "grad_norm": 0.6789015531539917, + "learning_rate": 0.00017702838063439068, + "loss": 0.5492, + "step": 349 + }, + { + "epoch": 1.0737893927747886, + "grad_norm": 0.7070201635360718, + "learning_rate": 0.00017696160267111855, + "loss": 0.5829, + "step": 350 + }, + { + "epoch": 1.0768639508070714, + "grad_norm": 0.7011975049972534, + "learning_rate": 0.00017689482470784642, + "loss": 0.4774, + "step": 351 + }, + { + "epoch": 1.0799385088393543, + "grad_norm": 0.7407499551773071, + "learning_rate": 0.0001768280467445743, + "loss": 0.51, + "step": 352 + }, + { + "epoch": 1.0830130668716371, + "grad_norm": 0.672869861125946, + "learning_rate": 0.00017676126878130217, + "loss": 0.5383, + "step": 353 + }, + { + "epoch": 1.08608762490392, + "grad_norm": 0.8781456351280212, + "learning_rate": 0.00017669449081803007, + "loss": 0.6436, + "step": 354 + }, + { + "epoch": 1.089162182936203, + "grad_norm": 0.8077890872955322, + "learning_rate": 0.00017662771285475794, + "loss": 0.6629, + "step": 355 + }, + { + "epoch": 1.0922367409684859, + "grad_norm": 0.7883043885231018, + "learning_rate": 0.00017656093489148582, + "loss": 0.6393, + "step": 356 + }, + { + "epoch": 1.0953112990007687, + "grad_norm": 0.68159419298172, + "learning_rate": 0.0001764941569282137, + "loss": 0.5835, + "step": 357 + }, + { + "epoch": 1.0983858570330516, + "grad_norm": 0.658222496509552, + "learning_rate": 0.00017642737896494156, + "loss": 0.4921, + "step": 358 + }, + { + "epoch": 1.1014604150653344, + "grad_norm": 0.6931422352790833, + "learning_rate": 0.00017636060100166946, + "loss": 0.5664, + "step": 359 + }, + { + "epoch": 1.1045349730976173, + "grad_norm": 0.6795049905776978, + "learning_rate": 0.00017629382303839734, + "loss": 0.5559, + "step": 360 + }, + { + "epoch": 1.1076095311299001, + "grad_norm": 0.9736855030059814, + "learning_rate": 0.0001762270450751252, + "loss": 0.7188, + "step": 361 + }, + { + "epoch": 1.110684089162183, + "grad_norm": 0.6535844802856445, + "learning_rate": 0.00017616026711185308, + "loss": 0.5894, + "step": 362 + }, + { + "epoch": 1.1137586471944658, + "grad_norm": 0.7295445799827576, + "learning_rate": 0.00017609348914858096, + "loss": 0.6106, + "step": 363 + }, + { + "epoch": 1.1168332052267487, + "grad_norm": 0.7204632759094238, + "learning_rate": 0.00017602671118530886, + "loss": 0.6681, + "step": 364 + }, + { + "epoch": 1.1199077632590315, + "grad_norm": 0.64588862657547, + "learning_rate": 0.00017595993322203673, + "loss": 0.5934, + "step": 365 + }, + { + "epoch": 1.1229823212913144, + "grad_norm": 0.6482330560684204, + "learning_rate": 0.00017589315525876463, + "loss": 0.5082, + "step": 366 + }, + { + "epoch": 1.1260568793235972, + "grad_norm": 0.6101349592208862, + "learning_rate": 0.0001758263772954925, + "loss": 0.4976, + "step": 367 + }, + { + "epoch": 1.12913143735588, + "grad_norm": 0.5716677308082581, + "learning_rate": 0.00017575959933222038, + "loss": 0.3977, + "step": 368 + }, + { + "epoch": 1.132205995388163, + "grad_norm": 0.557501494884491, + "learning_rate": 0.00017569282136894825, + "loss": 0.492, + "step": 369 + }, + { + "epoch": 1.1352805534204458, + "grad_norm": 0.7171933054924011, + "learning_rate": 0.00017562604340567615, + "loss": 0.6809, + "step": 370 + }, + { + "epoch": 1.1383551114527286, + "grad_norm": 0.5551110506057739, + "learning_rate": 0.00017555926544240402, + "loss": 0.4988, + "step": 371 + }, + { + "epoch": 1.1414296694850115, + "grad_norm": 0.6553733944892883, + "learning_rate": 0.0001754924874791319, + "loss": 0.4898, + "step": 372 + }, + { + "epoch": 1.1445042275172943, + "grad_norm": 0.69221431016922, + "learning_rate": 0.00017542570951585977, + "loss": 0.6136, + "step": 373 + }, + { + "epoch": 1.1475787855495772, + "grad_norm": 0.5864092707633972, + "learning_rate": 0.00017535893155258764, + "loss": 0.5688, + "step": 374 + }, + { + "epoch": 1.15065334358186, + "grad_norm": 0.756809651851654, + "learning_rate": 0.00017529215358931554, + "loss": 0.5349, + "step": 375 + }, + { + "epoch": 1.1537279016141428, + "grad_norm": 0.6437715291976929, + "learning_rate": 0.00017522537562604342, + "loss": 0.5303, + "step": 376 + }, + { + "epoch": 1.156802459646426, + "grad_norm": 0.5712356567382812, + "learning_rate": 0.0001751585976627713, + "loss": 0.4698, + "step": 377 + }, + { + "epoch": 1.1598770176787088, + "grad_norm": 0.6452774405479431, + "learning_rate": 0.00017509181969949916, + "loss": 0.6214, + "step": 378 + }, + { + "epoch": 1.1629515757109916, + "grad_norm": 0.6589751839637756, + "learning_rate": 0.00017502504173622704, + "loss": 0.5302, + "step": 379 + }, + { + "epoch": 1.1660261337432745, + "grad_norm": 0.6354514360427856, + "learning_rate": 0.0001749582637729549, + "loss": 0.4941, + "step": 380 + }, + { + "epoch": 1.1691006917755573, + "grad_norm": 0.8875218033790588, + "learning_rate": 0.0001748914858096828, + "loss": 0.5601, + "step": 381 + }, + { + "epoch": 1.1721752498078402, + "grad_norm": 0.7112509608268738, + "learning_rate": 0.0001748247078464107, + "loss": 0.5841, + "step": 382 + }, + { + "epoch": 1.175249807840123, + "grad_norm": 0.6991716623306274, + "learning_rate": 0.00017475792988313858, + "loss": 0.4966, + "step": 383 + }, + { + "epoch": 1.1783243658724059, + "grad_norm": 0.8313332200050354, + "learning_rate": 0.00017469115191986646, + "loss": 0.6796, + "step": 384 + }, + { + "epoch": 1.1813989239046887, + "grad_norm": 0.6446208953857422, + "learning_rate": 0.00017462437395659433, + "loss": 0.4277, + "step": 385 + }, + { + "epoch": 1.1844734819369716, + "grad_norm": 0.6382359862327576, + "learning_rate": 0.00017455759599332223, + "loss": 0.5088, + "step": 386 + }, + { + "epoch": 1.1875480399692544, + "grad_norm": 0.8059669733047485, + "learning_rate": 0.0001744908180300501, + "loss": 0.5724, + "step": 387 + }, + { + "epoch": 1.1906225980015372, + "grad_norm": 0.7880392074584961, + "learning_rate": 0.00017442404006677798, + "loss": 0.6245, + "step": 388 + }, + { + "epoch": 1.19369715603382, + "grad_norm": 0.780595600605011, + "learning_rate": 0.00017435726210350585, + "loss": 0.5629, + "step": 389 + }, + { + "epoch": 1.196771714066103, + "grad_norm": 0.8109543323516846, + "learning_rate": 0.00017429048414023372, + "loss": 0.6815, + "step": 390 + }, + { + "epoch": 1.1998462720983858, + "grad_norm": 0.6399725079536438, + "learning_rate": 0.00017422370617696162, + "loss": 0.5488, + "step": 391 + }, + { + "epoch": 1.2029208301306686, + "grad_norm": 0.6464505195617676, + "learning_rate": 0.0001741569282136895, + "loss": 0.5546, + "step": 392 + }, + { + "epoch": 1.2059953881629515, + "grad_norm": 0.7562092542648315, + "learning_rate": 0.00017409015025041737, + "loss": 0.5998, + "step": 393 + }, + { + "epoch": 1.2090699461952346, + "grad_norm": 0.7341581583023071, + "learning_rate": 0.00017402337228714524, + "loss": 0.6801, + "step": 394 + }, + { + "epoch": 1.2121445042275174, + "grad_norm": 0.7949944734573364, + "learning_rate": 0.00017395659432387311, + "loss": 0.6253, + "step": 395 + }, + { + "epoch": 1.2152190622598003, + "grad_norm": 0.6935542225837708, + "learning_rate": 0.00017388981636060101, + "loss": 0.6417, + "step": 396 + }, + { + "epoch": 1.218293620292083, + "grad_norm": 0.6856999397277832, + "learning_rate": 0.0001738230383973289, + "loss": 0.5247, + "step": 397 + }, + { + "epoch": 1.221368178324366, + "grad_norm": 0.5797318816184998, + "learning_rate": 0.00017375626043405676, + "loss": 0.5061, + "step": 398 + }, + { + "epoch": 1.2244427363566488, + "grad_norm": 0.5869422554969788, + "learning_rate": 0.00017368948247078466, + "loss": 0.5226, + "step": 399 + }, + { + "epoch": 1.2275172943889316, + "grad_norm": 0.8467463850975037, + "learning_rate": 0.00017362270450751253, + "loss": 0.711, + "step": 400 + }, + { + "epoch": 1.2305918524212145, + "grad_norm": 0.7549751996994019, + "learning_rate": 0.0001735559265442404, + "loss": 0.6558, + "step": 401 + }, + { + "epoch": 1.2336664104534973, + "grad_norm": 0.6192473769187927, + "learning_rate": 0.0001734891485809683, + "loss": 0.5697, + "step": 402 + }, + { + "epoch": 1.2367409684857802, + "grad_norm": 0.7555997967720032, + "learning_rate": 0.00017342237061769618, + "loss": 0.5774, + "step": 403 + }, + { + "epoch": 1.239815526518063, + "grad_norm": 0.6554675698280334, + "learning_rate": 0.00017335559265442405, + "loss": 0.653, + "step": 404 + }, + { + "epoch": 1.2428900845503459, + "grad_norm": 0.8110440969467163, + "learning_rate": 0.00017328881469115193, + "loss": 0.6604, + "step": 405 + }, + { + "epoch": 1.2459646425826287, + "grad_norm": 0.7523771524429321, + "learning_rate": 0.0001732220367278798, + "loss": 0.7315, + "step": 406 + }, + { + "epoch": 1.2490392006149116, + "grad_norm": 0.7357513308525085, + "learning_rate": 0.0001731552587646077, + "loss": 0.6301, + "step": 407 + }, + { + "epoch": 1.2521137586471944, + "grad_norm": 0.6375721096992493, + "learning_rate": 0.00017308848080133557, + "loss": 0.4597, + "step": 408 + }, + { + "epoch": 1.2551883166794773, + "grad_norm": 0.7142077684402466, + "learning_rate": 0.00017302170283806345, + "loss": 0.5701, + "step": 409 + }, + { + "epoch": 1.2582628747117601, + "grad_norm": 0.5495367646217346, + "learning_rate": 0.00017295492487479132, + "loss": 0.4168, + "step": 410 + }, + { + "epoch": 1.261337432744043, + "grad_norm": 0.6137920618057251, + "learning_rate": 0.0001728881469115192, + "loss": 0.5961, + "step": 411 + }, + { + "epoch": 1.2644119907763258, + "grad_norm": 0.6373696327209473, + "learning_rate": 0.0001728213689482471, + "loss": 0.6035, + "step": 412 + }, + { + "epoch": 1.2674865488086087, + "grad_norm": 0.7311916351318359, + "learning_rate": 0.00017275459098497497, + "loss": 0.6675, + "step": 413 + }, + { + "epoch": 1.2705611068408915, + "grad_norm": 0.5674752593040466, + "learning_rate": 0.00017268781302170284, + "loss": 0.5865, + "step": 414 + }, + { + "epoch": 1.2736356648731744, + "grad_norm": 0.6945238709449768, + "learning_rate": 0.0001726210350584307, + "loss": 0.5979, + "step": 415 + }, + { + "epoch": 1.2767102229054572, + "grad_norm": 0.7307734489440918, + "learning_rate": 0.0001725542570951586, + "loss": 0.6184, + "step": 416 + }, + { + "epoch": 1.27978478093774, + "grad_norm": 0.6113364100456238, + "learning_rate": 0.0001724874791318865, + "loss": 0.4949, + "step": 417 + }, + { + "epoch": 1.2828593389700231, + "grad_norm": 0.8040212988853455, + "learning_rate": 0.0001724207011686144, + "loss": 0.6111, + "step": 418 + }, + { + "epoch": 1.285933897002306, + "grad_norm": 0.6946241855621338, + "learning_rate": 0.00017235392320534226, + "loss": 0.4845, + "step": 419 + }, + { + "epoch": 1.2890084550345888, + "grad_norm": 0.559880256652832, + "learning_rate": 0.00017228714524207013, + "loss": 0.6224, + "step": 420 + }, + { + "epoch": 1.2920830130668717, + "grad_norm": 0.7335419654846191, + "learning_rate": 0.000172220367278798, + "loss": 0.6117, + "step": 421 + }, + { + "epoch": 1.2951575710991545, + "grad_norm": 0.6661849617958069, + "learning_rate": 0.00017215358931552588, + "loss": 0.5993, + "step": 422 + }, + { + "epoch": 1.2982321291314374, + "grad_norm": 0.7723634243011475, + "learning_rate": 0.00017208681135225378, + "loss": 0.7025, + "step": 423 + }, + { + "epoch": 1.3013066871637202, + "grad_norm": 0.5866445302963257, + "learning_rate": 0.00017202003338898165, + "loss": 0.5205, + "step": 424 + }, + { + "epoch": 1.304381245196003, + "grad_norm": 0.9210363030433655, + "learning_rate": 0.00017195325542570953, + "loss": 0.6247, + "step": 425 + }, + { + "epoch": 1.307455803228286, + "grad_norm": 0.6116583943367004, + "learning_rate": 0.0001718864774624374, + "loss": 0.5366, + "step": 426 + }, + { + "epoch": 1.3105303612605688, + "grad_norm": 0.7020177245140076, + "learning_rate": 0.00017181969949916527, + "loss": 0.4586, + "step": 427 + }, + { + "epoch": 1.3136049192928516, + "grad_norm": 0.8982479572296143, + "learning_rate": 0.00017175292153589317, + "loss": 0.6121, + "step": 428 + }, + { + "epoch": 1.3166794773251345, + "grad_norm": 0.6956773996353149, + "learning_rate": 0.00017168614357262105, + "loss": 0.5002, + "step": 429 + }, + { + "epoch": 1.3197540353574173, + "grad_norm": 0.5864204168319702, + "learning_rate": 0.00017161936560934892, + "loss": 0.4577, + "step": 430 + }, + { + "epoch": 1.3228285933897002, + "grad_norm": 0.6034566760063171, + "learning_rate": 0.0001715525876460768, + "loss": 0.5688, + "step": 431 + }, + { + "epoch": 1.3259031514219832, + "grad_norm": 0.7787615060806274, + "learning_rate": 0.00017148580968280467, + "loss": 0.6904, + "step": 432 + }, + { + "epoch": 1.328977709454266, + "grad_norm": 0.6120966076850891, + "learning_rate": 0.00017141903171953257, + "loss": 0.452, + "step": 433 + }, + { + "epoch": 1.332052267486549, + "grad_norm": 0.6668190360069275, + "learning_rate": 0.00017135225375626044, + "loss": 0.5082, + "step": 434 + }, + { + "epoch": 1.3351268255188318, + "grad_norm": 0.660654604434967, + "learning_rate": 0.00017128547579298834, + "loss": 0.618, + "step": 435 + }, + { + "epoch": 1.3382013835511146, + "grad_norm": 0.6356967091560364, + "learning_rate": 0.0001712186978297162, + "loss": 0.583, + "step": 436 + }, + { + "epoch": 1.3412759415833975, + "grad_norm": 0.6737658977508545, + "learning_rate": 0.00017115191986644409, + "loss": 0.511, + "step": 437 + }, + { + "epoch": 1.3443504996156803, + "grad_norm": 0.6208163499832153, + "learning_rate": 0.00017108514190317196, + "loss": 0.4826, + "step": 438 + }, + { + "epoch": 1.3474250576479632, + "grad_norm": 0.570587694644928, + "learning_rate": 0.00017101836393989986, + "loss": 0.4451, + "step": 439 + }, + { + "epoch": 1.350499615680246, + "grad_norm": 0.6985802054405212, + "learning_rate": 0.00017095158597662773, + "loss": 0.5063, + "step": 440 + }, + { + "epoch": 1.3535741737125289, + "grad_norm": 0.6364935040473938, + "learning_rate": 0.0001708848080133556, + "loss": 0.5311, + "step": 441 + }, + { + "epoch": 1.3566487317448117, + "grad_norm": 0.6550077199935913, + "learning_rate": 0.00017081803005008348, + "loss": 0.4903, + "step": 442 + }, + { + "epoch": 1.3597232897770946, + "grad_norm": 0.6158908605575562, + "learning_rate": 0.00017075125208681135, + "loss": 0.5618, + "step": 443 + }, + { + "epoch": 1.3627978478093774, + "grad_norm": 0.6985887885093689, + "learning_rate": 0.00017068447412353925, + "loss": 0.5663, + "step": 444 + }, + { + "epoch": 1.3658724058416603, + "grad_norm": 0.6205439567565918, + "learning_rate": 0.00017061769616026712, + "loss": 0.4905, + "step": 445 + }, + { + "epoch": 1.368946963873943, + "grad_norm": 0.9212015271186829, + "learning_rate": 0.000170550918196995, + "loss": 0.7055, + "step": 446 + }, + { + "epoch": 1.372021521906226, + "grad_norm": 0.5099778175354004, + "learning_rate": 0.00017048414023372287, + "loss": 0.3754, + "step": 447 + }, + { + "epoch": 1.3750960799385088, + "grad_norm": 0.7985131740570068, + "learning_rate": 0.00017041736227045074, + "loss": 0.6526, + "step": 448 + }, + { + "epoch": 1.3781706379707916, + "grad_norm": 0.8302136063575745, + "learning_rate": 0.00017035058430717862, + "loss": 0.6056, + "step": 449 + }, + { + "epoch": 1.3812451960030745, + "grad_norm": 0.7308214902877808, + "learning_rate": 0.00017028380634390652, + "loss": 0.6452, + "step": 450 + }, + { + "epoch": 1.3843197540353573, + "grad_norm": 0.7058115005493164, + "learning_rate": 0.0001702170283806344, + "loss": 0.6399, + "step": 451 + }, + { + "epoch": 1.3873943120676402, + "grad_norm": 0.5836137533187866, + "learning_rate": 0.0001701502504173623, + "loss": 0.5628, + "step": 452 + }, + { + "epoch": 1.390468870099923, + "grad_norm": 0.5505719780921936, + "learning_rate": 0.00017008347245409016, + "loss": 0.4292, + "step": 453 + }, + { + "epoch": 1.3935434281322059, + "grad_norm": 0.7084729671478271, + "learning_rate": 0.00017001669449081804, + "loss": 0.5762, + "step": 454 + }, + { + "epoch": 1.3966179861644887, + "grad_norm": 0.6776607632637024, + "learning_rate": 0.00016994991652754594, + "loss": 0.4992, + "step": 455 + }, + { + "epoch": 1.3996925441967716, + "grad_norm": 0.6364510655403137, + "learning_rate": 0.0001698831385642738, + "loss": 0.4434, + "step": 456 + }, + { + "epoch": 1.4027671022290547, + "grad_norm": 0.6788143515586853, + "learning_rate": 0.00016981636060100168, + "loss": 0.4134, + "step": 457 + }, + { + "epoch": 1.4058416602613375, + "grad_norm": 0.6752612590789795, + "learning_rate": 0.00016974958263772956, + "loss": 0.5838, + "step": 458 + }, + { + "epoch": 1.4089162182936203, + "grad_norm": 0.6687692403793335, + "learning_rate": 0.00016968280467445743, + "loss": 0.6128, + "step": 459 + }, + { + "epoch": 1.4119907763259032, + "grad_norm": 0.8868100047111511, + "learning_rate": 0.00016961602671118533, + "loss": 0.5903, + "step": 460 + }, + { + "epoch": 1.415065334358186, + "grad_norm": 0.7482825517654419, + "learning_rate": 0.0001695492487479132, + "loss": 0.5034, + "step": 461 + }, + { + "epoch": 1.418139892390469, + "grad_norm": 0.5688104033470154, + "learning_rate": 0.00016948247078464108, + "loss": 0.5084, + "step": 462 + }, + { + "epoch": 1.4212144504227517, + "grad_norm": 0.730925440788269, + "learning_rate": 0.00016941569282136895, + "loss": 0.5267, + "step": 463 + }, + { + "epoch": 1.4242890084550346, + "grad_norm": 0.683314859867096, + "learning_rate": 0.00016934891485809682, + "loss": 0.6047, + "step": 464 + }, + { + "epoch": 1.4273635664873174, + "grad_norm": 0.7654600143432617, + "learning_rate": 0.0001692821368948247, + "loss": 0.5612, + "step": 465 + }, + { + "epoch": 1.4304381245196003, + "grad_norm": 0.5215669870376587, + "learning_rate": 0.0001692153589315526, + "loss": 0.4425, + "step": 466 + }, + { + "epoch": 1.4335126825518831, + "grad_norm": 0.8029130697250366, + "learning_rate": 0.00016914858096828047, + "loss": 0.6748, + "step": 467 + }, + { + "epoch": 1.436587240584166, + "grad_norm": 0.7310311794281006, + "learning_rate": 0.00016908180300500834, + "loss": 0.7158, + "step": 468 + }, + { + "epoch": 1.4396617986164488, + "grad_norm": 0.6347652077674866, + "learning_rate": 0.00016901502504173624, + "loss": 0.447, + "step": 469 + }, + { + "epoch": 1.4427363566487317, + "grad_norm": 0.6077408194541931, + "learning_rate": 0.00016894824707846412, + "loss": 0.5187, + "step": 470 + }, + { + "epoch": 1.4458109146810145, + "grad_norm": 0.7281926274299622, + "learning_rate": 0.00016888146911519202, + "loss": 0.6181, + "step": 471 + }, + { + "epoch": 1.4488854727132976, + "grad_norm": 0.7540388107299805, + "learning_rate": 0.0001688146911519199, + "loss": 0.5303, + "step": 472 + }, + { + "epoch": 1.4519600307455804, + "grad_norm": 0.8174847364425659, + "learning_rate": 0.00016874791318864776, + "loss": 0.733, + "step": 473 + }, + { + "epoch": 1.4550345887778633, + "grad_norm": 0.6414505243301392, + "learning_rate": 0.00016868113522537564, + "loss": 0.5458, + "step": 474 + }, + { + "epoch": 1.4581091468101461, + "grad_norm": 0.9108033776283264, + "learning_rate": 0.0001686143572621035, + "loss": 0.7125, + "step": 475 + }, + { + "epoch": 1.461183704842429, + "grad_norm": 0.6116359233856201, + "learning_rate": 0.0001685475792988314, + "loss": 0.5549, + "step": 476 + }, + { + "epoch": 1.4642582628747118, + "grad_norm": 0.821499228477478, + "learning_rate": 0.00016848080133555928, + "loss": 0.7422, + "step": 477 + }, + { + "epoch": 1.4673328209069947, + "grad_norm": 0.5836993455886841, + "learning_rate": 0.00016841402337228716, + "loss": 0.4829, + "step": 478 + }, + { + "epoch": 1.4704073789392775, + "grad_norm": 0.7028072476387024, + "learning_rate": 0.00016834724540901503, + "loss": 0.4844, + "step": 479 + }, + { + "epoch": 1.4734819369715604, + "grad_norm": 0.6338192224502563, + "learning_rate": 0.0001682804674457429, + "loss": 0.418, + "step": 480 + }, + { + "epoch": 1.4765564950038432, + "grad_norm": 0.7174279689788818, + "learning_rate": 0.00016821368948247077, + "loss": 0.4816, + "step": 481 + }, + { + "epoch": 1.479631053036126, + "grad_norm": 0.6590016484260559, + "learning_rate": 0.00016814691151919868, + "loss": 0.5613, + "step": 482 + }, + { + "epoch": 1.482705611068409, + "grad_norm": 0.7180425524711609, + "learning_rate": 0.00016808013355592655, + "loss": 0.7368, + "step": 483 + }, + { + "epoch": 1.4857801691006918, + "grad_norm": 0.7836325168609619, + "learning_rate": 0.00016801335559265442, + "loss": 0.6126, + "step": 484 + }, + { + "epoch": 1.4888547271329746, + "grad_norm": 0.6930490732192993, + "learning_rate": 0.0001679465776293823, + "loss": 0.6497, + "step": 485 + }, + { + "epoch": 1.4919292851652575, + "grad_norm": 0.6975258588790894, + "learning_rate": 0.0001678797996661102, + "loss": 0.579, + "step": 486 + }, + { + "epoch": 1.4950038431975403, + "grad_norm": 0.7456351518630981, + "learning_rate": 0.00016781302170283807, + "loss": 0.5209, + "step": 487 + }, + { + "epoch": 1.4980784012298232, + "grad_norm": 0.6301809549331665, + "learning_rate": 0.00016774624373956597, + "loss": 0.4117, + "step": 488 + }, + { + "epoch": 1.501152959262106, + "grad_norm": 0.9827542304992676, + "learning_rate": 0.00016767946577629384, + "loss": 0.7722, + "step": 489 + }, + { + "epoch": 1.5042275172943889, + "grad_norm": 0.6148912906646729, + "learning_rate": 0.00016761268781302171, + "loss": 0.6126, + "step": 490 + }, + { + "epoch": 1.5073020753266717, + "grad_norm": 0.7233926057815552, + "learning_rate": 0.0001675459098497496, + "loss": 0.6748, + "step": 491 + }, + { + "epoch": 1.5103766333589546, + "grad_norm": 0.7733349204063416, + "learning_rate": 0.0001674791318864775, + "loss": 0.5462, + "step": 492 + }, + { + "epoch": 1.5134511913912374, + "grad_norm": 0.6742725372314453, + "learning_rate": 0.00016741235392320536, + "loss": 0.6109, + "step": 493 + }, + { + "epoch": 1.5165257494235203, + "grad_norm": 0.5742484331130981, + "learning_rate": 0.00016734557595993323, + "loss": 0.452, + "step": 494 + }, + { + "epoch": 1.519600307455803, + "grad_norm": 0.5890893936157227, + "learning_rate": 0.0001672787979966611, + "loss": 0.5423, + "step": 495 + }, + { + "epoch": 1.522674865488086, + "grad_norm": 0.6500853896141052, + "learning_rate": 0.00016721202003338898, + "loss": 0.5345, + "step": 496 + }, + { + "epoch": 1.5257494235203688, + "grad_norm": 0.6630553603172302, + "learning_rate": 0.00016714524207011685, + "loss": 0.5529, + "step": 497 + }, + { + "epoch": 1.5288239815526516, + "grad_norm": 0.72234046459198, + "learning_rate": 0.00016707846410684475, + "loss": 0.5947, + "step": 498 + }, + { + "epoch": 1.5318985395849347, + "grad_norm": 0.7056167125701904, + "learning_rate": 0.00016701168614357263, + "loss": 0.5464, + "step": 499 + }, + { + "epoch": 1.5349730976172176, + "grad_norm": 0.7403351068496704, + "learning_rate": 0.0001669449081803005, + "loss": 0.5423, + "step": 500 + }, + { + "epoch": 1.5380476556495004, + "grad_norm": 0.8917403817176819, + "learning_rate": 0.00016687813021702837, + "loss": 0.6635, + "step": 501 + }, + { + "epoch": 1.5411222136817833, + "grad_norm": 0.5691559910774231, + "learning_rate": 0.00016681135225375625, + "loss": 0.4648, + "step": 502 + }, + { + "epoch": 1.544196771714066, + "grad_norm": 0.7191663980484009, + "learning_rate": 0.00016674457429048415, + "loss": 0.63, + "step": 503 + }, + { + "epoch": 1.547271329746349, + "grad_norm": 0.6063690781593323, + "learning_rate": 0.00016667779632721202, + "loss": 0.5557, + "step": 504 + }, + { + "epoch": 1.5503458877786318, + "grad_norm": 0.6743360161781311, + "learning_rate": 0.00016661101836393992, + "loss": 0.5346, + "step": 505 + }, + { + "epoch": 1.5534204458109147, + "grad_norm": 0.6480421423912048, + "learning_rate": 0.0001665442404006678, + "loss": 0.5116, + "step": 506 + }, + { + "epoch": 1.5564950038431975, + "grad_norm": 0.6903517842292786, + "learning_rate": 0.00016647746243739567, + "loss": 0.6378, + "step": 507 + }, + { + "epoch": 1.5595695618754803, + "grad_norm": 0.6405192613601685, + "learning_rate": 0.00016641068447412357, + "loss": 0.6756, + "step": 508 + }, + { + "epoch": 1.5626441199077634, + "grad_norm": 0.7051334381103516, + "learning_rate": 0.00016634390651085144, + "loss": 0.4695, + "step": 509 + }, + { + "epoch": 1.5657186779400463, + "grad_norm": 0.5805487036705017, + "learning_rate": 0.0001662771285475793, + "loss": 0.557, + "step": 510 + }, + { + "epoch": 1.5687932359723291, + "grad_norm": 0.5971087217330933, + "learning_rate": 0.00016621035058430719, + "loss": 0.4993, + "step": 511 + }, + { + "epoch": 1.571867794004612, + "grad_norm": 0.5403761863708496, + "learning_rate": 0.00016614357262103506, + "loss": 0.4008, + "step": 512 + }, + { + "epoch": 1.5749423520368948, + "grad_norm": 0.8529918193817139, + "learning_rate": 0.00016607679465776293, + "loss": 0.6232, + "step": 513 + }, + { + "epoch": 1.5780169100691777, + "grad_norm": 0.5955516695976257, + "learning_rate": 0.00016601001669449083, + "loss": 0.5359, + "step": 514 + }, + { + "epoch": 1.5810914681014605, + "grad_norm": 0.6873809099197388, + "learning_rate": 0.0001659432387312187, + "loss": 0.6932, + "step": 515 + }, + { + "epoch": 1.5841660261337434, + "grad_norm": 0.7022868394851685, + "learning_rate": 0.00016587646076794658, + "loss": 0.6894, + "step": 516 + }, + { + "epoch": 1.5872405841660262, + "grad_norm": 0.7386640906333923, + "learning_rate": 0.00016580968280467445, + "loss": 0.4857, + "step": 517 + }, + { + "epoch": 1.590315142198309, + "grad_norm": 0.6635391712188721, + "learning_rate": 0.00016574290484140233, + "loss": 0.5664, + "step": 518 + }, + { + "epoch": 1.593389700230592, + "grad_norm": 0.6896170973777771, + "learning_rate": 0.00016567612687813023, + "loss": 0.4613, + "step": 519 + }, + { + "epoch": 1.5964642582628747, + "grad_norm": 0.5555704236030579, + "learning_rate": 0.0001656093489148581, + "loss": 0.4811, + "step": 520 + }, + { + "epoch": 1.5995388162951576, + "grad_norm": 0.7170313596725464, + "learning_rate": 0.00016554257095158597, + "loss": 0.5939, + "step": 521 + }, + { + "epoch": 1.6026133743274404, + "grad_norm": 0.6032419204711914, + "learning_rate": 0.00016547579298831387, + "loss": 0.554, + "step": 522 + }, + { + "epoch": 1.6056879323597233, + "grad_norm": 0.8021843433380127, + "learning_rate": 0.00016540901502504175, + "loss": 0.6693, + "step": 523 + }, + { + "epoch": 1.6087624903920061, + "grad_norm": 0.7321604490280151, + "learning_rate": 0.00016534223706176965, + "loss": 0.7513, + "step": 524 + }, + { + "epoch": 1.611837048424289, + "grad_norm": 0.6060817241668701, + "learning_rate": 0.00016527545909849752, + "loss": 0.553, + "step": 525 + }, + { + "epoch": 1.6149116064565718, + "grad_norm": 0.7783850431442261, + "learning_rate": 0.0001652086811352254, + "loss": 0.5449, + "step": 526 + }, + { + "epoch": 1.6179861644888547, + "grad_norm": 0.8254792094230652, + "learning_rate": 0.00016514190317195327, + "loss": 0.6773, + "step": 527 + }, + { + "epoch": 1.6210607225211375, + "grad_norm": 0.7466058731079102, + "learning_rate": 0.00016507512520868114, + "loss": 0.6807, + "step": 528 + }, + { + "epoch": 1.6241352805534204, + "grad_norm": 0.8844708800315857, + "learning_rate": 0.00016500834724540904, + "loss": 0.5325, + "step": 529 + }, + { + "epoch": 1.6272098385857032, + "grad_norm": 0.8244767189025879, + "learning_rate": 0.0001649415692821369, + "loss": 0.8234, + "step": 530 + }, + { + "epoch": 1.630284396617986, + "grad_norm": 0.6416113376617432, + "learning_rate": 0.00016487479131886478, + "loss": 0.5985, + "step": 531 + }, + { + "epoch": 1.633358954650269, + "grad_norm": 0.4929693341255188, + "learning_rate": 0.00016480801335559266, + "loss": 0.4375, + "step": 532 + }, + { + "epoch": 1.6364335126825518, + "grad_norm": 0.540748655796051, + "learning_rate": 0.00016474123539232053, + "loss": 0.4758, + "step": 533 + }, + { + "epoch": 1.6395080707148346, + "grad_norm": 0.8574146032333374, + "learning_rate": 0.0001646744574290484, + "loss": 0.7296, + "step": 534 + }, + { + "epoch": 1.6425826287471175, + "grad_norm": 0.7862269282341003, + "learning_rate": 0.0001646076794657763, + "loss": 0.7556, + "step": 535 + }, + { + "epoch": 1.6456571867794003, + "grad_norm": 0.6202278137207031, + "learning_rate": 0.00016454090150250418, + "loss": 0.5431, + "step": 536 + }, + { + "epoch": 1.6487317448116832, + "grad_norm": 0.580601155757904, + "learning_rate": 0.00016447412353923205, + "loss": 0.4694, + "step": 537 + }, + { + "epoch": 1.6518063028439662, + "grad_norm": 0.5990520715713501, + "learning_rate": 0.00016440734557595992, + "loss": 0.5506, + "step": 538 + }, + { + "epoch": 1.654880860876249, + "grad_norm": 0.5700373649597168, + "learning_rate": 0.00016434056761268782, + "loss": 0.6156, + "step": 539 + }, + { + "epoch": 1.657955418908532, + "grad_norm": 0.6192472577095032, + "learning_rate": 0.0001642737896494157, + "loss": 0.5789, + "step": 540 + }, + { + "epoch": 1.6610299769408148, + "grad_norm": 0.741287112236023, + "learning_rate": 0.0001642070116861436, + "loss": 0.5113, + "step": 541 + }, + { + "epoch": 1.6641045349730976, + "grad_norm": 0.609207272529602, + "learning_rate": 0.00016414023372287147, + "loss": 0.5957, + "step": 542 + }, + { + "epoch": 1.6671790930053805, + "grad_norm": 0.613161027431488, + "learning_rate": 0.00016407345575959934, + "loss": 0.5169, + "step": 543 + }, + { + "epoch": 1.6702536510376633, + "grad_norm": 0.6057065725326538, + "learning_rate": 0.00016400667779632722, + "loss": 0.3969, + "step": 544 + }, + { + "epoch": 1.6733282090699462, + "grad_norm": 0.6364975571632385, + "learning_rate": 0.00016393989983305512, + "loss": 0.5941, + "step": 545 + }, + { + "epoch": 1.676402767102229, + "grad_norm": 0.6298673152923584, + "learning_rate": 0.000163873121869783, + "loss": 0.5397, + "step": 546 + }, + { + "epoch": 1.6794773251345119, + "grad_norm": 0.5753400921821594, + "learning_rate": 0.00016380634390651086, + "loss": 0.5225, + "step": 547 + }, + { + "epoch": 1.682551883166795, + "grad_norm": 0.47216150164604187, + "learning_rate": 0.00016373956594323874, + "loss": 0.4412, + "step": 548 + }, + { + "epoch": 1.6856264411990778, + "grad_norm": 0.575374960899353, + "learning_rate": 0.0001636727879799666, + "loss": 0.563, + "step": 549 + }, + { + "epoch": 1.6887009992313606, + "grad_norm": 0.6871128678321838, + "learning_rate": 0.00016360601001669448, + "loss": 0.517, + "step": 550 + }, + { + "epoch": 1.6917755572636435, + "grad_norm": 0.6241912841796875, + "learning_rate": 0.00016353923205342238, + "loss": 0.5816, + "step": 551 + }, + { + "epoch": 1.6948501152959263, + "grad_norm": 0.5549102425575256, + "learning_rate": 0.00016347245409015026, + "loss": 0.5728, + "step": 552 + }, + { + "epoch": 1.6979246733282092, + "grad_norm": 0.8817942142486572, + "learning_rate": 0.00016340567612687813, + "loss": 0.56, + "step": 553 + }, + { + "epoch": 1.700999231360492, + "grad_norm": 0.7771773338317871, + "learning_rate": 0.000163338898163606, + "loss": 0.6107, + "step": 554 + }, + { + "epoch": 1.7040737893927749, + "grad_norm": 0.7410566210746765, + "learning_rate": 0.00016327212020033388, + "loss": 0.6591, + "step": 555 + }, + { + "epoch": 1.7071483474250577, + "grad_norm": 0.830802857875824, + "learning_rate": 0.00016320534223706178, + "loss": 0.6667, + "step": 556 + }, + { + "epoch": 1.7102229054573406, + "grad_norm": 0.593959629535675, + "learning_rate": 0.00016313856427378965, + "loss": 0.5319, + "step": 557 + }, + { + "epoch": 1.7132974634896234, + "grad_norm": 0.6377514004707336, + "learning_rate": 0.00016307178631051755, + "loss": 0.5966, + "step": 558 + }, + { + "epoch": 1.7163720215219063, + "grad_norm": 0.6252657771110535, + "learning_rate": 0.00016300500834724542, + "loss": 0.5337, + "step": 559 + }, + { + "epoch": 1.7194465795541891, + "grad_norm": 0.885527491569519, + "learning_rate": 0.0001629382303839733, + "loss": 0.8417, + "step": 560 + }, + { + "epoch": 1.722521137586472, + "grad_norm": 0.5693302154541016, + "learning_rate": 0.0001628714524207012, + "loss": 0.5038, + "step": 561 + }, + { + "epoch": 1.7255956956187548, + "grad_norm": 0.7291401624679565, + "learning_rate": 0.00016280467445742907, + "loss": 0.6994, + "step": 562 + }, + { + "epoch": 1.7286702536510377, + "grad_norm": 0.7223179340362549, + "learning_rate": 0.00016273789649415694, + "loss": 0.648, + "step": 563 + }, + { + "epoch": 1.7317448116833205, + "grad_norm": 0.7139200568199158, + "learning_rate": 0.00016267111853088482, + "loss": 0.5822, + "step": 564 + }, + { + "epoch": 1.7348193697156034, + "grad_norm": 0.5660908222198486, + "learning_rate": 0.0001626043405676127, + "loss": 0.3695, + "step": 565 + }, + { + "epoch": 1.7378939277478862, + "grad_norm": 0.698505163192749, + "learning_rate": 0.00016253756260434056, + "loss": 0.6601, + "step": 566 + }, + { + "epoch": 1.740968485780169, + "grad_norm": 0.5684105753898621, + "learning_rate": 0.00016247078464106846, + "loss": 0.6013, + "step": 567 + }, + { + "epoch": 1.744043043812452, + "grad_norm": 0.645592212677002, + "learning_rate": 0.00016240400667779634, + "loss": 0.6394, + "step": 568 + }, + { + "epoch": 1.7471176018447347, + "grad_norm": 0.6073788404464722, + "learning_rate": 0.0001623372287145242, + "loss": 0.4312, + "step": 569 + }, + { + "epoch": 1.7501921598770176, + "grad_norm": 0.7062597274780273, + "learning_rate": 0.00016227045075125208, + "loss": 0.5653, + "step": 570 + }, + { + "epoch": 1.7532667179093004, + "grad_norm": 0.5822290182113647, + "learning_rate": 0.00016220367278797996, + "loss": 0.5852, + "step": 571 + }, + { + "epoch": 1.7563412759415833, + "grad_norm": 0.6263893842697144, + "learning_rate": 0.00016213689482470786, + "loss": 0.6102, + "step": 572 + }, + { + "epoch": 1.7594158339738661, + "grad_norm": 0.7281681299209595, + "learning_rate": 0.00016207011686143573, + "loss": 0.6375, + "step": 573 + }, + { + "epoch": 1.762490392006149, + "grad_norm": 0.6217925548553467, + "learning_rate": 0.0001620033388981636, + "loss": 0.603, + "step": 574 + }, + { + "epoch": 1.7655649500384318, + "grad_norm": 0.822990357875824, + "learning_rate": 0.0001619365609348915, + "loss": 0.7641, + "step": 575 + }, + { + "epoch": 1.7686395080707147, + "grad_norm": 0.6625170111656189, + "learning_rate": 0.00016186978297161938, + "loss": 0.5701, + "step": 576 + }, + { + "epoch": 1.7717140661029975, + "grad_norm": 0.6847323179244995, + "learning_rate": 0.00016180300500834728, + "loss": 0.47, + "step": 577 + }, + { + "epoch": 1.7747886241352806, + "grad_norm": 0.6274866461753845, + "learning_rate": 0.00016173622704507515, + "loss": 0.4998, + "step": 578 + }, + { + "epoch": 1.7778631821675634, + "grad_norm": 0.7083932161331177, + "learning_rate": 0.00016166944908180302, + "loss": 0.6362, + "step": 579 + }, + { + "epoch": 1.7809377401998463, + "grad_norm": 0.7024930715560913, + "learning_rate": 0.0001616026711185309, + "loss": 0.6101, + "step": 580 + }, + { + "epoch": 1.7840122982321291, + "grad_norm": 0.9053730964660645, + "learning_rate": 0.00016153589315525877, + "loss": 0.7606, + "step": 581 + }, + { + "epoch": 1.787086856264412, + "grad_norm": 1.0986732244491577, + "learning_rate": 0.00016146911519198664, + "loss": 0.7126, + "step": 582 + }, + { + "epoch": 1.7901614142966948, + "grad_norm": 0.6207830309867859, + "learning_rate": 0.00016140233722871454, + "loss": 0.5338, + "step": 583 + }, + { + "epoch": 1.7932359723289777, + "grad_norm": 0.5910727977752686, + "learning_rate": 0.00016133555926544241, + "loss": 0.4771, + "step": 584 + }, + { + "epoch": 1.7963105303612605, + "grad_norm": 0.5598863363265991, + "learning_rate": 0.0001612687813021703, + "loss": 0.3635, + "step": 585 + }, + { + "epoch": 1.7993850883935434, + "grad_norm": 0.7183571457862854, + "learning_rate": 0.00016120200333889816, + "loss": 0.6022, + "step": 586 + }, + { + "epoch": 1.8024596464258262, + "grad_norm": 0.7178698182106018, + "learning_rate": 0.00016113522537562603, + "loss": 0.5143, + "step": 587 + }, + { + "epoch": 1.8055342044581093, + "grad_norm": 0.5767114162445068, + "learning_rate": 0.00016106844741235393, + "loss": 0.481, + "step": 588 + }, + { + "epoch": 1.8086087624903922, + "grad_norm": 0.6642889380455017, + "learning_rate": 0.0001610016694490818, + "loss": 0.5119, + "step": 589 + }, + { + "epoch": 1.811683320522675, + "grad_norm": 0.7314223647117615, + "learning_rate": 0.00016093489148580968, + "loss": 0.5836, + "step": 590 + }, + { + "epoch": 1.8147578785549578, + "grad_norm": 0.6860315799713135, + "learning_rate": 0.00016086811352253755, + "loss": 0.5669, + "step": 591 + }, + { + "epoch": 1.8178324365872407, + "grad_norm": 0.7875143885612488, + "learning_rate": 0.00016080133555926545, + "loss": 0.6005, + "step": 592 + }, + { + "epoch": 1.8209069946195235, + "grad_norm": 0.7283911108970642, + "learning_rate": 0.00016073455759599333, + "loss": 0.5565, + "step": 593 + }, + { + "epoch": 1.8239815526518064, + "grad_norm": 0.5864517092704773, + "learning_rate": 0.00016066777963272123, + "loss": 0.5659, + "step": 594 + }, + { + "epoch": 1.8270561106840892, + "grad_norm": 0.6149706244468689, + "learning_rate": 0.0001606010016694491, + "loss": 0.5811, + "step": 595 + }, + { + "epoch": 1.830130668716372, + "grad_norm": 0.6962308883666992, + "learning_rate": 0.00016053422370617697, + "loss": 0.6053, + "step": 596 + }, + { + "epoch": 1.833205226748655, + "grad_norm": 0.5711308121681213, + "learning_rate": 0.00016046744574290485, + "loss": 0.4212, + "step": 597 + }, + { + "epoch": 1.8362797847809378, + "grad_norm": 0.7618324756622314, + "learning_rate": 0.00016040066777963272, + "loss": 0.7267, + "step": 598 + }, + { + "epoch": 1.8393543428132206, + "grad_norm": 0.7906466126441956, + "learning_rate": 0.00016033388981636062, + "loss": 0.7056, + "step": 599 + }, + { + "epoch": 1.8424289008455035, + "grad_norm": 1.0188270807266235, + "learning_rate": 0.0001602671118530885, + "loss": 0.634, + "step": 600 + }, + { + "epoch": 1.8455034588777863, + "grad_norm": 0.7009850740432739, + "learning_rate": 0.00016020033388981637, + "loss": 0.4883, + "step": 601 + }, + { + "epoch": 1.8485780169100692, + "grad_norm": 0.8244671821594238, + "learning_rate": 0.00016013355592654424, + "loss": 0.7119, + "step": 602 + }, + { + "epoch": 1.851652574942352, + "grad_norm": 0.738471508026123, + "learning_rate": 0.0001600667779632721, + "loss": 0.6025, + "step": 603 + }, + { + "epoch": 1.8547271329746349, + "grad_norm": 0.6964389085769653, + "learning_rate": 0.00016, + "loss": 0.5, + "step": 604 + }, + { + "epoch": 1.8578016910069177, + "grad_norm": 0.5497778654098511, + "learning_rate": 0.00015993322203672789, + "loss": 0.5629, + "step": 605 + }, + { + "epoch": 1.8608762490392006, + "grad_norm": 0.644513726234436, + "learning_rate": 0.00015986644407345576, + "loss": 0.465, + "step": 606 + }, + { + "epoch": 1.8639508070714834, + "grad_norm": 0.6021044254302979, + "learning_rate": 0.00015979966611018363, + "loss": 0.4142, + "step": 607 + }, + { + "epoch": 1.8670253651037663, + "grad_norm": 0.669230043888092, + "learning_rate": 0.0001597328881469115, + "loss": 0.6143, + "step": 608 + }, + { + "epoch": 1.8700999231360491, + "grad_norm": 0.7413586378097534, + "learning_rate": 0.0001596661101836394, + "loss": 0.6182, + "step": 609 + }, + { + "epoch": 1.873174481168332, + "grad_norm": 0.6968368291854858, + "learning_rate": 0.00015959933222036728, + "loss": 0.5306, + "step": 610 + }, + { + "epoch": 1.8762490392006148, + "grad_norm": 0.6736475825309753, + "learning_rate": 0.00015953255425709518, + "loss": 0.5857, + "step": 611 + }, + { + "epoch": 1.8793235972328977, + "grad_norm": 0.6630072593688965, + "learning_rate": 0.00015946577629382305, + "loss": 0.4775, + "step": 612 + }, + { + "epoch": 1.8823981552651805, + "grad_norm": 0.6984624266624451, + "learning_rate": 0.00015939899833055093, + "loss": 0.5635, + "step": 613 + }, + { + "epoch": 1.8854727132974634, + "grad_norm": 0.6280466914176941, + "learning_rate": 0.0001593322203672788, + "loss": 0.8159, + "step": 614 + }, + { + "epoch": 1.8885472713297462, + "grad_norm": 0.7790103554725647, + "learning_rate": 0.0001592654424040067, + "loss": 0.594, + "step": 615 + }, + { + "epoch": 1.891621829362029, + "grad_norm": 0.704753041267395, + "learning_rate": 0.00015919866444073457, + "loss": 0.5726, + "step": 616 + }, + { + "epoch": 1.8946963873943121, + "grad_norm": 0.7425320148468018, + "learning_rate": 0.00015913188647746245, + "loss": 0.5657, + "step": 617 + }, + { + "epoch": 1.897770945426595, + "grad_norm": 0.6058589816093445, + "learning_rate": 0.00015906510851419032, + "loss": 0.4574, + "step": 618 + }, + { + "epoch": 1.9008455034588778, + "grad_norm": 0.811036229133606, + "learning_rate": 0.0001589983305509182, + "loss": 0.5719, + "step": 619 + }, + { + "epoch": 1.9039200614911607, + "grad_norm": 0.5609816908836365, + "learning_rate": 0.0001589315525876461, + "loss": 0.7197, + "step": 620 + }, + { + "epoch": 1.9069946195234435, + "grad_norm": 0.6295925974845886, + "learning_rate": 0.00015886477462437397, + "loss": 0.514, + "step": 621 + }, + { + "epoch": 1.9100691775557264, + "grad_norm": 0.9893009662628174, + "learning_rate": 0.00015879799666110184, + "loss": 0.6079, + "step": 622 + }, + { + "epoch": 1.9131437355880092, + "grad_norm": 0.6634209752082825, + "learning_rate": 0.0001587312186978297, + "loss": 0.5731, + "step": 623 + }, + { + "epoch": 1.916218293620292, + "grad_norm": 0.6897741556167603, + "learning_rate": 0.00015866444073455758, + "loss": 0.533, + "step": 624 + }, + { + "epoch": 1.919292851652575, + "grad_norm": 0.7442365884780884, + "learning_rate": 0.00015859766277128548, + "loss": 0.5796, + "step": 625 + }, + { + "epoch": 1.9223674096848578, + "grad_norm": 0.7648442387580872, + "learning_rate": 0.00015853088480801336, + "loss": 0.6745, + "step": 626 + }, + { + "epoch": 1.9254419677171408, + "grad_norm": 0.6118778586387634, + "learning_rate": 0.00015846410684474123, + "loss": 0.5577, + "step": 627 + }, + { + "epoch": 1.9285165257494237, + "grad_norm": 0.7464010715484619, + "learning_rate": 0.00015839732888146913, + "loss": 0.6854, + "step": 628 + }, + { + "epoch": 1.9315910837817065, + "grad_norm": 0.63694828748703, + "learning_rate": 0.000158330550918197, + "loss": 0.5794, + "step": 629 + }, + { + "epoch": 1.9346656418139894, + "grad_norm": 0.7984501123428345, + "learning_rate": 0.00015826377295492488, + "loss": 0.8564, + "step": 630 + }, + { + "epoch": 1.9377401998462722, + "grad_norm": 0.7075039744377136, + "learning_rate": 0.00015819699499165278, + "loss": 0.5751, + "step": 631 + }, + { + "epoch": 1.940814757878555, + "grad_norm": 0.6514005064964294, + "learning_rate": 0.00015813021702838065, + "loss": 0.6048, + "step": 632 + }, + { + "epoch": 1.943889315910838, + "grad_norm": 0.5643919706344604, + "learning_rate": 0.00015806343906510852, + "loss": 0.4348, + "step": 633 + }, + { + "epoch": 1.9469638739431208, + "grad_norm": 0.7066437005996704, + "learning_rate": 0.0001579966611018364, + "loss": 0.5822, + "step": 634 + }, + { + "epoch": 1.9500384319754036, + "grad_norm": 0.5992090106010437, + "learning_rate": 0.00015792988313856427, + "loss": 0.5614, + "step": 635 + }, + { + "epoch": 1.9531129900076865, + "grad_norm": 0.6332142353057861, + "learning_rate": 0.00015786310517529217, + "loss": 0.5655, + "step": 636 + }, + { + "epoch": 1.9561875480399693, + "grad_norm": 0.5068455934524536, + "learning_rate": 0.00015779632721202004, + "loss": 0.5389, + "step": 637 + }, + { + "epoch": 1.9592621060722522, + "grad_norm": 0.8024671673774719, + "learning_rate": 0.00015772954924874792, + "loss": 0.7261, + "step": 638 + }, + { + "epoch": 1.962336664104535, + "grad_norm": 0.8747161626815796, + "learning_rate": 0.0001576627712854758, + "loss": 0.6632, + "step": 639 + }, + { + "epoch": 1.9654112221368178, + "grad_norm": 0.5946447253227234, + "learning_rate": 0.00015759599332220366, + "loss": 0.5571, + "step": 640 + }, + { + "epoch": 1.9684857801691007, + "grad_norm": 0.7284528017044067, + "learning_rate": 0.00015752921535893156, + "loss": 0.6314, + "step": 641 + }, + { + "epoch": 1.9715603382013835, + "grad_norm": 0.824228823184967, + "learning_rate": 0.00015746243739565944, + "loss": 0.7593, + "step": 642 + }, + { + "epoch": 1.9746348962336664, + "grad_norm": 0.6937350034713745, + "learning_rate": 0.0001573956594323873, + "loss": 0.6647, + "step": 643 + }, + { + "epoch": 1.9777094542659492, + "grad_norm": 0.5793902277946472, + "learning_rate": 0.0001573288814691152, + "loss": 0.4004, + "step": 644 + }, + { + "epoch": 1.980784012298232, + "grad_norm": 0.7415186762809753, + "learning_rate": 0.00015726210350584308, + "loss": 0.454, + "step": 645 + }, + { + "epoch": 1.983858570330515, + "grad_norm": 0.6287279725074768, + "learning_rate": 0.00015719532554257096, + "loss": 0.6492, + "step": 646 + }, + { + "epoch": 1.9869331283627978, + "grad_norm": 0.7581256628036499, + "learning_rate": 0.00015712854757929886, + "loss": 0.6954, + "step": 647 + }, + { + "epoch": 1.9900076863950806, + "grad_norm": 0.7032405734062195, + "learning_rate": 0.00015706176961602673, + "loss": 0.601, + "step": 648 + }, + { + "epoch": 1.9930822444273635, + "grad_norm": 0.9088711142539978, + "learning_rate": 0.0001569949916527546, + "loss": 0.7629, + "step": 649 + }, + { + "epoch": 1.9961568024596463, + "grad_norm": 0.7218103408813477, + "learning_rate": 0.00015692821368948248, + "loss": 0.666, + "step": 650 + }, + { + "epoch": 1.9992313604919292, + "grad_norm": 0.7617568373680115, + "learning_rate": 0.00015686143572621035, + "loss": 0.6146, + "step": 651 + }, + { + "epoch": 2.0, + "grad_norm": 1.2042289972305298, + "learning_rate": 0.00015679465776293825, + "loss": 0.5399, + "step": 652 + }, + { + "epoch": 2.003074558032283, + "grad_norm": 0.4528297781944275, + "learning_rate": 0.00015672787979966612, + "loss": 0.359, + "step": 653 + }, + { + "epoch": 2.0061491160645657, + "grad_norm": 0.6834194660186768, + "learning_rate": 0.000156661101836394, + "loss": 0.4381, + "step": 654 + }, + { + "epoch": 2.0092236740968485, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.00015659432387312187, + "loss": 0.5073, + "step": 655 + }, + { + "epoch": 2.0122982321291314, + "grad_norm": 0.5966106057167053, + "learning_rate": 0.00015652754590984974, + "loss": 0.5188, + "step": 656 + }, + { + "epoch": 2.0153727901614142, + "grad_norm": 0.5781939029693604, + "learning_rate": 0.00015646076794657764, + "loss": 0.4936, + "step": 657 + }, + { + "epoch": 2.018447348193697, + "grad_norm": 0.6681936979293823, + "learning_rate": 0.00015639398998330552, + "loss": 0.5091, + "step": 658 + }, + { + "epoch": 2.02152190622598, + "grad_norm": 0.7436164617538452, + "learning_rate": 0.0001563272120200334, + "loss": 0.6323, + "step": 659 + }, + { + "epoch": 2.024596464258263, + "grad_norm": 0.59382164478302, + "learning_rate": 0.00015626043405676126, + "loss": 0.4239, + "step": 660 + }, + { + "epoch": 2.0276710222905456, + "grad_norm": 0.659829318523407, + "learning_rate": 0.00015619365609348916, + "loss": 0.4769, + "step": 661 + }, + { + "epoch": 2.0307455803228285, + "grad_norm": 0.6705843806266785, + "learning_rate": 0.00015612687813021704, + "loss": 0.5814, + "step": 662 + }, + { + "epoch": 2.0338201383551113, + "grad_norm": 0.6286864876747131, + "learning_rate": 0.00015606010016694494, + "loss": 0.3872, + "step": 663 + }, + { + "epoch": 2.036894696387394, + "grad_norm": 0.6223423480987549, + "learning_rate": 0.0001559933222036728, + "loss": 0.5428, + "step": 664 + }, + { + "epoch": 2.039969254419677, + "grad_norm": 0.7200874090194702, + "learning_rate": 0.00015592654424040068, + "loss": 0.5192, + "step": 665 + }, + { + "epoch": 2.04304381245196, + "grad_norm": 0.6993906497955322, + "learning_rate": 0.00015585976627712856, + "loss": 0.523, + "step": 666 + }, + { + "epoch": 2.0461183704842427, + "grad_norm": 0.7193444967269897, + "learning_rate": 0.00015579298831385643, + "loss": 0.4788, + "step": 667 + }, + { + "epoch": 2.0491929285165256, + "grad_norm": 0.7082064747810364, + "learning_rate": 0.00015572621035058433, + "loss": 0.5071, + "step": 668 + }, + { + "epoch": 2.0522674865488084, + "grad_norm": 0.6296613812446594, + "learning_rate": 0.0001556594323873122, + "loss": 0.5415, + "step": 669 + }, + { + "epoch": 2.0553420445810913, + "grad_norm": 1.0283185243606567, + "learning_rate": 0.00015559265442404007, + "loss": 0.4654, + "step": 670 + }, + { + "epoch": 2.058416602613374, + "grad_norm": 0.8436565399169922, + "learning_rate": 0.00015552587646076795, + "loss": 0.6448, + "step": 671 + }, + { + "epoch": 2.061491160645657, + "grad_norm": 0.5912400484085083, + "learning_rate": 0.00015545909849749582, + "loss": 0.4333, + "step": 672 + }, + { + "epoch": 2.0645657186779403, + "grad_norm": 0.7355748414993286, + "learning_rate": 0.00015539232053422372, + "loss": 0.4239, + "step": 673 + }, + { + "epoch": 2.067640276710223, + "grad_norm": 0.6488693952560425, + "learning_rate": 0.0001553255425709516, + "loss": 0.454, + "step": 674 + }, + { + "epoch": 2.070714834742506, + "grad_norm": 0.5765907764434814, + "learning_rate": 0.00015525876460767947, + "loss": 0.4197, + "step": 675 + }, + { + "epoch": 2.073789392774789, + "grad_norm": 0.9428765773773193, + "learning_rate": 0.00015519198664440734, + "loss": 0.6399, + "step": 676 + }, + { + "epoch": 2.0768639508070716, + "grad_norm": 0.6274253726005554, + "learning_rate": 0.00015512520868113521, + "loss": 0.5117, + "step": 677 + }, + { + "epoch": 2.0799385088393545, + "grad_norm": 0.6983177065849304, + "learning_rate": 0.00015505843071786311, + "loss": 0.3869, + "step": 678 + }, + { + "epoch": 2.0830130668716373, + "grad_norm": 0.6359655261039734, + "learning_rate": 0.000154991652754591, + "loss": 0.4617, + "step": 679 + }, + { + "epoch": 2.08608762490392, + "grad_norm": 0.6552188992500305, + "learning_rate": 0.0001549248747913189, + "loss": 0.4573, + "step": 680 + }, + { + "epoch": 2.089162182936203, + "grad_norm": 0.7065202593803406, + "learning_rate": 0.00015485809682804676, + "loss": 0.5351, + "step": 681 + }, + { + "epoch": 2.092236740968486, + "grad_norm": 0.7550007700920105, + "learning_rate": 0.00015479131886477463, + "loss": 0.5607, + "step": 682 + }, + { + "epoch": 2.0953112990007687, + "grad_norm": 0.6819210648536682, + "learning_rate": 0.0001547245409015025, + "loss": 0.4222, + "step": 683 + }, + { + "epoch": 2.0983858570330516, + "grad_norm": 0.5584454536437988, + "learning_rate": 0.0001546577629382304, + "loss": 0.3859, + "step": 684 + }, + { + "epoch": 2.1014604150653344, + "grad_norm": 0.7186906337738037, + "learning_rate": 0.00015459098497495828, + "loss": 0.4977, + "step": 685 + }, + { + "epoch": 2.1045349730976173, + "grad_norm": 0.763657808303833, + "learning_rate": 0.00015452420701168615, + "loss": 0.5239, + "step": 686 + }, + { + "epoch": 2.1076095311299, + "grad_norm": 0.6879159212112427, + "learning_rate": 0.00015445742904841403, + "loss": 0.5443, + "step": 687 + }, + { + "epoch": 2.110684089162183, + "grad_norm": 0.5697076916694641, + "learning_rate": 0.0001543906510851419, + "loss": 0.3364, + "step": 688 + }, + { + "epoch": 2.113758647194466, + "grad_norm": 0.6115249991416931, + "learning_rate": 0.0001543238731218698, + "loss": 0.5267, + "step": 689 + }, + { + "epoch": 2.1168332052267487, + "grad_norm": 0.6462056040763855, + "learning_rate": 0.00015425709515859767, + "loss": 0.4057, + "step": 690 + }, + { + "epoch": 2.1199077632590315, + "grad_norm": 0.6328736543655396, + "learning_rate": 0.00015419031719532555, + "loss": 0.4059, + "step": 691 + }, + { + "epoch": 2.1229823212913144, + "grad_norm": 0.6837843656539917, + "learning_rate": 0.00015412353923205342, + "loss": 0.4288, + "step": 692 + }, + { + "epoch": 2.126056879323597, + "grad_norm": 0.6994965672492981, + "learning_rate": 0.0001540567612687813, + "loss": 0.5656, + "step": 693 + }, + { + "epoch": 2.12913143735588, + "grad_norm": 0.6533644795417786, + "learning_rate": 0.0001539899833055092, + "loss": 0.3702, + "step": 694 + }, + { + "epoch": 2.132205995388163, + "grad_norm": 0.6903581023216248, + "learning_rate": 0.00015392320534223707, + "loss": 0.4216, + "step": 695 + }, + { + "epoch": 2.1352805534204458, + "grad_norm": 0.6325581669807434, + "learning_rate": 0.00015385642737896494, + "loss": 0.3968, + "step": 696 + }, + { + "epoch": 2.1383551114527286, + "grad_norm": 0.6790093779563904, + "learning_rate": 0.00015378964941569284, + "loss": 0.5112, + "step": 697 + }, + { + "epoch": 2.1414296694850115, + "grad_norm": 0.8143894672393799, + "learning_rate": 0.0001537228714524207, + "loss": 0.5103, + "step": 698 + }, + { + "epoch": 2.1445042275172943, + "grad_norm": 0.6844452023506165, + "learning_rate": 0.00015365609348914859, + "loss": 0.4344, + "step": 699 + }, + { + "epoch": 2.147578785549577, + "grad_norm": 1.0638381242752075, + "learning_rate": 0.00015358931552587649, + "loss": 0.5799, + "step": 700 + }, + { + "epoch": 2.15065334358186, + "grad_norm": 0.7177916169166565, + "learning_rate": 0.00015352253756260436, + "loss": 0.5159, + "step": 701 + }, + { + "epoch": 2.153727901614143, + "grad_norm": 0.5857630968093872, + "learning_rate": 0.00015345575959933223, + "loss": 0.452, + "step": 702 + }, + { + "epoch": 2.1568024596464257, + "grad_norm": 0.7078539729118347, + "learning_rate": 0.0001533889816360601, + "loss": 0.5155, + "step": 703 + }, + { + "epoch": 2.1598770176787085, + "grad_norm": 0.8432323932647705, + "learning_rate": 0.00015332220367278798, + "loss": 0.6138, + "step": 704 + }, + { + "epoch": 2.1629515757109914, + "grad_norm": 0.6977456212043762, + "learning_rate": 0.00015325542570951588, + "loss": 0.577, + "step": 705 + }, + { + "epoch": 2.1660261337432742, + "grad_norm": 0.72422194480896, + "learning_rate": 0.00015318864774624375, + "loss": 0.5873, + "step": 706 + }, + { + "epoch": 2.169100691775557, + "grad_norm": 0.846378743648529, + "learning_rate": 0.00015312186978297163, + "loss": 0.5319, + "step": 707 + }, + { + "epoch": 2.17217524980784, + "grad_norm": 0.6224305629730225, + "learning_rate": 0.0001530550918196995, + "loss": 0.6304, + "step": 708 + }, + { + "epoch": 2.175249807840123, + "grad_norm": 0.7657787203788757, + "learning_rate": 0.00015298831385642737, + "loss": 0.531, + "step": 709 + }, + { + "epoch": 2.178324365872406, + "grad_norm": 0.8921689987182617, + "learning_rate": 0.00015292153589315527, + "loss": 0.4925, + "step": 710 + }, + { + "epoch": 2.1813989239046885, + "grad_norm": 0.5680480003356934, + "learning_rate": 0.00015285475792988315, + "loss": 0.3959, + "step": 711 + }, + { + "epoch": 2.1844734819369718, + "grad_norm": 0.6384515166282654, + "learning_rate": 0.00015278797996661102, + "loss": 0.5023, + "step": 712 + }, + { + "epoch": 2.1875480399692546, + "grad_norm": 0.523273766040802, + "learning_rate": 0.0001527212020033389, + "loss": 0.3509, + "step": 713 + }, + { + "epoch": 2.1906225980015375, + "grad_norm": 0.6296597719192505, + "learning_rate": 0.0001526544240400668, + "loss": 0.4695, + "step": 714 + }, + { + "epoch": 2.1936971560338203, + "grad_norm": 0.6718856692314148, + "learning_rate": 0.00015258764607679466, + "loss": 0.4394, + "step": 715 + }, + { + "epoch": 2.196771714066103, + "grad_norm": 0.731511116027832, + "learning_rate": 0.00015252086811352257, + "loss": 0.3299, + "step": 716 + }, + { + "epoch": 2.199846272098386, + "grad_norm": 0.7541506886482239, + "learning_rate": 0.00015245409015025044, + "loss": 0.5043, + "step": 717 + }, + { + "epoch": 2.202920830130669, + "grad_norm": 0.8243811726570129, + "learning_rate": 0.0001523873121869783, + "loss": 0.6253, + "step": 718 + }, + { + "epoch": 2.2059953881629517, + "grad_norm": 0.7630672454833984, + "learning_rate": 0.00015232053422370618, + "loss": 0.4685, + "step": 719 + }, + { + "epoch": 2.2090699461952346, + "grad_norm": 0.6123481392860413, + "learning_rate": 0.00015225375626043406, + "loss": 0.4081, + "step": 720 + }, + { + "epoch": 2.2121445042275174, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.00015218697829716196, + "loss": 0.4342, + "step": 721 + }, + { + "epoch": 2.2152190622598003, + "grad_norm": 0.913813054561615, + "learning_rate": 0.00015212020033388983, + "loss": 0.4762, + "step": 722 + }, + { + "epoch": 2.218293620292083, + "grad_norm": 0.7751143574714661, + "learning_rate": 0.0001520534223706177, + "loss": 0.5079, + "step": 723 + }, + { + "epoch": 2.221368178324366, + "grad_norm": 0.8524821996688843, + "learning_rate": 0.00015198664440734558, + "loss": 0.5464, + "step": 724 + }, + { + "epoch": 2.224442736356649, + "grad_norm": 0.8985180258750916, + "learning_rate": 0.00015191986644407345, + "loss": 0.5276, + "step": 725 + }, + { + "epoch": 2.2275172943889316, + "grad_norm": 0.6020591855049133, + "learning_rate": 0.00015185308848080135, + "loss": 0.3995, + "step": 726 + }, + { + "epoch": 2.2305918524212145, + "grad_norm": 0.7074214220046997, + "learning_rate": 0.00015178631051752922, + "loss": 0.4887, + "step": 727 + }, + { + "epoch": 2.2336664104534973, + "grad_norm": 0.7474585771560669, + "learning_rate": 0.0001517195325542571, + "loss": 0.5474, + "step": 728 + }, + { + "epoch": 2.23674096848578, + "grad_norm": 0.6883979439735413, + "learning_rate": 0.00015165275459098497, + "loss": 0.5503, + "step": 729 + }, + { + "epoch": 2.239815526518063, + "grad_norm": 0.6393066644668579, + "learning_rate": 0.00015158597662771284, + "loss": 0.4356, + "step": 730 + }, + { + "epoch": 2.242890084550346, + "grad_norm": 0.6586110591888428, + "learning_rate": 0.00015151919866444074, + "loss": 0.3659, + "step": 731 + }, + { + "epoch": 2.2459646425826287, + "grad_norm": 0.7263343930244446, + "learning_rate": 0.00015145242070116862, + "loss": 0.4629, + "step": 732 + }, + { + "epoch": 2.2490392006149116, + "grad_norm": 0.8680408000946045, + "learning_rate": 0.00015138564273789652, + "loss": 0.5825, + "step": 733 + }, + { + "epoch": 2.2521137586471944, + "grad_norm": 0.5599681735038757, + "learning_rate": 0.0001513188647746244, + "loss": 0.4886, + "step": 734 + }, + { + "epoch": 2.2551883166794773, + "grad_norm": 0.7630482316017151, + "learning_rate": 0.00015125208681135226, + "loss": 0.496, + "step": 735 + }, + { + "epoch": 2.25826287471176, + "grad_norm": 0.6882701516151428, + "learning_rate": 0.00015118530884808014, + "loss": 0.5949, + "step": 736 + }, + { + "epoch": 2.261337432744043, + "grad_norm": 0.7318270802497864, + "learning_rate": 0.00015111853088480804, + "loss": 0.5267, + "step": 737 + }, + { + "epoch": 2.264411990776326, + "grad_norm": 0.8890166878700256, + "learning_rate": 0.0001510517529215359, + "loss": 0.587, + "step": 738 + }, + { + "epoch": 2.2674865488086087, + "grad_norm": 0.735357940196991, + "learning_rate": 0.00015098497495826378, + "loss": 0.53, + "step": 739 + }, + { + "epoch": 2.2705611068408915, + "grad_norm": 0.6169731616973877, + "learning_rate": 0.00015091819699499166, + "loss": 0.3872, + "step": 740 + }, + { + "epoch": 2.2736356648731744, + "grad_norm": 0.6245728135108948, + "learning_rate": 0.00015085141903171953, + "loss": 0.4331, + "step": 741 + }, + { + "epoch": 2.276710222905457, + "grad_norm": 0.6054602265357971, + "learning_rate": 0.00015078464106844743, + "loss": 0.4014, + "step": 742 + }, + { + "epoch": 2.27978478093774, + "grad_norm": 0.6015118956565857, + "learning_rate": 0.0001507178631051753, + "loss": 0.4942, + "step": 743 + }, + { + "epoch": 2.282859338970023, + "grad_norm": 0.7360993027687073, + "learning_rate": 0.00015065108514190318, + "loss": 0.5544, + "step": 744 + }, + { + "epoch": 2.2859338970023058, + "grad_norm": 0.8529961109161377, + "learning_rate": 0.00015058430717863105, + "loss": 0.5008, + "step": 745 + }, + { + "epoch": 2.2890084550345886, + "grad_norm": 0.7723920345306396, + "learning_rate": 0.00015051752921535892, + "loss": 0.5894, + "step": 746 + }, + { + "epoch": 2.2920830130668715, + "grad_norm": 0.8459378480911255, + "learning_rate": 0.0001504507512520868, + "loss": 0.6038, + "step": 747 + }, + { + "epoch": 2.2951575710991543, + "grad_norm": 0.732806384563446, + "learning_rate": 0.0001503839732888147, + "loss": 0.4986, + "step": 748 + }, + { + "epoch": 2.2982321291314376, + "grad_norm": 0.6265669465065002, + "learning_rate": 0.00015031719532554257, + "loss": 0.3117, + "step": 749 + }, + { + "epoch": 2.30130668716372, + "grad_norm": 0.6586902141571045, + "learning_rate": 0.00015025041736227047, + "loss": 0.3206, + "step": 750 + }, + { + "epoch": 2.3043812451960033, + "grad_norm": 0.5551536679267883, + "learning_rate": 0.00015018363939899834, + "loss": 0.4016, + "step": 751 + }, + { + "epoch": 2.3074558032282857, + "grad_norm": 0.8721263408660889, + "learning_rate": 0.00015011686143572622, + "loss": 0.595, + "step": 752 + }, + { + "epoch": 2.310530361260569, + "grad_norm": 0.7609719038009644, + "learning_rate": 0.00015005008347245412, + "loss": 0.4655, + "step": 753 + }, + { + "epoch": 2.313604919292852, + "grad_norm": 0.8068011999130249, + "learning_rate": 0.000149983305509182, + "loss": 0.5389, + "step": 754 + }, + { + "epoch": 2.3166794773251347, + "grad_norm": 0.5893248319625854, + "learning_rate": 0.00014991652754590986, + "loss": 0.3675, + "step": 755 + }, + { + "epoch": 2.3197540353574175, + "grad_norm": 0.4989778399467468, + "learning_rate": 0.00014984974958263774, + "loss": 0.3376, + "step": 756 + }, + { + "epoch": 2.3228285933897004, + "grad_norm": 0.8979980945587158, + "learning_rate": 0.0001497829716193656, + "loss": 0.4876, + "step": 757 + }, + { + "epoch": 2.3259031514219832, + "grad_norm": 0.6380670070648193, + "learning_rate": 0.0001497161936560935, + "loss": 0.4801, + "step": 758 + }, + { + "epoch": 2.328977709454266, + "grad_norm": 0.7083134651184082, + "learning_rate": 0.00014964941569282138, + "loss": 0.5479, + "step": 759 + }, + { + "epoch": 2.332052267486549, + "grad_norm": 0.6810340881347656, + "learning_rate": 0.00014958263772954926, + "loss": 0.4373, + "step": 760 + }, + { + "epoch": 2.3351268255188318, + "grad_norm": 0.7883718013763428, + "learning_rate": 0.00014951585976627713, + "loss": 0.5557, + "step": 761 + }, + { + "epoch": 2.3382013835511146, + "grad_norm": 0.644123375415802, + "learning_rate": 0.000149449081803005, + "loss": 0.4117, + "step": 762 + }, + { + "epoch": 2.3412759415833975, + "grad_norm": 0.8770838975906372, + "learning_rate": 0.00014938230383973287, + "loss": 0.4398, + "step": 763 + }, + { + "epoch": 2.3443504996156803, + "grad_norm": 0.603274405002594, + "learning_rate": 0.00014931552587646077, + "loss": 0.4261, + "step": 764 + }, + { + "epoch": 2.347425057647963, + "grad_norm": 0.7817360162734985, + "learning_rate": 0.00014924874791318865, + "loss": 0.471, + "step": 765 + }, + { + "epoch": 2.350499615680246, + "grad_norm": 0.703245222568512, + "learning_rate": 0.00014918196994991652, + "loss": 0.4605, + "step": 766 + }, + { + "epoch": 2.353574173712529, + "grad_norm": 0.6251977682113647, + "learning_rate": 0.00014911519198664442, + "loss": 0.4783, + "step": 767 + }, + { + "epoch": 2.3566487317448117, + "grad_norm": 0.8665552735328674, + "learning_rate": 0.0001490484140233723, + "loss": 0.4498, + "step": 768 + }, + { + "epoch": 2.3597232897770946, + "grad_norm": 0.7540160417556763, + "learning_rate": 0.0001489816360601002, + "loss": 0.4788, + "step": 769 + }, + { + "epoch": 2.3627978478093774, + "grad_norm": 0.7006065845489502, + "learning_rate": 0.00014891485809682807, + "loss": 0.4451, + "step": 770 + }, + { + "epoch": 2.3658724058416603, + "grad_norm": 0.7307246923446655, + "learning_rate": 0.00014884808013355594, + "loss": 0.5392, + "step": 771 + }, + { + "epoch": 2.368946963873943, + "grad_norm": 0.7006644606590271, + "learning_rate": 0.00014878130217028381, + "loss": 0.5656, + "step": 772 + }, + { + "epoch": 2.372021521906226, + "grad_norm": 0.8450719714164734, + "learning_rate": 0.0001487145242070117, + "loss": 0.6446, + "step": 773 + }, + { + "epoch": 2.375096079938509, + "grad_norm": 0.7223272323608398, + "learning_rate": 0.0001486477462437396, + "loss": 0.4977, + "step": 774 + }, + { + "epoch": 2.3781706379707916, + "grad_norm": 0.7771975994110107, + "learning_rate": 0.00014858096828046746, + "loss": 0.5423, + "step": 775 + }, + { + "epoch": 2.3812451960030745, + "grad_norm": 0.6998997926712036, + "learning_rate": 0.00014851419031719533, + "loss": 0.4189, + "step": 776 + }, + { + "epoch": 2.3843197540353573, + "grad_norm": 0.7170137166976929, + "learning_rate": 0.0001484474123539232, + "loss": 0.5548, + "step": 777 + }, + { + "epoch": 2.38739431206764, + "grad_norm": 0.7737225294113159, + "learning_rate": 0.00014838063439065108, + "loss": 0.5361, + "step": 778 + }, + { + "epoch": 2.390468870099923, + "grad_norm": 0.6768509149551392, + "learning_rate": 0.00014831385642737895, + "loss": 0.4285, + "step": 779 + }, + { + "epoch": 2.393543428132206, + "grad_norm": 0.7848289608955383, + "learning_rate": 0.00014824707846410685, + "loss": 0.5096, + "step": 780 + }, + { + "epoch": 2.3966179861644887, + "grad_norm": 0.7384264469146729, + "learning_rate": 0.00014818030050083473, + "loss": 0.5718, + "step": 781 + }, + { + "epoch": 2.3996925441967716, + "grad_norm": 0.508388876914978, + "learning_rate": 0.0001481135225375626, + "loss": 0.3681, + "step": 782 + }, + { + "epoch": 2.4027671022290544, + "grad_norm": 0.6172118186950684, + "learning_rate": 0.00014804674457429047, + "loss": 0.3936, + "step": 783 + }, + { + "epoch": 2.4058416602613373, + "grad_norm": 0.7471083998680115, + "learning_rate": 0.00014797996661101837, + "loss": 0.4298, + "step": 784 + }, + { + "epoch": 2.40891621829362, + "grad_norm": 0.6412104964256287, + "learning_rate": 0.00014791318864774625, + "loss": 0.4488, + "step": 785 + }, + { + "epoch": 2.411990776325903, + "grad_norm": 0.5242339372634888, + "learning_rate": 0.00014784641068447415, + "loss": 0.4069, + "step": 786 + }, + { + "epoch": 2.415065334358186, + "grad_norm": 0.7063101530075073, + "learning_rate": 0.00014777963272120202, + "loss": 0.399, + "step": 787 + }, + { + "epoch": 2.418139892390469, + "grad_norm": 0.750368595123291, + "learning_rate": 0.0001477128547579299, + "loss": 0.4841, + "step": 788 + }, + { + "epoch": 2.4212144504227515, + "grad_norm": 0.6533263325691223, + "learning_rate": 0.00014764607679465777, + "loss": 0.5361, + "step": 789 + }, + { + "epoch": 2.424289008455035, + "grad_norm": 0.7714757323265076, + "learning_rate": 0.00014757929883138567, + "loss": 0.5144, + "step": 790 + }, + { + "epoch": 2.427363566487317, + "grad_norm": 0.6196386218070984, + "learning_rate": 0.00014751252086811354, + "loss": 0.3753, + "step": 791 + }, + { + "epoch": 2.4304381245196005, + "grad_norm": 0.822083055973053, + "learning_rate": 0.0001474457429048414, + "loss": 0.5603, + "step": 792 + }, + { + "epoch": 2.4335126825518834, + "grad_norm": 0.919624924659729, + "learning_rate": 0.00014737896494156929, + "loss": 0.7186, + "step": 793 + }, + { + "epoch": 2.436587240584166, + "grad_norm": 0.7581265568733215, + "learning_rate": 0.00014731218697829716, + "loss": 0.4248, + "step": 794 + }, + { + "epoch": 2.439661798616449, + "grad_norm": 0.7717792391777039, + "learning_rate": 0.00014724540901502506, + "loss": 0.5697, + "step": 795 + }, + { + "epoch": 2.442736356648732, + "grad_norm": 0.7188724875450134, + "learning_rate": 0.00014717863105175293, + "loss": 0.5401, + "step": 796 + }, + { + "epoch": 2.4458109146810147, + "grad_norm": 0.7343811392784119, + "learning_rate": 0.0001471118530884808, + "loss": 0.5252, + "step": 797 + }, + { + "epoch": 2.4488854727132976, + "grad_norm": 0.8835532665252686, + "learning_rate": 0.00014704507512520868, + "loss": 0.4707, + "step": 798 + }, + { + "epoch": 2.4519600307455804, + "grad_norm": 0.8905605673789978, + "learning_rate": 0.00014697829716193655, + "loss": 0.6641, + "step": 799 + }, + { + "epoch": 2.4550345887778633, + "grad_norm": 0.6634113192558289, + "learning_rate": 0.00014691151919866443, + "loss": 0.5159, + "step": 800 + }, + { + "epoch": 2.458109146810146, + "grad_norm": 0.6292420625686646, + "learning_rate": 0.00014684474123539233, + "loss": 0.4333, + "step": 801 + }, + { + "epoch": 2.461183704842429, + "grad_norm": 0.806917667388916, + "learning_rate": 0.0001467779632721202, + "loss": 0.4925, + "step": 802 + }, + { + "epoch": 2.464258262874712, + "grad_norm": 0.7074801921844482, + "learning_rate": 0.0001467111853088481, + "loss": 0.5207, + "step": 803 + }, + { + "epoch": 2.4673328209069947, + "grad_norm": 0.6873858571052551, + "learning_rate": 0.00014664440734557597, + "loss": 0.575, + "step": 804 + }, + { + "epoch": 2.4704073789392775, + "grad_norm": 0.7576258182525635, + "learning_rate": 0.00014657762938230385, + "loss": 0.4433, + "step": 805 + }, + { + "epoch": 2.4734819369715604, + "grad_norm": 0.8473274111747742, + "learning_rate": 0.00014651085141903175, + "loss": 0.5218, + "step": 806 + }, + { + "epoch": 2.4765564950038432, + "grad_norm": 0.6038965582847595, + "learning_rate": 0.00014644407345575962, + "loss": 0.3794, + "step": 807 + }, + { + "epoch": 2.479631053036126, + "grad_norm": 0.714070200920105, + "learning_rate": 0.0001463772954924875, + "loss": 0.542, + "step": 808 + }, + { + "epoch": 2.482705611068409, + "grad_norm": 0.6756383776664734, + "learning_rate": 0.00014631051752921536, + "loss": 0.5188, + "step": 809 + }, + { + "epoch": 2.4857801691006918, + "grad_norm": 0.6580228209495544, + "learning_rate": 0.00014624373956594324, + "loss": 0.455, + "step": 810 + }, + { + "epoch": 2.4888547271329746, + "grad_norm": 0.7520489692687988, + "learning_rate": 0.00014617696160267114, + "loss": 0.6004, + "step": 811 + }, + { + "epoch": 2.4919292851652575, + "grad_norm": 0.6205190420150757, + "learning_rate": 0.000146110183639399, + "loss": 0.4561, + "step": 812 + }, + { + "epoch": 2.4950038431975403, + "grad_norm": 0.6518359780311584, + "learning_rate": 0.00014604340567612688, + "loss": 0.4629, + "step": 813 + }, + { + "epoch": 2.498078401229823, + "grad_norm": 0.8324114680290222, + "learning_rate": 0.00014597662771285476, + "loss": 0.4107, + "step": 814 + }, + { + "epoch": 2.501152959262106, + "grad_norm": 0.62924724817276, + "learning_rate": 0.00014590984974958263, + "loss": 0.4224, + "step": 815 + }, + { + "epoch": 2.504227517294389, + "grad_norm": 0.6838513612747192, + "learning_rate": 0.0001458430717863105, + "loss": 0.4847, + "step": 816 + }, + { + "epoch": 2.5073020753266717, + "grad_norm": 0.5814975500106812, + "learning_rate": 0.0001457762938230384, + "loss": 0.4064, + "step": 817 + }, + { + "epoch": 2.5103766333589546, + "grad_norm": 0.7436339855194092, + "learning_rate": 0.00014570951585976628, + "loss": 0.4815, + "step": 818 + }, + { + "epoch": 2.5134511913912374, + "grad_norm": 0.672369658946991, + "learning_rate": 0.00014564273789649415, + "loss": 0.494, + "step": 819 + }, + { + "epoch": 2.5165257494235203, + "grad_norm": 0.7163512110710144, + "learning_rate": 0.00014557595993322205, + "loss": 0.3771, + "step": 820 + }, + { + "epoch": 2.519600307455803, + "grad_norm": 0.814750611782074, + "learning_rate": 0.00014550918196994992, + "loss": 0.5173, + "step": 821 + }, + { + "epoch": 2.522674865488086, + "grad_norm": 0.8272102475166321, + "learning_rate": 0.00014544240400667782, + "loss": 0.4774, + "step": 822 + }, + { + "epoch": 2.525749423520369, + "grad_norm": 0.7299224734306335, + "learning_rate": 0.0001453756260434057, + "loss": 0.539, + "step": 823 + }, + { + "epoch": 2.5288239815526516, + "grad_norm": 0.6639888882637024, + "learning_rate": 0.00014530884808013357, + "loss": 0.4437, + "step": 824 + }, + { + "epoch": 2.531898539584935, + "grad_norm": 0.5353997945785522, + "learning_rate": 0.00014524207011686144, + "loss": 0.3794, + "step": 825 + }, + { + "epoch": 2.5349730976172173, + "grad_norm": 0.6737149357795715, + "learning_rate": 0.00014517529215358932, + "loss": 0.5088, + "step": 826 + }, + { + "epoch": 2.5380476556495006, + "grad_norm": 0.6940316557884216, + "learning_rate": 0.00014510851419031722, + "loss": 0.3551, + "step": 827 + }, + { + "epoch": 2.541122213681783, + "grad_norm": 0.5293498039245605, + "learning_rate": 0.0001450417362270451, + "loss": 0.4501, + "step": 828 + }, + { + "epoch": 2.5441967717140663, + "grad_norm": 0.8832515478134155, + "learning_rate": 0.00014497495826377296, + "loss": 0.6148, + "step": 829 + }, + { + "epoch": 2.5472713297463487, + "grad_norm": 0.8401015996932983, + "learning_rate": 0.00014490818030050084, + "loss": 0.5798, + "step": 830 + }, + { + "epoch": 2.550345887778632, + "grad_norm": 0.8171026110649109, + "learning_rate": 0.0001448414023372287, + "loss": 0.4115, + "step": 831 + }, + { + "epoch": 2.5534204458109144, + "grad_norm": 0.6658011674880981, + "learning_rate": 0.00014477462437395658, + "loss": 0.4396, + "step": 832 + }, + { + "epoch": 2.5564950038431977, + "grad_norm": 0.6402685046195984, + "learning_rate": 0.00014470784641068448, + "loss": 0.3923, + "step": 833 + }, + { + "epoch": 2.55956956187548, + "grad_norm": 0.7223045229911804, + "learning_rate": 0.00014464106844741236, + "loss": 0.5936, + "step": 834 + }, + { + "epoch": 2.5626441199077634, + "grad_norm": 0.7487578988075256, + "learning_rate": 0.00014457429048414023, + "loss": 0.6008, + "step": 835 + }, + { + "epoch": 2.5657186779400463, + "grad_norm": 0.7661901712417603, + "learning_rate": 0.0001445075125208681, + "loss": 0.4106, + "step": 836 + }, + { + "epoch": 2.568793235972329, + "grad_norm": 0.6282891035079956, + "learning_rate": 0.000144440734557596, + "loss": 0.3504, + "step": 837 + }, + { + "epoch": 2.571867794004612, + "grad_norm": 0.7049952745437622, + "learning_rate": 0.00014437395659432388, + "loss": 0.5956, + "step": 838 + }, + { + "epoch": 2.574942352036895, + "grad_norm": 0.6975913643836975, + "learning_rate": 0.00014430717863105178, + "loss": 0.4768, + "step": 839 + }, + { + "epoch": 2.5780169100691777, + "grad_norm": 0.7281587719917297, + "learning_rate": 0.00014424040066777965, + "loss": 0.5222, + "step": 840 + }, + { + "epoch": 2.5810914681014605, + "grad_norm": 0.864368200302124, + "learning_rate": 0.00014417362270450752, + "loss": 0.5532, + "step": 841 + }, + { + "epoch": 2.5841660261337434, + "grad_norm": 0.634505569934845, + "learning_rate": 0.0001441068447412354, + "loss": 0.3804, + "step": 842 + }, + { + "epoch": 2.587240584166026, + "grad_norm": 0.6007309556007385, + "learning_rate": 0.0001440400667779633, + "loss": 0.5151, + "step": 843 + }, + { + "epoch": 2.590315142198309, + "grad_norm": 0.9483073353767395, + "learning_rate": 0.00014397328881469117, + "loss": 0.5779, + "step": 844 + }, + { + "epoch": 2.593389700230592, + "grad_norm": 0.8563257455825806, + "learning_rate": 0.00014390651085141904, + "loss": 0.606, + "step": 845 + }, + { + "epoch": 2.5964642582628747, + "grad_norm": 0.6220794320106506, + "learning_rate": 0.00014383973288814692, + "loss": 0.4883, + "step": 846 + }, + { + "epoch": 2.5995388162951576, + "grad_norm": 0.6485925912857056, + "learning_rate": 0.0001437729549248748, + "loss": 0.4443, + "step": 847 + }, + { + "epoch": 2.6026133743274404, + "grad_norm": 0.8992952108383179, + "learning_rate": 0.00014370617696160266, + "loss": 0.6118, + "step": 848 + }, + { + "epoch": 2.6056879323597233, + "grad_norm": 0.5959873199462891, + "learning_rate": 0.00014363939899833056, + "loss": 0.4875, + "step": 849 + }, + { + "epoch": 2.608762490392006, + "grad_norm": 0.8172950744628906, + "learning_rate": 0.00014357262103505844, + "loss": 0.5899, + "step": 850 + }, + { + "epoch": 2.611837048424289, + "grad_norm": 1.0087146759033203, + "learning_rate": 0.0001435058430717863, + "loss": 0.6385, + "step": 851 + }, + { + "epoch": 2.614911606456572, + "grad_norm": 0.6918483376502991, + "learning_rate": 0.00014343906510851418, + "loss": 0.4764, + "step": 852 + }, + { + "epoch": 2.6179861644888547, + "grad_norm": 0.8268954753875732, + "learning_rate": 0.00014337228714524205, + "loss": 0.535, + "step": 853 + }, + { + "epoch": 2.6210607225211375, + "grad_norm": 0.8672003746032715, + "learning_rate": 0.00014330550918196995, + "loss": 0.5815, + "step": 854 + }, + { + "epoch": 2.6241352805534204, + "grad_norm": 0.6377939581871033, + "learning_rate": 0.00014323873121869783, + "loss": 0.4999, + "step": 855 + }, + { + "epoch": 2.6272098385857032, + "grad_norm": 0.6987239718437195, + "learning_rate": 0.00014317195325542573, + "loss": 0.534, + "step": 856 + }, + { + "epoch": 2.630284396617986, + "grad_norm": 0.7003011107444763, + "learning_rate": 0.0001431051752921536, + "loss": 0.5054, + "step": 857 + }, + { + "epoch": 2.633358954650269, + "grad_norm": 0.5871327519416809, + "learning_rate": 0.00014303839732888147, + "loss": 0.425, + "step": 858 + }, + { + "epoch": 2.6364335126825518, + "grad_norm": 0.6714287996292114, + "learning_rate": 0.00014297161936560937, + "loss": 0.5268, + "step": 859 + }, + { + "epoch": 2.6395080707148346, + "grad_norm": 0.8090579509735107, + "learning_rate": 0.00014290484140233725, + "loss": 0.4094, + "step": 860 + }, + { + "epoch": 2.6425826287471175, + "grad_norm": 0.6854161620140076, + "learning_rate": 0.00014283806343906512, + "loss": 0.4385, + "step": 861 + }, + { + "epoch": 2.6456571867794003, + "grad_norm": 0.8665665984153748, + "learning_rate": 0.000142771285475793, + "loss": 0.5323, + "step": 862 + }, + { + "epoch": 2.648731744811683, + "grad_norm": 0.6155755519866943, + "learning_rate": 0.00014270450751252087, + "loss": 0.5086, + "step": 863 + }, + { + "epoch": 2.6518063028439665, + "grad_norm": 0.6008875370025635, + "learning_rate": 0.00014263772954924874, + "loss": 0.4709, + "step": 864 + }, + { + "epoch": 2.654880860876249, + "grad_norm": 0.6181650161743164, + "learning_rate": 0.00014257095158597664, + "loss": 0.4797, + "step": 865 + }, + { + "epoch": 2.657955418908532, + "grad_norm": 0.7965251803398132, + "learning_rate": 0.00014250417362270451, + "loss": 0.5568, + "step": 866 + }, + { + "epoch": 2.6610299769408146, + "grad_norm": 0.6701710224151611, + "learning_rate": 0.0001424373956594324, + "loss": 0.5649, + "step": 867 + }, + { + "epoch": 2.664104534973098, + "grad_norm": 0.7391377091407776, + "learning_rate": 0.00014237061769616026, + "loss": 0.5245, + "step": 868 + }, + { + "epoch": 2.6671790930053803, + "grad_norm": 0.6421666741371155, + "learning_rate": 0.00014230383973288813, + "loss": 0.6095, + "step": 869 + }, + { + "epoch": 2.6702536510376635, + "grad_norm": 0.6544116735458374, + "learning_rate": 0.00014223706176961603, + "loss": 0.4581, + "step": 870 + }, + { + "epoch": 2.673328209069946, + "grad_norm": 0.6023032069206238, + "learning_rate": 0.0001421702838063439, + "loss": 0.3308, + "step": 871 + }, + { + "epoch": 2.6764027671022292, + "grad_norm": 0.6281394362449646, + "learning_rate": 0.00014210350584307178, + "loss": 0.5103, + "step": 872 + }, + { + "epoch": 2.6794773251345116, + "grad_norm": 0.7043030261993408, + "learning_rate": 0.00014203672787979968, + "loss": 0.5985, + "step": 873 + }, + { + "epoch": 2.682551883166795, + "grad_norm": 0.5958001613616943, + "learning_rate": 0.00014196994991652755, + "loss": 0.455, + "step": 874 + }, + { + "epoch": 2.685626441199078, + "grad_norm": 0.7591226696968079, + "learning_rate": 0.00014190317195325545, + "loss": 0.5467, + "step": 875 + }, + { + "epoch": 2.6887009992313606, + "grad_norm": 0.8010213375091553, + "learning_rate": 0.00014183639398998333, + "loss": 0.4531, + "step": 876 + }, + { + "epoch": 2.6917755572636435, + "grad_norm": 0.8268343210220337, + "learning_rate": 0.0001417696160267112, + "loss": 0.5443, + "step": 877 + }, + { + "epoch": 2.6948501152959263, + "grad_norm": 0.6514490246772766, + "learning_rate": 0.00014170283806343907, + "loss": 0.4455, + "step": 878 + }, + { + "epoch": 2.697924673328209, + "grad_norm": 1.0831782817840576, + "learning_rate": 0.00014163606010016695, + "loss": 0.5648, + "step": 879 + }, + { + "epoch": 2.700999231360492, + "grad_norm": 0.8194222450256348, + "learning_rate": 0.00014156928213689482, + "loss": 0.6465, + "step": 880 + }, + { + "epoch": 2.704073789392775, + "grad_norm": 0.7758293747901917, + "learning_rate": 0.00014150250417362272, + "loss": 0.4335, + "step": 881 + }, + { + "epoch": 2.7071483474250577, + "grad_norm": 0.596432089805603, + "learning_rate": 0.0001414357262103506, + "loss": 0.4159, + "step": 882 + }, + { + "epoch": 2.7102229054573406, + "grad_norm": 0.7483400702476501, + "learning_rate": 0.00014136894824707847, + "loss": 0.5471, + "step": 883 + }, + { + "epoch": 2.7132974634896234, + "grad_norm": 0.6361656785011292, + "learning_rate": 0.00014130217028380634, + "loss": 0.4815, + "step": 884 + }, + { + "epoch": 2.7163720215219063, + "grad_norm": 0.6137235760688782, + "learning_rate": 0.0001412353923205342, + "loss": 0.4245, + "step": 885 + }, + { + "epoch": 2.719446579554189, + "grad_norm": 0.6101003289222717, + "learning_rate": 0.0001411686143572621, + "loss": 0.5262, + "step": 886 + }, + { + "epoch": 2.722521137586472, + "grad_norm": 0.939014732837677, + "learning_rate": 0.00014110183639398999, + "loss": 0.5089, + "step": 887 + }, + { + "epoch": 2.725595695618755, + "grad_norm": 0.7217115759849548, + "learning_rate": 0.00014103505843071786, + "loss": 0.5018, + "step": 888 + }, + { + "epoch": 2.7286702536510377, + "grad_norm": 0.6515239477157593, + "learning_rate": 0.00014096828046744576, + "loss": 0.5274, + "step": 889 + }, + { + "epoch": 2.7317448116833205, + "grad_norm": 0.7656288743019104, + "learning_rate": 0.00014090150250417363, + "loss": 0.5395, + "step": 890 + }, + { + "epoch": 2.7348193697156034, + "grad_norm": 0.77834552526474, + "learning_rate": 0.00014083472454090153, + "loss": 0.5201, + "step": 891 + }, + { + "epoch": 2.737893927747886, + "grad_norm": 0.9140714406967163, + "learning_rate": 0.0001407679465776294, + "loss": 0.7341, + "step": 892 + }, + { + "epoch": 2.740968485780169, + "grad_norm": 0.8534432649612427, + "learning_rate": 0.00014070116861435728, + "loss": 0.4747, + "step": 893 + }, + { + "epoch": 2.744043043812452, + "grad_norm": 0.8247655034065247, + "learning_rate": 0.00014063439065108515, + "loss": 0.5811, + "step": 894 + }, + { + "epoch": 2.7471176018447347, + "grad_norm": 0.6922281980514526, + "learning_rate": 0.00014056761268781303, + "loss": 0.5797, + "step": 895 + }, + { + "epoch": 2.7501921598770176, + "grad_norm": 0.7262521982192993, + "learning_rate": 0.0001405008347245409, + "loss": 0.3954, + "step": 896 + }, + { + "epoch": 2.7532667179093004, + "grad_norm": 0.7673102021217346, + "learning_rate": 0.0001404340567612688, + "loss": 0.4929, + "step": 897 + }, + { + "epoch": 2.7563412759415833, + "grad_norm": 0.6259851455688477, + "learning_rate": 0.00014036727879799667, + "loss": 0.4048, + "step": 898 + }, + { + "epoch": 2.759415833973866, + "grad_norm": 0.7085642218589783, + "learning_rate": 0.00014030050083472454, + "loss": 0.4338, + "step": 899 + }, + { + "epoch": 2.762490392006149, + "grad_norm": 0.6708558797836304, + "learning_rate": 0.00014023372287145242, + "loss": 0.5151, + "step": 900 + }, + { + "epoch": 2.765564950038432, + "grad_norm": 0.7648240327835083, + "learning_rate": 0.0001401669449081803, + "loss": 0.5601, + "step": 901 + }, + { + "epoch": 2.7686395080707147, + "grad_norm": 0.6803378462791443, + "learning_rate": 0.0001401001669449082, + "loss": 0.624, + "step": 902 + }, + { + "epoch": 2.7717140661029975, + "grad_norm": 0.7478699088096619, + "learning_rate": 0.00014003338898163606, + "loss": 0.4805, + "step": 903 + }, + { + "epoch": 2.7747886241352804, + "grad_norm": 0.6584222316741943, + "learning_rate": 0.00013996661101836394, + "loss": 0.4451, + "step": 904 + }, + { + "epoch": 2.7778631821675637, + "grad_norm": 0.5883088111877441, + "learning_rate": 0.0001398998330550918, + "loss": 0.3848, + "step": 905 + }, + { + "epoch": 2.780937740199846, + "grad_norm": 0.5683791041374207, + "learning_rate": 0.0001398330550918197, + "loss": 0.3969, + "step": 906 + }, + { + "epoch": 2.7840122982321294, + "grad_norm": 0.5645559430122375, + "learning_rate": 0.00013976627712854758, + "loss": 0.337, + "step": 907 + }, + { + "epoch": 2.7870868562644118, + "grad_norm": 0.5845876336097717, + "learning_rate": 0.00013969949916527548, + "loss": 0.4401, + "step": 908 + }, + { + "epoch": 2.790161414296695, + "grad_norm": 0.8455728888511658, + "learning_rate": 0.00013963272120200336, + "loss": 0.6128, + "step": 909 + }, + { + "epoch": 2.7932359723289775, + "grad_norm": 0.8465787172317505, + "learning_rate": 0.00013956594323873123, + "loss": 0.5862, + "step": 910 + }, + { + "epoch": 2.7963105303612608, + "grad_norm": 0.8809154629707336, + "learning_rate": 0.0001394991652754591, + "loss": 0.677, + "step": 911 + }, + { + "epoch": 2.799385088393543, + "grad_norm": 0.6254997849464417, + "learning_rate": 0.00013943238731218698, + "loss": 0.534, + "step": 912 + }, + { + "epoch": 2.8024596464258265, + "grad_norm": 0.6675909757614136, + "learning_rate": 0.00013936560934891488, + "loss": 0.4258, + "step": 913 + }, + { + "epoch": 2.8055342044581093, + "grad_norm": 0.672428548336029, + "learning_rate": 0.00013929883138564275, + "loss": 0.4659, + "step": 914 + }, + { + "epoch": 2.808608762490392, + "grad_norm": 0.7433823943138123, + "learning_rate": 0.00013923205342237062, + "loss": 0.4804, + "step": 915 + }, + { + "epoch": 2.811683320522675, + "grad_norm": 0.6739639639854431, + "learning_rate": 0.0001391652754590985, + "loss": 0.5034, + "step": 916 + }, + { + "epoch": 2.814757878554958, + "grad_norm": 0.7234442234039307, + "learning_rate": 0.00013909849749582637, + "loss": 0.5953, + "step": 917 + }, + { + "epoch": 2.8178324365872407, + "grad_norm": 0.7517747282981873, + "learning_rate": 0.00013903171953255427, + "loss": 0.4764, + "step": 918 + }, + { + "epoch": 2.8209069946195235, + "grad_norm": 0.6552411913871765, + "learning_rate": 0.00013896494156928214, + "loss": 0.5352, + "step": 919 + }, + { + "epoch": 2.8239815526518064, + "grad_norm": 0.5779647827148438, + "learning_rate": 0.00013889816360601002, + "loss": 0.3907, + "step": 920 + }, + { + "epoch": 2.8270561106840892, + "grad_norm": 0.7147451639175415, + "learning_rate": 0.0001388313856427379, + "loss": 0.4245, + "step": 921 + }, + { + "epoch": 2.830130668716372, + "grad_norm": 0.5399389863014221, + "learning_rate": 0.00013876460767946576, + "loss": 0.4573, + "step": 922 + }, + { + "epoch": 2.833205226748655, + "grad_norm": 0.47868096828460693, + "learning_rate": 0.00013869782971619366, + "loss": 0.4469, + "step": 923 + }, + { + "epoch": 2.836279784780938, + "grad_norm": 0.6399335861206055, + "learning_rate": 0.00013863105175292154, + "loss": 0.4518, + "step": 924 + }, + { + "epoch": 2.8393543428132206, + "grad_norm": 0.6514092683792114, + "learning_rate": 0.00013856427378964944, + "loss": 0.4701, + "step": 925 + }, + { + "epoch": 2.8424289008455035, + "grad_norm": 1.0016971826553345, + "learning_rate": 0.0001384974958263773, + "loss": 0.5146, + "step": 926 + }, + { + "epoch": 2.8455034588777863, + "grad_norm": 0.6343466639518738, + "learning_rate": 0.00013843071786310518, + "loss": 0.5356, + "step": 927 + }, + { + "epoch": 2.848578016910069, + "grad_norm": 0.7292190194129944, + "learning_rate": 0.00013836393989983308, + "loss": 0.4852, + "step": 928 + }, + { + "epoch": 2.851652574942352, + "grad_norm": 0.6090812683105469, + "learning_rate": 0.00013829716193656096, + "loss": 0.3873, + "step": 929 + }, + { + "epoch": 2.854727132974635, + "grad_norm": 0.7116502523422241, + "learning_rate": 0.00013823038397328883, + "loss": 0.4726, + "step": 930 + }, + { + "epoch": 2.8578016910069177, + "grad_norm": 0.8313955664634705, + "learning_rate": 0.0001381636060100167, + "loss": 0.4524, + "step": 931 + }, + { + "epoch": 2.8608762490392006, + "grad_norm": 0.7220770120620728, + "learning_rate": 0.00013809682804674458, + "loss": 0.4448, + "step": 932 + }, + { + "epoch": 2.8639508070714834, + "grad_norm": 0.8398887515068054, + "learning_rate": 0.00013803005008347245, + "loss": 0.4853, + "step": 933 + }, + { + "epoch": 2.8670253651037663, + "grad_norm": 0.7636063098907471, + "learning_rate": 0.00013796327212020035, + "loss": 0.5618, + "step": 934 + }, + { + "epoch": 2.870099923136049, + "grad_norm": 0.5957133769989014, + "learning_rate": 0.00013789649415692822, + "loss": 0.4751, + "step": 935 + }, + { + "epoch": 2.873174481168332, + "grad_norm": 0.7475373148918152, + "learning_rate": 0.0001378297161936561, + "loss": 0.5226, + "step": 936 + }, + { + "epoch": 2.876249039200615, + "grad_norm": 0.7718681693077087, + "learning_rate": 0.00013776293823038397, + "loss": 0.5382, + "step": 937 + }, + { + "epoch": 2.8793235972328977, + "grad_norm": 0.7646799087524414, + "learning_rate": 0.00013769616026711184, + "loss": 0.5466, + "step": 938 + }, + { + "epoch": 2.8823981552651805, + "grad_norm": 0.45133599638938904, + "learning_rate": 0.00013762938230383974, + "loss": 0.3254, + "step": 939 + }, + { + "epoch": 2.8854727132974634, + "grad_norm": 0.6464604735374451, + "learning_rate": 0.00013756260434056762, + "loss": 0.5075, + "step": 940 + }, + { + "epoch": 2.888547271329746, + "grad_norm": 0.6089568138122559, + "learning_rate": 0.0001374958263772955, + "loss": 0.5177, + "step": 941 + }, + { + "epoch": 2.891621829362029, + "grad_norm": 0.6696579456329346, + "learning_rate": 0.0001374290484140234, + "loss": 0.4651, + "step": 942 + }, + { + "epoch": 2.894696387394312, + "grad_norm": 0.7825729846954346, + "learning_rate": 0.00013736227045075126, + "loss": 0.5601, + "step": 943 + }, + { + "epoch": 2.897770945426595, + "grad_norm": 0.75175541639328, + "learning_rate": 0.00013729549248747916, + "loss": 0.4901, + "step": 944 + }, + { + "epoch": 2.9008455034588776, + "grad_norm": 0.6651338338851929, + "learning_rate": 0.00013722871452420704, + "loss": 0.6134, + "step": 945 + }, + { + "epoch": 2.903920061491161, + "grad_norm": 0.6632173657417297, + "learning_rate": 0.0001371619365609349, + "loss": 0.4926, + "step": 946 + }, + { + "epoch": 2.9069946195234433, + "grad_norm": 0.666152834892273, + "learning_rate": 0.00013709515859766278, + "loss": 0.5822, + "step": 947 + }, + { + "epoch": 2.9100691775557266, + "grad_norm": 0.779793381690979, + "learning_rate": 0.00013702838063439065, + "loss": 0.6007, + "step": 948 + }, + { + "epoch": 2.913143735588009, + "grad_norm": 0.5794811248779297, + "learning_rate": 0.00013696160267111853, + "loss": 0.459, + "step": 949 + }, + { + "epoch": 2.9162182936202923, + "grad_norm": 0.65561443567276, + "learning_rate": 0.00013689482470784643, + "loss": 0.5584, + "step": 950 + }, + { + "epoch": 2.9192928516525747, + "grad_norm": 0.6967616677284241, + "learning_rate": 0.0001368280467445743, + "loss": 0.6218, + "step": 951 + }, + { + "epoch": 2.922367409684858, + "grad_norm": 0.7796815037727356, + "learning_rate": 0.00013676126878130217, + "loss": 0.6594, + "step": 952 + }, + { + "epoch": 2.925441967717141, + "grad_norm": 0.7640193700790405, + "learning_rate": 0.00013669449081803005, + "loss": 0.5783, + "step": 953 + }, + { + "epoch": 2.9285165257494237, + "grad_norm": 0.870796799659729, + "learning_rate": 0.00013662771285475792, + "loss": 0.4792, + "step": 954 + }, + { + "epoch": 2.9315910837817065, + "grad_norm": 0.8562505841255188, + "learning_rate": 0.00013656093489148582, + "loss": 0.4662, + "step": 955 + }, + { + "epoch": 2.9346656418139894, + "grad_norm": 0.744202196598053, + "learning_rate": 0.0001364941569282137, + "loss": 0.5733, + "step": 956 + }, + { + "epoch": 2.937740199846272, + "grad_norm": 0.7171375155448914, + "learning_rate": 0.00013642737896494157, + "loss": 0.4559, + "step": 957 + }, + { + "epoch": 2.940814757878555, + "grad_norm": 0.6538399457931519, + "learning_rate": 0.00013636060100166944, + "loss": 0.5757, + "step": 958 + }, + { + "epoch": 2.943889315910838, + "grad_norm": 0.7372276782989502, + "learning_rate": 0.00013629382303839734, + "loss": 0.5342, + "step": 959 + }, + { + "epoch": 2.9469638739431208, + "grad_norm": 0.7643387317657471, + "learning_rate": 0.00013622704507512521, + "loss": 0.4865, + "step": 960 + }, + { + "epoch": 2.9500384319754036, + "grad_norm": 0.8265420198440552, + "learning_rate": 0.00013616026711185311, + "loss": 0.487, + "step": 961 + }, + { + "epoch": 2.9531129900076865, + "grad_norm": 0.7020171284675598, + "learning_rate": 0.000136093489148581, + "loss": 0.4587, + "step": 962 + }, + { + "epoch": 2.9561875480399693, + "grad_norm": 0.8034495711326599, + "learning_rate": 0.00013602671118530886, + "loss": 0.5976, + "step": 963 + }, + { + "epoch": 2.959262106072252, + "grad_norm": 0.8909509181976318, + "learning_rate": 0.00013595993322203673, + "loss": 0.5283, + "step": 964 + }, + { + "epoch": 2.962336664104535, + "grad_norm": 0.8513332009315491, + "learning_rate": 0.0001358931552587646, + "loss": 0.5633, + "step": 965 + }, + { + "epoch": 2.965411222136818, + "grad_norm": 0.884508490562439, + "learning_rate": 0.0001358263772954925, + "loss": 0.5723, + "step": 966 + }, + { + "epoch": 2.9684857801691007, + "grad_norm": 0.7936095595359802, + "learning_rate": 0.00013575959933222038, + "loss": 0.5113, + "step": 967 + }, + { + "epoch": 2.9715603382013835, + "grad_norm": 0.9732086658477783, + "learning_rate": 0.00013569282136894825, + "loss": 0.5064, + "step": 968 + }, + { + "epoch": 2.9746348962336664, + "grad_norm": 0.7790175676345825, + "learning_rate": 0.00013562604340567613, + "loss": 0.653, + "step": 969 + }, + { + "epoch": 2.9777094542659492, + "grad_norm": 0.6383731365203857, + "learning_rate": 0.000135559265442404, + "loss": 0.4153, + "step": 970 + }, + { + "epoch": 2.980784012298232, + "grad_norm": 0.6142308115959167, + "learning_rate": 0.0001354924874791319, + "loss": 0.4416, + "step": 971 + }, + { + "epoch": 2.983858570330515, + "grad_norm": 0.8212004899978638, + "learning_rate": 0.00013542570951585977, + "loss": 0.628, + "step": 972 + }, + { + "epoch": 2.986933128362798, + "grad_norm": 0.7956951856613159, + "learning_rate": 0.00013535893155258765, + "loss": 0.4985, + "step": 973 + }, + { + "epoch": 2.9900076863950806, + "grad_norm": 0.6558810472488403, + "learning_rate": 0.00013529215358931552, + "loss": 0.6306, + "step": 974 + }, + { + "epoch": 2.9930822444273635, + "grad_norm": 0.6772769689559937, + "learning_rate": 0.0001352253756260434, + "loss": 0.4105, + "step": 975 + }, + { + "epoch": 2.9961568024596463, + "grad_norm": 0.6904112100601196, + "learning_rate": 0.0001351585976627713, + "loss": 0.6085, + "step": 976 + }, + { + "epoch": 2.999231360491929, + "grad_norm": 1.2956902980804443, + "learning_rate": 0.00013509181969949917, + "loss": 0.5799, + "step": 977 + }, + { + "epoch": 3.0, + "grad_norm": 1.4253301620483398, + "learning_rate": 0.00013502504173622707, + "loss": 0.532, + "step": 978 + }, + { + "epoch": 3.003074558032283, + "grad_norm": 0.5800220370292664, + "learning_rate": 0.00013495826377295494, + "loss": 0.3877, + "step": 979 + }, + { + "epoch": 3.0061491160645657, + "grad_norm": 0.558982253074646, + "learning_rate": 0.0001348914858096828, + "loss": 0.3996, + "step": 980 + }, + { + "epoch": 3.0092236740968485, + "grad_norm": 0.6223140954971313, + "learning_rate": 0.00013482470784641069, + "loss": 0.5572, + "step": 981 + }, + { + "epoch": 3.0122982321291314, + "grad_norm": 0.522871196269989, + "learning_rate": 0.00013475792988313859, + "loss": 0.4207, + "step": 982 + }, + { + "epoch": 3.0153727901614142, + "grad_norm": 0.8188950419425964, + "learning_rate": 0.00013469115191986646, + "loss": 0.4837, + "step": 983 + }, + { + "epoch": 3.018447348193697, + "grad_norm": 0.5563365817070007, + "learning_rate": 0.00013462437395659433, + "loss": 0.3388, + "step": 984 + }, + { + "epoch": 3.02152190622598, + "grad_norm": 0.6464280486106873, + "learning_rate": 0.0001345575959933222, + "loss": 0.4506, + "step": 985 + }, + { + "epoch": 3.024596464258263, + "grad_norm": 0.815517246723175, + "learning_rate": 0.00013449081803005008, + "loss": 0.5219, + "step": 986 + }, + { + "epoch": 3.0276710222905456, + "grad_norm": 0.6663722395896912, + "learning_rate": 0.00013442404006677798, + "loss": 0.4144, + "step": 987 + }, + { + "epoch": 3.0307455803228285, + "grad_norm": 0.5828370451927185, + "learning_rate": 0.00013435726210350585, + "loss": 0.3811, + "step": 988 + }, + { + "epoch": 3.0338201383551113, + "grad_norm": 0.5832375884056091, + "learning_rate": 0.00013429048414023373, + "loss": 0.3878, + "step": 989 + }, + { + "epoch": 3.036894696387394, + "grad_norm": 0.5275335311889648, + "learning_rate": 0.0001342237061769616, + "loss": 0.3398, + "step": 990 + }, + { + "epoch": 3.039969254419677, + "grad_norm": 0.7779368758201599, + "learning_rate": 0.00013415692821368947, + "loss": 0.4973, + "step": 991 + }, + { + "epoch": 3.04304381245196, + "grad_norm": 0.7502028942108154, + "learning_rate": 0.00013409015025041737, + "loss": 0.4701, + "step": 992 + }, + { + "epoch": 3.0461183704842427, + "grad_norm": 0.7934368848800659, + "learning_rate": 0.00013402337228714524, + "loss": 0.5111, + "step": 993 + }, + { + "epoch": 3.0491929285165256, + "grad_norm": 0.6284624338150024, + "learning_rate": 0.00013395659432387312, + "loss": 0.3573, + "step": 994 + }, + { + "epoch": 3.0522674865488084, + "grad_norm": 0.8872091770172119, + "learning_rate": 0.00013388981636060102, + "loss": 0.3814, + "step": 995 + }, + { + "epoch": 3.0553420445810913, + "grad_norm": 0.5997917652130127, + "learning_rate": 0.0001338230383973289, + "loss": 0.4055, + "step": 996 + }, + { + "epoch": 3.058416602613374, + "grad_norm": 0.5672919154167175, + "learning_rate": 0.00013375626043405676, + "loss": 0.3655, + "step": 997 + }, + { + "epoch": 3.061491160645657, + "grad_norm": 0.7909939885139465, + "learning_rate": 0.00013368948247078466, + "loss": 0.4561, + "step": 998 + }, + { + "epoch": 3.0645657186779403, + "grad_norm": 1.0427160263061523, + "learning_rate": 0.00013362270450751254, + "loss": 0.5413, + "step": 999 + }, + { + "epoch": 3.067640276710223, + "grad_norm": 0.8109893798828125, + "learning_rate": 0.0001335559265442404, + "loss": 0.4794, + "step": 1000 + }, + { + "epoch": 3.070714834742506, + "grad_norm": 0.7566149830818176, + "learning_rate": 0.00013348914858096828, + "loss": 0.394, + "step": 1001 + }, + { + "epoch": 3.073789392774789, + "grad_norm": 0.7242660522460938, + "learning_rate": 0.00013342237061769616, + "loss": 0.4266, + "step": 1002 + }, + { + "epoch": 3.0768639508070716, + "grad_norm": 0.9477089047431946, + "learning_rate": 0.00013335559265442406, + "loss": 0.4617, + "step": 1003 + }, + { + "epoch": 3.0799385088393545, + "grad_norm": 0.949448823928833, + "learning_rate": 0.00013328881469115193, + "loss": 0.5183, + "step": 1004 + }, + { + "epoch": 3.0830130668716373, + "grad_norm": 0.7148897647857666, + "learning_rate": 0.0001332220367278798, + "loss": 0.4209, + "step": 1005 + }, + { + "epoch": 3.08608762490392, + "grad_norm": 0.6903197169303894, + "learning_rate": 0.00013315525876460768, + "loss": 0.4703, + "step": 1006 + }, + { + "epoch": 3.089162182936203, + "grad_norm": 0.7613615393638611, + "learning_rate": 0.00013308848080133555, + "loss": 0.3055, + "step": 1007 + }, + { + "epoch": 3.092236740968486, + "grad_norm": 0.5844465494155884, + "learning_rate": 0.00013302170283806345, + "loss": 0.2888, + "step": 1008 + }, + { + "epoch": 3.0953112990007687, + "grad_norm": 0.772946298122406, + "learning_rate": 0.00013295492487479132, + "loss": 0.4935, + "step": 1009 + }, + { + "epoch": 3.0983858570330516, + "grad_norm": 0.7142703533172607, + "learning_rate": 0.0001328881469115192, + "loss": 0.443, + "step": 1010 + }, + { + "epoch": 3.1014604150653344, + "grad_norm": 0.7844696044921875, + "learning_rate": 0.00013282136894824707, + "loss": 0.4677, + "step": 1011 + }, + { + "epoch": 3.1045349730976173, + "grad_norm": 0.8850453495979309, + "learning_rate": 0.00013275459098497497, + "loss": 0.4703, + "step": 1012 + }, + { + "epoch": 3.1076095311299, + "grad_norm": 0.6243056058883667, + "learning_rate": 0.00013268781302170284, + "loss": 0.3225, + "step": 1013 + }, + { + "epoch": 3.110684089162183, + "grad_norm": 0.5218976736068726, + "learning_rate": 0.00013262103505843074, + "loss": 0.3424, + "step": 1014 + }, + { + "epoch": 3.113758647194466, + "grad_norm": 0.78139728307724, + "learning_rate": 0.00013255425709515862, + "loss": 0.3779, + "step": 1015 + }, + { + "epoch": 3.1168332052267487, + "grad_norm": 0.7067313194274902, + "learning_rate": 0.0001324874791318865, + "loss": 0.2979, + "step": 1016 + }, + { + "epoch": 3.1199077632590315, + "grad_norm": 0.5684396028518677, + "learning_rate": 0.00013242070116861436, + "loss": 0.3936, + "step": 1017 + }, + { + "epoch": 3.1229823212913144, + "grad_norm": 0.7444823384284973, + "learning_rate": 0.00013235392320534224, + "loss": 0.3771, + "step": 1018 + }, + { + "epoch": 3.126056879323597, + "grad_norm": 0.7313172221183777, + "learning_rate": 0.00013228714524207014, + "loss": 0.4764, + "step": 1019 + }, + { + "epoch": 3.12913143735588, + "grad_norm": 0.8631938099861145, + "learning_rate": 0.000132220367278798, + "loss": 0.4095, + "step": 1020 + }, + { + "epoch": 3.132205995388163, + "grad_norm": 0.745307445526123, + "learning_rate": 0.00013215358931552588, + "loss": 0.3906, + "step": 1021 + }, + { + "epoch": 3.1352805534204458, + "grad_norm": 0.7458917498588562, + "learning_rate": 0.00013208681135225376, + "loss": 0.4967, + "step": 1022 + }, + { + "epoch": 3.1383551114527286, + "grad_norm": 0.8067619204521179, + "learning_rate": 0.00013202003338898163, + "loss": 0.5558, + "step": 1023 + }, + { + "epoch": 3.1414296694850115, + "grad_norm": 0.8676497340202332, + "learning_rate": 0.00013195325542570953, + "loss": 0.4726, + "step": 1024 + }, + { + "epoch": 3.1445042275172943, + "grad_norm": 0.8006786108016968, + "learning_rate": 0.0001318864774624374, + "loss": 0.4352, + "step": 1025 + }, + { + "epoch": 3.147578785549577, + "grad_norm": 0.7759934067726135, + "learning_rate": 0.00013181969949916528, + "loss": 0.4349, + "step": 1026 + }, + { + "epoch": 3.15065334358186, + "grad_norm": 0.6713132262229919, + "learning_rate": 0.00013175292153589315, + "loss": 0.4152, + "step": 1027 + }, + { + "epoch": 3.153727901614143, + "grad_norm": 0.8547674417495728, + "learning_rate": 0.00013168614357262102, + "loss": 0.3794, + "step": 1028 + }, + { + "epoch": 3.1568024596464257, + "grad_norm": 0.8227840065956116, + "learning_rate": 0.00013161936560934892, + "loss": 0.4816, + "step": 1029 + }, + { + "epoch": 3.1598770176787085, + "grad_norm": 0.7303609251976013, + "learning_rate": 0.0001315525876460768, + "loss": 0.4662, + "step": 1030 + }, + { + "epoch": 3.1629515757109914, + "grad_norm": 0.7921698689460754, + "learning_rate": 0.0001314858096828047, + "loss": 0.4455, + "step": 1031 + }, + { + "epoch": 3.1660261337432742, + "grad_norm": 0.662651538848877, + "learning_rate": 0.00013141903171953257, + "loss": 0.3648, + "step": 1032 + }, + { + "epoch": 3.169100691775557, + "grad_norm": 0.8179068565368652, + "learning_rate": 0.00013135225375626044, + "loss": 0.4263, + "step": 1033 + }, + { + "epoch": 3.17217524980784, + "grad_norm": 0.5210088491439819, + "learning_rate": 0.00013128547579298832, + "loss": 0.3315, + "step": 1034 + }, + { + "epoch": 3.175249807840123, + "grad_norm": 0.7272716164588928, + "learning_rate": 0.00013121869782971622, + "loss": 0.4366, + "step": 1035 + }, + { + "epoch": 3.178324365872406, + "grad_norm": 0.8663350343704224, + "learning_rate": 0.0001311519198664441, + "loss": 0.4821, + "step": 1036 + }, + { + "epoch": 3.1813989239046885, + "grad_norm": 0.7915233373641968, + "learning_rate": 0.00013108514190317196, + "loss": 0.4452, + "step": 1037 + }, + { + "epoch": 3.1844734819369718, + "grad_norm": 0.8421617746353149, + "learning_rate": 0.00013101836393989983, + "loss": 0.4405, + "step": 1038 + }, + { + "epoch": 3.1875480399692546, + "grad_norm": 0.6787004470825195, + "learning_rate": 0.0001309515859766277, + "loss": 0.3901, + "step": 1039 + }, + { + "epoch": 3.1906225980015375, + "grad_norm": 0.6390913128852844, + "learning_rate": 0.0001308848080133556, + "loss": 0.3647, + "step": 1040 + }, + { + "epoch": 3.1936971560338203, + "grad_norm": 0.6698052883148193, + "learning_rate": 0.00013081803005008348, + "loss": 0.3983, + "step": 1041 + }, + { + "epoch": 3.196771714066103, + "grad_norm": 0.6419287919998169, + "learning_rate": 0.00013075125208681135, + "loss": 0.396, + "step": 1042 + }, + { + "epoch": 3.199846272098386, + "grad_norm": 0.7305182218551636, + "learning_rate": 0.00013068447412353923, + "loss": 0.4746, + "step": 1043 + }, + { + "epoch": 3.202920830130669, + "grad_norm": 1.1813292503356934, + "learning_rate": 0.0001306176961602671, + "loss": 0.3741, + "step": 1044 + }, + { + "epoch": 3.2059953881629517, + "grad_norm": 0.7310966849327087, + "learning_rate": 0.00013055091819699497, + "loss": 0.4184, + "step": 1045 + }, + { + "epoch": 3.2090699461952346, + "grad_norm": 0.5950028896331787, + "learning_rate": 0.00013048414023372287, + "loss": 0.3328, + "step": 1046 + }, + { + "epoch": 3.2121445042275174, + "grad_norm": 0.5876432657241821, + "learning_rate": 0.00013041736227045075, + "loss": 0.3244, + "step": 1047 + }, + { + "epoch": 3.2152190622598003, + "grad_norm": 0.7231600284576416, + "learning_rate": 0.00013035058430717865, + "loss": 0.432, + "step": 1048 + }, + { + "epoch": 3.218293620292083, + "grad_norm": 0.9304287433624268, + "learning_rate": 0.00013028380634390652, + "loss": 0.4804, + "step": 1049 + }, + { + "epoch": 3.221368178324366, + "grad_norm": 0.7582074999809265, + "learning_rate": 0.0001302170283806344, + "loss": 0.4306, + "step": 1050 + }, + { + "epoch": 3.224442736356649, + "grad_norm": 0.7604076266288757, + "learning_rate": 0.0001301502504173623, + "loss": 0.4444, + "step": 1051 + }, + { + "epoch": 3.2275172943889316, + "grad_norm": 0.9969847202301025, + "learning_rate": 0.00013008347245409017, + "loss": 0.4617, + "step": 1052 + }, + { + "epoch": 3.2305918524212145, + "grad_norm": 1.0649595260620117, + "learning_rate": 0.00013001669449081804, + "loss": 0.3696, + "step": 1053 + }, + { + "epoch": 3.2336664104534973, + "grad_norm": 0.6512593030929565, + "learning_rate": 0.00012994991652754591, + "loss": 0.3397, + "step": 1054 + }, + { + "epoch": 3.23674096848578, + "grad_norm": 0.9685459136962891, + "learning_rate": 0.0001298831385642738, + "loss": 0.4758, + "step": 1055 + }, + { + "epoch": 3.239815526518063, + "grad_norm": 0.8561423420906067, + "learning_rate": 0.0001298163606010017, + "loss": 0.404, + "step": 1056 + }, + { + "epoch": 3.242890084550346, + "grad_norm": 0.7284657955169678, + "learning_rate": 0.00012974958263772956, + "loss": 0.3061, + "step": 1057 + }, + { + "epoch": 3.2459646425826287, + "grad_norm": 0.7802515029907227, + "learning_rate": 0.00012968280467445743, + "loss": 0.441, + "step": 1058 + }, + { + "epoch": 3.2490392006149116, + "grad_norm": 0.7817832231521606, + "learning_rate": 0.0001296160267111853, + "loss": 0.4344, + "step": 1059 + }, + { + "epoch": 3.2521137586471944, + "grad_norm": 0.7306939959526062, + "learning_rate": 0.00012954924874791318, + "loss": 0.39, + "step": 1060 + }, + { + "epoch": 3.2551883166794773, + "grad_norm": 0.6479128003120422, + "learning_rate": 0.00012948247078464108, + "loss": 0.4264, + "step": 1061 + }, + { + "epoch": 3.25826287471176, + "grad_norm": 0.6551531553268433, + "learning_rate": 0.00012941569282136895, + "loss": 0.3445, + "step": 1062 + }, + { + "epoch": 3.261337432744043, + "grad_norm": 0.9258570075035095, + "learning_rate": 0.00012934891485809683, + "loss": 0.4762, + "step": 1063 + }, + { + "epoch": 3.264411990776326, + "grad_norm": 0.6978762149810791, + "learning_rate": 0.0001292821368948247, + "loss": 0.3335, + "step": 1064 + }, + { + "epoch": 3.2674865488086087, + "grad_norm": 0.7362371683120728, + "learning_rate": 0.0001292153589315526, + "loss": 0.4587, + "step": 1065 + }, + { + "epoch": 3.2705611068408915, + "grad_norm": 0.8437744975090027, + "learning_rate": 0.00012914858096828047, + "loss": 0.3999, + "step": 1066 + }, + { + "epoch": 3.2736356648731744, + "grad_norm": 1.0384852886199951, + "learning_rate": 0.00012908180300500837, + "loss": 0.4975, + "step": 1067 + }, + { + "epoch": 3.276710222905457, + "grad_norm": 0.6881088018417358, + "learning_rate": 0.00012901502504173625, + "loss": 0.3496, + "step": 1068 + }, + { + "epoch": 3.27978478093774, + "grad_norm": 0.6974284648895264, + "learning_rate": 0.00012894824707846412, + "loss": 0.3252, + "step": 1069 + }, + { + "epoch": 3.282859338970023, + "grad_norm": 0.6597744822502136, + "learning_rate": 0.000128881469115192, + "loss": 0.371, + "step": 1070 + }, + { + "epoch": 3.2859338970023058, + "grad_norm": 0.8249826431274414, + "learning_rate": 0.00012881469115191987, + "loss": 0.506, + "step": 1071 + }, + { + "epoch": 3.2890084550345886, + "grad_norm": 0.9232259392738342, + "learning_rate": 0.00012874791318864777, + "loss": 0.4379, + "step": 1072 + }, + { + "epoch": 3.2920830130668715, + "grad_norm": 0.7886003851890564, + "learning_rate": 0.00012868113522537564, + "loss": 0.3461, + "step": 1073 + }, + { + "epoch": 3.2951575710991543, + "grad_norm": 0.7593116760253906, + "learning_rate": 0.0001286143572621035, + "loss": 0.3498, + "step": 1074 + }, + { + "epoch": 3.2982321291314376, + "grad_norm": 0.6457827091217041, + "learning_rate": 0.00012854757929883139, + "loss": 0.4303, + "step": 1075 + }, + { + "epoch": 3.30130668716372, + "grad_norm": 0.7233401536941528, + "learning_rate": 0.00012848080133555926, + "loss": 0.3888, + "step": 1076 + }, + { + "epoch": 3.3043812451960033, + "grad_norm": 0.6364323496818542, + "learning_rate": 0.00012841402337228716, + "loss": 0.4483, + "step": 1077 + }, + { + "epoch": 3.3074558032282857, + "grad_norm": 0.7049786448478699, + "learning_rate": 0.00012834724540901503, + "loss": 0.3465, + "step": 1078 + }, + { + "epoch": 3.310530361260569, + "grad_norm": 0.6245574951171875, + "learning_rate": 0.0001282804674457429, + "loss": 0.2893, + "step": 1079 + }, + { + "epoch": 3.313604919292852, + "grad_norm": 0.8154529929161072, + "learning_rate": 0.00012821368948247078, + "loss": 0.4662, + "step": 1080 + }, + { + "epoch": 3.3166794773251347, + "grad_norm": 0.7856273651123047, + "learning_rate": 0.00012814691151919865, + "loss": 0.4118, + "step": 1081 + }, + { + "epoch": 3.3197540353574175, + "grad_norm": 0.7315744757652283, + "learning_rate": 0.00012808013355592655, + "loss": 0.3951, + "step": 1082 + }, + { + "epoch": 3.3228285933897004, + "grad_norm": 0.7132816910743713, + "learning_rate": 0.00012801335559265442, + "loss": 0.3028, + "step": 1083 + }, + { + "epoch": 3.3259031514219832, + "grad_norm": 0.7761401534080505, + "learning_rate": 0.00012794657762938233, + "loss": 0.4595, + "step": 1084 + }, + { + "epoch": 3.328977709454266, + "grad_norm": 0.6712490916252136, + "learning_rate": 0.0001278797996661102, + "loss": 0.3146, + "step": 1085 + }, + { + "epoch": 3.332052267486549, + "grad_norm": 0.6390141248703003, + "learning_rate": 0.00012781302170283807, + "loss": 0.5137, + "step": 1086 + }, + { + "epoch": 3.3351268255188318, + "grad_norm": 0.6065652370452881, + "learning_rate": 0.00012774624373956594, + "loss": 0.2764, + "step": 1087 + }, + { + "epoch": 3.3382013835511146, + "grad_norm": 0.9247648119926453, + "learning_rate": 0.00012767946577629384, + "loss": 0.6112, + "step": 1088 + }, + { + "epoch": 3.3412759415833975, + "grad_norm": 0.8865838050842285, + "learning_rate": 0.00012761268781302172, + "loss": 0.5974, + "step": 1089 + }, + { + "epoch": 3.3443504996156803, + "grad_norm": 0.5885515809059143, + "learning_rate": 0.0001275459098497496, + "loss": 0.2611, + "step": 1090 + }, + { + "epoch": 3.347425057647963, + "grad_norm": 0.814175546169281, + "learning_rate": 0.00012747913188647746, + "loss": 0.3653, + "step": 1091 + }, + { + "epoch": 3.350499615680246, + "grad_norm": 0.6553864479064941, + "learning_rate": 0.00012741235392320534, + "loss": 0.3619, + "step": 1092 + }, + { + "epoch": 3.353574173712529, + "grad_norm": 0.8142261505126953, + "learning_rate": 0.00012734557595993324, + "loss": 0.5629, + "step": 1093 + }, + { + "epoch": 3.3566487317448117, + "grad_norm": 0.8324840664863586, + "learning_rate": 0.0001272787979966611, + "loss": 0.3358, + "step": 1094 + }, + { + "epoch": 3.3597232897770946, + "grad_norm": 0.7623977065086365, + "learning_rate": 0.00012721202003338898, + "loss": 0.4203, + "step": 1095 + }, + { + "epoch": 3.3627978478093774, + "grad_norm": 0.877435564994812, + "learning_rate": 0.00012714524207011686, + "loss": 0.4465, + "step": 1096 + }, + { + "epoch": 3.3658724058416603, + "grad_norm": 0.8097100257873535, + "learning_rate": 0.00012707846410684473, + "loss": 0.5269, + "step": 1097 + }, + { + "epoch": 3.368946963873943, + "grad_norm": 0.6663544178009033, + "learning_rate": 0.0001270116861435726, + "loss": 0.3844, + "step": 1098 + }, + { + "epoch": 3.372021521906226, + "grad_norm": 0.6997086405754089, + "learning_rate": 0.0001269449081803005, + "loss": 0.3691, + "step": 1099 + }, + { + "epoch": 3.375096079938509, + "grad_norm": 0.9864381551742554, + "learning_rate": 0.00012687813021702838, + "loss": 0.4578, + "step": 1100 + }, + { + "epoch": 3.3781706379707916, + "grad_norm": 0.9177810549736023, + "learning_rate": 0.00012681135225375628, + "loss": 0.5168, + "step": 1101 + }, + { + "epoch": 3.3812451960030745, + "grad_norm": 0.7557141184806824, + "learning_rate": 0.00012674457429048415, + "loss": 0.4225, + "step": 1102 + }, + { + "epoch": 3.3843197540353573, + "grad_norm": 0.6452154517173767, + "learning_rate": 0.00012667779632721202, + "loss": 0.4318, + "step": 1103 + }, + { + "epoch": 3.38739431206764, + "grad_norm": 0.7105704545974731, + "learning_rate": 0.00012661101836393992, + "loss": 0.3128, + "step": 1104 + }, + { + "epoch": 3.390468870099923, + "grad_norm": 0.7198373079299927, + "learning_rate": 0.0001265442404006678, + "loss": 0.4633, + "step": 1105 + }, + { + "epoch": 3.393543428132206, + "grad_norm": 0.7540241479873657, + "learning_rate": 0.00012647746243739567, + "loss": 0.4966, + "step": 1106 + }, + { + "epoch": 3.3966179861644887, + "grad_norm": 0.7719717025756836, + "learning_rate": 0.00012641068447412354, + "loss": 0.4614, + "step": 1107 + }, + { + "epoch": 3.3996925441967716, + "grad_norm": 1.0767078399658203, + "learning_rate": 0.00012634390651085142, + "loss": 0.5644, + "step": 1108 + }, + { + "epoch": 3.4027671022290544, + "grad_norm": 0.7565367817878723, + "learning_rate": 0.00012627712854757932, + "loss": 0.4629, + "step": 1109 + }, + { + "epoch": 3.4058416602613373, + "grad_norm": 0.7821168899536133, + "learning_rate": 0.0001262103505843072, + "loss": 0.4701, + "step": 1110 + }, + { + "epoch": 3.40891621829362, + "grad_norm": 0.8335303664207458, + "learning_rate": 0.00012614357262103506, + "loss": 0.4421, + "step": 1111 + }, + { + "epoch": 3.411990776325903, + "grad_norm": 0.6488150954246521, + "learning_rate": 0.00012607679465776294, + "loss": 0.3861, + "step": 1112 + }, + { + "epoch": 3.415065334358186, + "grad_norm": 0.6686526536941528, + "learning_rate": 0.0001260100166944908, + "loss": 0.4383, + "step": 1113 + }, + { + "epoch": 3.418139892390469, + "grad_norm": 0.7704545855522156, + "learning_rate": 0.00012594323873121868, + "loss": 0.5089, + "step": 1114 + }, + { + "epoch": 3.4212144504227515, + "grad_norm": 0.8212313652038574, + "learning_rate": 0.00012587646076794658, + "loss": 0.3403, + "step": 1115 + }, + { + "epoch": 3.424289008455035, + "grad_norm": 0.6802902817726135, + "learning_rate": 0.00012580968280467446, + "loss": 0.3938, + "step": 1116 + }, + { + "epoch": 3.427363566487317, + "grad_norm": 0.6224616169929504, + "learning_rate": 0.00012574290484140233, + "loss": 0.4623, + "step": 1117 + }, + { + "epoch": 3.4304381245196005, + "grad_norm": 0.5823367834091187, + "learning_rate": 0.00012567612687813023, + "loss": 0.4328, + "step": 1118 + }, + { + "epoch": 3.4335126825518834, + "grad_norm": 0.5620308518409729, + "learning_rate": 0.0001256093489148581, + "loss": 0.3416, + "step": 1119 + }, + { + "epoch": 3.436587240584166, + "grad_norm": 0.8712018728256226, + "learning_rate": 0.000125542570951586, + "loss": 0.3876, + "step": 1120 + }, + { + "epoch": 3.439661798616449, + "grad_norm": 0.766313910484314, + "learning_rate": 0.00012547579298831388, + "loss": 0.5371, + "step": 1121 + }, + { + "epoch": 3.442736356648732, + "grad_norm": 0.8842399716377258, + "learning_rate": 0.00012540901502504175, + "loss": 0.3864, + "step": 1122 + }, + { + "epoch": 3.4458109146810147, + "grad_norm": 0.8501667976379395, + "learning_rate": 0.00012534223706176962, + "loss": 0.5317, + "step": 1123 + }, + { + "epoch": 3.4488854727132976, + "grad_norm": 0.8099106550216675, + "learning_rate": 0.0001252754590984975, + "loss": 0.4038, + "step": 1124 + }, + { + "epoch": 3.4519600307455804, + "grad_norm": 0.6638100743293762, + "learning_rate": 0.0001252086811352254, + "loss": 0.3981, + "step": 1125 + }, + { + "epoch": 3.4550345887778633, + "grad_norm": 0.713429868221283, + "learning_rate": 0.00012514190317195327, + "loss": 0.474, + "step": 1126 + }, + { + "epoch": 3.458109146810146, + "grad_norm": 0.6736339330673218, + "learning_rate": 0.00012507512520868114, + "loss": 0.3764, + "step": 1127 + }, + { + "epoch": 3.461183704842429, + "grad_norm": 0.7324123382568359, + "learning_rate": 0.00012500834724540902, + "loss": 0.4455, + "step": 1128 + }, + { + "epoch": 3.464258262874712, + "grad_norm": 0.7468026876449585, + "learning_rate": 0.0001249415692821369, + "loss": 0.4044, + "step": 1129 + }, + { + "epoch": 3.4673328209069947, + "grad_norm": 0.7653748393058777, + "learning_rate": 0.00012487479131886476, + "loss": 0.4189, + "step": 1130 + }, + { + "epoch": 3.4704073789392775, + "grad_norm": 0.8756456971168518, + "learning_rate": 0.00012480801335559266, + "loss": 0.5738, + "step": 1131 + }, + { + "epoch": 3.4734819369715604, + "grad_norm": 0.9344881772994995, + "learning_rate": 0.00012474123539232053, + "loss": 0.3945, + "step": 1132 + }, + { + "epoch": 3.4765564950038432, + "grad_norm": 0.736493706703186, + "learning_rate": 0.0001246744574290484, + "loss": 0.3714, + "step": 1133 + }, + { + "epoch": 3.479631053036126, + "grad_norm": 0.678229570388794, + "learning_rate": 0.0001246076794657763, + "loss": 0.4412, + "step": 1134 + }, + { + "epoch": 3.482705611068409, + "grad_norm": 0.9181579351425171, + "learning_rate": 0.00012454090150250418, + "loss": 0.4796, + "step": 1135 + }, + { + "epoch": 3.4857801691006918, + "grad_norm": 0.7823171019554138, + "learning_rate": 0.00012447412353923208, + "loss": 0.357, + "step": 1136 + }, + { + "epoch": 3.4888547271329746, + "grad_norm": 0.7010154724121094, + "learning_rate": 0.00012440734557595995, + "loss": 0.3073, + "step": 1137 + }, + { + "epoch": 3.4919292851652575, + "grad_norm": 0.8835572004318237, + "learning_rate": 0.00012434056761268783, + "loss": 0.5125, + "step": 1138 + }, + { + "epoch": 3.4950038431975403, + "grad_norm": 1.2275294065475464, + "learning_rate": 0.0001242737896494157, + "loss": 0.4167, + "step": 1139 + }, + { + "epoch": 3.498078401229823, + "grad_norm": 0.7526091933250427, + "learning_rate": 0.00012420701168614357, + "loss": 0.4264, + "step": 1140 + }, + { + "epoch": 3.501152959262106, + "grad_norm": 0.9319266080856323, + "learning_rate": 0.00012414023372287147, + "loss": 0.5189, + "step": 1141 + }, + { + "epoch": 3.504227517294389, + "grad_norm": 0.9764059782028198, + "learning_rate": 0.00012407345575959935, + "loss": 0.3808, + "step": 1142 + }, + { + "epoch": 3.5073020753266717, + "grad_norm": 0.891604483127594, + "learning_rate": 0.00012400667779632722, + "loss": 0.4733, + "step": 1143 + }, + { + "epoch": 3.5103766333589546, + "grad_norm": 0.985975444316864, + "learning_rate": 0.0001239398998330551, + "loss": 0.4055, + "step": 1144 + }, + { + "epoch": 3.5134511913912374, + "grad_norm": 0.9841047525405884, + "learning_rate": 0.00012387312186978297, + "loss": 0.4578, + "step": 1145 + }, + { + "epoch": 3.5165257494235203, + "grad_norm": 0.8037697672843933, + "learning_rate": 0.00012380634390651084, + "loss": 0.4271, + "step": 1146 + }, + { + "epoch": 3.519600307455803, + "grad_norm": 0.9044193625450134, + "learning_rate": 0.00012373956594323874, + "loss": 0.4559, + "step": 1147 + }, + { + "epoch": 3.522674865488086, + "grad_norm": 0.7791280746459961, + "learning_rate": 0.00012367278797996661, + "loss": 0.4635, + "step": 1148 + }, + { + "epoch": 3.525749423520369, + "grad_norm": 0.8410618901252747, + "learning_rate": 0.0001236060100166945, + "loss": 0.3929, + "step": 1149 + }, + { + "epoch": 3.5288239815526516, + "grad_norm": 0.7505420446395874, + "learning_rate": 0.00012353923205342236, + "loss": 0.4405, + "step": 1150 + }, + { + "epoch": 3.531898539584935, + "grad_norm": 0.6377079486846924, + "learning_rate": 0.00012347245409015026, + "loss": 0.3495, + "step": 1151 + }, + { + "epoch": 3.5349730976172173, + "grad_norm": 0.8469225168228149, + "learning_rate": 0.00012340567612687813, + "loss": 0.4331, + "step": 1152 + }, + { + "epoch": 3.5380476556495006, + "grad_norm": 0.5318998694419861, + "learning_rate": 0.00012333889816360603, + "loss": 0.4056, + "step": 1153 + }, + { + "epoch": 3.541122213681783, + "grad_norm": 0.8198487162590027, + "learning_rate": 0.0001232721202003339, + "loss": 0.5742, + "step": 1154 + }, + { + "epoch": 3.5441967717140663, + "grad_norm": 0.8151354789733887, + "learning_rate": 0.00012320534223706178, + "loss": 0.4746, + "step": 1155 + }, + { + "epoch": 3.5472713297463487, + "grad_norm": 0.8636469841003418, + "learning_rate": 0.00012313856427378965, + "loss": 0.6195, + "step": 1156 + }, + { + "epoch": 3.550345887778632, + "grad_norm": 0.9126644730567932, + "learning_rate": 0.00012307178631051755, + "loss": 0.4336, + "step": 1157 + }, + { + "epoch": 3.5534204458109144, + "grad_norm": 0.8281782269477844, + "learning_rate": 0.00012300500834724543, + "loss": 0.4048, + "step": 1158 + }, + { + "epoch": 3.5564950038431977, + "grad_norm": 0.9562798738479614, + "learning_rate": 0.0001229382303839733, + "loss": 0.4943, + "step": 1159 + }, + { + "epoch": 3.55956956187548, + "grad_norm": 0.7244289517402649, + "learning_rate": 0.00012287145242070117, + "loss": 0.3413, + "step": 1160 + }, + { + "epoch": 3.5626441199077634, + "grad_norm": 0.9391937851905823, + "learning_rate": 0.00012280467445742905, + "loss": 0.415, + "step": 1161 + }, + { + "epoch": 3.5657186779400463, + "grad_norm": 0.6994863748550415, + "learning_rate": 0.00012273789649415692, + "loss": 0.4256, + "step": 1162 + }, + { + "epoch": 3.568793235972329, + "grad_norm": 0.7271562814712524, + "learning_rate": 0.00012267111853088482, + "loss": 0.4266, + "step": 1163 + }, + { + "epoch": 3.571867794004612, + "grad_norm": 0.7303061485290527, + "learning_rate": 0.0001226043405676127, + "loss": 0.4692, + "step": 1164 + }, + { + "epoch": 3.574942352036895, + "grad_norm": 1.049743890762329, + "learning_rate": 0.00012253756260434057, + "loss": 0.5633, + "step": 1165 + }, + { + "epoch": 3.5780169100691777, + "grad_norm": 0.6518731713294983, + "learning_rate": 0.00012247078464106844, + "loss": 0.3834, + "step": 1166 + }, + { + "epoch": 3.5810914681014605, + "grad_norm": 0.680600643157959, + "learning_rate": 0.0001224040066777963, + "loss": 0.4485, + "step": 1167 + }, + { + "epoch": 3.5841660261337434, + "grad_norm": 0.6864722967147827, + "learning_rate": 0.0001223372287145242, + "loss": 0.4605, + "step": 1168 + }, + { + "epoch": 3.587240584166026, + "grad_norm": 0.7405598759651184, + "learning_rate": 0.00012227045075125209, + "loss": 0.4041, + "step": 1169 + }, + { + "epoch": 3.590315142198309, + "grad_norm": 0.675830602645874, + "learning_rate": 0.00012220367278797999, + "loss": 0.3742, + "step": 1170 + }, + { + "epoch": 3.593389700230592, + "grad_norm": 0.8901248574256897, + "learning_rate": 0.00012213689482470786, + "loss": 0.4401, + "step": 1171 + }, + { + "epoch": 3.5964642582628747, + "grad_norm": 0.6679547429084778, + "learning_rate": 0.00012207011686143572, + "loss": 0.4705, + "step": 1172 + }, + { + "epoch": 3.5995388162951576, + "grad_norm": 0.8528178930282593, + "learning_rate": 0.00012200333889816362, + "loss": 0.4257, + "step": 1173 + }, + { + "epoch": 3.6026133743274404, + "grad_norm": 0.9046573042869568, + "learning_rate": 0.00012193656093489149, + "loss": 0.374, + "step": 1174 + }, + { + "epoch": 3.6056879323597233, + "grad_norm": 0.6642177700996399, + "learning_rate": 0.00012186978297161938, + "loss": 0.4346, + "step": 1175 + }, + { + "epoch": 3.608762490392006, + "grad_norm": 0.7178785800933838, + "learning_rate": 0.00012180300500834725, + "loss": 0.395, + "step": 1176 + }, + { + "epoch": 3.611837048424289, + "grad_norm": 0.8669521808624268, + "learning_rate": 0.00012173622704507512, + "loss": 0.5228, + "step": 1177 + }, + { + "epoch": 3.614911606456572, + "grad_norm": 0.6138285398483276, + "learning_rate": 0.00012166944908180303, + "loss": 0.4007, + "step": 1178 + }, + { + "epoch": 3.6179861644888547, + "grad_norm": 1.0008139610290527, + "learning_rate": 0.0001216026711185309, + "loss": 0.4103, + "step": 1179 + }, + { + "epoch": 3.6210607225211375, + "grad_norm": 0.666658341884613, + "learning_rate": 0.00012153589315525877, + "loss": 0.383, + "step": 1180 + }, + { + "epoch": 3.6241352805534204, + "grad_norm": 0.8966631293296814, + "learning_rate": 0.00012146911519198664, + "loss": 0.5084, + "step": 1181 + }, + { + "epoch": 3.6272098385857032, + "grad_norm": 0.8953879475593567, + "learning_rate": 0.00012140233722871452, + "loss": 0.5283, + "step": 1182 + }, + { + "epoch": 3.630284396617986, + "grad_norm": 0.7656745314598083, + "learning_rate": 0.0001213355592654424, + "loss": 0.5076, + "step": 1183 + }, + { + "epoch": 3.633358954650269, + "grad_norm": 0.7582895159721375, + "learning_rate": 0.00012126878130217029, + "loss": 0.4206, + "step": 1184 + }, + { + "epoch": 3.6364335126825518, + "grad_norm": 0.8229513764381409, + "learning_rate": 0.00012120200333889818, + "loss": 0.5156, + "step": 1185 + }, + { + "epoch": 3.6395080707148346, + "grad_norm": 0.5379828810691833, + "learning_rate": 0.00012113522537562605, + "loss": 0.3438, + "step": 1186 + }, + { + "epoch": 3.6425826287471175, + "grad_norm": 0.6136037111282349, + "learning_rate": 0.00012106844741235392, + "loss": 0.4394, + "step": 1187 + }, + { + "epoch": 3.6456571867794003, + "grad_norm": 1.137101411819458, + "learning_rate": 0.0001210016694490818, + "loss": 0.5354, + "step": 1188 + }, + { + "epoch": 3.648731744811683, + "grad_norm": 0.6826598048210144, + "learning_rate": 0.0001209348914858097, + "loss": 0.3609, + "step": 1189 + }, + { + "epoch": 3.6518063028439665, + "grad_norm": 0.7083644270896912, + "learning_rate": 0.00012086811352253757, + "loss": 0.3692, + "step": 1190 + }, + { + "epoch": 3.654880860876249, + "grad_norm": 0.8692861199378967, + "learning_rate": 0.00012080133555926544, + "loss": 0.3899, + "step": 1191 + }, + { + "epoch": 3.657955418908532, + "grad_norm": 0.5692325234413147, + "learning_rate": 0.00012073455759599333, + "loss": 0.4135, + "step": 1192 + }, + { + "epoch": 3.6610299769408146, + "grad_norm": 0.6517208218574524, + "learning_rate": 0.0001206677796327212, + "loss": 0.4944, + "step": 1193 + }, + { + "epoch": 3.664104534973098, + "grad_norm": 1.0306694507598877, + "learning_rate": 0.0001206010016694491, + "loss": 0.3471, + "step": 1194 + }, + { + "epoch": 3.6671790930053803, + "grad_norm": 0.6560060977935791, + "learning_rate": 0.00012053422370617698, + "loss": 0.421, + "step": 1195 + }, + { + "epoch": 3.6702536510376635, + "grad_norm": 0.7117607593536377, + "learning_rate": 0.00012046744574290485, + "loss": 0.3881, + "step": 1196 + }, + { + "epoch": 3.673328209069946, + "grad_norm": 0.931069552898407, + "learning_rate": 0.00012040066777963272, + "loss": 0.5125, + "step": 1197 + }, + { + "epoch": 3.6764027671022292, + "grad_norm": 0.7183043956756592, + "learning_rate": 0.0001203338898163606, + "loss": 0.3876, + "step": 1198 + }, + { + "epoch": 3.6794773251345116, + "grad_norm": 0.6167232394218445, + "learning_rate": 0.00012026711185308848, + "loss": 0.432, + "step": 1199 + }, + { + "epoch": 3.682551883166795, + "grad_norm": 0.7681392431259155, + "learning_rate": 0.00012020033388981637, + "loss": 0.3579, + "step": 1200 + }, + { + "epoch": 3.685626441199078, + "grad_norm": 0.6500406861305237, + "learning_rate": 0.00012013355592654426, + "loss": 0.3566, + "step": 1201 + }, + { + "epoch": 3.6887009992313606, + "grad_norm": 0.6759480237960815, + "learning_rate": 0.00012006677796327213, + "loss": 0.3474, + "step": 1202 + }, + { + "epoch": 3.6917755572636435, + "grad_norm": 0.7032824158668518, + "learning_rate": 0.00012, + "loss": 0.4049, + "step": 1203 + }, + { + "epoch": 3.6948501152959263, + "grad_norm": 0.7631069421768188, + "learning_rate": 0.00011993322203672788, + "loss": 0.3905, + "step": 1204 + }, + { + "epoch": 3.697924673328209, + "grad_norm": 0.7755546569824219, + "learning_rate": 0.00011986644407345578, + "loss": 0.4175, + "step": 1205 + }, + { + "epoch": 3.700999231360492, + "grad_norm": 0.5792478919029236, + "learning_rate": 0.00011979966611018365, + "loss": 0.3005, + "step": 1206 + }, + { + "epoch": 3.704073789392775, + "grad_norm": 0.7339358925819397, + "learning_rate": 0.00011973288814691152, + "loss": 0.417, + "step": 1207 + }, + { + "epoch": 3.7071483474250577, + "grad_norm": 0.8882247805595398, + "learning_rate": 0.0001196661101836394, + "loss": 0.4666, + "step": 1208 + }, + { + "epoch": 3.7102229054573406, + "grad_norm": 0.727995753288269, + "learning_rate": 0.00011959933222036728, + "loss": 0.417, + "step": 1209 + }, + { + "epoch": 3.7132974634896234, + "grad_norm": 0.9979139566421509, + "learning_rate": 0.00011953255425709517, + "loss": 0.4272, + "step": 1210 + }, + { + "epoch": 3.7163720215219063, + "grad_norm": 0.9539368152618408, + "learning_rate": 0.00011946577629382306, + "loss": 0.5065, + "step": 1211 + }, + { + "epoch": 3.719446579554189, + "grad_norm": 0.7285603880882263, + "learning_rate": 0.00011939899833055093, + "loss": 0.4355, + "step": 1212 + }, + { + "epoch": 3.722521137586472, + "grad_norm": 0.8624237179756165, + "learning_rate": 0.0001193322203672788, + "loss": 0.4397, + "step": 1213 + }, + { + "epoch": 3.725595695618755, + "grad_norm": 0.9688683748245239, + "learning_rate": 0.00011926544240400668, + "loss": 0.491, + "step": 1214 + }, + { + "epoch": 3.7286702536510377, + "grad_norm": 0.8200318813323975, + "learning_rate": 0.00011919866444073455, + "loss": 0.4199, + "step": 1215 + }, + { + "epoch": 3.7317448116833205, + "grad_norm": 0.7483800053596497, + "learning_rate": 0.00011913188647746245, + "loss": 0.3424, + "step": 1216 + }, + { + "epoch": 3.7348193697156034, + "grad_norm": 1.0238198041915894, + "learning_rate": 0.00011906510851419032, + "loss": 0.3969, + "step": 1217 + }, + { + "epoch": 3.737893927747886, + "grad_norm": 0.924199104309082, + "learning_rate": 0.00011899833055091821, + "loss": 0.5159, + "step": 1218 + }, + { + "epoch": 3.740968485780169, + "grad_norm": 0.8077093362808228, + "learning_rate": 0.00011893155258764608, + "loss": 0.4923, + "step": 1219 + }, + { + "epoch": 3.744043043812452, + "grad_norm": 0.9883623719215393, + "learning_rate": 0.00011886477462437396, + "loss": 0.5852, + "step": 1220 + }, + { + "epoch": 3.7471176018447347, + "grad_norm": 0.8903137445449829, + "learning_rate": 0.00011879799666110186, + "loss": 0.4461, + "step": 1221 + }, + { + "epoch": 3.7501921598770176, + "grad_norm": 0.8356419205665588, + "learning_rate": 0.00011873121869782973, + "loss": 0.384, + "step": 1222 + }, + { + "epoch": 3.7532667179093004, + "grad_norm": 0.6669814586639404, + "learning_rate": 0.0001186644407345576, + "loss": 0.4374, + "step": 1223 + }, + { + "epoch": 3.7563412759415833, + "grad_norm": 0.8386452794075012, + "learning_rate": 0.00011859766277128547, + "loss": 0.5073, + "step": 1224 + }, + { + "epoch": 3.759415833973866, + "grad_norm": 0.7137802243232727, + "learning_rate": 0.00011853088480801335, + "loss": 0.4911, + "step": 1225 + }, + { + "epoch": 3.762490392006149, + "grad_norm": 0.9081368446350098, + "learning_rate": 0.00011846410684474125, + "loss": 0.4414, + "step": 1226 + }, + { + "epoch": 3.765564950038432, + "grad_norm": 0.625066876411438, + "learning_rate": 0.00011839732888146912, + "loss": 0.3807, + "step": 1227 + }, + { + "epoch": 3.7686395080707147, + "grad_norm": 0.7176731824874878, + "learning_rate": 0.00011833055091819701, + "loss": 0.4615, + "step": 1228 + }, + { + "epoch": 3.7717140661029975, + "grad_norm": 0.8574363589286804, + "learning_rate": 0.00011826377295492488, + "loss": 0.4909, + "step": 1229 + }, + { + "epoch": 3.7747886241352804, + "grad_norm": 0.7505884766578674, + "learning_rate": 0.00011819699499165275, + "loss": 0.5507, + "step": 1230 + }, + { + "epoch": 3.7778631821675637, + "grad_norm": 0.6918272972106934, + "learning_rate": 0.00011813021702838063, + "loss": 0.424, + "step": 1231 + }, + { + "epoch": 3.780937740199846, + "grad_norm": 0.4620833992958069, + "learning_rate": 0.00011806343906510853, + "loss": 0.3651, + "step": 1232 + }, + { + "epoch": 3.7840122982321294, + "grad_norm": 0.8369824290275574, + "learning_rate": 0.0001179966611018364, + "loss": 0.4467, + "step": 1233 + }, + { + "epoch": 3.7870868562644118, + "grad_norm": 0.7672296762466431, + "learning_rate": 0.00011792988313856427, + "loss": 0.475, + "step": 1234 + }, + { + "epoch": 3.790161414296695, + "grad_norm": 0.8405357003211975, + "learning_rate": 0.00011786310517529216, + "loss": 0.5173, + "step": 1235 + }, + { + "epoch": 3.7932359723289775, + "grad_norm": 0.7033690810203552, + "learning_rate": 0.00011779632721202003, + "loss": 0.4607, + "step": 1236 + }, + { + "epoch": 3.7963105303612608, + "grad_norm": 0.6288658380508423, + "learning_rate": 0.00011772954924874793, + "loss": 0.4055, + "step": 1237 + }, + { + "epoch": 3.799385088393543, + "grad_norm": 0.690845787525177, + "learning_rate": 0.00011766277128547581, + "loss": 0.4283, + "step": 1238 + }, + { + "epoch": 3.8024596464258265, + "grad_norm": 0.6428495049476624, + "learning_rate": 0.00011759599332220368, + "loss": 0.3298, + "step": 1239 + }, + { + "epoch": 3.8055342044581093, + "grad_norm": 0.658479630947113, + "learning_rate": 0.00011752921535893155, + "loss": 0.4336, + "step": 1240 + }, + { + "epoch": 3.808608762490392, + "grad_norm": 0.7378556728363037, + "learning_rate": 0.00011746243739565943, + "loss": 0.3664, + "step": 1241 + }, + { + "epoch": 3.811683320522675, + "grad_norm": 0.8548963069915771, + "learning_rate": 0.00011739565943238733, + "loss": 0.4573, + "step": 1242 + }, + { + "epoch": 3.814757878554958, + "grad_norm": 0.7019163966178894, + "learning_rate": 0.0001173288814691152, + "loss": 0.4826, + "step": 1243 + }, + { + "epoch": 3.8178324365872407, + "grad_norm": 1.227756142616272, + "learning_rate": 0.00011726210350584307, + "loss": 0.6298, + "step": 1244 + }, + { + "epoch": 3.8209069946195235, + "grad_norm": 0.8075862526893616, + "learning_rate": 0.00011719532554257096, + "loss": 0.4412, + "step": 1245 + }, + { + "epoch": 3.8239815526518064, + "grad_norm": 0.8187466859817505, + "learning_rate": 0.00011712854757929883, + "loss": 0.4675, + "step": 1246 + }, + { + "epoch": 3.8270561106840892, + "grad_norm": 0.916185200214386, + "learning_rate": 0.0001170617696160267, + "loss": 0.3927, + "step": 1247 + }, + { + "epoch": 3.830130668716372, + "grad_norm": 0.8163374066352844, + "learning_rate": 0.0001169949916527546, + "loss": 0.4642, + "step": 1248 + }, + { + "epoch": 3.833205226748655, + "grad_norm": 0.8225308656692505, + "learning_rate": 0.00011692821368948248, + "loss": 0.5168, + "step": 1249 + }, + { + "epoch": 3.836279784780938, + "grad_norm": 0.931461751461029, + "learning_rate": 0.00011686143572621035, + "loss": 0.3826, + "step": 1250 + }, + { + "epoch": 3.8393543428132206, + "grad_norm": 0.8172028064727783, + "learning_rate": 0.00011679465776293823, + "loss": 0.4679, + "step": 1251 + }, + { + "epoch": 3.8424289008455035, + "grad_norm": 0.9193819165229797, + "learning_rate": 0.00011672787979966611, + "loss": 0.5616, + "step": 1252 + }, + { + "epoch": 3.8455034588777863, + "grad_norm": 0.6455274224281311, + "learning_rate": 0.000116661101836394, + "loss": 0.3846, + "step": 1253 + }, + { + "epoch": 3.848578016910069, + "grad_norm": 0.7567316889762878, + "learning_rate": 0.00011659432387312189, + "loss": 0.3945, + "step": 1254 + }, + { + "epoch": 3.851652574942352, + "grad_norm": 0.7793917059898376, + "learning_rate": 0.00011652754590984976, + "loss": 0.4302, + "step": 1255 + }, + { + "epoch": 3.854727132974635, + "grad_norm": 0.7038170099258423, + "learning_rate": 0.00011646076794657763, + "loss": 0.3853, + "step": 1256 + }, + { + "epoch": 3.8578016910069177, + "grad_norm": 0.5196588039398193, + "learning_rate": 0.0001163939899833055, + "loss": 0.2655, + "step": 1257 + }, + { + "epoch": 3.8608762490392006, + "grad_norm": 0.70482337474823, + "learning_rate": 0.0001163272120200334, + "loss": 0.4513, + "step": 1258 + }, + { + "epoch": 3.8639508070714834, + "grad_norm": 0.828891396522522, + "learning_rate": 0.00011626043405676128, + "loss": 0.4812, + "step": 1259 + }, + { + "epoch": 3.8670253651037663, + "grad_norm": 0.7358651161193848, + "learning_rate": 0.00011619365609348915, + "loss": 0.3227, + "step": 1260 + }, + { + "epoch": 3.870099923136049, + "grad_norm": 0.6543817520141602, + "learning_rate": 0.00011612687813021703, + "loss": 0.3776, + "step": 1261 + }, + { + "epoch": 3.873174481168332, + "grad_norm": 0.6436611413955688, + "learning_rate": 0.00011606010016694491, + "loss": 0.3741, + "step": 1262 + }, + { + "epoch": 3.876249039200615, + "grad_norm": 0.6203712224960327, + "learning_rate": 0.00011599332220367279, + "loss": 0.3884, + "step": 1263 + }, + { + "epoch": 3.8793235972328977, + "grad_norm": 0.7520287036895752, + "learning_rate": 0.00011592654424040069, + "loss": 0.5706, + "step": 1264 + }, + { + "epoch": 3.8823981552651805, + "grad_norm": 0.7709315419197083, + "learning_rate": 0.00011585976627712856, + "loss": 0.3824, + "step": 1265 + }, + { + "epoch": 3.8854727132974634, + "grad_norm": 0.6220033764839172, + "learning_rate": 0.00011579298831385643, + "loss": 0.3308, + "step": 1266 + }, + { + "epoch": 3.888547271329746, + "grad_norm": 0.7906895279884338, + "learning_rate": 0.0001157262103505843, + "loss": 0.5238, + "step": 1267 + }, + { + "epoch": 3.891621829362029, + "grad_norm": 0.693013608455658, + "learning_rate": 0.00011565943238731218, + "loss": 0.5147, + "step": 1268 + }, + { + "epoch": 3.894696387394312, + "grad_norm": 0.6043047904968262, + "learning_rate": 0.00011559265442404008, + "loss": 0.2871, + "step": 1269 + }, + { + "epoch": 3.897770945426595, + "grad_norm": 0.560471773147583, + "learning_rate": 0.00011552587646076795, + "loss": 0.4276, + "step": 1270 + }, + { + "epoch": 3.9008455034588776, + "grad_norm": 0.7022919654846191, + "learning_rate": 0.00011545909849749584, + "loss": 0.4981, + "step": 1271 + }, + { + "epoch": 3.903920061491161, + "grad_norm": 0.933049201965332, + "learning_rate": 0.00011539232053422371, + "loss": 0.4203, + "step": 1272 + }, + { + "epoch": 3.9069946195234433, + "grad_norm": 0.6328878998756409, + "learning_rate": 0.00011532554257095158, + "loss": 0.358, + "step": 1273 + }, + { + "epoch": 3.9100691775557266, + "grad_norm": 0.7153301239013672, + "learning_rate": 0.00011525876460767948, + "loss": 0.4345, + "step": 1274 + }, + { + "epoch": 3.913143735588009, + "grad_norm": 0.6789084672927856, + "learning_rate": 0.00011519198664440736, + "loss": 0.4499, + "step": 1275 + }, + { + "epoch": 3.9162182936202923, + "grad_norm": 0.8615806698799133, + "learning_rate": 0.00011512520868113523, + "loss": 0.5112, + "step": 1276 + }, + { + "epoch": 3.9192928516525747, + "grad_norm": 0.9562219381332397, + "learning_rate": 0.0001150584307178631, + "loss": 0.4993, + "step": 1277 + }, + { + "epoch": 3.922367409684858, + "grad_norm": 0.8305587768554688, + "learning_rate": 0.00011499165275459098, + "loss": 0.4301, + "step": 1278 + }, + { + "epoch": 3.925441967717141, + "grad_norm": 0.7225807309150696, + "learning_rate": 0.00011492487479131886, + "loss": 0.4778, + "step": 1279 + }, + { + "epoch": 3.9285165257494237, + "grad_norm": 0.600487470626831, + "learning_rate": 0.00011485809682804675, + "loss": 0.3729, + "step": 1280 + }, + { + "epoch": 3.9315910837817065, + "grad_norm": 0.7126119136810303, + "learning_rate": 0.00011479131886477464, + "loss": 0.4114, + "step": 1281 + }, + { + "epoch": 3.9346656418139894, + "grad_norm": 0.6836767792701721, + "learning_rate": 0.00011472454090150251, + "loss": 0.3806, + "step": 1282 + }, + { + "epoch": 3.937740199846272, + "grad_norm": 0.9370895624160767, + "learning_rate": 0.00011465776293823038, + "loss": 0.4382, + "step": 1283 + }, + { + "epoch": 3.940814757878555, + "grad_norm": 0.5400208234786987, + "learning_rate": 0.00011459098497495826, + "loss": 0.301, + "step": 1284 + }, + { + "epoch": 3.943889315910838, + "grad_norm": 0.7497467994689941, + "learning_rate": 0.00011452420701168616, + "loss": 0.4158, + "step": 1285 + }, + { + "epoch": 3.9469638739431208, + "grad_norm": 0.7468736171722412, + "learning_rate": 0.00011445742904841403, + "loss": 0.4492, + "step": 1286 + }, + { + "epoch": 3.9500384319754036, + "grad_norm": 0.8118924498558044, + "learning_rate": 0.0001143906510851419, + "loss": 0.4847, + "step": 1287 + }, + { + "epoch": 3.9531129900076865, + "grad_norm": 0.6973615288734436, + "learning_rate": 0.00011432387312186979, + "loss": 0.4148, + "step": 1288 + }, + { + "epoch": 3.9561875480399693, + "grad_norm": 0.8795959949493408, + "learning_rate": 0.00011425709515859766, + "loss": 0.4772, + "step": 1289 + }, + { + "epoch": 3.959262106072252, + "grad_norm": 0.8716256618499756, + "learning_rate": 0.00011419031719532556, + "loss": 0.4586, + "step": 1290 + }, + { + "epoch": 3.962336664104535, + "grad_norm": 0.7880982756614685, + "learning_rate": 0.00011412353923205344, + "loss": 0.4364, + "step": 1291 + }, + { + "epoch": 3.965411222136818, + "grad_norm": 0.8473154306411743, + "learning_rate": 0.00011405676126878131, + "loss": 0.4775, + "step": 1292 + }, + { + "epoch": 3.9684857801691007, + "grad_norm": 0.8033487200737, + "learning_rate": 0.00011398998330550918, + "loss": 0.3901, + "step": 1293 + }, + { + "epoch": 3.9715603382013835, + "grad_norm": 0.8566176891326904, + "learning_rate": 0.00011392320534223706, + "loss": 0.4494, + "step": 1294 + }, + { + "epoch": 3.9746348962336664, + "grad_norm": 0.8029381632804871, + "learning_rate": 0.00011385642737896493, + "loss": 0.3145, + "step": 1295 + }, + { + "epoch": 3.9777094542659492, + "grad_norm": 0.7575416564941406, + "learning_rate": 0.00011378964941569283, + "loss": 0.3394, + "step": 1296 + }, + { + "epoch": 3.980784012298232, + "grad_norm": 0.6976135969161987, + "learning_rate": 0.0001137228714524207, + "loss": 0.5052, + "step": 1297 + }, + { + "epoch": 3.983858570330515, + "grad_norm": 0.6242879629135132, + "learning_rate": 0.00011365609348914859, + "loss": 0.3319, + "step": 1298 + }, + { + "epoch": 3.986933128362798, + "grad_norm": 0.8205263614654541, + "learning_rate": 0.00011358931552587646, + "loss": 0.4923, + "step": 1299 + }, + { + "epoch": 3.9900076863950806, + "grad_norm": 0.6506344079971313, + "learning_rate": 0.00011352253756260434, + "loss": 0.412, + "step": 1300 + }, + { + "epoch": 3.9930822444273635, + "grad_norm": 0.8723356127738953, + "learning_rate": 0.00011345575959933224, + "loss": 0.4347, + "step": 1301 + }, + { + "epoch": 3.9961568024596463, + "grad_norm": 0.7876335978507996, + "learning_rate": 0.00011338898163606011, + "loss": 0.5355, + "step": 1302 + }, + { + "epoch": 3.999231360491929, + "grad_norm": 0.724051833152771, + "learning_rate": 0.00011332220367278798, + "loss": 0.3937, + "step": 1303 + }, + { + "epoch": 4.0, + "grad_norm": 1.3604131937026978, + "learning_rate": 0.00011325542570951586, + "loss": 0.4378, + "step": 1304 + }, + { + "epoch": 4.003074558032283, + "grad_norm": 0.5147106647491455, + "learning_rate": 0.00011318864774624374, + "loss": 0.3391, + "step": 1305 + }, + { + "epoch": 4.006149116064566, + "grad_norm": 0.6199834942817688, + "learning_rate": 0.00011312186978297163, + "loss": 0.3723, + "step": 1306 + }, + { + "epoch": 4.009223674096849, + "grad_norm": 0.7257975935935974, + "learning_rate": 0.00011305509181969952, + "loss": 0.3119, + "step": 1307 + }, + { + "epoch": 4.012298232129131, + "grad_norm": 0.6027304530143738, + "learning_rate": 0.00011298831385642739, + "loss": 0.3193, + "step": 1308 + }, + { + "epoch": 4.015372790161415, + "grad_norm": 0.7012556791305542, + "learning_rate": 0.00011292153589315526, + "loss": 0.5244, + "step": 1309 + }, + { + "epoch": 4.018447348193697, + "grad_norm": 0.702237606048584, + "learning_rate": 0.00011285475792988314, + "loss": 0.2945, + "step": 1310 + }, + { + "epoch": 4.02152190622598, + "grad_norm": 0.7039638757705688, + "learning_rate": 0.00011278797996661104, + "loss": 0.3911, + "step": 1311 + }, + { + "epoch": 4.024596464258263, + "grad_norm": 0.6667320132255554, + "learning_rate": 0.00011272120200333891, + "loss": 0.2977, + "step": 1312 + }, + { + "epoch": 4.027671022290546, + "grad_norm": 0.692411482334137, + "learning_rate": 0.00011265442404006678, + "loss": 0.3628, + "step": 1313 + }, + { + "epoch": 4.0307455803228285, + "grad_norm": 0.8840232491493225, + "learning_rate": 0.00011258764607679465, + "loss": 0.3213, + "step": 1314 + }, + { + "epoch": 4.033820138355112, + "grad_norm": 0.6245793104171753, + "learning_rate": 0.00011252086811352254, + "loss": 0.3689, + "step": 1315 + }, + { + "epoch": 4.036894696387394, + "grad_norm": 0.6313285231590271, + "learning_rate": 0.00011245409015025041, + "loss": 0.302, + "step": 1316 + }, + { + "epoch": 4.0399692544196775, + "grad_norm": 0.7105359435081482, + "learning_rate": 0.00011238731218697832, + "loss": 0.316, + "step": 1317 + }, + { + "epoch": 4.04304381245196, + "grad_norm": 0.6478031873703003, + "learning_rate": 0.00011232053422370619, + "loss": 0.3727, + "step": 1318 + }, + { + "epoch": 4.046118370484243, + "grad_norm": 0.4994255602359772, + "learning_rate": 0.00011225375626043406, + "loss": 0.2032, + "step": 1319 + }, + { + "epoch": 4.049192928516526, + "grad_norm": 0.9945188760757446, + "learning_rate": 0.00011218697829716193, + "loss": 0.5024, + "step": 1320 + }, + { + "epoch": 4.052267486548809, + "grad_norm": 0.8060212135314941, + "learning_rate": 0.00011212020033388981, + "loss": 0.3265, + "step": 1321 + }, + { + "epoch": 4.055342044581091, + "grad_norm": 0.8381320238113403, + "learning_rate": 0.00011205342237061771, + "loss": 0.3818, + "step": 1322 + }, + { + "epoch": 4.058416602613375, + "grad_norm": 0.9504372477531433, + "learning_rate": 0.00011198664440734558, + "loss": 0.4802, + "step": 1323 + }, + { + "epoch": 4.061491160645657, + "grad_norm": 0.7901592254638672, + "learning_rate": 0.00011191986644407347, + "loss": 0.3169, + "step": 1324 + }, + { + "epoch": 4.06456571867794, + "grad_norm": 0.7563232779502869, + "learning_rate": 0.00011185308848080134, + "loss": 0.3226, + "step": 1325 + }, + { + "epoch": 4.067640276710223, + "grad_norm": 0.6596280932426453, + "learning_rate": 0.00011178631051752921, + "loss": 0.2505, + "step": 1326 + }, + { + "epoch": 4.070714834742506, + "grad_norm": 0.7296786308288574, + "learning_rate": 0.00011171953255425711, + "loss": 0.2848, + "step": 1327 + }, + { + "epoch": 4.073789392774788, + "grad_norm": 0.8909521102905273, + "learning_rate": 0.00011165275459098499, + "loss": 0.4322, + "step": 1328 + }, + { + "epoch": 4.076863950807072, + "grad_norm": 0.7292854189872742, + "learning_rate": 0.00011158597662771286, + "loss": 0.291, + "step": 1329 + }, + { + "epoch": 4.079938508839354, + "grad_norm": 0.9252512454986572, + "learning_rate": 0.00011151919866444073, + "loss": 0.2974, + "step": 1330 + }, + { + "epoch": 4.083013066871637, + "grad_norm": 0.7636522054672241, + "learning_rate": 0.00011145242070116862, + "loss": 0.4483, + "step": 1331 + }, + { + "epoch": 4.08608762490392, + "grad_norm": 0.8409242033958435, + "learning_rate": 0.0001113856427378965, + "loss": 0.4355, + "step": 1332 + }, + { + "epoch": 4.089162182936203, + "grad_norm": 0.9327632188796997, + "learning_rate": 0.0001113188647746244, + "loss": 0.4552, + "step": 1333 + }, + { + "epoch": 4.092236740968485, + "grad_norm": 0.7346988916397095, + "learning_rate": 0.00011125208681135227, + "loss": 0.3438, + "step": 1334 + }, + { + "epoch": 4.095311299000769, + "grad_norm": 1.004155158996582, + "learning_rate": 0.00011118530884808014, + "loss": 0.3276, + "step": 1335 + }, + { + "epoch": 4.098385857033051, + "grad_norm": 0.7132447361946106, + "learning_rate": 0.00011111853088480801, + "loss": 0.3801, + "step": 1336 + }, + { + "epoch": 4.101460415065334, + "grad_norm": 0.629642128944397, + "learning_rate": 0.00011105175292153589, + "loss": 0.2056, + "step": 1337 + }, + { + "epoch": 4.104534973097617, + "grad_norm": 0.6974900960922241, + "learning_rate": 0.00011098497495826379, + "loss": 0.3028, + "step": 1338 + }, + { + "epoch": 4.1076095311299, + "grad_norm": 0.7688671946525574, + "learning_rate": 0.00011091819699499166, + "loss": 0.2914, + "step": 1339 + }, + { + "epoch": 4.1106840891621825, + "grad_norm": 0.7950320839881897, + "learning_rate": 0.00011085141903171953, + "loss": 0.3381, + "step": 1340 + }, + { + "epoch": 4.113758647194466, + "grad_norm": 0.8874083757400513, + "learning_rate": 0.00011078464106844742, + "loss": 0.3343, + "step": 1341 + }, + { + "epoch": 4.116833205226748, + "grad_norm": 0.8627938032150269, + "learning_rate": 0.00011071786310517529, + "loss": 0.3888, + "step": 1342 + }, + { + "epoch": 4.1199077632590315, + "grad_norm": 0.7516458034515381, + "learning_rate": 0.0001106510851419032, + "loss": 0.3158, + "step": 1343 + }, + { + "epoch": 4.122982321291314, + "grad_norm": 0.7732129693031311, + "learning_rate": 0.00011058430717863107, + "loss": 0.3114, + "step": 1344 + }, + { + "epoch": 4.126056879323597, + "grad_norm": 0.6700358986854553, + "learning_rate": 0.00011051752921535894, + "loss": 0.2412, + "step": 1345 + }, + { + "epoch": 4.1291314373558805, + "grad_norm": 1.0231423377990723, + "learning_rate": 0.00011045075125208681, + "loss": 0.3424, + "step": 1346 + }, + { + "epoch": 4.132205995388163, + "grad_norm": 0.8192147016525269, + "learning_rate": 0.00011038397328881469, + "loss": 0.3089, + "step": 1347 + }, + { + "epoch": 4.135280553420446, + "grad_norm": 0.7541559934616089, + "learning_rate": 0.00011031719532554257, + "loss": 0.3552, + "step": 1348 + }, + { + "epoch": 4.138355111452729, + "grad_norm": 0.929007887840271, + "learning_rate": 0.00011025041736227046, + "loss": 0.3173, + "step": 1349 + }, + { + "epoch": 4.141429669485012, + "grad_norm": 0.5695236325263977, + "learning_rate": 0.00011018363939899835, + "loss": 0.3172, + "step": 1350 + }, + { + "epoch": 4.144504227517294, + "grad_norm": 0.9651820659637451, + "learning_rate": 0.00011011686143572622, + "loss": 0.3911, + "step": 1351 + }, + { + "epoch": 4.147578785549578, + "grad_norm": 0.7829585075378418, + "learning_rate": 0.00011005008347245409, + "loss": 0.4412, + "step": 1352 + }, + { + "epoch": 4.15065334358186, + "grad_norm": 0.5842923521995544, + "learning_rate": 0.00010998330550918197, + "loss": 0.32, + "step": 1353 + }, + { + "epoch": 4.153727901614143, + "grad_norm": 1.1148773431777954, + "learning_rate": 0.00010991652754590987, + "loss": 0.4073, + "step": 1354 + }, + { + "epoch": 4.156802459646426, + "grad_norm": 0.59675133228302, + "learning_rate": 0.00010984974958263774, + "loss": 0.2595, + "step": 1355 + }, + { + "epoch": 4.159877017678709, + "grad_norm": 0.9646673798561096, + "learning_rate": 0.00010978297161936561, + "loss": 0.4913, + "step": 1356 + }, + { + "epoch": 4.162951575710991, + "grad_norm": 0.8348448276519775, + "learning_rate": 0.00010971619365609349, + "loss": 0.3936, + "step": 1357 + }, + { + "epoch": 4.166026133743275, + "grad_norm": 0.9026066660881042, + "learning_rate": 0.00010964941569282137, + "loss": 0.4597, + "step": 1358 + }, + { + "epoch": 4.169100691775557, + "grad_norm": 1.0557740926742554, + "learning_rate": 0.00010958263772954926, + "loss": 0.2901, + "step": 1359 + }, + { + "epoch": 4.17217524980784, + "grad_norm": 1.6775768995285034, + "learning_rate": 0.00010951585976627715, + "loss": 0.5577, + "step": 1360 + }, + { + "epoch": 4.175249807840123, + "grad_norm": 0.651542603969574, + "learning_rate": 0.00010944908180300502, + "loss": 0.324, + "step": 1361 + }, + { + "epoch": 4.178324365872406, + "grad_norm": 0.8348442912101746, + "learning_rate": 0.00010938230383973289, + "loss": 0.3474, + "step": 1362 + }, + { + "epoch": 4.1813989239046885, + "grad_norm": 0.7684600949287415, + "learning_rate": 0.00010931552587646076, + "loss": 0.3105, + "step": 1363 + }, + { + "epoch": 4.184473481936972, + "grad_norm": 0.8022297620773315, + "learning_rate": 0.00010924874791318864, + "loss": 0.3591, + "step": 1364 + }, + { + "epoch": 4.187548039969254, + "grad_norm": 0.9433055520057678, + "learning_rate": 0.00010918196994991654, + "loss": 0.3116, + "step": 1365 + }, + { + "epoch": 4.1906225980015375, + "grad_norm": 0.9922048449516296, + "learning_rate": 0.00010911519198664441, + "loss": 0.4071, + "step": 1366 + }, + { + "epoch": 4.19369715603382, + "grad_norm": 0.7621304988861084, + "learning_rate": 0.0001090484140233723, + "loss": 0.3698, + "step": 1367 + }, + { + "epoch": 4.196771714066103, + "grad_norm": 0.8218173980712891, + "learning_rate": 0.00010898163606010017, + "loss": 0.3159, + "step": 1368 + }, + { + "epoch": 4.199846272098386, + "grad_norm": 0.9964919090270996, + "learning_rate": 0.00010891485809682804, + "loss": 0.3501, + "step": 1369 + }, + { + "epoch": 4.202920830130669, + "grad_norm": 0.705668032169342, + "learning_rate": 0.00010884808013355594, + "loss": 0.3418, + "step": 1370 + }, + { + "epoch": 4.205995388162951, + "grad_norm": 0.6391593217849731, + "learning_rate": 0.00010878130217028382, + "loss": 0.2915, + "step": 1371 + }, + { + "epoch": 4.209069946195235, + "grad_norm": 1.1502752304077148, + "learning_rate": 0.00010871452420701169, + "loss": 0.3739, + "step": 1372 + }, + { + "epoch": 4.212144504227517, + "grad_norm": 1.0136791467666626, + "learning_rate": 0.00010864774624373956, + "loss": 0.446, + "step": 1373 + }, + { + "epoch": 4.2152190622598, + "grad_norm": 1.116603136062622, + "learning_rate": 0.00010858096828046744, + "loss": 0.3013, + "step": 1374 + }, + { + "epoch": 4.218293620292083, + "grad_norm": 0.8702336549758911, + "learning_rate": 0.00010851419031719534, + "loss": 0.3708, + "step": 1375 + }, + { + "epoch": 4.221368178324366, + "grad_norm": 0.7424792647361755, + "learning_rate": 0.00010844741235392321, + "loss": 0.3883, + "step": 1376 + }, + { + "epoch": 4.224442736356648, + "grad_norm": 0.9215840697288513, + "learning_rate": 0.0001083806343906511, + "loss": 0.3527, + "step": 1377 + }, + { + "epoch": 4.227517294388932, + "grad_norm": 0.728461742401123, + "learning_rate": 0.00010831385642737897, + "loss": 0.3263, + "step": 1378 + }, + { + "epoch": 4.230591852421214, + "grad_norm": 0.6894111037254333, + "learning_rate": 0.00010824707846410684, + "loss": 0.3055, + "step": 1379 + }, + { + "epoch": 4.233666410453497, + "grad_norm": 0.736510694026947, + "learning_rate": 0.00010818030050083472, + "loss": 0.2888, + "step": 1380 + }, + { + "epoch": 4.23674096848578, + "grad_norm": 0.6261756420135498, + "learning_rate": 0.00010811352253756262, + "loss": 0.3168, + "step": 1381 + }, + { + "epoch": 4.239815526518063, + "grad_norm": 0.6462433934211731, + "learning_rate": 0.00010804674457429049, + "loss": 0.3785, + "step": 1382 + }, + { + "epoch": 4.242890084550346, + "grad_norm": 1.0697581768035889, + "learning_rate": 0.00010797996661101836, + "loss": 0.382, + "step": 1383 + }, + { + "epoch": 4.245964642582629, + "grad_norm": 0.8354079723358154, + "learning_rate": 0.00010791318864774625, + "loss": 0.4826, + "step": 1384 + }, + { + "epoch": 4.249039200614912, + "grad_norm": 0.9178540110588074, + "learning_rate": 0.00010784641068447412, + "loss": 0.3667, + "step": 1385 + }, + { + "epoch": 4.252113758647194, + "grad_norm": 0.6986132264137268, + "learning_rate": 0.00010777963272120202, + "loss": 0.3417, + "step": 1386 + }, + { + "epoch": 4.255188316679478, + "grad_norm": 0.6934733390808105, + "learning_rate": 0.0001077128547579299, + "loss": 0.3177, + "step": 1387 + }, + { + "epoch": 4.25826287471176, + "grad_norm": 0.7552710175514221, + "learning_rate": 0.00010764607679465777, + "loss": 0.4545, + "step": 1388 + }, + { + "epoch": 4.261337432744043, + "grad_norm": 0.8772902488708496, + "learning_rate": 0.00010757929883138564, + "loss": 0.4488, + "step": 1389 + }, + { + "epoch": 4.264411990776326, + "grad_norm": 0.6232932806015015, + "learning_rate": 0.00010751252086811352, + "loss": 0.2655, + "step": 1390 + }, + { + "epoch": 4.267486548808609, + "grad_norm": 0.8846897482872009, + "learning_rate": 0.00010744574290484142, + "loss": 0.354, + "step": 1391 + }, + { + "epoch": 4.2705611068408915, + "grad_norm": 0.9057449102401733, + "learning_rate": 0.00010737896494156929, + "loss": 0.4472, + "step": 1392 + }, + { + "epoch": 4.273635664873175, + "grad_norm": 0.9705424308776855, + "learning_rate": 0.00010731218697829716, + "loss": 0.3624, + "step": 1393 + }, + { + "epoch": 4.276710222905457, + "grad_norm": 1.3559931516647339, + "learning_rate": 0.00010724540901502505, + "loss": 0.4698, + "step": 1394 + }, + { + "epoch": 4.2797847809377405, + "grad_norm": 0.8337675333023071, + "learning_rate": 0.00010717863105175292, + "loss": 0.3795, + "step": 1395 + }, + { + "epoch": 4.282859338970023, + "grad_norm": 1.1630418300628662, + "learning_rate": 0.0001071118530884808, + "loss": 0.4924, + "step": 1396 + }, + { + "epoch": 4.285933897002306, + "grad_norm": 0.7302567362785339, + "learning_rate": 0.0001070450751252087, + "loss": 0.3188, + "step": 1397 + }, + { + "epoch": 4.289008455034589, + "grad_norm": 0.7226994037628174, + "learning_rate": 0.00010697829716193657, + "loss": 0.3188, + "step": 1398 + }, + { + "epoch": 4.292083013066872, + "grad_norm": 0.666989266872406, + "learning_rate": 0.00010691151919866444, + "loss": 0.3394, + "step": 1399 + }, + { + "epoch": 4.295157571099154, + "grad_norm": 0.9268330931663513, + "learning_rate": 0.00010684474123539232, + "loss": 0.4713, + "step": 1400 + }, + { + "epoch": 4.298232129131438, + "grad_norm": 0.823275625705719, + "learning_rate": 0.0001067779632721202, + "loss": 0.3481, + "step": 1401 + }, + { + "epoch": 4.30130668716372, + "grad_norm": 0.5804985761642456, + "learning_rate": 0.00010671118530884809, + "loss": 0.2705, + "step": 1402 + }, + { + "epoch": 4.304381245196003, + "grad_norm": 0.539432942867279, + "learning_rate": 0.00010664440734557598, + "loss": 0.3344, + "step": 1403 + }, + { + "epoch": 4.307455803228286, + "grad_norm": 0.6926316618919373, + "learning_rate": 0.00010657762938230385, + "loss": 0.3133, + "step": 1404 + }, + { + "epoch": 4.310530361260569, + "grad_norm": 0.6743838787078857, + "learning_rate": 0.00010651085141903172, + "loss": 0.3433, + "step": 1405 + }, + { + "epoch": 4.313604919292851, + "grad_norm": 1.0226610898971558, + "learning_rate": 0.0001064440734557596, + "loss": 0.3499, + "step": 1406 + }, + { + "epoch": 4.316679477325135, + "grad_norm": 0.9818789958953857, + "learning_rate": 0.0001063772954924875, + "loss": 0.3106, + "step": 1407 + }, + { + "epoch": 4.319754035357417, + "grad_norm": 0.9667727947235107, + "learning_rate": 0.00010631051752921537, + "loss": 0.3755, + "step": 1408 + }, + { + "epoch": 4.3228285933897, + "grad_norm": 0.8136192560195923, + "learning_rate": 0.00010624373956594324, + "loss": 0.4172, + "step": 1409 + }, + { + "epoch": 4.325903151421983, + "grad_norm": 1.4286353588104248, + "learning_rate": 0.00010617696160267111, + "loss": 0.2508, + "step": 1410 + }, + { + "epoch": 4.328977709454266, + "grad_norm": 0.9519496560096741, + "learning_rate": 0.000106110183639399, + "loss": 0.2951, + "step": 1411 + }, + { + "epoch": 4.3320522674865485, + "grad_norm": 1.119429111480713, + "learning_rate": 0.00010604340567612687, + "loss": 0.415, + "step": 1412 + }, + { + "epoch": 4.335126825518832, + "grad_norm": 0.9656046032905579, + "learning_rate": 0.00010597662771285477, + "loss": 0.3971, + "step": 1413 + }, + { + "epoch": 4.338201383551114, + "grad_norm": 0.7389115691184998, + "learning_rate": 0.00010590984974958265, + "loss": 0.3996, + "step": 1414 + }, + { + "epoch": 4.3412759415833975, + "grad_norm": 0.7295717597007751, + "learning_rate": 0.00010584307178631052, + "loss": 0.3148, + "step": 1415 + }, + { + "epoch": 4.34435049961568, + "grad_norm": 1.141958475112915, + "learning_rate": 0.0001057762938230384, + "loss": 0.3959, + "step": 1416 + }, + { + "epoch": 4.347425057647963, + "grad_norm": 0.8199194073677063, + "learning_rate": 0.00010570951585976627, + "loss": 0.2737, + "step": 1417 + }, + { + "epoch": 4.350499615680246, + "grad_norm": 0.9329640865325928, + "learning_rate": 0.00010564273789649417, + "loss": 0.3366, + "step": 1418 + }, + { + "epoch": 4.353574173712529, + "grad_norm": 0.9693445563316345, + "learning_rate": 0.00010557595993322204, + "loss": 0.325, + "step": 1419 + }, + { + "epoch": 4.356648731744812, + "grad_norm": 3.1419506072998047, + "learning_rate": 0.00010550918196994993, + "loss": 0.4225, + "step": 1420 + }, + { + "epoch": 4.359723289777095, + "grad_norm": 0.8056375980377197, + "learning_rate": 0.0001054424040066778, + "loss": 0.3366, + "step": 1421 + }, + { + "epoch": 4.362797847809377, + "grad_norm": 0.9013074636459351, + "learning_rate": 0.00010537562604340567, + "loss": 0.3814, + "step": 1422 + }, + { + "epoch": 4.36587240584166, + "grad_norm": 0.6411908864974976, + "learning_rate": 0.00010530884808013357, + "loss": 0.3604, + "step": 1423 + }, + { + "epoch": 4.3689469638739435, + "grad_norm": 0.7328122854232788, + "learning_rate": 0.00010524207011686145, + "loss": 0.3706, + "step": 1424 + }, + { + "epoch": 4.372021521906226, + "grad_norm": 0.7676102519035339, + "learning_rate": 0.00010517529215358932, + "loss": 0.3575, + "step": 1425 + }, + { + "epoch": 4.375096079938509, + "grad_norm": 0.7656323313713074, + "learning_rate": 0.0001051085141903172, + "loss": 0.3897, + "step": 1426 + }, + { + "epoch": 4.378170637970792, + "grad_norm": 0.8879655599594116, + "learning_rate": 0.00010504173622704507, + "loss": 0.2235, + "step": 1427 + }, + { + "epoch": 4.381245196003075, + "grad_norm": 0.8029223680496216, + "learning_rate": 0.00010497495826377295, + "loss": 0.4513, + "step": 1428 + }, + { + "epoch": 4.384319754035357, + "grad_norm": 0.8824205994606018, + "learning_rate": 0.00010490818030050084, + "loss": 0.3558, + "step": 1429 + }, + { + "epoch": 4.387394312067641, + "grad_norm": 0.6517553329467773, + "learning_rate": 0.00010484140233722873, + "loss": 0.3304, + "step": 1430 + }, + { + "epoch": 4.390468870099923, + "grad_norm": 0.6570941805839539, + "learning_rate": 0.0001047746243739566, + "loss": 0.3072, + "step": 1431 + }, + { + "epoch": 4.393543428132206, + "grad_norm": 0.6981759667396545, + "learning_rate": 0.00010470784641068447, + "loss": 0.2602, + "step": 1432 + }, + { + "epoch": 4.396617986164489, + "grad_norm": 0.999544084072113, + "learning_rate": 0.00010464106844741235, + "loss": 0.4485, + "step": 1433 + }, + { + "epoch": 4.399692544196772, + "grad_norm": 0.6772480010986328, + "learning_rate": 0.00010457429048414025, + "loss": 0.3789, + "step": 1434 + }, + { + "epoch": 4.402767102229054, + "grad_norm": 1.0531984567642212, + "learning_rate": 0.00010450751252086812, + "loss": 0.2384, + "step": 1435 + }, + { + "epoch": 4.405841660261338, + "grad_norm": 0.7211788892745972, + "learning_rate": 0.00010444073455759599, + "loss": 0.3192, + "step": 1436 + }, + { + "epoch": 4.40891621829362, + "grad_norm": 0.9477794170379639, + "learning_rate": 0.00010437395659432388, + "loss": 0.2762, + "step": 1437 + }, + { + "epoch": 4.411990776325903, + "grad_norm": 0.8108130097389221, + "learning_rate": 0.00010430717863105175, + "loss": 0.3724, + "step": 1438 + }, + { + "epoch": 4.415065334358186, + "grad_norm": 1.231468915939331, + "learning_rate": 0.00010424040066777965, + "loss": 0.4689, + "step": 1439 + }, + { + "epoch": 4.418139892390469, + "grad_norm": 1.2272400856018066, + "learning_rate": 0.00010417362270450753, + "loss": 0.3773, + "step": 1440 + }, + { + "epoch": 4.4212144504227515, + "grad_norm": 0.7169706225395203, + "learning_rate": 0.0001041068447412354, + "loss": 0.2731, + "step": 1441 + }, + { + "epoch": 4.424289008455035, + "grad_norm": 0.568555474281311, + "learning_rate": 0.00010404006677796327, + "loss": 0.3041, + "step": 1442 + }, + { + "epoch": 4.427363566487317, + "grad_norm": 1.2105591297149658, + "learning_rate": 0.00010397328881469115, + "loss": 0.3463, + "step": 1443 + }, + { + "epoch": 4.4304381245196005, + "grad_norm": 0.7139995098114014, + "learning_rate": 0.00010390651085141905, + "loss": 0.3665, + "step": 1444 + }, + { + "epoch": 4.433512682551883, + "grad_norm": 0.6359079480171204, + "learning_rate": 0.00010383973288814692, + "loss": 0.2739, + "step": 1445 + }, + { + "epoch": 4.436587240584166, + "grad_norm": 0.8577691316604614, + "learning_rate": 0.0001037729549248748, + "loss": 0.2478, + "step": 1446 + }, + { + "epoch": 4.439661798616449, + "grad_norm": 0.68791264295578, + "learning_rate": 0.00010370617696160268, + "loss": 0.337, + "step": 1447 + }, + { + "epoch": 4.442736356648732, + "grad_norm": 0.7423458695411682, + "learning_rate": 0.00010363939899833055, + "loss": 0.3899, + "step": 1448 + }, + { + "epoch": 4.445810914681014, + "grad_norm": 0.894343912601471, + "learning_rate": 0.00010357262103505843, + "loss": 0.3282, + "step": 1449 + }, + { + "epoch": 4.448885472713298, + "grad_norm": 0.9872162342071533, + "learning_rate": 0.00010350584307178633, + "loss": 0.3724, + "step": 1450 + }, + { + "epoch": 4.45196003074558, + "grad_norm": 0.9836599230766296, + "learning_rate": 0.0001034390651085142, + "loss": 0.3555, + "step": 1451 + }, + { + "epoch": 4.455034588777863, + "grad_norm": 0.9895578622817993, + "learning_rate": 0.00010337228714524207, + "loss": 0.3823, + "step": 1452 + }, + { + "epoch": 4.458109146810146, + "grad_norm": 1.0904133319854736, + "learning_rate": 0.00010330550918196994, + "loss": 0.4033, + "step": 1453 + }, + { + "epoch": 4.461183704842429, + "grad_norm": 0.6034055352210999, + "learning_rate": 0.00010323873121869783, + "loss": 0.2676, + "step": 1454 + }, + { + "epoch": 4.464258262874711, + "grad_norm": 0.7707822322845459, + "learning_rate": 0.00010317195325542572, + "loss": 0.3143, + "step": 1455 + }, + { + "epoch": 4.467332820906995, + "grad_norm": 1.0982093811035156, + "learning_rate": 0.0001031051752921536, + "loss": 0.3799, + "step": 1456 + }, + { + "epoch": 4.470407378939277, + "grad_norm": 2.1675314903259277, + "learning_rate": 0.00010303839732888148, + "loss": 0.4698, + "step": 1457 + }, + { + "epoch": 4.47348193697156, + "grad_norm": 0.8458796143531799, + "learning_rate": 0.00010297161936560935, + "loss": 0.5113, + "step": 1458 + }, + { + "epoch": 4.476556495003843, + "grad_norm": 0.8346131443977356, + "learning_rate": 0.00010290484140233722, + "loss": 0.3766, + "step": 1459 + }, + { + "epoch": 4.479631053036126, + "grad_norm": 0.7935206890106201, + "learning_rate": 0.00010283806343906512, + "loss": 0.3153, + "step": 1460 + }, + { + "epoch": 4.482705611068409, + "grad_norm": 0.8221637606620789, + "learning_rate": 0.000102771285475793, + "loss": 0.2635, + "step": 1461 + }, + { + "epoch": 4.485780169100692, + "grad_norm": 0.5546371936798096, + "learning_rate": 0.00010270450751252087, + "loss": 0.2976, + "step": 1462 + }, + { + "epoch": 4.488854727132974, + "grad_norm": 1.041944146156311, + "learning_rate": 0.00010263772954924876, + "loss": 0.4004, + "step": 1463 + }, + { + "epoch": 4.4919292851652575, + "grad_norm": 0.827978253364563, + "learning_rate": 0.00010257095158597663, + "loss": 0.335, + "step": 1464 + }, + { + "epoch": 4.495003843197541, + "grad_norm": 0.8025320768356323, + "learning_rate": 0.0001025041736227045, + "loss": 0.3436, + "step": 1465 + }, + { + "epoch": 4.498078401229823, + "grad_norm": 0.7182911038398743, + "learning_rate": 0.0001024373956594324, + "loss": 0.4948, + "step": 1466 + }, + { + "epoch": 4.5011529592621065, + "grad_norm": 0.9388545155525208, + "learning_rate": 0.00010237061769616028, + "loss": 0.3967, + "step": 1467 + }, + { + "epoch": 4.504227517294389, + "grad_norm": 1.0608465671539307, + "learning_rate": 0.00010230383973288815, + "loss": 0.3166, + "step": 1468 + }, + { + "epoch": 4.507302075326672, + "grad_norm": 0.9616206288337708, + "learning_rate": 0.00010223706176961602, + "loss": 0.4008, + "step": 1469 + }, + { + "epoch": 4.510376633358955, + "grad_norm": 0.689566433429718, + "learning_rate": 0.0001021702838063439, + "loss": 0.3611, + "step": 1470 + }, + { + "epoch": 4.513451191391238, + "grad_norm": 0.612333357334137, + "learning_rate": 0.0001021035058430718, + "loss": 0.3755, + "step": 1471 + }, + { + "epoch": 4.51652574942352, + "grad_norm": 0.7102506160736084, + "learning_rate": 0.00010203672787979967, + "loss": 0.3566, + "step": 1472 + }, + { + "epoch": 4.5196003074558035, + "grad_norm": 0.7646180391311646, + "learning_rate": 0.00010196994991652756, + "loss": 0.2881, + "step": 1473 + }, + { + "epoch": 4.522674865488086, + "grad_norm": 0.8247338533401489, + "learning_rate": 0.00010190317195325543, + "loss": 0.3961, + "step": 1474 + }, + { + "epoch": 4.525749423520369, + "grad_norm": 0.622003972530365, + "learning_rate": 0.0001018363939899833, + "loss": 0.3407, + "step": 1475 + }, + { + "epoch": 4.528823981552652, + "grad_norm": 0.6311368346214294, + "learning_rate": 0.0001017696160267112, + "loss": 0.2872, + "step": 1476 + }, + { + "epoch": 4.531898539584935, + "grad_norm": 0.8423951268196106, + "learning_rate": 0.00010170283806343908, + "loss": 0.3465, + "step": 1477 + }, + { + "epoch": 4.534973097617217, + "grad_norm": 0.5665594339370728, + "learning_rate": 0.00010163606010016695, + "loss": 0.3414, + "step": 1478 + }, + { + "epoch": 4.538047655649501, + "grad_norm": 0.8207141160964966, + "learning_rate": 0.00010156928213689482, + "loss": 0.4187, + "step": 1479 + }, + { + "epoch": 4.541122213681783, + "grad_norm": 0.5721847414970398, + "learning_rate": 0.00010150250417362271, + "loss": 0.2909, + "step": 1480 + }, + { + "epoch": 4.544196771714066, + "grad_norm": 0.837468147277832, + "learning_rate": 0.00010143572621035058, + "loss": 0.4037, + "step": 1481 + }, + { + "epoch": 4.547271329746349, + "grad_norm": 0.7777520418167114, + "learning_rate": 0.00010136894824707848, + "loss": 0.4051, + "step": 1482 + }, + { + "epoch": 4.550345887778632, + "grad_norm": 1.183840274810791, + "learning_rate": 0.00010130217028380636, + "loss": 0.414, + "step": 1483 + }, + { + "epoch": 4.553420445810914, + "grad_norm": 0.9845882654190063, + "learning_rate": 0.00010123539232053423, + "loss": 0.3536, + "step": 1484 + }, + { + "epoch": 4.556495003843198, + "grad_norm": 0.6358274817466736, + "learning_rate": 0.0001011686143572621, + "loss": 0.3828, + "step": 1485 + }, + { + "epoch": 4.55956956187548, + "grad_norm": 0.8890843391418457, + "learning_rate": 0.00010110183639398998, + "loss": 0.3399, + "step": 1486 + }, + { + "epoch": 4.562644119907763, + "grad_norm": 0.894417941570282, + "learning_rate": 0.00010103505843071788, + "loss": 0.4613, + "step": 1487 + }, + { + "epoch": 4.565718677940046, + "grad_norm": 0.8622507452964783, + "learning_rate": 0.00010096828046744575, + "loss": 0.2838, + "step": 1488 + }, + { + "epoch": 4.568793235972329, + "grad_norm": 0.8701838850975037, + "learning_rate": 0.00010090150250417362, + "loss": 0.3169, + "step": 1489 + }, + { + "epoch": 4.5718677940046115, + "grad_norm": 0.8100345134735107, + "learning_rate": 0.00010083472454090151, + "loss": 0.3649, + "step": 1490 + }, + { + "epoch": 4.574942352036895, + "grad_norm": 0.8611205220222473, + "learning_rate": 0.00010076794657762938, + "loss": 0.2422, + "step": 1491 + }, + { + "epoch": 4.578016910069177, + "grad_norm": 0.8310852646827698, + "learning_rate": 0.00010070116861435728, + "loss": 0.3244, + "step": 1492 + }, + { + "epoch": 4.5810914681014605, + "grad_norm": 0.7983706593513489, + "learning_rate": 0.00010063439065108516, + "loss": 0.3132, + "step": 1493 + }, + { + "epoch": 4.584166026133743, + "grad_norm": 0.6380778551101685, + "learning_rate": 0.00010056761268781303, + "loss": 0.3557, + "step": 1494 + }, + { + "epoch": 4.587240584166026, + "grad_norm": 0.81980299949646, + "learning_rate": 0.0001005008347245409, + "loss": 0.4001, + "step": 1495 + }, + { + "epoch": 4.590315142198309, + "grad_norm": 1.0842241048812866, + "learning_rate": 0.00010043405676126878, + "loss": 0.3951, + "step": 1496 + }, + { + "epoch": 4.593389700230592, + "grad_norm": 0.7225966453552246, + "learning_rate": 0.00010036727879799666, + "loss": 0.3051, + "step": 1497 + }, + { + "epoch": 4.596464258262875, + "grad_norm": 0.7823684811592102, + "learning_rate": 0.00010030050083472455, + "loss": 0.3113, + "step": 1498 + }, + { + "epoch": 4.599538816295158, + "grad_norm": 0.8264310359954834, + "learning_rate": 0.00010023372287145244, + "loss": 0.4567, + "step": 1499 + }, + { + "epoch": 4.60261337432744, + "grad_norm": 1.0230191946029663, + "learning_rate": 0.00010016694490818031, + "loss": 0.5121, + "step": 1500 + }, + { + "epoch": 4.605687932359723, + "grad_norm": 0.7866786122322083, + "learning_rate": 0.00010010016694490818, + "loss": 0.2891, + "step": 1501 + }, + { + "epoch": 4.608762490392007, + "grad_norm": 0.7644535303115845, + "learning_rate": 0.00010003338898163605, + "loss": 0.2806, + "step": 1502 + }, + { + "epoch": 4.611837048424289, + "grad_norm": 0.6497211456298828, + "learning_rate": 9.996661101836394e-05, + "loss": 0.3869, + "step": 1503 + }, + { + "epoch": 4.614911606456571, + "grad_norm": 0.694921612739563, + "learning_rate": 9.989983305509183e-05, + "loss": 0.3874, + "step": 1504 + }, + { + "epoch": 4.617986164488855, + "grad_norm": 0.8609017133712769, + "learning_rate": 9.98330550918197e-05, + "loss": 0.3816, + "step": 1505 + }, + { + "epoch": 4.621060722521138, + "grad_norm": 0.6470094323158264, + "learning_rate": 9.976627712854757e-05, + "loss": 0.321, + "step": 1506 + }, + { + "epoch": 4.62413528055342, + "grad_norm": 0.9883415102958679, + "learning_rate": 9.969949916527546e-05, + "loss": 0.4452, + "step": 1507 + }, + { + "epoch": 4.627209838585704, + "grad_norm": 0.9782819151878357, + "learning_rate": 9.963272120200335e-05, + "loss": 0.3273, + "step": 1508 + }, + { + "epoch": 4.630284396617986, + "grad_norm": 1.2475955486297607, + "learning_rate": 9.956594323873122e-05, + "loss": 0.3471, + "step": 1509 + }, + { + "epoch": 4.633358954650269, + "grad_norm": 0.6427537202835083, + "learning_rate": 9.949916527545911e-05, + "loss": 0.3775, + "step": 1510 + }, + { + "epoch": 4.636433512682552, + "grad_norm": 0.7873066067695618, + "learning_rate": 9.943238731218698e-05, + "loss": 0.3011, + "step": 1511 + }, + { + "epoch": 4.639508070714835, + "grad_norm": 0.71328204870224, + "learning_rate": 9.936560934891487e-05, + "loss": 0.3559, + "step": 1512 + }, + { + "epoch": 4.6425826287471175, + "grad_norm": 0.7703279256820679, + "learning_rate": 9.929883138564274e-05, + "loss": 0.2484, + "step": 1513 + }, + { + "epoch": 4.645657186779401, + "grad_norm": 0.8112149238586426, + "learning_rate": 9.923205342237061e-05, + "loss": 0.2511, + "step": 1514 + }, + { + "epoch": 4.648731744811683, + "grad_norm": 0.729215681552887, + "learning_rate": 9.91652754590985e-05, + "loss": 0.3465, + "step": 1515 + }, + { + "epoch": 4.6518063028439665, + "grad_norm": 0.8515496850013733, + "learning_rate": 9.909849749582639e-05, + "loss": 0.3523, + "step": 1516 + }, + { + "epoch": 4.654880860876249, + "grad_norm": 0.9650899171829224, + "learning_rate": 9.903171953255426e-05, + "loss": 0.2568, + "step": 1517 + }, + { + "epoch": 4.657955418908532, + "grad_norm": 1.0580472946166992, + "learning_rate": 9.896494156928215e-05, + "loss": 0.3365, + "step": 1518 + }, + { + "epoch": 4.661029976940815, + "grad_norm": 0.9089365005493164, + "learning_rate": 9.889816360601002e-05, + "loss": 0.3948, + "step": 1519 + }, + { + "epoch": 4.664104534973098, + "grad_norm": 0.7647799849510193, + "learning_rate": 9.883138564273791e-05, + "loss": 0.4183, + "step": 1520 + }, + { + "epoch": 4.66717909300538, + "grad_norm": 0.9137128591537476, + "learning_rate": 9.876460767946578e-05, + "loss": 0.4112, + "step": 1521 + }, + { + "epoch": 4.6702536510376635, + "grad_norm": 0.7739920616149902, + "learning_rate": 9.869782971619365e-05, + "loss": 0.3357, + "step": 1522 + }, + { + "epoch": 4.673328209069946, + "grad_norm": 0.90510493516922, + "learning_rate": 9.863105175292154e-05, + "loss": 0.3252, + "step": 1523 + }, + { + "epoch": 4.676402767102229, + "grad_norm": 0.7696104645729065, + "learning_rate": 9.856427378964941e-05, + "loss": 0.3981, + "step": 1524 + }, + { + "epoch": 4.679477325134512, + "grad_norm": 0.8543115854263306, + "learning_rate": 9.84974958263773e-05, + "loss": 0.3611, + "step": 1525 + }, + { + "epoch": 4.682551883166795, + "grad_norm": 0.6455523371696472, + "learning_rate": 9.843071786310519e-05, + "loss": 0.3056, + "step": 1526 + }, + { + "epoch": 4.685626441199077, + "grad_norm": 0.827754020690918, + "learning_rate": 9.836393989983306e-05, + "loss": 0.3799, + "step": 1527 + }, + { + "epoch": 4.688700999231361, + "grad_norm": 0.7233520746231079, + "learning_rate": 9.829716193656095e-05, + "loss": 0.3514, + "step": 1528 + }, + { + "epoch": 4.691775557263643, + "grad_norm": 0.6044474244117737, + "learning_rate": 9.823038397328882e-05, + "loss": 0.3335, + "step": 1529 + }, + { + "epoch": 4.694850115295926, + "grad_norm": 0.938494861125946, + "learning_rate": 9.816360601001669e-05, + "loss": 0.3564, + "step": 1530 + }, + { + "epoch": 4.697924673328209, + "grad_norm": 0.7700350880622864, + "learning_rate": 9.809682804674458e-05, + "loss": 0.3767, + "step": 1531 + }, + { + "epoch": 4.700999231360492, + "grad_norm": 0.774013876914978, + "learning_rate": 9.803005008347245e-05, + "loss": 0.4254, + "step": 1532 + }, + { + "epoch": 4.704073789392774, + "grad_norm": 0.987633228302002, + "learning_rate": 9.796327212020034e-05, + "loss": 0.3796, + "step": 1533 + }, + { + "epoch": 4.707148347425058, + "grad_norm": 0.8716994524002075, + "learning_rate": 9.789649415692823e-05, + "loss": 0.3461, + "step": 1534 + }, + { + "epoch": 4.71022290545734, + "grad_norm": 1.3219870328903198, + "learning_rate": 9.78297161936561e-05, + "loss": 0.3371, + "step": 1535 + }, + { + "epoch": 4.713297463489623, + "grad_norm": 0.7755498886108398, + "learning_rate": 9.776293823038399e-05, + "loss": 0.2963, + "step": 1536 + }, + { + "epoch": 4.716372021521906, + "grad_norm": 0.8221181035041809, + "learning_rate": 9.769616026711186e-05, + "loss": 0.4356, + "step": 1537 + }, + { + "epoch": 4.719446579554189, + "grad_norm": 0.7704976797103882, + "learning_rate": 9.762938230383973e-05, + "loss": 0.3544, + "step": 1538 + }, + { + "epoch": 4.722521137586472, + "grad_norm": 0.9037477970123291, + "learning_rate": 9.756260434056762e-05, + "loss": 0.3885, + "step": 1539 + }, + { + "epoch": 4.725595695618755, + "grad_norm": 0.7319322228431702, + "learning_rate": 9.749582637729549e-05, + "loss": 0.3295, + "step": 1540 + }, + { + "epoch": 4.728670253651037, + "grad_norm": 1.0239067077636719, + "learning_rate": 9.742904841402337e-05, + "loss": 0.4706, + "step": 1541 + }, + { + "epoch": 4.7317448116833205, + "grad_norm": 0.8769973516464233, + "learning_rate": 9.736227045075125e-05, + "loss": 0.3697, + "step": 1542 + }, + { + "epoch": 4.734819369715604, + "grad_norm": 0.9179707169532776, + "learning_rate": 9.729549248747914e-05, + "loss": 0.4451, + "step": 1543 + }, + { + "epoch": 4.737893927747886, + "grad_norm": 0.8629128932952881, + "learning_rate": 9.722871452420703e-05, + "loss": 0.3772, + "step": 1544 + }, + { + "epoch": 4.740968485780169, + "grad_norm": 0.7455741763114929, + "learning_rate": 9.71619365609349e-05, + "loss": 0.3695, + "step": 1545 + }, + { + "epoch": 4.744043043812452, + "grad_norm": 0.8288558125495911, + "learning_rate": 9.709515859766277e-05, + "loss": 0.4511, + "step": 1546 + }, + { + "epoch": 4.747117601844735, + "grad_norm": 0.6822009682655334, + "learning_rate": 9.702838063439066e-05, + "loss": 0.3832, + "step": 1547 + }, + { + "epoch": 4.750192159877018, + "grad_norm": 0.7247387766838074, + "learning_rate": 9.696160267111853e-05, + "loss": 0.3628, + "step": 1548 + }, + { + "epoch": 4.753266717909301, + "grad_norm": 0.7800189256668091, + "learning_rate": 9.68948247078464e-05, + "loss": 0.3548, + "step": 1549 + }, + { + "epoch": 4.756341275941583, + "grad_norm": 1.8545207977294922, + "learning_rate": 9.682804674457429e-05, + "loss": 0.3329, + "step": 1550 + }, + { + "epoch": 4.759415833973867, + "grad_norm": 0.8365579843521118, + "learning_rate": 9.676126878130218e-05, + "loss": 0.3358, + "step": 1551 + }, + { + "epoch": 4.762490392006149, + "grad_norm": 0.8753309845924377, + "learning_rate": 9.669449081803006e-05, + "loss": 0.404, + "step": 1552 + }, + { + "epoch": 4.765564950038432, + "grad_norm": 1.0423812866210938, + "learning_rate": 9.662771285475794e-05, + "loss": 0.393, + "step": 1553 + }, + { + "epoch": 4.768639508070715, + "grad_norm": 0.9028570055961609, + "learning_rate": 9.656093489148581e-05, + "loss": 0.3943, + "step": 1554 + }, + { + "epoch": 4.771714066102998, + "grad_norm": 0.9643226265907288, + "learning_rate": 9.64941569282137e-05, + "loss": 0.4855, + "step": 1555 + }, + { + "epoch": 4.77478862413528, + "grad_norm": 0.9107238054275513, + "learning_rate": 9.642737896494157e-05, + "loss": 0.4372, + "step": 1556 + }, + { + "epoch": 4.777863182167564, + "grad_norm": 0.725831925868988, + "learning_rate": 9.636060100166944e-05, + "loss": 0.3901, + "step": 1557 + }, + { + "epoch": 4.780937740199846, + "grad_norm": 0.8662984371185303, + "learning_rate": 9.629382303839733e-05, + "loss": 0.3486, + "step": 1558 + }, + { + "epoch": 4.784012298232129, + "grad_norm": 0.6875986456871033, + "learning_rate": 9.62270450751252e-05, + "loss": 0.3593, + "step": 1559 + }, + { + "epoch": 4.787086856264412, + "grad_norm": 0.6532884836196899, + "learning_rate": 9.616026711185309e-05, + "loss": 0.3615, + "step": 1560 + }, + { + "epoch": 4.790161414296695, + "grad_norm": 0.7729180455207825, + "learning_rate": 9.609348914858098e-05, + "loss": 0.3268, + "step": 1561 + }, + { + "epoch": 4.7932359723289775, + "grad_norm": 1.191616177558899, + "learning_rate": 9.602671118530885e-05, + "loss": 0.4502, + "step": 1562 + }, + { + "epoch": 4.796310530361261, + "grad_norm": 0.7924370765686035, + "learning_rate": 9.595993322203674e-05, + "loss": 0.4504, + "step": 1563 + }, + { + "epoch": 4.799385088393543, + "grad_norm": 0.833450973033905, + "learning_rate": 9.589315525876461e-05, + "loss": 0.4081, + "step": 1564 + }, + { + "epoch": 4.8024596464258265, + "grad_norm": 1.320788025856018, + "learning_rate": 9.582637729549248e-05, + "loss": 0.3704, + "step": 1565 + }, + { + "epoch": 4.805534204458109, + "grad_norm": 0.9833523035049438, + "learning_rate": 9.575959933222037e-05, + "loss": 0.4213, + "step": 1566 + }, + { + "epoch": 4.808608762490392, + "grad_norm": 0.7859238386154175, + "learning_rate": 9.569282136894824e-05, + "loss": 0.4225, + "step": 1567 + }, + { + "epoch": 4.811683320522675, + "grad_norm": 0.6075074076652527, + "learning_rate": 9.562604340567613e-05, + "loss": 0.3745, + "step": 1568 + }, + { + "epoch": 4.814757878554958, + "grad_norm": 0.8117219805717468, + "learning_rate": 9.555926544240402e-05, + "loss": 0.3762, + "step": 1569 + }, + { + "epoch": 4.81783243658724, + "grad_norm": 0.7648201584815979, + "learning_rate": 9.549248747913189e-05, + "loss": 0.3781, + "step": 1570 + }, + { + "epoch": 4.8209069946195235, + "grad_norm": 0.8862608671188354, + "learning_rate": 9.542570951585978e-05, + "loss": 0.3936, + "step": 1571 + }, + { + "epoch": 4.823981552651806, + "grad_norm": 0.8977257609367371, + "learning_rate": 9.535893155258765e-05, + "loss": 0.4147, + "step": 1572 + }, + { + "epoch": 4.827056110684089, + "grad_norm": 0.6795991659164429, + "learning_rate": 9.529215358931554e-05, + "loss": 0.4196, + "step": 1573 + }, + { + "epoch": 4.830130668716372, + "grad_norm": 0.6213774085044861, + "learning_rate": 9.522537562604341e-05, + "loss": 0.3451, + "step": 1574 + }, + { + "epoch": 4.833205226748655, + "grad_norm": 0.8230448961257935, + "learning_rate": 9.515859766277128e-05, + "loss": 0.3525, + "step": 1575 + }, + { + "epoch": 4.836279784780938, + "grad_norm": 1.0086671113967896, + "learning_rate": 9.509181969949917e-05, + "loss": 0.347, + "step": 1576 + }, + { + "epoch": 4.839354342813221, + "grad_norm": 1.0692055225372314, + "learning_rate": 9.502504173622706e-05, + "loss": 0.3153, + "step": 1577 + }, + { + "epoch": 4.842428900845503, + "grad_norm": 0.7910997271537781, + "learning_rate": 9.495826377295493e-05, + "loss": 0.3721, + "step": 1578 + }, + { + "epoch": 4.845503458877786, + "grad_norm": 1.0143672227859497, + "learning_rate": 9.489148580968282e-05, + "loss": 0.2858, + "step": 1579 + }, + { + "epoch": 4.84857801691007, + "grad_norm": 0.8259998559951782, + "learning_rate": 9.482470784641069e-05, + "loss": 0.4454, + "step": 1580 + }, + { + "epoch": 4.851652574942352, + "grad_norm": 0.9319655299186707, + "learning_rate": 9.475792988313858e-05, + "loss": 0.2912, + "step": 1581 + }, + { + "epoch": 4.854727132974634, + "grad_norm": 0.7429136633872986, + "learning_rate": 9.469115191986645e-05, + "loss": 0.402, + "step": 1582 + }, + { + "epoch": 4.857801691006918, + "grad_norm": 0.96834397315979, + "learning_rate": 9.462437395659432e-05, + "loss": 0.5215, + "step": 1583 + }, + { + "epoch": 4.860876249039201, + "grad_norm": 0.7908016443252563, + "learning_rate": 9.455759599332221e-05, + "loss": 0.4318, + "step": 1584 + }, + { + "epoch": 4.863950807071483, + "grad_norm": 0.7773927450180054, + "learning_rate": 9.449081803005008e-05, + "loss": 0.2145, + "step": 1585 + }, + { + "epoch": 4.867025365103767, + "grad_norm": 0.8596830368041992, + "learning_rate": 9.442404006677797e-05, + "loss": 0.3767, + "step": 1586 + }, + { + "epoch": 4.870099923136049, + "grad_norm": 0.9522383213043213, + "learning_rate": 9.435726210350586e-05, + "loss": 0.4253, + "step": 1587 + }, + { + "epoch": 4.873174481168332, + "grad_norm": 0.878300666809082, + "learning_rate": 9.429048414023373e-05, + "loss": 0.4303, + "step": 1588 + }, + { + "epoch": 4.876249039200615, + "grad_norm": 1.1075037717819214, + "learning_rate": 9.422370617696162e-05, + "loss": 0.2863, + "step": 1589 + }, + { + "epoch": 4.879323597232898, + "grad_norm": 1.001308560371399, + "learning_rate": 9.415692821368949e-05, + "loss": 0.3345, + "step": 1590 + }, + { + "epoch": 4.8823981552651805, + "grad_norm": 1.6283379793167114, + "learning_rate": 9.409015025041736e-05, + "loss": 0.4163, + "step": 1591 + }, + { + "epoch": 4.885472713297464, + "grad_norm": 1.2115163803100586, + "learning_rate": 9.402337228714525e-05, + "loss": 0.3277, + "step": 1592 + }, + { + "epoch": 4.888547271329746, + "grad_norm": 0.9039791226387024, + "learning_rate": 9.395659432387312e-05, + "loss": 0.3978, + "step": 1593 + }, + { + "epoch": 4.8916218293620295, + "grad_norm": 0.9173548221588135, + "learning_rate": 9.388981636060101e-05, + "loss": 0.3724, + "step": 1594 + }, + { + "epoch": 4.894696387394312, + "grad_norm": 1.280786395072937, + "learning_rate": 9.38230383973289e-05, + "loss": 0.504, + "step": 1595 + }, + { + "epoch": 4.897770945426595, + "grad_norm": 1.13883638381958, + "learning_rate": 9.375626043405677e-05, + "loss": 0.2902, + "step": 1596 + }, + { + "epoch": 4.900845503458878, + "grad_norm": 0.7668140530586243, + "learning_rate": 9.368948247078465e-05, + "loss": 0.3323, + "step": 1597 + }, + { + "epoch": 4.903920061491161, + "grad_norm": 0.7449612021446228, + "learning_rate": 9.362270450751253e-05, + "loss": 0.3363, + "step": 1598 + }, + { + "epoch": 4.906994619523443, + "grad_norm": 0.9896752834320068, + "learning_rate": 9.35559265442404e-05, + "loss": 0.4381, + "step": 1599 + }, + { + "epoch": 4.910069177555727, + "grad_norm": 0.8678106069564819, + "learning_rate": 9.348914858096829e-05, + "loss": 0.3658, + "step": 1600 + }, + { + "epoch": 4.913143735588009, + "grad_norm": 0.9787524342536926, + "learning_rate": 9.342237061769616e-05, + "loss": 0.2971, + "step": 1601 + }, + { + "epoch": 4.916218293620292, + "grad_norm": 0.8093075752258301, + "learning_rate": 9.335559265442403e-05, + "loss": 0.3491, + "step": 1602 + }, + { + "epoch": 4.919292851652575, + "grad_norm": 1.5801624059677124, + "learning_rate": 9.328881469115192e-05, + "loss": 0.4743, + "step": 1603 + }, + { + "epoch": 4.922367409684858, + "grad_norm": 0.6919710040092468, + "learning_rate": 9.322203672787981e-05, + "loss": 0.355, + "step": 1604 + }, + { + "epoch": 4.92544196771714, + "grad_norm": 0.8053343892097473, + "learning_rate": 9.31552587646077e-05, + "loss": 0.3963, + "step": 1605 + }, + { + "epoch": 4.928516525749424, + "grad_norm": 0.8745597004890442, + "learning_rate": 9.308848080133557e-05, + "loss": 0.445, + "step": 1606 + }, + { + "epoch": 4.931591083781706, + "grad_norm": 0.6608087420463562, + "learning_rate": 9.302170283806344e-05, + "loss": 0.3536, + "step": 1607 + }, + { + "epoch": 4.934665641813989, + "grad_norm": 0.6686768531799316, + "learning_rate": 9.295492487479133e-05, + "loss": 0.3352, + "step": 1608 + }, + { + "epoch": 4.937740199846272, + "grad_norm": 1.0998315811157227, + "learning_rate": 9.28881469115192e-05, + "loss": 0.4367, + "step": 1609 + }, + { + "epoch": 4.940814757878555, + "grad_norm": 1.2435147762298584, + "learning_rate": 9.282136894824707e-05, + "loss": 0.4703, + "step": 1610 + }, + { + "epoch": 4.9438893159108375, + "grad_norm": 0.6086277365684509, + "learning_rate": 9.275459098497496e-05, + "loss": 0.3431, + "step": 1611 + }, + { + "epoch": 4.946963873943121, + "grad_norm": 0.6807745099067688, + "learning_rate": 9.268781302170285e-05, + "loss": 0.3895, + "step": 1612 + }, + { + "epoch": 4.950038431975403, + "grad_norm": 0.9875394105911255, + "learning_rate": 9.262103505843073e-05, + "loss": 0.4324, + "step": 1613 + }, + { + "epoch": 4.9531129900076865, + "grad_norm": 0.7492454051971436, + "learning_rate": 9.255425709515861e-05, + "loss": 0.3759, + "step": 1614 + }, + { + "epoch": 4.956187548039969, + "grad_norm": 0.8571634292602539, + "learning_rate": 9.248747913188648e-05, + "loss": 0.465, + "step": 1615 + }, + { + "epoch": 4.959262106072252, + "grad_norm": 0.8077588677406311, + "learning_rate": 9.242070116861437e-05, + "loss": 0.4365, + "step": 1616 + }, + { + "epoch": 4.9623366641045354, + "grad_norm": 1.1906275749206543, + "learning_rate": 9.235392320534224e-05, + "loss": 0.4708, + "step": 1617 + }, + { + "epoch": 4.965411222136818, + "grad_norm": 0.8662446737289429, + "learning_rate": 9.228714524207011e-05, + "loss": 0.3778, + "step": 1618 + }, + { + "epoch": 4.9684857801691, + "grad_norm": 1.0619062185287476, + "learning_rate": 9.2220367278798e-05, + "loss": 0.4452, + "step": 1619 + }, + { + "epoch": 4.9715603382013835, + "grad_norm": 0.7930029034614563, + "learning_rate": 9.215358931552587e-05, + "loss": 0.4003, + "step": 1620 + }, + { + "epoch": 4.974634896233667, + "grad_norm": 0.7096540331840515, + "learning_rate": 9.208681135225376e-05, + "loss": 0.3177, + "step": 1621 + }, + { + "epoch": 4.977709454265949, + "grad_norm": 0.673022985458374, + "learning_rate": 9.202003338898165e-05, + "loss": 0.4044, + "step": 1622 + }, + { + "epoch": 4.980784012298232, + "grad_norm": 0.7269219756126404, + "learning_rate": 9.195325542570952e-05, + "loss": 0.3602, + "step": 1623 + }, + { + "epoch": 4.983858570330515, + "grad_norm": 0.617123007774353, + "learning_rate": 9.18864774624374e-05, + "loss": 0.2814, + "step": 1624 + }, + { + "epoch": 4.986933128362798, + "grad_norm": 1.1843900680541992, + "learning_rate": 9.181969949916528e-05, + "loss": 0.4494, + "step": 1625 + }, + { + "epoch": 4.990007686395081, + "grad_norm": 0.8236628770828247, + "learning_rate": 9.175292153589315e-05, + "loss": 0.394, + "step": 1626 + }, + { + "epoch": 4.993082244427364, + "grad_norm": 0.7394270896911621, + "learning_rate": 9.168614357262104e-05, + "loss": 0.3729, + "step": 1627 + }, + { + "epoch": 4.996156802459646, + "grad_norm": 1.1829383373260498, + "learning_rate": 9.161936560934891e-05, + "loss": 0.4782, + "step": 1628 + }, + { + "epoch": 4.99923136049193, + "grad_norm": 0.8535853028297424, + "learning_rate": 9.15525876460768e-05, + "loss": 0.3974, + "step": 1629 + }, + { + "epoch": 5.0, + "grad_norm": 2.5640904903411865, + "learning_rate": 9.148580968280469e-05, + "loss": 0.4622, + "step": 1630 + }, + { + "epoch": 5.003074558032283, + "grad_norm": 0.572605311870575, + "learning_rate": 9.141903171953256e-05, + "loss": 0.2781, + "step": 1631 + }, + { + "epoch": 5.006149116064566, + "grad_norm": 0.623123824596405, + "learning_rate": 9.135225375626045e-05, + "loss": 0.333, + "step": 1632 + }, + { + "epoch": 5.009223674096849, + "grad_norm": 0.6070964932441711, + "learning_rate": 9.128547579298832e-05, + "loss": 0.2586, + "step": 1633 + }, + { + "epoch": 5.012298232129131, + "grad_norm": 0.49212580919265747, + "learning_rate": 9.121869782971619e-05, + "loss": 0.205, + "step": 1634 + }, + { + "epoch": 5.015372790161415, + "grad_norm": 0.6224344372749329, + "learning_rate": 9.115191986644408e-05, + "loss": 0.3319, + "step": 1635 + }, + { + "epoch": 5.018447348193697, + "grad_norm": 0.9165223240852356, + "learning_rate": 9.108514190317195e-05, + "loss": 0.4182, + "step": 1636 + }, + { + "epoch": 5.02152190622598, + "grad_norm": 0.4661840498447418, + "learning_rate": 9.101836393989984e-05, + "loss": 0.2065, + "step": 1637 + }, + { + "epoch": 5.024596464258263, + "grad_norm": 0.5676157474517822, + "learning_rate": 9.095158597662771e-05, + "loss": 0.2128, + "step": 1638 + }, + { + "epoch": 5.027671022290546, + "grad_norm": 0.7022117972373962, + "learning_rate": 9.08848080133556e-05, + "loss": 0.3238, + "step": 1639 + }, + { + "epoch": 5.0307455803228285, + "grad_norm": 0.5956372022628784, + "learning_rate": 9.081803005008348e-05, + "loss": 0.3295, + "step": 1640 + }, + { + "epoch": 5.033820138355112, + "grad_norm": 0.6279803514480591, + "learning_rate": 9.075125208681136e-05, + "loss": 0.2878, + "step": 1641 + }, + { + "epoch": 5.036894696387394, + "grad_norm": 0.624049961566925, + "learning_rate": 9.068447412353923e-05, + "loss": 0.2526, + "step": 1642 + }, + { + "epoch": 5.0399692544196775, + "grad_norm": 1.4161157608032227, + "learning_rate": 9.061769616026712e-05, + "loss": 0.3716, + "step": 1643 + }, + { + "epoch": 5.04304381245196, + "grad_norm": 0.6286519765853882, + "learning_rate": 9.055091819699499e-05, + "loss": 0.1649, + "step": 1644 + }, + { + "epoch": 5.046118370484243, + "grad_norm": 0.9992043972015381, + "learning_rate": 9.048414023372288e-05, + "loss": 0.3121, + "step": 1645 + }, + { + "epoch": 5.049192928516526, + "grad_norm": 0.6721956133842468, + "learning_rate": 9.041736227045075e-05, + "loss": 0.3164, + "step": 1646 + }, + { + "epoch": 5.052267486548809, + "grad_norm": 0.9115797877311707, + "learning_rate": 9.035058430717864e-05, + "loss": 0.3463, + "step": 1647 + }, + { + "epoch": 5.055342044581091, + "grad_norm": 1.3086482286453247, + "learning_rate": 9.028380634390652e-05, + "loss": 0.3249, + "step": 1648 + }, + { + "epoch": 5.058416602613375, + "grad_norm": 0.935232937335968, + "learning_rate": 9.02170283806344e-05, + "loss": 0.3646, + "step": 1649 + }, + { + "epoch": 5.061491160645657, + "grad_norm": 0.6877484917640686, + "learning_rate": 9.015025041736227e-05, + "loss": 0.33, + "step": 1650 + }, + { + "epoch": 5.06456571867794, + "grad_norm": 0.9156901836395264, + "learning_rate": 9.008347245409016e-05, + "loss": 0.3068, + "step": 1651 + }, + { + "epoch": 5.067640276710223, + "grad_norm": 0.8413227796554565, + "learning_rate": 9.001669449081803e-05, + "loss": 0.4284, + "step": 1652 + }, + { + "epoch": 5.070714834742506, + "grad_norm": 1.406680703163147, + "learning_rate": 8.994991652754592e-05, + "loss": 0.2722, + "step": 1653 + }, + { + "epoch": 5.073789392774788, + "grad_norm": 1.240125060081482, + "learning_rate": 8.988313856427379e-05, + "loss": 0.2941, + "step": 1654 + }, + { + "epoch": 5.076863950807072, + "grad_norm": 0.9402351379394531, + "learning_rate": 8.981636060100166e-05, + "loss": 0.3144, + "step": 1655 + }, + { + "epoch": 5.079938508839354, + "grad_norm": 0.972048819065094, + "learning_rate": 8.974958263772955e-05, + "loss": 0.3534, + "step": 1656 + }, + { + "epoch": 5.083013066871637, + "grad_norm": 0.8675603270530701, + "learning_rate": 8.968280467445744e-05, + "loss": 0.2585, + "step": 1657 + }, + { + "epoch": 5.08608762490392, + "grad_norm": 1.2413748502731323, + "learning_rate": 8.961602671118531e-05, + "loss": 0.3368, + "step": 1658 + }, + { + "epoch": 5.089162182936203, + "grad_norm": 0.35579174757003784, + "learning_rate": 8.95492487479132e-05, + "loss": 0.2431, + "step": 1659 + }, + { + "epoch": 5.092236740968485, + "grad_norm": 1.1538844108581543, + "learning_rate": 8.948247078464107e-05, + "loss": 0.321, + "step": 1660 + }, + { + "epoch": 5.095311299000769, + "grad_norm": 1.1886178255081177, + "learning_rate": 8.941569282136896e-05, + "loss": 0.2492, + "step": 1661 + }, + { + "epoch": 5.098385857033051, + "grad_norm": 0.5046135783195496, + "learning_rate": 8.934891485809683e-05, + "loss": 0.2312, + "step": 1662 + }, + { + "epoch": 5.101460415065334, + "grad_norm": 0.644220232963562, + "learning_rate": 8.92821368948247e-05, + "loss": 0.2498, + "step": 1663 + }, + { + "epoch": 5.104534973097617, + "grad_norm": 1.113159418106079, + "learning_rate": 8.921535893155259e-05, + "loss": 0.3425, + "step": 1664 + }, + { + "epoch": 5.1076095311299, + "grad_norm": 0.6977350115776062, + "learning_rate": 8.914858096828048e-05, + "loss": 0.3435, + "step": 1665 + }, + { + "epoch": 5.1106840891621825, + "grad_norm": 0.7484399080276489, + "learning_rate": 8.908180300500835e-05, + "loss": 0.2939, + "step": 1666 + }, + { + "epoch": 5.113758647194466, + "grad_norm": 0.9543803930282593, + "learning_rate": 8.901502504173624e-05, + "loss": 0.3154, + "step": 1667 + }, + { + "epoch": 5.116833205226748, + "grad_norm": 0.9736766219139099, + "learning_rate": 8.894824707846411e-05, + "loss": 0.2569, + "step": 1668 + }, + { + "epoch": 5.1199077632590315, + "grad_norm": 1.1530828475952148, + "learning_rate": 8.8881469115192e-05, + "loss": 0.3189, + "step": 1669 + }, + { + "epoch": 5.122982321291314, + "grad_norm": 0.6867527365684509, + "learning_rate": 8.881469115191987e-05, + "loss": 0.2784, + "step": 1670 + }, + { + "epoch": 5.126056879323597, + "grad_norm": 0.9009777307510376, + "learning_rate": 8.874791318864774e-05, + "loss": 0.2885, + "step": 1671 + }, + { + "epoch": 5.1291314373558805, + "grad_norm": 1.6015172004699707, + "learning_rate": 8.868113522537563e-05, + "loss": 0.2062, + "step": 1672 + }, + { + "epoch": 5.132205995388163, + "grad_norm": 0.7882331013679504, + "learning_rate": 8.86143572621035e-05, + "loss": 0.3244, + "step": 1673 + }, + { + "epoch": 5.135280553420446, + "grad_norm": 0.9412429332733154, + "learning_rate": 8.854757929883139e-05, + "loss": 0.3171, + "step": 1674 + }, + { + "epoch": 5.138355111452729, + "grad_norm": 0.6887170672416687, + "learning_rate": 8.848080133555928e-05, + "loss": 0.3405, + "step": 1675 + }, + { + "epoch": 5.141429669485012, + "grad_norm": 0.7920656800270081, + "learning_rate": 8.841402337228715e-05, + "loss": 0.3431, + "step": 1676 + }, + { + "epoch": 5.144504227517294, + "grad_norm": 0.663131594657898, + "learning_rate": 8.834724540901504e-05, + "loss": 0.2914, + "step": 1677 + }, + { + "epoch": 5.147578785549578, + "grad_norm": 0.9940230250358582, + "learning_rate": 8.828046744574291e-05, + "loss": 0.398, + "step": 1678 + }, + { + "epoch": 5.15065334358186, + "grad_norm": 1.2957160472869873, + "learning_rate": 8.821368948247078e-05, + "loss": 0.3417, + "step": 1679 + }, + { + "epoch": 5.153727901614143, + "grad_norm": 0.8957284092903137, + "learning_rate": 8.814691151919867e-05, + "loss": 0.2058, + "step": 1680 + }, + { + "epoch": 5.156802459646426, + "grad_norm": 1.5182899236679077, + "learning_rate": 8.808013355592654e-05, + "loss": 0.3196, + "step": 1681 + }, + { + "epoch": 5.159877017678709, + "grad_norm": 0.98117995262146, + "learning_rate": 8.801335559265443e-05, + "loss": 0.3348, + "step": 1682 + }, + { + "epoch": 5.162951575710991, + "grad_norm": 0.9935417175292969, + "learning_rate": 8.794657762938232e-05, + "loss": 0.1992, + "step": 1683 + }, + { + "epoch": 5.166026133743275, + "grad_norm": 0.6094648838043213, + "learning_rate": 8.787979966611019e-05, + "loss": 0.2138, + "step": 1684 + }, + { + "epoch": 5.169100691775557, + "grad_norm": 0.7856438755989075, + "learning_rate": 8.781302170283808e-05, + "loss": 0.2625, + "step": 1685 + }, + { + "epoch": 5.17217524980784, + "grad_norm": 0.7598311305046082, + "learning_rate": 8.774624373956595e-05, + "loss": 0.2635, + "step": 1686 + }, + { + "epoch": 5.175249807840123, + "grad_norm": 1.2613056898117065, + "learning_rate": 8.767946577629382e-05, + "loss": 0.3251, + "step": 1687 + }, + { + "epoch": 5.178324365872406, + "grad_norm": 1.7386010885238647, + "learning_rate": 8.761268781302171e-05, + "loss": 0.2677, + "step": 1688 + }, + { + "epoch": 5.1813989239046885, + "grad_norm": 0.7499911189079285, + "learning_rate": 8.754590984974958e-05, + "loss": 0.2974, + "step": 1689 + }, + { + "epoch": 5.184473481936972, + "grad_norm": 0.6865500211715698, + "learning_rate": 8.747913188647745e-05, + "loss": 0.3266, + "step": 1690 + }, + { + "epoch": 5.187548039969254, + "grad_norm": 0.8432502150535583, + "learning_rate": 8.741235392320535e-05, + "loss": 0.2287, + "step": 1691 + }, + { + "epoch": 5.1906225980015375, + "grad_norm": 1.0338119268417358, + "learning_rate": 8.734557595993323e-05, + "loss": 0.2959, + "step": 1692 + }, + { + "epoch": 5.19369715603382, + "grad_norm": 0.7273797988891602, + "learning_rate": 8.727879799666111e-05, + "loss": 0.3644, + "step": 1693 + }, + { + "epoch": 5.196771714066103, + "grad_norm": 0.9218087196350098, + "learning_rate": 8.721202003338899e-05, + "loss": 0.282, + "step": 1694 + }, + { + "epoch": 5.199846272098386, + "grad_norm": 0.49654561281204224, + "learning_rate": 8.714524207011686e-05, + "loss": 0.2808, + "step": 1695 + }, + { + "epoch": 5.202920830130669, + "grad_norm": 1.4503116607666016, + "learning_rate": 8.707846410684475e-05, + "loss": 0.3207, + "step": 1696 + }, + { + "epoch": 5.205995388162951, + "grad_norm": 0.7454671859741211, + "learning_rate": 8.701168614357262e-05, + "loss": 0.2838, + "step": 1697 + }, + { + "epoch": 5.209069946195235, + "grad_norm": 0.7439486980438232, + "learning_rate": 8.694490818030051e-05, + "loss": 0.2806, + "step": 1698 + }, + { + "epoch": 5.212144504227517, + "grad_norm": 1.0724588632583618, + "learning_rate": 8.687813021702838e-05, + "loss": 0.2592, + "step": 1699 + }, + { + "epoch": 5.2152190622598, + "grad_norm": 1.1502668857574463, + "learning_rate": 8.681135225375627e-05, + "loss": 0.3127, + "step": 1700 + }, + { + "epoch": 5.218293620292083, + "grad_norm": 0.7341523170471191, + "learning_rate": 8.674457429048415e-05, + "loss": 0.304, + "step": 1701 + }, + { + "epoch": 5.221368178324366, + "grad_norm": 0.8837214112281799, + "learning_rate": 8.667779632721203e-05, + "loss": 0.3238, + "step": 1702 + }, + { + "epoch": 5.224442736356648, + "grad_norm": 0.9992470145225525, + "learning_rate": 8.66110183639399e-05, + "loss": 0.2472, + "step": 1703 + }, + { + "epoch": 5.227517294388932, + "grad_norm": 0.569851279258728, + "learning_rate": 8.654424040066779e-05, + "loss": 0.2762, + "step": 1704 + }, + { + "epoch": 5.230591852421214, + "grad_norm": 0.9270056486129761, + "learning_rate": 8.647746243739566e-05, + "loss": 0.3032, + "step": 1705 + }, + { + "epoch": 5.233666410453497, + "grad_norm": 0.9279311895370483, + "learning_rate": 8.641068447412355e-05, + "loss": 0.3044, + "step": 1706 + }, + { + "epoch": 5.23674096848578, + "grad_norm": 0.905545175075531, + "learning_rate": 8.634390651085142e-05, + "loss": 0.3282, + "step": 1707 + }, + { + "epoch": 5.239815526518063, + "grad_norm": 0.7078321576118469, + "learning_rate": 8.62771285475793e-05, + "loss": 0.2272, + "step": 1708 + }, + { + "epoch": 5.242890084550346, + "grad_norm": 0.8569689393043518, + "learning_rate": 8.62103505843072e-05, + "loss": 0.3754, + "step": 1709 + }, + { + "epoch": 5.245964642582629, + "grad_norm": 0.9020428657531738, + "learning_rate": 8.614357262103507e-05, + "loss": 0.3115, + "step": 1710 + }, + { + "epoch": 5.249039200614912, + "grad_norm": 0.8380615711212158, + "learning_rate": 8.607679465776294e-05, + "loss": 0.2889, + "step": 1711 + }, + { + "epoch": 5.252113758647194, + "grad_norm": 0.6772667765617371, + "learning_rate": 8.601001669449083e-05, + "loss": 0.2722, + "step": 1712 + }, + { + "epoch": 5.255188316679478, + "grad_norm": 0.9966198801994324, + "learning_rate": 8.59432387312187e-05, + "loss": 0.3286, + "step": 1713 + }, + { + "epoch": 5.25826287471176, + "grad_norm": 0.7050550580024719, + "learning_rate": 8.587646076794659e-05, + "loss": 0.2526, + "step": 1714 + }, + { + "epoch": 5.261337432744043, + "grad_norm": 0.6464506983757019, + "learning_rate": 8.580968280467446e-05, + "loss": 0.254, + "step": 1715 + }, + { + "epoch": 5.264411990776326, + "grad_norm": 0.7716936469078064, + "learning_rate": 8.574290484140233e-05, + "loss": 0.2742, + "step": 1716 + }, + { + "epoch": 5.267486548808609, + "grad_norm": 0.746012806892395, + "learning_rate": 8.567612687813022e-05, + "loss": 0.4213, + "step": 1717 + }, + { + "epoch": 5.2705611068408915, + "grad_norm": 0.8593916893005371, + "learning_rate": 8.56093489148581e-05, + "loss": 0.3349, + "step": 1718 + }, + { + "epoch": 5.273635664873175, + "grad_norm": 0.7389137148857117, + "learning_rate": 8.554257095158598e-05, + "loss": 0.3449, + "step": 1719 + }, + { + "epoch": 5.276710222905457, + "grad_norm": 1.1622214317321777, + "learning_rate": 8.547579298831387e-05, + "loss": 0.3472, + "step": 1720 + }, + { + "epoch": 5.2797847809377405, + "grad_norm": 0.5685468316078186, + "learning_rate": 8.540901502504174e-05, + "loss": 0.2676, + "step": 1721 + }, + { + "epoch": 5.282859338970023, + "grad_norm": 0.8736433982849121, + "learning_rate": 8.534223706176963e-05, + "loss": 0.2126, + "step": 1722 + }, + { + "epoch": 5.285933897002306, + "grad_norm": 0.7043049931526184, + "learning_rate": 8.52754590984975e-05, + "loss": 0.3439, + "step": 1723 + }, + { + "epoch": 5.289008455034589, + "grad_norm": 1.075692057609558, + "learning_rate": 8.520868113522537e-05, + "loss": 0.4651, + "step": 1724 + }, + { + "epoch": 5.292083013066872, + "grad_norm": 0.8230921030044556, + "learning_rate": 8.514190317195326e-05, + "loss": 0.2736, + "step": 1725 + }, + { + "epoch": 5.295157571099154, + "grad_norm": 0.9219911694526672, + "learning_rate": 8.507512520868115e-05, + "loss": 0.2919, + "step": 1726 + }, + { + "epoch": 5.298232129131438, + "grad_norm": 0.6772744059562683, + "learning_rate": 8.500834724540902e-05, + "loss": 0.2923, + "step": 1727 + }, + { + "epoch": 5.30130668716372, + "grad_norm": 1.0341936349868774, + "learning_rate": 8.49415692821369e-05, + "loss": 0.325, + "step": 1728 + }, + { + "epoch": 5.304381245196003, + "grad_norm": 0.6965529322624207, + "learning_rate": 8.487479131886478e-05, + "loss": 0.2289, + "step": 1729 + }, + { + "epoch": 5.307455803228286, + "grad_norm": 0.8680992722511292, + "learning_rate": 8.480801335559267e-05, + "loss": 0.3257, + "step": 1730 + }, + { + "epoch": 5.310530361260569, + "grad_norm": 0.8042769432067871, + "learning_rate": 8.474123539232054e-05, + "loss": 0.2816, + "step": 1731 + }, + { + "epoch": 5.313604919292851, + "grad_norm": 1.2106633186340332, + "learning_rate": 8.467445742904841e-05, + "loss": 0.2709, + "step": 1732 + }, + { + "epoch": 5.316679477325135, + "grad_norm": 0.6236171722412109, + "learning_rate": 8.46076794657763e-05, + "loss": 0.223, + "step": 1733 + }, + { + "epoch": 5.319754035357417, + "grad_norm": 0.7089080214500427, + "learning_rate": 8.454090150250417e-05, + "loss": 0.3727, + "step": 1734 + }, + { + "epoch": 5.3228285933897, + "grad_norm": 0.9685229659080505, + "learning_rate": 8.447412353923206e-05, + "loss": 0.3011, + "step": 1735 + }, + { + "epoch": 5.325903151421983, + "grad_norm": 0.8630408644676208, + "learning_rate": 8.440734557595994e-05, + "loss": 0.2856, + "step": 1736 + }, + { + "epoch": 5.328977709454266, + "grad_norm": 0.8283337950706482, + "learning_rate": 8.434056761268782e-05, + "loss": 0.2499, + "step": 1737 + }, + { + "epoch": 5.3320522674865485, + "grad_norm": 0.6598505973815918, + "learning_rate": 8.42737896494157e-05, + "loss": 0.3935, + "step": 1738 + }, + { + "epoch": 5.335126825518832, + "grad_norm": 0.7599532008171082, + "learning_rate": 8.420701168614358e-05, + "loss": 0.3502, + "step": 1739 + }, + { + "epoch": 5.338201383551114, + "grad_norm": 0.9693306684494019, + "learning_rate": 8.414023372287145e-05, + "loss": 0.5447, + "step": 1740 + }, + { + "epoch": 5.3412759415833975, + "grad_norm": 1.4195016622543335, + "learning_rate": 8.407345575959934e-05, + "loss": 0.3302, + "step": 1741 + }, + { + "epoch": 5.34435049961568, + "grad_norm": 1.2126317024230957, + "learning_rate": 8.400667779632721e-05, + "loss": 0.355, + "step": 1742 + }, + { + "epoch": 5.347425057647963, + "grad_norm": 0.8964106440544128, + "learning_rate": 8.39398998330551e-05, + "loss": 0.3305, + "step": 1743 + }, + { + "epoch": 5.350499615680246, + "grad_norm": 0.5942551493644714, + "learning_rate": 8.387312186978298e-05, + "loss": 0.2959, + "step": 1744 + }, + { + "epoch": 5.353574173712529, + "grad_norm": 0.6881222724914551, + "learning_rate": 8.380634390651086e-05, + "loss": 0.279, + "step": 1745 + }, + { + "epoch": 5.356648731744812, + "grad_norm": 0.6308599710464478, + "learning_rate": 8.373956594323874e-05, + "loss": 0.2354, + "step": 1746 + }, + { + "epoch": 5.359723289777095, + "grad_norm": 0.7263630628585815, + "learning_rate": 8.367278797996662e-05, + "loss": 0.3054, + "step": 1747 + }, + { + "epoch": 5.362797847809377, + "grad_norm": 0.7219898104667664, + "learning_rate": 8.360601001669449e-05, + "loss": 0.2876, + "step": 1748 + }, + { + "epoch": 5.36587240584166, + "grad_norm": 0.8973868489265442, + "learning_rate": 8.353923205342238e-05, + "loss": 0.3736, + "step": 1749 + }, + { + "epoch": 5.3689469638739435, + "grad_norm": 0.757659375667572, + "learning_rate": 8.347245409015025e-05, + "loss": 0.3041, + "step": 1750 + }, + { + "epoch": 5.372021521906226, + "grad_norm": 0.8917866349220276, + "learning_rate": 8.340567612687812e-05, + "loss": 0.3019, + "step": 1751 + }, + { + "epoch": 5.375096079938509, + "grad_norm": 0.6132904291152954, + "learning_rate": 8.333889816360601e-05, + "loss": 0.2637, + "step": 1752 + }, + { + "epoch": 5.378170637970792, + "grad_norm": 0.9521093964576721, + "learning_rate": 8.32721202003339e-05, + "loss": 0.349, + "step": 1753 + }, + { + "epoch": 5.381245196003075, + "grad_norm": 0.503698468208313, + "learning_rate": 8.320534223706178e-05, + "loss": 0.2973, + "step": 1754 + }, + { + "epoch": 5.384319754035357, + "grad_norm": 1.0433109998703003, + "learning_rate": 8.313856427378966e-05, + "loss": 0.3069, + "step": 1755 + }, + { + "epoch": 5.387394312067641, + "grad_norm": 1.374406099319458, + "learning_rate": 8.307178631051753e-05, + "loss": 0.3536, + "step": 1756 + }, + { + "epoch": 5.390468870099923, + "grad_norm": 0.7242358922958374, + "learning_rate": 8.300500834724542e-05, + "loss": 0.3207, + "step": 1757 + }, + { + "epoch": 5.393543428132206, + "grad_norm": 0.8785935640335083, + "learning_rate": 8.293823038397329e-05, + "loss": 0.3405, + "step": 1758 + }, + { + "epoch": 5.396617986164489, + "grad_norm": 1.2258713245391846, + "learning_rate": 8.287145242070116e-05, + "loss": 0.2641, + "step": 1759 + }, + { + "epoch": 5.399692544196772, + "grad_norm": 1.837854027748108, + "learning_rate": 8.280467445742905e-05, + "loss": 0.3816, + "step": 1760 + }, + { + "epoch": 5.402767102229054, + "grad_norm": 0.7135657072067261, + "learning_rate": 8.273789649415694e-05, + "loss": 0.2461, + "step": 1761 + }, + { + "epoch": 5.405841660261338, + "grad_norm": 0.8239970803260803, + "learning_rate": 8.267111853088482e-05, + "loss": 0.2936, + "step": 1762 + }, + { + "epoch": 5.40891621829362, + "grad_norm": 0.6553420424461365, + "learning_rate": 8.26043405676127e-05, + "loss": 0.3007, + "step": 1763 + }, + { + "epoch": 5.411990776325903, + "grad_norm": 1.8108381032943726, + "learning_rate": 8.253756260434057e-05, + "loss": 0.4522, + "step": 1764 + }, + { + "epoch": 5.415065334358186, + "grad_norm": 0.8653173446655273, + "learning_rate": 8.247078464106846e-05, + "loss": 0.3126, + "step": 1765 + }, + { + "epoch": 5.418139892390469, + "grad_norm": 0.7217906713485718, + "learning_rate": 8.240400667779633e-05, + "loss": 0.2043, + "step": 1766 + }, + { + "epoch": 5.4212144504227515, + "grad_norm": 0.90260910987854, + "learning_rate": 8.23372287145242e-05, + "loss": 0.2376, + "step": 1767 + }, + { + "epoch": 5.424289008455035, + "grad_norm": 0.8756963014602661, + "learning_rate": 8.227045075125209e-05, + "loss": 0.3742, + "step": 1768 + }, + { + "epoch": 5.427363566487317, + "grad_norm": 0.7344855070114136, + "learning_rate": 8.220367278797996e-05, + "loss": 0.3112, + "step": 1769 + }, + { + "epoch": 5.4304381245196005, + "grad_norm": 0.7740147113800049, + "learning_rate": 8.213689482470785e-05, + "loss": 0.2127, + "step": 1770 + }, + { + "epoch": 5.433512682551883, + "grad_norm": 0.8592774271965027, + "learning_rate": 8.207011686143574e-05, + "loss": 0.2716, + "step": 1771 + }, + { + "epoch": 5.436587240584166, + "grad_norm": 0.7816128134727478, + "learning_rate": 8.200333889816361e-05, + "loss": 0.3683, + "step": 1772 + }, + { + "epoch": 5.439661798616449, + "grad_norm": 1.1457465887069702, + "learning_rate": 8.19365609348915e-05, + "loss": 0.2948, + "step": 1773 + }, + { + "epoch": 5.442736356648732, + "grad_norm": 2.5698084831237793, + "learning_rate": 8.186978297161937e-05, + "loss": 0.3148, + "step": 1774 + }, + { + "epoch": 5.445810914681014, + "grad_norm": 0.8997441530227661, + "learning_rate": 8.180300500834724e-05, + "loss": 0.3582, + "step": 1775 + }, + { + "epoch": 5.448885472713298, + "grad_norm": 0.6346564888954163, + "learning_rate": 8.173622704507513e-05, + "loss": 0.2195, + "step": 1776 + }, + { + "epoch": 5.45196003074558, + "grad_norm": 1.0303326845169067, + "learning_rate": 8.1669449081803e-05, + "loss": 0.274, + "step": 1777 + }, + { + "epoch": 5.455034588777863, + "grad_norm": 0.8634578585624695, + "learning_rate": 8.160267111853089e-05, + "loss": 0.3599, + "step": 1778 + }, + { + "epoch": 5.458109146810146, + "grad_norm": 1.3686116933822632, + "learning_rate": 8.153589315525877e-05, + "loss": 0.2781, + "step": 1779 + }, + { + "epoch": 5.461183704842429, + "grad_norm": 0.564072847366333, + "learning_rate": 8.146911519198665e-05, + "loss": 0.2335, + "step": 1780 + }, + { + "epoch": 5.464258262874711, + "grad_norm": 0.7149077653884888, + "learning_rate": 8.140233722871453e-05, + "loss": 0.3076, + "step": 1781 + }, + { + "epoch": 5.467332820906995, + "grad_norm": 1.153348684310913, + "learning_rate": 8.133555926544241e-05, + "loss": 0.3365, + "step": 1782 + }, + { + "epoch": 5.470407378939277, + "grad_norm": 0.5060893893241882, + "learning_rate": 8.126878130217028e-05, + "loss": 0.2537, + "step": 1783 + }, + { + "epoch": 5.47348193697156, + "grad_norm": 1.0472662448883057, + "learning_rate": 8.120200333889817e-05, + "loss": 0.2637, + "step": 1784 + }, + { + "epoch": 5.476556495003843, + "grad_norm": 0.6877180337905884, + "learning_rate": 8.113522537562604e-05, + "loss": 0.3489, + "step": 1785 + }, + { + "epoch": 5.479631053036126, + "grad_norm": 0.8638304471969604, + "learning_rate": 8.106844741235393e-05, + "loss": 0.283, + "step": 1786 + }, + { + "epoch": 5.482705611068409, + "grad_norm": 0.828100323677063, + "learning_rate": 8.10016694490818e-05, + "loss": 0.2685, + "step": 1787 + }, + { + "epoch": 5.485780169100692, + "grad_norm": 0.8909431099891663, + "learning_rate": 8.093489148580969e-05, + "loss": 0.2413, + "step": 1788 + }, + { + "epoch": 5.488854727132974, + "grad_norm": 1.0037970542907715, + "learning_rate": 8.086811352253757e-05, + "loss": 0.3077, + "step": 1789 + }, + { + "epoch": 5.4919292851652575, + "grad_norm": 1.3559635877609253, + "learning_rate": 8.080133555926545e-05, + "loss": 0.3585, + "step": 1790 + }, + { + "epoch": 5.495003843197541, + "grad_norm": 0.9360470771789551, + "learning_rate": 8.073455759599332e-05, + "loss": 0.368, + "step": 1791 + }, + { + "epoch": 5.498078401229823, + "grad_norm": 0.8319844007492065, + "learning_rate": 8.066777963272121e-05, + "loss": 0.2898, + "step": 1792 + }, + { + "epoch": 5.5011529592621065, + "grad_norm": 0.6613747477531433, + "learning_rate": 8.060100166944908e-05, + "loss": 0.2506, + "step": 1793 + }, + { + "epoch": 5.504227517294389, + "grad_norm": 0.7393064498901367, + "learning_rate": 8.053422370617697e-05, + "loss": 0.261, + "step": 1794 + }, + { + "epoch": 5.507302075326672, + "grad_norm": 0.8899523019790649, + "learning_rate": 8.046744574290484e-05, + "loss": 0.3113, + "step": 1795 + }, + { + "epoch": 5.510376633358955, + "grad_norm": 1.0439255237579346, + "learning_rate": 8.040066777963273e-05, + "loss": 0.4196, + "step": 1796 + }, + { + "epoch": 5.513451191391238, + "grad_norm": 0.625464916229248, + "learning_rate": 8.033388981636061e-05, + "loss": 0.2637, + "step": 1797 + }, + { + "epoch": 5.51652574942352, + "grad_norm": 1.0431058406829834, + "learning_rate": 8.026711185308849e-05, + "loss": 0.2172, + "step": 1798 + }, + { + "epoch": 5.5196003074558035, + "grad_norm": 1.1402390003204346, + "learning_rate": 8.020033388981636e-05, + "loss": 0.2572, + "step": 1799 + }, + { + "epoch": 5.522674865488086, + "grad_norm": 0.7621378302574158, + "learning_rate": 8.013355592654425e-05, + "loss": 0.3302, + "step": 1800 + }, + { + "epoch": 5.525749423520369, + "grad_norm": 1.0336471796035767, + "learning_rate": 8.006677796327212e-05, + "loss": 0.2131, + "step": 1801 + }, + { + "epoch": 5.528823981552652, + "grad_norm": 1.23903226852417, + "learning_rate": 8e-05, + "loss": 0.2769, + "step": 1802 + }, + { + "epoch": 5.531898539584935, + "grad_norm": 0.8230191469192505, + "learning_rate": 7.993322203672788e-05, + "loss": 0.2967, + "step": 1803 + }, + { + "epoch": 5.534973097617217, + "grad_norm": 0.8352370262145996, + "learning_rate": 7.986644407345575e-05, + "loss": 0.2303, + "step": 1804 + }, + { + "epoch": 5.538047655649501, + "grad_norm": 1.2304105758666992, + "learning_rate": 7.979966611018364e-05, + "loss": 0.4731, + "step": 1805 + }, + { + "epoch": 5.541122213681783, + "grad_norm": 1.0414502620697021, + "learning_rate": 7.973288814691153e-05, + "loss": 0.3484, + "step": 1806 + }, + { + "epoch": 5.544196771714066, + "grad_norm": 1.0510241985321045, + "learning_rate": 7.96661101836394e-05, + "loss": 0.3604, + "step": 1807 + }, + { + "epoch": 5.547271329746349, + "grad_norm": 0.6692155599594116, + "learning_rate": 7.959933222036729e-05, + "loss": 0.3249, + "step": 1808 + }, + { + "epoch": 5.550345887778632, + "grad_norm": 0.9099972248077393, + "learning_rate": 7.953255425709516e-05, + "loss": 0.3089, + "step": 1809 + }, + { + "epoch": 5.553420445810914, + "grad_norm": 0.8659818768501282, + "learning_rate": 7.946577629382305e-05, + "loss": 0.3146, + "step": 1810 + }, + { + "epoch": 5.556495003843198, + "grad_norm": 0.8696914315223694, + "learning_rate": 7.939899833055092e-05, + "loss": 0.4468, + "step": 1811 + }, + { + "epoch": 5.55956956187548, + "grad_norm": 1.1352142095565796, + "learning_rate": 7.933222036727879e-05, + "loss": 0.2151, + "step": 1812 + }, + { + "epoch": 5.562644119907763, + "grad_norm": 0.899272620677948, + "learning_rate": 7.926544240400668e-05, + "loss": 0.3825, + "step": 1813 + }, + { + "epoch": 5.565718677940046, + "grad_norm": 1.0112216472625732, + "learning_rate": 7.919866444073457e-05, + "loss": 0.4442, + "step": 1814 + }, + { + "epoch": 5.568793235972329, + "grad_norm": 0.9360047578811646, + "learning_rate": 7.913188647746244e-05, + "loss": 0.305, + "step": 1815 + }, + { + "epoch": 5.5718677940046115, + "grad_norm": 0.9701045751571655, + "learning_rate": 7.906510851419033e-05, + "loss": 0.3318, + "step": 1816 + }, + { + "epoch": 5.574942352036895, + "grad_norm": 1.0220452547073364, + "learning_rate": 7.89983305509182e-05, + "loss": 0.3886, + "step": 1817 + }, + { + "epoch": 5.578016910069177, + "grad_norm": 1.1464786529541016, + "learning_rate": 7.893155258764609e-05, + "loss": 0.3478, + "step": 1818 + }, + { + "epoch": 5.5810914681014605, + "grad_norm": 0.8255491256713867, + "learning_rate": 7.886477462437396e-05, + "loss": 0.3586, + "step": 1819 + }, + { + "epoch": 5.584166026133743, + "grad_norm": 1.0034533739089966, + "learning_rate": 7.879799666110183e-05, + "loss": 0.294, + "step": 1820 + }, + { + "epoch": 5.587240584166026, + "grad_norm": 0.8229129314422607, + "learning_rate": 7.873121869782972e-05, + "loss": 0.2546, + "step": 1821 + }, + { + "epoch": 5.590315142198309, + "grad_norm": 0.6609354019165039, + "learning_rate": 7.86644407345576e-05, + "loss": 0.31, + "step": 1822 + }, + { + "epoch": 5.593389700230592, + "grad_norm": 0.7836920619010925, + "learning_rate": 7.859766277128548e-05, + "loss": 0.3437, + "step": 1823 + }, + { + "epoch": 5.596464258262875, + "grad_norm": 1.286696195602417, + "learning_rate": 7.853088480801337e-05, + "loss": 0.2675, + "step": 1824 + }, + { + "epoch": 5.599538816295158, + "grad_norm": 0.7446246147155762, + "learning_rate": 7.846410684474124e-05, + "loss": 0.2127, + "step": 1825 + }, + { + "epoch": 5.60261337432744, + "grad_norm": 0.7205875515937805, + "learning_rate": 7.839732888146912e-05, + "loss": 0.3288, + "step": 1826 + }, + { + "epoch": 5.605687932359723, + "grad_norm": 0.8203064203262329, + "learning_rate": 7.8330550918197e-05, + "loss": 0.337, + "step": 1827 + }, + { + "epoch": 5.608762490392007, + "grad_norm": 0.6690270900726318, + "learning_rate": 7.826377295492487e-05, + "loss": 0.3033, + "step": 1828 + }, + { + "epoch": 5.611837048424289, + "grad_norm": 0.5355561375617981, + "learning_rate": 7.819699499165276e-05, + "loss": 0.3438, + "step": 1829 + }, + { + "epoch": 5.614911606456571, + "grad_norm": 0.6979895234107971, + "learning_rate": 7.813021702838063e-05, + "loss": 0.2798, + "step": 1830 + }, + { + "epoch": 5.617986164488855, + "grad_norm": 0.6088154315948486, + "learning_rate": 7.806343906510852e-05, + "loss": 0.269, + "step": 1831 + }, + { + "epoch": 5.621060722521138, + "grad_norm": 1.0615488290786743, + "learning_rate": 7.79966611018364e-05, + "loss": 0.37, + "step": 1832 + }, + { + "epoch": 5.62413528055342, + "grad_norm": 0.8934707641601562, + "learning_rate": 7.792988313856428e-05, + "loss": 0.3102, + "step": 1833 + }, + { + "epoch": 5.627209838585704, + "grad_norm": 0.9596664905548096, + "learning_rate": 7.786310517529216e-05, + "loss": 0.2823, + "step": 1834 + }, + { + "epoch": 5.630284396617986, + "grad_norm": 0.8570690155029297, + "learning_rate": 7.779632721202004e-05, + "loss": 0.3825, + "step": 1835 + }, + { + "epoch": 5.633358954650269, + "grad_norm": 0.7704600095748901, + "learning_rate": 7.772954924874791e-05, + "loss": 0.35, + "step": 1836 + }, + { + "epoch": 5.636433512682552, + "grad_norm": 0.9135782122612, + "learning_rate": 7.76627712854758e-05, + "loss": 0.3951, + "step": 1837 + }, + { + "epoch": 5.639508070714835, + "grad_norm": 1.0562645196914673, + "learning_rate": 7.759599332220367e-05, + "loss": 0.2882, + "step": 1838 + }, + { + "epoch": 5.6425826287471175, + "grad_norm": 1.0557276010513306, + "learning_rate": 7.752921535893156e-05, + "loss": 0.3554, + "step": 1839 + }, + { + "epoch": 5.645657186779401, + "grad_norm": 0.8899006843566895, + "learning_rate": 7.746243739565944e-05, + "loss": 0.3476, + "step": 1840 + }, + { + "epoch": 5.648731744811683, + "grad_norm": 0.8078686594963074, + "learning_rate": 7.739565943238732e-05, + "loss": 0.2755, + "step": 1841 + }, + { + "epoch": 5.6518063028439665, + "grad_norm": 0.8886568546295166, + "learning_rate": 7.73288814691152e-05, + "loss": 0.3686, + "step": 1842 + }, + { + "epoch": 5.654880860876249, + "grad_norm": 1.3097084760665894, + "learning_rate": 7.726210350584308e-05, + "loss": 0.3381, + "step": 1843 + }, + { + "epoch": 5.657955418908532, + "grad_norm": 0.597942590713501, + "learning_rate": 7.719532554257095e-05, + "loss": 0.2658, + "step": 1844 + }, + { + "epoch": 5.661029976940815, + "grad_norm": 0.8042814135551453, + "learning_rate": 7.712854757929884e-05, + "loss": 0.3169, + "step": 1845 + }, + { + "epoch": 5.664104534973098, + "grad_norm": 0.5749388933181763, + "learning_rate": 7.706176961602671e-05, + "loss": 0.2733, + "step": 1846 + }, + { + "epoch": 5.66717909300538, + "grad_norm": 0.6768372654914856, + "learning_rate": 7.69949916527546e-05, + "loss": 0.3274, + "step": 1847 + }, + { + "epoch": 5.6702536510376635, + "grad_norm": 0.9132068157196045, + "learning_rate": 7.692821368948247e-05, + "loss": 0.326, + "step": 1848 + }, + { + "epoch": 5.673328209069946, + "grad_norm": 0.966132640838623, + "learning_rate": 7.686143572621036e-05, + "loss": 0.2591, + "step": 1849 + }, + { + "epoch": 5.676402767102229, + "grad_norm": 1.1084728240966797, + "learning_rate": 7.679465776293824e-05, + "loss": 0.3073, + "step": 1850 + }, + { + "epoch": 5.679477325134512, + "grad_norm": 0.7305138111114502, + "learning_rate": 7.672787979966612e-05, + "loss": 0.2618, + "step": 1851 + }, + { + "epoch": 5.682551883166795, + "grad_norm": 0.637614905834198, + "learning_rate": 7.666110183639399e-05, + "loss": 0.3106, + "step": 1852 + }, + { + "epoch": 5.685626441199077, + "grad_norm": 0.941795289516449, + "learning_rate": 7.659432387312188e-05, + "loss": 0.3271, + "step": 1853 + }, + { + "epoch": 5.688700999231361, + "grad_norm": 0.7227844595909119, + "learning_rate": 7.652754590984975e-05, + "loss": 0.2718, + "step": 1854 + }, + { + "epoch": 5.691775557263643, + "grad_norm": 0.9095995426177979, + "learning_rate": 7.646076794657764e-05, + "loss": 0.315, + "step": 1855 + }, + { + "epoch": 5.694850115295926, + "grad_norm": 1.2558395862579346, + "learning_rate": 7.639398998330551e-05, + "loss": 0.4484, + "step": 1856 + }, + { + "epoch": 5.697924673328209, + "grad_norm": 0.6830787658691406, + "learning_rate": 7.63272120200334e-05, + "loss": 0.3691, + "step": 1857 + }, + { + "epoch": 5.700999231360492, + "grad_norm": 1.5645900964736938, + "learning_rate": 7.626043405676128e-05, + "loss": 0.3653, + "step": 1858 + }, + { + "epoch": 5.704073789392774, + "grad_norm": 0.5888504385948181, + "learning_rate": 7.619365609348916e-05, + "loss": 0.2683, + "step": 1859 + }, + { + "epoch": 5.707148347425058, + "grad_norm": 0.6585515737533569, + "learning_rate": 7.612687813021703e-05, + "loss": 0.2923, + "step": 1860 + }, + { + "epoch": 5.71022290545734, + "grad_norm": 0.8930748701095581, + "learning_rate": 7.606010016694492e-05, + "loss": 0.3592, + "step": 1861 + }, + { + "epoch": 5.713297463489623, + "grad_norm": 0.7318699359893799, + "learning_rate": 7.599332220367279e-05, + "loss": 0.2895, + "step": 1862 + }, + { + "epoch": 5.716372021521906, + "grad_norm": 0.7849537134170532, + "learning_rate": 7.592654424040068e-05, + "loss": 0.2668, + "step": 1863 + }, + { + "epoch": 5.719446579554189, + "grad_norm": 0.9625186920166016, + "learning_rate": 7.585976627712855e-05, + "loss": 0.3139, + "step": 1864 + }, + { + "epoch": 5.722521137586472, + "grad_norm": 0.904823899269104, + "learning_rate": 7.579298831385642e-05, + "loss": 0.3532, + "step": 1865 + }, + { + "epoch": 5.725595695618755, + "grad_norm": 0.974162220954895, + "learning_rate": 7.572621035058431e-05, + "loss": 0.3306, + "step": 1866 + }, + { + "epoch": 5.728670253651037, + "grad_norm": 0.7732940912246704, + "learning_rate": 7.56594323873122e-05, + "loss": 0.3054, + "step": 1867 + }, + { + "epoch": 5.7317448116833205, + "grad_norm": 0.8150412440299988, + "learning_rate": 7.559265442404007e-05, + "loss": 0.2711, + "step": 1868 + }, + { + "epoch": 5.734819369715604, + "grad_norm": 1.464375615119934, + "learning_rate": 7.552587646076796e-05, + "loss": 0.3856, + "step": 1869 + }, + { + "epoch": 5.737893927747886, + "grad_norm": 0.7832287549972534, + "learning_rate": 7.545909849749583e-05, + "loss": 0.2674, + "step": 1870 + }, + { + "epoch": 5.740968485780169, + "grad_norm": 0.8007357716560364, + "learning_rate": 7.539232053422371e-05, + "loss": 0.1946, + "step": 1871 + }, + { + "epoch": 5.744043043812452, + "grad_norm": 0.6958500742912292, + "learning_rate": 7.532554257095159e-05, + "loss": 0.2734, + "step": 1872 + }, + { + "epoch": 5.747117601844735, + "grad_norm": 1.1845893859863281, + "learning_rate": 7.525876460767946e-05, + "loss": 0.3715, + "step": 1873 + }, + { + "epoch": 5.750192159877018, + "grad_norm": 0.7488757967948914, + "learning_rate": 7.519198664440735e-05, + "loss": 0.2586, + "step": 1874 + }, + { + "epoch": 5.753266717909301, + "grad_norm": 1.5162636041641235, + "learning_rate": 7.512520868113523e-05, + "loss": 0.2977, + "step": 1875 + }, + { + "epoch": 5.756341275941583, + "grad_norm": 2.1945416927337646, + "learning_rate": 7.505843071786311e-05, + "loss": 0.3504, + "step": 1876 + }, + { + "epoch": 5.759415833973867, + "grad_norm": 1.0076838731765747, + "learning_rate": 7.4991652754591e-05, + "loss": 0.3167, + "step": 1877 + }, + { + "epoch": 5.762490392006149, + "grad_norm": 0.7844017744064331, + "learning_rate": 7.492487479131887e-05, + "loss": 0.3902, + "step": 1878 + }, + { + "epoch": 5.765564950038432, + "grad_norm": 1.1953024864196777, + "learning_rate": 7.485809682804675e-05, + "loss": 0.3945, + "step": 1879 + }, + { + "epoch": 5.768639508070715, + "grad_norm": 0.6889199614524841, + "learning_rate": 7.479131886477463e-05, + "loss": 0.3535, + "step": 1880 + }, + { + "epoch": 5.771714066102998, + "grad_norm": 0.7977723479270935, + "learning_rate": 7.47245409015025e-05, + "loss": 0.3421, + "step": 1881 + }, + { + "epoch": 5.77478862413528, + "grad_norm": 0.6946485042572021, + "learning_rate": 7.465776293823039e-05, + "loss": 0.2671, + "step": 1882 + }, + { + "epoch": 5.777863182167564, + "grad_norm": 0.7452620267868042, + "learning_rate": 7.459098497495826e-05, + "loss": 0.3047, + "step": 1883 + }, + { + "epoch": 5.780937740199846, + "grad_norm": 0.6193966269493103, + "learning_rate": 7.452420701168615e-05, + "loss": 0.2053, + "step": 1884 + }, + { + "epoch": 5.784012298232129, + "grad_norm": 0.9007473587989807, + "learning_rate": 7.445742904841403e-05, + "loss": 0.4161, + "step": 1885 + }, + { + "epoch": 5.787086856264412, + "grad_norm": 1.1725136041641235, + "learning_rate": 7.439065108514191e-05, + "loss": 0.2089, + "step": 1886 + }, + { + "epoch": 5.790161414296695, + "grad_norm": 0.9010354280471802, + "learning_rate": 7.43238731218698e-05, + "loss": 0.3207, + "step": 1887 + }, + { + "epoch": 5.7932359723289775, + "grad_norm": 1.4096622467041016, + "learning_rate": 7.425709515859767e-05, + "loss": 0.3003, + "step": 1888 + }, + { + "epoch": 5.796310530361261, + "grad_norm": 1.2428261041641235, + "learning_rate": 7.419031719532554e-05, + "loss": 0.2606, + "step": 1889 + }, + { + "epoch": 5.799385088393543, + "grad_norm": 0.9653693437576294, + "learning_rate": 7.412353923205343e-05, + "loss": 0.2796, + "step": 1890 + }, + { + "epoch": 5.8024596464258265, + "grad_norm": 0.9089574217796326, + "learning_rate": 7.40567612687813e-05, + "loss": 0.2239, + "step": 1891 + }, + { + "epoch": 5.805534204458109, + "grad_norm": 1.7680071592330933, + "learning_rate": 7.398998330550919e-05, + "loss": 0.3336, + "step": 1892 + }, + { + "epoch": 5.808608762490392, + "grad_norm": 0.9146047830581665, + "learning_rate": 7.392320534223707e-05, + "loss": 0.3298, + "step": 1893 + }, + { + "epoch": 5.811683320522675, + "grad_norm": 0.6860531568527222, + "learning_rate": 7.385642737896495e-05, + "loss": 0.3238, + "step": 1894 + }, + { + "epoch": 5.814757878554958, + "grad_norm": 1.0863178968429565, + "learning_rate": 7.378964941569283e-05, + "loss": 0.2665, + "step": 1895 + }, + { + "epoch": 5.81783243658724, + "grad_norm": 1.128209114074707, + "learning_rate": 7.37228714524207e-05, + "loss": 0.3184, + "step": 1896 + }, + { + "epoch": 5.8209069946195235, + "grad_norm": 1.4680668115615845, + "learning_rate": 7.365609348914858e-05, + "loss": 0.3016, + "step": 1897 + }, + { + "epoch": 5.823981552651806, + "grad_norm": 1.0566920042037964, + "learning_rate": 7.358931552587647e-05, + "loss": 0.3585, + "step": 1898 + }, + { + "epoch": 5.827056110684089, + "grad_norm": 1.0700082778930664, + "learning_rate": 7.352253756260434e-05, + "loss": 0.373, + "step": 1899 + }, + { + "epoch": 5.830130668716372, + "grad_norm": 0.8159264326095581, + "learning_rate": 7.345575959933221e-05, + "loss": 0.3609, + "step": 1900 + }, + { + "epoch": 5.833205226748655, + "grad_norm": 2.8568453788757324, + "learning_rate": 7.33889816360601e-05, + "loss": 0.3603, + "step": 1901 + }, + { + "epoch": 5.836279784780938, + "grad_norm": 0.8656408190727234, + "learning_rate": 7.332220367278799e-05, + "loss": 0.2947, + "step": 1902 + }, + { + "epoch": 5.839354342813221, + "grad_norm": 2.0064942836761475, + "learning_rate": 7.325542570951587e-05, + "loss": 0.3535, + "step": 1903 + }, + { + "epoch": 5.842428900845503, + "grad_norm": 0.9026947617530823, + "learning_rate": 7.318864774624375e-05, + "loss": 0.3323, + "step": 1904 + }, + { + "epoch": 5.845503458877786, + "grad_norm": 0.9408707022666931, + "learning_rate": 7.312186978297162e-05, + "loss": 0.2998, + "step": 1905 + }, + { + "epoch": 5.84857801691007, + "grad_norm": 1.8345344066619873, + "learning_rate": 7.30550918196995e-05, + "loss": 0.219, + "step": 1906 + }, + { + "epoch": 5.851652574942352, + "grad_norm": 0.784744918346405, + "learning_rate": 7.298831385642738e-05, + "loss": 0.2207, + "step": 1907 + }, + { + "epoch": 5.854727132974634, + "grad_norm": 1.4914350509643555, + "learning_rate": 7.292153589315525e-05, + "loss": 0.347, + "step": 1908 + }, + { + "epoch": 5.857801691006918, + "grad_norm": 0.7770729660987854, + "learning_rate": 7.285475792988314e-05, + "loss": 0.3199, + "step": 1909 + }, + { + "epoch": 5.860876249039201, + "grad_norm": 0.6912123560905457, + "learning_rate": 7.278797996661103e-05, + "loss": 0.223, + "step": 1910 + }, + { + "epoch": 5.863950807071483, + "grad_norm": 0.6402047872543335, + "learning_rate": 7.272120200333891e-05, + "loss": 0.3393, + "step": 1911 + }, + { + "epoch": 5.867025365103767, + "grad_norm": 1.5074280500411987, + "learning_rate": 7.265442404006679e-05, + "loss": 0.4208, + "step": 1912 + }, + { + "epoch": 5.870099923136049, + "grad_norm": 1.1925088167190552, + "learning_rate": 7.258764607679466e-05, + "loss": 0.3486, + "step": 1913 + }, + { + "epoch": 5.873174481168332, + "grad_norm": 0.7136446237564087, + "learning_rate": 7.252086811352255e-05, + "loss": 0.3058, + "step": 1914 + }, + { + "epoch": 5.876249039200615, + "grad_norm": 0.7760949730873108, + "learning_rate": 7.245409015025042e-05, + "loss": 0.3423, + "step": 1915 + }, + { + "epoch": 5.879323597232898, + "grad_norm": 0.7054867744445801, + "learning_rate": 7.238731218697829e-05, + "loss": 0.3382, + "step": 1916 + }, + { + "epoch": 5.8823981552651805, + "grad_norm": 0.799457311630249, + "learning_rate": 7.232053422370618e-05, + "loss": 0.3117, + "step": 1917 + }, + { + "epoch": 5.885472713297464, + "grad_norm": 0.714888334274292, + "learning_rate": 7.225375626043405e-05, + "loss": 0.3093, + "step": 1918 + }, + { + "epoch": 5.888547271329746, + "grad_norm": 0.6139503717422485, + "learning_rate": 7.218697829716194e-05, + "loss": 0.2015, + "step": 1919 + }, + { + "epoch": 5.8916218293620295, + "grad_norm": 1.060932993888855, + "learning_rate": 7.212020033388982e-05, + "loss": 0.3479, + "step": 1920 + }, + { + "epoch": 5.894696387394312, + "grad_norm": 0.7673906683921814, + "learning_rate": 7.20534223706177e-05, + "loss": 0.261, + "step": 1921 + }, + { + "epoch": 5.897770945426595, + "grad_norm": 0.9193598031997681, + "learning_rate": 7.198664440734558e-05, + "loss": 0.3752, + "step": 1922 + }, + { + "epoch": 5.900845503458878, + "grad_norm": 0.8515580296516418, + "learning_rate": 7.191986644407346e-05, + "loss": 0.3318, + "step": 1923 + }, + { + "epoch": 5.903920061491161, + "grad_norm": 0.7641887664794922, + "learning_rate": 7.185308848080133e-05, + "loss": 0.352, + "step": 1924 + }, + { + "epoch": 5.906994619523443, + "grad_norm": 1.0089126825332642, + "learning_rate": 7.178631051752922e-05, + "loss": 0.262, + "step": 1925 + }, + { + "epoch": 5.910069177555727, + "grad_norm": 1.0288993120193481, + "learning_rate": 7.171953255425709e-05, + "loss": 0.294, + "step": 1926 + }, + { + "epoch": 5.913143735588009, + "grad_norm": 1.4110133647918701, + "learning_rate": 7.165275459098498e-05, + "loss": 0.2831, + "step": 1927 + }, + { + "epoch": 5.916218293620292, + "grad_norm": 0.9518840909004211, + "learning_rate": 7.158597662771286e-05, + "loss": 0.2872, + "step": 1928 + }, + { + "epoch": 5.919292851652575, + "grad_norm": 0.8720163106918335, + "learning_rate": 7.151919866444074e-05, + "loss": 0.2104, + "step": 1929 + }, + { + "epoch": 5.922367409684858, + "grad_norm": 1.4843337535858154, + "learning_rate": 7.145242070116862e-05, + "loss": 0.4018, + "step": 1930 + }, + { + "epoch": 5.92544196771714, + "grad_norm": 0.7498276233673096, + "learning_rate": 7.13856427378965e-05, + "loss": 0.2566, + "step": 1931 + }, + { + "epoch": 5.928516525749424, + "grad_norm": 0.856194019317627, + "learning_rate": 7.131886477462437e-05, + "loss": 0.2937, + "step": 1932 + }, + { + "epoch": 5.931591083781706, + "grad_norm": 0.766518771648407, + "learning_rate": 7.125208681135226e-05, + "loss": 0.2397, + "step": 1933 + }, + { + "epoch": 5.934665641813989, + "grad_norm": 0.9151931405067444, + "learning_rate": 7.118530884808013e-05, + "loss": 0.3297, + "step": 1934 + }, + { + "epoch": 5.937740199846272, + "grad_norm": 0.7466654181480408, + "learning_rate": 7.111853088480802e-05, + "loss": 0.303, + "step": 1935 + }, + { + "epoch": 5.940814757878555, + "grad_norm": 0.8686262965202332, + "learning_rate": 7.105175292153589e-05, + "loss": 0.2854, + "step": 1936 + }, + { + "epoch": 5.9438893159108375, + "grad_norm": 0.72053462266922, + "learning_rate": 7.098497495826378e-05, + "loss": 0.3252, + "step": 1937 + }, + { + "epoch": 5.946963873943121, + "grad_norm": 0.9415873885154724, + "learning_rate": 7.091819699499166e-05, + "loss": 0.2553, + "step": 1938 + }, + { + "epoch": 5.950038431975403, + "grad_norm": 0.7902587056159973, + "learning_rate": 7.085141903171954e-05, + "loss": 0.3969, + "step": 1939 + }, + { + "epoch": 5.9531129900076865, + "grad_norm": 0.7759074568748474, + "learning_rate": 7.078464106844741e-05, + "loss": 0.2123, + "step": 1940 + }, + { + "epoch": 5.956187548039969, + "grad_norm": 0.9863756895065308, + "learning_rate": 7.07178631051753e-05, + "loss": 0.3059, + "step": 1941 + }, + { + "epoch": 5.959262106072252, + "grad_norm": 1.1259315013885498, + "learning_rate": 7.065108514190317e-05, + "loss": 0.276, + "step": 1942 + }, + { + "epoch": 5.9623366641045354, + "grad_norm": 0.7862762212753296, + "learning_rate": 7.058430717863106e-05, + "loss": 0.3132, + "step": 1943 + }, + { + "epoch": 5.965411222136818, + "grad_norm": 1.240963101387024, + "learning_rate": 7.051752921535893e-05, + "loss": 0.3393, + "step": 1944 + }, + { + "epoch": 5.9684857801691, + "grad_norm": 0.9186695218086243, + "learning_rate": 7.045075125208682e-05, + "loss": 0.3061, + "step": 1945 + }, + { + "epoch": 5.9715603382013835, + "grad_norm": 0.6734002232551575, + "learning_rate": 7.03839732888147e-05, + "loss": 0.3114, + "step": 1946 + }, + { + "epoch": 5.974634896233667, + "grad_norm": 1.0199098587036133, + "learning_rate": 7.031719532554258e-05, + "loss": 0.2419, + "step": 1947 + }, + { + "epoch": 5.977709454265949, + "grad_norm": 0.8766542673110962, + "learning_rate": 7.025041736227045e-05, + "loss": 0.3097, + "step": 1948 + }, + { + "epoch": 5.980784012298232, + "grad_norm": 1.1471467018127441, + "learning_rate": 7.018363939899834e-05, + "loss": 0.4017, + "step": 1949 + }, + { + "epoch": 5.983858570330515, + "grad_norm": 0.9350420832633972, + "learning_rate": 7.011686143572621e-05, + "loss": 0.3464, + "step": 1950 + }, + { + "epoch": 5.986933128362798, + "grad_norm": 1.6979084014892578, + "learning_rate": 7.00500834724541e-05, + "loss": 0.3693, + "step": 1951 + }, + { + "epoch": 5.990007686395081, + "grad_norm": 0.822771430015564, + "learning_rate": 6.998330550918197e-05, + "loss": 0.2756, + "step": 1952 + }, + { + "epoch": 5.993082244427364, + "grad_norm": 1.6150286197662354, + "learning_rate": 6.991652754590986e-05, + "loss": 0.3242, + "step": 1953 + }, + { + "epoch": 5.996156802459646, + "grad_norm": 1.161136507987976, + "learning_rate": 6.984974958263774e-05, + "loss": 0.3127, + "step": 1954 + }, + { + "epoch": 5.99923136049193, + "grad_norm": 0.7732776403427124, + "learning_rate": 6.978297161936562e-05, + "loss": 0.2713, + "step": 1955 + }, + { + "epoch": 6.0, + "grad_norm": NaN, + "learning_rate": 6.978297161936562e-05, + "loss": 0.2807, + "step": 1956 + }, + { + "epoch": 6.003074558032283, + "grad_norm": 0.8221515417098999, + "learning_rate": 6.971619365609349e-05, + "loss": 0.3441, + "step": 1957 + }, + { + "epoch": 6.006149116064566, + "grad_norm": 0.7294884324073792, + "learning_rate": 6.964941569282138e-05, + "loss": 0.2538, + "step": 1958 + }, + { + "epoch": 6.009223674096849, + "grad_norm": 0.48791152238845825, + "learning_rate": 6.958263772954925e-05, + "loss": 0.2395, + "step": 1959 + }, + { + "epoch": 6.012298232129131, + "grad_norm": 0.7092695236206055, + "learning_rate": 6.951585976627714e-05, + "loss": 0.2032, + "step": 1960 + }, + { + "epoch": 6.015372790161415, + "grad_norm": 0.6608801484107971, + "learning_rate": 6.944908180300501e-05, + "loss": 0.2752, + "step": 1961 + }, + { + "epoch": 6.018447348193697, + "grad_norm": 0.7612819671630859, + "learning_rate": 6.938230383973288e-05, + "loss": 0.2647, + "step": 1962 + }, + { + "epoch": 6.02152190622598, + "grad_norm": 0.6396717429161072, + "learning_rate": 6.931552587646077e-05, + "loss": 0.319, + "step": 1963 + }, + { + "epoch": 6.024596464258263, + "grad_norm": 0.6826348304748535, + "learning_rate": 6.924874791318865e-05, + "loss": 0.1987, + "step": 1964 + }, + { + "epoch": 6.027671022290546, + "grad_norm": 0.7029145956039429, + "learning_rate": 6.918196994991654e-05, + "loss": 0.1958, + "step": 1965 + }, + { + "epoch": 6.0307455803228285, + "grad_norm": 0.46820268034935, + "learning_rate": 6.911519198664441e-05, + "loss": 0.2496, + "step": 1966 + }, + { + "epoch": 6.033820138355112, + "grad_norm": 0.6609360575675964, + "learning_rate": 6.904841402337229e-05, + "loss": 0.2595, + "step": 1967 + }, + { + "epoch": 6.036894696387394, + "grad_norm": 1.3419384956359863, + "learning_rate": 6.898163606010017e-05, + "loss": 0.1628, + "step": 1968 + }, + { + "epoch": 6.0399692544196775, + "grad_norm": 1.2338601350784302, + "learning_rate": 6.891485809682805e-05, + "loss": 0.245, + "step": 1969 + }, + { + "epoch": 6.04304381245196, + "grad_norm": 0.9335373640060425, + "learning_rate": 6.884808013355592e-05, + "loss": 0.2059, + "step": 1970 + }, + { + "epoch": 6.046118370484243, + "grad_norm": 0.7417526245117188, + "learning_rate": 6.878130217028381e-05, + "loss": 0.2704, + "step": 1971 + }, + { + "epoch": 6.049192928516526, + "grad_norm": 0.7647474408149719, + "learning_rate": 6.87145242070117e-05, + "loss": 0.2319, + "step": 1972 + }, + { + "epoch": 6.052267486548809, + "grad_norm": 0.6899215579032898, + "learning_rate": 6.864774624373958e-05, + "loss": 0.1459, + "step": 1973 + }, + { + "epoch": 6.055342044581091, + "grad_norm": 0.9612866044044495, + "learning_rate": 6.858096828046745e-05, + "loss": 0.3229, + "step": 1974 + }, + { + "epoch": 6.058416602613375, + "grad_norm": 1.1122326850891113, + "learning_rate": 6.851419031719533e-05, + "loss": 0.2506, + "step": 1975 + }, + { + "epoch": 6.061491160645657, + "grad_norm": 0.929296612739563, + "learning_rate": 6.844741235392321e-05, + "loss": 0.2081, + "step": 1976 + }, + { + "epoch": 6.06456571867794, + "grad_norm": 0.7660003304481506, + "learning_rate": 6.838063439065109e-05, + "loss": 0.2926, + "step": 1977 + }, + { + "epoch": 6.067640276710223, + "grad_norm": 0.5416483879089355, + "learning_rate": 6.831385642737896e-05, + "loss": 0.3146, + "step": 1978 + }, + { + "epoch": 6.070714834742506, + "grad_norm": 1.074669599533081, + "learning_rate": 6.824707846410685e-05, + "loss": 0.2937, + "step": 1979 + }, + { + "epoch": 6.073789392774788, + "grad_norm": 2.4369921684265137, + "learning_rate": 6.818030050083472e-05, + "loss": 0.2009, + "step": 1980 + }, + { + "epoch": 6.076863950807072, + "grad_norm": 1.6385631561279297, + "learning_rate": 6.811352253756261e-05, + "loss": 0.2339, + "step": 1981 + }, + { + "epoch": 6.079938508839354, + "grad_norm": 1.0690921545028687, + "learning_rate": 6.80467445742905e-05, + "loss": 0.2365, + "step": 1982 + }, + { + "epoch": 6.083013066871637, + "grad_norm": 1.094051480293274, + "learning_rate": 6.797996661101837e-05, + "loss": 0.3067, + "step": 1983 + }, + { + "epoch": 6.08608762490392, + "grad_norm": 1.2970466613769531, + "learning_rate": 6.791318864774625e-05, + "loss": 0.2976, + "step": 1984 + }, + { + "epoch": 6.089162182936203, + "grad_norm": 0.678546130657196, + "learning_rate": 6.784641068447413e-05, + "loss": 0.2391, + "step": 1985 + }, + { + "epoch": 6.092236740968485, + "grad_norm": 0.6241523027420044, + "learning_rate": 6.7779632721202e-05, + "loss": 0.2372, + "step": 1986 + }, + { + "epoch": 6.095311299000769, + "grad_norm": 0.8551547527313232, + "learning_rate": 6.771285475792989e-05, + "loss": 0.2309, + "step": 1987 + }, + { + "epoch": 6.098385857033051, + "grad_norm": 1.2978205680847168, + "learning_rate": 6.764607679465776e-05, + "loss": 0.221, + "step": 1988 + }, + { + "epoch": 6.101460415065334, + "grad_norm": 0.48349693417549133, + "learning_rate": 6.757929883138565e-05, + "loss": 0.2102, + "step": 1989 + }, + { + "epoch": 6.104534973097617, + "grad_norm": 0.9165658950805664, + "learning_rate": 6.751252086811353e-05, + "loss": 0.3046, + "step": 1990 + }, + { + "epoch": 6.1076095311299, + "grad_norm": 0.6445243954658508, + "learning_rate": 6.74457429048414e-05, + "loss": 0.2875, + "step": 1991 + }, + { + "epoch": 6.1106840891621825, + "grad_norm": 0.8826196789741516, + "learning_rate": 6.737896494156929e-05, + "loss": 0.3097, + "step": 1992 + }, + { + "epoch": 6.113758647194466, + "grad_norm": 0.7305975556373596, + "learning_rate": 6.731218697829717e-05, + "loss": 0.2431, + "step": 1993 + }, + { + "epoch": 6.116833205226748, + "grad_norm": 0.6952454447746277, + "learning_rate": 6.724540901502504e-05, + "loss": 0.2939, + "step": 1994 + }, + { + "epoch": 6.1199077632590315, + "grad_norm": 0.6714677810668945, + "learning_rate": 6.717863105175293e-05, + "loss": 0.2164, + "step": 1995 + }, + { + "epoch": 6.122982321291314, + "grad_norm": 1.3943935632705688, + "learning_rate": 6.71118530884808e-05, + "loss": 0.2819, + "step": 1996 + }, + { + "epoch": 6.126056879323597, + "grad_norm": 0.8125165700912476, + "learning_rate": 6.704507512520869e-05, + "loss": 0.2346, + "step": 1997 + }, + { + "epoch": 6.1291314373558805, + "grad_norm": 0.8236249089241028, + "learning_rate": 6.697829716193656e-05, + "loss": 0.2934, + "step": 1998 + }, + { + "epoch": 6.132205995388163, + "grad_norm": 0.6829390525817871, + "learning_rate": 6.691151919866445e-05, + "loss": 0.2319, + "step": 1999 + }, + { + "epoch": 6.135280553420446, + "grad_norm": 0.7294898629188538, + "learning_rate": 6.684474123539233e-05, + "loss": 0.2563, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2193823136997376e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}