diff --git "a/checkpoint-1566/trainer_state.json" "b/checkpoint-1566/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1566/trainer_state.json" @@ -0,0 +1,10995 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1566, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001277139208173691, + "grad_norm": 0.7305087004897599, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0385, + "step": 1 + }, + { + "epoch": 0.002554278416347382, + "grad_norm": 0.5788051750266512, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1324, + "step": 2 + }, + { + "epoch": 0.0038314176245210726, + "grad_norm": 0.4104149307176431, + "learning_rate": 3e-06, + "loss": 1.0454, + "step": 3 + }, + { + "epoch": 0.005108556832694764, + "grad_norm": 0.3976594914673457, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0317, + "step": 4 + }, + { + "epoch": 0.006385696040868455, + "grad_norm": 0.4325669017086326, + "learning_rate": 5e-06, + "loss": 1.203, + "step": 5 + }, + { + "epoch": 0.007662835249042145, + "grad_norm": 0.442530378480049, + "learning_rate": 6e-06, + "loss": 1.0066, + "step": 6 + }, + { + "epoch": 0.008939974457215836, + "grad_norm": 1.0508404456694453, + "learning_rate": 7.000000000000001e-06, + "loss": 0.8831, + "step": 7 + }, + { + "epoch": 0.010217113665389528, + "grad_norm": 0.3369806654576105, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8553, + "step": 8 + }, + { + "epoch": 0.011494252873563218, + "grad_norm": 0.4909982120670218, + "learning_rate": 9e-06, + "loss": 1.2149, + "step": 9 + }, + { + "epoch": 0.01277139208173691, + "grad_norm": 0.4228038016789454, + "learning_rate": 1e-05, + "loss": 1.0654, + "step": 10 + }, + { + "epoch": 0.0140485312899106, + "grad_norm": 0.5874964633719922, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0521, + "step": 11 + }, + { + "epoch": 0.01532567049808429, + "grad_norm": 0.48773564415929954, + "learning_rate": 1.2e-05, + "loss": 1.0745, + "step": 12 + }, + { + "epoch": 0.016602809706257982, + "grad_norm": 0.49454801728998976, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.9925, + "step": 13 + }, + { + "epoch": 0.017879948914431672, + "grad_norm": 0.5907372677411048, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.8979, + "step": 14 + }, + { + "epoch": 0.019157088122605363, + "grad_norm": 0.6448668988026047, + "learning_rate": 1.5e-05, + "loss": 0.9308, + "step": 15 + }, + { + "epoch": 0.020434227330779056, + "grad_norm": 0.4546775002311587, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.9341, + "step": 16 + }, + { + "epoch": 0.021711366538952746, + "grad_norm": 0.7722061476109652, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.7884, + "step": 17 + }, + { + "epoch": 0.022988505747126436, + "grad_norm": 0.4725146911968615, + "learning_rate": 1.8e-05, + "loss": 0.9587, + "step": 18 + }, + { + "epoch": 0.024265644955300127, + "grad_norm": 0.49843110568393056, + "learning_rate": 1.9e-05, + "loss": 1.0004, + "step": 19 + }, + { + "epoch": 0.02554278416347382, + "grad_norm": 0.4436519216768816, + "learning_rate": 2e-05, + "loss": 0.8072, + "step": 20 + }, + { + "epoch": 0.02681992337164751, + "grad_norm": 0.5622954427853697, + "learning_rate": 2.1e-05, + "loss": 1.0187, + "step": 21 + }, + { + "epoch": 0.0280970625798212, + "grad_norm": 0.474478091033631, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.7691, + "step": 22 + }, + { + "epoch": 0.02937420178799489, + "grad_norm": 0.5326527765895438, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.7319, + "step": 23 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 0.579911959671978, + "learning_rate": 2.4e-05, + "loss": 0.8513, + "step": 24 + }, + { + "epoch": 0.031928480204342274, + "grad_norm": 0.39447139626028177, + "learning_rate": 2.5e-05, + "loss": 0.6849, + "step": 25 + }, + { + "epoch": 0.033205619412515965, + "grad_norm": 0.5111833512937238, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.7792, + "step": 26 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 0.44943396487202286, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.6746, + "step": 27 + }, + { + "epoch": 0.035759897828863345, + "grad_norm": 0.49545242823278535, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.6839, + "step": 28 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.39382589633121917, + "learning_rate": 2.9e-05, + "loss": 0.6188, + "step": 29 + }, + { + "epoch": 0.038314176245210725, + "grad_norm": 0.43426147032055146, + "learning_rate": 3e-05, + "loss": 0.7265, + "step": 30 + }, + { + "epoch": 0.03959131545338442, + "grad_norm": 0.437062741757118, + "learning_rate": 3.1e-05, + "loss": 0.6542, + "step": 31 + }, + { + "epoch": 0.04086845466155811, + "grad_norm": 0.4592689779941086, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.677, + "step": 32 + }, + { + "epoch": 0.0421455938697318, + "grad_norm": 0.4436286688273638, + "learning_rate": 3.3e-05, + "loss": 0.6622, + "step": 33 + }, + { + "epoch": 0.04342273307790549, + "grad_norm": 0.35080105953697105, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.6333, + "step": 34 + }, + { + "epoch": 0.04469987228607918, + "grad_norm": 0.48427242432508044, + "learning_rate": 3.5e-05, + "loss": 0.6337, + "step": 35 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 1.4890533805700878, + "learning_rate": 3.6e-05, + "loss": 0.7535, + "step": 36 + }, + { + "epoch": 0.04725415070242656, + "grad_norm": 0.5421528424239075, + "learning_rate": 3.7e-05, + "loss": 0.7452, + "step": 37 + }, + { + "epoch": 0.04853128991060025, + "grad_norm": 0.40879246562219806, + "learning_rate": 3.8e-05, + "loss": 0.6438, + "step": 38 + }, + { + "epoch": 0.04980842911877394, + "grad_norm": 0.5615582970668291, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.677, + "step": 39 + }, + { + "epoch": 0.05108556832694764, + "grad_norm": 0.4426270091306376, + "learning_rate": 4e-05, + "loss": 0.6711, + "step": 40 + }, + { + "epoch": 0.05236270753512133, + "grad_norm": 0.5164798332020049, + "learning_rate": 4.1e-05, + "loss": 0.541, + "step": 41 + }, + { + "epoch": 0.05363984674329502, + "grad_norm": 0.5328210713672364, + "learning_rate": 4.2e-05, + "loss": 0.7051, + "step": 42 + }, + { + "epoch": 0.05491698595146871, + "grad_norm": 0.438843410368363, + "learning_rate": 4.3e-05, + "loss": 0.5792, + "step": 43 + }, + { + "epoch": 0.0561941251596424, + "grad_norm": 0.4870075414075533, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.634, + "step": 44 + }, + { + "epoch": 0.05747126436781609, + "grad_norm": 0.3872217080509972, + "learning_rate": 4.5e-05, + "loss": 0.5405, + "step": 45 + }, + { + "epoch": 0.05874840357598978, + "grad_norm": 0.4320756863526598, + "learning_rate": 4.600000000000001e-05, + "loss": 0.6291, + "step": 46 + }, + { + "epoch": 0.06002554278416347, + "grad_norm": 0.6868490007013103, + "learning_rate": 4.7e-05, + "loss": 0.6628, + "step": 47 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 0.39425632391157145, + "learning_rate": 4.8e-05, + "loss": 0.5586, + "step": 48 + }, + { + "epoch": 0.06257982120051085, + "grad_norm": 0.4065255505505926, + "learning_rate": 4.9e-05, + "loss": 0.5836, + "step": 49 + }, + { + "epoch": 0.06385696040868455, + "grad_norm": 0.47598034317259713, + "learning_rate": 5e-05, + "loss": 0.5578, + "step": 50 + }, + { + "epoch": 0.06513409961685823, + "grad_norm": 0.3895239439959761, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.5555, + "step": 51 + }, + { + "epoch": 0.06641123882503193, + "grad_norm": 0.6391571183770334, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.6094, + "step": 52 + }, + { + "epoch": 0.06768837803320563, + "grad_norm": 0.452713392733971, + "learning_rate": 5.300000000000001e-05, + "loss": 0.5593, + "step": 53 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 0.4158256759501776, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.523, + "step": 54 + }, + { + "epoch": 0.070242656449553, + "grad_norm": 0.368367413921774, + "learning_rate": 5.500000000000001e-05, + "loss": 0.5192, + "step": 55 + }, + { + "epoch": 0.07151979565772669, + "grad_norm": 0.5816865248846961, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.5647, + "step": 56 + }, + { + "epoch": 0.07279693486590039, + "grad_norm": 0.4929162975950812, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.5646, + "step": 57 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.40113007110927856, + "learning_rate": 5.8e-05, + "loss": 0.5055, + "step": 58 + }, + { + "epoch": 0.07535121328224777, + "grad_norm": 0.5007889508051512, + "learning_rate": 5.9e-05, + "loss": 0.5719, + "step": 59 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.40506692930996346, + "learning_rate": 6e-05, + "loss": 0.5079, + "step": 60 + }, + { + "epoch": 0.07790549169859515, + "grad_norm": 0.5961408432892515, + "learning_rate": 6.1e-05, + "loss": 0.5575, + "step": 61 + }, + { + "epoch": 0.07918263090676884, + "grad_norm": 1.1097615794049542, + "learning_rate": 6.2e-05, + "loss": 0.4998, + "step": 62 + }, + { + "epoch": 0.08045977011494253, + "grad_norm": 0.5503363993731951, + "learning_rate": 6.3e-05, + "loss": 0.4848, + "step": 63 + }, + { + "epoch": 0.08173690932311622, + "grad_norm": 0.33817369718399554, + "learning_rate": 6.400000000000001e-05, + "loss": 0.5101, + "step": 64 + }, + { + "epoch": 0.08301404853128991, + "grad_norm": 0.39432817897013944, + "learning_rate": 6.500000000000001e-05, + "loss": 0.5671, + "step": 65 + }, + { + "epoch": 0.0842911877394636, + "grad_norm": 0.3684478644857248, + "learning_rate": 6.6e-05, + "loss": 0.5378, + "step": 66 + }, + { + "epoch": 0.08556832694763729, + "grad_norm": 0.8365561167697176, + "learning_rate": 6.7e-05, + "loss": 0.5308, + "step": 67 + }, + { + "epoch": 0.08684546615581099, + "grad_norm": 3.448019636296, + "learning_rate": 6.800000000000001e-05, + "loss": 0.4823, + "step": 68 + }, + { + "epoch": 0.08812260536398467, + "grad_norm": 0.38273292137684817, + "learning_rate": 6.9e-05, + "loss": 0.462, + "step": 69 + }, + { + "epoch": 0.08939974457215837, + "grad_norm": 0.40885744708448846, + "learning_rate": 7e-05, + "loss": 0.5315, + "step": 70 + }, + { + "epoch": 0.09067688378033206, + "grad_norm": 0.36753060585817315, + "learning_rate": 7.1e-05, + "loss": 0.437, + "step": 71 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 0.6061048009579133, + "learning_rate": 7.2e-05, + "loss": 0.4666, + "step": 72 + }, + { + "epoch": 0.09323116219667944, + "grad_norm": 0.8044300174250731, + "learning_rate": 7.3e-05, + "loss": 0.4807, + "step": 73 + }, + { + "epoch": 0.09450830140485313, + "grad_norm": 0.47888728759978005, + "learning_rate": 7.4e-05, + "loss": 0.4643, + "step": 74 + }, + { + "epoch": 0.09578544061302682, + "grad_norm": 1.3284456201550516, + "learning_rate": 7.500000000000001e-05, + "loss": 0.5242, + "step": 75 + }, + { + "epoch": 0.0970625798212005, + "grad_norm": 0.8608740888045907, + "learning_rate": 7.6e-05, + "loss": 0.6148, + "step": 76 + }, + { + "epoch": 0.0983397190293742, + "grad_norm": 0.4884293964738099, + "learning_rate": 7.7e-05, + "loss": 0.5496, + "step": 77 + }, + { + "epoch": 0.09961685823754789, + "grad_norm": 0.4170222372773021, + "learning_rate": 7.800000000000001e-05, + "loss": 0.507, + "step": 78 + }, + { + "epoch": 0.10089399744572158, + "grad_norm": 0.4367550631288593, + "learning_rate": 7.900000000000001e-05, + "loss": 0.4979, + "step": 79 + }, + { + "epoch": 0.10217113665389528, + "grad_norm": 0.6192101062859389, + "learning_rate": 8e-05, + "loss": 0.5606, + "step": 80 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 1.0254633473955994, + "learning_rate": 8.1e-05, + "loss": 0.4852, + "step": 81 + }, + { + "epoch": 0.10472541507024266, + "grad_norm": 0.42275174816324, + "learning_rate": 8.2e-05, + "loss": 0.4313, + "step": 82 + }, + { + "epoch": 0.10600255427841634, + "grad_norm": 0.5086168835106157, + "learning_rate": 8.3e-05, + "loss": 0.5256, + "step": 83 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 0.6985417366640204, + "learning_rate": 8.4e-05, + "loss": 0.4889, + "step": 84 + }, + { + "epoch": 0.10855683269476372, + "grad_norm": 0.46866387008537175, + "learning_rate": 8.5e-05, + "loss": 0.4916, + "step": 85 + }, + { + "epoch": 0.10983397190293742, + "grad_norm": 0.5449492180593796, + "learning_rate": 8.6e-05, + "loss": 0.4084, + "step": 86 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4833955640679536, + "learning_rate": 8.7e-05, + "loss": 0.4132, + "step": 87 + }, + { + "epoch": 0.1123882503192848, + "grad_norm": 0.4675766563210074, + "learning_rate": 8.800000000000001e-05, + "loss": 0.4903, + "step": 88 + }, + { + "epoch": 0.1136653895274585, + "grad_norm": 0.6246830327429227, + "learning_rate": 8.900000000000001e-05, + "loss": 0.4922, + "step": 89 + }, + { + "epoch": 0.11494252873563218, + "grad_norm": 0.6830917570556715, + "learning_rate": 9e-05, + "loss": 0.4906, + "step": 90 + }, + { + "epoch": 0.11621966794380588, + "grad_norm": 0.4087612254583743, + "learning_rate": 9.1e-05, + "loss": 0.4753, + "step": 91 + }, + { + "epoch": 0.11749680715197956, + "grad_norm": 0.5494787216352712, + "learning_rate": 9.200000000000001e-05, + "loss": 0.5056, + "step": 92 + }, + { + "epoch": 0.11877394636015326, + "grad_norm": 0.5386573669988592, + "learning_rate": 9.300000000000001e-05, + "loss": 0.4727, + "step": 93 + }, + { + "epoch": 0.12005108556832694, + "grad_norm": 0.5078524440229646, + "learning_rate": 9.4e-05, + "loss": 0.5655, + "step": 94 + }, + { + "epoch": 0.12132822477650064, + "grad_norm": 0.4520241493945927, + "learning_rate": 9.5e-05, + "loss": 0.4809, + "step": 95 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 0.4511706444939472, + "learning_rate": 9.6e-05, + "loss": 0.4267, + "step": 96 + }, + { + "epoch": 0.12388250319284802, + "grad_norm": 0.3827359022874416, + "learning_rate": 9.7e-05, + "loss": 0.4496, + "step": 97 + }, + { + "epoch": 0.1251596424010217, + "grad_norm": 0.7386801654669198, + "learning_rate": 9.8e-05, + "loss": 0.4309, + "step": 98 + }, + { + "epoch": 0.12643678160919541, + "grad_norm": 0.3558806002327379, + "learning_rate": 9.900000000000001e-05, + "loss": 0.466, + "step": 99 + }, + { + "epoch": 0.1277139208173691, + "grad_norm": 0.6012433252950439, + "learning_rate": 0.0001, + "loss": 0.4416, + "step": 100 + }, + { + "epoch": 0.12899106002554278, + "grad_norm": 0.3810750584012099, + "learning_rate": 9.99999512178685e-05, + "loss": 0.4439, + "step": 101 + }, + { + "epoch": 0.13026819923371646, + "grad_norm": 0.3698132943931747, + "learning_rate": 9.999980487156919e-05, + "loss": 0.4386, + "step": 102 + }, + { + "epoch": 0.13154533844189017, + "grad_norm": 0.3638825559796942, + "learning_rate": 9.999956096138764e-05, + "loss": 0.4373, + "step": 103 + }, + { + "epoch": 0.13282247765006386, + "grad_norm": 0.5380354332094556, + "learning_rate": 9.999921948779978e-05, + "loss": 0.4847, + "step": 104 + }, + { + "epoch": 0.13409961685823754, + "grad_norm": 0.46065061723972434, + "learning_rate": 9.999878045147191e-05, + "loss": 0.494, + "step": 105 + }, + { + "epoch": 0.13537675606641125, + "grad_norm": 0.5120465019262737, + "learning_rate": 9.999824385326073e-05, + "loss": 0.5071, + "step": 106 + }, + { + "epoch": 0.13665389527458494, + "grad_norm": 0.40696515937876915, + "learning_rate": 9.99976096942133e-05, + "loss": 0.4899, + "step": 107 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.4834869652431062, + "learning_rate": 9.999687797556705e-05, + "loss": 0.4205, + "step": 108 + }, + { + "epoch": 0.1392081736909323, + "grad_norm": 0.49393020432553597, + "learning_rate": 9.999604869874974e-05, + "loss": 0.5095, + "step": 109 + }, + { + "epoch": 0.140485312899106, + "grad_norm": 0.4970053166405342, + "learning_rate": 9.999512186537956e-05, + "loss": 0.4128, + "step": 110 + }, + { + "epoch": 0.1417624521072797, + "grad_norm": 0.5893306783215299, + "learning_rate": 9.999409747726502e-05, + "loss": 0.4383, + "step": 111 + }, + { + "epoch": 0.14303959131545338, + "grad_norm": 0.41023462207463945, + "learning_rate": 9.999297553640498e-05, + "loss": 0.458, + "step": 112 + }, + { + "epoch": 0.14431673052362706, + "grad_norm": 0.3705062435167448, + "learning_rate": 9.999175604498867e-05, + "loss": 0.4247, + "step": 113 + }, + { + "epoch": 0.14559386973180077, + "grad_norm": 0.33734890246006316, + "learning_rate": 9.999043900539567e-05, + "loss": 0.4209, + "step": 114 + }, + { + "epoch": 0.14687100893997446, + "grad_norm": 0.3851621468375724, + "learning_rate": 9.998902442019591e-05, + "loss": 0.4641, + "step": 115 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4205394709454925, + "learning_rate": 9.998751229214962e-05, + "loss": 0.422, + "step": 116 + }, + { + "epoch": 0.14942528735632185, + "grad_norm": 0.4274587944866388, + "learning_rate": 9.998590262420743e-05, + "loss": 0.5205, + "step": 117 + }, + { + "epoch": 0.15070242656449553, + "grad_norm": 0.7805632407081257, + "learning_rate": 9.998419541951023e-05, + "loss": 0.4813, + "step": 118 + }, + { + "epoch": 0.15197956577266922, + "grad_norm": 0.5495716970494346, + "learning_rate": 9.998239068138928e-05, + "loss": 0.4749, + "step": 119 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3286101804794932, + "learning_rate": 9.998048841336614e-05, + "loss": 0.4558, + "step": 120 + }, + { + "epoch": 0.1545338441890166, + "grad_norm": 0.4191164724785909, + "learning_rate": 9.997848861915266e-05, + "loss": 0.501, + "step": 121 + }, + { + "epoch": 0.1558109833971903, + "grad_norm": 0.35600662538356503, + "learning_rate": 9.997639130265104e-05, + "loss": 0.3825, + "step": 122 + }, + { + "epoch": 0.15708812260536398, + "grad_norm": 0.3735737126088104, + "learning_rate": 9.997419646795372e-05, + "loss": 0.4587, + "step": 123 + }, + { + "epoch": 0.1583652618135377, + "grad_norm": 0.5494792027514749, + "learning_rate": 9.997190411934345e-05, + "loss": 0.4266, + "step": 124 + }, + { + "epoch": 0.15964240102171137, + "grad_norm": 0.4534690900009374, + "learning_rate": 9.996951426129326e-05, + "loss": 0.4619, + "step": 125 + }, + { + "epoch": 0.16091954022988506, + "grad_norm": 0.2873542398092403, + "learning_rate": 9.996702689846645e-05, + "loss": 0.4625, + "step": 126 + }, + { + "epoch": 0.16219667943805874, + "grad_norm": 0.3792991572384746, + "learning_rate": 9.996444203571655e-05, + "loss": 0.4145, + "step": 127 + }, + { + "epoch": 0.16347381864623245, + "grad_norm": 0.4526655853571364, + "learning_rate": 9.996175967808739e-05, + "loss": 0.4943, + "step": 128 + }, + { + "epoch": 0.16475095785440613, + "grad_norm": 0.6990482548864102, + "learning_rate": 9.995897983081301e-05, + "loss": 0.5085, + "step": 129 + }, + { + "epoch": 0.16602809706257982, + "grad_norm": 0.40196378408798605, + "learning_rate": 9.995610249931768e-05, + "loss": 0.4858, + "step": 130 + }, + { + "epoch": 0.1673052362707535, + "grad_norm": 0.43310833993730613, + "learning_rate": 9.995312768921591e-05, + "loss": 0.4991, + "step": 131 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 0.36563188107459327, + "learning_rate": 9.995005540631238e-05, + "loss": 0.3721, + "step": 132 + }, + { + "epoch": 0.1698595146871009, + "grad_norm": 0.979693571179333, + "learning_rate": 9.9946885656602e-05, + "loss": 0.45, + "step": 133 + }, + { + "epoch": 0.17113665389527458, + "grad_norm": 0.992098032460407, + "learning_rate": 9.994361844626986e-05, + "loss": 0.418, + "step": 134 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 0.5049831989098265, + "learning_rate": 9.994025378169123e-05, + "loss": 0.5162, + "step": 135 + }, + { + "epoch": 0.17369093231162197, + "grad_norm": 0.29815062694092453, + "learning_rate": 9.99367916694315e-05, + "loss": 0.4344, + "step": 136 + }, + { + "epoch": 0.17496807151979565, + "grad_norm": 0.36118672819541375, + "learning_rate": 9.993323211624626e-05, + "loss": 0.5179, + "step": 137 + }, + { + "epoch": 0.17624521072796934, + "grad_norm": 0.39369303173184783, + "learning_rate": 9.992957512908121e-05, + "loss": 0.5052, + "step": 138 + }, + { + "epoch": 0.17752234993614305, + "grad_norm": 0.5030538729448256, + "learning_rate": 9.992582071507216e-05, + "loss": 0.4828, + "step": 139 + }, + { + "epoch": 0.17879948914431673, + "grad_norm": 0.47611650085568213, + "learning_rate": 9.992196888154507e-05, + "loss": 0.4638, + "step": 140 + }, + { + "epoch": 0.18007662835249041, + "grad_norm": 0.4294814489346547, + "learning_rate": 9.991801963601595e-05, + "loss": 0.4799, + "step": 141 + }, + { + "epoch": 0.18135376756066413, + "grad_norm": 0.7204536541958477, + "learning_rate": 9.99139729861909e-05, + "loss": 0.5156, + "step": 142 + }, + { + "epoch": 0.1826309067688378, + "grad_norm": 0.6385248745219236, + "learning_rate": 9.990982893996612e-05, + "loss": 0.4752, + "step": 143 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 0.4078572570072826, + "learning_rate": 9.990558750542778e-05, + "loss": 0.4841, + "step": 144 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.5986767305815563, + "learning_rate": 9.990124869085215e-05, + "loss": 0.4319, + "step": 145 + }, + { + "epoch": 0.18646232439335889, + "grad_norm": 0.44419816212926955, + "learning_rate": 9.98968125047055e-05, + "loss": 0.5156, + "step": 146 + }, + { + "epoch": 0.18773946360153257, + "grad_norm": 0.4376241905994591, + "learning_rate": 9.989227895564409e-05, + "loss": 0.4359, + "step": 147 + }, + { + "epoch": 0.18901660280970625, + "grad_norm": 0.4556548621169557, + "learning_rate": 9.988764805251417e-05, + "loss": 0.4653, + "step": 148 + }, + { + "epoch": 0.19029374201787994, + "grad_norm": 0.42031706608020447, + "learning_rate": 9.988291980435195e-05, + "loss": 0.3939, + "step": 149 + }, + { + "epoch": 0.19157088122605365, + "grad_norm": 0.39622563234644803, + "learning_rate": 9.987809422038359e-05, + "loss": 0.4295, + "step": 150 + }, + { + "epoch": 0.19284802043422733, + "grad_norm": 0.43836221009138715, + "learning_rate": 9.987317131002518e-05, + "loss": 0.5338, + "step": 151 + }, + { + "epoch": 0.194125159642401, + "grad_norm": 0.4330164419316081, + "learning_rate": 9.986815108288272e-05, + "loss": 0.4727, + "step": 152 + }, + { + "epoch": 0.19540229885057472, + "grad_norm": 0.4970088336923227, + "learning_rate": 9.986303354875213e-05, + "loss": 0.4625, + "step": 153 + }, + { + "epoch": 0.1966794380587484, + "grad_norm": 1.7787841297321283, + "learning_rate": 9.985781871761915e-05, + "loss": 0.4739, + "step": 154 + }, + { + "epoch": 0.1979565772669221, + "grad_norm": 0.34895010619359235, + "learning_rate": 9.98525065996594e-05, + "loss": 0.4208, + "step": 155 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 0.3072500031257868, + "learning_rate": 9.984709720523835e-05, + "loss": 0.4052, + "step": 156 + }, + { + "epoch": 0.20051085568326948, + "grad_norm": 0.36392059511076186, + "learning_rate": 9.984159054491129e-05, + "loss": 0.3903, + "step": 157 + }, + { + "epoch": 0.20178799489144317, + "grad_norm": 0.47746710802217035, + "learning_rate": 9.983598662942326e-05, + "loss": 0.5539, + "step": 158 + }, + { + "epoch": 0.20306513409961685, + "grad_norm": 0.7645527078811228, + "learning_rate": 9.983028546970908e-05, + "loss": 0.462, + "step": 159 + }, + { + "epoch": 0.20434227330779056, + "grad_norm": 0.6123248443823055, + "learning_rate": 9.982448707689338e-05, + "loss": 0.4321, + "step": 160 + }, + { + "epoch": 0.20561941251596424, + "grad_norm": 0.3671985194296314, + "learning_rate": 9.981859146229047e-05, + "loss": 0.4166, + "step": 161 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 0.3297963423166973, + "learning_rate": 9.981259863740435e-05, + "loss": 0.4063, + "step": 162 + }, + { + "epoch": 0.2081736909323116, + "grad_norm": 0.3667706246191166, + "learning_rate": 9.980650861392878e-05, + "loss": 0.4, + "step": 163 + }, + { + "epoch": 0.20945083014048532, + "grad_norm": 0.4652024577950397, + "learning_rate": 9.980032140374707e-05, + "loss": 0.4531, + "step": 164 + }, + { + "epoch": 0.210727969348659, + "grad_norm": 0.6708455981719644, + "learning_rate": 9.979403701893226e-05, + "loss": 0.3575, + "step": 165 + }, + { + "epoch": 0.2120051085568327, + "grad_norm": 0.5553688243397867, + "learning_rate": 9.9787655471747e-05, + "loss": 0.4628, + "step": 166 + }, + { + "epoch": 0.21328224776500637, + "grad_norm": 0.41719961294965063, + "learning_rate": 9.978117677464348e-05, + "loss": 0.4407, + "step": 167 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 0.4929465379488925, + "learning_rate": 9.977460094026349e-05, + "loss": 0.3796, + "step": 168 + }, + { + "epoch": 0.21583652618135377, + "grad_norm": 0.38837720315812396, + "learning_rate": 9.976792798143836e-05, + "loss": 0.4153, + "step": 169 + }, + { + "epoch": 0.21711366538952745, + "grad_norm": 0.6006674662018925, + "learning_rate": 9.976115791118896e-05, + "loss": 0.4875, + "step": 170 + }, + { + "epoch": 0.21839080459770116, + "grad_norm": 0.40166098081842333, + "learning_rate": 9.975429074272561e-05, + "loss": 0.3584, + "step": 171 + }, + { + "epoch": 0.21966794380587484, + "grad_norm": 0.37065144667227573, + "learning_rate": 9.97473264894481e-05, + "loss": 0.3875, + "step": 172 + }, + { + "epoch": 0.22094508301404853, + "grad_norm": 0.41965459802356564, + "learning_rate": 9.97402651649457e-05, + "loss": 0.4106, + "step": 173 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.3545370044048023, + "learning_rate": 9.973310678299704e-05, + "loss": 0.3806, + "step": 174 + }, + { + "epoch": 0.22349936143039592, + "grad_norm": 0.7299045833561182, + "learning_rate": 9.97258513575702e-05, + "loss": 0.3611, + "step": 175 + }, + { + "epoch": 0.2247765006385696, + "grad_norm": 0.710524260244826, + "learning_rate": 9.971849890282255e-05, + "loss": 0.4051, + "step": 176 + }, + { + "epoch": 0.2260536398467433, + "grad_norm": 0.361937155605131, + "learning_rate": 9.971104943310085e-05, + "loss": 0.4299, + "step": 177 + }, + { + "epoch": 0.227330779054917, + "grad_norm": 0.39302074928510455, + "learning_rate": 9.970350296294113e-05, + "loss": 0.4447, + "step": 178 + }, + { + "epoch": 0.22860791826309068, + "grad_norm": 1.45559089488364, + "learning_rate": 9.969585950706872e-05, + "loss": 0.4305, + "step": 179 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5854962827072269, + "learning_rate": 9.968811908039817e-05, + "loss": 0.3537, + "step": 180 + }, + { + "epoch": 0.23116219667943805, + "grad_norm": 0.35225340817626927, + "learning_rate": 9.968028169803325e-05, + "loss": 0.3773, + "step": 181 + }, + { + "epoch": 0.23243933588761176, + "grad_norm": 0.4742069762092979, + "learning_rate": 9.967234737526694e-05, + "loss": 0.3863, + "step": 182 + }, + { + "epoch": 0.23371647509578544, + "grad_norm": 0.4080635928475799, + "learning_rate": 9.966431612758137e-05, + "loss": 0.4195, + "step": 183 + }, + { + "epoch": 0.23499361430395913, + "grad_norm": 0.3768561159788258, + "learning_rate": 9.965618797064782e-05, + "loss": 0.4423, + "step": 184 + }, + { + "epoch": 0.23627075351213284, + "grad_norm": 0.3282043678924184, + "learning_rate": 9.964796292032658e-05, + "loss": 0.3889, + "step": 185 + }, + { + "epoch": 0.23754789272030652, + "grad_norm": 0.8842595153890274, + "learning_rate": 9.963964099266713e-05, + "loss": 0.2815, + "step": 186 + }, + { + "epoch": 0.2388250319284802, + "grad_norm": 0.4287274055816198, + "learning_rate": 9.96312222039079e-05, + "loss": 0.4295, + "step": 187 + }, + { + "epoch": 0.24010217113665389, + "grad_norm": 0.6200952048496765, + "learning_rate": 9.962270657047634e-05, + "loss": 0.467, + "step": 188 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 4.48435152932009, + "learning_rate": 9.96140941089889e-05, + "loss": 0.3932, + "step": 189 + }, + { + "epoch": 0.24265644955300128, + "grad_norm": 0.8954986050469315, + "learning_rate": 9.960538483625093e-05, + "loss": 0.5103, + "step": 190 + }, + { + "epoch": 0.24393358876117496, + "grad_norm": 0.37343042455105463, + "learning_rate": 9.959657876925671e-05, + "loss": 0.406, + "step": 191 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 0.30914705468357523, + "learning_rate": 9.95876759251894e-05, + "loss": 0.3382, + "step": 192 + }, + { + "epoch": 0.24648786717752236, + "grad_norm": 0.45572245618674395, + "learning_rate": 9.957867632142097e-05, + "loss": 0.4331, + "step": 193 + }, + { + "epoch": 0.24776500638569604, + "grad_norm": 2.7109323447120266, + "learning_rate": 9.956957997551225e-05, + "loss": 0.4508, + "step": 194 + }, + { + "epoch": 0.24904214559386972, + "grad_norm": 0.3980076876770771, + "learning_rate": 9.956038690521276e-05, + "loss": 0.4185, + "step": 195 + }, + { + "epoch": 0.2503192848020434, + "grad_norm": 0.395422492779056, + "learning_rate": 9.955109712846083e-05, + "loss": 0.359, + "step": 196 + }, + { + "epoch": 0.2515964240102171, + "grad_norm": 0.3362735489243725, + "learning_rate": 9.954171066338345e-05, + "loss": 0.4053, + "step": 197 + }, + { + "epoch": 0.25287356321839083, + "grad_norm": 0.5515847742922555, + "learning_rate": 9.95322275282963e-05, + "loss": 0.4392, + "step": 198 + }, + { + "epoch": 0.2541507024265645, + "grad_norm": 1.7841630485222153, + "learning_rate": 9.952264774170367e-05, + "loss": 0.4398, + "step": 199 + }, + { + "epoch": 0.2554278416347382, + "grad_norm": 0.46322064396727425, + "learning_rate": 9.95129713222985e-05, + "loss": 0.3901, + "step": 200 + }, + { + "epoch": 0.2567049808429119, + "grad_norm": 0.6070913476652713, + "learning_rate": 9.95031982889622e-05, + "loss": 0.4452, + "step": 201 + }, + { + "epoch": 0.25798212005108556, + "grad_norm": 0.7857221127768417, + "learning_rate": 9.949332866076475e-05, + "loss": 0.3731, + "step": 202 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.5151208445161021, + "learning_rate": 9.948336245696461e-05, + "loss": 0.3969, + "step": 203 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 0.43495470316419027, + "learning_rate": 9.94732996970087e-05, + "loss": 0.4527, + "step": 204 + }, + { + "epoch": 0.26181353767560667, + "grad_norm": 0.47540713544143137, + "learning_rate": 9.946314040053233e-05, + "loss": 0.3627, + "step": 205 + }, + { + "epoch": 0.26309067688378035, + "grad_norm": 0.5322851665158992, + "learning_rate": 9.945288458735918e-05, + "loss": 0.4755, + "step": 206 + }, + { + "epoch": 0.26436781609195403, + "grad_norm": 0.7249268960263687, + "learning_rate": 9.944253227750129e-05, + "loss": 0.3772, + "step": 207 + }, + { + "epoch": 0.2656449553001277, + "grad_norm": 0.419513549077306, + "learning_rate": 9.943208349115894e-05, + "loss": 0.4242, + "step": 208 + }, + { + "epoch": 0.2669220945083014, + "grad_norm": 0.37756452907603716, + "learning_rate": 9.94215382487207e-05, + "loss": 0.4707, + "step": 209 + }, + { + "epoch": 0.2681992337164751, + "grad_norm": 0.45762860794629634, + "learning_rate": 9.941089657076335e-05, + "loss": 0.428, + "step": 210 + }, + { + "epoch": 0.26947637292464877, + "grad_norm": 0.42736613058777795, + "learning_rate": 9.940015847805183e-05, + "loss": 0.3961, + "step": 211 + }, + { + "epoch": 0.2707535121328225, + "grad_norm": 0.48085496676570977, + "learning_rate": 9.938932399153924e-05, + "loss": 0.4447, + "step": 212 + }, + { + "epoch": 0.2720306513409962, + "grad_norm": 0.4941706800643775, + "learning_rate": 9.937839313236676e-05, + "loss": 0.4611, + "step": 213 + }, + { + "epoch": 0.27330779054916987, + "grad_norm": 0.6409579931990744, + "learning_rate": 9.936736592186357e-05, + "loss": 0.4384, + "step": 214 + }, + { + "epoch": 0.27458492975734355, + "grad_norm": 0.5788663309121441, + "learning_rate": 9.935624238154696e-05, + "loss": 0.4049, + "step": 215 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.39769825302681, + "learning_rate": 9.93450225331221e-05, + "loss": 0.3732, + "step": 216 + }, + { + "epoch": 0.2771392081736909, + "grad_norm": 0.41674704384810607, + "learning_rate": 9.933370639848211e-05, + "loss": 0.4883, + "step": 217 + }, + { + "epoch": 0.2784163473818646, + "grad_norm": 0.9375425574715748, + "learning_rate": 9.932229399970801e-05, + "loss": 0.4543, + "step": 218 + }, + { + "epoch": 0.2796934865900383, + "grad_norm": 0.3828919675624168, + "learning_rate": 9.931078535906864e-05, + "loss": 0.4122, + "step": 219 + }, + { + "epoch": 0.280970625798212, + "grad_norm": 0.38828846427898694, + "learning_rate": 9.929918049902062e-05, + "loss": 0.48, + "step": 220 + }, + { + "epoch": 0.2822477650063857, + "grad_norm": 0.37813028495740997, + "learning_rate": 9.92874794422084e-05, + "loss": 0.4125, + "step": 221 + }, + { + "epoch": 0.2835249042145594, + "grad_norm": 0.7819115898429194, + "learning_rate": 9.927568221146401e-05, + "loss": 0.3625, + "step": 222 + }, + { + "epoch": 0.2848020434227331, + "grad_norm": 0.31530048745943984, + "learning_rate": 9.926378882980728e-05, + "loss": 0.4081, + "step": 223 + }, + { + "epoch": 0.28607918263090676, + "grad_norm": 0.8181713761249384, + "learning_rate": 9.925179932044553e-05, + "loss": 0.4155, + "step": 224 + }, + { + "epoch": 0.28735632183908044, + "grad_norm": 0.6324204025373089, + "learning_rate": 9.923971370677374e-05, + "loss": 0.3789, + "step": 225 + }, + { + "epoch": 0.2886334610472541, + "grad_norm": 2.3830148623391505, + "learning_rate": 9.92275320123744e-05, + "loss": 0.4313, + "step": 226 + }, + { + "epoch": 0.28991060025542786, + "grad_norm": 0.5698857101993516, + "learning_rate": 9.921525426101745e-05, + "loss": 0.3675, + "step": 227 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 0.807511945195027, + "learning_rate": 9.920288047666031e-05, + "loss": 0.4617, + "step": 228 + }, + { + "epoch": 0.29246487867177523, + "grad_norm": 1.1694390793083116, + "learning_rate": 9.919041068344773e-05, + "loss": 0.5311, + "step": 229 + }, + { + "epoch": 0.2937420178799489, + "grad_norm": 0.5216827463633817, + "learning_rate": 9.917784490571187e-05, + "loss": 0.3701, + "step": 230 + }, + { + "epoch": 0.2950191570881226, + "grad_norm": 2.3631353197218106, + "learning_rate": 9.916518316797211e-05, + "loss": 0.5078, + "step": 231 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.0319141983743956, + "learning_rate": 9.915242549493513e-05, + "loss": 0.4038, + "step": 232 + }, + { + "epoch": 0.29757343550446996, + "grad_norm": 0.4835614504703682, + "learning_rate": 9.91395719114948e-05, + "loss": 0.4019, + "step": 233 + }, + { + "epoch": 0.2988505747126437, + "grad_norm": 1.2503274048173336, + "learning_rate": 9.912662244273212e-05, + "loss": 0.3671, + "step": 234 + }, + { + "epoch": 0.3001277139208174, + "grad_norm": 0.7865791036410845, + "learning_rate": 9.91135771139152e-05, + "loss": 0.4718, + "step": 235 + }, + { + "epoch": 0.30140485312899107, + "grad_norm": 0.32720021395893134, + "learning_rate": 9.910043595049917e-05, + "loss": 0.3099, + "step": 236 + }, + { + "epoch": 0.30268199233716475, + "grad_norm": 0.5195992576436874, + "learning_rate": 9.908719897812623e-05, + "loss": 0.43, + "step": 237 + }, + { + "epoch": 0.30395913154533843, + "grad_norm": 0.56822776952735, + "learning_rate": 9.907386622262547e-05, + "loss": 0.5333, + "step": 238 + }, + { + "epoch": 0.3052362707535121, + "grad_norm": 0.39648595395789515, + "learning_rate": 9.906043771001289e-05, + "loss": 0.4473, + "step": 239 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.306839671769555, + "learning_rate": 9.904691346649136e-05, + "loss": 0.3444, + "step": 240 + }, + { + "epoch": 0.30779054916985954, + "grad_norm": 0.529903831203461, + "learning_rate": 9.903329351845054e-05, + "loss": 0.4896, + "step": 241 + }, + { + "epoch": 0.3090676883780332, + "grad_norm": 0.5280163195406102, + "learning_rate": 9.901957789246683e-05, + "loss": 0.4691, + "step": 242 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 0.5314454658262997, + "learning_rate": 9.900576661530335e-05, + "loss": 0.3752, + "step": 243 + }, + { + "epoch": 0.3116219667943806, + "grad_norm": 0.3738158084709828, + "learning_rate": 9.899185971390979e-05, + "loss": 0.3968, + "step": 244 + }, + { + "epoch": 0.3128991060025543, + "grad_norm": 0.3500819489064873, + "learning_rate": 9.89778572154225e-05, + "loss": 0.3829, + "step": 245 + }, + { + "epoch": 0.31417624521072796, + "grad_norm": 2.75599796011157, + "learning_rate": 9.89637591471644e-05, + "loss": 0.3796, + "step": 246 + }, + { + "epoch": 0.31545338441890164, + "grad_norm": 0.31769671153419543, + "learning_rate": 9.894956553664478e-05, + "loss": 0.3816, + "step": 247 + }, + { + "epoch": 0.3167305236270754, + "grad_norm": 0.4358984550837405, + "learning_rate": 9.893527641155944e-05, + "loss": 0.4356, + "step": 248 + }, + { + "epoch": 0.31800766283524906, + "grad_norm": 2.0250581195828192, + "learning_rate": 9.892089179979056e-05, + "loss": 0.4529, + "step": 249 + }, + { + "epoch": 0.31928480204342274, + "grad_norm": 0.2971448750526378, + "learning_rate": 9.89064117294066e-05, + "loss": 0.3552, + "step": 250 + }, + { + "epoch": 0.3205619412515964, + "grad_norm": 0.3576940707269101, + "learning_rate": 9.889183622866231e-05, + "loss": 0.5038, + "step": 251 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 0.35099529744120517, + "learning_rate": 9.887716532599867e-05, + "loss": 0.3792, + "step": 252 + }, + { + "epoch": 0.3231162196679438, + "grad_norm": 0.3909796365830771, + "learning_rate": 9.886239905004277e-05, + "loss": 0.4736, + "step": 253 + }, + { + "epoch": 0.3243933588761175, + "grad_norm": 0.30259230350589006, + "learning_rate": 9.884753742960784e-05, + "loss": 0.4027, + "step": 254 + }, + { + "epoch": 0.32567049808429116, + "grad_norm": 0.3283631497457829, + "learning_rate": 9.883258049369313e-05, + "loss": 0.3687, + "step": 255 + }, + { + "epoch": 0.3269476372924649, + "grad_norm": 0.3505970181733391, + "learning_rate": 9.88175282714839e-05, + "loss": 0.419, + "step": 256 + }, + { + "epoch": 0.3282247765006386, + "grad_norm": 0.3122523401491674, + "learning_rate": 9.880238079235134e-05, + "loss": 0.4682, + "step": 257 + }, + { + "epoch": 0.32950191570881227, + "grad_norm": 0.3414571272503939, + "learning_rate": 9.878713808585247e-05, + "loss": 0.4292, + "step": 258 + }, + { + "epoch": 0.33077905491698595, + "grad_norm": 0.3368909008586611, + "learning_rate": 9.877180018173018e-05, + "loss": 0.3667, + "step": 259 + }, + { + "epoch": 0.33205619412515963, + "grad_norm": 0.36771455939340714, + "learning_rate": 9.87563671099131e-05, + "loss": 0.395, + "step": 260 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.37533606558976557, + "learning_rate": 9.874083890051553e-05, + "loss": 0.3813, + "step": 261 + }, + { + "epoch": 0.334610472541507, + "grad_norm": 0.430868499037737, + "learning_rate": 9.872521558383747e-05, + "loss": 0.3777, + "step": 262 + }, + { + "epoch": 0.33588761174968074, + "grad_norm": 0.32519198801582916, + "learning_rate": 9.870949719036446e-05, + "loss": 0.4361, + "step": 263 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 0.34825563137314364, + "learning_rate": 9.869368375076755e-05, + "loss": 0.3849, + "step": 264 + }, + { + "epoch": 0.3384418901660281, + "grad_norm": 0.39618982489951166, + "learning_rate": 9.86777752959033e-05, + "loss": 0.4642, + "step": 265 + }, + { + "epoch": 0.3397190293742018, + "grad_norm": 0.3652406011029927, + "learning_rate": 9.86617718568136e-05, + "loss": 0.3732, + "step": 266 + }, + { + "epoch": 0.34099616858237547, + "grad_norm": 0.5443339776975945, + "learning_rate": 9.864567346472577e-05, + "loss": 0.4169, + "step": 267 + }, + { + "epoch": 0.34227330779054915, + "grad_norm": 0.47975664625236597, + "learning_rate": 9.862948015105233e-05, + "loss": 0.4447, + "step": 268 + }, + { + "epoch": 0.34355044699872284, + "grad_norm": 0.353425719468379, + "learning_rate": 9.861319194739109e-05, + "loss": 0.3905, + "step": 269 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 0.3576701579530493, + "learning_rate": 9.859680888552496e-05, + "loss": 0.4216, + "step": 270 + }, + { + "epoch": 0.34610472541507026, + "grad_norm": 0.4785087038180348, + "learning_rate": 9.858033099742197e-05, + "loss": 0.4721, + "step": 271 + }, + { + "epoch": 0.34738186462324394, + "grad_norm": 0.31032868472656855, + "learning_rate": 9.856375831523516e-05, + "loss": 0.4421, + "step": 272 + }, + { + "epoch": 0.3486590038314176, + "grad_norm": 0.3690370075685314, + "learning_rate": 9.85470908713026e-05, + "loss": 0.3716, + "step": 273 + }, + { + "epoch": 0.3499361430395913, + "grad_norm": 0.3131678611487497, + "learning_rate": 9.853032869814721e-05, + "loss": 0.345, + "step": 274 + }, + { + "epoch": 0.351213282247765, + "grad_norm": 0.34999719566986, + "learning_rate": 9.851347182847677e-05, + "loss": 0.38, + "step": 275 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 0.32313443568596956, + "learning_rate": 9.849652029518384e-05, + "loss": 0.4094, + "step": 276 + }, + { + "epoch": 0.3537675606641124, + "grad_norm": 0.35234075891005523, + "learning_rate": 9.847947413134568e-05, + "loss": 0.4106, + "step": 277 + }, + { + "epoch": 0.3550446998722861, + "grad_norm": 1.4962081505279432, + "learning_rate": 9.846233337022426e-05, + "loss": 0.4239, + "step": 278 + }, + { + "epoch": 0.3563218390804598, + "grad_norm": 0.4268270961028871, + "learning_rate": 9.844509804526606e-05, + "loss": 0.4876, + "step": 279 + }, + { + "epoch": 0.35759897828863346, + "grad_norm": 0.49451044702042357, + "learning_rate": 9.842776819010213e-05, + "loss": 0.4641, + "step": 280 + }, + { + "epoch": 0.35887611749680715, + "grad_norm": 0.3195218144328926, + "learning_rate": 9.841034383854795e-05, + "loss": 0.4022, + "step": 281 + }, + { + "epoch": 0.36015325670498083, + "grad_norm": 0.3773200598951525, + "learning_rate": 9.83928250246034e-05, + "loss": 0.4414, + "step": 282 + }, + { + "epoch": 0.3614303959131545, + "grad_norm": 0.5088461162675268, + "learning_rate": 9.837521178245272e-05, + "loss": 0.4572, + "step": 283 + }, + { + "epoch": 0.36270753512132825, + "grad_norm": 0.28952971777676784, + "learning_rate": 9.835750414646431e-05, + "loss": 0.3904, + "step": 284 + }, + { + "epoch": 0.36398467432950193, + "grad_norm": 0.3166123283003893, + "learning_rate": 9.833970215119088e-05, + "loss": 0.3758, + "step": 285 + }, + { + "epoch": 0.3652618135376756, + "grad_norm": 0.3028251777402035, + "learning_rate": 9.832180583136916e-05, + "loss": 0.3546, + "step": 286 + }, + { + "epoch": 0.3665389527458493, + "grad_norm": 0.32629210278272364, + "learning_rate": 9.830381522191997e-05, + "loss": 0.3867, + "step": 287 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 0.38254216209404723, + "learning_rate": 9.828573035794815e-05, + "loss": 0.4667, + "step": 288 + }, + { + "epoch": 0.36909323116219667, + "grad_norm": 0.5216221907409165, + "learning_rate": 9.826755127474241e-05, + "loss": 0.4477, + "step": 289 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.5250169662811899, + "learning_rate": 9.824927800777534e-05, + "loss": 0.4055, + "step": 290 + }, + { + "epoch": 0.3716475095785441, + "grad_norm": 0.2660185034987295, + "learning_rate": 9.823091059270329e-05, + "loss": 0.3256, + "step": 291 + }, + { + "epoch": 0.37292464878671777, + "grad_norm": 0.29498074490027293, + "learning_rate": 9.821244906536631e-05, + "loss": 0.3821, + "step": 292 + }, + { + "epoch": 0.37420178799489145, + "grad_norm": 0.5734228366765896, + "learning_rate": 9.819389346178814e-05, + "loss": 0.4262, + "step": 293 + }, + { + "epoch": 0.37547892720306514, + "grad_norm": 0.3877912113639172, + "learning_rate": 9.817524381817603e-05, + "loss": 0.3469, + "step": 294 + }, + { + "epoch": 0.3767560664112388, + "grad_norm": 0.33367951056806594, + "learning_rate": 9.815650017092077e-05, + "loss": 0.3931, + "step": 295 + }, + { + "epoch": 0.3780332056194125, + "grad_norm": 0.32675245303214817, + "learning_rate": 9.813766255659654e-05, + "loss": 0.4025, + "step": 296 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 0.48724616908131657, + "learning_rate": 9.811873101196092e-05, + "loss": 0.5065, + "step": 297 + }, + { + "epoch": 0.38058748403575987, + "grad_norm": 0.3054398088800702, + "learning_rate": 9.809970557395476e-05, + "loss": 0.3814, + "step": 298 + }, + { + "epoch": 0.3818646232439336, + "grad_norm": 0.3476983623670669, + "learning_rate": 9.80805862797021e-05, + "loss": 0.3954, + "step": 299 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.35109757585907003, + "learning_rate": 9.806137316651011e-05, + "loss": 0.3145, + "step": 300 + }, + { + "epoch": 0.384418901660281, + "grad_norm": 0.3658748702076224, + "learning_rate": 9.804206627186912e-05, + "loss": 0.355, + "step": 301 + }, + { + "epoch": 0.38569604086845466, + "grad_norm": 0.30859420170426033, + "learning_rate": 9.802266563345235e-05, + "loss": 0.377, + "step": 302 + }, + { + "epoch": 0.38697318007662834, + "grad_norm": 0.3675512096465325, + "learning_rate": 9.800317128911598e-05, + "loss": 0.4241, + "step": 303 + }, + { + "epoch": 0.388250319284802, + "grad_norm": 0.5249335772238032, + "learning_rate": 9.798358327689904e-05, + "loss": 0.429, + "step": 304 + }, + { + "epoch": 0.3895274584929757, + "grad_norm": 0.5086535306090356, + "learning_rate": 9.796390163502334e-05, + "loss": 0.4432, + "step": 305 + }, + { + "epoch": 0.39080459770114945, + "grad_norm": 0.3366639282101648, + "learning_rate": 9.794412640189337e-05, + "loss": 0.4259, + "step": 306 + }, + { + "epoch": 0.39208173690932313, + "grad_norm": 0.4351167593963147, + "learning_rate": 9.792425761609623e-05, + "loss": 0.3613, + "step": 307 + }, + { + "epoch": 0.3933588761174968, + "grad_norm": 0.507210348471415, + "learning_rate": 9.790429531640161e-05, + "loss": 0.4858, + "step": 308 + }, + { + "epoch": 0.3946360153256705, + "grad_norm": 0.32543169682975587, + "learning_rate": 9.788423954176167e-05, + "loss": 0.3661, + "step": 309 + }, + { + "epoch": 0.3959131545338442, + "grad_norm": 0.3947853846323453, + "learning_rate": 9.786409033131092e-05, + "loss": 0.5003, + "step": 310 + }, + { + "epoch": 0.39719029374201786, + "grad_norm": 0.3128508659393011, + "learning_rate": 9.784384772436623e-05, + "loss": 0.3824, + "step": 311 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 0.34419879462368935, + "learning_rate": 9.782351176042668e-05, + "loss": 0.406, + "step": 312 + }, + { + "epoch": 0.3997445721583653, + "grad_norm": 0.4762563920787106, + "learning_rate": 9.780308247917355e-05, + "loss": 0.3928, + "step": 313 + }, + { + "epoch": 0.40102171136653897, + "grad_norm": 0.43578413912804104, + "learning_rate": 9.778255992047023e-05, + "loss": 0.4301, + "step": 314 + }, + { + "epoch": 0.40229885057471265, + "grad_norm": 0.4224743713742291, + "learning_rate": 9.776194412436201e-05, + "loss": 0.3866, + "step": 315 + }, + { + "epoch": 0.40357598978288634, + "grad_norm": 0.34530556726214795, + "learning_rate": 9.774123513107627e-05, + "loss": 0.3719, + "step": 316 + }, + { + "epoch": 0.40485312899106, + "grad_norm": 5.186579394458275, + "learning_rate": 9.772043298102211e-05, + "loss": 0.4328, + "step": 317 + }, + { + "epoch": 0.4061302681992337, + "grad_norm": 0.30116352879206904, + "learning_rate": 9.769953771479047e-05, + "loss": 0.4257, + "step": 318 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 1.2278465492300623, + "learning_rate": 9.767854937315398e-05, + "loss": 0.2993, + "step": 319 + }, + { + "epoch": 0.4086845466155811, + "grad_norm": 0.41796080815527054, + "learning_rate": 9.765746799706688e-05, + "loss": 0.4335, + "step": 320 + }, + { + "epoch": 0.4099616858237548, + "grad_norm": 1.316036381495967, + "learning_rate": 9.763629362766496e-05, + "loss": 0.4274, + "step": 321 + }, + { + "epoch": 0.4112388250319285, + "grad_norm": 0.3258327924337055, + "learning_rate": 9.761502630626544e-05, + "loss": 0.3897, + "step": 322 + }, + { + "epoch": 0.4125159642401022, + "grad_norm": 0.2960816741453923, + "learning_rate": 9.759366607436693e-05, + "loss": 0.3704, + "step": 323 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.35039607476012735, + "learning_rate": 9.757221297364936e-05, + "loss": 0.3779, + "step": 324 + }, + { + "epoch": 0.41507024265644954, + "grad_norm": 0.3942340416133389, + "learning_rate": 9.75506670459738e-05, + "loss": 0.4047, + "step": 325 + }, + { + "epoch": 0.4163473818646232, + "grad_norm": 0.3438923060032332, + "learning_rate": 9.752902833338255e-05, + "loss": 0.4172, + "step": 326 + }, + { + "epoch": 0.41762452107279696, + "grad_norm": 0.4076807887263878, + "learning_rate": 9.75072968780989e-05, + "loss": 0.3141, + "step": 327 + }, + { + "epoch": 0.41890166028097064, + "grad_norm": 0.37824308559146247, + "learning_rate": 9.748547272252709e-05, + "loss": 0.4124, + "step": 328 + }, + { + "epoch": 0.42017879948914433, + "grad_norm": 0.4569920584204143, + "learning_rate": 9.746355590925232e-05, + "loss": 0.4339, + "step": 329 + }, + { + "epoch": 0.421455938697318, + "grad_norm": 0.5944784408464571, + "learning_rate": 9.744154648104049e-05, + "loss": 0.4116, + "step": 330 + }, + { + "epoch": 0.4227330779054917, + "grad_norm": 0.45401465677379416, + "learning_rate": 9.741944448083831e-05, + "loss": 0.3747, + "step": 331 + }, + { + "epoch": 0.4240102171136654, + "grad_norm": 0.3583234980542119, + "learning_rate": 9.739724995177308e-05, + "loss": 0.4699, + "step": 332 + }, + { + "epoch": 0.42528735632183906, + "grad_norm": 0.24333237134435923, + "learning_rate": 9.737496293715267e-05, + "loss": 0.3842, + "step": 333 + }, + { + "epoch": 0.42656449553001274, + "grad_norm": 0.2858392689382239, + "learning_rate": 9.735258348046536e-05, + "loss": 0.3904, + "step": 334 + }, + { + "epoch": 0.4278416347381865, + "grad_norm": 0.35032602633884763, + "learning_rate": 9.733011162537991e-05, + "loss": 0.5245, + "step": 335 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 0.33095896729242663, + "learning_rate": 9.730754741574528e-05, + "loss": 0.4141, + "step": 336 + }, + { + "epoch": 0.43039591315453385, + "grad_norm": 0.30433609176089776, + "learning_rate": 9.72848908955907e-05, + "loss": 0.3577, + "step": 337 + }, + { + "epoch": 0.43167305236270753, + "grad_norm": 0.27385630860574245, + "learning_rate": 9.726214210912548e-05, + "loss": 0.349, + "step": 338 + }, + { + "epoch": 0.4329501915708812, + "grad_norm": 0.2974087425852774, + "learning_rate": 9.723930110073902e-05, + "loss": 0.3734, + "step": 339 + }, + { + "epoch": 0.4342273307790549, + "grad_norm": 0.37081198028636886, + "learning_rate": 9.721636791500064e-05, + "loss": 0.4237, + "step": 340 + }, + { + "epoch": 0.4355044699872286, + "grad_norm": 0.2724567265139735, + "learning_rate": 9.719334259665951e-05, + "loss": 0.4455, + "step": 341 + }, + { + "epoch": 0.4367816091954023, + "grad_norm": 0.31825940851903667, + "learning_rate": 9.71702251906446e-05, + "loss": 0.3761, + "step": 342 + }, + { + "epoch": 0.438058748403576, + "grad_norm": 0.36733268667862673, + "learning_rate": 9.714701574206456e-05, + "loss": 0.4026, + "step": 343 + }, + { + "epoch": 0.4393358876117497, + "grad_norm": 0.354818172913112, + "learning_rate": 9.712371429620766e-05, + "loss": 0.414, + "step": 344 + }, + { + "epoch": 0.44061302681992337, + "grad_norm": 0.26309859767596316, + "learning_rate": 9.710032089854166e-05, + "loss": 0.326, + "step": 345 + }, + { + "epoch": 0.44189016602809705, + "grad_norm": 0.3373824259140002, + "learning_rate": 9.707683559471375e-05, + "loss": 0.4232, + "step": 346 + }, + { + "epoch": 0.44316730523627074, + "grad_norm": 0.437990884363296, + "learning_rate": 9.705325843055045e-05, + "loss": 0.3852, + "step": 347 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.2729499543501723, + "learning_rate": 9.702958945205754e-05, + "loss": 0.3792, + "step": 348 + }, + { + "epoch": 0.44572158365261816, + "grad_norm": 0.6660372068909841, + "learning_rate": 9.700582870541996e-05, + "loss": 0.478, + "step": 349 + }, + { + "epoch": 0.44699872286079184, + "grad_norm": 0.2948604901519268, + "learning_rate": 9.698197623700168e-05, + "loss": 0.3863, + "step": 350 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 0.307371950237942, + "learning_rate": 9.69580320933457e-05, + "loss": 0.4335, + "step": 351 + }, + { + "epoch": 0.4495530012771392, + "grad_norm": 0.4179107352772687, + "learning_rate": 9.693399632117385e-05, + "loss": 0.4212, + "step": 352 + }, + { + "epoch": 0.4508301404853129, + "grad_norm": 0.44727188083593156, + "learning_rate": 9.690986896738678e-05, + "loss": 0.4497, + "step": 353 + }, + { + "epoch": 0.4521072796934866, + "grad_norm": 0.3984211260499416, + "learning_rate": 9.688565007906387e-05, + "loss": 0.351, + "step": 354 + }, + { + "epoch": 0.45338441890166026, + "grad_norm": 0.46947967849902955, + "learning_rate": 9.686133970346304e-05, + "loss": 0.4937, + "step": 355 + }, + { + "epoch": 0.454661558109834, + "grad_norm": 0.328214023899869, + "learning_rate": 9.683693788802078e-05, + "loss": 0.4157, + "step": 356 + }, + { + "epoch": 0.4559386973180077, + "grad_norm": 0.37785662596085595, + "learning_rate": 9.681244468035201e-05, + "loss": 0.3401, + "step": 357 + }, + { + "epoch": 0.45721583652618136, + "grad_norm": 0.33337156363802384, + "learning_rate": 9.678786012824994e-05, + "loss": 0.3853, + "step": 358 + }, + { + "epoch": 0.45849297573435505, + "grad_norm": 0.5794364396935966, + "learning_rate": 9.676318427968607e-05, + "loss": 0.3523, + "step": 359 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.5336339824100882, + "learning_rate": 9.673841718280999e-05, + "loss": 0.3651, + "step": 360 + }, + { + "epoch": 0.4610472541507024, + "grad_norm": 0.3229933793109108, + "learning_rate": 9.67135588859494e-05, + "loss": 0.3159, + "step": 361 + }, + { + "epoch": 0.4623243933588761, + "grad_norm": 0.3715045564349442, + "learning_rate": 9.668860943760991e-05, + "loss": 0.4287, + "step": 362 + }, + { + "epoch": 0.46360153256704983, + "grad_norm": 0.3697740700122699, + "learning_rate": 9.6663568886475e-05, + "loss": 0.4082, + "step": 363 + }, + { + "epoch": 0.4648786717752235, + "grad_norm": 0.2880739779670036, + "learning_rate": 9.663843728140597e-05, + "loss": 0.325, + "step": 364 + }, + { + "epoch": 0.4661558109833972, + "grad_norm": 0.30273632794681316, + "learning_rate": 9.66132146714417e-05, + "loss": 0.4259, + "step": 365 + }, + { + "epoch": 0.4674329501915709, + "grad_norm": 0.2832796471983712, + "learning_rate": 9.658790110579875e-05, + "loss": 0.3802, + "step": 366 + }, + { + "epoch": 0.46871008939974457, + "grad_norm": 0.31253273824990274, + "learning_rate": 9.656249663387107e-05, + "loss": 0.3855, + "step": 367 + }, + { + "epoch": 0.46998722860791825, + "grad_norm": 0.31950275466891576, + "learning_rate": 9.653700130523004e-05, + "loss": 0.4209, + "step": 368 + }, + { + "epoch": 0.47126436781609193, + "grad_norm": 0.29754508295940835, + "learning_rate": 9.65114151696243e-05, + "loss": 0.4329, + "step": 369 + }, + { + "epoch": 0.4725415070242657, + "grad_norm": 0.40415627231905743, + "learning_rate": 9.648573827697975e-05, + "loss": 0.4532, + "step": 370 + }, + { + "epoch": 0.47381864623243936, + "grad_norm": 0.35453580122646283, + "learning_rate": 9.645997067739928e-05, + "loss": 0.3901, + "step": 371 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 0.26997456765875794, + "learning_rate": 9.643411242116285e-05, + "loss": 0.4319, + "step": 372 + }, + { + "epoch": 0.4763729246487867, + "grad_norm": 0.3552534993795924, + "learning_rate": 9.64081635587273e-05, + "loss": 0.4366, + "step": 373 + }, + { + "epoch": 0.4776500638569604, + "grad_norm": 0.2983058989312288, + "learning_rate": 9.638212414072625e-05, + "loss": 0.409, + "step": 374 + }, + { + "epoch": 0.4789272030651341, + "grad_norm": 0.31879058918079395, + "learning_rate": 9.635599421797003e-05, + "loss": 0.3876, + "step": 375 + }, + { + "epoch": 0.48020434227330777, + "grad_norm": 0.2807856053136191, + "learning_rate": 9.63297738414456e-05, + "loss": 0.3932, + "step": 376 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.3088727571085651, + "learning_rate": 9.630346306231637e-05, + "loss": 0.4674, + "step": 377 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 0.4397495715431768, + "learning_rate": 9.627706193192218e-05, + "loss": 0.4128, + "step": 378 + }, + { + "epoch": 0.4840357598978289, + "grad_norm": 0.3722389786764484, + "learning_rate": 9.625057050177917e-05, + "loss": 0.4836, + "step": 379 + }, + { + "epoch": 0.48531289910600256, + "grad_norm": 0.31249295057528653, + "learning_rate": 9.622398882357968e-05, + "loss": 0.4069, + "step": 380 + }, + { + "epoch": 0.48659003831417624, + "grad_norm": 0.2963790171590838, + "learning_rate": 9.619731694919211e-05, + "loss": 0.3949, + "step": 381 + }, + { + "epoch": 0.4878671775223499, + "grad_norm": 0.45827608455201946, + "learning_rate": 9.617055493066096e-05, + "loss": 0.3737, + "step": 382 + }, + { + "epoch": 0.4891443167305236, + "grad_norm": 0.4334021902780183, + "learning_rate": 9.614370282020651e-05, + "loss": 0.3388, + "step": 383 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 0.2905369034008204, + "learning_rate": 9.611676067022493e-05, + "loss": 0.4186, + "step": 384 + }, + { + "epoch": 0.49169859514687103, + "grad_norm": 0.26553109452134815, + "learning_rate": 9.6089728533288e-05, + "loss": 0.4342, + "step": 385 + }, + { + "epoch": 0.4929757343550447, + "grad_norm": 0.3100237499548668, + "learning_rate": 9.606260646214313e-05, + "loss": 0.3985, + "step": 386 + }, + { + "epoch": 0.4942528735632184, + "grad_norm": 0.33327236691683704, + "learning_rate": 9.603539450971327e-05, + "loss": 0.3991, + "step": 387 + }, + { + "epoch": 0.4955300127713921, + "grad_norm": 1.9708349007651818, + "learning_rate": 9.600809272909664e-05, + "loss": 0.3948, + "step": 388 + }, + { + "epoch": 0.49680715197956576, + "grad_norm": 0.33586230469466616, + "learning_rate": 9.598070117356684e-05, + "loss": 0.4186, + "step": 389 + }, + { + "epoch": 0.49808429118773945, + "grad_norm": 0.28669291350447806, + "learning_rate": 9.595321989657258e-05, + "loss": 0.3791, + "step": 390 + }, + { + "epoch": 0.49936143039591313, + "grad_norm": 0.40479105813767874, + "learning_rate": 9.59256489517377e-05, + "loss": 0.4115, + "step": 391 + }, + { + "epoch": 0.5006385696040868, + "grad_norm": 0.3393616432842008, + "learning_rate": 9.589798839286097e-05, + "loss": 0.38, + "step": 392 + }, + { + "epoch": 0.5019157088122606, + "grad_norm": 0.2896468663079358, + "learning_rate": 9.587023827391602e-05, + "loss": 0.4051, + "step": 393 + }, + { + "epoch": 0.5031928480204342, + "grad_norm": 0.2986045621872122, + "learning_rate": 9.584239864905126e-05, + "loss": 0.327, + "step": 394 + }, + { + "epoch": 0.5044699872286079, + "grad_norm": 0.3305726918767087, + "learning_rate": 9.581446957258974e-05, + "loss": 0.4133, + "step": 395 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 0.2800896811339513, + "learning_rate": 9.578645109902904e-05, + "loss": 0.4019, + "step": 396 + }, + { + "epoch": 0.5070242656449553, + "grad_norm": 0.3791084243395354, + "learning_rate": 9.575834328304121e-05, + "loss": 0.4218, + "step": 397 + }, + { + "epoch": 0.508301404853129, + "grad_norm": 0.2823960387617122, + "learning_rate": 9.57301461794726e-05, + "loss": 0.4382, + "step": 398 + }, + { + "epoch": 0.5095785440613027, + "grad_norm": 0.3050971322392535, + "learning_rate": 9.570185984334383e-05, + "loss": 0.378, + "step": 399 + }, + { + "epoch": 0.5108556832694764, + "grad_norm": 0.3901414267404712, + "learning_rate": 9.567348432984957e-05, + "loss": 0.3145, + "step": 400 + }, + { + "epoch": 0.51213282247765, + "grad_norm": 0.3323938095955713, + "learning_rate": 9.56450196943586e-05, + "loss": 0.4359, + "step": 401 + }, + { + "epoch": 0.5134099616858238, + "grad_norm": 0.2712299864452788, + "learning_rate": 9.561646599241348e-05, + "loss": 0.3667, + "step": 402 + }, + { + "epoch": 0.5146871008939975, + "grad_norm": 0.33184315079784416, + "learning_rate": 9.558782327973068e-05, + "loss": 0.382, + "step": 403 + }, + { + "epoch": 0.5159642401021711, + "grad_norm": 0.42132112963642676, + "learning_rate": 9.555909161220025e-05, + "loss": 0.3924, + "step": 404 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.7683812407219701, + "learning_rate": 9.553027104588592e-05, + "loss": 0.3479, + "step": 405 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.9922742352041047, + "learning_rate": 9.550136163702482e-05, + "loss": 0.4331, + "step": 406 + }, + { + "epoch": 0.5197956577266922, + "grad_norm": 0.29885495068203677, + "learning_rate": 9.547236344202743e-05, + "loss": 0.4128, + "step": 407 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 0.3175969333480194, + "learning_rate": 9.544327651747754e-05, + "loss": 0.3688, + "step": 408 + }, + { + "epoch": 0.5223499361430396, + "grad_norm": 0.3601889239142128, + "learning_rate": 9.541410092013201e-05, + "loss": 0.423, + "step": 409 + }, + { + "epoch": 0.5236270753512133, + "grad_norm": 0.2702938546699287, + "learning_rate": 9.538483670692076e-05, + "loss": 0.3137, + "step": 410 + }, + { + "epoch": 0.524904214559387, + "grad_norm": 0.34649300799635185, + "learning_rate": 9.535548393494661e-05, + "loss": 0.3456, + "step": 411 + }, + { + "epoch": 0.5261813537675607, + "grad_norm": 0.33548190113555515, + "learning_rate": 9.53260426614852e-05, + "loss": 0.4153, + "step": 412 + }, + { + "epoch": 0.5274584929757343, + "grad_norm": 0.9022269046679203, + "learning_rate": 9.529651294398484e-05, + "loss": 0.4186, + "step": 413 + }, + { + "epoch": 0.5287356321839081, + "grad_norm": 0.37925640456475396, + "learning_rate": 9.526689484006647e-05, + "loss": 0.4017, + "step": 414 + }, + { + "epoch": 0.5300127713920817, + "grad_norm": 0.30152848034709756, + "learning_rate": 9.523718840752343e-05, + "loss": 0.441, + "step": 415 + }, + { + "epoch": 0.5312899106002554, + "grad_norm": 0.34240355373234027, + "learning_rate": 9.520739370432143e-05, + "loss": 0.4327, + "step": 416 + }, + { + "epoch": 0.5325670498084292, + "grad_norm": 0.2984256968169551, + "learning_rate": 9.517751078859848e-05, + "loss": 0.3883, + "step": 417 + }, + { + "epoch": 0.5338441890166028, + "grad_norm": 0.23426918481378242, + "learning_rate": 9.514753971866462e-05, + "loss": 0.368, + "step": 418 + }, + { + "epoch": 0.5351213282247765, + "grad_norm": 0.2539534683046424, + "learning_rate": 9.5117480553002e-05, + "loss": 0.3685, + "step": 419 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.24530420862186916, + "learning_rate": 9.508733335026461e-05, + "loss": 0.3904, + "step": 420 + }, + { + "epoch": 0.5376756066411239, + "grad_norm": 0.4944277744643429, + "learning_rate": 9.505709816927823e-05, + "loss": 0.3727, + "step": 421 + }, + { + "epoch": 0.5389527458492975, + "grad_norm": 0.30098747473095155, + "learning_rate": 9.502677506904034e-05, + "loss": 0.4205, + "step": 422 + }, + { + "epoch": 0.5402298850574713, + "grad_norm": 0.3635600534739069, + "learning_rate": 9.499636410871995e-05, + "loss": 0.4326, + "step": 423 + }, + { + "epoch": 0.541507024265645, + "grad_norm": 0.26858678559864974, + "learning_rate": 9.496586534765753e-05, + "loss": 0.4213, + "step": 424 + }, + { + "epoch": 0.5427841634738186, + "grad_norm": 0.2986594777552593, + "learning_rate": 9.493527884536486e-05, + "loss": 0.3203, + "step": 425 + }, + { + "epoch": 0.5440613026819924, + "grad_norm": 0.24792491785610843, + "learning_rate": 9.490460466152491e-05, + "loss": 0.3706, + "step": 426 + }, + { + "epoch": 0.545338441890166, + "grad_norm": 0.2224503116585505, + "learning_rate": 9.487384285599179e-05, + "loss": 0.3511, + "step": 427 + }, + { + "epoch": 0.5466155810983397, + "grad_norm": 0.31768333399010656, + "learning_rate": 9.484299348879054e-05, + "loss": 0.3491, + "step": 428 + }, + { + "epoch": 0.5478927203065134, + "grad_norm": 0.4008703110913062, + "learning_rate": 9.48120566201171e-05, + "loss": 0.4088, + "step": 429 + }, + { + "epoch": 0.5491698595146871, + "grad_norm": 0.32127798245469175, + "learning_rate": 9.478103231033808e-05, + "loss": 0.3481, + "step": 430 + }, + { + "epoch": 0.5504469987228607, + "grad_norm": 0.2986252479760289, + "learning_rate": 9.47499206199908e-05, + "loss": 0.4281, + "step": 431 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3717778655401205, + "learning_rate": 9.471872160978303e-05, + "loss": 0.4392, + "step": 432 + }, + { + "epoch": 0.5530012771392082, + "grad_norm": 0.9104005379234426, + "learning_rate": 9.468743534059294e-05, + "loss": 0.4191, + "step": 433 + }, + { + "epoch": 0.5542784163473818, + "grad_norm": 0.2893292522937076, + "learning_rate": 9.465606187346896e-05, + "loss": 0.3656, + "step": 434 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.2759868181154408, + "learning_rate": 9.462460126962969e-05, + "loss": 0.3301, + "step": 435 + }, + { + "epoch": 0.5568326947637292, + "grad_norm": 0.2925867466376509, + "learning_rate": 9.459305359046373e-05, + "loss": 0.4163, + "step": 436 + }, + { + "epoch": 0.558109833971903, + "grad_norm": 0.42038784075536273, + "learning_rate": 9.456141889752958e-05, + "loss": 0.4693, + "step": 437 + }, + { + "epoch": 0.5593869731800766, + "grad_norm": 0.38736398031810176, + "learning_rate": 9.452969725255558e-05, + "loss": 0.3515, + "step": 438 + }, + { + "epoch": 0.5606641123882503, + "grad_norm": 0.3263997154327739, + "learning_rate": 9.449788871743971e-05, + "loss": 0.3616, + "step": 439 + }, + { + "epoch": 0.561941251596424, + "grad_norm": 0.23078287071936895, + "learning_rate": 9.446599335424948e-05, + "loss": 0.3513, + "step": 440 + }, + { + "epoch": 0.5632183908045977, + "grad_norm": 0.281307045257724, + "learning_rate": 9.443401122522185e-05, + "loss": 0.3411, + "step": 441 + }, + { + "epoch": 0.5644955300127714, + "grad_norm": 0.34429215935669905, + "learning_rate": 9.440194239276308e-05, + "loss": 0.3925, + "step": 442 + }, + { + "epoch": 0.565772669220945, + "grad_norm": 0.31984444187072053, + "learning_rate": 9.436978691944859e-05, + "loss": 0.432, + "step": 443 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 0.609405045425876, + "learning_rate": 9.433754486802291e-05, + "loss": 0.3664, + "step": 444 + }, + { + "epoch": 0.5683269476372924, + "grad_norm": 0.33216872303732786, + "learning_rate": 9.430521630139945e-05, + "loss": 0.4157, + "step": 445 + }, + { + "epoch": 0.5696040868454662, + "grad_norm": 0.33919760709226193, + "learning_rate": 9.42728012826605e-05, + "loss": 0.3703, + "step": 446 + }, + { + "epoch": 0.5708812260536399, + "grad_norm": 0.6057551634790113, + "learning_rate": 9.424029987505697e-05, + "loss": 0.4134, + "step": 447 + }, + { + "epoch": 0.5721583652618135, + "grad_norm": 0.4405877875484886, + "learning_rate": 9.420771214200842e-05, + "loss": 0.3902, + "step": 448 + }, + { + "epoch": 0.5734355044699873, + "grad_norm": 0.2664765726080314, + "learning_rate": 9.417503814710278e-05, + "loss": 0.4046, + "step": 449 + }, + { + "epoch": 0.5747126436781609, + "grad_norm": 0.3025987541707025, + "learning_rate": 9.414227795409634e-05, + "loss": 0.4226, + "step": 450 + }, + { + "epoch": 0.5759897828863346, + "grad_norm": 0.33317180236362265, + "learning_rate": 9.410943162691359e-05, + "loss": 0.3862, + "step": 451 + }, + { + "epoch": 0.5772669220945083, + "grad_norm": 0.2755345366961821, + "learning_rate": 9.407649922964708e-05, + "loss": 0.3979, + "step": 452 + }, + { + "epoch": 0.578544061302682, + "grad_norm": 0.3132738578051779, + "learning_rate": 9.404348082655732e-05, + "loss": 0.4431, + "step": 453 + }, + { + "epoch": 0.5798212005108557, + "grad_norm": 0.3429382308665615, + "learning_rate": 9.40103764820726e-05, + "loss": 0.3893, + "step": 454 + }, + { + "epoch": 0.5810983397190294, + "grad_norm": 0.19420865356855255, + "learning_rate": 9.397718626078899e-05, + "loss": 0.3464, + "step": 455 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 0.29883152781151073, + "learning_rate": 9.394391022747005e-05, + "loss": 0.4205, + "step": 456 + }, + { + "epoch": 0.5836526181353767, + "grad_norm": 0.37079612543973106, + "learning_rate": 9.391054844704681e-05, + "loss": 0.3088, + "step": 457 + }, + { + "epoch": 0.5849297573435505, + "grad_norm": 0.2650503759425465, + "learning_rate": 9.387710098461764e-05, + "loss": 0.3953, + "step": 458 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 0.2737044502224149, + "learning_rate": 9.384356790544807e-05, + "loss": 0.3486, + "step": 459 + }, + { + "epoch": 0.5874840357598978, + "grad_norm": 0.26778448719446524, + "learning_rate": 9.380994927497068e-05, + "loss": 0.3734, + "step": 460 + }, + { + "epoch": 0.5887611749680716, + "grad_norm": 0.4589103213346212, + "learning_rate": 9.377624515878504e-05, + "loss": 0.3154, + "step": 461 + }, + { + "epoch": 0.5900383141762452, + "grad_norm": 0.4624001561409793, + "learning_rate": 9.37424556226575e-05, + "loss": 0.4414, + "step": 462 + }, + { + "epoch": 0.5913154533844189, + "grad_norm": 0.3984615646504987, + "learning_rate": 9.370858073252105e-05, + "loss": 0.4873, + "step": 463 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.5175844076824656, + "learning_rate": 9.367462055447528e-05, + "loss": 0.2701, + "step": 464 + }, + { + "epoch": 0.5938697318007663, + "grad_norm": 0.3724090236912593, + "learning_rate": 9.364057515478619e-05, + "loss": 0.3266, + "step": 465 + }, + { + "epoch": 0.5951468710089399, + "grad_norm": 0.3189814611184616, + "learning_rate": 9.360644459988607e-05, + "loss": 0.3539, + "step": 466 + }, + { + "epoch": 0.5964240102171137, + "grad_norm": 0.3738302135401018, + "learning_rate": 9.357222895637337e-05, + "loss": 0.411, + "step": 467 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 0.3630144632488912, + "learning_rate": 9.353792829101255e-05, + "loss": 0.3732, + "step": 468 + }, + { + "epoch": 0.598978288633461, + "grad_norm": 0.2585863541434966, + "learning_rate": 9.3503542670734e-05, + "loss": 0.3878, + "step": 469 + }, + { + "epoch": 0.6002554278416348, + "grad_norm": 0.2792865205465122, + "learning_rate": 9.34690721626339e-05, + "loss": 0.4211, + "step": 470 + }, + { + "epoch": 0.6015325670498084, + "grad_norm": 0.5017089354089542, + "learning_rate": 9.343451683397402e-05, + "loss": 0.4902, + "step": 471 + }, + { + "epoch": 0.6028097062579821, + "grad_norm": 0.28030439249454175, + "learning_rate": 9.339987675218164e-05, + "loss": 0.4399, + "step": 472 + }, + { + "epoch": 0.6040868454661558, + "grad_norm": 0.31788686560960194, + "learning_rate": 9.33651519848495e-05, + "loss": 0.4229, + "step": 473 + }, + { + "epoch": 0.6053639846743295, + "grad_norm": 0.3124165345863479, + "learning_rate": 9.33303425997355e-05, + "loss": 0.3268, + "step": 474 + }, + { + "epoch": 0.6066411238825032, + "grad_norm": 0.2642139646532633, + "learning_rate": 9.329544866476266e-05, + "loss": 0.3413, + "step": 475 + }, + { + "epoch": 0.6079182630906769, + "grad_norm": 0.22468708185182132, + "learning_rate": 9.326047024801902e-05, + "loss": 0.387, + "step": 476 + }, + { + "epoch": 0.6091954022988506, + "grad_norm": 0.27453467560952466, + "learning_rate": 9.322540741775744e-05, + "loss": 0.3232, + "step": 477 + }, + { + "epoch": 0.6104725415070242, + "grad_norm": 0.37017056909355106, + "learning_rate": 9.319026024239551e-05, + "loss": 0.3799, + "step": 478 + }, + { + "epoch": 0.611749680715198, + "grad_norm": 0.2730422643278645, + "learning_rate": 9.315502879051541e-05, + "loss": 0.3152, + "step": 479 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.6443939460742217, + "learning_rate": 9.311971313086371e-05, + "loss": 0.3849, + "step": 480 + }, + { + "epoch": 0.6143039591315453, + "grad_norm": 0.29433493251571485, + "learning_rate": 9.308431333235139e-05, + "loss": 0.4014, + "step": 481 + }, + { + "epoch": 0.6155810983397191, + "grad_norm": 0.3968397706888878, + "learning_rate": 9.304882946405351e-05, + "loss": 0.3559, + "step": 482 + }, + { + "epoch": 0.6168582375478927, + "grad_norm": 0.2540598832030671, + "learning_rate": 9.301326159520924e-05, + "loss": 0.3377, + "step": 483 + }, + { + "epoch": 0.6181353767560664, + "grad_norm": 0.26946300223409186, + "learning_rate": 9.297760979522166e-05, + "loss": 0.3714, + "step": 484 + }, + { + "epoch": 0.6194125159642401, + "grad_norm": 0.28407145568429026, + "learning_rate": 9.294187413365756e-05, + "loss": 0.2857, + "step": 485 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.36056758826476154, + "learning_rate": 9.290605468024743e-05, + "loss": 0.3229, + "step": 486 + }, + { + "epoch": 0.6219667943805874, + "grad_norm": 0.28228388866333737, + "learning_rate": 9.287015150488523e-05, + "loss": 0.3873, + "step": 487 + }, + { + "epoch": 0.6232439335887612, + "grad_norm": 0.4496999019346401, + "learning_rate": 9.28341646776283e-05, + "loss": 0.4514, + "step": 488 + }, + { + "epoch": 0.6245210727969349, + "grad_norm": 0.35190583086571947, + "learning_rate": 9.279809426869723e-05, + "loss": 0.3344, + "step": 489 + }, + { + "epoch": 0.6257982120051085, + "grad_norm": 0.5177856821418574, + "learning_rate": 9.276194034847566e-05, + "loss": 0.3505, + "step": 490 + }, + { + "epoch": 0.6270753512132823, + "grad_norm": 0.3598759132863296, + "learning_rate": 9.272570298751018e-05, + "loss": 0.3254, + "step": 491 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 0.26045586869918125, + "learning_rate": 9.268938225651027e-05, + "loss": 0.3747, + "step": 492 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.40373429571911573, + "learning_rate": 9.265297822634798e-05, + "loss": 0.4309, + "step": 493 + }, + { + "epoch": 0.6309067688378033, + "grad_norm": 0.2346389205993432, + "learning_rate": 9.261649096805798e-05, + "loss": 0.3313, + "step": 494 + }, + { + "epoch": 0.632183908045977, + "grad_norm": 0.22860876281160566, + "learning_rate": 9.257992055283734e-05, + "loss": 0.3203, + "step": 495 + }, + { + "epoch": 0.6334610472541508, + "grad_norm": 0.2807924845293709, + "learning_rate": 9.254326705204535e-05, + "loss": 0.3538, + "step": 496 + }, + { + "epoch": 0.6347381864623244, + "grad_norm": 0.4660955486981567, + "learning_rate": 9.250653053720344e-05, + "loss": 0.3452, + "step": 497 + }, + { + "epoch": 0.6360153256704981, + "grad_norm": 0.37465757206128547, + "learning_rate": 9.246971107999504e-05, + "loss": 0.4168, + "step": 498 + }, + { + "epoch": 0.6372924648786717, + "grad_norm": 0.37134679259727643, + "learning_rate": 9.243280875226543e-05, + "loss": 0.3946, + "step": 499 + }, + { + "epoch": 0.6385696040868455, + "grad_norm": 0.2712481089635525, + "learning_rate": 9.239582362602155e-05, + "loss": 0.3927, + "step": 500 + }, + { + "epoch": 0.6398467432950191, + "grad_norm": 0.32826955362425314, + "learning_rate": 9.235875577343195e-05, + "loss": 0.4776, + "step": 501 + }, + { + "epoch": 0.6411238825031929, + "grad_norm": 0.2477377629059308, + "learning_rate": 9.232160526682658e-05, + "loss": 0.3869, + "step": 502 + }, + { + "epoch": 0.6424010217113666, + "grad_norm": 0.35565807534360316, + "learning_rate": 9.228437217869667e-05, + "loss": 0.3282, + "step": 503 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 0.2628936854684581, + "learning_rate": 9.22470565816946e-05, + "loss": 0.3713, + "step": 504 + }, + { + "epoch": 0.644955300127714, + "grad_norm": 0.4764707004577683, + "learning_rate": 9.220965854863375e-05, + "loss": 0.397, + "step": 505 + }, + { + "epoch": 0.6462324393358876, + "grad_norm": 0.2946949235130695, + "learning_rate": 9.217217815248834e-05, + "loss": 0.3405, + "step": 506 + }, + { + "epoch": 0.6475095785440613, + "grad_norm": 0.614404562962857, + "learning_rate": 9.213461546639333e-05, + "loss": 0.3315, + "step": 507 + }, + { + "epoch": 0.648786717752235, + "grad_norm": 0.28206598499210805, + "learning_rate": 9.209697056364422e-05, + "loss": 0.3841, + "step": 508 + }, + { + "epoch": 0.6500638569604087, + "grad_norm": 1.2710076793427971, + "learning_rate": 9.205924351769694e-05, + "loss": 0.3657, + "step": 509 + }, + { + "epoch": 0.6513409961685823, + "grad_norm": 0.2872245384022452, + "learning_rate": 9.202143440216777e-05, + "loss": 0.3863, + "step": 510 + }, + { + "epoch": 0.6526181353767561, + "grad_norm": 0.31030947822429783, + "learning_rate": 9.198354329083303e-05, + "loss": 0.2583, + "step": 511 + }, + { + "epoch": 0.6538952745849298, + "grad_norm": 0.2747449217924041, + "learning_rate": 9.194557025762911e-05, + "loss": 0.3754, + "step": 512 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 0.44928420004877295, + "learning_rate": 9.190751537665222e-05, + "loss": 0.3394, + "step": 513 + }, + { + "epoch": 0.6564495530012772, + "grad_norm": 0.7319845242568958, + "learning_rate": 9.18693787221583e-05, + "loss": 0.3697, + "step": 514 + }, + { + "epoch": 0.6577266922094508, + "grad_norm": 0.6606993061415445, + "learning_rate": 9.183116036856283e-05, + "loss": 0.2649, + "step": 515 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 0.41807791751689144, + "learning_rate": 9.179286039044073e-05, + "loss": 0.4092, + "step": 516 + }, + { + "epoch": 0.6602809706257982, + "grad_norm": 0.3852444282806701, + "learning_rate": 9.175447886252617e-05, + "loss": 0.3917, + "step": 517 + }, + { + "epoch": 0.6615581098339719, + "grad_norm": 0.3303766038001163, + "learning_rate": 9.171601585971248e-05, + "loss": 0.3765, + "step": 518 + }, + { + "epoch": 0.6628352490421456, + "grad_norm": 1.1114478334612625, + "learning_rate": 9.167747145705194e-05, + "loss": 0.3305, + "step": 519 + }, + { + "epoch": 0.6641123882503193, + "grad_norm": 0.4063417518479549, + "learning_rate": 9.163884572975566e-05, + "loss": 0.4313, + "step": 520 + }, + { + "epoch": 0.665389527458493, + "grad_norm": 0.36345639092512144, + "learning_rate": 9.160013875319347e-05, + "loss": 0.3185, + "step": 521 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.7845898633972624, + "learning_rate": 9.156135060289372e-05, + "loss": 0.3172, + "step": 522 + }, + { + "epoch": 0.6679438058748404, + "grad_norm": 0.33691261894292734, + "learning_rate": 9.152248135454316e-05, + "loss": 0.3296, + "step": 523 + }, + { + "epoch": 0.669220945083014, + "grad_norm": 0.4841909272626947, + "learning_rate": 9.148353108398677e-05, + "loss": 0.4784, + "step": 524 + }, + { + "epoch": 0.6704980842911877, + "grad_norm": 0.31693386591545597, + "learning_rate": 9.144449986722763e-05, + "loss": 0.3706, + "step": 525 + }, + { + "epoch": 0.6717752234993615, + "grad_norm": 0.6094962948275232, + "learning_rate": 9.140538778042682e-05, + "loss": 0.4371, + "step": 526 + }, + { + "epoch": 0.6730523627075351, + "grad_norm": 0.3220928338725758, + "learning_rate": 9.136619489990312e-05, + "loss": 0.4, + "step": 527 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 0.2794834966583165, + "learning_rate": 9.132692130213307e-05, + "loss": 0.3366, + "step": 528 + }, + { + "epoch": 0.6756066411238825, + "grad_norm": 0.3641880735962448, + "learning_rate": 9.128756706375065e-05, + "loss": 0.362, + "step": 529 + }, + { + "epoch": 0.6768837803320562, + "grad_norm": 0.3020139318173461, + "learning_rate": 9.124813226154718e-05, + "loss": 0.3956, + "step": 530 + }, + { + "epoch": 0.6781609195402298, + "grad_norm": 0.4112752414482164, + "learning_rate": 9.120861697247124e-05, + "loss": 0.3811, + "step": 531 + }, + { + "epoch": 0.6794380587484036, + "grad_norm": 0.32093935778040117, + "learning_rate": 9.11690212736284e-05, + "loss": 0.4164, + "step": 532 + }, + { + "epoch": 0.6807151979565773, + "grad_norm": 0.3786354137275982, + "learning_rate": 9.11293452422812e-05, + "loss": 0.359, + "step": 533 + }, + { + "epoch": 0.6819923371647509, + "grad_norm": 0.6438889819503103, + "learning_rate": 9.108958895584887e-05, + "loss": 0.3819, + "step": 534 + }, + { + "epoch": 0.6832694763729247, + "grad_norm": 0.3191986145774816, + "learning_rate": 9.104975249190727e-05, + "loss": 0.3153, + "step": 535 + }, + { + "epoch": 0.6845466155810983, + "grad_norm": 0.24044384309847125, + "learning_rate": 9.100983592818873e-05, + "loss": 0.3743, + "step": 536 + }, + { + "epoch": 0.685823754789272, + "grad_norm": 0.28993631232434924, + "learning_rate": 9.09698393425818e-05, + "loss": 0.3421, + "step": 537 + }, + { + "epoch": 0.6871008939974457, + "grad_norm": 0.2700642764361038, + "learning_rate": 9.092976281313126e-05, + "loss": 0.349, + "step": 538 + }, + { + "epoch": 0.6883780332056194, + "grad_norm": 0.41587013229963143, + "learning_rate": 9.088960641803786e-05, + "loss": 0.4676, + "step": 539 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.26622687617468477, + "learning_rate": 9.084937023565815e-05, + "loss": 0.3543, + "step": 540 + }, + { + "epoch": 0.6909323116219668, + "grad_norm": 0.33008316785699077, + "learning_rate": 9.080905434450445e-05, + "loss": 0.406, + "step": 541 + }, + { + "epoch": 0.6922094508301405, + "grad_norm": 0.2642873655799442, + "learning_rate": 9.076865882324452e-05, + "loss": 0.3708, + "step": 542 + }, + { + "epoch": 0.6934865900383141, + "grad_norm": 0.32683714017582605, + "learning_rate": 9.072818375070156e-05, + "loss": 0.4099, + "step": 543 + }, + { + "epoch": 0.6947637292464879, + "grad_norm": 0.40037717009219825, + "learning_rate": 9.068762920585399e-05, + "loss": 0.3659, + "step": 544 + }, + { + "epoch": 0.6960408684546615, + "grad_norm": 0.3364371468910221, + "learning_rate": 9.064699526783527e-05, + "loss": 0.3576, + "step": 545 + }, + { + "epoch": 0.6973180076628352, + "grad_norm": 0.25826494517622156, + "learning_rate": 9.060628201593383e-05, + "loss": 0.3927, + "step": 546 + }, + { + "epoch": 0.698595146871009, + "grad_norm": 0.287257448298585, + "learning_rate": 9.056548952959283e-05, + "loss": 0.3677, + "step": 547 + }, + { + "epoch": 0.6998722860791826, + "grad_norm": 0.25925224386627793, + "learning_rate": 9.052461788841005e-05, + "loss": 0.4103, + "step": 548 + }, + { + "epoch": 0.7011494252873564, + "grad_norm": 0.36259471525220577, + "learning_rate": 9.048366717213772e-05, + "loss": 0.3763, + "step": 549 + }, + { + "epoch": 0.70242656449553, + "grad_norm": 0.2428197503619159, + "learning_rate": 9.044263746068236e-05, + "loss": 0.3546, + "step": 550 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.32504551969098644, + "learning_rate": 9.040152883410464e-05, + "loss": 0.4089, + "step": 551 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 0.2787099564248101, + "learning_rate": 9.036034137261924e-05, + "loss": 0.3259, + "step": 552 + }, + { + "epoch": 0.7062579821200511, + "grad_norm": 0.23682449249418888, + "learning_rate": 9.031907515659464e-05, + "loss": 0.3804, + "step": 553 + }, + { + "epoch": 0.7075351213282248, + "grad_norm": 0.3414518060063429, + "learning_rate": 9.027773026655297e-05, + "loss": 0.4245, + "step": 554 + }, + { + "epoch": 0.7088122605363985, + "grad_norm": 0.29352714375352856, + "learning_rate": 9.023630678316995e-05, + "loss": 0.42, + "step": 555 + }, + { + "epoch": 0.7100893997445722, + "grad_norm": 0.2491350399758305, + "learning_rate": 9.019480478727458e-05, + "loss": 0.2914, + "step": 556 + }, + { + "epoch": 0.7113665389527458, + "grad_norm": 0.28815857311957543, + "learning_rate": 9.015322435984909e-05, + "loss": 0.3606, + "step": 557 + }, + { + "epoch": 0.7126436781609196, + "grad_norm": 0.2541371899382567, + "learning_rate": 9.011156558202877e-05, + "loss": 0.3954, + "step": 558 + }, + { + "epoch": 0.7139208173690932, + "grad_norm": 0.23422085020105415, + "learning_rate": 9.006982853510177e-05, + "loss": 0.3928, + "step": 559 + }, + { + "epoch": 0.7151979565772669, + "grad_norm": 0.8249788095685954, + "learning_rate": 9.0028013300509e-05, + "loss": 0.4623, + "step": 560 + }, + { + "epoch": 0.7164750957854407, + "grad_norm": 0.2700551648845606, + "learning_rate": 8.998611995984387e-05, + "loss": 0.3571, + "step": 561 + }, + { + "epoch": 0.7177522349936143, + "grad_norm": 0.268116051941635, + "learning_rate": 8.994414859485228e-05, + "loss": 0.3951, + "step": 562 + }, + { + "epoch": 0.719029374201788, + "grad_norm": 0.3550631539560281, + "learning_rate": 8.99020992874323e-05, + "loss": 0.4595, + "step": 563 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 0.2501435179265175, + "learning_rate": 8.985997211963413e-05, + "loss": 0.3036, + "step": 564 + }, + { + "epoch": 0.7215836526181354, + "grad_norm": 0.5246624541466447, + "learning_rate": 8.981776717365992e-05, + "loss": 0.376, + "step": 565 + }, + { + "epoch": 0.722860791826309, + "grad_norm": 0.25985368654245716, + "learning_rate": 8.977548453186353e-05, + "loss": 0.3761, + "step": 566 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 1.594801130189487, + "learning_rate": 8.973312427675046e-05, + "loss": 0.2682, + "step": 567 + }, + { + "epoch": 0.7254150702426565, + "grad_norm": 0.2645997124055398, + "learning_rate": 8.969068649097766e-05, + "loss": 0.3161, + "step": 568 + }, + { + "epoch": 0.7266922094508301, + "grad_norm": 0.2672610751290293, + "learning_rate": 8.964817125735337e-05, + "loss": 0.2992, + "step": 569 + }, + { + "epoch": 0.7279693486590039, + "grad_norm": 0.38376152793947066, + "learning_rate": 8.960557865883689e-05, + "loss": 0.3936, + "step": 570 + }, + { + "epoch": 0.7292464878671775, + "grad_norm": 0.2397439198310057, + "learning_rate": 8.956290877853857e-05, + "loss": 0.3579, + "step": 571 + }, + { + "epoch": 0.7305236270753512, + "grad_norm": 0.28795324456034993, + "learning_rate": 8.95201616997195e-05, + "loss": 0.353, + "step": 572 + }, + { + "epoch": 0.7318007662835249, + "grad_norm": 0.4395391388599923, + "learning_rate": 8.947733750579146e-05, + "loss": 0.3345, + "step": 573 + }, + { + "epoch": 0.7330779054916986, + "grad_norm": 0.5026061381809127, + "learning_rate": 8.943443628031663e-05, + "loss": 0.3167, + "step": 574 + }, + { + "epoch": 0.7343550446998723, + "grad_norm": 0.30225593109656934, + "learning_rate": 8.939145810700755e-05, + "loss": 0.4049, + "step": 575 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 0.2740536016511761, + "learning_rate": 8.93484030697269e-05, + "loss": 0.3874, + "step": 576 + }, + { + "epoch": 0.7369093231162197, + "grad_norm": 0.2158369834655232, + "learning_rate": 8.930527125248734e-05, + "loss": 0.3111, + "step": 577 + }, + { + "epoch": 0.7381864623243933, + "grad_norm": 0.36477121652192473, + "learning_rate": 8.926206273945133e-05, + "loss": 0.3702, + "step": 578 + }, + { + "epoch": 0.7394636015325671, + "grad_norm": 0.3224599920913378, + "learning_rate": 8.921877761493102e-05, + "loss": 0.3946, + "step": 579 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3599769520690838, + "learning_rate": 8.917541596338806e-05, + "loss": 0.3799, + "step": 580 + }, + { + "epoch": 0.7420178799489144, + "grad_norm": 1.6853742371650222, + "learning_rate": 8.913197786943336e-05, + "loss": 0.3622, + "step": 581 + }, + { + "epoch": 0.7432950191570882, + "grad_norm": 0.3224409065974624, + "learning_rate": 8.908846341782705e-05, + "loss": 0.3353, + "step": 582 + }, + { + "epoch": 0.7445721583652618, + "grad_norm": 0.32595267214045975, + "learning_rate": 8.904487269347823e-05, + "loss": 0.3589, + "step": 583 + }, + { + "epoch": 0.7458492975734355, + "grad_norm": 0.25480946497010437, + "learning_rate": 8.900120578144486e-05, + "loss": 0.4117, + "step": 584 + }, + { + "epoch": 0.7471264367816092, + "grad_norm": 0.3436861407292856, + "learning_rate": 8.895746276693353e-05, + "loss": 0.3584, + "step": 585 + }, + { + "epoch": 0.7484035759897829, + "grad_norm": 0.3745257660925739, + "learning_rate": 8.891364373529934e-05, + "loss": 0.4388, + "step": 586 + }, + { + "epoch": 0.7496807151979565, + "grad_norm": 0.23588941560905444, + "learning_rate": 8.88697487720457e-05, + "loss": 0.3902, + "step": 587 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 0.41199448673354366, + "learning_rate": 8.882577796282422e-05, + "loss": 0.431, + "step": 588 + }, + { + "epoch": 0.7522349936143039, + "grad_norm": 0.28205711454800275, + "learning_rate": 8.878173139343451e-05, + "loss": 0.4346, + "step": 589 + }, + { + "epoch": 0.7535121328224776, + "grad_norm": 0.24476430147387696, + "learning_rate": 8.873760914982398e-05, + "loss": 0.4259, + "step": 590 + }, + { + "epoch": 0.7547892720306514, + "grad_norm": 0.31196716932725765, + "learning_rate": 8.869341131808769e-05, + "loss": 0.3685, + "step": 591 + }, + { + "epoch": 0.756066411238825, + "grad_norm": 0.2922798000225374, + "learning_rate": 8.864913798446825e-05, + "loss": 0.3333, + "step": 592 + }, + { + "epoch": 0.7573435504469987, + "grad_norm": 0.2777185906088175, + "learning_rate": 8.860478923535556e-05, + "loss": 0.3327, + "step": 593 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 0.2816854876002763, + "learning_rate": 8.856036515728666e-05, + "loss": 0.4497, + "step": 594 + }, + { + "epoch": 0.7598978288633461, + "grad_norm": 0.24918158052838946, + "learning_rate": 8.851586583694559e-05, + "loss": 0.3957, + "step": 595 + }, + { + "epoch": 0.7611749680715197, + "grad_norm": 0.2590797791116278, + "learning_rate": 8.847129136116325e-05, + "loss": 0.3612, + "step": 596 + }, + { + "epoch": 0.7624521072796935, + "grad_norm": 0.23023083563937027, + "learning_rate": 8.842664181691716e-05, + "loss": 0.3146, + "step": 597 + }, + { + "epoch": 0.7637292464878672, + "grad_norm": 0.27460739419457053, + "learning_rate": 8.838191729133129e-05, + "loss": 0.3606, + "step": 598 + }, + { + "epoch": 0.7650063856960408, + "grad_norm": 0.2823152087453321, + "learning_rate": 8.833711787167595e-05, + "loss": 0.3985, + "step": 599 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.2876495777573533, + "learning_rate": 8.829224364536761e-05, + "loss": 0.3585, + "step": 600 + }, + { + "epoch": 0.7675606641123882, + "grad_norm": 0.6095439334284898, + "learning_rate": 8.824729469996869e-05, + "loss": 0.4015, + "step": 601 + }, + { + "epoch": 0.768837803320562, + "grad_norm": 0.310487816268451, + "learning_rate": 8.820227112318736e-05, + "loss": 0.4135, + "step": 602 + }, + { + "epoch": 0.7701149425287356, + "grad_norm": 0.27192916241254605, + "learning_rate": 8.81571730028775e-05, + "loss": 0.3035, + "step": 603 + }, + { + "epoch": 0.7713920817369093, + "grad_norm": 0.41898455841521204, + "learning_rate": 8.81120004270384e-05, + "loss": 0.391, + "step": 604 + }, + { + "epoch": 0.7726692209450831, + "grad_norm": 0.2466356503833779, + "learning_rate": 8.806675348381463e-05, + "loss": 0.37, + "step": 605 + }, + { + "epoch": 0.7739463601532567, + "grad_norm": 0.3235918654243166, + "learning_rate": 8.80214322614959e-05, + "loss": 0.4728, + "step": 606 + }, + { + "epoch": 0.7752234993614304, + "grad_norm": 0.2792923958120248, + "learning_rate": 8.797603684851685e-05, + "loss": 0.4205, + "step": 607 + }, + { + "epoch": 0.776500638569604, + "grad_norm": 0.24330998864164433, + "learning_rate": 8.793056733345683e-05, + "loss": 0.3134, + "step": 608 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.3752535698258248, + "learning_rate": 8.78850238050399e-05, + "loss": 0.393, + "step": 609 + }, + { + "epoch": 0.7790549169859514, + "grad_norm": 0.39873657613278785, + "learning_rate": 8.783940635213443e-05, + "loss": 0.4456, + "step": 610 + }, + { + "epoch": 0.7803320561941252, + "grad_norm": 0.24242903474028651, + "learning_rate": 8.779371506375311e-05, + "loss": 0.3163, + "step": 611 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 0.875531904618306, + "learning_rate": 8.774795002905266e-05, + "loss": 0.3796, + "step": 612 + }, + { + "epoch": 0.7828863346104725, + "grad_norm": 0.2710006058211159, + "learning_rate": 8.770211133733373e-05, + "loss": 0.3641, + "step": 613 + }, + { + "epoch": 0.7841634738186463, + "grad_norm": 0.3599896330154359, + "learning_rate": 8.765619907804066e-05, + "loss": 0.4715, + "step": 614 + }, + { + "epoch": 0.7854406130268199, + "grad_norm": 0.2673910899689466, + "learning_rate": 8.761021334076141e-05, + "loss": 0.3394, + "step": 615 + }, + { + "epoch": 0.7867177522349936, + "grad_norm": 0.28074185243875843, + "learning_rate": 8.756415421522722e-05, + "loss": 0.4187, + "step": 616 + }, + { + "epoch": 0.7879948914431673, + "grad_norm": 0.2648876395884377, + "learning_rate": 8.751802179131261e-05, + "loss": 0.3764, + "step": 617 + }, + { + "epoch": 0.789272030651341, + "grad_norm": 0.5832530608252117, + "learning_rate": 8.747181615903511e-05, + "loss": 0.319, + "step": 618 + }, + { + "epoch": 0.7905491698595147, + "grad_norm": 0.2675008489838536, + "learning_rate": 8.742553740855506e-05, + "loss": 0.337, + "step": 619 + }, + { + "epoch": 0.7918263090676884, + "grad_norm": 0.23241817746486018, + "learning_rate": 8.737918563017553e-05, + "loss": 0.3814, + "step": 620 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 0.34963572888329164, + "learning_rate": 8.733276091434204e-05, + "loss": 0.3482, + "step": 621 + }, + { + "epoch": 0.7943805874840357, + "grad_norm": 0.4230195584284411, + "learning_rate": 8.728626335164246e-05, + "loss": 0.4355, + "step": 622 + }, + { + "epoch": 0.7956577266922095, + "grad_norm": 0.28576901339724037, + "learning_rate": 8.723969303280681e-05, + "loss": 0.3608, + "step": 623 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 0.24108205760360604, + "learning_rate": 8.719305004870706e-05, + "loss": 0.3398, + "step": 624 + }, + { + "epoch": 0.7982120051085568, + "grad_norm": 0.3409097662824296, + "learning_rate": 8.714633449035698e-05, + "loss": 0.4134, + "step": 625 + }, + { + "epoch": 0.7994891443167306, + "grad_norm": 0.29060882927377535, + "learning_rate": 8.709954644891195e-05, + "loss": 0.376, + "step": 626 + }, + { + "epoch": 0.8007662835249042, + "grad_norm": 0.4937873547140922, + "learning_rate": 8.705268601566876e-05, + "loss": 0.3958, + "step": 627 + }, + { + "epoch": 0.8020434227330779, + "grad_norm": 0.32559074507512403, + "learning_rate": 8.700575328206553e-05, + "loss": 0.3796, + "step": 628 + }, + { + "epoch": 0.8033205619412516, + "grad_norm": 0.3224118073745686, + "learning_rate": 8.695874833968136e-05, + "loss": 0.32, + "step": 629 + }, + { + "epoch": 0.8045977011494253, + "grad_norm": 0.43546245385802496, + "learning_rate": 8.691167128023636e-05, + "loss": 0.4193, + "step": 630 + }, + { + "epoch": 0.8058748403575989, + "grad_norm": 0.4239058690648373, + "learning_rate": 8.686452219559125e-05, + "loss": 0.4105, + "step": 631 + }, + { + "epoch": 0.8071519795657727, + "grad_norm": 0.29592660304168117, + "learning_rate": 8.681730117774737e-05, + "loss": 0.4016, + "step": 632 + }, + { + "epoch": 0.8084291187739464, + "grad_norm": 0.2807222218139495, + "learning_rate": 8.677000831884638e-05, + "loss": 0.404, + "step": 633 + }, + { + "epoch": 0.80970625798212, + "grad_norm": 0.2761732155554203, + "learning_rate": 8.672264371117016e-05, + "loss": 0.392, + "step": 634 + }, + { + "epoch": 0.8109833971902938, + "grad_norm": 0.30958441974218925, + "learning_rate": 8.667520744714055e-05, + "loss": 0.4055, + "step": 635 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 0.5059487783491056, + "learning_rate": 8.662769961931926e-05, + "loss": 0.4344, + "step": 636 + }, + { + "epoch": 0.8135376756066411, + "grad_norm": 0.3002638229314521, + "learning_rate": 8.658012032040758e-05, + "loss": 0.2969, + "step": 637 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.3798591431515895, + "learning_rate": 8.653246964324632e-05, + "loss": 0.4389, + "step": 638 + }, + { + "epoch": 0.8160919540229885, + "grad_norm": 0.34577016245724324, + "learning_rate": 8.648474768081552e-05, + "loss": 0.4144, + "step": 639 + }, + { + "epoch": 0.8173690932311622, + "grad_norm": 0.2276942228990674, + "learning_rate": 8.643695452623438e-05, + "loss": 0.339, + "step": 640 + }, + { + "epoch": 0.8186462324393359, + "grad_norm": 1.0404258276546323, + "learning_rate": 8.638909027276094e-05, + "loss": 0.3673, + "step": 641 + }, + { + "epoch": 0.8199233716475096, + "grad_norm": 0.25496245755766783, + "learning_rate": 8.634115501379202e-05, + "loss": 0.3268, + "step": 642 + }, + { + "epoch": 0.8212005108556832, + "grad_norm": 0.2907481450478815, + "learning_rate": 8.6293148842863e-05, + "loss": 0.4556, + "step": 643 + }, + { + "epoch": 0.822477650063857, + "grad_norm": 0.32645244869050666, + "learning_rate": 8.62450718536476e-05, + "loss": 0.4064, + "step": 644 + }, + { + "epoch": 0.8237547892720306, + "grad_norm": 0.3899458410220355, + "learning_rate": 8.619692413995774e-05, + "loss": 0.3978, + "step": 645 + }, + { + "epoch": 0.8250319284802043, + "grad_norm": 0.23664146436226524, + "learning_rate": 8.614870579574337e-05, + "loss": 0.3443, + "step": 646 + }, + { + "epoch": 0.8263090676883781, + "grad_norm": 0.3477448773503283, + "learning_rate": 8.61004169150922e-05, + "loss": 0.3802, + "step": 647 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.2198461029250017, + "learning_rate": 8.605205759222963e-05, + "loss": 0.3915, + "step": 648 + }, + { + "epoch": 0.8288633461047255, + "grad_norm": 0.32486219088749546, + "learning_rate": 8.600362792151847e-05, + "loss": 0.3426, + "step": 649 + }, + { + "epoch": 0.8301404853128991, + "grad_norm": 0.35348183466289423, + "learning_rate": 8.595512799745886e-05, + "loss": 0.4109, + "step": 650 + }, + { + "epoch": 0.8314176245210728, + "grad_norm": 0.21755610065650385, + "learning_rate": 8.590655791468797e-05, + "loss": 0.4069, + "step": 651 + }, + { + "epoch": 0.8326947637292464, + "grad_norm": 0.3748766377981543, + "learning_rate": 8.585791776797989e-05, + "loss": 0.4232, + "step": 652 + }, + { + "epoch": 0.8339719029374202, + "grad_norm": 0.2565733217534215, + "learning_rate": 8.58092076522454e-05, + "loss": 0.3497, + "step": 653 + }, + { + "epoch": 0.8352490421455939, + "grad_norm": 0.22515795683348488, + "learning_rate": 8.576042766253184e-05, + "loss": 0.2546, + "step": 654 + }, + { + "epoch": 0.8365261813537676, + "grad_norm": 0.27347055569072154, + "learning_rate": 8.571157789402292e-05, + "loss": 0.378, + "step": 655 + }, + { + "epoch": 0.8378033205619413, + "grad_norm": 0.22740308967354247, + "learning_rate": 8.566265844203842e-05, + "loss": 0.3717, + "step": 656 + }, + { + "epoch": 0.8390804597701149, + "grad_norm": 0.2799890672438903, + "learning_rate": 8.561366940203419e-05, + "loss": 0.379, + "step": 657 + }, + { + "epoch": 0.8403575989782887, + "grad_norm": 0.25019559727938806, + "learning_rate": 8.556461086960179e-05, + "loss": 0.326, + "step": 658 + }, + { + "epoch": 0.8416347381864623, + "grad_norm": 0.3604796301710984, + "learning_rate": 8.551548294046843e-05, + "loss": 0.3953, + "step": 659 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.28987785218106565, + "learning_rate": 8.546628571049671e-05, + "loss": 0.3887, + "step": 660 + }, + { + "epoch": 0.8441890166028098, + "grad_norm": 0.370626150999263, + "learning_rate": 8.541701927568444e-05, + "loss": 0.3251, + "step": 661 + }, + { + "epoch": 0.8454661558109834, + "grad_norm": 0.26751001132677643, + "learning_rate": 8.536768373216453e-05, + "loss": 0.3609, + "step": 662 + }, + { + "epoch": 0.8467432950191571, + "grad_norm": 0.2884156657361538, + "learning_rate": 8.531827917620465e-05, + "loss": 0.3741, + "step": 663 + }, + { + "epoch": 0.8480204342273308, + "grad_norm": 0.25915101818699415, + "learning_rate": 8.52688057042072e-05, + "loss": 0.335, + "step": 664 + }, + { + "epoch": 0.8492975734355045, + "grad_norm": 0.2569638867920905, + "learning_rate": 8.521926341270907e-05, + "loss": 0.3641, + "step": 665 + }, + { + "epoch": 0.8505747126436781, + "grad_norm": 0.2672027029779102, + "learning_rate": 8.516965239838137e-05, + "loss": 0.3928, + "step": 666 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.24959398994344748, + "learning_rate": 8.511997275802934e-05, + "loss": 0.4048, + "step": 667 + }, + { + "epoch": 0.8531289910600255, + "grad_norm": 0.27061347147355075, + "learning_rate": 8.507022458859215e-05, + "loss": 0.3946, + "step": 668 + }, + { + "epoch": 0.8544061302681992, + "grad_norm": 0.23023660940085203, + "learning_rate": 8.502040798714264e-05, + "loss": 0.3135, + "step": 669 + }, + { + "epoch": 0.855683269476373, + "grad_norm": 0.3049937012944536, + "learning_rate": 8.497052305088722e-05, + "loss": 0.3389, + "step": 670 + }, + { + "epoch": 0.8569604086845466, + "grad_norm": 0.2947835559941408, + "learning_rate": 8.492056987716566e-05, + "loss": 0.383, + "step": 671 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 0.6392281975494365, + "learning_rate": 8.487054856345081e-05, + "loss": 0.3689, + "step": 672 + }, + { + "epoch": 0.859514687100894, + "grad_norm": 0.343740792285687, + "learning_rate": 8.482045920734854e-05, + "loss": 0.3584, + "step": 673 + }, + { + "epoch": 0.8607918263090677, + "grad_norm": 0.3002139484289878, + "learning_rate": 8.477030190659749e-05, + "loss": 0.3725, + "step": 674 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.43785179612410263, + "learning_rate": 8.472007675906883e-05, + "loss": 0.3411, + "step": 675 + }, + { + "epoch": 0.8633461047254151, + "grad_norm": 0.2400013938003954, + "learning_rate": 8.466978386276618e-05, + "loss": 0.3833, + "step": 676 + }, + { + "epoch": 0.8646232439335888, + "grad_norm": 0.24385390468894358, + "learning_rate": 8.46194233158253e-05, + "loss": 0.3617, + "step": 677 + }, + { + "epoch": 0.8659003831417624, + "grad_norm": 0.2687106042905156, + "learning_rate": 8.4568995216514e-05, + "loss": 0.3326, + "step": 678 + }, + { + "epoch": 0.8671775223499362, + "grad_norm": 0.4558221475864837, + "learning_rate": 8.451849966323188e-05, + "loss": 0.4018, + "step": 679 + }, + { + "epoch": 0.8684546615581098, + "grad_norm": 0.3257217064912352, + "learning_rate": 8.446793675451017e-05, + "loss": 0.4301, + "step": 680 + }, + { + "epoch": 0.8697318007662835, + "grad_norm": 0.27517645663185636, + "learning_rate": 8.441730658901153e-05, + "loss": 0.3676, + "step": 681 + }, + { + "epoch": 0.8710089399744572, + "grad_norm": 0.2710109971091886, + "learning_rate": 8.436660926552986e-05, + "loss": 0.3821, + "step": 682 + }, + { + "epoch": 0.8722860791826309, + "grad_norm": 0.24885712139314484, + "learning_rate": 8.431584488299009e-05, + "loss": 0.3622, + "step": 683 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 0.24861263499214709, + "learning_rate": 8.426501354044801e-05, + "loss": 0.3341, + "step": 684 + }, + { + "epoch": 0.8748403575989783, + "grad_norm": 0.23612201353064893, + "learning_rate": 8.421411533709009e-05, + "loss": 0.3467, + "step": 685 + }, + { + "epoch": 0.876117496807152, + "grad_norm": 0.2789983926001787, + "learning_rate": 8.41631503722332e-05, + "loss": 0.3362, + "step": 686 + }, + { + "epoch": 0.8773946360153256, + "grad_norm": 0.378727889266266, + "learning_rate": 8.411211874532459e-05, + "loss": 0.3295, + "step": 687 + }, + { + "epoch": 0.8786717752234994, + "grad_norm": 1.6084123989048513, + "learning_rate": 8.406102055594148e-05, + "loss": 0.3788, + "step": 688 + }, + { + "epoch": 0.879948914431673, + "grad_norm": 0.27838568443024875, + "learning_rate": 8.400985590379101e-05, + "loss": 0.3629, + "step": 689 + }, + { + "epoch": 0.8812260536398467, + "grad_norm": 0.31863528664218665, + "learning_rate": 8.395862488871003e-05, + "loss": 0.3996, + "step": 690 + }, + { + "epoch": 0.8825031928480205, + "grad_norm": 0.25515871111904864, + "learning_rate": 8.390732761066484e-05, + "loss": 0.295, + "step": 691 + }, + { + "epoch": 0.8837803320561941, + "grad_norm": 0.2848324639502898, + "learning_rate": 8.38559641697511e-05, + "loss": 0.3129, + "step": 692 + }, + { + "epoch": 0.8850574712643678, + "grad_norm": 0.2517148483359616, + "learning_rate": 8.38045346661935e-05, + "loss": 0.3357, + "step": 693 + }, + { + "epoch": 0.8863346104725415, + "grad_norm": 0.22732886615934164, + "learning_rate": 8.375303920034567e-05, + "loss": 0.3137, + "step": 694 + }, + { + "epoch": 0.8876117496807152, + "grad_norm": 0.37139080019739995, + "learning_rate": 8.370147787269e-05, + "loss": 0.4171, + "step": 695 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.31984494321051177, + "learning_rate": 8.364985078383729e-05, + "loss": 0.4397, + "step": 696 + }, + { + "epoch": 0.8901660280970626, + "grad_norm": 0.33603905759786024, + "learning_rate": 8.359815803452677e-05, + "loss": 0.3123, + "step": 697 + }, + { + "epoch": 0.8914431673052363, + "grad_norm": 0.23930082760607954, + "learning_rate": 8.35463997256257e-05, + "loss": 0.3604, + "step": 698 + }, + { + "epoch": 0.89272030651341, + "grad_norm": 0.3416918000148226, + "learning_rate": 8.349457595812933e-05, + "loss": 0.3607, + "step": 699 + }, + { + "epoch": 0.8939974457215837, + "grad_norm": 0.22067793169896244, + "learning_rate": 8.344268683316058e-05, + "loss": 0.3211, + "step": 700 + }, + { + "epoch": 0.8952745849297573, + "grad_norm": 0.33432824404289696, + "learning_rate": 8.339073245196999e-05, + "loss": 0.4032, + "step": 701 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 0.5072252336479399, + "learning_rate": 8.333871291593533e-05, + "loss": 0.334, + "step": 702 + }, + { + "epoch": 0.8978288633461047, + "grad_norm": 0.2845285336137667, + "learning_rate": 8.328662832656157e-05, + "loss": 0.3683, + "step": 703 + }, + { + "epoch": 0.8991060025542784, + "grad_norm": 0.25215068487639825, + "learning_rate": 8.323447878548062e-05, + "loss": 0.3349, + "step": 704 + }, + { + "epoch": 0.9003831417624522, + "grad_norm": 0.471181577157278, + "learning_rate": 8.318226439445107e-05, + "loss": 0.4059, + "step": 705 + }, + { + "epoch": 0.9016602809706258, + "grad_norm": 0.31814272972796215, + "learning_rate": 8.312998525535812e-05, + "loss": 0.4489, + "step": 706 + }, + { + "epoch": 0.9029374201787995, + "grad_norm": 0.49658036739321737, + "learning_rate": 8.307764147021328e-05, + "loss": 0.3884, + "step": 707 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 0.2812012164890038, + "learning_rate": 8.302523314115421e-05, + "loss": 0.3605, + "step": 708 + }, + { + "epoch": 0.9054916985951469, + "grad_norm": 0.38753046129351837, + "learning_rate": 8.29727603704445e-05, + "loss": 0.3429, + "step": 709 + }, + { + "epoch": 0.9067688378033205, + "grad_norm": 0.29076807878076893, + "learning_rate": 8.29202232604735e-05, + "loss": 0.3917, + "step": 710 + }, + { + "epoch": 0.9080459770114943, + "grad_norm": 0.3313175826342293, + "learning_rate": 8.28676219137561e-05, + "loss": 0.4134, + "step": 711 + }, + { + "epoch": 0.909323116219668, + "grad_norm": 0.313773657181038, + "learning_rate": 8.281495643293254e-05, + "loss": 0.4165, + "step": 712 + }, + { + "epoch": 0.9106002554278416, + "grad_norm": 0.3126285443378182, + "learning_rate": 8.276222692076816e-05, + "loss": 0.4507, + "step": 713 + }, + { + "epoch": 0.9118773946360154, + "grad_norm": 0.4107310955181749, + "learning_rate": 8.270943348015333e-05, + "loss": 0.3165, + "step": 714 + }, + { + "epoch": 0.913154533844189, + "grad_norm": 0.3701892168541672, + "learning_rate": 8.265657621410306e-05, + "loss": 0.2897, + "step": 715 + }, + { + "epoch": 0.9144316730523627, + "grad_norm": 0.20869891886280387, + "learning_rate": 8.2603655225757e-05, + "loss": 0.3923, + "step": 716 + }, + { + "epoch": 0.9157088122605364, + "grad_norm": 0.44812595715227727, + "learning_rate": 8.255067061837908e-05, + "loss": 0.4461, + "step": 717 + }, + { + "epoch": 0.9169859514687101, + "grad_norm": 0.2568858146621979, + "learning_rate": 8.249762249535738e-05, + "loss": 0.3855, + "step": 718 + }, + { + "epoch": 0.9182630906768838, + "grad_norm": 0.22008176501258206, + "learning_rate": 8.244451096020392e-05, + "loss": 0.3565, + "step": 719 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.34239127561440175, + "learning_rate": 8.239133611655443e-05, + "loss": 0.3459, + "step": 720 + }, + { + "epoch": 0.9208173690932312, + "grad_norm": 0.8249553876235889, + "learning_rate": 8.233809806816826e-05, + "loss": 0.4728, + "step": 721 + }, + { + "epoch": 0.9220945083014048, + "grad_norm": 0.2430664964102985, + "learning_rate": 8.228479691892798e-05, + "loss": 0.2414, + "step": 722 + }, + { + "epoch": 0.9233716475095786, + "grad_norm": 0.30761582493948275, + "learning_rate": 8.223143277283935e-05, + "loss": 0.3761, + "step": 723 + }, + { + "epoch": 0.9246487867177522, + "grad_norm": 0.24811404960440472, + "learning_rate": 8.217800573403105e-05, + "loss": 0.3469, + "step": 724 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.35583804530692437, + "learning_rate": 8.212451590675445e-05, + "loss": 0.406, + "step": 725 + }, + { + "epoch": 0.9272030651340997, + "grad_norm": 0.36459828568079455, + "learning_rate": 8.20709633953835e-05, + "loss": 0.3615, + "step": 726 + }, + { + "epoch": 0.9284802043422733, + "grad_norm": 0.27141928916684593, + "learning_rate": 8.201734830441439e-05, + "loss": 0.3138, + "step": 727 + }, + { + "epoch": 0.929757343550447, + "grad_norm": 0.263953302341367, + "learning_rate": 8.196367073846548e-05, + "loss": 0.3165, + "step": 728 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 0.3729517982811233, + "learning_rate": 8.190993080227698e-05, + "loss": 0.3849, + "step": 729 + }, + { + "epoch": 0.9323116219667944, + "grad_norm": 0.2419725546534604, + "learning_rate": 8.185612860071087e-05, + "loss": 0.3451, + "step": 730 + }, + { + "epoch": 0.933588761174968, + "grad_norm": 0.2403631085610013, + "learning_rate": 8.180226423875059e-05, + "loss": 0.3305, + "step": 731 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 0.2209194533039795, + "learning_rate": 8.174833782150087e-05, + "loss": 0.3115, + "step": 732 + }, + { + "epoch": 0.9361430395913155, + "grad_norm": 0.2859554058768735, + "learning_rate": 8.169434945418752e-05, + "loss": 0.4506, + "step": 733 + }, + { + "epoch": 0.9374201787994891, + "grad_norm": 0.2361023534593823, + "learning_rate": 8.164029924215726e-05, + "loss": 0.3925, + "step": 734 + }, + { + "epoch": 0.9386973180076629, + "grad_norm": 0.30926731508725935, + "learning_rate": 8.158618729087746e-05, + "loss": 0.3602, + "step": 735 + }, + { + "epoch": 0.9399744572158365, + "grad_norm": 0.22081764858556047, + "learning_rate": 8.1532013705936e-05, + "loss": 0.3475, + "step": 736 + }, + { + "epoch": 0.9412515964240102, + "grad_norm": 0.25916232714105103, + "learning_rate": 8.147777859304096e-05, + "loss": 0.3722, + "step": 737 + }, + { + "epoch": 0.9425287356321839, + "grad_norm": 0.20291067521292164, + "learning_rate": 8.142348205802053e-05, + "loss": 0.291, + "step": 738 + }, + { + "epoch": 0.9438058748403576, + "grad_norm": 0.2894460690241143, + "learning_rate": 8.136912420682275e-05, + "loss": 0.3425, + "step": 739 + }, + { + "epoch": 0.9450830140485313, + "grad_norm": 0.3093983494224419, + "learning_rate": 8.13147051455153e-05, + "loss": 0.4229, + "step": 740 + }, + { + "epoch": 0.946360153256705, + "grad_norm": 0.20710066067003954, + "learning_rate": 8.126022498028527e-05, + "loss": 0.2935, + "step": 741 + }, + { + "epoch": 0.9476372924648787, + "grad_norm": 0.23846045662107043, + "learning_rate": 8.1205683817439e-05, + "loss": 0.3277, + "step": 742 + }, + { + "epoch": 0.9489144316730523, + "grad_norm": 0.22867196155803374, + "learning_rate": 8.11510817634019e-05, + "loss": 0.3773, + "step": 743 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 0.32956790035967565, + "learning_rate": 8.109641892471809e-05, + "loss": 0.4096, + "step": 744 + }, + { + "epoch": 0.9514687100893997, + "grad_norm": 0.38884162612812867, + "learning_rate": 8.104169540805041e-05, + "loss": 0.2601, + "step": 745 + }, + { + "epoch": 0.9527458492975734, + "grad_norm": 0.26401907383285683, + "learning_rate": 8.098691132018003e-05, + "loss": 0.394, + "step": 746 + }, + { + "epoch": 0.9540229885057471, + "grad_norm": 0.29342881486023337, + "learning_rate": 8.093206676800636e-05, + "loss": 0.3391, + "step": 747 + }, + { + "epoch": 0.9553001277139208, + "grad_norm": 0.19821003783866006, + "learning_rate": 8.087716185854673e-05, + "loss": 0.3179, + "step": 748 + }, + { + "epoch": 0.9565772669220945, + "grad_norm": 0.2875044617803602, + "learning_rate": 8.082219669893629e-05, + "loss": 0.3256, + "step": 749 + }, + { + "epoch": 0.9578544061302682, + "grad_norm": 0.210038393331744, + "learning_rate": 8.076717139642775e-05, + "loss": 0.3379, + "step": 750 + }, + { + "epoch": 0.9591315453384419, + "grad_norm": 0.4613576387376917, + "learning_rate": 8.071208605839118e-05, + "loss": 0.3928, + "step": 751 + }, + { + "epoch": 0.9604086845466155, + "grad_norm": 0.2809014351790888, + "learning_rate": 8.065694079231378e-05, + "loss": 0.312, + "step": 752 + }, + { + "epoch": 0.9616858237547893, + "grad_norm": 0.24431738968092537, + "learning_rate": 8.060173570579969e-05, + "loss": 0.3461, + "step": 753 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.21049536514765424, + "learning_rate": 8.05464709065698e-05, + "loss": 0.3039, + "step": 754 + }, + { + "epoch": 0.9642401021711366, + "grad_norm": 0.32669178973369584, + "learning_rate": 8.049114650246147e-05, + "loss": 0.3864, + "step": 755 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.3072563072863993, + "learning_rate": 8.043576260142843e-05, + "loss": 0.3809, + "step": 756 + }, + { + "epoch": 0.966794380587484, + "grad_norm": 0.22841373554866462, + "learning_rate": 8.038031931154044e-05, + "loss": 0.375, + "step": 757 + }, + { + "epoch": 0.9680715197956578, + "grad_norm": 0.34581570048874444, + "learning_rate": 8.03248167409832e-05, + "loss": 0.4051, + "step": 758 + }, + { + "epoch": 0.9693486590038314, + "grad_norm": 0.28864269241466545, + "learning_rate": 8.026925499805802e-05, + "loss": 0.3402, + "step": 759 + }, + { + "epoch": 0.9706257982120051, + "grad_norm": 0.22654614924353084, + "learning_rate": 8.021363419118173e-05, + "loss": 0.3548, + "step": 760 + }, + { + "epoch": 0.9719029374201787, + "grad_norm": 0.2372948745795556, + "learning_rate": 8.01579544288864e-05, + "loss": 0.3885, + "step": 761 + }, + { + "epoch": 0.9731800766283525, + "grad_norm": 0.32033183349200745, + "learning_rate": 8.010221581981913e-05, + "loss": 0.3796, + "step": 762 + }, + { + "epoch": 0.9744572158365262, + "grad_norm": 0.3803120570332216, + "learning_rate": 8.004641847274181e-05, + "loss": 0.3679, + "step": 763 + }, + { + "epoch": 0.9757343550446999, + "grad_norm": 0.23958046523689452, + "learning_rate": 7.999056249653105e-05, + "loss": 0.3402, + "step": 764 + }, + { + "epoch": 0.9770114942528736, + "grad_norm": 0.32467699738005107, + "learning_rate": 7.993464800017774e-05, + "loss": 0.374, + "step": 765 + }, + { + "epoch": 0.9782886334610472, + "grad_norm": 0.3178168735778774, + "learning_rate": 7.987867509278701e-05, + "loss": 0.3895, + "step": 766 + }, + { + "epoch": 0.979565772669221, + "grad_norm": 0.3555146057126804, + "learning_rate": 7.982264388357799e-05, + "loss": 0.3922, + "step": 767 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 0.255361175810166, + "learning_rate": 7.976655448188355e-05, + "loss": 0.3907, + "step": 768 + }, + { + "epoch": 0.9821200510855683, + "grad_norm": 0.21878952035113722, + "learning_rate": 7.971040699715009e-05, + "loss": 0.3419, + "step": 769 + }, + { + "epoch": 0.9833971902937421, + "grad_norm": 0.21636715689802127, + "learning_rate": 7.965420153893741e-05, + "loss": 0.2721, + "step": 770 + }, + { + "epoch": 0.9846743295019157, + "grad_norm": 4.440563577581736, + "learning_rate": 7.959793821691837e-05, + "loss": 0.3696, + "step": 771 + }, + { + "epoch": 0.9859514687100894, + "grad_norm": 0.6339961066905135, + "learning_rate": 7.954161714087877e-05, + "loss": 0.3758, + "step": 772 + }, + { + "epoch": 0.9872286079182631, + "grad_norm": 0.3336587181998634, + "learning_rate": 7.948523842071706e-05, + "loss": 0.3539, + "step": 773 + }, + { + "epoch": 0.9885057471264368, + "grad_norm": 0.28268465253781166, + "learning_rate": 7.942880216644426e-05, + "loss": 0.3423, + "step": 774 + }, + { + "epoch": 0.9897828863346104, + "grad_norm": 0.6823754088695756, + "learning_rate": 7.937230848818355e-05, + "loss": 0.3938, + "step": 775 + }, + { + "epoch": 0.9910600255427842, + "grad_norm": 21.623776628168923, + "learning_rate": 7.931575749617026e-05, + "loss": 2.4127, + "step": 776 + }, + { + "epoch": 0.9923371647509579, + "grad_norm": 11.267683873935201, + "learning_rate": 7.925914930075147e-05, + "loss": 0.8327, + "step": 777 + }, + { + "epoch": 0.9936143039591315, + "grad_norm": 0.26368644619479076, + "learning_rate": 7.920248401238592e-05, + "loss": 0.3781, + "step": 778 + }, + { + "epoch": 0.9948914431673053, + "grad_norm": 0.21288803087821603, + "learning_rate": 7.914576174164379e-05, + "loss": 0.2921, + "step": 779 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.37146057911665903, + "learning_rate": 7.908898259920636e-05, + "loss": 0.3566, + "step": 780 + }, + { + "epoch": 0.9974457215836526, + "grad_norm": 0.25806621081848635, + "learning_rate": 7.903214669586596e-05, + "loss": 0.383, + "step": 781 + }, + { + "epoch": 0.9987228607918263, + "grad_norm": 0.28067347056146796, + "learning_rate": 7.897525414252565e-05, + "loss": 0.3, + "step": 782 + }, + { + "epoch": 1.0, + "grad_norm": 0.41684179234299923, + "learning_rate": 7.891830505019904e-05, + "loss": 0.3351, + "step": 783 + }, + { + "epoch": 1.0012771392081736, + "grad_norm": 0.25190635782624143, + "learning_rate": 7.886129953001002e-05, + "loss": 0.2934, + "step": 784 + }, + { + "epoch": 1.0025542784163475, + "grad_norm": 0.24113093215919978, + "learning_rate": 7.880423769319266e-05, + "loss": 0.2594, + "step": 785 + }, + { + "epoch": 1.003831417624521, + "grad_norm": 0.24761219645767266, + "learning_rate": 7.874711965109084e-05, + "loss": 0.3252, + "step": 786 + }, + { + "epoch": 1.0051085568326947, + "grad_norm": 0.2834495240507354, + "learning_rate": 7.86899455151582e-05, + "loss": 0.2973, + "step": 787 + }, + { + "epoch": 1.0063856960408684, + "grad_norm": 0.23840786331692534, + "learning_rate": 7.863271539695778e-05, + "loss": 0.2705, + "step": 788 + }, + { + "epoch": 1.0076628352490422, + "grad_norm": 0.23582372143785987, + "learning_rate": 7.857542940816183e-05, + "loss": 0.2425, + "step": 789 + }, + { + "epoch": 1.0089399744572158, + "grad_norm": 0.2437779867616318, + "learning_rate": 7.851808766055169e-05, + "loss": 0.2654, + "step": 790 + }, + { + "epoch": 1.0102171136653895, + "grad_norm": 0.31096625063119177, + "learning_rate": 7.846069026601744e-05, + "loss": 0.3064, + "step": 791 + }, + { + "epoch": 1.0114942528735633, + "grad_norm": 0.322310502793249, + "learning_rate": 7.840323733655778e-05, + "loss": 0.316, + "step": 792 + }, + { + "epoch": 1.012771392081737, + "grad_norm": 0.3556274151476798, + "learning_rate": 7.834572898427981e-05, + "loss": 0.2759, + "step": 793 + }, + { + "epoch": 1.0140485312899106, + "grad_norm": 0.33557673237614993, + "learning_rate": 7.828816532139867e-05, + "loss": 0.329, + "step": 794 + }, + { + "epoch": 1.0153256704980842, + "grad_norm": 0.2897013929500907, + "learning_rate": 7.823054646023748e-05, + "loss": 0.3108, + "step": 795 + }, + { + "epoch": 1.016602809706258, + "grad_norm": 0.2749917410181211, + "learning_rate": 7.817287251322712e-05, + "loss": 0.2684, + "step": 796 + }, + { + "epoch": 1.0178799489144317, + "grad_norm": 0.2849230966294723, + "learning_rate": 7.811514359290591e-05, + "loss": 0.2739, + "step": 797 + }, + { + "epoch": 1.0191570881226053, + "grad_norm": 0.8459268923334499, + "learning_rate": 7.80573598119194e-05, + "loss": 0.2944, + "step": 798 + }, + { + "epoch": 1.0204342273307792, + "grad_norm": 0.2364264652923826, + "learning_rate": 7.799952128302027e-05, + "loss": 0.2366, + "step": 799 + }, + { + "epoch": 1.0217113665389528, + "grad_norm": 0.24039540336203225, + "learning_rate": 7.794162811906798e-05, + "loss": 0.2919, + "step": 800 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 0.41273718642101725, + "learning_rate": 7.788368043302858e-05, + "loss": 0.317, + "step": 801 + }, + { + "epoch": 1.0242656449553, + "grad_norm": 0.5150969486491328, + "learning_rate": 7.782567833797457e-05, + "loss": 0.3524, + "step": 802 + }, + { + "epoch": 1.0255427841634739, + "grad_norm": 0.4472774090327114, + "learning_rate": 7.776762194708458e-05, + "loss": 0.3274, + "step": 803 + }, + { + "epoch": 1.0268199233716475, + "grad_norm": 0.3133286645431316, + "learning_rate": 7.770951137364318e-05, + "loss": 0.3708, + "step": 804 + }, + { + "epoch": 1.0280970625798211, + "grad_norm": 0.2598516780586955, + "learning_rate": 7.765134673104065e-05, + "loss": 0.3077, + "step": 805 + }, + { + "epoch": 1.029374201787995, + "grad_norm": 0.3098767388650258, + "learning_rate": 7.759312813277284e-05, + "loss": 0.2816, + "step": 806 + }, + { + "epoch": 1.0306513409961686, + "grad_norm": 0.32151067138724043, + "learning_rate": 7.753485569244083e-05, + "loss": 0.3093, + "step": 807 + }, + { + "epoch": 1.0319284802043422, + "grad_norm": 0.2424103899303998, + "learning_rate": 7.747652952375078e-05, + "loss": 0.3314, + "step": 808 + }, + { + "epoch": 1.0332056194125159, + "grad_norm": 0.32471128396771815, + "learning_rate": 7.741814974051367e-05, + "loss": 0.3404, + "step": 809 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 0.32561036937826215, + "learning_rate": 7.73597164566451e-05, + "loss": 0.3172, + "step": 810 + }, + { + "epoch": 1.0357598978288634, + "grad_norm": 0.2718934954958823, + "learning_rate": 7.730122978616511e-05, + "loss": 0.3006, + "step": 811 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.33383205181673165, + "learning_rate": 7.724268984319784e-05, + "loss": 0.3614, + "step": 812 + }, + { + "epoch": 1.0383141762452108, + "grad_norm": 0.3763985012526266, + "learning_rate": 7.718409674197147e-05, + "loss": 0.2767, + "step": 813 + }, + { + "epoch": 1.0395913154533845, + "grad_norm": 0.22813815252223252, + "learning_rate": 7.71254505968178e-05, + "loss": 0.2837, + "step": 814 + }, + { + "epoch": 1.040868454661558, + "grad_norm": 0.22345781468702533, + "learning_rate": 7.70667515221722e-05, + "loss": 0.2608, + "step": 815 + }, + { + "epoch": 1.0421455938697317, + "grad_norm": 0.2509815603736634, + "learning_rate": 7.700799963257334e-05, + "loss": 0.2839, + "step": 816 + }, + { + "epoch": 1.0434227330779056, + "grad_norm": 0.267972054934522, + "learning_rate": 7.694919504266289e-05, + "loss": 0.3109, + "step": 817 + }, + { + "epoch": 1.0446998722860792, + "grad_norm": 0.243588191606101, + "learning_rate": 7.689033786718538e-05, + "loss": 0.3218, + "step": 818 + }, + { + "epoch": 1.0459770114942528, + "grad_norm": 0.46833032031533456, + "learning_rate": 7.683142822098795e-05, + "loss": 0.4253, + "step": 819 + }, + { + "epoch": 1.0472541507024267, + "grad_norm": 0.2927608196929183, + "learning_rate": 7.677246621902014e-05, + "loss": 0.3458, + "step": 820 + }, + { + "epoch": 1.0485312899106003, + "grad_norm": 0.2654300117138958, + "learning_rate": 7.671345197633362e-05, + "loss": 0.337, + "step": 821 + }, + { + "epoch": 1.049808429118774, + "grad_norm": 0.34341024171874635, + "learning_rate": 7.6654385608082e-05, + "loss": 0.3928, + "step": 822 + }, + { + "epoch": 1.0510855683269476, + "grad_norm": 0.2334266398204592, + "learning_rate": 7.659526722952066e-05, + "loss": 0.2439, + "step": 823 + }, + { + "epoch": 1.0523627075351214, + "grad_norm": 0.2760441913059686, + "learning_rate": 7.653609695600636e-05, + "loss": 0.2933, + "step": 824 + }, + { + "epoch": 1.053639846743295, + "grad_norm": 0.3148646280385047, + "learning_rate": 7.647687490299724e-05, + "loss": 0.3133, + "step": 825 + }, + { + "epoch": 1.0549169859514687, + "grad_norm": 0.2611993604009511, + "learning_rate": 7.641760118605237e-05, + "loss": 0.3541, + "step": 826 + }, + { + "epoch": 1.0561941251596425, + "grad_norm": 0.2601924231511535, + "learning_rate": 7.635827592083169e-05, + "loss": 0.3248, + "step": 827 + }, + { + "epoch": 1.0574712643678161, + "grad_norm": 0.2852461187038295, + "learning_rate": 7.629889922309577e-05, + "loss": 0.3296, + "step": 828 + }, + { + "epoch": 1.0587484035759898, + "grad_norm": 0.2252324378118215, + "learning_rate": 7.623947120870541e-05, + "loss": 0.3027, + "step": 829 + }, + { + "epoch": 1.0600255427841634, + "grad_norm": 0.27360900641234326, + "learning_rate": 7.617999199362166e-05, + "loss": 0.33, + "step": 830 + }, + { + "epoch": 1.0613026819923372, + "grad_norm": 0.27955160576647375, + "learning_rate": 7.612046169390543e-05, + "loss": 0.2776, + "step": 831 + }, + { + "epoch": 1.0625798212005109, + "grad_norm": 0.2515504623795006, + "learning_rate": 7.60608804257173e-05, + "loss": 0.3031, + "step": 832 + }, + { + "epoch": 1.0638569604086845, + "grad_norm": 0.4292135650453065, + "learning_rate": 7.600124830531737e-05, + "loss": 0.3389, + "step": 833 + }, + { + "epoch": 1.0651340996168583, + "grad_norm": 0.259901481970159, + "learning_rate": 7.594156544906483e-05, + "loss": 0.2812, + "step": 834 + }, + { + "epoch": 1.066411238825032, + "grad_norm": 0.28882796039325187, + "learning_rate": 7.588183197341804e-05, + "loss": 0.3155, + "step": 835 + }, + { + "epoch": 1.0676883780332056, + "grad_norm": 0.29164674800081497, + "learning_rate": 7.582204799493402e-05, + "loss": 0.2601, + "step": 836 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 0.24124498999090827, + "learning_rate": 7.576221363026834e-05, + "loss": 0.2778, + "step": 837 + }, + { + "epoch": 1.070242656449553, + "grad_norm": 0.3287344449167692, + "learning_rate": 7.570232899617496e-05, + "loss": 0.3105, + "step": 838 + }, + { + "epoch": 1.0715197956577267, + "grad_norm": 0.25384059737315806, + "learning_rate": 7.564239420950586e-05, + "loss": 0.2793, + "step": 839 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.26918592053137563, + "learning_rate": 7.558240938721091e-05, + "loss": 0.2939, + "step": 840 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.2728808823545993, + "learning_rate": 7.552237464633761e-05, + "loss": 0.2735, + "step": 841 + }, + { + "epoch": 1.0753512132822478, + "grad_norm": 0.2539549342425071, + "learning_rate": 7.546229010403085e-05, + "loss": 0.3268, + "step": 842 + }, + { + "epoch": 1.0766283524904214, + "grad_norm": 0.6979028043625681, + "learning_rate": 7.540215587753275e-05, + "loss": 0.2977, + "step": 843 + }, + { + "epoch": 1.077905491698595, + "grad_norm": 0.27284947545074345, + "learning_rate": 7.534197208418228e-05, + "loss": 0.291, + "step": 844 + }, + { + "epoch": 1.079182630906769, + "grad_norm": 0.306406934626975, + "learning_rate": 7.528173884141525e-05, + "loss": 0.2868, + "step": 845 + }, + { + "epoch": 1.0804597701149425, + "grad_norm": 0.2938341209051844, + "learning_rate": 7.522145626676386e-05, + "loss": 0.327, + "step": 846 + }, + { + "epoch": 1.0817369093231162, + "grad_norm": 0.21089866100577578, + "learning_rate": 7.516112447785663e-05, + "loss": 0.2146, + "step": 847 + }, + { + "epoch": 1.08301404853129, + "grad_norm": 0.24986008187848954, + "learning_rate": 7.510074359241808e-05, + "loss": 0.2862, + "step": 848 + }, + { + "epoch": 1.0842911877394636, + "grad_norm": 0.28942561844762804, + "learning_rate": 7.504031372826854e-05, + "loss": 0.293, + "step": 849 + }, + { + "epoch": 1.0855683269476373, + "grad_norm": 0.27420127090502994, + "learning_rate": 7.497983500332392e-05, + "loss": 0.2685, + "step": 850 + }, + { + "epoch": 1.086845466155811, + "grad_norm": 0.34906724138930834, + "learning_rate": 7.491930753559547e-05, + "loss": 0.3756, + "step": 851 + }, + { + "epoch": 1.0881226053639848, + "grad_norm": 0.6090036568089158, + "learning_rate": 7.485873144318953e-05, + "loss": 0.2985, + "step": 852 + }, + { + "epoch": 1.0893997445721584, + "grad_norm": 0.25958003769018917, + "learning_rate": 7.479810684430733e-05, + "loss": 0.2995, + "step": 853 + }, + { + "epoch": 1.090676883780332, + "grad_norm": 0.7854940518665731, + "learning_rate": 7.473743385724478e-05, + "loss": 0.2704, + "step": 854 + }, + { + "epoch": 1.0919540229885056, + "grad_norm": 0.28968790749136547, + "learning_rate": 7.467671260039217e-05, + "loss": 0.2915, + "step": 855 + }, + { + "epoch": 1.0932311621966795, + "grad_norm": 0.22086859345893625, + "learning_rate": 7.4615943192234e-05, + "loss": 0.2735, + "step": 856 + }, + { + "epoch": 1.0945083014048531, + "grad_norm": 0.23111194464532334, + "learning_rate": 7.455512575134873e-05, + "loss": 0.2918, + "step": 857 + }, + { + "epoch": 1.0957854406130267, + "grad_norm": 0.29636338974848286, + "learning_rate": 7.449426039640852e-05, + "loss": 0.2726, + "step": 858 + }, + { + "epoch": 1.0970625798212006, + "grad_norm": 0.480327073568295, + "learning_rate": 7.443334724617905e-05, + "loss": 0.3388, + "step": 859 + }, + { + "epoch": 1.0983397190293742, + "grad_norm": 0.26307338805510305, + "learning_rate": 7.437238641951922e-05, + "loss": 0.3122, + "step": 860 + }, + { + "epoch": 1.0996168582375478, + "grad_norm": 0.2575012025542525, + "learning_rate": 7.431137803538103e-05, + "loss": 0.3036, + "step": 861 + }, + { + "epoch": 1.1008939974457217, + "grad_norm": 0.4996600155468629, + "learning_rate": 7.425032221280925e-05, + "loss": 0.3727, + "step": 862 + }, + { + "epoch": 1.1021711366538953, + "grad_norm": 0.2350357891620041, + "learning_rate": 7.418921907094117e-05, + "loss": 0.2668, + "step": 863 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.21062394951896088, + "learning_rate": 7.412806872900649e-05, + "loss": 0.238, + "step": 864 + }, + { + "epoch": 1.1047254150702426, + "grad_norm": 0.29963958507615585, + "learning_rate": 7.406687130632693e-05, + "loss": 0.311, + "step": 865 + }, + { + "epoch": 1.1060025542784164, + "grad_norm": 0.25674958564431755, + "learning_rate": 7.400562692231611e-05, + "loss": 0.2875, + "step": 866 + }, + { + "epoch": 1.10727969348659, + "grad_norm": 0.2734774446830285, + "learning_rate": 7.394433569647934e-05, + "loss": 0.3196, + "step": 867 + }, + { + "epoch": 1.1085568326947637, + "grad_norm": 0.2463535165680373, + "learning_rate": 7.388299774841329e-05, + "loss": 0.3124, + "step": 868 + }, + { + "epoch": 1.1098339719029373, + "grad_norm": 0.23748004785357235, + "learning_rate": 7.382161319780573e-05, + "loss": 0.2869, + "step": 869 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.3142954189236986, + "learning_rate": 7.37601821644355e-05, + "loss": 0.3081, + "step": 870 + }, + { + "epoch": 1.1123882503192848, + "grad_norm": 0.36184306877909245, + "learning_rate": 7.369870476817202e-05, + "loss": 0.319, + "step": 871 + }, + { + "epoch": 1.1136653895274584, + "grad_norm": 0.2866569122486576, + "learning_rate": 7.363718112897525e-05, + "loss": 0.3059, + "step": 872 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 0.3544630416455957, + "learning_rate": 7.357561136689535e-05, + "loss": 0.2712, + "step": 873 + }, + { + "epoch": 1.116219667943806, + "grad_norm": 0.2688923000635176, + "learning_rate": 7.351399560207253e-05, + "loss": 0.3412, + "step": 874 + }, + { + "epoch": 1.1174968071519795, + "grad_norm": 0.20839348430652088, + "learning_rate": 7.345233395473664e-05, + "loss": 0.2563, + "step": 875 + }, + { + "epoch": 1.1187739463601534, + "grad_norm": 0.2133574546495322, + "learning_rate": 7.339062654520724e-05, + "loss": 0.2559, + "step": 876 + }, + { + "epoch": 1.120051085568327, + "grad_norm": 0.30534308330979765, + "learning_rate": 7.332887349389301e-05, + "loss": 0.3138, + "step": 877 + }, + { + "epoch": 1.1213282247765006, + "grad_norm": 0.30420742990681954, + "learning_rate": 7.326707492129179e-05, + "loss": 0.2811, + "step": 878 + }, + { + "epoch": 1.1226053639846743, + "grad_norm": 0.2785431627787303, + "learning_rate": 7.320523094799025e-05, + "loss": 0.3222, + "step": 879 + }, + { + "epoch": 1.123882503192848, + "grad_norm": 0.347801724060834, + "learning_rate": 7.31433416946636e-05, + "loss": 0.3371, + "step": 880 + }, + { + "epoch": 1.1251596424010217, + "grad_norm": 0.2401702028832796, + "learning_rate": 7.308140728207544e-05, + "loss": 0.2686, + "step": 881 + }, + { + "epoch": 1.1264367816091954, + "grad_norm": 0.35654830163375323, + "learning_rate": 7.301942783107746e-05, + "loss": 0.3141, + "step": 882 + }, + { + "epoch": 1.127713920817369, + "grad_norm": 0.3223162844507077, + "learning_rate": 7.295740346260927e-05, + "loss": 0.302, + "step": 883 + }, + { + "epoch": 1.1289910600255428, + "grad_norm": 0.3941526425201126, + "learning_rate": 7.289533429769811e-05, + "loss": 0.3319, + "step": 884 + }, + { + "epoch": 1.1302681992337165, + "grad_norm": 0.2869885313646713, + "learning_rate": 7.283322045745859e-05, + "loss": 0.3474, + "step": 885 + }, + { + "epoch": 1.13154533844189, + "grad_norm": 0.26605500226970324, + "learning_rate": 7.277106206309258e-05, + "loss": 0.2601, + "step": 886 + }, + { + "epoch": 1.132822477650064, + "grad_norm": 0.21269552064174466, + "learning_rate": 7.270885923588879e-05, + "loss": 0.2784, + "step": 887 + }, + { + "epoch": 1.1340996168582376, + "grad_norm": 0.5721360652914029, + "learning_rate": 7.26466120972227e-05, + "loss": 0.2425, + "step": 888 + }, + { + "epoch": 1.1353767560664112, + "grad_norm": 0.22794575168081344, + "learning_rate": 7.258432076855624e-05, + "loss": 0.2956, + "step": 889 + }, + { + "epoch": 1.136653895274585, + "grad_norm": 0.23661997961051318, + "learning_rate": 7.252198537143757e-05, + "loss": 0.2444, + "step": 890 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 0.23409192232704262, + "learning_rate": 7.245960602750081e-05, + "loss": 0.298, + "step": 891 + }, + { + "epoch": 1.1392081736909323, + "grad_norm": 0.2525385570867021, + "learning_rate": 7.239718285846586e-05, + "loss": 0.3275, + "step": 892 + }, + { + "epoch": 1.140485312899106, + "grad_norm": 0.1893877487015324, + "learning_rate": 7.233471598613815e-05, + "loss": 0.2447, + "step": 893 + }, + { + "epoch": 1.1417624521072798, + "grad_norm": 0.27945932463083994, + "learning_rate": 7.227220553240834e-05, + "loss": 0.3175, + "step": 894 + }, + { + "epoch": 1.1430395913154534, + "grad_norm": 0.2879442199997178, + "learning_rate": 7.220965161925215e-05, + "loss": 0.3424, + "step": 895 + }, + { + "epoch": 1.144316730523627, + "grad_norm": 0.4042502205514717, + "learning_rate": 7.214705436873017e-05, + "loss": 0.3193, + "step": 896 + }, + { + "epoch": 1.1455938697318007, + "grad_norm": 0.26542284600136934, + "learning_rate": 7.208441390298741e-05, + "loss": 0.2861, + "step": 897 + }, + { + "epoch": 1.1468710089399745, + "grad_norm": 0.26797361495911365, + "learning_rate": 7.202173034425333e-05, + "loss": 0.3171, + "step": 898 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.3520389400873544, + "learning_rate": 7.195900381484145e-05, + "loss": 0.2924, + "step": 899 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.27318484396535553, + "learning_rate": 7.189623443714907e-05, + "loss": 0.3173, + "step": 900 + }, + { + "epoch": 1.1507024265644956, + "grad_norm": 0.31122412585760156, + "learning_rate": 7.183342233365721e-05, + "loss": 0.264, + "step": 901 + }, + { + "epoch": 1.1519795657726692, + "grad_norm": 0.40740409489532253, + "learning_rate": 7.177056762693017e-05, + "loss": 0.2788, + "step": 902 + }, + { + "epoch": 1.1532567049808429, + "grad_norm": 0.2932119152073389, + "learning_rate": 7.170767043961541e-05, + "loss": 0.3198, + "step": 903 + }, + { + "epoch": 1.1545338441890167, + "grad_norm": 0.2414454691531409, + "learning_rate": 7.16447308944433e-05, + "loss": 0.2926, + "step": 904 + }, + { + "epoch": 1.1558109833971904, + "grad_norm": 0.2729475529007793, + "learning_rate": 7.158174911422685e-05, + "loss": 0.276, + "step": 905 + }, + { + "epoch": 1.157088122605364, + "grad_norm": 0.1879253674613448, + "learning_rate": 7.151872522186146e-05, + "loss": 0.2072, + "step": 906 + }, + { + "epoch": 1.1583652618135376, + "grad_norm": 0.2376389939930858, + "learning_rate": 7.145565934032471e-05, + "loss": 0.3106, + "step": 907 + }, + { + "epoch": 1.1596424010217115, + "grad_norm": 0.29958010332458035, + "learning_rate": 7.13925515926762e-05, + "loss": 0.2643, + "step": 908 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 0.22815295689996568, + "learning_rate": 7.132940210205705e-05, + "loss": 0.259, + "step": 909 + }, + { + "epoch": 1.1621966794380587, + "grad_norm": 0.2881775823126032, + "learning_rate": 7.126621099168999e-05, + "loss": 0.3062, + "step": 910 + }, + { + "epoch": 1.1634738186462323, + "grad_norm": 0.2444263645754207, + "learning_rate": 7.120297838487886e-05, + "loss": 0.2092, + "step": 911 + }, + { + "epoch": 1.1647509578544062, + "grad_norm": 0.270907105795975, + "learning_rate": 7.113970440500858e-05, + "loss": 0.2939, + "step": 912 + }, + { + "epoch": 1.1660280970625798, + "grad_norm": 0.21965972699150615, + "learning_rate": 7.107638917554468e-05, + "loss": 0.3181, + "step": 913 + }, + { + "epoch": 1.1673052362707534, + "grad_norm": 0.23467841694807126, + "learning_rate": 7.101303282003324e-05, + "loss": 0.2539, + "step": 914 + }, + { + "epoch": 1.1685823754789273, + "grad_norm": 1.123213649666716, + "learning_rate": 7.094963546210058e-05, + "loss": 0.4029, + "step": 915 + }, + { + "epoch": 1.169859514687101, + "grad_norm": 1.0288052659068354, + "learning_rate": 7.088619722545306e-05, + "loss": 0.2416, + "step": 916 + }, + { + "epoch": 1.1711366538952745, + "grad_norm": 0.25891630395084186, + "learning_rate": 7.082271823387675e-05, + "loss": 0.3227, + "step": 917 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 0.3617287239880173, + "learning_rate": 7.07591986112373e-05, + "loss": 0.3689, + "step": 918 + }, + { + "epoch": 1.173690932311622, + "grad_norm": 0.4372456443617023, + "learning_rate": 7.069563848147956e-05, + "loss": 0.3127, + "step": 919 + }, + { + "epoch": 1.1749680715197957, + "grad_norm": 0.2511373798577969, + "learning_rate": 7.063203796862752e-05, + "loss": 0.2331, + "step": 920 + }, + { + "epoch": 1.1762452107279693, + "grad_norm": 0.2564687682289008, + "learning_rate": 7.05683971967839e-05, + "loss": 0.2713, + "step": 921 + }, + { + "epoch": 1.1775223499361431, + "grad_norm": 0.3051501720018287, + "learning_rate": 7.050471629013002e-05, + "loss": 0.3635, + "step": 922 + }, + { + "epoch": 1.1787994891443168, + "grad_norm": 0.23008293665151652, + "learning_rate": 7.044099537292548e-05, + "loss": 0.2539, + "step": 923 + }, + { + "epoch": 1.1800766283524904, + "grad_norm": 0.26810557624811737, + "learning_rate": 7.037723456950796e-05, + "loss": 0.3152, + "step": 924 + }, + { + "epoch": 1.181353767560664, + "grad_norm": 0.2911818464935902, + "learning_rate": 7.031343400429301e-05, + "loss": 0.308, + "step": 925 + }, + { + "epoch": 1.1826309067688379, + "grad_norm": 0.29030912913535334, + "learning_rate": 7.02495938017737e-05, + "loss": 0.3331, + "step": 926 + }, + { + "epoch": 1.1839080459770115, + "grad_norm": 0.7690501851848985, + "learning_rate": 7.018571408652044e-05, + "loss": 0.3153, + "step": 927 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.34343632465696733, + "learning_rate": 7.012179498318088e-05, + "loss": 0.2907, + "step": 928 + }, + { + "epoch": 1.186462324393359, + "grad_norm": 0.23486648014254619, + "learning_rate": 7.005783661647934e-05, + "loss": 0.2893, + "step": 929 + }, + { + "epoch": 1.1877394636015326, + "grad_norm": 0.37249232821913414, + "learning_rate": 6.999383911121686e-05, + "loss": 0.2993, + "step": 930 + }, + { + "epoch": 1.1890166028097062, + "grad_norm": 0.2777693474323358, + "learning_rate": 6.992980259227084e-05, + "loss": 0.2829, + "step": 931 + }, + { + "epoch": 1.1902937420178799, + "grad_norm": 0.3632139019898651, + "learning_rate": 6.986572718459479e-05, + "loss": 0.2972, + "step": 932 + }, + { + "epoch": 1.1915708812260537, + "grad_norm": 0.26075859973105026, + "learning_rate": 6.980161301321809e-05, + "loss": 0.3166, + "step": 933 + }, + { + "epoch": 1.1928480204342273, + "grad_norm": 0.3344144297068406, + "learning_rate": 6.97374602032458e-05, + "loss": 0.3469, + "step": 934 + }, + { + "epoch": 1.194125159642401, + "grad_norm": 0.36759303755494577, + "learning_rate": 6.967326887985837e-05, + "loss": 0.272, + "step": 935 + }, + { + "epoch": 1.1954022988505748, + "grad_norm": 0.2636330245984027, + "learning_rate": 6.960903916831131e-05, + "loss": 0.2643, + "step": 936 + }, + { + "epoch": 1.1966794380587484, + "grad_norm": 0.3327492158971368, + "learning_rate": 6.95447711939352e-05, + "loss": 0.3128, + "step": 937 + }, + { + "epoch": 1.197956577266922, + "grad_norm": 1.350249364501428, + "learning_rate": 6.948046508213515e-05, + "loss": 0.3094, + "step": 938 + }, + { + "epoch": 1.1992337164750957, + "grad_norm": 0.27555057888461515, + "learning_rate": 6.94161209583907e-05, + "loss": 0.3326, + "step": 939 + }, + { + "epoch": 1.2005108556832695, + "grad_norm": 0.2459690636749319, + "learning_rate": 6.935173894825563e-05, + "loss": 0.2293, + "step": 940 + }, + { + "epoch": 1.2017879948914432, + "grad_norm": 0.24894088266374387, + "learning_rate": 6.928731917735762e-05, + "loss": 0.2461, + "step": 941 + }, + { + "epoch": 1.2030651340996168, + "grad_norm": 0.23930409023011195, + "learning_rate": 6.922286177139796e-05, + "loss": 0.2582, + "step": 942 + }, + { + "epoch": 1.2043422733077906, + "grad_norm": 0.3038824128180063, + "learning_rate": 6.915836685615149e-05, + "loss": 0.3012, + "step": 943 + }, + { + "epoch": 1.2056194125159643, + "grad_norm": 0.27748087310437514, + "learning_rate": 6.909383455746617e-05, + "loss": 0.3208, + "step": 944 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.18941259400112614, + "learning_rate": 6.902926500126292e-05, + "loss": 0.2579, + "step": 945 + }, + { + "epoch": 1.2081736909323115, + "grad_norm": 0.2851273765624193, + "learning_rate": 6.896465831353534e-05, + "loss": 0.3105, + "step": 946 + }, + { + "epoch": 1.2094508301404854, + "grad_norm": 0.35164808875825954, + "learning_rate": 6.890001462034956e-05, + "loss": 0.2921, + "step": 947 + }, + { + "epoch": 1.210727969348659, + "grad_norm": 0.2849385614599333, + "learning_rate": 6.883533404784383e-05, + "loss": 0.3429, + "step": 948 + }, + { + "epoch": 1.2120051085568326, + "grad_norm": 0.34942854610553464, + "learning_rate": 6.877061672222841e-05, + "loss": 0.2656, + "step": 949 + }, + { + "epoch": 1.2132822477650063, + "grad_norm": 0.3314139063848881, + "learning_rate": 6.870586276978526e-05, + "loss": 0.2746, + "step": 950 + }, + { + "epoch": 1.21455938697318, + "grad_norm": 0.27083010077452657, + "learning_rate": 6.864107231686781e-05, + "loss": 0.2915, + "step": 951 + }, + { + "epoch": 1.2158365261813537, + "grad_norm": 0.30260261907053754, + "learning_rate": 6.857624548990071e-05, + "loss": 0.332, + "step": 952 + }, + { + "epoch": 1.2171136653895274, + "grad_norm": 0.2613560607793952, + "learning_rate": 6.85113824153796e-05, + "loss": 0.3378, + "step": 953 + }, + { + "epoch": 1.2183908045977012, + "grad_norm": 0.25742336772302865, + "learning_rate": 6.844648321987084e-05, + "loss": 0.2846, + "step": 954 + }, + { + "epoch": 1.2196679438058748, + "grad_norm": 0.2852153947451508, + "learning_rate": 6.838154803001131e-05, + "loss": 0.272, + "step": 955 + }, + { + "epoch": 1.2209450830140485, + "grad_norm": 0.26674302511844655, + "learning_rate": 6.831657697250801e-05, + "loss": 0.3298, + "step": 956 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.22502772285261927, + "learning_rate": 6.825157017413808e-05, + "loss": 0.2818, + "step": 957 + }, + { + "epoch": 1.223499361430396, + "grad_norm": 0.29966498685897647, + "learning_rate": 6.818652776174827e-05, + "loss": 0.2724, + "step": 958 + }, + { + "epoch": 1.2247765006385696, + "grad_norm": 0.23451486665871427, + "learning_rate": 6.812144986225493e-05, + "loss": 0.3026, + "step": 959 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.23711964442857705, + "learning_rate": 6.805633660264357e-05, + "loss": 0.3004, + "step": 960 + }, + { + "epoch": 1.227330779054917, + "grad_norm": 0.308327839933858, + "learning_rate": 6.799118810996876e-05, + "loss": 0.28, + "step": 961 + }, + { + "epoch": 1.2286079182630907, + "grad_norm": 0.2470172789641596, + "learning_rate": 6.792600451135377e-05, + "loss": 0.3272, + "step": 962 + }, + { + "epoch": 1.2298850574712643, + "grad_norm": 0.2933755212512582, + "learning_rate": 6.786078593399042e-05, + "loss": 0.3512, + "step": 963 + }, + { + "epoch": 1.231162196679438, + "grad_norm": 0.24095073705867884, + "learning_rate": 6.779553250513875e-05, + "loss": 0.2716, + "step": 964 + }, + { + "epoch": 1.2324393358876118, + "grad_norm": 0.2597459503522719, + "learning_rate": 6.773024435212678e-05, + "loss": 0.3111, + "step": 965 + }, + { + "epoch": 1.2337164750957854, + "grad_norm": 0.24742245759113443, + "learning_rate": 6.766492160235038e-05, + "loss": 0.2514, + "step": 966 + }, + { + "epoch": 1.234993614303959, + "grad_norm": 0.24451282439840605, + "learning_rate": 6.759956438327282e-05, + "loss": 0.2771, + "step": 967 + }, + { + "epoch": 1.236270753512133, + "grad_norm": 0.3051015314586851, + "learning_rate": 6.75341728224247e-05, + "loss": 0.3496, + "step": 968 + }, + { + "epoch": 1.2375478927203065, + "grad_norm": 0.40714955384512086, + "learning_rate": 6.746874704740362e-05, + "loss": 0.2755, + "step": 969 + }, + { + "epoch": 1.2388250319284801, + "grad_norm": 0.2428300919764796, + "learning_rate": 6.74032871858739e-05, + "loss": 0.2212, + "step": 970 + }, + { + "epoch": 1.240102171136654, + "grad_norm": 0.28062268939045615, + "learning_rate": 6.733779336556642e-05, + "loss": 0.2875, + "step": 971 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.26269548824197253, + "learning_rate": 6.727226571427831e-05, + "loss": 0.3285, + "step": 972 + }, + { + "epoch": 1.2426564495530013, + "grad_norm": 0.3113360481295226, + "learning_rate": 6.720670435987271e-05, + "loss": 0.2755, + "step": 973 + }, + { + "epoch": 1.2439335887611749, + "grad_norm": 0.26295192771300663, + "learning_rate": 6.714110943027853e-05, + "loss": 0.3375, + "step": 974 + }, + { + "epoch": 1.2452107279693487, + "grad_norm": 0.3574752552721606, + "learning_rate": 6.707548105349015e-05, + "loss": 0.3444, + "step": 975 + }, + { + "epoch": 1.2464878671775224, + "grad_norm": 0.45528139145693747, + "learning_rate": 6.700981935756732e-05, + "loss": 0.253, + "step": 976 + }, + { + "epoch": 1.247765006385696, + "grad_norm": 0.2739117354403127, + "learning_rate": 6.694412447063467e-05, + "loss": 0.2557, + "step": 977 + }, + { + "epoch": 1.2490421455938696, + "grad_norm": 0.3250747855879063, + "learning_rate": 6.68783965208817e-05, + "loss": 0.2944, + "step": 978 + }, + { + "epoch": 1.2503192848020435, + "grad_norm": 0.24635805126396254, + "learning_rate": 6.68126356365624e-05, + "loss": 0.3302, + "step": 979 + }, + { + "epoch": 1.251596424010217, + "grad_norm": 0.24137134770761814, + "learning_rate": 6.674684194599499e-05, + "loss": 0.3418, + "step": 980 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 0.24713384233812252, + "learning_rate": 6.668101557756175e-05, + "loss": 0.2932, + "step": 981 + }, + { + "epoch": 1.2541507024265646, + "grad_norm": 0.2472228296873972, + "learning_rate": 6.661515665970868e-05, + "loss": 0.2822, + "step": 982 + }, + { + "epoch": 1.2554278416347382, + "grad_norm": 0.3586721450089123, + "learning_rate": 6.654926532094534e-05, + "loss": 0.223, + "step": 983 + }, + { + "epoch": 1.2567049808429118, + "grad_norm": 0.6233671850970676, + "learning_rate": 6.648334168984452e-05, + "loss": 0.3283, + "step": 984 + }, + { + "epoch": 1.2579821200510857, + "grad_norm": 0.2389048301555971, + "learning_rate": 6.641738589504202e-05, + "loss": 0.313, + "step": 985 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.28812911173032996, + "learning_rate": 6.635139806523642e-05, + "loss": 0.2981, + "step": 986 + }, + { + "epoch": 1.260536398467433, + "grad_norm": 0.201058790701124, + "learning_rate": 6.628537832918878e-05, + "loss": 0.2547, + "step": 987 + }, + { + "epoch": 1.2618135376756068, + "grad_norm": 0.6505639735131661, + "learning_rate": 6.621932681572244e-05, + "loss": 0.3252, + "step": 988 + }, + { + "epoch": 1.2630906768837804, + "grad_norm": 0.26179563970442654, + "learning_rate": 6.615324365372281e-05, + "loss": 0.3169, + "step": 989 + }, + { + "epoch": 1.264367816091954, + "grad_norm": 0.31397229608181143, + "learning_rate": 6.60871289721369e-05, + "loss": 0.3088, + "step": 990 + }, + { + "epoch": 1.2656449553001277, + "grad_norm": 0.2830866822206043, + "learning_rate": 6.602098289997336e-05, + "loss": 0.2496, + "step": 991 + }, + { + "epoch": 1.2669220945083013, + "grad_norm": 0.2938643602420266, + "learning_rate": 6.595480556630203e-05, + "loss": 0.3349, + "step": 992 + }, + { + "epoch": 1.2681992337164751, + "grad_norm": 1.6737675574454418, + "learning_rate": 6.588859710025378e-05, + "loss": 0.3188, + "step": 993 + }, + { + "epoch": 1.2694763729246488, + "grad_norm": 0.22649679197706216, + "learning_rate": 6.582235763102021e-05, + "loss": 0.272, + "step": 994 + }, + { + "epoch": 1.2707535121328224, + "grad_norm": 0.26677824571840325, + "learning_rate": 6.57560872878534e-05, + "loss": 0.3093, + "step": 995 + }, + { + "epoch": 1.2720306513409962, + "grad_norm": 0.3634220965597673, + "learning_rate": 6.568978620006574e-05, + "loss": 0.3535, + "step": 996 + }, + { + "epoch": 1.2733077905491699, + "grad_norm": 0.3398144050139069, + "learning_rate": 6.562345449702951e-05, + "loss": 0.2517, + "step": 997 + }, + { + "epoch": 1.2745849297573435, + "grad_norm": 0.46608769986725335, + "learning_rate": 6.555709230817683e-05, + "loss": 0.3096, + "step": 998 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 0.22339061295569468, + "learning_rate": 6.549069976299923e-05, + "loss": 0.2523, + "step": 999 + }, + { + "epoch": 1.277139208173691, + "grad_norm": 0.2598609898289188, + "learning_rate": 6.542427699104749e-05, + "loss": 0.3399, + "step": 1000 + }, + { + "epoch": 1.2784163473818646, + "grad_norm": 0.26351055913501265, + "learning_rate": 6.535782412193146e-05, + "loss": 0.2678, + "step": 1001 + }, + { + "epoch": 1.2796934865900382, + "grad_norm": 0.5051292472144773, + "learning_rate": 6.529134128531957e-05, + "loss": 0.2818, + "step": 1002 + }, + { + "epoch": 1.280970625798212, + "grad_norm": 0.6853702950328175, + "learning_rate": 6.522482861093881e-05, + "loss": 0.2978, + "step": 1003 + }, + { + "epoch": 1.2822477650063857, + "grad_norm": 0.3303473189189555, + "learning_rate": 6.515828622857443e-05, + "loss": 0.3595, + "step": 1004 + }, + { + "epoch": 1.2835249042145593, + "grad_norm": 0.26401908946978764, + "learning_rate": 6.509171426806954e-05, + "loss": 0.2668, + "step": 1005 + }, + { + "epoch": 1.284802043422733, + "grad_norm": 0.27492075170924424, + "learning_rate": 6.502511285932507e-05, + "loss": 0.2575, + "step": 1006 + }, + { + "epoch": 1.2860791826309068, + "grad_norm": 0.23147317739039128, + "learning_rate": 6.495848213229933e-05, + "loss": 0.267, + "step": 1007 + }, + { + "epoch": 1.2873563218390804, + "grad_norm": 0.26822022989762273, + "learning_rate": 6.48918222170079e-05, + "loss": 0.3455, + "step": 1008 + }, + { + "epoch": 1.288633461047254, + "grad_norm": 0.25640340140167917, + "learning_rate": 6.482513324352329e-05, + "loss": 0.3284, + "step": 1009 + }, + { + "epoch": 1.289910600255428, + "grad_norm": 0.28596712010549075, + "learning_rate": 6.47584153419747e-05, + "loss": 0.3004, + "step": 1010 + }, + { + "epoch": 1.2911877394636015, + "grad_norm": 0.2758050979158091, + "learning_rate": 6.469166864254779e-05, + "loss": 0.2096, + "step": 1011 + }, + { + "epoch": 1.2924648786717752, + "grad_norm": 0.26056060370546885, + "learning_rate": 6.462489327548442e-05, + "loss": 0.2825, + "step": 1012 + }, + { + "epoch": 1.293742017879949, + "grad_norm": 0.2677960870286058, + "learning_rate": 6.455808937108237e-05, + "loss": 0.2488, + "step": 1013 + }, + { + "epoch": 1.2950191570881227, + "grad_norm": 0.3705975244036235, + "learning_rate": 6.449125705969511e-05, + "loss": 0.3663, + "step": 1014 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.3288731199195029, + "learning_rate": 6.442439647173155e-05, + "loss": 0.3874, + "step": 1015 + }, + { + "epoch": 1.29757343550447, + "grad_norm": 0.2895448111208823, + "learning_rate": 6.435750773765579e-05, + "loss": 0.3185, + "step": 1016 + }, + { + "epoch": 1.2988505747126438, + "grad_norm": 0.21137463904381273, + "learning_rate": 6.429059098798679e-05, + "loss": 0.2699, + "step": 1017 + }, + { + "epoch": 1.3001277139208174, + "grad_norm": 0.21647826529689287, + "learning_rate": 6.422364635329826e-05, + "loss": 0.256, + "step": 1018 + }, + { + "epoch": 1.301404853128991, + "grad_norm": 0.27538230432247646, + "learning_rate": 6.415667396421826e-05, + "loss": 0.3372, + "step": 1019 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.28692902322163905, + "learning_rate": 6.4089673951429e-05, + "loss": 0.3216, + "step": 1020 + }, + { + "epoch": 1.3039591315453385, + "grad_norm": 0.2568759817110383, + "learning_rate": 6.402264644566664e-05, + "loss": 0.2889, + "step": 1021 + }, + { + "epoch": 1.3052362707535121, + "grad_norm": 0.2908472998366449, + "learning_rate": 6.395559157772098e-05, + "loss": 0.2821, + "step": 1022 + }, + { + "epoch": 1.3065134099616857, + "grad_norm": 0.18329955644955384, + "learning_rate": 6.388850947843517e-05, + "loss": 0.2376, + "step": 1023 + }, + { + "epoch": 1.3077905491698596, + "grad_norm": 0.24449173865872997, + "learning_rate": 6.382140027870554e-05, + "loss": 0.3085, + "step": 1024 + }, + { + "epoch": 1.3090676883780332, + "grad_norm": 0.34531769024770903, + "learning_rate": 6.375426410948127e-05, + "loss": 0.3017, + "step": 1025 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 0.309593321011233, + "learning_rate": 6.368710110176418e-05, + "loss": 0.2724, + "step": 1026 + }, + { + "epoch": 1.3116219667943807, + "grad_norm": 0.2986372779302919, + "learning_rate": 6.361991138660845e-05, + "loss": 0.3518, + "step": 1027 + }, + { + "epoch": 1.3128991060025543, + "grad_norm": 0.3689089239965588, + "learning_rate": 6.355269509512042e-05, + "loss": 0.2647, + "step": 1028 + }, + { + "epoch": 1.314176245210728, + "grad_norm": 0.38586762625143733, + "learning_rate": 6.348545235845818e-05, + "loss": 0.3424, + "step": 1029 + }, + { + "epoch": 1.3154533844189016, + "grad_norm": 0.24411931448826815, + "learning_rate": 6.341818330783156e-05, + "loss": 0.2538, + "step": 1030 + }, + { + "epoch": 1.3167305236270754, + "grad_norm": 0.22427560022876641, + "learning_rate": 6.335088807450163e-05, + "loss": 0.2871, + "step": 1031 + }, + { + "epoch": 1.318007662835249, + "grad_norm": 0.34893811917411643, + "learning_rate": 6.328356678978058e-05, + "loss": 0.309, + "step": 1032 + }, + { + "epoch": 1.3192848020434227, + "grad_norm": 0.2568703952482983, + "learning_rate": 6.321621958503146e-05, + "loss": 0.2732, + "step": 1033 + }, + { + "epoch": 1.3205619412515963, + "grad_norm": 0.3041200869590653, + "learning_rate": 6.314884659166786e-05, + "loss": 0.3003, + "step": 1034 + }, + { + "epoch": 1.3218390804597702, + "grad_norm": 0.24951763932584128, + "learning_rate": 6.308144794115373e-05, + "loss": 0.2968, + "step": 1035 + }, + { + "epoch": 1.3231162196679438, + "grad_norm": 0.26544483986537626, + "learning_rate": 6.301402376500304e-05, + "loss": 0.3037, + "step": 1036 + }, + { + "epoch": 1.3243933588761174, + "grad_norm": 0.44508684125878945, + "learning_rate": 6.294657419477964e-05, + "loss": 0.3219, + "step": 1037 + }, + { + "epoch": 1.3256704980842913, + "grad_norm": 0.28044584177453913, + "learning_rate": 6.287909936209682e-05, + "loss": 0.3195, + "step": 1038 + }, + { + "epoch": 1.326947637292465, + "grad_norm": 0.3665475063438433, + "learning_rate": 6.281159939861725e-05, + "loss": 0.334, + "step": 1039 + }, + { + "epoch": 1.3282247765006385, + "grad_norm": 0.21567476817202408, + "learning_rate": 6.274407443605264e-05, + "loss": 0.2496, + "step": 1040 + }, + { + "epoch": 1.3295019157088124, + "grad_norm": 0.31607012660893646, + "learning_rate": 6.26765246061634e-05, + "loss": 0.2923, + "step": 1041 + }, + { + "epoch": 1.330779054916986, + "grad_norm": 0.3366118365964855, + "learning_rate": 6.260895004075857e-05, + "loss": 0.2482, + "step": 1042 + }, + { + "epoch": 1.3320561941251596, + "grad_norm": 0.2801688028390489, + "learning_rate": 6.254135087169537e-05, + "loss": 0.3176, + "step": 1043 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2119435452117795, + "learning_rate": 6.247372723087907e-05, + "loss": 0.3122, + "step": 1044 + }, + { + "epoch": 1.3346104725415069, + "grad_norm": 0.2661447062345255, + "learning_rate": 6.240607925026271e-05, + "loss": 0.3086, + "step": 1045 + }, + { + "epoch": 1.3358876117496807, + "grad_norm": 0.24250847818701893, + "learning_rate": 6.233840706184677e-05, + "loss": 0.2823, + "step": 1046 + }, + { + "epoch": 1.3371647509578544, + "grad_norm": 0.32977355942986103, + "learning_rate": 6.227071079767899e-05, + "loss": 0.2942, + "step": 1047 + }, + { + "epoch": 1.338441890166028, + "grad_norm": 0.28025134060669027, + "learning_rate": 6.220299058985409e-05, + "loss": 0.3059, + "step": 1048 + }, + { + "epoch": 1.3397190293742018, + "grad_norm": 0.24474331510804326, + "learning_rate": 6.213524657051353e-05, + "loss": 0.2813, + "step": 1049 + }, + { + "epoch": 1.3409961685823755, + "grad_norm": 0.28361275184108115, + "learning_rate": 6.206747887184523e-05, + "loss": 0.3556, + "step": 1050 + }, + { + "epoch": 1.342273307790549, + "grad_norm": 0.25456180552194657, + "learning_rate": 6.199968762608326e-05, + "loss": 0.2534, + "step": 1051 + }, + { + "epoch": 1.343550446998723, + "grad_norm": 0.35556343577908384, + "learning_rate": 6.193187296550772e-05, + "loss": 0.3104, + "step": 1052 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 0.2672808193755962, + "learning_rate": 6.186403502244432e-05, + "loss": 0.2817, + "step": 1053 + }, + { + "epoch": 1.3461047254150702, + "grad_norm": 0.2509013887170571, + "learning_rate": 6.179617392926426e-05, + "loss": 0.2846, + "step": 1054 + }, + { + "epoch": 1.347381864623244, + "grad_norm": 0.3651718358284798, + "learning_rate": 6.17282898183839e-05, + "loss": 0.3286, + "step": 1055 + }, + { + "epoch": 1.3486590038314177, + "grad_norm": 0.260267030739745, + "learning_rate": 6.166038282226448e-05, + "loss": 0.2657, + "step": 1056 + }, + { + "epoch": 1.3499361430395913, + "grad_norm": 0.37255006889660774, + "learning_rate": 6.159245307341195e-05, + "loss": 0.2825, + "step": 1057 + }, + { + "epoch": 1.351213282247765, + "grad_norm": 0.2659467713725852, + "learning_rate": 6.152450070437659e-05, + "loss": 0.2585, + "step": 1058 + }, + { + "epoch": 1.3524904214559386, + "grad_norm": 0.2348479806746035, + "learning_rate": 6.14565258477529e-05, + "loss": 0.2499, + "step": 1059 + }, + { + "epoch": 1.3537675606641124, + "grad_norm": 0.26019099615996105, + "learning_rate": 6.138852863617917e-05, + "loss": 0.2445, + "step": 1060 + }, + { + "epoch": 1.355044699872286, + "grad_norm": 0.24503383072551227, + "learning_rate": 6.132050920233739e-05, + "loss": 0.3151, + "step": 1061 + }, + { + "epoch": 1.3563218390804597, + "grad_norm": 0.24759853024493744, + "learning_rate": 6.125246767895286e-05, + "loss": 0.2916, + "step": 1062 + }, + { + "epoch": 1.3575989782886335, + "grad_norm": 0.3273743730389391, + "learning_rate": 6.118440419879403e-05, + "loss": 0.321, + "step": 1063 + }, + { + "epoch": 1.3588761174968071, + "grad_norm": 0.3390769686711663, + "learning_rate": 6.111631889467213e-05, + "loss": 0.3045, + "step": 1064 + }, + { + "epoch": 1.3601532567049808, + "grad_norm": 0.2155135520079544, + "learning_rate": 6.104821189944102e-05, + "loss": 0.2784, + "step": 1065 + }, + { + "epoch": 1.3614303959131546, + "grad_norm": 0.2781320257528933, + "learning_rate": 6.098008334599689e-05, + "loss": 0.218, + "step": 1066 + }, + { + "epoch": 1.3627075351213283, + "grad_norm": 0.29619446535592076, + "learning_rate": 6.0911933367277984e-05, + "loss": 0.2628, + "step": 1067 + }, + { + "epoch": 1.3639846743295019, + "grad_norm": 0.24997677718077405, + "learning_rate": 6.084376209626432e-05, + "loss": 0.3034, + "step": 1068 + }, + { + "epoch": 1.3652618135376757, + "grad_norm": 0.2280791637436764, + "learning_rate": 6.077556966597753e-05, + "loss": 0.278, + "step": 1069 + }, + { + "epoch": 1.3665389527458494, + "grad_norm": 0.3013837206194773, + "learning_rate": 6.0707356209480484e-05, + "loss": 0.337, + "step": 1070 + }, + { + "epoch": 1.367816091954023, + "grad_norm": 0.7938220305709432, + "learning_rate": 6.063912185987708e-05, + "loss": 0.3283, + "step": 1071 + }, + { + "epoch": 1.3690932311621966, + "grad_norm": 0.3207331742362712, + "learning_rate": 6.057086675031202e-05, + "loss": 0.3052, + "step": 1072 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.29765340852144656, + "learning_rate": 6.0502591013970475e-05, + "loss": 0.4195, + "step": 1073 + }, + { + "epoch": 1.371647509578544, + "grad_norm": 0.31947948758992295, + "learning_rate": 6.04342947840779e-05, + "loss": 0.34, + "step": 1074 + }, + { + "epoch": 1.3729246487867177, + "grad_norm": 0.28728870772655724, + "learning_rate": 6.036597819389972e-05, + "loss": 0.2453, + "step": 1075 + }, + { + "epoch": 1.3742017879948913, + "grad_norm": 0.32654038414662884, + "learning_rate": 6.02976413767411e-05, + "loss": 0.3375, + "step": 1076 + }, + { + "epoch": 1.3754789272030652, + "grad_norm": 0.2610098648616663, + "learning_rate": 6.022928446594661e-05, + "loss": 0.2727, + "step": 1077 + }, + { + "epoch": 1.3767560664112388, + "grad_norm": 0.3355326015890375, + "learning_rate": 6.016090759490014e-05, + "loss": 0.3339, + "step": 1078 + }, + { + "epoch": 1.3780332056194124, + "grad_norm": 0.2939878081394191, + "learning_rate": 6.009251089702447e-05, + "loss": 0.2815, + "step": 1079 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.4324417067336351, + "learning_rate": 6.0024094505781036e-05, + "loss": 0.3164, + "step": 1080 + }, + { + "epoch": 1.38058748403576, + "grad_norm": 0.24408103416354643, + "learning_rate": 5.995565855466974e-05, + "loss": 0.2762, + "step": 1081 + }, + { + "epoch": 1.3818646232439336, + "grad_norm": 0.311753420676242, + "learning_rate": 5.9887203177228654e-05, + "loss": 0.267, + "step": 1082 + }, + { + "epoch": 1.3831417624521074, + "grad_norm": 0.27644554271911426, + "learning_rate": 5.981872850703376e-05, + "loss": 0.3271, + "step": 1083 + }, + { + "epoch": 1.384418901660281, + "grad_norm": 0.28034449195870276, + "learning_rate": 5.975023467769865e-05, + "loss": 0.2678, + "step": 1084 + }, + { + "epoch": 1.3856960408684547, + "grad_norm": 0.32732983176007924, + "learning_rate": 5.9681721822874326e-05, + "loss": 0.2714, + "step": 1085 + }, + { + "epoch": 1.3869731800766283, + "grad_norm": 0.33593261676079017, + "learning_rate": 5.961319007624894e-05, + "loss": 0.3452, + "step": 1086 + }, + { + "epoch": 1.388250319284802, + "grad_norm": 0.22732698568923013, + "learning_rate": 5.954463957154742e-05, + "loss": 0.2601, + "step": 1087 + }, + { + "epoch": 1.3895274584929758, + "grad_norm": 0.3373498821913411, + "learning_rate": 5.947607044253142e-05, + "loss": 0.3688, + "step": 1088 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 0.5175291350124803, + "learning_rate": 5.940748282299885e-05, + "loss": 0.2837, + "step": 1089 + }, + { + "epoch": 1.392081736909323, + "grad_norm": 0.2974438063836474, + "learning_rate": 5.9338876846783685e-05, + "loss": 0.2539, + "step": 1090 + }, + { + "epoch": 1.3933588761174969, + "grad_norm": 0.30149330258783, + "learning_rate": 5.927025264775581e-05, + "loss": 0.3709, + "step": 1091 + }, + { + "epoch": 1.3946360153256705, + "grad_norm": 0.27793750377231724, + "learning_rate": 5.920161035982058e-05, + "loss": 0.2828, + "step": 1092 + }, + { + "epoch": 1.3959131545338441, + "grad_norm": 0.3872241949854895, + "learning_rate": 5.913295011691868e-05, + "loss": 0.2713, + "step": 1093 + }, + { + "epoch": 1.397190293742018, + "grad_norm": 0.3105286102893422, + "learning_rate": 5.9064272053025834e-05, + "loss": 0.307, + "step": 1094 + }, + { + "epoch": 1.3984674329501916, + "grad_norm": 0.2843264261611174, + "learning_rate": 5.899557630215256e-05, + "loss": 0.2579, + "step": 1095 + }, + { + "epoch": 1.3997445721583652, + "grad_norm": 0.660794563364477, + "learning_rate": 5.8926862998343826e-05, + "loss": 0.3318, + "step": 1096 + }, + { + "epoch": 1.401021711366539, + "grad_norm": 0.23610652028205312, + "learning_rate": 5.88581322756789e-05, + "loss": 0.2763, + "step": 1097 + }, + { + "epoch": 1.4022988505747127, + "grad_norm": 0.2949808131561366, + "learning_rate": 5.8789384268271055e-05, + "loss": 0.3427, + "step": 1098 + }, + { + "epoch": 1.4035759897828863, + "grad_norm": 0.32779245351217395, + "learning_rate": 5.872061911026723e-05, + "loss": 0.4054, + "step": 1099 + }, + { + "epoch": 1.40485312899106, + "grad_norm": 0.26050296967978886, + "learning_rate": 5.865183693584786e-05, + "loss": 0.2637, + "step": 1100 + }, + { + "epoch": 1.4061302681992336, + "grad_norm": 0.31204268628193726, + "learning_rate": 5.858303787922663e-05, + "loss": 0.2835, + "step": 1101 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.2804966076105946, + "learning_rate": 5.8514222074650094e-05, + "loss": 0.3311, + "step": 1102 + }, + { + "epoch": 1.408684546615581, + "grad_norm": 0.25332236635101407, + "learning_rate": 5.844538965639752e-05, + "loss": 0.3195, + "step": 1103 + }, + { + "epoch": 1.4099616858237547, + "grad_norm": 0.2408285448218717, + "learning_rate": 5.837654075878059e-05, + "loss": 0.2769, + "step": 1104 + }, + { + "epoch": 1.4112388250319285, + "grad_norm": 0.26821764194988584, + "learning_rate": 5.8307675516143154e-05, + "loss": 0.3408, + "step": 1105 + }, + { + "epoch": 1.4125159642401022, + "grad_norm": 0.27907566620432306, + "learning_rate": 5.823879406286094e-05, + "loss": 0.2882, + "step": 1106 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 0.2765891479921545, + "learning_rate": 5.8169896533341305e-05, + "loss": 0.3153, + "step": 1107 + }, + { + "epoch": 1.4150702426564497, + "grad_norm": 0.2659201613063463, + "learning_rate": 5.8100983062023e-05, + "loss": 0.3103, + "step": 1108 + }, + { + "epoch": 1.4163473818646233, + "grad_norm": 0.24421353597515483, + "learning_rate": 5.803205378337586e-05, + "loss": 0.2409, + "step": 1109 + }, + { + "epoch": 1.417624521072797, + "grad_norm": 0.3432618620564506, + "learning_rate": 5.796310883190055e-05, + "loss": 0.3641, + "step": 1110 + }, + { + "epoch": 1.4189016602809708, + "grad_norm": 0.3038382380792955, + "learning_rate": 5.789414834212836e-05, + "loss": 0.2744, + "step": 1111 + }, + { + "epoch": 1.4201787994891444, + "grad_norm": 0.29313694191961753, + "learning_rate": 5.782517244862087e-05, + "loss": 0.3042, + "step": 1112 + }, + { + "epoch": 1.421455938697318, + "grad_norm": 0.25993354526089163, + "learning_rate": 5.775618128596971e-05, + "loss": 0.2778, + "step": 1113 + }, + { + "epoch": 1.4227330779054916, + "grad_norm": 0.2976531380478175, + "learning_rate": 5.768717498879635e-05, + "loss": 0.2788, + "step": 1114 + }, + { + "epoch": 1.4240102171136653, + "grad_norm": 0.3099460511366343, + "learning_rate": 5.7618153691751745e-05, + "loss": 0.2256, + "step": 1115 + }, + { + "epoch": 1.4252873563218391, + "grad_norm": 0.27869484887057366, + "learning_rate": 5.7549117529516115e-05, + "loss": 0.2508, + "step": 1116 + }, + { + "epoch": 1.4265644955300127, + "grad_norm": 0.31494653954081614, + "learning_rate": 5.748006663679873e-05, + "loss": 0.2991, + "step": 1117 + }, + { + "epoch": 1.4278416347381864, + "grad_norm": 0.3202169046276306, + "learning_rate": 5.741100114833759e-05, + "loss": 0.2634, + "step": 1118 + }, + { + "epoch": 1.4291187739463602, + "grad_norm": 0.44209423456360525, + "learning_rate": 5.734192119889913e-05, + "loss": 0.2934, + "step": 1119 + }, + { + "epoch": 1.4303959131545338, + "grad_norm": 0.3322317203070009, + "learning_rate": 5.7272826923278065e-05, + "loss": 0.2675, + "step": 1120 + }, + { + "epoch": 1.4316730523627075, + "grad_norm": 0.35172203154123494, + "learning_rate": 5.7203718456297027e-05, + "loss": 0.3198, + "step": 1121 + }, + { + "epoch": 1.4329501915708813, + "grad_norm": 0.26811615359749785, + "learning_rate": 5.713459593280634e-05, + "loss": 0.2718, + "step": 1122 + }, + { + "epoch": 1.434227330779055, + "grad_norm": 0.22361492340723546, + "learning_rate": 5.706545948768378e-05, + "loss": 0.2775, + "step": 1123 + }, + { + "epoch": 1.4355044699872286, + "grad_norm": 0.7535043087827956, + "learning_rate": 5.699630925583426e-05, + "loss": 0.2528, + "step": 1124 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 0.303812400020654, + "learning_rate": 5.692714537218963e-05, + "loss": 0.2562, + "step": 1125 + }, + { + "epoch": 1.438058748403576, + "grad_norm": 0.2297233752855196, + "learning_rate": 5.6857967971708316e-05, + "loss": 0.2621, + "step": 1126 + }, + { + "epoch": 1.4393358876117497, + "grad_norm": 0.2789993801016815, + "learning_rate": 5.67887771893752e-05, + "loss": 0.2981, + "step": 1127 + }, + { + "epoch": 1.4406130268199233, + "grad_norm": 0.33030713606180767, + "learning_rate": 5.671957316020122e-05, + "loss": 0.2695, + "step": 1128 + }, + { + "epoch": 1.441890166028097, + "grad_norm": 0.2343027883771503, + "learning_rate": 5.665035601922317e-05, + "loss": 0.285, + "step": 1129 + }, + { + "epoch": 1.4431673052362708, + "grad_norm": 0.2791493008499506, + "learning_rate": 5.6581125901503476e-05, + "loss": 0.2493, + "step": 1130 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.30301572332342425, + "learning_rate": 5.651188294212979e-05, + "loss": 0.2558, + "step": 1131 + }, + { + "epoch": 1.445721583652618, + "grad_norm": 0.2275895213637738, + "learning_rate": 5.644262727621491e-05, + "loss": 0.1971, + "step": 1132 + }, + { + "epoch": 1.446998722860792, + "grad_norm": 0.30707562535615723, + "learning_rate": 5.637335903889639e-05, + "loss": 0.2689, + "step": 1133 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 0.3278188995396909, + "learning_rate": 5.630407836533632e-05, + "loss": 0.2945, + "step": 1134 + }, + { + "epoch": 1.4495530012771392, + "grad_norm": 0.3086434235863895, + "learning_rate": 5.623478539072106e-05, + "loss": 0.2564, + "step": 1135 + }, + { + "epoch": 1.450830140485313, + "grad_norm": 0.3469141058808339, + "learning_rate": 5.616548025026096e-05, + "loss": 0.2885, + "step": 1136 + }, + { + "epoch": 1.4521072796934866, + "grad_norm": 0.23149430787081615, + "learning_rate": 5.6096163079190146e-05, + "loss": 0.2585, + "step": 1137 + }, + { + "epoch": 1.4533844189016603, + "grad_norm": 0.26406946695346534, + "learning_rate": 5.602683401276615e-05, + "loss": 0.2839, + "step": 1138 + }, + { + "epoch": 1.454661558109834, + "grad_norm": 0.380930084317013, + "learning_rate": 5.595749318626979e-05, + "loss": 0.3099, + "step": 1139 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.3019501159235644, + "learning_rate": 5.5888140735004804e-05, + "loss": 0.2786, + "step": 1140 + }, + { + "epoch": 1.4572158365261814, + "grad_norm": 0.38159727505540786, + "learning_rate": 5.581877679429758e-05, + "loss": 0.283, + "step": 1141 + }, + { + "epoch": 1.458492975734355, + "grad_norm": 0.22944874075247035, + "learning_rate": 5.574940149949697e-05, + "loss": 0.2772, + "step": 1142 + }, + { + "epoch": 1.4597701149425286, + "grad_norm": 1.797586902921076, + "learning_rate": 5.568001498597395e-05, + "loss": 0.3597, + "step": 1143 + }, + { + "epoch": 1.4610472541507025, + "grad_norm": 0.30344155317142035, + "learning_rate": 5.561061738912142e-05, + "loss": 0.3064, + "step": 1144 + }, + { + "epoch": 1.462324393358876, + "grad_norm": 0.23560548950375113, + "learning_rate": 5.5541208844353875e-05, + "loss": 0.2557, + "step": 1145 + }, + { + "epoch": 1.4636015325670497, + "grad_norm": 0.2957666931147482, + "learning_rate": 5.54717894871072e-05, + "loss": 0.3448, + "step": 1146 + }, + { + "epoch": 1.4648786717752236, + "grad_norm": 0.7483268027029927, + "learning_rate": 5.5402359452838346e-05, + "loss": 0.3513, + "step": 1147 + }, + { + "epoch": 1.4661558109833972, + "grad_norm": 0.2546552060443052, + "learning_rate": 5.533291887702512e-05, + "loss": 0.3088, + "step": 1148 + }, + { + "epoch": 1.4674329501915708, + "grad_norm": 0.20044108066732416, + "learning_rate": 5.526346789516591e-05, + "loss": 0.2504, + "step": 1149 + }, + { + "epoch": 1.4687100893997447, + "grad_norm": 0.23181217177014865, + "learning_rate": 5.519400664277936e-05, + "loss": 0.2803, + "step": 1150 + }, + { + "epoch": 1.4699872286079183, + "grad_norm": 0.2502120424233909, + "learning_rate": 5.512453525540421e-05, + "loss": 0.3173, + "step": 1151 + }, + { + "epoch": 1.471264367816092, + "grad_norm": 0.3165777612964201, + "learning_rate": 5.5055053868598974e-05, + "loss": 0.3488, + "step": 1152 + }, + { + "epoch": 1.4725415070242658, + "grad_norm": 0.3818881953840746, + "learning_rate": 5.498556261794161e-05, + "loss": 0.3154, + "step": 1153 + }, + { + "epoch": 1.4738186462324394, + "grad_norm": 0.8586469281362148, + "learning_rate": 5.491606163902941e-05, + "loss": 0.3143, + "step": 1154 + }, + { + "epoch": 1.475095785440613, + "grad_norm": 0.29238898580549894, + "learning_rate": 5.484655106747859e-05, + "loss": 0.3145, + "step": 1155 + }, + { + "epoch": 1.4763729246487867, + "grad_norm": 0.3009791423333256, + "learning_rate": 5.477703103892412e-05, + "loss": 0.2807, + "step": 1156 + }, + { + "epoch": 1.4776500638569603, + "grad_norm": 0.24006244447558933, + "learning_rate": 5.4707501689019405e-05, + "loss": 0.2367, + "step": 1157 + }, + { + "epoch": 1.4789272030651341, + "grad_norm": 0.2792169888041215, + "learning_rate": 5.4637963153436e-05, + "loss": 0.3051, + "step": 1158 + }, + { + "epoch": 1.4802043422733078, + "grad_norm": 0.2537271003272158, + "learning_rate": 5.4568415567863483e-05, + "loss": 0.2683, + "step": 1159 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.32347773795425333, + "learning_rate": 5.4498859068009e-05, + "loss": 0.3066, + "step": 1160 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 0.26920839014787434, + "learning_rate": 5.442929378959714e-05, + "loss": 0.2956, + "step": 1161 + }, + { + "epoch": 1.4840357598978289, + "grad_norm": 0.3493516862184502, + "learning_rate": 5.4359719868369584e-05, + "loss": 0.3049, + "step": 1162 + }, + { + "epoch": 1.4853128991060025, + "grad_norm": 0.36955653948997, + "learning_rate": 5.429013744008491e-05, + "loss": 0.2695, + "step": 1163 + }, + { + "epoch": 1.4865900383141764, + "grad_norm": 0.2687159334443025, + "learning_rate": 5.42205466405183e-05, + "loss": 0.3122, + "step": 1164 + }, + { + "epoch": 1.48786717752235, + "grad_norm": 0.26743711316182245, + "learning_rate": 5.415094760546122e-05, + "loss": 0.249, + "step": 1165 + }, + { + "epoch": 1.4891443167305236, + "grad_norm": 0.3024748146774734, + "learning_rate": 5.4081340470721284e-05, + "loss": 0.2649, + "step": 1166 + }, + { + "epoch": 1.4904214559386972, + "grad_norm": 0.254896323409363, + "learning_rate": 5.401172537212183e-05, + "loss": 0.2887, + "step": 1167 + }, + { + "epoch": 1.491698595146871, + "grad_norm": 1.0499826346930017, + "learning_rate": 5.3942102445501795e-05, + "loss": 0.3296, + "step": 1168 + }, + { + "epoch": 1.4929757343550447, + "grad_norm": 0.2763720127957049, + "learning_rate": 5.387247182671539e-05, + "loss": 0.2773, + "step": 1169 + }, + { + "epoch": 1.4942528735632183, + "grad_norm": 0.33899062467157326, + "learning_rate": 5.3802833651631746e-05, + "loss": 0.2695, + "step": 1170 + }, + { + "epoch": 1.495530012771392, + "grad_norm": 0.392311310225203, + "learning_rate": 5.373318805613489e-05, + "loss": 0.3122, + "step": 1171 + }, + { + "epoch": 1.4968071519795658, + "grad_norm": 0.34357651252555255, + "learning_rate": 5.366353517612319e-05, + "loss": 0.2915, + "step": 1172 + }, + { + "epoch": 1.4980842911877394, + "grad_norm": 0.25608464707967304, + "learning_rate": 5.35938751475093e-05, + "loss": 0.2211, + "step": 1173 + }, + { + "epoch": 1.499361430395913, + "grad_norm": 0.31186682927804293, + "learning_rate": 5.352420810621981e-05, + "loss": 0.2976, + "step": 1174 + }, + { + "epoch": 1.500638569604087, + "grad_norm": 0.2513585342323586, + "learning_rate": 5.3454534188194994e-05, + "loss": 0.2822, + "step": 1175 + }, + { + "epoch": 1.5019157088122606, + "grad_norm": 0.7918962140708004, + "learning_rate": 5.3384853529388534e-05, + "loss": 0.2809, + "step": 1176 + }, + { + "epoch": 1.5031928480204342, + "grad_norm": 0.3056177466556648, + "learning_rate": 5.331516626576727e-05, + "loss": 0.2913, + "step": 1177 + }, + { + "epoch": 1.504469987228608, + "grad_norm": 0.283347967019181, + "learning_rate": 5.324547253331094e-05, + "loss": 0.3103, + "step": 1178 + }, + { + "epoch": 1.5057471264367817, + "grad_norm": 0.6543486931025977, + "learning_rate": 5.31757724680119e-05, + "loss": 0.2364, + "step": 1179 + }, + { + "epoch": 1.5070242656449553, + "grad_norm": 0.29677420380077185, + "learning_rate": 5.310606620587484e-05, + "loss": 0.2895, + "step": 1180 + }, + { + "epoch": 1.5083014048531291, + "grad_norm": 0.22815179884677966, + "learning_rate": 5.303635388291659e-05, + "loss": 0.2434, + "step": 1181 + }, + { + "epoch": 1.5095785440613025, + "grad_norm": 0.2936994346175375, + "learning_rate": 5.296663563516576e-05, + "loss": 0.2694, + "step": 1182 + }, + { + "epoch": 1.5108556832694764, + "grad_norm": 0.2107382949789611, + "learning_rate": 5.289691159866253e-05, + "loss": 0.2249, + "step": 1183 + }, + { + "epoch": 1.51213282247765, + "grad_norm": 0.29529796229738414, + "learning_rate": 5.2827181909458424e-05, + "loss": 0.3387, + "step": 1184 + }, + { + "epoch": 1.5134099616858236, + "grad_norm": 0.3087164395355005, + "learning_rate": 5.275744670361591e-05, + "loss": 0.322, + "step": 1185 + }, + { + "epoch": 1.5146871008939975, + "grad_norm": 0.2507195728845357, + "learning_rate": 5.26877061172083e-05, + "loss": 0.23, + "step": 1186 + }, + { + "epoch": 1.5159642401021711, + "grad_norm": 0.28907992974873364, + "learning_rate": 5.261796028631934e-05, + "loss": 0.2917, + "step": 1187 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.3334284220853943, + "learning_rate": 5.2548209347043084e-05, + "loss": 0.3136, + "step": 1188 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.2486686447885486, + "learning_rate": 5.24784534354835e-05, + "loss": 0.2545, + "step": 1189 + }, + { + "epoch": 1.5197956577266922, + "grad_norm": 0.3433121717679412, + "learning_rate": 5.2408692687754226e-05, + "loss": 0.32, + "step": 1190 + }, + { + "epoch": 1.5210727969348659, + "grad_norm": 0.24810756780641782, + "learning_rate": 5.233892723997844e-05, + "loss": 0.2778, + "step": 1191 + }, + { + "epoch": 1.5223499361430397, + "grad_norm": 0.28400824518900036, + "learning_rate": 5.22691572282884e-05, + "loss": 0.2693, + "step": 1192 + }, + { + "epoch": 1.5236270753512133, + "grad_norm": 0.32013148480433623, + "learning_rate": 5.21993827888253e-05, + "loss": 0.3042, + "step": 1193 + }, + { + "epoch": 1.524904214559387, + "grad_norm": 0.9921174058848368, + "learning_rate": 5.2129604057739e-05, + "loss": 0.289, + "step": 1194 + }, + { + "epoch": 1.5261813537675608, + "grad_norm": 0.3398194061696351, + "learning_rate": 5.205982117118767e-05, + "loss": 0.3067, + "step": 1195 + }, + { + "epoch": 1.5274584929757342, + "grad_norm": 0.28447540822331907, + "learning_rate": 5.1990034265337673e-05, + "loss": 0.2917, + "step": 1196 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 0.30962206447163104, + "learning_rate": 5.1920243476363126e-05, + "loss": 0.2679, + "step": 1197 + }, + { + "epoch": 1.5300127713920817, + "grad_norm": 0.3169732339073159, + "learning_rate": 5.185044894044582e-05, + "loss": 0.3165, + "step": 1198 + }, + { + "epoch": 1.5312899106002553, + "grad_norm": 0.35173468467160585, + "learning_rate": 5.1780650793774735e-05, + "loss": 0.2801, + "step": 1199 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.300412265696642, + "learning_rate": 5.171084917254602e-05, + "loss": 0.2706, + "step": 1200 + }, + { + "epoch": 1.5338441890166028, + "grad_norm": 0.35645988449066224, + "learning_rate": 5.1641044212962544e-05, + "loss": 0.2928, + "step": 1201 + }, + { + "epoch": 1.5351213282247764, + "grad_norm": 0.2838788922109306, + "learning_rate": 5.1571236051233666e-05, + "loss": 0.2331, + "step": 1202 + }, + { + "epoch": 1.5363984674329503, + "grad_norm": 0.4556992160268811, + "learning_rate": 5.150142482357505e-05, + "loss": 0.2593, + "step": 1203 + }, + { + "epoch": 1.537675606641124, + "grad_norm": 0.37070892528399574, + "learning_rate": 5.14316106662083e-05, + "loss": 0.3319, + "step": 1204 + }, + { + "epoch": 1.5389527458492975, + "grad_norm": 0.36285693871098207, + "learning_rate": 5.136179371536076e-05, + "loss": 0.3218, + "step": 1205 + }, + { + "epoch": 1.5402298850574714, + "grad_norm": 0.29455570051115415, + "learning_rate": 5.1291974107265215e-05, + "loss": 0.2911, + "step": 1206 + }, + { + "epoch": 1.541507024265645, + "grad_norm": 0.2838044485910791, + "learning_rate": 5.122215197815965e-05, + "loss": 0.2456, + "step": 1207 + }, + { + "epoch": 1.5427841634738186, + "grad_norm": 0.46115347855367383, + "learning_rate": 5.115232746428694e-05, + "loss": 0.26, + "step": 1208 + }, + { + "epoch": 1.5440613026819925, + "grad_norm": 0.3016604157690835, + "learning_rate": 5.108250070189462e-05, + "loss": 0.306, + "step": 1209 + }, + { + "epoch": 1.545338441890166, + "grad_norm": 0.32052793154969833, + "learning_rate": 5.101267182723466e-05, + "loss": 0.257, + "step": 1210 + }, + { + "epoch": 1.5466155810983397, + "grad_norm": 0.40687721971682483, + "learning_rate": 5.0942840976563074e-05, + "loss": 0.3339, + "step": 1211 + }, + { + "epoch": 1.5478927203065134, + "grad_norm": 3.6731070198026035, + "learning_rate": 5.087300828613978e-05, + "loss": 0.2621, + "step": 1212 + }, + { + "epoch": 1.549169859514687, + "grad_norm": 0.4046003495554532, + "learning_rate": 5.080317389222831e-05, + "loss": 0.346, + "step": 1213 + }, + { + "epoch": 1.5504469987228608, + "grad_norm": 0.2255467147617502, + "learning_rate": 5.073333793109545e-05, + "loss": 0.2472, + "step": 1214 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.7574633562301285, + "learning_rate": 5.066350053901109e-05, + "loss": 0.2894, + "step": 1215 + }, + { + "epoch": 1.553001277139208, + "grad_norm": 0.2848056969972755, + "learning_rate": 5.059366185224791e-05, + "loss": 0.287, + "step": 1216 + }, + { + "epoch": 1.554278416347382, + "grad_norm": 0.2657957628254532, + "learning_rate": 5.052382200708111e-05, + "loss": 0.3291, + "step": 1217 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.22931970438619395, + "learning_rate": 5.045398113978817e-05, + "loss": 0.2936, + "step": 1218 + }, + { + "epoch": 1.5568326947637292, + "grad_norm": 0.33618088012045033, + "learning_rate": 5.038413938664849e-05, + "loss": 0.3157, + "step": 1219 + }, + { + "epoch": 1.558109833971903, + "grad_norm": 0.277797324161563, + "learning_rate": 5.03142968839433e-05, + "loss": 0.2904, + "step": 1220 + }, + { + "epoch": 1.5593869731800765, + "grad_norm": 0.2515521984845092, + "learning_rate": 5.024445376795523e-05, + "loss": 0.2944, + "step": 1221 + }, + { + "epoch": 1.5606641123882503, + "grad_norm": 1.062709709383439, + "learning_rate": 5.017461017496814e-05, + "loss": 0.2768, + "step": 1222 + }, + { + "epoch": 1.5619412515964242, + "grad_norm": 0.3099855254054575, + "learning_rate": 5.010476624126677e-05, + "loss": 0.3363, + "step": 1223 + }, + { + "epoch": 1.5632183908045976, + "grad_norm": 0.31287724334126793, + "learning_rate": 5.003492210313659e-05, + "loss": 0.3142, + "step": 1224 + }, + { + "epoch": 1.5644955300127714, + "grad_norm": 0.3187944717338777, + "learning_rate": 4.996507789686342e-05, + "loss": 0.3137, + "step": 1225 + }, + { + "epoch": 1.565772669220945, + "grad_norm": 0.5592086857749461, + "learning_rate": 4.9895233758733245e-05, + "loss": 0.2671, + "step": 1226 + }, + { + "epoch": 1.5670498084291187, + "grad_norm": 0.2656728367256574, + "learning_rate": 4.982538982503188e-05, + "loss": 0.293, + "step": 1227 + }, + { + "epoch": 1.5683269476372925, + "grad_norm": 0.2755966646496296, + "learning_rate": 4.975554623204478e-05, + "loss": 0.255, + "step": 1228 + }, + { + "epoch": 1.5696040868454662, + "grad_norm": 0.2838605017978635, + "learning_rate": 4.968570311605671e-05, + "loss": 0.3469, + "step": 1229 + }, + { + "epoch": 1.5708812260536398, + "grad_norm": 0.25737515858629584, + "learning_rate": 4.961586061335153e-05, + "loss": 0.2559, + "step": 1230 + }, + { + "epoch": 1.5721583652618136, + "grad_norm": 0.2896880633246554, + "learning_rate": 4.9546018860211844e-05, + "loss": 0.2675, + "step": 1231 + }, + { + "epoch": 1.5734355044699873, + "grad_norm": 0.2979281608573969, + "learning_rate": 4.94761779929189e-05, + "loss": 0.2565, + "step": 1232 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 0.3297514946073306, + "learning_rate": 4.9406338147752096e-05, + "loss": 0.3082, + "step": 1233 + }, + { + "epoch": 1.5759897828863347, + "grad_norm": 0.20619442931364884, + "learning_rate": 4.933649946098892e-05, + "loss": 0.259, + "step": 1234 + }, + { + "epoch": 1.5772669220945081, + "grad_norm": 0.510900092856823, + "learning_rate": 4.9266662068904575e-05, + "loss": 0.2836, + "step": 1235 + }, + { + "epoch": 1.578544061302682, + "grad_norm": 0.30684706485030305, + "learning_rate": 4.91968261077717e-05, + "loss": 0.2914, + "step": 1236 + }, + { + "epoch": 1.5798212005108558, + "grad_norm": 0.23329295033761926, + "learning_rate": 4.912699171386022e-05, + "loss": 0.2713, + "step": 1237 + }, + { + "epoch": 1.5810983397190292, + "grad_norm": 0.41556242470261523, + "learning_rate": 4.905715902343695e-05, + "loss": 0.3484, + "step": 1238 + }, + { + "epoch": 1.582375478927203, + "grad_norm": 0.25938435631217527, + "learning_rate": 4.8987328172765354e-05, + "loss": 0.2886, + "step": 1239 + }, + { + "epoch": 1.5836526181353767, + "grad_norm": 0.23491053324987993, + "learning_rate": 4.8917499298105376e-05, + "loss": 0.2694, + "step": 1240 + }, + { + "epoch": 1.5849297573435503, + "grad_norm": 0.2752448954885809, + "learning_rate": 4.884767253571308e-05, + "loss": 0.359, + "step": 1241 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 0.23197673489810125, + "learning_rate": 4.877784802184037e-05, + "loss": 0.2513, + "step": 1242 + }, + { + "epoch": 1.5874840357598978, + "grad_norm": 0.7331225236499045, + "learning_rate": 4.870802589273478e-05, + "loss": 0.2672, + "step": 1243 + }, + { + "epoch": 1.5887611749680715, + "grad_norm": 0.39629372863027285, + "learning_rate": 4.863820628463925e-05, + "loss": 0.2771, + "step": 1244 + }, + { + "epoch": 1.5900383141762453, + "grad_norm": 0.28264581128814953, + "learning_rate": 4.8568389333791714e-05, + "loss": 0.2861, + "step": 1245 + }, + { + "epoch": 1.591315453384419, + "grad_norm": 0.2750039039303703, + "learning_rate": 4.849857517642496e-05, + "loss": 0.2329, + "step": 1246 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.2412676936527539, + "learning_rate": 4.842876394876636e-05, + "loss": 0.2444, + "step": 1247 + }, + { + "epoch": 1.5938697318007664, + "grad_norm": 0.2384863505206727, + "learning_rate": 4.8358955787037474e-05, + "loss": 0.2891, + "step": 1248 + }, + { + "epoch": 1.5951468710089398, + "grad_norm": 0.5390821201628958, + "learning_rate": 4.8289150827453986e-05, + "loss": 0.2711, + "step": 1249 + }, + { + "epoch": 1.5964240102171137, + "grad_norm": 0.3454891185755767, + "learning_rate": 4.821934920622528e-05, + "loss": 0.3469, + "step": 1250 + }, + { + "epoch": 1.5977011494252875, + "grad_norm": 0.40271773067658473, + "learning_rate": 4.81495510595542e-05, + "loss": 0.2699, + "step": 1251 + }, + { + "epoch": 1.598978288633461, + "grad_norm": 0.2573095307115896, + "learning_rate": 4.807975652363687e-05, + "loss": 0.3124, + "step": 1252 + }, + { + "epoch": 1.6002554278416348, + "grad_norm": 0.23007714085335745, + "learning_rate": 4.8009965734662345e-05, + "loss": 0.2844, + "step": 1253 + }, + { + "epoch": 1.6015325670498084, + "grad_norm": 0.24092040311502674, + "learning_rate": 4.794017882881233e-05, + "loss": 0.2665, + "step": 1254 + }, + { + "epoch": 1.602809706257982, + "grad_norm": 0.37564769255466235, + "learning_rate": 4.787039594226101e-05, + "loss": 0.3213, + "step": 1255 + }, + { + "epoch": 1.6040868454661559, + "grad_norm": 0.24695753968843873, + "learning_rate": 4.78006172111747e-05, + "loss": 0.2773, + "step": 1256 + }, + { + "epoch": 1.6053639846743295, + "grad_norm": 0.25383845873085903, + "learning_rate": 4.773084277171161e-05, + "loss": 0.2348, + "step": 1257 + }, + { + "epoch": 1.6066411238825031, + "grad_norm": 0.23415995127253802, + "learning_rate": 4.766107276002158e-05, + "loss": 0.2327, + "step": 1258 + }, + { + "epoch": 1.607918263090677, + "grad_norm": 0.22337244789093672, + "learning_rate": 4.759130731224578e-05, + "loss": 0.2491, + "step": 1259 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.3718358691259988, + "learning_rate": 4.7521546564516514e-05, + "loss": 0.3525, + "step": 1260 + }, + { + "epoch": 1.6104725415070242, + "grad_norm": 0.3040644576295891, + "learning_rate": 4.745179065295692e-05, + "loss": 0.2877, + "step": 1261 + }, + { + "epoch": 1.611749680715198, + "grad_norm": 0.24352061758883256, + "learning_rate": 4.738203971368066e-05, + "loss": 0.2273, + "step": 1262 + }, + { + "epoch": 1.6130268199233715, + "grad_norm": 0.32806364087434947, + "learning_rate": 4.73122938827917e-05, + "loss": 0.321, + "step": 1263 + }, + { + "epoch": 1.6143039591315453, + "grad_norm": 0.27224781070680476, + "learning_rate": 4.724255329638411e-05, + "loss": 0.2852, + "step": 1264 + }, + { + "epoch": 1.6155810983397192, + "grad_norm": 0.2470059020703907, + "learning_rate": 4.7172818090541594e-05, + "loss": 0.2655, + "step": 1265 + }, + { + "epoch": 1.6168582375478926, + "grad_norm": 0.3271051036315808, + "learning_rate": 4.710308840133747e-05, + "loss": 0.2826, + "step": 1266 + }, + { + "epoch": 1.6181353767560664, + "grad_norm": 1.5205004391926094, + "learning_rate": 4.7033364364834266e-05, + "loss": 0.3498, + "step": 1267 + }, + { + "epoch": 1.61941251596424, + "grad_norm": 0.24520726745828342, + "learning_rate": 4.696364611708342e-05, + "loss": 0.2228, + "step": 1268 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 0.3102091710979184, + "learning_rate": 4.689393379412517e-05, + "loss": 0.2957, + "step": 1269 + }, + { + "epoch": 1.6219667943805876, + "grad_norm": 0.2359495714000097, + "learning_rate": 4.682422753198812e-05, + "loss": 0.2734, + "step": 1270 + }, + { + "epoch": 1.6232439335887612, + "grad_norm": 0.28141462676293033, + "learning_rate": 4.675452746668908e-05, + "loss": 0.3368, + "step": 1271 + }, + { + "epoch": 1.6245210727969348, + "grad_norm": 0.2517942432335681, + "learning_rate": 4.668483373423274e-05, + "loss": 0.2709, + "step": 1272 + }, + { + "epoch": 1.6257982120051087, + "grad_norm": 0.26216240488260184, + "learning_rate": 4.6615146470611485e-05, + "loss": 0.2965, + "step": 1273 + }, + { + "epoch": 1.6270753512132823, + "grad_norm": 0.3511438754513362, + "learning_rate": 4.6545465811805024e-05, + "loss": 0.3258, + "step": 1274 + }, + { + "epoch": 1.628352490421456, + "grad_norm": 0.31787834657906616, + "learning_rate": 4.647579189378019e-05, + "loss": 0.3235, + "step": 1275 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.23349946500610683, + "learning_rate": 4.640612485249072e-05, + "loss": 0.289, + "step": 1276 + }, + { + "epoch": 1.6309067688378032, + "grad_norm": 0.27647188386820465, + "learning_rate": 4.633646482387682e-05, + "loss": 0.2765, + "step": 1277 + }, + { + "epoch": 1.632183908045977, + "grad_norm": 0.26735294451148256, + "learning_rate": 4.626681194386512e-05, + "loss": 0.2606, + "step": 1278 + }, + { + "epoch": 1.6334610472541509, + "grad_norm": 0.3475057768433133, + "learning_rate": 4.619716634836826e-05, + "loss": 0.2507, + "step": 1279 + }, + { + "epoch": 1.6347381864623243, + "grad_norm": 0.2434034284118453, + "learning_rate": 4.612752817328463e-05, + "loss": 0.2829, + "step": 1280 + }, + { + "epoch": 1.6360153256704981, + "grad_norm": 0.3952430592400376, + "learning_rate": 4.6057897554498196e-05, + "loss": 0.3089, + "step": 1281 + }, + { + "epoch": 1.6372924648786717, + "grad_norm": 0.2763615965578334, + "learning_rate": 4.598827462787817e-05, + "loss": 0.2538, + "step": 1282 + }, + { + "epoch": 1.6385696040868454, + "grad_norm": 0.2403833208460845, + "learning_rate": 4.591865952927873e-05, + "loss": 0.2715, + "step": 1283 + }, + { + "epoch": 1.6398467432950192, + "grad_norm": 0.7036275685373463, + "learning_rate": 4.5849052394538775e-05, + "loss": 0.2543, + "step": 1284 + }, + { + "epoch": 1.6411238825031929, + "grad_norm": 0.38875827193498164, + "learning_rate": 4.5779453359481714e-05, + "loss": 0.2968, + "step": 1285 + }, + { + "epoch": 1.6424010217113665, + "grad_norm": 0.28564865803969414, + "learning_rate": 4.5709862559915094e-05, + "loss": 0.285, + "step": 1286 + }, + { + "epoch": 1.6436781609195403, + "grad_norm": 0.29849450317145815, + "learning_rate": 4.5640280131630434e-05, + "loss": 0.3355, + "step": 1287 + }, + { + "epoch": 1.644955300127714, + "grad_norm": 0.3633418650022073, + "learning_rate": 4.557070621040287e-05, + "loss": 0.2868, + "step": 1288 + }, + { + "epoch": 1.6462324393358876, + "grad_norm": 0.3087027832926122, + "learning_rate": 4.5501140931991e-05, + "loss": 0.3189, + "step": 1289 + }, + { + "epoch": 1.6475095785440614, + "grad_norm": 0.29751602155024853, + "learning_rate": 4.543158443213653e-05, + "loss": 0.3136, + "step": 1290 + }, + { + "epoch": 1.6487867177522348, + "grad_norm": 0.37049504133486694, + "learning_rate": 4.536203684656401e-05, + "loss": 0.3772, + "step": 1291 + }, + { + "epoch": 1.6500638569604087, + "grad_norm": 0.2733945336727233, + "learning_rate": 4.529249831098061e-05, + "loss": 0.2999, + "step": 1292 + }, + { + "epoch": 1.6513409961685823, + "grad_norm": 0.34277427273180067, + "learning_rate": 4.5222968961075894e-05, + "loss": 0.2602, + "step": 1293 + }, + { + "epoch": 1.652618135376756, + "grad_norm": 0.40969426828547795, + "learning_rate": 4.515344893252142e-05, + "loss": 0.2802, + "step": 1294 + }, + { + "epoch": 1.6538952745849298, + "grad_norm": 0.24152706547403385, + "learning_rate": 4.50839383609706e-05, + "loss": 0.2564, + "step": 1295 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.24420659124989885, + "learning_rate": 4.501443738205841e-05, + "loss": 0.2798, + "step": 1296 + }, + { + "epoch": 1.656449553001277, + "grad_norm": 0.2849859050088564, + "learning_rate": 4.4944946131401044e-05, + "loss": 0.2765, + "step": 1297 + }, + { + "epoch": 1.657726692209451, + "grad_norm": 0.2863371564030423, + "learning_rate": 4.4875464744595794e-05, + "loss": 0.3834, + "step": 1298 + }, + { + "epoch": 1.6590038314176245, + "grad_norm": 0.25967550820923435, + "learning_rate": 4.4805993357220653e-05, + "loss": 0.3209, + "step": 1299 + }, + { + "epoch": 1.6602809706257982, + "grad_norm": 0.2373756612322783, + "learning_rate": 4.4736532104834105e-05, + "loss": 0.3042, + "step": 1300 + }, + { + "epoch": 1.661558109833972, + "grad_norm": 0.30096569349292834, + "learning_rate": 4.4667081122974885e-05, + "loss": 0.3134, + "step": 1301 + }, + { + "epoch": 1.6628352490421456, + "grad_norm": 0.2885435032198403, + "learning_rate": 4.4597640547161665e-05, + "loss": 0.2915, + "step": 1302 + }, + { + "epoch": 1.6641123882503193, + "grad_norm": 0.2571503909352097, + "learning_rate": 4.4528210512892814e-05, + "loss": 0.2906, + "step": 1303 + }, + { + "epoch": 1.6653895274584931, + "grad_norm": 0.2648491356785429, + "learning_rate": 4.4458791155646116e-05, + "loss": 0.3212, + "step": 1304 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2527106574378753, + "learning_rate": 4.4389382610878586e-05, + "loss": 0.2914, + "step": 1305 + }, + { + "epoch": 1.6679438058748404, + "grad_norm": 0.2517662569516686, + "learning_rate": 4.4319985014026054e-05, + "loss": 0.2898, + "step": 1306 + }, + { + "epoch": 1.669220945083014, + "grad_norm": 0.28137924383559354, + "learning_rate": 4.425059850050304e-05, + "loss": 0.2752, + "step": 1307 + }, + { + "epoch": 1.6704980842911876, + "grad_norm": 0.2641576827598719, + "learning_rate": 4.418122320570244e-05, + "loss": 0.3055, + "step": 1308 + }, + { + "epoch": 1.6717752234993615, + "grad_norm": 0.30148808191228826, + "learning_rate": 4.41118592649952e-05, + "loss": 0.3097, + "step": 1309 + }, + { + "epoch": 1.673052362707535, + "grad_norm": 0.2674297744961958, + "learning_rate": 4.404250681373021e-05, + "loss": 0.2697, + "step": 1310 + }, + { + "epoch": 1.6743295019157087, + "grad_norm": 0.2548144912457758, + "learning_rate": 4.397316598723385e-05, + "loss": 0.2422, + "step": 1311 + }, + { + "epoch": 1.6756066411238826, + "grad_norm": 0.19377718824700446, + "learning_rate": 4.390383692080986e-05, + "loss": 0.2309, + "step": 1312 + }, + { + "epoch": 1.6768837803320562, + "grad_norm": 0.27966172812576334, + "learning_rate": 4.383451974973904e-05, + "loss": 0.2705, + "step": 1313 + }, + { + "epoch": 1.6781609195402298, + "grad_norm": 0.303950904593755, + "learning_rate": 4.3765214609278955e-05, + "loss": 0.3155, + "step": 1314 + }, + { + "epoch": 1.6794380587484037, + "grad_norm": 0.3086695667227851, + "learning_rate": 4.3695921634663695e-05, + "loss": 0.2872, + "step": 1315 + }, + { + "epoch": 1.6807151979565773, + "grad_norm": 0.27821679407218153, + "learning_rate": 4.362664096110361e-05, + "loss": 0.2759, + "step": 1316 + }, + { + "epoch": 1.681992337164751, + "grad_norm": 0.42123155947492075, + "learning_rate": 4.355737272378511e-05, + "loss": 0.3075, + "step": 1317 + }, + { + "epoch": 1.6832694763729248, + "grad_norm": 0.25985548771344996, + "learning_rate": 4.348811705787022e-05, + "loss": 0.3014, + "step": 1318 + }, + { + "epoch": 1.6845466155810982, + "grad_norm": 0.3643296852906422, + "learning_rate": 4.3418874098496556e-05, + "loss": 0.3101, + "step": 1319 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.2852427967626781, + "learning_rate": 4.334964398077684e-05, + "loss": 0.2451, + "step": 1320 + }, + { + "epoch": 1.6871008939974457, + "grad_norm": 0.29351619660971534, + "learning_rate": 4.328042683979878e-05, + "loss": 0.2678, + "step": 1321 + }, + { + "epoch": 1.6883780332056193, + "grad_norm": 0.29867999341993556, + "learning_rate": 4.321122281062481e-05, + "loss": 0.2994, + "step": 1322 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 0.26398759507736685, + "learning_rate": 4.3142032028291695e-05, + "loss": 0.3121, + "step": 1323 + }, + { + "epoch": 1.6909323116219668, + "grad_norm": 0.2859189560945384, + "learning_rate": 4.3072854627810385e-05, + "loss": 0.2274, + "step": 1324 + }, + { + "epoch": 1.6922094508301404, + "grad_norm": 0.23592015046391102, + "learning_rate": 4.3003690744165765e-05, + "loss": 0.2619, + "step": 1325 + }, + { + "epoch": 1.6934865900383143, + "grad_norm": 0.24856936279829062, + "learning_rate": 4.293454051231623e-05, + "loss": 0.2682, + "step": 1326 + }, + { + "epoch": 1.6947637292464879, + "grad_norm": 0.25808082124642473, + "learning_rate": 4.286540406719367e-05, + "loss": 0.2847, + "step": 1327 + }, + { + "epoch": 1.6960408684546615, + "grad_norm": 0.22954959637012018, + "learning_rate": 4.2796281543703e-05, + "loss": 0.2422, + "step": 1328 + }, + { + "epoch": 1.6973180076628354, + "grad_norm": 0.30429393567698393, + "learning_rate": 4.272717307672194e-05, + "loss": 0.3301, + "step": 1329 + }, + { + "epoch": 1.698595146871009, + "grad_norm": 0.24328712479861206, + "learning_rate": 4.265807880110087e-05, + "loss": 0.2585, + "step": 1330 + }, + { + "epoch": 1.6998722860791826, + "grad_norm": 0.25017561331422467, + "learning_rate": 4.2588998851662426e-05, + "loss": 0.2464, + "step": 1331 + }, + { + "epoch": 1.7011494252873565, + "grad_norm": 0.2711455112713395, + "learning_rate": 4.251993336320127e-05, + "loss": 0.3387, + "step": 1332 + }, + { + "epoch": 1.7024265644955299, + "grad_norm": 0.27049741816071077, + "learning_rate": 4.245088247048388e-05, + "loss": 0.2836, + "step": 1333 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.2713465578829525, + "learning_rate": 4.238184630824827e-05, + "loss": 0.2631, + "step": 1334 + }, + { + "epoch": 1.7049808429118773, + "grad_norm": 0.30556362716028224, + "learning_rate": 4.231282501120366e-05, + "loss": 0.2919, + "step": 1335 + }, + { + "epoch": 1.706257982120051, + "grad_norm": 0.21574663536098548, + "learning_rate": 4.224381871403028e-05, + "loss": 0.2205, + "step": 1336 + }, + { + "epoch": 1.7075351213282248, + "grad_norm": 0.34788306343100467, + "learning_rate": 4.217482755137916e-05, + "loss": 0.2416, + "step": 1337 + }, + { + "epoch": 1.7088122605363985, + "grad_norm": 0.41668593915258667, + "learning_rate": 4.210585165787165e-05, + "loss": 0.3485, + "step": 1338 + }, + { + "epoch": 1.710089399744572, + "grad_norm": 0.32863590495539485, + "learning_rate": 4.2036891168099454e-05, + "loss": 0.3274, + "step": 1339 + }, + { + "epoch": 1.711366538952746, + "grad_norm": 0.28415495597004004, + "learning_rate": 4.1967946216624164e-05, + "loss": 0.2804, + "step": 1340 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 0.3266218688349786, + "learning_rate": 4.1899016937977e-05, + "loss": 0.3106, + "step": 1341 + }, + { + "epoch": 1.7139208173690932, + "grad_norm": 0.23385806299476342, + "learning_rate": 4.183010346665869e-05, + "loss": 0.2289, + "step": 1342 + }, + { + "epoch": 1.715197956577267, + "grad_norm": 0.29938247455155015, + "learning_rate": 4.176120593713907e-05, + "loss": 0.3069, + "step": 1343 + }, + { + "epoch": 1.7164750957854407, + "grad_norm": 0.26702693029104196, + "learning_rate": 4.169232448385685e-05, + "loss": 0.2306, + "step": 1344 + }, + { + "epoch": 1.7177522349936143, + "grad_norm": 0.3470846262682479, + "learning_rate": 4.162345924121941e-05, + "loss": 0.3187, + "step": 1345 + }, + { + "epoch": 1.7190293742017881, + "grad_norm": 0.5690778089262889, + "learning_rate": 4.15546103436025e-05, + "loss": 0.2761, + "step": 1346 + }, + { + "epoch": 1.7203065134099615, + "grad_norm": 0.2813034728903149, + "learning_rate": 4.1485777925349924e-05, + "loss": 0.2972, + "step": 1347 + }, + { + "epoch": 1.7215836526181354, + "grad_norm": 0.27343274556302266, + "learning_rate": 4.1416962120773396e-05, + "loss": 0.2712, + "step": 1348 + }, + { + "epoch": 1.722860791826309, + "grad_norm": 1.0560667255722933, + "learning_rate": 4.134816306415216e-05, + "loss": 0.3017, + "step": 1349 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 0.2488369344190117, + "learning_rate": 4.127938088973279e-05, + "loss": 0.2518, + "step": 1350 + }, + { + "epoch": 1.7254150702426565, + "grad_norm": 0.28976805064271627, + "learning_rate": 4.121061573172898e-05, + "loss": 0.2881, + "step": 1351 + }, + { + "epoch": 1.7266922094508301, + "grad_norm": 0.2922921863920664, + "learning_rate": 4.114186772432111e-05, + "loss": 0.2598, + "step": 1352 + }, + { + "epoch": 1.7279693486590038, + "grad_norm": 0.29316571693443977, + "learning_rate": 4.107313700165618e-05, + "loss": 0.2944, + "step": 1353 + }, + { + "epoch": 1.7292464878671776, + "grad_norm": 0.3046598133722125, + "learning_rate": 4.100442369784746e-05, + "loss": 0.2999, + "step": 1354 + }, + { + "epoch": 1.7305236270753512, + "grad_norm": 0.2593018313115248, + "learning_rate": 4.093572794697417e-05, + "loss": 0.3046, + "step": 1355 + }, + { + "epoch": 1.7318007662835249, + "grad_norm": 0.3059937456850529, + "learning_rate": 4.086704988308133e-05, + "loss": 0.2831, + "step": 1356 + }, + { + "epoch": 1.7330779054916987, + "grad_norm": 0.3382154043698383, + "learning_rate": 4.079838964017945e-05, + "loss": 0.3138, + "step": 1357 + }, + { + "epoch": 1.7343550446998723, + "grad_norm": 0.27556995881729496, + "learning_rate": 4.07297473522442e-05, + "loss": 0.2849, + "step": 1358 + }, + { + "epoch": 1.735632183908046, + "grad_norm": 0.8468902684570236, + "learning_rate": 4.066112315321631e-05, + "loss": 0.2637, + "step": 1359 + }, + { + "epoch": 1.7369093231162198, + "grad_norm": 0.24668677152679133, + "learning_rate": 4.0592517177001176e-05, + "loss": 0.2097, + "step": 1360 + }, + { + "epoch": 1.7381864623243932, + "grad_norm": 0.26290119725588096, + "learning_rate": 4.0523929557468594e-05, + "loss": 0.2752, + "step": 1361 + }, + { + "epoch": 1.739463601532567, + "grad_norm": 0.45084564396894, + "learning_rate": 4.045536042845257e-05, + "loss": 0.2859, + "step": 1362 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.2906694067013474, + "learning_rate": 4.038680992375108e-05, + "loss": 0.3318, + "step": 1363 + }, + { + "epoch": 1.7420178799489143, + "grad_norm": 0.2249846304829231, + "learning_rate": 4.031827817712568e-05, + "loss": 0.2753, + "step": 1364 + }, + { + "epoch": 1.7432950191570882, + "grad_norm": 0.23077249374276357, + "learning_rate": 4.0249765322301355e-05, + "loss": 0.2558, + "step": 1365 + }, + { + "epoch": 1.7445721583652618, + "grad_norm": 0.2753632683789497, + "learning_rate": 4.018127149296625e-05, + "loss": 0.2747, + "step": 1366 + }, + { + "epoch": 1.7458492975734354, + "grad_norm": 0.36265853432934675, + "learning_rate": 4.011279682277135e-05, + "loss": 0.2921, + "step": 1367 + }, + { + "epoch": 1.7471264367816093, + "grad_norm": 0.3279495376288913, + "learning_rate": 4.0044341445330267e-05, + "loss": 0.284, + "step": 1368 + }, + { + "epoch": 1.748403575989783, + "grad_norm": 0.31191494165945355, + "learning_rate": 3.997590549421899e-05, + "loss": 0.3216, + "step": 1369 + }, + { + "epoch": 1.7496807151979565, + "grad_norm": 0.2696178985789916, + "learning_rate": 3.9907489102975544e-05, + "loss": 0.2632, + "step": 1370 + }, + { + "epoch": 1.7509578544061304, + "grad_norm": 0.28051469365638415, + "learning_rate": 3.983909240509985e-05, + "loss": 0.3127, + "step": 1371 + }, + { + "epoch": 1.7522349936143038, + "grad_norm": 0.4691050963134948, + "learning_rate": 3.97707155340534e-05, + "loss": 0.2768, + "step": 1372 + }, + { + "epoch": 1.7535121328224776, + "grad_norm": 0.4224856488768402, + "learning_rate": 3.970235862325892e-05, + "loss": 0.2878, + "step": 1373 + }, + { + "epoch": 1.7547892720306515, + "grad_norm": 0.30864757675645504, + "learning_rate": 3.9634021806100274e-05, + "loss": 0.3014, + "step": 1374 + }, + { + "epoch": 1.756066411238825, + "grad_norm": 0.25297279498211783, + "learning_rate": 3.956570521592211e-05, + "loss": 0.2644, + "step": 1375 + }, + { + "epoch": 1.7573435504469987, + "grad_norm": 0.40379738416377076, + "learning_rate": 3.949740898602953e-05, + "loss": 0.292, + "step": 1376 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 0.26612109535286377, + "learning_rate": 3.942913324968798e-05, + "loss": 0.3309, + "step": 1377 + }, + { + "epoch": 1.759897828863346, + "grad_norm": 0.28463366650460836, + "learning_rate": 3.936087814012293e-05, + "loss": 0.3047, + "step": 1378 + }, + { + "epoch": 1.7611749680715199, + "grad_norm": 0.31886274798329267, + "learning_rate": 3.9292643790519534e-05, + "loss": 0.3338, + "step": 1379 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3081952963487656, + "learning_rate": 3.922443033402249e-05, + "loss": 0.3541, + "step": 1380 + }, + { + "epoch": 1.763729246487867, + "grad_norm": 0.28532228372846835, + "learning_rate": 3.9156237903735695e-05, + "loss": 0.2527, + "step": 1381 + }, + { + "epoch": 1.765006385696041, + "grad_norm": 0.7445253654146265, + "learning_rate": 3.908806663272203e-05, + "loss": 0.2745, + "step": 1382 + }, + { + "epoch": 1.7662835249042146, + "grad_norm": 0.33063502713793713, + "learning_rate": 3.9019916654003116e-05, + "loss": 0.2362, + "step": 1383 + }, + { + "epoch": 1.7675606641123882, + "grad_norm": 0.30596012889697366, + "learning_rate": 3.895178810055899e-05, + "loss": 0.293, + "step": 1384 + }, + { + "epoch": 1.768837803320562, + "grad_norm": 0.25504515117502424, + "learning_rate": 3.888368110532787e-05, + "loss": 0.2941, + "step": 1385 + }, + { + "epoch": 1.7701149425287355, + "grad_norm": 0.28309792755419755, + "learning_rate": 3.8815595801206e-05, + "loss": 0.2574, + "step": 1386 + }, + { + "epoch": 1.7713920817369093, + "grad_norm": 0.2772719631956273, + "learning_rate": 3.874753232104714e-05, + "loss": 0.3092, + "step": 1387 + }, + { + "epoch": 1.7726692209450832, + "grad_norm": 0.22048917540864926, + "learning_rate": 3.867949079766262e-05, + "loss": 0.2297, + "step": 1388 + }, + { + "epoch": 1.7739463601532566, + "grad_norm": 0.266594065657886, + "learning_rate": 3.861147136382085e-05, + "loss": 0.2918, + "step": 1389 + }, + { + "epoch": 1.7752234993614304, + "grad_norm": 0.29807420914778654, + "learning_rate": 3.8543474152247115e-05, + "loss": 0.306, + "step": 1390 + }, + { + "epoch": 1.776500638569604, + "grad_norm": 0.28063674046265413, + "learning_rate": 3.8475499295623415e-05, + "loss": 0.3103, + "step": 1391 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.2965121662171073, + "learning_rate": 3.840754692658807e-05, + "loss": 0.2764, + "step": 1392 + }, + { + "epoch": 1.7790549169859515, + "grad_norm": 0.2416056050625452, + "learning_rate": 3.833961717773553e-05, + "loss": 0.2885, + "step": 1393 + }, + { + "epoch": 1.7803320561941252, + "grad_norm": 0.27993036032190516, + "learning_rate": 3.8271710181616104e-05, + "loss": 0.2588, + "step": 1394 + }, + { + "epoch": 1.7816091954022988, + "grad_norm": 0.244559730774195, + "learning_rate": 3.820382607073575e-05, + "loss": 0.2296, + "step": 1395 + }, + { + "epoch": 1.7828863346104726, + "grad_norm": 0.24779727070523577, + "learning_rate": 3.813596497755569e-05, + "loss": 0.2986, + "step": 1396 + }, + { + "epoch": 1.7841634738186463, + "grad_norm": 0.2995053878805197, + "learning_rate": 3.806812703449228e-05, + "loss": 0.3109, + "step": 1397 + }, + { + "epoch": 1.78544061302682, + "grad_norm": 0.2695780448552827, + "learning_rate": 3.800031237391676e-05, + "loss": 0.2549, + "step": 1398 + }, + { + "epoch": 1.7867177522349937, + "grad_norm": 0.30147827325994053, + "learning_rate": 3.7932521128154784e-05, + "loss": 0.2721, + "step": 1399 + }, + { + "epoch": 1.7879948914431671, + "grad_norm": 0.6128911579514945, + "learning_rate": 3.786475342948647e-05, + "loss": 0.2506, + "step": 1400 + }, + { + "epoch": 1.789272030651341, + "grad_norm": 0.26685767195276344, + "learning_rate": 3.7797009410145925e-05, + "loss": 0.317, + "step": 1401 + }, + { + "epoch": 1.7905491698595148, + "grad_norm": 0.33999125557516574, + "learning_rate": 3.772928920232103e-05, + "loss": 0.2831, + "step": 1402 + }, + { + "epoch": 1.7918263090676882, + "grad_norm": 0.20921126847993834, + "learning_rate": 3.7661592938153245e-05, + "loss": 0.2417, + "step": 1403 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.26258999750005607, + "learning_rate": 3.75939207497373e-05, + "loss": 0.2694, + "step": 1404 + }, + { + "epoch": 1.7943805874840357, + "grad_norm": 0.2277707896973651, + "learning_rate": 3.752627276912093e-05, + "loss": 0.2229, + "step": 1405 + }, + { + "epoch": 1.7956577266922094, + "grad_norm": 0.2611523085252321, + "learning_rate": 3.745864912830463e-05, + "loss": 0.2375, + "step": 1406 + }, + { + "epoch": 1.7969348659003832, + "grad_norm": 0.28283178290991573, + "learning_rate": 3.7391049959241444e-05, + "loss": 0.2764, + "step": 1407 + }, + { + "epoch": 1.7982120051085568, + "grad_norm": 0.24276396873953668, + "learning_rate": 3.73234753938366e-05, + "loss": 0.2381, + "step": 1408 + }, + { + "epoch": 1.7994891443167305, + "grad_norm": 0.2708862764715498, + "learning_rate": 3.725592556394737e-05, + "loss": 0.3164, + "step": 1409 + }, + { + "epoch": 1.8007662835249043, + "grad_norm": 0.3111089246707919, + "learning_rate": 3.7188400601382765e-05, + "loss": 0.2813, + "step": 1410 + }, + { + "epoch": 1.802043422733078, + "grad_norm": 0.37062086394747235, + "learning_rate": 3.712090063790319e-05, + "loss": 0.2667, + "step": 1411 + }, + { + "epoch": 1.8033205619412516, + "grad_norm": 0.46759644818007634, + "learning_rate": 3.705342580522038e-05, + "loss": 0.2655, + "step": 1412 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 0.2667506049371394, + "learning_rate": 3.6985976234996954e-05, + "loss": 0.2608, + "step": 1413 + }, + { + "epoch": 1.8058748403575988, + "grad_norm": 0.25073642585927824, + "learning_rate": 3.691855205884627e-05, + "loss": 0.2712, + "step": 1414 + }, + { + "epoch": 1.8071519795657727, + "grad_norm": 0.3249370752178867, + "learning_rate": 3.685115340833215e-05, + "loss": 0.2795, + "step": 1415 + }, + { + "epoch": 1.8084291187739465, + "grad_norm": 0.22053591170208375, + "learning_rate": 3.678378041496856e-05, + "loss": 0.2424, + "step": 1416 + }, + { + "epoch": 1.80970625798212, + "grad_norm": 0.2644425504834226, + "learning_rate": 3.671643321021944e-05, + "loss": 0.3005, + "step": 1417 + }, + { + "epoch": 1.8109833971902938, + "grad_norm": 0.3430243267516102, + "learning_rate": 3.6649111925498405e-05, + "loss": 0.3613, + "step": 1418 + }, + { + "epoch": 1.8122605363984674, + "grad_norm": 0.2989888650076369, + "learning_rate": 3.658181669216845e-05, + "loss": 0.2747, + "step": 1419 + }, + { + "epoch": 1.813537675606641, + "grad_norm": 0.24123631310241037, + "learning_rate": 3.651454764154182e-05, + "loss": 0.2724, + "step": 1420 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.3534446057196619, + "learning_rate": 3.644730490487961e-05, + "loss": 0.3343, + "step": 1421 + }, + { + "epoch": 1.8160919540229885, + "grad_norm": 1.0323576478000114, + "learning_rate": 3.6380088613391555e-05, + "loss": 0.2664, + "step": 1422 + }, + { + "epoch": 1.8173690932311621, + "grad_norm": 0.4716234608435971, + "learning_rate": 3.631289889823583e-05, + "loss": 0.3162, + "step": 1423 + }, + { + "epoch": 1.818646232439336, + "grad_norm": 0.21932145879254528, + "learning_rate": 3.6245735890518745e-05, + "loss": 0.2382, + "step": 1424 + }, + { + "epoch": 1.8199233716475096, + "grad_norm": 0.24170805616577737, + "learning_rate": 3.6178599721294474e-05, + "loss": 0.2919, + "step": 1425 + }, + { + "epoch": 1.8212005108556832, + "grad_norm": 0.27284981159746485, + "learning_rate": 3.611149052156483e-05, + "loss": 0.2773, + "step": 1426 + }, + { + "epoch": 1.822477650063857, + "grad_norm": 0.4868274319214595, + "learning_rate": 3.604440842227904e-05, + "loss": 0.2805, + "step": 1427 + }, + { + "epoch": 1.8237547892720305, + "grad_norm": 0.23282989300176984, + "learning_rate": 3.597735355433337e-05, + "loss": 0.2648, + "step": 1428 + }, + { + "epoch": 1.8250319284802043, + "grad_norm": 0.24084541074397636, + "learning_rate": 3.5910326048571016e-05, + "loss": 0.2415, + "step": 1429 + }, + { + "epoch": 1.8263090676883782, + "grad_norm": 0.27344544586027014, + "learning_rate": 3.5843326035781776e-05, + "loss": 0.2873, + "step": 1430 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 0.32320810902889124, + "learning_rate": 3.577635364670176e-05, + "loss": 0.3181, + "step": 1431 + }, + { + "epoch": 1.8288633461047255, + "grad_norm": 0.2427638085097998, + "learning_rate": 3.570940901201321e-05, + "loss": 0.2518, + "step": 1432 + }, + { + "epoch": 1.830140485312899, + "grad_norm": 0.2927460418942015, + "learning_rate": 3.564249226234423e-05, + "loss": 0.277, + "step": 1433 + }, + { + "epoch": 1.8314176245210727, + "grad_norm": 0.2956763410137334, + "learning_rate": 3.5575603528268456e-05, + "loss": 0.2769, + "step": 1434 + }, + { + "epoch": 1.8326947637292466, + "grad_norm": 0.22155292572845114, + "learning_rate": 3.550874294030489e-05, + "loss": 0.2804, + "step": 1435 + }, + { + "epoch": 1.8339719029374202, + "grad_norm": 0.4158945523924355, + "learning_rate": 3.5441910628917644e-05, + "loss": 0.2854, + "step": 1436 + }, + { + "epoch": 1.8352490421455938, + "grad_norm": 0.4944816904183392, + "learning_rate": 3.53751067245156e-05, + "loss": 0.3495, + "step": 1437 + }, + { + "epoch": 1.8365261813537677, + "grad_norm": 0.25867556921059903, + "learning_rate": 3.530833135745221e-05, + "loss": 0.2312, + "step": 1438 + }, + { + "epoch": 1.8378033205619413, + "grad_norm": 0.24890748479574606, + "learning_rate": 3.524158465802531e-05, + "loss": 0.2794, + "step": 1439 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.39625481053035666, + "learning_rate": 3.5174866756476724e-05, + "loss": 0.2969, + "step": 1440 + }, + { + "epoch": 1.8403575989782888, + "grad_norm": 0.3297138370276832, + "learning_rate": 3.5108177782992114e-05, + "loss": 0.2881, + "step": 1441 + }, + { + "epoch": 1.8416347381864622, + "grad_norm": 0.2262046925242351, + "learning_rate": 3.5041517867700686e-05, + "loss": 0.2967, + "step": 1442 + }, + { + "epoch": 1.842911877394636, + "grad_norm": 0.36184534199534535, + "learning_rate": 3.497488714067494e-05, + "loss": 0.2693, + "step": 1443 + }, + { + "epoch": 1.8441890166028099, + "grad_norm": 0.2989304269385628, + "learning_rate": 3.4908285731930465e-05, + "loss": 0.2955, + "step": 1444 + }, + { + "epoch": 1.8454661558109833, + "grad_norm": 0.273010602480661, + "learning_rate": 3.484171377142559e-05, + "loss": 0.333, + "step": 1445 + }, + { + "epoch": 1.8467432950191571, + "grad_norm": 0.2658510711379584, + "learning_rate": 3.477517138906119e-05, + "loss": 0.2642, + "step": 1446 + }, + { + "epoch": 1.8480204342273308, + "grad_norm": 0.30320367669554793, + "learning_rate": 3.470865871468046e-05, + "loss": 0.3054, + "step": 1447 + }, + { + "epoch": 1.8492975734355044, + "grad_norm": 0.2916987923267331, + "learning_rate": 3.464217587806856e-05, + "loss": 0.2601, + "step": 1448 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 0.26299169429672226, + "learning_rate": 3.4575723008952506e-05, + "loss": 0.311, + "step": 1449 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.4037745995497882, + "learning_rate": 3.45093002370008e-05, + "loss": 0.2846, + "step": 1450 + }, + { + "epoch": 1.8531289910600255, + "grad_norm": 0.26825092611348866, + "learning_rate": 3.444290769182319e-05, + "loss": 0.2964, + "step": 1451 + }, + { + "epoch": 1.8544061302681993, + "grad_norm": 0.32126752907040246, + "learning_rate": 3.437654550297049e-05, + "loss": 0.3271, + "step": 1452 + }, + { + "epoch": 1.855683269476373, + "grad_norm": 0.21851276968855146, + "learning_rate": 3.431021379993428e-05, + "loss": 0.2636, + "step": 1453 + }, + { + "epoch": 1.8569604086845466, + "grad_norm": 0.2418985469016076, + "learning_rate": 3.42439127121466e-05, + "loss": 0.2901, + "step": 1454 + }, + { + "epoch": 1.8582375478927204, + "grad_norm": 0.37616089059747454, + "learning_rate": 3.417764236897979e-05, + "loss": 0.2899, + "step": 1455 + }, + { + "epoch": 1.8595146871008938, + "grad_norm": 0.26543671504209565, + "learning_rate": 3.4111402899746226e-05, + "loss": 0.2857, + "step": 1456 + }, + { + "epoch": 1.8607918263090677, + "grad_norm": 0.2501955228652938, + "learning_rate": 3.404519443369798e-05, + "loss": 0.2541, + "step": 1457 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 0.30288594227986415, + "learning_rate": 3.3979017100026643e-05, + "loss": 0.3036, + "step": 1458 + }, + { + "epoch": 1.863346104725415, + "grad_norm": 0.39332878922074344, + "learning_rate": 3.391287102786312e-05, + "loss": 0.3236, + "step": 1459 + }, + { + "epoch": 1.8646232439335888, + "grad_norm": 0.22234534001219866, + "learning_rate": 3.384675634627721e-05, + "loss": 0.2433, + "step": 1460 + }, + { + "epoch": 1.8659003831417624, + "grad_norm": 0.260633141523882, + "learning_rate": 3.3780673184277547e-05, + "loss": 0.2504, + "step": 1461 + }, + { + "epoch": 1.867177522349936, + "grad_norm": 0.24245415174654697, + "learning_rate": 3.3714621670811234e-05, + "loss": 0.2927, + "step": 1462 + }, + { + "epoch": 1.86845466155811, + "grad_norm": 0.2813405900569531, + "learning_rate": 3.364860193476359e-05, + "loss": 0.2674, + "step": 1463 + }, + { + "epoch": 1.8697318007662835, + "grad_norm": 0.23733073984031614, + "learning_rate": 3.3582614104957985e-05, + "loss": 0.2694, + "step": 1464 + }, + { + "epoch": 1.8710089399744572, + "grad_norm": 0.27476181521691023, + "learning_rate": 3.351665831015549e-05, + "loss": 0.2447, + "step": 1465 + }, + { + "epoch": 1.872286079182631, + "grad_norm": 0.26529357056752056, + "learning_rate": 3.345073467905466e-05, + "loss": 0.3088, + "step": 1466 + }, + { + "epoch": 1.8735632183908046, + "grad_norm": 0.23611476720775013, + "learning_rate": 3.33848433402913e-05, + "loss": 0.2041, + "step": 1467 + }, + { + "epoch": 1.8748403575989783, + "grad_norm": 0.2730120111794088, + "learning_rate": 3.331898442243826e-05, + "loss": 0.274, + "step": 1468 + }, + { + "epoch": 1.8761174968071521, + "grad_norm": 0.28059522968297435, + "learning_rate": 3.325315805400501e-05, + "loss": 0.2941, + "step": 1469 + }, + { + "epoch": 1.8773946360153255, + "grad_norm": 0.22801459491754777, + "learning_rate": 3.31873643634376e-05, + "loss": 0.2725, + "step": 1470 + }, + { + "epoch": 1.8786717752234994, + "grad_norm": 0.23379026747956033, + "learning_rate": 3.312160347911831e-05, + "loss": 0.2822, + "step": 1471 + }, + { + "epoch": 1.879948914431673, + "grad_norm": 0.3077000079262028, + "learning_rate": 3.3055875529365344e-05, + "loss": 0.3198, + "step": 1472 + }, + { + "epoch": 1.8812260536398466, + "grad_norm": 0.2815471582840037, + "learning_rate": 3.299018064243271e-05, + "loss": 0.2152, + "step": 1473 + }, + { + "epoch": 1.8825031928480205, + "grad_norm": 0.23716203804845298, + "learning_rate": 3.292451894650986e-05, + "loss": 0.2326, + "step": 1474 + }, + { + "epoch": 1.883780332056194, + "grad_norm": 0.2506281534554275, + "learning_rate": 3.285889056972148e-05, + "loss": 0.2502, + "step": 1475 + }, + { + "epoch": 1.8850574712643677, + "grad_norm": 0.2528360887736936, + "learning_rate": 3.279329564012731e-05, + "loss": 0.2911, + "step": 1476 + }, + { + "epoch": 1.8863346104725416, + "grad_norm": 0.34450859624229324, + "learning_rate": 3.272773428572169e-05, + "loss": 0.3273, + "step": 1477 + }, + { + "epoch": 1.8876117496807152, + "grad_norm": 0.29730806604963567, + "learning_rate": 3.2662206634433576e-05, + "loss": 0.2969, + "step": 1478 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.3308053853006532, + "learning_rate": 3.2596712814126116e-05, + "loss": 0.3037, + "step": 1479 + }, + { + "epoch": 1.8901660280970627, + "grad_norm": 0.27655542549327555, + "learning_rate": 3.2531252952596394e-05, + "loss": 0.3443, + "step": 1480 + }, + { + "epoch": 1.8914431673052363, + "grad_norm": 0.19887142948598338, + "learning_rate": 3.2465827177575305e-05, + "loss": 0.2316, + "step": 1481 + }, + { + "epoch": 1.89272030651341, + "grad_norm": 0.2518089399541969, + "learning_rate": 3.24004356167272e-05, + "loss": 0.3033, + "step": 1482 + }, + { + "epoch": 1.8939974457215838, + "grad_norm": 0.2513229477984829, + "learning_rate": 3.233507839764964e-05, + "loss": 0.2691, + "step": 1483 + }, + { + "epoch": 1.8952745849297572, + "grad_norm": 0.22458054399310967, + "learning_rate": 3.226975564787322e-05, + "loss": 0.2756, + "step": 1484 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.2731952917997749, + "learning_rate": 3.220446749486128e-05, + "loss": 0.2514, + "step": 1485 + }, + { + "epoch": 1.8978288633461047, + "grad_norm": 0.31048728750987303, + "learning_rate": 3.213921406600959e-05, + "loss": 0.2639, + "step": 1486 + }, + { + "epoch": 1.8991060025542783, + "grad_norm": 0.24908134537469667, + "learning_rate": 3.2073995488646224e-05, + "loss": 0.2784, + "step": 1487 + }, + { + "epoch": 1.9003831417624522, + "grad_norm": 0.7683341023995628, + "learning_rate": 3.200881189003127e-05, + "loss": 0.2273, + "step": 1488 + }, + { + "epoch": 1.9016602809706258, + "grad_norm": 0.25311778544121416, + "learning_rate": 3.194366339735644e-05, + "loss": 0.3232, + "step": 1489 + }, + { + "epoch": 1.9029374201787994, + "grad_norm": 0.3359825440862018, + "learning_rate": 3.187855013774508e-05, + "loss": 0.2934, + "step": 1490 + }, + { + "epoch": 1.9042145593869733, + "grad_norm": 0.3918029962402579, + "learning_rate": 3.181347223825174e-05, + "loss": 0.2825, + "step": 1491 + }, + { + "epoch": 1.9054916985951469, + "grad_norm": 0.25154400978981933, + "learning_rate": 3.1748429825861937e-05, + "loss": 0.2852, + "step": 1492 + }, + { + "epoch": 1.9067688378033205, + "grad_norm": 0.33915168592505246, + "learning_rate": 3.1683423027491984e-05, + "loss": 0.2511, + "step": 1493 + }, + { + "epoch": 1.9080459770114944, + "grad_norm": 0.5843642363091301, + "learning_rate": 3.161845196998872e-05, + "loss": 0.2502, + "step": 1494 + }, + { + "epoch": 1.909323116219668, + "grad_norm": 0.2848664990443074, + "learning_rate": 3.1553516780129155e-05, + "loss": 0.3151, + "step": 1495 + }, + { + "epoch": 1.9106002554278416, + "grad_norm": 0.362143477430031, + "learning_rate": 3.1488617584620394e-05, + "loss": 0.3686, + "step": 1496 + }, + { + "epoch": 1.9118773946360155, + "grad_norm": 0.25284094576387084, + "learning_rate": 3.1423754510099304e-05, + "loss": 0.2539, + "step": 1497 + }, + { + "epoch": 1.9131545338441889, + "grad_norm": 0.2442826801922134, + "learning_rate": 3.13589276831322e-05, + "loss": 0.2791, + "step": 1498 + }, + { + "epoch": 1.9144316730523627, + "grad_norm": 0.6369827762568181, + "learning_rate": 3.1294137230214736e-05, + "loss": 0.3304, + "step": 1499 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.2424683804311289, + "learning_rate": 3.12293832777716e-05, + "loss": 0.2679, + "step": 1500 + }, + { + "epoch": 1.91698595146871, + "grad_norm": 0.26494174204898324, + "learning_rate": 3.116466595215617e-05, + "loss": 0.2752, + "step": 1501 + }, + { + "epoch": 1.9182630906768838, + "grad_norm": 0.3148602259184026, + "learning_rate": 3.1099985379650456e-05, + "loss": 0.3274, + "step": 1502 + }, + { + "epoch": 1.9195402298850575, + "grad_norm": 0.2593136013852629, + "learning_rate": 3.103534168646466e-05, + "loss": 0.2616, + "step": 1503 + }, + { + "epoch": 1.920817369093231, + "grad_norm": 0.20683371851176902, + "learning_rate": 3.0970734998737095e-05, + "loss": 0.2372, + "step": 1504 + }, + { + "epoch": 1.922094508301405, + "grad_norm": 0.4474439990581004, + "learning_rate": 3.0906165442533844e-05, + "loss": 0.28, + "step": 1505 + }, + { + "epoch": 1.9233716475095786, + "grad_norm": 0.3367112759519742, + "learning_rate": 3.084163314384852e-05, + "loss": 0.3371, + "step": 1506 + }, + { + "epoch": 1.9246487867177522, + "grad_norm": 0.402665022109018, + "learning_rate": 3.077713822860204e-05, + "loss": 0.3265, + "step": 1507 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.31074911101540775, + "learning_rate": 3.071268082264241e-05, + "loss": 0.3241, + "step": 1508 + }, + { + "epoch": 1.9272030651340997, + "grad_norm": 0.27228617428758084, + "learning_rate": 3.0648261051744364e-05, + "loss": 0.243, + "step": 1509 + }, + { + "epoch": 1.9284802043422733, + "grad_norm": 0.2402384790252686, + "learning_rate": 3.05838790416093e-05, + "loss": 0.2335, + "step": 1510 + }, + { + "epoch": 1.9297573435504471, + "grad_norm": 0.32377371905646507, + "learning_rate": 3.0519534917864876e-05, + "loss": 0.3075, + "step": 1511 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.2973646048235486, + "learning_rate": 3.0455228806064802e-05, + "loss": 0.3124, + "step": 1512 + }, + { + "epoch": 1.9323116219667944, + "grad_norm": 0.23207648109712306, + "learning_rate": 3.0390960831688676e-05, + "loss": 0.2731, + "step": 1513 + }, + { + "epoch": 1.933588761174968, + "grad_norm": 0.2936159570423439, + "learning_rate": 3.032673112014166e-05, + "loss": 0.2724, + "step": 1514 + }, + { + "epoch": 1.9348659003831417, + "grad_norm": 0.25954890674292214, + "learning_rate": 3.0262539796754207e-05, + "loss": 0.231, + "step": 1515 + }, + { + "epoch": 1.9361430395913155, + "grad_norm": 0.2772863058640377, + "learning_rate": 3.0198386986781912e-05, + "loss": 0.2889, + "step": 1516 + }, + { + "epoch": 1.9374201787994891, + "grad_norm": 0.2944979472530951, + "learning_rate": 3.013427281540523e-05, + "loss": 0.272, + "step": 1517 + }, + { + "epoch": 1.9386973180076628, + "grad_norm": 0.2404953425572906, + "learning_rate": 3.0070197407729173e-05, + "loss": 0.2512, + "step": 1518 + }, + { + "epoch": 1.9399744572158366, + "grad_norm": 0.24096931923126788, + "learning_rate": 3.0006160888783152e-05, + "loss": 0.2473, + "step": 1519 + }, + { + "epoch": 1.9412515964240102, + "grad_norm": 0.23642975709515893, + "learning_rate": 2.9942163383520684e-05, + "loss": 0.2765, + "step": 1520 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 0.2868095184019939, + "learning_rate": 2.9878205016819132e-05, + "loss": 0.2822, + "step": 1521 + }, + { + "epoch": 1.9438058748403577, + "grad_norm": 0.20132303041765306, + "learning_rate": 2.9814285913479544e-05, + "loss": 0.2465, + "step": 1522 + }, + { + "epoch": 1.9450830140485313, + "grad_norm": 0.27838241011588166, + "learning_rate": 2.9750406198226332e-05, + "loss": 0.2947, + "step": 1523 + }, + { + "epoch": 1.946360153256705, + "grad_norm": 0.22271039935761652, + "learning_rate": 2.9686565995707005e-05, + "loss": 0.237, + "step": 1524 + }, + { + "epoch": 1.9476372924648788, + "grad_norm": 0.30263208113918255, + "learning_rate": 2.962276543049204e-05, + "loss": 0.3458, + "step": 1525 + }, + { + "epoch": 1.9489144316730522, + "grad_norm": 0.2518859128873575, + "learning_rate": 2.9559004627074534e-05, + "loss": 0.2523, + "step": 1526 + }, + { + "epoch": 1.950191570881226, + "grad_norm": 0.2263779851530161, + "learning_rate": 2.949528370986999e-05, + "loss": 0.2832, + "step": 1527 + }, + { + "epoch": 1.9514687100893997, + "grad_norm": 0.2603111175693102, + "learning_rate": 2.9431602803216096e-05, + "loss": 0.2558, + "step": 1528 + }, + { + "epoch": 1.9527458492975733, + "grad_norm": 0.2535749546909697, + "learning_rate": 2.9367962031372488e-05, + "loss": 0.2757, + "step": 1529 + }, + { + "epoch": 1.9540229885057472, + "grad_norm": 0.25062865424179687, + "learning_rate": 2.9304361518520445e-05, + "loss": 0.301, + "step": 1530 + }, + { + "epoch": 1.9553001277139208, + "grad_norm": 0.3158045990879403, + "learning_rate": 2.9240801388762707e-05, + "loss": 0.2742, + "step": 1531 + }, + { + "epoch": 1.9565772669220944, + "grad_norm": 0.33761826825912694, + "learning_rate": 2.9177281766123253e-05, + "loss": 0.3396, + "step": 1532 + }, + { + "epoch": 1.9578544061302683, + "grad_norm": 0.22044844276294454, + "learning_rate": 2.911380277454695e-05, + "loss": 0.2639, + "step": 1533 + }, + { + "epoch": 1.959131545338442, + "grad_norm": 0.28857845520066566, + "learning_rate": 2.905036453789942e-05, + "loss": 0.3222, + "step": 1534 + }, + { + "epoch": 1.9604086845466155, + "grad_norm": 0.31657856467534357, + "learning_rate": 2.8986967179966778e-05, + "loss": 0.3319, + "step": 1535 + }, + { + "epoch": 1.9616858237547894, + "grad_norm": 0.5376166863699816, + "learning_rate": 2.8923610824455338e-05, + "loss": 0.3248, + "step": 1536 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.272439166173279, + "learning_rate": 2.8860295594991438e-05, + "loss": 0.2452, + "step": 1537 + }, + { + "epoch": 1.9642401021711366, + "grad_norm": 0.28042816880366955, + "learning_rate": 2.8797021615121133e-05, + "loss": 0.288, + "step": 1538 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 0.23613969948384378, + "learning_rate": 2.873378900831003e-05, + "loss": 0.2619, + "step": 1539 + }, + { + "epoch": 1.966794380587484, + "grad_norm": 0.2220332719602242, + "learning_rate": 2.8670597897942963e-05, + "loss": 0.2845, + "step": 1540 + }, + { + "epoch": 1.9680715197956578, + "grad_norm": 0.1953640141701118, + "learning_rate": 2.8607448407323824e-05, + "loss": 0.2453, + "step": 1541 + }, + { + "epoch": 1.9693486590038314, + "grad_norm": 0.2728574145750964, + "learning_rate": 2.854434065967528e-05, + "loss": 0.3039, + "step": 1542 + }, + { + "epoch": 1.970625798212005, + "grad_norm": 0.31773843728594725, + "learning_rate": 2.8481274778138567e-05, + "loss": 0.2986, + "step": 1543 + }, + { + "epoch": 1.9719029374201789, + "grad_norm": 0.3229143145409194, + "learning_rate": 2.8418250885773156e-05, + "loss": 0.2938, + "step": 1544 + }, + { + "epoch": 1.9731800766283525, + "grad_norm": 0.21959426082243635, + "learning_rate": 2.835526910555669e-05, + "loss": 0.2809, + "step": 1545 + }, + { + "epoch": 1.9744572158365261, + "grad_norm": 0.22969047232081516, + "learning_rate": 2.82923295603846e-05, + "loss": 0.2602, + "step": 1546 + }, + { + "epoch": 1.9757343550447, + "grad_norm": 0.2496111671512901, + "learning_rate": 2.822943237306983e-05, + "loss": 0.2577, + "step": 1547 + }, + { + "epoch": 1.9770114942528736, + "grad_norm": 0.22099454724830694, + "learning_rate": 2.8166577666342787e-05, + "loss": 0.2688, + "step": 1548 + }, + { + "epoch": 1.9782886334610472, + "grad_norm": 0.2181565603186376, + "learning_rate": 2.810376556285094e-05, + "loss": 0.2269, + "step": 1549 + }, + { + "epoch": 1.979565772669221, + "grad_norm": 0.23236360601562728, + "learning_rate": 2.804099618515858e-05, + "loss": 0.2812, + "step": 1550 + }, + { + "epoch": 1.9808429118773945, + "grad_norm": 0.29237213695335407, + "learning_rate": 2.7978269655746668e-05, + "loss": 0.2417, + "step": 1551 + }, + { + "epoch": 1.9821200510855683, + "grad_norm": 0.24520647985368385, + "learning_rate": 2.7915586097012614e-05, + "loss": 0.2678, + "step": 1552 + }, + { + "epoch": 1.9833971902937422, + "grad_norm": 0.4922660249624993, + "learning_rate": 2.7852945631269866e-05, + "loss": 0.2615, + "step": 1553 + }, + { + "epoch": 1.9846743295019156, + "grad_norm": 0.2711427238275631, + "learning_rate": 2.7790348380747834e-05, + "loss": 0.2818, + "step": 1554 + }, + { + "epoch": 1.9859514687100894, + "grad_norm": 0.2883769649715117, + "learning_rate": 2.7727794467591683e-05, + "loss": 0.2952, + "step": 1555 + }, + { + "epoch": 1.987228607918263, + "grad_norm": 0.3605329261496836, + "learning_rate": 2.766528401386187e-05, + "loss": 0.2698, + "step": 1556 + }, + { + "epoch": 1.9885057471264367, + "grad_norm": 0.26385266435333565, + "learning_rate": 2.7602817141534143e-05, + "loss": 0.2631, + "step": 1557 + }, + { + "epoch": 1.9897828863346105, + "grad_norm": 0.26215103860004363, + "learning_rate": 2.75403939724992e-05, + "loss": 0.2708, + "step": 1558 + }, + { + "epoch": 1.9910600255427842, + "grad_norm": 0.3419987073929853, + "learning_rate": 2.7478014628562444e-05, + "loss": 0.2944, + "step": 1559 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.2611535539379696, + "learning_rate": 2.741567923144376e-05, + "loss": 0.2819, + "step": 1560 + }, + { + "epoch": 1.9936143039591316, + "grad_norm": 0.33134144681133754, + "learning_rate": 2.7353387902777306e-05, + "loss": 0.3242, + "step": 1561 + }, + { + "epoch": 1.9948914431673053, + "grad_norm": 0.2405464792016204, + "learning_rate": 2.7291140764111223e-05, + "loss": 0.2514, + "step": 1562 + }, + { + "epoch": 1.996168582375479, + "grad_norm": 0.6307493175087922, + "learning_rate": 2.722893793690744e-05, + "loss": 0.2919, + "step": 1563 + }, + { + "epoch": 1.9974457215836527, + "grad_norm": 0.23250148418060285, + "learning_rate": 2.716677954254141e-05, + "loss": 0.2557, + "step": 1564 + }, + { + "epoch": 1.9987228607918262, + "grad_norm": 0.24185583797066315, + "learning_rate": 2.7104665702301897e-05, + "loss": 0.2345, + "step": 1565 + }, + { + "epoch": 2.0, + "grad_norm": 0.21610828595707748, + "learning_rate": 2.704259653739074e-05, + "loss": 0.2436, + "step": 1566 + } + ], + "logging_steps": 1, + "max_steps": 2349, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 783, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7269036727468032.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}