{ "best_metric": 6.472244739532471, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.08817202568461421, "eval_steps": 25, "global_step": 342, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025781294059828714, "grad_norm": 3.1607115268707275, "learning_rate": 7.692307692307694e-06, "loss": 9.2193, "step": 1 }, { "epoch": 0.00025781294059828714, "eval_loss": 9.055607795715332, "eval_runtime": 0.2023, "eval_samples_per_second": 247.203, "eval_steps_per_second": 64.273, "step": 1 }, { "epoch": 0.0005156258811965743, "grad_norm": 3.66351056098938, "learning_rate": 1.5384615384615387e-05, "loss": 8.9457, "step": 2 }, { "epoch": 0.0007734388217948614, "grad_norm": 4.107652187347412, "learning_rate": 2.307692307692308e-05, "loss": 8.7498, "step": 3 }, { "epoch": 0.0010312517623931486, "grad_norm": 3.748608112335205, "learning_rate": 3.0769230769230774e-05, "loss": 8.5716, "step": 4 }, { "epoch": 0.0012890647029914358, "grad_norm": 3.6365067958831787, "learning_rate": 3.846153846153846e-05, "loss": 8.5622, "step": 5 }, { "epoch": 0.0015468776435897229, "grad_norm": 3.7735841274261475, "learning_rate": 4.615384615384616e-05, "loss": 8.6775, "step": 6 }, { "epoch": 0.00180469058418801, "grad_norm": 3.588484048843384, "learning_rate": 5.384615384615385e-05, "loss": 8.3885, "step": 7 }, { "epoch": 0.002062503524786297, "grad_norm": 3.3608758449554443, "learning_rate": 6.153846153846155e-05, "loss": 8.3501, "step": 8 }, { "epoch": 0.0023203164653845844, "grad_norm": 3.627272367477417, "learning_rate": 6.923076923076924e-05, "loss": 8.522, "step": 9 }, { "epoch": 0.0025781294059828716, "grad_norm": 4.065791130065918, "learning_rate": 7.692307692307693e-05, "loss": 8.7527, "step": 10 }, { "epoch": 0.002835942346581159, "grad_norm": 4.912317276000977, "learning_rate": 8.461538461538461e-05, "loss": 9.2742, "step": 11 }, { "epoch": 0.0030937552871794457, "grad_norm": 4.300169467926025, "learning_rate": 9.230769230769232e-05, "loss": 9.17, "step": 12 }, { "epoch": 0.003351568227777733, "grad_norm": 3.1629996299743652, "learning_rate": 0.0001, "loss": 9.0605, "step": 13 }, { "epoch": 0.00360938116837602, "grad_norm": 3.009999990463257, "learning_rate": 9.999794842608933e-05, "loss": 8.6683, "step": 14 }, { "epoch": 0.0038671941089743075, "grad_norm": 3.1894335746765137, "learning_rate": 9.9991793891422e-05, "loss": 8.3439, "step": 15 }, { "epoch": 0.004125007049572594, "grad_norm": 2.9719841480255127, "learning_rate": 9.998153695717504e-05, "loss": 8.2405, "step": 16 }, { "epoch": 0.0043828199901708815, "grad_norm": 2.7855825424194336, "learning_rate": 9.99671785585866e-05, "loss": 8.076, "step": 17 }, { "epoch": 0.004640632930769169, "grad_norm": 2.5612030029296875, "learning_rate": 9.994872000487073e-05, "loss": 7.9031, "step": 18 }, { "epoch": 0.004898445871367456, "grad_norm": 2.306985855102539, "learning_rate": 9.992616297909798e-05, "loss": 7.796, "step": 19 }, { "epoch": 0.005156258811965743, "grad_norm": 2.172031879425049, "learning_rate": 9.98995095380419e-05, "loss": 7.8418, "step": 20 }, { "epoch": 0.0054140717525640305, "grad_norm": 2.1543822288513184, "learning_rate": 9.98687621119916e-05, "loss": 7.8234, "step": 21 }, { "epoch": 0.005671884693162318, "grad_norm": 2.1653501987457275, "learning_rate": 9.983392350453004e-05, "loss": 7.8711, "step": 22 }, { "epoch": 0.005929697633760604, "grad_norm": 2.52407169342041, "learning_rate": 9.97949968922785e-05, "loss": 8.3108, "step": 23 }, { "epoch": 0.006187510574358891, "grad_norm": 2.5219650268554688, "learning_rate": 9.975198582460683e-05, "loss": 8.3452, "step": 24 }, { "epoch": 0.006445323514957179, "grad_norm": 3.255831480026245, "learning_rate": 9.970489422330991e-05, "loss": 7.6869, "step": 25 }, { "epoch": 0.006445323514957179, "eval_loss": 7.943339824676514, "eval_runtime": 0.1981, "eval_samples_per_second": 252.354, "eval_steps_per_second": 65.612, "step": 25 }, { "epoch": 0.006703136455555466, "grad_norm": 1.9119776487350464, "learning_rate": 9.965372638224995e-05, "loss": 8.4508, "step": 26 }, { "epoch": 0.006960949396153753, "grad_norm": 2.035367488861084, "learning_rate": 9.959848696696512e-05, "loss": 7.8487, "step": 27 }, { "epoch": 0.00721876233675204, "grad_norm": 2.213364601135254, "learning_rate": 9.953918101424397e-05, "loss": 7.6929, "step": 28 }, { "epoch": 0.007476575277350328, "grad_norm": 2.124891996383667, "learning_rate": 9.947581393166631e-05, "loss": 7.6392, "step": 29 }, { "epoch": 0.007734388217948615, "grad_norm": 2.085674285888672, "learning_rate": 9.940839149711009e-05, "loss": 7.5293, "step": 30 }, { "epoch": 0.007992201158546902, "grad_norm": 2.0204050540924072, "learning_rate": 9.93369198582245e-05, "loss": 7.2825, "step": 31 }, { "epoch": 0.008250014099145189, "grad_norm": 1.8093018531799316, "learning_rate": 9.926140553186957e-05, "loss": 7.2742, "step": 32 }, { "epoch": 0.008507827039743477, "grad_norm": 1.7855942249298096, "learning_rate": 9.918185540352179e-05, "loss": 7.1661, "step": 33 }, { "epoch": 0.008765639980341763, "grad_norm": 1.520045280456543, "learning_rate": 9.90982767266464e-05, "loss": 7.3265, "step": 34 }, { "epoch": 0.009023452920940051, "grad_norm": 1.6203173398971558, "learning_rate": 9.901067712203602e-05, "loss": 7.6529, "step": 35 }, { "epoch": 0.009281265861538338, "grad_norm": 1.9243559837341309, "learning_rate": 9.891906457711565e-05, "loss": 7.9287, "step": 36 }, { "epoch": 0.009539078802136624, "grad_norm": 2.310131072998047, "learning_rate": 9.882344744521452e-05, "loss": 8.0683, "step": 37 }, { "epoch": 0.009796891742734912, "grad_norm": 1.5040435791015625, "learning_rate": 9.872383444480428e-05, "loss": 8.3707, "step": 38 }, { "epoch": 0.010054704683333198, "grad_norm": 1.3570293188095093, "learning_rate": 9.862023465870421e-05, "loss": 7.8076, "step": 39 }, { "epoch": 0.010312517623931487, "grad_norm": 1.325873851776123, "learning_rate": 9.85126575332529e-05, "loss": 7.3855, "step": 40 }, { "epoch": 0.010570330564529773, "grad_norm": 1.2246894836425781, "learning_rate": 9.840111287744695e-05, "loss": 7.2972, "step": 41 }, { "epoch": 0.010828143505128061, "grad_norm": 1.1898456811904907, "learning_rate": 9.828561086204663e-05, "loss": 7.2356, "step": 42 }, { "epoch": 0.011085956445726347, "grad_norm": 1.1440843343734741, "learning_rate": 9.816616201864844e-05, "loss": 7.0368, "step": 43 }, { "epoch": 0.011343769386324636, "grad_norm": 1.1613914966583252, "learning_rate": 9.804277723872487e-05, "loss": 7.053, "step": 44 }, { "epoch": 0.011601582326922922, "grad_norm": 1.0742549896240234, "learning_rate": 9.791546777263124e-05, "loss": 7.0793, "step": 45 }, { "epoch": 0.011859395267521208, "grad_norm": 1.1409205198287964, "learning_rate": 9.778424522858002e-05, "loss": 7.094, "step": 46 }, { "epoch": 0.012117208208119496, "grad_norm": 1.137558937072754, "learning_rate": 9.764912157158217e-05, "loss": 7.2082, "step": 47 }, { "epoch": 0.012375021148717783, "grad_norm": 1.3125481605529785, "learning_rate": 9.751010912235635e-05, "loss": 7.4798, "step": 48 }, { "epoch": 0.012632834089316071, "grad_norm": 1.7244569063186646, "learning_rate": 9.736722055620542e-05, "loss": 7.8012, "step": 49 }, { "epoch": 0.012890647029914357, "grad_norm": 2.4806711673736572, "learning_rate": 9.722046890186068e-05, "loss": 7.3645, "step": 50 }, { "epoch": 0.012890647029914357, "eval_loss": 7.339738845825195, "eval_runtime": 0.1966, "eval_samples_per_second": 254.297, "eval_steps_per_second": 66.117, "step": 50 }, { "epoch": 0.013148459970512645, "grad_norm": 1.2124334573745728, "learning_rate": 9.706986754029392e-05, "loss": 7.9549, "step": 51 }, { "epoch": 0.013406272911110932, "grad_norm": 1.098710536956787, "learning_rate": 9.691543020349733e-05, "loss": 7.2316, "step": 52 }, { "epoch": 0.01366408585170922, "grad_norm": 1.180907964706421, "learning_rate": 9.675717097323142e-05, "loss": 7.0804, "step": 53 }, { "epoch": 0.013921898792307506, "grad_norm": 1.2475526332855225, "learning_rate": 9.659510427974095e-05, "loss": 6.8687, "step": 54 }, { "epoch": 0.014179711732905793, "grad_norm": 1.028371810913086, "learning_rate": 9.642924490043929e-05, "loss": 6.9329, "step": 55 }, { "epoch": 0.01443752467350408, "grad_norm": 0.9092774391174316, "learning_rate": 9.625960795856091e-05, "loss": 6.8273, "step": 56 }, { "epoch": 0.014695337614102367, "grad_norm": 0.8463051319122314, "learning_rate": 9.608620892178242e-05, "loss": 6.7387, "step": 57 }, { "epoch": 0.014953150554700655, "grad_norm": 0.8005713820457458, "learning_rate": 9.590906360081227e-05, "loss": 6.7106, "step": 58 }, { "epoch": 0.015210963495298942, "grad_norm": 0.8417860865592957, "learning_rate": 9.572818814794909e-05, "loss": 6.8648, "step": 59 }, { "epoch": 0.01546877643589723, "grad_norm": 1.1298677921295166, "learning_rate": 9.554359905560886e-05, "loss": 7.0416, "step": 60 }, { "epoch": 0.015726589376495518, "grad_norm": 1.2911885976791382, "learning_rate": 9.535531315482122e-05, "loss": 7.4734, "step": 61 }, { "epoch": 0.015984402317093804, "grad_norm": 1.7200947999954224, "learning_rate": 9.516334761369466e-05, "loss": 7.5116, "step": 62 }, { "epoch": 0.01624221525769209, "grad_norm": 1.3757171630859375, "learning_rate": 9.496771993585124e-05, "loss": 8.0631, "step": 63 }, { "epoch": 0.016500028198290377, "grad_norm": 1.0369426012039185, "learning_rate": 9.476844795883051e-05, "loss": 7.375, "step": 64 }, { "epoch": 0.016757841138888663, "grad_norm": 1.0387243032455444, "learning_rate": 9.456554985246311e-05, "loss": 7.0062, "step": 65 }, { "epoch": 0.017015654079486953, "grad_norm": 1.0140559673309326, "learning_rate": 9.435904411721399e-05, "loss": 6.8905, "step": 66 }, { "epoch": 0.01727346702008524, "grad_norm": 0.9097711443901062, "learning_rate": 9.414894958249556e-05, "loss": 6.7577, "step": 67 }, { "epoch": 0.017531279960683526, "grad_norm": 0.7602633833885193, "learning_rate": 9.393528540495072e-05, "loss": 6.7336, "step": 68 }, { "epoch": 0.017789092901281812, "grad_norm": 0.7813143730163574, "learning_rate": 9.371807106670628e-05, "loss": 6.7733, "step": 69 }, { "epoch": 0.018046905841880102, "grad_norm": 0.7254886031150818, "learning_rate": 9.349732637359642e-05, "loss": 6.6369, "step": 70 }, { "epoch": 0.01830471878247839, "grad_norm": 0.9275261163711548, "learning_rate": 9.327307145335683e-05, "loss": 6.6906, "step": 71 }, { "epoch": 0.018562531723076675, "grad_norm": 0.98789381980896, "learning_rate": 9.304532675378947e-05, "loss": 6.8449, "step": 72 }, { "epoch": 0.01882034466367496, "grad_norm": 1.0433034896850586, "learning_rate": 9.281411304089808e-05, "loss": 7.1292, "step": 73 }, { "epoch": 0.019078157604273248, "grad_norm": 1.4908647537231445, "learning_rate": 9.257945139699469e-05, "loss": 7.4313, "step": 74 }, { "epoch": 0.019335970544871538, "grad_norm": 2.794236660003662, "learning_rate": 9.234136321877736e-05, "loss": 6.9274, "step": 75 }, { "epoch": 0.019335970544871538, "eval_loss": 7.053930282592773, "eval_runtime": 0.1971, "eval_samples_per_second": 253.631, "eval_steps_per_second": 65.944, "step": 75 }, { "epoch": 0.019593783485469824, "grad_norm": 1.3313510417938232, "learning_rate": 9.209987021537921e-05, "loss": 7.6916, "step": 76 }, { "epoch": 0.01985159642606811, "grad_norm": 1.082716464996338, "learning_rate": 9.185499440638893e-05, "loss": 7.1731, "step": 77 }, { "epoch": 0.020109409366666397, "grad_norm": 1.122697114944458, "learning_rate": 9.1606758119843e-05, "loss": 6.8733, "step": 78 }, { "epoch": 0.020367222307264687, "grad_norm": 0.9345807433128357, "learning_rate": 9.135518399018983e-05, "loss": 6.6494, "step": 79 }, { "epoch": 0.020625035247862973, "grad_norm": 0.9072409272193909, "learning_rate": 9.110029495622591e-05, "loss": 6.6066, "step": 80 }, { "epoch": 0.02088284818846126, "grad_norm": 0.7997028231620789, "learning_rate": 9.084211425900422e-05, "loss": 6.4625, "step": 81 }, { "epoch": 0.021140661129059546, "grad_norm": 0.6371345520019531, "learning_rate": 9.058066543971511e-05, "loss": 6.5979, "step": 82 }, { "epoch": 0.021398474069657832, "grad_norm": 0.8078920841217041, "learning_rate": 9.031597233753974e-05, "loss": 6.5557, "step": 83 }, { "epoch": 0.021656287010256122, "grad_norm": 0.8252989649772644, "learning_rate": 9.00480590874765e-05, "loss": 6.5391, "step": 84 }, { "epoch": 0.02191409995085441, "grad_norm": 1.0465108156204224, "learning_rate": 8.977695011814019e-05, "loss": 6.8797, "step": 85 }, { "epoch": 0.022171912891452695, "grad_norm": 1.207483172416687, "learning_rate": 8.950267014953478e-05, "loss": 7.3097, "step": 86 }, { "epoch": 0.02242972583205098, "grad_norm": 2.0094501972198486, "learning_rate": 8.922524419079928e-05, "loss": 7.4981, "step": 87 }, { "epoch": 0.02268753877264927, "grad_norm": 1.394856572151184, "learning_rate": 8.89446975379274e-05, "loss": 7.8859, "step": 88 }, { "epoch": 0.022945351713247557, "grad_norm": 1.0990052223205566, "learning_rate": 8.866105577146111e-05, "loss": 7.2602, "step": 89 }, { "epoch": 0.023203164653845844, "grad_norm": 0.845337450504303, "learning_rate": 8.83743447541581e-05, "loss": 6.7209, "step": 90 }, { "epoch": 0.02346097759444413, "grad_norm": 1.0816878080368042, "learning_rate": 8.80845906286336e-05, "loss": 6.8582, "step": 91 }, { "epoch": 0.023718790535042417, "grad_norm": 0.9941583275794983, "learning_rate": 8.77918198149767e-05, "loss": 6.6235, "step": 92 }, { "epoch": 0.023976603475640706, "grad_norm": 0.8640435934066772, "learning_rate": 8.749605900834131e-05, "loss": 6.6139, "step": 93 }, { "epoch": 0.024234416416238993, "grad_norm": 0.6847352385520935, "learning_rate": 8.719733517651211e-05, "loss": 6.5792, "step": 94 }, { "epoch": 0.02449222935683728, "grad_norm": 0.7488027811050415, "learning_rate": 8.689567555744552e-05, "loss": 6.4055, "step": 95 }, { "epoch": 0.024750042297435566, "grad_norm": 0.7001090049743652, "learning_rate": 8.659110765678615e-05, "loss": 6.5318, "step": 96 }, { "epoch": 0.025007855238033855, "grad_norm": 1.1271648406982422, "learning_rate": 8.628365924535892e-05, "loss": 6.7955, "step": 97 }, { "epoch": 0.025265668178632142, "grad_norm": 1.2209476232528687, "learning_rate": 8.597335835663663e-05, "loss": 6.9507, "step": 98 }, { "epoch": 0.025523481119230428, "grad_norm": 1.3603824377059937, "learning_rate": 8.566023328418411e-05, "loss": 7.2596, "step": 99 }, { "epoch": 0.025781294059828715, "grad_norm": 2.483475923538208, "learning_rate": 8.534431257907822e-05, "loss": 6.8387, "step": 100 }, { "epoch": 0.025781294059828715, "eval_loss": 6.887165069580078, "eval_runtime": 0.2078, "eval_samples_per_second": 240.66, "eval_steps_per_second": 62.572, "step": 100 }, { "epoch": 0.026039107000427, "grad_norm": 1.3542296886444092, "learning_rate": 8.502562504730458e-05, "loss": 7.6824, "step": 101 }, { "epoch": 0.02629691994102529, "grad_norm": 1.0464502573013306, "learning_rate": 8.4704199747131e-05, "loss": 6.9336, "step": 102 }, { "epoch": 0.026554732881623577, "grad_norm": 0.9437260031700134, "learning_rate": 8.438006598645793e-05, "loss": 6.6101, "step": 103 }, { "epoch": 0.026812545822221864, "grad_norm": 1.1800928115844727, "learning_rate": 8.405325332014611e-05, "loss": 6.5754, "step": 104 }, { "epoch": 0.02707035876282015, "grad_norm": 0.8258107900619507, "learning_rate": 8.372379154732179e-05, "loss": 6.5733, "step": 105 }, { "epoch": 0.02732817170341844, "grad_norm": 0.747916579246521, "learning_rate": 8.339171070865949e-05, "loss": 6.54, "step": 106 }, { "epoch": 0.027585984644016726, "grad_norm": 0.6527675986289978, "learning_rate": 8.305704108364301e-05, "loss": 6.4044, "step": 107 }, { "epoch": 0.027843797584615013, "grad_norm": 0.8093273639678955, "learning_rate": 8.271981318780441e-05, "loss": 6.4507, "step": 108 }, { "epoch": 0.0281016105252133, "grad_norm": 0.7874107956886292, "learning_rate": 8.23800577699416e-05, "loss": 6.5594, "step": 109 }, { "epoch": 0.028359423465811585, "grad_norm": 1.2953649759292603, "learning_rate": 8.203780580931463e-05, "loss": 6.8035, "step": 110 }, { "epoch": 0.028617236406409875, "grad_norm": 1.3530511856079102, "learning_rate": 8.169308851282099e-05, "loss": 7.0729, "step": 111 }, { "epoch": 0.02887504934700816, "grad_norm": 1.6297696828842163, "learning_rate": 8.134593731215008e-05, "loss": 7.1729, "step": 112 }, { "epoch": 0.029132862287606448, "grad_norm": 1.4375627040863037, "learning_rate": 8.099638386091736e-05, "loss": 7.9231, "step": 113 }, { "epoch": 0.029390675228204734, "grad_norm": 1.2975345849990845, "learning_rate": 8.064446003177789e-05, "loss": 7.22, "step": 114 }, { "epoch": 0.029648488168803024, "grad_norm": 0.8513131141662598, "learning_rate": 8.029019791352047e-05, "loss": 6.6561, "step": 115 }, { "epoch": 0.02990630110940131, "grad_norm": 1.3775849342346191, "learning_rate": 7.993362980814148e-05, "loss": 6.5172, "step": 116 }, { "epoch": 0.030164114049999597, "grad_norm": 1.3073246479034424, "learning_rate": 7.95747882278997e-05, "loss": 6.5197, "step": 117 }, { "epoch": 0.030421926990597883, "grad_norm": 1.0891661643981934, "learning_rate": 7.921370589235178e-05, "loss": 6.4481, "step": 118 }, { "epoch": 0.03067973993119617, "grad_norm": 0.9984277486801147, "learning_rate": 7.885041572536877e-05, "loss": 6.3207, "step": 119 }, { "epoch": 0.03093755287179446, "grad_norm": 0.7283719182014465, "learning_rate": 7.848495085213415e-05, "loss": 6.3163, "step": 120 }, { "epoch": 0.031195365812392746, "grad_norm": 0.7278897166252136, "learning_rate": 7.811734459612347e-05, "loss": 6.2454, "step": 121 }, { "epoch": 0.031453178752991036, "grad_norm": 1.079011082649231, "learning_rate": 7.774763047606578e-05, "loss": 6.5563, "step": 122 }, { "epoch": 0.03171099169358932, "grad_norm": 1.3959095478057861, "learning_rate": 7.737584220288748e-05, "loss": 6.7861, "step": 123 }, { "epoch": 0.03196880463418761, "grad_norm": 1.5843003988265991, "learning_rate": 7.700201367663837e-05, "loss": 7.1191, "step": 124 }, { "epoch": 0.03222661757478589, "grad_norm": 2.752187728881836, "learning_rate": 7.662617898340078e-05, "loss": 6.4137, "step": 125 }, { "epoch": 0.03222661757478589, "eval_loss": 6.769779205322266, "eval_runtime": 0.1975, "eval_samples_per_second": 253.109, "eval_steps_per_second": 65.808, "step": 125 }, { "epoch": 0.03248443051538418, "grad_norm": 1.2939435243606567, "learning_rate": 7.624837239218141e-05, "loss": 7.4833, "step": 126 }, { "epoch": 0.03274224345598247, "grad_norm": 1.0085922479629517, "learning_rate": 7.586862835178674e-05, "loss": 6.8356, "step": 127 }, { "epoch": 0.033000056396580754, "grad_norm": 0.7278715968132019, "learning_rate": 7.548698148768196e-05, "loss": 6.5165, "step": 128 }, { "epoch": 0.033257869337179044, "grad_norm": 0.9286090731620789, "learning_rate": 7.510346659883368e-05, "loss": 6.4907, "step": 129 }, { "epoch": 0.03351568227777733, "grad_norm": 0.7645915150642395, "learning_rate": 7.471811865453701e-05, "loss": 6.4245, "step": 130 }, { "epoch": 0.03377349521837562, "grad_norm": 0.897671103477478, "learning_rate": 7.433097279122709e-05, "loss": 6.3202, "step": 131 }, { "epoch": 0.03403130815897391, "grad_norm": 1.015424132347107, "learning_rate": 7.394206430927509e-05, "loss": 6.272, "step": 132 }, { "epoch": 0.03428912109957219, "grad_norm": 0.8185350894927979, "learning_rate": 7.35514286697697e-05, "loss": 6.2677, "step": 133 }, { "epoch": 0.03454693404017048, "grad_norm": 0.8126993179321289, "learning_rate": 7.315910149128366e-05, "loss": 6.2897, "step": 134 }, { "epoch": 0.03480474698076877, "grad_norm": 1.1658449172973633, "learning_rate": 7.276511854662603e-05, "loss": 6.703, "step": 135 }, { "epoch": 0.03506255992136705, "grad_norm": 1.35764479637146, "learning_rate": 7.23695157595804e-05, "loss": 6.8046, "step": 136 }, { "epoch": 0.03532037286196534, "grad_norm": 1.5801937580108643, "learning_rate": 7.197232920162928e-05, "loss": 6.9916, "step": 137 }, { "epoch": 0.035578185802563625, "grad_norm": 1.5324712991714478, "learning_rate": 7.157359508866511e-05, "loss": 7.8028, "step": 138 }, { "epoch": 0.035835998743161915, "grad_norm": 1.3282980918884277, "learning_rate": 7.117334977768807e-05, "loss": 6.9809, "step": 139 }, { "epoch": 0.036093811683760205, "grad_norm": 1.0740865468978882, "learning_rate": 7.077162976349094e-05, "loss": 6.5679, "step": 140 }, { "epoch": 0.03635162462435849, "grad_norm": 0.814476490020752, "learning_rate": 7.036847167533152e-05, "loss": 6.4312, "step": 141 }, { "epoch": 0.03660943756495678, "grad_norm": 0.8417059779167175, "learning_rate": 6.996391227359271e-05, "loss": 6.5706, "step": 142 }, { "epoch": 0.03686725050555506, "grad_norm": 0.834905207157135, "learning_rate": 6.955798844643073e-05, "loss": 6.3813, "step": 143 }, { "epoch": 0.03712506344615335, "grad_norm": 0.9371703267097473, "learning_rate": 6.915073720641145e-05, "loss": 6.3885, "step": 144 }, { "epoch": 0.03738287638675164, "grad_norm": 1.0399932861328125, "learning_rate": 6.874219568713575e-05, "loss": 6.2417, "step": 145 }, { "epoch": 0.03764068932734992, "grad_norm": 0.9664826393127441, "learning_rate": 6.833240113985353e-05, "loss": 6.204, "step": 146 }, { "epoch": 0.03789850226794821, "grad_norm": 0.8124173283576965, "learning_rate": 6.792139093006707e-05, "loss": 6.3953, "step": 147 }, { "epoch": 0.038156315208546496, "grad_norm": 0.9566540122032166, "learning_rate": 6.750920253412411e-05, "loss": 6.6005, "step": 148 }, { "epoch": 0.038414128149144786, "grad_norm": 1.3928314447402954, "learning_rate": 6.709587353580059e-05, "loss": 6.8851, "step": 149 }, { "epoch": 0.038671941089743075, "grad_norm": 2.3610668182373047, "learning_rate": 6.668144162287384e-05, "loss": 6.6259, "step": 150 }, { "epoch": 0.038671941089743075, "eval_loss": 6.683547496795654, "eval_runtime": 0.1978, "eval_samples_per_second": 252.821, "eval_steps_per_second": 65.734, "step": 150 }, { "epoch": 0.03892975403034136, "grad_norm": 1.4347138404846191, "learning_rate": 6.626594458368607e-05, "loss": 7.6672, "step": 151 }, { "epoch": 0.03918756697093965, "grad_norm": 1.3871098756790161, "learning_rate": 6.584942030369888e-05, "loss": 6.7745, "step": 152 }, { "epoch": 0.03944537991153794, "grad_norm": 1.2063919305801392, "learning_rate": 6.543190676203878e-05, "loss": 6.4942, "step": 153 }, { "epoch": 0.03970319285213622, "grad_norm": 1.050477147102356, "learning_rate": 6.501344202803414e-05, "loss": 6.476, "step": 154 }, { "epoch": 0.03996100579273451, "grad_norm": 0.7585989236831665, "learning_rate": 6.459406425774416e-05, "loss": 6.3065, "step": 155 }, { "epoch": 0.040218818733332794, "grad_norm": 0.840876042842865, "learning_rate": 6.417381169047958e-05, "loss": 6.4345, "step": 156 }, { "epoch": 0.040476631673931084, "grad_norm": 0.9215602278709412, "learning_rate": 6.375272264531607e-05, "loss": 6.2455, "step": 157 }, { "epoch": 0.04073444461452937, "grad_norm": 0.8796879053115845, "learning_rate": 6.333083551760029e-05, "loss": 6.3822, "step": 158 }, { "epoch": 0.040992257555127656, "grad_norm": 1.0320889949798584, "learning_rate": 6.290818877544883e-05, "loss": 6.1936, "step": 159 }, { "epoch": 0.041250070495725946, "grad_norm": 0.9528200030326843, "learning_rate": 6.248482095624086e-05, "loss": 6.7382, "step": 160 }, { "epoch": 0.04150788343632423, "grad_norm": 1.1317400932312012, "learning_rate": 6.206077066310398e-05, "loss": 6.8041, "step": 161 }, { "epoch": 0.04176569637692252, "grad_norm": 1.5851746797561646, "learning_rate": 6.163607656139461e-05, "loss": 6.9808, "step": 162 }, { "epoch": 0.04202350931752081, "grad_norm": 1.8282109498977661, "learning_rate": 6.121077737517221e-05, "loss": 7.6293, "step": 163 }, { "epoch": 0.04228132225811909, "grad_norm": 1.3249458074569702, "learning_rate": 6.078491188366859e-05, "loss": 7.061, "step": 164 }, { "epoch": 0.04253913519871738, "grad_norm": 1.1475296020507812, "learning_rate": 6.035851891775181e-05, "loss": 6.6687, "step": 165 }, { "epoch": 0.042796948139315664, "grad_norm": 1.112703561782837, "learning_rate": 5.993163735638562e-05, "loss": 6.4809, "step": 166 }, { "epoch": 0.043054761079913954, "grad_norm": 0.7547016739845276, "learning_rate": 5.950430612308444e-05, "loss": 6.4568, "step": 167 }, { "epoch": 0.043312574020512244, "grad_norm": 0.6325768232345581, "learning_rate": 5.907656418236426e-05, "loss": 6.2892, "step": 168 }, { "epoch": 0.04357038696111053, "grad_norm": 0.7706112861633301, "learning_rate": 5.864845053618976e-05, "loss": 6.1419, "step": 169 }, { "epoch": 0.04382819990170882, "grad_norm": 0.6347100734710693, "learning_rate": 5.822000422041818e-05, "loss": 6.1629, "step": 170 }, { "epoch": 0.04408601284230711, "grad_norm": 0.9902559518814087, "learning_rate": 5.77912643012399e-05, "loss": 6.1463, "step": 171 }, { "epoch": 0.04434382578290539, "grad_norm": 0.689171552658081, "learning_rate": 5.736226987161637e-05, "loss": 6.3779, "step": 172 }, { "epoch": 0.04460163872350368, "grad_norm": 0.916509747505188, "learning_rate": 5.693306004771557e-05, "loss": 6.609, "step": 173 }, { "epoch": 0.04485945166410196, "grad_norm": 1.2451108694076538, "learning_rate": 5.650367396534536e-05, "loss": 6.8909, "step": 174 }, { "epoch": 0.04511726460470025, "grad_norm": 3.0355923175811768, "learning_rate": 5.607415077638505e-05, "loss": 6.3187, "step": 175 }, { "epoch": 0.04511726460470025, "eval_loss": 6.613400936126709, "eval_runtime": 0.2021, "eval_samples_per_second": 247.373, "eval_steps_per_second": 64.317, "step": 175 }, { "epoch": 0.04537507754529854, "grad_norm": 1.5169610977172852, "learning_rate": 5.564452964521544e-05, "loss": 7.5505, "step": 176 }, { "epoch": 0.045632890485896825, "grad_norm": 1.139809250831604, "learning_rate": 5.5214849745147846e-05, "loss": 6.7593, "step": 177 }, { "epoch": 0.045890703426495115, "grad_norm": 0.8782240152359009, "learning_rate": 5.478515025485216e-05, "loss": 6.6047, "step": 178 }, { "epoch": 0.0461485163670934, "grad_norm": 0.8169703483581543, "learning_rate": 5.4355470354784564e-05, "loss": 6.3025, "step": 179 }, { "epoch": 0.04640632930769169, "grad_norm": 0.8208907246589661, "learning_rate": 5.392584922361497e-05, "loss": 6.3804, "step": 180 }, { "epoch": 0.04666414224828998, "grad_norm": 0.7347301840782166, "learning_rate": 5.349632603465466e-05, "loss": 6.3098, "step": 181 }, { "epoch": 0.04692195518888826, "grad_norm": 0.6223220825195312, "learning_rate": 5.306693995228443e-05, "loss": 6.1424, "step": 182 }, { "epoch": 0.04717976812948655, "grad_norm": 0.6562004685401917, "learning_rate": 5.263773012838365e-05, "loss": 6.2335, "step": 183 }, { "epoch": 0.04743758107008483, "grad_norm": 0.7599818110466003, "learning_rate": 5.220873569876012e-05, "loss": 6.2534, "step": 184 }, { "epoch": 0.04769539401068312, "grad_norm": 0.9290241003036499, "learning_rate": 5.177999577958184e-05, "loss": 6.6084, "step": 185 }, { "epoch": 0.04795320695128141, "grad_norm": 1.200290322303772, "learning_rate": 5.1351549463810256e-05, "loss": 6.7838, "step": 186 }, { "epoch": 0.048211019891879696, "grad_norm": 1.6216846704483032, "learning_rate": 5.0923435817635764e-05, "loss": 6.9713, "step": 187 }, { "epoch": 0.048468832832477986, "grad_norm": 2.117650032043457, "learning_rate": 5.049569387691557e-05, "loss": 7.6333, "step": 188 }, { "epoch": 0.048726645773076276, "grad_norm": 1.0466145277023315, "learning_rate": 5.0068362643614406e-05, "loss": 6.8795, "step": 189 }, { "epoch": 0.04898445871367456, "grad_norm": 0.8148478269577026, "learning_rate": 4.964148108224821e-05, "loss": 6.4997, "step": 190 }, { "epoch": 0.04924227165427285, "grad_norm": 0.839122474193573, "learning_rate": 4.9215088116331433e-05, "loss": 6.4388, "step": 191 }, { "epoch": 0.04950008459487113, "grad_norm": 0.7283769845962524, "learning_rate": 4.87892226248278e-05, "loss": 6.3257, "step": 192 }, { "epoch": 0.04975789753546942, "grad_norm": 0.6678646206855774, "learning_rate": 4.836392343860542e-05, "loss": 6.3411, "step": 193 }, { "epoch": 0.05001571047606771, "grad_norm": 0.615800678730011, "learning_rate": 4.793922933689601e-05, "loss": 6.1282, "step": 194 }, { "epoch": 0.050273523416665994, "grad_norm": 0.8255587220191956, "learning_rate": 4.751517904375915e-05, "loss": 6.1349, "step": 195 }, { "epoch": 0.050531336357264284, "grad_norm": 0.8644826412200928, "learning_rate": 4.709181122455118e-05, "loss": 6.0531, "step": 196 }, { "epoch": 0.05078914929786257, "grad_norm": 0.837914764881134, "learning_rate": 4.666916448239974e-05, "loss": 6.5514, "step": 197 }, { "epoch": 0.051046962238460857, "grad_norm": 1.1065987348556519, "learning_rate": 4.624727735468394e-05, "loss": 6.539, "step": 198 }, { "epoch": 0.051304775179059146, "grad_norm": 1.3638938665390015, "learning_rate": 4.5826188309520434e-05, "loss": 6.9151, "step": 199 }, { "epoch": 0.05156258811965743, "grad_norm": 2.777278184890747, "learning_rate": 4.5405935742255853e-05, "loss": 6.4344, "step": 200 }, { "epoch": 0.05156258811965743, "eval_loss": 6.562103748321533, "eval_runtime": 0.2023, "eval_samples_per_second": 247.117, "eval_steps_per_second": 64.25, "step": 200 }, { "epoch": 0.05182040106025572, "grad_norm": 1.8335626125335693, "learning_rate": 4.498655797196586e-05, "loss": 7.3341, "step": 201 }, { "epoch": 0.052078214000854, "grad_norm": 1.0078258514404297, "learning_rate": 4.456809323796123e-05, "loss": 6.5117, "step": 202 }, { "epoch": 0.05233602694145229, "grad_norm": 0.8034628033638, "learning_rate": 4.415057969630113e-05, "loss": 6.4178, "step": 203 }, { "epoch": 0.05259383988205058, "grad_norm": 0.8906092047691345, "learning_rate": 4.3734055416313945e-05, "loss": 6.2668, "step": 204 }, { "epoch": 0.052851652822648865, "grad_norm": 0.8904070258140564, "learning_rate": 4.331855837712618e-05, "loss": 6.3467, "step": 205 }, { "epoch": 0.053109465763247155, "grad_norm": 0.6881743669509888, "learning_rate": 4.2904126464199425e-05, "loss": 6.3189, "step": 206 }, { "epoch": 0.053367278703845444, "grad_norm": 0.6847382187843323, "learning_rate": 4.2490797465875895e-05, "loss": 6.1918, "step": 207 }, { "epoch": 0.05362509164444373, "grad_norm": 0.7974367737770081, "learning_rate": 4.207860906993293e-05, "loss": 6.0862, "step": 208 }, { "epoch": 0.05388290458504202, "grad_norm": 0.9184575080871582, "learning_rate": 4.166759886014648e-05, "loss": 6.336, "step": 209 }, { "epoch": 0.0541407175256403, "grad_norm": 1.1570357084274292, "learning_rate": 4.125780431286424e-05, "loss": 6.5235, "step": 210 }, { "epoch": 0.05439853046623859, "grad_norm": 1.1518020629882812, "learning_rate": 4.084926279358855e-05, "loss": 6.7934, "step": 211 }, { "epoch": 0.05465634340683688, "grad_norm": 1.3949551582336426, "learning_rate": 4.044201155356928e-05, "loss": 6.804, "step": 212 }, { "epoch": 0.05491415634743516, "grad_norm": 1.9449479579925537, "learning_rate": 4.003608772640729e-05, "loss": 7.5952, "step": 213 }, { "epoch": 0.05517196928803345, "grad_norm": 1.1688969135284424, "learning_rate": 3.96315283246685e-05, "loss": 6.8732, "step": 214 }, { "epoch": 0.055429782228631735, "grad_norm": 0.915627658367157, "learning_rate": 3.922837023650906e-05, "loss": 6.343, "step": 215 }, { "epoch": 0.055687595169230025, "grad_norm": 0.7137383222579956, "learning_rate": 3.882665022231193e-05, "loss": 6.1931, "step": 216 }, { "epoch": 0.055945408109828315, "grad_norm": 0.8037987351417542, "learning_rate": 3.8426404911334904e-05, "loss": 6.245, "step": 217 }, { "epoch": 0.0562032210504266, "grad_norm": 0.8728399872779846, "learning_rate": 3.802767079837074e-05, "loss": 6.2589, "step": 218 }, { "epoch": 0.05646103399102489, "grad_norm": 0.6477776765823364, "learning_rate": 3.763048424041962e-05, "loss": 6.0503, "step": 219 }, { "epoch": 0.05671884693162317, "grad_norm": 0.6034459471702576, "learning_rate": 3.7234881453373985e-05, "loss": 5.9153, "step": 220 }, { "epoch": 0.05697665987222146, "grad_norm": 0.6446836590766907, "learning_rate": 3.6840898508716356e-05, "loss": 6.2686, "step": 221 }, { "epoch": 0.05723447281281975, "grad_norm": 0.7486077547073364, "learning_rate": 3.644857133023032e-05, "loss": 6.1525, "step": 222 }, { "epoch": 0.05749228575341803, "grad_norm": 1.0023685693740845, "learning_rate": 3.605793569072493e-05, "loss": 6.5691, "step": 223 }, { "epoch": 0.05775009869401632, "grad_norm": 1.1698964834213257, "learning_rate": 3.5669027208772934e-05, "loss": 6.7088, "step": 224 }, { "epoch": 0.05800791163461461, "grad_norm": 2.711665153503418, "learning_rate": 3.5281881345463e-05, "loss": 6.3868, "step": 225 }, { "epoch": 0.05800791163461461, "eval_loss": 6.528235912322998, "eval_runtime": 0.1997, "eval_samples_per_second": 250.38, "eval_steps_per_second": 65.099, "step": 225 }, { "epoch": 0.058265724575212896, "grad_norm": 1.4225976467132568, "learning_rate": 3.4896533401166334e-05, "loss": 7.4112, "step": 226 }, { "epoch": 0.058523537515811186, "grad_norm": 0.9760959148406982, "learning_rate": 3.451301851231806e-05, "loss": 6.4246, "step": 227 }, { "epoch": 0.05878135045640947, "grad_norm": 0.8094965219497681, "learning_rate": 3.413137164821325e-05, "loss": 6.3329, "step": 228 }, { "epoch": 0.05903916339700776, "grad_norm": 0.7348743081092834, "learning_rate": 3.37516276078186e-05, "loss": 6.2501, "step": 229 }, { "epoch": 0.05929697633760605, "grad_norm": 0.8470206260681152, "learning_rate": 3.337382101659923e-05, "loss": 6.3584, "step": 230 }, { "epoch": 0.05955478927820433, "grad_norm": 0.6495732665061951, "learning_rate": 3.299798632336163e-05, "loss": 6.1146, "step": 231 }, { "epoch": 0.05981260221880262, "grad_norm": 0.6900242567062378, "learning_rate": 3.262415779711253e-05, "loss": 5.9503, "step": 232 }, { "epoch": 0.060070415159400904, "grad_norm": 0.77630615234375, "learning_rate": 3.225236952393422e-05, "loss": 6.2552, "step": 233 }, { "epoch": 0.060328228099999194, "grad_norm": 0.6507107019424438, "learning_rate": 3.188265540387655e-05, "loss": 6.2206, "step": 234 }, { "epoch": 0.060586041040597484, "grad_norm": 0.8207120299339294, "learning_rate": 3.1515049147865864e-05, "loss": 6.4454, "step": 235 }, { "epoch": 0.06084385398119577, "grad_norm": 1.0372916460037231, "learning_rate": 3.114958427463125e-05, "loss": 6.6704, "step": 236 }, { "epoch": 0.06110166692179406, "grad_norm": 1.6545346975326538, "learning_rate": 3.078629410764824e-05, "loss": 6.8683, "step": 237 }, { "epoch": 0.06135947986239234, "grad_norm": 1.7779477834701538, "learning_rate": 3.0425211772100304e-05, "loss": 7.6604, "step": 238 }, { "epoch": 0.06161729280299063, "grad_norm": 1.2553409337997437, "learning_rate": 3.0066370191858524e-05, "loss": 6.8474, "step": 239 }, { "epoch": 0.06187510574358892, "grad_norm": 0.7979155778884888, "learning_rate": 2.9709802086479537e-05, "loss": 6.4218, "step": 240 }, { "epoch": 0.0621329186841872, "grad_norm": 0.7603856921195984, "learning_rate": 2.935553996822212e-05, "loss": 6.3655, "step": 241 }, { "epoch": 0.06239073162478549, "grad_norm": 0.7401503324508667, "learning_rate": 2.900361613908267e-05, "loss": 6.255, "step": 242 }, { "epoch": 0.06264854456538378, "grad_norm": 0.6701139211654663, "learning_rate": 2.865406268784991e-05, "loss": 6.1851, "step": 243 }, { "epoch": 0.06290635750598207, "grad_norm": 0.6489132642745972, "learning_rate": 2.830691148717902e-05, "loss": 6.1713, "step": 244 }, { "epoch": 0.06316417044658035, "grad_norm": 0.6902171969413757, "learning_rate": 2.796219419068538e-05, "loss": 6.1282, "step": 245 }, { "epoch": 0.06342198338717864, "grad_norm": 0.6357892155647278, "learning_rate": 2.7619942230058415e-05, "loss": 6.039, "step": 246 }, { "epoch": 0.06367979632777693, "grad_norm": 0.7067226767539978, "learning_rate": 2.7280186812195595e-05, "loss": 6.2425, "step": 247 }, { "epoch": 0.06393760926837522, "grad_norm": 0.8542034029960632, "learning_rate": 2.6942958916356998e-05, "loss": 6.5863, "step": 248 }, { "epoch": 0.06419542220897351, "grad_norm": 1.2998591661453247, "learning_rate": 2.6608289291340527e-05, "loss": 6.8327, "step": 249 }, { "epoch": 0.06445323514957178, "grad_norm": 2.7439064979553223, "learning_rate": 2.6276208452678242e-05, "loss": 6.4475, "step": 250 }, { "epoch": 0.06445323514957178, "eval_loss": 6.501062393188477, "eval_runtime": 0.1991, "eval_samples_per_second": 251.167, "eval_steps_per_second": 65.304, "step": 250 }, { "epoch": 0.06471104809017007, "grad_norm": 1.4207818508148193, "learning_rate": 2.5946746679853894e-05, "loss": 7.3869, "step": 251 }, { "epoch": 0.06496886103076836, "grad_norm": 0.9834194183349609, "learning_rate": 2.5619934013542086e-05, "loss": 6.5657, "step": 252 }, { "epoch": 0.06522667397136665, "grad_norm": 0.677363395690918, "learning_rate": 2.5295800252869017e-05, "loss": 6.3023, "step": 253 }, { "epoch": 0.06548448691196494, "grad_norm": 0.6872428059577942, "learning_rate": 2.497437495269544e-05, "loss": 6.2084, "step": 254 }, { "epoch": 0.06574229985256322, "grad_norm": 0.6214379668235779, "learning_rate": 2.4655687420921784e-05, "loss": 6.1527, "step": 255 }, { "epoch": 0.06600011279316151, "grad_norm": 0.5908592939376831, "learning_rate": 2.43397667158159e-05, "loss": 6.1635, "step": 256 }, { "epoch": 0.0662579257337598, "grad_norm": 0.71327805519104, "learning_rate": 2.402664164336339e-05, "loss": 6.2154, "step": 257 }, { "epoch": 0.06651573867435809, "grad_norm": 0.5955221652984619, "learning_rate": 2.3716340754641102e-05, "loss": 6.0903, "step": 258 }, { "epoch": 0.06677355161495638, "grad_norm": 0.773415207862854, "learning_rate": 2.340889234321385e-05, "loss": 6.2853, "step": 259 }, { "epoch": 0.06703136455555465, "grad_norm": 0.8239244818687439, "learning_rate": 2.310432444255451e-05, "loss": 6.4251, "step": 260 }, { "epoch": 0.06728917749615294, "grad_norm": 1.1837624311447144, "learning_rate": 2.2802664823487897e-05, "loss": 6.5921, "step": 261 }, { "epoch": 0.06754699043675123, "grad_norm": 1.6242060661315918, "learning_rate": 2.2503940991658696e-05, "loss": 6.8338, "step": 262 }, { "epoch": 0.06780480337734952, "grad_norm": 1.6810436248779297, "learning_rate": 2.2208180185023303e-05, "loss": 7.5308, "step": 263 }, { "epoch": 0.06806261631794781, "grad_norm": 1.094656229019165, "learning_rate": 2.1915409371366412e-05, "loss": 6.8721, "step": 264 }, { "epoch": 0.06832042925854609, "grad_norm": 0.7698703408241272, "learning_rate": 2.162565524584191e-05, "loss": 6.3302, "step": 265 }, { "epoch": 0.06857824219914438, "grad_norm": 0.8973533511161804, "learning_rate": 2.1338944228538895e-05, "loss": 6.2005, "step": 266 }, { "epoch": 0.06883605513974267, "grad_norm": 0.7325114011764526, "learning_rate": 2.105530246207259e-05, "loss": 6.3018, "step": 267 }, { "epoch": 0.06909386808034096, "grad_norm": 0.6209387183189392, "learning_rate": 2.0774755809200722e-05, "loss": 6.109, "step": 268 }, { "epoch": 0.06935168102093925, "grad_norm": 0.6290868520736694, "learning_rate": 2.0497329850465217e-05, "loss": 6.123, "step": 269 }, { "epoch": 0.06960949396153754, "grad_norm": 0.665721595287323, "learning_rate": 2.022304988185981e-05, "loss": 6.1454, "step": 270 }, { "epoch": 0.06986730690213581, "grad_norm": 0.6962839961051941, "learning_rate": 1.9951940912523502e-05, "loss": 6.0385, "step": 271 }, { "epoch": 0.0701251198427341, "grad_norm": 0.7740205526351929, "learning_rate": 1.9684027662460257e-05, "loss": 6.112, "step": 272 }, { "epoch": 0.0703829327833324, "grad_norm": 0.9999878406524658, "learning_rate": 1.9419334560284907e-05, "loss": 6.4426, "step": 273 }, { "epoch": 0.07064074572393068, "grad_norm": 1.2608661651611328, "learning_rate": 1.9157885740995797e-05, "loss": 6.7491, "step": 274 }, { "epoch": 0.07089855866452897, "grad_norm": 3.6647045612335205, "learning_rate": 1.8899705043774095e-05, "loss": 6.2366, "step": 275 }, { "epoch": 0.07089855866452897, "eval_loss": 6.484423637390137, "eval_runtime": 0.2004, "eval_samples_per_second": 249.544, "eval_steps_per_second": 64.881, "step": 275 }, { "epoch": 0.07115637160512725, "grad_norm": 1.3171216249465942, "learning_rate": 1.8644816009810178e-05, "loss": 7.4631, "step": 276 }, { "epoch": 0.07141418454572554, "grad_norm": 0.8752208948135376, "learning_rate": 1.839324188015701e-05, "loss": 6.4626, "step": 277 }, { "epoch": 0.07167199748632383, "grad_norm": 0.698951780796051, "learning_rate": 1.8145005593611078e-05, "loss": 6.3495, "step": 278 }, { "epoch": 0.07192981042692212, "grad_norm": 0.6533861756324768, "learning_rate": 1.79001297846208e-05, "loss": 6.2616, "step": 279 }, { "epoch": 0.07218762336752041, "grad_norm": 0.6265803575515747, "learning_rate": 1.7658636781222644e-05, "loss": 6.269, "step": 280 }, { "epoch": 0.07244543630811869, "grad_norm": 0.6760322451591492, "learning_rate": 1.7420548603005325e-05, "loss": 6.0771, "step": 281 }, { "epoch": 0.07270324924871698, "grad_norm": 0.7005109786987305, "learning_rate": 1.718588695910193e-05, "loss": 6.0308, "step": 282 }, { "epoch": 0.07296106218931526, "grad_norm": 0.6212006211280823, "learning_rate": 1.6954673246210534e-05, "loss": 6.1301, "step": 283 }, { "epoch": 0.07321887512991355, "grad_norm": 0.7017173767089844, "learning_rate": 1.6726928546643176e-05, "loss": 6.2069, "step": 284 }, { "epoch": 0.07347668807051184, "grad_norm": 0.7747815847396851, "learning_rate": 1.6502673626403595e-05, "loss": 6.4071, "step": 285 }, { "epoch": 0.07373450101111012, "grad_norm": 1.003362774848938, "learning_rate": 1.628192893329374e-05, "loss": 6.6028, "step": 286 }, { "epoch": 0.07399231395170841, "grad_norm": 1.6694740056991577, "learning_rate": 1.60647145950493e-05, "loss": 6.7469, "step": 287 }, { "epoch": 0.0742501268923067, "grad_norm": 1.5285781621932983, "learning_rate": 1.5851050417504465e-05, "loss": 7.6777, "step": 288 }, { "epoch": 0.07450793983290499, "grad_norm": 1.0526957511901855, "learning_rate": 1.5640955882786017e-05, "loss": 6.7892, "step": 289 }, { "epoch": 0.07476575277350328, "grad_norm": 0.7281574010848999, "learning_rate": 1.5434450147536906e-05, "loss": 6.3561, "step": 290 }, { "epoch": 0.07502356571410156, "grad_norm": 0.6411709785461426, "learning_rate": 1.5231552041169495e-05, "loss": 6.2202, "step": 291 }, { "epoch": 0.07528137865469985, "grad_norm": 0.6290985941886902, "learning_rate": 1.5032280064148772e-05, "loss": 6.2697, "step": 292 }, { "epoch": 0.07553919159529814, "grad_norm": 0.6503005623817444, "learning_rate": 1.4836652386305349e-05, "loss": 6.0562, "step": 293 }, { "epoch": 0.07579700453589643, "grad_norm": 0.6521010398864746, "learning_rate": 1.4644686845178793e-05, "loss": 6.1066, "step": 294 }, { "epoch": 0.07605481747649472, "grad_norm": 0.6603594422340393, "learning_rate": 1.4456400944391146e-05, "loss": 6.0411, "step": 295 }, { "epoch": 0.07631263041709299, "grad_norm": 0.7835242748260498, "learning_rate": 1.4271811852050914e-05, "loss": 5.9428, "step": 296 }, { "epoch": 0.07657044335769128, "grad_norm": 0.7792856097221375, "learning_rate": 1.4090936399187732e-05, "loss": 6.2589, "step": 297 }, { "epoch": 0.07682825629828957, "grad_norm": 1.0016218423843384, "learning_rate": 1.3913791078217583e-05, "loss": 6.5254, "step": 298 }, { "epoch": 0.07708606923888786, "grad_norm": 1.3292267322540283, "learning_rate": 1.37403920414391e-05, "loss": 6.7416, "step": 299 }, { "epoch": 0.07734388217948615, "grad_norm": 3.4902491569519043, "learning_rate": 1.35707550995607e-05, "loss": 6.222, "step": 300 }, { "epoch": 0.07734388217948615, "eval_loss": 6.472244739532471, "eval_runtime": 0.2001, "eval_samples_per_second": 249.929, "eval_steps_per_second": 64.982, "step": 300 }, { "epoch": 0.07760169512008443, "grad_norm": 1.2554875612258911, "learning_rate": 1.3404895720259053e-05, "loss": 7.3701, "step": 301 }, { "epoch": 0.07785950806068272, "grad_norm": 0.8463560342788696, "learning_rate": 1.3242829026768597e-05, "loss": 6.4277, "step": 302 }, { "epoch": 0.078117321001281, "grad_norm": 0.644782304763794, "learning_rate": 1.3084569796502682e-05, "loss": 6.356, "step": 303 }, { "epoch": 0.0783751339418793, "grad_norm": 0.7398347854614258, "learning_rate": 1.293013245970609e-05, "loss": 6.2324, "step": 304 }, { "epoch": 0.07863294688247759, "grad_norm": 0.5900807976722717, "learning_rate": 1.2779531098139333e-05, "loss": 6.1167, "step": 305 }, { "epoch": 0.07889075982307588, "grad_norm": 0.6693830490112305, "learning_rate": 1.263277944379459e-05, "loss": 6.2377, "step": 306 }, { "epoch": 0.07914857276367415, "grad_norm": 0.6465635895729065, "learning_rate": 1.248989087764366e-05, "loss": 6.1853, "step": 307 }, { "epoch": 0.07940638570427244, "grad_norm": 0.6336793303489685, "learning_rate": 1.2350878428417839e-05, "loss": 5.9979, "step": 308 }, { "epoch": 0.07966419864487073, "grad_norm": 0.6897872090339661, "learning_rate": 1.2215754771419997e-05, "loss": 6.0263, "step": 309 }, { "epoch": 0.07992201158546902, "grad_norm": 0.7997320294380188, "learning_rate": 1.2084532227368761e-05, "loss": 6.3959, "step": 310 }, { "epoch": 0.08017982452606731, "grad_norm": 1.0460010766983032, "learning_rate": 1.1957222761275149e-05, "loss": 6.7064, "step": 311 }, { "epoch": 0.08043763746666559, "grad_norm": 1.4831907749176025, "learning_rate": 1.183383798135157e-05, "loss": 6.8091, "step": 312 }, { "epoch": 0.08069545040726388, "grad_norm": 2.0493836402893066, "learning_rate": 1.171438913795338e-05, "loss": 7.5173, "step": 313 }, { "epoch": 0.08095326334786217, "grad_norm": 1.0544261932373047, "learning_rate": 1.1598887122553061e-05, "loss": 6.7366, "step": 314 }, { "epoch": 0.08121107628846046, "grad_norm": 0.7413960099220276, "learning_rate": 1.1487342466747112e-05, "loss": 6.2798, "step": 315 }, { "epoch": 0.08146888922905875, "grad_norm": 0.6017177700996399, "learning_rate": 1.137976534129579e-05, "loss": 6.3163, "step": 316 }, { "epoch": 0.08172670216965702, "grad_norm": 0.6235430240631104, "learning_rate": 1.127616555519573e-05, "loss": 6.2398, "step": 317 }, { "epoch": 0.08198451511025531, "grad_norm": 0.6497268676757812, "learning_rate": 1.1176552554785504e-05, "loss": 6.2297, "step": 318 }, { "epoch": 0.0822423280508536, "grad_norm": 0.6185296177864075, "learning_rate": 1.1080935422884358e-05, "loss": 6.0099, "step": 319 }, { "epoch": 0.08250014099145189, "grad_norm": 0.7389028668403625, "learning_rate": 1.0989322877963985e-05, "loss": 5.8818, "step": 320 }, { "epoch": 0.08275795393205018, "grad_norm": 0.7540916204452515, "learning_rate": 1.0901723273353597e-05, "loss": 6.1902, "step": 321 }, { "epoch": 0.08301576687264846, "grad_norm": 0.7044777274131775, "learning_rate": 1.0818144596478224e-05, "loss": 6.1747, "step": 322 }, { "epoch": 0.08327357981324675, "grad_norm": 0.9389222264289856, "learning_rate": 1.0738594468130452e-05, "loss": 6.479, "step": 323 }, { "epoch": 0.08353139275384504, "grad_norm": 1.1994636058807373, "learning_rate": 1.0663080141775504e-05, "loss": 6.6693, "step": 324 }, { "epoch": 0.08378920569444333, "grad_norm": 3.2903223037719727, "learning_rate": 1.0591608502889928e-05, "loss": 6.1876, "step": 325 }, { "epoch": 0.08378920569444333, "eval_loss": 6.461916446685791, "eval_runtime": 0.1956, "eval_samples_per_second": 255.68, "eval_steps_per_second": 66.477, "step": 325 }, { "epoch": 0.08404701863504162, "grad_norm": 1.2868388891220093, "learning_rate": 1.0524186068333692e-05, "loss": 7.4553, "step": 326 }, { "epoch": 0.0843048315756399, "grad_norm": 0.8954585194587708, "learning_rate": 1.046081898575604e-05, "loss": 6.399, "step": 327 }, { "epoch": 0.08456264451623818, "grad_norm": 0.612129807472229, "learning_rate": 1.04015130330349e-05, "loss": 6.4022, "step": 328 }, { "epoch": 0.08482045745683647, "grad_norm": 0.6739881634712219, "learning_rate": 1.0346273617750057e-05, "loss": 6.0706, "step": 329 }, { "epoch": 0.08507827039743476, "grad_norm": 0.7707552909851074, "learning_rate": 1.0295105776690108e-05, "loss": 6.0031, "step": 330 }, { "epoch": 0.08533608333803305, "grad_norm": 0.7329960465431213, "learning_rate": 1.0248014175393177e-05, "loss": 6.1073, "step": 331 }, { "epoch": 0.08559389627863133, "grad_norm": 0.7465705275535583, "learning_rate": 1.0205003107721506e-05, "loss": 6.0385, "step": 332 }, { "epoch": 0.08585170921922962, "grad_norm": 0.6375886797904968, "learning_rate": 1.0166076495469963e-05, "loss": 6.0305, "step": 333 }, { "epoch": 0.08610952215982791, "grad_norm": 0.7056543231010437, "learning_rate": 1.0131237888008412e-05, "loss": 6.1335, "step": 334 }, { "epoch": 0.0863673351004262, "grad_norm": 0.8683858513832092, "learning_rate": 1.0100490461958109e-05, "loss": 6.3272, "step": 335 }, { "epoch": 0.08662514804102449, "grad_norm": 0.9683743119239807, "learning_rate": 1.0073837020902033e-05, "loss": 6.5318, "step": 336 }, { "epoch": 0.08688296098162278, "grad_norm": 1.4957187175750732, "learning_rate": 1.0051279995129273e-05, "loss": 6.8366, "step": 337 }, { "epoch": 0.08714077392222105, "grad_norm": 1.5700278282165527, "learning_rate": 1.0032821441413394e-05, "loss": 7.7112, "step": 338 }, { "epoch": 0.08739858686281934, "grad_norm": 1.0813219547271729, "learning_rate": 1.0018463042824957e-05, "loss": 6.923, "step": 339 }, { "epoch": 0.08765639980341763, "grad_norm": 0.6960993409156799, "learning_rate": 1.0008206108577992e-05, "loss": 6.4607, "step": 340 }, { "epoch": 0.08791421274401592, "grad_norm": 0.6256797313690186, "learning_rate": 1.0002051573910671e-05, "loss": 6.3162, "step": 341 }, { "epoch": 0.08817202568461421, "grad_norm": 0.5889110565185547, "learning_rate": 1e-05, "loss": 6.0812, "step": 342 } ], "logging_steps": 1, "max_steps": 342, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3177646474133504.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }