diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.14742739200943536, + "epoch": 0.2948547840188707, "eval_steps": 500, - "global_step": 5000, + "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -7094,6 +7094,7086 @@ "eval_samples_per_second": 83.94, "eval_steps_per_second": 2.756, "step": 5000 + }, + { + "epoch": 0.1475748194014448, + "grad_norm": 23.375, + "learning_rate": 1.98625051877691e-06, + "loss": 2.3127, + "step": 5005 + }, + { + "epoch": 0.14772224679345422, + "grad_norm": 13.4375, + "learning_rate": 1.9861653423891667e-06, + "loss": 2.3383, + "step": 5010 + }, + { + "epoch": 0.14786967418546365, + "grad_norm": 15.75, + "learning_rate": 1.9860799048243535e-06, + "loss": 2.3119, + "step": 5015 + }, + { + "epoch": 0.14801710157747308, + "grad_norm": 17.125, + "learning_rate": 1.9859942061050965e-06, + "loss": 2.3144, + "step": 5020 + }, + { + "epoch": 0.14816452896948254, + "grad_norm": 13.5625, + "learning_rate": 1.985908246254093e-06, + "loss": 2.166, + "step": 5025 + }, + { + "epoch": 0.14831195636149197, + "grad_norm": 15.625, + "learning_rate": 1.985822025294108e-06, + "loss": 2.2456, + "step": 5030 + }, + { + "epoch": 0.1484593837535014, + "grad_norm": 15.3125, + "learning_rate": 1.9857355432479763e-06, + "loss": 2.3008, + "step": 5035 + }, + { + "epoch": 0.14860681114551083, + "grad_norm": 16.25, + "learning_rate": 1.9856488001386026e-06, + "loss": 2.3406, + "step": 5040 + }, + { + "epoch": 0.14875423853752026, + "grad_norm": 15.8125, + "learning_rate": 1.9855617959889598e-06, + "loss": 2.3188, + "step": 5045 + }, + { + "epoch": 0.14890166592952972, + "grad_norm": 19.25, + "learning_rate": 1.9854745308220895e-06, + "loss": 2.3037, + "step": 5050 + }, + { + "epoch": 0.14904909332153915, + "grad_norm": 16.875, + "learning_rate": 1.985387004661104e-06, + "loss": 2.2972, + "step": 5055 + }, + { + "epoch": 0.14919652071354858, + "grad_norm": 15.25, + "learning_rate": 1.9852992175291837e-06, + "loss": 2.3653, + "step": 5060 + }, + { + "epoch": 0.149343948105558, + "grad_norm": 18.375, + "learning_rate": 1.985211169449578e-06, + "loss": 2.3043, + "step": 5065 + }, + { + "epoch": 0.14949137549756744, + "grad_norm": 15.4375, + "learning_rate": 1.9851228604456056e-06, + "loss": 2.3394, + "step": 5070 + }, + { + "epoch": 0.14963880288957687, + "grad_norm": 15.125, + "learning_rate": 1.985034290540654e-06, + "loss": 2.2842, + "step": 5075 + }, + { + "epoch": 0.14978623028158633, + "grad_norm": 13.5, + "learning_rate": 1.984945459758181e-06, + "loss": 2.2723, + "step": 5080 + }, + { + "epoch": 0.14993365767359576, + "grad_norm": 15.25, + "learning_rate": 1.984856368121712e-06, + "loss": 2.2761, + "step": 5085 + }, + { + "epoch": 0.1500810850656052, + "grad_norm": 13.875, + "learning_rate": 1.9847670156548424e-06, + "loss": 2.2788, + "step": 5090 + }, + { + "epoch": 0.15022851245761462, + "grad_norm": 15.0625, + "learning_rate": 1.984677402381236e-06, + "loss": 2.3517, + "step": 5095 + }, + { + "epoch": 0.15037593984962405, + "grad_norm": 14.125, + "learning_rate": 1.9845875283246267e-06, + "loss": 2.2871, + "step": 5100 + }, + { + "epoch": 0.15052336724163348, + "grad_norm": 15.6875, + "learning_rate": 1.984497393508817e-06, + "loss": 2.3584, + "step": 5105 + }, + { + "epoch": 0.15067079463364294, + "grad_norm": 15.625, + "learning_rate": 1.984406997957677e-06, + "loss": 2.4279, + "step": 5110 + }, + { + "epoch": 0.15081822202565237, + "grad_norm": 16.0, + "learning_rate": 1.984316341695148e-06, + "loss": 2.3569, + "step": 5115 + }, + { + "epoch": 0.1509656494176618, + "grad_norm": 26.0, + "learning_rate": 1.9842254247452402e-06, + "loss": 2.2929, + "step": 5120 + }, + { + "epoch": 0.15111307680967123, + "grad_norm": 12.625, + "learning_rate": 1.984134247132031e-06, + "loss": 2.2725, + "step": 5125 + }, + { + "epoch": 0.15126050420168066, + "grad_norm": 15.1875, + "learning_rate": 1.984042808879668e-06, + "loss": 2.2815, + "step": 5130 + }, + { + "epoch": 0.15140793159369012, + "grad_norm": 17.5, + "learning_rate": 1.983951110012369e-06, + "loss": 2.2957, + "step": 5135 + }, + { + "epoch": 0.15155535898569955, + "grad_norm": 12.375, + "learning_rate": 1.9838591505544182e-06, + "loss": 2.2805, + "step": 5140 + }, + { + "epoch": 0.15170278637770898, + "grad_norm": 16.75, + "learning_rate": 1.983766930530171e-06, + "loss": 2.2924, + "step": 5145 + }, + { + "epoch": 0.1518502137697184, + "grad_norm": 14.9375, + "learning_rate": 1.9836744499640515e-06, + "loss": 2.2116, + "step": 5150 + }, + { + "epoch": 0.15199764116172784, + "grad_norm": 15.125, + "learning_rate": 1.983581708880551e-06, + "loss": 2.3455, + "step": 5155 + }, + { + "epoch": 0.15214506855373727, + "grad_norm": 17.5, + "learning_rate": 1.983488707304232e-06, + "loss": 2.35, + "step": 5160 + }, + { + "epoch": 0.15229249594574673, + "grad_norm": 14.625, + "learning_rate": 1.9833954452597255e-06, + "loss": 2.2939, + "step": 5165 + }, + { + "epoch": 0.15243992333775616, + "grad_norm": 14.75, + "learning_rate": 1.9833019227717306e-06, + "loss": 2.3036, + "step": 5170 + }, + { + "epoch": 0.1525873507297656, + "grad_norm": 14.75, + "learning_rate": 1.9832081398650158e-06, + "loss": 2.353, + "step": 5175 + }, + { + "epoch": 0.15273477812177502, + "grad_norm": 13.375, + "learning_rate": 1.9831140965644187e-06, + "loss": 2.1931, + "step": 5180 + }, + { + "epoch": 0.15288220551378445, + "grad_norm": 16.0, + "learning_rate": 1.9830197928948464e-06, + "loss": 2.3679, + "step": 5185 + }, + { + "epoch": 0.15302963290579388, + "grad_norm": 13.6875, + "learning_rate": 1.9829252288812735e-06, + "loss": 2.1672, + "step": 5190 + }, + { + "epoch": 0.15317706029780334, + "grad_norm": 14.5, + "learning_rate": 1.982830404548745e-06, + "loss": 2.1524, + "step": 5195 + }, + { + "epoch": 0.15332448768981277, + "grad_norm": 17.5, + "learning_rate": 1.9827353199223744e-06, + "loss": 2.4355, + "step": 5200 + }, + { + "epoch": 0.1534719150818222, + "grad_norm": 15.0625, + "learning_rate": 1.9826399750273432e-06, + "loss": 2.3482, + "step": 5205 + }, + { + "epoch": 0.15361934247383163, + "grad_norm": 16.875, + "learning_rate": 1.9825443698889035e-06, + "loss": 2.3487, + "step": 5210 + }, + { + "epoch": 0.15376676986584106, + "grad_norm": 12.3125, + "learning_rate": 1.982448504532375e-06, + "loss": 2.2663, + "step": 5215 + }, + { + "epoch": 0.15391419725785052, + "grad_norm": 14.6875, + "learning_rate": 1.9823523789831474e-06, + "loss": 2.3294, + "step": 5220 + }, + { + "epoch": 0.15406162464985995, + "grad_norm": 16.25, + "learning_rate": 1.982255993266678e-06, + "loss": 2.2712, + "step": 5225 + }, + { + "epoch": 0.15420905204186938, + "grad_norm": 14.9375, + "learning_rate": 1.9821593474084938e-06, + "loss": 2.3677, + "step": 5230 + }, + { + "epoch": 0.1543564794338788, + "grad_norm": 14.5, + "learning_rate": 1.982062441434191e-06, + "loss": 2.2574, + "step": 5235 + }, + { + "epoch": 0.15450390682588824, + "grad_norm": 49.0, + "learning_rate": 1.9819652753694336e-06, + "loss": 2.209, + "step": 5240 + }, + { + "epoch": 0.15465133421789767, + "grad_norm": 12.25, + "learning_rate": 1.9818678492399557e-06, + "loss": 2.2998, + "step": 5245 + }, + { + "epoch": 0.15479876160990713, + "grad_norm": 17.125, + "learning_rate": 1.98177016307156e-06, + "loss": 2.1183, + "step": 5250 + }, + { + "epoch": 0.15494618900191656, + "grad_norm": 15.625, + "learning_rate": 1.981672216890117e-06, + "loss": 2.2239, + "step": 5255 + }, + { + "epoch": 0.155093616393926, + "grad_norm": 16.375, + "learning_rate": 1.9815740107215676e-06, + "loss": 2.2356, + "step": 5260 + }, + { + "epoch": 0.15524104378593542, + "grad_norm": 16.875, + "learning_rate": 1.9814755445919204e-06, + "loss": 2.3464, + "step": 5265 + }, + { + "epoch": 0.15538847117794485, + "grad_norm": 14.6875, + "learning_rate": 1.9813768185272536e-06, + "loss": 2.3359, + "step": 5270 + }, + { + "epoch": 0.15553589856995428, + "grad_norm": 15.25, + "learning_rate": 1.981277832553713e-06, + "loss": 2.3663, + "step": 5275 + }, + { + "epoch": 0.15568332596196374, + "grad_norm": 15.9375, + "learning_rate": 1.9811785866975153e-06, + "loss": 2.2262, + "step": 5280 + }, + { + "epoch": 0.15583075335397317, + "grad_norm": 15.375, + "learning_rate": 1.9810790809849446e-06, + "loss": 2.2454, + "step": 5285 + }, + { + "epoch": 0.1559781807459826, + "grad_norm": 15.9375, + "learning_rate": 1.980979315442354e-06, + "loss": 2.289, + "step": 5290 + }, + { + "epoch": 0.15612560813799203, + "grad_norm": 15.375, + "learning_rate": 1.980879290096165e-06, + "loss": 2.2416, + "step": 5295 + }, + { + "epoch": 0.15627303553000146, + "grad_norm": 12.9375, + "learning_rate": 1.9807790049728692e-06, + "loss": 2.2994, + "step": 5300 + }, + { + "epoch": 0.15642046292201092, + "grad_norm": 14.5625, + "learning_rate": 1.9806784600990255e-06, + "loss": 2.3597, + "step": 5305 + }, + { + "epoch": 0.15656789031402035, + "grad_norm": 21.625, + "learning_rate": 1.980577655501263e-06, + "loss": 2.3614, + "step": 5310 + }, + { + "epoch": 0.15671531770602978, + "grad_norm": 12.375, + "learning_rate": 1.9804765912062786e-06, + "loss": 2.2174, + "step": 5315 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 13.4375, + "learning_rate": 1.9803752672408385e-06, + "loss": 2.2548, + "step": 5320 + }, + { + "epoch": 0.15701017249004864, + "grad_norm": 15.5625, + "learning_rate": 1.9802736836317767e-06, + "loss": 2.3221, + "step": 5325 + }, + { + "epoch": 0.15715759988205807, + "grad_norm": 19.25, + "learning_rate": 1.9801718404059973e-06, + "loss": 2.4205, + "step": 5330 + }, + { + "epoch": 0.15730502727406753, + "grad_norm": 15.0625, + "learning_rate": 1.9800697375904727e-06, + "loss": 2.3176, + "step": 5335 + }, + { + "epoch": 0.15745245466607696, + "grad_norm": 15.0, + "learning_rate": 1.9799673752122436e-06, + "loss": 2.3065, + "step": 5340 + }, + { + "epoch": 0.1575998820580864, + "grad_norm": 37.5, + "learning_rate": 1.9798647532984197e-06, + "loss": 2.3744, + "step": 5345 + }, + { + "epoch": 0.15774730945009582, + "grad_norm": 15.875, + "learning_rate": 1.97976187187618e-06, + "loss": 2.222, + "step": 5350 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 22.0, + "learning_rate": 1.979658730972771e-06, + "loss": 2.2452, + "step": 5355 + }, + { + "epoch": 0.15804216423411468, + "grad_norm": 16.5, + "learning_rate": 1.9795553306155096e-06, + "loss": 2.3767, + "step": 5360 + }, + { + "epoch": 0.15818959162612414, + "grad_norm": 15.875, + "learning_rate": 1.9794516708317792e-06, + "loss": 2.2898, + "step": 5365 + }, + { + "epoch": 0.15833701901813357, + "grad_norm": 15.1875, + "learning_rate": 1.9793477516490343e-06, + "loss": 2.3616, + "step": 5370 + }, + { + "epoch": 0.158484446410143, + "grad_norm": 15.375, + "learning_rate": 1.979243573094796e-06, + "loss": 2.2351, + "step": 5375 + }, + { + "epoch": 0.15863187380215243, + "grad_norm": 15.4375, + "learning_rate": 1.979139135196656e-06, + "loss": 2.3049, + "step": 5380 + }, + { + "epoch": 0.15877930119416186, + "grad_norm": 16.625, + "learning_rate": 1.9790344379822735e-06, + "loss": 2.3058, + "step": 5385 + }, + { + "epoch": 0.15892672858617132, + "grad_norm": 19.75, + "learning_rate": 1.978929481479376e-06, + "loss": 2.4101, + "step": 5390 + }, + { + "epoch": 0.15907415597818075, + "grad_norm": 17.25, + "learning_rate": 1.9788242657157613e-06, + "loss": 2.2751, + "step": 5395 + }, + { + "epoch": 0.15922158337019018, + "grad_norm": 16.0, + "learning_rate": 1.9787187907192936e-06, + "loss": 2.253, + "step": 5400 + }, + { + "epoch": 0.15936901076219961, + "grad_norm": 15.5, + "learning_rate": 1.978613056517908e-06, + "loss": 2.3013, + "step": 5405 + }, + { + "epoch": 0.15951643815420904, + "grad_norm": 15.4375, + "learning_rate": 1.9785070631396072e-06, + "loss": 2.2712, + "step": 5410 + }, + { + "epoch": 0.15966386554621848, + "grad_norm": 31.0, + "learning_rate": 1.978400810612462e-06, + "loss": 2.3801, + "step": 5415 + }, + { + "epoch": 0.15981129293822793, + "grad_norm": 12.6875, + "learning_rate": 1.978294298964613e-06, + "loss": 2.1809, + "step": 5420 + }, + { + "epoch": 0.15995872033023736, + "grad_norm": 15.5625, + "learning_rate": 1.978187528224269e-06, + "loss": 2.1765, + "step": 5425 + }, + { + "epoch": 0.1601061477222468, + "grad_norm": 13.875, + "learning_rate": 1.978080498419706e-06, + "loss": 2.2463, + "step": 5430 + }, + { + "epoch": 0.16025357511425622, + "grad_norm": 16.75, + "learning_rate": 1.9779732095792715e-06, + "loss": 2.1921, + "step": 5435 + }, + { + "epoch": 0.16040100250626566, + "grad_norm": 15.5625, + "learning_rate": 1.977865661731379e-06, + "loss": 2.3557, + "step": 5440 + }, + { + "epoch": 0.16054842989827509, + "grad_norm": 12.0, + "learning_rate": 1.977757854904512e-06, + "loss": 2.2406, + "step": 5445 + }, + { + "epoch": 0.16069585729028454, + "grad_norm": 16.5, + "learning_rate": 1.9776497891272222e-06, + "loss": 2.2514, + "step": 5450 + }, + { + "epoch": 0.16084328468229397, + "grad_norm": 15.3125, + "learning_rate": 1.9775414644281296e-06, + "loss": 2.3495, + "step": 5455 + }, + { + "epoch": 0.1609907120743034, + "grad_norm": 17.5, + "learning_rate": 1.977432880835923e-06, + "loss": 2.3667, + "step": 5460 + }, + { + "epoch": 0.16113813946631284, + "grad_norm": 14.4375, + "learning_rate": 1.97732403837936e-06, + "loss": 2.2498, + "step": 5465 + }, + { + "epoch": 0.16128556685832227, + "grad_norm": 15.5, + "learning_rate": 1.9772149370872666e-06, + "loss": 2.2269, + "step": 5470 + }, + { + "epoch": 0.16143299425033172, + "grad_norm": 16.125, + "learning_rate": 1.9771055769885366e-06, + "loss": 2.3518, + "step": 5475 + }, + { + "epoch": 0.16158042164234115, + "grad_norm": 14.625, + "learning_rate": 1.9769959581121343e-06, + "loss": 2.1878, + "step": 5480 + }, + { + "epoch": 0.16172784903435058, + "grad_norm": 24.0, + "learning_rate": 1.9768860804870905e-06, + "loss": 2.3007, + "step": 5485 + }, + { + "epoch": 0.16187527642636002, + "grad_norm": 15.0625, + "learning_rate": 1.9767759441425055e-06, + "loss": 2.3047, + "step": 5490 + }, + { + "epoch": 0.16202270381836945, + "grad_norm": 25.25, + "learning_rate": 1.9766655491075473e-06, + "loss": 2.2359, + "step": 5495 + }, + { + "epoch": 0.16217013121037888, + "grad_norm": 16.375, + "learning_rate": 1.976554895411454e-06, + "loss": 2.3287, + "step": 5500 + }, + { + "epoch": 0.16217013121037888, + "eval_loss": 2.275634765625, + "eval_runtime": 4.7203, + "eval_samples_per_second": 83.894, + "eval_steps_per_second": 2.754, + "step": 5500 + }, + { + "epoch": 0.16231755860238833, + "grad_norm": 16.375, + "learning_rate": 1.976443983083531e-06, + "loss": 2.1762, + "step": 5505 + }, + { + "epoch": 0.16246498599439776, + "grad_norm": 15.625, + "learning_rate": 1.9763328121531517e-06, + "loss": 2.3403, + "step": 5510 + }, + { + "epoch": 0.1626124133864072, + "grad_norm": 29.125, + "learning_rate": 1.9762213826497595e-06, + "loss": 2.2859, + "step": 5515 + }, + { + "epoch": 0.16275984077841663, + "grad_norm": 20.25, + "learning_rate": 1.9761096946028654e-06, + "loss": 2.2856, + "step": 5520 + }, + { + "epoch": 0.16290726817042606, + "grad_norm": 14.0625, + "learning_rate": 1.9759977480420485e-06, + "loss": 2.282, + "step": 5525 + }, + { + "epoch": 0.16305469556243551, + "grad_norm": 17.25, + "learning_rate": 1.975885542996958e-06, + "loss": 2.3487, + "step": 5530 + }, + { + "epoch": 0.16320212295444494, + "grad_norm": 15.5625, + "learning_rate": 1.9757730794973088e-06, + "loss": 2.2834, + "step": 5535 + }, + { + "epoch": 0.16334955034645438, + "grad_norm": 15.875, + "learning_rate": 1.975660357572887e-06, + "loss": 2.3061, + "step": 5540 + }, + { + "epoch": 0.1634969777384638, + "grad_norm": 14.875, + "learning_rate": 1.975547377253546e-06, + "loss": 2.3091, + "step": 5545 + }, + { + "epoch": 0.16364440513047324, + "grad_norm": 18.75, + "learning_rate": 1.9754341385692067e-06, + "loss": 2.3004, + "step": 5550 + }, + { + "epoch": 0.16379183252248267, + "grad_norm": 15.5625, + "learning_rate": 1.97532064154986e-06, + "loss": 2.2878, + "step": 5555 + }, + { + "epoch": 0.16393925991449212, + "grad_norm": 14.125, + "learning_rate": 1.975206886225565e-06, + "loss": 2.2775, + "step": 5560 + }, + { + "epoch": 0.16408668730650156, + "grad_norm": 16.0, + "learning_rate": 1.975092872626448e-06, + "loss": 2.273, + "step": 5565 + }, + { + "epoch": 0.16423411469851099, + "grad_norm": 14.75, + "learning_rate": 1.9749786007827046e-06, + "loss": 2.2756, + "step": 5570 + }, + { + "epoch": 0.16438154209052042, + "grad_norm": 20.625, + "learning_rate": 1.9748640707245992e-06, + "loss": 2.2598, + "step": 5575 + }, + { + "epoch": 0.16452896948252985, + "grad_norm": 24.25, + "learning_rate": 1.9747492824824638e-06, + "loss": 2.2965, + "step": 5580 + }, + { + "epoch": 0.16467639687453928, + "grad_norm": 25.375, + "learning_rate": 1.9746342360866987e-06, + "loss": 2.2723, + "step": 5585 + }, + { + "epoch": 0.16482382426654874, + "grad_norm": 17.625, + "learning_rate": 1.974518931567773e-06, + "loss": 2.4182, + "step": 5590 + }, + { + "epoch": 0.16497125165855817, + "grad_norm": 13.25, + "learning_rate": 1.9744033689562244e-06, + "loss": 2.3571, + "step": 5595 + }, + { + "epoch": 0.1651186790505676, + "grad_norm": 15.4375, + "learning_rate": 1.9742875482826583e-06, + "loss": 2.2016, + "step": 5600 + }, + { + "epoch": 0.16526610644257703, + "grad_norm": 15.1875, + "learning_rate": 1.974171469577749e-06, + "loss": 2.1913, + "step": 5605 + }, + { + "epoch": 0.16541353383458646, + "grad_norm": 15.125, + "learning_rate": 1.9740551328722387e-06, + "loss": 2.3939, + "step": 5610 + }, + { + "epoch": 0.16556096122659592, + "grad_norm": 16.125, + "learning_rate": 1.9739385381969387e-06, + "loss": 2.31, + "step": 5615 + }, + { + "epoch": 0.16570838861860535, + "grad_norm": 14.375, + "learning_rate": 1.9738216855827276e-06, + "loss": 2.3633, + "step": 5620 + }, + { + "epoch": 0.16585581601061478, + "grad_norm": 15.5625, + "learning_rate": 1.9737045750605523e-06, + "loss": 2.297, + "step": 5625 + }, + { + "epoch": 0.1660032434026242, + "grad_norm": 17.125, + "learning_rate": 1.9735872066614294e-06, + "loss": 2.3581, + "step": 5630 + }, + { + "epoch": 0.16615067079463364, + "grad_norm": 17.0, + "learning_rate": 1.973469580416442e-06, + "loss": 2.2865, + "step": 5635 + }, + { + "epoch": 0.16629809818664307, + "grad_norm": 13.375, + "learning_rate": 1.9733516963567433e-06, + "loss": 2.245, + "step": 5640 + }, + { + "epoch": 0.16644552557865253, + "grad_norm": 17.625, + "learning_rate": 1.973233554513553e-06, + "loss": 2.3806, + "step": 5645 + }, + { + "epoch": 0.16659295297066196, + "grad_norm": 15.4375, + "learning_rate": 1.9731151549181603e-06, + "loss": 2.204, + "step": 5650 + }, + { + "epoch": 0.1667403803626714, + "grad_norm": 15.375, + "learning_rate": 1.9729964976019223e-06, + "loss": 2.1747, + "step": 5655 + }, + { + "epoch": 0.16688780775468082, + "grad_norm": 15.25, + "learning_rate": 1.972877582596264e-06, + "loss": 2.134, + "step": 5660 + }, + { + "epoch": 0.16703523514669025, + "grad_norm": 15.8125, + "learning_rate": 1.9727584099326796e-06, + "loss": 2.3692, + "step": 5665 + }, + { + "epoch": 0.16718266253869968, + "grad_norm": 18.375, + "learning_rate": 1.9726389796427303e-06, + "loss": 2.3848, + "step": 5670 + }, + { + "epoch": 0.16733008993070914, + "grad_norm": 17.875, + "learning_rate": 1.9725192917580466e-06, + "loss": 2.2684, + "step": 5675 + }, + { + "epoch": 0.16747751732271857, + "grad_norm": 15.5625, + "learning_rate": 1.9723993463103265e-06, + "loss": 2.3341, + "step": 5680 + }, + { + "epoch": 0.167624944714728, + "grad_norm": 15.25, + "learning_rate": 1.9722791433313364e-06, + "loss": 2.3528, + "step": 5685 + }, + { + "epoch": 0.16777237210673743, + "grad_norm": 16.75, + "learning_rate": 1.972158682852911e-06, + "loss": 2.2617, + "step": 5690 + }, + { + "epoch": 0.16791979949874686, + "grad_norm": 10.9375, + "learning_rate": 1.9720379649069537e-06, + "loss": 2.2348, + "step": 5695 + }, + { + "epoch": 0.16806722689075632, + "grad_norm": 13.0625, + "learning_rate": 1.9719169895254347e-06, + "loss": 2.2609, + "step": 5700 + }, + { + "epoch": 0.16821465428276575, + "grad_norm": 15.3125, + "learning_rate": 1.971795756740394e-06, + "loss": 2.3166, + "step": 5705 + }, + { + "epoch": 0.16836208167477518, + "grad_norm": 13.9375, + "learning_rate": 1.9716742665839387e-06, + "loss": 2.1999, + "step": 5710 + }, + { + "epoch": 0.1685095090667846, + "grad_norm": 15.0625, + "learning_rate": 1.9715525190882444e-06, + "loss": 2.2247, + "step": 5715 + }, + { + "epoch": 0.16865693645879404, + "grad_norm": 15.625, + "learning_rate": 1.9714305142855545e-06, + "loss": 2.2155, + "step": 5720 + }, + { + "epoch": 0.16880436385080347, + "grad_norm": 18.125, + "learning_rate": 1.971308252208182e-06, + "loss": 2.2774, + "step": 5725 + }, + { + "epoch": 0.16895179124281293, + "grad_norm": 17.5, + "learning_rate": 1.9711857328885056e-06, + "loss": 2.3131, + "step": 5730 + }, + { + "epoch": 0.16909921863482236, + "grad_norm": 14.75, + "learning_rate": 1.971062956358974e-06, + "loss": 2.1557, + "step": 5735 + }, + { + "epoch": 0.1692466460268318, + "grad_norm": 19.0, + "learning_rate": 1.9709399226521034e-06, + "loss": 2.3339, + "step": 5740 + }, + { + "epoch": 0.16939407341884122, + "grad_norm": 15.1875, + "learning_rate": 1.9708166318004785e-06, + "loss": 2.2882, + "step": 5745 + }, + { + "epoch": 0.16954150081085065, + "grad_norm": 20.375, + "learning_rate": 1.9706930838367513e-06, + "loss": 2.2205, + "step": 5750 + }, + { + "epoch": 0.16968892820286008, + "grad_norm": 13.625, + "learning_rate": 1.9705692787936427e-06, + "loss": 2.2539, + "step": 5755 + }, + { + "epoch": 0.16983635559486954, + "grad_norm": 13.5625, + "learning_rate": 1.970445216703941e-06, + "loss": 2.2197, + "step": 5760 + }, + { + "epoch": 0.16998378298687897, + "grad_norm": 15.375, + "learning_rate": 1.9703208976005035e-06, + "loss": 2.3474, + "step": 5765 + }, + { + "epoch": 0.1701312103788884, + "grad_norm": 15.125, + "learning_rate": 1.9701963215162546e-06, + "loss": 2.2846, + "step": 5770 + }, + { + "epoch": 0.17027863777089783, + "grad_norm": 13.875, + "learning_rate": 1.9700714884841872e-06, + "loss": 2.2496, + "step": 5775 + }, + { + "epoch": 0.17042606516290726, + "grad_norm": 14.125, + "learning_rate": 1.9699463985373623e-06, + "loss": 2.2557, + "step": 5780 + }, + { + "epoch": 0.17057349255491672, + "grad_norm": 15.8125, + "learning_rate": 1.9698210517089085e-06, + "loss": 2.3008, + "step": 5785 + }, + { + "epoch": 0.17072091994692615, + "grad_norm": 17.125, + "learning_rate": 1.9696954480320237e-06, + "loss": 2.3636, + "step": 5790 + }, + { + "epoch": 0.17086834733893558, + "grad_norm": 16.0, + "learning_rate": 1.9695695875399717e-06, + "loss": 2.3082, + "step": 5795 + }, + { + "epoch": 0.171015774730945, + "grad_norm": 14.0625, + "learning_rate": 1.9694434702660866e-06, + "loss": 2.1362, + "step": 5800 + }, + { + "epoch": 0.17116320212295444, + "grad_norm": 18.625, + "learning_rate": 1.9693170962437686e-06, + "loss": 2.196, + "step": 5805 + }, + { + "epoch": 0.17131062951496387, + "grad_norm": 13.3125, + "learning_rate": 1.9691904655064873e-06, + "loss": 2.2226, + "step": 5810 + }, + { + "epoch": 0.17145805690697333, + "grad_norm": 15.8125, + "learning_rate": 1.9690635780877794e-06, + "loss": 2.2897, + "step": 5815 + }, + { + "epoch": 0.17160548429898276, + "grad_norm": 15.5, + "learning_rate": 1.96893643402125e-06, + "loss": 2.3065, + "step": 5820 + }, + { + "epoch": 0.1717529116909922, + "grad_norm": 16.5, + "learning_rate": 1.968809033340572e-06, + "loss": 2.2296, + "step": 5825 + }, + { + "epoch": 0.17190033908300162, + "grad_norm": 12.5625, + "learning_rate": 1.9686813760794865e-06, + "loss": 2.3175, + "step": 5830 + }, + { + "epoch": 0.17204776647501105, + "grad_norm": 16.125, + "learning_rate": 1.9685534622718023e-06, + "loss": 2.2041, + "step": 5835 + }, + { + "epoch": 0.17219519386702048, + "grad_norm": 18.5, + "learning_rate": 1.9684252919513963e-06, + "loss": 2.2254, + "step": 5840 + }, + { + "epoch": 0.17234262125902994, + "grad_norm": 16.5, + "learning_rate": 1.9682968651522133e-06, + "loss": 2.3361, + "step": 5845 + }, + { + "epoch": 0.17249004865103937, + "grad_norm": 13.625, + "learning_rate": 1.9681681819082655e-06, + "loss": 2.275, + "step": 5850 + }, + { + "epoch": 0.1726374760430488, + "grad_norm": 13.9375, + "learning_rate": 1.968039242253634e-06, + "loss": 2.199, + "step": 5855 + }, + { + "epoch": 0.17278490343505823, + "grad_norm": 20.5, + "learning_rate": 1.9679100462224673e-06, + "loss": 2.2255, + "step": 5860 + }, + { + "epoch": 0.17293233082706766, + "grad_norm": 19.75, + "learning_rate": 1.967780593848982e-06, + "loss": 2.3403, + "step": 5865 + }, + { + "epoch": 0.17307975821907712, + "grad_norm": 18.75, + "learning_rate": 1.9676508851674616e-06, + "loss": 2.2646, + "step": 5870 + }, + { + "epoch": 0.17322718561108655, + "grad_norm": 15.625, + "learning_rate": 1.9675209202122587e-06, + "loss": 2.3766, + "step": 5875 + }, + { + "epoch": 0.17337461300309598, + "grad_norm": 14.3125, + "learning_rate": 1.967390699017794e-06, + "loss": 2.266, + "step": 5880 + }, + { + "epoch": 0.1735220403951054, + "grad_norm": 32.5, + "learning_rate": 1.9672602216185545e-06, + "loss": 2.1764, + "step": 5885 + }, + { + "epoch": 0.17366946778711484, + "grad_norm": 15.4375, + "learning_rate": 1.9671294880490966e-06, + "loss": 2.1586, + "step": 5890 + }, + { + "epoch": 0.17381689517912427, + "grad_norm": 15.0, + "learning_rate": 1.9669984983440434e-06, + "loss": 2.2782, + "step": 5895 + }, + { + "epoch": 0.17396432257113373, + "grad_norm": 19.125, + "learning_rate": 1.9668672525380865e-06, + "loss": 2.1917, + "step": 5900 + }, + { + "epoch": 0.17411174996314316, + "grad_norm": 14.625, + "learning_rate": 1.9667357506659856e-06, + "loss": 2.29, + "step": 5905 + }, + { + "epoch": 0.1742591773551526, + "grad_norm": 13.9375, + "learning_rate": 1.9666039927625673e-06, + "loss": 2.2455, + "step": 5910 + }, + { + "epoch": 0.17440660474716202, + "grad_norm": 16.5, + "learning_rate": 1.9664719788627267e-06, + "loss": 2.1826, + "step": 5915 + }, + { + "epoch": 0.17455403213917145, + "grad_norm": 14.1875, + "learning_rate": 1.9663397090014265e-06, + "loss": 2.2362, + "step": 5920 + }, + { + "epoch": 0.17470145953118088, + "grad_norm": 20.75, + "learning_rate": 1.9662071832136973e-06, + "loss": 2.2673, + "step": 5925 + }, + { + "epoch": 0.17484888692319034, + "grad_norm": 17.625, + "learning_rate": 1.966074401534637e-06, + "loss": 2.2222, + "step": 5930 + }, + { + "epoch": 0.17499631431519977, + "grad_norm": 18.875, + "learning_rate": 1.9659413639994124e-06, + "loss": 2.2023, + "step": 5935 + }, + { + "epoch": 0.1751437417072092, + "grad_norm": 16.125, + "learning_rate": 1.965808070643256e-06, + "loss": 2.2542, + "step": 5940 + }, + { + "epoch": 0.17529116909921863, + "grad_norm": 17.0, + "learning_rate": 1.965674521501471e-06, + "loss": 2.2869, + "step": 5945 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 16.75, + "learning_rate": 1.965540716609425e-06, + "loss": 2.2464, + "step": 5950 + }, + { + "epoch": 0.17558602388323752, + "grad_norm": 14.75, + "learning_rate": 1.965406656002556e-06, + "loss": 2.242, + "step": 5955 + }, + { + "epoch": 0.17573345127524695, + "grad_norm": 12.5, + "learning_rate": 1.965272339716369e-06, + "loss": 2.2244, + "step": 5960 + }, + { + "epoch": 0.17588087866725638, + "grad_norm": 17.75, + "learning_rate": 1.965137767786436e-06, + "loss": 2.2567, + "step": 5965 + }, + { + "epoch": 0.1760283060592658, + "grad_norm": 15.0625, + "learning_rate": 1.9650029402483974e-06, + "loss": 2.258, + "step": 5970 + }, + { + "epoch": 0.17617573345127524, + "grad_norm": 18.375, + "learning_rate": 1.9648678571379603e-06, + "loss": 2.268, + "step": 5975 + }, + { + "epoch": 0.17632316084328467, + "grad_norm": 14.8125, + "learning_rate": 1.9647325184909014e-06, + "loss": 2.2409, + "step": 5980 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 15.75, + "learning_rate": 1.9645969243430632e-06, + "loss": 2.3359, + "step": 5985 + }, + { + "epoch": 0.17661801562730356, + "grad_norm": 17.625, + "learning_rate": 1.9644610747303567e-06, + "loss": 2.3641, + "step": 5990 + }, + { + "epoch": 0.176765443019313, + "grad_norm": 17.125, + "learning_rate": 1.9643249696887613e-06, + "loss": 2.2575, + "step": 5995 + }, + { + "epoch": 0.17691287041132242, + "grad_norm": 15.5625, + "learning_rate": 1.9641886092543215e-06, + "loss": 2.3008, + "step": 6000 + }, + { + "epoch": 0.17691287041132242, + "eval_loss": 2.2403786182403564, + "eval_runtime": 4.7252, + "eval_samples_per_second": 83.807, + "eval_steps_per_second": 2.751, + "step": 6000 + }, + { + "epoch": 0.17706029780333185, + "grad_norm": 13.25, + "learning_rate": 1.9640519934631527e-06, + "loss": 2.1941, + "step": 6005 + }, + { + "epoch": 0.17720772519534128, + "grad_norm": 14.375, + "learning_rate": 1.9639151223514356e-06, + "loss": 2.3091, + "step": 6010 + }, + { + "epoch": 0.17735515258735074, + "grad_norm": 19.375, + "learning_rate": 1.9637779959554193e-06, + "loss": 2.3316, + "step": 6015 + }, + { + "epoch": 0.17750257997936017, + "grad_norm": 14.75, + "learning_rate": 1.963640614311421e-06, + "loss": 2.3345, + "step": 6020 + }, + { + "epoch": 0.1776500073713696, + "grad_norm": 17.125, + "learning_rate": 1.9635029774558245e-06, + "loss": 2.1962, + "step": 6025 + }, + { + "epoch": 0.17779743476337903, + "grad_norm": 15.1875, + "learning_rate": 1.9633650854250818e-06, + "loss": 2.2806, + "step": 6030 + }, + { + "epoch": 0.17794486215538846, + "grad_norm": 17.25, + "learning_rate": 1.9632269382557123e-06, + "loss": 2.2432, + "step": 6035 + }, + { + "epoch": 0.17809228954739792, + "grad_norm": 14.5625, + "learning_rate": 1.9630885359843034e-06, + "loss": 2.2453, + "step": 6040 + }, + { + "epoch": 0.17823971693940735, + "grad_norm": 13.25, + "learning_rate": 1.9629498786475094e-06, + "loss": 2.328, + "step": 6045 + }, + { + "epoch": 0.17838714433141678, + "grad_norm": 17.0, + "learning_rate": 1.9628109662820525e-06, + "loss": 2.2208, + "step": 6050 + }, + { + "epoch": 0.1785345717234262, + "grad_norm": 14.8125, + "learning_rate": 1.9626717989247222e-06, + "loss": 2.2077, + "step": 6055 + }, + { + "epoch": 0.17868199911543564, + "grad_norm": 14.75, + "learning_rate": 1.9625323766123764e-06, + "loss": 2.2507, + "step": 6060 + }, + { + "epoch": 0.17882942650744507, + "grad_norm": 14.3125, + "learning_rate": 1.9623926993819394e-06, + "loss": 2.2847, + "step": 6065 + }, + { + "epoch": 0.17897685389945453, + "grad_norm": 16.0, + "learning_rate": 1.962252767270403e-06, + "loss": 2.2177, + "step": 6070 + }, + { + "epoch": 0.17912428129146396, + "grad_norm": 17.875, + "learning_rate": 1.9621125803148275e-06, + "loss": 2.2478, + "step": 6075 + }, + { + "epoch": 0.1792717086834734, + "grad_norm": 14.875, + "learning_rate": 1.9619721385523404e-06, + "loss": 2.1835, + "step": 6080 + }, + { + "epoch": 0.17941913607548282, + "grad_norm": 18.125, + "learning_rate": 1.961831442020136e-06, + "loss": 2.3067, + "step": 6085 + }, + { + "epoch": 0.17956656346749225, + "grad_norm": 17.125, + "learning_rate": 1.961690490755477e-06, + "loss": 2.2633, + "step": 6090 + }, + { + "epoch": 0.17971399085950168, + "grad_norm": 14.1875, + "learning_rate": 1.961549284795692e-06, + "loss": 2.2044, + "step": 6095 + }, + { + "epoch": 0.17986141825151114, + "grad_norm": 13.0, + "learning_rate": 1.9614078241781797e-06, + "loss": 2.1931, + "step": 6100 + }, + { + "epoch": 0.18000884564352057, + "grad_norm": 17.125, + "learning_rate": 1.961266108940403e-06, + "loss": 2.2065, + "step": 6105 + }, + { + "epoch": 0.18015627303553, + "grad_norm": 13.8125, + "learning_rate": 1.9611241391198956e-06, + "loss": 2.2238, + "step": 6110 + }, + { + "epoch": 0.18030370042753943, + "grad_norm": 15.1875, + "learning_rate": 1.9609819147542555e-06, + "loss": 2.2079, + "step": 6115 + }, + { + "epoch": 0.18045112781954886, + "grad_norm": 13.875, + "learning_rate": 1.9608394358811505e-06, + "loss": 2.176, + "step": 6120 + }, + { + "epoch": 0.18059855521155832, + "grad_norm": 15.5, + "learning_rate": 1.9606967025383147e-06, + "loss": 2.3294, + "step": 6125 + }, + { + "epoch": 0.18074598260356775, + "grad_norm": 13.375, + "learning_rate": 1.9605537147635493e-06, + "loss": 2.1568, + "step": 6130 + }, + { + "epoch": 0.18089340999557718, + "grad_norm": 12.8125, + "learning_rate": 1.960410472594723e-06, + "loss": 2.1856, + "step": 6135 + }, + { + "epoch": 0.1810408373875866, + "grad_norm": 14.6875, + "learning_rate": 1.9602669760697737e-06, + "loss": 2.3059, + "step": 6140 + }, + { + "epoch": 0.18118826477959604, + "grad_norm": 15.875, + "learning_rate": 1.960123225226704e-06, + "loss": 2.305, + "step": 6145 + }, + { + "epoch": 0.18133569217160547, + "grad_norm": 16.375, + "learning_rate": 1.9599792201035852e-06, + "loss": 2.3145, + "step": 6150 + }, + { + "epoch": 0.18148311956361493, + "grad_norm": 14.8125, + "learning_rate": 1.959834960738556e-06, + "loss": 2.2752, + "step": 6155 + }, + { + "epoch": 0.18163054695562436, + "grad_norm": 14.4375, + "learning_rate": 1.9596904471698223e-06, + "loss": 2.2703, + "step": 6160 + }, + { + "epoch": 0.1817779743476338, + "grad_norm": 19.75, + "learning_rate": 1.9595456794356564e-06, + "loss": 2.1955, + "step": 6165 + }, + { + "epoch": 0.18192540173964322, + "grad_norm": 19.5, + "learning_rate": 1.9594006575743997e-06, + "loss": 2.3101, + "step": 6170 + }, + { + "epoch": 0.18207282913165265, + "grad_norm": 17.0, + "learning_rate": 1.9592553816244596e-06, + "loss": 2.153, + "step": 6175 + }, + { + "epoch": 0.18222025652366208, + "grad_norm": 15.375, + "learning_rate": 1.959109851624311e-06, + "loss": 2.2548, + "step": 6180 + }, + { + "epoch": 0.18236768391567154, + "grad_norm": 16.25, + "learning_rate": 1.9589640676124963e-06, + "loss": 2.1377, + "step": 6185 + }, + { + "epoch": 0.18251511130768097, + "grad_norm": 15.875, + "learning_rate": 1.9588180296276254e-06, + "loss": 2.2501, + "step": 6190 + }, + { + "epoch": 0.1826625386996904, + "grad_norm": 14.3125, + "learning_rate": 1.9586717377083748e-06, + "loss": 2.277, + "step": 6195 + }, + { + "epoch": 0.18280996609169983, + "grad_norm": 12.0, + "learning_rate": 1.9585251918934884e-06, + "loss": 2.1835, + "step": 6200 + }, + { + "epoch": 0.18295739348370926, + "grad_norm": 15.1875, + "learning_rate": 1.958378392221778e-06, + "loss": 2.0964, + "step": 6205 + }, + { + "epoch": 0.18310482087571872, + "grad_norm": 15.5625, + "learning_rate": 1.958231338732122e-06, + "loss": 2.1945, + "step": 6210 + }, + { + "epoch": 0.18325224826772815, + "grad_norm": 15.375, + "learning_rate": 1.9580840314634665e-06, + "loss": 2.2639, + "step": 6215 + }, + { + "epoch": 0.18339967565973758, + "grad_norm": 15.6875, + "learning_rate": 1.957936470454824e-06, + "loss": 2.2378, + "step": 6220 + }, + { + "epoch": 0.183547103051747, + "grad_norm": 14.0625, + "learning_rate": 1.957788655745275e-06, + "loss": 2.2162, + "step": 6225 + }, + { + "epoch": 0.18369453044375644, + "grad_norm": 14.1875, + "learning_rate": 1.9576405873739664e-06, + "loss": 2.2687, + "step": 6230 + }, + { + "epoch": 0.18384195783576587, + "grad_norm": 14.6875, + "learning_rate": 1.9574922653801138e-06, + "loss": 2.2551, + "step": 6235 + }, + { + "epoch": 0.18398938522777533, + "grad_norm": 15.875, + "learning_rate": 1.957343689802998e-06, + "loss": 2.2938, + "step": 6240 + }, + { + "epoch": 0.18413681261978476, + "grad_norm": 13.4375, + "learning_rate": 1.9571948606819687e-06, + "loss": 2.2451, + "step": 6245 + }, + { + "epoch": 0.1842842400117942, + "grad_norm": 17.75, + "learning_rate": 1.9570457780564415e-06, + "loss": 2.3438, + "step": 6250 + }, + { + "epoch": 0.18443166740380362, + "grad_norm": 14.375, + "learning_rate": 1.956896441965899e-06, + "loss": 2.2574, + "step": 6255 + }, + { + "epoch": 0.18457909479581305, + "grad_norm": 18.625, + "learning_rate": 1.956746852449893e-06, + "loss": 2.259, + "step": 6260 + }, + { + "epoch": 0.18472652218782248, + "grad_norm": 16.125, + "learning_rate": 1.95659700954804e-06, + "loss": 2.2494, + "step": 6265 + }, + { + "epoch": 0.18487394957983194, + "grad_norm": 21.375, + "learning_rate": 1.9564469133000244e-06, + "loss": 2.373, + "step": 6270 + }, + { + "epoch": 0.18502137697184137, + "grad_norm": 13.4375, + "learning_rate": 1.9562965637455984e-06, + "loss": 2.2845, + "step": 6275 + }, + { + "epoch": 0.1851688043638508, + "grad_norm": 19.125, + "learning_rate": 1.95614596092458e-06, + "loss": 2.1535, + "step": 6280 + }, + { + "epoch": 0.18531623175586023, + "grad_norm": 15.5, + "learning_rate": 1.955995104876856e-06, + "loss": 2.2633, + "step": 6285 + }, + { + "epoch": 0.18546365914786966, + "grad_norm": 15.5, + "learning_rate": 1.9558439956423788e-06, + "loss": 2.3352, + "step": 6290 + }, + { + "epoch": 0.18561108653987912, + "grad_norm": 19.25, + "learning_rate": 1.955692633261168e-06, + "loss": 2.3492, + "step": 6295 + }, + { + "epoch": 0.18575851393188855, + "grad_norm": 15.8125, + "learning_rate": 1.9555410177733108e-06, + "loss": 2.2918, + "step": 6300 + }, + { + "epoch": 0.18590594132389798, + "grad_norm": 13.4375, + "learning_rate": 1.9553891492189613e-06, + "loss": 2.3091, + "step": 6305 + }, + { + "epoch": 0.1860533687159074, + "grad_norm": 12.6875, + "learning_rate": 1.9552370276383406e-06, + "loss": 2.2345, + "step": 6310 + }, + { + "epoch": 0.18620079610791684, + "grad_norm": 15.25, + "learning_rate": 1.9550846530717368e-06, + "loss": 2.1788, + "step": 6315 + }, + { + "epoch": 0.18634822349992627, + "grad_norm": 15.8125, + "learning_rate": 1.9549320255595044e-06, + "loss": 2.2549, + "step": 6320 + }, + { + "epoch": 0.18649565089193573, + "grad_norm": 13.75, + "learning_rate": 1.954779145142066e-06, + "loss": 2.2416, + "step": 6325 + }, + { + "epoch": 0.18664307828394516, + "grad_norm": 12.8125, + "learning_rate": 1.9546260118599103e-06, + "loss": 2.1621, + "step": 6330 + }, + { + "epoch": 0.1867905056759546, + "grad_norm": 21.0, + "learning_rate": 1.9544726257535936e-06, + "loss": 2.3335, + "step": 6335 + }, + { + "epoch": 0.18693793306796402, + "grad_norm": 16.625, + "learning_rate": 1.9543189868637383e-06, + "loss": 2.2599, + "step": 6340 + }, + { + "epoch": 0.18708536045997345, + "grad_norm": 15.25, + "learning_rate": 1.954165095231035e-06, + "loss": 2.1533, + "step": 6345 + }, + { + "epoch": 0.18723278785198288, + "grad_norm": 13.9375, + "learning_rate": 1.95401095089624e-06, + "loss": 2.1882, + "step": 6350 + }, + { + "epoch": 0.18738021524399234, + "grad_norm": 15.8125, + "learning_rate": 1.9538565539001774e-06, + "loss": 2.2502, + "step": 6355 + }, + { + "epoch": 0.18752764263600177, + "grad_norm": 13.5625, + "learning_rate": 1.953701904283737e-06, + "loss": 2.2742, + "step": 6360 + }, + { + "epoch": 0.1876750700280112, + "grad_norm": 14.375, + "learning_rate": 1.9535470020878776e-06, + "loss": 2.2428, + "step": 6365 + }, + { + "epoch": 0.18782249742002063, + "grad_norm": 13.125, + "learning_rate": 1.953391847353623e-06, + "loss": 2.1909, + "step": 6370 + }, + { + "epoch": 0.18796992481203006, + "grad_norm": 20.125, + "learning_rate": 1.9532364401220645e-06, + "loss": 2.3489, + "step": 6375 + }, + { + "epoch": 0.18811735220403952, + "grad_norm": 18.0, + "learning_rate": 1.9530807804343603e-06, + "loss": 2.224, + "step": 6380 + }, + { + "epoch": 0.18826477959604895, + "grad_norm": 15.0, + "learning_rate": 1.952924868331736e-06, + "loss": 2.2268, + "step": 6385 + }, + { + "epoch": 0.18841220698805838, + "grad_norm": 13.625, + "learning_rate": 1.9527687038554828e-06, + "loss": 2.1472, + "step": 6390 + }, + { + "epoch": 0.18855963438006781, + "grad_norm": 17.75, + "learning_rate": 1.9526122870469603e-06, + "loss": 2.3077, + "step": 6395 + }, + { + "epoch": 0.18870706177207724, + "grad_norm": 14.0, + "learning_rate": 1.952455617947593e-06, + "loss": 2.3211, + "step": 6400 + }, + { + "epoch": 0.18885448916408668, + "grad_norm": 16.375, + "learning_rate": 1.952298696598874e-06, + "loss": 2.2684, + "step": 6405 + }, + { + "epoch": 0.18900191655609613, + "grad_norm": 18.75, + "learning_rate": 1.952141523042363e-06, + "loss": 2.3083, + "step": 6410 + }, + { + "epoch": 0.18914934394810556, + "grad_norm": 15.4375, + "learning_rate": 1.951984097319685e-06, + "loss": 2.3116, + "step": 6415 + }, + { + "epoch": 0.189296771340115, + "grad_norm": 13.5, + "learning_rate": 1.9518264194725333e-06, + "loss": 2.2399, + "step": 6420 + }, + { + "epoch": 0.18944419873212442, + "grad_norm": 15.5, + "learning_rate": 1.9516684895426676e-06, + "loss": 2.2946, + "step": 6425 + }, + { + "epoch": 0.18959162612413386, + "grad_norm": 14.0, + "learning_rate": 1.9515103075719133e-06, + "loss": 2.2819, + "step": 6430 + }, + { + "epoch": 0.18973905351614329, + "grad_norm": 17.125, + "learning_rate": 1.951351873602165e-06, + "loss": 2.3172, + "step": 6435 + }, + { + "epoch": 0.18988648090815274, + "grad_norm": 13.6875, + "learning_rate": 1.9511931876753813e-06, + "loss": 2.1765, + "step": 6440 + }, + { + "epoch": 0.19003390830016217, + "grad_norm": 13.75, + "learning_rate": 1.9510342498335893e-06, + "loss": 2.1027, + "step": 6445 + }, + { + "epoch": 0.1901813356921716, + "grad_norm": 13.875, + "learning_rate": 1.9508750601188823e-06, + "loss": 2.1639, + "step": 6450 + }, + { + "epoch": 0.19032876308418104, + "grad_norm": 14.3125, + "learning_rate": 1.95071561857342e-06, + "loss": 2.1655, + "step": 6455 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 18.375, + "learning_rate": 1.9505559252394292e-06, + "loss": 2.2215, + "step": 6460 + }, + { + "epoch": 0.19062361786819992, + "grad_norm": 15.75, + "learning_rate": 1.9503959801592035e-06, + "loss": 2.3045, + "step": 6465 + }, + { + "epoch": 0.19077104526020935, + "grad_norm": 22.25, + "learning_rate": 1.950235783375102e-06, + "loss": 2.3103, + "step": 6470 + }, + { + "epoch": 0.19091847265221878, + "grad_norm": 15.8125, + "learning_rate": 1.9500753349295524e-06, + "loss": 2.2239, + "step": 6475 + }, + { + "epoch": 0.19106590004422822, + "grad_norm": 13.4375, + "learning_rate": 1.9499146348650477e-06, + "loss": 2.2765, + "step": 6480 + }, + { + "epoch": 0.19121332743623765, + "grad_norm": 13.25, + "learning_rate": 1.949753683224148e-06, + "loss": 2.1889, + "step": 6485 + }, + { + "epoch": 0.19136075482824708, + "grad_norm": 16.375, + "learning_rate": 1.9495924800494796e-06, + "loss": 2.1884, + "step": 6490 + }, + { + "epoch": 0.19150818222025653, + "grad_norm": 16.875, + "learning_rate": 1.9494310253837357e-06, + "loss": 2.2143, + "step": 6495 + }, + { + "epoch": 0.19165560961226596, + "grad_norm": 15.625, + "learning_rate": 1.9492693192696766e-06, + "loss": 2.2323, + "step": 6500 + }, + { + "epoch": 0.19165560961226596, + "eval_loss": 2.214259147644043, + "eval_runtime": 4.7188, + "eval_samples_per_second": 83.92, + "eval_steps_per_second": 2.755, + "step": 6500 + }, + { + "epoch": 0.1918030370042754, + "grad_norm": 19.75, + "learning_rate": 1.949107361750128e-06, + "loss": 2.2991, + "step": 6505 + }, + { + "epoch": 0.19195046439628483, + "grad_norm": 16.125, + "learning_rate": 1.948945152867984e-06, + "loss": 2.3435, + "step": 6510 + }, + { + "epoch": 0.19209789178829426, + "grad_norm": 13.4375, + "learning_rate": 1.948782692666203e-06, + "loss": 2.2187, + "step": 6515 + }, + { + "epoch": 0.19224531918030371, + "grad_norm": 15.6875, + "learning_rate": 1.9486199811878116e-06, + "loss": 2.2188, + "step": 6520 + }, + { + "epoch": 0.19239274657231314, + "grad_norm": 12.875, + "learning_rate": 1.9484570184759027e-06, + "loss": 2.2358, + "step": 6525 + }, + { + "epoch": 0.19254017396432258, + "grad_norm": 18.0, + "learning_rate": 1.9482938045736353e-06, + "loss": 2.2281, + "step": 6530 + }, + { + "epoch": 0.192687601356332, + "grad_norm": 13.875, + "learning_rate": 1.948130339524235e-06, + "loss": 2.2475, + "step": 6535 + }, + { + "epoch": 0.19283502874834144, + "grad_norm": 15.5625, + "learning_rate": 1.9479666233709945e-06, + "loss": 2.1642, + "step": 6540 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 16.125, + "learning_rate": 1.947802656157272e-06, + "loss": 2.2591, + "step": 6545 + }, + { + "epoch": 0.19312988353236032, + "grad_norm": 17.375, + "learning_rate": 1.9476384379264933e-06, + "loss": 2.2818, + "step": 6550 + }, + { + "epoch": 0.19327731092436976, + "grad_norm": 21.875, + "learning_rate": 1.9474739687221494e-06, + "loss": 2.1193, + "step": 6555 + }, + { + "epoch": 0.19342473831637919, + "grad_norm": 14.4375, + "learning_rate": 1.9473092485877994e-06, + "loss": 2.1854, + "step": 6560 + }, + { + "epoch": 0.19357216570838862, + "grad_norm": 14.625, + "learning_rate": 1.9471442775670673e-06, + "loss": 2.2582, + "step": 6565 + }, + { + "epoch": 0.19371959310039805, + "grad_norm": 16.75, + "learning_rate": 1.9469790557036443e-06, + "loss": 2.2225, + "step": 6570 + }, + { + "epoch": 0.19386702049240748, + "grad_norm": 17.0, + "learning_rate": 1.9468135830412886e-06, + "loss": 2.2606, + "step": 6575 + }, + { + "epoch": 0.19401444788441694, + "grad_norm": 16.875, + "learning_rate": 1.946647859623823e-06, + "loss": 2.1635, + "step": 6580 + }, + { + "epoch": 0.19416187527642637, + "grad_norm": 14.9375, + "learning_rate": 1.9464818854951388e-06, + "loss": 2.3414, + "step": 6585 + }, + { + "epoch": 0.1943093026684358, + "grad_norm": 14.8125, + "learning_rate": 1.9463156606991918e-06, + "loss": 2.4052, + "step": 6590 + }, + { + "epoch": 0.19445673006044523, + "grad_norm": 16.5, + "learning_rate": 1.9461491852800065e-06, + "loss": 2.129, + "step": 6595 + }, + { + "epoch": 0.19460415745245466, + "grad_norm": 15.9375, + "learning_rate": 1.9459824592816716e-06, + "loss": 2.3136, + "step": 6600 + }, + { + "epoch": 0.19475158484446412, + "grad_norm": 15.125, + "learning_rate": 1.9458154827483427e-06, + "loss": 2.1992, + "step": 6605 + }, + { + "epoch": 0.19489901223647355, + "grad_norm": 17.375, + "learning_rate": 1.9456482557242427e-06, + "loss": 2.2089, + "step": 6610 + }, + { + "epoch": 0.19504643962848298, + "grad_norm": 17.875, + "learning_rate": 1.94548077825366e-06, + "loss": 2.1713, + "step": 6615 + }, + { + "epoch": 0.1951938670204924, + "grad_norm": 14.5, + "learning_rate": 1.945313050380949e-06, + "loss": 2.2417, + "step": 6620 + }, + { + "epoch": 0.19534129441250184, + "grad_norm": 14.0625, + "learning_rate": 1.945145072150532e-06, + "loss": 2.3048, + "step": 6625 + }, + { + "epoch": 0.19548872180451127, + "grad_norm": 16.75, + "learning_rate": 1.9449768436068953e-06, + "loss": 2.1964, + "step": 6630 + }, + { + "epoch": 0.19563614919652073, + "grad_norm": 14.6875, + "learning_rate": 1.944808364794594e-06, + "loss": 2.2846, + "step": 6635 + }, + { + "epoch": 0.19578357658853016, + "grad_norm": 17.75, + "learning_rate": 1.944639635758247e-06, + "loss": 2.2495, + "step": 6640 + }, + { + "epoch": 0.1959310039805396, + "grad_norm": 15.1875, + "learning_rate": 1.944470656542541e-06, + "loss": 2.1223, + "step": 6645 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 17.5, + "learning_rate": 1.944301427192229e-06, + "loss": 2.2621, + "step": 6650 + }, + { + "epoch": 0.19622585876455845, + "grad_norm": 104.0, + "learning_rate": 1.94413194775213e-06, + "loss": 2.0909, + "step": 6655 + }, + { + "epoch": 0.19637328615656788, + "grad_norm": 14.5625, + "learning_rate": 1.9439622182671282e-06, + "loss": 2.2395, + "step": 6660 + }, + { + "epoch": 0.19652071354857734, + "grad_norm": 14.125, + "learning_rate": 1.943792238782176e-06, + "loss": 2.2649, + "step": 6665 + }, + { + "epoch": 0.19666814094058677, + "grad_norm": 14.75, + "learning_rate": 1.9436220093422907e-06, + "loss": 2.3661, + "step": 6670 + }, + { + "epoch": 0.1968155683325962, + "grad_norm": 16.375, + "learning_rate": 1.9434515299925557e-06, + "loss": 2.349, + "step": 6675 + }, + { + "epoch": 0.19696299572460563, + "grad_norm": 16.625, + "learning_rate": 1.943280800778121e-06, + "loss": 2.2771, + "step": 6680 + }, + { + "epoch": 0.19711042311661506, + "grad_norm": 14.9375, + "learning_rate": 1.9431098217442027e-06, + "loss": 2.2262, + "step": 6685 + }, + { + "epoch": 0.19725785050862452, + "grad_norm": 14.375, + "learning_rate": 1.942938592936083e-06, + "loss": 2.2435, + "step": 6690 + }, + { + "epoch": 0.19740527790063395, + "grad_norm": 13.4375, + "learning_rate": 1.9427671143991103e-06, + "loss": 2.1591, + "step": 6695 + }, + { + "epoch": 0.19755270529264338, + "grad_norm": 14.75, + "learning_rate": 1.9425953861787e-06, + "loss": 2.296, + "step": 6700 + }, + { + "epoch": 0.1977001326846528, + "grad_norm": 17.875, + "learning_rate": 1.942423408320332e-06, + "loss": 2.2397, + "step": 6705 + }, + { + "epoch": 0.19784756007666224, + "grad_norm": 12.25, + "learning_rate": 1.9422511808695525e-06, + "loss": 2.2015, + "step": 6710 + }, + { + "epoch": 0.19799498746867167, + "grad_norm": 15.25, + "learning_rate": 1.942078703871976e-06, + "loss": 2.2699, + "step": 6715 + }, + { + "epoch": 0.19814241486068113, + "grad_norm": 14.8125, + "learning_rate": 1.94190597737328e-06, + "loss": 2.0769, + "step": 6720 + }, + { + "epoch": 0.19828984225269056, + "grad_norm": 14.375, + "learning_rate": 1.9417330014192103e-06, + "loss": 2.0982, + "step": 6725 + }, + { + "epoch": 0.1984372696447, + "grad_norm": 15.875, + "learning_rate": 1.941559776055578e-06, + "loss": 2.2998, + "step": 6730 + }, + { + "epoch": 0.19858469703670942, + "grad_norm": 12.4375, + "learning_rate": 1.94138630132826e-06, + "loss": 2.1314, + "step": 6735 + }, + { + "epoch": 0.19873212442871885, + "grad_norm": 18.5, + "learning_rate": 1.9412125772832e-06, + "loss": 2.2776, + "step": 6740 + }, + { + "epoch": 0.19887955182072828, + "grad_norm": 13.1875, + "learning_rate": 1.9410386039664067e-06, + "loss": 2.1857, + "step": 6745 + }, + { + "epoch": 0.19902697921273774, + "grad_norm": 14.4375, + "learning_rate": 1.940864381423956e-06, + "loss": 2.2383, + "step": 6750 + }, + { + "epoch": 0.19917440660474717, + "grad_norm": 12.9375, + "learning_rate": 1.9406899097019883e-06, + "loss": 2.2486, + "step": 6755 + }, + { + "epoch": 0.1993218339967566, + "grad_norm": 16.875, + "learning_rate": 1.940515188846712e-06, + "loss": 2.3064, + "step": 6760 + }, + { + "epoch": 0.19946926138876603, + "grad_norm": 14.0, + "learning_rate": 1.9403402189043994e-06, + "loss": 2.1471, + "step": 6765 + }, + { + "epoch": 0.19961668878077546, + "grad_norm": 14.0625, + "learning_rate": 1.9401649999213904e-06, + "loss": 2.1606, + "step": 6770 + }, + { + "epoch": 0.19976411617278492, + "grad_norm": 19.0, + "learning_rate": 1.9399895319440893e-06, + "loss": 2.2425, + "step": 6775 + }, + { + "epoch": 0.19991154356479435, + "grad_norm": 16.375, + "learning_rate": 1.9398138150189683e-06, + "loss": 2.1473, + "step": 6780 + }, + { + "epoch": 0.20005897095680378, + "grad_norm": 13.375, + "learning_rate": 1.9396378491925636e-06, + "loss": 2.1876, + "step": 6785 + }, + { + "epoch": 0.2002063983488132, + "grad_norm": 16.125, + "learning_rate": 1.939461634511479e-06, + "loss": 2.2899, + "step": 6790 + }, + { + "epoch": 0.20035382574082264, + "grad_norm": 12.625, + "learning_rate": 1.9392851710223823e-06, + "loss": 2.1244, + "step": 6795 + }, + { + "epoch": 0.20050125313283207, + "grad_norm": 11.9375, + "learning_rate": 1.9391084587720093e-06, + "loss": 2.257, + "step": 6800 + }, + { + "epoch": 0.20064868052484153, + "grad_norm": 28.875, + "learning_rate": 1.93893149780716e-06, + "loss": 2.2221, + "step": 6805 + }, + { + "epoch": 0.20079610791685096, + "grad_norm": 12.0625, + "learning_rate": 1.9387542881747016e-06, + "loss": 2.1064, + "step": 6810 + }, + { + "epoch": 0.2009435353088604, + "grad_norm": 13.125, + "learning_rate": 1.9385768299215656e-06, + "loss": 2.1955, + "step": 6815 + }, + { + "epoch": 0.20109096270086982, + "grad_norm": 15.8125, + "learning_rate": 1.938399123094751e-06, + "loss": 2.2424, + "step": 6820 + }, + { + "epoch": 0.20123839009287925, + "grad_norm": 14.25, + "learning_rate": 1.9382211677413213e-06, + "loss": 2.1846, + "step": 6825 + }, + { + "epoch": 0.20138581748488868, + "grad_norm": 14.375, + "learning_rate": 1.938042963908407e-06, + "loss": 2.2189, + "step": 6830 + }, + { + "epoch": 0.20153324487689814, + "grad_norm": 13.1875, + "learning_rate": 1.937864511643203e-06, + "loss": 2.3278, + "step": 6835 + }, + { + "epoch": 0.20168067226890757, + "grad_norm": 15.25, + "learning_rate": 1.9376858109929713e-06, + "loss": 2.2339, + "step": 6840 + }, + { + "epoch": 0.201828099660917, + "grad_norm": 16.375, + "learning_rate": 1.937506862005039e-06, + "loss": 2.2715, + "step": 6845 + }, + { + "epoch": 0.20197552705292643, + "grad_norm": 14.0625, + "learning_rate": 1.9373276647267996e-06, + "loss": 2.0827, + "step": 6850 + }, + { + "epoch": 0.20212295444493586, + "grad_norm": 14.125, + "learning_rate": 1.9371482192057114e-06, + "loss": 2.2222, + "step": 6855 + }, + { + "epoch": 0.20227038183694532, + "grad_norm": 14.5, + "learning_rate": 1.936968525489299e-06, + "loss": 2.2901, + "step": 6860 + }, + { + "epoch": 0.20241780922895475, + "grad_norm": 12.0625, + "learning_rate": 1.936788583625153e-06, + "loss": 2.0957, + "step": 6865 + }, + { + "epoch": 0.20256523662096418, + "grad_norm": 12.6875, + "learning_rate": 1.936608393660929e-06, + "loss": 2.1918, + "step": 6870 + }, + { + "epoch": 0.2027126640129736, + "grad_norm": 19.625, + "learning_rate": 1.9364279556443486e-06, + "loss": 2.2613, + "step": 6875 + }, + { + "epoch": 0.20286009140498304, + "grad_norm": 14.75, + "learning_rate": 1.9362472696231994e-06, + "loss": 2.2336, + "step": 6880 + }, + { + "epoch": 0.20300751879699247, + "grad_norm": 12.75, + "learning_rate": 1.9360663356453344e-06, + "loss": 2.2267, + "step": 6885 + }, + { + "epoch": 0.20315494618900193, + "grad_norm": 16.375, + "learning_rate": 1.935885153758673e-06, + "loss": 2.311, + "step": 6890 + }, + { + "epoch": 0.20330237358101136, + "grad_norm": 14.625, + "learning_rate": 1.9357037240111985e-06, + "loss": 2.2432, + "step": 6895 + }, + { + "epoch": 0.2034498009730208, + "grad_norm": 11.75, + "learning_rate": 1.9355220464509617e-06, + "loss": 2.17, + "step": 6900 + }, + { + "epoch": 0.20359722836503022, + "grad_norm": 14.875, + "learning_rate": 1.935340121126078e-06, + "loss": 2.1693, + "step": 6905 + }, + { + "epoch": 0.20374465575703965, + "grad_norm": 14.875, + "learning_rate": 1.9351579480847288e-06, + "loss": 2.2669, + "step": 6910 + }, + { + "epoch": 0.20389208314904908, + "grad_norm": 17.25, + "learning_rate": 1.934975527375161e-06, + "loss": 2.2202, + "step": 6915 + }, + { + "epoch": 0.20403951054105854, + "grad_norm": 13.4375, + "learning_rate": 1.9347928590456874e-06, + "loss": 2.2297, + "step": 6920 + }, + { + "epoch": 0.20418693793306797, + "grad_norm": 15.625, + "learning_rate": 1.9346099431446853e-06, + "loss": 2.1914, + "step": 6925 + }, + { + "epoch": 0.2043343653250774, + "grad_norm": 13.625, + "learning_rate": 1.9344267797205988e-06, + "loss": 2.2208, + "step": 6930 + }, + { + "epoch": 0.20448179271708683, + "grad_norm": 13.25, + "learning_rate": 1.934243368821937e-06, + "loss": 2.1804, + "step": 6935 + }, + { + "epoch": 0.20462922010909626, + "grad_norm": 17.0, + "learning_rate": 1.934059710497275e-06, + "loss": 2.2179, + "step": 6940 + }, + { + "epoch": 0.20477664750110572, + "grad_norm": 15.1875, + "learning_rate": 1.9338758047952527e-06, + "loss": 2.1055, + "step": 6945 + }, + { + "epoch": 0.20492407489311515, + "grad_norm": 14.625, + "learning_rate": 1.9336916517645757e-06, + "loss": 2.2436, + "step": 6950 + }, + { + "epoch": 0.20507150228512458, + "grad_norm": 16.625, + "learning_rate": 1.933507251454016e-06, + "loss": 2.1477, + "step": 6955 + }, + { + "epoch": 0.205218929677134, + "grad_norm": 15.5625, + "learning_rate": 1.933322603912409e-06, + "loss": 2.1234, + "step": 6960 + }, + { + "epoch": 0.20536635706914344, + "grad_norm": 14.4375, + "learning_rate": 1.933137709188659e-06, + "loss": 2.257, + "step": 6965 + }, + { + "epoch": 0.20551378446115287, + "grad_norm": 13.875, + "learning_rate": 1.932952567331732e-06, + "loss": 2.1056, + "step": 6970 + }, + { + "epoch": 0.20566121185316233, + "grad_norm": 17.125, + "learning_rate": 1.9327671783906614e-06, + "loss": 2.161, + "step": 6975 + }, + { + "epoch": 0.20580863924517176, + "grad_norm": 25.875, + "learning_rate": 1.9325815424145465e-06, + "loss": 2.1589, + "step": 6980 + }, + { + "epoch": 0.2059560666371812, + "grad_norm": 16.625, + "learning_rate": 1.9323956594525514e-06, + "loss": 2.2228, + "step": 6985 + }, + { + "epoch": 0.20610349402919062, + "grad_norm": 18.0, + "learning_rate": 1.9322095295539045e-06, + "loss": 2.2412, + "step": 6990 + }, + { + "epoch": 0.20625092142120005, + "grad_norm": 15.125, + "learning_rate": 1.9320231527679014e-06, + "loss": 2.2868, + "step": 6995 + }, + { + "epoch": 0.20639834881320948, + "grad_norm": 14.1875, + "learning_rate": 1.931836529143902e-06, + "loss": 2.1774, + "step": 7000 + }, + { + "epoch": 0.20639834881320948, + "eval_loss": 2.193103075027466, + "eval_runtime": 4.7184, + "eval_samples_per_second": 83.927, + "eval_steps_per_second": 2.755, + "step": 7000 + }, + { + "epoch": 0.20654577620521894, + "grad_norm": 15.0625, + "learning_rate": 1.9316496587313323e-06, + "loss": 2.3368, + "step": 7005 + }, + { + "epoch": 0.20669320359722837, + "grad_norm": 13.625, + "learning_rate": 1.931462541579683e-06, + "loss": 2.3225, + "step": 7010 + }, + { + "epoch": 0.2068406309892378, + "grad_norm": 14.0625, + "learning_rate": 1.9312751777385103e-06, + "loss": 2.2313, + "step": 7015 + }, + { + "epoch": 0.20698805838124723, + "grad_norm": 15.25, + "learning_rate": 1.931087567257436e-06, + "loss": 2.2718, + "step": 7020 + }, + { + "epoch": 0.20713548577325666, + "grad_norm": 13.375, + "learning_rate": 1.9308997101861474e-06, + "loss": 2.1488, + "step": 7025 + }, + { + "epoch": 0.20728291316526612, + "grad_norm": 14.25, + "learning_rate": 1.930711606574396e-06, + "loss": 2.2197, + "step": 7030 + }, + { + "epoch": 0.20743034055727555, + "grad_norm": 16.125, + "learning_rate": 1.930523256472e-06, + "loss": 2.1245, + "step": 7035 + }, + { + "epoch": 0.20757776794928498, + "grad_norm": 21.625, + "learning_rate": 1.9303346599288415e-06, + "loss": 2.2834, + "step": 7040 + }, + { + "epoch": 0.2077251953412944, + "grad_norm": 13.1875, + "learning_rate": 1.9301458169948695e-06, + "loss": 2.145, + "step": 7045 + }, + { + "epoch": 0.20787262273330384, + "grad_norm": 13.875, + "learning_rate": 1.929956727720097e-06, + "loss": 2.206, + "step": 7050 + }, + { + "epoch": 0.20802005012531327, + "grad_norm": 13.4375, + "learning_rate": 1.9297673921546026e-06, + "loss": 2.1265, + "step": 7055 + }, + { + "epoch": 0.20816747751732273, + "grad_norm": 15.3125, + "learning_rate": 1.92957781034853e-06, + "loss": 2.283, + "step": 7060 + }, + { + "epoch": 0.20831490490933216, + "grad_norm": 15.125, + "learning_rate": 1.929387982352088e-06, + "loss": 2.2967, + "step": 7065 + }, + { + "epoch": 0.2084623323013416, + "grad_norm": 17.0, + "learning_rate": 1.9291979082155514e-06, + "loss": 2.2077, + "step": 7070 + }, + { + "epoch": 0.20860975969335102, + "grad_norm": 11.75, + "learning_rate": 1.9290075879892593e-06, + "loss": 2.1595, + "step": 7075 + }, + { + "epoch": 0.20875718708536045, + "grad_norm": 15.0, + "learning_rate": 1.9288170217236167e-06, + "loss": 2.272, + "step": 7080 + }, + { + "epoch": 0.20890461447736988, + "grad_norm": 13.9375, + "learning_rate": 1.928626209469093e-06, + "loss": 2.2495, + "step": 7085 + }, + { + "epoch": 0.20905204186937934, + "grad_norm": 15.8125, + "learning_rate": 1.9284351512762235e-06, + "loss": 2.202, + "step": 7090 + }, + { + "epoch": 0.20919946926138877, + "grad_norm": 17.5, + "learning_rate": 1.9282438471956074e-06, + "loss": 2.238, + "step": 7095 + }, + { + "epoch": 0.2093468966533982, + "grad_norm": 19.5, + "learning_rate": 1.9280522972779105e-06, + "loss": 2.2096, + "step": 7100 + }, + { + "epoch": 0.20949432404540763, + "grad_norm": 15.5, + "learning_rate": 1.9278605015738635e-06, + "loss": 2.3443, + "step": 7105 + }, + { + "epoch": 0.20964175143741706, + "grad_norm": 15.3125, + "learning_rate": 1.927668460134261e-06, + "loss": 2.0769, + "step": 7110 + }, + { + "epoch": 0.20978917882942652, + "grad_norm": 17.125, + "learning_rate": 1.927476173009964e-06, + "loss": 2.1903, + "step": 7115 + }, + { + "epoch": 0.20993660622143595, + "grad_norm": 18.75, + "learning_rate": 1.9272836402518975e-06, + "loss": 2.3107, + "step": 7120 + }, + { + "epoch": 0.21008403361344538, + "grad_norm": 15.6875, + "learning_rate": 1.927090861911053e-06, + "loss": 2.3182, + "step": 7125 + }, + { + "epoch": 0.2102314610054548, + "grad_norm": 16.75, + "learning_rate": 1.9268978380384846e-06, + "loss": 2.2175, + "step": 7130 + }, + { + "epoch": 0.21037888839746424, + "grad_norm": 15.5625, + "learning_rate": 1.926704568685314e-06, + "loss": 2.219, + "step": 7135 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 15.1875, + "learning_rate": 1.9265110539027273e-06, + "loss": 2.1772, + "step": 7140 + }, + { + "epoch": 0.21067374318148313, + "grad_norm": 16.5, + "learning_rate": 1.9263172937419742e-06, + "loss": 2.2305, + "step": 7145 + }, + { + "epoch": 0.21082117057349256, + "grad_norm": 25.5, + "learning_rate": 1.9261232882543706e-06, + "loss": 2.3339, + "step": 7150 + }, + { + "epoch": 0.210968597965502, + "grad_norm": 13.375, + "learning_rate": 1.9259290374912976e-06, + "loss": 2.0968, + "step": 7155 + }, + { + "epoch": 0.21111602535751142, + "grad_norm": 12.8125, + "learning_rate": 1.9257345415042e-06, + "loss": 2.1461, + "step": 7160 + }, + { + "epoch": 0.21126345274952085, + "grad_norm": 13.75, + "learning_rate": 1.9255398003445887e-06, + "loss": 2.2249, + "step": 7165 + }, + { + "epoch": 0.21141088014153028, + "grad_norm": 16.75, + "learning_rate": 1.9253448140640392e-06, + "loss": 2.3032, + "step": 7170 + }, + { + "epoch": 0.21155830753353974, + "grad_norm": 14.625, + "learning_rate": 1.925149582714192e-06, + "loss": 2.1457, + "step": 7175 + }, + { + "epoch": 0.21170573492554917, + "grad_norm": 14.75, + "learning_rate": 1.9249541063467524e-06, + "loss": 2.1607, + "step": 7180 + }, + { + "epoch": 0.2118531623175586, + "grad_norm": 16.375, + "learning_rate": 1.92475838501349e-06, + "loss": 2.1474, + "step": 7185 + }, + { + "epoch": 0.21200058970956803, + "grad_norm": 14.875, + "learning_rate": 1.92456241876624e-06, + "loss": 2.2473, + "step": 7190 + }, + { + "epoch": 0.21214801710157746, + "grad_norm": 13.75, + "learning_rate": 1.9243662076569034e-06, + "loss": 2.3273, + "step": 7195 + }, + { + "epoch": 0.21229544449358692, + "grad_norm": 26.5, + "learning_rate": 1.924169751737443e-06, + "loss": 2.253, + "step": 7200 + }, + { + "epoch": 0.21244287188559635, + "grad_norm": 14.6875, + "learning_rate": 1.9239730510598906e-06, + "loss": 2.1251, + "step": 7205 + }, + { + "epoch": 0.21259029927760578, + "grad_norm": 14.25, + "learning_rate": 1.923776105676339e-06, + "loss": 2.1633, + "step": 7210 + }, + { + "epoch": 0.2127377266696152, + "grad_norm": 14.875, + "learning_rate": 1.923578915638948e-06, + "loss": 2.2504, + "step": 7215 + }, + { + "epoch": 0.21288515406162464, + "grad_norm": 14.375, + "learning_rate": 1.9233814809999417e-06, + "loss": 2.0792, + "step": 7220 + }, + { + "epoch": 0.21303258145363407, + "grad_norm": 13.25, + "learning_rate": 1.9231838018116084e-06, + "loss": 2.0842, + "step": 7225 + }, + { + "epoch": 0.21318000884564353, + "grad_norm": 15.875, + "learning_rate": 1.922985878126302e-06, + "loss": 2.373, + "step": 7230 + }, + { + "epoch": 0.21332743623765296, + "grad_norm": 14.0, + "learning_rate": 1.9227877099964413e-06, + "loss": 2.2181, + "step": 7235 + }, + { + "epoch": 0.2134748636296624, + "grad_norm": 14.5625, + "learning_rate": 1.9225892974745083e-06, + "loss": 2.2031, + "step": 7240 + }, + { + "epoch": 0.21362229102167182, + "grad_norm": 13.6875, + "learning_rate": 1.9223906406130515e-06, + "loss": 2.0738, + "step": 7245 + }, + { + "epoch": 0.21376971841368125, + "grad_norm": 13.375, + "learning_rate": 1.9221917394646833e-06, + "loss": 2.1385, + "step": 7250 + }, + { + "epoch": 0.21391714580569068, + "grad_norm": 13.875, + "learning_rate": 1.9219925940820813e-06, + "loss": 2.2051, + "step": 7255 + }, + { + "epoch": 0.21406457319770014, + "grad_norm": 13.875, + "learning_rate": 1.9217932045179864e-06, + "loss": 2.206, + "step": 7260 + }, + { + "epoch": 0.21421200058970957, + "grad_norm": 13.3125, + "learning_rate": 1.921593570825206e-06, + "loss": 2.2147, + "step": 7265 + }, + { + "epoch": 0.214359427981719, + "grad_norm": 16.125, + "learning_rate": 1.921393693056611e-06, + "loss": 2.1987, + "step": 7270 + }, + { + "epoch": 0.21450685537372843, + "grad_norm": 19.375, + "learning_rate": 1.921193571265137e-06, + "loss": 2.268, + "step": 7275 + }, + { + "epoch": 0.21465428276573786, + "grad_norm": 13.875, + "learning_rate": 1.9209932055037844e-06, + "loss": 2.2881, + "step": 7280 + }, + { + "epoch": 0.21480171015774732, + "grad_norm": 16.125, + "learning_rate": 1.920792595825619e-06, + "loss": 2.1479, + "step": 7285 + }, + { + "epoch": 0.21494913754975675, + "grad_norm": 16.0, + "learning_rate": 1.92059174228377e-06, + "loss": 2.2282, + "step": 7290 + }, + { + "epoch": 0.21509656494176618, + "grad_norm": 14.5, + "learning_rate": 1.9203906449314315e-06, + "loss": 2.1761, + "step": 7295 + }, + { + "epoch": 0.2152439923337756, + "grad_norm": 15.9375, + "learning_rate": 1.920189303821862e-06, + "loss": 2.2016, + "step": 7300 + }, + { + "epoch": 0.21539141972578504, + "grad_norm": 17.25, + "learning_rate": 1.9199877190083863e-06, + "loss": 2.0975, + "step": 7305 + }, + { + "epoch": 0.21553884711779447, + "grad_norm": 16.375, + "learning_rate": 1.9197858905443916e-06, + "loss": 2.2305, + "step": 7310 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 13.1875, + "learning_rate": 1.91958381848333e-06, + "loss": 2.1979, + "step": 7315 + }, + { + "epoch": 0.21583370190181336, + "grad_norm": 15.6875, + "learning_rate": 1.919381502878718e-06, + "loss": 2.2607, + "step": 7320 + }, + { + "epoch": 0.2159811292938228, + "grad_norm": 11.9375, + "learning_rate": 1.9191789437841384e-06, + "loss": 2.1914, + "step": 7325 + }, + { + "epoch": 0.21612855668583222, + "grad_norm": 15.1875, + "learning_rate": 1.9189761412532365e-06, + "loss": 2.2348, + "step": 7330 + }, + { + "epoch": 0.21627598407784165, + "grad_norm": 14.0, + "learning_rate": 1.9187730953397225e-06, + "loss": 2.2635, + "step": 7335 + }, + { + "epoch": 0.21642341146985108, + "grad_norm": 14.1875, + "learning_rate": 1.918569806097372e-06, + "loss": 2.2494, + "step": 7340 + }, + { + "epoch": 0.21657083886186054, + "grad_norm": 16.375, + "learning_rate": 1.9183662735800237e-06, + "loss": 2.2639, + "step": 7345 + }, + { + "epoch": 0.21671826625386997, + "grad_norm": 16.125, + "learning_rate": 1.9181624978415814e-06, + "loss": 2.2761, + "step": 7350 + }, + { + "epoch": 0.2168656936458794, + "grad_norm": 15.5625, + "learning_rate": 1.917958478936014e-06, + "loss": 2.3698, + "step": 7355 + }, + { + "epoch": 0.21701312103788883, + "grad_norm": 15.5, + "learning_rate": 1.917754216917353e-06, + "loss": 2.2325, + "step": 7360 + }, + { + "epoch": 0.21716054842989826, + "grad_norm": 19.375, + "learning_rate": 1.917549711839696e-06, + "loss": 2.2518, + "step": 7365 + }, + { + "epoch": 0.21730797582190772, + "grad_norm": 14.75, + "learning_rate": 1.9173449637572042e-06, + "loss": 2.2386, + "step": 7370 + }, + { + "epoch": 0.21745540321391715, + "grad_norm": 13.25, + "learning_rate": 1.9171399727241035e-06, + "loss": 2.2323, + "step": 7375 + }, + { + "epoch": 0.21760283060592658, + "grad_norm": 15.6875, + "learning_rate": 1.9169347387946836e-06, + "loss": 2.207, + "step": 7380 + }, + { + "epoch": 0.21775025799793601, + "grad_norm": 19.0, + "learning_rate": 1.916729262023299e-06, + "loss": 2.2985, + "step": 7385 + }, + { + "epoch": 0.21789768538994544, + "grad_norm": 15.875, + "learning_rate": 1.916523542464369e-06, + "loss": 2.156, + "step": 7390 + }, + { + "epoch": 0.21804511278195488, + "grad_norm": 13.3125, + "learning_rate": 1.916317580172376e-06, + "loss": 2.1725, + "step": 7395 + }, + { + "epoch": 0.21819254017396433, + "grad_norm": 14.0625, + "learning_rate": 1.9161113752018666e-06, + "loss": 2.2422, + "step": 7400 + }, + { + "epoch": 0.21833996756597376, + "grad_norm": 15.4375, + "learning_rate": 1.915904927607454e-06, + "loss": 2.2404, + "step": 7405 + }, + { + "epoch": 0.2184873949579832, + "grad_norm": 14.5625, + "learning_rate": 1.915698237443812e-06, + "loss": 2.3074, + "step": 7410 + }, + { + "epoch": 0.21863482234999262, + "grad_norm": 15.25, + "learning_rate": 1.915491304765682e-06, + "loss": 2.1644, + "step": 7415 + }, + { + "epoch": 0.21878224974200206, + "grad_norm": 20.625, + "learning_rate": 1.9152841296278687e-06, + "loss": 2.2966, + "step": 7420 + }, + { + "epoch": 0.21892967713401149, + "grad_norm": 13.4375, + "learning_rate": 1.915076712085239e-06, + "loss": 2.223, + "step": 7425 + }, + { + "epoch": 0.21907710452602094, + "grad_norm": 17.625, + "learning_rate": 1.9148690521927267e-06, + "loss": 2.2195, + "step": 7430 + }, + { + "epoch": 0.21922453191803037, + "grad_norm": 15.0, + "learning_rate": 1.9146611500053282e-06, + "loss": 2.2043, + "step": 7435 + }, + { + "epoch": 0.2193719593100398, + "grad_norm": 15.0, + "learning_rate": 1.914453005578105e-06, + "loss": 2.1418, + "step": 7440 + }, + { + "epoch": 0.21951938670204924, + "grad_norm": 18.0, + "learning_rate": 1.9142446189661818e-06, + "loss": 2.1438, + "step": 7445 + }, + { + "epoch": 0.21966681409405867, + "grad_norm": 15.25, + "learning_rate": 1.9140359902247485e-06, + "loss": 2.3102, + "step": 7450 + }, + { + "epoch": 0.21981424148606812, + "grad_norm": 13.75, + "learning_rate": 1.9138271194090576e-06, + "loss": 2.1613, + "step": 7455 + }, + { + "epoch": 0.21996166887807755, + "grad_norm": 16.5, + "learning_rate": 1.9136180065744278e-06, + "loss": 2.2573, + "step": 7460 + }, + { + "epoch": 0.22010909627008698, + "grad_norm": 14.1875, + "learning_rate": 1.91340865177624e-06, + "loss": 2.1977, + "step": 7465 + }, + { + "epoch": 0.22025652366209642, + "grad_norm": 13.75, + "learning_rate": 1.91319905506994e-06, + "loss": 2.2246, + "step": 7470 + }, + { + "epoch": 0.22040395105410585, + "grad_norm": 14.125, + "learning_rate": 1.9129892165110383e-06, + "loss": 2.2511, + "step": 7475 + }, + { + "epoch": 0.22055137844611528, + "grad_norm": 13.375, + "learning_rate": 1.9127791361551077e-06, + "loss": 2.1218, + "step": 7480 + }, + { + "epoch": 0.22069880583812473, + "grad_norm": 14.0, + "learning_rate": 1.912568814057787e-06, + "loss": 2.1278, + "step": 7485 + }, + { + "epoch": 0.22084623323013416, + "grad_norm": 14.875, + "learning_rate": 1.9123582502747776e-06, + "loss": 2.177, + "step": 7490 + }, + { + "epoch": 0.2209936606221436, + "grad_norm": 18.0, + "learning_rate": 1.9121474448618455e-06, + "loss": 2.1431, + "step": 7495 + }, + { + "epoch": 0.22114108801415303, + "grad_norm": 16.0, + "learning_rate": 1.911936397874821e-06, + "loss": 2.2285, + "step": 7500 + }, + { + "epoch": 0.22114108801415303, + "eval_loss": 2.1763086318969727, + "eval_runtime": 4.7087, + "eval_samples_per_second": 84.099, + "eval_steps_per_second": 2.761, + "step": 7500 + }, + { + "epoch": 0.22128851540616246, + "grad_norm": 15.6875, + "learning_rate": 1.911725109369598e-06, + "loss": 2.2166, + "step": 7505 + }, + { + "epoch": 0.22143594279817191, + "grad_norm": 27.875, + "learning_rate": 1.9115135794021336e-06, + "loss": 2.2402, + "step": 7510 + }, + { + "epoch": 0.22158337019018134, + "grad_norm": 14.4375, + "learning_rate": 1.91130180802845e-06, + "loss": 2.204, + "step": 7515 + }, + { + "epoch": 0.22173079758219078, + "grad_norm": 14.1875, + "learning_rate": 1.911089795304634e-06, + "loss": 2.1249, + "step": 7520 + }, + { + "epoch": 0.2218782249742002, + "grad_norm": 13.8125, + "learning_rate": 1.9108775412868333e-06, + "loss": 2.1547, + "step": 7525 + }, + { + "epoch": 0.22202565236620964, + "grad_norm": 15.5625, + "learning_rate": 1.9106650460312634e-06, + "loss": 2.2753, + "step": 7530 + }, + { + "epoch": 0.22217307975821907, + "grad_norm": 14.75, + "learning_rate": 1.9104523095942003e-06, + "loss": 2.1348, + "step": 7535 + }, + { + "epoch": 0.22232050715022852, + "grad_norm": 15.25, + "learning_rate": 1.910239332031986e-06, + "loss": 2.2014, + "step": 7540 + }, + { + "epoch": 0.22246793454223796, + "grad_norm": 15.4375, + "learning_rate": 1.910026113401026e-06, + "loss": 2.2529, + "step": 7545 + }, + { + "epoch": 0.22261536193424739, + "grad_norm": 25.5, + "learning_rate": 1.909812653757789e-06, + "loss": 2.2115, + "step": 7550 + }, + { + "epoch": 0.22276278932625682, + "grad_norm": 14.9375, + "learning_rate": 1.909598953158808e-06, + "loss": 2.228, + "step": 7555 + }, + { + "epoch": 0.22291021671826625, + "grad_norm": 14.3125, + "learning_rate": 1.909385011660679e-06, + "loss": 2.2131, + "step": 7560 + }, + { + "epoch": 0.22305764411027568, + "grad_norm": 14.25, + "learning_rate": 1.9091708293200635e-06, + "loss": 2.1549, + "step": 7565 + }, + { + "epoch": 0.22320507150228514, + "grad_norm": 13.875, + "learning_rate": 1.908956406193685e-06, + "loss": 2.1435, + "step": 7570 + }, + { + "epoch": 0.22335249889429457, + "grad_norm": 15.0625, + "learning_rate": 1.908741742338332e-06, + "loss": 2.2743, + "step": 7575 + }, + { + "epoch": 0.223499926286304, + "grad_norm": 23.625, + "learning_rate": 1.908526837810857e-06, + "loss": 2.2482, + "step": 7580 + }, + { + "epoch": 0.22364735367831343, + "grad_norm": 15.375, + "learning_rate": 1.9083116926681735e-06, + "loss": 2.348, + "step": 7585 + }, + { + "epoch": 0.22379478107032286, + "grad_norm": 15.625, + "learning_rate": 1.908096306967263e-06, + "loss": 2.1831, + "step": 7590 + }, + { + "epoch": 0.22394220846233232, + "grad_norm": 14.0, + "learning_rate": 1.907880680765167e-06, + "loss": 2.2471, + "step": 7595 + }, + { + "epoch": 0.22408963585434175, + "grad_norm": 15.5, + "learning_rate": 1.9076648141189925e-06, + "loss": 2.1452, + "step": 7600 + }, + { + "epoch": 0.22423706324635118, + "grad_norm": 14.375, + "learning_rate": 1.9074487070859102e-06, + "loss": 2.2139, + "step": 7605 + }, + { + "epoch": 0.2243844906383606, + "grad_norm": 13.375, + "learning_rate": 1.907232359723154e-06, + "loss": 2.0523, + "step": 7610 + }, + { + "epoch": 0.22453191803037004, + "grad_norm": 14.1875, + "learning_rate": 1.9070157720880213e-06, + "loss": 2.2727, + "step": 7615 + }, + { + "epoch": 0.22467934542237947, + "grad_norm": 15.625, + "learning_rate": 1.9067989442378738e-06, + "loss": 2.2582, + "step": 7620 + }, + { + "epoch": 0.22482677281438893, + "grad_norm": 14.8125, + "learning_rate": 1.9065818762301363e-06, + "loss": 2.1734, + "step": 7625 + }, + { + "epoch": 0.22497420020639836, + "grad_norm": 14.4375, + "learning_rate": 1.906364568122297e-06, + "loss": 2.1759, + "step": 7630 + }, + { + "epoch": 0.2251216275984078, + "grad_norm": 14.625, + "learning_rate": 1.9061470199719083e-06, + "loss": 2.1728, + "step": 7635 + }, + { + "epoch": 0.22526905499041722, + "grad_norm": 16.625, + "learning_rate": 1.9059292318365855e-06, + "loss": 2.3187, + "step": 7640 + }, + { + "epoch": 0.22541648238242665, + "grad_norm": 15.0, + "learning_rate": 1.9057112037740084e-06, + "loss": 2.1401, + "step": 7645 + }, + { + "epoch": 0.22556390977443608, + "grad_norm": 15.0, + "learning_rate": 1.9054929358419195e-06, + "loss": 2.1785, + "step": 7650 + }, + { + "epoch": 0.22571133716644554, + "grad_norm": 15.875, + "learning_rate": 1.9052744280981251e-06, + "loss": 2.3024, + "step": 7655 + }, + { + "epoch": 0.22585876455845497, + "grad_norm": 15.3125, + "learning_rate": 1.9050556806004955e-06, + "loss": 2.2835, + "step": 7660 + }, + { + "epoch": 0.2260061919504644, + "grad_norm": 14.4375, + "learning_rate": 1.904836693406963e-06, + "loss": 2.2379, + "step": 7665 + }, + { + "epoch": 0.22615361934247383, + "grad_norm": 13.5, + "learning_rate": 1.9046174665755252e-06, + "loss": 2.1806, + "step": 7670 + }, + { + "epoch": 0.22630104673448326, + "grad_norm": 15.0, + "learning_rate": 1.904398000164242e-06, + "loss": 2.2357, + "step": 7675 + }, + { + "epoch": 0.22644847412649272, + "grad_norm": 19.125, + "learning_rate": 1.9041782942312374e-06, + "loss": 2.1981, + "step": 7680 + }, + { + "epoch": 0.22659590151850215, + "grad_norm": 15.0, + "learning_rate": 1.9039583488346987e-06, + "loss": 2.2131, + "step": 7685 + }, + { + "epoch": 0.22674332891051158, + "grad_norm": 15.375, + "learning_rate": 1.9037381640328757e-06, + "loss": 2.1719, + "step": 7690 + }, + { + "epoch": 0.226890756302521, + "grad_norm": 15.0625, + "learning_rate": 1.9035177398840832e-06, + "loss": 2.308, + "step": 7695 + }, + { + "epoch": 0.22703818369453044, + "grad_norm": 10.875, + "learning_rate": 1.9032970764466981e-06, + "loss": 2.139, + "step": 7700 + }, + { + "epoch": 0.22718561108653987, + "grad_norm": 16.625, + "learning_rate": 1.9030761737791612e-06, + "loss": 2.1771, + "step": 7705 + }, + { + "epoch": 0.22733303847854933, + "grad_norm": 16.0, + "learning_rate": 1.9028550319399765e-06, + "loss": 2.1578, + "step": 7710 + }, + { + "epoch": 0.22748046587055876, + "grad_norm": 15.9375, + "learning_rate": 1.9026336509877119e-06, + "loss": 2.174, + "step": 7715 + }, + { + "epoch": 0.2276278932625682, + "grad_norm": 13.625, + "learning_rate": 1.9024120309809978e-06, + "loss": 2.1622, + "step": 7720 + }, + { + "epoch": 0.22777532065457762, + "grad_norm": 19.375, + "learning_rate": 1.9021901719785282e-06, + "loss": 2.2129, + "step": 7725 + }, + { + "epoch": 0.22792274804658705, + "grad_norm": 17.0, + "learning_rate": 1.9019680740390607e-06, + "loss": 2.2368, + "step": 7730 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 13.25, + "learning_rate": 1.9017457372214158e-06, + "loss": 2.1988, + "step": 7735 + }, + { + "epoch": 0.22821760283060594, + "grad_norm": 12.625, + "learning_rate": 1.9015231615844773e-06, + "loss": 2.1546, + "step": 7740 + }, + { + "epoch": 0.22836503022261537, + "grad_norm": 16.5, + "learning_rate": 1.9013003471871932e-06, + "loss": 2.2133, + "step": 7745 + }, + { + "epoch": 0.2285124576146248, + "grad_norm": 13.875, + "learning_rate": 1.9010772940885727e-06, + "loss": 2.1367, + "step": 7750 + }, + { + "epoch": 0.22865988500663423, + "grad_norm": 14.3125, + "learning_rate": 1.90085400234769e-06, + "loss": 2.2353, + "step": 7755 + }, + { + "epoch": 0.22880731239864366, + "grad_norm": 14.1875, + "learning_rate": 1.9006304720236818e-06, + "loss": 2.1319, + "step": 7760 + }, + { + "epoch": 0.22895473979065312, + "grad_norm": 15.625, + "learning_rate": 1.9004067031757484e-06, + "loss": 2.1314, + "step": 7765 + }, + { + "epoch": 0.22910216718266255, + "grad_norm": 18.5, + "learning_rate": 1.9001826958631529e-06, + "loss": 2.1489, + "step": 7770 + }, + { + "epoch": 0.22924959457467198, + "grad_norm": 15.5, + "learning_rate": 1.8999584501452213e-06, + "loss": 2.2059, + "step": 7775 + }, + { + "epoch": 0.2293970219666814, + "grad_norm": 13.8125, + "learning_rate": 1.8997339660813433e-06, + "loss": 2.0898, + "step": 7780 + }, + { + "epoch": 0.22954444935869084, + "grad_norm": 13.375, + "learning_rate": 1.8995092437309714e-06, + "loss": 2.1954, + "step": 7785 + }, + { + "epoch": 0.22969187675070027, + "grad_norm": 15.1875, + "learning_rate": 1.8992842831536215e-06, + "loss": 2.2219, + "step": 7790 + }, + { + "epoch": 0.22983930414270973, + "grad_norm": 20.5, + "learning_rate": 1.8990590844088723e-06, + "loss": 2.333, + "step": 7795 + }, + { + "epoch": 0.22998673153471916, + "grad_norm": 31.5, + "learning_rate": 1.8988336475563654e-06, + "loss": 2.0667, + "step": 7800 + }, + { + "epoch": 0.2301341589267286, + "grad_norm": 14.8125, + "learning_rate": 1.8986079726558064e-06, + "loss": 2.2898, + "step": 7805 + }, + { + "epoch": 0.23028158631873802, + "grad_norm": 15.6875, + "learning_rate": 1.8983820597669626e-06, + "loss": 2.2644, + "step": 7810 + }, + { + "epoch": 0.23042901371074745, + "grad_norm": 13.625, + "learning_rate": 1.8981559089496652e-06, + "loss": 2.2789, + "step": 7815 + }, + { + "epoch": 0.23057644110275688, + "grad_norm": 14.0625, + "learning_rate": 1.8979295202638086e-06, + "loss": 2.1742, + "step": 7820 + }, + { + "epoch": 0.23072386849476634, + "grad_norm": 13.9375, + "learning_rate": 1.8977028937693493e-06, + "loss": 2.2069, + "step": 7825 + }, + { + "epoch": 0.23087129588677577, + "grad_norm": 15.625, + "learning_rate": 1.8974760295263075e-06, + "loss": 2.1841, + "step": 7830 + }, + { + "epoch": 0.2310187232787852, + "grad_norm": 17.875, + "learning_rate": 1.8972489275947662e-06, + "loss": 2.2387, + "step": 7835 + }, + { + "epoch": 0.23116615067079463, + "grad_norm": 13.8125, + "learning_rate": 1.8970215880348713e-06, + "loss": 2.2129, + "step": 7840 + }, + { + "epoch": 0.23131357806280406, + "grad_norm": 17.75, + "learning_rate": 1.8967940109068316e-06, + "loss": 2.1558, + "step": 7845 + }, + { + "epoch": 0.23146100545481352, + "grad_norm": 14.625, + "learning_rate": 1.896566196270919e-06, + "loss": 2.2171, + "step": 7850 + }, + { + "epoch": 0.23160843284682295, + "grad_norm": 15.875, + "learning_rate": 1.896338144187468e-06, + "loss": 2.2006, + "step": 7855 + }, + { + "epoch": 0.23175586023883238, + "grad_norm": 16.125, + "learning_rate": 1.896109854716876e-06, + "loss": 2.1893, + "step": 7860 + }, + { + "epoch": 0.2319032876308418, + "grad_norm": 16.125, + "learning_rate": 1.895881327919604e-06, + "loss": 2.1611, + "step": 7865 + }, + { + "epoch": 0.23205071502285124, + "grad_norm": 15.3125, + "learning_rate": 1.8956525638561749e-06, + "loss": 2.1714, + "step": 7870 + }, + { + "epoch": 0.23219814241486067, + "grad_norm": 16.25, + "learning_rate": 1.8954235625871747e-06, + "loss": 2.0627, + "step": 7875 + }, + { + "epoch": 0.23234556980687013, + "grad_norm": 16.625, + "learning_rate": 1.8951943241732526e-06, + "loss": 2.1204, + "step": 7880 + }, + { + "epoch": 0.23249299719887956, + "grad_norm": 13.5, + "learning_rate": 1.8949648486751202e-06, + "loss": 2.2194, + "step": 7885 + }, + { + "epoch": 0.232640424590889, + "grad_norm": 13.5, + "learning_rate": 1.8947351361535523e-06, + "loss": 2.1443, + "step": 7890 + }, + { + "epoch": 0.23278785198289842, + "grad_norm": 13.875, + "learning_rate": 1.8945051866693856e-06, + "loss": 2.244, + "step": 7895 + }, + { + "epoch": 0.23293527937490785, + "grad_norm": 15.9375, + "learning_rate": 1.8942750002835205e-06, + "loss": 2.1937, + "step": 7900 + }, + { + "epoch": 0.23308270676691728, + "grad_norm": 17.375, + "learning_rate": 1.89404457705692e-06, + "loss": 2.3146, + "step": 7905 + }, + { + "epoch": 0.23323013415892674, + "grad_norm": 15.0625, + "learning_rate": 1.8938139170506095e-06, + "loss": 2.1182, + "step": 7910 + }, + { + "epoch": 0.23337756155093617, + "grad_norm": 16.625, + "learning_rate": 1.8935830203256772e-06, + "loss": 2.272, + "step": 7915 + }, + { + "epoch": 0.2335249889429456, + "grad_norm": 16.25, + "learning_rate": 1.893351886943274e-06, + "loss": 2.1908, + "step": 7920 + }, + { + "epoch": 0.23367241633495503, + "grad_norm": 15.4375, + "learning_rate": 1.8931205169646136e-06, + "loss": 2.2338, + "step": 7925 + }, + { + "epoch": 0.23381984372696446, + "grad_norm": 12.0625, + "learning_rate": 1.8928889104509721e-06, + "loss": 2.1454, + "step": 7930 + }, + { + "epoch": 0.23396727111897392, + "grad_norm": 14.5625, + "learning_rate": 1.8926570674636888e-06, + "loss": 2.2695, + "step": 7935 + }, + { + "epoch": 0.23411469851098335, + "grad_norm": 22.25, + "learning_rate": 1.8924249880641647e-06, + "loss": 2.1593, + "step": 7940 + }, + { + "epoch": 0.23426212590299278, + "grad_norm": 14.125, + "learning_rate": 1.8921926723138644e-06, + "loss": 2.1392, + "step": 7945 + }, + { + "epoch": 0.2344095532950022, + "grad_norm": 22.875, + "learning_rate": 1.8919601202743146e-06, + "loss": 2.1933, + "step": 7950 + }, + { + "epoch": 0.23455698068701164, + "grad_norm": 14.8125, + "learning_rate": 1.8917273320071044e-06, + "loss": 2.1975, + "step": 7955 + }, + { + "epoch": 0.23470440807902107, + "grad_norm": 16.0, + "learning_rate": 1.8914943075738856e-06, + "loss": 2.1972, + "step": 7960 + }, + { + "epoch": 0.23485183547103053, + "grad_norm": 15.75, + "learning_rate": 1.891261047036373e-06, + "loss": 2.2886, + "step": 7965 + }, + { + "epoch": 0.23499926286303996, + "grad_norm": 14.5625, + "learning_rate": 1.8910275504563434e-06, + "loss": 2.2055, + "step": 7970 + }, + { + "epoch": 0.2351466902550494, + "grad_norm": 16.625, + "learning_rate": 1.890793817895636e-06, + "loss": 2.1969, + "step": 7975 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 16.0, + "learning_rate": 1.8905598494161535e-06, + "loss": 2.1731, + "step": 7980 + }, + { + "epoch": 0.23544154503906825, + "grad_norm": 14.625, + "learning_rate": 1.8903256450798594e-06, + "loss": 2.2552, + "step": 7985 + }, + { + "epoch": 0.23558897243107768, + "grad_norm": 15.75, + "learning_rate": 1.890091204948782e-06, + "loss": 2.2014, + "step": 7990 + }, + { + "epoch": 0.23573639982308714, + "grad_norm": 16.375, + "learning_rate": 1.889856529085009e-06, + "loss": 2.2227, + "step": 7995 + }, + { + "epoch": 0.23588382721509657, + "grad_norm": 13.875, + "learning_rate": 1.8896216175506932e-06, + "loss": 2.1592, + "step": 8000 + }, + { + "epoch": 0.23588382721509657, + "eval_loss": 2.1633877754211426, + "eval_runtime": 4.717, + "eval_samples_per_second": 83.952, + "eval_steps_per_second": 2.756, + "step": 8000 + }, + { + "epoch": 0.236031254607106, + "grad_norm": 11.625, + "learning_rate": 1.889386470408049e-06, + "loss": 2.1509, + "step": 8005 + }, + { + "epoch": 0.23617868199911543, + "grad_norm": 13.3125, + "learning_rate": 1.8891510877193522e-06, + "loss": 2.2325, + "step": 8010 + }, + { + "epoch": 0.23632610939112486, + "grad_norm": 14.25, + "learning_rate": 1.8889154695469424e-06, + "loss": 2.2458, + "step": 8015 + }, + { + "epoch": 0.23647353678313432, + "grad_norm": 15.5625, + "learning_rate": 1.888679615953221e-06, + "loss": 2.2179, + "step": 8020 + }, + { + "epoch": 0.23662096417514375, + "grad_norm": 13.5, + "learning_rate": 1.8884435270006516e-06, + "loss": 2.1758, + "step": 8025 + }, + { + "epoch": 0.23676839156715318, + "grad_norm": 15.875, + "learning_rate": 1.8882072027517598e-06, + "loss": 2.1682, + "step": 8030 + }, + { + "epoch": 0.2369158189591626, + "grad_norm": 14.5625, + "learning_rate": 1.8879706432691344e-06, + "loss": 2.1355, + "step": 8035 + }, + { + "epoch": 0.23706324635117204, + "grad_norm": 18.875, + "learning_rate": 1.8877338486154263e-06, + "loss": 2.1882, + "step": 8040 + }, + { + "epoch": 0.23721067374318147, + "grad_norm": 17.125, + "learning_rate": 1.8874968188533482e-06, + "loss": 2.2153, + "step": 8045 + }, + { + "epoch": 0.23735810113519093, + "grad_norm": 14.625, + "learning_rate": 1.887259554045675e-06, + "loss": 2.1607, + "step": 8050 + }, + { + "epoch": 0.23750552852720036, + "grad_norm": 15.5625, + "learning_rate": 1.8870220542552445e-06, + "loss": 2.1877, + "step": 8055 + }, + { + "epoch": 0.2376529559192098, + "grad_norm": 53.5, + "learning_rate": 1.8867843195449563e-06, + "loss": 2.1403, + "step": 8060 + }, + { + "epoch": 0.23780038331121922, + "grad_norm": 12.125, + "learning_rate": 1.8865463499777724e-06, + "loss": 2.1845, + "step": 8065 + }, + { + "epoch": 0.23794781070322865, + "grad_norm": 14.75, + "learning_rate": 1.886308145616717e-06, + "loss": 2.1896, + "step": 8070 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 16.25, + "learning_rate": 1.8860697065248759e-06, + "loss": 2.2754, + "step": 8075 + }, + { + "epoch": 0.23824266548724754, + "grad_norm": 15.8125, + "learning_rate": 1.8858310327653982e-06, + "loss": 2.178, + "step": 8080 + }, + { + "epoch": 0.23839009287925697, + "grad_norm": 14.625, + "learning_rate": 1.885592124401494e-06, + "loss": 2.2494, + "step": 8085 + }, + { + "epoch": 0.2385375202712664, + "grad_norm": 16.0, + "learning_rate": 1.8853529814964365e-06, + "loss": 2.1538, + "step": 8090 + }, + { + "epoch": 0.23868494766327583, + "grad_norm": 15.9375, + "learning_rate": 1.88511360411356e-06, + "loss": 2.2637, + "step": 8095 + }, + { + "epoch": 0.23883237505528526, + "grad_norm": 15.6875, + "learning_rate": 1.884873992316262e-06, + "loss": 2.2347, + "step": 8100 + }, + { + "epoch": 0.23897980244729472, + "grad_norm": 15.8125, + "learning_rate": 1.8846341461680013e-06, + "loss": 2.2894, + "step": 8105 + }, + { + "epoch": 0.23912722983930415, + "grad_norm": 15.375, + "learning_rate": 1.884394065732299e-06, + "loss": 2.2071, + "step": 8110 + }, + { + "epoch": 0.23927465723131358, + "grad_norm": 16.625, + "learning_rate": 1.8841537510727383e-06, + "loss": 2.2701, + "step": 8115 + }, + { + "epoch": 0.239422084623323, + "grad_norm": 27.75, + "learning_rate": 1.8839132022529642e-06, + "loss": 2.1806, + "step": 8120 + }, + { + "epoch": 0.23956951201533244, + "grad_norm": 15.25, + "learning_rate": 1.8836724193366844e-06, + "loss": 2.1222, + "step": 8125 + }, + { + "epoch": 0.23971693940734187, + "grad_norm": 21.0, + "learning_rate": 1.8834314023876675e-06, + "loss": 2.2144, + "step": 8130 + }, + { + "epoch": 0.23986436679935133, + "grad_norm": 16.5, + "learning_rate": 1.8831901514697452e-06, + "loss": 2.1489, + "step": 8135 + }, + { + "epoch": 0.24001179419136076, + "grad_norm": 13.75, + "learning_rate": 1.8829486666468104e-06, + "loss": 2.1968, + "step": 8140 + }, + { + "epoch": 0.2401592215833702, + "grad_norm": 16.0, + "learning_rate": 1.882706947982818e-06, + "loss": 2.148, + "step": 8145 + }, + { + "epoch": 0.24030664897537962, + "grad_norm": 15.625, + "learning_rate": 1.8824649955417853e-06, + "loss": 2.0118, + "step": 8150 + }, + { + "epoch": 0.24045407636738905, + "grad_norm": 14.5, + "learning_rate": 1.8822228093877912e-06, + "loss": 2.2747, + "step": 8155 + }, + { + "epoch": 0.24060150375939848, + "grad_norm": 15.6875, + "learning_rate": 1.8819803895849767e-06, + "loss": 2.2399, + "step": 8160 + }, + { + "epoch": 0.24074893115140794, + "grad_norm": 16.75, + "learning_rate": 1.881737736197544e-06, + "loss": 2.2886, + "step": 8165 + }, + { + "epoch": 0.24089635854341737, + "grad_norm": 22.625, + "learning_rate": 1.881494849289758e-06, + "loss": 2.1599, + "step": 8170 + }, + { + "epoch": 0.2410437859354268, + "grad_norm": 13.875, + "learning_rate": 1.8812517289259454e-06, + "loss": 2.1261, + "step": 8175 + }, + { + "epoch": 0.24119121332743623, + "grad_norm": 14.75, + "learning_rate": 1.881008375170494e-06, + "loss": 2.2204, + "step": 8180 + }, + { + "epoch": 0.24133864071944566, + "grad_norm": 14.0, + "learning_rate": 1.880764788087854e-06, + "loss": 2.1926, + "step": 8185 + }, + { + "epoch": 0.24148606811145512, + "grad_norm": 15.75, + "learning_rate": 1.8805209677425374e-06, + "loss": 2.1834, + "step": 8190 + }, + { + "epoch": 0.24163349550346455, + "grad_norm": 15.125, + "learning_rate": 1.8802769141991177e-06, + "loss": 2.2266, + "step": 8195 + }, + { + "epoch": 0.24178092289547398, + "grad_norm": 15.625, + "learning_rate": 1.8800326275222304e-06, + "loss": 2.3082, + "step": 8200 + }, + { + "epoch": 0.2419283502874834, + "grad_norm": 15.5, + "learning_rate": 1.8797881077765724e-06, + "loss": 2.2098, + "step": 8205 + }, + { + "epoch": 0.24207577767949284, + "grad_norm": 14.5, + "learning_rate": 1.8795433550269028e-06, + "loss": 2.139, + "step": 8210 + }, + { + "epoch": 0.24222320507150227, + "grad_norm": 25.625, + "learning_rate": 1.8792983693380424e-06, + "loss": 2.2472, + "step": 8215 + }, + { + "epoch": 0.24237063246351173, + "grad_norm": 15.5, + "learning_rate": 1.879053150774873e-06, + "loss": 2.2395, + "step": 8220 + }, + { + "epoch": 0.24251805985552116, + "grad_norm": 14.25, + "learning_rate": 1.8788076994023387e-06, + "loss": 2.1807, + "step": 8225 + }, + { + "epoch": 0.2426654872475306, + "grad_norm": 14.625, + "learning_rate": 1.8785620152854453e-06, + "loss": 2.2557, + "step": 8230 + }, + { + "epoch": 0.24281291463954002, + "grad_norm": 14.8125, + "learning_rate": 1.8783160984892598e-06, + "loss": 2.1958, + "step": 8235 + }, + { + "epoch": 0.24296034203154945, + "grad_norm": 14.5, + "learning_rate": 1.878069949078911e-06, + "loss": 2.2767, + "step": 8240 + }, + { + "epoch": 0.24310776942355888, + "grad_norm": 14.25, + "learning_rate": 1.8778235671195897e-06, + "loss": 2.3034, + "step": 8245 + }, + { + "epoch": 0.24325519681556834, + "grad_norm": 15.375, + "learning_rate": 1.877576952676548e-06, + "loss": 2.2439, + "step": 8250 + }, + { + "epoch": 0.24340262420757777, + "grad_norm": 16.5, + "learning_rate": 1.877330105815099e-06, + "loss": 2.1794, + "step": 8255 + }, + { + "epoch": 0.2435500515995872, + "grad_norm": 16.0, + "learning_rate": 1.8770830266006182e-06, + "loss": 2.212, + "step": 8260 + }, + { + "epoch": 0.24369747899159663, + "grad_norm": 15.5, + "learning_rate": 1.8768357150985424e-06, + "loss": 2.0937, + "step": 8265 + }, + { + "epoch": 0.24384490638360606, + "grad_norm": 16.375, + "learning_rate": 1.8765881713743696e-06, + "loss": 2.2065, + "step": 8270 + }, + { + "epoch": 0.24399233377561552, + "grad_norm": 13.625, + "learning_rate": 1.87634039549366e-06, + "loss": 2.3326, + "step": 8275 + }, + { + "epoch": 0.24413976116762495, + "grad_norm": 14.6875, + "learning_rate": 1.876092387522034e-06, + "loss": 2.1759, + "step": 8280 + }, + { + "epoch": 0.24428718855963438, + "grad_norm": 17.0, + "learning_rate": 1.875844147525175e-06, + "loss": 2.2042, + "step": 8285 + }, + { + "epoch": 0.2444346159516438, + "grad_norm": 25.375, + "learning_rate": 1.875595675568827e-06, + "loss": 2.2385, + "step": 8290 + }, + { + "epoch": 0.24458204334365324, + "grad_norm": 15.125, + "learning_rate": 1.8753469717187956e-06, + "loss": 2.1559, + "step": 8295 + }, + { + "epoch": 0.24472947073566267, + "grad_norm": 15.125, + "learning_rate": 1.8750980360409478e-06, + "loss": 2.1629, + "step": 8300 + }, + { + "epoch": 0.24487689812767213, + "grad_norm": 13.8125, + "learning_rate": 1.8748488686012118e-06, + "loss": 2.136, + "step": 8305 + }, + { + "epoch": 0.24502432551968156, + "grad_norm": 16.0, + "learning_rate": 1.8745994694655775e-06, + "loss": 2.1955, + "step": 8310 + }, + { + "epoch": 0.245171752911691, + "grad_norm": 13.375, + "learning_rate": 1.874349838700096e-06, + "loss": 2.2115, + "step": 8315 + }, + { + "epoch": 0.24531918030370042, + "grad_norm": 16.5, + "learning_rate": 1.8740999763708798e-06, + "loss": 2.1912, + "step": 8320 + }, + { + "epoch": 0.24546660769570985, + "grad_norm": 14.4375, + "learning_rate": 1.8738498825441025e-06, + "loss": 2.247, + "step": 8325 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 37.75, + "learning_rate": 1.8735995572859995e-06, + "loss": 2.2053, + "step": 8330 + }, + { + "epoch": 0.24576146247972874, + "grad_norm": 15.5, + "learning_rate": 1.8733490006628672e-06, + "loss": 2.1562, + "step": 8335 + }, + { + "epoch": 0.24590888987173817, + "grad_norm": 13.0, + "learning_rate": 1.873098212741063e-06, + "loss": 2.1676, + "step": 8340 + }, + { + "epoch": 0.2460563172637476, + "grad_norm": 13.75, + "learning_rate": 1.8728471935870062e-06, + "loss": 2.1155, + "step": 8345 + }, + { + "epoch": 0.24620374465575703, + "grad_norm": 11.8125, + "learning_rate": 1.8725959432671767e-06, + "loss": 2.1968, + "step": 8350 + }, + { + "epoch": 0.24635117204776646, + "grad_norm": 14.1875, + "learning_rate": 1.8723444618481162e-06, + "loss": 2.2811, + "step": 8355 + }, + { + "epoch": 0.24649859943977592, + "grad_norm": 14.375, + "learning_rate": 1.872092749396427e-06, + "loss": 2.177, + "step": 8360 + }, + { + "epoch": 0.24664602683178535, + "grad_norm": 14.9375, + "learning_rate": 1.8718408059787727e-06, + "loss": 2.1862, + "step": 8365 + }, + { + "epoch": 0.24679345422379478, + "grad_norm": 14.3125, + "learning_rate": 1.8715886316618787e-06, + "loss": 2.2133, + "step": 8370 + }, + { + "epoch": 0.24694088161580421, + "grad_norm": 11.5625, + "learning_rate": 1.8713362265125313e-06, + "loss": 2.0847, + "step": 8375 + }, + { + "epoch": 0.24708830900781364, + "grad_norm": 16.75, + "learning_rate": 1.871083590597577e-06, + "loss": 2.2498, + "step": 8380 + }, + { + "epoch": 0.24723573639982308, + "grad_norm": 18.375, + "learning_rate": 1.8708307239839248e-06, + "loss": 2.2175, + "step": 8385 + }, + { + "epoch": 0.24738316379183253, + "grad_norm": 20.5, + "learning_rate": 1.8705776267385436e-06, + "loss": 2.2745, + "step": 8390 + }, + { + "epoch": 0.24753059118384196, + "grad_norm": 15.0625, + "learning_rate": 1.8703242989284647e-06, + "loss": 2.0961, + "step": 8395 + }, + { + "epoch": 0.2476780185758514, + "grad_norm": 17.875, + "learning_rate": 1.8700707406207793e-06, + "loss": 2.2088, + "step": 8400 + }, + { + "epoch": 0.24782544596786082, + "grad_norm": 14.4375, + "learning_rate": 1.8698169518826397e-06, + "loss": 2.1112, + "step": 8405 + }, + { + "epoch": 0.24797287335987026, + "grad_norm": 16.75, + "learning_rate": 1.86956293278126e-06, + "loss": 2.2312, + "step": 8410 + }, + { + "epoch": 0.24812030075187969, + "grad_norm": 15.8125, + "learning_rate": 1.8693086833839148e-06, + "loss": 2.1858, + "step": 8415 + }, + { + "epoch": 0.24826772814388914, + "grad_norm": 14.75, + "learning_rate": 1.86905420375794e-06, + "loss": 2.2127, + "step": 8420 + }, + { + "epoch": 0.24841515553589857, + "grad_norm": 14.75, + "learning_rate": 1.868799493970732e-06, + "loss": 2.1444, + "step": 8425 + }, + { + "epoch": 0.248562582927908, + "grad_norm": 14.625, + "learning_rate": 1.8685445540897483e-06, + "loss": 2.2396, + "step": 8430 + }, + { + "epoch": 0.24871001031991744, + "grad_norm": 31.125, + "learning_rate": 1.8682893841825074e-06, + "loss": 2.1866, + "step": 8435 + }, + { + "epoch": 0.24885743771192687, + "grad_norm": 14.5, + "learning_rate": 1.8680339843165891e-06, + "loss": 2.1701, + "step": 8440 + }, + { + "epoch": 0.24900486510393632, + "grad_norm": 14.5, + "learning_rate": 1.8677783545596338e-06, + "loss": 2.1813, + "step": 8445 + }, + { + "epoch": 0.24915229249594575, + "grad_norm": 15.0625, + "learning_rate": 1.8675224949793424e-06, + "loss": 2.2112, + "step": 8450 + }, + { + "epoch": 0.24929971988795518, + "grad_norm": 15.8125, + "learning_rate": 1.8672664056434773e-06, + "loss": 2.2627, + "step": 8455 + }, + { + "epoch": 0.24944714727996462, + "grad_norm": 14.75, + "learning_rate": 1.8670100866198613e-06, + "loss": 2.2006, + "step": 8460 + }, + { + "epoch": 0.24959457467197405, + "grad_norm": 15.0, + "learning_rate": 1.866753537976378e-06, + "loss": 2.1875, + "step": 8465 + }, + { + "epoch": 0.24974200206398348, + "grad_norm": 13.625, + "learning_rate": 1.8664967597809729e-06, + "loss": 2.3218, + "step": 8470 + }, + { + "epoch": 0.24988942945599293, + "grad_norm": 15.1875, + "learning_rate": 1.8662397521016503e-06, + "loss": 2.2054, + "step": 8475 + }, + { + "epoch": 0.25003685684800236, + "grad_norm": 17.375, + "learning_rate": 1.8659825150064773e-06, + "loss": 2.2152, + "step": 8480 + }, + { + "epoch": 0.25018428424001177, + "grad_norm": 16.25, + "learning_rate": 1.86572504856358e-06, + "loss": 2.0906, + "step": 8485 + }, + { + "epoch": 0.2503317116320212, + "grad_norm": 14.875, + "learning_rate": 1.8654673528411466e-06, + "loss": 2.1785, + "step": 8490 + }, + { + "epoch": 0.2504791390240307, + "grad_norm": 17.375, + "learning_rate": 1.8652094279074255e-06, + "loss": 2.3136, + "step": 8495 + }, + { + "epoch": 0.2506265664160401, + "grad_norm": 14.0625, + "learning_rate": 1.8649512738307258e-06, + "loss": 2.1993, + "step": 8500 + }, + { + "epoch": 0.2506265664160401, + "eval_loss": 2.151705265045166, + "eval_runtime": 4.7212, + "eval_samples_per_second": 83.877, + "eval_steps_per_second": 2.754, + "step": 8500 + }, + { + "epoch": 0.25077399380804954, + "grad_norm": 17.125, + "learning_rate": 1.864692890679417e-06, + "loss": 2.1964, + "step": 8505 + }, + { + "epoch": 0.25092142120005895, + "grad_norm": 16.75, + "learning_rate": 1.86443427852193e-06, + "loss": 2.2787, + "step": 8510 + }, + { + "epoch": 0.2510688485920684, + "grad_norm": 14.625, + "learning_rate": 1.8641754374267558e-06, + "loss": 2.1101, + "step": 8515 + }, + { + "epoch": 0.25121627598407786, + "grad_norm": 14.75, + "learning_rate": 1.863916367462446e-06, + "loss": 2.26, + "step": 8520 + }, + { + "epoch": 0.25136370337608727, + "grad_norm": 14.125, + "learning_rate": 1.8636570686976127e-06, + "loss": 2.1509, + "step": 8525 + }, + { + "epoch": 0.2515111307680967, + "grad_norm": 16.125, + "learning_rate": 1.8633975412009294e-06, + "loss": 2.2034, + "step": 8530 + }, + { + "epoch": 0.2516585581601061, + "grad_norm": 15.125, + "learning_rate": 1.8631377850411293e-06, + "loss": 2.1605, + "step": 8535 + }, + { + "epoch": 0.2518059855521156, + "grad_norm": 15.75, + "learning_rate": 1.8628778002870069e-06, + "loss": 2.1062, + "step": 8540 + }, + { + "epoch": 0.25195341294412504, + "grad_norm": 17.0, + "learning_rate": 1.8626175870074165e-06, + "loss": 2.1868, + "step": 8545 + }, + { + "epoch": 0.25210084033613445, + "grad_norm": 17.125, + "learning_rate": 1.862357145271273e-06, + "loss": 2.2456, + "step": 8550 + }, + { + "epoch": 0.2522482677281439, + "grad_norm": 15.5625, + "learning_rate": 1.8620964751475525e-06, + "loss": 2.2008, + "step": 8555 + }, + { + "epoch": 0.2523956951201533, + "grad_norm": 21.625, + "learning_rate": 1.861835576705291e-06, + "loss": 2.1544, + "step": 8560 + }, + { + "epoch": 0.25254312251216277, + "grad_norm": 14.625, + "learning_rate": 1.8615744500135855e-06, + "loss": 2.18, + "step": 8565 + }, + { + "epoch": 0.25269054990417217, + "grad_norm": 13.5625, + "learning_rate": 1.8613130951415924e-06, + "loss": 2.1628, + "step": 8570 + }, + { + "epoch": 0.2528379772961816, + "grad_norm": 14.625, + "learning_rate": 1.8610515121585296e-06, + "loss": 2.1433, + "step": 8575 + }, + { + "epoch": 0.2529854046881911, + "grad_norm": 12.875, + "learning_rate": 1.8607897011336746e-06, + "loss": 2.1122, + "step": 8580 + }, + { + "epoch": 0.2531328320802005, + "grad_norm": 14.25, + "learning_rate": 1.8605276621363664e-06, + "loss": 2.1748, + "step": 8585 + }, + { + "epoch": 0.25328025947220995, + "grad_norm": 19.625, + "learning_rate": 1.8602653952360032e-06, + "loss": 2.1691, + "step": 8590 + }, + { + "epoch": 0.25342768686421935, + "grad_norm": 14.8125, + "learning_rate": 1.8600029005020442e-06, + "loss": 2.2135, + "step": 8595 + }, + { + "epoch": 0.2535751142562288, + "grad_norm": 14.5625, + "learning_rate": 1.8597401780040088e-06, + "loss": 2.2858, + "step": 8600 + }, + { + "epoch": 0.25372254164823826, + "grad_norm": 21.0, + "learning_rate": 1.8594772278114764e-06, + "loss": 2.2727, + "step": 8605 + }, + { + "epoch": 0.25386996904024767, + "grad_norm": 16.125, + "learning_rate": 1.8592140499940876e-06, + "loss": 2.1415, + "step": 8610 + }, + { + "epoch": 0.2540173964322571, + "grad_norm": 12.5625, + "learning_rate": 1.858950644621542e-06, + "loss": 2.1434, + "step": 8615 + }, + { + "epoch": 0.25416482382426653, + "grad_norm": 13.375, + "learning_rate": 1.858687011763601e-06, + "loss": 2.2154, + "step": 8620 + }, + { + "epoch": 0.254312251216276, + "grad_norm": 15.5, + "learning_rate": 1.8584231514900842e-06, + "loss": 2.2386, + "step": 8625 + }, + { + "epoch": 0.25445967860828544, + "grad_norm": 16.0, + "learning_rate": 1.8581590638708739e-06, + "loss": 2.2126, + "step": 8630 + }, + { + "epoch": 0.25460710600029485, + "grad_norm": 19.125, + "learning_rate": 1.8578947489759105e-06, + "loss": 2.2209, + "step": 8635 + }, + { + "epoch": 0.2547545333923043, + "grad_norm": 14.9375, + "learning_rate": 1.8576302068751958e-06, + "loss": 2.2278, + "step": 8640 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 14.6875, + "learning_rate": 1.8573654376387915e-06, + "loss": 2.0973, + "step": 8645 + }, + { + "epoch": 0.25504938817632317, + "grad_norm": 13.8125, + "learning_rate": 1.857100441336819e-06, + "loss": 2.1817, + "step": 8650 + }, + { + "epoch": 0.25519681556833257, + "grad_norm": 17.125, + "learning_rate": 1.8568352180394603e-06, + "loss": 2.161, + "step": 8655 + }, + { + "epoch": 0.255344242960342, + "grad_norm": 14.9375, + "learning_rate": 1.8565697678169578e-06, + "loss": 2.2521, + "step": 8660 + }, + { + "epoch": 0.2554916703523515, + "grad_norm": 14.1875, + "learning_rate": 1.8563040907396132e-06, + "loss": 2.1843, + "step": 8665 + }, + { + "epoch": 0.2556390977443609, + "grad_norm": 16.75, + "learning_rate": 1.8560381868777886e-06, + "loss": 2.1212, + "step": 8670 + }, + { + "epoch": 0.25578652513637035, + "grad_norm": 15.5625, + "learning_rate": 1.8557720563019068e-06, + "loss": 2.2056, + "step": 8675 + }, + { + "epoch": 0.25593395252837975, + "grad_norm": 12.0625, + "learning_rate": 1.8555056990824496e-06, + "loss": 2.0907, + "step": 8680 + }, + { + "epoch": 0.2560813799203892, + "grad_norm": 16.75, + "learning_rate": 1.8552391152899599e-06, + "loss": 2.3035, + "step": 8685 + }, + { + "epoch": 0.25622880731239867, + "grad_norm": 15.375, + "learning_rate": 1.854972304995039e-06, + "loss": 2.2537, + "step": 8690 + }, + { + "epoch": 0.25637623470440807, + "grad_norm": 17.25, + "learning_rate": 1.8547052682683505e-06, + "loss": 2.2827, + "step": 8695 + }, + { + "epoch": 0.2565236620964175, + "grad_norm": 14.9375, + "learning_rate": 1.8544380051806157e-06, + "loss": 2.1187, + "step": 8700 + }, + { + "epoch": 0.25667108948842693, + "grad_norm": 14.9375, + "learning_rate": 1.8541705158026173e-06, + "loss": 2.1568, + "step": 8705 + }, + { + "epoch": 0.2568185168804364, + "grad_norm": 15.875, + "learning_rate": 1.8539028002051973e-06, + "loss": 2.1486, + "step": 8710 + }, + { + "epoch": 0.25696594427244585, + "grad_norm": 15.125, + "learning_rate": 1.8536348584592578e-06, + "loss": 2.2145, + "step": 8715 + }, + { + "epoch": 0.25711337166445525, + "grad_norm": 13.0625, + "learning_rate": 1.8533666906357607e-06, + "loss": 2.1246, + "step": 8720 + }, + { + "epoch": 0.2572607990564647, + "grad_norm": 14.8125, + "learning_rate": 1.8530982968057285e-06, + "loss": 2.2059, + "step": 8725 + }, + { + "epoch": 0.2574082264484741, + "grad_norm": 14.125, + "learning_rate": 1.852829677040242e-06, + "loss": 2.2232, + "step": 8730 + }, + { + "epoch": 0.25755565384048357, + "grad_norm": 14.3125, + "learning_rate": 1.8525608314104431e-06, + "loss": 2.2328, + "step": 8735 + }, + { + "epoch": 0.25770308123249297, + "grad_norm": 15.5, + "learning_rate": 1.8522917599875334e-06, + "loss": 2.2464, + "step": 8740 + }, + { + "epoch": 0.25785050862450243, + "grad_norm": 15.375, + "learning_rate": 1.8520224628427736e-06, + "loss": 2.2532, + "step": 8745 + }, + { + "epoch": 0.2579979360165119, + "grad_norm": 16.125, + "learning_rate": 1.8517529400474848e-06, + "loss": 2.1911, + "step": 8750 + }, + { + "epoch": 0.2581453634085213, + "grad_norm": 14.6875, + "learning_rate": 1.8514831916730482e-06, + "loss": 2.2427, + "step": 8755 + }, + { + "epoch": 0.25829279080053075, + "grad_norm": 13.625, + "learning_rate": 1.8512132177909034e-06, + "loss": 2.1896, + "step": 8760 + }, + { + "epoch": 0.25844021819254015, + "grad_norm": 13.0625, + "learning_rate": 1.8509430184725513e-06, + "loss": 2.0831, + "step": 8765 + }, + { + "epoch": 0.2585876455845496, + "grad_norm": 12.9375, + "learning_rate": 1.8506725937895515e-06, + "loss": 2.1604, + "step": 8770 + }, + { + "epoch": 0.25873507297655907, + "grad_norm": 28.875, + "learning_rate": 1.8504019438135235e-06, + "loss": 2.2493, + "step": 8775 + }, + { + "epoch": 0.25888250036856847, + "grad_norm": 15.1875, + "learning_rate": 1.8501310686161463e-06, + "loss": 2.1755, + "step": 8780 + }, + { + "epoch": 0.2590299277605779, + "grad_norm": 14.3125, + "learning_rate": 1.8498599682691592e-06, + "loss": 2.2261, + "step": 8785 + }, + { + "epoch": 0.25917735515258733, + "grad_norm": 18.625, + "learning_rate": 1.8495886428443605e-06, + "loss": 2.1731, + "step": 8790 + }, + { + "epoch": 0.2593247825445968, + "grad_norm": 15.125, + "learning_rate": 1.8493170924136083e-06, + "loss": 2.3196, + "step": 8795 + }, + { + "epoch": 0.25947220993660625, + "grad_norm": 16.75, + "learning_rate": 1.8490453170488202e-06, + "loss": 2.2446, + "step": 8800 + }, + { + "epoch": 0.25961963732861565, + "grad_norm": 14.875, + "learning_rate": 1.8487733168219739e-06, + "loss": 2.1648, + "step": 8805 + }, + { + "epoch": 0.2597670647206251, + "grad_norm": 14.3125, + "learning_rate": 1.8485010918051059e-06, + "loss": 2.2215, + "step": 8810 + }, + { + "epoch": 0.2599144921126345, + "grad_norm": 17.5, + "learning_rate": 1.8482286420703125e-06, + "loss": 2.3307, + "step": 8815 + }, + { + "epoch": 0.26006191950464397, + "grad_norm": 14.375, + "learning_rate": 1.84795596768975e-06, + "loss": 2.1618, + "step": 8820 + }, + { + "epoch": 0.26020934689665337, + "grad_norm": 14.9375, + "learning_rate": 1.847683068735633e-06, + "loss": 2.2691, + "step": 8825 + }, + { + "epoch": 0.26035677428866283, + "grad_norm": 11.0625, + "learning_rate": 1.8474099452802369e-06, + "loss": 2.117, + "step": 8830 + }, + { + "epoch": 0.2605042016806723, + "grad_norm": 14.3125, + "learning_rate": 1.8471365973958962e-06, + "loss": 2.0912, + "step": 8835 + }, + { + "epoch": 0.2606516290726817, + "grad_norm": 14.8125, + "learning_rate": 1.846863025155004e-06, + "loss": 2.1718, + "step": 8840 + }, + { + "epoch": 0.26079905646469115, + "grad_norm": 17.0, + "learning_rate": 1.8465892286300137e-06, + "loss": 2.0818, + "step": 8845 + }, + { + "epoch": 0.26094648385670055, + "grad_norm": 16.875, + "learning_rate": 1.8463152078934383e-06, + "loss": 2.2438, + "step": 8850 + }, + { + "epoch": 0.26109391124871, + "grad_norm": 13.9375, + "learning_rate": 1.8460409630178493e-06, + "loss": 2.2129, + "step": 8855 + }, + { + "epoch": 0.26124133864071947, + "grad_norm": 15.3125, + "learning_rate": 1.8457664940758782e-06, + "loss": 2.1616, + "step": 8860 + }, + { + "epoch": 0.26138876603272887, + "grad_norm": 14.5, + "learning_rate": 1.8454918011402155e-06, + "loss": 2.2635, + "step": 8865 + }, + { + "epoch": 0.26153619342473833, + "grad_norm": 14.0625, + "learning_rate": 1.8452168842836114e-06, + "loss": 2.2376, + "step": 8870 + }, + { + "epoch": 0.26168362081674773, + "grad_norm": 13.5, + "learning_rate": 1.8449417435788748e-06, + "loss": 2.1005, + "step": 8875 + }, + { + "epoch": 0.2618310482087572, + "grad_norm": 17.5, + "learning_rate": 1.8446663790988742e-06, + "loss": 2.164, + "step": 8880 + }, + { + "epoch": 0.26197847560076665, + "grad_norm": 15.5, + "learning_rate": 1.844390790916538e-06, + "loss": 2.1707, + "step": 8885 + }, + { + "epoch": 0.26212590299277605, + "grad_norm": 20.25, + "learning_rate": 1.844114979104853e-06, + "loss": 2.2746, + "step": 8890 + }, + { + "epoch": 0.2622733303847855, + "grad_norm": 13.8125, + "learning_rate": 1.843838943736865e-06, + "loss": 2.1086, + "step": 8895 + }, + { + "epoch": 0.2624207577767949, + "grad_norm": 16.125, + "learning_rate": 1.8435626848856805e-06, + "loss": 2.3454, + "step": 8900 + }, + { + "epoch": 0.26256818516880437, + "grad_norm": 12.5625, + "learning_rate": 1.8432862026244633e-06, + "loss": 2.188, + "step": 8905 + }, + { + "epoch": 0.26271561256081377, + "grad_norm": 14.8125, + "learning_rate": 1.8430094970264374e-06, + "loss": 2.1443, + "step": 8910 + }, + { + "epoch": 0.26286303995282323, + "grad_norm": 18.375, + "learning_rate": 1.8427325681648861e-06, + "loss": 2.1986, + "step": 8915 + }, + { + "epoch": 0.2630104673448327, + "grad_norm": 15.5, + "learning_rate": 1.8424554161131515e-06, + "loss": 2.1675, + "step": 8920 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 13.4375, + "learning_rate": 1.8421780409446347e-06, + "loss": 2.2025, + "step": 8925 + }, + { + "epoch": 0.26330532212885155, + "grad_norm": 14.125, + "learning_rate": 1.841900442732796e-06, + "loss": 2.1276, + "step": 8930 + }, + { + "epoch": 0.26345274952086095, + "grad_norm": 17.75, + "learning_rate": 1.841622621551155e-06, + "loss": 2.1602, + "step": 8935 + }, + { + "epoch": 0.2636001769128704, + "grad_norm": 13.875, + "learning_rate": 1.8413445774732901e-06, + "loss": 2.2945, + "step": 8940 + }, + { + "epoch": 0.26374760430487987, + "grad_norm": 15.375, + "learning_rate": 1.8410663105728387e-06, + "loss": 2.0937, + "step": 8945 + }, + { + "epoch": 0.26389503169688927, + "grad_norm": 17.75, + "learning_rate": 1.840787820923497e-06, + "loss": 2.2, + "step": 8950 + }, + { + "epoch": 0.26404245908889873, + "grad_norm": 14.625, + "learning_rate": 1.8405091085990213e-06, + "loss": 2.1271, + "step": 8955 + }, + { + "epoch": 0.26418988648090813, + "grad_norm": 25.75, + "learning_rate": 1.840230173673225e-06, + "loss": 2.1916, + "step": 8960 + }, + { + "epoch": 0.2643373138729176, + "grad_norm": 17.25, + "learning_rate": 1.8399510162199824e-06, + "loss": 2.2481, + "step": 8965 + }, + { + "epoch": 0.26448474126492705, + "grad_norm": 15.8125, + "learning_rate": 1.8396716363132255e-06, + "loss": 2.1464, + "step": 8970 + }, + { + "epoch": 0.26463216865693645, + "grad_norm": 17.0, + "learning_rate": 1.8393920340269458e-06, + "loss": 2.1157, + "step": 8975 + }, + { + "epoch": 0.2647795960489459, + "grad_norm": 14.3125, + "learning_rate": 1.8391122094351933e-06, + "loss": 2.0538, + "step": 8980 + }, + { + "epoch": 0.2649270234409553, + "grad_norm": 12.6875, + "learning_rate": 1.8388321626120769e-06, + "loss": 2.2269, + "step": 8985 + }, + { + "epoch": 0.26507445083296477, + "grad_norm": 16.75, + "learning_rate": 1.8385518936317645e-06, + "loss": 2.1711, + "step": 8990 + }, + { + "epoch": 0.26522187822497423, + "grad_norm": 13.75, + "learning_rate": 1.838271402568483e-06, + "loss": 2.1552, + "step": 8995 + }, + { + "epoch": 0.26536930561698363, + "grad_norm": 14.3125, + "learning_rate": 1.837990689496518e-06, + "loss": 2.1522, + "step": 9000 + }, + { + "epoch": 0.26536930561698363, + "eval_loss": 2.142622232437134, + "eval_runtime": 4.7105, + "eval_samples_per_second": 84.067, + "eval_steps_per_second": 2.76, + "step": 9000 + }, + { + "epoch": 0.2655167330089931, + "grad_norm": 17.5, + "learning_rate": 1.837709754490214e-06, + "loss": 2.1945, + "step": 9005 + }, + { + "epoch": 0.2656641604010025, + "grad_norm": 16.125, + "learning_rate": 1.8374285976239734e-06, + "loss": 2.2451, + "step": 9010 + }, + { + "epoch": 0.26581158779301195, + "grad_norm": 15.625, + "learning_rate": 1.8371472189722589e-06, + "loss": 2.2319, + "step": 9015 + }, + { + "epoch": 0.26595901518502135, + "grad_norm": 19.25, + "learning_rate": 1.8368656186095905e-06, + "loss": 2.2219, + "step": 9020 + }, + { + "epoch": 0.2661064425770308, + "grad_norm": 22.125, + "learning_rate": 1.8365837966105481e-06, + "loss": 2.1642, + "step": 9025 + }, + { + "epoch": 0.26625386996904027, + "grad_norm": 15.6875, + "learning_rate": 1.8363017530497693e-06, + "loss": 2.2582, + "step": 9030 + }, + { + "epoch": 0.26640129736104967, + "grad_norm": 14.0625, + "learning_rate": 1.836019488001951e-06, + "loss": 2.1031, + "step": 9035 + }, + { + "epoch": 0.26654872475305913, + "grad_norm": 15.6875, + "learning_rate": 1.8357370015418488e-06, + "loss": 2.225, + "step": 9040 + }, + { + "epoch": 0.26669615214506853, + "grad_norm": 15.0, + "learning_rate": 1.835454293744276e-06, + "loss": 2.254, + "step": 9045 + }, + { + "epoch": 0.266843579537078, + "grad_norm": 14.9375, + "learning_rate": 1.8351713646841059e-06, + "loss": 2.2191, + "step": 9050 + }, + { + "epoch": 0.26699100692908745, + "grad_norm": 14.6875, + "learning_rate": 1.8348882144362694e-06, + "loss": 2.2358, + "step": 9055 + }, + { + "epoch": 0.26713843432109685, + "grad_norm": 12.8125, + "learning_rate": 1.8346048430757566e-06, + "loss": 2.1893, + "step": 9060 + }, + { + "epoch": 0.2672858617131063, + "grad_norm": 15.9375, + "learning_rate": 1.8343212506776155e-06, + "loss": 2.1395, + "step": 9065 + }, + { + "epoch": 0.2674332891051157, + "grad_norm": 16.375, + "learning_rate": 1.8340374373169533e-06, + "loss": 2.1742, + "step": 9070 + }, + { + "epoch": 0.26758071649712517, + "grad_norm": 18.125, + "learning_rate": 1.8337534030689355e-06, + "loss": 2.1369, + "step": 9075 + }, + { + "epoch": 0.26772814388913463, + "grad_norm": 22.75, + "learning_rate": 1.8334691480087856e-06, + "loss": 2.1244, + "step": 9080 + }, + { + "epoch": 0.26787557128114403, + "grad_norm": 13.6875, + "learning_rate": 1.8331846722117864e-06, + "loss": 2.227, + "step": 9085 + }, + { + "epoch": 0.2680229986731535, + "grad_norm": 15.75, + "learning_rate": 1.8328999757532788e-06, + "loss": 2.1094, + "step": 9090 + }, + { + "epoch": 0.2681704260651629, + "grad_norm": 13.75, + "learning_rate": 1.8326150587086617e-06, + "loss": 2.1565, + "step": 9095 + }, + { + "epoch": 0.26831785345717235, + "grad_norm": 14.375, + "learning_rate": 1.832329921153393e-06, + "loss": 2.272, + "step": 9100 + }, + { + "epoch": 0.26846528084918175, + "grad_norm": 17.375, + "learning_rate": 1.8320445631629892e-06, + "loss": 2.2073, + "step": 9105 + }, + { + "epoch": 0.2686127082411912, + "grad_norm": 10.8125, + "learning_rate": 1.8317589848130246e-06, + "loss": 2.052, + "step": 9110 + }, + { + "epoch": 0.26876013563320067, + "grad_norm": 14.0625, + "learning_rate": 1.831473186179132e-06, + "loss": 2.1517, + "step": 9115 + }, + { + "epoch": 0.2689075630252101, + "grad_norm": 12.75, + "learning_rate": 1.8311871673370023e-06, + "loss": 2.0941, + "step": 9120 + }, + { + "epoch": 0.26905499041721953, + "grad_norm": 17.125, + "learning_rate": 1.8309009283623854e-06, + "loss": 2.2009, + "step": 9125 + }, + { + "epoch": 0.26920241780922893, + "grad_norm": 16.75, + "learning_rate": 1.8306144693310893e-06, + "loss": 2.0812, + "step": 9130 + }, + { + "epoch": 0.2693498452012384, + "grad_norm": 12.75, + "learning_rate": 1.8303277903189798e-06, + "loss": 2.1815, + "step": 9135 + }, + { + "epoch": 0.26949727259324785, + "grad_norm": 14.3125, + "learning_rate": 1.8300408914019813e-06, + "loss": 2.1776, + "step": 9140 + }, + { + "epoch": 0.26964469998525725, + "grad_norm": 16.125, + "learning_rate": 1.8297537726560766e-06, + "loss": 2.2445, + "step": 9145 + }, + { + "epoch": 0.2697921273772667, + "grad_norm": 15.875, + "learning_rate": 1.8294664341573063e-06, + "loss": 2.2909, + "step": 9150 + }, + { + "epoch": 0.2699395547692761, + "grad_norm": 16.625, + "learning_rate": 1.8291788759817695e-06, + "loss": 2.2258, + "step": 9155 + }, + { + "epoch": 0.27008698216128557, + "grad_norm": 12.0625, + "learning_rate": 1.8288910982056237e-06, + "loss": 2.1185, + "step": 9160 + }, + { + "epoch": 0.27023440955329503, + "grad_norm": 14.5625, + "learning_rate": 1.8286031009050837e-06, + "loss": 2.1964, + "step": 9165 + }, + { + "epoch": 0.27038183694530443, + "grad_norm": 11.8125, + "learning_rate": 1.8283148841564234e-06, + "loss": 2.1557, + "step": 9170 + }, + { + "epoch": 0.2705292643373139, + "grad_norm": 14.6875, + "learning_rate": 1.8280264480359747e-06, + "loss": 2.1727, + "step": 9175 + }, + { + "epoch": 0.2706766917293233, + "grad_norm": 14.25, + "learning_rate": 1.8277377926201268e-06, + "loss": 2.1334, + "step": 9180 + }, + { + "epoch": 0.27082411912133275, + "grad_norm": 14.9375, + "learning_rate": 1.8274489179853273e-06, + "loss": 2.0905, + "step": 9185 + }, + { + "epoch": 0.27097154651334215, + "grad_norm": 17.5, + "learning_rate": 1.827159824208083e-06, + "loss": 2.2571, + "step": 9190 + }, + { + "epoch": 0.2711189739053516, + "grad_norm": 15.125, + "learning_rate": 1.826870511364957e-06, + "loss": 2.1973, + "step": 9195 + }, + { + "epoch": 0.27126640129736107, + "grad_norm": 12.4375, + "learning_rate": 1.8265809795325713e-06, + "loss": 2.0869, + "step": 9200 + }, + { + "epoch": 0.2714138286893705, + "grad_norm": 27.0, + "learning_rate": 1.8262912287876065e-06, + "loss": 2.1648, + "step": 9205 + }, + { + "epoch": 0.27156125608137993, + "grad_norm": 20.5, + "learning_rate": 1.8260012592067995e-06, + "loss": 2.1417, + "step": 9210 + }, + { + "epoch": 0.27170868347338933, + "grad_norm": 16.0, + "learning_rate": 1.825711070866947e-06, + "loss": 2.224, + "step": 9215 + }, + { + "epoch": 0.2718561108653988, + "grad_norm": 15.625, + "learning_rate": 1.825420663844902e-06, + "loss": 2.1834, + "step": 9220 + }, + { + "epoch": 0.27200353825740825, + "grad_norm": 15.25, + "learning_rate": 1.8251300382175767e-06, + "loss": 2.0962, + "step": 9225 + }, + { + "epoch": 0.27215096564941765, + "grad_norm": 13.8125, + "learning_rate": 1.824839194061941e-06, + "loss": 2.1178, + "step": 9230 + }, + { + "epoch": 0.2722983930414271, + "grad_norm": 21.625, + "learning_rate": 1.824548131455022e-06, + "loss": 2.2477, + "step": 9235 + }, + { + "epoch": 0.2724458204334365, + "grad_norm": 17.5, + "learning_rate": 1.8242568504739046e-06, + "loss": 2.1384, + "step": 9240 + }, + { + "epoch": 0.272593247825446, + "grad_norm": 17.125, + "learning_rate": 1.8239653511957326e-06, + "loss": 2.2245, + "step": 9245 + }, + { + "epoch": 0.27274067521745543, + "grad_norm": 15.1875, + "learning_rate": 1.8236736336977065e-06, + "loss": 2.2292, + "step": 9250 + }, + { + "epoch": 0.27288810260946483, + "grad_norm": 16.125, + "learning_rate": 1.8233816980570857e-06, + "loss": 2.2241, + "step": 9255 + }, + { + "epoch": 0.2730355300014743, + "grad_norm": 16.875, + "learning_rate": 1.8230895443511861e-06, + "loss": 2.2429, + "step": 9260 + }, + { + "epoch": 0.2731829573934837, + "grad_norm": 12.25, + "learning_rate": 1.8227971726573825e-06, + "loss": 2.2361, + "step": 9265 + }, + { + "epoch": 0.27333038478549315, + "grad_norm": 13.75, + "learning_rate": 1.8225045830531068e-06, + "loss": 2.2299, + "step": 9270 + }, + { + "epoch": 0.27347781217750256, + "grad_norm": 11.8125, + "learning_rate": 1.8222117756158486e-06, + "loss": 2.1748, + "step": 9275 + }, + { + "epoch": 0.273625239569512, + "grad_norm": 14.4375, + "learning_rate": 1.8219187504231553e-06, + "loss": 2.1427, + "step": 9280 + }, + { + "epoch": 0.27377266696152147, + "grad_norm": 16.5, + "learning_rate": 1.821625507552632e-06, + "loss": 2.2049, + "step": 9285 + }, + { + "epoch": 0.2739200943535309, + "grad_norm": 13.6875, + "learning_rate": 1.8213320470819413e-06, + "loss": 2.1321, + "step": 9290 + }, + { + "epoch": 0.27406752174554033, + "grad_norm": 13.125, + "learning_rate": 1.821038369088804e-06, + "loss": 2.057, + "step": 9295 + }, + { + "epoch": 0.27421494913754973, + "grad_norm": 15.0625, + "learning_rate": 1.820744473650998e-06, + "loss": 2.0797, + "step": 9300 + }, + { + "epoch": 0.2743623765295592, + "grad_norm": 15.3125, + "learning_rate": 1.8204503608463586e-06, + "loss": 2.1928, + "step": 9305 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 15.4375, + "learning_rate": 1.8201560307527793e-06, + "loss": 2.2871, + "step": 9310 + }, + { + "epoch": 0.27465723131357805, + "grad_norm": 15.4375, + "learning_rate": 1.8198614834482107e-06, + "loss": 2.2493, + "step": 9315 + }, + { + "epoch": 0.2748046587055875, + "grad_norm": 14.9375, + "learning_rate": 1.8195667190106607e-06, + "loss": 2.1325, + "step": 9320 + }, + { + "epoch": 0.2749520860975969, + "grad_norm": 18.5, + "learning_rate": 1.8192717375181954e-06, + "loss": 2.2074, + "step": 9325 + }, + { + "epoch": 0.2750995134896064, + "grad_norm": 14.75, + "learning_rate": 1.8189765390489375e-06, + "loss": 2.1769, + "step": 9330 + }, + { + "epoch": 0.27524694088161583, + "grad_norm": 14.75, + "learning_rate": 1.8186811236810686e-06, + "loss": 2.3256, + "step": 9335 + }, + { + "epoch": 0.27539436827362523, + "grad_norm": 13.875, + "learning_rate": 1.818385491492826e-06, + "loss": 2.1801, + "step": 9340 + }, + { + "epoch": 0.2755417956656347, + "grad_norm": 15.1875, + "learning_rate": 1.8180896425625054e-06, + "loss": 2.1626, + "step": 9345 + }, + { + "epoch": 0.2756892230576441, + "grad_norm": 17.25, + "learning_rate": 1.81779357696846e-06, + "loss": 2.126, + "step": 9350 + }, + { + "epoch": 0.27583665044965355, + "grad_norm": 14.875, + "learning_rate": 1.8174972947890998e-06, + "loss": 2.0943, + "step": 9355 + }, + { + "epoch": 0.27598407784166296, + "grad_norm": 18.75, + "learning_rate": 1.8172007961028928e-06, + "loss": 2.2155, + "step": 9360 + }, + { + "epoch": 0.2761315052336724, + "grad_norm": 13.5, + "learning_rate": 1.8169040809883639e-06, + "loss": 2.1628, + "step": 9365 + }, + { + "epoch": 0.2762789326256819, + "grad_norm": 15.0625, + "learning_rate": 1.8166071495240952e-06, + "loss": 2.1454, + "step": 9370 + }, + { + "epoch": 0.2764263600176913, + "grad_norm": 14.125, + "learning_rate": 1.816310001788727e-06, + "loss": 2.1551, + "step": 9375 + }, + { + "epoch": 0.27657378740970073, + "grad_norm": 14.6875, + "learning_rate": 1.8160126378609553e-06, + "loss": 2.1669, + "step": 9380 + }, + { + "epoch": 0.27672121480171014, + "grad_norm": 15.6875, + "learning_rate": 1.8157150578195348e-06, + "loss": 2.1653, + "step": 9385 + }, + { + "epoch": 0.2768686421937196, + "grad_norm": 16.875, + "learning_rate": 1.8154172617432772e-06, + "loss": 2.2091, + "step": 9390 + }, + { + "epoch": 0.27701606958572905, + "grad_norm": 14.5, + "learning_rate": 1.8151192497110505e-06, + "loss": 2.2048, + "step": 9395 + }, + { + "epoch": 0.27716349697773845, + "grad_norm": 17.25, + "learning_rate": 1.814821021801781e-06, + "loss": 2.2176, + "step": 9400 + }, + { + "epoch": 0.2773109243697479, + "grad_norm": 14.125, + "learning_rate": 1.8145225780944515e-06, + "loss": 2.2279, + "step": 9405 + }, + { + "epoch": 0.2774583517617573, + "grad_norm": 15.3125, + "learning_rate": 1.8142239186681022e-06, + "loss": 2.1464, + "step": 9410 + }, + { + "epoch": 0.2776057791537668, + "grad_norm": 13.4375, + "learning_rate": 1.8139250436018303e-06, + "loss": 2.2263, + "step": 9415 + }, + { + "epoch": 0.27775320654577623, + "grad_norm": 16.125, + "learning_rate": 1.81362595297479e-06, + "loss": 2.1134, + "step": 9420 + }, + { + "epoch": 0.27790063393778563, + "grad_norm": 15.25, + "learning_rate": 1.8133266468661934e-06, + "loss": 2.148, + "step": 9425 + }, + { + "epoch": 0.2780480613297951, + "grad_norm": 14.625, + "learning_rate": 1.8130271253553084e-06, + "loss": 2.1851, + "step": 9430 + }, + { + "epoch": 0.2781954887218045, + "grad_norm": 16.25, + "learning_rate": 1.8127273885214609e-06, + "loss": 2.2678, + "step": 9435 + }, + { + "epoch": 0.27834291611381395, + "grad_norm": 16.375, + "learning_rate": 1.8124274364440337e-06, + "loss": 2.1842, + "step": 9440 + }, + { + "epoch": 0.27849034350582336, + "grad_norm": 14.6875, + "learning_rate": 1.8121272692024658e-06, + "loss": 2.2537, + "step": 9445 + }, + { + "epoch": 0.2786377708978328, + "grad_norm": 14.75, + "learning_rate": 1.8118268868762546e-06, + "loss": 2.1291, + "step": 9450 + }, + { + "epoch": 0.2787851982898423, + "grad_norm": 15.0, + "learning_rate": 1.811526289544953e-06, + "loss": 2.1512, + "step": 9455 + }, + { + "epoch": 0.2789326256818517, + "grad_norm": 14.5625, + "learning_rate": 1.8112254772881717e-06, + "loss": 2.1319, + "step": 9460 + }, + { + "epoch": 0.27908005307386113, + "grad_norm": 14.8125, + "learning_rate": 1.8109244501855782e-06, + "loss": 2.1444, + "step": 9465 + }, + { + "epoch": 0.27922748046587054, + "grad_norm": 13.875, + "learning_rate": 1.810623208316897e-06, + "loss": 2.1868, + "step": 9470 + }, + { + "epoch": 0.27937490785788, + "grad_norm": 15.0625, + "learning_rate": 1.8103217517619094e-06, + "loss": 2.2162, + "step": 9475 + }, + { + "epoch": 0.27952233524988945, + "grad_norm": 16.125, + "learning_rate": 1.810020080600453e-06, + "loss": 2.2668, + "step": 9480 + }, + { + "epoch": 0.27966976264189886, + "grad_norm": 16.0, + "learning_rate": 1.809718194912423e-06, + "loss": 2.2179, + "step": 9485 + }, + { + "epoch": 0.2798171900339083, + "grad_norm": 14.875, + "learning_rate": 1.809416094777771e-06, + "loss": 2.1202, + "step": 9490 + }, + { + "epoch": 0.2799646174259177, + "grad_norm": 10.75, + "learning_rate": 1.8091137802765058e-06, + "loss": 2.0973, + "step": 9495 + }, + { + "epoch": 0.2801120448179272, + "grad_norm": 12.0625, + "learning_rate": 1.8088112514886923e-06, + "loss": 2.0952, + "step": 9500 + }, + { + "epoch": 0.2801120448179272, + "eval_loss": 2.1350300312042236, + "eval_runtime": 4.7104, + "eval_samples_per_second": 84.069, + "eval_steps_per_second": 2.76, + "step": 9500 + }, + { + "epoch": 0.28025947220993663, + "grad_norm": 16.5, + "learning_rate": 1.8085085084944526e-06, + "loss": 2.1947, + "step": 9505 + }, + { + "epoch": 0.28040689960194604, + "grad_norm": 16.875, + "learning_rate": 1.808205551373966e-06, + "loss": 2.2008, + "step": 9510 + }, + { + "epoch": 0.2805543269939555, + "grad_norm": 16.125, + "learning_rate": 1.8079023802074674e-06, + "loss": 2.244, + "step": 9515 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.9375, + "learning_rate": 1.807598995075249e-06, + "loss": 2.2662, + "step": 9520 + }, + { + "epoch": 0.28084918177797435, + "grad_norm": 15.75, + "learning_rate": 1.80729539605766e-06, + "loss": 2.1337, + "step": 9525 + }, + { + "epoch": 0.28099660916998376, + "grad_norm": 14.375, + "learning_rate": 1.8069915832351057e-06, + "loss": 2.1507, + "step": 9530 + }, + { + "epoch": 0.2811440365619932, + "grad_norm": 13.25, + "learning_rate": 1.8066875566880482e-06, + "loss": 2.1803, + "step": 9535 + }, + { + "epoch": 0.2812914639540027, + "grad_norm": 11.25, + "learning_rate": 1.8063833164970061e-06, + "loss": 2.0851, + "step": 9540 + }, + { + "epoch": 0.2814388913460121, + "grad_norm": 13.5, + "learning_rate": 1.8060788627425548e-06, + "loss": 2.2345, + "step": 9545 + }, + { + "epoch": 0.28158631873802153, + "grad_norm": 15.875, + "learning_rate": 1.8057741955053261e-06, + "loss": 2.3918, + "step": 9550 + }, + { + "epoch": 0.28173374613003094, + "grad_norm": 14.6875, + "learning_rate": 1.8054693148660088e-06, + "loss": 2.1558, + "step": 9555 + }, + { + "epoch": 0.2818811735220404, + "grad_norm": 15.0625, + "learning_rate": 1.805164220905347e-06, + "loss": 2.1571, + "step": 9560 + }, + { + "epoch": 0.28202860091404985, + "grad_norm": 15.0, + "learning_rate": 1.8048589137041427e-06, + "loss": 2.0988, + "step": 9565 + }, + { + "epoch": 0.28217602830605926, + "grad_norm": 15.3125, + "learning_rate": 1.8045533933432538e-06, + "loss": 2.1293, + "step": 9570 + }, + { + "epoch": 0.2823234556980687, + "grad_norm": 24.25, + "learning_rate": 1.8042476599035944e-06, + "loss": 2.1763, + "step": 9575 + }, + { + "epoch": 0.2824708830900781, + "grad_norm": 14.5, + "learning_rate": 1.8039417134661354e-06, + "loss": 2.1462, + "step": 9580 + }, + { + "epoch": 0.2826183104820876, + "grad_norm": 12.125, + "learning_rate": 1.8036355541119038e-06, + "loss": 2.2079, + "step": 9585 + }, + { + "epoch": 0.28276573787409703, + "grad_norm": 14.5, + "learning_rate": 1.8033291819219833e-06, + "loss": 2.1601, + "step": 9590 + }, + { + "epoch": 0.28291316526610644, + "grad_norm": 14.375, + "learning_rate": 1.803022596977514e-06, + "loss": 2.2201, + "step": 9595 + }, + { + "epoch": 0.2830605926581159, + "grad_norm": 59.25, + "learning_rate": 1.802715799359692e-06, + "loss": 2.1743, + "step": 9600 + }, + { + "epoch": 0.2832080200501253, + "grad_norm": 15.0625, + "learning_rate": 1.8024087891497697e-06, + "loss": 2.2809, + "step": 9605 + }, + { + "epoch": 0.28335544744213476, + "grad_norm": 18.0, + "learning_rate": 1.8021015664290563e-06, + "loss": 2.0981, + "step": 9610 + }, + { + "epoch": 0.28350287483414416, + "grad_norm": 14.25, + "learning_rate": 1.8017941312789172e-06, + "loss": 2.3601, + "step": 9615 + }, + { + "epoch": 0.2836503022261536, + "grad_norm": 12.8125, + "learning_rate": 1.8014864837807732e-06, + "loss": 2.0582, + "step": 9620 + }, + { + "epoch": 0.2837977296181631, + "grad_norm": 15.4375, + "learning_rate": 1.8011786240161025e-06, + "loss": 2.1897, + "step": 9625 + }, + { + "epoch": 0.2839451570101725, + "grad_norm": 14.9375, + "learning_rate": 1.8008705520664388e-06, + "loss": 2.1527, + "step": 9630 + }, + { + "epoch": 0.28409258440218194, + "grad_norm": 17.25, + "learning_rate": 1.8005622680133722e-06, + "loss": 2.1402, + "step": 9635 + }, + { + "epoch": 0.28424001179419134, + "grad_norm": 15.6875, + "learning_rate": 1.8002537719385492e-06, + "loss": 2.2136, + "step": 9640 + }, + { + "epoch": 0.2843874391862008, + "grad_norm": 15.375, + "learning_rate": 1.7999450639236716e-06, + "loss": 2.2145, + "step": 9645 + }, + { + "epoch": 0.28453486657821025, + "grad_norm": 14.0625, + "learning_rate": 1.799636144050499e-06, + "loss": 2.2849, + "step": 9650 + }, + { + "epoch": 0.28468229397021966, + "grad_norm": 17.25, + "learning_rate": 1.7993270124008454e-06, + "loss": 2.1804, + "step": 9655 + }, + { + "epoch": 0.2848297213622291, + "grad_norm": 14.0625, + "learning_rate": 1.7990176690565815e-06, + "loss": 2.1777, + "step": 9660 + }, + { + "epoch": 0.2849771487542385, + "grad_norm": 14.5625, + "learning_rate": 1.798708114099634e-06, + "loss": 2.1158, + "step": 9665 + }, + { + "epoch": 0.285124576146248, + "grad_norm": 14.0, + "learning_rate": 1.7983983476119864e-06, + "loss": 2.1636, + "step": 9670 + }, + { + "epoch": 0.28527200353825743, + "grad_norm": 13.6875, + "learning_rate": 1.798088369675677e-06, + "loss": 2.1837, + "step": 9675 + }, + { + "epoch": 0.28541943093026684, + "grad_norm": 15.25, + "learning_rate": 1.797778180372801e-06, + "loss": 2.2862, + "step": 9680 + }, + { + "epoch": 0.2855668583222763, + "grad_norm": 17.0, + "learning_rate": 1.7974677797855092e-06, + "loss": 2.2452, + "step": 9685 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 15.875, + "learning_rate": 1.7971571679960081e-06, + "loss": 2.1044, + "step": 9690 + }, + { + "epoch": 0.28586171310629516, + "grad_norm": 13.5625, + "learning_rate": 1.7968463450865608e-06, + "loss": 2.1527, + "step": 9695 + }, + { + "epoch": 0.28600914049830456, + "grad_norm": 16.125, + "learning_rate": 1.796535311139486e-06, + "loss": 2.1591, + "step": 9700 + }, + { + "epoch": 0.286156567890314, + "grad_norm": 12.6875, + "learning_rate": 1.796224066237158e-06, + "loss": 2.2229, + "step": 9705 + }, + { + "epoch": 0.2863039952823235, + "grad_norm": 16.125, + "learning_rate": 1.7959126104620074e-06, + "loss": 2.2106, + "step": 9710 + }, + { + "epoch": 0.2864514226743329, + "grad_norm": 16.625, + "learning_rate": 1.7956009438965204e-06, + "loss": 2.1492, + "step": 9715 + }, + { + "epoch": 0.28659885006634234, + "grad_norm": 13.9375, + "learning_rate": 1.7952890666232391e-06, + "loss": 2.1762, + "step": 9720 + }, + { + "epoch": 0.28674627745835174, + "grad_norm": 15.0, + "learning_rate": 1.7949769787247617e-06, + "loss": 2.1311, + "step": 9725 + }, + { + "epoch": 0.2868937048503612, + "grad_norm": 14.625, + "learning_rate": 1.7946646802837416e-06, + "loss": 2.0461, + "step": 9730 + }, + { + "epoch": 0.28704113224237066, + "grad_norm": 18.5, + "learning_rate": 1.7943521713828883e-06, + "loss": 2.113, + "step": 9735 + }, + { + "epoch": 0.28718855963438006, + "grad_norm": 17.25, + "learning_rate": 1.7940394521049667e-06, + "loss": 2.2751, + "step": 9740 + }, + { + "epoch": 0.2873359870263895, + "grad_norm": 13.25, + "learning_rate": 1.7937265225327983e-06, + "loss": 2.1439, + "step": 9745 + }, + { + "epoch": 0.2874834144183989, + "grad_norm": 18.125, + "learning_rate": 1.793413382749259e-06, + "loss": 2.2289, + "step": 9750 + }, + { + "epoch": 0.2876308418104084, + "grad_norm": 15.4375, + "learning_rate": 1.7931000328372818e-06, + "loss": 2.3197, + "step": 9755 + }, + { + "epoch": 0.28777826920241784, + "grad_norm": 15.0, + "learning_rate": 1.7927864728798543e-06, + "loss": 2.1508, + "step": 9760 + }, + { + "epoch": 0.28792569659442724, + "grad_norm": 12.0, + "learning_rate": 1.7924727029600198e-06, + "loss": 2.1825, + "step": 9765 + }, + { + "epoch": 0.2880731239864367, + "grad_norm": 13.125, + "learning_rate": 1.7921587231608777e-06, + "loss": 2.1113, + "step": 9770 + }, + { + "epoch": 0.2882205513784461, + "grad_norm": 13.9375, + "learning_rate": 1.791844533565583e-06, + "loss": 2.1209, + "step": 9775 + }, + { + "epoch": 0.28836797877045556, + "grad_norm": 13.6875, + "learning_rate": 1.7915301342573455e-06, + "loss": 2.2531, + "step": 9780 + }, + { + "epoch": 0.28851540616246496, + "grad_norm": 13.25, + "learning_rate": 1.7912155253194311e-06, + "loss": 2.1772, + "step": 9785 + }, + { + "epoch": 0.2886628335544744, + "grad_norm": 13.375, + "learning_rate": 1.7909007068351617e-06, + "loss": 1.9749, + "step": 9790 + }, + { + "epoch": 0.2888102609464839, + "grad_norm": 14.6875, + "learning_rate": 1.7905856788879135e-06, + "loss": 2.2297, + "step": 9795 + }, + { + "epoch": 0.2889576883384933, + "grad_norm": 21.125, + "learning_rate": 1.7902704415611194e-06, + "loss": 2.1577, + "step": 9800 + }, + { + "epoch": 0.28910511573050274, + "grad_norm": 15.4375, + "learning_rate": 1.7899549949382667e-06, + "loss": 2.1224, + "step": 9805 + }, + { + "epoch": 0.28925254312251214, + "grad_norm": 15.375, + "learning_rate": 1.789639339102899e-06, + "loss": 2.1734, + "step": 9810 + }, + { + "epoch": 0.2893999705145216, + "grad_norm": 14.5625, + "learning_rate": 1.7893234741386148e-06, + "loss": 1.9936, + "step": 9815 + }, + { + "epoch": 0.28954739790653106, + "grad_norm": 15.625, + "learning_rate": 1.7890074001290678e-06, + "loss": 2.1629, + "step": 9820 + }, + { + "epoch": 0.28969482529854046, + "grad_norm": 15.125, + "learning_rate": 1.7886911171579678e-06, + "loss": 2.1329, + "step": 9825 + }, + { + "epoch": 0.2898422526905499, + "grad_norm": 13.5625, + "learning_rate": 1.7883746253090792e-06, + "loss": 2.2002, + "step": 9830 + }, + { + "epoch": 0.2899896800825593, + "grad_norm": 18.5, + "learning_rate": 1.7880579246662225e-06, + "loss": 2.1516, + "step": 9835 + }, + { + "epoch": 0.2901371074745688, + "grad_norm": 17.0, + "learning_rate": 1.7877410153132727e-06, + "loss": 2.3072, + "step": 9840 + }, + { + "epoch": 0.29028453486657824, + "grad_norm": 15.4375, + "learning_rate": 1.7874238973341602e-06, + "loss": 2.094, + "step": 9845 + }, + { + "epoch": 0.29043196225858764, + "grad_norm": 14.0, + "learning_rate": 1.7871065708128712e-06, + "loss": 2.1111, + "step": 9850 + }, + { + "epoch": 0.2905793896505971, + "grad_norm": 15.625, + "learning_rate": 1.7867890358334465e-06, + "loss": 2.2635, + "step": 9855 + }, + { + "epoch": 0.2907268170426065, + "grad_norm": 17.625, + "learning_rate": 1.7864712924799829e-06, + "loss": 2.0714, + "step": 9860 + }, + { + "epoch": 0.29087424443461596, + "grad_norm": 14.875, + "learning_rate": 1.7861533408366315e-06, + "loss": 2.1918, + "step": 9865 + }, + { + "epoch": 0.29102167182662536, + "grad_norm": 15.125, + "learning_rate": 1.7858351809875992e-06, + "loss": 2.1891, + "step": 9870 + }, + { + "epoch": 0.2911690992186348, + "grad_norm": 13.5625, + "learning_rate": 1.7855168130171471e-06, + "loss": 2.2214, + "step": 9875 + }, + { + "epoch": 0.2913165266106443, + "grad_norm": 13.875, + "learning_rate": 1.785198237009593e-06, + "loss": 2.1657, + "step": 9880 + }, + { + "epoch": 0.2914639540026537, + "grad_norm": 15.3125, + "learning_rate": 1.7848794530493083e-06, + "loss": 2.2377, + "step": 9885 + }, + { + "epoch": 0.29161138139466314, + "grad_norm": 14.3125, + "learning_rate": 1.7845604612207206e-06, + "loss": 2.1852, + "step": 9890 + }, + { + "epoch": 0.29175880878667254, + "grad_norm": 15.0625, + "learning_rate": 1.7842412616083116e-06, + "loss": 2.2394, + "step": 9895 + }, + { + "epoch": 0.291906236178682, + "grad_norm": 16.5, + "learning_rate": 1.7839218542966185e-06, + "loss": 2.2163, + "step": 9900 + }, + { + "epoch": 0.29205366357069146, + "grad_norm": 14.0625, + "learning_rate": 1.7836022393702334e-06, + "loss": 2.1405, + "step": 9905 + }, + { + "epoch": 0.29220109096270086, + "grad_norm": 18.625, + "learning_rate": 1.783282416913804e-06, + "loss": 2.2301, + "step": 9910 + }, + { + "epoch": 0.2923485183547103, + "grad_norm": 12.5625, + "learning_rate": 1.782962387012032e-06, + "loss": 2.0984, + "step": 9915 + }, + { + "epoch": 0.2924959457467197, + "grad_norm": 14.0, + "learning_rate": 1.7826421497496743e-06, + "loss": 2.1405, + "step": 9920 + }, + { + "epoch": 0.2926433731387292, + "grad_norm": 15.5, + "learning_rate": 1.7823217052115434e-06, + "loss": 2.2241, + "step": 9925 + }, + { + "epoch": 0.29279080053073864, + "grad_norm": 15.4375, + "learning_rate": 1.7820010534825057e-06, + "loss": 2.1421, + "step": 9930 + }, + { + "epoch": 0.29293822792274804, + "grad_norm": 16.625, + "learning_rate": 1.7816801946474832e-06, + "loss": 2.1865, + "step": 9935 + }, + { + "epoch": 0.2930856553147575, + "grad_norm": 11.625, + "learning_rate": 1.781359128791452e-06, + "loss": 2.1026, + "step": 9940 + }, + { + "epoch": 0.2932330827067669, + "grad_norm": 12.9375, + "learning_rate": 1.7810378559994442e-06, + "loss": 2.1224, + "step": 9945 + }, + { + "epoch": 0.29338051009877636, + "grad_norm": 13.8125, + "learning_rate": 1.7807163763565457e-06, + "loss": 2.1026, + "step": 9950 + }, + { + "epoch": 0.29352793749078576, + "grad_norm": 17.75, + "learning_rate": 1.7803946899478972e-06, + "loss": 2.1495, + "step": 9955 + }, + { + "epoch": 0.2936753648827952, + "grad_norm": 14.5, + "learning_rate": 1.7800727968586952e-06, + "loss": 2.2219, + "step": 9960 + }, + { + "epoch": 0.2938227922748047, + "grad_norm": 14.5, + "learning_rate": 1.7797506971741899e-06, + "loss": 2.1837, + "step": 9965 + }, + { + "epoch": 0.2939702196668141, + "grad_norm": 12.1875, + "learning_rate": 1.779428390979686e-06, + "loss": 2.0926, + "step": 9970 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 24.625, + "learning_rate": 1.7791058783605442e-06, + "loss": 2.192, + "step": 9975 + }, + { + "epoch": 0.29426507445083294, + "grad_norm": 13.75, + "learning_rate": 1.7787831594021787e-06, + "loss": 2.2558, + "step": 9980 + }, + { + "epoch": 0.2944125018428424, + "grad_norm": 15.125, + "learning_rate": 1.7784602341900585e-06, + "loss": 2.2184, + "step": 9985 + }, + { + "epoch": 0.29455992923485186, + "grad_norm": 26.125, + "learning_rate": 1.7781371028097079e-06, + "loss": 2.2509, + "step": 9990 + }, + { + "epoch": 0.29470735662686126, + "grad_norm": 14.9375, + "learning_rate": 1.7778137653467052e-06, + "loss": 2.2607, + "step": 9995 + }, + { + "epoch": 0.2948547840188707, + "grad_norm": 13.875, + "learning_rate": 1.7774902218866833e-06, + "loss": 2.1507, + "step": 10000 + }, + { + "epoch": 0.2948547840188707, + "eval_loss": 2.1286072731018066, + "eval_runtime": 4.7121, + "eval_samples_per_second": 84.038, + "eval_steps_per_second": 2.759, + "step": 10000 } ], "logging_steps": 5, @@ -7113,7 +14193,7 @@ "attributes": {} } }, - "total_flos": 1.3986311890783961e+19, + "total_flos": 2.7972623782104793e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null