{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 300, "global_step": 1098, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00273224043715847, "grad_norm": 84.5, "learning_rate": 1e-06, "loss": 2.8857, "step": 1 }, { "epoch": 0.01366120218579235, "grad_norm": 40.75, "learning_rate": 1.9999630266778663e-06, "loss": 2.6209, "step": 5 }, { "epoch": 0.0273224043715847, "grad_norm": 16.875, "learning_rate": 1.999737088499184e-06, "loss": 2.1772, "step": 10 }, { "epoch": 0.040983606557377046, "grad_norm": 9.75, "learning_rate": 1.999305799228933e-06, "loss": 1.96, "step": 15 }, { "epoch": 0.0546448087431694, "grad_norm": 6.03125, "learning_rate": 1.998669247456129e-06, "loss": 1.8595, "step": 20 }, { "epoch": 0.06830601092896176, "grad_norm": 4.8125, "learning_rate": 1.997827563931747e-06, "loss": 1.7983, "step": 25 }, { "epoch": 0.08196721311475409, "grad_norm": 3.90625, "learning_rate": 1.9967809215418726e-06, "loss": 1.7237, "step": 30 }, { "epoch": 0.09562841530054644, "grad_norm": 2.765625, "learning_rate": 1.9955295352721854e-06, "loss": 1.6815, "step": 35 }, { "epoch": 0.1092896174863388, "grad_norm": 2.625, "learning_rate": 1.9940736621637997e-06, "loss": 1.6637, "step": 40 }, { "epoch": 0.12295081967213115, "grad_norm": 2.484375, "learning_rate": 1.992413601260471e-06, "loss": 1.6501, "step": 45 }, { "epoch": 0.1366120218579235, "grad_norm": 2.171875, "learning_rate": 1.990549693547166e-06, "loss": 1.6316, "step": 50 }, { "epoch": 0.15027322404371585, "grad_norm": 1.984375, "learning_rate": 1.988482321880025e-06, "loss": 1.6138, "step": 55 }, { "epoch": 0.16393442622950818, "grad_norm": 2.28125, "learning_rate": 1.9862119109077224e-06, "loss": 1.6096, "step": 60 }, { "epoch": 0.17759562841530055, "grad_norm": 1.921875, "learning_rate": 1.983738926984239e-06, "loss": 1.5883, "step": 65 }, { "epoch": 0.1912568306010929, "grad_norm": 1.8828125, "learning_rate": 1.9810638780730727e-06, "loss": 1.5932, "step": 70 }, { "epoch": 0.20491803278688525, "grad_norm": 2.078125, "learning_rate": 1.9781873136428984e-06, "loss": 1.5829, "step": 75 }, { "epoch": 0.2185792349726776, "grad_norm": 1.9140625, "learning_rate": 1.975109824554707e-06, "loss": 1.5835, "step": 80 }, { "epoch": 0.23224043715846995, "grad_norm": 1.828125, "learning_rate": 1.971832042940436e-06, "loss": 1.5746, "step": 85 }, { "epoch": 0.2459016393442623, "grad_norm": 1.8046875, "learning_rate": 1.968354642073129e-06, "loss": 1.5765, "step": 90 }, { "epoch": 0.25956284153005466, "grad_norm": 1.75, "learning_rate": 1.964678336228642e-06, "loss": 1.5685, "step": 95 }, { "epoch": 0.273224043715847, "grad_norm": 1.8671875, "learning_rate": 1.960803880538925e-06, "loss": 1.564, "step": 100 }, { "epoch": 0.28688524590163933, "grad_norm": 1.8046875, "learning_rate": 1.9567320708369176e-06, "loss": 1.5642, "step": 105 }, { "epoch": 0.3005464480874317, "grad_norm": 1.8984375, "learning_rate": 1.9524637434930776e-06, "loss": 1.5723, "step": 110 }, { "epoch": 0.31420765027322406, "grad_norm": 1.828125, "learning_rate": 1.9479997752435884e-06, "loss": 1.5712, "step": 115 }, { "epoch": 0.32786885245901637, "grad_norm": 1.921875, "learning_rate": 1.943341083010272e-06, "loss": 1.5609, "step": 120 }, { "epoch": 0.34153005464480873, "grad_norm": 1.6796875, "learning_rate": 1.9384886237122493e-06, "loss": 1.5596, "step": 125 }, { "epoch": 0.3551912568306011, "grad_norm": 1.90625, "learning_rate": 1.9334433940693826e-06, "loss": 1.5582, "step": 130 }, { "epoch": 0.36885245901639346, "grad_norm": 1.6875, "learning_rate": 1.928206430397546e-06, "loss": 1.5554, "step": 135 }, { "epoch": 0.3825136612021858, "grad_norm": 1.71875, "learning_rate": 1.9227788083957586e-06, "loss": 1.5653, "step": 140 }, { "epoch": 0.39617486338797814, "grad_norm": 1.6875, "learning_rate": 1.9171616429252344e-06, "loss": 1.5515, "step": 145 }, { "epoch": 0.4098360655737705, "grad_norm": 1.6796875, "learning_rate": 1.9113560877803796e-06, "loss": 1.5509, "step": 150 }, { "epoch": 0.42349726775956287, "grad_norm": 1.65625, "learning_rate": 1.9053633354517988e-06, "loss": 1.5523, "step": 155 }, { "epoch": 0.4371584699453552, "grad_norm": 1.6875, "learning_rate": 1.8991846168813544e-06, "loss": 1.5588, "step": 160 }, { "epoch": 0.45081967213114754, "grad_norm": 1.7578125, "learning_rate": 1.89282120120932e-06, "loss": 1.5507, "step": 165 }, { "epoch": 0.4644808743169399, "grad_norm": 1.7109375, "learning_rate": 1.8862743955136963e-06, "loss": 1.5467, "step": 170 }, { "epoch": 0.4781420765027322, "grad_norm": 1.6875, "learning_rate": 1.8795455445417286e-06, "loss": 1.5491, "step": 175 }, { "epoch": 0.4918032786885246, "grad_norm": 1.65625, "learning_rate": 1.8726360304336893e-06, "loss": 1.5416, "step": 180 }, { "epoch": 0.505464480874317, "grad_norm": 1.6796875, "learning_rate": 1.8655472724389796e-06, "loss": 1.5483, "step": 185 }, { "epoch": 0.5191256830601093, "grad_norm": 1.65625, "learning_rate": 1.858280726624609e-06, "loss": 1.5543, "step": 190 }, { "epoch": 0.5327868852459017, "grad_norm": 1.71875, "learning_rate": 1.8508378855761095e-06, "loss": 1.5393, "step": 195 }, { "epoch": 0.546448087431694, "grad_norm": 1.7265625, "learning_rate": 1.8432202780909538e-06, "loss": 1.5435, "step": 200 }, { "epoch": 0.5601092896174863, "grad_norm": 1.7421875, "learning_rate": 1.83542946886453e-06, "loss": 1.549, "step": 205 }, { "epoch": 0.5737704918032787, "grad_norm": 1.75, "learning_rate": 1.8274670581687478e-06, "loss": 1.5422, "step": 210 }, { "epoch": 0.587431693989071, "grad_norm": 1.671875, "learning_rate": 1.819334681523331e-06, "loss": 1.5413, "step": 215 }, { "epoch": 0.6010928961748634, "grad_norm": 1.671875, "learning_rate": 1.811034009359877e-06, "loss": 1.5403, "step": 220 }, { "epoch": 0.6147540983606558, "grad_norm": 1.703125, "learning_rate": 1.8025667466787391e-06, "loss": 1.5293, "step": 225 }, { "epoch": 0.6284153005464481, "grad_norm": 1.6953125, "learning_rate": 1.7939346326988125e-06, "loss": 1.5472, "step": 230 }, { "epoch": 0.6420765027322405, "grad_norm": 1.6953125, "learning_rate": 1.7851394405002884e-06, "loss": 1.5436, "step": 235 }, { "epoch": 0.6557377049180327, "grad_norm": 1.6875, "learning_rate": 1.7761829766604554e-06, "loss": 1.5341, "step": 240 }, { "epoch": 0.6693989071038251, "grad_norm": 1.7421875, "learning_rate": 1.7670670808826191e-06, "loss": 1.5287, "step": 245 }, { "epoch": 0.6830601092896175, "grad_norm": 1.703125, "learning_rate": 1.7577936256182167e-06, "loss": 1.5389, "step": 250 }, { "epoch": 0.6967213114754098, "grad_norm": 1.6875, "learning_rate": 1.7483645156822069e-06, "loss": 1.5431, "step": 255 }, { "epoch": 0.7103825136612022, "grad_norm": 1.7265625, "learning_rate": 1.7387816878618117e-06, "loss": 1.5424, "step": 260 }, { "epoch": 0.7240437158469946, "grad_norm": 1.703125, "learning_rate": 1.729047110518689e-06, "loss": 1.5294, "step": 265 }, { "epoch": 0.7377049180327869, "grad_norm": 1.7109375, "learning_rate": 1.7191627831846222e-06, "loss": 1.5256, "step": 270 }, { "epoch": 0.7513661202185792, "grad_norm": 1.8515625, "learning_rate": 1.7091307361508055e-06, "loss": 1.5359, "step": 275 }, { "epoch": 0.7650273224043715, "grad_norm": 1.640625, "learning_rate": 1.6989530300508123e-06, "loss": 1.5318, "step": 280 }, { "epoch": 0.7786885245901639, "grad_norm": 1.78125, "learning_rate": 1.6886317554373302e-06, "loss": 1.5344, "step": 285 }, { "epoch": 0.7923497267759563, "grad_norm": 1.65625, "learning_rate": 1.6781690323527509e-06, "loss": 1.5272, "step": 290 }, { "epoch": 0.8060109289617486, "grad_norm": 1.6875, "learning_rate": 1.6675670098937032e-06, "loss": 1.5293, "step": 295 }, { "epoch": 0.819672131147541, "grad_norm": 1.6484375, "learning_rate": 1.6568278657696162e-06, "loss": 1.5401, "step": 300 }, { "epoch": 0.819672131147541, "eval_loss": 1.533558964729309, "eval_runtime": 19.6321, "eval_samples_per_second": 36.063, "eval_steps_per_second": 1.172, "step": 300 }, { "epoch": 0.8333333333333334, "grad_norm": 1.6640625, "learning_rate": 1.6459538058554087e-06, "loss": 1.5176, "step": 305 }, { "epoch": 0.8469945355191257, "grad_norm": 1.671875, "learning_rate": 1.6349470637383888e-06, "loss": 1.5312, "step": 310 }, { "epoch": 0.860655737704918, "grad_norm": 1.65625, "learning_rate": 1.6238099002594669e-06, "loss": 1.5391, "step": 315 }, { "epoch": 0.8743169398907104, "grad_norm": 1.7421875, "learning_rate": 1.6125446030487642e-06, "loss": 1.5248, "step": 320 }, { "epoch": 0.8879781420765027, "grad_norm": 1.6875, "learning_rate": 1.6011534860557236e-06, "loss": 1.5381, "step": 325 }, { "epoch": 0.9016393442622951, "grad_norm": 1.6484375, "learning_rate": 1.5896388890738127e-06, "loss": 1.5216, "step": 330 }, { "epoch": 0.9153005464480874, "grad_norm": 1.6640625, "learning_rate": 1.578003177259917e-06, "loss": 1.524, "step": 335 }, { "epoch": 0.9289617486338798, "grad_norm": 1.65625, "learning_rate": 1.566248740648527e-06, "loss": 1.5255, "step": 340 }, { "epoch": 0.9426229508196722, "grad_norm": 1.671875, "learning_rate": 1.5543779936608106e-06, "loss": 1.5213, "step": 345 }, { "epoch": 0.9562841530054644, "grad_norm": 1.6875, "learning_rate": 1.5423933746086793e-06, "loss": 1.517, "step": 350 }, { "epoch": 0.9699453551912568, "grad_norm": 1.6796875, "learning_rate": 1.5302973451939472e-06, "loss": 1.524, "step": 355 }, { "epoch": 0.9836065573770492, "grad_norm": 1.8203125, "learning_rate": 1.5180923900026845e-06, "loss": 1.5224, "step": 360 }, { "epoch": 0.9972677595628415, "grad_norm": 1.6875, "learning_rate": 1.5057810159948714e-06, "loss": 1.5148, "step": 365 }, { "epoch": 1.010928961748634, "grad_norm": 1.6328125, "learning_rate": 1.493365751989454e-06, "loss": 1.5175, "step": 370 }, { "epoch": 1.0245901639344261, "grad_norm": 1.609375, "learning_rate": 1.4808491481449143e-06, "loss": 1.5084, "step": 375 }, { "epoch": 1.0382513661202186, "grad_norm": 1.625, "learning_rate": 1.4682337754354533e-06, "loss": 1.5141, "step": 380 }, { "epoch": 1.0519125683060109, "grad_norm": 1.6796875, "learning_rate": 1.4555222251228995e-06, "loss": 1.52, "step": 385 }, { "epoch": 1.0655737704918034, "grad_norm": 1.671875, "learning_rate": 1.442717108224452e-06, "loss": 1.5167, "step": 390 }, { "epoch": 1.0792349726775956, "grad_norm": 1.6640625, "learning_rate": 1.4298210549763628e-06, "loss": 1.5087, "step": 395 }, { "epoch": 1.092896174863388, "grad_norm": 1.6875, "learning_rate": 1.4168367142936734e-06, "loss": 1.5049, "step": 400 }, { "epoch": 1.1065573770491803, "grad_norm": 1.6796875, "learning_rate": 1.4037667532261142e-06, "loss": 1.5102, "step": 405 }, { "epoch": 1.1202185792349726, "grad_norm": 1.671875, "learning_rate": 1.3906138564102792e-06, "loss": 1.5156, "step": 410 }, { "epoch": 1.133879781420765, "grad_norm": 1.609375, "learning_rate": 1.3773807255181874e-06, "loss": 1.5162, "step": 415 }, { "epoch": 1.1475409836065573, "grad_norm": 1.6171875, "learning_rate": 1.3640700787023462e-06, "loss": 1.5099, "step": 420 }, { "epoch": 1.1612021857923498, "grad_norm": 1.6640625, "learning_rate": 1.3506846500374283e-06, "loss": 1.5159, "step": 425 }, { "epoch": 1.174863387978142, "grad_norm": 1.671875, "learning_rate": 1.337227188958679e-06, "loss": 1.5145, "step": 430 }, { "epoch": 1.1885245901639343, "grad_norm": 3.078125, "learning_rate": 1.3237004596971687e-06, "loss": 1.5066, "step": 435 }, { "epoch": 1.2021857923497268, "grad_norm": 1.6484375, "learning_rate": 1.3101072407120055e-06, "loss": 1.5144, "step": 440 }, { "epoch": 1.215846994535519, "grad_norm": 1.6953125, "learning_rate": 1.2964503241196256e-06, "loss": 1.5124, "step": 445 }, { "epoch": 1.2295081967213115, "grad_norm": 1.65625, "learning_rate": 1.2827325151202782e-06, "loss": 1.5052, "step": 450 }, { "epoch": 1.2431693989071038, "grad_norm": 1.7734375, "learning_rate": 1.2689566314218228e-06, "loss": 1.5103, "step": 455 }, { "epoch": 1.2568306010928962, "grad_norm": 1.6953125, "learning_rate": 1.255125502660958e-06, "loss": 1.5078, "step": 460 }, { "epoch": 1.2704918032786885, "grad_norm": 1.625, "learning_rate": 1.2412419698220001e-06, "loss": 1.5134, "step": 465 }, { "epoch": 1.2841530054644807, "grad_norm": 1.640625, "learning_rate": 1.2273088846533302e-06, "loss": 1.5064, "step": 470 }, { "epoch": 1.2978142076502732, "grad_norm": 1.6640625, "learning_rate": 1.2133291090816295e-06, "loss": 1.5124, "step": 475 }, { "epoch": 1.3114754098360657, "grad_norm": 1.609375, "learning_rate": 1.1993055146240272e-06, "loss": 1.5144, "step": 480 }, { "epoch": 1.325136612021858, "grad_norm": 1.65625, "learning_rate": 1.185240981798273e-06, "loss": 1.5221, "step": 485 }, { "epoch": 1.3387978142076502, "grad_norm": 1.671875, "learning_rate": 1.171138399531068e-06, "loss": 1.5079, "step": 490 }, { "epoch": 1.3524590163934427, "grad_norm": 1.6640625, "learning_rate": 1.1570006645646614e-06, "loss": 1.5145, "step": 495 }, { "epoch": 1.366120218579235, "grad_norm": 1.640625, "learning_rate": 1.1428306808618454e-06, "loss": 1.5043, "step": 500 }, { "epoch": 1.3797814207650272, "grad_norm": 1.671875, "learning_rate": 1.1286313590094686e-06, "loss": 1.5171, "step": 505 }, { "epoch": 1.3934426229508197, "grad_norm": 1.640625, "learning_rate": 1.1144056156205831e-06, "loss": 1.5114, "step": 510 }, { "epoch": 1.4071038251366121, "grad_norm": 1.6484375, "learning_rate": 1.100156372735361e-06, "loss": 1.5032, "step": 515 }, { "epoch": 1.4207650273224044, "grad_norm": 1.6328125, "learning_rate": 1.0858865572208891e-06, "loss": 1.5151, "step": 520 }, { "epoch": 1.4344262295081966, "grad_norm": 1.625, "learning_rate": 1.071599100169978e-06, "loss": 1.5015, "step": 525 }, { "epoch": 1.4480874316939891, "grad_norm": 1.65625, "learning_rate": 1.0572969362990997e-06, "loss": 1.5075, "step": 530 }, { "epoch": 1.4617486338797814, "grad_norm": 2.65625, "learning_rate": 1.042983003345582e-06, "loss": 1.5117, "step": 535 }, { "epoch": 1.4754098360655736, "grad_norm": 1.6328125, "learning_rate": 1.0286602414641815e-06, "loss": 1.5036, "step": 540 }, { "epoch": 1.489071038251366, "grad_norm": 1.703125, "learning_rate": 1.0143315926231624e-06, "loss": 1.527, "step": 545 }, { "epoch": 1.5027322404371586, "grad_norm": 1.6484375, "learning_rate": 1e-06, "loss": 1.5065, "step": 550 }, { "epoch": 1.5163934426229508, "grad_norm": 1.65625, "learning_rate": 9.856684073768378e-07, "loss": 1.4989, "step": 555 }, { "epoch": 1.530054644808743, "grad_norm": 1.6640625, "learning_rate": 9.713397585358188e-07, "loss": 1.5066, "step": 560 }, { "epoch": 1.5437158469945356, "grad_norm": 1.6484375, "learning_rate": 9.57016996654418e-07, "loss": 1.5024, "step": 565 }, { "epoch": 1.5573770491803278, "grad_norm": 1.6484375, "learning_rate": 9.427030637009002e-07, "loss": 1.5095, "step": 570 }, { "epoch": 1.57103825136612, "grad_norm": 1.671875, "learning_rate": 9.28400899830022e-07, "loss": 1.5144, "step": 575 }, { "epoch": 1.5846994535519126, "grad_norm": 1.640625, "learning_rate": 9.141134427791109e-07, "loss": 1.5191, "step": 580 }, { "epoch": 1.598360655737705, "grad_norm": 1.6640625, "learning_rate": 8.998436272646393e-07, "loss": 1.5177, "step": 585 }, { "epoch": 1.6120218579234973, "grad_norm": 1.6796875, "learning_rate": 8.85594384379417e-07, "loss": 1.5098, "step": 590 }, { "epoch": 1.6256830601092895, "grad_norm": 1.6875, "learning_rate": 8.713686409905313e-07, "loss": 1.503, "step": 595 }, { "epoch": 1.639344262295082, "grad_norm": 1.6640625, "learning_rate": 8.571693191381544e-07, "loss": 1.5091, "step": 600 }, { "epoch": 1.639344262295082, "eval_loss": 1.5182926654815674, "eval_runtime": 19.6502, "eval_samples_per_second": 36.03, "eval_steps_per_second": 1.17, "step": 600 }, { "epoch": 1.6530054644808743, "grad_norm": 1.6640625, "learning_rate": 8.429993354353388e-07, "loss": 1.5003, "step": 605 }, { "epoch": 1.6666666666666665, "grad_norm": 1.625, "learning_rate": 8.288616004689319e-07, "loss": 1.5068, "step": 610 }, { "epoch": 1.680327868852459, "grad_norm": 1.6953125, "learning_rate": 8.147590182017269e-07, "loss": 1.5067, "step": 615 }, { "epoch": 1.6939890710382515, "grad_norm": 1.640625, "learning_rate": 8.006944853759732e-07, "loss": 1.5069, "step": 620 }, { "epoch": 1.7076502732240437, "grad_norm": 1.7734375, "learning_rate": 7.866708909183702e-07, "loss": 1.4937, "step": 625 }, { "epoch": 1.721311475409836, "grad_norm": 1.640625, "learning_rate": 7.726911153466697e-07, "loss": 1.5089, "step": 630 }, { "epoch": 1.7349726775956285, "grad_norm": 1.6484375, "learning_rate": 7.587580301779999e-07, "loss": 1.511, "step": 635 }, { "epoch": 1.748633879781421, "grad_norm": 1.6875, "learning_rate": 7.448744973390422e-07, "loss": 1.5025, "step": 640 }, { "epoch": 1.762295081967213, "grad_norm": 1.640625, "learning_rate": 7.310433685781777e-07, "loss": 1.5007, "step": 645 }, { "epoch": 1.7759562841530054, "grad_norm": 1.671875, "learning_rate": 7.172674848797217e-07, "loss": 1.4922, "step": 650 }, { "epoch": 1.789617486338798, "grad_norm": 1.640625, "learning_rate": 7.035496758803743e-07, "loss": 1.505, "step": 655 }, { "epoch": 1.8032786885245902, "grad_norm": 1.625, "learning_rate": 6.898927592879944e-07, "loss": 1.5111, "step": 660 }, { "epoch": 1.8169398907103824, "grad_norm": 1.640625, "learning_rate": 6.762995403028314e-07, "loss": 1.5145, "step": 665 }, { "epoch": 1.830601092896175, "grad_norm": 1.8359375, "learning_rate": 6.627728110413213e-07, "loss": 1.5097, "step": 670 }, { "epoch": 1.8442622950819674, "grad_norm": 1.609375, "learning_rate": 6.493153499625719e-07, "loss": 1.5087, "step": 675 }, { "epoch": 1.8579234972677594, "grad_norm": 2.0625, "learning_rate": 6.359299212976534e-07, "loss": 1.5044, "step": 680 }, { "epoch": 1.8715846994535519, "grad_norm": 1.640625, "learning_rate": 6.226192744818124e-07, "loss": 1.5105, "step": 685 }, { "epoch": 1.8852459016393444, "grad_norm": 1.6484375, "learning_rate": 6.093861435897207e-07, "loss": 1.5109, "step": 690 }, { "epoch": 1.8989071038251366, "grad_norm": 1.6640625, "learning_rate": 5.962332467738857e-07, "loss": 1.5103, "step": 695 }, { "epoch": 1.9125683060109289, "grad_norm": 1.6796875, "learning_rate": 5.83163285706327e-07, "loss": 1.5023, "step": 700 }, { "epoch": 1.9262295081967213, "grad_norm": 1.6640625, "learning_rate": 5.701789450236376e-07, "loss": 1.5084, "step": 705 }, { "epoch": 1.9398907103825138, "grad_norm": 1.6484375, "learning_rate": 5.57282891775548e-07, "loss": 1.5063, "step": 710 }, { "epoch": 1.9535519125683058, "grad_norm": 1.6328125, "learning_rate": 5.444777748771006e-07, "loss": 1.5074, "step": 715 }, { "epoch": 1.9672131147540983, "grad_norm": 1.6640625, "learning_rate": 5.317662245645469e-07, "loss": 1.5037, "step": 720 }, { "epoch": 1.9808743169398908, "grad_norm": 1.6796875, "learning_rate": 5.191508518550855e-07, "loss": 1.5085, "step": 725 }, { "epoch": 1.994535519125683, "grad_norm": 1.6015625, "learning_rate": 5.066342480105459e-07, "loss": 1.5082, "step": 730 }, { "epoch": 2.0081967213114753, "grad_norm": 1.6484375, "learning_rate": 4.942189840051287e-07, "loss": 1.5054, "step": 735 }, { "epoch": 2.021857923497268, "grad_norm": 1.84375, "learning_rate": 4.819076099973152e-07, "loss": 1.4999, "step": 740 }, { "epoch": 2.0355191256830603, "grad_norm": 1.6171875, "learning_rate": 4.697026548060528e-07, "loss": 1.5094, "step": 745 }, { "epoch": 2.0491803278688523, "grad_norm": 1.703125, "learning_rate": 4.5760662539132077e-07, "loss": 1.5065, "step": 750 }, { "epoch": 2.0628415300546448, "grad_norm": 1.921875, "learning_rate": 4.4562200633918943e-07, "loss": 1.5013, "step": 755 }, { "epoch": 2.0765027322404372, "grad_norm": 1.625, "learning_rate": 4.337512593514728e-07, "loss": 1.5041, "step": 760 }, { "epoch": 2.0901639344262297, "grad_norm": 1.6640625, "learning_rate": 4.2199682274008255e-07, "loss": 1.5083, "step": 765 }, { "epoch": 2.1038251366120218, "grad_norm": 1.6171875, "learning_rate": 4.103611109261872e-07, "loss": 1.4982, "step": 770 }, { "epoch": 2.1174863387978142, "grad_norm": 1.6328125, "learning_rate": 3.9884651394427625e-07, "loss": 1.5051, "step": 775 }, { "epoch": 2.1311475409836067, "grad_norm": 1.671875, "learning_rate": 3.8745539695123577e-07, "loss": 1.4924, "step": 780 }, { "epoch": 2.1448087431693987, "grad_norm": 1.65625, "learning_rate": 3.761900997405332e-07, "loss": 1.5113, "step": 785 }, { "epoch": 2.158469945355191, "grad_norm": 1.6484375, "learning_rate": 3.6505293626161127e-07, "loss": 1.5058, "step": 790 }, { "epoch": 2.1721311475409837, "grad_norm": 1.625, "learning_rate": 3.5404619414459147e-07, "loss": 1.5019, "step": 795 }, { "epoch": 2.185792349726776, "grad_norm": 1.640625, "learning_rate": 3.4317213423038384e-07, "loss": 1.4948, "step": 800 }, { "epoch": 2.199453551912568, "grad_norm": 1.625, "learning_rate": 3.32432990106297e-07, "loss": 1.5092, "step": 805 }, { "epoch": 2.2131147540983607, "grad_norm": 1.734375, "learning_rate": 3.2183096764724914e-07, "loss": 1.5048, "step": 810 }, { "epoch": 2.226775956284153, "grad_norm": 1.8984375, "learning_rate": 3.1136824456267006e-07, "loss": 1.5005, "step": 815 }, { "epoch": 2.240437158469945, "grad_norm": 1.7421875, "learning_rate": 3.01046969949188e-07, "loss": 1.5097, "step": 820 }, { "epoch": 2.2540983606557377, "grad_norm": 1.6640625, "learning_rate": 2.908692638491945e-07, "loss": 1.5054, "step": 825 }, { "epoch": 2.26775956284153, "grad_norm": 1.65625, "learning_rate": 2.80837216815378e-07, "loss": 1.5131, "step": 830 }, { "epoch": 2.281420765027322, "grad_norm": 1.671875, "learning_rate": 2.7095288948131114e-07, "loss": 1.4999, "step": 835 }, { "epoch": 2.2950819672131146, "grad_norm": 1.6328125, "learning_rate": 2.6121831213818826e-07, "loss": 1.4989, "step": 840 }, { "epoch": 2.308743169398907, "grad_norm": 1.6953125, "learning_rate": 2.51635484317793e-07, "loss": 1.495, "step": 845 }, { "epoch": 2.3224043715846996, "grad_norm": 1.703125, "learning_rate": 2.4220637438178313e-07, "loss": 1.5125, "step": 850 }, { "epoch": 2.3360655737704916, "grad_norm": 1.703125, "learning_rate": 2.3293291911738078e-07, "loss": 1.5092, "step": 855 }, { "epoch": 2.349726775956284, "grad_norm": 1.6640625, "learning_rate": 2.2381702333954433e-07, "loss": 1.4965, "step": 860 }, { "epoch": 2.3633879781420766, "grad_norm": 1.6328125, "learning_rate": 2.148605594997115e-07, "loss": 1.497, "step": 865 }, { "epoch": 2.3770491803278686, "grad_norm": 1.765625, "learning_rate": 2.0606536730118763e-07, "loss": 1.5081, "step": 870 }, { "epoch": 2.390710382513661, "grad_norm": 1.625, "learning_rate": 1.9743325332126105e-07, "loss": 1.5091, "step": 875 }, { "epoch": 2.4043715846994536, "grad_norm": 1.6171875, "learning_rate": 1.8896599064012298e-07, "loss": 1.5045, "step": 880 }, { "epoch": 2.418032786885246, "grad_norm": 1.640625, "learning_rate": 1.8066531847666888e-07, "loss": 1.5008, "step": 885 }, { "epoch": 2.431693989071038, "grad_norm": 1.65625, "learning_rate": 1.7253294183125222e-07, "loss": 1.511, "step": 890 }, { "epoch": 2.4453551912568305, "grad_norm": 1.6328125, "learning_rate": 1.645705311354697e-07, "loss": 1.4998, "step": 895 }, { "epoch": 2.459016393442623, "grad_norm": 1.65625, "learning_rate": 1.5677972190904621e-07, "loss": 1.5021, "step": 900 }, { "epoch": 2.459016393442623, "eval_loss": 1.5163270235061646, "eval_runtime": 19.6134, "eval_samples_per_second": 36.098, "eval_steps_per_second": 1.173, "step": 900 }, { "epoch": 2.4726775956284155, "grad_norm": 1.6328125, "learning_rate": 1.4916211442389048e-07, "loss": 1.502, "step": 905 }, { "epoch": 2.4863387978142075, "grad_norm": 1.625, "learning_rate": 1.4171927337539104e-07, "loss": 1.5012, "step": 910 }, { "epoch": 2.5, "grad_norm": 1.640625, "learning_rate": 1.344527275610202e-07, "loss": 1.5019, "step": 915 }, { "epoch": 2.5136612021857925, "grad_norm": 1.640625, "learning_rate": 1.273639695663108e-07, "loss": 1.5085, "step": 920 }, { "epoch": 2.527322404371585, "grad_norm": 1.75, "learning_rate": 1.204544554582716e-07, "loss": 1.4973, "step": 925 }, { "epoch": 2.540983606557377, "grad_norm": 1.671875, "learning_rate": 1.1372560448630375e-07, "loss": 1.5037, "step": 930 }, { "epoch": 2.5546448087431695, "grad_norm": 1.65625, "learning_rate": 1.0717879879068004e-07, "loss": 1.5005, "step": 935 }, { "epoch": 2.5683060109289615, "grad_norm": 1.6796875, "learning_rate": 1.0081538311864568e-07, "loss": 1.5024, "step": 940 }, { "epoch": 2.581967213114754, "grad_norm": 1.625, "learning_rate": 9.463666454820118e-08, "loss": 1.4988, "step": 945 }, { "epoch": 2.5956284153005464, "grad_norm": 1.625, "learning_rate": 8.864391221962064e-08, "loss": 1.5053, "step": 950 }, { "epoch": 2.609289617486339, "grad_norm": 1.6328125, "learning_rate": 8.28383570747655e-08, "loss": 1.5044, "step": 955 }, { "epoch": 2.6229508196721314, "grad_norm": 1.609375, "learning_rate": 7.722119160424112e-08, "loss": 1.4995, "step": 960 }, { "epoch": 2.6366120218579234, "grad_norm": 1.640625, "learning_rate": 7.179356960245409e-08, "loss": 1.5122, "step": 965 }, { "epoch": 2.650273224043716, "grad_norm": 1.7890625, "learning_rate": 6.655660593061718e-08, "loss": 1.5054, "step": 970 }, { "epoch": 2.663934426229508, "grad_norm": 1.6484375, "learning_rate": 6.151137628775049e-08, "loss": 1.5108, "step": 975 }, { "epoch": 2.6775956284153004, "grad_norm": 1.609375, "learning_rate": 5.665891698972769e-08, "loss": 1.5003, "step": 980 }, { "epoch": 2.691256830601093, "grad_norm": 1.640625, "learning_rate": 5.200022475641153e-08, "loss": 1.5015, "step": 985 }, { "epoch": 2.7049180327868854, "grad_norm": 1.609375, "learning_rate": 4.75362565069225e-08, "loss": 1.5002, "step": 990 }, { "epoch": 2.718579234972678, "grad_norm": 1.640625, "learning_rate": 4.326792916308242e-08, "loss": 1.5029, "step": 995 }, { "epoch": 2.73224043715847, "grad_norm": 1.7421875, "learning_rate": 3.919611946107493e-08, "loss": 1.5068, "step": 1000 }, { "epoch": 2.7459016393442623, "grad_norm": 1.921875, "learning_rate": 3.532166377135814e-08, "loss": 1.4961, "step": 1005 }, { "epoch": 2.7595628415300544, "grad_norm": 1.6953125, "learning_rate": 3.164535792687095e-08, "loss": 1.5, "step": 1010 }, { "epoch": 2.773224043715847, "grad_norm": 1.6171875, "learning_rate": 2.8167957059564095e-08, "loss": 1.5035, "step": 1015 }, { "epoch": 2.7868852459016393, "grad_norm": 1.6484375, "learning_rate": 2.4890175445293147e-08, "loss": 1.5007, "step": 1020 }, { "epoch": 2.800546448087432, "grad_norm": 1.6328125, "learning_rate": 2.1812686357101428e-08, "loss": 1.505, "step": 1025 }, { "epoch": 2.8142076502732243, "grad_norm": 1.6484375, "learning_rate": 1.8936121926927507e-08, "loss": 1.5066, "step": 1030 }, { "epoch": 2.8278688524590163, "grad_norm": 1.640625, "learning_rate": 1.6261073015761072e-08, "loss": 1.502, "step": 1035 }, { "epoch": 2.841530054644809, "grad_norm": 1.75, "learning_rate": 1.3788089092277688e-08, "loss": 1.5016, "step": 1040 }, { "epoch": 2.855191256830601, "grad_norm": 1.65625, "learning_rate": 1.1517678119975061e-08, "loss": 1.5024, "step": 1045 }, { "epoch": 2.8688524590163933, "grad_norm": 1.609375, "learning_rate": 9.450306452834178e-09, "loss": 1.5, "step": 1050 }, { "epoch": 2.8825136612021858, "grad_norm": 1.734375, "learning_rate": 7.586398739528932e-09, "loss": 1.4989, "step": 1055 }, { "epoch": 2.8961748633879782, "grad_norm": 1.7890625, "learning_rate": 5.926337836199891e-09, "loss": 1.507, "step": 1060 }, { "epoch": 2.9098360655737707, "grad_norm": 1.7734375, "learning_rate": 4.470464727814538e-09, "loss": 1.5119, "step": 1065 }, { "epoch": 2.9234972677595628, "grad_norm": 1.7734375, "learning_rate": 3.219078458127078e-09, "loss": 1.5072, "step": 1070 }, { "epoch": 2.9371584699453552, "grad_norm": 1.6328125, "learning_rate": 2.172436068252792e-09, "loss": 1.5074, "step": 1075 }, { "epoch": 2.9508196721311473, "grad_norm": 1.6171875, "learning_rate": 1.330752543871161e-09, "loss": 1.4947, "step": 1080 }, { "epoch": 2.9644808743169397, "grad_norm": 1.671875, "learning_rate": 6.942007710665221e-10, "loss": 1.5025, "step": 1085 }, { "epoch": 2.978142076502732, "grad_norm": 1.6640625, "learning_rate": 2.6291150081603207e-10, "loss": 1.5014, "step": 1090 }, { "epoch": 2.9918032786885247, "grad_norm": 1.609375, "learning_rate": 3.697332213348225e-11, "loss": 1.4964, "step": 1095 }, { "epoch": 3.0, "step": 1098, "total_flos": 7.187967826039144e+18, "train_loss": 1.5352961912832626, "train_runtime": 6782.5225, "train_samples_per_second": 10.345, "train_steps_per_second": 0.162 } ], "logging_steps": 5, "max_steps": 1098, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.187967826039144e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }