{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999731478746543, "eval_steps": 500, "global_step": 4655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010740850138288446, "grad_norm": 144.04261779785156, "learning_rate": 5.966587112171838e-07, "loss": 55.4357, "step": 50 }, { "epoch": 0.021481700276576893, "grad_norm": 98.8813247680664, "learning_rate": 1.1933174224343676e-06, "loss": 49.1738, "step": 100 }, { "epoch": 0.03222255041486534, "grad_norm": 35.908973693847656, "learning_rate": 1.7899761336515514e-06, "loss": 35.5472, "step": 150 }, { "epoch": 0.042963400553153785, "grad_norm": 25.380966186523438, "learning_rate": 2.386634844868735e-06, "loss": 30.9308, "step": 200 }, { "epoch": 0.053704250691442225, "grad_norm": 20.454557418823242, "learning_rate": 2.983293556085919e-06, "loss": 29.5101, "step": 250 }, { "epoch": 0.06444510082973068, "grad_norm": 23.909564971923828, "learning_rate": 3.579952267303103e-06, "loss": 29.3489, "step": 300 }, { "epoch": 0.07518595096801912, "grad_norm": 26.212711334228516, "learning_rate": 4.176610978520287e-06, "loss": 28.3586, "step": 350 }, { "epoch": 0.08592680110630757, "grad_norm": 23.899639129638672, "learning_rate": 4.77326968973747e-06, "loss": 27.9249, "step": 400 }, { "epoch": 0.096667651244596, "grad_norm": 30.17796516418457, "learning_rate": 5.369928400954655e-06, "loss": 27.7261, "step": 450 }, { "epoch": 0.10740850138288445, "grad_norm": 24.356124877929688, "learning_rate": 5.966587112171838e-06, "loss": 27.656, "step": 500 }, { "epoch": 0.1181493515211729, "grad_norm": 28.717784881591797, "learning_rate": 6.563245823389022e-06, "loss": 27.3328, "step": 550 }, { "epoch": 0.12889020165946136, "grad_norm": 27.612842559814453, "learning_rate": 7.159904534606206e-06, "loss": 27.2348, "step": 600 }, { "epoch": 0.1396310517977498, "grad_norm": 24.70746612548828, "learning_rate": 7.75656324582339e-06, "loss": 26.9607, "step": 650 }, { "epoch": 0.15037190193603825, "grad_norm": 25.603246688842773, "learning_rate": 8.353221957040574e-06, "loss": 27.0255, "step": 700 }, { "epoch": 0.16111275207432668, "grad_norm": 24.41112518310547, "learning_rate": 8.949880668257757e-06, "loss": 26.7082, "step": 750 }, { "epoch": 0.17185360221261514, "grad_norm": 45.70518112182617, "learning_rate": 9.54653937947494e-06, "loss": 26.8, "step": 800 }, { "epoch": 0.18259445235090357, "grad_norm": 28.919174194335938, "learning_rate": 9.99999515916765e-06, "loss": 26.6221, "step": 850 }, { "epoch": 0.193335302489192, "grad_norm": 35.64968490600586, "learning_rate": 9.999870777205538e-06, "loss": 26.4584, "step": 900 }, { "epoch": 0.20407615262748047, "grad_norm": 28.937225341796875, "learning_rate": 9.999578315574637e-06, "loss": 26.185, "step": 950 }, { "epoch": 0.2148170027657689, "grad_norm": 31.565160751342773, "learning_rate": 9.999117784106572e-06, "loss": 26.1142, "step": 1000 }, { "epoch": 0.22555785290405736, "grad_norm": 31.458599090576172, "learning_rate": 9.998489198282936e-06, "loss": 25.9228, "step": 1050 }, { "epoch": 0.2362987030423458, "grad_norm": 44.381195068359375, "learning_rate": 9.997692579234778e-06, "loss": 25.9126, "step": 1100 }, { "epoch": 0.24703955318063425, "grad_norm": 30.515155792236328, "learning_rate": 9.996727953741881e-06, "loss": 25.9804, "step": 1150 }, { "epoch": 0.2577804033189227, "grad_norm": 36.5507926940918, "learning_rate": 9.995595354231868e-06, "loss": 25.671, "step": 1200 }, { "epoch": 0.2685212534572111, "grad_norm": 42.24934005737305, "learning_rate": 9.994294818779118e-06, "loss": 25.5169, "step": 1250 }, { "epoch": 0.2792621035954996, "grad_norm": 52.902706146240234, "learning_rate": 9.992826391103472e-06, "loss": 25.6281, "step": 1300 }, { "epoch": 0.29000295373378804, "grad_norm": 40.59647750854492, "learning_rate": 9.991190120568773e-06, "loss": 25.2706, "step": 1350 }, { "epoch": 0.3007438038720765, "grad_norm": 36.96504592895508, "learning_rate": 9.989386062181205e-06, "loss": 25.2991, "step": 1400 }, { "epoch": 0.3114846540103649, "grad_norm": 43.379581451416016, "learning_rate": 9.987414276587442e-06, "loss": 25.2432, "step": 1450 }, { "epoch": 0.32222550414865336, "grad_norm": 49.82856750488281, "learning_rate": 9.985274830072611e-06, "loss": 25.1967, "step": 1500 }, { "epoch": 0.3329663542869418, "grad_norm": 46.151126861572266, "learning_rate": 9.982967794558066e-06, "loss": 25.0327, "step": 1550 }, { "epoch": 0.3437072044252303, "grad_norm": 48.05283737182617, "learning_rate": 9.980493247598958e-06, "loss": 25.0835, "step": 1600 }, { "epoch": 0.3544480545635187, "grad_norm": 63.88609313964844, "learning_rate": 9.977851272381651e-06, "loss": 24.8268, "step": 1650 }, { "epoch": 0.36518890470180715, "grad_norm": 45.50059509277344, "learning_rate": 9.9750419577209e-06, "loss": 25.0351, "step": 1700 }, { "epoch": 0.3759297548400956, "grad_norm": 48.05316925048828, "learning_rate": 9.972065398056882e-06, "loss": 24.7551, "step": 1750 }, { "epoch": 0.386670604978384, "grad_norm": 52.77336883544922, "learning_rate": 9.968921693452016e-06, "loss": 24.8721, "step": 1800 }, { "epoch": 0.3974114551166725, "grad_norm": 42.8079719543457, "learning_rate": 9.9656109495876e-06, "loss": 24.5587, "step": 1850 }, { "epoch": 0.40815230525496093, "grad_norm": 51.13129806518555, "learning_rate": 9.962133277760251e-06, "loss": 24.5215, "step": 1900 }, { "epoch": 0.4188931553932494, "grad_norm": 48.29108428955078, "learning_rate": 9.958488794878185e-06, "loss": 24.6419, "step": 1950 }, { "epoch": 0.4296340055315378, "grad_norm": 52.165771484375, "learning_rate": 9.954677623457258e-06, "loss": 24.5354, "step": 2000 }, { "epoch": 0.44037485566982626, "grad_norm": 49.00601577758789, "learning_rate": 9.950699891616867e-06, "loss": 24.5425, "step": 2050 }, { "epoch": 0.4511157058081147, "grad_norm": 47.95719909667969, "learning_rate": 9.946555733075641e-06, "loss": 24.3845, "step": 2100 }, { "epoch": 0.4618565559464032, "grad_norm": 48.98214340209961, "learning_rate": 9.942245287146938e-06, "loss": 24.4838, "step": 2150 }, { "epoch": 0.4725974060846916, "grad_norm": 55.21929931640625, "learning_rate": 9.937768698734169e-06, "loss": 24.1993, "step": 2200 }, { "epoch": 0.48333825622298004, "grad_norm": 50.942928314208984, "learning_rate": 9.93312611832592e-06, "loss": 24.2968, "step": 2250 }, { "epoch": 0.4940791063612685, "grad_norm": 51.973846435546875, "learning_rate": 9.928317701990902e-06, "loss": 24.1991, "step": 2300 }, { "epoch": 0.5048199564995569, "grad_norm": 50.56647872924805, "learning_rate": 9.923343611372696e-06, "loss": 24.0901, "step": 2350 }, { "epoch": 0.5155608066378454, "grad_norm": 49.63374710083008, "learning_rate": 9.918204013684327e-06, "loss": 23.879, "step": 2400 }, { "epoch": 0.5263016567761338, "grad_norm": 65.09294891357422, "learning_rate": 9.912899081702633e-06, "loss": 24.0486, "step": 2450 }, { "epoch": 0.5370425069144222, "grad_norm": 53.957706451416016, "learning_rate": 9.907428993762467e-06, "loss": 24.2522, "step": 2500 }, { "epoch": 0.5477833570527108, "grad_norm": 52.10214614868164, "learning_rate": 9.901793933750695e-06, "loss": 24.1163, "step": 2550 }, { "epoch": 0.5585242071909992, "grad_norm": 48.26007080078125, "learning_rate": 9.895994091100016e-06, "loss": 24.0287, "step": 2600 }, { "epoch": 0.5692650573292876, "grad_norm": 64.58251190185547, "learning_rate": 9.890029660782596e-06, "loss": 24.0556, "step": 2650 }, { "epoch": 0.5800059074675761, "grad_norm": 51.29060745239258, "learning_rate": 9.883900843303512e-06, "loss": 23.9603, "step": 2700 }, { "epoch": 0.5907467576058645, "grad_norm": 50.7767448425293, "learning_rate": 9.87760784469401e-06, "loss": 24.067, "step": 2750 }, { "epoch": 0.601487607744153, "grad_norm": 49.88545608520508, "learning_rate": 9.871150876504582e-06, "loss": 23.7071, "step": 2800 }, { "epoch": 0.6122284578824414, "grad_norm": 50.70675277709961, "learning_rate": 9.864530155797854e-06, "loss": 23.7861, "step": 2850 }, { "epoch": 0.6229693080207298, "grad_norm": 71.06696319580078, "learning_rate": 9.857745905141285e-06, "loss": 24.0092, "step": 2900 }, { "epoch": 0.6337101581590183, "grad_norm": 60.42924880981445, "learning_rate": 9.85079835259969e-06, "loss": 24.0478, "step": 2950 }, { "epoch": 0.6444510082973067, "grad_norm": 70.92335510253906, "learning_rate": 9.843687731727573e-06, "loss": 23.9587, "step": 3000 }, { "epoch": 0.6551918584355951, "grad_norm": 46.90457534790039, "learning_rate": 9.836414281561273e-06, "loss": 23.7579, "step": 3050 }, { "epoch": 0.6659327085738836, "grad_norm": 72.55790710449219, "learning_rate": 9.828978246610924e-06, "loss": 23.883, "step": 3100 }, { "epoch": 0.676673558712172, "grad_norm": 69.81002044677734, "learning_rate": 9.821379876852246e-06, "loss": 23.6659, "step": 3150 }, { "epoch": 0.6874144088504606, "grad_norm": 64.69695281982422, "learning_rate": 9.813619427718139e-06, "loss": 23.5357, "step": 3200 }, { "epoch": 0.698155258988749, "grad_norm": 56.05459213256836, "learning_rate": 9.805697160090084e-06, "loss": 23.6215, "step": 3250 }, { "epoch": 0.7088961091270374, "grad_norm": 49.11384582519531, "learning_rate": 9.797613340289391e-06, "loss": 23.5262, "step": 3300 }, { "epoch": 0.7196369592653259, "grad_norm": 53.399513244628906, "learning_rate": 9.789368240068233e-06, "loss": 23.5788, "step": 3350 }, { "epoch": 0.7303778094036143, "grad_norm": 53.13779067993164, "learning_rate": 9.780962136600518e-06, "loss": 23.6689, "step": 3400 }, { "epoch": 0.7411186595419027, "grad_norm": 54.35108947753906, "learning_rate": 9.772395312472565e-06, "loss": 23.4544, "step": 3450 }, { "epoch": 0.7518595096801912, "grad_norm": 64.02580261230469, "learning_rate": 9.763668055673609e-06, "loss": 23.7735, "step": 3500 }, { "epoch": 0.7626003598184796, "grad_norm": 74.87864685058594, "learning_rate": 9.75478065958612e-06, "loss": 23.4972, "step": 3550 }, { "epoch": 0.773341209956768, "grad_norm": 59.911354064941406, "learning_rate": 9.745733422975939e-06, "loss": 23.4717, "step": 3600 }, { "epoch": 0.7840820600950565, "grad_norm": 48.266685485839844, "learning_rate": 9.73652664998223e-06, "loss": 23.4604, "step": 3650 }, { "epoch": 0.794822910233345, "grad_norm": 69.15245056152344, "learning_rate": 9.727160650107263e-06, "loss": 23.4732, "step": 3700 }, { "epoch": 0.8055637603716335, "grad_norm": 55.05851745605469, "learning_rate": 9.717635738206007e-06, "loss": 23.4482, "step": 3750 }, { "epoch": 0.8163046105099219, "grad_norm": 50.051841735839844, "learning_rate": 9.707952234475548e-06, "loss": 23.6229, "step": 3800 }, { "epoch": 0.8270454606482103, "grad_norm": 56.18715286254883, "learning_rate": 9.698110464444316e-06, "loss": 23.5294, "step": 3850 }, { "epoch": 0.8377863107864988, "grad_norm": 58.675270080566406, "learning_rate": 9.688110758961152e-06, "loss": 23.3871, "step": 3900 }, { "epoch": 0.8485271609247872, "grad_norm": 61.508358001708984, "learning_rate": 9.677953454184182e-06, "loss": 23.4014, "step": 3950 }, { "epoch": 0.8592680110630756, "grad_norm": 48.32643508911133, "learning_rate": 9.66763889156951e-06, "loss": 23.5284, "step": 4000 }, { "epoch": 0.8700088612013641, "grad_norm": 47.48092269897461, "learning_rate": 9.65716741785976e-06, "loss": 23.4871, "step": 4050 }, { "epoch": 0.8807497113396525, "grad_norm": 54.80436325073242, "learning_rate": 9.646539385072389e-06, "loss": 23.3196, "step": 4100 }, { "epoch": 0.8914905614779409, "grad_norm": 71.13292694091797, "learning_rate": 9.63575515048788e-06, "loss": 23.3575, "step": 4150 }, { "epoch": 0.9022314116162294, "grad_norm": 64.09466552734375, "learning_rate": 9.624815076637717e-06, "loss": 23.3654, "step": 4200 }, { "epoch": 0.9129722617545178, "grad_norm": 60.569332122802734, "learning_rate": 9.613719531292203e-06, "loss": 23.0099, "step": 4250 }, { "epoch": 0.9237131118928064, "grad_norm": 56.20964050292969, "learning_rate": 9.602468887448095e-06, "loss": 23.1344, "step": 4300 }, { "epoch": 0.9344539620310948, "grad_norm": 57.44073486328125, "learning_rate": 9.591063523316066e-06, "loss": 23.0187, "step": 4350 }, { "epoch": 0.9451948121693832, "grad_norm": 55.36775207519531, "learning_rate": 9.579503822307991e-06, "loss": 23.0668, "step": 4400 }, { "epoch": 0.9559356623076717, "grad_norm": 56.757423400878906, "learning_rate": 9.567790173024057e-06, "loss": 23.5033, "step": 4450 }, { "epoch": 0.9666765124459601, "grad_norm": 65.57225799560547, "learning_rate": 9.555922969239694e-06, "loss": 23.2055, "step": 4500 }, { "epoch": 0.9774173625842485, "grad_norm": 62.57088088989258, "learning_rate": 9.543902609892358e-06, "loss": 23.4044, "step": 4550 }, { "epoch": 0.988158212722537, "grad_norm": 48.910579681396484, "learning_rate": 9.531729499068089e-06, "loss": 23.1226, "step": 4600 }, { "epoch": 0.9988990628608254, "grad_norm": 51.13067626953125, "learning_rate": 9.519404045987953e-06, "loss": 23.1075, "step": 4650 } ], "logging_steps": 50, "max_steps": 27930, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }