|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999731478746543, |
|
"eval_steps": 500, |
|
"global_step": 4655, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010740850138288446, |
|
"grad_norm": 144.04261779785156, |
|
"learning_rate": 5.966587112171838e-07, |
|
"loss": 55.4357, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021481700276576893, |
|
"grad_norm": 98.8813247680664, |
|
"learning_rate": 1.1933174224343676e-06, |
|
"loss": 49.1738, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03222255041486534, |
|
"grad_norm": 35.908973693847656, |
|
"learning_rate": 1.7899761336515514e-06, |
|
"loss": 35.5472, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.042963400553153785, |
|
"grad_norm": 25.380966186523438, |
|
"learning_rate": 2.386634844868735e-06, |
|
"loss": 30.9308, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.053704250691442225, |
|
"grad_norm": 20.454557418823242, |
|
"learning_rate": 2.983293556085919e-06, |
|
"loss": 29.5101, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06444510082973068, |
|
"grad_norm": 23.909564971923828, |
|
"learning_rate": 3.579952267303103e-06, |
|
"loss": 29.3489, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07518595096801912, |
|
"grad_norm": 26.212711334228516, |
|
"learning_rate": 4.176610978520287e-06, |
|
"loss": 28.3586, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08592680110630757, |
|
"grad_norm": 23.899639129638672, |
|
"learning_rate": 4.77326968973747e-06, |
|
"loss": 27.9249, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.096667651244596, |
|
"grad_norm": 30.17796516418457, |
|
"learning_rate": 5.369928400954655e-06, |
|
"loss": 27.7261, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.10740850138288445, |
|
"grad_norm": 24.356124877929688, |
|
"learning_rate": 5.966587112171838e-06, |
|
"loss": 27.656, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1181493515211729, |
|
"grad_norm": 28.717784881591797, |
|
"learning_rate": 6.563245823389022e-06, |
|
"loss": 27.3328, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.12889020165946136, |
|
"grad_norm": 27.612842559814453, |
|
"learning_rate": 7.159904534606206e-06, |
|
"loss": 27.2348, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1396310517977498, |
|
"grad_norm": 24.70746612548828, |
|
"learning_rate": 7.75656324582339e-06, |
|
"loss": 26.9607, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.15037190193603825, |
|
"grad_norm": 25.603246688842773, |
|
"learning_rate": 8.353221957040574e-06, |
|
"loss": 27.0255, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.16111275207432668, |
|
"grad_norm": 24.41112518310547, |
|
"learning_rate": 8.949880668257757e-06, |
|
"loss": 26.7082, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.17185360221261514, |
|
"grad_norm": 45.70518112182617, |
|
"learning_rate": 9.54653937947494e-06, |
|
"loss": 26.8, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.18259445235090357, |
|
"grad_norm": 28.919174194335938, |
|
"learning_rate": 9.99999515916765e-06, |
|
"loss": 26.6221, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.193335302489192, |
|
"grad_norm": 35.64968490600586, |
|
"learning_rate": 9.999870777205538e-06, |
|
"loss": 26.4584, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.20407615262748047, |
|
"grad_norm": 28.937225341796875, |
|
"learning_rate": 9.999578315574637e-06, |
|
"loss": 26.185, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.2148170027657689, |
|
"grad_norm": 31.565160751342773, |
|
"learning_rate": 9.999117784106572e-06, |
|
"loss": 26.1142, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22555785290405736, |
|
"grad_norm": 31.458599090576172, |
|
"learning_rate": 9.998489198282936e-06, |
|
"loss": 25.9228, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2362987030423458, |
|
"grad_norm": 44.381195068359375, |
|
"learning_rate": 9.997692579234778e-06, |
|
"loss": 25.9126, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.24703955318063425, |
|
"grad_norm": 30.515155792236328, |
|
"learning_rate": 9.996727953741881e-06, |
|
"loss": 25.9804, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2577804033189227, |
|
"grad_norm": 36.5507926940918, |
|
"learning_rate": 9.995595354231868e-06, |
|
"loss": 25.671, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2685212534572111, |
|
"grad_norm": 42.24934005737305, |
|
"learning_rate": 9.994294818779118e-06, |
|
"loss": 25.5169, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2792621035954996, |
|
"grad_norm": 52.902706146240234, |
|
"learning_rate": 9.992826391103472e-06, |
|
"loss": 25.6281, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.29000295373378804, |
|
"grad_norm": 40.59647750854492, |
|
"learning_rate": 9.991190120568773e-06, |
|
"loss": 25.2706, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3007438038720765, |
|
"grad_norm": 36.96504592895508, |
|
"learning_rate": 9.989386062181205e-06, |
|
"loss": 25.2991, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3114846540103649, |
|
"grad_norm": 43.379581451416016, |
|
"learning_rate": 9.987414276587442e-06, |
|
"loss": 25.2432, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.32222550414865336, |
|
"grad_norm": 49.82856750488281, |
|
"learning_rate": 9.985274830072611e-06, |
|
"loss": 25.1967, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3329663542869418, |
|
"grad_norm": 46.151126861572266, |
|
"learning_rate": 9.982967794558066e-06, |
|
"loss": 25.0327, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3437072044252303, |
|
"grad_norm": 48.05283737182617, |
|
"learning_rate": 9.980493247598958e-06, |
|
"loss": 25.0835, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3544480545635187, |
|
"grad_norm": 63.88609313964844, |
|
"learning_rate": 9.977851272381651e-06, |
|
"loss": 24.8268, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.36518890470180715, |
|
"grad_norm": 45.50059509277344, |
|
"learning_rate": 9.9750419577209e-06, |
|
"loss": 25.0351, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3759297548400956, |
|
"grad_norm": 48.05316925048828, |
|
"learning_rate": 9.972065398056882e-06, |
|
"loss": 24.7551, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.386670604978384, |
|
"grad_norm": 52.77336883544922, |
|
"learning_rate": 9.968921693452016e-06, |
|
"loss": 24.8721, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3974114551166725, |
|
"grad_norm": 42.8079719543457, |
|
"learning_rate": 9.9656109495876e-06, |
|
"loss": 24.5587, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.40815230525496093, |
|
"grad_norm": 51.13129806518555, |
|
"learning_rate": 9.962133277760251e-06, |
|
"loss": 24.5215, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4188931553932494, |
|
"grad_norm": 48.29108428955078, |
|
"learning_rate": 9.958488794878185e-06, |
|
"loss": 24.6419, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.4296340055315378, |
|
"grad_norm": 52.165771484375, |
|
"learning_rate": 9.954677623457258e-06, |
|
"loss": 24.5354, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.44037485566982626, |
|
"grad_norm": 49.00601577758789, |
|
"learning_rate": 9.950699891616867e-06, |
|
"loss": 24.5425, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.4511157058081147, |
|
"grad_norm": 47.95719909667969, |
|
"learning_rate": 9.946555733075641e-06, |
|
"loss": 24.3845, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4618565559464032, |
|
"grad_norm": 48.98214340209961, |
|
"learning_rate": 9.942245287146938e-06, |
|
"loss": 24.4838, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.4725974060846916, |
|
"grad_norm": 55.21929931640625, |
|
"learning_rate": 9.937768698734169e-06, |
|
"loss": 24.1993, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.48333825622298004, |
|
"grad_norm": 50.942928314208984, |
|
"learning_rate": 9.93312611832592e-06, |
|
"loss": 24.2968, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4940791063612685, |
|
"grad_norm": 51.973846435546875, |
|
"learning_rate": 9.928317701990902e-06, |
|
"loss": 24.1991, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5048199564995569, |
|
"grad_norm": 50.56647872924805, |
|
"learning_rate": 9.923343611372696e-06, |
|
"loss": 24.0901, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5155608066378454, |
|
"grad_norm": 49.63374710083008, |
|
"learning_rate": 9.918204013684327e-06, |
|
"loss": 23.879, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5263016567761338, |
|
"grad_norm": 65.09294891357422, |
|
"learning_rate": 9.912899081702633e-06, |
|
"loss": 24.0486, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.5370425069144222, |
|
"grad_norm": 53.957706451416016, |
|
"learning_rate": 9.907428993762467e-06, |
|
"loss": 24.2522, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5477833570527108, |
|
"grad_norm": 52.10214614868164, |
|
"learning_rate": 9.901793933750695e-06, |
|
"loss": 24.1163, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5585242071909992, |
|
"grad_norm": 48.26007080078125, |
|
"learning_rate": 9.895994091100016e-06, |
|
"loss": 24.0287, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5692650573292876, |
|
"grad_norm": 64.58251190185547, |
|
"learning_rate": 9.890029660782596e-06, |
|
"loss": 24.0556, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5800059074675761, |
|
"grad_norm": 51.29060745239258, |
|
"learning_rate": 9.883900843303512e-06, |
|
"loss": 23.9603, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5907467576058645, |
|
"grad_norm": 50.7767448425293, |
|
"learning_rate": 9.87760784469401e-06, |
|
"loss": 24.067, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.601487607744153, |
|
"grad_norm": 49.88545608520508, |
|
"learning_rate": 9.871150876504582e-06, |
|
"loss": 23.7071, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6122284578824414, |
|
"grad_norm": 50.70675277709961, |
|
"learning_rate": 9.864530155797854e-06, |
|
"loss": 23.7861, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.6229693080207298, |
|
"grad_norm": 71.06696319580078, |
|
"learning_rate": 9.857745905141285e-06, |
|
"loss": 24.0092, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.6337101581590183, |
|
"grad_norm": 60.42924880981445, |
|
"learning_rate": 9.85079835259969e-06, |
|
"loss": 24.0478, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.6444510082973067, |
|
"grad_norm": 70.92335510253906, |
|
"learning_rate": 9.843687731727573e-06, |
|
"loss": 23.9587, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6551918584355951, |
|
"grad_norm": 46.90457534790039, |
|
"learning_rate": 9.836414281561273e-06, |
|
"loss": 23.7579, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.6659327085738836, |
|
"grad_norm": 72.55790710449219, |
|
"learning_rate": 9.828978246610924e-06, |
|
"loss": 23.883, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.676673558712172, |
|
"grad_norm": 69.81002044677734, |
|
"learning_rate": 9.821379876852246e-06, |
|
"loss": 23.6659, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.6874144088504606, |
|
"grad_norm": 64.69695281982422, |
|
"learning_rate": 9.813619427718139e-06, |
|
"loss": 23.5357, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.698155258988749, |
|
"grad_norm": 56.05459213256836, |
|
"learning_rate": 9.805697160090084e-06, |
|
"loss": 23.6215, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.7088961091270374, |
|
"grad_norm": 49.11384582519531, |
|
"learning_rate": 9.797613340289391e-06, |
|
"loss": 23.5262, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7196369592653259, |
|
"grad_norm": 53.399513244628906, |
|
"learning_rate": 9.789368240068233e-06, |
|
"loss": 23.5788, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.7303778094036143, |
|
"grad_norm": 53.13779067993164, |
|
"learning_rate": 9.780962136600518e-06, |
|
"loss": 23.6689, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.7411186595419027, |
|
"grad_norm": 54.35108947753906, |
|
"learning_rate": 9.772395312472565e-06, |
|
"loss": 23.4544, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.7518595096801912, |
|
"grad_norm": 64.02580261230469, |
|
"learning_rate": 9.763668055673609e-06, |
|
"loss": 23.7735, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7626003598184796, |
|
"grad_norm": 74.87864685058594, |
|
"learning_rate": 9.75478065958612e-06, |
|
"loss": 23.4972, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.773341209956768, |
|
"grad_norm": 59.911354064941406, |
|
"learning_rate": 9.745733422975939e-06, |
|
"loss": 23.4717, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7840820600950565, |
|
"grad_norm": 48.266685485839844, |
|
"learning_rate": 9.73652664998223e-06, |
|
"loss": 23.4604, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.794822910233345, |
|
"grad_norm": 69.15245056152344, |
|
"learning_rate": 9.727160650107263e-06, |
|
"loss": 23.4732, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.8055637603716335, |
|
"grad_norm": 55.05851745605469, |
|
"learning_rate": 9.717635738206007e-06, |
|
"loss": 23.4482, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.8163046105099219, |
|
"grad_norm": 50.051841735839844, |
|
"learning_rate": 9.707952234475548e-06, |
|
"loss": 23.6229, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.8270454606482103, |
|
"grad_norm": 56.18715286254883, |
|
"learning_rate": 9.698110464444316e-06, |
|
"loss": 23.5294, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.8377863107864988, |
|
"grad_norm": 58.675270080566406, |
|
"learning_rate": 9.688110758961152e-06, |
|
"loss": 23.3871, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.8485271609247872, |
|
"grad_norm": 61.508358001708984, |
|
"learning_rate": 9.677953454184182e-06, |
|
"loss": 23.4014, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.8592680110630756, |
|
"grad_norm": 48.32643508911133, |
|
"learning_rate": 9.66763889156951e-06, |
|
"loss": 23.5284, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.8700088612013641, |
|
"grad_norm": 47.48092269897461, |
|
"learning_rate": 9.65716741785976e-06, |
|
"loss": 23.4871, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.8807497113396525, |
|
"grad_norm": 54.80436325073242, |
|
"learning_rate": 9.646539385072389e-06, |
|
"loss": 23.3196, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8914905614779409, |
|
"grad_norm": 71.13292694091797, |
|
"learning_rate": 9.63575515048788e-06, |
|
"loss": 23.3575, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.9022314116162294, |
|
"grad_norm": 64.09466552734375, |
|
"learning_rate": 9.624815076637717e-06, |
|
"loss": 23.3654, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.9129722617545178, |
|
"grad_norm": 60.569332122802734, |
|
"learning_rate": 9.613719531292203e-06, |
|
"loss": 23.0099, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.9237131118928064, |
|
"grad_norm": 56.20964050292969, |
|
"learning_rate": 9.602468887448095e-06, |
|
"loss": 23.1344, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.9344539620310948, |
|
"grad_norm": 57.44073486328125, |
|
"learning_rate": 9.591063523316066e-06, |
|
"loss": 23.0187, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.9451948121693832, |
|
"grad_norm": 55.36775207519531, |
|
"learning_rate": 9.579503822307991e-06, |
|
"loss": 23.0668, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.9559356623076717, |
|
"grad_norm": 56.757423400878906, |
|
"learning_rate": 9.567790173024057e-06, |
|
"loss": 23.5033, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.9666765124459601, |
|
"grad_norm": 65.57225799560547, |
|
"learning_rate": 9.555922969239694e-06, |
|
"loss": 23.2055, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9774173625842485, |
|
"grad_norm": 62.57088088989258, |
|
"learning_rate": 9.543902609892358e-06, |
|
"loss": 23.4044, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.988158212722537, |
|
"grad_norm": 48.910579681396484, |
|
"learning_rate": 9.531729499068089e-06, |
|
"loss": 23.1226, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9988990628608254, |
|
"grad_norm": 51.13067626953125, |
|
"learning_rate": 9.519404045987953e-06, |
|
"loss": 23.1075, |
|
"step": 4650 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 27930, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 1.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|