MEG-Llama-3.1-8B-Instruct / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999731478746543,
"eval_steps": 500,
"global_step": 4655,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010740850138288446,
"grad_norm": 144.04261779785156,
"learning_rate": 5.966587112171838e-07,
"loss": 55.4357,
"step": 50
},
{
"epoch": 0.021481700276576893,
"grad_norm": 98.8813247680664,
"learning_rate": 1.1933174224343676e-06,
"loss": 49.1738,
"step": 100
},
{
"epoch": 0.03222255041486534,
"grad_norm": 35.908973693847656,
"learning_rate": 1.7899761336515514e-06,
"loss": 35.5472,
"step": 150
},
{
"epoch": 0.042963400553153785,
"grad_norm": 25.380966186523438,
"learning_rate": 2.386634844868735e-06,
"loss": 30.9308,
"step": 200
},
{
"epoch": 0.053704250691442225,
"grad_norm": 20.454557418823242,
"learning_rate": 2.983293556085919e-06,
"loss": 29.5101,
"step": 250
},
{
"epoch": 0.06444510082973068,
"grad_norm": 23.909564971923828,
"learning_rate": 3.579952267303103e-06,
"loss": 29.3489,
"step": 300
},
{
"epoch": 0.07518595096801912,
"grad_norm": 26.212711334228516,
"learning_rate": 4.176610978520287e-06,
"loss": 28.3586,
"step": 350
},
{
"epoch": 0.08592680110630757,
"grad_norm": 23.899639129638672,
"learning_rate": 4.77326968973747e-06,
"loss": 27.9249,
"step": 400
},
{
"epoch": 0.096667651244596,
"grad_norm": 30.17796516418457,
"learning_rate": 5.369928400954655e-06,
"loss": 27.7261,
"step": 450
},
{
"epoch": 0.10740850138288445,
"grad_norm": 24.356124877929688,
"learning_rate": 5.966587112171838e-06,
"loss": 27.656,
"step": 500
},
{
"epoch": 0.1181493515211729,
"grad_norm": 28.717784881591797,
"learning_rate": 6.563245823389022e-06,
"loss": 27.3328,
"step": 550
},
{
"epoch": 0.12889020165946136,
"grad_norm": 27.612842559814453,
"learning_rate": 7.159904534606206e-06,
"loss": 27.2348,
"step": 600
},
{
"epoch": 0.1396310517977498,
"grad_norm": 24.70746612548828,
"learning_rate": 7.75656324582339e-06,
"loss": 26.9607,
"step": 650
},
{
"epoch": 0.15037190193603825,
"grad_norm": 25.603246688842773,
"learning_rate": 8.353221957040574e-06,
"loss": 27.0255,
"step": 700
},
{
"epoch": 0.16111275207432668,
"grad_norm": 24.41112518310547,
"learning_rate": 8.949880668257757e-06,
"loss": 26.7082,
"step": 750
},
{
"epoch": 0.17185360221261514,
"grad_norm": 45.70518112182617,
"learning_rate": 9.54653937947494e-06,
"loss": 26.8,
"step": 800
},
{
"epoch": 0.18259445235090357,
"grad_norm": 28.919174194335938,
"learning_rate": 9.99999515916765e-06,
"loss": 26.6221,
"step": 850
},
{
"epoch": 0.193335302489192,
"grad_norm": 35.64968490600586,
"learning_rate": 9.999870777205538e-06,
"loss": 26.4584,
"step": 900
},
{
"epoch": 0.20407615262748047,
"grad_norm": 28.937225341796875,
"learning_rate": 9.999578315574637e-06,
"loss": 26.185,
"step": 950
},
{
"epoch": 0.2148170027657689,
"grad_norm": 31.565160751342773,
"learning_rate": 9.999117784106572e-06,
"loss": 26.1142,
"step": 1000
},
{
"epoch": 0.22555785290405736,
"grad_norm": 31.458599090576172,
"learning_rate": 9.998489198282936e-06,
"loss": 25.9228,
"step": 1050
},
{
"epoch": 0.2362987030423458,
"grad_norm": 44.381195068359375,
"learning_rate": 9.997692579234778e-06,
"loss": 25.9126,
"step": 1100
},
{
"epoch": 0.24703955318063425,
"grad_norm": 30.515155792236328,
"learning_rate": 9.996727953741881e-06,
"loss": 25.9804,
"step": 1150
},
{
"epoch": 0.2577804033189227,
"grad_norm": 36.5507926940918,
"learning_rate": 9.995595354231868e-06,
"loss": 25.671,
"step": 1200
},
{
"epoch": 0.2685212534572111,
"grad_norm": 42.24934005737305,
"learning_rate": 9.994294818779118e-06,
"loss": 25.5169,
"step": 1250
},
{
"epoch": 0.2792621035954996,
"grad_norm": 52.902706146240234,
"learning_rate": 9.992826391103472e-06,
"loss": 25.6281,
"step": 1300
},
{
"epoch": 0.29000295373378804,
"grad_norm": 40.59647750854492,
"learning_rate": 9.991190120568773e-06,
"loss": 25.2706,
"step": 1350
},
{
"epoch": 0.3007438038720765,
"grad_norm": 36.96504592895508,
"learning_rate": 9.989386062181205e-06,
"loss": 25.2991,
"step": 1400
},
{
"epoch": 0.3114846540103649,
"grad_norm": 43.379581451416016,
"learning_rate": 9.987414276587442e-06,
"loss": 25.2432,
"step": 1450
},
{
"epoch": 0.32222550414865336,
"grad_norm": 49.82856750488281,
"learning_rate": 9.985274830072611e-06,
"loss": 25.1967,
"step": 1500
},
{
"epoch": 0.3329663542869418,
"grad_norm": 46.151126861572266,
"learning_rate": 9.982967794558066e-06,
"loss": 25.0327,
"step": 1550
},
{
"epoch": 0.3437072044252303,
"grad_norm": 48.05283737182617,
"learning_rate": 9.980493247598958e-06,
"loss": 25.0835,
"step": 1600
},
{
"epoch": 0.3544480545635187,
"grad_norm": 63.88609313964844,
"learning_rate": 9.977851272381651e-06,
"loss": 24.8268,
"step": 1650
},
{
"epoch": 0.36518890470180715,
"grad_norm": 45.50059509277344,
"learning_rate": 9.9750419577209e-06,
"loss": 25.0351,
"step": 1700
},
{
"epoch": 0.3759297548400956,
"grad_norm": 48.05316925048828,
"learning_rate": 9.972065398056882e-06,
"loss": 24.7551,
"step": 1750
},
{
"epoch": 0.386670604978384,
"grad_norm": 52.77336883544922,
"learning_rate": 9.968921693452016e-06,
"loss": 24.8721,
"step": 1800
},
{
"epoch": 0.3974114551166725,
"grad_norm": 42.8079719543457,
"learning_rate": 9.9656109495876e-06,
"loss": 24.5587,
"step": 1850
},
{
"epoch": 0.40815230525496093,
"grad_norm": 51.13129806518555,
"learning_rate": 9.962133277760251e-06,
"loss": 24.5215,
"step": 1900
},
{
"epoch": 0.4188931553932494,
"grad_norm": 48.29108428955078,
"learning_rate": 9.958488794878185e-06,
"loss": 24.6419,
"step": 1950
},
{
"epoch": 0.4296340055315378,
"grad_norm": 52.165771484375,
"learning_rate": 9.954677623457258e-06,
"loss": 24.5354,
"step": 2000
},
{
"epoch": 0.44037485566982626,
"grad_norm": 49.00601577758789,
"learning_rate": 9.950699891616867e-06,
"loss": 24.5425,
"step": 2050
},
{
"epoch": 0.4511157058081147,
"grad_norm": 47.95719909667969,
"learning_rate": 9.946555733075641e-06,
"loss": 24.3845,
"step": 2100
},
{
"epoch": 0.4618565559464032,
"grad_norm": 48.98214340209961,
"learning_rate": 9.942245287146938e-06,
"loss": 24.4838,
"step": 2150
},
{
"epoch": 0.4725974060846916,
"grad_norm": 55.21929931640625,
"learning_rate": 9.937768698734169e-06,
"loss": 24.1993,
"step": 2200
},
{
"epoch": 0.48333825622298004,
"grad_norm": 50.942928314208984,
"learning_rate": 9.93312611832592e-06,
"loss": 24.2968,
"step": 2250
},
{
"epoch": 0.4940791063612685,
"grad_norm": 51.973846435546875,
"learning_rate": 9.928317701990902e-06,
"loss": 24.1991,
"step": 2300
},
{
"epoch": 0.5048199564995569,
"grad_norm": 50.56647872924805,
"learning_rate": 9.923343611372696e-06,
"loss": 24.0901,
"step": 2350
},
{
"epoch": 0.5155608066378454,
"grad_norm": 49.63374710083008,
"learning_rate": 9.918204013684327e-06,
"loss": 23.879,
"step": 2400
},
{
"epoch": 0.5263016567761338,
"grad_norm": 65.09294891357422,
"learning_rate": 9.912899081702633e-06,
"loss": 24.0486,
"step": 2450
},
{
"epoch": 0.5370425069144222,
"grad_norm": 53.957706451416016,
"learning_rate": 9.907428993762467e-06,
"loss": 24.2522,
"step": 2500
},
{
"epoch": 0.5477833570527108,
"grad_norm": 52.10214614868164,
"learning_rate": 9.901793933750695e-06,
"loss": 24.1163,
"step": 2550
},
{
"epoch": 0.5585242071909992,
"grad_norm": 48.26007080078125,
"learning_rate": 9.895994091100016e-06,
"loss": 24.0287,
"step": 2600
},
{
"epoch": 0.5692650573292876,
"grad_norm": 64.58251190185547,
"learning_rate": 9.890029660782596e-06,
"loss": 24.0556,
"step": 2650
},
{
"epoch": 0.5800059074675761,
"grad_norm": 51.29060745239258,
"learning_rate": 9.883900843303512e-06,
"loss": 23.9603,
"step": 2700
},
{
"epoch": 0.5907467576058645,
"grad_norm": 50.7767448425293,
"learning_rate": 9.87760784469401e-06,
"loss": 24.067,
"step": 2750
},
{
"epoch": 0.601487607744153,
"grad_norm": 49.88545608520508,
"learning_rate": 9.871150876504582e-06,
"loss": 23.7071,
"step": 2800
},
{
"epoch": 0.6122284578824414,
"grad_norm": 50.70675277709961,
"learning_rate": 9.864530155797854e-06,
"loss": 23.7861,
"step": 2850
},
{
"epoch": 0.6229693080207298,
"grad_norm": 71.06696319580078,
"learning_rate": 9.857745905141285e-06,
"loss": 24.0092,
"step": 2900
},
{
"epoch": 0.6337101581590183,
"grad_norm": 60.42924880981445,
"learning_rate": 9.85079835259969e-06,
"loss": 24.0478,
"step": 2950
},
{
"epoch": 0.6444510082973067,
"grad_norm": 70.92335510253906,
"learning_rate": 9.843687731727573e-06,
"loss": 23.9587,
"step": 3000
},
{
"epoch": 0.6551918584355951,
"grad_norm": 46.90457534790039,
"learning_rate": 9.836414281561273e-06,
"loss": 23.7579,
"step": 3050
},
{
"epoch": 0.6659327085738836,
"grad_norm": 72.55790710449219,
"learning_rate": 9.828978246610924e-06,
"loss": 23.883,
"step": 3100
},
{
"epoch": 0.676673558712172,
"grad_norm": 69.81002044677734,
"learning_rate": 9.821379876852246e-06,
"loss": 23.6659,
"step": 3150
},
{
"epoch": 0.6874144088504606,
"grad_norm": 64.69695281982422,
"learning_rate": 9.813619427718139e-06,
"loss": 23.5357,
"step": 3200
},
{
"epoch": 0.698155258988749,
"grad_norm": 56.05459213256836,
"learning_rate": 9.805697160090084e-06,
"loss": 23.6215,
"step": 3250
},
{
"epoch": 0.7088961091270374,
"grad_norm": 49.11384582519531,
"learning_rate": 9.797613340289391e-06,
"loss": 23.5262,
"step": 3300
},
{
"epoch": 0.7196369592653259,
"grad_norm": 53.399513244628906,
"learning_rate": 9.789368240068233e-06,
"loss": 23.5788,
"step": 3350
},
{
"epoch": 0.7303778094036143,
"grad_norm": 53.13779067993164,
"learning_rate": 9.780962136600518e-06,
"loss": 23.6689,
"step": 3400
},
{
"epoch": 0.7411186595419027,
"grad_norm": 54.35108947753906,
"learning_rate": 9.772395312472565e-06,
"loss": 23.4544,
"step": 3450
},
{
"epoch": 0.7518595096801912,
"grad_norm": 64.02580261230469,
"learning_rate": 9.763668055673609e-06,
"loss": 23.7735,
"step": 3500
},
{
"epoch": 0.7626003598184796,
"grad_norm": 74.87864685058594,
"learning_rate": 9.75478065958612e-06,
"loss": 23.4972,
"step": 3550
},
{
"epoch": 0.773341209956768,
"grad_norm": 59.911354064941406,
"learning_rate": 9.745733422975939e-06,
"loss": 23.4717,
"step": 3600
},
{
"epoch": 0.7840820600950565,
"grad_norm": 48.266685485839844,
"learning_rate": 9.73652664998223e-06,
"loss": 23.4604,
"step": 3650
},
{
"epoch": 0.794822910233345,
"grad_norm": 69.15245056152344,
"learning_rate": 9.727160650107263e-06,
"loss": 23.4732,
"step": 3700
},
{
"epoch": 0.8055637603716335,
"grad_norm": 55.05851745605469,
"learning_rate": 9.717635738206007e-06,
"loss": 23.4482,
"step": 3750
},
{
"epoch": 0.8163046105099219,
"grad_norm": 50.051841735839844,
"learning_rate": 9.707952234475548e-06,
"loss": 23.6229,
"step": 3800
},
{
"epoch": 0.8270454606482103,
"grad_norm": 56.18715286254883,
"learning_rate": 9.698110464444316e-06,
"loss": 23.5294,
"step": 3850
},
{
"epoch": 0.8377863107864988,
"grad_norm": 58.675270080566406,
"learning_rate": 9.688110758961152e-06,
"loss": 23.3871,
"step": 3900
},
{
"epoch": 0.8485271609247872,
"grad_norm": 61.508358001708984,
"learning_rate": 9.677953454184182e-06,
"loss": 23.4014,
"step": 3950
},
{
"epoch": 0.8592680110630756,
"grad_norm": 48.32643508911133,
"learning_rate": 9.66763889156951e-06,
"loss": 23.5284,
"step": 4000
},
{
"epoch": 0.8700088612013641,
"grad_norm": 47.48092269897461,
"learning_rate": 9.65716741785976e-06,
"loss": 23.4871,
"step": 4050
},
{
"epoch": 0.8807497113396525,
"grad_norm": 54.80436325073242,
"learning_rate": 9.646539385072389e-06,
"loss": 23.3196,
"step": 4100
},
{
"epoch": 0.8914905614779409,
"grad_norm": 71.13292694091797,
"learning_rate": 9.63575515048788e-06,
"loss": 23.3575,
"step": 4150
},
{
"epoch": 0.9022314116162294,
"grad_norm": 64.09466552734375,
"learning_rate": 9.624815076637717e-06,
"loss": 23.3654,
"step": 4200
},
{
"epoch": 0.9129722617545178,
"grad_norm": 60.569332122802734,
"learning_rate": 9.613719531292203e-06,
"loss": 23.0099,
"step": 4250
},
{
"epoch": 0.9237131118928064,
"grad_norm": 56.20964050292969,
"learning_rate": 9.602468887448095e-06,
"loss": 23.1344,
"step": 4300
},
{
"epoch": 0.9344539620310948,
"grad_norm": 57.44073486328125,
"learning_rate": 9.591063523316066e-06,
"loss": 23.0187,
"step": 4350
},
{
"epoch": 0.9451948121693832,
"grad_norm": 55.36775207519531,
"learning_rate": 9.579503822307991e-06,
"loss": 23.0668,
"step": 4400
},
{
"epoch": 0.9559356623076717,
"grad_norm": 56.757423400878906,
"learning_rate": 9.567790173024057e-06,
"loss": 23.5033,
"step": 4450
},
{
"epoch": 0.9666765124459601,
"grad_norm": 65.57225799560547,
"learning_rate": 9.555922969239694e-06,
"loss": 23.2055,
"step": 4500
},
{
"epoch": 0.9774173625842485,
"grad_norm": 62.57088088989258,
"learning_rate": 9.543902609892358e-06,
"loss": 23.4044,
"step": 4550
},
{
"epoch": 0.988158212722537,
"grad_norm": 48.910579681396484,
"learning_rate": 9.531729499068089e-06,
"loss": 23.1226,
"step": 4600
},
{
"epoch": 0.9988990628608254,
"grad_norm": 51.13067626953125,
"learning_rate": 9.519404045987953e-06,
"loss": 23.1075,
"step": 4650
}
],
"logging_steps": 50,
"max_steps": 27930,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
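
For reference, a minimal sketch of how the log_history above can be inspected, assuming the file is saved locally as trainer_state.json (the path is an assumption, not part of the upload); every key used below appears in the state dict itself.

# Minimal sketch (not part of the uploaded files): read trainer_state.json
# with the standard library and print one line per logged step.
import json

with open("trainer_state.json") as f:   # assumed local path
    state = json.load(f)

# Entries are logged every `logging_steps` (50) optimizer steps.
for entry in state["log_history"]:
    print(f'step {entry["step"]:>5}: loss={entry["loss"]:.4f}, '
          f'lr={entry["learning_rate"]:.3e}, grad_norm={entry["grad_norm"]:.2f}')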