|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.6666666666666666, |
|
"eval_steps": 106, |
|
"global_step": 702, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000949667616334283, |
|
"eval_loss": 1.59001886844635, |
|
"eval_runtime": 241.2886, |
|
"eval_samples_per_second": 4.849, |
|
"eval_steps_per_second": 2.424, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00949667616334283, |
|
"grad_norm": 1.056609034538269, |
|
"learning_rate": 4e-08, |
|
"loss": 1.4438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01899335232668566, |
|
"grad_norm": 0.9450796246528625, |
|
"learning_rate": 8e-08, |
|
"loss": 1.49, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02849002849002849, |
|
"grad_norm": 0.991963803768158, |
|
"learning_rate": 1.2e-07, |
|
"loss": 1.488, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03798670465337132, |
|
"grad_norm": 0.8748257160186768, |
|
"learning_rate": 1.6e-07, |
|
"loss": 1.4817, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04748338081671415, |
|
"grad_norm": 0.9113293290138245, |
|
"learning_rate": 2e-07, |
|
"loss": 1.4838, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05698005698005698, |
|
"grad_norm": 0.9806835651397705, |
|
"learning_rate": 2.4e-07, |
|
"loss": 1.4866, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06647673314339982, |
|
"grad_norm": 0.8570646643638611, |
|
"learning_rate": 2.8e-07, |
|
"loss": 1.4502, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07597340930674264, |
|
"grad_norm": 0.8111629486083984, |
|
"learning_rate": 3.2e-07, |
|
"loss": 1.4681, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08547008547008547, |
|
"grad_norm": 0.7242903709411621, |
|
"learning_rate": 3.6e-07, |
|
"loss": 1.5103, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0949667616334283, |
|
"grad_norm": 0.9140864014625549, |
|
"learning_rate": 4e-07, |
|
"loss": 1.4756, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.100664767331434, |
|
"eval_loss": 1.588047742843628, |
|
"eval_runtime": 241.8249, |
|
"eval_samples_per_second": 4.838, |
|
"eval_steps_per_second": 2.419, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.10446343779677113, |
|
"grad_norm": 0.766789436340332, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 1.4455, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11396011396011396, |
|
"grad_norm": 1.0300992727279663, |
|
"learning_rate": 4.8e-07, |
|
"loss": 1.4651, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.8080026507377625, |
|
"learning_rate": 5.2e-07, |
|
"loss": 1.5405, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.13295346628679963, |
|
"grad_norm": 0.8535659909248352, |
|
"learning_rate": 5.6e-07, |
|
"loss": 1.5043, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14245014245014245, |
|
"grad_norm": 1.1608635187149048, |
|
"learning_rate": 6e-07, |
|
"loss": 1.4763, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15194681861348527, |
|
"grad_norm": 1.0252463817596436, |
|
"learning_rate": 6.4e-07, |
|
"loss": 1.4542, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.16144349477682812, |
|
"grad_norm": 1.242639183998108, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 1.4663, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17094017094017094, |
|
"grad_norm": 0.9343454241752625, |
|
"learning_rate": 7.2e-07, |
|
"loss": 1.4858, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.18043684710351376, |
|
"grad_norm": 1.0508811473846436, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 1.4917, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1899335232668566, |
|
"grad_norm": 1.4828695058822632, |
|
"learning_rate": 8e-07, |
|
"loss": 1.4477, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19943019943019943, |
|
"grad_norm": 1.3482277393341064, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 1.4972, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.201329534662868, |
|
"eval_loss": 1.5654927492141724, |
|
"eval_runtime": 241.4855, |
|
"eval_samples_per_second": 4.845, |
|
"eval_steps_per_second": 2.423, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.20892687559354226, |
|
"grad_norm": 0.8324021697044373, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 1.4831, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2184235517568851, |
|
"grad_norm": 0.8362078070640564, |
|
"learning_rate": 9.2e-07, |
|
"loss": 1.4395, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.22792022792022792, |
|
"grad_norm": 0.9072378873825073, |
|
"learning_rate": 9.6e-07, |
|
"loss": 1.4402, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.23741690408357075, |
|
"grad_norm": 0.9287540912628174, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4659, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.8501766920089722, |
|
"learning_rate": 1.04e-06, |
|
"loss": 1.4708, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 0.5359634160995483, |
|
"learning_rate": 1.08e-06, |
|
"loss": 1.4748, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.26590693257359926, |
|
"grad_norm": 0.6378765106201172, |
|
"learning_rate": 1.12e-06, |
|
"loss": 1.4286, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2754036087369421, |
|
"grad_norm": 0.6789595484733582, |
|
"learning_rate": 1.16e-06, |
|
"loss": 1.4212, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2849002849002849, |
|
"grad_norm": 0.6995547413825989, |
|
"learning_rate": 1.2e-06, |
|
"loss": 1.4406, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2943969610636277, |
|
"grad_norm": 0.7261990904808044, |
|
"learning_rate": 1.24e-06, |
|
"loss": 1.4563, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.301994301994302, |
|
"eval_loss": 1.5017595291137695, |
|
"eval_runtime": 241.2449, |
|
"eval_samples_per_second": 4.85, |
|
"eval_steps_per_second": 2.425, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.30389363722697055, |
|
"grad_norm": 0.6617552042007446, |
|
"learning_rate": 1.28e-06, |
|
"loss": 1.3913, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.31339031339031337, |
|
"grad_norm": 0.5867168307304382, |
|
"learning_rate": 1.32e-06, |
|
"loss": 1.3892, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.32288698955365625, |
|
"grad_norm": 0.6030348539352417, |
|
"learning_rate": 1.3600000000000001e-06, |
|
"loss": 1.39, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.33238366571699907, |
|
"grad_norm": 0.5722991228103638, |
|
"learning_rate": 1.4e-06, |
|
"loss": 1.3744, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 0.5842142105102539, |
|
"learning_rate": 1.44e-06, |
|
"loss": 1.3968, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3513770180436847, |
|
"grad_norm": 0.8536161184310913, |
|
"learning_rate": 1.48e-06, |
|
"loss": 1.4192, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.36087369420702753, |
|
"grad_norm": 0.6988087296485901, |
|
"learning_rate": 1.5199999999999998e-06, |
|
"loss": 1.3769, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.5693365931510925, |
|
"learning_rate": 1.5599999999999999e-06, |
|
"loss": 1.3986, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3798670465337132, |
|
"grad_norm": 0.6898847818374634, |
|
"learning_rate": 1.6e-06, |
|
"loss": 1.3774, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38936372269705605, |
|
"grad_norm": 0.6398094892501831, |
|
"learning_rate": 1.6399999999999998e-06, |
|
"loss": 1.4069, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.39886039886039887, |
|
"grad_norm": 0.5179505944252014, |
|
"learning_rate": 1.6799999999999998e-06, |
|
"loss": 1.3754, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.402659069325736, |
|
"eval_loss": 1.4127401113510132, |
|
"eval_runtime": 242.04, |
|
"eval_samples_per_second": 4.834, |
|
"eval_steps_per_second": 2.417, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.4083570750237417, |
|
"grad_norm": 0.691733717918396, |
|
"learning_rate": 1.7199999999999998e-06, |
|
"loss": 1.3288, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4178537511870845, |
|
"grad_norm": 0.684332013130188, |
|
"learning_rate": 1.7599999999999999e-06, |
|
"loss": 1.3613, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.42735042735042733, |
|
"grad_norm": 0.7648876309394836, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.3232, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4368471035137702, |
|
"grad_norm": 0.8008069396018982, |
|
"learning_rate": 1.84e-06, |
|
"loss": 1.346, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.44634377967711303, |
|
"grad_norm": 0.7102649211883545, |
|
"learning_rate": 1.8799999999999998e-06, |
|
"loss": 1.325, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.45584045584045585, |
|
"grad_norm": 0.6222986578941345, |
|
"learning_rate": 1.92e-06, |
|
"loss": 1.2948, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.46533713200379867, |
|
"grad_norm": 0.8117070198059082, |
|
"learning_rate": 1.96e-06, |
|
"loss": 1.3221, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4748338081671415, |
|
"grad_norm": 0.7963566780090332, |
|
"learning_rate": 2e-06, |
|
"loss": 1.2908, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4843304843304843, |
|
"grad_norm": 0.7263100147247314, |
|
"learning_rate": 1.998386746257178e-06, |
|
"loss": 1.3097, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 1.0555329322814941, |
|
"learning_rate": 1.993552190203991e-06, |
|
"loss": 1.2671, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"grad_norm": 0.9024184346199036, |
|
"learning_rate": 1.985511930571733e-06, |
|
"loss": 1.271, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"eval_loss": 1.3055741786956787, |
|
"eval_runtime": 241.4921, |
|
"eval_samples_per_second": 4.845, |
|
"eval_steps_per_second": 2.422, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 0.9171520471572876, |
|
"learning_rate": 1.9742919093182947e-06, |
|
"loss": 1.3002, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5223171889838556, |
|
"grad_norm": 1.1000499725341797, |
|
"learning_rate": 1.959928327926239e-06, |
|
"loss": 1.2513, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5318138651471985, |
|
"grad_norm": 0.9843473434448242, |
|
"learning_rate": 1.942467530598449e-06, |
|
"loss": 1.2669, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5413105413105413, |
|
"grad_norm": 0.9255275726318359, |
|
"learning_rate": 1.9219658547282065e-06, |
|
"loss": 1.2479, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5508072174738842, |
|
"grad_norm": 0.9032674431800842, |
|
"learning_rate": 1.8984894491261762e-06, |
|
"loss": 1.2133, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.560303893637227, |
|
"grad_norm": 1.0691276788711548, |
|
"learning_rate": 1.872114060590769e-06, |
|
"loss": 1.185, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5698005698005698, |
|
"grad_norm": 1.0371201038360596, |
|
"learning_rate": 1.842924789510531e-06, |
|
"loss": 1.2179, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5792972459639126, |
|
"grad_norm": 1.1502381563186646, |
|
"learning_rate": 1.8110158152871029e-06, |
|
"loss": 1.2306, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5887939221272555, |
|
"grad_norm": 0.9843053817749023, |
|
"learning_rate": 1.776490092464676e-06, |
|
"loss": 1.1823, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5982905982905983, |
|
"grad_norm": 1.2571120262145996, |
|
"learning_rate": 1.7394590185463837e-06, |
|
"loss": 1.2054, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.603988603988604, |
|
"eval_loss": 1.200851559638977, |
|
"eval_runtime": 241.2048, |
|
"eval_samples_per_second": 4.851, |
|
"eval_steps_per_second": 2.425, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6077872744539411, |
|
"grad_norm": 1.0591609477996826, |
|
"learning_rate": 1.7000420745694253e-06, |
|
"loss": 1.1656, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 1.0051286220550537, |
|
"learning_rate": 1.6583664395986035e-06, |
|
"loss": 1.1572, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6267806267806267, |
|
"grad_norm": 1.3040874004364014, |
|
"learning_rate": 1.61456658038212e-06, |
|
"loss": 1.1662, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6362773029439696, |
|
"grad_norm": 1.066710352897644, |
|
"learning_rate": 1.5687838174936082e-06, |
|
"loss": 1.1534, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6457739791073125, |
|
"grad_norm": 1.009498953819275, |
|
"learning_rate": 1.5211658693602396e-06, |
|
"loss": 1.1528, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6552706552706553, |
|
"grad_norm": 1.3166037797927856, |
|
"learning_rate": 1.471866375648119e-06, |
|
"loss": 1.1289, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6647673314339981, |
|
"grad_norm": 1.092807412147522, |
|
"learning_rate": 1.4210444015427466e-06, |
|
"loss": 1.1494, |
|
"step": 700 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1053, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 351, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2971277740408832e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|