|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 106, |
|
"global_step": 1053, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000949667616334283, |
|
"eval_loss": 1.59001886844635, |
|
"eval_runtime": 241.2886, |
|
"eval_samples_per_second": 4.849, |
|
"eval_steps_per_second": 2.424, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00949667616334283, |
|
"grad_norm": 1.056609034538269, |
|
"learning_rate": 4e-08, |
|
"loss": 1.4438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01899335232668566, |
|
"grad_norm": 0.9450796246528625, |
|
"learning_rate": 8e-08, |
|
"loss": 1.49, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02849002849002849, |
|
"grad_norm": 0.991963803768158, |
|
"learning_rate": 1.2e-07, |
|
"loss": 1.488, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03798670465337132, |
|
"grad_norm": 0.8748257160186768, |
|
"learning_rate": 1.6e-07, |
|
"loss": 1.4817, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04748338081671415, |
|
"grad_norm": 0.9113293290138245, |
|
"learning_rate": 2e-07, |
|
"loss": 1.4838, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05698005698005698, |
|
"grad_norm": 0.9806835651397705, |
|
"learning_rate": 2.4e-07, |
|
"loss": 1.4866, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06647673314339982, |
|
"grad_norm": 0.8570646643638611, |
|
"learning_rate": 2.8e-07, |
|
"loss": 1.4502, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07597340930674264, |
|
"grad_norm": 0.8111629486083984, |
|
"learning_rate": 3.2e-07, |
|
"loss": 1.4681, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08547008547008547, |
|
"grad_norm": 0.7242903709411621, |
|
"learning_rate": 3.6e-07, |
|
"loss": 1.5103, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0949667616334283, |
|
"grad_norm": 0.9140864014625549, |
|
"learning_rate": 4e-07, |
|
"loss": 1.4756, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.100664767331434, |
|
"eval_loss": 1.588047742843628, |
|
"eval_runtime": 241.8249, |
|
"eval_samples_per_second": 4.838, |
|
"eval_steps_per_second": 2.419, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.10446343779677113, |
|
"grad_norm": 0.766789436340332, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 1.4455, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11396011396011396, |
|
"grad_norm": 1.0300992727279663, |
|
"learning_rate": 4.8e-07, |
|
"loss": 1.4651, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.8080026507377625, |
|
"learning_rate": 5.2e-07, |
|
"loss": 1.5405, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.13295346628679963, |
|
"grad_norm": 0.8535659909248352, |
|
"learning_rate": 5.6e-07, |
|
"loss": 1.5043, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14245014245014245, |
|
"grad_norm": 1.1608635187149048, |
|
"learning_rate": 6e-07, |
|
"loss": 1.4763, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15194681861348527, |
|
"grad_norm": 1.0252463817596436, |
|
"learning_rate": 6.4e-07, |
|
"loss": 1.4542, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.16144349477682812, |
|
"grad_norm": 1.242639183998108, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 1.4663, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17094017094017094, |
|
"grad_norm": 0.9343454241752625, |
|
"learning_rate": 7.2e-07, |
|
"loss": 1.4858, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.18043684710351376, |
|
"grad_norm": 1.0508811473846436, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 1.4917, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1899335232668566, |
|
"grad_norm": 1.4828695058822632, |
|
"learning_rate": 8e-07, |
|
"loss": 1.4477, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19943019943019943, |
|
"grad_norm": 1.3482277393341064, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 1.4972, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.201329534662868, |
|
"eval_loss": 1.5654927492141724, |
|
"eval_runtime": 241.4855, |
|
"eval_samples_per_second": 4.845, |
|
"eval_steps_per_second": 2.423, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.20892687559354226, |
|
"grad_norm": 0.8324021697044373, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 1.4831, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2184235517568851, |
|
"grad_norm": 0.8362078070640564, |
|
"learning_rate": 9.2e-07, |
|
"loss": 1.4395, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.22792022792022792, |
|
"grad_norm": 0.9072378873825073, |
|
"learning_rate": 9.6e-07, |
|
"loss": 1.4402, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.23741690408357075, |
|
"grad_norm": 0.9287540912628174, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4659, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.8501766920089722, |
|
"learning_rate": 1.04e-06, |
|
"loss": 1.4708, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 0.5359634160995483, |
|
"learning_rate": 1.08e-06, |
|
"loss": 1.4748, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.26590693257359926, |
|
"grad_norm": 0.6378765106201172, |
|
"learning_rate": 1.12e-06, |
|
"loss": 1.4286, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2754036087369421, |
|
"grad_norm": 0.6789595484733582, |
|
"learning_rate": 1.16e-06, |
|
"loss": 1.4212, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2849002849002849, |
|
"grad_norm": 0.6995547413825989, |
|
"learning_rate": 1.2e-06, |
|
"loss": 1.4406, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2943969610636277, |
|
"grad_norm": 0.7261990904808044, |
|
"learning_rate": 1.24e-06, |
|
"loss": 1.4563, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.301994301994302, |
|
"eval_loss": 1.5017595291137695, |
|
"eval_runtime": 241.2449, |
|
"eval_samples_per_second": 4.85, |
|
"eval_steps_per_second": 2.425, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.30389363722697055, |
|
"grad_norm": 0.6617552042007446, |
|
"learning_rate": 1.28e-06, |
|
"loss": 1.3913, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.31339031339031337, |
|
"grad_norm": 0.5867168307304382, |
|
"learning_rate": 1.32e-06, |
|
"loss": 1.3892, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.32288698955365625, |
|
"grad_norm": 0.6030348539352417, |
|
"learning_rate": 1.3600000000000001e-06, |
|
"loss": 1.39, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.33238366571699907, |
|
"grad_norm": 0.5722991228103638, |
|
"learning_rate": 1.4e-06, |
|
"loss": 1.3744, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 0.5842142105102539, |
|
"learning_rate": 1.44e-06, |
|
"loss": 1.3968, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3513770180436847, |
|
"grad_norm": 0.8536161184310913, |
|
"learning_rate": 1.48e-06, |
|
"loss": 1.4192, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.36087369420702753, |
|
"grad_norm": 0.6988087296485901, |
|
"learning_rate": 1.5199999999999998e-06, |
|
"loss": 1.3769, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.5693365931510925, |
|
"learning_rate": 1.5599999999999999e-06, |
|
"loss": 1.3986, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3798670465337132, |
|
"grad_norm": 0.6898847818374634, |
|
"learning_rate": 1.6e-06, |
|
"loss": 1.3774, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38936372269705605, |
|
"grad_norm": 0.6398094892501831, |
|
"learning_rate": 1.6399999999999998e-06, |
|
"loss": 1.4069, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.39886039886039887, |
|
"grad_norm": 0.5179505944252014, |
|
"learning_rate": 1.6799999999999998e-06, |
|
"loss": 1.3754, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.402659069325736, |
|
"eval_loss": 1.4127401113510132, |
|
"eval_runtime": 242.04, |
|
"eval_samples_per_second": 4.834, |
|
"eval_steps_per_second": 2.417, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.4083570750237417, |
|
"grad_norm": 0.691733717918396, |
|
"learning_rate": 1.7199999999999998e-06, |
|
"loss": 1.3288, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4178537511870845, |
|
"grad_norm": 0.684332013130188, |
|
"learning_rate": 1.7599999999999999e-06, |
|
"loss": 1.3613, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.42735042735042733, |
|
"grad_norm": 0.7648876309394836, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.3232, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4368471035137702, |
|
"grad_norm": 0.8008069396018982, |
|
"learning_rate": 1.84e-06, |
|
"loss": 1.346, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.44634377967711303, |
|
"grad_norm": 0.7102649211883545, |
|
"learning_rate": 1.8799999999999998e-06, |
|
"loss": 1.325, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.45584045584045585, |
|
"grad_norm": 0.6222986578941345, |
|
"learning_rate": 1.92e-06, |
|
"loss": 1.2948, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.46533713200379867, |
|
"grad_norm": 0.8117070198059082, |
|
"learning_rate": 1.96e-06, |
|
"loss": 1.3221, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4748338081671415, |
|
"grad_norm": 0.7963566780090332, |
|
"learning_rate": 2e-06, |
|
"loss": 1.2908, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4843304843304843, |
|
"grad_norm": 0.7263100147247314, |
|
"learning_rate": 1.998386746257178e-06, |
|
"loss": 1.3097, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 1.0555329322814941, |
|
"learning_rate": 1.993552190203991e-06, |
|
"loss": 1.2671, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"grad_norm": 0.9024184346199036, |
|
"learning_rate": 1.985511930571733e-06, |
|
"loss": 1.271, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"eval_loss": 1.3055741786956787, |
|
"eval_runtime": 241.4921, |
|
"eval_samples_per_second": 4.845, |
|
"eval_steps_per_second": 2.422, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 0.9171520471572876, |
|
"learning_rate": 1.9742919093182947e-06, |
|
"loss": 1.3002, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5223171889838556, |
|
"grad_norm": 1.1000499725341797, |
|
"learning_rate": 1.959928327926239e-06, |
|
"loss": 1.2513, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5318138651471985, |
|
"grad_norm": 0.9843473434448242, |
|
"learning_rate": 1.942467530598449e-06, |
|
"loss": 1.2669, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5413105413105413, |
|
"grad_norm": 0.9255275726318359, |
|
"learning_rate": 1.9219658547282065e-06, |
|
"loss": 1.2479, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5508072174738842, |
|
"grad_norm": 0.9032674431800842, |
|
"learning_rate": 1.8984894491261762e-06, |
|
"loss": 1.2133, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.560303893637227, |
|
"grad_norm": 1.0691276788711548, |
|
"learning_rate": 1.872114060590769e-06, |
|
"loss": 1.185, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5698005698005698, |
|
"grad_norm": 1.0371201038360596, |
|
"learning_rate": 1.842924789510531e-06, |
|
"loss": 1.2179, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5792972459639126, |
|
"grad_norm": 1.1502381563186646, |
|
"learning_rate": 1.8110158152871029e-06, |
|
"loss": 1.2306, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5887939221272555, |
|
"grad_norm": 0.9843053817749023, |
|
"learning_rate": 1.776490092464676e-06, |
|
"loss": 1.1823, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5982905982905983, |
|
"grad_norm": 1.2571120262145996, |
|
"learning_rate": 1.7394590185463837e-06, |
|
"loss": 1.2054, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.603988603988604, |
|
"eval_loss": 1.200851559638977, |
|
"eval_runtime": 241.2048, |
|
"eval_samples_per_second": 4.851, |
|
"eval_steps_per_second": 2.425, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6077872744539411, |
|
"grad_norm": 1.0591609477996826, |
|
"learning_rate": 1.7000420745694253e-06, |
|
"loss": 1.1656, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 1.0051286220550537, |
|
"learning_rate": 1.6583664395986035e-06, |
|
"loss": 1.1572, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6267806267806267, |
|
"grad_norm": 1.3040874004364014, |
|
"learning_rate": 1.61456658038212e-06, |
|
"loss": 1.1662, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6362773029439696, |
|
"grad_norm": 1.066710352897644, |
|
"learning_rate": 1.5687838174936082e-06, |
|
"loss": 1.1534, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6457739791073125, |
|
"grad_norm": 1.009498953819275, |
|
"learning_rate": 1.5211658693602396e-06, |
|
"loss": 1.1528, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6552706552706553, |
|
"grad_norm": 1.3166037797927856, |
|
"learning_rate": 1.471866375648119e-06, |
|
"loss": 1.1289, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6647673314339981, |
|
"grad_norm": 1.092807412147522, |
|
"learning_rate": 1.4210444015427466e-06, |
|
"loss": 1.1494, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.674264007597341, |
|
"grad_norm": 1.4485735893249512, |
|
"learning_rate": 1.3688639245240078e-06, |
|
"loss": 1.1119, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6837606837606838, |
|
"grad_norm": 1.17286217212677, |
|
"learning_rate": 1.3154933052916088e-06, |
|
"loss": 1.1025, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6932573599240266, |
|
"grad_norm": 1.246343731880188, |
|
"learning_rate": 1.2611047445480159e-06, |
|
"loss": 1.1005, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7027540360873694, |
|
"grad_norm": 1.3539321422576904, |
|
"learning_rate": 1.2058737273916022e-06, |
|
"loss": 1.1065, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.704653371320038, |
|
"eval_loss": 1.1182154417037964, |
|
"eval_runtime": 242.6546, |
|
"eval_samples_per_second": 4.822, |
|
"eval_steps_per_second": 2.411, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.7122507122507122, |
|
"grad_norm": 1.5195080041885376, |
|
"learning_rate": 1.1499784571126618e-06, |
|
"loss": 1.1485, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7217473884140551, |
|
"grad_norm": 1.4020613431930542, |
|
"learning_rate": 1.0935992802191625e-06, |
|
"loss": 1.0743, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7312440645773979, |
|
"grad_norm": 1.3675881624221802, |
|
"learning_rate": 1.036918104547385e-06, |
|
"loss": 1.1114, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 1.693854808807373, |
|
"learning_rate": 9.801178123349297e-07, |
|
"loss": 1.0854, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7502374169040835, |
|
"grad_norm": 1.451640248298645, |
|
"learning_rate": 9.233816701498069e-07, |
|
"loss": 1.1016, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7597340930674265, |
|
"grad_norm": 1.1422868967056274, |
|
"learning_rate": 8.668927375794832e-07, |
|
"loss": 1.083, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 1.7633247375488281, |
|
"learning_rate": 8.108332765877524e-07, |
|
"loss": 1.0593, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7787274453941121, |
|
"grad_norm": 1.420740008354187, |
|
"learning_rate": 7.553841634451461e-07, |
|
"loss": 1.0782, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7882241215574549, |
|
"grad_norm": 2.188483476638794, |
|
"learning_rate": 7.007243051302909e-07, |
|
"loss": 1.1092, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7977207977207977, |
|
"grad_norm": 1.8329579830169678, |
|
"learning_rate": 6.47030062085204e-07, |
|
"loss": 1.0592, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.805318138651472, |
|
"eval_loss": 1.0689319372177124, |
|
"eval_runtime": 242.2915, |
|
"eval_samples_per_second": 4.829, |
|
"eval_steps_per_second": 2.414, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.8072174738841406, |
|
"grad_norm": 1.286758303642273, |
|
"learning_rate": 5.944746791870061e-07, |
|
"loss": 1.0576, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8167141500474834, |
|
"grad_norm": 1.4296880960464478, |
|
"learning_rate": 5.432277267720291e-07, |
|
"loss": 1.0583, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8262108262108262, |
|
"grad_norm": 1.3304263353347778, |
|
"learning_rate": 4.934545535158568e-07, |
|
"loss": 1.044, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.835707502374169, |
|
"grad_norm": 1.1693774461746216, |
|
"learning_rate": 4.4531575293458377e-07, |
|
"loss": 1.0326, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8452041785375118, |
|
"grad_norm": 1.6457087993621826, |
|
"learning_rate": 3.989666452286358e-07, |
|
"loss": 1.0269, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8547008547008547, |
|
"grad_norm": 2.029062032699585, |
|
"learning_rate": 3.5455677614097913e-07, |
|
"loss": 1.0281, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.8641975308641975, |
|
"grad_norm": 1.6128430366516113, |
|
"learning_rate": 3.1222943444666106e-07, |
|
"loss": 1.0606, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8736942070275404, |
|
"grad_norm": 1.2736912965774536, |
|
"learning_rate": 2.721211896305059e-07, |
|
"loss": 1.0518, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8831908831908832, |
|
"grad_norm": 1.2634296417236328, |
|
"learning_rate": 2.3436145124464901e-07, |
|
"loss": 1.0506, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8926875593542261, |
|
"grad_norm": 1.5625249147415161, |
|
"learning_rate": 1.9907205136764859e-07, |
|
"loss": 1.0552, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9021842355175689, |
|
"grad_norm": 1.297250747680664, |
|
"learning_rate": 1.6636685151237117e-07, |
|
"loss": 1.0322, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.905982905982906, |
|
"eval_loss": 1.0477073192596436, |
|
"eval_runtime": 242.6327, |
|
"eval_samples_per_second": 4.822, |
|
"eval_steps_per_second": 2.411, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.9116809116809117, |
|
"grad_norm": 1.4581608772277832, |
|
"learning_rate": 1.3635137525096942e-07, |
|
"loss": 1.0286, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9211775878442545, |
|
"grad_norm": 1.5961098670959473, |
|
"learning_rate": 1.0912246774228606e-07, |
|
"loss": 1.0142, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9306742640075973, |
|
"grad_norm": 1.5795928239822388, |
|
"learning_rate": 8.47679832602235e-08, |
|
"loss": 1.0443, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9401709401709402, |
|
"grad_norm": 1.754335880279541, |
|
"learning_rate": 6.336650173127223e-08, |
|
"loss": 1.0416, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.949667616334283, |
|
"grad_norm": 2.9027416706085205, |
|
"learning_rate": 4.498707519578915e-08, |
|
"loss": 1.0407, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9591642924976258, |
|
"grad_norm": 1.5181182622909546, |
|
"learning_rate": 2.9689005011073077e-08, |
|
"loss": 1.0607, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9686609686609686, |
|
"grad_norm": 1.0603967905044556, |
|
"learning_rate": 1.752165051509058e-08, |
|
"loss": 1.0485, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9781576448243114, |
|
"grad_norm": 1.657195806503296, |
|
"learning_rate": 8.52426976820364e-09, |
|
"loss": 1.0505, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 2.1635196208953857, |
|
"learning_rate": 2.725892886743297e-09, |
|
"loss": 1.016, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9971509971509972, |
|
"grad_norm": 1.6551047563552856, |
|
"learning_rate": 1.4522837712205304e-10, |
|
"loss": 1.0395, |
|
"step": 1050 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1053, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 351, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9456916610613248e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|