{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 106, "global_step": 1053, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000949667616334283, "eval_loss": 1.59001886844635, "eval_runtime": 241.2886, "eval_samples_per_second": 4.849, "eval_steps_per_second": 2.424, "step": 1 }, { "epoch": 0.00949667616334283, "grad_norm": 1.056609034538269, "learning_rate": 4e-08, "loss": 1.4438, "step": 10 }, { "epoch": 0.01899335232668566, "grad_norm": 0.9450796246528625, "learning_rate": 8e-08, "loss": 1.49, "step": 20 }, { "epoch": 0.02849002849002849, "grad_norm": 0.991963803768158, "learning_rate": 1.2e-07, "loss": 1.488, "step": 30 }, { "epoch": 0.03798670465337132, "grad_norm": 0.8748257160186768, "learning_rate": 1.6e-07, "loss": 1.4817, "step": 40 }, { "epoch": 0.04748338081671415, "grad_norm": 0.9113293290138245, "learning_rate": 2e-07, "loss": 1.4838, "step": 50 }, { "epoch": 0.05698005698005698, "grad_norm": 0.9806835651397705, "learning_rate": 2.4e-07, "loss": 1.4866, "step": 60 }, { "epoch": 0.06647673314339982, "grad_norm": 0.8570646643638611, "learning_rate": 2.8e-07, "loss": 1.4502, "step": 70 }, { "epoch": 0.07597340930674264, "grad_norm": 0.8111629486083984, "learning_rate": 3.2e-07, "loss": 1.4681, "step": 80 }, { "epoch": 0.08547008547008547, "grad_norm": 0.7242903709411621, "learning_rate": 3.6e-07, "loss": 1.5103, "step": 90 }, { "epoch": 0.0949667616334283, "grad_norm": 0.9140864014625549, "learning_rate": 4e-07, "loss": 1.4756, "step": 100 }, { "epoch": 0.100664767331434, "eval_loss": 1.588047742843628, "eval_runtime": 241.8249, "eval_samples_per_second": 4.838, "eval_steps_per_second": 2.419, "step": 106 }, { "epoch": 0.10446343779677113, "grad_norm": 0.766789436340332, "learning_rate": 4.3999999999999997e-07, "loss": 1.4455, "step": 110 }, { "epoch": 0.11396011396011396, "grad_norm": 1.0300992727279663, "learning_rate": 4.8e-07, "loss": 1.4651, "step": 120 }, { "epoch": 0.12345679012345678, "grad_norm": 0.8080026507377625, "learning_rate": 5.2e-07, "loss": 1.5405, "step": 130 }, { "epoch": 0.13295346628679963, "grad_norm": 0.8535659909248352, "learning_rate": 5.6e-07, "loss": 1.5043, "step": 140 }, { "epoch": 0.14245014245014245, "grad_norm": 1.1608635187149048, "learning_rate": 6e-07, "loss": 1.4763, "step": 150 }, { "epoch": 0.15194681861348527, "grad_norm": 1.0252463817596436, "learning_rate": 6.4e-07, "loss": 1.4542, "step": 160 }, { "epoch": 0.16144349477682812, "grad_norm": 1.242639183998108, "learning_rate": 6.800000000000001e-07, "loss": 1.4663, "step": 170 }, { "epoch": 0.17094017094017094, "grad_norm": 0.9343454241752625, "learning_rate": 7.2e-07, "loss": 1.4858, "step": 180 }, { "epoch": 0.18043684710351376, "grad_norm": 1.0508811473846436, "learning_rate": 7.599999999999999e-07, "loss": 1.4917, "step": 190 }, { "epoch": 0.1899335232668566, "grad_norm": 1.4828695058822632, "learning_rate": 8e-07, "loss": 1.4477, "step": 200 }, { "epoch": 0.19943019943019943, "grad_norm": 1.3482277393341064, "learning_rate": 8.399999999999999e-07, "loss": 1.4972, "step": 210 }, { "epoch": 0.201329534662868, "eval_loss": 1.5654927492141724, "eval_runtime": 241.4855, "eval_samples_per_second": 4.845, "eval_steps_per_second": 2.423, "step": 212 }, { "epoch": 0.20892687559354226, "grad_norm": 0.8324021697044373, "learning_rate": 8.799999999999999e-07, "loss": 1.4831, "step": 220 }, { "epoch": 0.2184235517568851, "grad_norm": 0.8362078070640564, "learning_rate": 9.2e-07, "loss": 1.4395, "step": 230 }, { "epoch": 0.22792022792022792, "grad_norm": 0.9072378873825073, "learning_rate": 9.6e-07, "loss": 1.4402, "step": 240 }, { "epoch": 0.23741690408357075, "grad_norm": 0.9287540912628174, "learning_rate": 1e-06, "loss": 1.4659, "step": 250 }, { "epoch": 0.24691358024691357, "grad_norm": 0.8501766920089722, "learning_rate": 1.04e-06, "loss": 1.4708, "step": 260 }, { "epoch": 0.2564102564102564, "grad_norm": 0.5359634160995483, "learning_rate": 1.08e-06, "loss": 1.4748, "step": 270 }, { "epoch": 0.26590693257359926, "grad_norm": 0.6378765106201172, "learning_rate": 1.12e-06, "loss": 1.4286, "step": 280 }, { "epoch": 0.2754036087369421, "grad_norm": 0.6789595484733582, "learning_rate": 1.16e-06, "loss": 1.4212, "step": 290 }, { "epoch": 0.2849002849002849, "grad_norm": 0.6995547413825989, "learning_rate": 1.2e-06, "loss": 1.4406, "step": 300 }, { "epoch": 0.2943969610636277, "grad_norm": 0.7261990904808044, "learning_rate": 1.24e-06, "loss": 1.4563, "step": 310 }, { "epoch": 0.301994301994302, "eval_loss": 1.5017595291137695, "eval_runtime": 241.2449, "eval_samples_per_second": 4.85, "eval_steps_per_second": 2.425, "step": 318 }, { "epoch": 0.30389363722697055, "grad_norm": 0.6617552042007446, "learning_rate": 1.28e-06, "loss": 1.3913, "step": 320 }, { "epoch": 0.31339031339031337, "grad_norm": 0.5867168307304382, "learning_rate": 1.32e-06, "loss": 1.3892, "step": 330 }, { "epoch": 0.32288698955365625, "grad_norm": 0.6030348539352417, "learning_rate": 1.3600000000000001e-06, "loss": 1.39, "step": 340 }, { "epoch": 0.33238366571699907, "grad_norm": 0.5722991228103638, "learning_rate": 1.4e-06, "loss": 1.3744, "step": 350 }, { "epoch": 0.3418803418803419, "grad_norm": 0.5842142105102539, "learning_rate": 1.44e-06, "loss": 1.3968, "step": 360 }, { "epoch": 0.3513770180436847, "grad_norm": 0.8536161184310913, "learning_rate": 1.48e-06, "loss": 1.4192, "step": 370 }, { "epoch": 0.36087369420702753, "grad_norm": 0.6988087296485901, "learning_rate": 1.5199999999999998e-06, "loss": 1.3769, "step": 380 }, { "epoch": 0.37037037037037035, "grad_norm": 0.5693365931510925, "learning_rate": 1.5599999999999999e-06, "loss": 1.3986, "step": 390 }, { "epoch": 0.3798670465337132, "grad_norm": 0.6898847818374634, "learning_rate": 1.6e-06, "loss": 1.3774, "step": 400 }, { "epoch": 0.38936372269705605, "grad_norm": 0.6398094892501831, "learning_rate": 1.6399999999999998e-06, "loss": 1.4069, "step": 410 }, { "epoch": 0.39886039886039887, "grad_norm": 0.5179505944252014, "learning_rate": 1.6799999999999998e-06, "loss": 1.3754, "step": 420 }, { "epoch": 0.402659069325736, "eval_loss": 1.4127401113510132, "eval_runtime": 242.04, "eval_samples_per_second": 4.834, "eval_steps_per_second": 2.417, "step": 424 }, { "epoch": 0.4083570750237417, "grad_norm": 0.691733717918396, "learning_rate": 1.7199999999999998e-06, "loss": 1.3288, "step": 430 }, { "epoch": 0.4178537511870845, "grad_norm": 0.684332013130188, "learning_rate": 1.7599999999999999e-06, "loss": 1.3613, "step": 440 }, { "epoch": 0.42735042735042733, "grad_norm": 0.7648876309394836, "learning_rate": 1.8e-06, "loss": 1.3232, "step": 450 }, { "epoch": 0.4368471035137702, "grad_norm": 0.8008069396018982, "learning_rate": 1.84e-06, "loss": 1.346, "step": 460 }, { "epoch": 0.44634377967711303, "grad_norm": 0.7102649211883545, "learning_rate": 1.8799999999999998e-06, "loss": 1.325, "step": 470 }, { "epoch": 0.45584045584045585, "grad_norm": 0.6222986578941345, "learning_rate": 1.92e-06, "loss": 1.2948, "step": 480 }, { "epoch": 0.46533713200379867, "grad_norm": 0.8117070198059082, "learning_rate": 1.96e-06, "loss": 1.3221, "step": 490 }, { "epoch": 0.4748338081671415, "grad_norm": 0.7963566780090332, "learning_rate": 2e-06, "loss": 1.2908, "step": 500 }, { "epoch": 0.4843304843304843, "grad_norm": 0.7263100147247314, "learning_rate": 1.998386746257178e-06, "loss": 1.3097, "step": 510 }, { "epoch": 0.49382716049382713, "grad_norm": 1.0555329322814941, "learning_rate": 1.993552190203991e-06, "loss": 1.2671, "step": 520 }, { "epoch": 0.50332383665717, "grad_norm": 0.9024184346199036, "learning_rate": 1.985511930571733e-06, "loss": 1.271, "step": 530 }, { "epoch": 0.50332383665717, "eval_loss": 1.3055741786956787, "eval_runtime": 241.4921, "eval_samples_per_second": 4.845, "eval_steps_per_second": 2.422, "step": 530 }, { "epoch": 0.5128205128205128, "grad_norm": 0.9171520471572876, "learning_rate": 1.9742919093182947e-06, "loss": 1.3002, "step": 540 }, { "epoch": 0.5223171889838556, "grad_norm": 1.1000499725341797, "learning_rate": 1.959928327926239e-06, "loss": 1.2513, "step": 550 }, { "epoch": 0.5318138651471985, "grad_norm": 0.9843473434448242, "learning_rate": 1.942467530598449e-06, "loss": 1.2669, "step": 560 }, { "epoch": 0.5413105413105413, "grad_norm": 0.9255275726318359, "learning_rate": 1.9219658547282065e-06, "loss": 1.2479, "step": 570 }, { "epoch": 0.5508072174738842, "grad_norm": 0.9032674431800842, "learning_rate": 1.8984894491261762e-06, "loss": 1.2133, "step": 580 }, { "epoch": 0.560303893637227, "grad_norm": 1.0691276788711548, "learning_rate": 1.872114060590769e-06, "loss": 1.185, "step": 590 }, { "epoch": 0.5698005698005698, "grad_norm": 1.0371201038360596, "learning_rate": 1.842924789510531e-06, "loss": 1.2179, "step": 600 }, { "epoch": 0.5792972459639126, "grad_norm": 1.1502381563186646, "learning_rate": 1.8110158152871029e-06, "loss": 1.2306, "step": 610 }, { "epoch": 0.5887939221272555, "grad_norm": 0.9843053817749023, "learning_rate": 1.776490092464676e-06, "loss": 1.1823, "step": 620 }, { "epoch": 0.5982905982905983, "grad_norm": 1.2571120262145996, "learning_rate": 1.7394590185463837e-06, "loss": 1.2054, "step": 630 }, { "epoch": 0.603988603988604, "eval_loss": 1.200851559638977, "eval_runtime": 241.2048, "eval_samples_per_second": 4.851, "eval_steps_per_second": 2.425, "step": 636 }, { "epoch": 0.6077872744539411, "grad_norm": 1.0591609477996826, "learning_rate": 1.7000420745694253e-06, "loss": 1.1656, "step": 640 }, { "epoch": 0.6172839506172839, "grad_norm": 1.0051286220550537, "learning_rate": 1.6583664395986035e-06, "loss": 1.1572, "step": 650 }, { "epoch": 0.6267806267806267, "grad_norm": 1.3040874004364014, "learning_rate": 1.61456658038212e-06, "loss": 1.1662, "step": 660 }, { "epoch": 0.6362773029439696, "grad_norm": 1.066710352897644, "learning_rate": 1.5687838174936082e-06, "loss": 1.1534, "step": 670 }, { "epoch": 0.6457739791073125, "grad_norm": 1.009498953819275, "learning_rate": 1.5211658693602396e-06, "loss": 1.1528, "step": 680 }, { "epoch": 0.6552706552706553, "grad_norm": 1.3166037797927856, "learning_rate": 1.471866375648119e-06, "loss": 1.1289, "step": 690 }, { "epoch": 0.6647673314339981, "grad_norm": 1.092807412147522, "learning_rate": 1.4210444015427466e-06, "loss": 1.1494, "step": 700 }, { "epoch": 0.674264007597341, "grad_norm": 1.4485735893249512, "learning_rate": 1.3688639245240078e-06, "loss": 1.1119, "step": 710 }, { "epoch": 0.6837606837606838, "grad_norm": 1.17286217212677, "learning_rate": 1.3154933052916088e-06, "loss": 1.1025, "step": 720 }, { "epoch": 0.6932573599240266, "grad_norm": 1.246343731880188, "learning_rate": 1.2611047445480159e-06, "loss": 1.1005, "step": 730 }, { "epoch": 0.7027540360873694, "grad_norm": 1.3539321422576904, "learning_rate": 1.2058737273916022e-06, "loss": 1.1065, "step": 740 }, { "epoch": 0.704653371320038, "eval_loss": 1.1182154417037964, "eval_runtime": 242.6546, "eval_samples_per_second": 4.822, "eval_steps_per_second": 2.411, "step": 742 }, { "epoch": 0.7122507122507122, "grad_norm": 1.5195080041885376, "learning_rate": 1.1499784571126618e-06, "loss": 1.1485, "step": 750 }, { "epoch": 0.7217473884140551, "grad_norm": 1.4020613431930542, "learning_rate": 1.0935992802191625e-06, "loss": 1.0743, "step": 760 }, { "epoch": 0.7312440645773979, "grad_norm": 1.3675881624221802, "learning_rate": 1.036918104547385e-06, "loss": 1.1114, "step": 770 }, { "epoch": 0.7407407407407407, "grad_norm": 1.693854808807373, "learning_rate": 9.801178123349297e-07, "loss": 1.0854, "step": 780 }, { "epoch": 0.7502374169040835, "grad_norm": 1.451640248298645, "learning_rate": 9.233816701498069e-07, "loss": 1.1016, "step": 790 }, { "epoch": 0.7597340930674265, "grad_norm": 1.1422868967056274, "learning_rate": 8.668927375794832e-07, "loss": 1.083, "step": 800 }, { "epoch": 0.7692307692307693, "grad_norm": 1.7633247375488281, "learning_rate": 8.108332765877524e-07, "loss": 1.0593, "step": 810 }, { "epoch": 0.7787274453941121, "grad_norm": 1.420740008354187, "learning_rate": 7.553841634451461e-07, "loss": 1.0782, "step": 820 }, { "epoch": 0.7882241215574549, "grad_norm": 2.188483476638794, "learning_rate": 7.007243051302909e-07, "loss": 1.1092, "step": 830 }, { "epoch": 0.7977207977207977, "grad_norm": 1.8329579830169678, "learning_rate": 6.47030062085204e-07, "loss": 1.0592, "step": 840 }, { "epoch": 0.805318138651472, "eval_loss": 1.0689319372177124, "eval_runtime": 242.2915, "eval_samples_per_second": 4.829, "eval_steps_per_second": 2.414, "step": 848 }, { "epoch": 0.8072174738841406, "grad_norm": 1.286758303642273, "learning_rate": 5.944746791870061e-07, "loss": 1.0576, "step": 850 }, { "epoch": 0.8167141500474834, "grad_norm": 1.4296880960464478, "learning_rate": 5.432277267720291e-07, "loss": 1.0583, "step": 860 }, { "epoch": 0.8262108262108262, "grad_norm": 1.3304263353347778, "learning_rate": 4.934545535158568e-07, "loss": 1.044, "step": 870 }, { "epoch": 0.835707502374169, "grad_norm": 1.1693774461746216, "learning_rate": 4.4531575293458377e-07, "loss": 1.0326, "step": 880 }, { "epoch": 0.8452041785375118, "grad_norm": 1.6457087993621826, "learning_rate": 3.989666452286358e-07, "loss": 1.0269, "step": 890 }, { "epoch": 0.8547008547008547, "grad_norm": 2.029062032699585, "learning_rate": 3.5455677614097913e-07, "loss": 1.0281, "step": 900 }, { "epoch": 0.8641975308641975, "grad_norm": 1.6128430366516113, "learning_rate": 3.1222943444666106e-07, "loss": 1.0606, "step": 910 }, { "epoch": 0.8736942070275404, "grad_norm": 1.2736912965774536, "learning_rate": 2.721211896305059e-07, "loss": 1.0518, "step": 920 }, { "epoch": 0.8831908831908832, "grad_norm": 1.2634296417236328, "learning_rate": 2.3436145124464901e-07, "loss": 1.0506, "step": 930 }, { "epoch": 0.8926875593542261, "grad_norm": 1.5625249147415161, "learning_rate": 1.9907205136764859e-07, "loss": 1.0552, "step": 940 }, { "epoch": 0.9021842355175689, "grad_norm": 1.297250747680664, "learning_rate": 1.6636685151237117e-07, "loss": 1.0322, "step": 950 }, { "epoch": 0.905982905982906, "eval_loss": 1.0477073192596436, "eval_runtime": 242.6327, "eval_samples_per_second": 4.822, "eval_steps_per_second": 2.411, "step": 954 }, { "epoch": 0.9116809116809117, "grad_norm": 1.4581608772277832, "learning_rate": 1.3635137525096942e-07, "loss": 1.0286, "step": 960 }, { "epoch": 0.9211775878442545, "grad_norm": 1.5961098670959473, "learning_rate": 1.0912246774228606e-07, "loss": 1.0142, "step": 970 }, { "epoch": 0.9306742640075973, "grad_norm": 1.5795928239822388, "learning_rate": 8.47679832602235e-08, "loss": 1.0443, "step": 980 }, { "epoch": 0.9401709401709402, "grad_norm": 1.754335880279541, "learning_rate": 6.336650173127223e-08, "loss": 1.0416, "step": 990 }, { "epoch": 0.949667616334283, "grad_norm": 2.9027416706085205, "learning_rate": 4.498707519578915e-08, "loss": 1.0407, "step": 1000 }, { "epoch": 0.9591642924976258, "grad_norm": 1.5181182622909546, "learning_rate": 2.9689005011073077e-08, "loss": 1.0607, "step": 1010 }, { "epoch": 0.9686609686609686, "grad_norm": 1.0603967905044556, "learning_rate": 1.752165051509058e-08, "loss": 1.0485, "step": 1020 }, { "epoch": 0.9781576448243114, "grad_norm": 1.657195806503296, "learning_rate": 8.52426976820364e-09, "loss": 1.0505, "step": 1030 }, { "epoch": 0.9876543209876543, "grad_norm": 2.1635196208953857, "learning_rate": 2.725892886743297e-09, "loss": 1.016, "step": 1040 }, { "epoch": 0.9971509971509972, "grad_norm": 1.6551047563552856, "learning_rate": 1.4522837712205304e-10, "loss": 1.0395, "step": 1050 } ], "logging_steps": 10, "max_steps": 1053, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 351, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9456916610613248e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }