{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 106, "global_step": 702, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000949667616334283, "eval_loss": 1.492834210395813, "eval_runtime": 241.5208, "eval_samples_per_second": 4.844, "eval_steps_per_second": 2.422, "step": 1 }, { "epoch": 0.00949667616334283, "grad_norm": 0.782672107219696, "learning_rate": 4e-08, "loss": 1.4416, "step": 10 }, { "epoch": 0.01899335232668566, "grad_norm": 0.5684090852737427, "learning_rate": 8e-08, "loss": 1.4691, "step": 20 }, { "epoch": 0.02849002849002849, "grad_norm": 0.7916907668113708, "learning_rate": 1.2e-07, "loss": 1.4888, "step": 30 }, { "epoch": 0.03798670465337132, "grad_norm": 0.7482171058654785, "learning_rate": 1.6e-07, "loss": 1.4607, "step": 40 }, { "epoch": 0.04748338081671415, "grad_norm": 0.8853126168251038, "learning_rate": 2e-07, "loss": 1.4953, "step": 50 }, { "epoch": 0.05698005698005698, "grad_norm": 0.6968705058097839, "learning_rate": 2.4e-07, "loss": 1.4915, "step": 60 }, { "epoch": 0.06647673314339982, "grad_norm": 0.737502932548523, "learning_rate": 2.8e-07, "loss": 1.459, "step": 70 }, { "epoch": 0.07597340930674264, "grad_norm": 0.6606144309043884, "learning_rate": 3.2e-07, "loss": 1.4511, "step": 80 }, { "epoch": 0.08547008547008547, "grad_norm": 0.7408589720726013, "learning_rate": 3.6e-07, "loss": 1.4256, "step": 90 }, { "epoch": 0.0949667616334283, "grad_norm": 0.8764765858650208, "learning_rate": 4e-07, "loss": 1.476, "step": 100 }, { "epoch": 0.100664767331434, "eval_loss": 1.491244912147522, "eval_runtime": 241.0891, "eval_samples_per_second": 4.853, "eval_steps_per_second": 2.426, "step": 106 }, { "epoch": 0.10446343779677113, "grad_norm": 0.7903630137443542, "learning_rate": 4.3999999999999997e-07, "loss": 1.4327, "step": 110 }, { "epoch": 0.11396011396011396, "grad_norm": 0.6204758286476135, "learning_rate": 4.8e-07, "loss": 1.4546, "step": 120 }, { "epoch": 0.12345679012345678, "grad_norm": 0.6556420922279358, "learning_rate": 5.2e-07, "loss": 1.4785, "step": 130 }, { "epoch": 0.13295346628679963, "grad_norm": 0.6578308939933777, "learning_rate": 5.6e-07, "loss": 1.4848, "step": 140 }, { "epoch": 0.14245014245014245, "grad_norm": 0.7534553408622742, "learning_rate": 6e-07, "loss": 1.4714, "step": 150 }, { "epoch": 0.15194681861348527, "grad_norm": 1.1248106956481934, "learning_rate": 6.4e-07, "loss": 1.4513, "step": 160 }, { "epoch": 0.16144349477682812, "grad_norm": 0.7845984697341919, "learning_rate": 6.800000000000001e-07, "loss": 1.4494, "step": 170 }, { "epoch": 0.17094017094017094, "grad_norm": 0.8384345769882202, "learning_rate": 7.2e-07, "loss": 1.4326, "step": 180 }, { "epoch": 0.18043684710351376, "grad_norm": 0.737319827079773, "learning_rate": 7.599999999999999e-07, "loss": 1.4772, "step": 190 }, { "epoch": 0.1899335232668566, "grad_norm": 0.5843039155006409, "learning_rate": 8e-07, "loss": 1.459, "step": 200 }, { "epoch": 0.19943019943019943, "grad_norm": 0.6763444542884827, "learning_rate": 8.399999999999999e-07, "loss": 1.4464, "step": 210 }, { "epoch": 0.201329534662868, "eval_loss": 1.4787342548370361, "eval_runtime": 241.3275, "eval_samples_per_second": 4.848, "eval_steps_per_second": 2.424, "step": 212 }, { "epoch": 0.20892687559354226, "grad_norm": 0.5159311294555664, "learning_rate": 8.799999999999999e-07, "loss": 1.4781, "step": 220 }, { "epoch": 0.2184235517568851, "grad_norm": 0.635596513748169, "learning_rate": 
9.2e-07, "loss": 1.4475, "step": 230 }, { "epoch": 0.22792022792022792, "grad_norm": 0.6518263220787048, "learning_rate": 9.6e-07, "loss": 1.4173, "step": 240 }, { "epoch": 0.23741690408357075, "grad_norm": 0.6334572434425354, "learning_rate": 1e-06, "loss": 1.4194, "step": 250 }, { "epoch": 0.24691358024691357, "grad_norm": 0.7460399270057678, "learning_rate": 1.04e-06, "loss": 1.4185, "step": 260 }, { "epoch": 0.2564102564102564, "grad_norm": 0.5270726084709167, "learning_rate": 1.08e-06, "loss": 1.385, "step": 270 }, { "epoch": 0.26590693257359926, "grad_norm": 0.5877587795257568, "learning_rate": 1.12e-06, "loss": 1.412, "step": 280 }, { "epoch": 0.2754036087369421, "grad_norm": 0.7391476035118103, "learning_rate": 1.16e-06, "loss": 1.4478, "step": 290 }, { "epoch": 0.2849002849002849, "grad_norm": 0.7564486265182495, "learning_rate": 1.2e-06, "loss": 1.439, "step": 300 }, { "epoch": 0.2943969610636277, "grad_norm": 0.5443644523620605, "learning_rate": 1.24e-06, "loss": 1.4065, "step": 310 }, { "epoch": 0.301994301994302, "eval_loss": 1.438108205795288, "eval_runtime": 241.1917, "eval_samples_per_second": 4.851, "eval_steps_per_second": 2.425, "step": 318 }, { "epoch": 0.30389363722697055, "grad_norm": 0.5446246862411499, "learning_rate": 1.28e-06, "loss": 1.4128, "step": 320 }, { "epoch": 0.31339031339031337, "grad_norm": 0.7768235802650452, "learning_rate": 1.32e-06, "loss": 1.3838, "step": 330 }, { "epoch": 0.32288698955365625, "grad_norm": 0.8241527080535889, "learning_rate": 1.3600000000000001e-06, "loss": 1.4541, "step": 340 }, { "epoch": 0.33238366571699907, "grad_norm": 0.7136777639389038, "learning_rate": 1.4e-06, "loss": 1.4092, "step": 350 }, { "epoch": 0.3418803418803419, "grad_norm": 0.6339492201805115, "learning_rate": 1.44e-06, "loss": 1.4339, "step": 360 }, { "epoch": 0.3513770180436847, "grad_norm": 0.5841691493988037, "learning_rate": 1.48e-06, "loss": 1.3396, "step": 370 }, { "epoch": 0.36087369420702753, "grad_norm": 0.6338390707969666, "learning_rate": 1.5199999999999998e-06, "loss": 1.3738, "step": 380 }, { "epoch": 0.37037037037037035, "grad_norm": 0.6443079710006714, "learning_rate": 1.5599999999999999e-06, "loss": 1.3632, "step": 390 }, { "epoch": 0.3798670465337132, "grad_norm": 0.7841497659683228, "learning_rate": 1.6e-06, "loss": 1.386, "step": 400 }, { "epoch": 0.38936372269705605, "grad_norm": 0.6738791465759277, "learning_rate": 1.6399999999999998e-06, "loss": 1.3469, "step": 410 }, { "epoch": 0.39886039886039887, "grad_norm": 0.6674798727035522, "learning_rate": 1.6799999999999998e-06, "loss": 1.3565, "step": 420 }, { "epoch": 0.402659069325736, "eval_loss": 1.371882438659668, "eval_runtime": 241.1901, "eval_samples_per_second": 4.851, "eval_steps_per_second": 2.425, "step": 424 }, { "epoch": 0.4083570750237417, "grad_norm": 0.7130848169326782, "learning_rate": 1.7199999999999998e-06, "loss": 1.3734, "step": 430 }, { "epoch": 0.4178537511870845, "grad_norm": 0.60502690076828, "learning_rate": 1.7599999999999999e-06, "loss": 1.3689, "step": 440 }, { "epoch": 0.42735042735042733, "grad_norm": 0.7991560697555542, "learning_rate": 1.8e-06, "loss": 1.335, "step": 450 }, { "epoch": 0.4368471035137702, "grad_norm": 0.7825191020965576, "learning_rate": 1.84e-06, "loss": 1.3713, "step": 460 }, { "epoch": 0.44634377967711303, "grad_norm": 0.8290002942085266, "learning_rate": 1.8799999999999998e-06, "loss": 1.3294, "step": 470 }, { "epoch": 0.45584045584045585, "grad_norm": 0.7695876955986023, "learning_rate": 1.92e-06, "loss": 1.2743, "step": 480 }, { "epoch": 
0.46533713200379867, "grad_norm": 0.7531653046607971, "learning_rate": 1.96e-06, "loss": 1.3175, "step": 490 }, { "epoch": 0.4748338081671415, "grad_norm": 0.9283966422080994, "learning_rate": 2e-06, "loss": 1.2777, "step": 500 }, { "epoch": 0.4843304843304843, "grad_norm": 0.8161536455154419, "learning_rate": 1.998386746257178e-06, "loss": 1.2737, "step": 510 }, { "epoch": 0.49382716049382713, "grad_norm": 0.8506576418876648, "learning_rate": 1.993552190203991e-06, "loss": 1.277, "step": 520 }, { "epoch": 0.50332383665717, "grad_norm": 0.7465077042579651, "learning_rate": 1.985511930571733e-06, "loss": 1.2351, "step": 530 }, { "epoch": 0.50332383665717, "eval_loss": 1.285551905632019, "eval_runtime": 241.4345, "eval_samples_per_second": 4.846, "eval_steps_per_second": 2.423, "step": 530 }, { "epoch": 0.5128205128205128, "grad_norm": 0.6605340838432312, "learning_rate": 1.9742919093182947e-06, "loss": 1.2638, "step": 540 }, { "epoch": 0.5223171889838556, "grad_norm": 0.8043677806854248, "learning_rate": 1.959928327926239e-06, "loss": 1.2617, "step": 550 }, { "epoch": 0.5318138651471985, "grad_norm": 0.9856230020523071, "learning_rate": 1.942467530598449e-06, "loss": 1.2799, "step": 560 }, { "epoch": 0.5413105413105413, "grad_norm": 0.9948506355285645, "learning_rate": 1.9219658547282065e-06, "loss": 1.2449, "step": 570 }, { "epoch": 0.5508072174738842, "grad_norm": 0.8890409469604492, "learning_rate": 1.8984894491261762e-06, "loss": 1.196, "step": 580 }, { "epoch": 0.560303893637227, "grad_norm": 0.9632903337478638, "learning_rate": 1.872114060590769e-06, "loss": 1.2567, "step": 590 }, { "epoch": 0.5698005698005698, "grad_norm": 1.1697239875793457, "learning_rate": 1.842924789510531e-06, "loss": 1.2497, "step": 600 }, { "epoch": 0.5792972459639126, "grad_norm": 1.23131263256073, "learning_rate": 1.8110158152871029e-06, "loss": 1.2721, "step": 610 }, { "epoch": 0.5887939221272555, "grad_norm": 1.192724585533142, "learning_rate": 1.776490092464676e-06, "loss": 1.1905, "step": 620 }, { "epoch": 0.5982905982905983, "grad_norm": 0.8948692083358765, "learning_rate": 1.7394590185463837e-06, "loss": 1.1938, "step": 630 }, { "epoch": 0.603988603988604, "eval_loss": 1.1977587938308716, "eval_runtime": 242.3209, "eval_samples_per_second": 4.828, "eval_steps_per_second": 2.414, "step": 636 }, { "epoch": 0.6077872744539411, "grad_norm": 1.2288507223129272, "learning_rate": 1.7000420745694253e-06, "loss": 1.1886, "step": 640 }, { "epoch": 0.6172839506172839, "grad_norm": 1.0895919799804688, "learning_rate": 1.6583664395986035e-06, "loss": 1.1645, "step": 650 }, { "epoch": 0.6267806267806267, "grad_norm": 0.91950523853302, "learning_rate": 1.61456658038212e-06, "loss": 1.1593, "step": 660 }, { "epoch": 0.6362773029439696, "grad_norm": 0.9742187857627869, "learning_rate": 1.5687838174936082e-06, "loss": 1.1584, "step": 670 }, { "epoch": 0.6457739791073125, "grad_norm": 1.2041188478469849, "learning_rate": 1.5211658693602396e-06, "loss": 1.1502, "step": 680 }, { "epoch": 0.6552706552706553, "grad_norm": 1.0713285207748413, "learning_rate": 1.471866375648119e-06, "loss": 1.1544, "step": 690 }, { "epoch": 0.6647673314339981, "grad_norm": 1.3049912452697754, "learning_rate": 1.4210444015427466e-06, "loss": 1.1671, "step": 700 } ], "logging_steps": 10, "max_steps": 1053, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 351, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": 
false }, "attributes": {} } }, "total_flos": 1.2971277740408832e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }