|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.6666666666666666, |
|
"eval_steps": 106, |
|
"global_step": 702, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000949667616334283, |
|
"eval_loss": 1.492834210395813, |
|
"eval_runtime": 241.5208, |
|
"eval_samples_per_second": 4.844, |
|
"eval_steps_per_second": 2.422, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00949667616334283, |
|
"grad_norm": 0.782672107219696, |
|
"learning_rate": 4e-08, |
|
"loss": 1.4416, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01899335232668566, |
|
"grad_norm": 0.5684090852737427, |
|
"learning_rate": 8e-08, |
|
"loss": 1.4691, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02849002849002849, |
|
"grad_norm": 0.7916907668113708, |
|
"learning_rate": 1.2e-07, |
|
"loss": 1.4888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03798670465337132, |
|
"grad_norm": 0.7482171058654785, |
|
"learning_rate": 1.6e-07, |
|
"loss": 1.4607, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04748338081671415, |
|
"grad_norm": 0.8853126168251038, |
|
"learning_rate": 2e-07, |
|
"loss": 1.4953, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05698005698005698, |
|
"grad_norm": 0.6968705058097839, |
|
"learning_rate": 2.4e-07, |
|
"loss": 1.4915, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06647673314339982, |
|
"grad_norm": 0.737502932548523, |
|
"learning_rate": 2.8e-07, |
|
"loss": 1.459, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07597340930674264, |
|
"grad_norm": 0.6606144309043884, |
|
"learning_rate": 3.2e-07, |
|
"loss": 1.4511, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08547008547008547, |
|
"grad_norm": 0.7408589720726013, |
|
"learning_rate": 3.6e-07, |
|
"loss": 1.4256, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0949667616334283, |
|
"grad_norm": 0.8764765858650208, |
|
"learning_rate": 4e-07, |
|
"loss": 1.476, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.100664767331434, |
|
"eval_loss": 1.491244912147522, |
|
"eval_runtime": 241.0891, |
|
"eval_samples_per_second": 4.853, |
|
"eval_steps_per_second": 2.426, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.10446343779677113, |
|
"grad_norm": 0.7903630137443542, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 1.4327, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11396011396011396, |
|
"grad_norm": 0.6204758286476135, |
|
"learning_rate": 4.8e-07, |
|
"loss": 1.4546, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.6556420922279358, |
|
"learning_rate": 5.2e-07, |
|
"loss": 1.4785, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.13295346628679963, |
|
"grad_norm": 0.6578308939933777, |
|
"learning_rate": 5.6e-07, |
|
"loss": 1.4848, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14245014245014245, |
|
"grad_norm": 0.7534553408622742, |
|
"learning_rate": 6e-07, |
|
"loss": 1.4714, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15194681861348527, |
|
"grad_norm": 1.1248106956481934, |
|
"learning_rate": 6.4e-07, |
|
"loss": 1.4513, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.16144349477682812, |
|
"grad_norm": 0.7845984697341919, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 1.4494, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17094017094017094, |
|
"grad_norm": 0.8384345769882202, |
|
"learning_rate": 7.2e-07, |
|
"loss": 1.4326, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.18043684710351376, |
|
"grad_norm": 0.737319827079773, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 1.4772, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1899335232668566, |
|
"grad_norm": 0.5843039155006409, |
|
"learning_rate": 8e-07, |
|
"loss": 1.459, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19943019943019943, |
|
"grad_norm": 0.6763444542884827, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 1.4464, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.201329534662868, |
|
"eval_loss": 1.4787342548370361, |
|
"eval_runtime": 241.3275, |
|
"eval_samples_per_second": 4.848, |
|
"eval_steps_per_second": 2.424, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.20892687559354226, |
|
"grad_norm": 0.5159311294555664, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 1.4781, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2184235517568851, |
|
"grad_norm": 0.635596513748169, |
|
"learning_rate": 9.2e-07, |
|
"loss": 1.4475, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.22792022792022792, |
|
"grad_norm": 0.6518263220787048, |
|
"learning_rate": 9.6e-07, |
|
"loss": 1.4173, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.23741690408357075, |
|
"grad_norm": 0.6334572434425354, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4194, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.7460399270057678, |
|
"learning_rate": 1.04e-06, |
|
"loss": 1.4185, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 0.5270726084709167, |
|
"learning_rate": 1.08e-06, |
|
"loss": 1.385, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.26590693257359926, |
|
"grad_norm": 0.5877587795257568, |
|
"learning_rate": 1.12e-06, |
|
"loss": 1.412, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2754036087369421, |
|
"grad_norm": 0.7391476035118103, |
|
"learning_rate": 1.16e-06, |
|
"loss": 1.4478, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2849002849002849, |
|
"grad_norm": 0.7564486265182495, |
|
"learning_rate": 1.2e-06, |
|
"loss": 1.439, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2943969610636277, |
|
"grad_norm": 0.5443644523620605, |
|
"learning_rate": 1.24e-06, |
|
"loss": 1.4065, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.301994301994302, |
|
"eval_loss": 1.438108205795288, |
|
"eval_runtime": 241.1917, |
|
"eval_samples_per_second": 4.851, |
|
"eval_steps_per_second": 2.425, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.30389363722697055, |
|
"grad_norm": 0.5446246862411499, |
|
"learning_rate": 1.28e-06, |
|
"loss": 1.4128, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.31339031339031337, |
|
"grad_norm": 0.7768235802650452, |
|
"learning_rate": 1.32e-06, |
|
"loss": 1.3838, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.32288698955365625, |
|
"grad_norm": 0.8241527080535889, |
|
"learning_rate": 1.3600000000000001e-06, |
|
"loss": 1.4541, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.33238366571699907, |
|
"grad_norm": 0.7136777639389038, |
|
"learning_rate": 1.4e-06, |
|
"loss": 1.4092, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 0.6339492201805115, |
|
"learning_rate": 1.44e-06, |
|
"loss": 1.4339, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3513770180436847, |
|
"grad_norm": 0.5841691493988037, |
|
"learning_rate": 1.48e-06, |
|
"loss": 1.3396, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.36087369420702753, |
|
"grad_norm": 0.6338390707969666, |
|
"learning_rate": 1.5199999999999998e-06, |
|
"loss": 1.3738, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.6443079710006714, |
|
"learning_rate": 1.5599999999999999e-06, |
|
"loss": 1.3632, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3798670465337132, |
|
"grad_norm": 0.7841497659683228, |
|
"learning_rate": 1.6e-06, |
|
"loss": 1.386, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38936372269705605, |
|
"grad_norm": 0.6738791465759277, |
|
"learning_rate": 1.6399999999999998e-06, |
|
"loss": 1.3469, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.39886039886039887, |
|
"grad_norm": 0.6674798727035522, |
|
"learning_rate": 1.6799999999999998e-06, |
|
"loss": 1.3565, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.402659069325736, |
|
"eval_loss": 1.371882438659668, |
|
"eval_runtime": 241.1901, |
|
"eval_samples_per_second": 4.851, |
|
"eval_steps_per_second": 2.425, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.4083570750237417, |
|
"grad_norm": 0.7130848169326782, |
|
"learning_rate": 1.7199999999999998e-06, |
|
"loss": 1.3734, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4178537511870845, |
|
"grad_norm": 0.60502690076828, |
|
"learning_rate": 1.7599999999999999e-06, |
|
"loss": 1.3689, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.42735042735042733, |
|
"grad_norm": 0.7991560697555542, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.335, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4368471035137702, |
|
"grad_norm": 0.7825191020965576, |
|
"learning_rate": 1.84e-06, |
|
"loss": 1.3713, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.44634377967711303, |
|
"grad_norm": 0.8290002942085266, |
|
"learning_rate": 1.8799999999999998e-06, |
|
"loss": 1.3294, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.45584045584045585, |
|
"grad_norm": 0.7695876955986023, |
|
"learning_rate": 1.92e-06, |
|
"loss": 1.2743, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.46533713200379867, |
|
"grad_norm": 0.7531653046607971, |
|
"learning_rate": 1.96e-06, |
|
"loss": 1.3175, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4748338081671415, |
|
"grad_norm": 0.9283966422080994, |
|
"learning_rate": 2e-06, |
|
"loss": 1.2777, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4843304843304843, |
|
"grad_norm": 0.8161536455154419, |
|
"learning_rate": 1.998386746257178e-06, |
|
"loss": 1.2737, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 0.8506576418876648, |
|
"learning_rate": 1.993552190203991e-06, |
|
"loss": 1.277, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"grad_norm": 0.7465077042579651, |
|
"learning_rate": 1.985511930571733e-06, |
|
"loss": 1.2351, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.50332383665717, |
|
"eval_loss": 1.285551905632019, |
|
"eval_runtime": 241.4345, |
|
"eval_samples_per_second": 4.846, |
|
"eval_steps_per_second": 2.423, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 0.6605340838432312, |
|
"learning_rate": 1.9742919093182947e-06, |
|
"loss": 1.2638, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5223171889838556, |
|
"grad_norm": 0.8043677806854248, |
|
"learning_rate": 1.959928327926239e-06, |
|
"loss": 1.2617, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5318138651471985, |
|
"grad_norm": 0.9856230020523071, |
|
"learning_rate": 1.942467530598449e-06, |
|
"loss": 1.2799, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5413105413105413, |
|
"grad_norm": 0.9948506355285645, |
|
"learning_rate": 1.9219658547282065e-06, |
|
"loss": 1.2449, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5508072174738842, |
|
"grad_norm": 0.8890409469604492, |
|
"learning_rate": 1.8984894491261762e-06, |
|
"loss": 1.196, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.560303893637227, |
|
"grad_norm": 0.9632903337478638, |
|
"learning_rate": 1.872114060590769e-06, |
|
"loss": 1.2567, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5698005698005698, |
|
"grad_norm": 1.1697239875793457, |
|
"learning_rate": 1.842924789510531e-06, |
|
"loss": 1.2497, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5792972459639126, |
|
"grad_norm": 1.23131263256073, |
|
"learning_rate": 1.8110158152871029e-06, |
|
"loss": 1.2721, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5887939221272555, |
|
"grad_norm": 1.192724585533142, |
|
"learning_rate": 1.776490092464676e-06, |
|
"loss": 1.1905, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5982905982905983, |
|
"grad_norm": 0.8948692083358765, |
|
"learning_rate": 1.7394590185463837e-06, |
|
"loss": 1.1938, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.603988603988604, |
|
"eval_loss": 1.1977587938308716, |
|
"eval_runtime": 242.3209, |
|
"eval_samples_per_second": 4.828, |
|
"eval_steps_per_second": 2.414, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6077872744539411, |
|
"grad_norm": 1.2288507223129272, |
|
"learning_rate": 1.7000420745694253e-06, |
|
"loss": 1.1886, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 1.0895919799804688, |
|
"learning_rate": 1.6583664395986035e-06, |
|
"loss": 1.1645, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6267806267806267, |
|
"grad_norm": 0.91950523853302, |
|
"learning_rate": 1.61456658038212e-06, |
|
"loss": 1.1593, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6362773029439696, |
|
"grad_norm": 0.9742187857627869, |
|
"learning_rate": 1.5687838174936082e-06, |
|
"loss": 1.1584, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6457739791073125, |
|
"grad_norm": 1.2041188478469849, |
|
"learning_rate": 1.5211658693602396e-06, |
|
"loss": 1.1502, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6552706552706553, |
|
"grad_norm": 1.0713285207748413, |
|
"learning_rate": 1.471866375648119e-06, |
|
"loss": 1.1544, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6647673314339981, |
|
"grad_norm": 1.3049912452697754, |
|
"learning_rate": 1.4210444015427466e-06, |
|
"loss": 1.1671, |
|
"step": 700 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1053, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 351, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2971277740408832e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|