|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0064516129032257, |
|
"eval_steps": 32, |
|
"global_step": 156, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0064516129032258064, |
|
"grad_norm": 1.8022076952539137, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.477, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0064516129032258064, |
|
"eval_loss": 1.3211307525634766, |
|
"eval_runtime": 61.2704, |
|
"eval_samples_per_second": 1.632, |
|
"eval_steps_per_second": 0.065, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012903225806451613, |
|
"grad_norm": 1.830164465281067, |
|
"learning_rate": 5e-06, |
|
"loss": 1.4299, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01935483870967742, |
|
"grad_norm": 1.7257186746629198, |
|
"learning_rate": 7.5e-06, |
|
"loss": 1.4482, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.025806451612903226, |
|
"grad_norm": 1.7165970873170038, |
|
"learning_rate": 1e-05, |
|
"loss": 1.3717, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03225806451612903, |
|
"grad_norm": 1.1486983653469711, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.4594, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03870967741935484, |
|
"grad_norm": 0.6868389172099673, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.3619, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04516129032258064, |
|
"grad_norm": 0.8100802487851951, |
|
"learning_rate": 1.75e-05, |
|
"loss": 1.1633, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05161290322580645, |
|
"grad_norm": 1.1308634221406137, |
|
"learning_rate": 2e-05, |
|
"loss": 1.43, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05806451612903226, |
|
"grad_norm": 1.1032001609285251, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.4583, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 0.8673535874558637, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3725, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07096774193548387, |
|
"grad_norm": 0.5856780577487628, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 1.2705, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07741935483870968, |
|
"grad_norm": 0.5407118925923696, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1978, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08387096774193549, |
|
"grad_norm": 21.5243092057336, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 1.2099, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09032258064516129, |
|
"grad_norm": 0.5975726437730494, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.3989, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0967741935483871, |
|
"grad_norm": 0.632641768960225, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.3262, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1032258064516129, |
|
"grad_norm": 0.5237190218830238, |
|
"learning_rate": 4e-05, |
|
"loss": 1.2497, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10967741935483871, |
|
"grad_norm": 0.532634703510973, |
|
"learning_rate": 4.25e-05, |
|
"loss": 1.2812, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11612903225806452, |
|
"grad_norm": 0.5230723189678446, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.3475, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12258064516129032, |
|
"grad_norm": 1.1606813044713529, |
|
"learning_rate": 4.75e-05, |
|
"loss": 1.372, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 0.4510232796621864, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4073, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13548387096774195, |
|
"grad_norm": 0.5205928609096978, |
|
"learning_rate": 4.9993997182511844e-05, |
|
"loss": 1.2543, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.14193548387096774, |
|
"grad_norm": 0.4663591464063973, |
|
"learning_rate": 4.9975991933053384e-05, |
|
"loss": 1.3796, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14838709677419354, |
|
"grad_norm": 0.5389896900391323, |
|
"learning_rate": 4.994599385893363e-05, |
|
"loss": 1.4438, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15483870967741936, |
|
"grad_norm": 0.5670034687541796, |
|
"learning_rate": 4.990401896663828e-05, |
|
"loss": 1.264, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 0.42454937922862174, |
|
"learning_rate": 4.985008965328888e-05, |
|
"loss": 1.2944, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16774193548387098, |
|
"grad_norm": 0.5194572848655961, |
|
"learning_rate": 4.9784234694692117e-05, |
|
"loss": 1.4043, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17419354838709677, |
|
"grad_norm": 0.5484146080851298, |
|
"learning_rate": 4.9706489229985524e-05, |
|
"loss": 1.4735, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18064516129032257, |
|
"grad_norm": 0.7784686106719038, |
|
"learning_rate": 4.961689474288779e-05, |
|
"loss": 1.3291, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1870967741935484, |
|
"grad_norm": 0.5710341199937055, |
|
"learning_rate": 4.9515499039563704e-05, |
|
"loss": 1.3314, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 0.4173913286385054, |
|
"learning_rate": 4.940235622311559e-05, |
|
"loss": 1.1272, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.371163408997987, |
|
"learning_rate": 4.9277526664714765e-05, |
|
"loss": 1.2884, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2064516129032258, |
|
"grad_norm": 0.6894186561505441, |
|
"learning_rate": 4.914107697138843e-05, |
|
"loss": 1.2338, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2064516129032258, |
|
"eval_loss": 1.115579605102539, |
|
"eval_runtime": 61.6455, |
|
"eval_samples_per_second": 1.622, |
|
"eval_steps_per_second": 0.065, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2129032258064516, |
|
"grad_norm": 0.44925759222205974, |
|
"learning_rate": 4.8993079950479305e-05, |
|
"loss": 1.1968, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.21935483870967742, |
|
"grad_norm": 0.4304874702516527, |
|
"learning_rate": 4.883361457079673e-05, |
|
"loss": 1.1706, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.22580645161290322, |
|
"grad_norm": 0.3646005763538535, |
|
"learning_rate": 4.8662765920480274e-05, |
|
"loss": 1.2714, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23225806451612904, |
|
"grad_norm": 0.3568923781906157, |
|
"learning_rate": 4.8480625161598e-05, |
|
"loss": 1.175, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23870967741935484, |
|
"grad_norm": 0.37794009122137584, |
|
"learning_rate": 4.8287289481503954e-05, |
|
"loss": 1.2415, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.24516129032258063, |
|
"grad_norm": 0.35188090283951096, |
|
"learning_rate": 4.808286204098047e-05, |
|
"loss": 1.3385, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.25161290322580643, |
|
"grad_norm": 0.3863530195041696, |
|
"learning_rate": 4.7867451919193346e-05, |
|
"loss": 1.2419, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 0.3746461776891942, |
|
"learning_rate": 4.764117405548891e-05, |
|
"loss": 1.2624, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2645161290322581, |
|
"grad_norm": 0.34991236825502203, |
|
"learning_rate": 4.740414918806425e-05, |
|
"loss": 1.307, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2709677419354839, |
|
"grad_norm": 0.4011683523629512, |
|
"learning_rate": 4.715650378954331e-05, |
|
"loss": 1.357, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.27741935483870966, |
|
"grad_norm": 0.3457706023597712, |
|
"learning_rate": 4.689836999949314e-05, |
|
"loss": 1.3757, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2838709677419355, |
|
"grad_norm": 0.2938536081434768, |
|
"learning_rate": 4.662988555391632e-05, |
|
"loss": 1.3486, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2903225806451613, |
|
"grad_norm": 0.33377012573731674, |
|
"learning_rate": 4.635119371175731e-05, |
|
"loss": 1.1417, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2967741935483871, |
|
"grad_norm": 0.5099149644951235, |
|
"learning_rate": 4.60624431784618e-05, |
|
"loss": 1.2184, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3032258064516129, |
|
"grad_norm": 0.4651969988328567, |
|
"learning_rate": 4.576378802662989e-05, |
|
"loss": 1.2518, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3096774193548387, |
|
"grad_norm": 0.3453144809110264, |
|
"learning_rate": 4.5455387613805396e-05, |
|
"loss": 1.4111, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3161290322580645, |
|
"grad_norm": 0.3338451350111794, |
|
"learning_rate": 4.513740649744536e-05, |
|
"loss": 1.2129, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.36687031408807524, |
|
"learning_rate": 4.4810014347114784e-05, |
|
"loss": 1.2703, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32903225806451614, |
|
"grad_norm": 0.38881877032764534, |
|
"learning_rate": 4.4473385853953693e-05, |
|
"loss": 1.2997, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.33548387096774196, |
|
"grad_norm": 0.34976100346234607, |
|
"learning_rate": 4.4127700637464834e-05, |
|
"loss": 1.0796, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3419354838709677, |
|
"grad_norm": 0.3696068140674995, |
|
"learning_rate": 4.3773143149671576e-05, |
|
"loss": 1.3098, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.34838709677419355, |
|
"grad_norm": 0.3168263761372888, |
|
"learning_rate": 4.340990257669732e-05, |
|
"loss": 1.2894, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3548387096774194, |
|
"grad_norm": 0.3442048604805361, |
|
"learning_rate": 4.303817273781886e-05, |
|
"loss": 1.3385, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36129032258064514, |
|
"grad_norm": 0.38003699951922426, |
|
"learning_rate": 4.2658151982047536e-05, |
|
"loss": 1.2548, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.36774193548387096, |
|
"grad_norm": 0.5065610455915895, |
|
"learning_rate": 4.2270043082293463e-05, |
|
"loss": 1.2496, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3741935483870968, |
|
"grad_norm": 0.3259412922642011, |
|
"learning_rate": 4.1874053127169126e-05, |
|
"loss": 1.1696, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.38064516129032255, |
|
"grad_norm": 0.33390560015814696, |
|
"learning_rate": 4.147039341049036e-05, |
|
"loss": 1.3276, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 0.3161062426181486, |
|
"learning_rate": 4.105927931853327e-05, |
|
"loss": 1.258, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3935483870967742, |
|
"grad_norm": 0.37939625630824414, |
|
"learning_rate": 4.0640930215107725e-05, |
|
"loss": 1.3119, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.3447410882628537, |
|
"learning_rate": 4.021556932450832e-05, |
|
"loss": 1.1337, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4064516129032258, |
|
"grad_norm": 0.33262503192755777, |
|
"learning_rate": 3.978342361240553e-05, |
|
"loss": 1.3684, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4129032258064516, |
|
"grad_norm": 0.4082167906853653, |
|
"learning_rate": 3.9344723664740506e-05, |
|
"loss": 1.1973, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.4129032258064516, |
|
"eval_loss": 1.0707180500030518, |
|
"eval_runtime": 62.1751, |
|
"eval_samples_per_second": 1.608, |
|
"eval_steps_per_second": 0.064, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.41935483870967744, |
|
"grad_norm": 0.42898014496026954, |
|
"learning_rate": 3.8899703564688187e-05, |
|
"loss": 1.3098, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4258064516129032, |
|
"grad_norm": 0.3729718619879595, |
|
"learning_rate": 3.8448600767754265e-05, |
|
"loss": 1.3267, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.432258064516129, |
|
"grad_norm": 0.5652836221912215, |
|
"learning_rate": 3.7991655975072834e-05, |
|
"loss": 1.3008, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.43870967741935485, |
|
"grad_norm": 0.3611571783806379, |
|
"learning_rate": 3.752911300497212e-05, |
|
"loss": 1.2365, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.44516129032258067, |
|
"grad_norm": 0.4101622999668487, |
|
"learning_rate": 3.706121866287699e-05, |
|
"loss": 1.2805, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 0.4194502800160711, |
|
"learning_rate": 3.658822260961763e-05, |
|
"loss": 1.2627, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.45806451612903226, |
|
"grad_norm": 0.4464572963409143, |
|
"learning_rate": 3.611037722821452e-05, |
|
"loss": 1.3269, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4645161290322581, |
|
"grad_norm": 0.43900384749780696, |
|
"learning_rate": 3.562793748921095e-05, |
|
"loss": 1.0625, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.47096774193548385, |
|
"grad_norm": 0.3492561062627179, |
|
"learning_rate": 3.514116081462488e-05, |
|
"loss": 1.2854, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4774193548387097, |
|
"grad_norm": 1.004303081481083, |
|
"learning_rate": 3.4650306940592784e-05, |
|
"loss": 1.3114, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 0.372149762179685, |
|
"learning_rate": 3.415563777877859e-05, |
|
"loss": 1.1604, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.49032258064516127, |
|
"grad_norm": 0.36620109818968666, |
|
"learning_rate": 3.365741727662187e-05, |
|
"loss": 1.2055, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4967741935483871, |
|
"grad_norm": 0.3209403988829257, |
|
"learning_rate": 3.315591127649981e-05, |
|
"loss": 1.2652, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5032258064516129, |
|
"grad_norm": 0.6268869630058581, |
|
"learning_rate": 3.265138737387802e-05, |
|
"loss": 1.3451, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5096774193548387, |
|
"grad_norm": 0.37710251621094776, |
|
"learning_rate": 3.214411477452589e-05, |
|
"loss": 1.1998, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 0.3965119239115867, |
|
"learning_rate": 3.1634364150872836e-05, |
|
"loss": 1.198, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5225806451612903, |
|
"grad_norm": 0.38914331784636286, |
|
"learning_rate": 3.112240749758179e-05, |
|
"loss": 1.3164, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5290322580645161, |
|
"grad_norm": 0.4854967858248665, |
|
"learning_rate": 3.060851798641735e-05, |
|
"loss": 1.1669, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.535483870967742, |
|
"grad_norm": 0.4486571105935308, |
|
"learning_rate": 3.00929698204857e-05, |
|
"loss": 1.3611, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5419354838709678, |
|
"grad_norm": 0.5816885351466946, |
|
"learning_rate": 2.9576038087924297e-05, |
|
"loss": 1.2272, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5483870967741935, |
|
"grad_norm": 0.3242743003758612, |
|
"learning_rate": 2.905799861511932e-05, |
|
"loss": 1.1925, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5548387096774193, |
|
"grad_norm": 0.3110545851314829, |
|
"learning_rate": 2.8539127819529143e-05, |
|
"loss": 0.9746, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5612903225806452, |
|
"grad_norm": 0.3102061641971853, |
|
"learning_rate": 2.801970256219253e-05, |
|
"loss": 1.352, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.567741935483871, |
|
"grad_norm": 0.30361763618294724, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 1.2039, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5741935483870968, |
|
"grad_norm": 0.5030242942383549, |
|
"learning_rate": 2.698029743780748e-05, |
|
"loss": 1.2757, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 0.5902079797954521, |
|
"learning_rate": 2.6460872180470865e-05, |
|
"loss": 1.1542, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5870967741935483, |
|
"grad_norm": 0.4650188539079032, |
|
"learning_rate": 2.594200138488069e-05, |
|
"loss": 1.1455, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5935483870967742, |
|
"grad_norm": 0.6953375177526994, |
|
"learning_rate": 2.5423961912075712e-05, |
|
"loss": 1.2476, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.324295911329268, |
|
"learning_rate": 2.4907030179514307e-05, |
|
"loss": 1.0578, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6064516129032258, |
|
"grad_norm": 0.36056444973850205, |
|
"learning_rate": 2.4391482013582657e-05, |
|
"loss": 1.3128, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6129032258064516, |
|
"grad_norm": 0.31638336845784404, |
|
"learning_rate": 2.387759250241821e-05, |
|
"loss": 1.1412, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6193548387096774, |
|
"grad_norm": 0.3807737813278726, |
|
"learning_rate": 2.3365635849127166e-05, |
|
"loss": 1.301, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6193548387096774, |
|
"eval_loss": 1.0401562452316284, |
|
"eval_runtime": 62.5349, |
|
"eval_samples_per_second": 1.599, |
|
"eval_steps_per_second": 0.064, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6258064516129033, |
|
"grad_norm": 0.36219529568521813, |
|
"learning_rate": 2.285588522547411e-05, |
|
"loss": 1.2681, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.632258064516129, |
|
"grad_norm": 0.4601161674119361, |
|
"learning_rate": 2.234861262612199e-05, |
|
"loss": 1.2387, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6387096774193548, |
|
"grad_norm": 0.6207212832715766, |
|
"learning_rate": 2.184408872350019e-05, |
|
"loss": 1.2087, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.3655891991096712, |
|
"learning_rate": 2.134258272337814e-05, |
|
"loss": 1.2769, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6516129032258065, |
|
"grad_norm": 0.4394265602792923, |
|
"learning_rate": 2.084436222122142e-05, |
|
"loss": 1.0799, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6580645161290323, |
|
"grad_norm": 0.5059663574517834, |
|
"learning_rate": 2.0349693059407215e-05, |
|
"loss": 1.0953, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6645161290322581, |
|
"grad_norm": 0.34732606007316424, |
|
"learning_rate": 1.9858839185375123e-05, |
|
"loss": 1.224, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6709677419354839, |
|
"grad_norm": 0.5464551769086812, |
|
"learning_rate": 1.9372062510789063e-05, |
|
"loss": 1.2413, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6774193548387096, |
|
"grad_norm": 0.977742231459624, |
|
"learning_rate": 1.888962277178548e-05, |
|
"loss": 1.2118, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6838709677419355, |
|
"grad_norm": 2.537109489591264, |
|
"learning_rate": 1.8411777390382367e-05, |
|
"loss": 1.2513, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6903225806451613, |
|
"grad_norm": 0.35948844839880034, |
|
"learning_rate": 1.7938781337123016e-05, |
|
"loss": 1.1404, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6967741935483871, |
|
"grad_norm": 0.457105884170092, |
|
"learning_rate": 1.747088699502789e-05, |
|
"loss": 1.1514, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7032258064516129, |
|
"grad_norm": 1.1486002566265734, |
|
"learning_rate": 1.7008344024927168e-05, |
|
"loss": 1.3249, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 0.36043342663778255, |
|
"learning_rate": 1.6551399232245737e-05, |
|
"loss": 1.1239, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7161290322580646, |
|
"grad_norm": 0.46594876338109426, |
|
"learning_rate": 1.610029643531182e-05, |
|
"loss": 1.2918, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7225806451612903, |
|
"grad_norm": 0.32990660251070025, |
|
"learning_rate": 1.5655276335259493e-05, |
|
"loss": 1.2266, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7290322580645161, |
|
"grad_norm": 0.30010478660077256, |
|
"learning_rate": 1.5216576387594481e-05, |
|
"loss": 1.2114, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7354838709677419, |
|
"grad_norm": 0.49532244626831723, |
|
"learning_rate": 1.4784430675491685e-05, |
|
"loss": 1.2457, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7419354838709677, |
|
"grad_norm": 0.5191609185311767, |
|
"learning_rate": 1.4359069784892282e-05, |
|
"loss": 1.2862, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7483870967741936, |
|
"grad_norm": 0.3826327354484767, |
|
"learning_rate": 1.3940720681466734e-05, |
|
"loss": 1.1351, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7548387096774194, |
|
"grad_norm": 0.330074625162551, |
|
"learning_rate": 1.3529606589509647e-05, |
|
"loss": 1.1871, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7612903225806451, |
|
"grad_norm": 0.34233269430078184, |
|
"learning_rate": 1.3125946872830877e-05, |
|
"loss": 1.1411, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7677419354838709, |
|
"grad_norm": 0.31326296304705775, |
|
"learning_rate": 1.2729956917706545e-05, |
|
"loss": 1.2387, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 0.3176809107580838, |
|
"learning_rate": 1.2341848017952464e-05, |
|
"loss": 1.2451, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7806451612903226, |
|
"grad_norm": 0.31420402228609556, |
|
"learning_rate": 1.1961827262181141e-05, |
|
"loss": 1.1766, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7870967741935484, |
|
"grad_norm": 0.4637761844099348, |
|
"learning_rate": 1.1590097423302684e-05, |
|
"loss": 1.1542, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7935483870967742, |
|
"grad_norm": 0.36159367839677437, |
|
"learning_rate": 1.1226856850328434e-05, |
|
"loss": 1.3127, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5010806704980222, |
|
"learning_rate": 1.0872299362535173e-05, |
|
"loss": 1.2729, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 0.3461696613483525, |
|
"learning_rate": 1.0526614146046312e-05, |
|
"loss": 1.2425, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8129032258064516, |
|
"grad_norm": 0.35751217338851793, |
|
"learning_rate": 1.0189985652885225e-05, |
|
"loss": 1.2222, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8193548387096774, |
|
"grad_norm": 0.43059544412165696, |
|
"learning_rate": 9.862593502554648e-06, |
|
"loss": 1.1938, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8258064516129032, |
|
"grad_norm": 0.7260092938036656, |
|
"learning_rate": 9.544612386194612e-06, |
|
"loss": 1.1063, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8258064516129032, |
|
"eval_loss": 1.0231536626815796, |
|
"eval_runtime": 62.2556, |
|
"eval_samples_per_second": 1.606, |
|
"eval_steps_per_second": 0.064, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.832258064516129, |
|
"grad_norm": 0.2930692349121967, |
|
"learning_rate": 9.236211973370124e-06, |
|
"loss": 1.2804, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 0.3514011035647982, |
|
"learning_rate": 8.937556821538201e-06, |
|
"loss": 1.3527, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8451612903225807, |
|
"grad_norm": 0.3509271601664881, |
|
"learning_rate": 8.64880628824269e-06, |
|
"loss": 1.2336, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8516129032258064, |
|
"grad_norm": 0.369286535470622, |
|
"learning_rate": 8.370114446083686e-06, |
|
"loss": 1.2204, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8580645161290322, |
|
"grad_norm": 0.3376899684032205, |
|
"learning_rate": 8.101630000506864e-06, |
|
"loss": 1.114, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.864516129032258, |
|
"grad_norm": 0.34528372468606205, |
|
"learning_rate": 7.843496210456687e-06, |
|
"loss": 1.2915, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8709677419354839, |
|
"grad_norm": 0.3271748537414322, |
|
"learning_rate": 7.595850811935759e-06, |
|
"loss": 1.2242, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8774193548387097, |
|
"grad_norm": 0.34552044795509895, |
|
"learning_rate": 7.358825944511101e-06, |
|
"loss": 1.2238, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8838709677419355, |
|
"grad_norm": 0.3645405834936748, |
|
"learning_rate": 7.132548080806653e-06, |
|
"loss": 1.1925, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8903225806451613, |
|
"grad_norm": 0.39117823625181364, |
|
"learning_rate": 6.917137959019528e-06, |
|
"loss": 1.1295, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.896774193548387, |
|
"grad_norm": 0.3256830351093455, |
|
"learning_rate": 6.712710518496049e-06, |
|
"loss": 1.2506, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 0.4262467981624931, |
|
"learning_rate": 6.519374838401997e-06, |
|
"loss": 1.1759, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9096774193548387, |
|
"grad_norm": 0.35503437951993716, |
|
"learning_rate": 6.337234079519728e-06, |
|
"loss": 1.1777, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9161290322580645, |
|
"grad_norm": 0.3897540509188695, |
|
"learning_rate": 6.166385429203269e-06, |
|
"loss": 1.1239, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9225806451612903, |
|
"grad_norm": 0.36016445939620884, |
|
"learning_rate": 6.006920049520701e-06, |
|
"loss": 1.2692, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9290322580645162, |
|
"grad_norm": 0.4413576798023392, |
|
"learning_rate": 5.858923028611572e-06, |
|
"loss": 1.1879, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9354838709677419, |
|
"grad_norm": 0.37955599088497055, |
|
"learning_rate": 5.722473335285244e-06, |
|
"loss": 1.205, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9419354838709677, |
|
"grad_norm": 0.35919500181972724, |
|
"learning_rate": 5.597643776884412e-06, |
|
"loss": 1.1617, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9483870967741935, |
|
"grad_norm": 0.3022686971058462, |
|
"learning_rate": 5.4845009604363e-06, |
|
"loss": 1.2059, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9548387096774194, |
|
"grad_norm": 0.30291369490101205, |
|
"learning_rate": 5.38310525711221e-06, |
|
"loss": 1.2672, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9612903225806452, |
|
"grad_norm": 0.33599320279905975, |
|
"learning_rate": 5.293510770014475e-06, |
|
"loss": 1.2755, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 0.2903929279243622, |
|
"learning_rate": 5.215765305307886e-06, |
|
"loss": 1.1675, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9741935483870968, |
|
"grad_norm": 0.3305110382050327, |
|
"learning_rate": 5.149910346711126e-06, |
|
"loss": 1.2342, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9806451612903225, |
|
"grad_norm": 0.33304378208594904, |
|
"learning_rate": 5.095981033361725e-06, |
|
"loss": 1.1312, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9870967741935484, |
|
"grad_norm": 0.3479102720763047, |
|
"learning_rate": 5.05400614106637e-06, |
|
"loss": 1.1753, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9935483870967742, |
|
"grad_norm": 0.31384042987234395, |
|
"learning_rate": 5.024008066946621e-06, |
|
"loss": 1.2077, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5248637716000059, |
|
"learning_rate": 5.006002817488162e-06, |
|
"loss": 1.1639, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0064516129032257, |
|
"grad_norm": 0.359683648131272, |
|
"learning_rate": 5e-06, |
|
"loss": 1.2093, |
|
"step": 156 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 156, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 32, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 319408664739840.0, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|