{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0064516129032257,
"eval_steps": 32,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0064516129032258064,
"grad_norm": 1.8022076952539137,
"learning_rate": 2.5e-06,
"loss": 1.477,
"step": 1
},
{
"epoch": 0.0064516129032258064,
"eval_loss": 1.3211307525634766,
"eval_runtime": 61.2704,
"eval_samples_per_second": 1.632,
"eval_steps_per_second": 0.065,
"step": 1
},
{
"epoch": 0.012903225806451613,
"grad_norm": 1.830164465281067,
"learning_rate": 5e-06,
"loss": 1.4299,
"step": 2
},
{
"epoch": 0.01935483870967742,
"grad_norm": 1.7257186746629198,
"learning_rate": 7.5e-06,
"loss": 1.4482,
"step": 3
},
{
"epoch": 0.025806451612903226,
"grad_norm": 1.7165970873170038,
"learning_rate": 1e-05,
"loss": 1.3717,
"step": 4
},
{
"epoch": 0.03225806451612903,
"grad_norm": 1.1486983653469711,
"learning_rate": 1.25e-05,
"loss": 1.4594,
"step": 5
},
{
"epoch": 0.03870967741935484,
"grad_norm": 0.6868389172099673,
"learning_rate": 1.5e-05,
"loss": 1.3619,
"step": 6
},
{
"epoch": 0.04516129032258064,
"grad_norm": 0.8100802487851951,
"learning_rate": 1.75e-05,
"loss": 1.1633,
"step": 7
},
{
"epoch": 0.05161290322580645,
"grad_norm": 1.1308634221406137,
"learning_rate": 2e-05,
"loss": 1.43,
"step": 8
},
{
"epoch": 0.05806451612903226,
"grad_norm": 1.1032001609285251,
"learning_rate": 2.25e-05,
"loss": 1.4583,
"step": 9
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.8673535874558637,
"learning_rate": 2.5e-05,
"loss": 1.3725,
"step": 10
},
{
"epoch": 0.07096774193548387,
"grad_norm": 0.5856780577487628,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.2705,
"step": 11
},
{
"epoch": 0.07741935483870968,
"grad_norm": 0.5407118925923696,
"learning_rate": 3e-05,
"loss": 1.1978,
"step": 12
},
{
"epoch": 0.08387096774193549,
"grad_norm": 21.5243092057336,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.2099,
"step": 13
},
{
"epoch": 0.09032258064516129,
"grad_norm": 0.5975726437730494,
"learning_rate": 3.5e-05,
"loss": 1.3989,
"step": 14
},
{
"epoch": 0.0967741935483871,
"grad_norm": 0.632641768960225,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.3262,
"step": 15
},
{
"epoch": 0.1032258064516129,
"grad_norm": 0.5237190218830238,
"learning_rate": 4e-05,
"loss": 1.2497,
"step": 16
},
{
"epoch": 0.10967741935483871,
"grad_norm": 0.532634703510973,
"learning_rate": 4.25e-05,
"loss": 1.2812,
"step": 17
},
{
"epoch": 0.11612903225806452,
"grad_norm": 0.5230723189678446,
"learning_rate": 4.5e-05,
"loss": 1.3475,
"step": 18
},
{
"epoch": 0.12258064516129032,
"grad_norm": 1.1606813044713529,
"learning_rate": 4.75e-05,
"loss": 1.372,
"step": 19
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.4510232796621864,
"learning_rate": 5e-05,
"loss": 1.4073,
"step": 20
},
{
"epoch": 0.13548387096774195,
"grad_norm": 0.5205928609096978,
"learning_rate": 4.9993997182511844e-05,
"loss": 1.2543,
"step": 21
},
{
"epoch": 0.14193548387096774,
"grad_norm": 0.4663591464063973,
"learning_rate": 4.9975991933053384e-05,
"loss": 1.3796,
"step": 22
},
{
"epoch": 0.14838709677419354,
"grad_norm": 0.5389896900391323,
"learning_rate": 4.994599385893363e-05,
"loss": 1.4438,
"step": 23
},
{
"epoch": 0.15483870967741936,
"grad_norm": 0.5670034687541796,
"learning_rate": 4.990401896663828e-05,
"loss": 1.264,
"step": 24
},
{
"epoch": 0.16129032258064516,
"grad_norm": 0.42454937922862174,
"learning_rate": 4.985008965328888e-05,
"loss": 1.2944,
"step": 25
},
{
"epoch": 0.16774193548387098,
"grad_norm": 0.5194572848655961,
"learning_rate": 4.9784234694692117e-05,
"loss": 1.4043,
"step": 26
},
{
"epoch": 0.17419354838709677,
"grad_norm": 0.5484146080851298,
"learning_rate": 4.9706489229985524e-05,
"loss": 1.4735,
"step": 27
},
{
"epoch": 0.18064516129032257,
"grad_norm": 0.7784686106719038,
"learning_rate": 4.961689474288779e-05,
"loss": 1.3291,
"step": 28
},
{
"epoch": 0.1870967741935484,
"grad_norm": 0.5710341199937055,
"learning_rate": 4.9515499039563704e-05,
"loss": 1.3314,
"step": 29
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.4173913286385054,
"learning_rate": 4.940235622311559e-05,
"loss": 1.1272,
"step": 30
},
{
"epoch": 0.2,
"grad_norm": 0.371163408997987,
"learning_rate": 4.9277526664714765e-05,
"loss": 1.2884,
"step": 31
},
{
"epoch": 0.2064516129032258,
"grad_norm": 0.6894186561505441,
"learning_rate": 4.914107697138843e-05,
"loss": 1.2338,
"step": 32
},
{
"epoch": 0.2064516129032258,
"eval_loss": 1.115579605102539,
"eval_runtime": 61.6455,
"eval_samples_per_second": 1.622,
"eval_steps_per_second": 0.065,
"step": 32
},
{
"epoch": 0.2129032258064516,
"grad_norm": 0.44925759222205974,
"learning_rate": 4.8993079950479305e-05,
"loss": 1.1968,
"step": 33
},
{
"epoch": 0.21935483870967742,
"grad_norm": 0.4304874702516527,
"learning_rate": 4.883361457079673e-05,
"loss": 1.1706,
"step": 34
},
{
"epoch": 0.22580645161290322,
"grad_norm": 0.3646005763538535,
"learning_rate": 4.8662765920480274e-05,
"loss": 1.2714,
"step": 35
},
{
"epoch": 0.23225806451612904,
"grad_norm": 0.3568923781906157,
"learning_rate": 4.8480625161598e-05,
"loss": 1.175,
"step": 36
},
{
"epoch": 0.23870967741935484,
"grad_norm": 0.37794009122137584,
"learning_rate": 4.8287289481503954e-05,
"loss": 1.2415,
"step": 37
},
{
"epoch": 0.24516129032258063,
"grad_norm": 0.35188090283951096,
"learning_rate": 4.808286204098047e-05,
"loss": 1.3385,
"step": 38
},
{
"epoch": 0.25161290322580643,
"grad_norm": 0.3863530195041696,
"learning_rate": 4.7867451919193346e-05,
"loss": 1.2419,
"step": 39
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.3746461776891942,
"learning_rate": 4.764117405548891e-05,
"loss": 1.2624,
"step": 40
},
{
"epoch": 0.2645161290322581,
"grad_norm": 0.34991236825502203,
"learning_rate": 4.740414918806425e-05,
"loss": 1.307,
"step": 41
},
{
"epoch": 0.2709677419354839,
"grad_norm": 0.4011683523629512,
"learning_rate": 4.715650378954331e-05,
"loss": 1.357,
"step": 42
},
{
"epoch": 0.27741935483870966,
"grad_norm": 0.3457706023597712,
"learning_rate": 4.689836999949314e-05,
"loss": 1.3757,
"step": 43
},
{
"epoch": 0.2838709677419355,
"grad_norm": 0.2938536081434768,
"learning_rate": 4.662988555391632e-05,
"loss": 1.3486,
"step": 44
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.33377012573731674,
"learning_rate": 4.635119371175731e-05,
"loss": 1.1417,
"step": 45
},
{
"epoch": 0.2967741935483871,
"grad_norm": 0.5099149644951235,
"learning_rate": 4.60624431784618e-05,
"loss": 1.2184,
"step": 46
},
{
"epoch": 0.3032258064516129,
"grad_norm": 0.4651969988328567,
"learning_rate": 4.576378802662989e-05,
"loss": 1.2518,
"step": 47
},
{
"epoch": 0.3096774193548387,
"grad_norm": 0.3453144809110264,
"learning_rate": 4.5455387613805396e-05,
"loss": 1.4111,
"step": 48
},
{
"epoch": 0.3161290322580645,
"grad_norm": 0.3338451350111794,
"learning_rate": 4.513740649744536e-05,
"loss": 1.2129,
"step": 49
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.36687031408807524,
"learning_rate": 4.4810014347114784e-05,
"loss": 1.2703,
"step": 50
},
{
"epoch": 0.32903225806451614,
"grad_norm": 0.38881877032764534,
"learning_rate": 4.4473385853953693e-05,
"loss": 1.2997,
"step": 51
},
{
"epoch": 0.33548387096774196,
"grad_norm": 0.34976100346234607,
"learning_rate": 4.4127700637464834e-05,
"loss": 1.0796,
"step": 52
},
{
"epoch": 0.3419354838709677,
"grad_norm": 0.3696068140674995,
"learning_rate": 4.3773143149671576e-05,
"loss": 1.3098,
"step": 53
},
{
"epoch": 0.34838709677419355,
"grad_norm": 0.3168263761372888,
"learning_rate": 4.340990257669732e-05,
"loss": 1.2894,
"step": 54
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.3442048604805361,
"learning_rate": 4.303817273781886e-05,
"loss": 1.3385,
"step": 55
},
{
"epoch": 0.36129032258064514,
"grad_norm": 0.38003699951922426,
"learning_rate": 4.2658151982047536e-05,
"loss": 1.2548,
"step": 56
},
{
"epoch": 0.36774193548387096,
"grad_norm": 0.5065610455915895,
"learning_rate": 4.2270043082293463e-05,
"loss": 1.2496,
"step": 57
},
{
"epoch": 0.3741935483870968,
"grad_norm": 0.3259412922642011,
"learning_rate": 4.1874053127169126e-05,
"loss": 1.1696,
"step": 58
},
{
"epoch": 0.38064516129032255,
"grad_norm": 0.33390560015814696,
"learning_rate": 4.147039341049036e-05,
"loss": 1.3276,
"step": 59
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.3161062426181486,
"learning_rate": 4.105927931853327e-05,
"loss": 1.258,
"step": 60
},
{
"epoch": 0.3935483870967742,
"grad_norm": 0.37939625630824414,
"learning_rate": 4.0640930215107725e-05,
"loss": 1.3119,
"step": 61
},
{
"epoch": 0.4,
"grad_norm": 0.3447410882628537,
"learning_rate": 4.021556932450832e-05,
"loss": 1.1337,
"step": 62
},
{
"epoch": 0.4064516129032258,
"grad_norm": 0.33262503192755777,
"learning_rate": 3.978342361240553e-05,
"loss": 1.3684,
"step": 63
},
{
"epoch": 0.4129032258064516,
"grad_norm": 0.4082167906853653,
"learning_rate": 3.9344723664740506e-05,
"loss": 1.1973,
"step": 64
},
{
"epoch": 0.4129032258064516,
"eval_loss": 1.0707180500030518,
"eval_runtime": 62.1751,
"eval_samples_per_second": 1.608,
"eval_steps_per_second": 0.064,
"step": 64
},
{
"epoch": 0.41935483870967744,
"grad_norm": 0.42898014496026954,
"learning_rate": 3.8899703564688187e-05,
"loss": 1.3098,
"step": 65
},
{
"epoch": 0.4258064516129032,
"grad_norm": 0.3729718619879595,
"learning_rate": 3.8448600767754265e-05,
"loss": 1.3267,
"step": 66
},
{
"epoch": 0.432258064516129,
"grad_norm": 0.5652836221912215,
"learning_rate": 3.7991655975072834e-05,
"loss": 1.3008,
"step": 67
},
{
"epoch": 0.43870967741935485,
"grad_norm": 0.3611571783806379,
"learning_rate": 3.752911300497212e-05,
"loss": 1.2365,
"step": 68
},
{
"epoch": 0.44516129032258067,
"grad_norm": 0.4101622999668487,
"learning_rate": 3.706121866287699e-05,
"loss": 1.2805,
"step": 69
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.4194502800160711,
"learning_rate": 3.658822260961763e-05,
"loss": 1.2627,
"step": 70
},
{
"epoch": 0.45806451612903226,
"grad_norm": 0.4464572963409143,
"learning_rate": 3.611037722821452e-05,
"loss": 1.3269,
"step": 71
},
{
"epoch": 0.4645161290322581,
"grad_norm": 0.43900384749780696,
"learning_rate": 3.562793748921095e-05,
"loss": 1.0625,
"step": 72
},
{
"epoch": 0.47096774193548385,
"grad_norm": 0.3492561062627179,
"learning_rate": 3.514116081462488e-05,
"loss": 1.2854,
"step": 73
},
{
"epoch": 0.4774193548387097,
"grad_norm": 1.004303081481083,
"learning_rate": 3.4650306940592784e-05,
"loss": 1.3114,
"step": 74
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.372149762179685,
"learning_rate": 3.415563777877859e-05,
"loss": 1.1604,
"step": 75
},
{
"epoch": 0.49032258064516127,
"grad_norm": 0.36620109818968666,
"learning_rate": 3.365741727662187e-05,
"loss": 1.2055,
"step": 76
},
{
"epoch": 0.4967741935483871,
"grad_norm": 0.3209403988829257,
"learning_rate": 3.315591127649981e-05,
"loss": 1.2652,
"step": 77
},
{
"epoch": 0.5032258064516129,
"grad_norm": 0.6268869630058581,
"learning_rate": 3.265138737387802e-05,
"loss": 1.3451,
"step": 78
},
{
"epoch": 0.5096774193548387,
"grad_norm": 0.37710251621094776,
"learning_rate": 3.214411477452589e-05,
"loss": 1.1998,
"step": 79
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.3965119239115867,
"learning_rate": 3.1634364150872836e-05,
"loss": 1.198,
"step": 80
},
{
"epoch": 0.5225806451612903,
"grad_norm": 0.38914331784636286,
"learning_rate": 3.112240749758179e-05,
"loss": 1.3164,
"step": 81
},
{
"epoch": 0.5290322580645161,
"grad_norm": 0.4854967858248665,
"learning_rate": 3.060851798641735e-05,
"loss": 1.1669,
"step": 82
},
{
"epoch": 0.535483870967742,
"grad_norm": 0.4486571105935308,
"learning_rate": 3.00929698204857e-05,
"loss": 1.3611,
"step": 83
},
{
"epoch": 0.5419354838709678,
"grad_norm": 0.5816885351466946,
"learning_rate": 2.9576038087924297e-05,
"loss": 1.2272,
"step": 84
},
{
"epoch": 0.5483870967741935,
"grad_norm": 0.3242743003758612,
"learning_rate": 2.905799861511932e-05,
"loss": 1.1925,
"step": 85
},
{
"epoch": 0.5548387096774193,
"grad_norm": 0.3110545851314829,
"learning_rate": 2.8539127819529143e-05,
"loss": 0.9746,
"step": 86
},
{
"epoch": 0.5612903225806452,
"grad_norm": 0.3102061641971853,
"learning_rate": 2.801970256219253e-05,
"loss": 1.352,
"step": 87
},
{
"epoch": 0.567741935483871,
"grad_norm": 0.30361763618294724,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.2039,
"step": 88
},
{
"epoch": 0.5741935483870968,
"grad_norm": 0.5030242942383549,
"learning_rate": 2.698029743780748e-05,
"loss": 1.2757,
"step": 89
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.5902079797954521,
"learning_rate": 2.6460872180470865e-05,
"loss": 1.1542,
"step": 90
},
{
"epoch": 0.5870967741935483,
"grad_norm": 0.4650188539079032,
"learning_rate": 2.594200138488069e-05,
"loss": 1.1455,
"step": 91
},
{
"epoch": 0.5935483870967742,
"grad_norm": 0.6953375177526994,
"learning_rate": 2.5423961912075712e-05,
"loss": 1.2476,
"step": 92
},
{
"epoch": 0.6,
"grad_norm": 0.324295911329268,
"learning_rate": 2.4907030179514307e-05,
"loss": 1.0578,
"step": 93
},
{
"epoch": 0.6064516129032258,
"grad_norm": 0.36056444973850205,
"learning_rate": 2.4391482013582657e-05,
"loss": 1.3128,
"step": 94
},
{
"epoch": 0.6129032258064516,
"grad_norm": 0.31638336845784404,
"learning_rate": 2.387759250241821e-05,
"loss": 1.1412,
"step": 95
},
{
"epoch": 0.6193548387096774,
"grad_norm": 0.3807737813278726,
"learning_rate": 2.3365635849127166e-05,
"loss": 1.301,
"step": 96
},
{
"epoch": 0.6193548387096774,
"eval_loss": 1.0401562452316284,
"eval_runtime": 62.5349,
"eval_samples_per_second": 1.599,
"eval_steps_per_second": 0.064,
"step": 96
},
{
"epoch": 0.6258064516129033,
"grad_norm": 0.36219529568521813,
"learning_rate": 2.285588522547411e-05,
"loss": 1.2681,
"step": 97
},
{
"epoch": 0.632258064516129,
"grad_norm": 0.4601161674119361,
"learning_rate": 2.234861262612199e-05,
"loss": 1.2387,
"step": 98
},
{
"epoch": 0.6387096774193548,
"grad_norm": 0.6207212832715766,
"learning_rate": 2.184408872350019e-05,
"loss": 1.2087,
"step": 99
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.3655891991096712,
"learning_rate": 2.134258272337814e-05,
"loss": 1.2769,
"step": 100
},
{
"epoch": 0.6516129032258065,
"grad_norm": 0.4394265602792923,
"learning_rate": 2.084436222122142e-05,
"loss": 1.0799,
"step": 101
},
{
"epoch": 0.6580645161290323,
"grad_norm": 0.5059663574517834,
"learning_rate": 2.0349693059407215e-05,
"loss": 1.0953,
"step": 102
},
{
"epoch": 0.6645161290322581,
"grad_norm": 0.34732606007316424,
"learning_rate": 1.9858839185375123e-05,
"loss": 1.224,
"step": 103
},
{
"epoch": 0.6709677419354839,
"grad_norm": 0.5464551769086812,
"learning_rate": 1.9372062510789063e-05,
"loss": 1.2413,
"step": 104
},
{
"epoch": 0.6774193548387096,
"grad_norm": 0.977742231459624,
"learning_rate": 1.888962277178548e-05,
"loss": 1.2118,
"step": 105
},
{
"epoch": 0.6838709677419355,
"grad_norm": 2.537109489591264,
"learning_rate": 1.8411777390382367e-05,
"loss": 1.2513,
"step": 106
},
{
"epoch": 0.6903225806451613,
"grad_norm": 0.35948844839880034,
"learning_rate": 1.7938781337123016e-05,
"loss": 1.1404,
"step": 107
},
{
"epoch": 0.6967741935483871,
"grad_norm": 0.457105884170092,
"learning_rate": 1.747088699502789e-05,
"loss": 1.1514,
"step": 108
},
{
"epoch": 0.7032258064516129,
"grad_norm": 1.1486002566265734,
"learning_rate": 1.7008344024927168e-05,
"loss": 1.3249,
"step": 109
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.36043342663778255,
"learning_rate": 1.6551399232245737e-05,
"loss": 1.1239,
"step": 110
},
{
"epoch": 0.7161290322580646,
"grad_norm": 0.46594876338109426,
"learning_rate": 1.610029643531182e-05,
"loss": 1.2918,
"step": 111
},
{
"epoch": 0.7225806451612903,
"grad_norm": 0.32990660251070025,
"learning_rate": 1.5655276335259493e-05,
"loss": 1.2266,
"step": 112
},
{
"epoch": 0.7290322580645161,
"grad_norm": 0.30010478660077256,
"learning_rate": 1.5216576387594481e-05,
"loss": 1.2114,
"step": 113
},
{
"epoch": 0.7354838709677419,
"grad_norm": 0.49532244626831723,
"learning_rate": 1.4784430675491685e-05,
"loss": 1.2457,
"step": 114
},
{
"epoch": 0.7419354838709677,
"grad_norm": 0.5191609185311767,
"learning_rate": 1.4359069784892282e-05,
"loss": 1.2862,
"step": 115
},
{
"epoch": 0.7483870967741936,
"grad_norm": 0.3826327354484767,
"learning_rate": 1.3940720681466734e-05,
"loss": 1.1351,
"step": 116
},
{
"epoch": 0.7548387096774194,
"grad_norm": 0.330074625162551,
"learning_rate": 1.3529606589509647e-05,
"loss": 1.1871,
"step": 117
},
{
"epoch": 0.7612903225806451,
"grad_norm": 0.34233269430078184,
"learning_rate": 1.3125946872830877e-05,
"loss": 1.1411,
"step": 118
},
{
"epoch": 0.7677419354838709,
"grad_norm": 0.31326296304705775,
"learning_rate": 1.2729956917706545e-05,
"loss": 1.2387,
"step": 119
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.3176809107580838,
"learning_rate": 1.2341848017952464e-05,
"loss": 1.2451,
"step": 120
},
{
"epoch": 0.7806451612903226,
"grad_norm": 0.31420402228609556,
"learning_rate": 1.1961827262181141e-05,
"loss": 1.1766,
"step": 121
},
{
"epoch": 0.7870967741935484,
"grad_norm": 0.4637761844099348,
"learning_rate": 1.1590097423302684e-05,
"loss": 1.1542,
"step": 122
},
{
"epoch": 0.7935483870967742,
"grad_norm": 0.36159367839677437,
"learning_rate": 1.1226856850328434e-05,
"loss": 1.3127,
"step": 123
},
{
"epoch": 0.8,
"grad_norm": 0.5010806704980222,
"learning_rate": 1.0872299362535173e-05,
"loss": 1.2729,
"step": 124
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.3461696613483525,
"learning_rate": 1.0526614146046312e-05,
"loss": 1.2425,
"step": 125
},
{
"epoch": 0.8129032258064516,
"grad_norm": 0.35751217338851793,
"learning_rate": 1.0189985652885225e-05,
"loss": 1.2222,
"step": 126
},
{
"epoch": 0.8193548387096774,
"grad_norm": 0.43059544412165696,
"learning_rate": 9.862593502554648e-06,
"loss": 1.1938,
"step": 127
},
{
"epoch": 0.8258064516129032,
"grad_norm": 0.7260092938036656,
"learning_rate": 9.544612386194612e-06,
"loss": 1.1063,
"step": 128
},
{
"epoch": 0.8258064516129032,
"eval_loss": 1.0231536626815796,
"eval_runtime": 62.2556,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 0.064,
"step": 128
},
{
"epoch": 0.832258064516129,
"grad_norm": 0.2930692349121967,
"learning_rate": 9.236211973370124e-06,
"loss": 1.2804,
"step": 129
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.3514011035647982,
"learning_rate": 8.937556821538201e-06,
"loss": 1.3527,
"step": 130
},
{
"epoch": 0.8451612903225807,
"grad_norm": 0.3509271601664881,
"learning_rate": 8.64880628824269e-06,
"loss": 1.2336,
"step": 131
},
{
"epoch": 0.8516129032258064,
"grad_norm": 0.369286535470622,
"learning_rate": 8.370114446083686e-06,
"loss": 1.2204,
"step": 132
},
{
"epoch": 0.8580645161290322,
"grad_norm": 0.3376899684032205,
"learning_rate": 8.101630000506864e-06,
"loss": 1.114,
"step": 133
},
{
"epoch": 0.864516129032258,
"grad_norm": 0.34528372468606205,
"learning_rate": 7.843496210456687e-06,
"loss": 1.2915,
"step": 134
},
{
"epoch": 0.8709677419354839,
"grad_norm": 0.3271748537414322,
"learning_rate": 7.595850811935759e-06,
"loss": 1.2242,
"step": 135
},
{
"epoch": 0.8774193548387097,
"grad_norm": 0.34552044795509895,
"learning_rate": 7.358825944511101e-06,
"loss": 1.2238,
"step": 136
},
{
"epoch": 0.8838709677419355,
"grad_norm": 0.3645405834936748,
"learning_rate": 7.132548080806653e-06,
"loss": 1.1925,
"step": 137
},
{
"epoch": 0.8903225806451613,
"grad_norm": 0.39117823625181364,
"learning_rate": 6.917137959019528e-06,
"loss": 1.1295,
"step": 138
},
{
"epoch": 0.896774193548387,
"grad_norm": 0.3256830351093455,
"learning_rate": 6.712710518496049e-06,
"loss": 1.2506,
"step": 139
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.4262467981624931,
"learning_rate": 6.519374838401997e-06,
"loss": 1.1759,
"step": 140
},
{
"epoch": 0.9096774193548387,
"grad_norm": 0.35503437951993716,
"learning_rate": 6.337234079519728e-06,
"loss": 1.1777,
"step": 141
},
{
"epoch": 0.9161290322580645,
"grad_norm": 0.3897540509188695,
"learning_rate": 6.166385429203269e-06,
"loss": 1.1239,
"step": 142
},
{
"epoch": 0.9225806451612903,
"grad_norm": 0.36016445939620884,
"learning_rate": 6.006920049520701e-06,
"loss": 1.2692,
"step": 143
},
{
"epoch": 0.9290322580645162,
"grad_norm": 0.4413576798023392,
"learning_rate": 5.858923028611572e-06,
"loss": 1.1879,
"step": 144
},
{
"epoch": 0.9354838709677419,
"grad_norm": 0.37955599088497055,
"learning_rate": 5.722473335285244e-06,
"loss": 1.205,
"step": 145
},
{
"epoch": 0.9419354838709677,
"grad_norm": 0.35919500181972724,
"learning_rate": 5.597643776884412e-06,
"loss": 1.1617,
"step": 146
},
{
"epoch": 0.9483870967741935,
"grad_norm": 0.3022686971058462,
"learning_rate": 5.4845009604363e-06,
"loss": 1.2059,
"step": 147
},
{
"epoch": 0.9548387096774194,
"grad_norm": 0.30291369490101205,
"learning_rate": 5.38310525711221e-06,
"loss": 1.2672,
"step": 148
},
{
"epoch": 0.9612903225806452,
"grad_norm": 0.33599320279905975,
"learning_rate": 5.293510770014475e-06,
"loss": 1.2755,
"step": 149
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.2903929279243622,
"learning_rate": 5.215765305307886e-06,
"loss": 1.1675,
"step": 150
},
{
"epoch": 0.9741935483870968,
"grad_norm": 0.3305110382050327,
"learning_rate": 5.149910346711126e-06,
"loss": 1.2342,
"step": 151
},
{
"epoch": 0.9806451612903225,
"grad_norm": 0.33304378208594904,
"learning_rate": 5.095981033361725e-06,
"loss": 1.1312,
"step": 152
},
{
"epoch": 0.9870967741935484,
"grad_norm": 0.3479102720763047,
"learning_rate": 5.05400614106637e-06,
"loss": 1.1753,
"step": 153
},
{
"epoch": 0.9935483870967742,
"grad_norm": 0.31384042987234395,
"learning_rate": 5.024008066946621e-06,
"loss": 1.2077,
"step": 154
},
{
"epoch": 1.0,
"grad_norm": 0.5248637716000059,
"learning_rate": 5.006002817488162e-06,
"loss": 1.1639,
"step": 155
},
{
"epoch": 1.0064516129032257,
"grad_norm": 0.359683648131272,
"learning_rate": 5e-06,
"loss": 1.2093,
"step": 156
}
],
"logging_steps": 1,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 32,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 319408664739840.0,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}